From 93984f19e7bce4c18084a6ef3dacafb155b806ed Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 29 Apr 2022 21:00:23 +0000
Subject: KVM: Fully serialize gfn=>pfn cache refresh via mutex

Protect gfn=>pfn cache refresh with a mutex to fully serialize refreshes.
The refresh logic doesn't protect against

- concurrent unmaps, or refreshes with different GPAs (which may or may not
  happen in practice, for example if a cache is only used under vcpu->mutex;
  but it's allowed in the code)

- a false negative on the memslot generation.  If the first refresh sees
  a stale memslot generation, it will refresh the hva and generation before
  moving on to the hva=>pfn translation.  If it then drops gpc->lock, a
  different user of the cache can come along, acquire gpc->lock, see that
  the memslot generation is fresh, and skip the hva=>pfn update due to the
  userspace address also matching (because it too was updated).

The refresh path can already sleep during hva=>pfn resolution, so wrap
the refresh with a mutex to ensure that any given refresh runs to
completion before other callers can start their refresh.

Cc: stable@vger.kernel.org
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220429210025.3293691-7-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_types.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index ac1ebb37a0ff..f328a01db4fe 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -19,6 +19,7 @@ struct kvm_memslots;
 enum kvm_mr_change;
 
 #include <linux/bits.h>
+#include <linux/mutex.h>
 #include <linux/types.h>
 #include <linux/spinlock_types.h>
 
@@ -69,6 +70,7 @@ struct gfn_to_pfn_cache {
 	struct kvm_vcpu *vcpu;
 	struct list_head list;
 	rwlock_t lock;
+	struct mutex refresh_lock;
 	void *khva;
 	kvm_pfn_t pfn;
 	enum pfn_cache_usage usage;
-- 
cgit 


From 35d02493dba1ae6386fac07072908717affc3ff8 Mon Sep 17 00:00:00 2001
From: Janosch Frank <frankja@linux.ibm.com>
Date: Tue, 17 May 2022 16:36:21 +0000
Subject: KVM: s390: pv: Add query interface

Some of the query information is already available via sysfs but
having a IOCTL makes the information easier to retrieve.

Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Reviewed-by: Steffen Eiden <seiden@linux.ibm.com>
Link: https://lore.kernel.org/r/20220517163629.3443-4-frankja@linux.ibm.com
Message-Id: <20220517163629.3443-4-frankja@linux.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@linux.ibm.com>
---
 arch/s390/kvm/kvm-s390.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/kvm.h | 25 ++++++++++++++++
 2 files changed, 101 insertions(+)

(limited to 'include')

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 76ad6408cb2c..5859f243d287 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2224,6 +2224,42 @@ static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc)
 	return r;
 }
 
+/*
+ * Here we provide user space with a direct interface to query UV
+ * related data like UV maxima and available features as well as
+ * feature specific data.
+ *
+ * To facilitate future extension of the data structures we'll try to
+ * write data up to the maximum requested length.
+ */
+static ssize_t kvm_s390_handle_pv_info(struct kvm_s390_pv_info *info)
+{
+	ssize_t len_min;
+
+	switch (info->header.id) {
+	case KVM_PV_INFO_VM: {
+		len_min =  sizeof(info->header) + sizeof(info->vm);
+
+		if (info->header.len_max < len_min)
+			return -EINVAL;
+
+		memcpy(info->vm.inst_calls_list,
+		       uv_info.inst_calls_list,
+		       sizeof(uv_info.inst_calls_list));
+
+		/* It's max cpuid not max cpus, so it's off by one */
+		info->vm.max_cpus = uv_info.max_guest_cpu_id + 1;
+		info->vm.max_guests = uv_info.max_num_sec_conf;
+		info->vm.max_guest_addr = uv_info.max_sec_stor_addr;
+		info->vm.feature_indication = uv_info.uv_feature_indications;
+
+		return len_min;
+	}
+	default:
+		return -EINVAL;
+	}
+}
+
 static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
 {
 	int r = 0;
@@ -2360,6 +2396,46 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
 			     cmd->rc, cmd->rrc);
 		break;
 	}
+	case KVM_PV_INFO: {
+		struct kvm_s390_pv_info info = {};
+		ssize_t data_len;
+
+		/*
+		 * No need to check the VM protection here.
+		 *
+		 * Maybe user space wants to query some of the data
+		 * when the VM is still unprotected. If we see the
+		 * need to fence a new data command we can still
+		 * return an error in the info handler.
+		 */
+
+		r = -EFAULT;
+		if (copy_from_user(&info, argp, sizeof(info.header)))
+			break;
+
+		r = -EINVAL;
+		if (info.header.len_max < sizeof(info.header))
+			break;
+
+		data_len = kvm_s390_handle_pv_info(&info);
+		if (data_len < 0) {
+			r = data_len;
+			break;
+		}
+		/*
+		 * If a data command struct is extended (multiple
+		 * times) this can be used to determine how much of it
+		 * is valid.
+		 */
+		info.header.len_written = data_len;
+
+		r = -EFAULT;
+		if (copy_to_user(argp, &info, data_len))
+			break;
+
+		r = 0;
+		break;
+	}
 	default:
 		r = -ENOTTY;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 5088bd9f1922..5a5f66026dd3 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1660,6 +1660,30 @@ struct kvm_s390_pv_unp {
 	__u64 tweak;
 };
 
+enum pv_cmd_info_id {
+	KVM_PV_INFO_VM,
+};
+
+struct kvm_s390_pv_info_vm {
+	__u64 inst_calls_list[4];
+	__u64 max_cpus;
+	__u64 max_guests;
+	__u64 max_guest_addr;
+	__u64 feature_indication;
+};
+
+struct kvm_s390_pv_info_header {
+	__u32 id;
+	__u32 len_max;
+	__u32 len_written;
+	__u32 reserved;
+};
+
+struct kvm_s390_pv_info {
+	struct kvm_s390_pv_info_header header;
+	struct kvm_s390_pv_info_vm vm;
+};
+
 enum pv_cmd_id {
 	KVM_PV_ENABLE,
 	KVM_PV_DISABLE,
@@ -1668,6 +1692,7 @@ enum pv_cmd_id {
 	KVM_PV_VERIFY,
 	KVM_PV_PREP_RESET,
 	KVM_PV_UNSHARE_ALL,
+	KVM_PV_INFO,
 };
 
 struct kvm_pv_cmd {
-- 
cgit 


From fe9a93e07ba4f29def2f8a4318b63e0c70a5c6c2 Mon Sep 17 00:00:00 2001
From: Janosch Frank <frankja@linux.ibm.com>
Date: Tue, 17 May 2022 16:36:23 +0000
Subject: KVM: s390: pv: Add query dump information

The dump API requires userspace to provide buffers into which we will
store data. The dump information added in this patch tells userspace
how big those buffers need to be.

Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Reviewed-by: Steffen Eiden <seiden@linux.ibm.com>
Link: https://lore.kernel.org/r/20220517163629.3443-6-frankja@linux.ibm.com
Message-Id: <20220517163629.3443-6-frankja@linux.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@linux.ibm.com>
---
 arch/s390/kvm/kvm-s390.c | 11 +++++++++++
 include/uapi/linux/kvm.h | 12 +++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 5859f243d287..de54f14e081e 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2255,6 +2255,17 @@ static ssize_t kvm_s390_handle_pv_info(struct kvm_s390_pv_info *info)
 
 		return len_min;
 	}
+	case KVM_PV_INFO_DUMP: {
+		len_min =  sizeof(info->header) + sizeof(info->dump);
+
+		if (info->header.len_max < len_min)
+			return -EINVAL;
+
+		info->dump.dump_cpu_buffer_len = uv_info.guest_cpu_stor_len;
+		info->dump.dump_config_mem_buffer_per_1m = uv_info.conf_dump_storage_state_len;
+		info->dump.dump_config_finalize_len = uv_info.conf_dump_finalize_len;
+		return len_min;
+	}
 	default:
 		return -EINVAL;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 5a5f66026dd3..065a05ec06b6 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1662,6 +1662,13 @@ struct kvm_s390_pv_unp {
 
 enum pv_cmd_info_id {
 	KVM_PV_INFO_VM,
+	KVM_PV_INFO_DUMP,
+};
+
+struct kvm_s390_pv_info_dump {
+	__u64 dump_cpu_buffer_len;
+	__u64 dump_config_mem_buffer_per_1m;
+	__u64 dump_config_finalize_len;
 };
 
 struct kvm_s390_pv_info_vm {
@@ -1681,7 +1688,10 @@ struct kvm_s390_pv_info_header {
 
 struct kvm_s390_pv_info {
 	struct kvm_s390_pv_info_header header;
-	struct kvm_s390_pv_info_vm vm;
+	union {
+		struct kvm_s390_pv_info_dump dump;
+		struct kvm_s390_pv_info_vm vm;
+	};
 };
 
 enum pv_cmd_id {
-- 
cgit 


From 0460eb35b443f73f8a8e3be1ea87bd690a852e20 Mon Sep 17 00:00:00 2001
From: Janosch Frank <frankja@linux.ibm.com>
Date: Tue, 17 May 2022 16:36:24 +0000
Subject: KVM: s390: Add configuration dump functionality

Sometimes dumping inside of a VM fails, is unavailable or doesn't
yield the required data. For these occasions we dump the VM from the
outside, writing memory and cpu data to a file.

Up to now PV guests only supported dumping from the inside of the
guest through dumpers like KDUMP. A PV guest can be dumped from the
hypervisor but the data will be stale and / or encrypted.

To get the actual state of the PV VM we need the help of the
Ultravisor who safeguards the VM state. New UV calls have been added
to initialize the dump, dump storage state data, dump cpu data and
complete the dump process. We expose these calls in this patch via a
new UV ioctl command.

The sensitive parts of the dump data are encrypted, the dump key is
derived from the Customer Communication Key (CCK). This ensures that
only the owner of the VM who has the CCK can decrypt the dump data.

The memory is dumped / read via a normal export call and a re-import
after the dump initialization is not needed (no re-encryption with a
dump key).

Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Link: https://lore.kernel.org/r/20220517163629.3443-7-frankja@linux.ibm.com
Message-Id: <20220517163629.3443-7-frankja@linux.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@linux.ibm.com>
---
 arch/s390/include/asm/kvm_host.h |   1 +
 arch/s390/kvm/kvm-s390.c         |  93 ++++++++++++++++++++
 arch/s390/kvm/kvm-s390.h         |   4 +
 arch/s390/kvm/pv.c               | 182 +++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/kvm.h         |  15 ++++
 5 files changed, 295 insertions(+)

(limited to 'include')

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 766028d54a3e..a0fbe4820e0a 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -923,6 +923,7 @@ struct kvm_s390_pv {
 	u64 guest_len;
 	unsigned long stor_base;
 	void *stor_var;
+	bool dumping;
 };
 
 struct kvm_arch{
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index de54f14e081e..1d00aead6bc5 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2271,6 +2271,68 @@ static ssize_t kvm_s390_handle_pv_info(struct kvm_s390_pv_info *info)
 	}
 }
 
+static int kvm_s390_pv_dmp(struct kvm *kvm, struct kvm_pv_cmd *cmd,
+			   struct kvm_s390_pv_dmp dmp)
+{
+	int r = -EINVAL;
+	void __user *result_buff = (void __user *)dmp.buff_addr;
+
+	switch (dmp.subcmd) {
+	case KVM_PV_DUMP_INIT: {
+		if (kvm->arch.pv.dumping)
+			break;
+
+		/*
+		 * Block SIE entry as concurrent dump UVCs could lead
+		 * to validities.
+		 */
+		kvm_s390_vcpu_block_all(kvm);
+
+		r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
+				  UVC_CMD_DUMP_INIT, &cmd->rc, &cmd->rrc);
+		KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP INIT: rc %x rrc %x",
+			     cmd->rc, cmd->rrc);
+		if (!r) {
+			kvm->arch.pv.dumping = true;
+		} else {
+			kvm_s390_vcpu_unblock_all(kvm);
+			r = -EINVAL;
+		}
+		break;
+	}
+	case KVM_PV_DUMP_CONFIG_STOR_STATE: {
+		if (!kvm->arch.pv.dumping)
+			break;
+
+		/*
+		 * gaddr is an output parameter since we might stop
+		 * early. As dmp will be copied back in our caller, we
+		 * don't need to do it ourselves.
+		 */
+		r = kvm_s390_pv_dump_stor_state(kvm, result_buff, &dmp.gaddr, dmp.buff_len,
+						&cmd->rc, &cmd->rrc);
+		break;
+	}
+	case KVM_PV_DUMP_COMPLETE: {
+		if (!kvm->arch.pv.dumping)
+			break;
+
+		r = -EINVAL;
+		if (dmp.buff_len < uv_info.conf_dump_finalize_len)
+			break;
+
+		r = kvm_s390_pv_dump_complete(kvm, result_buff,
+					      &cmd->rc, &cmd->rrc);
+		break;
+	}
+	default:
+		r = -ENOTTY;
+		break;
+	}
+
+	return r;
+}
+
 static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
 {
 	int r = 0;
@@ -2447,6 +2509,28 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
 		r = 0;
 		break;
 	}
+	case KVM_PV_DUMP: {
+		struct kvm_s390_pv_dmp dmp;
+
+		r = -EINVAL;
+		if (!kvm_s390_pv_is_protected(kvm))
+			break;
+
+		r = -EFAULT;
+		if (copy_from_user(&dmp, argp, sizeof(dmp)))
+			break;
+
+		r = kvm_s390_pv_dmp(kvm, cmd, dmp);
+		if (r)
+			break;
+
+		if (copy_to_user(argp, &dmp, sizeof(dmp))) {
+			r = -EFAULT;
+			break;
+		}
+
+		break;
+	}
 	default:
 		r = -ENOTTY;
 	}
@@ -4564,6 +4648,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 	struct kvm_run *kvm_run = vcpu->run;
 	int rc;
 
+	/*
+	 * Running a VM while dumping always has the potential to
+	 * produce inconsistent dump data. But for PV vcpus a SIE
+	 * entry while dumping could also lead to a fatal validity
+	 * intercept which we absolutely want to avoid.
+	 */
+	if (vcpu->kvm->arch.pv.dumping)
+		return -EINVAL;
+
 	if (kvm_run->immediate_exit)
 		return -EINTR;
 
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 497d52a83c78..2c11eb5ba3ef 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -250,6 +250,10 @@ int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
 int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size,
 		       unsigned long tweak, u16 *rc, u16 *rrc);
 int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state);
+int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user,
+				u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc);
+int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user,
+			      u16 *rc, u16 *rrc);
 
 static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm)
 {
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index cc7c9599f43e..e9912113879c 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -7,6 +7,7 @@
  */
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
+#include <linux/minmax.h>
 #include <linux/pagemap.h>
 #include <linux/sched/signal.h>
 #include <asm/gmap.h>
@@ -298,3 +299,184 @@ int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state)
 		return -EINVAL;
 	return 0;
 }
+
+/* Size of the cache for the storage state dump data. 1MB for now */
+#define DUMP_BUFF_LEN HPAGE_SIZE
+
+/**
+ * kvm_s390_pv_dump_stor_state
+ *
+ * @kvm: pointer to the guest's KVM struct
+ * @buff_user: Userspace pointer where we will write the results to
+ * @gaddr: Starting absolute guest address for which the storage state
+ *	   is requested.
+ * @buff_user_len: Length of the buff_user buffer
+ * @rc: Pointer to where the uvcb return code is stored
+ * @rrc: Pointer to where the uvcb return reason code is stored
+ *
+ * Stores buff_len bytes of tweak component values to buff_user
+ * starting with the 1MB block specified by the absolute guest address
+ * (gaddr). The gaddr pointer will be updated with the last address
+ * for which data was written when returning to userspace. buff_user
+ * might be written to even if an error rc is returned. For instance
+ * if we encounter a fault after writing the first page of data.
+ *
+ * Context: kvm->lock needs to be held
+ *
+ * Return:
+ *  0 on success
+ *  -ENOMEM if allocating the cache fails
+ *  -EINVAL if gaddr is not aligned to 1MB
+ *  -EINVAL if buff_user_len is not aligned to uv_info.conf_dump_storage_state_len
+ *  -EINVAL if the UV call fails, rc and rrc will be set in this case
+ *  -EFAULT if copying the result to buff_user failed
+ */
+int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user,
+				u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc)
+{
+	struct uv_cb_dump_stor_state uvcb = {
+		.header.cmd = UVC_CMD_DUMP_CONF_STOR_STATE,
+		.header.len = sizeof(uvcb),
+		.config_handle = kvm->arch.pv.handle,
+		.gaddr = *gaddr,
+		.dump_area_origin = 0,
+	};
+	const u64 increment_len = uv_info.conf_dump_storage_state_len;
+	size_t buff_kvm_size;
+	size_t size_done = 0;
+	u8 *buff_kvm = NULL;
+	int cc, ret;
+
+	ret = -EINVAL;
+	/* UV call processes 1MB guest storage chunks at a time */
+	if (!IS_ALIGNED(*gaddr, HPAGE_SIZE))
+		goto out;
+
+	/*
+	 * We provide the storage state for 1MB chunks of guest
+	 * storage. The buffer will need to be aligned to
+	 * conf_dump_storage_state_len so we don't end on a partial
+	 * chunk.
+	 */
+	if (!buff_user_len ||
+	    !IS_ALIGNED(buff_user_len, increment_len))
+		goto out;
+
+	/*
+	 * Allocate a buffer from which we will later copy to the user
+	 * process. We don't want userspace to dictate our buffer size
+	 * so we limit it to DUMP_BUFF_LEN.
+	 */
+	ret = -ENOMEM;
+	buff_kvm_size = min_t(u64, buff_user_len, DUMP_BUFF_LEN);
+	buff_kvm = vzalloc(buff_kvm_size);
+	if (!buff_kvm)
+		goto out;
+
+	ret = 0;
+	uvcb.dump_area_origin = (u64)buff_kvm;
+	/* We will loop until the user buffer is filled or an error occurs */
+	do {
+		/* Get 1MB worth of guest storage state data */
+		cc = uv_call_sched(0, (u64)&uvcb);
+
+		/* All or nothing */
+		if (cc) {
+			ret = -EINVAL;
+			break;
+		}
+
+		size_done += increment_len;
+		uvcb.dump_area_origin += increment_len;
+		buff_user_len -= increment_len;
+		uvcb.gaddr += HPAGE_SIZE;
+
+		/* KVM Buffer full, time to copy to the process */
+		if (!buff_user_len || size_done == DUMP_BUFF_LEN) {
+			if (copy_to_user(buff_user, buff_kvm, size_done)) {
+				ret = -EFAULT;
+				break;
+			}
+
+			buff_user += size_done;
+			size_done = 0;
+			uvcb.dump_area_origin = (u64)buff_kvm;
+		}
+	} while (buff_user_len);
+
+	/* Report back where we ended dumping */
+	*gaddr = uvcb.gaddr;
+
+	/* Lets only log errors, we don't want to spam */
+out:
+	if (ret)
+		KVM_UV_EVENT(kvm, 3,
+			     "PROTVIRT DUMP STORAGE STATE: addr %llx ret %d, uvcb rc %x rrc %x",
+			     uvcb.gaddr, ret, uvcb.header.rc, uvcb.header.rrc);
+	*rc = uvcb.header.rc;
+	*rrc = uvcb.header.rrc;
+	vfree(buff_kvm);
+
+	return ret;
+}
+
+/**
+ * kvm_s390_pv_dump_complete
+ *
+ * @kvm: pointer to the guest's KVM struct
+ * @buff_user: Userspace pointer where we will write the results to
+ * @rc: Pointer to where the uvcb return code is stored
+ * @rrc: Pointer to where the uvcb return reason code is stored
+ *
+ * Completes the dumping operation and writes the completion data to
+ * user space.
+ *
+ * Context: kvm->lock needs to be held
+ *
+ * Return:
+ *  0 on success
+ *  -ENOMEM if allocating the completion buffer fails
+ *  -EINVAL if the UV call fails, rc and rrc will be set in this case
+ *  -EFAULT if copying the result to buff_user failed
+ */
+int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user,
+			      u16 *rc, u16 *rrc)
+{
+	struct uv_cb_dump_complete complete = {
+		.header.len = sizeof(complete),
+		.header.cmd = UVC_CMD_DUMP_COMPLETE,
+		.config_handle = kvm_s390_pv_get_handle(kvm),
+	};
+	u64 *compl_data;
+	int ret;
+
+	/* Allocate dump area */
+	compl_data = vzalloc(uv_info.conf_dump_finalize_len);
+	if (!compl_data)
+		return -ENOMEM;
+	complete.dump_area_origin = (u64)compl_data;
+
+	ret = uv_call_sched(0, (u64)&complete);
+	*rc = complete.header.rc;
+	*rrc = complete.header.rrc;
+	KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP COMPLETE: rc %x rrc %x",
+		     complete.header.rc, complete.header.rrc);
+
+	if (!ret) {
+		/*
+		 * kvm_s390_pv_dealloc_vm() will also (mem)set
+		 * this to false on a reboot or other destroy
+		 * operation for this vm.
+		 */
+		kvm->arch.pv.dumping = false;
+		kvm_s390_vcpu_unblock_all(kvm);
+		ret = copy_to_user(buff_user, compl_data, uv_info.conf_dump_finalize_len);
+		if (ret)
+			ret = -EFAULT;
+	}
+	vfree(compl_data);
+	/* If the UVC returned an error, translate it to -EINVAL */
+	if (ret > 0)
+		ret = -EINVAL;
+	return ret;
+}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 065a05ec06b6..673be2061c6c 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1660,6 +1660,20 @@ struct kvm_s390_pv_unp {
 	__u64 tweak;
 };
 
+enum pv_cmd_dmp_id {
+	KVM_PV_DUMP_INIT,
+	KVM_PV_DUMP_CONFIG_STOR_STATE,
+	KVM_PV_DUMP_COMPLETE,
+};
+
+struct kvm_s390_pv_dmp {
+	__u64 subcmd;
+	__u64 buff_addr;
+	__u64 buff_len;
+	__u64 gaddr;		/* For dump storage state */
+	__u64 reserved[4];
+};
+
 enum pv_cmd_info_id {
 	KVM_PV_INFO_VM,
 	KVM_PV_INFO_DUMP,
@@ -1703,6 +1717,7 @@ enum pv_cmd_id {
 	KVM_PV_PREP_RESET,
 	KVM_PV_UNSHARE_ALL,
 	KVM_PV_INFO,
+	KVM_PV_DUMP,
 };
 
 struct kvm_pv_cmd {
-- 
cgit 


From 8aba09588d2af37c6cc1a781b87d1d91ebf389ae Mon Sep 17 00:00:00 2001
From: Janosch Frank <frankja@linux.ibm.com>
Date: Tue, 17 May 2022 16:36:25 +0000
Subject: KVM: s390: Add CPU dump functionality

The previous patch introduced the per-VM dump functions now let's
focus on dumping the VCPU state via the newly introduced
KVM_S390_PV_CPU_COMMAND ioctl which mirrors the VM UV ioctl and can be
extended with new commands later.

Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Link: https://lore.kernel.org/r/20220517163629.3443-8-frankja@linux.ibm.com
Message-Id: <20220517163629.3443-8-frankja@linux.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@linux.ibm.com>
---
 arch/s390/kvm/kvm-s390.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++
 arch/s390/kvm/kvm-s390.h |  1 +
 arch/s390/kvm/pv.c       | 16 +++++++++++
 include/uapi/linux/kvm.h |  4 +++
 4 files changed, 90 insertions(+)

(limited to 'include')

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 1d00aead6bc5..37be2a33edb5 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -5096,6 +5096,48 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp,
 	return -ENOIOCTLCMD;
 }
 
+static int kvm_s390_handle_pv_vcpu_dump(struct kvm_vcpu *vcpu,
+					struct kvm_pv_cmd *cmd)
+{
+	struct kvm_s390_pv_dmp dmp;
+	void *data;
+	int ret;
+
+	/* Dump initialization is a prerequisite */
+	if (!vcpu->kvm->arch.pv.dumping)
+		return -EINVAL;
+
+	if (copy_from_user(&dmp, (__u8 __user *)cmd->data, sizeof(dmp)))
+		return -EFAULT;
+
+	/* We only handle this subcmd right now */
+	if (dmp.subcmd != KVM_PV_DUMP_CPU)
+		return -EINVAL;
+
+	/* CPU dump length is the same as create cpu storage donation. */
+	if (dmp.buff_len != uv_info.guest_cpu_stor_len)
+		return -EINVAL;
+
+	data = kvzalloc(uv_info.guest_cpu_stor_len, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	ret = kvm_s390_pv_dump_cpu(vcpu, data, &cmd->rc, &cmd->rrc);
+
+	VCPU_EVENT(vcpu, 3, "PROTVIRT DUMP CPU %d rc %x rrc %x",
+		   vcpu->vcpu_id, cmd->rc, cmd->rrc);
+
+	if (ret)
+		ret = -EINVAL;
+
+	/* On success copy over the dump data */
+	if (!ret && copy_to_user((__u8 __user *)dmp.buff_addr, data, uv_info.guest_cpu_stor_len))
+		ret = -EFAULT;
+
+	kvfree(data);
+	return ret;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {
@@ -5260,6 +5302,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 					   irq_state.len);
 		break;
 	}
+	case KVM_S390_PV_CPU_COMMAND: {
+		struct kvm_pv_cmd cmd;
+
+		r = -EINVAL;
+		if (!is_prot_virt_host())
+			break;
+
+		r = -EFAULT;
+		if (copy_from_user(&cmd, argp, sizeof(cmd)))
+			break;
+
+		r = -EINVAL;
+		if (cmd.flags)
+			break;
+
+		/* We only handle this cmd right now */
+		if (cmd.cmd != KVM_PV_DUMP)
+			break;
+
+		r = kvm_s390_handle_pv_vcpu_dump(vcpu, &cmd);
+
+		/* Always copy over UV rc / rrc data */
+		if (copy_to_user((__u8 __user *)argp, &cmd.rc,
+				 sizeof(cmd.rc) + sizeof(cmd.rrc)))
+			r = -EFAULT;
+		break;
+	}
 	default:
 		r = -ENOTTY;
 	}
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 2c11eb5ba3ef..dd01d989816f 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -250,6 +250,7 @@ int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
 int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size,
 		       unsigned long tweak, u16 *rc, u16 *rrc);
 int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state);
+int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc);
 int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user,
 				u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc);
 int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user,
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index e9912113879c..b4a499b10b67 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -300,6 +300,22 @@ int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state)
 	return 0;
 }
 
+int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc)
+{
+	struct uv_cb_dump_cpu uvcb = {
+		.header.cmd = UVC_CMD_DUMP_CPU,
+		.header.len = sizeof(uvcb),
+		.cpu_handle = vcpu->arch.pv.handle,
+		.dump_area_origin = (u64)buff,
+	};
+	int cc;
+
+	cc = uv_call_sched(0, (u64)&uvcb);
+	*rc = uvcb.header.rc;
+	*rrc = uvcb.header.rrc;
+	return cc;
+}
+
 /* Size of the cache for the storage state dump data. 1MB for now */
 #define DUMP_BUFF_LEN HPAGE_SIZE
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 673be2061c6c..af5d254f8061 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1664,6 +1664,7 @@ enum pv_cmd_dmp_id {
 	KVM_PV_DUMP_INIT,
 	KVM_PV_DUMP_CONFIG_STOR_STATE,
 	KVM_PV_DUMP_COMPLETE,
+	KVM_PV_DUMP_CPU,
 };
 
 struct kvm_s390_pv_dmp {
@@ -2168,4 +2169,7 @@ struct kvm_stats_desc {
 /* Available with KVM_CAP_XSAVE2 */
 #define KVM_GET_XSAVE2		  _IOR(KVMIO,  0xcf, struct kvm_xsave)
 
+/* Available with KVM_CAP_S390_PROTECTED_DUMP */
+#define KVM_S390_PV_CPU_COMMAND	_IOWR(KVMIO, 0xd0, struct kvm_pv_cmd)
+
 #endif /* __LINUX_KVM_H */
-- 
cgit 


From e9bf3acb23f0a6e18438c35944d6cb618d16cf05 Mon Sep 17 00:00:00 2001
From: Janosch Frank <frankja@linux.ibm.com>
Date: Tue, 17 May 2022 16:36:26 +0000
Subject: KVM: s390: Add KVM_CAP_S390_PROTECTED_DUMP

The capability indicates dump support for protected VMs.

Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Link: https://lore.kernel.org/r/20220517163629.3443-9-frankja@linux.ibm.com
Message-Id: <20220517163629.3443-9-frankja@linux.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@linux.ibm.com>
---
 arch/s390/kvm/kvm-s390.c | 20 ++++++++++++++++++++
 include/uapi/linux/kvm.h |  1 +
 2 files changed, 21 insertions(+)

(limited to 'include')

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 37be2a33edb5..d1a32eb3cf5d 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -606,6 +606,26 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_S390_PROTECTED:
 		r = is_prot_virt_host();
 		break;
+	case KVM_CAP_S390_PROTECTED_DUMP: {
+		u64 pv_cmds_dump[] = {
+			BIT_UVC_CMD_DUMP_INIT,
+			BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE,
+			BIT_UVC_CMD_DUMP_CPU,
+			BIT_UVC_CMD_DUMP_COMPLETE,
+		};
+		int i;
+
+		r = is_prot_virt_host();
+
+		for (i = 0; i < ARRAY_SIZE(pv_cmds_dump); i++) {
+			if (!test_bit_inv(pv_cmds_dump[i],
+					  (unsigned long *)&uv_info.inst_calls_list)) {
+				r = 0;
+				break;
+			}
+		}
+		break;
+	}
 	default:
 		r = 0;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index af5d254f8061..c4a32910b88a 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1157,6 +1157,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_VM_TSC_CONTROL 214
 #define KVM_CAP_SYSTEM_EVENT_DATA 215
 #define KVM_CAP_ARM_SYSTEM_SUSPEND 216
+#define KVM_CAP_S390_PROTECTED_DUMP 217
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit 


From d8616ee2affcff37c5d315310da557a694a3303d Mon Sep 17 00:00:00 2001
From: Wang Yufen <wangyufen@huawei.com>
Date: Tue, 24 May 2022 15:53:11 +0800
Subject: bpf, sockmap: Fix sk->sk_forward_alloc warn_on in
 sk_stream_kill_queues

During TCP sockmap redirect pressure test, the following warning is triggered:

WARNING: CPU: 3 PID: 2145 at net/core/stream.c:205 sk_stream_kill_queues+0xbc/0xd0
CPU: 3 PID: 2145 Comm: iperf Kdump: loaded Tainted: G        W         5.10.0+ #9
Call Trace:
 inet_csk_destroy_sock+0x55/0x110
 inet_csk_listen_stop+0xbb/0x380
 tcp_close+0x41b/0x480
 inet_release+0x42/0x80
 __sock_release+0x3d/0xa0
 sock_close+0x11/0x20
 __fput+0x9d/0x240
 task_work_run+0x62/0x90
 exit_to_user_mode_prepare+0x110/0x120
 syscall_exit_to_user_mode+0x27/0x190
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

The reason we observed is that:

When the listener is closing, a connection may have completed the three-way
handshake but not accepted, and the client has sent some packets. The child
sks in accept queue release by inet_child_forget()->inet_csk_destroy_sock(),
but psocks of child sks have not released.

To fix, add sock_map_destroy to release psocks.

Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20220524075311.649153-1-wangyufen@huawei.com
---
 include/linux/bpf.h   |  1 +
 include/linux/skmsg.h |  1 +
 net/core/skmsg.c      |  1 +
 net/core/sock_map.c   | 23 +++++++++++++++++++++++
 net/ipv4/tcp_bpf.c    |  1 +
 5 files changed, 27 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2b914a56a2c5..8e6092d0ea95 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2104,6 +2104,7 @@ int sock_map_bpf_prog_query(const union bpf_attr *attr,
 			    union bpf_attr __user *uattr);
 
 void sock_map_unhash(struct sock *sk);
+void sock_map_destroy(struct sock *sk);
 void sock_map_close(struct sock *sk, long timeout);
 #else
 static inline int bpf_prog_offload_init(struct bpf_prog *prog,
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index c5a2d6f50f25..153b6dec9b6a 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -95,6 +95,7 @@ struct sk_psock {
 	spinlock_t			link_lock;
 	refcount_t			refcnt;
 	void (*saved_unhash)(struct sock *sk);
+	void (*saved_destroy)(struct sock *sk);
 	void (*saved_close)(struct sock *sk, long timeout);
 	void (*saved_write_space)(struct sock *sk);
 	void (*saved_data_ready)(struct sock *sk);
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 22b983ade0e7..7e03f96e441b 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -715,6 +715,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
 	psock->eval = __SK_NONE;
 	psock->sk_proto = prot;
 	psock->saved_unhash = prot->unhash;
+	psock->saved_destroy = prot->destroy;
 	psock->saved_close = prot->close;
 	psock->saved_write_space = sk->sk_write_space;
 
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 81d4b4756a02..9f08ccfaf6da 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -1561,6 +1561,29 @@ void sock_map_unhash(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(sock_map_unhash);
 
+void sock_map_destroy(struct sock *sk)
+{
+	void (*saved_destroy)(struct sock *sk);
+	struct sk_psock *psock;
+
+	rcu_read_lock();
+	psock = sk_psock_get(sk);
+	if (unlikely(!psock)) {
+		rcu_read_unlock();
+		if (sk->sk_prot->destroy)
+			sk->sk_prot->destroy(sk);
+		return;
+	}
+
+	saved_destroy = psock->saved_destroy;
+	sock_map_remove_links(sk, psock);
+	rcu_read_unlock();
+	sk_psock_stop(psock, true);
+	sk_psock_put(sk, psock);
+	saved_destroy(sk);
+}
+EXPORT_SYMBOL_GPL(sock_map_destroy);
+
 void sock_map_close(struct sock *sk, long timeout)
 {
 	void (*saved_close)(struct sock *sk, long timeout);
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index be3947e70fec..38550bb1b90b 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -540,6 +540,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
 				   struct proto *base)
 {
 	prot[TCP_BPF_BASE]			= *base;
+	prot[TCP_BPF_BASE].destroy		= sock_map_destroy;
 	prot[TCP_BPF_BASE].close		= sock_map_close;
 	prot[TCP_BPF_BASE].recvmsg		= tcp_bpf_recvmsg;
 	prot[TCP_BPF_BASE].sock_is_readable	= sk_msg_is_readable;
-- 
cgit 


From 8d5976089c97a4beeeda4de59e2fba9862946893 Mon Sep 17 00:00:00 2001
From: Xiang wangx <wangxiang@cdjrlc.com>
Date: Mon, 6 Jun 2022 10:23:13 +0800
Subject: platform/chrome: cros_ec_commands: Fix syntax errors in comments

Delete the redundant word 'using'.

Signed-off-by: Xiang wangx <wangxiang@cdjrlc.com>
Reviewed-by: Tzung-Bi Shih <tzungbi@kernel.org>
Signed-off-by: Tzung-Bi Shih <tzungbi@kernel.org>
Link: https://lore.kernel.org/r/20220606022313.22912-1-wangxiang@cdjrlc.com
---
 include/linux/platform_data/cros_ec_commands.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index 8cfa8cfca77e..e59f51c41a1c 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -787,7 +787,7 @@ struct ec_host_response {
  *
  * Packets always start with a request or response header.  They are followed
  * by data_len bytes of data.  If the data_crc_present flag is set, the data
- * bytes are followed by a CRC-8 of that data, using using x^8 + x^2 + x + 1
+ * bytes are followed by a CRC-8 of that data, using x^8 + x^2 + x + 1
  * polynomial.
  *
  * Host algorithm when sending a request q:
-- 
cgit 


From 95099951557c9eb2f180d6bcb9885eecaca97d24 Mon Sep 17 00:00:00 2001
From: Chanho Park <chanho61.park@samsung.com>
Date: Mon, 23 May 2022 21:12:43 +0900
Subject: dt-bindings: soc: add samsung,boot-mode definitions

Adds samsung,boot-mode.h header file which contains boot mode
definitions for bootloader. As for now, there are only boot mode
definitions for Exynos Auto v9 SoC.

Signed-off-by: Chanho Park <chanho61.park@samsung.com>
Acked-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20220523121244.67341-2-chanho61.park@samsung.com
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 include/dt-bindings/soc/samsung,boot-mode.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 include/dt-bindings/soc/samsung,boot-mode.h

(limited to 'include')

diff --git a/include/dt-bindings/soc/samsung,boot-mode.h b/include/dt-bindings/soc/samsung,boot-mode.h
new file mode 100644
index 000000000000..47ef1cdd3916
--- /dev/null
+++ b/include/dt-bindings/soc/samsung,boot-mode.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd.
+ * Author: Chanho Park <chanho61.park@samsung.com>
+ *
+ * Device Tree bindings for Samsung Boot Mode.
+ */
+
+#ifndef __DT_BINDINGS_SAMSUNG_BOOT_MODE_H
+#define __DT_BINDINGS_SAMSUNG_BOOT_MODE_H
+
+/* Boot mode definitions for Exynos Auto v9 SoC */
+
+#define EXYNOSAUTOV9_BOOT_FASTBOOT	0xfa
+#define EXYNOSAUTOV9_BOOT_BOOTLOADER	0xfc
+#define EXYNOSAUTOV9_BOOT_RECOVERY	0xff
+
+#endif /* __DT_BINDINGS_SAMSUNG_BOOT_MODE_H */
-- 
cgit 


From 90b12a88b710cdc80c00552dfbd589228978bffe Mon Sep 17 00:00:00 2001
From: Cezary Rojewski <cezary.rojewski@intel.com>
Date: Wed, 11 May 2022 18:23:50 +0200
Subject: ALSA: Add snd_pcm_direction_name() helper

Allow for retrieving string naming a direction of a stream without the
need of substream pointer.

Signed-off-by: Cezary Rojewski <cezary.rojewski@intel.com>
Link: https://lore.kernel.org/r/20220511162403.3987658-2-cezary.rojewski@intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/pcm.h | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/sound/pcm.h b/include/sound/pcm.h
index 6b99310b5b88..26523cfe428d 100644
--- a/include/sound/pcm.h
+++ b/include/sound/pcm.h
@@ -1392,6 +1392,20 @@ static inline void snd_pcm_limit_isa_dma_size(int dma, size_t *max)
 
 const char *snd_pcm_format_name(snd_pcm_format_t format);
 
+/**
+ * snd_pcm_direction_name - Get a string naming the direction of a stream
+ * @direction: Stream's direction, one of SNDRV_PCM_STREAM_XXX
+ *
+ * Returns a string naming the direction of the stream.
+ */
+static inline const char *snd_pcm_direction_name(int direction)
+{
+	if (direction == SNDRV_PCM_STREAM_PLAYBACK)
+		return "Playback";
+	else
+		return "Capture";
+}
+
 /**
  * snd_pcm_stream_str - Get a string naming the direction of a stream
  * @substream: the pcm substream instance
@@ -1400,10 +1414,7 @@ const char *snd_pcm_format_name(snd_pcm_format_t format);
  */
 static inline const char *snd_pcm_stream_str(struct snd_pcm_substream *substream)
 {
-	if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK)
-		return "Playback";
-	else
-		return "Capture";
+	return snd_pcm_direction_name(substream->stream);
 }
 
 /*
-- 
cgit 


From 905f3a04e184854555fc248ca4e692fdbf2f2547 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Thu, 19 May 2022 16:42:23 +0100
Subject: ASoC: core: Add set_fmt_new callback that directly specifies provider

The original set_fmt callback always passes clock provider/consumer
with respect to the CODEC. This made sense when the framework was
directly broken down into platforms and CODECs. Now everything is
componentised it simplifies things if each side of the link is
just told if it is provider or consumer of the clocks. To start
this migration add a new callback that can be used to receive a
direct specification of clocking. As there are more CODEC drivers
than platform drivers, we make the new flags identical to the old
CODEC flags meaning CODEC drivers will not require an update.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20220519154318.2153729-2-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-dai.h | 7 +++++++
 sound/soc/soc-core.c    | 3 ++-
 sound/soc/soc-dai.c     | 5 ++++-
 3 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc-dai.h b/include/sound/soc-dai.h
index bbd821d2df9c..9c1d92d5a373 100644
--- a/include/sound/soc-dai.h
+++ b/include/sound/soc-dai.h
@@ -124,6 +124,12 @@ struct snd_compr_stream;
 #define SND_SOC_DAIFMT_CBM_CFS		SND_SOC_DAIFMT_CBP_CFC
 #define SND_SOC_DAIFMT_CBS_CFS		SND_SOC_DAIFMT_CBC_CFC
 
+/* when passed to set_fmt directly indicate if the device is provider or consumer */
+#define SND_SOC_DAIFMT_BP_FP		SND_SOC_DAIFMT_CBP_CFP
+#define SND_SOC_DAIFMT_BC_FP		SND_SOC_DAIFMT_CBC_CFP
+#define SND_SOC_DAIFMT_BP_FC		SND_SOC_DAIFMT_CBP_CFC
+#define SND_SOC_DAIFMT_BC_FC		SND_SOC_DAIFMT_CBC_CFC
+
 /* Describes the possible PCM format */
 #define SND_SOC_POSSIBLE_DAIFMT_CLOCK_PROVIDER_SHIFT	48
 #define SND_SOC_POSSIBLE_DAIFMT_CLOCK_PROVIDER_MASK	(0xFFFFULL << SND_SOC_POSSIBLE_DAIFMT_CLOCK_PROVIDER_SHIFT)
@@ -282,6 +288,7 @@ struct snd_soc_dai_ops {
 	 * Called by soc_card drivers, normally in their hw_params.
 	 */
 	int (*set_fmt)(struct snd_soc_dai *dai, unsigned int fmt);
+	int (*set_fmt_new)(struct snd_soc_dai *dai, unsigned int fmt);
 	int (*xlate_tdm_slot_mask)(unsigned int slots,
 		unsigned int *tx_mask, unsigned int *rx_mask);
 	int (*set_tdm_slot)(struct snd_soc_dai *dai,
diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c
index 9574f86dd4de..90f4265bea50 100644
--- a/sound/soc/soc-core.c
+++ b/sound/soc/soc-core.c
@@ -1235,7 +1235,8 @@ int snd_soc_runtime_set_dai_fmt(struct snd_soc_pcm_runtime *rtd,
 	for_each_rtd_cpu_dais(rtd, i, cpu_dai) {
 		unsigned int fmt = dai_fmt;
 
-		if (snd_soc_component_is_codec(cpu_dai->component))
+		if (cpu_dai->driver->ops->set_fmt_new ||
+		    snd_soc_component_is_codec(cpu_dai->component))
 			fmt = inv_dai_fmt;
 
 		ret = snd_soc_dai_set_fmt(cpu_dai, fmt);
diff --git a/sound/soc/soc-dai.c b/sound/soc/soc-dai.c
index 6078afe335f8..996712f4d9bf 100644
--- a/sound/soc/soc-dai.c
+++ b/sound/soc/soc-dai.c
@@ -209,7 +209,10 @@ int snd_soc_dai_set_fmt(struct snd_soc_dai *dai, unsigned int fmt)
 	int ret = -ENOTSUPP;
 
 	if (dai->driver->ops &&
-	    dai->driver->ops->set_fmt)
+	    dai->driver->ops->set_fmt_new)
+		ret = dai->driver->ops->set_fmt_new(dai, fmt);
+	else if (dai->driver->ops &&
+		 dai->driver->ops->set_fmt)
 		ret = dai->driver->ops->set_fmt(dai, fmt);
 
 	return soc_dai_ret(dai, ret);
-- 
cgit 


From 19423951a4b5c4f0ca107d6a4bed23f3f63718ca Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Thu, 19 May 2022 16:43:17 +0100
Subject: ASoC: soc-dai: Remove set_fmt_new callback

Now the behaviour of the core and all drivers is updated to the new
direct clock specification the temporary set_fmt_new callback can be
completely removed.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20220519154318.2153729-56-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-dai.h | 1 -
 sound/soc/soc-dai.c     | 6 +-----
 2 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc-dai.h b/include/sound/soc-dai.h
index 9c1d92d5a373..ea7509672086 100644
--- a/include/sound/soc-dai.h
+++ b/include/sound/soc-dai.h
@@ -288,7 +288,6 @@ struct snd_soc_dai_ops {
 	 * Called by soc_card drivers, normally in their hw_params.
 	 */
 	int (*set_fmt)(struct snd_soc_dai *dai, unsigned int fmt);
-	int (*set_fmt_new)(struct snd_soc_dai *dai, unsigned int fmt);
 	int (*xlate_tdm_slot_mask)(unsigned int slots,
 		unsigned int *tx_mask, unsigned int *rx_mask);
 	int (*set_tdm_slot)(struct snd_soc_dai *dai,
diff --git a/sound/soc/soc-dai.c b/sound/soc/soc-dai.c
index 996712f4d9bf..d530e8c2b77b 100644
--- a/sound/soc/soc-dai.c
+++ b/sound/soc/soc-dai.c
@@ -208,11 +208,7 @@ int snd_soc_dai_set_fmt(struct snd_soc_dai *dai, unsigned int fmt)
 {
 	int ret = -ENOTSUPP;
 
-	if (dai->driver->ops &&
-	    dai->driver->ops->set_fmt_new)
-		ret = dai->driver->ops->set_fmt_new(dai, fmt);
-	else if (dai->driver->ops &&
-		 dai->driver->ops->set_fmt)
+	if (dai->driver->ops && dai->driver->ops->set_fmt)
 		ret = dai->driver->ops->set_fmt(dai, fmt);
 
 	return soc_dai_ret(dai, ret);
-- 
cgit 


From 28086d05ada6d03daa886aad0e469854b811311c Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Thu, 19 May 2022 16:43:18 +0100
Subject: ASoC: simple-card-utils: Move snd_soc_component_is_codec to be local

The helper function snd_soc_component_is_codec is based off the
presence of the non_legacy_dai_naming flag. This isn't super robust
as CPU side components may also specify this flag, and indeed the
kernel already contains a couple that do. After componentisation there
isn't really a totally robust solution to identifying what is a CODEC
driver, without introducing a flag specifically for that purpose, and
really the desirable direction to move in is that the distinction
doesn't matter.

This patch does two things to try to mitigate these problems. Firstly,
now that all the other users of the helper function have been removed,
it makes the helper function local to the driver rather, than being
part of the core. This should help to discourage any new code from
being created that depends on the CODEC driver distinction. Secondly,
it updates the helper function itself to use the endianness flag
rather than the non_legacy_dai_naming flag. The endianness flag is
definitely invalid on a CPU side component, so it a more reliable
indicator that the device is definitely a CODEC. The vast majority of
buses require the CODEC to set the endianness flag, so the number of
corner cases should be fairly minimal. It is worth noting that CODECs
sending audio over SPI, or built into the CPU CODECs are potential
corner cases, however the hope is that in most cases those types of
devices do not consitute a simple audio card.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20220519154318.2153729-57-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-component.h         | 5 -----
 sound/soc/generic/simple-card-utils.c | 7 ++++++-
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc-component.h b/include/sound/soc-component.h
index 5a764c3099d3..5c4cfa70b018 100644
--- a/include/sound/soc-component.h
+++ b/include/sound/soc-component.h
@@ -348,11 +348,6 @@ static inline int snd_soc_component_cache_sync(
 	return regcache_sync(component->regmap);
 }
 
-static inline int snd_soc_component_is_codec(struct snd_soc_component *component)
-{
-	return component->driver->non_legacy_dai_naming;
-}
-
 void snd_soc_component_set_aux(struct snd_soc_component *component,
 			       struct snd_soc_aux_dev *aux);
 int snd_soc_component_init(struct snd_soc_component *component);
diff --git a/sound/soc/generic/simple-card-utils.c b/sound/soc/generic/simple-card-utils.c
index 539d7f081bd7..50a982708933 100644
--- a/sound/soc/generic/simple-card-utils.c
+++ b/sound/soc/generic/simple-card-utils.c
@@ -513,6 +513,11 @@ static int asoc_simple_init_dai(struct snd_soc_dai *dai,
 	return 0;
 }
 
+static inline int asoc_simple_component_is_codec(struct snd_soc_component *component)
+{
+	return component->driver->endianness;
+}
+
 static int asoc_simple_init_dai_link_params(struct snd_soc_pcm_runtime *rtd,
 					    struct simple_dai_props *dai_props)
 {
@@ -524,7 +529,7 @@ static int asoc_simple_init_dai_link_params(struct snd_soc_pcm_runtime *rtd,
 
 	/* Only Codecs */
 	for_each_rtd_components(rtd, i, component) {
-		if (!snd_soc_component_is_codec(component))
+		if (!asoc_simple_component_is_codec(component))
 			return 0;
 	}
 
-- 
cgit 


From 94e0bc317ad241c022a6bb311b3a28b4d51ea8b6 Mon Sep 17 00:00:00 2001
From: Stefan Binding <sbinding@opensource.cirrus.com>
Date: Wed, 25 May 2022 14:16:30 +0100
Subject: ASoC: cs35l41: Move cs35l41 exit hibernate function into shared code

CS35L41 HDA Driver will support hibernation using DSP firmware,
move the exit hibernate function into shared code so this can
be reused.

Acked-by: Charles Keepax <ckeepax@opensource.cirrus.com>

Signed-off-by: Stefan Binding <sbinding@opensource.cirrus.com>
Signed-off-by: Vitaly Rodionov <vitalyr@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20220525131638.5512-10-vitalyr@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/cs35l41.h        |  1 +
 sound/soc/codecs/cs35l41-lib.c | 60 +++++++++++++++++++++++++++++++++++++++++
 sound/soc/codecs/cs35l41.c     | 61 +-----------------------------------------
 3 files changed, 62 insertions(+), 60 deletions(-)

(limited to 'include')

diff --git a/include/sound/cs35l41.h b/include/sound/cs35l41.h
index 8972fa697622..7759f2e14d96 100644
--- a/include/sound/cs35l41.h
+++ b/include/sound/cs35l41.h
@@ -881,6 +881,7 @@ void cs35l41_configure_cs_dsp(struct device *dev, struct regmap *reg, struct cs_
 int cs35l41_set_cspl_mbox_cmd(struct device *dev, struct regmap *regmap,
 			      enum cs35l41_cspl_mbox_cmd cmd);
 int cs35l41_write_fs_errata(struct device *dev, struct regmap *regmap);
+int cs35l41_exit_hibernate(struct device *dev, struct regmap *regmap);
 int cs35l41_init_boost(struct device *dev, struct regmap *regmap,
 		       struct cs35l41_hw_cfg *hw_cfg);
 bool cs35l41_safe_reset(struct regmap *regmap, enum cs35l41_boost_type b_type);
diff --git a/sound/soc/codecs/cs35l41-lib.c b/sound/soc/codecs/cs35l41-lib.c
index 6d3070ea9e06..cc5366c8bdd6 100644
--- a/sound/soc/codecs/cs35l41-lib.c
+++ b/sound/soc/codecs/cs35l41-lib.c
@@ -1321,6 +1321,66 @@ int cs35l41_write_fs_errata(struct device *dev, struct regmap *regmap)
 }
 EXPORT_SYMBOL_GPL(cs35l41_write_fs_errata);
 
+static void cs35l41_wait_for_pwrmgt_sts(struct device *dev, struct regmap *regmap)
+{
+	const int pwrmgt_retries = 10;
+	unsigned int sts;
+	int i, ret;
+
+	for (i = 0; i < pwrmgt_retries; i++) {
+		ret = regmap_read(regmap, CS35L41_PWRMGT_STS, &sts);
+		if (ret)
+			dev_err(dev, "Failed to read PWRMGT_STS: %d\n", ret);
+		else if (!(sts & CS35L41_WR_PEND_STS_MASK))
+			return;
+
+		udelay(20);
+	}
+
+	dev_err(dev, "Timed out reading PWRMGT_STS\n");
+}
+
+int cs35l41_exit_hibernate(struct device *dev, struct regmap *regmap)
+{
+	const int wake_retries = 20;
+	const int sleep_retries = 5;
+	int ret, i, j;
+
+	for (i = 0; i < sleep_retries; i++) {
+		dev_dbg(dev, "Exit hibernate\n");
+
+		for (j = 0; j < wake_retries; j++) {
+			ret = cs35l41_set_cspl_mbox_cmd(dev, regmap,
+							CSPL_MBOX_CMD_OUT_OF_HIBERNATE);
+			if (!ret)
+				break;
+
+			usleep_range(100, 200);
+		}
+
+		if (j < wake_retries) {
+			dev_dbg(dev, "Wake success at cycle: %d\n", j);
+			return 0;
+		}
+
+		dev_err(dev, "Wake failed, re-enter hibernate: %d\n", ret);
+
+		cs35l41_wait_for_pwrmgt_sts(dev, regmap);
+		regmap_write(regmap, CS35L41_WAKESRC_CTL, 0x0088);
+
+		cs35l41_wait_for_pwrmgt_sts(dev, regmap);
+		regmap_write(regmap, CS35L41_WAKESRC_CTL, 0x0188);
+
+		cs35l41_wait_for_pwrmgt_sts(dev, regmap);
+		regmap_write(regmap, CS35L41_PWRMGT_CTL, 0x3);
+	}
+
+	dev_err(dev, "Timed out waking device\n");
+
+	return -ETIMEDOUT;
+}
+EXPORT_SYMBOL_GPL(cs35l41_exit_hibernate);
+
 MODULE_DESCRIPTION("CS35L41 library");
 MODULE_AUTHOR("David Rhodes, Cirrus Logic Inc, <david.rhodes@cirrus.com>");
 MODULE_AUTHOR("Lucas Tanure, Cirrus Logic Inc, <tanureal@opensource.cirrus.com>");
diff --git a/sound/soc/codecs/cs35l41.c b/sound/soc/codecs/cs35l41.c
index 3e68a07a3c8e..be7d02517739 100644
--- a/sound/soc/codecs/cs35l41.c
+++ b/sound/soc/codecs/cs35l41.c
@@ -1351,65 +1351,6 @@ static int __maybe_unused cs35l41_runtime_suspend(struct device *dev)
 	return 0;
 }
 
-static void cs35l41_wait_for_pwrmgt_sts(struct cs35l41_private *cs35l41)
-{
-	const int pwrmgt_retries = 10;
-	unsigned int sts;
-	int i, ret;
-
-	for (i = 0; i < pwrmgt_retries; i++) {
-		ret = regmap_read(cs35l41->regmap, CS35L41_PWRMGT_STS, &sts);
-		if (ret)
-			dev_err(cs35l41->dev, "Failed to read PWRMGT_STS: %d\n", ret);
-		else if (!(sts & CS35L41_WR_PEND_STS_MASK))
-			return;
-
-		udelay(20);
-	}
-
-	dev_err(cs35l41->dev, "Timed out reading PWRMGT_STS\n");
-}
-
-static int cs35l41_exit_hibernate(struct cs35l41_private *cs35l41)
-{
-	const int wake_retries = 20;
-	const int sleep_retries = 5;
-	int ret, i, j;
-
-	for (i = 0; i < sleep_retries; i++) {
-		dev_dbg(cs35l41->dev, "Exit hibernate\n");
-
-		for (j = 0; j < wake_retries; j++) {
-			ret = cs35l41_set_cspl_mbox_cmd(cs35l41->dev, cs35l41->regmap,
-							CSPL_MBOX_CMD_OUT_OF_HIBERNATE);
-			if (!ret)
-				break;
-
-			usleep_range(100, 200);
-		}
-
-		if (j < wake_retries) {
-			dev_dbg(cs35l41->dev, "Wake success at cycle: %d\n", j);
-			return 0;
-		}
-
-		dev_err(cs35l41->dev, "Wake failed, re-enter hibernate: %d\n", ret);
-
-		cs35l41_wait_for_pwrmgt_sts(cs35l41);
-		regmap_write(cs35l41->regmap, CS35L41_WAKESRC_CTL, 0x0088);
-
-		cs35l41_wait_for_pwrmgt_sts(cs35l41);
-		regmap_write(cs35l41->regmap, CS35L41_WAKESRC_CTL, 0x0188);
-
-		cs35l41_wait_for_pwrmgt_sts(cs35l41);
-		regmap_write(cs35l41->regmap, CS35L41_PWRMGT_CTL, 0x3);
-	}
-
-	dev_err(cs35l41->dev, "Timed out waking device\n");
-
-	return -ETIMEDOUT;
-}
-
 static int __maybe_unused cs35l41_runtime_resume(struct device *dev)
 {
 	struct cs35l41_private *cs35l41 = dev_get_drvdata(dev);
@@ -1422,7 +1363,7 @@ static int __maybe_unused cs35l41_runtime_resume(struct device *dev)
 
 	regcache_cache_only(cs35l41->regmap, false);
 
-	ret = cs35l41_exit_hibernate(cs35l41);
+	ret = cs35l41_exit_hibernate(cs35l41->dev, cs35l41->regmap);
 	if (ret)
 		return ret;
 
-- 
cgit 


From e341efc308e5374ded6b471f9e1ec01450bcc93e Mon Sep 17 00:00:00 2001
From: Stefan Binding <sbinding@opensource.cirrus.com>
Date: Wed, 25 May 2022 14:16:32 +0100
Subject: ASoC: cs35l41: Add common cs35l41 enter hibernate function

Since the CS35L41 HDA driver also support hibernation, it
makes sense to move code from the ASoC driver to enter
hibernation into common code.

Since HDA must support laptops which do not support hibernation
due to lack of external boost GPIO it is necessary to
ensure the function returns an error when an unsupported
boost type is in use.

Acked-by: Charles Keepax <ckeepax@opensource.cirrus.com>

Signed-off-by: Stefan Binding <sbinding@opensource.cirrus.com>
Signed-off-by: Vitaly Rodionov <vitalyr@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20220525131638.5512-12-vitalyr@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/cs35l41.h        |  2 ++
 sound/soc/codecs/cs35l41-lib.c | 19 +++++++++++++++++++
 sound/soc/codecs/cs35l41.c     | 10 +---------
 3 files changed, 22 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/sound/cs35l41.h b/include/sound/cs35l41.h
index 7759f2e14d96..a66ef37184fd 100644
--- a/include/sound/cs35l41.h
+++ b/include/sound/cs35l41.h
@@ -881,6 +881,8 @@ void cs35l41_configure_cs_dsp(struct device *dev, struct regmap *reg, struct cs_
 int cs35l41_set_cspl_mbox_cmd(struct device *dev, struct regmap *regmap,
 			      enum cs35l41_cspl_mbox_cmd cmd);
 int cs35l41_write_fs_errata(struct device *dev, struct regmap *regmap);
+int cs35l41_enter_hibernate(struct device *dev, struct regmap *regmap,
+			    enum cs35l41_boost_type b_type);
 int cs35l41_exit_hibernate(struct device *dev, struct regmap *regmap);
 int cs35l41_init_boost(struct device *dev, struct regmap *regmap,
 		       struct cs35l41_hw_cfg *hw_cfg);
diff --git a/sound/soc/codecs/cs35l41-lib.c b/sound/soc/codecs/cs35l41-lib.c
index cc5366c8bdd6..10b754481ca2 100644
--- a/sound/soc/codecs/cs35l41-lib.c
+++ b/sound/soc/codecs/cs35l41-lib.c
@@ -1321,6 +1321,25 @@ int cs35l41_write_fs_errata(struct device *dev, struct regmap *regmap)
 }
 EXPORT_SYMBOL_GPL(cs35l41_write_fs_errata);
 
+int cs35l41_enter_hibernate(struct device *dev, struct regmap *regmap,
+			    enum cs35l41_boost_type b_type)
+{
+	if (!cs35l41_safe_reset(regmap, b_type)) {
+		dev_dbg(dev, "System does not support Suspend\n");
+		return -EINVAL;
+	}
+
+	dev_dbg(dev, "Enter hibernate\n");
+	regmap_write(regmap, CS35L41_WAKESRC_CTL, 0x0088);
+	regmap_write(regmap, CS35L41_WAKESRC_CTL, 0x0188);
+
+	// Don't wait for ACK since bus activity would wake the device
+	regmap_write(regmap, CS35L41_DSP_VIRT1_MBOX_1, CSPL_MBOX_CMD_HIBERNATE);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cs35l41_enter_hibernate);
+
 static void cs35l41_wait_for_pwrmgt_sts(struct device *dev, struct regmap *regmap)
 {
 	const int pwrmgt_retries = 10;
diff --git a/sound/soc/codecs/cs35l41.c b/sound/soc/codecs/cs35l41.c
index be7d02517739..a115ea35b92d 100644
--- a/sound/soc/codecs/cs35l41.c
+++ b/sound/soc/codecs/cs35l41.c
@@ -1335,15 +1335,7 @@ static int __maybe_unused cs35l41_runtime_suspend(struct device *dev)
 	if (!cs35l41->dsp.preloaded || !cs35l41->dsp.cs_dsp.running)
 		return 0;
 
-	dev_dbg(cs35l41->dev, "Enter hibernate\n");
-
-	cs35l41_safe_reset(cs35l41->regmap, cs35l41->hw_cfg.bst_type);
-	regmap_write(cs35l41->regmap, CS35L41_WAKESRC_CTL, 0x0088);
-	regmap_write(cs35l41->regmap, CS35L41_WAKESRC_CTL, 0x0188);
-
-	// Don't wait for ACK since bus activity would wake the device
-	regmap_write(cs35l41->regmap, CS35L41_DSP_VIRT1_MBOX_1,
-		     CSPL_MBOX_CMD_HIBERNATE);
+	cs35l41_enter_hibernate(dev, cs35l41->regmap, cs35l41->hw_cfg.bst_type);
 
 	regcache_cache_only(cs35l41->regmap, true);
 	regcache_mark_dirty(cs35l41->regmap);
-- 
cgit 


From 657f8bd88cb5a968d907fd1c891cee52dc156caa Mon Sep 17 00:00:00 2001
From: Julia Lawall <Julia.Lawall@inria.fr>
Date: Sat, 21 May 2022 13:10:23 +0200
Subject: spi: fix typo in comment

Spelling mistake (triple letters) in comment.
Detected with the help of Coccinelle.

Signed-off-by: Julia Lawall <Julia.Lawall@inria.fr>
Link: https://lore.kernel.org/r/20220521111145.81697-13-Julia.Lawall@inria.fr
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index d361ba26203b..eadfd3ba3cbc 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -1127,7 +1127,7 @@ spi_max_transfer_size(struct spi_device *spi)
 	if (ctlr->max_transfer_size)
 		tr_max = ctlr->max_transfer_size(spi);
 
-	/* transfer size limit must not be greater than messsage size limit */
+	/* transfer size limit must not be greater than message size limit */
 	return min(tr_max, msg_max);
 }
 
-- 
cgit 


From 6598b91b5ac32bc756d7c3000a31f775d4ead1c4 Mon Sep 17 00:00:00 2001
From: David Jander <david@protonic.nl>
Date: Tue, 24 May 2022 11:18:08 +0200
Subject: spi: spi.c: Convert statistics to per-cpu u64_stats_t

This change gives a dramatic performance improvement in the hot path,
since many costly spin_lock_irqsave() calls can be avoided.

On an i.MX8MM system with a MCP2518FD CAN controller connected via SPI,
the time the driver takes to handle interrupts, or in other words the time
the IRQ line of the CAN controller stays low is mainly dominated by the
time it takes to do 3 relatively short sync SPI transfers. The effect of
this patch is a reduction of this time from 136us down to only 98us.

Suggested-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David Jander <david@protonic.nl>
Link: https://lore.kernel.org/r/20220524091808.2269898-1-david@protonic.nl
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi.c       | 143 +++++++++++++++++++++++++++++++++---------------
 include/linux/spi/spi.h |  52 ++++++++++--------
 2 files changed, 127 insertions(+), 68 deletions(-)

(limited to 'include')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index ea09d1b42bf6..d94822bf3cec 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -33,6 +33,7 @@
 #include <linux/idr.h>
 #include <linux/platform_data/x86/apple.h>
 #include <linux/ptp_clock_kernel.h>
+#include <linux/percpu.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/spi.h>
@@ -49,6 +50,7 @@ static void spidev_release(struct device *dev)
 
 	spi_controller_put(spi->controller);
 	kfree(spi->driver_override);
+	free_percpu(spi->pcpu_statistics);
 	kfree(spi);
 }
 
@@ -93,6 +95,47 @@ static ssize_t driver_override_show(struct device *dev,
 }
 static DEVICE_ATTR_RW(driver_override);
 
+static struct spi_statistics *spi_alloc_pcpu_stats(struct device *dev)
+{
+	struct spi_statistics __percpu *pcpu_stats;
+
+	if (dev)
+		pcpu_stats = devm_alloc_percpu(dev, struct spi_statistics);
+	else
+		pcpu_stats = alloc_percpu_gfp(struct spi_statistics, GFP_KERNEL);
+
+	if (pcpu_stats) {
+		int cpu;
+
+		for_each_possible_cpu(cpu) {
+			struct spi_statistics *stat;
+
+			stat = per_cpu_ptr(pcpu_stats, cpu);
+			u64_stats_init(&stat->syncp);
+		}
+	}
+	return pcpu_stats;
+}
+
+#define spi_pcpu_stats_totalize(ret, in, field)				\
+do {									\
+	int i;								\
+	ret = 0;							\
+	for_each_possible_cpu(i) {					\
+		const struct spi_statistics *pcpu_stats;		\
+		u64 inc;						\
+		unsigned int start;					\
+		pcpu_stats = per_cpu_ptr(in, i);			\
+		do {							\
+			start = u64_stats_fetch_begin_irq(		\
+					&pcpu_stats->syncp);		\
+			inc = u64_stats_read(&pcpu_stats->field);	\
+		} while (u64_stats_fetch_retry_irq(			\
+					&pcpu_stats->syncp, start));	\
+		ret += inc;						\
+	}								\
+} while (0)
+
 #define SPI_STATISTICS_ATTRS(field, file)				\
 static ssize_t spi_controller_##field##_show(struct device *dev,	\
 					     struct device_attribute *attr, \
@@ -100,7 +143,7 @@ static ssize_t spi_controller_##field##_show(struct device *dev,	\
 {									\
 	struct spi_controller *ctlr = container_of(dev,			\
 					 struct spi_controller, dev);	\
-	return spi_statistics_##field##_show(&ctlr->statistics, buf);	\
+	return spi_statistics_##field##_show(ctlr->pcpu_statistics, buf); \
 }									\
 static struct device_attribute dev_attr_spi_controller_##field = {	\
 	.attr = { .name = file, .mode = 0444 },				\
@@ -111,47 +154,46 @@ static ssize_t spi_device_##field##_show(struct device *dev,		\
 					char *buf)			\
 {									\
 	struct spi_device *spi = to_spi_device(dev);			\
-	return spi_statistics_##field##_show(&spi->statistics, buf);	\
+	return spi_statistics_##field##_show(spi->pcpu_statistics, buf); \
 }									\
 static struct device_attribute dev_attr_spi_device_##field = {		\
 	.attr = { .name = file, .mode = 0444 },				\
 	.show = spi_device_##field##_show,				\
 }
 
-#define SPI_STATISTICS_SHOW_NAME(name, file, field, format_string)	\
+#define SPI_STATISTICS_SHOW_NAME(name, file, field)			\
 static ssize_t spi_statistics_##name##_show(struct spi_statistics *stat, \
 					    char *buf)			\
 {									\
-	unsigned long flags;						\
 	ssize_t len;							\
-	spin_lock_irqsave(&stat->lock, flags);				\
-	len = sysfs_emit(buf, format_string "\n", stat->field);		\
-	spin_unlock_irqrestore(&stat->lock, flags);			\
+	u64 val;							\
+	spi_pcpu_stats_totalize(val, stat, field);			\
+	len = sysfs_emit(buf, "%llu\n", val);				\
 	return len;							\
 }									\
 SPI_STATISTICS_ATTRS(name, file)
 
-#define SPI_STATISTICS_SHOW(field, format_string)			\
+#define SPI_STATISTICS_SHOW(field)					\
 	SPI_STATISTICS_SHOW_NAME(field, __stringify(field),		\
-				 field, format_string)
+				 field)
 
-SPI_STATISTICS_SHOW(messages, "%lu");
-SPI_STATISTICS_SHOW(transfers, "%lu");
-SPI_STATISTICS_SHOW(errors, "%lu");
-SPI_STATISTICS_SHOW(timedout, "%lu");
+SPI_STATISTICS_SHOW(messages);
+SPI_STATISTICS_SHOW(transfers);
+SPI_STATISTICS_SHOW(errors);
+SPI_STATISTICS_SHOW(timedout);
 
-SPI_STATISTICS_SHOW(spi_sync, "%lu");
-SPI_STATISTICS_SHOW(spi_sync_immediate, "%lu");
-SPI_STATISTICS_SHOW(spi_async, "%lu");
+SPI_STATISTICS_SHOW(spi_sync);
+SPI_STATISTICS_SHOW(spi_sync_immediate);
+SPI_STATISTICS_SHOW(spi_async);
 
-SPI_STATISTICS_SHOW(bytes, "%llu");
-SPI_STATISTICS_SHOW(bytes_rx, "%llu");
-SPI_STATISTICS_SHOW(bytes_tx, "%llu");
+SPI_STATISTICS_SHOW(bytes);
+SPI_STATISTICS_SHOW(bytes_rx);
+SPI_STATISTICS_SHOW(bytes_tx);
 
 #define SPI_STATISTICS_TRANSFER_BYTES_HISTO(index, number)		\
 	SPI_STATISTICS_SHOW_NAME(transfer_bytes_histo##index,		\
 				 "transfer_bytes_histo_" number,	\
-				 transfer_bytes_histo[index],  "%lu")
+				 transfer_bytes_histo[index])
 SPI_STATISTICS_TRANSFER_BYTES_HISTO(0,  "0-1");
 SPI_STATISTICS_TRANSFER_BYTES_HISTO(1,  "2-3");
 SPI_STATISTICS_TRANSFER_BYTES_HISTO(2,  "4-7");
@@ -170,7 +212,7 @@ SPI_STATISTICS_TRANSFER_BYTES_HISTO(14, "16384-32767");
 SPI_STATISTICS_TRANSFER_BYTES_HISTO(15, "32768-65535");
 SPI_STATISTICS_TRANSFER_BYTES_HISTO(16, "65536+");
 
-SPI_STATISTICS_SHOW(transfers_split_maxsize, "%lu");
+SPI_STATISTICS_SHOW(transfers_split_maxsize);
 
 static struct attribute *spi_dev_attrs[] = {
 	&dev_attr_modalias.attr,
@@ -267,30 +309,30 @@ static const struct attribute_group *spi_master_groups[] = {
 	NULL,
 };
 
-static void spi_statistics_add_transfer_stats(struct spi_statistics *stats,
+static void spi_statistics_add_transfer_stats(struct spi_statistics *pcpu_stats,
 					      struct spi_transfer *xfer,
 					      struct spi_controller *ctlr)
 {
-	unsigned long flags;
 	int l2len = min(fls(xfer->len), SPI_STATISTICS_HISTO_SIZE) - 1;
+	struct spi_statistics *stats = this_cpu_ptr(pcpu_stats);
 
 	if (l2len < 0)
 		l2len = 0;
 
-	spin_lock_irqsave(&stats->lock, flags);
+	u64_stats_update_begin(&stats->syncp);
 
-	stats->transfers++;
-	stats->transfer_bytes_histo[l2len]++;
+	u64_stats_inc(&stats->transfers);
+	u64_stats_inc(&stats->transfer_bytes_histo[l2len]);
 
-	stats->bytes += xfer->len;
+	u64_stats_add(&stats->bytes, xfer->len);
 	if ((xfer->tx_buf) &&
 	    (xfer->tx_buf != ctlr->dummy_tx))
-		stats->bytes_tx += xfer->len;
+		u64_stats_add(&stats->bytes_tx, xfer->len);
 	if ((xfer->rx_buf) &&
 	    (xfer->rx_buf != ctlr->dummy_rx))
-		stats->bytes_rx += xfer->len;
+		u64_stats_add(&stats->bytes_rx, xfer->len);
 
-	spin_unlock_irqrestore(&stats->lock, flags);
+	u64_stats_update_end(&stats->syncp);
 }
 
 /*
@@ -519,14 +561,19 @@ struct spi_device *spi_alloc_device(struct spi_controller *ctlr)
 		return NULL;
 	}
 
+	spi->pcpu_statistics = spi_alloc_pcpu_stats(NULL);
+	if (!spi->pcpu_statistics) {
+		kfree(spi);
+		spi_controller_put(ctlr);
+		return NULL;
+	}
+
 	spi->master = spi->controller = ctlr;
 	spi->dev.parent = &ctlr->dev;
 	spi->dev.bus = &spi_bus_type;
 	spi->dev.release = spidev_release;
 	spi->mode = ctlr->buswidth_override_bits;
 
-	spin_lock_init(&spi->statistics.lock);
-
 	device_initialize(&spi->dev);
 	return spi;
 }
@@ -1225,8 +1272,8 @@ static int spi_transfer_wait(struct spi_controller *ctlr,
 			     struct spi_message *msg,
 			     struct spi_transfer *xfer)
 {
-	struct spi_statistics *statm = &ctlr->statistics;
-	struct spi_statistics *stats = &msg->spi->statistics;
+	struct spi_statistics *statm = ctlr->pcpu_statistics;
+	struct spi_statistics *stats = msg->spi->pcpu_statistics;
 	u32 speed_hz = xfer->speed_hz;
 	unsigned long long ms;
 
@@ -1382,8 +1429,8 @@ static int spi_transfer_one_message(struct spi_controller *ctlr,
 	struct spi_transfer *xfer;
 	bool keep_cs = false;
 	int ret = 0;
-	struct spi_statistics *statm = &ctlr->statistics;
-	struct spi_statistics *stats = &msg->spi->statistics;
+	struct spi_statistics *statm = ctlr->pcpu_statistics;
+	struct spi_statistics *stats = msg->spi->pcpu_statistics;
 
 	spi_set_cs(msg->spi, true, false);
 
@@ -3029,7 +3076,11 @@ int spi_register_controller(struct spi_controller *ctlr)
 		}
 	}
 	/* add statistics */
-	spin_lock_init(&ctlr->statistics.lock);
+	ctlr->pcpu_statistics = spi_alloc_pcpu_stats(dev);
+	if (!ctlr->pcpu_statistics) {
+		dev_err(dev, "Error allocating per-cpu statistics\n");
+		goto destroy_queue;
+	}
 
 	mutex_lock(&board_lock);
 	list_add_tail(&ctlr->list, &spi_controller_list);
@@ -3042,6 +3093,8 @@ int spi_register_controller(struct spi_controller *ctlr)
 	acpi_register_spi_devices(ctlr);
 	return status;
 
+destroy_queue:
+	spi_destroy_queue(ctlr);
 free_bus_id:
 	mutex_lock(&board_lock);
 	idr_remove(&spi_master_idr, ctlr->bus_num);
@@ -3367,9 +3420,9 @@ static int __spi_split_transfer_maxsize(struct spi_controller *ctlr,
 	*xferp = &xfers[count - 1];
 
 	/* increment statistics counters */
-	SPI_STATISTICS_INCREMENT_FIELD(&ctlr->statistics,
+	SPI_STATISTICS_INCREMENT_FIELD(ctlr->pcpu_statistics,
 				       transfers_split_maxsize);
-	SPI_STATISTICS_INCREMENT_FIELD(&msg->spi->statistics,
+	SPI_STATISTICS_INCREMENT_FIELD(msg->spi->pcpu_statistics,
 				       transfers_split_maxsize);
 
 	return 0;
@@ -3760,8 +3813,8 @@ static int __spi_async(struct spi_device *spi, struct spi_message *message)
 
 	message->spi = spi;
 
-	SPI_STATISTICS_INCREMENT_FIELD(&ctlr->statistics, spi_async);
-	SPI_STATISTICS_INCREMENT_FIELD(&spi->statistics, spi_async);
+	SPI_STATISTICS_INCREMENT_FIELD(ctlr->pcpu_statistics, spi_async);
+	SPI_STATISTICS_INCREMENT_FIELD(spi->pcpu_statistics, spi_async);
 
 	trace_spi_message_submit(message);
 
@@ -3908,8 +3961,8 @@ static int __spi_sync(struct spi_device *spi, struct spi_message *message)
 	message->context = &done;
 	message->spi = spi;
 
-	SPI_STATISTICS_INCREMENT_FIELD(&ctlr->statistics, spi_sync);
-	SPI_STATISTICS_INCREMENT_FIELD(&spi->statistics, spi_sync);
+	SPI_STATISTICS_INCREMENT_FIELD(ctlr->pcpu_statistics, spi_sync);
+	SPI_STATISTICS_INCREMENT_FIELD(spi->pcpu_statistics, spi_sync);
 
 	/*
 	 * If we're not using the legacy transfer method then we will
@@ -3932,9 +3985,9 @@ static int __spi_sync(struct spi_device *spi, struct spi_message *message)
 	if (status == 0) {
 		/* Push out the messages in the calling context if we can */
 		if (ctlr->transfer == spi_queued_transfer) {
-			SPI_STATISTICS_INCREMENT_FIELD(&ctlr->statistics,
+			SPI_STATISTICS_INCREMENT_FIELD(ctlr->pcpu_statistics,
 						       spi_sync_immediate);
-			SPI_STATISTICS_INCREMENT_FIELD(&spi->statistics,
+			SPI_STATISTICS_INCREMENT_FIELD(spi->pcpu_statistics,
 						       spi_sync_immediate);
 			__spi_pump_messages(ctlr, false);
 		}
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index eadfd3ba3cbc..eac8d3caf954 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -17,6 +17,7 @@
 
 #include <uapi/linux/spi/spi.h>
 #include <linux/acpi.h>
+#include <linux/u64_stats_sync.h>
 
 struct dma_chan;
 struct software_node;
@@ -59,37 +60,42 @@ extern struct bus_type spi_bus_type;
  *                 maxsize limit
  */
 struct spi_statistics {
-	spinlock_t		lock; /* lock for the whole structure */
+	struct u64_stats_sync	syncp;
 
-	unsigned long		messages;
-	unsigned long		transfers;
-	unsigned long		errors;
-	unsigned long		timedout;
+	u64_stats_t		messages;
+	u64_stats_t		transfers;
+	u64_stats_t		errors;
+	u64_stats_t		timedout;
 
-	unsigned long		spi_sync;
-	unsigned long		spi_sync_immediate;
-	unsigned long		spi_async;
+	u64_stats_t		spi_sync;
+	u64_stats_t		spi_sync_immediate;
+	u64_stats_t		spi_async;
 
-	unsigned long long	bytes;
-	unsigned long long	bytes_rx;
-	unsigned long long	bytes_tx;
+	u64_stats_t		bytes;
+	u64_stats_t		bytes_rx;
+	u64_stats_t		bytes_tx;
 
 #define SPI_STATISTICS_HISTO_SIZE 17
-	unsigned long transfer_bytes_histo[SPI_STATISTICS_HISTO_SIZE];
+	u64_stats_t	transfer_bytes_histo[SPI_STATISTICS_HISTO_SIZE];
 
-	unsigned long transfers_split_maxsize;
+	u64_stats_t	transfers_split_maxsize;
 };
 
-#define SPI_STATISTICS_ADD_TO_FIELD(stats, field, count)	\
-	do {							\
-		unsigned long flags;				\
-		spin_lock_irqsave(&(stats)->lock, flags);	\
-		(stats)->field += count;			\
-		spin_unlock_irqrestore(&(stats)->lock, flags);	\
+#define SPI_STATISTICS_ADD_TO_FIELD(pcpu_stats, field, count)		\
+	do {								\
+		struct spi_statistics *__lstats = this_cpu_ptr(pcpu_stats); \
+		u64_stats_update_begin(&__lstats->syncp);		\
+		u64_stats_add(&__lstats->field, count);			\
+		u64_stats_update_end(&__lstats->syncp);			\
 	} while (0)
 
-#define SPI_STATISTICS_INCREMENT_FIELD(stats, field)	\
-	SPI_STATISTICS_ADD_TO_FIELD(stats, field, 1)
+#define SPI_STATISTICS_INCREMENT_FIELD(pcpu_stats, field)		\
+	do {								\
+		struct spi_statistics *__lstats = this_cpu_ptr(pcpu_stats); \
+		u64_stats_update_begin(&__lstats->syncp);		\
+		u64_stats_inc(&__lstats->field);			\
+		u64_stats_update_end(&__lstats->syncp);			\
+	} while (0)
 
 /**
  * struct spi_delay - SPI delay information
@@ -194,7 +200,7 @@ struct spi_device {
 	struct spi_delay	cs_inactive;
 
 	/* the statistics */
-	struct spi_statistics	statistics;
+	struct spi_statistics __percpu	*pcpu_statistics;
 
 	/*
 	 * likely need more hooks for more protocol options affecting how
@@ -647,7 +653,7 @@ struct spi_controller {
 	s8			max_native_cs;
 
 	/* statistics */
-	struct spi_statistics	statistics;
+	struct spi_statistics __percpu	*pcpu_statistics;
 
 	/* DMA channels for use with core dmaengine helpers */
 	struct dma_chan		*dma_tx;
-- 
cgit 


From fc602b4f692cb83c937b5f79628bca32b60c4402 Mon Sep 17 00:00:00 2001
From: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Date: Sat, 4 Jun 2022 12:32:50 +0100
Subject: mtd: spinand: Add support for ATO25D1GA

Add support for the ATO25D1GA SPI NAND flash.

Datasheet:
- https://atta.szlcsc.com/upload/public/pdf/source/20191212/C469320_04599D67B03B078044EB65FF5AEDDDE9.pdf

Signed-off-by: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220604113250.4745-1-aidanmacdonald.0x0@gmail.com
---
 drivers/mtd/nand/spi/Makefile |  2 +-
 drivers/mtd/nand/spi/ato.c    | 86 +++++++++++++++++++++++++++++++++++++++++++
 drivers/mtd/nand/spi/core.c   |  1 +
 include/linux/mtd/spinand.h   |  1 +
 4 files changed, 89 insertions(+), 1 deletion(-)
 create mode 100644 drivers/mtd/nand/spi/ato.c

(limited to 'include')

diff --git a/drivers/mtd/nand/spi/Makefile b/drivers/mtd/nand/spi/Makefile
index 80dabe6ff0f3..b520fe634041 100644
--- a/drivers/mtd/nand/spi/Makefile
+++ b/drivers/mtd/nand/spi/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
-spinand-objs := core.o gigadevice.o macronix.o micron.o paragon.o toshiba.o winbond.o xtx.o
+spinand-objs := core.o ato.o gigadevice.o macronix.o micron.o paragon.o toshiba.o winbond.o xtx.o
 obj-$(CONFIG_MTD_SPI_NAND) += spinand.o
diff --git a/drivers/mtd/nand/spi/ato.c b/drivers/mtd/nand/spi/ato.c
new file mode 100644
index 000000000000..82b377c06812
--- /dev/null
+++ b/drivers/mtd/nand/spi/ato.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022 Aidan MacDonald
+ *
+ * Author: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
+ */
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/mtd/spinand.h>
+
+
+#define SPINAND_MFR_ATO		0x9b
+
+
+static SPINAND_OP_VARIANTS(read_cache_variants,
+		SPINAND_PAGE_READ_FROM_CACHE_X4_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_OP(true, 0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_OP(false, 0, 1, NULL, 0));
+
+static SPINAND_OP_VARIANTS(write_cache_variants,
+		SPINAND_PROG_LOAD_X4(true, 0, NULL, 0),
+		SPINAND_PROG_LOAD(true, 0, NULL, 0));
+
+static SPINAND_OP_VARIANTS(update_cache_variants,
+		SPINAND_PROG_LOAD_X4(false, 0, NULL, 0),
+		SPINAND_PROG_LOAD(false, 0, NULL, 0));
+
+
+static int ato25d1ga_ooblayout_ecc(struct mtd_info *mtd, int section,
+				   struct mtd_oob_region *region)
+{
+	if (section > 3)
+		return -ERANGE;
+
+	region->offset = (16 * section) + 8;
+	region->length = 8;
+	return 0;
+}
+
+static int ato25d1ga_ooblayout_free(struct mtd_info *mtd, int section,
+				   struct mtd_oob_region *region)
+{
+	if (section > 3)
+		return -ERANGE;
+
+	if (section) {
+		region->offset = (16 * section);
+		region->length = 8;
+	} else {
+		/* first byte of section 0 is reserved for the BBM */
+		region->offset = 1;
+		region->length = 7;
+	}
+
+	return 0;
+}
+
+static const struct mtd_ooblayout_ops ato25d1ga_ooblayout = {
+	.ecc = ato25d1ga_ooblayout_ecc,
+	.free = ato25d1ga_ooblayout_free,
+};
+
+
+static const struct spinand_info ato_spinand_table[] = {
+	SPINAND_INFO("ATO25D1GA",
+		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0x12),
+		     NAND_MEMORG(1, 2048, 64, 64, 1024, 20, 1, 1, 1),
+		     NAND_ECCREQ(1, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     SPINAND_HAS_QE_BIT,
+		     SPINAND_ECCINFO(&ato25d1ga_ooblayout, NULL)),
+};
+
+static const struct spinand_manufacturer_ops ato_spinand_manuf_ops = {
+};
+
+const struct spinand_manufacturer ato_spinand_manufacturer = {
+	.id = SPINAND_MFR_ATO,
+	.name = "ATO",
+	.chips = ato_spinand_table,
+	.nchips = ARRAY_SIZE(ato_spinand_table),
+	.ops = &ato_spinand_manuf_ops,
+};
diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c
index d5b685d1605e..9d73910a7ae8 100644
--- a/drivers/mtd/nand/spi/core.c
+++ b/drivers/mtd/nand/spi/core.c
@@ -927,6 +927,7 @@ static const struct nand_ops spinand_ops = {
 };
 
 static const struct spinand_manufacturer *spinand_manufacturers[] = {
+	&ato_spinand_manufacturer,
 	&gigadevice_spinand_manufacturer,
 	&macronix_spinand_manufacturer,
 	&micron_spinand_manufacturer,
diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index 5584d3bb6556..6d3392a7edc6 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -260,6 +260,7 @@ struct spinand_manufacturer {
 };
 
 /* SPI NAND manufacturers */
+extern const struct spinand_manufacturer ato_spinand_manufacturer;
 extern const struct spinand_manufacturer gigadevice_spinand_manufacturer;
 extern const struct spinand_manufacturer macronix_spinand_manufacturer;
 extern const struct spinand_manufacturer micron_spinand_manufacturer;
-- 
cgit 


From 39d649602be2ceb3f8f8e98483d09e4d71133c6a Mon Sep 17 00:00:00 2001
From: Clément Léger <clement.leger@bootlin.com>
Date: Wed, 1 Jun 2022 10:17:58 +0200
Subject: of: constify of_property_check_flags() prop argument
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This argument is not modified and thus can be set as const.

Signed-off-by: Clément Léger <clement.leger@bootlin.com>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20220601081801.348571-2-clement.leger@bootlin.com
---
 include/linux/of.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/of.h b/include/linux/of.h
index f0a5d6b10c5a..d74fd82a6963 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -207,7 +207,7 @@ static inline void of_node_clear_flag(struct device_node *n, unsigned long flag)
 }
 
 #if defined(CONFIG_OF_DYNAMIC) || defined(CONFIG_SPARC)
-static inline int of_property_check_flag(struct property *p, unsigned long flag)
+static inline int of_property_check_flag(const struct property *p, unsigned long flag)
 {
 	return test_bit(flag, &p->_flags);
 }
@@ -814,7 +814,8 @@ static inline void of_node_clear_flag(struct device_node *n, unsigned long flag)
 {
 }
 
-static inline int of_property_check_flag(struct property *p, unsigned long flag)
+static inline int of_property_check_flag(const struct property *p,
+					 unsigned long flag)
 {
 	return 0;
 }
-- 
cgit 


From 11fe58c4450a8108b498d2f849976ba2686820fc Mon Sep 17 00:00:00 2001
From: Bard Liao <yung-chuan.liao@linux.intel.com>
Date: Mon, 6 Jun 2022 15:46:20 -0500
Subject: ASoC: SOF: Intel: add MeteorLake machines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for MeteorLake (MTL) machines support, starting with mockup
devices.

Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Link: https://lore.kernel.org/r/20220606204622.144424-2-pierre-louis.bossart@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-acpi-intel-match.h              |  2 ++
 sound/soc/intel/common/Makefile                   |  1 +
 sound/soc/intel/common/soc-acpi-intel-mtl-match.c | 41 +++++++++++++++++++++++
 3 files changed, 44 insertions(+)
 create mode 100644 sound/soc/intel/common/soc-acpi-intel-mtl-match.c

(limited to 'include')

diff --git a/include/sound/soc-acpi-intel-match.h b/include/sound/soc-acpi-intel-match.h
index 59551b1f22f3..bc7fd46ec2bc 100644
--- a/include/sound/soc-acpi-intel-match.h
+++ b/include/sound/soc-acpi-intel-match.h
@@ -30,6 +30,7 @@ extern struct snd_soc_acpi_mach snd_soc_acpi_intel_tgl_machines[];
 extern struct snd_soc_acpi_mach snd_soc_acpi_intel_ehl_machines[];
 extern struct snd_soc_acpi_mach snd_soc_acpi_intel_jsl_machines[];
 extern struct snd_soc_acpi_mach snd_soc_acpi_intel_adl_machines[];
+extern struct snd_soc_acpi_mach snd_soc_acpi_intel_mtl_machines[];
 
 extern struct snd_soc_acpi_mach snd_soc_acpi_intel_cnl_sdw_machines[];
 extern struct snd_soc_acpi_mach snd_soc_acpi_intel_cfl_sdw_machines[];
@@ -37,6 +38,7 @@ extern struct snd_soc_acpi_mach snd_soc_acpi_intel_cml_sdw_machines[];
 extern struct snd_soc_acpi_mach snd_soc_acpi_intel_icl_sdw_machines[];
 extern struct snd_soc_acpi_mach snd_soc_acpi_intel_tgl_sdw_machines[];
 extern struct snd_soc_acpi_mach snd_soc_acpi_intel_adl_sdw_machines[];
+extern struct snd_soc_acpi_mach snd_soc_acpi_intel_mtl_sdw_machines[];
 
 /*
  * generic table used for HDA codec-based platforms, possibly with
diff --git a/sound/soc/intel/common/Makefile b/sound/soc/intel/common/Makefile
index fef0b2d1de68..8ca8f872ec80 100644
--- a/sound/soc/intel/common/Makefile
+++ b/sound/soc/intel/common/Makefile
@@ -9,6 +9,7 @@ snd-soc-acpi-intel-match-objs := soc-acpi-intel-byt-match.o soc-acpi-intel-cht-m
 	soc-acpi-intel-cml-match.o soc-acpi-intel-icl-match.o \
 	soc-acpi-intel-tgl-match.o soc-acpi-intel-ehl-match.o \
 	soc-acpi-intel-jsl-match.o soc-acpi-intel-adl-match.o \
+	soc-acpi-intel-mtl-match.o \
 	soc-acpi-intel-hda-match.o \
 	soc-acpi-intel-sdw-mockup-match.o
 
diff --git a/sound/soc/intel/common/soc-acpi-intel-mtl-match.c b/sound/soc/intel/common/soc-acpi-intel-mtl-match.c
new file mode 100644
index 000000000000..cc594b27e03b
--- /dev/null
+++ b/sound/soc/intel/common/soc-acpi-intel-mtl-match.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * soc-acpi-intel-mtl-match.c - tables and support for MTL ACPI enumeration.
+ *
+ * Copyright (c) 2022, Intel Corporation.
+ *
+ */
+
+#include <sound/soc-acpi.h>
+#include <sound/soc-acpi-intel-match.h>
+#include "soc-acpi-intel-sdw-mockup-match.h"
+
+struct snd_soc_acpi_mach snd_soc_acpi_intel_mtl_machines[] = {
+	{},
+};
+EXPORT_SYMBOL_GPL(snd_soc_acpi_intel_mtl_machines);
+
+/* this table is used when there is no I2S codec present */
+struct snd_soc_acpi_mach snd_soc_acpi_intel_mtl_sdw_machines[] = {
+	/* mockup tests need to be first */
+	{
+		.link_mask = GENMASK(3, 0),
+		.links = sdw_mockup_headset_2amps_mic,
+		.drv_name = "sof_sdw",
+		.sof_tplg_filename = "sof-mtl-rt711-rt1308-rt715.tplg",
+	},
+	{
+		.link_mask = BIT(0) | BIT(1) | BIT(3),
+		.links = sdw_mockup_headset_1amp_mic,
+		.drv_name = "sof_sdw",
+		.sof_tplg_filename = "sof-mtl-rt711-rt1308-mono-rt715.tplg",
+	},
+	{
+		.link_mask = GENMASK(2, 0),
+		.links = sdw_mockup_mic_headset_1amp,
+		.drv_name = "sof_sdw",
+		.sof_tplg_filename = "sof-mtl-rt715-rt711-rt1308-mono.tplg",
+	},
+	{},
+};
+EXPORT_SYMBOL_GPL(snd_soc_acpi_intel_mtl_sdw_machines);
-- 
cgit 


From ff8372a467fa28752610f57a6512c5365413faa2 Mon Sep 17 00:00:00 2001
From: Menglong Dong <imagedong@tencent.com>
Date: Mon, 6 Jun 2022 10:24:34 +0800
Subject: net: skb: move enum skb_drop_reason to standalone header file

As the skb drop reasons are getting more and more, move the enum
'skb_drop_reason' and related function to the standalone header
'dropreason.h', as Jakub Kicinski suggested.

Signed-off-by: Menglong Dong <imagedong@tencent.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/skbuff.h   | 179 +--------------------------------------------
 include/net/dropreason.h | 184 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 185 insertions(+), 178 deletions(-)
 create mode 100644 include/net/dropreason.h

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d3d10556f0fa..82edf0359ab3 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -43,6 +43,7 @@
 #include <linux/netfilter/nf_conntrack_common.h>
 #endif
 #include <net/net_debug.h>
+#include <net/dropreason.h>
 
 /**
  * DOC: skb checksums
@@ -337,184 +338,6 @@ struct sk_buff_head {
 
 struct sk_buff;
 
-/* The reason of skb drop, which is used in kfree_skb_reason().
- * en...maybe they should be splited by group?
- *
- * Each item here should also be in 'TRACE_SKB_DROP_REASON', which is
- * used to translate the reason to string.
- */
-enum skb_drop_reason {
-	SKB_NOT_DROPPED_YET = 0,
-	SKB_DROP_REASON_NOT_SPECIFIED,	/* drop reason is not specified */
-	SKB_DROP_REASON_NO_SOCKET,	/* socket not found */
-	SKB_DROP_REASON_PKT_TOO_SMALL,	/* packet size is too small */
-	SKB_DROP_REASON_TCP_CSUM,	/* TCP checksum error */
-	SKB_DROP_REASON_SOCKET_FILTER,	/* dropped by socket filter */
-	SKB_DROP_REASON_UDP_CSUM,	/* UDP checksum error */
-	SKB_DROP_REASON_NETFILTER_DROP,	/* dropped by netfilter */
-	SKB_DROP_REASON_OTHERHOST,	/* packet don't belong to current
-					 * host (interface is in promisc
-					 * mode)
-					 */
-	SKB_DROP_REASON_IP_CSUM,	/* IP checksum error */
-	SKB_DROP_REASON_IP_INHDR,	/* there is something wrong with
-					 * IP header (see
-					 * IPSTATS_MIB_INHDRERRORS)
-					 */
-	SKB_DROP_REASON_IP_RPFILTER,	/* IP rpfilter validate failed.
-					 * see the document for rp_filter
-					 * in ip-sysctl.rst for more
-					 * information
-					 */
-	SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST, /* destination address of L2
-						  * is multicast, but L3 is
-						  * unicast.
-						  */
-	SKB_DROP_REASON_XFRM_POLICY,	/* xfrm policy check failed */
-	SKB_DROP_REASON_IP_NOPROTO,	/* no support for IP protocol */
-	SKB_DROP_REASON_SOCKET_RCVBUFF,	/* socket receive buff is full */
-	SKB_DROP_REASON_PROTO_MEM,	/* proto memory limition, such as
-					 * udp packet drop out of
-					 * udp_memory_allocated.
-					 */
-	SKB_DROP_REASON_TCP_MD5NOTFOUND,	/* no MD5 hash and one
-						 * expected, corresponding
-						 * to LINUX_MIB_TCPMD5NOTFOUND
-						 */
-	SKB_DROP_REASON_TCP_MD5UNEXPECTED,	/* MD5 hash and we're not
-						 * expecting one, corresponding
-						 * to LINUX_MIB_TCPMD5UNEXPECTED
-						 */
-	SKB_DROP_REASON_TCP_MD5FAILURE,	/* MD5 hash and its wrong,
-					 * corresponding to
-					 * LINUX_MIB_TCPMD5FAILURE
-					 */
-	SKB_DROP_REASON_SOCKET_BACKLOG,	/* failed to add skb to socket
-					 * backlog (see
-					 * LINUX_MIB_TCPBACKLOGDROP)
-					 */
-	SKB_DROP_REASON_TCP_FLAGS,	/* TCP flags invalid */
-	SKB_DROP_REASON_TCP_ZEROWINDOW,	/* TCP receive window size is zero,
-					 * see LINUX_MIB_TCPZEROWINDOWDROP
-					 */
-	SKB_DROP_REASON_TCP_OLD_DATA,	/* the TCP data reveived is already
-					 * received before (spurious retrans
-					 * may happened), see
-					 * LINUX_MIB_DELAYEDACKLOST
-					 */
-	SKB_DROP_REASON_TCP_OVERWINDOW,	/* the TCP data is out of window,
-					 * the seq of the first byte exceed
-					 * the right edges of receive
-					 * window
-					 */
-	SKB_DROP_REASON_TCP_OFOMERGE,	/* the data of skb is already in
-					 * the ofo queue, corresponding to
-					 * LINUX_MIB_TCPOFOMERGE
-					 */
-	SKB_DROP_REASON_TCP_RFC7323_PAWS, /* PAWS check, corresponding to
-					   * LINUX_MIB_PAWSESTABREJECTED
-					   */
-	SKB_DROP_REASON_TCP_INVALID_SEQUENCE, /* Not acceptable SEQ field */
-	SKB_DROP_REASON_TCP_RESET,	/* Invalid RST packet */
-	SKB_DROP_REASON_TCP_INVALID_SYN, /* Incoming packet has unexpected SYN flag */
-	SKB_DROP_REASON_TCP_CLOSE,	/* TCP socket in CLOSE state */
-	SKB_DROP_REASON_TCP_FASTOPEN,	/* dropped by FASTOPEN request socket */
-	SKB_DROP_REASON_TCP_OLD_ACK,	/* TCP ACK is old, but in window */
-	SKB_DROP_REASON_TCP_TOO_OLD_ACK, /* TCP ACK is too old */
-	SKB_DROP_REASON_TCP_ACK_UNSENT_DATA, /* TCP ACK for data we haven't sent yet */
-	SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE, /* pruned from TCP OFO queue */
-	SKB_DROP_REASON_TCP_OFO_DROP,	/* data already in receive queue */
-	SKB_DROP_REASON_IP_OUTNOROUTES,	/* route lookup failed */
-	SKB_DROP_REASON_BPF_CGROUP_EGRESS,	/* dropped by
-						 * BPF_PROG_TYPE_CGROUP_SKB
-						 * eBPF program
-						 */
-	SKB_DROP_REASON_IPV6DISABLED,	/* IPv6 is disabled on the device */
-	SKB_DROP_REASON_NEIGH_CREATEFAIL,	/* failed to create neigh
-						 * entry
-						 */
-	SKB_DROP_REASON_NEIGH_FAILED,	/* neigh entry in failed state */
-	SKB_DROP_REASON_NEIGH_QUEUEFULL,	/* arp_queue for neigh
-						 * entry is full
-						 */
-	SKB_DROP_REASON_NEIGH_DEAD,	/* neigh entry is dead */
-	SKB_DROP_REASON_TC_EGRESS,	/* dropped in TC egress HOOK */
-	SKB_DROP_REASON_QDISC_DROP,	/* dropped by qdisc when packet
-					 * outputting (failed to enqueue to
-					 * current qdisc)
-					 */
-	SKB_DROP_REASON_CPU_BACKLOG,	/* failed to enqueue the skb to
-					 * the per CPU backlog queue. This
-					 * can be caused by backlog queue
-					 * full (see netdev_max_backlog in
-					 * net.rst) or RPS flow limit
-					 */
-	SKB_DROP_REASON_XDP,		/* dropped by XDP in input path */
-	SKB_DROP_REASON_TC_INGRESS,	/* dropped in TC ingress HOOK */
-	SKB_DROP_REASON_UNHANDLED_PROTO,	/* protocol not implemented
-						 * or not supported
-						 */
-	SKB_DROP_REASON_SKB_CSUM,	/* sk_buff checksum computation
-					 * error
-					 */
-	SKB_DROP_REASON_SKB_GSO_SEG,	/* gso segmentation error */
-	SKB_DROP_REASON_SKB_UCOPY_FAULT,	/* failed to copy data from
-						 * user space, e.g., via
-						 * zerocopy_sg_from_iter()
-						 * or skb_orphan_frags_rx()
-						 */
-	SKB_DROP_REASON_DEV_HDR,	/* device driver specific
-					 * header/metadata is invalid
-					 */
-	/* the device is not ready to xmit/recv due to any of its data
-	 * structure that is not up/ready/initialized, e.g., the IFF_UP is
-	 * not set, or driver specific tun->tfiles[txq] is not initialized
-	 */
-	SKB_DROP_REASON_DEV_READY,
-	SKB_DROP_REASON_FULL_RING,	/* ring buffer is full */
-	SKB_DROP_REASON_NOMEM,		/* error due to OOM */
-	SKB_DROP_REASON_HDR_TRUNC,      /* failed to trunc/extract the header
-					 * from networking data, e.g., failed
-					 * to pull the protocol header from
-					 * frags via pskb_may_pull()
-					 */
-	SKB_DROP_REASON_TAP_FILTER,     /* dropped by (ebpf) filter directly
-					 * attached to tun/tap, e.g., via
-					 * TUNSETFILTEREBPF
-					 */
-	SKB_DROP_REASON_TAP_TXFILTER,	/* dropped by tx filter implemented
-					 * at tun/tap, e.g., check_filter()
-					 */
-	SKB_DROP_REASON_ICMP_CSUM,	/* ICMP checksum error */
-	SKB_DROP_REASON_INVALID_PROTO,	/* the packet doesn't follow RFC
-					 * 2211, such as a broadcasts
-					 * ICMP_TIMESTAMP
-					 */
-	SKB_DROP_REASON_IP_INADDRERRORS,	/* host unreachable, corresponding
-						 * to IPSTATS_MIB_INADDRERRORS
-						 */
-	SKB_DROP_REASON_IP_INNOROUTES,	/* network unreachable, corresponding
-					 * to IPSTATS_MIB_INADDRERRORS
-					 */
-	SKB_DROP_REASON_PKT_TOO_BIG,	/* packet size is too big (maybe exceed
-					 * the MTU)
-					 */
-	SKB_DROP_REASON_MAX,
-};
-
-#define SKB_DR_INIT(name, reason)				\
-	enum skb_drop_reason name = SKB_DROP_REASON_##reason
-#define SKB_DR(name)						\
-	SKB_DR_INIT(name, NOT_SPECIFIED)
-#define SKB_DR_SET(name, reason)				\
-	(name = SKB_DROP_REASON_##reason)
-#define SKB_DR_OR(name, reason)					\
-	do {							\
-		if (name == SKB_DROP_REASON_NOT_SPECIFIED ||	\
-		    name == SKB_NOT_DROPPED_YET)		\
-			SKB_DR_SET(name, reason);		\
-	} while (0)
-
 /* To allow 64K frame to be packed as single skb without frag_list we
  * require 64K/PAGE_SIZE pages plus 1 additional page to allow for
  * buffers which do not start on a page boundary.
diff --git a/include/net/dropreason.h b/include/net/dropreason.h
new file mode 100644
index 000000000000..ecd18b7b1364
--- /dev/null
+++ b/include/net/dropreason.h
@@ -0,0 +1,184 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _LINUX_DROPREASON_H
+#define _LINUX_DROPREASON_H
+
+/* The reason of skb drop, which is used in kfree_skb_reason().
+ * en...maybe they should be splited by group?
+ *
+ * Each item here should also be in 'TRACE_SKB_DROP_REASON', which is
+ * used to translate the reason to string.
+ */
+enum skb_drop_reason {
+	SKB_NOT_DROPPED_YET = 0,
+	SKB_DROP_REASON_NOT_SPECIFIED,	/* drop reason is not specified */
+	SKB_DROP_REASON_NO_SOCKET,	/* socket not found */
+	SKB_DROP_REASON_PKT_TOO_SMALL,	/* packet size is too small */
+	SKB_DROP_REASON_TCP_CSUM,	/* TCP checksum error */
+	SKB_DROP_REASON_SOCKET_FILTER,	/* dropped by socket filter */
+	SKB_DROP_REASON_UDP_CSUM,	/* UDP checksum error */
+	SKB_DROP_REASON_NETFILTER_DROP,	/* dropped by netfilter */
+	SKB_DROP_REASON_OTHERHOST,	/* packet don't belong to current
+					 * host (interface is in promisc
+					 * mode)
+					 */
+	SKB_DROP_REASON_IP_CSUM,	/* IP checksum error */
+	SKB_DROP_REASON_IP_INHDR,	/* there is something wrong with
+					 * IP header (see
+					 * IPSTATS_MIB_INHDRERRORS)
+					 */
+	SKB_DROP_REASON_IP_RPFILTER,	/* IP rpfilter validate failed.
+					 * see the document for rp_filter
+					 * in ip-sysctl.rst for more
+					 * information
+					 */
+	SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST, /* destination address of L2
+						  * is multicast, but L3 is
+						  * unicast.
+						  */
+	SKB_DROP_REASON_XFRM_POLICY,	/* xfrm policy check failed */
+	SKB_DROP_REASON_IP_NOPROTO,	/* no support for IP protocol */
+	SKB_DROP_REASON_SOCKET_RCVBUFF,	/* socket receive buff is full */
+	SKB_DROP_REASON_PROTO_MEM,	/* proto memory limition, such as
+					 * udp packet drop out of
+					 * udp_memory_allocated.
+					 */
+	SKB_DROP_REASON_TCP_MD5NOTFOUND,	/* no MD5 hash and one
+						 * expected, corresponding
+						 * to LINUX_MIB_TCPMD5NOTFOUND
+						 */
+	SKB_DROP_REASON_TCP_MD5UNEXPECTED,	/* MD5 hash and we're not
+						 * expecting one, corresponding
+						 * to LINUX_MIB_TCPMD5UNEXPECTED
+						 */
+	SKB_DROP_REASON_TCP_MD5FAILURE,	/* MD5 hash and its wrong,
+					 * corresponding to
+					 * LINUX_MIB_TCPMD5FAILURE
+					 */
+	SKB_DROP_REASON_SOCKET_BACKLOG,	/* failed to add skb to socket
+					 * backlog (see
+					 * LINUX_MIB_TCPBACKLOGDROP)
+					 */
+	SKB_DROP_REASON_TCP_FLAGS,	/* TCP flags invalid */
+	SKB_DROP_REASON_TCP_ZEROWINDOW,	/* TCP receive window size is zero,
+					 * see LINUX_MIB_TCPZEROWINDOWDROP
+					 */
+	SKB_DROP_REASON_TCP_OLD_DATA,	/* the TCP data reveived is already
+					 * received before (spurious retrans
+					 * may happened), see
+					 * LINUX_MIB_DELAYEDACKLOST
+					 */
+	SKB_DROP_REASON_TCP_OVERWINDOW,	/* the TCP data is out of window,
+					 * the seq of the first byte exceed
+					 * the right edges of receive
+					 * window
+					 */
+	SKB_DROP_REASON_TCP_OFOMERGE,	/* the data of skb is already in
+					 * the ofo queue, corresponding to
+					 * LINUX_MIB_TCPOFOMERGE
+					 */
+	SKB_DROP_REASON_TCP_RFC7323_PAWS, /* PAWS check, corresponding to
+					   * LINUX_MIB_PAWSESTABREJECTED
+					   */
+	SKB_DROP_REASON_TCP_INVALID_SEQUENCE, /* Not acceptable SEQ field */
+	SKB_DROP_REASON_TCP_RESET,	/* Invalid RST packet */
+	SKB_DROP_REASON_TCP_INVALID_SYN, /* Incoming packet has unexpected SYN flag */
+	SKB_DROP_REASON_TCP_CLOSE,	/* TCP socket in CLOSE state */
+	SKB_DROP_REASON_TCP_FASTOPEN,	/* dropped by FASTOPEN request socket */
+	SKB_DROP_REASON_TCP_OLD_ACK,	/* TCP ACK is old, but in window */
+	SKB_DROP_REASON_TCP_TOO_OLD_ACK, /* TCP ACK is too old */
+	SKB_DROP_REASON_TCP_ACK_UNSENT_DATA, /* TCP ACK for data we haven't sent yet */
+	SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE, /* pruned from TCP OFO queue */
+	SKB_DROP_REASON_TCP_OFO_DROP,	/* data already in receive queue */
+	SKB_DROP_REASON_IP_OUTNOROUTES,	/* route lookup failed */
+	SKB_DROP_REASON_BPF_CGROUP_EGRESS,	/* dropped by
+						 * BPF_PROG_TYPE_CGROUP_SKB
+						 * eBPF program
+						 */
+	SKB_DROP_REASON_IPV6DISABLED,	/* IPv6 is disabled on the device */
+	SKB_DROP_REASON_NEIGH_CREATEFAIL,	/* failed to create neigh
+						 * entry
+						 */
+	SKB_DROP_REASON_NEIGH_FAILED,	/* neigh entry in failed state */
+	SKB_DROP_REASON_NEIGH_QUEUEFULL,	/* arp_queue for neigh
+						 * entry is full
+						 */
+	SKB_DROP_REASON_NEIGH_DEAD,	/* neigh entry is dead */
+	SKB_DROP_REASON_TC_EGRESS,	/* dropped in TC egress HOOK */
+	SKB_DROP_REASON_QDISC_DROP,	/* dropped by qdisc when packet
+					 * outputting (failed to enqueue to
+					 * current qdisc)
+					 */
+	SKB_DROP_REASON_CPU_BACKLOG,	/* failed to enqueue the skb to
+					 * the per CPU backlog queue. This
+					 * can be caused by backlog queue
+					 * full (see netdev_max_backlog in
+					 * net.rst) or RPS flow limit
+					 */
+	SKB_DROP_REASON_XDP,		/* dropped by XDP in input path */
+	SKB_DROP_REASON_TC_INGRESS,	/* dropped in TC ingress HOOK */
+	SKB_DROP_REASON_UNHANDLED_PROTO,	/* protocol not implemented
+						 * or not supported
+						 */
+	SKB_DROP_REASON_SKB_CSUM,	/* sk_buff checksum computation
+					 * error
+					 */
+	SKB_DROP_REASON_SKB_GSO_SEG,	/* gso segmentation error */
+	SKB_DROP_REASON_SKB_UCOPY_FAULT,	/* failed to copy data from
+						 * user space, e.g., via
+						 * zerocopy_sg_from_iter()
+						 * or skb_orphan_frags_rx()
+						 */
+	SKB_DROP_REASON_DEV_HDR,	/* device driver specific
+					 * header/metadata is invalid
+					 */
+	/* the device is not ready to xmit/recv due to any of its data
+	 * structure that is not up/ready/initialized, e.g., the IFF_UP is
+	 * not set, or driver specific tun->tfiles[txq] is not initialized
+	 */
+	SKB_DROP_REASON_DEV_READY,
+	SKB_DROP_REASON_FULL_RING,	/* ring buffer is full */
+	SKB_DROP_REASON_NOMEM,		/* error due to OOM */
+	SKB_DROP_REASON_HDR_TRUNC,      /* failed to trunc/extract the header
+					 * from networking data, e.g., failed
+					 * to pull the protocol header from
+					 * frags via pskb_may_pull()
+					 */
+	SKB_DROP_REASON_TAP_FILTER,     /* dropped by (ebpf) filter directly
+					 * attached to tun/tap, e.g., via
+					 * TUNSETFILTEREBPF
+					 */
+	SKB_DROP_REASON_TAP_TXFILTER,	/* dropped by tx filter implemented
+					 * at tun/tap, e.g., check_filter()
+					 */
+	SKB_DROP_REASON_ICMP_CSUM,	/* ICMP checksum error */
+	SKB_DROP_REASON_INVALID_PROTO,	/* the packet doesn't follow RFC
+					 * 2211, such as a broadcasts
+					 * ICMP_TIMESTAMP
+					 */
+	SKB_DROP_REASON_IP_INADDRERRORS,	/* host unreachable, corresponding
+						 * to IPSTATS_MIB_INADDRERRORS
+						 */
+	SKB_DROP_REASON_IP_INNOROUTES,	/* network unreachable, corresponding
+					 * to IPSTATS_MIB_INADDRERRORS
+					 */
+	SKB_DROP_REASON_PKT_TOO_BIG,	/* packet size is too big (maybe exceed
+					 * the MTU)
+					 */
+	SKB_DROP_REASON_MAX,
+};
+
+#define SKB_DR_INIT(name, reason)				\
+	enum skb_drop_reason name = SKB_DROP_REASON_##reason
+#define SKB_DR(name)						\
+	SKB_DR_INIT(name, NOT_SPECIFIED)
+#define SKB_DR_SET(name, reason)				\
+	(name = SKB_DROP_REASON_##reason)
+#define SKB_DR_OR(name, reason)					\
+	do {							\
+		if (name == SKB_DROP_REASON_NOT_SPECIFIED ||	\
+		    name == SKB_NOT_DROPPED_YET)		\
+			SKB_DR_SET(name, reason);		\
+	} while (0)
+
+#endif
-- 
cgit 


From ec43908dd556b2292f028c6e412261689405ba6e Mon Sep 17 00:00:00 2001
From: Menglong Dong <imagedong@tencent.com>
Date: Mon, 6 Jun 2022 10:24:35 +0800
Subject: net: skb: use auto-generation to convert skb drop reason to string

It is annoying to add new skb drop reasons to 'enum skb_drop_reason'
and TRACE_SKB_DROP_REASON in trace/event/skb.h, and it's easy to forget
to add the new reasons we added to TRACE_SKB_DROP_REASON.

TRACE_SKB_DROP_REASON is used to convert drop reason of type number
to string. For now, the string we passed to user space is exactly the
same as the name in 'enum skb_drop_reason' with a 'SKB_DROP_REASON_'
prefix. Therefore, we can use 'auto-generation' to generate these
drop reasons to string at build time.

The new source 'dropreason_str.c' will be auto generated during build
time, which contains the string array
'const char * const drop_reasons[]'.

Signed-off-by: Menglong Dong <imagedong@tencent.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/dropreason.h   |  2 ++
 include/trace/events/skb.h | 89 +---------------------------------------------
 net/core/.gitignore        |  1 +
 net/core/Makefile          | 23 +++++++++++-
 net/core/drop_monitor.c    | 13 -------
 net/core/skbuff.c          |  3 ++
 6 files changed, 29 insertions(+), 102 deletions(-)
 create mode 100644 net/core/.gitignore

(limited to 'include')

diff --git a/include/net/dropreason.h b/include/net/dropreason.h
index ecd18b7b1364..013ff0f2543e 100644
--- a/include/net/dropreason.h
+++ b/include/net/dropreason.h
@@ -181,4 +181,6 @@ enum skb_drop_reason {
 			SKB_DR_SET(name, reason);		\
 	} while (0)
 
+extern const char * const drop_reasons[];
+
 #endif
diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h
index a477bf907498..45264e4bb254 100644
--- a/include/trace/events/skb.h
+++ b/include/trace/events/skb.h
@@ -9,92 +9,6 @@
 #include <linux/netdevice.h>
 #include <linux/tracepoint.h>
 
-#define TRACE_SKB_DROP_REASON					\
-	EM(SKB_DROP_REASON_NOT_SPECIFIED, NOT_SPECIFIED)	\
-	EM(SKB_DROP_REASON_NO_SOCKET, NO_SOCKET)		\
-	EM(SKB_DROP_REASON_PKT_TOO_SMALL, PKT_TOO_SMALL)	\
-	EM(SKB_DROP_REASON_TCP_CSUM, TCP_CSUM)			\
-	EM(SKB_DROP_REASON_SOCKET_FILTER, SOCKET_FILTER)	\
-	EM(SKB_DROP_REASON_UDP_CSUM, UDP_CSUM)			\
-	EM(SKB_DROP_REASON_NETFILTER_DROP, NETFILTER_DROP)	\
-	EM(SKB_DROP_REASON_OTHERHOST, OTHERHOST)		\
-	EM(SKB_DROP_REASON_IP_CSUM, IP_CSUM)			\
-	EM(SKB_DROP_REASON_IP_INHDR, IP_INHDR)			\
-	EM(SKB_DROP_REASON_IP_RPFILTER, IP_RPFILTER)		\
-	EM(SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST,		\
-	   UNICAST_IN_L2_MULTICAST)				\
-	EM(SKB_DROP_REASON_XFRM_POLICY, XFRM_POLICY)		\
-	EM(SKB_DROP_REASON_IP_NOPROTO, IP_NOPROTO)		\
-	EM(SKB_DROP_REASON_SOCKET_RCVBUFF, SOCKET_RCVBUFF)	\
-	EM(SKB_DROP_REASON_PROTO_MEM, PROTO_MEM)		\
-	EM(SKB_DROP_REASON_TCP_MD5NOTFOUND, TCP_MD5NOTFOUND)	\
-	EM(SKB_DROP_REASON_TCP_MD5UNEXPECTED,			\
-	   TCP_MD5UNEXPECTED)					\
-	EM(SKB_DROP_REASON_TCP_MD5FAILURE, TCP_MD5FAILURE)	\
-	EM(SKB_DROP_REASON_SOCKET_BACKLOG, SOCKET_BACKLOG)	\
-	EM(SKB_DROP_REASON_TCP_FLAGS, TCP_FLAGS)		\
-	EM(SKB_DROP_REASON_TCP_ZEROWINDOW, TCP_ZEROWINDOW)	\
-	EM(SKB_DROP_REASON_TCP_OLD_DATA, TCP_OLD_DATA)		\
-	EM(SKB_DROP_REASON_TCP_OVERWINDOW, TCP_OVERWINDOW)	\
-	EM(SKB_DROP_REASON_TCP_OFOMERGE, TCP_OFOMERGE)		\
-	EM(SKB_DROP_REASON_TCP_OFO_DROP, TCP_OFO_DROP)		\
-	EM(SKB_DROP_REASON_TCP_RFC7323_PAWS, TCP_RFC7323_PAWS)	\
-	EM(SKB_DROP_REASON_TCP_INVALID_SEQUENCE,		\
-	   TCP_INVALID_SEQUENCE)				\
-	EM(SKB_DROP_REASON_TCP_RESET, TCP_RESET)		\
-	EM(SKB_DROP_REASON_TCP_INVALID_SYN, TCP_INVALID_SYN)	\
-	EM(SKB_DROP_REASON_TCP_CLOSE, TCP_CLOSE)		\
-	EM(SKB_DROP_REASON_TCP_FASTOPEN, TCP_FASTOPEN)		\
-	EM(SKB_DROP_REASON_TCP_OLD_ACK, TCP_OLD_ACK)		\
-	EM(SKB_DROP_REASON_TCP_TOO_OLD_ACK, TCP_TOO_OLD_ACK)	\
-	EM(SKB_DROP_REASON_TCP_ACK_UNSENT_DATA,			\
-	   TCP_ACK_UNSENT_DATA)					\
-	EM(SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE,			\
-	  TCP_OFO_QUEUE_PRUNE)					\
-	EM(SKB_DROP_REASON_IP_OUTNOROUTES, IP_OUTNOROUTES)	\
-	EM(SKB_DROP_REASON_BPF_CGROUP_EGRESS,			\
-	   BPF_CGROUP_EGRESS)					\
-	EM(SKB_DROP_REASON_IPV6DISABLED, IPV6DISABLED)		\
-	EM(SKB_DROP_REASON_NEIGH_CREATEFAIL, NEIGH_CREATEFAIL)	\
-	EM(SKB_DROP_REASON_NEIGH_FAILED, NEIGH_FAILED)		\
-	EM(SKB_DROP_REASON_NEIGH_QUEUEFULL, NEIGH_QUEUEFULL)	\
-	EM(SKB_DROP_REASON_NEIGH_DEAD, NEIGH_DEAD)		\
-	EM(SKB_DROP_REASON_TC_EGRESS, TC_EGRESS)		\
-	EM(SKB_DROP_REASON_QDISC_DROP, QDISC_DROP)		\
-	EM(SKB_DROP_REASON_CPU_BACKLOG, CPU_BACKLOG)		\
-	EM(SKB_DROP_REASON_XDP, XDP)				\
-	EM(SKB_DROP_REASON_TC_INGRESS, TC_INGRESS)		\
-	EM(SKB_DROP_REASON_UNHANDLED_PROTO, UNHANDLED_PROTO)	\
-	EM(SKB_DROP_REASON_SKB_CSUM, SKB_CSUM)			\
-	EM(SKB_DROP_REASON_SKB_GSO_SEG, SKB_GSO_SEG)		\
-	EM(SKB_DROP_REASON_SKB_UCOPY_FAULT, SKB_UCOPY_FAULT)	\
-	EM(SKB_DROP_REASON_DEV_HDR, DEV_HDR)			\
-	EM(SKB_DROP_REASON_DEV_READY, DEV_READY)		\
-	EM(SKB_DROP_REASON_FULL_RING, FULL_RING)		\
-	EM(SKB_DROP_REASON_NOMEM, NOMEM)			\
-	EM(SKB_DROP_REASON_HDR_TRUNC, HDR_TRUNC)		\
-	EM(SKB_DROP_REASON_TAP_FILTER, TAP_FILTER)		\
-	EM(SKB_DROP_REASON_TAP_TXFILTER, TAP_TXFILTER)		\
-	EM(SKB_DROP_REASON_ICMP_CSUM, ICMP_CSUM)		\
-	EM(SKB_DROP_REASON_INVALID_PROTO, INVALID_PROTO)	\
-	EM(SKB_DROP_REASON_IP_INADDRERRORS, IP_INADDRERRORS)	\
-	EM(SKB_DROP_REASON_IP_INNOROUTES, IP_INNOROUTES)	\
-	EM(SKB_DROP_REASON_PKT_TOO_BIG, PKT_TOO_BIG)		\
-	EMe(SKB_DROP_REASON_MAX, MAX)
-
-#undef EM
-#undef EMe
-
-#define EM(a, b)	TRACE_DEFINE_ENUM(a);
-#define EMe(a, b)	TRACE_DEFINE_ENUM(a);
-
-TRACE_SKB_DROP_REASON
-
-#undef EM
-#undef EMe
-#define EM(a, b)	{ a, #b },
-#define EMe(a, b)	{ a, #b }
-
 /*
  * Tracepoint for free an sk_buff:
  */
@@ -121,8 +35,7 @@ TRACE_EVENT(kfree_skb,
 
 	TP_printk("skbaddr=%p protocol=%u location=%p reason: %s",
 		  __entry->skbaddr, __entry->protocol, __entry->location,
-		  __print_symbolic(__entry->reason,
-				   TRACE_SKB_DROP_REASON))
+		  drop_reasons[__entry->reason])
 );
 
 TRACE_EVENT(consume_skb,
diff --git a/net/core/.gitignore b/net/core/.gitignore
new file mode 100644
index 000000000000..df1e74372cce
--- /dev/null
+++ b/net/core/.gitignore
@@ -0,0 +1 @@
+dropreason_str.c
diff --git a/net/core/Makefile b/net/core/Makefile
index a8e4f737692b..e8ce3bd283a6 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -4,7 +4,8 @@
 #
 
 obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \
-	 gen_stats.o gen_estimator.o net_namespace.o secure_seq.o flow_dissector.o
+	 gen_stats.o gen_estimator.o net_namespace.o secure_seq.o \
+	 flow_dissector.o dropreason_str.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
 
@@ -39,3 +40,23 @@ obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
 obj-$(CONFIG_BPF_SYSCALL) += sock_map.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o
 obj-$(CONFIG_OF)	+= of_net.o
+
+clean-files := dropreason_str.c
+
+quiet_cmd_dropreason_str = GEN     $@
+cmd_dropreason_str = awk -F ',' 'BEGIN{ print "\#include <net/dropreason.h>\n"; \
+	print "const char * const drop_reasons[] = {" }\
+	/^enum skb_drop/ { dr=1; }\
+	/^\};/ { dr=0; }\
+	/^\tSKB_DROP_REASON_/ {\
+		if (dr) {\
+			sub(/\tSKB_DROP_REASON_/, "", $$1);\
+			printf "\t[SKB_DROP_REASON_%s] = \"%s\",\n", $$1, $$1;\
+		}\
+	}\
+	END{ print "};" }' $< > $@
+
+$(obj)/dropreason_str.c: $(srctree)/include/net/dropreason.h
+	$(call cmd,dropreason_str)
+
+$(obj)/dropreason_str.o: $(obj)/dropreason_str.c
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 41cac0e4834e..4ad1decce724 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -48,19 +48,6 @@
 static int trace_state = TRACE_OFF;
 static bool monitor_hw;
 
-#undef EM
-#undef EMe
-
-#define EM(a, b)	[a] = #b,
-#define EMe(a, b)	[a] = #b
-
-/* drop_reasons is used to translate 'enum skb_drop_reason' to string,
- * which is reported to user space.
- */
-static const char * const drop_reasons[] = {
-	TRACE_SKB_DROP_REASON
-};
-
 /* net_dm_mutex
  *
  * An overall lock guarding every operation coming from userspace.
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5b3559cb1d82..b661040c100e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -91,6 +91,9 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init;
 int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
 EXPORT_SYMBOL(sysctl_max_skb_frags);
 
+/* The array 'drop_reasons' is auto-generated in dropreason_str.c */
+EXPORT_SYMBOL(drop_reasons);
+
 /**
  *	skb_panic - private function for out-of-line support
  *	@skb:	buffer
-- 
cgit 


From b160f7270e6df0f4ccadcf6750696b380eb3b811 Mon Sep 17 00:00:00 2001
From: Menglong Dong <imagedong@tencent.com>
Date: Mon, 6 Jun 2022 10:24:36 +0800
Subject: net: dropreason: reformat the comment fo skb drop reasons

To make the code clear, reformat the comment in dropreason.h to k-doc
style.

Now, the comment can pass the check of kernel-doc without warnning:

$ ./scripts/kernel-doc -v -none include/linux/dropreason.h
include/linux/dropreason.h:7: info: Scanning doc for enum skb_drop_reason

Signed-off-by: Menglong Dong <imagedong@tencent.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/dropreason.h | 382 ++++++++++++++++++++++++++++-------------------
 1 file changed, 226 insertions(+), 156 deletions(-)

(limited to 'include')

diff --git a/include/net/dropreason.h b/include/net/dropreason.h
index 013ff0f2543e..fae9b40e54fa 100644
--- a/include/net/dropreason.h
+++ b/include/net/dropreason.h
@@ -3,168 +3,238 @@
 #ifndef _LINUX_DROPREASON_H
 #define _LINUX_DROPREASON_H
 
-/* The reason of skb drop, which is used in kfree_skb_reason().
- * en...maybe they should be splited by group?
+/**
+ * enum skb_drop_reason - the reasons of skb drops
  *
- * Each item here should also be in 'TRACE_SKB_DROP_REASON', which is
- * used to translate the reason to string.
+ * The reason of skb drop, which is used in kfree_skb_reason().
  */
 enum skb_drop_reason {
+	/**
+	 * @SKB_NOT_DROPPED_YET: skb is not dropped yet (used for no-drop case)
+	 */
 	SKB_NOT_DROPPED_YET = 0,
-	SKB_DROP_REASON_NOT_SPECIFIED,	/* drop reason is not specified */
-	SKB_DROP_REASON_NO_SOCKET,	/* socket not found */
-	SKB_DROP_REASON_PKT_TOO_SMALL,	/* packet size is too small */
-	SKB_DROP_REASON_TCP_CSUM,	/* TCP checksum error */
-	SKB_DROP_REASON_SOCKET_FILTER,	/* dropped by socket filter */
-	SKB_DROP_REASON_UDP_CSUM,	/* UDP checksum error */
-	SKB_DROP_REASON_NETFILTER_DROP,	/* dropped by netfilter */
-	SKB_DROP_REASON_OTHERHOST,	/* packet don't belong to current
-					 * host (interface is in promisc
-					 * mode)
-					 */
-	SKB_DROP_REASON_IP_CSUM,	/* IP checksum error */
-	SKB_DROP_REASON_IP_INHDR,	/* there is something wrong with
-					 * IP header (see
-					 * IPSTATS_MIB_INHDRERRORS)
-					 */
-	SKB_DROP_REASON_IP_RPFILTER,	/* IP rpfilter validate failed.
-					 * see the document for rp_filter
-					 * in ip-sysctl.rst for more
-					 * information
-					 */
-	SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST, /* destination address of L2
-						  * is multicast, but L3 is
-						  * unicast.
-						  */
-	SKB_DROP_REASON_XFRM_POLICY,	/* xfrm policy check failed */
-	SKB_DROP_REASON_IP_NOPROTO,	/* no support for IP protocol */
-	SKB_DROP_REASON_SOCKET_RCVBUFF,	/* socket receive buff is full */
-	SKB_DROP_REASON_PROTO_MEM,	/* proto memory limition, such as
-					 * udp packet drop out of
-					 * udp_memory_allocated.
-					 */
-	SKB_DROP_REASON_TCP_MD5NOTFOUND,	/* no MD5 hash and one
-						 * expected, corresponding
-						 * to LINUX_MIB_TCPMD5NOTFOUND
-						 */
-	SKB_DROP_REASON_TCP_MD5UNEXPECTED,	/* MD5 hash and we're not
-						 * expecting one, corresponding
-						 * to LINUX_MIB_TCPMD5UNEXPECTED
-						 */
-	SKB_DROP_REASON_TCP_MD5FAILURE,	/* MD5 hash and its wrong,
-					 * corresponding to
-					 * LINUX_MIB_TCPMD5FAILURE
-					 */
-	SKB_DROP_REASON_SOCKET_BACKLOG,	/* failed to add skb to socket
-					 * backlog (see
-					 * LINUX_MIB_TCPBACKLOGDROP)
-					 */
-	SKB_DROP_REASON_TCP_FLAGS,	/* TCP flags invalid */
-	SKB_DROP_REASON_TCP_ZEROWINDOW,	/* TCP receive window size is zero,
-					 * see LINUX_MIB_TCPZEROWINDOWDROP
-					 */
-	SKB_DROP_REASON_TCP_OLD_DATA,	/* the TCP data reveived is already
-					 * received before (spurious retrans
-					 * may happened), see
-					 * LINUX_MIB_DELAYEDACKLOST
-					 */
-	SKB_DROP_REASON_TCP_OVERWINDOW,	/* the TCP data is out of window,
-					 * the seq of the first byte exceed
-					 * the right edges of receive
-					 * window
-					 */
-	SKB_DROP_REASON_TCP_OFOMERGE,	/* the data of skb is already in
-					 * the ofo queue, corresponding to
-					 * LINUX_MIB_TCPOFOMERGE
-					 */
-	SKB_DROP_REASON_TCP_RFC7323_PAWS, /* PAWS check, corresponding to
-					   * LINUX_MIB_PAWSESTABREJECTED
-					   */
-	SKB_DROP_REASON_TCP_INVALID_SEQUENCE, /* Not acceptable SEQ field */
-	SKB_DROP_REASON_TCP_RESET,	/* Invalid RST packet */
-	SKB_DROP_REASON_TCP_INVALID_SYN, /* Incoming packet has unexpected SYN flag */
-	SKB_DROP_REASON_TCP_CLOSE,	/* TCP socket in CLOSE state */
-	SKB_DROP_REASON_TCP_FASTOPEN,	/* dropped by FASTOPEN request socket */
-	SKB_DROP_REASON_TCP_OLD_ACK,	/* TCP ACK is old, but in window */
-	SKB_DROP_REASON_TCP_TOO_OLD_ACK, /* TCP ACK is too old */
-	SKB_DROP_REASON_TCP_ACK_UNSENT_DATA, /* TCP ACK for data we haven't sent yet */
-	SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE, /* pruned from TCP OFO queue */
-	SKB_DROP_REASON_TCP_OFO_DROP,	/* data already in receive queue */
-	SKB_DROP_REASON_IP_OUTNOROUTES,	/* route lookup failed */
-	SKB_DROP_REASON_BPF_CGROUP_EGRESS,	/* dropped by
-						 * BPF_PROG_TYPE_CGROUP_SKB
-						 * eBPF program
-						 */
-	SKB_DROP_REASON_IPV6DISABLED,	/* IPv6 is disabled on the device */
-	SKB_DROP_REASON_NEIGH_CREATEFAIL,	/* failed to create neigh
-						 * entry
-						 */
-	SKB_DROP_REASON_NEIGH_FAILED,	/* neigh entry in failed state */
-	SKB_DROP_REASON_NEIGH_QUEUEFULL,	/* arp_queue for neigh
-						 * entry is full
-						 */
-	SKB_DROP_REASON_NEIGH_DEAD,	/* neigh entry is dead */
-	SKB_DROP_REASON_TC_EGRESS,	/* dropped in TC egress HOOK */
-	SKB_DROP_REASON_QDISC_DROP,	/* dropped by qdisc when packet
-					 * outputting (failed to enqueue to
-					 * current qdisc)
-					 */
-	SKB_DROP_REASON_CPU_BACKLOG,	/* failed to enqueue the skb to
-					 * the per CPU backlog queue. This
-					 * can be caused by backlog queue
-					 * full (see netdev_max_backlog in
-					 * net.rst) or RPS flow limit
-					 */
-	SKB_DROP_REASON_XDP,		/* dropped by XDP in input path */
-	SKB_DROP_REASON_TC_INGRESS,	/* dropped in TC ingress HOOK */
-	SKB_DROP_REASON_UNHANDLED_PROTO,	/* protocol not implemented
-						 * or not supported
-						 */
-	SKB_DROP_REASON_SKB_CSUM,	/* sk_buff checksum computation
-					 * error
-					 */
-	SKB_DROP_REASON_SKB_GSO_SEG,	/* gso segmentation error */
-	SKB_DROP_REASON_SKB_UCOPY_FAULT,	/* failed to copy data from
-						 * user space, e.g., via
-						 * zerocopy_sg_from_iter()
-						 * or skb_orphan_frags_rx()
-						 */
-	SKB_DROP_REASON_DEV_HDR,	/* device driver specific
-					 * header/metadata is invalid
-					 */
-	/* the device is not ready to xmit/recv due to any of its data
-	 * structure that is not up/ready/initialized, e.g., the IFF_UP is
-	 * not set, or driver specific tun->tfiles[txq] is not initialized
+	/** @SKB_DROP_REASON_NOT_SPECIFIED: drop reason is not specified */
+	SKB_DROP_REASON_NOT_SPECIFIED,
+	/** @SKB_DROP_REASON_NO_SOCKET: socket not found */
+	SKB_DROP_REASON_NO_SOCKET,
+	/** @SKB_DROP_REASON_PKT_TOO_SMALL: packet size is too small */
+	SKB_DROP_REASON_PKT_TOO_SMALL,
+	/** @SKB_DROP_REASON_TCP_CSUM: TCP checksum error */
+	SKB_DROP_REASON_TCP_CSUM,
+	/** @SKB_DROP_REASON_SOCKET_FILTER: dropped by socket filter */
+	SKB_DROP_REASON_SOCKET_FILTER,
+	/** @SKB_DROP_REASON_UDP_CSUM: UDP checksum error */
+	SKB_DROP_REASON_UDP_CSUM,
+	/** @SKB_DROP_REASON_NETFILTER_DROP: dropped by netfilter */
+	SKB_DROP_REASON_NETFILTER_DROP,
+	/**
+	 * @SKB_DROP_REASON_OTHERHOST: packet don't belong to current host
+	 * (interface is in promisc mode)
+	 */
+	SKB_DROP_REASON_OTHERHOST,
+	/** @SKB_DROP_REASON_IP_CSUM: IP checksum error */
+	SKB_DROP_REASON_IP_CSUM,
+	/**
+	 * @SKB_DROP_REASON_IP_INHDR: there is something wrong with IP header (see
+	 * IPSTATS_MIB_INHDRERRORS)
+	 */
+	SKB_DROP_REASON_IP_INHDR,
+	/**
+	 * @SKB_DROP_REASON_IP_RPFILTER: IP rpfilter validate failed. see the
+	 * document for rp_filter in ip-sysctl.rst for more information
+	 */
+	SKB_DROP_REASON_IP_RPFILTER,
+	/**
+	 * @SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST: destination address of L2 is
+	 * multicast, but L3 is unicast.
+	 */
+	SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST,
+	/** @SKB_DROP_REASON_XFRM_POLICY: xfrm policy check failed */
+	SKB_DROP_REASON_XFRM_POLICY,
+	/** @SKB_DROP_REASON_IP_NOPROTO: no support for IP protocol */
+	SKB_DROP_REASON_IP_NOPROTO,
+	/** @SKB_DROP_REASON_SOCKET_RCVBUFF: socket receive buff is full */
+	SKB_DROP_REASON_SOCKET_RCVBUFF,
+	/**
+	 * @SKB_DROP_REASON_PROTO_MEM: proto memory limition, such as udp packet
+	 * drop out of udp_memory_allocated.
+	 */
+	SKB_DROP_REASON_PROTO_MEM,
+	/**
+	 * @SKB_DROP_REASON_TCP_MD5NOTFOUND: no MD5 hash and one expected,
+	 * corresponding to LINUX_MIB_TCPMD5NOTFOUND
+	 */
+	SKB_DROP_REASON_TCP_MD5NOTFOUND,
+	/**
+	 * @SKB_DROP_REASON_TCP_MD5UNEXPECTED: MD5 hash and we're not expecting
+	 * one, corresponding to LINUX_MIB_TCPMD5UNEXPECTED
+	 */
+	SKB_DROP_REASON_TCP_MD5UNEXPECTED,
+	/**
+	 * @SKB_DROP_REASON_TCP_MD5FAILURE: MD5 hash and its wrong, corresponding
+	 * to LINUX_MIB_TCPMD5FAILURE
+	 */
+	SKB_DROP_REASON_TCP_MD5FAILURE,
+	/**
+	 * @SKB_DROP_REASON_SOCKET_BACKLOG: failed to add skb to socket backlog (
+	 * see LINUX_MIB_TCPBACKLOGDROP)
+	 */
+	SKB_DROP_REASON_SOCKET_BACKLOG,
+	/** @SKB_DROP_REASON_TCP_FLAGS: TCP flags invalid */
+	SKB_DROP_REASON_TCP_FLAGS,
+	/**
+	 * @SKB_DROP_REASON_TCP_ZEROWINDOW: TCP receive window size is zero,
+	 * see LINUX_MIB_TCPZEROWINDOWDROP
+	 */
+	SKB_DROP_REASON_TCP_ZEROWINDOW,
+	/**
+	 * @SKB_DROP_REASON_TCP_OLD_DATA: the TCP data reveived is already
+	 * received before (spurious retrans may happened), see
+	 * LINUX_MIB_DELAYEDACKLOST
+	 */
+	SKB_DROP_REASON_TCP_OLD_DATA,
+	/**
+	 * @SKB_DROP_REASON_TCP_OVERWINDOW: the TCP data is out of window,
+	 * the seq of the first byte exceed the right edges of receive
+	 * window
+	 */
+	SKB_DROP_REASON_TCP_OVERWINDOW,
+	/**
+	 * @SKB_DROP_REASON_TCP_OFOMERGE: the data of skb is already in the ofo
+	 * queue, corresponding to LINUX_MIB_TCPOFOMERGE
+	 */
+	SKB_DROP_REASON_TCP_OFOMERGE,
+	/**
+	 * @SKB_DROP_REASON_TCP_RFC7323_PAWS: PAWS check, corresponding to
+	 * LINUX_MIB_PAWSESTABREJECTED
+	 */
+	SKB_DROP_REASON_TCP_RFC7323_PAWS,
+	/** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field */
+	SKB_DROP_REASON_TCP_INVALID_SEQUENCE,
+	/** @SKB_DROP_REASON_TCP_RESET: Invalid RST packet */
+	SKB_DROP_REASON_TCP_RESET,
+	/**
+	 * @SKB_DROP_REASON_TCP_INVALID_SYN: Incoming packet has unexpected
+	 * SYN flag
+	 */
+	SKB_DROP_REASON_TCP_INVALID_SYN,
+	/** @SKB_DROP_REASON_TCP_CLOSE: TCP socket in CLOSE state */
+	SKB_DROP_REASON_TCP_CLOSE,
+	/** @SKB_DROP_REASON_TCP_FASTOPEN: dropped by FASTOPEN request socket */
+	SKB_DROP_REASON_TCP_FASTOPEN,
+	/** @SKB_DROP_REASON_TCP_OLD_ACK: TCP ACK is old, but in window */
+	SKB_DROP_REASON_TCP_OLD_ACK,
+	/** @SKB_DROP_REASON_TCP_TOO_OLD_ACK: TCP ACK is too old */
+	SKB_DROP_REASON_TCP_TOO_OLD_ACK,
+	/**
+	 * @SKB_DROP_REASON_TCP_ACK_UNSENT_DATA: TCP ACK for data we haven't
+	 * sent yet
+	 */
+	SKB_DROP_REASON_TCP_ACK_UNSENT_DATA,
+	/** @SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE: pruned from TCP OFO queue */
+	SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE,
+	/** @SKB_DROP_REASON_TCP_OFO_DROP: data already in receive queue */
+	SKB_DROP_REASON_TCP_OFO_DROP,
+	/** @SKB_DROP_REASON_IP_OUTNOROUTES: route lookup failed */
+	SKB_DROP_REASON_IP_OUTNOROUTES,
+	/**
+	 * @SKB_DROP_REASON_BPF_CGROUP_EGRESS: dropped by BPF_PROG_TYPE_CGROUP_SKB
+	 * eBPF program
+	 */
+	SKB_DROP_REASON_BPF_CGROUP_EGRESS,
+	/** @SKB_DROP_REASON_IPV6DISABLED: IPv6 is disabled on the device */
+	SKB_DROP_REASON_IPV6DISABLED,
+	/** @SKB_DROP_REASON_NEIGH_CREATEFAIL: failed to create neigh entry */
+	SKB_DROP_REASON_NEIGH_CREATEFAIL,
+	/** @SKB_DROP_REASON_NEIGH_FAILED: neigh entry in failed state */
+	SKB_DROP_REASON_NEIGH_FAILED,
+	/** @SKB_DROP_REASON_NEIGH_QUEUEFULL: arp_queue for neigh entry is full */
+	SKB_DROP_REASON_NEIGH_QUEUEFULL,
+	/** @SKB_DROP_REASON_NEIGH_DEAD: neigh entry is dead */
+	SKB_DROP_REASON_NEIGH_DEAD,
+	/** @SKB_DROP_REASON_TC_EGRESS: dropped in TC egress HOOK */
+	SKB_DROP_REASON_TC_EGRESS,
+	/**
+	 * @SKB_DROP_REASON_QDISC_DROP: dropped by qdisc when packet outputting (
+	 * failed to enqueue to current qdisc)
+	 */
+	SKB_DROP_REASON_QDISC_DROP,
+	/**
+	 * @SKB_DROP_REASON_CPU_BACKLOG: failed to enqueue the skb to the per CPU
+	 * backlog queue. This can be caused by backlog queue full (see
+	 * netdev_max_backlog in net.rst) or RPS flow limit
+	 */
+	SKB_DROP_REASON_CPU_BACKLOG,
+	/** @SKB_DROP_REASON_XDP: dropped by XDP in input path */
+	SKB_DROP_REASON_XDP,
+	/** @SKB_DROP_REASON_TC_INGRESS: dropped in TC ingress HOOK */
+	SKB_DROP_REASON_TC_INGRESS,
+	/** @SKB_DROP_REASON_UNHANDLED_PROTO: protocol not implemented or not supported */
+	SKB_DROP_REASON_UNHANDLED_PROTO,
+	/** @SKB_DROP_REASON_SKB_CSUM: sk_buff checksum computation error */
+	SKB_DROP_REASON_SKB_CSUM,
+	/** @SKB_DROP_REASON_SKB_GSO_SEG: gso segmentation error */
+	SKB_DROP_REASON_SKB_GSO_SEG,
+	/**
+	 * @SKB_DROP_REASON_SKB_UCOPY_FAULT: failed to copy data from user space,
+	 * e.g., via zerocopy_sg_from_iter() or skb_orphan_frags_rx()
+	 */
+	SKB_DROP_REASON_SKB_UCOPY_FAULT,
+	/** @SKB_DROP_REASON_DEV_HDR: device driver specific header/metadata is invalid */
+	SKB_DROP_REASON_DEV_HDR,
+	/**
+	 * @SKB_DROP_REASON_DEV_READY: the device is not ready to xmit/recv due to
+	 * any of its data structure that is not up/ready/initialized,
+	 * e.g., the IFF_UP is not set, or driver specific tun->tfiles[txq]
+	 * is not initialized
 	 */
 	SKB_DROP_REASON_DEV_READY,
-	SKB_DROP_REASON_FULL_RING,	/* ring buffer is full */
-	SKB_DROP_REASON_NOMEM,		/* error due to OOM */
-	SKB_DROP_REASON_HDR_TRUNC,      /* failed to trunc/extract the header
-					 * from networking data, e.g., failed
-					 * to pull the protocol header from
-					 * frags via pskb_may_pull()
-					 */
-	SKB_DROP_REASON_TAP_FILTER,     /* dropped by (ebpf) filter directly
-					 * attached to tun/tap, e.g., via
-					 * TUNSETFILTEREBPF
-					 */
-	SKB_DROP_REASON_TAP_TXFILTER,	/* dropped by tx filter implemented
-					 * at tun/tap, e.g., check_filter()
-					 */
-	SKB_DROP_REASON_ICMP_CSUM,	/* ICMP checksum error */
-	SKB_DROP_REASON_INVALID_PROTO,	/* the packet doesn't follow RFC
-					 * 2211, such as a broadcasts
-					 * ICMP_TIMESTAMP
-					 */
-	SKB_DROP_REASON_IP_INADDRERRORS,	/* host unreachable, corresponding
-						 * to IPSTATS_MIB_INADDRERRORS
-						 */
-	SKB_DROP_REASON_IP_INNOROUTES,	/* network unreachable, corresponding
-					 * to IPSTATS_MIB_INADDRERRORS
-					 */
-	SKB_DROP_REASON_PKT_TOO_BIG,	/* packet size is too big (maybe exceed
-					 * the MTU)
-					 */
+	/** @SKB_DROP_REASON_FULL_RING: ring buffer is full */
+	SKB_DROP_REASON_FULL_RING,
+	/** @SKB_DROP_REASON_NOMEM: error due to OOM */
+	SKB_DROP_REASON_NOMEM,
+	/**
+	 * @SKB_DROP_REASON_HDR_TRUNC: failed to trunc/extract the header from
+	 * networking data, e.g., failed to pull the protocol header from
+	 * frags via pskb_may_pull()
+	 */
+	SKB_DROP_REASON_HDR_TRUNC,
+	/**
+	 * @SKB_DROP_REASON_TAP_FILTER: dropped by (ebpf) filter directly attached
+	 * to tun/tap, e.g., via TUNSETFILTEREBPF
+	 */
+	SKB_DROP_REASON_TAP_FILTER,
+	/**
+	 * @SKB_DROP_REASON_TAP_TXFILTER: dropped by tx filter implemented at
+	 * tun/tap, e.g., check_filter()
+	 */
+	SKB_DROP_REASON_TAP_TXFILTER,
+	/** @SKB_DROP_REASON_ICMP_CSUM: ICMP checksum error */
+	SKB_DROP_REASON_ICMP_CSUM,
+	/**
+	 * @SKB_DROP_REASON_INVALID_PROTO: the packet doesn't follow RFC 2211,
+	 * such as a broadcasts ICMP_TIMESTAMP
+	 */
+	SKB_DROP_REASON_INVALID_PROTO,
+	/**
+	 * @SKB_DROP_REASON_IP_INADDRERRORS: host unreachable, corresponding to
+	 * IPSTATS_MIB_INADDRERRORS
+	 */
+	SKB_DROP_REASON_IP_INADDRERRORS,
+	/**
+	 * @SKB_DROP_REASON_IP_INNOROUTES: network unreachable, corresponding to
+	 * IPSTATS_MIB_INADDRERRORS
+	 */
+	SKB_DROP_REASON_IP_INNOROUTES,
+	/**
+	 * @SKB_DROP_REASON_PKT_TOO_BIG: packet size is too big (maybe exceed the
+	 * MTU)
+	 */
+	SKB_DROP_REASON_PKT_TOO_BIG,
+	/**
+	 * @SKB_DROP_REASON_MAX: the maximum of drop reason, which shouldn't be
+	 * used as a real 'reason'
+	 */
 	SKB_DROP_REASON_MAX,
 };
 
-- 
cgit 


From e6f08af6340eaf88e9eeff71bd4533eee9a04119 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Sun, 5 Jun 2022 17:35:37 +0200
Subject: ASoC: simple-card-utils: Make asoc_simple_clean_reference() return
 void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

asoc_simple_clean_reference() returns zero unconditionally. Letting it
return void instead makes it easier to see in the caller that there is no
error to handle.

This is a preparation for making platform remove callbacks return void.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/20220605153537.26591-1-u.kleine-koenig@pengutronix.de
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/simple_card_utils.h     | 2 +-
 sound/soc/generic/simple-card-utils.c | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/sound/simple_card_utils.h b/include/sound/simple_card_utils.h
index 8faa649f712b..fe2337fde1f4 100644
--- a/include/sound/simple_card_utils.h
+++ b/include/sound/simple_card_utils.h
@@ -173,7 +173,7 @@ void asoc_simple_canonicalize_platform(struct snd_soc_dai_link_component *platfo
 void asoc_simple_canonicalize_cpu(struct snd_soc_dai_link_component *cpus,
 				  int is_single_links);
 
-int asoc_simple_clean_reference(struct snd_soc_card *card);
+void asoc_simple_clean_reference(struct snd_soc_card *card);
 
 void asoc_simple_convert_fixup(struct asoc_simple_data *data,
 				      struct snd_pcm_hw_params *params);
diff --git a/sound/soc/generic/simple-card-utils.c b/sound/soc/generic/simple-card-utils.c
index fa080f166345..0beda9739ebe 100644
--- a/sound/soc/generic/simple-card-utils.c
+++ b/sound/soc/generic/simple-card-utils.c
@@ -609,7 +609,7 @@ void asoc_simple_canonicalize_cpu(struct snd_soc_dai_link_component *cpus,
 }
 EXPORT_SYMBOL_GPL(asoc_simple_canonicalize_cpu);
 
-int asoc_simple_clean_reference(struct snd_soc_card *card)
+void asoc_simple_clean_reference(struct snd_soc_card *card)
 {
 	struct snd_soc_dai_link *dai_link;
 	struct snd_soc_dai_link_component *cpu;
@@ -622,7 +622,6 @@ int asoc_simple_clean_reference(struct snd_soc_card *card)
 		for_each_link_codecs(dai_link, j, codec)
 			of_node_put(codec->of_node);
 	}
-	return 0;
 }
 EXPORT_SYMBOL_GPL(asoc_simple_clean_reference);
 
@@ -877,7 +876,9 @@ int asoc_simple_remove(struct platform_device *pdev)
 {
 	struct snd_soc_card *card = platform_get_drvdata(pdev);
 
-	return asoc_simple_clean_reference(card);
+	asoc_simple_clean_reference(card);
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(asoc_simple_remove);
 
-- 
cgit 


From 5f69a6577bc33d8f6d6bbe02bccdeb357b287f56 Mon Sep 17 00:00:00 2001
From: Chen Wandun <chenwandun@huawei.com>
Date: Thu, 26 May 2022 20:26:56 +0800
Subject: psi: dont alloc memory for psi by default

Memory about struct psi_group is allocated by default for
each cgroup even if psi_disabled is true, in this case, these
allocated memory is waste, so alloc memory for struct psi_group
only when psi_disabled is false.

Signed-off-by: Chen Wandun <chenwandun@huawei.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h |  2 +-
 include/linux/cgroup.h      |  2 +-
 kernel/cgroup/cgroup.c      |  8 ++++----
 kernel/sched/psi.c          | 19 +++++++++++++------
 4 files changed, 19 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1bfcfb1af352..672de25e3ec8 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -475,7 +475,7 @@ struct cgroup {
 	struct work_struct release_agent_work;
 
 	/* used to track pressure stalls */
-	struct psi_group psi;
+	struct psi_group *psi;
 
 	/* used to store eBPF programs */
 	struct cgroup_bpf bpf;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 0d1ada8968d7..ed53bfe7c46c 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -674,7 +674,7 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
 
 static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
 {
-	return &cgrp->psi;
+	return cgrp->psi;
 }
 
 bool cgroup_psi_enabled(void);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1779ccddb734..90a654cb8a1e 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3609,21 +3609,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
 static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
 {
 	struct cgroup *cgrp = seq_css(seq)->cgroup;
-	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
 
 	return psi_show(seq, psi, PSI_IO);
 }
 static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
 {
 	struct cgroup *cgrp = seq_css(seq)->cgroup;
-	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
 
 	return psi_show(seq, psi, PSI_MEM);
 }
 static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
 {
 	struct cgroup *cgrp = seq_css(seq)->cgroup;
-	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
 
 	return psi_show(seq, psi, PSI_CPU);
 }
@@ -3649,7 +3649,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
 		return -EBUSY;
 	}
 
-	psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+	psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
 	new = psi_trigger_create(psi, buf, nbytes, res);
 	if (IS_ERR(new)) {
 		cgroup_put(cgrp);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index a337f3e35997..ec66b40bdd40 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -957,10 +957,16 @@ int psi_cgroup_alloc(struct cgroup *cgroup)
 	if (static_branch_likely(&psi_disabled))
 		return 0;
 
-	cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu);
-	if (!cgroup->psi.pcpu)
+	cgroup->psi = kmalloc(sizeof(struct psi_group), GFP_KERNEL);
+	if (!cgroup->psi)
 		return -ENOMEM;
-	group_init(&cgroup->psi);
+
+	cgroup->psi->pcpu = alloc_percpu(struct psi_group_cpu);
+	if (!cgroup->psi->pcpu) {
+		kfree(cgroup->psi);
+		return -ENOMEM;
+	}
+	group_init(cgroup->psi);
 	return 0;
 }
 
@@ -969,10 +975,11 @@ void psi_cgroup_free(struct cgroup *cgroup)
 	if (static_branch_likely(&psi_disabled))
 		return;
 
-	cancel_delayed_work_sync(&cgroup->psi.avgs_work);
-	free_percpu(cgroup->psi.pcpu);
+	cancel_delayed_work_sync(&cgroup->psi->avgs_work);
+	free_percpu(cgroup->psi->pcpu);
 	/* All triggers must be removed by now */
-	WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n");
+	WARN_ONCE(cgroup->psi->poll_states, "psi: trigger leak\n");
+	kfree(cgroup->psi);
 }
 
 /**
-- 
cgit 


From 6089fb325cf737eeb2c4d236c94697112ca860da Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 6 Jun 2022 23:26:00 -0700
Subject: bpf: Add btf enum64 support

Currently, BTF only supports upto 32bit enum value with BTF_KIND_ENUM.
But in kernel, some enum indeed has 64bit values, e.g.,
in uapi bpf.h, we have
  enum {
        BPF_F_INDEX_MASK                = 0xffffffffULL,
        BPF_F_CURRENT_CPU               = BPF_F_INDEX_MASK,
        BPF_F_CTXLEN_MASK               = (0xfffffULL << 32),
  };
In this case, BTF_KIND_ENUM will encode the value of BPF_F_CTXLEN_MASK
as 0, which certainly is incorrect.

This patch added a new btf kind, BTF_KIND_ENUM64, which permits
64bit value to cover the above use case. The BTF_KIND_ENUM64 has
the following three fields followed by the common type:
  struct bpf_enum64 {
    __u32 nume_off;
    __u32 val_lo32;
    __u32 val_hi32;
  };
Currently, btf type section has an alignment of 4 as all element types
are u32. Representing the value with __u64 will introduce a pad
for bpf_enum64 and may also introduce misalignment for the 64bit value.
Hence, two members of val_hi32 and val_lo32 are chosen to avoid these issues.

The kflag is also introduced for BTF_KIND_ENUM and BTF_KIND_ENUM64
to indicate whether the value is signed or unsigned. The kflag intends
to provide consistent output of BTF C fortmat with the original
source code. For example, the original BTF_KIND_ENUM bit value is 0xffffffff.
The format C has two choices, printing out 0xffffffff or -1 and current libbpf
prints out as unsigned value. But if the signedness is preserved in btf,
the value can be printed the same as the original source code.
The kflag value 0 means unsigned values, which is consistent to the default
by libbpf and should also cover most cases as well.

The new BTF_KIND_ENUM64 is intended to support the enum value represented as
64bit value. But it can represent all BTF_KIND_ENUM values as well.
The compiler ([1]) and pahole will generate BTF_KIND_ENUM64 only if the value has
to be represented with 64 bits.

In addition, a static inline function btf_kind_core_compat() is introduced which
will be used later when libbpf relo_core.c changed. Here the kernel shares the
same relo_core.c with libbpf.

  [1] https://reviews.llvm.org/D124641

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/r/20220607062600.3716578-1-yhs@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf.h            |  28 ++++++++
 include/uapi/linux/btf.h       |  17 ++++-
 kernel/bpf/btf.c               | 142 +++++++++++++++++++++++++++++++++++++----
 kernel/bpf/verifier.c          |   2 +-
 tools/include/uapi/linux/btf.h |  17 ++++-
 5 files changed, 185 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 2611cea2c2b6..1bfed7fa0428 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -177,6 +177,19 @@ static inline bool btf_type_is_enum(const struct btf_type *t)
 	return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM;
 }
 
+static inline bool btf_is_any_enum(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM ||
+	       BTF_INFO_KIND(t->info) == BTF_KIND_ENUM64;
+}
+
+static inline bool btf_kind_core_compat(const struct btf_type *t1,
+					const struct btf_type *t2)
+{
+	return BTF_INFO_KIND(t1->info) == BTF_INFO_KIND(t2->info) ||
+	       (btf_is_any_enum(t1) && btf_is_any_enum(t2));
+}
+
 static inline bool str_is_empty(const char *s)
 {
 	return !s || !s[0];
@@ -192,6 +205,16 @@ static inline bool btf_is_enum(const struct btf_type *t)
 	return btf_kind(t) == BTF_KIND_ENUM;
 }
 
+static inline bool btf_is_enum64(const struct btf_type *t)
+{
+	return btf_kind(t) == BTF_KIND_ENUM64;
+}
+
+static inline u64 btf_enum64_value(const struct btf_enum64 *e)
+{
+	return ((u64)e->val_hi32 << 32) | e->val_lo32;
+}
+
 static inline bool btf_is_composite(const struct btf_type *t)
 {
 	u16 kind = btf_kind(t);
@@ -332,6 +355,11 @@ static inline struct btf_enum *btf_enum(const struct btf_type *t)
 	return (struct btf_enum *)(t + 1);
 }
 
+static inline struct btf_enum64 *btf_enum64(const struct btf_type *t)
+{
+	return (struct btf_enum64 *)(t + 1);
+}
+
 static inline const struct btf_var_secinfo *btf_type_var_secinfo(
 		const struct btf_type *t)
 {
diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
index a9162a6c0284..ec1798b6d3ff 100644
--- a/include/uapi/linux/btf.h
+++ b/include/uapi/linux/btf.h
@@ -36,10 +36,10 @@ struct btf_type {
 	 * bits 24-28: kind (e.g. int, ptr, array...etc)
 	 * bits 29-30: unused
 	 * bit     31: kind_flag, currently used by
-	 *             struct, union and fwd
+	 *             struct, union, enum, fwd and enum64
 	 */
 	__u32 info;
-	/* "size" is used by INT, ENUM, STRUCT, UNION and DATASEC.
+	/* "size" is used by INT, ENUM, STRUCT, UNION, DATASEC and ENUM64.
 	 * "size" tells the size of the type it is describing.
 	 *
 	 * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT,
@@ -63,7 +63,7 @@ enum {
 	BTF_KIND_ARRAY		= 3,	/* Array	*/
 	BTF_KIND_STRUCT		= 4,	/* Struct	*/
 	BTF_KIND_UNION		= 5,	/* Union	*/
-	BTF_KIND_ENUM		= 6,	/* Enumeration	*/
+	BTF_KIND_ENUM		= 6,	/* Enumeration up to 32-bit values */
 	BTF_KIND_FWD		= 7,	/* Forward	*/
 	BTF_KIND_TYPEDEF	= 8,	/* Typedef	*/
 	BTF_KIND_VOLATILE	= 9,	/* Volatile	*/
@@ -76,6 +76,7 @@ enum {
 	BTF_KIND_FLOAT		= 16,	/* Floating point	*/
 	BTF_KIND_DECL_TAG	= 17,	/* Decl Tag */
 	BTF_KIND_TYPE_TAG	= 18,	/* Type Tag */
+	BTF_KIND_ENUM64		= 19,	/* Enumeration up to 64-bit values */
 
 	NR_BTF_KINDS,
 	BTF_KIND_MAX		= NR_BTF_KINDS - 1,
@@ -186,4 +187,14 @@ struct btf_decl_tag {
        __s32   component_idx;
 };
 
+/* BTF_KIND_ENUM64 is followed by multiple "struct btf_enum64".
+ * The exact number of btf_enum64 is stored in the vlen (of the
+ * info in "struct btf_type").
+ */
+struct btf_enum64 {
+	__u32	name_off;
+	__u32	val_lo32;
+	__u32	val_hi32;
+};
+
 #endif /* _UAPI__LINUX_BTF_H__ */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 7bccaa4646e5..6c0d8480e15c 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -309,6 +309,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
 	[BTF_KIND_FLOAT]	= "FLOAT",
 	[BTF_KIND_DECL_TAG]	= "DECL_TAG",
 	[BTF_KIND_TYPE_TAG]	= "TYPE_TAG",
+	[BTF_KIND_ENUM64]	= "ENUM64",
 };
 
 const char *btf_type_str(const struct btf_type *t)
@@ -666,6 +667,7 @@ static bool btf_type_has_size(const struct btf_type *t)
 	case BTF_KIND_ENUM:
 	case BTF_KIND_DATASEC:
 	case BTF_KIND_FLOAT:
+	case BTF_KIND_ENUM64:
 		return true;
 	}
 
@@ -711,6 +713,11 @@ static const struct btf_decl_tag *btf_type_decl_tag(const struct btf_type *t)
 	return (const struct btf_decl_tag *)(t + 1);
 }
 
+static const struct btf_enum64 *btf_type_enum64(const struct btf_type *t)
+{
+	return (const struct btf_enum64 *)(t + 1);
+}
+
 static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
 {
 	return kind_ops[BTF_INFO_KIND(t->info)];
@@ -1019,6 +1026,7 @@ static const char *btf_show_name(struct btf_show *show)
 			parens = "{";
 		break;
 	case BTF_KIND_ENUM:
+	case BTF_KIND_ENUM64:
 		prefix = "enum";
 		break;
 	default:
@@ -1834,6 +1842,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type,
 		case BTF_KIND_UNION:
 		case BTF_KIND_ENUM:
 		case BTF_KIND_FLOAT:
+		case BTF_KIND_ENUM64:
 			size = type->size;
 			goto resolved;
 
@@ -3670,6 +3679,7 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
 {
 	const struct btf_enum *enums = btf_type_enum(t);
 	struct btf *btf = env->btf;
+	const char *fmt_str;
 	u16 i, nr_enums;
 	u32 meta_needed;
 
@@ -3683,11 +3693,6 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
-	if (btf_type_kflag(t)) {
-		btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
-		return -EINVAL;
-	}
-
 	if (t->size > 8 || !is_power_of_2(t->size)) {
 		btf_verifier_log_type(env, t, "Unexpected size");
 		return -EINVAL;
@@ -3718,7 +3723,8 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
 
 		if (env->log.level == BPF_LOG_KERNEL)
 			continue;
-		btf_verifier_log(env, "\t%s val=%d\n",
+		fmt_str = btf_type_kflag(t) ? "\t%s val=%d\n" : "\t%s val=%u\n";
+		btf_verifier_log(env, fmt_str,
 				 __btf_name_by_offset(btf, enums[i].name_off),
 				 enums[i].val);
 	}
@@ -3759,7 +3765,10 @@ static void btf_enum_show(const struct btf *btf, const struct btf_type *t,
 		return;
 	}
 
-	btf_show_type_value(show, "%d", v);
+	if (btf_type_kflag(t))
+		btf_show_type_value(show, "%d", v);
+	else
+		btf_show_type_value(show, "%u", v);
 	btf_show_end_type(show);
 }
 
@@ -3772,6 +3781,109 @@ static struct btf_kind_operations enum_ops = {
 	.show = btf_enum_show,
 };
 
+static s32 btf_enum64_check_meta(struct btf_verifier_env *env,
+				 const struct btf_type *t,
+				 u32 meta_left)
+{
+	const struct btf_enum64 *enums = btf_type_enum64(t);
+	struct btf *btf = env->btf;
+	const char *fmt_str;
+	u16 i, nr_enums;
+	u32 meta_needed;
+
+	nr_enums = btf_type_vlen(t);
+	meta_needed = nr_enums * sizeof(*enums);
+
+	if (meta_left < meta_needed) {
+		btf_verifier_log_basic(env, t,
+				       "meta_left:%u meta_needed:%u",
+				       meta_left, meta_needed);
+		return -EINVAL;
+	}
+
+	if (t->size > 8 || !is_power_of_2(t->size)) {
+		btf_verifier_log_type(env, t, "Unexpected size");
+		return -EINVAL;
+	}
+
+	/* enum type either no name or a valid one */
+	if (t->name_off &&
+	    !btf_name_valid_identifier(env->btf, t->name_off)) {
+		btf_verifier_log_type(env, t, "Invalid name");
+		return -EINVAL;
+	}
+
+	btf_verifier_log_type(env, t, NULL);
+
+	for (i = 0; i < nr_enums; i++) {
+		if (!btf_name_offset_valid(btf, enums[i].name_off)) {
+			btf_verifier_log(env, "\tInvalid name_offset:%u",
+					 enums[i].name_off);
+			return -EINVAL;
+		}
+
+		/* enum member must have a valid name */
+		if (!enums[i].name_off ||
+		    !btf_name_valid_identifier(btf, enums[i].name_off)) {
+			btf_verifier_log_type(env, t, "Invalid name");
+			return -EINVAL;
+		}
+
+		if (env->log.level == BPF_LOG_KERNEL)
+			continue;
+
+		fmt_str = btf_type_kflag(t) ? "\t%s val=%lld\n" : "\t%s val=%llu\n";
+		btf_verifier_log(env, fmt_str,
+				 __btf_name_by_offset(btf, enums[i].name_off),
+				 btf_enum64_value(enums + i));
+	}
+
+	return meta_needed;
+}
+
+static void btf_enum64_show(const struct btf *btf, const struct btf_type *t,
+			    u32 type_id, void *data, u8 bits_offset,
+			    struct btf_show *show)
+{
+	const struct btf_enum64 *enums = btf_type_enum64(t);
+	u32 i, nr_enums = btf_type_vlen(t);
+	void *safe_data;
+	s64 v;
+
+	safe_data = btf_show_start_type(show, t, type_id, data);
+	if (!safe_data)
+		return;
+
+	v = *(u64 *)safe_data;
+
+	for (i = 0; i < nr_enums; i++) {
+		if (v != btf_enum64_value(enums + i))
+			continue;
+
+		btf_show_type_value(show, "%s",
+				    __btf_name_by_offset(btf,
+							 enums[i].name_off));
+
+		btf_show_end_type(show);
+		return;
+	}
+
+	if (btf_type_kflag(t))
+		btf_show_type_value(show, "%lld", v);
+	else
+		btf_show_type_value(show, "%llu", v);
+	btf_show_end_type(show);
+}
+
+static struct btf_kind_operations enum64_ops = {
+	.check_meta = btf_enum64_check_meta,
+	.resolve = btf_df_resolve,
+	.check_member = btf_enum_check_member,
+	.check_kflag_member = btf_enum_check_kflag_member,
+	.log_details = btf_enum_log,
+	.show = btf_enum64_show,
+};
+
 static s32 btf_func_proto_check_meta(struct btf_verifier_env *env,
 				     const struct btf_type *t,
 				     u32 meta_left)
@@ -4438,6 +4550,7 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
 	[BTF_KIND_FLOAT] = &float_ops,
 	[BTF_KIND_DECL_TAG] = &decl_tag_ops,
 	[BTF_KIND_TYPE_TAG] = &modifier_ops,
+	[BTF_KIND_ENUM64] = &enum64_ops,
 };
 
 static s32 btf_check_meta(struct btf_verifier_env *env,
@@ -5299,7 +5412,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 	/* skip modifiers */
 	while (btf_type_is_modifier(t))
 		t = btf_type_by_id(btf, t->type);
-	if (btf_type_is_small_int(t) || btf_type_is_enum(t))
+	if (btf_type_is_small_int(t) || btf_is_any_enum(t))
 		/* accessing a scalar */
 		return true;
 	if (!btf_type_is_ptr(t)) {
@@ -5763,7 +5876,7 @@ static int __get_type_size(struct btf *btf, u32 btf_id,
 	if (btf_type_is_ptr(t))
 		/* kernel size of pointer. Not BPF's size of pointer*/
 		return sizeof(void *);
-	if (btf_type_is_int(t) || btf_type_is_enum(t))
+	if (btf_type_is_int(t) || btf_is_any_enum(t))
 		return t->size;
 	*bad_type = t;
 	return -EINVAL;
@@ -5911,7 +6024,7 @@ static int btf_check_func_type_match(struct bpf_verifier_log *log,
 		 * to context only. And only global functions can be replaced.
 		 * Hence type check only those types.
 		 */
-		if (btf_type_is_int(t1) || btf_type_is_enum(t1))
+		if (btf_type_is_int(t1) || btf_is_any_enum(t1))
 			continue;
 		if (!btf_type_is_ptr(t1)) {
 			bpf_log(log,
@@ -6408,7 +6521,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 	t = btf_type_by_id(btf, t->type);
 	while (btf_type_is_modifier(t))
 		t = btf_type_by_id(btf, t->type);
-	if (!btf_type_is_int(t) && !btf_type_is_enum(t)) {
+	if (!btf_type_is_int(t) && !btf_is_any_enum(t)) {
 		bpf_log(log,
 			"Global function %s() doesn't return scalar. Only those are supported.\n",
 			tname);
@@ -6423,7 +6536,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 		t = btf_type_by_id(btf, args[i].type);
 		while (btf_type_is_modifier(t))
 			t = btf_type_by_id(btf, t->type);
-		if (btf_type_is_int(t) || btf_type_is_enum(t)) {
+		if (btf_type_is_int(t) || btf_is_any_enum(t)) {
 			reg->type = SCALAR_VALUE;
 			continue;
 		}
@@ -7335,6 +7448,7 @@ recur:
 	case BTF_KIND_UNION:
 	case BTF_KIND_ENUM:
 	case BTF_KIND_FWD:
+	case BTF_KIND_ENUM64:
 		return 1;
 	case BTF_KIND_INT:
 		/* just reject deprecated bitfield-like integers; all other
@@ -7387,10 +7501,10 @@ recur:
  * field-based relocations. This function assumes that root types were already
  * checked for name match. Beyond that initial root-level name check, names
  * are completely ignored. Compatibility rules are as follows:
- *   - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but
+ *   - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs/ENUM64s are considered compatible, but
  *     kind should match for local and target types (i.e., STRUCT is not
  *     compatible with UNION);
- *   - for ENUMs, the size is ignored;
+ *   - for ENUMs/ENUM64s, the size is ignored;
  *   - for INT, size and signedness are ignored;
  *   - for ARRAY, dimensionality is ignored, element types are checked for
  *     compatibility recursively;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index aedac2ac02b9..2d2872682278 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -10901,7 +10901,7 @@ static int check_btf_func(struct bpf_verifier_env *env,
 			goto err_free;
 		ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL);
 		scalar_return =
-			btf_type_is_small_int(ret_type) || btf_type_is_enum(ret_type);
+			btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type);
 		if (i && !scalar_return && env->subprog_info[i].has_ld_abs) {
 			verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n");
 			goto err_free;
diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h
index a9162a6c0284..ec1798b6d3ff 100644
--- a/tools/include/uapi/linux/btf.h
+++ b/tools/include/uapi/linux/btf.h
@@ -36,10 +36,10 @@ struct btf_type {
 	 * bits 24-28: kind (e.g. int, ptr, array...etc)
 	 * bits 29-30: unused
 	 * bit     31: kind_flag, currently used by
-	 *             struct, union and fwd
+	 *             struct, union, enum, fwd and enum64
 	 */
 	__u32 info;
-	/* "size" is used by INT, ENUM, STRUCT, UNION and DATASEC.
+	/* "size" is used by INT, ENUM, STRUCT, UNION, DATASEC and ENUM64.
 	 * "size" tells the size of the type it is describing.
 	 *
 	 * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT,
@@ -63,7 +63,7 @@ enum {
 	BTF_KIND_ARRAY		= 3,	/* Array	*/
 	BTF_KIND_STRUCT		= 4,	/* Struct	*/
 	BTF_KIND_UNION		= 5,	/* Union	*/
-	BTF_KIND_ENUM		= 6,	/* Enumeration	*/
+	BTF_KIND_ENUM		= 6,	/* Enumeration up to 32-bit values */
 	BTF_KIND_FWD		= 7,	/* Forward	*/
 	BTF_KIND_TYPEDEF	= 8,	/* Typedef	*/
 	BTF_KIND_VOLATILE	= 9,	/* Volatile	*/
@@ -76,6 +76,7 @@ enum {
 	BTF_KIND_FLOAT		= 16,	/* Floating point	*/
 	BTF_KIND_DECL_TAG	= 17,	/* Decl Tag */
 	BTF_KIND_TYPE_TAG	= 18,	/* Type Tag */
+	BTF_KIND_ENUM64		= 19,	/* Enumeration up to 64-bit values */
 
 	NR_BTF_KINDS,
 	BTF_KIND_MAX		= NR_BTF_KINDS - 1,
@@ -186,4 +187,14 @@ struct btf_decl_tag {
        __s32   component_idx;
 };
 
+/* BTF_KIND_ENUM64 is followed by multiple "struct btf_enum64".
+ * The exact number of btf_enum64 is stored in the vlen (of the
+ * info in "struct btf_type").
+ */
+struct btf_enum64 {
+	__u32	name_off;
+	__u32	val_lo32;
+	__u32	val_hi32;
+};
+
 #endif /* _UAPI__LINUX_BTF_H__ */
-- 
cgit 


From 0e3c3b901c00364198d31482fa2552ccf2d5c899 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 6 Jun 2022 18:42:59 -0400
Subject: No need of likely/unlikely on calls of check_copy_size()

it's inline and unlikely() inside of it (including the implicit one
in WARN_ON_ONCE()) suffice to convince the compiler that getting
false from check_copy_size() is unlikely.

Spotted-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/powerpc/include/asm/uaccess.h |  2 +-
 arch/s390/include/asm/uaccess.h    |  4 ++--
 include/linux/uaccess.h            |  4 ++--
 include/linux/uio.h                | 15 ++++++---------
 4 files changed, 11 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index 9b82b38ff867..105f200b1e31 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -348,7 +348,7 @@ copy_mc_to_kernel(void *to, const void *from, unsigned long size)
 static inline unsigned long __must_check
 copy_mc_to_user(void __user *to, const void *from, unsigned long n)
 {
-	if (likely(check_copy_size(from, n, true))) {
+	if (check_copy_size(from, n, true)) {
 		if (access_ok(to, n)) {
 			allow_write_to_user(to, n);
 			n = copy_mc_generic((void *)to, from, n);
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index f4511e21d646..c2c9995466e0 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -39,7 +39,7 @@ _copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned
 static __always_inline unsigned long __must_check
 copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned long key)
 {
-	if (likely(check_copy_size(to, n, false)))
+	if (check_copy_size(to, n, false))
 		n = _copy_from_user_key(to, from, n, key);
 	return n;
 }
@@ -50,7 +50,7 @@ _copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned l
 static __always_inline unsigned long __must_check
 copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned long key)
 {
-	if (likely(check_copy_size(from, n, true)))
+	if (check_copy_size(from, n, true))
 		n = _copy_to_user_key(to, from, n, key);
 	return n;
 }
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 5a328cf02b75..47e5d374c7eb 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -148,7 +148,7 @@ _copy_to_user(void __user *, const void *, unsigned long);
 static __always_inline unsigned long __must_check
 copy_from_user(void *to, const void __user *from, unsigned long n)
 {
-	if (likely(check_copy_size(to, n, false)))
+	if (check_copy_size(to, n, false))
 		n = _copy_from_user(to, from, n);
 	return n;
 }
@@ -156,7 +156,7 @@ copy_from_user(void *to, const void __user *from, unsigned long n)
 static __always_inline unsigned long __must_check
 copy_to_user(void __user *to, const void *from, unsigned long n)
 {
-	if (likely(check_copy_size(from, n, true)))
+	if (check_copy_size(from, n, true))
 		n = _copy_to_user(to, from, n);
 	return n;
 }
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 739285fe5a2f..76d305f3d4c2 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -156,19 +156,17 @@ static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset,
 static __always_inline __must_check
 size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 {
-	if (unlikely(!check_copy_size(addr, bytes, true)))
-		return 0;
-	else
+	if (check_copy_size(addr, bytes, true))
 		return _copy_to_iter(addr, bytes, i);
+	return 0;
 }
 
 static __always_inline __must_check
 size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 {
-	if (unlikely(!check_copy_size(addr, bytes, false)))
-		return 0;
-	else
+	if (check_copy_size(addr, bytes, false))
 		return _copy_from_iter(addr, bytes, i);
+	return 0;
 }
 
 static __always_inline __must_check
@@ -184,10 +182,9 @@ bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i)
 static __always_inline __must_check
 size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
 {
-	if (unlikely(!check_copy_size(addr, bytes, false)))
-		return 0;
-	else
+	if (check_copy_size(addr, bytes, false))
 		return _copy_from_iter_nocache(addr, bytes, i);
+	return 0;
 }
 
 static __always_inline __must_check
-- 
cgit 


From a11b80692be5c408a33ea89e3fe1a240bef8c820 Mon Sep 17 00:00:00 2001
From: Dmitry Bogdanov <d.bogdanov@yadro.com>
Date: Mon, 23 May 2022 12:59:03 +0300
Subject: scsi: target: iscsi: Add upcast helpers

iSCSI target is cluttered with open-coded container_of() conversions from
se_nacl to iscsi_node_acl. The code could be cleaned by introducing a
helper - to_iscsi_nacl() (similar to other helpers in target core).

While at it, make another iSCSI conversion helper consistent and rename
iscsi_tpg() helper to to_iscsi_tpg().

Link: https://lore.kernel.org/r/20220523095905.26070-2-d.bogdanov@yadro.com
Reviewed-by: Roman Bolshakov <r.bolshakov@yadro.com>
Reviewed-by: Konstantin Shelekhin <k.shelekhin@yadro.com>
Reviewed-by: Mike Christie <michael.christie@oracle.com>
Reviewed-by: Lee Duncan <lduncan@suse.com>
Signed-off-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/iscsi/iscsi_target_configfs.c | 82 ++++++++++------------------
 drivers/target/iscsi/iscsi_target_nego.c     | 14 ++---
 drivers/target/iscsi/iscsi_target_tpg.c      |  3 +-
 include/target/iscsi/iscsi_target_core.h     | 12 ++++
 4 files changed, 49 insertions(+), 62 deletions(-)

(limited to 'include')

diff --git a/drivers/target/iscsi/iscsi_target_configfs.c b/drivers/target/iscsi/iscsi_target_configfs.c
index ce14540ba650..b01b6701c144 100644
--- a/drivers/target/iscsi/iscsi_target_configfs.c
+++ b/drivers/target/iscsi/iscsi_target_configfs.c
@@ -210,7 +210,7 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
 		return ERR_PTR(ret);
 	}
 
-	tpg = container_of(se_tpg, struct iscsi_portal_group, tpg_se_tpg);
+	tpg = to_iscsi_tpg(se_tpg);
 	ret = iscsit_get_tpg(tpg);
 	if (ret < 0)
 		return ERR_PTR(-EINVAL);
@@ -281,9 +281,7 @@ static ssize_t iscsi_nacl_attrib_##name##_show(struct config_item *item,\
 		char *page)						\
 {									\
 	struct se_node_acl *se_nacl = attrib_to_nacl(item);		\
-	struct iscsi_node_acl *nacl = container_of(se_nacl, struct iscsi_node_acl, \
-					se_node_acl);			\
-									\
+	struct iscsi_node_acl *nacl = to_iscsi_nacl(se_nacl);		\
 	return sprintf(page, "%u\n", nacl->node_attrib.name);		\
 }									\
 									\
@@ -291,8 +289,7 @@ static ssize_t iscsi_nacl_attrib_##name##_store(struct config_item *item,\
 		const char *page, size_t count)				\
 {									\
 	struct se_node_acl *se_nacl = attrib_to_nacl(item);		\
-	struct iscsi_node_acl *nacl = container_of(se_nacl, struct iscsi_node_acl, \
-					se_node_acl);			\
+	struct iscsi_node_acl *nacl = to_iscsi_nacl(se_nacl);		\
 	u32 val;							\
 	int ret;							\
 									\
@@ -377,15 +374,14 @@ static ssize_t iscsi_nacl_auth_##name##_show(struct config_item *item,	\
 		char *page)						\
 {									\
 	struct se_node_acl *nacl = auth_to_nacl(item);			\
-	return __iscsi_nacl_auth_##name##_show(container_of(nacl,	\
-			struct iscsi_node_acl, se_node_acl), page);	\
+	return __iscsi_nacl_auth_##name##_show(to_iscsi_nacl(nacl), page);	\
 }									\
 static ssize_t iscsi_nacl_auth_##name##_store(struct config_item *item,	\
 		const char *page, size_t count)				\
 {									\
 	struct se_node_acl *nacl = auth_to_nacl(item);			\
-	return __iscsi_nacl_auth_##name##_store(container_of(nacl,	\
-			struct iscsi_node_acl, se_node_acl), page, count); \
+	return __iscsi_nacl_auth_##name##_store(to_iscsi_nacl(nacl),	\
+						page, count); \
 }									\
 									\
 CONFIGFS_ATTR(iscsi_nacl_auth_, name)
@@ -417,8 +413,7 @@ static ssize_t iscsi_nacl_auth_##name##_show(struct config_item *item,	\
 		char *page)						\
 {									\
 	struct se_node_acl *nacl = auth_to_nacl(item);			\
-	return __iscsi_nacl_auth_##name##_show(container_of(nacl,	\
-			struct iscsi_node_acl, se_node_acl), page);	\
+	return __iscsi_nacl_auth_##name##_show(to_iscsi_nacl(nacl), page);	\
 }									\
 									\
 CONFIGFS_ATTR_RO(iscsi_nacl_auth_, name)
@@ -623,8 +618,7 @@ static ssize_t lio_target_nacl_cmdsn_depth_store(struct config_item *item,
 {
 	struct se_node_acl *se_nacl = acl_to_nacl(item);
 	struct se_portal_group *se_tpg = se_nacl->se_tpg;
-	struct iscsi_portal_group *tpg = container_of(se_tpg,
-			struct iscsi_portal_group, tpg_se_tpg);
+	struct iscsi_portal_group *tpg = to_iscsi_tpg(se_tpg);
 	struct config_item *acl_ci, *tpg_ci, *wwn_ci;
 	u32 cmdsn_depth = 0;
 	int ret;
@@ -700,8 +694,7 @@ static struct configfs_attribute *lio_target_initiator_attrs[] = {
 static int lio_target_init_nodeacl(struct se_node_acl *se_nacl,
 		const char *name)
 {
-	struct iscsi_node_acl *acl =
-		container_of(se_nacl, struct iscsi_node_acl, se_node_acl);
+	struct iscsi_node_acl *acl = to_iscsi_nacl(se_nacl);
 
 	config_group_init_type_name(&acl->node_stat_grps.iscsi_sess_stats_group,
 			"iscsi_sess_stats", &iscsi_stat_sess_cit);
@@ -720,8 +713,7 @@ static ssize_t iscsi_tpg_attrib_##name##_show(struct config_item *item,	\
 		char *page)						\
 {									\
 	struct se_portal_group *se_tpg = attrib_to_tpg(item);		\
-	struct iscsi_portal_group *tpg = container_of(se_tpg,		\
-			struct iscsi_portal_group, tpg_se_tpg);	\
+	struct iscsi_portal_group *tpg = to_iscsi_tpg(se_tpg);		\
 	ssize_t rb;							\
 									\
 	if (iscsit_get_tpg(tpg) < 0)					\
@@ -736,8 +728,7 @@ static ssize_t iscsi_tpg_attrib_##name##_store(struct config_item *item,\
 		const char *page, size_t count)				\
 {									\
 	struct se_portal_group *se_tpg = attrib_to_tpg(item);		\
-	struct iscsi_portal_group *tpg = container_of(se_tpg,		\
-			struct iscsi_portal_group, tpg_se_tpg);	\
+	struct iscsi_portal_group *tpg = to_iscsi_tpg(se_tpg);		\
 	u32 val;							\
 	int ret;							\
 									\
@@ -800,8 +791,7 @@ static struct configfs_attribute *lio_target_tpg_attrib_attrs[] = {
 static ssize_t __iscsi_##prefix##_##name##_show(struct se_portal_group *se_tpg,	\
 		char *page)							\
 {										\
-	struct iscsi_portal_group *tpg = container_of(se_tpg,			\
-				struct iscsi_portal_group, tpg_se_tpg);		\
+	struct iscsi_portal_group *tpg = to_iscsi_tpg(se_tpg);			\
 	struct iscsi_node_auth *auth = &tpg->tpg_demo_auth;			\
 										\
 	if (!capable(CAP_SYS_ADMIN))						\
@@ -813,8 +803,7 @@ static ssize_t __iscsi_##prefix##_##name##_show(struct se_portal_group *se_tpg,
 static ssize_t __iscsi_##prefix##_##name##_store(struct se_portal_group *se_tpg,\
 		const char *page, size_t count)					\
 {										\
-	struct iscsi_portal_group *tpg = container_of(se_tpg,			\
-				struct iscsi_portal_group, tpg_se_tpg);		\
+	struct iscsi_portal_group *tpg = to_iscsi_tpg(se_tpg);			\
 	struct iscsi_node_auth *auth = &tpg->tpg_demo_auth;			\
 										\
 	if (!capable(CAP_SYS_ADMIN))						\
@@ -861,8 +850,7 @@ DEF_TPG_AUTH_STR(password_mutual, NAF_PASSWORD_IN_SET);
 static ssize_t __iscsi_##prefix##_##name##_show(struct se_portal_group *se_tpg,	\
 		char *page)								\
 {										\
-	struct iscsi_portal_group *tpg = container_of(se_tpg,			\
-				struct iscsi_portal_group, tpg_se_tpg);		\
+	struct iscsi_portal_group *tpg = to_iscsi_tpg(se_tpg);			\
 	struct iscsi_node_auth *auth = &tpg->tpg_demo_auth;			\
 										\
 	if (!capable(CAP_SYS_ADMIN))						\
@@ -900,8 +888,7 @@ static ssize_t iscsi_tpg_param_##name##_show(struct config_item *item,	\
 		char *page)						\
 {									\
 	struct se_portal_group *se_tpg = param_to_tpg(item);		\
-	struct iscsi_portal_group *tpg = container_of(se_tpg,		\
-			struct iscsi_portal_group, tpg_se_tpg);		\
+	struct iscsi_portal_group *tpg = to_iscsi_tpg(se_tpg);		\
 	struct iscsi_param *param;					\
 	ssize_t rb;							\
 									\
@@ -923,8 +910,7 @@ static ssize_t iscsi_tpg_param_##name##_store(struct config_item *item, \
 		const char *page, size_t count)				\
 {									\
 	struct se_portal_group *se_tpg = param_to_tpg(item);		\
-	struct iscsi_portal_group *tpg = container_of(se_tpg,		\
-			struct iscsi_portal_group, tpg_se_tpg);		\
+	struct iscsi_portal_group *tpg = to_iscsi_tpg(se_tpg);		\
 	char *buf;							\
 	int ret, len;							\
 									\
@@ -1073,8 +1059,7 @@ free_out:
 static int lio_target_tiqn_enabletpg(struct se_portal_group *se_tpg,
 				     bool enable)
 {
-	struct iscsi_portal_group *tpg = container_of(se_tpg,
-			struct iscsi_portal_group, tpg_se_tpg);
+	struct iscsi_portal_group *tpg = to_iscsi_tpg(se_tpg);
 	int ret;
 
 	ret = iscsit_get_tpg(tpg);
@@ -1106,7 +1091,7 @@ static void lio_target_tiqn_deltpg(struct se_portal_group *se_tpg)
 	struct iscsi_portal_group *tpg;
 	struct iscsi_tiqn *tiqn;
 
-	tpg = container_of(se_tpg, struct iscsi_portal_group, tpg_se_tpg);
+	tpg = to_iscsi_tpg(se_tpg);
 	tiqn = tpg->tpg_tiqn;
 	/*
 	 * iscsit_tpg_del_portal_group() assumes force=1
@@ -1416,46 +1401,41 @@ static void lio_aborted_task(struct se_cmd *se_cmd)
 	cmd->conn->conn_transport->iscsit_aborted_task(cmd->conn, cmd);
 }
 
-static inline struct iscsi_portal_group *iscsi_tpg(struct se_portal_group *se_tpg)
-{
-	return container_of(se_tpg, struct iscsi_portal_group, tpg_se_tpg);
-}
-
 static char *lio_tpg_get_endpoint_wwn(struct se_portal_group *se_tpg)
 {
-	return iscsi_tpg(se_tpg)->tpg_tiqn->tiqn;
+	return to_iscsi_tpg(se_tpg)->tpg_tiqn->tiqn;
 }
 
 static u16 lio_tpg_get_tag(struct se_portal_group *se_tpg)
 {
-	return iscsi_tpg(se_tpg)->tpgt;
+	return to_iscsi_tpg(se_tpg)->tpgt;
 }
 
 static u32 lio_tpg_get_default_depth(struct se_portal_group *se_tpg)
 {
-	return iscsi_tpg(se_tpg)->tpg_attrib.default_cmdsn_depth;
+	return to_iscsi_tpg(se_tpg)->tpg_attrib.default_cmdsn_depth;
 }
 
 static int lio_tpg_check_demo_mode(struct se_portal_group *se_tpg)
 {
-	return iscsi_tpg(se_tpg)->tpg_attrib.generate_node_acls;
+	return to_iscsi_tpg(se_tpg)->tpg_attrib.generate_node_acls;
 }
 
 static int lio_tpg_check_demo_mode_cache(struct se_portal_group *se_tpg)
 {
-	return iscsi_tpg(se_tpg)->tpg_attrib.cache_dynamic_acls;
+	return to_iscsi_tpg(se_tpg)->tpg_attrib.cache_dynamic_acls;
 }
 
 static int lio_tpg_check_demo_mode_write_protect(
 	struct se_portal_group *se_tpg)
 {
-	return iscsi_tpg(se_tpg)->tpg_attrib.demo_mode_write_protect;
+	return to_iscsi_tpg(se_tpg)->tpg_attrib.demo_mode_write_protect;
 }
 
 static int lio_tpg_check_prod_mode_write_protect(
 	struct se_portal_group *se_tpg)
 {
-	return iscsi_tpg(se_tpg)->tpg_attrib.prod_mode_write_protect;
+	return to_iscsi_tpg(se_tpg)->tpg_attrib.prod_mode_write_protect;
 }
 
 static int lio_tpg_check_prot_fabric_only(
@@ -1465,9 +1445,9 @@ static int lio_tpg_check_prot_fabric_only(
 	 * Only report fabric_prot_type if t10_pi has also been enabled
 	 * for incoming ib_isert sessions.
 	 */
-	if (!iscsi_tpg(se_tpg)->tpg_attrib.t10_pi)
+	if (!to_iscsi_tpg(se_tpg)->tpg_attrib.t10_pi)
 		return 0;
-	return iscsi_tpg(se_tpg)->tpg_attrib.fabric_prot_type;
+	return to_iscsi_tpg(se_tpg)->tpg_attrib.fabric_prot_type;
 }
 
 /*
@@ -1504,16 +1484,14 @@ static void lio_tpg_close_session(struct se_session *se_sess)
 
 static u32 lio_tpg_get_inst_index(struct se_portal_group *se_tpg)
 {
-	return iscsi_tpg(se_tpg)->tpg_tiqn->tiqn_index;
+	return to_iscsi_tpg(se_tpg)->tpg_tiqn->tiqn_index;
 }
 
 static void lio_set_default_node_attributes(struct se_node_acl *se_acl)
 {
-	struct iscsi_node_acl *acl = container_of(se_acl, struct iscsi_node_acl,
-				se_node_acl);
+	struct iscsi_node_acl *acl = to_iscsi_nacl(se_acl);
 	struct se_portal_group *se_tpg = se_acl->se_tpg;
-	struct iscsi_portal_group *tpg = container_of(se_tpg,
-				struct iscsi_portal_group, tpg_se_tpg);
+	struct iscsi_portal_group *tpg = to_iscsi_tpg(se_tpg);
 
 	acl->node_attrib.nacl = acl;
 	iscsit_set_default_node_attribues(acl, tpg);
diff --git a/drivers/target/iscsi/iscsi_target_nego.c b/drivers/target/iscsi/iscsi_target_nego.c
index b34ac9ecac31..d853bacf1cfc 100644
--- a/drivers/target/iscsi/iscsi_target_nego.c
+++ b/drivers/target/iscsi/iscsi_target_nego.c
@@ -104,8 +104,8 @@ static u32 iscsi_handle_authentication(
 {
 	struct iscsit_session *sess = conn->sess;
 	struct iscsi_node_auth *auth;
-	struct iscsi_node_acl *iscsi_nacl;
-	struct iscsi_portal_group *iscsi_tpg;
+	struct iscsi_node_acl *nacl;
+	struct iscsi_portal_group *tpg;
 	struct se_node_acl *se_nacl;
 
 	if (!sess->sess_ops->SessionType) {
@@ -120,15 +120,13 @@ static u32 iscsi_handle_authentication(
 		}
 
 		if (se_nacl->dynamic_node_acl) {
-			iscsi_tpg = container_of(se_nacl->se_tpg,
-					struct iscsi_portal_group, tpg_se_tpg);
+			tpg = to_iscsi_tpg(se_nacl->se_tpg);
 
-			auth = &iscsi_tpg->tpg_demo_auth;
+			auth = &tpg->tpg_demo_auth;
 		} else {
-			iscsi_nacl = container_of(se_nacl, struct iscsi_node_acl,
-						  se_node_acl);
+			nacl = to_iscsi_nacl(se_nacl);
 
-			auth = &iscsi_nacl->node_auth;
+			auth = &nacl->node_auth;
 		}
 	} else {
 		/*
diff --git a/drivers/target/iscsi/iscsi_target_tpg.c b/drivers/target/iscsi/iscsi_target_tpg.c
index 4339ee517434..3cac1aafef68 100644
--- a/drivers/target/iscsi/iscsi_target_tpg.c
+++ b/drivers/target/iscsi/iscsi_target_tpg.c
@@ -394,8 +394,7 @@ struct iscsi_node_attrib *iscsit_tpg_get_node_attrib(
 {
 	struct se_session *se_sess = sess->se_sess;
 	struct se_node_acl *se_nacl = se_sess->se_node_acl;
-	struct iscsi_node_acl *acl = container_of(se_nacl, struct iscsi_node_acl,
-					se_node_acl);
+	struct iscsi_node_acl *acl = to_iscsi_nacl(se_nacl);
 
 	return &acl->node_attrib;
 }
diff --git a/include/target/iscsi/iscsi_target_core.h b/include/target/iscsi/iscsi_target_core.h
index 8e68ace428d9..4dd62947f8db 100644
--- a/include/target/iscsi/iscsi_target_core.h
+++ b/include/target/iscsi/iscsi_target_core.h
@@ -758,6 +758,12 @@ struct iscsi_node_acl {
 	struct iscsi_node_stat_grps node_stat_grps;
 };
 
+static inline struct iscsi_node_acl *
+to_iscsi_nacl(struct se_node_acl *se_nacl)
+{
+	return container_of(se_nacl, struct iscsi_node_acl, se_node_acl);
+}
+
 struct iscsi_tpg_attrib {
 	u32			authentication;
 	u32			login_timeout;
@@ -839,6 +845,12 @@ struct iscsi_portal_group {
 	struct list_head	tpg_list;
 } ____cacheline_aligned;
 
+static inline struct iscsi_portal_group *
+to_iscsi_tpg(struct se_portal_group *se_tpg)
+{
+	return container_of(se_tpg, struct iscsi_portal_group, tpg_se_tpg);
+}
+
 struct iscsi_wwn_stat_grps {
 	struct config_group	iscsi_stat_group;
 	struct config_group	iscsi_instance_group;
-- 
cgit 


From a6e0d179764cb31b2981c85e6fd156adc777e4ed Mon Sep 17 00:00:00 2001
From: Dmitry Bogdanov <d.bogdanov@yadro.com>
Date: Mon, 23 May 2022 12:59:05 +0300
Subject: scsi: target: iscsi: Control authentication per ACL

Add acls/{ACL}/attrib/authentication attribute that controls authentication
for particular ACL. By default, this attribute inherits a value of the
authentication attribute of the target port group to keep backward
compatibility.

Authentication attribute has 3 states:

 "0" - authentication is turned off for this ACL

 "1" - authentication is required for this ACL

 "-1" - authentication is inherited from TPG

Link: https://lore.kernel.org/r/20220523095905.26070-4-d.bogdanov@yadro.com
Reviewed-by: Roman Bolshakov <r.bolshakov@yadro.com>
Reviewed-by: Konstantin Shelekhin <k.shelekhin@yadro.com>
Reviewed-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/iscsi/iscsi_target_configfs.c   | 31 ++++++++++++++++++++++++++
 drivers/target/iscsi/iscsi_target_nego.c       |  8 ++++++-
 drivers/target/iscsi/iscsi_target_nodeattrib.c |  1 +
 include/target/iscsi/iscsi_target_core.h       |  2 ++
 4 files changed, 41 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/target/iscsi/iscsi_target_configfs.c b/drivers/target/iscsi/iscsi_target_configfs.c
index b01b6701c144..5d0f51822414 100644
--- a/drivers/target/iscsi/iscsi_target_configfs.c
+++ b/drivers/target/iscsi/iscsi_target_configfs.c
@@ -314,6 +314,36 @@ ISCSI_NACL_ATTR(random_datain_pdu_offsets);
 ISCSI_NACL_ATTR(random_datain_seq_offsets);
 ISCSI_NACL_ATTR(random_r2t_offsets);
 
+static ssize_t iscsi_nacl_attrib_authentication_show(struct config_item *item,
+		char *page)
+{
+	struct se_node_acl *se_nacl = attrib_to_nacl(item);
+	struct iscsi_node_acl *nacl = to_iscsi_nacl(se_nacl);
+
+	return sprintf(page, "%d\n", nacl->node_attrib.authentication);
+}
+
+static ssize_t iscsi_nacl_attrib_authentication_store(struct config_item *item,
+		const char *page, size_t count)
+{
+	struct se_node_acl *se_nacl = attrib_to_nacl(item);
+	struct iscsi_node_acl *nacl = to_iscsi_nacl(se_nacl);
+	s32 val;
+	int ret;
+
+	ret = kstrtos32(page, 0, &val);
+	if (ret)
+		return ret;
+	if (val != 0 && val != 1 && val != NA_AUTHENTICATION_INHERITED)
+		return -EINVAL;
+
+	nacl->node_attrib.authentication = val;
+
+	return count;
+}
+
+CONFIGFS_ATTR(iscsi_nacl_attrib_, authentication);
+
 static struct configfs_attribute *lio_target_nacl_attrib_attrs[] = {
 	&iscsi_nacl_attrib_attr_dataout_timeout,
 	&iscsi_nacl_attrib_attr_dataout_timeout_retries,
@@ -323,6 +353,7 @@ static struct configfs_attribute *lio_target_nacl_attrib_attrs[] = {
 	&iscsi_nacl_attrib_attr_random_datain_pdu_offsets,
 	&iscsi_nacl_attrib_attr_random_datain_seq_offsets,
 	&iscsi_nacl_attrib_attr_random_r2t_offsets,
+	&iscsi_nacl_attrib_attr_authentication,
 	NULL,
 };
 
diff --git a/drivers/target/iscsi/iscsi_target_nego.c b/drivers/target/iscsi/iscsi_target_nego.c
index f06f16d63fe6..9ce35a59962b 100644
--- a/drivers/target/iscsi/iscsi_target_nego.c
+++ b/drivers/target/iscsi/iscsi_target_nego.c
@@ -813,6 +813,7 @@ static int iscsi_target_do_authentication(
 
 static bool iscsi_conn_auth_required(struct iscsit_conn *conn)
 {
+	struct iscsi_node_acl *nacl;
 	struct se_node_acl *se_nacl;
 
 	if (conn->sess->sess_ops->SessionType) {
@@ -839,7 +840,12 @@ static bool iscsi_conn_auth_required(struct iscsit_conn *conn)
 
 	pr_debug("Known ACL %s is trying to connect\n",
 		 se_nacl->initiatorname);
-	return conn->tpg->tpg_attrib.authentication;
+
+	nacl = to_iscsi_nacl(se_nacl);
+	if (nacl->node_attrib.authentication == NA_AUTHENTICATION_INHERITED)
+		return conn->tpg->tpg_attrib.authentication;
+
+	return nacl->node_attrib.authentication;
 }
 
 static int iscsi_target_handle_csg_zero(
diff --git a/drivers/target/iscsi/iscsi_target_nodeattrib.c b/drivers/target/iscsi/iscsi_target_nodeattrib.c
index 874cb33c9be0..d63efdefb18e 100644
--- a/drivers/target/iscsi/iscsi_target_nodeattrib.c
+++ b/drivers/target/iscsi/iscsi_target_nodeattrib.c
@@ -30,6 +30,7 @@ void iscsit_set_default_node_attribues(
 {
 	struct iscsi_node_attrib *a = &acl->node_attrib;
 
+	a->authentication = NA_AUTHENTICATION_INHERITED;
 	a->dataout_timeout = NA_DATAOUT_TIMEOUT;
 	a->dataout_timeout_retries = NA_DATAOUT_TIMEOUT_RETRIES;
 	a->nopin_timeout = NA_NOPIN_TIMEOUT;
diff --git a/include/target/iscsi/iscsi_target_core.h b/include/target/iscsi/iscsi_target_core.h
index 4dd62947f8db..94d06ddfd80a 100644
--- a/include/target/iscsi/iscsi_target_core.h
+++ b/include/target/iscsi/iscsi_target_core.h
@@ -26,6 +26,7 @@ struct sock;
 #define ISCSI_RX_THREAD_NAME		"iscsi_trx"
 #define ISCSI_TX_THREAD_NAME		"iscsi_ttx"
 #define ISCSI_IQN_LEN			224
+#define NA_AUTHENTICATION_INHERITED	-1
 
 /* struct iscsi_node_attrib sanity values */
 #define NA_DATAOUT_TIMEOUT		3
@@ -715,6 +716,7 @@ struct iscsi_login {
 } ____cacheline_aligned;
 
 struct iscsi_node_attrib {
+	s32			authentication;
 	u32			dataout_timeout;
 	u32			dataout_timeout_retries;
 	u32			default_erl;
-- 
cgit 


From b1d288d9c3c5ca28df062214656a59cf7ee370e0 Mon Sep 17 00:00:00 2001
From: Prashant Malani <pmalani@chromium.org>
Date: Mon, 6 Jun 2022 20:18:03 +0000
Subject: platform/chrome: cros_ec_proto: Rename cros_ec_command function

cros_ec_command() is the name of a function as well as a struct, as such
it can confuse indexing tools (like ctags). Avoid this by renaming it to
cros_ec_cmd(). Update all the callsites to use the new name.

This patch is a find-and-replace, so should not introduce any functional
changes.

Suggested-by: Stephen Boyd <swboyd@chromium.org>
Signed-off-by: Prashant Malani <pmalani@chromium.org>
Acked-by: Lee Jones <lee.jones@linaro.org>
Reviewed-by: Stephen Boyd <swboyd@chromium.org>
Reviewed-by: Guenter Roeck <groeck@chromium.org>
Signed-off-by: Tzung-Bi Shih <tzungbi@kernel.org>
Link: https://lore.kernel.org/r/20220606201825.763788-3-pmalani@chromium.org
---
 drivers/mfd/cros_ec_dev.c                   |  4 +--
 drivers/platform/chrome/cros_ec_proto.c     | 22 ++++++++--------
 drivers/platform/chrome/cros_ec_typec.c     | 39 ++++++++++++++---------------
 drivers/platform/chrome/cros_usbpd_notify.c |  4 +--
 drivers/regulator/cros-ec-regulator.c       | 24 +++++++++---------
 include/linux/platform_data/cros_ec_proto.h |  2 +-
 6 files changed, 47 insertions(+), 48 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/cros_ec_dev.c b/drivers/mfd/cros_ec_dev.c
index 596731caf407..02d4271dfe06 100644
--- a/drivers/mfd/cros_ec_dev.c
+++ b/drivers/mfd/cros_ec_dev.c
@@ -250,8 +250,8 @@ static int ec_device_probe(struct platform_device *pdev)
 	 * The PCHG device cannot be detected by sending EC_FEATURE_GET_CMD, but
 	 * it can be detected by querying the number of peripheral chargers.
 	 */
-	retval = cros_ec_command(ec->ec_dev, 0, EC_CMD_PCHG_COUNT, NULL, 0,
-				 &pchg_count, sizeof(pchg_count));
+	retval = cros_ec_cmd(ec->ec_dev, 0, EC_CMD_PCHG_COUNT, NULL, 0,
+			     &pchg_count, sizeof(pchg_count));
 	if (retval >= 0 && pchg_count.port_count) {
 		retval = mfd_add_hotplug_devices(ec->dev,
 					cros_ec_pchg_cells,
diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c
index 13ced9d2dd71..b6bea183ee28 100644
--- a/drivers/platform/chrome/cros_ec_proto.c
+++ b/drivers/platform/chrome/cros_ec_proto.c
@@ -860,8 +860,8 @@ bool cros_ec_check_features(struct cros_ec_dev *ec, int feature)
 
 	if (features->flags[0] == -1U && features->flags[1] == -1U) {
 		/* features bitmap not read yet */
-		ret = cros_ec_command(ec->ec_dev, 0, EC_CMD_GET_FEATURES + ec->cmd_offset,
-				      NULL, 0, features, sizeof(*features));
+		ret = cros_ec_cmd(ec->ec_dev, 0, EC_CMD_GET_FEATURES + ec->cmd_offset,
+				  NULL, 0, features, sizeof(*features));
 		if (ret < 0) {
 			dev_warn(ec->dev, "cannot get EC features: %d\n", ret);
 			memset(features, 0, sizeof(*features));
@@ -942,7 +942,7 @@ int cros_ec_get_sensor_count(struct cros_ec_dev *ec)
 EXPORT_SYMBOL_GPL(cros_ec_get_sensor_count);
 
 /**
- * cros_ec_command - Send a command to the EC.
+ * cros_ec_cmd - Send a command to the EC.
  *
  * @ec_dev: EC device
  * @version: EC command version
@@ -954,13 +954,13 @@ EXPORT_SYMBOL_GPL(cros_ec_get_sensor_count);
  *
  * Return: >= 0 on success, negative error number on failure.
  */
-int cros_ec_command(struct cros_ec_device *ec_dev,
-		    unsigned int version,
-		    int command,
-		    void *outdata,
-		    int outsize,
-		    void *indata,
-		    int insize)
+int cros_ec_cmd(struct cros_ec_device *ec_dev,
+		unsigned int version,
+		int command,
+		void *outdata,
+		int outsize,
+		void *indata,
+		int insize)
 {
 	struct cros_ec_command *msg;
 	int ret;
@@ -987,4 +987,4 @@ error:
 	kfree(msg);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(cros_ec_command);
+EXPORT_SYMBOL_GPL(cros_ec_cmd);
diff --git a/drivers/platform/chrome/cros_ec_typec.c b/drivers/platform/chrome/cros_ec_typec.c
index 7cb2e35c4ded..d6088ba447af 100644
--- a/drivers/platform/chrome/cros_ec_typec.c
+++ b/drivers/platform/chrome/cros_ec_typec.c
@@ -525,8 +525,8 @@ static int cros_typec_configure_mux(struct cros_typec_data *typec, int port_num,
 	enum typec_orientation orientation;
 	int ret;
 
-	ret = cros_ec_command(typec->ec, 0, EC_CMD_USB_PD_MUX_INFO,
-			      &req, sizeof(req), &resp, sizeof(resp));
+	ret = cros_ec_cmd(typec->ec, 0, EC_CMD_USB_PD_MUX_INFO,
+			  &req, sizeof(req), &resp, sizeof(resp));
 	if (ret < 0) {
 		dev_warn(typec->dev, "Failed to get mux info for port: %d, err = %d\n",
 			 port_num, ret);
@@ -585,8 +585,8 @@ mux_ack:
 	/* Sending Acknowledgment to EC */
 	mux_ack.port = port_num;
 
-	if (cros_ec_command(typec->ec, 0, EC_CMD_USB_PD_MUX_ACK, &mux_ack,
-			    sizeof(mux_ack), NULL, 0) < 0)
+	if (cros_ec_cmd(typec->ec, 0, EC_CMD_USB_PD_MUX_ACK, &mux_ack,
+			sizeof(mux_ack), NULL, 0) < 0)
 		dev_warn(typec->dev,
 			 "Failed to send Mux ACK to EC for port: %d\n",
 			 port_num);
@@ -754,8 +754,8 @@ static int cros_typec_handle_sop_prime_disc(struct cros_typec_data *typec, int p
 	int ret = 0;
 
 	memset(disc, 0, EC_PROTO2_MAX_RESPONSE_SIZE);
-	ret = cros_ec_command(typec->ec, 0, EC_CMD_TYPEC_DISCOVERY, &req, sizeof(req),
-			      disc, EC_PROTO2_MAX_RESPONSE_SIZE);
+	ret = cros_ec_cmd(typec->ec, 0, EC_CMD_TYPEC_DISCOVERY, &req, sizeof(req),
+			  disc, EC_PROTO2_MAX_RESPONSE_SIZE);
 	if (ret < 0) {
 		dev_err(typec->dev, "Failed to get SOP' discovery data for port: %d\n", port_num);
 		goto sop_prime_disc_exit;
@@ -837,8 +837,8 @@ static int cros_typec_handle_sop_disc(struct cros_typec_data *typec, int port_nu
 	typec_partner_set_pd_revision(port->partner, pd_revision);
 
 	memset(sop_disc, 0, EC_PROTO2_MAX_RESPONSE_SIZE);
-	ret = cros_ec_command(typec->ec, 0, EC_CMD_TYPEC_DISCOVERY, &req, sizeof(req),
-			      sop_disc, EC_PROTO2_MAX_RESPONSE_SIZE);
+	ret = cros_ec_cmd(typec->ec, 0, EC_CMD_TYPEC_DISCOVERY, &req, sizeof(req),
+			  sop_disc, EC_PROTO2_MAX_RESPONSE_SIZE);
 	if (ret < 0) {
 		dev_err(typec->dev, "Failed to get SOP discovery data for port: %d\n", port_num);
 		goto disc_exit;
@@ -870,8 +870,8 @@ static int cros_typec_send_clear_event(struct cros_typec_data *typec, int port_n
 		.clear_events_mask = events_mask,
 	};
 
-	return cros_ec_command(typec->ec, 0, EC_CMD_TYPEC_CONTROL, &req,
-			       sizeof(req), NULL, 0);
+	return cros_ec_cmd(typec->ec, 0, EC_CMD_TYPEC_CONTROL, &req,
+			   sizeof(req), NULL, 0);
 }
 
 static void cros_typec_handle_status(struct cros_typec_data *typec, int port_num)
@@ -882,8 +882,8 @@ static void cros_typec_handle_status(struct cros_typec_data *typec, int port_num
 	};
 	int ret;
 
-	ret = cros_ec_command(typec->ec, 0, EC_CMD_TYPEC_STATUS, &req, sizeof(req),
-			      &resp, sizeof(resp));
+	ret = cros_ec_cmd(typec->ec, 0, EC_CMD_TYPEC_STATUS, &req, sizeof(req),
+			  &resp, sizeof(resp));
 	if (ret < 0) {
 		dev_warn(typec->dev, "EC_CMD_TYPEC_STATUS failed for port: %d\n", port_num);
 		return;
@@ -960,9 +960,9 @@ static int cros_typec_port_update(struct cros_typec_data *typec, int port_num)
 	req.mux = USB_PD_CTRL_MUX_NO_CHANGE;
 	req.swap = USB_PD_CTRL_SWAP_NONE;
 
-	ret = cros_ec_command(typec->ec, typec->pd_ctrl_ver,
-			      EC_CMD_USB_PD_CONTROL, &req, sizeof(req),
-			      &resp, sizeof(resp));
+	ret = cros_ec_cmd(typec->ec, typec->pd_ctrl_ver,
+			  EC_CMD_USB_PD_CONTROL, &req, sizeof(req),
+			  &resp, sizeof(resp));
 	if (ret < 0)
 		return ret;
 
@@ -997,9 +997,8 @@ static int cros_typec_get_cmd_version(struct cros_typec_data *typec)
 
 	/* We're interested in the PD control command version. */
 	req_v1.cmd = EC_CMD_USB_PD_CONTROL;
-	ret = cros_ec_command(typec->ec, 1, EC_CMD_GET_CMD_VERSIONS,
-			      &req_v1, sizeof(req_v1), &resp,
-				    sizeof(resp));
+	ret = cros_ec_cmd(typec->ec, 1, EC_CMD_GET_CMD_VERSIONS,
+			  &req_v1, sizeof(req_v1), &resp, sizeof(resp));
 	if (ret < 0)
 		return ret;
 
@@ -1090,8 +1089,8 @@ static int cros_typec_probe(struct platform_device *pdev)
 	typec->typec_cmd_supported = cros_ec_check_features(ec_dev, EC_FEATURE_TYPEC_CMD);
 	typec->needs_mux_ack = cros_ec_check_features(ec_dev, EC_FEATURE_TYPEC_MUX_REQUIRE_AP_ACK);
 
-	ret = cros_ec_command(typec->ec, 0, EC_CMD_USB_PD_PORTS, NULL, 0,
-			      &resp, sizeof(resp));
+	ret = cros_ec_cmd(typec->ec, 0, EC_CMD_USB_PD_PORTS, NULL, 0,
+			  &resp, sizeof(resp));
 	if (ret < 0)
 		return ret;
 
diff --git a/drivers/platform/chrome/cros_usbpd_notify.c b/drivers/platform/chrome/cros_usbpd_notify.c
index 91ce6be91aac..4b5a81c9dc6d 100644
--- a/drivers/platform/chrome/cros_usbpd_notify.c
+++ b/drivers/platform/chrome/cros_usbpd_notify.c
@@ -71,8 +71,8 @@ static void cros_usbpd_get_event_and_notify(struct device  *dev,
 	}
 
 	/* Check for PD host events on EC. */
-	ret = cros_ec_command(ec_dev, 0, EC_CMD_PD_HOST_EVENT_STATUS,
-			      NULL, 0, &host_event_status, sizeof(host_event_status));
+	ret = cros_ec_cmd(ec_dev, 0, EC_CMD_PD_HOST_EVENT_STATUS,
+			  NULL, 0, &host_event_status, sizeof(host_event_status));
 	if (ret < 0) {
 		dev_warn(dev, "Can't get host event status (err: %d)\n", ret);
 		goto send_notify;
diff --git a/drivers/regulator/cros-ec-regulator.c b/drivers/regulator/cros-ec-regulator.c
index 1c5fc74a4776..1591636f86c3 100644
--- a/drivers/regulator/cros-ec-regulator.c
+++ b/drivers/regulator/cros-ec-regulator.c
@@ -30,8 +30,8 @@ static int cros_ec_regulator_enable(struct regulator_dev *dev)
 		.enable = 1,
 	};
 
-	return cros_ec_command(data->ec_dev, 0, EC_CMD_REGULATOR_ENABLE, &cmd,
-			       sizeof(cmd), NULL, 0);
+	return cros_ec_cmd(data->ec_dev, 0, EC_CMD_REGULATOR_ENABLE, &cmd,
+			   sizeof(cmd), NULL, 0);
 }
 
 static int cros_ec_regulator_disable(struct regulator_dev *dev)
@@ -42,8 +42,8 @@ static int cros_ec_regulator_disable(struct regulator_dev *dev)
 		.enable = 0,
 	};
 
-	return cros_ec_command(data->ec_dev, 0, EC_CMD_REGULATOR_ENABLE, &cmd,
-			       sizeof(cmd), NULL, 0);
+	return cros_ec_cmd(data->ec_dev, 0, EC_CMD_REGULATOR_ENABLE, &cmd,
+			   sizeof(cmd), NULL, 0);
 }
 
 static int cros_ec_regulator_is_enabled(struct regulator_dev *dev)
@@ -55,8 +55,8 @@ static int cros_ec_regulator_is_enabled(struct regulator_dev *dev)
 	struct ec_response_regulator_is_enabled resp;
 	int ret;
 
-	ret = cros_ec_command(data->ec_dev, 0, EC_CMD_REGULATOR_IS_ENABLED, &cmd,
-			      sizeof(cmd), &resp, sizeof(resp));
+	ret = cros_ec_cmd(data->ec_dev, 0, EC_CMD_REGULATOR_IS_ENABLED, &cmd,
+			  sizeof(cmd), &resp, sizeof(resp));
 	if (ret < 0)
 		return ret;
 	return resp.enabled;
@@ -82,8 +82,8 @@ static int cros_ec_regulator_get_voltage(struct regulator_dev *dev)
 	struct ec_response_regulator_get_voltage resp;
 	int ret;
 
-	ret = cros_ec_command(data->ec_dev, 0, EC_CMD_REGULATOR_GET_VOLTAGE, &cmd,
-			      sizeof(cmd), &resp, sizeof(resp));
+	ret = cros_ec_cmd(data->ec_dev, 0, EC_CMD_REGULATOR_GET_VOLTAGE, &cmd,
+			  sizeof(cmd), &resp, sizeof(resp));
 	if (ret < 0)
 		return ret;
 	return resp.voltage_mv * 1000;
@@ -108,8 +108,8 @@ static int cros_ec_regulator_set_voltage(struct regulator_dev *dev, int min_uV,
 	if (min_mV > max_mV)
 		return -EINVAL;
 
-	return cros_ec_command(data->ec_dev, 0, EC_CMD_REGULATOR_SET_VOLTAGE, &cmd,
-			       sizeof(cmd), NULL, 0);
+	return cros_ec_cmd(data->ec_dev, 0, EC_CMD_REGULATOR_SET_VOLTAGE, &cmd,
+			   sizeof(cmd), NULL, 0);
 }
 
 static const struct regulator_ops cros_ec_regulator_voltage_ops = {
@@ -130,8 +130,8 @@ static int cros_ec_regulator_init_info(struct device *dev,
 	struct ec_response_regulator_get_info resp;
 	int ret;
 
-	ret = cros_ec_command(data->ec_dev, 0, EC_CMD_REGULATOR_GET_INFO, &cmd,
-			      sizeof(cmd), &resp, sizeof(resp));
+	ret = cros_ec_cmd(data->ec_dev, 0, EC_CMD_REGULATOR_GET_INFO, &cmd,
+			  sizeof(cmd), &resp, sizeof(resp));
 	if (ret < 0)
 		return ret;
 
diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h
index 138fd912c808..816da4eef3e5 100644
--- a/include/linux/platform_data/cros_ec_proto.h
+++ b/include/linux/platform_data/cros_ec_proto.h
@@ -231,7 +231,7 @@ bool cros_ec_check_features(struct cros_ec_dev *ec, int feature);
 
 int cros_ec_get_sensor_count(struct cros_ec_dev *ec);
 
-int cros_ec_command(struct cros_ec_device *ec_dev, unsigned int version, int command, void *outdata,
+int cros_ec_cmd(struct cros_ec_device *ec_dev, unsigned int version, int command, void *outdata,
 		    int outsize, void *indata, int insize);
 
 /**
-- 
cgit 


From f87e15fbf6d8cdb51f953338d41a4a52ad1aca14 Mon Sep 17 00:00:00 2001
From: Prashant Malani <pmalani@chromium.org>
Date: Mon, 6 Jun 2022 20:18:05 +0000
Subject: platform/chrome: cros_ec_proto: Update size arg types

cros_ec_cmd() takes 2 size arguments. Update them to be of the more
appropriate type size_t.

Suggested-by: Stephen Boyd <swboyd@chromium.org>
Signed-off-by: Prashant Malani <pmalani@chromium.org>
Reviewed-by: Stephen Boyd <swboyd@chromium.org>
Reviewed-by: Guenter Roeck <groeck@chromium.org>
Signed-off-by: Tzung-Bi Shih <tzungbi@kernel.org>
Link: https://lore.kernel.org/r/20220606201825.763788-4-pmalani@chromium.org
---
 drivers/platform/chrome/cros_ec_proto.c     | 4 ++--
 include/linux/platform_data/cros_ec_proto.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c
index b6bea183ee28..cefabfe45551 100644
--- a/drivers/platform/chrome/cros_ec_proto.c
+++ b/drivers/platform/chrome/cros_ec_proto.c
@@ -958,9 +958,9 @@ int cros_ec_cmd(struct cros_ec_device *ec_dev,
 		unsigned int version,
 		int command,
 		void *outdata,
-		int outsize,
+		size_t outsize,
 		void *indata,
-		int insize)
+		size_t insize)
 {
 	struct cros_ec_command *msg;
 	int ret;
diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h
index 816da4eef3e5..85e29300f63d 100644
--- a/include/linux/platform_data/cros_ec_proto.h
+++ b/include/linux/platform_data/cros_ec_proto.h
@@ -232,7 +232,7 @@ bool cros_ec_check_features(struct cros_ec_dev *ec, int feature);
 int cros_ec_get_sensor_count(struct cros_ec_dev *ec);
 
 int cros_ec_cmd(struct cros_ec_device *ec_dev, unsigned int version, int command, void *outdata,
-		    int outsize, void *indata, int insize);
+		    size_t outsize, void *indata, size_t insize);
 
 /**
  * cros_ec_get_time_ns() - Return time in ns.
-- 
cgit 


From ed2351174e38ad4febbbc0dba802803e6cff8ae0 Mon Sep 17 00:00:00 2001
From: Chenyi Qiang <chenyi.qiang@intel.com>
Date: Tue, 24 May 2022 21:56:21 +0800
Subject: KVM: x86: Extend KVM_{G,S}ET_VCPU_EVENTS to support pending triple
 fault

For the triple fault sythesized by KVM, e.g. the RSM path or
nested_vmx_abort(), if KVM exits to userspace before the request is
serviced, userspace could migrate the VM and lose the triple fault.

Extend KVM_{G,S}ET_VCPU_EVENTS to support pending triple fault with a
new event KVM_VCPUEVENT_VALID_FAULT_FAULT so that userspace can save and
restore the triple fault event. This extension is guarded by a new KVM
capability KVM_CAP_TRIPLE_FAULT_EVENT.

Note that in the set_vcpu_events path, userspace is able to set/clear
the triple fault request through triple_fault.pending field.

Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com>
Message-Id: <20220524135624.22988-2-chenyi.qiang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst  |  8 ++++++++
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/include/uapi/asm/kvm.h |  6 +++++-
 arch/x86/kvm/x86.c              | 21 ++++++++++++++++++++-
 include/uapi/linux/kvm.h        |  1 +
 5 files changed, 36 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 421479a67da5..f67e367c4059 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -1150,6 +1150,10 @@ The following bits are defined in the flags field:
   fields contain a valid state. This bit will be set whenever
   KVM_CAP_EXCEPTION_PAYLOAD is enabled.
 
+- KVM_VCPUEVENT_VALID_TRIPLE_FAULT may be set to signal that the
+  triple_fault_pending field contains a valid state. This bit will
+  be set whenever KVM_CAP_TRIPLE_FAULT_EVENT is enabled.
+
 ARM64:
 ^^^^^^
 
@@ -1245,6 +1249,10 @@ can be set in the flags field to signal that the
 exception_has_payload, exception_payload, and exception.pending fields
 contain a valid state and shall be written into the VCPU.
 
+If KVM_CAP_TRIPLE_FAULT_EVENT is enabled, KVM_VCPUEVENT_VALID_TRIPLE_FAULT
+can be set in flags field to signal that the triple_fault field contains
+a valid state and shall be written into the VCPU.
+
 ARM64:
 ^^^^^^
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 032278f0ee6d..d6c62276e131 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1174,6 +1174,8 @@ struct kvm_arch {
 	bool guest_can_read_msr_platform_info;
 	bool exception_payload_enabled;
 
+	bool triple_fault_event;
+
 	bool bus_lock_detection_enabled;
 	bool enable_pmu;
 	/*
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 21614807a2cb..24c807c8d5f7 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -325,6 +325,7 @@ struct kvm_reinject_control {
 #define KVM_VCPUEVENT_VALID_SHADOW	0x00000004
 #define KVM_VCPUEVENT_VALID_SMM		0x00000008
 #define KVM_VCPUEVENT_VALID_PAYLOAD	0x00000010
+#define KVM_VCPUEVENT_VALID_TRIPLE_FAULT	0x00000020
 
 /* Interrupt shadow states */
 #define KVM_X86_SHADOW_INT_MOV_SS	0x01
@@ -359,7 +360,10 @@ struct kvm_vcpu_events {
 		__u8 smm_inside_nmi;
 		__u8 latched_init;
 	} smi;
-	__u8 reserved[27];
+	struct {
+		__u8 pending;
+	} triple_fault;
+	__u8 reserved[26];
 	__u8 exception_has_payload;
 	__u64 exception_payload;
 };
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c8dfdef9e52f..422fbb0d7518 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4296,6 +4296,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_GET_MSR_FEATURES:
 	case KVM_CAP_MSR_PLATFORM_INFO:
 	case KVM_CAP_EXCEPTION_PAYLOAD:
+	case KVM_CAP_X86_TRIPLE_FAULT_EVENT:
 	case KVM_CAP_SET_GUEST_DEBUG:
 	case KVM_CAP_LAST_CPU:
 	case KVM_CAP_X86_USER_SPACE_MSR:
@@ -4942,6 +4943,10 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 			 | KVM_VCPUEVENT_VALID_SMM);
 	if (vcpu->kvm->arch.exception_payload_enabled)
 		events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
+	if (vcpu->kvm->arch.triple_fault_event) {
+		events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+		events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
+	}
 
 	memset(&events->reserved, 0, sizeof(events->reserved));
 }
@@ -4955,7 +4960,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 			      | KVM_VCPUEVENT_VALID_SIPI_VECTOR
 			      | KVM_VCPUEVENT_VALID_SHADOW
 			      | KVM_VCPUEVENT_VALID_SMM
-			      | KVM_VCPUEVENT_VALID_PAYLOAD))
+			      | KVM_VCPUEVENT_VALID_PAYLOAD
+			      | KVM_VCPUEVENT_VALID_TRIPLE_FAULT))
 		return -EINVAL;
 
 	if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
@@ -5028,6 +5034,15 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 		}
 	}
 
+	if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) {
+		if (!vcpu->kvm->arch.triple_fault_event)
+			return -EINVAL;
+		if (events->triple_fault.pending)
+			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+		else
+			kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+	}
+
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
 	return 0;
@@ -6029,6 +6044,10 @@ split_irqchip_unlock:
 		kvm->arch.exception_payload_enabled = cap->args[0];
 		r = 0;
 		break;
+	case KVM_CAP_X86_TRIPLE_FAULT_EVENT:
+		kvm->arch.triple_fault_event = cap->args[0];
+		r = 0;
+		break;
 	case KVM_CAP_X86_USER_SPACE_MSR:
 		kvm->arch.user_space_msr_mask = cap->args[0];
 		r = 0;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index c4a32910b88a..ca799319acfd 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1158,6 +1158,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_SYSTEM_EVENT_DATA 215
 #define KVM_CAP_ARM_SYSTEM_SUSPEND 216
 #define KVM_CAP_S390_PROTECTED_DUMP 217
+#define KVM_CAP_X86_TRIPLE_FAULT_EVENT 218
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit 


From 2f4073e08f4cc5a41e35d777c240aaadd0257051 Mon Sep 17 00:00:00 2001
From: Tao Xu <tao3.xu@intel.com>
Date: Tue, 24 May 2022 21:56:24 +0800
Subject: KVM: VMX: Enable Notify VM exit

There are cases that malicious virtual machines can cause CPU stuck (due
to event windows don't open up), e.g., infinite loop in microcode when
nested #AC (CVE-2015-5307). No event window means no event (NMI, SMI and
IRQ) can be delivered. It leads the CPU to be unavailable to host or
other VMs.

VMM can enable notify VM exit that a VM exit generated if no event
window occurs in VM non-root mode for a specified amount of time (notify
window).

Feature enabling:
- The new vmcs field SECONDARY_EXEC_NOTIFY_VM_EXITING is introduced to
  enable this feature. VMM can set NOTIFY_WINDOW vmcs field to adjust
  the expected notify window.
- Add a new KVM capability KVM_CAP_X86_NOTIFY_VMEXIT so that user space
  can query and enable this feature in per-VM scope. The argument is a
  64bit value: bits 63:32 are used for notify window, and bits 31:0 are
  for flags. Current supported flags:
  - KVM_X86_NOTIFY_VMEXIT_ENABLED: enable the feature with the notify
    window provided.
  - KVM_X86_NOTIFY_VMEXIT_USER: exit to userspace once the exits happen.
- It's safe to even set notify window to zero since an internal hardware
  threshold is added to vmcs.notify_window.

VM exit handling:
- Introduce a vcpu state notify_window_exits to records the count of
  notify VM exits and expose it through the debugfs.
- Notify VM exit can happen incident to delivery of a vector event.
  Allow it in KVM.
- Exit to userspace unconditionally for handling when VM_CONTEXT_INVALID
  bit is set.

Nested handling
- Nested notify VM exits are not supported yet. Keep the same notify
  window control in vmcs02 as vmcs01, so that L1 can't escape the
  restriction of notify VM exits through launching L2 VM.

Notify VM exit is defined in latest Intel Architecture Instruction Set
Extensions Programming Reference, chapter 9.2.

Co-developed-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Tao Xu <tao3.xu@intel.com>
Co-developed-by: Chenyi Qiang <chenyi.qiang@intel.com>
Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com>
Message-Id: <20220524135624.22988-5-chenyi.qiang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst     | 49 ++++++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/kvm_host.h    |  7 ++++++
 arch/x86/include/asm/vmx.h         |  7 ++++++
 arch/x86/include/asm/vmxfeatures.h |  1 +
 arch/x86/include/uapi/asm/vmx.h    |  4 +++-
 arch/x86/kvm/vmx/capabilities.h    |  6 +++++
 arch/x86/kvm/vmx/nested.c          |  8 +++++++
 arch/x86/kvm/vmx/vmx.c             | 40 +++++++++++++++++++++++++++++--
 arch/x86/kvm/x86.c                 | 22 ++++++++++++++++-
 arch/x86/kvm/x86.h                 |  7 ++++++
 include/uapi/linux/kvm.h           | 11 +++++++++
 11 files changed, 158 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index f67e367c4059..30e31a886422 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6557,6 +6557,26 @@ array field represents return values. The userspace should update the return
 values of SBI call before resuming the VCPU. For more details on RISC-V SBI
 spec refer, https://github.com/riscv/riscv-sbi-doc.
 
+::
+
+    /* KVM_EXIT_NOTIFY */
+    struct {
+  #define KVM_NOTIFY_CONTEXT_INVALID	(1 << 0)
+      __u32 flags;
+    } notify;
+
+Used on x86 systems. When the VM capability KVM_CAP_X86_NOTIFY_VMEXIT is
+enabled, a VM exit generated if no event window occurs in VM non-root mode
+for a specified amount of time. Once KVM_X86_NOTIFY_VMEXIT_USER is set when
+enabling the cap, it would exit to userspace with the exit reason
+KVM_EXIT_NOTIFY for further handling. The "flags" field contains more
+detailed info.
+
+The valid value for 'flags' is:
+
+  - KVM_NOTIFY_CONTEXT_INVALID -- the VM context is corrupted and not valid
+    in VMCS. It would run into unknown result if resume the target VM.
+
 ::
 
 		/* Fix the size of the union. */
@@ -7523,6 +7543,35 @@ if the value was set to zero or KVM_ENABLE_CAP was not invoked, KVM
 uses the return value of KVM_CHECK_EXTENSION(KVM_CAP_MAX_VCPU_ID) as
 the maximum APIC ID.
 
+7.33 KVM_CAP_X86_NOTIFY_VMEXIT
+------------------------------
+
+:Architectures: x86
+:Target: VM
+:Parameters: args[0] is the value of notify window as well as some flags
+:Returns: 0 on success, -EINVAL if args[0] contains invalid flags or notify
+          VM exit is unsupported.
+
+Bits 63:32 of args[0] are used for notify window.
+Bits 31:0 of args[0] are for some flags. Valid bits are::
+
+  #define KVM_X86_NOTIFY_VMEXIT_ENABLED    (1 << 0)
+  #define KVM_X86_NOTIFY_VMEXIT_USER       (1 << 1)
+
+This capability allows userspace to configure the notify VM exit on/off
+in per-VM scope during VM creation. Notify VM exit is disabled by default.
+When userspace sets KVM_X86_NOTIFY_VMEXIT_ENABLED bit in args[0], VMM will
+enable this feature with the notify window provided, which will generate
+a VM exit if no event window occurs in VM non-root mode for a specified of
+time (notify window).
+
+If KVM_X86_NOTIFY_VMEXIT_USER is set in args[0], upon notify VM exits happen,
+KVM would exit to userspace for handling.
+
+This capability is aimed to mitigate the threat that malicious VMs can
+cause CPU stuck (due to event windows don't open up) and make the CPU
+unavailable to host or other VMs.
+
 8. Other capabilities.
 ======================
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4e00bca08cfa..6cf5d77d7896 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -65,6 +65,9 @@
 #define KVM_BUS_LOCK_DETECTION_VALID_MODE	(KVM_BUS_LOCK_DETECTION_OFF | \
 						 KVM_BUS_LOCK_DETECTION_EXIT)
 
+#define KVM_X86_NOTIFY_VMEXIT_VALID_BITS	(KVM_X86_NOTIFY_VMEXIT_ENABLED | \
+						 KVM_X86_NOTIFY_VMEXIT_USER)
+
 /* x86-specific vcpu->requests bit members */
 #define KVM_REQ_MIGRATE_TIMER		KVM_ARCH_REQ(0)
 #define KVM_REQ_REPORT_TPR_ACCESS	KVM_ARCH_REQ(1)
@@ -1178,6 +1181,9 @@ struct kvm_arch {
 
 	bool bus_lock_detection_enabled;
 	bool enable_pmu;
+
+	u32 notify_window;
+	u32 notify_vmexit_flags;
 	/*
 	 * If exit_on_emulation_error is set, and the in-kernel instruction
 	 * emulator fails to emulate an instruction, allow userspace
@@ -1325,6 +1331,7 @@ struct kvm_vcpu_stat {
 	u64 directed_yield_attempted;
 	u64 directed_yield_successful;
 	u64 guest_mode;
+	u64 notify_window_exits;
 };
 
 struct x86_instruction_info;
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 89d2172787c5..c371ef695fcc 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -75,6 +75,7 @@
 #define SECONDARY_EXEC_TSC_SCALING              VMCS_CONTROL_BIT(TSC_SCALING)
 #define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE	VMCS_CONTROL_BIT(USR_WAIT_PAUSE)
 #define SECONDARY_EXEC_BUS_LOCK_DETECTION	VMCS_CONTROL_BIT(BUS_LOCK_DETECTION)
+#define SECONDARY_EXEC_NOTIFY_VM_EXITING	VMCS_CONTROL_BIT(NOTIFY_VM_EXITING)
 
 /*
  * Definitions of Tertiary Processor-Based VM-Execution Controls.
@@ -280,6 +281,7 @@ enum vmcs_field {
 	SECONDARY_VM_EXEC_CONTROL       = 0x0000401e,
 	PLE_GAP                         = 0x00004020,
 	PLE_WINDOW                      = 0x00004022,
+	NOTIFY_WINDOW                   = 0x00004024,
 	VM_INSTRUCTION_ERROR            = 0x00004400,
 	VM_EXIT_REASON                  = 0x00004402,
 	VM_EXIT_INTR_INFO               = 0x00004404,
@@ -564,6 +566,11 @@ enum vm_entry_failure_code {
 #define EPT_VIOLATION_GVA_IS_VALID	(1 << EPT_VIOLATION_GVA_IS_VALID_BIT)
 #define EPT_VIOLATION_GVA_TRANSLATED	(1 << EPT_VIOLATION_GVA_TRANSLATED_BIT)
 
+/*
+ * Exit Qualifications for NOTIFY VM EXIT
+ */
+#define NOTIFY_VM_CONTEXT_INVALID     BIT(0)
+
 /*
  * VM-instruction error numbers
  */
diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h
index 589608c157bf..c6a7eed03914 100644
--- a/arch/x86/include/asm/vmxfeatures.h
+++ b/arch/x86/include/asm/vmxfeatures.h
@@ -85,6 +85,7 @@
 #define VMX_FEATURE_USR_WAIT_PAUSE	( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */
 #define VMX_FEATURE_ENCLV_EXITING	( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */
 #define VMX_FEATURE_BUS_LOCK_DETECTION	( 2*32+ 30) /* "" VM-Exit when bus lock caused */
+#define VMX_FEATURE_NOTIFY_VM_EXITING	( 2*32+ 31) /* VM-Exit when no event windows after notify window */
 
 /* Tertiary Processor-Based VM-Execution Controls, word 3 */
 #define VMX_FEATURE_IPI_VIRT		( 3*32+  4) /* Enable IPI virtualization */
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 946d761adbd3..a5faf6d88f1b 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -91,6 +91,7 @@
 #define EXIT_REASON_UMWAIT              67
 #define EXIT_REASON_TPAUSE              68
 #define EXIT_REASON_BUS_LOCK            74
+#define EXIT_REASON_NOTIFY              75
 
 #define VMX_EXIT_REASONS \
 	{ EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \
@@ -153,7 +154,8 @@
 	{ EXIT_REASON_XRSTORS,               "XRSTORS" }, \
 	{ EXIT_REASON_UMWAIT,                "UMWAIT" }, \
 	{ EXIT_REASON_TPAUSE,                "TPAUSE" }, \
-	{ EXIT_REASON_BUS_LOCK,              "BUS_LOCK" }
+	{ EXIT_REASON_BUS_LOCK,              "BUS_LOCK" }, \
+	{ EXIT_REASON_NOTIFY,                "NOTIFY" }
 
 #define VMX_EXIT_REASON_FLAGS \
 	{ VMX_EXIT_REASONS_FAILED_VMENTRY,	"FAILED_VMENTRY" }
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index f14c4bef97e0..2d3f13b18714 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -436,4 +436,10 @@ static inline u64 vmx_supported_debugctl(void)
 	return debugctl;
 }
 
+static inline bool cpu_has_notify_vmexit(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_NOTIFY_VM_EXITING;
+}
+
 #endif /* __KVM_X86_VMX_CAPS_H */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 5c5f4e3762f5..7d8cd0ebcc75 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2133,6 +2133,8 @@ static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
 
 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
 {
+	struct kvm *kvm = vmx->vcpu.kvm;
+
 	/*
 	 * If vmcs02 hasn't been initialized, set the constant vmcs02 state
 	 * according to L0's settings (vmcs12 is irrelevant here).  Host
@@ -2175,6 +2177,9 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
 	if (cpu_has_vmx_encls_vmexit())
 		vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA);
 
+	if (kvm_notify_vmexit_enabled(kvm))
+		vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
+
 	/*
 	 * Set the MSR load/store lists to match L0's settings.  Only the
 	 * addresses are constant (for vmcs02), the counts can change based
@@ -6112,6 +6117,9 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
 			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
 	case EXIT_REASON_ENCLS:
 		return nested_vmx_exit_handled_encls(vcpu, vmcs12);
+	case EXIT_REASON_NOTIFY:
+		/* Notify VM exit is not exposed to L1 */
+		return false;
 	default:
 		return true;
 	}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 6d631941ac1a..2e00890d752a 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2499,7 +2499,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
 			SECONDARY_EXEC_PT_USE_GPA |
 			SECONDARY_EXEC_PT_CONCEAL_VMX |
 			SECONDARY_EXEC_ENABLE_VMFUNC |
-			SECONDARY_EXEC_BUS_LOCK_DETECTION;
+			SECONDARY_EXEC_BUS_LOCK_DETECTION |
+			SECONDARY_EXEC_NOTIFY_VM_EXITING;
 		if (cpu_has_sgx())
 			opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
 		if (adjust_vmx_controls(min2, opt2,
@@ -4417,6 +4418,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 	if (!vcpu->kvm->arch.bus_lock_detection_enabled)
 		exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
 
+	if (!kvm_notify_vmexit_enabled(vcpu->kvm))
+		exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;
+
 	return exec_control;
 }
 
@@ -4498,6 +4502,9 @@ static void init_vmcs(struct vcpu_vmx *vmx)
 		vmx->ple_window_dirty = true;
 	}
 
+	if (kvm_notify_vmexit_enabled(kvm))
+		vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
+
 	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
 	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
 	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
@@ -5784,6 +5791,32 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_notify(struct kvm_vcpu *vcpu)
+{
+	unsigned long exit_qual = vmx_get_exit_qual(vcpu);
+	bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID;
+
+	++vcpu->stat.notify_window_exits;
+
+	/*
+	 * Notify VM exit happened while executing iret from NMI,
+	 * "blocked by NMI" bit has to be set before next VM entry.
+	 */
+	if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI))
+		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+			      GUEST_INTR_STATE_NMI);
+
+	if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER ||
+	    context_invalid) {
+		vcpu->run->exit_reason = KVM_EXIT_NOTIFY;
+		vcpu->run->notify.flags = context_invalid ?
+					  KVM_NOTIFY_CONTEXT_INVALID : 0;
+		return 0;
+	}
+
+	return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -5841,6 +5874,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
 	[EXIT_REASON_ENCLS]		      = handle_encls,
 	[EXIT_REASON_BUS_LOCK]                = handle_bus_lock_vmexit,
+	[EXIT_REASON_NOTIFY]		      = handle_notify,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -6214,7 +6248,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 	     exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
 	     exit_reason.basic != EXIT_REASON_PML_FULL &&
 	     exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
-	     exit_reason.basic != EXIT_REASON_TASK_SWITCH)) {
+	     exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
+	     exit_reason.basic != EXIT_REASON_NOTIFY)) {
 		int ndata = 3;
 
 		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -8137,6 +8172,7 @@ static __init int hardware_setup(void)
 	kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
 	kvm_caps.tsc_scaling_ratio_frac_bits = 48;
 	kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
+	kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit();
 
 	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 53e5f2ad2422..a8014233fd57 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -284,7 +284,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
 	STATS_DESC_COUNTER(VCPU, nested_run),
 	STATS_DESC_COUNTER(VCPU, directed_yield_attempted),
 	STATS_DESC_COUNTER(VCPU, directed_yield_successful),
-	STATS_DESC_ICOUNTER(VCPU, guest_mode)
+	STATS_DESC_ICOUNTER(VCPU, guest_mode),
+	STATS_DESC_COUNTER(VCPU, notify_window_exits),
 };
 
 const struct kvm_stats_header kvm_vcpu_stats_header = {
@@ -4402,6 +4403,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_DISABLE_QUIRKS2:
 		r = KVM_X86_VALID_QUIRKS;
 		break;
+	case KVM_CAP_X86_NOTIFY_VMEXIT:
+		r = kvm_caps.has_notify_vmexit;
+		break;
 	default:
 		break;
 	}
@@ -6125,6 +6129,22 @@ split_irqchip_unlock:
 		}
 		mutex_unlock(&kvm->lock);
 		break;
+	case KVM_CAP_X86_NOTIFY_VMEXIT:
+		r = -EINVAL;
+		if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS)
+			break;
+		if (!kvm_caps.has_notify_vmexit)
+			break;
+		if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED))
+			break;
+		mutex_lock(&kvm->lock);
+		if (!kvm->created_vcpus) {
+			kvm->arch.notify_window = cap->args[0] >> 32;
+			kvm->arch.notify_vmexit_flags = (u32)cap->args[0];
+			r = 0;
+		}
+		mutex_unlock(&kvm->lock);
+		break;
 	default:
 		r = -EINVAL;
 		break;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 359d0454ad28..501b884b8cc4 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -21,6 +21,8 @@ struct kvm_caps {
 	u64  default_tsc_scaling_ratio;
 	/* bus lock detection supported? */
 	bool has_bus_lock_exit;
+	/* notify VM exit supported? */
+	bool has_notify_vmexit;
 
 	u64 supported_mce_cap;
 	u64 supported_xcr0;
@@ -364,6 +366,11 @@ static inline bool kvm_cstate_in_guest(struct kvm *kvm)
 	return kvm->arch.cstate_in_guest;
 }
 
+static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm)
+{
+	return kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_ENABLED;
+}
+
 enum kvm_intr_type {
 	/* Values are arbitrary, but must be non-zero. */
 	KVM_HANDLING_IRQ = 1,
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index ca799319acfd..7569b4ec199c 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -270,6 +270,7 @@ struct kvm_xen_exit {
 #define KVM_EXIT_X86_BUS_LOCK     33
 #define KVM_EXIT_XEN              34
 #define KVM_EXIT_RISCV_SBI        35
+#define KVM_EXIT_NOTIFY           36
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -496,6 +497,11 @@ struct kvm_run {
 			unsigned long args[6];
 			unsigned long ret[2];
 		} riscv_sbi;
+		/* KVM_EXIT_NOTIFY */
+		struct {
+#define KVM_NOTIFY_CONTEXT_INVALID	(1 << 0)
+			__u32 flags;
+		} notify;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
@@ -1159,6 +1165,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_ARM_SYSTEM_SUSPEND 216
 #define KVM_CAP_S390_PROTECTED_DUMP 217
 #define KVM_CAP_X86_TRIPLE_FAULT_EVENT 218
+#define KVM_CAP_X86_NOTIFY_VMEXIT 219
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -2174,4 +2181,8 @@ struct kvm_stats_desc {
 /* Available with KVM_CAP_S390_PROTECTED_DUMP */
 #define KVM_S390_PV_CPU_COMMAND	_IOWR(KVMIO, 0xd0, struct kvm_pv_cmd)
 
+/* Available with KVM_CAP_X86_NOTIFY_VMEXIT */
+#define KVM_X86_NOTIFY_VMEXIT_ENABLED		(1ULL << 0)
+#define KVM_X86_NOTIFY_VMEXIT_USER		(1ULL << 1)
+
 #endif /* __LINUX_KVM_H */
-- 
cgit 


From 3ffb20f5c7891ab5bc61cb4044465d3ad1aebf49 Mon Sep 17 00:00:00 2001
From: Akhil R <akhilrajeev@nvidia.com>
Date: Tue, 17 May 2022 13:10:47 +0530
Subject: dt-bindings: Add headers for Tegra234 GPCDMA

Add reset and IOMMU header for Tegra234 GPCDMA

Signed-off-by: Akhil R <akhilrajeev@nvidia.com>
Reviewed-by: Jon Hunter <jonathanh@nvidia.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 include/dt-bindings/memory/tegra234-mc.h   | 1 +
 include/dt-bindings/reset/tegra234-reset.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/memory/tegra234-mc.h b/include/dt-bindings/memory/tegra234-mc.h
index e3b0e9da295d..4bc84a312ad6 100644
--- a/include/dt-bindings/memory/tegra234-mc.h
+++ b/include/dt-bindings/memory/tegra234-mc.h
@@ -11,6 +11,7 @@
 /* NISO0 stream IDs */
 #define TEGRA234_SID_APE	0x02
 #define TEGRA234_SID_HDA	0x03
+#define TEGRA234_SID_GPCDMA	0x04
 #define TEGRA234_SID_PCIE0	0x12
 #define TEGRA234_SID_PCIE4	0x13
 #define TEGRA234_SID_PCIE5	0x14
diff --git a/include/dt-bindings/reset/tegra234-reset.h b/include/dt-bindings/reset/tegra234-reset.h
index 547ca3b60caa..5809245e4e21 100644
--- a/include/dt-bindings/reset/tegra234-reset.h
+++ b/include/dt-bindings/reset/tegra234-reset.h
@@ -15,6 +15,7 @@
 #define TEGRA234_RESET_PEX1_COMMON_APB		13U
 #define TEGRA234_RESET_PEX2_CORE_7		14U
 #define TEGRA234_RESET_PEX2_CORE_7_APB		15U
+#define TEGRA234_RESET_GPCDMA			18U
 #define TEGRA234_RESET_HDA			20U
 #define TEGRA234_RESET_HDACODEC			21U
 #define TEGRA234_RESET_I2C1			24U
-- 
cgit 


From 5dfac65b621733e69b789150a0a3f1bf2f9095a3 Mon Sep 17 00:00:00 2001
From: David Jander <david@protonic.nl>
Date: Wed, 8 Jun 2022 17:33:09 +0200
Subject: spi: <linux/spi/spi.h>: Add missing documentation for struct members

Fixes these "make htmldocs" warnings:

include/linux/spi/spi.h:82: warning: Function parameter or member 'syncp' not described in 'spi_statistics'
include/linux/spi/spi.h:213: warning: Function parameter or member 'pcpu_statistics' not described in 'spi_device'
include/linux/spi/spi.h:676: warning: Function parameter or member 'pcpu_statistics' not described in 'spi_controller'

Fixes: 6598b91b5ac3 ("spi: spi.c: Convert statistics to per-cpu u64_stats_t")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: David Jander <david@protonic.nl>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Link: https://lore.kernel.org/r/20220608153309.2899565-1-david@protonic.nl
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index eac8d3caf954..2e63b4935deb 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -35,7 +35,8 @@ extern struct bus_type spi_bus_type;
 
 /**
  * struct spi_statistics - statistics for spi transfers
- * @lock:          lock protecting this structure
+ * @syncp:         seqcount to protect members in this struct for per-cpu udate
+ *                 on 32-bit systems
  *
  * @messages:      number of spi-messages handled
  * @transfers:     number of spi_transfers handled
@@ -155,7 +156,7 @@ extern int spi_delay_exec(struct spi_delay *_delay, struct spi_transfer *xfer);
  * @cs_inactive: delay to be introduced by the controller after CS is
  *	deasserted. If @cs_change_delay is used from @spi_transfer, then the
  *	two delays will be added up.
- * @statistics: statistics for the spi_device
+ * @pcpu_statistics: statistics for the spi_device
  *
  * A @spi_device is used to interchange data between an SPI slave
  * (usually a discrete chip) and CPU memory.
@@ -439,7 +440,7 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch
  * @max_native_cs: When cs_gpiods is used, and this field is filled in,
  *	spi_register_controller() will validate all native CS (including the
  *	unused native CS) against this value.
- * @statistics: statistics for the spi_controller
+ * @pcpu_statistics: statistics for the spi_controller
  * @dma_tx: DMA transmit channel
  * @dma_rx: DMA receive channel
  * @dummy_rx: dummy receive buffer for full-duplex devices
-- 
cgit 


From a84a434baf9427a1c49782fb1f0973d1308016df Mon Sep 17 00:00:00 2001
From: Peter Lafreniere <pjlafren@mtu.edu>
Date: Mon, 6 Jun 2022 07:34:58 -0400
Subject: net: constify some inline functions in sock.h

Despite these inline functions having full visibility to the compiler
at compile time, they still strip const from passed pointers.
This change allows for functions in various network drivers to be marked as
const that could not be marked const before.

Signed-off-by: Peter Lafreniere <pjlafren@mtu.edu>
Link: https://lore.kernel.org/r/20220606113458.35953-1-pjlafren@mtu.edu
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index c585ef6565d9..657873e2d90f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -611,7 +611,7 @@ void sock_net_set(struct sock *sk, struct net *net)
 
 int sk_set_peek_off(struct sock *sk, int val);
 
-static inline int sk_peek_offset(struct sock *sk, int flags)
+static inline int sk_peek_offset(const struct sock *sk, int flags)
 {
 	if (unlikely(flags & MSG_PEEK)) {
 		return READ_ONCE(sk->sk_peek_off);
@@ -863,7 +863,7 @@ static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list)
 		({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;});       \
 	     pos = rcu_dereference(hlist_next_rcu(pos)))
 
-static inline struct user_namespace *sk_user_ns(struct sock *sk)
+static inline struct user_namespace *sk_user_ns(const struct sock *sk)
 {
 	/* Careful only use this in a context where these parameters
 	 * can not change and must all be valid, such as recvmsg from
@@ -909,7 +909,7 @@ enum sock_flags {
 
 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
 
-static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
+static inline void sock_copy_flags(struct sock *nsk, const struct sock *osk)
 {
 	nsk->sk_flags = osk->sk_flags;
 }
-- 
cgit 


From a1eda864c04cf24ea1130334963c6199318f6f95 Mon Sep 17 00:00:00 2001
From: Michał Kępień <kernel@kempniu.pl>
Date: Mon, 16 May 2022 09:06:00 +0200
Subject: mtdchar: prevent integer overflow in a safety check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 6420ac0af95d ("mtdchar: prevent unbounded allocation in MEMWRITE
ioctl") added a safety check to mtdchar_write_ioctl() which attempts to
ensure that the write request sent by user space does not extend beyond
the MTD device's size.  However, that check contains an addition of two
struct mtd_write_req fields, 'start' and 'len', both of which are u64
variables.  The result of that addition can overflow, allowing the
safety check to be bypassed.

The arguably simplest fix - changing the data types of the relevant
struct mtd_write_req fields - is not feasible as it would break user
space.

Fix by making mtdchar_write_ioctl() truncate the value provided by user
space in the 'len' field of struct mtd_write_req, so that only the lower
32 bits of that field are used, preventing the overflow.

While the 'ooblen' field of struct mtd_write_req is not currently used
in any similarly flawed safety check, also truncate it to 32 bits, for
consistency with the 'len' field and with other MTD routines handling
OOB data.

Update include/uapi/mtd/mtd-abi.h accordingly.

Suggested-by: Richard Weinberger <richard@nod.at>
Signed-off-by: Michał Kępień <kernel@kempniu.pl>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220516070601.11428-2-kernel@kempniu.pl
---
 drivers/mtd/mtdchar.c      | 3 +++
 include/uapi/mtd/mtd-abi.h | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index d0f9c4b0285c..b2700f8467ff 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -615,6 +615,9 @@ static int mtdchar_write_ioctl(struct mtd_info *mtd,
 	if (!usr_oob)
 		req.ooblen = 0;
 
+	req.len &= 0xffffffff;
+	req.ooblen &= 0xffffffff;
+
 	if (req.start + req.len > mtd->size)
 		return -EINVAL;
 
diff --git a/include/uapi/mtd/mtd-abi.h b/include/uapi/mtd/mtd-abi.h
index b869990c2db2..890d9e5b76d7 100644
--- a/include/uapi/mtd/mtd-abi.h
+++ b/include/uapi/mtd/mtd-abi.h
@@ -69,8 +69,8 @@ enum {
  * struct mtd_write_req - data structure for requesting a write operation
  *
  * @start:	start address
- * @len:	length of data buffer
- * @ooblen:	length of OOB buffer
+ * @len:	length of data buffer (only lower 32 bits are used)
+ * @ooblen:	length of OOB buffer (only lower 32 bits are used)
  * @usr_data:	user-provided data buffer
  * @usr_oob:	user-provided OOB buffer
  * @mode:	MTD mode (see "MTD operation modes")
-- 
cgit 


From 0c90466a7985d39355f743e9cd2139da3e86c4d8 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Fri, 3 Jun 2022 23:07:45 +0200
Subject: mtd: hyperbus: Make hyperbus_unregister_device() return void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The only thing that could theoretically fail in that function is
mtd_device_unregister(). However it's not supposed to fail and when
used correctly it doesn't. So wail loudly if it does anyhow.

This matches how other drivers (e.g. nand/raw/nandsim.c) use
mtd_device_unregister().

This is a preparation for making platform remove callbacks return void.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220603210758.148493-2-u.kleine-koenig@pengutronix.de
---
 drivers/mtd/hyperbus/hbmc-am654.c    | 6 +++---
 drivers/mtd/hyperbus/hyperbus-core.c | 8 ++------
 drivers/mtd/hyperbus/rpc-if.c        | 5 +++--
 include/linux/mtd/hyperbus.h         | 4 +---
 4 files changed, 9 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/mtd/hyperbus/hbmc-am654.c b/drivers/mtd/hyperbus/hbmc-am654.c
index a3439b791eeb..a6161ce340d4 100644
--- a/drivers/mtd/hyperbus/hbmc-am654.c
+++ b/drivers/mtd/hyperbus/hbmc-am654.c
@@ -233,16 +233,16 @@ static int am654_hbmc_remove(struct platform_device *pdev)
 {
 	struct am654_hbmc_priv *priv = platform_get_drvdata(pdev);
 	struct am654_hbmc_device_priv *dev_priv = priv->hbdev.priv;
-	int ret;
 
-	ret = hyperbus_unregister_device(&priv->hbdev);
+	hyperbus_unregister_device(&priv->hbdev);
+
 	if (priv->mux_ctrl)
 		mux_control_deselect(priv->mux_ctrl);
 
 	if (dev_priv->rx_chan)
 		dma_release_channel(dev_priv->rx_chan);
 
-	return ret;
+	return 0;
 }
 
 static const struct of_device_id am654_hbmc_dt_ids[] = {
diff --git a/drivers/mtd/hyperbus/hyperbus-core.c b/drivers/mtd/hyperbus/hyperbus-core.c
index 2f9fc4e17d53..4d8047d43e48 100644
--- a/drivers/mtd/hyperbus/hyperbus-core.c
+++ b/drivers/mtd/hyperbus/hyperbus-core.c
@@ -126,16 +126,12 @@ int hyperbus_register_device(struct hyperbus_device *hbdev)
 }
 EXPORT_SYMBOL_GPL(hyperbus_register_device);
 
-int hyperbus_unregister_device(struct hyperbus_device *hbdev)
+void hyperbus_unregister_device(struct hyperbus_device *hbdev)
 {
-	int ret = 0;
-
 	if (hbdev && hbdev->mtd) {
-		ret = mtd_device_unregister(hbdev->mtd);
+		WARN_ON(mtd_device_unregister(hbdev->mtd));
 		map_destroy(hbdev->mtd);
 	}
-
-	return ret;
 }
 EXPORT_SYMBOL_GPL(hyperbus_unregister_device);
 
diff --git a/drivers/mtd/hyperbus/rpc-if.c b/drivers/mtd/hyperbus/rpc-if.c
index 6e08ec1d4f09..15a0be63ede1 100644
--- a/drivers/mtd/hyperbus/rpc-if.c
+++ b/drivers/mtd/hyperbus/rpc-if.c
@@ -153,11 +153,12 @@ static int rpcif_hb_probe(struct platform_device *pdev)
 static int rpcif_hb_remove(struct platform_device *pdev)
 {
 	struct rpcif_hyperbus *hyperbus = platform_get_drvdata(pdev);
-	int error = hyperbus_unregister_device(&hyperbus->hbdev);
+
+	hyperbus_unregister_device(&hyperbus->hbdev);
 
 	rpcif_disable_rpm(&hyperbus->rpc);
 
-	return error;
+	return 0;
 }
 
 static struct platform_driver rpcif_platform_driver = {
diff --git a/include/linux/mtd/hyperbus.h b/include/linux/mtd/hyperbus.h
index 0ce612428aea..bb6b7121a542 100644
--- a/include/linux/mtd/hyperbus.h
+++ b/include/linux/mtd/hyperbus.h
@@ -89,9 +89,7 @@ int hyperbus_register_device(struct hyperbus_device *hbdev);
 /**
  * hyperbus_unregister_device - deregister HyperBus slave memory device
  * @hbdev: hyperbus_device to be unregistered
- *
- * Return: 0 for success, others for failure.
  */
-int hyperbus_unregister_device(struct hyperbus_device *hbdev);
+void hyperbus_unregister_device(struct hyperbus_device *hbdev);
 
 #endif /* __LINUX_MTD_HYPERBUS_H__ */
-- 
cgit 


From ea7f0f777d28db6e500a05836f2a9d467c7012de Mon Sep 17 00:00:00 2001
From: Tzung-Bi Shih <tzungbi@kernel.org>
Date: Thu, 9 Jun 2022 08:49:37 +0000
Subject: platform/chrome: cros_ec_commands: fix compile errors

Fix compile errors when including cros_ec_commands.h solely.

1.
cros_ec_commands.h:587:9: error: unknown type name 'uint8_t'
  587 |         uint8_t flags;
      |         ^~~~~~~

2.
cros_ec_commands.h:1105:43: error: implicit declaration of function 'BIT'
 1105 |         EC_COMMS_STATUS_PROCESSING      = BIT(0),
      |                                           ^~~

Reviewed-by: Guenter Roeck <groeck@chromium.org>
Signed-off-by: Tzung-Bi Shih <tzungbi@kernel.org>
Link: https://lore.kernel.org/r/20220609084957.3684698-2-tzungbi@kernel.org
---
 include/linux/platform_data/cros_ec_commands.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index e59f51c41a1c..f13568b3e247 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -13,8 +13,8 @@
 #ifndef __CROS_EC_COMMANDS_H
 #define __CROS_EC_COMMANDS_H
 
-
-
+#include <linux/bits.h>
+#include <linux/types.h>
 
 #define BUILD_ASSERT(_cond)
 
-- 
cgit 


From 3db0c9e5de7bd9dbe52580eb9752b2b3049e38da Mon Sep 17 00:00:00 2001
From: Tzung-Bi Shih <tzungbi@kernel.org>
Date: Thu, 9 Jun 2022 08:49:39 +0000
Subject: platform/chrome: use macros for passthru indexes

Move passthru indexes for EC and PD devices to common header.  Also use
them instead of literal constants.

Reviewed-by: Guenter Roeck <groeck@chromium.org>
Signed-off-by: Tzung-Bi Shih <tzungbi@kernel.org>
Link: https://lore.kernel.org/r/20220609084957.3684698-4-tzungbi@kernel.org
---
 drivers/platform/chrome/cros_ec.c            |  3 ---
 drivers/platform/chrome/cros_ec_proto.c      |  6 +++---
 drivers/platform/chrome/cros_ec_proto_test.c | 15 ++++++++++-----
 drivers/platform/chrome/cros_ec_trace.h      |  8 ++++----
 include/linux/platform_data/cros_ec_proto.h  |  3 +++
 5 files changed, 20 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/platform/chrome/cros_ec.c b/drivers/platform/chrome/cros_ec.c
index b3e94cdf7d1a..e51a3f2176c7 100644
--- a/drivers/platform/chrome/cros_ec.c
+++ b/drivers/platform/chrome/cros_ec.c
@@ -19,9 +19,6 @@
 
 #include "cros_ec.h"
 
-#define CROS_EC_DEV_EC_INDEX 0
-#define CROS_EC_DEV_PD_INDEX 1
-
 static struct cros_ec_platform ec_p = {
 	.ec_name = CROS_EC_DEV_NAME,
 	.cmd_offset = EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_EC_INDEX),
diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c
index cefabfe45551..cfa3dacce4e5 100644
--- a/drivers/platform/chrome/cros_ec_proto.c
+++ b/drivers/platform/chrome/cros_ec_proto.c
@@ -433,7 +433,7 @@ int cros_ec_query_all(struct cros_ec_device *ec_dev)
 
 	/* First try sending with proto v3. */
 	ec_dev->proto_version = 3;
-	ret = cros_ec_host_command_proto_query(ec_dev, 0, proto_msg);
+	ret = cros_ec_host_command_proto_query(ec_dev, CROS_EC_DEV_EC_INDEX, proto_msg);
 
 	if (ret == 0) {
 		proto_info = (struct ec_response_get_protocol_info *)
@@ -459,7 +459,7 @@ int cros_ec_query_all(struct cros_ec_device *ec_dev)
 		/*
 		 * Check for PD
 		 */
-		ret = cros_ec_host_command_proto_query(ec_dev, 1, proto_msg);
+		ret = cros_ec_host_command_proto_query(ec_dev, CROS_EC_DEV_PD_INDEX, proto_msg);
 
 		if (ret) {
 			dev_dbg(ec_dev->dev, "no PD chip found: %d\n", ret);
@@ -609,7 +609,7 @@ int cros_ec_cmd_xfer(struct cros_ec_device *ec_dev, struct cros_ec_command *msg)
 		msg->insize = ec_dev->max_response;
 	}
 
-	if (msg->command < EC_CMD_PASSTHRU_OFFSET(1)) {
+	if (msg->command < EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX)) {
 		if (msg->outsize > ec_dev->max_request) {
 			dev_err(ec_dev->dev,
 				"request of size %u is too big (max: %u)\n",
diff --git a/drivers/platform/chrome/cros_ec_proto_test.c b/drivers/platform/chrome/cros_ec_proto_test.c
index 675306c16d47..f8dbfb0d8dc8 100644
--- a/drivers/platform/chrome/cros_ec_proto_test.c
+++ b/drivers/platform/chrome/cros_ec_proto_test.c
@@ -281,7 +281,8 @@ static void cros_ec_proto_test_query_all_normal(struct kunit *test)
 
 		KUNIT_EXPECT_EQ(test, mock->msg.version, 0);
 		KUNIT_EXPECT_EQ(test, mock->msg.command,
-				EC_CMD_PASSTHRU_OFFSET(1) | EC_CMD_GET_PROTOCOL_INFO);
+				EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX) |
+				EC_CMD_GET_PROTOCOL_INFO);
 		KUNIT_EXPECT_EQ(test, mock->msg.insize,
 				sizeof(struct ec_response_get_protocol_info));
 		KUNIT_EXPECT_EQ(test, mock->msg.outsize, 0);
@@ -396,7 +397,8 @@ static void cros_ec_proto_test_query_all_no_pd_return_error(struct kunit *test)
 
 		KUNIT_EXPECT_EQ(test, mock->msg.version, 0);
 		KUNIT_EXPECT_EQ(test, mock->msg.command,
-				EC_CMD_PASSTHRU_OFFSET(1) | EC_CMD_GET_PROTOCOL_INFO);
+				EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX) |
+				EC_CMD_GET_PROTOCOL_INFO);
 		KUNIT_EXPECT_EQ(test, mock->msg.insize,
 				sizeof(struct ec_response_get_protocol_info));
 		KUNIT_EXPECT_EQ(test, mock->msg.outsize, 0);
@@ -685,7 +687,8 @@ static void cros_ec_proto_test_query_all_no_mkbp(struct kunit *test)
 
 		KUNIT_EXPECT_EQ(test, mock->msg.version, 0);
 		KUNIT_EXPECT_EQ(test, mock->msg.command,
-				EC_CMD_PASSTHRU_OFFSET(1) | EC_CMD_GET_PROTOCOL_INFO);
+				EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX) |
+				EC_CMD_GET_PROTOCOL_INFO);
 		KUNIT_EXPECT_EQ(test, mock->msg.insize,
 				sizeof(struct ec_response_get_protocol_info));
 		KUNIT_EXPECT_EQ(test, mock->msg.outsize, 0);
@@ -783,7 +786,8 @@ static void cros_ec_proto_test_query_all_no_host_sleep(struct kunit *test)
 
 		KUNIT_EXPECT_EQ(test, mock->msg.version, 0);
 		KUNIT_EXPECT_EQ(test, mock->msg.command,
-				EC_CMD_PASSTHRU_OFFSET(1) | EC_CMD_GET_PROTOCOL_INFO);
+				EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX) |
+				EC_CMD_GET_PROTOCOL_INFO);
 		KUNIT_EXPECT_EQ(test, mock->msg.insize,
 				sizeof(struct ec_response_get_protocol_info));
 		KUNIT_EXPECT_EQ(test, mock->msg.outsize, 0);
@@ -889,7 +893,8 @@ static void cros_ec_proto_test_query_all_default_wake_mask_return_error(struct k
 
 		KUNIT_EXPECT_EQ(test, mock->msg.version, 0);
 		KUNIT_EXPECT_EQ(test, mock->msg.command,
-				EC_CMD_PASSTHRU_OFFSET(1) | EC_CMD_GET_PROTOCOL_INFO);
+				EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX) |
+				EC_CMD_GET_PROTOCOL_INFO);
 		KUNIT_EXPECT_EQ(test, mock->msg.insize,
 				sizeof(struct ec_response_get_protocol_info));
 		KUNIT_EXPECT_EQ(test, mock->msg.outsize, 0);
diff --git a/drivers/platform/chrome/cros_ec_trace.h b/drivers/platform/chrome/cros_ec_trace.h
index 9bb5cd2c98b8..d7e407de88df 100644
--- a/drivers/platform/chrome/cros_ec_trace.h
+++ b/drivers/platform/chrome/cros_ec_trace.h
@@ -30,8 +30,8 @@ TRACE_EVENT(cros_ec_request_start,
 	),
 	TP_fast_assign(
 		__entry->version = cmd->version;
-		__entry->offset = cmd->command / EC_CMD_PASSTHRU_OFFSET(1);
-		__entry->command = cmd->command % EC_CMD_PASSTHRU_OFFSET(1);
+		__entry->offset = cmd->command / EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX);
+		__entry->command = cmd->command % EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX);
 		__entry->outsize = cmd->outsize;
 		__entry->insize = cmd->insize;
 	),
@@ -55,8 +55,8 @@ TRACE_EVENT(cros_ec_request_done,
 	),
 	TP_fast_assign(
 		__entry->version = cmd->version;
-		__entry->offset = cmd->command / EC_CMD_PASSTHRU_OFFSET(1);
-		__entry->command = cmd->command % EC_CMD_PASSTHRU_OFFSET(1);
+		__entry->offset = cmd->command / EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX);
+		__entry->command = cmd->command % EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX);
 		__entry->outsize = cmd->outsize;
 		__entry->insize = cmd->insize;
 		__entry->result = cmd->result;
diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h
index 85e29300f63d..c82a8285d936 100644
--- a/include/linux/platform_data/cros_ec_proto.h
+++ b/include/linux/platform_data/cros_ec_proto.h
@@ -21,6 +21,9 @@
 #define CROS_EC_DEV_SCP_NAME	"cros_scp"
 #define CROS_EC_DEV_TP_NAME	"cros_tp"
 
+#define CROS_EC_DEV_EC_INDEX 0
+#define CROS_EC_DEV_PD_INDEX 1
+
 /*
  * The EC is unresponsive for a time after a reboot command.  Add a
  * simple delay to make sure that the bus stays locked.
-- 
cgit 


From d62607c3fe45911b2331fac073355a8c914bbde2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 7 Jun 2022 21:39:55 -0700
Subject: net: rename reference+tracking helpers

Netdev reference helpers have a dev_ prefix for historic
reasons. Renaming the old helpers would be too much churn
but we can rename the tracking ones which are relatively
recent and should be the default for new code.

Rename:
 dev_hold_track()    -> netdev_hold()
 dev_put_track()     -> netdev_put()
 dev_replace_track() -> netdev_ref_replace()

Link: https://lore.kernel.org/r/20220608043955.919359-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/eql.c              |  4 ++--
 drivers/net/macsec.c           |  4 ++--
 drivers/net/macvlan.c          |  4 ++--
 drivers/net/netconsole.c       |  2 +-
 drivers/net/vrf.c              |  8 ++++----
 include/linux/netdevice.h      | 24 ++++++++++++------------
 include/net/xfrm.h             |  2 +-
 net/8021q/vlan_dev.c           |  4 ++--
 net/ax25/af_ax25.c             |  7 ++++---
 net/ax25/ax25_dev.c            |  6 +++---
 net/bridge/br_if.c             | 10 +++++-----
 net/core/dev.c                 | 10 +++++-----
 net/core/dev_ioctl.c           |  4 ++--
 net/core/drop_monitor.c        |  5 +++--
 net/core/dst.c                 |  8 ++++----
 net/core/failover.c            |  4 ++--
 net/core/link_watch.c          |  2 +-
 net/core/neighbour.c           | 18 +++++++++---------
 net/core/net-sysfs.c           |  8 ++++----
 net/core/netpoll.c             |  2 +-
 net/core/pktgen.c              |  6 +++---
 net/ethtool/ioctl.c            |  4 ++--
 net/ethtool/netlink.c          |  6 +++---
 net/ethtool/netlink.h          |  2 +-
 net/ipv4/devinet.c             |  4 ++--
 net/ipv4/fib_semantics.c       | 11 ++++++-----
 net/ipv4/ipmr.c                |  2 +-
 net/ipv4/route.c               |  7 +++----
 net/ipv4/xfrm4_policy.c        |  2 +-
 net/ipv6/addrconf.c            |  4 ++--
 net/ipv6/addrconf_core.c       |  2 +-
 net/ipv6/ip6_gre.c             |  8 ++++----
 net/ipv6/ip6_tunnel.c          |  4 ++--
 net/ipv6/ip6_vti.c             |  4 ++--
 net/ipv6/ip6mr.c               |  2 +-
 net/ipv6/route.c               | 10 +++++-----
 net/ipv6/sit.c                 |  4 ++--
 net/ipv6/xfrm6_policy.c        |  4 ++--
 net/llc/af_llc.c               |  2 +-
 net/openvswitch/vport-netdev.c |  6 +++---
 net/packet/af_packet.c         | 12 ++++++------
 net/sched/act_mirred.c         |  6 +++---
 net/sched/sch_api.c            |  2 +-
 net/sched/sch_generic.c        | 11 ++++++-----
 net/smc/smc_pnet.c             |  7 ++++---
 net/switchdev/switchdev.c      |  4 ++--
 net/tipc/bearer.c              |  4 ++--
 net/xfrm/xfrm_device.c         |  2 +-
 48 files changed, 141 insertions(+), 137 deletions(-)

(limited to 'include')

diff --git a/drivers/net/eql.c b/drivers/net/eql.c
index 557ca8ff9dec..ca3e4700a813 100644
--- a/drivers/net/eql.c
+++ b/drivers/net/eql.c
@@ -225,7 +225,7 @@ static void eql_kill_one_slave(slave_queue_t *queue, slave_t *slave)
 	list_del(&slave->list);
 	queue->num_slaves--;
 	slave->dev->flags &= ~IFF_SLAVE;
-	dev_put_track(slave->dev, &slave->dev_tracker);
+	netdev_put(slave->dev, &slave->dev_tracker);
 	kfree(slave);
 }
 
@@ -399,7 +399,7 @@ static int __eql_insert_slave(slave_queue_t *queue, slave_t *slave)
 		if (duplicate_slave)
 			eql_kill_one_slave(queue, duplicate_slave);
 
-		dev_hold_track(slave->dev, &slave->dev_tracker, GFP_ATOMIC);
+		netdev_hold(slave->dev, &slave->dev_tracker, GFP_ATOMIC);
 		list_add(&slave->list, &queue->all_slaves);
 		queue->num_slaves++;
 		slave->dev->flags |= IFF_SLAVE;
diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index 817577e713d7..815738c0e067 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -3462,7 +3462,7 @@ static int macsec_dev_init(struct net_device *dev)
 		memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len);
 
 	/* Get macsec's reference to real_dev */
-	dev_hold_track(real_dev, &macsec->dev_tracker, GFP_KERNEL);
+	netdev_hold(real_dev, &macsec->dev_tracker, GFP_KERNEL);
 
 	return 0;
 }
@@ -3710,7 +3710,7 @@ static void macsec_free_netdev(struct net_device *dev)
 	free_percpu(macsec->secy.tx_sc.stats);
 
 	/* Get rid of the macsec's reference to real_dev */
-	dev_put_track(macsec->real_dev, &macsec->dev_tracker);
+	netdev_put(macsec->real_dev, &macsec->dev_tracker);
 }
 
 static void macsec_setup(struct net_device *dev)
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index eff75beb1395..5b46a6526fc6 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -915,7 +915,7 @@ static int macvlan_init(struct net_device *dev)
 	port->count += 1;
 
 	/* Get macvlan's reference to lowerdev */
-	dev_hold_track(lowerdev, &vlan->dev_tracker, GFP_KERNEL);
+	netdev_hold(lowerdev, &vlan->dev_tracker, GFP_KERNEL);
 
 	return 0;
 }
@@ -1185,7 +1185,7 @@ static void macvlan_dev_free(struct net_device *dev)
 	struct macvlan_dev *vlan = netdev_priv(dev);
 
 	/* Get rid of the macvlan's reference to lowerdev */
-	dev_put_track(vlan->lowerdev, &vlan->dev_tracker);
+	netdev_put(vlan->lowerdev, &vlan->dev_tracker);
 }
 
 void macvlan_common_setup(struct net_device *dev)
diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c
index ab8cd5551020..ddac61d79145 100644
--- a/drivers/net/netconsole.c
+++ b/drivers/net/netconsole.c
@@ -721,7 +721,7 @@ restart:
 				__netpoll_cleanup(&nt->np);
 
 				spin_lock_irqsave(&target_list_lock, flags);
-				dev_put_track(nt->np.dev, &nt->np.dev_tracker);
+				netdev_put(nt->np.dev, &nt->np.dev_tracker);
 				nt->np.dev = NULL;
 				nt->enabled = false;
 				stopped = true;
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index cfc30ce4c6e1..40445a12c682 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -814,8 +814,8 @@ static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf)
 	 */
 	if (rt6) {
 		dst = &rt6->dst;
-		dev_replace_track(dst->dev, net->loopback_dev,
-				  &dst->dev_tracker, GFP_KERNEL);
+		netdev_ref_replace(dst->dev, net->loopback_dev,
+				   &dst->dev_tracker, GFP_KERNEL);
 		dst->dev = net->loopback_dev;
 		dst_release(dst);
 	}
@@ -1061,8 +1061,8 @@ static void vrf_rtable_release(struct net_device *dev, struct net_vrf *vrf)
 	 */
 	if (rth) {
 		dst = &rth->dst;
-		dev_replace_track(dst->dev, net->loopback_dev,
-				  &dst->dev_tracker, GFP_KERNEL);
+		netdev_ref_replace(dst->dev, net->loopback_dev,
+				   &dst->dev_tracker, GFP_KERNEL);
 		dst->dev = net->loopback_dev;
 		dst_release(dst);
 	}
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f615a66c89e9..e2e5088888b1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3981,8 +3981,8 @@ static inline void netdev_tracker_free(struct net_device *dev,
 #endif
 }
 
-static inline void dev_hold_track(struct net_device *dev,
-				  netdevice_tracker *tracker, gfp_t gfp)
+static inline void netdev_hold(struct net_device *dev,
+			       netdevice_tracker *tracker, gfp_t gfp)
 {
 	if (dev) {
 		__dev_hold(dev);
@@ -3990,8 +3990,8 @@ static inline void dev_hold_track(struct net_device *dev,
 	}
 }
 
-static inline void dev_put_track(struct net_device *dev,
-				 netdevice_tracker *tracker)
+static inline void netdev_put(struct net_device *dev,
+			      netdevice_tracker *tracker)
 {
 	if (dev) {
 		netdev_tracker_free(dev, tracker);
@@ -4004,11 +4004,11 @@ static inline void dev_put_track(struct net_device *dev,
  *	@dev: network device
  *
  * Hold reference to device to keep it from being freed.
- * Try using dev_hold_track() instead.
+ * Try using netdev_hold() instead.
  */
 static inline void dev_hold(struct net_device *dev)
 {
-	dev_hold_track(dev, NULL, GFP_ATOMIC);
+	netdev_hold(dev, NULL, GFP_ATOMIC);
 }
 
 /**
@@ -4016,17 +4016,17 @@ static inline void dev_hold(struct net_device *dev)
  *	@dev: network device
  *
  * Release reference to device to allow it to be freed.
- * Try using dev_put_track() instead.
+ * Try using netdev_put() instead.
  */
 static inline void dev_put(struct net_device *dev)
 {
-	dev_put_track(dev, NULL);
+	netdev_put(dev, NULL);
 }
 
-static inline void dev_replace_track(struct net_device *odev,
-				     struct net_device *ndev,
-				     netdevice_tracker *tracker,
-				     gfp_t gfp)
+static inline void netdev_ref_replace(struct net_device *odev,
+				      struct net_device *ndev,
+				      netdevice_tracker *tracker,
+				      gfp_t gfp)
 {
 	if (odev)
 		netdev_tracker_free(odev, tracker);
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index c39d910d4b45..9287712ad977 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1923,7 +1923,7 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x)
 		if (dev->xfrmdev_ops->xdo_dev_state_free)
 			dev->xfrmdev_ops->xdo_dev_state_free(x);
 		xso->dev = NULL;
-		dev_put_track(dev, &xso->dev_tracker);
+		netdev_put(dev, &xso->dev_tracker);
 	}
 }
 #else
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 839f2020b015..e3dff2df6f54 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -615,7 +615,7 @@ static int vlan_dev_init(struct net_device *dev)
 		return -ENOMEM;
 
 	/* Get vlan's reference to real_dev */
-	dev_hold_track(real_dev, &vlan->dev_tracker, GFP_KERNEL);
+	netdev_hold(real_dev, &vlan->dev_tracker, GFP_KERNEL);
 
 	return 0;
 }
@@ -852,7 +852,7 @@ static void vlan_dev_free(struct net_device *dev)
 	vlan->vlan_pcpu_stats = NULL;
 
 	/* Get rid of the vlan's reference to real_dev */
-	dev_put_track(vlan->real_dev, &vlan->dev_tracker);
+	netdev_put(vlan->real_dev, &vlan->dev_tracker);
 }
 
 void vlan_setup(struct net_device *dev)
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 95393bb2760b..1a5c0b071aa3 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -102,7 +102,8 @@ again:
 			ax25_disconnect(s, ENETUNREACH);
 			s->ax25_dev = NULL;
 			if (sk->sk_socket) {
-				dev_put_track(ax25_dev->dev, &ax25_dev->dev_tracker);
+				netdev_put(ax25_dev->dev,
+					   &ax25_dev->dev_tracker);
 				ax25_dev_put(ax25_dev);
 			}
 			ax25_cb_del(s);
@@ -1065,7 +1066,7 @@ static int ax25_release(struct socket *sock)
 			del_timer_sync(&ax25->t3timer);
 			del_timer_sync(&ax25->idletimer);
 		}
-		dev_put_track(ax25_dev->dev, &ax25_dev->dev_tracker);
+		netdev_put(ax25_dev->dev, &ax25_dev->dev_tracker);
 		ax25_dev_put(ax25_dev);
 	}
 
@@ -1146,7 +1147,7 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 
 	if (ax25_dev) {
 		ax25_fillin_cb(ax25, ax25_dev);
-		dev_hold_track(ax25_dev->dev, &ax25_dev->dev_tracker, GFP_ATOMIC);
+		netdev_hold(ax25_dev->dev, &ax25_dev->dev_tracker, GFP_ATOMIC);
 	}
 
 done:
diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c
index 95a76d571c44..ab88b6ac5401 100644
--- a/net/ax25/ax25_dev.c
+++ b/net/ax25/ax25_dev.c
@@ -60,7 +60,7 @@ void ax25_dev_device_up(struct net_device *dev)
 	refcount_set(&ax25_dev->refcount, 1);
 	dev->ax25_ptr     = ax25_dev;
 	ax25_dev->dev     = dev;
-	dev_hold_track(dev, &ax25_dev->dev_tracker, GFP_ATOMIC);
+	netdev_hold(dev, &ax25_dev->dev_tracker, GFP_ATOMIC);
 	ax25_dev->forward = NULL;
 	ax25_dev->device_up = true;
 
@@ -136,7 +136,7 @@ unlock_put:
 	spin_unlock_bh(&ax25_dev_lock);
 	ax25_dev_put(ax25_dev);
 	dev->ax25_ptr = NULL;
-	dev_put_track(dev, &ax25_dev->dev_tracker);
+	netdev_put(dev, &ax25_dev->dev_tracker);
 	ax25_dev_put(ax25_dev);
 }
 
@@ -205,7 +205,7 @@ void __exit ax25_dev_free(void)
 	ax25_dev = ax25_dev_list;
 	while (ax25_dev != NULL) {
 		s        = ax25_dev;
-		dev_put_track(ax25_dev->dev, &ax25_dev->dev_tracker);
+		netdev_put(ax25_dev->dev, &ax25_dev->dev_tracker);
 		ax25_dev = ax25_dev->next;
 		kfree(s);
 	}
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 47fcbade7389..a84a7cfb9d6d 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -274,7 +274,7 @@ static void destroy_nbp(struct net_bridge_port *p)
 
 	p->br = NULL;
 	p->dev = NULL;
-	dev_put_track(dev, &p->dev_tracker);
+	netdev_put(dev, &p->dev_tracker);
 
 	kobject_put(&p->kobj);
 }
@@ -423,7 +423,7 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br,
 		return ERR_PTR(-ENOMEM);
 
 	p->br = br;
-	dev_hold_track(dev, &p->dev_tracker, GFP_KERNEL);
+	netdev_hold(dev, &p->dev_tracker, GFP_KERNEL);
 	p->dev = dev;
 	p->path_cost = port_cost(dev);
 	p->priority = 0x8000 >> BR_PORT_BITS;
@@ -434,7 +434,7 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br,
 	br_stp_port_timer_init(p);
 	err = br_multicast_add_port(p);
 	if (err) {
-		dev_put_track(dev, &p->dev_tracker);
+		netdev_put(dev, &p->dev_tracker);
 		kfree(p);
 		p = ERR_PTR(err);
 	}
@@ -615,7 +615,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
 	err = dev_set_allmulti(dev, 1);
 	if (err) {
 		br_multicast_del_port(p);
-		dev_put_track(dev, &p->dev_tracker);
+		netdev_put(dev, &p->dev_tracker);
 		kfree(p);	/* kobject not yet init'd, manually free */
 		goto err1;
 	}
@@ -725,7 +725,7 @@ err3:
 	sysfs_remove_link(br->ifobj, p->dev->name);
 err2:
 	br_multicast_del_port(p);
-	dev_put_track(dev, &p->dev_tracker);
+	netdev_put(dev, &p->dev_tracker);
 	kobject_put(&p->kobj);
 	dev_set_allmulti(dev, -1);
 err1:
diff --git a/net/core/dev.c b/net/core/dev.c
index 08ce317fcec8..c9d96b85a825 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7463,7 +7463,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
 	adj->ref_nr = 1;
 	adj->private = private;
 	adj->ignore = false;
-	dev_hold_track(adj_dev, &adj->dev_tracker, GFP_KERNEL);
+	netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL);
 
 	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
 		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
@@ -7492,7 +7492,7 @@ remove_symlinks:
 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 free_adj:
-	dev_put_track(adj_dev, &adj->dev_tracker);
+	netdev_put(adj_dev, &adj->dev_tracker);
 	kfree(adj);
 
 	return ret;
@@ -7534,7 +7534,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
 	list_del_rcu(&adj->list);
 	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
 		 adj_dev->name, dev->name, adj_dev->name);
-	dev_put_track(adj_dev, &adj->dev_tracker);
+	netdev_put(adj_dev, &adj->dev_tracker);
 	kfree_rcu(adj, rcu);
 }
 
@@ -10062,7 +10062,7 @@ int register_netdevice(struct net_device *dev)
 
 	dev_init_scheduler(dev);
 
-	dev_hold_track(dev, &dev->dev_registered_tracker, GFP_KERNEL);
+	netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL);
 	list_netdevice(dev);
 
 	add_device_randomness(dev->dev_addr, dev->addr_len);
@@ -10868,7 +10868,7 @@ void unregister_netdevice_many(struct list_head *head)
 	synchronize_net();
 
 	list_for_each_entry(dev, head, unreg_list) {
-		dev_put_track(dev, &dev->dev_registered_tracker);
+		netdev_put(dev, &dev->dev_registered_tracker);
 		net_set_todo(dev);
 	}
 
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 4f6be442ae7e..7674bb9f3076 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -384,10 +384,10 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
 			return -ENODEV;
 		if (!netif_is_bridge_master(dev))
 			return -EOPNOTSUPP;
-		dev_hold_track(dev, &dev_tracker, GFP_KERNEL);
+		netdev_hold(dev, &dev_tracker, GFP_KERNEL);
 		rtnl_unlock();
 		err = br_ioctl_call(net, netdev_priv(dev), cmd, ifr, NULL);
-		dev_put_track(dev, &dev_tracker);
+		netdev_put(dev, &dev_tracker);
 		rtnl_lock();
 		return err;
 
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 4ad1decce724..804d02fc245f 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -864,7 +864,8 @@ net_dm_hw_metadata_copy(const struct devlink_trap_metadata *metadata)
 	}
 
 	hw_metadata->input_dev = metadata->input_dev;
-	dev_hold_track(hw_metadata->input_dev, &hw_metadata->dev_tracker, GFP_ATOMIC);
+	netdev_hold(hw_metadata->input_dev, &hw_metadata->dev_tracker,
+		    GFP_ATOMIC);
 
 	return hw_metadata;
 
@@ -880,7 +881,7 @@ free_hw_metadata:
 static void
 net_dm_hw_metadata_free(struct devlink_trap_metadata *hw_metadata)
 {
-	dev_put_track(hw_metadata->input_dev, &hw_metadata->dev_tracker);
+	netdev_put(hw_metadata->input_dev, &hw_metadata->dev_tracker);
 	kfree(hw_metadata->fa_cookie);
 	kfree(hw_metadata->trap_name);
 	kfree(hw_metadata->trap_group_name);
diff --git a/net/core/dst.c b/net/core/dst.c
index d16c2c9bfebd..bc9c9be4e080 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -49,7 +49,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
 	      unsigned short flags)
 {
 	dst->dev = dev;
-	dev_hold_track(dev, &dst->dev_tracker, GFP_ATOMIC);
+	netdev_hold(dev, &dst->dev_tracker, GFP_ATOMIC);
 	dst->ops = ops;
 	dst_init_metrics(dst, dst_default_metrics.metrics, true);
 	dst->expires = 0UL;
@@ -117,7 +117,7 @@ struct dst_entry *dst_destroy(struct dst_entry * dst)
 
 	if (dst->ops->destroy)
 		dst->ops->destroy(dst);
-	dev_put_track(dst->dev, &dst->dev_tracker);
+	netdev_put(dst->dev, &dst->dev_tracker);
 
 	lwtstate_put(dst->lwtstate);
 
@@ -159,8 +159,8 @@ void dst_dev_put(struct dst_entry *dst)
 	dst->input = dst_discard;
 	dst->output = dst_discard_out;
 	dst->dev = blackhole_netdev;
-	dev_replace_track(dev, blackhole_netdev, &dst->dev_tracker,
-			  GFP_ATOMIC);
+	netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker,
+			   GFP_ATOMIC);
 }
 EXPORT_SYMBOL(dst_dev_put);
 
diff --git a/net/core/failover.c b/net/core/failover.c
index dcaa92a85ea2..864d2d83eff4 100644
--- a/net/core/failover.c
+++ b/net/core/failover.c
@@ -252,7 +252,7 @@ struct failover *failover_register(struct net_device *dev,
 		return ERR_PTR(-ENOMEM);
 
 	rcu_assign_pointer(failover->ops, ops);
-	dev_hold_track(dev, &failover->dev_tracker, GFP_KERNEL);
+	netdev_hold(dev, &failover->dev_tracker, GFP_KERNEL);
 	dev->priv_flags |= IFF_FAILOVER;
 	rcu_assign_pointer(failover->failover_dev, dev);
 
@@ -285,7 +285,7 @@ void failover_unregister(struct failover *failover)
 		    failover_dev->name);
 
 	failover_dev->priv_flags &= ~IFF_FAILOVER;
-	dev_put_track(failover_dev, &failover->dev_tracker);
+	netdev_put(failover_dev, &failover->dev_tracker);
 
 	spin_lock(&failover_lock);
 	list_del(&failover->list);
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index a244d3bade7d..aa6cb1f90966 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -110,7 +110,7 @@ static void linkwatch_add_event(struct net_device *dev)
 	spin_lock_irqsave(&lweventlist_lock, flags);
 	if (list_empty(&dev->link_watch_list)) {
 		list_add_tail(&dev->link_watch_list, &lweventlist);
-		dev_hold_track(dev, &dev->linkwatch_dev_tracker, GFP_ATOMIC);
+		netdev_hold(dev, &dev->linkwatch_dev_tracker, GFP_ATOMIC);
 	}
 	spin_unlock_irqrestore(&lweventlist_lock, flags);
 }
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 54625287ee5b..d8ec70622ecb 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -624,7 +624,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey,
 
 	memcpy(n->primary_key, pkey, key_len);
 	n->dev = dev;
-	dev_hold_track(dev, &n->dev_tracker, GFP_ATOMIC);
+	netdev_hold(dev, &n->dev_tracker, GFP_ATOMIC);
 
 	/* Protocol specific setup. */
 	if (tbl->constructor &&	(error = tbl->constructor(n)) < 0) {
@@ -770,10 +770,10 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl,
 	write_pnet(&n->net, net);
 	memcpy(n->key, pkey, key_len);
 	n->dev = dev;
-	dev_hold_track(dev, &n->dev_tracker, GFP_KERNEL);
+	netdev_hold(dev, &n->dev_tracker, GFP_KERNEL);
 
 	if (tbl->pconstructor && tbl->pconstructor(n)) {
-		dev_put_track(dev, &n->dev_tracker);
+		netdev_put(dev, &n->dev_tracker);
 		kfree(n);
 		n = NULL;
 		goto out;
@@ -805,7 +805,7 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
 			write_unlock_bh(&tbl->lock);
 			if (tbl->pdestructor)
 				tbl->pdestructor(n);
-			dev_put_track(n->dev, &n->dev_tracker);
+			netdev_put(n->dev, &n->dev_tracker);
 			kfree(n);
 			return 0;
 		}
@@ -838,7 +838,7 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
 		n->next = NULL;
 		if (tbl->pdestructor)
 			tbl->pdestructor(n);
-		dev_put_track(n->dev, &n->dev_tracker);
+		netdev_put(n->dev, &n->dev_tracker);
 		kfree(n);
 	}
 	return -ENOENT;
@@ -879,7 +879,7 @@ void neigh_destroy(struct neighbour *neigh)
 	if (dev->netdev_ops->ndo_neigh_destroy)
 		dev->netdev_ops->ndo_neigh_destroy(dev, neigh);
 
-	dev_put_track(dev, &neigh->dev_tracker);
+	netdev_put(dev, &neigh->dev_tracker);
 	neigh_parms_put(neigh->parms);
 
 	neigh_dbg(2, "neigh %p is destroyed\n", neigh);
@@ -1671,13 +1671,13 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
 		refcount_set(&p->refcnt, 1);
 		p->reachable_time =
 				neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
-		dev_hold_track(dev, &p->dev_tracker, GFP_KERNEL);
+		netdev_hold(dev, &p->dev_tracker, GFP_KERNEL);
 		p->dev = dev;
 		write_pnet(&p->net, net);
 		p->sysctl_table = NULL;
 
 		if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) {
-			dev_put_track(dev, &p->dev_tracker);
+			netdev_put(dev, &p->dev_tracker);
 			kfree(p);
 			return NULL;
 		}
@@ -1708,7 +1708,7 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
 	list_del(&parms->list);
 	parms->dead = 1;
 	write_unlock_bh(&tbl->lock);
-	dev_put_track(parms->dev, &parms->dev_tracker);
+	netdev_put(parms->dev, &parms->dev_tracker);
 	call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
 }
 EXPORT_SYMBOL(neigh_parms_release);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index e319e242dddf..d49fc974e630 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1016,7 +1016,7 @@ static void rx_queue_release(struct kobject *kobj)
 #endif
 
 	memset(kobj, 0, sizeof(*kobj));
-	dev_put_track(queue->dev, &queue->dev_tracker);
+	netdev_put(queue->dev, &queue->dev_tracker);
 }
 
 static const void *rx_queue_namespace(struct kobject *kobj)
@@ -1056,7 +1056,7 @@ static int rx_queue_add_kobject(struct net_device *dev, int index)
 	/* Kobject_put later will trigger rx_queue_release call which
 	 * decreases dev refcount: Take that reference here
 	 */
-	dev_hold_track(queue->dev, &queue->dev_tracker, GFP_KERNEL);
+	netdev_hold(queue->dev, &queue->dev_tracker, GFP_KERNEL);
 
 	kobj->kset = dev->queues_kset;
 	error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
@@ -1619,7 +1619,7 @@ static void netdev_queue_release(struct kobject *kobj)
 	struct netdev_queue *queue = to_netdev_queue(kobj);
 
 	memset(kobj, 0, sizeof(*kobj));
-	dev_put_track(queue->dev, &queue->dev_tracker);
+	netdev_put(queue->dev, &queue->dev_tracker);
 }
 
 static const void *netdev_queue_namespace(struct kobject *kobj)
@@ -1659,7 +1659,7 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index)
 	/* Kobject_put later will trigger netdev_queue_release call
 	 * which decreases dev refcount: Take that reference here
 	 */
-	dev_hold_track(queue->dev, &queue->dev_tracker, GFP_KERNEL);
+	netdev_hold(queue->dev, &queue->dev_tracker, GFP_KERNEL);
 
 	kobj->kset = dev->queues_kset;
 	error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index db724463e7cd..5d27067b72d5 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -853,7 +853,7 @@ void netpoll_cleanup(struct netpoll *np)
 	if (!np->dev)
 		goto out;
 	__netpoll_cleanup(np);
-	dev_put_track(np->dev, &np->dev_tracker);
+	netdev_put(np->dev, &np->dev_tracker);
 	np->dev = NULL;
 out:
 	rtnl_unlock();
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 84b62cd7bc57..88906ba6d9a7 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2100,7 +2100,7 @@ static int pktgen_setup_dev(const struct pktgen_net *pn,
 
 	/* Clean old setups */
 	if (pkt_dev->odev) {
-		dev_put_track(pkt_dev->odev, &pkt_dev->dev_tracker);
+		netdev_put(pkt_dev->odev, &pkt_dev->dev_tracker);
 		pkt_dev->odev = NULL;
 	}
 
@@ -3807,7 +3807,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
 
 	return add_dev_to_thread(t, pkt_dev);
 out2:
-	dev_put_track(pkt_dev->odev, &pkt_dev->dev_tracker);
+	netdev_put(pkt_dev->odev, &pkt_dev->dev_tracker);
 out1:
 #ifdef CONFIG_XFRM
 	free_SAs(pkt_dev);
@@ -3901,7 +3901,7 @@ static int pktgen_remove_device(struct pktgen_thread *t,
 	/* Dis-associate from the interface */
 
 	if (pkt_dev->odev) {
-		dev_put_track(pkt_dev->odev, &pkt_dev->dev_tracker);
+		netdev_put(pkt_dev->odev, &pkt_dev->dev_tracker);
 		pkt_dev->odev = NULL;
 	}
 
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 326e14ee05db..d05ff6a17a1f 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -2010,7 +2010,7 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)
 	 * removal of the device.
 	 */
 	busy = true;
-	dev_hold_track(dev, &dev_tracker, GFP_KERNEL);
+	netdev_hold(dev, &dev_tracker, GFP_KERNEL);
 	rtnl_unlock();
 
 	if (rc == 0) {
@@ -2034,7 +2034,7 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)
 	}
 
 	rtnl_lock();
-	dev_put_track(dev, &dev_tracker);
+	netdev_put(dev, &dev_tracker);
 	busy = false;
 
 	(void) ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE);
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index 5fe8f4ae2ceb..e26079e11835 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -402,7 +402,7 @@ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info)
 		ops->cleanup_data(reply_data);
 
 	genlmsg_end(rskb, reply_payload);
-	dev_put_track(req_info->dev, &req_info->dev_tracker);
+	netdev_put(req_info->dev, &req_info->dev_tracker);
 	kfree(reply_data);
 	kfree(req_info);
 	return genlmsg_reply(rskb, info);
@@ -414,7 +414,7 @@ err_cleanup:
 	if (ops->cleanup_data)
 		ops->cleanup_data(reply_data);
 err_dev:
-	dev_put_track(req_info->dev, &req_info->dev_tracker);
+	netdev_put(req_info->dev, &req_info->dev_tracker);
 	kfree(reply_data);
 	kfree(req_info);
 	return ret;
@@ -550,7 +550,7 @@ static int ethnl_default_start(struct netlink_callback *cb)
 		 * same parser as for non-dump (doit) requests is used, it
 		 * would take reference to the device if it finds one
 		 */
-		dev_put_track(req_info->dev, &req_info->dev_tracker);
+		netdev_put(req_info->dev, &req_info->dev_tracker);
 		req_info->dev = NULL;
 	}
 	if (ret < 0)
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index 7919ddb2371c..c0d587611854 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -237,7 +237,7 @@ struct ethnl_req_info {
 
 static inline void ethnl_parse_header_dev_put(struct ethnl_req_info *req_info)
 {
-	dev_put_track(req_info->dev, &req_info->dev_tracker);
+	netdev_put(req_info->dev, &req_info->dev_tracker);
 }
 
 /**
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index b2366ad540e6..92b778e423df 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -244,7 +244,7 @@ void in_dev_finish_destroy(struct in_device *idev)
 #ifdef NET_REFCNT_DEBUG
 	pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL");
 #endif
-	dev_put_track(dev, &idev->dev_tracker);
+	netdev_put(dev, &idev->dev_tracker);
 	if (!idev->dead)
 		pr_err("Freeing alive in_device %p\n", idev);
 	else
@@ -272,7 +272,7 @@ static struct in_device *inetdev_init(struct net_device *dev)
 	if (IPV4_DEVCONF(in_dev->cnf, FORWARDING))
 		dev_disable_lro(dev);
 	/* Reference in_dev->dev */
-	dev_hold_track(dev, &in_dev->dev_tracker, GFP_KERNEL);
+	netdev_hold(dev, &in_dev->dev_tracker, GFP_KERNEL);
 	/* Account for reference dev->ip_ptr (below) */
 	refcount_set(&in_dev->refcnt, 1);
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index a57ba23571c9..a5439a8414d4 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -211,7 +211,7 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
 
 void fib_nh_common_release(struct fib_nh_common *nhc)
 {
-	dev_put_track(nhc->nhc_dev, &nhc->nhc_dev_tracker);
+	netdev_put(nhc->nhc_dev, &nhc->nhc_dev_tracker);
 	lwtstate_put(nhc->nhc_lwtstate);
 	rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output);
 	rt_fibinfo_free(&nhc->nhc_rth_input);
@@ -1057,7 +1057,8 @@ static int fib_check_nh_v6_gw(struct net *net, struct fib_nh *nh,
 	err = ipv6_stub->fib6_nh_init(net, &fib6_nh, &cfg, GFP_KERNEL, extack);
 	if (!err) {
 		nh->fib_nh_dev = fib6_nh.fib_nh_dev;
-		dev_hold_track(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_KERNEL);
+		netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker,
+			    GFP_KERNEL);
 		nh->fib_nh_oif = nh->fib_nh_dev->ifindex;
 		nh->fib_nh_scope = RT_SCOPE_LINK;
 
@@ -1141,7 +1142,7 @@ static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table,
 		if (!netif_carrier_ok(dev))
 			nh->fib_nh_flags |= RTNH_F_LINKDOWN;
 		nh->fib_nh_dev = dev;
-		dev_hold_track(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
+		netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
 		nh->fib_nh_scope = RT_SCOPE_LINK;
 		return 0;
 	}
@@ -1195,7 +1196,7 @@ static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table,
 			       "No egress device for nexthop gateway");
 		goto out;
 	}
-	dev_hold_track(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
+	netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
 	if (!netif_carrier_ok(dev))
 		nh->fib_nh_flags |= RTNH_F_LINKDOWN;
 	err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
@@ -1229,7 +1230,7 @@ static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh,
 	}
 
 	nh->fib_nh_dev = in_dev->dev;
-	dev_hold_track(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
+	netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
 	nh->fib_nh_scope = RT_SCOPE_HOST;
 	if (!netif_carrier_ok(nh->fib_nh_dev))
 		nh->fib_nh_flags |= RTNH_F_LINKDOWN;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 13e6329784fb..8324e541d193 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -691,7 +691,7 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
 	if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
 		unregister_netdevice_queue(dev, head);
 
-	dev_put_track(dev, &v->dev_tracker);
+	netdev_put(dev, &v->dev_tracker);
 	return 0;
 }
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 356f535f3443..2d16bcc7d346 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1550,9 +1550,8 @@ void rt_flush_dev(struct net_device *dev)
 			if (rt->dst.dev != dev)
 				continue;
 			rt->dst.dev = blackhole_netdev;
-			dev_replace_track(dev, blackhole_netdev,
-					  &rt->dst.dev_tracker,
-					  GFP_ATOMIC);
+			netdev_ref_replace(dev, blackhole_netdev,
+					   &rt->dst.dev_tracker, GFP_ATOMIC);
 			list_move(&rt->rt_uncached, &ul->quarantine);
 		}
 		spin_unlock_bh(&ul->lock);
@@ -2851,7 +2850,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		new->output = dst_discard_out;
 
 		new->dev = net->loopback_dev;
-		dev_hold_track(new->dev, &new->dev_tracker, GFP_ATOMIC);
+		netdev_hold(new->dev, &new->dev_tracker, GFP_ATOMIC);
 
 		rt->rt_is_input = ort->rt_is_input;
 		rt->rt_iif = ort->rt_iif;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 6fde0b184791..3d0dfa6cf9f9 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -75,7 +75,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.rt.rt_iif = fl4->flowi4_iif;
 
 	xdst->u.dst.dev = dev;
-	dev_hold_track(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC);
+	netdev_hold(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC);
 
 	/* Sheit... I remember I did this right. Apparently,
 	 * it was magically lost, so this code needs audit */
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 1b1932502e9e..3497ad1362c0 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -398,13 +398,13 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 	if (ndev->cnf.forwarding)
 		dev_disable_lro(dev);
 	/* We refer to the device */
-	dev_hold_track(dev, &ndev->dev_tracker, GFP_KERNEL);
+	netdev_hold(dev, &ndev->dev_tracker, GFP_KERNEL);
 
 	if (snmp6_alloc_dev(ndev) < 0) {
 		netdev_dbg(dev, "%s: cannot allocate memory for statistics\n",
 			   __func__);
 		neigh_parms_release(&nd_tbl, ndev->nd_parms);
-		dev_put_track(dev, &ndev->dev_tracker);
+		netdev_put(dev, &ndev->dev_tracker);
 		kfree(ndev);
 		return ERR_PTR(err);
 	}
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 881d1477d24a..507a8353a6bd 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -263,7 +263,7 @@ void in6_dev_finish_destroy(struct inet6_dev *idev)
 #ifdef NET_REFCNT_DEBUG
 	pr_debug("%s: %s\n", __func__, dev ? dev->name : "NIL");
 #endif
-	dev_put_track(dev, &idev->dev_tracker);
+	netdev_put(dev, &idev->dev_tracker);
 	if (!idev->dead) {
 		pr_warn("Freeing alive inet6 device %p\n", idev);
 		return;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 4e37f7c29900..3e22cbe5966a 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -398,7 +398,7 @@ static void ip6erspan_tunnel_uninit(struct net_device *dev)
 	ip6erspan_tunnel_unlink_md(ign, t);
 	ip6gre_tunnel_unlink(ign, t);
 	dst_cache_reset(&t->dst_cache);
-	dev_put_track(dev, &t->dev_tracker);
+	netdev_put(dev, &t->dev_tracker);
 }
 
 static void ip6gre_tunnel_uninit(struct net_device *dev)
@@ -411,7 +411,7 @@ static void ip6gre_tunnel_uninit(struct net_device *dev)
 	if (ign->fb_tunnel_dev == dev)
 		WRITE_ONCE(ign->fb_tunnel_dev, NULL);
 	dst_cache_reset(&t->dst_cache);
-	dev_put_track(dev, &t->dev_tracker);
+	netdev_put(dev, &t->dev_tracker);
 }
 
 
@@ -1495,7 +1495,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
 	}
 	ip6gre_tnl_init_features(dev);
 
-	dev_hold_track(dev, &tunnel->dev_tracker, GFP_KERNEL);
+	netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL);
 	return 0;
 
 cleanup_dst_cache_init:
@@ -1887,7 +1887,7 @@ static int ip6erspan_tap_init(struct net_device *dev)
 	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
 	ip6erspan_tnl_link_config(tunnel, 1);
 
-	dev_hold_track(dev, &tunnel->dev_tracker, GFP_KERNEL);
+	netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL);
 	return 0;
 
 cleanup_dst_cache_init:
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 19325b7600bb..689de5eb604e 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -381,7 +381,7 @@ ip6_tnl_dev_uninit(struct net_device *dev)
 	else
 		ip6_tnl_unlink(ip6n, t);
 	dst_cache_reset(&t->dst_cache);
-	dev_put_track(dev, &t->dev_tracker);
+	netdev_put(dev, &t->dev_tracker);
 }
 
 /**
@@ -1889,7 +1889,7 @@ ip6_tnl_dev_init_gen(struct net_device *dev)
 	dev->min_mtu = ETH_MIN_MTU;
 	dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len;
 
-	dev_hold_track(dev, &t->dev_tracker, GFP_KERNEL);
+	netdev_hold(dev, &t->dev_tracker, GFP_KERNEL);
 	return 0;
 
 destroy_dst:
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 3a434d75925c..8fe59a79e800 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -293,7 +293,7 @@ static void vti6_dev_uninit(struct net_device *dev)
 		RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL);
 	else
 		vti6_tnl_unlink(ip6n, t);
-	dev_put_track(dev, &t->dev_tracker);
+	netdev_put(dev, &t->dev_tracker);
 }
 
 static int vti6_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi,
@@ -936,7 +936,7 @@ static inline int vti6_dev_init_gen(struct net_device *dev)
 	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
 	if (!dev->tstats)
 		return -ENOMEM;
-	dev_hold_track(dev, &t->dev_tracker, GFP_KERNEL);
+	netdev_hold(dev, &t->dev_tracker, GFP_KERNEL);
 	return 0;
 }
 
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 4e74bc61a3db..d4aad41c9849 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -741,7 +741,7 @@ static int mif6_delete(struct mr_table *mrt, int vifi, int notify,
 	if ((v->flags & MIFF_REGISTER) && !notify)
 		unregister_netdevice_queue(dev, head);
 
-	dev_put_track(dev, &v->dev_tracker);
+	netdev_put(dev, &v->dev_tracker);
 	return 0;
 }
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index d25dc83bac62..0be01a4d48c1 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -182,9 +182,9 @@ static void rt6_uncached_list_flush_dev(struct net_device *dev)
 
 			if (rt_dev == dev) {
 				rt->dst.dev = blackhole_netdev;
-				dev_replace_track(rt_dev, blackhole_netdev,
-						  &rt->dst.dev_tracker,
-						  GFP_ATOMIC);
+				netdev_ref_replace(rt_dev, blackhole_netdev,
+						   &rt->dst.dev_tracker,
+						   GFP_ATOMIC);
 				handled = true;
 			}
 			if (handled)
@@ -607,7 +607,7 @@ static void rt6_probe_deferred(struct work_struct *w)
 
 	addrconf_addr_solict_mult(&work->target, &mcaddr);
 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
-	dev_put_track(work->dev, &work->dev_tracker);
+	netdev_put(work->dev, &work->dev_tracker);
 	kfree(work);
 }
 
@@ -661,7 +661,7 @@ static void rt6_probe(struct fib6_nh *fib6_nh)
 	} else {
 		INIT_WORK(&work->work, rt6_probe_deferred);
 		work->target = *nh_gw;
-		dev_hold_track(dev, &work->dev_tracker, GFP_ATOMIC);
+		netdev_hold(dev, &work->dev_tracker, GFP_ATOMIC);
 		work->dev = dev;
 		schedule_work(&work->work);
 	}
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index c0b138c20992..4f1721865fda 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -521,7 +521,7 @@ static void ipip6_tunnel_uninit(struct net_device *dev)
 		ipip6_tunnel_del_prl(tunnel, NULL);
 	}
 	dst_cache_reset(&tunnel->dst_cache);
-	dev_put_track(dev, &tunnel->dev_tracker);
+	netdev_put(dev, &tunnel->dev_tracker);
 }
 
 static int ipip6_err(struct sk_buff *skb, u32 info)
@@ -1463,7 +1463,7 @@ static int ipip6_tunnel_init(struct net_device *dev)
 		dev->tstats = NULL;
 		return err;
 	}
-	dev_hold_track(dev, &tunnel->dev_tracker, GFP_KERNEL);
+	netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL);
 	return 0;
 }
 
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index e64e427a51cf..4a4b0e49ec92 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -73,11 +73,11 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	struct rt6_info *rt = (struct rt6_info *)xdst->route;
 
 	xdst->u.dst.dev = dev;
-	dev_hold_track(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC);
+	netdev_hold(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC);
 
 	xdst->u.rt6.rt6i_idev = in6_dev_get(dev);
 	if (!xdst->u.rt6.rt6i_idev) {
-		dev_put_track(dev, &xdst->u.dst.dev_tracker);
+		netdev_put(dev, &xdst->u.dst.dev_tracker);
 		return -ENODEV;
 	}
 
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 7f555d2e5357..da7fe94bea2e 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -224,7 +224,7 @@ static int llc_ui_release(struct socket *sock)
 	} else {
 		release_sock(sk);
 	}
-	dev_put_track(llc->dev, &llc->dev_tracker);
+	netdev_put(llc->dev, &llc->dev_tracker);
 	sock_put(sk);
 	llc_sk_free(sk);
 out:
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index b498dac4e1e0..2f61d5bdce1a 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -115,7 +115,7 @@ error_master_upper_dev_unlink:
 error_unlock:
 	rtnl_unlock();
 error_put:
-	dev_put_track(vport->dev, &vport->dev_tracker);
+	netdev_put(vport->dev, &vport->dev_tracker);
 error_free_vport:
 	ovs_vport_free(vport);
 	return ERR_PTR(err);
@@ -137,7 +137,7 @@ static void vport_netdev_free(struct rcu_head *rcu)
 {
 	struct vport *vport = container_of(rcu, struct vport, rcu);
 
-	dev_put_track(vport->dev, &vport->dev_tracker);
+	netdev_put(vport->dev, &vport->dev_tracker);
 	ovs_vport_free(vport);
 }
 
@@ -173,7 +173,7 @@ void ovs_netdev_tunnel_destroy(struct vport *vport)
 	 */
 	if (vport->dev->reg_state == NETREG_REGISTERED)
 		rtnl_delete_link(vport->dev);
-	dev_put_track(vport->dev, &vport->dev_tracker);
+	netdev_put(vport->dev, &vport->dev_tracker);
 	vport->dev = NULL;
 	rtnl_unlock();
 
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index ca6e92a22923..d08c4728523b 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3134,7 +3134,7 @@ static int packet_release(struct socket *sock)
 	packet_cached_dev_reset(po);
 
 	if (po->prot_hook.dev) {
-		dev_put_track(po->prot_hook.dev, &po->prot_hook.dev_tracker);
+		netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker);
 		po->prot_hook.dev = NULL;
 	}
 	spin_unlock(&po->bind_lock);
@@ -3235,15 +3235,15 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
 		WRITE_ONCE(po->num, proto);
 		po->prot_hook.type = proto;
 
-		dev_put_track(po->prot_hook.dev, &po->prot_hook.dev_tracker);
+		netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker);
 
 		if (unlikely(unlisted)) {
 			po->prot_hook.dev = NULL;
 			WRITE_ONCE(po->ifindex, -1);
 			packet_cached_dev_reset(po);
 		} else {
-			dev_hold_track(dev, &po->prot_hook.dev_tracker,
-				       GFP_ATOMIC);
+			netdev_hold(dev, &po->prot_hook.dev_tracker,
+				    GFP_ATOMIC);
 			po->prot_hook.dev = dev;
 			WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0);
 			packet_cached_dev_assign(po, dev);
@@ -4167,8 +4167,8 @@ static int packet_notifier(struct notifier_block *this,
 				if (msg == NETDEV_UNREGISTER) {
 					packet_cached_dev_reset(po);
 					WRITE_ONCE(po->ifindex, -1);
-					dev_put_track(po->prot_hook.dev,
-						      &po->prot_hook.dev_tracker);
+					netdev_put(po->prot_hook.dev,
+						   &po->prot_hook.dev_tracker);
 					po->prot_hook.dev = NULL;
 				}
 				spin_unlock(&po->bind_lock);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index ebb92fb072ab..a1d70cf86843 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -79,7 +79,7 @@ static void tcf_mirred_release(struct tc_action *a)
 
 	/* last reference to action, no need to lock */
 	dev = rcu_dereference_protected(m->tcfm_dev, 1);
-	dev_put_track(dev, &m->tcfm_dev_tracker);
+	netdev_put(dev, &m->tcfm_dev_tracker);
 }
 
 static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
@@ -181,7 +181,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 		mac_header_xmit = dev_is_mac_header_xmit(ndev);
 		odev = rcu_replace_pointer(m->tcfm_dev, ndev,
 					  lockdep_is_held(&m->tcf_lock));
-		dev_put_track(odev, &m->tcfm_dev_tracker);
+		netdev_put(odev, &m->tcfm_dev_tracker);
 		netdev_tracker_alloc(ndev, &m->tcfm_dev_tracker, GFP_ATOMIC);
 		m->tcfm_mac_header_xmit = mac_header_xmit;
 	}
@@ -402,7 +402,7 @@ static int mirred_device_event(struct notifier_block *unused,
 		list_for_each_entry(m, &mirred_list, tcfm_list) {
 			spin_lock_bh(&m->tcf_lock);
 			if (tcf_mirred_dev_dereference(m) == dev) {
-				dev_put_track(dev, &m->tcfm_dev_tracker);
+				netdev_put(dev, &m->tcfm_dev_tracker);
 				/* Note : no rcu grace period necessary, as
 				 * net_device are already rcu protected.
 				 */
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index e3c0e8ea2dbb..bf87b50837a8 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1292,7 +1292,7 @@ err_out5:
 	if (ops->destroy)
 		ops->destroy(sch);
 err_out3:
-	dev_put_track(dev, &sch->dev_tracker);
+	netdev_put(dev, &sch->dev_tracker);
 	qdisc_free(sch);
 err_out2:
 	module_put(ops->owner);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index dba0b3e24af5..cc6eabee2830 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -541,7 +541,7 @@ static void dev_watchdog(struct timer_list *t)
 	spin_unlock(&dev->tx_global_lock);
 
 	if (release)
-		dev_put_track(dev, &dev->watchdog_dev_tracker);
+		netdev_put(dev, &dev->watchdog_dev_tracker);
 }
 
 void __netdev_watchdog_up(struct net_device *dev)
@@ -551,7 +551,8 @@ void __netdev_watchdog_up(struct net_device *dev)
 			dev->watchdog_timeo = 5*HZ;
 		if (!mod_timer(&dev->watchdog_timer,
 			       round_jiffies(jiffies + dev->watchdog_timeo)))
-			dev_hold_track(dev, &dev->watchdog_dev_tracker, GFP_ATOMIC);
+			netdev_hold(dev, &dev->watchdog_dev_tracker,
+				    GFP_ATOMIC);
 	}
 }
 EXPORT_SYMBOL_GPL(__netdev_watchdog_up);
@@ -565,7 +566,7 @@ static void dev_watchdog_down(struct net_device *dev)
 {
 	netif_tx_lock_bh(dev);
 	if (del_timer(&dev->watchdog_timer))
-		dev_put_track(dev, &dev->watchdog_dev_tracker);
+		netdev_put(dev, &dev->watchdog_dev_tracker);
 	netif_tx_unlock_bh(dev);
 }
 
@@ -975,7 +976,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	sch->enqueue = ops->enqueue;
 	sch->dequeue = ops->dequeue;
 	sch->dev_queue = dev_queue;
-	dev_hold_track(dev, &sch->dev_tracker, GFP_KERNEL);
+	netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL);
 	refcount_set(&sch->refcnt, 1);
 
 	return sch;
@@ -1067,7 +1068,7 @@ static void qdisc_destroy(struct Qdisc *qdisc)
 		ops->destroy(qdisc);
 
 	module_put(ops->owner);
-	dev_put_track(qdisc_dev(qdisc), &qdisc->dev_tracker);
+	netdev_put(qdisc_dev(qdisc), &qdisc->dev_tracker);
 
 	trace_qdisc_destroy(qdisc);
 
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index 7055ed10e316..4c3bf6db7038 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -120,7 +120,8 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name)
 		    smc_pnet_match(pnetelem->pnet_name, pnet_name)) {
 			list_del(&pnetelem->list);
 			if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev) {
-				dev_put_track(pnetelem->ndev, &pnetelem->dev_tracker);
+				netdev_put(pnetelem->ndev,
+					   &pnetelem->dev_tracker);
 				pr_warn_ratelimited("smc: net device %s "
 						    "erased user defined "
 						    "pnetid %.16s\n",
@@ -196,7 +197,7 @@ static int smc_pnet_add_by_ndev(struct net_device *ndev)
 	list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) {
 		if (pnetelem->type == SMC_PNET_ETH && !pnetelem->ndev &&
 		    !strncmp(pnetelem->eth_name, ndev->name, IFNAMSIZ)) {
-			dev_hold_track(ndev, &pnetelem->dev_tracker, GFP_ATOMIC);
+			netdev_hold(ndev, &pnetelem->dev_tracker, GFP_ATOMIC);
 			pnetelem->ndev = ndev;
 			rc = 0;
 			pr_warn_ratelimited("smc: adding net device %s with "
@@ -227,7 +228,7 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev)
 	mutex_lock(&pnettable->lock);
 	list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) {
 		if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev == ndev) {
-			dev_put_track(pnetelem->ndev, &pnetelem->dev_tracker);
+			netdev_put(pnetelem->ndev, &pnetelem->dev_tracker);
 			pnetelem->ndev = NULL;
 			rc = 0;
 			pr_warn_ratelimited("smc: removing net device %s with "
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 474f76383033..8cc42aea19c7 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -64,7 +64,7 @@ void switchdev_deferred_process(void)
 
 	while ((dfitem = switchdev_deferred_dequeue())) {
 		dfitem->func(dfitem->dev, dfitem->data);
-		dev_put_track(dfitem->dev, &dfitem->dev_tracker);
+		netdev_put(dfitem->dev, &dfitem->dev_tracker);
 		kfree(dfitem);
 	}
 }
@@ -91,7 +91,7 @@ static int switchdev_deferred_enqueue(struct net_device *dev,
 	dfitem->dev = dev;
 	dfitem->func = func;
 	memcpy(dfitem->data, data, data_len);
-	dev_hold_track(dev, &dfitem->dev_tracker, GFP_ATOMIC);
+	netdev_hold(dev, &dfitem->dev_tracker, GFP_ATOMIC);
 	spin_lock_bh(&deferred_lock);
 	list_add_tail(&dfitem->list, &deferred);
 	spin_unlock_bh(&deferred_lock);
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 932c87b98eca..35cac7733fd3 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -788,7 +788,7 @@ int tipc_attach_loopback(struct net *net)
 	if (!dev)
 		return -ENODEV;
 
-	dev_hold_track(dev, &tn->loopback_pt.dev_tracker, GFP_KERNEL);
+	netdev_hold(dev, &tn->loopback_pt.dev_tracker, GFP_KERNEL);
 	tn->loopback_pt.dev = dev;
 	tn->loopback_pt.type = htons(ETH_P_TIPC);
 	tn->loopback_pt.func = tipc_loopback_rcv_pkt;
@@ -801,7 +801,7 @@ void tipc_detach_loopback(struct net *net)
 	struct tipc_net *tn = tipc_net(net);
 
 	dev_remove_pack(&tn->loopback_pt);
-	dev_put_track(net->loopback_dev, &tn->loopback_pt.dev_tracker);
+	netdev_put(net->loopback_dev, &tn->loopback_pt.dev_tracker);
 }
 
 /* Caller should hold rtnl_lock to protect the bearer */
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 35c7e89b2e7d..637ca8838436 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -275,7 +275,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 		xso->dev = NULL;
 		xso->dir = 0;
 		xso->real_dev = NULL;
-		dev_put_track(dev, &xso->dev_tracker);
+		netdev_put(dev, &xso->dev_tracker);
 
 		if (err != -EOPNOTSUPP)
 			return err;
-- 
cgit 


From 09cca53c1656960c38c04bb12da30f61952ca660 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 8 Jun 2022 08:46:32 -0700
Subject: vlan: adopt u64_stats_t

As explained in commit 316580b69d0a ("u64_stats: provide u64_stats_t type")
we should use u64_stats_t and related accessors to avoid load/store tearing.

Add READ_ONCE() when reading rx_errors & tx_dropped.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/macvlan.c      | 18 +++++++++---------
 include/linux/if_macvlan.h |  6 +++---
 include/linux/if_vlan.h    | 10 +++++-----
 net/8021q/vlan_core.c      |  6 +++---
 net/8021q/vlan_dev.c       | 18 +++++++++---------
 5 files changed, 29 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 5b46a6526fc6..1080d6ebff63 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -575,8 +575,8 @@ static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
 
 		pcpu_stats = this_cpu_ptr(vlan->pcpu_stats);
 		u64_stats_update_begin(&pcpu_stats->syncp);
-		pcpu_stats->tx_packets++;
-		pcpu_stats->tx_bytes += len;
+		u64_stats_inc(&pcpu_stats->tx_packets);
+		u64_stats_add(&pcpu_stats->tx_bytes, len);
 		u64_stats_update_end(&pcpu_stats->syncp);
 	} else {
 		this_cpu_inc(vlan->pcpu_stats->tx_dropped);
@@ -949,11 +949,11 @@ static void macvlan_dev_get_stats64(struct net_device *dev,
 			p = per_cpu_ptr(vlan->pcpu_stats, i);
 			do {
 				start = u64_stats_fetch_begin_irq(&p->syncp);
-				rx_packets	= p->rx_packets;
-				rx_bytes	= p->rx_bytes;
-				rx_multicast	= p->rx_multicast;
-				tx_packets	= p->tx_packets;
-				tx_bytes	= p->tx_bytes;
+				rx_packets	= u64_stats_read(&p->rx_packets);
+				rx_bytes	= u64_stats_read(&p->rx_bytes);
+				rx_multicast	= u64_stats_read(&p->rx_multicast);
+				tx_packets	= u64_stats_read(&p->tx_packets);
+				tx_bytes	= u64_stats_read(&p->tx_bytes);
 			} while (u64_stats_fetch_retry_irq(&p->syncp, start));
 
 			stats->rx_packets	+= rx_packets;
@@ -964,8 +964,8 @@ static void macvlan_dev_get_stats64(struct net_device *dev,
 			/* rx_errors & tx_dropped are u32, updated
 			 * without syncp protection.
 			 */
-			rx_errors	+= p->rx_errors;
-			tx_dropped	+= p->tx_dropped;
+			rx_errors	+= READ_ONCE(p->rx_errors);
+			tx_dropped	+= READ_ONCE(p->tx_dropped);
 		}
 		stats->rx_errors	= rx_errors;
 		stats->rx_dropped	= rx_errors;
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index b42294739063..523025106a64 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -46,10 +46,10 @@ static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
 
 		pcpu_stats = get_cpu_ptr(vlan->pcpu_stats);
 		u64_stats_update_begin(&pcpu_stats->syncp);
-		pcpu_stats->rx_packets++;
-		pcpu_stats->rx_bytes += len;
+		u64_stats_inc(&pcpu_stats->rx_packets);
+		u64_stats_add(&pcpu_stats->rx_bytes, len);
 		if (multicast)
-			pcpu_stats->rx_multicast++;
+			u64_stats_inc(&pcpu_stats->rx_multicast);
 		u64_stats_update_end(&pcpu_stats->syncp);
 		put_cpu_ptr(vlan->pcpu_stats);
 	} else {
diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 2be4dd7e90a9..e00c4ee81ff7 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -118,11 +118,11 @@ static inline void vlan_drop_rx_stag_filter_info(struct net_device *dev)
  *	@tx_dropped: number of tx drops
  */
 struct vlan_pcpu_stats {
-	u64			rx_packets;
-	u64			rx_bytes;
-	u64			rx_multicast;
-	u64			tx_packets;
-	u64			tx_bytes;
+	u64_stats_t		rx_packets;
+	u64_stats_t		rx_bytes;
+	u64_stats_t		rx_multicast;
+	u64_stats_t		tx_packets;
+	u64_stats_t		tx_bytes;
 	struct u64_stats_sync	syncp;
 	u32			rx_errors;
 	u32			tx_dropped;
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index acf8c791f320..5aa8144101dc 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -63,10 +63,10 @@ bool vlan_do_receive(struct sk_buff **skbp)
 	rx_stats = this_cpu_ptr(vlan_dev_priv(vlan_dev)->vlan_pcpu_stats);
 
 	u64_stats_update_begin(&rx_stats->syncp);
-	rx_stats->rx_packets++;
-	rx_stats->rx_bytes += skb->len;
+	u64_stats_inc(&rx_stats->rx_packets);
+	u64_stats_add(&rx_stats->rx_bytes, skb->len);
 	if (skb->pkt_type == PACKET_MULTICAST)
-		rx_stats->rx_multicast++;
+		u64_stats_inc(&rx_stats->rx_multicast);
 	u64_stats_update_end(&rx_stats->syncp);
 
 	return true;
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index e3dff2df6f54..035812b0461c 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -128,8 +128,8 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
 
 		stats = this_cpu_ptr(vlan->vlan_pcpu_stats);
 		u64_stats_update_begin(&stats->syncp);
-		stats->tx_packets++;
-		stats->tx_bytes += len;
+		u64_stats_inc(&stats->tx_packets);
+		u64_stats_add(&stats->tx_bytes, len);
 		u64_stats_update_end(&stats->syncp);
 	} else {
 		this_cpu_inc(vlan->vlan_pcpu_stats->tx_dropped);
@@ -713,11 +713,11 @@ static void vlan_dev_get_stats64(struct net_device *dev,
 		p = per_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats, i);
 		do {
 			start = u64_stats_fetch_begin_irq(&p->syncp);
-			rxpackets	= p->rx_packets;
-			rxbytes		= p->rx_bytes;
-			rxmulticast	= p->rx_multicast;
-			txpackets	= p->tx_packets;
-			txbytes		= p->tx_bytes;
+			rxpackets	= u64_stats_read(&p->rx_packets);
+			rxbytes		= u64_stats_read(&p->rx_bytes);
+			rxmulticast	= u64_stats_read(&p->rx_multicast);
+			txpackets	= u64_stats_read(&p->tx_packets);
+			txbytes		= u64_stats_read(&p->tx_bytes);
 		} while (u64_stats_fetch_retry_irq(&p->syncp, start));
 
 		stats->rx_packets	+= rxpackets;
@@ -726,8 +726,8 @@ static void vlan_dev_get_stats64(struct net_device *dev,
 		stats->tx_packets	+= txpackets;
 		stats->tx_bytes		+= txbytes;
 		/* rx_errors & tx_dropped are u32 */
-		rx_errors	+= p->rx_errors;
-		tx_dropped	+= p->tx_dropped;
+		rx_errors	+= READ_ONCE(p->rx_errors);
+		tx_dropped	+= READ_ONCE(p->tx_dropped);
 	}
 	stats->rx_errors  = rx_errors;
 	stats->tx_dropped = tx_dropped;
-- 
cgit 


From 9962acefbcb92736c268aafe5f52200948f60f3e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 8 Jun 2022 08:46:37 -0700
Subject: net: adopt u64_stats_t in struct pcpu_sw_netstats

As explained in commit 316580b69d0a ("u64_stats: provide u64_stats_t type")
we should use u64_stats_t and related accessors to avoid load/store tearing.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/macsec.c           |  8 ++++----
 drivers/net/usb/usbnet.c       |  8 ++++----
 drivers/net/vxlan/vxlan_core.c |  8 ++++----
 include/linux/netdevice.h      | 16 ++++++++--------
 include/net/ip_tunnels.h       |  4 ++--
 net/bridge/br_netlink.c        |  8 ++++----
 net/bridge/br_vlan.c           | 36 ++++++++++++++++++++----------------
 net/core/dev.c                 | 18 +++++++++---------
 net/dsa/slave.c                |  8 ++++----
 9 files changed, 59 insertions(+), 55 deletions(-)

(limited to 'include')

diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index 815738c0e067..c881e1bf6f6e 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -523,8 +523,8 @@ static void count_tx(struct net_device *dev, int ret, int len)
 		struct pcpu_sw_netstats *stats = this_cpu_ptr(dev->tstats);
 
 		u64_stats_update_begin(&stats->syncp);
-		stats->tx_packets++;
-		stats->tx_bytes += len;
+		u64_stats_inc(&stats->tx_packets);
+		u64_stats_add(&stats->tx_bytes, len);
 		u64_stats_update_end(&stats->syncp);
 	}
 }
@@ -825,8 +825,8 @@ static void count_rx(struct net_device *dev, int len)
 	struct pcpu_sw_netstats *stats = this_cpu_ptr(dev->tstats);
 
 	u64_stats_update_begin(&stats->syncp);
-	stats->rx_packets++;
-	stats->rx_bytes += len;
+	u64_stats_inc(&stats->rx_packets);
+	u64_stats_add(&stats->rx_bytes, len);
 	u64_stats_update_end(&stats->syncp);
 }
 
diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index 1cb6dab3e2d0..dc79811535c2 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -337,8 +337,8 @@ void usbnet_skb_return (struct usbnet *dev, struct sk_buff *skb)
 		skb->protocol = eth_type_trans (skb, dev->net);
 
 	flags = u64_stats_update_begin_irqsave(&stats64->syncp);
-	stats64->rx_packets++;
-	stats64->rx_bytes += skb->len;
+	u64_stats_inc(&stats64->rx_packets);
+	u64_stats_add(&stats64->rx_bytes, skb->len);
 	u64_stats_update_end_irqrestore(&stats64->syncp, flags);
 
 	netif_dbg(dev, rx_status, dev->net, "< rx, len %zu, type 0x%x\n",
@@ -1258,8 +1258,8 @@ static void tx_complete (struct urb *urb)
 		unsigned long flags;
 
 		flags = u64_stats_update_begin_irqsave(&stats64->syncp);
-		stats64->tx_packets += entry->packets;
-		stats64->tx_bytes += entry->length;
+		u64_stats_add(&stats64->tx_packets, entry->packets);
+		u64_stats_add(&stats64->tx_bytes, entry->length);
 		u64_stats_update_end_irqrestore(&stats64->syncp, flags);
 	} else {
 		dev->net->stats.tx_errors++;
diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index 265d4a0245e7..8b0710b576c2 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -2385,15 +2385,15 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
 		vxlan_snoop(dev, &loopback, eth_hdr(skb)->h_source, 0, vni);
 
 	u64_stats_update_begin(&tx_stats->syncp);
-	tx_stats->tx_packets++;
-	tx_stats->tx_bytes += len;
+	u64_stats_inc(&tx_stats->tx_packets);
+	u64_stats_add(&tx_stats->tx_bytes, len);
 	u64_stats_update_end(&tx_stats->syncp);
 	vxlan_vnifilter_count(src_vxlan, vni, NULL, VXLAN_VNI_STATS_TX, len);
 
 	if (__netif_rx(skb) == NET_RX_SUCCESS) {
 		u64_stats_update_begin(&rx_stats->syncp);
-		rx_stats->rx_packets++;
-		rx_stats->rx_bytes += len;
+		u64_stats_inc(&rx_stats->rx_packets);
+		u64_stats_add(&rx_stats->rx_bytes, len);
 		u64_stats_update_end(&rx_stats->syncp);
 		vxlan_vnifilter_count(dst_vxlan, vni, NULL, VXLAN_VNI_STATS_RX,
 				      len);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e2e5088888b1..89afa4f7747d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2636,10 +2636,10 @@ struct packet_offload {
 
 /* often modified stats are per-CPU, other are shared (netdev->stats) */
 struct pcpu_sw_netstats {
-	u64     rx_packets;
-	u64     rx_bytes;
-	u64     tx_packets;
-	u64     tx_bytes;
+	u64_stats_t		rx_packets;
+	u64_stats_t		rx_bytes;
+	u64_stats_t		tx_packets;
+	u64_stats_t		tx_bytes;
 	struct u64_stats_sync   syncp;
 } __aligned(4 * sizeof(u64));
 
@@ -2656,8 +2656,8 @@ static inline void dev_sw_netstats_rx_add(struct net_device *dev, unsigned int l
 	struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
 
 	u64_stats_update_begin(&tstats->syncp);
-	tstats->rx_bytes += len;
-	tstats->rx_packets++;
+	u64_stats_add(&tstats->rx_bytes, len);
+	u64_stats_inc(&tstats->rx_packets);
 	u64_stats_update_end(&tstats->syncp);
 }
 
@@ -2668,8 +2668,8 @@ static inline void dev_sw_netstats_tx_add(struct net_device *dev,
 	struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
 
 	u64_stats_update_begin(&tstats->syncp);
-	tstats->tx_bytes += len;
-	tstats->tx_packets += packets;
+	u64_stats_add(&tstats->tx_bytes, len);
+	u64_stats_add(&tstats->tx_packets, packets);
 	u64_stats_update_end(&tstats->syncp);
 }
 
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index c24fa934221d..70cbc4a72669 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -456,8 +456,8 @@ static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len)
 		struct pcpu_sw_netstats *tstats = get_cpu_ptr(dev->tstats);
 
 		u64_stats_update_begin(&tstats->syncp);
-		tstats->tx_bytes += pkt_len;
-		tstats->tx_packets++;
+		u64_stats_add(&tstats->tx_bytes, pkt_len);
+		u64_stats_inc(&tstats->tx_packets);
 		u64_stats_update_end(&tstats->syncp);
 		put_cpu_ptr(tstats);
 	} else {
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index bb01776d2d88..1ef14a099c6b 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1770,10 +1770,10 @@ static int br_fill_linkxstats(struct sk_buff *skb,
 			if (v->vid == pvid)
 				vxi.flags |= BRIDGE_VLAN_INFO_PVID;
 			br_vlan_get_stats(v, &stats);
-			vxi.rx_bytes = stats.rx_bytes;
-			vxi.rx_packets = stats.rx_packets;
-			vxi.tx_bytes = stats.tx_bytes;
-			vxi.tx_packets = stats.tx_packets;
+			vxi.rx_bytes = u64_stats_read(&stats.rx_bytes);
+			vxi.rx_packets = u64_stats_read(&stats.rx_packets);
+			vxi.tx_bytes = u64_stats_read(&stats.tx_bytes);
+			vxi.tx_packets = u64_stats_read(&stats.tx_packets);
 
 			if (nla_put(skb, BRIDGE_XSTATS_VLAN, sizeof(vxi), &vxi))
 				goto nla_put_failure;
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 0f5e75ccac79..6e53dc991409 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -505,8 +505,8 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br,
 	if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) {
 		stats = this_cpu_ptr(v->stats);
 		u64_stats_update_begin(&stats->syncp);
-		stats->tx_bytes += skb->len;
-		stats->tx_packets++;
+		u64_stats_add(&stats->tx_bytes, skb->len);
+		u64_stats_inc(&stats->tx_packets);
 		u64_stats_update_end(&stats->syncp);
 	}
 
@@ -624,8 +624,8 @@ static bool __allowed_ingress(const struct net_bridge *br,
 	if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) {
 		stats = this_cpu_ptr(v->stats);
 		u64_stats_update_begin(&stats->syncp);
-		stats->rx_bytes += skb->len;
-		stats->rx_packets++;
+		u64_stats_add(&stats->rx_bytes, skb->len);
+		u64_stats_inc(&stats->rx_packets);
 		u64_stats_update_end(&stats->syncp);
 	}
 
@@ -1379,16 +1379,16 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v,
 		cpu_stats = per_cpu_ptr(v->stats, i);
 		do {
 			start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
-			rxpackets = cpu_stats->rx_packets;
-			rxbytes = cpu_stats->rx_bytes;
-			txbytes = cpu_stats->tx_bytes;
-			txpackets = cpu_stats->tx_packets;
+			rxpackets = u64_stats_read(&cpu_stats->rx_packets);
+			rxbytes = u64_stats_read(&cpu_stats->rx_bytes);
+			txbytes = u64_stats_read(&cpu_stats->tx_bytes);
+			txpackets = u64_stats_read(&cpu_stats->tx_packets);
 		} while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
 
-		stats->rx_packets += rxpackets;
-		stats->rx_bytes += rxbytes;
-		stats->tx_bytes += txbytes;
-		stats->tx_packets += txpackets;
+		u64_stats_add(&stats->rx_packets, rxpackets);
+		u64_stats_add(&stats->rx_bytes, rxbytes);
+		u64_stats_add(&stats->tx_bytes, txbytes);
+		u64_stats_add(&stats->tx_packets, txpackets);
 	}
 }
 
@@ -1779,14 +1779,18 @@ static bool br_vlan_stats_fill(struct sk_buff *skb,
 		return false;
 
 	br_vlan_get_stats(v, &stats);
-	if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_BYTES, stats.rx_bytes,
+	if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_BYTES,
+			      u64_stats_read(&stats.rx_bytes),
 			      BRIDGE_VLANDB_STATS_PAD) ||
 	    nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_PACKETS,
-			      stats.rx_packets, BRIDGE_VLANDB_STATS_PAD) ||
-	    nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_BYTES, stats.tx_bytes,
+			      u64_stats_read(&stats.rx_packets),
+			      BRIDGE_VLANDB_STATS_PAD) ||
+	    nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_BYTES,
+			      u64_stats_read(&stats.tx_bytes),
 			      BRIDGE_VLANDB_STATS_PAD) ||
 	    nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_PACKETS,
-			      stats.tx_packets, BRIDGE_VLANDB_STATS_PAD))
+			      u64_stats_read(&stats.tx_packets),
+			      BRIDGE_VLANDB_STATS_PAD))
 		goto out_err;
 
 	nla_nest_end(skb, nest);
diff --git a/net/core/dev.c b/net/core/dev.c
index c9d96b85a825..511cf5d3aade 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10459,23 +10459,23 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
+		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
 		const struct pcpu_sw_netstats *stats;
-		struct pcpu_sw_netstats tmp;
 		unsigned int start;
 
 		stats = per_cpu_ptr(netstats, cpu);
 		do {
 			start = u64_stats_fetch_begin_irq(&stats->syncp);
-			tmp.rx_packets = stats->rx_packets;
-			tmp.rx_bytes   = stats->rx_bytes;
-			tmp.tx_packets = stats->tx_packets;
-			tmp.tx_bytes   = stats->tx_bytes;
+			rx_packets = u64_stats_read(&stats->rx_packets);
+			rx_bytes   = u64_stats_read(&stats->rx_bytes);
+			tx_packets = u64_stats_read(&stats->tx_packets);
+			tx_bytes   = u64_stats_read(&stats->tx_bytes);
 		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
 
-		s->rx_packets += tmp.rx_packets;
-		s->rx_bytes   += tmp.rx_bytes;
-		s->tx_packets += tmp.tx_packets;
-		s->tx_bytes   += tmp.tx_bytes;
+		s->rx_packets += rx_packets;
+		s->rx_bytes   += rx_bytes;
+		s->tx_packets += tx_packets;
+		s->tx_bytes   += tx_bytes;
 	}
 }
 EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 801a5d445833..2e1ac638d135 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -935,10 +935,10 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev,
 		s = per_cpu_ptr(dev->tstats, i);
 		do {
 			start = u64_stats_fetch_begin_irq(&s->syncp);
-			tx_packets = s->tx_packets;
-			tx_bytes = s->tx_bytes;
-			rx_packets = s->rx_packets;
-			rx_bytes = s->rx_bytes;
+			tx_packets = u64_stats_read(&s->tx_packets);
+			tx_bytes = u64_stats_read(&s->tx_bytes);
+			rx_packets = u64_stats_read(&s->rx_packets);
+			rx_bytes = u64_stats_read(&s->rx_bytes);
 		} while (u64_stats_fetch_retry_irq(&s->syncp, start));
 		data[0] += tx_packets;
 		data[1] += tx_bytes;
-- 
cgit 


From 9ec321aba2eaca109139a2a67c65ede7f07bd8ec Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 8 Jun 2022 08:46:40 -0700
Subject: team: adopt u64_stats_t

As explained in commit 316580b69d0a ("u64_stats: provide u64_stats_t type")
we should use u64_stats_t and related accessors to avoid load/store tearing.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/team/team.c | 26 +++++++++++++-------------
 include/linux/if_team.h | 10 +++++-----
 2 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index b07dde6f0abf..aac133a1e27a 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -749,10 +749,10 @@ static rx_handler_result_t team_handle_frame(struct sk_buff **pskb)
 
 		pcpu_stats = this_cpu_ptr(team->pcpu_stats);
 		u64_stats_update_begin(&pcpu_stats->syncp);
-		pcpu_stats->rx_packets++;
-		pcpu_stats->rx_bytes += skb->len;
+		u64_stats_inc(&pcpu_stats->rx_packets);
+		u64_stats_add(&pcpu_stats->rx_bytes, skb->len);
 		if (skb->pkt_type == PACKET_MULTICAST)
-			pcpu_stats->rx_multicast++;
+			u64_stats_inc(&pcpu_stats->rx_multicast);
 		u64_stats_update_end(&pcpu_stats->syncp);
 
 		skb->dev = team->dev;
@@ -1720,8 +1720,8 @@ static netdev_tx_t team_xmit(struct sk_buff *skb, struct net_device *dev)
 
 		pcpu_stats = this_cpu_ptr(team->pcpu_stats);
 		u64_stats_update_begin(&pcpu_stats->syncp);
-		pcpu_stats->tx_packets++;
-		pcpu_stats->tx_bytes += len;
+		u64_stats_inc(&pcpu_stats->tx_packets);
+		u64_stats_add(&pcpu_stats->tx_bytes, len);
 		u64_stats_update_end(&pcpu_stats->syncp);
 	} else {
 		this_cpu_inc(team->pcpu_stats->tx_dropped);
@@ -1854,11 +1854,11 @@ team_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 		p = per_cpu_ptr(team->pcpu_stats, i);
 		do {
 			start = u64_stats_fetch_begin_irq(&p->syncp);
-			rx_packets	= p->rx_packets;
-			rx_bytes	= p->rx_bytes;
-			rx_multicast	= p->rx_multicast;
-			tx_packets	= p->tx_packets;
-			tx_bytes	= p->tx_bytes;
+			rx_packets	= u64_stats_read(&p->rx_packets);
+			rx_bytes	= u64_stats_read(&p->rx_bytes);
+			rx_multicast	= u64_stats_read(&p->rx_multicast);
+			tx_packets	= u64_stats_read(&p->tx_packets);
+			tx_bytes	= u64_stats_read(&p->tx_bytes);
 		} while (u64_stats_fetch_retry_irq(&p->syncp, start));
 
 		stats->rx_packets	+= rx_packets;
@@ -1870,9 +1870,9 @@ team_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 		 * rx_dropped, tx_dropped & rx_nohandler are u32,
 		 * updated without syncp protection.
 		 */
-		rx_dropped	+= p->rx_dropped;
-		tx_dropped	+= p->tx_dropped;
-		rx_nohandler	+= p->rx_nohandler;
+		rx_dropped	+= READ_ONCE(p->rx_dropped);
+		tx_dropped	+= READ_ONCE(p->tx_dropped);
+		rx_nohandler	+= READ_ONCE(p->rx_nohandler);
 	}
 	stats->rx_dropped	= rx_dropped;
 	stats->tx_dropped	= tx_dropped;
diff --git a/include/linux/if_team.h b/include/linux/if_team.h
index add607943c95..fc985e5c739d 100644
--- a/include/linux/if_team.h
+++ b/include/linux/if_team.h
@@ -12,11 +12,11 @@
 #include <uapi/linux/if_team.h>
 
 struct team_pcpu_stats {
-	u64			rx_packets;
-	u64			rx_bytes;
-	u64			rx_multicast;
-	u64			tx_packets;
-	u64			tx_bytes;
+	u64_stats_t		rx_packets;
+	u64_stats_t		rx_bytes;
+	u64_stats_t		rx_multicast;
+	u64_stats_t		tx_packets;
+	u64_stats_t		tx_bytes;
 	struct u64_stats_sync	syncp;
 	u32			rx_dropped;
 	u32			tx_dropped;
-- 
cgit 


From 2bff369b23542ea22d0111aaa8e0b8208bf0dc96 Mon Sep 17 00:00:00 2001
From: Jonathan Toppins <jtoppins@redhat.com>
Date: Wed, 8 Jun 2022 14:14:56 -0400
Subject: bonding: netlink error message support for options

Add support for reporting errors via extack in both bond_newlink
and bond_changelink.

Instead of having to look in the kernel log for why an option was not
correct just report the error to the user via the extack variable.

What is currently reported today:
  ip link add bond0 type bond
  ip link set bond0 up
  ip link set bond0 type bond mode 4
 RTNETLINK answers: Device or resource busy

After this change:
  ip link add bond0 type bond
  ip link set bond0 up
  ip link set bond0 type bond mode 4
 Error: unable to set option because the bond is up.

Signed-off-by: Jonathan Toppins <jtoppins@redhat.com>
Acked-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/bonding/bond_netlink.c | 101 +++++++++++++++++++++++++------------
 drivers/net/bonding/bond_options.c |  32 +++++++++---
 include/net/bond_options.h         |   3 +-
 3 files changed, 95 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c
index 6f404f9c34e3..5a6f44455b95 100644
--- a/drivers/net/bonding/bond_netlink.c
+++ b/drivers/net/bonding/bond_netlink.c
@@ -151,7 +151,8 @@ static int bond_slave_changelink(struct net_device *bond_dev,
 		snprintf(queue_id_str, sizeof(queue_id_str), "%s:%u\n",
 			 slave_dev->name, queue_id);
 		bond_opt_initstr(&newval, queue_id_str);
-		err = __bond_opt_set(bond, BOND_OPT_QUEUE_ID, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_QUEUE_ID, &newval,
+				     data[IFLA_BOND_SLAVE_QUEUE_ID], extack);
 		if (err)
 			return err;
 	}
@@ -175,7 +176,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 		int mode = nla_get_u8(data[IFLA_BOND_MODE]);
 
 		bond_opt_initval(&newval, mode);
-		err = __bond_opt_set(bond, BOND_OPT_MODE, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_MODE, &newval,
+				     data[IFLA_BOND_MODE], extack);
 		if (err)
 			return err;
 	}
@@ -192,7 +194,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 			active_slave = slave_dev->name;
 		}
 		bond_opt_initstr(&newval, active_slave);
-		err = __bond_opt_set(bond, BOND_OPT_ACTIVE_SLAVE, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_ACTIVE_SLAVE, &newval,
+				     data[IFLA_BOND_ACTIVE_SLAVE], extack);
 		if (err)
 			return err;
 	}
@@ -200,7 +203,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 		miimon = nla_get_u32(data[IFLA_BOND_MIIMON]);
 
 		bond_opt_initval(&newval, miimon);
-		err = __bond_opt_set(bond, BOND_OPT_MIIMON, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_MIIMON, &newval,
+				     data[IFLA_BOND_MIIMON], extack);
 		if (err)
 			return err;
 	}
@@ -208,7 +212,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 		int updelay = nla_get_u32(data[IFLA_BOND_UPDELAY]);
 
 		bond_opt_initval(&newval, updelay);
-		err = __bond_opt_set(bond, BOND_OPT_UPDELAY, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_UPDELAY, &newval,
+				     data[IFLA_BOND_UPDELAY], extack);
 		if (err)
 			return err;
 	}
@@ -216,7 +221,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 		int downdelay = nla_get_u32(data[IFLA_BOND_DOWNDELAY]);
 
 		bond_opt_initval(&newval, downdelay);
-		err = __bond_opt_set(bond, BOND_OPT_DOWNDELAY, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_DOWNDELAY, &newval,
+				     data[IFLA_BOND_DOWNDELAY], extack);
 		if (err)
 			return err;
 	}
@@ -224,7 +230,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 		int delay = nla_get_u32(data[IFLA_BOND_PEER_NOTIF_DELAY]);
 
 		bond_opt_initval(&newval, delay);
-		err = __bond_opt_set(bond, BOND_OPT_PEER_NOTIF_DELAY, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_PEER_NOTIF_DELAY, &newval,
+				     data[IFLA_BOND_PEER_NOTIF_DELAY], extack);
 		if (err)
 			return err;
 	}
@@ -232,7 +239,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 		int use_carrier = nla_get_u8(data[IFLA_BOND_USE_CARRIER]);
 
 		bond_opt_initval(&newval, use_carrier);
-		err = __bond_opt_set(bond, BOND_OPT_USE_CARRIER, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_USE_CARRIER, &newval,
+				     data[IFLA_BOND_USE_CARRIER], extack);
 		if (err)
 			return err;
 	}
@@ -240,12 +248,14 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 		int arp_interval = nla_get_u32(data[IFLA_BOND_ARP_INTERVAL]);
 
 		if (arp_interval && miimon) {
-			netdev_err(bond->dev, "ARP monitoring cannot be used with MII monitoring\n");
+			NL_SET_ERR_MSG_ATTR(extack, data[IFLA_BOND_ARP_INTERVAL],
+					    "ARP monitoring cannot be used with MII monitoring");
 			return -EINVAL;
 		}
 
 		bond_opt_initval(&newval, arp_interval);
-		err = __bond_opt_set(bond, BOND_OPT_ARP_INTERVAL, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_ARP_INTERVAL, &newval,
+				     data[IFLA_BOND_ARP_INTERVAL], extack);
 		if (err)
 			return err;
 	}
@@ -264,7 +274,9 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 
 			bond_opt_initval(&newval, (__force u64)target);
 			err = __bond_opt_set(bond, BOND_OPT_ARP_TARGETS,
-					     &newval);
+					     &newval,
+					     data[IFLA_BOND_ARP_IP_TARGET],
+					     extack);
 			if (err)
 				break;
 			i++;
@@ -292,7 +304,9 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 
 			bond_opt_initextra(&newval, &addr6, sizeof(addr6));
 			err = __bond_opt_set(bond, BOND_OPT_NS_TARGETS,
-					     &newval);
+					     &newval,
+					     data[IFLA_BOND_NS_IP6_TARGET],
+					     extack);
 			if (err)
 				break;
 			i++;
@@ -307,12 +321,14 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 		int arp_validate = nla_get_u32(data[IFLA_BOND_ARP_VALIDATE]);
 
 		if (arp_validate && miimon) {
-			netdev_err(bond->dev, "ARP validating cannot be used with MII monitoring\n");
+			NL_SET_ERR_MSG_ATTR(extack, data[IFLA_BOND_ARP_INTERVAL],
+					    "ARP validating cannot be used with MII monitoring");
 			return -EINVAL;
 		}
 
 		bond_opt_initval(&newval, arp_validate);
-		err = __bond_opt_set(bond, BOND_OPT_ARP_VALIDATE, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_ARP_VALIDATE, &newval,
+				     data[IFLA_BOND_ARP_VALIDATE], extack);
 		if (err)
 			return err;
 	}
@@ -321,7 +337,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 			nla_get_u32(data[IFLA_BOND_ARP_ALL_TARGETS]);
 
 		bond_opt_initval(&newval, arp_all_targets);
-		err = __bond_opt_set(bond, BOND_OPT_ARP_ALL_TARGETS, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_ARP_ALL_TARGETS, &newval,
+				     data[IFLA_BOND_ARP_ALL_TARGETS], extack);
 		if (err)
 			return err;
 	}
@@ -335,7 +352,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 			primary = dev->name;
 
 		bond_opt_initstr(&newval, primary);
-		err = __bond_opt_set(bond, BOND_OPT_PRIMARY, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_PRIMARY, &newval,
+				     data[IFLA_BOND_PRIMARY], extack);
 		if (err)
 			return err;
 	}
@@ -344,7 +362,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 			nla_get_u8(data[IFLA_BOND_PRIMARY_RESELECT]);
 
 		bond_opt_initval(&newval, primary_reselect);
-		err = __bond_opt_set(bond, BOND_OPT_PRIMARY_RESELECT, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_PRIMARY_RESELECT, &newval,
+				     data[IFLA_BOND_PRIMARY_RESELECT], extack);
 		if (err)
 			return err;
 	}
@@ -353,7 +372,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 			nla_get_u8(data[IFLA_BOND_FAIL_OVER_MAC]);
 
 		bond_opt_initval(&newval, fail_over_mac);
-		err = __bond_opt_set(bond, BOND_OPT_FAIL_OVER_MAC, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_FAIL_OVER_MAC, &newval,
+				     data[IFLA_BOND_FAIL_OVER_MAC], extack);
 		if (err)
 			return err;
 	}
@@ -362,7 +382,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 			nla_get_u8(data[IFLA_BOND_XMIT_HASH_POLICY]);
 
 		bond_opt_initval(&newval, xmit_hash_policy);
-		err = __bond_opt_set(bond, BOND_OPT_XMIT_HASH, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_XMIT_HASH, &newval,
+				     data[IFLA_BOND_XMIT_HASH_POLICY], extack);
 		if (err)
 			return err;
 	}
@@ -371,7 +392,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 			nla_get_u32(data[IFLA_BOND_RESEND_IGMP]);
 
 		bond_opt_initval(&newval, resend_igmp);
-		err = __bond_opt_set(bond, BOND_OPT_RESEND_IGMP, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_RESEND_IGMP, &newval,
+				     data[IFLA_BOND_RESEND_IGMP], extack);
 		if (err)
 			return err;
 	}
@@ -380,7 +402,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 			nla_get_u8(data[IFLA_BOND_NUM_PEER_NOTIF]);
 
 		bond_opt_initval(&newval, num_peer_notif);
-		err = __bond_opt_set(bond, BOND_OPT_NUM_PEER_NOTIF, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_NUM_PEER_NOTIF, &newval,
+				     data[IFLA_BOND_NUM_PEER_NOTIF], extack);
 		if (err)
 			return err;
 	}
@@ -389,7 +412,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 			nla_get_u8(data[IFLA_BOND_ALL_SLAVES_ACTIVE]);
 
 		bond_opt_initval(&newval, all_slaves_active);
-		err = __bond_opt_set(bond, BOND_OPT_ALL_SLAVES_ACTIVE, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_ALL_SLAVES_ACTIVE, &newval,
+				     data[IFLA_BOND_ALL_SLAVES_ACTIVE], extack);
 		if (err)
 			return err;
 	}
@@ -398,7 +422,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 			nla_get_u32(data[IFLA_BOND_MIN_LINKS]);
 
 		bond_opt_initval(&newval, min_links);
-		err = __bond_opt_set(bond, BOND_OPT_MINLINKS, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_MINLINKS, &newval,
+				     data[IFLA_BOND_MIN_LINKS], extack);
 		if (err)
 			return err;
 	}
@@ -407,7 +432,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 			nla_get_u32(data[IFLA_BOND_LP_INTERVAL]);
 
 		bond_opt_initval(&newval, lp_interval);
-		err = __bond_opt_set(bond, BOND_OPT_LP_INTERVAL, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_LP_INTERVAL, &newval,
+				     data[IFLA_BOND_LP_INTERVAL], extack);
 		if (err)
 			return err;
 	}
@@ -416,7 +442,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 			nla_get_u32(data[IFLA_BOND_PACKETS_PER_SLAVE]);
 
 		bond_opt_initval(&newval, packets_per_slave);
-		err = __bond_opt_set(bond, BOND_OPT_PACKETS_PER_SLAVE, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_PACKETS_PER_SLAVE, &newval,
+				     data[IFLA_BOND_PACKETS_PER_SLAVE], extack);
 		if (err)
 			return err;
 	}
@@ -425,7 +452,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 		int lacp_active = nla_get_u8(data[IFLA_BOND_AD_LACP_ACTIVE]);
 
 		bond_opt_initval(&newval, lacp_active);
-		err = __bond_opt_set(bond, BOND_OPT_LACP_ACTIVE, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_LACP_ACTIVE, &newval,
+				     data[IFLA_BOND_AD_LACP_ACTIVE], extack);
 		if (err)
 			return err;
 	}
@@ -435,7 +463,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 			nla_get_u8(data[IFLA_BOND_AD_LACP_RATE]);
 
 		bond_opt_initval(&newval, lacp_rate);
-		err = __bond_opt_set(bond, BOND_OPT_LACP_RATE, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_LACP_RATE, &newval,
+				     data[IFLA_BOND_AD_LACP_RATE], extack);
 		if (err)
 			return err;
 	}
@@ -444,7 +473,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 			nla_get_u8(data[IFLA_BOND_AD_SELECT]);
 
 		bond_opt_initval(&newval, ad_select);
-		err = __bond_opt_set(bond, BOND_OPT_AD_SELECT, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_AD_SELECT, &newval,
+				     data[IFLA_BOND_AD_SELECT], extack);
 		if (err)
 			return err;
 	}
@@ -453,7 +483,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 			nla_get_u16(data[IFLA_BOND_AD_ACTOR_SYS_PRIO]);
 
 		bond_opt_initval(&newval, actor_sys_prio);
-		err = __bond_opt_set(bond, BOND_OPT_AD_ACTOR_SYS_PRIO, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_AD_ACTOR_SYS_PRIO, &newval,
+				     data[IFLA_BOND_AD_ACTOR_SYS_PRIO], extack);
 		if (err)
 			return err;
 	}
@@ -462,7 +493,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 			nla_get_u16(data[IFLA_BOND_AD_USER_PORT_KEY]);
 
 		bond_opt_initval(&newval, port_key);
-		err = __bond_opt_set(bond, BOND_OPT_AD_USER_PORT_KEY, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_AD_USER_PORT_KEY, &newval,
+				     data[IFLA_BOND_AD_USER_PORT_KEY], extack);
 		if (err)
 			return err;
 	}
@@ -472,7 +504,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 
 		bond_opt_initval(&newval,
 				 nla_get_u64(data[IFLA_BOND_AD_ACTOR_SYSTEM]));
-		err = __bond_opt_set(bond, BOND_OPT_AD_ACTOR_SYSTEM, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_AD_ACTOR_SYSTEM, &newval,
+				     data[IFLA_BOND_AD_ACTOR_SYSTEM], extack);
 		if (err)
 			return err;
 	}
@@ -480,7 +513,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 		int dynamic_lb = nla_get_u8(data[IFLA_BOND_TLB_DYNAMIC_LB]);
 
 		bond_opt_initval(&newval, dynamic_lb);
-		err = __bond_opt_set(bond, BOND_OPT_TLB_DYNAMIC_LB, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_TLB_DYNAMIC_LB, &newval,
+				     data[IFLA_BOND_TLB_DYNAMIC_LB], extack);
 		if (err)
 			return err;
 	}
@@ -489,7 +523,8 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 		int missed_max = nla_get_u8(data[IFLA_BOND_MISSED_MAX]);
 
 		bond_opt_initval(&newval, missed_max);
-		err = __bond_opt_set(bond, BOND_OPT_MISSED_MAX, &newval);
+		err = __bond_opt_set(bond, BOND_OPT_MISSED_MAX, &newval,
+				     data[IFLA_BOND_MISSED_MAX], extack);
 		if (err)
 			return err;
 	}
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index 1f8323ad5282..96eef19cffc4 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -632,27 +632,35 @@ static int bond_opt_check_deps(struct bonding *bond,
 }
 
 static void bond_opt_dep_print(struct bonding *bond,
-			       const struct bond_option *opt)
+			       const struct bond_option *opt,
+			       struct nlattr *bad_attr,
+			       struct netlink_ext_ack *extack)
 {
 	const struct bond_opt_value *modeval;
 	struct bond_params *params;
 
 	params = &bond->params;
 	modeval = bond_opt_get_val(BOND_OPT_MODE, params->mode);
-	if (test_bit(params->mode, &opt->unsuppmodes))
+	if (test_bit(params->mode, &opt->unsuppmodes)) {
 		netdev_err(bond->dev, "option %s: mode dependency failed, not supported in mode %s(%llu)\n",
 			   opt->name, modeval->string, modeval->value);
+		NL_SET_ERR_MSG_ATTR(extack, bad_attr,
+				    "option not supported in mode");
+	}
 }
 
 static void bond_opt_error_interpret(struct bonding *bond,
 				     const struct bond_option *opt,
-				     int error, const struct bond_opt_value *val)
+				     int error, const struct bond_opt_value *val,
+				     struct nlattr *bad_attr,
+				     struct netlink_ext_ack *extack)
 {
 	const struct bond_opt_value *minval, *maxval;
 	char *p;
 
 	switch (error) {
 	case -EINVAL:
+		NL_SET_ERR_MSG_ATTR(extack, bad_attr, "invalid option value");
 		if (val) {
 			if (val->string) {
 				/* sometimes RAWVAL opts may have new lines */
@@ -674,13 +682,17 @@ static void bond_opt_error_interpret(struct bonding *bond,
 			   opt->name, minval ? minval->value : 0, maxval->value);
 		break;
 	case -EACCES:
-		bond_opt_dep_print(bond, opt);
+		bond_opt_dep_print(bond, opt, bad_attr, extack);
 		break;
 	case -ENOTEMPTY:
+		NL_SET_ERR_MSG_ATTR(extack, bad_attr,
+				    "unable to set option because the bond device has slaves");
 		netdev_err(bond->dev, "option %s: unable to set because the bond device has slaves\n",
 			   opt->name);
 		break;
 	case -EBUSY:
+		NL_SET_ERR_MSG_ATTR(extack, bad_attr,
+				    "unable to set option because the bond is up");
 		netdev_err(bond->dev, "option %s: unable to set because the bond device is up\n",
 			   opt->name);
 		break;
@@ -691,6 +703,8 @@ static void bond_opt_error_interpret(struct bonding *bond,
 				*p = '\0';
 			netdev_err(bond->dev, "option %s: interface %s does not exist!\n",
 				   opt->name, val->string);
+			NL_SET_ERR_MSG_ATTR(extack, bad_attr,
+					    "interface does not exist");
 		}
 		break;
 	default:
@@ -703,13 +717,17 @@ static void bond_opt_error_interpret(struct bonding *bond,
  * @bond: target bond device
  * @option: option to set
  * @val: value to set it to
+ * @bad_attr: netlink attribue that caused the error
+ * @extack: extended netlink error structure, used when an error message
+ *          needs to be returned to the caller via netlink
  *
  * This function is used to change the bond's option value, it can be
  * used for both enabling/changing an option and for disabling it. RTNL lock
  * must be obtained before calling this function.
  */
 int __bond_opt_set(struct bonding *bond,
-		   unsigned int option, struct bond_opt_value *val)
+		   unsigned int option, struct bond_opt_value *val,
+		   struct nlattr *bad_attr, struct netlink_ext_ack *extack)
 {
 	const struct bond_opt_value *retval = NULL;
 	const struct bond_option *opt;
@@ -731,7 +749,7 @@ int __bond_opt_set(struct bonding *bond,
 	ret = opt->set(bond, retval);
 out:
 	if (ret)
-		bond_opt_error_interpret(bond, opt, ret, val);
+		bond_opt_error_interpret(bond, opt, ret, val, bad_attr, extack);
 
 	return ret;
 }
@@ -753,7 +771,7 @@ int __bond_opt_set_notify(struct bonding *bond,
 
 	ASSERT_RTNL();
 
-	ret = __bond_opt_set(bond, option, val);
+	ret = __bond_opt_set(bond, option, val, NULL, NULL);
 
 	if (!ret && (bond->dev->reg_state == NETREG_REGISTERED))
 		call_netdevice_notifiers(NETDEV_CHANGEINFODATA, bond->dev);
diff --git a/include/net/bond_options.h b/include/net/bond_options.h
index 61b49063791c..1618b76f4903 100644
--- a/include/net/bond_options.h
+++ b/include/net/bond_options.h
@@ -107,7 +107,8 @@ struct bond_option {
 };
 
 int __bond_opt_set(struct bonding *bond, unsigned int option,
-		   struct bond_opt_value *val);
+		   struct bond_opt_value *val,
+		   struct nlattr *bad_attr, struct netlink_ext_ack *extack);
 int __bond_opt_set_notify(struct bonding *bond, unsigned int option,
 			  struct bond_opt_value *val);
 int bond_opt_tryset_rtnl(struct bonding *bond, unsigned int option, char *buf);
-- 
cgit 


From 21ab562c1f65e583b5d89081acaf096805522482 Mon Sep 17 00:00:00 2001
From: Po Hao Huang <phhuang@realtek.com>
Date: Wed, 8 Jun 2022 19:32:22 +0800
Subject: ieee80211: add trigger frame definition

Define trigger stype of control frame, and its checking function, struct
and trigger type within common_info of trigger.

Signed-off-by: Po Hao Huang <phhuang@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://lore.kernel.org/r/20220608113224.11193-2-pkshih@realtek.com
---
 include/linux/ieee80211.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 75d40acb60c1..5c65ae6b8154 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -76,6 +76,7 @@
 #define IEEE80211_STYPE_ACTION		0x00D0
 
 /* control */
+#define IEEE80211_STYPE_TRIGGER		0x0020
 #define IEEE80211_STYPE_CTL_EXT		0x0060
 #define IEEE80211_STYPE_BACK_REQ	0x0080
 #define IEEE80211_STYPE_BACK		0x0090
@@ -295,6 +296,17 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2)
 
 #define IEEE80211_HT_CTL_LEN		4
 
+/* trigger type within common_info of trigger frame */
+#define IEEE80211_TRIGGER_TYPE_MASK		0xf
+#define IEEE80211_TRIGGER_TYPE_BASIC		0x0
+#define IEEE80211_TRIGGER_TYPE_BFRP		0x1
+#define IEEE80211_TRIGGER_TYPE_MU_BAR		0x2
+#define IEEE80211_TRIGGER_TYPE_MU_RTS		0x3
+#define IEEE80211_TRIGGER_TYPE_BSRP		0x4
+#define IEEE80211_TRIGGER_TYPE_GCR_MU_BAR	0x5
+#define IEEE80211_TRIGGER_TYPE_BQRP		0x6
+#define IEEE80211_TRIGGER_TYPE_NFRP		0x7
+
 struct ieee80211_hdr {
 	__le16 frame_control;
 	__le16 duration_id;
@@ -324,6 +336,15 @@ struct ieee80211_qos_hdr {
 	__le16 qos_ctrl;
 } __packed __aligned(2);
 
+struct ieee80211_trigger {
+	__le16 frame_control;
+	__le16 duration;
+	u8 ra[ETH_ALEN];
+	u8 ta[ETH_ALEN];
+	__le64 common_info;
+	u8 variable[];
+} __packed __aligned(2);
+
 /**
  * ieee80211_has_tods - check if IEEE80211_FCTL_TODS is set
  * @fc: frame control bytes in little-endian byteorder
@@ -729,6 +750,16 @@ static inline bool ieee80211_is_qos_nullfunc(__le16 fc)
 	       cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC);
 }
 
+/**
+ * ieee80211_is_trigger - check if frame is trigger frame
+ * @fc: frame control field in little-endian byteorder
+ */
+static inline bool ieee80211_is_trigger(__le16 fc)
+{
+	return (fc & cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE)) ==
+	       cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_TRIGGER);
+}
+
 /**
  * ieee80211_is_any_nullfunc - check if frame is regular or QoS nullfunc frame
  * @fc: frame control bytes in little-endian byteorder
-- 
cgit 


From f3c923a09c4c4f5861b1ed53cf75673992a6ba68 Mon Sep 17 00:00:00 2001
From: Nathan Huckleberry <nhuck@google.com>
Date: Fri, 20 May 2022 18:14:54 +0000
Subject: crypto: polyval - Add POLYVAL support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for POLYVAL, an ε-Δ-universal hash function similar to
GHASH.  This patch only uses POLYVAL as a component to implement HCTR2
mode.  It should be noted that POLYVAL was originally specified for use
in AES-GCM-SIV (RFC 8452), but the kernel does not currently support
this mode.

POLYVAL is implemented as an shash algorithm.  The implementation is
modified from ghash-generic.c.

For more information on POLYVAL see:
Length-preserving encryption with HCTR2:
  https://eprint.iacr.org/2021/1441.pdf
AES-GCM-SIV: Nonce Misuse-Resistant Authenticated Encryption:
  https://datatracker.ietf.org/doc/html/rfc8452

Signed-off-by: Nathan Huckleberry <nhuck@google.com>
Reviewed-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Kconfig           |   8 ++
 crypto/Makefile          |   1 +
 crypto/polyval-generic.c | 205 +++++++++++++++++++++++++++++++++++++++++++++++
 crypto/tcrypt.c          |   4 +
 crypto/testmgr.c         |   6 ++
 crypto/testmgr.h         | 171 +++++++++++++++++++++++++++++++++++++++
 include/crypto/polyval.h |  17 ++++
 7 files changed, 412 insertions(+)
 create mode 100644 crypto/polyval-generic.c
 create mode 100644 include/crypto/polyval.h

(limited to 'include')

diff --git a/crypto/Kconfig b/crypto/Kconfig
index b9e4d511bf5a..d59e70dca197 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -773,6 +773,14 @@ config CRYPTO_GHASH
 	  GHASH is the hash function used in GCM (Galois/Counter Mode).
 	  It is not a general-purpose cryptographic hash function.
 
+config CRYPTO_POLYVAL
+	tristate
+	select CRYPTO_GF128MUL
+	select CRYPTO_HASH
+	help
+	  POLYVAL is the hash function used in HCTR2.  It is not a general-purpose
+	  cryptographic hash function.
+
 config CRYPTO_POLY1305
 	tristate "Poly1305 authenticator algorithm"
 	select CRYPTO_HASH
diff --git a/crypto/Makefile b/crypto/Makefile
index 93d0afeb3a77..7694ed0a44d5 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -172,6 +172,7 @@ UBSAN_SANITIZE_jitterentropy.o = n
 jitterentropy_rng-y := jitterentropy.o jitterentropy-kcapi.o
 obj-$(CONFIG_CRYPTO_TEST) += tcrypt.o
 obj-$(CONFIG_CRYPTO_GHASH) += ghash-generic.o
+obj-$(CONFIG_CRYPTO_POLYVAL) += polyval-generic.o
 obj-$(CONFIG_CRYPTO_USER_API) += af_alg.o
 obj-$(CONFIG_CRYPTO_USER_API_HASH) += algif_hash.o
 obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o
diff --git a/crypto/polyval-generic.c b/crypto/polyval-generic.c
new file mode 100644
index 000000000000..bf2b03b7bfc0
--- /dev/null
+++ b/crypto/polyval-generic.c
@@ -0,0 +1,205 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * POLYVAL: hash function for HCTR2.
+ *
+ * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi>
+ * Copyright (c) 2009 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ * Copyright 2021 Google LLC
+ */
+
+/*
+ * Code based on crypto/ghash-generic.c
+ *
+ * POLYVAL is a keyed hash function similar to GHASH. POLYVAL uses a different
+ * modulus for finite field multiplication which makes hardware accelerated
+ * implementations on little-endian machines faster. POLYVAL is used in the
+ * kernel to implement HCTR2, but was originally specified for AES-GCM-SIV
+ * (RFC 8452).
+ *
+ * For more information see:
+ * Length-preserving encryption with HCTR2:
+ *   https://eprint.iacr.org/2021/1441.pdf
+ * AES-GCM-SIV: Nonce Misuse-Resistant Authenticated Encryption:
+ *   https://datatracker.ietf.org/doc/html/rfc8452
+ *
+ * Like GHASH, POLYVAL is not a cryptographic hash function and should
+ * not be used outside of crypto modes explicitly designed to use POLYVAL.
+ *
+ * This implementation uses a convenient trick involving the GHASH and POLYVAL
+ * fields. This trick allows multiplication in the POLYVAL field to be
+ * implemented by using multiplication in the GHASH field as a subroutine. An
+ * element of the POLYVAL field can be converted to an element of the GHASH
+ * field by computing x*REVERSE(a), where REVERSE reverses the byte-ordering of
+ * a. Similarly, an element of the GHASH field can be converted back to the
+ * POLYVAL field by computing REVERSE(x^{-1}*a). For more information, see:
+ * https://datatracker.ietf.org/doc/html/rfc8452#appendix-A
+ *
+ * By using this trick, we do not need to implement the POLYVAL field for the
+ * generic implementation.
+ *
+ * Warning: this generic implementation is not intended to be used in practice
+ * and is not constant time. For practical use, a hardware accelerated
+ * implementation of POLYVAL should be used instead.
+ *
+ */
+
+#include <asm/unaligned.h>
+#include <crypto/algapi.h>
+#include <crypto/gf128mul.h>
+#include <crypto/polyval.h>
+#include <crypto/internal/hash.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+struct polyval_tfm_ctx {
+	struct gf128mul_4k *gf128;
+};
+
+struct polyval_desc_ctx {
+	union {
+		u8 buffer[POLYVAL_BLOCK_SIZE];
+		be128 buffer128;
+	};
+	u32 bytes;
+};
+
+static void copy_and_reverse(u8 dst[POLYVAL_BLOCK_SIZE],
+			     const u8 src[POLYVAL_BLOCK_SIZE])
+{
+	u64 a = get_unaligned((const u64 *)&src[0]);
+	u64 b = get_unaligned((const u64 *)&src[8]);
+
+	put_unaligned(swab64(a), (u64 *)&dst[8]);
+	put_unaligned(swab64(b), (u64 *)&dst[0]);
+}
+
+static int polyval_setkey(struct crypto_shash *tfm,
+			  const u8 *key, unsigned int keylen)
+{
+	struct polyval_tfm_ctx *ctx = crypto_shash_ctx(tfm);
+	be128 k;
+
+	if (keylen != POLYVAL_BLOCK_SIZE)
+		return -EINVAL;
+
+	gf128mul_free_4k(ctx->gf128);
+
+	BUILD_BUG_ON(sizeof(k) != POLYVAL_BLOCK_SIZE);
+	copy_and_reverse((u8 *)&k, key);
+	gf128mul_x_lle(&k, &k);
+
+	ctx->gf128 = gf128mul_init_4k_lle(&k);
+	memzero_explicit(&k, POLYVAL_BLOCK_SIZE);
+
+	if (!ctx->gf128)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int polyval_init(struct shash_desc *desc)
+{
+	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
+
+	memset(dctx, 0, sizeof(*dctx));
+
+	return 0;
+}
+
+static int polyval_update(struct shash_desc *desc,
+			 const u8 *src, unsigned int srclen)
+{
+	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
+	const struct polyval_tfm_ctx *ctx = crypto_shash_ctx(desc->tfm);
+	u8 *pos;
+	u8 tmp[POLYVAL_BLOCK_SIZE];
+	int n;
+
+	if (dctx->bytes) {
+		n = min(srclen, dctx->bytes);
+		pos = dctx->buffer + dctx->bytes - 1;
+
+		dctx->bytes -= n;
+		srclen -= n;
+
+		while (n--)
+			*pos-- ^= *src++;
+
+		if (!dctx->bytes)
+			gf128mul_4k_lle(&dctx->buffer128, ctx->gf128);
+	}
+
+	while (srclen >= POLYVAL_BLOCK_SIZE) {
+		copy_and_reverse(tmp, src);
+		crypto_xor(dctx->buffer, tmp, POLYVAL_BLOCK_SIZE);
+		gf128mul_4k_lle(&dctx->buffer128, ctx->gf128);
+		src += POLYVAL_BLOCK_SIZE;
+		srclen -= POLYVAL_BLOCK_SIZE;
+	}
+
+	if (srclen) {
+		dctx->bytes = POLYVAL_BLOCK_SIZE - srclen;
+		pos = dctx->buffer + POLYVAL_BLOCK_SIZE - 1;
+		while (srclen--)
+			*pos-- ^= *src++;
+	}
+
+	return 0;
+}
+
+static int polyval_final(struct shash_desc *desc, u8 *dst)
+{
+	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
+	const struct polyval_tfm_ctx *ctx = crypto_shash_ctx(desc->tfm);
+
+	if (dctx->bytes)
+		gf128mul_4k_lle(&dctx->buffer128, ctx->gf128);
+	copy_and_reverse(dst, dctx->buffer);
+	return 0;
+}
+
+static void polyval_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct polyval_tfm_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	gf128mul_free_4k(ctx->gf128);
+}
+
+static struct shash_alg polyval_alg = {
+	.digestsize	= POLYVAL_DIGEST_SIZE,
+	.init		= polyval_init,
+	.update		= polyval_update,
+	.final		= polyval_final,
+	.setkey		= polyval_setkey,
+	.descsize	= sizeof(struct polyval_desc_ctx),
+	.base		= {
+		.cra_name		= "polyval",
+		.cra_driver_name	= "polyval-generic",
+		.cra_priority		= 100,
+		.cra_blocksize		= POLYVAL_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct polyval_tfm_ctx),
+		.cra_module		= THIS_MODULE,
+		.cra_exit		= polyval_exit_tfm,
+	},
+};
+
+static int __init polyval_mod_init(void)
+{
+	return crypto_register_shash(&polyval_alg);
+}
+
+static void __exit polyval_mod_exit(void)
+{
+	crypto_unregister_shash(&polyval_alg);
+}
+
+subsys_initcall(polyval_mod_init);
+module_exit(polyval_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("POLYVAL hash function");
+MODULE_ALIAS_CRYPTO("polyval");
+MODULE_ALIAS_CRYPTO("polyval-generic");
diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index fd671d0e2012..dd9cf216029b 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -1730,6 +1730,10 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
 		ret += tcrypt_test("ccm(sm4)");
 		break;
 
+	case 57:
+		ret += tcrypt_test("polyval");
+		break;
+
 	case 100:
 		ret += tcrypt_test("hmac(md5)");
 		break;
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 0ea77dcdc6c0..0f40e260b5a9 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -5342,6 +5342,12 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.suite = {
 			.hash = __VECS(poly1305_tv_template)
 		}
+	}, {
+		.alg = "polyval",
+		.test = alg_test_hash,
+		.suite = {
+			.hash = __VECS(polyval_tv_template)
+		}
 	}, {
 		.alg = "rfc3686(ctr(aes))",
 		.test = alg_test_skcipher,
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 7179df0a39b6..b58e13930307 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -34944,4 +34944,175 @@ static const struct cipher_testvec aes_xctr_tv_template[] = {
 
 };
 
+/*
+ * Test vectors generated using https://github.com/google/hctr2
+ *
+ * To ensure compatibility with RFC 8452, some tests were sourced from
+ * https://datatracker.ietf.org/doc/html/rfc8452
+ */
+static const struct hash_testvec polyval_tv_template[] = {
+	{ // From RFC 8452
+		.key	= "\x31\x07\x28\xd9\x91\x1f\x1f\x38"
+			  "\x37\xb2\x43\x16\xc3\xfa\xb9\xa0",
+		.plaintext	= "\x65\x78\x61\x6d\x70\x6c\x65\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x48\x65\x6c\x6c\x6f\x20\x77\x6f"
+			  "\x72\x6c\x64\x00\x00\x00\x00\x00"
+			  "\x38\x00\x00\x00\x00\x00\x00\x00"
+			  "\x58\x00\x00\x00\x00\x00\x00\x00",
+		.digest	= "\xad\x7f\xcf\x0b\x51\x69\x85\x16"
+			  "\x62\x67\x2f\x3c\x5f\x95\x13\x8f",
+		.psize	= 48,
+		.ksize	= 16,
+	},
+	{ // From RFC 8452
+		.key	= "\xd9\xb3\x60\x27\x96\x94\x94\x1a"
+			  "\xc5\xdb\xc6\x98\x7a\xda\x73\x77",
+		.plaintext	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.digest	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.psize	= 16,
+		.ksize	= 16,
+	},
+	{ // From RFC 8452
+		.key	= "\xd9\xb3\x60\x27\x96\x94\x94\x1a"
+			  "\xc5\xdb\xc6\x98\x7a\xda\x73\x77",
+		.plaintext	= "\x01\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x40\x00\x00\x00\x00\x00\x00\x00",
+		.digest	= "\xeb\x93\xb7\x74\x09\x62\xc5\xe4"
+			  "\x9d\x2a\x90\xa7\xdc\x5c\xec\x74",
+		.psize	= 32,
+		.ksize	= 16,
+	},
+	{ // From RFC 8452
+		.key	= "\xd9\xb3\x60\x27\x96\x94\x94\x1a"
+			  "\xc5\xdb\xc6\x98\x7a\xda\x73\x77",
+		.plaintext	= "\x01\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x02\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x03\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x80\x01\x00\x00\x00\x00\x00\x00",
+		.digest	= "\x81\x38\x87\x46\xbc\x22\xd2\x6b"
+			  "\x2a\xbc\x3d\xcb\x15\x75\x42\x22",
+		.psize	= 64,
+		.ksize	= 16,
+	},
+	{ // From RFC 8452
+		.key	= "\xd9\xb3\x60\x27\x96\x94\x94\x1a"
+			  "\xc5\xdb\xc6\x98\x7a\xda\x73\x77",
+		.plaintext	= "\x01\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x02\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x03\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x04\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x02\x00\x00\x00\x00\x00\x00",
+		.digest	= "\x1e\x39\xb6\xd3\x34\x4d\x34\x8f"
+			  "\x60\x44\xf8\x99\x35\xd1\xcf\x78",
+		.psize	= 80,
+		.ksize	= 16,
+	},
+	{ // From RFC 8452
+		.key	= "\xd9\xb3\x60\x27\x96\x94\x94\x1a"
+			  "\xc5\xdb\xc6\x98\x7a\xda\x73\x77",
+		.plaintext	= "\x01\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x02\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x03\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x04\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x05\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x08\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x02\x00\x00\x00\x00\x00\x00",
+		.digest	= "\xff\xcd\x05\xd5\x77\x0f\x34\xad"
+			  "\x92\x67\xf0\xa5\x99\x94\xb1\x5a",
+		.psize	= 96,
+		.ksize	= 16,
+	},
+	{ // Random ( 1)
+		.key	= "\x90\xcc\xac\xee\xba\xd7\xd4\x68"
+			  "\x98\xa6\x79\x70\xdf\x66\x15\x6c",
+		.plaintext	= "",
+		.digest	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.psize	= 0,
+		.ksize	= 16,
+	},
+	{ // Random ( 1)
+		.key	= "\xc1\x45\x71\xf0\x30\x07\x94\xe7"
+			  "\x3a\xdd\xe4\xc6\x19\x2d\x02\xa2",
+		.plaintext	= "\xc1\x5d\x47\xc7\x4c\x7c\x5e\x07"
+			  "\x85\x14\x8f\x79\xcc\x73\x83\xf7"
+			  "\x35\xb8\xcb\x73\x61\xf0\x53\x31"
+			  "\xbf\x84\xde\xb6\xde\xaf\xb0\xb8"
+			  "\xb7\xd9\x11\x91\x89\xfd\x1e\x4c"
+			  "\x84\x4a\x1f\x2a\x87\xa4\xaf\x62"
+			  "\x8d\x7d\x58\xf6\x43\x35\xfc\x53"
+			  "\x8f\x1a\xf6\x12\xe1\x13\x3f\x66"
+			  "\x91\x4b\x13\xd6\x45\xfb\xb0\x7a"
+			  "\xe0\x8b\x8e\x99\xf7\x86\x46\x37"
+			  "\xd1\x22\x9e\x52\xf3\x3f\xd9\x75"
+			  "\x2c\x2c\xc6\xbb\x0e\x08\x14\x29"
+			  "\xe8\x50\x2f\xd8\xbe\xf4\xe9\x69"
+			  "\x4a\xee\xf7\xae\x15\x65\x35\x1e",
+		.digest	= "\x00\x4f\x5d\xe9\x3b\xc0\xd6\x50"
+			  "\x3e\x38\x73\x86\xc6\xda\xca\x7f",
+		.psize	= 112,
+		.ksize	= 16,
+	},
+	{ // Random ( 1)
+		.key	= "\x37\xbe\x68\x16\x50\xb9\x4e\xb0"
+			  "\x47\xde\xe2\xbd\xde\xe4\x48\x09",
+		.plaintext	= "\x87\xfc\x68\x9f\xff\xf2\x4a\x1e"
+			  "\x82\x3b\x73\x8f\xc1\xb2\x1b\x7a"
+			  "\x6c\x4f\x81\xbc\x88\x9b\x6c\xa3"
+			  "\x9c\xc2\xa5\xbc\x14\x70\x4c\x9b"
+			  "\x0c\x9f\x59\x92\x16\x4b\x91\x3d"
+			  "\x18\x55\x22\x68\x12\x8c\x63\xb2"
+			  "\x51\xcb\x85\x4b\xd2\xae\x0b\x1c"
+			  "\x5d\x28\x9d\x1d\xb1\xc8\xf0\x77"
+			  "\xe9\xb5\x07\x4e\x06\xc8\xee\xf8"
+			  "\x1b\xed\x72\x2a\x55\x7d\x16\xc9"
+			  "\xf2\x54\xe7\xe9\xe0\x44\x5b\x33"
+			  "\xb1\x49\xee\xff\x43\xfb\x82\xcd"
+			  "\x4a\x70\x78\x81\xa4\x34\x36\xe8"
+			  "\x4c\x28\x54\xa6\x6c\xc3\x6b\x78"
+			  "\xe7\xc0\x5d\xc6\x5d\x81\xab\x70"
+			  "\x08\x86\xa1\xfd\xf4\x77\x55\xfd"
+			  "\xa3\xe9\xe2\x1b\xdf\x99\xb7\x80"
+			  "\xf9\x0a\x4f\x72\x4a\xd3\xaf\xbb"
+			  "\xb3\x3b\xeb\x08\x58\x0f\x79\xce"
+			  "\xa5\x99\x05\x12\x34\xd4\xf4\x86"
+			  "\x37\x23\x1d\xc8\x49\xc0\x92\xae"
+			  "\xa6\xac\x9b\x31\x55\xed\x15\xc6"
+			  "\x05\x17\x37\x8d\x90\x42\xe4\x87"
+			  "\x89\x62\x88\x69\x1c\x6a\xfd\xe3"
+			  "\x00\x2b\x47\x1a\x73\xc1\x51\xc2"
+			  "\xc0\x62\x74\x6a\x9e\xb2\xe5\x21"
+			  "\xbe\x90\xb5\xb0\x50\xca\x88\x68"
+			  "\xe1\x9d\x7a\xdf\x6c\xb7\xb9\x98"
+			  "\xee\x28\x62\x61\x8b\xd1\x47\xf9"
+			  "\x04\x7a\x0b\x5d\xcd\x2b\x65\xf5"
+			  "\x12\xa3\xfe\x1a\xaa\x2c\x78\x42"
+			  "\xb8\xbe\x7d\x74\xeb\x59\xba\xba",
+		.digest	= "\xae\x11\xd4\x60\x2a\x5f\x9e\x42"
+			  "\x89\x04\xc2\x34\x8d\x55\x94\x0a",
+		.psize	= 256,
+		.ksize	= 16,
+	},
+
+};
+
 #endif	/* _CRYPTO_TESTMGR_H */
diff --git a/include/crypto/polyval.h b/include/crypto/polyval.h
new file mode 100644
index 000000000000..b14c38aa9166
--- /dev/null
+++ b/include/crypto/polyval.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common values for the Polyval hash algorithm
+ *
+ * Copyright 2021 Google LLC
+ */
+
+#ifndef _CRYPTO_POLYVAL_H
+#define _CRYPTO_POLYVAL_H
+
+#include <linux/types.h>
+#include <linux/crypto.h>
+
+#define POLYVAL_BLOCK_SIZE	16
+#define POLYVAL_DIGEST_SIZE	16
+
+#endif
-- 
cgit 


From 34f7f6c3011276313383099156be287ac745bcea Mon Sep 17 00:00:00 2001
From: Nathan Huckleberry <nhuck@google.com>
Date: Fri, 20 May 2022 18:14:59 +0000
Subject: crypto: x86/polyval - Add PCLMULQDQ accelerated implementation of
 POLYVAL

Add hardware accelerated version of POLYVAL for x86-64 CPUs with
PCLMULQDQ support.

This implementation is accelerated using PCLMULQDQ instructions to
perform the finite field computations.  For added efficiency, 8 blocks
of the message are processed simultaneously by precomputing the first
8 powers of the key.

Schoolbook multiplication is used instead of Karatsuba multiplication
because it was found to be slightly faster on x86-64 machines.
Montgomery reduction must be used instead of Barrett reduction due to
the difference in modulus between POLYVAL's field and other finite
fields.

More information on POLYVAL can be found in the HCTR2 paper:
"Length-preserving encryption with HCTR2":
https://eprint.iacr.org/2021/1441.pdf

Signed-off-by: Nathan Huckleberry <nhuck@google.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile               |   3 +
 arch/x86/crypto/polyval-clmulni_asm.S  | 321 +++++++++++++++++++++++++++++++++
 arch/x86/crypto/polyval-clmulni_glue.c | 203 +++++++++++++++++++++
 crypto/Kconfig                         |   9 +
 crypto/polyval-generic.c               |  40 ++++
 include/crypto/polyval.h               |   5 +
 6 files changed, 581 insertions(+)
 create mode 100644 arch/x86/crypto/polyval-clmulni_asm.S
 create mode 100644 arch/x86/crypto/polyval-clmulni_glue.c

(limited to 'include')

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 2831685adf6f..b9847152acd8 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -69,6 +69,9 @@ libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 
+obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o
+polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o
+
 obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
 crc32c-intel-y := crc32c-intel_glue.o
 crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
diff --git a/arch/x86/crypto/polyval-clmulni_asm.S b/arch/x86/crypto/polyval-clmulni_asm.S
new file mode 100644
index 000000000000..a6ebe4e7dd2b
--- /dev/null
+++ b/arch/x86/crypto/polyval-clmulni_asm.S
@@ -0,0 +1,321 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2021 Google LLC
+ */
+/*
+ * This is an efficient implementation of POLYVAL using intel PCLMULQDQ-NI
+ * instructions. It works on 8 blocks at a time, by precomputing the first 8
+ * keys powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation
+ * allows us to split finite field multiplication into two steps.
+ *
+ * In the first step, we consider h^i, m_i as normal polynomials of degree less
+ * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
+ * is simply polynomial multiplication.
+ *
+ * In the second step, we compute the reduction of p(x) modulo the finite field
+ * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
+ * multiplication is finite field multiplication. The advantage is that the
+ * two-step process  only requires 1 finite field reduction for every 8
+ * polynomial multiplications. Further parallelism is gained by interleaving the
+ * multiplications and polynomial reductions.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define STRIDE_BLOCKS 8
+
+#define GSTAR %xmm7
+#define PL %xmm8
+#define PH %xmm9
+#define TMP_XMM %xmm11
+#define LO %xmm12
+#define HI %xmm13
+#define MI %xmm14
+#define SUM %xmm15
+
+#define KEY_POWERS %rdi
+#define MSG %rsi
+#define BLOCKS_LEFT %rdx
+#define ACCUMULATOR %rcx
+#define TMP %rax
+
+.section    .rodata.cst16.gstar, "aM", @progbits, 16
+.align 16
+
+.Lgstar:
+	.quad 0xc200000000000000, 0xc200000000000000
+
+.text
+
+/*
+ * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length
+ * count pointed to by MSG and KEY_POWERS.
+ */
+.macro schoolbook1 count
+	.set i, 0
+	.rept (\count)
+		schoolbook1_iteration i 0
+		.set i, (i +1)
+	.endr
+.endm
+
+/*
+ * Computes the product of two 128-bit polynomials at the memory locations
+ * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of
+ * the 256-bit product into LO, MI, HI.
+ *
+ * Given:
+ *   X = [X_1 : X_0]
+ *   Y = [Y_1 : Y_0]
+ *
+ * We compute:
+ *   LO += X_0 * Y_0
+ *   MI += X_0 * Y_1 + X_1 * Y_0
+ *   HI += X_1 * Y_1
+ *
+ * Later, the 256-bit result can be extracted as:
+ *   [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
+ * This step is done when computing the polynomial reduction for efficiency
+ * reasons.
+ *
+ * If xor_sum == 1, then also XOR the value of SUM into m_0.  This avoids an
+ * extra multiplication of SUM and h^8.
+ */
+.macro schoolbook1_iteration i xor_sum
+	movups (16*\i)(MSG), %xmm0
+	.if (\i == 0 && \xor_sum == 1)
+		pxor SUM, %xmm0
+	.endif
+	vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2
+	vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1
+	vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3
+	vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4
+	vpxor %xmm2, MI, MI
+	vpxor %xmm1, LO, LO
+	vpxor %xmm4, HI, HI
+	vpxor %xmm3, MI, MI
+.endm
+
+/*
+ * Performs the same computation as schoolbook1_iteration, except we expect the
+ * arguments to already be loaded into xmm0 and xmm1 and we set the result
+ * registers LO, MI, and HI directly rather than XOR'ing into them.
+ */
+.macro schoolbook1_noload
+	vpclmulqdq $0x01, %xmm0, %xmm1, MI
+	vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2
+	vpclmulqdq $0x00, %xmm0, %xmm1, LO
+	vpclmulqdq $0x11, %xmm0, %xmm1, HI
+	vpxor %xmm2, MI, MI
+.endm
+
+/*
+ * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
+ * the result in PL, PH.
+ *   [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
+ */
+.macro schoolbook2
+	vpslldq $8, MI, PL
+	vpsrldq $8, MI, PH
+	pxor LO, PL
+	pxor HI, PH
+.endm
+
+/*
+ * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
+ *
+ * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
+ * x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
+ * product of two 128-bit polynomials in Montgomery form.  We need to reduce it
+ * mod g(x).  Also, since polynomials in Montgomery form have an "extra" factor
+ * of x^128, this product has two extra factors of x^128.  To get it back into
+ * Montgomery form, we need to remove one of these factors by dividing by x^128.
+ *
+ * To accomplish both of these goals, we add multiples of g(x) that cancel out
+ * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
+ * bits are zero, the polynomial division by x^128 can be done by right shifting.
+ *
+ * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
+ * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x).  The CPU can
+ * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
+ * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x).  Adding this to
+ * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
+ * = T_1 : T_0 = g*(x) * P_0.  Thus, bits 0-63 got "folded" into bits 64-191.
+ *
+ * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
+ * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
+ * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
+ * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
+ * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
+ *
+ * So our final computation is:
+ *   T = T_1 : T_0 = g*(x) * P_0
+ *   V = V_1 : V_0 = g*(x) * (P_1 + T_0)
+ *   p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
+ *
+ * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
+ * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
+ * T_1 into dest.  This allows us to reuse P_1 + T_0 when computing V.
+ */
+.macro montgomery_reduction dest
+	vpclmulqdq $0x00, PL, GSTAR, TMP_XMM	# TMP_XMM = T_1 : T_0 = P_0 * g*(x)
+	pshufd $0b01001110, TMP_XMM, TMP_XMM	# TMP_XMM = T_0 : T_1
+	pxor PL, TMP_XMM			# TMP_XMM = P_1 + T_0 : P_0 + T_1
+	pxor TMP_XMM, PH			# PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
+	pclmulqdq $0x11, GSTAR, TMP_XMM		# TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)]
+	vpxor TMP_XMM, PH, \dest
+.endm
+
+/*
+ * Compute schoolbook multiplication for 8 blocks
+ * m_0h^8 + ... + m_7h^1
+ *
+ * If reduce is set, also computes the montgomery reduction of the
+ * previous full_stride call and XORs with the first message block.
+ * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
+ * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
+ */
+.macro full_stride reduce
+	pxor LO, LO
+	pxor HI, HI
+	pxor MI, MI
+
+	schoolbook1_iteration 7 0
+	.if \reduce
+		vpclmulqdq $0x00, PL, GSTAR, TMP_XMM
+	.endif
+
+	schoolbook1_iteration 6 0
+	.if \reduce
+		pshufd $0b01001110, TMP_XMM, TMP_XMM
+	.endif
+
+	schoolbook1_iteration 5 0
+	.if \reduce
+		pxor PL, TMP_XMM
+	.endif
+
+	schoolbook1_iteration 4 0
+	.if \reduce
+		pxor TMP_XMM, PH
+	.endif
+
+	schoolbook1_iteration 3 0
+	.if \reduce
+		pclmulqdq $0x11, GSTAR, TMP_XMM
+	.endif
+
+	schoolbook1_iteration 2 0
+	.if \reduce
+		vpxor TMP_XMM, PH, SUM
+	.endif
+
+	schoolbook1_iteration 1 0
+
+	schoolbook1_iteration 0 1
+
+	addq $(8*16), MSG
+	schoolbook2
+.endm
+
+/*
+ * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS
+ */
+.macro partial_stride
+	mov BLOCKS_LEFT, TMP
+	shlq $4, TMP
+	addq $(16*STRIDE_BLOCKS), KEY_POWERS
+	subq TMP, KEY_POWERS
+
+	movups (MSG), %xmm0
+	pxor SUM, %xmm0
+	movaps (KEY_POWERS), %xmm1
+	schoolbook1_noload
+	dec BLOCKS_LEFT
+	addq $16, MSG
+	addq $16, KEY_POWERS
+
+	test $4, BLOCKS_LEFT
+	jz .Lpartial4BlocksDone
+	schoolbook1 4
+	addq $(4*16), MSG
+	addq $(4*16), KEY_POWERS
+.Lpartial4BlocksDone:
+	test $2, BLOCKS_LEFT
+	jz .Lpartial2BlocksDone
+	schoolbook1 2
+	addq $(2*16), MSG
+	addq $(2*16), KEY_POWERS
+.Lpartial2BlocksDone:
+	test $1, BLOCKS_LEFT
+	jz .LpartialDone
+	schoolbook1 1
+.LpartialDone:
+	schoolbook2
+	montgomery_reduction SUM
+.endm
+
+/*
+ * Perform montgomery multiplication in GF(2^128) and store result in op1.
+ *
+ * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
+ * If op1, op2 are in montgomery form, this computes the montgomery
+ * form of op1*op2.
+ *
+ * void clmul_polyval_mul(u8 *op1, const u8 *op2);
+ */
+SYM_FUNC_START(clmul_polyval_mul)
+	FRAME_BEGIN
+	vmovdqa .Lgstar(%rip), GSTAR
+	movups (%rdi), %xmm0
+	movups (%rsi), %xmm1
+	schoolbook1_noload
+	schoolbook2
+	montgomery_reduction SUM
+	movups SUM, (%rdi)
+	FRAME_END
+	RET
+SYM_FUNC_END(clmul_polyval_mul)
+
+/*
+ * Perform polynomial evaluation as specified by POLYVAL.  This computes:
+ *	h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
+ * where n=nblocks, h is the hash key, and m_i are the message blocks.
+ *
+ * rdi - pointer to precomputed key powers h^8 ... h^1
+ * rsi - pointer to message blocks
+ * rdx - number of blocks to hash
+ * rcx - pointer to the accumulator
+ *
+ * void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
+ *	const u8 *in, size_t nblocks, u8 *accumulator);
+ */
+SYM_FUNC_START(clmul_polyval_update)
+	FRAME_BEGIN
+	vmovdqa .Lgstar(%rip), GSTAR
+	movups (ACCUMULATOR), SUM
+	subq $STRIDE_BLOCKS, BLOCKS_LEFT
+	js .LstrideLoopExit
+	full_stride 0
+	subq $STRIDE_BLOCKS, BLOCKS_LEFT
+	js .LstrideLoopExitReduce
+.LstrideLoop:
+	full_stride 1
+	subq $STRIDE_BLOCKS, BLOCKS_LEFT
+	jns .LstrideLoop
+.LstrideLoopExitReduce:
+	montgomery_reduction SUM
+.LstrideLoopExit:
+	add $STRIDE_BLOCKS, BLOCKS_LEFT
+	jz .LskipPartial
+	partial_stride
+.LskipPartial:
+	movups SUM, (ACCUMULATOR)
+	FRAME_END
+	RET
+SYM_FUNC_END(clmul_polyval_update)
diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c
new file mode 100644
index 000000000000..b7664d018851
--- /dev/null
+++ b/arch/x86/crypto/polyval-clmulni_glue.c
@@ -0,0 +1,203 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Glue code for POLYVAL using PCMULQDQ-NI
+ *
+ * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi>
+ * Copyright (c) 2009 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ * Copyright 2021 Google LLC
+ */
+
+/*
+ * Glue code based on ghash-clmulni-intel_glue.c.
+ *
+ * This implementation of POLYVAL uses montgomery multiplication
+ * accelerated by PCLMULQDQ-NI to implement the finite field
+ * operations.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/polyval.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/cpu_device_id.h>
+#include <asm/simd.h>
+
+#define NUM_KEY_POWERS	8
+
+struct polyval_tfm_ctx {
+	/*
+	 * These powers must be in the order h^8, ..., h^1.
+	 */
+	u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE];
+};
+
+struct polyval_desc_ctx {
+	u8 buffer[POLYVAL_BLOCK_SIZE];
+	u32 bytes;
+};
+
+asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
+	const u8 *in, size_t nblocks, u8 *accumulator);
+asmlinkage void clmul_polyval_mul(u8 *op1, const u8 *op2);
+
+static void internal_polyval_update(const struct polyval_tfm_ctx *keys,
+	const u8 *in, size_t nblocks, u8 *accumulator)
+{
+	if (likely(crypto_simd_usable())) {
+		kernel_fpu_begin();
+		clmul_polyval_update(keys, in, nblocks, accumulator);
+		kernel_fpu_end();
+	} else {
+		polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in,
+			nblocks, accumulator);
+	}
+}
+
+static void internal_polyval_mul(u8 *op1, const u8 *op2)
+{
+	if (likely(crypto_simd_usable())) {
+		kernel_fpu_begin();
+		clmul_polyval_mul(op1, op2);
+		kernel_fpu_end();
+	} else {
+		polyval_mul_non4k(op1, op2);
+	}
+}
+
+static int polyval_x86_setkey(struct crypto_shash *tfm,
+			const u8 *key, unsigned int keylen)
+{
+	struct polyval_tfm_ctx *tctx = crypto_shash_ctx(tfm);
+	int i;
+
+	if (keylen != POLYVAL_BLOCK_SIZE)
+		return -EINVAL;
+
+	memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE);
+
+	for (i = NUM_KEY_POWERS-2; i >= 0; i--) {
+		memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE);
+		internal_polyval_mul(tctx->key_powers[i],
+				     tctx->key_powers[i+1]);
+	}
+
+	return 0;
+}
+
+static int polyval_x86_init(struct shash_desc *desc)
+{
+	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
+
+	memset(dctx, 0, sizeof(*dctx));
+
+	return 0;
+}
+
+static int polyval_x86_update(struct shash_desc *desc,
+			 const u8 *src, unsigned int srclen)
+{
+	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
+	const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+	u8 *pos;
+	unsigned int nblocks;
+	unsigned int n;
+
+	if (dctx->bytes) {
+		n = min(srclen, dctx->bytes);
+		pos = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes;
+
+		dctx->bytes -= n;
+		srclen -= n;
+
+		while (n--)
+			*pos++ ^= *src++;
+
+		if (!dctx->bytes)
+			internal_polyval_mul(dctx->buffer,
+					    tctx->key_powers[NUM_KEY_POWERS-1]);
+	}
+
+	while (srclen >= POLYVAL_BLOCK_SIZE) {
+		/* Allow rescheduling every 4K bytes. */
+		nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;
+		internal_polyval_update(tctx, src, nblocks, dctx->buffer);
+		srclen -= nblocks * POLYVAL_BLOCK_SIZE;
+		src += nblocks * POLYVAL_BLOCK_SIZE;
+	}
+
+	if (srclen) {
+		dctx->bytes = POLYVAL_BLOCK_SIZE - srclen;
+		pos = dctx->buffer;
+		while (srclen--)
+			*pos++ ^= *src++;
+	}
+
+	return 0;
+}
+
+static int polyval_x86_final(struct shash_desc *desc, u8 *dst)
+{
+	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
+	const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+
+	if (dctx->bytes) {
+		internal_polyval_mul(dctx->buffer,
+				     tctx->key_powers[NUM_KEY_POWERS-1]);
+	}
+
+	memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE);
+
+	return 0;
+}
+
+static struct shash_alg polyval_alg = {
+	.digestsize	= POLYVAL_DIGEST_SIZE,
+	.init		= polyval_x86_init,
+	.update		= polyval_x86_update,
+	.final		= polyval_x86_final,
+	.setkey		= polyval_x86_setkey,
+	.descsize	= sizeof(struct polyval_desc_ctx),
+	.base		= {
+		.cra_name		= "polyval",
+		.cra_driver_name	= "polyval-clmulni",
+		.cra_priority		= 200,
+		.cra_blocksize		= POLYVAL_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct polyval_tfm_ctx),
+		.cra_module		= THIS_MODULE,
+	},
+};
+
+__maybe_unused static const struct x86_cpu_id pcmul_cpu_id[] = {
+	X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL),
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id);
+
+static int __init polyval_clmulni_mod_init(void)
+{
+	if (!x86_match_cpu(pcmul_cpu_id))
+		return -ENODEV;
+
+	if (!boot_cpu_has(X86_FEATURE_AVX))
+		return -ENODEV;
+
+	return crypto_register_shash(&polyval_alg);
+}
+
+static void __exit polyval_clmulni_mod_exit(void)
+{
+	crypto_unregister_shash(&polyval_alg);
+}
+
+module_init(polyval_clmulni_mod_init);
+module_exit(polyval_clmulni_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("POLYVAL hash function accelerated by PCLMULQDQ-NI");
+MODULE_ALIAS_CRYPTO("polyval");
+MODULE_ALIAS_CRYPTO("polyval-clmulni");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index dfcc3235e918..9b654984de79 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -792,6 +792,15 @@ config CRYPTO_POLYVAL
 	  POLYVAL is the hash function used in HCTR2.  It is not a general-purpose
 	  cryptographic hash function.
 
+config CRYPTO_POLYVAL_CLMUL_NI
+	tristate "POLYVAL hash function (CLMUL-NI accelerated)"
+	depends on X86 && 64BIT
+	select CRYPTO_POLYVAL
+	help
+	  This is the x86_64 CLMUL-NI accelerated implementation of POLYVAL. It is
+	  used to efficiently implement HCTR2 on x86-64 processors that support
+	  carry-less multiplication instructions.
+
 config CRYPTO_POLY1305
 	tristate "Poly1305 authenticator algorithm"
 	select CRYPTO_HASH
diff --git a/crypto/polyval-generic.c b/crypto/polyval-generic.c
index bf2b03b7bfc0..16bfa6925b31 100644
--- a/crypto/polyval-generic.c
+++ b/crypto/polyval-generic.c
@@ -76,6 +76,46 @@ static void copy_and_reverse(u8 dst[POLYVAL_BLOCK_SIZE],
 	put_unaligned(swab64(b), (u64 *)&dst[0]);
 }
 
+/*
+ * Performs multiplication in the POLYVAL field using the GHASH field as a
+ * subroutine.  This function is used as a fallback for hardware accelerated
+ * implementations when simd registers are unavailable.
+ *
+ * Note: This function is not used for polyval-generic, instead we use the 4k
+ * lookup table implementation for finite field multiplication.
+ */
+void polyval_mul_non4k(u8 *op1, const u8 *op2)
+{
+	be128 a, b;
+
+	// Assume one argument is in Montgomery form and one is not.
+	copy_and_reverse((u8 *)&a, op1);
+	copy_and_reverse((u8 *)&b, op2);
+	gf128mul_x_lle(&a, &a);
+	gf128mul_lle(&a, &b);
+	copy_and_reverse(op1, (u8 *)&a);
+}
+EXPORT_SYMBOL_GPL(polyval_mul_non4k);
+
+/*
+ * Perform a POLYVAL update using non4k multiplication.  This function is used
+ * as a fallback for hardware accelerated implementations when simd registers
+ * are unavailable.
+ *
+ * Note: This function is not used for polyval-generic, instead we use the 4k
+ * lookup table implementation of finite field multiplication.
+ */
+void polyval_update_non4k(const u8 *key, const u8 *in,
+			  size_t nblocks, u8 *accumulator)
+{
+	while (nblocks--) {
+		crypto_xor(accumulator, in, POLYVAL_BLOCK_SIZE);
+		polyval_mul_non4k(accumulator, key);
+		in += POLYVAL_BLOCK_SIZE;
+	}
+}
+EXPORT_SYMBOL_GPL(polyval_update_non4k);
+
 static int polyval_setkey(struct crypto_shash *tfm,
 			  const u8 *key, unsigned int keylen)
 {
diff --git a/include/crypto/polyval.h b/include/crypto/polyval.h
index b14c38aa9166..1d630f371f77 100644
--- a/include/crypto/polyval.h
+++ b/include/crypto/polyval.h
@@ -14,4 +14,9 @@
 #define POLYVAL_BLOCK_SIZE	16
 #define POLYVAL_DIGEST_SIZE	16
 
+void polyval_mul_non4k(u8 *op1, const u8 *op2);
+
+void polyval_update_non4k(const u8 *key, const u8 *in,
+			  size_t nblocks, u8 *accumulator);
+
 #endif
-- 
cgit 


From 6b2a51ff03bf0c54cbc699ee85a9a49eb203ebfc Mon Sep 17 00:00:00 2001
From: Nathan Huckleberry <nhuck@google.com>
Date: Fri, 20 May 2022 18:15:01 +0000
Subject: fscrypt: Add HCTR2 support for filename encryption

HCTR2 is a tweakable, length-preserving encryption mode that is intended
for use on CPUs with dedicated crypto instructions.  HCTR2 has the
property that a bitflip in the plaintext changes the entire ciphertext.
This property fixes a known weakness with filename encryption: when two
filenames in the same directory share a prefix of >= 16 bytes, with
AES-CTS-CBC their encrypted filenames share a common substring, leaking
information.  HCTR2 does not have this problem.

More information on HCTR2 can be found here: "Length-preserving
encryption with HCTR2": https://eprint.iacr.org/2021/1441.pdf

Signed-off-by: Nathan Huckleberry <nhuck@google.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 Documentation/filesystems/fscrypt.rst | 22 +++++++++++++++++-----
 fs/crypto/fscrypt_private.h           |  2 +-
 fs/crypto/keysetup.c                  |  7 +++++++
 fs/crypto/policy.c                    | 14 +++++++++++---
 include/uapi/linux/fscrypt.h          |  3 ++-
 5 files changed, 38 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst
index 2e9aaa295125..5ba5817c17c2 100644
--- a/Documentation/filesystems/fscrypt.rst
+++ b/Documentation/filesystems/fscrypt.rst
@@ -337,6 +337,7 @@ Currently, the following pairs of encryption modes are supported:
 - AES-256-XTS for contents and AES-256-CTS-CBC for filenames
 - AES-128-CBC for contents and AES-128-CTS-CBC for filenames
 - Adiantum for both contents and filenames
+- AES-256-XTS for contents and AES-256-HCTR2 for filenames (v2 policies only)
 
 If unsure, you should use the (AES-256-XTS, AES-256-CTS-CBC) pair.
 
@@ -357,6 +358,17 @@ To use Adiantum, CONFIG_CRYPTO_ADIANTUM must be enabled.  Also, fast
 implementations of ChaCha and NHPoly1305 should be enabled, e.g.
 CONFIG_CRYPTO_CHACHA20_NEON and CONFIG_CRYPTO_NHPOLY1305_NEON for ARM.
 
+AES-256-HCTR2 is another true wide-block encryption mode that is intended for
+use on CPUs with dedicated crypto instructions.  AES-256-HCTR2 has the property
+that a bitflip in the plaintext changes the entire ciphertext.  This property
+makes it desirable for filename encryption since initialization vectors are
+reused within a directory.  For more details on AES-256-HCTR2, see the paper
+"Length-preserving encryption with HCTR2"
+(https://eprint.iacr.org/2021/1441.pdf).  To use AES-256-HCTR2,
+CONFIG_CRYPTO_HCTR2 must be enabled.  Also, fast implementations of XCTR and
+POLYVAL should be enabled, e.g. CRYPTO_POLYVAL_ARM64_CE and
+CRYPTO_AES_ARM64_CE_BLK for ARM64.
+
 New encryption modes can be added relatively easily, without changes
 to individual filesystems.  However, authenticated encryption (AE)
 modes are not currently supported because of the difficulty of dealing
@@ -404,11 +416,11 @@ alternatively has the file's nonce (for `DIRECT_KEY policies`_) or
 inode number (for `IV_INO_LBLK_64 policies`_) included in the IVs.
 Thus, IV reuse is limited to within a single directory.
 
-With CTS-CBC, the IV reuse means that when the plaintext filenames
-share a common prefix at least as long as the cipher block size (16
-bytes for AES), the corresponding encrypted filenames will also share
-a common prefix.  This is undesirable.  Adiantum does not have this
-weakness, as it is a wide-block encryption mode.
+With CTS-CBC, the IV reuse means that when the plaintext filenames share a
+common prefix at least as long as the cipher block size (16 bytes for AES), the
+corresponding encrypted filenames will also share a common prefix.  This is
+undesirable.  Adiantum and HCTR2 do not have this weakness, as they are
+wide-block encryption modes.
 
 All supported filenames encryption modes accept any plaintext length
 >= 16 bytes; cipher block alignment is not required.  However,
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 6b4c8094cc7b..f5be777d8279 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -31,7 +31,7 @@
 #define FSCRYPT_CONTEXT_V2	2
 
 /* Keep this in sync with include/uapi/linux/fscrypt.h */
-#define FSCRYPT_MODE_MAX	FSCRYPT_MODE_ADIANTUM
+#define FSCRYPT_MODE_MAX	FSCRYPT_MODE_AES_256_HCTR2
 
 struct fscrypt_context_v1 {
 	u8 version; /* FSCRYPT_CONTEXT_V1 */
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index c35711896bd4..fbc71abdabe3 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -53,6 +53,13 @@ struct fscrypt_mode fscrypt_modes[] = {
 		.ivsize = 32,
 		.blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM,
 	},
+	[FSCRYPT_MODE_AES_256_HCTR2] = {
+		.friendly_name = "AES-256-HCTR2",
+		.cipher_str = "hctr2(aes)",
+		.keysize = 32,
+		.security_strength = 32,
+		.ivsize = 32,
+	},
 };
 
 static DEFINE_MUTEX(fscrypt_mode_key_setup_mutex);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 5f858cee1e3b..8a054e6d1e68 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -61,7 +61,7 @@ fscrypt_get_dummy_policy(struct super_block *sb)
 	return sb->s_cop->get_dummy_policy(sb);
 }
 
-static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode)
+static bool fscrypt_valid_enc_modes_v1(u32 contents_mode, u32 filenames_mode)
 {
 	if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
 	    filenames_mode == FSCRYPT_MODE_AES_256_CTS)
@@ -78,6 +78,14 @@ static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode)
 	return false;
 }
 
+static bool fscrypt_valid_enc_modes_v2(u32 contents_mode, u32 filenames_mode)
+{
+	if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
+	    filenames_mode == FSCRYPT_MODE_AES_256_HCTR2)
+		return true;
+	return fscrypt_valid_enc_modes_v1(contents_mode, filenames_mode);
+}
+
 static bool supported_direct_key_modes(const struct inode *inode,
 				       u32 contents_mode, u32 filenames_mode)
 {
@@ -151,7 +159,7 @@ static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy,
 static bool fscrypt_supported_v1_policy(const struct fscrypt_policy_v1 *policy,
 					const struct inode *inode)
 {
-	if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
+	if (!fscrypt_valid_enc_modes_v1(policy->contents_encryption_mode,
 				     policy->filenames_encryption_mode)) {
 		fscrypt_warn(inode,
 			     "Unsupported encryption modes (contents %d, filenames %d)",
@@ -187,7 +195,7 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy,
 {
 	int count = 0;
 
-	if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
+	if (!fscrypt_valid_enc_modes_v2(policy->contents_encryption_mode,
 				     policy->filenames_encryption_mode)) {
 		fscrypt_warn(inode,
 			     "Unsupported encryption modes (contents %d, filenames %d)",
diff --git a/include/uapi/linux/fscrypt.h b/include/uapi/linux/fscrypt.h
index 9f4428be3e36..a756b29afcc2 100644
--- a/include/uapi/linux/fscrypt.h
+++ b/include/uapi/linux/fscrypt.h
@@ -27,7 +27,8 @@
 #define FSCRYPT_MODE_AES_128_CBC		5
 #define FSCRYPT_MODE_AES_128_CTS		6
 #define FSCRYPT_MODE_ADIANTUM			9
-/* If adding a mode number > 9, update FSCRYPT_MODE_MAX in fscrypt_private.h */
+#define FSCRYPT_MODE_AES_256_HCTR2		10
+/* If adding a mode number > 10, update FSCRYPT_MODE_MAX in fscrypt_private.h */
 
 /*
  * Legacy policy version; ad-hoc KDF and no key verification.
-- 
cgit 


From 2d16803c562ecc644803d42ba98a8e0aef9c014e Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Sat, 28 May 2022 21:44:07 +0200
Subject: crypto: blake2s - remove shash module

BLAKE2s has no currently known use as an shash. Just remove all of this
unnecessary plumbing. Removing this shash was something we talked about
back when we were making BLAKE2s a built-in, but I simply never got
around to doing it. So this completes that project.

Importantly, this fixs a bug in which the lib code depends on
crypto_simd_disabled_for_test, causing linker errors.

Also add more alignment tests to the selftests and compare SIMD and
non-SIMD compression functions, to make up for what we lose from
testmgr.c.

Reported-by: gaochao <gaochao49@huawei.com>
Cc: Eric Biggers <ebiggers@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: stable@vger.kernel.org
Fixes: 6048fdcc5f26 ("lib/crypto: blake2s: include as built-in")
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/arm/crypto/Kconfig           |   2 +-
 arch/arm/crypto/Makefile          |   4 +-
 arch/arm/crypto/blake2s-shash.c   |  75 -------------
 arch/x86/crypto/Makefile          |   4 +-
 arch/x86/crypto/blake2s-glue.c    |   3 +-
 arch/x86/crypto/blake2s-shash.c   |  77 --------------
 crypto/Kconfig                    |  20 +---
 crypto/Makefile                   |   1 -
 crypto/blake2s_generic.c          |  75 -------------
 crypto/tcrypt.c                   |  12 ---
 crypto/testmgr.c                  |  24 -----
 crypto/testmgr.h                  | 217 --------------------------------------
 include/crypto/internal/blake2s.h | 108 -------------------
 lib/crypto/blake2s-selftest.c     |  41 +++++++
 lib/crypto/blake2s.c              |  37 +++++--
 15 files changed, 76 insertions(+), 624 deletions(-)
 delete mode 100644 arch/arm/crypto/blake2s-shash.c
 delete mode 100644 arch/x86/crypto/blake2s-shash.c
 delete mode 100644 crypto/blake2s_generic.c

(limited to 'include')

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index e4dba5461cb3..149a5bd6b88c 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -63,7 +63,7 @@ config CRYPTO_SHA512_ARM
 	  using optimized ARM assembler and NEON, when available.
 
 config CRYPTO_BLAKE2S_ARM
-	tristate "BLAKE2s digest algorithm (ARM)"
+	bool "BLAKE2s digest algorithm (ARM)"
 	select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
 	help
 	  BLAKE2s digest algorithm optimized with ARM scalar instructions.  This
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 0274f81cc8ea..971e74546fb1 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -9,8 +9,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
-obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += blake2s-arm.o
-obj-$(if $(CONFIG_CRYPTO_BLAKE2S_ARM),y) += libblake2s-arm.o
+obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o
 obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o
 obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
 obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
@@ -32,7 +31,6 @@ sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
 sha256-arm-y	:= sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
 sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o
 sha512-arm-y	:= sha512-core.o sha512-glue.o $(sha512-arm-neon-y)
-blake2s-arm-y   := blake2s-shash.o
 libblake2s-arm-y:= blake2s-core.o blake2s-glue.o
 blake2b-neon-y  := blake2b-neon-core.o blake2b-neon-glue.o
 sha1-arm-ce-y	:= sha1-ce-core.o sha1-ce-glue.o
diff --git a/arch/arm/crypto/blake2s-shash.c b/arch/arm/crypto/blake2s-shash.c
deleted file mode 100644
index 763c73beea2d..000000000000
--- a/arch/arm/crypto/blake2s-shash.c
+++ /dev/null
@@ -1,75 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * BLAKE2s digest algorithm, ARM scalar implementation
- *
- * Copyright 2020 Google LLC
- */
-
-#include <crypto/internal/blake2s.h>
-#include <crypto/internal/hash.h>
-
-#include <linux/module.h>
-
-static int crypto_blake2s_update_arm(struct shash_desc *desc,
-				     const u8 *in, unsigned int inlen)
-{
-	return crypto_blake2s_update(desc, in, inlen, false);
-}
-
-static int crypto_blake2s_final_arm(struct shash_desc *desc, u8 *out)
-{
-	return crypto_blake2s_final(desc, out, false);
-}
-
-#define BLAKE2S_ALG(name, driver_name, digest_size)			\
-	{								\
-		.base.cra_name		= name,				\
-		.base.cra_driver_name	= driver_name,			\
-		.base.cra_priority	= 200,				\
-		.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY,	\
-		.base.cra_blocksize	= BLAKE2S_BLOCK_SIZE,		\
-		.base.cra_ctxsize	= sizeof(struct blake2s_tfm_ctx), \
-		.base.cra_module	= THIS_MODULE,			\
-		.digestsize		= digest_size,			\
-		.setkey			= crypto_blake2s_setkey,	\
-		.init			= crypto_blake2s_init,		\
-		.update			= crypto_blake2s_update_arm,	\
-		.final			= crypto_blake2s_final_arm,	\
-		.descsize		= sizeof(struct blake2s_state),	\
-	}
-
-static struct shash_alg blake2s_arm_algs[] = {
-	BLAKE2S_ALG("blake2s-128", "blake2s-128-arm", BLAKE2S_128_HASH_SIZE),
-	BLAKE2S_ALG("blake2s-160", "blake2s-160-arm", BLAKE2S_160_HASH_SIZE),
-	BLAKE2S_ALG("blake2s-224", "blake2s-224-arm", BLAKE2S_224_HASH_SIZE),
-	BLAKE2S_ALG("blake2s-256", "blake2s-256-arm", BLAKE2S_256_HASH_SIZE),
-};
-
-static int __init blake2s_arm_mod_init(void)
-{
-	return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
-		crypto_register_shashes(blake2s_arm_algs,
-					ARRAY_SIZE(blake2s_arm_algs)) : 0;
-}
-
-static void __exit blake2s_arm_mod_exit(void)
-{
-	if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
-		crypto_unregister_shashes(blake2s_arm_algs,
-					  ARRAY_SIZE(blake2s_arm_algs));
-}
-
-module_init(blake2s_arm_mod_init);
-module_exit(blake2s_arm_mod_exit);
-
-MODULE_DESCRIPTION("BLAKE2s digest algorithm, ARM scalar implementation");
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
-MODULE_ALIAS_CRYPTO("blake2s-128");
-MODULE_ALIAS_CRYPTO("blake2s-128-arm");
-MODULE_ALIAS_CRYPTO("blake2s-160");
-MODULE_ALIAS_CRYPTO("blake2s-160-arm");
-MODULE_ALIAS_CRYPTO("blake2s-224");
-MODULE_ALIAS_CRYPTO("blake2s-224-arm");
-MODULE_ALIAS_CRYPTO("blake2s-256");
-MODULE_ALIAS_CRYPTO("blake2s-256-arm");
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index b9847152acd8..04d07ab744b2 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -61,9 +61,7 @@ sha256-ssse3-$(CONFIG_AS_SHA256_NI) += sha256_ni_asm.o
 obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
 sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
 
-obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
-blake2s-x86_64-y := blake2s-shash.o
-obj-$(if $(CONFIG_CRYPTO_BLAKE2S_X86),y) += libblake2s-x86_64.o
+obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o
 libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o
 
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
index 69853c13e8fb..aaba21230528 100644
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -4,7 +4,6 @@
  */
 
 #include <crypto/internal/blake2s.h>
-#include <crypto/internal/simd.h>
 
 #include <linux/types.h>
 #include <linux/jump_label.h>
@@ -33,7 +32,7 @@ void blake2s_compress(struct blake2s_state *state, const u8 *block,
 	/* SIMD disables preemption, so relax after processing each page. */
 	BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
 
-	if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) {
+	if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
 		blake2s_compress_generic(state, block, nblocks, inc);
 		return;
 	}
diff --git a/arch/x86/crypto/blake2s-shash.c b/arch/x86/crypto/blake2s-shash.c
deleted file mode 100644
index 59ae28abe35c..000000000000
--- a/arch/x86/crypto/blake2s-shash.c
+++ /dev/null
@@ -1,77 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <crypto/internal/blake2s.h>
-#include <crypto/internal/simd.h>
-#include <crypto/internal/hash.h>
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/sizes.h>
-
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-
-static int crypto_blake2s_update_x86(struct shash_desc *desc,
-				     const u8 *in, unsigned int inlen)
-{
-	return crypto_blake2s_update(desc, in, inlen, false);
-}
-
-static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out)
-{
-	return crypto_blake2s_final(desc, out, false);
-}
-
-#define BLAKE2S_ALG(name, driver_name, digest_size)			\
-	{								\
-		.base.cra_name		= name,				\
-		.base.cra_driver_name	= driver_name,			\
-		.base.cra_priority	= 200,				\
-		.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY,	\
-		.base.cra_blocksize	= BLAKE2S_BLOCK_SIZE,		\
-		.base.cra_ctxsize	= sizeof(struct blake2s_tfm_ctx), \
-		.base.cra_module	= THIS_MODULE,			\
-		.digestsize		= digest_size,			\
-		.setkey			= crypto_blake2s_setkey,	\
-		.init			= crypto_blake2s_init,		\
-		.update			= crypto_blake2s_update_x86,	\
-		.final			= crypto_blake2s_final_x86,	\
-		.descsize		= sizeof(struct blake2s_state),	\
-	}
-
-static struct shash_alg blake2s_algs[] = {
-	BLAKE2S_ALG("blake2s-128", "blake2s-128-x86", BLAKE2S_128_HASH_SIZE),
-	BLAKE2S_ALG("blake2s-160", "blake2s-160-x86", BLAKE2S_160_HASH_SIZE),
-	BLAKE2S_ALG("blake2s-224", "blake2s-224-x86", BLAKE2S_224_HASH_SIZE),
-	BLAKE2S_ALG("blake2s-256", "blake2s-256-x86", BLAKE2S_256_HASH_SIZE),
-};
-
-static int __init blake2s_mod_init(void)
-{
-	if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
-		return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
-	return 0;
-}
-
-static void __exit blake2s_mod_exit(void)
-{
-	if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
-		crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
-}
-
-module_init(blake2s_mod_init);
-module_exit(blake2s_mod_exit);
-
-MODULE_ALIAS_CRYPTO("blake2s-128");
-MODULE_ALIAS_CRYPTO("blake2s-128-x86");
-MODULE_ALIAS_CRYPTO("blake2s-160");
-MODULE_ALIAS_CRYPTO("blake2s-160-x86");
-MODULE_ALIAS_CRYPTO("blake2s-224");
-MODULE_ALIAS_CRYPTO("blake2s-224-x86");
-MODULE_ALIAS_CRYPTO("blake2s-256");
-MODULE_ALIAS_CRYPTO("blake2s-256-x86");
-MODULE_LICENSE("GPL v2");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 6e30e8138057..59489a300cd1 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -712,26 +712,8 @@ config CRYPTO_BLAKE2B
 
 	  See https://blake2.net for further information.
 
-config CRYPTO_BLAKE2S
-	tristate "BLAKE2s digest algorithm"
-	select CRYPTO_LIB_BLAKE2S_GENERIC
-	select CRYPTO_HASH
-	help
-	  Implementation of cryptographic hash function BLAKE2s
-	  optimized for 8-32bit platforms and can produce digests of any size
-	  between 1 to 32.  The keyed hash is also implemented.
-
-	  This module provides the following algorithms:
-
-	  - blake2s-128
-	  - blake2s-160
-	  - blake2s-224
-	  - blake2s-256
-
-	  See https://blake2.net for further information.
-
 config CRYPTO_BLAKE2S_X86
-	tristate "BLAKE2s digest algorithm (x86 accelerated version)"
+	bool "BLAKE2s digest algorithm (x86 accelerated version)"
 	depends on X86 && 64BIT
 	select CRYPTO_LIB_BLAKE2S_GENERIC
 	select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
diff --git a/crypto/Makefile b/crypto/Makefile
index 1f529704fe80..a4a84860fe43 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -84,7 +84,6 @@ obj-$(CONFIG_CRYPTO_STREEBOG) += streebog_generic.o
 obj-$(CONFIG_CRYPTO_WP512) += wp512.o
 CFLAGS_wp512.o := $(call cc-option,-fno-schedule-insns)  # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149
 obj-$(CONFIG_CRYPTO_BLAKE2B) += blake2b_generic.o
-obj-$(CONFIG_CRYPTO_BLAKE2S) += blake2s_generic.o
 obj-$(CONFIG_CRYPTO_GF128MUL) += gf128mul.o
 obj-$(CONFIG_CRYPTO_ECB) += ecb.o
 obj-$(CONFIG_CRYPTO_CBC) += cbc.o
diff --git a/crypto/blake2s_generic.c b/crypto/blake2s_generic.c
deleted file mode 100644
index 5f96a21f8788..000000000000
--- a/crypto/blake2s_generic.c
+++ /dev/null
@@ -1,75 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * shash interface to the generic implementation of BLAKE2s
- *
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <crypto/internal/blake2s.h>
-#include <crypto/internal/hash.h>
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-static int crypto_blake2s_update_generic(struct shash_desc *desc,
-					 const u8 *in, unsigned int inlen)
-{
-	return crypto_blake2s_update(desc, in, inlen, true);
-}
-
-static int crypto_blake2s_final_generic(struct shash_desc *desc, u8 *out)
-{
-	return crypto_blake2s_final(desc, out, true);
-}
-
-#define BLAKE2S_ALG(name, driver_name, digest_size)			\
-	{								\
-		.base.cra_name		= name,				\
-		.base.cra_driver_name	= driver_name,			\
-		.base.cra_priority	= 100,				\
-		.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY,	\
-		.base.cra_blocksize	= BLAKE2S_BLOCK_SIZE,		\
-		.base.cra_ctxsize	= sizeof(struct blake2s_tfm_ctx), \
-		.base.cra_module	= THIS_MODULE,			\
-		.digestsize		= digest_size,			\
-		.setkey			= crypto_blake2s_setkey,	\
-		.init			= crypto_blake2s_init,		\
-		.update			= crypto_blake2s_update_generic, \
-		.final			= crypto_blake2s_final_generic,	\
-		.descsize		= sizeof(struct blake2s_state),	\
-	}
-
-static struct shash_alg blake2s_algs[] = {
-	BLAKE2S_ALG("blake2s-128", "blake2s-128-generic",
-		    BLAKE2S_128_HASH_SIZE),
-	BLAKE2S_ALG("blake2s-160", "blake2s-160-generic",
-		    BLAKE2S_160_HASH_SIZE),
-	BLAKE2S_ALG("blake2s-224", "blake2s-224-generic",
-		    BLAKE2S_224_HASH_SIZE),
-	BLAKE2S_ALG("blake2s-256", "blake2s-256-generic",
-		    BLAKE2S_256_HASH_SIZE),
-};
-
-static int __init blake2s_mod_init(void)
-{
-	return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
-}
-
-static void __exit blake2s_mod_exit(void)
-{
-	crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
-}
-
-subsys_initcall(blake2s_mod_init);
-module_exit(blake2s_mod_exit);
-
-MODULE_ALIAS_CRYPTO("blake2s-128");
-MODULE_ALIAS_CRYPTO("blake2s-128-generic");
-MODULE_ALIAS_CRYPTO("blake2s-160");
-MODULE_ALIAS_CRYPTO("blake2s-160-generic");
-MODULE_ALIAS_CRYPTO("blake2s-224");
-MODULE_ALIAS_CRYPTO("blake2s-224-generic");
-MODULE_ALIAS_CRYPTO("blake2s-256");
-MODULE_ALIAS_CRYPTO("blake2s-256-generic");
-MODULE_LICENSE("GPL v2");
diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 336598da8eac..a8831060c4ce 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -1670,10 +1670,6 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
 		ret += tcrypt_test("rmd160");
 		break;
 
-	case 41:
-		ret += tcrypt_test("blake2s-256");
-		break;
-
 	case 42:
 		ret += tcrypt_test("blake2b-512");
 		break;
@@ -2250,10 +2246,6 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
 		test_hash_speed("rmd160", sec, generic_hash_speed_template);
 		if (mode > 300 && mode < 400) break;
 		fallthrough;
-	case 316:
-		test_hash_speed("blake2s-256", sec, generic_hash_speed_template);
-		if (mode > 300 && mode < 400) break;
-		fallthrough;
 	case 317:
 		test_hash_speed("blake2b-512", sec, generic_hash_speed_template);
 		if (mode > 300 && mode < 400) break;
@@ -2362,10 +2354,6 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
 		test_ahash_speed("rmd160", sec, generic_hash_speed_template);
 		if (mode > 400 && mode < 500) break;
 		fallthrough;
-	case 416:
-		test_ahash_speed("blake2s-256", sec, generic_hash_speed_template);
-		if (mode > 400 && mode < 500) break;
-		fallthrough;
 	case 417:
 		test_ahash_speed("blake2b-512", sec, generic_hash_speed_template);
 		if (mode > 400 && mode < 500) break;
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 485097461068..7a8a56749960 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -4375,30 +4375,6 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.suite = {
 			.hash = __VECS(blake2b_512_tv_template)
 		}
-	}, {
-		.alg = "blake2s-128",
-		.test = alg_test_hash,
-		.suite = {
-			.hash = __VECS(blakes2s_128_tv_template)
-		}
-	}, {
-		.alg = "blake2s-160",
-		.test = alg_test_hash,
-		.suite = {
-			.hash = __VECS(blakes2s_160_tv_template)
-		}
-	}, {
-		.alg = "blake2s-224",
-		.test = alg_test_hash,
-		.suite = {
-			.hash = __VECS(blakes2s_224_tv_template)
-		}
-	}, {
-		.alg = "blake2s-256",
-		.test = alg_test_hash,
-		.suite = {
-			.hash = __VECS(blakes2s_256_tv_template)
-		}
 	}, {
 		.alg = "cbc(aes)",
 		.test = alg_test_skcipher,
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 808ba07baa04..4f3955ea40bf 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -34034,223 +34034,6 @@ static const struct hash_testvec blake2b_512_tv_template[] = {{
 			  0xae, 0x15, 0x81, 0x15, 0xd0, 0x88, 0xa0, 0x3c, },
 }};
 
-static const struct hash_testvec blakes2s_128_tv_template[] = {{
-	.digest = (u8[]){ 0x64, 0x55, 0x0d, 0x6f, 0xfe, 0x2c, 0x0a, 0x01,
-			  0xa1, 0x4a, 0xba, 0x1e, 0xad, 0xe0, 0x20, 0x0c, },
-}, {
-	.plaintext = blake2_ordered_sequence,
-	.psize = 64,
-	.digest = (u8[]){ 0xdc, 0x66, 0xca, 0x8f, 0x03, 0x86, 0x58, 0x01,
-			  0xb0, 0xff, 0xe0, 0x6e, 0xd8, 0xa1, 0xa9, 0x0e, },
-}, {
-	.ksize = 16,
-	.key = blake2_ordered_sequence,
-	.plaintext = blake2_ordered_sequence,
-	.psize = 1,
-	.digest = (u8[]){ 0x88, 0x1e, 0x42, 0xe7, 0xbb, 0x35, 0x80, 0x82,
-			  0x63, 0x7c, 0x0a, 0x0f, 0xd7, 0xec, 0x6c, 0x2f, },
-}, {
-	.ksize = 32,
-	.key = blake2_ordered_sequence,
-	.plaintext = blake2_ordered_sequence,
-	.psize = 7,
-	.digest = (u8[]){ 0xcf, 0x9e, 0x07, 0x2a, 0xd5, 0x22, 0xf2, 0xcd,
-			  0xa2, 0xd8, 0x25, 0x21, 0x80, 0x86, 0x73, 0x1c, },
-}, {
-	.ksize = 1,
-	.key = "B",
-	.plaintext = blake2_ordered_sequence,
-	.psize = 15,
-	.digest = (u8[]){ 0xf6, 0x33, 0x5a, 0x2c, 0x22, 0xa0, 0x64, 0xb2,
-			  0xb6, 0x3f, 0xeb, 0xbc, 0xd1, 0xc3, 0xe5, 0xb2, },
-}, {
-	.ksize = 16,
-	.key = blake2_ordered_sequence,
-	.plaintext = blake2_ordered_sequence,
-	.psize = 247,
-	.digest = (u8[]){ 0x72, 0x66, 0x49, 0x60, 0xf9, 0x4a, 0xea, 0xbe,
-			  0x1f, 0xf4, 0x60, 0xce, 0xb7, 0x81, 0xcb, 0x09, },
-}, {
-	.ksize = 32,
-	.key = blake2_ordered_sequence,
-	.plaintext = blake2_ordered_sequence,
-	.psize = 256,
-	.digest = (u8[]){ 0xd5, 0xa4, 0x0e, 0xc3, 0x16, 0xc7, 0x51, 0xa6,
-			  0x3c, 0xd0, 0xd9, 0x11, 0x57, 0xfa, 0x1e, 0xbb, },
-}};
-
-static const struct hash_testvec blakes2s_160_tv_template[] = {{
-	.plaintext = blake2_ordered_sequence,
-	.psize = 7,
-	.digest = (u8[]){ 0xb4, 0xf2, 0x03, 0x49, 0x37, 0xed, 0xb1, 0x3e,
-			  0x5b, 0x2a, 0xca, 0x64, 0x82, 0x74, 0xf6, 0x62,
-			  0xe3, 0xf2, 0x84, 0xff, },
-}, {
-	.plaintext = blake2_ordered_sequence,
-	.psize = 256,
-	.digest = (u8[]){ 0xaa, 0x56, 0x9b, 0xdc, 0x98, 0x17, 0x75, 0xf2,
-			  0xb3, 0x68, 0x83, 0xb7, 0x9b, 0x8d, 0x48, 0xb1,
-			  0x9b, 0x2d, 0x35, 0x05, },
-}, {
-	.ksize = 1,
-	.key = "B",
-	.digest = (u8[]){ 0x50, 0x16, 0xe7, 0x0c, 0x01, 0xd0, 0xd3, 0xc3,
-			  0xf4, 0x3e, 0xb1, 0x6e, 0x97, 0xa9, 0x4e, 0xd1,
-			  0x79, 0x65, 0x32, 0x93, },
-}, {
-	.ksize = 32,
-	.key = blake2_ordered_sequence,
-	.plaintext = blake2_ordered_sequence,
-	.psize = 1,
-	.digest = (u8[]){ 0x1c, 0x2b, 0xcd, 0x9a, 0x68, 0xca, 0x8c, 0x71,
-			  0x90, 0x29, 0x6c, 0x54, 0xfa, 0x56, 0x4a, 0xef,
-			  0xa2, 0x3a, 0x56, 0x9c, },
-}, {
-	.ksize = 16,
-	.key = blake2_ordered_sequence,
-	.plaintext = blake2_ordered_sequence,
-	.psize = 15,
-	.digest = (u8[]){ 0x36, 0xc3, 0x5f, 0x9a, 0xdc, 0x7e, 0xbf, 0x19,
-			  0x68, 0xaa, 0xca, 0xd8, 0x81, 0xbf, 0x09, 0x34,
-			  0x83, 0x39, 0x0f, 0x30, },
-}, {
-	.ksize = 1,
-	.key = "B",
-	.plaintext = blake2_ordered_sequence,
-	.psize = 64,
-	.digest = (u8[]){ 0x86, 0x80, 0x78, 0xa4, 0x14, 0xec, 0x03, 0xe5,
-			  0xb6, 0x9a, 0x52, 0x0e, 0x42, 0xee, 0x39, 0x9d,
-			  0xac, 0xa6, 0x81, 0x63, },
-}, {
-	.ksize = 32,
-	.key = blake2_ordered_sequence,
-	.plaintext = blake2_ordered_sequence,
-	.psize = 247,
-	.digest = (u8[]){ 0x2d, 0xd8, 0xd2, 0x53, 0x66, 0xfa, 0xa9, 0x01,
-			  0x1c, 0x9c, 0xaf, 0xa3, 0xe2, 0x9d, 0x9b, 0x10,
-			  0x0a, 0xf6, 0x73, 0xe8, },
-}};
-
-static const struct hash_testvec blakes2s_224_tv_template[] = {{
-	.plaintext = blake2_ordered_sequence,
-	.psize = 1,
-	.digest = (u8[]){ 0x61, 0xb9, 0x4e, 0xc9, 0x46, 0x22, 0xa3, 0x91,
-			  0xd2, 0xae, 0x42, 0xe6, 0x45, 0x6c, 0x90, 0x12,
-			  0xd5, 0x80, 0x07, 0x97, 0xb8, 0x86, 0x5a, 0xfc,
-			  0x48, 0x21, 0x97, 0xbb, },
-}, {
-	.plaintext = blake2_ordered_sequence,
-	.psize = 247,
-	.digest = (u8[]){ 0x9e, 0xda, 0xc7, 0x20, 0x2c, 0xd8, 0x48, 0x2e,
-			  0x31, 0x94, 0xab, 0x46, 0x6d, 0x94, 0xd8, 0xb4,
-			  0x69, 0xcd, 0xae, 0x19, 0x6d, 0x9e, 0x41, 0xcc,
-			  0x2b, 0xa4, 0xd5, 0xf6, },
-}, {
-	.ksize = 16,
-	.key = blake2_ordered_sequence,
-	.digest = (u8[]){ 0x32, 0xc0, 0xac, 0xf4, 0x3b, 0xd3, 0x07, 0x9f,
-			  0xbe, 0xfb, 0xfa, 0x4d, 0x6b, 0x4e, 0x56, 0xb3,
-			  0xaa, 0xd3, 0x27, 0xf6, 0x14, 0xbf, 0xb9, 0x32,
-			  0xa7, 0x19, 0xfc, 0xb8, },
-}, {
-	.ksize = 1,
-	.key = "B",
-	.plaintext = blake2_ordered_sequence,
-	.psize = 7,
-	.digest = (u8[]){ 0x73, 0xad, 0x5e, 0x6d, 0xb9, 0x02, 0x8e, 0x76,
-			  0xf2, 0x66, 0x42, 0x4b, 0x4c, 0xfa, 0x1f, 0xe6,
-			  0x2e, 0x56, 0x40, 0xe5, 0xa2, 0xb0, 0x3c, 0xe8,
-			  0x7b, 0x45, 0xfe, 0x05, },
-}, {
-	.ksize = 32,
-	.key = blake2_ordered_sequence,
-	.plaintext = blake2_ordered_sequence,
-	.psize = 15,
-	.digest = (u8[]){ 0x16, 0x60, 0xfb, 0x92, 0x54, 0xb3, 0x6e, 0x36,
-			  0x81, 0xf4, 0x16, 0x41, 0xc3, 0x3d, 0xd3, 0x43,
-			  0x84, 0xed, 0x10, 0x6f, 0x65, 0x80, 0x7a, 0x3e,
-			  0x25, 0xab, 0xc5, 0x02, },
-}, {
-	.ksize = 16,
-	.key = blake2_ordered_sequence,
-	.plaintext = blake2_ordered_sequence,
-	.psize = 64,
-	.digest = (u8[]){ 0xca, 0xaa, 0x39, 0x67, 0x9c, 0xf7, 0x6b, 0xc7,
-			  0xb6, 0x82, 0xca, 0x0e, 0x65, 0x36, 0x5b, 0x7c,
-			  0x24, 0x00, 0xfa, 0x5f, 0xda, 0x06, 0x91, 0x93,
-			  0x6a, 0x31, 0x83, 0xb5, },
-}, {
-	.ksize = 1,
-	.key = "B",
-	.plaintext = blake2_ordered_sequence,
-	.psize = 256,
-	.digest = (u8[]){ 0x90, 0x02, 0x26, 0xb5, 0x06, 0x9c, 0x36, 0x86,
-			  0x94, 0x91, 0x90, 0x1e, 0x7d, 0x2a, 0x71, 0xb2,
-			  0x48, 0xb5, 0xe8, 0x16, 0xfd, 0x64, 0x33, 0x45,
-			  0xb3, 0xd7, 0xec, 0xcc, },
-}};
-
-static const struct hash_testvec blakes2s_256_tv_template[] = {{
-	.plaintext = blake2_ordered_sequence,
-	.psize = 15,
-	.digest = (u8[]){ 0xd9, 0x7c, 0x82, 0x8d, 0x81, 0x82, 0xa7, 0x21,
-			  0x80, 0xa0, 0x6a, 0x78, 0x26, 0x83, 0x30, 0x67,
-			  0x3f, 0x7c, 0x4e, 0x06, 0x35, 0x94, 0x7c, 0x04,
-			  0xc0, 0x23, 0x23, 0xfd, 0x45, 0xc0, 0xa5, 0x2d, },
-}, {
-	.ksize = 32,
-	.key = blake2_ordered_sequence,
-	.digest = (u8[]){ 0x48, 0xa8, 0x99, 0x7d, 0xa4, 0x07, 0x87, 0x6b,
-			  0x3d, 0x79, 0xc0, 0xd9, 0x23, 0x25, 0xad, 0x3b,
-			  0x89, 0xcb, 0xb7, 0x54, 0xd8, 0x6a, 0xb7, 0x1a,
-			  0xee, 0x04, 0x7a, 0xd3, 0x45, 0xfd, 0x2c, 0x49, },
-}, {
-	.ksize = 1,
-	.key = "B",
-	.plaintext = blake2_ordered_sequence,
-	.psize = 1,
-	.digest = (u8[]){ 0x22, 0x27, 0xae, 0xaa, 0x6e, 0x81, 0x56, 0x03,
-			  0xa7, 0xe3, 0xa1, 0x18, 0xa5, 0x9a, 0x2c, 0x18,
-			  0xf4, 0x63, 0xbc, 0x16, 0x70, 0xf1, 0xe7, 0x4b,
-			  0x00, 0x6d, 0x66, 0x16, 0xae, 0x9e, 0x74, 0x4e, },
-}, {
-	.ksize = 16,
-	.key = blake2_ordered_sequence,
-	.plaintext = blake2_ordered_sequence,
-	.psize = 7,
-	.digest = (u8[]){ 0x58, 0x5d, 0xa8, 0x60, 0x1c, 0xa4, 0xd8, 0x03,
-			  0x86, 0x86, 0x84, 0x64, 0xd7, 0xa0, 0x8e, 0x15,
-			  0x2f, 0x05, 0xa2, 0x1b, 0xbc, 0xef, 0x7a, 0x34,
-			  0xb3, 0xc5, 0xbc, 0x4b, 0xf0, 0x32, 0xeb, 0x12, },
-}, {
-	.ksize = 32,
-	.key = blake2_ordered_sequence,
-	.plaintext = blake2_ordered_sequence,
-	.psize = 64,
-	.digest = (u8[]){ 0x89, 0x75, 0xb0, 0x57, 0x7f, 0xd3, 0x55, 0x66,
-			  0xd7, 0x50, 0xb3, 0x62, 0xb0, 0x89, 0x7a, 0x26,
-			  0xc3, 0x99, 0x13, 0x6d, 0xf0, 0x7b, 0xab, 0xab,
-			  0xbd, 0xe6, 0x20, 0x3f, 0xf2, 0x95, 0x4e, 0xd4, },
-}, {
-	.ksize = 1,
-	.key = "B",
-	.plaintext = blake2_ordered_sequence,
-	.psize = 247,
-	.digest = (u8[]){ 0x2e, 0x74, 0x1c, 0x1d, 0x03, 0xf4, 0x9d, 0x84,
-			  0x6f, 0xfc, 0x86, 0x32, 0x92, 0x49, 0x7e, 0x66,
-			  0xd7, 0xc3, 0x10, 0x88, 0xfe, 0x28, 0xb3, 0xe0,
-			  0xbf, 0x50, 0x75, 0xad, 0x8e, 0xa4, 0xe6, 0xb2, },
-}, {
-	.ksize = 16,
-	.key = blake2_ordered_sequence,
-	.plaintext = blake2_ordered_sequence,
-	.psize = 256,
-	.digest = (u8[]){ 0xb9, 0xd2, 0x81, 0x0e, 0x3a, 0xb1, 0x62, 0x9b,
-			  0xad, 0x44, 0x05, 0xf4, 0x92, 0x2e, 0x99, 0xc1,
-			  0x4a, 0x47, 0xbb, 0x5b, 0x6f, 0xb2, 0x96, 0xed,
-			  0xd5, 0x06, 0xb5, 0x3a, 0x7c, 0x7a, 0x65, 0x1d, },
-}};
-
 /*
  * Test vectors generated using https://github.com/google/hctr2
  */
diff --git a/include/crypto/internal/blake2s.h b/include/crypto/internal/blake2s.h
index 52363eee2b20..506d56530ca9 100644
--- a/include/crypto/internal/blake2s.h
+++ b/include/crypto/internal/blake2s.h
@@ -8,7 +8,6 @@
 #define _CRYPTO_INTERNAL_BLAKE2S_H
 
 #include <crypto/blake2s.h>
-#include <crypto/internal/hash.h>
 #include <linux/string.h>
 
 void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
@@ -19,111 +18,4 @@ void blake2s_compress(struct blake2s_state *state, const u8 *block,
 
 bool blake2s_selftest(void);
 
-static inline void blake2s_set_lastblock(struct blake2s_state *state)
-{
-	state->f[0] = -1;
-}
-
-/* Helper functions for BLAKE2s shared by the library and shash APIs */
-
-static __always_inline void
-__blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen,
-		 bool force_generic)
-{
-	const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
-
-	if (unlikely(!inlen))
-		return;
-	if (inlen > fill) {
-		memcpy(state->buf + state->buflen, in, fill);
-		if (force_generic)
-			blake2s_compress_generic(state, state->buf, 1,
-						 BLAKE2S_BLOCK_SIZE);
-		else
-			blake2s_compress(state, state->buf, 1,
-					 BLAKE2S_BLOCK_SIZE);
-		state->buflen = 0;
-		in += fill;
-		inlen -= fill;
-	}
-	if (inlen > BLAKE2S_BLOCK_SIZE) {
-		const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
-		/* Hash one less (full) block than strictly possible */
-		if (force_generic)
-			blake2s_compress_generic(state, in, nblocks - 1,
-						 BLAKE2S_BLOCK_SIZE);
-		else
-			blake2s_compress(state, in, nblocks - 1,
-					 BLAKE2S_BLOCK_SIZE);
-		in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
-		inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
-	}
-	memcpy(state->buf + state->buflen, in, inlen);
-	state->buflen += inlen;
-}
-
-static __always_inline void
-__blake2s_final(struct blake2s_state *state, u8 *out, bool force_generic)
-{
-	blake2s_set_lastblock(state);
-	memset(state->buf + state->buflen, 0,
-	       BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
-	if (force_generic)
-		blake2s_compress_generic(state, state->buf, 1, state->buflen);
-	else
-		blake2s_compress(state, state->buf, 1, state->buflen);
-	cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
-	memcpy(out, state->h, state->outlen);
-}
-
-/* Helper functions for shash implementations of BLAKE2s */
-
-struct blake2s_tfm_ctx {
-	u8 key[BLAKE2S_KEY_SIZE];
-	unsigned int keylen;
-};
-
-static inline int crypto_blake2s_setkey(struct crypto_shash *tfm,
-					const u8 *key, unsigned int keylen)
-{
-	struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
-
-	if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE)
-		return -EINVAL;
-
-	memcpy(tctx->key, key, keylen);
-	tctx->keylen = keylen;
-
-	return 0;
-}
-
-static inline int crypto_blake2s_init(struct shash_desc *desc)
-{
-	const struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
-	struct blake2s_state *state = shash_desc_ctx(desc);
-	unsigned int outlen = crypto_shash_digestsize(desc->tfm);
-
-	__blake2s_init(state, outlen, tctx->key, tctx->keylen);
-	return 0;
-}
-
-static inline int crypto_blake2s_update(struct shash_desc *desc,
-					const u8 *in, unsigned int inlen,
-					bool force_generic)
-{
-	struct blake2s_state *state = shash_desc_ctx(desc);
-
-	__blake2s_update(state, in, inlen, force_generic);
-	return 0;
-}
-
-static inline int crypto_blake2s_final(struct shash_desc *desc, u8 *out,
-				       bool force_generic)
-{
-	struct blake2s_state *state = shash_desc_ctx(desc);
-
-	__blake2s_final(state, out, force_generic);
-	return 0;
-}
-
 #endif /* _CRYPTO_INTERNAL_BLAKE2S_H */
diff --git a/lib/crypto/blake2s-selftest.c b/lib/crypto/blake2s-selftest.c
index 409e4b728770..66f505220f43 100644
--- a/lib/crypto/blake2s-selftest.c
+++ b/lib/crypto/blake2s-selftest.c
@@ -4,6 +4,8 @@
  */
 
 #include <crypto/internal/blake2s.h>
+#include <linux/kernel.h>
+#include <linux/random.h>
 #include <linux/string.h>
 
 /*
@@ -587,5 +589,44 @@ bool __init blake2s_selftest(void)
 		}
 	}
 
+	for (i = 0; i < 32; ++i) {
+		enum { TEST_ALIGNMENT = 16 };
+		u8 unaligned_block[BLAKE2S_BLOCK_SIZE + TEST_ALIGNMENT - 1]
+					__aligned(TEST_ALIGNMENT);
+		u8 blocks[BLAKE2S_BLOCK_SIZE * 3];
+		struct blake2s_state state1, state2;
+
+		get_random_bytes(blocks, sizeof(blocks));
+		get_random_bytes(&state, sizeof(state));
+
+#if defined(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC) && \
+    defined(CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S)
+		memcpy(&state1, &state, sizeof(state1));
+		memcpy(&state2, &state, sizeof(state2));
+		blake2s_compress(&state1, blocks, 3, BLAKE2S_BLOCK_SIZE);
+		blake2s_compress_generic(&state2, blocks, 3, BLAKE2S_BLOCK_SIZE);
+		if (memcmp(&state1, &state2, sizeof(state1))) {
+			pr_err("blake2s random compress self-test %d: FAIL\n",
+			       i + 1);
+			success = false;
+		}
+#endif
+
+		memcpy(&state1, &state, sizeof(state1));
+		blake2s_compress(&state1, blocks, 1, BLAKE2S_BLOCK_SIZE);
+		for (l = 1; l < TEST_ALIGNMENT; ++l) {
+			memcpy(unaligned_block + l, blocks,
+			       BLAKE2S_BLOCK_SIZE);
+			memcpy(&state2, &state, sizeof(state2));
+			blake2s_compress(&state2, unaligned_block + l, 1,
+					 BLAKE2S_BLOCK_SIZE);
+			if (memcmp(&state1, &state2, sizeof(state1))) {
+				pr_err("blake2s random compress align %d self-test %d: FAIL\n",
+				       l, i + 1);
+				success = false;
+			}
+		}
+	}
+
 	return success;
 }
diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c
index c71c09621c09..98e688c6d891 100644
--- a/lib/crypto/blake2s.c
+++ b/lib/crypto/blake2s.c
@@ -16,16 +16,44 @@
 #include <linux/init.h>
 #include <linux/bug.h>
 
+static inline void blake2s_set_lastblock(struct blake2s_state *state)
+{
+	state->f[0] = -1;
+}
+
 void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen)
 {
-	__blake2s_update(state, in, inlen, false);
+	const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
+
+	if (unlikely(!inlen))
+		return;
+	if (inlen > fill) {
+		memcpy(state->buf + state->buflen, in, fill);
+		blake2s_compress(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
+		state->buflen = 0;
+		in += fill;
+		inlen -= fill;
+	}
+	if (inlen > BLAKE2S_BLOCK_SIZE) {
+		const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
+		blake2s_compress(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
+		in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
+		inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
+	}
+	memcpy(state->buf + state->buflen, in, inlen);
+	state->buflen += inlen;
 }
 EXPORT_SYMBOL(blake2s_update);
 
 void blake2s_final(struct blake2s_state *state, u8 *out)
 {
 	WARN_ON(IS_ENABLED(DEBUG) && !out);
-	__blake2s_final(state, out, false);
+	blake2s_set_lastblock(state);
+	memset(state->buf + state->buflen, 0,
+	       BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
+	blake2s_compress(state, state->buf, 1, state->buflen);
+	cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
+	memcpy(out, state->h, state->outlen);
 	memzero_explicit(state, sizeof(*state));
 }
 EXPORT_SYMBOL(blake2s_final);
@@ -38,12 +66,7 @@ static int __init blake2s_mod_init(void)
 	return 0;
 }
 
-static void __exit blake2s_mod_exit(void)
-{
-}
-
 module_init(blake2s_mod_init);
-module_exit(blake2s_mod_exit);
 MODULE_LICENSE("GPL v2");
 MODULE_DESCRIPTION("BLAKE2s hash function");
 MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
-- 
cgit 


From b5c8b3fe8946c1927155599dfb79045477901009 Mon Sep 17 00:00:00 2001
From: Eyal Birger <eyal.birger@gmail.com>
Date: Fri, 20 May 2022 13:48:45 +0300
Subject: xfrm: no need to set DST_NOPOLICY in IPv4

This is a cleanup patch following commit e6175a2ed1f1
("xfrm: fix "disable_policy" flag use when arriving from different devices")
which made DST_NOPOLICY no longer be used for inbound policy checks.

On outbound the flag was set, but never used.

As such, avoid setting it altogether and remove the nopolicy argument
from rt_dst_alloc().

Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
Reviewed-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 drivers/net/vrf.c   |  2 +-
 include/net/route.h |  3 +--
 net/ipv4/route.c    | 24 ++++++++----------------
 3 files changed, 10 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 40445a12c682..5df7a0abc39d 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -1077,7 +1077,7 @@ static int vrf_rtable_create(struct net_device *dev)
 		return -ENOMEM;
 
 	/* create a dst for routing packets out through a VRF device */
-	rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1, 1);
+	rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1);
 	if (!rth)
 		return -ENOMEM;
 
diff --git a/include/net/route.h b/include/net/route.h
index 991a3985712d..b6743ff88e30 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -244,8 +244,7 @@ void ip_rt_multicast_event(struct in_device *);
 int ip_rt_ioctl(struct net *, unsigned int cmd, struct rtentry *rt);
 void ip_rt_get_source(u8 *src, struct sk_buff *skb, struct rtable *rt);
 struct rtable *rt_dst_alloc(struct net_device *dev,
-			     unsigned int flags, u16 type,
-			     bool nopolicy, bool noxfrm);
+			    unsigned int flags, u16 type, bool noxfrm);
 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt);
 
 struct in_ifaddr;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2d16bcc7d346..bd351fab46e6 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1626,12 +1626,11 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 
 struct rtable *rt_dst_alloc(struct net_device *dev,
 			    unsigned int flags, u16 type,
-			    bool nopolicy, bool noxfrm)
+			    bool noxfrm)
 {
 	struct rtable *rt;
 
 	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
-		       (nopolicy ? DST_NOPOLICY : 0) |
 		       (noxfrm ? DST_NOXFRM : 0));
 
 	if (rt) {
@@ -1726,7 +1725,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	struct in_device *in_dev = __in_dev_get_rcu(dev);
 	unsigned int flags = RTCF_MULTICAST;
 	struct rtable *rth;
-	bool no_policy;
 	u32 itag = 0;
 	int err;
 
@@ -1737,12 +1735,11 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	if (our)
 		flags |= RTCF_LOCAL;
 
-	no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
-	if (no_policy)
+	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
 		IPCB(skb)->flags |= IPSKB_NOPOLICY;
 
 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
-			   no_policy, false);
+			   false);
 	if (!rth)
 		return -ENOBUFS;
 
@@ -1801,7 +1798,7 @@ static int __mkroute_input(struct sk_buff *skb,
 	struct rtable *rth;
 	int err;
 	struct in_device *out_dev;
-	bool do_cache, no_policy;
+	bool do_cache;
 	u32 itag = 0;
 
 	/* get a working reference to the output device */
@@ -1846,8 +1843,7 @@ static int __mkroute_input(struct sk_buff *skb,
 		}
 	}
 
-	no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
-	if (no_policy)
+	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
 		IPCB(skb)->flags |= IPSKB_NOPOLICY;
 
 	fnhe = find_exception(nhc, daddr);
@@ -1862,7 +1858,7 @@ static int __mkroute_input(struct sk_buff *skb,
 		}
 	}
 
-	rth = rt_dst_alloc(out_dev->dev, 0, res->type, no_policy,
+	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
 			   IN_DEV_ORCONF(out_dev, NOXFRM));
 	if (!rth) {
 		err = -ENOBUFS;
@@ -2237,7 +2233,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	struct rtable	*rth;
 	struct flowi4	fl4;
 	bool do_cache = true;
-	bool no_policy;
 
 	/* IP on this device is disabled. */
 
@@ -2356,8 +2351,7 @@ brd_input:
 	RT_CACHE_STAT_INC(in_brd);
 
 local_input:
-	no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
-	if (no_policy)
+	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
 		IPCB(skb)->flags |= IPSKB_NOPOLICY;
 
 	do_cache &= res->fi && !itag;
@@ -2373,8 +2367,7 @@ local_input:
 	}
 
 	rth = rt_dst_alloc(ip_rt_get_dev(net, res),
-			   flags | RTCF_LOCAL, res->type,
-			   no_policy, false);
+			   flags | RTCF_LOCAL, res->type, false);
 	if (!rth)
 		goto e_nobufs;
 
@@ -2597,7 +2590,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 
 add:
 	rth = rt_dst_alloc(dev_out, flags, type,
-			   IN_DEV_ORCONF(in_dev, NOPOLICY),
 			   IN_DEV_ORCONF(in_dev, NOXFRM));
 	if (!rth)
 		return ERR_PTR(-ENOBUFS);
-- 
cgit 


From e3fa404a261bb8b7795b62be1b9af7b7dc1030f6 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 28 May 2022 16:59:17 +0200
Subject: USB: Follow-up to SPDX identifiers addition - remove now useless
 comments

All these files have been updated in the commit given in the Fixes: tag
below.

When the SPDX-License-Identifier: has been added, the corresponding text at
the beginning of the files has not been deleted.
All these texts are about GPL-2.0, with different variation in the wording.

Remove these now useless lines to save some LoC.

Fixes: 5fd54ace4721 ("USB: add SPDX identifiers to all remaining files in drivers/usb/")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/audio-v2.h      |  3 ---
 include/linux/usb/audio.h         |  3 ---
 include/linux/usb/cdc-wdm.h       |  4 ----
 include/linux/usb/cdc.h           |  4 ----
 include/linux/usb/gadget.h        |  2 --
 include/linux/usb/input.h         |  4 ----
 include/linux/usb/isp1301.h       | 10 ----------
 include/linux/usb/m66592.h        | 14 --------------
 include/linux/usb/of.h            |  2 --
 include/linux/usb/r8a66597.h      | 14 --------------
 include/linux/usb/serial.h        |  5 -----
 include/linux/usb/storage.h       |  2 --
 include/linux/usb/tegra_usb_phy.h | 10 ----------
 include/linux/usb/ulpi.h          |  4 ----
 include/linux/usb/xhci-dbgp.h     |  4 ----
 15 files changed, 85 deletions(-)

(limited to 'include')

diff --git a/include/linux/usb/audio-v2.h b/include/linux/usb/audio-v2.h
index 8fc2abd7aecb..ca796dc1a984 100644
--- a/include/linux/usb/audio-v2.h
+++ b/include/linux/usb/audio-v2.h
@@ -2,9 +2,6 @@
 /*
  * Copyright (c) 2010 Daniel Mack <daniel@caiaq.de>
  *
- * This software is distributed under the terms of the GNU General Public
- * License ("GPL") version 2, as published by the Free Software Foundation.
- *
  * This file holds USB constants and structures defined
  * by the USB Device Class Definition for Audio Devices in version 2.0.
  * Comments below reference relevant sections of the documents contained
diff --git a/include/linux/usb/audio.h b/include/linux/usb/audio.h
index 170acd500ea1..0747b24a1a7c 100644
--- a/include/linux/usb/audio.h
+++ b/include/linux/usb/audio.h
@@ -6,9 +6,6 @@
  * Developed for Thumtronics by Grey Innovation
  * Ben Williamson <ben.williamson@greyinnovation.com>
  *
- * This software is distributed under the terms of the GNU General Public
- * License ("GPL") version 2, as published by the Free Software Foundation.
- *
  * This file holds USB constants and structures defined
  * by the USB Device Class Definition for Audio Devices.
  * Comments below reference relevant sections of that document:
diff --git a/include/linux/usb/cdc-wdm.h b/include/linux/usb/cdc-wdm.h
index 9f5a51f79ba5..85417f00a89a 100644
--- a/include/linux/usb/cdc-wdm.h
+++ b/include/linux/usb/cdc-wdm.h
@@ -3,10 +3,6 @@
  * USB CDC Device Management subdriver
  *
  * Copyright (c) 2012  Bjørn Mork <bjorn@mork.no>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
  */
 
 #ifndef __LINUX_USB_CDC_WDM_H
diff --git a/include/linux/usb/cdc.h b/include/linux/usb/cdc.h
index 35d784cf32a4..0af0db51bc89 100644
--- a/include/linux/usb/cdc.h
+++ b/include/linux/usb/cdc.h
@@ -3,10 +3,6 @@
  * USB CDC common helpers
  *
  * Copyright (c) 2015 Oliver Neukum <oneukum@suse.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
  */
 #ifndef __LINUX_USB_CDC_H
 #define __LINUX_USB_CDC_H
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 3ad58b7a0824..dc3092cea99e 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -10,8 +10,6 @@
  *
  * (C) Copyright 2002-2004 by David Brownell
  * All Rights Reserved.
- *
- * This software is licensed under the GNU GPL version 2.
  */
 
 #ifndef __LINUX_USB_GADGET_H
diff --git a/include/linux/usb/input.h b/include/linux/usb/input.h
index 974befa72ac0..5e759b2cf551 100644
--- a/include/linux/usb/input.h
+++ b/include/linux/usb/input.h
@@ -1,10 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
  * Copyright (C) 2005 Dmitry Torokhov
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
  */
 
 #ifndef __LINUX_USB_INPUT_H
diff --git a/include/linux/usb/isp1301.h b/include/linux/usb/isp1301.h
index dedb3b2473e8..fa986b926a12 100644
--- a/include/linux/usb/isp1301.h
+++ b/include/linux/usb/isp1301.h
@@ -3,16 +3,6 @@
  * NXP ISP1301 USB transceiver driver
  *
  * Copyright (C) 2012 Roland Stigge <stigge@antcom.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
  */
 
 #ifndef __LINUX_USB_ISP1301_H
diff --git a/include/linux/usb/m66592.h b/include/linux/usb/m66592.h
index 2dfe68183495..5f04de2b47fd 100644
--- a/include/linux/usb/m66592.h
+++ b/include/linux/usb/m66592.h
@@ -3,20 +3,6 @@
  * M66592 driver platform data
  *
  * Copyright (C) 2009  Renesas Solutions Corp.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  */
 
 #ifndef __LINUX_USB_M66592_H
diff --git a/include/linux/usb/of.h b/include/linux/usb/of.h
index dba55ccb9b53..98487fd7ab11 100644
--- a/include/linux/usb/of.h
+++ b/include/linux/usb/of.h
@@ -1,8 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
  * OF helpers for usb devices.
- *
- * This file is released under the GPLv2
  */
 
 #ifndef __LINUX_USB_OF_H
diff --git a/include/linux/usb/r8a66597.h b/include/linux/usb/r8a66597.h
index c0753d026bbf..f0fa7ddadbaa 100644
--- a/include/linux/usb/r8a66597.h
+++ b/include/linux/usb/r8a66597.h
@@ -5,20 +5,6 @@
  * Copyright (C) 2009  Renesas Solutions Corp.
  *
  * Author : Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  */
 
 #ifndef __LINUX_USB_R8A66597_H
diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h
index 16ea5a4cc586..8ea319f89e1f 100644
--- a/include/linux/usb/serial.h
+++ b/include/linux/usb/serial.h
@@ -4,11 +4,6 @@
  *
  *	Copyright (C) 1999 - 2012
  *	    Greg Kroah-Hartman (greg@kroah.com)
- *
- *	This program is free software; you can redistribute it and/or modify
- *	it under the terms of the GNU General Public License as published by
- *	the Free Software Foundation; version 2 of the License.
- *
  */
 
 #ifndef __LINUX_USB_SERIAL_H
diff --git a/include/linux/usb/storage.h b/include/linux/usb/storage.h
index e0240f864548..2827ce72e502 100644
--- a/include/linux/usb/storage.h
+++ b/include/linux/usb/storage.h
@@ -9,8 +9,6 @@
  *
  * This file contains definitions taken from the
  * USB Mass Storage Class Specification Overview
- *
- * Distributed under the terms of the GNU GPL, version two.
  */
 
 /* Storage subclass codes */
diff --git a/include/linux/usb/tegra_usb_phy.h b/include/linux/usb/tegra_usb_phy.h
index d3e65eb9e16f..46e73584b6e6 100644
--- a/include/linux/usb/tegra_usb_phy.h
+++ b/include/linux/usb/tegra_usb_phy.h
@@ -1,16 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
  * Copyright (C) 2010 Google, Inc.
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
  */
 
 #ifndef __TEGRA_USB_PHY_H
diff --git a/include/linux/usb/ulpi.h b/include/linux/usb/ulpi.h
index 36c2982780ad..5050f502c1ed 100644
--- a/include/linux/usb/ulpi.h
+++ b/include/linux/usb/ulpi.h
@@ -3,10 +3,6 @@
  * ulpi.h -- ULPI defines and function prorotypes
  *
  * Copyright (C) 2010 Nokia Corporation
- *
- * This software is distributed under the terms of the GNU General
- * Public License ("GPL") as published by the Free Software Foundation,
- * version 2 of that License.
  */
 
 #ifndef __LINUX_USB_ULPI_H
diff --git a/include/linux/usb/xhci-dbgp.h b/include/linux/usb/xhci-dbgp.h
index 01fe768873f9..171fd74b1cfc 100644
--- a/include/linux/usb/xhci-dbgp.h
+++ b/include/linux/usb/xhci-dbgp.h
@@ -5,10 +5,6 @@
  * Copyright (C) 2016 Intel Corporation
  *
  * Author: Lu Baolu <baolu.lu@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #ifndef __LINUX_XHCI_DBGP_H
-- 
cgit 


From 3e00a22fdc9a9457f1c64d885532710e174babd8 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 28 May 2022 17:13:56 +0200
Subject: USB: Follow-up to SPDX GPL-2.0+ identifiers addition - remove now
 useless comments

All these files have been updated in the commit given in the Fixes: tag
below.

When the SPDX-License-Identifier: has been added, the corresponding text at
the beginning of the files has not been deleted.
All these texts are about GPL-2.0+, with different variation in the
wording.

Remove these now useless lines to save some LoC.

Fixes: 5fd54ace4721 ("USB: add SPDX identifiers to all remaining files in drivers/usb/")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/c67x00.h        | 15 ---------------
 include/linux/usb/composite.h     | 14 --------------
 include/linux/usb/ehci_def.h      | 14 --------------
 include/linux/usb/ehci_pdriver.h  | 14 --------------
 include/linux/usb/g_hid.h         | 14 --------------
 include/linux/usb/hcd.h           | 14 --------------
 include/linux/usb/musb-ux500.h    | 10 ----------
 include/linux/usb/net2280.h       | 14 --------------
 include/linux/usb/ohci_pdriver.h  | 14 --------------
 include/linux/usb/otg-fsm.h       | 17 ++---------------
 include/linux/usb/phy_companion.h | 10 ----------
 include/linux/usb/rndis_host.h    | 14 --------------
 include/linux/usb/usb338x.h       | 11 -----------
 include/linux/usb/usbnet.h        | 14 --------------
 14 files changed, 2 insertions(+), 187 deletions(-)

(limited to 'include')

diff --git a/include/linux/usb/c67x00.h b/include/linux/usb/c67x00.h
index 2fc39e3b7281..45e0757e58f3 100644
--- a/include/linux/usb/c67x00.h
+++ b/include/linux/usb/c67x00.h
@@ -3,21 +3,6 @@
  * usb_c67x00.h: platform definitions for the Cypress C67X00 USB chip
  *
  * Copyright (C) 2006-2008 Barco N.V.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
- * MA  02110-1301  USA.
  */
 
 #ifndef _LINUX_USB_C67X00_H
diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h
index 9d2762279286..43ac3fa760db 100644
--- a/include/linux/usb/composite.h
+++ b/include/linux/usb/composite.h
@@ -3,20 +3,6 @@
  * composite.h -- framework for usb gadgets which are composite devices
  *
  * Copyright (C) 2006-2008 David Brownell
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
 #ifndef	__LINUX_USB_COMPOSITE_H
diff --git a/include/linux/usb/ehci_def.h b/include/linux/usb/ehci_def.h
index c892c5bc6638..fbabadd3b372 100644
--- a/include/linux/usb/ehci_def.h
+++ b/include/linux/usb/ehci_def.h
@@ -1,20 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0+
 /*
  * Copyright (c) 2001-2002 by David Brownell
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software Foundation,
- * Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 #ifndef __LINUX_USB_EHCI_DEF_H
diff --git a/include/linux/usb/ehci_pdriver.h b/include/linux/usb/ehci_pdriver.h
index 89fc901e778f..0f1b166f5aa0 100644
--- a/include/linux/usb/ehci_pdriver.h
+++ b/include/linux/usb/ehci_pdriver.h
@@ -1,20 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0+
 /*
  * Copyright (C) 2012 Hauke Mehrtens <hauke@hauke-m.de>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software Foundation,
- * Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 #ifndef __USB_CORE_EHCI_PDRIVER_H
diff --git a/include/linux/usb/g_hid.h b/include/linux/usb/g_hid.h
index 7581e488c237..d56bfedeb079 100644
--- a/include/linux/usb/g_hid.h
+++ b/include/linux/usb/g_hid.h
@@ -3,20 +3,6 @@
  * g_hid.h -- Header file for USB HID gadget driver
  *
  * Copyright (C) 2010 Fabien Chouteau <fabien.chouteau@barco.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef __LINUX_USB_G_HID_H
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index 2c1fc9212cf2..37ccb31b1d40 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -1,20 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0+
 /*
  * Copyright (c) 2001-2002 by David Brownell
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software Foundation,
- * Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 #ifndef __USB_CORE_HCD_H
diff --git a/include/linux/usb/musb-ux500.h b/include/linux/usb/musb-ux500.h
index c4b7ad9850ca..d60dcfc56b5a 100644
--- a/include/linux/usb/musb-ux500.h
+++ b/include/linux/usb/musb-ux500.h
@@ -1,16 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0+
 /*
  * Copyright (C) 2013 ST-Ericsson AB
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #ifndef __MUSB_UX500_H__
diff --git a/include/linux/usb/net2280.h b/include/linux/usb/net2280.h
index 08b85caecfaf..f29fe6a1f415 100644
--- a/include/linux/usb/net2280.h
+++ b/include/linux/usb/net2280.h
@@ -5,20 +5,6 @@
  *
  * Copyright (C) 2002 NetChip Technology, Inc. (http://www.netchip.com)
  * Copyright (C) 2003 David Brownell
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef __LINUX_USB_NET2280_H
diff --git a/include/linux/usb/ohci_pdriver.h b/include/linux/usb/ohci_pdriver.h
index 7eb16cf587ee..2447c78b1766 100644
--- a/include/linux/usb/ohci_pdriver.h
+++ b/include/linux/usb/ohci_pdriver.h
@@ -1,20 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0+
 /*
  * Copyright (C) 2012 Hauke Mehrtens <hauke@hauke-m.de>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software Foundation,
- * Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 #ifndef __USB_CORE_OHCI_PDRIVER_H
diff --git a/include/linux/usb/otg-fsm.h b/include/linux/usb/otg-fsm.h
index 784659d4dc99..6135d076c53d 100644
--- a/include/linux/usb/otg-fsm.h
+++ b/include/linux/usb/otg-fsm.h
@@ -1,19 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0+
-/* Copyright (C) 2007,2008 Freescale Semiconductor, Inc.
- *
- * This program is free software; you can redistribute  it and/or modify it
- * under  the terms of  the GNU General  Public License as published by the
- * Free Software Foundation;  either version 2 of the  License, or (at your
- * option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the  GNU General Public License along
- * with this program; if not, write  to the Free Software Foundation, Inc.,
- * 675 Mass Ave, Cambridge, MA 02139, USA.
+/*
+ * Copyright (C) 2007,2008 Freescale Semiconductor, Inc.
  */
 
 #ifndef __LINUX_USB_OTG_FSM_H
diff --git a/include/linux/usb/phy_companion.h b/include/linux/usb/phy_companion.h
index 263196f05015..862aaeca2319 100644
--- a/include/linux/usb/phy_companion.h
+++ b/include/linux/usb/phy_companion.h
@@ -3,18 +3,8 @@
  * phy-companion.h -- phy companion to indicate the comparator part of PHY
  *
  * Copyright (C) 2012 Texas Instruments Incorporated - https://www.ti.com
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
  *
  * Author: Kishon Vijay Abraham I <kishon@ti.com>
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
  */
 
 #ifndef __DRIVERS_PHY_COMPANION_H
diff --git a/include/linux/usb/rndis_host.h b/include/linux/usb/rndis_host.h
index cc42db51bbba..489cfb1d00f6 100644
--- a/include/linux/usb/rndis_host.h
+++ b/include/linux/usb/rndis_host.h
@@ -2,20 +2,6 @@
 /*
  * Host Side support for RNDIS Networking Links
  * Copyright (C) 2005 by David Brownell
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef	__LINUX_USB_RNDIS_HOST_H
diff --git a/include/linux/usb/usb338x.h b/include/linux/usb/usb338x.h
index 20020c1336d5..70a7e3cdb3c9 100644
--- a/include/linux/usb/usb338x.h
+++ b/include/linux/usb/usb338x.h
@@ -6,17 +6,6 @@
  * Copyright (C) 2002 NetChip Technology, Inc. (http://www.netchip.com)
  * Copyright (C) 2003 David Brownell
  * Copyright (C) 2014 Ricardo Ribalda - Qtechnology/AS
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
  */
 
 #ifndef __LINUX_USB_USB338X_H
diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
index 1b4d72d5e891..4f5608cbd9ab 100644
--- a/include/linux/usb/usbnet.h
+++ b/include/linux/usb/usbnet.h
@@ -4,20 +4,6 @@
  *
  * Copyright (C) 2000-2005 by David Brownell <dbrownell@users.sourceforge.net>
  * Copyright (C) 2003-2005 David Hollis <dhollis@davehollis.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef	__LINUX_USB_USBNET_H
-- 
cgit 


From 924b290655c0f17ac84e752addfc9bc3ec361069 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Tue, 24 May 2022 13:47:40 -0700
Subject: xfrm: convert alg_key to flexible array member
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Iproute2 build generates a warning when built with gcc-12.
This is because the alg_key in xfrm.h API has zero size
array element instead of flexible array.

    CC       xfrm_state.o
In function ‘xfrm_algo_parse’,
    inlined from ‘xfrm_state_modify.constprop’ at xfrm_state.c:573:5:
xfrm_state.c:162:32: warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=]
  162 |                         buf[j] = val;
      |                         ~~~~~~~^~~~~

This patch convert the alg_key into flexible array member.
There are other zero size arrays here that should be converted as
well.

This patch is RFC only since it is only compile tested and
passes trivial iproute2 tests.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/uapi/linux/xfrm.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h
index 65e13a099b1a..3ed61df9cc91 100644
--- a/include/uapi/linux/xfrm.h
+++ b/include/uapi/linux/xfrm.h
@@ -102,21 +102,21 @@ struct xfrm_replay_state_esn {
 struct xfrm_algo {
 	char		alg_name[64];
 	unsigned int	alg_key_len;    /* in bits */
-	char		alg_key[0];
+	char		alg_key[];
 };
 
 struct xfrm_algo_auth {
 	char		alg_name[64];
 	unsigned int	alg_key_len;    /* in bits */
 	unsigned int	alg_trunc_len;  /* in bits */
-	char		alg_key[0];
+	char		alg_key[];
 };
 
 struct xfrm_algo_aead {
 	char		alg_name[64];
 	unsigned int	alg_key_len;	/* in bits */
 	unsigned int	alg_icv_len;	/* in bits */
-	char		alg_key[0];
+	char		alg_key[];
 };
 
 struct xfrm_stats {
-- 
cgit 


From 4173f018aae16b6496d292c234b858241f85254f Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 7 Jun 2022 12:49:12 +0200
Subject: tty/vt: consolemap: rename and document struct uni_pagedir

struct uni_pagedir contains 32 unicode page directories, so the name of
the structure is a bit misleading. Rename the structure to uni_pagedict,
so it looks like this:
struct uni_pagedict
  -> 32 page dirs
     -> 32 rows
       -> 64 glyphs

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20220607104946.18710-2-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/vt/consolemap.c    | 47 +++++++++++++++++++++++++-----------------
 drivers/video/console/vgacon.c |  4 ++--
 include/linux/console_struct.h |  6 +++---
 3 files changed, 33 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/drivers/tty/vt/consolemap.c b/drivers/tty/vt/consolemap.c
index 839d75d1a6c0..5acafeea9afc 100644
--- a/drivers/tty/vt/consolemap.c
+++ b/drivers/tty/vt/consolemap.c
@@ -186,17 +186,26 @@ static unsigned short translations[][256] = {
 
 static int inv_translate[MAX_NR_CONSOLES];
 
-struct uni_pagedir {
-	u16 		**uni_pgdir[32];
+/**
+ * struct uni_pagedict -- unicode directory
+ *
+ * @uni_pgdir: 32*32*64 table with glyphs
+ * @refcount: reference count of this structure
+ * @sum: checksum
+ * @inverse_translations: best-effort inverse mapping
+ * @inverse_trans_unicode: best-effort inverse mapping to unicode
+ */
+struct uni_pagedict {
+	u16		**uni_pgdir[32];
 	unsigned long	refcount;
 	unsigned long	sum;
 	unsigned char	*inverse_translations[4];
 	u16		*inverse_trans_unicode;
 };
 
-static struct uni_pagedir *dflt;
+static struct uni_pagedict *dflt;
 
-static void set_inverse_transl(struct vc_data *conp, struct uni_pagedir *p, int i)
+static void set_inverse_transl(struct vc_data *conp, struct uni_pagedict *p, int i)
 {
 	int j, glyph;
 	unsigned short *t = translations[i];
@@ -221,7 +230,7 @@ static void set_inverse_transl(struct vc_data *conp, struct uni_pagedir *p, int
 }
 
 static void set_inverse_trans_unicode(struct vc_data *conp,
-				      struct uni_pagedir *p)
+				      struct uni_pagedict *p)
 {
 	int i, j, k, glyph;
 	u16 **p1, *p2;
@@ -270,7 +279,7 @@ unsigned short *set_translate(int m, struct vc_data *vc)
  */
 u16 inverse_translate(const struct vc_data *conp, int glyph, int use_unicode)
 {
-	struct uni_pagedir *p;
+	struct uni_pagedict *p;
 	int m;
 	if (glyph < 0 || glyph >= MAX_GLYPH)
 		return 0;
@@ -297,7 +306,7 @@ EXPORT_SYMBOL_GPL(inverse_translate);
 static void update_user_maps(void)
 {
 	int i;
-	struct uni_pagedir *p, *q = NULL;
+	struct uni_pagedict *p, *q = NULL;
 	
 	for (i = 0; i < MAX_NR_CONSOLES; i++) {
 		if (!vc_cons_allocated(i))
@@ -393,7 +402,7 @@ int con_get_trans_new(ushort __user * arg)
 extern u8 dfont_unicount[];	/* Defined in console_defmap.c */
 extern u16 dfont_unitable[];
 
-static void con_release_unimap(struct uni_pagedir *p)
+static void con_release_unimap(struct uni_pagedict *p)
 {
 	u16 **p1;
 	int i, j;
@@ -419,7 +428,7 @@ static void con_release_unimap(struct uni_pagedir *p)
 /* Caller must hold the console lock */
 void con_free_unimap(struct vc_data *vc)
 {
-	struct uni_pagedir *p;
+	struct uni_pagedict *p;
 
 	p = *vc->vc_uni_pagedir_loc;
 	if (!p)
@@ -431,10 +440,10 @@ void con_free_unimap(struct vc_data *vc)
 	kfree(p);
 }
   
-static int con_unify_unimap(struct vc_data *conp, struct uni_pagedir *p)
+static int con_unify_unimap(struct vc_data *conp, struct uni_pagedict *p)
 {
 	int i, j, k;
-	struct uni_pagedir *q;
+	struct uni_pagedict *q;
 	
 	for (i = 0; i < MAX_NR_CONSOLES; i++) {
 		if (!vc_cons_allocated(i))
@@ -472,7 +481,7 @@ static int con_unify_unimap(struct vc_data *conp, struct uni_pagedir *p)
 }
 
 static int
-con_insert_unipair(struct uni_pagedir *p, u_short unicode, u_short fontpos)
+con_insert_unipair(struct uni_pagedict *p, u_short unicode, u_short fontpos)
 {
 	int i, n;
 	u16 **p1, *p2;
@@ -503,7 +512,7 @@ con_insert_unipair(struct uni_pagedir *p, u_short unicode, u_short fontpos)
 /* Caller must hold the lock */
 static int con_do_clear_unimap(struct vc_data *vc)
 {
-	struct uni_pagedir *p, *q;
+	struct uni_pagedict *p, *q;
 
 	p = *vc->vc_uni_pagedir_loc;
 	if (!p || --p->refcount) {
@@ -536,7 +545,7 @@ int con_clear_unimap(struct vc_data *vc)
 int con_set_unimap(struct vc_data *vc, ushort ct, struct unipair __user *list)
 {
 	int err = 0, err1, i;
-	struct uni_pagedir *p, *q;
+	struct uni_pagedict *p, *q;
 	struct unipair *unilist, *plist;
 
 	if (!ct)
@@ -569,7 +578,7 @@ int con_set_unimap(struct vc_data *vc, ushort ct, struct unipair __user *list)
 		
 		/*
 		 * Since refcount was > 1, con_clear_unimap() allocated a
-		 * a new uni_pagedir for this vc.  Re: p != q
+		 * a new uni_pagedict for this vc.  Re: p != q
 		 */
 		q = *vc->vc_uni_pagedir_loc;
 
@@ -660,7 +669,7 @@ int con_set_default_unimap(struct vc_data *vc)
 {
 	int i, j, err = 0, err1;
 	u16 *q;
-	struct uni_pagedir *p;
+	struct uni_pagedict *p;
 
 	if (dflt) {
 		p = *vc->vc_uni_pagedir_loc;
@@ -714,7 +723,7 @@ EXPORT_SYMBOL(con_set_default_unimap);
  */
 int con_copy_unimap(struct vc_data *dst_vc, struct vc_data *src_vc)
 {
-	struct uni_pagedir *q;
+	struct uni_pagedict *q;
 
 	if (!*src_vc->vc_uni_pagedir_loc)
 		return -EINVAL;
@@ -739,7 +748,7 @@ int con_get_unimap(struct vc_data *vc, ushort ct, ushort __user *uct, struct uni
 	int i, j, k, ret = 0;
 	ushort ect;
 	u16 **p1, *p2;
-	struct uni_pagedir *p;
+	struct uni_pagedict *p;
 	struct unipair *unilist;
 
 	unilist = kvmalloc_array(ct, sizeof(struct unipair), GFP_KERNEL);
@@ -810,7 +819,7 @@ conv_uni_to_pc(struct vc_data *conp, long ucs)
 {
 	int h;
 	u16 **p1, *p2;
-	struct uni_pagedir *p;
+	struct uni_pagedict *p;
   
 	/* Only 16-bit codes supported at this time */
 	if (ucs > 0xffff)
diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c
index 576612f18d59..058a78b8dbcf 100644
--- a/drivers/video/console/vgacon.c
+++ b/drivers/video/console/vgacon.c
@@ -75,7 +75,7 @@ static void vgacon_scrolldelta(struct vc_data *c, int lines);
 static int vgacon_set_origin(struct vc_data *c);
 static void vgacon_save_screen(struct vc_data *c);
 static void vgacon_invert_region(struct vc_data *c, u16 * p, int count);
-static struct uni_pagedir *vgacon_uni_pagedir;
+static struct uni_pagedict *vgacon_uni_pagedir;
 static int vgacon_refcount;
 
 /* Description of the hardware situation */
@@ -342,7 +342,7 @@ static const char *vgacon_startup(void)
 
 static void vgacon_init(struct vc_data *c, int init)
 {
-	struct uni_pagedir *p;
+	struct uni_pagedict *p;
 
 	/*
 	 * We cannot be loaded as a module, therefore init will be 1
diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h
index d5b9c8d40c18..f75033f0277f 100644
--- a/include/linux/console_struct.h
+++ b/include/linux/console_struct.h
@@ -17,7 +17,7 @@
 #include <linux/vt.h>
 #include <linux/workqueue.h>
 
-struct uni_pagedir;
+struct uni_pagedict;
 struct uni_screen;
 
 #define NPAR 16
@@ -157,8 +157,8 @@ struct vc_data {
 	unsigned int	vc_bell_duration;	/* Console bell duration */
 	unsigned short	vc_cur_blink_ms;	/* Cursor blink duration */
 	struct vc_data **vc_display_fg;		/* [!] Ptr to var holding fg console for this display */
-	struct uni_pagedir *vc_uni_pagedir;
-	struct uni_pagedir **vc_uni_pagedir_loc; /* [!] Location of uni_pagedir variable for this console */
+	struct uni_pagedict *vc_uni_pagedir;
+	struct uni_pagedict **vc_uni_pagedir_loc; /* [!] Location of uni_pagedict variable for this console */
 	struct uni_screen *vc_uni_screen;	/* unicode screen content */
 	/* additional information is in vt_kern.h */
 };
-- 
cgit 


From 0b75f7968d61566d150747512344cabaa59e54c6 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 7 Jun 2022 12:49:15 +0200
Subject: tty/vt: consolemap: remove extern from function decls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The extern keyword is not needed for function declarations. Remove it,
so that the consolemap header conforms to other tty headers.

Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20220607104946.18710-5-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/consolemap.h | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h
index bcfce748c9d8..abc28e4450bf 100644
--- a/include/linux/consolemap.h
+++ b/include/linux/consolemap.h
@@ -17,12 +17,11 @@
 #ifdef CONFIG_CONSOLE_TRANSLATIONS
 struct vc_data;
 
-extern u16 inverse_translate(const struct vc_data *conp, int glyph,
-		int use_unicode);
-extern unsigned short *set_translate(int m, struct vc_data *vc);
-extern int conv_uni_to_pc(struct vc_data *conp, long ucs);
-extern u32 conv_8bit_to_uni(unsigned char c);
-extern int conv_uni_to_8bit(u32 uni);
+u16 inverse_translate(const struct vc_data *conp, int glyph, int use_unicode);
+unsigned short *set_translate(int m, struct vc_data *vc);
+int conv_uni_to_pc(struct vc_data *conp, long ucs);
+u32 conv_8bit_to_uni(unsigned char c);
+int conv_uni_to_8bit(u32 uni);
 void console_map_init(void);
 #else
 #define inverse_translate(conp, glyph, uni) ((uint16_t)glyph)
-- 
cgit 


From f827c754f9b6247f4d4f9cbb508b22bff2cf8e6d Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 7 Jun 2022 12:49:16 +0200
Subject: tty/vt: consolemap: convert macros to static inlines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit changes !CONFIG_CONSOLE_TRANSLATIONS definitions to real
(inline) functions. So the commit:
* makes type checking much stronger,
* removes the need of many parentheses and casts, and
* makes the code more readable.

Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20220607104946.18710-6-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/consolemap.h | 35 ++++++++++++++++++++++++++++-------
 1 file changed, 28 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h
index abc28e4450bf..98171dbed51f 100644
--- a/include/linux/consolemap.h
+++ b/include/linux/consolemap.h
@@ -14,9 +14,9 @@
 
 #include <linux/types.h>
 
-#ifdef CONFIG_CONSOLE_TRANSLATIONS
 struct vc_data;
 
+#ifdef CONFIG_CONSOLE_TRANSLATIONS
 u16 inverse_translate(const struct vc_data *conp, int glyph, int use_unicode);
 unsigned short *set_translate(int m, struct vc_data *vc);
 int conv_uni_to_pc(struct vc_data *conp, long ucs);
@@ -24,12 +24,33 @@ u32 conv_8bit_to_uni(unsigned char c);
 int conv_uni_to_8bit(u32 uni);
 void console_map_init(void);
 #else
-#define inverse_translate(conp, glyph, uni) ((uint16_t)glyph)
-#define set_translate(m, vc) ((unsigned short *)NULL)
-#define conv_uni_to_pc(conp, ucs) ((int) (ucs > 0xff ? -1: ucs))
-#define conv_8bit_to_uni(c) ((uint32_t)(c))
-#define conv_uni_to_8bit(c) ((int) ((c) & 0xff))
-#define console_map_init(c) do { ; } while (0)
+static inline u16 inverse_translate(const struct vc_data *conp, int glyph,
+		int use_unicode)
+{
+	return glyph;
+}
+
+static inline unsigned short *set_translate(int m, struct vc_data *vc)
+{
+	return NULL;
+}
+
+static inline int conv_uni_to_pc(struct vc_data *conp, long ucs)
+{
+	return ucs > 0xff ? -1 : ucs;
+}
+
+static inline u32 conv_8bit_to_uni(unsigned char c)
+{
+	return c;
+}
+
+static inline int conv_uni_to_8bit(u32 uni)
+{
+	return uni & 0xff;
+}
+
+static inline void console_map_init(void) { }
 #endif /* CONFIG_CONSOLE_TRANSLATIONS */
 
 #endif /* __LINUX_CONSOLEMAP_H__ */
-- 
cgit 


From d9ebb906a45adc71a62aceb1e3608c847cafa660 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 7 Jun 2022 12:49:17 +0200
Subject: tty/vt: consolemap: make parameters of inverse_translate() saner
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- int use_unicode -> bool: it's used as bool at some places already, so
  make it explicit.
- int glyph -> u16: every caller passes a u16 in. So make it explicit
  too. And remove a negative check from inverse_translate() as it never
  could be negative.

Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20220607104946.18710-7-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/accessibility/braille/braille_console.c | 2 +-
 drivers/accessibility/speakup/main.c            | 2 +-
 drivers/tty/vt/consolemap.c                     | 4 ++--
 drivers/tty/vt/selection.c                      | 3 ++-
 drivers/tty/vt/vt.c                             | 2 +-
 include/linux/consolemap.h                      | 6 +++---
 6 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/accessibility/braille/braille_console.c b/drivers/accessibility/braille/braille_console.c
index fdc6b593f500..c4d54a5326b1 100644
--- a/drivers/accessibility/braille/braille_console.c
+++ b/drivers/accessibility/braille/braille_console.c
@@ -131,7 +131,7 @@ static void vc_refresh(struct vc_data *vc)
 	for (i = 0; i < WIDTH; i++) {
 		u16 glyph = screen_glyph(vc,
 				2 * (vc_x + i) + vc_y * vc->vc_size_row);
-		buf[i] = inverse_translate(vc, glyph, 1);
+		buf[i] = inverse_translate(vc, glyph, true);
 	}
 	braille_write(buf);
 }
diff --git a/drivers/accessibility/speakup/main.c b/drivers/accessibility/speakup/main.c
index d726537fa16c..f52265293482 100644
--- a/drivers/accessibility/speakup/main.c
+++ b/drivers/accessibility/speakup/main.c
@@ -470,7 +470,7 @@ static u16 get_char(struct vc_data *vc, u16 *pos, u_char *attribs)
 			c |= 0x100;
 		}
 
-		ch = inverse_translate(vc, c, 1);
+		ch = inverse_translate(vc, c, true);
 		*attribs = (w & 0xff00) >> 8;
 	}
 	return ch;
diff --git a/drivers/tty/vt/consolemap.c b/drivers/tty/vt/consolemap.c
index fb61158f4dc6..157c7f936294 100644
--- a/drivers/tty/vt/consolemap.c
+++ b/drivers/tty/vt/consolemap.c
@@ -281,12 +281,12 @@ unsigned short *set_translate(int m, struct vc_data *vc)
  *    was active.
  * Still, it is now possible to a certain extent to cut and paste non-ASCII.
  */
-u16 inverse_translate(const struct vc_data *conp, int glyph, int use_unicode)
+u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode)
 {
 	struct uni_pagedict *p;
 	int m;
 
-	if (glyph < 0 || glyph >= MAX_GLYPH)
+	if (glyph >= MAX_GLYPH)
 		return 0;
 
 	p = *conp->vc_uni_pagedir_loc;
diff --git a/drivers/tty/vt/selection.c b/drivers/tty/vt/selection.c
index f7755e73696e..6ef22f01cc51 100644
--- a/drivers/tty/vt/selection.c
+++ b/drivers/tty/vt/selection.c
@@ -68,7 +68,8 @@ sel_pos(int n, bool unicode)
 {
 	if (unicode)
 		return screen_glyph_unicode(vc_sel.cons, n / 2);
-	return inverse_translate(vc_sel.cons, screen_glyph(vc_sel.cons, n), 0);
+	return inverse_translate(vc_sel.cons, screen_glyph(vc_sel.cons, n),
+			false);
 }
 
 /**
diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index f8c87c4d7399..1ea1c11c42fd 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -4741,7 +4741,7 @@ u32 screen_glyph_unicode(const struct vc_data *vc, int n)
 
 	if (uniscr)
 		return uniscr->lines[n / vc->vc_cols][n % vc->vc_cols];
-	return inverse_translate(vc, screen_glyph(vc, n * 2), 1);
+	return inverse_translate(vc, screen_glyph(vc, n * 2), true);
 }
 EXPORT_SYMBOL_GPL(screen_glyph_unicode);
 
diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h
index 98171dbed51f..1ff2bf55eb85 100644
--- a/include/linux/consolemap.h
+++ b/include/linux/consolemap.h
@@ -17,15 +17,15 @@
 struct vc_data;
 
 #ifdef CONFIG_CONSOLE_TRANSLATIONS
-u16 inverse_translate(const struct vc_data *conp, int glyph, int use_unicode);
+u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode);
 unsigned short *set_translate(int m, struct vc_data *vc);
 int conv_uni_to_pc(struct vc_data *conp, long ucs);
 u32 conv_8bit_to_uni(unsigned char c);
 int conv_uni_to_8bit(u32 uni);
 void console_map_init(void);
 #else
-static inline u16 inverse_translate(const struct vc_data *conp, int glyph,
-		int use_unicode)
+static inline u16 inverse_translate(const struct vc_data *conp, u16 glyph,
+		bool use_unicode)
 {
 	return glyph;
 }
-- 
cgit 


From 5a904a936b407624cd1ff5ee3f1675ca3d2366a5 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 7 Jun 2022 12:49:27 +0200
Subject: tty/vt: consolemap: introduce enum translation_map and use it

Again, instead of magic constants in the code, declare an enum and be a
little bit more explicit. Both in the translations definition and in the
loops etc.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20220607104946.18710-17-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/vt/consolemap.c | 39 ++++++++++++++++++++-------------------
 include/linux/consolemap.h  | 18 ++++++++++++------
 2 files changed, 32 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/drivers/tty/vt/consolemap.c b/drivers/tty/vt/consolemap.c
index 92b5dddb00d9..80536687acef 100644
--- a/drivers/tty/vt/consolemap.c
+++ b/drivers/tty/vt/consolemap.c
@@ -38,7 +38,7 @@
 
 static unsigned short translations[][256] = {
   /* 8-bit Latin-1 mapped to Unicode -- trivial mapping */
-  {
+  [LAT1_MAP] = {
     0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
     0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
     0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
@@ -71,9 +71,9 @@ static unsigned short translations[][256] = {
     0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
     0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
     0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff
-  }, 
+  },
   /* VT100 graphics mapped to Unicode */
-  {
+  [GRAF_MAP] = {
     0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
     0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
     0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
@@ -108,8 +108,8 @@ static unsigned short translations[][256] = {
     0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff
   },
   /* IBM Codepage 437 mapped to Unicode */
-  {
-    0x0000, 0x263a, 0x263b, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022, 
+  [IBMPC_MAP] = {
+    0x0000, 0x263a, 0x263b, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022,
     0x25d8, 0x25cb, 0x25d9, 0x2642, 0x2640, 0x266a, 0x266b, 0x263c,
     0x25b6, 0x25c0, 0x2195, 0x203c, 0x00b6, 0x00a7, 0x25ac, 0x21a8,
     0x2191, 0x2193, 0x2192, 0x2190, 0x221f, 0x2194, 0x25b2, 0x25bc,
@@ -141,9 +141,9 @@ static unsigned short translations[][256] = {
     0x03a6, 0x0398, 0x03a9, 0x03b4, 0x221e, 0x03c6, 0x03b5, 0x2229,
     0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248,
     0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0
-  }, 
+  },
   /* User mapping -- default to codes for direct font mapping */
-  {
+  [USER_MAP] = {
     0xf000, 0xf001, 0xf002, 0xf003, 0xf004, 0xf005, 0xf006, 0xf007,
     0xf008, 0xf009, 0xf00a, 0xf00b, 0xf00c, 0xf00d, 0xf00e, 0xf00f,
     0xf010, 0xf011, 0xf012, 0xf013, 0xf014, 0xf015, 0xf016, 0xf017,
@@ -184,7 +184,7 @@ static unsigned short translations[][256] = {
 
 #define MAX_GLYPH 512		/* Max possible glyph value */
 
-static int inv_translate[MAX_NR_CONSOLES];
+static enum translation_map inv_translate[MAX_NR_CONSOLES];
 
 #define UNI_DIRS	32U
 #define UNI_DIR_ROWS	32U
@@ -208,24 +208,25 @@ struct uni_pagedict {
 	u16		**uni_pgdir[UNI_DIRS];
 	unsigned long	refcount;
 	unsigned long	sum;
-	unsigned char	*inverse_translations[4];
+	unsigned char	*inverse_translations[LAST_MAP + 1];
 	u16		*inverse_trans_unicode;
 };
 
 static struct uni_pagedict *dflt;
 
-static void set_inverse_transl(struct vc_data *conp, struct uni_pagedict *p, int i)
+static void set_inverse_transl(struct vc_data *conp, struct uni_pagedict *p,
+	       enum translation_map m)
 {
 	int j, glyph;
-	unsigned short *t = translations[i];
+	unsigned short *t = translations[m];
 	unsigned char *q;
 	
 	if (!p)
 		return;
-	q = p->inverse_translations[i];
+	q = p->inverse_translations[m];
 
 	if (!q) {
-		q = p->inverse_translations[i] = kmalloc(MAX_GLYPH, GFP_KERNEL);
+		q = p->inverse_translations[m] = kmalloc(MAX_GLYPH, GFP_KERNEL);
 		if (!q)
 			return;
 	}
@@ -276,7 +277,7 @@ static void set_inverse_trans_unicode(struct vc_data *conp,
 	}
 }
 
-unsigned short *set_translate(int m, struct vc_data *vc)
+unsigned short *set_translate(enum translation_map m, struct vc_data *vc)
 {
 	inv_translate[vc->vc_num] = m;
 	return translations[m];
@@ -292,7 +293,7 @@ unsigned short *set_translate(int m, struct vc_data *vc)
 u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode)
 {
 	struct uni_pagedict *p;
-	int m;
+	enum translation_map m;
 
 	if (glyph >= MAX_GLYPH)
 		return 0;
@@ -669,8 +670,8 @@ int con_set_unimap(struct vc_data *vc, ushort ct, struct unipair __user *list)
 	if (con_unify_unimap(vc, p))
 		goto out_unlock;
 
-	for (i = 0; i <= 3; i++)
-		set_inverse_transl(vc, p, i); /* Update inverse translations */
+	for (enum translation_map m = FIRST_MAP; m <= LAST_MAP; m++)
+		set_inverse_transl(vc, p, m); /* Update inverse translations */
 	set_inverse_trans_unicode(vc, p);
 
 out_unlock:
@@ -731,8 +732,8 @@ int con_set_default_unimap(struct vc_data *vc)
 		return err;
 	}
 
-	for (i = 0; i <= 3; i++)
-		set_inverse_transl(vc, p, i);	/* Update all inverse translations */
+	for (enum translation_map m = FIRST_MAP; m <= LAST_MAP; m++)
+		set_inverse_transl(vc, p, m);	/* Update all inverse translations */
 	set_inverse_trans_unicode(vc, p);
 	dflt = p;
 	return err;
diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h
index 1ff2bf55eb85..c35db4896c37 100644
--- a/include/linux/consolemap.h
+++ b/include/linux/consolemap.h
@@ -7,10 +7,15 @@
 #ifndef __LINUX_CONSOLEMAP_H__
 #define __LINUX_CONSOLEMAP_H__
 
-#define LAT1_MAP 0
-#define GRAF_MAP 1
-#define IBMPC_MAP 2
-#define USER_MAP 3
+enum translation_map {
+	LAT1_MAP,
+	GRAF_MAP,
+	IBMPC_MAP,
+	USER_MAP,
+
+	FIRST_MAP = LAT1_MAP,
+	LAST_MAP = USER_MAP,
+};
 
 #include <linux/types.h>
 
@@ -18,7 +23,7 @@ struct vc_data;
 
 #ifdef CONFIG_CONSOLE_TRANSLATIONS
 u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode);
-unsigned short *set_translate(int m, struct vc_data *vc);
+unsigned short *set_translate(enum translation_map m, struct vc_data *vc);
 int conv_uni_to_pc(struct vc_data *conp, long ucs);
 u32 conv_8bit_to_uni(unsigned char c);
 int conv_uni_to_8bit(u32 uni);
@@ -30,7 +35,8 @@ static inline u16 inverse_translate(const struct vc_data *conp, u16 glyph,
 	return glyph;
 }
 
-static inline unsigned short *set_translate(int m, struct vc_data *vc)
+static inline unsigned short *set_translate(enum translation_map m,
+		struct vc_data *vc)
 {
 	return NULL;
 }
-- 
cgit 


From 8322b1f527159de578aab277629296575a11eb3c Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Mon, 6 Jun 2022 13:03:58 +0300
Subject: serial: Add uart_rs485_config()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A few serial drivers make a call to rs485_config() themselves (all
these seem to relate to init). Convert them all to use a common helper
which makes it easy to make adjustments on tasks related to it as
serial_rs485 struct sanitization is going to be added.

In pci_fintek_setup() (in 8250_pci.c), the rs485_config() call was made
with NULL, however, it can be changed to pass uart_port's rs485 struct.
No other callers should pass NULL into rs485_config() so the NULL check
can now be eliminated.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20220606100433.13793-2-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250_pci.c  | 6 ++----
 drivers/tty/serial/8250/8250_port.c | 2 +-
 drivers/tty/serial/fsl_lpuart.c     | 2 +-
 drivers/tty/serial/imx.c            | 2 +-
 drivers/tty/serial/serial_core.c    | 6 ++++++
 include/linux/serial_core.h         | 1 +
 6 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c
index a17619db7939..fb0a49e39072 100644
--- a/drivers/tty/serial/8250/8250_pci.c
+++ b/drivers/tty/serial/8250/8250_pci.c
@@ -1562,9 +1562,7 @@ static int pci_fintek_rs485_config(struct uart_port *port,
 
 	pci_read_config_byte(pci_dev, 0x40 + 8 * *index + 7, &setting);
 
-	if (!rs485)
-		rs485 = &port->rs485;
-	else if (rs485->flags & SER_RS485_ENABLED)
+	if (rs485->flags & SER_RS485_ENABLED)
 		memset(rs485->padding, 0, sizeof(rs485->padding));
 	else
 		memset(rs485, 0, sizeof(*rs485));
@@ -1689,7 +1687,7 @@ static int pci_fintek_init(struct pci_dev *dev)
 			 * pciserial_resume_ports()
 			 */
 			port = serial8250_get_port(priv->line[i]);
-			pci_fintek_rs485_config(&port->port, NULL);
+			uart_rs485_config(&port->port);
 		} else {
 			/* First init without port data
 			 * force init to RS232 Mode
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index 30e0aaf52adc..af550a4a27f8 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -3191,7 +3191,7 @@ static void serial8250_config_port(struct uart_port *port, int flags)
 		autoconfig(up);
 
 	if (port->rs485.flags & SER_RS485_ENABLED)
-		port->rs485_config(port, &port->rs485);
+		uart_rs485_config(port);
 
 	/* if access method is AU, it is a 16550 with a quirk */
 	if (port->type == PORT_16550A && port->iotype == UPIO_AU)
diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c
index 0d6e62f6bb07..509a7912fa9d 100644
--- a/drivers/tty/serial/fsl_lpuart.c
+++ b/drivers/tty/serial/fsl_lpuart.c
@@ -2724,7 +2724,7 @@ static int lpuart_probe(struct platform_device *pdev)
 	    sport->port.rs485.delay_rts_after_send)
 		dev_err(&pdev->dev, "driver doesn't support RTS delays\n");
 
-	sport->port.rs485_config(&sport->port, &sport->port.rs485);
+	uart_rs485_config(&sport->port);
 
 	ret = devm_request_irq(&pdev->dev, sport->port.irq, handler, 0,
 				DRIVER_NAME, sport);
diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c
index 30edb35a6a15..17fb9a57078b 100644
--- a/drivers/tty/serial/imx.c
+++ b/drivers/tty/serial/imx.c
@@ -2338,7 +2338,7 @@ static int imx_uart_probe(struct platform_device *pdev)
 		dev_err(&pdev->dev,
 			"low-active RTS not possible when receiver is off, enabling receiver\n");
 
-	imx_uart_rs485_config(&sport->port, &sport->port.rs485);
+	uart_rs485_config(&sport->port);
 
 	/* Disable interrupts before requesting them */
 	ucr1 = imx_uart_readl(sport, UCR1);
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 9a85b41caa0a..8466181db4e9 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -1276,6 +1276,12 @@ static int uart_get_icount(struct tty_struct *tty,
 	return 0;
 }
 
+int uart_rs485_config(struct uart_port *port)
+{
+	return port->rs485_config(port, &port->rs485);
+}
+EXPORT_SYMBOL_GPL(uart_rs485_config);
+
 static int uart_get_rs485_config(struct uart_port *port,
 			 struct serial_rs485 __user *rs485)
 {
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index cbd5070bc87f..d3ebb4db2d80 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -592,4 +592,5 @@ static inline int uart_handle_break(struct uart_port *port)
 					 !((cflag) & CLOCAL))
 
 int uart_get_rs485_mode(struct uart_port *port);
+int uart_rs485_config(struct uart_port *port);
 #endif /* LINUX_SERIAL_CORE_H */
-- 
cgit 


From 8925c31c1ac2f1e05da988581f2a70a2a8c4d638 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Mon, 6 Jun 2022 13:04:00 +0300
Subject: serial: Add rs485_supported to uart_port
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Preparing to move serial_rs485 struct sanitization into serial core,
each driver has to provide what fields/flags it supports. This
information is pointed into by rs485_supported.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20220606100433.13793-4-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250_core.c | 1 +
 include/linux/serial_core.h         | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
index b0320de3379c..90ddc8924811 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -1003,6 +1003,7 @@ int serial8250_register_8250_port(const struct uart_8250_port *up)
 		uart->port.throttle	= up->port.throttle;
 		uart->port.unthrottle	= up->port.unthrottle;
 		uart->port.rs485_config	= up->port.rs485_config;
+		uart->port.rs485_supported = up->port.rs485_supported;
 		uart->port.rs485	= up->port.rs485;
 		uart->rs485_start_tx	= up->rs485_start_tx;
 		uart->rs485_stop_tx	= up->rs485_stop_tx;
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index d3ebb4db2d80..5518b70177b3 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -254,6 +254,7 @@ struct uart_port {
 	struct attribute_group	*attr_group;		/* port specific attributes */
 	const struct attribute_group **tty_groups;	/* all attributes (serial core use only) */
 	struct serial_rs485     rs485;
+	const struct serial_rs485	*rs485_supported;	/* Supported mask for serial_rs485 */
 	struct gpio_desc	*rs485_term_gpio;	/* enable RS485 bus termination */
 	struct serial_iso7816   iso7816;
 	void			*private_data;		/* generic platform data pointer */
-- 
cgit 


From 6bb6fa6908ebd3cb4e14cd4f0ce272ec885d2eb0 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Mon, 6 Jun 2022 18:36:51 +0300
Subject: tty: Implement lookahead to process XON/XOFF timely
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When tty is not read from, XON/XOFF may get stuck into an
intermediate buffer. As those characters are there to do software
flow-control, it is not very useful. In the case where neither end
reads from ttys, the receiving ends might not be able receive the
XOFF characters and just keep sending more data to the opposite
direction. This problem is almost guaranteed to occur with DMA
which sends data in large chunks.

If TTY is slow to process characters, that is, eats less than given
amount in receive_buf, invoke lookahead for the rest of the chars
to process potential XON/XOFF characters.

We need to keep track of how many characters have been processed by the
lookahead to avoid processing the flow control char again on the normal
path. Bookkeeping occurs parallel on two layers (tty_buffer and n_tty)
to avoid passing the lookahead_count through the whole call chain.

When a flow-control char is processed, two things must occur:
  a) it must not be treated as normal char
  b) if not yet processed, flow-control actions need to be taken
The return value of n_tty_receive_char_flow_ctrl() tells caller a), and
b) is kept internal to n_tty_receive_char_flow_ctrl().

If characters were previous looked ahead, __receive_buf() makes two
calls to the appropriate n_tty_receive_buf_* function. First call is
made with lookahead_done=true for the characters that were subject to
lookahead earlier and then with lookahead=false for the new characters.
Either of the calls might be skipped when it has no characters to
handle.

Reported-by: Gilles Buloz <gilles.buloz@kontron.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20220606153652.63554-2-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/n_tty.c        | 91 ++++++++++++++++++++++++++++++++++++++--------
 drivers/tty/tty_buffer.c   | 59 +++++++++++++++++++++++++-----
 drivers/tty/tty_port.c     | 21 +++++++++++
 include/linux/tty_buffer.h |  1 +
 include/linux/tty_ldisc.h  | 14 +++++++
 include/linux/tty_port.h   |  2 +
 6 files changed, 163 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index 640c9e871044..8d80384df874 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -118,6 +118,9 @@ struct n_tty_data {
 	size_t read_tail;
 	size_t line_start;
 
+	/* # of chars looked ahead (to find software flow control chars) */
+	size_t lookahead_count;
+
 	/* protected by output lock */
 	unsigned int column;
 	unsigned int canon_column;
@@ -333,6 +336,8 @@ static void reset_buffer_flags(struct n_tty_data *ldata)
 	ldata->erasing = 0;
 	bitmap_zero(ldata->read_flags, N_TTY_BUF_SIZE);
 	ldata->push = 0;
+
+	ldata->lookahead_count = 0;
 }
 
 static void n_tty_packet_mode_flush(struct tty_struct *tty)
@@ -1225,12 +1230,30 @@ static bool n_tty_is_char_flow_ctrl(struct tty_struct *tty, unsigned char c)
 	return c == START_CHAR(tty) || c == STOP_CHAR(tty);
 }
 
-/* Returns true if c is consumed as flow-control character */
-static bool n_tty_receive_char_flow_ctrl(struct tty_struct *tty, unsigned char c)
+/**
+ * n_tty_receive_char_flow_ctrl - receive flow control chars
+ * @tty: terminal device
+ * @c: character
+ * @lookahead_done: lookahead has processed this character already
+ *
+ * Receive and process flow control character actions.
+ *
+ * In case lookahead for flow control chars already handled the character in
+ * advance to the normal receive, the actions are skipped during normal
+ * receive.
+ *
+ * Returns true if @c is consumed as flow-control character, the character
+ * must not be treated as normal character.
+ */
+static bool n_tty_receive_char_flow_ctrl(struct tty_struct *tty, unsigned char c,
+					 bool lookahead_done)
 {
 	if (!n_tty_is_char_flow_ctrl(tty, c))
 		return false;
 
+	if (lookahead_done)
+		return true;
+
 	if (c == START_CHAR(tty)) {
 		start_tty(tty);
 		process_echoes(tty);
@@ -1242,11 +1265,12 @@ static bool n_tty_receive_char_flow_ctrl(struct tty_struct *tty, unsigned char c
 	return true;
 }
 
-static void n_tty_receive_char_special(struct tty_struct *tty, unsigned char c)
+static void n_tty_receive_char_special(struct tty_struct *tty, unsigned char c,
+				       bool lookahead_done)
 {
 	struct n_tty_data *ldata = tty->disc_data;
 
-	if (I_IXON(tty) && n_tty_receive_char_flow_ctrl(tty, c))
+	if (I_IXON(tty) && n_tty_receive_char_flow_ctrl(tty, c, lookahead_done))
 		return;
 
 	if (L_ISIG(tty)) {
@@ -1401,7 +1425,8 @@ static void n_tty_receive_char(struct tty_struct *tty, unsigned char c)
 	put_tty_queue(c, ldata);
 }
 
-static void n_tty_receive_char_closing(struct tty_struct *tty, unsigned char c)
+static void n_tty_receive_char_closing(struct tty_struct *tty, unsigned char c,
+				       bool lookahead_done)
 {
 	if (I_ISTRIP(tty))
 		c &= 0x7f;
@@ -1409,9 +1434,12 @@ static void n_tty_receive_char_closing(struct tty_struct *tty, unsigned char c)
 		c = tolower(c);
 
 	if (I_IXON(tty)) {
-		if (c == STOP_CHAR(tty))
-			stop_tty(tty);
-		else if (c == START_CHAR(tty) ||
+		if (c == STOP_CHAR(tty)) {
+			if (!lookahead_done)
+				stop_tty(tty);
+		} else if (c == START_CHAR(tty) && lookahead_done) {
+			return;
+		} else if (c == START_CHAR(tty) ||
 			 (tty->flow.stopped && !tty->flow.tco_stopped && I_IXANY(tty) &&
 			  c != INTR_CHAR(tty) && c != QUIT_CHAR(tty) &&
 			  c != SUSP_CHAR(tty))) {
@@ -1457,6 +1485,27 @@ n_tty_receive_char_lnext(struct tty_struct *tty, unsigned char c, char flag)
 		n_tty_receive_char_flagged(tty, c, flag);
 }
 
+/* Caller must ensure count > 0 */
+static void n_tty_lookahead_flow_ctrl(struct tty_struct *tty, const unsigned char *cp,
+				      const unsigned char *fp, unsigned int count)
+{
+	struct n_tty_data *ldata = tty->disc_data;
+	unsigned char flag = TTY_NORMAL;
+
+	ldata->lookahead_count += count;
+
+	if (!I_IXON(tty))
+		return;
+
+	while (count--) {
+		if (fp)
+			flag = *fp++;
+		if (likely(flag == TTY_NORMAL))
+			n_tty_receive_char_flow_ctrl(tty, *cp, false);
+		cp++;
+	}
+}
+
 static void
 n_tty_receive_buf_real_raw(struct tty_struct *tty, const unsigned char *cp,
 			   const char *fp, int count)
@@ -1496,7 +1545,7 @@ n_tty_receive_buf_raw(struct tty_struct *tty, const unsigned char *cp,
 
 static void
 n_tty_receive_buf_closing(struct tty_struct *tty, const unsigned char *cp,
-			  const char *fp, int count)
+			  const char *fp, int count, bool lookahead_done)
 {
 	char flag = TTY_NORMAL;
 
@@ -1504,12 +1553,12 @@ n_tty_receive_buf_closing(struct tty_struct *tty, const unsigned char *cp,
 		if (fp)
 			flag = *fp++;
 		if (likely(flag == TTY_NORMAL))
-			n_tty_receive_char_closing(tty, *cp++);
+			n_tty_receive_char_closing(tty, *cp++, lookahead_done);
 	}
 }
 
 static void n_tty_receive_buf_standard(struct tty_struct *tty,
-		const unsigned char *cp, const char *fp, int count)
+		const unsigned char *cp, const char *fp, int count, bool lookahead_done)
 {
 	struct n_tty_data *ldata = tty->disc_data;
 	char flag = TTY_NORMAL;
@@ -1540,7 +1589,7 @@ static void n_tty_receive_buf_standard(struct tty_struct *tty,
 		}
 
 		if (test_bit(c, ldata->char_map))
-			n_tty_receive_char_special(tty, c);
+			n_tty_receive_char_special(tty, c, lookahead_done);
 		else
 			n_tty_receive_char(tty, c);
 	}
@@ -1551,21 +1600,30 @@ static void __receive_buf(struct tty_struct *tty, const unsigned char *cp,
 {
 	struct n_tty_data *ldata = tty->disc_data;
 	bool preops = I_ISTRIP(tty) || (I_IUCLC(tty) && L_IEXTEN(tty));
+	size_t la_count = min_t(size_t, ldata->lookahead_count, count);
 
 	if (ldata->real_raw)
 		n_tty_receive_buf_real_raw(tty, cp, fp, count);
 	else if (ldata->raw || (L_EXTPROC(tty) && !preops))
 		n_tty_receive_buf_raw(tty, cp, fp, count);
-	else if (tty->closing && !L_EXTPROC(tty))
-		n_tty_receive_buf_closing(tty, cp, fp, count);
-	else {
-		n_tty_receive_buf_standard(tty, cp, fp, count);
+	else if (tty->closing && !L_EXTPROC(tty)) {
+		if (la_count > 0)
+			n_tty_receive_buf_closing(tty, cp, fp, la_count, true);
+		if (count > la_count)
+			n_tty_receive_buf_closing(tty, cp, fp, count - la_count, false);
+	} else {
+		if (la_count > 0)
+			n_tty_receive_buf_standard(tty, cp, fp, la_count, true);
+		if (count > la_count)
+			n_tty_receive_buf_standard(tty, cp, fp, count - la_count, false);
 
 		flush_echoes(tty);
 		if (tty->ops->flush_chars)
 			tty->ops->flush_chars(tty);
 	}
 
+	ldata->lookahead_count -= la_count;
+
 	if (ldata->icanon && !L_EXTPROC(tty))
 		return;
 
@@ -2446,6 +2504,7 @@ static struct tty_ldisc_ops n_tty_ops = {
 	.receive_buf     = n_tty_receive_buf,
 	.write_wakeup    = n_tty_write_wakeup,
 	.receive_buf2	 = n_tty_receive_buf2,
+	.lookahead_buf	 = n_tty_lookahead_flow_ctrl,
 };
 
 /**
diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c
index bfa431a8e690..754fa43670cc 100644
--- a/drivers/tty/tty_buffer.c
+++ b/drivers/tty/tty_buffer.c
@@ -5,6 +5,7 @@
 
 #include <linux/types.h>
 #include <linux/errno.h>
+#include <linux/minmax.h>
 #include <linux/tty.h>
 #include <linux/tty_driver.h>
 #include <linux/tty_flip.h>
@@ -104,6 +105,7 @@ static void tty_buffer_reset(struct tty_buffer *p, size_t size)
 	p->size = size;
 	p->next = NULL;
 	p->commit = 0;
+	p->lookahead = 0;
 	p->read = 0;
 	p->flags = 0;
 }
@@ -234,6 +236,7 @@ void tty_buffer_flush(struct tty_struct *tty, struct tty_ldisc *ld)
 		buf->head = next;
 	}
 	buf->head->read = buf->head->commit;
+	buf->head->lookahead = buf->head->read;
 
 	if (ld && ld->ops->flush_buffer)
 		ld->ops->flush_buffer(tty);
@@ -276,13 +279,15 @@ static int __tty_buffer_request_room(struct tty_port *port, size_t size,
 		if (n != NULL) {
 			n->flags = flags;
 			buf->tail = n;
-			/* paired w/ acquire in flush_to_ldisc(); ensures
-			 * flush_to_ldisc() sees buffer data.
+			/*
+			 * Paired w/ acquire in flush_to_ldisc() and lookahead_bufs()
+			 * ensures they see all buffer data.
 			 */
 			smp_store_release(&b->commit, b->used);
-			/* paired w/ acquire in flush_to_ldisc(); ensures the
-			 * latest commit value can be read before the head is
-			 * advanced to the next buffer
+			/*
+			 * Paired w/ acquire in flush_to_ldisc() and lookahead_bufs()
+			 * ensures the latest commit value can be read before the head
+			 * is advanced to the next buffer.
 			 */
 			smp_store_release(&b->next, n);
 		} else if (change)
@@ -459,6 +464,40 @@ int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p,
 }
 EXPORT_SYMBOL_GPL(tty_ldisc_receive_buf);
 
+static void lookahead_bufs(struct tty_port *port, struct tty_buffer *head)
+{
+	head->lookahead = max(head->lookahead, head->read);
+
+	while (head) {
+		struct tty_buffer *next;
+		unsigned char *p, *f = NULL;
+		unsigned int count;
+
+		/*
+		 * Paired w/ release in __tty_buffer_request_room();
+		 * ensures commit value read is not stale if the head
+		 * is advancing to the next buffer.
+		 */
+		next = smp_load_acquire(&head->next);
+		/*
+		 * Paired w/ release in __tty_buffer_request_room() or in
+		 * tty_buffer_flush(); ensures we see the committed buffer data.
+		 */
+		count = smp_load_acquire(&head->commit) - head->lookahead;
+		if (!count) {
+			head = next;
+			continue;
+		}
+
+		p = char_buf_ptr(head, head->lookahead);
+		if (~head->flags & TTYB_NORMAL)
+			f = flag_buf_ptr(head, head->lookahead);
+
+		port->client_ops->lookahead_buf(port, p, f, count);
+		head->lookahead += count;
+	}
+}
+
 static int
 receive_buf(struct tty_port *port, struct tty_buffer *head, int count)
 {
@@ -496,7 +535,7 @@ static void flush_to_ldisc(struct work_struct *work)
 	while (1) {
 		struct tty_buffer *head = buf->head;
 		struct tty_buffer *next;
-		int count;
+		int count, rcvd;
 
 		/* Ldisc or user is trying to gain exclusive access */
 		if (atomic_read(&buf->priority))
@@ -519,10 +558,12 @@ static void flush_to_ldisc(struct work_struct *work)
 			continue;
 		}
 
-		count = receive_buf(port, head, count);
-		if (!count)
+		rcvd = receive_buf(port, head, count);
+		head->read += rcvd;
+		if (rcvd < count)
+			lookahead_bufs(port, head);
+		if (!rcvd)
 			break;
-		head->read += count;
 
 		if (need_resched())
 			cond_resched();
diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c
index 880608a65773..dce08a6d7b5e 100644
--- a/drivers/tty/tty_port.c
+++ b/drivers/tty/tty_port.c
@@ -43,6 +43,26 @@ static int tty_port_default_receive_buf(struct tty_port *port,
 	return ret;
 }
 
+static void tty_port_default_lookahead_buf(struct tty_port *port, const unsigned char *p,
+					   const unsigned char *f, unsigned int count)
+{
+	struct tty_struct *tty;
+	struct tty_ldisc *disc;
+
+	tty = READ_ONCE(port->itty);
+	if (!tty)
+		return;
+
+	disc = tty_ldisc_ref(tty);
+	if (!disc)
+		return;
+
+	if (disc->ops->lookahead_buf)
+		disc->ops->lookahead_buf(disc->tty, p, f, count);
+
+	tty_ldisc_deref(disc);
+}
+
 static void tty_port_default_wakeup(struct tty_port *port)
 {
 	struct tty_struct *tty = tty_port_tty_get(port);
@@ -55,6 +75,7 @@ static void tty_port_default_wakeup(struct tty_port *port)
 
 const struct tty_port_client_operations tty_port_default_client_ops = {
 	.receive_buf = tty_port_default_receive_buf,
+	.lookahead_buf = tty_port_default_lookahead_buf,
 	.write_wakeup = tty_port_default_wakeup,
 };
 EXPORT_SYMBOL_GPL(tty_port_default_client_ops);
diff --git a/include/linux/tty_buffer.h b/include/linux/tty_buffer.h
index 3b9d77604291..1796648c2907 100644
--- a/include/linux/tty_buffer.h
+++ b/include/linux/tty_buffer.h
@@ -15,6 +15,7 @@ struct tty_buffer {
 	int used;
 	int size;
 	int commit;
+	int lookahead;		/* Lazy update on recv, can become less than "read" */
 	int read;
 	int flags;
 	/* Data points here */
diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h
index e85002b56752..33678e1936f6 100644
--- a/include/linux/tty_ldisc.h
+++ b/include/linux/tty_ldisc.h
@@ -186,6 +186,18 @@ int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass,
  *	indicate all data received is %TTY_NORMAL. If assigned, prefer this
  *	function for automatic flow control.
  *
+ * @lookahead_buf: [DRV] ``void ()(struct tty_struct *tty,
+ *			const unsigned char *cp, const char *fp, int count)
+ *
+ *	This function is called by the low-level tty driver for characters
+ *	not eaten by ->receive_buf() or ->receive_buf2(). It is useful for
+ *	processing high-priority characters such as software flow-control
+ *	characters that could otherwise get stuck into the intermediate
+ *	buffer until tty has room to receive them. Ldisc must be able to
+ *	handle later a ->receive_buf() or ->receive_buf2() call for the
+ *	same characters (e.g. by skipping the actions for high-priority
+ *	characters already handled by ->lookahead_buf()).
+ *
  * @owner: module containting this ldisc (for reference counting)
  *
  * This structure defines the interface between the tty line discipline
@@ -229,6 +241,8 @@ struct tty_ldisc_ops {
 	void	(*dcd_change)(struct tty_struct *tty, unsigned int status);
 	int	(*receive_buf2)(struct tty_struct *tty, const unsigned char *cp,
 				const char *fp, int count);
+	void	(*lookahead_buf)(struct tty_struct *tty, const unsigned char *cp,
+				 const unsigned char *fp, unsigned int count);
 
 	struct  module *owner;
 };
diff --git a/include/linux/tty_port.h b/include/linux/tty_port.h
index 58e9619116b7..fa3c3bdaa234 100644
--- a/include/linux/tty_port.h
+++ b/include/linux/tty_port.h
@@ -40,6 +40,8 @@ struct tty_port_operations {
 
 struct tty_port_client_operations {
 	int (*receive_buf)(struct tty_port *port, const unsigned char *, const unsigned char *, size_t);
+	void (*lookahead_buf)(struct tty_port *port, const unsigned char *cp,
+			      const unsigned char *fp, unsigned int count);
 	void (*write_wakeup)(struct tty_port *port);
 };
 
-- 
cgit 


From bd3df9ff25b32b66630c283bb2e065e8bb822e72 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Date: Fri, 10 Jun 2022 11:35:47 +0300
Subject: ASoC: SOF: ipc4: Add set_core_state pm_ops implementation

IPC4 uses the SET_DX message to enable/disable cores managed by the DSP.
The dx_state.core_mask indicates which core is going to change state,
the dx_state.dx_mask is to power on (1) or off (0) the core.
In the dx_mask only those bits (cores) checked which bit is set in the
core_mask, other bits (cores) ignored.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Link: https://lore.kernel.org/r/20220610083549.16773-5-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sof/ipc4/header.h |  8 ++++++++
 sound/soc/sof/ipc4.c            | 26 ++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

(limited to 'include')

diff --git a/include/sound/sof/ipc4/header.h b/include/sound/sof/ipc4/header.h
index b8b8e5b5e3e1..a795deacc2ea 100644
--- a/include/sound/sof/ipc4/header.h
+++ b/include/sound/sof/ipc4/header.h
@@ -385,6 +385,14 @@ struct sof_ipc4_fw_version {
 	uint16_t build;
 } __packed;
 
+/* Payload data for SOF_IPC4_MOD_SET_DX */
+struct sof_ipc4_dx_state_info {
+	/* core(s) to apply the change */
+	uint32_t core_mask;
+	/* core state: 0: put core_id to D3; 1: put core_id to D0 */
+	uint32_t dx_mask;
+} __packed __aligned(4);
+
 /* Reply messages */
 
 /*
diff --git a/sound/soc/sof/ipc4.c b/sound/soc/sof/ipc4.c
index 658802c86685..b2cb92745ec6 100644
--- a/sound/soc/sof/ipc4.c
+++ b/sound/soc/sof/ipc4.c
@@ -597,10 +597,36 @@ static void sof_ipc4_rx_msg(struct snd_sof_dev *sdev)
 	}
 }
 
+static int sof_ipc4_set_core_state(struct snd_sof_dev *sdev, int core_idx, bool on)
+{
+	struct sof_ipc4_dx_state_info dx_state;
+	struct sof_ipc4_msg msg;
+
+	dx_state.core_mask = BIT(core_idx);
+	if (on)
+		dx_state.dx_mask = BIT(core_idx);
+	else
+		dx_state.dx_mask = 0;
+
+	msg.primary = SOF_IPC4_MSG_TYPE_SET(SOF_IPC4_MOD_SET_DX);
+	msg.primary |= SOF_IPC4_MSG_DIR(SOF_IPC4_MSG_REQUEST);
+	msg.primary |= SOF_IPC4_MSG_TARGET(SOF_IPC4_MODULE_MSG);
+	msg.extension = 0;
+	msg.data_ptr = &dx_state;
+	msg.data_size = sizeof(dx_state);
+
+	return sof_ipc4_tx_msg(sdev, &msg, msg.data_size, NULL, 0, false);
+}
+
+static const struct sof_ipc_pm_ops ipc4_pm_ops = {
+	.set_core_state = sof_ipc4_set_core_state,
+};
+
 const struct sof_ipc_ops ipc4_ops = {
 	.tx_msg = sof_ipc4_tx_msg,
 	.rx_msg = sof_ipc4_rx_msg,
 	.set_get_data = sof_ipc4_set_get_data,
 	.get_reply = sof_ipc4_get_reply,
+	.pm = &ipc4_pm_ops,
 	.fw_loader = &ipc4_loader_ops,
 };
-- 
cgit 


From bd10cd5ec54616a488d0bda695f78694ad79f779 Mon Sep 17 00:00:00 2001
From: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Date: Wed, 8 Jun 2022 20:26:21 -0700
Subject: ASoC: SOF: Add topology tokens for IPC4
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the required tokens for parsing the topology for IPC4.

Co-developed-by: Rander Wang <rander.wang@linux.intel.com>
Signed-off-by: Rander Wang <rander.wang@linux.intel.com>
Co-developed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Link: https://lore.kernel.org/r/20220609032643.916882-2-ranjani.sridharan@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/uapi/sound/sof/tokens.h | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'include')

diff --git a/include/uapi/sound/sof/tokens.h b/include/uapi/sound/sof/tokens.h
index b72fa385bebf..f7b2019065ad 100644
--- a/include/uapi/sound/sof/tokens.h
+++ b/include/uapi/sound/sof/tokens.h
@@ -52,11 +52,17 @@
 #define SOF_TKN_SCHED_FRAMES			204
 #define SOF_TKN_SCHED_TIME_DOMAIN		205
 #define SOF_TKN_SCHED_DYNAMIC_PIPELINE		206
+#define SOF_TKN_SCHED_LP_MODE			207
+#define SOF_TKN_SCHED_MEM_USAGE			208
 
 /* volume */
 #define SOF_TKN_VOLUME_RAMP_STEP_TYPE		250
 #define SOF_TKN_VOLUME_RAMP_STEP_MS		251
 
+#define SOF_TKN_GAIN_RAMP_TYPE			260
+#define SOF_TKN_GAIN_RAMP_DURATION		261
+#define SOF_TKN_GAIN_VAL			262
+
 /* SRC */
 #define SOF_TKN_SRC_RATE_IN			300
 #define SOF_TKN_SRC_RATE_OUT			301
@@ -79,6 +85,9 @@
  */
 #define SOF_TKN_COMP_CORE_ID			404
 #define SOF_TKN_COMP_UUID                       405
+#define SOF_TKN_COMP_CPC			406
+#define SOF_TKN_COMP_IS_PAGES			409
+#define SOF_TKN_COMP_NUM_AUDIO_FORMATS		410
 
 /* SSP */
 #define SOF_TKN_INTEL_SSP_CLKS_CONTROL		500
@@ -145,4 +154,35 @@
 #define SOF_TKN_MEDIATEK_AFE_CH			1601
 #define SOF_TKN_MEDIATEK_AFE_FORMAT		1602
 
+/* MIXER */
+#define SOF_TKN_MIXER_TYPE			1700
+
+/* CAVS AUDIO FORMAT */
+#define SOF_TKN_CAVS_AUDIO_FORMAT_IN_RATE	1900
+#define SOF_TKN_CAVS_AUDIO_FORMAT_IN_BIT_DEPTH	1901
+#define SOF_TKN_CAVS_AUDIO_FORMAT_IN_VALID_BIT	1902
+#define SOF_TKN_CAVS_AUDIO_FORMAT_IN_CHANNELS	1903
+#define SOF_TKN_CAVS_AUDIO_FORMAT_IN_CH_MAP	1904
+#define SOF_TKN_CAVS_AUDIO_FORMAT_IN_CH_CFG	1905
+#define SOF_TKN_CAVS_AUDIO_FORMAT_IN_INTERLEAVING_STYLE	1906
+#define SOF_TKN_CAVS_AUDIO_FORMAT_IN_FMT_CFG	1907
+#define SOF_TKN_CAVS_AUDIO_FORMAT_IN_SAMPLE_TYPE	1908
+/* intentional token numbering discontinuity, reserved for future use */
+#define SOF_TKN_CAVS_AUDIO_FORMAT_OUT_RATE	1930
+#define SOF_TKN_CAVS_AUDIO_FORMAT_OUT_BIT_DEPTH	1931
+#define SOF_TKN_CAVS_AUDIO_FORMAT_OUT_VALID_BIT	1932
+#define SOF_TKN_CAVS_AUDIO_FORMAT_OUT_CHANNELS	1933
+#define SOF_TKN_CAVS_AUDIO_FORMAT_OUT_CH_MAP	1934
+#define SOF_TKN_CAVS_AUDIO_FORMAT_OUT_CH_CFG	1935
+#define SOF_TKN_CAVS_AUDIO_FORMAT_OUT_INTERLEAVING_STYLE	1936
+#define SOF_TKN_CAVS_AUDIO_FORMAT_OUT_FMT_CFG	1937
+#define SOF_TKN_CAVS_AUDIO_FORMAT_OUT_SAMPLE_TYPE	1938
+/* intentional token numbering discontinuity, reserved for future use */
+#define SOF_TKN_CAVS_AUDIO_FORMAT_IBS		1970
+#define SOF_TKN_CAVS_AUDIO_FORMAT_OBS		1971
+#define SOF_TKN_CAVS_AUDIO_FORMAT_DMA_BUFFER_SIZE	1972
+
+/* COPIER */
+#define SOF_TKN_INTEL_COPIER_NODE_TYPE		1980
+
 #endif
-- 
cgit 


From bc433fd76faefb8484f5bc653d846043822a2d35 Mon Sep 17 00:00:00 2001
From: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Date: Wed, 8 Jun 2022 20:26:37 -0700
Subject: ASoC: SOF: Add ops_free
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the ops_free callback in struct sof_dev_desc.

Signed-off-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20220609032643.916882-18-ranjani.sridharan@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sof.h  | 1 +
 sound/soc/sof/core.c | 7 ++++++-
 sound/soc/sof/ops.h  | 6 ++++++
 3 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/sound/sof.h b/include/sound/sof.h
index 1a82a0db5e7f..367dccfea7ad 100644
--- a/include/sound/sof.h
+++ b/include/sound/sof.h
@@ -138,6 +138,7 @@ struct sof_dev_desc {
 
 	struct snd_sof_dsp_ops *ops;
 	int (*ops_init)(struct snd_sof_dev *sdev);
+	void (*ops_free)(struct snd_sof_dev *sdev);
 };
 
 int sof_dai_get_mclk(struct snd_soc_pcm_runtime *rtd);
diff --git a/sound/soc/sof/core.c b/sound/soc/sof/core.c
index 53719c04658f..c99b5e6c026c 100644
--- a/sound/soc/sof/core.c
+++ b/sound/soc/sof/core.c
@@ -189,7 +189,7 @@ static int sof_probe_continue(struct snd_sof_dev *sdev)
 	ret = snd_sof_probe(sdev);
 	if (ret < 0) {
 		dev_err(sdev->dev, "error: failed to probe DSP %d\n", ret);
-		return ret;
+		goto probe_err;
 	}
 
 	sof_set_fw_state(sdev, SOF_FW_BOOT_PREPARE);
@@ -317,6 +317,8 @@ dbg_err:
 	snd_sof_free_debug(sdev);
 dsp_err:
 	snd_sof_remove(sdev);
+probe_err:
+	sof_ops_free(sdev);
 
 	/* all resources freed, update state to match */
 	sof_set_fw_state(sdev, SOF_FW_BOOT_NOT_STARTED);
@@ -374,6 +376,7 @@ int snd_sof_device_probe(struct device *dev, struct snd_sof_pdata *plat_data)
 	    !sof_ops(sdev)->block_read || !sof_ops(sdev)->block_write ||
 	    !sof_ops(sdev)->send_msg || !sof_ops(sdev)->load_firmware ||
 	    !sof_ops(sdev)->ipc_msg_data) {
+		sof_ops_free(sdev);
 		dev_err(dev, "error: missing mandatory ops\n");
 		return -EINVAL;
 	}
@@ -457,6 +460,8 @@ int snd_sof_device_remove(struct device *dev)
 		snd_sof_remove(sdev);
 	}
 
+	sof_ops_free(sdev);
+
 	/* release firmware */
 	snd_sof_fw_unload(sdev);
 
diff --git a/sound/soc/sof/ops.h b/sound/soc/sof/ops.h
index b79ae4f66eba..55d43adb6a29 100644
--- a/sound/soc/sof/ops.h
+++ b/sound/soc/sof/ops.h
@@ -29,6 +29,12 @@ static inline int sof_ops_init(struct snd_sof_dev *sdev)
 	return 0;
 }
 
+static inline void sof_ops_free(struct snd_sof_dev *sdev)
+{
+	if (sdev->pdata->desc->ops_free)
+		sdev->pdata->desc->ops_free(sdev);
+}
+
 /* Mandatory operations are verified during probing */
 
 /* init */
-- 
cgit 


From 4453d24d10fdd9e40c84673e3eda7701055081ea Mon Sep 17 00:00:00 2001
From: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Date: Wed, 8 Jun 2022 20:26:39 -0700
Subject: ASoC: SOF: Add two new structures for topology manifest data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a couple of structures for parsing and saving the topology manifest
data.

Co-developed-by: Jaska Uimonen <jaska.uimonen@linux.intel.com>
Signed-off-by: Jaska Uimonen <jaska.uimonen@linux.intel.com>
Signed-off-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20220609032643.916882-20-ranjani.sridharan@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/uapi/sound/sof/abi.h    |  2 ++
 include/uapi/sound/sof/header.h | 30 ++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

(limited to 'include')

diff --git a/include/uapi/sound/sof/abi.h b/include/uapi/sound/sof/abi.h
index 0e7dccdc25fd..c88f467374ae 100644
--- a/include/uapi/sound/sof/abi.h
+++ b/include/uapi/sound/sof/abi.h
@@ -24,6 +24,8 @@
 #ifndef __INCLUDE_UAPI_SOUND_SOF_ABI_H__
 #define __INCLUDE_UAPI_SOUND_SOF_ABI_H__
 
+#include <linux/types.h>
+
 /* SOF ABI version major, minor and patch numbers */
 #define SOF_ABI_MAJOR 3
 #define SOF_ABI_MINOR 21
diff --git a/include/uapi/sound/sof/header.h b/include/uapi/sound/sof/header.h
index 5f4518e7a972..f125f7772ee7 100644
--- a/include/uapi/sound/sof/header.h
+++ b/include/uapi/sound/sof/header.h
@@ -26,4 +26,34 @@ struct sof_abi_hdr {
 	__u32 data[0];		/**< Component data - opaque to core */
 }  __packed;
 
+#define SOF_MANIFEST_DATA_TYPE_NHLT 1
+
+/**
+ * struct sof_manifest_tlv - SOF manifest TLV data
+ * @type: type of data
+ * @size: data size (not including the size of this struct)
+ * @data: payload data
+ */
+struct sof_manifest_tlv {
+	__le32 type;
+	__le32 size;
+	__u8 data[];
+};
+
+/**
+ * struct sof_manifest - SOF topology manifest
+ * @abi_major: Major ABI version
+ * @abi_minor: Minor ABI version
+ * @abi_patch: ABI patch
+ * @count: count of tlv items
+ * @items: consecutive variable size tlv items
+ */
+struct sof_manifest {
+	__le16 abi_major;
+	__le16 abi_minor;
+	__le16 abi_patch;
+	__le16 count;
+	struct sof_manifest_tlv items[];
+};
+
 #endif
-- 
cgit 


From df4d27b19b892f464685ea45fa6132dd1a2b6864 Mon Sep 17 00:00:00 2001
From: Martin Povišer <povik+lin@cutebit.org>
Date: Mon, 6 Jun 2022 21:19:09 +0200
Subject: ASoC: Introduce 'fixup_controls' card method
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The new method is called just before the card is registered, providing
an opportune time for machine-level drivers to do some final controls
amending: deactivating individual controls or obtaining control
references for later use.

Some controls can be created by DAPM after 'late_probe' has been called,
hence the need for this new method.

Signed-off-by: Martin Povišer <povik+lin@cutebit.org>
Link: https://lore.kernel.org/r/20220606191910.16580-5-povik+lin@cutebit.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-card.h | 1 +
 include/sound/soc.h      | 1 +
 sound/soc/soc-card.c     | 6 ++++++
 sound/soc/soc-core.c     | 1 +
 4 files changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/sound/soc-card.h b/include/sound/soc-card.h
index df08573bd80c..9d31a5c0db33 100644
--- a/include/sound/soc-card.h
+++ b/include/sound/soc-card.h
@@ -29,6 +29,7 @@ int snd_soc_card_resume_post(struct snd_soc_card *card);
 
 int snd_soc_card_probe(struct snd_soc_card *card);
 int snd_soc_card_late_probe(struct snd_soc_card *card);
+void snd_soc_card_fixup_controls(struct snd_soc_card *card);
 int snd_soc_card_remove(struct snd_soc_card *card);
 
 int snd_soc_card_set_bias_level(struct snd_soc_card *card,
diff --git a/include/sound/soc.h b/include/sound/soc.h
index f20f5f890794..faef85e37e32 100644
--- a/include/sound/soc.h
+++ b/include/sound/soc.h
@@ -916,6 +916,7 @@ struct snd_soc_card {
 
 	int (*probe)(struct snd_soc_card *card);
 	int (*late_probe)(struct snd_soc_card *card);
+	void (*fixup_controls)(struct snd_soc_card *card);
 	int (*remove)(struct snd_soc_card *card);
 
 	/* the pre and post PM functions are used to do any PM work before and
diff --git a/sound/soc/soc-card.c b/sound/soc/soc-card.c
index 4158f5aacfd3..285ab4c9c716 100644
--- a/sound/soc/soc-card.c
+++ b/sound/soc/soc-card.c
@@ -197,6 +197,12 @@ int snd_soc_card_late_probe(struct snd_soc_card *card)
 	return 0;
 }
 
+void snd_soc_card_fixup_controls(struct snd_soc_card *card)
+{
+	if (card->fixup_controls)
+		card->fixup_controls(card);
+}
+
 int snd_soc_card_remove(struct snd_soc_card *card)
 {
 	int ret = 0;
diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c
index 227540851ded..57f7105c12b7 100644
--- a/sound/soc/soc-core.c
+++ b/sound/soc/soc-core.c
@@ -2066,6 +2066,7 @@ static int snd_soc_bind_card(struct snd_soc_card *card)
 		goto probe_end;
 
 	snd_soc_dapm_new_widgets(card);
+	snd_soc_card_fixup_controls(card);
 
 	ret = snd_card_register(card->snd_card);
 	if (ret < 0) {
-- 
cgit 


From 67b9d64139e13621d3ab8bb0daad7602e5fe0778 Mon Sep 17 00:00:00 2001
From: David Jander <david@protonic.nl>
Date: Thu, 9 Jun 2022 14:13:34 +0200
Subject: spi: Fix per-cpu stats access on 32 bit systems
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On 32 bit systems, the following kernel BUG is hit:

BUG: using smp_processor_id() in preemptible [00000000] code: swapper/0/1
caller is debug_smp_processor_id+0x18/0x24
CPU: 1 PID: 1 Comm: swapper/0 Not tainted 5.19.0-rc1-00001-g6ae0aec8a366 #181
Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree)
Backtrace:
 dump_backtrace from show_stack+0x20/0x24
 r7:81024ffd r6:00000000 r5:81024ffd r4:60000013
 show_stack from dump_stack_lvl+0x60/0x78
 dump_stack_lvl from dump_stack+0x14/0x1c
 r7:81024ffd r6:80f652de r5:80bec180 r4:819a2500
 dump_stack from check_preemption_disabled+0xc8/0xf0
 check_preemption_disabled from debug_smp_processor_id+0x18/0x24
 r8:8119b7e0 r7:81205534 r6:819f5c00 r5:819f4c00 r4:c083d724
 debug_smp_processor_id from __spi_sync+0x78/0x220
 __spi_sync from spi_sync+0x34/0x4c
 r10:bb7bf4e0 r9:c083d724 r8:00000007 r7:81a068c0 r6:822a83c0 r5:c083d724
 r4:819f4c00
 spi_sync from spi_mem_exec_op+0x338/0x370
 r5:000000b4 r4:c083d910
 spi_mem_exec_op from spi_nor_read_id+0x98/0xdc
 r10:bb7bf4e0 r9:00000000 r8:00000000 r7:00000000 r6:00000000 r5:82358040
 r4:819f7c40
 spi_nor_read_id from spi_nor_detect+0x38/0x114
 r7:82358040 r6:00000000 r5:819f7c40 r4:819f7c40
 spi_nor_detect from spi_nor_scan+0x11c/0xbec
 r10:bb7bf4e0 r9:00000000 r8:00000000 r7:c083da4c r6:00000000 r5:00010101
 r4:819f7c40
 spi_nor_scan from spi_nor_probe+0x10c/0x2d0
 r10:bb7bf4e0 r9:bb7bf4d0 r8:00000000 r7:819f4c00 r6:00000000 r5:00000000
 r4:819f7c40

per-cpu access needs to be guarded against preemption.

Fixes: 6598b91b5ac3 ("spi: spi.c: Convert statistics to per-cpu u64_stats_t")
Reported-by: Marc Kleine-Budde <mkl@pengutronix.de>
Signed-off-by: David Jander <david@protonic.nl>
Tested-by: Nícolas F. R. A. Prado <nfraprado@collabora.com>
Link: https://lore.kernel.org/r/20220609121334.2984808-1-david@protonic.nl
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi.c       |  5 ++++-
 include/linux/spi/spi.h | 10 ++++++++--
 2 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index d94822bf3cec..ac61824b87b5 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -314,11 +314,13 @@ static void spi_statistics_add_transfer_stats(struct spi_statistics *pcpu_stats,
 					      struct spi_controller *ctlr)
 {
 	int l2len = min(fls(xfer->len), SPI_STATISTICS_HISTO_SIZE) - 1;
-	struct spi_statistics *stats = this_cpu_ptr(pcpu_stats);
+	struct spi_statistics *stats;
 
 	if (l2len < 0)
 		l2len = 0;
 
+	get_cpu();
+	stats = this_cpu_ptr(pcpu_stats);
 	u64_stats_update_begin(&stats->syncp);
 
 	u64_stats_inc(&stats->transfers);
@@ -333,6 +335,7 @@ static void spi_statistics_add_transfer_stats(struct spi_statistics *pcpu_stats,
 		u64_stats_add(&stats->bytes_rx, xfer->len);
 
 	u64_stats_update_end(&stats->syncp);
+	put_cpu();
 }
 
 /*
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 2e63b4935deb..c96f526d9a20 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -84,18 +84,24 @@ struct spi_statistics {
 
 #define SPI_STATISTICS_ADD_TO_FIELD(pcpu_stats, field, count)		\
 	do {								\
-		struct spi_statistics *__lstats = this_cpu_ptr(pcpu_stats); \
+		struct spi_statistics *__lstats;			\
+		get_cpu();						\
+		__lstats = this_cpu_ptr(pcpu_stats);			\
 		u64_stats_update_begin(&__lstats->syncp);		\
 		u64_stats_add(&__lstats->field, count);			\
 		u64_stats_update_end(&__lstats->syncp);			\
+		put_cpu();						\
 	} while (0)
 
 #define SPI_STATISTICS_INCREMENT_FIELD(pcpu_stats, field)		\
 	do {								\
-		struct spi_statistics *__lstats = this_cpu_ptr(pcpu_stats); \
+		struct spi_statistics *__lstats;			\
+		get_cpu();						\
+		__lstats = this_cpu_ptr(pcpu_stats);			\
 		u64_stats_update_begin(&__lstats->syncp);		\
 		u64_stats_inc(&__lstats->field);			\
 		u64_stats_update_end(&__lstats->syncp);			\
+		put_cpu();						\
 	} while (0)
 
 /**
-- 
cgit 


From 20646f5b1e798bcc20044ae90ac3702f177bf254 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Jun 2022 17:23:45 +0200
Subject: netfilter: xtables: Bring SPDX identifier back

Commit e2be04c7f995 ("License cleanup: add SPDX license identifier to
uapi header files with a license") added the correct SPDX identifier to
include/uapi/linux/netfilter/xt_IDLETIMER.h.

A subsequent commit removed it for no reason and reintroduced the UAPI
license incorrectness as the file is now missing the UAPI exception
again.

Add it back and remove the GPLv2 boilerplate while at it.

Fixes: 68983a354a65 ("netfilter: xtables: Add snapshot of hardidletimer target")
Cc: Manoj Basapathi <manojbm@codeaurora.org>
Cc: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Cc: netfilter-devel@vger.kernel.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/netfilter/xt_IDLETIMER.h | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/xt_IDLETIMER.h b/include/uapi/linux/netfilter/xt_IDLETIMER.h
index 49ddcdc61c09..7bfb31a66fc9 100644
--- a/include/uapi/linux/netfilter/xt_IDLETIMER.h
+++ b/include/uapi/linux/netfilter/xt_IDLETIMER.h
@@ -1,6 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */
 /*
- * linux/include/linux/netfilter/xt_IDLETIMER.h
- *
  * Header file for Xtables timer target module.
  *
  * Copyright (C) 2004, 2010 Nokia Corporation
@@ -10,20 +9,6 @@
  * by Luciano Coelho <luciano.coelho@nokia.com>
  *
  * Contact: Luciano Coelho <luciano.coelho@nokia.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
- * 02110-1301 USA
  */
 
 #ifndef _XT_IDLETIMER_H
-- 
cgit 


From a6546f89eac9bfad4b50a7f8ccfbefe0ae01e7df Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 7 Jun 2022 16:11:10 +0200
Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX -
 gpl-2.0_8.RULE

Based on the normalized pattern:

    this program is free software you can redistribute it and/or modify it
    under the terms of the gnu general public license version 2 as
    published by the free software foundation

extracted by the scancode license scanner the SPDX license identifier

    GPL-2.0-only

has been chosen to replace the boilerplate/reference.

Reviewed-by: Allison Randal <allison@lohutok.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/driver-api/vfio-mediated-device.rst | 4 +---
 arch/arm/boot/bootp/bootp.lds                     | 5 +----
 include/linux/input/elan-i2c-ids.h                | 5 +----
 3 files changed, 3 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst
index eded8719180f..66bd00d7aecf 100644
--- a/Documentation/driver-api/vfio-mediated-device.rst
+++ b/Documentation/driver-api/vfio-mediated-device.rst
@@ -1,3 +1,4 @@
+.. SPDX-License-Identifier: GPL-2.0-only
 .. include:: <isonum.txt>
 
 =====================
@@ -8,9 +9,6 @@ VFIO Mediated devices
 :Author: Neo Jia <cjia@nvidia.com>
 :Author: Kirti Wankhede <kwankhede@nvidia.com>
 
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License version 2 as
-published by the Free Software Foundation.
 
 
 Virtual Function I/O (VFIO) Mediated devices[1]
diff --git a/arch/arm/boot/bootp/bootp.lds b/arch/arm/boot/bootp/bootp.lds
index fc54394f4340..160128186bf8 100644
--- a/arch/arm/boot/bootp/bootp.lds
+++ b/arch/arm/boot/bootp/bootp.lds
@@ -1,11 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  *  linux/arch/arm/boot/bootp/bootp.lds
  *
  *  Copyright (C) 2000-2002 Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 OUTPUT_ARCH(arm)
 ENTRY(_start)
diff --git a/include/linux/input/elan-i2c-ids.h b/include/linux/input/elan-i2c-ids.h
index 520858d12680..51cca17ee94c 100644
--- a/include/linux/input/elan-i2c-ids.h
+++ b/include/linux/input/elan-i2c-ids.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Elan I2C/SMBus Touchpad device whitelist
  *
@@ -11,10 +12,6 @@
  * copyright (c) 2011-2012 Cypress Semiconductor, Inc.
  * copyright (c) 2011-2012 Google, Inc.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
  * Trademarks are the property of their respective owners.
  */
 
-- 
cgit 


From 2aec85b26f39fa9e036c5872950c0ef9b479a1ec Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 7 Jun 2022 16:11:13 +0200
Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX -
 gpl-2.0_30.RULE (part 2)

Based on the normalized pattern:

    this program is free software you can redistribute it and/or modify it
    under the terms of the gnu general public license as published by the
    free software foundation version 2  this program is distributed as is
    without any warranty of any kind whether express or implied without
    even the implied warranty of merchantability or fitness for a
    particular purpose see the gnu general public license for more details

extracted by the scancode license scanner the SPDX license identifier

    GPL-2.0-only

has been chosen to replace the boilerplate/reference.

Reviewed-by: Allison Randal <allison@lohutok.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/dma/bcm-sba-raid.c                       | 14 ++------------
 drivers/dma/ti/edma.c                            | 10 +---------
 drivers/gpio/gpio-bcm-kona.c                     | 10 +---------
 drivers/gpio/gpio-brcmstb.c                      | 14 ++------------
 drivers/i2c/busses/i2c-bcm-iproc.c               | 14 ++------------
 drivers/i2c/busses/i2c-bcm-kona.c                | 14 ++------------
 drivers/i2c/busses/i2c-brcmstb.c                 | 14 ++------------
 drivers/input/keyboard/bcm-keypad.c              | 14 ++------------
 drivers/input/misc/gpio_decoder.c                | 10 +---------
 drivers/input/touchscreen/bcm_iproc_tsc.c        |  9 +--------
 drivers/irqchip/irq-keystone.c                   | 10 +---------
 drivers/mailbox/bcm-flexrm-mailbox.c             | 14 ++------------
 drivers/media/i2c/adv7343_regs.h                 | 10 +---------
 drivers/media/i2c/adv7393_regs.h                 | 10 +---------
 drivers/media/platform/ti/davinci/vpif.h         | 11 +----------
 drivers/media/platform/ti/davinci/vpif_display.h | 10 +---------
 drivers/memory/ti-emif-sram-pm.S                 | 10 +---------
 drivers/mfd/lp873x.c                             | 10 +---------
 drivers/mfd/tps65217.c                           | 10 +---------
 drivers/mmc/host/sdhci-bcm-kona.c                | 14 ++------------
 drivers/mmc/host/sdhci-iproc.c                   | 14 ++------------
 drivers/net/can/ti_hecc.c                        | 11 +----------
 drivers/nvmem/bcm-ocotp.c                        | 14 ++------------
 drivers/phy/broadcom/phy-bcm-cygnus-pcie.c       | 14 ++------------
 drivers/phy/broadcom/phy-bcm-ns2-pcie.c          | 14 ++------------
 drivers/phy/broadcom/phy-bcm-ns2-usbdrd.c        | 14 ++------------
 drivers/phy/motorola/phy-cpcap-usb.c             | 10 +---------
 drivers/phy/ti/phy-dm816x-usb.c                  | 11 +----------
 drivers/pinctrl/bcm/pinctrl-bcm281xx.c           | 14 ++------------
 drivers/pinctrl/bcm/pinctrl-cygnus-mux.c         | 14 ++------------
 drivers/pinctrl/bcm/pinctrl-ns2-mux.c            | 10 +---------
 drivers/pinctrl/bcm/pinctrl-nsp-gpio.c           | 14 ++------------
 drivers/pinctrl/bcm/pinctrl-nsp-mux.c            | 10 +---------
 drivers/power/reset/brcm-kona-reset.c            | 14 ++------------
 drivers/power/reset/brcmstb-reboot.c             | 14 ++------------
 drivers/ptp/ptp_dte.c                            | 14 ++------------
 drivers/pwm/pwm-bcm-iproc.c                      | 14 ++------------
 drivers/pwm/pwm-bcm-kona.c                       | 14 ++------------
 drivers/regulator/cpcap-regulator.c              | 10 +---------
 drivers/regulator/isl6271a-regulator.c           | 10 +---------
 drivers/regulator/tps65023-regulator.c           | 10 +---------
 drivers/regulator/tps6507x-regulator.c           | 10 +---------
 drivers/regulator/tps65217-regulator.c           | 10 +---------
 drivers/soc/ti/knav_dma.c                        | 10 +---------
 drivers/uio/uio_pruss.c                          | 10 +---------
 drivers/video/backlight/tps65217_bl.c            | 10 +---------
 include/dt-bindings/clock/bcm21664.h             | 10 +---------
 include/dt-bindings/clock/bcm281xx.h             | 10 +---------
 include/linux/mfd/lp873x.h                       | 10 +---------
 include/linux/mfd/tps65217.h                     | 10 +---------
 include/linux/platform_data/davinci_asp.h        | 10 +---------
 include/linux/platform_data/gpio-davinci.h       | 10 +---------
 include/linux/platform_data/uio_dmem_genirq.h    | 10 +---------
 include/linux/platform_data/uio_pruss.h          | 10 +---------
 include/linux/reset/bcm63xx_pmb.h                | 10 +---------
 include/linux/soc/ti/knav_dma.h                  | 10 +---------
 include/linux/soc/ti/knav_qmss.h                 | 10 +---------
 include/linux/sram.h                             | 14 ++------------
 include/linux/ti-emif-sram.h                     | 10 +---------
 include/linux/wkup_m3_ipc.h                      | 10 +---------
 include/media/i2c/adv7343.h                      | 10 +---------
 include/media/i2c/adv7393.h                      | 10 +---------
 net/hsr/hsr_debugfs.c                            | 10 +---------
 sound/soc/bcm/cygnus-pcm.c                       | 14 ++------------
 sound/soc/bcm/cygnus-ssp.c                       | 14 ++------------
 sound/soc/bcm/cygnus-ssp.h                       | 14 ++------------
 66 files changed, 91 insertions(+), 671 deletions(-)

(limited to 'include')

diff --git a/drivers/dma/bcm-sba-raid.c b/drivers/dma/bcm-sba-raid.c
index 64239da02e74..064761289a73 100644
--- a/drivers/dma/bcm-sba-raid.c
+++ b/drivers/dma/bcm-sba-raid.c
@@ -1,15 +1,5 @@
-/*
- * Copyright (C) 2017 Broadcom
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2017 Broadcom
 
 /*
  * Broadcom SBA RAID Driver
diff --git a/drivers/dma/ti/edma.c b/drivers/dma/ti/edma.c
index 3ea8ef7f57df..4cbca80ee16e 100644
--- a/drivers/dma/ti/edma.c
+++ b/drivers/dma/ti/edma.c
@@ -1,16 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * TI EDMA DMA engine driver
  *
  * Copyright 2012 Texas Instruments
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/dmaengine.h>
diff --git a/drivers/gpio/gpio-bcm-kona.c b/drivers/gpio/gpio-bcm-kona.c
index e84474494429..70770429ba48 100644
--- a/drivers/gpio/gpio-bcm-kona.c
+++ b/drivers/gpio/gpio-bcm-kona.c
@@ -1,17 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Broadcom Kona GPIO Driver
  *
  * Author: Broadcom Corporation <bcm-kernel-feedback-list@broadcom.com>
  * Copyright (C) 2012-2014 Broadcom Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/bitops.h>
diff --git a/drivers/gpio/gpio-brcmstb.c b/drivers/gpio/gpio-brcmstb.c
index 6b7439b44690..85429dd6b3b6 100644
--- a/drivers/gpio/gpio-brcmstb.c
+++ b/drivers/gpio/gpio-brcmstb.c
@@ -1,15 +1,5 @@
-/*
- * Copyright (C) 2015-2017 Broadcom
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2015-2017 Broadcom
 
 #include <linux/bitops.h>
 #include <linux/gpio/driver.h>
diff --git a/drivers/i2c/busses/i2c-bcm-iproc.c b/drivers/i2c/busses/i2c-bcm-iproc.c
index 6304d1dd2dd6..85d8a6b04885 100644
--- a/drivers/i2c/busses/i2c-bcm-iproc.c
+++ b/drivers/i2c/busses/i2c-bcm-iproc.c
@@ -1,15 +1,5 @@
-/*
- * Copyright (C) 2014 Broadcom Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2014 Broadcom Corporation
 
 #include <linux/delay.h>
 #include <linux/i2c.h>
diff --git a/drivers/i2c/busses/i2c-bcm-kona.c b/drivers/i2c/busses/i2c-bcm-kona.c
index 8e350f20cde0..16bf41f1f086 100644
--- a/drivers/i2c/busses/i2c-bcm-kona.c
+++ b/drivers/i2c/busses/i2c-bcm-kona.c
@@ -1,15 +1,5 @@
-/*
- * Copyright (C) 2013 Broadcom Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2013 Broadcom Corporation
 
 #include <linux/device.h>
 #include <linux/kernel.h>
diff --git a/drivers/i2c/busses/i2c-brcmstb.c b/drivers/i2c/busses/i2c-brcmstb.c
index b00f35c0b066..ff63ed5ec961 100644
--- a/drivers/i2c/busses/i2c-brcmstb.c
+++ b/drivers/i2c/busses/i2c-brcmstb.c
@@ -1,15 +1,5 @@
-/*
- * Copyright (C) 2014 Broadcom Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2014 Broadcom Corporation
 
 #include <linux/clk.h>
 #include <linux/delay.h>
diff --git a/drivers/input/keyboard/bcm-keypad.c b/drivers/input/keyboard/bcm-keypad.c
index 166d6023a538..56a919ec23b5 100644
--- a/drivers/input/keyboard/bcm-keypad.c
+++ b/drivers/input/keyboard/bcm-keypad.c
@@ -1,15 +1,5 @@
-/*
- * Copyright (C) 2014 Broadcom Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2014 Broadcom Corporation
 
 #include <linux/bitops.h>
 #include <linux/clk.h>
diff --git a/drivers/input/misc/gpio_decoder.c b/drivers/input/misc/gpio_decoder.c
index 145826a1a9a1..ee668eba302f 100644
--- a/drivers/input/misc/gpio_decoder.c
+++ b/drivers/input/misc/gpio_decoder.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2016 Texas Instruments Incorporated - http://www.ti.com/
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
  * A generic driver to read multiple gpio lines and translate the
  * encoded numeric value into an input event.
  */
diff --git a/drivers/input/touchscreen/bcm_iproc_tsc.c b/drivers/input/touchscreen/bcm_iproc_tsc.c
index 7de1fd24ce36..35e2fe9911a4 100644
--- a/drivers/input/touchscreen/bcm_iproc_tsc.c
+++ b/drivers/input/touchscreen/bcm_iproc_tsc.c
@@ -1,14 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
 * Copyright (C) 2015 Broadcom Corporation
 *
-* This program is free software; you can redistribute it and/or
-* modify it under the terms of the GNU General Public License as
-* published by the Free Software Foundation version 2.
-*
-* This program is distributed "as is" WITHOUT ANY WARRANTY of any
-* kind, whether express or implied; without even the implied warranty
-* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-* GNU General Public License for more details.
 */
 #include <linux/module.h>
 #include <linux/init.h>
diff --git a/drivers/irqchip/irq-keystone.c b/drivers/irqchip/irq-keystone.c
index d47c8041e5bc..ba9792e60329 100644
--- a/drivers/irqchip/irq-keystone.c
+++ b/drivers/irqchip/irq-keystone.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Texas Instruments Keystone IRQ controller IP driver
  *
  * Copyright (C) 2014 Texas Instruments, Inc.
  * Author: Sajesh Kumar Saran <sajesh@ti.com>
  *	   Grygorii Strashko <grygorii.strashko@ti.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/irq.h>
diff --git a/drivers/mailbox/bcm-flexrm-mailbox.c b/drivers/mailbox/bcm-flexrm-mailbox.c
index 22acb51531cb..fda16f76401e 100644
--- a/drivers/mailbox/bcm-flexrm-mailbox.c
+++ b/drivers/mailbox/bcm-flexrm-mailbox.c
@@ -1,15 +1,5 @@
-/*
- * Copyright (C) 2017 Broadcom
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2017 Broadcom
 
 /*
  * Broadcom FlexRM Mailbox Driver
diff --git a/drivers/media/i2c/adv7343_regs.h b/drivers/media/i2c/adv7343_regs.h
index 2f04ce4b9118..e0357e6272e3 100644
--- a/drivers/media/i2c/adv7343_regs.h
+++ b/drivers/media/i2c/adv7343_regs.h
@@ -1,16 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * ADV7343 encoder related structure and register definitions
  *
  * Copyright (C) 2009 Texas Instruments Incorporated - http://www.ti.com/
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed .as is. WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #ifndef ADV7343_REGS_H
diff --git a/drivers/media/i2c/adv7393_regs.h b/drivers/media/i2c/adv7393_regs.h
index 78968330f0be..6eb8732b5324 100644
--- a/drivers/media/i2c/adv7393_regs.h
+++ b/drivers/media/i2c/adv7393_regs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * ADV7393 encoder related structure and register definitions
  *
@@ -7,15 +8,6 @@
  * Based on ADV7343 driver,
  *
  * Copyright (C) 2009 Texas Instruments Incorporated - http://www.ti.com/
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed .as is. WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #ifndef ADV7393_REGS_H
diff --git a/drivers/media/platform/ti/davinci/vpif.h b/drivers/media/platform/ti/davinci/vpif.h
index c6d1d890478a..651943e3e375 100644
--- a/drivers/media/platform/ti/davinci/vpif.h
+++ b/drivers/media/platform/ti/davinci/vpif.h
@@ -1,16 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * VPIF header file
  *
  * Copyright (C) 2009 Texas Instruments Incorporated - https://www.ti.com/
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed .as is. WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #ifndef VPIF_H
@@ -685,4 +677,3 @@ struct vpif_params {
 };
 
 #endif				/* End of #ifndef VPIF_H */
-
diff --git a/drivers/media/platform/ti/davinci/vpif_display.h b/drivers/media/platform/ti/davinci/vpif_display.h
index f98062e79167..f27474e0fc36 100644
--- a/drivers/media/platform/ti/davinci/vpif_display.h
+++ b/drivers/media/platform/ti/davinci/vpif_display.h
@@ -1,16 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * VPIF display header file
  *
  * Copyright (C) 2009 Texas Instruments Incorporated - https://www.ti.com/
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed .as is. WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #ifndef VPIF_DISPLAY_H
diff --git a/drivers/memory/ti-emif-sram-pm.S b/drivers/memory/ti-emif-sram-pm.S
index d1c83bd5b98e..9bcac35c3304 100644
--- a/drivers/memory/ti-emif-sram-pm.S
+++ b/drivers/memory/ti-emif-sram-pm.S
@@ -1,17 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Low level PM code for TI EMIF
  *
  * Copyright (C) 2016-2017 Texas Instruments Incorporated - http://www.ti.com/
  *	Dave Gerlach
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/linkage.h>
diff --git a/drivers/mfd/lp873x.c b/drivers/mfd/lp873x.c
index 858c9e0a49a4..b6166dec492d 100644
--- a/drivers/mfd/lp873x.c
+++ b/drivers/mfd/lp873x.c
@@ -1,16 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2016 Texas Instruments Incorporated - https://www.ti.com/
  *
  * Author: Keerthy <j-keerthy@ti.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/interrupt.h>
diff --git a/drivers/mfd/tps65217.c b/drivers/mfd/tps65217.c
index 8027b0a9e14f..8e8da204a02e 100644
--- a/drivers/mfd/tps65217.c
+++ b/drivers/mfd/tps65217.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * tps65217.c
  *
  * TPS65217 chip family multi-function driver
  *
  * Copyright (C) 2011 Texas Instruments Incorporated - https://www.ti.com/
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/device.h>
diff --git a/drivers/mmc/host/sdhci-bcm-kona.c b/drivers/mmc/host/sdhci-bcm-kona.c
index 4d4aac85cc7a..61a12f2f7f03 100644
--- a/drivers/mmc/host/sdhci-bcm-kona.c
+++ b/drivers/mmc/host/sdhci-bcm-kona.c
@@ -1,15 +1,5 @@
-/*
- * Copyright (C) 2013 Broadcom Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2013 Broadcom Corporation
 
 #include <linux/kernel.h>
 #include <linux/module.h>
diff --git a/drivers/mmc/host/sdhci-iproc.c b/drivers/mmc/host/sdhci-iproc.c
index 032bf852397f..6db35b1b8557 100644
--- a/drivers/mmc/host/sdhci-iproc.c
+++ b/drivers/mmc/host/sdhci-iproc.c
@@ -1,15 +1,5 @@
-/*
- * Copyright (C) 2014 Broadcom Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2014 Broadcom Corporation
 
 /*
  * iProc SDHCI platform driver
diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c
index debe17bfd0f0..7ce26cdbeb77 100644
--- a/drivers/net/can/ti_hecc.c
+++ b/drivers/net/can/ti_hecc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * TI HECC (CAN) device driver
  *
@@ -6,16 +7,6 @@
  *
  * Copyright (C) 2009 Texas Instruments Incorporated - http://www.ti.com/
  * Copyright (C) 2019 Jeroen Hofstee <jhofstee@victronenergy.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed as is WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
  */
 
 #include <linux/module.h>
diff --git a/drivers/nvmem/bcm-ocotp.c b/drivers/nvmem/bcm-ocotp.c
index dfea96c52463..a128c7f5e351 100644
--- a/drivers/nvmem/bcm-ocotp.c
+++ b/drivers/nvmem/bcm-ocotp.c
@@ -1,15 +1,5 @@
-/*
- * Copyright (C) 2016 Broadcom
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2016 Broadcom
 
 #include <linux/acpi.h>
 #include <linux/delay.h>
diff --git a/drivers/phy/broadcom/phy-bcm-cygnus-pcie.c b/drivers/phy/broadcom/phy-bcm-cygnus-pcie.c
index 548e46776100..cc29b08e49eb 100644
--- a/drivers/phy/broadcom/phy-bcm-cygnus-pcie.c
+++ b/drivers/phy/broadcom/phy-bcm-cygnus-pcie.c
@@ -1,15 +1,5 @@
-/*
- * Copyright (C) 2015 Broadcom Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2015 Broadcom Corporation
 
 #include <linux/delay.h>
 #include <linux/io.h>
diff --git a/drivers/phy/broadcom/phy-bcm-ns2-pcie.c b/drivers/phy/broadcom/phy-bcm-ns2-pcie.c
index 9e7434a0d3e0..2eaa41f8fc70 100644
--- a/drivers/phy/broadcom/phy-bcm-ns2-pcie.c
+++ b/drivers/phy/broadcom/phy-bcm-ns2-pcie.c
@@ -1,15 +1,5 @@
-/*
- * Copyright (C) 2016 Broadcom
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2016 Broadcom
 
 #include <linux/device.h>
 #include <linux/module.h>
diff --git a/drivers/phy/broadcom/phy-bcm-ns2-usbdrd.c b/drivers/phy/broadcom/phy-bcm-ns2-usbdrd.c
index 65a399acc845..36ad02c33ac5 100644
--- a/drivers/phy/broadcom/phy-bcm-ns2-usbdrd.c
+++ b/drivers/phy/broadcom/phy-bcm-ns2-usbdrd.c
@@ -1,15 +1,5 @@
-/*
- * Copyright (C) 2017 Broadcom
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2017 Broadcom
 
 #include <linux/delay.h>
 #include <linux/extcon-provider.h>
diff --git a/drivers/phy/motorola/phy-cpcap-usb.c b/drivers/phy/motorola/phy-cpcap-usb.c
index 6ee478bc5211..2f8210167b77 100644
--- a/drivers/phy/motorola/phy-cpcap-usb.c
+++ b/drivers/phy/motorola/phy-cpcap-usb.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Motorola CPCAP PMIC USB PHY driver
  * Copyright (C) 2017 Tony Lindgren <tony@atomide.com>
@@ -5,15 +6,6 @@
  * Some parts based on earlier Motorola Linux kernel tree code in
  * board-mapphone-usb.c and cpcap-usb-det.c:
  * Copyright (C) 2007 - 2011 Motorola, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
  */
 
 #include <linux/atomic.h>
diff --git a/drivers/phy/ti/phy-dm816x-usb.c b/drivers/phy/ti/phy-dm816x-usb.c
index 9fe6ea6fdae5..fb619908f912 100644
--- a/drivers/phy/ti/phy-dm816x-usb.c
+++ b/drivers/phy/ti/phy-dm816x-usb.c
@@ -1,13 +1,4 @@
-/*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
 
 #include <linux/module.h>
 #include <linux/platform_device.h>
diff --git a/drivers/pinctrl/bcm/pinctrl-bcm281xx.c b/drivers/pinctrl/bcm/pinctrl-bcm281xx.c
index 9ab1f427286a..fd52a83387ef 100644
--- a/drivers/pinctrl/bcm/pinctrl-bcm281xx.c
+++ b/drivers/pinctrl/bcm/pinctrl-bcm281xx.c
@@ -1,15 +1,5 @@
-/*
- * Copyright (C) 2013-2017 Broadcom
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2013-2017 Broadcom
 
 #include <linux/err.h>
 #include <linux/io.h>
diff --git a/drivers/pinctrl/bcm/pinctrl-cygnus-mux.c b/drivers/pinctrl/bcm/pinctrl-cygnus-mux.c
index 4344c5732400..5251460f6327 100644
--- a/drivers/pinctrl/bcm/pinctrl-cygnus-mux.c
+++ b/drivers/pinctrl/bcm/pinctrl-cygnus-mux.c
@@ -1,15 +1,5 @@
-/*
- * Copyright (C) 2014-2017 Broadcom
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2014-2017 Broadcom
 
 /*
  * Broadcom Cygnus IOMUX driver
diff --git a/drivers/pinctrl/bcm/pinctrl-ns2-mux.c b/drivers/pinctrl/bcm/pinctrl-ns2-mux.c
index 0fe4a1fcdf00..960e253f0be4 100644
--- a/drivers/pinctrl/bcm/pinctrl-ns2-mux.c
+++ b/drivers/pinctrl/bcm/pinctrl-ns2-mux.c
@@ -1,13 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (C) 2016 Broadcom Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  *
  * This file contains the Northstar2 IOMUX driver that supports group
  * based PINMUX configuration. The PWM is functional only when the
diff --git a/drivers/pinctrl/bcm/pinctrl-nsp-gpio.c b/drivers/pinctrl/bcm/pinctrl-nsp-gpio.c
index 643dbd315033..3c792bf03bda 100644
--- a/drivers/pinctrl/bcm/pinctrl-nsp-gpio.c
+++ b/drivers/pinctrl/bcm/pinctrl-nsp-gpio.c
@@ -1,15 +1,5 @@
-/*
- * Copyright (C) 2014-2017 Broadcom
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2014-2017 Broadcom
 
 /*
  * This file contains the Broadcom Northstar Plus (NSP) GPIO driver that
diff --git a/drivers/pinctrl/bcm/pinctrl-nsp-mux.c b/drivers/pinctrl/bcm/pinctrl-nsp-mux.c
index f1d60a708815..db8f79920ff0 100644
--- a/drivers/pinctrl/bcm/pinctrl-nsp-mux.c
+++ b/drivers/pinctrl/bcm/pinctrl-nsp-mux.c
@@ -1,13 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (C) 2015 Broadcom Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  *
  * This file contains the Northstar plus (NSP) IOMUX driver that supports
  * group based PINMUX configuration. The Northstar plus IOMUX controller
diff --git a/drivers/power/reset/brcm-kona-reset.c b/drivers/power/reset/brcm-kona-reset.c
index 8eaa959d8be6..3de024e3ceb7 100644
--- a/drivers/power/reset/brcm-kona-reset.c
+++ b/drivers/power/reset/brcm-kona-reset.c
@@ -1,15 +1,5 @@
-/*
- * Copyright (C) 2016 Broadcom
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2016 Broadcom
 
 #include <linux/io.h>
 #include <linux/of_address.h>
diff --git a/drivers/power/reset/brcmstb-reboot.c b/drivers/power/reset/brcmstb-reboot.c
index 884b53c483c0..0f2944dc9355 100644
--- a/drivers/power/reset/brcmstb-reboot.c
+++ b/drivers/power/reset/brcmstb-reboot.c
@@ -1,15 +1,5 @@
-/*
- * Copyright (C) 2013 Broadcom Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2013 Broadcom Corporation
 
 #include <linux/bitops.h>
 #include <linux/device.h>
diff --git a/drivers/ptp/ptp_dte.c b/drivers/ptp/ptp_dte.c
index 82d31ba32690..8641fd060491 100644
--- a/drivers/ptp/ptp_dte.c
+++ b/drivers/ptp/ptp_dte.c
@@ -1,15 +1,5 @@
-/*
- * Copyright 2017 Broadcom
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright 2017 Broadcom
 
 #include <linux/err.h>
 #include <linux/io.h>
diff --git a/drivers/pwm/pwm-bcm-iproc.c b/drivers/pwm/pwm-bcm-iproc.c
index 0226bf697f09..7251037d4dd5 100644
--- a/drivers/pwm/pwm-bcm-iproc.c
+++ b/drivers/pwm/pwm-bcm-iproc.c
@@ -1,15 +1,5 @@
-/*
- * Copyright (C) 2016 Broadcom
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2016 Broadcom
 
 #include <linux/clk.h>
 #include <linux/delay.h>
diff --git a/drivers/pwm/pwm-bcm-kona.c b/drivers/pwm/pwm-bcm-kona.c
index f171169c1c1f..4fa6e249e4cf 100644
--- a/drivers/pwm/pwm-bcm-kona.c
+++ b/drivers/pwm/pwm-bcm-kona.c
@@ -1,15 +1,5 @@
-/*
- * Copyright (C) 2014 Broadcom Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2014 Broadcom Corporation
 
 #include <linux/clk.h>
 #include <linux/delay.h>
diff --git a/drivers/regulator/cpcap-regulator.c b/drivers/regulator/cpcap-regulator.c
index 79b3eb3222c6..b0c225d98631 100644
--- a/drivers/regulator/cpcap-regulator.c
+++ b/drivers/regulator/cpcap-regulator.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Motorola CPCAP PMIC regulator driver
  *
@@ -6,15 +7,6 @@
  *
  * Rewritten for mainline kernel to use device tree and regmap
  * Copyright (C) 2017 Tony Lindgren <tony@atomide.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
  */
 
 #include <linux/err.h>
diff --git a/drivers/regulator/isl6271a-regulator.c b/drivers/regulator/isl6271a-regulator.c
index 6f28bba81d13..591a64e1ca61 100644
--- a/drivers/regulator/isl6271a-regulator.c
+++ b/drivers/regulator/isl6271a-regulator.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * isl6271a-regulator.c
  *
  * Support for Intersil ISL6271A voltage regulator
  *
  * Copyright (C) 2010 Marek Vasut <marek.vasut@gmail.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any kind,
- * whether express or implied; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/regulator/tps65023-regulator.c b/drivers/regulator/tps65023-regulator.c
index f25806531c7e..d24333344f93 100644
--- a/drivers/regulator/tps65023-regulator.c
+++ b/drivers/regulator/tps65023-regulator.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * tps65023-regulator.c
  *
  * Supports TPS65023 Regulator
  *
  * Copyright (C) 2009 Texas Instrument Incorporated - https://www.ti.com/
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any kind,
- * whether express or implied; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/regulator/tps6507x-regulator.c b/drivers/regulator/tps6507x-regulator.c
index eafbc2bb4b57..b83816ee6867 100644
--- a/drivers/regulator/tps6507x-regulator.c
+++ b/drivers/regulator/tps6507x-regulator.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * tps6507x-regulator.c
  *
  * Regulator driver for TPS65073 PMIC
  *
  * Copyright (C) 2009 Texas Instrument Incorporated - https://www.ti.com/
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any kind,
- * whether express or implied; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/regulator/tps65217-regulator.c b/drivers/regulator/tps65217-regulator.c
index e88ed96f4744..6bb5b02e19e2 100644
--- a/drivers/regulator/tps65217-regulator.c
+++ b/drivers/regulator/tps65217-regulator.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * tps65217-regulator.c
  *
  * Regulator driver for TPS65217 PMIC
  *
  * Copyright (C) 2011 Texas Instruments Incorporated - https://www.ti.com/
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/soc/ti/knav_dma.c b/drivers/soc/ti/knav_dma.c
index d756591de973..84afebd355be 100644
--- a/drivers/soc/ti/knav_dma.c
+++ b/drivers/soc/ti/knav_dma.c
@@ -1,17 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2014 Texas Instruments Incorporated
  * Authors:	Santosh Shilimkar <santosh.shilimkar@ti.com>
  *		Sandeep Nair <sandeep_n@ti.com>
  *		Cyril Chemparathy <cyril@ti.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/io.h>
diff --git a/drivers/uio/uio_pruss.c b/drivers/uio/uio_pruss.c
index e9096f53b4cc..83966dbd3bbf 100644
--- a/drivers/uio/uio_pruss.c
+++ b/drivers/uio/uio_pruss.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Programmable Real-Time Unit Sub System (PRUSS) UIO driver (uio_pruss)
  *
@@ -5,15 +6,6 @@
  * and DDR RAM to user space for applications interacting with PRUSS firmware
  *
  * Copyright (C) 2010-11 Texas Instruments Incorporated - http://www.ti.com/
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 #include <linux/device.h>
 #include <linux/module.h>
diff --git a/drivers/video/backlight/tps65217_bl.c b/drivers/video/backlight/tps65217_bl.c
index 8457166f357f..d96d713fe7db 100644
--- a/drivers/video/backlight/tps65217_bl.c
+++ b/drivers/video/backlight/tps65217_bl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * tps65217_bl.c
  *
@@ -5,15 +6,6 @@
  *
  * Copyright (C) 2012 Matthias Kaehlcke
  * Author: Matthias Kaehlcke <matthias@kaehlcke.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/kernel.h>
diff --git a/include/dt-bindings/clock/bcm21664.h b/include/dt-bindings/clock/bcm21664.h
index 5a7f0e4750a8..7c7492742f3d 100644
--- a/include/dt-bindings/clock/bcm21664.h
+++ b/include/dt-bindings/clock/bcm21664.h
@@ -1,15 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) 2013 Broadcom Corporation
  * Copyright 2013 Linaro Limited
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #ifndef _CLOCK_BCM21664_H
diff --git a/include/dt-bindings/clock/bcm281xx.h b/include/dt-bindings/clock/bcm281xx.h
index a763460cf1af..d74ca42112e7 100644
--- a/include/dt-bindings/clock/bcm281xx.h
+++ b/include/dt-bindings/clock/bcm281xx.h
@@ -1,15 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) 2013 Broadcom Corporation
  * Copyright 2013 Linaro Limited
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #ifndef _CLOCK_BCM281XX_H
diff --git a/include/linux/mfd/lp873x.h b/include/linux/mfd/lp873x.h
index 5546688c7da7..fe8174cc8637 100644
--- a/include/linux/mfd/lp873x.h
+++ b/include/linux/mfd/lp873x.h
@@ -1,16 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Functions to access LP873X power management chip.
  *
  * Copyright (C) 2016 Texas Instruments Incorporated - https://www.ti.com/
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #ifndef __LINUX_MFD_LP873X_H
diff --git a/include/linux/mfd/tps65217.h b/include/linux/mfd/tps65217.h
index db7091824ed0..877d9c41c53d 100644
--- a/include/linux/mfd/tps65217.h
+++ b/include/linux/mfd/tps65217.h
@@ -1,18 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * linux/mfd/tps65217.h
  *
  * Functions to access TPS65217 power management chip.
  *
  * Copyright (C) 2011 Texas Instruments Incorporated - https://www.ti.com/
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #ifndef __LINUX_MFD_TPS65217_H
diff --git a/include/linux/platform_data/davinci_asp.h b/include/linux/platform_data/davinci_asp.h
index 76b13ef67562..c8645b2ed3c0 100644
--- a/include/linux/platform_data/davinci_asp.h
+++ b/include/linux/platform_data/davinci_asp.h
@@ -1,16 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * TI DaVinci Audio Serial Port support
  *
  * Copyright (C) 2012 Texas Instruments Incorporated - https://www.ti.com/
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
  */
 
 #ifndef __DAVINCI_ASP_H
diff --git a/include/linux/platform_data/gpio-davinci.h b/include/linux/platform_data/gpio-davinci.h
index e182a46e609f..b82e44662efe 100644
--- a/include/linux/platform_data/gpio-davinci.h
+++ b/include/linux/platform_data/gpio-davinci.h
@@ -1,16 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * DaVinci GPIO Platform Related Defines
  *
  * Copyright (C) 2013 Texas Instruments Incorporated - https://www.ti.com/
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #ifndef __DAVINCI_GPIO_PLATFORM_H
diff --git a/include/linux/platform_data/uio_dmem_genirq.h b/include/linux/platform_data/uio_dmem_genirq.h
index 973c1bb32168..c8f6de685306 100644
--- a/include/linux/platform_data/uio_dmem_genirq.h
+++ b/include/linux/platform_data/uio_dmem_genirq.h
@@ -1,16 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * include/linux/platform_data/uio_dmem_genirq.h
  *
  * Copyright (C) 2012 Damian Hobson-Garcia
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #ifndef _UIO_DMEM_GENIRQ_H
diff --git a/include/linux/platform_data/uio_pruss.h b/include/linux/platform_data/uio_pruss.h
index 31f2e22661bc..f76fa393b802 100644
--- a/include/linux/platform_data/uio_pruss.h
+++ b/include/linux/platform_data/uio_pruss.h
@@ -1,18 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * include/linux/platform_data/uio_pruss.h
  *
  * Platform data for uio_pruss driver
  *
  * Copyright (C) 2010-11 Texas Instruments Incorporated - https://www.ti.com/
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #ifndef _UIO_PRUSS_H_
diff --git a/include/linux/reset/bcm63xx_pmb.h b/include/linux/reset/bcm63xx_pmb.h
index bb4af7b5eb36..c77b6999518a 100644
--- a/include/linux/reset/bcm63xx_pmb.h
+++ b/include/linux/reset/bcm63xx_pmb.h
@@ -1,17 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Broadcom BCM63xx Processor Monitor Bus shared routines (SMP and reset)
  *
  * Copyright (C) 2015, Broadcom Corporation
  * Author: Florian Fainelli <f.fainelli@gmail.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 #ifndef __BCM63XX_PMB_H
 #define __BCM63XX_PMB_H
diff --git a/include/linux/soc/ti/knav_dma.h b/include/linux/soc/ti/knav_dma.h
index 7127ec301537..18d806a8e52c 100644
--- a/include/linux/soc/ti/knav_dma.h
+++ b/include/linux/soc/ti/knav_dma.h
@@ -1,17 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) 2014 Texas Instruments Incorporated
  * Authors:	Sandeep Nair <sandeep_n@ti.com
  *		Cyril Chemparathy <cyril@ti.com
 		Santosh Shilimkar <santosh.shilimkar@ti.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #ifndef __SOC_TI_KEYSTONE_NAVIGATOR_DMA_H__
diff --git a/include/linux/soc/ti/knav_qmss.h b/include/linux/soc/ti/knav_qmss.h
index c75ef99c99ca..175f466ebcc3 100644
--- a/include/linux/soc/ti/knav_qmss.h
+++ b/include/linux/soc/ti/knav_qmss.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Keystone Navigator Queue Management Sub-System header
  *
@@ -5,15 +6,6 @@
  * Author:	Sandeep Nair <sandeep_n@ti.com>
  *		Cyril Chemparathy <cyril@ti.com>
  *		Santosh Shilimkar <santosh.shilimkar@ti.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #ifndef __SOC_TI_KNAV_QMSS_H__
diff --git a/include/linux/sram.h b/include/linux/sram.h
index 4fb405fb0480..d7dee19505c6 100644
--- a/include/linux/sram.h
+++ b/include/linux/sram.h
@@ -1,15 +1,5 @@
-/*
- * Generic SRAM Driver Interface
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Generic SRAM Driver Interface */
 #ifndef __LINUX_SRAM_H__
 #define __LINUX_SRAM_H__
 
diff --git a/include/linux/ti-emif-sram.h b/include/linux/ti-emif-sram.h
index 2fc854155c27..441b2988e66a 100644
--- a/include/linux/ti-emif-sram.h
+++ b/include/linux/ti-emif-sram.h
@@ -1,17 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * TI AM33XX EMIF Routines
  *
  * Copyright (C) 2016-2017 Texas Instruments Inc.
  *	Dave Gerlach
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 #ifndef __LINUX_TI_EMIF_H
 #define __LINUX_TI_EMIF_H
diff --git a/include/linux/wkup_m3_ipc.h b/include/linux/wkup_m3_ipc.h
index 26d1eb058fa3..5e1b26f988e2 100644
--- a/include/linux/wkup_m3_ipc.h
+++ b/include/linux/wkup_m3_ipc.h
@@ -1,17 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * TI Wakeup M3 for AMx3 SoCs Power Management Routines
  *
  * Copyright (C) 2015 Texas Instruments Incorporated - https://www.ti.com/
  * Dave Gerlach <d-gerlach@ti.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #ifndef _LINUX_WKUP_M3_IPC_H
diff --git a/include/media/i2c/adv7343.h b/include/media/i2c/adv7343.h
index b8937035c5d3..d35d3e925795 100644
--- a/include/media/i2c/adv7343.h
+++ b/include/media/i2c/adv7343.h
@@ -1,16 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * ADV7343 header file
  *
  * Copyright (C) 2009 Texas Instruments Incorporated - http://www.ti.com/
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed .as is. WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #ifndef ADV7343_H
diff --git a/include/media/i2c/adv7393.h b/include/media/i2c/adv7393.h
index b28edf351842..c73b36321d06 100644
--- a/include/media/i2c/adv7393.h
+++ b/include/media/i2c/adv7393.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * ADV7393 header file
  *
@@ -7,15 +8,6 @@
  * Based on ADV7343 driver,
  *
  * Copyright (C) 2009 Texas Instruments Incorporated - http://www.ti.com/
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed .as is. WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #ifndef ADV7393_H
diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c
index fe6094e9a2db..de476a417631 100644
--- a/net/hsr/hsr_debugfs.c
+++ b/net/hsr/hsr_debugfs.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * debugfs code for HSR & PRP
  * Copyright (C) 2019 Texas Instruments Incorporated
  *
  * Author(s):
  *	Murali Karicheri <m-karicheri2@ti.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 #include <linux/module.h>
 #include <linux/errno.h>
diff --git a/sound/soc/bcm/cygnus-pcm.c b/sound/soc/bcm/cygnus-pcm.c
index 3abeaf0f1b1c..8f488f92936b 100644
--- a/sound/soc/bcm/cygnus-pcm.c
+++ b/sound/soc/bcm/cygnus-pcm.c
@@ -1,15 +1,5 @@
-/*
- * Copyright (C) 2014-2015 Broadcom Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2014-2015 Broadcom Corporation
 #include <linux/debugfs.h>
 #include <linux/dma-mapping.h>
 #include <linux/init.h>
diff --git a/sound/soc/bcm/cygnus-ssp.c b/sound/soc/bcm/cygnus-ssp.c
index 9698f4531c90..b0254e724ec9 100644
--- a/sound/soc/bcm/cygnus-ssp.c
+++ b/sound/soc/bcm/cygnus-ssp.c
@@ -1,15 +1,5 @@
-/*
- * Copyright (C) 2014-2015 Broadcom Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2014-2015 Broadcom Corporation
 #include <linux/clk.h>
 #include <linux/delay.h>
 #include <linux/init.h>
diff --git a/sound/soc/bcm/cygnus-ssp.h b/sound/soc/bcm/cygnus-ssp.h
index 33dd34305928..74152b2d770d 100644
--- a/sound/soc/bcm/cygnus-ssp.h
+++ b/sound/soc/bcm/cygnus-ssp.h
@@ -1,15 +1,5 @@
-/*
- * Copyright (C) 2014-2015 Broadcom Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (C) 2014-2015 Broadcom Corporation */
 #ifndef __CYGNUS_SSP_H__
 #define __CYGNUS_SSP_H__
 
-- 
cgit 


From 94c0ded75c85201bfb8fce95dab44cc53addfcb5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 7 Jun 2022 16:11:18 +0200
Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX -
 gpl-2.0_133.RULE

Based on the normalized pattern:

    this program is free software you can redistribute it and/or modify it
    under the terms of the gnu general public license version 2 as
    published by the free software foundation  this program is distributed
    in the hope that it will be useful merchantability or fitness for a
    particular purpose see the gnu general public license for more details

extracted by the scancode license scanner the SPDX license identifier

    GPL-2.0-only

has been chosen to replace the boilerplate/reference.

Reviewed-by: Allison Randal <allison@lohutok.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/arm64/boot/dts/mediatek/mt6755.dtsi | 9 +--------
 arch/arm64/boot/dts/mediatek/mt6795.dtsi | 9 +--------
 arch/arm64/boot/dts/mediatek/mt8173.dtsi | 9 +--------
 include/dt-bindings/power/mt6797-power.h | 9 +--------
 4 files changed, 4 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/boot/dts/mediatek/mt6755.dtsi b/arch/arm64/boot/dts/mediatek/mt6755.dtsi
index 01ba77669717..b55d3fac9bd4 100644
--- a/arch/arm64/boot/dts/mediatek/mt6755.dtsi
+++ b/arch/arm64/boot/dts/mediatek/mt6755.dtsi
@@ -1,14 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2016 MediaTek Inc.
  * Author: Mars.C <mars.cheng@mediatek.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <dt-bindings/interrupt-controller/irq.h>
diff --git a/arch/arm64/boot/dts/mediatek/mt6795.dtsi b/arch/arm64/boot/dts/mediatek/mt6795.dtsi
index c85659d0ff5d..1413f32c70af 100644
--- a/arch/arm64/boot/dts/mediatek/mt6795.dtsi
+++ b/arch/arm64/boot/dts/mediatek/mt6795.dtsi
@@ -1,14 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2015 MediaTek Inc.
  * Author: Mars.C <mars.cheng@mediatek.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <dt-bindings/interrupt-controller/irq.h>
diff --git a/arch/arm64/boot/dts/mediatek/mt8173.dtsi b/arch/arm64/boot/dts/mediatek/mt8173.dtsi
index 40d7b47fc52e..e14b6e68c4df 100644
--- a/arch/arm64/boot/dts/mediatek/mt8173.dtsi
+++ b/arch/arm64/boot/dts/mediatek/mt8173.dtsi
@@ -1,14 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2014 MediaTek Inc.
  * Author: Eddie Huang <eddie.huang@mediatek.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <dt-bindings/clock/mt8173-clk.h>
diff --git a/include/dt-bindings/power/mt6797-power.h b/include/dt-bindings/power/mt6797-power.h
index a60c1d81cf75..bd451d860e6a 100644
--- a/include/dt-bindings/power/mt6797-power.h
+++ b/include/dt-bindings/power/mt6797-power.h
@@ -1,14 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (c) 2017 MediaTek Inc.
  * Author: Mars.C <mars.cheng@mediatek.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #ifndef _DT_BINDINGS_POWER_MT6797_POWER_H
-- 
cgit 


From 1accad5e74635d7f9d994e692649fbe736afe150 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 7 Jun 2022 16:11:20 +0200
Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX -
 gpl-2.0_149.RULE

Based on the normalized pattern:

    netapp provides this source code under the gpl v2 license the gpl v2
    license is available at https://opensource org/licenses/gpl-license
    php  this software is provided by the copyright holders and
    contributors as is and any express or implied warranties including but
    not limited to the implied warranties of merchantability and fitness
    for a particular purpose are disclaimed in no event shall the
    copyright owner or contributors be liable for any direct indirect
    incidental special exemplary or consequential damages (including but
    not limited to procurement of substitute goods or services loss of use
    data or profits or business interruption) however caused and on any
    theory of liability whether in contract strict liability or tort
    (including negligence or otherwise) arising in any way out of the use
    of this software even if advised of the possibility of such damage

extracted by the scancode license scanner the SPDX license identifier

    GPL-2.0-only

has been chosen to replace the boilerplate/reference.

Reviewed-by: Allison Randal <allison@lohutok.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/sunrpc/bc_xprt.h | 17 +----------------
 net/sunrpc/backchannel_rqst.c  | 16 +---------------
 net/sunrpc/sunrpc.h            | 16 +---------------
 3 files changed, 3 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h
index f07c334c599f..db30a159f9d5 100644
--- a/include/linux/sunrpc/bc_xprt.h
+++ b/include/linux/sunrpc/bc_xprt.h
@@ -1,22 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /******************************************************************************
 
 (c) 2008 NetApp.  All Rights Reserved.
 
-NetApp provides this source code under the GPL v2 License.
-The GPL v2 license is available at
-https://opensource.org/licenses/gpl-license.php.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 ******************************************************************************/
 
@@ -83,4 +69,3 @@ static inline void xprt_free_bc_request(struct rpc_rqst *req)
 }
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 #endif /* _LINUX_SUNRPC_BC_XPRT_H */
-
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 5a6b61dcdf2d..253a54c2fcfe 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /******************************************************************************
 
 (c) 2007 Network Appliance, Inc.  All Rights Reserved.
 (c) 2009 NetApp.  All Rights Reserved.
 
-NetApp provides this source code under the GPL v2 License.
-The GPL v2 license is available at
-https://opensource.org/licenses/gpl-license.php.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 ******************************************************************************/
 
diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h
index 2f59464e6524..d4a362c9e4b3 100644
--- a/net/sunrpc/sunrpc.h
+++ b/net/sunrpc/sunrpc.h
@@ -1,22 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /******************************************************************************
 
 (c) 2008 NetApp.  All Rights Reserved.
 
-NetApp provides this source code under the GPL v2 License.
-The GPL v2 license is available at
-https://opensource.org/licenses/gpl-license.php.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 ******************************************************************************/
 
-- 
cgit 


From 298b95f111be85f2d5a18dc0177eb9a64130f9b4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 7 Jun 2022 16:11:21 +0200
Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX -
 gpl-2.0_152.RULE

Based on the normalized pattern:

    this software is distributed under the terms of the gnu general public
    license ( gpl ) version 2 as published by the free software foundation
    this software is provided by the copyright holders and contributors as
    is and any express or implied warranties including but not limited to
    the implied warranties of merchantability and fitness for a particular
    purpose are disclaimed in no event shall the copyright owner or
    contributors be liable for any direct indirect incidental special
    exemplary or consequential damages (including but not limited to
    procurement of substitute goods or services loss of use data or
    profits or business interruption) however caused and on any theory of
    liability whether in contract strict liability or tort (including
    negligence or otherwise) arising in any way out of the use of this
    software even if advised of the possibility of such damage

extracted by the scancode license scanner the SPDX license identifier

    GPL-2.0-only

has been chosen to replace the boilerplate/reference.

Reviewed-by: Allison Randal <allison@lohutok.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/platform_data/usb-omap.h | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/platform_data/usb-omap.h b/include/linux/platform_data/usb-omap.h
index 5e70d667031c..580978e468f8 100644
--- a/include/linux/platform_data/usb-omap.h
+++ b/include/linux/platform_data/usb-omap.h
@@ -1,22 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * usb-omap.h - Platform data for the various OMAP USB IPs
  *
  * Copyright (C) 2012 Texas Instruments Incorporated - https://www.ti.com
- *
- * This software is distributed under the terms of the GNU General Public
- * License ("GPL") version 2, as published by the Free Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #define OMAP3_HS_USB_PORTS	3
-- 
cgit 


From dfb99b050e4bc5bfb74973761752395d82644e48 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 7 Jun 2022 16:11:25 +0200
Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX -
 gpl-2.0_168.RULE (part 2)

Based on the normalized pattern:

    this program is free software you may redistribute it and/or modify it
    under the terms of the gnu general public license as published by the
    free software foundation version 2 of the license  the software is
    provided as is without warranty of any kind express or implied
    including but not limited to the warranties of merchantability fitness
    for a particular purpose and noninfringement in no event shall the
    authors or copyright holders be liable for any claim damages or other
    liability whether in an action of contract tort or otherwise arising
    from out of or in connection with the software or the use or other
    dealings in the software

extracted by the scancode license scanner the SPDX license identifier

    GPL-2.0-only

has been chosen to replace the boilerplate/reference.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/scsi/snic/snic_attrs.c    | 18 ++----------------
 drivers/scsi/snic/snic_ctl.c      | 18 ++----------------
 drivers/scsi/snic/snic_debugfs.c  | 18 ++----------------
 drivers/scsi/snic/snic_disc.c     | 18 ++----------------
 drivers/scsi/snic/snic_disc.h     | 18 ++----------------
 drivers/scsi/snic/snic_fwint.h    | 18 ++----------------
 drivers/scsi/snic/snic_io.c       | 18 ++----------------
 drivers/scsi/snic/snic_io.h       | 18 ++----------------
 drivers/scsi/snic/snic_isr.c      | 18 ++----------------
 drivers/scsi/snic/snic_main.c     | 18 ++----------------
 drivers/scsi/snic/snic_res.c      | 18 ++----------------
 drivers/scsi/snic/snic_res.h      | 18 ++----------------
 drivers/scsi/snic/snic_scsi.c     | 18 ++----------------
 drivers/scsi/snic/snic_stats.h    | 18 ++----------------
 drivers/scsi/snic/snic_trc.c      | 18 ++----------------
 drivers/scsi/snic/snic_trc.h      | 18 ++----------------
 drivers/scsi/snic/vnic_cq.c       | 18 ++----------------
 drivers/scsi/snic/vnic_cq.h       | 18 ++----------------
 drivers/scsi/snic/vnic_cq_fw.h    | 18 ++----------------
 drivers/scsi/snic/vnic_dev.c      | 18 ++----------------
 drivers/scsi/snic/vnic_dev.h      | 18 ++----------------
 drivers/scsi/snic/vnic_devcmd.h   | 18 ++----------------
 drivers/scsi/snic/vnic_intr.c     | 18 ++----------------
 drivers/scsi/snic/vnic_intr.h     | 18 ++----------------
 drivers/scsi/snic/vnic_resource.h | 18 ++----------------
 drivers/scsi/snic/vnic_snic.h     | 18 ++----------------
 drivers/scsi/snic/vnic_stats.h    | 18 ++----------------
 drivers/scsi/snic/vnic_wq.c       | 18 ++----------------
 drivers/scsi/snic/vnic_wq.h       | 18 ++----------------
 drivers/scsi/snic/wq_enet_desc.h  | 18 ++----------------
 include/media/i2c/ov2659.h        | 14 +-------------
 samples/v4l/v4l2-pci-skeleton.c   | 14 +-------------
 32 files changed, 62 insertions(+), 506 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/snic/snic_attrs.c b/drivers/scsi/snic/snic_attrs.c
index dc03ce1ec909..3ddbdbc3ded1 100644
--- a/drivers/scsi/snic/snic_attrs.c
+++ b/drivers/scsi/snic/snic_attrs.c
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright 2014 Cisco Systems, Inc.  All rights reserved.
 
 #include <linux/string.h>
 #include <linux/device.h>
diff --git a/drivers/scsi/snic/snic_ctl.c b/drivers/scsi/snic/snic_ctl.c
index 703f229862fc..5f4fca96b192 100644
--- a/drivers/scsi/snic/snic_ctl.c
+++ b/drivers/scsi/snic/snic_ctl.c
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright 2014 Cisco Systems, Inc.  All rights reserved.
 
 #include <linux/errno.h>
 #include <linux/pci.h>
diff --git a/drivers/scsi/snic/snic_debugfs.c b/drivers/scsi/snic/snic_debugfs.c
index 5e0faeba516e..57bdc3ba49d9 100644
--- a/drivers/scsi/snic/snic_debugfs.c
+++ b/drivers/scsi/snic/snic_debugfs.c
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright 2014 Cisco Systems, Inc.  All rights reserved.
 
 #include <linux/module.h>
 #include <linux/errno.h>
diff --git a/drivers/scsi/snic/snic_disc.c b/drivers/scsi/snic/snic_disc.c
index 27e98df83b31..9b2b5f8c23b9 100644
--- a/drivers/scsi/snic/snic_disc.c
+++ b/drivers/scsi/snic/snic_disc.c
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright 2014 Cisco Systems, Inc.  All rights reserved.
 
 #include <linux/errno.h>
 #include <linux/mempool.h>
diff --git a/drivers/scsi/snic/snic_disc.h b/drivers/scsi/snic/snic_disc.h
index 97fa3f5c5bb4..9ad7f84a3484 100644
--- a/drivers/scsi/snic/snic_disc.h
+++ b/drivers/scsi/snic/snic_disc.h
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright 2014 Cisco Systems, Inc.  All rights reserved. */
 
 #ifndef __SNIC_DISC_H
 #define __SNIC_DISC_H
diff --git a/drivers/scsi/snic/snic_fwint.h b/drivers/scsi/snic/snic_fwint.h
index 2a045a57e365..e6b3e8b431c0 100644
--- a/drivers/scsi/snic/snic_fwint.h
+++ b/drivers/scsi/snic/snic_fwint.h
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright 2014 Cisco Systems, Inc.  All rights reserved. */
 
 #ifndef __SNIC_FWINT_H
 #define __SNIC_FWINT_H
diff --git a/drivers/scsi/snic/snic_io.c b/drivers/scsi/snic/snic_io.c
index 159ee94d2a55..32a77bee41d5 100644
--- a/drivers/scsi/snic/snic_io.c
+++ b/drivers/scsi/snic/snic_io.c
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright 2014 Cisco Systems, Inc.  All rights reserved.
 
 #include <linux/errno.h>
 #include <linux/pci.h>
diff --git a/drivers/scsi/snic/snic_io.h b/drivers/scsi/snic/snic_io.h
index 093d6524cd42..de6694a24c5f 100644
--- a/drivers/scsi/snic/snic_io.h
+++ b/drivers/scsi/snic/snic_io.h
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright 2014 Cisco Systems, Inc.  All rights reserved. */
 
 #ifndef _SNIC_IO_H
 #define _SNIC_IO_H
diff --git a/drivers/scsi/snic/snic_isr.c b/drivers/scsi/snic/snic_isr.c
index c4da3673f2ae..471a37422da9 100644
--- a/drivers/scsi/snic/snic_isr.c
+++ b/drivers/scsi/snic/snic_isr.c
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright 2014 Cisco Systems, Inc.  All rights reserved.
 
 #include <linux/string.h>
 #include <linux/errno.h>
diff --git a/drivers/scsi/snic/snic_main.c b/drivers/scsi/snic/snic_main.c
index 29d56396058c..174f7811fe50 100644
--- a/drivers/scsi/snic/snic_main.c
+++ b/drivers/scsi/snic/snic_main.c
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright 2014 Cisco Systems, Inc.  All rights reserved.
 
 #include <linux/module.h>
 #include <linux/mempool.h>
diff --git a/drivers/scsi/snic/snic_res.c b/drivers/scsi/snic/snic_res.c
index b54912c8ca0c..43f1a2823514 100644
--- a/drivers/scsi/snic/snic_res.c
+++ b/drivers/scsi/snic/snic_res.c
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright 2014 Cisco Systems, Inc.  All rights reserved.
 
 #include <linux/errno.h>
 #include <linux/types.h>
diff --git a/drivers/scsi/snic/snic_res.h b/drivers/scsi/snic/snic_res.h
index 273f72f2a023..53cf6b19ab28 100644
--- a/drivers/scsi/snic/snic_res.h
+++ b/drivers/scsi/snic/snic_res.h
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright 2014 Cisco Systems, Inc.  All rights reserved. */
 
 #ifndef __SNIC_RES_H
 #define __SNIC_RES_H
diff --git a/drivers/scsi/snic/snic_scsi.c b/drivers/scsi/snic/snic_scsi.c
index 5f17666f3e1d..961af6fc21bc 100644
--- a/drivers/scsi/snic/snic_scsi.c
+++ b/drivers/scsi/snic/snic_scsi.c
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright 2014 Cisco Systems, Inc.  All rights reserved.
 
 #include <linux/mempool.h>
 #include <linux/errno.h>
diff --git a/drivers/scsi/snic/snic_stats.h b/drivers/scsi/snic/snic_stats.h
index faf0cb601954..f0285c5a35f8 100644
--- a/drivers/scsi/snic/snic_stats.h
+++ b/drivers/scsi/snic/snic_stats.h
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright 2014 Cisco Systems, Inc.  All rights reserved. */
 
 #ifndef __SNIC_STATS_H
 #define __SNIC_STATS_H
diff --git a/drivers/scsi/snic/snic_trc.c b/drivers/scsi/snic/snic_trc.c
index f23fe2f88438..c2e5ab7e976c 100644
--- a/drivers/scsi/snic/snic_trc.c
+++ b/drivers/scsi/snic/snic_trc.c
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright 2014 Cisco Systems, Inc.  All rights reserved.
 
 #include <linux/module.h>
 #include <linux/mempool.h>
diff --git a/drivers/scsi/snic/snic_trc.h b/drivers/scsi/snic/snic_trc.h
index ce305b4b8fa2..c38e0dadc958 100644
--- a/drivers/scsi/snic/snic_trc.h
+++ b/drivers/scsi/snic/snic_trc.h
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright 2014 Cisco Systems, Inc.  All rights reserved. */
 
 #ifndef __SNIC_TRC_H
 #define __SNIC_TRC_H
diff --git a/drivers/scsi/snic/vnic_cq.c b/drivers/scsi/snic/vnic_cq.c
index 3455dd7e73f4..0d5d3bd4be1c 100644
--- a/drivers/scsi/snic/vnic_cq.c
+++ b/drivers/scsi/snic/vnic_cq.c
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright 2014 Cisco Systems, Inc.  All rights reserved.
 
 #include <linux/errno.h>
 #include <linux/types.h>
diff --git a/drivers/scsi/snic/vnic_cq.h b/drivers/scsi/snic/vnic_cq.h
index 6e651c3e16f7..6cee911eec5f 100644
--- a/drivers/scsi/snic/vnic_cq.h
+++ b/drivers/scsi/snic/vnic_cq.h
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright 2014 Cisco Systems, Inc.  All rights reserved. */
 
 #ifndef _VNIC_CQ_H_
 #define _VNIC_CQ_H_
diff --git a/drivers/scsi/snic/vnic_cq_fw.h b/drivers/scsi/snic/vnic_cq_fw.h
index c2d1bbd44bd1..d74954bc70e3 100644
--- a/drivers/scsi/snic/vnic_cq_fw.h
+++ b/drivers/scsi/snic/vnic_cq_fw.h
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright 2014 Cisco Systems, Inc.  All rights reserved. */
 
 #ifndef _VNIC_CQ_FW_H_
 #define _VNIC_CQ_FW_H_
diff --git a/drivers/scsi/snic/vnic_dev.c b/drivers/scsi/snic/vnic_dev.c
index 05e374f80946..760f3f22095c 100644
--- a/drivers/scsi/snic/vnic_dev.c
+++ b/drivers/scsi/snic/vnic_dev.c
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright 2014 Cisco Systems, Inc.  All rights reserved.
 
 #include <linux/kernel.h>
 #include <linux/errno.h>
diff --git a/drivers/scsi/snic/vnic_dev.h b/drivers/scsi/snic/vnic_dev.h
index e65726da6504..d2f9b6f7b313 100644
--- a/drivers/scsi/snic/vnic_dev.h
+++ b/drivers/scsi/snic/vnic_dev.h
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright 2014 Cisco Systems, Inc.  All rights reserved. */
 
 #ifndef _VNIC_DEV_H_
 #define _VNIC_DEV_H_
diff --git a/drivers/scsi/snic/vnic_devcmd.h b/drivers/scsi/snic/vnic_devcmd.h
index 0e0fa38f8d90..9d82fcb7414b 100644
--- a/drivers/scsi/snic/vnic_devcmd.h
+++ b/drivers/scsi/snic/vnic_devcmd.h
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright 2014 Cisco Systems, Inc.  All rights reserved. */
 
 #ifndef _VNIC_DEVCMD_H_
 #define _VNIC_DEVCMD_H_
diff --git a/drivers/scsi/snic/vnic_intr.c b/drivers/scsi/snic/vnic_intr.c
index a7d54806787d..23627f9591f2 100644
--- a/drivers/scsi/snic/vnic_intr.c
+++ b/drivers/scsi/snic/vnic_intr.c
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright 2014 Cisco Systems, Inc.  All rights reserved.
 
 #include <linux/kernel.h>
 #include <linux/errno.h>
diff --git a/drivers/scsi/snic/vnic_intr.h b/drivers/scsi/snic/vnic_intr.h
index 4547f603fe5e..7bff60fafb07 100644
--- a/drivers/scsi/snic/vnic_intr.h
+++ b/drivers/scsi/snic/vnic_intr.h
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright 2014 Cisco Systems, Inc.  All rights reserved. */
 
 #ifndef _VNIC_INTR_H_
 #define _VNIC_INTR_H_
diff --git a/drivers/scsi/snic/vnic_resource.h b/drivers/scsi/snic/vnic_resource.h
index 9713d6835db3..372596b0915f 100644
--- a/drivers/scsi/snic/vnic_resource.h
+++ b/drivers/scsi/snic/vnic_resource.h
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright 2014 Cisco Systems, Inc.  All rights reserved. */
 
 #ifndef _VNIC_RESOURCE_H_
 #define _VNIC_RESOURCE_H_
diff --git a/drivers/scsi/snic/vnic_snic.h b/drivers/scsi/snic/vnic_snic.h
index 514d39f5cf00..ffc8a0fee577 100644
--- a/drivers/scsi/snic/vnic_snic.h
+++ b/drivers/scsi/snic/vnic_snic.h
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright 2014 Cisco Systems, Inc.  All rights reserved. */
 
 #ifndef _VNIC_SNIC_H_
 #define _VNIC_SNIC_H_
diff --git a/drivers/scsi/snic/vnic_stats.h b/drivers/scsi/snic/vnic_stats.h
index 370a37c97748..38155aae7a52 100644
--- a/drivers/scsi/snic/vnic_stats.h
+++ b/drivers/scsi/snic/vnic_stats.h
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright 2014 Cisco Systems, Inc.  All rights reserved. */
 
 #ifndef _VNIC_STATS_H_
 #define _VNIC_STATS_H_
diff --git a/drivers/scsi/snic/vnic_wq.c b/drivers/scsi/snic/vnic_wq.c
index 1e91d432089e..48be9a3f4c3d 100644
--- a/drivers/scsi/snic/vnic_wq.c
+++ b/drivers/scsi/snic/vnic_wq.c
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright 2014 Cisco Systems, Inc.  All rights reserved.
 
 #include <linux/errno.h>
 #include <linux/types.h>
diff --git a/drivers/scsi/snic/vnic_wq.h b/drivers/scsi/snic/vnic_wq.h
index 7cc031c7ceba..1415da4b68dc 100644
--- a/drivers/scsi/snic/vnic_wq.h
+++ b/drivers/scsi/snic/vnic_wq.h
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright 2014 Cisco Systems, Inc.  All rights reserved. */
 
 #ifndef _VNIC_WQ_H_
 #define _VNIC_WQ_H_
diff --git a/drivers/scsi/snic/wq_enet_desc.h b/drivers/scsi/snic/wq_enet_desc.h
index 68f62b6d105b..e8025331b503 100644
--- a/drivers/scsi/snic/wq_enet_desc.h
+++ b/drivers/scsi/snic/wq_enet_desc.h
@@ -1,19 +1,5 @@
-/*
- * Copyright 2014 Cisco Systems, Inc.  All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright 2014 Cisco Systems, Inc.  All rights reserved. */
 
 #ifndef _WQ_ENET_DESC_H_
 #define _WQ_ENET_DESC_H_
diff --git a/include/media/i2c/ov2659.h b/include/media/i2c/ov2659.h
index 4216adc1ede2..c9ea318a8fc3 100644
--- a/include/media/i2c/ov2659.h
+++ b/include/media/i2c/ov2659.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Omnivision OV2659 CMOS Image Sensor driver
  *
@@ -5,19 +6,6 @@
  *
  * Benoit Parrot <bparrot@ti.com>
  * Lad, Prabhakar <prabhakar.csengg@gmail.com>
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
 
 #ifndef OV2659_H
diff --git a/samples/v4l/v4l2-pci-skeleton.c b/samples/v4l/v4l2-pci-skeleton.c
index 6311b7465220..a61f94db18d9 100644
--- a/samples/v4l/v4l2-pci-skeleton.c
+++ b/samples/v4l/v4l2-pci-skeleton.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This is a V4L2 PCI Skeleton Driver. It gives an initial skeleton source
  * for use with other PCI drivers.
@@ -6,19 +7,6 @@
  * input 0 and an HDMI connector as input 1.
  *
  * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
- *
- * This program is free software; you may redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
 
 #include <linux/types.h>
-- 
cgit 


From abd462747539bbc630e50e8be59e3aebca63441d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 7 Jun 2022 16:11:31 +0200
Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX -
 gpl-2.0_319.RULE

Based on the normalized pattern:

    this program is free software you can redistribute it and/or modify it
    under the terms of the gnu general public license version 2 as
    published by the free software foundation  this program is distributed
    as is without any warranty of any kind whether expressed or implied
    without even the implied warranty of merchantability or fitness for a
    particular purpose see the gnu general public license version 2 for
    more details

extracted by the scancode license scanner the SPDX license identifier

    GPL-2.0-only

has been chosen to replace the boilerplate/reference.

Reviewed-by: Allison Randal <allison@lohutok.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/gpio/gpio-lp873x.c             | 10 +---------
 drivers/gpio/gpio-lp87565.c            | 10 +---------
 drivers/gpio/gpio-pisosr.c             | 10 +---------
 drivers/gpio/gpio-tpic2810.c           | 10 +---------
 drivers/mfd/tps65086.c                 | 10 +---------
 drivers/mfd/tps65218.c                 | 10 +---------
 drivers/mfd/tps65912-core.c            | 10 +---------
 drivers/mfd/tps65912-i2c.c             | 10 +---------
 drivers/mfd/tps65912-spi.c             | 10 +---------
 drivers/regulator/lp873x-regulator.c   | 10 +---------
 drivers/regulator/tps65086-regulator.c | 10 +---------
 drivers/regulator/tps65218-regulator.c | 10 +---------
 drivers/regulator/tps65912-regulator.c | 10 +---------
 include/linux/mfd/tps65086.h           | 10 +---------
 include/linux/mfd/tps65218.h           | 10 +---------
 include/linux/mfd/tps65912.h           | 10 +---------
 16 files changed, 16 insertions(+), 144 deletions(-)

(limited to 'include')

diff --git a/drivers/gpio/gpio-lp873x.c b/drivers/gpio/gpio-lp873x.c
index 70fad87ff2db..5c79ba1f229c 100644
--- a/drivers/gpio/gpio-lp873x.c
+++ b/drivers/gpio/gpio-lp873x.c
@@ -1,16 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2016 Texas Instruments Incorporated - http://www.ti.com/
  *	Keerthy <j-keerthy@ti.com>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether expressed or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License version 2 for more details.
- *
  * Based on the TPS65218 driver
  */
 
diff --git a/drivers/gpio/gpio-lp87565.c b/drivers/gpio/gpio-lp87565.c
index fcde6708b5df..d3ce027de081 100644
--- a/drivers/gpio/gpio-lp87565.c
+++ b/drivers/gpio/gpio-lp87565.c
@@ -1,16 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2017 Texas Instruments Incorporated - http://www.ti.com/
  *	Keerthy <j-keerthy@ti.com>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether expressed or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License version 2 for more details.
- *
  * Based on the LP873X driver
  */
 
diff --git a/drivers/gpio/gpio-pisosr.c b/drivers/gpio/gpio-pisosr.c
index 81a47ae09ff8..67071bea08c2 100644
--- a/drivers/gpio/gpio-pisosr.c
+++ b/drivers/gpio/gpio-pisosr.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2015 Texas Instruments Incorporated - http://www.ti.com/
  *	Andrew F. Davis <afd@ti.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether expressed or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License version 2 for more details.
  */
 
 #include <linux/bitmap.h>
diff --git a/drivers/gpio/gpio-tpic2810.c b/drivers/gpio/gpio-tpic2810.c
index 99d5a84a9129..a09b1e69b072 100644
--- a/drivers/gpio/gpio-tpic2810.c
+++ b/drivers/gpio/gpio-tpic2810.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2015 Texas Instruments Incorporated - http://www.ti.com/
  *	Andrew F. Davis <afd@ti.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether expressed or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License version 2 for more details.
  */
 
 #include <linux/gpio/driver.h>
diff --git a/drivers/mfd/tps65086.c b/drivers/mfd/tps65086.c
index 3bd5728844a0..cbae9777a24e 100644
--- a/drivers/mfd/tps65086.c
+++ b/drivers/mfd/tps65086.c
@@ -1,16 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2015 Texas Instruments Incorporated - https://www.ti.com/
  *	Andrew F. Davis <afd@ti.com>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether expressed or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License version 2 for more details.
- *
  * Based on the TPS65912 driver
  */
 
diff --git a/drivers/mfd/tps65218.c b/drivers/mfd/tps65218.c
index 167e9fc308ef..49bb8fd168f8 100644
--- a/drivers/mfd/tps65218.c
+++ b/drivers/mfd/tps65218.c
@@ -1,16 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Driver for TPS65218 Integrated power management chipsets
  *
  * Copyright (C) 2014 Texas Instruments Incorporated - https://www.ti.com/
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether expressed or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License version 2 for more details.
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/mfd/tps65912-core.c b/drivers/mfd/tps65912-core.c
index c282a05e7146..7d994b8a5965 100644
--- a/drivers/mfd/tps65912-core.c
+++ b/drivers/mfd/tps65912-core.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Core functions for TI TPS65912x PMICs
  *
  * Copyright (C) 2015 Texas Instruments Incorporated - https://www.ti.com/
  *	Andrew F. Davis <afd@ti.com>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether expressed or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License version 2 for more details.
- *
  * Based on the TPS65218 driver and the previous TPS65912 driver by
  * Margarita Olaya Cabrera <magi@slimlogic.co.uk>
  */
diff --git a/drivers/mfd/tps65912-i2c.c b/drivers/mfd/tps65912-i2c.c
index 06eb2784d322..afb7f7d97dc0 100644
--- a/drivers/mfd/tps65912-i2c.c
+++ b/drivers/mfd/tps65912-i2c.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * I2C access driver for TI TPS65912x PMICs
  *
  * Copyright (C) 2015 Texas Instruments Incorporated - https://www.ti.com/
  *	Andrew F. Davis <afd@ti.com>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether expressed or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License version 2 for more details.
- *
  * Based on the TPS65218 driver and the previous TPS65912 driver by
  * Margarita Olaya Cabrera <magi@slimlogic.co.uk>
  */
diff --git a/drivers/mfd/tps65912-spi.c b/drivers/mfd/tps65912-spi.c
index bba38fbc781d..9e976f9c6bbe 100644
--- a/drivers/mfd/tps65912-spi.c
+++ b/drivers/mfd/tps65912-spi.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * SPI access driver for TI TPS65912x PMICs
  *
  * Copyright (C) 2015 Texas Instruments Incorporated - https://www.ti.com/
  *	Andrew F. Davis <afd@ti.com>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether expressed or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License version 2 for more details.
- *
  * Based on the TPS65218 driver and the previous TPS65912 driver by
  * Margarita Olaya Cabrera <magi@slimlogic.co.uk>
  */
diff --git a/drivers/regulator/lp873x-regulator.c b/drivers/regulator/lp873x-regulator.c
index c38387e0fbb2..d6e597922cb5 100644
--- a/drivers/regulator/lp873x-regulator.c
+++ b/drivers/regulator/lp873x-regulator.c
@@ -1,16 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Regulator driver for LP873X PMIC
  *
  * Copyright (C) 2016 Texas Instruments Incorporated - https://www.ti.com/
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether expressed or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License version 2 for more details.
  */
 
 #include <linux/module.h>
diff --git a/drivers/regulator/tps65086-regulator.c b/drivers/regulator/tps65086-regulator.c
index 070c956216b0..f1bc54c825dd 100644
--- a/drivers/regulator/tps65086-regulator.c
+++ b/drivers/regulator/tps65086-regulator.c
@@ -1,17 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2015 Texas Instruments Incorporated - https://www.ti.com/
  *
  * Author: Andrew F. Davis <afd@ti.com>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether expressed or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License version 2 for more details.
- *
  * Based on the TPS65912 driver
  */
 
diff --git a/drivers/regulator/tps65218-regulator.c b/drivers/regulator/tps65218-regulator.c
index fa263545a70e..48809c3b3abc 100644
--- a/drivers/regulator/tps65218-regulator.c
+++ b/drivers/regulator/tps65218-regulator.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * tps65218-regulator.c
  *
  * Regulator driver for TPS65218 PMIC
  *
  * Copyright (C) 2014 Texas Instruments Incorporated - https://www.ti.com/
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether expressed or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License version 2 for more details.
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/regulator/tps65912-regulator.c b/drivers/regulator/tps65912-regulator.c
index b52d4f2874b7..76f90202ae09 100644
--- a/drivers/regulator/tps65912-regulator.c
+++ b/drivers/regulator/tps65912-regulator.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Regulator driver for TI TPS65912x PMICs
  *
  * Copyright (C) 2015 Texas Instruments Incorporated - https://www.ti.com/
  *	Andrew F. Davis <afd@ti.com>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether expressed or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License version 2 for more details.
- *
  * Based on the TPS65218 driver and the previous TPS65912 driver by
  * Margarita Olaya Cabrera <magi@slimlogic.co.uk>
  */
diff --git a/include/linux/mfd/tps65086.h b/include/linux/mfd/tps65086.h
index e0a417e53766..16f87cccc003 100644
--- a/include/linux/mfd/tps65086.h
+++ b/include/linux/mfd/tps65086.h
@@ -1,16 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) 2015 Texas Instruments Incorporated - https://www.ti.com/
  *	Andrew F. Davis <afd@ti.com>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether expressed or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License version 2 for more details.
- *
  * Based on the TPS65912 driver
  */
 
diff --git a/include/linux/mfd/tps65218.h b/include/linux/mfd/tps65218.h
index 122e24ddbd4b..2946be2f15f3 100644
--- a/include/linux/mfd/tps65218.h
+++ b/include/linux/mfd/tps65218.h
@@ -1,18 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * linux/mfd/tps65218.h
  *
  * Functions to access TPS65218 power management chip.
  *
  * Copyright (C) 2014 Texas Instruments Incorporated - https://www.ti.com/
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether expressed or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License version 2 for more details.
  */
 
 #ifndef __LINUX_MFD_TPS65218_H
diff --git a/include/linux/mfd/tps65912.h b/include/linux/mfd/tps65912.h
index 8a61386cb8c1..860ec0a16c96 100644
--- a/include/linux/mfd/tps65912.h
+++ b/include/linux/mfd/tps65912.h
@@ -1,16 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) 2015 Texas Instruments Incorporated - https://www.ti.com/
  *	Andrew F. Davis <afd@ti.com>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether expressed or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License version 2 for more details.
- *
  * Based on the TPS65218 driver and the previous TPS65912 driver by
  * Margarita Olaya Cabrera <magi@slimlogic.co.uk>
  */
-- 
cgit 


From 5a729246e57eac410e4a13f5aba66ae2dc552632 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 7 Jun 2022 16:11:32 +0200
Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX -
 gpl-2.0_320.RULE

Based on the normalized pattern:

    this program is free software you can redistribute it and/or modify it
    under the terms of the gnu general public license version 2 as
    published by the free software foundation  this program is distributed
    as is without any warranty of any kind whether express or implied
    without even the implied warranty of merchantability or fitness for a
    particular purpose see the gnu general public license for more details

extracted by the scancode license scanner the SPDX license identifier

    GPL-2.0-only

has been chosen to replace the boilerplate/reference.

Reviewed-by: Allison Randal <allison@lohutok.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/arm/mach-omap1/timer.c             | 10 +---------
 arch/arm/mach-omap2/display.c           | 10 +---------
 arch/arm/mach-omap2/omap_opp_data.h     | 10 +---------
 arch/arm/mach-omap2/opp3xxx_data.c      | 10 +---------
 arch/arm/mach-omap2/opp4xxx_data.c      | 10 +---------
 arch/arm/mach-versatile/spc.c           | 10 +---------
 drivers/bus/omap_l3_noc.c               | 10 +---------
 drivers/bus/omap_l3_noc.h               | 10 +---------
 drivers/clk/keystone/sci-clk.c          | 10 +---------
 drivers/clk/ti/apll.c                   | 10 +---------
 drivers/clk/ti/autoidle.c               | 10 +---------
 drivers/clk/ti/clk-dra7-atl.c           | 10 +---------
 drivers/clk/ti/clk.c                    | 10 +---------
 drivers/clk/ti/clkctrl.c                | 10 +---------
 drivers/clk/ti/clkt_dflt.c              | 10 +---------
 drivers/clk/ti/clockdomain.c            | 10 +---------
 drivers/clk/ti/composite.c              | 10 +---------
 drivers/clk/ti/divider.c                | 10 +---------
 drivers/clk/ti/dpll.c                   | 10 +---------
 drivers/clk/ti/fixed-factor.c           | 10 +---------
 drivers/clk/ti/gate.c                   | 10 +---------
 drivers/clk/ti/interface.c              | 10 +---------
 drivers/clk/ti/mux.c                    | 10 +---------
 drivers/cpufreq/scpi-cpufreq.c          | 10 +---------
 drivers/input/misc/palmas-pwrbutton.c   | 10 +---------
 drivers/input/misc/tps65218-pwrbutton.c | 10 +---------
 drivers/misc/sram-exec.c                | 10 +---------
 drivers/power/supply/cpcap-battery.c    | 10 +---------
 drivers/regulator/ti-abb-regulator.c    | 10 +---------
 drivers/reset/reset-ti-sci.c            | 10 +---------
 drivers/reset/reset-ti-syscon.c         | 10 +---------
 drivers/thermal/hisi_thermal.c          | 10 +---------
 include/dt-bindings/clock/ti-dra7-atl.h | 10 +---------
 include/dt-bindings/pinctrl/hisi.h      | 10 +---------
 include/dt-bindings/pinctrl/keystone.h  | 10 +---------
 include/linux/clk/ti.h                  | 10 +---------
 include/linux/pm_wakeirq.h              | 14 ++------------
 include/linux/soc/ti/ti-msgmgr.h        | 10 +---------
 38 files changed, 39 insertions(+), 345 deletions(-)

(limited to 'include')

diff --git a/arch/arm/mach-omap1/timer.c b/arch/arm/mach-omap1/timer.c
index 9ed64345f06e..f5cd4bbf7566 100644
--- a/arch/arm/mach-omap1/timer.c
+++ b/arch/arm/mach-omap1/timer.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * OMAP1 Dual-Mode Timers - platform device registration
  *
@@ -9,15 +10,6 @@
  * Copyright (C) 2011 Texas Instruments Incorporated - https://www.ti.com/
  * Tarun Kanti DebBarma <tarun.kanti@ti.com>
  * Thara Gopinath <thara@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/clk.h>
diff --git a/arch/arm/mach-omap2/display.c b/arch/arm/mach-omap2/display.c
index 21413a9b7b6c..47149b278ad5 100644
--- a/arch/arm/mach-omap2/display.c
+++ b/arch/arm/mach-omap2/display.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * OMAP2plus display device setup / initialization.
  *
  * Copyright (C) 2010 Texas Instruments Incorporated - https://www.ti.com/
  *	Senthilvadivu Guruswamy
  *	Sumit Semwal
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/string.h>
diff --git a/arch/arm/mach-omap2/omap_opp_data.h b/arch/arm/mach-omap2/omap_opp_data.h
index 533dd643069a..88375ab38e31 100644
--- a/arch/arm/mach-omap2/omap_opp_data.h
+++ b/arch/arm/mach-omap2/omap_opp_data.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * OMAP SoC specific OPP Data helpers
  *
@@ -6,15 +7,6 @@
  *	Kevin Hilman
  * Copyright (C) 2010 Nokia Corporation.
  *      Eduardo Valentin
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 #ifndef __ARCH_ARM_MACH_OMAP2_OMAP_OPP_DATA_H
 #define __ARCH_ARM_MACH_OMAP2_OMAP_OPP_DATA_H
diff --git a/arch/arm/mach-omap2/opp3xxx_data.c b/arch/arm/mach-omap2/opp3xxx_data.c
index b610c5fb423b..90257e2fb3d6 100644
--- a/arch/arm/mach-omap2/opp3xxx_data.c
+++ b/arch/arm/mach-omap2/opp3xxx_data.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * OMAP3 OPP table definitions.
  *
@@ -7,15 +8,6 @@
  * Copyright (C) 2010-2011 Nokia Corporation.
  *      Eduardo Valentin
  *      Paul Walmsley
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 #include <linux/module.h>
 
diff --git a/arch/arm/mach-omap2/opp4xxx_data.c b/arch/arm/mach-omap2/opp4xxx_data.c
index d937c5ef41c6..a9851886017d 100644
--- a/arch/arm/mach-omap2/opp4xxx_data.c
+++ b/arch/arm/mach-omap2/opp4xxx_data.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * OMAP4 OPP table definitions.
  *
@@ -8,15 +9,6 @@
  * Copyright (C) 2010-2011 Nokia Corporation.
  *      Eduardo Valentin
  *      Paul Walmsley
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 #include <linux/module.h>
 
diff --git a/arch/arm/mach-versatile/spc.c b/arch/arm/mach-versatile/spc.c
index 6e6985e756af..5e44170e1a9a 100644
--- a/arch/arm/mach-versatile/spc.c
+++ b/arch/arm/mach-versatile/spc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Versatile Express Serial Power Controller (SPC) support
  *
@@ -6,15 +7,6 @@
  * Authors: Sudeep KarkadaNagesha <sudeep.karkadanagesha@arm.com>
  *          Achin Gupta           <achin.gupta@arm.com>
  *          Lorenzo Pieralisi     <lorenzo.pieralisi@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/clk-provider.h>
diff --git a/drivers/bus/omap_l3_noc.c b/drivers/bus/omap_l3_noc.c
index dcfb32ee5cb6..eb1ba6319fda 100644
--- a/drivers/bus/omap_l3_noc.c
+++ b/drivers/bus/omap_l3_noc.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * OMAP L3 Interconnect error handling driver
  *
  * Copyright (C) 2011-2015 Texas Instruments Incorporated - http://www.ti.com/
  *	Santosh Shilimkar <santosh.shilimkar@ti.com>
  *	Sricharan <r.sricharan@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 #include <linux/init.h>
 #include <linux/interrupt.h>
diff --git a/drivers/bus/omap_l3_noc.h b/drivers/bus/omap_l3_noc.h
index 73431f81da28..bb3eebd3465d 100644
--- a/drivers/bus/omap_l3_noc.h
+++ b/drivers/bus/omap_l3_noc.h
@@ -1,18 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * OMAP L3 Interconnect  error handling driver header
  *
  * Copyright (C) 2011-2015 Texas Instruments Incorporated - http://www.ti.com/
  *	Santosh Shilimkar <santosh.shilimkar@ti.com>
  *	sricharan <r.sricharan@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 #ifndef __OMAP_L3_NOC_H
 #define __OMAP_L3_NOC_H
diff --git a/drivers/clk/keystone/sci-clk.c b/drivers/clk/keystone/sci-clk.c
index 7e1b136e71ae..d4b4e74e22da 100644
--- a/drivers/clk/keystone/sci-clk.c
+++ b/drivers/clk/keystone/sci-clk.c
@@ -1,17 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * SCI Clock driver for keystone based devices
  *
  * Copyright (C) 2015-2016 Texas Instruments Incorporated - https://www.ti.com/
  *	Tero Kristo <t-kristo@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 #include <linux/clk-provider.h>
 #include <linux/err.h>
diff --git a/drivers/clk/ti/apll.c b/drivers/clk/ti/apll.c
index e4db6b9a55c6..dd0709c9c249 100644
--- a/drivers/clk/ti/apll.c
+++ b/drivers/clk/ti/apll.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * OMAP APLL clock support
  *
  * Copyright (C) 2013 Texas Instruments, Inc.
  *
  * J Keerthy <j-keerthy@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/clk.h>
diff --git a/drivers/clk/ti/autoidle.c b/drivers/clk/ti/autoidle.c
index d6e5f1511ace..27e6b9cb1881 100644
--- a/drivers/clk/ti/autoidle.c
+++ b/drivers/clk/ti/autoidle.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * TI clock autoidle support
  *
  * Copyright (C) 2013 Texas Instruments, Inc.
  *
  * Tero Kristo <t-kristo@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/clk-provider.h>
diff --git a/drivers/clk/ti/clk-dra7-atl.c b/drivers/clk/ti/clk-dra7-atl.c
index aa0950c4f498..f0f5bf68b6d2 100644
--- a/drivers/clk/ti/clk-dra7-atl.c
+++ b/drivers/clk/ti/clk-dra7-atl.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * DRA7 ATL (Audio Tracking Logic) clock driver
  *
  * Copyright (C) 2013 Texas Instruments, Inc.
  *
  * Peter Ujfalusi <peter.ujfalusi@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/init.h>
diff --git a/drivers/clk/ti/clk.c b/drivers/clk/ti/clk.c
index 3463579220b5..ef2a445c63a3 100644
--- a/drivers/clk/ti/clk.c
+++ b/drivers/clk/ti/clk.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * TI clock support
  *
  * Copyright (C) 2013 Texas Instruments, Inc.
  *
  * Tero Kristo <t-kristo@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/clk.h>
diff --git a/drivers/clk/ti/clkctrl.c b/drivers/clk/ti/clkctrl.c
index 617360e20d86..3b5ebae3740f 100644
--- a/drivers/clk/ti/clkctrl.c
+++ b/drivers/clk/ti/clkctrl.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * OMAP clkctrl clock support
  *
  * Copyright (C) 2017 Texas Instruments, Inc.
  *
  * Tero Kristo <t-kristo@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/clk-provider.h>
diff --git a/drivers/clk/ti/clkt_dflt.c b/drivers/clk/ti/clkt_dflt.c
index 91751dd26b16..a756ab1a5856 100644
--- a/drivers/clk/ti/clkt_dflt.c
+++ b/drivers/clk/ti/clkt_dflt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Default clock type
  *
@@ -8,15 +9,6 @@
  * Richard Woodruff <r-woodruff2@ti.com>
  * Paul Walmsley
  * Tero Kristo <t-kristo@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/clk/ti/clockdomain.c b/drivers/clk/ti/clockdomain.c
index 24179c907774..c897ad7e681e 100644
--- a/drivers/clk/ti/clockdomain.c
+++ b/drivers/clk/ti/clockdomain.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * OMAP clockdomain support
  *
  * Copyright (C) 2013 Texas Instruments, Inc.
  *
  * Tero Kristo <t-kristo@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/clk.h>
diff --git a/drivers/clk/ti/composite.c b/drivers/clk/ti/composite.c
index 779b9900f636..77b771dd050a 100644
--- a/drivers/clk/ti/composite.c
+++ b/drivers/clk/ti/composite.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * TI composite clock support
  *
  * Copyright (C) 2013 Texas Instruments, Inc.
  *
  * Tero Kristo <t-kristo@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/clk-provider.h>
diff --git a/drivers/clk/ti/divider.c b/drivers/clk/ti/divider.c
index 9fbea0997b43..488d3da60c31 100644
--- a/drivers/clk/ti/divider.c
+++ b/drivers/clk/ti/divider.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * TI Divider Clock
  *
  * Copyright (C) 2013 Texas Instruments, Inc.
  *
  * Tero Kristo <t-kristo@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/clk-provider.h>
diff --git a/drivers/clk/ti/dpll.c b/drivers/clk/ti/dpll.c
index 7c6dc8449b22..8ed43bc6b7cc 100644
--- a/drivers/clk/ti/dpll.c
+++ b/drivers/clk/ti/dpll.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * OMAP DPLL clock support
  *
  * Copyright (C) 2013 Texas Instruments, Inc.
  *
  * Tero Kristo <t-kristo@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/clk.h>
diff --git a/drivers/clk/ti/fixed-factor.c b/drivers/clk/ti/fixed-factor.c
index 8cb00d0af966..c80cee0f5d3d 100644
--- a/drivers/clk/ti/fixed-factor.c
+++ b/drivers/clk/ti/fixed-factor.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * TI Fixed Factor Clock
  *
  * Copyright (C) 2013 Texas Instruments, Inc.
  *
  * Tero Kristo <t-kristo@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/clk-provider.h>
diff --git a/drivers/clk/ti/gate.c b/drivers/clk/ti/gate.c
index 0033de9beb4c..307702921431 100644
--- a/drivers/clk/ti/gate.c
+++ b/drivers/clk/ti/gate.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * OMAP gate clock support
  *
  * Copyright (C) 2013 Texas Instruments, Inc.
  *
  * Tero Kristo <t-kristo@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/clk-provider.h>
diff --git a/drivers/clk/ti/interface.c b/drivers/clk/ti/interface.c
index dd2b455183a9..f47beeea211e 100644
--- a/drivers/clk/ti/interface.c
+++ b/drivers/clk/ti/interface.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * OMAP interface clock support
  *
  * Copyright (C) 2013 Texas Instruments, Inc.
  *
  * Tero Kristo <t-kristo@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/clk-provider.h>
diff --git a/drivers/clk/ti/mux.c b/drivers/clk/ti/mux.c
index 15de513d2d81..46b45b3e8319 100644
--- a/drivers/clk/ti/mux.c
+++ b/drivers/clk/ti/mux.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * TI Multiplexer Clock
  *
  * Copyright (C) 2013 Texas Instruments, Inc.
  *
  * Tero Kristo <t-kristo@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/clk-provider.h>
diff --git a/drivers/cpufreq/scpi-cpufreq.c b/drivers/cpufreq/scpi-cpufreq.c
index bda3e7d42964..fd2c16821d54 100644
--- a/drivers/cpufreq/scpi-cpufreq.c
+++ b/drivers/cpufreq/scpi-cpufreq.c
@@ -1,17 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * System Control and Power Interface (SCPI) based CPUFreq Interface driver
  *
  * Copyright (C) 2015 ARM Ltd.
  * Sudeep Holla <sudeep.holla@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/drivers/input/misc/palmas-pwrbutton.c b/drivers/input/misc/palmas-pwrbutton.c
index 2213e06b611d..465e6693077a 100644
--- a/drivers/input/misc/palmas-pwrbutton.c
+++ b/drivers/input/misc/palmas-pwrbutton.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Texas Instruments' Palmas Power Button Input Driver
  *
  * Copyright (C) 2012-2014 Texas Instruments Incorporated - http://www.ti.com/
  *	Girish S Ghongdemath
  *	Nishanth Menon
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/bitfield.h>
diff --git a/drivers/input/misc/tps65218-pwrbutton.c b/drivers/input/misc/tps65218-pwrbutton.c
index f011447c44fb..fc450fce0932 100644
--- a/drivers/input/misc/tps65218-pwrbutton.c
+++ b/drivers/input/misc/tps65218-pwrbutton.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Texas Instruments' TPS65217 and TPS65218 Power Button Input Driver
  *
  * Copyright (C) 2014 Texas Instruments Incorporated - http://www.ti.com/
  * Author: Felipe Balbi <balbi@ti.com>
  * Author: Marcin Niestroj <m.niestroj@grinn-global.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/init.h>
diff --git a/drivers/misc/sram-exec.c b/drivers/misc/sram-exec.c
index 6cc31789b38d..a948e95d4375 100644
--- a/drivers/misc/sram-exec.c
+++ b/drivers/misc/sram-exec.c
@@ -1,17 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * SRAM protect-exec region helper functions
  *
  * Copyright (C) 2017 Texas Instruments Incorporated - https://www.ti.com/
  *	Dave Gerlach
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/device.h>
diff --git a/drivers/power/supply/cpcap-battery.c b/drivers/power/supply/cpcap-battery.c
index ae284bdd6cc3..d98d9244e394 100644
--- a/drivers/power/supply/cpcap-battery.c
+++ b/drivers/power/supply/cpcap-battery.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Battery driver for CPCAP PMIC
  *
@@ -7,15 +8,6 @@
  * drivers:
  *
  * Copyright (C) 2009-2010 Motorola, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/delay.h>
diff --git a/drivers/regulator/ti-abb-regulator.c b/drivers/regulator/ti-abb-regulator.c
index bd7b2f287250..cf18452386ca 100644
--- a/drivers/regulator/ti-abb-regulator.c
+++ b/drivers/regulator/ti-abb-regulator.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Texas Instruments SoC Adaptive Body Bias(ABB) Regulator
  *
@@ -7,15 +8,6 @@
  * Copyright (C) 2012-2013 Texas Instruments, Inc.
  * Andrii Tseglytskyi <andrii.tseglytskyi@ti.com>
  * Nishanth Menon <nm@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 #include <linux/clk.h>
 #include <linux/delay.h>
diff --git a/drivers/reset/reset-ti-sci.c b/drivers/reset/reset-ti-sci.c
index b799aefad547..cc01fa5b0bea 100644
--- a/drivers/reset/reset-ti-sci.c
+++ b/drivers/reset/reset-ti-sci.c
@@ -1,17 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Texas Instrument's System Control Interface (TI-SCI) reset driver
  *
  * Copyright (C) 2015-2017 Texas Instruments Incorporated - https://www.ti.com/
  *	Andrew F. Davis <afd@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/idr.h>
diff --git a/drivers/reset/reset-ti-syscon.c b/drivers/reset/reset-ti-syscon.c
index 2b92775d58f0..f0dd7ffc3b72 100644
--- a/drivers/reset/reset-ti-syscon.c
+++ b/drivers/reset/reset-ti-syscon.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * TI SYSCON regmap reset driver
  *
  * Copyright (C) 2015-2016 Texas Instruments Incorporated - https://www.ti.com/
  *	Andrew F. Davis <afd@ti.com>
  *	Suman Anna <afd@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/mfd/syscon.h>
diff --git a/drivers/thermal/hisi_thermal.c b/drivers/thermal/hisi_thermal.c
index b29ab09040d5..19a242c69ce6 100644
--- a/drivers/thermal/hisi_thermal.c
+++ b/drivers/thermal/hisi_thermal.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * HiSilicon thermal sensor driver
  *
@@ -6,15 +7,6 @@
  *
  * Xinwei Kong <kong.kongxinwei@hisilicon.com>
  * Leo Yan <leo.yan@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
  */
 
 #include <linux/cpufreq.h>
diff --git a/include/dt-bindings/clock/ti-dra7-atl.h b/include/dt-bindings/clock/ti-dra7-atl.h
index 42dd4164f6f4..b0e71e3cce95 100644
--- a/include/dt-bindings/clock/ti-dra7-atl.h
+++ b/include/dt-bindings/clock/ti-dra7-atl.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * This header provides constants for DRA7 ATL (Audio Tracking Logic)
  *
@@ -6,15 +7,6 @@
  * Copyright (C) 2013 Texas Instruments, Inc.
  *
  * Peter Ujfalusi <peter.ujfalusi@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #ifndef _DT_BINDINGS_CLK_DRA7_ATL_H
diff --git a/include/dt-bindings/pinctrl/hisi.h b/include/dt-bindings/pinctrl/hisi.h
index 93064c750c8c..2175ec89c82f 100644
--- a/include/dt-bindings/pinctrl/hisi.h
+++ b/include/dt-bindings/pinctrl/hisi.h
@@ -1,17 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * This header provides constants for hisilicon pinctrl bindings.
  *
  * Copyright (c) 2015 HiSilicon Limited.
  * Copyright (c) 2015 Linaro Limited.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
  */
 
 #ifndef _DT_BINDINGS_PINCTRL_HISI_H
diff --git a/include/dt-bindings/pinctrl/keystone.h b/include/dt-bindings/pinctrl/keystone.h
index 7f97d776a8ff..66f8aecada53 100644
--- a/include/dt-bindings/pinctrl/keystone.h
+++ b/include/dt-bindings/pinctrl/keystone.h
@@ -1,16 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * This header provides constants for Keystone pinctrl bindings.
  *
  * Copyright (C) 2016 Texas Instruments Incorporated - http://www.ti.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #ifndef _DT_BINDINGS_PINCTRL_KEYSTONE_H
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index 3486f20a3753..cbfcbf186ce3 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -1,16 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * TI clock drivers support
  *
  * Copyright (C) 2013 Texas Instruments, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 #ifndef __LINUX_CLK_TI_H__
 #define __LINUX_CLK_TI_H__
diff --git a/include/linux/pm_wakeirq.h b/include/linux/pm_wakeirq.h
index e63a63aa47a3..dd42d16945d0 100644
--- a/include/linux/pm_wakeirq.h
+++ b/include/linux/pm_wakeirq.h
@@ -1,15 +1,5 @@
-/*
- * pm_wakeirq.h - Device wakeirq helper functions
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* pm_wakeirq.h - Device wakeirq helper functions */
 
 #ifndef _LINUX_PM_WAKEIRQ_H
 #define _LINUX_PM_WAKEIRQ_H
diff --git a/include/linux/soc/ti/ti-msgmgr.h b/include/linux/soc/ti/ti-msgmgr.h
index 69a8d7682c4b..543da257a5f2 100644
--- a/include/linux/soc/ti/ti-msgmgr.h
+++ b/include/linux/soc/ti/ti-msgmgr.h
@@ -1,17 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Texas Instruments' Message Manager
  *
  * Copyright (C) 2015-2022 Texas Instruments Incorporated - https://www.ti.com/
  *	Nishanth Menon
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #ifndef TI_MSGMGR_H
-- 
cgit 


From 17472bc2c3d00e4ed72c3c7b9af0697edea02fff Mon Sep 17 00:00:00 2001
From: Julia Lawall <Julia.Lawall@inria.fr>
Date: Sat, 21 May 2022 13:11:27 +0200
Subject: wifi: nl80211: fix typo in comment

Spelling mistake (triple letters) in comment.
Detected with the help of Coccinelle.

Signed-off-by: Julia Lawall <Julia.Lawall@inria.fr>
Link: https://lore.kernel.org/r/20220521111145.81697-77-Julia.Lawall@inria.fr
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index d9490e3062a7..98f905f16411 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -5874,7 +5874,7 @@ enum nl80211_ap_sme_features {
  * @NL80211_FEATURE_INACTIVITY_TIMER: This driver takes care of freeing up
  *	the connected inactive stations in AP mode.
  * @NL80211_FEATURE_CELL_BASE_REG_HINTS: This driver has been tested
- *	to work properly to suppport receiving regulatory hints from
+ *	to work properly to support receiving regulatory hints from
  *	cellular base stations.
  * @NL80211_FEATURE_P2P_DEVICE_NEEDS_CHANNEL: (no longer available, only
  *	here to reserve the value for API/ABI compatibility)
-- 
cgit 


From 23a5f0af6ff43195c6fd15d8ae59d019836a6046 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 9 Feb 2022 13:14:26 +0100
Subject: wifi: mac80211: remove cipher scheme support

The only driver using this was iwlwifi, where we just removed
the support because it was never really used. Remove the code
from mac80211 as well.

Change-Id: I1667417a5932315ee9d81f5c233c56a354923f09
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  35 ------------
 net/mac80211/cfg.c         |  26 +--------
 net/mac80211/ieee80211_i.h |  11 +---
 net/mac80211/iface.c       |   7 +--
 net/mac80211/key.c         |  22 +-------
 net/mac80211/key.h         |   7 +--
 net/mac80211/main.c        |  69 ++---------------------
 net/mac80211/mesh_hwmp.c   |   6 +-
 net/mac80211/mlme.c        |   6 +-
 net/mac80211/rx.c          |  49 ++++-------------
 net/mac80211/sta_info.h    |   4 +-
 net/mac80211/tx.c          |  21 ++-----
 net/mac80211/util.c        |  70 +-----------------------
 net/mac80211/wpa.c         | 133 +--------------------------------------------
 net/mac80211/wpa.h         |   5 +-
 15 files changed, 39 insertions(+), 432 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index ebadb2103968..5c9e97eca739 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1958,36 +1958,6 @@ struct ieee80211_key_seq {
 	};
 };
 
-/**
- * struct ieee80211_cipher_scheme - cipher scheme
- *
- * This structure contains a cipher scheme information defining
- * the secure packet crypto handling.
- *
- * @cipher: a cipher suite selector
- * @iftype: a cipher iftype bit mask indicating an allowed cipher usage
- * @hdr_len: a length of a security header used the cipher
- * @pn_len: a length of a packet number in the security header
- * @pn_off: an offset of pn from the beginning of the security header
- * @key_idx_off: an offset of key index byte in the security header
- * @key_idx_mask: a bit mask of key_idx bits
- * @key_idx_shift: a bit shift needed to get key_idx
- *     key_idx value calculation:
- *      (sec_header_base[key_idx_off] & key_idx_mask) >> key_idx_shift
- * @mic_len: a mic length in bytes
- */
-struct ieee80211_cipher_scheme {
-	u32 cipher;
-	u16 iftype;
-	u8 hdr_len;
-	u8 pn_len;
-	u8 pn_off;
-	u8 key_idx_off;
-	u8 key_idx_mask;
-	u8 key_idx_shift;
-	u8 mic_len;
-};
-
 /**
  * enum set_key_cmd - key command
  *
@@ -2664,9 +2634,6 @@ enum ieee80211_hw_flags {
  *	deliver to a WMM STA during any Service Period triggered by the WMM STA.
  *	Use IEEE80211_WMM_IE_STA_QOSINFO_SP_* for correct values.
  *
- * @n_cipher_schemes: a size of an array of cipher schemes definitions.
- * @cipher_schemes: a pointer to an array of cipher scheme definitions
- *	supported by HW.
  * @max_nan_de_entries: maximum number of NAN DE functions supported by the
  *	device.
  *
@@ -2716,8 +2683,6 @@ struct ieee80211_hw {
 	netdev_features_t netdev_features;
 	u8 uapsd_queues;
 	u8 uapsd_max_sp_len;
-	u8 n_cipher_schemes;
-	const struct ieee80211_cipher_scheme *cipher_schemes;
 	u8 max_nan_de_entries;
 	u8 tx_sk_pacing_shift;
 	u8 weight_multiplier;
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index f7896f257e1b..f3b10cee9299 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -5,7 +5,7 @@
  * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2015  Intel Mobile Communications GmbH
  * Copyright (C) 2015-2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
  */
 
 #include <linux/ieee80211.h>
@@ -438,7 +438,6 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_local *local = sdata->local;
 	struct sta_info *sta = NULL;
-	const struct ieee80211_cipher_scheme *cs = NULL;
 	struct ieee80211_key *key;
 	int err;
 
@@ -456,23 +455,12 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
 		if (WARN_ON_ONCE(fips_enabled))
 			return -EINVAL;
 		break;
-	case WLAN_CIPHER_SUITE_CCMP:
-	case WLAN_CIPHER_SUITE_CCMP_256:
-	case WLAN_CIPHER_SUITE_AES_CMAC:
-	case WLAN_CIPHER_SUITE_BIP_CMAC_256:
-	case WLAN_CIPHER_SUITE_BIP_GMAC_128:
-	case WLAN_CIPHER_SUITE_BIP_GMAC_256:
-	case WLAN_CIPHER_SUITE_GCMP:
-	case WLAN_CIPHER_SUITE_GCMP_256:
-		break;
 	default:
-		cs = ieee80211_cs_get(local, params->cipher, sdata->vif.type);
 		break;
 	}
 
 	key = ieee80211_key_alloc(params->cipher, key_idx, params->key_len,
-				  params->key, params->seq_len, params->seq,
-				  cs);
+				  params->key, params->seq_len, params->seq);
 	if (IS_ERR(key))
 		return PTR_ERR(key);
 
@@ -537,9 +525,6 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
 		break;
 	}
 
-	if (sta)
-		sta->cipher_scheme = cs;
-
 	err = ieee80211_key_link(key, sdata, sta);
 
  out_unlock:
@@ -1207,9 +1192,6 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 				params->crypto.control_port_over_nl80211;
 	sdata->control_port_no_preauth =
 				params->crypto.control_port_no_preauth;
-	sdata->encrypt_headroom = ieee80211_cs_headroom(sdata->local,
-							&params->crypto,
-							sdata->vif.type);
 
 	list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) {
 		vlan->control_port_protocol =
@@ -1220,10 +1202,6 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 			params->crypto.control_port_over_nl80211;
 		vlan->control_port_no_preauth =
 			params->crypto.control_port_no_preauth;
-		vlan->encrypt_headroom =
-			ieee80211_cs_headroom(sdata->local,
-					      &params->crypto,
-					      vlan->vif.type);
 	}
 
 	sdata->vif.bss_conf.dtim_period = params->dtim_period;
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 86ef0a46a68c..1cf331572de1 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -5,7 +5,7 @@
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2007-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2015  Intel Mobile Communications GmbH
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
  */
 
 #ifndef IEEE80211_I_H
@@ -944,7 +944,6 @@ struct ieee80211_sub_if_data {
 	bool control_port_no_encrypt;
 	bool control_port_no_preauth;
 	bool control_port_over_nl80211;
-	int encrypt_headroom;
 
 	atomic_t num_tx_queued;
 	struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS];
@@ -2483,14 +2482,6 @@ void ieee80211_dfs_radar_detected_work(struct work_struct *work);
 int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata,
 			      struct cfg80211_csa_settings *csa_settings);
 
-bool ieee80211_cs_valid(const struct ieee80211_cipher_scheme *cs);
-bool ieee80211_cs_list_valid(const struct ieee80211_cipher_scheme *cs, int n);
-const struct ieee80211_cipher_scheme *
-ieee80211_cs_get(struct ieee80211_local *local, u32 cipher,
-		 enum nl80211_iftype iftype);
-int ieee80211_cs_headroom(struct ieee80211_local *local,
-			  struct cfg80211_crypto_settings *crypto,
-			  enum nl80211_iftype iftype);
 void ieee80211_recalc_dtim(struct ieee80211_local *local,
 			   struct ieee80211_sub_if_data *sdata);
 int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata,
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 41531478437c..fb8d102fca48 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -8,7 +8,7 @@
  * Copyright 2008, Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright (c) 2016        Intel Deutschland GmbH
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
  */
 #include <linux/slab.h>
 #include <linux/kernel.h>
@@ -1036,8 +1036,6 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local)
 		 wiphy_name(local->hw.wiphy));
 	sdata->wdev.iftype = NL80211_IFTYPE_MONITOR;
 
-	sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM;
-
 	ieee80211_set_default_queues(sdata);
 
 	ret = drv_add_interface(local, sdata);
@@ -1644,7 +1642,6 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
 	sdata->control_port_no_encrypt = false;
 	sdata->control_port_over_nl80211 = false;
 	sdata->control_port_no_preauth = false;
-	sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM;
 	sdata->vif.bss_conf.idle = true;
 	sdata->vif.bss_conf.txpower = INT_MIN; /* unset */
 
@@ -2116,8 +2113,6 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
 	sdata->ap_power_level = IEEE80211_UNSET_POWER_LEVEL;
 	sdata->user_power_level = local->user_power_level;
 
-	sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM;
-
 	/* setup type-dependent data */
 	ieee80211_setup_sdata(sdata, type);
 
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 0fcf8aebedc4..c3476de4b14d 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -6,7 +6,7 @@
  * Copyright 2007-2008	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright 2015-2017	Intel Deutschland GmbH
- * Copyright 2018-2020  Intel Corporation
+ * Copyright 2018-2020, 2022  Intel Corporation
  */
 
 #include <linux/if_ether.h>
@@ -531,8 +531,7 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
 struct ieee80211_key *
 ieee80211_key_alloc(u32 cipher, int idx, size_t key_len,
 		    const u8 *key_data,
-		    size_t seq_len, const u8 *seq,
-		    const struct ieee80211_cipher_scheme *cs)
+		    size_t seq_len, const u8 *seq)
 {
 	struct ieee80211_key *key;
 	int i, j, err;
@@ -675,21 +674,6 @@ ieee80211_key_alloc(u32 cipher, int idx, size_t key_len,
 			return ERR_PTR(err);
 		}
 		break;
-	default:
-		if (cs) {
-			if (seq_len && seq_len != cs->pn_len) {
-				kfree(key);
-				return ERR_PTR(-EINVAL);
-			}
-
-			key->conf.iv_len = cs->hdr_len;
-			key->conf.icv_len = cs->mic_len;
-			for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++)
-				for (j = 0; j < seq_len; j++)
-					key->u.gen.rx_pn[i][j] =
-							seq[seq_len - j - 1];
-			key->flags |= KEY_FLAG_CIPHER_SCHEME;
-		}
 	}
 	memcpy(key->conf.key, key_data, key_len);
 	INIT_LIST_HEAD(&key->list);
@@ -1294,7 +1278,7 @@ ieee80211_gtk_rekey_add(struct ieee80211_vif *vif,
 
 	key = ieee80211_key_alloc(keyconf->cipher, keyconf->keyidx,
 				  keyconf->keylen, keyconf->key,
-				  0, NULL, NULL);
+				  0, NULL);
 	if (IS_ERR(key))
 		return ERR_CAST(key);
 
diff --git a/net/mac80211/key.h b/net/mac80211/key.h
index 1e326c89d721..b3fb41c0c77f 100644
--- a/net/mac80211/key.h
+++ b/net/mac80211/key.h
@@ -2,7 +2,7 @@
 /*
  * Copyright 2002-2004, Instant802 Networks, Inc.
  * Copyright 2005, Devicescape Software, Inc.
- * Copyright (C) 2019 Intel Corporation
+ * Copyright (C) 2019, 2022 Intel Corporation
  */
 
 #ifndef IEEE80211_KEY_H
@@ -30,12 +30,10 @@ struct sta_info;
  * @KEY_FLAG_UPLOADED_TO_HARDWARE: Indicates that this key is present
  *	in the hardware for TX crypto hardware acceleration.
  * @KEY_FLAG_TAINTED: Key is tainted and packets should be dropped.
- * @KEY_FLAG_CIPHER_SCHEME: This key is for a hardware cipher scheme
  */
 enum ieee80211_internal_key_flags {
 	KEY_FLAG_UPLOADED_TO_HARDWARE	= BIT(0),
 	KEY_FLAG_TAINTED		= BIT(1),
-	KEY_FLAG_CIPHER_SCHEME		= BIT(2),
 };
 
 enum ieee80211_internal_tkip_state {
@@ -140,8 +138,7 @@ struct ieee80211_key {
 struct ieee80211_key *
 ieee80211_key_alloc(u32 cipher, int idx, size_t key_len,
 		    const u8 *key_data,
-		    size_t seq_len, const u8 *seq,
-		    const struct ieee80211_cipher_scheme *cs);
+		    size_t seq_len, const u8 *seq);
 /*
  * Insert a key into data structures (sdata, sta if necessary)
  * to make it used, free old key. On failure, also free the new key.
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 5a385d4146b9..4f3e93c0819b 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -5,7 +5,7 @@
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright (C) 2017     Intel Deutschland GmbH
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
  */
 
 #include <net/mac80211.h>
@@ -778,7 +778,7 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local)
 {
 	bool have_wep = !fips_enabled; /* FIPS does not permit the use of RC4 */
 	bool have_mfp = ieee80211_hw_check(&local->hw, MFP_CAPABLE);
-	int n_suites = 0, r = 0, w = 0;
+	int r = 0, w = 0;
 	u32 *suites;
 	static const u32 cipher_suites[] = {
 		/* keep WEP first, it may be removed below */
@@ -824,10 +824,9 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local)
 				continue;
 			suites[w++] = suite;
 		}
-	} else if (!local->hw.cipher_schemes) {
-		/* If the driver doesn't have cipher schemes, there's nothing
-		 * else to do other than assign the (software supported and
-		 * perhaps offloaded) cipher suites.
+	} else {
+		/* assign the (software supported and perhaps offloaded)
+		 * cipher suites
 		 */
 		local->hw.wiphy->cipher_suites = cipher_suites;
 		local->hw.wiphy->n_cipher_suites = ARRAY_SIZE(cipher_suites);
@@ -842,58 +841,6 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local)
 
 		/* not dynamically allocated, so just return */
 		return 0;
-	} else {
-		const struct ieee80211_cipher_scheme *cs;
-
-		cs = local->hw.cipher_schemes;
-
-		/* Driver specifies cipher schemes only (but not cipher suites
-		 * including the schemes)
-		 *
-		 * We start counting ciphers defined by schemes, TKIP, CCMP,
-		 * CCMP-256, GCMP, and GCMP-256
-		 */
-		n_suites = local->hw.n_cipher_schemes + 5;
-
-		/* check if we have WEP40 and WEP104 */
-		if (have_wep)
-			n_suites += 2;
-
-		/* check if we have AES_CMAC, BIP-CMAC-256, BIP-GMAC-128,
-		 * BIP-GMAC-256
-		 */
-		if (have_mfp)
-			n_suites += 4;
-
-		suites = kmalloc_array(n_suites, sizeof(u32), GFP_KERNEL);
-		if (!suites)
-			return -ENOMEM;
-
-		suites[w++] = WLAN_CIPHER_SUITE_CCMP;
-		suites[w++] = WLAN_CIPHER_SUITE_CCMP_256;
-		suites[w++] = WLAN_CIPHER_SUITE_TKIP;
-		suites[w++] = WLAN_CIPHER_SUITE_GCMP;
-		suites[w++] = WLAN_CIPHER_SUITE_GCMP_256;
-
-		if (have_wep) {
-			suites[w++] = WLAN_CIPHER_SUITE_WEP40;
-			suites[w++] = WLAN_CIPHER_SUITE_WEP104;
-		}
-
-		if (have_mfp) {
-			suites[w++] = WLAN_CIPHER_SUITE_AES_CMAC;
-			suites[w++] = WLAN_CIPHER_SUITE_BIP_CMAC_256;
-			suites[w++] = WLAN_CIPHER_SUITE_BIP_GMAC_128;
-			suites[w++] = WLAN_CIPHER_SUITE_BIP_GMAC_256;
-		}
-
-		for (r = 0; r < local->hw.n_cipher_schemes; r++) {
-			suites[w++] = cs[r].cipher;
-			if (WARN_ON(cs[r].pn_len > IEEE80211_MAX_PN_LEN)) {
-				kfree(suites);
-				return -EINVAL;
-			}
-		}
 	}
 
 	local->hw.wiphy->cipher_suites = suites;
@@ -1168,12 +1115,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 	if (local->hw.wiphy->max_scan_ie_len)
 		local->hw.wiphy->max_scan_ie_len -= local->scan_ies_len;
 
-	if (WARN_ON(!ieee80211_cs_list_valid(local->hw.cipher_schemes,
-					     local->hw.n_cipher_schemes))) {
-		result = -EINVAL;
-		goto fail_workqueue;
-	}
-
 	result = ieee80211_init_cipher_suites(local);
 	if (result < 0)
 		goto fail_workqueue;
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index 58ebdcd69d05..45e7c1b307bc 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2008, 2009 open80211s Ltd.
- * Copyright (C) 2019, 2021 Intel Corporation
+ * Copyright (C) 2019, 2021-2022 Intel Corporation
  * Author:     Luis Carlos Cobo <luisca@cozybit.com>
  */
 
@@ -247,13 +247,13 @@ int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata,
 		return -EAGAIN;
 
 	skb = dev_alloc_skb(local->tx_headroom +
-			    sdata->encrypt_headroom +
+			    IEEE80211_ENCRYPT_HEADROOM +
 			    IEEE80211_ENCRYPT_TAILROOM +
 			    hdr_len +
 			    2 + 15 /* PERR IE */);
 	if (!skb)
 		return -1;
-	skb_reserve(skb, local->tx_headroom + sdata->encrypt_headroom);
+	skb_reserve(skb, local->tx_headroom + IEEE80211_ENCRYPT_HEADROOM);
 	mgmt = skb_put_zero(skb, hdr_len);
 	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
 					  IEEE80211_STYPE_ACTION);
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 58d48dcae030..6d5ad71ef02c 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -8,7 +8,7 @@
  * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright (C) 2015 - 2017 Intel Deutschland GmbH
- * Copyright (C) 2018 - 2021 Intel Corporation
+ * Copyright (C) 2018 - 2022 Intel Corporation
  */
 
 #include <linux/delay.h>
@@ -2496,8 +2496,6 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 	memset(ifmgd->tx_tspec, 0, sizeof(ifmgd->tx_tspec));
 	cancel_delayed_work_sync(&ifmgd->tx_tspec_wk);
 
-	sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM;
-
 	bss_conf->pwr_reduction = 0;
 	bss_conf->tx_pwr_env_num = 0;
 	memset(bss_conf->tx_pwr_env, 0, sizeof(bss_conf->tx_pwr_env));
@@ -6071,8 +6069,6 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
 	sdata->control_port_over_nl80211 =
 					req->crypto.control_port_over_nl80211;
 	sdata->control_port_no_preauth = req->crypto.control_port_no_preauth;
-	sdata->encrypt_headroom = ieee80211_cs_headroom(local, &req->crypto,
-							sdata->vif.type);
 
 	/* kick off associate process */
 
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 3c08ae04ddbc..a9f4e90ad893 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -6,7 +6,7 @@
  * Copyright 2007-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright(c) 2015 - 2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
  */
 
 #include <linux/jiffies.h>
@@ -1009,43 +1009,20 @@ static int ieee80211_get_mmie_keyidx(struct sk_buff *skb)
 	return -1;
 }
 
-static int ieee80211_get_keyid(struct sk_buff *skb,
-			       const struct ieee80211_cipher_scheme *cs)
+static int ieee80211_get_keyid(struct sk_buff *skb)
 {
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
-	__le16 fc;
-	int hdrlen;
-	int minlen;
-	u8 key_idx_off;
-	u8 key_idx_shift;
+	__le16 fc = hdr->frame_control;
+	int hdrlen = ieee80211_hdrlen(fc);
 	u8 keyid;
 
-	fc = hdr->frame_control;
-	hdrlen = ieee80211_hdrlen(fc);
-
-	if (cs) {
-		minlen = hdrlen + cs->hdr_len;
-		key_idx_off = hdrlen + cs->key_idx_off;
-		key_idx_shift = cs->key_idx_shift;
-	} else {
-		/* WEP, TKIP, CCMP and GCMP */
-		minlen = hdrlen + IEEE80211_WEP_IV_LEN;
-		key_idx_off = hdrlen + 3;
-		key_idx_shift = 6;
-	}
-
-	if (unlikely(skb->len < minlen))
+	/* WEP, TKIP, CCMP and GCMP */
+	if (unlikely(skb->len < hdrlen + IEEE80211_WEP_IV_LEN))
 		return -EINVAL;
 
-	skb_copy_bits(skb, key_idx_off, &keyid, 1);
+	skb_copy_bits(skb, hdrlen + 3, &keyid, 1);
 
-	if (cs)
-		keyid &= cs->key_idx_mask;
-	keyid >>= key_idx_shift;
-
-	/* cs could use more than the usual two bits for the keyid */
-	if (unlikely(keyid >= NUM_DEFAULT_KEYS))
-		return -EINVAL;
+	keyid >>= 6;
 
 	return keyid;
 }
@@ -1916,7 +1893,6 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
 	struct ieee80211_key *ptk_idx = NULL;
 	int mmie_keyidx = -1;
 	__le16 fc;
-	const struct ieee80211_cipher_scheme *cs = NULL;
 
 	if (ieee80211_is_ext(hdr->frame_control))
 		return RX_CONTINUE;
@@ -1959,8 +1935,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
 
 		if (ieee80211_has_protected(fc) &&
 		    !(status->flag & RX_FLAG_IV_STRIPPED)) {
-			cs = rx->sta->cipher_scheme;
-			keyid = ieee80211_get_keyid(rx->skb, cs);
+			keyid = ieee80211_get_keyid(rx->skb);
 
 			if (unlikely(keyid < 0))
 				return RX_DROP_UNUSABLE;
@@ -2065,7 +2040,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
 		    (status->flag & RX_FLAG_IV_STRIPPED))
 			return RX_CONTINUE;
 
-		keyidx = ieee80211_get_keyid(rx->skb, cs);
+		keyidx = ieee80211_get_keyid(rx->skb);
 
 		if (unlikely(keyidx < 0))
 			return RX_DROP_UNUSABLE;
@@ -2131,7 +2106,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
 		result = ieee80211_crypto_gcmp_decrypt(rx);
 		break;
 	default:
-		result = ieee80211_crypto_hw_decrypt(rx);
+		result = RX_DROP_UNUSABLE;
 	}
 
 	/* the hdr variable is invalid after the decrypt handlers */
@@ -2945,7 +2920,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
 		tailroom = IEEE80211_ENCRYPT_TAILROOM;
 
 	fwd_skb = skb_copy_expand(skb, local->tx_headroom +
-				       sdata->encrypt_headroom,
+				       IEEE80211_ENCRYPT_HEADROOM,
 				  tailroom, GFP_ATOMIC);
 	if (!fwd_skb)
 		goto out;
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 35c390bedfba..aa6950aa49a9 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -3,7 +3,7 @@
  * Copyright 2002-2005, Devicescape Software, Inc.
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright(c) 2015-2017 Intel Deutschland GmbH
- * Copyright(c) 2020-2021 Intel Corporation
+ * Copyright(c) 2020-2022 Intel Corporation
  */
 
 #ifndef STA_INFO_H
@@ -616,7 +616,6 @@ struct link_sta_info {
  *	taken from HT/VHT capabilities or VHT operating mode notification
  * @known_smps_mode: the smps_mode the client thinks we are in. Relevant for
  *	AP only.
- * @cipher_scheme: optional cipher scheme for this station
  * @cparams: CoDel parameters for this station.
  * @reserved_tid: reserved TID (if any, otherwise IEEE80211_TID_UNRESERVED)
  * @fast_tx: TX fastpath information
@@ -700,7 +699,6 @@ struct sta_info {
 #endif
 
 	enum ieee80211_smps_mode known_smps_mode;
-	const struct ieee80211_cipher_scheme *cipher_scheme;
 
 	struct codel_params cparams;
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 0e4efc08c762..37fe72bb5ab0 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -5,7 +5,7 @@
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
  *
  * Transmit and frame generation functions.
  */
@@ -882,7 +882,7 @@ static int ieee80211_fragment(struct ieee80211_tx_data *tx,
 		rem -= fraglen;
 		tmp = dev_alloc_skb(local->tx_headroom +
 				    frag_threshold +
-				    tx->sdata->encrypt_headroom +
+				    IEEE80211_ENCRYPT_HEADROOM +
 				    IEEE80211_ENCRYPT_TAILROOM);
 		if (!tmp)
 			return -ENOMEM;
@@ -890,7 +890,7 @@ static int ieee80211_fragment(struct ieee80211_tx_data *tx,
 		__skb_queue_tail(&tx->skbs, tmp);
 
 		skb_reserve(tmp,
-			    local->tx_headroom + tx->sdata->encrypt_headroom);
+			    local->tx_headroom + IEEE80211_ENCRYPT_HEADROOM);
 
 		/* copy control information */
 		memcpy(tmp->cb, skb->cb, sizeof(tmp->cb));
@@ -1040,8 +1040,6 @@ ieee80211_tx_h_encrypt(struct ieee80211_tx_data *tx)
 	case WLAN_CIPHER_SUITE_GCMP:
 	case WLAN_CIPHER_SUITE_GCMP_256:
 		return ieee80211_crypto_gcmp_encrypt(tx);
-	default:
-		return ieee80211_crypto_hw_encrypt(tx);
 	}
 
 	return TX_DROP;
@@ -2013,7 +2011,7 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
 
 	headroom = local->tx_headroom;
 	if (encrypt != ENCRYPT_NO)
-		headroom += sdata->encrypt_headroom;
+		headroom += IEEE80211_ENCRYPT_HEADROOM;
 	headroom -= skb_headroom(skb);
 	headroom = max_t(int, 0, headroom);
 
@@ -2867,7 +2865,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
 	 */
 
 	if (head_need > 0 || skb_cloned(skb)) {
-		head_need += sdata->encrypt_headroom;
+		head_need += IEEE80211_ENCRYPT_HEADROOM;
 		head_need += local->tx_headroom;
 		head_need = max_t(int, 0, head_need);
 		if (ieee80211_skb_resize(sdata, skb, head_need, ENCRYPT_DATA)) {
@@ -3128,15 +3126,6 @@ void ieee80211_check_fast_xmit(struct sta_info *sta)
 			/* we don't know how to generate IVs for this at all */
 			if (WARN_ON(gen_iv))
 				goto out;
-			/* pure hardware keys are OK, of course */
-			if (!(build.key->flags & KEY_FLAG_CIPHER_SCHEME))
-				break;
-			/* cipher scheme might require space allocation */
-			if (iv_spc &&
-			    build.key->conf.iv_len > IEEE80211_FAST_XMIT_MAX_IV)
-				goto out;
-			if (iv_spc)
-				build.hdr_len += build.key->conf.iv_len;
 		}
 
 		fc |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 1e26b5235add..9e6c4dcef280 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -6,7 +6,7 @@
  * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright (C) 2015-2017	Intel Deutschland GmbH
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
  *
  * utilities for mac80211
  */
@@ -4212,74 +4212,6 @@ int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata,
 	return 0;
 }
 
-bool ieee80211_cs_valid(const struct ieee80211_cipher_scheme *cs)
-{
-	return !(cs == NULL || cs->cipher == 0 ||
-		 cs->hdr_len < cs->pn_len + cs->pn_off ||
-		 cs->hdr_len <= cs->key_idx_off ||
-		 cs->key_idx_shift > 7 ||
-		 cs->key_idx_mask == 0);
-}
-
-bool ieee80211_cs_list_valid(const struct ieee80211_cipher_scheme *cs, int n)
-{
-	int i;
-
-	/* Ensure we have enough iftype bitmap space for all iftype values */
-	WARN_ON((NUM_NL80211_IFTYPES / 8 + 1) > sizeof(cs[0].iftype));
-
-	for (i = 0; i < n; i++)
-		if (!ieee80211_cs_valid(&cs[i]))
-			return false;
-
-	return true;
-}
-
-const struct ieee80211_cipher_scheme *
-ieee80211_cs_get(struct ieee80211_local *local, u32 cipher,
-		 enum nl80211_iftype iftype)
-{
-	const struct ieee80211_cipher_scheme *l = local->hw.cipher_schemes;
-	int n = local->hw.n_cipher_schemes;
-	int i;
-	const struct ieee80211_cipher_scheme *cs = NULL;
-
-	for (i = 0; i < n; i++) {
-		if (l[i].cipher == cipher) {
-			cs = &l[i];
-			break;
-		}
-	}
-
-	if (!cs || !(cs->iftype & BIT(iftype)))
-		return NULL;
-
-	return cs;
-}
-
-int ieee80211_cs_headroom(struct ieee80211_local *local,
-			  struct cfg80211_crypto_settings *crypto,
-			  enum nl80211_iftype iftype)
-{
-	const struct ieee80211_cipher_scheme *cs;
-	int headroom = IEEE80211_ENCRYPT_HEADROOM;
-	int i;
-
-	for (i = 0; i < crypto->n_ciphers_pairwise; i++) {
-		cs = ieee80211_cs_get(local, crypto->ciphers_pairwise[i],
-				      iftype);
-
-		if (cs && headroom < cs->hdr_len)
-			headroom = cs->hdr_len;
-	}
-
-	cs = ieee80211_cs_get(local, crypto->cipher_group, iftype);
-	if (cs && headroom < cs->hdr_len)
-		headroom = cs->hdr_len;
-
-	return headroom;
-}
-
 static bool
 ieee80211_extend_noa_desc(struct ieee80211_noa_data *data, u32 tsf, int i)
 {
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index 5fd8a3e8b5b4..93ec2f349748 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -3,7 +3,7 @@
  * Copyright 2002-2004, Instant802 Networks, Inc.
  * Copyright 2008, Jouni Malinen <j@w1.fi>
  * Copyright (C) 2016-2017 Intel Deutschland GmbH
- * Copyright (C) 2020-2021 Intel Corporation
+ * Copyright (C) 2020-2022 Intel Corporation
  */
 
 #include <linux/netdevice.h>
@@ -778,102 +778,6 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx)
 	return RX_CONTINUE;
 }
 
-static ieee80211_tx_result
-ieee80211_crypto_cs_encrypt(struct ieee80211_tx_data *tx,
-			    struct sk_buff *skb)
-{
-	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
-	struct ieee80211_key *key = tx->key;
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
-	int hdrlen;
-	u8 *pos, iv_len = key->conf.iv_len;
-
-	if (info->control.hw_key &&
-	    !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) {
-		/* hwaccel has no need for preallocated head room */
-		return TX_CONTINUE;
-	}
-
-	if (unlikely(skb_headroom(skb) < iv_len &&
-		     pskb_expand_head(skb, iv_len, 0, GFP_ATOMIC)))
-		return TX_DROP;
-
-	hdrlen = ieee80211_hdrlen(hdr->frame_control);
-
-	pos = skb_push(skb, iv_len);
-	memmove(pos, pos + iv_len, hdrlen);
-
-	return TX_CONTINUE;
-}
-
-static inline int ieee80211_crypto_cs_pn_compare(u8 *pn1, u8 *pn2, int len)
-{
-	int i;
-
-	/* pn is little endian */
-	for (i = len - 1; i >= 0; i--) {
-		if (pn1[i] < pn2[i])
-			return -1;
-		else if (pn1[i] > pn2[i])
-			return 1;
-	}
-
-	return 0;
-}
-
-static ieee80211_rx_result
-ieee80211_crypto_cs_decrypt(struct ieee80211_rx_data *rx)
-{
-	struct ieee80211_key *key = rx->key;
-	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
-	const struct ieee80211_cipher_scheme *cs = NULL;
-	int hdrlen = ieee80211_hdrlen(hdr->frame_control);
-	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
-	int data_len;
-	u8 *rx_pn;
-	u8 *skb_pn;
-	u8 qos_tid;
-
-	if (!rx->sta || !rx->sta->cipher_scheme ||
-	    !(status->flag & RX_FLAG_DECRYPTED))
-		return RX_DROP_UNUSABLE;
-
-	if (!ieee80211_is_data(hdr->frame_control))
-		return RX_CONTINUE;
-
-	cs = rx->sta->cipher_scheme;
-
-	data_len = rx->skb->len - hdrlen - cs->hdr_len;
-
-	if (data_len < 0)
-		return RX_DROP_UNUSABLE;
-
-	if (ieee80211_is_data_qos(hdr->frame_control))
-		qos_tid = ieee80211_get_tid(hdr);
-	else
-		qos_tid = 0;
-
-	if (skb_linearize(rx->skb))
-		return RX_DROP_UNUSABLE;
-
-	rx_pn = key->u.gen.rx_pn[qos_tid];
-	skb_pn = rx->skb->data + hdrlen + cs->pn_off;
-
-	if (ieee80211_crypto_cs_pn_compare(skb_pn, rx_pn, cs->pn_len) <= 0)
-		return RX_DROP_UNUSABLE;
-
-	memcpy(rx_pn, skb_pn, cs->pn_len);
-
-	/* remove security header and MIC */
-	if (pskb_trim(rx->skb, rx->skb->len - cs->mic_len))
-		return RX_DROP_UNUSABLE;
-
-	memmove(rx->skb->data + cs->hdr_len, rx->skb->data, hdrlen);
-	skb_pull(rx->skb, cs->hdr_len);
-
-	return RX_CONTINUE;
-}
-
 static void bip_aad(struct sk_buff *skb, u8 *aad)
 {
 	__le16 mask_fc;
@@ -1212,38 +1116,3 @@ ieee80211_crypto_aes_gmac_decrypt(struct ieee80211_rx_data *rx)
 
 	return RX_CONTINUE;
 }
-
-ieee80211_tx_result
-ieee80211_crypto_hw_encrypt(struct ieee80211_tx_data *tx)
-{
-	struct sk_buff *skb;
-	struct ieee80211_tx_info *info = NULL;
-	ieee80211_tx_result res;
-
-	skb_queue_walk(&tx->skbs, skb) {
-		info  = IEEE80211_SKB_CB(skb);
-
-		/* handle hw-only algorithm */
-		if (!info->control.hw_key)
-			return TX_DROP;
-
-		if (tx->key->flags & KEY_FLAG_CIPHER_SCHEME) {
-			res = ieee80211_crypto_cs_encrypt(tx, skb);
-			if (res != TX_CONTINUE)
-				return res;
-		}
-	}
-
-	ieee80211_tx_set_protected(tx);
-
-	return TX_CONTINUE;
-}
-
-ieee80211_rx_result
-ieee80211_crypto_hw_decrypt(struct ieee80211_rx_data *rx)
-{
-	if (rx->sta && rx->sta->cipher_scheme)
-		return ieee80211_crypto_cs_decrypt(rx);
-
-	return RX_DROP_UNUSABLE;
-}
diff --git a/net/mac80211/wpa.h b/net/mac80211/wpa.h
index af3272284e85..a9a81abb5479 100644
--- a/net/mac80211/wpa.h
+++ b/net/mac80211/wpa.h
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright 2002-2004, Instant802 Networks, Inc.
+ * Copyright (C) 2022 Intel Corporation
  */
 
 #ifndef WPA_H
@@ -39,10 +40,6 @@ ieee80211_tx_result
 ieee80211_crypto_aes_gmac_encrypt(struct ieee80211_tx_data *tx);
 ieee80211_rx_result
 ieee80211_crypto_aes_gmac_decrypt(struct ieee80211_rx_data *rx);
-ieee80211_tx_result
-ieee80211_crypto_hw_encrypt(struct ieee80211_tx_data *tx);
-ieee80211_rx_result
-ieee80211_crypto_hw_decrypt(struct ieee80211_rx_data *rx);
 
 ieee80211_tx_result
 ieee80211_crypto_gcmp_encrypt(struct ieee80211_tx_data *tx);
-- 
cgit 


From 35ba63b8f6d07d353159505423cfca5a4378a11c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 6 Jun 2022 10:41:05 +0200
Subject: vme: move back to staging

The VME subsystem graduated from staging into a top-level subsystem in
2012, with commit db3b9e990e75 ("Staging: VME: move VME drivers out of
staging") stating:

    The VME device drivers have not moved out yet due to some API
    questions they are still working through, that should happen soon,
    hopefully.

However, this never happened: maintenance of drivers/vme effectively
stopped in 2017, with all subsequent changes being treewide cleanups.
No hardware driver remains in staging, only the limited user-level
access, and I just removed one of the two bridge drivers and the only
remaining board.

drivers/staging/vme/devices/ was recently moved to
drivers/staging/vme_user/, but as the vme_user driver is the only one
remaining for this subsystem, it is easier to just move the remaining
three source files into this directory rather than keeping the original
hierarchy.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20220606084109.4108188-3-arnd@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/driver-api/vme.rst      |    4 +-
 MAINTAINERS                           |    4 +-
 drivers/Kconfig                       |    2 -
 drivers/Makefile                      |    1 -
 drivers/staging/vme_user/Kconfig      |   27 +
 drivers/staging/vme_user/Makefile     |    3 +
 drivers/staging/vme_user/vme.c        | 2015 +++++++++++++++++++++++++
 drivers/staging/vme_user/vme.h        |  190 +++
 drivers/staging/vme_user/vme_bridge.h |  190 +++
 drivers/staging/vme_user/vme_fake.c   | 1305 ++++++++++++++++
 drivers/staging/vme_user/vme_tsi148.c | 2661 +++++++++++++++++++++++++++++++++
 drivers/staging/vme_user/vme_tsi148.h | 1407 +++++++++++++++++
 drivers/staging/vme_user/vme_user.c   |    2 +-
 drivers/vme/Kconfig                   |   16 -
 drivers/vme/Makefile                  |    7 -
 drivers/vme/bridges/Kconfig           |   17 -
 drivers/vme/bridges/Makefile          |    3 -
 drivers/vme/bridges/vme_fake.c        | 1305 ----------------
 drivers/vme/bridges/vme_tsi148.c      | 2661 ---------------------------------
 drivers/vme/bridges/vme_tsi148.h      | 1407 -----------------
 drivers/vme/vme.c                     | 2015 -------------------------
 drivers/vme/vme_bridge.h              |  190 ---
 include/linux/vme.h                   |  190 ---
 23 files changed, 7802 insertions(+), 7820 deletions(-)
 create mode 100644 drivers/staging/vme_user/vme.c
 create mode 100644 drivers/staging/vme_user/vme.h
 create mode 100644 drivers/staging/vme_user/vme_bridge.h
 create mode 100644 drivers/staging/vme_user/vme_fake.c
 create mode 100644 drivers/staging/vme_user/vme_tsi148.c
 create mode 100644 drivers/staging/vme_user/vme_tsi148.h
 delete mode 100644 drivers/vme/Kconfig
 delete mode 100644 drivers/vme/Makefile
 delete mode 100644 drivers/vme/bridges/Kconfig
 delete mode 100644 drivers/vme/bridges/Makefile
 delete mode 100644 drivers/vme/bridges/vme_fake.c
 delete mode 100644 drivers/vme/bridges/vme_tsi148.c
 delete mode 100644 drivers/vme/bridges/vme_tsi148.h
 delete mode 100644 drivers/vme/vme.c
 delete mode 100644 drivers/vme/vme_bridge.h
 delete mode 100644 include/linux/vme.h

(limited to 'include')

diff --git a/Documentation/driver-api/vme.rst b/Documentation/driver-api/vme.rst
index def139c13410..c0b475369de0 100644
--- a/Documentation/driver-api/vme.rst
+++ b/Documentation/driver-api/vme.rst
@@ -290,8 +290,8 @@ The function :c:func:`vme_bus_num` returns the bus ID of the provided bridge.
 VME API
 -------
 
-.. kernel-doc:: include/linux/vme.h
+.. kernel-doc:: drivers/staging/vme_user/vme.h
    :internal:
 
-.. kernel-doc:: drivers/vme/vme.c
+.. kernel-doc:: drivers/staging/vme_user/vme.c
    :export:
diff --git a/MAINTAINERS b/MAINTAINERS
index 60e5e4007844..7097b824ba18 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -21230,12 +21230,10 @@ M:	Martyn Welch <martyn@welchs.me.uk>
 M:	Manohar Vanga <manohar.vanga@gmail.com>
 M:	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
 L:	linux-kernel@vger.kernel.org
-S:	Maintained
+S:	Odd fixes
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
 F:	Documentation/driver-api/vme.rst
 F:	drivers/staging/vme_user/
-F:	drivers/vme/
-F:	include/linux/vme*
 
 VM SOCKETS (AF_VSOCK)
 M:	Stefano Garzarella <sgarzare@redhat.com>
diff --git a/drivers/Kconfig b/drivers/Kconfig
index b6a172d32a7d..19ee995bd0ae 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -183,8 +183,6 @@ source "drivers/iio/Kconfig"
 
 source "drivers/ntb/Kconfig"
 
-source "drivers/vme/Kconfig"
-
 source "drivers/pwm/Kconfig"
 
 source "drivers/irqchip/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index 9a30842b22c5..dadf2678277f 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -165,7 +165,6 @@ obj-$(CONFIG_PM_DEVFREQ)	+= devfreq/
 obj-$(CONFIG_EXTCON)		+= extcon/
 obj-$(CONFIG_MEMORY)		+= memory/
 obj-$(CONFIG_IIO)		+= iio/
-obj-$(CONFIG_VME_BUS)		+= vme/
 obj-$(CONFIG_IPACK_BUS)		+= ipack/
 obj-$(CONFIG_NTB)		+= ntb/
 obj-$(CONFIG_POWERCAP)		+= powercap/
diff --git a/drivers/staging/vme_user/Kconfig b/drivers/staging/vme_user/Kconfig
index e8b4461bf27f..c8eabf8f40f1 100644
--- a/drivers/staging/vme_user/Kconfig
+++ b/drivers/staging/vme_user/Kconfig
@@ -1,4 +1,29 @@
 # SPDX-License-Identifier: GPL-2.0
+menuconfig VME_BUS
+	bool "VME bridge support"
+	depends on STAGING && PCI
+	help
+	  If you say Y here you get support for the VME bridge Framework.
+
+if VME_BUS
+
+comment "VME Bridge Drivers"
+
+config VME_TSI148
+	tristate "Tempe"
+	depends on HAS_DMA
+	help
+	 If you say Y here you get support for the Tundra TSI148 VME bridge
+	 chip.
+
+config VME_FAKE
+	tristate "Fake"
+	help
+	 If you say Y here you get support for the fake VME bridge. This
+	 provides a virtualised VME Bus for devices with no VME bridge. This
+	 is mainly useful for VME development (in the absence of VME
+	 hardware).
+
 comment "VME Device Drivers"
 
 config VME_USER
@@ -11,3 +36,5 @@ config VME_USER
 
 	  To compile this driver as a module, choose M here. The module will
 	  be called vme_user. If unsure, say N.
+
+endif
diff --git a/drivers/staging/vme_user/Makefile b/drivers/staging/vme_user/Makefile
index 5380115139b0..8dcc6938ce5c 100644
--- a/drivers/staging/vme_user/Makefile
+++ b/drivers/staging/vme_user/Makefile
@@ -3,4 +3,7 @@
 # Makefile for the VME device drivers.
 #
 
+obj-$(CONFIG_VME_BUS)		+= vme.o
 obj-$(CONFIG_VME_USER)		+= vme_user.o
+obj-$(CONFIG_VME_TSI148)	+= vme_tsi148.o
+obj-$(CONFIG_VME_FAKE)		+= vme_fake.o
diff --git a/drivers/staging/vme_user/vme.c b/drivers/staging/vme_user/vme.c
new file mode 100644
index 000000000000..b5555683a069
--- /dev/null
+++ b/drivers/staging/vme_user/vme.c
@@ -0,0 +1,2015 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * VME Bridge Framework
+ *
+ * Author: Martyn Welch <martyn.welch@ge.com>
+ * Copyright 2008 GE Intelligent Platforms Embedded Systems, Inc.
+ *
+ * Based on work by Tom Armistead and Ajit Prem
+ * Copyright 2004 Motorola Inc.
+ */
+
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/poll.h>
+#include <linux/highmem.h>
+#include <linux/interrupt.h>
+#include <linux/pagemap.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/syscalls.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+
+#include "vme.h"
+#include "vme_bridge.h"
+
+/* Bitmask and list of registered buses both protected by common mutex */
+static unsigned int vme_bus_numbers;
+static LIST_HEAD(vme_bus_list);
+static DEFINE_MUTEX(vme_buses_lock);
+
+static int __init vme_init(void);
+
+static struct vme_dev *dev_to_vme_dev(struct device *dev)
+{
+	return container_of(dev, struct vme_dev, dev);
+}
+
+/*
+ * Find the bridge that the resource is associated with.
+ */
+static struct vme_bridge *find_bridge(struct vme_resource *resource)
+{
+	/* Get list to search */
+	switch (resource->type) {
+	case VME_MASTER:
+		return list_entry(resource->entry, struct vme_master_resource,
+			list)->parent;
+	case VME_SLAVE:
+		return list_entry(resource->entry, struct vme_slave_resource,
+			list)->parent;
+	case VME_DMA:
+		return list_entry(resource->entry, struct vme_dma_resource,
+			list)->parent;
+	case VME_LM:
+		return list_entry(resource->entry, struct vme_lm_resource,
+			list)->parent;
+	default:
+		printk(KERN_ERR "Unknown resource type\n");
+		return NULL;
+	}
+}
+
+/**
+ * vme_alloc_consistent - Allocate contiguous memory.
+ * @resource: Pointer to VME resource.
+ * @size: Size of allocation required.
+ * @dma: Pointer to variable to store physical address of allocation.
+ *
+ * Allocate a contiguous block of memory for use by the driver. This is used to
+ * create the buffers for the slave windows.
+ *
+ * Return: Virtual address of allocation on success, NULL on failure.
+ */
+void *vme_alloc_consistent(struct vme_resource *resource, size_t size,
+	dma_addr_t *dma)
+{
+	struct vme_bridge *bridge;
+
+	if (!resource) {
+		printk(KERN_ERR "No resource\n");
+		return NULL;
+	}
+
+	bridge = find_bridge(resource);
+	if (!bridge) {
+		printk(KERN_ERR "Can't find bridge\n");
+		return NULL;
+	}
+
+	if (!bridge->parent) {
+		printk(KERN_ERR "Dev entry NULL for bridge %s\n", bridge->name);
+		return NULL;
+	}
+
+	if (!bridge->alloc_consistent) {
+		printk(KERN_ERR "alloc_consistent not supported by bridge %s\n",
+		       bridge->name);
+		return NULL;
+	}
+
+	return bridge->alloc_consistent(bridge->parent, size, dma);
+}
+EXPORT_SYMBOL(vme_alloc_consistent);
+
+/**
+ * vme_free_consistent - Free previously allocated memory.
+ * @resource: Pointer to VME resource.
+ * @size: Size of allocation to free.
+ * @vaddr: Virtual address of allocation.
+ * @dma: Physical address of allocation.
+ *
+ * Free previously allocated block of contiguous memory.
+ */
+void vme_free_consistent(struct vme_resource *resource, size_t size,
+	void *vaddr, dma_addr_t dma)
+{
+	struct vme_bridge *bridge;
+
+	if (!resource) {
+		printk(KERN_ERR "No resource\n");
+		return;
+	}
+
+	bridge = find_bridge(resource);
+	if (!bridge) {
+		printk(KERN_ERR "Can't find bridge\n");
+		return;
+	}
+
+	if (!bridge->parent) {
+		printk(KERN_ERR "Dev entry NULL for bridge %s\n", bridge->name);
+		return;
+	}
+
+	if (!bridge->free_consistent) {
+		printk(KERN_ERR "free_consistent not supported by bridge %s\n",
+		       bridge->name);
+		return;
+	}
+
+	bridge->free_consistent(bridge->parent, size, vaddr, dma);
+}
+EXPORT_SYMBOL(vme_free_consistent);
+
+/**
+ * vme_get_size - Helper function returning size of a VME window
+ * @resource: Pointer to VME slave or master resource.
+ *
+ * Determine the size of the VME window provided. This is a helper
+ * function, wrappering the call to vme_master_get or vme_slave_get
+ * depending on the type of window resource handed to it.
+ *
+ * Return: Size of the window on success, zero on failure.
+ */
+size_t vme_get_size(struct vme_resource *resource)
+{
+	int enabled, retval;
+	unsigned long long base, size;
+	dma_addr_t buf_base;
+	u32 aspace, cycle, dwidth;
+
+	switch (resource->type) {
+	case VME_MASTER:
+		retval = vme_master_get(resource, &enabled, &base, &size,
+			&aspace, &cycle, &dwidth);
+		if (retval)
+			return 0;
+
+		return size;
+	case VME_SLAVE:
+		retval = vme_slave_get(resource, &enabled, &base, &size,
+			&buf_base, &aspace, &cycle);
+		if (retval)
+			return 0;
+
+		return size;
+	case VME_DMA:
+		return 0;
+	default:
+		printk(KERN_ERR "Unknown resource type\n");
+		return 0;
+	}
+}
+EXPORT_SYMBOL(vme_get_size);
+
+int vme_check_window(u32 aspace, unsigned long long vme_base,
+		     unsigned long long size)
+{
+	int retval = 0;
+
+	if (vme_base + size < size)
+		return -EINVAL;
+
+	switch (aspace) {
+	case VME_A16:
+		if (vme_base + size > VME_A16_MAX)
+			retval = -EFAULT;
+		break;
+	case VME_A24:
+		if (vme_base + size > VME_A24_MAX)
+			retval = -EFAULT;
+		break;
+	case VME_A32:
+		if (vme_base + size > VME_A32_MAX)
+			retval = -EFAULT;
+		break;
+	case VME_A64:
+		/* The VME_A64_MAX limit is actually U64_MAX + 1 */
+		break;
+	case VME_CRCSR:
+		if (vme_base + size > VME_CRCSR_MAX)
+			retval = -EFAULT;
+		break;
+	case VME_USER1:
+	case VME_USER2:
+	case VME_USER3:
+	case VME_USER4:
+		/* User Defined */
+		break;
+	default:
+		printk(KERN_ERR "Invalid address space\n");
+		retval = -EINVAL;
+		break;
+	}
+
+	return retval;
+}
+EXPORT_SYMBOL(vme_check_window);
+
+static u32 vme_get_aspace(int am)
+{
+	switch (am) {
+	case 0x29:
+	case 0x2D:
+		return VME_A16;
+	case 0x38:
+	case 0x39:
+	case 0x3A:
+	case 0x3B:
+	case 0x3C:
+	case 0x3D:
+	case 0x3E:
+	case 0x3F:
+		return VME_A24;
+	case 0x8:
+	case 0x9:
+	case 0xA:
+	case 0xB:
+	case 0xC:
+	case 0xD:
+	case 0xE:
+	case 0xF:
+		return VME_A32;
+	case 0x0:
+	case 0x1:
+	case 0x3:
+		return VME_A64;
+	}
+
+	return 0;
+}
+
+/**
+ * vme_slave_request - Request a VME slave window resource.
+ * @vdev: Pointer to VME device struct vme_dev assigned to driver instance.
+ * @address: Required VME address space.
+ * @cycle: Required VME data transfer cycle type.
+ *
+ * Request use of a VME window resource capable of being set for the requested
+ * address space and data transfer cycle.
+ *
+ * Return: Pointer to VME resource on success, NULL on failure.
+ */
+struct vme_resource *vme_slave_request(struct vme_dev *vdev, u32 address,
+	u32 cycle)
+{
+	struct vme_bridge *bridge;
+	struct list_head *slave_pos = NULL;
+	struct vme_slave_resource *allocated_image = NULL;
+	struct vme_slave_resource *slave_image = NULL;
+	struct vme_resource *resource = NULL;
+
+	bridge = vdev->bridge;
+	if (!bridge) {
+		printk(KERN_ERR "Can't find VME bus\n");
+		goto err_bus;
+	}
+
+	/* Loop through slave resources */
+	list_for_each(slave_pos, &bridge->slave_resources) {
+		slave_image = list_entry(slave_pos,
+			struct vme_slave_resource, list);
+
+		if (!slave_image) {
+			printk(KERN_ERR "Registered NULL Slave resource\n");
+			continue;
+		}
+
+		/* Find an unlocked and compatible image */
+		mutex_lock(&slave_image->mtx);
+		if (((slave_image->address_attr & address) == address) &&
+			((slave_image->cycle_attr & cycle) == cycle) &&
+			(slave_image->locked == 0)) {
+
+			slave_image->locked = 1;
+			mutex_unlock(&slave_image->mtx);
+			allocated_image = slave_image;
+			break;
+		}
+		mutex_unlock(&slave_image->mtx);
+	}
+
+	/* No free image */
+	if (!allocated_image)
+		goto err_image;
+
+	resource = kmalloc(sizeof(*resource), GFP_KERNEL);
+	if (!resource)
+		goto err_alloc;
+
+	resource->type = VME_SLAVE;
+	resource->entry = &allocated_image->list;
+
+	return resource;
+
+err_alloc:
+	/* Unlock image */
+	mutex_lock(&slave_image->mtx);
+	slave_image->locked = 0;
+	mutex_unlock(&slave_image->mtx);
+err_image:
+err_bus:
+	return NULL;
+}
+EXPORT_SYMBOL(vme_slave_request);
+
+/**
+ * vme_slave_set - Set VME slave window configuration.
+ * @resource: Pointer to VME slave resource.
+ * @enabled: State to which the window should be configured.
+ * @vme_base: Base address for the window.
+ * @size: Size of the VME window.
+ * @buf_base: Based address of buffer used to provide VME slave window storage.
+ * @aspace: VME address space for the VME window.
+ * @cycle: VME data transfer cycle type for the VME window.
+ *
+ * Set configuration for provided VME slave window.
+ *
+ * Return: Zero on success, -EINVAL if operation is not supported on this
+ *         device, if an invalid resource has been provided or invalid
+ *         attributes are provided. Hardware specific errors may also be
+ *         returned.
+ */
+int vme_slave_set(struct vme_resource *resource, int enabled,
+	unsigned long long vme_base, unsigned long long size,
+	dma_addr_t buf_base, u32 aspace, u32 cycle)
+{
+	struct vme_bridge *bridge = find_bridge(resource);
+	struct vme_slave_resource *image;
+	int retval;
+
+	if (resource->type != VME_SLAVE) {
+		printk(KERN_ERR "Not a slave resource\n");
+		return -EINVAL;
+	}
+
+	image = list_entry(resource->entry, struct vme_slave_resource, list);
+
+	if (!bridge->slave_set) {
+		printk(KERN_ERR "Function not supported\n");
+		return -ENOSYS;
+	}
+
+	if (!(((image->address_attr & aspace) == aspace) &&
+		((image->cycle_attr & cycle) == cycle))) {
+		printk(KERN_ERR "Invalid attributes\n");
+		return -EINVAL;
+	}
+
+	retval = vme_check_window(aspace, vme_base, size);
+	if (retval)
+		return retval;
+
+	return bridge->slave_set(image, enabled, vme_base, size, buf_base,
+		aspace, cycle);
+}
+EXPORT_SYMBOL(vme_slave_set);
+
+/**
+ * vme_slave_get - Retrieve VME slave window configuration.
+ * @resource: Pointer to VME slave resource.
+ * @enabled: Pointer to variable for storing state.
+ * @vme_base: Pointer to variable for storing window base address.
+ * @size: Pointer to variable for storing window size.
+ * @buf_base: Pointer to variable for storing slave buffer base address.
+ * @aspace: Pointer to variable for storing VME address space.
+ * @cycle: Pointer to variable for storing VME data transfer cycle type.
+ *
+ * Return configuration for provided VME slave window.
+ *
+ * Return: Zero on success, -EINVAL if operation is not supported on this
+ *         device or if an invalid resource has been provided.
+ */
+int vme_slave_get(struct vme_resource *resource, int *enabled,
+	unsigned long long *vme_base, unsigned long long *size,
+	dma_addr_t *buf_base, u32 *aspace, u32 *cycle)
+{
+	struct vme_bridge *bridge = find_bridge(resource);
+	struct vme_slave_resource *image;
+
+	if (resource->type != VME_SLAVE) {
+		printk(KERN_ERR "Not a slave resource\n");
+		return -EINVAL;
+	}
+
+	image = list_entry(resource->entry, struct vme_slave_resource, list);
+
+	if (!bridge->slave_get) {
+		printk(KERN_ERR "vme_slave_get not supported\n");
+		return -EINVAL;
+	}
+
+	return bridge->slave_get(image, enabled, vme_base, size, buf_base,
+		aspace, cycle);
+}
+EXPORT_SYMBOL(vme_slave_get);
+
+/**
+ * vme_slave_free - Free VME slave window
+ * @resource: Pointer to VME slave resource.
+ *
+ * Free the provided slave resource so that it may be reallocated.
+ */
+void vme_slave_free(struct vme_resource *resource)
+{
+	struct vme_slave_resource *slave_image;
+
+	if (resource->type != VME_SLAVE) {
+		printk(KERN_ERR "Not a slave resource\n");
+		return;
+	}
+
+	slave_image = list_entry(resource->entry, struct vme_slave_resource,
+		list);
+	if (!slave_image) {
+		printk(KERN_ERR "Can't find slave resource\n");
+		return;
+	}
+
+	/* Unlock image */
+	mutex_lock(&slave_image->mtx);
+	if (slave_image->locked == 0)
+		printk(KERN_ERR "Image is already free\n");
+
+	slave_image->locked = 0;
+	mutex_unlock(&slave_image->mtx);
+
+	/* Free up resource memory */
+	kfree(resource);
+}
+EXPORT_SYMBOL(vme_slave_free);
+
+/**
+ * vme_master_request - Request a VME master window resource.
+ * @vdev: Pointer to VME device struct vme_dev assigned to driver instance.
+ * @address: Required VME address space.
+ * @cycle: Required VME data transfer cycle type.
+ * @dwidth: Required VME data transfer width.
+ *
+ * Request use of a VME window resource capable of being set for the requested
+ * address space, data transfer cycle and width.
+ *
+ * Return: Pointer to VME resource on success, NULL on failure.
+ */
+struct vme_resource *vme_master_request(struct vme_dev *vdev, u32 address,
+	u32 cycle, u32 dwidth)
+{
+	struct vme_bridge *bridge;
+	struct list_head *master_pos = NULL;
+	struct vme_master_resource *allocated_image = NULL;
+	struct vme_master_resource *master_image = NULL;
+	struct vme_resource *resource = NULL;
+
+	bridge = vdev->bridge;
+	if (!bridge) {
+		printk(KERN_ERR "Can't find VME bus\n");
+		goto err_bus;
+	}
+
+	/* Loop through master resources */
+	list_for_each(master_pos, &bridge->master_resources) {
+		master_image = list_entry(master_pos,
+			struct vme_master_resource, list);
+
+		if (!master_image) {
+			printk(KERN_WARNING "Registered NULL master resource\n");
+			continue;
+		}
+
+		/* Find an unlocked and compatible image */
+		spin_lock(&master_image->lock);
+		if (((master_image->address_attr & address) == address) &&
+			((master_image->cycle_attr & cycle) == cycle) &&
+			((master_image->width_attr & dwidth) == dwidth) &&
+			(master_image->locked == 0)) {
+
+			master_image->locked = 1;
+			spin_unlock(&master_image->lock);
+			allocated_image = master_image;
+			break;
+		}
+		spin_unlock(&master_image->lock);
+	}
+
+	/* Check to see if we found a resource */
+	if (!allocated_image) {
+		printk(KERN_ERR "Can't find a suitable resource\n");
+		goto err_image;
+	}
+
+	resource = kmalloc(sizeof(*resource), GFP_KERNEL);
+	if (!resource)
+		goto err_alloc;
+
+	resource->type = VME_MASTER;
+	resource->entry = &allocated_image->list;
+
+	return resource;
+
+err_alloc:
+	/* Unlock image */
+	spin_lock(&master_image->lock);
+	master_image->locked = 0;
+	spin_unlock(&master_image->lock);
+err_image:
+err_bus:
+	return NULL;
+}
+EXPORT_SYMBOL(vme_master_request);
+
+/**
+ * vme_master_set - Set VME master window configuration.
+ * @resource: Pointer to VME master resource.
+ * @enabled: State to which the window should be configured.
+ * @vme_base: Base address for the window.
+ * @size: Size of the VME window.
+ * @aspace: VME address space for the VME window.
+ * @cycle: VME data transfer cycle type for the VME window.
+ * @dwidth: VME data transfer width for the VME window.
+ *
+ * Set configuration for provided VME master window.
+ *
+ * Return: Zero on success, -EINVAL if operation is not supported on this
+ *         device, if an invalid resource has been provided or invalid
+ *         attributes are provided. Hardware specific errors may also be
+ *         returned.
+ */
+int vme_master_set(struct vme_resource *resource, int enabled,
+	unsigned long long vme_base, unsigned long long size, u32 aspace,
+	u32 cycle, u32 dwidth)
+{
+	struct vme_bridge *bridge = find_bridge(resource);
+	struct vme_master_resource *image;
+	int retval;
+
+	if (resource->type != VME_MASTER) {
+		printk(KERN_ERR "Not a master resource\n");
+		return -EINVAL;
+	}
+
+	image = list_entry(resource->entry, struct vme_master_resource, list);
+
+	if (!bridge->master_set) {
+		printk(KERN_WARNING "vme_master_set not supported\n");
+		return -EINVAL;
+	}
+
+	if (!(((image->address_attr & aspace) == aspace) &&
+		((image->cycle_attr & cycle) == cycle) &&
+		((image->width_attr & dwidth) == dwidth))) {
+		printk(KERN_WARNING "Invalid attributes\n");
+		return -EINVAL;
+	}
+
+	retval = vme_check_window(aspace, vme_base, size);
+	if (retval)
+		return retval;
+
+	return bridge->master_set(image, enabled, vme_base, size, aspace,
+		cycle, dwidth);
+}
+EXPORT_SYMBOL(vme_master_set);
+
+/**
+ * vme_master_get - Retrieve VME master window configuration.
+ * @resource: Pointer to VME master resource.
+ * @enabled: Pointer to variable for storing state.
+ * @vme_base: Pointer to variable for storing window base address.
+ * @size: Pointer to variable for storing window size.
+ * @aspace: Pointer to variable for storing VME address space.
+ * @cycle: Pointer to variable for storing VME data transfer cycle type.
+ * @dwidth: Pointer to variable for storing VME data transfer width.
+ *
+ * Return configuration for provided VME master window.
+ *
+ * Return: Zero on success, -EINVAL if operation is not supported on this
+ *         device or if an invalid resource has been provided.
+ */
+int vme_master_get(struct vme_resource *resource, int *enabled,
+	unsigned long long *vme_base, unsigned long long *size, u32 *aspace,
+	u32 *cycle, u32 *dwidth)
+{
+	struct vme_bridge *bridge = find_bridge(resource);
+	struct vme_master_resource *image;
+
+	if (resource->type != VME_MASTER) {
+		printk(KERN_ERR "Not a master resource\n");
+		return -EINVAL;
+	}
+
+	image = list_entry(resource->entry, struct vme_master_resource, list);
+
+	if (!bridge->master_get) {
+		printk(KERN_WARNING "%s not supported\n", __func__);
+		return -EINVAL;
+	}
+
+	return bridge->master_get(image, enabled, vme_base, size, aspace,
+		cycle, dwidth);
+}
+EXPORT_SYMBOL(vme_master_get);
+
+/**
+ * vme_master_read - Read data from VME space into a buffer.
+ * @resource: Pointer to VME master resource.
+ * @buf: Pointer to buffer where data should be transferred.
+ * @count: Number of bytes to transfer.
+ * @offset: Offset into VME master window at which to start transfer.
+ *
+ * Perform read of count bytes of data from location on VME bus which maps into
+ * the VME master window at offset to buf.
+ *
+ * Return: Number of bytes read, -EINVAL if resource is not a VME master
+ *         resource or read operation is not supported. -EFAULT returned if
+ *         invalid offset is provided. Hardware specific errors may also be
+ *         returned.
+ */
+ssize_t vme_master_read(struct vme_resource *resource, void *buf, size_t count,
+	loff_t offset)
+{
+	struct vme_bridge *bridge = find_bridge(resource);
+	struct vme_master_resource *image;
+	size_t length;
+
+	if (!bridge->master_read) {
+		printk(KERN_WARNING "Reading from resource not supported\n");
+		return -EINVAL;
+	}
+
+	if (resource->type != VME_MASTER) {
+		printk(KERN_ERR "Not a master resource\n");
+		return -EINVAL;
+	}
+
+	image = list_entry(resource->entry, struct vme_master_resource, list);
+
+	length = vme_get_size(resource);
+
+	if (offset > length) {
+		printk(KERN_WARNING "Invalid Offset\n");
+		return -EFAULT;
+	}
+
+	if ((offset + count) > length)
+		count = length - offset;
+
+	return bridge->master_read(image, buf, count, offset);
+
+}
+EXPORT_SYMBOL(vme_master_read);
+
+/**
+ * vme_master_write - Write data out to VME space from a buffer.
+ * @resource: Pointer to VME master resource.
+ * @buf: Pointer to buffer holding data to transfer.
+ * @count: Number of bytes to transfer.
+ * @offset: Offset into VME master window at which to start transfer.
+ *
+ * Perform write of count bytes of data from buf to location on VME bus which
+ * maps into the VME master window at offset.
+ *
+ * Return: Number of bytes written, -EINVAL if resource is not a VME master
+ *         resource or write operation is not supported. -EFAULT returned if
+ *         invalid offset is provided. Hardware specific errors may also be
+ *         returned.
+ */
+ssize_t vme_master_write(struct vme_resource *resource, void *buf,
+	size_t count, loff_t offset)
+{
+	struct vme_bridge *bridge = find_bridge(resource);
+	struct vme_master_resource *image;
+	size_t length;
+
+	if (!bridge->master_write) {
+		printk(KERN_WARNING "Writing to resource not supported\n");
+		return -EINVAL;
+	}
+
+	if (resource->type != VME_MASTER) {
+		printk(KERN_ERR "Not a master resource\n");
+		return -EINVAL;
+	}
+
+	image = list_entry(resource->entry, struct vme_master_resource, list);
+
+	length = vme_get_size(resource);
+
+	if (offset > length) {
+		printk(KERN_WARNING "Invalid Offset\n");
+		return -EFAULT;
+	}
+
+	if ((offset + count) > length)
+		count = length - offset;
+
+	return bridge->master_write(image, buf, count, offset);
+}
+EXPORT_SYMBOL(vme_master_write);
+
+/**
+ * vme_master_rmw - Perform read-modify-write cycle.
+ * @resource: Pointer to VME master resource.
+ * @mask: Bits to be compared and swapped in operation.
+ * @compare: Bits to be compared with data read from offset.
+ * @swap: Bits to be swapped in data read from offset.
+ * @offset: Offset into VME master window at which to perform operation.
+ *
+ * Perform read-modify-write cycle on provided location:
+ * - Location on VME bus is read.
+ * - Bits selected by mask are compared with compare.
+ * - Where a selected bit matches that in compare and are selected in swap,
+ * the bit is swapped.
+ * - Result written back to location on VME bus.
+ *
+ * Return: Bytes written on success, -EINVAL if resource is not a VME master
+ *         resource or RMW operation is not supported. Hardware specific
+ *         errors may also be returned.
+ */
+unsigned int vme_master_rmw(struct vme_resource *resource, unsigned int mask,
+	unsigned int compare, unsigned int swap, loff_t offset)
+{
+	struct vme_bridge *bridge = find_bridge(resource);
+	struct vme_master_resource *image;
+
+	if (!bridge->master_rmw) {
+		printk(KERN_WARNING "Writing to resource not supported\n");
+		return -EINVAL;
+	}
+
+	if (resource->type != VME_MASTER) {
+		printk(KERN_ERR "Not a master resource\n");
+		return -EINVAL;
+	}
+
+	image = list_entry(resource->entry, struct vme_master_resource, list);
+
+	return bridge->master_rmw(image, mask, compare, swap, offset);
+}
+EXPORT_SYMBOL(vme_master_rmw);
+
+/**
+ * vme_master_mmap - Mmap region of VME master window.
+ * @resource: Pointer to VME master resource.
+ * @vma: Pointer to definition of user mapping.
+ *
+ * Memory map a region of the VME master window into user space.
+ *
+ * Return: Zero on success, -EINVAL if resource is not a VME master
+ *         resource or -EFAULT if map exceeds window size. Other generic mmap
+ *         errors may also be returned.
+ */
+int vme_master_mmap(struct vme_resource *resource, struct vm_area_struct *vma)
+{
+	struct vme_master_resource *image;
+	phys_addr_t phys_addr;
+	unsigned long vma_size;
+
+	if (resource->type != VME_MASTER) {
+		pr_err("Not a master resource\n");
+		return -EINVAL;
+	}
+
+	image = list_entry(resource->entry, struct vme_master_resource, list);
+	phys_addr = image->bus_resource.start + (vma->vm_pgoff << PAGE_SHIFT);
+	vma_size = vma->vm_end - vma->vm_start;
+
+	if (phys_addr + vma_size > image->bus_resource.end + 1) {
+		pr_err("Map size cannot exceed the window size\n");
+		return -EFAULT;
+	}
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	return vm_iomap_memory(vma, phys_addr, vma->vm_end - vma->vm_start);
+}
+EXPORT_SYMBOL(vme_master_mmap);
+
+/**
+ * vme_master_free - Free VME master window
+ * @resource: Pointer to VME master resource.
+ *
+ * Free the provided master resource so that it may be reallocated.
+ */
+void vme_master_free(struct vme_resource *resource)
+{
+	struct vme_master_resource *master_image;
+
+	if (resource->type != VME_MASTER) {
+		printk(KERN_ERR "Not a master resource\n");
+		return;
+	}
+
+	master_image = list_entry(resource->entry, struct vme_master_resource,
+		list);
+	if (!master_image) {
+		printk(KERN_ERR "Can't find master resource\n");
+		return;
+	}
+
+	/* Unlock image */
+	spin_lock(&master_image->lock);
+	if (master_image->locked == 0)
+		printk(KERN_ERR "Image is already free\n");
+
+	master_image->locked = 0;
+	spin_unlock(&master_image->lock);
+
+	/* Free up resource memory */
+	kfree(resource);
+}
+EXPORT_SYMBOL(vme_master_free);
+
+/**
+ * vme_dma_request - Request a DMA controller.
+ * @vdev: Pointer to VME device struct vme_dev assigned to driver instance.
+ * @route: Required src/destination combination.
+ *
+ * Request a VME DMA controller with capability to perform transfers bewteen
+ * requested source/destination combination.
+ *
+ * Return: Pointer to VME DMA resource on success, NULL on failure.
+ */
+struct vme_resource *vme_dma_request(struct vme_dev *vdev, u32 route)
+{
+	struct vme_bridge *bridge;
+	struct list_head *dma_pos = NULL;
+	struct vme_dma_resource *allocated_ctrlr = NULL;
+	struct vme_dma_resource *dma_ctrlr = NULL;
+	struct vme_resource *resource = NULL;
+
+	/* XXX Not checking resource attributes */
+	printk(KERN_ERR "No VME resource Attribute tests done\n");
+
+	bridge = vdev->bridge;
+	if (!bridge) {
+		printk(KERN_ERR "Can't find VME bus\n");
+		goto err_bus;
+	}
+
+	/* Loop through DMA resources */
+	list_for_each(dma_pos, &bridge->dma_resources) {
+		dma_ctrlr = list_entry(dma_pos,
+			struct vme_dma_resource, list);
+		if (!dma_ctrlr) {
+			printk(KERN_ERR "Registered NULL DMA resource\n");
+			continue;
+		}
+
+		/* Find an unlocked and compatible controller */
+		mutex_lock(&dma_ctrlr->mtx);
+		if (((dma_ctrlr->route_attr & route) == route) &&
+			(dma_ctrlr->locked == 0)) {
+
+			dma_ctrlr->locked = 1;
+			mutex_unlock(&dma_ctrlr->mtx);
+			allocated_ctrlr = dma_ctrlr;
+			break;
+		}
+		mutex_unlock(&dma_ctrlr->mtx);
+	}
+
+	/* Check to see if we found a resource */
+	if (!allocated_ctrlr)
+		goto err_ctrlr;
+
+	resource = kmalloc(sizeof(*resource), GFP_KERNEL);
+	if (!resource)
+		goto err_alloc;
+
+	resource->type = VME_DMA;
+	resource->entry = &allocated_ctrlr->list;
+
+	return resource;
+
+err_alloc:
+	/* Unlock image */
+	mutex_lock(&dma_ctrlr->mtx);
+	dma_ctrlr->locked = 0;
+	mutex_unlock(&dma_ctrlr->mtx);
+err_ctrlr:
+err_bus:
+	return NULL;
+}
+EXPORT_SYMBOL(vme_dma_request);
+
+/**
+ * vme_new_dma_list - Create new VME DMA list.
+ * @resource: Pointer to VME DMA resource.
+ *
+ * Create a new VME DMA list. It is the responsibility of the user to free
+ * the list once it is no longer required with vme_dma_list_free().
+ *
+ * Return: Pointer to new VME DMA list, NULL on allocation failure or invalid
+ *         VME DMA resource.
+ */
+struct vme_dma_list *vme_new_dma_list(struct vme_resource *resource)
+{
+	struct vme_dma_list *dma_list;
+
+	if (resource->type != VME_DMA) {
+		printk(KERN_ERR "Not a DMA resource\n");
+		return NULL;
+	}
+
+	dma_list = kmalloc(sizeof(*dma_list), GFP_KERNEL);
+	if (!dma_list)
+		return NULL;
+
+	INIT_LIST_HEAD(&dma_list->entries);
+	dma_list->parent = list_entry(resource->entry,
+				      struct vme_dma_resource,
+				      list);
+	mutex_init(&dma_list->mtx);
+
+	return dma_list;
+}
+EXPORT_SYMBOL(vme_new_dma_list);
+
+/**
+ * vme_dma_pattern_attribute - Create "Pattern" type VME DMA list attribute.
+ * @pattern: Value to use used as pattern
+ * @type: Type of pattern to be written.
+ *
+ * Create VME DMA list attribute for pattern generation. It is the
+ * responsibility of the user to free used attributes using
+ * vme_dma_free_attribute().
+ *
+ * Return: Pointer to VME DMA attribute, NULL on failure.
+ */
+struct vme_dma_attr *vme_dma_pattern_attribute(u32 pattern, u32 type)
+{
+	struct vme_dma_attr *attributes;
+	struct vme_dma_pattern *pattern_attr;
+
+	attributes = kmalloc(sizeof(*attributes), GFP_KERNEL);
+	if (!attributes)
+		goto err_attr;
+
+	pattern_attr = kmalloc(sizeof(*pattern_attr), GFP_KERNEL);
+	if (!pattern_attr)
+		goto err_pat;
+
+	attributes->type = VME_DMA_PATTERN;
+	attributes->private = (void *)pattern_attr;
+
+	pattern_attr->pattern = pattern;
+	pattern_attr->type = type;
+
+	return attributes;
+
+err_pat:
+	kfree(attributes);
+err_attr:
+	return NULL;
+}
+EXPORT_SYMBOL(vme_dma_pattern_attribute);
+
+/**
+ * vme_dma_pci_attribute - Create "PCI" type VME DMA list attribute.
+ * @address: PCI base address for DMA transfer.
+ *
+ * Create VME DMA list attribute pointing to a location on PCI for DMA
+ * transfers. It is the responsibility of the user to free used attributes
+ * using vme_dma_free_attribute().
+ *
+ * Return: Pointer to VME DMA attribute, NULL on failure.
+ */
+struct vme_dma_attr *vme_dma_pci_attribute(dma_addr_t address)
+{
+	struct vme_dma_attr *attributes;
+	struct vme_dma_pci *pci_attr;
+
+	/* XXX Run some sanity checks here */
+
+	attributes = kmalloc(sizeof(*attributes), GFP_KERNEL);
+	if (!attributes)
+		goto err_attr;
+
+	pci_attr = kmalloc(sizeof(*pci_attr), GFP_KERNEL);
+	if (!pci_attr)
+		goto err_pci;
+
+	attributes->type = VME_DMA_PCI;
+	attributes->private = (void *)pci_attr;
+
+	pci_attr->address = address;
+
+	return attributes;
+
+err_pci:
+	kfree(attributes);
+err_attr:
+	return NULL;
+}
+EXPORT_SYMBOL(vme_dma_pci_attribute);
+
+/**
+ * vme_dma_vme_attribute - Create "VME" type VME DMA list attribute.
+ * @address: VME base address for DMA transfer.
+ * @aspace: VME address space to use for DMA transfer.
+ * @cycle: VME bus cycle to use for DMA transfer.
+ * @dwidth: VME data width to use for DMA transfer.
+ *
+ * Create VME DMA list attribute pointing to a location on the VME bus for DMA
+ * transfers. It is the responsibility of the user to free used attributes
+ * using vme_dma_free_attribute().
+ *
+ * Return: Pointer to VME DMA attribute, NULL on failure.
+ */
+struct vme_dma_attr *vme_dma_vme_attribute(unsigned long long address,
+	u32 aspace, u32 cycle, u32 dwidth)
+{
+	struct vme_dma_attr *attributes;
+	struct vme_dma_vme *vme_attr;
+
+	attributes = kmalloc(sizeof(*attributes), GFP_KERNEL);
+	if (!attributes)
+		goto err_attr;
+
+	vme_attr = kmalloc(sizeof(*vme_attr), GFP_KERNEL);
+	if (!vme_attr)
+		goto err_vme;
+
+	attributes->type = VME_DMA_VME;
+	attributes->private = (void *)vme_attr;
+
+	vme_attr->address = address;
+	vme_attr->aspace = aspace;
+	vme_attr->cycle = cycle;
+	vme_attr->dwidth = dwidth;
+
+	return attributes;
+
+err_vme:
+	kfree(attributes);
+err_attr:
+	return NULL;
+}
+EXPORT_SYMBOL(vme_dma_vme_attribute);
+
+/**
+ * vme_dma_free_attribute - Free DMA list attribute.
+ * @attributes: Pointer to DMA list attribute.
+ *
+ * Free VME DMA list attribute. VME DMA list attributes can be safely freed
+ * once vme_dma_list_add() has returned.
+ */
+void vme_dma_free_attribute(struct vme_dma_attr *attributes)
+{
+	kfree(attributes->private);
+	kfree(attributes);
+}
+EXPORT_SYMBOL(vme_dma_free_attribute);
+
+/**
+ * vme_dma_list_add - Add enty to a VME DMA list.
+ * @list: Pointer to VME list.
+ * @src: Pointer to DMA list attribute to use as source.
+ * @dest: Pointer to DMA list attribute to use as destination.
+ * @count: Number of bytes to transfer.
+ *
+ * Add an entry to the provided VME DMA list. Entry requires pointers to source
+ * and destination DMA attributes and a count.
+ *
+ * Please note, the attributes supported as source and destinations for
+ * transfers are hardware dependent.
+ *
+ * Return: Zero on success, -EINVAL if operation is not supported on this
+ *         device or if the link list has already been submitted for execution.
+ *         Hardware specific errors also possible.
+ */
+int vme_dma_list_add(struct vme_dma_list *list, struct vme_dma_attr *src,
+	struct vme_dma_attr *dest, size_t count)
+{
+	struct vme_bridge *bridge = list->parent->parent;
+	int retval;
+
+	if (!bridge->dma_list_add) {
+		printk(KERN_WARNING "Link List DMA generation not supported\n");
+		return -EINVAL;
+	}
+
+	if (!mutex_trylock(&list->mtx)) {
+		printk(KERN_ERR "Link List already submitted\n");
+		return -EINVAL;
+	}
+
+	retval = bridge->dma_list_add(list, src, dest, count);
+
+	mutex_unlock(&list->mtx);
+
+	return retval;
+}
+EXPORT_SYMBOL(vme_dma_list_add);
+
+/**
+ * vme_dma_list_exec - Queue a VME DMA list for execution.
+ * @list: Pointer to VME list.
+ *
+ * Queue the provided VME DMA list for execution. The call will return once the
+ * list has been executed.
+ *
+ * Return: Zero on success, -EINVAL if operation is not supported on this
+ *         device. Hardware specific errors also possible.
+ */
+int vme_dma_list_exec(struct vme_dma_list *list)
+{
+	struct vme_bridge *bridge = list->parent->parent;
+	int retval;
+
+	if (!bridge->dma_list_exec) {
+		printk(KERN_ERR "Link List DMA execution not supported\n");
+		return -EINVAL;
+	}
+
+	mutex_lock(&list->mtx);
+
+	retval = bridge->dma_list_exec(list);
+
+	mutex_unlock(&list->mtx);
+
+	return retval;
+}
+EXPORT_SYMBOL(vme_dma_list_exec);
+
+/**
+ * vme_dma_list_free - Free a VME DMA list.
+ * @list: Pointer to VME list.
+ *
+ * Free the provided DMA list and all its entries.
+ *
+ * Return: Zero on success, -EINVAL on invalid VME resource, -EBUSY if resource
+ *         is still in use. Hardware specific errors also possible.
+ */
+int vme_dma_list_free(struct vme_dma_list *list)
+{
+	struct vme_bridge *bridge = list->parent->parent;
+	int retval;
+
+	if (!bridge->dma_list_empty) {
+		printk(KERN_WARNING "Emptying of Link Lists not supported\n");
+		return -EINVAL;
+	}
+
+	if (!mutex_trylock(&list->mtx)) {
+		printk(KERN_ERR "Link List in use\n");
+		return -EBUSY;
+	}
+
+	/*
+	 * Empty out all of the entries from the DMA list. We need to go to the
+	 * low level driver as DMA entries are driver specific.
+	 */
+	retval = bridge->dma_list_empty(list);
+	if (retval) {
+		printk(KERN_ERR "Unable to empty link-list entries\n");
+		mutex_unlock(&list->mtx);
+		return retval;
+	}
+	mutex_unlock(&list->mtx);
+	kfree(list);
+
+	return retval;
+}
+EXPORT_SYMBOL(vme_dma_list_free);
+
+/**
+ * vme_dma_free - Free a VME DMA resource.
+ * @resource: Pointer to VME DMA resource.
+ *
+ * Free the provided DMA resource so that it may be reallocated.
+ *
+ * Return: Zero on success, -EINVAL on invalid VME resource, -EBUSY if resource
+ *         is still active.
+ */
+int vme_dma_free(struct vme_resource *resource)
+{
+	struct vme_dma_resource *ctrlr;
+
+	if (resource->type != VME_DMA) {
+		printk(KERN_ERR "Not a DMA resource\n");
+		return -EINVAL;
+	}
+
+	ctrlr = list_entry(resource->entry, struct vme_dma_resource, list);
+
+	if (!mutex_trylock(&ctrlr->mtx)) {
+		printk(KERN_ERR "Resource busy, can't free\n");
+		return -EBUSY;
+	}
+
+	if (!(list_empty(&ctrlr->pending) && list_empty(&ctrlr->running))) {
+		printk(KERN_WARNING "Resource still processing transfers\n");
+		mutex_unlock(&ctrlr->mtx);
+		return -EBUSY;
+	}
+
+	ctrlr->locked = 0;
+
+	mutex_unlock(&ctrlr->mtx);
+
+	kfree(resource);
+
+	return 0;
+}
+EXPORT_SYMBOL(vme_dma_free);
+
+void vme_bus_error_handler(struct vme_bridge *bridge,
+			   unsigned long long address, int am)
+{
+	struct list_head *handler_pos = NULL;
+	struct vme_error_handler *handler;
+	int handler_triggered = 0;
+	u32 aspace = vme_get_aspace(am);
+
+	list_for_each(handler_pos, &bridge->vme_error_handlers) {
+		handler = list_entry(handler_pos, struct vme_error_handler,
+				     list);
+		if ((aspace == handler->aspace) &&
+		    (address >= handler->start) &&
+		    (address < handler->end)) {
+			if (!handler->num_errors)
+				handler->first_error = address;
+			if (handler->num_errors != UINT_MAX)
+				handler->num_errors++;
+			handler_triggered = 1;
+		}
+	}
+
+	if (!handler_triggered)
+		dev_err(bridge->parent,
+			"Unhandled VME access error at address 0x%llx\n",
+			address);
+}
+EXPORT_SYMBOL(vme_bus_error_handler);
+
+struct vme_error_handler *vme_register_error_handler(
+	struct vme_bridge *bridge, u32 aspace,
+	unsigned long long address, size_t len)
+{
+	struct vme_error_handler *handler;
+
+	handler = kmalloc(sizeof(*handler), GFP_ATOMIC);
+	if (!handler)
+		return NULL;
+
+	handler->aspace = aspace;
+	handler->start = address;
+	handler->end = address + len;
+	handler->num_errors = 0;
+	handler->first_error = 0;
+	list_add_tail(&handler->list, &bridge->vme_error_handlers);
+
+	return handler;
+}
+EXPORT_SYMBOL(vme_register_error_handler);
+
+void vme_unregister_error_handler(struct vme_error_handler *handler)
+{
+	list_del(&handler->list);
+	kfree(handler);
+}
+EXPORT_SYMBOL(vme_unregister_error_handler);
+
+void vme_irq_handler(struct vme_bridge *bridge, int level, int statid)
+{
+	void (*call)(int, int, void *);
+	void *priv_data;
+
+	call = bridge->irq[level - 1].callback[statid].func;
+	priv_data = bridge->irq[level - 1].callback[statid].priv_data;
+	if (call)
+		call(level, statid, priv_data);
+	else
+		printk(KERN_WARNING "Spurious VME interrupt, level:%x, vector:%x\n",
+		       level, statid);
+}
+EXPORT_SYMBOL(vme_irq_handler);
+
+/**
+ * vme_irq_request - Request a specific VME interrupt.
+ * @vdev: Pointer to VME device struct vme_dev assigned to driver instance.
+ * @level: Interrupt priority being requested.
+ * @statid: Interrupt vector being requested.
+ * @callback: Pointer to callback function called when VME interrupt/vector
+ *            received.
+ * @priv_data: Generic pointer that will be passed to the callback function.
+ *
+ * Request callback to be attached as a handler for VME interrupts with provided
+ * level and statid.
+ *
+ * Return: Zero on success, -EINVAL on invalid vme device, level or if the
+ *         function is not supported, -EBUSY if the level/statid combination is
+ *         already in use. Hardware specific errors also possible.
+ */
+int vme_irq_request(struct vme_dev *vdev, int level, int statid,
+	void (*callback)(int, int, void *),
+	void *priv_data)
+{
+	struct vme_bridge *bridge;
+
+	bridge = vdev->bridge;
+	if (!bridge) {
+		printk(KERN_ERR "Can't find VME bus\n");
+		return -EINVAL;
+	}
+
+	if ((level < 1) || (level > 7)) {
+		printk(KERN_ERR "Invalid interrupt level\n");
+		return -EINVAL;
+	}
+
+	if (!bridge->irq_set) {
+		printk(KERN_ERR "Configuring interrupts not supported\n");
+		return -EINVAL;
+	}
+
+	mutex_lock(&bridge->irq_mtx);
+
+	if (bridge->irq[level - 1].callback[statid].func) {
+		mutex_unlock(&bridge->irq_mtx);
+		printk(KERN_WARNING "VME Interrupt already taken\n");
+		return -EBUSY;
+	}
+
+	bridge->irq[level - 1].count++;
+	bridge->irq[level - 1].callback[statid].priv_data = priv_data;
+	bridge->irq[level - 1].callback[statid].func = callback;
+
+	/* Enable IRQ level */
+	bridge->irq_set(bridge, level, 1, 1);
+
+	mutex_unlock(&bridge->irq_mtx);
+
+	return 0;
+}
+EXPORT_SYMBOL(vme_irq_request);
+
+/**
+ * vme_irq_free - Free a VME interrupt.
+ * @vdev: Pointer to VME device struct vme_dev assigned to driver instance.
+ * @level: Interrupt priority of interrupt being freed.
+ * @statid: Interrupt vector of interrupt being freed.
+ *
+ * Remove previously attached callback from VME interrupt priority/vector.
+ */
+void vme_irq_free(struct vme_dev *vdev, int level, int statid)
+{
+	struct vme_bridge *bridge;
+
+	bridge = vdev->bridge;
+	if (!bridge) {
+		printk(KERN_ERR "Can't find VME bus\n");
+		return;
+	}
+
+	if ((level < 1) || (level > 7)) {
+		printk(KERN_ERR "Invalid interrupt level\n");
+		return;
+	}
+
+	if (!bridge->irq_set) {
+		printk(KERN_ERR "Configuring interrupts not supported\n");
+		return;
+	}
+
+	mutex_lock(&bridge->irq_mtx);
+
+	bridge->irq[level - 1].count--;
+
+	/* Disable IRQ level if no more interrupts attached at this level*/
+	if (bridge->irq[level - 1].count == 0)
+		bridge->irq_set(bridge, level, 0, 1);
+
+	bridge->irq[level - 1].callback[statid].func = NULL;
+	bridge->irq[level - 1].callback[statid].priv_data = NULL;
+
+	mutex_unlock(&bridge->irq_mtx);
+}
+EXPORT_SYMBOL(vme_irq_free);
+
+/**
+ * vme_irq_generate - Generate VME interrupt.
+ * @vdev: Pointer to VME device struct vme_dev assigned to driver instance.
+ * @level: Interrupt priority at which to assert the interrupt.
+ * @statid: Interrupt vector to associate with the interrupt.
+ *
+ * Generate a VME interrupt of the provided level and with the provided
+ * statid.
+ *
+ * Return: Zero on success, -EINVAL on invalid vme device, level or if the
+ *         function is not supported. Hardware specific errors also possible.
+ */
+int vme_irq_generate(struct vme_dev *vdev, int level, int statid)
+{
+	struct vme_bridge *bridge;
+
+	bridge = vdev->bridge;
+	if (!bridge) {
+		printk(KERN_ERR "Can't find VME bus\n");
+		return -EINVAL;
+	}
+
+	if ((level < 1) || (level > 7)) {
+		printk(KERN_WARNING "Invalid interrupt level\n");
+		return -EINVAL;
+	}
+
+	if (!bridge->irq_generate) {
+		printk(KERN_WARNING "Interrupt generation not supported\n");
+		return -EINVAL;
+	}
+
+	return bridge->irq_generate(bridge, level, statid);
+}
+EXPORT_SYMBOL(vme_irq_generate);
+
+/**
+ * vme_lm_request - Request a VME location monitor
+ * @vdev: Pointer to VME device struct vme_dev assigned to driver instance.
+ *
+ * Allocate a location monitor resource to the driver. A location monitor
+ * allows the driver to monitor accesses to a contiguous number of
+ * addresses on the VME bus.
+ *
+ * Return: Pointer to a VME resource on success or NULL on failure.
+ */
+struct vme_resource *vme_lm_request(struct vme_dev *vdev)
+{
+	struct vme_bridge *bridge;
+	struct list_head *lm_pos = NULL;
+	struct vme_lm_resource *allocated_lm = NULL;
+	struct vme_lm_resource *lm = NULL;
+	struct vme_resource *resource = NULL;
+
+	bridge = vdev->bridge;
+	if (!bridge) {
+		printk(KERN_ERR "Can't find VME bus\n");
+		goto err_bus;
+	}
+
+	/* Loop through LM resources */
+	list_for_each(lm_pos, &bridge->lm_resources) {
+		lm = list_entry(lm_pos,
+			struct vme_lm_resource, list);
+		if (!lm) {
+			printk(KERN_ERR "Registered NULL Location Monitor resource\n");
+			continue;
+		}
+
+		/* Find an unlocked controller */
+		mutex_lock(&lm->mtx);
+		if (lm->locked == 0) {
+			lm->locked = 1;
+			mutex_unlock(&lm->mtx);
+			allocated_lm = lm;
+			break;
+		}
+		mutex_unlock(&lm->mtx);
+	}
+
+	/* Check to see if we found a resource */
+	if (!allocated_lm)
+		goto err_lm;
+
+	resource = kmalloc(sizeof(*resource), GFP_KERNEL);
+	if (!resource)
+		goto err_alloc;
+
+	resource->type = VME_LM;
+	resource->entry = &allocated_lm->list;
+
+	return resource;
+
+err_alloc:
+	/* Unlock image */
+	mutex_lock(&lm->mtx);
+	lm->locked = 0;
+	mutex_unlock(&lm->mtx);
+err_lm:
+err_bus:
+	return NULL;
+}
+EXPORT_SYMBOL(vme_lm_request);
+
+/**
+ * vme_lm_count - Determine number of VME Addresses monitored
+ * @resource: Pointer to VME location monitor resource.
+ *
+ * The number of contiguous addresses monitored is hardware dependent.
+ * Return the number of contiguous addresses monitored by the
+ * location monitor.
+ *
+ * Return: Count of addresses monitored or -EINVAL when provided with an
+ *	   invalid location monitor resource.
+ */
+int vme_lm_count(struct vme_resource *resource)
+{
+	struct vme_lm_resource *lm;
+
+	if (resource->type != VME_LM) {
+		printk(KERN_ERR "Not a Location Monitor resource\n");
+		return -EINVAL;
+	}
+
+	lm = list_entry(resource->entry, struct vme_lm_resource, list);
+
+	return lm->monitors;
+}
+EXPORT_SYMBOL(vme_lm_count);
+
+/**
+ * vme_lm_set - Configure location monitor
+ * @resource: Pointer to VME location monitor resource.
+ * @lm_base: Base address to monitor.
+ * @aspace: VME address space to monitor.
+ * @cycle: VME bus cycle type to monitor.
+ *
+ * Set the base address, address space and cycle type of accesses to be
+ * monitored by the location monitor.
+ *
+ * Return: Zero on success, -EINVAL when provided with an invalid location
+ *	   monitor resource or function is not supported. Hardware specific
+ *	   errors may also be returned.
+ */
+int vme_lm_set(struct vme_resource *resource, unsigned long long lm_base,
+	u32 aspace, u32 cycle)
+{
+	struct vme_bridge *bridge = find_bridge(resource);
+	struct vme_lm_resource *lm;
+
+	if (resource->type != VME_LM) {
+		printk(KERN_ERR "Not a Location Monitor resource\n");
+		return -EINVAL;
+	}
+
+	lm = list_entry(resource->entry, struct vme_lm_resource, list);
+
+	if (!bridge->lm_set) {
+		printk(KERN_ERR "vme_lm_set not supported\n");
+		return -EINVAL;
+	}
+
+	return bridge->lm_set(lm, lm_base, aspace, cycle);
+}
+EXPORT_SYMBOL(vme_lm_set);
+
+/**
+ * vme_lm_get - Retrieve location monitor settings
+ * @resource: Pointer to VME location monitor resource.
+ * @lm_base: Pointer used to output the base address monitored.
+ * @aspace: Pointer used to output the address space monitored.
+ * @cycle: Pointer used to output the VME bus cycle type monitored.
+ *
+ * Retrieve the base address, address space and cycle type of accesses to
+ * be monitored by the location monitor.
+ *
+ * Return: Zero on success, -EINVAL when provided with an invalid location
+ *	   monitor resource or function is not supported. Hardware specific
+ *	   errors may also be returned.
+ */
+int vme_lm_get(struct vme_resource *resource, unsigned long long *lm_base,
+	u32 *aspace, u32 *cycle)
+{
+	struct vme_bridge *bridge = find_bridge(resource);
+	struct vme_lm_resource *lm;
+
+	if (resource->type != VME_LM) {
+		printk(KERN_ERR "Not a Location Monitor resource\n");
+		return -EINVAL;
+	}
+
+	lm = list_entry(resource->entry, struct vme_lm_resource, list);
+
+	if (!bridge->lm_get) {
+		printk(KERN_ERR "vme_lm_get not supported\n");
+		return -EINVAL;
+	}
+
+	return bridge->lm_get(lm, lm_base, aspace, cycle);
+}
+EXPORT_SYMBOL(vme_lm_get);
+
+/**
+ * vme_lm_attach - Provide callback for location monitor address
+ * @resource: Pointer to VME location monitor resource.
+ * @monitor: Offset to which callback should be attached.
+ * @callback: Pointer to callback function called when triggered.
+ * @data: Generic pointer that will be passed to the callback function.
+ *
+ * Attach a callback to the specificed offset into the location monitors
+ * monitored addresses. A generic pointer is provided to allow data to be
+ * passed to the callback when called.
+ *
+ * Return: Zero on success, -EINVAL when provided with an invalid location
+ *	   monitor resource or function is not supported. Hardware specific
+ *	   errors may also be returned.
+ */
+int vme_lm_attach(struct vme_resource *resource, int monitor,
+	void (*callback)(void *), void *data)
+{
+	struct vme_bridge *bridge = find_bridge(resource);
+	struct vme_lm_resource *lm;
+
+	if (resource->type != VME_LM) {
+		printk(KERN_ERR "Not a Location Monitor resource\n");
+		return -EINVAL;
+	}
+
+	lm = list_entry(resource->entry, struct vme_lm_resource, list);
+
+	if (!bridge->lm_attach) {
+		printk(KERN_ERR "vme_lm_attach not supported\n");
+		return -EINVAL;
+	}
+
+	return bridge->lm_attach(lm, monitor, callback, data);
+}
+EXPORT_SYMBOL(vme_lm_attach);
+
+/**
+ * vme_lm_detach - Remove callback for location monitor address
+ * @resource: Pointer to VME location monitor resource.
+ * @monitor: Offset to which callback should be removed.
+ *
+ * Remove the callback associated with the specificed offset into the
+ * location monitors monitored addresses.
+ *
+ * Return: Zero on success, -EINVAL when provided with an invalid location
+ *	   monitor resource or function is not supported. Hardware specific
+ *	   errors may also be returned.
+ */
+int vme_lm_detach(struct vme_resource *resource, int monitor)
+{
+	struct vme_bridge *bridge = find_bridge(resource);
+	struct vme_lm_resource *lm;
+
+	if (resource->type != VME_LM) {
+		printk(KERN_ERR "Not a Location Monitor resource\n");
+		return -EINVAL;
+	}
+
+	lm = list_entry(resource->entry, struct vme_lm_resource, list);
+
+	if (!bridge->lm_detach) {
+		printk(KERN_ERR "vme_lm_detach not supported\n");
+		return -EINVAL;
+	}
+
+	return bridge->lm_detach(lm, monitor);
+}
+EXPORT_SYMBOL(vme_lm_detach);
+
+/**
+ * vme_lm_free - Free allocated VME location monitor
+ * @resource: Pointer to VME location monitor resource.
+ *
+ * Free allocation of a VME location monitor.
+ *
+ * WARNING: This function currently expects that any callbacks that have
+ *          been attached to the location monitor have been removed.
+ *
+ * Return: Zero on success, -EINVAL when provided with an invalid location
+ *	   monitor resource.
+ */
+void vme_lm_free(struct vme_resource *resource)
+{
+	struct vme_lm_resource *lm;
+
+	if (resource->type != VME_LM) {
+		printk(KERN_ERR "Not a Location Monitor resource\n");
+		return;
+	}
+
+	lm = list_entry(resource->entry, struct vme_lm_resource, list);
+
+	mutex_lock(&lm->mtx);
+
+	/* XXX
+	 * Check to see that there aren't any callbacks still attached, if
+	 * there are we should probably be detaching them!
+	 */
+
+	lm->locked = 0;
+
+	mutex_unlock(&lm->mtx);
+
+	kfree(resource);
+}
+EXPORT_SYMBOL(vme_lm_free);
+
+/**
+ * vme_slot_num - Retrieve slot ID
+ * @vdev: Pointer to VME device struct vme_dev assigned to driver instance.
+ *
+ * Retrieve the slot ID associated with the provided VME device.
+ *
+ * Return: The slot ID on success, -EINVAL if VME bridge cannot be determined
+ *         or the function is not supported. Hardware specific errors may also
+ *         be returned.
+ */
+int vme_slot_num(struct vme_dev *vdev)
+{
+	struct vme_bridge *bridge;
+
+	bridge = vdev->bridge;
+	if (!bridge) {
+		printk(KERN_ERR "Can't find VME bus\n");
+		return -EINVAL;
+	}
+
+	if (!bridge->slot_get) {
+		printk(KERN_WARNING "vme_slot_num not supported\n");
+		return -EINVAL;
+	}
+
+	return bridge->slot_get(bridge);
+}
+EXPORT_SYMBOL(vme_slot_num);
+
+/**
+ * vme_bus_num - Retrieve bus number
+ * @vdev: Pointer to VME device struct vme_dev assigned to driver instance.
+ *
+ * Retrieve the bus enumeration associated with the provided VME device.
+ *
+ * Return: The bus number on success, -EINVAL if VME bridge cannot be
+ *         determined.
+ */
+int vme_bus_num(struct vme_dev *vdev)
+{
+	struct vme_bridge *bridge;
+
+	bridge = vdev->bridge;
+	if (!bridge) {
+		pr_err("Can't find VME bus\n");
+		return -EINVAL;
+	}
+
+	return bridge->num;
+}
+EXPORT_SYMBOL(vme_bus_num);
+
+/* - Bridge Registration --------------------------------------------------- */
+
+static void vme_dev_release(struct device *dev)
+{
+	kfree(dev_to_vme_dev(dev));
+}
+
+/* Common bridge initialization */
+struct vme_bridge *vme_init_bridge(struct vme_bridge *bridge)
+{
+	INIT_LIST_HEAD(&bridge->vme_error_handlers);
+	INIT_LIST_HEAD(&bridge->master_resources);
+	INIT_LIST_HEAD(&bridge->slave_resources);
+	INIT_LIST_HEAD(&bridge->dma_resources);
+	INIT_LIST_HEAD(&bridge->lm_resources);
+	mutex_init(&bridge->irq_mtx);
+
+	return bridge;
+}
+EXPORT_SYMBOL(vme_init_bridge);
+
+int vme_register_bridge(struct vme_bridge *bridge)
+{
+	int i;
+	int ret = -1;
+
+	mutex_lock(&vme_buses_lock);
+	for (i = 0; i < sizeof(vme_bus_numbers) * 8; i++) {
+		if ((vme_bus_numbers & (1 << i)) == 0) {
+			vme_bus_numbers |= (1 << i);
+			bridge->num = i;
+			INIT_LIST_HEAD(&bridge->devices);
+			list_add_tail(&bridge->bus_list, &vme_bus_list);
+			ret = 0;
+			break;
+		}
+	}
+	mutex_unlock(&vme_buses_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(vme_register_bridge);
+
+void vme_unregister_bridge(struct vme_bridge *bridge)
+{
+	struct vme_dev *vdev;
+	struct vme_dev *tmp;
+
+	mutex_lock(&vme_buses_lock);
+	vme_bus_numbers &= ~(1 << bridge->num);
+	list_for_each_entry_safe(vdev, tmp, &bridge->devices, bridge_list) {
+		list_del(&vdev->drv_list);
+		list_del(&vdev->bridge_list);
+		device_unregister(&vdev->dev);
+	}
+	list_del(&bridge->bus_list);
+	mutex_unlock(&vme_buses_lock);
+}
+EXPORT_SYMBOL(vme_unregister_bridge);
+
+/* - Driver Registration --------------------------------------------------- */
+
+static int __vme_register_driver_bus(struct vme_driver *drv,
+	struct vme_bridge *bridge, unsigned int ndevs)
+{
+	int err;
+	unsigned int i;
+	struct vme_dev *vdev;
+	struct vme_dev *tmp;
+
+	for (i = 0; i < ndevs; i++) {
+		vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
+		if (!vdev) {
+			err = -ENOMEM;
+			goto err_devalloc;
+		}
+		vdev->num = i;
+		vdev->bridge = bridge;
+		vdev->dev.platform_data = drv;
+		vdev->dev.release = vme_dev_release;
+		vdev->dev.parent = bridge->parent;
+		vdev->dev.bus = &vme_bus_type;
+		dev_set_name(&vdev->dev, "%s.%u-%u", drv->name, bridge->num,
+			vdev->num);
+
+		err = device_register(&vdev->dev);
+		if (err)
+			goto err_reg;
+
+		if (vdev->dev.platform_data) {
+			list_add_tail(&vdev->drv_list, &drv->devices);
+			list_add_tail(&vdev->bridge_list, &bridge->devices);
+		} else
+			device_unregister(&vdev->dev);
+	}
+	return 0;
+
+err_reg:
+	put_device(&vdev->dev);
+err_devalloc:
+	list_for_each_entry_safe(vdev, tmp, &drv->devices, drv_list) {
+		list_del(&vdev->drv_list);
+		list_del(&vdev->bridge_list);
+		device_unregister(&vdev->dev);
+	}
+	return err;
+}
+
+static int __vme_register_driver(struct vme_driver *drv, unsigned int ndevs)
+{
+	struct vme_bridge *bridge;
+	int err = 0;
+
+	mutex_lock(&vme_buses_lock);
+	list_for_each_entry(bridge, &vme_bus_list, bus_list) {
+		/*
+		 * This cannot cause trouble as we already have vme_buses_lock
+		 * and if the bridge is removed, it will have to go through
+		 * vme_unregister_bridge() to do it (which calls remove() on
+		 * the bridge which in turn tries to acquire vme_buses_lock and
+		 * will have to wait).
+		 */
+		err = __vme_register_driver_bus(drv, bridge, ndevs);
+		if (err)
+			break;
+	}
+	mutex_unlock(&vme_buses_lock);
+	return err;
+}
+
+/**
+ * vme_register_driver - Register a VME driver
+ * @drv: Pointer to VME driver structure to register.
+ * @ndevs: Maximum number of devices to allow to be enumerated.
+ *
+ * Register a VME device driver with the VME subsystem.
+ *
+ * Return: Zero on success, error value on registration failure.
+ */
+int vme_register_driver(struct vme_driver *drv, unsigned int ndevs)
+{
+	int err;
+
+	drv->driver.name = drv->name;
+	drv->driver.bus = &vme_bus_type;
+	INIT_LIST_HEAD(&drv->devices);
+
+	err = driver_register(&drv->driver);
+	if (err)
+		return err;
+
+	err = __vme_register_driver(drv, ndevs);
+	if (err)
+		driver_unregister(&drv->driver);
+
+	return err;
+}
+EXPORT_SYMBOL(vme_register_driver);
+
+/**
+ * vme_unregister_driver - Unregister a VME driver
+ * @drv: Pointer to VME driver structure to unregister.
+ *
+ * Unregister a VME device driver from the VME subsystem.
+ */
+void vme_unregister_driver(struct vme_driver *drv)
+{
+	struct vme_dev *dev, *dev_tmp;
+
+	mutex_lock(&vme_buses_lock);
+	list_for_each_entry_safe(dev, dev_tmp, &drv->devices, drv_list) {
+		list_del(&dev->drv_list);
+		list_del(&dev->bridge_list);
+		device_unregister(&dev->dev);
+	}
+	mutex_unlock(&vme_buses_lock);
+
+	driver_unregister(&drv->driver);
+}
+EXPORT_SYMBOL(vme_unregister_driver);
+
+/* - Bus Registration ------------------------------------------------------ */
+
+static int vme_bus_match(struct device *dev, struct device_driver *drv)
+{
+	struct vme_driver *vme_drv;
+
+	vme_drv = container_of(drv, struct vme_driver, driver);
+
+	if (dev->platform_data == vme_drv) {
+		struct vme_dev *vdev = dev_to_vme_dev(dev);
+
+		if (vme_drv->match && vme_drv->match(vdev))
+			return 1;
+
+		dev->platform_data = NULL;
+	}
+	return 0;
+}
+
+static int vme_bus_probe(struct device *dev)
+{
+	struct vme_driver *driver;
+	struct vme_dev *vdev = dev_to_vme_dev(dev);
+
+	driver = dev->platform_data;
+	if (driver->probe)
+		return driver->probe(vdev);
+
+	return -ENODEV;
+}
+
+static void vme_bus_remove(struct device *dev)
+{
+	struct vme_driver *driver;
+	struct vme_dev *vdev = dev_to_vme_dev(dev);
+
+	driver = dev->platform_data;
+	if (driver->remove)
+		driver->remove(vdev);
+}
+
+struct bus_type vme_bus_type = {
+	.name = "vme",
+	.match = vme_bus_match,
+	.probe = vme_bus_probe,
+	.remove = vme_bus_remove,
+};
+EXPORT_SYMBOL(vme_bus_type);
+
+static int __init vme_init(void)
+{
+	return bus_register(&vme_bus_type);
+}
+subsys_initcall(vme_init);
diff --git a/drivers/staging/vme_user/vme.h b/drivers/staging/vme_user/vme.h
new file mode 100644
index 000000000000..b204a9b4be1b
--- /dev/null
+++ b/drivers/staging/vme_user/vme.h
@@ -0,0 +1,190 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _VME_H_
+#define _VME_H_
+
+/* Resource Type */
+enum vme_resource_type {
+	VME_MASTER,
+	VME_SLAVE,
+	VME_DMA,
+	VME_LM
+};
+
+/* VME Address Spaces */
+#define VME_A16		0x1
+#define VME_A24		0x2
+#define	VME_A32		0x4
+#define VME_A64		0x8
+#define VME_CRCSR	0x10
+#define VME_USER1	0x20
+#define VME_USER2	0x40
+#define VME_USER3	0x80
+#define VME_USER4	0x100
+
+#define VME_A16_MAX	0x10000ULL
+#define VME_A24_MAX	0x1000000ULL
+#define VME_A32_MAX	0x100000000ULL
+#define VME_A64_MAX	0x10000000000000000ULL
+#define VME_CRCSR_MAX	0x1000000ULL
+
+
+/* VME Cycle Types */
+#define VME_SCT		0x1
+#define VME_BLT		0x2
+#define VME_MBLT	0x4
+#define VME_2eVME	0x8
+#define VME_2eSST	0x10
+#define VME_2eSSTB	0x20
+
+#define VME_2eSST160	0x100
+#define VME_2eSST267	0x200
+#define VME_2eSST320	0x400
+
+#define	VME_SUPER	0x1000
+#define	VME_USER	0x2000
+#define	VME_PROG	0x4000
+#define	VME_DATA	0x8000
+
+/* VME Data Widths */
+#define VME_D8		0x1
+#define VME_D16		0x2
+#define VME_D32		0x4
+#define VME_D64		0x8
+
+/* Arbitration Scheduling Modes */
+#define VME_R_ROBIN_MODE	0x1
+#define VME_PRIORITY_MODE	0x2
+
+#define VME_DMA_PATTERN			(1<<0)
+#define VME_DMA_PCI			(1<<1)
+#define VME_DMA_VME			(1<<2)
+
+#define VME_DMA_PATTERN_BYTE		(1<<0)
+#define VME_DMA_PATTERN_WORD		(1<<1)
+#define VME_DMA_PATTERN_INCREMENT	(1<<2)
+
+#define VME_DMA_VME_TO_MEM		(1<<0)
+#define VME_DMA_MEM_TO_VME		(1<<1)
+#define VME_DMA_VME_TO_VME		(1<<2)
+#define VME_DMA_MEM_TO_MEM		(1<<3)
+#define VME_DMA_PATTERN_TO_VME		(1<<4)
+#define VME_DMA_PATTERN_TO_MEM		(1<<5)
+
+struct vme_dma_attr {
+	u32 type;
+	void *private;
+};
+
+struct vme_resource {
+	enum vme_resource_type type;
+	struct list_head *entry;
+};
+
+extern struct bus_type vme_bus_type;
+
+/* Number of VME interrupt vectors */
+#define VME_NUM_STATUSID	256
+
+/* VME_MAX_BRIDGES comes from the type of vme_bus_numbers */
+#define VME_MAX_BRIDGES		(sizeof(unsigned int)*8)
+#define VME_MAX_SLOTS		32
+
+#define VME_SLOT_CURRENT	-1
+#define VME_SLOT_ALL		-2
+
+/**
+ * struct vme_dev - Structure representing a VME device
+ * @num: The device number
+ * @bridge: Pointer to the bridge device this device is on
+ * @dev: Internal device structure
+ * @drv_list: List of devices (per driver)
+ * @bridge_list: List of devices (per bridge)
+ */
+struct vme_dev {
+	int num;
+	struct vme_bridge *bridge;
+	struct device dev;
+	struct list_head drv_list;
+	struct list_head bridge_list;
+};
+
+/**
+ * struct vme_driver - Structure representing a VME driver
+ * @name: Driver name, should be unique among VME drivers and usually the same
+ *        as the module name.
+ * @match: Callback used to determine whether probe should be run.
+ * @probe: Callback for device binding, called when new device is detected.
+ * @remove: Callback, called on device removal.
+ * @driver: Underlying generic device driver structure.
+ * @devices: List of VME devices (struct vme_dev) associated with this driver.
+ */
+struct vme_driver {
+	const char *name;
+	int (*match)(struct vme_dev *);
+	int (*probe)(struct vme_dev *);
+	void (*remove)(struct vme_dev *);
+	struct device_driver driver;
+	struct list_head devices;
+};
+
+void *vme_alloc_consistent(struct vme_resource *, size_t, dma_addr_t *);
+void vme_free_consistent(struct vme_resource *, size_t,  void *,
+	dma_addr_t);
+
+size_t vme_get_size(struct vme_resource *);
+int vme_check_window(u32 aspace, unsigned long long vme_base,
+		     unsigned long long size);
+
+struct vme_resource *vme_slave_request(struct vme_dev *, u32, u32);
+int vme_slave_set(struct vme_resource *, int, unsigned long long,
+	unsigned long long, dma_addr_t, u32, u32);
+int vme_slave_get(struct vme_resource *, int *, unsigned long long *,
+	unsigned long long *, dma_addr_t *, u32 *, u32 *);
+void vme_slave_free(struct vme_resource *);
+
+struct vme_resource *vme_master_request(struct vme_dev *, u32, u32, u32);
+int vme_master_set(struct vme_resource *, int, unsigned long long,
+	unsigned long long, u32, u32, u32);
+int vme_master_get(struct vme_resource *, int *, unsigned long long *,
+	unsigned long long *, u32 *, u32 *, u32 *);
+ssize_t vme_master_read(struct vme_resource *, void *, size_t, loff_t);
+ssize_t vme_master_write(struct vme_resource *, void *, size_t, loff_t);
+unsigned int vme_master_rmw(struct vme_resource *, unsigned int, unsigned int,
+	unsigned int, loff_t);
+int vme_master_mmap(struct vme_resource *resource, struct vm_area_struct *vma);
+void vme_master_free(struct vme_resource *);
+
+struct vme_resource *vme_dma_request(struct vme_dev *, u32);
+struct vme_dma_list *vme_new_dma_list(struct vme_resource *);
+struct vme_dma_attr *vme_dma_pattern_attribute(u32, u32);
+struct vme_dma_attr *vme_dma_pci_attribute(dma_addr_t);
+struct vme_dma_attr *vme_dma_vme_attribute(unsigned long long, u32, u32, u32);
+void vme_dma_free_attribute(struct vme_dma_attr *);
+int vme_dma_list_add(struct vme_dma_list *, struct vme_dma_attr *,
+	struct vme_dma_attr *, size_t);
+int vme_dma_list_exec(struct vme_dma_list *);
+int vme_dma_list_free(struct vme_dma_list *);
+int vme_dma_free(struct vme_resource *);
+
+int vme_irq_request(struct vme_dev *, int, int,
+	void (*callback)(int, int, void *), void *);
+void vme_irq_free(struct vme_dev *, int, int);
+int vme_irq_generate(struct vme_dev *, int, int);
+
+struct vme_resource *vme_lm_request(struct vme_dev *);
+int vme_lm_count(struct vme_resource *);
+int vme_lm_set(struct vme_resource *, unsigned long long, u32, u32);
+int vme_lm_get(struct vme_resource *, unsigned long long *, u32 *, u32 *);
+int vme_lm_attach(struct vme_resource *, int, void (*callback)(void *), void *);
+int vme_lm_detach(struct vme_resource *, int);
+void vme_lm_free(struct vme_resource *);
+
+int vme_slot_num(struct vme_dev *);
+int vme_bus_num(struct vme_dev *);
+
+int vme_register_driver(struct vme_driver *, unsigned int);
+void vme_unregister_driver(struct vme_driver *);
+
+
+#endif /* _VME_H_ */
+
diff --git a/drivers/staging/vme_user/vme_bridge.h b/drivers/staging/vme_user/vme_bridge.h
new file mode 100644
index 000000000000..0bbefe9851d7
--- /dev/null
+++ b/drivers/staging/vme_user/vme_bridge.h
@@ -0,0 +1,190 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _VME_BRIDGE_H_
+#define _VME_BRIDGE_H_
+
+#include "vme.h"
+
+#define VME_CRCSR_BUF_SIZE (508*1024)
+/*
+ * Resource structures
+ */
+struct vme_master_resource {
+	struct list_head list;
+	struct vme_bridge *parent;
+	/*
+	 * We are likely to need to access the VME bus in interrupt context, so
+	 * protect master routines with a spinlock rather than a mutex.
+	 */
+	spinlock_t lock;
+	int locked;
+	int number;
+	u32 address_attr;
+	u32 cycle_attr;
+	u32 width_attr;
+	struct resource bus_resource;
+	void __iomem *kern_base;
+};
+
+struct vme_slave_resource {
+	struct list_head list;
+	struct vme_bridge *parent;
+	struct mutex mtx;
+	int locked;
+	int number;
+	u32 address_attr;
+	u32 cycle_attr;
+};
+
+struct vme_dma_pattern {
+	u32 pattern;
+	u32 type;
+};
+
+struct vme_dma_pci {
+	dma_addr_t address;
+};
+
+struct vme_dma_vme {
+	unsigned long long address;
+	u32 aspace;
+	u32 cycle;
+	u32 dwidth;
+};
+
+struct vme_dma_list {
+	struct list_head list;
+	struct vme_dma_resource *parent;
+	struct list_head entries;
+	struct mutex mtx;
+};
+
+struct vme_dma_resource {
+	struct list_head list;
+	struct vme_bridge *parent;
+	struct mutex mtx;
+	int locked;
+	int number;
+	struct list_head pending;
+	struct list_head running;
+	u32 route_attr;
+};
+
+struct vme_lm_resource {
+	struct list_head list;
+	struct vme_bridge *parent;
+	struct mutex mtx;
+	int locked;
+	int number;
+	int monitors;
+};
+
+struct vme_error_handler {
+	struct list_head list;
+	unsigned long long start;	/* Beginning of error window */
+	unsigned long long end;		/* End of error window */
+	unsigned long long first_error;	/* Address of the first error */
+	u32 aspace;			/* Address space of error window*/
+	unsigned num_errors;		/* Number of errors */
+};
+
+struct vme_callback {
+	void (*func)(int, int, void*);
+	void *priv_data;
+};
+
+struct vme_irq {
+	int count;
+	struct vme_callback callback[VME_NUM_STATUSID];
+};
+
+/* Allow 16 characters for name (including null character) */
+#define VMENAMSIZ 16
+
+/* This structure stores all the information about one bridge
+ * The structure should be dynamically allocated by the driver and one instance
+ * of the structure should be present for each VME chip present in the system.
+ */
+struct vme_bridge {
+	char name[VMENAMSIZ];
+	int num;
+	struct list_head master_resources;
+	struct list_head slave_resources;
+	struct list_head dma_resources;
+	struct list_head lm_resources;
+
+	/* List for registered errors handlers */
+	struct list_head vme_error_handlers;
+	/* List of devices on this bridge */
+	struct list_head devices;
+
+	/* Bridge Info - XXX Move to private structure? */
+	struct device *parent;	/* Parent device (eg. pdev->dev for PCI) */
+	void *driver_priv;	/* Private pointer for the bridge driver */
+	struct list_head bus_list; /* list of VME buses */
+
+	/* Interrupt callbacks */
+	struct vme_irq irq[7];
+	/* Locking for VME irq callback configuration */
+	struct mutex irq_mtx;
+
+	/* Slave Functions */
+	int (*slave_get) (struct vme_slave_resource *, int *,
+		unsigned long long *, unsigned long long *, dma_addr_t *,
+		u32 *, u32 *);
+	int (*slave_set) (struct vme_slave_resource *, int, unsigned long long,
+		unsigned long long, dma_addr_t, u32, u32);
+
+	/* Master Functions */
+	int (*master_get) (struct vme_master_resource *, int *,
+		unsigned long long *, unsigned long long *, u32 *, u32 *,
+		u32 *);
+	int (*master_set) (struct vme_master_resource *, int,
+		unsigned long long, unsigned long long,  u32, u32, u32);
+	ssize_t (*master_read) (struct vme_master_resource *, void *, size_t,
+		loff_t);
+	ssize_t (*master_write) (struct vme_master_resource *, void *, size_t,
+		loff_t);
+	unsigned int (*master_rmw) (struct vme_master_resource *, unsigned int,
+		unsigned int, unsigned int, loff_t);
+
+	/* DMA Functions */
+	int (*dma_list_add) (struct vme_dma_list *, struct vme_dma_attr *,
+		struct vme_dma_attr *, size_t);
+	int (*dma_list_exec) (struct vme_dma_list *);
+	int (*dma_list_empty) (struct vme_dma_list *);
+
+	/* Interrupt Functions */
+	void (*irq_set) (struct vme_bridge *, int, int, int);
+	int (*irq_generate) (struct vme_bridge *, int, int);
+
+	/* Location monitor functions */
+	int (*lm_set) (struct vme_lm_resource *, unsigned long long, u32, u32);
+	int (*lm_get) (struct vme_lm_resource *, unsigned long long *, u32 *,
+		u32 *);
+	int (*lm_attach)(struct vme_lm_resource *, int,
+			 void (*callback)(void *), void *);
+	int (*lm_detach) (struct vme_lm_resource *, int);
+
+	/* CR/CSR space functions */
+	int (*slot_get) (struct vme_bridge *);
+
+	/* Bridge parent interface */
+	void *(*alloc_consistent)(struct device *dev, size_t size,
+		dma_addr_t *dma);
+	void (*free_consistent)(struct device *dev, size_t size,
+		void *vaddr, dma_addr_t dma);
+};
+
+void vme_bus_error_handler(struct vme_bridge *bridge,
+			   unsigned long long address, int am);
+void vme_irq_handler(struct vme_bridge *, int, int);
+
+struct vme_bridge *vme_init_bridge(struct vme_bridge *);
+int vme_register_bridge(struct vme_bridge *);
+void vme_unregister_bridge(struct vme_bridge *);
+struct vme_error_handler *vme_register_error_handler(
+	struct vme_bridge *bridge, u32 aspace,
+	unsigned long long address, size_t len);
+void vme_unregister_error_handler(struct vme_error_handler *handler);
+
+#endif /* _VME_BRIDGE_H_ */
diff --git a/drivers/staging/vme_user/vme_fake.c b/drivers/staging/vme_user/vme_fake.c
new file mode 100644
index 000000000000..dd646b0c531d
--- /dev/null
+++ b/drivers/staging/vme_user/vme_fake.c
@@ -0,0 +1,1305 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Fake VME bridge support.
+ *
+ * This drive provides a fake VME bridge chip, this enables debugging of the
+ * VME framework in the absence of a VME system.
+ *
+ * This driver has to do a number of things in software that would be driven
+ * by hardware if it was available, it will also result in extra overhead at
+ * times when compared with driving actual hardware.
+ *
+ * Author: Martyn Welch <martyn@welches.me.uk>
+ * Copyright (c) 2014 Martyn Welch
+ *
+ * Based on vme_tsi148.c:
+ *
+ * Author: Martyn Welch <martyn.welch@ge.com>
+ * Copyright 2008 GE Intelligent Platforms Embedded Systems, Inc.
+ *
+ * Based on work by Tom Armistead and Ajit Prem
+ * Copyright 2004 Motorola Inc.
+ */
+
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include "vme.h"
+#include "vme_bridge.h"
+
+/*
+ *  Define the number of each that the fake driver supports.
+ */
+#define FAKE_MAX_MASTER		8	/* Max Master Windows */
+#define FAKE_MAX_SLAVE		8	/* Max Slave Windows */
+
+/* Structures to hold information normally held in device registers */
+struct fake_slave_window {
+	int enabled;
+	unsigned long long vme_base;
+	unsigned long long size;
+	void *buf_base;
+	u32 aspace;
+	u32 cycle;
+};
+
+struct fake_master_window {
+	int enabled;
+	unsigned long long vme_base;
+	unsigned long long size;
+	u32 aspace;
+	u32 cycle;
+	u32 dwidth;
+};
+
+/* Structure used to hold driver specific information */
+struct fake_driver {
+	struct vme_bridge *parent;
+	struct fake_slave_window slaves[FAKE_MAX_SLAVE];
+	struct fake_master_window masters[FAKE_MAX_MASTER];
+	u32 lm_enabled;
+	unsigned long long lm_base;
+	u32 lm_aspace;
+	u32 lm_cycle;
+	void (*lm_callback[4])(void *);
+	void *lm_data[4];
+	struct tasklet_struct int_tasklet;
+	int int_level;
+	int int_statid;
+	void *crcsr_kernel;
+	dma_addr_t crcsr_bus;
+	/* Only one VME interrupt can be generated at a time, provide locking */
+	struct mutex vme_int;
+};
+
+/* Module parameter */
+static int geoid;
+
+static const char driver_name[] = "vme_fake";
+
+static struct vme_bridge *exit_pointer;
+
+static struct device *vme_root;
+
+/*
+ * Calling VME bus interrupt callback if provided.
+ */
+static void fake_VIRQ_tasklet(unsigned long data)
+{
+	struct vme_bridge *fake_bridge;
+	struct fake_driver *bridge;
+
+	fake_bridge = (struct vme_bridge *) data;
+	bridge = fake_bridge->driver_priv;
+
+	vme_irq_handler(fake_bridge, bridge->int_level, bridge->int_statid);
+}
+
+/*
+ * Configure VME interrupt
+ */
+static void fake_irq_set(struct vme_bridge *fake_bridge, int level,
+		int state, int sync)
+{
+	/* Nothing to do */
+}
+
+static void *fake_pci_to_ptr(dma_addr_t addr)
+{
+	return (void *)(uintptr_t)addr;
+}
+
+static dma_addr_t fake_ptr_to_pci(void *addr)
+{
+	return (dma_addr_t)(uintptr_t)addr;
+}
+
+/*
+ * Generate a VME bus interrupt at the requested level & vector. Wait for
+ * interrupt to be acked.
+ */
+static int fake_irq_generate(struct vme_bridge *fake_bridge, int level,
+		int statid)
+{
+	struct fake_driver *bridge;
+
+	bridge = fake_bridge->driver_priv;
+
+	mutex_lock(&bridge->vme_int);
+
+	bridge->int_level = level;
+
+	bridge->int_statid = statid;
+
+	/*
+	 * Schedule tasklet to run VME handler to emulate normal VME interrupt
+	 * handler behaviour.
+	 */
+	tasklet_schedule(&bridge->int_tasklet);
+
+	mutex_unlock(&bridge->vme_int);
+
+	return 0;
+}
+
+/*
+ * Initialize a slave window with the requested attributes.
+ */
+static int fake_slave_set(struct vme_slave_resource *image, int enabled,
+		unsigned long long vme_base, unsigned long long size,
+		dma_addr_t buf_base, u32 aspace, u32 cycle)
+{
+	unsigned int i, granularity = 0;
+	unsigned long long vme_bound;
+	struct vme_bridge *fake_bridge;
+	struct fake_driver *bridge;
+
+	fake_bridge = image->parent;
+	bridge = fake_bridge->driver_priv;
+
+	i = image->number;
+
+	switch (aspace) {
+	case VME_A16:
+		granularity = 0x10;
+		break;
+	case VME_A24:
+		granularity = 0x1000;
+		break;
+	case VME_A32:
+		granularity = 0x10000;
+		break;
+	case VME_A64:
+		granularity = 0x10000;
+		break;
+	case VME_CRCSR:
+	case VME_USER1:
+	case VME_USER2:
+	case VME_USER3:
+	case VME_USER4:
+	default:
+		pr_err("Invalid address space\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Bound address is a valid address for the window, adjust
+	 * accordingly
+	 */
+	vme_bound = vme_base + size - granularity;
+
+	if (vme_base & (granularity - 1)) {
+		pr_err("Invalid VME base alignment\n");
+		return -EINVAL;
+	}
+	if (vme_bound & (granularity - 1)) {
+		pr_err("Invalid VME bound alignment\n");
+		return -EINVAL;
+	}
+
+	mutex_lock(&image->mtx);
+
+	bridge->slaves[i].enabled = enabled;
+	bridge->slaves[i].vme_base = vme_base;
+	bridge->slaves[i].size = size;
+	bridge->slaves[i].buf_base = fake_pci_to_ptr(buf_base);
+	bridge->slaves[i].aspace = aspace;
+	bridge->slaves[i].cycle = cycle;
+
+	mutex_unlock(&image->mtx);
+
+	return 0;
+}
+
+/*
+ * Get slave window configuration.
+ */
+static int fake_slave_get(struct vme_slave_resource *image, int *enabled,
+		unsigned long long *vme_base, unsigned long long *size,
+		dma_addr_t *buf_base, u32 *aspace, u32 *cycle)
+{
+	unsigned int i;
+	struct fake_driver *bridge;
+
+	bridge = image->parent->driver_priv;
+
+	i = image->number;
+
+	mutex_lock(&image->mtx);
+
+	*enabled = bridge->slaves[i].enabled;
+	*vme_base = bridge->slaves[i].vme_base;
+	*size = bridge->slaves[i].size;
+	*buf_base = fake_ptr_to_pci(bridge->slaves[i].buf_base);
+	*aspace = bridge->slaves[i].aspace;
+	*cycle = bridge->slaves[i].cycle;
+
+	mutex_unlock(&image->mtx);
+
+	return 0;
+}
+
+/*
+ * Set the attributes of an outbound window.
+ */
+static int fake_master_set(struct vme_master_resource *image, int enabled,
+		unsigned long long vme_base, unsigned long long size,
+		u32 aspace, u32 cycle, u32 dwidth)
+{
+	int retval = 0;
+	unsigned int i;
+	struct vme_bridge *fake_bridge;
+	struct fake_driver *bridge;
+
+	fake_bridge = image->parent;
+
+	bridge = fake_bridge->driver_priv;
+
+	/* Verify input data */
+	if (vme_base & 0xFFFF) {
+		pr_err("Invalid VME Window alignment\n");
+		retval = -EINVAL;
+		goto err_window;
+	}
+
+	if (size & 0xFFFF) {
+		pr_err("Invalid size alignment\n");
+		retval = -EINVAL;
+		goto err_window;
+	}
+
+	if ((size == 0) && (enabled != 0)) {
+		pr_err("Size must be non-zero for enabled windows\n");
+		retval = -EINVAL;
+		goto err_window;
+	}
+
+	/* Setup data width */
+	switch (dwidth) {
+	case VME_D8:
+	case VME_D16:
+	case VME_D32:
+		break;
+	default:
+		pr_err("Invalid data width\n");
+		retval = -EINVAL;
+		goto err_dwidth;
+	}
+
+	/* Setup address space */
+	switch (aspace) {
+	case VME_A16:
+	case VME_A24:
+	case VME_A32:
+	case VME_A64:
+	case VME_CRCSR:
+	case VME_USER1:
+	case VME_USER2:
+	case VME_USER3:
+	case VME_USER4:
+		break;
+	default:
+		pr_err("Invalid address space\n");
+		retval = -EINVAL;
+		goto err_aspace;
+	}
+
+	spin_lock(&image->lock);
+
+	i = image->number;
+
+	bridge->masters[i].enabled = enabled;
+	bridge->masters[i].vme_base = vme_base;
+	bridge->masters[i].size = size;
+	bridge->masters[i].aspace = aspace;
+	bridge->masters[i].cycle = cycle;
+	bridge->masters[i].dwidth = dwidth;
+
+	spin_unlock(&image->lock);
+
+	return 0;
+
+err_aspace:
+err_dwidth:
+err_window:
+	return retval;
+
+}
+
+/*
+ * Set the attributes of an outbound window.
+ */
+static int __fake_master_get(struct vme_master_resource *image, int *enabled,
+		unsigned long long *vme_base, unsigned long long *size,
+		u32 *aspace, u32 *cycle, u32 *dwidth)
+{
+	unsigned int i;
+	struct fake_driver *bridge;
+
+	bridge = image->parent->driver_priv;
+
+	i = image->number;
+
+	*enabled = bridge->masters[i].enabled;
+	*vme_base = bridge->masters[i].vme_base;
+	*size = bridge->masters[i].size;
+	*aspace = bridge->masters[i].aspace;
+	*cycle = bridge->masters[i].cycle;
+	*dwidth = bridge->masters[i].dwidth;
+
+	return 0;
+}
+
+
+static int fake_master_get(struct vme_master_resource *image, int *enabled,
+		unsigned long long *vme_base, unsigned long long *size,
+		u32 *aspace, u32 *cycle, u32 *dwidth)
+{
+	int retval;
+
+	spin_lock(&image->lock);
+
+	retval = __fake_master_get(image, enabled, vme_base, size, aspace,
+			cycle, dwidth);
+
+	spin_unlock(&image->lock);
+
+	return retval;
+}
+
+
+static void fake_lm_check(struct fake_driver *bridge, unsigned long long addr,
+			  u32 aspace, u32 cycle)
+{
+	struct vme_bridge *fake_bridge;
+	unsigned long long lm_base;
+	u32 lm_aspace, lm_cycle;
+	int i;
+	struct vme_lm_resource *lm;
+	struct list_head *pos = NULL, *n;
+
+	/* Get vme_bridge */
+	fake_bridge = bridge->parent;
+
+	/* Loop through each location monitor resource */
+	list_for_each_safe(pos, n, &fake_bridge->lm_resources) {
+		lm = list_entry(pos, struct vme_lm_resource, list);
+
+		/* If disabled, we're done */
+		if (bridge->lm_enabled == 0)
+			return;
+
+		lm_base = bridge->lm_base;
+		lm_aspace = bridge->lm_aspace;
+		lm_cycle = bridge->lm_cycle;
+
+		/* First make sure that the cycle and address space match */
+		if ((lm_aspace == aspace) && (lm_cycle == cycle)) {
+			for (i = 0; i < lm->monitors; i++) {
+				/* Each location monitor covers 8 bytes */
+				if (((lm_base + (8 * i)) <= addr) &&
+				    ((lm_base + (8 * i) + 8) > addr)) {
+					if (bridge->lm_callback[i])
+						bridge->lm_callback[i](
+							bridge->lm_data[i]);
+				}
+			}
+		}
+	}
+}
+
+static noinline_for_stack u8 fake_vmeread8(struct fake_driver *bridge,
+					   unsigned long long addr,
+					   u32 aspace, u32 cycle)
+{
+	u8 retval = 0xff;
+	int i;
+	unsigned long long start, end, offset;
+	u8 *loc;
+
+	for (i = 0; i < FAKE_MAX_SLAVE; i++) {
+		start = bridge->slaves[i].vme_base;
+		end = bridge->slaves[i].vme_base + bridge->slaves[i].size;
+
+		if (aspace != bridge->slaves[i].aspace)
+			continue;
+
+		if (cycle != bridge->slaves[i].cycle)
+			continue;
+
+		if ((addr >= start) && (addr < end)) {
+			offset = addr - bridge->slaves[i].vme_base;
+			loc = (u8 *)(bridge->slaves[i].buf_base + offset);
+			retval = *loc;
+
+			break;
+		}
+	}
+
+	fake_lm_check(bridge, addr, aspace, cycle);
+
+	return retval;
+}
+
+static noinline_for_stack u16 fake_vmeread16(struct fake_driver *bridge,
+					     unsigned long long addr,
+					     u32 aspace, u32 cycle)
+{
+	u16 retval = 0xffff;
+	int i;
+	unsigned long long start, end, offset;
+	u16 *loc;
+
+	for (i = 0; i < FAKE_MAX_SLAVE; i++) {
+		if (aspace != bridge->slaves[i].aspace)
+			continue;
+
+		if (cycle != bridge->slaves[i].cycle)
+			continue;
+
+		start = bridge->slaves[i].vme_base;
+		end = bridge->slaves[i].vme_base + bridge->slaves[i].size;
+
+		if ((addr >= start) && ((addr + 1) < end)) {
+			offset = addr - bridge->slaves[i].vme_base;
+			loc = (u16 *)(bridge->slaves[i].buf_base + offset);
+			retval = *loc;
+
+			break;
+		}
+	}
+
+	fake_lm_check(bridge, addr, aspace, cycle);
+
+	return retval;
+}
+
+static noinline_for_stack u32 fake_vmeread32(struct fake_driver *bridge,
+					     unsigned long long addr,
+					     u32 aspace, u32 cycle)
+{
+	u32 retval = 0xffffffff;
+	int i;
+	unsigned long long start, end, offset;
+	u32 *loc;
+
+	for (i = 0; i < FAKE_MAX_SLAVE; i++) {
+		if (aspace != bridge->slaves[i].aspace)
+			continue;
+
+		if (cycle != bridge->slaves[i].cycle)
+			continue;
+
+		start = bridge->slaves[i].vme_base;
+		end = bridge->slaves[i].vme_base + bridge->slaves[i].size;
+
+		if ((addr >= start) && ((addr + 3) < end)) {
+			offset = addr - bridge->slaves[i].vme_base;
+			loc = (u32 *)(bridge->slaves[i].buf_base + offset);
+			retval = *loc;
+
+			break;
+		}
+	}
+
+	fake_lm_check(bridge, addr, aspace, cycle);
+
+	return retval;
+}
+
+static ssize_t fake_master_read(struct vme_master_resource *image, void *buf,
+		size_t count, loff_t offset)
+{
+	int retval;
+	u32 aspace, cycle, dwidth;
+	struct vme_bridge *fake_bridge;
+	struct fake_driver *priv;
+	int i;
+	unsigned long long addr;
+	unsigned int done = 0;
+	unsigned int count32;
+
+	fake_bridge = image->parent;
+
+	priv = fake_bridge->driver_priv;
+
+	i = image->number;
+
+	addr = (unsigned long long)priv->masters[i].vme_base + offset;
+	aspace = priv->masters[i].aspace;
+	cycle = priv->masters[i].cycle;
+	dwidth = priv->masters[i].dwidth;
+
+	spin_lock(&image->lock);
+
+	/* The following code handles VME address alignment. We cannot use
+	 * memcpy_xxx here because it may cut data transfers in to 8-bit
+	 * cycles when D16 or D32 cycles are required on the VME bus.
+	 * On the other hand, the bridge itself assures that the maximum data
+	 * cycle configured for the transfer is used and splits it
+	 * automatically for non-aligned addresses, so we don't want the
+	 * overhead of needlessly forcing small transfers for the entire cycle.
+	 */
+	if (addr & 0x1) {
+		*(u8 *)buf = fake_vmeread8(priv, addr, aspace, cycle);
+		done += 1;
+		if (done == count)
+			goto out;
+	}
+	if ((dwidth == VME_D16) || (dwidth == VME_D32)) {
+		if ((addr + done) & 0x2) {
+			if ((count - done) < 2) {
+				*(u8 *)(buf + done) = fake_vmeread8(priv,
+						addr + done, aspace, cycle);
+				done += 1;
+				goto out;
+			} else {
+				*(u16 *)(buf + done) = fake_vmeread16(priv,
+						addr + done, aspace, cycle);
+				done += 2;
+			}
+		}
+	}
+
+	if (dwidth == VME_D32) {
+		count32 = (count - done) & ~0x3;
+		while (done < count32) {
+			*(u32 *)(buf + done) = fake_vmeread32(priv, addr + done,
+					aspace, cycle);
+			done += 4;
+		}
+	} else if (dwidth == VME_D16) {
+		count32 = (count - done) & ~0x3;
+		while (done < count32) {
+			*(u16 *)(buf + done) = fake_vmeread16(priv, addr + done,
+					aspace, cycle);
+			done += 2;
+		}
+	} else if (dwidth == VME_D8) {
+		count32 = (count - done);
+		while (done < count32) {
+			*(u8 *)(buf + done) = fake_vmeread8(priv, addr + done,
+					aspace, cycle);
+			done += 1;
+		}
+
+	}
+
+	if ((dwidth == VME_D16) || (dwidth == VME_D32)) {
+		if ((count - done) & 0x2) {
+			*(u16 *)(buf + done) = fake_vmeread16(priv, addr + done,
+					aspace, cycle);
+			done += 2;
+		}
+	}
+	if ((count - done) & 0x1) {
+		*(u8 *)(buf + done) = fake_vmeread8(priv, addr + done, aspace,
+				cycle);
+		done += 1;
+	}
+
+out:
+	retval = count;
+
+	spin_unlock(&image->lock);
+
+	return retval;
+}
+
+static noinline_for_stack void fake_vmewrite8(struct fake_driver *bridge,
+					      u8 *buf, unsigned long long addr,
+					      u32 aspace, u32 cycle)
+{
+	int i;
+	unsigned long long start, end, offset;
+	u8 *loc;
+
+	for (i = 0; i < FAKE_MAX_SLAVE; i++) {
+		if (aspace != bridge->slaves[i].aspace)
+			continue;
+
+		if (cycle != bridge->slaves[i].cycle)
+			continue;
+
+		start = bridge->slaves[i].vme_base;
+		end = bridge->slaves[i].vme_base + bridge->slaves[i].size;
+
+		if ((addr >= start) && (addr < end)) {
+			offset = addr - bridge->slaves[i].vme_base;
+			loc = (u8 *)((void *)bridge->slaves[i].buf_base + offset);
+			*loc = *buf;
+
+			break;
+		}
+	}
+
+	fake_lm_check(bridge, addr, aspace, cycle);
+
+}
+
+static noinline_for_stack void fake_vmewrite16(struct fake_driver *bridge,
+					       u16 *buf, unsigned long long addr,
+					       u32 aspace, u32 cycle)
+{
+	int i;
+	unsigned long long start, end, offset;
+	u16 *loc;
+
+	for (i = 0; i < FAKE_MAX_SLAVE; i++) {
+		if (aspace != bridge->slaves[i].aspace)
+			continue;
+
+		if (cycle != bridge->slaves[i].cycle)
+			continue;
+
+		start = bridge->slaves[i].vme_base;
+		end = bridge->slaves[i].vme_base + bridge->slaves[i].size;
+
+		if ((addr >= start) && ((addr + 1) < end)) {
+			offset = addr - bridge->slaves[i].vme_base;
+			loc = (u16 *)((void *)bridge->slaves[i].buf_base + offset);
+			*loc = *buf;
+
+			break;
+		}
+	}
+
+	fake_lm_check(bridge, addr, aspace, cycle);
+
+}
+
+static noinline_for_stack void fake_vmewrite32(struct fake_driver *bridge,
+					       u32 *buf, unsigned long long addr,
+					       u32 aspace, u32 cycle)
+{
+	int i;
+	unsigned long long start, end, offset;
+	u32 *loc;
+
+	for (i = 0; i < FAKE_MAX_SLAVE; i++) {
+		if (aspace != bridge->slaves[i].aspace)
+			continue;
+
+		if (cycle != bridge->slaves[i].cycle)
+			continue;
+
+		start = bridge->slaves[i].vme_base;
+		end = bridge->slaves[i].vme_base + bridge->slaves[i].size;
+
+		if ((addr >= start) && ((addr + 3) < end)) {
+			offset = addr - bridge->slaves[i].vme_base;
+			loc = (u32 *)((void *)bridge->slaves[i].buf_base + offset);
+			*loc = *buf;
+
+			break;
+		}
+	}
+
+	fake_lm_check(bridge, addr, aspace, cycle);
+
+}
+
+static ssize_t fake_master_write(struct vme_master_resource *image, void *buf,
+		size_t count, loff_t offset)
+{
+	int retval = 0;
+	u32 aspace, cycle, dwidth;
+	unsigned long long addr;
+	int i;
+	unsigned int done = 0;
+	unsigned int count32;
+
+	struct vme_bridge *fake_bridge;
+	struct fake_driver *bridge;
+
+	fake_bridge = image->parent;
+
+	bridge = fake_bridge->driver_priv;
+
+	i = image->number;
+
+	addr = bridge->masters[i].vme_base + offset;
+	aspace = bridge->masters[i].aspace;
+	cycle = bridge->masters[i].cycle;
+	dwidth = bridge->masters[i].dwidth;
+
+	spin_lock(&image->lock);
+
+	/* Here we apply for the same strategy we do in master_read
+	 * function in order to assure the correct cycles.
+	 */
+	if (addr & 0x1) {
+		fake_vmewrite8(bridge, (u8 *)buf, addr, aspace, cycle);
+		done += 1;
+		if (done == count)
+			goto out;
+	}
+
+	if ((dwidth == VME_D16) || (dwidth == VME_D32)) {
+		if ((addr + done) & 0x2) {
+			if ((count - done) < 2) {
+				fake_vmewrite8(bridge, (u8 *)(buf + done),
+						addr + done, aspace, cycle);
+				done += 1;
+				goto out;
+			} else {
+				fake_vmewrite16(bridge, (u16 *)(buf + done),
+						addr + done, aspace, cycle);
+				done += 2;
+			}
+		}
+	}
+
+	if (dwidth == VME_D32) {
+		count32 = (count - done) & ~0x3;
+		while (done < count32) {
+			fake_vmewrite32(bridge, (u32 *)(buf + done),
+					addr + done, aspace, cycle);
+			done += 4;
+		}
+	} else if (dwidth == VME_D16) {
+		count32 = (count - done) & ~0x3;
+		while (done < count32) {
+			fake_vmewrite16(bridge, (u16 *)(buf + done),
+					addr + done, aspace, cycle);
+			done += 2;
+		}
+	} else if (dwidth == VME_D8) {
+		count32 = (count - done);
+		while (done < count32) {
+			fake_vmewrite8(bridge, (u8 *)(buf + done), addr + done,
+					aspace, cycle);
+			done += 1;
+		}
+
+	}
+
+	if ((dwidth == VME_D16) || (dwidth == VME_D32)) {
+		if ((count - done) & 0x2) {
+			fake_vmewrite16(bridge, (u16 *)(buf + done),
+					addr + done, aspace, cycle);
+			done += 2;
+		}
+	}
+
+	if ((count - done) & 0x1) {
+		fake_vmewrite8(bridge, (u8 *)(buf + done), addr + done, aspace,
+				cycle);
+		done += 1;
+	}
+
+out:
+	retval = count;
+
+	spin_unlock(&image->lock);
+
+	return retval;
+}
+
+/*
+ * Perform an RMW cycle on the VME bus.
+ *
+ * Requires a previously configured master window, returns final value.
+ */
+static unsigned int fake_master_rmw(struct vme_master_resource *image,
+		unsigned int mask, unsigned int compare, unsigned int swap,
+		loff_t offset)
+{
+	u32 tmp, base;
+	u32 aspace, cycle;
+	int i;
+	struct fake_driver *bridge;
+
+	bridge = image->parent->driver_priv;
+
+	/* Find the PCI address that maps to the desired VME address */
+	i = image->number;
+
+	base = bridge->masters[i].vme_base;
+	aspace = bridge->masters[i].aspace;
+	cycle = bridge->masters[i].cycle;
+
+	/* Lock image */
+	spin_lock(&image->lock);
+
+	/* Read existing value */
+	tmp = fake_vmeread32(bridge, base + offset, aspace, cycle);
+
+	/* Perform check */
+	if ((tmp && mask) == (compare && mask)) {
+		tmp = tmp | (mask | swap);
+		tmp = tmp & (~mask | swap);
+
+		/* Write back */
+		fake_vmewrite32(bridge, &tmp, base + offset, aspace, cycle);
+	}
+
+	/* Unlock image */
+	spin_unlock(&image->lock);
+
+	return tmp;
+}
+
+/*
+ * All 4 location monitors reside at the same base - this is therefore a
+ * system wide configuration.
+ *
+ * This does not enable the LM monitor - that should be done when the first
+ * callback is attached and disabled when the last callback is removed.
+ */
+static int fake_lm_set(struct vme_lm_resource *lm, unsigned long long lm_base,
+		u32 aspace, u32 cycle)
+{
+	int i;
+	struct vme_bridge *fake_bridge;
+	struct fake_driver *bridge;
+
+	fake_bridge = lm->parent;
+
+	bridge = fake_bridge->driver_priv;
+
+	mutex_lock(&lm->mtx);
+
+	/* If we already have a callback attached, we can't move it! */
+	for (i = 0; i < lm->monitors; i++) {
+		if (bridge->lm_callback[i]) {
+			mutex_unlock(&lm->mtx);
+			pr_err("Location monitor callback attached, can't reset\n");
+			return -EBUSY;
+		}
+	}
+
+	switch (aspace) {
+	case VME_A16:
+	case VME_A24:
+	case VME_A32:
+	case VME_A64:
+		break;
+	default:
+		mutex_unlock(&lm->mtx);
+		pr_err("Invalid address space\n");
+		return -EINVAL;
+	}
+
+	bridge->lm_base = lm_base;
+	bridge->lm_aspace = aspace;
+	bridge->lm_cycle = cycle;
+
+	mutex_unlock(&lm->mtx);
+
+	return 0;
+}
+
+/* Get configuration of the callback monitor and return whether it is enabled
+ * or disabled.
+ */
+static int fake_lm_get(struct vme_lm_resource *lm,
+		unsigned long long *lm_base, u32 *aspace, u32 *cycle)
+{
+	struct fake_driver *bridge;
+
+	bridge = lm->parent->driver_priv;
+
+	mutex_lock(&lm->mtx);
+
+	*lm_base = bridge->lm_base;
+	*aspace = bridge->lm_aspace;
+	*cycle = bridge->lm_cycle;
+
+	mutex_unlock(&lm->mtx);
+
+	return bridge->lm_enabled;
+}
+
+/*
+ * Attach a callback to a specific location monitor.
+ *
+ * Callback will be passed the monitor triggered.
+ */
+static int fake_lm_attach(struct vme_lm_resource *lm, int monitor,
+		void (*callback)(void *), void *data)
+{
+	struct vme_bridge *fake_bridge;
+	struct fake_driver *bridge;
+
+	fake_bridge = lm->parent;
+
+	bridge = fake_bridge->driver_priv;
+
+	mutex_lock(&lm->mtx);
+
+	/* Ensure that the location monitor is configured - need PGM or DATA */
+	if (bridge->lm_cycle == 0) {
+		mutex_unlock(&lm->mtx);
+		pr_err("Location monitor not properly configured\n");
+		return -EINVAL;
+	}
+
+	/* Check that a callback isn't already attached */
+	if (bridge->lm_callback[monitor]) {
+		mutex_unlock(&lm->mtx);
+		pr_err("Existing callback attached\n");
+		return -EBUSY;
+	}
+
+	/* Attach callback */
+	bridge->lm_callback[monitor] = callback;
+	bridge->lm_data[monitor] = data;
+
+	/* Ensure that global Location Monitor Enable set */
+	bridge->lm_enabled = 1;
+
+	mutex_unlock(&lm->mtx);
+
+	return 0;
+}
+
+/*
+ * Detach a callback function forn a specific location monitor.
+ */
+static int fake_lm_detach(struct vme_lm_resource *lm, int monitor)
+{
+	u32 tmp;
+	int i;
+	struct fake_driver *bridge;
+
+	bridge = lm->parent->driver_priv;
+
+	mutex_lock(&lm->mtx);
+
+	/* Detach callback */
+	bridge->lm_callback[monitor] = NULL;
+	bridge->lm_data[monitor] = NULL;
+
+	/* If all location monitors disabled, disable global Location Monitor */
+	tmp = 0;
+	for (i = 0; i < lm->monitors; i++) {
+		if (bridge->lm_callback[i])
+			tmp = 1;
+	}
+
+	if (tmp == 0)
+		bridge->lm_enabled = 0;
+
+	mutex_unlock(&lm->mtx);
+
+	return 0;
+}
+
+/*
+ * Determine Geographical Addressing
+ */
+static int fake_slot_get(struct vme_bridge *fake_bridge)
+{
+	return geoid;
+}
+
+static void *fake_alloc_consistent(struct device *parent, size_t size,
+		dma_addr_t *dma)
+{
+	void *alloc = kmalloc(size, GFP_KERNEL);
+
+	if (alloc)
+		*dma = fake_ptr_to_pci(alloc);
+
+	return alloc;
+}
+
+static void fake_free_consistent(struct device *parent, size_t size,
+		void *vaddr, dma_addr_t dma)
+{
+	kfree(vaddr);
+/*
+	dma_free_coherent(parent, size, vaddr, dma);
+*/
+}
+
+/*
+ * Configure CR/CSR space
+ *
+ * Access to the CR/CSR can be configured at power-up. The location of the
+ * CR/CSR registers in the CR/CSR address space is determined by the boards
+ * Geographic address.
+ *
+ * Each board has a 512kB window, with the highest 4kB being used for the
+ * boards registers, this means there is a fix length 508kB window which must
+ * be mapped onto PCI memory.
+ */
+static int fake_crcsr_init(struct vme_bridge *fake_bridge)
+{
+	u32 vstat;
+	struct fake_driver *bridge;
+
+	bridge = fake_bridge->driver_priv;
+
+	/* Allocate mem for CR/CSR image */
+	bridge->crcsr_kernel = kzalloc(VME_CRCSR_BUF_SIZE, GFP_KERNEL);
+	bridge->crcsr_bus = fake_ptr_to_pci(bridge->crcsr_kernel);
+	if (!bridge->crcsr_kernel)
+		return -ENOMEM;
+
+	vstat = fake_slot_get(fake_bridge);
+
+	pr_info("CR/CSR Offset: %d\n", vstat);
+
+	return 0;
+}
+
+static void fake_crcsr_exit(struct vme_bridge *fake_bridge)
+{
+	struct fake_driver *bridge;
+
+	bridge = fake_bridge->driver_priv;
+
+	kfree(bridge->crcsr_kernel);
+}
+
+
+static int __init fake_init(void)
+{
+	int retval, i;
+	struct list_head *pos = NULL, *n;
+	struct vme_bridge *fake_bridge;
+	struct fake_driver *fake_device;
+	struct vme_master_resource *master_image;
+	struct vme_slave_resource *slave_image;
+	struct vme_lm_resource *lm;
+
+	/* We need a fake parent device */
+	vme_root = __root_device_register("vme", THIS_MODULE);
+
+	/* If we want to support more than one bridge at some point, we need to
+	 * dynamically allocate this so we get one per device.
+	 */
+	fake_bridge = kzalloc(sizeof(*fake_bridge), GFP_KERNEL);
+	if (!fake_bridge) {
+		retval = -ENOMEM;
+		goto err_struct;
+	}
+
+	fake_device = kzalloc(sizeof(*fake_device), GFP_KERNEL);
+	if (!fake_device) {
+		retval = -ENOMEM;
+		goto err_driver;
+	}
+
+	fake_bridge->driver_priv = fake_device;
+
+	fake_bridge->parent = vme_root;
+
+	fake_device->parent = fake_bridge;
+
+	/* Initialize wait queues & mutual exclusion flags */
+	mutex_init(&fake_device->vme_int);
+	mutex_init(&fake_bridge->irq_mtx);
+	tasklet_init(&fake_device->int_tasklet, fake_VIRQ_tasklet,
+			(unsigned long) fake_bridge);
+
+	strcpy(fake_bridge->name, driver_name);
+
+	/* Add master windows to list */
+	INIT_LIST_HEAD(&fake_bridge->master_resources);
+	for (i = 0; i < FAKE_MAX_MASTER; i++) {
+		master_image = kmalloc(sizeof(*master_image), GFP_KERNEL);
+		if (!master_image) {
+			retval = -ENOMEM;
+			goto err_master;
+		}
+		master_image->parent = fake_bridge;
+		spin_lock_init(&master_image->lock);
+		master_image->locked = 0;
+		master_image->number = i;
+		master_image->address_attr = VME_A16 | VME_A24 | VME_A32 |
+			VME_A64;
+		master_image->cycle_attr = VME_SCT | VME_BLT | VME_MBLT |
+			VME_2eVME | VME_2eSST | VME_2eSSTB | VME_2eSST160 |
+			VME_2eSST267 | VME_2eSST320 | VME_SUPER | VME_USER |
+			VME_PROG | VME_DATA;
+		master_image->width_attr = VME_D16 | VME_D32;
+		memset(&master_image->bus_resource, 0,
+				sizeof(struct resource));
+		master_image->kern_base  = NULL;
+		list_add_tail(&master_image->list,
+				&fake_bridge->master_resources);
+	}
+
+	/* Add slave windows to list */
+	INIT_LIST_HEAD(&fake_bridge->slave_resources);
+	for (i = 0; i < FAKE_MAX_SLAVE; i++) {
+		slave_image = kmalloc(sizeof(*slave_image), GFP_KERNEL);
+		if (!slave_image) {
+			retval = -ENOMEM;
+			goto err_slave;
+		}
+		slave_image->parent = fake_bridge;
+		mutex_init(&slave_image->mtx);
+		slave_image->locked = 0;
+		slave_image->number = i;
+		slave_image->address_attr = VME_A16 | VME_A24 | VME_A32 |
+			VME_A64 | VME_CRCSR | VME_USER1 | VME_USER2 |
+			VME_USER3 | VME_USER4;
+		slave_image->cycle_attr = VME_SCT | VME_BLT | VME_MBLT |
+			VME_2eVME | VME_2eSST | VME_2eSSTB | VME_2eSST160 |
+			VME_2eSST267 | VME_2eSST320 | VME_SUPER | VME_USER |
+			VME_PROG | VME_DATA;
+		list_add_tail(&slave_image->list,
+				&fake_bridge->slave_resources);
+	}
+
+	/* Add location monitor to list */
+	INIT_LIST_HEAD(&fake_bridge->lm_resources);
+	lm = kmalloc(sizeof(*lm), GFP_KERNEL);
+	if (!lm) {
+		retval = -ENOMEM;
+		goto err_lm;
+	}
+	lm->parent = fake_bridge;
+	mutex_init(&lm->mtx);
+	lm->locked = 0;
+	lm->number = 1;
+	lm->monitors = 4;
+	list_add_tail(&lm->list, &fake_bridge->lm_resources);
+
+	fake_bridge->slave_get = fake_slave_get;
+	fake_bridge->slave_set = fake_slave_set;
+	fake_bridge->master_get = fake_master_get;
+	fake_bridge->master_set = fake_master_set;
+	fake_bridge->master_read = fake_master_read;
+	fake_bridge->master_write = fake_master_write;
+	fake_bridge->master_rmw = fake_master_rmw;
+	fake_bridge->irq_set = fake_irq_set;
+	fake_bridge->irq_generate = fake_irq_generate;
+	fake_bridge->lm_set = fake_lm_set;
+	fake_bridge->lm_get = fake_lm_get;
+	fake_bridge->lm_attach = fake_lm_attach;
+	fake_bridge->lm_detach = fake_lm_detach;
+	fake_bridge->slot_get = fake_slot_get;
+	fake_bridge->alloc_consistent = fake_alloc_consistent;
+	fake_bridge->free_consistent = fake_free_consistent;
+
+	pr_info("Board is%s the VME system controller\n",
+			(geoid == 1) ? "" : " not");
+
+	pr_info("VME geographical address is set to %d\n", geoid);
+
+	retval = fake_crcsr_init(fake_bridge);
+	if (retval) {
+		pr_err("CR/CSR configuration failed.\n");
+		goto err_crcsr;
+	}
+
+	retval = vme_register_bridge(fake_bridge);
+	if (retval != 0) {
+		pr_err("Chip Registration failed.\n");
+		goto err_reg;
+	}
+
+	exit_pointer = fake_bridge;
+
+	return 0;
+
+err_reg:
+	fake_crcsr_exit(fake_bridge);
+err_crcsr:
+err_lm:
+	/* resources are stored in link list */
+	list_for_each_safe(pos, n, &fake_bridge->lm_resources) {
+		lm = list_entry(pos, struct vme_lm_resource, list);
+		list_del(pos);
+		kfree(lm);
+	}
+err_slave:
+	/* resources are stored in link list */
+	list_for_each_safe(pos, n, &fake_bridge->slave_resources) {
+		slave_image = list_entry(pos, struct vme_slave_resource, list);
+		list_del(pos);
+		kfree(slave_image);
+	}
+err_master:
+	/* resources are stored in link list */
+	list_for_each_safe(pos, n, &fake_bridge->master_resources) {
+		master_image = list_entry(pos, struct vme_master_resource,
+				list);
+		list_del(pos);
+		kfree(master_image);
+	}
+
+	kfree(fake_device);
+err_driver:
+	kfree(fake_bridge);
+err_struct:
+	return retval;
+
+}
+
+
+static void __exit fake_exit(void)
+{
+	struct list_head *pos = NULL;
+	struct list_head *tmplist;
+	struct vme_master_resource *master_image;
+	struct vme_slave_resource *slave_image;
+	int i;
+	struct vme_bridge *fake_bridge;
+	struct fake_driver *bridge;
+
+	fake_bridge = exit_pointer;
+
+	bridge = fake_bridge->driver_priv;
+
+	pr_debug("Driver is being unloaded.\n");
+
+	/*
+	 *  Shutdown all inbound and outbound windows.
+	 */
+	for (i = 0; i < FAKE_MAX_MASTER; i++)
+		bridge->masters[i].enabled = 0;
+
+	for (i = 0; i < FAKE_MAX_SLAVE; i++)
+		bridge->slaves[i].enabled = 0;
+
+	/*
+	 *  Shutdown Location monitor.
+	 */
+	bridge->lm_enabled = 0;
+
+	vme_unregister_bridge(fake_bridge);
+
+	fake_crcsr_exit(fake_bridge);
+	/* resources are stored in link list */
+	list_for_each_safe(pos, tmplist, &fake_bridge->slave_resources) {
+		slave_image = list_entry(pos, struct vme_slave_resource, list);
+		list_del(pos);
+		kfree(slave_image);
+	}
+
+	/* resources are stored in link list */
+	list_for_each_safe(pos, tmplist, &fake_bridge->master_resources) {
+		master_image = list_entry(pos, struct vme_master_resource,
+				list);
+		list_del(pos);
+		kfree(master_image);
+	}
+
+	kfree(fake_bridge->driver_priv);
+
+	kfree(fake_bridge);
+
+	root_device_unregister(vme_root);
+}
+
+
+MODULE_PARM_DESC(geoid, "Set geographical addressing");
+module_param(geoid, int, 0);
+
+MODULE_DESCRIPTION("Fake VME bridge driver");
+MODULE_LICENSE("GPL");
+
+module_init(fake_init);
+module_exit(fake_exit);
diff --git a/drivers/staging/vme_user/vme_tsi148.c b/drivers/staging/vme_user/vme_tsi148.c
new file mode 100644
index 000000000000..956476213241
--- /dev/null
+++ b/drivers/staging/vme_user/vme_tsi148.c
@@ -0,0 +1,2661 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Support for the Tundra TSI148 VME-PCI Bridge Chip
+ *
+ * Author: Martyn Welch <martyn.welch@ge.com>
+ * Copyright 2008 GE Intelligent Platforms Embedded Systems, Inc.
+ *
+ * Based on work by Tom Armistead and Ajit Prem
+ * Copyright 2004 Motorola Inc.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/pci.h>
+#include <linux/poll.h>
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/io.h>
+#include <linux/uaccess.h>
+#include <linux/byteorder/generic.h>
+
+#include "vme.h"
+#include "vme_bridge.h"
+#include "vme_tsi148.h"
+
+static int tsi148_probe(struct pci_dev *, const struct pci_device_id *);
+static void tsi148_remove(struct pci_dev *);
+
+
+/* Module parameter */
+static bool err_chk;
+static int geoid;
+
+static const char driver_name[] = "vme_tsi148";
+
+static const struct pci_device_id tsi148_ids[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_TUNDRA, PCI_DEVICE_ID_TUNDRA_TSI148) },
+	{ },
+};
+
+MODULE_DEVICE_TABLE(pci, tsi148_ids);
+
+static struct pci_driver tsi148_driver = {
+	.name = driver_name,
+	.id_table = tsi148_ids,
+	.probe = tsi148_probe,
+	.remove = tsi148_remove,
+};
+
+static void reg_join(unsigned int high, unsigned int low,
+	unsigned long long *variable)
+{
+	*variable = (unsigned long long)high << 32;
+	*variable |= (unsigned long long)low;
+}
+
+static void reg_split(unsigned long long variable, unsigned int *high,
+	unsigned int *low)
+{
+	*low = (unsigned int)variable & 0xFFFFFFFF;
+	*high = (unsigned int)(variable >> 32);
+}
+
+/*
+ * Wakes up DMA queue.
+ */
+static u32 tsi148_DMA_irqhandler(struct tsi148_driver *bridge,
+	int channel_mask)
+{
+	u32 serviced = 0;
+
+	if (channel_mask & TSI148_LCSR_INTS_DMA0S) {
+		wake_up(&bridge->dma_queue[0]);
+		serviced |= TSI148_LCSR_INTC_DMA0C;
+	}
+	if (channel_mask & TSI148_LCSR_INTS_DMA1S) {
+		wake_up(&bridge->dma_queue[1]);
+		serviced |= TSI148_LCSR_INTC_DMA1C;
+	}
+
+	return serviced;
+}
+
+/*
+ * Wake up location monitor queue
+ */
+static u32 tsi148_LM_irqhandler(struct tsi148_driver *bridge, u32 stat)
+{
+	int i;
+	u32 serviced = 0;
+
+	for (i = 0; i < 4; i++) {
+		if (stat & TSI148_LCSR_INTS_LMS[i]) {
+			/* We only enable interrupts if the callback is set */
+			bridge->lm_callback[i](bridge->lm_data[i]);
+			serviced |= TSI148_LCSR_INTC_LMC[i];
+		}
+	}
+
+	return serviced;
+}
+
+/*
+ * Wake up mail box queue.
+ *
+ * XXX This functionality is not exposed up though API.
+ */
+static u32 tsi148_MB_irqhandler(struct vme_bridge *tsi148_bridge, u32 stat)
+{
+	int i;
+	u32 val;
+	u32 serviced = 0;
+	struct tsi148_driver *bridge;
+
+	bridge = tsi148_bridge->driver_priv;
+
+	for (i = 0; i < 4; i++) {
+		if (stat & TSI148_LCSR_INTS_MBS[i]) {
+			val = ioread32be(bridge->base +	TSI148_GCSR_MBOX[i]);
+			dev_err(tsi148_bridge->parent, "VME Mailbox %d received"
+				": 0x%x\n", i, val);
+			serviced |= TSI148_LCSR_INTC_MBC[i];
+		}
+	}
+
+	return serviced;
+}
+
+/*
+ * Display error & status message when PERR (PCI) exception interrupt occurs.
+ */
+static u32 tsi148_PERR_irqhandler(struct vme_bridge *tsi148_bridge)
+{
+	struct tsi148_driver *bridge;
+
+	bridge = tsi148_bridge->driver_priv;
+
+	dev_err(tsi148_bridge->parent, "PCI Exception at address: 0x%08x:%08x, "
+		"attributes: %08x\n",
+		ioread32be(bridge->base + TSI148_LCSR_EDPAU),
+		ioread32be(bridge->base + TSI148_LCSR_EDPAL),
+		ioread32be(bridge->base + TSI148_LCSR_EDPAT));
+
+	dev_err(tsi148_bridge->parent, "PCI-X attribute reg: %08x, PCI-X split "
+		"completion reg: %08x\n",
+		ioread32be(bridge->base + TSI148_LCSR_EDPXA),
+		ioread32be(bridge->base + TSI148_LCSR_EDPXS));
+
+	iowrite32be(TSI148_LCSR_EDPAT_EDPCL, bridge->base + TSI148_LCSR_EDPAT);
+
+	return TSI148_LCSR_INTC_PERRC;
+}
+
+/*
+ * Save address and status when VME error interrupt occurs.
+ */
+static u32 tsi148_VERR_irqhandler(struct vme_bridge *tsi148_bridge)
+{
+	unsigned int error_addr_high, error_addr_low;
+	unsigned long long error_addr;
+	u32 error_attrib;
+	int error_am;
+	struct tsi148_driver *bridge;
+
+	bridge = tsi148_bridge->driver_priv;
+
+	error_addr_high = ioread32be(bridge->base + TSI148_LCSR_VEAU);
+	error_addr_low = ioread32be(bridge->base + TSI148_LCSR_VEAL);
+	error_attrib = ioread32be(bridge->base + TSI148_LCSR_VEAT);
+	error_am = (error_attrib & TSI148_LCSR_VEAT_AM_M) >> 8;
+
+	reg_join(error_addr_high, error_addr_low, &error_addr);
+
+	/* Check for exception register overflow (we have lost error data) */
+	if (error_attrib & TSI148_LCSR_VEAT_VEOF) {
+		dev_err(tsi148_bridge->parent, "VME Bus Exception Overflow "
+			"Occurred\n");
+	}
+
+	if (err_chk)
+		vme_bus_error_handler(tsi148_bridge, error_addr, error_am);
+	else
+		dev_err(tsi148_bridge->parent,
+			"VME Bus Error at address: 0x%llx, attributes: %08x\n",
+			error_addr, error_attrib);
+
+	/* Clear Status */
+	iowrite32be(TSI148_LCSR_VEAT_VESCL, bridge->base + TSI148_LCSR_VEAT);
+
+	return TSI148_LCSR_INTC_VERRC;
+}
+
+/*
+ * Wake up IACK queue.
+ */
+static u32 tsi148_IACK_irqhandler(struct tsi148_driver *bridge)
+{
+	wake_up(&bridge->iack_queue);
+
+	return TSI148_LCSR_INTC_IACKC;
+}
+
+/*
+ * Calling VME bus interrupt callback if provided.
+ */
+static u32 tsi148_VIRQ_irqhandler(struct vme_bridge *tsi148_bridge,
+	u32 stat)
+{
+	int vec, i, serviced = 0;
+	struct tsi148_driver *bridge;
+
+	bridge = tsi148_bridge->driver_priv;
+
+	for (i = 7; i > 0; i--) {
+		if (stat & (1 << i)) {
+			/*
+			 * Note: Even though the registers are defined as
+			 * 32-bits in the spec, we only want to issue 8-bit
+			 * IACK cycles on the bus, read from offset 3.
+			 */
+			vec = ioread8(bridge->base + TSI148_LCSR_VIACK[i] + 3);
+
+			vme_irq_handler(tsi148_bridge, i, vec);
+
+			serviced |= (1 << i);
+		}
+	}
+
+	return serviced;
+}
+
+/*
+ * Top level interrupt handler.  Clears appropriate interrupt status bits and
+ * then calls appropriate sub handler(s).
+ */
+static irqreturn_t tsi148_irqhandler(int irq, void *ptr)
+{
+	u32 stat, enable, serviced = 0;
+	struct vme_bridge *tsi148_bridge;
+	struct tsi148_driver *bridge;
+
+	tsi148_bridge = ptr;
+
+	bridge = tsi148_bridge->driver_priv;
+
+	/* Determine which interrupts are unmasked and set */
+	enable = ioread32be(bridge->base + TSI148_LCSR_INTEO);
+	stat = ioread32be(bridge->base + TSI148_LCSR_INTS);
+
+	/* Only look at unmasked interrupts */
+	stat &= enable;
+
+	if (unlikely(!stat))
+		return IRQ_NONE;
+
+	/* Call subhandlers as appropriate */
+	/* DMA irqs */
+	if (stat & (TSI148_LCSR_INTS_DMA1S | TSI148_LCSR_INTS_DMA0S))
+		serviced |= tsi148_DMA_irqhandler(bridge, stat);
+
+	/* Location monitor irqs */
+	if (stat & (TSI148_LCSR_INTS_LM3S | TSI148_LCSR_INTS_LM2S |
+			TSI148_LCSR_INTS_LM1S | TSI148_LCSR_INTS_LM0S))
+		serviced |= tsi148_LM_irqhandler(bridge, stat);
+
+	/* Mail box irqs */
+	if (stat & (TSI148_LCSR_INTS_MB3S | TSI148_LCSR_INTS_MB2S |
+			TSI148_LCSR_INTS_MB1S | TSI148_LCSR_INTS_MB0S))
+		serviced |= tsi148_MB_irqhandler(tsi148_bridge, stat);
+
+	/* PCI bus error */
+	if (stat & TSI148_LCSR_INTS_PERRS)
+		serviced |= tsi148_PERR_irqhandler(tsi148_bridge);
+
+	/* VME bus error */
+	if (stat & TSI148_LCSR_INTS_VERRS)
+		serviced |= tsi148_VERR_irqhandler(tsi148_bridge);
+
+	/* IACK irq */
+	if (stat & TSI148_LCSR_INTS_IACKS)
+		serviced |= tsi148_IACK_irqhandler(bridge);
+
+	/* VME bus irqs */
+	if (stat & (TSI148_LCSR_INTS_IRQ7S | TSI148_LCSR_INTS_IRQ6S |
+			TSI148_LCSR_INTS_IRQ5S | TSI148_LCSR_INTS_IRQ4S |
+			TSI148_LCSR_INTS_IRQ3S | TSI148_LCSR_INTS_IRQ2S |
+			TSI148_LCSR_INTS_IRQ1S))
+		serviced |= tsi148_VIRQ_irqhandler(tsi148_bridge, stat);
+
+	/* Clear serviced interrupts */
+	iowrite32be(serviced, bridge->base + TSI148_LCSR_INTC);
+
+	return IRQ_HANDLED;
+}
+
+static int tsi148_irq_init(struct vme_bridge *tsi148_bridge)
+{
+	int result;
+	unsigned int tmp;
+	struct pci_dev *pdev;
+	struct tsi148_driver *bridge;
+
+	pdev = to_pci_dev(tsi148_bridge->parent);
+
+	bridge = tsi148_bridge->driver_priv;
+
+	result = request_irq(pdev->irq,
+			     tsi148_irqhandler,
+			     IRQF_SHARED,
+			     driver_name, tsi148_bridge);
+	if (result) {
+		dev_err(tsi148_bridge->parent, "Can't get assigned pci irq "
+			"vector %02X\n", pdev->irq);
+		return result;
+	}
+
+	/* Enable and unmask interrupts */
+	tmp = TSI148_LCSR_INTEO_DMA1EO | TSI148_LCSR_INTEO_DMA0EO |
+		TSI148_LCSR_INTEO_MB3EO | TSI148_LCSR_INTEO_MB2EO |
+		TSI148_LCSR_INTEO_MB1EO | TSI148_LCSR_INTEO_MB0EO |
+		TSI148_LCSR_INTEO_PERREO | TSI148_LCSR_INTEO_VERREO |
+		TSI148_LCSR_INTEO_IACKEO;
+
+	/* This leaves the following interrupts masked.
+	 * TSI148_LCSR_INTEO_VIEEO
+	 * TSI148_LCSR_INTEO_SYSFLEO
+	 * TSI148_LCSR_INTEO_ACFLEO
+	 */
+
+	/* Don't enable Location Monitor interrupts here - they will be
+	 * enabled when the location monitors are properly configured and
+	 * a callback has been attached.
+	 * TSI148_LCSR_INTEO_LM0EO
+	 * TSI148_LCSR_INTEO_LM1EO
+	 * TSI148_LCSR_INTEO_LM2EO
+	 * TSI148_LCSR_INTEO_LM3EO
+	 */
+
+	/* Don't enable VME interrupts until we add a handler, else the board
+	 * will respond to it and we don't want that unless it knows how to
+	 * properly deal with it.
+	 * TSI148_LCSR_INTEO_IRQ7EO
+	 * TSI148_LCSR_INTEO_IRQ6EO
+	 * TSI148_LCSR_INTEO_IRQ5EO
+	 * TSI148_LCSR_INTEO_IRQ4EO
+	 * TSI148_LCSR_INTEO_IRQ3EO
+	 * TSI148_LCSR_INTEO_IRQ2EO
+	 * TSI148_LCSR_INTEO_IRQ1EO
+	 */
+
+	iowrite32be(tmp, bridge->base + TSI148_LCSR_INTEO);
+	iowrite32be(tmp, bridge->base + TSI148_LCSR_INTEN);
+
+	return 0;
+}
+
+static void tsi148_irq_exit(struct vme_bridge *tsi148_bridge,
+	struct pci_dev *pdev)
+{
+	struct tsi148_driver *bridge = tsi148_bridge->driver_priv;
+
+	/* Turn off interrupts */
+	iowrite32be(0x0, bridge->base + TSI148_LCSR_INTEO);
+	iowrite32be(0x0, bridge->base + TSI148_LCSR_INTEN);
+
+	/* Clear all interrupts */
+	iowrite32be(0xFFFFFFFF, bridge->base + TSI148_LCSR_INTC);
+
+	/* Detach interrupt handler */
+	free_irq(pdev->irq, tsi148_bridge);
+}
+
+/*
+ * Check to see if an IACk has been received, return true (1) or false (0).
+ */
+static int tsi148_iack_received(struct tsi148_driver *bridge)
+{
+	u32 tmp;
+
+	tmp = ioread32be(bridge->base + TSI148_LCSR_VICR);
+
+	if (tmp & TSI148_LCSR_VICR_IRQS)
+		return 0;
+	else
+		return 1;
+}
+
+/*
+ * Configure VME interrupt
+ */
+static void tsi148_irq_set(struct vme_bridge *tsi148_bridge, int level,
+	int state, int sync)
+{
+	struct pci_dev *pdev;
+	u32 tmp;
+	struct tsi148_driver *bridge;
+
+	bridge = tsi148_bridge->driver_priv;
+
+	/* We need to do the ordering differently for enabling and disabling */
+	if (state == 0) {
+		tmp = ioread32be(bridge->base + TSI148_LCSR_INTEN);
+		tmp &= ~TSI148_LCSR_INTEN_IRQEN[level - 1];
+		iowrite32be(tmp, bridge->base + TSI148_LCSR_INTEN);
+
+		tmp = ioread32be(bridge->base + TSI148_LCSR_INTEO);
+		tmp &= ~TSI148_LCSR_INTEO_IRQEO[level - 1];
+		iowrite32be(tmp, bridge->base + TSI148_LCSR_INTEO);
+
+		if (sync != 0) {
+			pdev = to_pci_dev(tsi148_bridge->parent);
+			synchronize_irq(pdev->irq);
+		}
+	} else {
+		tmp = ioread32be(bridge->base + TSI148_LCSR_INTEO);
+		tmp |= TSI148_LCSR_INTEO_IRQEO[level - 1];
+		iowrite32be(tmp, bridge->base + TSI148_LCSR_INTEO);
+
+		tmp = ioread32be(bridge->base + TSI148_LCSR_INTEN);
+		tmp |= TSI148_LCSR_INTEN_IRQEN[level - 1];
+		iowrite32be(tmp, bridge->base + TSI148_LCSR_INTEN);
+	}
+}
+
+/*
+ * Generate a VME bus interrupt at the requested level & vector. Wait for
+ * interrupt to be acked.
+ */
+static int tsi148_irq_generate(struct vme_bridge *tsi148_bridge, int level,
+	int statid)
+{
+	u32 tmp;
+	struct tsi148_driver *bridge;
+
+	bridge = tsi148_bridge->driver_priv;
+
+	mutex_lock(&bridge->vme_int);
+
+	/* Read VICR register */
+	tmp = ioread32be(bridge->base + TSI148_LCSR_VICR);
+
+	/* Set Status/ID */
+	tmp = (tmp & ~TSI148_LCSR_VICR_STID_M) |
+		(statid & TSI148_LCSR_VICR_STID_M);
+	iowrite32be(tmp, bridge->base + TSI148_LCSR_VICR);
+
+	/* Assert VMEbus IRQ */
+	tmp = tmp | TSI148_LCSR_VICR_IRQL[level];
+	iowrite32be(tmp, bridge->base + TSI148_LCSR_VICR);
+
+	/* XXX Consider implementing a timeout? */
+	wait_event_interruptible(bridge->iack_queue,
+		tsi148_iack_received(bridge));
+
+	mutex_unlock(&bridge->vme_int);
+
+	return 0;
+}
+
+/*
+ * Initialize a slave window with the requested attributes.
+ */
+static int tsi148_slave_set(struct vme_slave_resource *image, int enabled,
+	unsigned long long vme_base, unsigned long long size,
+	dma_addr_t pci_base, u32 aspace, u32 cycle)
+{
+	unsigned int i, addr = 0, granularity = 0;
+	unsigned int temp_ctl = 0;
+	unsigned int vme_base_low, vme_base_high;
+	unsigned int vme_bound_low, vme_bound_high;
+	unsigned int pci_offset_low, pci_offset_high;
+	unsigned long long vme_bound, pci_offset;
+	struct vme_bridge *tsi148_bridge;
+	struct tsi148_driver *bridge;
+
+	tsi148_bridge = image->parent;
+	bridge = tsi148_bridge->driver_priv;
+
+	i = image->number;
+
+	switch (aspace) {
+	case VME_A16:
+		granularity = 0x10;
+		addr |= TSI148_LCSR_ITAT_AS_A16;
+		break;
+	case VME_A24:
+		granularity = 0x1000;
+		addr |= TSI148_LCSR_ITAT_AS_A24;
+		break;
+	case VME_A32:
+		granularity = 0x10000;
+		addr |= TSI148_LCSR_ITAT_AS_A32;
+		break;
+	case VME_A64:
+		granularity = 0x10000;
+		addr |= TSI148_LCSR_ITAT_AS_A64;
+		break;
+	default:
+		dev_err(tsi148_bridge->parent, "Invalid address space\n");
+		return -EINVAL;
+	}
+
+	/* Convert 64-bit variables to 2x 32-bit variables */
+	reg_split(vme_base, &vme_base_high, &vme_base_low);
+
+	/*
+	 * Bound address is a valid address for the window, adjust
+	 * accordingly
+	 */
+	vme_bound = vme_base + size - granularity;
+	reg_split(vme_bound, &vme_bound_high, &vme_bound_low);
+	pci_offset = (unsigned long long)pci_base - vme_base;
+	reg_split(pci_offset, &pci_offset_high, &pci_offset_low);
+
+	if (vme_base_low & (granularity - 1)) {
+		dev_err(tsi148_bridge->parent, "Invalid VME base alignment\n");
+		return -EINVAL;
+	}
+	if (vme_bound_low & (granularity - 1)) {
+		dev_err(tsi148_bridge->parent, "Invalid VME bound alignment\n");
+		return -EINVAL;
+	}
+	if (pci_offset_low & (granularity - 1)) {
+		dev_err(tsi148_bridge->parent, "Invalid PCI Offset "
+			"alignment\n");
+		return -EINVAL;
+	}
+
+	/*  Disable while we are mucking around */
+	temp_ctl = ioread32be(bridge->base + TSI148_LCSR_IT[i] +
+		TSI148_LCSR_OFFSET_ITAT);
+	temp_ctl &= ~TSI148_LCSR_ITAT_EN;
+	iowrite32be(temp_ctl, bridge->base + TSI148_LCSR_IT[i] +
+		TSI148_LCSR_OFFSET_ITAT);
+
+	/* Setup mapping */
+	iowrite32be(vme_base_high, bridge->base + TSI148_LCSR_IT[i] +
+		TSI148_LCSR_OFFSET_ITSAU);
+	iowrite32be(vme_base_low, bridge->base + TSI148_LCSR_IT[i] +
+		TSI148_LCSR_OFFSET_ITSAL);
+	iowrite32be(vme_bound_high, bridge->base + TSI148_LCSR_IT[i] +
+		TSI148_LCSR_OFFSET_ITEAU);
+	iowrite32be(vme_bound_low, bridge->base + TSI148_LCSR_IT[i] +
+		TSI148_LCSR_OFFSET_ITEAL);
+	iowrite32be(pci_offset_high, bridge->base + TSI148_LCSR_IT[i] +
+		TSI148_LCSR_OFFSET_ITOFU);
+	iowrite32be(pci_offset_low, bridge->base + TSI148_LCSR_IT[i] +
+		TSI148_LCSR_OFFSET_ITOFL);
+
+	/* Setup 2eSST speeds */
+	temp_ctl &= ~TSI148_LCSR_ITAT_2eSSTM_M;
+	switch (cycle & (VME_2eSST160 | VME_2eSST267 | VME_2eSST320)) {
+	case VME_2eSST160:
+		temp_ctl |= TSI148_LCSR_ITAT_2eSSTM_160;
+		break;
+	case VME_2eSST267:
+		temp_ctl |= TSI148_LCSR_ITAT_2eSSTM_267;
+		break;
+	case VME_2eSST320:
+		temp_ctl |= TSI148_LCSR_ITAT_2eSSTM_320;
+		break;
+	}
+
+	/* Setup cycle types */
+	temp_ctl &= ~(0x1F << 7);
+	if (cycle & VME_BLT)
+		temp_ctl |= TSI148_LCSR_ITAT_BLT;
+	if (cycle & VME_MBLT)
+		temp_ctl |= TSI148_LCSR_ITAT_MBLT;
+	if (cycle & VME_2eVME)
+		temp_ctl |= TSI148_LCSR_ITAT_2eVME;
+	if (cycle & VME_2eSST)
+		temp_ctl |= TSI148_LCSR_ITAT_2eSST;
+	if (cycle & VME_2eSSTB)
+		temp_ctl |= TSI148_LCSR_ITAT_2eSSTB;
+
+	/* Setup address space */
+	temp_ctl &= ~TSI148_LCSR_ITAT_AS_M;
+	temp_ctl |= addr;
+
+	temp_ctl &= ~0xF;
+	if (cycle & VME_SUPER)
+		temp_ctl |= TSI148_LCSR_ITAT_SUPR ;
+	if (cycle & VME_USER)
+		temp_ctl |= TSI148_LCSR_ITAT_NPRIV;
+	if (cycle & VME_PROG)
+		temp_ctl |= TSI148_LCSR_ITAT_PGM;
+	if (cycle & VME_DATA)
+		temp_ctl |= TSI148_LCSR_ITAT_DATA;
+
+	/* Write ctl reg without enable */
+	iowrite32be(temp_ctl, bridge->base + TSI148_LCSR_IT[i] +
+		TSI148_LCSR_OFFSET_ITAT);
+
+	if (enabled)
+		temp_ctl |= TSI148_LCSR_ITAT_EN;
+
+	iowrite32be(temp_ctl, bridge->base + TSI148_LCSR_IT[i] +
+		TSI148_LCSR_OFFSET_ITAT);
+
+	return 0;
+}
+
+/*
+ * Get slave window configuration.
+ */
+static int tsi148_slave_get(struct vme_slave_resource *image, int *enabled,
+	unsigned long long *vme_base, unsigned long long *size,
+	dma_addr_t *pci_base, u32 *aspace, u32 *cycle)
+{
+	unsigned int i, granularity = 0, ctl = 0;
+	unsigned int vme_base_low, vme_base_high;
+	unsigned int vme_bound_low, vme_bound_high;
+	unsigned int pci_offset_low, pci_offset_high;
+	unsigned long long vme_bound, pci_offset;
+	struct tsi148_driver *bridge;
+
+	bridge = image->parent->driver_priv;
+
+	i = image->number;
+
+	/* Read registers */
+	ctl = ioread32be(bridge->base + TSI148_LCSR_IT[i] +
+		TSI148_LCSR_OFFSET_ITAT);
+
+	vme_base_high = ioread32be(bridge->base + TSI148_LCSR_IT[i] +
+		TSI148_LCSR_OFFSET_ITSAU);
+	vme_base_low = ioread32be(bridge->base + TSI148_LCSR_IT[i] +
+		TSI148_LCSR_OFFSET_ITSAL);
+	vme_bound_high = ioread32be(bridge->base + TSI148_LCSR_IT[i] +
+		TSI148_LCSR_OFFSET_ITEAU);
+	vme_bound_low = ioread32be(bridge->base + TSI148_LCSR_IT[i] +
+		TSI148_LCSR_OFFSET_ITEAL);
+	pci_offset_high = ioread32be(bridge->base + TSI148_LCSR_IT[i] +
+		TSI148_LCSR_OFFSET_ITOFU);
+	pci_offset_low = ioread32be(bridge->base + TSI148_LCSR_IT[i] +
+		TSI148_LCSR_OFFSET_ITOFL);
+
+	/* Convert 64-bit variables to 2x 32-bit variables */
+	reg_join(vme_base_high, vme_base_low, vme_base);
+	reg_join(vme_bound_high, vme_bound_low, &vme_bound);
+	reg_join(pci_offset_high, pci_offset_low, &pci_offset);
+
+	*pci_base = (dma_addr_t)(*vme_base + pci_offset);
+
+	*enabled = 0;
+	*aspace = 0;
+	*cycle = 0;
+
+	if (ctl & TSI148_LCSR_ITAT_EN)
+		*enabled = 1;
+
+	if ((ctl & TSI148_LCSR_ITAT_AS_M) == TSI148_LCSR_ITAT_AS_A16) {
+		granularity = 0x10;
+		*aspace |= VME_A16;
+	}
+	if ((ctl & TSI148_LCSR_ITAT_AS_M) == TSI148_LCSR_ITAT_AS_A24) {
+		granularity = 0x1000;
+		*aspace |= VME_A24;
+	}
+	if ((ctl & TSI148_LCSR_ITAT_AS_M) == TSI148_LCSR_ITAT_AS_A32) {
+		granularity = 0x10000;
+		*aspace |= VME_A32;
+	}
+	if ((ctl & TSI148_LCSR_ITAT_AS_M) == TSI148_LCSR_ITAT_AS_A64) {
+		granularity = 0x10000;
+		*aspace |= VME_A64;
+	}
+
+	/* Need granularity before we set the size */
+	*size = (unsigned long long)((vme_bound - *vme_base) + granularity);
+
+
+	if ((ctl & TSI148_LCSR_ITAT_2eSSTM_M) == TSI148_LCSR_ITAT_2eSSTM_160)
+		*cycle |= VME_2eSST160;
+	if ((ctl & TSI148_LCSR_ITAT_2eSSTM_M) == TSI148_LCSR_ITAT_2eSSTM_267)
+		*cycle |= VME_2eSST267;
+	if ((ctl & TSI148_LCSR_ITAT_2eSSTM_M) == TSI148_LCSR_ITAT_2eSSTM_320)
+		*cycle |= VME_2eSST320;
+
+	if (ctl & TSI148_LCSR_ITAT_BLT)
+		*cycle |= VME_BLT;
+	if (ctl & TSI148_LCSR_ITAT_MBLT)
+		*cycle |= VME_MBLT;
+	if (ctl & TSI148_LCSR_ITAT_2eVME)
+		*cycle |= VME_2eVME;
+	if (ctl & TSI148_LCSR_ITAT_2eSST)
+		*cycle |= VME_2eSST;
+	if (ctl & TSI148_LCSR_ITAT_2eSSTB)
+		*cycle |= VME_2eSSTB;
+
+	if (ctl & TSI148_LCSR_ITAT_SUPR)
+		*cycle |= VME_SUPER;
+	if (ctl & TSI148_LCSR_ITAT_NPRIV)
+		*cycle |= VME_USER;
+	if (ctl & TSI148_LCSR_ITAT_PGM)
+		*cycle |= VME_PROG;
+	if (ctl & TSI148_LCSR_ITAT_DATA)
+		*cycle |= VME_DATA;
+
+	return 0;
+}
+
+/*
+ * Allocate and map PCI Resource
+ */
+static int tsi148_alloc_resource(struct vme_master_resource *image,
+	unsigned long long size)
+{
+	unsigned long long existing_size;
+	int retval = 0;
+	struct pci_dev *pdev;
+	struct vme_bridge *tsi148_bridge;
+
+	tsi148_bridge = image->parent;
+
+	pdev = to_pci_dev(tsi148_bridge->parent);
+
+	existing_size = (unsigned long long)(image->bus_resource.end -
+		image->bus_resource.start);
+
+	/* If the existing size is OK, return */
+	if ((size != 0) && (existing_size == (size - 1)))
+		return 0;
+
+	if (existing_size != 0) {
+		iounmap(image->kern_base);
+		image->kern_base = NULL;
+		kfree(image->bus_resource.name);
+		release_resource(&image->bus_resource);
+		memset(&image->bus_resource, 0, sizeof(image->bus_resource));
+	}
+
+	/* Exit here if size is zero */
+	if (size == 0)
+		return 0;
+
+	if (!image->bus_resource.name) {
+		image->bus_resource.name = kmalloc(VMENAMSIZ+3, GFP_ATOMIC);
+		if (!image->bus_resource.name) {
+			retval = -ENOMEM;
+			goto err_name;
+		}
+	}
+
+	sprintf((char *)image->bus_resource.name, "%s.%d", tsi148_bridge->name,
+		image->number);
+
+	image->bus_resource.start = 0;
+	image->bus_resource.end = (unsigned long)size;
+	image->bus_resource.flags = IORESOURCE_MEM;
+
+	retval = pci_bus_alloc_resource(pdev->bus,
+		&image->bus_resource, size, 0x10000, PCIBIOS_MIN_MEM,
+		0, NULL, NULL);
+	if (retval) {
+		dev_err(tsi148_bridge->parent, "Failed to allocate mem "
+			"resource for window %d size 0x%lx start 0x%lx\n",
+			image->number, (unsigned long)size,
+			(unsigned long)image->bus_resource.start);
+		goto err_resource;
+	}
+
+	image->kern_base = ioremap(
+		image->bus_resource.start, size);
+	if (!image->kern_base) {
+		dev_err(tsi148_bridge->parent, "Failed to remap resource\n");
+		retval = -ENOMEM;
+		goto err_remap;
+	}
+
+	return 0;
+
+err_remap:
+	release_resource(&image->bus_resource);
+err_resource:
+	kfree(image->bus_resource.name);
+	memset(&image->bus_resource, 0, sizeof(image->bus_resource));
+err_name:
+	return retval;
+}
+
+/*
+ * Free and unmap PCI Resource
+ */
+static void tsi148_free_resource(struct vme_master_resource *image)
+{
+	iounmap(image->kern_base);
+	image->kern_base = NULL;
+	release_resource(&image->bus_resource);
+	kfree(image->bus_resource.name);
+	memset(&image->bus_resource, 0, sizeof(image->bus_resource));
+}
+
+/*
+ * Set the attributes of an outbound window.
+ */
+static int tsi148_master_set(struct vme_master_resource *image, int enabled,
+	unsigned long long vme_base, unsigned long long size, u32 aspace,
+	u32 cycle, u32 dwidth)
+{
+	int retval = 0;
+	unsigned int i;
+	unsigned int temp_ctl = 0;
+	unsigned int pci_base_low, pci_base_high;
+	unsigned int pci_bound_low, pci_bound_high;
+	unsigned int vme_offset_low, vme_offset_high;
+	unsigned long long pci_bound, vme_offset, pci_base;
+	struct vme_bridge *tsi148_bridge;
+	struct tsi148_driver *bridge;
+	struct pci_bus_region region;
+	struct pci_dev *pdev;
+
+	tsi148_bridge = image->parent;
+
+	bridge = tsi148_bridge->driver_priv;
+
+	pdev = to_pci_dev(tsi148_bridge->parent);
+
+	/* Verify input data */
+	if (vme_base & 0xFFFF) {
+		dev_err(tsi148_bridge->parent, "Invalid VME Window "
+			"alignment\n");
+		retval = -EINVAL;
+		goto err_window;
+	}
+
+	if ((size == 0) && (enabled != 0)) {
+		dev_err(tsi148_bridge->parent, "Size must be non-zero for "
+			"enabled windows\n");
+		retval = -EINVAL;
+		goto err_window;
+	}
+
+	spin_lock(&image->lock);
+
+	/* Let's allocate the resource here rather than further up the stack as
+	 * it avoids pushing loads of bus dependent stuff up the stack. If size
+	 * is zero, any existing resource will be freed.
+	 */
+	retval = tsi148_alloc_resource(image, size);
+	if (retval) {
+		spin_unlock(&image->lock);
+		dev_err(tsi148_bridge->parent, "Unable to allocate memory for "
+			"resource\n");
+		goto err_res;
+	}
+
+	if (size == 0) {
+		pci_base = 0;
+		pci_bound = 0;
+		vme_offset = 0;
+	} else {
+		pcibios_resource_to_bus(pdev->bus, &region,
+					&image->bus_resource);
+		pci_base = region.start;
+
+		/*
+		 * Bound address is a valid address for the window, adjust
+		 * according to window granularity.
+		 */
+		pci_bound = pci_base + (size - 0x10000);
+		vme_offset = vme_base - pci_base;
+	}
+
+	/* Convert 64-bit variables to 2x 32-bit variables */
+	reg_split(pci_base, &pci_base_high, &pci_base_low);
+	reg_split(pci_bound, &pci_bound_high, &pci_bound_low);
+	reg_split(vme_offset, &vme_offset_high, &vme_offset_low);
+
+	if (pci_base_low & 0xFFFF) {
+		spin_unlock(&image->lock);
+		dev_err(tsi148_bridge->parent, "Invalid PCI base alignment\n");
+		retval = -EINVAL;
+		goto err_gran;
+	}
+	if (pci_bound_low & 0xFFFF) {
+		spin_unlock(&image->lock);
+		dev_err(tsi148_bridge->parent, "Invalid PCI bound alignment\n");
+		retval = -EINVAL;
+		goto err_gran;
+	}
+	if (vme_offset_low & 0xFFFF) {
+		spin_unlock(&image->lock);
+		dev_err(tsi148_bridge->parent, "Invalid VME Offset "
+			"alignment\n");
+		retval = -EINVAL;
+		goto err_gran;
+	}
+
+	i = image->number;
+
+	/* Disable while we are mucking around */
+	temp_ctl = ioread32be(bridge->base + TSI148_LCSR_OT[i] +
+		TSI148_LCSR_OFFSET_OTAT);
+	temp_ctl &= ~TSI148_LCSR_OTAT_EN;
+	iowrite32be(temp_ctl, bridge->base + TSI148_LCSR_OT[i] +
+		TSI148_LCSR_OFFSET_OTAT);
+
+	/* Setup 2eSST speeds */
+	temp_ctl &= ~TSI148_LCSR_OTAT_2eSSTM_M;
+	switch (cycle & (VME_2eSST160 | VME_2eSST267 | VME_2eSST320)) {
+	case VME_2eSST160:
+		temp_ctl |= TSI148_LCSR_OTAT_2eSSTM_160;
+		break;
+	case VME_2eSST267:
+		temp_ctl |= TSI148_LCSR_OTAT_2eSSTM_267;
+		break;
+	case VME_2eSST320:
+		temp_ctl |= TSI148_LCSR_OTAT_2eSSTM_320;
+		break;
+	}
+
+	/* Setup cycle types */
+	if (cycle & VME_BLT) {
+		temp_ctl &= ~TSI148_LCSR_OTAT_TM_M;
+		temp_ctl |= TSI148_LCSR_OTAT_TM_BLT;
+	}
+	if (cycle & VME_MBLT) {
+		temp_ctl &= ~TSI148_LCSR_OTAT_TM_M;
+		temp_ctl |= TSI148_LCSR_OTAT_TM_MBLT;
+	}
+	if (cycle & VME_2eVME) {
+		temp_ctl &= ~TSI148_LCSR_OTAT_TM_M;
+		temp_ctl |= TSI148_LCSR_OTAT_TM_2eVME;
+	}
+	if (cycle & VME_2eSST) {
+		temp_ctl &= ~TSI148_LCSR_OTAT_TM_M;
+		temp_ctl |= TSI148_LCSR_OTAT_TM_2eSST;
+	}
+	if (cycle & VME_2eSSTB) {
+		dev_warn(tsi148_bridge->parent, "Currently not setting "
+			"Broadcast Select Registers\n");
+		temp_ctl &= ~TSI148_LCSR_OTAT_TM_M;
+		temp_ctl |= TSI148_LCSR_OTAT_TM_2eSSTB;
+	}
+
+	/* Setup data width */
+	temp_ctl &= ~TSI148_LCSR_OTAT_DBW_M;
+	switch (dwidth) {
+	case VME_D16:
+		temp_ctl |= TSI148_LCSR_OTAT_DBW_16;
+		break;
+	case VME_D32:
+		temp_ctl |= TSI148_LCSR_OTAT_DBW_32;
+		break;
+	default:
+		spin_unlock(&image->lock);
+		dev_err(tsi148_bridge->parent, "Invalid data width\n");
+		retval = -EINVAL;
+		goto err_dwidth;
+	}
+
+	/* Setup address space */
+	temp_ctl &= ~TSI148_LCSR_OTAT_AMODE_M;
+	switch (aspace) {
+	case VME_A16:
+		temp_ctl |= TSI148_LCSR_OTAT_AMODE_A16;
+		break;
+	case VME_A24:
+		temp_ctl |= TSI148_LCSR_OTAT_AMODE_A24;
+		break;
+	case VME_A32:
+		temp_ctl |= TSI148_LCSR_OTAT_AMODE_A32;
+		break;
+	case VME_A64:
+		temp_ctl |= TSI148_LCSR_OTAT_AMODE_A64;
+		break;
+	case VME_CRCSR:
+		temp_ctl |= TSI148_LCSR_OTAT_AMODE_CRCSR;
+		break;
+	case VME_USER1:
+		temp_ctl |= TSI148_LCSR_OTAT_AMODE_USER1;
+		break;
+	case VME_USER2:
+		temp_ctl |= TSI148_LCSR_OTAT_AMODE_USER2;
+		break;
+	case VME_USER3:
+		temp_ctl |= TSI148_LCSR_OTAT_AMODE_USER3;
+		break;
+	case VME_USER4:
+		temp_ctl |= TSI148_LCSR_OTAT_AMODE_USER4;
+		break;
+	default:
+		spin_unlock(&image->lock);
+		dev_err(tsi148_bridge->parent, "Invalid address space\n");
+		retval = -EINVAL;
+		goto err_aspace;
+	}
+
+	temp_ctl &= ~(3<<4);
+	if (cycle & VME_SUPER)
+		temp_ctl |= TSI148_LCSR_OTAT_SUP;
+	if (cycle & VME_PROG)
+		temp_ctl |= TSI148_LCSR_OTAT_PGM;
+
+	/* Setup mapping */
+	iowrite32be(pci_base_high, bridge->base + TSI148_LCSR_OT[i] +
+		TSI148_LCSR_OFFSET_OTSAU);
+	iowrite32be(pci_base_low, bridge->base + TSI148_LCSR_OT[i] +
+		TSI148_LCSR_OFFSET_OTSAL);
+	iowrite32be(pci_bound_high, bridge->base + TSI148_LCSR_OT[i] +
+		TSI148_LCSR_OFFSET_OTEAU);
+	iowrite32be(pci_bound_low, bridge->base + TSI148_LCSR_OT[i] +
+		TSI148_LCSR_OFFSET_OTEAL);
+	iowrite32be(vme_offset_high, bridge->base + TSI148_LCSR_OT[i] +
+		TSI148_LCSR_OFFSET_OTOFU);
+	iowrite32be(vme_offset_low, bridge->base + TSI148_LCSR_OT[i] +
+		TSI148_LCSR_OFFSET_OTOFL);
+
+	/* Write ctl reg without enable */
+	iowrite32be(temp_ctl, bridge->base + TSI148_LCSR_OT[i] +
+		TSI148_LCSR_OFFSET_OTAT);
+
+	if (enabled)
+		temp_ctl |= TSI148_LCSR_OTAT_EN;
+
+	iowrite32be(temp_ctl, bridge->base + TSI148_LCSR_OT[i] +
+		TSI148_LCSR_OFFSET_OTAT);
+
+	spin_unlock(&image->lock);
+	return 0;
+
+err_aspace:
+err_dwidth:
+err_gran:
+	tsi148_free_resource(image);
+err_res:
+err_window:
+	return retval;
+
+}
+
+/*
+ * Set the attributes of an outbound window.
+ *
+ * XXX Not parsing prefetch information.
+ */
+static int __tsi148_master_get(struct vme_master_resource *image, int *enabled,
+	unsigned long long *vme_base, unsigned long long *size, u32 *aspace,
+	u32 *cycle, u32 *dwidth)
+{
+	unsigned int i, ctl;
+	unsigned int pci_base_low, pci_base_high;
+	unsigned int pci_bound_low, pci_bound_high;
+	unsigned int vme_offset_low, vme_offset_high;
+
+	unsigned long long pci_base, pci_bound, vme_offset;
+	struct tsi148_driver *bridge;
+
+	bridge = image->parent->driver_priv;
+
+	i = image->number;
+
+	ctl = ioread32be(bridge->base + TSI148_LCSR_OT[i] +
+		TSI148_LCSR_OFFSET_OTAT);
+
+	pci_base_high = ioread32be(bridge->base + TSI148_LCSR_OT[i] +
+		TSI148_LCSR_OFFSET_OTSAU);
+	pci_base_low = ioread32be(bridge->base + TSI148_LCSR_OT[i] +
+		TSI148_LCSR_OFFSET_OTSAL);
+	pci_bound_high = ioread32be(bridge->base + TSI148_LCSR_OT[i] +
+		TSI148_LCSR_OFFSET_OTEAU);
+	pci_bound_low = ioread32be(bridge->base + TSI148_LCSR_OT[i] +
+		TSI148_LCSR_OFFSET_OTEAL);
+	vme_offset_high = ioread32be(bridge->base + TSI148_LCSR_OT[i] +
+		TSI148_LCSR_OFFSET_OTOFU);
+	vme_offset_low = ioread32be(bridge->base + TSI148_LCSR_OT[i] +
+		TSI148_LCSR_OFFSET_OTOFL);
+
+	/* Convert 64-bit variables to 2x 32-bit variables */
+	reg_join(pci_base_high, pci_base_low, &pci_base);
+	reg_join(pci_bound_high, pci_bound_low, &pci_bound);
+	reg_join(vme_offset_high, vme_offset_low, &vme_offset);
+
+	*vme_base = pci_base + vme_offset;
+	*size = (unsigned long long)(pci_bound - pci_base) + 0x10000;
+
+	*enabled = 0;
+	*aspace = 0;
+	*cycle = 0;
+	*dwidth = 0;
+
+	if (ctl & TSI148_LCSR_OTAT_EN)
+		*enabled = 1;
+
+	/* Setup address space */
+	if ((ctl & TSI148_LCSR_OTAT_AMODE_M) == TSI148_LCSR_OTAT_AMODE_A16)
+		*aspace |= VME_A16;
+	if ((ctl & TSI148_LCSR_OTAT_AMODE_M) == TSI148_LCSR_OTAT_AMODE_A24)
+		*aspace |= VME_A24;
+	if ((ctl & TSI148_LCSR_OTAT_AMODE_M) == TSI148_LCSR_OTAT_AMODE_A32)
+		*aspace |= VME_A32;
+	if ((ctl & TSI148_LCSR_OTAT_AMODE_M) == TSI148_LCSR_OTAT_AMODE_A64)
+		*aspace |= VME_A64;
+	if ((ctl & TSI148_LCSR_OTAT_AMODE_M) == TSI148_LCSR_OTAT_AMODE_CRCSR)
+		*aspace |= VME_CRCSR;
+	if ((ctl & TSI148_LCSR_OTAT_AMODE_M) == TSI148_LCSR_OTAT_AMODE_USER1)
+		*aspace |= VME_USER1;
+	if ((ctl & TSI148_LCSR_OTAT_AMODE_M) == TSI148_LCSR_OTAT_AMODE_USER2)
+		*aspace |= VME_USER2;
+	if ((ctl & TSI148_LCSR_OTAT_AMODE_M) == TSI148_LCSR_OTAT_AMODE_USER3)
+		*aspace |= VME_USER3;
+	if ((ctl & TSI148_LCSR_OTAT_AMODE_M) == TSI148_LCSR_OTAT_AMODE_USER4)
+		*aspace |= VME_USER4;
+
+	/* Setup 2eSST speeds */
+	if ((ctl & TSI148_LCSR_OTAT_2eSSTM_M) == TSI148_LCSR_OTAT_2eSSTM_160)
+		*cycle |= VME_2eSST160;
+	if ((ctl & TSI148_LCSR_OTAT_2eSSTM_M) == TSI148_LCSR_OTAT_2eSSTM_267)
+		*cycle |= VME_2eSST267;
+	if ((ctl & TSI148_LCSR_OTAT_2eSSTM_M) == TSI148_LCSR_OTAT_2eSSTM_320)
+		*cycle |= VME_2eSST320;
+
+	/* Setup cycle types */
+	if ((ctl & TSI148_LCSR_OTAT_TM_M) == TSI148_LCSR_OTAT_TM_SCT)
+		*cycle |= VME_SCT;
+	if ((ctl & TSI148_LCSR_OTAT_TM_M) == TSI148_LCSR_OTAT_TM_BLT)
+		*cycle |= VME_BLT;
+	if ((ctl & TSI148_LCSR_OTAT_TM_M) == TSI148_LCSR_OTAT_TM_MBLT)
+		*cycle |= VME_MBLT;
+	if ((ctl & TSI148_LCSR_OTAT_TM_M) == TSI148_LCSR_OTAT_TM_2eVME)
+		*cycle |= VME_2eVME;
+	if ((ctl & TSI148_LCSR_OTAT_TM_M) == TSI148_LCSR_OTAT_TM_2eSST)
+		*cycle |= VME_2eSST;
+	if ((ctl & TSI148_LCSR_OTAT_TM_M) == TSI148_LCSR_OTAT_TM_2eSSTB)
+		*cycle |= VME_2eSSTB;
+
+	if (ctl & TSI148_LCSR_OTAT_SUP)
+		*cycle |= VME_SUPER;
+	else
+		*cycle |= VME_USER;
+
+	if (ctl & TSI148_LCSR_OTAT_PGM)
+		*cycle |= VME_PROG;
+	else
+		*cycle |= VME_DATA;
+
+	/* Setup data width */
+	if ((ctl & TSI148_LCSR_OTAT_DBW_M) == TSI148_LCSR_OTAT_DBW_16)
+		*dwidth = VME_D16;
+	if ((ctl & TSI148_LCSR_OTAT_DBW_M) == TSI148_LCSR_OTAT_DBW_32)
+		*dwidth = VME_D32;
+
+	return 0;
+}
+
+
+static int tsi148_master_get(struct vme_master_resource *image, int *enabled,
+	unsigned long long *vme_base, unsigned long long *size, u32 *aspace,
+	u32 *cycle, u32 *dwidth)
+{
+	int retval;
+
+	spin_lock(&image->lock);
+
+	retval = __tsi148_master_get(image, enabled, vme_base, size, aspace,
+		cycle, dwidth);
+
+	spin_unlock(&image->lock);
+
+	return retval;
+}
+
+static ssize_t tsi148_master_read(struct vme_master_resource *image, void *buf,
+	size_t count, loff_t offset)
+{
+	int retval, enabled;
+	unsigned long long vme_base, size;
+	u32 aspace, cycle, dwidth;
+	struct vme_error_handler *handler = NULL;
+	struct vme_bridge *tsi148_bridge;
+	void __iomem *addr = image->kern_base + offset;
+	unsigned int done = 0;
+	unsigned int count32;
+
+	tsi148_bridge = image->parent;
+
+	spin_lock(&image->lock);
+
+	if (err_chk) {
+		__tsi148_master_get(image, &enabled, &vme_base, &size, &aspace,
+				    &cycle, &dwidth);
+		handler = vme_register_error_handler(tsi148_bridge, aspace,
+						     vme_base + offset, count);
+		if (!handler) {
+			spin_unlock(&image->lock);
+			return -ENOMEM;
+		}
+	}
+
+	/* The following code handles VME address alignment. We cannot use
+	 * memcpy_xxx here because it may cut data transfers in to 8-bit
+	 * cycles when D16 or D32 cycles are required on the VME bus.
+	 * On the other hand, the bridge itself assures that the maximum data
+	 * cycle configured for the transfer is used and splits it
+	 * automatically for non-aligned addresses, so we don't want the
+	 * overhead of needlessly forcing small transfers for the entire cycle.
+	 */
+	if ((uintptr_t)addr & 0x1) {
+		*(u8 *)buf = ioread8(addr);
+		done += 1;
+		if (done == count)
+			goto out;
+	}
+	if ((uintptr_t)(addr + done) & 0x2) {
+		if ((count - done) < 2) {
+			*(u8 *)(buf + done) = ioread8(addr + done);
+			done += 1;
+			goto out;
+		} else {
+			*(u16 *)(buf + done) = ioread16(addr + done);
+			done += 2;
+		}
+	}
+
+	count32 = (count - done) & ~0x3;
+	while (done < count32) {
+		*(u32 *)(buf + done) = ioread32(addr + done);
+		done += 4;
+	}
+
+	if ((count - done) & 0x2) {
+		*(u16 *)(buf + done) = ioread16(addr + done);
+		done += 2;
+	}
+	if ((count - done) & 0x1) {
+		*(u8 *)(buf + done) = ioread8(addr + done);
+		done += 1;
+	}
+
+out:
+	retval = count;
+
+	if (err_chk) {
+		if (handler->num_errors) {
+			dev_err(image->parent->parent,
+				"First VME read error detected an at address 0x%llx\n",
+				handler->first_error);
+			retval = handler->first_error - (vme_base + offset);
+		}
+		vme_unregister_error_handler(handler);
+	}
+
+	spin_unlock(&image->lock);
+
+	return retval;
+}
+
+
+static ssize_t tsi148_master_write(struct vme_master_resource *image, void *buf,
+	size_t count, loff_t offset)
+{
+	int retval = 0, enabled;
+	unsigned long long vme_base, size;
+	u32 aspace, cycle, dwidth;
+	void __iomem *addr = image->kern_base + offset;
+	unsigned int done = 0;
+	unsigned int count32;
+
+	struct vme_error_handler *handler = NULL;
+	struct vme_bridge *tsi148_bridge;
+	struct tsi148_driver *bridge;
+
+	tsi148_bridge = image->parent;
+
+	bridge = tsi148_bridge->driver_priv;
+
+	spin_lock(&image->lock);
+
+	if (err_chk) {
+		__tsi148_master_get(image, &enabled, &vme_base, &size, &aspace,
+				    &cycle, &dwidth);
+		handler = vme_register_error_handler(tsi148_bridge, aspace,
+						     vme_base + offset, count);
+		if (!handler) {
+			spin_unlock(&image->lock);
+			return -ENOMEM;
+		}
+	}
+
+	/* Here we apply for the same strategy we do in master_read
+	 * function in order to assure the correct cycles.
+	 */
+	if ((uintptr_t)addr & 0x1) {
+		iowrite8(*(u8 *)buf, addr);
+		done += 1;
+		if (done == count)
+			goto out;
+	}
+	if ((uintptr_t)(addr + done) & 0x2) {
+		if ((count - done) < 2) {
+			iowrite8(*(u8 *)(buf + done), addr + done);
+			done += 1;
+			goto out;
+		} else {
+			iowrite16(*(u16 *)(buf + done), addr + done);
+			done += 2;
+		}
+	}
+
+	count32 = (count - done) & ~0x3;
+	while (done < count32) {
+		iowrite32(*(u32 *)(buf + done), addr + done);
+		done += 4;
+	}
+
+	if ((count - done) & 0x2) {
+		iowrite16(*(u16 *)(buf + done), addr + done);
+		done += 2;
+	}
+	if ((count - done) & 0x1) {
+		iowrite8(*(u8 *)(buf + done), addr + done);
+		done += 1;
+	}
+
+out:
+	retval = count;
+
+	/*
+	 * Writes are posted. We need to do a read on the VME bus to flush out
+	 * all of the writes before we check for errors. We can't guarantee
+	 * that reading the data we have just written is safe. It is believed
+	 * that there isn't any read, write re-ordering, so we can read any
+	 * location in VME space, so lets read the Device ID from the tsi148's
+	 * own registers as mapped into CR/CSR space.
+	 *
+	 * We check for saved errors in the written address range/space.
+	 */
+
+	if (err_chk) {
+		ioread16(bridge->flush_image->kern_base + 0x7F000);
+
+		if (handler->num_errors) {
+			dev_warn(tsi148_bridge->parent,
+				 "First VME write error detected an at address 0x%llx\n",
+				 handler->first_error);
+			retval = handler->first_error - (vme_base + offset);
+		}
+		vme_unregister_error_handler(handler);
+	}
+
+	spin_unlock(&image->lock);
+
+	return retval;
+}
+
+/*
+ * Perform an RMW cycle on the VME bus.
+ *
+ * Requires a previously configured master window, returns final value.
+ */
+static unsigned int tsi148_master_rmw(struct vme_master_resource *image,
+	unsigned int mask, unsigned int compare, unsigned int swap,
+	loff_t offset)
+{
+	unsigned long long pci_addr;
+	unsigned int pci_addr_high, pci_addr_low;
+	u32 tmp, result;
+	int i;
+	struct tsi148_driver *bridge;
+
+	bridge = image->parent->driver_priv;
+
+	/* Find the PCI address that maps to the desired VME address */
+	i = image->number;
+
+	/* Locking as we can only do one of these at a time */
+	mutex_lock(&bridge->vme_rmw);
+
+	/* Lock image */
+	spin_lock(&image->lock);
+
+	pci_addr_high = ioread32be(bridge->base + TSI148_LCSR_OT[i] +
+		TSI148_LCSR_OFFSET_OTSAU);
+	pci_addr_low = ioread32be(bridge->base + TSI148_LCSR_OT[i] +
+		TSI148_LCSR_OFFSET_OTSAL);
+
+	reg_join(pci_addr_high, pci_addr_low, &pci_addr);
+	reg_split(pci_addr + offset, &pci_addr_high, &pci_addr_low);
+
+	/* Configure registers */
+	iowrite32be(mask, bridge->base + TSI148_LCSR_RMWEN);
+	iowrite32be(compare, bridge->base + TSI148_LCSR_RMWC);
+	iowrite32be(swap, bridge->base + TSI148_LCSR_RMWS);
+	iowrite32be(pci_addr_high, bridge->base + TSI148_LCSR_RMWAU);
+	iowrite32be(pci_addr_low, bridge->base + TSI148_LCSR_RMWAL);
+
+	/* Enable RMW */
+	tmp = ioread32be(bridge->base + TSI148_LCSR_VMCTRL);
+	tmp |= TSI148_LCSR_VMCTRL_RMWEN;
+	iowrite32be(tmp, bridge->base + TSI148_LCSR_VMCTRL);
+
+	/* Kick process off with a read to the required address. */
+	result = ioread32be(image->kern_base + offset);
+
+	/* Disable RMW */
+	tmp = ioread32be(bridge->base + TSI148_LCSR_VMCTRL);
+	tmp &= ~TSI148_LCSR_VMCTRL_RMWEN;
+	iowrite32be(tmp, bridge->base + TSI148_LCSR_VMCTRL);
+
+	spin_unlock(&image->lock);
+
+	mutex_unlock(&bridge->vme_rmw);
+
+	return result;
+}
+
+static int tsi148_dma_set_vme_src_attributes(struct device *dev, __be32 *attr,
+	u32 aspace, u32 cycle, u32 dwidth)
+{
+	u32 val;
+
+	val = be32_to_cpu(*attr);
+
+	/* Setup 2eSST speeds */
+	switch (cycle & (VME_2eSST160 | VME_2eSST267 | VME_2eSST320)) {
+	case VME_2eSST160:
+		val |= TSI148_LCSR_DSAT_2eSSTM_160;
+		break;
+	case VME_2eSST267:
+		val |= TSI148_LCSR_DSAT_2eSSTM_267;
+		break;
+	case VME_2eSST320:
+		val |= TSI148_LCSR_DSAT_2eSSTM_320;
+		break;
+	}
+
+	/* Setup cycle types */
+	if (cycle & VME_SCT)
+		val |= TSI148_LCSR_DSAT_TM_SCT;
+
+	if (cycle & VME_BLT)
+		val |= TSI148_LCSR_DSAT_TM_BLT;
+
+	if (cycle & VME_MBLT)
+		val |= TSI148_LCSR_DSAT_TM_MBLT;
+
+	if (cycle & VME_2eVME)
+		val |= TSI148_LCSR_DSAT_TM_2eVME;
+
+	if (cycle & VME_2eSST)
+		val |= TSI148_LCSR_DSAT_TM_2eSST;
+
+	if (cycle & VME_2eSSTB) {
+		dev_err(dev, "Currently not setting Broadcast Select "
+			"Registers\n");
+		val |= TSI148_LCSR_DSAT_TM_2eSSTB;
+	}
+
+	/* Setup data width */
+	switch (dwidth) {
+	case VME_D16:
+		val |= TSI148_LCSR_DSAT_DBW_16;
+		break;
+	case VME_D32:
+		val |= TSI148_LCSR_DSAT_DBW_32;
+		break;
+	default:
+		dev_err(dev, "Invalid data width\n");
+		return -EINVAL;
+	}
+
+	/* Setup address space */
+	switch (aspace) {
+	case VME_A16:
+		val |= TSI148_LCSR_DSAT_AMODE_A16;
+		break;
+	case VME_A24:
+		val |= TSI148_LCSR_DSAT_AMODE_A24;
+		break;
+	case VME_A32:
+		val |= TSI148_LCSR_DSAT_AMODE_A32;
+		break;
+	case VME_A64:
+		val |= TSI148_LCSR_DSAT_AMODE_A64;
+		break;
+	case VME_CRCSR:
+		val |= TSI148_LCSR_DSAT_AMODE_CRCSR;
+		break;
+	case VME_USER1:
+		val |= TSI148_LCSR_DSAT_AMODE_USER1;
+		break;
+	case VME_USER2:
+		val |= TSI148_LCSR_DSAT_AMODE_USER2;
+		break;
+	case VME_USER3:
+		val |= TSI148_LCSR_DSAT_AMODE_USER3;
+		break;
+	case VME_USER4:
+		val |= TSI148_LCSR_DSAT_AMODE_USER4;
+		break;
+	default:
+		dev_err(dev, "Invalid address space\n");
+		return -EINVAL;
+	}
+
+	if (cycle & VME_SUPER)
+		val |= TSI148_LCSR_DSAT_SUP;
+	if (cycle & VME_PROG)
+		val |= TSI148_LCSR_DSAT_PGM;
+
+	*attr = cpu_to_be32(val);
+
+	return 0;
+}
+
+static int tsi148_dma_set_vme_dest_attributes(struct device *dev, __be32 *attr,
+	u32 aspace, u32 cycle, u32 dwidth)
+{
+	u32 val;
+
+	val = be32_to_cpu(*attr);
+
+	/* Setup 2eSST speeds */
+	switch (cycle & (VME_2eSST160 | VME_2eSST267 | VME_2eSST320)) {
+	case VME_2eSST160:
+		val |= TSI148_LCSR_DDAT_2eSSTM_160;
+		break;
+	case VME_2eSST267:
+		val |= TSI148_LCSR_DDAT_2eSSTM_267;
+		break;
+	case VME_2eSST320:
+		val |= TSI148_LCSR_DDAT_2eSSTM_320;
+		break;
+	}
+
+	/* Setup cycle types */
+	if (cycle & VME_SCT)
+		val |= TSI148_LCSR_DDAT_TM_SCT;
+
+	if (cycle & VME_BLT)
+		val |= TSI148_LCSR_DDAT_TM_BLT;
+
+	if (cycle & VME_MBLT)
+		val |= TSI148_LCSR_DDAT_TM_MBLT;
+
+	if (cycle & VME_2eVME)
+		val |= TSI148_LCSR_DDAT_TM_2eVME;
+
+	if (cycle & VME_2eSST)
+		val |= TSI148_LCSR_DDAT_TM_2eSST;
+
+	if (cycle & VME_2eSSTB) {
+		dev_err(dev, "Currently not setting Broadcast Select "
+			"Registers\n");
+		val |= TSI148_LCSR_DDAT_TM_2eSSTB;
+	}
+
+	/* Setup data width */
+	switch (dwidth) {
+	case VME_D16:
+		val |= TSI148_LCSR_DDAT_DBW_16;
+		break;
+	case VME_D32:
+		val |= TSI148_LCSR_DDAT_DBW_32;
+		break;
+	default:
+		dev_err(dev, "Invalid data width\n");
+		return -EINVAL;
+	}
+
+	/* Setup address space */
+	switch (aspace) {
+	case VME_A16:
+		val |= TSI148_LCSR_DDAT_AMODE_A16;
+		break;
+	case VME_A24:
+		val |= TSI148_LCSR_DDAT_AMODE_A24;
+		break;
+	case VME_A32:
+		val |= TSI148_LCSR_DDAT_AMODE_A32;
+		break;
+	case VME_A64:
+		val |= TSI148_LCSR_DDAT_AMODE_A64;
+		break;
+	case VME_CRCSR:
+		val |= TSI148_LCSR_DDAT_AMODE_CRCSR;
+		break;
+	case VME_USER1:
+		val |= TSI148_LCSR_DDAT_AMODE_USER1;
+		break;
+	case VME_USER2:
+		val |= TSI148_LCSR_DDAT_AMODE_USER2;
+		break;
+	case VME_USER3:
+		val |= TSI148_LCSR_DDAT_AMODE_USER3;
+		break;
+	case VME_USER4:
+		val |= TSI148_LCSR_DDAT_AMODE_USER4;
+		break;
+	default:
+		dev_err(dev, "Invalid address space\n");
+		return -EINVAL;
+	}
+
+	if (cycle & VME_SUPER)
+		val |= TSI148_LCSR_DDAT_SUP;
+	if (cycle & VME_PROG)
+		val |= TSI148_LCSR_DDAT_PGM;
+
+	*attr = cpu_to_be32(val);
+
+	return 0;
+}
+
+/*
+ * Add a link list descriptor to the list
+ *
+ * Note: DMA engine expects the DMA descriptor to be big endian.
+ */
+static int tsi148_dma_list_add(struct vme_dma_list *list,
+	struct vme_dma_attr *src, struct vme_dma_attr *dest, size_t count)
+{
+	struct tsi148_dma_entry *entry, *prev;
+	u32 address_high, address_low, val;
+	struct vme_dma_pattern *pattern_attr;
+	struct vme_dma_pci *pci_attr;
+	struct vme_dma_vme *vme_attr;
+	int retval = 0;
+	struct vme_bridge *tsi148_bridge;
+
+	tsi148_bridge = list->parent->parent;
+
+	/* Descriptor must be aligned on 64-bit boundaries */
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry) {
+		retval = -ENOMEM;
+		goto err_mem;
+	}
+
+	/* Test descriptor alignment */
+	if ((unsigned long)&entry->descriptor & 0x7) {
+		dev_err(tsi148_bridge->parent, "Descriptor not aligned to 8 "
+			"byte boundary as required: %p\n",
+			&entry->descriptor);
+		retval = -EINVAL;
+		goto err_align;
+	}
+
+	/* Given we are going to fill out the structure, we probably don't
+	 * need to zero it, but better safe than sorry for now.
+	 */
+	memset(&entry->descriptor, 0, sizeof(entry->descriptor));
+
+	/* Fill out source part */
+	switch (src->type) {
+	case VME_DMA_PATTERN:
+		pattern_attr = src->private;
+
+		entry->descriptor.dsal = cpu_to_be32(pattern_attr->pattern);
+
+		val = TSI148_LCSR_DSAT_TYP_PAT;
+
+		/* Default behaviour is 32 bit pattern */
+		if (pattern_attr->type & VME_DMA_PATTERN_BYTE)
+			val |= TSI148_LCSR_DSAT_PSZ;
+
+		/* It seems that the default behaviour is to increment */
+		if ((pattern_attr->type & VME_DMA_PATTERN_INCREMENT) == 0)
+			val |= TSI148_LCSR_DSAT_NIN;
+		entry->descriptor.dsat = cpu_to_be32(val);
+		break;
+	case VME_DMA_PCI:
+		pci_attr = src->private;
+
+		reg_split((unsigned long long)pci_attr->address, &address_high,
+			&address_low);
+		entry->descriptor.dsau = cpu_to_be32(address_high);
+		entry->descriptor.dsal = cpu_to_be32(address_low);
+		entry->descriptor.dsat = cpu_to_be32(TSI148_LCSR_DSAT_TYP_PCI);
+		break;
+	case VME_DMA_VME:
+		vme_attr = src->private;
+
+		reg_split((unsigned long long)vme_attr->address, &address_high,
+			&address_low);
+		entry->descriptor.dsau = cpu_to_be32(address_high);
+		entry->descriptor.dsal = cpu_to_be32(address_low);
+		entry->descriptor.dsat = cpu_to_be32(TSI148_LCSR_DSAT_TYP_VME);
+
+		retval = tsi148_dma_set_vme_src_attributes(
+			tsi148_bridge->parent, &entry->descriptor.dsat,
+			vme_attr->aspace, vme_attr->cycle, vme_attr->dwidth);
+		if (retval < 0)
+			goto err_source;
+		break;
+	default:
+		dev_err(tsi148_bridge->parent, "Invalid source type\n");
+		retval = -EINVAL;
+		goto err_source;
+	}
+
+	/* Assume last link - this will be over-written by adding another */
+	entry->descriptor.dnlau = cpu_to_be32(0);
+	entry->descriptor.dnlal = cpu_to_be32(TSI148_LCSR_DNLAL_LLA);
+
+	/* Fill out destination part */
+	switch (dest->type) {
+	case VME_DMA_PCI:
+		pci_attr = dest->private;
+
+		reg_split((unsigned long long)pci_attr->address, &address_high,
+			&address_low);
+		entry->descriptor.ddau = cpu_to_be32(address_high);
+		entry->descriptor.ddal = cpu_to_be32(address_low);
+		entry->descriptor.ddat = cpu_to_be32(TSI148_LCSR_DDAT_TYP_PCI);
+		break;
+	case VME_DMA_VME:
+		vme_attr = dest->private;
+
+		reg_split((unsigned long long)vme_attr->address, &address_high,
+			&address_low);
+		entry->descriptor.ddau = cpu_to_be32(address_high);
+		entry->descriptor.ddal = cpu_to_be32(address_low);
+		entry->descriptor.ddat = cpu_to_be32(TSI148_LCSR_DDAT_TYP_VME);
+
+		retval = tsi148_dma_set_vme_dest_attributes(
+			tsi148_bridge->parent, &entry->descriptor.ddat,
+			vme_attr->aspace, vme_attr->cycle, vme_attr->dwidth);
+		if (retval < 0)
+			goto err_dest;
+		break;
+	default:
+		dev_err(tsi148_bridge->parent, "Invalid destination type\n");
+		retval = -EINVAL;
+		goto err_dest;
+	}
+
+	/* Fill out count */
+	entry->descriptor.dcnt = cpu_to_be32((u32)count);
+
+	/* Add to list */
+	list_add_tail(&entry->list, &list->entries);
+
+	entry->dma_handle = dma_map_single(tsi148_bridge->parent,
+					   &entry->descriptor,
+					   sizeof(entry->descriptor),
+					   DMA_TO_DEVICE);
+	if (dma_mapping_error(tsi148_bridge->parent, entry->dma_handle)) {
+		dev_err(tsi148_bridge->parent, "DMA mapping error\n");
+		retval = -EINVAL;
+		goto err_dma;
+	}
+
+	/* Fill out previous descriptors "Next Address" */
+	if (entry->list.prev != &list->entries) {
+		reg_split((unsigned long long)entry->dma_handle, &address_high,
+			&address_low);
+		prev = list_entry(entry->list.prev, struct tsi148_dma_entry,
+				  list);
+		prev->descriptor.dnlau = cpu_to_be32(address_high);
+		prev->descriptor.dnlal = cpu_to_be32(address_low);
+
+	}
+
+	return 0;
+
+err_dma:
+err_dest:
+err_source:
+err_align:
+		kfree(entry);
+err_mem:
+	return retval;
+}
+
+/*
+ * Check to see if the provided DMA channel is busy.
+ */
+static int tsi148_dma_busy(struct vme_bridge *tsi148_bridge, int channel)
+{
+	u32 tmp;
+	struct tsi148_driver *bridge;
+
+	bridge = tsi148_bridge->driver_priv;
+
+	tmp = ioread32be(bridge->base + TSI148_LCSR_DMA[channel] +
+		TSI148_LCSR_OFFSET_DSTA);
+
+	if (tmp & TSI148_LCSR_DSTA_BSY)
+		return 0;
+	else
+		return 1;
+
+}
+
+/*
+ * Execute a previously generated link list
+ *
+ * XXX Need to provide control register configuration.
+ */
+static int tsi148_dma_list_exec(struct vme_dma_list *list)
+{
+	struct vme_dma_resource *ctrlr;
+	int channel, retval;
+	struct tsi148_dma_entry *entry;
+	u32 bus_addr_high, bus_addr_low;
+	u32 val, dctlreg = 0;
+	struct vme_bridge *tsi148_bridge;
+	struct tsi148_driver *bridge;
+
+	ctrlr = list->parent;
+
+	tsi148_bridge = ctrlr->parent;
+
+	bridge = tsi148_bridge->driver_priv;
+
+	mutex_lock(&ctrlr->mtx);
+
+	channel = ctrlr->number;
+
+	if (!list_empty(&ctrlr->running)) {
+		/*
+		 * XXX We have an active DMA transfer and currently haven't
+		 *     sorted out the mechanism for "pending" DMA transfers.
+		 *     Return busy.
+		 */
+		/* Need to add to pending here */
+		mutex_unlock(&ctrlr->mtx);
+		return -EBUSY;
+	} else {
+		list_add(&list->list, &ctrlr->running);
+	}
+
+	/* Get first bus address and write into registers */
+	entry = list_first_entry(&list->entries, struct tsi148_dma_entry,
+		list);
+
+	mutex_unlock(&ctrlr->mtx);
+
+	reg_split(entry->dma_handle, &bus_addr_high, &bus_addr_low);
+
+	iowrite32be(bus_addr_high, bridge->base +
+		TSI148_LCSR_DMA[channel] + TSI148_LCSR_OFFSET_DNLAU);
+	iowrite32be(bus_addr_low, bridge->base +
+		TSI148_LCSR_DMA[channel] + TSI148_LCSR_OFFSET_DNLAL);
+
+	dctlreg = ioread32be(bridge->base + TSI148_LCSR_DMA[channel] +
+		TSI148_LCSR_OFFSET_DCTL);
+
+	/* Start the operation */
+	iowrite32be(dctlreg | TSI148_LCSR_DCTL_DGO, bridge->base +
+		TSI148_LCSR_DMA[channel] + TSI148_LCSR_OFFSET_DCTL);
+
+	retval = wait_event_interruptible(bridge->dma_queue[channel],
+		tsi148_dma_busy(ctrlr->parent, channel));
+
+	if (retval) {
+		iowrite32be(dctlreg | TSI148_LCSR_DCTL_ABT, bridge->base +
+			TSI148_LCSR_DMA[channel] + TSI148_LCSR_OFFSET_DCTL);
+		/* Wait for the operation to abort */
+		wait_event(bridge->dma_queue[channel],
+			   tsi148_dma_busy(ctrlr->parent, channel));
+		retval = -EINTR;
+		goto exit;
+	}
+
+	/*
+	 * Read status register, this register is valid until we kick off a
+	 * new transfer.
+	 */
+	val = ioread32be(bridge->base + TSI148_LCSR_DMA[channel] +
+		TSI148_LCSR_OFFSET_DSTA);
+
+	if (val & TSI148_LCSR_DSTA_VBE) {
+		dev_err(tsi148_bridge->parent, "DMA Error. DSTA=%08X\n", val);
+		retval = -EIO;
+	}
+
+exit:
+	/* Remove list from running list */
+	mutex_lock(&ctrlr->mtx);
+	list_del(&list->list);
+	mutex_unlock(&ctrlr->mtx);
+
+	return retval;
+}
+
+/*
+ * Clean up a previously generated link list
+ *
+ * We have a separate function, don't assume that the chain can't be reused.
+ */
+static int tsi148_dma_list_empty(struct vme_dma_list *list)
+{
+	struct list_head *pos, *temp;
+	struct tsi148_dma_entry *entry;
+
+	struct vme_bridge *tsi148_bridge = list->parent->parent;
+
+	/* detach and free each entry */
+	list_for_each_safe(pos, temp, &list->entries) {
+		list_del(pos);
+		entry = list_entry(pos, struct tsi148_dma_entry, list);
+
+		dma_unmap_single(tsi148_bridge->parent, entry->dma_handle,
+			sizeof(struct tsi148_dma_descriptor), DMA_TO_DEVICE);
+		kfree(entry);
+	}
+
+	return 0;
+}
+
+/*
+ * All 4 location monitors reside at the same base - this is therefore a
+ * system wide configuration.
+ *
+ * This does not enable the LM monitor - that should be done when the first
+ * callback is attached and disabled when the last callback is removed.
+ */
+static int tsi148_lm_set(struct vme_lm_resource *lm, unsigned long long lm_base,
+	u32 aspace, u32 cycle)
+{
+	u32 lm_base_high, lm_base_low, lm_ctl = 0;
+	int i;
+	struct vme_bridge *tsi148_bridge;
+	struct tsi148_driver *bridge;
+
+	tsi148_bridge = lm->parent;
+
+	bridge = tsi148_bridge->driver_priv;
+
+	mutex_lock(&lm->mtx);
+
+	/* If we already have a callback attached, we can't move it! */
+	for (i = 0; i < lm->monitors; i++) {
+		if (bridge->lm_callback[i]) {
+			mutex_unlock(&lm->mtx);
+			dev_err(tsi148_bridge->parent, "Location monitor "
+				"callback attached, can't reset\n");
+			return -EBUSY;
+		}
+	}
+
+	switch (aspace) {
+	case VME_A16:
+		lm_ctl |= TSI148_LCSR_LMAT_AS_A16;
+		break;
+	case VME_A24:
+		lm_ctl |= TSI148_LCSR_LMAT_AS_A24;
+		break;
+	case VME_A32:
+		lm_ctl |= TSI148_LCSR_LMAT_AS_A32;
+		break;
+	case VME_A64:
+		lm_ctl |= TSI148_LCSR_LMAT_AS_A64;
+		break;
+	default:
+		mutex_unlock(&lm->mtx);
+		dev_err(tsi148_bridge->parent, "Invalid address space\n");
+		return -EINVAL;
+	}
+
+	if (cycle & VME_SUPER)
+		lm_ctl |= TSI148_LCSR_LMAT_SUPR ;
+	if (cycle & VME_USER)
+		lm_ctl |= TSI148_LCSR_LMAT_NPRIV;
+	if (cycle & VME_PROG)
+		lm_ctl |= TSI148_LCSR_LMAT_PGM;
+	if (cycle & VME_DATA)
+		lm_ctl |= TSI148_LCSR_LMAT_DATA;
+
+	reg_split(lm_base, &lm_base_high, &lm_base_low);
+
+	iowrite32be(lm_base_high, bridge->base + TSI148_LCSR_LMBAU);
+	iowrite32be(lm_base_low, bridge->base + TSI148_LCSR_LMBAL);
+	iowrite32be(lm_ctl, bridge->base + TSI148_LCSR_LMAT);
+
+	mutex_unlock(&lm->mtx);
+
+	return 0;
+}
+
+/* Get configuration of the callback monitor and return whether it is enabled
+ * or disabled.
+ */
+static int tsi148_lm_get(struct vme_lm_resource *lm,
+	unsigned long long *lm_base, u32 *aspace, u32 *cycle)
+{
+	u32 lm_base_high, lm_base_low, lm_ctl, enabled = 0;
+	struct tsi148_driver *bridge;
+
+	bridge = lm->parent->driver_priv;
+
+	mutex_lock(&lm->mtx);
+
+	lm_base_high = ioread32be(bridge->base + TSI148_LCSR_LMBAU);
+	lm_base_low = ioread32be(bridge->base + TSI148_LCSR_LMBAL);
+	lm_ctl = ioread32be(bridge->base + TSI148_LCSR_LMAT);
+
+	reg_join(lm_base_high, lm_base_low, lm_base);
+
+	if (lm_ctl & TSI148_LCSR_LMAT_EN)
+		enabled = 1;
+
+	if ((lm_ctl & TSI148_LCSR_LMAT_AS_M) == TSI148_LCSR_LMAT_AS_A16)
+		*aspace |= VME_A16;
+
+	if ((lm_ctl & TSI148_LCSR_LMAT_AS_M) == TSI148_LCSR_LMAT_AS_A24)
+		*aspace |= VME_A24;
+
+	if ((lm_ctl & TSI148_LCSR_LMAT_AS_M) == TSI148_LCSR_LMAT_AS_A32)
+		*aspace |= VME_A32;
+
+	if ((lm_ctl & TSI148_LCSR_LMAT_AS_M) == TSI148_LCSR_LMAT_AS_A64)
+		*aspace |= VME_A64;
+
+
+	if (lm_ctl & TSI148_LCSR_LMAT_SUPR)
+		*cycle |= VME_SUPER;
+	if (lm_ctl & TSI148_LCSR_LMAT_NPRIV)
+		*cycle |= VME_USER;
+	if (lm_ctl & TSI148_LCSR_LMAT_PGM)
+		*cycle |= VME_PROG;
+	if (lm_ctl & TSI148_LCSR_LMAT_DATA)
+		*cycle |= VME_DATA;
+
+	mutex_unlock(&lm->mtx);
+
+	return enabled;
+}
+
+/*
+ * Attach a callback to a specific location monitor.
+ *
+ * Callback will be passed the monitor triggered.
+ */
+static int tsi148_lm_attach(struct vme_lm_resource *lm, int monitor,
+	void (*callback)(void *), void *data)
+{
+	u32 lm_ctl, tmp;
+	struct vme_bridge *tsi148_bridge;
+	struct tsi148_driver *bridge;
+
+	tsi148_bridge = lm->parent;
+
+	bridge = tsi148_bridge->driver_priv;
+
+	mutex_lock(&lm->mtx);
+
+	/* Ensure that the location monitor is configured - need PGM or DATA */
+	lm_ctl = ioread32be(bridge->base + TSI148_LCSR_LMAT);
+	if ((lm_ctl & (TSI148_LCSR_LMAT_PGM | TSI148_LCSR_LMAT_DATA)) == 0) {
+		mutex_unlock(&lm->mtx);
+		dev_err(tsi148_bridge->parent, "Location monitor not properly "
+			"configured\n");
+		return -EINVAL;
+	}
+
+	/* Check that a callback isn't already attached */
+	if (bridge->lm_callback[monitor]) {
+		mutex_unlock(&lm->mtx);
+		dev_err(tsi148_bridge->parent, "Existing callback attached\n");
+		return -EBUSY;
+	}
+
+	/* Attach callback */
+	bridge->lm_callback[monitor] = callback;
+	bridge->lm_data[monitor] = data;
+
+	/* Enable Location Monitor interrupt */
+	tmp = ioread32be(bridge->base + TSI148_LCSR_INTEN);
+	tmp |= TSI148_LCSR_INTEN_LMEN[monitor];
+	iowrite32be(tmp, bridge->base + TSI148_LCSR_INTEN);
+
+	tmp = ioread32be(bridge->base + TSI148_LCSR_INTEO);
+	tmp |= TSI148_LCSR_INTEO_LMEO[monitor];
+	iowrite32be(tmp, bridge->base + TSI148_LCSR_INTEO);
+
+	/* Ensure that global Location Monitor Enable set */
+	if ((lm_ctl & TSI148_LCSR_LMAT_EN) == 0) {
+		lm_ctl |= TSI148_LCSR_LMAT_EN;
+		iowrite32be(lm_ctl, bridge->base + TSI148_LCSR_LMAT);
+	}
+
+	mutex_unlock(&lm->mtx);
+
+	return 0;
+}
+
+/*
+ * Detach a callback function forn a specific location monitor.
+ */
+static int tsi148_lm_detach(struct vme_lm_resource *lm, int monitor)
+{
+	u32 lm_en, tmp;
+	struct tsi148_driver *bridge;
+
+	bridge = lm->parent->driver_priv;
+
+	mutex_lock(&lm->mtx);
+
+	/* Disable Location Monitor and ensure previous interrupts are clear */
+	lm_en = ioread32be(bridge->base + TSI148_LCSR_INTEN);
+	lm_en &= ~TSI148_LCSR_INTEN_LMEN[monitor];
+	iowrite32be(lm_en, bridge->base + TSI148_LCSR_INTEN);
+
+	tmp = ioread32be(bridge->base + TSI148_LCSR_INTEO);
+	tmp &= ~TSI148_LCSR_INTEO_LMEO[monitor];
+	iowrite32be(tmp, bridge->base + TSI148_LCSR_INTEO);
+
+	iowrite32be(TSI148_LCSR_INTC_LMC[monitor],
+		 bridge->base + TSI148_LCSR_INTC);
+
+	/* Detach callback */
+	bridge->lm_callback[monitor] = NULL;
+	bridge->lm_data[monitor] = NULL;
+
+	/* If all location monitors disabled, disable global Location Monitor */
+	if ((lm_en & (TSI148_LCSR_INTS_LM0S | TSI148_LCSR_INTS_LM1S |
+			TSI148_LCSR_INTS_LM2S | TSI148_LCSR_INTS_LM3S)) == 0) {
+		tmp = ioread32be(bridge->base + TSI148_LCSR_LMAT);
+		tmp &= ~TSI148_LCSR_LMAT_EN;
+		iowrite32be(tmp, bridge->base + TSI148_LCSR_LMAT);
+	}
+
+	mutex_unlock(&lm->mtx);
+
+	return 0;
+}
+
+/*
+ * Determine Geographical Addressing
+ */
+static int tsi148_slot_get(struct vme_bridge *tsi148_bridge)
+{
+	u32 slot = 0;
+	struct tsi148_driver *bridge;
+
+	bridge = tsi148_bridge->driver_priv;
+
+	if (!geoid) {
+		slot = ioread32be(bridge->base + TSI148_LCSR_VSTAT);
+		slot = slot & TSI148_LCSR_VSTAT_GA_M;
+	} else
+		slot = geoid;
+
+	return (int)slot;
+}
+
+static void *tsi148_alloc_consistent(struct device *parent, size_t size,
+	dma_addr_t *dma)
+{
+	struct pci_dev *pdev;
+
+	/* Find pci_dev container of dev */
+	pdev = to_pci_dev(parent);
+
+	return dma_alloc_coherent(&pdev->dev, size, dma, GFP_KERNEL);
+}
+
+static void tsi148_free_consistent(struct device *parent, size_t size,
+	void *vaddr, dma_addr_t dma)
+{
+	struct pci_dev *pdev;
+
+	/* Find pci_dev container of dev */
+	pdev = to_pci_dev(parent);
+
+	dma_free_coherent(&pdev->dev, size, vaddr, dma);
+}
+
+/*
+ * Configure CR/CSR space
+ *
+ * Access to the CR/CSR can be configured at power-up. The location of the
+ * CR/CSR registers in the CR/CSR address space is determined by the boards
+ * Auto-ID or Geographic address. This function ensures that the window is
+ * enabled at an offset consistent with the boards geopgraphic address.
+ *
+ * Each board has a 512kB window, with the highest 4kB being used for the
+ * boards registers, this means there is a fix length 508kB window which must
+ * be mapped onto PCI memory.
+ */
+static int tsi148_crcsr_init(struct vme_bridge *tsi148_bridge,
+	struct pci_dev *pdev)
+{
+	u32 cbar, crat, vstat;
+	u32 crcsr_bus_high, crcsr_bus_low;
+	int retval;
+	struct tsi148_driver *bridge;
+
+	bridge = tsi148_bridge->driver_priv;
+
+	/* Allocate mem for CR/CSR image */
+	bridge->crcsr_kernel = dma_alloc_coherent(&pdev->dev,
+						  VME_CRCSR_BUF_SIZE,
+						  &bridge->crcsr_bus, GFP_KERNEL);
+	if (!bridge->crcsr_kernel) {
+		dev_err(tsi148_bridge->parent, "Failed to allocate memory for "
+			"CR/CSR image\n");
+		return -ENOMEM;
+	}
+
+	reg_split(bridge->crcsr_bus, &crcsr_bus_high, &crcsr_bus_low);
+
+	iowrite32be(crcsr_bus_high, bridge->base + TSI148_LCSR_CROU);
+	iowrite32be(crcsr_bus_low, bridge->base + TSI148_LCSR_CROL);
+
+	/* Ensure that the CR/CSR is configured at the correct offset */
+	cbar = ioread32be(bridge->base + TSI148_CBAR);
+	cbar = (cbar & TSI148_CRCSR_CBAR_M)>>3;
+
+	vstat = tsi148_slot_get(tsi148_bridge);
+
+	if (cbar != vstat) {
+		cbar = vstat;
+		dev_info(tsi148_bridge->parent, "Setting CR/CSR offset\n");
+		iowrite32be(cbar<<3, bridge->base + TSI148_CBAR);
+	}
+	dev_info(tsi148_bridge->parent, "CR/CSR Offset: %d\n", cbar);
+
+	crat = ioread32be(bridge->base + TSI148_LCSR_CRAT);
+	if (crat & TSI148_LCSR_CRAT_EN)
+		dev_info(tsi148_bridge->parent, "CR/CSR already enabled\n");
+	else {
+		dev_info(tsi148_bridge->parent, "Enabling CR/CSR space\n");
+		iowrite32be(crat | TSI148_LCSR_CRAT_EN,
+			bridge->base + TSI148_LCSR_CRAT);
+	}
+
+	/* If we want flushed, error-checked writes, set up a window
+	 * over the CR/CSR registers. We read from here to safely flush
+	 * through VME writes.
+	 */
+	if (err_chk) {
+		retval = tsi148_master_set(bridge->flush_image, 1,
+			(vstat * 0x80000), 0x80000, VME_CRCSR, VME_SCT,
+			VME_D16);
+		if (retval)
+			dev_err(tsi148_bridge->parent, "Configuring flush image"
+				" failed\n");
+	}
+
+	return 0;
+
+}
+
+static void tsi148_crcsr_exit(struct vme_bridge *tsi148_bridge,
+	struct pci_dev *pdev)
+{
+	u32 crat;
+	struct tsi148_driver *bridge;
+
+	bridge = tsi148_bridge->driver_priv;
+
+	/* Turn off CR/CSR space */
+	crat = ioread32be(bridge->base + TSI148_LCSR_CRAT);
+	iowrite32be(crat & ~TSI148_LCSR_CRAT_EN,
+		bridge->base + TSI148_LCSR_CRAT);
+
+	/* Free image */
+	iowrite32be(0, bridge->base + TSI148_LCSR_CROU);
+	iowrite32be(0, bridge->base + TSI148_LCSR_CROL);
+
+	dma_free_coherent(&pdev->dev, VME_CRCSR_BUF_SIZE,
+			  bridge->crcsr_kernel, bridge->crcsr_bus);
+}
+
+static int tsi148_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	int retval, i, master_num;
+	u32 data;
+	struct list_head *pos = NULL, *n;
+	struct vme_bridge *tsi148_bridge;
+	struct tsi148_driver *tsi148_device;
+	struct vme_master_resource *master_image;
+	struct vme_slave_resource *slave_image;
+	struct vme_dma_resource *dma_ctrlr;
+	struct vme_lm_resource *lm;
+
+	/* If we want to support more than one of each bridge, we need to
+	 * dynamically generate this so we get one per device
+	 */
+	tsi148_bridge = kzalloc(sizeof(*tsi148_bridge), GFP_KERNEL);
+	if (!tsi148_bridge) {
+		retval = -ENOMEM;
+		goto err_struct;
+	}
+	vme_init_bridge(tsi148_bridge);
+
+	tsi148_device = kzalloc(sizeof(*tsi148_device), GFP_KERNEL);
+	if (!tsi148_device) {
+		retval = -ENOMEM;
+		goto err_driver;
+	}
+
+	tsi148_bridge->driver_priv = tsi148_device;
+
+	/* Enable the device */
+	retval = pci_enable_device(pdev);
+	if (retval) {
+		dev_err(&pdev->dev, "Unable to enable device\n");
+		goto err_enable;
+	}
+
+	/* Map Registers */
+	retval = pci_request_regions(pdev, driver_name);
+	if (retval) {
+		dev_err(&pdev->dev, "Unable to reserve resources\n");
+		goto err_resource;
+	}
+
+	/* map registers in BAR 0 */
+	tsi148_device->base = ioremap(pci_resource_start(pdev, 0),
+		4096);
+	if (!tsi148_device->base) {
+		dev_err(&pdev->dev, "Unable to remap CRG region\n");
+		retval = -EIO;
+		goto err_remap;
+	}
+
+	/* Check to see if the mapping worked out */
+	data = ioread32(tsi148_device->base + TSI148_PCFS_ID) & 0x0000FFFF;
+	if (data != PCI_VENDOR_ID_TUNDRA) {
+		dev_err(&pdev->dev, "CRG region check failed\n");
+		retval = -EIO;
+		goto err_test;
+	}
+
+	/* Initialize wait queues & mutual exclusion flags */
+	init_waitqueue_head(&tsi148_device->dma_queue[0]);
+	init_waitqueue_head(&tsi148_device->dma_queue[1]);
+	init_waitqueue_head(&tsi148_device->iack_queue);
+	mutex_init(&tsi148_device->vme_int);
+	mutex_init(&tsi148_device->vme_rmw);
+
+	tsi148_bridge->parent = &pdev->dev;
+	strcpy(tsi148_bridge->name, driver_name);
+
+	/* Setup IRQ */
+	retval = tsi148_irq_init(tsi148_bridge);
+	if (retval != 0) {
+		dev_err(&pdev->dev, "Chip Initialization failed.\n");
+		goto err_irq;
+	}
+
+	/* If we are going to flush writes, we need to read from the VME bus.
+	 * We need to do this safely, thus we read the devices own CR/CSR
+	 * register. To do this we must set up a window in CR/CSR space and
+	 * hence have one less master window resource available.
+	 */
+	master_num = TSI148_MAX_MASTER;
+	if (err_chk) {
+		master_num--;
+
+		tsi148_device->flush_image =
+			kmalloc(sizeof(*tsi148_device->flush_image),
+				GFP_KERNEL);
+		if (!tsi148_device->flush_image) {
+			retval = -ENOMEM;
+			goto err_master;
+		}
+		tsi148_device->flush_image->parent = tsi148_bridge;
+		spin_lock_init(&tsi148_device->flush_image->lock);
+		tsi148_device->flush_image->locked = 1;
+		tsi148_device->flush_image->number = master_num;
+		memset(&tsi148_device->flush_image->bus_resource, 0,
+		       sizeof(tsi148_device->flush_image->bus_resource));
+		tsi148_device->flush_image->kern_base  = NULL;
+	}
+
+	/* Add master windows to list */
+	for (i = 0; i < master_num; i++) {
+		master_image = kmalloc(sizeof(*master_image), GFP_KERNEL);
+		if (!master_image) {
+			retval = -ENOMEM;
+			goto err_master;
+		}
+		master_image->parent = tsi148_bridge;
+		spin_lock_init(&master_image->lock);
+		master_image->locked = 0;
+		master_image->number = i;
+		master_image->address_attr = VME_A16 | VME_A24 | VME_A32 |
+			VME_A64 | VME_CRCSR | VME_USER1 | VME_USER2 |
+			VME_USER3 | VME_USER4;
+		master_image->cycle_attr = VME_SCT | VME_BLT | VME_MBLT |
+			VME_2eVME | VME_2eSST | VME_2eSSTB | VME_2eSST160 |
+			VME_2eSST267 | VME_2eSST320 | VME_SUPER | VME_USER |
+			VME_PROG | VME_DATA;
+		master_image->width_attr = VME_D16 | VME_D32;
+		memset(&master_image->bus_resource, 0,
+		       sizeof(master_image->bus_resource));
+		master_image->kern_base  = NULL;
+		list_add_tail(&master_image->list,
+			&tsi148_bridge->master_resources);
+	}
+
+	/* Add slave windows to list */
+	for (i = 0; i < TSI148_MAX_SLAVE; i++) {
+		slave_image = kmalloc(sizeof(*slave_image), GFP_KERNEL);
+		if (!slave_image) {
+			retval = -ENOMEM;
+			goto err_slave;
+		}
+		slave_image->parent = tsi148_bridge;
+		mutex_init(&slave_image->mtx);
+		slave_image->locked = 0;
+		slave_image->number = i;
+		slave_image->address_attr = VME_A16 | VME_A24 | VME_A32 |
+			VME_A64;
+		slave_image->cycle_attr = VME_SCT | VME_BLT | VME_MBLT |
+			VME_2eVME | VME_2eSST | VME_2eSSTB | VME_2eSST160 |
+			VME_2eSST267 | VME_2eSST320 | VME_SUPER | VME_USER |
+			VME_PROG | VME_DATA;
+		list_add_tail(&slave_image->list,
+			&tsi148_bridge->slave_resources);
+	}
+
+	/* Add dma engines to list */
+	for (i = 0; i < TSI148_MAX_DMA; i++) {
+		dma_ctrlr = kmalloc(sizeof(*dma_ctrlr), GFP_KERNEL);
+		if (!dma_ctrlr) {
+			retval = -ENOMEM;
+			goto err_dma;
+		}
+		dma_ctrlr->parent = tsi148_bridge;
+		mutex_init(&dma_ctrlr->mtx);
+		dma_ctrlr->locked = 0;
+		dma_ctrlr->number = i;
+		dma_ctrlr->route_attr = VME_DMA_VME_TO_MEM |
+			VME_DMA_MEM_TO_VME | VME_DMA_VME_TO_VME |
+			VME_DMA_MEM_TO_MEM | VME_DMA_PATTERN_TO_VME |
+			VME_DMA_PATTERN_TO_MEM;
+		INIT_LIST_HEAD(&dma_ctrlr->pending);
+		INIT_LIST_HEAD(&dma_ctrlr->running);
+		list_add_tail(&dma_ctrlr->list,
+			&tsi148_bridge->dma_resources);
+	}
+
+	/* Add location monitor to list */
+	lm = kmalloc(sizeof(*lm), GFP_KERNEL);
+	if (!lm) {
+		retval = -ENOMEM;
+		goto err_lm;
+	}
+	lm->parent = tsi148_bridge;
+	mutex_init(&lm->mtx);
+	lm->locked = 0;
+	lm->number = 1;
+	lm->monitors = 4;
+	list_add_tail(&lm->list, &tsi148_bridge->lm_resources);
+
+	tsi148_bridge->slave_get = tsi148_slave_get;
+	tsi148_bridge->slave_set = tsi148_slave_set;
+	tsi148_bridge->master_get = tsi148_master_get;
+	tsi148_bridge->master_set = tsi148_master_set;
+	tsi148_bridge->master_read = tsi148_master_read;
+	tsi148_bridge->master_write = tsi148_master_write;
+	tsi148_bridge->master_rmw = tsi148_master_rmw;
+	tsi148_bridge->dma_list_add = tsi148_dma_list_add;
+	tsi148_bridge->dma_list_exec = tsi148_dma_list_exec;
+	tsi148_bridge->dma_list_empty = tsi148_dma_list_empty;
+	tsi148_bridge->irq_set = tsi148_irq_set;
+	tsi148_bridge->irq_generate = tsi148_irq_generate;
+	tsi148_bridge->lm_set = tsi148_lm_set;
+	tsi148_bridge->lm_get = tsi148_lm_get;
+	tsi148_bridge->lm_attach = tsi148_lm_attach;
+	tsi148_bridge->lm_detach = tsi148_lm_detach;
+	tsi148_bridge->slot_get = tsi148_slot_get;
+	tsi148_bridge->alloc_consistent = tsi148_alloc_consistent;
+	tsi148_bridge->free_consistent = tsi148_free_consistent;
+
+	data = ioread32be(tsi148_device->base + TSI148_LCSR_VSTAT);
+	dev_info(&pdev->dev, "Board is%s the VME system controller\n",
+		(data & TSI148_LCSR_VSTAT_SCONS) ? "" : " not");
+	if (!geoid)
+		dev_info(&pdev->dev, "VME geographical address is %d\n",
+			data & TSI148_LCSR_VSTAT_GA_M);
+	else
+		dev_info(&pdev->dev, "VME geographical address is set to %d\n",
+			geoid);
+
+	dev_info(&pdev->dev, "VME Write and flush and error check is %s\n",
+		err_chk ? "enabled" : "disabled");
+
+	retval = tsi148_crcsr_init(tsi148_bridge, pdev);
+	if (retval) {
+		dev_err(&pdev->dev, "CR/CSR configuration failed.\n");
+		goto err_crcsr;
+	}
+
+	retval = vme_register_bridge(tsi148_bridge);
+	if (retval != 0) {
+		dev_err(&pdev->dev, "Chip Registration failed.\n");
+		goto err_reg;
+	}
+
+	pci_set_drvdata(pdev, tsi148_bridge);
+
+	/* Clear VME bus "board fail", and "power-up reset" lines */
+	data = ioread32be(tsi148_device->base + TSI148_LCSR_VSTAT);
+	data &= ~TSI148_LCSR_VSTAT_BRDFL;
+	data |= TSI148_LCSR_VSTAT_CPURST;
+	iowrite32be(data, tsi148_device->base + TSI148_LCSR_VSTAT);
+
+	return 0;
+
+err_reg:
+	tsi148_crcsr_exit(tsi148_bridge, pdev);
+err_crcsr:
+err_lm:
+	/* resources are stored in link list */
+	list_for_each_safe(pos, n, &tsi148_bridge->lm_resources) {
+		lm = list_entry(pos, struct vme_lm_resource, list);
+		list_del(pos);
+		kfree(lm);
+	}
+err_dma:
+	/* resources are stored in link list */
+	list_for_each_safe(pos, n, &tsi148_bridge->dma_resources) {
+		dma_ctrlr = list_entry(pos, struct vme_dma_resource, list);
+		list_del(pos);
+		kfree(dma_ctrlr);
+	}
+err_slave:
+	/* resources are stored in link list */
+	list_for_each_safe(pos, n, &tsi148_bridge->slave_resources) {
+		slave_image = list_entry(pos, struct vme_slave_resource, list);
+		list_del(pos);
+		kfree(slave_image);
+	}
+err_master:
+	/* resources are stored in link list */
+	list_for_each_safe(pos, n, &tsi148_bridge->master_resources) {
+		master_image = list_entry(pos, struct vme_master_resource,
+			list);
+		list_del(pos);
+		kfree(master_image);
+	}
+
+	tsi148_irq_exit(tsi148_bridge, pdev);
+err_irq:
+err_test:
+	iounmap(tsi148_device->base);
+err_remap:
+	pci_release_regions(pdev);
+err_resource:
+	pci_disable_device(pdev);
+err_enable:
+	kfree(tsi148_device);
+err_driver:
+	kfree(tsi148_bridge);
+err_struct:
+	return retval;
+
+}
+
+static void tsi148_remove(struct pci_dev *pdev)
+{
+	struct list_head *pos = NULL;
+	struct list_head *tmplist;
+	struct vme_master_resource *master_image;
+	struct vme_slave_resource *slave_image;
+	struct vme_dma_resource *dma_ctrlr;
+	int i;
+	struct tsi148_driver *bridge;
+	struct vme_bridge *tsi148_bridge = pci_get_drvdata(pdev);
+
+	bridge = tsi148_bridge->driver_priv;
+
+
+	dev_dbg(&pdev->dev, "Driver is being unloaded.\n");
+
+	/*
+	 *  Shutdown all inbound and outbound windows.
+	 */
+	for (i = 0; i < 8; i++) {
+		iowrite32be(0, bridge->base + TSI148_LCSR_IT[i] +
+			TSI148_LCSR_OFFSET_ITAT);
+		iowrite32be(0, bridge->base + TSI148_LCSR_OT[i] +
+			TSI148_LCSR_OFFSET_OTAT);
+	}
+
+	/*
+	 *  Shutdown Location monitor.
+	 */
+	iowrite32be(0, bridge->base + TSI148_LCSR_LMAT);
+
+	/*
+	 *  Shutdown CRG map.
+	 */
+	iowrite32be(0, bridge->base + TSI148_LCSR_CSRAT);
+
+	/*
+	 *  Clear error status.
+	 */
+	iowrite32be(0xFFFFFFFF, bridge->base + TSI148_LCSR_EDPAT);
+	iowrite32be(0xFFFFFFFF, bridge->base + TSI148_LCSR_VEAT);
+	iowrite32be(0x07000700, bridge->base + TSI148_LCSR_PSTAT);
+
+	/*
+	 *  Remove VIRQ interrupt (if any)
+	 */
+	if (ioread32be(bridge->base + TSI148_LCSR_VICR) & 0x800)
+		iowrite32be(0x8000, bridge->base + TSI148_LCSR_VICR);
+
+	/*
+	 *  Map all Interrupts to PCI INTA
+	 */
+	iowrite32be(0x0, bridge->base + TSI148_LCSR_INTM1);
+	iowrite32be(0x0, bridge->base + TSI148_LCSR_INTM2);
+
+	tsi148_irq_exit(tsi148_bridge, pdev);
+
+	vme_unregister_bridge(tsi148_bridge);
+
+	tsi148_crcsr_exit(tsi148_bridge, pdev);
+
+	/* resources are stored in link list */
+	list_for_each_safe(pos, tmplist, &tsi148_bridge->dma_resources) {
+		dma_ctrlr = list_entry(pos, struct vme_dma_resource, list);
+		list_del(pos);
+		kfree(dma_ctrlr);
+	}
+
+	/* resources are stored in link list */
+	list_for_each_safe(pos, tmplist, &tsi148_bridge->slave_resources) {
+		slave_image = list_entry(pos, struct vme_slave_resource, list);
+		list_del(pos);
+		kfree(slave_image);
+	}
+
+	/* resources are stored in link list */
+	list_for_each_safe(pos, tmplist, &tsi148_bridge->master_resources) {
+		master_image = list_entry(pos, struct vme_master_resource,
+			list);
+		list_del(pos);
+		kfree(master_image);
+	}
+
+	iounmap(bridge->base);
+
+	pci_release_regions(pdev);
+
+	pci_disable_device(pdev);
+
+	kfree(tsi148_bridge->driver_priv);
+
+	kfree(tsi148_bridge);
+}
+
+module_pci_driver(tsi148_driver);
+
+MODULE_PARM_DESC(err_chk, "Check for VME errors on reads and writes");
+module_param(err_chk, bool, 0);
+
+MODULE_PARM_DESC(geoid, "Override geographical addressing");
+module_param(geoid, int, 0);
+
+MODULE_DESCRIPTION("VME driver for the Tundra Tempe VME bridge");
+MODULE_LICENSE("GPL");
diff --git a/drivers/staging/vme_user/vme_tsi148.h b/drivers/staging/vme_user/vme_tsi148.h
new file mode 100644
index 000000000000..226fedc6f167
--- /dev/null
+++ b/drivers/staging/vme_user/vme_tsi148.h
@@ -0,0 +1,1407 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * tsi148.h
+ *
+ * Support for the Tundra TSI148 VME Bridge chip
+ *
+ * Author: Tom Armistead
+ * Updated and maintained by Ajit Prem
+ * Copyright 2004 Motorola Inc.
+ */
+
+#ifndef TSI148_H
+#define TSI148_H
+
+#ifndef	PCI_VENDOR_ID_TUNDRA
+#define	PCI_VENDOR_ID_TUNDRA 0x10e3
+#endif
+
+#ifndef	PCI_DEVICE_ID_TUNDRA_TSI148
+#define	PCI_DEVICE_ID_TUNDRA_TSI148 0x148
+#endif
+
+/*
+ *  Define the number of each that the Tsi148 supports.
+ */
+#define TSI148_MAX_MASTER		8	/* Max Master Windows */
+#define TSI148_MAX_SLAVE		8	/* Max Slave Windows */
+#define TSI148_MAX_DMA			2	/* Max DMA Controllers */
+#define TSI148_MAX_MAILBOX		4	/* Max Mail Box registers */
+#define TSI148_MAX_SEMAPHORE		8	/* Max Semaphores */
+
+/* Structure used to hold driver specific information */
+struct tsi148_driver {
+	void __iomem *base;	/* Base Address of device registers */
+	wait_queue_head_t dma_queue[2];
+	wait_queue_head_t iack_queue;
+	void (*lm_callback[4])(void *);	/* Called in interrupt handler */
+	void *lm_data[4];
+	void *crcsr_kernel;
+	dma_addr_t crcsr_bus;
+	struct vme_master_resource *flush_image;
+	struct mutex vme_rmw;		/* Only one RMW cycle at a time */
+	struct mutex vme_int;		/*
+					 * Only one VME interrupt can be
+					 * generated at a time, provide locking
+					 */
+};
+
+/*
+ * Layout of a DMAC Linked-List Descriptor
+ *
+ * Note: This structure is accessed via the chip and therefore must be
+ *       correctly laid out - It must also be aligned on 64-bit boundaries.
+ */
+struct tsi148_dma_descriptor {
+	__be32 dsau;      /* Source Address */
+	__be32 dsal;
+	__be32 ddau;      /* Destination Address */
+	__be32 ddal;
+	__be32 dsat;      /* Source attributes */
+	__be32 ddat;      /* Destination attributes */
+	__be32 dnlau;     /* Next link address */
+	__be32 dnlal;
+	__be32 dcnt;      /* Byte count */
+	__be32 ddbs;      /* 2eSST Broadcast select */
+};
+
+struct tsi148_dma_entry {
+	/*
+	 * The descriptor needs to be aligned on a 64-bit boundary, we increase
+	 * the chance of this by putting it first in the structure.
+	 */
+	struct tsi148_dma_descriptor descriptor;
+	struct list_head list;
+	dma_addr_t dma_handle;
+};
+
+/*
+ *  TSI148 ASIC register structure overlays and bit field definitions.
+ *
+ *      Note:   Tsi148 Register Group (CRG) consists of the following
+ *              combination of registers:
+ *                      PCFS    - PCI Configuration Space Registers
+ *                      LCSR    - Local Control and Status Registers
+ *                      GCSR    - Global Control and Status Registers
+ *                      CR/CSR  - Subset of Configuration ROM /
+ *                                Control and Status Registers
+ */
+
+
+/*
+ *  Command/Status Registers (CRG + $004)
+ */
+#define TSI148_PCFS_ID			0x0
+#define TSI148_PCFS_CSR			0x4
+#define TSI148_PCFS_CLASS		0x8
+#define TSI148_PCFS_MISC0		0xC
+#define TSI148_PCFS_MBARL		0x10
+#define TSI148_PCFS_MBARU		0x14
+
+#define TSI148_PCFS_SUBID		0x28
+
+#define TSI148_PCFS_CAPP		0x34
+
+#define TSI148_PCFS_MISC1		0x3C
+
+#define TSI148_PCFS_XCAPP		0x40
+#define TSI148_PCFS_XSTAT		0x44
+
+/*
+ * LCSR definitions
+ */
+
+/*
+ *    Outbound Translations
+ */
+#define TSI148_LCSR_OT0_OTSAU		0x100
+#define TSI148_LCSR_OT0_OTSAL		0x104
+#define TSI148_LCSR_OT0_OTEAU		0x108
+#define TSI148_LCSR_OT0_OTEAL		0x10C
+#define TSI148_LCSR_OT0_OTOFU		0x110
+#define TSI148_LCSR_OT0_OTOFL		0x114
+#define TSI148_LCSR_OT0_OTBS		0x118
+#define TSI148_LCSR_OT0_OTAT		0x11C
+
+#define TSI148_LCSR_OT1_OTSAU		0x120
+#define TSI148_LCSR_OT1_OTSAL		0x124
+#define TSI148_LCSR_OT1_OTEAU		0x128
+#define TSI148_LCSR_OT1_OTEAL		0x12C
+#define TSI148_LCSR_OT1_OTOFU		0x130
+#define TSI148_LCSR_OT1_OTOFL		0x134
+#define TSI148_LCSR_OT1_OTBS		0x138
+#define TSI148_LCSR_OT1_OTAT		0x13C
+
+#define TSI148_LCSR_OT2_OTSAU		0x140
+#define TSI148_LCSR_OT2_OTSAL		0x144
+#define TSI148_LCSR_OT2_OTEAU		0x148
+#define TSI148_LCSR_OT2_OTEAL		0x14C
+#define TSI148_LCSR_OT2_OTOFU		0x150
+#define TSI148_LCSR_OT2_OTOFL		0x154
+#define TSI148_LCSR_OT2_OTBS		0x158
+#define TSI148_LCSR_OT2_OTAT		0x15C
+
+#define TSI148_LCSR_OT3_OTSAU		0x160
+#define TSI148_LCSR_OT3_OTSAL		0x164
+#define TSI148_LCSR_OT3_OTEAU		0x168
+#define TSI148_LCSR_OT3_OTEAL		0x16C
+#define TSI148_LCSR_OT3_OTOFU		0x170
+#define TSI148_LCSR_OT3_OTOFL		0x174
+#define TSI148_LCSR_OT3_OTBS		0x178
+#define TSI148_LCSR_OT3_OTAT		0x17C
+
+#define TSI148_LCSR_OT4_OTSAU		0x180
+#define TSI148_LCSR_OT4_OTSAL		0x184
+#define TSI148_LCSR_OT4_OTEAU		0x188
+#define TSI148_LCSR_OT4_OTEAL		0x18C
+#define TSI148_LCSR_OT4_OTOFU		0x190
+#define TSI148_LCSR_OT4_OTOFL		0x194
+#define TSI148_LCSR_OT4_OTBS		0x198
+#define TSI148_LCSR_OT4_OTAT		0x19C
+
+#define TSI148_LCSR_OT5_OTSAU		0x1A0
+#define TSI148_LCSR_OT5_OTSAL		0x1A4
+#define TSI148_LCSR_OT5_OTEAU		0x1A8
+#define TSI148_LCSR_OT5_OTEAL		0x1AC
+#define TSI148_LCSR_OT5_OTOFU		0x1B0
+#define TSI148_LCSR_OT5_OTOFL		0x1B4
+#define TSI148_LCSR_OT5_OTBS		0x1B8
+#define TSI148_LCSR_OT5_OTAT		0x1BC
+
+#define TSI148_LCSR_OT6_OTSAU		0x1C0
+#define TSI148_LCSR_OT6_OTSAL		0x1C4
+#define TSI148_LCSR_OT6_OTEAU		0x1C8
+#define TSI148_LCSR_OT6_OTEAL		0x1CC
+#define TSI148_LCSR_OT6_OTOFU		0x1D0
+#define TSI148_LCSR_OT6_OTOFL		0x1D4
+#define TSI148_LCSR_OT6_OTBS		0x1D8
+#define TSI148_LCSR_OT6_OTAT		0x1DC
+
+#define TSI148_LCSR_OT7_OTSAU		0x1E0
+#define TSI148_LCSR_OT7_OTSAL		0x1E4
+#define TSI148_LCSR_OT7_OTEAU		0x1E8
+#define TSI148_LCSR_OT7_OTEAL		0x1EC
+#define TSI148_LCSR_OT7_OTOFU		0x1F0
+#define TSI148_LCSR_OT7_OTOFL		0x1F4
+#define TSI148_LCSR_OT7_OTBS		0x1F8
+#define TSI148_LCSR_OT7_OTAT		0x1FC
+
+#define TSI148_LCSR_OT0		0x100
+#define TSI148_LCSR_OT1		0x120
+#define TSI148_LCSR_OT2		0x140
+#define TSI148_LCSR_OT3		0x160
+#define TSI148_LCSR_OT4		0x180
+#define TSI148_LCSR_OT5		0x1A0
+#define TSI148_LCSR_OT6		0x1C0
+#define TSI148_LCSR_OT7		0x1E0
+
+static const int TSI148_LCSR_OT[8] = { TSI148_LCSR_OT0, TSI148_LCSR_OT1,
+					 TSI148_LCSR_OT2, TSI148_LCSR_OT3,
+					 TSI148_LCSR_OT4, TSI148_LCSR_OT5,
+					 TSI148_LCSR_OT6, TSI148_LCSR_OT7 };
+
+#define TSI148_LCSR_OFFSET_OTSAU	0x0
+#define TSI148_LCSR_OFFSET_OTSAL	0x4
+#define TSI148_LCSR_OFFSET_OTEAU	0x8
+#define TSI148_LCSR_OFFSET_OTEAL	0xC
+#define TSI148_LCSR_OFFSET_OTOFU	0x10
+#define TSI148_LCSR_OFFSET_OTOFL	0x14
+#define TSI148_LCSR_OFFSET_OTBS		0x18
+#define TSI148_LCSR_OFFSET_OTAT		0x1C
+
+/*
+ * VMEbus interrupt ack
+ * offset  200
+ */
+#define TSI148_LCSR_VIACK1	0x204
+#define TSI148_LCSR_VIACK2	0x208
+#define TSI148_LCSR_VIACK3	0x20C
+#define TSI148_LCSR_VIACK4	0x210
+#define TSI148_LCSR_VIACK5	0x214
+#define TSI148_LCSR_VIACK6	0x218
+#define TSI148_LCSR_VIACK7	0x21C
+
+static const int TSI148_LCSR_VIACK[8] = { 0, TSI148_LCSR_VIACK1,
+				TSI148_LCSR_VIACK2, TSI148_LCSR_VIACK3,
+				TSI148_LCSR_VIACK4, TSI148_LCSR_VIACK5,
+				TSI148_LCSR_VIACK6, TSI148_LCSR_VIACK7 };
+
+/*
+ * RMW
+ * offset    220
+ */
+#define TSI148_LCSR_RMWAU	0x220
+#define TSI148_LCSR_RMWAL	0x224
+#define TSI148_LCSR_RMWEN	0x228
+#define TSI148_LCSR_RMWC	0x22C
+#define TSI148_LCSR_RMWS	0x230
+
+/*
+ * VMEbus control
+ * offset    234
+ */
+#define TSI148_LCSR_VMCTRL	0x234
+#define TSI148_LCSR_VCTRL	0x238
+#define TSI148_LCSR_VSTAT	0x23C
+
+/*
+ * PCI status
+ * offset  240
+ */
+#define TSI148_LCSR_PSTAT	0x240
+
+/*
+ * VME filter.
+ * offset  250
+ */
+#define TSI148_LCSR_VMEFL	0x250
+
+	/*
+	 * VME exception.
+	 * offset  260
+ */
+#define TSI148_LCSR_VEAU	0x260
+#define TSI148_LCSR_VEAL	0x264
+#define TSI148_LCSR_VEAT	0x268
+
+	/*
+	 * PCI error
+	 * offset  270
+	 */
+#define TSI148_LCSR_EDPAU	0x270
+#define TSI148_LCSR_EDPAL	0x274
+#define TSI148_LCSR_EDPXA	0x278
+#define TSI148_LCSR_EDPXS	0x27C
+#define TSI148_LCSR_EDPAT	0x280
+
+	/*
+	 * Inbound Translations
+	 * offset  300
+	 */
+#define TSI148_LCSR_IT0_ITSAU		0x300
+#define TSI148_LCSR_IT0_ITSAL		0x304
+#define TSI148_LCSR_IT0_ITEAU		0x308
+#define TSI148_LCSR_IT0_ITEAL		0x30C
+#define TSI148_LCSR_IT0_ITOFU		0x310
+#define TSI148_LCSR_IT0_ITOFL		0x314
+#define TSI148_LCSR_IT0_ITAT		0x318
+
+#define TSI148_LCSR_IT1_ITSAU		0x320
+#define TSI148_LCSR_IT1_ITSAL		0x324
+#define TSI148_LCSR_IT1_ITEAU		0x328
+#define TSI148_LCSR_IT1_ITEAL		0x32C
+#define TSI148_LCSR_IT1_ITOFU		0x330
+#define TSI148_LCSR_IT1_ITOFL		0x334
+#define TSI148_LCSR_IT1_ITAT		0x338
+
+#define TSI148_LCSR_IT2_ITSAU		0x340
+#define TSI148_LCSR_IT2_ITSAL		0x344
+#define TSI148_LCSR_IT2_ITEAU		0x348
+#define TSI148_LCSR_IT2_ITEAL		0x34C
+#define TSI148_LCSR_IT2_ITOFU		0x350
+#define TSI148_LCSR_IT2_ITOFL		0x354
+#define TSI148_LCSR_IT2_ITAT		0x358
+
+#define TSI148_LCSR_IT3_ITSAU		0x360
+#define TSI148_LCSR_IT3_ITSAL		0x364
+#define TSI148_LCSR_IT3_ITEAU		0x368
+#define TSI148_LCSR_IT3_ITEAL		0x36C
+#define TSI148_LCSR_IT3_ITOFU		0x370
+#define TSI148_LCSR_IT3_ITOFL		0x374
+#define TSI148_LCSR_IT3_ITAT		0x378
+
+#define TSI148_LCSR_IT4_ITSAU		0x380
+#define TSI148_LCSR_IT4_ITSAL		0x384
+#define TSI148_LCSR_IT4_ITEAU		0x388
+#define TSI148_LCSR_IT4_ITEAL		0x38C
+#define TSI148_LCSR_IT4_ITOFU		0x390
+#define TSI148_LCSR_IT4_ITOFL		0x394
+#define TSI148_LCSR_IT4_ITAT		0x398
+
+#define TSI148_LCSR_IT5_ITSAU		0x3A0
+#define TSI148_LCSR_IT5_ITSAL		0x3A4
+#define TSI148_LCSR_IT5_ITEAU		0x3A8
+#define TSI148_LCSR_IT5_ITEAL		0x3AC
+#define TSI148_LCSR_IT5_ITOFU		0x3B0
+#define TSI148_LCSR_IT5_ITOFL		0x3B4
+#define TSI148_LCSR_IT5_ITAT		0x3B8
+
+#define TSI148_LCSR_IT6_ITSAU		0x3C0
+#define TSI148_LCSR_IT6_ITSAL		0x3C4
+#define TSI148_LCSR_IT6_ITEAU		0x3C8
+#define TSI148_LCSR_IT6_ITEAL		0x3CC
+#define TSI148_LCSR_IT6_ITOFU		0x3D0
+#define TSI148_LCSR_IT6_ITOFL		0x3D4
+#define TSI148_LCSR_IT6_ITAT		0x3D8
+
+#define TSI148_LCSR_IT7_ITSAU		0x3E0
+#define TSI148_LCSR_IT7_ITSAL		0x3E4
+#define TSI148_LCSR_IT7_ITEAU		0x3E8
+#define TSI148_LCSR_IT7_ITEAL		0x3EC
+#define TSI148_LCSR_IT7_ITOFU		0x3F0
+#define TSI148_LCSR_IT7_ITOFL		0x3F4
+#define TSI148_LCSR_IT7_ITAT		0x3F8
+
+
+#define TSI148_LCSR_IT0		0x300
+#define TSI148_LCSR_IT1		0x320
+#define TSI148_LCSR_IT2		0x340
+#define TSI148_LCSR_IT3		0x360
+#define TSI148_LCSR_IT4		0x380
+#define TSI148_LCSR_IT5		0x3A0
+#define TSI148_LCSR_IT6		0x3C0
+#define TSI148_LCSR_IT7		0x3E0
+
+static const int TSI148_LCSR_IT[8] = { TSI148_LCSR_IT0, TSI148_LCSR_IT1,
+					 TSI148_LCSR_IT2, TSI148_LCSR_IT3,
+					 TSI148_LCSR_IT4, TSI148_LCSR_IT5,
+					 TSI148_LCSR_IT6, TSI148_LCSR_IT7 };
+
+#define TSI148_LCSR_OFFSET_ITSAU	0x0
+#define TSI148_LCSR_OFFSET_ITSAL	0x4
+#define TSI148_LCSR_OFFSET_ITEAU	0x8
+#define TSI148_LCSR_OFFSET_ITEAL	0xC
+#define TSI148_LCSR_OFFSET_ITOFU	0x10
+#define TSI148_LCSR_OFFSET_ITOFL	0x14
+#define TSI148_LCSR_OFFSET_ITAT		0x18
+
+	/*
+	 * Inbound Translation GCSR
+	 * offset  400
+	 */
+#define TSI148_LCSR_GBAU	0x400
+#define TSI148_LCSR_GBAL	0x404
+#define TSI148_LCSR_GCSRAT	0x408
+
+	/*
+	 * Inbound Translation CRG
+	 * offset  40C
+	 */
+#define TSI148_LCSR_CBAU	0x40C
+#define TSI148_LCSR_CBAL	0x410
+#define TSI148_LCSR_CSRAT	0x414
+
+	/*
+	 * Inbound Translation CR/CSR
+	 *         CRG
+	 * offset  418
+	 */
+#define TSI148_LCSR_CROU	0x418
+#define TSI148_LCSR_CROL	0x41C
+#define TSI148_LCSR_CRAT	0x420
+
+	/*
+	 * Inbound Translation Location Monitor
+	 * offset  424
+	 */
+#define TSI148_LCSR_LMBAU	0x424
+#define TSI148_LCSR_LMBAL	0x428
+#define TSI148_LCSR_LMAT	0x42C
+
+	/*
+	 * VMEbus Interrupt Control.
+	 * offset  430
+	 */
+#define TSI148_LCSR_BCU		0x430
+#define TSI148_LCSR_BCL		0x434
+#define TSI148_LCSR_BPGTR	0x438
+#define TSI148_LCSR_BPCTR	0x43C
+#define TSI148_LCSR_VICR	0x440
+
+	/*
+	 * Local Bus Interrupt Control.
+	 * offset  448
+	 */
+#define TSI148_LCSR_INTEN	0x448
+#define TSI148_LCSR_INTEO	0x44C
+#define TSI148_LCSR_INTS	0x450
+#define TSI148_LCSR_INTC	0x454
+#define TSI148_LCSR_INTM1	0x458
+#define TSI148_LCSR_INTM2	0x45C
+
+	/*
+	 * DMA Controllers
+	 * offset 500
+	 */
+#define TSI148_LCSR_DCTL0	0x500
+#define TSI148_LCSR_DSTA0	0x504
+#define TSI148_LCSR_DCSAU0	0x508
+#define TSI148_LCSR_DCSAL0	0x50C
+#define TSI148_LCSR_DCDAU0	0x510
+#define TSI148_LCSR_DCDAL0	0x514
+#define TSI148_LCSR_DCLAU0	0x518
+#define TSI148_LCSR_DCLAL0	0x51C
+#define TSI148_LCSR_DSAU0	0x520
+#define TSI148_LCSR_DSAL0	0x524
+#define TSI148_LCSR_DDAU0	0x528
+#define TSI148_LCSR_DDAL0	0x52C
+#define TSI148_LCSR_DSAT0	0x530
+#define TSI148_LCSR_DDAT0	0x534
+#define TSI148_LCSR_DNLAU0	0x538
+#define TSI148_LCSR_DNLAL0	0x53C
+#define TSI148_LCSR_DCNT0	0x540
+#define TSI148_LCSR_DDBS0	0x544
+
+#define TSI148_LCSR_DCTL1	0x580
+#define TSI148_LCSR_DSTA1	0x584
+#define TSI148_LCSR_DCSAU1	0x588
+#define TSI148_LCSR_DCSAL1	0x58C
+#define TSI148_LCSR_DCDAU1	0x590
+#define TSI148_LCSR_DCDAL1	0x594
+#define TSI148_LCSR_DCLAU1	0x598
+#define TSI148_LCSR_DCLAL1	0x59C
+#define TSI148_LCSR_DSAU1	0x5A0
+#define TSI148_LCSR_DSAL1	0x5A4
+#define TSI148_LCSR_DDAU1	0x5A8
+#define TSI148_LCSR_DDAL1	0x5AC
+#define TSI148_LCSR_DSAT1	0x5B0
+#define TSI148_LCSR_DDAT1	0x5B4
+#define TSI148_LCSR_DNLAU1	0x5B8
+#define TSI148_LCSR_DNLAL1	0x5BC
+#define TSI148_LCSR_DCNT1	0x5C0
+#define TSI148_LCSR_DDBS1	0x5C4
+
+#define TSI148_LCSR_DMA0	0x500
+#define TSI148_LCSR_DMA1	0x580
+
+
+static const int TSI148_LCSR_DMA[TSI148_MAX_DMA] = { TSI148_LCSR_DMA0,
+						TSI148_LCSR_DMA1 };
+
+#define TSI148_LCSR_OFFSET_DCTL		0x0
+#define TSI148_LCSR_OFFSET_DSTA		0x4
+#define TSI148_LCSR_OFFSET_DCSAU	0x8
+#define TSI148_LCSR_OFFSET_DCSAL	0xC
+#define TSI148_LCSR_OFFSET_DCDAU	0x10
+#define TSI148_LCSR_OFFSET_DCDAL	0x14
+#define TSI148_LCSR_OFFSET_DCLAU	0x18
+#define TSI148_LCSR_OFFSET_DCLAL	0x1C
+#define TSI148_LCSR_OFFSET_DSAU		0x20
+#define TSI148_LCSR_OFFSET_DSAL		0x24
+#define TSI148_LCSR_OFFSET_DDAU		0x28
+#define TSI148_LCSR_OFFSET_DDAL		0x2C
+#define TSI148_LCSR_OFFSET_DSAT		0x30
+#define TSI148_LCSR_OFFSET_DDAT		0x34
+#define TSI148_LCSR_OFFSET_DNLAU	0x38
+#define TSI148_LCSR_OFFSET_DNLAL	0x3C
+#define TSI148_LCSR_OFFSET_DCNT		0x40
+#define TSI148_LCSR_OFFSET_DDBS		0x44
+
+	/*
+	 * GCSR Register Group
+	 */
+
+	/*
+	 *         GCSR    CRG
+	 * offset   00     600 - DEVI/VENI
+	 * offset   04     604 - CTRL/GA/REVID
+	 * offset   08     608 - Semaphore3/2/1/0
+	 * offset   0C     60C - Seamphore7/6/5/4
+	 */
+#define TSI148_GCSR_ID		0x600
+#define TSI148_GCSR_CSR		0x604
+#define TSI148_GCSR_SEMA0	0x608
+#define TSI148_GCSR_SEMA1	0x60C
+
+	/*
+	 * Mail Box
+	 *         GCSR    CRG
+	 * offset   10     610 - Mailbox0
+	 */
+#define TSI148_GCSR_MBOX0	0x610
+#define TSI148_GCSR_MBOX1	0x614
+#define TSI148_GCSR_MBOX2	0x618
+#define TSI148_GCSR_MBOX3	0x61C
+
+static const int TSI148_GCSR_MBOX[4] = { TSI148_GCSR_MBOX0,
+					TSI148_GCSR_MBOX1,
+					TSI148_GCSR_MBOX2,
+					TSI148_GCSR_MBOX3 };
+
+	/*
+	 * CR/CSR
+	 */
+
+	/*
+	 *        CR/CSR   CRG
+	 * offset  7FFF4   FF4 - CSRBCR
+	 * offset  7FFF8   FF8 - CSRBSR
+	 * offset  7FFFC   FFC - CBAR
+	 */
+#define TSI148_CSRBCR	0xFF4
+#define TSI148_CSRBSR	0xFF8
+#define TSI148_CBAR	0xFFC
+
+
+
+
+	/*
+	 *  TSI148 Register Bit Definitions
+	 */
+
+	/*
+	 *  PFCS Register Set
+	 */
+#define TSI148_PCFS_CMMD_SERR          (1<<8)	/* SERR_L out pin ssys err */
+#define TSI148_PCFS_CMMD_PERR          (1<<6)	/* PERR_L out pin  parity */
+#define TSI148_PCFS_CMMD_MSTR          (1<<2)	/* PCI bus master */
+#define TSI148_PCFS_CMMD_MEMSP         (1<<1)	/* PCI mem space access  */
+#define TSI148_PCFS_CMMD_IOSP          (1<<0)	/* PCI I/O space enable */
+
+#define TSI148_PCFS_STAT_RCPVE         (1<<15)	/* Detected Parity Error */
+#define TSI148_PCFS_STAT_SIGSE         (1<<14)	/* Signalled System Error */
+#define TSI148_PCFS_STAT_RCVMA         (1<<13)	/* Received Master Abort */
+#define TSI148_PCFS_STAT_RCVTA         (1<<12)	/* Received Target Abort */
+#define TSI148_PCFS_STAT_SIGTA         (1<<11)	/* Signalled Target Abort */
+#define TSI148_PCFS_STAT_SELTIM        (3<<9)	/* DELSEL Timing */
+#define TSI148_PCFS_STAT_DPAR          (1<<8)	/* Data Parity Err Reported */
+#define TSI148_PCFS_STAT_FAST          (1<<7)	/* Fast back-to-back Cap */
+#define TSI148_PCFS_STAT_P66M          (1<<5)	/* 66 MHz Capable */
+#define TSI148_PCFS_STAT_CAPL          (1<<4)	/* Capab List - address $34 */
+
+/*
+ *  Revision ID/Class Code Registers   (CRG +$008)
+ */
+#define TSI148_PCFS_CLAS_M             (0xFF<<24)	/* Class ID */
+#define TSI148_PCFS_SUBCLAS_M          (0xFF<<16)	/* Sub-Class ID */
+#define TSI148_PCFS_PROGIF_M           (0xFF<<8)	/* Sub-Class ID */
+#define TSI148_PCFS_REVID_M            (0xFF<<0)	/* Rev ID */
+
+/*
+ * Cache Line Size/ Master Latency Timer/ Header Type Registers (CRG + $00C)
+ */
+#define TSI148_PCFS_HEAD_M             (0xFF<<16)	/* Master Lat Timer */
+#define TSI148_PCFS_MLAT_M             (0xFF<<8)	/* Master Lat Timer */
+#define TSI148_PCFS_CLSZ_M             (0xFF<<0)	/* Cache Line Size */
+
+/*
+ *  Memory Base Address Lower Reg (CRG + $010)
+ */
+#define TSI148_PCFS_MBARL_BASEL_M      (0xFFFFF<<12) /* Base Addr Lower Mask */
+#define TSI148_PCFS_MBARL_PRE          (1<<3)	/* Prefetch */
+#define TSI148_PCFS_MBARL_MTYPE_M      (3<<1)	/* Memory Type Mask */
+#define TSI148_PCFS_MBARL_IOMEM        (1<<0)	/* I/O Space Indicator */
+
+/*
+ *  Message Signaled Interrupt Capabilities Register (CRG + $040)
+ */
+#define TSI148_PCFS_MSICAP_64BAC       (1<<7)	/* 64-bit Address Capable */
+#define TSI148_PCFS_MSICAP_MME_M       (7<<4)	/* Multiple Msg Enable Mask */
+#define TSI148_PCFS_MSICAP_MMC_M       (7<<1)	/* Multiple Msg Capable Mask */
+#define TSI148_PCFS_MSICAP_MSIEN       (1<<0)	/* Msg signaled INT Enable */
+
+/*
+ *  Message Address Lower Register (CRG +$044)
+ */
+#define TSI148_PCFS_MSIAL_M            (0x3FFFFFFF<<2)	/* Mask */
+
+/*
+ *  Message Data Register (CRG + 4C)
+ */
+#define TSI148_PCFS_MSIMD_M            (0xFFFF<<0)	/* Mask */
+
+/*
+ *  PCI-X Capabilities Register (CRG + $050)
+ */
+#define TSI148_PCFS_PCIXCAP_MOST_M     (7<<4)	/* Max outstanding Split Tran */
+#define TSI148_PCFS_PCIXCAP_MMRBC_M    (3<<2)	/* Max Mem Read byte cnt */
+#define TSI148_PCFS_PCIXCAP_ERO        (1<<1)	/* Enable Relaxed Ordering */
+#define TSI148_PCFS_PCIXCAP_DPERE      (1<<0)	/* Data Parity Recover Enable */
+
+/*
+ *  PCI-X Status Register (CRG +$054)
+ */
+#define TSI148_PCFS_PCIXSTAT_RSCEM     (1<<29)	/* Received Split Comp Error */
+#define TSI148_PCFS_PCIXSTAT_DMCRS_M   (7<<26)	/* max Cumulative Read Size */
+#define TSI148_PCFS_PCIXSTAT_DMOST_M   (7<<23)	/* max outstanding Split Trans
+						 */
+#define TSI148_PCFS_PCIXSTAT_DMMRC_M   (3<<21)	/* max mem read byte count */
+#define TSI148_PCFS_PCIXSTAT_DC        (1<<20)	/* Device Complexity */
+#define TSI148_PCFS_PCIXSTAT_USC       (1<<19)	/* Unexpected Split comp */
+#define TSI148_PCFS_PCIXSTAT_SCD       (1<<18)	/* Split completion discard */
+#define TSI148_PCFS_PCIXSTAT_133C      (1<<17)	/* 133MHz capable */
+#define TSI148_PCFS_PCIXSTAT_64D       (1<<16)	/* 64 bit device */
+#define TSI148_PCFS_PCIXSTAT_BN_M      (0xFF<<8)	/* Bus number */
+#define TSI148_PCFS_PCIXSTAT_DN_M      (0x1F<<3)	/* Device number */
+#define TSI148_PCFS_PCIXSTAT_FN_M      (7<<0)	/* Function Number */
+
+/*
+ *  LCSR Registers
+ */
+
+/*
+ *  Outbound Translation Starting Address Lower
+ */
+#define TSI148_LCSR_OTSAL_M            (0xFFFF<<16)	/* Mask */
+
+/*
+ *  Outbound Translation Ending Address Lower
+ */
+#define TSI148_LCSR_OTEAL_M            (0xFFFF<<16)	/* Mask */
+
+/*
+ *  Outbound Translation Offset Lower
+ */
+#define TSI148_LCSR_OTOFFL_M           (0xFFFF<<16)	/* Mask */
+
+/*
+ *  Outbound Translation 2eSST Broadcast Select
+ */
+#define TSI148_LCSR_OTBS_M             (0xFFFFF<<0)	/* Mask */
+
+/*
+ *  Outbound Translation Attribute
+ */
+#define TSI148_LCSR_OTAT_EN            (1<<31)	/* Window Enable */
+#define TSI148_LCSR_OTAT_MRPFD         (1<<18)	/* Prefetch Disable */
+
+#define TSI148_LCSR_OTAT_PFS_M         (3<<16)	/* Prefetch Size Mask */
+#define TSI148_LCSR_OTAT_PFS_2         (0<<16)	/* 2 Cache Lines P Size */
+#define TSI148_LCSR_OTAT_PFS_4         (1<<16)	/* 4 Cache Lines P Size */
+#define TSI148_LCSR_OTAT_PFS_8         (2<<16)	/* 8 Cache Lines P Size */
+#define TSI148_LCSR_OTAT_PFS_16        (3<<16)	/* 16 Cache Lines P Size */
+
+#define TSI148_LCSR_OTAT_2eSSTM_M      (7<<11)	/* 2eSST Xfer Rate Mask */
+#define TSI148_LCSR_OTAT_2eSSTM_160    (0<<11)	/* 160MB/s 2eSST Xfer Rate */
+#define TSI148_LCSR_OTAT_2eSSTM_267    (1<<11)	/* 267MB/s 2eSST Xfer Rate */
+#define TSI148_LCSR_OTAT_2eSSTM_320    (2<<11)	/* 320MB/s 2eSST Xfer Rate */
+
+#define TSI148_LCSR_OTAT_TM_M          (7<<8)	/* Xfer Protocol Mask */
+#define TSI148_LCSR_OTAT_TM_SCT        (0<<8)	/* SCT Xfer Protocol */
+#define TSI148_LCSR_OTAT_TM_BLT        (1<<8)	/* BLT Xfer Protocol */
+#define TSI148_LCSR_OTAT_TM_MBLT       (2<<8)	/* MBLT Xfer Protocol */
+#define TSI148_LCSR_OTAT_TM_2eVME      (3<<8)	/* 2eVME Xfer Protocol */
+#define TSI148_LCSR_OTAT_TM_2eSST      (4<<8)	/* 2eSST Xfer Protocol */
+#define TSI148_LCSR_OTAT_TM_2eSSTB     (5<<8)	/* 2eSST Bcast Xfer Protocol */
+
+#define TSI148_LCSR_OTAT_DBW_M         (3<<6)	/* Max Data Width */
+#define TSI148_LCSR_OTAT_DBW_16        (0<<6)	/* 16-bit Data Width */
+#define TSI148_LCSR_OTAT_DBW_32        (1<<6)	/* 32-bit Data Width */
+
+#define TSI148_LCSR_OTAT_SUP           (1<<5)	/* Supervisory Access */
+#define TSI148_LCSR_OTAT_PGM           (1<<4)	/* Program Access */
+
+#define TSI148_LCSR_OTAT_AMODE_M       (0xf<<0)	/* Address Mode Mask */
+#define TSI148_LCSR_OTAT_AMODE_A16     (0<<0)	/* A16 Address Space */
+#define TSI148_LCSR_OTAT_AMODE_A24     (1<<0)	/* A24 Address Space */
+#define TSI148_LCSR_OTAT_AMODE_A32     (2<<0)	/* A32 Address Space */
+#define TSI148_LCSR_OTAT_AMODE_A64     (4<<0)	/* A32 Address Space */
+#define TSI148_LCSR_OTAT_AMODE_CRCSR   (5<<0)	/* CR/CSR Address Space */
+#define TSI148_LCSR_OTAT_AMODE_USER1   (8<<0)	/* User1 Address Space */
+#define TSI148_LCSR_OTAT_AMODE_USER2   (9<<0)	/* User2 Address Space */
+#define TSI148_LCSR_OTAT_AMODE_USER3   (10<<0)	/* User3 Address Space */
+#define TSI148_LCSR_OTAT_AMODE_USER4   (11<<0)	/* User4 Address Space */
+
+/*
+ *  VME Master Control Register  CRG+$234
+ */
+#define TSI148_LCSR_VMCTRL_VSA         (1<<27)	/* VMEbus Stop Ack */
+#define TSI148_LCSR_VMCTRL_VS          (1<<26)	/* VMEbus Stop */
+#define TSI148_LCSR_VMCTRL_DHB         (1<<25)	/* Device Has Bus */
+#define TSI148_LCSR_VMCTRL_DWB         (1<<24)	/* Device Wants Bus */
+
+#define TSI148_LCSR_VMCTRL_RMWEN       (1<<20)	/* RMW Enable */
+
+#define TSI148_LCSR_VMCTRL_ATO_M       (7<<16)	/* Master Access Time-out Mask
+						 */
+#define TSI148_LCSR_VMCTRL_ATO_32      (0<<16)	/* 32 us */
+#define TSI148_LCSR_VMCTRL_ATO_128     (1<<16)	/* 128 us */
+#define TSI148_LCSR_VMCTRL_ATO_512     (2<<16)	/* 512 us */
+#define TSI148_LCSR_VMCTRL_ATO_2M      (3<<16)	/* 2 ms */
+#define TSI148_LCSR_VMCTRL_ATO_8M      (4<<16)	/* 8 ms */
+#define TSI148_LCSR_VMCTRL_ATO_32M     (5<<16)	/* 32 ms */
+#define TSI148_LCSR_VMCTRL_ATO_128M    (6<<16)	/* 128 ms */
+#define TSI148_LCSR_VMCTRL_ATO_DIS     (7<<16)	/* Disabled */
+
+#define TSI148_LCSR_VMCTRL_VTOFF_M     (7<<12)	/* VMEbus Master Time off */
+#define TSI148_LCSR_VMCTRL_VTOFF_0     (0<<12)	/* 0us */
+#define TSI148_LCSR_VMCTRL_VTOFF_1     (1<<12)	/* 1us */
+#define TSI148_LCSR_VMCTRL_VTOFF_2     (2<<12)	/* 2us */
+#define TSI148_LCSR_VMCTRL_VTOFF_4     (3<<12)	/* 4us */
+#define TSI148_LCSR_VMCTRL_VTOFF_8     (4<<12)	/* 8us */
+#define TSI148_LCSR_VMCTRL_VTOFF_16    (5<<12)	/* 16us */
+#define TSI148_LCSR_VMCTRL_VTOFF_32    (6<<12)	/* 32us */
+#define TSI148_LCSR_VMCTRL_VTOFF_64    (7<<12)	/* 64us */
+
+#define TSI148_LCSR_VMCTRL_VTON_M      (7<<8)	/* VMEbus Master Time On */
+#define TSI148_LCSR_VMCTRL_VTON_4      (0<<8)	/* 8us */
+#define TSI148_LCSR_VMCTRL_VTON_8      (1<<8)	/* 8us */
+#define TSI148_LCSR_VMCTRL_VTON_16     (2<<8)	/* 16us */
+#define TSI148_LCSR_VMCTRL_VTON_32     (3<<8)	/* 32us */
+#define TSI148_LCSR_VMCTRL_VTON_64     (4<<8)	/* 64us */
+#define TSI148_LCSR_VMCTRL_VTON_128    (5<<8)	/* 128us */
+#define TSI148_LCSR_VMCTRL_VTON_256    (6<<8)	/* 256us */
+#define TSI148_LCSR_VMCTRL_VTON_512    (7<<8)	/* 512us */
+
+#define TSI148_LCSR_VMCTRL_VREL_M      (3<<3)	/* VMEbus Master Rel Mode Mask
+						 */
+#define TSI148_LCSR_VMCTRL_VREL_T_D    (0<<3)	/* Time on or Done */
+#define TSI148_LCSR_VMCTRL_VREL_T_R_D  (1<<3)	/* Time on and REQ or Done */
+#define TSI148_LCSR_VMCTRL_VREL_T_B_D  (2<<3)	/* Time on and BCLR or Done */
+#define TSI148_LCSR_VMCTRL_VREL_T_D_R  (3<<3)	/* Time on or Done and REQ */
+
+#define TSI148_LCSR_VMCTRL_VFAIR       (1<<2)	/* VMEbus Master Fair Mode */
+#define TSI148_LCSR_VMCTRL_VREQL_M     (3<<0)	/* VMEbus Master Req Level Mask
+						 */
+
+/*
+ *  VMEbus Control Register CRG+$238
+ */
+#define TSI148_LCSR_VCTRL_LRE          (1<<31)	/* Late Retry Enable */
+
+#define TSI148_LCSR_VCTRL_DLT_M        (0xF<<24)	/* Deadlock Timer */
+#define TSI148_LCSR_VCTRL_DLT_OFF      (0<<24)	/* Deadlock Timer Off */
+#define TSI148_LCSR_VCTRL_DLT_16       (1<<24)	/* 16 VCLKS */
+#define TSI148_LCSR_VCTRL_DLT_32       (2<<24)	/* 32 VCLKS */
+#define TSI148_LCSR_VCTRL_DLT_64       (3<<24)	/* 64 VCLKS */
+#define TSI148_LCSR_VCTRL_DLT_128      (4<<24)	/* 128 VCLKS */
+#define TSI148_LCSR_VCTRL_DLT_256      (5<<24)	/* 256 VCLKS */
+#define TSI148_LCSR_VCTRL_DLT_512      (6<<24)	/* 512 VCLKS */
+#define TSI148_LCSR_VCTRL_DLT_1024     (7<<24)	/* 1024 VCLKS */
+#define TSI148_LCSR_VCTRL_DLT_2048     (8<<24)	/* 2048 VCLKS */
+#define TSI148_LCSR_VCTRL_DLT_4096     (9<<24)	/* 4096 VCLKS */
+#define TSI148_LCSR_VCTRL_DLT_8192     (0xA<<24)	/* 8192 VCLKS */
+#define TSI148_LCSR_VCTRL_DLT_16384    (0xB<<24)	/* 16384 VCLKS */
+#define TSI148_LCSR_VCTRL_DLT_32768    (0xC<<24)	/* 32768 VCLKS */
+
+#define TSI148_LCSR_VCTRL_NERBB        (1<<20)	/* No Early Release of Bus Busy
+						 */
+
+#define TSI148_LCSR_VCTRL_SRESET       (1<<17)	/* System Reset */
+#define TSI148_LCSR_VCTRL_LRESET       (1<<16)	/* Local Reset */
+
+#define TSI148_LCSR_VCTRL_SFAILAI      (1<<15)	/* SYSFAIL Auto Slot ID */
+#define TSI148_LCSR_VCTRL_BID_M        (0x1F<<8)	/* Broadcast ID Mask */
+
+#define TSI148_LCSR_VCTRL_ATOEN        (1<<7)	/* Arbiter Time-out Enable */
+#define TSI148_LCSR_VCTRL_ROBIN        (1<<6)	/* VMEbus Round Robin */
+
+#define TSI148_LCSR_VCTRL_GTO_M        (7<<0)	/* VMEbus Global Time-out Mask
+						 */
+#define TSI148_LCSR_VCTRL_GTO_8	      (0<<0)	/* 8 us */
+#define TSI148_LCSR_VCTRL_GTO_16	      (1<<0)	/* 16 us */
+#define TSI148_LCSR_VCTRL_GTO_32	      (2<<0)	/* 32 us */
+#define TSI148_LCSR_VCTRL_GTO_64	      (3<<0)	/* 64 us */
+#define TSI148_LCSR_VCTRL_GTO_128      (4<<0)	/* 128 us */
+#define TSI148_LCSR_VCTRL_GTO_256      (5<<0)	/* 256 us */
+#define TSI148_LCSR_VCTRL_GTO_512      (6<<0)	/* 512 us */
+#define TSI148_LCSR_VCTRL_GTO_DIS      (7<<0)	/* Disabled */
+
+/*
+ *  VMEbus Status Register  CRG + $23C
+ */
+#define TSI148_LCSR_VSTAT_CPURST       (1<<15)	/* Clear power up reset */
+#define TSI148_LCSR_VSTAT_BRDFL        (1<<14)	/* Board fail */
+#define TSI148_LCSR_VSTAT_PURSTS       (1<<12)	/* Power up reset status */
+#define TSI148_LCSR_VSTAT_BDFAILS      (1<<11)	/* Board Fail Status */
+#define TSI148_LCSR_VSTAT_SYSFAILS     (1<<10)	/* System Fail Status */
+#define TSI148_LCSR_VSTAT_ACFAILS      (1<<9)	/* AC fail status */
+#define TSI148_LCSR_VSTAT_SCONS        (1<<8)	/* System Cont Status */
+#define TSI148_LCSR_VSTAT_GAP          (1<<5)	/* Geographic Addr Parity */
+#define TSI148_LCSR_VSTAT_GA_M         (0x1F<<0)  /* Geographic Addr Mask */
+
+/*
+ *  PCI Configuration Status Register CRG+$240
+ */
+#define TSI148_LCSR_PSTAT_REQ64S       (1<<6)	/* Request 64 status set */
+#define TSI148_LCSR_PSTAT_M66ENS       (1<<5)	/* M66ENS 66Mhz enable */
+#define TSI148_LCSR_PSTAT_FRAMES       (1<<4)	/* Frame Status */
+#define TSI148_LCSR_PSTAT_IRDYS        (1<<3)	/* IRDY status */
+#define TSI148_LCSR_PSTAT_DEVSELS      (1<<2)	/* DEVL status */
+#define TSI148_LCSR_PSTAT_STOPS        (1<<1)	/* STOP status */
+#define TSI148_LCSR_PSTAT_TRDYS        (1<<0)	/* TRDY status */
+
+/*
+ *  VMEbus Exception Attributes Register  CRG + $268
+ */
+#define TSI148_LCSR_VEAT_VES           (1<<31)	/* Status */
+#define TSI148_LCSR_VEAT_VEOF          (1<<30)	/* Overflow */
+#define TSI148_LCSR_VEAT_VESCL         (1<<29)	/* Status Clear */
+#define TSI148_LCSR_VEAT_2EOT          (1<<21)	/* 2e Odd Termination */
+#define TSI148_LCSR_VEAT_2EST          (1<<20)	/* 2e Slave terminated */
+#define TSI148_LCSR_VEAT_BERR          (1<<19)	/* Bus Error */
+#define TSI148_LCSR_VEAT_LWORD         (1<<18)	/* LWORD_ signal state */
+#define TSI148_LCSR_VEAT_WRITE         (1<<17)	/* WRITE_ signal state */
+#define TSI148_LCSR_VEAT_IACK          (1<<16)	/* IACK_ signal state */
+#define TSI148_LCSR_VEAT_DS1           (1<<15)	/* DS1_ signal state */
+#define TSI148_LCSR_VEAT_DS0           (1<<14)	/* DS0_ signal state */
+#define TSI148_LCSR_VEAT_AM_M          (0x3F<<8)	/* Address Mode Mask */
+#define TSI148_LCSR_VEAT_XAM_M         (0xFF<<0)	/* Master AMode Mask */
+
+
+/*
+ * VMEbus PCI Error Diagnostics PCI/X Attributes Register  CRG + $280
+ */
+#define TSI148_LCSR_EDPAT_EDPCL        (1<<29)
+
+/*
+ *  Inbound Translation Starting Address Lower
+ */
+#define TSI148_LCSR_ITSAL6432_M        (0xFFFF<<16)	/* Mask */
+#define TSI148_LCSR_ITSAL24_M          (0x00FFF<<12)	/* Mask */
+#define TSI148_LCSR_ITSAL16_M          (0x0000FFF<<4)	/* Mask */
+
+/*
+ *  Inbound Translation Ending Address Lower
+ */
+#define TSI148_LCSR_ITEAL6432_M        (0xFFFF<<16)	/* Mask */
+#define TSI148_LCSR_ITEAL24_M          (0x00FFF<<12)	/* Mask */
+#define TSI148_LCSR_ITEAL16_M          (0x0000FFF<<4)	/* Mask */
+
+/*
+ *  Inbound Translation Offset Lower
+ */
+#define TSI148_LCSR_ITOFFL6432_M       (0xFFFF<<16)	/* Mask */
+#define TSI148_LCSR_ITOFFL24_M         (0xFFFFF<<12)	/* Mask */
+#define TSI148_LCSR_ITOFFL16_M         (0xFFFFFFF<<4)	/* Mask */
+
+/*
+ *  Inbound Translation Attribute
+ */
+#define TSI148_LCSR_ITAT_EN            (1<<31)	/* Window Enable */
+#define TSI148_LCSR_ITAT_TH            (1<<18)	/* Prefetch Threshold */
+
+#define TSI148_LCSR_ITAT_VFS_M         (3<<16)	/* Virtual FIFO Size Mask */
+#define TSI148_LCSR_ITAT_VFS_64        (0<<16)	/* 64 bytes Virtual FIFO Size */
+#define TSI148_LCSR_ITAT_VFS_128       (1<<16)	/* 128 bytes Virtual FIFO Sz */
+#define TSI148_LCSR_ITAT_VFS_256       (2<<16)	/* 256 bytes Virtual FIFO Sz */
+#define TSI148_LCSR_ITAT_VFS_512       (3<<16)	/* 512 bytes Virtual FIFO Sz */
+
+#define TSI148_LCSR_ITAT_2eSSTM_M      (7<<12)	/* 2eSST Xfer Rate Mask */
+#define TSI148_LCSR_ITAT_2eSSTM_160    (0<<12)	/* 160MB/s 2eSST Xfer Rate */
+#define TSI148_LCSR_ITAT_2eSSTM_267    (1<<12)	/* 267MB/s 2eSST Xfer Rate */
+#define TSI148_LCSR_ITAT_2eSSTM_320    (2<<12)	/* 320MB/s 2eSST Xfer Rate */
+
+#define TSI148_LCSR_ITAT_2eSSTB        (1<<11)	/* 2eSST Bcast Xfer Protocol */
+#define TSI148_LCSR_ITAT_2eSST         (1<<10)	/* 2eSST Xfer Protocol */
+#define TSI148_LCSR_ITAT_2eVME         (1<<9)	/* 2eVME Xfer Protocol */
+#define TSI148_LCSR_ITAT_MBLT          (1<<8)	/* MBLT Xfer Protocol */
+#define TSI148_LCSR_ITAT_BLT           (1<<7)	/* BLT Xfer Protocol */
+
+#define TSI148_LCSR_ITAT_AS_M          (7<<4)	/* Address Space Mask */
+#define TSI148_LCSR_ITAT_AS_A16        (0<<4)	/* A16 Address Space */
+#define TSI148_LCSR_ITAT_AS_A24        (1<<4)	/* A24 Address Space */
+#define TSI148_LCSR_ITAT_AS_A32        (2<<4)	/* A32 Address Space */
+#define TSI148_LCSR_ITAT_AS_A64        (4<<4)	/* A64 Address Space */
+
+#define TSI148_LCSR_ITAT_SUPR          (1<<3)	/* Supervisor Access */
+#define TSI148_LCSR_ITAT_NPRIV         (1<<2)	/* Non-Priv (User) Access */
+#define TSI148_LCSR_ITAT_PGM           (1<<1)	/* Program Access */
+#define TSI148_LCSR_ITAT_DATA          (1<<0)	/* Data Access */
+
+/*
+ *  GCSR Base Address Lower Address  CRG +$404
+ */
+#define TSI148_LCSR_GBAL_M             (0x7FFFFFF<<5)	/* Mask */
+
+/*
+ *  GCSR Attribute Register CRG + $408
+ */
+#define TSI148_LCSR_GCSRAT_EN          (1<<7)	/* Enable access to GCSR */
+
+#define TSI148_LCSR_GCSRAT_AS_M        (7<<4)	/* Address Space Mask */
+#define TSI148_LCSR_GCSRAT_AS_A16       (0<<4)	/* Address Space 16 */
+#define TSI148_LCSR_GCSRAT_AS_A24       (1<<4)	/* Address Space 24 */
+#define TSI148_LCSR_GCSRAT_AS_A32       (2<<4)	/* Address Space 32 */
+#define TSI148_LCSR_GCSRAT_AS_A64       (4<<4)	/* Address Space 64 */
+
+#define TSI148_LCSR_GCSRAT_SUPR        (1<<3)	/* Sup set -GCSR decoder */
+#define TSI148_LCSR_GCSRAT_NPRIV       (1<<2)	/* Non-Privliged set - CGSR */
+#define TSI148_LCSR_GCSRAT_PGM         (1<<1)	/* Program set - GCSR decoder */
+#define TSI148_LCSR_GCSRAT_DATA        (1<<0)	/* DATA set GCSR decoder */
+
+/*
+ *  CRG Base Address Lower Address  CRG + $410
+ */
+#define TSI148_LCSR_CBAL_M             (0xFFFFF<<12)
+
+/*
+ *  CRG Attribute Register  CRG + $414
+ */
+#define TSI148_LCSR_CRGAT_EN           (1<<7)	/* Enable PRG Access */
+
+#define TSI148_LCSR_CRGAT_AS_M         (7<<4)	/* Address Space */
+#define TSI148_LCSR_CRGAT_AS_A16       (0<<4)	/* Address Space 16 */
+#define TSI148_LCSR_CRGAT_AS_A24       (1<<4)	/* Address Space 24 */
+#define TSI148_LCSR_CRGAT_AS_A32       (2<<4)	/* Address Space 32 */
+#define TSI148_LCSR_CRGAT_AS_A64       (4<<4)	/* Address Space 64 */
+
+#define TSI148_LCSR_CRGAT_SUPR         (1<<3)	/* Supervisor Access */
+#define TSI148_LCSR_CRGAT_NPRIV        (1<<2)	/* Non-Privliged(User) Access */
+#define TSI148_LCSR_CRGAT_PGM          (1<<1)	/* Program Access */
+#define TSI148_LCSR_CRGAT_DATA         (1<<0)	/* Data Access */
+
+/*
+ *  CR/CSR Offset Lower Register  CRG + $41C
+ */
+#define TSI148_LCSR_CROL_M             (0x1FFF<<19)	/* Mask */
+
+/*
+ *  CR/CSR Attribute register  CRG + $420
+ */
+#define TSI148_LCSR_CRAT_EN            (1<<7)	/* Enable access to CR/CSR */
+
+/*
+ *  Location Monitor base address lower register  CRG + $428
+ */
+#define TSI148_LCSR_LMBAL_M            (0x7FFFFFF<<5)	/* Mask */
+
+/*
+ *  Location Monitor Attribute Register  CRG + $42C
+ */
+#define TSI148_LCSR_LMAT_EN            (1<<7)	/* Enable Location Monitor */
+
+#define TSI148_LCSR_LMAT_AS_M          (7<<4)	/* Address Space MASK  */
+#define TSI148_LCSR_LMAT_AS_A16        (0<<4)	/* A16 */
+#define TSI148_LCSR_LMAT_AS_A24        (1<<4)	/* A24 */
+#define TSI148_LCSR_LMAT_AS_A32        (2<<4)	/* A32 */
+#define TSI148_LCSR_LMAT_AS_A64        (4<<4)	/* A64 */
+
+#define TSI148_LCSR_LMAT_SUPR          (1<<3)	/* Supervisor Access */
+#define TSI148_LCSR_LMAT_NPRIV         (1<<2)	/* Non-Priv (User) Access */
+#define TSI148_LCSR_LMAT_PGM           (1<<1)	/* Program Access */
+#define TSI148_LCSR_LMAT_DATA          (1<<0)	/* Data Access  */
+
+/*
+ *  Broadcast Pulse Generator Timer Register  CRG + $438
+ */
+#define TSI148_LCSR_BPGTR_BPGT_M       (0xFFFF<<0)	/* Mask */
+
+/*
+ *  Broadcast Programmable Clock Timer Register  CRG + $43C
+ */
+#define TSI148_LCSR_BPCTR_BPCT_M       (0xFFFFFF<<0)	/* Mask */
+
+/*
+ *  VMEbus Interrupt Control Register           CRG + $43C
+ */
+#define TSI148_LCSR_VICR_CNTS_M        (3<<22)	/* Cntr Source MASK */
+#define TSI148_LCSR_VICR_CNTS_DIS      (1<<22)	/* Cntr Disable */
+#define TSI148_LCSR_VICR_CNTS_IRQ1     (2<<22)	/* IRQ1 to Cntr */
+#define TSI148_LCSR_VICR_CNTS_IRQ2     (3<<22)	/* IRQ2 to Cntr */
+
+#define TSI148_LCSR_VICR_EDGIS_M       (3<<20)	/* Edge interrupt MASK */
+#define TSI148_LCSR_VICR_EDGIS_DIS     (1<<20)	/* Edge interrupt Disable */
+#define TSI148_LCSR_VICR_EDGIS_IRQ1    (2<<20)	/* IRQ1 to Edge */
+#define TSI148_LCSR_VICR_EDGIS_IRQ2    (3<<20)	/* IRQ2 to Edge */
+
+#define TSI148_LCSR_VICR_IRQIF_M       (3<<18)	/* IRQ1* Function MASK */
+#define TSI148_LCSR_VICR_IRQIF_NORM    (1<<18)	/* Normal */
+#define TSI148_LCSR_VICR_IRQIF_PULSE   (2<<18)	/* Pulse Generator */
+#define TSI148_LCSR_VICR_IRQIF_PROG    (3<<18)	/* Programmable Clock */
+#define TSI148_LCSR_VICR_IRQIF_1U      (4<<18)	/* 1us Clock */
+
+#define TSI148_LCSR_VICR_IRQ2F_M       (3<<16)	/* IRQ2* Function MASK */
+#define TSI148_LCSR_VICR_IRQ2F_NORM    (1<<16)	/* Normal */
+#define TSI148_LCSR_VICR_IRQ2F_PULSE   (2<<16)	/* Pulse Generator */
+#define TSI148_LCSR_VICR_IRQ2F_PROG    (3<<16)	/* Programmable Clock */
+#define TSI148_LCSR_VICR_IRQ2F_1U      (4<<16)	/* 1us Clock */
+
+#define TSI148_LCSR_VICR_BIP           (1<<15)	/* Broadcast Interrupt Pulse */
+
+#define TSI148_LCSR_VICR_IRQC          (1<<12)	/* VMEbus IRQ Clear */
+#define TSI148_LCSR_VICR_IRQS          (1<<11)	/* VMEbus IRQ Status */
+
+#define TSI148_LCSR_VICR_IRQL_M        (7<<8)	/* VMEbus SW IRQ Level Mask */
+#define TSI148_LCSR_VICR_IRQL_1        (1<<8)	/* VMEbus SW IRQ Level 1 */
+#define TSI148_LCSR_VICR_IRQL_2        (2<<8)	/* VMEbus SW IRQ Level 2 */
+#define TSI148_LCSR_VICR_IRQL_3        (3<<8)	/* VMEbus SW IRQ Level 3 */
+#define TSI148_LCSR_VICR_IRQL_4        (4<<8)	/* VMEbus SW IRQ Level 4 */
+#define TSI148_LCSR_VICR_IRQL_5        (5<<8)	/* VMEbus SW IRQ Level 5 */
+#define TSI148_LCSR_VICR_IRQL_6        (6<<8)	/* VMEbus SW IRQ Level 6 */
+#define TSI148_LCSR_VICR_IRQL_7        (7<<8)	/* VMEbus SW IRQ Level 7 */
+
+static const int TSI148_LCSR_VICR_IRQL[8] = { 0, TSI148_LCSR_VICR_IRQL_1,
+			TSI148_LCSR_VICR_IRQL_2, TSI148_LCSR_VICR_IRQL_3,
+			TSI148_LCSR_VICR_IRQL_4, TSI148_LCSR_VICR_IRQL_5,
+			TSI148_LCSR_VICR_IRQL_6, TSI148_LCSR_VICR_IRQL_7 };
+
+#define TSI148_LCSR_VICR_STID_M        (0xFF<<0)	/* Status/ID Mask */
+
+/*
+ *  Interrupt Enable Register   CRG + $440
+ */
+#define TSI148_LCSR_INTEN_DMA1EN       (1<<25)	/* DMAC 1 */
+#define TSI148_LCSR_INTEN_DMA0EN       (1<<24)	/* DMAC 0 */
+#define TSI148_LCSR_INTEN_LM3EN        (1<<23)	/* Location Monitor 3 */
+#define TSI148_LCSR_INTEN_LM2EN        (1<<22)	/* Location Monitor 2 */
+#define TSI148_LCSR_INTEN_LM1EN        (1<<21)	/* Location Monitor 1 */
+#define TSI148_LCSR_INTEN_LM0EN        (1<<20)	/* Location Monitor 0 */
+#define TSI148_LCSR_INTEN_MB3EN        (1<<19)	/* Mail Box 3 */
+#define TSI148_LCSR_INTEN_MB2EN        (1<<18)	/* Mail Box 2 */
+#define TSI148_LCSR_INTEN_MB1EN        (1<<17)	/* Mail Box 1 */
+#define TSI148_LCSR_INTEN_MB0EN        (1<<16)	/* Mail Box 0 */
+#define TSI148_LCSR_INTEN_PERREN       (1<<13)	/* PCI/X Error */
+#define TSI148_LCSR_INTEN_VERREN       (1<<12)	/* VMEbus Error */
+#define TSI148_LCSR_INTEN_VIEEN        (1<<11)	/* VMEbus IRQ Edge */
+#define TSI148_LCSR_INTEN_IACKEN       (1<<10)	/* IACK */
+#define TSI148_LCSR_INTEN_SYSFLEN      (1<<9)	/* System Fail */
+#define TSI148_LCSR_INTEN_ACFLEN       (1<<8)	/* AC Fail */
+#define TSI148_LCSR_INTEN_IRQ7EN       (1<<7)	/* IRQ7 */
+#define TSI148_LCSR_INTEN_IRQ6EN       (1<<6)	/* IRQ6 */
+#define TSI148_LCSR_INTEN_IRQ5EN       (1<<5)	/* IRQ5 */
+#define TSI148_LCSR_INTEN_IRQ4EN       (1<<4)	/* IRQ4 */
+#define TSI148_LCSR_INTEN_IRQ3EN       (1<<3)	/* IRQ3 */
+#define TSI148_LCSR_INTEN_IRQ2EN       (1<<2)	/* IRQ2 */
+#define TSI148_LCSR_INTEN_IRQ1EN       (1<<1)	/* IRQ1 */
+
+static const int TSI148_LCSR_INTEN_LMEN[4] = { TSI148_LCSR_INTEN_LM0EN,
+					TSI148_LCSR_INTEN_LM1EN,
+					TSI148_LCSR_INTEN_LM2EN,
+					TSI148_LCSR_INTEN_LM3EN };
+
+static const int TSI148_LCSR_INTEN_IRQEN[7] = { TSI148_LCSR_INTEN_IRQ1EN,
+					TSI148_LCSR_INTEN_IRQ2EN,
+					TSI148_LCSR_INTEN_IRQ3EN,
+					TSI148_LCSR_INTEN_IRQ4EN,
+					TSI148_LCSR_INTEN_IRQ5EN,
+					TSI148_LCSR_INTEN_IRQ6EN,
+					TSI148_LCSR_INTEN_IRQ7EN };
+
+/*
+ *  Interrupt Enable Out Register CRG + $444
+ */
+#define TSI148_LCSR_INTEO_DMA1EO       (1<<25)	/* DMAC 1 */
+#define TSI148_LCSR_INTEO_DMA0EO       (1<<24)	/* DMAC 0 */
+#define TSI148_LCSR_INTEO_LM3EO        (1<<23)	/* Loc Monitor 3 */
+#define TSI148_LCSR_INTEO_LM2EO        (1<<22)	/* Loc Monitor 2 */
+#define TSI148_LCSR_INTEO_LM1EO        (1<<21)	/* Loc Monitor 1 */
+#define TSI148_LCSR_INTEO_LM0EO        (1<<20)	/* Location Monitor 0 */
+#define TSI148_LCSR_INTEO_MB3EO        (1<<19)	/* Mail Box 3 */
+#define TSI148_LCSR_INTEO_MB2EO        (1<<18)	/* Mail Box 2 */
+#define TSI148_LCSR_INTEO_MB1EO        (1<<17)	/* Mail Box 1 */
+#define TSI148_LCSR_INTEO_MB0EO        (1<<16)	/* Mail Box 0 */
+#define TSI148_LCSR_INTEO_PERREO       (1<<13)	/* PCI/X Error */
+#define TSI148_LCSR_INTEO_VERREO       (1<<12)	/* VMEbus Error */
+#define TSI148_LCSR_INTEO_VIEEO        (1<<11)	/* VMEbus IRQ Edge */
+#define TSI148_LCSR_INTEO_IACKEO       (1<<10)	/* IACK */
+#define TSI148_LCSR_INTEO_SYSFLEO      (1<<9)	/* System Fail */
+#define TSI148_LCSR_INTEO_ACFLEO       (1<<8)	/* AC Fail */
+#define TSI148_LCSR_INTEO_IRQ7EO       (1<<7)	/* IRQ7 */
+#define TSI148_LCSR_INTEO_IRQ6EO       (1<<6)	/* IRQ6 */
+#define TSI148_LCSR_INTEO_IRQ5EO       (1<<5)	/* IRQ5 */
+#define TSI148_LCSR_INTEO_IRQ4EO       (1<<4)	/* IRQ4 */
+#define TSI148_LCSR_INTEO_IRQ3EO       (1<<3)	/* IRQ3 */
+#define TSI148_LCSR_INTEO_IRQ2EO       (1<<2)	/* IRQ2 */
+#define TSI148_LCSR_INTEO_IRQ1EO       (1<<1)	/* IRQ1 */
+
+static const int TSI148_LCSR_INTEO_LMEO[4] = { TSI148_LCSR_INTEO_LM0EO,
+					TSI148_LCSR_INTEO_LM1EO,
+					TSI148_LCSR_INTEO_LM2EO,
+					TSI148_LCSR_INTEO_LM3EO };
+
+static const int TSI148_LCSR_INTEO_IRQEO[7] = { TSI148_LCSR_INTEO_IRQ1EO,
+					TSI148_LCSR_INTEO_IRQ2EO,
+					TSI148_LCSR_INTEO_IRQ3EO,
+					TSI148_LCSR_INTEO_IRQ4EO,
+					TSI148_LCSR_INTEO_IRQ5EO,
+					TSI148_LCSR_INTEO_IRQ6EO,
+					TSI148_LCSR_INTEO_IRQ7EO };
+
+/*
+ *  Interrupt Status Register CRG + $448
+ */
+#define TSI148_LCSR_INTS_DMA1S         (1<<25)	/* DMA 1 */
+#define TSI148_LCSR_INTS_DMA0S         (1<<24)	/* DMA 0 */
+#define TSI148_LCSR_INTS_LM3S          (1<<23)	/* Location Monitor 3 */
+#define TSI148_LCSR_INTS_LM2S          (1<<22)	/* Location Monitor 2 */
+#define TSI148_LCSR_INTS_LM1S          (1<<21)	/* Location Monitor 1 */
+#define TSI148_LCSR_INTS_LM0S          (1<<20)	/* Location Monitor 0 */
+#define TSI148_LCSR_INTS_MB3S          (1<<19)	/* Mail Box 3 */
+#define TSI148_LCSR_INTS_MB2S          (1<<18)	/* Mail Box 2 */
+#define TSI148_LCSR_INTS_MB1S          (1<<17)	/* Mail Box 1 */
+#define TSI148_LCSR_INTS_MB0S          (1<<16)	/* Mail Box 0 */
+#define TSI148_LCSR_INTS_PERRS         (1<<13)	/* PCI/X Error */
+#define TSI148_LCSR_INTS_VERRS         (1<<12)	/* VMEbus Error */
+#define TSI148_LCSR_INTS_VIES          (1<<11)	/* VMEbus IRQ Edge */
+#define TSI148_LCSR_INTS_IACKS         (1<<10)	/* IACK */
+#define TSI148_LCSR_INTS_SYSFLS        (1<<9)	/* System Fail */
+#define TSI148_LCSR_INTS_ACFLS         (1<<8)	/* AC Fail */
+#define TSI148_LCSR_INTS_IRQ7S         (1<<7)	/* IRQ7 */
+#define TSI148_LCSR_INTS_IRQ6S         (1<<6)	/* IRQ6 */
+#define TSI148_LCSR_INTS_IRQ5S         (1<<5)	/* IRQ5 */
+#define TSI148_LCSR_INTS_IRQ4S         (1<<4)	/* IRQ4 */
+#define TSI148_LCSR_INTS_IRQ3S         (1<<3)	/* IRQ3 */
+#define TSI148_LCSR_INTS_IRQ2S         (1<<2)	/* IRQ2 */
+#define TSI148_LCSR_INTS_IRQ1S         (1<<1)	/* IRQ1 */
+
+static const int TSI148_LCSR_INTS_LMS[4] = { TSI148_LCSR_INTS_LM0S,
+					TSI148_LCSR_INTS_LM1S,
+					TSI148_LCSR_INTS_LM2S,
+					TSI148_LCSR_INTS_LM3S };
+
+static const int TSI148_LCSR_INTS_MBS[4] = { TSI148_LCSR_INTS_MB0S,
+					TSI148_LCSR_INTS_MB1S,
+					TSI148_LCSR_INTS_MB2S,
+					TSI148_LCSR_INTS_MB3S };
+
+/*
+ *  Interrupt Clear Register CRG + $44C
+ */
+#define TSI148_LCSR_INTC_DMA1C         (1<<25)	/* DMA 1 */
+#define TSI148_LCSR_INTC_DMA0C         (1<<24)	/* DMA 0 */
+#define TSI148_LCSR_INTC_LM3C          (1<<23)	/* Location Monitor 3 */
+#define TSI148_LCSR_INTC_LM2C          (1<<22)	/* Location Monitor 2 */
+#define TSI148_LCSR_INTC_LM1C          (1<<21)	/* Location Monitor 1 */
+#define TSI148_LCSR_INTC_LM0C          (1<<20)	/* Location Monitor 0 */
+#define TSI148_LCSR_INTC_MB3C          (1<<19)	/* Mail Box 3 */
+#define TSI148_LCSR_INTC_MB2C          (1<<18)	/* Mail Box 2 */
+#define TSI148_LCSR_INTC_MB1C          (1<<17)	/* Mail Box 1 */
+#define TSI148_LCSR_INTC_MB0C          (1<<16)	/* Mail Box 0 */
+#define TSI148_LCSR_INTC_PERRC         (1<<13)	/* VMEbus Error */
+#define TSI148_LCSR_INTC_VERRC         (1<<12)	/* VMEbus Access Time-out */
+#define TSI148_LCSR_INTC_VIEC          (1<<11)	/* VMEbus IRQ Edge */
+#define TSI148_LCSR_INTC_IACKC         (1<<10)	/* IACK */
+#define TSI148_LCSR_INTC_SYSFLC        (1<<9)	/* System Fail */
+#define TSI148_LCSR_INTC_ACFLC         (1<<8)	/* AC Fail */
+
+static const int TSI148_LCSR_INTC_LMC[4] = { TSI148_LCSR_INTC_LM0C,
+					TSI148_LCSR_INTC_LM1C,
+					TSI148_LCSR_INTC_LM2C,
+					TSI148_LCSR_INTC_LM3C };
+
+static const int TSI148_LCSR_INTC_MBC[4] = { TSI148_LCSR_INTC_MB0C,
+					TSI148_LCSR_INTC_MB1C,
+					TSI148_LCSR_INTC_MB2C,
+					TSI148_LCSR_INTC_MB3C };
+
+/*
+ *  Interrupt Map Register 1 CRG + $458
+ */
+#define TSI148_LCSR_INTM1_DMA1M_M      (3<<18)	/* DMA 1 */
+#define TSI148_LCSR_INTM1_DMA0M_M      (3<<16)	/* DMA 0 */
+#define TSI148_LCSR_INTM1_LM3M_M       (3<<14)	/* Location Monitor 3 */
+#define TSI148_LCSR_INTM1_LM2M_M       (3<<12)	/* Location Monitor 2 */
+#define TSI148_LCSR_INTM1_LM1M_M       (3<<10)	/* Location Monitor 1 */
+#define TSI148_LCSR_INTM1_LM0M_M       (3<<8)	/* Location Monitor 0 */
+#define TSI148_LCSR_INTM1_MB3M_M       (3<<6)	/* Mail Box 3 */
+#define TSI148_LCSR_INTM1_MB2M_M       (3<<4)	/* Mail Box 2 */
+#define TSI148_LCSR_INTM1_MB1M_M       (3<<2)	/* Mail Box 1 */
+#define TSI148_LCSR_INTM1_MB0M_M       (3<<0)	/* Mail Box 0 */
+
+/*
+ *  Interrupt Map Register 2 CRG + $45C
+ */
+#define TSI148_LCSR_INTM2_PERRM_M      (3<<26)	/* PCI Bus Error */
+#define TSI148_LCSR_INTM2_VERRM_M      (3<<24)	/* VMEbus Error */
+#define TSI148_LCSR_INTM2_VIEM_M       (3<<22)	/* VMEbus IRQ Edge */
+#define TSI148_LCSR_INTM2_IACKM_M      (3<<20)	/* IACK */
+#define TSI148_LCSR_INTM2_SYSFLM_M     (3<<18)	/* System Fail */
+#define TSI148_LCSR_INTM2_ACFLM_M      (3<<16)	/* AC Fail */
+#define TSI148_LCSR_INTM2_IRQ7M_M      (3<<14)	/* IRQ7 */
+#define TSI148_LCSR_INTM2_IRQ6M_M      (3<<12)	/* IRQ6 */
+#define TSI148_LCSR_INTM2_IRQ5M_M      (3<<10)	/* IRQ5 */
+#define TSI148_LCSR_INTM2_IRQ4M_M      (3<<8)	/* IRQ4 */
+#define TSI148_LCSR_INTM2_IRQ3M_M      (3<<6)	/* IRQ3 */
+#define TSI148_LCSR_INTM2_IRQ2M_M      (3<<4)	/* IRQ2 */
+#define TSI148_LCSR_INTM2_IRQ1M_M      (3<<2)	/* IRQ1 */
+
+/*
+ *  DMA Control (0-1) Registers CRG + $500
+ */
+#define TSI148_LCSR_DCTL_ABT           (1<<27)	/* Abort */
+#define TSI148_LCSR_DCTL_PAU           (1<<26)	/* Pause */
+#define TSI148_LCSR_DCTL_DGO           (1<<25)	/* DMA Go */
+
+#define TSI148_LCSR_DCTL_MOD           (1<<23)	/* Mode */
+
+#define TSI148_LCSR_DCTL_VBKS_M        (7<<12)	/* VMEbus block Size MASK */
+#define TSI148_LCSR_DCTL_VBKS_32       (0<<12)	/* VMEbus block Size 32 */
+#define TSI148_LCSR_DCTL_VBKS_64       (1<<12)	/* VMEbus block Size 64 */
+#define TSI148_LCSR_DCTL_VBKS_128      (2<<12)	/* VMEbus block Size 128 */
+#define TSI148_LCSR_DCTL_VBKS_256      (3<<12)	/* VMEbus block Size 256 */
+#define TSI148_LCSR_DCTL_VBKS_512      (4<<12)	/* VMEbus block Size 512 */
+#define TSI148_LCSR_DCTL_VBKS_1024     (5<<12)	/* VMEbus block Size 1024 */
+#define TSI148_LCSR_DCTL_VBKS_2048     (6<<12)	/* VMEbus block Size 2048 */
+#define TSI148_LCSR_DCTL_VBKS_4096     (7<<12)	/* VMEbus block Size 4096 */
+
+#define TSI148_LCSR_DCTL_VBOT_M        (7<<8)	/* VMEbus back-off MASK */
+#define TSI148_LCSR_DCTL_VBOT_0        (0<<8)	/* VMEbus back-off  0us */
+#define TSI148_LCSR_DCTL_VBOT_1        (1<<8)	/* VMEbus back-off 1us */
+#define TSI148_LCSR_DCTL_VBOT_2        (2<<8)	/* VMEbus back-off 2us */
+#define TSI148_LCSR_DCTL_VBOT_4        (3<<8)	/* VMEbus back-off 4us */
+#define TSI148_LCSR_DCTL_VBOT_8        (4<<8)	/* VMEbus back-off 8us */
+#define TSI148_LCSR_DCTL_VBOT_16       (5<<8)	/* VMEbus back-off 16us */
+#define TSI148_LCSR_DCTL_VBOT_32       (6<<8)	/* VMEbus back-off 32us */
+#define TSI148_LCSR_DCTL_VBOT_64       (7<<8)	/* VMEbus back-off 64us */
+
+#define TSI148_LCSR_DCTL_PBKS_M        (7<<4)	/* PCI block size MASK */
+#define TSI148_LCSR_DCTL_PBKS_32       (0<<4)	/* PCI block size 32 bytes */
+#define TSI148_LCSR_DCTL_PBKS_64       (1<<4)	/* PCI block size 64 bytes */
+#define TSI148_LCSR_DCTL_PBKS_128      (2<<4)	/* PCI block size 128 bytes */
+#define TSI148_LCSR_DCTL_PBKS_256      (3<<4)	/* PCI block size 256 bytes */
+#define TSI148_LCSR_DCTL_PBKS_512      (4<<4)	/* PCI block size 512 bytes */
+#define TSI148_LCSR_DCTL_PBKS_1024     (5<<4)	/* PCI block size 1024 bytes */
+#define TSI148_LCSR_DCTL_PBKS_2048     (6<<4)	/* PCI block size 2048 bytes */
+#define TSI148_LCSR_DCTL_PBKS_4096     (7<<4)	/* PCI block size 4096 bytes */
+
+#define TSI148_LCSR_DCTL_PBOT_M        (7<<0)	/* PCI back off MASK */
+#define TSI148_LCSR_DCTL_PBOT_0        (0<<0)	/* PCI back off 0us */
+#define TSI148_LCSR_DCTL_PBOT_1        (1<<0)	/* PCI back off 1us */
+#define TSI148_LCSR_DCTL_PBOT_2        (2<<0)	/* PCI back off 2us */
+#define TSI148_LCSR_DCTL_PBOT_4        (3<<0)	/* PCI back off 3us */
+#define TSI148_LCSR_DCTL_PBOT_8        (4<<0)	/* PCI back off 4us */
+#define TSI148_LCSR_DCTL_PBOT_16       (5<<0)	/* PCI back off 8us */
+#define TSI148_LCSR_DCTL_PBOT_32       (6<<0)	/* PCI back off 16us */
+#define TSI148_LCSR_DCTL_PBOT_64       (7<<0)	/* PCI back off 32us */
+
+/*
+ *  DMA Status Registers (0-1)  CRG + $504
+ */
+#define TSI148_LCSR_DSTA_SMA           (1<<31)	/* PCI Signalled Master Abt */
+#define TSI148_LCSR_DSTA_RTA           (1<<30)	/* PCI Received Target Abt */
+#define TSI148_LCSR_DSTA_MRC           (1<<29)	/* PCI Max Retry Count */
+#define TSI148_LCSR_DSTA_VBE           (1<<28)	/* VMEbus error */
+#define TSI148_LCSR_DSTA_ABT           (1<<27)	/* Abort */
+#define TSI148_LCSR_DSTA_PAU           (1<<26)	/* Pause */
+#define TSI148_LCSR_DSTA_DON           (1<<25)	/* Done */
+#define TSI148_LCSR_DSTA_BSY           (1<<24)	/* Busy */
+
+/*
+ *  DMA Current Link Address Lower (0-1)
+ */
+#define TSI148_LCSR_DCLAL_M            (0x3FFFFFF<<6)	/* Mask */
+
+/*
+ *  DMA Source Attribute (0-1) Reg
+ */
+#define TSI148_LCSR_DSAT_TYP_M         (3<<28)	/* Source Bus Type */
+#define TSI148_LCSR_DSAT_TYP_PCI       (0<<28)	/* PCI Bus */
+#define TSI148_LCSR_DSAT_TYP_VME       (1<<28)	/* VMEbus */
+#define TSI148_LCSR_DSAT_TYP_PAT       (2<<28)	/* Data Pattern */
+
+#define TSI148_LCSR_DSAT_PSZ           (1<<25)	/* Pattern Size */
+#define TSI148_LCSR_DSAT_NIN           (1<<24)	/* No Increment */
+
+#define TSI148_LCSR_DSAT_2eSSTM_M      (3<<11)	/* 2eSST Trans Rate Mask */
+#define TSI148_LCSR_DSAT_2eSSTM_160    (0<<11)	/* 160 MB/s */
+#define TSI148_LCSR_DSAT_2eSSTM_267    (1<<11)	/* 267 MB/s */
+#define TSI148_LCSR_DSAT_2eSSTM_320    (2<<11)	/* 320 MB/s */
+
+#define TSI148_LCSR_DSAT_TM_M          (7<<8)	/* Bus Transfer Protocol Mask */
+#define TSI148_LCSR_DSAT_TM_SCT        (0<<8)	/* SCT */
+#define TSI148_LCSR_DSAT_TM_BLT        (1<<8)	/* BLT */
+#define TSI148_LCSR_DSAT_TM_MBLT       (2<<8)	/* MBLT */
+#define TSI148_LCSR_DSAT_TM_2eVME      (3<<8)	/* 2eVME */
+#define TSI148_LCSR_DSAT_TM_2eSST      (4<<8)	/* 2eSST */
+#define TSI148_LCSR_DSAT_TM_2eSSTB     (5<<8)	/* 2eSST Broadcast */
+
+#define TSI148_LCSR_DSAT_DBW_M         (3<<6)	/* Max Data Width MASK */
+#define TSI148_LCSR_DSAT_DBW_16        (0<<6)	/* 16 Bits */
+#define TSI148_LCSR_DSAT_DBW_32        (1<<6)	/* 32 Bits */
+
+#define TSI148_LCSR_DSAT_SUP           (1<<5)	/* Supervisory Mode */
+#define TSI148_LCSR_DSAT_PGM           (1<<4)	/* Program Mode */
+
+#define TSI148_LCSR_DSAT_AMODE_M       (0xf<<0)	/* Address Space Mask */
+#define TSI148_LCSR_DSAT_AMODE_A16     (0<<0)	/* A16 */
+#define TSI148_LCSR_DSAT_AMODE_A24     (1<<0)	/* A24 */
+#define TSI148_LCSR_DSAT_AMODE_A32     (2<<0)	/* A32 */
+#define TSI148_LCSR_DSAT_AMODE_A64     (4<<0)	/* A64 */
+#define TSI148_LCSR_DSAT_AMODE_CRCSR   (5<<0)	/* CR/CSR */
+#define TSI148_LCSR_DSAT_AMODE_USER1   (8<<0)	/* User1 */
+#define TSI148_LCSR_DSAT_AMODE_USER2   (9<<0)	/* User2 */
+#define TSI148_LCSR_DSAT_AMODE_USER3   (0xa<<0)	/* User3 */
+#define TSI148_LCSR_DSAT_AMODE_USER4   (0xb<<0)	/* User4 */
+
+/*
+ *  DMA Destination Attribute Registers (0-1)
+ */
+#define TSI148_LCSR_DDAT_TYP_PCI       (0<<28)	/* Destination PCI Bus  */
+#define TSI148_LCSR_DDAT_TYP_VME       (1<<28)	/* Destination VMEbus */
+
+#define TSI148_LCSR_DDAT_2eSSTM_M      (3<<11)	/* 2eSST Transfer Rate Mask */
+#define TSI148_LCSR_DDAT_2eSSTM_160    (0<<11)	/* 160 MB/s */
+#define TSI148_LCSR_DDAT_2eSSTM_267    (1<<11)	/* 267 MB/s */
+#define TSI148_LCSR_DDAT_2eSSTM_320    (2<<11)	/* 320 MB/s */
+
+#define TSI148_LCSR_DDAT_TM_M          (7<<8)	/* Bus Transfer Protocol Mask */
+#define TSI148_LCSR_DDAT_TM_SCT        (0<<8)	/* SCT */
+#define TSI148_LCSR_DDAT_TM_BLT        (1<<8)	/* BLT */
+#define TSI148_LCSR_DDAT_TM_MBLT       (2<<8)	/* MBLT */
+#define TSI148_LCSR_DDAT_TM_2eVME      (3<<8)	/* 2eVME */
+#define TSI148_LCSR_DDAT_TM_2eSST      (4<<8)	/* 2eSST */
+#define TSI148_LCSR_DDAT_TM_2eSSTB     (5<<8)	/* 2eSST Broadcast */
+
+#define TSI148_LCSR_DDAT_DBW_M         (3<<6)	/* Max Data Width MASK */
+#define TSI148_LCSR_DDAT_DBW_16        (0<<6)	/* 16 Bits */
+#define TSI148_LCSR_DDAT_DBW_32        (1<<6)	/* 32 Bits */
+
+#define TSI148_LCSR_DDAT_SUP           (1<<5)	/* Supervisory/User Access */
+#define TSI148_LCSR_DDAT_PGM           (1<<4)	/* Program/Data Access */
+
+#define TSI148_LCSR_DDAT_AMODE_M       (0xf<<0)	/* Address Space Mask */
+#define TSI148_LCSR_DDAT_AMODE_A16      (0<<0)	/* A16 */
+#define TSI148_LCSR_DDAT_AMODE_A24      (1<<0)	/* A24 */
+#define TSI148_LCSR_DDAT_AMODE_A32      (2<<0)	/* A32 */
+#define TSI148_LCSR_DDAT_AMODE_A64      (4<<0)	/* A64 */
+#define TSI148_LCSR_DDAT_AMODE_CRCSR   (5<<0)	/* CRC/SR */
+#define TSI148_LCSR_DDAT_AMODE_USER1   (8<<0)	/* User1 */
+#define TSI148_LCSR_DDAT_AMODE_USER2   (9<<0)	/* User2 */
+#define TSI148_LCSR_DDAT_AMODE_USER3   (0xa<<0)	/* User3 */
+#define TSI148_LCSR_DDAT_AMODE_USER4   (0xb<<0)	/* User4 */
+
+/*
+ *  DMA Next Link Address Lower
+ */
+#define TSI148_LCSR_DNLAL_DNLAL_M      (0x3FFFFFF<<6)	/* Address Mask */
+#define TSI148_LCSR_DNLAL_LLA          (1<<0)  /* Last Link Address Indicator */
+
+/*
+ *  DMA 2eSST Broadcast Select
+ */
+#define TSI148_LCSR_DBS_M              (0x1FFFFF<<0)	/* Mask */
+
+/*
+ *  GCSR Register Group
+ */
+
+/*
+ *  GCSR Control and Status Register  CRG + $604
+ */
+#define TSI148_GCSR_GCTRL_LRST         (1<<15)	/* Local Reset */
+#define TSI148_GCSR_GCTRL_SFAILEN      (1<<14)	/* System Fail enable */
+#define TSI148_GCSR_GCTRL_BDFAILS      (1<<13)	/* Board Fail Status */
+#define TSI148_GCSR_GCTRL_SCON         (1<<12)	/* System Copntroller */
+#define TSI148_GCSR_GCTRL_MEN          (1<<11)	/* Module Enable (READY) */
+
+#define TSI148_GCSR_GCTRL_LMI3S        (1<<7)	/* Loc Monitor 3 Int Status */
+#define TSI148_GCSR_GCTRL_LMI2S        (1<<6)	/* Loc Monitor 2 Int Status */
+#define TSI148_GCSR_GCTRL_LMI1S        (1<<5)	/* Loc Monitor 1 Int Status */
+#define TSI148_GCSR_GCTRL_LMI0S        (1<<4)	/* Loc Monitor 0 Int Status */
+#define TSI148_GCSR_GCTRL_MBI3S        (1<<3)	/* Mail box 3 Int Status */
+#define TSI148_GCSR_GCTRL_MBI2S        (1<<2)	/* Mail box 2 Int Status */
+#define TSI148_GCSR_GCTRL_MBI1S        (1<<1)	/* Mail box 1 Int Status */
+#define TSI148_GCSR_GCTRL_MBI0S        (1<<0)	/* Mail box 0 Int Status */
+
+#define TSI148_GCSR_GAP                (1<<5)	/* Geographic Addr Parity */
+#define TSI148_GCSR_GA_M               (0x1F<<0)  /* Geographic Address Mask */
+
+/*
+ *  CR/CSR Register Group
+ */
+
+/*
+ *  CR/CSR Bit Clear Register CRG + $FF4
+ */
+#define TSI148_CRCSR_CSRBCR_LRSTC      (1<<7)	/* Local Reset Clear */
+#define TSI148_CRCSR_CSRBCR_SFAILC     (1<<6)	/* System Fail Enable Clear */
+#define TSI148_CRCSR_CSRBCR_BDFAILS    (1<<5)	/* Board Fail Status */
+#define TSI148_CRCSR_CSRBCR_MENC       (1<<4)	/* Module Enable Clear */
+#define TSI148_CRCSR_CSRBCR_BERRSC     (1<<3)	/* Bus Error Status Clear */
+
+/*
+ *  CR/CSR Bit Set Register CRG+$FF8
+ */
+#define TSI148_CRCSR_CSRBSR_LISTS      (1<<7)	/* Local Reset Clear */
+#define TSI148_CRCSR_CSRBSR_SFAILS     (1<<6)	/* System Fail Enable Clear */
+#define TSI148_CRCSR_CSRBSR_BDFAILS    (1<<5)	/* Board Fail Status */
+#define TSI148_CRCSR_CSRBSR_MENS       (1<<4)	/* Module Enable Clear */
+#define TSI148_CRCSR_CSRBSR_BERRS      (1<<3)	/* Bus Error Status Clear */
+
+/*
+ *  CR/CSR Base Address Register CRG + FFC
+ */
+#define TSI148_CRCSR_CBAR_M            (0x1F<<3)	/* Mask */
+
+#endif				/* TSI148_H */
diff --git a/drivers/staging/vme_user/vme_user.c b/drivers/staging/vme_user/vme_user.c
index 859af797630c..4e533c0bfe6d 100644
--- a/drivers/staging/vme_user/vme_user.c
+++ b/drivers/staging/vme_user/vme_user.c
@@ -33,8 +33,8 @@
 
 #include <linux/io.h>
 #include <linux/uaccess.h>
-#include <linux/vme.h>
 
+#include "vme.h"
 #include "vme_user.h"
 
 static const char driver_name[] = "vme_user";
diff --git a/drivers/vme/Kconfig b/drivers/vme/Kconfig
deleted file mode 100644
index 26feabba19d2..000000000000
--- a/drivers/vme/Kconfig
+++ /dev/null
@@ -1,16 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# VME configuration.
-#
-
-menuconfig VME_BUS
-	bool "VME bridge support"
-	depends on PCI
-	help
-	  If you say Y here you get support for the VME bridge Framework.
-
-if VME_BUS
-
-source "drivers/vme/bridges/Kconfig"
-
-endif # VME
diff --git a/drivers/vme/Makefile b/drivers/vme/Makefile
deleted file mode 100644
index 2dfb929a23de..000000000000
--- a/drivers/vme/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# Makefile for the VME bridge device drivers.
-#
-obj-$(CONFIG_VME_BUS)		+= vme.o
-
-obj-y				+= bridges/
diff --git a/drivers/vme/bridges/Kconfig b/drivers/vme/bridges/Kconfig
deleted file mode 100644
index 9493b22b5276..000000000000
--- a/drivers/vme/bridges/Kconfig
+++ /dev/null
@@ -1,17 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-comment "VME Bridge Drivers"
-
-config VME_TSI148
-	tristate "Tempe"
-	depends on HAS_DMA
-	help
-	 If you say Y here you get support for the Tundra TSI148 VME bridge
-	 chip.
-
-config VME_FAKE
-	tristate "Fake"
-	help
-	 If you say Y here you get support for the fake VME bridge. This
-	 provides a virtualised VME Bus for devices with no VME bridge. This
-	 is mainly useful for VME development (in the absence of VME
-	 hardware).
diff --git a/drivers/vme/bridges/Makefile b/drivers/vme/bridges/Makefile
deleted file mode 100644
index 043f9cd7a510..000000000000
--- a/drivers/vme/bridges/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_VME_TSI148)	+= vme_tsi148.o
-obj-$(CONFIG_VME_FAKE)		+= vme_fake.o
diff --git a/drivers/vme/bridges/vme_fake.c b/drivers/vme/bridges/vme_fake.c
deleted file mode 100644
index 6a1bc284f297..000000000000
--- a/drivers/vme/bridges/vme_fake.c
+++ /dev/null
@@ -1,1305 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Fake VME bridge support.
- *
- * This drive provides a fake VME bridge chip, this enables debugging of the
- * VME framework in the absence of a VME system.
- *
- * This driver has to do a number of things in software that would be driven
- * by hardware if it was available, it will also result in extra overhead at
- * times when compared with driving actual hardware.
- *
- * Author: Martyn Welch <martyn@welches.me.uk>
- * Copyright (c) 2014 Martyn Welch
- *
- * Based on vme_tsi148.c:
- *
- * Author: Martyn Welch <martyn.welch@ge.com>
- * Copyright 2008 GE Intelligent Platforms Embedded Systems, Inc.
- *
- * Based on work by Tom Armistead and Ajit Prem
- * Copyright 2004 Motorola Inc.
- */
-
-#include <linux/device.h>
-#include <linux/errno.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/types.h>
-#include <linux/vme.h>
-
-#include "../vme_bridge.h"
-
-/*
- *  Define the number of each that the fake driver supports.
- */
-#define FAKE_MAX_MASTER		8	/* Max Master Windows */
-#define FAKE_MAX_SLAVE		8	/* Max Slave Windows */
-
-/* Structures to hold information normally held in device registers */
-struct fake_slave_window {
-	int enabled;
-	unsigned long long vme_base;
-	unsigned long long size;
-	void *buf_base;
-	u32 aspace;
-	u32 cycle;
-};
-
-struct fake_master_window {
-	int enabled;
-	unsigned long long vme_base;
-	unsigned long long size;
-	u32 aspace;
-	u32 cycle;
-	u32 dwidth;
-};
-
-/* Structure used to hold driver specific information */
-struct fake_driver {
-	struct vme_bridge *parent;
-	struct fake_slave_window slaves[FAKE_MAX_SLAVE];
-	struct fake_master_window masters[FAKE_MAX_MASTER];
-	u32 lm_enabled;
-	unsigned long long lm_base;
-	u32 lm_aspace;
-	u32 lm_cycle;
-	void (*lm_callback[4])(void *);
-	void *lm_data[4];
-	struct tasklet_struct int_tasklet;
-	int int_level;
-	int int_statid;
-	void *crcsr_kernel;
-	dma_addr_t crcsr_bus;
-	/* Only one VME interrupt can be generated at a time, provide locking */
-	struct mutex vme_int;
-};
-
-/* Module parameter */
-static int geoid;
-
-static const char driver_name[] = "vme_fake";
-
-static struct vme_bridge *exit_pointer;
-
-static struct device *vme_root;
-
-/*
- * Calling VME bus interrupt callback if provided.
- */
-static void fake_VIRQ_tasklet(unsigned long data)
-{
-	struct vme_bridge *fake_bridge;
-	struct fake_driver *bridge;
-
-	fake_bridge = (struct vme_bridge *) data;
-	bridge = fake_bridge->driver_priv;
-
-	vme_irq_handler(fake_bridge, bridge->int_level, bridge->int_statid);
-}
-
-/*
- * Configure VME interrupt
- */
-static void fake_irq_set(struct vme_bridge *fake_bridge, int level,
-		int state, int sync)
-{
-	/* Nothing to do */
-}
-
-static void *fake_pci_to_ptr(dma_addr_t addr)
-{
-	return (void *)(uintptr_t)addr;
-}
-
-static dma_addr_t fake_ptr_to_pci(void *addr)
-{
-	return (dma_addr_t)(uintptr_t)addr;
-}
-
-/*
- * Generate a VME bus interrupt at the requested level & vector. Wait for
- * interrupt to be acked.
- */
-static int fake_irq_generate(struct vme_bridge *fake_bridge, int level,
-		int statid)
-{
-	struct fake_driver *bridge;
-
-	bridge = fake_bridge->driver_priv;
-
-	mutex_lock(&bridge->vme_int);
-
-	bridge->int_level = level;
-
-	bridge->int_statid = statid;
-
-	/*
-	 * Schedule tasklet to run VME handler to emulate normal VME interrupt
-	 * handler behaviour.
-	 */
-	tasklet_schedule(&bridge->int_tasklet);
-
-	mutex_unlock(&bridge->vme_int);
-
-	return 0;
-}
-
-/*
- * Initialize a slave window with the requested attributes.
- */
-static int fake_slave_set(struct vme_slave_resource *image, int enabled,
-		unsigned long long vme_base, unsigned long long size,
-		dma_addr_t buf_base, u32 aspace, u32 cycle)
-{
-	unsigned int i, granularity = 0;
-	unsigned long long vme_bound;
-	struct vme_bridge *fake_bridge;
-	struct fake_driver *bridge;
-
-	fake_bridge = image->parent;
-	bridge = fake_bridge->driver_priv;
-
-	i = image->number;
-
-	switch (aspace) {
-	case VME_A16:
-		granularity = 0x10;
-		break;
-	case VME_A24:
-		granularity = 0x1000;
-		break;
-	case VME_A32:
-		granularity = 0x10000;
-		break;
-	case VME_A64:
-		granularity = 0x10000;
-		break;
-	case VME_CRCSR:
-	case VME_USER1:
-	case VME_USER2:
-	case VME_USER3:
-	case VME_USER4:
-	default:
-		pr_err("Invalid address space\n");
-		return -EINVAL;
-	}
-
-	/*
-	 * Bound address is a valid address for the window, adjust
-	 * accordingly
-	 */
-	vme_bound = vme_base + size - granularity;
-
-	if (vme_base & (granularity - 1)) {
-		pr_err("Invalid VME base alignment\n");
-		return -EINVAL;
-	}
-	if (vme_bound & (granularity - 1)) {
-		pr_err("Invalid VME bound alignment\n");
-		return -EINVAL;
-	}
-
-	mutex_lock(&image->mtx);
-
-	bridge->slaves[i].enabled = enabled;
-	bridge->slaves[i].vme_base = vme_base;
-	bridge->slaves[i].size = size;
-	bridge->slaves[i].buf_base = fake_pci_to_ptr(buf_base);
-	bridge->slaves[i].aspace = aspace;
-	bridge->slaves[i].cycle = cycle;
-
-	mutex_unlock(&image->mtx);
-
-	return 0;
-}
-
-/*
- * Get slave window configuration.
- */
-static int fake_slave_get(struct vme_slave_resource *image, int *enabled,
-		unsigned long long *vme_base, unsigned long long *size,
-		dma_addr_t *buf_base, u32 *aspace, u32 *cycle)
-{
-	unsigned int i;
-	struct fake_driver *bridge;
-
-	bridge = image->parent->driver_priv;
-
-	i = image->number;
-
-	mutex_lock(&image->mtx);
-
-	*enabled = bridge->slaves[i].enabled;
-	*vme_base = bridge->slaves[i].vme_base;
-	*size = bridge->slaves[i].size;
-	*buf_base = fake_ptr_to_pci(bridge->slaves[i].buf_base);
-	*aspace = bridge->slaves[i].aspace;
-	*cycle = bridge->slaves[i].cycle;
-
-	mutex_unlock(&image->mtx);
-
-	return 0;
-}
-
-/*
- * Set the attributes of an outbound window.
- */
-static int fake_master_set(struct vme_master_resource *image, int enabled,
-		unsigned long long vme_base, unsigned long long size,
-		u32 aspace, u32 cycle, u32 dwidth)
-{
-	int retval = 0;
-	unsigned int i;
-	struct vme_bridge *fake_bridge;
-	struct fake_driver *bridge;
-
-	fake_bridge = image->parent;
-
-	bridge = fake_bridge->driver_priv;
-
-	/* Verify input data */
-	if (vme_base & 0xFFFF) {
-		pr_err("Invalid VME Window alignment\n");
-		retval = -EINVAL;
-		goto err_window;
-	}
-
-	if (size & 0xFFFF) {
-		pr_err("Invalid size alignment\n");
-		retval = -EINVAL;
-		goto err_window;
-	}
-
-	if ((size == 0) && (enabled != 0)) {
-		pr_err("Size must be non-zero for enabled windows\n");
-		retval = -EINVAL;
-		goto err_window;
-	}
-
-	/* Setup data width */
-	switch (dwidth) {
-	case VME_D8:
-	case VME_D16:
-	case VME_D32:
-		break;
-	default:
-		pr_err("Invalid data width\n");
-		retval = -EINVAL;
-		goto err_dwidth;
-	}
-
-	/* Setup address space */
-	switch (aspace) {
-	case VME_A16:
-	case VME_A24:
-	case VME_A32:
-	case VME_A64:
-	case VME_CRCSR:
-	case VME_USER1:
-	case VME_USER2:
-	case VME_USER3:
-	case VME_USER4:
-		break;
-	default:
-		pr_err("Invalid address space\n");
-		retval = -EINVAL;
-		goto err_aspace;
-	}
-
-	spin_lock(&image->lock);
-
-	i = image->number;
-
-	bridge->masters[i].enabled = enabled;
-	bridge->masters[i].vme_base = vme_base;
-	bridge->masters[i].size = size;
-	bridge->masters[i].aspace = aspace;
-	bridge->masters[i].cycle = cycle;
-	bridge->masters[i].dwidth = dwidth;
-
-	spin_unlock(&image->lock);
-
-	return 0;
-
-err_aspace:
-err_dwidth:
-err_window:
-	return retval;
-
-}
-
-/*
- * Set the attributes of an outbound window.
- */
-static int __fake_master_get(struct vme_master_resource *image, int *enabled,
-		unsigned long long *vme_base, unsigned long long *size,
-		u32 *aspace, u32 *cycle, u32 *dwidth)
-{
-	unsigned int i;
-	struct fake_driver *bridge;
-
-	bridge = image->parent->driver_priv;
-
-	i = image->number;
-
-	*enabled = bridge->masters[i].enabled;
-	*vme_base = bridge->masters[i].vme_base;
-	*size = bridge->masters[i].size;
-	*aspace = bridge->masters[i].aspace;
-	*cycle = bridge->masters[i].cycle;
-	*dwidth = bridge->masters[i].dwidth;
-
-	return 0;
-}
-
-
-static int fake_master_get(struct vme_master_resource *image, int *enabled,
-		unsigned long long *vme_base, unsigned long long *size,
-		u32 *aspace, u32 *cycle, u32 *dwidth)
-{
-	int retval;
-
-	spin_lock(&image->lock);
-
-	retval = __fake_master_get(image, enabled, vme_base, size, aspace,
-			cycle, dwidth);
-
-	spin_unlock(&image->lock);
-
-	return retval;
-}
-
-
-static void fake_lm_check(struct fake_driver *bridge, unsigned long long addr,
-			  u32 aspace, u32 cycle)
-{
-	struct vme_bridge *fake_bridge;
-	unsigned long long lm_base;
-	u32 lm_aspace, lm_cycle;
-	int i;
-	struct vme_lm_resource *lm;
-	struct list_head *pos = NULL, *n;
-
-	/* Get vme_bridge */
-	fake_bridge = bridge->parent;
-
-	/* Loop through each location monitor resource */
-	list_for_each_safe(pos, n, &fake_bridge->lm_resources) {
-		lm = list_entry(pos, struct vme_lm_resource, list);
-
-		/* If disabled, we're done */
-		if (bridge->lm_enabled == 0)
-			return;
-
-		lm_base = bridge->lm_base;
-		lm_aspace = bridge->lm_aspace;
-		lm_cycle = bridge->lm_cycle;
-
-		/* First make sure that the cycle and address space match */
-		if ((lm_aspace == aspace) && (lm_cycle == cycle)) {
-			for (i = 0; i < lm->monitors; i++) {
-				/* Each location monitor covers 8 bytes */
-				if (((lm_base + (8 * i)) <= addr) &&
-				    ((lm_base + (8 * i) + 8) > addr)) {
-					if (bridge->lm_callback[i])
-						bridge->lm_callback[i](
-							bridge->lm_data[i]);
-				}
-			}
-		}
-	}
-}
-
-static noinline_for_stack u8 fake_vmeread8(struct fake_driver *bridge,
-					   unsigned long long addr,
-					   u32 aspace, u32 cycle)
-{
-	u8 retval = 0xff;
-	int i;
-	unsigned long long start, end, offset;
-	u8 *loc;
-
-	for (i = 0; i < FAKE_MAX_SLAVE; i++) {
-		start = bridge->slaves[i].vme_base;
-		end = bridge->slaves[i].vme_base + bridge->slaves[i].size;
-
-		if (aspace != bridge->slaves[i].aspace)
-			continue;
-
-		if (cycle != bridge->slaves[i].cycle)
-			continue;
-
-		if ((addr >= start) && (addr < end)) {
-			offset = addr - bridge->slaves[i].vme_base;
-			loc = (u8 *)(bridge->slaves[i].buf_base + offset);
-			retval = *loc;
-
-			break;
-		}
-	}
-
-	fake_lm_check(bridge, addr, aspace, cycle);
-
-	return retval;
-}
-
-static noinline_for_stack u16 fake_vmeread16(struct fake_driver *bridge,
-					     unsigned long long addr,
-					     u32 aspace, u32 cycle)
-{
-	u16 retval = 0xffff;
-	int i;
-	unsigned long long start, end, offset;
-	u16 *loc;
-
-	for (i = 0; i < FAKE_MAX_SLAVE; i++) {
-		if (aspace != bridge->slaves[i].aspace)
-			continue;
-
-		if (cycle != bridge->slaves[i].cycle)
-			continue;
-
-		start = bridge->slaves[i].vme_base;
-		end = bridge->slaves[i].vme_base + bridge->slaves[i].size;
-
-		if ((addr >= start) && ((addr + 1) < end)) {
-			offset = addr - bridge->slaves[i].vme_base;
-			loc = (u16 *)(bridge->slaves[i].buf_base + offset);
-			retval = *loc;
-
-			break;
-		}
-	}
-
-	fake_lm_check(bridge, addr, aspace, cycle);
-
-	return retval;
-}
-
-static noinline_for_stack u32 fake_vmeread32(struct fake_driver *bridge,
-					     unsigned long long addr,
-					     u32 aspace, u32 cycle)
-{
-	u32 retval = 0xffffffff;
-	int i;
-	unsigned long long start, end, offset;
-	u32 *loc;
-
-	for (i = 0; i < FAKE_MAX_SLAVE; i++) {
-		if (aspace != bridge->slaves[i].aspace)
-			continue;
-
-		if (cycle != bridge->slaves[i].cycle)
-			continue;
-
-		start = bridge->slaves[i].vme_base;
-		end = bridge->slaves[i].vme_base + bridge->slaves[i].size;
-
-		if ((addr >= start) && ((addr + 3) < end)) {
-			offset = addr - bridge->slaves[i].vme_base;
-			loc = (u32 *)(bridge->slaves[i].buf_base + offset);
-			retval = *loc;
-
-			break;
-		}
-	}
-
-	fake_lm_check(bridge, addr, aspace, cycle);
-
-	return retval;
-}
-
-static ssize_t fake_master_read(struct vme_master_resource *image, void *buf,
-		size_t count, loff_t offset)
-{
-	int retval;
-	u32 aspace, cycle, dwidth;
-	struct vme_bridge *fake_bridge;
-	struct fake_driver *priv;
-	int i;
-	unsigned long long addr;
-	unsigned int done = 0;
-	unsigned int count32;
-
-	fake_bridge = image->parent;
-
-	priv = fake_bridge->driver_priv;
-
-	i = image->number;
-
-	addr = (unsigned long long)priv->masters[i].vme_base + offset;
-	aspace = priv->masters[i].aspace;
-	cycle = priv->masters[i].cycle;
-	dwidth = priv->masters[i].dwidth;
-
-	spin_lock(&image->lock);
-
-	/* The following code handles VME address alignment. We cannot use
-	 * memcpy_xxx here because it may cut data transfers in to 8-bit
-	 * cycles when D16 or D32 cycles are required on the VME bus.
-	 * On the other hand, the bridge itself assures that the maximum data
-	 * cycle configured for the transfer is used and splits it
-	 * automatically for non-aligned addresses, so we don't want the
-	 * overhead of needlessly forcing small transfers for the entire cycle.
-	 */
-	if (addr & 0x1) {
-		*(u8 *)buf = fake_vmeread8(priv, addr, aspace, cycle);
-		done += 1;
-		if (done == count)
-			goto out;
-	}
-	if ((dwidth == VME_D16) || (dwidth == VME_D32)) {
-		if ((addr + done) & 0x2) {
-			if ((count - done) < 2) {
-				*(u8 *)(buf + done) = fake_vmeread8(priv,
-						addr + done, aspace, cycle);
-				done += 1;
-				goto out;
-			} else {
-				*(u16 *)(buf + done) = fake_vmeread16(priv,
-						addr + done, aspace, cycle);
-				done += 2;
-			}
-		}
-	}
-
-	if (dwidth == VME_D32) {
-		count32 = (count - done) & ~0x3;
-		while (done < count32) {
-			*(u32 *)(buf + done) = fake_vmeread32(priv, addr + done,
-					aspace, cycle);
-			done += 4;
-		}
-	} else if (dwidth == VME_D16) {
-		count32 = (count - done) & ~0x3;
-		while (done < count32) {
-			*(u16 *)(buf + done) = fake_vmeread16(priv, addr + done,
-					aspace, cycle);
-			done += 2;
-		}
-	} else if (dwidth == VME_D8) {
-		count32 = (count - done);
-		while (done < count32) {
-			*(u8 *)(buf + done) = fake_vmeread8(priv, addr + done,
-					aspace, cycle);
-			done += 1;
-		}
-
-	}
-
-	if ((dwidth == VME_D16) || (dwidth == VME_D32)) {
-		if ((count - done) & 0x2) {
-			*(u16 *)(buf + done) = fake_vmeread16(priv, addr + done,
-					aspace, cycle);
-			done += 2;
-		}
-	}
-	if ((count - done) & 0x1) {
-		*(u8 *)(buf + done) = fake_vmeread8(priv, addr + done, aspace,
-				cycle);
-		done += 1;
-	}
-
-out:
-	retval = count;
-
-	spin_unlock(&image->lock);
-
-	return retval;
-}
-
-static noinline_for_stack void fake_vmewrite8(struct fake_driver *bridge,
-					      u8 *buf, unsigned long long addr,
-					      u32 aspace, u32 cycle)
-{
-	int i;
-	unsigned long long start, end, offset;
-	u8 *loc;
-
-	for (i = 0; i < FAKE_MAX_SLAVE; i++) {
-		if (aspace != bridge->slaves[i].aspace)
-			continue;
-
-		if (cycle != bridge->slaves[i].cycle)
-			continue;
-
-		start = bridge->slaves[i].vme_base;
-		end = bridge->slaves[i].vme_base + bridge->slaves[i].size;
-
-		if ((addr >= start) && (addr < end)) {
-			offset = addr - bridge->slaves[i].vme_base;
-			loc = (u8 *)((void *)bridge->slaves[i].buf_base + offset);
-			*loc = *buf;
-
-			break;
-		}
-	}
-
-	fake_lm_check(bridge, addr, aspace, cycle);
-
-}
-
-static noinline_for_stack void fake_vmewrite16(struct fake_driver *bridge,
-					       u16 *buf, unsigned long long addr,
-					       u32 aspace, u32 cycle)
-{
-	int i;
-	unsigned long long start, end, offset;
-	u16 *loc;
-
-	for (i = 0; i < FAKE_MAX_SLAVE; i++) {
-		if (aspace != bridge->slaves[i].aspace)
-			continue;
-
-		if (cycle != bridge->slaves[i].cycle)
-			continue;
-
-		start = bridge->slaves[i].vme_base;
-		end = bridge->slaves[i].vme_base + bridge->slaves[i].size;
-
-		if ((addr >= start) && ((addr + 1) < end)) {
-			offset = addr - bridge->slaves[i].vme_base;
-			loc = (u16 *)((void *)bridge->slaves[i].buf_base + offset);
-			*loc = *buf;
-
-			break;
-		}
-	}
-
-	fake_lm_check(bridge, addr, aspace, cycle);
-
-}
-
-static noinline_for_stack void fake_vmewrite32(struct fake_driver *bridge,
-					       u32 *buf, unsigned long long addr,
-					       u32 aspace, u32 cycle)
-{
-	int i;
-	unsigned long long start, end, offset;
-	u32 *loc;
-
-	for (i = 0; i < FAKE_MAX_SLAVE; i++) {
-		if (aspace != bridge->slaves[i].aspace)
-			continue;
-
-		if (cycle != bridge->slaves[i].cycle)
-			continue;
-
-		start = bridge->slaves[i].vme_base;
-		end = bridge->slaves[i].vme_base + bridge->slaves[i].size;
-
-		if ((addr >= start) && ((addr + 3) < end)) {
-			offset = addr - bridge->slaves[i].vme_base;
-			loc = (u32 *)((void *)bridge->slaves[i].buf_base + offset);
-			*loc = *buf;
-
-			break;
-		}
-	}
-
-	fake_lm_check(bridge, addr, aspace, cycle);
-
-}
-
-static ssize_t fake_master_write(struct vme_master_resource *image, void *buf,
-		size_t count, loff_t offset)
-{
-	int retval = 0;
-	u32 aspace, cycle, dwidth;
-	unsigned long long addr;
-	int i;
-	unsigned int done = 0;
-	unsigned int count32;
-
-	struct vme_bridge *fake_bridge;
-	struct fake_driver *bridge;
-
-	fake_bridge = image->parent;
-
-	bridge = fake_bridge->driver_priv;
-
-	i = image->number;
-
-	addr = bridge->masters[i].vme_base + offset;
-	aspace = bridge->masters[i].aspace;
-	cycle = bridge->masters[i].cycle;
-	dwidth = bridge->masters[i].dwidth;
-
-	spin_lock(&image->lock);
-
-	/* Here we apply for the same strategy we do in master_read
-	 * function in order to assure the correct cycles.
-	 */
-	if (addr & 0x1) {
-		fake_vmewrite8(bridge, (u8 *)buf, addr, aspace, cycle);
-		done += 1;
-		if (done == count)
-			goto out;
-	}
-
-	if ((dwidth == VME_D16) || (dwidth == VME_D32)) {
-		if ((addr + done) & 0x2) {
-			if ((count - done) < 2) {
-				fake_vmewrite8(bridge, (u8 *)(buf + done),
-						addr + done, aspace, cycle);
-				done += 1;
-				goto out;
-			} else {
-				fake_vmewrite16(bridge, (u16 *)(buf + done),
-						addr + done, aspace, cycle);
-				done += 2;
-			}
-		}
-	}
-
-	if (dwidth == VME_D32) {
-		count32 = (count - done) & ~0x3;
-		while (done < count32) {
-			fake_vmewrite32(bridge, (u32 *)(buf + done),
-					addr + done, aspace, cycle);
-			done += 4;
-		}
-	} else if (dwidth == VME_D16) {
-		count32 = (count - done) & ~0x3;
-		while (done < count32) {
-			fake_vmewrite16(bridge, (u16 *)(buf + done),
-					addr + done, aspace, cycle);
-			done += 2;
-		}
-	} else if (dwidth == VME_D8) {
-		count32 = (count - done);
-		while (done < count32) {
-			fake_vmewrite8(bridge, (u8 *)(buf + done), addr + done,
-					aspace, cycle);
-			done += 1;
-		}
-
-	}
-
-	if ((dwidth == VME_D16) || (dwidth == VME_D32)) {
-		if ((count - done) & 0x2) {
-			fake_vmewrite16(bridge, (u16 *)(buf + done),
-					addr + done, aspace, cycle);
-			done += 2;
-		}
-	}
-
-	if ((count - done) & 0x1) {
-		fake_vmewrite8(bridge, (u8 *)(buf + done), addr + done, aspace,
-				cycle);
-		done += 1;
-	}
-
-out:
-	retval = count;
-
-	spin_unlock(&image->lock);
-
-	return retval;
-}
-
-/*
- * Perform an RMW cycle on the VME bus.
- *
- * Requires a previously configured master window, returns final value.
- */
-static unsigned int fake_master_rmw(struct vme_master_resource *image,
-		unsigned int mask, unsigned int compare, unsigned int swap,
-		loff_t offset)
-{
-	u32 tmp, base;
-	u32 aspace, cycle;
-	int i;
-	struct fake_driver *bridge;
-
-	bridge = image->parent->driver_priv;
-
-	/* Find the PCI address that maps to the desired VME address */
-	i = image->number;
-
-	base = bridge->masters[i].vme_base;
-	aspace = bridge->masters[i].aspace;
-	cycle = bridge->masters[i].cycle;
-
-	/* Lock image */
-	spin_lock(&image->lock);
-
-	/* Read existing value */
-	tmp = fake_vmeread32(bridge, base + offset, aspace, cycle);
-
-	/* Perform check */
-	if ((tmp && mask) == (compare && mask)) {
-		tmp = tmp | (mask | swap);
-		tmp = tmp & (~mask | swap);
-
-		/* Write back */
-		fake_vmewrite32(bridge, &tmp, base + offset, aspace, cycle);
-	}
-
-	/* Unlock image */
-	spin_unlock(&image->lock);
-
-	return tmp;
-}
-
-/*
- * All 4 location monitors reside at the same base - this is therefore a
- * system wide configuration.
- *
- * This does not enable the LM monitor - that should be done when the first
- * callback is attached and disabled when the last callback is removed.
- */
-static int fake_lm_set(struct vme_lm_resource *lm, unsigned long long lm_base,
-		u32 aspace, u32 cycle)
-{
-	int i;
-	struct vme_bridge *fake_bridge;
-	struct fake_driver *bridge;
-
-	fake_bridge = lm->parent;
-
-	bridge = fake_bridge->driver_priv;
-
-	mutex_lock(&lm->mtx);
-
-	/* If we already have a callback attached, we can't move it! */
-	for (i = 0; i < lm->monitors; i++) {
-		if (bridge->lm_callback[i]) {
-			mutex_unlock(&lm->mtx);
-			pr_err("Location monitor callback attached, can't reset\n");
-			return -EBUSY;
-		}
-	}
-
-	switch (aspace) {
-	case VME_A16:
-	case VME_A24:
-	case VME_A32:
-	case VME_A64:
-		break;
-	default:
-		mutex_unlock(&lm->mtx);
-		pr_err("Invalid address space\n");
-		return -EINVAL;
-	}
-
-	bridge->lm_base = lm_base;
-	bridge->lm_aspace = aspace;
-	bridge->lm_cycle = cycle;
-
-	mutex_unlock(&lm->mtx);
-
-	return 0;
-}
-
-/* Get configuration of the callback monitor and return whether it is enabled
- * or disabled.
- */
-static int fake_lm_get(struct vme_lm_resource *lm,
-		unsigned long long *lm_base, u32 *aspace, u32 *cycle)
-{
-	struct fake_driver *bridge;
-
-	bridge = lm->parent->driver_priv;
-
-	mutex_lock(&lm->mtx);
-
-	*lm_base = bridge->lm_base;
-	*aspace = bridge->lm_aspace;
-	*cycle = bridge->lm_cycle;
-
-	mutex_unlock(&lm->mtx);
-
-	return bridge->lm_enabled;
-}
-
-/*
- * Attach a callback to a specific location monitor.
- *
- * Callback will be passed the monitor triggered.
- */
-static int fake_lm_attach(struct vme_lm_resource *lm, int monitor,
-		void (*callback)(void *), void *data)
-{
-	struct vme_bridge *fake_bridge;
-	struct fake_driver *bridge;
-
-	fake_bridge = lm->parent;
-
-	bridge = fake_bridge->driver_priv;
-
-	mutex_lock(&lm->mtx);
-
-	/* Ensure that the location monitor is configured - need PGM or DATA */
-	if (bridge->lm_cycle == 0) {
-		mutex_unlock(&lm->mtx);
-		pr_err("Location monitor not properly configured\n");
-		return -EINVAL;
-	}
-
-	/* Check that a callback isn't already attached */
-	if (bridge->lm_callback[monitor]) {
-		mutex_unlock(&lm->mtx);
-		pr_err("Existing callback attached\n");
-		return -EBUSY;
-	}
-
-	/* Attach callback */
-	bridge->lm_callback[monitor] = callback;
-	bridge->lm_data[monitor] = data;
-
-	/* Ensure that global Location Monitor Enable set */
-	bridge->lm_enabled = 1;
-
-	mutex_unlock(&lm->mtx);
-
-	return 0;
-}
-
-/*
- * Detach a callback function forn a specific location monitor.
- */
-static int fake_lm_detach(struct vme_lm_resource *lm, int monitor)
-{
-	u32 tmp;
-	int i;
-	struct fake_driver *bridge;
-
-	bridge = lm->parent->driver_priv;
-
-	mutex_lock(&lm->mtx);
-
-	/* Detach callback */
-	bridge->lm_callback[monitor] = NULL;
-	bridge->lm_data[monitor] = NULL;
-
-	/* If all location monitors disabled, disable global Location Monitor */
-	tmp = 0;
-	for (i = 0; i < lm->monitors; i++) {
-		if (bridge->lm_callback[i])
-			tmp = 1;
-	}
-
-	if (tmp == 0)
-		bridge->lm_enabled = 0;
-
-	mutex_unlock(&lm->mtx);
-
-	return 0;
-}
-
-/*
- * Determine Geographical Addressing
- */
-static int fake_slot_get(struct vme_bridge *fake_bridge)
-{
-	return geoid;
-}
-
-static void *fake_alloc_consistent(struct device *parent, size_t size,
-		dma_addr_t *dma)
-{
-	void *alloc = kmalloc(size, GFP_KERNEL);
-
-	if (alloc)
-		*dma = fake_ptr_to_pci(alloc);
-
-	return alloc;
-}
-
-static void fake_free_consistent(struct device *parent, size_t size,
-		void *vaddr, dma_addr_t dma)
-{
-	kfree(vaddr);
-/*
-	dma_free_coherent(parent, size, vaddr, dma);
-*/
-}
-
-/*
- * Configure CR/CSR space
- *
- * Access to the CR/CSR can be configured at power-up. The location of the
- * CR/CSR registers in the CR/CSR address space is determined by the boards
- * Geographic address.
- *
- * Each board has a 512kB window, with the highest 4kB being used for the
- * boards registers, this means there is a fix length 508kB window which must
- * be mapped onto PCI memory.
- */
-static int fake_crcsr_init(struct vme_bridge *fake_bridge)
-{
-	u32 vstat;
-	struct fake_driver *bridge;
-
-	bridge = fake_bridge->driver_priv;
-
-	/* Allocate mem for CR/CSR image */
-	bridge->crcsr_kernel = kzalloc(VME_CRCSR_BUF_SIZE, GFP_KERNEL);
-	bridge->crcsr_bus = fake_ptr_to_pci(bridge->crcsr_kernel);
-	if (!bridge->crcsr_kernel)
-		return -ENOMEM;
-
-	vstat = fake_slot_get(fake_bridge);
-
-	pr_info("CR/CSR Offset: %d\n", vstat);
-
-	return 0;
-}
-
-static void fake_crcsr_exit(struct vme_bridge *fake_bridge)
-{
-	struct fake_driver *bridge;
-
-	bridge = fake_bridge->driver_priv;
-
-	kfree(bridge->crcsr_kernel);
-}
-
-
-static int __init fake_init(void)
-{
-	int retval, i;
-	struct list_head *pos = NULL, *n;
-	struct vme_bridge *fake_bridge;
-	struct fake_driver *fake_device;
-	struct vme_master_resource *master_image;
-	struct vme_slave_resource *slave_image;
-	struct vme_lm_resource *lm;
-
-	/* We need a fake parent device */
-	vme_root = __root_device_register("vme", THIS_MODULE);
-
-	/* If we want to support more than one bridge at some point, we need to
-	 * dynamically allocate this so we get one per device.
-	 */
-	fake_bridge = kzalloc(sizeof(*fake_bridge), GFP_KERNEL);
-	if (!fake_bridge) {
-		retval = -ENOMEM;
-		goto err_struct;
-	}
-
-	fake_device = kzalloc(sizeof(*fake_device), GFP_KERNEL);
-	if (!fake_device) {
-		retval = -ENOMEM;
-		goto err_driver;
-	}
-
-	fake_bridge->driver_priv = fake_device;
-
-	fake_bridge->parent = vme_root;
-
-	fake_device->parent = fake_bridge;
-
-	/* Initialize wait queues & mutual exclusion flags */
-	mutex_init(&fake_device->vme_int);
-	mutex_init(&fake_bridge->irq_mtx);
-	tasklet_init(&fake_device->int_tasklet, fake_VIRQ_tasklet,
-			(unsigned long) fake_bridge);
-
-	strcpy(fake_bridge->name, driver_name);
-
-	/* Add master windows to list */
-	INIT_LIST_HEAD(&fake_bridge->master_resources);
-	for (i = 0; i < FAKE_MAX_MASTER; i++) {
-		master_image = kmalloc(sizeof(*master_image), GFP_KERNEL);
-		if (!master_image) {
-			retval = -ENOMEM;
-			goto err_master;
-		}
-		master_image->parent = fake_bridge;
-		spin_lock_init(&master_image->lock);
-		master_image->locked = 0;
-		master_image->number = i;
-		master_image->address_attr = VME_A16 | VME_A24 | VME_A32 |
-			VME_A64;
-		master_image->cycle_attr = VME_SCT | VME_BLT | VME_MBLT |
-			VME_2eVME | VME_2eSST | VME_2eSSTB | VME_2eSST160 |
-			VME_2eSST267 | VME_2eSST320 | VME_SUPER | VME_USER |
-			VME_PROG | VME_DATA;
-		master_image->width_attr = VME_D16 | VME_D32;
-		memset(&master_image->bus_resource, 0,
-				sizeof(struct resource));
-		master_image->kern_base  = NULL;
-		list_add_tail(&master_image->list,
-				&fake_bridge->master_resources);
-	}
-
-	/* Add slave windows to list */
-	INIT_LIST_HEAD(&fake_bridge->slave_resources);
-	for (i = 0; i < FAKE_MAX_SLAVE; i++) {
-		slave_image = kmalloc(sizeof(*slave_image), GFP_KERNEL);
-		if (!slave_image) {
-			retval = -ENOMEM;
-			goto err_slave;
-		}
-		slave_image->parent = fake_bridge;
-		mutex_init(&slave_image->mtx);
-		slave_image->locked = 0;
-		slave_image->number = i;
-		slave_image->address_attr = VME_A16 | VME_A24 | VME_A32 |
-			VME_A64 | VME_CRCSR | VME_USER1 | VME_USER2 |
-			VME_USER3 | VME_USER4;
-		slave_image->cycle_attr = VME_SCT | VME_BLT | VME_MBLT |
-			VME_2eVME | VME_2eSST | VME_2eSSTB | VME_2eSST160 |
-			VME_2eSST267 | VME_2eSST320 | VME_SUPER | VME_USER |
-			VME_PROG | VME_DATA;
-		list_add_tail(&slave_image->list,
-				&fake_bridge->slave_resources);
-	}
-
-	/* Add location monitor to list */
-	INIT_LIST_HEAD(&fake_bridge->lm_resources);
-	lm = kmalloc(sizeof(*lm), GFP_KERNEL);
-	if (!lm) {
-		retval = -ENOMEM;
-		goto err_lm;
-	}
-	lm->parent = fake_bridge;
-	mutex_init(&lm->mtx);
-	lm->locked = 0;
-	lm->number = 1;
-	lm->monitors = 4;
-	list_add_tail(&lm->list, &fake_bridge->lm_resources);
-
-	fake_bridge->slave_get = fake_slave_get;
-	fake_bridge->slave_set = fake_slave_set;
-	fake_bridge->master_get = fake_master_get;
-	fake_bridge->master_set = fake_master_set;
-	fake_bridge->master_read = fake_master_read;
-	fake_bridge->master_write = fake_master_write;
-	fake_bridge->master_rmw = fake_master_rmw;
-	fake_bridge->irq_set = fake_irq_set;
-	fake_bridge->irq_generate = fake_irq_generate;
-	fake_bridge->lm_set = fake_lm_set;
-	fake_bridge->lm_get = fake_lm_get;
-	fake_bridge->lm_attach = fake_lm_attach;
-	fake_bridge->lm_detach = fake_lm_detach;
-	fake_bridge->slot_get = fake_slot_get;
-	fake_bridge->alloc_consistent = fake_alloc_consistent;
-	fake_bridge->free_consistent = fake_free_consistent;
-
-	pr_info("Board is%s the VME system controller\n",
-			(geoid == 1) ? "" : " not");
-
-	pr_info("VME geographical address is set to %d\n", geoid);
-
-	retval = fake_crcsr_init(fake_bridge);
-	if (retval) {
-		pr_err("CR/CSR configuration failed.\n");
-		goto err_crcsr;
-	}
-
-	retval = vme_register_bridge(fake_bridge);
-	if (retval != 0) {
-		pr_err("Chip Registration failed.\n");
-		goto err_reg;
-	}
-
-	exit_pointer = fake_bridge;
-
-	return 0;
-
-err_reg:
-	fake_crcsr_exit(fake_bridge);
-err_crcsr:
-err_lm:
-	/* resources are stored in link list */
-	list_for_each_safe(pos, n, &fake_bridge->lm_resources) {
-		lm = list_entry(pos, struct vme_lm_resource, list);
-		list_del(pos);
-		kfree(lm);
-	}
-err_slave:
-	/* resources are stored in link list */
-	list_for_each_safe(pos, n, &fake_bridge->slave_resources) {
-		slave_image = list_entry(pos, struct vme_slave_resource, list);
-		list_del(pos);
-		kfree(slave_image);
-	}
-err_master:
-	/* resources are stored in link list */
-	list_for_each_safe(pos, n, &fake_bridge->master_resources) {
-		master_image = list_entry(pos, struct vme_master_resource,
-				list);
-		list_del(pos);
-		kfree(master_image);
-	}
-
-	kfree(fake_device);
-err_driver:
-	kfree(fake_bridge);
-err_struct:
-	return retval;
-
-}
-
-
-static void __exit fake_exit(void)
-{
-	struct list_head *pos = NULL;
-	struct list_head *tmplist;
-	struct vme_master_resource *master_image;
-	struct vme_slave_resource *slave_image;
-	int i;
-	struct vme_bridge *fake_bridge;
-	struct fake_driver *bridge;
-
-	fake_bridge = exit_pointer;
-
-	bridge = fake_bridge->driver_priv;
-
-	pr_debug("Driver is being unloaded.\n");
-
-	/*
-	 *  Shutdown all inbound and outbound windows.
-	 */
-	for (i = 0; i < FAKE_MAX_MASTER; i++)
-		bridge->masters[i].enabled = 0;
-
-	for (i = 0; i < FAKE_MAX_SLAVE; i++)
-		bridge->slaves[i].enabled = 0;
-
-	/*
-	 *  Shutdown Location monitor.
-	 */
-	bridge->lm_enabled = 0;
-
-	vme_unregister_bridge(fake_bridge);
-
-	fake_crcsr_exit(fake_bridge);
-	/* resources are stored in link list */
-	list_for_each_safe(pos, tmplist, &fake_bridge->slave_resources) {
-		slave_image = list_entry(pos, struct vme_slave_resource, list);
-		list_del(pos);
-		kfree(slave_image);
-	}
-
-	/* resources are stored in link list */
-	list_for_each_safe(pos, tmplist, &fake_bridge->master_resources) {
-		master_image = list_entry(pos, struct vme_master_resource,
-				list);
-		list_del(pos);
-		kfree(master_image);
-	}
-
-	kfree(fake_bridge->driver_priv);
-
-	kfree(fake_bridge);
-
-	root_device_unregister(vme_root);
-}
-
-
-MODULE_PARM_DESC(geoid, "Set geographical addressing");
-module_param(geoid, int, 0);
-
-MODULE_DESCRIPTION("Fake VME bridge driver");
-MODULE_LICENSE("GPL");
-
-module_init(fake_init);
-module_exit(fake_exit);
diff --git a/drivers/vme/bridges/vme_tsi148.c b/drivers/vme/bridges/vme_tsi148.c
deleted file mode 100644
index be9051b02f24..000000000000
--- a/drivers/vme/bridges/vme_tsi148.c
+++ /dev/null
@@ -1,2661 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Support for the Tundra TSI148 VME-PCI Bridge Chip
- *
- * Author: Martyn Welch <martyn.welch@ge.com>
- * Copyright 2008 GE Intelligent Platforms Embedded Systems, Inc.
- *
- * Based on work by Tom Armistead and Ajit Prem
- * Copyright 2004 Motorola Inc.
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/mm.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/proc_fs.h>
-#include <linux/pci.h>
-#include <linux/poll.h>
-#include <linux/dma-mapping.h>
-#include <linux/interrupt.h>
-#include <linux/spinlock.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/io.h>
-#include <linux/uaccess.h>
-#include <linux/byteorder/generic.h>
-#include <linux/vme.h>
-
-#include "../vme_bridge.h"
-#include "vme_tsi148.h"
-
-static int tsi148_probe(struct pci_dev *, const struct pci_device_id *);
-static void tsi148_remove(struct pci_dev *);
-
-
-/* Module parameter */
-static bool err_chk;
-static int geoid;
-
-static const char driver_name[] = "vme_tsi148";
-
-static const struct pci_device_id tsi148_ids[] = {
-	{ PCI_DEVICE(PCI_VENDOR_ID_TUNDRA, PCI_DEVICE_ID_TUNDRA_TSI148) },
-	{ },
-};
-
-MODULE_DEVICE_TABLE(pci, tsi148_ids);
-
-static struct pci_driver tsi148_driver = {
-	.name = driver_name,
-	.id_table = tsi148_ids,
-	.probe = tsi148_probe,
-	.remove = tsi148_remove,
-};
-
-static void reg_join(unsigned int high, unsigned int low,
-	unsigned long long *variable)
-{
-	*variable = (unsigned long long)high << 32;
-	*variable |= (unsigned long long)low;
-}
-
-static void reg_split(unsigned long long variable, unsigned int *high,
-	unsigned int *low)
-{
-	*low = (unsigned int)variable & 0xFFFFFFFF;
-	*high = (unsigned int)(variable >> 32);
-}
-
-/*
- * Wakes up DMA queue.
- */
-static u32 tsi148_DMA_irqhandler(struct tsi148_driver *bridge,
-	int channel_mask)
-{
-	u32 serviced = 0;
-
-	if (channel_mask & TSI148_LCSR_INTS_DMA0S) {
-		wake_up(&bridge->dma_queue[0]);
-		serviced |= TSI148_LCSR_INTC_DMA0C;
-	}
-	if (channel_mask & TSI148_LCSR_INTS_DMA1S) {
-		wake_up(&bridge->dma_queue[1]);
-		serviced |= TSI148_LCSR_INTC_DMA1C;
-	}
-
-	return serviced;
-}
-
-/*
- * Wake up location monitor queue
- */
-static u32 tsi148_LM_irqhandler(struct tsi148_driver *bridge, u32 stat)
-{
-	int i;
-	u32 serviced = 0;
-
-	for (i = 0; i < 4; i++) {
-		if (stat & TSI148_LCSR_INTS_LMS[i]) {
-			/* We only enable interrupts if the callback is set */
-			bridge->lm_callback[i](bridge->lm_data[i]);
-			serviced |= TSI148_LCSR_INTC_LMC[i];
-		}
-	}
-
-	return serviced;
-}
-
-/*
- * Wake up mail box queue.
- *
- * XXX This functionality is not exposed up though API.
- */
-static u32 tsi148_MB_irqhandler(struct vme_bridge *tsi148_bridge, u32 stat)
-{
-	int i;
-	u32 val;
-	u32 serviced = 0;
-	struct tsi148_driver *bridge;
-
-	bridge = tsi148_bridge->driver_priv;
-
-	for (i = 0; i < 4; i++) {
-		if (stat & TSI148_LCSR_INTS_MBS[i]) {
-			val = ioread32be(bridge->base +	TSI148_GCSR_MBOX[i]);
-			dev_err(tsi148_bridge->parent, "VME Mailbox %d received"
-				": 0x%x\n", i, val);
-			serviced |= TSI148_LCSR_INTC_MBC[i];
-		}
-	}
-
-	return serviced;
-}
-
-/*
- * Display error & status message when PERR (PCI) exception interrupt occurs.
- */
-static u32 tsi148_PERR_irqhandler(struct vme_bridge *tsi148_bridge)
-{
-	struct tsi148_driver *bridge;
-
-	bridge = tsi148_bridge->driver_priv;
-
-	dev_err(tsi148_bridge->parent, "PCI Exception at address: 0x%08x:%08x, "
-		"attributes: %08x\n",
-		ioread32be(bridge->base + TSI148_LCSR_EDPAU),
-		ioread32be(bridge->base + TSI148_LCSR_EDPAL),
-		ioread32be(bridge->base + TSI148_LCSR_EDPAT));
-
-	dev_err(tsi148_bridge->parent, "PCI-X attribute reg: %08x, PCI-X split "
-		"completion reg: %08x\n",
-		ioread32be(bridge->base + TSI148_LCSR_EDPXA),
-		ioread32be(bridge->base + TSI148_LCSR_EDPXS));
-
-	iowrite32be(TSI148_LCSR_EDPAT_EDPCL, bridge->base + TSI148_LCSR_EDPAT);
-
-	return TSI148_LCSR_INTC_PERRC;
-}
-
-/*
- * Save address and status when VME error interrupt occurs.
- */
-static u32 tsi148_VERR_irqhandler(struct vme_bridge *tsi148_bridge)
-{
-	unsigned int error_addr_high, error_addr_low;
-	unsigned long long error_addr;
-	u32 error_attrib;
-	int error_am;
-	struct tsi148_driver *bridge;
-
-	bridge = tsi148_bridge->driver_priv;
-
-	error_addr_high = ioread32be(bridge->base + TSI148_LCSR_VEAU);
-	error_addr_low = ioread32be(bridge->base + TSI148_LCSR_VEAL);
-	error_attrib = ioread32be(bridge->base + TSI148_LCSR_VEAT);
-	error_am = (error_attrib & TSI148_LCSR_VEAT_AM_M) >> 8;
-
-	reg_join(error_addr_high, error_addr_low, &error_addr);
-
-	/* Check for exception register overflow (we have lost error data) */
-	if (error_attrib & TSI148_LCSR_VEAT_VEOF) {
-		dev_err(tsi148_bridge->parent, "VME Bus Exception Overflow "
-			"Occurred\n");
-	}
-
-	if (err_chk)
-		vme_bus_error_handler(tsi148_bridge, error_addr, error_am);
-	else
-		dev_err(tsi148_bridge->parent,
-			"VME Bus Error at address: 0x%llx, attributes: %08x\n",
-			error_addr, error_attrib);
-
-	/* Clear Status */
-	iowrite32be(TSI148_LCSR_VEAT_VESCL, bridge->base + TSI148_LCSR_VEAT);
-
-	return TSI148_LCSR_INTC_VERRC;
-}
-
-/*
- * Wake up IACK queue.
- */
-static u32 tsi148_IACK_irqhandler(struct tsi148_driver *bridge)
-{
-	wake_up(&bridge->iack_queue);
-
-	return TSI148_LCSR_INTC_IACKC;
-}
-
-/*
- * Calling VME bus interrupt callback if provided.
- */
-static u32 tsi148_VIRQ_irqhandler(struct vme_bridge *tsi148_bridge,
-	u32 stat)
-{
-	int vec, i, serviced = 0;
-	struct tsi148_driver *bridge;
-
-	bridge = tsi148_bridge->driver_priv;
-
-	for (i = 7; i > 0; i--) {
-		if (stat & (1 << i)) {
-			/*
-			 * Note: Even though the registers are defined as
-			 * 32-bits in the spec, we only want to issue 8-bit
-			 * IACK cycles on the bus, read from offset 3.
-			 */
-			vec = ioread8(bridge->base + TSI148_LCSR_VIACK[i] + 3);
-
-			vme_irq_handler(tsi148_bridge, i, vec);
-
-			serviced |= (1 << i);
-		}
-	}
-
-	return serviced;
-}
-
-/*
- * Top level interrupt handler.  Clears appropriate interrupt status bits and
- * then calls appropriate sub handler(s).
- */
-static irqreturn_t tsi148_irqhandler(int irq, void *ptr)
-{
-	u32 stat, enable, serviced = 0;
-	struct vme_bridge *tsi148_bridge;
-	struct tsi148_driver *bridge;
-
-	tsi148_bridge = ptr;
-
-	bridge = tsi148_bridge->driver_priv;
-
-	/* Determine which interrupts are unmasked and set */
-	enable = ioread32be(bridge->base + TSI148_LCSR_INTEO);
-	stat = ioread32be(bridge->base + TSI148_LCSR_INTS);
-
-	/* Only look at unmasked interrupts */
-	stat &= enable;
-
-	if (unlikely(!stat))
-		return IRQ_NONE;
-
-	/* Call subhandlers as appropriate */
-	/* DMA irqs */
-	if (stat & (TSI148_LCSR_INTS_DMA1S | TSI148_LCSR_INTS_DMA0S))
-		serviced |= tsi148_DMA_irqhandler(bridge, stat);
-
-	/* Location monitor irqs */
-	if (stat & (TSI148_LCSR_INTS_LM3S | TSI148_LCSR_INTS_LM2S |
-			TSI148_LCSR_INTS_LM1S | TSI148_LCSR_INTS_LM0S))
-		serviced |= tsi148_LM_irqhandler(bridge, stat);
-
-	/* Mail box irqs */
-	if (stat & (TSI148_LCSR_INTS_MB3S | TSI148_LCSR_INTS_MB2S |
-			TSI148_LCSR_INTS_MB1S | TSI148_LCSR_INTS_MB0S))
-		serviced |= tsi148_MB_irqhandler(tsi148_bridge, stat);
-
-	/* PCI bus error */
-	if (stat & TSI148_LCSR_INTS_PERRS)
-		serviced |= tsi148_PERR_irqhandler(tsi148_bridge);
-
-	/* VME bus error */
-	if (stat & TSI148_LCSR_INTS_VERRS)
-		serviced |= tsi148_VERR_irqhandler(tsi148_bridge);
-
-	/* IACK irq */
-	if (stat & TSI148_LCSR_INTS_IACKS)
-		serviced |= tsi148_IACK_irqhandler(bridge);
-
-	/* VME bus irqs */
-	if (stat & (TSI148_LCSR_INTS_IRQ7S | TSI148_LCSR_INTS_IRQ6S |
-			TSI148_LCSR_INTS_IRQ5S | TSI148_LCSR_INTS_IRQ4S |
-			TSI148_LCSR_INTS_IRQ3S | TSI148_LCSR_INTS_IRQ2S |
-			TSI148_LCSR_INTS_IRQ1S))
-		serviced |= tsi148_VIRQ_irqhandler(tsi148_bridge, stat);
-
-	/* Clear serviced interrupts */
-	iowrite32be(serviced, bridge->base + TSI148_LCSR_INTC);
-
-	return IRQ_HANDLED;
-}
-
-static int tsi148_irq_init(struct vme_bridge *tsi148_bridge)
-{
-	int result;
-	unsigned int tmp;
-	struct pci_dev *pdev;
-	struct tsi148_driver *bridge;
-
-	pdev = to_pci_dev(tsi148_bridge->parent);
-
-	bridge = tsi148_bridge->driver_priv;
-
-	result = request_irq(pdev->irq,
-			     tsi148_irqhandler,
-			     IRQF_SHARED,
-			     driver_name, tsi148_bridge);
-	if (result) {
-		dev_err(tsi148_bridge->parent, "Can't get assigned pci irq "
-			"vector %02X\n", pdev->irq);
-		return result;
-	}
-
-	/* Enable and unmask interrupts */
-	tmp = TSI148_LCSR_INTEO_DMA1EO | TSI148_LCSR_INTEO_DMA0EO |
-		TSI148_LCSR_INTEO_MB3EO | TSI148_LCSR_INTEO_MB2EO |
-		TSI148_LCSR_INTEO_MB1EO | TSI148_LCSR_INTEO_MB0EO |
-		TSI148_LCSR_INTEO_PERREO | TSI148_LCSR_INTEO_VERREO |
-		TSI148_LCSR_INTEO_IACKEO;
-
-	/* This leaves the following interrupts masked.
-	 * TSI148_LCSR_INTEO_VIEEO
-	 * TSI148_LCSR_INTEO_SYSFLEO
-	 * TSI148_LCSR_INTEO_ACFLEO
-	 */
-
-	/* Don't enable Location Monitor interrupts here - they will be
-	 * enabled when the location monitors are properly configured and
-	 * a callback has been attached.
-	 * TSI148_LCSR_INTEO_LM0EO
-	 * TSI148_LCSR_INTEO_LM1EO
-	 * TSI148_LCSR_INTEO_LM2EO
-	 * TSI148_LCSR_INTEO_LM3EO
-	 */
-
-	/* Don't enable VME interrupts until we add a handler, else the board
-	 * will respond to it and we don't want that unless it knows how to
-	 * properly deal with it.
-	 * TSI148_LCSR_INTEO_IRQ7EO
-	 * TSI148_LCSR_INTEO_IRQ6EO
-	 * TSI148_LCSR_INTEO_IRQ5EO
-	 * TSI148_LCSR_INTEO_IRQ4EO
-	 * TSI148_LCSR_INTEO_IRQ3EO
-	 * TSI148_LCSR_INTEO_IRQ2EO
-	 * TSI148_LCSR_INTEO_IRQ1EO
-	 */
-
-	iowrite32be(tmp, bridge->base + TSI148_LCSR_INTEO);
-	iowrite32be(tmp, bridge->base + TSI148_LCSR_INTEN);
-
-	return 0;
-}
-
-static void tsi148_irq_exit(struct vme_bridge *tsi148_bridge,
-	struct pci_dev *pdev)
-{
-	struct tsi148_driver *bridge = tsi148_bridge->driver_priv;
-
-	/* Turn off interrupts */
-	iowrite32be(0x0, bridge->base + TSI148_LCSR_INTEO);
-	iowrite32be(0x0, bridge->base + TSI148_LCSR_INTEN);
-
-	/* Clear all interrupts */
-	iowrite32be(0xFFFFFFFF, bridge->base + TSI148_LCSR_INTC);
-
-	/* Detach interrupt handler */
-	free_irq(pdev->irq, tsi148_bridge);
-}
-
-/*
- * Check to see if an IACk has been received, return true (1) or false (0).
- */
-static int tsi148_iack_received(struct tsi148_driver *bridge)
-{
-	u32 tmp;
-
-	tmp = ioread32be(bridge->base + TSI148_LCSR_VICR);
-
-	if (tmp & TSI148_LCSR_VICR_IRQS)
-		return 0;
-	else
-		return 1;
-}
-
-/*
- * Configure VME interrupt
- */
-static void tsi148_irq_set(struct vme_bridge *tsi148_bridge, int level,
-	int state, int sync)
-{
-	struct pci_dev *pdev;
-	u32 tmp;
-	struct tsi148_driver *bridge;
-
-	bridge = tsi148_bridge->driver_priv;
-
-	/* We need to do the ordering differently for enabling and disabling */
-	if (state == 0) {
-		tmp = ioread32be(bridge->base + TSI148_LCSR_INTEN);
-		tmp &= ~TSI148_LCSR_INTEN_IRQEN[level - 1];
-		iowrite32be(tmp, bridge->base + TSI148_LCSR_INTEN);
-
-		tmp = ioread32be(bridge->base + TSI148_LCSR_INTEO);
-		tmp &= ~TSI148_LCSR_INTEO_IRQEO[level - 1];
-		iowrite32be(tmp, bridge->base + TSI148_LCSR_INTEO);
-
-		if (sync != 0) {
-			pdev = to_pci_dev(tsi148_bridge->parent);
-			synchronize_irq(pdev->irq);
-		}
-	} else {
-		tmp = ioread32be(bridge->base + TSI148_LCSR_INTEO);
-		tmp |= TSI148_LCSR_INTEO_IRQEO[level - 1];
-		iowrite32be(tmp, bridge->base + TSI148_LCSR_INTEO);
-
-		tmp = ioread32be(bridge->base + TSI148_LCSR_INTEN);
-		tmp |= TSI148_LCSR_INTEN_IRQEN[level - 1];
-		iowrite32be(tmp, bridge->base + TSI148_LCSR_INTEN);
-	}
-}
-
-/*
- * Generate a VME bus interrupt at the requested level & vector. Wait for
- * interrupt to be acked.
- */
-static int tsi148_irq_generate(struct vme_bridge *tsi148_bridge, int level,
-	int statid)
-{
-	u32 tmp;
-	struct tsi148_driver *bridge;
-
-	bridge = tsi148_bridge->driver_priv;
-
-	mutex_lock(&bridge->vme_int);
-
-	/* Read VICR register */
-	tmp = ioread32be(bridge->base + TSI148_LCSR_VICR);
-
-	/* Set Status/ID */
-	tmp = (tmp & ~TSI148_LCSR_VICR_STID_M) |
-		(statid & TSI148_LCSR_VICR_STID_M);
-	iowrite32be(tmp, bridge->base + TSI148_LCSR_VICR);
-
-	/* Assert VMEbus IRQ */
-	tmp = tmp | TSI148_LCSR_VICR_IRQL[level];
-	iowrite32be(tmp, bridge->base + TSI148_LCSR_VICR);
-
-	/* XXX Consider implementing a timeout? */
-	wait_event_interruptible(bridge->iack_queue,
-		tsi148_iack_received(bridge));
-
-	mutex_unlock(&bridge->vme_int);
-
-	return 0;
-}
-
-/*
- * Initialize a slave window with the requested attributes.
- */
-static int tsi148_slave_set(struct vme_slave_resource *image, int enabled,
-	unsigned long long vme_base, unsigned long long size,
-	dma_addr_t pci_base, u32 aspace, u32 cycle)
-{
-	unsigned int i, addr = 0, granularity = 0;
-	unsigned int temp_ctl = 0;
-	unsigned int vme_base_low, vme_base_high;
-	unsigned int vme_bound_low, vme_bound_high;
-	unsigned int pci_offset_low, pci_offset_high;
-	unsigned long long vme_bound, pci_offset;
-	struct vme_bridge *tsi148_bridge;
-	struct tsi148_driver *bridge;
-
-	tsi148_bridge = image->parent;
-	bridge = tsi148_bridge->driver_priv;
-
-	i = image->number;
-
-	switch (aspace) {
-	case VME_A16:
-		granularity = 0x10;
-		addr |= TSI148_LCSR_ITAT_AS_A16;
-		break;
-	case VME_A24:
-		granularity = 0x1000;
-		addr |= TSI148_LCSR_ITAT_AS_A24;
-		break;
-	case VME_A32:
-		granularity = 0x10000;
-		addr |= TSI148_LCSR_ITAT_AS_A32;
-		break;
-	case VME_A64:
-		granularity = 0x10000;
-		addr |= TSI148_LCSR_ITAT_AS_A64;
-		break;
-	default:
-		dev_err(tsi148_bridge->parent, "Invalid address space\n");
-		return -EINVAL;
-	}
-
-	/* Convert 64-bit variables to 2x 32-bit variables */
-	reg_split(vme_base, &vme_base_high, &vme_base_low);
-
-	/*
-	 * Bound address is a valid address for the window, adjust
-	 * accordingly
-	 */
-	vme_bound = vme_base + size - granularity;
-	reg_split(vme_bound, &vme_bound_high, &vme_bound_low);
-	pci_offset = (unsigned long long)pci_base - vme_base;
-	reg_split(pci_offset, &pci_offset_high, &pci_offset_low);
-
-	if (vme_base_low & (granularity - 1)) {
-		dev_err(tsi148_bridge->parent, "Invalid VME base alignment\n");
-		return -EINVAL;
-	}
-	if (vme_bound_low & (granularity - 1)) {
-		dev_err(tsi148_bridge->parent, "Invalid VME bound alignment\n");
-		return -EINVAL;
-	}
-	if (pci_offset_low & (granularity - 1)) {
-		dev_err(tsi148_bridge->parent, "Invalid PCI Offset "
-			"alignment\n");
-		return -EINVAL;
-	}
-
-	/*  Disable while we are mucking around */
-	temp_ctl = ioread32be(bridge->base + TSI148_LCSR_IT[i] +
-		TSI148_LCSR_OFFSET_ITAT);
-	temp_ctl &= ~TSI148_LCSR_ITAT_EN;
-	iowrite32be(temp_ctl, bridge->base + TSI148_LCSR_IT[i] +
-		TSI148_LCSR_OFFSET_ITAT);
-
-	/* Setup mapping */
-	iowrite32be(vme_base_high, bridge->base + TSI148_LCSR_IT[i] +
-		TSI148_LCSR_OFFSET_ITSAU);
-	iowrite32be(vme_base_low, bridge->base + TSI148_LCSR_IT[i] +
-		TSI148_LCSR_OFFSET_ITSAL);
-	iowrite32be(vme_bound_high, bridge->base + TSI148_LCSR_IT[i] +
-		TSI148_LCSR_OFFSET_ITEAU);
-	iowrite32be(vme_bound_low, bridge->base + TSI148_LCSR_IT[i] +
-		TSI148_LCSR_OFFSET_ITEAL);
-	iowrite32be(pci_offset_high, bridge->base + TSI148_LCSR_IT[i] +
-		TSI148_LCSR_OFFSET_ITOFU);
-	iowrite32be(pci_offset_low, bridge->base + TSI148_LCSR_IT[i] +
-		TSI148_LCSR_OFFSET_ITOFL);
-
-	/* Setup 2eSST speeds */
-	temp_ctl &= ~TSI148_LCSR_ITAT_2eSSTM_M;
-	switch (cycle & (VME_2eSST160 | VME_2eSST267 | VME_2eSST320)) {
-	case VME_2eSST160:
-		temp_ctl |= TSI148_LCSR_ITAT_2eSSTM_160;
-		break;
-	case VME_2eSST267:
-		temp_ctl |= TSI148_LCSR_ITAT_2eSSTM_267;
-		break;
-	case VME_2eSST320:
-		temp_ctl |= TSI148_LCSR_ITAT_2eSSTM_320;
-		break;
-	}
-
-	/* Setup cycle types */
-	temp_ctl &= ~(0x1F << 7);
-	if (cycle & VME_BLT)
-		temp_ctl |= TSI148_LCSR_ITAT_BLT;
-	if (cycle & VME_MBLT)
-		temp_ctl |= TSI148_LCSR_ITAT_MBLT;
-	if (cycle & VME_2eVME)
-		temp_ctl |= TSI148_LCSR_ITAT_2eVME;
-	if (cycle & VME_2eSST)
-		temp_ctl |= TSI148_LCSR_ITAT_2eSST;
-	if (cycle & VME_2eSSTB)
-		temp_ctl |= TSI148_LCSR_ITAT_2eSSTB;
-
-	/* Setup address space */
-	temp_ctl &= ~TSI148_LCSR_ITAT_AS_M;
-	temp_ctl |= addr;
-
-	temp_ctl &= ~0xF;
-	if (cycle & VME_SUPER)
-		temp_ctl |= TSI148_LCSR_ITAT_SUPR ;
-	if (cycle & VME_USER)
-		temp_ctl |= TSI148_LCSR_ITAT_NPRIV;
-	if (cycle & VME_PROG)
-		temp_ctl |= TSI148_LCSR_ITAT_PGM;
-	if (cycle & VME_DATA)
-		temp_ctl |= TSI148_LCSR_ITAT_DATA;
-
-	/* Write ctl reg without enable */
-	iowrite32be(temp_ctl, bridge->base + TSI148_LCSR_IT[i] +
-		TSI148_LCSR_OFFSET_ITAT);
-
-	if (enabled)
-		temp_ctl |= TSI148_LCSR_ITAT_EN;
-
-	iowrite32be(temp_ctl, bridge->base + TSI148_LCSR_IT[i] +
-		TSI148_LCSR_OFFSET_ITAT);
-
-	return 0;
-}
-
-/*
- * Get slave window configuration.
- */
-static int tsi148_slave_get(struct vme_slave_resource *image, int *enabled,
-	unsigned long long *vme_base, unsigned long long *size,
-	dma_addr_t *pci_base, u32 *aspace, u32 *cycle)
-{
-	unsigned int i, granularity = 0, ctl = 0;
-	unsigned int vme_base_low, vme_base_high;
-	unsigned int vme_bound_low, vme_bound_high;
-	unsigned int pci_offset_low, pci_offset_high;
-	unsigned long long vme_bound, pci_offset;
-	struct tsi148_driver *bridge;
-
-	bridge = image->parent->driver_priv;
-
-	i = image->number;
-
-	/* Read registers */
-	ctl = ioread32be(bridge->base + TSI148_LCSR_IT[i] +
-		TSI148_LCSR_OFFSET_ITAT);
-
-	vme_base_high = ioread32be(bridge->base + TSI148_LCSR_IT[i] +
-		TSI148_LCSR_OFFSET_ITSAU);
-	vme_base_low = ioread32be(bridge->base + TSI148_LCSR_IT[i] +
-		TSI148_LCSR_OFFSET_ITSAL);
-	vme_bound_high = ioread32be(bridge->base + TSI148_LCSR_IT[i] +
-		TSI148_LCSR_OFFSET_ITEAU);
-	vme_bound_low = ioread32be(bridge->base + TSI148_LCSR_IT[i] +
-		TSI148_LCSR_OFFSET_ITEAL);
-	pci_offset_high = ioread32be(bridge->base + TSI148_LCSR_IT[i] +
-		TSI148_LCSR_OFFSET_ITOFU);
-	pci_offset_low = ioread32be(bridge->base + TSI148_LCSR_IT[i] +
-		TSI148_LCSR_OFFSET_ITOFL);
-
-	/* Convert 64-bit variables to 2x 32-bit variables */
-	reg_join(vme_base_high, vme_base_low, vme_base);
-	reg_join(vme_bound_high, vme_bound_low, &vme_bound);
-	reg_join(pci_offset_high, pci_offset_low, &pci_offset);
-
-	*pci_base = (dma_addr_t)(*vme_base + pci_offset);
-
-	*enabled = 0;
-	*aspace = 0;
-	*cycle = 0;
-
-	if (ctl & TSI148_LCSR_ITAT_EN)
-		*enabled = 1;
-
-	if ((ctl & TSI148_LCSR_ITAT_AS_M) == TSI148_LCSR_ITAT_AS_A16) {
-		granularity = 0x10;
-		*aspace |= VME_A16;
-	}
-	if ((ctl & TSI148_LCSR_ITAT_AS_M) == TSI148_LCSR_ITAT_AS_A24) {
-		granularity = 0x1000;
-		*aspace |= VME_A24;
-	}
-	if ((ctl & TSI148_LCSR_ITAT_AS_M) == TSI148_LCSR_ITAT_AS_A32) {
-		granularity = 0x10000;
-		*aspace |= VME_A32;
-	}
-	if ((ctl & TSI148_LCSR_ITAT_AS_M) == TSI148_LCSR_ITAT_AS_A64) {
-		granularity = 0x10000;
-		*aspace |= VME_A64;
-	}
-
-	/* Need granularity before we set the size */
-	*size = (unsigned long long)((vme_bound - *vme_base) + granularity);
-
-
-	if ((ctl & TSI148_LCSR_ITAT_2eSSTM_M) == TSI148_LCSR_ITAT_2eSSTM_160)
-		*cycle |= VME_2eSST160;
-	if ((ctl & TSI148_LCSR_ITAT_2eSSTM_M) == TSI148_LCSR_ITAT_2eSSTM_267)
-		*cycle |= VME_2eSST267;
-	if ((ctl & TSI148_LCSR_ITAT_2eSSTM_M) == TSI148_LCSR_ITAT_2eSSTM_320)
-		*cycle |= VME_2eSST320;
-
-	if (ctl & TSI148_LCSR_ITAT_BLT)
-		*cycle |= VME_BLT;
-	if (ctl & TSI148_LCSR_ITAT_MBLT)
-		*cycle |= VME_MBLT;
-	if (ctl & TSI148_LCSR_ITAT_2eVME)
-		*cycle |= VME_2eVME;
-	if (ctl & TSI148_LCSR_ITAT_2eSST)
-		*cycle |= VME_2eSST;
-	if (ctl & TSI148_LCSR_ITAT_2eSSTB)
-		*cycle |= VME_2eSSTB;
-
-	if (ctl & TSI148_LCSR_ITAT_SUPR)
-		*cycle |= VME_SUPER;
-	if (ctl & TSI148_LCSR_ITAT_NPRIV)
-		*cycle |= VME_USER;
-	if (ctl & TSI148_LCSR_ITAT_PGM)
-		*cycle |= VME_PROG;
-	if (ctl & TSI148_LCSR_ITAT_DATA)
-		*cycle |= VME_DATA;
-
-	return 0;
-}
-
-/*
- * Allocate and map PCI Resource
- */
-static int tsi148_alloc_resource(struct vme_master_resource *image,
-	unsigned long long size)
-{
-	unsigned long long existing_size;
-	int retval = 0;
-	struct pci_dev *pdev;
-	struct vme_bridge *tsi148_bridge;
-
-	tsi148_bridge = image->parent;
-
-	pdev = to_pci_dev(tsi148_bridge->parent);
-
-	existing_size = (unsigned long long)(image->bus_resource.end -
-		image->bus_resource.start);
-
-	/* If the existing size is OK, return */
-	if ((size != 0) && (existing_size == (size - 1)))
-		return 0;
-
-	if (existing_size != 0) {
-		iounmap(image->kern_base);
-		image->kern_base = NULL;
-		kfree(image->bus_resource.name);
-		release_resource(&image->bus_resource);
-		memset(&image->bus_resource, 0, sizeof(image->bus_resource));
-	}
-
-	/* Exit here if size is zero */
-	if (size == 0)
-		return 0;
-
-	if (!image->bus_resource.name) {
-		image->bus_resource.name = kmalloc(VMENAMSIZ+3, GFP_ATOMIC);
-		if (!image->bus_resource.name) {
-			retval = -ENOMEM;
-			goto err_name;
-		}
-	}
-
-	sprintf((char *)image->bus_resource.name, "%s.%d", tsi148_bridge->name,
-		image->number);
-
-	image->bus_resource.start = 0;
-	image->bus_resource.end = (unsigned long)size;
-	image->bus_resource.flags = IORESOURCE_MEM;
-
-	retval = pci_bus_alloc_resource(pdev->bus,
-		&image->bus_resource, size, 0x10000, PCIBIOS_MIN_MEM,
-		0, NULL, NULL);
-	if (retval) {
-		dev_err(tsi148_bridge->parent, "Failed to allocate mem "
-			"resource for window %d size 0x%lx start 0x%lx\n",
-			image->number, (unsigned long)size,
-			(unsigned long)image->bus_resource.start);
-		goto err_resource;
-	}
-
-	image->kern_base = ioremap(
-		image->bus_resource.start, size);
-	if (!image->kern_base) {
-		dev_err(tsi148_bridge->parent, "Failed to remap resource\n");
-		retval = -ENOMEM;
-		goto err_remap;
-	}
-
-	return 0;
-
-err_remap:
-	release_resource(&image->bus_resource);
-err_resource:
-	kfree(image->bus_resource.name);
-	memset(&image->bus_resource, 0, sizeof(image->bus_resource));
-err_name:
-	return retval;
-}
-
-/*
- * Free and unmap PCI Resource
- */
-static void tsi148_free_resource(struct vme_master_resource *image)
-{
-	iounmap(image->kern_base);
-	image->kern_base = NULL;
-	release_resource(&image->bus_resource);
-	kfree(image->bus_resource.name);
-	memset(&image->bus_resource, 0, sizeof(image->bus_resource));
-}
-
-/*
- * Set the attributes of an outbound window.
- */
-static int tsi148_master_set(struct vme_master_resource *image, int enabled,
-	unsigned long long vme_base, unsigned long long size, u32 aspace,
-	u32 cycle, u32 dwidth)
-{
-	int retval = 0;
-	unsigned int i;
-	unsigned int temp_ctl = 0;
-	unsigned int pci_base_low, pci_base_high;
-	unsigned int pci_bound_low, pci_bound_high;
-	unsigned int vme_offset_low, vme_offset_high;
-	unsigned long long pci_bound, vme_offset, pci_base;
-	struct vme_bridge *tsi148_bridge;
-	struct tsi148_driver *bridge;
-	struct pci_bus_region region;
-	struct pci_dev *pdev;
-
-	tsi148_bridge = image->parent;
-
-	bridge = tsi148_bridge->driver_priv;
-
-	pdev = to_pci_dev(tsi148_bridge->parent);
-
-	/* Verify input data */
-	if (vme_base & 0xFFFF) {
-		dev_err(tsi148_bridge->parent, "Invalid VME Window "
-			"alignment\n");
-		retval = -EINVAL;
-		goto err_window;
-	}
-
-	if ((size == 0) && (enabled != 0)) {
-		dev_err(tsi148_bridge->parent, "Size must be non-zero for "
-			"enabled windows\n");
-		retval = -EINVAL;
-		goto err_window;
-	}
-
-	spin_lock(&image->lock);
-
-	/* Let's allocate the resource here rather than further up the stack as
-	 * it avoids pushing loads of bus dependent stuff up the stack. If size
-	 * is zero, any existing resource will be freed.
-	 */
-	retval = tsi148_alloc_resource(image, size);
-	if (retval) {
-		spin_unlock(&image->lock);
-		dev_err(tsi148_bridge->parent, "Unable to allocate memory for "
-			"resource\n");
-		goto err_res;
-	}
-
-	if (size == 0) {
-		pci_base = 0;
-		pci_bound = 0;
-		vme_offset = 0;
-	} else {
-		pcibios_resource_to_bus(pdev->bus, &region,
-					&image->bus_resource);
-		pci_base = region.start;
-
-		/*
-		 * Bound address is a valid address for the window, adjust
-		 * according to window granularity.
-		 */
-		pci_bound = pci_base + (size - 0x10000);
-		vme_offset = vme_base - pci_base;
-	}
-
-	/* Convert 64-bit variables to 2x 32-bit variables */
-	reg_split(pci_base, &pci_base_high, &pci_base_low);
-	reg_split(pci_bound, &pci_bound_high, &pci_bound_low);
-	reg_split(vme_offset, &vme_offset_high, &vme_offset_low);
-
-	if (pci_base_low & 0xFFFF) {
-		spin_unlock(&image->lock);
-		dev_err(tsi148_bridge->parent, "Invalid PCI base alignment\n");
-		retval = -EINVAL;
-		goto err_gran;
-	}
-	if (pci_bound_low & 0xFFFF) {
-		spin_unlock(&image->lock);
-		dev_err(tsi148_bridge->parent, "Invalid PCI bound alignment\n");
-		retval = -EINVAL;
-		goto err_gran;
-	}
-	if (vme_offset_low & 0xFFFF) {
-		spin_unlock(&image->lock);
-		dev_err(tsi148_bridge->parent, "Invalid VME Offset "
-			"alignment\n");
-		retval = -EINVAL;
-		goto err_gran;
-	}
-
-	i = image->number;
-
-	/* Disable while we are mucking around */
-	temp_ctl = ioread32be(bridge->base + TSI148_LCSR_OT[i] +
-		TSI148_LCSR_OFFSET_OTAT);
-	temp_ctl &= ~TSI148_LCSR_OTAT_EN;
-	iowrite32be(temp_ctl, bridge->base + TSI148_LCSR_OT[i] +
-		TSI148_LCSR_OFFSET_OTAT);
-
-	/* Setup 2eSST speeds */
-	temp_ctl &= ~TSI148_LCSR_OTAT_2eSSTM_M;
-	switch (cycle & (VME_2eSST160 | VME_2eSST267 | VME_2eSST320)) {
-	case VME_2eSST160:
-		temp_ctl |= TSI148_LCSR_OTAT_2eSSTM_160;
-		break;
-	case VME_2eSST267:
-		temp_ctl |= TSI148_LCSR_OTAT_2eSSTM_267;
-		break;
-	case VME_2eSST320:
-		temp_ctl |= TSI148_LCSR_OTAT_2eSSTM_320;
-		break;
-	}
-
-	/* Setup cycle types */
-	if (cycle & VME_BLT) {
-		temp_ctl &= ~TSI148_LCSR_OTAT_TM_M;
-		temp_ctl |= TSI148_LCSR_OTAT_TM_BLT;
-	}
-	if (cycle & VME_MBLT) {
-		temp_ctl &= ~TSI148_LCSR_OTAT_TM_M;
-		temp_ctl |= TSI148_LCSR_OTAT_TM_MBLT;
-	}
-	if (cycle & VME_2eVME) {
-		temp_ctl &= ~TSI148_LCSR_OTAT_TM_M;
-		temp_ctl |= TSI148_LCSR_OTAT_TM_2eVME;
-	}
-	if (cycle & VME_2eSST) {
-		temp_ctl &= ~TSI148_LCSR_OTAT_TM_M;
-		temp_ctl |= TSI148_LCSR_OTAT_TM_2eSST;
-	}
-	if (cycle & VME_2eSSTB) {
-		dev_warn(tsi148_bridge->parent, "Currently not setting "
-			"Broadcast Select Registers\n");
-		temp_ctl &= ~TSI148_LCSR_OTAT_TM_M;
-		temp_ctl |= TSI148_LCSR_OTAT_TM_2eSSTB;
-	}
-
-	/* Setup data width */
-	temp_ctl &= ~TSI148_LCSR_OTAT_DBW_M;
-	switch (dwidth) {
-	case VME_D16:
-		temp_ctl |= TSI148_LCSR_OTAT_DBW_16;
-		break;
-	case VME_D32:
-		temp_ctl |= TSI148_LCSR_OTAT_DBW_32;
-		break;
-	default:
-		spin_unlock(&image->lock);
-		dev_err(tsi148_bridge->parent, "Invalid data width\n");
-		retval = -EINVAL;
-		goto err_dwidth;
-	}
-
-	/* Setup address space */
-	temp_ctl &= ~TSI148_LCSR_OTAT_AMODE_M;
-	switch (aspace) {
-	case VME_A16:
-		temp_ctl |= TSI148_LCSR_OTAT_AMODE_A16;
-		break;
-	case VME_A24:
-		temp_ctl |= TSI148_LCSR_OTAT_AMODE_A24;
-		break;
-	case VME_A32:
-		temp_ctl |= TSI148_LCSR_OTAT_AMODE_A32;
-		break;
-	case VME_A64:
-		temp_ctl |= TSI148_LCSR_OTAT_AMODE_A64;
-		break;
-	case VME_CRCSR:
-		temp_ctl |= TSI148_LCSR_OTAT_AMODE_CRCSR;
-		break;
-	case VME_USER1:
-		temp_ctl |= TSI148_LCSR_OTAT_AMODE_USER1;
-		break;
-	case VME_USER2:
-		temp_ctl |= TSI148_LCSR_OTAT_AMODE_USER2;
-		break;
-	case VME_USER3:
-		temp_ctl |= TSI148_LCSR_OTAT_AMODE_USER3;
-		break;
-	case VME_USER4:
-		temp_ctl |= TSI148_LCSR_OTAT_AMODE_USER4;
-		break;
-	default:
-		spin_unlock(&image->lock);
-		dev_err(tsi148_bridge->parent, "Invalid address space\n");
-		retval = -EINVAL;
-		goto err_aspace;
-	}
-
-	temp_ctl &= ~(3<<4);
-	if (cycle & VME_SUPER)
-		temp_ctl |= TSI148_LCSR_OTAT_SUP;
-	if (cycle & VME_PROG)
-		temp_ctl |= TSI148_LCSR_OTAT_PGM;
-
-	/* Setup mapping */
-	iowrite32be(pci_base_high, bridge->base + TSI148_LCSR_OT[i] +
-		TSI148_LCSR_OFFSET_OTSAU);
-	iowrite32be(pci_base_low, bridge->base + TSI148_LCSR_OT[i] +
-		TSI148_LCSR_OFFSET_OTSAL);
-	iowrite32be(pci_bound_high, bridge->base + TSI148_LCSR_OT[i] +
-		TSI148_LCSR_OFFSET_OTEAU);
-	iowrite32be(pci_bound_low, bridge->base + TSI148_LCSR_OT[i] +
-		TSI148_LCSR_OFFSET_OTEAL);
-	iowrite32be(vme_offset_high, bridge->base + TSI148_LCSR_OT[i] +
-		TSI148_LCSR_OFFSET_OTOFU);
-	iowrite32be(vme_offset_low, bridge->base + TSI148_LCSR_OT[i] +
-		TSI148_LCSR_OFFSET_OTOFL);
-
-	/* Write ctl reg without enable */
-	iowrite32be(temp_ctl, bridge->base + TSI148_LCSR_OT[i] +
-		TSI148_LCSR_OFFSET_OTAT);
-
-	if (enabled)
-		temp_ctl |= TSI148_LCSR_OTAT_EN;
-
-	iowrite32be(temp_ctl, bridge->base + TSI148_LCSR_OT[i] +
-		TSI148_LCSR_OFFSET_OTAT);
-
-	spin_unlock(&image->lock);
-	return 0;
-
-err_aspace:
-err_dwidth:
-err_gran:
-	tsi148_free_resource(image);
-err_res:
-err_window:
-	return retval;
-
-}
-
-/*
- * Set the attributes of an outbound window.
- *
- * XXX Not parsing prefetch information.
- */
-static int __tsi148_master_get(struct vme_master_resource *image, int *enabled,
-	unsigned long long *vme_base, unsigned long long *size, u32 *aspace,
-	u32 *cycle, u32 *dwidth)
-{
-	unsigned int i, ctl;
-	unsigned int pci_base_low, pci_base_high;
-	unsigned int pci_bound_low, pci_bound_high;
-	unsigned int vme_offset_low, vme_offset_high;
-
-	unsigned long long pci_base, pci_bound, vme_offset;
-	struct tsi148_driver *bridge;
-
-	bridge = image->parent->driver_priv;
-
-	i = image->number;
-
-	ctl = ioread32be(bridge->base + TSI148_LCSR_OT[i] +
-		TSI148_LCSR_OFFSET_OTAT);
-
-	pci_base_high = ioread32be(bridge->base + TSI148_LCSR_OT[i] +
-		TSI148_LCSR_OFFSET_OTSAU);
-	pci_base_low = ioread32be(bridge->base + TSI148_LCSR_OT[i] +
-		TSI148_LCSR_OFFSET_OTSAL);
-	pci_bound_high = ioread32be(bridge->base + TSI148_LCSR_OT[i] +
-		TSI148_LCSR_OFFSET_OTEAU);
-	pci_bound_low = ioread32be(bridge->base + TSI148_LCSR_OT[i] +
-		TSI148_LCSR_OFFSET_OTEAL);
-	vme_offset_high = ioread32be(bridge->base + TSI148_LCSR_OT[i] +
-		TSI148_LCSR_OFFSET_OTOFU);
-	vme_offset_low = ioread32be(bridge->base + TSI148_LCSR_OT[i] +
-		TSI148_LCSR_OFFSET_OTOFL);
-
-	/* Convert 64-bit variables to 2x 32-bit variables */
-	reg_join(pci_base_high, pci_base_low, &pci_base);
-	reg_join(pci_bound_high, pci_bound_low, &pci_bound);
-	reg_join(vme_offset_high, vme_offset_low, &vme_offset);
-
-	*vme_base = pci_base + vme_offset;
-	*size = (unsigned long long)(pci_bound - pci_base) + 0x10000;
-
-	*enabled = 0;
-	*aspace = 0;
-	*cycle = 0;
-	*dwidth = 0;
-
-	if (ctl & TSI148_LCSR_OTAT_EN)
-		*enabled = 1;
-
-	/* Setup address space */
-	if ((ctl & TSI148_LCSR_OTAT_AMODE_M) == TSI148_LCSR_OTAT_AMODE_A16)
-		*aspace |= VME_A16;
-	if ((ctl & TSI148_LCSR_OTAT_AMODE_M) == TSI148_LCSR_OTAT_AMODE_A24)
-		*aspace |= VME_A24;
-	if ((ctl & TSI148_LCSR_OTAT_AMODE_M) == TSI148_LCSR_OTAT_AMODE_A32)
-		*aspace |= VME_A32;
-	if ((ctl & TSI148_LCSR_OTAT_AMODE_M) == TSI148_LCSR_OTAT_AMODE_A64)
-		*aspace |= VME_A64;
-	if ((ctl & TSI148_LCSR_OTAT_AMODE_M) == TSI148_LCSR_OTAT_AMODE_CRCSR)
-		*aspace |= VME_CRCSR;
-	if ((ctl & TSI148_LCSR_OTAT_AMODE_M) == TSI148_LCSR_OTAT_AMODE_USER1)
-		*aspace |= VME_USER1;
-	if ((ctl & TSI148_LCSR_OTAT_AMODE_M) == TSI148_LCSR_OTAT_AMODE_USER2)
-		*aspace |= VME_USER2;
-	if ((ctl & TSI148_LCSR_OTAT_AMODE_M) == TSI148_LCSR_OTAT_AMODE_USER3)
-		*aspace |= VME_USER3;
-	if ((ctl & TSI148_LCSR_OTAT_AMODE_M) == TSI148_LCSR_OTAT_AMODE_USER4)
-		*aspace |= VME_USER4;
-
-	/* Setup 2eSST speeds */
-	if ((ctl & TSI148_LCSR_OTAT_2eSSTM_M) == TSI148_LCSR_OTAT_2eSSTM_160)
-		*cycle |= VME_2eSST160;
-	if ((ctl & TSI148_LCSR_OTAT_2eSSTM_M) == TSI148_LCSR_OTAT_2eSSTM_267)
-		*cycle |= VME_2eSST267;
-	if ((ctl & TSI148_LCSR_OTAT_2eSSTM_M) == TSI148_LCSR_OTAT_2eSSTM_320)
-		*cycle |= VME_2eSST320;
-
-	/* Setup cycle types */
-	if ((ctl & TSI148_LCSR_OTAT_TM_M) == TSI148_LCSR_OTAT_TM_SCT)
-		*cycle |= VME_SCT;
-	if ((ctl & TSI148_LCSR_OTAT_TM_M) == TSI148_LCSR_OTAT_TM_BLT)
-		*cycle |= VME_BLT;
-	if ((ctl & TSI148_LCSR_OTAT_TM_M) == TSI148_LCSR_OTAT_TM_MBLT)
-		*cycle |= VME_MBLT;
-	if ((ctl & TSI148_LCSR_OTAT_TM_M) == TSI148_LCSR_OTAT_TM_2eVME)
-		*cycle |= VME_2eVME;
-	if ((ctl & TSI148_LCSR_OTAT_TM_M) == TSI148_LCSR_OTAT_TM_2eSST)
-		*cycle |= VME_2eSST;
-	if ((ctl & TSI148_LCSR_OTAT_TM_M) == TSI148_LCSR_OTAT_TM_2eSSTB)
-		*cycle |= VME_2eSSTB;
-
-	if (ctl & TSI148_LCSR_OTAT_SUP)
-		*cycle |= VME_SUPER;
-	else
-		*cycle |= VME_USER;
-
-	if (ctl & TSI148_LCSR_OTAT_PGM)
-		*cycle |= VME_PROG;
-	else
-		*cycle |= VME_DATA;
-
-	/* Setup data width */
-	if ((ctl & TSI148_LCSR_OTAT_DBW_M) == TSI148_LCSR_OTAT_DBW_16)
-		*dwidth = VME_D16;
-	if ((ctl & TSI148_LCSR_OTAT_DBW_M) == TSI148_LCSR_OTAT_DBW_32)
-		*dwidth = VME_D32;
-
-	return 0;
-}
-
-
-static int tsi148_master_get(struct vme_master_resource *image, int *enabled,
-	unsigned long long *vme_base, unsigned long long *size, u32 *aspace,
-	u32 *cycle, u32 *dwidth)
-{
-	int retval;
-
-	spin_lock(&image->lock);
-
-	retval = __tsi148_master_get(image, enabled, vme_base, size, aspace,
-		cycle, dwidth);
-
-	spin_unlock(&image->lock);
-
-	return retval;
-}
-
-static ssize_t tsi148_master_read(struct vme_master_resource *image, void *buf,
-	size_t count, loff_t offset)
-{
-	int retval, enabled;
-	unsigned long long vme_base, size;
-	u32 aspace, cycle, dwidth;
-	struct vme_error_handler *handler = NULL;
-	struct vme_bridge *tsi148_bridge;
-	void __iomem *addr = image->kern_base + offset;
-	unsigned int done = 0;
-	unsigned int count32;
-
-	tsi148_bridge = image->parent;
-
-	spin_lock(&image->lock);
-
-	if (err_chk) {
-		__tsi148_master_get(image, &enabled, &vme_base, &size, &aspace,
-				    &cycle, &dwidth);
-		handler = vme_register_error_handler(tsi148_bridge, aspace,
-						     vme_base + offset, count);
-		if (!handler) {
-			spin_unlock(&image->lock);
-			return -ENOMEM;
-		}
-	}
-
-	/* The following code handles VME address alignment. We cannot use
-	 * memcpy_xxx here because it may cut data transfers in to 8-bit
-	 * cycles when D16 or D32 cycles are required on the VME bus.
-	 * On the other hand, the bridge itself assures that the maximum data
-	 * cycle configured for the transfer is used and splits it
-	 * automatically for non-aligned addresses, so we don't want the
-	 * overhead of needlessly forcing small transfers for the entire cycle.
-	 */
-	if ((uintptr_t)addr & 0x1) {
-		*(u8 *)buf = ioread8(addr);
-		done += 1;
-		if (done == count)
-			goto out;
-	}
-	if ((uintptr_t)(addr + done) & 0x2) {
-		if ((count - done) < 2) {
-			*(u8 *)(buf + done) = ioread8(addr + done);
-			done += 1;
-			goto out;
-		} else {
-			*(u16 *)(buf + done) = ioread16(addr + done);
-			done += 2;
-		}
-	}
-
-	count32 = (count - done) & ~0x3;
-	while (done < count32) {
-		*(u32 *)(buf + done) = ioread32(addr + done);
-		done += 4;
-	}
-
-	if ((count - done) & 0x2) {
-		*(u16 *)(buf + done) = ioread16(addr + done);
-		done += 2;
-	}
-	if ((count - done) & 0x1) {
-		*(u8 *)(buf + done) = ioread8(addr + done);
-		done += 1;
-	}
-
-out:
-	retval = count;
-
-	if (err_chk) {
-		if (handler->num_errors) {
-			dev_err(image->parent->parent,
-				"First VME read error detected an at address 0x%llx\n",
-				handler->first_error);
-			retval = handler->first_error - (vme_base + offset);
-		}
-		vme_unregister_error_handler(handler);
-	}
-
-	spin_unlock(&image->lock);
-
-	return retval;
-}
-
-
-static ssize_t tsi148_master_write(struct vme_master_resource *image, void *buf,
-	size_t count, loff_t offset)
-{
-	int retval = 0, enabled;
-	unsigned long long vme_base, size;
-	u32 aspace, cycle, dwidth;
-	void __iomem *addr = image->kern_base + offset;
-	unsigned int done = 0;
-	unsigned int count32;
-
-	struct vme_error_handler *handler = NULL;
-	struct vme_bridge *tsi148_bridge;
-	struct tsi148_driver *bridge;
-
-	tsi148_bridge = image->parent;
-
-	bridge = tsi148_bridge->driver_priv;
-
-	spin_lock(&image->lock);
-
-	if (err_chk) {
-		__tsi148_master_get(image, &enabled, &vme_base, &size, &aspace,
-				    &cycle, &dwidth);
-		handler = vme_register_error_handler(tsi148_bridge, aspace,
-						     vme_base + offset, count);
-		if (!handler) {
-			spin_unlock(&image->lock);
-			return -ENOMEM;
-		}
-	}
-
-	/* Here we apply for the same strategy we do in master_read
-	 * function in order to assure the correct cycles.
-	 */
-	if ((uintptr_t)addr & 0x1) {
-		iowrite8(*(u8 *)buf, addr);
-		done += 1;
-		if (done == count)
-			goto out;
-	}
-	if ((uintptr_t)(addr + done) & 0x2) {
-		if ((count - done) < 2) {
-			iowrite8(*(u8 *)(buf + done), addr + done);
-			done += 1;
-			goto out;
-		} else {
-			iowrite16(*(u16 *)(buf + done), addr + done);
-			done += 2;
-		}
-	}
-
-	count32 = (count - done) & ~0x3;
-	while (done < count32) {
-		iowrite32(*(u32 *)(buf + done), addr + done);
-		done += 4;
-	}
-
-	if ((count - done) & 0x2) {
-		iowrite16(*(u16 *)(buf + done), addr + done);
-		done += 2;
-	}
-	if ((count - done) & 0x1) {
-		iowrite8(*(u8 *)(buf + done), addr + done);
-		done += 1;
-	}
-
-out:
-	retval = count;
-
-	/*
-	 * Writes are posted. We need to do a read on the VME bus to flush out
-	 * all of the writes before we check for errors. We can't guarantee
-	 * that reading the data we have just written is safe. It is believed
-	 * that there isn't any read, write re-ordering, so we can read any
-	 * location in VME space, so lets read the Device ID from the tsi148's
-	 * own registers as mapped into CR/CSR space.
-	 *
-	 * We check for saved errors in the written address range/space.
-	 */
-
-	if (err_chk) {
-		ioread16(bridge->flush_image->kern_base + 0x7F000);
-
-		if (handler->num_errors) {
-			dev_warn(tsi148_bridge->parent,
-				 "First VME write error detected an at address 0x%llx\n",
-				 handler->first_error);
-			retval = handler->first_error - (vme_base + offset);
-		}
-		vme_unregister_error_handler(handler);
-	}
-
-	spin_unlock(&image->lock);
-
-	return retval;
-}
-
-/*
- * Perform an RMW cycle on the VME bus.
- *
- * Requires a previously configured master window, returns final value.
- */
-static unsigned int tsi148_master_rmw(struct vme_master_resource *image,
-	unsigned int mask, unsigned int compare, unsigned int swap,
-	loff_t offset)
-{
-	unsigned long long pci_addr;
-	unsigned int pci_addr_high, pci_addr_low;
-	u32 tmp, result;
-	int i;
-	struct tsi148_driver *bridge;
-
-	bridge = image->parent->driver_priv;
-
-	/* Find the PCI address that maps to the desired VME address */
-	i = image->number;
-
-	/* Locking as we can only do one of these at a time */
-	mutex_lock(&bridge->vme_rmw);
-
-	/* Lock image */
-	spin_lock(&image->lock);
-
-	pci_addr_high = ioread32be(bridge->base + TSI148_LCSR_OT[i] +
-		TSI148_LCSR_OFFSET_OTSAU);
-	pci_addr_low = ioread32be(bridge->base + TSI148_LCSR_OT[i] +
-		TSI148_LCSR_OFFSET_OTSAL);
-
-	reg_join(pci_addr_high, pci_addr_low, &pci_addr);
-	reg_split(pci_addr + offset, &pci_addr_high, &pci_addr_low);
-
-	/* Configure registers */
-	iowrite32be(mask, bridge->base + TSI148_LCSR_RMWEN);
-	iowrite32be(compare, bridge->base + TSI148_LCSR_RMWC);
-	iowrite32be(swap, bridge->base + TSI148_LCSR_RMWS);
-	iowrite32be(pci_addr_high, bridge->base + TSI148_LCSR_RMWAU);
-	iowrite32be(pci_addr_low, bridge->base + TSI148_LCSR_RMWAL);
-
-	/* Enable RMW */
-	tmp = ioread32be(bridge->base + TSI148_LCSR_VMCTRL);
-	tmp |= TSI148_LCSR_VMCTRL_RMWEN;
-	iowrite32be(tmp, bridge->base + TSI148_LCSR_VMCTRL);
-
-	/* Kick process off with a read to the required address. */
-	result = ioread32be(image->kern_base + offset);
-
-	/* Disable RMW */
-	tmp = ioread32be(bridge->base + TSI148_LCSR_VMCTRL);
-	tmp &= ~TSI148_LCSR_VMCTRL_RMWEN;
-	iowrite32be(tmp, bridge->base + TSI148_LCSR_VMCTRL);
-
-	spin_unlock(&image->lock);
-
-	mutex_unlock(&bridge->vme_rmw);
-
-	return result;
-}
-
-static int tsi148_dma_set_vme_src_attributes(struct device *dev, __be32 *attr,
-	u32 aspace, u32 cycle, u32 dwidth)
-{
-	u32 val;
-
-	val = be32_to_cpu(*attr);
-
-	/* Setup 2eSST speeds */
-	switch (cycle & (VME_2eSST160 | VME_2eSST267 | VME_2eSST320)) {
-	case VME_2eSST160:
-		val |= TSI148_LCSR_DSAT_2eSSTM_160;
-		break;
-	case VME_2eSST267:
-		val |= TSI148_LCSR_DSAT_2eSSTM_267;
-		break;
-	case VME_2eSST320:
-		val |= TSI148_LCSR_DSAT_2eSSTM_320;
-		break;
-	}
-
-	/* Setup cycle types */
-	if (cycle & VME_SCT)
-		val |= TSI148_LCSR_DSAT_TM_SCT;
-
-	if (cycle & VME_BLT)
-		val |= TSI148_LCSR_DSAT_TM_BLT;
-
-	if (cycle & VME_MBLT)
-		val |= TSI148_LCSR_DSAT_TM_MBLT;
-
-	if (cycle & VME_2eVME)
-		val |= TSI148_LCSR_DSAT_TM_2eVME;
-
-	if (cycle & VME_2eSST)
-		val |= TSI148_LCSR_DSAT_TM_2eSST;
-
-	if (cycle & VME_2eSSTB) {
-		dev_err(dev, "Currently not setting Broadcast Select "
-			"Registers\n");
-		val |= TSI148_LCSR_DSAT_TM_2eSSTB;
-	}
-
-	/* Setup data width */
-	switch (dwidth) {
-	case VME_D16:
-		val |= TSI148_LCSR_DSAT_DBW_16;
-		break;
-	case VME_D32:
-		val |= TSI148_LCSR_DSAT_DBW_32;
-		break;
-	default:
-		dev_err(dev, "Invalid data width\n");
-		return -EINVAL;
-	}
-
-	/* Setup address space */
-	switch (aspace) {
-	case VME_A16:
-		val |= TSI148_LCSR_DSAT_AMODE_A16;
-		break;
-	case VME_A24:
-		val |= TSI148_LCSR_DSAT_AMODE_A24;
-		break;
-	case VME_A32:
-		val |= TSI148_LCSR_DSAT_AMODE_A32;
-		break;
-	case VME_A64:
-		val |= TSI148_LCSR_DSAT_AMODE_A64;
-		break;
-	case VME_CRCSR:
-		val |= TSI148_LCSR_DSAT_AMODE_CRCSR;
-		break;
-	case VME_USER1:
-		val |= TSI148_LCSR_DSAT_AMODE_USER1;
-		break;
-	case VME_USER2:
-		val |= TSI148_LCSR_DSAT_AMODE_USER2;
-		break;
-	case VME_USER3:
-		val |= TSI148_LCSR_DSAT_AMODE_USER3;
-		break;
-	case VME_USER4:
-		val |= TSI148_LCSR_DSAT_AMODE_USER4;
-		break;
-	default:
-		dev_err(dev, "Invalid address space\n");
-		return -EINVAL;
-	}
-
-	if (cycle & VME_SUPER)
-		val |= TSI148_LCSR_DSAT_SUP;
-	if (cycle & VME_PROG)
-		val |= TSI148_LCSR_DSAT_PGM;
-
-	*attr = cpu_to_be32(val);
-
-	return 0;
-}
-
-static int tsi148_dma_set_vme_dest_attributes(struct device *dev, __be32 *attr,
-	u32 aspace, u32 cycle, u32 dwidth)
-{
-	u32 val;
-
-	val = be32_to_cpu(*attr);
-
-	/* Setup 2eSST speeds */
-	switch (cycle & (VME_2eSST160 | VME_2eSST267 | VME_2eSST320)) {
-	case VME_2eSST160:
-		val |= TSI148_LCSR_DDAT_2eSSTM_160;
-		break;
-	case VME_2eSST267:
-		val |= TSI148_LCSR_DDAT_2eSSTM_267;
-		break;
-	case VME_2eSST320:
-		val |= TSI148_LCSR_DDAT_2eSSTM_320;
-		break;
-	}
-
-	/* Setup cycle types */
-	if (cycle & VME_SCT)
-		val |= TSI148_LCSR_DDAT_TM_SCT;
-
-	if (cycle & VME_BLT)
-		val |= TSI148_LCSR_DDAT_TM_BLT;
-
-	if (cycle & VME_MBLT)
-		val |= TSI148_LCSR_DDAT_TM_MBLT;
-
-	if (cycle & VME_2eVME)
-		val |= TSI148_LCSR_DDAT_TM_2eVME;
-
-	if (cycle & VME_2eSST)
-		val |= TSI148_LCSR_DDAT_TM_2eSST;
-
-	if (cycle & VME_2eSSTB) {
-		dev_err(dev, "Currently not setting Broadcast Select "
-			"Registers\n");
-		val |= TSI148_LCSR_DDAT_TM_2eSSTB;
-	}
-
-	/* Setup data width */
-	switch (dwidth) {
-	case VME_D16:
-		val |= TSI148_LCSR_DDAT_DBW_16;
-		break;
-	case VME_D32:
-		val |= TSI148_LCSR_DDAT_DBW_32;
-		break;
-	default:
-		dev_err(dev, "Invalid data width\n");
-		return -EINVAL;
-	}
-
-	/* Setup address space */
-	switch (aspace) {
-	case VME_A16:
-		val |= TSI148_LCSR_DDAT_AMODE_A16;
-		break;
-	case VME_A24:
-		val |= TSI148_LCSR_DDAT_AMODE_A24;
-		break;
-	case VME_A32:
-		val |= TSI148_LCSR_DDAT_AMODE_A32;
-		break;
-	case VME_A64:
-		val |= TSI148_LCSR_DDAT_AMODE_A64;
-		break;
-	case VME_CRCSR:
-		val |= TSI148_LCSR_DDAT_AMODE_CRCSR;
-		break;
-	case VME_USER1:
-		val |= TSI148_LCSR_DDAT_AMODE_USER1;
-		break;
-	case VME_USER2:
-		val |= TSI148_LCSR_DDAT_AMODE_USER2;
-		break;
-	case VME_USER3:
-		val |= TSI148_LCSR_DDAT_AMODE_USER3;
-		break;
-	case VME_USER4:
-		val |= TSI148_LCSR_DDAT_AMODE_USER4;
-		break;
-	default:
-		dev_err(dev, "Invalid address space\n");
-		return -EINVAL;
-	}
-
-	if (cycle & VME_SUPER)
-		val |= TSI148_LCSR_DDAT_SUP;
-	if (cycle & VME_PROG)
-		val |= TSI148_LCSR_DDAT_PGM;
-
-	*attr = cpu_to_be32(val);
-
-	return 0;
-}
-
-/*
- * Add a link list descriptor to the list
- *
- * Note: DMA engine expects the DMA descriptor to be big endian.
- */
-static int tsi148_dma_list_add(struct vme_dma_list *list,
-	struct vme_dma_attr *src, struct vme_dma_attr *dest, size_t count)
-{
-	struct tsi148_dma_entry *entry, *prev;
-	u32 address_high, address_low, val;
-	struct vme_dma_pattern *pattern_attr;
-	struct vme_dma_pci *pci_attr;
-	struct vme_dma_vme *vme_attr;
-	int retval = 0;
-	struct vme_bridge *tsi148_bridge;
-
-	tsi148_bridge = list->parent->parent;
-
-	/* Descriptor must be aligned on 64-bit boundaries */
-	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
-	if (!entry) {
-		retval = -ENOMEM;
-		goto err_mem;
-	}
-
-	/* Test descriptor alignment */
-	if ((unsigned long)&entry->descriptor & 0x7) {
-		dev_err(tsi148_bridge->parent, "Descriptor not aligned to 8 "
-			"byte boundary as required: %p\n",
-			&entry->descriptor);
-		retval = -EINVAL;
-		goto err_align;
-	}
-
-	/* Given we are going to fill out the structure, we probably don't
-	 * need to zero it, but better safe than sorry for now.
-	 */
-	memset(&entry->descriptor, 0, sizeof(entry->descriptor));
-
-	/* Fill out source part */
-	switch (src->type) {
-	case VME_DMA_PATTERN:
-		pattern_attr = src->private;
-
-		entry->descriptor.dsal = cpu_to_be32(pattern_attr->pattern);
-
-		val = TSI148_LCSR_DSAT_TYP_PAT;
-
-		/* Default behaviour is 32 bit pattern */
-		if (pattern_attr->type & VME_DMA_PATTERN_BYTE)
-			val |= TSI148_LCSR_DSAT_PSZ;
-
-		/* It seems that the default behaviour is to increment */
-		if ((pattern_attr->type & VME_DMA_PATTERN_INCREMENT) == 0)
-			val |= TSI148_LCSR_DSAT_NIN;
-		entry->descriptor.dsat = cpu_to_be32(val);
-		break;
-	case VME_DMA_PCI:
-		pci_attr = src->private;
-
-		reg_split((unsigned long long)pci_attr->address, &address_high,
-			&address_low);
-		entry->descriptor.dsau = cpu_to_be32(address_high);
-		entry->descriptor.dsal = cpu_to_be32(address_low);
-		entry->descriptor.dsat = cpu_to_be32(TSI148_LCSR_DSAT_TYP_PCI);
-		break;
-	case VME_DMA_VME:
-		vme_attr = src->private;
-
-		reg_split((unsigned long long)vme_attr->address, &address_high,
-			&address_low);
-		entry->descriptor.dsau = cpu_to_be32(address_high);
-		entry->descriptor.dsal = cpu_to_be32(address_low);
-		entry->descriptor.dsat = cpu_to_be32(TSI148_LCSR_DSAT_TYP_VME);
-
-		retval = tsi148_dma_set_vme_src_attributes(
-			tsi148_bridge->parent, &entry->descriptor.dsat,
-			vme_attr->aspace, vme_attr->cycle, vme_attr->dwidth);
-		if (retval < 0)
-			goto err_source;
-		break;
-	default:
-		dev_err(tsi148_bridge->parent, "Invalid source type\n");
-		retval = -EINVAL;
-		goto err_source;
-	}
-
-	/* Assume last link - this will be over-written by adding another */
-	entry->descriptor.dnlau = cpu_to_be32(0);
-	entry->descriptor.dnlal = cpu_to_be32(TSI148_LCSR_DNLAL_LLA);
-
-	/* Fill out destination part */
-	switch (dest->type) {
-	case VME_DMA_PCI:
-		pci_attr = dest->private;
-
-		reg_split((unsigned long long)pci_attr->address, &address_high,
-			&address_low);
-		entry->descriptor.ddau = cpu_to_be32(address_high);
-		entry->descriptor.ddal = cpu_to_be32(address_low);
-		entry->descriptor.ddat = cpu_to_be32(TSI148_LCSR_DDAT_TYP_PCI);
-		break;
-	case VME_DMA_VME:
-		vme_attr = dest->private;
-
-		reg_split((unsigned long long)vme_attr->address, &address_high,
-			&address_low);
-		entry->descriptor.ddau = cpu_to_be32(address_high);
-		entry->descriptor.ddal = cpu_to_be32(address_low);
-		entry->descriptor.ddat = cpu_to_be32(TSI148_LCSR_DDAT_TYP_VME);
-
-		retval = tsi148_dma_set_vme_dest_attributes(
-			tsi148_bridge->parent, &entry->descriptor.ddat,
-			vme_attr->aspace, vme_attr->cycle, vme_attr->dwidth);
-		if (retval < 0)
-			goto err_dest;
-		break;
-	default:
-		dev_err(tsi148_bridge->parent, "Invalid destination type\n");
-		retval = -EINVAL;
-		goto err_dest;
-	}
-
-	/* Fill out count */
-	entry->descriptor.dcnt = cpu_to_be32((u32)count);
-
-	/* Add to list */
-	list_add_tail(&entry->list, &list->entries);
-
-	entry->dma_handle = dma_map_single(tsi148_bridge->parent,
-					   &entry->descriptor,
-					   sizeof(entry->descriptor),
-					   DMA_TO_DEVICE);
-	if (dma_mapping_error(tsi148_bridge->parent, entry->dma_handle)) {
-		dev_err(tsi148_bridge->parent, "DMA mapping error\n");
-		retval = -EINVAL;
-		goto err_dma;
-	}
-
-	/* Fill out previous descriptors "Next Address" */
-	if (entry->list.prev != &list->entries) {
-		reg_split((unsigned long long)entry->dma_handle, &address_high,
-			&address_low);
-		prev = list_entry(entry->list.prev, struct tsi148_dma_entry,
-				  list);
-		prev->descriptor.dnlau = cpu_to_be32(address_high);
-		prev->descriptor.dnlal = cpu_to_be32(address_low);
-
-	}
-
-	return 0;
-
-err_dma:
-err_dest:
-err_source:
-err_align:
-		kfree(entry);
-err_mem:
-	return retval;
-}
-
-/*
- * Check to see if the provided DMA channel is busy.
- */
-static int tsi148_dma_busy(struct vme_bridge *tsi148_bridge, int channel)
-{
-	u32 tmp;
-	struct tsi148_driver *bridge;
-
-	bridge = tsi148_bridge->driver_priv;
-
-	tmp = ioread32be(bridge->base + TSI148_LCSR_DMA[channel] +
-		TSI148_LCSR_OFFSET_DSTA);
-
-	if (tmp & TSI148_LCSR_DSTA_BSY)
-		return 0;
-	else
-		return 1;
-
-}
-
-/*
- * Execute a previously generated link list
- *
- * XXX Need to provide control register configuration.
- */
-static int tsi148_dma_list_exec(struct vme_dma_list *list)
-{
-	struct vme_dma_resource *ctrlr;
-	int channel, retval;
-	struct tsi148_dma_entry *entry;
-	u32 bus_addr_high, bus_addr_low;
-	u32 val, dctlreg = 0;
-	struct vme_bridge *tsi148_bridge;
-	struct tsi148_driver *bridge;
-
-	ctrlr = list->parent;
-
-	tsi148_bridge = ctrlr->parent;
-
-	bridge = tsi148_bridge->driver_priv;
-
-	mutex_lock(&ctrlr->mtx);
-
-	channel = ctrlr->number;
-
-	if (!list_empty(&ctrlr->running)) {
-		/*
-		 * XXX We have an active DMA transfer and currently haven't
-		 *     sorted out the mechanism for "pending" DMA transfers.
-		 *     Return busy.
-		 */
-		/* Need to add to pending here */
-		mutex_unlock(&ctrlr->mtx);
-		return -EBUSY;
-	} else {
-		list_add(&list->list, &ctrlr->running);
-	}
-
-	/* Get first bus address and write into registers */
-	entry = list_first_entry(&list->entries, struct tsi148_dma_entry,
-		list);
-
-	mutex_unlock(&ctrlr->mtx);
-
-	reg_split(entry->dma_handle, &bus_addr_high, &bus_addr_low);
-
-	iowrite32be(bus_addr_high, bridge->base +
-		TSI148_LCSR_DMA[channel] + TSI148_LCSR_OFFSET_DNLAU);
-	iowrite32be(bus_addr_low, bridge->base +
-		TSI148_LCSR_DMA[channel] + TSI148_LCSR_OFFSET_DNLAL);
-
-	dctlreg = ioread32be(bridge->base + TSI148_LCSR_DMA[channel] +
-		TSI148_LCSR_OFFSET_DCTL);
-
-	/* Start the operation */
-	iowrite32be(dctlreg | TSI148_LCSR_DCTL_DGO, bridge->base +
-		TSI148_LCSR_DMA[channel] + TSI148_LCSR_OFFSET_DCTL);
-
-	retval = wait_event_interruptible(bridge->dma_queue[channel],
-		tsi148_dma_busy(ctrlr->parent, channel));
-
-	if (retval) {
-		iowrite32be(dctlreg | TSI148_LCSR_DCTL_ABT, bridge->base +
-			TSI148_LCSR_DMA[channel] + TSI148_LCSR_OFFSET_DCTL);
-		/* Wait for the operation to abort */
-		wait_event(bridge->dma_queue[channel],
-			   tsi148_dma_busy(ctrlr->parent, channel));
-		retval = -EINTR;
-		goto exit;
-	}
-
-	/*
-	 * Read status register, this register is valid until we kick off a
-	 * new transfer.
-	 */
-	val = ioread32be(bridge->base + TSI148_LCSR_DMA[channel] +
-		TSI148_LCSR_OFFSET_DSTA);
-
-	if (val & TSI148_LCSR_DSTA_VBE) {
-		dev_err(tsi148_bridge->parent, "DMA Error. DSTA=%08X\n", val);
-		retval = -EIO;
-	}
-
-exit:
-	/* Remove list from running list */
-	mutex_lock(&ctrlr->mtx);
-	list_del(&list->list);
-	mutex_unlock(&ctrlr->mtx);
-
-	return retval;
-}
-
-/*
- * Clean up a previously generated link list
- *
- * We have a separate function, don't assume that the chain can't be reused.
- */
-static int tsi148_dma_list_empty(struct vme_dma_list *list)
-{
-	struct list_head *pos, *temp;
-	struct tsi148_dma_entry *entry;
-
-	struct vme_bridge *tsi148_bridge = list->parent->parent;
-
-	/* detach and free each entry */
-	list_for_each_safe(pos, temp, &list->entries) {
-		list_del(pos);
-		entry = list_entry(pos, struct tsi148_dma_entry, list);
-
-		dma_unmap_single(tsi148_bridge->parent, entry->dma_handle,
-			sizeof(struct tsi148_dma_descriptor), DMA_TO_DEVICE);
-		kfree(entry);
-	}
-
-	return 0;
-}
-
-/*
- * All 4 location monitors reside at the same base - this is therefore a
- * system wide configuration.
- *
- * This does not enable the LM monitor - that should be done when the first
- * callback is attached and disabled when the last callback is removed.
- */
-static int tsi148_lm_set(struct vme_lm_resource *lm, unsigned long long lm_base,
-	u32 aspace, u32 cycle)
-{
-	u32 lm_base_high, lm_base_low, lm_ctl = 0;
-	int i;
-	struct vme_bridge *tsi148_bridge;
-	struct tsi148_driver *bridge;
-
-	tsi148_bridge = lm->parent;
-
-	bridge = tsi148_bridge->driver_priv;
-
-	mutex_lock(&lm->mtx);
-
-	/* If we already have a callback attached, we can't move it! */
-	for (i = 0; i < lm->monitors; i++) {
-		if (bridge->lm_callback[i]) {
-			mutex_unlock(&lm->mtx);
-			dev_err(tsi148_bridge->parent, "Location monitor "
-				"callback attached, can't reset\n");
-			return -EBUSY;
-		}
-	}
-
-	switch (aspace) {
-	case VME_A16:
-		lm_ctl |= TSI148_LCSR_LMAT_AS_A16;
-		break;
-	case VME_A24:
-		lm_ctl |= TSI148_LCSR_LMAT_AS_A24;
-		break;
-	case VME_A32:
-		lm_ctl |= TSI148_LCSR_LMAT_AS_A32;
-		break;
-	case VME_A64:
-		lm_ctl |= TSI148_LCSR_LMAT_AS_A64;
-		break;
-	default:
-		mutex_unlock(&lm->mtx);
-		dev_err(tsi148_bridge->parent, "Invalid address space\n");
-		return -EINVAL;
-	}
-
-	if (cycle & VME_SUPER)
-		lm_ctl |= TSI148_LCSR_LMAT_SUPR ;
-	if (cycle & VME_USER)
-		lm_ctl |= TSI148_LCSR_LMAT_NPRIV;
-	if (cycle & VME_PROG)
-		lm_ctl |= TSI148_LCSR_LMAT_PGM;
-	if (cycle & VME_DATA)
-		lm_ctl |= TSI148_LCSR_LMAT_DATA;
-
-	reg_split(lm_base, &lm_base_high, &lm_base_low);
-
-	iowrite32be(lm_base_high, bridge->base + TSI148_LCSR_LMBAU);
-	iowrite32be(lm_base_low, bridge->base + TSI148_LCSR_LMBAL);
-	iowrite32be(lm_ctl, bridge->base + TSI148_LCSR_LMAT);
-
-	mutex_unlock(&lm->mtx);
-
-	return 0;
-}
-
-/* Get configuration of the callback monitor and return whether it is enabled
- * or disabled.
- */
-static int tsi148_lm_get(struct vme_lm_resource *lm,
-	unsigned long long *lm_base, u32 *aspace, u32 *cycle)
-{
-	u32 lm_base_high, lm_base_low, lm_ctl, enabled = 0;
-	struct tsi148_driver *bridge;
-
-	bridge = lm->parent->driver_priv;
-
-	mutex_lock(&lm->mtx);
-
-	lm_base_high = ioread32be(bridge->base + TSI148_LCSR_LMBAU);
-	lm_base_low = ioread32be(bridge->base + TSI148_LCSR_LMBAL);
-	lm_ctl = ioread32be(bridge->base + TSI148_LCSR_LMAT);
-
-	reg_join(lm_base_high, lm_base_low, lm_base);
-
-	if (lm_ctl & TSI148_LCSR_LMAT_EN)
-		enabled = 1;
-
-	if ((lm_ctl & TSI148_LCSR_LMAT_AS_M) == TSI148_LCSR_LMAT_AS_A16)
-		*aspace |= VME_A16;
-
-	if ((lm_ctl & TSI148_LCSR_LMAT_AS_M) == TSI148_LCSR_LMAT_AS_A24)
-		*aspace |= VME_A24;
-
-	if ((lm_ctl & TSI148_LCSR_LMAT_AS_M) == TSI148_LCSR_LMAT_AS_A32)
-		*aspace |= VME_A32;
-
-	if ((lm_ctl & TSI148_LCSR_LMAT_AS_M) == TSI148_LCSR_LMAT_AS_A64)
-		*aspace |= VME_A64;
-
-
-	if (lm_ctl & TSI148_LCSR_LMAT_SUPR)
-		*cycle |= VME_SUPER;
-	if (lm_ctl & TSI148_LCSR_LMAT_NPRIV)
-		*cycle |= VME_USER;
-	if (lm_ctl & TSI148_LCSR_LMAT_PGM)
-		*cycle |= VME_PROG;
-	if (lm_ctl & TSI148_LCSR_LMAT_DATA)
-		*cycle |= VME_DATA;
-
-	mutex_unlock(&lm->mtx);
-
-	return enabled;
-}
-
-/*
- * Attach a callback to a specific location monitor.
- *
- * Callback will be passed the monitor triggered.
- */
-static int tsi148_lm_attach(struct vme_lm_resource *lm, int monitor,
-	void (*callback)(void *), void *data)
-{
-	u32 lm_ctl, tmp;
-	struct vme_bridge *tsi148_bridge;
-	struct tsi148_driver *bridge;
-
-	tsi148_bridge = lm->parent;
-
-	bridge = tsi148_bridge->driver_priv;
-
-	mutex_lock(&lm->mtx);
-
-	/* Ensure that the location monitor is configured - need PGM or DATA */
-	lm_ctl = ioread32be(bridge->base + TSI148_LCSR_LMAT);
-	if ((lm_ctl & (TSI148_LCSR_LMAT_PGM | TSI148_LCSR_LMAT_DATA)) == 0) {
-		mutex_unlock(&lm->mtx);
-		dev_err(tsi148_bridge->parent, "Location monitor not properly "
-			"configured\n");
-		return -EINVAL;
-	}
-
-	/* Check that a callback isn't already attached */
-	if (bridge->lm_callback[monitor]) {
-		mutex_unlock(&lm->mtx);
-		dev_err(tsi148_bridge->parent, "Existing callback attached\n");
-		return -EBUSY;
-	}
-
-	/* Attach callback */
-	bridge->lm_callback[monitor] = callback;
-	bridge->lm_data[monitor] = data;
-
-	/* Enable Location Monitor interrupt */
-	tmp = ioread32be(bridge->base + TSI148_LCSR_INTEN);
-	tmp |= TSI148_LCSR_INTEN_LMEN[monitor];
-	iowrite32be(tmp, bridge->base + TSI148_LCSR_INTEN);
-
-	tmp = ioread32be(bridge->base + TSI148_LCSR_INTEO);
-	tmp |= TSI148_LCSR_INTEO_LMEO[monitor];
-	iowrite32be(tmp, bridge->base + TSI148_LCSR_INTEO);
-
-	/* Ensure that global Location Monitor Enable set */
-	if ((lm_ctl & TSI148_LCSR_LMAT_EN) == 0) {
-		lm_ctl |= TSI148_LCSR_LMAT_EN;
-		iowrite32be(lm_ctl, bridge->base + TSI148_LCSR_LMAT);
-	}
-
-	mutex_unlock(&lm->mtx);
-
-	return 0;
-}
-
-/*
- * Detach a callback function forn a specific location monitor.
- */
-static int tsi148_lm_detach(struct vme_lm_resource *lm, int monitor)
-{
-	u32 lm_en, tmp;
-	struct tsi148_driver *bridge;
-
-	bridge = lm->parent->driver_priv;
-
-	mutex_lock(&lm->mtx);
-
-	/* Disable Location Monitor and ensure previous interrupts are clear */
-	lm_en = ioread32be(bridge->base + TSI148_LCSR_INTEN);
-	lm_en &= ~TSI148_LCSR_INTEN_LMEN[monitor];
-	iowrite32be(lm_en, bridge->base + TSI148_LCSR_INTEN);
-
-	tmp = ioread32be(bridge->base + TSI148_LCSR_INTEO);
-	tmp &= ~TSI148_LCSR_INTEO_LMEO[monitor];
-	iowrite32be(tmp, bridge->base + TSI148_LCSR_INTEO);
-
-	iowrite32be(TSI148_LCSR_INTC_LMC[monitor],
-		 bridge->base + TSI148_LCSR_INTC);
-
-	/* Detach callback */
-	bridge->lm_callback[monitor] = NULL;
-	bridge->lm_data[monitor] = NULL;
-
-	/* If all location monitors disabled, disable global Location Monitor */
-	if ((lm_en & (TSI148_LCSR_INTS_LM0S | TSI148_LCSR_INTS_LM1S |
-			TSI148_LCSR_INTS_LM2S | TSI148_LCSR_INTS_LM3S)) == 0) {
-		tmp = ioread32be(bridge->base + TSI148_LCSR_LMAT);
-		tmp &= ~TSI148_LCSR_LMAT_EN;
-		iowrite32be(tmp, bridge->base + TSI148_LCSR_LMAT);
-	}
-
-	mutex_unlock(&lm->mtx);
-
-	return 0;
-}
-
-/*
- * Determine Geographical Addressing
- */
-static int tsi148_slot_get(struct vme_bridge *tsi148_bridge)
-{
-	u32 slot = 0;
-	struct tsi148_driver *bridge;
-
-	bridge = tsi148_bridge->driver_priv;
-
-	if (!geoid) {
-		slot = ioread32be(bridge->base + TSI148_LCSR_VSTAT);
-		slot = slot & TSI148_LCSR_VSTAT_GA_M;
-	} else
-		slot = geoid;
-
-	return (int)slot;
-}
-
-static void *tsi148_alloc_consistent(struct device *parent, size_t size,
-	dma_addr_t *dma)
-{
-	struct pci_dev *pdev;
-
-	/* Find pci_dev container of dev */
-	pdev = to_pci_dev(parent);
-
-	return dma_alloc_coherent(&pdev->dev, size, dma, GFP_KERNEL);
-}
-
-static void tsi148_free_consistent(struct device *parent, size_t size,
-	void *vaddr, dma_addr_t dma)
-{
-	struct pci_dev *pdev;
-
-	/* Find pci_dev container of dev */
-	pdev = to_pci_dev(parent);
-
-	dma_free_coherent(&pdev->dev, size, vaddr, dma);
-}
-
-/*
- * Configure CR/CSR space
- *
- * Access to the CR/CSR can be configured at power-up. The location of the
- * CR/CSR registers in the CR/CSR address space is determined by the boards
- * Auto-ID or Geographic address. This function ensures that the window is
- * enabled at an offset consistent with the boards geopgraphic address.
- *
- * Each board has a 512kB window, with the highest 4kB being used for the
- * boards registers, this means there is a fix length 508kB window which must
- * be mapped onto PCI memory.
- */
-static int tsi148_crcsr_init(struct vme_bridge *tsi148_bridge,
-	struct pci_dev *pdev)
-{
-	u32 cbar, crat, vstat;
-	u32 crcsr_bus_high, crcsr_bus_low;
-	int retval;
-	struct tsi148_driver *bridge;
-
-	bridge = tsi148_bridge->driver_priv;
-
-	/* Allocate mem for CR/CSR image */
-	bridge->crcsr_kernel = dma_alloc_coherent(&pdev->dev,
-						  VME_CRCSR_BUF_SIZE,
-						  &bridge->crcsr_bus, GFP_KERNEL);
-	if (!bridge->crcsr_kernel) {
-		dev_err(tsi148_bridge->parent, "Failed to allocate memory for "
-			"CR/CSR image\n");
-		return -ENOMEM;
-	}
-
-	reg_split(bridge->crcsr_bus, &crcsr_bus_high, &crcsr_bus_low);
-
-	iowrite32be(crcsr_bus_high, bridge->base + TSI148_LCSR_CROU);
-	iowrite32be(crcsr_bus_low, bridge->base + TSI148_LCSR_CROL);
-
-	/* Ensure that the CR/CSR is configured at the correct offset */
-	cbar = ioread32be(bridge->base + TSI148_CBAR);
-	cbar = (cbar & TSI148_CRCSR_CBAR_M)>>3;
-
-	vstat = tsi148_slot_get(tsi148_bridge);
-
-	if (cbar != vstat) {
-		cbar = vstat;
-		dev_info(tsi148_bridge->parent, "Setting CR/CSR offset\n");
-		iowrite32be(cbar<<3, bridge->base + TSI148_CBAR);
-	}
-	dev_info(tsi148_bridge->parent, "CR/CSR Offset: %d\n", cbar);
-
-	crat = ioread32be(bridge->base + TSI148_LCSR_CRAT);
-	if (crat & TSI148_LCSR_CRAT_EN)
-		dev_info(tsi148_bridge->parent, "CR/CSR already enabled\n");
-	else {
-		dev_info(tsi148_bridge->parent, "Enabling CR/CSR space\n");
-		iowrite32be(crat | TSI148_LCSR_CRAT_EN,
-			bridge->base + TSI148_LCSR_CRAT);
-	}
-
-	/* If we want flushed, error-checked writes, set up a window
-	 * over the CR/CSR registers. We read from here to safely flush
-	 * through VME writes.
-	 */
-	if (err_chk) {
-		retval = tsi148_master_set(bridge->flush_image, 1,
-			(vstat * 0x80000), 0x80000, VME_CRCSR, VME_SCT,
-			VME_D16);
-		if (retval)
-			dev_err(tsi148_bridge->parent, "Configuring flush image"
-				" failed\n");
-	}
-
-	return 0;
-
-}
-
-static void tsi148_crcsr_exit(struct vme_bridge *tsi148_bridge,
-	struct pci_dev *pdev)
-{
-	u32 crat;
-	struct tsi148_driver *bridge;
-
-	bridge = tsi148_bridge->driver_priv;
-
-	/* Turn off CR/CSR space */
-	crat = ioread32be(bridge->base + TSI148_LCSR_CRAT);
-	iowrite32be(crat & ~TSI148_LCSR_CRAT_EN,
-		bridge->base + TSI148_LCSR_CRAT);
-
-	/* Free image */
-	iowrite32be(0, bridge->base + TSI148_LCSR_CROU);
-	iowrite32be(0, bridge->base + TSI148_LCSR_CROL);
-
-	dma_free_coherent(&pdev->dev, VME_CRCSR_BUF_SIZE,
-			  bridge->crcsr_kernel, bridge->crcsr_bus);
-}
-
-static int tsi148_probe(struct pci_dev *pdev, const struct pci_device_id *id)
-{
-	int retval, i, master_num;
-	u32 data;
-	struct list_head *pos = NULL, *n;
-	struct vme_bridge *tsi148_bridge;
-	struct tsi148_driver *tsi148_device;
-	struct vme_master_resource *master_image;
-	struct vme_slave_resource *slave_image;
-	struct vme_dma_resource *dma_ctrlr;
-	struct vme_lm_resource *lm;
-
-	/* If we want to support more than one of each bridge, we need to
-	 * dynamically generate this so we get one per device
-	 */
-	tsi148_bridge = kzalloc(sizeof(*tsi148_bridge), GFP_KERNEL);
-	if (!tsi148_bridge) {
-		retval = -ENOMEM;
-		goto err_struct;
-	}
-	vme_init_bridge(tsi148_bridge);
-
-	tsi148_device = kzalloc(sizeof(*tsi148_device), GFP_KERNEL);
-	if (!tsi148_device) {
-		retval = -ENOMEM;
-		goto err_driver;
-	}
-
-	tsi148_bridge->driver_priv = tsi148_device;
-
-	/* Enable the device */
-	retval = pci_enable_device(pdev);
-	if (retval) {
-		dev_err(&pdev->dev, "Unable to enable device\n");
-		goto err_enable;
-	}
-
-	/* Map Registers */
-	retval = pci_request_regions(pdev, driver_name);
-	if (retval) {
-		dev_err(&pdev->dev, "Unable to reserve resources\n");
-		goto err_resource;
-	}
-
-	/* map registers in BAR 0 */
-	tsi148_device->base = ioremap(pci_resource_start(pdev, 0),
-		4096);
-	if (!tsi148_device->base) {
-		dev_err(&pdev->dev, "Unable to remap CRG region\n");
-		retval = -EIO;
-		goto err_remap;
-	}
-
-	/* Check to see if the mapping worked out */
-	data = ioread32(tsi148_device->base + TSI148_PCFS_ID) & 0x0000FFFF;
-	if (data != PCI_VENDOR_ID_TUNDRA) {
-		dev_err(&pdev->dev, "CRG region check failed\n");
-		retval = -EIO;
-		goto err_test;
-	}
-
-	/* Initialize wait queues & mutual exclusion flags */
-	init_waitqueue_head(&tsi148_device->dma_queue[0]);
-	init_waitqueue_head(&tsi148_device->dma_queue[1]);
-	init_waitqueue_head(&tsi148_device->iack_queue);
-	mutex_init(&tsi148_device->vme_int);
-	mutex_init(&tsi148_device->vme_rmw);
-
-	tsi148_bridge->parent = &pdev->dev;
-	strcpy(tsi148_bridge->name, driver_name);
-
-	/* Setup IRQ */
-	retval = tsi148_irq_init(tsi148_bridge);
-	if (retval != 0) {
-		dev_err(&pdev->dev, "Chip Initialization failed.\n");
-		goto err_irq;
-	}
-
-	/* If we are going to flush writes, we need to read from the VME bus.
-	 * We need to do this safely, thus we read the devices own CR/CSR
-	 * register. To do this we must set up a window in CR/CSR space and
-	 * hence have one less master window resource available.
-	 */
-	master_num = TSI148_MAX_MASTER;
-	if (err_chk) {
-		master_num--;
-
-		tsi148_device->flush_image =
-			kmalloc(sizeof(*tsi148_device->flush_image),
-				GFP_KERNEL);
-		if (!tsi148_device->flush_image) {
-			retval = -ENOMEM;
-			goto err_master;
-		}
-		tsi148_device->flush_image->parent = tsi148_bridge;
-		spin_lock_init(&tsi148_device->flush_image->lock);
-		tsi148_device->flush_image->locked = 1;
-		tsi148_device->flush_image->number = master_num;
-		memset(&tsi148_device->flush_image->bus_resource, 0,
-		       sizeof(tsi148_device->flush_image->bus_resource));
-		tsi148_device->flush_image->kern_base  = NULL;
-	}
-
-	/* Add master windows to list */
-	for (i = 0; i < master_num; i++) {
-		master_image = kmalloc(sizeof(*master_image), GFP_KERNEL);
-		if (!master_image) {
-			retval = -ENOMEM;
-			goto err_master;
-		}
-		master_image->parent = tsi148_bridge;
-		spin_lock_init(&master_image->lock);
-		master_image->locked = 0;
-		master_image->number = i;
-		master_image->address_attr = VME_A16 | VME_A24 | VME_A32 |
-			VME_A64 | VME_CRCSR | VME_USER1 | VME_USER2 |
-			VME_USER3 | VME_USER4;
-		master_image->cycle_attr = VME_SCT | VME_BLT | VME_MBLT |
-			VME_2eVME | VME_2eSST | VME_2eSSTB | VME_2eSST160 |
-			VME_2eSST267 | VME_2eSST320 | VME_SUPER | VME_USER |
-			VME_PROG | VME_DATA;
-		master_image->width_attr = VME_D16 | VME_D32;
-		memset(&master_image->bus_resource, 0,
-		       sizeof(master_image->bus_resource));
-		master_image->kern_base  = NULL;
-		list_add_tail(&master_image->list,
-			&tsi148_bridge->master_resources);
-	}
-
-	/* Add slave windows to list */
-	for (i = 0; i < TSI148_MAX_SLAVE; i++) {
-		slave_image = kmalloc(sizeof(*slave_image), GFP_KERNEL);
-		if (!slave_image) {
-			retval = -ENOMEM;
-			goto err_slave;
-		}
-		slave_image->parent = tsi148_bridge;
-		mutex_init(&slave_image->mtx);
-		slave_image->locked = 0;
-		slave_image->number = i;
-		slave_image->address_attr = VME_A16 | VME_A24 | VME_A32 |
-			VME_A64;
-		slave_image->cycle_attr = VME_SCT | VME_BLT | VME_MBLT |
-			VME_2eVME | VME_2eSST | VME_2eSSTB | VME_2eSST160 |
-			VME_2eSST267 | VME_2eSST320 | VME_SUPER | VME_USER |
-			VME_PROG | VME_DATA;
-		list_add_tail(&slave_image->list,
-			&tsi148_bridge->slave_resources);
-	}
-
-	/* Add dma engines to list */
-	for (i = 0; i < TSI148_MAX_DMA; i++) {
-		dma_ctrlr = kmalloc(sizeof(*dma_ctrlr), GFP_KERNEL);
-		if (!dma_ctrlr) {
-			retval = -ENOMEM;
-			goto err_dma;
-		}
-		dma_ctrlr->parent = tsi148_bridge;
-		mutex_init(&dma_ctrlr->mtx);
-		dma_ctrlr->locked = 0;
-		dma_ctrlr->number = i;
-		dma_ctrlr->route_attr = VME_DMA_VME_TO_MEM |
-			VME_DMA_MEM_TO_VME | VME_DMA_VME_TO_VME |
-			VME_DMA_MEM_TO_MEM | VME_DMA_PATTERN_TO_VME |
-			VME_DMA_PATTERN_TO_MEM;
-		INIT_LIST_HEAD(&dma_ctrlr->pending);
-		INIT_LIST_HEAD(&dma_ctrlr->running);
-		list_add_tail(&dma_ctrlr->list,
-			&tsi148_bridge->dma_resources);
-	}
-
-	/* Add location monitor to list */
-	lm = kmalloc(sizeof(*lm), GFP_KERNEL);
-	if (!lm) {
-		retval = -ENOMEM;
-		goto err_lm;
-	}
-	lm->parent = tsi148_bridge;
-	mutex_init(&lm->mtx);
-	lm->locked = 0;
-	lm->number = 1;
-	lm->monitors = 4;
-	list_add_tail(&lm->list, &tsi148_bridge->lm_resources);
-
-	tsi148_bridge->slave_get = tsi148_slave_get;
-	tsi148_bridge->slave_set = tsi148_slave_set;
-	tsi148_bridge->master_get = tsi148_master_get;
-	tsi148_bridge->master_set = tsi148_master_set;
-	tsi148_bridge->master_read = tsi148_master_read;
-	tsi148_bridge->master_write = tsi148_master_write;
-	tsi148_bridge->master_rmw = tsi148_master_rmw;
-	tsi148_bridge->dma_list_add = tsi148_dma_list_add;
-	tsi148_bridge->dma_list_exec = tsi148_dma_list_exec;
-	tsi148_bridge->dma_list_empty = tsi148_dma_list_empty;
-	tsi148_bridge->irq_set = tsi148_irq_set;
-	tsi148_bridge->irq_generate = tsi148_irq_generate;
-	tsi148_bridge->lm_set = tsi148_lm_set;
-	tsi148_bridge->lm_get = tsi148_lm_get;
-	tsi148_bridge->lm_attach = tsi148_lm_attach;
-	tsi148_bridge->lm_detach = tsi148_lm_detach;
-	tsi148_bridge->slot_get = tsi148_slot_get;
-	tsi148_bridge->alloc_consistent = tsi148_alloc_consistent;
-	tsi148_bridge->free_consistent = tsi148_free_consistent;
-
-	data = ioread32be(tsi148_device->base + TSI148_LCSR_VSTAT);
-	dev_info(&pdev->dev, "Board is%s the VME system controller\n",
-		(data & TSI148_LCSR_VSTAT_SCONS) ? "" : " not");
-	if (!geoid)
-		dev_info(&pdev->dev, "VME geographical address is %d\n",
-			data & TSI148_LCSR_VSTAT_GA_M);
-	else
-		dev_info(&pdev->dev, "VME geographical address is set to %d\n",
-			geoid);
-
-	dev_info(&pdev->dev, "VME Write and flush and error check is %s\n",
-		err_chk ? "enabled" : "disabled");
-
-	retval = tsi148_crcsr_init(tsi148_bridge, pdev);
-	if (retval) {
-		dev_err(&pdev->dev, "CR/CSR configuration failed.\n");
-		goto err_crcsr;
-	}
-
-	retval = vme_register_bridge(tsi148_bridge);
-	if (retval != 0) {
-		dev_err(&pdev->dev, "Chip Registration failed.\n");
-		goto err_reg;
-	}
-
-	pci_set_drvdata(pdev, tsi148_bridge);
-
-	/* Clear VME bus "board fail", and "power-up reset" lines */
-	data = ioread32be(tsi148_device->base + TSI148_LCSR_VSTAT);
-	data &= ~TSI148_LCSR_VSTAT_BRDFL;
-	data |= TSI148_LCSR_VSTAT_CPURST;
-	iowrite32be(data, tsi148_device->base + TSI148_LCSR_VSTAT);
-
-	return 0;
-
-err_reg:
-	tsi148_crcsr_exit(tsi148_bridge, pdev);
-err_crcsr:
-err_lm:
-	/* resources are stored in link list */
-	list_for_each_safe(pos, n, &tsi148_bridge->lm_resources) {
-		lm = list_entry(pos, struct vme_lm_resource, list);
-		list_del(pos);
-		kfree(lm);
-	}
-err_dma:
-	/* resources are stored in link list */
-	list_for_each_safe(pos, n, &tsi148_bridge->dma_resources) {
-		dma_ctrlr = list_entry(pos, struct vme_dma_resource, list);
-		list_del(pos);
-		kfree(dma_ctrlr);
-	}
-err_slave:
-	/* resources are stored in link list */
-	list_for_each_safe(pos, n, &tsi148_bridge->slave_resources) {
-		slave_image = list_entry(pos, struct vme_slave_resource, list);
-		list_del(pos);
-		kfree(slave_image);
-	}
-err_master:
-	/* resources are stored in link list */
-	list_for_each_safe(pos, n, &tsi148_bridge->master_resources) {
-		master_image = list_entry(pos, struct vme_master_resource,
-			list);
-		list_del(pos);
-		kfree(master_image);
-	}
-
-	tsi148_irq_exit(tsi148_bridge, pdev);
-err_irq:
-err_test:
-	iounmap(tsi148_device->base);
-err_remap:
-	pci_release_regions(pdev);
-err_resource:
-	pci_disable_device(pdev);
-err_enable:
-	kfree(tsi148_device);
-err_driver:
-	kfree(tsi148_bridge);
-err_struct:
-	return retval;
-
-}
-
-static void tsi148_remove(struct pci_dev *pdev)
-{
-	struct list_head *pos = NULL;
-	struct list_head *tmplist;
-	struct vme_master_resource *master_image;
-	struct vme_slave_resource *slave_image;
-	struct vme_dma_resource *dma_ctrlr;
-	int i;
-	struct tsi148_driver *bridge;
-	struct vme_bridge *tsi148_bridge = pci_get_drvdata(pdev);
-
-	bridge = tsi148_bridge->driver_priv;
-
-
-	dev_dbg(&pdev->dev, "Driver is being unloaded.\n");
-
-	/*
-	 *  Shutdown all inbound and outbound windows.
-	 */
-	for (i = 0; i < 8; i++) {
-		iowrite32be(0, bridge->base + TSI148_LCSR_IT[i] +
-			TSI148_LCSR_OFFSET_ITAT);
-		iowrite32be(0, bridge->base + TSI148_LCSR_OT[i] +
-			TSI148_LCSR_OFFSET_OTAT);
-	}
-
-	/*
-	 *  Shutdown Location monitor.
-	 */
-	iowrite32be(0, bridge->base + TSI148_LCSR_LMAT);
-
-	/*
-	 *  Shutdown CRG map.
-	 */
-	iowrite32be(0, bridge->base + TSI148_LCSR_CSRAT);
-
-	/*
-	 *  Clear error status.
-	 */
-	iowrite32be(0xFFFFFFFF, bridge->base + TSI148_LCSR_EDPAT);
-	iowrite32be(0xFFFFFFFF, bridge->base + TSI148_LCSR_VEAT);
-	iowrite32be(0x07000700, bridge->base + TSI148_LCSR_PSTAT);
-
-	/*
-	 *  Remove VIRQ interrupt (if any)
-	 */
-	if (ioread32be(bridge->base + TSI148_LCSR_VICR) & 0x800)
-		iowrite32be(0x8000, bridge->base + TSI148_LCSR_VICR);
-
-	/*
-	 *  Map all Interrupts to PCI INTA
-	 */
-	iowrite32be(0x0, bridge->base + TSI148_LCSR_INTM1);
-	iowrite32be(0x0, bridge->base + TSI148_LCSR_INTM2);
-
-	tsi148_irq_exit(tsi148_bridge, pdev);
-
-	vme_unregister_bridge(tsi148_bridge);
-
-	tsi148_crcsr_exit(tsi148_bridge, pdev);
-
-	/* resources are stored in link list */
-	list_for_each_safe(pos, tmplist, &tsi148_bridge->dma_resources) {
-		dma_ctrlr = list_entry(pos, struct vme_dma_resource, list);
-		list_del(pos);
-		kfree(dma_ctrlr);
-	}
-
-	/* resources are stored in link list */
-	list_for_each_safe(pos, tmplist, &tsi148_bridge->slave_resources) {
-		slave_image = list_entry(pos, struct vme_slave_resource, list);
-		list_del(pos);
-		kfree(slave_image);
-	}
-
-	/* resources are stored in link list */
-	list_for_each_safe(pos, tmplist, &tsi148_bridge->master_resources) {
-		master_image = list_entry(pos, struct vme_master_resource,
-			list);
-		list_del(pos);
-		kfree(master_image);
-	}
-
-	iounmap(bridge->base);
-
-	pci_release_regions(pdev);
-
-	pci_disable_device(pdev);
-
-	kfree(tsi148_bridge->driver_priv);
-
-	kfree(tsi148_bridge);
-}
-
-module_pci_driver(tsi148_driver);
-
-MODULE_PARM_DESC(err_chk, "Check for VME errors on reads and writes");
-module_param(err_chk, bool, 0);
-
-MODULE_PARM_DESC(geoid, "Override geographical addressing");
-module_param(geoid, int, 0);
-
-MODULE_DESCRIPTION("VME driver for the Tundra Tempe VME bridge");
-MODULE_LICENSE("GPL");
diff --git a/drivers/vme/bridges/vme_tsi148.h b/drivers/vme/bridges/vme_tsi148.h
deleted file mode 100644
index 226fedc6f167..000000000000
--- a/drivers/vme/bridges/vme_tsi148.h
+++ /dev/null
@@ -1,1407 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * tsi148.h
- *
- * Support for the Tundra TSI148 VME Bridge chip
- *
- * Author: Tom Armistead
- * Updated and maintained by Ajit Prem
- * Copyright 2004 Motorola Inc.
- */
-
-#ifndef TSI148_H
-#define TSI148_H
-
-#ifndef	PCI_VENDOR_ID_TUNDRA
-#define	PCI_VENDOR_ID_TUNDRA 0x10e3
-#endif
-
-#ifndef	PCI_DEVICE_ID_TUNDRA_TSI148
-#define	PCI_DEVICE_ID_TUNDRA_TSI148 0x148
-#endif
-
-/*
- *  Define the number of each that the Tsi148 supports.
- */
-#define TSI148_MAX_MASTER		8	/* Max Master Windows */
-#define TSI148_MAX_SLAVE		8	/* Max Slave Windows */
-#define TSI148_MAX_DMA			2	/* Max DMA Controllers */
-#define TSI148_MAX_MAILBOX		4	/* Max Mail Box registers */
-#define TSI148_MAX_SEMAPHORE		8	/* Max Semaphores */
-
-/* Structure used to hold driver specific information */
-struct tsi148_driver {
-	void __iomem *base;	/* Base Address of device registers */
-	wait_queue_head_t dma_queue[2];
-	wait_queue_head_t iack_queue;
-	void (*lm_callback[4])(void *);	/* Called in interrupt handler */
-	void *lm_data[4];
-	void *crcsr_kernel;
-	dma_addr_t crcsr_bus;
-	struct vme_master_resource *flush_image;
-	struct mutex vme_rmw;		/* Only one RMW cycle at a time */
-	struct mutex vme_int;		/*
-					 * Only one VME interrupt can be
-					 * generated at a time, provide locking
-					 */
-};
-
-/*
- * Layout of a DMAC Linked-List Descriptor
- *
- * Note: This structure is accessed via the chip and therefore must be
- *       correctly laid out - It must also be aligned on 64-bit boundaries.
- */
-struct tsi148_dma_descriptor {
-	__be32 dsau;      /* Source Address */
-	__be32 dsal;
-	__be32 ddau;      /* Destination Address */
-	__be32 ddal;
-	__be32 dsat;      /* Source attributes */
-	__be32 ddat;      /* Destination attributes */
-	__be32 dnlau;     /* Next link address */
-	__be32 dnlal;
-	__be32 dcnt;      /* Byte count */
-	__be32 ddbs;      /* 2eSST Broadcast select */
-};
-
-struct tsi148_dma_entry {
-	/*
-	 * The descriptor needs to be aligned on a 64-bit boundary, we increase
-	 * the chance of this by putting it first in the structure.
-	 */
-	struct tsi148_dma_descriptor descriptor;
-	struct list_head list;
-	dma_addr_t dma_handle;
-};
-
-/*
- *  TSI148 ASIC register structure overlays and bit field definitions.
- *
- *      Note:   Tsi148 Register Group (CRG) consists of the following
- *              combination of registers:
- *                      PCFS    - PCI Configuration Space Registers
- *                      LCSR    - Local Control and Status Registers
- *                      GCSR    - Global Control and Status Registers
- *                      CR/CSR  - Subset of Configuration ROM /
- *                                Control and Status Registers
- */
-
-
-/*
- *  Command/Status Registers (CRG + $004)
- */
-#define TSI148_PCFS_ID			0x0
-#define TSI148_PCFS_CSR			0x4
-#define TSI148_PCFS_CLASS		0x8
-#define TSI148_PCFS_MISC0		0xC
-#define TSI148_PCFS_MBARL		0x10
-#define TSI148_PCFS_MBARU		0x14
-
-#define TSI148_PCFS_SUBID		0x28
-
-#define TSI148_PCFS_CAPP		0x34
-
-#define TSI148_PCFS_MISC1		0x3C
-
-#define TSI148_PCFS_XCAPP		0x40
-#define TSI148_PCFS_XSTAT		0x44
-
-/*
- * LCSR definitions
- */
-
-/*
- *    Outbound Translations
- */
-#define TSI148_LCSR_OT0_OTSAU		0x100
-#define TSI148_LCSR_OT0_OTSAL		0x104
-#define TSI148_LCSR_OT0_OTEAU		0x108
-#define TSI148_LCSR_OT0_OTEAL		0x10C
-#define TSI148_LCSR_OT0_OTOFU		0x110
-#define TSI148_LCSR_OT0_OTOFL		0x114
-#define TSI148_LCSR_OT0_OTBS		0x118
-#define TSI148_LCSR_OT0_OTAT		0x11C
-
-#define TSI148_LCSR_OT1_OTSAU		0x120
-#define TSI148_LCSR_OT1_OTSAL		0x124
-#define TSI148_LCSR_OT1_OTEAU		0x128
-#define TSI148_LCSR_OT1_OTEAL		0x12C
-#define TSI148_LCSR_OT1_OTOFU		0x130
-#define TSI148_LCSR_OT1_OTOFL		0x134
-#define TSI148_LCSR_OT1_OTBS		0x138
-#define TSI148_LCSR_OT1_OTAT		0x13C
-
-#define TSI148_LCSR_OT2_OTSAU		0x140
-#define TSI148_LCSR_OT2_OTSAL		0x144
-#define TSI148_LCSR_OT2_OTEAU		0x148
-#define TSI148_LCSR_OT2_OTEAL		0x14C
-#define TSI148_LCSR_OT2_OTOFU		0x150
-#define TSI148_LCSR_OT2_OTOFL		0x154
-#define TSI148_LCSR_OT2_OTBS		0x158
-#define TSI148_LCSR_OT2_OTAT		0x15C
-
-#define TSI148_LCSR_OT3_OTSAU		0x160
-#define TSI148_LCSR_OT3_OTSAL		0x164
-#define TSI148_LCSR_OT3_OTEAU		0x168
-#define TSI148_LCSR_OT3_OTEAL		0x16C
-#define TSI148_LCSR_OT3_OTOFU		0x170
-#define TSI148_LCSR_OT3_OTOFL		0x174
-#define TSI148_LCSR_OT3_OTBS		0x178
-#define TSI148_LCSR_OT3_OTAT		0x17C
-
-#define TSI148_LCSR_OT4_OTSAU		0x180
-#define TSI148_LCSR_OT4_OTSAL		0x184
-#define TSI148_LCSR_OT4_OTEAU		0x188
-#define TSI148_LCSR_OT4_OTEAL		0x18C
-#define TSI148_LCSR_OT4_OTOFU		0x190
-#define TSI148_LCSR_OT4_OTOFL		0x194
-#define TSI148_LCSR_OT4_OTBS		0x198
-#define TSI148_LCSR_OT4_OTAT		0x19C
-
-#define TSI148_LCSR_OT5_OTSAU		0x1A0
-#define TSI148_LCSR_OT5_OTSAL		0x1A4
-#define TSI148_LCSR_OT5_OTEAU		0x1A8
-#define TSI148_LCSR_OT5_OTEAL		0x1AC
-#define TSI148_LCSR_OT5_OTOFU		0x1B0
-#define TSI148_LCSR_OT5_OTOFL		0x1B4
-#define TSI148_LCSR_OT5_OTBS		0x1B8
-#define TSI148_LCSR_OT5_OTAT		0x1BC
-
-#define TSI148_LCSR_OT6_OTSAU		0x1C0
-#define TSI148_LCSR_OT6_OTSAL		0x1C4
-#define TSI148_LCSR_OT6_OTEAU		0x1C8
-#define TSI148_LCSR_OT6_OTEAL		0x1CC
-#define TSI148_LCSR_OT6_OTOFU		0x1D0
-#define TSI148_LCSR_OT6_OTOFL		0x1D4
-#define TSI148_LCSR_OT6_OTBS		0x1D8
-#define TSI148_LCSR_OT6_OTAT		0x1DC
-
-#define TSI148_LCSR_OT7_OTSAU		0x1E0
-#define TSI148_LCSR_OT7_OTSAL		0x1E4
-#define TSI148_LCSR_OT7_OTEAU		0x1E8
-#define TSI148_LCSR_OT7_OTEAL		0x1EC
-#define TSI148_LCSR_OT7_OTOFU		0x1F0
-#define TSI148_LCSR_OT7_OTOFL		0x1F4
-#define TSI148_LCSR_OT7_OTBS		0x1F8
-#define TSI148_LCSR_OT7_OTAT		0x1FC
-
-#define TSI148_LCSR_OT0		0x100
-#define TSI148_LCSR_OT1		0x120
-#define TSI148_LCSR_OT2		0x140
-#define TSI148_LCSR_OT3		0x160
-#define TSI148_LCSR_OT4		0x180
-#define TSI148_LCSR_OT5		0x1A0
-#define TSI148_LCSR_OT6		0x1C0
-#define TSI148_LCSR_OT7		0x1E0
-
-static const int TSI148_LCSR_OT[8] = { TSI148_LCSR_OT0, TSI148_LCSR_OT1,
-					 TSI148_LCSR_OT2, TSI148_LCSR_OT3,
-					 TSI148_LCSR_OT4, TSI148_LCSR_OT5,
-					 TSI148_LCSR_OT6, TSI148_LCSR_OT7 };
-
-#define TSI148_LCSR_OFFSET_OTSAU	0x0
-#define TSI148_LCSR_OFFSET_OTSAL	0x4
-#define TSI148_LCSR_OFFSET_OTEAU	0x8
-#define TSI148_LCSR_OFFSET_OTEAL	0xC
-#define TSI148_LCSR_OFFSET_OTOFU	0x10
-#define TSI148_LCSR_OFFSET_OTOFL	0x14
-#define TSI148_LCSR_OFFSET_OTBS		0x18
-#define TSI148_LCSR_OFFSET_OTAT		0x1C
-
-/*
- * VMEbus interrupt ack
- * offset  200
- */
-#define TSI148_LCSR_VIACK1	0x204
-#define TSI148_LCSR_VIACK2	0x208
-#define TSI148_LCSR_VIACK3	0x20C
-#define TSI148_LCSR_VIACK4	0x210
-#define TSI148_LCSR_VIACK5	0x214
-#define TSI148_LCSR_VIACK6	0x218
-#define TSI148_LCSR_VIACK7	0x21C
-
-static const int TSI148_LCSR_VIACK[8] = { 0, TSI148_LCSR_VIACK1,
-				TSI148_LCSR_VIACK2, TSI148_LCSR_VIACK3,
-				TSI148_LCSR_VIACK4, TSI148_LCSR_VIACK5,
-				TSI148_LCSR_VIACK6, TSI148_LCSR_VIACK7 };
-
-/*
- * RMW
- * offset    220
- */
-#define TSI148_LCSR_RMWAU	0x220
-#define TSI148_LCSR_RMWAL	0x224
-#define TSI148_LCSR_RMWEN	0x228
-#define TSI148_LCSR_RMWC	0x22C
-#define TSI148_LCSR_RMWS	0x230
-
-/*
- * VMEbus control
- * offset    234
- */
-#define TSI148_LCSR_VMCTRL	0x234
-#define TSI148_LCSR_VCTRL	0x238
-#define TSI148_LCSR_VSTAT	0x23C
-
-/*
- * PCI status
- * offset  240
- */
-#define TSI148_LCSR_PSTAT	0x240
-
-/*
- * VME filter.
- * offset  250
- */
-#define TSI148_LCSR_VMEFL	0x250
-
-	/*
-	 * VME exception.
-	 * offset  260
- */
-#define TSI148_LCSR_VEAU	0x260
-#define TSI148_LCSR_VEAL	0x264
-#define TSI148_LCSR_VEAT	0x268
-
-	/*
-	 * PCI error
-	 * offset  270
-	 */
-#define TSI148_LCSR_EDPAU	0x270
-#define TSI148_LCSR_EDPAL	0x274
-#define TSI148_LCSR_EDPXA	0x278
-#define TSI148_LCSR_EDPXS	0x27C
-#define TSI148_LCSR_EDPAT	0x280
-
-	/*
-	 * Inbound Translations
-	 * offset  300
-	 */
-#define TSI148_LCSR_IT0_ITSAU		0x300
-#define TSI148_LCSR_IT0_ITSAL		0x304
-#define TSI148_LCSR_IT0_ITEAU		0x308
-#define TSI148_LCSR_IT0_ITEAL		0x30C
-#define TSI148_LCSR_IT0_ITOFU		0x310
-#define TSI148_LCSR_IT0_ITOFL		0x314
-#define TSI148_LCSR_IT0_ITAT		0x318
-
-#define TSI148_LCSR_IT1_ITSAU		0x320
-#define TSI148_LCSR_IT1_ITSAL		0x324
-#define TSI148_LCSR_IT1_ITEAU		0x328
-#define TSI148_LCSR_IT1_ITEAL		0x32C
-#define TSI148_LCSR_IT1_ITOFU		0x330
-#define TSI148_LCSR_IT1_ITOFL		0x334
-#define TSI148_LCSR_IT1_ITAT		0x338
-
-#define TSI148_LCSR_IT2_ITSAU		0x340
-#define TSI148_LCSR_IT2_ITSAL		0x344
-#define TSI148_LCSR_IT2_ITEAU		0x348
-#define TSI148_LCSR_IT2_ITEAL		0x34C
-#define TSI148_LCSR_IT2_ITOFU		0x350
-#define TSI148_LCSR_IT2_ITOFL		0x354
-#define TSI148_LCSR_IT2_ITAT		0x358
-
-#define TSI148_LCSR_IT3_ITSAU		0x360
-#define TSI148_LCSR_IT3_ITSAL		0x364
-#define TSI148_LCSR_IT3_ITEAU		0x368
-#define TSI148_LCSR_IT3_ITEAL		0x36C
-#define TSI148_LCSR_IT3_ITOFU		0x370
-#define TSI148_LCSR_IT3_ITOFL		0x374
-#define TSI148_LCSR_IT3_ITAT		0x378
-
-#define TSI148_LCSR_IT4_ITSAU		0x380
-#define TSI148_LCSR_IT4_ITSAL		0x384
-#define TSI148_LCSR_IT4_ITEAU		0x388
-#define TSI148_LCSR_IT4_ITEAL		0x38C
-#define TSI148_LCSR_IT4_ITOFU		0x390
-#define TSI148_LCSR_IT4_ITOFL		0x394
-#define TSI148_LCSR_IT4_ITAT		0x398
-
-#define TSI148_LCSR_IT5_ITSAU		0x3A0
-#define TSI148_LCSR_IT5_ITSAL		0x3A4
-#define TSI148_LCSR_IT5_ITEAU		0x3A8
-#define TSI148_LCSR_IT5_ITEAL		0x3AC
-#define TSI148_LCSR_IT5_ITOFU		0x3B0
-#define TSI148_LCSR_IT5_ITOFL		0x3B4
-#define TSI148_LCSR_IT5_ITAT		0x3B8
-
-#define TSI148_LCSR_IT6_ITSAU		0x3C0
-#define TSI148_LCSR_IT6_ITSAL		0x3C4
-#define TSI148_LCSR_IT6_ITEAU		0x3C8
-#define TSI148_LCSR_IT6_ITEAL		0x3CC
-#define TSI148_LCSR_IT6_ITOFU		0x3D0
-#define TSI148_LCSR_IT6_ITOFL		0x3D4
-#define TSI148_LCSR_IT6_ITAT		0x3D8
-
-#define TSI148_LCSR_IT7_ITSAU		0x3E0
-#define TSI148_LCSR_IT7_ITSAL		0x3E4
-#define TSI148_LCSR_IT7_ITEAU		0x3E8
-#define TSI148_LCSR_IT7_ITEAL		0x3EC
-#define TSI148_LCSR_IT7_ITOFU		0x3F0
-#define TSI148_LCSR_IT7_ITOFL		0x3F4
-#define TSI148_LCSR_IT7_ITAT		0x3F8
-
-
-#define TSI148_LCSR_IT0		0x300
-#define TSI148_LCSR_IT1		0x320
-#define TSI148_LCSR_IT2		0x340
-#define TSI148_LCSR_IT3		0x360
-#define TSI148_LCSR_IT4		0x380
-#define TSI148_LCSR_IT5		0x3A0
-#define TSI148_LCSR_IT6		0x3C0
-#define TSI148_LCSR_IT7		0x3E0
-
-static const int TSI148_LCSR_IT[8] = { TSI148_LCSR_IT0, TSI148_LCSR_IT1,
-					 TSI148_LCSR_IT2, TSI148_LCSR_IT3,
-					 TSI148_LCSR_IT4, TSI148_LCSR_IT5,
-					 TSI148_LCSR_IT6, TSI148_LCSR_IT7 };
-
-#define TSI148_LCSR_OFFSET_ITSAU	0x0
-#define TSI148_LCSR_OFFSET_ITSAL	0x4
-#define TSI148_LCSR_OFFSET_ITEAU	0x8
-#define TSI148_LCSR_OFFSET_ITEAL	0xC
-#define TSI148_LCSR_OFFSET_ITOFU	0x10
-#define TSI148_LCSR_OFFSET_ITOFL	0x14
-#define TSI148_LCSR_OFFSET_ITAT		0x18
-
-	/*
-	 * Inbound Translation GCSR
-	 * offset  400
-	 */
-#define TSI148_LCSR_GBAU	0x400
-#define TSI148_LCSR_GBAL	0x404
-#define TSI148_LCSR_GCSRAT	0x408
-
-	/*
-	 * Inbound Translation CRG
-	 * offset  40C
-	 */
-#define TSI148_LCSR_CBAU	0x40C
-#define TSI148_LCSR_CBAL	0x410
-#define TSI148_LCSR_CSRAT	0x414
-
-	/*
-	 * Inbound Translation CR/CSR
-	 *         CRG
-	 * offset  418
-	 */
-#define TSI148_LCSR_CROU	0x418
-#define TSI148_LCSR_CROL	0x41C
-#define TSI148_LCSR_CRAT	0x420
-
-	/*
-	 * Inbound Translation Location Monitor
-	 * offset  424
-	 */
-#define TSI148_LCSR_LMBAU	0x424
-#define TSI148_LCSR_LMBAL	0x428
-#define TSI148_LCSR_LMAT	0x42C
-
-	/*
-	 * VMEbus Interrupt Control.
-	 * offset  430
-	 */
-#define TSI148_LCSR_BCU		0x430
-#define TSI148_LCSR_BCL		0x434
-#define TSI148_LCSR_BPGTR	0x438
-#define TSI148_LCSR_BPCTR	0x43C
-#define TSI148_LCSR_VICR	0x440
-
-	/*
-	 * Local Bus Interrupt Control.
-	 * offset  448
-	 */
-#define TSI148_LCSR_INTEN	0x448
-#define TSI148_LCSR_INTEO	0x44C
-#define TSI148_LCSR_INTS	0x450
-#define TSI148_LCSR_INTC	0x454
-#define TSI148_LCSR_INTM1	0x458
-#define TSI148_LCSR_INTM2	0x45C
-
-	/*
-	 * DMA Controllers
-	 * offset 500
-	 */
-#define TSI148_LCSR_DCTL0	0x500
-#define TSI148_LCSR_DSTA0	0x504
-#define TSI148_LCSR_DCSAU0	0x508
-#define TSI148_LCSR_DCSAL0	0x50C
-#define TSI148_LCSR_DCDAU0	0x510
-#define TSI148_LCSR_DCDAL0	0x514
-#define TSI148_LCSR_DCLAU0	0x518
-#define TSI148_LCSR_DCLAL0	0x51C
-#define TSI148_LCSR_DSAU0	0x520
-#define TSI148_LCSR_DSAL0	0x524
-#define TSI148_LCSR_DDAU0	0x528
-#define TSI148_LCSR_DDAL0	0x52C
-#define TSI148_LCSR_DSAT0	0x530
-#define TSI148_LCSR_DDAT0	0x534
-#define TSI148_LCSR_DNLAU0	0x538
-#define TSI148_LCSR_DNLAL0	0x53C
-#define TSI148_LCSR_DCNT0	0x540
-#define TSI148_LCSR_DDBS0	0x544
-
-#define TSI148_LCSR_DCTL1	0x580
-#define TSI148_LCSR_DSTA1	0x584
-#define TSI148_LCSR_DCSAU1	0x588
-#define TSI148_LCSR_DCSAL1	0x58C
-#define TSI148_LCSR_DCDAU1	0x590
-#define TSI148_LCSR_DCDAL1	0x594
-#define TSI148_LCSR_DCLAU1	0x598
-#define TSI148_LCSR_DCLAL1	0x59C
-#define TSI148_LCSR_DSAU1	0x5A0
-#define TSI148_LCSR_DSAL1	0x5A4
-#define TSI148_LCSR_DDAU1	0x5A8
-#define TSI148_LCSR_DDAL1	0x5AC
-#define TSI148_LCSR_DSAT1	0x5B0
-#define TSI148_LCSR_DDAT1	0x5B4
-#define TSI148_LCSR_DNLAU1	0x5B8
-#define TSI148_LCSR_DNLAL1	0x5BC
-#define TSI148_LCSR_DCNT1	0x5C0
-#define TSI148_LCSR_DDBS1	0x5C4
-
-#define TSI148_LCSR_DMA0	0x500
-#define TSI148_LCSR_DMA1	0x580
-
-
-static const int TSI148_LCSR_DMA[TSI148_MAX_DMA] = { TSI148_LCSR_DMA0,
-						TSI148_LCSR_DMA1 };
-
-#define TSI148_LCSR_OFFSET_DCTL		0x0
-#define TSI148_LCSR_OFFSET_DSTA		0x4
-#define TSI148_LCSR_OFFSET_DCSAU	0x8
-#define TSI148_LCSR_OFFSET_DCSAL	0xC
-#define TSI148_LCSR_OFFSET_DCDAU	0x10
-#define TSI148_LCSR_OFFSET_DCDAL	0x14
-#define TSI148_LCSR_OFFSET_DCLAU	0x18
-#define TSI148_LCSR_OFFSET_DCLAL	0x1C
-#define TSI148_LCSR_OFFSET_DSAU		0x20
-#define TSI148_LCSR_OFFSET_DSAL		0x24
-#define TSI148_LCSR_OFFSET_DDAU		0x28
-#define TSI148_LCSR_OFFSET_DDAL		0x2C
-#define TSI148_LCSR_OFFSET_DSAT		0x30
-#define TSI148_LCSR_OFFSET_DDAT		0x34
-#define TSI148_LCSR_OFFSET_DNLAU	0x38
-#define TSI148_LCSR_OFFSET_DNLAL	0x3C
-#define TSI148_LCSR_OFFSET_DCNT		0x40
-#define TSI148_LCSR_OFFSET_DDBS		0x44
-
-	/*
-	 * GCSR Register Group
-	 */
-
-	/*
-	 *         GCSR    CRG
-	 * offset   00     600 - DEVI/VENI
-	 * offset   04     604 - CTRL/GA/REVID
-	 * offset   08     608 - Semaphore3/2/1/0
-	 * offset   0C     60C - Seamphore7/6/5/4
-	 */
-#define TSI148_GCSR_ID		0x600
-#define TSI148_GCSR_CSR		0x604
-#define TSI148_GCSR_SEMA0	0x608
-#define TSI148_GCSR_SEMA1	0x60C
-
-	/*
-	 * Mail Box
-	 *         GCSR    CRG
-	 * offset   10     610 - Mailbox0
-	 */
-#define TSI148_GCSR_MBOX0	0x610
-#define TSI148_GCSR_MBOX1	0x614
-#define TSI148_GCSR_MBOX2	0x618
-#define TSI148_GCSR_MBOX3	0x61C
-
-static const int TSI148_GCSR_MBOX[4] = { TSI148_GCSR_MBOX0,
-					TSI148_GCSR_MBOX1,
-					TSI148_GCSR_MBOX2,
-					TSI148_GCSR_MBOX3 };
-
-	/*
-	 * CR/CSR
-	 */
-
-	/*
-	 *        CR/CSR   CRG
-	 * offset  7FFF4   FF4 - CSRBCR
-	 * offset  7FFF8   FF8 - CSRBSR
-	 * offset  7FFFC   FFC - CBAR
-	 */
-#define TSI148_CSRBCR	0xFF4
-#define TSI148_CSRBSR	0xFF8
-#define TSI148_CBAR	0xFFC
-
-
-
-
-	/*
-	 *  TSI148 Register Bit Definitions
-	 */
-
-	/*
-	 *  PFCS Register Set
-	 */
-#define TSI148_PCFS_CMMD_SERR          (1<<8)	/* SERR_L out pin ssys err */
-#define TSI148_PCFS_CMMD_PERR          (1<<6)	/* PERR_L out pin  parity */
-#define TSI148_PCFS_CMMD_MSTR          (1<<2)	/* PCI bus master */
-#define TSI148_PCFS_CMMD_MEMSP         (1<<1)	/* PCI mem space access  */
-#define TSI148_PCFS_CMMD_IOSP          (1<<0)	/* PCI I/O space enable */
-
-#define TSI148_PCFS_STAT_RCPVE         (1<<15)	/* Detected Parity Error */
-#define TSI148_PCFS_STAT_SIGSE         (1<<14)	/* Signalled System Error */
-#define TSI148_PCFS_STAT_RCVMA         (1<<13)	/* Received Master Abort */
-#define TSI148_PCFS_STAT_RCVTA         (1<<12)	/* Received Target Abort */
-#define TSI148_PCFS_STAT_SIGTA         (1<<11)	/* Signalled Target Abort */
-#define TSI148_PCFS_STAT_SELTIM        (3<<9)	/* DELSEL Timing */
-#define TSI148_PCFS_STAT_DPAR          (1<<8)	/* Data Parity Err Reported */
-#define TSI148_PCFS_STAT_FAST          (1<<7)	/* Fast back-to-back Cap */
-#define TSI148_PCFS_STAT_P66M          (1<<5)	/* 66 MHz Capable */
-#define TSI148_PCFS_STAT_CAPL          (1<<4)	/* Capab List - address $34 */
-
-/*
- *  Revision ID/Class Code Registers   (CRG +$008)
- */
-#define TSI148_PCFS_CLAS_M             (0xFF<<24)	/* Class ID */
-#define TSI148_PCFS_SUBCLAS_M          (0xFF<<16)	/* Sub-Class ID */
-#define TSI148_PCFS_PROGIF_M           (0xFF<<8)	/* Sub-Class ID */
-#define TSI148_PCFS_REVID_M            (0xFF<<0)	/* Rev ID */
-
-/*
- * Cache Line Size/ Master Latency Timer/ Header Type Registers (CRG + $00C)
- */
-#define TSI148_PCFS_HEAD_M             (0xFF<<16)	/* Master Lat Timer */
-#define TSI148_PCFS_MLAT_M             (0xFF<<8)	/* Master Lat Timer */
-#define TSI148_PCFS_CLSZ_M             (0xFF<<0)	/* Cache Line Size */
-
-/*
- *  Memory Base Address Lower Reg (CRG + $010)
- */
-#define TSI148_PCFS_MBARL_BASEL_M      (0xFFFFF<<12) /* Base Addr Lower Mask */
-#define TSI148_PCFS_MBARL_PRE          (1<<3)	/* Prefetch */
-#define TSI148_PCFS_MBARL_MTYPE_M      (3<<1)	/* Memory Type Mask */
-#define TSI148_PCFS_MBARL_IOMEM        (1<<0)	/* I/O Space Indicator */
-
-/*
- *  Message Signaled Interrupt Capabilities Register (CRG + $040)
- */
-#define TSI148_PCFS_MSICAP_64BAC       (1<<7)	/* 64-bit Address Capable */
-#define TSI148_PCFS_MSICAP_MME_M       (7<<4)	/* Multiple Msg Enable Mask */
-#define TSI148_PCFS_MSICAP_MMC_M       (7<<1)	/* Multiple Msg Capable Mask */
-#define TSI148_PCFS_MSICAP_MSIEN       (1<<0)	/* Msg signaled INT Enable */
-
-/*
- *  Message Address Lower Register (CRG +$044)
- */
-#define TSI148_PCFS_MSIAL_M            (0x3FFFFFFF<<2)	/* Mask */
-
-/*
- *  Message Data Register (CRG + 4C)
- */
-#define TSI148_PCFS_MSIMD_M            (0xFFFF<<0)	/* Mask */
-
-/*
- *  PCI-X Capabilities Register (CRG + $050)
- */
-#define TSI148_PCFS_PCIXCAP_MOST_M     (7<<4)	/* Max outstanding Split Tran */
-#define TSI148_PCFS_PCIXCAP_MMRBC_M    (3<<2)	/* Max Mem Read byte cnt */
-#define TSI148_PCFS_PCIXCAP_ERO        (1<<1)	/* Enable Relaxed Ordering */
-#define TSI148_PCFS_PCIXCAP_DPERE      (1<<0)	/* Data Parity Recover Enable */
-
-/*
- *  PCI-X Status Register (CRG +$054)
- */
-#define TSI148_PCFS_PCIXSTAT_RSCEM     (1<<29)	/* Received Split Comp Error */
-#define TSI148_PCFS_PCIXSTAT_DMCRS_M   (7<<26)	/* max Cumulative Read Size */
-#define TSI148_PCFS_PCIXSTAT_DMOST_M   (7<<23)	/* max outstanding Split Trans
-						 */
-#define TSI148_PCFS_PCIXSTAT_DMMRC_M   (3<<21)	/* max mem read byte count */
-#define TSI148_PCFS_PCIXSTAT_DC        (1<<20)	/* Device Complexity */
-#define TSI148_PCFS_PCIXSTAT_USC       (1<<19)	/* Unexpected Split comp */
-#define TSI148_PCFS_PCIXSTAT_SCD       (1<<18)	/* Split completion discard */
-#define TSI148_PCFS_PCIXSTAT_133C      (1<<17)	/* 133MHz capable */
-#define TSI148_PCFS_PCIXSTAT_64D       (1<<16)	/* 64 bit device */
-#define TSI148_PCFS_PCIXSTAT_BN_M      (0xFF<<8)	/* Bus number */
-#define TSI148_PCFS_PCIXSTAT_DN_M      (0x1F<<3)	/* Device number */
-#define TSI148_PCFS_PCIXSTAT_FN_M      (7<<0)	/* Function Number */
-
-/*
- *  LCSR Registers
- */
-
-/*
- *  Outbound Translation Starting Address Lower
- */
-#define TSI148_LCSR_OTSAL_M            (0xFFFF<<16)	/* Mask */
-
-/*
- *  Outbound Translation Ending Address Lower
- */
-#define TSI148_LCSR_OTEAL_M            (0xFFFF<<16)	/* Mask */
-
-/*
- *  Outbound Translation Offset Lower
- */
-#define TSI148_LCSR_OTOFFL_M           (0xFFFF<<16)	/* Mask */
-
-/*
- *  Outbound Translation 2eSST Broadcast Select
- */
-#define TSI148_LCSR_OTBS_M             (0xFFFFF<<0)	/* Mask */
-
-/*
- *  Outbound Translation Attribute
- */
-#define TSI148_LCSR_OTAT_EN            (1<<31)	/* Window Enable */
-#define TSI148_LCSR_OTAT_MRPFD         (1<<18)	/* Prefetch Disable */
-
-#define TSI148_LCSR_OTAT_PFS_M         (3<<16)	/* Prefetch Size Mask */
-#define TSI148_LCSR_OTAT_PFS_2         (0<<16)	/* 2 Cache Lines P Size */
-#define TSI148_LCSR_OTAT_PFS_4         (1<<16)	/* 4 Cache Lines P Size */
-#define TSI148_LCSR_OTAT_PFS_8         (2<<16)	/* 8 Cache Lines P Size */
-#define TSI148_LCSR_OTAT_PFS_16        (3<<16)	/* 16 Cache Lines P Size */
-
-#define TSI148_LCSR_OTAT_2eSSTM_M      (7<<11)	/* 2eSST Xfer Rate Mask */
-#define TSI148_LCSR_OTAT_2eSSTM_160    (0<<11)	/* 160MB/s 2eSST Xfer Rate */
-#define TSI148_LCSR_OTAT_2eSSTM_267    (1<<11)	/* 267MB/s 2eSST Xfer Rate */
-#define TSI148_LCSR_OTAT_2eSSTM_320    (2<<11)	/* 320MB/s 2eSST Xfer Rate */
-
-#define TSI148_LCSR_OTAT_TM_M          (7<<8)	/* Xfer Protocol Mask */
-#define TSI148_LCSR_OTAT_TM_SCT        (0<<8)	/* SCT Xfer Protocol */
-#define TSI148_LCSR_OTAT_TM_BLT        (1<<8)	/* BLT Xfer Protocol */
-#define TSI148_LCSR_OTAT_TM_MBLT       (2<<8)	/* MBLT Xfer Protocol */
-#define TSI148_LCSR_OTAT_TM_2eVME      (3<<8)	/* 2eVME Xfer Protocol */
-#define TSI148_LCSR_OTAT_TM_2eSST      (4<<8)	/* 2eSST Xfer Protocol */
-#define TSI148_LCSR_OTAT_TM_2eSSTB     (5<<8)	/* 2eSST Bcast Xfer Protocol */
-
-#define TSI148_LCSR_OTAT_DBW_M         (3<<6)	/* Max Data Width */
-#define TSI148_LCSR_OTAT_DBW_16        (0<<6)	/* 16-bit Data Width */
-#define TSI148_LCSR_OTAT_DBW_32        (1<<6)	/* 32-bit Data Width */
-
-#define TSI148_LCSR_OTAT_SUP           (1<<5)	/* Supervisory Access */
-#define TSI148_LCSR_OTAT_PGM           (1<<4)	/* Program Access */
-
-#define TSI148_LCSR_OTAT_AMODE_M       (0xf<<0)	/* Address Mode Mask */
-#define TSI148_LCSR_OTAT_AMODE_A16     (0<<0)	/* A16 Address Space */
-#define TSI148_LCSR_OTAT_AMODE_A24     (1<<0)	/* A24 Address Space */
-#define TSI148_LCSR_OTAT_AMODE_A32     (2<<0)	/* A32 Address Space */
-#define TSI148_LCSR_OTAT_AMODE_A64     (4<<0)	/* A32 Address Space */
-#define TSI148_LCSR_OTAT_AMODE_CRCSR   (5<<0)	/* CR/CSR Address Space */
-#define TSI148_LCSR_OTAT_AMODE_USER1   (8<<0)	/* User1 Address Space */
-#define TSI148_LCSR_OTAT_AMODE_USER2   (9<<0)	/* User2 Address Space */
-#define TSI148_LCSR_OTAT_AMODE_USER3   (10<<0)	/* User3 Address Space */
-#define TSI148_LCSR_OTAT_AMODE_USER4   (11<<0)	/* User4 Address Space */
-
-/*
- *  VME Master Control Register  CRG+$234
- */
-#define TSI148_LCSR_VMCTRL_VSA         (1<<27)	/* VMEbus Stop Ack */
-#define TSI148_LCSR_VMCTRL_VS          (1<<26)	/* VMEbus Stop */
-#define TSI148_LCSR_VMCTRL_DHB         (1<<25)	/* Device Has Bus */
-#define TSI148_LCSR_VMCTRL_DWB         (1<<24)	/* Device Wants Bus */
-
-#define TSI148_LCSR_VMCTRL_RMWEN       (1<<20)	/* RMW Enable */
-
-#define TSI148_LCSR_VMCTRL_ATO_M       (7<<16)	/* Master Access Time-out Mask
-						 */
-#define TSI148_LCSR_VMCTRL_ATO_32      (0<<16)	/* 32 us */
-#define TSI148_LCSR_VMCTRL_ATO_128     (1<<16)	/* 128 us */
-#define TSI148_LCSR_VMCTRL_ATO_512     (2<<16)	/* 512 us */
-#define TSI148_LCSR_VMCTRL_ATO_2M      (3<<16)	/* 2 ms */
-#define TSI148_LCSR_VMCTRL_ATO_8M      (4<<16)	/* 8 ms */
-#define TSI148_LCSR_VMCTRL_ATO_32M     (5<<16)	/* 32 ms */
-#define TSI148_LCSR_VMCTRL_ATO_128M    (6<<16)	/* 128 ms */
-#define TSI148_LCSR_VMCTRL_ATO_DIS     (7<<16)	/* Disabled */
-
-#define TSI148_LCSR_VMCTRL_VTOFF_M     (7<<12)	/* VMEbus Master Time off */
-#define TSI148_LCSR_VMCTRL_VTOFF_0     (0<<12)	/* 0us */
-#define TSI148_LCSR_VMCTRL_VTOFF_1     (1<<12)	/* 1us */
-#define TSI148_LCSR_VMCTRL_VTOFF_2     (2<<12)	/* 2us */
-#define TSI148_LCSR_VMCTRL_VTOFF_4     (3<<12)	/* 4us */
-#define TSI148_LCSR_VMCTRL_VTOFF_8     (4<<12)	/* 8us */
-#define TSI148_LCSR_VMCTRL_VTOFF_16    (5<<12)	/* 16us */
-#define TSI148_LCSR_VMCTRL_VTOFF_32    (6<<12)	/* 32us */
-#define TSI148_LCSR_VMCTRL_VTOFF_64    (7<<12)	/* 64us */
-
-#define TSI148_LCSR_VMCTRL_VTON_M      (7<<8)	/* VMEbus Master Time On */
-#define TSI148_LCSR_VMCTRL_VTON_4      (0<<8)	/* 8us */
-#define TSI148_LCSR_VMCTRL_VTON_8      (1<<8)	/* 8us */
-#define TSI148_LCSR_VMCTRL_VTON_16     (2<<8)	/* 16us */
-#define TSI148_LCSR_VMCTRL_VTON_32     (3<<8)	/* 32us */
-#define TSI148_LCSR_VMCTRL_VTON_64     (4<<8)	/* 64us */
-#define TSI148_LCSR_VMCTRL_VTON_128    (5<<8)	/* 128us */
-#define TSI148_LCSR_VMCTRL_VTON_256    (6<<8)	/* 256us */
-#define TSI148_LCSR_VMCTRL_VTON_512    (7<<8)	/* 512us */
-
-#define TSI148_LCSR_VMCTRL_VREL_M      (3<<3)	/* VMEbus Master Rel Mode Mask
-						 */
-#define TSI148_LCSR_VMCTRL_VREL_T_D    (0<<3)	/* Time on or Done */
-#define TSI148_LCSR_VMCTRL_VREL_T_R_D  (1<<3)	/* Time on and REQ or Done */
-#define TSI148_LCSR_VMCTRL_VREL_T_B_D  (2<<3)	/* Time on and BCLR or Done */
-#define TSI148_LCSR_VMCTRL_VREL_T_D_R  (3<<3)	/* Time on or Done and REQ */
-
-#define TSI148_LCSR_VMCTRL_VFAIR       (1<<2)	/* VMEbus Master Fair Mode */
-#define TSI148_LCSR_VMCTRL_VREQL_M     (3<<0)	/* VMEbus Master Req Level Mask
-						 */
-
-/*
- *  VMEbus Control Register CRG+$238
- */
-#define TSI148_LCSR_VCTRL_LRE          (1<<31)	/* Late Retry Enable */
-
-#define TSI148_LCSR_VCTRL_DLT_M        (0xF<<24)	/* Deadlock Timer */
-#define TSI148_LCSR_VCTRL_DLT_OFF      (0<<24)	/* Deadlock Timer Off */
-#define TSI148_LCSR_VCTRL_DLT_16       (1<<24)	/* 16 VCLKS */
-#define TSI148_LCSR_VCTRL_DLT_32       (2<<24)	/* 32 VCLKS */
-#define TSI148_LCSR_VCTRL_DLT_64       (3<<24)	/* 64 VCLKS */
-#define TSI148_LCSR_VCTRL_DLT_128      (4<<24)	/* 128 VCLKS */
-#define TSI148_LCSR_VCTRL_DLT_256      (5<<24)	/* 256 VCLKS */
-#define TSI148_LCSR_VCTRL_DLT_512      (6<<24)	/* 512 VCLKS */
-#define TSI148_LCSR_VCTRL_DLT_1024     (7<<24)	/* 1024 VCLKS */
-#define TSI148_LCSR_VCTRL_DLT_2048     (8<<24)	/* 2048 VCLKS */
-#define TSI148_LCSR_VCTRL_DLT_4096     (9<<24)	/* 4096 VCLKS */
-#define TSI148_LCSR_VCTRL_DLT_8192     (0xA<<24)	/* 8192 VCLKS */
-#define TSI148_LCSR_VCTRL_DLT_16384    (0xB<<24)	/* 16384 VCLKS */
-#define TSI148_LCSR_VCTRL_DLT_32768    (0xC<<24)	/* 32768 VCLKS */
-
-#define TSI148_LCSR_VCTRL_NERBB        (1<<20)	/* No Early Release of Bus Busy
-						 */
-
-#define TSI148_LCSR_VCTRL_SRESET       (1<<17)	/* System Reset */
-#define TSI148_LCSR_VCTRL_LRESET       (1<<16)	/* Local Reset */
-
-#define TSI148_LCSR_VCTRL_SFAILAI      (1<<15)	/* SYSFAIL Auto Slot ID */
-#define TSI148_LCSR_VCTRL_BID_M        (0x1F<<8)	/* Broadcast ID Mask */
-
-#define TSI148_LCSR_VCTRL_ATOEN        (1<<7)	/* Arbiter Time-out Enable */
-#define TSI148_LCSR_VCTRL_ROBIN        (1<<6)	/* VMEbus Round Robin */
-
-#define TSI148_LCSR_VCTRL_GTO_M        (7<<0)	/* VMEbus Global Time-out Mask
-						 */
-#define TSI148_LCSR_VCTRL_GTO_8	      (0<<0)	/* 8 us */
-#define TSI148_LCSR_VCTRL_GTO_16	      (1<<0)	/* 16 us */
-#define TSI148_LCSR_VCTRL_GTO_32	      (2<<0)	/* 32 us */
-#define TSI148_LCSR_VCTRL_GTO_64	      (3<<0)	/* 64 us */
-#define TSI148_LCSR_VCTRL_GTO_128      (4<<0)	/* 128 us */
-#define TSI148_LCSR_VCTRL_GTO_256      (5<<0)	/* 256 us */
-#define TSI148_LCSR_VCTRL_GTO_512      (6<<0)	/* 512 us */
-#define TSI148_LCSR_VCTRL_GTO_DIS      (7<<0)	/* Disabled */
-
-/*
- *  VMEbus Status Register  CRG + $23C
- */
-#define TSI148_LCSR_VSTAT_CPURST       (1<<15)	/* Clear power up reset */
-#define TSI148_LCSR_VSTAT_BRDFL        (1<<14)	/* Board fail */
-#define TSI148_LCSR_VSTAT_PURSTS       (1<<12)	/* Power up reset status */
-#define TSI148_LCSR_VSTAT_BDFAILS      (1<<11)	/* Board Fail Status */
-#define TSI148_LCSR_VSTAT_SYSFAILS     (1<<10)	/* System Fail Status */
-#define TSI148_LCSR_VSTAT_ACFAILS      (1<<9)	/* AC fail status */
-#define TSI148_LCSR_VSTAT_SCONS        (1<<8)	/* System Cont Status */
-#define TSI148_LCSR_VSTAT_GAP          (1<<5)	/* Geographic Addr Parity */
-#define TSI148_LCSR_VSTAT_GA_M         (0x1F<<0)  /* Geographic Addr Mask */
-
-/*
- *  PCI Configuration Status Register CRG+$240
- */
-#define TSI148_LCSR_PSTAT_REQ64S       (1<<6)	/* Request 64 status set */
-#define TSI148_LCSR_PSTAT_M66ENS       (1<<5)	/* M66ENS 66Mhz enable */
-#define TSI148_LCSR_PSTAT_FRAMES       (1<<4)	/* Frame Status */
-#define TSI148_LCSR_PSTAT_IRDYS        (1<<3)	/* IRDY status */
-#define TSI148_LCSR_PSTAT_DEVSELS      (1<<2)	/* DEVL status */
-#define TSI148_LCSR_PSTAT_STOPS        (1<<1)	/* STOP status */
-#define TSI148_LCSR_PSTAT_TRDYS        (1<<0)	/* TRDY status */
-
-/*
- *  VMEbus Exception Attributes Register  CRG + $268
- */
-#define TSI148_LCSR_VEAT_VES           (1<<31)	/* Status */
-#define TSI148_LCSR_VEAT_VEOF          (1<<30)	/* Overflow */
-#define TSI148_LCSR_VEAT_VESCL         (1<<29)	/* Status Clear */
-#define TSI148_LCSR_VEAT_2EOT          (1<<21)	/* 2e Odd Termination */
-#define TSI148_LCSR_VEAT_2EST          (1<<20)	/* 2e Slave terminated */
-#define TSI148_LCSR_VEAT_BERR          (1<<19)	/* Bus Error */
-#define TSI148_LCSR_VEAT_LWORD         (1<<18)	/* LWORD_ signal state */
-#define TSI148_LCSR_VEAT_WRITE         (1<<17)	/* WRITE_ signal state */
-#define TSI148_LCSR_VEAT_IACK          (1<<16)	/* IACK_ signal state */
-#define TSI148_LCSR_VEAT_DS1           (1<<15)	/* DS1_ signal state */
-#define TSI148_LCSR_VEAT_DS0           (1<<14)	/* DS0_ signal state */
-#define TSI148_LCSR_VEAT_AM_M          (0x3F<<8)	/* Address Mode Mask */
-#define TSI148_LCSR_VEAT_XAM_M         (0xFF<<0)	/* Master AMode Mask */
-
-
-/*
- * VMEbus PCI Error Diagnostics PCI/X Attributes Register  CRG + $280
- */
-#define TSI148_LCSR_EDPAT_EDPCL        (1<<29)
-
-/*
- *  Inbound Translation Starting Address Lower
- */
-#define TSI148_LCSR_ITSAL6432_M        (0xFFFF<<16)	/* Mask */
-#define TSI148_LCSR_ITSAL24_M          (0x00FFF<<12)	/* Mask */
-#define TSI148_LCSR_ITSAL16_M          (0x0000FFF<<4)	/* Mask */
-
-/*
- *  Inbound Translation Ending Address Lower
- */
-#define TSI148_LCSR_ITEAL6432_M        (0xFFFF<<16)	/* Mask */
-#define TSI148_LCSR_ITEAL24_M          (0x00FFF<<12)	/* Mask */
-#define TSI148_LCSR_ITEAL16_M          (0x0000FFF<<4)	/* Mask */
-
-/*
- *  Inbound Translation Offset Lower
- */
-#define TSI148_LCSR_ITOFFL6432_M       (0xFFFF<<16)	/* Mask */
-#define TSI148_LCSR_ITOFFL24_M         (0xFFFFF<<12)	/* Mask */
-#define TSI148_LCSR_ITOFFL16_M         (0xFFFFFFF<<4)	/* Mask */
-
-/*
- *  Inbound Translation Attribute
- */
-#define TSI148_LCSR_ITAT_EN            (1<<31)	/* Window Enable */
-#define TSI148_LCSR_ITAT_TH            (1<<18)	/* Prefetch Threshold */
-
-#define TSI148_LCSR_ITAT_VFS_M         (3<<16)	/* Virtual FIFO Size Mask */
-#define TSI148_LCSR_ITAT_VFS_64        (0<<16)	/* 64 bytes Virtual FIFO Size */
-#define TSI148_LCSR_ITAT_VFS_128       (1<<16)	/* 128 bytes Virtual FIFO Sz */
-#define TSI148_LCSR_ITAT_VFS_256       (2<<16)	/* 256 bytes Virtual FIFO Sz */
-#define TSI148_LCSR_ITAT_VFS_512       (3<<16)	/* 512 bytes Virtual FIFO Sz */
-
-#define TSI148_LCSR_ITAT_2eSSTM_M      (7<<12)	/* 2eSST Xfer Rate Mask */
-#define TSI148_LCSR_ITAT_2eSSTM_160    (0<<12)	/* 160MB/s 2eSST Xfer Rate */
-#define TSI148_LCSR_ITAT_2eSSTM_267    (1<<12)	/* 267MB/s 2eSST Xfer Rate */
-#define TSI148_LCSR_ITAT_2eSSTM_320    (2<<12)	/* 320MB/s 2eSST Xfer Rate */
-
-#define TSI148_LCSR_ITAT_2eSSTB        (1<<11)	/* 2eSST Bcast Xfer Protocol */
-#define TSI148_LCSR_ITAT_2eSST         (1<<10)	/* 2eSST Xfer Protocol */
-#define TSI148_LCSR_ITAT_2eVME         (1<<9)	/* 2eVME Xfer Protocol */
-#define TSI148_LCSR_ITAT_MBLT          (1<<8)	/* MBLT Xfer Protocol */
-#define TSI148_LCSR_ITAT_BLT           (1<<7)	/* BLT Xfer Protocol */
-
-#define TSI148_LCSR_ITAT_AS_M          (7<<4)	/* Address Space Mask */
-#define TSI148_LCSR_ITAT_AS_A16        (0<<4)	/* A16 Address Space */
-#define TSI148_LCSR_ITAT_AS_A24        (1<<4)	/* A24 Address Space */
-#define TSI148_LCSR_ITAT_AS_A32        (2<<4)	/* A32 Address Space */
-#define TSI148_LCSR_ITAT_AS_A64        (4<<4)	/* A64 Address Space */
-
-#define TSI148_LCSR_ITAT_SUPR          (1<<3)	/* Supervisor Access */
-#define TSI148_LCSR_ITAT_NPRIV         (1<<2)	/* Non-Priv (User) Access */
-#define TSI148_LCSR_ITAT_PGM           (1<<1)	/* Program Access */
-#define TSI148_LCSR_ITAT_DATA          (1<<0)	/* Data Access */
-
-/*
- *  GCSR Base Address Lower Address  CRG +$404
- */
-#define TSI148_LCSR_GBAL_M             (0x7FFFFFF<<5)	/* Mask */
-
-/*
- *  GCSR Attribute Register CRG + $408
- */
-#define TSI148_LCSR_GCSRAT_EN          (1<<7)	/* Enable access to GCSR */
-
-#define TSI148_LCSR_GCSRAT_AS_M        (7<<4)	/* Address Space Mask */
-#define TSI148_LCSR_GCSRAT_AS_A16       (0<<4)	/* Address Space 16 */
-#define TSI148_LCSR_GCSRAT_AS_A24       (1<<4)	/* Address Space 24 */
-#define TSI148_LCSR_GCSRAT_AS_A32       (2<<4)	/* Address Space 32 */
-#define TSI148_LCSR_GCSRAT_AS_A64       (4<<4)	/* Address Space 64 */
-
-#define TSI148_LCSR_GCSRAT_SUPR        (1<<3)	/* Sup set -GCSR decoder */
-#define TSI148_LCSR_GCSRAT_NPRIV       (1<<2)	/* Non-Privliged set - CGSR */
-#define TSI148_LCSR_GCSRAT_PGM         (1<<1)	/* Program set - GCSR decoder */
-#define TSI148_LCSR_GCSRAT_DATA        (1<<0)	/* DATA set GCSR decoder */
-
-/*
- *  CRG Base Address Lower Address  CRG + $410
- */
-#define TSI148_LCSR_CBAL_M             (0xFFFFF<<12)
-
-/*
- *  CRG Attribute Register  CRG + $414
- */
-#define TSI148_LCSR_CRGAT_EN           (1<<7)	/* Enable PRG Access */
-
-#define TSI148_LCSR_CRGAT_AS_M         (7<<4)	/* Address Space */
-#define TSI148_LCSR_CRGAT_AS_A16       (0<<4)	/* Address Space 16 */
-#define TSI148_LCSR_CRGAT_AS_A24       (1<<4)	/* Address Space 24 */
-#define TSI148_LCSR_CRGAT_AS_A32       (2<<4)	/* Address Space 32 */
-#define TSI148_LCSR_CRGAT_AS_A64       (4<<4)	/* Address Space 64 */
-
-#define TSI148_LCSR_CRGAT_SUPR         (1<<3)	/* Supervisor Access */
-#define TSI148_LCSR_CRGAT_NPRIV        (1<<2)	/* Non-Privliged(User) Access */
-#define TSI148_LCSR_CRGAT_PGM          (1<<1)	/* Program Access */
-#define TSI148_LCSR_CRGAT_DATA         (1<<0)	/* Data Access */
-
-/*
- *  CR/CSR Offset Lower Register  CRG + $41C
- */
-#define TSI148_LCSR_CROL_M             (0x1FFF<<19)	/* Mask */
-
-/*
- *  CR/CSR Attribute register  CRG + $420
- */
-#define TSI148_LCSR_CRAT_EN            (1<<7)	/* Enable access to CR/CSR */
-
-/*
- *  Location Monitor base address lower register  CRG + $428
- */
-#define TSI148_LCSR_LMBAL_M            (0x7FFFFFF<<5)	/* Mask */
-
-/*
- *  Location Monitor Attribute Register  CRG + $42C
- */
-#define TSI148_LCSR_LMAT_EN            (1<<7)	/* Enable Location Monitor */
-
-#define TSI148_LCSR_LMAT_AS_M          (7<<4)	/* Address Space MASK  */
-#define TSI148_LCSR_LMAT_AS_A16        (0<<4)	/* A16 */
-#define TSI148_LCSR_LMAT_AS_A24        (1<<4)	/* A24 */
-#define TSI148_LCSR_LMAT_AS_A32        (2<<4)	/* A32 */
-#define TSI148_LCSR_LMAT_AS_A64        (4<<4)	/* A64 */
-
-#define TSI148_LCSR_LMAT_SUPR          (1<<3)	/* Supervisor Access */
-#define TSI148_LCSR_LMAT_NPRIV         (1<<2)	/* Non-Priv (User) Access */
-#define TSI148_LCSR_LMAT_PGM           (1<<1)	/* Program Access */
-#define TSI148_LCSR_LMAT_DATA          (1<<0)	/* Data Access  */
-
-/*
- *  Broadcast Pulse Generator Timer Register  CRG + $438
- */
-#define TSI148_LCSR_BPGTR_BPGT_M       (0xFFFF<<0)	/* Mask */
-
-/*
- *  Broadcast Programmable Clock Timer Register  CRG + $43C
- */
-#define TSI148_LCSR_BPCTR_BPCT_M       (0xFFFFFF<<0)	/* Mask */
-
-/*
- *  VMEbus Interrupt Control Register           CRG + $43C
- */
-#define TSI148_LCSR_VICR_CNTS_M        (3<<22)	/* Cntr Source MASK */
-#define TSI148_LCSR_VICR_CNTS_DIS      (1<<22)	/* Cntr Disable */
-#define TSI148_LCSR_VICR_CNTS_IRQ1     (2<<22)	/* IRQ1 to Cntr */
-#define TSI148_LCSR_VICR_CNTS_IRQ2     (3<<22)	/* IRQ2 to Cntr */
-
-#define TSI148_LCSR_VICR_EDGIS_M       (3<<20)	/* Edge interrupt MASK */
-#define TSI148_LCSR_VICR_EDGIS_DIS     (1<<20)	/* Edge interrupt Disable */
-#define TSI148_LCSR_VICR_EDGIS_IRQ1    (2<<20)	/* IRQ1 to Edge */
-#define TSI148_LCSR_VICR_EDGIS_IRQ2    (3<<20)	/* IRQ2 to Edge */
-
-#define TSI148_LCSR_VICR_IRQIF_M       (3<<18)	/* IRQ1* Function MASK */
-#define TSI148_LCSR_VICR_IRQIF_NORM    (1<<18)	/* Normal */
-#define TSI148_LCSR_VICR_IRQIF_PULSE   (2<<18)	/* Pulse Generator */
-#define TSI148_LCSR_VICR_IRQIF_PROG    (3<<18)	/* Programmable Clock */
-#define TSI148_LCSR_VICR_IRQIF_1U      (4<<18)	/* 1us Clock */
-
-#define TSI148_LCSR_VICR_IRQ2F_M       (3<<16)	/* IRQ2* Function MASK */
-#define TSI148_LCSR_VICR_IRQ2F_NORM    (1<<16)	/* Normal */
-#define TSI148_LCSR_VICR_IRQ2F_PULSE   (2<<16)	/* Pulse Generator */
-#define TSI148_LCSR_VICR_IRQ2F_PROG    (3<<16)	/* Programmable Clock */
-#define TSI148_LCSR_VICR_IRQ2F_1U      (4<<16)	/* 1us Clock */
-
-#define TSI148_LCSR_VICR_BIP           (1<<15)	/* Broadcast Interrupt Pulse */
-
-#define TSI148_LCSR_VICR_IRQC          (1<<12)	/* VMEbus IRQ Clear */
-#define TSI148_LCSR_VICR_IRQS          (1<<11)	/* VMEbus IRQ Status */
-
-#define TSI148_LCSR_VICR_IRQL_M        (7<<8)	/* VMEbus SW IRQ Level Mask */
-#define TSI148_LCSR_VICR_IRQL_1        (1<<8)	/* VMEbus SW IRQ Level 1 */
-#define TSI148_LCSR_VICR_IRQL_2        (2<<8)	/* VMEbus SW IRQ Level 2 */
-#define TSI148_LCSR_VICR_IRQL_3        (3<<8)	/* VMEbus SW IRQ Level 3 */
-#define TSI148_LCSR_VICR_IRQL_4        (4<<8)	/* VMEbus SW IRQ Level 4 */
-#define TSI148_LCSR_VICR_IRQL_5        (5<<8)	/* VMEbus SW IRQ Level 5 */
-#define TSI148_LCSR_VICR_IRQL_6        (6<<8)	/* VMEbus SW IRQ Level 6 */
-#define TSI148_LCSR_VICR_IRQL_7        (7<<8)	/* VMEbus SW IRQ Level 7 */
-
-static const int TSI148_LCSR_VICR_IRQL[8] = { 0, TSI148_LCSR_VICR_IRQL_1,
-			TSI148_LCSR_VICR_IRQL_2, TSI148_LCSR_VICR_IRQL_3,
-			TSI148_LCSR_VICR_IRQL_4, TSI148_LCSR_VICR_IRQL_5,
-			TSI148_LCSR_VICR_IRQL_6, TSI148_LCSR_VICR_IRQL_7 };
-
-#define TSI148_LCSR_VICR_STID_M        (0xFF<<0)	/* Status/ID Mask */
-
-/*
- *  Interrupt Enable Register   CRG + $440
- */
-#define TSI148_LCSR_INTEN_DMA1EN       (1<<25)	/* DMAC 1 */
-#define TSI148_LCSR_INTEN_DMA0EN       (1<<24)	/* DMAC 0 */
-#define TSI148_LCSR_INTEN_LM3EN        (1<<23)	/* Location Monitor 3 */
-#define TSI148_LCSR_INTEN_LM2EN        (1<<22)	/* Location Monitor 2 */
-#define TSI148_LCSR_INTEN_LM1EN        (1<<21)	/* Location Monitor 1 */
-#define TSI148_LCSR_INTEN_LM0EN        (1<<20)	/* Location Monitor 0 */
-#define TSI148_LCSR_INTEN_MB3EN        (1<<19)	/* Mail Box 3 */
-#define TSI148_LCSR_INTEN_MB2EN        (1<<18)	/* Mail Box 2 */
-#define TSI148_LCSR_INTEN_MB1EN        (1<<17)	/* Mail Box 1 */
-#define TSI148_LCSR_INTEN_MB0EN        (1<<16)	/* Mail Box 0 */
-#define TSI148_LCSR_INTEN_PERREN       (1<<13)	/* PCI/X Error */
-#define TSI148_LCSR_INTEN_VERREN       (1<<12)	/* VMEbus Error */
-#define TSI148_LCSR_INTEN_VIEEN        (1<<11)	/* VMEbus IRQ Edge */
-#define TSI148_LCSR_INTEN_IACKEN       (1<<10)	/* IACK */
-#define TSI148_LCSR_INTEN_SYSFLEN      (1<<9)	/* System Fail */
-#define TSI148_LCSR_INTEN_ACFLEN       (1<<8)	/* AC Fail */
-#define TSI148_LCSR_INTEN_IRQ7EN       (1<<7)	/* IRQ7 */
-#define TSI148_LCSR_INTEN_IRQ6EN       (1<<6)	/* IRQ6 */
-#define TSI148_LCSR_INTEN_IRQ5EN       (1<<5)	/* IRQ5 */
-#define TSI148_LCSR_INTEN_IRQ4EN       (1<<4)	/* IRQ4 */
-#define TSI148_LCSR_INTEN_IRQ3EN       (1<<3)	/* IRQ3 */
-#define TSI148_LCSR_INTEN_IRQ2EN       (1<<2)	/* IRQ2 */
-#define TSI148_LCSR_INTEN_IRQ1EN       (1<<1)	/* IRQ1 */
-
-static const int TSI148_LCSR_INTEN_LMEN[4] = { TSI148_LCSR_INTEN_LM0EN,
-					TSI148_LCSR_INTEN_LM1EN,
-					TSI148_LCSR_INTEN_LM2EN,
-					TSI148_LCSR_INTEN_LM3EN };
-
-static const int TSI148_LCSR_INTEN_IRQEN[7] = { TSI148_LCSR_INTEN_IRQ1EN,
-					TSI148_LCSR_INTEN_IRQ2EN,
-					TSI148_LCSR_INTEN_IRQ3EN,
-					TSI148_LCSR_INTEN_IRQ4EN,
-					TSI148_LCSR_INTEN_IRQ5EN,
-					TSI148_LCSR_INTEN_IRQ6EN,
-					TSI148_LCSR_INTEN_IRQ7EN };
-
-/*
- *  Interrupt Enable Out Register CRG + $444
- */
-#define TSI148_LCSR_INTEO_DMA1EO       (1<<25)	/* DMAC 1 */
-#define TSI148_LCSR_INTEO_DMA0EO       (1<<24)	/* DMAC 0 */
-#define TSI148_LCSR_INTEO_LM3EO        (1<<23)	/* Loc Monitor 3 */
-#define TSI148_LCSR_INTEO_LM2EO        (1<<22)	/* Loc Monitor 2 */
-#define TSI148_LCSR_INTEO_LM1EO        (1<<21)	/* Loc Monitor 1 */
-#define TSI148_LCSR_INTEO_LM0EO        (1<<20)	/* Location Monitor 0 */
-#define TSI148_LCSR_INTEO_MB3EO        (1<<19)	/* Mail Box 3 */
-#define TSI148_LCSR_INTEO_MB2EO        (1<<18)	/* Mail Box 2 */
-#define TSI148_LCSR_INTEO_MB1EO        (1<<17)	/* Mail Box 1 */
-#define TSI148_LCSR_INTEO_MB0EO        (1<<16)	/* Mail Box 0 */
-#define TSI148_LCSR_INTEO_PERREO       (1<<13)	/* PCI/X Error */
-#define TSI148_LCSR_INTEO_VERREO       (1<<12)	/* VMEbus Error */
-#define TSI148_LCSR_INTEO_VIEEO        (1<<11)	/* VMEbus IRQ Edge */
-#define TSI148_LCSR_INTEO_IACKEO       (1<<10)	/* IACK */
-#define TSI148_LCSR_INTEO_SYSFLEO      (1<<9)	/* System Fail */
-#define TSI148_LCSR_INTEO_ACFLEO       (1<<8)	/* AC Fail */
-#define TSI148_LCSR_INTEO_IRQ7EO       (1<<7)	/* IRQ7 */
-#define TSI148_LCSR_INTEO_IRQ6EO       (1<<6)	/* IRQ6 */
-#define TSI148_LCSR_INTEO_IRQ5EO       (1<<5)	/* IRQ5 */
-#define TSI148_LCSR_INTEO_IRQ4EO       (1<<4)	/* IRQ4 */
-#define TSI148_LCSR_INTEO_IRQ3EO       (1<<3)	/* IRQ3 */
-#define TSI148_LCSR_INTEO_IRQ2EO       (1<<2)	/* IRQ2 */
-#define TSI148_LCSR_INTEO_IRQ1EO       (1<<1)	/* IRQ1 */
-
-static const int TSI148_LCSR_INTEO_LMEO[4] = { TSI148_LCSR_INTEO_LM0EO,
-					TSI148_LCSR_INTEO_LM1EO,
-					TSI148_LCSR_INTEO_LM2EO,
-					TSI148_LCSR_INTEO_LM3EO };
-
-static const int TSI148_LCSR_INTEO_IRQEO[7] = { TSI148_LCSR_INTEO_IRQ1EO,
-					TSI148_LCSR_INTEO_IRQ2EO,
-					TSI148_LCSR_INTEO_IRQ3EO,
-					TSI148_LCSR_INTEO_IRQ4EO,
-					TSI148_LCSR_INTEO_IRQ5EO,
-					TSI148_LCSR_INTEO_IRQ6EO,
-					TSI148_LCSR_INTEO_IRQ7EO };
-
-/*
- *  Interrupt Status Register CRG + $448
- */
-#define TSI148_LCSR_INTS_DMA1S         (1<<25)	/* DMA 1 */
-#define TSI148_LCSR_INTS_DMA0S         (1<<24)	/* DMA 0 */
-#define TSI148_LCSR_INTS_LM3S          (1<<23)	/* Location Monitor 3 */
-#define TSI148_LCSR_INTS_LM2S          (1<<22)	/* Location Monitor 2 */
-#define TSI148_LCSR_INTS_LM1S          (1<<21)	/* Location Monitor 1 */
-#define TSI148_LCSR_INTS_LM0S          (1<<20)	/* Location Monitor 0 */
-#define TSI148_LCSR_INTS_MB3S          (1<<19)	/* Mail Box 3 */
-#define TSI148_LCSR_INTS_MB2S          (1<<18)	/* Mail Box 2 */
-#define TSI148_LCSR_INTS_MB1S          (1<<17)	/* Mail Box 1 */
-#define TSI148_LCSR_INTS_MB0S          (1<<16)	/* Mail Box 0 */
-#define TSI148_LCSR_INTS_PERRS         (1<<13)	/* PCI/X Error */
-#define TSI148_LCSR_INTS_VERRS         (1<<12)	/* VMEbus Error */
-#define TSI148_LCSR_INTS_VIES          (1<<11)	/* VMEbus IRQ Edge */
-#define TSI148_LCSR_INTS_IACKS         (1<<10)	/* IACK */
-#define TSI148_LCSR_INTS_SYSFLS        (1<<9)	/* System Fail */
-#define TSI148_LCSR_INTS_ACFLS         (1<<8)	/* AC Fail */
-#define TSI148_LCSR_INTS_IRQ7S         (1<<7)	/* IRQ7 */
-#define TSI148_LCSR_INTS_IRQ6S         (1<<6)	/* IRQ6 */
-#define TSI148_LCSR_INTS_IRQ5S         (1<<5)	/* IRQ5 */
-#define TSI148_LCSR_INTS_IRQ4S         (1<<4)	/* IRQ4 */
-#define TSI148_LCSR_INTS_IRQ3S         (1<<3)	/* IRQ3 */
-#define TSI148_LCSR_INTS_IRQ2S         (1<<2)	/* IRQ2 */
-#define TSI148_LCSR_INTS_IRQ1S         (1<<1)	/* IRQ1 */
-
-static const int TSI148_LCSR_INTS_LMS[4] = { TSI148_LCSR_INTS_LM0S,
-					TSI148_LCSR_INTS_LM1S,
-					TSI148_LCSR_INTS_LM2S,
-					TSI148_LCSR_INTS_LM3S };
-
-static const int TSI148_LCSR_INTS_MBS[4] = { TSI148_LCSR_INTS_MB0S,
-					TSI148_LCSR_INTS_MB1S,
-					TSI148_LCSR_INTS_MB2S,
-					TSI148_LCSR_INTS_MB3S };
-
-/*
- *  Interrupt Clear Register CRG + $44C
- */
-#define TSI148_LCSR_INTC_DMA1C         (1<<25)	/* DMA 1 */
-#define TSI148_LCSR_INTC_DMA0C         (1<<24)	/* DMA 0 */
-#define TSI148_LCSR_INTC_LM3C          (1<<23)	/* Location Monitor 3 */
-#define TSI148_LCSR_INTC_LM2C          (1<<22)	/* Location Monitor 2 */
-#define TSI148_LCSR_INTC_LM1C          (1<<21)	/* Location Monitor 1 */
-#define TSI148_LCSR_INTC_LM0C          (1<<20)	/* Location Monitor 0 */
-#define TSI148_LCSR_INTC_MB3C          (1<<19)	/* Mail Box 3 */
-#define TSI148_LCSR_INTC_MB2C          (1<<18)	/* Mail Box 2 */
-#define TSI148_LCSR_INTC_MB1C          (1<<17)	/* Mail Box 1 */
-#define TSI148_LCSR_INTC_MB0C          (1<<16)	/* Mail Box 0 */
-#define TSI148_LCSR_INTC_PERRC         (1<<13)	/* VMEbus Error */
-#define TSI148_LCSR_INTC_VERRC         (1<<12)	/* VMEbus Access Time-out */
-#define TSI148_LCSR_INTC_VIEC          (1<<11)	/* VMEbus IRQ Edge */
-#define TSI148_LCSR_INTC_IACKC         (1<<10)	/* IACK */
-#define TSI148_LCSR_INTC_SYSFLC        (1<<9)	/* System Fail */
-#define TSI148_LCSR_INTC_ACFLC         (1<<8)	/* AC Fail */
-
-static const int TSI148_LCSR_INTC_LMC[4] = { TSI148_LCSR_INTC_LM0C,
-					TSI148_LCSR_INTC_LM1C,
-					TSI148_LCSR_INTC_LM2C,
-					TSI148_LCSR_INTC_LM3C };
-
-static const int TSI148_LCSR_INTC_MBC[4] = { TSI148_LCSR_INTC_MB0C,
-					TSI148_LCSR_INTC_MB1C,
-					TSI148_LCSR_INTC_MB2C,
-					TSI148_LCSR_INTC_MB3C };
-
-/*
- *  Interrupt Map Register 1 CRG + $458
- */
-#define TSI148_LCSR_INTM1_DMA1M_M      (3<<18)	/* DMA 1 */
-#define TSI148_LCSR_INTM1_DMA0M_M      (3<<16)	/* DMA 0 */
-#define TSI148_LCSR_INTM1_LM3M_M       (3<<14)	/* Location Monitor 3 */
-#define TSI148_LCSR_INTM1_LM2M_M       (3<<12)	/* Location Monitor 2 */
-#define TSI148_LCSR_INTM1_LM1M_M       (3<<10)	/* Location Monitor 1 */
-#define TSI148_LCSR_INTM1_LM0M_M       (3<<8)	/* Location Monitor 0 */
-#define TSI148_LCSR_INTM1_MB3M_M       (3<<6)	/* Mail Box 3 */
-#define TSI148_LCSR_INTM1_MB2M_M       (3<<4)	/* Mail Box 2 */
-#define TSI148_LCSR_INTM1_MB1M_M       (3<<2)	/* Mail Box 1 */
-#define TSI148_LCSR_INTM1_MB0M_M       (3<<0)	/* Mail Box 0 */
-
-/*
- *  Interrupt Map Register 2 CRG + $45C
- */
-#define TSI148_LCSR_INTM2_PERRM_M      (3<<26)	/* PCI Bus Error */
-#define TSI148_LCSR_INTM2_VERRM_M      (3<<24)	/* VMEbus Error */
-#define TSI148_LCSR_INTM2_VIEM_M       (3<<22)	/* VMEbus IRQ Edge */
-#define TSI148_LCSR_INTM2_IACKM_M      (3<<20)	/* IACK */
-#define TSI148_LCSR_INTM2_SYSFLM_M     (3<<18)	/* System Fail */
-#define TSI148_LCSR_INTM2_ACFLM_M      (3<<16)	/* AC Fail */
-#define TSI148_LCSR_INTM2_IRQ7M_M      (3<<14)	/* IRQ7 */
-#define TSI148_LCSR_INTM2_IRQ6M_M      (3<<12)	/* IRQ6 */
-#define TSI148_LCSR_INTM2_IRQ5M_M      (3<<10)	/* IRQ5 */
-#define TSI148_LCSR_INTM2_IRQ4M_M      (3<<8)	/* IRQ4 */
-#define TSI148_LCSR_INTM2_IRQ3M_M      (3<<6)	/* IRQ3 */
-#define TSI148_LCSR_INTM2_IRQ2M_M      (3<<4)	/* IRQ2 */
-#define TSI148_LCSR_INTM2_IRQ1M_M      (3<<2)	/* IRQ1 */
-
-/*
- *  DMA Control (0-1) Registers CRG + $500
- */
-#define TSI148_LCSR_DCTL_ABT           (1<<27)	/* Abort */
-#define TSI148_LCSR_DCTL_PAU           (1<<26)	/* Pause */
-#define TSI148_LCSR_DCTL_DGO           (1<<25)	/* DMA Go */
-
-#define TSI148_LCSR_DCTL_MOD           (1<<23)	/* Mode */
-
-#define TSI148_LCSR_DCTL_VBKS_M        (7<<12)	/* VMEbus block Size MASK */
-#define TSI148_LCSR_DCTL_VBKS_32       (0<<12)	/* VMEbus block Size 32 */
-#define TSI148_LCSR_DCTL_VBKS_64       (1<<12)	/* VMEbus block Size 64 */
-#define TSI148_LCSR_DCTL_VBKS_128      (2<<12)	/* VMEbus block Size 128 */
-#define TSI148_LCSR_DCTL_VBKS_256      (3<<12)	/* VMEbus block Size 256 */
-#define TSI148_LCSR_DCTL_VBKS_512      (4<<12)	/* VMEbus block Size 512 */
-#define TSI148_LCSR_DCTL_VBKS_1024     (5<<12)	/* VMEbus block Size 1024 */
-#define TSI148_LCSR_DCTL_VBKS_2048     (6<<12)	/* VMEbus block Size 2048 */
-#define TSI148_LCSR_DCTL_VBKS_4096     (7<<12)	/* VMEbus block Size 4096 */
-
-#define TSI148_LCSR_DCTL_VBOT_M        (7<<8)	/* VMEbus back-off MASK */
-#define TSI148_LCSR_DCTL_VBOT_0        (0<<8)	/* VMEbus back-off  0us */
-#define TSI148_LCSR_DCTL_VBOT_1        (1<<8)	/* VMEbus back-off 1us */
-#define TSI148_LCSR_DCTL_VBOT_2        (2<<8)	/* VMEbus back-off 2us */
-#define TSI148_LCSR_DCTL_VBOT_4        (3<<8)	/* VMEbus back-off 4us */
-#define TSI148_LCSR_DCTL_VBOT_8        (4<<8)	/* VMEbus back-off 8us */
-#define TSI148_LCSR_DCTL_VBOT_16       (5<<8)	/* VMEbus back-off 16us */
-#define TSI148_LCSR_DCTL_VBOT_32       (6<<8)	/* VMEbus back-off 32us */
-#define TSI148_LCSR_DCTL_VBOT_64       (7<<8)	/* VMEbus back-off 64us */
-
-#define TSI148_LCSR_DCTL_PBKS_M        (7<<4)	/* PCI block size MASK */
-#define TSI148_LCSR_DCTL_PBKS_32       (0<<4)	/* PCI block size 32 bytes */
-#define TSI148_LCSR_DCTL_PBKS_64       (1<<4)	/* PCI block size 64 bytes */
-#define TSI148_LCSR_DCTL_PBKS_128      (2<<4)	/* PCI block size 128 bytes */
-#define TSI148_LCSR_DCTL_PBKS_256      (3<<4)	/* PCI block size 256 bytes */
-#define TSI148_LCSR_DCTL_PBKS_512      (4<<4)	/* PCI block size 512 bytes */
-#define TSI148_LCSR_DCTL_PBKS_1024     (5<<4)	/* PCI block size 1024 bytes */
-#define TSI148_LCSR_DCTL_PBKS_2048     (6<<4)	/* PCI block size 2048 bytes */
-#define TSI148_LCSR_DCTL_PBKS_4096     (7<<4)	/* PCI block size 4096 bytes */
-
-#define TSI148_LCSR_DCTL_PBOT_M        (7<<0)	/* PCI back off MASK */
-#define TSI148_LCSR_DCTL_PBOT_0        (0<<0)	/* PCI back off 0us */
-#define TSI148_LCSR_DCTL_PBOT_1        (1<<0)	/* PCI back off 1us */
-#define TSI148_LCSR_DCTL_PBOT_2        (2<<0)	/* PCI back off 2us */
-#define TSI148_LCSR_DCTL_PBOT_4        (3<<0)	/* PCI back off 3us */
-#define TSI148_LCSR_DCTL_PBOT_8        (4<<0)	/* PCI back off 4us */
-#define TSI148_LCSR_DCTL_PBOT_16       (5<<0)	/* PCI back off 8us */
-#define TSI148_LCSR_DCTL_PBOT_32       (6<<0)	/* PCI back off 16us */
-#define TSI148_LCSR_DCTL_PBOT_64       (7<<0)	/* PCI back off 32us */
-
-/*
- *  DMA Status Registers (0-1)  CRG + $504
- */
-#define TSI148_LCSR_DSTA_SMA           (1<<31)	/* PCI Signalled Master Abt */
-#define TSI148_LCSR_DSTA_RTA           (1<<30)	/* PCI Received Target Abt */
-#define TSI148_LCSR_DSTA_MRC           (1<<29)	/* PCI Max Retry Count */
-#define TSI148_LCSR_DSTA_VBE           (1<<28)	/* VMEbus error */
-#define TSI148_LCSR_DSTA_ABT           (1<<27)	/* Abort */
-#define TSI148_LCSR_DSTA_PAU           (1<<26)	/* Pause */
-#define TSI148_LCSR_DSTA_DON           (1<<25)	/* Done */
-#define TSI148_LCSR_DSTA_BSY           (1<<24)	/* Busy */
-
-/*
- *  DMA Current Link Address Lower (0-1)
- */
-#define TSI148_LCSR_DCLAL_M            (0x3FFFFFF<<6)	/* Mask */
-
-/*
- *  DMA Source Attribute (0-1) Reg
- */
-#define TSI148_LCSR_DSAT_TYP_M         (3<<28)	/* Source Bus Type */
-#define TSI148_LCSR_DSAT_TYP_PCI       (0<<28)	/* PCI Bus */
-#define TSI148_LCSR_DSAT_TYP_VME       (1<<28)	/* VMEbus */
-#define TSI148_LCSR_DSAT_TYP_PAT       (2<<28)	/* Data Pattern */
-
-#define TSI148_LCSR_DSAT_PSZ           (1<<25)	/* Pattern Size */
-#define TSI148_LCSR_DSAT_NIN           (1<<24)	/* No Increment */
-
-#define TSI148_LCSR_DSAT_2eSSTM_M      (3<<11)	/* 2eSST Trans Rate Mask */
-#define TSI148_LCSR_DSAT_2eSSTM_160    (0<<11)	/* 160 MB/s */
-#define TSI148_LCSR_DSAT_2eSSTM_267    (1<<11)	/* 267 MB/s */
-#define TSI148_LCSR_DSAT_2eSSTM_320    (2<<11)	/* 320 MB/s */
-
-#define TSI148_LCSR_DSAT_TM_M          (7<<8)	/* Bus Transfer Protocol Mask */
-#define TSI148_LCSR_DSAT_TM_SCT        (0<<8)	/* SCT */
-#define TSI148_LCSR_DSAT_TM_BLT        (1<<8)	/* BLT */
-#define TSI148_LCSR_DSAT_TM_MBLT       (2<<8)	/* MBLT */
-#define TSI148_LCSR_DSAT_TM_2eVME      (3<<8)	/* 2eVME */
-#define TSI148_LCSR_DSAT_TM_2eSST      (4<<8)	/* 2eSST */
-#define TSI148_LCSR_DSAT_TM_2eSSTB     (5<<8)	/* 2eSST Broadcast */
-
-#define TSI148_LCSR_DSAT_DBW_M         (3<<6)	/* Max Data Width MASK */
-#define TSI148_LCSR_DSAT_DBW_16        (0<<6)	/* 16 Bits */
-#define TSI148_LCSR_DSAT_DBW_32        (1<<6)	/* 32 Bits */
-
-#define TSI148_LCSR_DSAT_SUP           (1<<5)	/* Supervisory Mode */
-#define TSI148_LCSR_DSAT_PGM           (1<<4)	/* Program Mode */
-
-#define TSI148_LCSR_DSAT_AMODE_M       (0xf<<0)	/* Address Space Mask */
-#define TSI148_LCSR_DSAT_AMODE_A16     (0<<0)	/* A16 */
-#define TSI148_LCSR_DSAT_AMODE_A24     (1<<0)	/* A24 */
-#define TSI148_LCSR_DSAT_AMODE_A32     (2<<0)	/* A32 */
-#define TSI148_LCSR_DSAT_AMODE_A64     (4<<0)	/* A64 */
-#define TSI148_LCSR_DSAT_AMODE_CRCSR   (5<<0)	/* CR/CSR */
-#define TSI148_LCSR_DSAT_AMODE_USER1   (8<<0)	/* User1 */
-#define TSI148_LCSR_DSAT_AMODE_USER2   (9<<0)	/* User2 */
-#define TSI148_LCSR_DSAT_AMODE_USER3   (0xa<<0)	/* User3 */
-#define TSI148_LCSR_DSAT_AMODE_USER4   (0xb<<0)	/* User4 */
-
-/*
- *  DMA Destination Attribute Registers (0-1)
- */
-#define TSI148_LCSR_DDAT_TYP_PCI       (0<<28)	/* Destination PCI Bus  */
-#define TSI148_LCSR_DDAT_TYP_VME       (1<<28)	/* Destination VMEbus */
-
-#define TSI148_LCSR_DDAT_2eSSTM_M      (3<<11)	/* 2eSST Transfer Rate Mask */
-#define TSI148_LCSR_DDAT_2eSSTM_160    (0<<11)	/* 160 MB/s */
-#define TSI148_LCSR_DDAT_2eSSTM_267    (1<<11)	/* 267 MB/s */
-#define TSI148_LCSR_DDAT_2eSSTM_320    (2<<11)	/* 320 MB/s */
-
-#define TSI148_LCSR_DDAT_TM_M          (7<<8)	/* Bus Transfer Protocol Mask */
-#define TSI148_LCSR_DDAT_TM_SCT        (0<<8)	/* SCT */
-#define TSI148_LCSR_DDAT_TM_BLT        (1<<8)	/* BLT */
-#define TSI148_LCSR_DDAT_TM_MBLT       (2<<8)	/* MBLT */
-#define TSI148_LCSR_DDAT_TM_2eVME      (3<<8)	/* 2eVME */
-#define TSI148_LCSR_DDAT_TM_2eSST      (4<<8)	/* 2eSST */
-#define TSI148_LCSR_DDAT_TM_2eSSTB     (5<<8)	/* 2eSST Broadcast */
-
-#define TSI148_LCSR_DDAT_DBW_M         (3<<6)	/* Max Data Width MASK */
-#define TSI148_LCSR_DDAT_DBW_16        (0<<6)	/* 16 Bits */
-#define TSI148_LCSR_DDAT_DBW_32        (1<<6)	/* 32 Bits */
-
-#define TSI148_LCSR_DDAT_SUP           (1<<5)	/* Supervisory/User Access */
-#define TSI148_LCSR_DDAT_PGM           (1<<4)	/* Program/Data Access */
-
-#define TSI148_LCSR_DDAT_AMODE_M       (0xf<<0)	/* Address Space Mask */
-#define TSI148_LCSR_DDAT_AMODE_A16      (0<<0)	/* A16 */
-#define TSI148_LCSR_DDAT_AMODE_A24      (1<<0)	/* A24 */
-#define TSI148_LCSR_DDAT_AMODE_A32      (2<<0)	/* A32 */
-#define TSI148_LCSR_DDAT_AMODE_A64      (4<<0)	/* A64 */
-#define TSI148_LCSR_DDAT_AMODE_CRCSR   (5<<0)	/* CRC/SR */
-#define TSI148_LCSR_DDAT_AMODE_USER1   (8<<0)	/* User1 */
-#define TSI148_LCSR_DDAT_AMODE_USER2   (9<<0)	/* User2 */
-#define TSI148_LCSR_DDAT_AMODE_USER3   (0xa<<0)	/* User3 */
-#define TSI148_LCSR_DDAT_AMODE_USER4   (0xb<<0)	/* User4 */
-
-/*
- *  DMA Next Link Address Lower
- */
-#define TSI148_LCSR_DNLAL_DNLAL_M      (0x3FFFFFF<<6)	/* Address Mask */
-#define TSI148_LCSR_DNLAL_LLA          (1<<0)  /* Last Link Address Indicator */
-
-/*
- *  DMA 2eSST Broadcast Select
- */
-#define TSI148_LCSR_DBS_M              (0x1FFFFF<<0)	/* Mask */
-
-/*
- *  GCSR Register Group
- */
-
-/*
- *  GCSR Control and Status Register  CRG + $604
- */
-#define TSI148_GCSR_GCTRL_LRST         (1<<15)	/* Local Reset */
-#define TSI148_GCSR_GCTRL_SFAILEN      (1<<14)	/* System Fail enable */
-#define TSI148_GCSR_GCTRL_BDFAILS      (1<<13)	/* Board Fail Status */
-#define TSI148_GCSR_GCTRL_SCON         (1<<12)	/* System Copntroller */
-#define TSI148_GCSR_GCTRL_MEN          (1<<11)	/* Module Enable (READY) */
-
-#define TSI148_GCSR_GCTRL_LMI3S        (1<<7)	/* Loc Monitor 3 Int Status */
-#define TSI148_GCSR_GCTRL_LMI2S        (1<<6)	/* Loc Monitor 2 Int Status */
-#define TSI148_GCSR_GCTRL_LMI1S        (1<<5)	/* Loc Monitor 1 Int Status */
-#define TSI148_GCSR_GCTRL_LMI0S        (1<<4)	/* Loc Monitor 0 Int Status */
-#define TSI148_GCSR_GCTRL_MBI3S        (1<<3)	/* Mail box 3 Int Status */
-#define TSI148_GCSR_GCTRL_MBI2S        (1<<2)	/* Mail box 2 Int Status */
-#define TSI148_GCSR_GCTRL_MBI1S        (1<<1)	/* Mail box 1 Int Status */
-#define TSI148_GCSR_GCTRL_MBI0S        (1<<0)	/* Mail box 0 Int Status */
-
-#define TSI148_GCSR_GAP                (1<<5)	/* Geographic Addr Parity */
-#define TSI148_GCSR_GA_M               (0x1F<<0)  /* Geographic Address Mask */
-
-/*
- *  CR/CSR Register Group
- */
-
-/*
- *  CR/CSR Bit Clear Register CRG + $FF4
- */
-#define TSI148_CRCSR_CSRBCR_LRSTC      (1<<7)	/* Local Reset Clear */
-#define TSI148_CRCSR_CSRBCR_SFAILC     (1<<6)	/* System Fail Enable Clear */
-#define TSI148_CRCSR_CSRBCR_BDFAILS    (1<<5)	/* Board Fail Status */
-#define TSI148_CRCSR_CSRBCR_MENC       (1<<4)	/* Module Enable Clear */
-#define TSI148_CRCSR_CSRBCR_BERRSC     (1<<3)	/* Bus Error Status Clear */
-
-/*
- *  CR/CSR Bit Set Register CRG+$FF8
- */
-#define TSI148_CRCSR_CSRBSR_LISTS      (1<<7)	/* Local Reset Clear */
-#define TSI148_CRCSR_CSRBSR_SFAILS     (1<<6)	/* System Fail Enable Clear */
-#define TSI148_CRCSR_CSRBSR_BDFAILS    (1<<5)	/* Board Fail Status */
-#define TSI148_CRCSR_CSRBSR_MENS       (1<<4)	/* Module Enable Clear */
-#define TSI148_CRCSR_CSRBSR_BERRS      (1<<3)	/* Bus Error Status Clear */
-
-/*
- *  CR/CSR Base Address Register CRG + FFC
- */
-#define TSI148_CRCSR_CBAR_M            (0x1F<<3)	/* Mask */
-
-#endif				/* TSI148_H */
diff --git a/drivers/vme/vme.c b/drivers/vme/vme.c
deleted file mode 100644
index 8dba20186be3..000000000000
--- a/drivers/vme/vme.c
+++ /dev/null
@@ -1,2015 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * VME Bridge Framework
- *
- * Author: Martyn Welch <martyn.welch@ge.com>
- * Copyright 2008 GE Intelligent Platforms Embedded Systems, Inc.
- *
- * Based on work by Tom Armistead and Ajit Prem
- * Copyright 2004 Motorola Inc.
- */
-
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/mm.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/pci.h>
-#include <linux/poll.h>
-#include <linux/highmem.h>
-#include <linux/interrupt.h>
-#include <linux/pagemap.h>
-#include <linux/device.h>
-#include <linux/dma-mapping.h>
-#include <linux/syscalls.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
-#include <linux/slab.h>
-#include <linux/vme.h>
-
-#include "vme_bridge.h"
-
-/* Bitmask and list of registered buses both protected by common mutex */
-static unsigned int vme_bus_numbers;
-static LIST_HEAD(vme_bus_list);
-static DEFINE_MUTEX(vme_buses_lock);
-
-static int __init vme_init(void);
-
-static struct vme_dev *dev_to_vme_dev(struct device *dev)
-{
-	return container_of(dev, struct vme_dev, dev);
-}
-
-/*
- * Find the bridge that the resource is associated with.
- */
-static struct vme_bridge *find_bridge(struct vme_resource *resource)
-{
-	/* Get list to search */
-	switch (resource->type) {
-	case VME_MASTER:
-		return list_entry(resource->entry, struct vme_master_resource,
-			list)->parent;
-	case VME_SLAVE:
-		return list_entry(resource->entry, struct vme_slave_resource,
-			list)->parent;
-	case VME_DMA:
-		return list_entry(resource->entry, struct vme_dma_resource,
-			list)->parent;
-	case VME_LM:
-		return list_entry(resource->entry, struct vme_lm_resource,
-			list)->parent;
-	default:
-		printk(KERN_ERR "Unknown resource type\n");
-		return NULL;
-	}
-}
-
-/**
- * vme_alloc_consistent - Allocate contiguous memory.
- * @resource: Pointer to VME resource.
- * @size: Size of allocation required.
- * @dma: Pointer to variable to store physical address of allocation.
- *
- * Allocate a contiguous block of memory for use by the driver. This is used to
- * create the buffers for the slave windows.
- *
- * Return: Virtual address of allocation on success, NULL on failure.
- */
-void *vme_alloc_consistent(struct vme_resource *resource, size_t size,
-	dma_addr_t *dma)
-{
-	struct vme_bridge *bridge;
-
-	if (!resource) {
-		printk(KERN_ERR "No resource\n");
-		return NULL;
-	}
-
-	bridge = find_bridge(resource);
-	if (!bridge) {
-		printk(KERN_ERR "Can't find bridge\n");
-		return NULL;
-	}
-
-	if (!bridge->parent) {
-		printk(KERN_ERR "Dev entry NULL for bridge %s\n", bridge->name);
-		return NULL;
-	}
-
-	if (!bridge->alloc_consistent) {
-		printk(KERN_ERR "alloc_consistent not supported by bridge %s\n",
-		       bridge->name);
-		return NULL;
-	}
-
-	return bridge->alloc_consistent(bridge->parent, size, dma);
-}
-EXPORT_SYMBOL(vme_alloc_consistent);
-
-/**
- * vme_free_consistent - Free previously allocated memory.
- * @resource: Pointer to VME resource.
- * @size: Size of allocation to free.
- * @vaddr: Virtual address of allocation.
- * @dma: Physical address of allocation.
- *
- * Free previously allocated block of contiguous memory.
- */
-void vme_free_consistent(struct vme_resource *resource, size_t size,
-	void *vaddr, dma_addr_t dma)
-{
-	struct vme_bridge *bridge;
-
-	if (!resource) {
-		printk(KERN_ERR "No resource\n");
-		return;
-	}
-
-	bridge = find_bridge(resource);
-	if (!bridge) {
-		printk(KERN_ERR "Can't find bridge\n");
-		return;
-	}
-
-	if (!bridge->parent) {
-		printk(KERN_ERR "Dev entry NULL for bridge %s\n", bridge->name);
-		return;
-	}
-
-	if (!bridge->free_consistent) {
-		printk(KERN_ERR "free_consistent not supported by bridge %s\n",
-		       bridge->name);
-		return;
-	}
-
-	bridge->free_consistent(bridge->parent, size, vaddr, dma);
-}
-EXPORT_SYMBOL(vme_free_consistent);
-
-/**
- * vme_get_size - Helper function returning size of a VME window
- * @resource: Pointer to VME slave or master resource.
- *
- * Determine the size of the VME window provided. This is a helper
- * function, wrappering the call to vme_master_get or vme_slave_get
- * depending on the type of window resource handed to it.
- *
- * Return: Size of the window on success, zero on failure.
- */
-size_t vme_get_size(struct vme_resource *resource)
-{
-	int enabled, retval;
-	unsigned long long base, size;
-	dma_addr_t buf_base;
-	u32 aspace, cycle, dwidth;
-
-	switch (resource->type) {
-	case VME_MASTER:
-		retval = vme_master_get(resource, &enabled, &base, &size,
-			&aspace, &cycle, &dwidth);
-		if (retval)
-			return 0;
-
-		return size;
-	case VME_SLAVE:
-		retval = vme_slave_get(resource, &enabled, &base, &size,
-			&buf_base, &aspace, &cycle);
-		if (retval)
-			return 0;
-
-		return size;
-	case VME_DMA:
-		return 0;
-	default:
-		printk(KERN_ERR "Unknown resource type\n");
-		return 0;
-	}
-}
-EXPORT_SYMBOL(vme_get_size);
-
-int vme_check_window(u32 aspace, unsigned long long vme_base,
-		     unsigned long long size)
-{
-	int retval = 0;
-
-	if (vme_base + size < size)
-		return -EINVAL;
-
-	switch (aspace) {
-	case VME_A16:
-		if (vme_base + size > VME_A16_MAX)
-			retval = -EFAULT;
-		break;
-	case VME_A24:
-		if (vme_base + size > VME_A24_MAX)
-			retval = -EFAULT;
-		break;
-	case VME_A32:
-		if (vme_base + size > VME_A32_MAX)
-			retval = -EFAULT;
-		break;
-	case VME_A64:
-		/* The VME_A64_MAX limit is actually U64_MAX + 1 */
-		break;
-	case VME_CRCSR:
-		if (vme_base + size > VME_CRCSR_MAX)
-			retval = -EFAULT;
-		break;
-	case VME_USER1:
-	case VME_USER2:
-	case VME_USER3:
-	case VME_USER4:
-		/* User Defined */
-		break;
-	default:
-		printk(KERN_ERR "Invalid address space\n");
-		retval = -EINVAL;
-		break;
-	}
-
-	return retval;
-}
-EXPORT_SYMBOL(vme_check_window);
-
-static u32 vme_get_aspace(int am)
-{
-	switch (am) {
-	case 0x29:
-	case 0x2D:
-		return VME_A16;
-	case 0x38:
-	case 0x39:
-	case 0x3A:
-	case 0x3B:
-	case 0x3C:
-	case 0x3D:
-	case 0x3E:
-	case 0x3F:
-		return VME_A24;
-	case 0x8:
-	case 0x9:
-	case 0xA:
-	case 0xB:
-	case 0xC:
-	case 0xD:
-	case 0xE:
-	case 0xF:
-		return VME_A32;
-	case 0x0:
-	case 0x1:
-	case 0x3:
-		return VME_A64;
-	}
-
-	return 0;
-}
-
-/**
- * vme_slave_request - Request a VME slave window resource.
- * @vdev: Pointer to VME device struct vme_dev assigned to driver instance.
- * @address: Required VME address space.
- * @cycle: Required VME data transfer cycle type.
- *
- * Request use of a VME window resource capable of being set for the requested
- * address space and data transfer cycle.
- *
- * Return: Pointer to VME resource on success, NULL on failure.
- */
-struct vme_resource *vme_slave_request(struct vme_dev *vdev, u32 address,
-	u32 cycle)
-{
-	struct vme_bridge *bridge;
-	struct list_head *slave_pos = NULL;
-	struct vme_slave_resource *allocated_image = NULL;
-	struct vme_slave_resource *slave_image = NULL;
-	struct vme_resource *resource = NULL;
-
-	bridge = vdev->bridge;
-	if (!bridge) {
-		printk(KERN_ERR "Can't find VME bus\n");
-		goto err_bus;
-	}
-
-	/* Loop through slave resources */
-	list_for_each(slave_pos, &bridge->slave_resources) {
-		slave_image = list_entry(slave_pos,
-			struct vme_slave_resource, list);
-
-		if (!slave_image) {
-			printk(KERN_ERR "Registered NULL Slave resource\n");
-			continue;
-		}
-
-		/* Find an unlocked and compatible image */
-		mutex_lock(&slave_image->mtx);
-		if (((slave_image->address_attr & address) == address) &&
-			((slave_image->cycle_attr & cycle) == cycle) &&
-			(slave_image->locked == 0)) {
-
-			slave_image->locked = 1;
-			mutex_unlock(&slave_image->mtx);
-			allocated_image = slave_image;
-			break;
-		}
-		mutex_unlock(&slave_image->mtx);
-	}
-
-	/* No free image */
-	if (!allocated_image)
-		goto err_image;
-
-	resource = kmalloc(sizeof(*resource), GFP_KERNEL);
-	if (!resource)
-		goto err_alloc;
-
-	resource->type = VME_SLAVE;
-	resource->entry = &allocated_image->list;
-
-	return resource;
-
-err_alloc:
-	/* Unlock image */
-	mutex_lock(&slave_image->mtx);
-	slave_image->locked = 0;
-	mutex_unlock(&slave_image->mtx);
-err_image:
-err_bus:
-	return NULL;
-}
-EXPORT_SYMBOL(vme_slave_request);
-
-/**
- * vme_slave_set - Set VME slave window configuration.
- * @resource: Pointer to VME slave resource.
- * @enabled: State to which the window should be configured.
- * @vme_base: Base address for the window.
- * @size: Size of the VME window.
- * @buf_base: Based address of buffer used to provide VME slave window storage.
- * @aspace: VME address space for the VME window.
- * @cycle: VME data transfer cycle type for the VME window.
- *
- * Set configuration for provided VME slave window.
- *
- * Return: Zero on success, -EINVAL if operation is not supported on this
- *         device, if an invalid resource has been provided or invalid
- *         attributes are provided. Hardware specific errors may also be
- *         returned.
- */
-int vme_slave_set(struct vme_resource *resource, int enabled,
-	unsigned long long vme_base, unsigned long long size,
-	dma_addr_t buf_base, u32 aspace, u32 cycle)
-{
-	struct vme_bridge *bridge = find_bridge(resource);
-	struct vme_slave_resource *image;
-	int retval;
-
-	if (resource->type != VME_SLAVE) {
-		printk(KERN_ERR "Not a slave resource\n");
-		return -EINVAL;
-	}
-
-	image = list_entry(resource->entry, struct vme_slave_resource, list);
-
-	if (!bridge->slave_set) {
-		printk(KERN_ERR "Function not supported\n");
-		return -ENOSYS;
-	}
-
-	if (!(((image->address_attr & aspace) == aspace) &&
-		((image->cycle_attr & cycle) == cycle))) {
-		printk(KERN_ERR "Invalid attributes\n");
-		return -EINVAL;
-	}
-
-	retval = vme_check_window(aspace, vme_base, size);
-	if (retval)
-		return retval;
-
-	return bridge->slave_set(image, enabled, vme_base, size, buf_base,
-		aspace, cycle);
-}
-EXPORT_SYMBOL(vme_slave_set);
-
-/**
- * vme_slave_get - Retrieve VME slave window configuration.
- * @resource: Pointer to VME slave resource.
- * @enabled: Pointer to variable for storing state.
- * @vme_base: Pointer to variable for storing window base address.
- * @size: Pointer to variable for storing window size.
- * @buf_base: Pointer to variable for storing slave buffer base address.
- * @aspace: Pointer to variable for storing VME address space.
- * @cycle: Pointer to variable for storing VME data transfer cycle type.
- *
- * Return configuration for provided VME slave window.
- *
- * Return: Zero on success, -EINVAL if operation is not supported on this
- *         device or if an invalid resource has been provided.
- */
-int vme_slave_get(struct vme_resource *resource, int *enabled,
-	unsigned long long *vme_base, unsigned long long *size,
-	dma_addr_t *buf_base, u32 *aspace, u32 *cycle)
-{
-	struct vme_bridge *bridge = find_bridge(resource);
-	struct vme_slave_resource *image;
-
-	if (resource->type != VME_SLAVE) {
-		printk(KERN_ERR "Not a slave resource\n");
-		return -EINVAL;
-	}
-
-	image = list_entry(resource->entry, struct vme_slave_resource, list);
-
-	if (!bridge->slave_get) {
-		printk(KERN_ERR "vme_slave_get not supported\n");
-		return -EINVAL;
-	}
-
-	return bridge->slave_get(image, enabled, vme_base, size, buf_base,
-		aspace, cycle);
-}
-EXPORT_SYMBOL(vme_slave_get);
-
-/**
- * vme_slave_free - Free VME slave window
- * @resource: Pointer to VME slave resource.
- *
- * Free the provided slave resource so that it may be reallocated.
- */
-void vme_slave_free(struct vme_resource *resource)
-{
-	struct vme_slave_resource *slave_image;
-
-	if (resource->type != VME_SLAVE) {
-		printk(KERN_ERR "Not a slave resource\n");
-		return;
-	}
-
-	slave_image = list_entry(resource->entry, struct vme_slave_resource,
-		list);
-	if (!slave_image) {
-		printk(KERN_ERR "Can't find slave resource\n");
-		return;
-	}
-
-	/* Unlock image */
-	mutex_lock(&slave_image->mtx);
-	if (slave_image->locked == 0)
-		printk(KERN_ERR "Image is already free\n");
-
-	slave_image->locked = 0;
-	mutex_unlock(&slave_image->mtx);
-
-	/* Free up resource memory */
-	kfree(resource);
-}
-EXPORT_SYMBOL(vme_slave_free);
-
-/**
- * vme_master_request - Request a VME master window resource.
- * @vdev: Pointer to VME device struct vme_dev assigned to driver instance.
- * @address: Required VME address space.
- * @cycle: Required VME data transfer cycle type.
- * @dwidth: Required VME data transfer width.
- *
- * Request use of a VME window resource capable of being set for the requested
- * address space, data transfer cycle and width.
- *
- * Return: Pointer to VME resource on success, NULL on failure.
- */
-struct vme_resource *vme_master_request(struct vme_dev *vdev, u32 address,
-	u32 cycle, u32 dwidth)
-{
-	struct vme_bridge *bridge;
-	struct list_head *master_pos = NULL;
-	struct vme_master_resource *allocated_image = NULL;
-	struct vme_master_resource *master_image = NULL;
-	struct vme_resource *resource = NULL;
-
-	bridge = vdev->bridge;
-	if (!bridge) {
-		printk(KERN_ERR "Can't find VME bus\n");
-		goto err_bus;
-	}
-
-	/* Loop through master resources */
-	list_for_each(master_pos, &bridge->master_resources) {
-		master_image = list_entry(master_pos,
-			struct vme_master_resource, list);
-
-		if (!master_image) {
-			printk(KERN_WARNING "Registered NULL master resource\n");
-			continue;
-		}
-
-		/* Find an unlocked and compatible image */
-		spin_lock(&master_image->lock);
-		if (((master_image->address_attr & address) == address) &&
-			((master_image->cycle_attr & cycle) == cycle) &&
-			((master_image->width_attr & dwidth) == dwidth) &&
-			(master_image->locked == 0)) {
-
-			master_image->locked = 1;
-			spin_unlock(&master_image->lock);
-			allocated_image = master_image;
-			break;
-		}
-		spin_unlock(&master_image->lock);
-	}
-
-	/* Check to see if we found a resource */
-	if (!allocated_image) {
-		printk(KERN_ERR "Can't find a suitable resource\n");
-		goto err_image;
-	}
-
-	resource = kmalloc(sizeof(*resource), GFP_KERNEL);
-	if (!resource)
-		goto err_alloc;
-
-	resource->type = VME_MASTER;
-	resource->entry = &allocated_image->list;
-
-	return resource;
-
-err_alloc:
-	/* Unlock image */
-	spin_lock(&master_image->lock);
-	master_image->locked = 0;
-	spin_unlock(&master_image->lock);
-err_image:
-err_bus:
-	return NULL;
-}
-EXPORT_SYMBOL(vme_master_request);
-
-/**
- * vme_master_set - Set VME master window configuration.
- * @resource: Pointer to VME master resource.
- * @enabled: State to which the window should be configured.
- * @vme_base: Base address for the window.
- * @size: Size of the VME window.
- * @aspace: VME address space for the VME window.
- * @cycle: VME data transfer cycle type for the VME window.
- * @dwidth: VME data transfer width for the VME window.
- *
- * Set configuration for provided VME master window.
- *
- * Return: Zero on success, -EINVAL if operation is not supported on this
- *         device, if an invalid resource has been provided or invalid
- *         attributes are provided. Hardware specific errors may also be
- *         returned.
- */
-int vme_master_set(struct vme_resource *resource, int enabled,
-	unsigned long long vme_base, unsigned long long size, u32 aspace,
-	u32 cycle, u32 dwidth)
-{
-	struct vme_bridge *bridge = find_bridge(resource);
-	struct vme_master_resource *image;
-	int retval;
-
-	if (resource->type != VME_MASTER) {
-		printk(KERN_ERR "Not a master resource\n");
-		return -EINVAL;
-	}
-
-	image = list_entry(resource->entry, struct vme_master_resource, list);
-
-	if (!bridge->master_set) {
-		printk(KERN_WARNING "vme_master_set not supported\n");
-		return -EINVAL;
-	}
-
-	if (!(((image->address_attr & aspace) == aspace) &&
-		((image->cycle_attr & cycle) == cycle) &&
-		((image->width_attr & dwidth) == dwidth))) {
-		printk(KERN_WARNING "Invalid attributes\n");
-		return -EINVAL;
-	}
-
-	retval = vme_check_window(aspace, vme_base, size);
-	if (retval)
-		return retval;
-
-	return bridge->master_set(image, enabled, vme_base, size, aspace,
-		cycle, dwidth);
-}
-EXPORT_SYMBOL(vme_master_set);
-
-/**
- * vme_master_get - Retrieve VME master window configuration.
- * @resource: Pointer to VME master resource.
- * @enabled: Pointer to variable for storing state.
- * @vme_base: Pointer to variable for storing window base address.
- * @size: Pointer to variable for storing window size.
- * @aspace: Pointer to variable for storing VME address space.
- * @cycle: Pointer to variable for storing VME data transfer cycle type.
- * @dwidth: Pointer to variable for storing VME data transfer width.
- *
- * Return configuration for provided VME master window.
- *
- * Return: Zero on success, -EINVAL if operation is not supported on this
- *         device or if an invalid resource has been provided.
- */
-int vme_master_get(struct vme_resource *resource, int *enabled,
-	unsigned long long *vme_base, unsigned long long *size, u32 *aspace,
-	u32 *cycle, u32 *dwidth)
-{
-	struct vme_bridge *bridge = find_bridge(resource);
-	struct vme_master_resource *image;
-
-	if (resource->type != VME_MASTER) {
-		printk(KERN_ERR "Not a master resource\n");
-		return -EINVAL;
-	}
-
-	image = list_entry(resource->entry, struct vme_master_resource, list);
-
-	if (!bridge->master_get) {
-		printk(KERN_WARNING "%s not supported\n", __func__);
-		return -EINVAL;
-	}
-
-	return bridge->master_get(image, enabled, vme_base, size, aspace,
-		cycle, dwidth);
-}
-EXPORT_SYMBOL(vme_master_get);
-
-/**
- * vme_master_read - Read data from VME space into a buffer.
- * @resource: Pointer to VME master resource.
- * @buf: Pointer to buffer where data should be transferred.
- * @count: Number of bytes to transfer.
- * @offset: Offset into VME master window at which to start transfer.
- *
- * Perform read of count bytes of data from location on VME bus which maps into
- * the VME master window at offset to buf.
- *
- * Return: Number of bytes read, -EINVAL if resource is not a VME master
- *         resource or read operation is not supported. -EFAULT returned if
- *         invalid offset is provided. Hardware specific errors may also be
- *         returned.
- */
-ssize_t vme_master_read(struct vme_resource *resource, void *buf, size_t count,
-	loff_t offset)
-{
-	struct vme_bridge *bridge = find_bridge(resource);
-	struct vme_master_resource *image;
-	size_t length;
-
-	if (!bridge->master_read) {
-		printk(KERN_WARNING "Reading from resource not supported\n");
-		return -EINVAL;
-	}
-
-	if (resource->type != VME_MASTER) {
-		printk(KERN_ERR "Not a master resource\n");
-		return -EINVAL;
-	}
-
-	image = list_entry(resource->entry, struct vme_master_resource, list);
-
-	length = vme_get_size(resource);
-
-	if (offset > length) {
-		printk(KERN_WARNING "Invalid Offset\n");
-		return -EFAULT;
-	}
-
-	if ((offset + count) > length)
-		count = length - offset;
-
-	return bridge->master_read(image, buf, count, offset);
-
-}
-EXPORT_SYMBOL(vme_master_read);
-
-/**
- * vme_master_write - Write data out to VME space from a buffer.
- * @resource: Pointer to VME master resource.
- * @buf: Pointer to buffer holding data to transfer.
- * @count: Number of bytes to transfer.
- * @offset: Offset into VME master window at which to start transfer.
- *
- * Perform write of count bytes of data from buf to location on VME bus which
- * maps into the VME master window at offset.
- *
- * Return: Number of bytes written, -EINVAL if resource is not a VME master
- *         resource or write operation is not supported. -EFAULT returned if
- *         invalid offset is provided. Hardware specific errors may also be
- *         returned.
- */
-ssize_t vme_master_write(struct vme_resource *resource, void *buf,
-	size_t count, loff_t offset)
-{
-	struct vme_bridge *bridge = find_bridge(resource);
-	struct vme_master_resource *image;
-	size_t length;
-
-	if (!bridge->master_write) {
-		printk(KERN_WARNING "Writing to resource not supported\n");
-		return -EINVAL;
-	}
-
-	if (resource->type != VME_MASTER) {
-		printk(KERN_ERR "Not a master resource\n");
-		return -EINVAL;
-	}
-
-	image = list_entry(resource->entry, struct vme_master_resource, list);
-
-	length = vme_get_size(resource);
-
-	if (offset > length) {
-		printk(KERN_WARNING "Invalid Offset\n");
-		return -EFAULT;
-	}
-
-	if ((offset + count) > length)
-		count = length - offset;
-
-	return bridge->master_write(image, buf, count, offset);
-}
-EXPORT_SYMBOL(vme_master_write);
-
-/**
- * vme_master_rmw - Perform read-modify-write cycle.
- * @resource: Pointer to VME master resource.
- * @mask: Bits to be compared and swapped in operation.
- * @compare: Bits to be compared with data read from offset.
- * @swap: Bits to be swapped in data read from offset.
- * @offset: Offset into VME master window at which to perform operation.
- *
- * Perform read-modify-write cycle on provided location:
- * - Location on VME bus is read.
- * - Bits selected by mask are compared with compare.
- * - Where a selected bit matches that in compare and are selected in swap,
- * the bit is swapped.
- * - Result written back to location on VME bus.
- *
- * Return: Bytes written on success, -EINVAL if resource is not a VME master
- *         resource or RMW operation is not supported. Hardware specific
- *         errors may also be returned.
- */
-unsigned int vme_master_rmw(struct vme_resource *resource, unsigned int mask,
-	unsigned int compare, unsigned int swap, loff_t offset)
-{
-	struct vme_bridge *bridge = find_bridge(resource);
-	struct vme_master_resource *image;
-
-	if (!bridge->master_rmw) {
-		printk(KERN_WARNING "Writing to resource not supported\n");
-		return -EINVAL;
-	}
-
-	if (resource->type != VME_MASTER) {
-		printk(KERN_ERR "Not a master resource\n");
-		return -EINVAL;
-	}
-
-	image = list_entry(resource->entry, struct vme_master_resource, list);
-
-	return bridge->master_rmw(image, mask, compare, swap, offset);
-}
-EXPORT_SYMBOL(vme_master_rmw);
-
-/**
- * vme_master_mmap - Mmap region of VME master window.
- * @resource: Pointer to VME master resource.
- * @vma: Pointer to definition of user mapping.
- *
- * Memory map a region of the VME master window into user space.
- *
- * Return: Zero on success, -EINVAL if resource is not a VME master
- *         resource or -EFAULT if map exceeds window size. Other generic mmap
- *         errors may also be returned.
- */
-int vme_master_mmap(struct vme_resource *resource, struct vm_area_struct *vma)
-{
-	struct vme_master_resource *image;
-	phys_addr_t phys_addr;
-	unsigned long vma_size;
-
-	if (resource->type != VME_MASTER) {
-		pr_err("Not a master resource\n");
-		return -EINVAL;
-	}
-
-	image = list_entry(resource->entry, struct vme_master_resource, list);
-	phys_addr = image->bus_resource.start + (vma->vm_pgoff << PAGE_SHIFT);
-	vma_size = vma->vm_end - vma->vm_start;
-
-	if (phys_addr + vma_size > image->bus_resource.end + 1) {
-		pr_err("Map size cannot exceed the window size\n");
-		return -EFAULT;
-	}
-
-	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-
-	return vm_iomap_memory(vma, phys_addr, vma->vm_end - vma->vm_start);
-}
-EXPORT_SYMBOL(vme_master_mmap);
-
-/**
- * vme_master_free - Free VME master window
- * @resource: Pointer to VME master resource.
- *
- * Free the provided master resource so that it may be reallocated.
- */
-void vme_master_free(struct vme_resource *resource)
-{
-	struct vme_master_resource *master_image;
-
-	if (resource->type != VME_MASTER) {
-		printk(KERN_ERR "Not a master resource\n");
-		return;
-	}
-
-	master_image = list_entry(resource->entry, struct vme_master_resource,
-		list);
-	if (!master_image) {
-		printk(KERN_ERR "Can't find master resource\n");
-		return;
-	}
-
-	/* Unlock image */
-	spin_lock(&master_image->lock);
-	if (master_image->locked == 0)
-		printk(KERN_ERR "Image is already free\n");
-
-	master_image->locked = 0;
-	spin_unlock(&master_image->lock);
-
-	/* Free up resource memory */
-	kfree(resource);
-}
-EXPORT_SYMBOL(vme_master_free);
-
-/**
- * vme_dma_request - Request a DMA controller.
- * @vdev: Pointer to VME device struct vme_dev assigned to driver instance.
- * @route: Required src/destination combination.
- *
- * Request a VME DMA controller with capability to perform transfers bewteen
- * requested source/destination combination.
- *
- * Return: Pointer to VME DMA resource on success, NULL on failure.
- */
-struct vme_resource *vme_dma_request(struct vme_dev *vdev, u32 route)
-{
-	struct vme_bridge *bridge;
-	struct list_head *dma_pos = NULL;
-	struct vme_dma_resource *allocated_ctrlr = NULL;
-	struct vme_dma_resource *dma_ctrlr = NULL;
-	struct vme_resource *resource = NULL;
-
-	/* XXX Not checking resource attributes */
-	printk(KERN_ERR "No VME resource Attribute tests done\n");
-
-	bridge = vdev->bridge;
-	if (!bridge) {
-		printk(KERN_ERR "Can't find VME bus\n");
-		goto err_bus;
-	}
-
-	/* Loop through DMA resources */
-	list_for_each(dma_pos, &bridge->dma_resources) {
-		dma_ctrlr = list_entry(dma_pos,
-			struct vme_dma_resource, list);
-		if (!dma_ctrlr) {
-			printk(KERN_ERR "Registered NULL DMA resource\n");
-			continue;
-		}
-
-		/* Find an unlocked and compatible controller */
-		mutex_lock(&dma_ctrlr->mtx);
-		if (((dma_ctrlr->route_attr & route) == route) &&
-			(dma_ctrlr->locked == 0)) {
-
-			dma_ctrlr->locked = 1;
-			mutex_unlock(&dma_ctrlr->mtx);
-			allocated_ctrlr = dma_ctrlr;
-			break;
-		}
-		mutex_unlock(&dma_ctrlr->mtx);
-	}
-
-	/* Check to see if we found a resource */
-	if (!allocated_ctrlr)
-		goto err_ctrlr;
-
-	resource = kmalloc(sizeof(*resource), GFP_KERNEL);
-	if (!resource)
-		goto err_alloc;
-
-	resource->type = VME_DMA;
-	resource->entry = &allocated_ctrlr->list;
-
-	return resource;
-
-err_alloc:
-	/* Unlock image */
-	mutex_lock(&dma_ctrlr->mtx);
-	dma_ctrlr->locked = 0;
-	mutex_unlock(&dma_ctrlr->mtx);
-err_ctrlr:
-err_bus:
-	return NULL;
-}
-EXPORT_SYMBOL(vme_dma_request);
-
-/**
- * vme_new_dma_list - Create new VME DMA list.
- * @resource: Pointer to VME DMA resource.
- *
- * Create a new VME DMA list. It is the responsibility of the user to free
- * the list once it is no longer required with vme_dma_list_free().
- *
- * Return: Pointer to new VME DMA list, NULL on allocation failure or invalid
- *         VME DMA resource.
- */
-struct vme_dma_list *vme_new_dma_list(struct vme_resource *resource)
-{
-	struct vme_dma_list *dma_list;
-
-	if (resource->type != VME_DMA) {
-		printk(KERN_ERR "Not a DMA resource\n");
-		return NULL;
-	}
-
-	dma_list = kmalloc(sizeof(*dma_list), GFP_KERNEL);
-	if (!dma_list)
-		return NULL;
-
-	INIT_LIST_HEAD(&dma_list->entries);
-	dma_list->parent = list_entry(resource->entry,
-				      struct vme_dma_resource,
-				      list);
-	mutex_init(&dma_list->mtx);
-
-	return dma_list;
-}
-EXPORT_SYMBOL(vme_new_dma_list);
-
-/**
- * vme_dma_pattern_attribute - Create "Pattern" type VME DMA list attribute.
- * @pattern: Value to use used as pattern
- * @type: Type of pattern to be written.
- *
- * Create VME DMA list attribute for pattern generation. It is the
- * responsibility of the user to free used attributes using
- * vme_dma_free_attribute().
- *
- * Return: Pointer to VME DMA attribute, NULL on failure.
- */
-struct vme_dma_attr *vme_dma_pattern_attribute(u32 pattern, u32 type)
-{
-	struct vme_dma_attr *attributes;
-	struct vme_dma_pattern *pattern_attr;
-
-	attributes = kmalloc(sizeof(*attributes), GFP_KERNEL);
-	if (!attributes)
-		goto err_attr;
-
-	pattern_attr = kmalloc(sizeof(*pattern_attr), GFP_KERNEL);
-	if (!pattern_attr)
-		goto err_pat;
-
-	attributes->type = VME_DMA_PATTERN;
-	attributes->private = (void *)pattern_attr;
-
-	pattern_attr->pattern = pattern;
-	pattern_attr->type = type;
-
-	return attributes;
-
-err_pat:
-	kfree(attributes);
-err_attr:
-	return NULL;
-}
-EXPORT_SYMBOL(vme_dma_pattern_attribute);
-
-/**
- * vme_dma_pci_attribute - Create "PCI" type VME DMA list attribute.
- * @address: PCI base address for DMA transfer.
- *
- * Create VME DMA list attribute pointing to a location on PCI for DMA
- * transfers. It is the responsibility of the user to free used attributes
- * using vme_dma_free_attribute().
- *
- * Return: Pointer to VME DMA attribute, NULL on failure.
- */
-struct vme_dma_attr *vme_dma_pci_attribute(dma_addr_t address)
-{
-	struct vme_dma_attr *attributes;
-	struct vme_dma_pci *pci_attr;
-
-	/* XXX Run some sanity checks here */
-
-	attributes = kmalloc(sizeof(*attributes), GFP_KERNEL);
-	if (!attributes)
-		goto err_attr;
-
-	pci_attr = kmalloc(sizeof(*pci_attr), GFP_KERNEL);
-	if (!pci_attr)
-		goto err_pci;
-
-	attributes->type = VME_DMA_PCI;
-	attributes->private = (void *)pci_attr;
-
-	pci_attr->address = address;
-
-	return attributes;
-
-err_pci:
-	kfree(attributes);
-err_attr:
-	return NULL;
-}
-EXPORT_SYMBOL(vme_dma_pci_attribute);
-
-/**
- * vme_dma_vme_attribute - Create "VME" type VME DMA list attribute.
- * @address: VME base address for DMA transfer.
- * @aspace: VME address space to use for DMA transfer.
- * @cycle: VME bus cycle to use for DMA transfer.
- * @dwidth: VME data width to use for DMA transfer.
- *
- * Create VME DMA list attribute pointing to a location on the VME bus for DMA
- * transfers. It is the responsibility of the user to free used attributes
- * using vme_dma_free_attribute().
- *
- * Return: Pointer to VME DMA attribute, NULL on failure.
- */
-struct vme_dma_attr *vme_dma_vme_attribute(unsigned long long address,
-	u32 aspace, u32 cycle, u32 dwidth)
-{
-	struct vme_dma_attr *attributes;
-	struct vme_dma_vme *vme_attr;
-
-	attributes = kmalloc(sizeof(*attributes), GFP_KERNEL);
-	if (!attributes)
-		goto err_attr;
-
-	vme_attr = kmalloc(sizeof(*vme_attr), GFP_KERNEL);
-	if (!vme_attr)
-		goto err_vme;
-
-	attributes->type = VME_DMA_VME;
-	attributes->private = (void *)vme_attr;
-
-	vme_attr->address = address;
-	vme_attr->aspace = aspace;
-	vme_attr->cycle = cycle;
-	vme_attr->dwidth = dwidth;
-
-	return attributes;
-
-err_vme:
-	kfree(attributes);
-err_attr:
-	return NULL;
-}
-EXPORT_SYMBOL(vme_dma_vme_attribute);
-
-/**
- * vme_dma_free_attribute - Free DMA list attribute.
- * @attributes: Pointer to DMA list attribute.
- *
- * Free VME DMA list attribute. VME DMA list attributes can be safely freed
- * once vme_dma_list_add() has returned.
- */
-void vme_dma_free_attribute(struct vme_dma_attr *attributes)
-{
-	kfree(attributes->private);
-	kfree(attributes);
-}
-EXPORT_SYMBOL(vme_dma_free_attribute);
-
-/**
- * vme_dma_list_add - Add enty to a VME DMA list.
- * @list: Pointer to VME list.
- * @src: Pointer to DMA list attribute to use as source.
- * @dest: Pointer to DMA list attribute to use as destination.
- * @count: Number of bytes to transfer.
- *
- * Add an entry to the provided VME DMA list. Entry requires pointers to source
- * and destination DMA attributes and a count.
- *
- * Please note, the attributes supported as source and destinations for
- * transfers are hardware dependent.
- *
- * Return: Zero on success, -EINVAL if operation is not supported on this
- *         device or if the link list has already been submitted for execution.
- *         Hardware specific errors also possible.
- */
-int vme_dma_list_add(struct vme_dma_list *list, struct vme_dma_attr *src,
-	struct vme_dma_attr *dest, size_t count)
-{
-	struct vme_bridge *bridge = list->parent->parent;
-	int retval;
-
-	if (!bridge->dma_list_add) {
-		printk(KERN_WARNING "Link List DMA generation not supported\n");
-		return -EINVAL;
-	}
-
-	if (!mutex_trylock(&list->mtx)) {
-		printk(KERN_ERR "Link List already submitted\n");
-		return -EINVAL;
-	}
-
-	retval = bridge->dma_list_add(list, src, dest, count);
-
-	mutex_unlock(&list->mtx);
-
-	return retval;
-}
-EXPORT_SYMBOL(vme_dma_list_add);
-
-/**
- * vme_dma_list_exec - Queue a VME DMA list for execution.
- * @list: Pointer to VME list.
- *
- * Queue the provided VME DMA list for execution. The call will return once the
- * list has been executed.
- *
- * Return: Zero on success, -EINVAL if operation is not supported on this
- *         device. Hardware specific errors also possible.
- */
-int vme_dma_list_exec(struct vme_dma_list *list)
-{
-	struct vme_bridge *bridge = list->parent->parent;
-	int retval;
-
-	if (!bridge->dma_list_exec) {
-		printk(KERN_ERR "Link List DMA execution not supported\n");
-		return -EINVAL;
-	}
-
-	mutex_lock(&list->mtx);
-
-	retval = bridge->dma_list_exec(list);
-
-	mutex_unlock(&list->mtx);
-
-	return retval;
-}
-EXPORT_SYMBOL(vme_dma_list_exec);
-
-/**
- * vme_dma_list_free - Free a VME DMA list.
- * @list: Pointer to VME list.
- *
- * Free the provided DMA list and all its entries.
- *
- * Return: Zero on success, -EINVAL on invalid VME resource, -EBUSY if resource
- *         is still in use. Hardware specific errors also possible.
- */
-int vme_dma_list_free(struct vme_dma_list *list)
-{
-	struct vme_bridge *bridge = list->parent->parent;
-	int retval;
-
-	if (!bridge->dma_list_empty) {
-		printk(KERN_WARNING "Emptying of Link Lists not supported\n");
-		return -EINVAL;
-	}
-
-	if (!mutex_trylock(&list->mtx)) {
-		printk(KERN_ERR "Link List in use\n");
-		return -EBUSY;
-	}
-
-	/*
-	 * Empty out all of the entries from the DMA list. We need to go to the
-	 * low level driver as DMA entries are driver specific.
-	 */
-	retval = bridge->dma_list_empty(list);
-	if (retval) {
-		printk(KERN_ERR "Unable to empty link-list entries\n");
-		mutex_unlock(&list->mtx);
-		return retval;
-	}
-	mutex_unlock(&list->mtx);
-	kfree(list);
-
-	return retval;
-}
-EXPORT_SYMBOL(vme_dma_list_free);
-
-/**
- * vme_dma_free - Free a VME DMA resource.
- * @resource: Pointer to VME DMA resource.
- *
- * Free the provided DMA resource so that it may be reallocated.
- *
- * Return: Zero on success, -EINVAL on invalid VME resource, -EBUSY if resource
- *         is still active.
- */
-int vme_dma_free(struct vme_resource *resource)
-{
-	struct vme_dma_resource *ctrlr;
-
-	if (resource->type != VME_DMA) {
-		printk(KERN_ERR "Not a DMA resource\n");
-		return -EINVAL;
-	}
-
-	ctrlr = list_entry(resource->entry, struct vme_dma_resource, list);
-
-	if (!mutex_trylock(&ctrlr->mtx)) {
-		printk(KERN_ERR "Resource busy, can't free\n");
-		return -EBUSY;
-	}
-
-	if (!(list_empty(&ctrlr->pending) && list_empty(&ctrlr->running))) {
-		printk(KERN_WARNING "Resource still processing transfers\n");
-		mutex_unlock(&ctrlr->mtx);
-		return -EBUSY;
-	}
-
-	ctrlr->locked = 0;
-
-	mutex_unlock(&ctrlr->mtx);
-
-	kfree(resource);
-
-	return 0;
-}
-EXPORT_SYMBOL(vme_dma_free);
-
-void vme_bus_error_handler(struct vme_bridge *bridge,
-			   unsigned long long address, int am)
-{
-	struct list_head *handler_pos = NULL;
-	struct vme_error_handler *handler;
-	int handler_triggered = 0;
-	u32 aspace = vme_get_aspace(am);
-
-	list_for_each(handler_pos, &bridge->vme_error_handlers) {
-		handler = list_entry(handler_pos, struct vme_error_handler,
-				     list);
-		if ((aspace == handler->aspace) &&
-		    (address >= handler->start) &&
-		    (address < handler->end)) {
-			if (!handler->num_errors)
-				handler->first_error = address;
-			if (handler->num_errors != UINT_MAX)
-				handler->num_errors++;
-			handler_triggered = 1;
-		}
-	}
-
-	if (!handler_triggered)
-		dev_err(bridge->parent,
-			"Unhandled VME access error at address 0x%llx\n",
-			address);
-}
-EXPORT_SYMBOL(vme_bus_error_handler);
-
-struct vme_error_handler *vme_register_error_handler(
-	struct vme_bridge *bridge, u32 aspace,
-	unsigned long long address, size_t len)
-{
-	struct vme_error_handler *handler;
-
-	handler = kmalloc(sizeof(*handler), GFP_ATOMIC);
-	if (!handler)
-		return NULL;
-
-	handler->aspace = aspace;
-	handler->start = address;
-	handler->end = address + len;
-	handler->num_errors = 0;
-	handler->first_error = 0;
-	list_add_tail(&handler->list, &bridge->vme_error_handlers);
-
-	return handler;
-}
-EXPORT_SYMBOL(vme_register_error_handler);
-
-void vme_unregister_error_handler(struct vme_error_handler *handler)
-{
-	list_del(&handler->list);
-	kfree(handler);
-}
-EXPORT_SYMBOL(vme_unregister_error_handler);
-
-void vme_irq_handler(struct vme_bridge *bridge, int level, int statid)
-{
-	void (*call)(int, int, void *);
-	void *priv_data;
-
-	call = bridge->irq[level - 1].callback[statid].func;
-	priv_data = bridge->irq[level - 1].callback[statid].priv_data;
-	if (call)
-		call(level, statid, priv_data);
-	else
-		printk(KERN_WARNING "Spurious VME interrupt, level:%x, vector:%x\n",
-		       level, statid);
-}
-EXPORT_SYMBOL(vme_irq_handler);
-
-/**
- * vme_irq_request - Request a specific VME interrupt.
- * @vdev: Pointer to VME device struct vme_dev assigned to driver instance.
- * @level: Interrupt priority being requested.
- * @statid: Interrupt vector being requested.
- * @callback: Pointer to callback function called when VME interrupt/vector
- *            received.
- * @priv_data: Generic pointer that will be passed to the callback function.
- *
- * Request callback to be attached as a handler for VME interrupts with provided
- * level and statid.
- *
- * Return: Zero on success, -EINVAL on invalid vme device, level or if the
- *         function is not supported, -EBUSY if the level/statid combination is
- *         already in use. Hardware specific errors also possible.
- */
-int vme_irq_request(struct vme_dev *vdev, int level, int statid,
-	void (*callback)(int, int, void *),
-	void *priv_data)
-{
-	struct vme_bridge *bridge;
-
-	bridge = vdev->bridge;
-	if (!bridge) {
-		printk(KERN_ERR "Can't find VME bus\n");
-		return -EINVAL;
-	}
-
-	if ((level < 1) || (level > 7)) {
-		printk(KERN_ERR "Invalid interrupt level\n");
-		return -EINVAL;
-	}
-
-	if (!bridge->irq_set) {
-		printk(KERN_ERR "Configuring interrupts not supported\n");
-		return -EINVAL;
-	}
-
-	mutex_lock(&bridge->irq_mtx);
-
-	if (bridge->irq[level - 1].callback[statid].func) {
-		mutex_unlock(&bridge->irq_mtx);
-		printk(KERN_WARNING "VME Interrupt already taken\n");
-		return -EBUSY;
-	}
-
-	bridge->irq[level - 1].count++;
-	bridge->irq[level - 1].callback[statid].priv_data = priv_data;
-	bridge->irq[level - 1].callback[statid].func = callback;
-
-	/* Enable IRQ level */
-	bridge->irq_set(bridge, level, 1, 1);
-
-	mutex_unlock(&bridge->irq_mtx);
-
-	return 0;
-}
-EXPORT_SYMBOL(vme_irq_request);
-
-/**
- * vme_irq_free - Free a VME interrupt.
- * @vdev: Pointer to VME device struct vme_dev assigned to driver instance.
- * @level: Interrupt priority of interrupt being freed.
- * @statid: Interrupt vector of interrupt being freed.
- *
- * Remove previously attached callback from VME interrupt priority/vector.
- */
-void vme_irq_free(struct vme_dev *vdev, int level, int statid)
-{
-	struct vme_bridge *bridge;
-
-	bridge = vdev->bridge;
-	if (!bridge) {
-		printk(KERN_ERR "Can't find VME bus\n");
-		return;
-	}
-
-	if ((level < 1) || (level > 7)) {
-		printk(KERN_ERR "Invalid interrupt level\n");
-		return;
-	}
-
-	if (!bridge->irq_set) {
-		printk(KERN_ERR "Configuring interrupts not supported\n");
-		return;
-	}
-
-	mutex_lock(&bridge->irq_mtx);
-
-	bridge->irq[level - 1].count--;
-
-	/* Disable IRQ level if no more interrupts attached at this level*/
-	if (bridge->irq[level - 1].count == 0)
-		bridge->irq_set(bridge, level, 0, 1);
-
-	bridge->irq[level - 1].callback[statid].func = NULL;
-	bridge->irq[level - 1].callback[statid].priv_data = NULL;
-
-	mutex_unlock(&bridge->irq_mtx);
-}
-EXPORT_SYMBOL(vme_irq_free);
-
-/**
- * vme_irq_generate - Generate VME interrupt.
- * @vdev: Pointer to VME device struct vme_dev assigned to driver instance.
- * @level: Interrupt priority at which to assert the interrupt.
- * @statid: Interrupt vector to associate with the interrupt.
- *
- * Generate a VME interrupt of the provided level and with the provided
- * statid.
- *
- * Return: Zero on success, -EINVAL on invalid vme device, level or if the
- *         function is not supported. Hardware specific errors also possible.
- */
-int vme_irq_generate(struct vme_dev *vdev, int level, int statid)
-{
-	struct vme_bridge *bridge;
-
-	bridge = vdev->bridge;
-	if (!bridge) {
-		printk(KERN_ERR "Can't find VME bus\n");
-		return -EINVAL;
-	}
-
-	if ((level < 1) || (level > 7)) {
-		printk(KERN_WARNING "Invalid interrupt level\n");
-		return -EINVAL;
-	}
-
-	if (!bridge->irq_generate) {
-		printk(KERN_WARNING "Interrupt generation not supported\n");
-		return -EINVAL;
-	}
-
-	return bridge->irq_generate(bridge, level, statid);
-}
-EXPORT_SYMBOL(vme_irq_generate);
-
-/**
- * vme_lm_request - Request a VME location monitor
- * @vdev: Pointer to VME device struct vme_dev assigned to driver instance.
- *
- * Allocate a location monitor resource to the driver. A location monitor
- * allows the driver to monitor accesses to a contiguous number of
- * addresses on the VME bus.
- *
- * Return: Pointer to a VME resource on success or NULL on failure.
- */
-struct vme_resource *vme_lm_request(struct vme_dev *vdev)
-{
-	struct vme_bridge *bridge;
-	struct list_head *lm_pos = NULL;
-	struct vme_lm_resource *allocated_lm = NULL;
-	struct vme_lm_resource *lm = NULL;
-	struct vme_resource *resource = NULL;
-
-	bridge = vdev->bridge;
-	if (!bridge) {
-		printk(KERN_ERR "Can't find VME bus\n");
-		goto err_bus;
-	}
-
-	/* Loop through LM resources */
-	list_for_each(lm_pos, &bridge->lm_resources) {
-		lm = list_entry(lm_pos,
-			struct vme_lm_resource, list);
-		if (!lm) {
-			printk(KERN_ERR "Registered NULL Location Monitor resource\n");
-			continue;
-		}
-
-		/* Find an unlocked controller */
-		mutex_lock(&lm->mtx);
-		if (lm->locked == 0) {
-			lm->locked = 1;
-			mutex_unlock(&lm->mtx);
-			allocated_lm = lm;
-			break;
-		}
-		mutex_unlock(&lm->mtx);
-	}
-
-	/* Check to see if we found a resource */
-	if (!allocated_lm)
-		goto err_lm;
-
-	resource = kmalloc(sizeof(*resource), GFP_KERNEL);
-	if (!resource)
-		goto err_alloc;
-
-	resource->type = VME_LM;
-	resource->entry = &allocated_lm->list;
-
-	return resource;
-
-err_alloc:
-	/* Unlock image */
-	mutex_lock(&lm->mtx);
-	lm->locked = 0;
-	mutex_unlock(&lm->mtx);
-err_lm:
-err_bus:
-	return NULL;
-}
-EXPORT_SYMBOL(vme_lm_request);
-
-/**
- * vme_lm_count - Determine number of VME Addresses monitored
- * @resource: Pointer to VME location monitor resource.
- *
- * The number of contiguous addresses monitored is hardware dependent.
- * Return the number of contiguous addresses monitored by the
- * location monitor.
- *
- * Return: Count of addresses monitored or -EINVAL when provided with an
- *	   invalid location monitor resource.
- */
-int vme_lm_count(struct vme_resource *resource)
-{
-	struct vme_lm_resource *lm;
-
-	if (resource->type != VME_LM) {
-		printk(KERN_ERR "Not a Location Monitor resource\n");
-		return -EINVAL;
-	}
-
-	lm = list_entry(resource->entry, struct vme_lm_resource, list);
-
-	return lm->monitors;
-}
-EXPORT_SYMBOL(vme_lm_count);
-
-/**
- * vme_lm_set - Configure location monitor
- * @resource: Pointer to VME location monitor resource.
- * @lm_base: Base address to monitor.
- * @aspace: VME address space to monitor.
- * @cycle: VME bus cycle type to monitor.
- *
- * Set the base address, address space and cycle type of accesses to be
- * monitored by the location monitor.
- *
- * Return: Zero on success, -EINVAL when provided with an invalid location
- *	   monitor resource or function is not supported. Hardware specific
- *	   errors may also be returned.
- */
-int vme_lm_set(struct vme_resource *resource, unsigned long long lm_base,
-	u32 aspace, u32 cycle)
-{
-	struct vme_bridge *bridge = find_bridge(resource);
-	struct vme_lm_resource *lm;
-
-	if (resource->type != VME_LM) {
-		printk(KERN_ERR "Not a Location Monitor resource\n");
-		return -EINVAL;
-	}
-
-	lm = list_entry(resource->entry, struct vme_lm_resource, list);
-
-	if (!bridge->lm_set) {
-		printk(KERN_ERR "vme_lm_set not supported\n");
-		return -EINVAL;
-	}
-
-	return bridge->lm_set(lm, lm_base, aspace, cycle);
-}
-EXPORT_SYMBOL(vme_lm_set);
-
-/**
- * vme_lm_get - Retrieve location monitor settings
- * @resource: Pointer to VME location monitor resource.
- * @lm_base: Pointer used to output the base address monitored.
- * @aspace: Pointer used to output the address space monitored.
- * @cycle: Pointer used to output the VME bus cycle type monitored.
- *
- * Retrieve the base address, address space and cycle type of accesses to
- * be monitored by the location monitor.
- *
- * Return: Zero on success, -EINVAL when provided with an invalid location
- *	   monitor resource or function is not supported. Hardware specific
- *	   errors may also be returned.
- */
-int vme_lm_get(struct vme_resource *resource, unsigned long long *lm_base,
-	u32 *aspace, u32 *cycle)
-{
-	struct vme_bridge *bridge = find_bridge(resource);
-	struct vme_lm_resource *lm;
-
-	if (resource->type != VME_LM) {
-		printk(KERN_ERR "Not a Location Monitor resource\n");
-		return -EINVAL;
-	}
-
-	lm = list_entry(resource->entry, struct vme_lm_resource, list);
-
-	if (!bridge->lm_get) {
-		printk(KERN_ERR "vme_lm_get not supported\n");
-		return -EINVAL;
-	}
-
-	return bridge->lm_get(lm, lm_base, aspace, cycle);
-}
-EXPORT_SYMBOL(vme_lm_get);
-
-/**
- * vme_lm_attach - Provide callback for location monitor address
- * @resource: Pointer to VME location monitor resource.
- * @monitor: Offset to which callback should be attached.
- * @callback: Pointer to callback function called when triggered.
- * @data: Generic pointer that will be passed to the callback function.
- *
- * Attach a callback to the specificed offset into the location monitors
- * monitored addresses. A generic pointer is provided to allow data to be
- * passed to the callback when called.
- *
- * Return: Zero on success, -EINVAL when provided with an invalid location
- *	   monitor resource or function is not supported. Hardware specific
- *	   errors may also be returned.
- */
-int vme_lm_attach(struct vme_resource *resource, int monitor,
-	void (*callback)(void *), void *data)
-{
-	struct vme_bridge *bridge = find_bridge(resource);
-	struct vme_lm_resource *lm;
-
-	if (resource->type != VME_LM) {
-		printk(KERN_ERR "Not a Location Monitor resource\n");
-		return -EINVAL;
-	}
-
-	lm = list_entry(resource->entry, struct vme_lm_resource, list);
-
-	if (!bridge->lm_attach) {
-		printk(KERN_ERR "vme_lm_attach not supported\n");
-		return -EINVAL;
-	}
-
-	return bridge->lm_attach(lm, monitor, callback, data);
-}
-EXPORT_SYMBOL(vme_lm_attach);
-
-/**
- * vme_lm_detach - Remove callback for location monitor address
- * @resource: Pointer to VME location monitor resource.
- * @monitor: Offset to which callback should be removed.
- *
- * Remove the callback associated with the specificed offset into the
- * location monitors monitored addresses.
- *
- * Return: Zero on success, -EINVAL when provided with an invalid location
- *	   monitor resource or function is not supported. Hardware specific
- *	   errors may also be returned.
- */
-int vme_lm_detach(struct vme_resource *resource, int monitor)
-{
-	struct vme_bridge *bridge = find_bridge(resource);
-	struct vme_lm_resource *lm;
-
-	if (resource->type != VME_LM) {
-		printk(KERN_ERR "Not a Location Monitor resource\n");
-		return -EINVAL;
-	}
-
-	lm = list_entry(resource->entry, struct vme_lm_resource, list);
-
-	if (!bridge->lm_detach) {
-		printk(KERN_ERR "vme_lm_detach not supported\n");
-		return -EINVAL;
-	}
-
-	return bridge->lm_detach(lm, monitor);
-}
-EXPORT_SYMBOL(vme_lm_detach);
-
-/**
- * vme_lm_free - Free allocated VME location monitor
- * @resource: Pointer to VME location monitor resource.
- *
- * Free allocation of a VME location monitor.
- *
- * WARNING: This function currently expects that any callbacks that have
- *          been attached to the location monitor have been removed.
- *
- * Return: Zero on success, -EINVAL when provided with an invalid location
- *	   monitor resource.
- */
-void vme_lm_free(struct vme_resource *resource)
-{
-	struct vme_lm_resource *lm;
-
-	if (resource->type != VME_LM) {
-		printk(KERN_ERR "Not a Location Monitor resource\n");
-		return;
-	}
-
-	lm = list_entry(resource->entry, struct vme_lm_resource, list);
-
-	mutex_lock(&lm->mtx);
-
-	/* XXX
-	 * Check to see that there aren't any callbacks still attached, if
-	 * there are we should probably be detaching them!
-	 */
-
-	lm->locked = 0;
-
-	mutex_unlock(&lm->mtx);
-
-	kfree(resource);
-}
-EXPORT_SYMBOL(vme_lm_free);
-
-/**
- * vme_slot_num - Retrieve slot ID
- * @vdev: Pointer to VME device struct vme_dev assigned to driver instance.
- *
- * Retrieve the slot ID associated with the provided VME device.
- *
- * Return: The slot ID on success, -EINVAL if VME bridge cannot be determined
- *         or the function is not supported. Hardware specific errors may also
- *         be returned.
- */
-int vme_slot_num(struct vme_dev *vdev)
-{
-	struct vme_bridge *bridge;
-
-	bridge = vdev->bridge;
-	if (!bridge) {
-		printk(KERN_ERR "Can't find VME bus\n");
-		return -EINVAL;
-	}
-
-	if (!bridge->slot_get) {
-		printk(KERN_WARNING "vme_slot_num not supported\n");
-		return -EINVAL;
-	}
-
-	return bridge->slot_get(bridge);
-}
-EXPORT_SYMBOL(vme_slot_num);
-
-/**
- * vme_bus_num - Retrieve bus number
- * @vdev: Pointer to VME device struct vme_dev assigned to driver instance.
- *
- * Retrieve the bus enumeration associated with the provided VME device.
- *
- * Return: The bus number on success, -EINVAL if VME bridge cannot be
- *         determined.
- */
-int vme_bus_num(struct vme_dev *vdev)
-{
-	struct vme_bridge *bridge;
-
-	bridge = vdev->bridge;
-	if (!bridge) {
-		pr_err("Can't find VME bus\n");
-		return -EINVAL;
-	}
-
-	return bridge->num;
-}
-EXPORT_SYMBOL(vme_bus_num);
-
-/* - Bridge Registration --------------------------------------------------- */
-
-static void vme_dev_release(struct device *dev)
-{
-	kfree(dev_to_vme_dev(dev));
-}
-
-/* Common bridge initialization */
-struct vme_bridge *vme_init_bridge(struct vme_bridge *bridge)
-{
-	INIT_LIST_HEAD(&bridge->vme_error_handlers);
-	INIT_LIST_HEAD(&bridge->master_resources);
-	INIT_LIST_HEAD(&bridge->slave_resources);
-	INIT_LIST_HEAD(&bridge->dma_resources);
-	INIT_LIST_HEAD(&bridge->lm_resources);
-	mutex_init(&bridge->irq_mtx);
-
-	return bridge;
-}
-EXPORT_SYMBOL(vme_init_bridge);
-
-int vme_register_bridge(struct vme_bridge *bridge)
-{
-	int i;
-	int ret = -1;
-
-	mutex_lock(&vme_buses_lock);
-	for (i = 0; i < sizeof(vme_bus_numbers) * 8; i++) {
-		if ((vme_bus_numbers & (1 << i)) == 0) {
-			vme_bus_numbers |= (1 << i);
-			bridge->num = i;
-			INIT_LIST_HEAD(&bridge->devices);
-			list_add_tail(&bridge->bus_list, &vme_bus_list);
-			ret = 0;
-			break;
-		}
-	}
-	mutex_unlock(&vme_buses_lock);
-
-	return ret;
-}
-EXPORT_SYMBOL(vme_register_bridge);
-
-void vme_unregister_bridge(struct vme_bridge *bridge)
-{
-	struct vme_dev *vdev;
-	struct vme_dev *tmp;
-
-	mutex_lock(&vme_buses_lock);
-	vme_bus_numbers &= ~(1 << bridge->num);
-	list_for_each_entry_safe(vdev, tmp, &bridge->devices, bridge_list) {
-		list_del(&vdev->drv_list);
-		list_del(&vdev->bridge_list);
-		device_unregister(&vdev->dev);
-	}
-	list_del(&bridge->bus_list);
-	mutex_unlock(&vme_buses_lock);
-}
-EXPORT_SYMBOL(vme_unregister_bridge);
-
-/* - Driver Registration --------------------------------------------------- */
-
-static int __vme_register_driver_bus(struct vme_driver *drv,
-	struct vme_bridge *bridge, unsigned int ndevs)
-{
-	int err;
-	unsigned int i;
-	struct vme_dev *vdev;
-	struct vme_dev *tmp;
-
-	for (i = 0; i < ndevs; i++) {
-		vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
-		if (!vdev) {
-			err = -ENOMEM;
-			goto err_devalloc;
-		}
-		vdev->num = i;
-		vdev->bridge = bridge;
-		vdev->dev.platform_data = drv;
-		vdev->dev.release = vme_dev_release;
-		vdev->dev.parent = bridge->parent;
-		vdev->dev.bus = &vme_bus_type;
-		dev_set_name(&vdev->dev, "%s.%u-%u", drv->name, bridge->num,
-			vdev->num);
-
-		err = device_register(&vdev->dev);
-		if (err)
-			goto err_reg;
-
-		if (vdev->dev.platform_data) {
-			list_add_tail(&vdev->drv_list, &drv->devices);
-			list_add_tail(&vdev->bridge_list, &bridge->devices);
-		} else
-			device_unregister(&vdev->dev);
-	}
-	return 0;
-
-err_reg:
-	put_device(&vdev->dev);
-err_devalloc:
-	list_for_each_entry_safe(vdev, tmp, &drv->devices, drv_list) {
-		list_del(&vdev->drv_list);
-		list_del(&vdev->bridge_list);
-		device_unregister(&vdev->dev);
-	}
-	return err;
-}
-
-static int __vme_register_driver(struct vme_driver *drv, unsigned int ndevs)
-{
-	struct vme_bridge *bridge;
-	int err = 0;
-
-	mutex_lock(&vme_buses_lock);
-	list_for_each_entry(bridge, &vme_bus_list, bus_list) {
-		/*
-		 * This cannot cause trouble as we already have vme_buses_lock
-		 * and if the bridge is removed, it will have to go through
-		 * vme_unregister_bridge() to do it (which calls remove() on
-		 * the bridge which in turn tries to acquire vme_buses_lock and
-		 * will have to wait).
-		 */
-		err = __vme_register_driver_bus(drv, bridge, ndevs);
-		if (err)
-			break;
-	}
-	mutex_unlock(&vme_buses_lock);
-	return err;
-}
-
-/**
- * vme_register_driver - Register a VME driver
- * @drv: Pointer to VME driver structure to register.
- * @ndevs: Maximum number of devices to allow to be enumerated.
- *
- * Register a VME device driver with the VME subsystem.
- *
- * Return: Zero on success, error value on registration failure.
- */
-int vme_register_driver(struct vme_driver *drv, unsigned int ndevs)
-{
-	int err;
-
-	drv->driver.name = drv->name;
-	drv->driver.bus = &vme_bus_type;
-	INIT_LIST_HEAD(&drv->devices);
-
-	err = driver_register(&drv->driver);
-	if (err)
-		return err;
-
-	err = __vme_register_driver(drv, ndevs);
-	if (err)
-		driver_unregister(&drv->driver);
-
-	return err;
-}
-EXPORT_SYMBOL(vme_register_driver);
-
-/**
- * vme_unregister_driver - Unregister a VME driver
- * @drv: Pointer to VME driver structure to unregister.
- *
- * Unregister a VME device driver from the VME subsystem.
- */
-void vme_unregister_driver(struct vme_driver *drv)
-{
-	struct vme_dev *dev, *dev_tmp;
-
-	mutex_lock(&vme_buses_lock);
-	list_for_each_entry_safe(dev, dev_tmp, &drv->devices, drv_list) {
-		list_del(&dev->drv_list);
-		list_del(&dev->bridge_list);
-		device_unregister(&dev->dev);
-	}
-	mutex_unlock(&vme_buses_lock);
-
-	driver_unregister(&drv->driver);
-}
-EXPORT_SYMBOL(vme_unregister_driver);
-
-/* - Bus Registration ------------------------------------------------------ */
-
-static int vme_bus_match(struct device *dev, struct device_driver *drv)
-{
-	struct vme_driver *vme_drv;
-
-	vme_drv = container_of(drv, struct vme_driver, driver);
-
-	if (dev->platform_data == vme_drv) {
-		struct vme_dev *vdev = dev_to_vme_dev(dev);
-
-		if (vme_drv->match && vme_drv->match(vdev))
-			return 1;
-
-		dev->platform_data = NULL;
-	}
-	return 0;
-}
-
-static int vme_bus_probe(struct device *dev)
-{
-	struct vme_driver *driver;
-	struct vme_dev *vdev = dev_to_vme_dev(dev);
-
-	driver = dev->platform_data;
-	if (driver->probe)
-		return driver->probe(vdev);
-
-	return -ENODEV;
-}
-
-static void vme_bus_remove(struct device *dev)
-{
-	struct vme_driver *driver;
-	struct vme_dev *vdev = dev_to_vme_dev(dev);
-
-	driver = dev->platform_data;
-	if (driver->remove)
-		driver->remove(vdev);
-}
-
-struct bus_type vme_bus_type = {
-	.name = "vme",
-	.match = vme_bus_match,
-	.probe = vme_bus_probe,
-	.remove = vme_bus_remove,
-};
-EXPORT_SYMBOL(vme_bus_type);
-
-static int __init vme_init(void)
-{
-	return bus_register(&vme_bus_type);
-}
-subsys_initcall(vme_init);
diff --git a/drivers/vme/vme_bridge.h b/drivers/vme/vme_bridge.h
deleted file mode 100644
index 42ecf961004e..000000000000
--- a/drivers/vme/vme_bridge.h
+++ /dev/null
@@ -1,190 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _VME_BRIDGE_H_
-#define _VME_BRIDGE_H_
-
-#include <linux/vme.h>
-
-#define VME_CRCSR_BUF_SIZE (508*1024)
-/*
- * Resource structures
- */
-struct vme_master_resource {
-	struct list_head list;
-	struct vme_bridge *parent;
-	/*
-	 * We are likely to need to access the VME bus in interrupt context, so
-	 * protect master routines with a spinlock rather than a mutex.
-	 */
-	spinlock_t lock;
-	int locked;
-	int number;
-	u32 address_attr;
-	u32 cycle_attr;
-	u32 width_attr;
-	struct resource bus_resource;
-	void __iomem *kern_base;
-};
-
-struct vme_slave_resource {
-	struct list_head list;
-	struct vme_bridge *parent;
-	struct mutex mtx;
-	int locked;
-	int number;
-	u32 address_attr;
-	u32 cycle_attr;
-};
-
-struct vme_dma_pattern {
-	u32 pattern;
-	u32 type;
-};
-
-struct vme_dma_pci {
-	dma_addr_t address;
-};
-
-struct vme_dma_vme {
-	unsigned long long address;
-	u32 aspace;
-	u32 cycle;
-	u32 dwidth;
-};
-
-struct vme_dma_list {
-	struct list_head list;
-	struct vme_dma_resource *parent;
-	struct list_head entries;
-	struct mutex mtx;
-};
-
-struct vme_dma_resource {
-	struct list_head list;
-	struct vme_bridge *parent;
-	struct mutex mtx;
-	int locked;
-	int number;
-	struct list_head pending;
-	struct list_head running;
-	u32 route_attr;
-};
-
-struct vme_lm_resource {
-	struct list_head list;
-	struct vme_bridge *parent;
-	struct mutex mtx;
-	int locked;
-	int number;
-	int monitors;
-};
-
-struct vme_error_handler {
-	struct list_head list;
-	unsigned long long start;	/* Beginning of error window */
-	unsigned long long end;		/* End of error window */
-	unsigned long long first_error;	/* Address of the first error */
-	u32 aspace;			/* Address space of error window*/
-	unsigned num_errors;		/* Number of errors */
-};
-
-struct vme_callback {
-	void (*func)(int, int, void*);
-	void *priv_data;
-};
-
-struct vme_irq {
-	int count;
-	struct vme_callback callback[VME_NUM_STATUSID];
-};
-
-/* Allow 16 characters for name (including null character) */
-#define VMENAMSIZ 16
-
-/* This structure stores all the information about one bridge
- * The structure should be dynamically allocated by the driver and one instance
- * of the structure should be present for each VME chip present in the system.
- */
-struct vme_bridge {
-	char name[VMENAMSIZ];
-	int num;
-	struct list_head master_resources;
-	struct list_head slave_resources;
-	struct list_head dma_resources;
-	struct list_head lm_resources;
-
-	/* List for registered errors handlers */
-	struct list_head vme_error_handlers;
-	/* List of devices on this bridge */
-	struct list_head devices;
-
-	/* Bridge Info - XXX Move to private structure? */
-	struct device *parent;	/* Parent device (eg. pdev->dev for PCI) */
-	void *driver_priv;	/* Private pointer for the bridge driver */
-	struct list_head bus_list; /* list of VME buses */
-
-	/* Interrupt callbacks */
-	struct vme_irq irq[7];
-	/* Locking for VME irq callback configuration */
-	struct mutex irq_mtx;
-
-	/* Slave Functions */
-	int (*slave_get) (struct vme_slave_resource *, int *,
-		unsigned long long *, unsigned long long *, dma_addr_t *,
-		u32 *, u32 *);
-	int (*slave_set) (struct vme_slave_resource *, int, unsigned long long,
-		unsigned long long, dma_addr_t, u32, u32);
-
-	/* Master Functions */
-	int (*master_get) (struct vme_master_resource *, int *,
-		unsigned long long *, unsigned long long *, u32 *, u32 *,
-		u32 *);
-	int (*master_set) (struct vme_master_resource *, int,
-		unsigned long long, unsigned long long,  u32, u32, u32);
-	ssize_t (*master_read) (struct vme_master_resource *, void *, size_t,
-		loff_t);
-	ssize_t (*master_write) (struct vme_master_resource *, void *, size_t,
-		loff_t);
-	unsigned int (*master_rmw) (struct vme_master_resource *, unsigned int,
-		unsigned int, unsigned int, loff_t);
-
-	/* DMA Functions */
-	int (*dma_list_add) (struct vme_dma_list *, struct vme_dma_attr *,
-		struct vme_dma_attr *, size_t);
-	int (*dma_list_exec) (struct vme_dma_list *);
-	int (*dma_list_empty) (struct vme_dma_list *);
-
-	/* Interrupt Functions */
-	void (*irq_set) (struct vme_bridge *, int, int, int);
-	int (*irq_generate) (struct vme_bridge *, int, int);
-
-	/* Location monitor functions */
-	int (*lm_set) (struct vme_lm_resource *, unsigned long long, u32, u32);
-	int (*lm_get) (struct vme_lm_resource *, unsigned long long *, u32 *,
-		u32 *);
-	int (*lm_attach)(struct vme_lm_resource *, int,
-			 void (*callback)(void *), void *);
-	int (*lm_detach) (struct vme_lm_resource *, int);
-
-	/* CR/CSR space functions */
-	int (*slot_get) (struct vme_bridge *);
-
-	/* Bridge parent interface */
-	void *(*alloc_consistent)(struct device *dev, size_t size,
-		dma_addr_t *dma);
-	void (*free_consistent)(struct device *dev, size_t size,
-		void *vaddr, dma_addr_t dma);
-};
-
-void vme_bus_error_handler(struct vme_bridge *bridge,
-			   unsigned long long address, int am);
-void vme_irq_handler(struct vme_bridge *, int, int);
-
-struct vme_bridge *vme_init_bridge(struct vme_bridge *);
-int vme_register_bridge(struct vme_bridge *);
-void vme_unregister_bridge(struct vme_bridge *);
-struct vme_error_handler *vme_register_error_handler(
-	struct vme_bridge *bridge, u32 aspace,
-	unsigned long long address, size_t len);
-void vme_unregister_error_handler(struct vme_error_handler *handler);
-
-#endif /* _VME_BRIDGE_H_ */
diff --git a/include/linux/vme.h b/include/linux/vme.h
deleted file mode 100644
index b204a9b4be1b..000000000000
--- a/include/linux/vme.h
+++ /dev/null
@@ -1,190 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _VME_H_
-#define _VME_H_
-
-/* Resource Type */
-enum vme_resource_type {
-	VME_MASTER,
-	VME_SLAVE,
-	VME_DMA,
-	VME_LM
-};
-
-/* VME Address Spaces */
-#define VME_A16		0x1
-#define VME_A24		0x2
-#define	VME_A32		0x4
-#define VME_A64		0x8
-#define VME_CRCSR	0x10
-#define VME_USER1	0x20
-#define VME_USER2	0x40
-#define VME_USER3	0x80
-#define VME_USER4	0x100
-
-#define VME_A16_MAX	0x10000ULL
-#define VME_A24_MAX	0x1000000ULL
-#define VME_A32_MAX	0x100000000ULL
-#define VME_A64_MAX	0x10000000000000000ULL
-#define VME_CRCSR_MAX	0x1000000ULL
-
-
-/* VME Cycle Types */
-#define VME_SCT		0x1
-#define VME_BLT		0x2
-#define VME_MBLT	0x4
-#define VME_2eVME	0x8
-#define VME_2eSST	0x10
-#define VME_2eSSTB	0x20
-
-#define VME_2eSST160	0x100
-#define VME_2eSST267	0x200
-#define VME_2eSST320	0x400
-
-#define	VME_SUPER	0x1000
-#define	VME_USER	0x2000
-#define	VME_PROG	0x4000
-#define	VME_DATA	0x8000
-
-/* VME Data Widths */
-#define VME_D8		0x1
-#define VME_D16		0x2
-#define VME_D32		0x4
-#define VME_D64		0x8
-
-/* Arbitration Scheduling Modes */
-#define VME_R_ROBIN_MODE	0x1
-#define VME_PRIORITY_MODE	0x2
-
-#define VME_DMA_PATTERN			(1<<0)
-#define VME_DMA_PCI			(1<<1)
-#define VME_DMA_VME			(1<<2)
-
-#define VME_DMA_PATTERN_BYTE		(1<<0)
-#define VME_DMA_PATTERN_WORD		(1<<1)
-#define VME_DMA_PATTERN_INCREMENT	(1<<2)
-
-#define VME_DMA_VME_TO_MEM		(1<<0)
-#define VME_DMA_MEM_TO_VME		(1<<1)
-#define VME_DMA_VME_TO_VME		(1<<2)
-#define VME_DMA_MEM_TO_MEM		(1<<3)
-#define VME_DMA_PATTERN_TO_VME		(1<<4)
-#define VME_DMA_PATTERN_TO_MEM		(1<<5)
-
-struct vme_dma_attr {
-	u32 type;
-	void *private;
-};
-
-struct vme_resource {
-	enum vme_resource_type type;
-	struct list_head *entry;
-};
-
-extern struct bus_type vme_bus_type;
-
-/* Number of VME interrupt vectors */
-#define VME_NUM_STATUSID	256
-
-/* VME_MAX_BRIDGES comes from the type of vme_bus_numbers */
-#define VME_MAX_BRIDGES		(sizeof(unsigned int)*8)
-#define VME_MAX_SLOTS		32
-
-#define VME_SLOT_CURRENT	-1
-#define VME_SLOT_ALL		-2
-
-/**
- * struct vme_dev - Structure representing a VME device
- * @num: The device number
- * @bridge: Pointer to the bridge device this device is on
- * @dev: Internal device structure
- * @drv_list: List of devices (per driver)
- * @bridge_list: List of devices (per bridge)
- */
-struct vme_dev {
-	int num;
-	struct vme_bridge *bridge;
-	struct device dev;
-	struct list_head drv_list;
-	struct list_head bridge_list;
-};
-
-/**
- * struct vme_driver - Structure representing a VME driver
- * @name: Driver name, should be unique among VME drivers and usually the same
- *        as the module name.
- * @match: Callback used to determine whether probe should be run.
- * @probe: Callback for device binding, called when new device is detected.
- * @remove: Callback, called on device removal.
- * @driver: Underlying generic device driver structure.
- * @devices: List of VME devices (struct vme_dev) associated with this driver.
- */
-struct vme_driver {
-	const char *name;
-	int (*match)(struct vme_dev *);
-	int (*probe)(struct vme_dev *);
-	void (*remove)(struct vme_dev *);
-	struct device_driver driver;
-	struct list_head devices;
-};
-
-void *vme_alloc_consistent(struct vme_resource *, size_t, dma_addr_t *);
-void vme_free_consistent(struct vme_resource *, size_t,  void *,
-	dma_addr_t);
-
-size_t vme_get_size(struct vme_resource *);
-int vme_check_window(u32 aspace, unsigned long long vme_base,
-		     unsigned long long size);
-
-struct vme_resource *vme_slave_request(struct vme_dev *, u32, u32);
-int vme_slave_set(struct vme_resource *, int, unsigned long long,
-	unsigned long long, dma_addr_t, u32, u32);
-int vme_slave_get(struct vme_resource *, int *, unsigned long long *,
-	unsigned long long *, dma_addr_t *, u32 *, u32 *);
-void vme_slave_free(struct vme_resource *);
-
-struct vme_resource *vme_master_request(struct vme_dev *, u32, u32, u32);
-int vme_master_set(struct vme_resource *, int, unsigned long long,
-	unsigned long long, u32, u32, u32);
-int vme_master_get(struct vme_resource *, int *, unsigned long long *,
-	unsigned long long *, u32 *, u32 *, u32 *);
-ssize_t vme_master_read(struct vme_resource *, void *, size_t, loff_t);
-ssize_t vme_master_write(struct vme_resource *, void *, size_t, loff_t);
-unsigned int vme_master_rmw(struct vme_resource *, unsigned int, unsigned int,
-	unsigned int, loff_t);
-int vme_master_mmap(struct vme_resource *resource, struct vm_area_struct *vma);
-void vme_master_free(struct vme_resource *);
-
-struct vme_resource *vme_dma_request(struct vme_dev *, u32);
-struct vme_dma_list *vme_new_dma_list(struct vme_resource *);
-struct vme_dma_attr *vme_dma_pattern_attribute(u32, u32);
-struct vme_dma_attr *vme_dma_pci_attribute(dma_addr_t);
-struct vme_dma_attr *vme_dma_vme_attribute(unsigned long long, u32, u32, u32);
-void vme_dma_free_attribute(struct vme_dma_attr *);
-int vme_dma_list_add(struct vme_dma_list *, struct vme_dma_attr *,
-	struct vme_dma_attr *, size_t);
-int vme_dma_list_exec(struct vme_dma_list *);
-int vme_dma_list_free(struct vme_dma_list *);
-int vme_dma_free(struct vme_resource *);
-
-int vme_irq_request(struct vme_dev *, int, int,
-	void (*callback)(int, int, void *), void *);
-void vme_irq_free(struct vme_dev *, int, int);
-int vme_irq_generate(struct vme_dev *, int, int);
-
-struct vme_resource *vme_lm_request(struct vme_dev *);
-int vme_lm_count(struct vme_resource *);
-int vme_lm_set(struct vme_resource *, unsigned long long, u32, u32);
-int vme_lm_get(struct vme_resource *, unsigned long long *, u32 *, u32 *);
-int vme_lm_attach(struct vme_resource *, int, void (*callback)(void *), void *);
-int vme_lm_detach(struct vme_resource *, int);
-void vme_lm_free(struct vme_resource *);
-
-int vme_slot_num(struct vme_dev *);
-int vme_bus_num(struct vme_dev *);
-
-int vme_register_driver(struct vme_driver *, unsigned int);
-void vme_unregister_driver(struct vme_driver *);
-
-
-#endif /* _VME_H_ */
-
-- 
cgit 


From 2f8c3ae8288e4a4018330ed5c4e758b878d9c555 Mon Sep 17 00:00:00 2001
From: Saravana Kannan <saravanak@google.com>
Date: Wed, 1 Jun 2022 00:07:00 -0700
Subject: driver core: Add wait_for_init_devices_probe helper function

Some devices might need to be probed and bound successfully before the
kernel boot sequence can finish and move on to init/userspace. For
example, a network interface might need to be bound to be able to mount
a NFS rootfs.

With fw_devlink=on by default, some of these devices might be blocked
from probing because they are waiting on a optional supplier that
doesn't have a driver. While fw_devlink will eventually identify such
devices and unblock the probing automatically, it might be too late by
the time it unblocks the probing of devices. For example, the IP4
autoconfig might timeout before fw_devlink unblocks probing of the
network interface.

This function is available to temporarily try and probe all devices that
have a driver even if some of their suppliers haven't been added or
don't have drivers.

The drivers can then decide which of the suppliers are optional vs
mandatory and probe the device if possible. By the time this function
returns, all such "best effort" probes are guaranteed to be completed.
If a device successfully probes in this mode, we delete all fw_devlink
discovered dependencies of that device where the supplier hasn't yet
probed successfully because they have to be optional dependencies.

This also means that some devices that aren't needed for init and could
have waited for their optional supplier to probe (when the supplier's
module is loaded later on) would end up probing prematurely with limited
functionality.  So call this function only when boot would fail without
it.

Tested-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Saravana Kannan <saravanak@google.com>
Link: https://lore.kernel.org/r/20220601070707.3946847-5-saravanak@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/base.h           |   1 +
 drivers/base/core.c           | 100 +++++++++++++++++++++++++++++++++++++++---
 drivers/base/dd.c             |  19 +++++---
 include/linux/device/driver.h |   1 +
 4 files changed, 110 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/base/base.h b/drivers/base/base.h
index ab71403d102f..b3a43a164dcd 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -160,6 +160,7 @@ extern int devres_release_all(struct device *dev);
 extern void device_block_probing(void);
 extern void device_unblock_probing(void);
 extern void deferred_probe_extend_timeout(void);
+extern void driver_deferred_probe_trigger(void);
 
 /* /sys/devices directory */
 extern struct kset *devices_kset;
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 7cd789c4985d..61fdfe99b348 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -54,6 +54,7 @@ static unsigned int defer_sync_state_count = 1;
 static DEFINE_MUTEX(fwnode_link_lock);
 static bool fw_devlink_is_permissive(void);
 static bool fw_devlink_drv_reg_done;
+static bool fw_devlink_best_effort;
 
 /**
  * fwnode_link_add - Create a link between two fwnode_handles.
@@ -965,6 +966,11 @@ static void device_links_missing_supplier(struct device *dev)
 	}
 }
 
+static bool dev_is_best_effort(struct device *dev)
+{
+	return fw_devlink_best_effort && dev->can_match;
+}
+
 /**
  * device_links_check_suppliers - Check presence of supplier drivers.
  * @dev: Consumer device.
@@ -984,7 +990,7 @@ static void device_links_missing_supplier(struct device *dev)
 int device_links_check_suppliers(struct device *dev)
 {
 	struct device_link *link;
-	int ret = 0;
+	int ret = 0, fwnode_ret = 0;
 	struct fwnode_handle *sup_fw;
 
 	/*
@@ -997,12 +1003,17 @@ int device_links_check_suppliers(struct device *dev)
 		sup_fw = list_first_entry(&dev->fwnode->suppliers,
 					  struct fwnode_link,
 					  c_hook)->supplier;
-		dev_err_probe(dev, -EPROBE_DEFER, "wait for supplier %pfwP\n",
-			      sup_fw);
-		mutex_unlock(&fwnode_link_lock);
-		return -EPROBE_DEFER;
+		if (!dev_is_best_effort(dev)) {
+			fwnode_ret = -EPROBE_DEFER;
+			dev_err_probe(dev, -EPROBE_DEFER,
+				    "wait for supplier %pfwP\n", sup_fw);
+		} else {
+			fwnode_ret = -EAGAIN;
+		}
 	}
 	mutex_unlock(&fwnode_link_lock);
+	if (fwnode_ret == -EPROBE_DEFER)
+		return fwnode_ret;
 
 	device_links_write_lock();
 
@@ -1012,6 +1023,14 @@ int device_links_check_suppliers(struct device *dev)
 
 		if (link->status != DL_STATE_AVAILABLE &&
 		    !(link->flags & DL_FLAG_SYNC_STATE_ONLY)) {
+
+			if (dev_is_best_effort(dev) &&
+			    link->flags & DL_FLAG_INFERRED &&
+			    !link->supplier->can_match) {
+				ret = -EAGAIN;
+				continue;
+			}
+
 			device_links_missing_supplier(dev);
 			dev_err_probe(dev, -EPROBE_DEFER,
 				      "supplier %s not ready\n",
@@ -1024,7 +1043,8 @@ int device_links_check_suppliers(struct device *dev)
 	dev->links.status = DL_DEV_PROBING;
 
 	device_links_write_unlock();
-	return ret;
+
+	return ret ? ret : fwnode_ret;
 }
 
 /**
@@ -1289,6 +1309,18 @@ void device_links_driver_bound(struct device *dev)
 			 * save to drop the managed link completely.
 			 */
 			device_link_drop_managed(link);
+		} else if (dev_is_best_effort(dev) &&
+			   link->flags & DL_FLAG_INFERRED &&
+			   link->status != DL_STATE_CONSUMER_PROBE &&
+			   !link->supplier->can_match) {
+			/*
+			 * When dev_is_best_effort() is true, we ignore device
+			 * links to suppliers that don't have a driver.  If the
+			 * consumer device still managed to probe, there's no
+			 * point in maintaining a device link in a weird state
+			 * (consumer probed before supplier). So delete it.
+			 */
+			device_link_drop_managed(link);
 		} else {
 			WARN_ON(link->status != DL_STATE_CONSUMER_PROBE);
 			WRITE_ONCE(link->status, DL_STATE_ACTIVE);
@@ -1655,6 +1687,62 @@ void fw_devlink_drivers_done(void)
 	device_links_write_unlock();
 }
 
+/**
+ * wait_for_init_devices_probe - Try to probe any device needed for init
+ *
+ * Some devices might need to be probed and bound successfully before the kernel
+ * boot sequence can finish and move on to init/userspace. For example, a
+ * network interface might need to be bound to be able to mount a NFS rootfs.
+ *
+ * With fw_devlink=on by default, some of these devices might be blocked from
+ * probing because they are waiting on a optional supplier that doesn't have a
+ * driver. While fw_devlink will eventually identify such devices and unblock
+ * the probing automatically, it might be too late by the time it unblocks the
+ * probing of devices. For example, the IP4 autoconfig might timeout before
+ * fw_devlink unblocks probing of the network interface.
+ *
+ * This function is available to temporarily try and probe all devices that have
+ * a driver even if some of their suppliers haven't been added or don't have
+ * drivers.
+ *
+ * The drivers can then decide which of the suppliers are optional vs mandatory
+ * and probe the device if possible. By the time this function returns, all such
+ * "best effort" probes are guaranteed to be completed. If a device successfully
+ * probes in this mode, we delete all fw_devlink discovered dependencies of that
+ * device where the supplier hasn't yet probed successfully because they have to
+ * be optional dependencies.
+ *
+ * Any devices that didn't successfully probe go back to being treated as if
+ * this function was never called.
+ *
+ * This also means that some devices that aren't needed for init and could have
+ * waited for their optional supplier to probe (when the supplier's module is
+ * loaded later on) would end up probing prematurely with limited functionality.
+ * So call this function only when boot would fail without it.
+ */
+void __init wait_for_init_devices_probe(void)
+{
+	if (!fw_devlink_flags || fw_devlink_is_permissive())
+		return;
+
+	/*
+	 * Wait for all ongoing probes to finish so that the "best effort" is
+	 * only applied to devices that can't probe otherwise.
+	 */
+	wait_for_device_probe();
+
+	pr_info("Trying to probe devices needed for running init ...\n");
+	fw_devlink_best_effort = true;
+	driver_deferred_probe_trigger();
+
+	/*
+	 * Wait for all "best effort" probes to finish before going back to
+	 * normal enforcement.
+	 */
+	wait_for_device_probe();
+	fw_devlink_best_effort = false;
+}
+
 static void fw_devlink_unblock_consumers(struct device *dev)
 {
 	struct device_link *link;
diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 11b0fb6414d3..4a55fbb7e0da 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -172,7 +172,7 @@ static bool driver_deferred_probe_enable;
  * changes in the midst of a probe, then deferred processing should be triggered
  * again.
  */
-static void driver_deferred_probe_trigger(void)
+void driver_deferred_probe_trigger(void)
 {
 	if (!driver_deferred_probe_enable)
 		return;
@@ -580,7 +580,7 @@ static int really_probe(struct device *dev, struct device_driver *drv)
 {
 	bool test_remove = IS_ENABLED(CONFIG_DEBUG_TEST_DRIVER_REMOVE) &&
 			   !drv->suppress_bind_attrs;
-	int ret;
+	int ret, link_ret;
 
 	if (defer_all_probes) {
 		/*
@@ -592,9 +592,9 @@ static int really_probe(struct device *dev, struct device_driver *drv)
 		return -EPROBE_DEFER;
 	}
 
-	ret = device_links_check_suppliers(dev);
-	if (ret)
-		return ret;
+	link_ret = device_links_check_suppliers(dev);
+	if (link_ret == -EPROBE_DEFER)
+		return link_ret;
 
 	pr_debug("bus: '%s': %s: probing driver %s with device %s\n",
 		 drv->bus->name, __func__, drv->name, dev_name(dev));
@@ -633,6 +633,15 @@ re_probe:
 
 	ret = call_driver_probe(dev, drv);
 	if (ret) {
+		/*
+		 * If fw_devlink_best_effort is active (denoted by -EAGAIN), the
+		 * device might actually probe properly once some of its missing
+		 * suppliers have probed. So, treat this as if the driver
+		 * returned -EPROBE_DEFER.
+		 */
+		if (link_ret == -EAGAIN)
+			ret = -EPROBE_DEFER;
+
 		/*
 		 * Return probe errors as positive values so that the callers
 		 * can distinguish them from other errors.
diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h
index 700453017e1c..2114d65b862f 100644
--- a/include/linux/device/driver.h
+++ b/include/linux/device/driver.h
@@ -129,6 +129,7 @@ extern struct device_driver *driver_find(const char *name,
 					 struct bus_type *bus);
 extern int driver_probe_done(void);
 extern void wait_for_device_probe(void);
+void __init wait_for_init_devices_probe(void);
 
 /* sysfs interface for exporting driver attributes */
 
-- 
cgit 


From 9cbffc7a59561be950ecc675d19a3d2b45202b2b Mon Sep 17 00:00:00 2001
From: Saravana Kannan <saravanak@google.com>
Date: Wed, 1 Jun 2022 00:07:05 -0700
Subject: driver core: Delete driver_deferred_probe_check_state()

The function is no longer used. So delete it.

Tested-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Saravana Kannan <saravanak@google.com>
Link: https://lore.kernel.org/r/20220601070707.3946847-10-saravanak@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/dd.c             | 30 ------------------------------
 include/linux/device/driver.h |  1 -
 2 files changed, 31 deletions(-)

(limited to 'include')

diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 335e71d3a618..e600dd2afc35 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -274,42 +274,12 @@ static int __init deferred_probe_timeout_setup(char *str)
 }
 __setup("deferred_probe_timeout=", deferred_probe_timeout_setup);
 
-/**
- * driver_deferred_probe_check_state() - Check deferred probe state
- * @dev: device to check
- *
- * Return:
- * * -ENODEV if initcalls have completed and modules are disabled.
- * * -ETIMEDOUT if the deferred probe timeout was set and has expired
- *   and modules are enabled.
- * * -EPROBE_DEFER in other cases.
- *
- * Drivers or subsystems can opt-in to calling this function instead of directly
- * returning -EPROBE_DEFER.
- */
-int driver_deferred_probe_check_state(struct device *dev)
-{
-	if (!IS_ENABLED(CONFIG_MODULES) && initcalls_done) {
-		dev_warn(dev, "ignoring dependency for device, assuming no driver\n");
-		return -ENODEV;
-	}
-
-	if (!driver_deferred_probe_timeout && initcalls_done) {
-		dev_warn(dev, "deferred probe timeout, ignoring dependency\n");
-		return -ETIMEDOUT;
-	}
-
-	return -EPROBE_DEFER;
-}
-EXPORT_SYMBOL_GPL(driver_deferred_probe_check_state);
-
 static void deferred_probe_timeout_work_func(struct work_struct *work)
 {
 	struct device_private *p;
 
 	fw_devlink_drivers_done();
 
-	driver_deferred_probe_timeout = 0;
 	driver_deferred_probe_trigger();
 	flush_work(&deferred_probe_work);
 
diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h
index 2114d65b862f..7acaabde5396 100644
--- a/include/linux/device/driver.h
+++ b/include/linux/device/driver.h
@@ -242,7 +242,6 @@ driver_find_device_by_acpi_dev(struct device_driver *drv, const void *adev)
 
 extern int driver_deferred_probe_timeout;
 void driver_deferred_probe_add(struct device *dev);
-int driver_deferred_probe_check_state(struct device *dev);
 void driver_init(void);
 
 /**
-- 
cgit 


From 82b070beae1ef55b0049768c8dc91d87565bb191 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 10 Jun 2022 15:02:18 +0300
Subject: driver core: Introduce device_find_any_child() helper

There are several places in the kernel where this kind of functionality is
being used. Provide a generic helper for such cases.

Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20220610120219.18988-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c    | 20 ++++++++++++++++++++
 include/linux/device.h |  2 ++
 2 files changed, 22 insertions(+)

(limited to 'include')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 977b379a495b..839f64485a55 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -3920,6 +3920,26 @@ struct device *device_find_child_by_name(struct device *parent,
 }
 EXPORT_SYMBOL_GPL(device_find_child_by_name);
 
+static int match_any(struct device *dev, void *unused)
+{
+	return 1;
+}
+
+/**
+ * device_find_any_child - device iterator for locating a child device, if any.
+ * @parent: parent struct device
+ *
+ * This is similar to the device_find_child() function above, but it
+ * returns a reference to a child device, if any.
+ *
+ * NOTE: you will need to drop the reference with put_device() after use.
+ */
+struct device *device_find_any_child(struct device *parent)
+{
+	return device_find_child(parent, NULL, match_any);
+}
+EXPORT_SYMBOL_GPL(device_find_any_child);
+
 int __init devices_init(void)
 {
 	devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL);
diff --git a/include/linux/device.h b/include/linux/device.h
index dc941997795c..424b55df0272 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -905,6 +905,8 @@ struct device *device_find_child(struct device *dev, void *data,
 				 int (*match)(struct device *dev, void *data));
 struct device *device_find_child_by_name(struct device *parent,
 					 const char *name);
+struct device *device_find_any_child(struct device *parent);
+
 int device_rename(struct device *dev, const char *new_name);
 int device_move(struct device *dev, struct device *new_parent,
 		enum dpm_order dpm_order);
-- 
cgit 


From 81eef68f3bb78f5b3dc29032ffd804a4a2d7aaf0 Mon Sep 17 00:00:00 2001
From: Amadeusz Sławiński <amadeuszx.slawinski@linux.intel.com>
Date: Fri, 10 Jun 2022 14:44:20 +0200
Subject: ASoC: Remove unused hw_write_t type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 81da8a0b7975 ("ASoC: remove codec hw_write/control_data") removed
use of hw_write_t in struct snd_soc_codec, but it left type definition.
Fully clean it up.

Fixes: 81da8a0b7975 ("ASoC: remove codec hw_write/control_data")
Signed-off-by: Amadeusz Sławiński <amadeuszx.slawinski@linux.intel.com>
Reviewed-by: Cezary Rojewski <cezary.rojewski@intel.com>
Link: https://lore.kernel.org/r/20220610124420.4160986-1-amadeuszx.slawinski@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc.h b/include/sound/soc.h
index f20f5f890794..b276dcb5d4e8 100644
--- a/include/sound/soc.h
+++ b/include/sound/soc.h
@@ -408,8 +408,6 @@ struct snd_soc_jack_pin;
 
 struct snd_soc_jack_gpio;
 
-typedef int (*hw_write_t)(void *,const char* ,int);
-
 enum snd_soc_pcm_subclass {
 	SND_SOC_PCM_CLASS_PCM	= 0,
 	SND_SOC_PCM_CLASS_BE	= 1,
-- 
cgit 


From c3752f44604f3bc4f3ce6e169fa32d16943ff70b Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Date: Thu, 9 Jun 2022 11:24:54 +0900
Subject: scsi: libsas: Introduce struct smp_disc_resp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When compiling with gcc 12, several warnings are thrown by gcc when
compiling drivers/scsi/libsas/sas_expander.c, e.g.:

In function ‘sas_get_phy_change_count’,
    inlined from ‘sas_find_bcast_phy.constprop’ at
drivers/scsi/libsas/sas_expander.c:1737:9:
drivers/scsi/libsas/sas_expander.c:1697:39: warning: array subscript
‘struct smp_resp[0]’ is partly outside array bounds of ‘unsigned
char[56]’ [-Warray-bounds]
 1697 |                 *pcc = disc_resp->disc.change_count;
      |                        ~~~~~~~~~~~~~~~^~~~~~~~~~~~~

This is due to the use of the struct smp_resp to aggregate all possible
response types using a union but allocating a response buffer with a size
exactly equal to the size of the response type needed. This leads to access
to fields of struct smp_resp from an allocated memory area that is smaller
than the size of struct smp_resp.

Fix this by defining struct smp_disc_resp for sas discovery operations.
Since this structure and the generic struct smp_resp are identical for
the little endian and big endian archs, move the definition of these
structures at the end of include/scsi/sas.h to avoid repeating their
definition.

Link: https://lore.kernel.org/r/20220609022456.409087-2-damien.lemoal@opensource.wdc.com
Reviewed-by: John Garry <john.garry@huawei.com>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/libsas/sas_expander.c | 32 ++++++++++++++------------------
 include/scsi/sas.h                 | 28 ++++++++++++----------------
 2 files changed, 26 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c
index 260e735d06fa..fb998a8a7d3b 100644
--- a/drivers/scsi/libsas/sas_expander.c
+++ b/drivers/scsi/libsas/sas_expander.c
@@ -175,13 +175,13 @@ static enum sas_device_type to_dev_type(struct discover_resp *dr)
 		return dr->attached_dev_type;
 }
 
-static void sas_set_ex_phy(struct domain_device *dev, int phy_id, void *rsp)
+static void sas_set_ex_phy(struct domain_device *dev, int phy_id,
+			   struct smp_disc_resp *disc_resp)
 {
 	enum sas_device_type dev_type;
 	enum sas_linkrate linkrate;
 	u8 sas_addr[SAS_ADDR_SIZE];
-	struct smp_resp *resp = rsp;
-	struct discover_resp *dr = &resp->disc;
+	struct discover_resp *dr = &disc_resp->disc;
 	struct sas_ha_struct *ha = dev->port->ha;
 	struct expander_device *ex = &dev->ex_dev;
 	struct ex_phy *phy = &ex->ex_phy[phy_id];
@@ -198,7 +198,7 @@ static void sas_set_ex_phy(struct domain_device *dev, int phy_id, void *rsp)
 		BUG_ON(!phy->phy);
 	}
 
-	switch (resp->result) {
+	switch (disc_resp->result) {
 	case SMP_RESP_PHY_VACANT:
 		phy->phy_state = PHY_VACANT;
 		break;
@@ -347,12 +347,13 @@ struct domain_device *sas_ex_to_ata(struct domain_device *ex_dev, int phy_id)
 }
 
 #define DISCOVER_REQ_SIZE  16
-#define DISCOVER_RESP_SIZE 56
+#define DISCOVER_RESP_SIZE sizeof(struct smp_disc_resp)
 
 static int sas_ex_phy_discover_helper(struct domain_device *dev, u8 *disc_req,
-				      u8 *disc_resp, int single)
+				      struct smp_disc_resp *disc_resp,
+				      int single)
 {
-	struct discover_resp *dr;
+	struct discover_resp *dr = &disc_resp->disc;
 	int res;
 
 	disc_req[9] = single;
@@ -361,7 +362,6 @@ static int sas_ex_phy_discover_helper(struct domain_device *dev, u8 *disc_req,
 			       disc_resp, DISCOVER_RESP_SIZE);
 	if (res)
 		return res;
-	dr = &((struct smp_resp *)disc_resp)->disc;
 	if (memcmp(dev->sas_addr, dr->attached_sas_addr, SAS_ADDR_SIZE) == 0) {
 		pr_notice("Found loopback topology, just ignore it!\n");
 		return 0;
@@ -375,7 +375,7 @@ int sas_ex_phy_discover(struct domain_device *dev, int single)
 	struct expander_device *ex = &dev->ex_dev;
 	int  res = 0;
 	u8   *disc_req;
-	u8   *disc_resp;
+	struct smp_disc_resp *disc_resp;
 
 	disc_req = alloc_smp_req(DISCOVER_REQ_SIZE);
 	if (!disc_req)
@@ -1657,7 +1657,7 @@ out_err:
 /* ---------- Domain revalidation ---------- */
 
 static int sas_get_phy_discover(struct domain_device *dev,
-				int phy_id, struct smp_resp *disc_resp)
+				int phy_id, struct smp_disc_resp *disc_resp)
 {
 	int res;
 	u8 *disc_req;
@@ -1673,10 +1673,8 @@ static int sas_get_phy_discover(struct domain_device *dev,
 			       disc_resp, DISCOVER_RESP_SIZE);
 	if (res)
 		goto out;
-	else if (disc_resp->result != SMP_RESP_FUNC_ACC) {
+	if (disc_resp->result != SMP_RESP_FUNC_ACC)
 		res = disc_resp->result;
-		goto out;
-	}
 out:
 	kfree(disc_req);
 	return res;
@@ -1686,7 +1684,7 @@ static int sas_get_phy_change_count(struct domain_device *dev,
 				    int phy_id, int *pcc)
 {
 	int res;
-	struct smp_resp *disc_resp;
+	struct smp_disc_resp *disc_resp;
 
 	disc_resp = alloc_smp_resp(DISCOVER_RESP_SIZE);
 	if (!disc_resp)
@@ -1704,19 +1702,17 @@ static int sas_get_phy_attached_dev(struct domain_device *dev, int phy_id,
 				    u8 *sas_addr, enum sas_device_type *type)
 {
 	int res;
-	struct smp_resp *disc_resp;
-	struct discover_resp *dr;
+	struct smp_disc_resp *disc_resp;
 
 	disc_resp = alloc_smp_resp(DISCOVER_RESP_SIZE);
 	if (!disc_resp)
 		return -ENOMEM;
-	dr = &disc_resp->disc;
 
 	res = sas_get_phy_discover(dev, phy_id, disc_resp);
 	if (res == 0) {
 		memcpy(sas_addr, disc_resp->disc.attached_sas_addr,
 		       SAS_ADDR_SIZE);
-		*type = to_dev_type(dr);
+		*type = to_dev_type(&disc_resp->disc);
 		if (*type == 0)
 			memset(sas_addr, 0, SAS_ADDR_SIZE);
 	}
diff --git a/include/scsi/sas.h b/include/scsi/sas.h
index acfc69fd72d0..b3ee9bd63277 100644
--- a/include/scsi/sas.h
+++ b/include/scsi/sas.h
@@ -471,18 +471,6 @@ struct report_phy_sata_resp {
 	__be32 crc;
 } __attribute__ ((packed));
 
-struct smp_resp {
-	u8    frame_type;
-	u8    function;
-	u8    result;
-	u8    reserved;
-	union {
-		struct report_general_resp  rg;
-		struct discover_resp        disc;
-		struct report_phy_sata_resp rps;
-	};
-} __attribute__ ((packed));
-
 #elif defined(__BIG_ENDIAN_BITFIELD)
 struct sas_identify_frame {
 	/* Byte 0 */
@@ -704,6 +692,18 @@ struct report_phy_sata_resp {
 	__be32 crc;
 } __attribute__ ((packed));
 
+#else
+#error "Bitfield order not defined!"
+#endif
+
+struct smp_disc_resp {
+	u8    frame_type;
+	u8    function;
+	u8    result;
+	u8    reserved;
+	struct discover_resp disc;
+} __attribute__ ((packed));
+
 struct smp_resp {
 	u8    frame_type;
 	u8    function;
@@ -716,8 +716,4 @@ struct smp_resp {
 	};
 } __attribute__ ((packed));
 
-#else
-#error "Bitfield order not defined!"
-#endif
-
 #endif /* _SAS_H_ */
-- 
cgit 


From 44f2bfe9ef082f76184d4a048c995729d14ec45d Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Date: Thu, 9 Jun 2022 11:24:55 +0900
Subject: scsi: libsas: Introduce struct smp_rg_resp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When compiling with gcc 12, several warnings are thrown by gcc when
compiling drivers/scsi/libsas/sas_expander.c, e.g.:

In function ‘sas_get_ex_change_count’,
    inlined from ‘sas_find_bcast_dev’ at
    drivers/scsi/libsas/sas_expander.c:1816:8:
drivers/scsi/libsas/sas_expander.c:1781:20: warning: array subscript
‘struct smp_resp[0]’ is partly outside array bounds of ‘unsigned
char[32]’ [-Warray-bounds]
 1781 |         if (rg_resp->result != SMP_RESP_FUNC_ACC) {
      |             ~~~~~~~^~~~~~~~

This is due to the use of the struct smp_resp to aggregate all possible
response types using a union but allocating a response buffer with a size
exactly equal to the size of the response type needed. This leads to access
to fields of struct smp_resp from an allocated memory area that is smaller
than the size of struct smp_resp.

Fix this by defining struct smp_rg_resp for sas report general responses.

Link: https://lore.kernel.org/r/20220609022456.409087-3-damien.lemoal@opensource.wdc.com
Reviewed-by: John Garry <john.garry@huawei.com>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/libsas/sas_expander.c | 31 +++++++++++++------------------
 include/scsi/sas.h                 |  8 ++++++++
 2 files changed, 21 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c
index fb998a8a7d3b..78a38980636e 100644
--- a/drivers/scsi/libsas/sas_expander.c
+++ b/drivers/scsi/libsas/sas_expander.c
@@ -429,27 +429,14 @@ static int sas_expander_discover(struct domain_device *dev)
 
 #define MAX_EXPANDER_PHYS 128
 
-static void ex_assign_report_general(struct domain_device *dev,
-					    struct smp_resp *resp)
-{
-	struct report_general_resp *rg = &resp->rg;
-
-	dev->ex_dev.ex_change_count = be16_to_cpu(rg->change_count);
-	dev->ex_dev.max_route_indexes = be16_to_cpu(rg->route_indexes);
-	dev->ex_dev.num_phys = min(rg->num_phys, (u8)MAX_EXPANDER_PHYS);
-	dev->ex_dev.t2t_supp = rg->t2t_supp;
-	dev->ex_dev.conf_route_table = rg->conf_route_table;
-	dev->ex_dev.configuring = rg->configuring;
-	memcpy(dev->ex_dev.enclosure_logical_id, rg->enclosure_logical_id, 8);
-}
-
 #define RG_REQ_SIZE   8
-#define RG_RESP_SIZE 32
+#define RG_RESP_SIZE  sizeof(struct smp_rg_resp)
 
 static int sas_ex_general(struct domain_device *dev)
 {
 	u8 *rg_req;
-	struct smp_resp *rg_resp;
+	struct smp_rg_resp *rg_resp;
+	struct report_general_resp *rg;
 	int res;
 	int i;
 
@@ -480,7 +467,15 @@ static int sas_ex_general(struct domain_device *dev)
 			goto out;
 		}
 
-		ex_assign_report_general(dev, rg_resp);
+		rg = &rg_resp->rg;
+		dev->ex_dev.ex_change_count = be16_to_cpu(rg->change_count);
+		dev->ex_dev.max_route_indexes = be16_to_cpu(rg->route_indexes);
+		dev->ex_dev.num_phys = min(rg->num_phys, (u8)MAX_EXPANDER_PHYS);
+		dev->ex_dev.t2t_supp = rg->t2t_supp;
+		dev->ex_dev.conf_route_table = rg->conf_route_table;
+		dev->ex_dev.configuring = rg->configuring;
+		memcpy(dev->ex_dev.enclosure_logical_id,
+		       rg->enclosure_logical_id, 8);
 
 		if (dev->ex_dev.configuring) {
 			pr_debug("RG: ex %016llx self-configuring...\n",
@@ -1756,7 +1751,7 @@ static int sas_get_ex_change_count(struct domain_device *dev, int *ecc)
 {
 	int res;
 	u8  *rg_req;
-	struct smp_resp  *rg_resp;
+	struct smp_rg_resp  *rg_resp;
 
 	rg_req = alloc_smp_req(RG_REQ_SIZE);
 	if (!rg_req)
diff --git a/include/scsi/sas.h b/include/scsi/sas.h
index b3ee9bd63277..a8f9743ed6fc 100644
--- a/include/scsi/sas.h
+++ b/include/scsi/sas.h
@@ -696,6 +696,14 @@ struct report_phy_sata_resp {
 #error "Bitfield order not defined!"
 #endif
 
+struct smp_rg_resp {
+	u8    frame_type;
+	u8    function;
+	u8    result;
+	u8    reserved;
+	struct report_general_resp rg;
+} __attribute__ ((packed));
+
 struct smp_disc_resp {
 	u8    frame_type;
 	u8    function;
-- 
cgit 


From 3dafe0648ddd9c40666069e90b8a8a6162c452aa Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Date: Thu, 9 Jun 2022 11:24:56 +0900
Subject: scsi: libsas: Introduce struct smp_rps_resp

Similarly to sas report general and discovery responses, define the
structure struct smp_rps_resp to handle SATA PHY report responses using a
structure with a size that is exactly equal to the sas defined response
size.

With this change, struct smp_resp becomes unused and is removed.

Link: https://lore.kernel.org/r/20220609022456.409087-4-damien.lemoal@opensource.wdc.com
Reviewed-by: John Garry <john.garry@huawei.com>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/aic94xx/aic94xx_dev.c | 2 +-
 drivers/scsi/libsas/sas_expander.c | 4 ++--
 drivers/scsi/libsas/sas_internal.h | 2 +-
 include/scsi/libsas.h              | 2 +-
 include/scsi/sas.h                 | 8 ++------
 5 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/aic94xx/aic94xx_dev.c b/drivers/scsi/aic94xx/aic94xx_dev.c
index 73506a459bf8..91d196f26b76 100644
--- a/drivers/scsi/aic94xx/aic94xx_dev.c
+++ b/drivers/scsi/aic94xx/aic94xx_dev.c
@@ -159,7 +159,7 @@ static int asd_init_target_ddb(struct domain_device *dev)
 		flags |= OPEN_REQUIRED;
 		if ((dev->dev_type == SAS_SATA_DEV) ||
 		    (dev->tproto & SAS_PROTOCOL_STP)) {
-			struct smp_resp *rps_resp = &dev->sata_dev.rps_resp;
+			struct smp_rps_resp *rps_resp = &dev->sata_dev.rps_resp;
 			if (rps_resp->frame_type == SMP_RESPONSE &&
 			    rps_resp->function == SMP_REPORT_PHY_SATA &&
 			    rps_resp->result == SMP_RESP_FUNC_ACC) {
diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c
index 78a38980636e..fa2209080cc2 100644
--- a/drivers/scsi/libsas/sas_expander.c
+++ b/drivers/scsi/libsas/sas_expander.c
@@ -676,10 +676,10 @@ int sas_smp_get_phy_events(struct sas_phy *phy)
 #ifdef CONFIG_SCSI_SAS_ATA
 
 #define RPS_REQ_SIZE  16
-#define RPS_RESP_SIZE 60
+#define RPS_RESP_SIZE sizeof(struct smp_rps_resp)
 
 int sas_get_report_phy_sata(struct domain_device *dev, int phy_id,
-			    struct smp_resp *rps_resp)
+			    struct smp_rps_resp *rps_resp)
 {
 	int res;
 	u8 *rps_req = alloc_smp_req(RPS_REQ_SIZE);
diff --git a/drivers/scsi/libsas/sas_internal.h b/drivers/scsi/libsas/sas_internal.h
index 13d0ffaada93..8d0ad3abc7b5 100644
--- a/drivers/scsi/libsas/sas_internal.h
+++ b/drivers/scsi/libsas/sas_internal.h
@@ -83,7 +83,7 @@ struct domain_device *sas_find_dev_by_rphy(struct sas_rphy *rphy);
 struct domain_device *sas_ex_to_ata(struct domain_device *ex_dev, int phy_id);
 int sas_ex_phy_discover(struct domain_device *dev, int single);
 int sas_get_report_phy_sata(struct domain_device *dev, int phy_id,
-			    struct smp_resp *rps_resp);
+			    struct smp_rps_resp *rps_resp);
 int sas_try_ata_reset(struct asd_sas_phy *phy);
 void sas_hae_reset(struct work_struct *work);
 
diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h
index ff04eb6d250b..2dbead74a2af 100644
--- a/include/scsi/libsas.h
+++ b/include/scsi/libsas.h
@@ -145,7 +145,7 @@ struct sata_device {
 
 	struct ata_port *ap;
 	struct ata_host *ata_host;
-	struct smp_resp rps_resp ____cacheline_aligned; /* report_phy_sata_resp */
+	struct smp_rps_resp rps_resp ____cacheline_aligned; /* report_phy_sata_resp */
 	u8     fis[ATA_RESP_FIS_SIZE];
 };
 
diff --git a/include/scsi/sas.h b/include/scsi/sas.h
index a8f9743ed6fc..71b749bed3b0 100644
--- a/include/scsi/sas.h
+++ b/include/scsi/sas.h
@@ -712,16 +712,12 @@ struct smp_disc_resp {
 	struct discover_resp disc;
 } __attribute__ ((packed));
 
-struct smp_resp {
+struct smp_rps_resp {
 	u8    frame_type;
 	u8    function;
 	u8    result;
 	u8    reserved;
-	union {
-		struct report_general_resp  rg;
-		struct discover_resp        disc;
-		struct report_phy_sata_resp rps;
-	};
+	struct report_phy_sata_resp rps;
 } __attribute__ ((packed));
 
 #endif /* _SAS_H_ */
-- 
cgit 


From 36518b6b4da7e8d4387bc19ad21e772f1060e9d7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 7 Jun 2022 16:04:03 -0400
Subject: teach iomap_dio_rw() to suppress dsync

New flag, equivalent to removal of IOCB_DSYNC from iocb flags.
This mimics what btrfs is doing (and that's what btrfs will
switch to).  However, I'm not at all sure that we want to
suppress REQ_FUA for those - all btrfs hack really cares about
is suppression of generic_write_sync().  For now let's keep
the existing behaviour, but I really want to hear more detailed
arguments pro or contra.

[folded brain fix from willy]

Suggested-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/iomap/direct-io.c  | 20 +++++++++++---------
 include/linux/iomap.h |  6 ++++++
 2 files changed, 17 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 370c3241618a..c10c69e2de24 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -548,17 +548,19 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		}
 
 		/* for data sync or sync, we need sync completion processing */
-		if (iocb->ki_flags & IOCB_DSYNC)
+		if (iocb->ki_flags & IOCB_DSYNC &&
+		    !(dio_flags & IOMAP_DIO_NOSYNC)) {
 			dio->flags |= IOMAP_DIO_NEED_SYNC;
 
-		/*
-		 * For datasync only writes, we optimistically try using FUA for
-		 * this IO.  Any non-FUA write that occurs will clear this flag,
-		 * hence we know before completion whether a cache flush is
-		 * necessary.
-		 */
-		if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
-			dio->flags |= IOMAP_DIO_WRITE_FUA;
+		       /*
+			* For datasync only writes, we optimistically try
+			* using FUA for this IO.  Any non-FUA write that
+			* occurs will clear this flag, hence we know before
+			* completion whether a cache flush is necessary.
+			*/
+			if (!(iocb->ki_flags & IOCB_SYNC))
+				dio->flags |= IOMAP_DIO_WRITE_FUA;
+		}
 	}
 
 	if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index e552097c67e0..c8622d8f064e 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -353,6 +353,12 @@ struct iomap_dio_ops {
  */
 #define IOMAP_DIO_PARTIAL		(1 << 2)
 
+/*
+ * The caller will sync the write if needed; do not sync it within
+ * iomap_dio_rw.  Overrides IOMAP_DIO_FORCE_WAIT.
+ */
+#define IOMAP_DIO_NOSYNC		(1 << 3)
+
 ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
 		unsigned int dio_flags, void *private, size_t done_before);
-- 
cgit 


From e87f2c26c8085dac59978dee1beeb05cef31a9dd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 22 May 2022 09:28:12 -0400
Subject: struct file: use anonymous union member for rcuhead and llist

Once upon a time we couldn't afford anon unions; these days minimal
gcc version had been raised enough to take care of that.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file_table.c    | 16 ++++++++--------
 include/linux/fs.h |  6 +++---
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/fs/file_table.c b/fs/file_table.c
index 5424e3a8df5f..b989e33aacda 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -45,7 +45,7 @@ static struct percpu_counter nr_files __cacheline_aligned_in_smp;
 
 static void file_free_rcu(struct rcu_head *head)
 {
-	struct file *f = container_of(head, struct file, f_u.fu_rcuhead);
+	struct file *f = container_of(head, struct file, f_rcuhead);
 
 	put_cred(f->f_cred);
 	kmem_cache_free(filp_cachep, f);
@@ -56,7 +56,7 @@ static inline void file_free(struct file *f)
 	security_file_free(f);
 	if (!(f->f_mode & FMODE_NOACCOUNT))
 		percpu_counter_dec(&nr_files);
-	call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
+	call_rcu(&f->f_rcuhead, file_free_rcu);
 }
 
 /*
@@ -142,7 +142,7 @@ static struct file *__alloc_file(int flags, const struct cred *cred)
 	f->f_cred = get_cred(cred);
 	error = security_file_alloc(f);
 	if (unlikely(error)) {
-		file_free_rcu(&f->f_u.fu_rcuhead);
+		file_free_rcu(&f->f_rcuhead);
 		return ERR_PTR(error);
 	}
 
@@ -341,13 +341,13 @@ static void delayed_fput(struct work_struct *unused)
 	struct llist_node *node = llist_del_all(&delayed_fput_list);
 	struct file *f, *t;
 
-	llist_for_each_entry_safe(f, t, node, f_u.fu_llist)
+	llist_for_each_entry_safe(f, t, node, f_llist)
 		__fput(f);
 }
 
 static void ____fput(struct callback_head *work)
 {
-	__fput(container_of(work, struct file, f_u.fu_rcuhead));
+	__fput(container_of(work, struct file, f_rcuhead));
 }
 
 /*
@@ -374,8 +374,8 @@ void fput(struct file *file)
 		struct task_struct *task = current;
 
 		if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
-			init_task_work(&file->f_u.fu_rcuhead, ____fput);
-			if (!task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME))
+			init_task_work(&file->f_rcuhead, ____fput);
+			if (!task_work_add(task, &file->f_rcuhead, TWA_RESUME))
 				return;
 			/*
 			 * After this task has run exit_task_work(),
@@ -384,7 +384,7 @@ void fput(struct file *file)
 			 */
 		}
 
-		if (llist_add(&file->f_u.fu_llist, &delayed_fput_list))
+		if (llist_add(&file->f_llist, &delayed_fput_list))
 			schedule_delayed_work(&delayed_fput_work, 1);
 	}
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ad5e3520fae..6a2a4906041f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -924,9 +924,9 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
 
 struct file {
 	union {
-		struct llist_node	fu_llist;
-		struct rcu_head 	fu_rcuhead;
-	} f_u;
+		struct llist_node	f_llist;
+		struct rcu_head 	f_rcuhead;
+	};
 	struct path		f_path;
 	struct inode		*f_inode;	/* cached value */
 	const struct file_operations	*f_op;
-- 
cgit 


From 91b94c5d6ae55d1161633047ffeea644b110b35f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 22 May 2022 09:39:27 -0400
Subject: iocb: delay evaluation of IS_SYNC(...) until we want to check
 IOCB_DSYNC

New helper to be used instead of direct checks for IOCB_DSYNC:
iocb_is_dsync(iocb).  Checks converted, which allows to avoid
the IS_SYNC(iocb->ki_filp->f_mapping->host) part (4 cache lines)
from iocb_flags() - it's checked in iocb_is_dsync() instead

Reviewed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/fops.c         |  2 +-
 fs/btrfs/file.c      |  2 +-
 fs/direct-io.c       |  2 +-
 fs/fuse/file.c       |  2 +-
 fs/iomap/direct-io.c |  3 +--
 fs/zonefs/super.c    |  2 +-
 include/linux/fs.h   | 10 ++++++++--
 7 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/block/fops.c b/block/fops.c
index d6b3276a6c68..6e86931ab847 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -37,7 +37,7 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb)
 	unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
 
 	/* avoid the need for a I/O completion work item */
-	if (iocb->ki_flags & IOCB_DSYNC)
+	if (iocb_is_dsync(iocb))
 		op |= REQ_FUA;
 	return op;
 }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 98f81e304eb1..54358a5c9d56 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2021,7 +2021,7 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
 	struct file *file = iocb->ki_filp;
 	struct btrfs_inode *inode = BTRFS_I(file_inode(file));
 	ssize_t num_written, num_sync;
-	const bool sync = iocb->ki_flags & IOCB_DSYNC;
+	const bool sync = iocb_is_dsync(iocb);
 
 	/*
 	 * If the fs flips readonly due to some impossible error, although we
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 840752006f60..39647eb56904 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1210,7 +1210,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 	 */
 	if (dio->is_async && iov_iter_rw(iter) == WRITE) {
 		retval = 0;
-		if (iocb->ki_flags & IOCB_DSYNC)
+		if (iocb_is_dsync(iocb))
 			retval = dio_set_defer_completion(dio);
 		else if (!dio->inode->i_sb->s_dio_done_wq) {
 			/*
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 05caa2b9272e..00fa861aeead 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1042,7 +1042,7 @@ static unsigned int fuse_write_flags(struct kiocb *iocb)
 {
 	unsigned int flags = iocb->ki_filp->f_flags;
 
-	if (iocb->ki_flags & IOCB_DSYNC)
+	if (iocb_is_dsync(iocb))
 		flags |= O_DSYNC;
 	if (iocb->ki_flags & IOCB_SYNC)
 		flags |= O_SYNC;
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index c10c69e2de24..31c7f1035b20 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -548,8 +548,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		}
 
 		/* for data sync or sync, we need sync completion processing */
-		if (iocb->ki_flags & IOCB_DSYNC &&
-		    !(dio_flags & IOMAP_DIO_NOSYNC)) {
+		if (iocb_is_dsync(iocb) && !(dio_flags & IOMAP_DIO_NOSYNC)) {
 			dio->flags |= IOMAP_DIO_NEED_SYNC;
 
 		       /*
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index bcb21aea990a..04a98b4cd7ee 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -746,7 +746,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
 			REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS);
 	bio->bi_iter.bi_sector = zi->i_zsector;
 	bio->bi_ioprio = iocb->ki_ioprio;
-	if (iocb->ki_flags & IOCB_DSYNC)
+	if (iocb_is_dsync(iocb))
 		bio->bi_opf |= REQ_FUA;
 
 	ret = bio_iov_iter_get_pages(bio, from);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6a2a4906041f..380a1292f4f9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2720,6 +2720,12 @@ extern int vfs_fsync(struct file *file, int datasync);
 extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
 				unsigned int flags);
 
+static inline bool iocb_is_dsync(const struct kiocb *iocb)
+{
+	return (iocb->ki_flags & IOCB_DSYNC) ||
+		IS_SYNC(iocb->ki_filp->f_mapping->host);
+}
+
 /*
  * Sync the bytes written if this was a synchronous write.  Expect ki_pos
  * to already be updated for the write, and will return either the amount
@@ -2727,7 +2733,7 @@ extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
  */
 static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
 {
-	if (iocb->ki_flags & IOCB_DSYNC) {
+	if (iocb_is_dsync(iocb)) {
 		int ret = vfs_fsync_range(iocb->ki_filp,
 				iocb->ki_pos - count, iocb->ki_pos - 1,
 				(iocb->ki_flags & IOCB_SYNC) ? 0 : 1);
@@ -3262,7 +3268,7 @@ static inline int iocb_flags(struct file *file)
 		res |= IOCB_APPEND;
 	if (file->f_flags & O_DIRECT)
 		res |= IOCB_DIRECT;
-	if ((file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host))
+	if (file->f_flags & O_DSYNC)
 		res |= IOCB_DSYNC;
 	if (file->f_flags & __O_SYNC)
 		res |= IOCB_SYNC;
-- 
cgit 


From 164f4064ca81eefcea29f7f5dcf394f92be1d0c0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 22 May 2022 11:38:11 -0400
Subject: keep iocb_flags() result cached in struct file

* calculate at the time we set FMODE_OPENED (do_dentry_open() for normal
opens, alloc_file() for pipe()/socket()/etc.)
* update when handling F_SETFL
* keep in a new field - file->f_iocb_flags; since that thing is needed only
before the refcount reaches zero, we can put it into the same anon union
where ->f_rcuhead and ->f_llist live - those are used only after refcount
reaches zero.

Reviewed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/nvme/target/io-cmd-file.c | 2 +-
 fs/aio.c                          | 2 +-
 fs/fcntl.c                        | 1 +
 fs/file_table.c                   | 1 +
 fs/io_uring.c                     | 2 +-
 fs/open.c                         | 1 +
 include/linux/fs.h                | 5 ++---
 7 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
index f3d58abf11e0..64b47e2a4633 100644
--- a/drivers/nvme/target/io-cmd-file.c
+++ b/drivers/nvme/target/io-cmd-file.c
@@ -112,7 +112,7 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos,
 
 	iocb->ki_pos = pos;
 	iocb->ki_filp = req->ns->file;
-	iocb->ki_flags = ki_flags | iocb_flags(req->ns->file);
+	iocb->ki_flags = ki_flags | iocb->ki_filp->f_iocb_flags;
 
 	return call_iter(iocb, &iter);
 }
diff --git a/fs/aio.c b/fs/aio.c
index 3c249b938632..2bdd444d408b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1475,7 +1475,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
 	req->ki_complete = aio_complete_rw;
 	req->private = NULL;
 	req->ki_pos = iocb->aio_offset;
-	req->ki_flags = iocb_flags(req->ki_filp);
+	req->ki_flags = req->ki_filp->f_iocb_flags;
 	if (iocb->aio_flags & IOCB_FLAG_RESFD)
 		req->ki_flags |= IOCB_EVENTFD;
 	if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 34a3faa4886d..146c9ab0cd4b 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -78,6 +78,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 	}
 	spin_lock(&filp->f_lock);
 	filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
+	filp->f_iocb_flags = iocb_flags(filp);
 	spin_unlock(&filp->f_lock);
 
  out:
diff --git a/fs/file_table.c b/fs/file_table.c
index b989e33aacda..905792b0521c 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -241,6 +241,7 @@ static struct file *alloc_file(const struct path *path, int flags,
 	if ((file->f_mode & FMODE_WRITE) &&
 	     likely(fop->write || fop->write_iter))
 		file->f_mode |= FMODE_CAN_WRITE;
+	file->f_iocb_flags = iocb_flags(file);
 	file->f_mode |= FMODE_OPENED;
 	file->f_op = fop;
 	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3aab4182fd89..53424b1f019f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4330,7 +4330,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
 	if (!io_req_ffs_set(req))
 		req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
 
-	kiocb->ki_flags = iocb_flags(file);
+	kiocb->ki_flags = file->f_iocb_flags;
 	ret = kiocb_set_rw_flags(kiocb, req->rw.flags);
 	if (unlikely(ret))
 		return ret;
diff --git a/fs/open.c b/fs/open.c
index 1d57fbde2feb..d80441a0bf17 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -862,6 +862,7 @@ static int do_dentry_open(struct file *f,
 		f->f_mode |= FMODE_CAN_ODIRECT;
 
 	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
+	f->f_iocb_flags = iocb_flags(f);
 
 	file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 380a1292f4f9..c82b9d442f56 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -926,6 +926,7 @@ struct file {
 	union {
 		struct llist_node	f_llist;
 		struct rcu_head 	f_rcuhead;
+		unsigned int 		f_iocb_flags;
 	};
 	struct path		f_path;
 	struct inode		*f_inode;	/* cached value */
@@ -2199,13 +2200,11 @@ static inline bool HAS_UNMAPPED_ID(struct user_namespace *mnt_userns,
 	       !gid_valid(i_gid_into_mnt(mnt_userns, inode));
 }
 
-static inline int iocb_flags(struct file *file);
-
 static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
 {
 	*kiocb = (struct kiocb) {
 		.ki_filp = filp,
-		.ki_flags = iocb_flags(filp),
+		.ki_flags = filp->f_iocb_flags,
 		.ki_ioprio = get_current_ioprio(),
 	};
 }
-- 
cgit 


From e70f3c701276a8825414dc650c4f76141a8704b4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 8 Jun 2022 23:34:06 -0700
Subject: Revert "net: set SK_MEM_QUANTUM to 4096"

This reverts commit bd68a2a854ad5a85f0c8d0a9c8048ca3f6391efb.

This change broke memcg on arches with PAGE_SIZE != 4096

Later, commit 2bb2f5fb21b04 ("net: add new socket option SO_RESERVE_MEM")
also assumed PAGE_SIZE==SK_MEM_QUANTUM

Following patches in the series will greatly reduce the over allocations
problem.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 657873e2d90f..5c5265269899 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1532,25 +1532,15 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind);
 void __sk_mem_reduce_allocated(struct sock *sk, int amount);
 void __sk_mem_reclaim(struct sock *sk, int amount);
 
-/* We used to have PAGE_SIZE here, but systems with 64KB pages
- * do not necessarily have 16x time more memory than 4KB ones.
- */
-#define SK_MEM_QUANTUM 4096
+#define SK_MEM_QUANTUM ((int)PAGE_SIZE)
 #define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM)
 #define SK_MEM_SEND	0
 #define SK_MEM_RECV	1
 
-/* sysctl_mem values are in pages, we convert them in SK_MEM_QUANTUM units */
+/* sysctl_mem values are in pages */
 static inline long sk_prot_mem_limits(const struct sock *sk, int index)
 {
-	long val = sk->sk_prot->sysctl_mem[index];
-
-#if PAGE_SIZE > SK_MEM_QUANTUM
-	val <<= PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT;
-#elif PAGE_SIZE < SK_MEM_QUANTUM
-	val >>= SK_MEM_QUANTUM_SHIFT - PAGE_SHIFT;
-#endif
-	return val;
+	return sk->sk_prot->sysctl_mem[index];
 }
 
 static inline int sk_mem_pages(int amt)
-- 
cgit 


From 100fdd1faf50557558e2911af4be32e515cb8036 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 8 Jun 2022 23:34:07 -0700
Subject: net: remove SK_MEM_QUANTUM and SK_MEM_QUANTUM_SHIFT

Due to memcg interface, SK_MEM_QUANTUM is effectively PAGE_SIZE.

This might change in the future, but it seems better to avoid the
confusion.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h    |  8 +++-----
 net/core/sock.c       | 16 ++++++++--------
 net/ipv4/tcp.c        |  4 ++--
 net/ipv4/tcp_input.c  |  2 +-
 net/ipv4/tcp_output.c |  2 +-
 net/ipv4/udp.c        | 10 +++++-----
 net/mptcp/protocol.c  |  8 ++++----
 net/sctp/protocol.c   |  4 ++--
 8 files changed, 26 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 5c5265269899..298897bbfb3a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1532,8 +1532,6 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind);
 void __sk_mem_reduce_allocated(struct sock *sk, int amount);
 void __sk_mem_reclaim(struct sock *sk, int amount);
 
-#define SK_MEM_QUANTUM ((int)PAGE_SIZE)
-#define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM)
 #define SK_MEM_SEND	0
 #define SK_MEM_RECV	1
 
@@ -1545,7 +1543,7 @@ static inline long sk_prot_mem_limits(const struct sock *sk, int index)
 
 static inline int sk_mem_pages(int amt)
 {
-	return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT;
+	return (amt + PAGE_SIZE - 1) >> PAGE_SHIFT;
 }
 
 static inline bool sk_has_account(struct sock *sk)
@@ -1594,7 +1592,7 @@ static inline void sk_mem_reclaim(struct sock *sk)
 
 	reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);
 
-	if (reclaimable >= SK_MEM_QUANTUM)
+	if (reclaimable >= (int)PAGE_SIZE)
 		__sk_mem_reclaim(sk, reclaimable);
 }
 
@@ -1613,7 +1611,7 @@ static inline void sk_mem_reclaim_partial(struct sock *sk)
 
 	reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);
 
-	if (reclaimable > SK_MEM_QUANTUM)
+	if (reclaimable > (int)PAGE_SIZE)
 		__sk_mem_reclaim(sk, reclaimable - 1);
 }
 
diff --git a/net/core/sock.c b/net/core/sock.c
index f5062d9e1222..063fd7680a84 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -991,7 +991,7 @@ EXPORT_SYMBOL(sock_set_mark);
 static void sock_release_reserved_memory(struct sock *sk, int bytes)
 {
 	/* Round down bytes to multiple of pages */
-	bytes &= ~(SK_MEM_QUANTUM - 1);
+	bytes = round_down(bytes, PAGE_SIZE);
 
 	WARN_ON(bytes > sk->sk_reserved_mem);
 	sk->sk_reserved_mem -= bytes;
@@ -1028,9 +1028,9 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
 		return -ENOMEM;
 	}
-	sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT;
+	sk->sk_forward_alloc += pages << PAGE_SHIFT;
 
-	sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT;
+	sk->sk_reserved_mem += pages << PAGE_SHIFT;
 
 	return 0;
 }
@@ -3003,10 +3003,10 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
 {
 	int ret, amt = sk_mem_pages(size);
 
-	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
+	sk->sk_forward_alloc += amt << PAGE_SHIFT;
 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
 	if (!ret)
-		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
+		sk->sk_forward_alloc -= amt << PAGE_SHIFT;
 	return ret;
 }
 EXPORT_SYMBOL(__sk_mem_schedule);
@@ -3034,12 +3034,12 @@ EXPORT_SYMBOL(__sk_mem_reduce_allocated);
 /**
  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
  *	@sk: socket
- *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
+ *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
  */
 void __sk_mem_reclaim(struct sock *sk, int amount)
 {
-	amount >>= SK_MEM_QUANTUM_SHIFT;
-	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
+	amount >>= PAGE_SHIFT;
+	sk->sk_forward_alloc -= amount << PAGE_SHIFT;
 	__sk_mem_reduce_allocated(sk, amount);
 }
 EXPORT_SYMBOL(__sk_mem_reclaim);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9984d23a7f3e..9e696758a4c2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4661,11 +4661,11 @@ void __init tcp_init(void)
 	max_wshare = min(4UL*1024*1024, limit);
 	max_rshare = min(6UL*1024*1024, limit);
 
-	init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
+	init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
 	init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
 	init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
 
-	init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
+	init_net.ipv4.sysctl_tcp_rmem[0] = PAGE_SIZE;
 	init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
 	init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2e2a9ece9af2..3fb117022558 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5287,7 +5287,7 @@ new_range:
 		    before(TCP_SKB_CB(skb)->end_seq, start)) {
 			/* Do not attempt collapsing tiny skbs */
 			if (range_truesize != head->truesize ||
-			    end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
+			    end - start >= SKB_WITH_OVERHEAD(PAGE_SIZE)) {
 				tcp_collapse(sk, NULL, &tp->out_of_order_queue,
 					     head, skb, start, end);
 			} else {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 1c054431e358..8ab98e1aca67 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3367,7 +3367,7 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
 	if (size <= sk->sk_forward_alloc)
 		return;
 	amt = sk_mem_pages(size);
-	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
+	sk->sk_forward_alloc += amt << PAGE_SHIFT;
 	sk_memory_allocated_add(sk, amt);
 
 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index aa9f2ec3dc46..bbc9970fa2e9 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1461,11 +1461,11 @@ static void udp_rmem_release(struct sock *sk, int size, int partial,
 
 
 	sk->sk_forward_alloc += size;
-	amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1);
+	amt = (sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1);
 	sk->sk_forward_alloc -= amt;
 
 	if (amt)
-		__sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT);
+		__sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT);
 
 	atomic_sub(size, &sk->sk_rmem_alloc);
 
@@ -1558,7 +1558,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 	spin_lock(&list->lock);
 	if (size >= sk->sk_forward_alloc) {
 		amt = sk_mem_pages(size);
-		delta = amt << SK_MEM_QUANTUM_SHIFT;
+		delta = amt << PAGE_SHIFT;
 		if (!__sk_mem_raise_allocated(sk, delta, amt, SK_MEM_RECV)) {
 			err = -ENOBUFS;
 			spin_unlock(&list->lock);
@@ -3263,8 +3263,8 @@ EXPORT_SYMBOL(udp_flow_hashrnd);
 
 static void __udp_sysctl_init(struct net *net)
 {
-	net->ipv4.sysctl_udp_rmem_min = SK_MEM_QUANTUM;
-	net->ipv4.sysctl_udp_wmem_min = SK_MEM_QUANTUM;
+	net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE;
+	net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE;
 
 #ifdef CONFIG_NET_L3_MASTER_DEV
 	net->ipv4.sysctl_udp_l3mdev_accept = 0;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 17e13396024a..080a630d6902 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -167,8 +167,8 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
 
 static void __mptcp_rmem_reclaim(struct sock *sk, int amount)
 {
-	amount >>= SK_MEM_QUANTUM_SHIFT;
-	mptcp_sk(sk)->rmem_fwd_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
+	amount >>= PAGE_SHIFT;
+	mptcp_sk(sk)->rmem_fwd_alloc -= amount << PAGE_SHIFT;
 	__sk_mem_reduce_allocated(sk, amount);
 }
 
@@ -327,7 +327,7 @@ static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size)
 		return true;
 
 	amt = sk_mem_pages(size);
-	amount = amt << SK_MEM_QUANTUM_SHIFT;
+	amount = amt << PAGE_SHIFT;
 	msk->rmem_fwd_alloc += amount;
 	if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) {
 		if (ssk->sk_forward_alloc < amount) {
@@ -972,7 +972,7 @@ static void __mptcp_mem_reclaim_partial(struct sock *sk)
 
 	lockdep_assert_held_once(&sk->sk_lock.slock);
 
-	if (reclaimable > SK_MEM_QUANTUM)
+	if (reclaimable > (int)PAGE_SIZE)
 		__mptcp_rmem_reclaim(sk, reclaimable - 1);
 
 	sk_mem_reclaim_partial(sk);
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 35928fefae33..fa500ea3a1f1 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1523,11 +1523,11 @@ static __init int sctp_init(void)
 	limit = (sysctl_sctp_mem[1]) << (PAGE_SHIFT - 7);
 	max_share = min(4UL*1024*1024, limit);
 
-	sysctl_sctp_rmem[0] = SK_MEM_QUANTUM; /* give each asoc 1 page min */
+	sysctl_sctp_rmem[0] = PAGE_SIZE; /* give each asoc 1 page min */
 	sysctl_sctp_rmem[1] = 1500 * SKB_TRUESIZE(1);
 	sysctl_sctp_rmem[2] = max(sysctl_sctp_rmem[1], max_share);
 
-	sysctl_sctp_wmem[0] = SK_MEM_QUANTUM;
+	sysctl_sctp_wmem[0] = PAGE_SIZE;
 	sysctl_sctp_wmem[1] = 16*1024;
 	sysctl_sctp_wmem[2] = max(64*1024, max_share);
 
-- 
cgit 


From 0defbb0af775ef037913786048d099bbe8b9a2c2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 8 Jun 2022 23:34:08 -0700
Subject: net: add per_cpu_fw_alloc field to struct proto

Each protocol having a ->memory_allocated pointer gets a corresponding
per-cpu reserve, that following patches will use.

Instead of having reserved bytes per socket,
we want to have per-cpu reserves.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h     | 1 +
 include/net/tcp.h      | 2 ++
 include/net/udp.h      | 1 +
 net/core/sock.c        | 4 ++++
 net/decnet/af_decnet.c | 4 ++++
 net/ipv4/tcp.c         | 2 ++
 net/ipv4/tcp_ipv4.c    | 3 +++
 net/ipv4/udp.c         | 4 ++++
 net/ipv4/udplite.c     | 3 +++
 net/ipv6/tcp_ipv6.c    | 3 +++
 net/ipv6/udp.c         | 3 +++
 net/ipv6/udplite.c     | 3 +++
 net/mptcp/protocol.c   | 3 +++
 net/sctp/socket.c      | 7 +++++++
 14 files changed, 43 insertions(+)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 298897bbfb3a..825f8cbf791f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1254,6 +1254,7 @@ struct proto {
 	void			(*enter_memory_pressure)(struct sock *sk);
 	void			(*leave_memory_pressure)(struct sock *sk);
 	atomic_long_t		*memory_allocated;	/* Current allocated memory. */
+	int  __percpu		*per_cpu_fw_alloc;
 	struct percpu_counter	*sockets_allocated;	/* Current number of sockets. */
 
 	/*
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 1e99f5c61f84..4794cae4577e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -253,6 +253,8 @@ extern long sysctl_tcp_mem[3];
 #define TCP_RACK_NO_DUPTHRESH    0x4 /* Do not use DUPACK threshold in RACK */
 
 extern atomic_long_t tcp_memory_allocated;
+DECLARE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
+
 extern struct percpu_counter tcp_sockets_allocated;
 extern unsigned long tcp_memory_pressure;
 
diff --git a/include/net/udp.h b/include/net/udp.h
index b83a00330566..b60eea2e3fae 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -95,6 +95,7 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
 extern struct proto udp_prot;
 
 extern atomic_long_t udp_memory_allocated;
+DECLARE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);
 
 /* sysctl variables for udp */
 extern long sysctl_udp_mem[3];
diff --git a/net/core/sock.c b/net/core/sock.c
index 063fd7680a84..f96efc95e334 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3798,6 +3798,10 @@ int proto_register(struct proto *prot, int alloc_slab)
 		pr_err("%s: missing sysctl_mem\n", prot->name);
 		return -EINVAL;
 	}
+	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
+		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
+		return -EINVAL;
+	}
 	if (alloc_slab) {
 		prot->slab = kmem_cache_create_usercopy(prot->name,
 					prot->obj_size, 0,
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index dc92a67baea3..aa4f43f52499 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -149,6 +149,7 @@ static DEFINE_RWLOCK(dn_hash_lock);
 static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE];
 static struct hlist_head dn_wild_sk;
 static atomic_long_t decnet_memory_allocated;
+static DEFINE_PER_CPU(int, decnet_memory_per_cpu_fw_alloc);
 
 static int __dn_setsockopt(struct socket *sock, int level, int optname,
 		sockptr_t optval, unsigned int optlen, int flags);
@@ -454,7 +455,10 @@ static struct proto dn_proto = {
 	.owner			= THIS_MODULE,
 	.enter_memory_pressure	= dn_enter_memory_pressure,
 	.memory_pressure	= &dn_memory_pressure,
+
 	.memory_allocated	= &decnet_memory_allocated,
+	.per_cpu_fw_alloc	= &decnet_memory_per_cpu_fw_alloc,
+
 	.sysctl_mem		= sysctl_decnet_mem,
 	.sysctl_wmem		= sysctl_decnet_wmem,
 	.sysctl_rmem		= sysctl_decnet_rmem,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9e696758a4c2..e6bdf8e2c09a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -294,6 +294,8 @@ EXPORT_SYMBOL(sysctl_tcp_mem);
 
 atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp;	/* Current allocated memory. */
 EXPORT_SYMBOL(tcp_memory_allocated);
+DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
+EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc);
 
 #if IS_ENABLED(CONFIG_SMC)
 DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fe8f23b95d32..fda811a5251f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3045,7 +3045,10 @@ struct proto tcp_prot = {
 	.stream_memory_free	= tcp_stream_memory_free,
 	.sockets_allocated	= &tcp_sockets_allocated,
 	.orphan_count		= &tcp_orphan_count,
+
 	.memory_allocated	= &tcp_memory_allocated,
+	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
+
 	.memory_pressure	= &tcp_memory_pressure,
 	.sysctl_mem		= sysctl_tcp_mem,
 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index bbc9970fa2e9..6172b4750a88 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -125,6 +125,8 @@ EXPORT_SYMBOL(sysctl_udp_mem);
 
 atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp;
 EXPORT_SYMBOL(udp_memory_allocated);
+DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);
+EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc);
 
 #define MAX_UDP_PORTS 65536
 #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
@@ -2946,6 +2948,8 @@ struct proto udp_prot = {
 	.psock_update_sk_prot	= udp_bpf_update_proto,
 #endif
 	.memory_allocated	= &udp_memory_allocated,
+	.per_cpu_fw_alloc	= &udp_memory_per_cpu_fw_alloc,
+
 	.sysctl_mem		= sysctl_udp_mem,
 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_udp_wmem_min),
 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_udp_rmem_min),
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index cd1cd68adeec..6e08a76ae1e7 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -51,7 +51,10 @@ struct proto 	udplite_prot = {
 	.unhash		   = udp_lib_unhash,
 	.rehash		   = udp_v4_rehash,
 	.get_port	   = udp_v4_get_port,
+
 	.memory_allocated  = &udp_memory_allocated,
+	.per_cpu_fw_alloc  = &udp_memory_per_cpu_fw_alloc,
+
 	.sysctl_mem	   = sysctl_udp_mem,
 	.obj_size	   = sizeof(struct udp_sock),
 	.h.udp_table	   = &udplite_table,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index f37dd4aa91c6..c72448ba6dc9 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2159,7 +2159,10 @@ struct proto tcpv6_prot = {
 	.leave_memory_pressure	= tcp_leave_memory_pressure,
 	.stream_memory_free	= tcp_stream_memory_free,
 	.sockets_allocated	= &tcp_sockets_allocated,
+
 	.memory_allocated	= &tcp_memory_allocated,
+	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
+
 	.memory_pressure	= &tcp_memory_pressure,
 	.orphan_count		= &tcp_orphan_count,
 	.sysctl_mem		= sysctl_tcp_mem,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 55afd7f39c04..be074f07073a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1740,7 +1740,10 @@ struct proto udpv6_prot = {
 #ifdef CONFIG_BPF_SYSCALL
 	.psock_update_sk_prot	= udp_bpf_update_proto,
 #endif
+
 	.memory_allocated	= &udp_memory_allocated,
+	.per_cpu_fw_alloc	= &udp_memory_per_cpu_fw_alloc,
+
 	.sysctl_mem		= sysctl_udp_mem,
 	.sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
 	.sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index fbb700d3f437..b70725856259 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -48,7 +48,10 @@ struct proto udplitev6_prot = {
 	.unhash		   = udp_lib_unhash,
 	.rehash		   = udp_v6_rehash,
 	.get_port	   = udp_v6_get_port,
+
 	.memory_allocated  = &udp_memory_allocated,
+	.per_cpu_fw_alloc  = &udp_memory_per_cpu_fw_alloc,
+
 	.sysctl_mem	   = sysctl_udp_mem,
 	.obj_size	   = sizeof(struct udp6_sock),
 	.h.udp_table	   = &udplite_table,
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 080a630d6902..9563124ac8af 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -3437,7 +3437,10 @@ static struct proto mptcp_prot = {
 	.get_port	= mptcp_get_port,
 	.forward_alloc_get	= mptcp_forward_alloc_get,
 	.sockets_allocated	= &mptcp_sockets_allocated,
+
 	.memory_allocated	= &tcp_memory_allocated,
+	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
+
 	.memory_pressure	= &tcp_memory_pressure,
 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 6d37d2dfb3da..05174acd981a 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -93,6 +93,7 @@ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 
 static unsigned long sctp_memory_pressure;
 static atomic_long_t sctp_memory_allocated;
+static DEFINE_PER_CPU(int, sctp_memory_per_cpu_fw_alloc);
 struct percpu_counter sctp_sockets_allocated;
 
 static void sctp_enter_memory_pressure(struct sock *sk)
@@ -9657,7 +9658,10 @@ struct proto sctp_prot = {
 	.sysctl_wmem =  sysctl_sctp_wmem,
 	.memory_pressure = &sctp_memory_pressure,
 	.enter_memory_pressure = sctp_enter_memory_pressure,
+
 	.memory_allocated = &sctp_memory_allocated,
+	.per_cpu_fw_alloc = &sctp_memory_per_cpu_fw_alloc,
+
 	.sockets_allocated = &sctp_sockets_allocated,
 };
 
@@ -9700,7 +9704,10 @@ struct proto sctpv6_prot = {
 	.sysctl_wmem	= sysctl_sctp_wmem,
 	.memory_pressure = &sctp_memory_pressure,
 	.enter_memory_pressure = sctp_enter_memory_pressure,
+
 	.memory_allocated = &sctp_memory_allocated,
+	.per_cpu_fw_alloc = &sctp_memory_per_cpu_fw_alloc,
+
 	.sockets_allocated = &sctp_sockets_allocated,
 };
 #endif /* IS_ENABLED(CONFIG_IPV6) */
-- 
cgit 


From 3cd3399dd7a84ada85cb839989cdf7310e302c7d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 8 Jun 2022 23:34:09 -0700
Subject: net: implement per-cpu reserves for memory_allocated

We plan keeping sk->sk_forward_alloc as small as possible
in future patches.

This means we are going to call sk_memory_allocated_add()
and sk_memory_allocated_sub() more often.

Implement a per-cpu cache of +1/-1 MB, to reduce number
of changes to sk->sk_prot->memory_allocated, which
would otherwise be cause of false sharing.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h | 38 +++++++++++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 825f8cbf791f..59040fee74e7 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1397,22 +1397,48 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
 	return !!*sk->sk_prot->memory_pressure;
 }
 
+static inline long
+proto_memory_allocated(const struct proto *prot)
+{
+	return max(0L, atomic_long_read(prot->memory_allocated));
+}
+
 static inline long
 sk_memory_allocated(const struct sock *sk)
 {
-	return atomic_long_read(sk->sk_prot->memory_allocated);
+	return proto_memory_allocated(sk->sk_prot);
 }
 
+/* 1 MB per cpu, in page units */
+#define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT))
+
 static inline long
 sk_memory_allocated_add(struct sock *sk, int amt)
 {
-	return atomic_long_add_return(amt, sk->sk_prot->memory_allocated);
+	int local_reserve;
+
+	preempt_disable();
+	local_reserve = __this_cpu_add_return(*sk->sk_prot->per_cpu_fw_alloc, amt);
+	if (local_reserve >= SK_MEMORY_PCPU_RESERVE) {
+		__this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve);
+		atomic_long_add(local_reserve, sk->sk_prot->memory_allocated);
+	}
+	preempt_enable();
+	return sk_memory_allocated(sk);
 }
 
 static inline void
 sk_memory_allocated_sub(struct sock *sk, int amt)
 {
-	atomic_long_sub(amt, sk->sk_prot->memory_allocated);
+	int local_reserve;
+
+	preempt_disable();
+	local_reserve = __this_cpu_sub_return(*sk->sk_prot->per_cpu_fw_alloc, amt);
+	if (local_reserve <= -SK_MEMORY_PCPU_RESERVE) {
+		__this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve);
+		atomic_long_add(local_reserve, sk->sk_prot->memory_allocated);
+	}
+	preempt_enable();
 }
 
 #define SK_ALLOC_PERCPU_COUNTER_BATCH 16
@@ -1441,12 +1467,6 @@ proto_sockets_allocated_sum_positive(struct proto *prot)
 	return percpu_counter_sum_positive(prot->sockets_allocated);
 }
 
-static inline long
-proto_memory_allocated(struct proto *prot)
-{
-	return atomic_long_read(prot->memory_allocated);
-}
-
 static inline bool
 proto_memory_pressure(struct proto *prot)
 {
-- 
cgit 


From 7c80b038d23e1f4c7fcc311f43f83b8c60e7fb80 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 8 Jun 2022 23:34:10 -0700
Subject: net: fix sk_wmem_schedule() and sk_rmem_schedule() errors

If sk->sk_forward_alloc is 150000, and we need to schedule 150001 bytes,
we want to allocate 1 byte more (rounded up to one page),
instead of 150001 :/

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 59040fee74e7..cf288f7e9019 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1575,19 +1575,23 @@ static inline bool sk_has_account(struct sock *sk)
 
 static inline bool sk_wmem_schedule(struct sock *sk, int size)
 {
+	int delta;
+
 	if (!sk_has_account(sk))
 		return true;
-	return size <= sk->sk_forward_alloc ||
-		__sk_mem_schedule(sk, size, SK_MEM_SEND);
+	delta = size - sk->sk_forward_alloc;
+	return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_SEND);
 }
 
 static inline bool
 sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
 {
+	int delta;
+
 	if (!sk_has_account(sk))
 		return true;
-	return size <= sk->sk_forward_alloc ||
-		__sk_mem_schedule(sk, size, SK_MEM_RECV) ||
+	delta = size - sk->sk_forward_alloc;
+	return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) ||
 		skb_pfmemalloc(skb);
 }
 
-- 
cgit 


From 4890b686f4088c90432149bd6de567e621266fa2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 8 Jun 2022 23:34:11 -0700
Subject: net: keep sk->sk_forward_alloc as small as possible

Currently, tcp_memory_allocated can hit tcp_mem[] limits quite fast.

Each TCP socket can forward allocate up to 2 MB of memory, even after
flow became less active.

10,000 sockets can have reserved 20 GB of memory,
and we have no shrinker in place to reclaim that.

Instead of trying to reclaim the extra allocations in some places,
just keep sk->sk_forward_alloc values as small as possible.

This should not impact performance too much now we have per-cpu
reserves: Changes to tcp_memory_allocated should not be too frequent.

For sockets not using SO_RESERVE_MEM:
 - idle sockets (no packets in tx/rx queues) have zero forward alloc.
 - non idle sockets have a forward alloc smaller than one page.

Note:

 - Removal of SK_RECLAIM_CHUNK and SK_RECLAIM_THRESHOLD
   is left to MPTCP maintainers as a follow up.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h           | 29 ++---------------------------
 net/core/datagram.c          |  3 ---
 net/ipv4/tcp.c               |  7 -------
 net/ipv4/tcp_input.c         |  4 ----
 net/ipv4/tcp_timer.c         | 19 ++++---------------
 net/iucv/af_iucv.c           |  2 --
 net/mptcp/protocol.c         |  2 +-
 net/sctp/sm_statefuns.c      |  2 --
 net/sctp/socket.c            |  5 -----
 net/sctp/stream_interleave.c |  2 --
 net/sctp/ulpqueue.c          |  4 ----
 11 files changed, 7 insertions(+), 72 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index cf288f7e9019..0063e8410a4e 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1627,19 +1627,6 @@ static inline void sk_mem_reclaim_final(struct sock *sk)
 	sk_mem_reclaim(sk);
 }
 
-static inline void sk_mem_reclaim_partial(struct sock *sk)
-{
-	int reclaimable;
-
-	if (!sk_has_account(sk))
-		return;
-
-	reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);
-
-	if (reclaimable > (int)PAGE_SIZE)
-		__sk_mem_reclaim(sk, reclaimable - 1);
-}
-
 static inline void sk_mem_charge(struct sock *sk, int size)
 {
 	if (!sk_has_account(sk))
@@ -1647,29 +1634,17 @@ static inline void sk_mem_charge(struct sock *sk, int size)
 	sk->sk_forward_alloc -= size;
 }
 
-/* the following macros control memory reclaiming in sk_mem_uncharge()
+/* the following macros control memory reclaiming in mptcp_rmem_uncharge()
  */
 #define SK_RECLAIM_THRESHOLD	(1 << 21)
 #define SK_RECLAIM_CHUNK	(1 << 20)
 
 static inline void sk_mem_uncharge(struct sock *sk, int size)
 {
-	int reclaimable;
-
 	if (!sk_has_account(sk))
 		return;
 	sk->sk_forward_alloc += size;
-	reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);
-
-	/* Avoid a possible overflow.
-	 * TCP send queues can make this happen, if sk_mem_reclaim()
-	 * is not called and more than 2 GBytes are released at once.
-	 *
-	 * If we reach 2 MBytes, reclaim 1 MBytes right now, there is
-	 * no need to hold that much forward allocation anyway.
-	 */
-	if (unlikely(reclaimable >= SK_RECLAIM_THRESHOLD))
-		__sk_mem_reclaim(sk, SK_RECLAIM_CHUNK);
+	sk_mem_reclaim(sk);
 }
 
 /*
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 50f4faeea76c..35791f86bd1a 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -320,7 +320,6 @@ EXPORT_SYMBOL(skb_recv_datagram);
 void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
 {
 	consume_skb(skb);
-	sk_mem_reclaim_partial(sk);
 }
 EXPORT_SYMBOL(skb_free_datagram);
 
@@ -336,7 +335,6 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
 	slow = lock_sock_fast(sk);
 	sk_peek_offset_bwd(sk, len);
 	skb_orphan(skb);
-	sk_mem_reclaim_partial(sk);
 	unlock_sock_fast(sk, slow);
 
 	/* skb is now orphaned, can be freed outside of locked section */
@@ -396,7 +394,6 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
 				      NULL);
 
 	kfree_skb(skb);
-	sk_mem_reclaim_partial(sk);
 	return err;
 }
 EXPORT_SYMBOL(skb_kill_datagram);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e6bdf8e2c09a..14ebb4ec4a51 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -858,9 +858,6 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
 {
 	struct sk_buff *skb;
 
-	if (unlikely(tcp_under_memory_pressure(sk)))
-		sk_mem_reclaim_partial(sk);
-
 	skb = alloc_skb_fclone(size + MAX_TCP_HEADER, gfp);
 	if (likely(skb)) {
 		bool mem_scheduled;
@@ -2764,8 +2761,6 @@ void __tcp_close(struct sock *sk, long timeout)
 		__kfree_skb(skb);
 	}
 
-	sk_mem_reclaim(sk);
-
 	/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
 	if (sk->sk_state == TCP_CLOSE)
 		goto adjudge_to_death;
@@ -2873,7 +2868,6 @@ adjudge_to_death:
 		}
 	}
 	if (sk->sk_state != TCP_CLOSE) {
-		sk_mem_reclaim(sk);
 		if (tcp_check_oom(sk, 0)) {
 			tcp_set_state(sk, TCP_CLOSE);
 			tcp_send_active_reset(sk, GFP_ATOMIC);
@@ -2951,7 +2945,6 @@ void tcp_write_queue_purge(struct sock *sk)
 	}
 	tcp_rtx_queue_purge(sk);
 	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
-	sk_mem_reclaim(sk);
 	tcp_clear_all_retrans_hints(tcp_sk(sk));
 	tcp_sk(sk)->packets_out = 0;
 	inet_csk(sk)->icsk_backoff = 0;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3fb117022558..fdc7beb81b68 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -805,7 +805,6 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
 			 * restart window, so that we send ACKs quickly.
 			 */
 			tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
-			sk_mem_reclaim(sk);
 		}
 	}
 	icsk->icsk_ack.lrcvtime = now;
@@ -4390,7 +4389,6 @@ void tcp_fin(struct sock *sk)
 	skb_rbtree_purge(&tp->out_of_order_queue);
 	if (tcp_is_sack(tp))
 		tcp_sack_reset(&tp->rx_opt);
-	sk_mem_reclaim(sk);
 
 	if (!sock_flag(sk, SOCK_DEAD)) {
 		sk->sk_state_change(sk);
@@ -5336,7 +5334,6 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
 		tcp_drop_reason(sk, rb_to_skb(node),
 				SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
 		if (!prev || goal <= 0) {
-			sk_mem_reclaim(sk);
 			if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
 			    !tcp_under_memory_pressure(sk))
 				break;
@@ -5383,7 +5380,6 @@ static int tcp_prune_queue(struct sock *sk)
 			     skb_peek(&sk->sk_receive_queue),
 			     NULL,
 			     tp->copied_seq, tp->rcv_nxt);
-	sk_mem_reclaim(sk);
 
 	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
 		return 0;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 20cf4a98c69d..2208755e8efc 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -290,15 +290,13 @@ void tcp_delack_timer_handler(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
-	sk_mem_reclaim_partial(sk);
-
 	if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
 	    !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
-		goto out;
+		return;
 
 	if (time_after(icsk->icsk_ack.timeout, jiffies)) {
 		sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
-		goto out;
+		return;
 	}
 	icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
 
@@ -317,10 +315,6 @@ void tcp_delack_timer_handler(struct sock *sk)
 		tcp_send_ack(sk);
 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS);
 	}
-
-out:
-	if (tcp_under_memory_pressure(sk))
-		sk_mem_reclaim(sk);
 }
 
 
@@ -600,11 +594,11 @@ void tcp_write_timer_handler(struct sock *sk)
 
 	if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
 	    !icsk->icsk_pending)
-		goto out;
+		return;
 
 	if (time_after(icsk->icsk_timeout, jiffies)) {
 		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
-		goto out;
+		return;
 	}
 
 	tcp_mstamp_refresh(tcp_sk(sk));
@@ -626,9 +620,6 @@ void tcp_write_timer_handler(struct sock *sk)
 		tcp_probe_timer(sk);
 		break;
 	}
-
-out:
-	sk_mem_reclaim(sk);
 }
 
 static void tcp_write_timer(struct timer_list *t)
@@ -743,8 +734,6 @@ static void tcp_keepalive_timer (struct timer_list *t)
 		elapsed = keepalive_time_when(tp) - elapsed;
 	}
 
-	sk_mem_reclaim(sk);
-
 resched:
 	inet_csk_reset_keepalive_timer (sk, elapsed);
 	goto out;
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index a0385ddbffcf..498a0c35b7bb 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -278,8 +278,6 @@ static void iucv_sock_destruct(struct sock *sk)
 	skb_queue_purge(&sk->sk_receive_queue);
 	skb_queue_purge(&sk->sk_error_queue);
 
-	sk_mem_reclaim(sk);
-
 	if (!sock_flag(sk, SOCK_DEAD)) {
 		pr_err("Attempt to release alive iucv socket %p\n", sk);
 		return;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 9563124ac8af..e0fb9f96c45c 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -975,7 +975,7 @@ static void __mptcp_mem_reclaim_partial(struct sock *sk)
 	if (reclaimable > (int)PAGE_SIZE)
 		__mptcp_rmem_reclaim(sk, reclaimable - 1);
 
-	sk_mem_reclaim_partial(sk);
+	sk_mem_reclaim(sk);
 }
 
 static void mptcp_mem_reclaim_partial(struct sock *sk)
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 52edee1322fc..f6ee7f4040c1 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -6590,8 +6590,6 @@ static int sctp_eat_data(const struct sctp_association *asoc,
 			pr_debug("%s: under pressure, reneging for tsn:%u\n",
 				 __func__, tsn);
 			deliver = SCTP_CMD_RENEGE;
-		} else {
-			sk_mem_reclaim(sk);
 		}
 	}
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 05174acd981a..171f1a35d205 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1824,9 +1824,6 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
 	if (sctp_wspace(asoc) < (int)msg_len)
 		sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc));
 
-	if (sk_under_memory_pressure(sk))
-		sk_mem_reclaim(sk);
-
 	if (sctp_wspace(asoc) <= 0 || !sk_wmem_schedule(sk, msg_len)) {
 		timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
 		err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len);
@@ -9195,8 +9192,6 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
 			goto do_error;
 		if (signal_pending(current))
 			goto do_interrupted;
-		if (sk_under_memory_pressure(sk))
-			sk_mem_reclaim(sk);
 		if ((int)msg_len <= sctp_wspace(asoc) &&
 		    sk_wmem_schedule(sk, msg_len))
 			break;
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index 6b13f737ebf2..bb22b71df7a3 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -979,8 +979,6 @@ static void sctp_renege_events(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
 
 	if (freed >= needed && sctp_ulpevent_idata(ulpq, chunk, gfp) <= 0)
 		sctp_intl_start_pd(ulpq, gfp);
-
-	sk_mem_reclaim(asoc->base.sk);
 }
 
 static void sctp_intl_stream_abort_pd(struct sctp_ulpq *ulpq, __u16 sid,
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 407fed46931b..0a8510a0c5e6 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -1100,12 +1100,8 @@ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
 		else if (retval == 1)
 			sctp_ulpq_reasm_drain(ulpq);
 	}
-
-	sk_mem_reclaim(asoc->base.sk);
 }
 
-
-
 /* Notify the application if an association is aborted and in
  * partial delivery mode.  Send up any pending received messages.
  */
-- 
cgit 


From 7cbb6681d7e5b88688234ad370e027a9346ff7a9 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Wed, 27 Apr 2022 12:08:04 -0700
Subject: iio: common: cros_ec_sensors: Add label attribute

When sensor location is known, populate iio sysfs "label" attribute:

* "accel-base" : the sensor is in the base of the convertible (2-1)
  device.
* "accel-display" : the sensor is in the lid/display plane of the
  device.
* "accel-camera" : the sensor is in the swivel camera subassembly.

The non-standard |location| attribute is removed, the field |loc| in
cros_ec_sensors_core_state is removed.

It apply to standalone accelerometer as well as IMU (accelerometer +
gyroscope) and sensors where the location is known (light).

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Link: https://lore.kernel.org/r/20220427190804.961697-3-gwendal@chromium.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/accel/cros_ec_accel_legacy.c           |  2 +-
 .../common/cros_ec_sensors/cros_ec_sensors_core.c  | 30 +++++++---------------
 drivers/iio/light/cros_ec_light_prox.c             |  2 --
 drivers/iio/pressure/cros_ec_baro.c                |  2 --
 include/linux/iio/common/cros_ec_sensors_core.h    |  2 --
 5 files changed, 10 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/iio/accel/cros_ec_accel_legacy.c b/drivers/iio/accel/cros_ec_accel_legacy.c
index b6f3471b62dc..1c0171f26e99 100644
--- a/drivers/iio/accel/cros_ec_accel_legacy.c
+++ b/drivers/iio/accel/cros_ec_accel_legacy.c
@@ -230,7 +230,7 @@ static int cros_ec_accel_legacy_probe(struct platform_device *pdev)
 	indio_dev->channels = cros_ec_accel_legacy_channels;
 	indio_dev->num_channels = ARRAY_SIZE(cros_ec_accel_legacy_channels);
 	/* The lid sensor needs to be presented inverted. */
-	if (state->loc == MOTIONSENSE_LOC_LID) {
+	if (!strcmp(indio_dev->label, "accel-display")) {
 		state->sign[CROS_EC_SENSOR_X] = -1;
 		state->sign[CROS_EC_SENSOR_Z] = -1;
 	}
diff --git a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c
index 5976aca48e3b..e5ccedef13a8 100644
--- a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c
+++ b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c
@@ -29,12 +29,6 @@
  */
 #define CROS_EC_FIFO_SIZE (2048 * 2 / 3)
 
-static char *cros_ec_loc[] = {
-	[MOTIONSENSE_LOC_BASE] = "base",
-	[MOTIONSENSE_LOC_LID] = "lid",
-	[MOTIONSENSE_LOC_MAX] = "unknown",
-};
-
 static int cros_ec_get_host_cmd_version_mask(struct cros_ec_device *ec_dev,
 					     u16 cmd_offset, u16 cmd, u32 *mask)
 {
@@ -287,6 +281,8 @@ int cros_ec_sensors_core_init(struct platform_device *pdev,
 	indio_dev->name = pdev->name;
 
 	if (physical_device) {
+		enum motionsensor_location loc;
+
 		state->param.cmd = MOTIONSENSE_CMD_INFO;
 		state->param.info.sensor_num = sensor_platform->sensor_num;
 		ret = cros_ec_motion_send_host_cmd(state, 0);
@@ -295,7 +291,13 @@ int cros_ec_sensors_core_init(struct platform_device *pdev,
 			return ret;
 		}
 		state->type = state->resp->info.type;
-		state->loc = state->resp->info.location;
+		loc = state->resp->info.location;
+		if (loc == MOTIONSENSE_LOC_BASE)
+			indio_dev->label = "accel-base";
+		else if (loc == MOTIONSENSE_LOC_LID)
+			indio_dev->label = "accel-display";
+		else if (loc == MOTIONSENSE_LOC_CAMERA)
+			indio_dev->label = "accel-camera";
 
 		/* Set sign vector, only used for backward compatibility. */
 		memset(state->sign, 1, CROS_EC_SENSOR_MAX_AXIS);
@@ -442,15 +444,6 @@ static ssize_t cros_ec_sensors_id(struct iio_dev *indio_dev,
 	return snprintf(buf, PAGE_SIZE, "%d\n", st->param.info.sensor_num);
 }
 
-static ssize_t cros_ec_sensors_loc(struct iio_dev *indio_dev,
-		uintptr_t private, const struct iio_chan_spec *chan,
-		char *buf)
-{
-	struct cros_ec_sensors_core_state *st = iio_priv(indio_dev);
-
-	return snprintf(buf, PAGE_SIZE, "%s\n", cros_ec_loc[st->loc]);
-}
-
 const struct iio_chan_spec_ext_info cros_ec_sensors_ext_info[] = {
 	{
 		.name = "calibrate",
@@ -462,11 +455,6 @@ const struct iio_chan_spec_ext_info cros_ec_sensors_ext_info[] = {
 		.shared = IIO_SHARED_BY_ALL,
 		.read = cros_ec_sensors_id
 	},
-	{
-		.name = "location",
-		.shared = IIO_SHARED_BY_ALL,
-		.read = cros_ec_sensors_loc
-	},
 	{ },
 };
 EXPORT_SYMBOL_GPL(cros_ec_sensors_ext_info);
diff --git a/drivers/iio/light/cros_ec_light_prox.c b/drivers/iio/light/cros_ec_light_prox.c
index de472f23d1cb..a00a8b3b86cf 100644
--- a/drivers/iio/light/cros_ec_light_prox.c
+++ b/drivers/iio/light/cros_ec_light_prox.c
@@ -188,8 +188,6 @@ static int cros_ec_light_prox_probe(struct platform_device *pdev)
 
 	indio_dev->info = &cros_ec_light_prox_info;
 	state = iio_priv(indio_dev);
-	state->core.type = state->core.resp->info.type;
-	state->core.loc = state->core.resp->info.location;
 	channel = state->channels;
 
 	/* Common part */
diff --git a/drivers/iio/pressure/cros_ec_baro.c b/drivers/iio/pressure/cros_ec_baro.c
index 2f882e109423..1d9d34ae3c0a 100644
--- a/drivers/iio/pressure/cros_ec_baro.c
+++ b/drivers/iio/pressure/cros_ec_baro.c
@@ -145,8 +145,6 @@ static int cros_ec_baro_probe(struct platform_device *pdev)
 
 	indio_dev->info = &cros_ec_baro_info;
 	state = iio_priv(indio_dev);
-	state->core.type = state->core.resp->info.type;
-	state->core.loc = state->core.resp->info.location;
 	channel = state->channels;
 	/* Common part */
 	channel->info_mask_separate = BIT(IIO_CHAN_INFO_RAW);
diff --git a/include/linux/iio/common/cros_ec_sensors_core.h b/include/linux/iio/common/cros_ec_sensors_core.h
index c582e1a14232..a8259c8822f5 100644
--- a/include/linux/iio/common/cros_ec_sensors_core.h
+++ b/include/linux/iio/common/cros_ec_sensors_core.h
@@ -41,7 +41,6 @@ typedef irqreturn_t (*cros_ec_sensors_capture_t)(int irq, void *p);
  * @param:			motion sensor parameters structure
  * @resp:			motion sensor response structure
  * @type:			type of motion sensor
- * @loc:			location where the motion sensor is placed
  * @range_updated:		True if the range of the sensor has been
  *				updated.
  * @curr_range:			If updated, the current range value.
@@ -67,7 +66,6 @@ struct cros_ec_sensors_core_state {
 	struct ec_response_motion_sense *resp;
 
 	enum motionsensor_type type;
-	enum motionsensor_location loc;
 
 	bool range_updated;
 	int curr_range;
-- 
cgit 


From ccd8a9351f7b44bc1c0c8e4d39c0d6593b106fc4 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Fri, 10 Jun 2022 23:30:08 +0900
Subject: can: skb: move can_dropped_invalid_skb() and can_skb_headroom_valid()
 to skb.c

The functions can_dropped_invalid_skb() and can_skb_headroom_valid()
grew a lot over the years to a point which it does not make much sense
to have them defined as static inline in header files. Move those two
functions to the .c counterpart of skb.h.

can_skb_headroom_valid()'s only caller being
can_dropped_invalid_skb(), the declaration is removed from the
header. Only can_dropped_invalid_skb() gets its symbol exported.

While doing so, do a small cleanup: add brackets around the else block
in can_dropped_invalid_skb().

Link: https://lore.kernel.org/all/20220610143009.323579-7-mailhol.vincent@wanadoo.fr
Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Reported-by: kernel test robot <lkp@intel.com>
Acked-by: Max Staudt <max@enpas.org>
Tested-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/skb.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/can/skb.h   | 59 +----------------------------------------------
 2 files changed, 59 insertions(+), 58 deletions(-)

(limited to 'include')

diff --git a/drivers/net/can/dev/skb.c b/drivers/net/can/dev/skb.c
index a4208f125b76..dc9da76c0470 100644
--- a/drivers/net/can/dev/skb.c
+++ b/drivers/net/can/dev/skb.c
@@ -259,3 +259,61 @@ struct sk_buff *alloc_can_err_skb(struct net_device *dev, struct can_frame **cf)
 	return skb;
 }
 EXPORT_SYMBOL_GPL(alloc_can_err_skb);
+
+/* Check for outgoing skbs that have not been created by the CAN subsystem */
+static bool can_skb_headroom_valid(struct net_device *dev, struct sk_buff *skb)
+{
+	/* af_packet creates a headroom of HH_DATA_MOD bytes which is fine */
+	if (WARN_ON_ONCE(skb_headroom(skb) < sizeof(struct can_skb_priv)))
+		return false;
+
+	/* af_packet does not apply CAN skb specific settings */
+	if (skb->ip_summed == CHECKSUM_NONE) {
+		/* init headroom */
+		can_skb_prv(skb)->ifindex = dev->ifindex;
+		can_skb_prv(skb)->skbcnt = 0;
+
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+		/* perform proper loopback on capable devices */
+		if (dev->flags & IFF_ECHO)
+			skb->pkt_type = PACKET_LOOPBACK;
+		else
+			skb->pkt_type = PACKET_HOST;
+
+		skb_reset_mac_header(skb);
+		skb_reset_network_header(skb);
+		skb_reset_transport_header(skb);
+	}
+
+	return true;
+}
+
+/* Drop a given socketbuffer if it does not contain a valid CAN frame. */
+bool can_dropped_invalid_skb(struct net_device *dev, struct sk_buff *skb)
+{
+	const struct canfd_frame *cfd = (struct canfd_frame *)skb->data;
+
+	if (skb->protocol == htons(ETH_P_CAN)) {
+		if (unlikely(skb->len != CAN_MTU ||
+			     cfd->len > CAN_MAX_DLEN))
+			goto inval_skb;
+	} else if (skb->protocol == htons(ETH_P_CANFD)) {
+		if (unlikely(skb->len != CANFD_MTU ||
+			     cfd->len > CANFD_MAX_DLEN))
+			goto inval_skb;
+	} else {
+		goto inval_skb;
+	}
+
+	if (!can_skb_headroom_valid(dev, skb))
+		goto inval_skb;
+
+	return false;
+
+inval_skb:
+	kfree_skb(skb);
+	dev->stats.tx_dropped++;
+	return true;
+}
+EXPORT_SYMBOL_GPL(can_dropped_invalid_skb);
diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h
index fdb22b00674a..182749e858b3 100644
--- a/include/linux/can/skb.h
+++ b/include/linux/can/skb.h
@@ -31,6 +31,7 @@ struct sk_buff *alloc_canfd_skb(struct net_device *dev,
 				struct canfd_frame **cfd);
 struct sk_buff *alloc_can_err_skb(struct net_device *dev,
 				  struct can_frame **cf);
+bool can_dropped_invalid_skb(struct net_device *dev, struct sk_buff *skb);
 
 /*
  * The struct can_skb_priv is used to transport additional information along
@@ -96,64 +97,6 @@ static inline struct sk_buff *can_create_echo_skb(struct sk_buff *skb)
 	return nskb;
 }
 
-/* Check for outgoing skbs that have not been created by the CAN subsystem */
-static inline bool can_skb_headroom_valid(struct net_device *dev,
-					  struct sk_buff *skb)
-{
-	/* af_packet creates a headroom of HH_DATA_MOD bytes which is fine */
-	if (WARN_ON_ONCE(skb_headroom(skb) < sizeof(struct can_skb_priv)))
-		return false;
-
-	/* af_packet does not apply CAN skb specific settings */
-	if (skb->ip_summed == CHECKSUM_NONE) {
-		/* init headroom */
-		can_skb_prv(skb)->ifindex = dev->ifindex;
-		can_skb_prv(skb)->skbcnt = 0;
-
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
-
-		/* perform proper loopback on capable devices */
-		if (dev->flags & IFF_ECHO)
-			skb->pkt_type = PACKET_LOOPBACK;
-		else
-			skb->pkt_type = PACKET_HOST;
-
-		skb_reset_mac_header(skb);
-		skb_reset_network_header(skb);
-		skb_reset_transport_header(skb);
-	}
-
-	return true;
-}
-
-/* Drop a given socketbuffer if it does not contain a valid CAN frame. */
-static inline bool can_dropped_invalid_skb(struct net_device *dev,
-					  struct sk_buff *skb)
-{
-	const struct canfd_frame *cfd = (struct canfd_frame *)skb->data;
-
-	if (skb->protocol == htons(ETH_P_CAN)) {
-		if (unlikely(skb->len != CAN_MTU ||
-			     cfd->len > CAN_MAX_DLEN))
-			goto inval_skb;
-	} else if (skb->protocol == htons(ETH_P_CANFD)) {
-		if (unlikely(skb->len != CANFD_MTU ||
-			     cfd->len > CANFD_MAX_DLEN))
-			goto inval_skb;
-	} else
-		goto inval_skb;
-
-	if (!can_skb_headroom_valid(dev, skb))
-		goto inval_skb;
-
-	return false;
-
-inval_skb:
-	kfree_skb(skb);
-	dev->stats.tx_dropped++;
-	return true;
-}
-
 static inline bool can_is_canfd_skb(const struct sk_buff *skb)
 {
 	/* the CAN specific type of skb is identified by its data length */
-- 
cgit 


From 662a60102c122e44fdaf5c826f7f415eb57d48ad Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Mon, 2 May 2022 16:20:56 +0300
Subject: usb: typec: Separate USB Power Delivery from USB Type-C

Introducing a small device class for USB Power Delivery.
The idea with it is that we do not mix any more USB Power
Delivery information into the USB Type-C connectors only.
This separation will make it possible to register USB Power
Delivery devices also from other places, for example from
USB Type-C Bridges (see USB Type-C Bridge Specification).

The device class will not always deal with only the messages
and objects that were negotiated with the partner, but
instead messages and objects that can be used in the
negotiation. That allows the USB PD devices to be shared and
reconfigured. The ports can decide which objects are to be
advertised to the partner before the contract is negotiated.
It is also possible to allow the user space to make that
decision if needed.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20220502132058.86236-2-heikki.krogerus@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 .../ABI/testing/sysfs-class-usb_power_delivery     | 240 +++++++
 drivers/usb/typec/Makefile                         |   2 +-
 drivers/usb/typec/pd.c                             | 708 +++++++++++++++++++++
 drivers/usb/typec/pd.h                             |  30 +
 include/linux/usb/pd.h                             |  38 ++
 include/linux/usb/typec.h                          |  10 +
 6 files changed, 1027 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/ABI/testing/sysfs-class-usb_power_delivery
 create mode 100644 drivers/usb/typec/pd.c
 create mode 100644 drivers/usb/typec/pd.h

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-class-usb_power_delivery b/Documentation/ABI/testing/sysfs-class-usb_power_delivery
new file mode 100644
index 000000000000..ce2b1b563cb3
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-usb_power_delivery
@@ -0,0 +1,240 @@
+What:		/sys/class/usb_power_delivery
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		Directory for USB Power Delivery devices.
+
+What:		/sys/class/usb_power_delivery/.../revision
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		File showing the USB Power Delivery Specification Revision used
+		in communication.
+
+What:		/sys/class/usb_power_delivery/.../version
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		This is an optional attribute file showing the version of the
+		specific revision of the USB Power Delivery Specification. In
+		most cases the specification version is not known and the file
+		is not available.
+
+What:		/sys/class/usb_power_delivery/.../source-capabilities
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		The source capabilities message "Source_Capabilities" contains a
+		set of Power Data Objects (PDO), each representing a type of
+		power supply. The order of the PDO objects is defined in the USB
+		Power Delivery Specification. Each PDO - power supply - will
+		have its own device, and the PDO device name will start with the
+		object position number as the first character followed by the
+		power supply type name (":" as delimiter).
+
+			/sys/class/usb_power_delivery/.../source_capabilities/<position>:<type>
+
+What:		/sys/class/usb_power_delivery/.../sink-capabilities
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		The sink capability message "Sink_Capabilities" contains a set
+		of Power Data Objects (PDO) just like with source capabilities,
+		but instead of describing the power capabilities, these objects
+		describe the power requirements.
+
+		The order of the objects in the sink capability message is the
+		same as with the source capabilities message.
+
+Fixed Supplies
+
+What:		/sys/class/usb_power_delivery/.../<capability>/<position>:fixed_supply
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		Devices containing the attributes (the bit fields) defined for
+		Fixed Supplies.
+
+		The device "1:fixed_supply" is special. USB Power Delivery
+		Specification dictates that the first PDO (at object position
+		1), and the only mandatory PDO, is always the vSafe5V Fixed
+		Supply Object. vSafe5V Object has additional fields defined for
+		it that the other Fixed Supply Objects do not have and that are
+		related to the USB capabilities rather than power capabilities.
+
+What:		/sys/class/usb_power_delivery/.../<capability>/1:fixed_supply/dual_role_power
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		This file contains boolean value that tells does the device
+		support both source and sink power roles.
+
+What:		/sys/class/usb_power_delivery/.../<capability>/1:fixed_supply/usb_suspend_supported
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		This file shows the value of the USB Suspend Supported bit in
+		vSafe5V Fixed Supply Object. If the bit is set then the device
+		will follow the USB 2.0 and USB 3.2 rules for suspend and
+		resume.
+
+What:		/sys/class/usb_power_delivery/.../<capability>/1:fixed_supply/unconstrained_power
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		This file shows the value of the Unconstrained Power bit in
+		vSafe5V Fixed Supply Object. The bit is set when an external
+		source of power, powerful enough to power the entire system on
+		its own, is available for the device.
+
+What:		/sys/class/usb_power_delivery/.../<capability>/1:fixed_supply/usb_communication_capable
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		This file shows the value of the USB Communication Capable bit in
+		vSafe5V Fixed Supply Object.
+
+What:		/sys/class/usb_power_delivery/.../<capability>/1:fixed_supply/dual_role_data
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		This file shows the value of the Dual-Role Data bit in vSafe5V
+		Fixed Supply Object. Dual role data means ability act as both
+		USB host and USB device.
+
+What:		/sys/class/usb_power_delivery/.../<capability>/1:fixed_supply/unchunked_extended_messages_supported
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		This file shows the value of the Unchunked Extended Messages
+		Supported bit in vSafe5V Fixed Supply Object.
+
+What:		/sys/class/usb_power_delivery/.../<capability>/<position>:fixed_supply/voltage
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		The voltage the supply supports in millivolts.
+
+What:		/sys/class/usb_power_delivery/.../source-capabilities/<position>:fixed_supply/maximum_current
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		Maximum current of the fixed source supply in milliamperes.
+
+What:		/sys/class/usb_power_delivery/.../sink-capabilities/<position>:fixed_supply/operational_current
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		Operational current of the sink in milliamperes.
+
+What:		/sys/class/usb_power_delivery/.../sink-capabilities/<position>:fixed_supply/fast_role_swap_current
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		This file contains the value of the "Fast Role Swap USB Type-C
+		Current" field that tells the current level the sink requires
+		after a Fast Role Swap.
+		0 - Fast Swap not supported"
+		1 - Default USB Power"
+		2 - 1.5A@5V"
+		3 - 3.0A@5V"
+
+Variable Supplies
+
+What:		/sys/class/usb_power_delivery/.../<capability>/<position>:variable_supply
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		Variable Power Supply PDO.
+
+What:		/sys/class/usb_power_delivery/.../<capability>/<position>:variable_supply/maximum_voltage
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		Maximum Voltage in millivolts.
+
+What:		/sys/class/usb_power_delivery/.../<capability>/<position>:variable_supply/minimum_voltage
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		Minimum Voltage in millivolts.
+
+What:		/sys/class/usb_power_delivery/.../source-capabilities/<position>:variable_supply/maximum_current
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		The maximum current in milliamperes that the source can supply
+		at the given Voltage range.
+
+What:		/sys/class/usb_power_delivery/.../sink-capabilities/<position>:variable_supply/operational_current
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		The operational current in milliamperes that the sink requires
+		at the given Voltage range.
+
+Battery Supplies
+
+What:		/sys/class/usb_power_delivery/.../<capability>/<position>:battery
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		Battery PDO.
+
+What:		/sys/class/usb_power_delivery/.../<capability>/<position>:battery/maximum_voltage
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		Maximum Voltage in millivolts.
+
+What:		/sys/class/usb_power_delivery/.../<capability>/<position>:battery/minimum_voltage
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		Minimum Voltage in millivolts.
+
+What:		/sys/class/usb_power_delivery/.../source-capabilities/<position>:battery/maximum_power
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		Maximum allowable Power in milliwatts.
+
+What:		/sys/class/usb_power_delivery/.../sink-capabilities/<position>:battery/operational_power
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		The operational power that the sink requires at the given
+		voltage range.
+
+Standard Power Range (SPR) Programmable Power Supplies
+
+What:		/sys/class/usb_power_delivery/.../<capability>/<position>:programmable_supply
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		Programmable Power Supply (PPS) Augmented PDO (APDO).
+
+What:		/sys/class/usb_power_delivery/.../<capability>/<position>:programmable_supply/maximum_voltage
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		Maximum Voltage in millivolts.
+
+What:		/sys/class/usb_power_delivery/.../<capability>/<position>:programmable_supply/minimum_voltage
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		Minimum Voltage in millivolts.
+
+What:		/sys/class/usb_power_delivery/.../<capability>/<position>:programmable_supply/maximum_current
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		Maximum Current in milliamperes.
+
+What:		/sys/class/usb_power_delivery/.../source-capabilities/<position>:programmable_supply/pps_power_limited
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		The PPS Power Limited bit indicates whether or not the source
+		supply will exceed the rated output power if requested.
diff --git a/drivers/usb/typec/Makefile b/drivers/usb/typec/Makefile
index 43626acc0aaf..2f174cd3e5df 100644
--- a/drivers/usb/typec/Makefile
+++ b/drivers/usb/typec/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_TYPEC)		+= typec.o
-typec-y				:= class.o mux.o bus.o
+typec-y				:= class.o mux.o bus.o pd.o
 typec-$(CONFIG_ACPI)		+= port-mapper.o
 obj-$(CONFIG_TYPEC)		+= altmodes/
 obj-$(CONFIG_TYPEC_TCPM)	+= tcpm/
diff --git a/drivers/usb/typec/pd.c b/drivers/usb/typec/pd.c
new file mode 100644
index 000000000000..dc72005d68db
--- /dev/null
+++ b/drivers/usb/typec/pd.c
@@ -0,0 +1,708 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * USB Power Delivery sysfs entries
+ *
+ * Copyright (C) 2022, Intel Corporation
+ * Author: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+ */
+
+#include <linux/slab.h>
+#include <linux/usb/pd.h>
+
+#include "pd.h"
+
+static DEFINE_IDA(pd_ida);
+
+static struct class pd_class = {
+	.name = "usb_power_delivery",
+	.owner = THIS_MODULE,
+};
+
+#define to_pdo(o) container_of(o, struct pdo, dev)
+
+struct pdo {
+	struct device dev;
+	int object_position;
+	u32 pdo;
+};
+
+static void pdo_release(struct device *dev)
+{
+	kfree(to_pdo(dev));
+}
+
+/* -------------------------------------------------------------------------- */
+/* Fixed Supply */
+
+static ssize_t
+dual_role_power_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%u\n", !!(to_pdo(dev)->pdo & PDO_FIXED_DUAL_ROLE));
+}
+static DEVICE_ATTR_RO(dual_role_power);
+
+static ssize_t
+usb_suspend_supported_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%u\n", !!(to_pdo(dev)->pdo & PDO_FIXED_SUSPEND));
+}
+static DEVICE_ATTR_RO(usb_suspend_supported);
+
+static ssize_t
+unconstrained_power_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%u\n", !!(to_pdo(dev)->pdo & PDO_FIXED_EXTPOWER));
+}
+static DEVICE_ATTR_RO(unconstrained_power);
+
+static ssize_t
+usb_communication_capable_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%u\n", !!(to_pdo(dev)->pdo & PDO_FIXED_USB_COMM));
+}
+static DEVICE_ATTR_RO(usb_communication_capable);
+
+static ssize_t
+dual_role_data_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%u\n", !!(to_pdo(dev)->pdo & PDO_FIXED_DATA_SWAP));
+}
+static DEVICE_ATTR_RO(dual_role_data);
+
+static ssize_t
+unchunked_extended_messages_supported_show(struct device *dev,
+					   struct device_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%u\n", !!(to_pdo(dev)->pdo & PDO_FIXED_UNCHUNK_EXT));
+}
+static DEVICE_ATTR_RO(unchunked_extended_messages_supported);
+
+/*
+ * REVISIT: Peak Current requires access also to the RDO.
+static ssize_t
+peak_current_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	...
+}
+*/
+
+static ssize_t
+fast_role_swap_current_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%u\n", to_pdo(dev)->pdo >> PDO_FIXED_FRS_CURR_SHIFT) & 3;
+}
+static DEVICE_ATTR_RO(fast_role_swap_current);
+
+static ssize_t voltage_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%umV\n", pdo_fixed_voltage(to_pdo(dev)->pdo));
+}
+static DEVICE_ATTR_RO(voltage);
+
+/* Shared with Variable supplies, both source and sink */
+static ssize_t current_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%umA\n", pdo_max_current(to_pdo(dev)->pdo));
+}
+
+/* Shared with Variable type supplies */
+static struct device_attribute maximum_current_attr = {
+	.attr = {
+		.name = "maximum_current",
+		.mode = 0444,
+	},
+	.show = current_show,
+};
+
+static struct device_attribute operational_current_attr = {
+	.attr = {
+		.name = "operational_current",
+		.mode = 0444,
+	},
+	.show = current_show,
+};
+
+static struct attribute *source_fixed_supply_attrs[] = {
+	&dev_attr_dual_role_power.attr,
+	&dev_attr_usb_suspend_supported.attr,
+	&dev_attr_unconstrained_power.attr,
+	&dev_attr_usb_communication_capable.attr,
+	&dev_attr_dual_role_data.attr,
+	&dev_attr_unchunked_extended_messages_supported.attr,
+	/*&dev_attr_peak_current.attr,*/
+	&dev_attr_voltage.attr,
+	&maximum_current_attr.attr,
+	NULL
+};
+
+static umode_t fixed_attr_is_visible(struct kobject *kobj, struct attribute *attr, int n)
+{
+	if (to_pdo(kobj_to_dev(kobj))->object_position &&
+	    /*attr != &dev_attr_peak_current.attr &&*/
+	    attr != &dev_attr_voltage.attr &&
+	    attr != &maximum_current_attr.attr &&
+	    attr != &operational_current_attr.attr)
+		return 0;
+
+	return attr->mode;
+}
+
+static const struct attribute_group source_fixed_supply_group = {
+	.is_visible = fixed_attr_is_visible,
+	.attrs = source_fixed_supply_attrs,
+};
+__ATTRIBUTE_GROUPS(source_fixed_supply);
+
+static struct device_type source_fixed_supply_type = {
+	.name = "pdo",
+	.release = pdo_release,
+	.groups = source_fixed_supply_groups,
+};
+
+static struct attribute *sink_fixed_supply_attrs[] = {
+	&dev_attr_dual_role_power.attr,
+	&dev_attr_usb_suspend_supported.attr,
+	&dev_attr_unconstrained_power.attr,
+	&dev_attr_usb_communication_capable.attr,
+	&dev_attr_dual_role_data.attr,
+	&dev_attr_unchunked_extended_messages_supported.attr,
+	&dev_attr_fast_role_swap_current.attr,
+	&dev_attr_voltage.attr,
+	&operational_current_attr.attr,
+	NULL
+};
+
+static const struct attribute_group sink_fixed_supply_group = {
+	.is_visible = fixed_attr_is_visible,
+	.attrs = sink_fixed_supply_attrs,
+};
+__ATTRIBUTE_GROUPS(sink_fixed_supply);
+
+static struct device_type sink_fixed_supply_type = {
+	.name = "pdo",
+	.release = pdo_release,
+	.groups = sink_fixed_supply_groups,
+};
+
+/* -------------------------------------------------------------------------- */
+/* Variable Supply */
+
+static ssize_t
+maximum_voltage_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%umV\n", pdo_max_voltage(to_pdo(dev)->pdo));
+}
+static DEVICE_ATTR_RO(maximum_voltage);
+
+static ssize_t
+minimum_voltage_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%umV\n", pdo_min_voltage(to_pdo(dev)->pdo));
+}
+static DEVICE_ATTR_RO(minimum_voltage);
+
+static struct attribute *source_variable_supply_attrs[] = {
+	&dev_attr_maximum_voltage.attr,
+	&dev_attr_minimum_voltage.attr,
+	&maximum_current_attr.attr,
+	NULL
+};
+ATTRIBUTE_GROUPS(source_variable_supply);
+
+static struct device_type source_variable_supply_type = {
+	.name = "pdo",
+	.release = pdo_release,
+	.groups = source_variable_supply_groups,
+};
+
+static struct attribute *sink_variable_supply_attrs[] = {
+	&dev_attr_maximum_voltage.attr,
+	&dev_attr_minimum_voltage.attr,
+	&operational_current_attr.attr,
+	NULL
+};
+ATTRIBUTE_GROUPS(sink_variable_supply);
+
+static struct device_type sink_variable_supply_type = {
+	.name = "pdo",
+	.release = pdo_release,
+	.groups = sink_variable_supply_groups,
+};
+
+/* -------------------------------------------------------------------------- */
+/* Battery */
+
+static ssize_t
+maximum_power_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%umW\n", pdo_max_power(to_pdo(dev)->pdo));
+}
+static DEVICE_ATTR_RO(maximum_power);
+
+static ssize_t
+operational_power_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%umW\n", pdo_max_power(to_pdo(dev)->pdo));
+}
+static DEVICE_ATTR_RO(operational_power);
+
+static struct attribute *source_battery_attrs[] = {
+	&dev_attr_maximum_voltage.attr,
+	&dev_attr_minimum_voltage.attr,
+	&dev_attr_maximum_power.attr,
+	NULL
+};
+ATTRIBUTE_GROUPS(source_battery);
+
+static struct device_type source_battery_type = {
+	.name = "pdo",
+	.release = pdo_release,
+	.groups = source_battery_groups,
+};
+
+static struct attribute *sink_battery_attrs[] = {
+	&dev_attr_maximum_voltage.attr,
+	&dev_attr_minimum_voltage.attr,
+	&dev_attr_operational_power.attr,
+	NULL
+};
+ATTRIBUTE_GROUPS(sink_battery);
+
+static struct device_type sink_battery_type = {
+	.name = "pdo",
+	.release = pdo_release,
+	.groups = sink_battery_groups,
+};
+
+/* -------------------------------------------------------------------------- */
+/* Standard Power Range (SPR) Programmable Power Supply (PPS) */
+
+static ssize_t
+pps_power_limited_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%u\n", !!(to_pdo(dev)->pdo & BIT(27)));
+}
+static DEVICE_ATTR_RO(pps_power_limited);
+
+static ssize_t
+pps_max_voltage_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%umV\n", pdo_pps_apdo_max_voltage(to_pdo(dev)->pdo));
+}
+
+static ssize_t
+pps_min_voltage_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%umV\n", pdo_pps_apdo_min_voltage(to_pdo(dev)->pdo));
+}
+
+static ssize_t
+pps_max_current_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%umA\n", pdo_pps_apdo_max_current(to_pdo(dev)->pdo));
+}
+
+static struct device_attribute pps_max_voltage_attr = {
+	.attr = {
+		.name = "maximum_voltage",
+		.mode = 0444,
+	},
+	.show = pps_max_voltage_show,
+};
+
+static struct device_attribute pps_min_voltage_attr = {
+	.attr = {
+		.name = "minimum_voltage",
+		.mode = 0444,
+	},
+	.show = pps_min_voltage_show,
+};
+
+static struct device_attribute pps_max_current_attr = {
+	.attr = {
+		.name = "maximum_current",
+		.mode = 0444,
+	},
+	.show = pps_max_current_show,
+};
+
+static struct attribute *source_pps_attrs[] = {
+	&dev_attr_pps_power_limited.attr,
+	&pps_max_voltage_attr.attr,
+	&pps_min_voltage_attr.attr,
+	&pps_max_current_attr.attr,
+	NULL
+};
+ATTRIBUTE_GROUPS(source_pps);
+
+static struct device_type source_pps_type = {
+	.name = "pdo",
+	.release = pdo_release,
+	.groups = source_pps_groups,
+};
+
+static struct attribute *sink_pps_attrs[] = {
+	&pps_max_voltage_attr.attr,
+	&pps_min_voltage_attr.attr,
+	&pps_max_current_attr.attr,
+	NULL
+};
+ATTRIBUTE_GROUPS(sink_pps);
+
+static struct device_type sink_pps_type = {
+	.name = "pdo",
+	.release = pdo_release,
+	.groups = sink_pps_groups,
+};
+
+/* -------------------------------------------------------------------------- */
+
+static const char * const supply_name[] = {
+	[PDO_TYPE_FIXED] = "fixed_supply",
+	[PDO_TYPE_BATT]  = "battery",
+	[PDO_TYPE_VAR]	 = "variable_supply",
+};
+
+static const char * const apdo_supply_name[] = {
+	[APDO_TYPE_PPS]  = "programmable_supply",
+};
+
+static struct device_type *source_type[] = {
+	[PDO_TYPE_FIXED] = &source_fixed_supply_type,
+	[PDO_TYPE_BATT]  = &source_battery_type,
+	[PDO_TYPE_VAR]   = &source_variable_supply_type,
+};
+
+static struct device_type *source_apdo_type[] = {
+	[APDO_TYPE_PPS]  = &source_pps_type,
+};
+
+static struct device_type *sink_type[] = {
+	[PDO_TYPE_FIXED] = &sink_fixed_supply_type,
+	[PDO_TYPE_BATT]  = &sink_battery_type,
+	[PDO_TYPE_VAR]   = &sink_variable_supply_type,
+};
+
+static struct device_type *sink_apdo_type[] = {
+	[APDO_TYPE_PPS]  = &sink_pps_type,
+};
+
+/* REVISIT: Export when EPR_*_Capabilities need to be supported. */
+static int add_pdo(struct usb_power_delivery_capabilities *cap, u32 pdo, int position)
+{
+	struct device_type *type;
+	const char *name;
+	struct pdo *p;
+	int ret;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	p->pdo = pdo;
+	p->object_position = position;
+
+	if (pdo_type(pdo) == PDO_TYPE_APDO) {
+		/* FIXME: Only PPS supported for now! Skipping others. */
+		if (pdo_apdo_type(pdo) > APDO_TYPE_PPS) {
+			dev_warn(&cap->dev, "Unknown APDO type. PDO 0x%08x\n", pdo);
+			kfree(p);
+			return 0;
+		}
+
+		if (is_source(cap->role))
+			type = source_apdo_type[pdo_apdo_type(pdo)];
+		else
+			type = sink_apdo_type[pdo_apdo_type(pdo)];
+
+		name = apdo_supply_name[pdo_apdo_type(pdo)];
+	} else {
+		if (is_source(cap->role))
+			type = source_type[pdo_type(pdo)];
+		else
+			type = sink_type[pdo_type(pdo)];
+
+		name = supply_name[pdo_type(pdo)];
+	}
+
+	p->dev.parent = &cap->dev;
+	p->dev.type = type;
+	dev_set_name(&p->dev, "%u:%s", position + 1, name);
+
+	ret = device_register(&p->dev);
+	if (ret) {
+		put_device(&p->dev);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int remove_pdo(struct device *dev, void *data)
+{
+	device_unregister(dev);
+	return 0;
+}
+
+/* -------------------------------------------------------------------------- */
+
+static const char * const cap_name[] = {
+	[TYPEC_SINK]    = "sink-capabilities",
+	[TYPEC_SOURCE]  = "source-capabilities",
+};
+
+static void pd_capabilities_release(struct device *dev)
+{
+	kfree(to_usb_power_delivery_capabilities(dev));
+}
+
+static struct device_type pd_capabilities_type = {
+	.name = "capabilities",
+	.release = pd_capabilities_release,
+};
+
+/**
+ * usb_power_delivery_register_capabilities - Register a set of capabilities.
+ * @pd: The USB PD instance that the capabilities belong to.
+ * @desc: Description of the Capablities Message.
+ *
+ * This function registers a Capabilities Message described in @desc. The
+ * capabilities will have their own sub-directory under @pd in sysfs.
+ *
+ * The function returns pointer to struct usb_power_delivery_capabilities, or
+ * ERR_PRT(errno).
+ */
+struct usb_power_delivery_capabilities *
+usb_power_delivery_register_capabilities(struct usb_power_delivery *pd,
+					 struct usb_power_delivery_capabilities_desc *desc)
+{
+	struct usb_power_delivery_capabilities *cap;
+	int ret;
+	int i;
+
+	cap = kzalloc(sizeof(*cap), GFP_KERNEL);
+	if (!cap)
+		return ERR_PTR(-ENOMEM);
+
+	cap->pd = pd;
+	cap->role = desc->role;
+
+	cap->dev.parent = &pd->dev;
+	cap->dev.type = &pd_capabilities_type;
+	dev_set_name(&cap->dev, "%s", cap_name[cap->role]);
+
+	ret = device_register(&cap->dev);
+	if (ret) {
+		put_device(&cap->dev);
+		return ERR_PTR(ret);
+	}
+
+	for (i = 0; i < PDO_MAX_OBJECTS && desc->pdo[i]; i++) {
+		ret = add_pdo(cap, desc->pdo[i], i);
+		if (ret) {
+			usb_power_delivery_unregister_capabilities(cap);
+			return ERR_PTR(ret);
+		}
+	}
+
+	return cap;
+}
+EXPORT_SYMBOL_GPL(usb_power_delivery_register_capabilities);
+
+/**
+ * usb_power_delivery_unregister_capabilities - Unregister a set of capabilities
+ * @cap: The capabilities
+ */
+void usb_power_delivery_unregister_capabilities(struct usb_power_delivery_capabilities *cap)
+{
+	if (!cap)
+		return;
+
+	device_for_each_child(&cap->dev, NULL, remove_pdo);
+	device_unregister(&cap->dev);
+}
+EXPORT_SYMBOL_GPL(usb_power_delivery_unregister_capabilities);
+
+/* -------------------------------------------------------------------------- */
+
+static ssize_t revision_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct usb_power_delivery *pd = to_usb_power_delivery(dev);
+
+	return sysfs_emit(buf, "%u.%u\n", (pd->revision >> 8) & 0xff, (pd->revision >> 4) & 0xf);
+}
+static DEVICE_ATTR_RO(revision);
+
+static ssize_t version_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct usb_power_delivery *pd = to_usb_power_delivery(dev);
+
+	return sysfs_emit(buf, "%u.%u\n", (pd->version >> 8) & 0xff, (pd->version >> 4) & 0xf);
+}
+static DEVICE_ATTR_RO(version);
+
+static struct attribute *pd_attrs[] = {
+	&dev_attr_revision.attr,
+	&dev_attr_version.attr,
+	NULL
+};
+
+static umode_t pd_attr_is_visible(struct kobject *kobj, struct attribute *attr, int n)
+{
+	struct usb_power_delivery *pd = to_usb_power_delivery(kobj_to_dev(kobj));
+
+	if (attr == &dev_attr_version.attr && !pd->version)
+		return 0;
+
+	return attr->mode;
+}
+
+static const struct attribute_group pd_group = {
+	.is_visible = pd_attr_is_visible,
+	.attrs = pd_attrs,
+};
+__ATTRIBUTE_GROUPS(pd);
+
+static void pd_release(struct device *dev)
+{
+	struct usb_power_delivery *pd = to_usb_power_delivery(dev);
+
+	ida_simple_remove(&pd_ida, pd->id);
+	kfree(pd);
+}
+
+static struct device_type pd_type = {
+	.name = "usb_power_delivery",
+	.release = pd_release,
+	.groups = pd_groups,
+};
+
+struct usb_power_delivery *usb_power_delivery_find(const char *name)
+{
+	struct device *dev;
+
+	dev = class_find_device_by_name(&pd_class, name);
+
+	return dev ? to_usb_power_delivery(dev) : NULL;
+}
+
+/**
+ * usb_power_delivery_register - Register USB Power Delivery Support.
+ * @parent: Parent device.
+ * @desc: Description of the USB PD contract.
+ *
+ * This routine can be used to register USB Power Delivery capabilities that a
+ * device or devices can support. These capabilities represent all the
+ * capabilities that can be negotiated with a partner, so not only the Power
+ * Capabilities that are negotiated using the USB PD Capabilities Message.
+ *
+ * The USB Power Delivery Support object that this routine generates can be used
+ * as the parent object for all the actual USB Power Delivery Messages and
+ * objects that can be negotiated with the partner.
+ *
+ * Returns handle to struct usb_power_delivery or ERR_PTR.
+ */
+struct usb_power_delivery *
+usb_power_delivery_register(struct device *parent, struct usb_power_delivery_desc *desc)
+{
+	struct usb_power_delivery *pd;
+	int ret;
+
+	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+	if (!pd)
+		return ERR_PTR(-ENOMEM);
+
+	ret = ida_simple_get(&pd_ida, 0, 0, GFP_KERNEL);
+	if (ret < 0) {
+		kfree(pd);
+		return ERR_PTR(ret);
+	}
+
+	pd->id = ret;
+	pd->revision = desc->revision;
+	pd->version = desc->version;
+
+	pd->dev.parent = parent;
+	pd->dev.type = &pd_type;
+	pd->dev.class = &pd_class;
+	dev_set_name(&pd->dev, "pd%d", pd->id);
+
+	ret = device_register(&pd->dev);
+	if (ret) {
+		put_device(&pd->dev);
+		return ERR_PTR(ret);
+	}
+
+	return pd;
+}
+EXPORT_SYMBOL_GPL(usb_power_delivery_register);
+
+/**
+ * usb_power_delivery_unregister - Unregister USB Power Delivery Support.
+ * @pd: The USB PD contract.
+ */
+void usb_power_delivery_unregister(struct usb_power_delivery *pd)
+{
+	if (IS_ERR_OR_NULL(pd))
+		return;
+
+	device_unregister(&pd->dev);
+}
+EXPORT_SYMBOL_GPL(usb_power_delivery_unregister);
+
+/**
+ * usb_power_delivery_link_device - Link device to its USB PD object.
+ * @pd: The USB PD instance.
+ * @dev: The device.
+ *
+ * This function can be used to create a symlink named "usb_power_delivery" for
+ * @dev that points to @pd.
+ */
+int usb_power_delivery_link_device(struct usb_power_delivery *pd, struct device *dev)
+{
+	int ret;
+
+	if (IS_ERR_OR_NULL(pd) || !dev)
+		return 0;
+
+	ret = sysfs_create_link(&dev->kobj, &pd->dev.kobj, "usb_power_delivery");
+	if (ret)
+		return ret;
+
+	get_device(&pd->dev);
+	get_device(dev);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(usb_power_delivery_link_device);
+
+/**
+ * usb_power_delivery_unlink_device - Unlink device from its USB PD object.
+ * @pd: The USB PD instance.
+ * @dev: The device.
+ *
+ * Remove the symlink that was previously created with pd_link_device().
+ */
+void usb_power_delivery_unlink_device(struct usb_power_delivery *pd, struct device *dev)
+{
+	if (IS_ERR_OR_NULL(pd) || !dev)
+		return;
+
+	sysfs_remove_link(&dev->kobj, "usb_power_delivery");
+	put_device(&pd->dev);
+	put_device(dev);
+}
+EXPORT_SYMBOL_GPL(usb_power_delivery_unlink_device);
+
+/* -------------------------------------------------------------------------- */
+
+int __init usb_power_delivery_init(void)
+{
+	return class_register(&pd_class);
+}
+
+void __exit usb_power_delivery_exit(void)
+{
+	ida_destroy(&pd_ida);
+	class_unregister(&pd_class);
+}
diff --git a/drivers/usb/typec/pd.h b/drivers/usb/typec/pd.h
new file mode 100644
index 000000000000..049a1aad440a
--- /dev/null
+++ b/drivers/usb/typec/pd.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __USB_POWER_DELIVERY__
+#define __USB_POWER_DELIVERY__
+
+#include <linux/device.h>
+#include <linux/usb/typec.h>
+
+struct usb_power_delivery {
+	struct device dev;
+	int id;
+	u16 revision;
+	u16 version;
+};
+
+struct usb_power_delivery_capabilities {
+	struct device dev;
+	struct usb_power_delivery *pd;
+	enum typec_role role;
+};
+
+#define to_usb_power_delivery_capabilities(o) container_of(o, struct usb_power_delivery_capabilities, dev)
+#define to_usb_power_delivery(o) container_of(o, struct usb_power_delivery, dev)
+
+struct usb_power_delivery *usb_power_delivery_find(const char *name);
+
+int usb_power_delivery_init(void);
+void usb_power_delivery_exit(void);
+
+#endif /* __USB_POWER_DELIVERY__ */
diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h
index 96b7ff66f074..c59fb79a42e8 100644
--- a/include/linux/usb/pd.h
+++ b/include/linux/usb/pd.h
@@ -495,4 +495,42 @@ static inline unsigned int rdo_max_power(u32 rdo)
 
 #define PD_P_SNK_STDBY_MW	2500	/* 2500 mW */
 
+#if IS_ENABLED(CONFIG_TYPEC)
+
+struct usb_power_delivery;
+
+/**
+ * usb_power_delivery_desc - USB Power Delivery Descriptor
+ * @revision: USB Power Delivery Specification Revision
+ * @version: USB Power Delivery Specicication Version - optional
+ */
+struct usb_power_delivery_desc {
+	u16 revision;
+	u16 version;
+};
+
+/**
+ * usb_power_delivery_capabilities_desc - Description of USB Power Delivery Capabilities Message
+ * @pdo: The Power Data Objects in the Capability Message
+ * @role: Power role of the capabilities
+ */
+struct usb_power_delivery_capabilities_desc {
+	u32 pdo[PDO_MAX_OBJECTS];
+	enum typec_role role;
+};
+
+struct usb_power_delivery_capabilities *
+usb_power_delivery_register_capabilities(struct usb_power_delivery *pd,
+					 struct usb_power_delivery_capabilities_desc *desc);
+void usb_power_delivery_unregister_capabilities(struct usb_power_delivery_capabilities *cap);
+
+struct usb_power_delivery *usb_power_delivery_register(struct device *parent,
+						       struct usb_power_delivery_desc *desc);
+void usb_power_delivery_unregister(struct usb_power_delivery *pd);
+
+int usb_power_delivery_link_device(struct usb_power_delivery *pd, struct device *dev);
+void usb_power_delivery_unlink_device(struct usb_power_delivery *pd, struct device *dev);
+
+#endif /* CONFIG_TYPEC */
+
 #endif /* __LINUX_USB_PD_H */
diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h
index fdf737d48b3b..45e28d14ae56 100644
--- a/include/linux/usb/typec.h
+++ b/include/linux/usb/typec.h
@@ -52,6 +52,16 @@ enum typec_role {
 	TYPEC_SOURCE,
 };
 
+static inline int is_sink(enum typec_role role)
+{
+	return role == TYPEC_SINK;
+}
+
+static inline int is_source(enum typec_role role)
+{
+	return role == TYPEC_SOURCE;
+}
+
 enum typec_pwr_opmode {
 	TYPEC_PWR_MODE_USB,
 	TYPEC_PWR_MODE_1_5A,
-- 
cgit 


From a7cff92f0635c794e2198a69a7ff4ecfe0decab9 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Mon, 2 May 2022 16:20:57 +0300
Subject: usb: typec: USB Power Delivery helpers for ports and partners

All the USB Type-C Connector Class devices are protected, so
the drivers can not directly access them. This will adds a
few helpers that can be used to link the ports and partners
to the correct USB Power Delivery objects.

For ports a new optional sysfs attribute file is also added
that can be used to select the USB Power Delivery
capabilities that the port will advertise to the partner.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20220502132058.86236-3-heikki.krogerus@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-class-typec |   8 ++
 drivers/usb/typec/class.c                   | 149 ++++++++++++++++++++++++++++
 drivers/usb/typec/class.h                   |   4 +
 include/linux/usb/typec.h                   |  13 +++
 4 files changed, 174 insertions(+)

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-class-typec b/Documentation/ABI/testing/sysfs-class-typec
index 75088ecad202..281b995beb05 100644
--- a/Documentation/ABI/testing/sysfs-class-typec
+++ b/Documentation/ABI/testing/sysfs-class-typec
@@ -141,6 +141,14 @@ Description:
 		- "reverse": CC2 orientation
 		- "unknown": Orientation cannot be determined.
 
+What:		/sys/class/typec/<port>/select_usb_power_delivery
+Date:		May 2022
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		Lists the USB Power Delivery Capabilities that the port can
+		advertise to the partner. The currently used capabilities are in
+		brackets. Selection happens by writing to the file.
+
 USB Type-C partner devices (eg. /sys/class/typec/port0-partner/)
 
 What:		/sys/class/typec/<port>-partner/accessory_mode
diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c
index ee0e520707dd..bbc46b14f99a 100644
--- a/drivers/usb/typec/class.c
+++ b/drivers/usb/typec/class.c
@@ -15,6 +15,7 @@
 
 #include "bus.h"
 #include "class.h"
+#include "pd.h"
 
 static DEFINE_IDA(typec_index_ida);
 
@@ -720,6 +721,39 @@ void typec_partner_set_pd_revision(struct typec_partner *partner, u16 pd_revisio
 }
 EXPORT_SYMBOL_GPL(typec_partner_set_pd_revision);
 
+/**
+ * typec_partner_set_usb_power_delivery - Declare USB Power Delivery Contract.
+ * @partner: The partner device.
+ * @pd: The USB PD instance.
+ *
+ * This routine can be used to declare USB Power Delivery Contract with @partner
+ * by linking @partner to @pd which contains the objects that were used during the
+ * negotiation of the contract.
+ *
+ * If @pd is NULL, the link is removed and the contract with @partner has ended.
+ */
+int typec_partner_set_usb_power_delivery(struct typec_partner *partner,
+					 struct usb_power_delivery *pd)
+{
+	int ret;
+
+	if (IS_ERR_OR_NULL(partner) || partner->pd == pd)
+		return 0;
+
+	if (pd) {
+		ret = usb_power_delivery_link_device(pd, &partner->dev);
+		if (ret)
+			return ret;
+	} else {
+		usb_power_delivery_unlink_device(partner->pd, &partner->dev);
+	}
+
+	partner->pd = pd;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(typec_partner_set_usb_power_delivery);
+
 /**
  * typec_partner_set_num_altmodes - Set the number of available partner altmodes
  * @partner: The partner to be updated.
@@ -1170,6 +1204,104 @@ EXPORT_SYMBOL_GPL(typec_unregister_cable);
 /* ------------------------------------------------------------------------- */
 /* USB Type-C ports */
 
+/**
+ * typec_port_set_usb_power_delivery - Assign USB PD for port.
+ * @port: USB Type-C port.
+ * @pd: USB PD instance.
+ *
+ * This routine can be used to set the USB Power Delivery Capabilities for @port
+ * that it will advertise to the partner.
+ *
+ * If @pd is NULL, the assignment is removed.
+ */
+int typec_port_set_usb_power_delivery(struct typec_port *port, struct usb_power_delivery *pd)
+{
+	int ret;
+
+	if (IS_ERR_OR_NULL(port) || port->pd == pd)
+		return 0;
+
+	if (pd) {
+		ret = usb_power_delivery_link_device(pd, &port->dev);
+		if (ret)
+			return ret;
+	} else {
+		usb_power_delivery_unlink_device(port->pd, &port->dev);
+	}
+
+	port->pd = pd;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(typec_port_set_usb_power_delivery);
+
+static ssize_t select_usb_power_delivery_store(struct device *dev,
+					       struct device_attribute *attr,
+					       const char *buf, size_t size)
+{
+	struct typec_port *port = to_typec_port(dev);
+	struct usb_power_delivery *pd;
+
+	if (!port->ops || !port->ops->pd_set)
+		return -EOPNOTSUPP;
+
+	pd = usb_power_delivery_find(buf);
+	if (!pd)
+		return -EINVAL;
+
+	return port->ops->pd_set(port, pd);
+}
+
+static ssize_t select_usb_power_delivery_show(struct device *dev,
+					      struct device_attribute *attr, char *buf)
+{
+	struct typec_port *port = to_typec_port(dev);
+	struct usb_power_delivery **pds;
+	struct usb_power_delivery *pd;
+	int ret = 0;
+
+	if (!port->ops || !port->ops->pd_get)
+		return -EOPNOTSUPP;
+
+	pds = port->ops->pd_get(port);
+	if (!pds)
+		return 0;
+
+	for (pd = pds[0]; pd; pd++) {
+		if (pd == port->pd)
+			ret += sysfs_emit(buf + ret, "[%s] ", dev_name(&pd->dev));
+		else
+			ret += sysfs_emit(buf + ret, "%s ", dev_name(&pd->dev));
+	}
+
+	buf[ret - 1] = '\n';
+
+	return ret;
+}
+static DEVICE_ATTR_RW(select_usb_power_delivery);
+
+static struct attribute *port_attrs[] = {
+	&dev_attr_select_usb_power_delivery.attr,
+	NULL
+};
+
+static umode_t port_attr_is_visible(struct kobject *kobj, struct attribute *attr, int n)
+{
+	struct typec_port *port = to_typec_port(kobj_to_dev(kobj));
+
+	if (!port->pd || !port->ops || !port->ops->pd_get)
+		return 0;
+	if (!port->ops->pd_set)
+		return 0444;
+
+	return attr->mode;
+}
+
+static const struct attribute_group pd_group = {
+	.is_visible = port_attr_is_visible,
+	.attrs = port_attrs,
+};
+
 static const char * const typec_orientations[] = {
 	[TYPEC_ORIENTATION_NONE]	= "unknown",
 	[TYPEC_ORIENTATION_NORMAL]	= "normal",
@@ -1581,6 +1713,7 @@ static const struct attribute_group typec_group = {
 
 static const struct attribute_group *typec_groups[] = {
 	&typec_group,
+	&pd_group,
 	NULL
 };
 
@@ -2123,6 +2256,13 @@ struct typec_port *typec_register_port(struct device *parent,
 		return ERR_PTR(ret);
 	}
 
+	ret = typec_port_set_usb_power_delivery(port, cap->pd);
+	if (ret) {
+		dev_err(&port->dev, "failed to link pd\n");
+		device_unregister(&port->dev);
+		return ERR_PTR(ret);
+	}
+
 	ret = typec_link_ports(port);
 	if (ret)
 		dev_warn(&port->dev, "failed to create symlinks (%d)\n", ret);
@@ -2141,6 +2281,7 @@ void typec_unregister_port(struct typec_port *port)
 {
 	if (!IS_ERR_OR_NULL(port)) {
 		typec_unlink_ports(port);
+		typec_port_set_usb_power_delivery(port, NULL);
 		device_unregister(&port->dev);
 	}
 }
@@ -2162,8 +2303,15 @@ static int __init typec_init(void)
 	if (ret)
 		goto err_unregister_mux_class;
 
+	ret = usb_power_delivery_init();
+	if (ret)
+		goto err_unregister_class;
+
 	return 0;
 
+err_unregister_class:
+	class_unregister(&typec_class);
+
 err_unregister_mux_class:
 	class_unregister(&typec_mux_class);
 
@@ -2176,6 +2324,7 @@ subsys_initcall(typec_init);
 
 static void __exit typec_exit(void)
 {
+	usb_power_delivery_exit();
 	class_unregister(&typec_class);
 	ida_destroy(&typec_index_ida);
 	bus_unregister(&typec_bus);
diff --git a/drivers/usb/typec/class.h b/drivers/usb/typec/class.h
index 0f1bd6d19d67..b531f9853bc0 100644
--- a/drivers/usb/typec/class.h
+++ b/drivers/usb/typec/class.h
@@ -33,6 +33,8 @@ struct typec_partner {
 	int				num_altmodes;
 	u16				pd_revision; /* 0300H = "3.0" */
 	enum usb_pd_svdm_ver		svdm_version;
+
+	struct usb_power_delivery	*pd;
 };
 
 struct typec_port {
@@ -40,6 +42,8 @@ struct typec_port {
 	struct device			dev;
 	struct ida			mode_ids;
 
+	struct usb_power_delivery	*pd;
+
 	int				prefer_role;
 	enum typec_data_role		data_role;
 	enum typec_role			pwr_role;
diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h
index 45e28d14ae56..7751bedcae5d 100644
--- a/include/linux/usb/typec.h
+++ b/include/linux/usb/typec.h
@@ -22,6 +22,8 @@ struct typec_altmode_ops;
 struct fwnode_handle;
 struct device;
 
+struct usb_power_delivery;
+
 enum typec_port_type {
 	TYPEC_PORT_SRC,
 	TYPEC_PORT_SNK,
@@ -223,6 +225,8 @@ struct typec_partner_desc {
  * @pr_set: Set Power Role
  * @vconn_set: Source VCONN
  * @port_type_set: Set port type
+ * @pd_get: Get available USB Power Delivery Capabilities.
+ * @pd_set: Set USB Power Delivery Capabilities.
  */
 struct typec_operations {
 	int (*try_role)(struct typec_port *port, int role);
@@ -231,6 +235,8 @@ struct typec_operations {
 	int (*vconn_set)(struct typec_port *port, enum typec_role role);
 	int (*port_type_set)(struct typec_port *port,
 			     enum typec_port_type type);
+	struct usb_power_delivery **(*pd_get)(struct typec_port *port);
+	int (*pd_set)(struct typec_port *port, struct usb_power_delivery *pd);
 };
 
 enum usb_pd_svdm_ver {
@@ -250,6 +256,7 @@ enum usb_pd_svdm_ver {
  * @accessory: Supported Accessory Modes
  * @fwnode: Optional fwnode of the port
  * @driver_data: Private pointer for driver specific info
+ * @pd: Optional USB Power Delivery Support
  * @ops: Port operations vector
  *
  * Static capabilities of a single USB Type-C port.
@@ -267,6 +274,8 @@ struct typec_capability {
 	struct fwnode_handle	*fwnode;
 	void			*driver_data;
 
+	struct usb_power_delivery *pd;
+
 	const struct typec_operations	*ops;
 };
 
@@ -318,4 +327,8 @@ void typec_partner_set_svdm_version(struct typec_partner *partner,
 				    enum usb_pd_svdm_ver svdm_version);
 int typec_get_negotiated_svdm_version(struct typec_port *port);
 
+int typec_port_set_usb_power_delivery(struct typec_port *port, struct usb_power_delivery *pd);
+int typec_partner_set_usb_power_delivery(struct typec_partner *partner,
+					 struct usb_power_delivery *pd);
+
 #endif /* __LINUX_USB_TYPEC_H */
-- 
cgit 


From e146caf303493c4f2458173d7f1598b76a9b1396 Mon Sep 17 00:00:00 2001
From: Mathias Nyman <mathias.nyman@linux.intel.com>
Date: Fri, 6 May 2022 19:18:07 +0300
Subject: usb: Avoid extra usb SET_SEL requests when enabling link power
 management

The host needs to tell the device the exit latencies using the SET_SEL
request before device initiated link powermanagement can be enabled.

The exit latency values do not change after enumeration, it's enough
to set them once. So do like Windows 10 and issue the SET_SEL request
once just before setting the configuration.

This is also the sequence described in USB 3.2 specs "9.1.2 Bus
enumeration". SET_SEL is issued once before the Set Configuration
request, and won't be cleared by the Set Configuration,
Set Interface or ClearFeature (STALL) requests.

Only warm reset, hot reset, set Address 0 clears the exit latencies.
See USB 3.2 section 9.4.14 Table 9-10 Device parameters and events

Add udev->lpm_devinit_allow, and set it if SET_SEL was successful.
If not set, then don't try to enable device initiated LPM

We used to issue a SET_SEL request every time lpm is enabled for either
U1 or U2 link states, meaning a SET_SEL was issued twice after every
Set Configuration and Set Interface requests, easily accumulating to
over 15 SET_SEL requets during a USB3 webcam enumeration.

Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
Link: https://lore.kernel.org/r/20220506161807.3369439-1-mathias.nyman@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/hub.c | 60 ++++++++++++++++++--------------------------------
 include/linux/usb.h    |  2 ++
 2 files changed, 23 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index ba406b8d688d..b7f66dcd1fe0 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -3947,7 +3947,7 @@ static const char * const usb3_lpm_names[]  = {
  * This function will fail if the SEL or PEL values for udev are greater than
  * the maximum allowed values for the link state to be enabled.
  */
-static int usb_req_set_sel(struct usb_device *udev, enum usb3_link_state state)
+static int usb_req_set_sel(struct usb_device *udev)
 {
 	struct usb_set_sel_req *sel_values;
 	unsigned long long u1_sel;
@@ -3956,7 +3956,7 @@ static int usb_req_set_sel(struct usb_device *udev, enum usb3_link_state state)
 	unsigned long long u2_pel;
 	int ret;
 
-	if (udev->state != USB_STATE_CONFIGURED)
+	if (!udev->parent || udev->speed < USB_SPEED_SUPER || !udev->lpm_capable)
 		return 0;
 
 	/* Convert SEL and PEL stored in ns to us */
@@ -3973,34 +3973,14 @@ static int usb_req_set_sel(struct usb_device *udev, enum usb3_link_state state)
 	 * latency for the link state, and could start a device-initiated
 	 * U1/U2 when the exit latencies are too high.
 	 */
-	if ((state == USB3_LPM_U1 &&
-				(u1_sel > USB3_LPM_MAX_U1_SEL_PEL ||
-				 u1_pel > USB3_LPM_MAX_U1_SEL_PEL)) ||
-			(state == USB3_LPM_U2 &&
-			 (u2_sel > USB3_LPM_MAX_U2_SEL_PEL ||
-			  u2_pel > USB3_LPM_MAX_U2_SEL_PEL))) {
-		dev_dbg(&udev->dev, "Device-initiated %s disabled due to long SEL %llu us or PEL %llu us\n",
-				usb3_lpm_names[state], u1_sel, u1_pel);
+	if (u1_sel > USB3_LPM_MAX_U1_SEL_PEL ||
+	    u1_pel > USB3_LPM_MAX_U1_SEL_PEL ||
+	    u2_sel > USB3_LPM_MAX_U2_SEL_PEL ||
+	    u2_pel > USB3_LPM_MAX_U2_SEL_PEL) {
+		dev_dbg(&udev->dev, "Device-initiated U1/U2 disabled due to long SEL or PEL\n");
 		return -EINVAL;
 	}
 
-	/*
-	 * If we're enabling device-initiated LPM for one link state,
-	 * but the other link state has a too high SEL or PEL value,
-	 * just set those values to the max in the Set SEL request.
-	 */
-	if (u1_sel > USB3_LPM_MAX_U1_SEL_PEL)
-		u1_sel = USB3_LPM_MAX_U1_SEL_PEL;
-
-	if (u1_pel > USB3_LPM_MAX_U1_SEL_PEL)
-		u1_pel = USB3_LPM_MAX_U1_SEL_PEL;
-
-	if (u2_sel > USB3_LPM_MAX_U2_SEL_PEL)
-		u2_sel = USB3_LPM_MAX_U2_SEL_PEL;
-
-	if (u2_pel > USB3_LPM_MAX_U2_SEL_PEL)
-		u2_pel = USB3_LPM_MAX_U2_SEL_PEL;
-
 	/*
 	 * usb_enable_lpm() can be called as part of a failed device reset,
 	 * which may be initiated by an error path of a mass storage driver.
@@ -4022,6 +4002,10 @@ static int usb_req_set_sel(struct usb_device *udev, enum usb3_link_state state)
 			sel_values, sizeof *(sel_values),
 			USB_CTRL_SET_TIMEOUT);
 	kfree(sel_values);
+
+	if (ret > 0)
+		udev->lpm_devinit_allow = 1;
+
 	return ret;
 }
 
@@ -4137,6 +4121,9 @@ static bool usb_device_may_initiate_lpm(struct usb_device *udev,
 	unsigned int sel;		/* us */
 	int i, j;
 
+	if (!udev->lpm_devinit_allow)
+		return false;
+
 	if (state == USB3_LPM_U1)
 		sel = DIV_ROUND_UP(udev->u1_params.sel, 1000);
 	else if (state == USB3_LPM_U2)
@@ -4185,7 +4172,7 @@ static bool usb_device_may_initiate_lpm(struct usb_device *udev,
 static void usb_enable_link_state(struct usb_hcd *hcd, struct usb_device *udev,
 		enum usb3_link_state state)
 {
-	int timeout, ret;
+	int timeout;
 	__u8 u1_mel = udev->bos->ss_cap->bU1devExitLat;
 	__le16 u2_mel = udev->bos->ss_cap->bU2DevExitLat;
 
@@ -4197,17 +4184,6 @@ static void usb_enable_link_state(struct usb_hcd *hcd, struct usb_device *udev,
 			(state == USB3_LPM_U2 && u2_mel == 0))
 		return;
 
-	/*
-	 * First, let the device know about the exit latencies
-	 * associated with the link state we're about to enable.
-	 */
-	ret = usb_req_set_sel(udev, state);
-	if (ret < 0) {
-		dev_warn(&udev->dev, "Set SEL for device-initiated %s failed.\n",
-				usb3_lpm_names[state]);
-		return;
-	}
-
 	/* We allow the host controller to set the U1/U2 timeout internally
 	 * first, so that it can change its schedule to account for the
 	 * additional latency to send data to a device in a lower power
@@ -4487,6 +4463,11 @@ static int hub_handle_remote_wakeup(struct usb_hub *hub, unsigned int port,
 	return 0;
 }
 
+static int usb_req_set_sel(struct usb_device *udev)
+{
+	return 0;
+}
+
 #endif	/* CONFIG_PM */
 
 /*
@@ -5012,6 +4993,7 @@ hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1,
 			udev->lpm_capable = usb_device_supports_lpm(udev);
 			udev->lpm_disable_count = 1;
 			usb_set_lpm_parameters(udev);
+			usb_req_set_sel(udev);
 		}
 	}
 
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 60bee864d897..f7a9914fc97f 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -584,6 +584,7 @@ struct usb3_lpm_parameters {
  * @authenticated: Crypto authentication passed
  * @wusb: device is Wireless USB
  * @lpm_capable: device supports LPM
+ * @lpm_devinit_allow: Allow USB3 device initiated LPM, exit latency is in range
  * @usb2_hw_lpm_capable: device can perform USB2 hardware LPM
  * @usb2_hw_lpm_besl_capable: device can perform USB2 hardware BESL LPM
  * @usb2_hw_lpm_enabled: USB2 hardware LPM is enabled
@@ -666,6 +667,7 @@ struct usb_device {
 	unsigned authenticated:1;
 	unsigned wusb:1;
 	unsigned lpm_capable:1;
+	unsigned lpm_devinit_allow:1;
 	unsigned usb2_hw_lpm_capable:1;
 	unsigned usb2_hw_lpm_besl_capable:1;
 	unsigned usb2_hw_lpm_enabled:1;
-- 
cgit 


From 219160be496f7f9cd105c5708e37cf22ab4ce0c7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 10 Jun 2022 20:30:16 -0700
Subject: tcp: sk_forced_mem_schedule() optimization

sk_memory_allocated_add() has three callers, and returns
to them @memory_allocated.

sk_forced_mem_schedule() is one of them, and ignores
the returned value.

Change sk_memory_allocated_add() to return void.

Change sock_reserve_memory() and __sk_mem_raise_allocated()
to call sk_memory_allocated().

This removes one cache line miss [1] for RPC workloads,
as first skbs in TCP write queue and receive queue go through
sk_forced_mem_schedule().

[1] Cache line holding tcp_memory_allocated.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 3 +--
 net/core/sock.c    | 9 ++++++---
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 0063e8410a4e..304a5e39d41e 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1412,7 +1412,7 @@ sk_memory_allocated(const struct sock *sk)
 /* 1 MB per cpu, in page units */
 #define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT))
 
-static inline long
+static inline void
 sk_memory_allocated_add(struct sock *sk, int amt)
 {
 	int local_reserve;
@@ -1424,7 +1424,6 @@ sk_memory_allocated_add(struct sock *sk, int amt)
 		atomic_long_add(local_reserve, sk->sk_prot->memory_allocated);
 	}
 	preempt_enable();
-	return sk_memory_allocated(sk);
 }
 
 static inline void
diff --git a/net/core/sock.c b/net/core/sock.c
index 697d5c8e2f0d..92a0296ccb18 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1019,7 +1019,8 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
 		return -ENOMEM;
 
 	/* pre-charge to forward_alloc */
-	allocated = sk_memory_allocated_add(sk, pages);
+	sk_memory_allocated_add(sk, pages);
+	allocated = sk_memory_allocated(sk);
 	/* If the system goes into memory pressure with this
 	 * precharge, give up and return error.
 	 */
@@ -2906,11 +2907,13 @@ EXPORT_SYMBOL(sk_wait_data);
  */
 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
 {
-	struct proto *prot = sk->sk_prot;
-	long allocated = sk_memory_allocated_add(sk, amt);
 	bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
+	struct proto *prot = sk->sk_prot;
 	bool charged = true;
+	long allocated;
 
+	sk_memory_allocated_add(sk, amt);
+	allocated = sk_memory_allocated(sk);
 	if (memcg_charge &&
 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
 						gfp_memcg_charge())))
-- 
cgit 


From c04245328dd7e915e21ac6395ffd218616e22754 Mon Sep 17 00:00:00 2001
From: Yajun Deng <yajun.deng@linux.dev>
Date: Fri, 10 Jun 2022 17:10:17 +0800
Subject: net: make __sys_accept4_file() static

__sys_accept4_file() isn't used outside of the file, make it static.

As the same time, move file_flags and nofile parameters into
__sys_accept4_file().

Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/socket.h |  4 ----
 net/socket.c           | 15 ++++++---------
 2 files changed, 6 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 17311ad9f9af..414b8c7bb8f7 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -428,10 +428,6 @@ extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
 extern int __sys_sendto(int fd, void __user *buff, size_t len,
 			unsigned int flags, struct sockaddr __user *addr,
 			int addr_len);
-extern int __sys_accept4_file(struct file *file, unsigned file_flags,
-			struct sockaddr __user *upeer_sockaddr,
-			 int __user *upeer_addrlen, int flags,
-			 unsigned long nofile);
 extern struct file *do_accept(struct file *file, unsigned file_flags,
 			      struct sockaddr __user *upeer_sockaddr,
 			      int __user *upeer_addrlen, int flags);
diff --git a/net/socket.c b/net/socket.c
index 2bc8773d9dc5..1b6f5e2ebef5 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1878,10 +1878,8 @@ out_fd:
 	return ERR_PTR(err);
 }
 
-int __sys_accept4_file(struct file *file, unsigned file_flags,
-		       struct sockaddr __user *upeer_sockaddr,
-		       int __user *upeer_addrlen, int flags,
-		       unsigned long nofile)
+static int __sys_accept4_file(struct file *file, struct sockaddr __user *upeer_sockaddr,
+			      int __user *upeer_addrlen, int flags)
 {
 	struct file *newfile;
 	int newfd;
@@ -1892,11 +1890,11 @@ int __sys_accept4_file(struct file *file, unsigned file_flags,
 	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
 		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
 
-	newfd = __get_unused_fd_flags(flags, nofile);
+	newfd = get_unused_fd_flags(flags);
 	if (unlikely(newfd < 0))
 		return newfd;
 
-	newfile = do_accept(file, file_flags, upeer_sockaddr, upeer_addrlen,
+	newfile = do_accept(file, 0, upeer_sockaddr, upeer_addrlen,
 			    flags);
 	if (IS_ERR(newfile)) {
 		put_unused_fd(newfd);
@@ -1926,9 +1924,8 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
 
 	f = fdget(fd);
 	if (f.file) {
-		ret = __sys_accept4_file(f.file, 0, upeer_sockaddr,
-						upeer_addrlen, flags,
-						rlimit(RLIMIT_NOFILE));
+		ret = __sys_accept4_file(f.file, upeer_sockaddr,
+					 upeer_addrlen, flags);
 		fdput(f);
 	}
 
-- 
cgit 


From 9f1c8677724a0e6a6ac7a74d2b0192a584df859d Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 2 Jun 2022 12:30:29 +0200
Subject: ASoC: hdmi-codec: Update to modern DAI terminology

As part of retiring the old defines used to specify DAI formats update the
hdmi_codec driver to use the modern names, including the variables in the
struct hdmi_codec_daifmt exported to the DRM drivers.

In updating this I did note that the only use of this information in DRM
drivers is to reject clock provider settings, thinking about what this
hardware is doing I rather suspect that there might not be any hardware
out there which needs the configuration so it may be worth considering
just having hdmi-codec support only clock consumer.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20220602103029.3498791-1-broonie@kernel.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/gpu/drm/bridge/sii902x.c                    |  5 +++--
 drivers/gpu/drm/bridge/synopsys/dw-hdmi-i2s-audio.c |  2 +-
 drivers/gpu/drm/exynos/exynos_hdmi.c                |  8 ++++----
 drivers/gpu/drm/i2c/tda998x_drv.c                   |  6 +++---
 drivers/gpu/drm/sti/sti_hdmi.c                      |  8 ++++----
 include/sound/hdmi-codec.h                          |  4 ++--
 sound/soc/codecs/hdmi-codec.c                       | 18 +++++++++---------
 7 files changed, 26 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/bridge/sii902x.c b/drivers/gpu/drm/bridge/sii902x.c
index 65549fbfdc87..be9736f67542 100644
--- a/drivers/gpu/drm/bridge/sii902x.c
+++ b/drivers/gpu/drm/bridge/sii902x.c
@@ -549,8 +549,9 @@ static int sii902x_audio_hw_params(struct device *dev, void *data,
 	unsigned long mclk_rate;
 	int i, ret;
 
-	if (daifmt->bit_clk_master || daifmt->frame_clk_master) {
-		dev_dbg(dev, "%s: I2S master mode not supported\n", __func__);
+	if (daifmt->bit_clk_provider || daifmt->frame_clk_provider) {
+		dev_dbg(dev, "%s: I2S clock provider mode not supported\n",
+			__func__);
 		return -EINVAL;
 	}
 
diff --git a/drivers/gpu/drm/bridge/synopsys/dw-hdmi-i2s-audio.c b/drivers/gpu/drm/bridge/synopsys/dw-hdmi-i2s-audio.c
index f50b47ac11a8..a2f0860b20bb 100644
--- a/drivers/gpu/drm/bridge/synopsys/dw-hdmi-i2s-audio.c
+++ b/drivers/gpu/drm/bridge/synopsys/dw-hdmi-i2s-audio.c
@@ -45,7 +45,7 @@ static int dw_hdmi_i2s_hw_params(struct device *dev, void *data,
 	u8 inputclkfs = 0;
 
 	/* it cares I2S only */
-	if (fmt->bit_clk_master | fmt->frame_clk_master) {
+	if (fmt->bit_clk_provider | fmt->frame_clk_provider) {
 		dev_err(dev, "unsupported clock settings\n");
 		return -EINVAL;
 	}
diff --git a/drivers/gpu/drm/exynos/exynos_hdmi.c b/drivers/gpu/drm/exynos/exynos_hdmi.c
index 7655142a4651..10b0036f8a2e 100644
--- a/drivers/gpu/drm/exynos/exynos_hdmi.c
+++ b/drivers/gpu/drm/exynos/exynos_hdmi.c
@@ -1594,12 +1594,12 @@ static int hdmi_audio_hw_params(struct device *dev, void *data,
 	struct hdmi_context *hdata = dev_get_drvdata(dev);
 
 	if (daifmt->fmt != HDMI_I2S || daifmt->bit_clk_inv ||
-	    daifmt->frame_clk_inv || daifmt->bit_clk_master ||
-	    daifmt->frame_clk_master) {
+	    daifmt->frame_clk_inv || daifmt->bit_clk_provider ||
+	    daifmt->frame_clk_provider) {
 		dev_err(dev, "%s: Bad flags %d %d %d %d\n", __func__,
 			daifmt->bit_clk_inv, daifmt->frame_clk_inv,
-			daifmt->bit_clk_master,
-			daifmt->frame_clk_master);
+			daifmt->bit_clk_provider,
+			daifmt->frame_clk_provider);
 		return -EINVAL;
 	}
 
diff --git a/drivers/gpu/drm/i2c/tda998x_drv.c b/drivers/gpu/drm/i2c/tda998x_drv.c
index b7ec6c374fbd..c4fadaecbb2d 100644
--- a/drivers/gpu/drm/i2c/tda998x_drv.c
+++ b/drivers/gpu/drm/i2c/tda998x_drv.c
@@ -1095,11 +1095,11 @@ static int tda998x_audio_hw_params(struct device *dev, void *data,
 
 	if (!spdif &&
 	    (daifmt->bit_clk_inv || daifmt->frame_clk_inv ||
-	     daifmt->bit_clk_master || daifmt->frame_clk_master)) {
+	     daifmt->bit_clk_provider || daifmt->frame_clk_provider)) {
 		dev_err(dev, "%s: Bad flags %d %d %d %d\n", __func__,
 			daifmt->bit_clk_inv, daifmt->frame_clk_inv,
-			daifmt->bit_clk_master,
-			daifmt->frame_clk_master);
+			daifmt->bit_clk_provider,
+			daifmt->frame_clk_provider);
 		return -EINVAL;
 	}
 
diff --git a/drivers/gpu/drm/sti/sti_hdmi.c b/drivers/gpu/drm/sti/sti_hdmi.c
index b3fbee7eac11..65c76077e866 100644
--- a/drivers/gpu/drm/sti/sti_hdmi.c
+++ b/drivers/gpu/drm/sti/sti_hdmi.c
@@ -1175,12 +1175,12 @@ static int hdmi_audio_hw_params(struct device *dev,
 	DRM_DEBUG_DRIVER("\n");
 
 	if ((daifmt->fmt != HDMI_I2S) || daifmt->bit_clk_inv ||
-	    daifmt->frame_clk_inv || daifmt->bit_clk_master ||
-	    daifmt->frame_clk_master) {
+	    daifmt->frame_clk_inv || daifmt->bit_clk_provider ||
+	    daifmt->frame_clk_provider) {
 		dev_err(dev, "%s: Bad flags %d %d %d %d\n", __func__,
 			daifmt->bit_clk_inv, daifmt->frame_clk_inv,
-			daifmt->bit_clk_master,
-			daifmt->frame_clk_master);
+			daifmt->bit_clk_provider,
+			daifmt->frame_clk_provider);
 		return -EINVAL;
 	}
 
diff --git a/include/sound/hdmi-codec.h b/include/sound/hdmi-codec.h
index 4fc733c8c570..48ad33aba393 100644
--- a/include/sound/hdmi-codec.h
+++ b/include/sound/hdmi-codec.h
@@ -32,8 +32,8 @@ struct hdmi_codec_daifmt {
 	} fmt;
 	unsigned int bit_clk_inv:1;
 	unsigned int frame_clk_inv:1;
-	unsigned int bit_clk_master:1;
-	unsigned int frame_clk_master:1;
+	unsigned int bit_clk_provider:1;
+	unsigned int frame_clk_provider:1;
 	/* bit_fmt could be standard PCM format or
 	 * IEC958 encoded format. ALSA IEC958 plugin will pass
 	 * IEC958_SUBFRAME format to the underneath driver.
diff --git a/sound/soc/codecs/hdmi-codec.c b/sound/soc/codecs/hdmi-codec.c
index b773466619b2..7d1e351f863a 100644
--- a/sound/soc/codecs/hdmi-codec.c
+++ b/sound/soc/codecs/hdmi-codec.c
@@ -606,18 +606,18 @@ static int hdmi_codec_i2s_set_fmt(struct snd_soc_dai *dai,
 	/* Reset daifmt */
 	memset(cf, 0, sizeof(*cf));
 
-	switch (fmt & SND_SOC_DAIFMT_MASTER_MASK) {
-	case SND_SOC_DAIFMT_CBM_CFM:
-		cf->bit_clk_master = 1;
-		cf->frame_clk_master = 1;
+	switch (fmt & SND_SOC_DAIFMT_CLOCK_PROVIDER_MASK) {
+	case SND_SOC_DAIFMT_CBP_CFP:
+		cf->bit_clk_provider = 1;
+		cf->frame_clk_provider = 1;
 		break;
-	case SND_SOC_DAIFMT_CBS_CFM:
-		cf->frame_clk_master = 1;
+	case SND_SOC_DAIFMT_CBC_CFP:
+		cf->frame_clk_provider = 1;
 		break;
-	case SND_SOC_DAIFMT_CBM_CFS:
-		cf->bit_clk_master = 1;
+	case SND_SOC_DAIFMT_CBP_CFC:
+		cf->bit_clk_provider = 1;
 		break;
-	case SND_SOC_DAIFMT_CBS_CFS:
+	case SND_SOC_DAIFMT_CBC_CFC:
 		break;
 	default:
 		return -EINVAL;
-- 
cgit 


From 0eb6584068642767baab17c2e4385a6b9c029caa Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Fri, 27 May 2022 04:34:36 +0200
Subject: platform/surface: aggregator: Allow is_ssam_device() to be used when
 CONFIG_SURFACE_AGGREGATOR_BUS is disabled

In SSAM subsystem drivers that handle both ACPI and SSAM-native client
devices, we may want to check whether we have a SSAM (native) client
device. Further, we may want to do this even when instantiation thereof
cannot happen due to CONFIG_SURFACE_AGGREGATOR_BUS=n. Currently, doing
so causes an error due to an undefined reference error due to
ssam_device_type being placed in the bus source unit.

Therefore, if CONFIG_SURFACE_AGGREGATOR_BUS is not defined, simply let
is_ssam_device() return false to prevent this error.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Link: https://lore.kernel.org/r/20220527023447.2460025-2-luzmaximilian@gmail.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/surface_aggregator/device.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/linux/surface_aggregator/device.h b/include/linux/surface_aggregator/device.h
index cc257097eb05..62b38b4487eb 100644
--- a/include/linux/surface_aggregator/device.h
+++ b/include/linux/surface_aggregator/device.h
@@ -177,6 +177,8 @@ struct ssam_device_driver {
 	void (*remove)(struct ssam_device *sdev);
 };
 
+#ifdef CONFIG_SURFACE_AGGREGATOR_BUS
+
 extern struct bus_type ssam_bus_type;
 extern const struct device_type ssam_device_type;
 
@@ -193,6 +195,15 @@ static inline bool is_ssam_device(struct device *d)
 	return d->type == &ssam_device_type;
 }
 
+#else /* CONFIG_SURFACE_AGGREGATOR_BUS */
+
+static inline bool is_ssam_device(struct device *d)
+{
+	return false;
+}
+
+#endif /* CONFIG_SURFACE_AGGREGATOR_BUS */
+
 /**
  * to_ssam_device() - Casts the given device to a SSAM client device.
  * @d: The device to cast.
-- 
cgit 


From dc0393c76f378f68961587fd4f32de29fb8f0c79 Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Fri, 27 May 2022 04:34:37 +0200
Subject: platform/surface: aggregator: Allow devices to be marked as
 hot-removed

Some SSAM devices, notably the keyboard cover (keyboard and touchpad) on
the Surface Pro 8, can be hot-removed. When this occurs, communication
with the device may fail and time out. This timeout can unnecessarily
block and slow down device removal and even cause issues when the
devices are detached and re-attached quickly. Thus, communication should
generally be avoided once hot-removal is detected.

While we already remove a device as soon as we detect its (hot-)removal,
the corresponding device driver may still attempt to communicate with
the device during teardown. This is especially critical as communication
failure may also extend to disabling of events, which is typically done
at that stage.

Add a flag to allow marking devices as hot-removed. This can then be
used during client driver teardown to check if any communication
attempts should be avoided.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Link: https://lore.kernel.org/r/20220527023447.2460025-3-luzmaximilian@gmail.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/surface_aggregator/device.h | 48 +++++++++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/surface_aggregator/device.h b/include/linux/surface_aggregator/device.h
index 62b38b4487eb..6df7c8d4e50e 100644
--- a/include/linux/surface_aggregator/device.h
+++ b/include/linux/surface_aggregator/device.h
@@ -148,17 +148,30 @@ struct ssam_device_uid {
 #define SSAM_SDEV(cat, tid, iid, fun) \
 	SSAM_DEVICE(SSAM_DOMAIN_SERIALHUB, SSAM_SSH_TC_##cat, tid, iid, fun)
 
+/*
+ * enum ssam_device_flags - Flags for SSAM client devices.
+ * @SSAM_DEVICE_HOT_REMOVED_BIT:
+ *	The device has been hot-removed. Further communication with it may time
+ *	out and should be avoided.
+ */
+enum ssam_device_flags {
+	SSAM_DEVICE_HOT_REMOVED_BIT = 0,
+};
+
 /**
  * struct ssam_device - SSAM client device.
- * @dev:  Driver model representation of the device.
- * @ctrl: SSAM controller managing this device.
- * @uid:  UID identifying the device.
+ * @dev:   Driver model representation of the device.
+ * @ctrl:  SSAM controller managing this device.
+ * @uid:   UID identifying the device.
+ * @flags: Device state flags, see &enum ssam_device_flags.
  */
 struct ssam_device {
 	struct device dev;
 	struct ssam_controller *ctrl;
 
 	struct ssam_device_uid uid;
+
+	unsigned long flags;
 };
 
 /**
@@ -251,6 +264,35 @@ struct ssam_device *ssam_device_alloc(struct ssam_controller *ctrl,
 int ssam_device_add(struct ssam_device *sdev);
 void ssam_device_remove(struct ssam_device *sdev);
 
+/**
+ * ssam_device_mark_hot_removed() - Mark the given device as hot-removed.
+ * @sdev: The device to mark as hot-removed.
+ *
+ * Mark the device as having been hot-removed. This signals drivers using the
+ * device that communication with the device should be avoided and may lead to
+ * timeouts.
+ */
+static inline void ssam_device_mark_hot_removed(struct ssam_device *sdev)
+{
+	dev_dbg(&sdev->dev, "marking device as hot-removed\n");
+	set_bit(SSAM_DEVICE_HOT_REMOVED_BIT, &sdev->flags);
+}
+
+/**
+ * ssam_device_is_hot_removed() - Check if the given device has been
+ * hot-removed.
+ * @sdev: The device to check.
+ *
+ * Checks if the given device has been marked as hot-removed. See
+ * ssam_device_mark_hot_removed() for more details.
+ *
+ * Return: Returns ``true`` if the device has been marked as hot-removed.
+ */
+static inline bool ssam_device_is_hot_removed(struct ssam_device *sdev)
+{
+	return test_bit(SSAM_DEVICE_HOT_REMOVED_BIT, &sdev->flags);
+}
+
 /**
  * ssam_device_get() - Increment reference count of SSAM client device.
  * @sdev: The device to increment the reference count of.
-- 
cgit 


From 5c1e88b98c60e4074796e9a05d3c674479ab1919 Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Fri, 27 May 2022 04:34:38 +0200
Subject: platform/surface: aggregator: Allow notifiers to avoid communication
 on unregistering

When SSAM client devices have been (physically) hot-removed,
communication attempts with those devices may fail and time out. This
can even extend to event notifiers, due to which timeouts may occur
during device removal, slowing down that process.

Add a parameter to the notifier unregister function that allows skipping
communication with the EC to prevent this. Furthermore, add wrappers for
registering and unregistering notifiers belonging to SSAM client devices
that automatically check if the device has been marked as hot-removed
and communication should be avoided.

Note that non-SSAM client devices can generally not be hot-removed, so
also add a convenience wrapper for those, defaulting to allow
communication.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Link: https://lore.kernel.org/r/20220527023447.2460025-4-luzmaximilian@gmail.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 .../driver-api/surface_aggregator/client.rst       |  6 +-
 drivers/platform/surface/aggregator/controller.c   | 53 +++++++++++------
 include/linux/surface_aggregator/controller.h      | 24 +++++++-
 include/linux/surface_aggregator/device.h          | 66 ++++++++++++++++++++++
 4 files changed, 128 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/surface_aggregator/client.rst b/Documentation/driver-api/surface_aggregator/client.rst
index e519d374c378..27f95abdbe99 100644
--- a/Documentation/driver-api/surface_aggregator/client.rst
+++ b/Documentation/driver-api/surface_aggregator/client.rst
@@ -17,6 +17,8 @@
 .. |SSAM_DEVICE| replace:: :c:func:`SSAM_DEVICE`
 .. |ssam_notifier_register| replace:: :c:func:`ssam_notifier_register`
 .. |ssam_notifier_unregister| replace:: :c:func:`ssam_notifier_unregister`
+.. |ssam_device_notifier_register| replace:: :c:func:`ssam_device_notifier_register`
+.. |ssam_device_notifier_unregister| replace:: :c:func:`ssam_device_notifier_unregister`
 .. |ssam_request_sync| replace:: :c:func:`ssam_request_sync`
 .. |ssam_event_mask| replace:: :c:type:`enum ssam_event_mask <ssam_event_mask>`
 
@@ -312,7 +314,9 @@ Handling Events
 To receive events from the SAM EC, an event notifier must be registered for
 the desired event via |ssam_notifier_register|. The notifier must be
 unregistered via |ssam_notifier_unregister| once it is not required any
-more.
+more. For |ssam_device| type clients, the |ssam_device_notifier_register| and
+|ssam_device_notifier_unregister| wrappers should be preferred as they properly
+handle hot-removal of client devices.
 
 Event notifiers are registered by providing (at minimum) a callback to call
 in case an event has been received, the registry specifying how the event
diff --git a/drivers/platform/surface/aggregator/controller.c b/drivers/platform/surface/aggregator/controller.c
index b8c377b3f932..6de834b52b63 100644
--- a/drivers/platform/surface/aggregator/controller.c
+++ b/drivers/platform/surface/aggregator/controller.c
@@ -2199,16 +2199,26 @@ static int ssam_nf_refcount_enable(struct ssam_controller *ctrl,
 }
 
 /**
- * ssam_nf_refcount_disable_free() - Disable event for reference count entry if it is
- * no longer in use and free the corresponding entry.
+ * ssam_nf_refcount_disable_free() - Disable event for reference count entry if
+ * it is no longer in use and free the corresponding entry.
  * @ctrl:  The controller to disable the event on.
  * @entry: The reference count entry for the event to be disabled.
  * @flags: The flags used for enabling the event on the EC.
+ * @ec:    Flag specifying if the event should actually be disabled on the EC.
  *
- * If the reference count equals zero, i.e. the event is no longer requested by
- * any client, the event will be disabled and the corresponding reference count
- * entry freed. The reference count entry must not be used any more after a
- * call to this function.
+ * If ``ec`` equals ``true`` and the reference count equals zero (i.e. the
+ * event is no longer requested by any client), the specified event will be
+ * disabled on the EC via the corresponding request.
+ *
+ * If ``ec`` equals ``false``, no request will be sent to the EC and the event
+ * can be considered in a detached state (i.e. no longer used but still
+ * enabled). Disabling an event via this method may be required for
+ * hot-removable devices, where event disable requests may time out after the
+ * device has been physically removed.
+ *
+ * In both cases, if the reference count equals zero, the corresponding
+ * reference count entry will be freed. The reference count entry must not be
+ * used any more after a call to this function.
  *
  * Also checks if the flags used for disabling the event match the flags used
  * for enabling the event and warns if they do not (regardless of reference
@@ -2223,7 +2233,7 @@ static int ssam_nf_refcount_enable(struct ssam_controller *ctrl,
  * returns the status of the event-enable EC command.
  */
 static int ssam_nf_refcount_disable_free(struct ssam_controller *ctrl,
-					 struct ssam_nf_refcount_entry *entry, u8 flags)
+					 struct ssam_nf_refcount_entry *entry, u8 flags, bool ec)
 {
 	const struct ssam_event_registry reg = entry->key.reg;
 	const struct ssam_event_id id = entry->key.id;
@@ -2232,8 +2242,9 @@ static int ssam_nf_refcount_disable_free(struct ssam_controller *ctrl,
 
 	lockdep_assert_held(&nf->lock);
 
-	ssam_dbg(ctrl, "disabling event (reg: %#04x, tc: %#04x, iid: %#04x, rc: %d)\n",
-		 reg.target_category, id.target_category, id.instance, entry->refcount);
+	ssam_dbg(ctrl, "%s event (reg: %#04x, tc: %#04x, iid: %#04x, rc: %d)\n",
+		 ec ? "disabling" : "detaching", reg.target_category, id.target_category,
+		 id.instance, entry->refcount);
 
 	if (entry->flags != flags) {
 		ssam_warn(ctrl,
@@ -2242,7 +2253,7 @@ static int ssam_nf_refcount_disable_free(struct ssam_controller *ctrl,
 			  id.instance);
 	}
 
-	if (entry->refcount == 0) {
+	if (ec && entry->refcount == 0) {
 		status = ssam_ssh_event_disable(ctrl, reg, id, flags);
 		kfree(entry);
 	}
@@ -2322,20 +2333,26 @@ int ssam_notifier_register(struct ssam_controller *ctrl, struct ssam_event_notif
 EXPORT_SYMBOL_GPL(ssam_notifier_register);
 
 /**
- * ssam_notifier_unregister() - Unregister an event notifier.
- * @ctrl: The controller the notifier has been registered on.
- * @n:    The event notifier to unregister.
+ * __ssam_notifier_unregister() - Unregister an event notifier.
+ * @ctrl:    The controller the notifier has been registered on.
+ * @n:       The event notifier to unregister.
+ * @disable: Whether to disable the corresponding event on the EC.
  *
  * Unregister an event notifier. Decrement the usage counter of the associated
  * SAM event if the notifier is not marked as an observer. If the usage counter
- * reaches zero, the event will be disabled.
+ * reaches zero and ``disable`` equals ``true``, the event will be disabled.
+ *
+ * Useful for hot-removable devices, where communication may fail once the
+ * device has been physically removed. In that case, specifying ``disable`` as
+ * ``false`` avoids communication with the EC.
  *
  * Return: Returns zero on success, %-ENOENT if the given notifier block has
  * not been registered on the controller. If the given notifier block was the
  * last one associated with its specific event, returns the status of the
  * event-disable EC-command.
  */
-int ssam_notifier_unregister(struct ssam_controller *ctrl, struct ssam_event_notifier *n)
+int __ssam_notifier_unregister(struct ssam_controller *ctrl, struct ssam_event_notifier *n,
+			       bool disable)
 {
 	u16 rqid = ssh_tc_to_rqid(n->event.id.target_category);
 	struct ssam_nf_refcount_entry *entry;
@@ -2373,7 +2390,7 @@ int ssam_notifier_unregister(struct ssam_controller *ctrl, struct ssam_event_not
 			goto remove;
 		}
 
-		status = ssam_nf_refcount_disable_free(ctrl, entry, n->event.flags);
+		status = ssam_nf_refcount_disable_free(ctrl, entry, n->event.flags, disable);
 	}
 
 remove:
@@ -2383,7 +2400,7 @@ remove:
 
 	return status;
 }
-EXPORT_SYMBOL_GPL(ssam_notifier_unregister);
+EXPORT_SYMBOL_GPL(__ssam_notifier_unregister);
 
 /**
  * ssam_controller_event_enable() - Enable the specified event.
@@ -2477,7 +2494,7 @@ int ssam_controller_event_disable(struct ssam_controller *ctrl,
 		return -ENOENT;
 	}
 
-	status = ssam_nf_refcount_disable_free(ctrl, entry, flags);
+	status = ssam_nf_refcount_disable_free(ctrl, entry, flags, true);
 
 	mutex_unlock(&nf->lock);
 	return status;
diff --git a/include/linux/surface_aggregator/controller.h b/include/linux/surface_aggregator/controller.h
index 74bfdffaf7b0..50a2b4926c06 100644
--- a/include/linux/surface_aggregator/controller.h
+++ b/include/linux/surface_aggregator/controller.h
@@ -835,8 +835,28 @@ struct ssam_event_notifier {
 int ssam_notifier_register(struct ssam_controller *ctrl,
 			   struct ssam_event_notifier *n);
 
-int ssam_notifier_unregister(struct ssam_controller *ctrl,
-			     struct ssam_event_notifier *n);
+int __ssam_notifier_unregister(struct ssam_controller *ctrl,
+			       struct ssam_event_notifier *n, bool disable);
+
+/**
+ * ssam_notifier_unregister() - Unregister an event notifier.
+ * @ctrl:    The controller the notifier has been registered on.
+ * @n:       The event notifier to unregister.
+ *
+ * Unregister an event notifier. Decrement the usage counter of the associated
+ * SAM event if the notifier is not marked as an observer. If the usage counter
+ * reaches zero, the event will be disabled.
+ *
+ * Return: Returns zero on success, %-ENOENT if the given notifier block has
+ * not been registered on the controller. If the given notifier block was the
+ * last one associated with its specific event, returns the status of the
+ * event-disable EC-command.
+ */
+static inline int ssam_notifier_unregister(struct ssam_controller *ctrl,
+					   struct ssam_event_notifier *n)
+{
+	return __ssam_notifier_unregister(ctrl, n, true);
+}
 
 int ssam_controller_event_enable(struct ssam_controller *ctrl,
 				 struct ssam_event_registry reg,
diff --git a/include/linux/surface_aggregator/device.h b/include/linux/surface_aggregator/device.h
index 6df7c8d4e50e..c418f7f2732d 100644
--- a/include/linux/surface_aggregator/device.h
+++ b/include/linux/surface_aggregator/device.h
@@ -483,4 +483,70 @@ static inline void ssam_remove_clients(struct device *dev) {}
 				    sdev->uid.instance, ret);		\
 	}
 
+
+/* -- Helpers for client-device notifiers. ---------------------------------- */
+
+/**
+ * ssam_device_notifier_register() - Register an event notifier for the
+ * specified client device.
+ * @sdev: The device the notifier should be registered on.
+ * @n:    The event notifier to register.
+ *
+ * Register an event notifier. Increment the usage counter of the associated
+ * SAM event if the notifier is not marked as an observer. If the event is not
+ * marked as an observer and is currently not enabled, it will be enabled
+ * during this call. If the notifier is marked as an observer, no attempt will
+ * be made at enabling any event and no reference count will be modified.
+ *
+ * Notifiers marked as observers do not need to be associated with one specific
+ * event, i.e. as long as no event matching is performed, only the event target
+ * category needs to be set.
+ *
+ * Return: Returns zero on success, %-ENOSPC if there have already been
+ * %INT_MAX notifiers for the event ID/type associated with the notifier block
+ * registered, %-ENOMEM if the corresponding event entry could not be
+ * allocated, %-ENODEV if the device is marked as hot-removed. If this is the
+ * first time that a notifier block is registered for the specific associated
+ * event, returns the status of the event-enable EC-command.
+ */
+static inline int ssam_device_notifier_register(struct ssam_device *sdev,
+						struct ssam_event_notifier *n)
+{
+	/*
+	 * Note that this check does not provide any guarantees whatsoever as
+	 * hot-removal could happen at any point and we can't protect against
+	 * it. Nevertheless, if we can detect hot-removal, bail early to avoid
+	 * communication timeouts.
+	 */
+	if (ssam_device_is_hot_removed(sdev))
+		return -ENODEV;
+
+	return ssam_notifier_register(sdev->ctrl, n);
+}
+
+/**
+ * ssam_device_notifier_unregister() - Unregister an event notifier for the
+ * specified client device.
+ * @sdev: The device the notifier has been registered on.
+ * @n:    The event notifier to unregister.
+ *
+ * Unregister an event notifier. Decrement the usage counter of the associated
+ * SAM event if the notifier is not marked as an observer. If the usage counter
+ * reaches zero, the event will be disabled.
+ *
+ * In case the device has been marked as hot-removed, the event will not be
+ * disabled on the EC, as in those cases any attempt at doing so may time out.
+ *
+ * Return: Returns zero on success, %-ENOENT if the given notifier block has
+ * not been registered on the controller. If the given notifier block was the
+ * last one associated with its specific event, returns the status of the
+ * event-disable EC-command.
+ */
+static inline int ssam_device_notifier_unregister(struct ssam_device *sdev,
+						  struct ssam_event_notifier *n)
+{
+	return __ssam_notifier_unregister(sdev->ctrl, n,
+					  !ssam_device_is_hot_removed(sdev));
+}
+
 #endif /* _LINUX_SURFACE_AGGREGATOR_DEVICE_H */
-- 
cgit 


From 25e2ca7301bd3ca5a63a6be41d729eb42202bc21 Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Fri, 27 May 2022 04:34:43 +0200
Subject: platform/surface: aggregator: Add comment for KIP subsystem category

The KIP subsystem (full name unknown, abbreviation has been obtained
through reverse engineering) handles detachable peripherals such as the
keyboard cover on the Surface Pro X and Surface Pro 8.

It is currently not entirely clear what this subsystem entails, but at
the very least it provides event notifications for when the keyboard
cover on the Surface Pro X and Surface Pro 8 have been detached or
re-attached, as well as the state that the keyboard cover is currently
in (e.g. folded-back, folded laptop-like, closed, etc.).

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Link: https://lore.kernel.org/r/20220527023447.2460025-9-luzmaximilian@gmail.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/surface_aggregator/serial_hub.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/surface_aggregator/serial_hub.h b/include/linux/surface_aggregator/serial_hub.h
index c3de43edcffa..26b95ec12733 100644
--- a/include/linux/surface_aggregator/serial_hub.h
+++ b/include/linux/surface_aggregator/serial_hub.h
@@ -306,7 +306,7 @@ enum ssam_ssh_tc {
 	SSAM_SSH_TC_LPC = 0x0b,
 	SSAM_SSH_TC_TCL = 0x0c,
 	SSAM_SSH_TC_SFL = 0x0d,
-	SSAM_SSH_TC_KIP = 0x0e,
+	SSAM_SSH_TC_KIP = 0x0e,	/* Manages detachable peripherals (Pro X/8 keyboard cover) */
 	SSAM_SSH_TC_EXT = 0x0f,
 	SSAM_SSH_TC_BLD = 0x10,
 	SSAM_SSH_TC_BAS = 0x11,	/* Detachment system (Surface Book 2/3). */
-- 
cgit 


From 546093206ba16623c18e344630dbfdd71a4327e0 Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng <xiujianfeng@huawei.com>
Date: Sat, 11 Jun 2022 17:23:04 +0800
Subject: audit: make is_audit_feature_set() static

Currently nobody use is_audit_feature_set() outside this file, so make
it static.

Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/audit.h | 2 --
 kernel/audit.c        | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index cece70231138..00f7a80f1a3e 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -119,8 +119,6 @@ enum audit_nfcfgop {
 	AUDIT_NFT_OP_INVALID,
 };
 
-extern int is_audit_feature_set(int which);
-
 extern int __init audit_register_class(int class, unsigned *list);
 extern int audit_classify_syscall(int abi, unsigned syscall);
 extern int audit_classify_arch(int arch);
diff --git a/kernel/audit.c b/kernel/audit.c
index 0749211d5552..a75978ae38ad 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1100,7 +1100,7 @@ static inline void audit_log_user_recv_msg(struct audit_buffer **ab,
 	audit_log_common_recv_msg(NULL, ab, msg_type);
 }
 
-int is_audit_feature_set(int i)
+static int is_audit_feature_set(int i)
 {
 	return af.features & AUDIT_FEATURE_TO_MASK(i);
 }
-- 
cgit 


From 795e10b450a88b2943a241a5eaa6e86ae4f47694 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Tue, 7 Jun 2022 15:47:43 +0300
Subject: net/mlx5: Introduce header-modify-pattern ICM properties

Added new fields for device memory capabilities, in order to
support creation of ICM memory for modify header patterns.

Signed-off-by: Erez Shitrit <erezsh@mellanox.com>
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Acked-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index fd7d083a34d3..789d64400744 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1086,11 +1086,14 @@ struct mlx5_ifc_device_mem_cap_bits {
 	u8         log_sw_icm_alloc_granularity[0x6];
 	u8         log_steering_sw_icm_size[0x8];
 
-	u8         reserved_at_120[0x20];
+	u8         reserved_at_120[0x18];
+	u8         log_header_modify_pattern_sw_icm_size[0x8];
 
 	u8         header_modify_sw_icm_start_address[0x40];
 
-	u8         reserved_at_180[0x80];
+	u8         reserved_at_180[0x40];
+
+	u8         header_modify_pattern_sw_icm_start_address[0x40];
 
 	u8         memic_operations[0x20];
 
-- 
cgit 


From 667658364b2056f344a2769280b939a5e45610be Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Tue, 7 Jun 2022 15:47:44 +0300
Subject: net/mlx5: Manage ICM of type modify-header pattern

Added support for managing new type of ICM for devices that
support sw_owner_v2.

Signed-off-by: Erez Shitrit <erezsh@mellanox.com>
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Acked-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c | 42 ++++++++++++++++++++++++
 include/linux/mlx5/driver.h                      |  1 +
 2 files changed, 43 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c
index 3d5e57ff558c..7e02cbe8c3b9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c
@@ -12,13 +12,16 @@ struct mlx5_dm {
 	spinlock_t lock;
 	unsigned long *steering_sw_icm_alloc_blocks;
 	unsigned long *header_modify_sw_icm_alloc_blocks;
+	unsigned long *header_modify_pattern_sw_icm_alloc_blocks;
 };
 
 struct mlx5_dm *mlx5_dm_create(struct mlx5_core_dev *dev)
 {
+	u64 header_modify_pattern_icm_blocks = 0;
 	u64 header_modify_icm_blocks = 0;
 	u64 steering_icm_blocks = 0;
 	struct mlx5_dm *dm;
+	bool support_v2;
 
 	if (!(MLX5_CAP_GEN_64(dev, general_obj_types) & MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM))
 		return NULL;
@@ -53,8 +56,27 @@ struct mlx5_dm *mlx5_dm_create(struct mlx5_core_dev *dev)
 			goto err_modify_hdr;
 	}
 
+	support_v2 = MLX5_CAP_FLOWTABLE_NIC_RX(dev, sw_owner_v2) &&
+		     MLX5_CAP_FLOWTABLE_NIC_TX(dev, sw_owner_v2) &&
+		     MLX5_CAP64_DEV_MEM(dev, header_modify_pattern_sw_icm_start_address);
+
+	if (support_v2) {
+		header_modify_pattern_icm_blocks =
+			BIT(MLX5_CAP_DEV_MEM(dev, log_header_modify_pattern_sw_icm_size) -
+			    MLX5_LOG_SW_ICM_BLOCK_SIZE(dev));
+
+		dm->header_modify_pattern_sw_icm_alloc_blocks =
+			kcalloc(BITS_TO_LONGS(header_modify_pattern_icm_blocks),
+				sizeof(unsigned long), GFP_KERNEL);
+		if (!dm->header_modify_pattern_sw_icm_alloc_blocks)
+			goto err_pattern;
+	}
+
 	return dm;
 
+err_pattern:
+	kfree(dm->header_modify_sw_icm_alloc_blocks);
+
 err_modify_hdr:
 	kfree(dm->steering_sw_icm_alloc_blocks);
 
@@ -86,6 +108,14 @@ void mlx5_dm_cleanup(struct mlx5_core_dev *dev)
 		kfree(dm->header_modify_sw_icm_alloc_blocks);
 	}
 
+	if (dm->header_modify_pattern_sw_icm_alloc_blocks) {
+		WARN_ON(!bitmap_empty(dm->header_modify_pattern_sw_icm_alloc_blocks,
+				      BIT(MLX5_CAP_DEV_MEM(dev,
+							   log_header_modify_pattern_sw_icm_size) -
+					  MLX5_LOG_SW_ICM_BLOCK_SIZE(dev))));
+		kfree(dm->header_modify_pattern_sw_icm_alloc_blocks);
+	}
+
 	kfree(dm);
 }
 
@@ -130,6 +160,13 @@ int mlx5_dm_sw_icm_alloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type,
 						log_header_modify_sw_icm_size);
 		block_map = dm->header_modify_sw_icm_alloc_blocks;
 		break;
+	case MLX5_SW_ICM_TYPE_HEADER_MODIFY_PATTERN:
+		icm_start_addr = MLX5_CAP64_DEV_MEM(dev,
+						    header_modify_pattern_sw_icm_start_address);
+		log_icm_size = MLX5_CAP_DEV_MEM(dev,
+						log_header_modify_pattern_sw_icm_size);
+		block_map = dm->header_modify_pattern_sw_icm_alloc_blocks;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -203,6 +240,11 @@ int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type
 		icm_start_addr = MLX5_CAP64_DEV_MEM(dev, header_modify_sw_icm_start_address);
 		block_map = dm->header_modify_sw_icm_alloc_blocks;
 		break;
+	case MLX5_SW_ICM_TYPE_HEADER_MODIFY_PATTERN:
+		icm_start_addr = MLX5_CAP64_DEV_MEM(dev,
+						    header_modify_pattern_sw_icm_start_address);
+		block_map = dm->header_modify_pattern_sw_icm_alloc_blocks;
+		break;
 	default:
 		return -EINVAL;
 	}
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 5040cd774c5a..76d7661e3e63 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -676,6 +676,7 @@ struct mlx5e_resources {
 enum mlx5_sw_icm_type {
 	MLX5_SW_ICM_TYPE_STEERING,
 	MLX5_SW_ICM_TYPE_HEADER_MODIFY,
+	MLX5_SW_ICM_TYPE_HEADER_MODIFY_PATTERN,
 };
 
 #define MLX5_MAX_RESERVED_GIDS 8
-- 
cgit 


From a6492af3805ae3d9fe872545aa4797971b4e2a33 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Tue, 7 Jun 2022 15:47:45 +0300
Subject: RDMA/mlx5: Support handling of modify-header pattern ICM area

Add support for allocate/deallocate and registering MR of the new type
of ICM area. Support exists only for devices that support sw_owner_v2.

Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/infiniband/hw/mlx5/dm.c           | 53 ++++++++++++++++++++-----------
 drivers/infiniband/hw/mlx5/mr.c           |  1 +
 include/uapi/rdma/mlx5_user_ioctl_verbs.h |  1 +
 3 files changed, 37 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/dm.c b/drivers/infiniband/hw/mlx5/dm.c
index 001d766cf291..3669c90b2dad 100644
--- a/drivers/infiniband/hw/mlx5/dm.c
+++ b/drivers/infiniband/hw/mlx5/dm.c
@@ -336,9 +336,15 @@ err_copy:
 
 static enum mlx5_sw_icm_type get_icm_type(int uapi_type)
 {
-	return uapi_type == MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM ?
-		       MLX5_SW_ICM_TYPE_STEERING :
-		       MLX5_SW_ICM_TYPE_HEADER_MODIFY;
+	switch (uapi_type) {
+	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
+		return MLX5_SW_ICM_TYPE_HEADER_MODIFY;
+	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
+		return MLX5_SW_ICM_TYPE_HEADER_MODIFY_PATTERN;
+	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
+	default:
+		return MLX5_SW_ICM_TYPE_STEERING;
+	}
 }
 
 static struct ib_dm *handle_alloc_dm_sw_icm(struct ib_ucontext *ctx,
@@ -347,11 +353,32 @@ static struct ib_dm *handle_alloc_dm_sw_icm(struct ib_ucontext *ctx,
 					    int type)
 {
 	struct mlx5_core_dev *dev = to_mdev(ctx->device)->mdev;
-	enum mlx5_sw_icm_type icm_type = get_icm_type(type);
+	enum mlx5_sw_icm_type icm_type;
 	struct mlx5_ib_dm_icm *dm;
 	u64 act_size;
 	int err;
 
+	if (!capable(CAP_SYS_RAWIO) || !capable(CAP_NET_RAW))
+		return ERR_PTR(-EPERM);
+
+	switch (type) {
+	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
+	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
+		if (!(MLX5_CAP_FLOWTABLE_NIC_RX(dev, sw_owner) ||
+		      MLX5_CAP_FLOWTABLE_NIC_TX(dev, sw_owner) ||
+		      MLX5_CAP_FLOWTABLE_NIC_RX(dev, sw_owner_v2) ||
+		      MLX5_CAP_FLOWTABLE_NIC_TX(dev, sw_owner_v2)))
+			return ERR_PTR(-EOPNOTSUPP);
+		break;
+	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
+		if (!MLX5_CAP_FLOWTABLE_NIC_RX(dev, sw_owner_v2) ||
+		    !MLX5_CAP_FLOWTABLE_NIC_TX(dev, sw_owner_v2))
+			return ERR_PTR(-EOPNOTSUPP);
+		break;
+	default:
+		return ERR_PTR(-EOPNOTSUPP);
+	}
+
 	dm = kzalloc(sizeof(*dm), GFP_KERNEL);
 	if (!dm)
 		return ERR_PTR(-ENOMEM);
@@ -359,19 +386,6 @@ static struct ib_dm *handle_alloc_dm_sw_icm(struct ib_ucontext *ctx,
 	dm->base.type = type;
 	dm->base.ibdm.device = ctx->device;
 
-	if (!capable(CAP_SYS_RAWIO) || !capable(CAP_NET_RAW)) {
-		err = -EPERM;
-		goto free;
-	}
-
-	if (!(MLX5_CAP_FLOWTABLE_NIC_RX(dev, sw_owner) ||
-	      MLX5_CAP_FLOWTABLE_NIC_TX(dev, sw_owner) ||
-	      MLX5_CAP_FLOWTABLE_NIC_RX(dev, sw_owner_v2) ||
-	      MLX5_CAP_FLOWTABLE_NIC_TX(dev, sw_owner_v2))) {
-		err = -EOPNOTSUPP;
-		goto free;
-	}
-
 	/* Allocation size must a multiple of the basic block size
 	 * and a power of 2.
 	 */
@@ -379,6 +393,8 @@ static struct ib_dm *handle_alloc_dm_sw_icm(struct ib_ucontext *ctx,
 	act_size = roundup_pow_of_two(act_size);
 
 	dm->base.size = act_size;
+	icm_type = get_icm_type(type);
+
 	err = mlx5_dm_sw_icm_alloc(dev, icm_type, act_size, attr->alignment,
 				   to_mucontext(ctx)->devx_uid,
 				   &dm->base.dev_addr, &dm->obj_id);
@@ -420,8 +436,8 @@ struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev,
 	case MLX5_IB_UAPI_DM_TYPE_MEMIC:
 		return handle_alloc_dm_memic(context, attr, attrs);
 	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
-		return handle_alloc_dm_sw_icm(context, attr, attrs, type);
 	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
+	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
 		return handle_alloc_dm_sw_icm(context, attr, attrs, type);
 	default:
 		return ERR_PTR(-EOPNOTSUPP);
@@ -474,6 +490,7 @@ static int mlx5_ib_dealloc_dm(struct ib_dm *ibdm,
 		return 0;
 	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
 	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
+	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
 		return mlx5_dm_icm_dealloc(ctx, to_icm(ibdm));
 	default:
 		return -EOPNOTSUPP;
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 1e7653c997b5..aedfd7ff4846 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1083,6 +1083,7 @@ struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
 		break;
 	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
 	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
+	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
 		if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
 			return ERR_PTR(-EINVAL);
 
diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h
index a21ca8ece8db..7af9e09ea556 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h
@@ -63,6 +63,7 @@ enum mlx5_ib_uapi_dm_type {
 	MLX5_IB_UAPI_DM_TYPE_MEMIC,
 	MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM,
 	MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM,
+	MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM,
 };
 
 enum mlx5_ib_uapi_devx_create_event_channel_flags {
-- 
cgit 


From f5d23ee137e51b4e5cd5d263b144d5e6719f6e52 Mon Sep 17 00:00:00 2001
From: Jianbo Liu <jianbol@nvidia.com>
Date: Wed, 8 Jun 2022 13:04:47 -0700
Subject: net/mlx5: Add IFC bits and enums for flow meter

Add/extend structure layouts and defines for flow meter.

Signed-off-by: Jianbo Liu <jianbol@nvidia.com>
Reviewed-by: Ariel Levkovich <lariel@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/device.h   |   1 +
 include/linux/mlx5/mlx5_ifc.h | 114 ++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 111 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 604b85dd770a..15ac02eeed4f 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -455,6 +455,7 @@ enum {
 
 	MLX5_OPCODE_UMR			= 0x25,
 
+	MLX5_OPCODE_ACCESS_ASO		= 0x2d,
 };
 
 enum {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 789d64400744..91872afb2bfe 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -442,7 +442,9 @@ struct mlx5_ifc_flow_table_prop_layout_bits {
 	u8         max_modify_header_actions[0x8];
 	u8         max_ft_level[0x8];
 
-	u8         reserved_at_40[0x20];
+	u8         reserved_at_40[0x6];
+	u8         execute_aso[0x1];
+	u8         reserved_at_47[0x19];
 
 	u8         reserved_at_60[0x2];
 	u8         reformat_insert[0x1];
@@ -940,7 +942,17 @@ struct mlx5_ifc_qos_cap_bits {
 
 	u8         max_tsar_bw_share[0x20];
 
-	u8         reserved_at_100[0x700];
+	u8         reserved_at_100[0x20];
+
+	u8         reserved_at_120[0x3];
+	u8         log_meter_aso_granularity[0x5];
+	u8         reserved_at_128[0x3];
+	u8         log_meter_aso_max_alloc[0x5];
+	u8         reserved_at_130[0x3];
+	u8         log_max_num_meter_aso[0x5];
+	u8         reserved_at_138[0x8];
+
+	u8         reserved_at_140[0x6c0];
 };
 
 struct mlx5_ifc_debug_cap_bits {
@@ -3280,6 +3292,7 @@ enum {
 	MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2 = 0x800,
 	MLX5_FLOW_CONTEXT_ACTION_IPSEC_DECRYPT = 0x1000,
 	MLX5_FLOW_CONTEXT_ACTION_IPSEC_ENCRYPT = 0x2000,
+	MLX5_FLOW_CONTEXT_ACTION_EXECUTE_ASO = 0x4000,
 };
 
 enum {
@@ -3295,6 +3308,38 @@ struct mlx5_ifc_vlan_bits {
 	u8         vid[0xc];
 };
 
+enum {
+	MLX5_FLOW_METER_COLOR_RED	= 0x0,
+	MLX5_FLOW_METER_COLOR_YELLOW	= 0x1,
+	MLX5_FLOW_METER_COLOR_GREEN	= 0x2,
+	MLX5_FLOW_METER_COLOR_UNDEFINED	= 0x3,
+};
+
+enum {
+	MLX5_EXE_ASO_FLOW_METER		= 0x2,
+};
+
+struct mlx5_ifc_exe_aso_ctrl_flow_meter_bits {
+	u8        return_reg_id[0x4];
+	u8        aso_type[0x4];
+	u8        reserved_at_8[0x14];
+	u8        action[0x1];
+	u8        init_color[0x2];
+	u8        meter_id[0x1];
+};
+
+union mlx5_ifc_exe_aso_ctrl {
+	struct mlx5_ifc_exe_aso_ctrl_flow_meter_bits exe_aso_ctrl_flow_meter;
+};
+
+struct mlx5_ifc_execute_aso_bits {
+	u8        valid[0x1];
+	u8        reserved_at_1[0x7];
+	u8        aso_object_id[0x18];
+
+	union mlx5_ifc_exe_aso_ctrl exe_aso_ctrl;
+};
+
 struct mlx5_ifc_flow_context_bits {
 	struct mlx5_ifc_vlan_bits push_vlan;
 
@@ -3326,7 +3371,9 @@ struct mlx5_ifc_flow_context_bits {
 
 	struct mlx5_ifc_fte_match_param_bits match_value;
 
-	u8         reserved_at_1200[0x600];
+	struct mlx5_ifc_execute_aso_bits execute_aso[4];
+
+	u8         reserved_at_1300[0x500];
 
 	union mlx5_ifc_dest_format_struct_flow_counter_list_auto_bits destination[];
 };
@@ -5973,7 +6020,9 @@ struct mlx5_ifc_general_obj_in_cmd_hdr_bits {
 
 	u8         obj_id[0x20];
 
-	u8         reserved_at_60[0x20];
+	u8         reserved_at_60[0x3];
+	u8         log_obj_range[0x5];
+	u8         reserved_at_68[0x18];
 };
 
 struct mlx5_ifc_general_obj_out_cmd_hdr_bits {
@@ -11373,12 +11422,14 @@ enum {
 	MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = BIT_ULL(0xc),
 	MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_IPSEC = BIT_ULL(0x13),
 	MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_SAMPLER = BIT_ULL(0x20),
+	MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_FLOW_METER_ASO = BIT_ULL(0x24),
 };
 
 enum {
 	MLX5_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = 0xc,
 	MLX5_GENERAL_OBJECT_TYPES_IPSEC = 0x13,
 	MLX5_GENERAL_OBJECT_TYPES_SAMPLER = 0x20,
+	MLX5_GENERAL_OBJECT_TYPES_FLOW_METER_ASO = 0x24,
 };
 
 enum {
@@ -11451,6 +11502,61 @@ struct mlx5_ifc_create_encryption_key_in_bits {
 	struct mlx5_ifc_encryption_key_obj_bits encryption_key_object;
 };
 
+enum {
+	MLX5_FLOW_METER_MODE_BYTES_IP_LENGTH		= 0x0,
+	MLX5_FLOW_METER_MODE_BYTES_CALC_WITH_L2		= 0x1,
+	MLX5_FLOW_METER_MODE_BYTES_CALC_WITH_L2_IPG	= 0x2,
+	MLX5_FLOW_METER_MODE_NUM_PACKETS		= 0x3,
+};
+
+struct mlx5_ifc_flow_meter_parameters_bits {
+	u8         valid[0x1];
+	u8         bucket_overflow[0x1];
+	u8         start_color[0x2];
+	u8         both_buckets_on_green[0x1];
+	u8         reserved_at_5[0x1];
+	u8         meter_mode[0x2];
+	u8         reserved_at_8[0x18];
+
+	u8         reserved_at_20[0x20];
+
+	u8         reserved_at_40[0x3];
+	u8         cbs_exponent[0x5];
+	u8         cbs_mantissa[0x8];
+	u8         reserved_at_50[0x3];
+	u8         cir_exponent[0x5];
+	u8         cir_mantissa[0x8];
+
+	u8         reserved_at_60[0x20];
+
+	u8         reserved_at_80[0x3];
+	u8         ebs_exponent[0x5];
+	u8         ebs_mantissa[0x8];
+	u8         reserved_at_90[0x3];
+	u8         eir_exponent[0x5];
+	u8         eir_mantissa[0x8];
+
+	u8         reserved_at_a0[0x60];
+};
+
+struct mlx5_ifc_flow_meter_aso_obj_bits {
+	u8         modify_field_select[0x40];
+
+	u8         reserved_at_40[0x40];
+
+	u8         reserved_at_80[0x8];
+	u8         meter_aso_access_pd[0x18];
+
+	u8         reserved_at_a0[0x160];
+
+	struct mlx5_ifc_flow_meter_parameters_bits flow_meter_parameters[2];
+};
+
+struct mlx5_ifc_create_flow_meter_aso_obj_in_bits {
+	struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr;
+	struct mlx5_ifc_flow_meter_aso_obj_bits flow_meter_aso_obj;
+};
+
 struct mlx5_ifc_sampler_obj_bits {
 	u8         modify_field_select[0x40];
 
-- 
cgit 


From 3e94e61bd44d90070dcda53b647fdc826097ef26 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@nvidia.com>
Date: Wed, 8 Jun 2022 13:04:48 -0700
Subject: net/mlx5: Add HW definitions of vport debug counters

total_q_under_processor_handle - number of queues in error state due to an
async error or errored command.

send_queue_priority_update_flow - number of QP/SQ priority/SL update
events.

cq_overrun - number of times CQ entered an error state due to an
overflow.

async_eq_overrun -number of time an EQ mapped to async events was
overrun.

comp_eq_overrun - number of time an EQ mapped to completion events was
overrun.

quota_exceeded_command - number of commands issued and failed due to quota
exceeded.

invalid_command - number of commands issued and failed dues to any reason
other than quota exceeded.

Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 91872afb2bfe..f678ad88a7d5 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1441,7 +1441,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         reserved_at_120[0xa];
 	u8         log_max_ra_req_dc[0x6];
-	u8         reserved_at_130[0xa];
+	u8         reserved_at_130[0x9];
+	u8         vnic_env_cq_overrun[0x1];
 	u8         log_max_ra_res_dc[0x6];
 
 	u8         reserved_at_140[0x5];
@@ -1636,7 +1637,11 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         nic_receive_steering_discard[0x1];
 	u8         receive_discard_vport_down[0x1];
 	u8         transmit_discard_vport_down[0x1];
-	u8         reserved_at_343[0x5];
+	u8         eq_overrun_count[0x1];
+	u8         reserved_at_344[0x1];
+	u8         invalid_command_count[0x1];
+	u8         quota_exceeded_count[0x1];
+	u8         reserved_at_347[0x1];
 	u8         log_max_flow_counter_bulk[0x8];
 	u8         max_flow_counter_15_0[0x10];
 
@@ -3441,11 +3446,21 @@ struct mlx5_ifc_vnic_diagnostic_statistics_bits {
 
 	u8         transmit_discard_vport_down[0x40];
 
-	u8         reserved_at_140[0xa0];
+	u8         async_eq_overrun[0x20];
+
+	u8         comp_eq_overrun[0x20];
+
+	u8         reserved_at_180[0x20];
+
+	u8         invalid_command[0x20];
+
+	u8         quota_exceeded_command[0x20];
 
 	u8         internal_rq_out_of_buffer[0x20];
 
-	u8         reserved_at_200[0xe00];
+	u8         cq_overrun[0x20];
+
+	u8         reserved_at_220[0xde0];
 };
 
 struct mlx5_ifc_traffic_counter_bits {
-- 
cgit 


From 91707779a481aab9c7f1d7a5ea3033ce87dc4fd6 Mon Sep 17 00:00:00 2001
From: Jianbo Liu <jianbol@nvidia.com>
Date: Wed, 8 Jun 2022 13:04:49 -0700
Subject: net/mlx5: Add support EXECUTE_ASO action for flow entry

Attach flow meter to FTE with object id and index.
Use metadata register C5 to store the packet color meter result.

Signed-off-by: Jianbo Liu <jianbol@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 33 ++++++++++++++++++++++++
 include/linux/mlx5/fs.h                          | 14 ++++++++++
 2 files changed, 47 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index 2ccf7bef9b05..735dc805dad7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -479,6 +479,30 @@ static int mlx5_set_extended_dest(struct mlx5_core_dev *dev,
 
 	return 0;
 }
+
+static void
+mlx5_cmd_set_fte_flow_meter(struct fs_fte *fte, void *in_flow_context)
+{
+	void *exe_aso_ctrl;
+	void *execute_aso;
+
+	execute_aso = MLX5_ADDR_OF(flow_context, in_flow_context,
+				   execute_aso[0]);
+	MLX5_SET(execute_aso, execute_aso, valid, 1);
+	MLX5_SET(execute_aso, execute_aso, aso_object_id,
+		 fte->action.exe_aso.object_id);
+
+	exe_aso_ctrl = MLX5_ADDR_OF(execute_aso, execute_aso, exe_aso_ctrl);
+	MLX5_SET(exe_aso_ctrl_flow_meter, exe_aso_ctrl, return_reg_id,
+		 fte->action.exe_aso.return_reg_id);
+	MLX5_SET(exe_aso_ctrl_flow_meter, exe_aso_ctrl, aso_type,
+		 fte->action.exe_aso.type);
+	MLX5_SET(exe_aso_ctrl_flow_meter, exe_aso_ctrl, init_color,
+		 fte->action.exe_aso.flow_meter.init_color);
+	MLX5_SET(exe_aso_ctrl_flow_meter, exe_aso_ctrl, meter_id,
+		 fte->action.exe_aso.flow_meter.meter_idx);
+}
+
 static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 			    int opmod, int modify_mask,
 			    struct mlx5_flow_table *ft,
@@ -663,6 +687,15 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 			 list_size);
 	}
 
+	if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_EXECUTE_ASO) {
+		if (fte->action.exe_aso.type == MLX5_EXE_ASO_FLOW_METER) {
+			mlx5_cmd_set_fte_flow_meter(fte, in_flow_context);
+		} else {
+			err = -EOPNOTSUPP;
+			goto err_out;
+		}
+	}
+
 	err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
 err_out:
 	kvfree(in);
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 8135713b0d2d..ece3e35622d7 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -212,6 +212,19 @@ struct mlx5_flow_group *
 mlx5_create_flow_group(struct mlx5_flow_table *ft, u32 *in);
 void mlx5_destroy_flow_group(struct mlx5_flow_group *fg);
 
+struct mlx5_exe_aso {
+	u32 object_id;
+	u8 type;
+	u8 return_reg_id;
+	union {
+		u32 ctrl_data;
+		struct {
+			u8 meter_idx;
+			u8 init_color;
+		} flow_meter;
+	};
+};
+
 struct mlx5_fs_vlan {
         u16 ethtype;
         u16 vid;
@@ -237,6 +250,7 @@ struct mlx5_flow_act {
 	struct mlx5_fs_vlan vlan[MLX5_FS_VLAN_DEPTH];
 	struct ib_counters *counters;
 	struct mlx5_flow_group *fg;
+	struct mlx5_exe_aso exe_aso;
 };
 
 #define MLX5_DECLARE_FLOW_ACT(name) \
-- 
cgit 


From d107ba1f7c067b08eb4b8ca7c51187fb7c4d97f2 Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Wed, 8 Jun 2022 13:04:51 -0700
Subject: net/mlx5: Remove not used MLX5_CAP_BITS_RW_MASK

Remove not used MLX5_CAP_BITS_RW_MASK.
While at it, remove CAP_MASK, MLX5_CAP_OFF_CMDIF_CSUM
and MLX5_DEV_CAP_FLAG_*, since MLX5_CAP_BITS_RW_MASK
was their only user.

Signed-off-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c |  7 -------
 include/linux/mlx5/device.h                    | 19 -------------------
 2 files changed, 26 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index c9b4e50a593e..2078d9f03a5f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -314,13 +314,6 @@ struct mlx5_reg_host_endianness {
 	u8      rsvd[15];
 };
 
-#define CAP_MASK(pos, size) ((u64)((1 << (size)) - 1) << (pos))
-
-enum {
-	MLX5_CAP_BITS_RW_MASK = CAP_MASK(MLX5_CAP_OFF_CMDIF_CSUM, 2) |
-				MLX5_DEV_CAP_FLAG_DCT,
-};
-
 static u16 to_fw_pkey_sz(struct mlx5_core_dev *dev, u32 size)
 {
 	switch (size) {
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 15ac02eeed4f..95a4fa0fd40a 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -386,21 +386,6 @@ enum {
 	MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG	= 9,
 };
 
-enum {
-	MLX5_DEV_CAP_FLAG_XRC		= 1LL <<  3,
-	MLX5_DEV_CAP_FLAG_BAD_PKEY_CNTR	= 1LL <<  8,
-	MLX5_DEV_CAP_FLAG_BAD_QKEY_CNTR	= 1LL <<  9,
-	MLX5_DEV_CAP_FLAG_APM		= 1LL << 17,
-	MLX5_DEV_CAP_FLAG_ATOMIC	= 1LL << 18,
-	MLX5_DEV_CAP_FLAG_BLOCK_MCAST	= 1LL << 23,
-	MLX5_DEV_CAP_FLAG_ON_DMND_PG	= 1LL << 24,
-	MLX5_DEV_CAP_FLAG_CQ_MODER	= 1LL << 29,
-	MLX5_DEV_CAP_FLAG_RESIZE_CQ	= 1LL << 30,
-	MLX5_DEV_CAP_FLAG_DCT		= 1LL << 37,
-	MLX5_DEV_CAP_FLAG_SIG_HAND_OVER	= 1LL << 40,
-	MLX5_DEV_CAP_FLAG_CMDIF_CSUM	= 3LL << 46,
-};
-
 enum {
 	MLX5_ROCE_VERSION_1		= 0,
 	MLX5_ROCE_VERSION_2		= 2,
@@ -496,10 +481,6 @@ enum {
 	MLX5_MAX_PAGE_SHIFT		= 31
 };
 
-enum {
-	MLX5_CAP_OFF_CMDIF_CSUM		= 46,
-};
-
 enum {
 	/*
 	 * Max wqe size for rdma read is 512 bytes, so this
-- 
cgit 


From cdcdce948d64139aea1c6dfea4b04f5c8ad2784e Mon Sep 17 00:00:00 2001
From: Ofer Levi <oferle@nvidia.com>
Date: Wed, 8 Jun 2022 13:04:52 -0700
Subject: net/mlx5: Add bits and fields to support enhanced CQE compression

Expose ifc bits and add needed structure fields and methods to
support enhanced CQE compression feature.
The enhanced CQE compression feature improves cpu utiliziation with
better packet latency from nic to host.

Signed-off-by: Ofer Levi <oferle@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/device.h   | 16 +++++++++++++++-
 include/linux/mlx5/mlx5_ifc.h |  7 +++++--
 2 files changed, 20 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 95a4fa0fd40a..b5f58fd37a0f 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -822,7 +822,10 @@ struct mlx5_cqe64 {
 	__be32		timestamp_l;
 	__be32		sop_drop_qpn;
 	__be16		wqe_counter;
-	u8		signature;
+	union {
+		u8	signature;
+		u8	validity_iteration_count;
+	};
 	u8		op_own;
 };
 
@@ -854,6 +857,11 @@ enum {
 	MLX5_CQE_FORMAT_CSUM_STRIDX = 0x3,
 };
 
+enum {
+	MLX5_CQE_COMPRESS_LAYOUT_BASIC = 0,
+	MLX5_CQE_COMPRESS_LAYOUT_ENHANCED = 1,
+};
+
 #define MLX5_MINI_CQE_ARRAY_SIZE 8
 
 static inline u8 mlx5_get_cqe_format(struct mlx5_cqe64 *cqe)
@@ -866,6 +874,12 @@ static inline u8 get_cqe_opcode(struct mlx5_cqe64 *cqe)
 	return cqe->op_own >> 4;
 }
 
+static inline u8 get_cqe_enhanced_num_mini_cqes(struct mlx5_cqe64 *cqe)
+{
+	/* num_of_mini_cqes is zero based */
+	return get_cqe_opcode(cqe) + 1;
+}
+
 static inline u8 get_cqe_lro_tcppsh(struct mlx5_cqe64 *cqe)
 {
 	return (cqe->lro.tcppsh_abort_dupack >> 6) & 1;
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index f678ad88a7d5..8e87eb47f9dc 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1739,7 +1739,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8	   log_max_dci_errored_streams[0x5];
 	u8	   reserved_at_598[0x8];
 
-	u8         reserved_at_5a0[0x13];
+	u8         reserved_at_5a0[0x10];
+	u8         enhanced_cqe_compression[0x1];
+	u8         reserved_at_5b1[0x2];
 	u8         log_max_dek[0x5];
 	u8         reserved_at_5b8[0x4];
 	u8         mini_cqe_resp_stride_index[0x1];
@@ -4139,7 +4141,8 @@ struct mlx5_ifc_cqc_bits {
 	u8         cqe_comp_en[0x1];
 	u8         mini_cqe_res_format[0x2];
 	u8         st[0x4];
-	u8         reserved_at_18[0x8];
+	u8         reserved_at_18[0x6];
+	u8         cqe_compression_layout[0x2];
 
 	u8         reserved_at_20[0x20];
 
-- 
cgit 


From 6554400d6f66b9494a0c0f07712ab0a9d307eb01 Mon Sep 17 00:00:00 2001
From: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Date: Fri, 3 Jun 2022 20:05:19 +0900
Subject: scsi: ufs: core: Add UFSHCD_QUIRK_BROKEN_64BIT_ADDRESS

Add UFSHCD_QUIRK_BROKEN_64BIT_ADDRESS for host controllers which do not
support 64-bit addressing.

Link: https://lore.kernel.org/r/20220603110524.1997825-3-yoshihiro.shimoda.uh@renesas.com
Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufshcd.c | 2 ++
 include/ufs/ufshcd.h      | 6 ++++++
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 01fb4bad86be..88e567c3ba0b 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -2216,6 +2216,8 @@ static inline int ufshcd_hba_capabilities(struct ufs_hba *hba)
 	int err;
 
 	hba->capabilities = ufshcd_readl(hba, REG_CONTROLLER_CAPABILITIES);
+	if (hba->quirks & UFSHCD_QUIRK_BROKEN_64BIT_ADDRESS)
+		hba->capabilities &= ~MASK_64_ADDRESSING_SUPPORT;
 
 	/* nutrs and nutmrs are 0 based values */
 	hba->nutrs = (hba->capabilities & MASK_TRANSFER_REQUESTS_SLOTS) + 1;
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index a92271421718..795c8951341d 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -577,6 +577,12 @@ enum ufshcd_quirks {
 	 * support physical host configuration.
 	 */
 	UFSHCD_QUIRK_SKIP_PH_CONFIGURATION		= 1 << 16,
+
+	/*
+	 * This quirk needs to be enabled if the host controller has
+	 * 64-bit addressing supported capability but it doesn't work.
+	 */
+	UFSHCD_QUIRK_BROKEN_64BIT_ADDRESS		= 1 << 17,
 };
 
 enum ufshcd_caps {
-- 
cgit 


From 2f11bbc2c7f37e3a6151ac548b1c0679cc90ea83 Mon Sep 17 00:00:00 2001
From: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Date: Fri, 3 Jun 2022 20:05:20 +0900
Subject: scsi: ufs: core: Add UFSHCD_QUIRK_HIBERN_FASTAUTO

Add UFSHCD_QUIRK_HIBERN_FASTAUTO quirk for host controllers which supports
auto-hibernate the capability but only FASTAUTO mode.

Link: https://lore.kernel.org/r/20220603110524.1997825-4-yoshihiro.shimoda.uh@renesas.com
Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufshcd.c | 9 +++++++--
 include/ufs/ufshcd.h      | 6 ++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 88e567c3ba0b..bb6cbd514a69 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -4260,8 +4260,13 @@ static int ufshcd_get_max_pwr_mode(struct ufs_hba *hba)
 	if (hba->max_pwr_info.is_valid)
 		return 0;
 
-	pwr_info->pwr_tx = FAST_MODE;
-	pwr_info->pwr_rx = FAST_MODE;
+	if (hba->quirks & UFSHCD_QUIRK_HIBERN_FASTAUTO) {
+		pwr_info->pwr_tx = FASTAUTO_MODE;
+		pwr_info->pwr_rx = FASTAUTO_MODE;
+	} else {
+		pwr_info->pwr_tx = FAST_MODE;
+		pwr_info->pwr_rx = FAST_MODE;
+	}
 	pwr_info->hs_rate = PA_HS_MODE_B;
 
 	/* Get the connected lane count */
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index 795c8951341d..991aea081ec7 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -583,6 +583,12 @@ enum ufshcd_quirks {
 	 * 64-bit addressing supported capability but it doesn't work.
 	 */
 	UFSHCD_QUIRK_BROKEN_64BIT_ADDRESS		= 1 << 17,
+
+	/*
+	 * This quirk needs to be enabled if the host controller has
+	 * auto-hibernate capability but it's FASTAUTO only.
+	 */
+	UFSHCD_QUIRK_HIBERN_FASTAUTO			= 1 << 18,
 };
 
 enum ufshcd_caps {
-- 
cgit 


From 689614ce48b0310b50d8d6c9a64f8a98cfc6f195 Mon Sep 17 00:00:00 2001
From: Ajit Kumar Pandey <AjitKumar.Pandey@amd.com>
Date: Tue, 14 Jun 2022 10:52:51 +0300
Subject: ASoC: SOF: topology: add code to parse config params for ACPDMIC dai
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add sof_ipc_dai_acpdmic_params and tokens to parse dmic channels and
rate params from topology file

Signed-off-by: Ajit Kumar Pandey <AjitKumar.Pandey@amd.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Link: https://lore.kernel.org/r/20220614075251.21499-1-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sof/dai-amd.h     |  7 +++++++
 include/sound/sof/dai.h         |  2 +-
 include/uapi/sound/sof/tokens.h |  4 ++++
 sound/soc/sof/ipc3-pcm.c        |  8 ++++----
 sound/soc/sof/ipc3-topology.c   | 25 +++++++++++++++++++------
 sound/soc/sof/sof-audio.h       |  1 +
 sound/soc/sof/topology.c        |  4 ++++
 7 files changed, 40 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/sound/sof/dai-amd.h b/include/sound/sof/dai-amd.h
index 90d09dbdd709..92f45c180b7c 100644
--- a/include/sound/sof/dai-amd.h
+++ b/include/sound/sof/dai-amd.h
@@ -18,4 +18,11 @@ struct sof_ipc_dai_acp_params {
 	uint32_t fsync_rate;    /* FSYNC frequency in Hz */
 	uint32_t tdm_slots;
 } __packed;
+
+/* ACPDMIC Configuration Request - SOF_IPC_DAI_AMD_CONFIG */
+struct sof_ipc_dai_acpdmic_params {
+	uint32_t pdm_rate;
+	uint32_t pdm_ch;
+} __packed;
+
 #endif
diff --git a/include/sound/sof/dai.h b/include/sound/sof/dai.h
index a818a0f0a226..21d98f31a9ca 100644
--- a/include/sound/sof/dai.h
+++ b/include/sound/sof/dai.h
@@ -111,7 +111,7 @@ struct sof_ipc_dai_config {
 		struct sof_ipc_dai_sai_params sai;
 		struct sof_ipc_dai_acp_params acpbt;
 		struct sof_ipc_dai_acp_params acpsp;
-		struct sof_ipc_dai_acp_params acpdmic;
+		struct sof_ipc_dai_acpdmic_params acpdmic;
 		struct sof_ipc_dai_mtk_afe_params afe;
 	};
 } __packed;
diff --git a/include/uapi/sound/sof/tokens.h b/include/uapi/sound/sof/tokens.h
index f7b2019065ad..5caf75cadaf8 100644
--- a/include/uapi/sound/sof/tokens.h
+++ b/include/uapi/sound/sof/tokens.h
@@ -157,6 +157,10 @@
 /* MIXER */
 #define SOF_TKN_MIXER_TYPE			1700
 
+/* ACPDMIC */
+#define SOF_TKN_AMD_ACPDMIC_RATE		1800
+#define SOF_TKN_AMD_ACPDMIC_CH			1801
+
 /* CAVS AUDIO FORMAT */
 #define SOF_TKN_CAVS_AUDIO_FORMAT_IN_RATE	1900
 #define SOF_TKN_CAVS_AUDIO_FORMAT_IN_BIT_DEPTH	1901
diff --git a/sound/soc/sof/ipc3-pcm.c b/sound/soc/sof/ipc3-pcm.c
index c8774a891d6f..b97e63d3724a 100644
--- a/sound/soc/sof/ipc3-pcm.c
+++ b/sound/soc/sof/ipc3-pcm.c
@@ -344,10 +344,10 @@ static int sof_ipc3_pcm_dai_link_fixup(struct snd_soc_pcm_runtime *rtd,
 			channels->min, channels->max);
 		break;
 	case SOF_DAI_AMD_DMIC:
-		rate->min = private->dai_config->acpdmic.fsync_rate;
-		rate->max = private->dai_config->acpdmic.fsync_rate;
-		channels->min = private->dai_config->acpdmic.tdm_slots;
-		channels->max = private->dai_config->acpdmic.tdm_slots;
+		rate->min = private->dai_config->acpdmic.pdm_rate;
+		rate->max = private->dai_config->acpdmic.pdm_rate;
+		channels->min = private->dai_config->acpdmic.pdm_ch;
+		channels->max = private->dai_config->acpdmic.pdm_ch;
 
 		dev_dbg(component->dev,
 			"AMD_DMIC rate_min: %d rate_max: %d\n", rate->min, rate->max);
diff --git a/sound/soc/sof/ipc3-topology.c b/sound/soc/sof/ipc3-topology.c
index a91d7df3f07e..5ee1537f9c2d 100644
--- a/sound/soc/sof/ipc3-topology.c
+++ b/sound/soc/sof/ipc3-topology.c
@@ -266,6 +266,16 @@ static const struct sof_topology_token afe_tokens[] = {
 		offsetof(struct sof_ipc_dai_mtk_afe_params, format)},
 };
 
+/* ACPDMIC */
+static const struct sof_topology_token acpdmic_tokens[] = {
+	{SOF_TKN_AMD_ACPDMIC_RATE,
+		SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
+		offsetof(struct sof_ipc_dai_acpdmic_params, pdm_rate)},
+	{SOF_TKN_AMD_ACPDMIC_CH,
+		SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
+		offsetof(struct sof_ipc_dai_acpdmic_params, pdm_ch)},
+};
+
 /* Core tokens */
 static const struct sof_topology_token core_tokens[] = {
 	{SOF_TKN_COMP_CORE_ID, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32,
@@ -300,6 +310,7 @@ static const struct sof_token_info ipc3_token_list[SOF_TOKEN_COUNT] = {
 	[SOF_ESAI_TOKENS] = {"ESAI tokens", esai_tokens, ARRAY_SIZE(esai_tokens)},
 	[SOF_SAI_TOKENS] = {"SAI tokens", sai_tokens, ARRAY_SIZE(sai_tokens)},
 	[SOF_AFE_TOKENS] = {"AFE tokens", afe_tokens, ARRAY_SIZE(afe_tokens)},
+	[SOF_ACPDMIC_TOKENS] = {"ACPDMIC tokens", acpdmic_tokens, ARRAY_SIZE(acpdmic_tokens)},
 };
 
 /**
@@ -1120,20 +1131,22 @@ static int sof_link_acp_dmic_load(struct snd_soc_component *scomp, struct snd_so
 	struct snd_soc_tplg_hw_config *hw_config = slink->hw_configs;
 	struct sof_dai_private_data *private = dai->private;
 	u32 size = sizeof(*config);
+	int ret;
 
        /* handle master/slave and inverted clocks */
 	sof_dai_set_format(hw_config, config);
 
-	/* init IPC */
-	memset(&config->acpdmic, 0, sizeof(config->acpdmic));
 	config->hdr.size = size;
 
-	config->acpdmic.fsync_rate = le32_to_cpu(hw_config->fsync_rate);
-	config->acpdmic.tdm_slots = le32_to_cpu(hw_config->tdm_slots);
+	/* parse the required set of ACPDMIC tokens based on num_hw_cfgs */
+	ret = sof_update_ipc_object(scomp, &config->acpdmic, SOF_ACPDMIC_TOKENS, slink->tuples,
+				    slink->num_tuples, size, slink->num_hw_configs);
+	if (ret < 0)
+		return ret;
 
 	dev_info(scomp->dev, "ACP_DMIC config ACP%d channel %d rate %d\n",
-		 config->dai_index, config->acpdmic.tdm_slots,
-		 config->acpdmic.fsync_rate);
+		 config->dai_index, config->acpdmic.pdm_ch,
+		 config->acpdmic.pdm_rate);
 
 	dai->number_configs = 1;
 	dai->current_config = 0;
diff --git a/sound/soc/sof/sof-audio.h b/sound/soc/sof/sof-audio.h
index 79486266081f..4284ea2f3a1f 100644
--- a/sound/soc/sof/sof-audio.h
+++ b/sound/soc/sof/sof-audio.h
@@ -236,6 +236,7 @@ enum sof_tokens {
 	SOF_AUDIO_FMT_NUM_TOKENS,
 	SOF_COPIER_FORMAT_TOKENS,
 	SOF_GAIN_TOKENS,
+	SOF_ACPDMIC_TOKENS,
 
 	/* this should be the last */
 	SOF_TOKEN_COUNT,
diff --git a/sound/soc/sof/topology.c b/sound/soc/sof/topology.c
index 1893c590f2f0..7e54eb1bf77b 100644
--- a/sound/soc/sof/topology.c
+++ b/sound/soc/sof/topology.c
@@ -1739,6 +1739,10 @@ static int sof_link_load(struct snd_soc_component *scomp, int index, struct snd_
 		token_id = SOF_AFE_TOKENS;
 		num_tuples += token_list[SOF_AFE_TOKENS].count;
 		break;
+	case SOF_DAI_AMD_DMIC:
+		token_id = SOF_ACPDMIC_TOKENS;
+		num_tuples += token_list[SOF_ACPDMIC_TOKENS].count;
+		break;
 	default:
 		break;
 	}
-- 
cgit 


From 9822bb87cee12a9d1e3321b652ba537af1f174aa Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Sun, 20 Feb 2022 16:33:27 +0000
Subject: iio: core: drop iio_get_time_res()

This function was introduced with the ability to pick a clock.
There are no upstream users so presumably it isn't as obviously useful
as it seemed at the time.  Hence drop it.

Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://lore.kernel.org/r/20220220163327.424696-1-jic23@kernel.org
---
 drivers/iio/industrialio-core.c | 23 -----------------------
 include/linux/iio/iio.h         |  1 -
 2 files changed, 24 deletions(-)

(limited to 'include')

diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 7517deec501e..dd42df854a57 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -334,29 +334,6 @@ s64 iio_get_time_ns(const struct iio_dev *indio_dev)
 }
 EXPORT_SYMBOL(iio_get_time_ns);
 
-/**
- * iio_get_time_res() - utility function to get time stamp clock resolution in
- *                      nano seconds.
- * @indio_dev: device
- */
-unsigned int iio_get_time_res(const struct iio_dev *indio_dev)
-{
-	switch (iio_device_get_clock(indio_dev)) {
-	case CLOCK_REALTIME:
-	case CLOCK_MONOTONIC:
-	case CLOCK_MONOTONIC_RAW:
-	case CLOCK_BOOTTIME:
-	case CLOCK_TAI:
-		return hrtimer_resolution;
-	case CLOCK_REALTIME_COARSE:
-	case CLOCK_MONOTONIC_COARSE:
-		return LOW_RES_NSEC;
-	default:
-		BUG();
-	}
-}
-EXPORT_SYMBOL(iio_get_time_res);
-
 static int __init iio_init(void)
 {
 	int ret;
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 233d2e6b7721..f6ea2ed99457 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -313,7 +313,6 @@ static inline bool iio_channel_has_available(const struct iio_chan_spec *chan,
 }
 
 s64 iio_get_time_ns(const struct iio_dev *indio_dev);
-unsigned int iio_get_time_res(const struct iio_dev *indio_dev);
 
 /*
  * Device operating modes
-- 
cgit 


From 12c4efe3509b8018e76ea3ebda8227cb53bf5887 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Sun, 8 May 2022 18:55:41 +0100
Subject: iio: core: Fix IIO_ALIGN and rename as it was not sufficiently large
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Discussion of the series:
https://lore.kernel.org/all/20220405135758.774016-1-catalin.marinas@arm.com/
mm, arm64: Reduce ARCH_KMALLOC_MINALIGN brought to my attention that
our current IIO usage of L1CACHE_ALIGN is insufficient as their are Arm
platforms out their with non coherent DMA and larger cache lines at
at higher levels of their cache hierarchy.

Rename the define to make it's purpose more explicit. It will be used
much more widely going forwards (to replace incorrect ____cacheline_aligned
markings.

Note this patch will greatly reduce the padding on some architectures
that have smaller requirements for DMA safe buffers.

The history of changing values of ARCH_KMALLOC_MINALIGN via
ARCH_DMA_MINALIGN on arm64 is rather complex. I'm not tagging this
as fixing a particular patch from that route as it's not clear what to tag.

Most recently a change to bring them back inline was reverted because
of some Qualcomm Kryo cores with an L2 cache with 128-byte lines
sitting above the point of coherency.

c1132702c71f Revert "arm64: cache: Lower ARCH_DMA_MINALIGN to 64 (L1_CACHE_BYTES)"
That reverts:
65688d2a05de arm64: cache: Lower ARCH_DMA_MINALIGN to 64 (L1_CACHE_BYTES) which
refers to the change originally being motivated by Thunder x1 performance
rather than correctness.

Fixes: 6f7c8ee585e9d ("staging:iio: Add ability to allocate private data space to iio_allocate_device")
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Nuno Sá <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20220508175712.647246-2-jic23@kernel.org
---
 drivers/iio/accel/bma400_core.c |  2 +-
 drivers/iio/adc/adi-axi-adc.c   |  7 ++++---
 drivers/iio/industrialio-core.c |  4 ++--
 include/linux/iio/iio.h         | 10 ++++++++--
 4 files changed, 15 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/iio/accel/bma400_core.c b/drivers/iio/accel/bma400_core.c
index 517920400df1..c31bdd9b168e 100644
--- a/drivers/iio/accel/bma400_core.c
+++ b/drivers/iio/accel/bma400_core.c
@@ -93,7 +93,7 @@ struct bma400_data {
 		__le16 buff[3];
 		u8 temperature;
 		s64 ts __aligned(8);
-	} buffer __aligned(IIO_ALIGN);
+	} buffer __aligned(IIO_DMA_MINALIGN);
 	__le16 status;
 	__be16 duration;
 };
diff --git a/drivers/iio/adc/adi-axi-adc.c b/drivers/iio/adc/adi-axi-adc.c
index cf07d522925e..bd6fd0dd04b9 100644
--- a/drivers/iio/adc/adi-axi-adc.c
+++ b/drivers/iio/adc/adi-axi-adc.c
@@ -84,7 +84,8 @@ void *adi_axi_adc_conv_priv(struct adi_axi_adc_conv *conv)
 {
 	struct adi_axi_adc_client *cl = conv_to_client(conv);
 
-	return (char *)cl + ALIGN(sizeof(struct adi_axi_adc_client), IIO_ALIGN);
+	return (char *)cl + ALIGN(sizeof(struct adi_axi_adc_client),
+				  IIO_DMA_MINALIGN);
 }
 EXPORT_SYMBOL_NS_GPL(adi_axi_adc_conv_priv, IIO_ADI_AXI);
 
@@ -169,9 +170,9 @@ static struct adi_axi_adc_conv *adi_axi_adc_conv_register(struct device *dev,
 	struct adi_axi_adc_client *cl;
 	size_t alloc_size;
 
-	alloc_size = ALIGN(sizeof(struct adi_axi_adc_client), IIO_ALIGN);
+	alloc_size = ALIGN(sizeof(struct adi_axi_adc_client), IIO_DMA_MINALIGN);
 	if (sizeof_priv)
-		alloc_size += ALIGN(sizeof_priv, IIO_ALIGN);
+		alloc_size += ALIGN(sizeof_priv, IIO_DMA_MINALIGN);
 
 	cl = kzalloc(alloc_size, GFP_KERNEL);
 	if (!cl)
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index dd42df854a57..0703c8c7c180 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -1630,7 +1630,7 @@ struct iio_dev *iio_device_alloc(struct device *parent, int sizeof_priv)
 
 	alloc_size = sizeof(struct iio_dev_opaque);
 	if (sizeof_priv) {
-		alloc_size = ALIGN(alloc_size, IIO_ALIGN);
+		alloc_size = ALIGN(alloc_size, IIO_DMA_MINALIGN);
 		alloc_size += sizeof_priv;
 	}
 
@@ -1640,7 +1640,7 @@ struct iio_dev *iio_device_alloc(struct device *parent, int sizeof_priv)
 
 	indio_dev = &iio_dev_opaque->indio_dev;
 	indio_dev->priv = (char *)iio_dev_opaque +
-		ALIGN(sizeof(struct iio_dev_opaque), IIO_ALIGN);
+		ALIGN(sizeof(struct iio_dev_opaque), IIO_DMA_MINALIGN);
 
 	indio_dev->dev.parent = parent;
 	indio_dev->dev.type = &iio_device_type;
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index f6ea2ed99457..4e21a82b3756 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -9,6 +9,7 @@
 
 #include <linux/device.h>
 #include <linux/cdev.h>
+#include <linux/slab.h>
 #include <linux/iio/types.h>
 #include <linux/of.h>
 /* IIO TODO LIST */
@@ -708,8 +709,13 @@ static inline void *iio_device_get_drvdata(const struct iio_dev *indio_dev)
 	return dev_get_drvdata(&indio_dev->dev);
 }
 
-/* Can we make this smaller? */
-#define IIO_ALIGN L1_CACHE_BYTES
+/*
+ * Used to ensure the iio_priv() structure is aligned to allow that structure
+ * to in turn include IIO_DMA_MINALIGN'd elements such as buffers which
+ * must not share  cachelines with the rest of the structure, thus making
+ * them safe for use with non-coherent DMA.
+ */
+#define IIO_DMA_MINALIGN ARCH_KMALLOC_MINALIGN
 struct iio_dev *iio_device_alloc(struct device *parent, int sizeof_priv);
 
 /* The information at the returned address is guaranteed to be cacheline aligned */
-- 
cgit 


From 0da11bf0cab9029db8b85e48d962ff05c00a4faa Mon Sep 17 00:00:00 2001
From: Eiichi Tsukata <eiichi.tsukata@nutanix.com>
Date: Fri, 27 May 2022 00:53:45 +0000
Subject: cpuidle: haltpoll: Add trace points for guest_halt_poll_ns
 grow/shrink

Add trace points as are implemented in KVM host halt polling.
This helps tune guest halt polling params.

Signed-off-by: Eiichi Tsukata <eiichi.tsukata@nutanix.com>
Acked-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/governors/haltpoll.c |  3 +++
 include/trace/events/power.h         | 29 +++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

(limited to 'include')

diff --git a/drivers/cpuidle/governors/haltpoll.c b/drivers/cpuidle/governors/haltpoll.c
index cb2a96eafc02..1dff3a52917d 100644
--- a/drivers/cpuidle/governors/haltpoll.c
+++ b/drivers/cpuidle/governors/haltpoll.c
@@ -19,6 +19,7 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/kvm_para.h>
+#include <trace/events/power.h>
 
 static unsigned int guest_halt_poll_ns __read_mostly = 200000;
 module_param(guest_halt_poll_ns, uint, 0644);
@@ -90,6 +91,7 @@ static void adjust_poll_limit(struct cpuidle_device *dev, u64 block_ns)
 		if (val > guest_halt_poll_ns)
 			val = guest_halt_poll_ns;
 
+		trace_guest_halt_poll_ns_grow(val, dev->poll_limit_ns);
 		dev->poll_limit_ns = val;
 	} else if (block_ns > guest_halt_poll_ns &&
 		   guest_halt_poll_allow_shrink) {
@@ -100,6 +102,7 @@ static void adjust_poll_limit(struct cpuidle_device *dev, u64 block_ns)
 			val = 0;
 		else
 			val /= shrink;
+		trace_guest_halt_poll_ns_shrink(val, dev->poll_limit_ns);
 		dev->poll_limit_ns = val;
 	}
 }
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index af5018aa9517..c708521e4ed5 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -500,6 +500,35 @@ DEFINE_EVENT(dev_pm_qos_request, dev_pm_qos_remove_request,
 
 	TP_ARGS(name, type, new_value)
 );
+
+TRACE_EVENT(guest_halt_poll_ns,
+
+	TP_PROTO(bool grow, unsigned int new, unsigned int old),
+
+	TP_ARGS(grow, new, old),
+
+	TP_STRUCT__entry(
+		__field(bool, grow)
+		__field(unsigned int, new)
+		__field(unsigned int, old)
+	),
+
+	TP_fast_assign(
+		__entry->grow   = grow;
+		__entry->new    = new;
+		__entry->old    = old;
+	),
+
+	TP_printk("halt_poll_ns %u (%s %u)",
+		__entry->new,
+		__entry->grow ? "grow" : "shrink",
+		__entry->old)
+);
+
+#define trace_guest_halt_poll_ns_grow(new, old) \
+	trace_guest_halt_poll_ns(true, new, old)
+#define trace_guest_halt_poll_ns_shrink(new, old) \
+	trace_guest_halt_poll_ns(false, new, old)
 #endif /* _TRACE_POWER_H */
 
 /* This part must be outside protection */
-- 
cgit 


From 6dbdc9f35360d4ef4704462c265f63b32fcb5354 Mon Sep 17 00:00:00 2001
From: Hongyi Lu <jwnhy0@gmail.com>
Date: Mon, 13 Jun 2022 21:16:33 +0000
Subject: bpf: Fix spelling in bpf_verifier.h

Minor spelling fix spotted in bpf_verifier.h. Spelling is no big deal,
but it is still an improvement when reading through the code.

Signed-off-by: Hongyi Lu <jwnhy0@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220613211633.58647-1-jwnhy0@gmail.com
---
 include/linux/bpf_verifier.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e8439f6cbe57..3930c963fa67 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -299,7 +299,7 @@ struct bpf_verifier_state {
 	 * If is_state_visited() sees a state with branches > 0 it means
 	 * there is a loop. If such state is exactly equal to the current state
 	 * it's an infinite loop. Note states_equal() checks for states
-	 * equvalency, so two states being 'states_equal' does not mean
+	 * equivalency, so two states being 'states_equal' does not mean
 	 * infinite loop. The exact comparison is provided by
 	 * states_maybe_looping() function. It's a stronger pre-check and
 	 * much faster than states_equal().
-- 
cgit 


From c27e1efb61c545f36c450ef60862df9251d239a4 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Fri, 10 Jun 2022 08:45:37 +0200
Subject: ALSA: control: Use xarray for faster lookups

The control elements are managed in a single linked list and we
traverse the whole list for matching each numid or ctl id per every
inquiry of a control element.  This is OK-ish for a small number of
elements but obviously it doesn't scale.  Especially the matching with
the ctl id takes time because it checks each field of the snd_ctl_id
element, e.g. the name string is matched with strcmp().

This patch adds the hash tables with Xarray for improving the lookup
speed of a control element.  There are two xarray tables added to the
card; one for numid and another for ctl id.  For the numid, we use the
numid as the index, while for the ctl id, we calculate a hash key.

The lookup is done via a single xa_load() execution.  As long as the
given control element is found on the Xarray table, that's fine, we
can give back a quick lookup result.  The problem is when no entry
hits on the table, and for this case, we have a slight optimization.
Namely, the driver checks whether we had a collision on Xarray table,
and do a fallback search (linear lookup of the full entries) only if a
hash key collision happened beforehand.
So, in theory, the inquiry for a non-existing element might take still
time even with this patch in a worst case, but this must be pretty
rare.

The feature is enabled via CONFIG_SND_CTL_FAST_LOOKUP, which is turned
on as default.  For simplicity, the option can be turned off only when
CONFIG_EXPERT is set ("You are expert? Then you manage 1000 knobs").

Link: https://lore.kernel.org/r/20211028130027.18764-1-tiwai@suse.de
Link: https://lore.kernel.org/r/20220609180504.775-1-tiwai@suse.de
Link: https://lore.kernel.org/all/cover.1653813866.git.quic_rbankapu@quicinc.com/
Link: https://lore.kernel.org/r/20220610064537.18660-1-tiwai@suse.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/core.h |   6 ++
 sound/core/Kconfig   |  10 +++
 sound/core/control.c | 180 ++++++++++++++++++++++++++++++++++++++++++---------
 sound/core/init.c    |   4 ++
 4 files changed, 168 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/sound/core.h b/include/sound/core.h
index 6d4cc49584c6..dd28de2343b8 100644
--- a/include/sound/core.h
+++ b/include/sound/core.h
@@ -14,6 +14,7 @@
 #include <linux/pm.h>			/* pm_message_t */
 #include <linux/stringify.h>
 #include <linux/printk.h>
+#include <linux/xarray.h>
 
 /* number of supported soundcards */
 #ifdef CONFIG_SND_DYNAMIC_MINORS
@@ -103,6 +104,11 @@ struct snd_card {
 	size_t user_ctl_alloc_size;	// current memory allocation by user controls.
 	struct list_head controls;	/* all controls for this card */
 	struct list_head ctl_files;	/* active control files */
+#ifdef CONFIG_SND_CTL_FAST_LOOKUP
+	struct xarray ctl_numids;	/* hash table for numids */
+	struct xarray ctl_hash;		/* hash table for ctl id matching */
+	bool ctl_hash_collision;	/* ctl_hash collision seen? */
+#endif
 
 	struct snd_info_entry *proc_root;	/* root for soundcard specific files */
 	struct proc_dir_entry *proc_root_link;	/* number link to real id */
diff --git a/sound/core/Kconfig b/sound/core/Kconfig
index dd7b40734723..25b2434e4556 100644
--- a/sound/core/Kconfig
+++ b/sound/core/Kconfig
@@ -154,6 +154,16 @@ config SND_VERBOSE_PRINTK
 
 	  You don't need this unless you're debugging ALSA.
 
+config SND_CTL_FAST_LOOKUP
+	bool "Fast lookup of control elements" if EXPERT
+	default y
+	select XARRAY_MULTI
+	help
+	  This option enables the faster lookup of control elements.
+	  It will consume more memory because of the additional Xarray.
+	  If you want to choose the memory footprint over the performance
+	  inevitably, turn this off.
+
 config SND_DEBUG
 	bool "Debug"
 	help
diff --git a/sound/core/control.c b/sound/core/control.c
index a25c0d64d104..6a8fd9933f06 100644
--- a/sound/core/control.c
+++ b/sound/core/control.c
@@ -364,6 +364,93 @@ static int snd_ctl_find_hole(struct snd_card *card, unsigned int count)
 	return 0;
 }
 
+/* check whether the given id is contained in the given kctl */
+static bool elem_id_matches(const struct snd_kcontrol *kctl,
+			    const struct snd_ctl_elem_id *id)
+{
+	return kctl->id.iface == id->iface &&
+		kctl->id.device == id->device &&
+		kctl->id.subdevice == id->subdevice &&
+		!strncmp(kctl->id.name, id->name, sizeof(kctl->id.name)) &&
+		kctl->id.index <= id->index &&
+		kctl->id.index + kctl->count > id->index;
+}
+
+#ifdef CONFIG_SND_CTL_FAST_LOOKUP
+/* Compute a hash key for the corresponding ctl id
+ * It's for the name lookup, hence the numid is excluded.
+ * The hash key is bound in LONG_MAX to be used for Xarray key.
+ */
+#define MULTIPLIER	37
+static unsigned long get_ctl_id_hash(const struct snd_ctl_elem_id *id)
+{
+	unsigned long h;
+	const unsigned char *p;
+
+	h = id->iface;
+	h = MULTIPLIER * h + id->device;
+	h = MULTIPLIER * h + id->subdevice;
+	for (p = id->name; *p; p++)
+		h = MULTIPLIER * h + *p;
+	h = MULTIPLIER * h + id->index;
+	h &= LONG_MAX;
+	return h;
+}
+
+/* add hash entries to numid and ctl xarray tables */
+static void add_hash_entries(struct snd_card *card,
+			     struct snd_kcontrol *kcontrol)
+{
+	struct snd_ctl_elem_id id = kcontrol->id;
+	int i;
+
+	xa_store_range(&card->ctl_numids, kcontrol->id.numid,
+		       kcontrol->id.numid + kcontrol->count - 1,
+		       kcontrol, GFP_KERNEL);
+
+	for (i = 0; i < kcontrol->count; i++) {
+		id.index = kcontrol->id.index + i;
+		if (xa_insert(&card->ctl_hash, get_ctl_id_hash(&id),
+			      kcontrol, GFP_KERNEL)) {
+			/* skip hash for this entry, noting we had collision */
+			card->ctl_hash_collision = true;
+			dev_dbg(card->dev, "ctl_hash collision %d:%s:%d\n",
+				id.iface, id.name, id.index);
+		}
+	}
+}
+
+/* remove hash entries that have been added */
+static void remove_hash_entries(struct snd_card *card,
+				struct snd_kcontrol *kcontrol)
+{
+	struct snd_ctl_elem_id id = kcontrol->id;
+	struct snd_kcontrol *matched;
+	unsigned long h;
+	int i;
+
+	for (i = 0; i < kcontrol->count; i++) {
+		xa_erase(&card->ctl_numids, id.numid);
+		h = get_ctl_id_hash(&id);
+		matched = xa_load(&card->ctl_hash, h);
+		if (matched && (matched == kcontrol ||
+				elem_id_matches(matched, &id)))
+			xa_erase(&card->ctl_hash, h);
+		id.index++;
+		id.numid++;
+	}
+}
+#else /* CONFIG_SND_CTL_FAST_LOOKUP */
+static inline void add_hash_entries(struct snd_card *card,
+				    struct snd_kcontrol *kcontrol)
+{
+}
+static inline void remove_hash_entries(struct snd_card *card,
+				       struct snd_kcontrol *kcontrol)
+{
+}
+#endif /* CONFIG_SND_CTL_FAST_LOOKUP */
+
 enum snd_ctl_add_mode {
 	CTL_ADD_EXCLUSIVE, CTL_REPLACE, CTL_ADD_ON_REPLACE,
 };
@@ -408,6 +495,8 @@ static int __snd_ctl_add_replace(struct snd_card *card,
 	kcontrol->id.numid = card->last_numid + 1;
 	card->last_numid += kcontrol->count;
 
+	add_hash_entries(card, kcontrol);
+
 	for (idx = 0; idx < kcontrol->count; idx++)
 		snd_ctl_notify_one(card, SNDRV_CTL_EVENT_MASK_ADD, kcontrol, idx);
 
@@ -479,6 +568,26 @@ int snd_ctl_replace(struct snd_card *card, struct snd_kcontrol *kcontrol,
 }
 EXPORT_SYMBOL(snd_ctl_replace);
 
+static int __snd_ctl_remove(struct snd_card *card,
+			    struct snd_kcontrol *kcontrol,
+			    bool remove_hash)
+{
+	unsigned int idx;
+
+	if (snd_BUG_ON(!card || !kcontrol))
+		return -EINVAL;
+	list_del(&kcontrol->list);
+
+	if (remove_hash)
+		remove_hash_entries(card, kcontrol);
+
+	card->controls_count -= kcontrol->count;
+	for (idx = 0; idx < kcontrol->count; idx++)
+		snd_ctl_notify_one(card, SNDRV_CTL_EVENT_MASK_REMOVE, kcontrol, idx);
+	snd_ctl_free_one(kcontrol);
+	return 0;
+}
+
 /**
  * snd_ctl_remove - remove the control from the card and release it
  * @card: the card instance
@@ -492,16 +601,7 @@ EXPORT_SYMBOL(snd_ctl_replace);
  */
 int snd_ctl_remove(struct snd_card *card, struct snd_kcontrol *kcontrol)
 {
-	unsigned int idx;
-
-	if (snd_BUG_ON(!card || !kcontrol))
-		return -EINVAL;
-	list_del(&kcontrol->list);
-	card->controls_count -= kcontrol->count;
-	for (idx = 0; idx < kcontrol->count; idx++)
-		snd_ctl_notify_one(card, SNDRV_CTL_EVENT_MASK_REMOVE, kcontrol, idx);
-	snd_ctl_free_one(kcontrol);
-	return 0;
+	return __snd_ctl_remove(card, kcontrol, true);
 }
 EXPORT_SYMBOL(snd_ctl_remove);
 
@@ -642,14 +742,30 @@ int snd_ctl_rename_id(struct snd_card *card, struct snd_ctl_elem_id *src_id,
 		up_write(&card->controls_rwsem);
 		return -ENOENT;
 	}
+	remove_hash_entries(card, kctl);
 	kctl->id = *dst_id;
 	kctl->id.numid = card->last_numid + 1;
 	card->last_numid += kctl->count;
+	add_hash_entries(card, kctl);
 	up_write(&card->controls_rwsem);
 	return 0;
 }
 EXPORT_SYMBOL(snd_ctl_rename_id);
 
+#ifndef CONFIG_SND_CTL_FAST_LOOKUP
+static struct snd_kcontrol *
+snd_ctl_find_numid_slow(struct snd_card *card, unsigned int numid)
+{
+	struct snd_kcontrol *kctl;
+
+	list_for_each_entry(kctl, &card->controls, list) {
+		if (kctl->id.numid <= numid && kctl->id.numid + kctl->count > numid)
+			return kctl;
+	}
+	return NULL;
+}
+#endif /* !CONFIG_SND_CTL_FAST_LOOKUP */
+
 /**
  * snd_ctl_find_numid - find the control instance with the given number-id
  * @card: the card instance
@@ -665,15 +781,13 @@ EXPORT_SYMBOL(snd_ctl_rename_id);
  */
 struct snd_kcontrol *snd_ctl_find_numid(struct snd_card *card, unsigned int numid)
 {
-	struct snd_kcontrol *kctl;
-
 	if (snd_BUG_ON(!card || !numid))
 		return NULL;
-	list_for_each_entry(kctl, &card->controls, list) {
-		if (kctl->id.numid <= numid && kctl->id.numid + kctl->count > numid)
-			return kctl;
-	}
-	return NULL;
+#ifdef CONFIG_SND_CTL_FAST_LOOKUP
+	return xa_load(&card->ctl_numids, numid);
+#else
+	return snd_ctl_find_numid_slow(card, numid);
+#endif
 }
 EXPORT_SYMBOL(snd_ctl_find_numid);
 
@@ -699,21 +813,18 @@ struct snd_kcontrol *snd_ctl_find_id(struct snd_card *card,
 		return NULL;
 	if (id->numid != 0)
 		return snd_ctl_find_numid(card, id->numid);
-	list_for_each_entry(kctl, &card->controls, list) {
-		if (kctl->id.iface != id->iface)
-			continue;
-		if (kctl->id.device != id->device)
-			continue;
-		if (kctl->id.subdevice != id->subdevice)
-			continue;
-		if (strncmp(kctl->id.name, id->name, sizeof(kctl->id.name)))
-			continue;
-		if (kctl->id.index > id->index)
-			continue;
-		if (kctl->id.index + kctl->count <= id->index)
-			continue;
+#ifdef CONFIG_SND_CTL_FAST_LOOKUP
+	kctl = xa_load(&card->ctl_hash, get_ctl_id_hash(id));
+	if (kctl && elem_id_matches(kctl, id))
 		return kctl;
-	}
+	if (!card->ctl_hash_collision)
+		return NULL; /* we can rely on only hash table */
+#endif
+	/* no matching in hash table - try all as the last resort */
+	list_for_each_entry(kctl, &card->controls, list)
+		if (elem_id_matches(kctl, id))
+			return kctl;
+
 	return NULL;
 }
 EXPORT_SYMBOL(snd_ctl_find_id);
@@ -2195,8 +2306,13 @@ static int snd_ctl_dev_free(struct snd_device *device)
 	down_write(&card->controls_rwsem);
 	while (!list_empty(&card->controls)) {
 		control = snd_kcontrol(card->controls.next);
-		snd_ctl_remove(card, control);
+		__snd_ctl_remove(card, control, false);
 	}
+
+#ifdef CONFIG_SND_CTL_FAST_LOOKUP
+	xa_destroy(&card->ctl_numids);
+	xa_destroy(&card->ctl_hash);
+#endif
 	up_write(&card->controls_rwsem);
 	put_device(&card->ctl_dev);
 	return 0;
diff --git a/sound/core/init.c b/sound/core/init.c
index 726a8353201f..1870aee7b64f 100644
--- a/sound/core/init.c
+++ b/sound/core/init.c
@@ -310,6 +310,10 @@ static int snd_card_init(struct snd_card *card, struct device *parent,
 	rwlock_init(&card->ctl_files_rwlock);
 	INIT_LIST_HEAD(&card->controls);
 	INIT_LIST_HEAD(&card->ctl_files);
+#ifdef CONFIG_SND_CTL_FAST_LOOKUP
+	xa_init(&card->ctl_numids);
+	xa_init(&card->ctl_hash);
+#endif
 	spin_lock_init(&card->files_lock);
 	INIT_LIST_HEAD(&card->files_list);
 	mutex_init(&card->memory_mutex);
-- 
cgit 


From 1b7ec5143c34f167266fa21245d99bacb4db4aa6 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Thu, 9 Jun 2022 14:02:17 +0200
Subject: ALSA: control: Rename CONFIG_SND_CTL_VALIDATION to
 CONFIG_SND_CTL_DEBUG

The purpose of CONFIG_SND_CTL_VALIDATION is rather to enable the
debugging feature for the control API.  The validation is only a part
of it.  Let's rename it to be more explicit and intuitive.

While we're at it, let's advertise, give more comment to recommend
this feature for development in the kconfig help text.

Link: https://lore.kernel.org/r/20220609120219.3937-3-tiwai@suse.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/control.h |  2 +-
 sound/core/Kconfig      | 17 +++++++++++------
 sound/core/control.c    |  4 ++--
 3 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/sound/control.h b/include/sound/control.h
index 985c51a8fb74..fcd3cce673ec 100644
--- a/include/sound/control.h
+++ b/include/sound/control.h
@@ -23,7 +23,7 @@ typedef int (snd_kcontrol_tlv_rw_t)(struct snd_kcontrol *kcontrol,
 				    unsigned int __user *tlv);
 
 /* internal flag for skipping validations */
-#ifdef CONFIG_SND_CTL_VALIDATION
+#ifdef CONFIG_SND_CTL_DEBUG
 #define SNDRV_CTL_ELEM_ACCESS_SKIP_CHECK	(1 << 24)
 #define snd_ctl_skip_validation(info) \
 	((info)->access & SNDRV_CTL_ELEM_ACCESS_SKIP_CHECK)
diff --git a/sound/core/Kconfig b/sound/core/Kconfig
index 25b2434e4556..5289bb29131b 100644
--- a/sound/core/Kconfig
+++ b/sound/core/Kconfig
@@ -188,14 +188,19 @@ config SND_PCM_XRUN_DEBUG
 	  sound clicking when system is loaded, it may help to determine
 	  the process or driver which causes the scheduling gaps.
 
-config SND_CTL_VALIDATION
-	bool "Perform sanity-checks for each control element access"
+config SND_CTL_DEBUG
+	bool "Enable debugging feature for control API"
 	depends on SND_DEBUG
 	help
-	  Say Y to enable the additional validation of each control element
-	  access, including sanity-checks like whether the values returned
-	  from the driver are in the proper ranges or the check of the invalid
-	  access at out-of-array areas.
+	  Say Y to enable the debugging feature for ALSA control API.
+	  It performs the additional sanity-checks for each control element
+	  read access, such as whether the values returned from the driver
+	  are in the proper ranges or the check of the invalid access at
+	  out-of-array areas.  The error is printed when the driver gives
+	  such unexpected values.
+	  When you develop a driver that deals with control elements, it's
+	  strongly recommended to try this one once and verify whether you see
+	  any relevant errors or not.
 
 config SND_JACK_INJECTION_DEBUG
 	bool "Sound jack injection interface via debugfs"
diff --git a/sound/core/control.c b/sound/core/control.c
index 6a8fd9933f06..1401522ce552 100644
--- a/sound/core/control.c
+++ b/sound/core/control.c
@@ -966,7 +966,7 @@ static const unsigned int value_sizes[] = {
 	[SNDRV_CTL_ELEM_TYPE_INTEGER64] = sizeof(long long),
 };
 
-#ifdef CONFIG_SND_CTL_VALIDATION
+#ifdef CONFIG_SND_CTL_DEBUG
 /* fill the remaining snd_ctl_elem_value data with the given pattern */
 static void fill_remaining_elem_value(struct snd_ctl_elem_value *control,
 				      struct snd_ctl_elem_info *info,
@@ -1188,7 +1188,7 @@ static int snd_ctl_elem_read(struct snd_card *card,
 
 	snd_ctl_build_ioff(&control->id, kctl, index_offset);
 
-#ifdef CONFIG_SND_CTL_VALIDATION
+#ifdef CONFIG_SND_CTL_DEBUG
 	/* info is needed only for validation */
 	memset(&info, 0, sizeof(info));
 	info.id = control->id;
-- 
cgit 


From 064520e8aeaa2569f6504a50a37ac801b73656bc Mon Sep 17 00:00:00 2001
From: Bard Liao <yung-chuan.liao@linux.intel.com>
Date: Wed, 15 Jun 2022 16:43:48 +0800
Subject: ASoC: SOF: Intel: Add support for MeteorLake (MTL)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add platform abstraction for the Meteor Lake platform.

This platform has significant differences compared to the TGL/ADL
generation: it relies on new hardware using the code name 'ACE' and
only supports the INTEL_IPC4 protocol and firmware architecture based
on the Zephyr RTOS

Co-developed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Signed-off-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Link: https://lore.kernel.org/r/20220615084348.3489-3-yung-chuan.liao@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/soundwire/sdw_intel.h |   2 +
 sound/soc/sof/intel/Kconfig         |  16 +
 sound/soc/sof/intel/Makefile        |   4 +-
 sound/soc/sof/intel/hda.h           |   3 +
 sound/soc/sof/intel/mtl.c           | 800 ++++++++++++++++++++++++++++++++++++
 sound/soc/sof/intel/mtl.h           |  76 ++++
 sound/soc/sof/intel/pci-mtl.c       |  71 ++++
 sound/soc/sof/intel/shim.h          |   1 +
 8 files changed, 972 insertions(+), 1 deletion(-)
 create mode 100644 sound/soc/sof/intel/mtl.c
 create mode 100644 sound/soc/sof/intel/mtl.h
 create mode 100644 sound/soc/sof/intel/pci-mtl.c

(limited to 'include')

diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h
index 67e0d3e750b5..b5b489ea1aef 100644
--- a/include/linux/soundwire/sdw_intel.h
+++ b/include/linux/soundwire/sdw_intel.h
@@ -9,6 +9,8 @@
 
 #define SDW_SHIM_BASE			0x2C000
 #define SDW_ALH_BASE			0x2C800
+#define SDW_SHIM_BASE_ACE		0x38000
+#define SDW_ALH_BASE_ACE		0x24000
 #define SDW_LINK_BASE			0x30000
 #define SDW_LINK_SIZE			0x10000
 
diff --git a/sound/soc/sof/intel/Kconfig b/sound/soc/sof/intel/Kconfig
index 80cdc3788bbe..3f54678e810b 100644
--- a/sound/soc/sof/intel/Kconfig
+++ b/sound/soc/sof/intel/Kconfig
@@ -221,6 +221,22 @@ config SND_SOC_SOF_ALDERLAKE
 	  Say Y if you have such a device.
 	  If unsure select "N".
 
+config SND_SOC_SOF_INTEL_MTL
+	tristate
+	select SND_SOC_SOF_HDA_COMMON
+	select SND_SOC_SOF_INTEL_SOUNDWIRE_LINK_BASELINE
+	select SND_SOC_SOF_INTEL_IPC4
+
+config SND_SOC_SOF_METEORLAKE
+	tristate "SOF support for Meteorlake"
+	default SND_SOC_SOF_PCI
+	select SND_SOC_SOF_INTEL_MTL
+	help
+	  This adds support for Sound Open Firmware for Intel(R) platforms
+	  using the Meteorlake processors.
+	  Say Y if you have such a device.
+	  If unsure select "N".
+
 config SND_SOC_SOF_HDA_COMMON
 	tristate
 	select SND_SOC_SOF_INTEL_COMMON
diff --git a/sound/soc/sof/intel/Makefile b/sound/soc/sof/intel/Makefile
index b9d51dc39ffa..a079159bb2f0 100644
--- a/sound/soc/sof/intel/Makefile
+++ b/sound/soc/sof/intel/Makefile
@@ -6,7 +6,7 @@ snd-sof-acpi-intel-bdw-objs := bdw.o
 snd-sof-intel-hda-common-objs := hda.o hda-loader.o hda-stream.o hda-trace.o \
 				 hda-dsp.o hda-ipc.o hda-ctrl.o hda-pcm.o \
 				 hda-dai.o hda-bus.o \
-				 apl.o cnl.o tgl.o icl.o hda-common-ops.o
+				 apl.o cnl.o tgl.o icl.o mtl.o hda-common-ops.o
 snd-sof-intel-hda-common-$(CONFIG_SND_SOC_SOF_HDA_PROBES) += hda-probes.o
 
 snd-sof-intel-hda-objs := hda-codec.o
@@ -24,9 +24,11 @@ snd-sof-pci-intel-apl-objs := pci-apl.o
 snd-sof-pci-intel-cnl-objs := pci-cnl.o
 snd-sof-pci-intel-icl-objs := pci-icl.o
 snd-sof-pci-intel-tgl-objs := pci-tgl.o
+snd-sof-pci-intel-mtl-objs := pci-mtl.o
 
 obj-$(CONFIG_SND_SOC_SOF_MERRIFIELD) += snd-sof-pci-intel-tng.o
 obj-$(CONFIG_SND_SOC_SOF_INTEL_APL) += snd-sof-pci-intel-apl.o
 obj-$(CONFIG_SND_SOC_SOF_INTEL_CNL) += snd-sof-pci-intel-cnl.o
 obj-$(CONFIG_SND_SOC_SOF_INTEL_ICL) += snd-sof-pci-intel-icl.o
 obj-$(CONFIG_SND_SOC_SOF_INTEL_TGL) += snd-sof-pci-intel-tgl.o
+obj-$(CONFIG_SND_SOC_SOF_INTEL_MTL) += snd-sof-pci-intel-mtl.o
diff --git a/sound/soc/sof/intel/hda.h b/sound/soc/sof/intel/hda.h
index 8b7f3c07d478..a3118499e34f 100644
--- a/sound/soc/sof/intel/hda.h
+++ b/sound/soc/sof/intel/hda.h
@@ -714,6 +714,8 @@ extern struct snd_sof_dsp_ops sof_tgl_ops;
 int sof_tgl_ops_init(struct snd_sof_dev *sdev);
 extern struct snd_sof_dsp_ops sof_icl_ops;
 int sof_icl_ops_init(struct snd_sof_dev *sdev);
+extern struct snd_sof_dsp_ops sof_mtl_ops;
+int sof_mtl_ops_init(struct snd_sof_dev *sdev);
 
 extern const struct sof_intel_dsp_desc apl_chip_info;
 extern const struct sof_intel_dsp_desc cnl_chip_info;
@@ -723,6 +725,7 @@ extern const struct sof_intel_dsp_desc tglh_chip_info;
 extern const struct sof_intel_dsp_desc ehl_chip_info;
 extern const struct sof_intel_dsp_desc jsl_chip_info;
 extern const struct sof_intel_dsp_desc adls_chip_info;
+extern const struct sof_intel_dsp_desc mtl_chip_info;
 
 /* Probes support */
 #if IS_ENABLED(CONFIG_SND_SOC_SOF_HDA_PROBES)
diff --git a/sound/soc/sof/intel/mtl.c b/sound/soc/sof/intel/mtl.c
new file mode 100644
index 000000000000..37be77beb415
--- /dev/null
+++ b/sound/soc/sof/intel/mtl.c
@@ -0,0 +1,800 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause)
+//
+// Copyright(c) 2022 Intel Corporation. All rights reserved.
+//
+// Authors: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
+//
+
+/*
+ * Hardware interface for audio DSP on Meteorlake.
+ */
+
+#include <linux/firmware.h>
+#include <sound/sof/ipc4/header.h>
+#include "../ipc4-priv.h"
+#include "../ops.h"
+#include "hda.h"
+#include "hda-ipc.h"
+#include "../sof-audio.h"
+#include "mtl.h"
+
+static const struct snd_sof_debugfs_map mtl_dsp_debugfs[] = {
+	{"hda", HDA_DSP_HDA_BAR, 0, 0x4000, SOF_DEBUGFS_ACCESS_ALWAYS},
+	{"pp", HDA_DSP_PP_BAR,  0, 0x1000, SOF_DEBUGFS_ACCESS_ALWAYS},
+	{"dsp", HDA_DSP_BAR,  0, 0x10000, SOF_DEBUGFS_ACCESS_ALWAYS},
+};
+
+static void mtl_ipc_host_done(struct snd_sof_dev *sdev)
+{
+	/*
+	 * clear busy interrupt to tell dsp controller this interrupt has been accepted,
+	 * not trigger it again
+	 */
+	snd_sof_dsp_update_bits_forced(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXTDR,
+				       MTL_DSP_REG_HFIPCXTDR_BUSY, MTL_DSP_REG_HFIPCXTDR_BUSY);
+	/*
+	 * clear busy bit to ack dsp the msg has been processed and send reply msg to dsp
+	 */
+	snd_sof_dsp_update_bits_forced(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXTDA,
+				       MTL_DSP_REG_HFIPCXTDA_BUSY, 0);
+}
+
+static void mtl_ipc_dsp_done(struct snd_sof_dev *sdev)
+{
+	/*
+	 * set DONE bit - tell DSP we have received the reply msg from DSP, and processed it,
+	 * don't send more reply to host
+	 */
+	snd_sof_dsp_update_bits_forced(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXIDA,
+				       MTL_DSP_REG_HFIPCXIDA_DONE, MTL_DSP_REG_HFIPCXIDA_DONE);
+
+	/* unmask Done interrupt */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXCTL,
+				MTL_DSP_REG_HFIPCXCTL_DONE, MTL_DSP_REG_HFIPCXCTL_DONE);
+}
+
+/* Check if an IPC IRQ occurred */
+static bool mtl_dsp_check_ipc_irq(struct snd_sof_dev *sdev)
+{
+	u32 irq_status;
+	u32 hfintipptr;
+
+	/* read Interrupt IP Pointer */
+	hfintipptr = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_HFINTIPPTR) & MTL_HFINTIPPTR_PTR_MASK;
+	irq_status = snd_sof_dsp_read(sdev, HDA_DSP_BAR, hfintipptr + MTL_DSP_IRQSTS);
+
+	dev_vdbg(sdev->dev, "irq handler: irq_status:0x%x\n", irq_status);
+
+	if (irq_status != U32_MAX && (irq_status & MTL_DSP_IRQSTS_IPC))
+		return true;
+
+	return false;
+}
+
+/* Check if an SDW IRQ occurred */
+static bool mtl_dsp_check_sdw_irq(struct snd_sof_dev *sdev)
+{
+	u32 irq_status;
+	u32 hfintipptr;
+
+	/* read Interrupt IP Pointer */
+	hfintipptr = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_HFINTIPPTR) & MTL_HFINTIPPTR_PTR_MASK;
+	irq_status = snd_sof_dsp_read(sdev, HDA_DSP_BAR, hfintipptr + MTL_DSP_IRQSTS);
+
+	if (irq_status != U32_MAX && (irq_status & MTL_DSP_IRQSTS_SDW))
+		return true;
+
+	return false;
+}
+
+static int mtl_ipc_send_msg(struct snd_sof_dev *sdev, struct snd_sof_ipc_msg *msg)
+{
+	struct sof_ipc4_msg *msg_data = msg->msg_data;
+
+	/* send the message via mailbox */
+	if (msg_data->data_size)
+		sof_mailbox_write(sdev, sdev->host_box.offset, msg_data->data_ptr,
+				  msg_data->data_size);
+
+	snd_sof_dsp_write(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXIDDY,
+			  msg_data->extension);
+	snd_sof_dsp_write(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXIDR,
+			  msg_data->primary | MTL_DSP_REG_HFIPCXIDR_BUSY);
+
+	return 0;
+}
+
+static void mtl_enable_ipc_interrupts(struct snd_sof_dev *sdev)
+{
+	struct sof_intel_hda_dev *hda = sdev->pdata->hw_pdata;
+	const struct sof_intel_dsp_desc *chip = hda->desc;
+
+	/* enable IPC DONE and BUSY interrupts */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, chip->ipc_ctl,
+				MTL_DSP_REG_HFIPCXCTL_BUSY | MTL_DSP_REG_HFIPCXCTL_DONE,
+				MTL_DSP_REG_HFIPCXCTL_BUSY | MTL_DSP_REG_HFIPCXCTL_DONE);
+}
+
+static void mtl_disable_ipc_interrupts(struct snd_sof_dev *sdev)
+{
+	struct sof_intel_hda_dev *hda = sdev->pdata->hw_pdata;
+	const struct sof_intel_dsp_desc *chip = hda->desc;
+
+	/* disable IPC DONE and BUSY interrupts */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, chip->ipc_ctl,
+				MTL_DSP_REG_HFIPCXCTL_BUSY | MTL_DSP_REG_HFIPCXCTL_DONE, 0);
+}
+
+static int mtl_enable_interrupts(struct snd_sof_dev *sdev)
+{
+	u32 hfintipptr;
+	u32 irqinten;
+	u32 host_ipc;
+	u32 hipcie;
+	int ret;
+
+	/* read Interrupt IP Pointer */
+	hfintipptr = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_HFINTIPPTR) & MTL_HFINTIPPTR_PTR_MASK;
+
+	/* Enable Host IPC and SOUNDWIRE */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, hfintipptr,
+				MTL_IRQ_INTEN_L_HOST_IPC_MASK | MTL_IRQ_INTEN_L_SOUNDWIRE_MASK,
+				MTL_IRQ_INTEN_L_HOST_IPC_MASK | MTL_IRQ_INTEN_L_SOUNDWIRE_MASK);
+
+	/* check if operation was successful */
+	host_ipc = MTL_IRQ_INTEN_L_HOST_IPC_MASK | MTL_IRQ_INTEN_L_SOUNDWIRE_MASK;
+	irqinten = snd_sof_dsp_read(sdev, HDA_DSP_BAR, hfintipptr);
+	ret = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, hfintipptr, irqinten,
+					    (irqinten & host_ipc) == host_ipc,
+					    HDA_DSP_REG_POLL_INTERVAL_US, HDA_DSP_RESET_TIMEOUT_US);
+	if (ret < 0) {
+		dev_err(sdev->dev, "failed to enable Host IPC and/or SOUNDWIRE\n");
+		return ret;
+	}
+
+	/* Set Host IPC interrupt enable */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfHIPCIE,
+				MTL_DSP_REG_HfHIPCIE_IE_MASK, MTL_DSP_REG_HfHIPCIE_IE_MASK);
+
+	/* check if operation was successful */
+	host_ipc = MTL_DSP_REG_HfHIPCIE_IE_MASK;
+	hipcie = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfHIPCIE);
+	ret = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfHIPCIE, hipcie,
+					    (hipcie & host_ipc) == host_ipc,
+					    HDA_DSP_REG_POLL_INTERVAL_US, HDA_DSP_RESET_TIMEOUT_US);
+	if (ret < 0) {
+		dev_err(sdev->dev, "failed to set Host IPC interrupt enable\n");
+		return ret;
+	}
+
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfSNDWIE,
+				MTL_DSP_REG_HfSNDWIE_IE_MASK, MTL_DSP_REG_HfSNDWIE_IE_MASK);
+	host_ipc = MTL_DSP_REG_HfSNDWIE_IE_MASK;
+	hipcie = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfSNDWIE);
+	ret = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfSNDWIE, hipcie,
+					    (hipcie & host_ipc) == host_ipc,
+					    HDA_DSP_REG_POLL_INTERVAL_US, HDA_DSP_RESET_TIMEOUT_US);
+	if (ret < 0)
+		dev_err(sdev->dev, "failed to set SoundWire IPC interrupt enable\n");
+
+	return ret;
+}
+
+static int mtl_disable_interrupts(struct snd_sof_dev *sdev)
+{
+	u32 hfintipptr;
+	u32 irqinten;
+	u32 host_ipc;
+	u32 hipcie;
+	int ret1;
+	int ret;
+
+	/* read Interrupt IP Pointer */
+	hfintipptr = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_HFINTIPPTR) & MTL_HFINTIPPTR_PTR_MASK;
+
+	/* Disable Host IPC and SOUNDWIRE */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, hfintipptr,
+				MTL_IRQ_INTEN_L_HOST_IPC_MASK | MTL_IRQ_INTEN_L_SOUNDWIRE_MASK, 0);
+
+	/* check if operation was successful */
+	host_ipc = MTL_IRQ_INTEN_L_HOST_IPC_MASK | MTL_IRQ_INTEN_L_SOUNDWIRE_MASK;
+	irqinten = snd_sof_dsp_read(sdev, HDA_DSP_BAR, hfintipptr);
+	ret = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, hfintipptr, irqinten,
+					    (irqinten & host_ipc) == 0,
+					    HDA_DSP_REG_POLL_INTERVAL_US, HDA_DSP_RESET_TIMEOUT_US);
+	/* Continue to disable other interrupts when error happens */
+	if (ret < 0)
+		dev_err(sdev->dev, "failed to disable Host IPC and SoundWire\n");
+
+	/* Set Host IPC interrupt disable */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfHIPCIE,
+				MTL_DSP_REG_HfHIPCIE_IE_MASK, 0);
+
+	/* check if operation was successful */
+	host_ipc = MTL_DSP_REG_HfHIPCIE_IE_MASK;
+	hipcie = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfHIPCIE);
+	ret1 = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfHIPCIE, hipcie,
+					     (hipcie & host_ipc) == 0,
+					     HDA_DSP_REG_POLL_INTERVAL_US,
+					     HDA_DSP_RESET_TIMEOUT_US);
+	if (ret1 < 0) {
+		dev_err(sdev->dev, "failed to set Host IPC interrupt disable\n");
+		if (!ret)
+			ret = ret1;
+	}
+
+	/* Set SoundWire IPC interrupt disable */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfSNDWIE,
+				MTL_DSP_REG_HfSNDWIE_IE_MASK, 0);
+	host_ipc = MTL_DSP_REG_HfSNDWIE_IE_MASK;
+	hipcie = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfSNDWIE);
+	ret1 = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfSNDWIE, hipcie,
+					     (hipcie & host_ipc) == 0,
+					     HDA_DSP_REG_POLL_INTERVAL_US,
+					     HDA_DSP_RESET_TIMEOUT_US);
+	if (ret1 < 0) {
+		dev_err(sdev->dev, "failed to set SoundWire IPC interrupt disable\n");
+		if (!ret)
+			ret = ret1;
+	}
+
+	return ret;
+}
+
+/* pre fw run operations */
+static int mtl_dsp_pre_fw_run(struct snd_sof_dev *sdev)
+{
+	u32 dsphfpwrsts;
+	u32 dsphfdsscs;
+	u32 cpa;
+	u32 pgs;
+	int ret;
+
+	/* Set the DSP subsystem power on */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_HFDSSCS,
+				MTL_HFDSSCS_SPA_MASK, MTL_HFDSSCS_SPA_MASK);
+
+	/* Wait for unstable CPA read (1 then 0 then 1) just after setting SPA bit */
+	usleep_range(1000, 1010);
+
+	/* poll with timeout to check if operation successful */
+	cpa = MTL_HFDSSCS_CPA_MASK;
+	dsphfdsscs = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_HFDSSCS);
+	ret = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, MTL_HFDSSCS, dsphfdsscs,
+					    (dsphfdsscs & cpa) == cpa, HDA_DSP_REG_POLL_INTERVAL_US,
+					    HDA_DSP_RESET_TIMEOUT_US);
+	if (ret < 0) {
+		dev_err(sdev->dev, "failed to enable DSP subsystem\n");
+		return ret;
+	}
+
+	/* Power up gated-DSP-0 domain in order to access the DSP shim register block. */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_HFPWRCTL,
+				MTL_HFPWRCTL_WPDSPHPXPG, MTL_HFPWRCTL_WPDSPHPXPG);
+
+	usleep_range(1000, 1010);
+
+	/* poll with timeout to check if operation successful */
+	pgs = MTL_HFPWRSTS_DSPHPXPGS_MASK;
+	dsphfpwrsts = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_HFPWRSTS);
+	ret = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, MTL_HFPWRSTS, dsphfpwrsts,
+					    (dsphfpwrsts & pgs) == pgs,
+					    HDA_DSP_REG_POLL_INTERVAL_US,
+					    HDA_DSP_RESET_TIMEOUT_US);
+	if (ret < 0)
+		dev_err(sdev->dev, "failed to power up gated DSP domain\n");
+
+	/* make sure SoundWire is not power-gated */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_HDA_BAR, MTL_HFPWRCTL,
+				MTL_HfPWRCTL_WPIOXPG(1), MTL_HfPWRCTL_WPIOXPG(1));
+	return ret;
+}
+
+static int mtl_dsp_post_fw_run(struct snd_sof_dev *sdev)
+{
+	int ret;
+
+	if (sdev->first_boot) {
+		struct sof_intel_hda_dev *hdev = sdev->pdata->hw_pdata;
+
+		ret = hda_sdw_startup(sdev);
+		if (ret < 0) {
+			dev_err(sdev->dev, "could not startup SoundWire links\n");
+			return ret;
+		}
+
+		/* Check if IMR boot is usable */
+		if (!sof_debug_check_flag(SOF_DBG_IGNORE_D3_PERSISTENT))
+			hdev->imrboot_supported = true;
+	}
+
+	hda_sdw_int_enable(sdev, true);
+	return 0;
+}
+
+static void mtl_dsp_dump(struct snd_sof_dev *sdev, u32 flags)
+{
+	char *level = (flags & SOF_DBG_DUMP_OPTIONAL) ? KERN_DEBUG : KERN_ERR;
+	u32 romdbgsts;
+	u32 romdbgerr;
+	u32 fwsts;
+	u32 fwlec;
+
+	fwsts = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_ROM_STS);
+	fwlec = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_ROM_ERROR);
+	romdbgsts = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFFLGPXQWY);
+	romdbgerr = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFFLGPXQWY_ERROR);
+
+	dev_err(sdev->dev, "ROM status: %#x, ROM error: %#x\n", fwsts, fwlec);
+	dev_err(sdev->dev, "ROM debug status: %#x, ROM debug error: %#x\n", romdbgsts,
+		romdbgerr);
+	romdbgsts = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFFLGPXQWY + 0x8 * 3);
+	dev_printk(level, sdev->dev, "ROM feature bit%s enabled\n",
+		   romdbgsts & BIT(24) ? "" : " not");
+}
+
+static bool mtl_dsp_primary_core_is_enabled(struct snd_sof_dev *sdev)
+{
+	int val;
+
+	val = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP2CXCTL_PRIMARY_CORE);
+	if (val != U32_MAX && val & MTL_DSP2CXCTL_PRIMARY_CORE_CPA_MASK)
+		return true;
+
+	return false;
+}
+
+static int mtl_dsp_core_power_up(struct snd_sof_dev *sdev, int core)
+{
+	unsigned int cpa;
+	u32 dspcxctl;
+	int ret;
+
+	/* Only the primary core can be powered up by the host */
+	if (core != SOF_DSP_PRIMARY_CORE || mtl_dsp_primary_core_is_enabled(sdev))
+		return 0;
+
+	/* Program the owner of the IP & shim registers (10: Host CPU) */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_DSP2CXCTL_PRIMARY_CORE,
+				MTL_DSP2CXCTL_PRIMARY_CORE_OSEL,
+				0x2 << MTL_DSP2CXCTL_PRIMARY_CORE_OSEL_SHIFT);
+
+	/* enable SPA bit */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_DSP2CXCTL_PRIMARY_CORE,
+				MTL_DSP2CXCTL_PRIMARY_CORE_SPA_MASK,
+				MTL_DSP2CXCTL_PRIMARY_CORE_SPA_MASK);
+
+	/* Wait for unstable CPA read (1 then 0 then 1) just after setting SPA bit */
+	usleep_range(1000, 1010);
+
+	/* poll with timeout to check if operation successful */
+	cpa = MTL_DSP2CXCTL_PRIMARY_CORE_CPA_MASK;
+	ret = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, MTL_DSP2CXCTL_PRIMARY_CORE, dspcxctl,
+					    (dspcxctl & cpa) == cpa, HDA_DSP_REG_POLL_INTERVAL_US,
+					    HDA_DSP_RESET_TIMEOUT_US);
+	if (ret < 0) {
+		dev_err(sdev->dev, "%s: timeout on MTL_DSP2CXCTL_PRIMARY_CORE read\n",
+			__func__);
+		return ret;
+	}
+
+	/* did core power up ? */
+	dspcxctl = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP2CXCTL_PRIMARY_CORE);
+	if ((dspcxctl & MTL_DSP2CXCTL_PRIMARY_CORE_CPA_MASK)
+		!= MTL_DSP2CXCTL_PRIMARY_CORE_CPA_MASK) {
+		dev_err(sdev->dev, "power up core failed core %d adspcs %#x\n",
+			core, dspcxctl);
+		ret = -EIO;
+	}
+
+	return ret;
+}
+
+static int mtl_dsp_core_power_down(struct snd_sof_dev *sdev, int core)
+{
+	u32 dspcxctl;
+	int ret;
+
+	/* Only the primary core can be powered down by the host */
+	if (core != SOF_DSP_PRIMARY_CORE || !mtl_dsp_primary_core_is_enabled(sdev))
+		return 0;
+
+	/* disable SPA bit */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_DSP2CXCTL_PRIMARY_CORE,
+				MTL_DSP2CXCTL_PRIMARY_CORE_SPA_MASK, 0);
+
+	/* Wait for unstable CPA read (1 then 0 then 1) just after setting SPA bit */
+	usleep_range(1000, 1010);
+
+	ret = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, MTL_DSP2CXCTL_PRIMARY_CORE, dspcxctl,
+					    !(dspcxctl & MTL_DSP2CXCTL_PRIMARY_CORE_CPA_MASK),
+					    HDA_DSP_REG_POLL_INTERVAL_US,
+					    HDA_DSP_PD_TIMEOUT * USEC_PER_MSEC);
+	if (ret < 0)
+		dev_err(sdev->dev, "failed to power down primary core\n");
+
+	return ret;
+}
+
+static int mtl_dsp_cl_init(struct snd_sof_dev *sdev, int stream_tag, bool imr_boot)
+{
+	struct sof_intel_hda_dev *hda = sdev->pdata->hw_pdata;
+	const struct sof_intel_dsp_desc *chip = hda->desc;
+	unsigned int status;
+	u32 ipc_hdr;
+	int ret;
+
+	/* step 1: purge FW request */
+	ipc_hdr = chip->ipc_req_mask | HDA_DSP_ROM_IPC_CONTROL;
+	if (!imr_boot)
+		ipc_hdr |= HDA_DSP_ROM_IPC_PURGE_FW | ((stream_tag - 1) << 9);
+
+	snd_sof_dsp_write(sdev, HDA_DSP_BAR, chip->ipc_req, ipc_hdr);
+
+	/* step 2: power up primary core */
+	ret = mtl_dsp_core_power_up(sdev, SOF_DSP_PRIMARY_CORE);
+	if (ret < 0) {
+		if (hda->boot_iteration == HDA_FW_BOOT_ATTEMPTS)
+			dev_err(sdev->dev, "dsp core 0/1 power up failed\n");
+		goto err;
+	}
+
+	dev_dbg(sdev->dev, "Primary core power up successful\n");
+
+	/* step 3: wait for IPC DONE bit from ROM */
+	ret = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, chip->ipc_ack, status,
+					    ((status & chip->ipc_ack_mask) == chip->ipc_ack_mask),
+					    HDA_DSP_REG_POLL_INTERVAL_US, MTL_DSP_PURGE_TIMEOUT_US);
+	if (ret < 0) {
+		if (hda->boot_iteration == HDA_FW_BOOT_ATTEMPTS)
+			dev_err(sdev->dev, "timeout waiting for purge IPC done\n");
+		goto err;
+	}
+
+	/* set DONE bit to clear the reply IPC message */
+	snd_sof_dsp_update_bits_forced(sdev, HDA_DSP_BAR, chip->ipc_ack, chip->ipc_ack_mask,
+				       chip->ipc_ack_mask);
+
+	/* step 4: enable interrupts */
+	ret = mtl_enable_interrupts(sdev);
+	if (ret < 0) {
+		if (hda->boot_iteration == HDA_FW_BOOT_ATTEMPTS)
+			dev_err(sdev->dev, "%s: failed to enable interrupts\n", __func__);
+		goto err;
+	}
+
+	mtl_enable_ipc_interrupts(sdev);
+
+	/*
+	 * ACE workaround: don't wait for ROM INIT.
+	 * The platform cannot catch ROM_INIT_DONE because of a very short
+	 * timing window. Follow the recommendations and skip this part.
+	 */
+
+	return 0;
+
+err:
+	snd_sof_dsp_dbg_dump(sdev, "MTL DSP init fail", 0);
+	mtl_dsp_core_power_down(sdev, SOF_DSP_PRIMARY_CORE);
+	return ret;
+}
+
+static irqreturn_t mtl_ipc_irq_thread(int irq, void *context)
+{
+	struct sof_ipc4_msg notification_data = {{ 0 }};
+	struct snd_sof_dev *sdev = context;
+	bool ipc_irq = false;
+	u32 hipcida;
+	u32 hipctdr;
+
+	hipcida = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXIDA);
+
+	/* reply message from DSP */
+	if (hipcida & MTL_DSP_REG_HFIPCXIDA_DONE) {
+		/* DSP received the message */
+		snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXCTL,
+					MTL_DSP_REG_HFIPCXCTL_DONE, 0);
+
+		mtl_ipc_dsp_done(sdev);
+
+		ipc_irq = true;
+	}
+
+	hipctdr = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXTDR);
+	if (hipctdr & MTL_DSP_REG_HFIPCXTDR_BUSY) {
+		/* Message from DSP (reply or notification) */
+		u32 extension = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXTDDY);
+		u32 primary = hipctdr & MTL_DSP_REG_HFIPCXTDR_MSG_MASK;
+
+		/*
+		 * ACE fw sends a new fw ipc message to host to
+		 * notify the status of the last host ipc message
+		 */
+		if (primary & SOF_IPC4_MSG_DIR_MASK) {
+			/* Reply received */
+			struct sof_ipc4_msg *data = sdev->ipc->msg.reply_data;
+
+			data->primary = primary;
+			data->extension = extension;
+
+			spin_lock_irq(&sdev->ipc_lock);
+
+			snd_sof_ipc_get_reply(sdev);
+			snd_sof_ipc_reply(sdev, data->primary);
+
+			spin_unlock_irq(&sdev->ipc_lock);
+		} else {
+			/* Notification received */
+			notification_data.primary = primary;
+			notification_data.extension = extension;
+
+			sdev->ipc->msg.rx_data = &notification_data;
+			snd_sof_ipc_msgs_rx(sdev);
+			sdev->ipc->msg.rx_data = NULL;
+		}
+
+		mtl_ipc_host_done(sdev);
+
+		ipc_irq = true;
+	}
+
+	if (!ipc_irq) {
+		/* This interrupt is not shared so no need to return IRQ_NONE. */
+		dev_dbg_ratelimited(sdev->dev, "%s nothing to do in IPC IRQ thread\n",
+				    __func__);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static int mtl_dsp_ipc_get_mailbox_offset(struct snd_sof_dev *sdev)
+{
+	return MTL_DSP_MBOX_UPLINK_OFFSET;
+}
+
+static int mtl_dsp_ipc_get_window_offset(struct snd_sof_dev *sdev, u32 id)
+{
+	return MTL_SRAM_WINDOW_OFFSET(id);
+}
+
+static int mtl_suspend(struct snd_sof_dev *sdev, bool runtime_suspend)
+{
+	struct sof_intel_hda_dev *hda = sdev->pdata->hw_pdata;
+	const struct sof_intel_dsp_desc *chip = hda->desc;
+#if IS_ENABLED(CONFIG_SND_SOC_SOF_HDA)
+	struct hdac_bus *bus = sof_to_bus(sdev);
+#endif
+	u32 dsphfdsscs;
+	u32 cpa;
+	int ret;
+	int i;
+
+	mtl_disable_ipc_interrupts(sdev);
+	ret = mtl_disable_interrupts(sdev);
+	if (ret)
+		return ret;
+
+#if IS_ENABLED(CONFIG_SND_SOC_SOF_HDA)
+	hda_codec_jack_wake_enable(sdev, runtime_suspend);
+	/* power down all hda link */
+	snd_hdac_ext_bus_link_power_down_all(bus);
+#endif
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_HFPWRCTL,
+				MTL_HFPWRCTL_WPDSPHPXPG, 0);
+
+	/* Set the DSP subsystem power down */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_HFDSSCS,
+				MTL_HFDSSCS_SPA_MASK, 0);
+
+	/* Wait for unstable CPA read (1 then 0 then 1) just after setting SPA bit */
+	usleep_range(1000, 1010);
+
+	/* poll with timeout to check if operation successful */
+	cpa = MTL_HFDSSCS_CPA_MASK;
+	dsphfdsscs = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_HFDSSCS);
+	ret = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, MTL_HFDSSCS, dsphfdsscs,
+					    (dsphfdsscs & cpa) == 0, HDA_DSP_REG_POLL_INTERVAL_US,
+					HDA_DSP_RESET_TIMEOUT_US);
+	if (ret < 0)
+		dev_err(sdev->dev, "failed to disable DSP subsystem\n");
+
+	/* reset ref counts for all cores */
+	for (i = 0; i < chip->cores_num; i++)
+		sdev->dsp_core_ref_count[i] = 0;
+
+	/* TODO: need to reset controller? */
+
+	/* display codec can be powered off after link reset */
+	hda_codec_i915_display_power(sdev, false);
+
+	return 0;
+}
+
+static int mtl_dsp_suspend(struct snd_sof_dev *sdev, u32 target_state)
+{
+	const struct sof_dsp_power_state target_dsp_state = {
+		.state = target_state,
+		.substate = target_state == SOF_DSP_PM_D0 ?
+				SOF_HDA_DSP_PM_D0I3 : 0,
+	};
+	int ret;
+
+	ret = mtl_suspend(sdev, false);
+	if (ret < 0)
+		return ret;
+
+	return snd_sof_dsp_set_power_state(sdev, &target_dsp_state);
+}
+
+static int mtl_dsp_runtime_suspend(struct snd_sof_dev *sdev)
+{
+	const struct sof_dsp_power_state target_state = {
+		.state = SOF_DSP_PM_D3,
+	};
+	int ret;
+
+	ret = mtl_suspend(sdev, true);
+	if (ret < 0)
+		return ret;
+
+	return snd_sof_dsp_set_power_state(sdev, &target_state);
+}
+
+static int mtl_resume(struct snd_sof_dev *sdev, bool runtime_resume)
+{
+#if IS_ENABLED(CONFIG_SND_SOC_SOF_HDA)
+	struct hdac_bus *bus = sof_to_bus(sdev);
+	struct hdac_ext_link *hlink = NULL;
+#endif
+
+	/* display codec must be powered before link reset */
+	hda_codec_i915_display_power(sdev, true);
+
+#if IS_ENABLED(CONFIG_SND_SOC_SOF_HDA)
+	/* check jack status */
+	if (runtime_resume) {
+		hda_codec_jack_wake_enable(sdev, false);
+		if (sdev->system_suspend_target == SOF_SUSPEND_NONE)
+			hda_codec_jack_check(sdev);
+	}
+
+	/* turn off the links that were off before suspend */
+	list_for_each_entry(hlink, &bus->hlink_list, list) {
+		if (!hlink->ref_count)
+			snd_hdac_ext_bus_link_power_down(hlink);
+	}
+
+	/* check dma status and clean up CORB/RIRB buffers */
+	if (!bus->cmd_dma_state)
+		snd_hdac_bus_stop_cmd_io(bus);
+#endif
+
+	return 0;
+}
+
+static int mtl_dsp_resume(struct snd_sof_dev *sdev)
+{
+	const struct sof_dsp_power_state target_state = {
+		.state = SOF_DSP_PM_D0,
+		.substate = SOF_HDA_DSP_PM_D0I0,
+	};
+	int ret;
+
+	ret = mtl_resume(sdev, false);
+	if (ret < 0)
+		return ret;
+
+	return snd_sof_dsp_set_power_state(sdev, &target_state);
+}
+
+static int mtl_dsp_runtime_resume(struct snd_sof_dev *sdev)
+{
+	const struct sof_dsp_power_state target_state = {
+		.state = SOF_DSP_PM_D0,
+	};
+	int ret;
+
+	ret = mtl_resume(sdev, true);
+	if (ret < 0)
+		return ret;
+
+	return snd_sof_dsp_set_power_state(sdev, &target_state);
+}
+
+static void mtl_ipc_dump(struct snd_sof_dev *sdev)
+{
+	u32 hipcctl;
+	u32 hipcida;
+	u32 hipctdr;
+
+	/* read IPC status */
+	hipcida = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXIDA);
+	hipcctl = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXCTL);
+	hipctdr = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXTDR);
+
+	/* dump the IPC regs */
+	/* TODO: parse the raw msg */
+	dev_err(sdev->dev,
+		"error: host status 0x%8.8x dsp status 0x%8.8x mask 0x%8.8x\n",
+		hipcida, hipctdr, hipcctl);
+}
+
+/* Meteorlake ops */
+struct snd_sof_dsp_ops sof_mtl_ops;
+EXPORT_SYMBOL_NS(sof_mtl_ops, SND_SOC_SOF_INTEL_HDA_COMMON);
+
+int sof_mtl_ops_init(struct snd_sof_dev *sdev)
+{
+	struct sof_ipc4_fw_data *ipc4_data;
+
+	/* common defaults */
+	memcpy(&sof_mtl_ops, &sof_hda_common_ops, sizeof(struct snd_sof_dsp_ops));
+
+	/* shutdown */
+	sof_mtl_ops.shutdown = hda_dsp_shutdown;
+
+	/* doorbell */
+	sof_mtl_ops.irq_thread = mtl_ipc_irq_thread;
+
+	/* ipc */
+	sof_mtl_ops.send_msg = mtl_ipc_send_msg;
+	sof_mtl_ops.get_mailbox_offset = mtl_dsp_ipc_get_mailbox_offset;
+	sof_mtl_ops.get_window_offset = mtl_dsp_ipc_get_window_offset;
+
+	/* debug */
+	sof_mtl_ops.debug_map = mtl_dsp_debugfs;
+	sof_mtl_ops.debug_map_count = ARRAY_SIZE(mtl_dsp_debugfs);
+	sof_mtl_ops.dbg_dump = mtl_dsp_dump;
+	sof_mtl_ops.ipc_dump = mtl_ipc_dump;
+
+	/* pre/post fw run */
+	sof_mtl_ops.pre_fw_run = mtl_dsp_pre_fw_run;
+	sof_mtl_ops.post_fw_run = mtl_dsp_post_fw_run;
+
+	/* parse platform specific extended manifest */
+	sof_mtl_ops.parse_platform_ext_manifest = NULL;
+
+	/* dsp core get/put */
+	/* TODO: add core_get and core_put */
+
+	/* PM */
+	sof_mtl_ops.suspend = mtl_dsp_suspend;
+	sof_mtl_ops.resume = mtl_dsp_resume;
+	sof_mtl_ops.runtime_suspend = mtl_dsp_runtime_suspend;
+	sof_mtl_ops.runtime_resume = mtl_dsp_runtime_resume;
+
+	sdev->private = devm_kzalloc(sdev->dev, sizeof(struct sof_ipc4_fw_data), GFP_KERNEL);
+	if (!sdev->private)
+		return -ENOMEM;
+
+	ipc4_data = sdev->private;
+	ipc4_data->manifest_fw_hdr_offset = SOF_MAN4_FW_HDR_OFFSET;
+
+	/* set DAI ops */
+	hda_set_dai_drv_ops(sdev, &sof_mtl_ops);
+
+	return 0;
+};
+EXPORT_SYMBOL_NS(sof_mtl_ops_init, SND_SOC_SOF_INTEL_HDA_COMMON);
+
+const struct sof_intel_dsp_desc mtl_chip_info = {
+	.cores_num = 3,
+	.init_core_mask = BIT(0),
+	.host_managed_cores_mask = BIT(0),
+	.ipc_req = MTL_DSP_REG_HFIPCXIDR,
+	.ipc_req_mask = MTL_DSP_REG_HFIPCXIDR_BUSY,
+	.ipc_ack = MTL_DSP_REG_HFIPCXIDA,
+	.ipc_ack_mask = MTL_DSP_REG_HFIPCXIDA_DONE,
+	.ipc_ctl = MTL_DSP_REG_HFIPCXCTL,
+	.rom_status_reg = MTL_DSP_ROM_STS,
+	.rom_init_timeout	= 300,
+	.ssp_count = ICL_SSP_COUNT,
+	.ssp_base_offset = CNL_SSP_BASE_OFFSET,
+	.sdw_shim_base = SDW_SHIM_BASE_ACE,
+	.sdw_alh_base = SDW_ALH_BASE_ACE,
+	.check_sdw_irq = mtl_dsp_check_sdw_irq,
+	.check_ipc_irq = mtl_dsp_check_ipc_irq,
+	.cl_init = mtl_dsp_cl_init,
+	.hw_ip_version = SOF_INTEL_ACE_1_0,
+};
+EXPORT_SYMBOL_NS(mtl_chip_info, SND_SOC_SOF_INTEL_HDA_COMMON);
diff --git a/sound/soc/sof/intel/mtl.h b/sound/soc/sof/intel/mtl.h
new file mode 100644
index 000000000000..788bf0e3ea87
--- /dev/null
+++ b/sound/soc/sof/intel/mtl.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * Copyright(c) 2020-2022 Intel Corporation. All rights reserved.
+ */
+
+/* DSP Registers */
+#define MTL_HFDSSCS			0x1000
+#define MTL_HFDSSCS_SPA_MASK		BIT(16)
+#define MTL_HFDSSCS_CPA_MASK		BIT(24)
+#define MTL_HFSNDWIE			0x114C
+#define MTL_HFPWRCTL			0x1D18
+#define MTL_HfPWRCTL_WPIOXPG(x)		BIT((x) + 8)
+#define MTL_HFPWRCTL_WPDSPHPXPG		BIT(0)
+#define MTL_HFPWRSTS			0x1D1C
+#define MTL_HFPWRSTS_DSPHPXPGS_MASK	BIT(0)
+#define MTL_HFINTIPPTR			0x1108
+#define MTL_IRQ_INTEN_L_HOST_IPC_MASK	BIT(0)
+#define MTL_IRQ_INTEN_L_SOUNDWIRE_MASK	BIT(6)
+#define MTL_HFINTIPPTR_PTR_MASK		GENMASK(20, 0)
+
+#define MTL_DSP2CXCAP_PRIMARY_CORE	0x178D00
+#define MTL_DSP2CXCTL_PRIMARY_CORE	0x178D04
+#define MTL_DSP2CXCTL_PRIMARY_CORE_SPA_MASK BIT(0)
+#define MTL_DSP2CXCTL_PRIMARY_CORE_CPA_MASK BIT(8)
+#define MTL_DSP2CXCTL_PRIMARY_CORE_OSEL GENMASK(25, 24)
+#define MTL_DSP2CXCTL_PRIMARY_CORE_OSEL_SHIFT 24
+
+/* IPC Registers */
+#define MTL_DSP_REG_HFIPCXTDR		0x73200
+#define MTL_DSP_REG_HFIPCXTDR_BUSY	BIT(31)
+#define MTL_DSP_REG_HFIPCXTDR_MSG_MASK GENMASK(30, 0)
+#define MTL_DSP_REG_HFIPCXTDA		0x73204
+#define MTL_DSP_REG_HFIPCXTDA_BUSY	BIT(31)
+#define MTL_DSP_REG_HFIPCXIDR		0x73210
+#define MTL_DSP_REG_HFIPCXIDR_BUSY	BIT(31)
+#define MTL_DSP_REG_HFIPCXIDR_MSG_MASK GENMASK(30, 0)
+#define MTL_DSP_REG_HFIPCXIDA		0x73214
+#define MTL_DSP_REG_HFIPCXIDA_DONE	BIT(31)
+#define MTL_DSP_REG_HFIPCXIDA_MSG_MASK GENMASK(30, 0)
+#define MTL_DSP_REG_HFIPCXCTL		0x73228
+#define MTL_DSP_REG_HFIPCXCTL_BUSY	BIT(0)
+#define MTL_DSP_REG_HFIPCXCTL_DONE	BIT(1)
+#define MTL_DSP_REG_HFIPCXTDDY		0x73300
+#define MTL_DSP_REG_HFIPCXIDDY		0x73380
+#define MTL_DSP_REG_HfHIPCIE		0x1140
+#define MTL_DSP_REG_HfHIPCIE_IE_MASK	BIT(0)
+#define MTL_DSP_REG_HfSNDWIE		0x114C
+#define MTL_DSP_REG_HfSNDWIE_IE_MASK	GENMASK(3, 0)
+
+#define MTL_DSP_IRQSTS			0x20
+#define MTL_DSP_IRQSTS_IPC		BIT(0)
+#define MTL_DSP_IRQSTS_SDW		BIT(6)
+
+#define MTL_DSP_PURGE_TIMEOUT_US	20000000 /* 20s */
+#define MTL_DSP_REG_POLL_INTERVAL_US	10	/* 10 us */
+
+/* Memory windows */
+#define MTL_SRAM_WINDOW_OFFSET(x)	(0x180000 + 0x8000 * (x))
+
+#define MTL_DSP_MBOX_UPLINK_OFFSET	(MTL_SRAM_WINDOW_OFFSET(0) + 0x1000)
+#define MTL_DSP_MBOX_UPLINK_SIZE	0x1000
+#define MTL_DSP_MBOX_DOWNLINK_OFFSET	MTL_SRAM_WINDOW_OFFSET(1)
+#define MTL_DSP_MBOX_DOWNLINK_SIZE	0x1000
+
+/* FW registers */
+#define MTL_DSP_ROM_STS			MTL_SRAM_WINDOW_OFFSET(0) /* ROM status */
+#define MTL_DSP_ROM_ERROR		(MTL_SRAM_WINDOW_OFFSET(0) + 0x4) /* ROM error code */
+
+#define MTL_DSP_REG_HFFLGPXQWY		0x163200 /* ROM debug status */
+#define MTL_DSP_REG_HFFLGPXQWY_ERROR	0x163204 /* ROM debug error code */
+#define MTL_DSP_REG_HfIMRIS1		0x162088
+#define MTL_DSP_REG_HfIMRIS1_IU_MASK	BIT(0)
+
diff --git a/sound/soc/sof/intel/pci-mtl.c b/sound/soc/sof/intel/pci-mtl.c
new file mode 100644
index 000000000000..899b00d53d64
--- /dev/null
+++ b/sound/soc/sof/intel/pci-mtl.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause)
+//
+// This file is provided under a dual BSD/GPLv2 license.  When using or
+// redistributing this file, you may do so under either license.
+//
+// Copyright(c) 2018-2022 Intel Corporation. All rights reserved.
+//
+// Author: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
+//
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <sound/soc-acpi.h>
+#include <sound/soc-acpi-intel-match.h>
+#include <sound/sof.h>
+#include "../ops.h"
+#include "../sof-pci-dev.h"
+
+/* platform specific devices */
+#include "hda.h"
+#include "mtl.h"
+
+static const struct sof_dev_desc mtl_desc = {
+	.use_acpi_target_states	= true,
+	.machines               = snd_soc_acpi_intel_mtl_machines,
+	.alt_machines		= snd_soc_acpi_intel_mtl_sdw_machines,
+	.resindex_lpe_base      = 0,
+	.resindex_pcicfg_base   = -1,
+	.resindex_imr_base      = -1,
+	.irqindex_host_ipc      = -1,
+	.chip_info = &mtl_chip_info,
+	.ipc_supported_mask	= BIT(SOF_INTEL_IPC4),
+	.ipc_default		= SOF_INTEL_IPC4,
+	.default_fw_path = {
+		[SOF_INTEL_IPC4] = "intel/sof-ipc4/mtl",
+	},
+	.default_tplg_path = {
+		[SOF_INTEL_IPC4] = "intel/sof-ace-tplg",
+	},
+	.default_fw_filename = {
+		[SOF_INTEL_IPC4] = "dsp_basefw.bin",
+	},
+	.nocodec_tplg_filename = "sof-mtl-nocodec.tplg",
+	.ops = &sof_mtl_ops,
+	.ops_init = sof_mtl_ops_init,
+};
+
+/* PCI IDs */
+static const struct pci_device_id sof_pci_ids[] = {
+	{ PCI_DEVICE(0x8086, 0x7E28), /* MTL */
+		.driver_data = (unsigned long)&mtl_desc},
+	{ 0, }
+};
+MODULE_DEVICE_TABLE(pci, sof_pci_ids);
+
+/* pci_driver definition */
+static struct pci_driver snd_sof_pci_intel_mtl_driver = {
+	.name = "sof-audio-pci-intel-mtl",
+	.id_table = sof_pci_ids,
+	.probe = hda_pci_intel_probe,
+	.remove = sof_pci_remove,
+	.shutdown = sof_pci_shutdown,
+	.driver = {
+		.pm = &sof_pci_pm,
+	},
+};
+module_pci_driver(snd_sof_pci_intel_mtl_driver);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_IMPORT_NS(SND_SOC_SOF_INTEL_HDA_COMMON);
+MODULE_IMPORT_NS(SND_SOC_SOF_PCI_DEV);
diff --git a/sound/soc/sof/intel/shim.h b/sound/soc/sof/intel/shim.h
index 371991fa474f..638159bee864 100644
--- a/sound/soc/sof/intel/shim.h
+++ b/sound/soc/sof/intel/shim.h
@@ -20,6 +20,7 @@ enum sof_intel_hw_ip_version {
 	SOF_INTEL_CAVS_1_8,	/* CannonLake, CometLake, CoffeeLake */
 	SOF_INTEL_CAVS_2_0,	/* IceLake, JasperLake */
 	SOF_INTEL_CAVS_2_5,	/* TigerLake, AlderLake */
+	SOF_INTEL_ACE_1_0,	/* MeteorLake */
 };
 
 /*
-- 
cgit 


From 6365a1935c5151455812e96d8de434c551dc0d98 Mon Sep 17 00:00:00 2001
From: Ma Wupeng <mawupeng1@huawei.com>
Date: Tue, 14 Jun 2022 17:21:52 +0800
Subject: efi: Make code to find mirrored memory ranges generic

Commit b05b9f5f9dcf ("x86, mirror: x86 enabling - find mirrored memory
ranges") introduce the efi_find_mirror() function on x86. In order to reuse
the API we make it public.

Arm64 can support mirrored memory too, so function efi_find_mirror() is added to
efi_init() to this support for arm64.

Since efi_init() is shared by ARM, arm64 and riscv, this patch will bring
mirror memory support for these architectures, but this support is only tested
in arm64.

Signed-off-by: Ma Wupeng <mawupeng1@huawei.com>
Link: https://lore.kernel.org/r/20220614092156.1972846-2-mawupeng1@huawei.com
[ardb: fix subject to better reflect the payload]
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/x86/include/asm/efi.h      |  4 ----
 arch/x86/platform/efi/efi.c     | 23 -----------------------
 drivers/firmware/efi/efi-init.c |  1 +
 drivers/firmware/efi/efi.c      | 23 +++++++++++++++++++++++
 include/linux/efi.h             |  3 +++
 5 files changed, 27 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 71943dce691e..eb90206eae80 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -383,7 +383,6 @@ static inline bool efi_is_64bit(void)
 extern bool efi_reboot_required(void);
 extern bool efi_is_table_address(unsigned long phys_addr);
 
-extern void efi_find_mirror(void);
 extern void efi_reserve_boot_services(void);
 #else
 static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {}
@@ -395,9 +394,6 @@ static inline  bool efi_is_table_address(unsigned long phys_addr)
 {
 	return false;
 }
-static inline void efi_find_mirror(void)
-{
-}
 static inline void efi_reserve_boot_services(void)
 {
 }
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 1591d67e0bcd..6e598bd78eef 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -108,29 +108,6 @@ static int __init setup_add_efi_memmap(char *arg)
 }
 early_param("add_efi_memmap", setup_add_efi_memmap);
 
-void __init efi_find_mirror(void)
-{
-	efi_memory_desc_t *md;
-	u64 mirror_size = 0, total_size = 0;
-
-	if (!efi_enabled(EFI_MEMMAP))
-		return;
-
-	for_each_efi_memory_desc(md) {
-		unsigned long long start = md->phys_addr;
-		unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
-
-		total_size += size;
-		if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
-			memblock_mark_mirror(start, size);
-			mirror_size += size;
-		}
-	}
-	if (mirror_size)
-		pr_info("Memory: %lldM/%lldM mirrored memory\n",
-			mirror_size>>20, total_size>>20);
-}
-
 /*
  * Tell the kernel about the EFI memory map.  This might include
  * more than the max 128 entries that can fit in the passed in e820
diff --git a/drivers/firmware/efi/efi-init.c b/drivers/firmware/efi/efi-init.c
index b2c829e95bd1..3928dbff76d0 100644
--- a/drivers/firmware/efi/efi-init.c
+++ b/drivers/firmware/efi/efi-init.c
@@ -240,6 +240,7 @@ void __init efi_init(void)
 	 * And now, memblock is fully populated, it is time to do capping.
 	 */
 	early_init_dt_check_for_usable_mem_range();
+	efi_find_mirror();
 	efi_esrt_init();
 	efi_mokvar_table_init();
 
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 860534bcfdac..79c232e07de7 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -446,6 +446,29 @@ err_put:
 
 subsys_initcall(efisubsys_init);
 
+void __init efi_find_mirror(void)
+{
+	efi_memory_desc_t *md;
+	u64 mirror_size = 0, total_size = 0;
+
+	if (!efi_enabled(EFI_MEMMAP))
+		return;
+
+	for_each_efi_memory_desc(md) {
+		unsigned long long start = md->phys_addr;
+		unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+
+		total_size += size;
+		if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
+			memblock_mark_mirror(start, size);
+			mirror_size += size;
+		}
+	}
+	if (mirror_size)
+		pr_info("Memory: %lldM/%lldM mirrored memory\n",
+			mirror_size>>20, total_size>>20);
+}
+
 /*
  * Find the efi memory descriptor for a given physical address.  Given a
  * physical address, determine if it exists within an EFI Memory Map entry,
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 7d9b0bb47eb3..53f64c14a525 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -872,6 +872,7 @@ static inline bool efi_rt_services_supported(unsigned int mask)
 {
 	return (efi.runtime_supported_mask & mask) == mask;
 }
+extern void efi_find_mirror(void);
 #else
 static inline bool efi_enabled(int feature)
 {
@@ -889,6 +890,8 @@ static inline bool efi_rt_services_supported(unsigned int mask)
 {
 	return false;
 }
+
+static inline void efi_find_mirror(void) {}
 #endif
 
 extern int efi_status_to_err(efi_status_t status);
-- 
cgit 


From f67be8b7ee90c292948c3ec6395673963cccaee6 Mon Sep 17 00:00:00 2001
From: Li Chen <lchen@ambarella.com>
Date: Sun, 22 May 2022 20:26:58 -0700
Subject: regmap: provide regmap_field helpers for simple bit operations

We have set/clear/test operations for regmap, but not for regmap_field yet.
So let's introduce regmap_field helpers too.

In many instances regmap_field_update_bits() is used for simple bit setting
and clearing. In these cases the last argument is redundant and we can
hide it with a static inline function.

This adds three new helpers for simple bit operations: set_bits,
clear_bits and test_bits (the last one defined as a regular function).

Signed-off-by: Li Chen <lchen@ambarella.com>
Link: https://lore.kernel.org/r/180eef422c3.deae9cd960729.8518395646822099769@zohomail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap.c | 22 ++++++++++++++++++++++
 include/linux/regmap.h       | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+)

(limited to 'include')

diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index 2221d9863831..cb0be5e7b100 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -2220,6 +2220,28 @@ int regmap_field_update_bits_base(struct regmap_field *field,
 }
 EXPORT_SYMBOL_GPL(regmap_field_update_bits_base);
 
+/**
+ * regmap_field_test_bits() - Check if all specified bits are set in a
+ *                            register field.
+ *
+ * @field: Register field to operate on
+ * @bits: Bits to test
+ *
+ * Returns -1 if the underlying regmap_field_read() fails, 0 if at least one of the
+ * tested bits is not set and 1 if all tested bits are set.
+ */
+int regmap_field_test_bits(struct regmap_field *field, unsigned int bits)
+{
+	unsigned int val, ret;
+
+	ret = regmap_field_read(field, &val);
+	if (ret)
+		return ret;
+
+	return (val & bits) == bits;
+}
+EXPORT_SYMBOL_GPL(regmap_field_test_bits);
+
 /**
  * regmap_fields_update_bits_base() - Perform a read/modify/write cycle a
  *                                    register field with port ID
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 8952fa3d0d59..d5b08f4f0dc0 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -1336,6 +1336,22 @@ static inline int regmap_field_update_bits(struct regmap_field *field,
 					     NULL, false, false);
 }
 
+static inline int regmap_field_set_bits(struct regmap_field *field,
+					unsigned int bits)
+{
+	return regmap_field_update_bits_base(field, bits, bits, NULL, false,
+					     false);
+}
+
+static inline int regmap_field_clear_bits(struct regmap_field *field,
+					  unsigned int bits)
+{
+	return regmap_field_update_bits_base(field, bits, 0, NULL, false,
+					     false);
+}
+
+int regmap_field_test_bits(struct regmap_field *field, unsigned int bits);
+
 static inline int
 regmap_field_force_update_bits(struct regmap_field *field,
 			       unsigned int mask, unsigned int val)
@@ -1769,6 +1785,27 @@ regmap_field_force_update_bits(struct regmap_field *field,
 	return -EINVAL;
 }
 
+static inline int regmap_field_set_bits(struct regmap_field *field,
+					unsigned int bits)
+{
+	WARN_ONCE(1, "regmap API is disabled");
+	return -EINVAL;
+}
+
+static inline int regmap_field_clear_bits(struct regmap_field *field,
+					  unsigned int bits)
+{
+	WARN_ONCE(1, "regmap API is disabled");
+	return -EINVAL;
+}
+
+static inline int regmap_field_test_bits(struct regmap_field *field,
+					 unsigned int bits)
+{
+	WARN_ONCE(1, "regmap API is disabled");
+	return -EINVAL;
+}
+
 static inline int regmap_fields_write(struct regmap_field *field,
 				      unsigned int id, unsigned int val)
 {
-- 
cgit 


From 003cbe046171596809c2f37dc07e69df1b4d9f95 Mon Sep 17 00:00:00 2001
From: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
Date: Wed, 1 Jun 2022 20:58:55 +0530
Subject: pinctrl: Add pingroup and define PINCTRL_PINGROUP

Add 'struct pingroup' to represent pingroup and 'PINCTRL_PINGROUP'
macro for inline use. Both are used to manage and represent
larger number of pingroups.

Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20220601152900.1012813-2-Basavaraj.Natikar@amd.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/pinctrl.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include')

diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h
index 70b45d28e7a9..487117ccb1bc 100644
--- a/include/linux/pinctrl/pinctrl.h
+++ b/include/linux/pinctrl/pinctrl.h
@@ -26,6 +26,26 @@ struct pin_config_item;
 struct gpio_chip;
 struct device_node;
 
+/**
+ * struct pingroup - provides information on pingroup
+ * @name: a name for pingroup
+ * @pins: an array of pins in the pingroup
+ * @npins: number of pins in the pingroup
+ */
+struct pingroup {
+	const char *name;
+	const unsigned int *pins;
+	size_t npins;
+};
+
+/* Convenience macro to define a single named or anonymous pingroup */
+#define PINCTRL_PINGROUP(_name, _pins, _npins)	\
+(struct pingroup){				\
+	.name = _name,				\
+	.pins = _pins,				\
+	.npins = _npins,			\
+}
+
 /**
  * struct pinctrl_pin_desc - boards/machines provide information on their
  * pins, pads or other muxable units in this struct
-- 
cgit 


From f2c5092190f21e02d384f750bcc473554f3aa3f8 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 14 Jun 2022 20:18:14 +0200
Subject: arch/*: Disable softirq stacks on PREEMPT_RT.

PREEMPT_RT preempts softirqs and the current implementation avoids
do_softirq_own_stack() and only uses __do_softirq().

Disable the unused softirqs stacks on PREEMPT_RT to save some memory and
ensure that do_softirq_own_stack() is not used bwcause it is not expected.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/kernel/irq.c                 | 3 ++-
 arch/parisc/kernel/irq.c              | 2 ++
 arch/powerpc/kernel/irq.c             | 4 ++++
 arch/s390/include/asm/softirq_stack.h | 3 ++-
 arch/sh/kernel/irq.c                  | 2 ++
 arch/sparc/kernel/irq_64.c            | 2 ++
 include/asm-generic/softirq_stack.h   | 2 +-
 7 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index 5c6f8d11a3ce..034cb48c9eeb 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -70,6 +70,7 @@ static void __init init_irq_stacks(void)
 	}
 }
 
+#ifndef CONFIG_PREEMPT_RT
 static void ____do_softirq(void *arg)
 {
 	__do_softirq();
@@ -80,7 +81,7 @@ void do_softirq_own_stack(void)
 	call_with_stack(____do_softirq, NULL,
 			__this_cpu_read(irq_stack_ptr));
 }
-
+#endif
 #endif
 
 int arch_show_interrupts(struct seq_file *p, int prec)
diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index 0fe2d79fb123..eba193bcdab1 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -480,10 +480,12 @@ static void execute_on_irq_stack(void *func, unsigned long param1)
 	*irq_stack_in_use = 1;
 }
 
+#ifndef CONFIG_PREEMPT_RT
 void do_softirq_own_stack(void)
 {
 	execute_on_irq_stack(__do_softirq, 0);
 }
+#endif
 #endif /* CONFIG_IRQSTACKS */
 
 /* ONLY called from entry.S:intr_extint() */
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index dd09919c3c66..0822a274a549 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -611,6 +611,7 @@ static inline void check_stack_overflow(void)
 	}
 }
 
+#ifndef CONFIG_PREEMPT_RT
 static __always_inline void call_do_softirq(const void *sp)
 {
 	/* Temporarily switch r1 to sp, call __do_softirq() then restore r1. */
@@ -629,6 +630,7 @@ static __always_inline void call_do_softirq(const void *sp)
 		   "r11", "r12"
 	);
 }
+#endif
 
 static __always_inline void call_do_irq(struct pt_regs *regs, void *sp)
 {
@@ -747,10 +749,12 @@ void *mcheckirq_ctx[NR_CPUS] __read_mostly;
 void *softirq_ctx[NR_CPUS] __read_mostly;
 void *hardirq_ctx[NR_CPUS] __read_mostly;
 
+#ifndef CONFIG_PREEMPT_RT
 void do_softirq_own_stack(void)
 {
 	call_do_softirq(softirq_ctx[smp_processor_id()]);
 }
+#endif
 
 irq_hw_number_t virq_to_hw(unsigned int virq)
 {
diff --git a/arch/s390/include/asm/softirq_stack.h b/arch/s390/include/asm/softirq_stack.h
index fd17f25704bd..af68d6c1d584 100644
--- a/arch/s390/include/asm/softirq_stack.h
+++ b/arch/s390/include/asm/softirq_stack.h
@@ -5,9 +5,10 @@
 #include <asm/lowcore.h>
 #include <asm/stacktrace.h>
 
+#ifndef CONFIG_PREEMPT_RT
 static inline void do_softirq_own_stack(void)
 {
 	call_on_stack(0, S390_lowcore.async_stack, void, __do_softirq);
 }
-
+#endif
 #endif /* __ASM_S390_SOFTIRQ_STACK_H */
diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
index ef0f0827cf57..2d3eca8fee01 100644
--- a/arch/sh/kernel/irq.c
+++ b/arch/sh/kernel/irq.c
@@ -149,6 +149,7 @@ void irq_ctx_exit(int cpu)
 	hardirq_ctx[cpu] = NULL;
 }
 
+#ifndef CONFIG_PREEMPT_RT
 void do_softirq_own_stack(void)
 {
 	struct thread_info *curctx;
@@ -176,6 +177,7 @@ void do_softirq_own_stack(void)
 		  "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
 	);
 }
+#endif
 #else
 static inline void handle_one_irq(unsigned int irq)
 {
diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index c8848bb681a1..41fa1be980a3 100644
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -855,6 +855,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs)
 	set_irq_regs(old_regs);
 }
 
+#ifndef CONFIG_PREEMPT_RT
 void do_softirq_own_stack(void)
 {
 	void *orig_sp, *sp = softirq_stack[smp_processor_id()];
@@ -869,6 +870,7 @@ void do_softirq_own_stack(void)
 	__asm__ __volatile__("mov %0, %%sp"
 			     : : "r" (orig_sp));
 }
+#endif
 
 #ifdef CONFIG_HOTPLUG_CPU
 void fixup_irqs(void)
diff --git a/include/asm-generic/softirq_stack.h b/include/asm-generic/softirq_stack.h
index eceeecf6a5bd..d3e2d81656e0 100644
--- a/include/asm-generic/softirq_stack.h
+++ b/include/asm-generic/softirq_stack.h
@@ -2,7 +2,7 @@
 #ifndef __ASM_GENERIC_SOFTIRQ_STACK_H
 #define __ASM_GENERIC_SOFTIRQ_STACK_H
 
-#ifdef CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK
+#if defined(CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK) && !defined(CONFIG_PREEMPT_RT)
 void do_softirq_own_stack(void);
 #else
 static inline void do_softirq_own_stack(void)
-- 
cgit 


From d593d64f043add170d8ea9cf698449637917dcf9 Mon Sep 17 00:00:00 2001
From: Prasad Sodagudi <psodagud@codeaurora.org>
Date: Wed, 18 May 2022 22:14:14 +0530
Subject: lib: Add register read/write tracing support

Generic MMIO read/write i.e., __raw_{read,write}{b,l,w,q} accessors
are typically used to read/write from/to memory mapped registers
and can cause hangs or some undefined behaviour in following few
cases,

* If the access to the register space is unclocked, for example: if
  there is an access to multimedia(MM) block registers without MM
  clocks.

* If the register space is protected and not set to be accessible from
  non-secure world, for example: only EL3 (EL: Exception level) access
  is allowed and any EL2/EL1 access is forbidden.

* If xPU(memory/register protection units) is controlling access to
  certain memory/register space for specific clients.

and more...

Such cases usually results in instant reboot/SErrors/NOC or interconnect
hangs and tracing these register accesses can be very helpful to debug
such issues during initial development stages and also in later stages.

So use ftrace trace events to log such MMIO register accesses which
provides rich feature set such as early enablement of trace events,
filtering capability, dumping ftrace logs on console and many more.

Sample output:

rwmmio_write: __qcom_geni_serial_console_write+0x160/0x1e0 width=32 val=0xa0d5d addr=0xfffffbfffdbff700
rwmmio_post_write: __qcom_geni_serial_console_write+0x160/0x1e0 width=32 val=0xa0d5d addr=0xfffffbfffdbff700
rwmmio_read: qcom_geni_serial_poll_bit+0x94/0x138 width=32 addr=0xfffffbfffdbff610
rwmmio_post_read: qcom_geni_serial_poll_bit+0x94/0x138 width=32 val=0x0 addr=0xfffffbfffdbff610

Co-developed-by: Sai Prakash Ranjan <quic_saipraka@quicinc.com>
Signed-off-by: Prasad Sodagudi <psodagud@codeaurora.org>
Signed-off-by: Sai Prakash Ranjan <quic_saipraka@quicinc.com>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/Kconfig                  |  3 ++
 arch/arm64/Kconfig            |  1 +
 include/trace/events/rwmmio.h | 97 +++++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig                   |  7 ++++
 lib/Makefile                  |  2 +
 lib/trace_readwrite.c         | 47 +++++++++++++++++++++
 6 files changed, 157 insertions(+)
 create mode 100644 include/trace/events/rwmmio.h
 create mode 100644 lib/trace_readwrite.c

(limited to 'include')

diff --git a/arch/Kconfig b/arch/Kconfig
index fcf9a41a4ef5..47899446483b 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1396,6 +1396,9 @@ config ARCH_HAS_ELFCORE_COMPAT
 config ARCH_HAS_PARANOID_L1D_FLUSH
 	bool
 
+config ARCH_HAVE_TRACE_MMIO_ACCESS
+	bool
+
 config DYNAMIC_SIGFRAME
 	bool
 
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1652a9800ebe..fbf21e01f0b6 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -49,6 +49,7 @@ config ARM64
 	select ARCH_HAS_ZONE_DMA_SET if EXPERT
 	select ARCH_HAVE_ELF_PROT
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
+	select ARCH_HAVE_TRACE_MMIO_ACCESS
 	select ARCH_INLINE_READ_LOCK if !PREEMPTION
 	select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION
 	select ARCH_INLINE_READ_LOCK_IRQ if !PREEMPTION
diff --git a/include/trace/events/rwmmio.h b/include/trace/events/rwmmio.h
new file mode 100644
index 000000000000..de41159216c1
--- /dev/null
+++ b/include/trace/events/rwmmio.h
@@ -0,0 +1,97 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2021-2022 Qualcomm Innovation Center, Inc. All rights reserved.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rwmmio
+
+#if !defined(_TRACE_RWMMIO_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_RWMMIO_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(rwmmio_rw_template,
+
+	TP_PROTO(unsigned long caller, u64 val, u8 width, volatile void __iomem *addr),
+
+	TP_ARGS(caller, val, width, addr),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, caller)
+		__field(unsigned long, addr)
+		__field(u64, val)
+		__field(u8, width)
+	),
+
+	TP_fast_assign(
+		__entry->caller = caller;
+		__entry->val = val;
+		__entry->addr = (unsigned long)addr;
+		__entry->width = width;
+	),
+
+	TP_printk("%pS width=%d val=%#llx addr=%#lx",
+		(void *)__entry->caller, __entry->width,
+		__entry->val, __entry->addr)
+);
+
+DEFINE_EVENT(rwmmio_rw_template, rwmmio_write,
+	TP_PROTO(unsigned long caller, u64 val, u8 width, volatile void __iomem *addr),
+	TP_ARGS(caller, val, width, addr)
+);
+
+DEFINE_EVENT(rwmmio_rw_template, rwmmio_post_write,
+	TP_PROTO(unsigned long caller, u64 val, u8 width, volatile void __iomem *addr),
+	TP_ARGS(caller, val, width, addr)
+);
+
+TRACE_EVENT(rwmmio_read,
+
+	TP_PROTO(unsigned long caller, u8 width, const volatile void __iomem *addr),
+
+	TP_ARGS(caller, width, addr),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, caller)
+		__field(unsigned long, addr)
+		__field(u8, width)
+	),
+
+	TP_fast_assign(
+		__entry->caller = caller;
+		__entry->addr = (unsigned long)addr;
+		__entry->width = width;
+	),
+
+	TP_printk("%pS width=%d addr=%#lx",
+		 (void *)__entry->caller, __entry->width, __entry->addr)
+);
+
+TRACE_EVENT(rwmmio_post_read,
+
+	TP_PROTO(unsigned long caller, u64 val, u8 width, const volatile void __iomem *addr),
+
+	TP_ARGS(caller, val, width, addr),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, caller)
+		__field(unsigned long, addr)
+		__field(u64, val)
+		__field(u8, width)
+	),
+
+	TP_fast_assign(
+		__entry->caller = caller;
+		__entry->val = val;
+		__entry->addr = (unsigned long)addr;
+		__entry->width = width;
+	),
+
+	TP_printk("%pS width=%d val=%#llx addr=%#lx",
+		 (void *)__entry->caller, __entry->width,
+		 __entry->val, __entry->addr)
+);
+
+#endif /* _TRACE_RWMMIO_H */
+
+#include <trace/define_trace.h>
diff --git a/lib/Kconfig b/lib/Kconfig
index 6a843639814f..80aeceee88ed 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -118,6 +118,13 @@ config INDIRECT_IOMEM_FALLBACK
 	  mmio accesses when the IO memory address is not a registered
 	  emulated region.
 
+config TRACE_MMIO_ACCESS
+	bool "Register read/write tracing"
+	depends on TRACING && ARCH_HAVE_TRACE_MMIO_ACCESS
+	help
+	  Create tracepoints for MMIO read/write operations. These trace events
+	  can be used for logging all MMIO read/write operations.
+
 source "lib/crypto/Kconfig"
 
 config CRC_CCITT
diff --git a/lib/Makefile b/lib/Makefile
index ea54294d73bf..07917a40c91f 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -151,6 +151,8 @@ lib-y += logic_pio.o
 
 lib-$(CONFIG_INDIRECT_IOMEM) += logic_iomem.o
 
+obj-$(CONFIG_TRACE_MMIO_ACCESS) += trace_readwrite.o
+
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
 
 obj-$(CONFIG_BTREE) += btree.o
diff --git a/lib/trace_readwrite.c b/lib/trace_readwrite.c
new file mode 100644
index 000000000000..88637038b30c
--- /dev/null
+++ b/lib/trace_readwrite.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Register read and write tracepoints
+ *
+ * Copyright (c) 2021-2022 Qualcomm Innovation Center, Inc. All rights reserved.
+ */
+
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <asm-generic/io.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/rwmmio.h>
+
+#ifdef CONFIG_TRACE_MMIO_ACCESS
+void log_write_mmio(u64 val, u8 width, volatile void __iomem *addr,
+		    unsigned long caller_addr)
+{
+	trace_rwmmio_write(caller_addr, val, width, addr);
+}
+EXPORT_SYMBOL_GPL(log_write_mmio);
+EXPORT_TRACEPOINT_SYMBOL_GPL(rwmmio_write);
+
+void log_post_write_mmio(u64 val, u8 width, volatile void __iomem *addr,
+			 unsigned long caller_addr)
+{
+	trace_rwmmio_post_write(caller_addr, val, width, addr);
+}
+EXPORT_SYMBOL_GPL(log_post_write_mmio);
+EXPORT_TRACEPOINT_SYMBOL_GPL(rwmmio_post_write);
+
+void log_read_mmio(u8 width, const volatile void __iomem *addr,
+		   unsigned long caller_addr)
+{
+	trace_rwmmio_read(caller_addr, width, addr);
+}
+EXPORT_SYMBOL_GPL(log_read_mmio);
+EXPORT_TRACEPOINT_SYMBOL_GPL(rwmmio_read);
+
+void log_post_read_mmio(u64 val, u8 width, const volatile void __iomem *addr,
+			unsigned long caller_addr)
+{
+	trace_rwmmio_post_read(caller_addr, val, width, addr);
+}
+EXPORT_SYMBOL_GPL(log_post_read_mmio);
+EXPORT_TRACEPOINT_SYMBOL_GPL(rwmmio_post_read);
+#endif /* CONFIG_TRACE_MMIO_ACCESS */
-- 
cgit 


From 210031971cdd25a2a2b70c190de98c237db0731f Mon Sep 17 00:00:00 2001
From: Sai Prakash Ranjan <quic_saipraka@quicinc.com>
Date: Wed, 18 May 2022 22:14:16 +0530
Subject: asm-generic/io: Add logging support for MMIO accessors

Add logging support for MMIO high level accessors such as read{b,w,l,q}
and their relaxed versions to aid in debugging unexpected crashes/hangs
caused by the corresponding MMIO operation.

Signed-off-by: Sai Prakash Ranjan <quic_saipraka@quicinc.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/asm-generic/io.h | 91 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 87 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index 7ce93aaf69f8..9c5114335a2d 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -10,6 +10,7 @@
 #include <asm/page.h> /* I/O is all done through memory accesses */
 #include <linux/string.h> /* for memset() and memcpy() */
 #include <linux/types.h>
+#include <linux/instruction_pointer.h>
 
 #ifdef CONFIG_GENERIC_IOMAP
 #include <asm-generic/iomap.h>
@@ -61,6 +62,44 @@
 #define __io_par(v)     __io_ar(v)
 #endif
 
+/*
+ * "__DISABLE_TRACE_MMIO__" flag can be used to disable MMIO tracing for
+ * specific kernel drivers in case of excessive/unwanted logging.
+ *
+ * Usage: Add a #define flag at the beginning of the driver file.
+ * Ex: #define __DISABLE_TRACE_MMIO__
+ *     #include <...>
+ *     ...
+ */
+#if IS_ENABLED(CONFIG_TRACE_MMIO_ACCESS) && !(defined(__DISABLE_TRACE_MMIO__))
+#include <linux/tracepoint-defs.h>
+
+DECLARE_TRACEPOINT(rwmmio_write);
+DECLARE_TRACEPOINT(rwmmio_post_write);
+DECLARE_TRACEPOINT(rwmmio_read);
+DECLARE_TRACEPOINT(rwmmio_post_read);
+
+void log_write_mmio(u64 val, u8 width, volatile void __iomem *addr,
+		    unsigned long caller_addr);
+void log_post_write_mmio(u64 val, u8 width, volatile void __iomem *addr,
+			 unsigned long caller_addr);
+void log_read_mmio(u8 width, const volatile void __iomem *addr,
+		   unsigned long caller_addr);
+void log_post_read_mmio(u64 val, u8 width, const volatile void __iomem *addr,
+			unsigned long caller_addr);
+
+#else
+
+static inline void log_write_mmio(u64 val, u8 width, volatile void __iomem *addr,
+				  unsigned long caller_addr) {}
+static inline void log_post_write_mmio(u64 val, u8 width, volatile void __iomem *addr,
+				       unsigned long caller_addr) {}
+static inline void log_read_mmio(u8 width, const volatile void __iomem *addr,
+				 unsigned long caller_addr) {}
+static inline void log_post_read_mmio(u64 val, u8 width, const volatile void __iomem *addr,
+				      unsigned long caller_addr) {}
+
+#endif /* CONFIG_TRACE_MMIO_ACCESS */
 
 /*
  * __raw_{read,write}{b,w,l,q}() access memory in native endianness.
@@ -149,9 +188,11 @@ static inline u8 readb(const volatile void __iomem *addr)
 {
 	u8 val;
 
+	log_read_mmio(8, addr, _THIS_IP_);
 	__io_br();
 	val = __raw_readb(addr);
 	__io_ar(val);
+	log_post_read_mmio(val, 8, addr, _THIS_IP_);
 	return val;
 }
 #endif
@@ -162,9 +203,11 @@ static inline u16 readw(const volatile void __iomem *addr)
 {
 	u16 val;
 
+	log_read_mmio(16, addr, _THIS_IP_);
 	__io_br();
 	val = __le16_to_cpu((__le16 __force)__raw_readw(addr));
 	__io_ar(val);
+	log_post_read_mmio(val, 16, addr, _THIS_IP_);
 	return val;
 }
 #endif
@@ -175,9 +218,11 @@ static inline u32 readl(const volatile void __iomem *addr)
 {
 	u32 val;
 
+	log_read_mmio(32, addr, _THIS_IP_);
 	__io_br();
 	val = __le32_to_cpu((__le32 __force)__raw_readl(addr));
 	__io_ar(val);
+	log_post_read_mmio(val, 32, addr, _THIS_IP_);
 	return val;
 }
 #endif
@@ -189,9 +234,11 @@ static inline u64 readq(const volatile void __iomem *addr)
 {
 	u64 val;
 
+	log_read_mmio(64, addr, _THIS_IP_);
 	__io_br();
 	val = __le64_to_cpu(__raw_readq(addr));
 	__io_ar(val);
+	log_post_read_mmio(val, 64, addr, _THIS_IP_);
 	return val;
 }
 #endif
@@ -201,9 +248,11 @@ static inline u64 readq(const volatile void __iomem *addr)
 #define writeb writeb
 static inline void writeb(u8 value, volatile void __iomem *addr)
 {
+	log_write_mmio(value, 8, addr, _THIS_IP_);
 	__io_bw();
 	__raw_writeb(value, addr);
 	__io_aw();
+	log_post_write_mmio(value, 8, addr, _THIS_IP_);
 }
 #endif
 
@@ -211,9 +260,11 @@ static inline void writeb(u8 value, volatile void __iomem *addr)
 #define writew writew
 static inline void writew(u16 value, volatile void __iomem *addr)
 {
+	log_write_mmio(value, 16, addr, _THIS_IP_);
 	__io_bw();
 	__raw_writew((u16 __force)cpu_to_le16(value), addr);
 	__io_aw();
+	log_post_write_mmio(value, 16, addr, _THIS_IP_);
 }
 #endif
 
@@ -221,9 +272,11 @@ static inline void writew(u16 value, volatile void __iomem *addr)
 #define writel writel
 static inline void writel(u32 value, volatile void __iomem *addr)
 {
+	log_write_mmio(value, 32, addr, _THIS_IP_);
 	__io_bw();
 	__raw_writel((u32 __force)__cpu_to_le32(value), addr);
 	__io_aw();
+	log_post_write_mmio(value, 32, addr, _THIS_IP_);
 }
 #endif
 
@@ -232,9 +285,11 @@ static inline void writel(u32 value, volatile void __iomem *addr)
 #define writeq writeq
 static inline void writeq(u64 value, volatile void __iomem *addr)
 {
+	log_write_mmio(value, 64, addr, _THIS_IP_);
 	__io_bw();
 	__raw_writeq(__cpu_to_le64(value), addr);
 	__io_aw();
+	log_post_write_mmio(value, 64, addr, _THIS_IP_);
 }
 #endif
 #endif /* CONFIG_64BIT */
@@ -248,7 +303,12 @@ static inline void writeq(u64 value, volatile void __iomem *addr)
 #define readb_relaxed readb_relaxed
 static inline u8 readb_relaxed(const volatile void __iomem *addr)
 {
-	return __raw_readb(addr);
+	u8 val;
+
+	log_read_mmio(8, addr, _THIS_IP_);
+	val = __raw_readb(addr);
+	log_post_read_mmio(val, 8, addr, _THIS_IP_);
+	return val;
 }
 #endif
 
@@ -256,7 +316,12 @@ static inline u8 readb_relaxed(const volatile void __iomem *addr)
 #define readw_relaxed readw_relaxed
 static inline u16 readw_relaxed(const volatile void __iomem *addr)
 {
-	return __le16_to_cpu(__raw_readw(addr));
+	u16 val;
+
+	log_read_mmio(16, addr, _THIS_IP_);
+	val = __le16_to_cpu(__raw_readw(addr));
+	log_post_read_mmio(val, 16, addr, _THIS_IP_);
+	return val;
 }
 #endif
 
@@ -264,7 +329,12 @@ static inline u16 readw_relaxed(const volatile void __iomem *addr)
 #define readl_relaxed readl_relaxed
 static inline u32 readl_relaxed(const volatile void __iomem *addr)
 {
-	return __le32_to_cpu(__raw_readl(addr));
+	u32 val;
+
+	log_read_mmio(32, addr, _THIS_IP_);
+	val = __le32_to_cpu(__raw_readl(addr));
+	log_post_read_mmio(val, 32, addr, _THIS_IP_);
+	return val;
 }
 #endif
 
@@ -272,7 +342,12 @@ static inline u32 readl_relaxed(const volatile void __iomem *addr)
 #define readq_relaxed readq_relaxed
 static inline u64 readq_relaxed(const volatile void __iomem *addr)
 {
-	return __le64_to_cpu(__raw_readq(addr));
+	u64 val;
+
+	log_read_mmio(64, addr, _THIS_IP_);
+	val = __le64_to_cpu(__raw_readq(addr));
+	log_post_read_mmio(val, 64, addr, _THIS_IP_);
+	return val;
 }
 #endif
 
@@ -280,7 +355,9 @@ static inline u64 readq_relaxed(const volatile void __iomem *addr)
 #define writeb_relaxed writeb_relaxed
 static inline void writeb_relaxed(u8 value, volatile void __iomem *addr)
 {
+	log_write_mmio(value, 8, addr, _THIS_IP_);
 	__raw_writeb(value, addr);
+	log_post_write_mmio(value, 8, addr, _THIS_IP_);
 }
 #endif
 
@@ -288,7 +365,9 @@ static inline void writeb_relaxed(u8 value, volatile void __iomem *addr)
 #define writew_relaxed writew_relaxed
 static inline void writew_relaxed(u16 value, volatile void __iomem *addr)
 {
+	log_write_mmio(value, 16, addr, _THIS_IP_);
 	__raw_writew(cpu_to_le16(value), addr);
+	log_post_write_mmio(value, 16, addr, _THIS_IP_);
 }
 #endif
 
@@ -296,7 +375,9 @@ static inline void writew_relaxed(u16 value, volatile void __iomem *addr)
 #define writel_relaxed writel_relaxed
 static inline void writel_relaxed(u32 value, volatile void __iomem *addr)
 {
+	log_write_mmio(value, 32, addr, _THIS_IP_);
 	__raw_writel(__cpu_to_le32(value), addr);
+	log_post_write_mmio(value, 32, addr, _THIS_IP_);
 }
 #endif
 
@@ -304,7 +385,9 @@ static inline void writel_relaxed(u32 value, volatile void __iomem *addr)
 #define writeq_relaxed writeq_relaxed
 static inline void writeq_relaxed(u64 value, volatile void __iomem *addr)
 {
+	log_write_mmio(value, 64, addr, _THIS_IP_);
 	__raw_writeq(__cpu_to_le64(value), addr);
+	log_post_write_mmio(value, 64, addr, _THIS_IP_);
 }
 #endif
 
-- 
cgit 


From 10f09307199da274584b8170a41228ca6dfed6d3 Mon Sep 17 00:00:00 2001
From: Nuno Sá <nuno.sa@analog.com>
Date: Fri, 10 Jun 2022 10:45:30 +0200
Subject: iio: core: drop of.h from iio.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is no reason to include OF as we only need to forward declare
'of_phandle_args'. Previously, some drivers were actually relying on
this for some headers (those were already fixed).

Signed-off-by: Nuno Sá <nuno.sa@analog.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Link: https://lore.kernel.org/r/20220610084545.547700-20-nuno.sa@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/iio.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 4e21a82b3756..d9b4a9ca9a0f 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -11,13 +11,14 @@
 #include <linux/cdev.h>
 #include <linux/slab.h>
 #include <linux/iio/types.h>
-#include <linux/of.h>
 /* IIO TODO LIST */
 /*
  * Provide means of adjusting timer accuracy.
  * Currently assumes nano seconds.
  */
 
+struct of_phandle_args;
+
 enum iio_shared_by {
 	IIO_SEPARATE,
 	IIO_SHARED_BY_TYPE,
-- 
cgit 


From fb91526b5fb04133799bc708661b467226caa032 Mon Sep 17 00:00:00 2001
From: Rex-BC Chen <rex-bc.chen@mediatek.com>
Date: Mon, 23 May 2022 17:33:40 +0800
Subject: dt-bindings: reset: mediatek: Add infra_ao reset index for
 MT8192/MT8195
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To support reset of infra_ao, add the index of infra_ao reset of
thermal/svs/pcei for MT8192 and thermal/svs for MT8195.

Signed-off-by: Rex-BC Chen <rex-bc.chen@mediatek.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Reviewed-by: Nícolas F. R. A. Prado <nfraprado@collabora.com>
[Nícolas: Test for MT8192]
Tested-by: Nícolas F. R. A. Prado <nfraprado@collabora.com>
Link: https://lore.kernel.org/r/20220523093346.28493-14-rex-bc.chen@mediatek.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/dt-bindings/reset/mt8192-resets.h | 8 ++++++++
 include/dt-bindings/reset/mt8195-resets.h | 6 ++++++
 2 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/reset/mt8192-resets.h b/include/dt-bindings/reset/mt8192-resets.h
index 764ca9910fa9..12e2087c90a3 100644
--- a/include/dt-bindings/reset/mt8192-resets.h
+++ b/include/dt-bindings/reset/mt8192-resets.h
@@ -7,6 +7,7 @@
 #ifndef _DT_BINDINGS_RESET_CONTROLLER_MT8192
 #define _DT_BINDINGS_RESET_CONTROLLER_MT8192
 
+/* TOPRGU resets */
 #define MT8192_TOPRGU_MM_SW_RST					1
 #define MT8192_TOPRGU_MFG_SW_RST				2
 #define MT8192_TOPRGU_VENC_SW_RST				3
@@ -30,4 +31,11 @@
 /* MMSYS resets */
 #define MT8192_MMSYS_SW0_RST_B_DISP_DSI0			15
 
+/* INFRA resets */
+#define MT8192_INFRA_RST0_THERM_CTRL_SWRST		0
+#define MT8192_INFRA_RST2_PEXTP_PHY_SWRST		1
+#define MT8192_INFRA_RST3_THERM_CTRL_PTP_SWRST	2
+#define MT8192_INFRA_RST4_PCIE_TOP_SWRST		3
+#define MT8192_INFRA_RST4_THERM_CTRL_MCU_SWRST	4
+
 #endif  /* _DT_BINDINGS_RESET_CONTROLLER_MT8192 */
diff --git a/include/dt-bindings/reset/mt8195-resets.h b/include/dt-bindings/reset/mt8195-resets.h
index a26bccc8b957..0b1937f14b36 100644
--- a/include/dt-bindings/reset/mt8195-resets.h
+++ b/include/dt-bindings/reset/mt8195-resets.h
@@ -7,6 +7,7 @@
 #ifndef _DT_BINDINGS_RESET_CONTROLLER_MT8195
 #define _DT_BINDINGS_RESET_CONTROLLER_MT8195
 
+/* TOPRGU resets */
 #define MT8195_TOPRGU_CONN_MCU_SW_RST          0
 #define MT8195_TOPRGU_INFRA_GRST_SW_RST        1
 #define MT8195_TOPRGU_APU_SW_RST               2
@@ -26,4 +27,9 @@
 
 #define MT8195_TOPRGU_SW_RST_NUM               16
 
+/* INFRA resets */
+#define MT8195_INFRA_RST0_THERM_CTRL_SWRST     0
+#define MT8195_INFRA_RST3_THERM_CTRL_PTP_SWRST 1
+#define MT8195_INFRA_RST4_THERM_CTRL_MCU_SWRST 2
+
 #endif  /* _DT_BINDINGS_RESET_CONTROLLER_MT8195 */
-- 
cgit 


From 5ea61b478f30c7083fc9048934220f20a3089782 Mon Sep 17 00:00:00 2001
From: Rex-BC Chen <rex-bc.chen@mediatek.com>
Date: Mon, 23 May 2022 17:33:44 +0800
Subject: dt-bindings: reset: mediatek: Add infra_ao reset index for MT8186
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To support reset of infra_ao, add the index of infra_ao reset of
thermal/svs for MT8186.

Signed-off-by: Rex-BC Chen <rex-bc.chen@mediatek.com>
Acked-by: Rob Herring <robh@kernel.org>
Reviewed-by: Nícolas F. R. A. Prado <nfraprado@collabora.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Link: https://lore.kernel.org/r/20220523093346.28493-18-rex-bc.chen@mediatek.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/dt-bindings/reset/mt8186-resets.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/reset/mt8186-resets.h b/include/dt-bindings/reset/mt8186-resets.h
index 5f850370c42c..2e9029c22f38 100644
--- a/include/dt-bindings/reset/mt8186-resets.h
+++ b/include/dt-bindings/reset/mt8186-resets.h
@@ -7,6 +7,7 @@
 #ifndef _DT_BINDINGS_RESET_CONTROLLER_MT8186
 #define _DT_BINDINGS_RESET_CONTROLLER_MT8186
 
+/* TOPRGU resets */
 #define MT8186_TOPRGU_INFRA_SW_RST				0
 #define MT8186_TOPRGU_MM_SW_RST					1
 #define MT8186_TOPRGU_MFG_SW_RST				2
@@ -33,4 +34,8 @@
 /* MMSYS resets */
 #define MT8186_MMSYS_SW0_RST_B_DISP_DSI0			19
 
+/* INFRA resets */
+#define MT8186_INFRA_THERMAL_CTRL_RST			0
+#define MT8186_INFRA_PTP_CTRL_RST				1
+
 #endif  /* _DT_BINDINGS_RESET_CONTROLLER_MT8186 */
-- 
cgit 


From af89cd45603483135bdd238fcb3fa871155a0ae1 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Fri, 20 May 2022 09:57:34 +0200
Subject: clk: Improve documentation for devm_clk_get() and its optional
 variant
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make use of "Context:" and "Return:". Mention that the clk is not to be
expected to be prepared, previously only not being enabled was mentioned
which probably dates from the times when the concept of clk preparation
wasn't invented yet.

Also describe devm_clk_get_optional() fully instead of just referencing
devm_clk_get().

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/20220520075737.758761-2-u.kleine-koenig@pengutronix.de
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/linux/clk.h | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/clk.h b/include/linux/clk.h
index 39faa54efe88..c8fc398d2ad7 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -443,15 +443,16 @@ int __must_check devm_clk_bulk_get_all(struct device *dev,
  * @dev: device for clock "consumer"
  * @id: clock consumer ID
  *
- * Returns a struct clk corresponding to the clock producer, or
+ * Context: May sleep.
+ *
+ * Return: a struct clk corresponding to the clock producer, or
  * valid IS_ERR() condition containing errno.  The implementation
  * uses @dev and @id to determine the clock consumer, and thereby
  * the clock producer.  (IOW, @id may be identical strings, but
  * clk_get may return different clock producers depending on @dev.)
  *
- * Drivers must assume that the clock source is not enabled.
- *
- * devm_clk_get should not be called from within interrupt context.
+ * Drivers must assume that the clock source is neither prepared nor
+ * enabled.
  *
  * The clock will automatically be freed when the device is unbound
  * from the bus.
@@ -464,8 +465,20 @@ struct clk *devm_clk_get(struct device *dev, const char *id);
  * @dev: device for clock "consumer"
  * @id: clock consumer ID
  *
- * Behaves the same as devm_clk_get() except where there is no clock producer.
- * In this case, instead of returning -ENOENT, the function returns NULL.
+ * Context: May sleep.
+ *
+ * Return: a struct clk corresponding to the clock producer, or
+ * valid IS_ERR() condition containing errno.  The implementation
+ * uses @dev and @id to determine the clock consumer, and thereby
+ * the clock producer.  If no such clk is found, it returns NULL
+ * which serves as a dummy clk.  That's the only difference compared
+ * to devm_clk_get().
+ *
+ * Drivers must assume that the clock source is neither prepared nor
+ * enabled.
+ *
+ * The clock will automatically be freed when the device is unbound
+ * from the bus.
  */
 struct clk *devm_clk_get_optional(struct device *dev, const char *id);
 
-- 
cgit 


From 7ef9651e9792b08eb310c6beb202cbc947f43cab Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Fri, 20 May 2022 09:57:36 +0200
Subject: clk: Provide new devm_clk helpers for prepared and enabled clocks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a driver keeps a clock prepared (or enabled) during the whole
lifetime of the driver, these helpers allow to simplify the drivers.

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Alexandru Ardelean <aardelean@deviqon.com>
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/20220520075737.758761-4-u.kleine-koenig@pengutronix.de
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/clk-devres.c |  27 ++++++++++++
 include/linux/clk.h      | 109 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 136 insertions(+)

(limited to 'include')

diff --git a/drivers/clk/clk-devres.c b/drivers/clk/clk-devres.c
index c822f4ef1584..43ccd20e0298 100644
--- a/drivers/clk/clk-devres.c
+++ b/drivers/clk/clk-devres.c
@@ -66,12 +66,39 @@ struct clk *devm_clk_get(struct device *dev, const char *id)
 }
 EXPORT_SYMBOL(devm_clk_get);
 
+struct clk *devm_clk_get_prepared(struct device *dev, const char *id)
+{
+	return __devm_clk_get(dev, id, clk_get, clk_prepare, clk_unprepare);
+}
+EXPORT_SYMBOL_GPL(devm_clk_get_prepared);
+
+struct clk *devm_clk_get_enabled(struct device *dev, const char *id)
+{
+	return __devm_clk_get(dev, id, clk_get,
+			      clk_prepare_enable, clk_disable_unprepare);
+}
+EXPORT_SYMBOL_GPL(devm_clk_get_enabled);
+
 struct clk *devm_clk_get_optional(struct device *dev, const char *id)
 {
 	return __devm_clk_get(dev, id, clk_get_optional, NULL, NULL);
 }
 EXPORT_SYMBOL(devm_clk_get_optional);
 
+struct clk *devm_clk_get_optional_prepared(struct device *dev, const char *id)
+{
+	return __devm_clk_get(dev, id, clk_get_optional,
+			      clk_prepare, clk_unprepare);
+}
+EXPORT_SYMBOL_GPL(devm_clk_get_optional_prepared);
+
+struct clk *devm_clk_get_optional_enabled(struct device *dev, const char *id)
+{
+	return __devm_clk_get(dev, id, clk_get_optional,
+			      clk_prepare_enable, clk_disable_unprepare);
+}
+EXPORT_SYMBOL_GPL(devm_clk_get_optional_enabled);
+
 struct clk_bulk_devres {
 	struct clk_bulk_data *clks;
 	int num_clks;
diff --git a/include/linux/clk.h b/include/linux/clk.h
index c8fc398d2ad7..c13061cabdfc 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -459,6 +459,47 @@ int __must_check devm_clk_bulk_get_all(struct device *dev,
  */
 struct clk *devm_clk_get(struct device *dev, const char *id);
 
+/**
+ * devm_clk_get_prepared - devm_clk_get() + clk_prepare()
+ * @dev: device for clock "consumer"
+ * @id: clock consumer ID
+ *
+ * Context: May sleep.
+ *
+ * Return: a struct clk corresponding to the clock producer, or
+ * valid IS_ERR() condition containing errno.  The implementation
+ * uses @dev and @id to determine the clock consumer, and thereby
+ * the clock producer.  (IOW, @id may be identical strings, but
+ * clk_get may return different clock producers depending on @dev.)
+ *
+ * The returned clk (if valid) is prepared. Drivers must however assume
+ * that the clock is not enabled.
+ *
+ * The clock will automatically be unprepared and freed when the device
+ * is unbound from the bus.
+ */
+struct clk *devm_clk_get_prepared(struct device *dev, const char *id);
+
+/**
+ * devm_clk_get_enabled - devm_clk_get() + clk_prepare_enable()
+ * @dev: device for clock "consumer"
+ * @id: clock consumer ID
+ *
+ * Context: May sleep.
+ *
+ * Return: a struct clk corresponding to the clock producer, or
+ * valid IS_ERR() condition containing errno.  The implementation
+ * uses @dev and @id to determine the clock consumer, and thereby
+ * the clock producer.  (IOW, @id may be identical strings, but
+ * clk_get may return different clock producers depending on @dev.)
+ *
+ * The returned clk (if valid) is prepared and enabled.
+ *
+ * The clock will automatically be disabled, unprepared and freed
+ * when the device is unbound from the bus.
+ */
+struct clk *devm_clk_get_enabled(struct device *dev, const char *id);
+
 /**
  * devm_clk_get_optional - lookup and obtain a managed reference to an optional
  *			   clock producer.
@@ -482,6 +523,50 @@ struct clk *devm_clk_get(struct device *dev, const char *id);
  */
 struct clk *devm_clk_get_optional(struct device *dev, const char *id);
 
+/**
+ * devm_clk_get_optional_prepared - devm_clk_get_optional() + clk_prepare()
+ * @dev: device for clock "consumer"
+ * @id: clock consumer ID
+ *
+ * Context: May sleep.
+ *
+ * Return: a struct clk corresponding to the clock producer, or
+ * valid IS_ERR() condition containing errno.  The implementation
+ * uses @dev and @id to determine the clock consumer, and thereby
+ * the clock producer.  If no such clk is found, it returns NULL
+ * which serves as a dummy clk.  That's the only difference compared
+ * to devm_clk_get_prepared().
+ *
+ * The returned clk (if valid) is prepared. Drivers must however
+ * assume that the clock is not enabled.
+ *
+ * The clock will automatically be unprepared and freed when the
+ * device is unbound from the bus.
+ */
+struct clk *devm_clk_get_optional_prepared(struct device *dev, const char *id);
+
+/**
+ * devm_clk_get_optional_enabled - devm_clk_get_optional() +
+ *                                 clk_prepare_enable()
+ * @dev: device for clock "consumer"
+ * @id: clock consumer ID
+ *
+ * Context: May sleep.
+ *
+ * Return: a struct clk corresponding to the clock producer, or
+ * valid IS_ERR() condition containing errno.  The implementation
+ * uses @dev and @id to determine the clock consumer, and thereby
+ * the clock producer.  If no such clk is found, it returns NULL
+ * which serves as a dummy clk.  That's the only difference compared
+ * to devm_clk_get_enabled().
+ *
+ * The returned clk (if valid) is prepared and enabled.
+ *
+ * The clock will automatically be disabled, unprepared and freed
+ * when the device is unbound from the bus.
+ */
+struct clk *devm_clk_get_optional_enabled(struct device *dev, const char *id);
+
 /**
  * devm_get_clk_from_child - lookup and obtain a managed reference to a
  *			     clock producer from child node.
@@ -826,12 +911,36 @@ static inline struct clk *devm_clk_get(struct device *dev, const char *id)
 	return NULL;
 }
 
+static inline struct clk *devm_clk_get_prepared(struct device *dev,
+						const char *id)
+{
+	return NULL;
+}
+
+static inline struct clk *devm_clk_get_enabled(struct device *dev,
+					       const char *id)
+{
+	return NULL;
+}
+
 static inline struct clk *devm_clk_get_optional(struct device *dev,
 						const char *id)
 {
 	return NULL;
 }
 
+static inline struct clk *devm_clk_get_optional_prepared(struct device *dev,
+							 const char *id)
+{
+	return NULL;
+}
+
+static inline struct clk *devm_clk_get_optional_enabled(struct device *dev,
+							const char *id)
+{
+	return NULL;
+}
+
 static inline int __must_check devm_clk_bulk_get(struct device *dev, int num_clks,
 						 struct clk_bulk_data *clks)
 {
-- 
cgit 


From 925d046e7e52c71c3531199ce137e141807ef740 Mon Sep 17 00:00:00 2001
From: Patrisious Haddad <phaddad@nvidia.com>
Date: Tue, 7 Jun 2022 14:32:44 +0300
Subject: RDMA/core: Add a netevent notifier to cma

Add a netevent callback for cma, mainly to catch NETEVENT_NEIGH_UPDATE.

Previously, when a system with failover MAC mechanism change its MAC address
during a CM connection attempt, the RDMA-CM would take a lot of time till
it disconnects and timesout due to the incorrect MAC address.

Now when we get a NETEVENT_NEIGH_UPDATE we check if it is due to a failover
MAC change and if so, we instantly destroy the CM and notify the user in order
to spare the unnecessary waiting for the timeout.

Link: https://lore.kernel.org/r/bb255c9e301cd50b905663b8e73f7f5133d0e4c5.1654601342.git.leonro@nvidia.com
Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Reviewed-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/core/cma.c | 81 +++++++++++++++++++++++++++++++++++++++++++
 include/rdma/rdma_cm.h        |  1 +
 2 files changed, 82 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 0a17b1bb9547..46d06678dfbe 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -21,6 +21,7 @@
 
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <net/netevent.h>
 #include <net/tcp.h>
 #include <net/ipv6.h>
 #include <net/ip_fib.h>
@@ -5047,10 +5048,87 @@ out:
 	return ret;
 }
 
+static void cma_netevent_work_handler(struct work_struct *_work)
+{
+	struct rdma_id_private *id_priv =
+		container_of(_work, struct rdma_id_private, id.net_work);
+	struct rdma_cm_event event = {};
+
+	mutex_lock(&id_priv->handler_mutex);
+
+	if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING ||
+	    READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL)
+		goto out_unlock;
+
+	event.event = RDMA_CM_EVENT_UNREACHABLE;
+	event.status = -ETIMEDOUT;
+
+	if (cma_cm_event_handler(id_priv, &event)) {
+		__acquire(&id_priv->handler_mutex);
+		id_priv->cm_id.ib = NULL;
+		cma_id_put(id_priv);
+		destroy_id_handler_unlock(id_priv);
+		return;
+	}
+
+out_unlock:
+	mutex_unlock(&id_priv->handler_mutex);
+	cma_id_put(id_priv);
+}
+
+static int cma_netevent_callback(struct notifier_block *self,
+				 unsigned long event, void *ctx)
+{
+	struct id_table_entry *ips_node = NULL;
+	struct rdma_id_private *current_id;
+	struct neighbour *neigh = ctx;
+	unsigned long flags;
+
+	if (event != NETEVENT_NEIGH_UPDATE)
+		return NOTIFY_DONE;
+
+	spin_lock_irqsave(&id_table_lock, flags);
+	if (neigh->tbl->family == AF_INET6) {
+		struct sockaddr_in6 neigh_sock_6;
+
+		neigh_sock_6.sin6_family = AF_INET6;
+		neigh_sock_6.sin6_addr = *(struct in6_addr *)neigh->primary_key;
+		ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex,
+					     (struct sockaddr *)&neigh_sock_6);
+	} else if (neigh->tbl->family == AF_INET) {
+		struct sockaddr_in neigh_sock_4;
+
+		neigh_sock_4.sin_family = AF_INET;
+		neigh_sock_4.sin_addr.s_addr = *(__be32 *)(neigh->primary_key);
+		ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex,
+					     (struct sockaddr *)&neigh_sock_4);
+	} else
+		goto out;
+
+	if (!ips_node)
+		goto out;
+
+	list_for_each_entry(current_id, &ips_node->id_list, id_list_entry) {
+		if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr,
+			   neigh->ha, ETH_ALEN))
+			continue;
+		INIT_WORK(&current_id->id.net_work, cma_netevent_work_handler);
+		cma_id_get(current_id);
+		queue_work(cma_wq, &current_id->id.net_work);
+	}
+out:
+	spin_unlock_irqrestore(&id_table_lock, flags);
+	return NOTIFY_DONE;
+}
+
 static struct notifier_block cma_nb = {
 	.notifier_call = cma_netdev_callback
 };
 
+static struct notifier_block cma_netevent_cb = {
+	.notifier_call = cma_netevent_callback
+};
+
 static void cma_send_device_removal_put(struct rdma_id_private *id_priv)
 {
 	struct rdma_cm_event event = { .event = RDMA_CM_EVENT_DEVICE_REMOVAL };
@@ -5273,6 +5351,7 @@ static int __init cma_init(void)
 
 	ib_sa_register_client(&sa_client);
 	register_netdevice_notifier(&cma_nb);
+	register_netevent_notifier(&cma_netevent_cb);
 
 	ret = ib_register_client(&cma_client);
 	if (ret)
@@ -5287,6 +5366,7 @@ static int __init cma_init(void)
 err_ib:
 	ib_unregister_client(&cma_client);
 err:
+	unregister_netevent_notifier(&cma_netevent_cb);
 	unregister_netdevice_notifier(&cma_nb);
 	ib_sa_unregister_client(&sa_client);
 	unregister_pernet_subsys(&cma_pernet_operations);
@@ -5299,6 +5379,7 @@ static void __exit cma_cleanup(void)
 {
 	cma_configfs_exit();
 	ib_unregister_client(&cma_client);
+	unregister_netevent_notifier(&cma_netevent_cb);
 	unregister_netdevice_notifier(&cma_nb);
 	ib_sa_unregister_client(&sa_client);
 	unregister_pernet_subsys(&cma_pernet_operations);
diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h
index d989f030fae0..5b18e2e36ee6 100644
--- a/include/rdma/rdma_cm.h
+++ b/include/rdma/rdma_cm.h
@@ -108,6 +108,7 @@ struct rdma_cm_id {
 	enum rdma_ucm_port_space ps;
 	enum ib_qp_type		 qp_type;
 	u32			 port_num;
+	struct work_struct net_work;
 };
 
 struct rdma_cm_id *
-- 
cgit 


From 5a0e4529d9aee8ce348f628ad476c9ddb6cf457d Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Tue, 24 May 2022 10:21:52 -0500
Subject: dmaengine: dw-edma: Remove unused irq field in struct dw_edma_chip

The "irq" field of struct dw_edma_chip was never used. Remove it.

Link: https://lore.kernel.org/r/20220524152159.2370739-2-Frank.Li@nxp.com
Tested-by: Serge Semin <fancer.lancer@gmail.com>
Tested-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Frank Li <Frank.Li@nxp.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Acked-By: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/dw-edma/dw-edma-pcie.c | 1 -
 include/linux/dma/edma.h           | 2 --
 2 files changed, 3 deletions(-)

(limited to 'include')

diff --git a/drivers/dma/dw-edma/dw-edma-pcie.c b/drivers/dma/dw-edma/dw-edma-pcie.c
index cee7aa231d7b..bc07923c3bc0 100644
--- a/drivers/dma/dw-edma/dw-edma-pcie.c
+++ b/drivers/dma/dw-edma/dw-edma-pcie.c
@@ -214,7 +214,6 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 	chip->dw = dw;
 	chip->dev = dev;
 	chip->id = pdev->devfn;
-	chip->irq = pdev->irq;
 
 	dw->mf = vsec_data.mf;
 	dw->nr_irqs = nr_irqs;
diff --git a/include/linux/dma/edma.h b/include/linux/dma/edma.h
index cab6e18773da..d4333e721588 100644
--- a/include/linux/dma/edma.h
+++ b/include/linux/dma/edma.h
@@ -18,13 +18,11 @@ struct dw_edma;
  * struct dw_edma_chip - representation of DesignWare eDMA controller hardware
  * @dev:		 struct device of the eDMA controller
  * @id:			 instance ID
- * @irq:		 irq line
  * @dw:			 struct dw_edma that is filed by dw_edma_probe()
  */
 struct dw_edma_chip {
 	struct device		*dev;
 	int			id;
-	int			irq;
 	struct dw_edma		*dw;
 };
 
-- 
cgit 


From ee774c40fa32e3b2642bb2533bc54662ea8445c2 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa@kernel.org>
Date: Wed, 15 Jun 2022 23:07:19 +0200
Subject: dt-bindings: efm32: remove bindings for deleted platform

Commit cc6111375cec ("ARM: drop efm32 platform") removed the platform,
so no need to still carry the bindings.

Signed-off-by: Wolfram Sang <wsa@kernel.org>
Acked-by: Mark Brown <broonie@kernel.org>
Acked-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20220615210720.6363-1-wsa@kernel.org
---
 .../devicetree/bindings/clock/efm32-clock.txt      | 11 ------
 .../devicetree/bindings/i2c/i2c-efm32.txt          | 33 -----------------
 .../devicetree/bindings/serial/efm32-uart.txt      | 20 ----------
 .../devicetree/bindings/spi/efm32-spi.txt          | 39 --------------------
 include/dt-bindings/clock/efm32-cmu.h              | 43 ----------------------
 5 files changed, 146 deletions(-)
 delete mode 100644 Documentation/devicetree/bindings/clock/efm32-clock.txt
 delete mode 100644 Documentation/devicetree/bindings/i2c/i2c-efm32.txt
 delete mode 100644 Documentation/devicetree/bindings/serial/efm32-uart.txt
 delete mode 100644 Documentation/devicetree/bindings/spi/efm32-spi.txt
 delete mode 100644 include/dt-bindings/clock/efm32-cmu.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/efm32-clock.txt b/Documentation/devicetree/bindings/clock/efm32-clock.txt
deleted file mode 100644
index 263d293f6a10..000000000000
--- a/Documentation/devicetree/bindings/clock/efm32-clock.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-* Clock bindings for Energy Micro efm32 Giant Gecko's Clock Management Unit
-
-Required properties:
-- compatible: Should be "efm32gg,cmu"
-- reg: Base address and length of the register set
-- interrupts: Interrupt used by the CMU
-- #clock-cells: Should be <1>
-
-The clock consumer should specify the desired clock by having the clock ID in
-its "clocks" phandle cell. The header efm32-clk.h contains a list of available
-IDs.
diff --git a/Documentation/devicetree/bindings/i2c/i2c-efm32.txt b/Documentation/devicetree/bindings/i2c/i2c-efm32.txt
deleted file mode 100644
index 3b30e54ae3c7..000000000000
--- a/Documentation/devicetree/bindings/i2c/i2c-efm32.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-* Energymicro efm32 i2c controller
-
-Required properties :
-
- - reg : Offset and length of the register set for the device
- - compatible : should be "energymicro,efm32-i2c"
- - interrupts : the interrupt number
- - clocks : reference to the module clock
-
-Recommended properties :
-
- - clock-frequency : maximal I2C bus clock frequency in Hz.
- - energymicro,location : Decides the location of the USART I/O pins.
-   Allowed range : [0 .. 6]
-
-Example:
-	i2c0: i2c@4000a000 {
-		#address-cells = <1>;
-		#size-cells = <0>;
-		compatible = "energymicro,efm32-i2c";
-		reg = <0x4000a000 0x400>;
-		interrupts = <9>;
-		clocks = <&cmu clk_HFPERCLKI2C0>;
-		clock-frequency = <100000>;
-		energymicro,location = <3>;
-
-		eeprom@50 {
-			compatible = "microchip,24c02";
-			reg = <0x50>;
-			pagesize = <16>;
-		};
-	};
-
diff --git a/Documentation/devicetree/bindings/serial/efm32-uart.txt b/Documentation/devicetree/bindings/serial/efm32-uart.txt
deleted file mode 100644
index 4f8d8fde0c1c..000000000000
--- a/Documentation/devicetree/bindings/serial/efm32-uart.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-* Energymicro efm32 UART
-
-Required properties:
-- compatible : Should be "energymicro,efm32-uart"
-- reg : Address and length of the register set
-- interrupts : Should contain uart interrupt
-
-Optional properties:
-- energymicro,location : Decides the location of the USART I/O pins.
-  Allowed range : [0 .. 5]
-  Default: 0
-
-Example:
-
-uart@4000c400 {
-	compatible = "energymicro,efm32-uart";
-	reg = <0x4000c400 0x400>;
-	interrupts = <15>;
-	energymicro,location = <0>;
-};
diff --git a/Documentation/devicetree/bindings/spi/efm32-spi.txt b/Documentation/devicetree/bindings/spi/efm32-spi.txt
deleted file mode 100644
index e0fa61a1be0c..000000000000
--- a/Documentation/devicetree/bindings/spi/efm32-spi.txt
+++ /dev/null
@@ -1,39 +0,0 @@
-* Energy Micro EFM32 SPI
-
-Required properties:
-- #address-cells: see spi-bus.txt
-- #size-cells: see spi-bus.txt
-- compatible: should be "energymicro,efm32-spi"
-- reg: Offset and length of the register set for the controller
-- interrupts: pair specifying rx and tx irq
-- clocks: phandle to the spi clock
-- cs-gpios: see spi-bus.txt
-
-Recommended properties :
-- energymicro,location: Value to write to the ROUTE register's LOCATION
-                        bitfield to configure the pinmux for the device, see
-                        datasheet for values.
-                        If this property is not provided, keeping what is
-                        already configured in the hardware, so its either the
-                        reset default 0 or whatever the bootloader did.
-
-Example:
-
-spi1: spi@4000c400 { /* USART1 */
-	#address-cells = <1>;
-	#size-cells = <0>;
-	compatible = "energymicro,efm32-spi";
-	reg = <0x4000c400 0x400>;
-	interrupts = <15 16>;
-	clocks = <&cmu 20>;
-	cs-gpios = <&gpio 51 1>; // D3
-	energymicro,location = <1>;
-
-	ks8851@0 {
-		compatible = "ks8851";
-		spi-max-frequency = <6000000>;
-		reg = <0>;
-		interrupt-parent = <&boardfpga>;
-		interrupts = <4>;
-	};
-};
diff --git a/include/dt-bindings/clock/efm32-cmu.h b/include/dt-bindings/clock/efm32-cmu.h
deleted file mode 100644
index 4b48d15fe194..000000000000
--- a/include/dt-bindings/clock/efm32-cmu.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __DT_BINDINGS_CLOCK_EFM32_CMU_H
-#define __DT_BINDINGS_CLOCK_EFM32_CMU_H
-
-#define clk_HFXO		0
-#define clk_HFRCO		1
-#define clk_LFXO		2
-#define clk_LFRCO		3
-#define clk_ULFRCO		4
-#define clk_AUXHFRCO		5
-#define clk_HFCLKNODIV		6
-#define clk_HFCLK		7
-#define clk_HFPERCLK		8
-#define clk_HFCORECLK		9
-#define clk_LFACLK		10
-#define clk_LFBCLK		11
-#define clk_WDOGCLK		12
-#define clk_HFCORECLKDMA	13
-#define clk_HFCORECLKAES	14
-#define clk_HFCORECLKUSBC	15
-#define clk_HFCORECLKUSB	16
-#define clk_HFCORECLKLE		17
-#define clk_HFCORECLKEBI	18
-#define clk_HFPERCLKUSART0	19
-#define clk_HFPERCLKUSART1	20
-#define clk_HFPERCLKUSART2	21
-#define clk_HFPERCLKUART0	22
-#define clk_HFPERCLKUART1	23
-#define clk_HFPERCLKTIMER0	24
-#define clk_HFPERCLKTIMER1	25
-#define clk_HFPERCLKTIMER2	26
-#define clk_HFPERCLKTIMER3	27
-#define clk_HFPERCLKACMP0	28
-#define clk_HFPERCLKACMP1	29
-#define clk_HFPERCLKI2C0	30
-#define clk_HFPERCLKI2C1	31
-#define clk_HFPERCLKGPIO	32
-#define clk_HFPERCLKVCMP	33
-#define clk_HFPERCLKPRS		34
-#define clk_HFPERCLKADC0	35
-#define clk_HFPERCLKDAC0	36
-
-#endif /* __DT_BINDINGS_CLOCK_EFM32_CMU_H */
-- 
cgit 


From 07fd5b6cdf3cc30bfde8fe0f644771688be04447 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 13 Jun 2022 12:19:50 -1000
Subject: cgroup: Use separate src/dst nodes when preloading css_sets for
 migration

Each cset (css_set) is pinned by its tasks. When we're moving tasks around
across csets for a migration, we need to hold the source and destination
csets to ensure that they don't go away while we're moving tasks about. This
is done by linking cset->mg_preload_node on either the
mgctx->preloaded_src_csets or mgctx->preloaded_dst_csets list. Using the
same cset->mg_preload_node for both the src and dst lists was deemed okay as
a cset can't be both the source and destination at the same time.

Unfortunately, this overloading becomes problematic when multiple tasks are
involved in a migration and some of them are identity noop migrations while
others are actually moving across cgroups. For example, this can happen with
the following sequence on cgroup1:

 #1> mkdir -p /sys/fs/cgroup/misc/a/b
 #2> echo $$ > /sys/fs/cgroup/misc/a/cgroup.procs
 #3> RUN_A_COMMAND_WHICH_CREATES_MULTIPLE_THREADS &
 #4> PID=$!
 #5> echo $PID > /sys/fs/cgroup/misc/a/b/tasks
 #6> echo $PID > /sys/fs/cgroup/misc/a/cgroup.procs

the process including the group leader back into a. In this final migration,
non-leader threads would be doing identity migration while the group leader
is doing an actual one.

After #3, let's say the whole process was in cset A, and that after #4, the
leader moves to cset B. Then, during #6, the following happens:

 1. cgroup_migrate_add_src() is called on B for the leader.

 2. cgroup_migrate_add_src() is called on A for the other threads.

 3. cgroup_migrate_prepare_dst() is called. It scans the src list.

 4. It notices that B wants to migrate to A, so it tries to A to the dst
    list but realizes that its ->mg_preload_node is already busy.

 5. and then it notices A wants to migrate to A as it's an identity
    migration, it culls it by list_del_init()'ing its ->mg_preload_node and
    putting references accordingly.

 6. The rest of migration takes place with B on the src list but nothing on
    the dst list.

This means that A isn't held while migration is in progress. If all tasks
leave A before the migration finishes and the incoming task pins it, the
cset will be destroyed leading to use-after-free.

This is caused by overloading cset->mg_preload_node for both src and dst
preload lists. We wanted to exclude the cset from the src list but ended up
inadvertently excluding it from the dst list too.

This patch fixes the issue by separating out cset->mg_preload_node into
->mg_src_preload_node and ->mg_dst_preload_node, so that the src and dst
preloadings don't interfere with each other.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Mukesh Ojha <quic_mojha@quicinc.com>
Reported-by: shisiyuan <shisiyuan19870131@gmail.com>
Link: http://lkml.kernel.org/r/1654187688-27411-1-git-send-email-shisiyuan@xiaomi.com
Link: https://www.spinics.net/lists/cgroups/msg33313.html
Fixes: f817de98513d ("cgroup: prepare migration path for unified hierarchy")
Cc: stable@vger.kernel.org # v3.16+
---
 include/linux/cgroup-defs.h |  3 ++-
 kernel/cgroup/cgroup.c      | 37 +++++++++++++++++++++++--------------
 2 files changed, 25 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1bfcfb1af352..d4427d0a0e18 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -264,7 +264,8 @@ struct css_set {
 	 * List of csets participating in the on-going migration either as
 	 * source or destination.  Protected by cgroup_mutex.
 	 */
-	struct list_head mg_preload_node;
+	struct list_head mg_src_preload_node;
+	struct list_head mg_dst_preload_node;
 	struct list_head mg_node;
 
 	/*
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1779ccddb734..13c8e91d7862 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -765,7 +765,8 @@ struct css_set init_css_set = {
 	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
 	.threaded_csets		= LIST_HEAD_INIT(init_css_set.threaded_csets),
 	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
-	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
+	.mg_src_preload_node	= LIST_HEAD_INIT(init_css_set.mg_src_preload_node),
+	.mg_dst_preload_node	= LIST_HEAD_INIT(init_css_set.mg_dst_preload_node),
 	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
 
 	/*
@@ -1240,7 +1241,8 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	INIT_LIST_HEAD(&cset->threaded_csets);
 	INIT_HLIST_NODE(&cset->hlist);
 	INIT_LIST_HEAD(&cset->cgrp_links);
-	INIT_LIST_HEAD(&cset->mg_preload_node);
+	INIT_LIST_HEAD(&cset->mg_src_preload_node);
+	INIT_LIST_HEAD(&cset->mg_dst_preload_node);
 	INIT_LIST_HEAD(&cset->mg_node);
 
 	/* Copy the set of subsystem state objects generated in
@@ -2597,21 +2599,27 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
  */
 void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
 {
-	LIST_HEAD(preloaded);
 	struct css_set *cset, *tmp_cset;
 
 	lockdep_assert_held(&cgroup_mutex);
 
 	spin_lock_irq(&css_set_lock);
 
-	list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
-	list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
+	list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,
+				 mg_src_preload_node) {
+		cset->mg_src_cgrp = NULL;
+		cset->mg_dst_cgrp = NULL;
+		cset->mg_dst_cset = NULL;
+		list_del_init(&cset->mg_src_preload_node);
+		put_css_set_locked(cset);
+	}
 
-	list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
+	list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,
+				 mg_dst_preload_node) {
 		cset->mg_src_cgrp = NULL;
 		cset->mg_dst_cgrp = NULL;
 		cset->mg_dst_cset = NULL;
-		list_del_init(&cset->mg_preload_node);
+		list_del_init(&cset->mg_dst_preload_node);
 		put_css_set_locked(cset);
 	}
 
@@ -2651,7 +2659,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
 	if (src_cset->dead)
 		return;
 
-	if (!list_empty(&src_cset->mg_preload_node))
+	if (!list_empty(&src_cset->mg_src_preload_node))
 		return;
 
 	src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
@@ -2664,7 +2672,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
 	src_cset->mg_src_cgrp = src_cgrp;
 	src_cset->mg_dst_cgrp = dst_cgrp;
 	get_css_set(src_cset);
-	list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
+	list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);
 }
 
 /**
@@ -2689,7 +2697,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
 
 	/* look up the dst cset for each src cset and link it to src */
 	list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
-				 mg_preload_node) {
+				 mg_src_preload_node) {
 		struct css_set *dst_cset;
 		struct cgroup_subsys *ss;
 		int ssid;
@@ -2708,7 +2716,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
 		if (src_cset == dst_cset) {
 			src_cset->mg_src_cgrp = NULL;
 			src_cset->mg_dst_cgrp = NULL;
-			list_del_init(&src_cset->mg_preload_node);
+			list_del_init(&src_cset->mg_src_preload_node);
 			put_css_set(src_cset);
 			put_css_set(dst_cset);
 			continue;
@@ -2716,8 +2724,8 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
 
 		src_cset->mg_dst_cset = dst_cset;
 
-		if (list_empty(&dst_cset->mg_preload_node))
-			list_add_tail(&dst_cset->mg_preload_node,
+		if (list_empty(&dst_cset->mg_dst_preload_node))
+			list_add_tail(&dst_cset->mg_dst_preload_node,
 				      &mgctx->preloaded_dst_csets);
 		else
 			put_css_set(dst_cset);
@@ -2963,7 +2971,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 		goto out_finish;
 
 	spin_lock_irq(&css_set_lock);
-	list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
+	list_for_each_entry(src_cset, &mgctx.preloaded_src_csets,
+			    mg_src_preload_node) {
 		struct task_struct *task, *ntask;
 
 		/* all tasks in src_csets need to be migrated */
-- 
cgit 


From cb2bf7c6e544d2df3a683d6012ac15a5276ef911 Mon Sep 17 00:00:00 2001
From: Alim Akhtar <alim.akhtar@samsung.com>
Date: Wed, 15 Jun 2022 17:42:03 +0530
Subject: scsi: ufs: host: ufs-exynos: Use already existing definition

UFS core already uses RX_MIN_ACTIVATETIME_CAPABILITY macro, let's use the
same in driver as well instead of having a different macro name for the
same offset.

Link: https://lore.kernel.org/r/20220615121204.16642-2-alim.akhtar@samsung.com
Reviewed-by: Chanho Park <chanho61.park@samsung.com>
Signed-off-by: Alim Akhtar <alim.akhtar@samsung.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/host/ufs-exynos.c | 5 +++--
 include/ufs/unipro.h          | 1 -
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/ufs/host/ufs-exynos.c b/drivers/ufs/host/ufs-exynos.c
index 04728b5da040..f971569bafc7 100644
--- a/drivers/ufs/host/ufs-exynos.c
+++ b/drivers/ufs/host/ufs-exynos.c
@@ -651,8 +651,9 @@ static void exynos_ufs_config_phy_cap_attr(struct exynos_ufs *ufs)
 
 			if (attr->rx_min_actv_time_cap)
 				ufshcd_dme_set(hba,
-					UIC_ARG_MIB_SEL(RX_MIN_ACTIVATETIME_CAP,
-						i), attr->rx_min_actv_time_cap);
+					UIC_ARG_MIB_SEL(
+					RX_MIN_ACTIVATETIME_CAPABILITY, i),
+					attr->rx_min_actv_time_cap);
 
 			if (attr->rx_hibern8_time_cap)
 				ufshcd_dme_set(hba,
diff --git a/include/ufs/unipro.h b/include/ufs/unipro.h
index 0521f887e3ac..ade92e8d3676 100644
--- a/include/ufs/unipro.h
+++ b/include/ufs/unipro.h
@@ -69,7 +69,6 @@
 #define RX_HS_G2_PREP_LENGTH_CAP		0x0096
 #define RX_HS_G3_PREP_LENGTH_CAP		0x0097
 #define RX_ADV_GRANULARITY_CAP			0x0098
-#define RX_MIN_ACTIVATETIME_CAP			0x008F
 #define RX_HIBERN8TIME_CAP			0x0092
 #define RX_ADV_HIBERN8TIME_CAP			0x0099
 #define RX_ADV_MIN_ACTIVATETIME_CAP		0x009A
-- 
cgit 


From c0d93b12f31c31748ca5d3349777c70f1e2a8228 Mon Sep 17 00:00:00 2001
From: Alim Akhtar <alim.akhtar@samsung.com>
Date: Wed, 15 Jun 2022 17:42:04 +0530
Subject: scsi: ufs: Rearrange addresses in increasing order

Rearrange all the unipro and mphy addresses in their increasing order.

Link: https://lore.kernel.org/r/20220615121204.16642-3-alim.akhtar@samsung.com
Reviewed-by: Chanho Park <chanho61.park@samsung.com>
Signed-off-by: Alim Akhtar <alim.akhtar@samsung.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/ufs/unipro.h | 102 +++++++++++++++++++++++++--------------------------
 1 file changed, 51 insertions(+), 51 deletions(-)

(limited to 'include')

diff --git a/include/ufs/unipro.h b/include/ufs/unipro.h
index ade92e8d3676..4cbfe9272787 100644
--- a/include/ufs/unipro.h
+++ b/include/ufs/unipro.h
@@ -38,6 +38,18 @@
 /*
  * M-RX Configuration Attributes
  */
+#define RX_HS_G1_SYNC_LENGTH_CAP		0x008B
+#define RX_HS_G1_PREP_LENGTH_CAP		0x008C
+#define RX_MIN_ACTIVATETIME_CAPABILITY		0x008F
+#define RX_HIBERN8TIME_CAPABILITY		0x0092
+#define RX_HS_G2_SYNC_LENGTH_CAP		0x0094
+#define RX_HS_G3_SYNC_LENGTH_CAP		0x0095
+#define RX_HS_G2_PREP_LENGTH_CAP		0x0096
+#define RX_HS_G3_PREP_LENGTH_CAP		0x0097
+#define RX_ADV_GRANULARITY_CAP			0x0098
+#define RX_HIBERN8TIME_CAP			0x0092
+#define RX_ADV_HIBERN8TIME_CAP			0x0099
+#define RX_ADV_MIN_ACTIVATETIME_CAP		0x009A
 #define RX_MODE					0x00A1
 #define RX_HSRATE_SERIES			0x00A2
 #define RX_HSGEAR				0x00A3
@@ -47,31 +59,19 @@
 #define RX_ENTER_HIBERN8			0x00A7
 #define RX_BYPASS_8B10B_ENABLE			0x00A8
 #define RX_TERMINATION_FORCE_ENABLE		0x00A9
-#define RX_MIN_ACTIVATETIME_CAPABILITY		0x008F
-#define RX_HIBERN8TIME_CAPABILITY		0x0092
+#define RXCALCTRL				0x00B4
+#define RXSQCTRL				0x00B5
+#define CFGRXCDR8				0x00BA
+#define CFGRXOVR8				0x00BD
+#define CFGRXOVR6				0x00BF
+#define RXDIRECTCTRL2				0x00C7
+#define CFGRXOVR4				0x00E9
 #define RX_REFCLKFREQ				0x00EB
 #define	RX_CFGCLKFREQVAL			0x00EC
 #define CFGWIDEINLN				0x00F0
-#define CFGRXCDR8				0x00BA
 #define ENARXDIRECTCFG4				0x00F2
-#define CFGRXOVR8				0x00BD
-#define RXDIRECTCTRL2				0x00C7
 #define ENARXDIRECTCFG3				0x00F3
-#define RXCALCTRL				0x00B4
 #define ENARXDIRECTCFG2				0x00F4
-#define CFGRXOVR4				0x00E9
-#define RXSQCTRL				0x00B5
-#define CFGRXOVR6				0x00BF
-#define RX_HS_G1_SYNC_LENGTH_CAP		0x008B
-#define RX_HS_G1_PREP_LENGTH_CAP		0x008C
-#define RX_HS_G2_SYNC_LENGTH_CAP		0x0094
-#define RX_HS_G3_SYNC_LENGTH_CAP		0x0095
-#define RX_HS_G2_PREP_LENGTH_CAP		0x0096
-#define RX_HS_G3_PREP_LENGTH_CAP		0x0097
-#define RX_ADV_GRANULARITY_CAP			0x0098
-#define RX_HIBERN8TIME_CAP			0x0092
-#define RX_ADV_HIBERN8TIME_CAP			0x0099
-#define RX_ADV_MIN_ACTIVATETIME_CAP		0x009A
 
 
 #define is_mphy_tx_attr(attr)			(attr < RX_MODE)
@@ -102,47 +102,50 @@
 /*
  * PHY Adapter attributes
  */
-#define PA_ACTIVETXDATALANES	0x1560
-#define PA_ACTIVERXDATALANES	0x1580
-#define PA_TXTRAILINGCLOCKS	0x1564
 #define PA_PHY_TYPE		0x1500
 #define PA_AVAILTXDATALANES	0x1520
-#define PA_AVAILRXDATALANES	0x1540
-#define PA_MINRXTRAILINGCLOCKS	0x1543
-#define PA_TXPWRSTATUS		0x1567
-#define PA_RXPWRSTATUS		0x1582
-#define PA_TXFORCECLOCK		0x1562
-#define PA_TXPWRMODE		0x1563
-#define PA_LEGACYDPHYESCDL	0x1570
 #define PA_MAXTXSPEEDFAST	0x1521
 #define PA_MAXTXSPEEDSLOW	0x1522
 #define PA_MAXRXSPEEDFAST	0x1541
 #define PA_MAXRXSPEEDSLOW	0x1542
 #define PA_TXLINKSTARTUPHS	0x1544
+#define PA_AVAILRXDATALANES	0x1540
+#define PA_MINRXTRAILINGCLOCKS	0x1543
 #define PA_LOCAL_TX_LCC_ENABLE	0x155E
+#define PA_ACTIVETXDATALANES	0x1560
+#define PA_CONNECTEDTXDATALANES	0x1561
+#define PA_TXFORCECLOCK		0x1562
+#define PA_TXPWRMODE		0x1563
+#define PA_TXTRAILINGCLOCKS	0x1564
 #define PA_TXSPEEDFAST		0x1565
 #define PA_TXSPEEDSLOW		0x1566
-#define PA_REMOTEVERINFO	0x15A0
+#define PA_TXPWRSTATUS		0x1567
 #define PA_TXGEAR		0x1568
 #define PA_TXTERMINATION	0x1569
 #define PA_HSSERIES		0x156A
+#define PA_LEGACYDPHYESCDL	0x1570
 #define PA_PWRMODE		0x1571
+#define PA_ACTIVERXDATALANES	0x1580
+#define PA_CONNECTEDRXDATALANES	0x1581
+#define PA_RXPWRSTATUS		0x1582
 #define PA_RXGEAR		0x1583
 #define PA_RXTERMINATION	0x1584
 #define PA_MAXRXPWMGEAR		0x1586
 #define PA_MAXRXHSGEAR		0x1587
+#define PA_PACPREQTIMEOUT	0x1590
+#define PA_PACPREQEOBTIMEOUT	0x1591
+#define PA_REMOTEVERINFO	0x15A0
+#define PA_LOGICALLANEMAP	0x15A1
+#define PA_SLEEPNOCONFIGTIME	0x15A2
+#define PA_STALLNOCONFIGTIME	0x15A3
+#define PA_SAVECONFIGTIME	0x15A4
 #define PA_RXHSUNTERMCAP	0x15A5
 #define PA_RXLSTERMCAP		0x15A6
 #define PA_GRANULARITY		0x15AA
-#define PA_PACPREQTIMEOUT	0x1590
-#define PA_PACPREQEOBTIMEOUT	0x1591
 #define PA_HIBERN8TIME		0x15A7
 #define PA_LOCALVERINFO		0x15A9
 #define PA_GRANULARITY		0x15AA
 #define PA_TACTIVATE		0x15A8
-#define PA_PACPFRAMECOUNT	0x15C0
-#define PA_PACPERRORCOUNT	0x15C1
-#define PA_PHYTESTCONTROL	0x15C2
 #define PA_PWRMODEUSERDATA0	0x15B0
 #define PA_PWRMODEUSERDATA1	0x15B1
 #define PA_PWRMODEUSERDATA2	0x15B2
@@ -155,12 +158,9 @@
 #define PA_PWRMODEUSERDATA9	0x15B9
 #define PA_PWRMODEUSERDATA10	0x15BA
 #define PA_PWRMODEUSERDATA11	0x15BB
-#define PA_CONNECTEDTXDATALANES	0x1561
-#define PA_CONNECTEDRXDATALANES	0x1581
-#define PA_LOGICALLANEMAP	0x15A1
-#define PA_SLEEPNOCONFIGTIME	0x15A2
-#define PA_STALLNOCONFIGTIME	0x15A3
-#define PA_SAVECONFIGTIME	0x15A4
+#define PA_PACPFRAMECOUNT	0x15C0
+#define PA_PACPERRORCOUNT	0x15C1
+#define PA_PHYTESTCONTROL	0x15C2
 #define PA_TXHSADAPTTYPE       0x15D4
 
 /* Adpat type for PA_TXHSADAPTTYPE attribute */
@@ -172,9 +172,9 @@
 #define PA_HIBERN8_TIME_UNIT_US		100
 
 /*Other attributes*/
+#define VS_POWERSTATE		0xD083
 #define VS_MPHYCFGUPDT		0xD085
 #define VS_DEBUGOMC		0xD09E
-#define VS_POWERSTATE		0xD083
 
 #define PA_GRANULARITY_MIN_VAL	1
 #define PA_GRANULARITY_MAX_VAL	6
@@ -245,27 +245,27 @@ enum ufs_unipro_ver {
 /*
  * Data Link Layer Attributes
  */
+#define DL_TXPREEMPTIONCAP	0x2000
+#define DL_TC0TXMAXSDUSIZE	0x2001
+#define DL_TC0RXINITCREDITVAL	0x2002
+#define DL_TC1TXMAXSDUSIZE	0x2003
+#define DL_TC1RXINITCREDITVAL	0x2004
+#define DL_TC0TXBUFFERSIZE	0x2005
+#define DL_TC1TXBUFFERSIZE	0x2006
 #define DL_TC0TXFCTHRESHOLD	0x2040
 #define DL_FC0PROTTIMEOUTVAL	0x2041
 #define DL_TC0REPLAYTIMEOUTVAL	0x2042
 #define DL_AFC0REQTIMEOUTVAL	0x2043
 #define DL_AFC0CREDITTHRESHOLD	0x2044
 #define DL_TC0OUTACKTHRESHOLD	0x2045
+#define DL_PEERTC0PRESENT	0x2046
+#define DL_PEERTC0RXINITCREVAL	0x2047
 #define DL_TC1TXFCTHRESHOLD	0x2060
 #define DL_FC1PROTTIMEOUTVAL	0x2061
 #define DL_TC1REPLAYTIMEOUTVAL	0x2062
 #define DL_AFC1REQTIMEOUTVAL	0x2063
 #define DL_AFC1CREDITTHRESHOLD	0x2064
 #define DL_TC1OUTACKTHRESHOLD	0x2065
-#define DL_TXPREEMPTIONCAP	0x2000
-#define DL_TC0TXMAXSDUSIZE	0x2001
-#define DL_TC0RXINITCREDITVAL	0x2002
-#define DL_TC0TXBUFFERSIZE	0x2005
-#define DL_PEERTC0PRESENT	0x2046
-#define DL_PEERTC0RXINITCREVAL	0x2047
-#define DL_TC1TXMAXSDUSIZE	0x2003
-#define DL_TC1RXINITCREDITVAL	0x2004
-#define DL_TC1TXBUFFERSIZE	0x2006
 #define DL_PEERTC1PRESENT	0x2066
 #define DL_PEERTC1RXINITCREVAL	0x2067
 
-- 
cgit 


From fc53683b45b053d94a660c417fd3d63cabc43b6f Mon Sep 17 00:00:00 2001
From: Stanley Chu <stanley.chu@mediatek.com>
Date: Thu, 16 Jun 2022 13:37:15 +0800
Subject: scsi: ufs: Export ufshcd_uic_change_pwr_mode()

Export ufshcd_uic_change_pwr_mode() to allow vendors to use it for
SoC-specific power mode change design limitations.

Link: https://lore.kernel.org/r/20220616053725.5681-2-stanley.chu@mediatek.com
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Stanley Chu <stanley.chu@mediatek.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufshcd.c | 3 ++-
 include/ufs/ufshcd.h      | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index bb6cbd514a69..f484ba61472a 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -4093,7 +4093,7 @@ out_unlock:
  *
  * Returns 0 on success, non-zero value on failure
  */
-static int ufshcd_uic_change_pwr_mode(struct ufs_hba *hba, u8 mode)
+int ufshcd_uic_change_pwr_mode(struct ufs_hba *hba, u8 mode)
 {
 	struct uic_command uic_cmd = {0};
 	int ret;
@@ -4118,6 +4118,7 @@ static int ufshcd_uic_change_pwr_mode(struct ufs_hba *hba, u8 mode)
 out:
 	return ret;
 }
+EXPORT_SYMBOL_GPL(ufshcd_uic_change_pwr_mode);
 
 int ufshcd_link_recovery(struct ufs_hba *hba)
 {
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index 991aea081ec7..cb66304bda3d 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -1099,6 +1099,7 @@ extern int ufshcd_dme_get_attr(struct ufs_hba *hba, u32 attr_sel,
 			       u32 *mib_val, u8 peer);
 extern int ufshcd_config_pwr_mode(struct ufs_hba *hba,
 			struct ufs_pa_layer_attr *desired_pwr_mode);
+extern int ufshcd_uic_change_pwr_mode(struct ufs_hba *hba, u8 mode);
 
 /* UIC command interfaces for DME primitives */
 #define DME_LOCAL	0
-- 
cgit 


From 3f9b6cec12e2d2b3bdf69d47979b7877985c55e0 Mon Sep 17 00:00:00 2001
From: CC Chou <cc.chou@mediatek.com>
Date: Thu, 16 Jun 2022 13:37:17 +0800
Subject: scsi: ufs: ufs-mediatek: Introduce workaround for power mode change

Some MediaTek SoC chips need special flow for power mode change, especially
for chips supporting HS-G5.

Enable the workaround by setting the host-specific capability.

Link: https://lore.kernel.org/r/20220616053725.5681-4-stanley.chu@mediatek.com
Reviewed-by: Stanley Chu <stanley.chu@mediatek.com>
Signed-off-by: CC Chou <cc.chou@mediatek.com>
Signed-off-by: Eddie Huang <eddie.huang@mediatek.com>
Signed-off-by: Dennis Yu <tun-yu.yu@mediatek.com>
Signed-off-by: Peter Wang <peter.want@medaitek.com>
Signed-off-by: Stanley Chu <stanley.chu@mediatek.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/host/ufs-mediatek.c | 60 +++++++++++++++++++++++++++++++++++++++--
 drivers/ufs/host/ufs-mediatek.h |  1 +
 include/ufs/unipro.h            |  1 +
 3 files changed, 60 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/ufs/host/ufs-mediatek.c b/drivers/ufs/host/ufs-mediatek.c
index beabc3ccd30b..2931fd21e38a 100644
--- a/drivers/ufs/host/ufs-mediatek.c
+++ b/drivers/ufs/host/ufs-mediatek.c
@@ -82,6 +82,13 @@ static bool ufs_mtk_is_broken_vcc(struct ufs_hba *hba)
 	return !!(host->caps & UFS_MTK_CAP_BROKEN_VCC);
 }
 
+static bool ufs_mtk_is_pmc_via_fastauto(struct ufs_hba *hba)
+{
+	struct ufs_mtk_host *host = ufshcd_get_variant(hba);
+
+	return (host->caps & UFS_MTK_CAP_PMC_VIA_FASTAUTO);
+}
+
 static void ufs_mtk_cfg_unipro_cg(struct ufs_hba *hba, bool enable)
 {
 	u32 tmp;
@@ -579,6 +586,9 @@ static void ufs_mtk_init_host_caps(struct ufs_hba *hba)
 	if (of_property_read_bool(np, "mediatek,ufs-broken-vcc"))
 		host->caps |= UFS_MTK_CAP_BROKEN_VCC;
 
+	if (of_property_read_bool(np, "mediatek,ufs-pmc-via-fastauto"))
+		host->caps |= UFS_MTK_CAP_PMC_VIA_FASTAUTO;
+
 	dev_info(hba->dev, "caps: 0x%x", host->caps);
 }
 
@@ -754,6 +764,26 @@ out:
 	return err;
 }
 
+static bool ufs_mtk_pmc_via_fastauto(struct ufs_hba *hba,
+				     struct ufs_pa_layer_attr *dev_req_params)
+{
+	if (!ufs_mtk_is_pmc_via_fastauto(hba))
+		return false;
+
+	if (dev_req_params->hs_rate == hba->pwr_info.hs_rate)
+		return false;
+
+	if (dev_req_params->pwr_tx != FAST_MODE &&
+	    dev_req_params->gear_tx < UFS_HS_G4)
+		return false;
+
+	if (dev_req_params->pwr_rx != FAST_MODE &&
+	    dev_req_params->gear_rx < UFS_HS_G4)
+		return false;
+
+	return true;
+}
+
 static int ufs_mtk_pre_pwr_change(struct ufs_hba *hba,
 				  struct ufs_pa_layer_attr *dev_max_params,
 				  struct ufs_pa_layer_attr *dev_req_params)
@@ -763,8 +793,8 @@ static int ufs_mtk_pre_pwr_change(struct ufs_hba *hba,
 	int ret;
 
 	ufshcd_init_pwr_dev_param(&host_cap);
-	host_cap.hs_rx_gear = UFS_HS_G4;
-	host_cap.hs_tx_gear = UFS_HS_G4;
+	host_cap.hs_rx_gear = UFS_HS_G5;
+	host_cap.hs_tx_gear = UFS_HS_G5;
 
 	ret = ufshcd_get_pwr_dev_param(&host_cap,
 				       dev_max_params,
@@ -774,6 +804,32 @@ static int ufs_mtk_pre_pwr_change(struct ufs_hba *hba,
 			__func__);
 	}
 
+	if (ufs_mtk_pmc_via_fastauto(hba, dev_req_params)) {
+		ufshcd_dme_set(hba, UIC_ARG_MIB(PA_TXTERMINATION), true);
+		ufshcd_dme_set(hba, UIC_ARG_MIB(PA_TXGEAR), UFS_HS_G1);
+
+		ufshcd_dme_set(hba, UIC_ARG_MIB(PA_RXTERMINATION), true);
+		ufshcd_dme_set(hba, UIC_ARG_MIB(PA_RXGEAR), UFS_HS_G1);
+
+		ufshcd_dme_set(hba, UIC_ARG_MIB(PA_ACTIVETXDATALANES),
+			       dev_req_params->lane_tx);
+		ufshcd_dme_set(hba, UIC_ARG_MIB(PA_ACTIVERXDATALANES),
+			       dev_req_params->lane_rx);
+		ufshcd_dme_set(hba, UIC_ARG_MIB(PA_HSSERIES),
+			       dev_req_params->hs_rate);
+
+		ufshcd_dme_set(hba, UIC_ARG_MIB(PA_TXHSADAPTTYPE),
+			       PA_NO_ADAPT);
+
+		ret = ufshcd_uic_change_pwr_mode(hba,
+					FASTAUTO_MODE << 4 | FASTAUTO_MODE);
+
+		if (ret) {
+			dev_err(hba->dev, "%s: HSG1B FASTAUTO failed ret=%d\n",
+				__func__, ret);
+		}
+	}
+
 	if (host->hw_ver.major >= 3) {
 		ret = ufshcd_dme_configure_adapt(hba,
 					   dev_req_params->gear_tx,
diff --git a/drivers/ufs/host/ufs-mediatek.h b/drivers/ufs/host/ufs-mediatek.h
index 414dca86c09f..7e1913769671 100644
--- a/drivers/ufs/host/ufs-mediatek.h
+++ b/drivers/ufs/host/ufs-mediatek.h
@@ -108,6 +108,7 @@ enum ufs_mtk_host_caps {
 	UFS_MTK_CAP_VA09_PWR_CTRL              = 1 << 1,
 	UFS_MTK_CAP_DISABLE_AH8                = 1 << 2,
 	UFS_MTK_CAP_BROKEN_VCC                 = 1 << 3,
+	UFS_MTK_CAP_PMC_VIA_FASTAUTO           = 1 << 6,
 };
 
 struct ufs_mtk_crypt_cfg {
diff --git a/include/ufs/unipro.h b/include/ufs/unipro.h
index 4cbfe9272787..6c553f98fe57 100644
--- a/include/ufs/unipro.h
+++ b/include/ufs/unipro.h
@@ -228,6 +228,7 @@ enum ufs_hs_gear_tag {
 	UFS_HS_G2,		/* HS Gear 2 */
 	UFS_HS_G3,		/* HS Gear 3 */
 	UFS_HS_G4,		/* HS Gear 4 */
+	UFS_HS_G5		/* HS Gear 5 */
 };
 
 enum ufs_unipro_ver {
-- 
cgit 


From 1d6f9decb60a23cde2e0fbe0f89d5fc6d462ddd5 Mon Sep 17 00:00:00 2001
From: Stanley Chu <stanley.chu@mediatek.com>
Date: Thu, 16 Jun 2022 13:37:23 +0800
Subject: scsi: ufs: Export regulator functions

Export below regulator functions to allow vendors to
customize regulator configuration in their own platforms.

int ufshcd_populate_vreg(struct device *dev, const char *name,
                         struct ufs_vreg **out_vreg);
int ufshcd_get_vreg(struct device *dev, struct ufs_vreg *vreg);

Link: https://lore.kernel.org/r/20220616053725.5681-10-stanley.chu@mediatek.com
Signed-off-by: Stanley Chu <stanley.chu@mediatek.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufshcd.c        | 3 ++-
 drivers/ufs/host/ufshcd-pltfrm.c | 5 +++--
 drivers/ufs/host/ufshcd-pltfrm.h | 2 ++
 include/ufs/ufshcd.h             | 2 ++
 4 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 295a900d39a3..cdea7edeb14d 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -8415,7 +8415,7 @@ static int ufshcd_setup_hba_vreg(struct ufs_hba *hba, bool on)
 	return ufshcd_toggle_vreg(hba->dev, info->vdd_hba, on);
 }
 
-static int ufshcd_get_vreg(struct device *dev, struct ufs_vreg *vreg)
+int ufshcd_get_vreg(struct device *dev, struct ufs_vreg *vreg)
 {
 	int ret = 0;
 
@@ -8431,6 +8431,7 @@ static int ufshcd_get_vreg(struct device *dev, struct ufs_vreg *vreg)
 out:
 	return ret;
 }
+EXPORT_SYMBOL_GPL(ufshcd_get_vreg);
 
 static int ufshcd_init_vreg(struct ufs_hba *hba)
 {
diff --git a/drivers/ufs/host/ufshcd-pltfrm.c b/drivers/ufs/host/ufshcd-pltfrm.c
index e7332cc65b1f..2dd9c660531b 100644
--- a/drivers/ufs/host/ufshcd-pltfrm.c
+++ b/drivers/ufs/host/ufshcd-pltfrm.c
@@ -109,8 +109,8 @@ out:
 }
 
 #define MAX_PROP_SIZE 32
-static int ufshcd_populate_vreg(struct device *dev, const char *name,
-		struct ufs_vreg **out_vreg)
+int ufshcd_populate_vreg(struct device *dev, const char *name,
+			 struct ufs_vreg **out_vreg)
 {
 	char prop_name[MAX_PROP_SIZE];
 	struct ufs_vreg *vreg = NULL;
@@ -145,6 +145,7 @@ out:
 	*out_vreg = vreg;
 	return 0;
 }
+EXPORT_SYMBOL_GPL(ufshcd_populate_vreg);
 
 /**
  * ufshcd_parse_regulator_info - get regulator info from device tree
diff --git a/drivers/ufs/host/ufshcd-pltfrm.h b/drivers/ufs/host/ufshcd-pltfrm.h
index 43c2e412bd99..5130c9471dc2 100644
--- a/drivers/ufs/host/ufshcd-pltfrm.h
+++ b/drivers/ufs/host/ufshcd-pltfrm.h
@@ -32,5 +32,7 @@ void ufshcd_init_pwr_dev_param(struct ufs_dev_params *dev_param);
 int ufshcd_pltfrm_init(struct platform_device *pdev,
 		       const struct ufs_hba_variant_ops *vops);
 void ufshcd_pltfrm_shutdown(struct platform_device *pdev);
+int ufshcd_populate_vreg(struct device *dev, const char *name,
+			 struct ufs_vreg **out_vreg);
 
 #endif /* UFSHCD_PLTFRM_H_ */
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index cb66304bda3d..b5c9064a11d9 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -1199,6 +1199,8 @@ void ufshcd_map_desc_id_to_length(struct ufs_hba *hba, enum desc_idn desc_id,
 
 u32 ufshcd_get_local_unipro_ver(struct ufs_hba *hba);
 
+int ufshcd_get_vreg(struct device *dev, struct ufs_vreg *vreg);
+
 int ufshcd_send_uic_cmd(struct ufs_hba *hba, struct uic_command *uic_cmd);
 
 int ufshcd_exec_raw_upiu_cmd(struct ufs_hba *hba,
-- 
cgit 


From 6a33ed506416a74aea3071a72192fb06a3cc82df Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <mgurtovoy@nvidia.com>
Date: Thu, 16 Jun 2022 11:02:10 +0300
Subject: scsi: iscsi: Make iscsi_unregister_transport() return void

This function always returns 0. We can make it return void to simplify the
code. Also, no caller ever checks the return value of this function.

Link: https://lore.kernel.org/r/20220616080210.18531-1-mgurtovoy@nvidia.com
Reviewed-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/scsi_transport_iscsi.c | 4 +---
 include/scsi/scsi_transport_iscsi.h | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 2578db4c095d..360e73ee29d5 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -4807,7 +4807,7 @@ free_priv:
 }
 EXPORT_SYMBOL_GPL(iscsi_register_transport);
 
-int iscsi_unregister_transport(struct iscsi_transport *tt)
+void iscsi_unregister_transport(struct iscsi_transport *tt)
 {
 	struct iscsi_internal *priv;
 	unsigned long flags;
@@ -4830,8 +4830,6 @@ int iscsi_unregister_transport(struct iscsi_transport *tt)
 	sysfs_remove_group(&priv->dev.kobj, &iscsi_transport_group);
 	device_unregister(&priv->dev);
 	mutex_unlock(&rx_queue_mutex);
-
-	return 0;
 }
 EXPORT_SYMBOL_GPL(iscsi_unregister_transport);
 
diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h
index 9acb8422f680..695396a5f607 100644
--- a/include/scsi/scsi_transport_iscsi.h
+++ b/include/scsi/scsi_transport_iscsi.h
@@ -162,7 +162,7 @@ struct iscsi_transport {
  * transport registration upcalls
  */
 extern struct scsi_transport_template *iscsi_register_transport(struct iscsi_transport *tt);
-extern int iscsi_unregister_transport(struct iscsi_transport *tt);
+extern void iscsi_unregister_transport(struct iscsi_transport *tt);
 
 /*
  * control plane upcalls
-- 
cgit 


From d687f621c518d791b5fffde8add3112d869b0b1b Mon Sep 17 00:00:00 2001
From: Delyan Kratunov <delyank@fb.com>
Date: Tue, 14 Jun 2022 23:10:42 +0000
Subject: bpf: move bpf_prog to bpf.h

In order to add a version of bpf_prog_run_array which accesses the
bpf_prog->aux member, bpf_prog needs to be more than a forward
declaration inside bpf.h.

Given that filter.h already includes bpf.h, this merely reorders
the type declarations for filter.h users. bpf.h users now have access to
bpf_prog internals.

Signed-off-by: Delyan Kratunov <delyank@fb.com>
Link: https://lore.kernel.org/r/3ed7824e3948f22d84583649ccac0ff0d38b6b58.1655248076.git.delyank@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h    | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/filter.h | 34 ----------------------------------
 2 files changed, 36 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8e6092d0ea95..69106ae46464 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -5,6 +5,7 @@
 #define _LINUX_BPF_H 1
 
 #include <uapi/linux/bpf.h>
+#include <uapi/linux/filter.h>
 
 #include <linux/workqueue.h>
 #include <linux/file.h>
@@ -22,6 +23,7 @@
 #include <linux/sched/mm.h>
 #include <linux/slab.h>
 #include <linux/percpu-refcount.h>
+#include <linux/stddef.h>
 #include <linux/bpfptr.h>
 #include <linux/btf.h>
 
@@ -1084,6 +1086,40 @@ struct bpf_prog_aux {
 	};
 };
 
+struct bpf_prog {
+	u16			pages;		/* Number of allocated pages */
+	u16			jited:1,	/* Is our filter JIT'ed? */
+				jit_requested:1,/* archs need to JIT the prog */
+				gpl_compatible:1, /* Is filter GPL compatible? */
+				cb_access:1,	/* Is control block accessed? */
+				dst_needed:1,	/* Do we need dst entry? */
+				blinding_requested:1, /* needs constant blinding */
+				blinded:1,	/* Was blinded */
+				is_func:1,	/* program is a bpf function */
+				kprobe_override:1, /* Do we override a kprobe? */
+				has_callchain_buf:1, /* callchain buffer allocated? */
+				enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
+				call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
+				call_get_func_ip:1, /* Do we call get_func_ip() */
+				tstamp_type_access:1; /* Accessed __sk_buff->tstamp_type */
+	enum bpf_prog_type	type;		/* Type of BPF program */
+	enum bpf_attach_type	expected_attach_type; /* For some prog types */
+	u32			len;		/* Number of filter blocks */
+	u32			jited_len;	/* Size of jited insns in bytes */
+	u8			tag[BPF_TAG_SIZE];
+	struct bpf_prog_stats __percpu *stats;
+	int __percpu		*active;
+	unsigned int		(*bpf_func)(const void *ctx,
+					    const struct bpf_insn *insn);
+	struct bpf_prog_aux	*aux;		/* Auxiliary fields */
+	struct sock_fprog_kern	*orig_prog;	/* Original BPF program */
+	/* Instructions for interpreter */
+	union {
+		DECLARE_FLEX_ARRAY(struct sock_filter, insns);
+		DECLARE_FLEX_ARRAY(struct bpf_insn, insnsi);
+	};
+};
+
 struct bpf_array_aux {
 	/* Programs with direct jumps into programs part of this array. */
 	struct list_head poke_progs;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index ed0c0ff42ad5..d0cbb31b1b4d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -559,40 +559,6 @@ struct bpf_prog_stats {
 	struct u64_stats_sync syncp;
 } __aligned(2 * sizeof(u64));
 
-struct bpf_prog {
-	u16			pages;		/* Number of allocated pages */
-	u16			jited:1,	/* Is our filter JIT'ed? */
-				jit_requested:1,/* archs need to JIT the prog */
-				gpl_compatible:1, /* Is filter GPL compatible? */
-				cb_access:1,	/* Is control block accessed? */
-				dst_needed:1,	/* Do we need dst entry? */
-				blinding_requested:1, /* needs constant blinding */
-				blinded:1,	/* Was blinded */
-				is_func:1,	/* program is a bpf function */
-				kprobe_override:1, /* Do we override a kprobe? */
-				has_callchain_buf:1, /* callchain buffer allocated? */
-				enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
-				call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
-				call_get_func_ip:1, /* Do we call get_func_ip() */
-				tstamp_type_access:1; /* Accessed __sk_buff->tstamp_type */
-	enum bpf_prog_type	type;		/* Type of BPF program */
-	enum bpf_attach_type	expected_attach_type; /* For some prog types */
-	u32			len;		/* Number of filter blocks */
-	u32			jited_len;	/* Size of jited insns in bytes */
-	u8			tag[BPF_TAG_SIZE];
-	struct bpf_prog_stats __percpu *stats;
-	int __percpu		*active;
-	unsigned int		(*bpf_func)(const void *ctx,
-					    const struct bpf_insn *insn);
-	struct bpf_prog_aux	*aux;		/* Auxiliary fields */
-	struct sock_fprog_kern	*orig_prog;	/* Original BPF program */
-	/* Instructions for interpreter */
-	union {
-		DECLARE_FLEX_ARRAY(struct sock_filter, insns);
-		DECLARE_FLEX_ARRAY(struct bpf_insn, insnsi);
-	};
-};
-
 struct sk_filter {
 	refcount_t	refcnt;
 	struct rcu_head	rcu;
-- 
cgit 


From 8c7dcb84e3b744b2b70baa7a44a9b1881c33a9c9 Mon Sep 17 00:00:00 2001
From: Delyan Kratunov <delyank@fb.com>
Date: Tue, 14 Jun 2022 23:10:46 +0000
Subject: bpf: implement sleepable uprobes by chaining gps

uprobes work by raising a trap, setting a task flag from within the
interrupt handler, and processing the actual work for the uprobe on the
way back to userspace. As a result, uprobe handlers already execute in a
might_fault/_sleep context. The primary obstacle to sleepable bpf uprobe
programs is therefore on the bpf side.

Namely, the bpf_prog_array attached to the uprobe is protected by normal
rcu. In order for uprobe bpf programs to become sleepable, it has to be
protected by the tasks_trace rcu flavor instead (and kfree() called after
a corresponding grace period).

Therefore, the free path for bpf_prog_array now chains a tasks_trace and
normal grace periods one after the other.

Users who iterate under tasks_trace read section would
be safe, as would users who iterate under normal read sections (from
non-sleepable locations).

The downside is that the tasks_trace latency affects all perf_event-attached
bpf programs (and not just uprobe ones). This is deemed safe given the
possible attach rates for kprobe/uprobe/tp programs.

Separately, non-sleepable programs need access to dynamically sized
rcu-protected maps, so bpf_run_prog_array_sleepables now conditionally takes
an rcu read section, in addition to the overarching tasks_trace section.

Signed-off-by: Delyan Kratunov <delyank@fb.com>
Link: https://lore.kernel.org/r/ce844d62a2fd0443b08c5ab02e95bc7149f9aeb1.1655248076.git.delyank@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h         | 52 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/core.c           | 15 +++++++++++++
 kernel/trace/bpf_trace.c    |  4 ++--
 kernel/trace/trace_uprobe.c |  5 ++---
 4 files changed, 71 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 69106ae46464..f3e88afdaffe 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -26,6 +26,7 @@
 #include <linux/stddef.h>
 #include <linux/bpfptr.h>
 #include <linux/btf.h>
+#include <linux/rcupdate_trace.h>
 
 struct bpf_verifier_env;
 struct bpf_verifier_log;
@@ -1372,6 +1373,8 @@ extern struct bpf_empty_prog_array bpf_empty_prog_array;
 
 struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
 void bpf_prog_array_free(struct bpf_prog_array *progs);
+/* Use when traversal over the bpf_prog_array uses tasks_trace rcu */
+void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs);
 int bpf_prog_array_length(struct bpf_prog_array *progs);
 bool bpf_prog_array_is_empty(struct bpf_prog_array *array);
 int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs,
@@ -1463,6 +1466,55 @@ bpf_prog_run_array(const struct bpf_prog_array *array,
 	return ret;
 }
 
+/* Notes on RCU design for bpf_prog_arrays containing sleepable programs:
+ *
+ * We use the tasks_trace rcu flavor read section to protect the bpf_prog_array
+ * overall. As a result, we must use the bpf_prog_array_free_sleepable
+ * in order to use the tasks_trace rcu grace period.
+ *
+ * When a non-sleepable program is inside the array, we take the rcu read
+ * section and disable preemption for that program alone, so it can access
+ * rcu-protected dynamically sized maps.
+ */
+static __always_inline u32
+bpf_prog_run_array_sleepable(const struct bpf_prog_array __rcu *array_rcu,
+			     const void *ctx, bpf_prog_run_fn run_prog)
+{
+	const struct bpf_prog_array_item *item;
+	const struct bpf_prog *prog;
+	const struct bpf_prog_array *array;
+	struct bpf_run_ctx *old_run_ctx;
+	struct bpf_trace_run_ctx run_ctx;
+	u32 ret = 1;
+
+	might_fault();
+
+	rcu_read_lock_trace();
+	migrate_disable();
+
+	array = rcu_dereference_check(array_rcu, rcu_read_lock_trace_held());
+	if (unlikely(!array))
+		goto out;
+	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+	item = &array->items[0];
+	while ((prog = READ_ONCE(item->prog))) {
+		if (!prog->aux->sleepable)
+			rcu_read_lock();
+
+		run_ctx.bpf_cookie = item->bpf_cookie;
+		ret &= run_prog(prog, ctx);
+		item++;
+
+		if (!prog->aux->sleepable)
+			rcu_read_unlock();
+	}
+	bpf_reset_run_ctx(old_run_ctx);
+out:
+	migrate_enable();
+	rcu_read_unlock_trace();
+	return ret;
+}
+
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 extern struct mutex bpf_stats_enabled_mutex;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index e78cc5eea4a5..b5ffebcce6cc 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2279,6 +2279,21 @@ void bpf_prog_array_free(struct bpf_prog_array *progs)
 	kfree_rcu(progs, rcu);
 }
 
+static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
+{
+	struct bpf_prog_array *progs;
+
+	progs = container_of(rcu, struct bpf_prog_array, rcu);
+	kfree_rcu(progs, rcu);
+}
+
+void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
+{
+	if (!progs || progs == &bpf_empty_prog_array.hdr)
+		return;
+	call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb);
+}
+
 int bpf_prog_array_length(struct bpf_prog_array *array)
 {
 	struct bpf_prog_array_item *item;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 10b157a6d73e..d1c22594dbf9 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1936,7 +1936,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
 	event->prog = prog;
 	event->bpf_cookie = bpf_cookie;
 	rcu_assign_pointer(event->tp_event->prog_array, new_array);
-	bpf_prog_array_free(old_array);
+	bpf_prog_array_free_sleepable(old_array);
 
 unlock:
 	mutex_unlock(&bpf_event_mutex);
@@ -1962,7 +1962,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
 		bpf_prog_array_delete_safe(old_array, event->prog);
 	} else {
 		rcu_assign_pointer(event->tp_event->prog_array, new_array);
-		bpf_prog_array_free(old_array);
+		bpf_prog_array_free_sleepable(old_array);
 	}
 
 	bpf_prog_put(event->prog);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 9711589273cd..0282c119b1b2 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -16,6 +16,7 @@
 #include <linux/namei.h>
 #include <linux/string.h>
 #include <linux/rculist.h>
+#include <linux/filter.h>
 
 #include "trace_dynevent.h"
 #include "trace_probe.h"
@@ -1346,9 +1347,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
 	if (bpf_prog_array_valid(call)) {
 		u32 ret;
 
-		preempt_disable();
-		ret = trace_call_bpf(call, regs);
-		preempt_enable();
+		ret = bpf_prog_run_array_sleepable(call->prog_array, regs, bpf_prog_run);
 		if (!ret)
 			return;
 	}
-- 
cgit 


From d92725256b4f22d084b813b37ddc394da79aacab Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Mon, 30 May 2022 14:34:50 -0400
Subject: mm: avoid unnecessary page fault retires on shared memory types

I observed that for each of the shared file-backed page faults, we're very
likely to retry one more time for the 1st write fault upon no page.  It's
because we'll need to release the mmap lock for dirty rate limit purpose
with balance_dirty_pages_ratelimited() (in fault_dirty_shared_page()).

Then after that throttling we return VM_FAULT_RETRY.

We did that probably because VM_FAULT_RETRY is the only way we can return
to the fault handler at that time telling it we've released the mmap lock.

However that's not ideal because it's very likely the fault does not need
to be retried at all since the pgtable was well installed before the
throttling, so the next continuous fault (including taking mmap read lock,
walk the pgtable, etc.) could be in most cases unnecessary.

It's not only slowing down page faults for shared file-backed, but also add
more mmap lock contention which is in most cases not needed at all.

To observe this, one could try to write to some shmem page and look at
"pgfault" value in /proc/vmstat, then we should expect 2 counts for each
shmem write simply because we retried, and vm event "pgfault" will capture
that.

To make it more efficient, add a new VM_FAULT_COMPLETED return code just to
show that we've completed the whole fault and released the lock.  It's also
a hint that we should very possibly not need another fault immediately on
this page because we've just completed it.

This patch provides a ~12% perf boost on my aarch64 test VM with a simple
program sequentially dirtying 400MB shmem file being mmap()ed and these are
the time it needs:

  Before: 650.980 ms (+-1.94%)
  After:  569.396 ms (+-1.38%)

I believe it could help more than that.

We need some special care on GUP and the s390 pgfault handler (for gmap
code before returning from pgfault), the rest changes in the page fault
handlers should be relatively straightforward.

Another thing to mention is that mm_account_fault() does take this new
fault as a generic fault to be accounted, unlike VM_FAULT_RETRY.

I explicitly didn't touch hmm_vma_fault() and break_ksm() because they do
not handle VM_FAULT_RETRY even with existing code, so I'm literally keeping
them as-is.

Link: https://lkml.kernel.org/r/20220530183450.42886-1-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vineet Gupta <vgupta@kernel.org>
Acked-by: Guo Ren <guoren@kernel.org>
Acked-by: Max Filippov <jcmvbkbc@gmail.com>
Acked-by: Christian Borntraeger <borntraeger@linux.ibm.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>	[arm part]
Acked-by: Heiko Carstens <hca@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Stafford Horne <shorne@gmail.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Brian Cain <bcain@quicinc.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Richard Weinberger <richard@nod.at>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Will Deacon <will@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Stefan Kristiansson <stefan.kristiansson@saunalahti.fi>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Chris Zankel <chris@zankel.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dinh Nguyen <dinguyen@kernel.org>
Cc: Rich Felker <dalias@libc.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Helge Deller <deller@gmx.de>
Cc: Yoshinori Sato <ysato@users.osdn.me>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/alpha/mm/fault.c         |  4 ++++
 arch/arc/mm/fault.c           |  4 ++++
 arch/arm/mm/fault.c           |  4 ++++
 arch/arm64/mm/fault.c         |  4 ++++
 arch/csky/mm/fault.c          |  4 ++++
 arch/hexagon/mm/vm_fault.c    |  4 ++++
 arch/ia64/mm/fault.c          |  4 ++++
 arch/m68k/mm/fault.c          |  4 ++++
 arch/microblaze/mm/fault.c    |  4 ++++
 arch/mips/mm/fault.c          |  4 ++++
 arch/nios2/mm/fault.c         |  4 ++++
 arch/openrisc/mm/fault.c      |  4 ++++
 arch/parisc/mm/fault.c        |  4 ++++
 arch/powerpc/mm/copro_fault.c |  5 +++++
 arch/powerpc/mm/fault.c       |  5 +++++
 arch/riscv/mm/fault.c         |  4 ++++
 arch/s390/mm/fault.c          | 12 ++++++++++++
 arch/sh/mm/fault.c            |  4 ++++
 arch/sparc/mm/fault_32.c      |  4 ++++
 arch/sparc/mm/fault_64.c      |  5 +++++
 arch/um/kernel/trap.c         |  4 ++++
 arch/x86/mm/fault.c           |  4 ++++
 arch/xtensa/mm/fault.c        |  4 ++++
 include/linux/mm_types.h      |  2 ++
 mm/gup.c                      | 34 +++++++++++++++++++++++++++++++++-
 mm/memory.c                   |  2 +-
 26 files changed, 139 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
index ec20c1004abf..ef427a6bdd1a 100644
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -155,6 +155,10 @@ retry:
 	if (fault_signal_pending(fault, regs))
 		return;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index dad27e4d69ff..5ca59a482632 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -146,6 +146,10 @@ retry:
 		return;
 	}
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	/*
 	 * Fault retry nuances, mmap_lock already relinquished by core mm
 	 */
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index a062e07516dd..46cccd6bf705 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -322,6 +322,10 @@ retry:
 		return 0;
 	}
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return 0;
+
 	if (!(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_RETRY) {
 			flags |= FAULT_FLAG_TRIED;
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index c5e11768e5c1..de166cdeb89a 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -608,6 +608,10 @@ retry:
 		return 0;
 	}
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return 0;
+
 	if (fault & VM_FAULT_RETRY) {
 		mm_flags |= FAULT_FLAG_TRIED;
 		goto retry;
diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c
index 7215a46b6b8e..e15f736cca4b 100644
--- a/arch/csky/mm/fault.c
+++ b/arch/csky/mm/fault.c
@@ -285,6 +285,10 @@ good_area:
 		return;
 	}
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (unlikely((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY))) {
 		flags |= FAULT_FLAG_TRIED;
 
diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c
index 4fac4b9eb316..f73c7cbfe326 100644
--- a/arch/hexagon/mm/vm_fault.c
+++ b/arch/hexagon/mm/vm_fault.c
@@ -96,6 +96,10 @@ good_area:
 	if (fault_signal_pending(fault, regs))
 		return;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	/* The most common case -- we are done. */
 	if (likely(!(fault & VM_FAULT_ERROR))) {
 		if (fault & VM_FAULT_RETRY) {
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index 07379d1a227f..ef78c2d66cdd 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -139,6 +139,10 @@ retry:
 	if (fault_signal_pending(fault, regs))
 		return;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		/*
 		 * We ran out of memory, or some other thing happened
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index 71aa9f6315dc..4d2837eb3e2a 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -141,6 +141,10 @@ good_area:
 	if (fault_signal_pending(fault, regs))
 		return 0;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return 0;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c
index a9626e6a68af..5c40c3ebe52f 100644
--- a/arch/microblaze/mm/fault.c
+++ b/arch/microblaze/mm/fault.c
@@ -222,6 +222,10 @@ good_area:
 	if (fault_signal_pending(fault, regs))
 		return;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index b08bc556d30d..a27045f5a556 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -162,6 +162,10 @@ good_area:
 		return;
 	}
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c
index a32f14cd72f2..edaca0a6c1c1 100644
--- a/arch/nios2/mm/fault.c
+++ b/arch/nios2/mm/fault.c
@@ -139,6 +139,10 @@ good_area:
 	if (fault_signal_pending(fault, regs))
 		return;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c
index 53b760af3bb7..b4762d66e9ef 100644
--- a/arch/openrisc/mm/fault.c
+++ b/arch/openrisc/mm/fault.c
@@ -165,6 +165,10 @@ good_area:
 	if (fault_signal_pending(fault, regs))
 		return;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index 84bc437be5cd..9ad80d4d3389 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -311,6 +311,10 @@ good_area:
 	if (fault_signal_pending(fault, regs))
 		return;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		/*
 		 * We hit a shared mapping outside of the file, or some
diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
index c1cb21a00884..7c507fb48182 100644
--- a/arch/powerpc/mm/copro_fault.c
+++ b/arch/powerpc/mm/copro_fault.c
@@ -65,6 +65,11 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
 
 	ret = 0;
 	*flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0, NULL);
+
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (*flt & VM_FAULT_COMPLETED)
+		return 0;
+
 	if (unlikely(*flt & VM_FAULT_ERROR)) {
 		if (*flt & VM_FAULT_OOM) {
 			ret = -ENOMEM;
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index d53fed4eccbd..014005428687 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -511,6 +511,10 @@ retry:
 	if (fault_signal_pending(fault, regs))
 		return user_mode(regs) ? 0 : SIGBUS;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		goto out;
+
 	/*
 	 * Handle the retry right now, the mmap_lock has been released in that
 	 * case.
@@ -525,6 +529,7 @@ retry:
 	if (unlikely(fault & VM_FAULT_ERROR))
 		return mm_fault_error(regs, address, fault);
 
+out:
 	/*
 	 * Major/minor page fault accounting.
 	 */
diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c
index 40694f0cab9e..f2fbd1400b7c 100644
--- a/arch/riscv/mm/fault.c
+++ b/arch/riscv/mm/fault.c
@@ -326,6 +326,10 @@ good_area:
 	if (fault_signal_pending(fault, regs))
 		return;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (unlikely(fault & VM_FAULT_RETRY)) {
 		flags |= FAULT_FLAG_TRIED;
 
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index e173b6187ad5..973dcd05c293 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -433,6 +433,17 @@ retry:
 			goto out_up;
 		goto out;
 	}
+
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED) {
+		if (gmap) {
+			mmap_read_lock(mm);
+			goto out_gmap;
+		}
+		fault = 0;
+		goto out;
+	}
+
 	if (unlikely(fault & VM_FAULT_ERROR))
 		goto out_up;
 
@@ -452,6 +463,7 @@ retry:
 		mmap_read_lock(mm);
 		goto retry;
 	}
+out_gmap:
 	if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
 		address =  __gmap_link(gmap, current->thread.gmap_addr,
 				       address);
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index e175667b1363..acd2f5e50bfc 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -485,6 +485,10 @@ good_area:
 		if (mm_fault_error(regs, error_code, address, fault))
 			return;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (fault & VM_FAULT_RETRY) {
 		flags |= FAULT_FLAG_TRIED;
 
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
index ad569d9bd124..91259f291c54 100644
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -190,6 +190,10 @@ good_area:
 	if (fault_signal_pending(fault, regs))
 		return;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 253e07043298..4acc12eafbf5 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -427,6 +427,10 @@ good_area:
 	if (fault_signal_pending(fault, regs))
 		goto exit_exception;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		goto lock_released;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
@@ -449,6 +453,7 @@ good_area:
 	}
 	mmap_read_unlock(mm);
 
+lock_released:
 	mm_rss = get_mm_rss(mm);
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE)
 	mm_rss -= (mm->context.thp_pte_count * (HPAGE_SIZE / PAGE_SIZE));
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index d1d5d0be0308..d3ce21c4ca32 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -76,6 +76,10 @@ good_area:
 		if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
 			goto out_nosemaphore;
 
+		/* The fault is fully completed (including releasing mmap lock) */
+		if (fault & VM_FAULT_COMPLETED)
+			return 0;
+
 		if (unlikely(fault & VM_FAULT_ERROR)) {
 			if (fault & VM_FAULT_OOM) {
 				goto out_of_memory;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index fad8faa29d04..fe10c6d76bac 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1408,6 +1408,10 @@ good_area:
 		return;
 	}
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	/*
 	 * If we need to retry the mmap_lock has already been released,
 	 * and if there is a fatal signal pending there is no guarantee
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index 16f0a5ff5799..8c781b05c0bd 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -172,6 +172,10 @@ good_area:
 		return;
 	}
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c29ab4c0cd5c..6b961a29bf26 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -729,6 +729,7 @@ typedef __bitwise unsigned int vm_fault_t;
  * @VM_FAULT_NEEDDSYNC:		->fault did not modify page tables and needs
  *				fsync() to complete (for synchronous page faults
  *				in DAX)
+ * @VM_FAULT_COMPLETED:		->fault completed, meanwhile mmap lock released
  * @VM_FAULT_HINDEX_MASK:	mask HINDEX value
  *
  */
@@ -746,6 +747,7 @@ enum vm_fault_reason {
 	VM_FAULT_FALLBACK       = (__force vm_fault_t)0x000800,
 	VM_FAULT_DONE_COW       = (__force vm_fault_t)0x001000,
 	VM_FAULT_NEEDDSYNC      = (__force vm_fault_t)0x002000,
+	VM_FAULT_COMPLETED      = (__force vm_fault_t)0x004000,
 	VM_FAULT_HINDEX_MASK    = (__force vm_fault_t)0x0f0000,
 };
 
diff --git a/mm/gup.c b/mm/gup.c
index 551264407624..407a81d5ca03 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -951,6 +951,25 @@ static int faultin_page(struct vm_area_struct *vma,
 	}
 
 	ret = handle_mm_fault(vma, address, fault_flags, NULL);
+
+	if (ret & VM_FAULT_COMPLETED) {
+		/*
+		 * With FAULT_FLAG_RETRY_NOWAIT we'll never release the
+		 * mmap lock in the page fault handler. Sanity check this.
+		 */
+		WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT);
+		if (locked)
+			*locked = 0;
+		/*
+		 * We should do the same as VM_FAULT_RETRY, but let's not
+		 * return -EBUSY since that's not reflecting the reality of
+		 * what has happened - we've just fully completed a page
+		 * fault, with the mmap lock released.  Use -EAGAIN to show
+		 * that we want to take the mmap lock _again_.
+		 */
+		return -EAGAIN;
+	}
+
 	if (ret & VM_FAULT_ERROR) {
 		int err = vm_fault_to_errno(ret, *flags);
 
@@ -1177,6 +1196,7 @@ retry:
 			case 0:
 				goto retry;
 			case -EBUSY:
+			case -EAGAIN:
 				ret = 0;
 				fallthrough;
 			case -EFAULT:
@@ -1303,6 +1323,18 @@ retry:
 		return -EINTR;
 
 	ret = handle_mm_fault(vma, address, fault_flags, NULL);
+
+	if (ret & VM_FAULT_COMPLETED) {
+		/*
+		 * NOTE: it's a pity that we need to retake the lock here
+		 * to pair with the unlock() in the callers. Ideally we
+		 * could tell the callers so they do not need to unlock.
+		 */
+		mmap_read_lock(mm);
+		*unlocked = true;
+		return 0;
+	}
+
 	if (ret & VM_FAULT_ERROR) {
 		int err = vm_fault_to_errno(ret, 0);
 
@@ -1368,7 +1400,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
 			/* VM_FAULT_RETRY couldn't trigger, bypass */
 			return ret;
 
-		/* VM_FAULT_RETRY cannot return errors */
+		/* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
 		if (!*locked) {
 			BUG_ON(ret < 0);
 			BUG_ON(ret >= nr_pages);
diff --git a/mm/memory.c b/mm/memory.c
index 7a089145cad4..580c62febe42 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3020,7 +3020,7 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
 		balance_dirty_pages_ratelimited(mapping);
 		if (fpin) {
 			fput(fpin);
-			return VM_FAULT_RETRY;
+			return VM_FAULT_COMPLETED;
 		}
 	}
 
-- 
cgit 


From bcc728eb4f446073e0160671d7d0059a4e9aa300 Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Tue, 31 May 2022 10:04:21 +0800
Subject: mm/damon: remove obsolete comments of kdamond_stop

Since commit 0f91d13366a4 ("mm/damon: simplify stop mechanism") delete
kdamond_stop and change to use kthread stop mechanism, these obsolete
comments should be removed accordingly.

Link: https://lkml.kernel.org/r/20220531020421.46849-1-zhouchengming@bytedance.com
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 7c62da31ce4b..2765c7d99beb 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -397,7 +397,6 @@ struct damon_callback {
  * detail.
  *
  * @kdamond:		Kernel thread who does the monitoring.
- * @kdamond_stop:	Notifies whether kdamond should stop.
  * @kdamond_lock:	Mutex for the synchronizations with @kdamond.
  *
  * For each monitoring context, one kernel thread for the monitoring is
@@ -406,14 +405,14 @@ struct damon_callback {
  * Once started, the monitoring thread runs until explicitly required to be
  * terminated or every monitoring target is invalid.  The validity of the
  * targets is checked via the &damon_operations.target_valid of @ops.  The
- * termination can also be explicitly requested by writing non-zero to
- * @kdamond_stop.  The thread sets @kdamond to NULL when it terminates.
- * Therefore, users can know whether the monitoring is ongoing or terminated by
- * reading @kdamond.  Reads and writes to @kdamond and @kdamond_stop from
- * outside of the monitoring thread must be protected by @kdamond_lock.
- *
- * Note that the monitoring thread protects only @kdamond and @kdamond_stop via
- * @kdamond_lock.  Accesses to other fields must be protected by themselves.
+ * termination can also be explicitly requested by calling damon_stop().
+ * The thread sets @kdamond to NULL when it terminates. Therefore, users can
+ * know whether the monitoring is ongoing or terminated by reading @kdamond.
+ * Reads and writes to @kdamond from outside of the monitoring thread must
+ * be protected by @kdamond_lock.
+ *
+ * Note that the monitoring thread protects only @kdamond via @kdamond_lock.
+ * Accesses to other fields must be protected by themselves.
  *
  * @ops:	Set of monitoring operations for given use cases.
  * @callback:	Set of callbacks for monitoring events notifications.
-- 
cgit 


From 9384d79249d04b03572abb7e551a35d99c9268c0 Mon Sep 17 00:00:00 2001
From: "Fabio M. De Francesco" <fmdefrancesco@gmail.com>
Date: Mon, 6 Jun 2022 16:15:33 +0200
Subject: mm/highmem: delete memmove_page()

Matthew Wilcox reported that, while he was looking at memmove_page(), he
realized that it can't actually work.

The reasons are hidden in its implementation, which makes use of memmove()
on logical addresses provided by kmap_local_page().  memmove() does the
wrong thing when it tests "if (dest <= src)".

Therefore, delete memmove_page().

No need to change any other code because we have no call sites of
memmove_page() across the whole kernel.

Link: https://lkml.kernel.org/r/20220606141533.555-1-fmdefrancesco@gmail.com
Signed-off-by: Fabio M. De Francesco <fmdefrancesco@gmail.com>
Reported-by: Matthew Wilcox <willy@infradead.org>
Reviewed-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/highmem.h | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 3af34de54330..fee9835e3793 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -336,19 +336,6 @@ static inline void memcpy_page(struct page *dst_page, size_t dst_off,
 	kunmap_local(dst);
 }
 
-static inline void memmove_page(struct page *dst_page, size_t dst_off,
-			       struct page *src_page, size_t src_off,
-			       size_t len)
-{
-	char *dst = kmap_local_page(dst_page);
-	char *src = kmap_local_page(src_page);
-
-	VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE);
-	memmove(dst + dst_off, src + src_off, len);
-	kunmap_local(src);
-	kunmap_local(dst);
-}
-
 static inline void memset_page(struct page *page, size_t offset, int val,
 			       size_t len)
 {
-- 
cgit 


From c200d90049dbe08fa8b016f74b713fddefca0479 Mon Sep 17 00:00:00 2001
From: Patrick Wang <patrick.wang.shcn@gmail.com>
Date: Sat, 11 Jun 2022 11:55:48 +0800
Subject: mm: kmemleak: remove kmemleak_not_leak_phys() and the min_count
 argument to kmemleak_alloc_phys()

Patch series "mm: kmemleak: store objects allocated with physical address
separately and check when scan", v4.

The kmemleak_*_phys() interface uses "min_low_pfn" and "max_low_pfn" to
check address.  But on some architectures, kmemleak_*_phys() is called
before those two variables initialized.  The following steps will be
taken:

1) Add OBJECT_PHYS flag and rbtree for the objects allocated
   with physical address
2) Store physical address in objects if allocated with OBJECT_PHYS
3) Check the boundary when scan instead of in kmemleak_*_phys()

This patch set will solve:
https://lore.kernel.org/r/20220527032504.30341-1-yee.lee@mediatek.com
https://lore.kernel.org/r/9dd08bb5-f39e-53d8-f88d-bec598a08c93@gmail.com

v3: https://lore.kernel.org/r/20220609124950.1694394-1-patrick.wang.shcn@gmail.com
v2: https://lore.kernel.org/r/20220603035415.1243913-1-patrick.wang.shcn@gmail.com
v1: https://lore.kernel.org/r/20220531150823.1004101-1-patrick.wang.shcn@gmail.com


This patch (of 4):

Remove the unused kmemleak_not_leak_phys() function.  And remove the
min_count argument to kmemleak_alloc_phys() function, assume it's 0.

Link: https://lkml.kernel.org/r/20220611035551.1823303-1-patrick.wang.shcn@gmail.com
Link: https://lkml.kernel.org/r/20220611035551.1823303-2-patrick.wang.shcn@gmail.com
Signed-off-by: Patrick Wang <patrick.wang.shcn@gmail.com>
Suggested-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Yee Lee <yee.lee@mediatek.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/dev-tools/kmemleak.rst    |  1 -
 drivers/of/fdt.c                        |  2 +-
 include/linux/kmemleak.h                |  8 ++------
 mm/kmemleak.c                           | 20 +++-----------------
 mm/memblock.c                           | 14 +++++++-------
 tools/testing/memblock/linux/kmemleak.h |  2 +-
 6 files changed, 14 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst
index 1c935f41cd3a..5483fd39ef29 100644
--- a/Documentation/dev-tools/kmemleak.rst
+++ b/Documentation/dev-tools/kmemleak.rst
@@ -174,7 +174,6 @@ mapping:
 
 - ``kmemleak_alloc_phys``
 - ``kmemleak_free_part_phys``
-- ``kmemleak_not_leak_phys``
 - ``kmemleak_ignore_phys``
 
 Dealing with false positives/negatives
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index a8f5b6532165..2c677e84c3f5 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -529,7 +529,7 @@ static int __init __reserved_mem_reserve_reg(unsigned long node,
 			pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n",
 				uname, &base, (unsigned long)(size / SZ_1M));
 			if (!nomap)
-				kmemleak_alloc_phys(base, size, 0, 0);
+				kmemleak_alloc_phys(base, size, 0);
 		}
 		else
 			pr_info("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n",
diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h
index 34684b2026ab..6a3cd1bf4680 100644
--- a/include/linux/kmemleak.h
+++ b/include/linux/kmemleak.h
@@ -29,10 +29,9 @@ extern void kmemleak_not_leak(const void *ptr) __ref;
 extern void kmemleak_ignore(const void *ptr) __ref;
 extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref;
 extern void kmemleak_no_scan(const void *ptr) __ref;
-extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count,
+extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size,
 				gfp_t gfp) __ref;
 extern void kmemleak_free_part_phys(phys_addr_t phys, size_t size) __ref;
-extern void kmemleak_not_leak_phys(phys_addr_t phys) __ref;
 extern void kmemleak_ignore_phys(phys_addr_t phys) __ref;
 
 static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
@@ -107,15 +106,12 @@ static inline void kmemleak_no_scan(const void *ptr)
 {
 }
 static inline void kmemleak_alloc_phys(phys_addr_t phys, size_t size,
-				       int min_count, gfp_t gfp)
+				       gfp_t gfp)
 {
 }
 static inline void kmemleak_free_part_phys(phys_addr_t phys, size_t size)
 {
 }
-static inline void kmemleak_not_leak_phys(phys_addr_t phys)
-{
-}
 static inline void kmemleak_ignore_phys(phys_addr_t phys)
 {
 }
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index a182f5ddaf68..156eafafa182 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1125,15 +1125,13 @@ EXPORT_SYMBOL(kmemleak_no_scan);
  *			 address argument
  * @phys:	physical address of the object
  * @size:	size of the object
- * @min_count:	minimum number of references to this object.
- *              See kmemleak_alloc()
  * @gfp:	kmalloc() flags used for kmemleak internal memory allocations
  */
-void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count,
-			       gfp_t gfp)
+void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, gfp_t gfp)
 {
 	if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
-		kmemleak_alloc(__va(phys), size, min_count, gfp);
+		/* assume min_count 0 */
+		kmemleak_alloc(__va(phys), size, 0, gfp);
 }
 EXPORT_SYMBOL(kmemleak_alloc_phys);
 
@@ -1151,18 +1149,6 @@ void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size)
 }
 EXPORT_SYMBOL(kmemleak_free_part_phys);
 
-/**
- * kmemleak_not_leak_phys - similar to kmemleak_not_leak but taking a physical
- *			    address argument
- * @phys:	physical address of the object
- */
-void __ref kmemleak_not_leak_phys(phys_addr_t phys)
-{
-	if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
-		kmemleak_not_leak(__va(phys));
-}
-EXPORT_SYMBOL(kmemleak_not_leak_phys);
-
 /**
  * kmemleak_ignore_phys - similar to kmemleak_ignore but taking a physical
  *			  address argument
diff --git a/mm/memblock.c b/mm/memblock.c
index e4f03a6e8e56..749abd2685c4 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1345,8 +1345,8 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
  * from the regions with mirroring enabled and then retried from any
  * memory region.
  *
- * In addition, function sets the min_count to 0 using kmemleak_alloc_phys for
- * allocated boot memory block, so that it is never reported as leaks.
+ * In addition, function using kmemleak_alloc_phys for allocated boot
+ * memory block, it is never reported as leaks.
  *
  * Return:
  * Physical address of allocated memory block on success, %0 on failure.
@@ -1398,12 +1398,12 @@ done:
 	 */
 	if (end != MEMBLOCK_ALLOC_NOLEAKTRACE)
 		/*
-		 * The min_count is set to 0 so that memblock allocated
-		 * blocks are never reported as leaks. This is because many
-		 * of these blocks are only referred via the physical
-		 * address which is not looked up by kmemleak.
+		 * Memblock allocated blocks are never reported as
+		 * leaks. This is because many of these blocks are
+		 * only referred via the physical address which is
+		 * not looked up by kmemleak.
 		 */
-		kmemleak_alloc_phys(found, size, 0, 0);
+		kmemleak_alloc_phys(found, size, 0);
 
 	return found;
 }
diff --git a/tools/testing/memblock/linux/kmemleak.h b/tools/testing/memblock/linux/kmemleak.h
index 462f8c5e8aa0..5fed13bb9ec4 100644
--- a/tools/testing/memblock/linux/kmemleak.h
+++ b/tools/testing/memblock/linux/kmemleak.h
@@ -7,7 +7,7 @@ static inline void kmemleak_free_part_phys(phys_addr_t phys, size_t size)
 }
 
 static inline void kmemleak_alloc_phys(phys_addr_t phys, size_t size,
-				       int min_count, gfp_t gfp)
+				       gfp_t gfp)
 {
 }
 
-- 
cgit 


From fc4db90fe71e640e3fe88df346f7cf653b75315d Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Fri, 10 Jun 2022 11:03:10 -0700
Subject: mm: kmem: make mem_cgroup_from_obj() vmalloc()-safe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently mem_cgroup_from_obj() is not working properly with objects
allocated using vmalloc().  It creates problems in some cases, when it's
called for static objects belonging to modules or generally allocated
using vmalloc().

This patch makes mem_cgroup_from_obj() safe to be called on objects
allocated using vmalloc().

It also introduces mem_cgroup_from_slab_obj(), which is a faster version
to use in places when we know the object is either a slab object or a
generic slab page (e.g.  when adding an object to a lru list).

Link: https://lkml.kernel.org/r/20220610180310.1725111-1-roman.gushchin@linux.dev
Suggested-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Tested-by: Linux Kernel Functional Testing <lkft@linaro.org>
Acked-by: Shakeel Butt <shakeelb@google.com>
Tested-by: Vasily Averin <vvs@openvz.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Muchun Song <songmuchun@bytedance.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Naresh Kamboju <naresh.kamboju@linaro.org>
Cc: Qian Cai <quic_qiancai@quicinc.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h |  6 ++++
 mm/list_lru.c              |  2 +-
 mm/memcontrol.c            | 71 ++++++++++++++++++++++++++++++++--------------
 3 files changed, 57 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9ecead1042b9..3ce96ce5fe3e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1740,6 +1740,7 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg)
 }
 
 struct mem_cgroup *mem_cgroup_from_obj(void *p);
+struct mem_cgroup *mem_cgroup_from_slab_obj(void *p);
 
 static inline void count_objcg_event(struct obj_cgroup *objcg,
 				     enum vm_event_item idx)
@@ -1801,6 +1802,11 @@ static inline struct mem_cgroup *mem_cgroup_from_obj(void *p)
        return NULL;
 }
 
+static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
+{
+	return NULL;
+}
+
 static inline void count_objcg_event(struct obj_cgroup *objcg,
 				     enum vm_event_item idx)
 {
diff --git a/mm/list_lru.c b/mm/list_lru.c
index ba76428ceece..a05e5bef3b40 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -71,7 +71,7 @@ list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr,
 	if (!list_lru_memcg_aware(lru))
 		goto out;
 
-	memcg = mem_cgroup_from_obj(ptr);
+	memcg = mem_cgroup_from_slab_obj(ptr);
 	if (!memcg)
 		goto out;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 28c1532cc91f..c1ae9b3f8d35 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -783,7 +783,7 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
 	struct lruvec *lruvec;
 
 	rcu_read_lock();
-	memcg = mem_cgroup_from_obj(p);
+	memcg = mem_cgroup_from_slab_obj(p);
 
 	/*
 	 * Untracked pages have no memcg, no lruvec. Update only the
@@ -2841,27 +2841,9 @@ int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
 	return 0;
 }
 
-/*
- * Returns a pointer to the memory cgroup to which the kernel object is charged.
- *
- * A passed kernel object can be a slab object or a generic kernel page, so
- * different mechanisms for getting the memory cgroup pointer should be used.
- * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
- * can not know for sure how the kernel object is implemented.
- * mem_cgroup_from_obj() can be safely used in such cases.
- *
- * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
- * cgroup_mutex, etc.
- */
-struct mem_cgroup *mem_cgroup_from_obj(void *p)
+static __always_inline
+struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
 {
-	struct folio *folio;
-
-	if (mem_cgroup_disabled())
-		return NULL;
-
-	folio = virt_to_folio(p);
-
 	/*
 	 * Slab objects are accounted individually, not per-page.
 	 * Memcg membership data for each individual object is saved in
@@ -2894,6 +2876,53 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
 	return page_memcg_check(folio_page(folio, 0));
 }
 
+/*
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
+ *
+ * A passed kernel object can be a slab object, vmalloc object or a generic
+ * kernel page, so different mechanisms for getting the memory cgroup pointer
+ * should be used.
+ *
+ * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
+ * can not know for sure how the kernel object is implemented.
+ * mem_cgroup_from_obj() can be safely used in such cases.
+ *
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
+ * cgroup_mutex, etc.
+ */
+struct mem_cgroup *mem_cgroup_from_obj(void *p)
+{
+	struct folio *folio;
+
+	if (mem_cgroup_disabled())
+		return NULL;
+
+	if (unlikely(is_vmalloc_addr(p)))
+		folio = page_folio(vmalloc_to_page(p));
+	else
+		folio = virt_to_folio(p);
+
+	return mem_cgroup_from_obj_folio(folio, p);
+}
+
+/*
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
+ * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects,
+ * allocated using vmalloc().
+ *
+ * A passed kernel object must be a slab object or a generic kernel page.
+ *
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
+ * cgroup_mutex, etc.
+ */
+struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
+{
+	if (mem_cgroup_disabled())
+		return NULL;
+
+	return mem_cgroup_from_obj_folio(virt_to_folio(p), p);
+}
+
 static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
 {
 	struct obj_cgroup *objcg = NULL;
-- 
cgit 


From 1d0403d20f6c281cb3d14c5f1db5317caeec48e9 Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@openvz.org>
Date: Fri, 3 Jun 2022 07:19:43 +0300
Subject: net: set proper memcg for net_init hooks allocations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

__register_pernet_operations() executes init hook of registered
pernet_operation structure in all existing net namespaces.

Typically, these hooks are called by a process associated with the
specified net namespace, and all __GFP_ACCOUNT marked allocation are
accounted for corresponding container/memcg.

However __register_pernet_operations() calls the hooks in the same
context, and as a result all marked allocations are accounted to one memcg
for all processed net namespaces.

This patch adjusts active memcg for each net namespace and helps to
account memory allocated inside ops_init() into the proper memcg.

Link: https://lkml.kernel.org/r/f9394752-e272-9bf9-645f-a18c56d1c4ec@openvz.org
Signed-off-by: Vasily Averin <vvs@openvz.org>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Shakeel Butt <shakeelb@google.com>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Florian Westphal <fw@strlen.de>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Linux Kernel Functional Testing <lkft@linaro.org>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Naresh Kamboju <naresh.kamboju@linaro.org>
Cc: Qian Cai <quic_qiancai@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 47 +++++++++++++++++++++++++++++++++++++++++++++-
 net/core/net_namespace.c   |  7 +++++++
 2 files changed, 53 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 3ce96ce5fe3e..04f2f33607e9 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1756,6 +1756,42 @@ static inline void count_objcg_event(struct obj_cgroup *objcg,
 	rcu_read_unlock();
 }
 
+/**
+ * get_mem_cgroup_from_obj - get a memcg associated with passed kernel object.
+ * @p: pointer to object from which memcg should be extracted. It can be NULL.
+ *
+ * Retrieves the memory group into which the memory of the pointed kernel
+ * object is accounted. If memcg is found, its reference is taken.
+ * If a passed kernel object is uncharged, or if proper memcg cannot be found,
+ * as well as if mem_cgroup is disabled, NULL is returned.
+ *
+ * Return: valid memcg pointer with taken reference or NULL.
+ */
+static inline struct mem_cgroup *get_mem_cgroup_from_obj(void *p)
+{
+	struct mem_cgroup *memcg;
+
+	rcu_read_lock();
+	do {
+		memcg = mem_cgroup_from_obj(p);
+	} while (memcg && !css_tryget(&memcg->css));
+	rcu_read_unlock();
+	return memcg;
+}
+
+/**
+ * mem_cgroup_or_root - always returns a pointer to a valid memory cgroup.
+ * @memcg: pointer to a valid memory cgroup or NULL.
+ *
+ * If passed argument is not NULL, returns it without any additional checks
+ * and changes. Otherwise, root_mem_cgroup is returned.
+ *
+ * NOTE: root_mem_cgroup can be NULL during early boot.
+ */
+static inline struct mem_cgroup *mem_cgroup_or_root(struct mem_cgroup *memcg)
+{
+	return memcg ? memcg : root_mem_cgroup;
+}
 #else
 static inline bool mem_cgroup_kmem_disabled(void)
 {
@@ -1799,7 +1835,7 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg)
 
 static inline struct mem_cgroup *mem_cgroup_from_obj(void *p)
 {
-       return NULL;
+	return NULL;
 }
 
 static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
@@ -1812,6 +1848,15 @@ static inline void count_objcg_event(struct obj_cgroup *objcg,
 {
 }
 
+static inline struct mem_cgroup *get_mem_cgroup_from_obj(void *p)
+{
+	return NULL;
+}
+
+static inline struct mem_cgroup *mem_cgroup_or_root(struct mem_cgroup *memcg)
+{
+	return NULL;
+}
 #endif /* CONFIG_MEMCG_KMEM */
 
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 0ec2f5906a27..6b9f19122ec1 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -18,6 +18,7 @@
 #include <linux/user_namespace.h>
 #include <linux/net_namespace.h>
 #include <linux/sched/task.h>
+#include <linux/sched/mm.h>
 #include <linux/uidgid.h>
 #include <linux/cookie.h>
 
@@ -1143,7 +1144,13 @@ static int __register_pernet_operations(struct list_head *list,
 		 * setup_net() and cleanup_net() are not possible.
 		 */
 		for_each_net(net) {
+			struct mem_cgroup *old, *memcg;
+
+			memcg = mem_cgroup_or_root(get_mem_cgroup_from_obj(net));
+			old = set_active_memcg(memcg);
 			error = ops_init(ops, net);
+			set_active_memcg(old);
+			mem_cgroup_put(memcg);
 			if (error)
 				goto out_undo;
 			list_add_tail(&net->exit_list, &net_exit_list);
-- 
cgit 


From 4815a36009044ba69a9b8d781943ec6505c451a2 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 3 Jun 2022 20:10:12 +0300
Subject: include/linux/rbtree.h: replace kernel.h with the necessary
 inclusions

When kernel.h is used in the headers it adds a lot into dependency hell,
especially when there are circular dependencies are involved.

Replace kernel.h inclusion with the list of what is really being used.

Link: https://lkml.kernel.org/r/20220603171012.48880-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rbtree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index 235047d7a1b5..f7edca369eda 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -17,9 +17,9 @@
 #ifndef	_LINUX_RBTREE_H
 #define	_LINUX_RBTREE_H
 
+#include <linux/container_of.h>
 #include <linux/rbtree_types.h>
 
-#include <linux/kernel.h>
 #include <linux/stddef.h>
 #include <linux/rcupdate.h>
 
-- 
cgit 


From d30dfd490f7dc4cb6a7c11a647bd1ff7a22139e7 Mon Sep 17 00:00:00 2001
From: Justin Stitt <jstitt007@gmail.com>
Date: Wed, 8 Jun 2022 15:35:39 -0700
Subject: include/uapi/linux/swab.h: move explicit cast outside ternary

A cast inside __builtin_constant_p doesn't do anything since it should
evaluate as constant at compile time irrespective of this cast.  Instead,
I moved this cast outside the ternary to ensure the return type is as
expected.

Additionally, if __HAVE_BUILTIN_BSWAP16__ was not defined then __swab16 is
actually returning an `int` not a `u16` due to integer promotion.

As Al Viro notes:
You *can't* get smaller-than-int out of ? :, same as you can't get it
out of addition, etc.

This also fixes some clang -Wformat warnings involving default
argument promotion.

Link: https://github.com/ClangBuiltLinux/linux/issues/378
Link: https://lkml.kernel.org/r/20220608223539.470472-1-justinstitt@google.com
Signed-off-by: Justin Stitt <jstitt007@gmail.com>
Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Suggested-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Suggested-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/uapi/linux/swab.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/swab.h b/include/uapi/linux/swab.h
index 7272f85d6d6a..0723a9cce747 100644
--- a/include/uapi/linux/swab.h
+++ b/include/uapi/linux/swab.h
@@ -102,7 +102,7 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 val)
 #define __swab16(x) (__u16)__builtin_bswap16((__u16)(x))
 #else
 #define __swab16(x)				\
-	(__builtin_constant_p((__u16)(x)) ?	\
+	(__u16)(__builtin_constant_p(x) ?	\
 	___constant_swab16(x) :			\
 	__fswab16(x))
 #endif
@@ -115,7 +115,7 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 val)
 #define __swab32(x) (__u32)__builtin_bswap32((__u32)(x))
 #else
 #define __swab32(x)				\
-	(__builtin_constant_p((__u32)(x)) ?	\
+	(__u32)(__builtin_constant_p(x) ?	\
 	___constant_swab32(x) :			\
 	__fswab32(x))
 #endif
@@ -128,7 +128,7 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 val)
 #define __swab64(x) (__u64)__builtin_bswap64((__u64)(x))
 #else
 #define __swab64(x)				\
-	(__builtin_constant_p((__u64)(x)) ?	\
+	(__u64)(__builtin_constant_p(x) ?	\
 	___constant_swab64(x) :			\
 	__fswab64(x))
 #endif
-- 
cgit 


From dabba87229411a5e9d20ac03ffc36463c53ae672 Mon Sep 17 00:00:00 2001
From: Pasha Tatashin <pasha.tatashin@soleen.com>
Date: Fri, 27 May 2022 02:55:34 +0000
Subject: fs/kernel_read_file: allow to read files up-to ssize_t

Patch series "Allow to kexec with initramfs larger than 2G", v2.

Currently, the largest initramfs that is supported by kexec_file_load()
syscall is 2G.

This is because kernel_read_file() returns int, and is limited to INT_MAX
or 2G.

On the other hand, there are kexec based boot loaders (i.e.  u-root), that
may need to boot netboot images that might be larger than 2G.

The first patch changes the return type from int to ssize_t in
kernel_read_file* functions.

The second patch increases the maximum initramfs file size to 4G.

Tested: verified that can kexec_file_load() works with 4G initramfs
on x86_64.


This patch (of 2):

Currently, the maximum file size that is supported is 2G.  This may be too
small in some cases.  For example, kexec_file_load() system call loads
initramfs.  In some netboot cases initramfs can be rather large.

Allow to use up-to ssize_t bytes.  The callers still can limit the maximum
file size via buf_size.

Link: https://lkml.kernel.org/r/20220527025535.3953665-1-pasha.tatashin@soleen.com
Link: https://lkml.kernel.org/r/20220527025535.3953665-2-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Baoquan He <bhe@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Sasha Levin <sashal@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/kernel_read_file.c            | 38 +++++++++++++++++++-------------------
 include/linux/kernel_read_file.h | 32 ++++++++++++++++----------------
 include/linux/limits.h           |  1 +
 3 files changed, 36 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c
index 1b07550485b9..5d826274570c 100644
--- a/fs/kernel_read_file.c
+++ b/fs/kernel_read_file.c
@@ -29,15 +29,15 @@
  * change between calls to kernel_read_file().
  *
  * Returns number of bytes read (no single read will be bigger
- * than INT_MAX), or negative on error.
+ * than SSIZE_MAX), or negative on error.
  *
  */
-int kernel_read_file(struct file *file, loff_t offset, void **buf,
-		     size_t buf_size, size_t *file_size,
-		     enum kernel_read_file_id id)
+ssize_t kernel_read_file(struct file *file, loff_t offset, void **buf,
+			 size_t buf_size, size_t *file_size,
+			 enum kernel_read_file_id id)
 {
 	loff_t i_size, pos;
-	size_t copied;
+	ssize_t copied;
 	void *allocated = NULL;
 	bool whole_file;
 	int ret;
@@ -58,7 +58,7 @@ int kernel_read_file(struct file *file, loff_t offset, void **buf,
 		goto out;
 	}
 	/* The file is too big for sane activities. */
-	if (i_size > INT_MAX) {
+	if (i_size > SSIZE_MAX) {
 		ret = -EFBIG;
 		goto out;
 	}
@@ -124,12 +124,12 @@ out:
 }
 EXPORT_SYMBOL_GPL(kernel_read_file);
 
-int kernel_read_file_from_path(const char *path, loff_t offset, void **buf,
-			       size_t buf_size, size_t *file_size,
-			       enum kernel_read_file_id id)
+ssize_t kernel_read_file_from_path(const char *path, loff_t offset, void **buf,
+				   size_t buf_size, size_t *file_size,
+				   enum kernel_read_file_id id)
 {
 	struct file *file;
-	int ret;
+	ssize_t ret;
 
 	if (!path || !*path)
 		return -EINVAL;
@@ -144,14 +144,14 @@ int kernel_read_file_from_path(const char *path, loff_t offset, void **buf,
 }
 EXPORT_SYMBOL_GPL(kernel_read_file_from_path);
 
-int kernel_read_file_from_path_initns(const char *path, loff_t offset,
-				      void **buf, size_t buf_size,
-				      size_t *file_size,
-				      enum kernel_read_file_id id)
+ssize_t kernel_read_file_from_path_initns(const char *path, loff_t offset,
+					  void **buf, size_t buf_size,
+					  size_t *file_size,
+					  enum kernel_read_file_id id)
 {
 	struct file *file;
 	struct path root;
-	int ret;
+	ssize_t ret;
 
 	if (!path || !*path)
 		return -EINVAL;
@@ -171,12 +171,12 @@ int kernel_read_file_from_path_initns(const char *path, loff_t offset,
 }
 EXPORT_SYMBOL_GPL(kernel_read_file_from_path_initns);
 
-int kernel_read_file_from_fd(int fd, loff_t offset, void **buf,
-			     size_t buf_size, size_t *file_size,
-			     enum kernel_read_file_id id)
+ssize_t kernel_read_file_from_fd(int fd, loff_t offset, void **buf,
+				 size_t buf_size, size_t *file_size,
+				 enum kernel_read_file_id id)
 {
 	struct fd f = fdget(fd);
-	int ret = -EBADF;
+	ssize_t ret = -EBADF;
 
 	if (!f.file || !(f.file->f_mode & FMODE_READ))
 		goto out;
diff --git a/include/linux/kernel_read_file.h b/include/linux/kernel_read_file.h
index 575ffa1031d3..90451e2e12bd 100644
--- a/include/linux/kernel_read_file.h
+++ b/include/linux/kernel_read_file.h
@@ -35,21 +35,21 @@ static inline const char *kernel_read_file_id_str(enum kernel_read_file_id id)
 	return kernel_read_file_str[id];
 }
 
-int kernel_read_file(struct file *file, loff_t offset,
-		     void **buf, size_t buf_size,
-		     size_t *file_size,
-		     enum kernel_read_file_id id);
-int kernel_read_file_from_path(const char *path, loff_t offset,
-			       void **buf, size_t buf_size,
-			       size_t *file_size,
-			       enum kernel_read_file_id id);
-int kernel_read_file_from_path_initns(const char *path, loff_t offset,
-				      void **buf, size_t buf_size,
-				      size_t *file_size,
-				      enum kernel_read_file_id id);
-int kernel_read_file_from_fd(int fd, loff_t offset,
-			     void **buf, size_t buf_size,
-			     size_t *file_size,
-			     enum kernel_read_file_id id);
+ssize_t kernel_read_file(struct file *file, loff_t offset,
+			 void **buf, size_t buf_size,
+			 size_t *file_size,
+			 enum kernel_read_file_id id);
+ssize_t kernel_read_file_from_path(const char *path, loff_t offset,
+				   void **buf, size_t buf_size,
+				   size_t *file_size,
+				   enum kernel_read_file_id id);
+ssize_t kernel_read_file_from_path_initns(const char *path, loff_t offset,
+					  void **buf, size_t buf_size,
+					  size_t *file_size,
+					  enum kernel_read_file_id id);
+ssize_t kernel_read_file_from_fd(int fd, loff_t offset,
+				 void **buf, size_t buf_size,
+				 size_t *file_size,
+				 enum kernel_read_file_id id);
 
 #endif /* _LINUX_KERNEL_READ_FILE_H */
diff --git a/include/linux/limits.h b/include/linux/limits.h
index b568b9c30bbf..f6bcc9369010 100644
--- a/include/linux/limits.h
+++ b/include/linux/limits.h
@@ -7,6 +7,7 @@
 #include <vdso/limits.h>
 
 #define SIZE_MAX	(~(size_t)0)
+#define SSIZE_MAX	((ssize_t)(SIZE_MAX >> 1))
 #define PHYS_ADDR_MAX	(~(phys_addr_t)0)
 
 #define U8_MAX		((u8)~0U)
-- 
cgit 


From a793679827a87b01f9d973ea30b923cd5a3ff2c5 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Tue, 14 Jun 2022 10:46:11 +0200
Subject: linux/phy.h: add phydev_err_probe() wrapper for dev_err_probe()

The dev_err_probe() function is quite useful to avoid boilerplate
related to -EPROBE_DEFER handling. Add a phydev_err_probe() helper to
simplify making use of that from phy drivers which otherwise use the
phydev_* helpers.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 508f1149665b..bed9a347481b 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1539,6 +1539,9 @@ static inline void phy_device_reset(struct phy_device *phydev, int value)
 #define phydev_err(_phydev, format, args...)	\
 	dev_err(&_phydev->mdio.dev, format, ##args)
 
+#define phydev_err_probe(_phydev, err, format, args...)	\
+	dev_err_probe(&_phydev->mdio.dev, err, format, ##args)
+
 #define phydev_info(_phydev, format, args...)	\
 	dev_info(&_phydev->mdio.dev, format, ##args)
 
-- 
cgit 


From ac80287a6af9fc3f3d189d6d1f523889a0a9e1bc Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Wed, 15 Jun 2022 16:48:42 +0300
Subject: bpf: Fix documentation of th_len in bpf_tcp_{gen,check}_syncookie

bpf_tcp_gen_syncookie expects the full length of the TCP header (with
all options), and bpf_tcp_check_syncookie accepts lengths bigger than
sizeof(struct tcphdr). Fix the documentation that says these lengths
should be exactly sizeof(struct tcphdr).

While at it, fix a typo in the name of struct ipv6hdr.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://lore.kernel.org/r/20220615134847.3753567-2-maximmi@nvidia.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h       | 10 ++++++----
 tools/include/uapi/linux/bpf.h | 10 ++++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f4009dbdf62d..f545e39df72a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3597,10 +3597,11 @@ union bpf_attr {
  *
  * 		*iph* points to the start of the IPv4 or IPv6 header, while
  * 		*iph_len* contains **sizeof**\ (**struct iphdr**) or
- * 		**sizeof**\ (**struct ip6hdr**).
+ * 		**sizeof**\ (**struct ipv6hdr**).
  *
  * 		*th* points to the start of the TCP header, while *th_len*
- * 		contains **sizeof**\ (**struct tcphdr**).
+ *		contains the length of the TCP header (at least
+ *		**sizeof**\ (**struct tcphdr**)).
  * 	Return
  * 		0 if *iph* and *th* are a valid SYN cookie ACK, or a negative
  * 		error otherwise.
@@ -3783,10 +3784,11 @@ union bpf_attr {
  *
  *		*iph* points to the start of the IPv4 or IPv6 header, while
  *		*iph_len* contains **sizeof**\ (**struct iphdr**) or
- *		**sizeof**\ (**struct ip6hdr**).
+ *		**sizeof**\ (**struct ipv6hdr**).
  *
  *		*th* points to the start of the TCP header, while *th_len*
- *		contains the length of the TCP header.
+ *		contains the length of the TCP header with options (at least
+ *		**sizeof**\ (**struct tcphdr**)).
  *	Return
  *		On success, lower 32 bits hold the generated SYN cookie in
  *		followed by 16 bits which hold the MSS value for that cookie,
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f4009dbdf62d..f545e39df72a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3597,10 +3597,11 @@ union bpf_attr {
  *
  * 		*iph* points to the start of the IPv4 or IPv6 header, while
  * 		*iph_len* contains **sizeof**\ (**struct iphdr**) or
- * 		**sizeof**\ (**struct ip6hdr**).
+ * 		**sizeof**\ (**struct ipv6hdr**).
  *
  * 		*th* points to the start of the TCP header, while *th_len*
- * 		contains **sizeof**\ (**struct tcphdr**).
+ *		contains the length of the TCP header (at least
+ *		**sizeof**\ (**struct tcphdr**)).
  * 	Return
  * 		0 if *iph* and *th* are a valid SYN cookie ACK, or a negative
  * 		error otherwise.
@@ -3783,10 +3784,11 @@ union bpf_attr {
  *
  *		*iph* points to the start of the IPv4 or IPv6 header, while
  *		*iph_len* contains **sizeof**\ (**struct iphdr**) or
- *		**sizeof**\ (**struct ip6hdr**).
+ *		**sizeof**\ (**struct ipv6hdr**).
  *
  *		*th* points to the start of the TCP header, while *th_len*
- *		contains the length of the TCP header.
+ *		contains the length of the TCP header with options (at least
+ *		**sizeof**\ (**struct tcphdr**)).
  *	Return
  *		On success, lower 32 bits hold the generated SYN cookie in
  *		followed by 16 bits which hold the MSS value for that cookie,
-- 
cgit 


From 508362ac66b0478affb4e52cb8da98478312d72d Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Wed, 15 Jun 2022 16:48:43 +0300
Subject: bpf: Allow helpers to accept pointers with a fixed size

Before this commit, the BPF verifier required ARG_PTR_TO_MEM arguments
to be followed by ARG_CONST_SIZE holding the size of the memory region.
The helpers had to check that size in runtime.

There are cases where the size expected by a helper is a compile-time
constant. Checking it in runtime is an unnecessary overhead and waste of
BPF registers.

This commit allows helpers to accept pointers to memory without the
corresponding ARG_CONST_SIZE, given that they define the memory region
size in struct bpf_func_proto and use ARG_PTR_TO_FIXED_SIZE_MEM type.

arg_size is unionized with arg_btf_id to reduce the kernel image size,
and it's valid because they are used by different argument types.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://lore.kernel.org/r/20220615134847.3753567-3-maximmi@nvidia.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h   | 13 +++++++++++++
 kernel/bpf/verifier.c | 43 ++++++++++++++++++++++++++++++++-----------
 2 files changed, 45 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f3e88afdaffe..a94531971a7a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -401,6 +401,9 @@ enum bpf_type_flag {
 	/* DYNPTR points to a ringbuf record. */
 	DYNPTR_TYPE_RINGBUF	= BIT(9 + BPF_BASE_TYPE_BITS),
 
+	/* Size is known at compile time. */
+	MEM_FIXED_SIZE		= BIT(10 + BPF_BASE_TYPE_BITS),
+
 	__BPF_TYPE_FLAG_MAX,
 	__BPF_TYPE_LAST_FLAG	= __BPF_TYPE_FLAG_MAX - 1,
 };
@@ -464,6 +467,8 @@ enum bpf_arg_type {
 	 * all bytes or clear them in error case.
 	 */
 	ARG_PTR_TO_UNINIT_MEM		= MEM_UNINIT | ARG_PTR_TO_MEM,
+	/* Pointer to valid memory of size known at compile time. */
+	ARG_PTR_TO_FIXED_SIZE_MEM	= MEM_FIXED_SIZE | ARG_PTR_TO_MEM,
 
 	/* This must be the last entry. Its purpose is to ensure the enum is
 	 * wide enough to hold the higher bits reserved for bpf_type_flag.
@@ -529,6 +534,14 @@ struct bpf_func_proto {
 			u32 *arg5_btf_id;
 		};
 		u32 *arg_btf_id[5];
+		struct {
+			size_t arg1_size;
+			size_t arg2_size;
+			size_t arg3_size;
+			size_t arg4_size;
+			size_t arg5_size;
+		};
+		size_t arg_size[5];
 	};
 	int *ret_btf_id; /* return value btf_id */
 	bool (*allowed)(const struct bpf_prog *prog);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index eadc23a8452c..2859901ffbe3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5848,6 +5848,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
 	enum bpf_arg_type arg_type = fn->arg_type[arg];
 	enum bpf_reg_type type = reg->type;
+	u32 *arg_btf_id = NULL;
 	int err = 0;
 
 	if (arg_type == ARG_DONTCARE)
@@ -5884,7 +5885,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 		 */
 		goto skip_type_check;
 
-	err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg], meta);
+	/* arg_btf_id and arg_size are in a union. */
+	if (base_type(arg_type) == ARG_PTR_TO_BTF_ID)
+		arg_btf_id = fn->arg_btf_id[arg];
+
+	err = check_reg_type(env, regno, arg_type, arg_btf_id, meta);
 	if (err)
 		return err;
 
@@ -6011,6 +6016,11 @@ skip_type_check:
 		 * next is_mem_size argument below.
 		 */
 		meta->raw_mode = arg_type & MEM_UNINIT;
+		if (arg_type & MEM_FIXED_SIZE) {
+			err = check_helper_mem_access(env, regno,
+						      fn->arg_size[arg], false,
+						      meta);
+		}
 	} else if (arg_type_is_mem_size(arg_type)) {
 		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
 
@@ -6400,11 +6410,19 @@ static bool check_raw_mode_ok(const struct bpf_func_proto *fn)
 	return count <= 1;
 }
 
-static bool check_args_pair_invalid(enum bpf_arg_type arg_curr,
-				    enum bpf_arg_type arg_next)
+static bool check_args_pair_invalid(const struct bpf_func_proto *fn, int arg)
 {
-	return (base_type(arg_curr) == ARG_PTR_TO_MEM) !=
-		arg_type_is_mem_size(arg_next);
+	bool is_fixed = fn->arg_type[arg] & MEM_FIXED_SIZE;
+	bool has_size = fn->arg_size[arg] != 0;
+	bool is_next_size = false;
+
+	if (arg + 1 < ARRAY_SIZE(fn->arg_type))
+		is_next_size = arg_type_is_mem_size(fn->arg_type[arg + 1]);
+
+	if (base_type(fn->arg_type[arg]) != ARG_PTR_TO_MEM)
+		return is_next_size;
+
+	return has_size == is_next_size || is_next_size == is_fixed;
 }
 
 static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
@@ -6415,11 +6433,11 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
 	 * helper function specification.
 	 */
 	if (arg_type_is_mem_size(fn->arg1_type) ||
-	    base_type(fn->arg5_type) == ARG_PTR_TO_MEM ||
-	    check_args_pair_invalid(fn->arg1_type, fn->arg2_type) ||
-	    check_args_pair_invalid(fn->arg2_type, fn->arg3_type) ||
-	    check_args_pair_invalid(fn->arg3_type, fn->arg4_type) ||
-	    check_args_pair_invalid(fn->arg4_type, fn->arg5_type))
+	    check_args_pair_invalid(fn, 0) ||
+	    check_args_pair_invalid(fn, 1) ||
+	    check_args_pair_invalid(fn, 2) ||
+	    check_args_pair_invalid(fn, 3) ||
+	    check_args_pair_invalid(fn, 4))
 		return false;
 
 	return true;
@@ -6460,7 +6478,10 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn)
 		if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i])
 			return false;
 
-		if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i])
+		if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i] &&
+		    /* arg_btf_id and arg_size are in a union. */
+		    (base_type(fn->arg_type[i]) != ARG_PTR_TO_MEM ||
+		     !(fn->arg_type[i] & MEM_FIXED_SIZE)))
 			return false;
 	}
 
-- 
cgit 


From 33bf9885040c399cf6a95bd33216644126728e14 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Wed, 15 Jun 2022 16:48:44 +0300
Subject: bpf: Add helpers to issue and check SYN cookies in XDP

The new helpers bpf_tcp_raw_{gen,check}_syncookie_ipv{4,6} allow an XDP
program to generate SYN cookies in response to TCP SYN packets and to
check those cookies upon receiving the first ACK packet (the final
packet of the TCP handshake).

Unlike bpf_tcp_{gen,check}_syncookie these new helpers don't need a
listening socket on the local machine, which allows to use them together
with synproxy to accelerate SYN cookie generation.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://lore.kernel.org/r/20220615134847.3753567-4-maximmi@nvidia.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/tcp.h              |   1 +
 include/uapi/linux/bpf.h       |  78 +++++++++++++++++++++++++++
 net/core/filter.c              | 118 +++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_input.c           |   3 +-
 scripts/bpf_doc.py             |   4 ++
 tools/include/uapi/linux/bpf.h |  78 +++++++++++++++++++++++++++
 6 files changed, 281 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 1e99f5c61f84..9a1efe23fab7 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -432,6 +432,7 @@ u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
 			 struct tcphdr *th, u32 *cookie);
 u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph,
 			 struct tcphdr *th, u32 *cookie);
+u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss);
 u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
 			  const struct tcp_request_sock_ops *af_ops,
 			  struct sock *sk, struct tcphdr *th);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f545e39df72a..e81362891596 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5251,6 +5251,80 @@ union bpf_attr {
  *		Pointer to the underlying dynptr data, NULL if the dynptr is
  *		read-only, if the dynptr is invalid, or if the offset and length
  *		is out of bounds.
+ *
+ * s64 bpf_tcp_raw_gen_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th, u32 th_len)
+ *	Description
+ *		Try to issue a SYN cookie for the packet with corresponding
+ *		IPv4/TCP headers, *iph* and *th*, without depending on a
+ *		listening socket.
+ *
+ *		*iph* points to the IPv4 header.
+ *
+ *		*th* points to the start of the TCP header, while *th_len*
+ *		contains the length of the TCP header (at least
+ *		**sizeof**\ (**struct tcphdr**)).
+ *	Return
+ *		On success, lower 32 bits hold the generated SYN cookie in
+ *		followed by 16 bits which hold the MSS value for that cookie,
+ *		and the top 16 bits are unused.
+ *
+ *		On failure, the returned value is one of the following:
+ *
+ *		**-EINVAL** if *th_len* is invalid.
+ *
+ * s64 bpf_tcp_raw_gen_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th, u32 th_len)
+ *	Description
+ *		Try to issue a SYN cookie for the packet with corresponding
+ *		IPv6/TCP headers, *iph* and *th*, without depending on a
+ *		listening socket.
+ *
+ *		*iph* points to the IPv6 header.
+ *
+ *		*th* points to the start of the TCP header, while *th_len*
+ *		contains the length of the TCP header (at least
+ *		**sizeof**\ (**struct tcphdr**)).
+ *	Return
+ *		On success, lower 32 bits hold the generated SYN cookie in
+ *		followed by 16 bits which hold the MSS value for that cookie,
+ *		and the top 16 bits are unused.
+ *
+ *		On failure, the returned value is one of the following:
+ *
+ *		**-EINVAL** if *th_len* is invalid.
+ *
+ *		**-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin.
+ *
+ * long bpf_tcp_raw_check_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th)
+ *	Description
+ *		Check whether *iph* and *th* contain a valid SYN cookie ACK
+ *		without depending on a listening socket.
+ *
+ *		*iph* points to the IPv4 header.
+ *
+ *		*th* points to the TCP header.
+ *	Return
+ *		0 if *iph* and *th* are a valid SYN cookie ACK.
+ *
+ *		On failure, the returned value is one of the following:
+ *
+ *		**-EACCES** if the SYN cookie is not valid.
+ *
+ * long bpf_tcp_raw_check_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th)
+ *	Description
+ *		Check whether *iph* and *th* contain a valid SYN cookie ACK
+ *		without depending on a listening socket.
+ *
+ *		*iph* points to the IPv6 header.
+ *
+ *		*th* points to the TCP header.
+ *	Return
+ *		0 if *iph* and *th* are a valid SYN cookie ACK.
+ *
+ *		On failure, the returned value is one of the following:
+ *
+ *		**-EACCES** if the SYN cookie is not valid.
+ *
+ *		**-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5457,6 +5531,10 @@ union bpf_attr {
 	FN(dynptr_read),		\
 	FN(dynptr_write),		\
 	FN(dynptr_data),		\
+	FN(tcp_raw_gen_syncookie_ipv4),	\
+	FN(tcp_raw_gen_syncookie_ipv6),	\
+	FN(tcp_raw_check_syncookie_ipv4),	\
+	FN(tcp_raw_check_syncookie_ipv6),	\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/net/core/filter.c b/net/core/filter.c
index 5af58eb48587..b62d4126a561 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -7444,6 +7444,114 @@ static const struct bpf_func_proto bpf_skb_set_tstamp_proto = {
 	.arg3_type      = ARG_ANYTHING,
 };
 
+#ifdef CONFIG_SYN_COOKIES
+BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv4, struct iphdr *, iph,
+	   struct tcphdr *, th, u32, th_len)
+{
+	u32 cookie;
+	u16 mss;
+
+	if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4))
+		return -EINVAL;
+
+	mss = tcp_parse_mss_option(th, 0) ?: TCP_MSS_DEFAULT;
+	cookie = __cookie_v4_init_sequence(iph, th, &mss);
+
+	return cookie | ((u64)mss << 32);
+}
+
+static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = {
+	.func		= bpf_tcp_raw_gen_syncookie_ipv4,
+	.gpl_only	= true, /* __cookie_v4_init_sequence() is GPL */
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_FIXED_SIZE_MEM,
+	.arg1_size	= sizeof(struct iphdr),
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+};
+
+BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph,
+	   struct tcphdr *, th, u32, th_len)
+{
+#if IS_BUILTIN(CONFIG_IPV6)
+	const u16 mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) -
+		sizeof(struct ipv6hdr);
+	u32 cookie;
+	u16 mss;
+
+	if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4))
+		return -EINVAL;
+
+	mss = tcp_parse_mss_option(th, 0) ?: mss_clamp;
+	cookie = __cookie_v6_init_sequence(iph, th, &mss);
+
+	return cookie | ((u64)mss << 32);
+#else
+	return -EPROTONOSUPPORT;
+#endif
+}
+
+static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = {
+	.func		= bpf_tcp_raw_gen_syncookie_ipv6,
+	.gpl_only	= true, /* __cookie_v6_init_sequence() is GPL */
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_FIXED_SIZE_MEM,
+	.arg1_size	= sizeof(struct ipv6hdr),
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+};
+
+BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph,
+	   struct tcphdr *, th)
+{
+	u32 cookie = ntohl(th->ack_seq) - 1;
+
+	if (__cookie_v4_check(iph, th, cookie) > 0)
+		return 0;
+
+	return -EACCES;
+}
+
+static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = {
+	.func		= bpf_tcp_raw_check_syncookie_ipv4,
+	.gpl_only	= true, /* __cookie_v4_check is GPL */
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_FIXED_SIZE_MEM,
+	.arg1_size	= sizeof(struct iphdr),
+	.arg2_type	= ARG_PTR_TO_FIXED_SIZE_MEM,
+	.arg2_size	= sizeof(struct tcphdr),
+};
+
+BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph,
+	   struct tcphdr *, th)
+{
+#if IS_BUILTIN(CONFIG_IPV6)
+	u32 cookie = ntohl(th->ack_seq) - 1;
+
+	if (__cookie_v6_check(iph, th, cookie) > 0)
+		return 0;
+
+	return -EACCES;
+#else
+	return -EPROTONOSUPPORT;
+#endif
+}
+
+static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = {
+	.func		= bpf_tcp_raw_check_syncookie_ipv6,
+	.gpl_only	= true, /* __cookie_v6_check is GPL */
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_FIXED_SIZE_MEM,
+	.arg1_size	= sizeof(struct ipv6hdr),
+	.arg2_type	= ARG_PTR_TO_FIXED_SIZE_MEM,
+	.arg2_size	= sizeof(struct tcphdr),
+};
+#endif /* CONFIG_SYN_COOKIES */
+
 #endif /* CONFIG_INET */
 
 bool bpf_helper_changes_pkt_data(void *func)
@@ -7856,6 +7964,16 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_tcp_check_syncookie_proto;
 	case BPF_FUNC_tcp_gen_syncookie:
 		return &bpf_tcp_gen_syncookie_proto;
+#ifdef CONFIG_SYN_COOKIES
+	case BPF_FUNC_tcp_raw_gen_syncookie_ipv4:
+		return &bpf_tcp_raw_gen_syncookie_ipv4_proto;
+	case BPF_FUNC_tcp_raw_gen_syncookie_ipv6:
+		return &bpf_tcp_raw_gen_syncookie_ipv6_proto;
+	case BPF_FUNC_tcp_raw_check_syncookie_ipv4:
+		return &bpf_tcp_raw_check_syncookie_ipv4_proto;
+	case BPF_FUNC_tcp_raw_check_syncookie_ipv6:
+		return &bpf_tcp_raw_check_syncookie_ipv6_proto;
+#endif
 #endif
 	default:
 		return bpf_sk_base_func_proto(func_id);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2e2a9ece9af2..6426f6a2e744 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3967,7 +3967,7 @@ static bool smc_parse_options(const struct tcphdr *th,
 /* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped
  * value on success.
  */
-static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
+u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
 {
 	const unsigned char *ptr = (const unsigned char *)(th + 1);
 	int length = (th->doff * 4) - sizeof(struct tcphdr);
@@ -4006,6 +4006,7 @@ static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
 	}
 	return mss;
 }
+EXPORT_SYMBOL_GPL(tcp_parse_mss_option);
 
 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
  * But, this can also be called on packets in the established flow when
diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py
index 855b937e7585..a0ec321469bd 100755
--- a/scripts/bpf_doc.py
+++ b/scripts/bpf_doc.py
@@ -635,6 +635,8 @@ class PrinterHelpers(Printer):
             'struct bpf_timer',
             'struct mptcp_sock',
             'struct bpf_dynptr',
+            'struct iphdr',
+            'struct ipv6hdr',
     ]
     known_types = {
             '...',
@@ -686,6 +688,8 @@ class PrinterHelpers(Printer):
             'struct bpf_timer',
             'struct mptcp_sock',
             'struct bpf_dynptr',
+            'struct iphdr',
+            'struct ipv6hdr',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f545e39df72a..e81362891596 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5251,6 +5251,80 @@ union bpf_attr {
  *		Pointer to the underlying dynptr data, NULL if the dynptr is
  *		read-only, if the dynptr is invalid, or if the offset and length
  *		is out of bounds.
+ *
+ * s64 bpf_tcp_raw_gen_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th, u32 th_len)
+ *	Description
+ *		Try to issue a SYN cookie for the packet with corresponding
+ *		IPv4/TCP headers, *iph* and *th*, without depending on a
+ *		listening socket.
+ *
+ *		*iph* points to the IPv4 header.
+ *
+ *		*th* points to the start of the TCP header, while *th_len*
+ *		contains the length of the TCP header (at least
+ *		**sizeof**\ (**struct tcphdr**)).
+ *	Return
+ *		On success, lower 32 bits hold the generated SYN cookie in
+ *		followed by 16 bits which hold the MSS value for that cookie,
+ *		and the top 16 bits are unused.
+ *
+ *		On failure, the returned value is one of the following:
+ *
+ *		**-EINVAL** if *th_len* is invalid.
+ *
+ * s64 bpf_tcp_raw_gen_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th, u32 th_len)
+ *	Description
+ *		Try to issue a SYN cookie for the packet with corresponding
+ *		IPv6/TCP headers, *iph* and *th*, without depending on a
+ *		listening socket.
+ *
+ *		*iph* points to the IPv6 header.
+ *
+ *		*th* points to the start of the TCP header, while *th_len*
+ *		contains the length of the TCP header (at least
+ *		**sizeof**\ (**struct tcphdr**)).
+ *	Return
+ *		On success, lower 32 bits hold the generated SYN cookie in
+ *		followed by 16 bits which hold the MSS value for that cookie,
+ *		and the top 16 bits are unused.
+ *
+ *		On failure, the returned value is one of the following:
+ *
+ *		**-EINVAL** if *th_len* is invalid.
+ *
+ *		**-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin.
+ *
+ * long bpf_tcp_raw_check_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th)
+ *	Description
+ *		Check whether *iph* and *th* contain a valid SYN cookie ACK
+ *		without depending on a listening socket.
+ *
+ *		*iph* points to the IPv4 header.
+ *
+ *		*th* points to the TCP header.
+ *	Return
+ *		0 if *iph* and *th* are a valid SYN cookie ACK.
+ *
+ *		On failure, the returned value is one of the following:
+ *
+ *		**-EACCES** if the SYN cookie is not valid.
+ *
+ * long bpf_tcp_raw_check_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th)
+ *	Description
+ *		Check whether *iph* and *th* contain a valid SYN cookie ACK
+ *		without depending on a listening socket.
+ *
+ *		*iph* points to the IPv6 header.
+ *
+ *		*th* points to the TCP header.
+ *	Return
+ *		0 if *iph* and *th* are a valid SYN cookie ACK.
+ *
+ *		On failure, the returned value is one of the following:
+ *
+ *		**-EACCES** if the SYN cookie is not valid.
+ *
+ *		**-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5457,6 +5531,10 @@ union bpf_attr {
 	FN(dynptr_read),		\
 	FN(dynptr_write),		\
 	FN(dynptr_data),		\
+	FN(tcp_raw_gen_syncookie_ipv4),	\
+	FN(tcp_raw_gen_syncookie_ipv6),	\
+	FN(tcp_raw_check_syncookie_ipv4),	\
+	FN(tcp_raw_check_syncookie_ipv6),	\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- 
cgit 


From f0a6d77b351c18c122fc1638ac9e58f5e0346f64 Mon Sep 17 00:00:00 2001
From: Sergey Shtylyov <s.shtylyov@omp.ru>
Date: Tue, 14 Jun 2022 22:51:47 +0300
Subject: ata: make transfer mode masks *unsigned int*

The packed transfer mode masks and also the {pio|mwdma|udma}_mask fields
of *struct*s ata_device and ata_port_info are declared as *unsigned long*
(which is a 64-bit type on 64-bit architectures) but actually the packed
masks occupy only 20 bits (7 PIO modes, 5 MWDMA modes, and 8 UDMA modes)
and the PIO/MWDMA/UDMA masks easily fit into just 8 bits each, so we can
safely use (always 32-bit) *unsigned int* variables instead.  This saves
745 bytes of object code in libata-core.o alone, not to mention LLDDs...

Signed-off-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 drivers/ata/libata-acpi.c      |  8 +++----
 drivers/ata/libata-core.c      | 38 ++++++++++++++++----------------
 drivers/ata/pata_acpi.c        |  2 +-
 drivers/ata/pata_ali.c         |  2 +-
 drivers/ata/pata_amd.c         | 14 ++++++------
 drivers/ata/pata_hpt366.c      |  2 +-
 drivers/ata/pata_hpt37x.c      |  6 +++---
 drivers/ata/pata_hpt3x2n.c     |  2 +-
 drivers/ata/pata_pdc2027x.c    |  4 ++--
 drivers/ata/pata_serverworks.c |  4 ++--
 drivers/ata/pata_sis.c         |  2 +-
 drivers/ata/pata_via.c         |  2 +-
 include/linux/libata.h         | 49 +++++++++++++++++++++---------------------
 13 files changed, 67 insertions(+), 68 deletions(-)

(limited to 'include')

diff --git a/drivers/ata/libata-acpi.c b/drivers/ata/libata-acpi.c
index 3d345d173556..61b4ccf88bf1 100644
--- a/drivers/ata/libata-acpi.c
+++ b/drivers/ata/libata-acpi.c
@@ -480,10 +480,10 @@ static int ata_dev_get_GTF(struct ata_device *dev, struct ata_acpi_gtf **gtf)
  * RETURNS:
  * Determined xfermask.
  */
-unsigned long ata_acpi_gtm_xfermask(struct ata_device *dev,
-				    const struct ata_acpi_gtm *gtm)
+unsigned int ata_acpi_gtm_xfermask(struct ata_device *dev,
+				   const struct ata_acpi_gtm *gtm)
 {
-	unsigned long xfer_mask = 0;
+	unsigned int xfer_mask = 0;
 	unsigned int type;
 	int unit;
 	u8 mode;
@@ -525,7 +525,7 @@ int ata_acpi_cbl_80wire(struct ata_port *ap, const struct ata_acpi_gtm *gtm)
 	struct ata_device *dev;
 
 	ata_for_each_dev(dev, &ap->link, ENABLED) {
-		unsigned long xfer_mask, udma_mask;
+		unsigned int xfer_mask, udma_mask;
 
 		xfer_mask = ata_acpi_gtm_xfermask(dev, gtm);
 		ata_unpack_xfermask(xfer_mask, NULL, NULL, &udma_mask);
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 980328a4b896..035092184c08 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -93,7 +93,7 @@ struct ata_force_param {
 	const char	*name;
 	u8		cbl;
 	u8		spd_limit;
-	unsigned long	xfer_mask;
+	unsigned int	xfer_mask;
 	unsigned int	horkage_on;
 	unsigned int	horkage_off;
 	u16		lflags_on;
@@ -425,7 +425,7 @@ static void ata_force_xfermask(struct ata_device *dev)
 
 	for (i = ata_force_tbl_size - 1; i >= 0; i--) {
 		const struct ata_force_ent *fe = &ata_force_tbl[i];
-		unsigned long pio_mask, mwdma_mask, udma_mask;
+		unsigned int pio_mask, mwdma_mask, udma_mask;
 
 		if (fe->port != -1 && fe->port != dev->link->ap->print_id)
 			continue;
@@ -803,11 +803,11 @@ int ata_build_rw_tf(struct ata_taskfile *tf, struct ata_device *dev,
  *	RETURNS:
  *	Packed xfer_mask.
  */
-unsigned long ata_pack_xfermask(unsigned long pio_mask,
-				unsigned long mwdma_mask,
-				unsigned long udma_mask)
+unsigned int ata_pack_xfermask(unsigned int pio_mask,
+			       unsigned int mwdma_mask,
+			       unsigned int udma_mask)
 {
-	return ((pio_mask << ATA_SHIFT_PIO) & ATA_MASK_PIO) |
+	return	((pio_mask << ATA_SHIFT_PIO) & ATA_MASK_PIO) |
 		((mwdma_mask << ATA_SHIFT_MWDMA) & ATA_MASK_MWDMA) |
 		((udma_mask << ATA_SHIFT_UDMA) & ATA_MASK_UDMA);
 }
@@ -823,8 +823,8 @@ EXPORT_SYMBOL_GPL(ata_pack_xfermask);
  *	Unpack @xfer_mask into @pio_mask, @mwdma_mask and @udma_mask.
  *	Any NULL destination masks will be ignored.
  */
-void ata_unpack_xfermask(unsigned long xfer_mask, unsigned long *pio_mask,
-			 unsigned long *mwdma_mask, unsigned long *udma_mask)
+void ata_unpack_xfermask(unsigned int xfer_mask, unsigned int *pio_mask,
+			 unsigned int *mwdma_mask, unsigned int *udma_mask)
 {
 	if (pio_mask)
 		*pio_mask = (xfer_mask & ATA_MASK_PIO) >> ATA_SHIFT_PIO;
@@ -857,7 +857,7 @@ static const struct ata_xfer_ent {
  *	RETURNS:
  *	Matching XFER_* value, 0xff if no match found.
  */
-u8 ata_xfer_mask2mode(unsigned long xfer_mask)
+u8 ata_xfer_mask2mode(unsigned int xfer_mask)
 {
 	int highbit = fls(xfer_mask) - 1;
 	const struct ata_xfer_ent *ent;
@@ -881,7 +881,7 @@ EXPORT_SYMBOL_GPL(ata_xfer_mask2mode);
  *	RETURNS:
  *	Matching xfer_mask, 0 if no match found.
  */
-unsigned long ata_xfer_mode2mask(u8 xfer_mode)
+unsigned int ata_xfer_mode2mask(u8 xfer_mode)
 {
 	const struct ata_xfer_ent *ent;
 
@@ -930,7 +930,7 @@ EXPORT_SYMBOL_GPL(ata_xfer_mode2shift);
  *	Constant C string representing highest speed listed in
  *	@mode_mask, or the constant C string "<n/a>".
  */
-const char *ata_mode_string(unsigned long xfer_mask)
+const char *ata_mode_string(unsigned int xfer_mask)
 {
 	static const char * const xfer_mode_str[] = {
 		"PIO0",
@@ -1383,9 +1383,9 @@ static inline void ata_dump_id(struct ata_device *dev, const u16 *id)
  *	RETURNS:
  *	Computed xfermask
  */
-unsigned long ata_id_xfermask(const u16 *id)
+unsigned int ata_id_xfermask(const u16 *id)
 {
-	unsigned long pio_mask, mwdma_mask, udma_mask;
+	unsigned int pio_mask, mwdma_mask, udma_mask;
 
 	/* Usual case. Word 53 indicates word 64 is valid */
 	if (id[ATA_ID_FIELD_VALID] & (1 << 1)) {
@@ -2534,7 +2534,7 @@ int ata_dev_configure(struct ata_device *dev)
 	struct ata_port *ap = dev->link->ap;
 	bool print_info = ata_dev_print_info(dev);
 	const u16 *id = dev->id;
-	unsigned long xfer_mask;
+	unsigned int xfer_mask;
 	unsigned int err_mask;
 	char revbuf[7];		/* XYZ-99\0 */
 	char fwrevbuf[ATA_ID_FW_REV_LEN+1];
@@ -3202,8 +3202,8 @@ u8 ata_timing_cycle2mode(unsigned int xfer_shift, int cycle)
 int ata_down_xfermask_limit(struct ata_device *dev, unsigned int sel)
 {
 	char buf[32];
-	unsigned long orig_mask, xfer_mask;
-	unsigned long pio_mask, mwdma_mask, udma_mask;
+	unsigned int orig_mask, xfer_mask;
+	unsigned int pio_mask, mwdma_mask, udma_mask;
 	int quiet, highbit;
 
 	quiet = !!(sel & ATA_DNXFER_QUIET);
@@ -3381,7 +3381,7 @@ int ata_do_set_mode(struct ata_link *link, struct ata_device **r_failed_dev)
 
 	/* step 1: calculate xfer_mask */
 	ata_for_each_dev(dev, link, ENABLED) {
-		unsigned long pio_mask, dma_mask;
+		unsigned int pio_mask, dma_mask;
 		unsigned int mode_mask;
 
 		mode_mask = ATA_DMA_MASK_ATA;
@@ -4217,7 +4217,7 @@ static void ata_dev_xfermask(struct ata_device *dev)
 	struct ata_link *link = dev->link;
 	struct ata_port *ap = link->ap;
 	struct ata_host *host = ap->host;
-	unsigned long xfer_mask;
+	unsigned int xfer_mask;
 
 	/* controller modes available */
 	xfer_mask = ata_pack_xfermask(ap->pio_mask,
@@ -5776,7 +5776,7 @@ int ata_host_register(struct ata_host *host, struct scsi_host_template *sht)
 	/* set cable, sata_spd_limit and report */
 	for (i = 0; i < host->n_ports; i++) {
 		struct ata_port *ap = host->ports[i];
-		unsigned long xfer_mask;
+		unsigned int xfer_mask;
 
 		/* set SATA cable type if still unset */
 		if (ap->cbl == ATA_CBL_NONE && (ap->flags & ATA_FLAG_SATA))
diff --git a/drivers/ata/pata_acpi.c b/drivers/ata/pata_acpi.c
index ade4c3eee230..f8706ee427d2 100644
--- a/drivers/ata/pata_acpi.c
+++ b/drivers/ata/pata_acpi.c
@@ -97,7 +97,7 @@ static unsigned long pacpi_discover_modes(struct ata_port *ap, struct ata_device
  *	this case the list of discovered valid modes obtained by ACPI probing
  */
 
-static unsigned long pacpi_mode_filter(struct ata_device *adev, unsigned long mask)
+static unsigned int pacpi_mode_filter(struct ata_device *adev, unsigned int mask)
 {
 	struct pata_acpi *acpi = adev->link->ap->private_data;
 	return mask & acpi->mask[adev->devno];
diff --git a/drivers/ata/pata_ali.c b/drivers/ata/pata_ali.c
index 1b90cda27246..76ad0e73fe2a 100644
--- a/drivers/ata/pata_ali.c
+++ b/drivers/ata/pata_ali.c
@@ -115,7 +115,7 @@ static int ali_c2_cable_detect(struct ata_port *ap)
  *	fix that later on. Also ensure we do not do UDMA on WDC drives
  */
 
-static unsigned long ali_20_filter(struct ata_device *adev, unsigned long mask)
+static unsigned int ali_20_filter(struct ata_device *adev, unsigned int mask)
 {
 	char model_num[ATA_ID_PROD_LEN + 1];
 	/* No DMA on anything but a disk for now */
diff --git a/drivers/ata/pata_amd.c b/drivers/ata/pata_amd.c
index 154748cfcc79..f216f9d7b9ec 100644
--- a/drivers/ata/pata_amd.c
+++ b/drivers/ata/pata_amd.c
@@ -264,8 +264,8 @@ static void amd133_set_dmamode(struct ata_port *ap, struct ata_device *adev)
  * cached during driver attach and are consulted to select transfer
  * mode.
  */
-static unsigned long nv_mode_filter(struct ata_device *dev,
-				    unsigned long xfer_mask)
+static unsigned int nv_mode_filter(struct ata_device *dev,
+				   unsigned int xfer_mask)
 {
 	static const unsigned int udma_mask_map[] =
 		{ ATA_UDMA2, ATA_UDMA1, ATA_UDMA0, 0,
@@ -274,7 +274,7 @@ static unsigned long nv_mode_filter(struct ata_device *dev,
 	char acpi_str[32] = "";
 	u32 saved_udma, udma;
 	const struct ata_acpi_gtm *gtm;
-	unsigned long bios_limit = 0, acpi_limit = 0, limit;
+	unsigned int bios_limit = 0, acpi_limit = 0, limit;
 
 	/* find out what BIOS configured */
 	udma = saved_udma = (unsigned long)ap->host->private_data;
@@ -310,10 +310,10 @@ static unsigned long nv_mode_filter(struct ata_device *dev,
 	   cable detection result */
 	limit |= ata_pack_xfermask(ATA_PIO4, ATA_MWDMA2, ATA_UDMA2);
 
-	ata_port_dbg(ap, "nv_mode_filter: 0x%lx&0x%lx->0x%lx, "
-			"BIOS=0x%lx (0x%x) ACPI=0x%lx%s\n",
-			xfer_mask, limit, xfer_mask & limit, bios_limit,
-			saved_udma, acpi_limit, acpi_str);
+	ata_port_dbg(ap,
+		     "nv_mode_filter: 0x%x&0x%x->0x%x, BIOS=0x%x (0x%x) ACPI=0x%x%s\n",
+		     xfer_mask, limit, xfer_mask & limit, bios_limit,
+		     saved_udma, acpi_limit, acpi_str);
 
 	return xfer_mask & limit;
 }
diff --git a/drivers/ata/pata_hpt366.c b/drivers/ata/pata_hpt366.c
index c99e8f0708b3..7e441fb304d3 100644
--- a/drivers/ata/pata_hpt366.c
+++ b/drivers/ata/pata_hpt366.c
@@ -194,7 +194,7 @@ static int hpt_dma_blacklisted(const struct ata_device *dev, char *modestr,
  *	Block UDMA on devices that cause trouble with this controller.
  */
 
-static unsigned long hpt366_filter(struct ata_device *adev, unsigned long mask)
+static unsigned int hpt366_filter(struct ata_device *adev, unsigned int mask)
 {
 	if (adev->class == ATA_DEV_ATA) {
 		if (hpt_dma_blacklisted(adev, "UDMA",  bad_ata33))
diff --git a/drivers/ata/pata_hpt37x.c b/drivers/ata/pata_hpt37x.c
index d1a3d99d5d0a..ce3c5eaa7e76 100644
--- a/drivers/ata/pata_hpt37x.c
+++ b/drivers/ata/pata_hpt37x.c
@@ -278,7 +278,7 @@ static const char * const bad_ata100_5[] = {
  *	Block UDMA on devices that cause trouble with this controller.
  */
 
-static unsigned long hpt370_filter(struct ata_device *adev, unsigned long mask)
+static unsigned int hpt370_filter(struct ata_device *adev, unsigned int mask)
 {
 	if (adev->class == ATA_DEV_ATA) {
 		if (hpt_dma_blacklisted(adev, "UDMA", bad_ata33))
@@ -297,7 +297,7 @@ static unsigned long hpt370_filter(struct ata_device *adev, unsigned long mask)
  *	Block UDMA on devices that cause trouble with this controller.
  */
 
-static unsigned long hpt370a_filter(struct ata_device *adev, unsigned long mask)
+static unsigned int hpt370a_filter(struct ata_device *adev, unsigned int mask)
 {
 	if (adev->class == ATA_DEV_ATA) {
 		if (hpt_dma_blacklisted(adev, "UDMA100", bad_ata100_5))
@@ -314,7 +314,7 @@ static unsigned long hpt370a_filter(struct ata_device *adev, unsigned long mask)
  *	The Marvell bridge chips used on the HighPoint SATA cards do not seem
  *	to support the UltraDMA modes 1, 2, and 3 as well as any MWDMA modes...
  */
-static unsigned long hpt372_filter(struct ata_device *adev, unsigned long mask)
+static unsigned int hpt372_filter(struct ata_device *adev, unsigned int mask)
 {
 	if (ata_id_is_sata(adev->id))
 		mask &= ~((0xE << ATA_SHIFT_UDMA) | ATA_MASK_MWDMA);
diff --git a/drivers/ata/pata_hpt3x2n.c b/drivers/ata/pata_hpt3x2n.c
index d1595e17dca2..617c95522f43 100644
--- a/drivers/ata/pata_hpt3x2n.c
+++ b/drivers/ata/pata_hpt3x2n.c
@@ -113,7 +113,7 @@ static u32 hpt3x2n_find_mode(struct ata_port *ap, int speed)
  *	The Marvell bridge chips used on the HighPoint SATA cards do not seem
  *	to support the UltraDMA modes 1, 2, and 3 as well as any MWDMA modes...
  */
-static unsigned long hpt372n_filter(struct ata_device *adev, unsigned long mask)
+static unsigned int hpt372n_filter(struct ata_device *adev, unsigned int mask)
 {
 	if (ata_id_is_sata(adev->id))
 		mask &= ~((0xE << ATA_SHIFT_UDMA) | ATA_MASK_MWDMA);
diff --git a/drivers/ata/pata_pdc2027x.c b/drivers/ata/pata_pdc2027x.c
index 4fbb3eed8b0b..4191aa61c8e4 100644
--- a/drivers/ata/pata_pdc2027x.c
+++ b/drivers/ata/pata_pdc2027x.c
@@ -57,7 +57,7 @@ static int pdc2027x_prereset(struct ata_link *link, unsigned long deadline);
 static void pdc2027x_set_piomode(struct ata_port *ap, struct ata_device *adev);
 static void pdc2027x_set_dmamode(struct ata_port *ap, struct ata_device *adev);
 static int pdc2027x_check_atapi_dma(struct ata_queued_cmd *qc);
-static unsigned long pdc2027x_mode_filter(struct ata_device *adev, unsigned long mask);
+static unsigned int pdc2027x_mode_filter(struct ata_device *adev, unsigned int mask);
 static int pdc2027x_cable_detect(struct ata_port *ap);
 static int pdc2027x_set_mode(struct ata_link *link, struct ata_device **r_failed);
 
@@ -251,7 +251,7 @@ static int pdc2027x_prereset(struct ata_link *link, unsigned long deadline)
  *	Block UDMA on devices that cause trouble with this controller.
  */
 
-static unsigned long pdc2027x_mode_filter(struct ata_device *adev, unsigned long mask)
+static unsigned int pdc2027x_mode_filter(struct ata_device *adev, unsigned int mask)
 {
 	unsigned char model_num[ATA_ID_PROD_LEN + 1];
 	struct ata_device *pair = ata_dev_pair(adev);
diff --git a/drivers/ata/pata_serverworks.c b/drivers/ata/pata_serverworks.c
index e410fe44177f..c0bc4af0d196 100644
--- a/drivers/ata/pata_serverworks.c
+++ b/drivers/ata/pata_serverworks.c
@@ -150,7 +150,7 @@ static u8 serverworks_is_csb(struct pci_dev *pdev)
  *	bug we hit.
  */
 
-static unsigned long serverworks_osb4_filter(struct ata_device *adev, unsigned long mask)
+static unsigned int serverworks_osb4_filter(struct ata_device *adev, unsigned int mask)
 {
 	if (adev->class == ATA_DEV_ATA)
 		mask &= ~ATA_MASK_UDMA;
@@ -166,7 +166,7 @@ static unsigned long serverworks_osb4_filter(struct ata_device *adev, unsigned l
  *	Check the blacklist and disable UDMA5 if matched
  */
 
-static unsigned long serverworks_csb_filter(struct ata_device *adev, unsigned long mask)
+static unsigned int serverworks_csb_filter(struct ata_device *adev, unsigned int mask)
 {
 	const char *p;
 	char model_num[ATA_ID_PROD_LEN + 1];
diff --git a/drivers/ata/pata_sis.c b/drivers/ata/pata_sis.c
index b5b764e18adf..92e4cf05de2c 100644
--- a/drivers/ata/pata_sis.c
+++ b/drivers/ata/pata_sis.c
@@ -525,7 +525,7 @@ static void sis_133_set_dmamode (struct ata_port *ap, struct ata_device *adev)
  *	Block UDMA6 on devices that do not support it.
  */
 
-static unsigned long sis_133_mode_filter(struct ata_device *adev, unsigned long mask)
+static unsigned int sis_133_mode_filter(struct ata_device *adev, unsigned int mask)
 {
 	struct ata_port *ap = adev->link->ap;
 	struct pci_dev *pdev = to_pci_dev(ap->host->dev);
diff --git a/drivers/ata/pata_via.c b/drivers/ata/pata_via.c
index 215c02d4056a..34f00f389932 100644
--- a/drivers/ata/pata_via.c
+++ b/drivers/ata/pata_via.c
@@ -352,7 +352,7 @@ static void via_set_dmamode(struct ata_port *ap, struct ata_device *adev)
  *	one breed of Transcend SSD. Return the updated mask.
  */
 
-static unsigned long via_mode_filter(struct ata_device *dev, unsigned long mask)
+static unsigned int via_mode_filter(struct ata_device *dev, unsigned int mask)
 {
 	struct ata_host *host = dev->link->ap->host;
 	const struct via_isa_bridge *config = host->private_data;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 0f2a59c9c735..a8bc88b4fe07 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -275,7 +275,7 @@ enum {
 	PORT_DISABLED		= 2,
 
 	/* encoding various smaller bitmaps into a single
-	 * unsigned long bitmap
+	 * unsigned int bitmap
 	 */
 	ATA_NR_PIO_MODES	= 7,
 	ATA_NR_MWDMA_MODES	= 5,
@@ -426,12 +426,9 @@ enum {
 };
 
 enum ata_xfer_mask {
-	ATA_MASK_PIO		= ((1LU << ATA_NR_PIO_MODES) - 1)
-					<< ATA_SHIFT_PIO,
-	ATA_MASK_MWDMA		= ((1LU << ATA_NR_MWDMA_MODES) - 1)
-					<< ATA_SHIFT_MWDMA,
-	ATA_MASK_UDMA		= ((1LU << ATA_NR_UDMA_MODES) - 1)
-					<< ATA_SHIFT_UDMA,
+	ATA_MASK_PIO		= ((1U << ATA_NR_PIO_MODES) - 1) << ATA_SHIFT_PIO,
+	ATA_MASK_MWDMA		= ((1U << ATA_NR_MWDMA_MODES) - 1) << ATA_SHIFT_MWDMA,
+	ATA_MASK_UDMA		= ((1U << ATA_NR_UDMA_MODES) - 1) << ATA_SHIFT_UDMA,
 };
 
 enum hsm_task_states {
@@ -680,9 +677,9 @@ struct ata_device {
 	unsigned int		cdb_len;
 
 	/* per-dev xfer mask */
-	unsigned long		pio_mask;
-	unsigned long		mwdma_mask;
-	unsigned long		udma_mask;
+	unsigned int		pio_mask;
+	unsigned int		mwdma_mask;
+	unsigned int		udma_mask;
 
 	/* for CHS addressing */
 	u16			cylinders;	/* Number of cylinders */
@@ -885,7 +882,7 @@ struct ata_port_operations {
 	 * Configuration and exception handling
 	 */
 	int  (*cable_detect)(struct ata_port *ap);
-	unsigned long (*mode_filter)(struct ata_device *dev, unsigned long xfer_mask);
+	unsigned int (*mode_filter)(struct ata_device *dev, unsigned int xfer_mask);
 	void (*set_piomode)(struct ata_port *ap, struct ata_device *dev);
 	void (*set_dmamode)(struct ata_port *ap, struct ata_device *dev);
 	int  (*set_mode)(struct ata_link *link, struct ata_device **r_failed_dev);
@@ -981,9 +978,9 @@ struct ata_port_operations {
 struct ata_port_info {
 	unsigned long		flags;
 	unsigned long		link_flags;
-	unsigned long		pio_mask;
-	unsigned long		mwdma_mask;
-	unsigned long		udma_mask;
+	unsigned int		pio_mask;
+	unsigned int		mwdma_mask;
+	unsigned int		udma_mask;
 	struct ata_port_operations *port_ops;
 	void 			*private_data;
 };
@@ -1102,16 +1099,18 @@ extern void ata_msleep(struct ata_port *ap, unsigned int msecs);
 extern u32 ata_wait_register(struct ata_port *ap, void __iomem *reg, u32 mask,
 			u32 val, unsigned long interval, unsigned long timeout);
 extern int atapi_cmd_type(u8 opcode);
-extern unsigned long ata_pack_xfermask(unsigned long pio_mask,
-			unsigned long mwdma_mask, unsigned long udma_mask);
-extern void ata_unpack_xfermask(unsigned long xfer_mask,
-			unsigned long *pio_mask, unsigned long *mwdma_mask,
-			unsigned long *udma_mask);
-extern u8 ata_xfer_mask2mode(unsigned long xfer_mask);
-extern unsigned long ata_xfer_mode2mask(u8 xfer_mode);
+extern unsigned int ata_pack_xfermask(unsigned int pio_mask,
+				      unsigned int mwdma_mask,
+				      unsigned int udma_mask);
+extern void ata_unpack_xfermask(unsigned int xfer_mask,
+				unsigned int *pio_mask,
+				unsigned int *mwdma_mask,
+				unsigned int *udma_mask);
+extern u8 ata_xfer_mask2mode(unsigned int xfer_mask);
+extern unsigned int ata_xfer_mode2mask(u8 xfer_mode);
 extern int ata_xfer_mode2shift(u8 xfer_mode);
-extern const char *ata_mode_string(unsigned long xfer_mask);
-extern unsigned long ata_id_xfermask(const u16 *id);
+extern const char *ata_mode_string(unsigned int xfer_mask);
+extern unsigned int ata_id_xfermask(const u16 *id);
 extern int ata_std_qc_defer(struct ata_queued_cmd *qc);
 extern enum ata_completion_errors ata_noop_qc_prep(struct ata_queued_cmd *qc);
 extern void ata_sg_init(struct ata_queued_cmd *qc, struct scatterlist *sg,
@@ -1283,8 +1282,8 @@ static inline const struct ata_acpi_gtm *ata_acpi_init_gtm(struct ata_port *ap)
 }
 int ata_acpi_stm(struct ata_port *ap, const struct ata_acpi_gtm *stm);
 int ata_acpi_gtm(struct ata_port *ap, struct ata_acpi_gtm *stm);
-unsigned long ata_acpi_gtm_xfermask(struct ata_device *dev,
-				    const struct ata_acpi_gtm *gtm);
+unsigned int ata_acpi_gtm_xfermask(struct ata_device *dev,
+				   const struct ata_acpi_gtm *gtm);
 int ata_acpi_cbl_80wire(struct ata_port *ap, const struct ata_acpi_gtm *gtm);
 #else
 static inline const struct ata_acpi_gtm *ata_acpi_init_gtm(struct ata_port *ap)
-- 
cgit 


From d64de9773c18409d2161228242968ff3ebe3707e Mon Sep 17 00:00:00 2001
From: Weili Qian <qianweili@huawei.com>
Date: Thu, 9 Jun 2022 20:31:19 +0800
Subject: crypto: hisilicon/qm - modify event irq processing

When the driver receives an event interrupt, the driver will enable
the event interrupt after handling all completed tasks on the function,
tasks on the function are parsed through only one thread. If the task's
user callback takes time, other tasks on the function will be blocked.

Therefore, the event irq processing is modified as follows:
1. Obtain the ID of the queue that completes the task.
2. Enable event interrupt.
3. Parse the completed tasks in the queue and call the user callback.
Enabling event interrupt in advance can quickly report pending event
interrupts and process tasks in multiple threads.

Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/hisilicon/qm.c | 142 +++++++++++++++++++++++++++---------------
 include/linux/hisi_acc_qm.h   |   8 ++-
 2 files changed, 99 insertions(+), 51 deletions(-)

(limited to 'include')

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index f8d36b68494e..ad83c194d664 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -877,13 +877,6 @@ static void qm_pm_put_sync(struct hisi_qm *qm)
 	pm_runtime_put_autosuspend(dev);
 }
 
-static struct hisi_qp *qm_to_hisi_qp(struct hisi_qm *qm, struct qm_eqe *eqe)
-{
-	u16 cqn = le32_to_cpu(eqe->dw0) & QM_EQE_CQN_MASK;
-
-	return &qm->qp_array[cqn];
-}
-
 static void qm_cq_head_update(struct hisi_qp *qp)
 {
 	if (qp->qp_status.cq_head == QM_Q_DEPTH - 1) {
@@ -894,47 +887,37 @@ static void qm_cq_head_update(struct hisi_qp *qp)
 	}
 }
 
-static void qm_poll_qp(struct hisi_qp *qp, struct hisi_qm *qm)
+static void qm_poll_req_cb(struct hisi_qp *qp)
 {
-	if (unlikely(atomic_read(&qp->qp_status.flags) == QP_STOP))
-		return;
-
-	if (qp->event_cb) {
-		qp->event_cb(qp);
-		return;
-	}
-
-	if (qp->req_cb) {
-		struct qm_cqe *cqe = qp->cqe + qp->qp_status.cq_head;
-
-		while (QM_CQE_PHASE(cqe) == qp->qp_status.cqc_phase) {
-			dma_rmb();
-			qp->req_cb(qp, qp->sqe + qm->sqe_size *
-				   le16_to_cpu(cqe->sq_head));
-			qm_cq_head_update(qp);
-			cqe = qp->cqe + qp->qp_status.cq_head;
-			qm_db(qm, qp->qp_id, QM_DOORBELL_CMD_CQ,
-			      qp->qp_status.cq_head, 0);
-			atomic_dec(&qp->qp_status.used);
-		}
+	struct qm_cqe *cqe = qp->cqe + qp->qp_status.cq_head;
+	struct hisi_qm *qm = qp->qm;
 
-		/* set c_flag */
+	while (QM_CQE_PHASE(cqe) == qp->qp_status.cqc_phase) {
+		dma_rmb();
+		qp->req_cb(qp, qp->sqe + qm->sqe_size *
+			   le16_to_cpu(cqe->sq_head));
+		qm_cq_head_update(qp);
+		cqe = qp->cqe + qp->qp_status.cq_head;
 		qm_db(qm, qp->qp_id, QM_DOORBELL_CMD_CQ,
-		      qp->qp_status.cq_head, 1);
+		      qp->qp_status.cq_head, 0);
+		atomic_dec(&qp->qp_status.used);
 	}
+
+	/* set c_flag */
+	qm_db(qm, qp->qp_id, QM_DOORBELL_CMD_CQ, qp->qp_status.cq_head, 1);
 }
 
-static void qm_work_process(struct work_struct *work)
+static int qm_get_complete_eqe_num(struct hisi_qm_poll_data *poll_data)
 {
-	struct hisi_qm *qm = container_of(work, struct hisi_qm, work);
+	struct hisi_qm *qm = poll_data->qm;
 	struct qm_eqe *eqe = qm->eqe + qm->status.eq_head;
-	struct hisi_qp *qp;
 	int eqe_num = 0;
+	u16 cqn;
 
 	while (QM_EQE_PHASE(eqe) == qm->status.eqc_phase) {
+		cqn = le32_to_cpu(eqe->dw0) & QM_EQE_CQN_MASK;
+		poll_data->qp_finish_id[eqe_num] = cqn;
 		eqe_num++;
-		qp = qm_to_hisi_qp(qm, eqe);
-		qm_poll_qp(qp, qm);
 
 		if (qm->status.eq_head == QM_EQ_DEPTH - 1) {
 			qm->status.eqc_phase = !qm->status.eqc_phase;
@@ -945,34 +928,70 @@ static void qm_work_process(struct work_struct *work)
 			qm->status.eq_head++;
 		}
 
-		if (eqe_num == QM_EQ_DEPTH / 2 - 1) {
-			eqe_num = 0;
-			qm_db(qm, 0, QM_DOORBELL_CMD_EQ, qm->status.eq_head, 0);
-		}
+		if (eqe_num == (QM_EQ_DEPTH >> 1) - 1)
+			break;
 	}
 
 	qm_db(qm, 0, QM_DOORBELL_CMD_EQ, qm->status.eq_head, 0);
+
+	return eqe_num;
 }
 
-static irqreturn_t do_qm_irq(int irq, void *data)
+static void qm_work_process(struct work_struct *work)
 {
-	struct hisi_qm *qm = (struct hisi_qm *)data;
+	struct hisi_qm_poll_data *poll_data =
+		container_of(work, struct hisi_qm_poll_data, work);
+	struct hisi_qm *qm = poll_data->qm;
+	struct hisi_qp *qp;
+	int eqe_num, i;
 
-	/* the workqueue created by device driver of QM */
-	queue_work(qm->wq, &qm->work);
+	/* Get qp id of completed tasks and re-enable the interrupt. */
+	eqe_num = qm_get_complete_eqe_num(poll_data);
+	for (i = eqe_num - 1; i >= 0; i--) {
+		qp = &qm->qp_array[poll_data->qp_finish_id[i]];
+		if (unlikely(atomic_read(&qp->qp_status.flags) == QP_STOP))
+			continue;
 
-	return IRQ_HANDLED;
+		if (qp->event_cb) {
+			qp->event_cb(qp);
+			continue;
+		}
+
+		if (likely(qp->req_cb))
+			qm_poll_req_cb(qp);
+	}
+}
+
+static bool do_qm_irq(struct hisi_qm *qm)
+{
+	struct qm_eqe *eqe = qm->eqe + qm->status.eq_head;
+	struct hisi_qm_poll_data *poll_data;
+	u16 cqn;
+
+	if (!readl(qm->io_base + QM_VF_EQ_INT_SOURCE))
+		return false;
+
+	if (QM_EQE_PHASE(eqe) == qm->status.eqc_phase) {
+		cqn = le32_to_cpu(eqe->dw0) & QM_EQE_CQN_MASK;
+		poll_data = &qm->poll_data[cqn];
+		queue_work(qm->wq, &poll_data->work);
+
+		return true;
+	}
+
+	return false;
 }
 
 static irqreturn_t qm_irq(int irq, void *data)
 {
 	struct hisi_qm *qm = data;
+	bool ret;
 
-	if (readl(qm->io_base + QM_VF_EQ_INT_SOURCE))
-		return do_qm_irq(irq, data);
+	ret = do_qm_irq(qm);
+	if (ret)
+		return IRQ_HANDLED;
 
 	atomic64_inc(&qm->debug.dfx.err_irq_cnt);
-	dev_err(&qm->pdev->dev, "invalid int source\n");
 	qm_db(qm, 0, QM_DOORBELL_CMD_EQ, qm->status.eq_head, 0);
 
 	return IRQ_NONE;
@@ -3551,8 +3570,10 @@ static void hisi_qp_memory_uninit(struct hisi_qm *qm, int num)
 	for (i = num - 1; i >= 0; i--) {
 		qdma = &qm->qp_array[i].qdma;
 		dma_free_coherent(dev, qdma->size, qdma->va, qdma->dma);
+		kfree(qm->poll_data[i].qp_finish_id);
 	}
 
+	kfree(qm->poll_data);
 	kfree(qm->qp_array);
 }
 
@@ -3561,12 +3582,18 @@ static int hisi_qp_memory_init(struct hisi_qm *qm, size_t dma_size, int id)
 	struct device *dev = &qm->pdev->dev;
 	size_t off = qm->sqe_size * QM_Q_DEPTH;
 	struct hisi_qp *qp;
+	int ret = -ENOMEM;
+
+	qm->poll_data[id].qp_finish_id = kcalloc(qm->qp_num, sizeof(u16),
+						 GFP_KERNEL);
+	if (!qm->poll_data[id].qp_finish_id)
+		return -ENOMEM;
 
 	qp = &qm->qp_array[id];
 	qp->qdma.va = dma_alloc_coherent(dev, dma_size, &qp->qdma.dma,
 					 GFP_KERNEL);
 	if (!qp->qdma.va)
-		return -ENOMEM;
+		goto err_free_qp_finish_id;
 
 	qp->sqe = qp->qdma.va;
 	qp->sqe_dma = qp->qdma.dma;
@@ -3577,6 +3604,10 @@ static int hisi_qp_memory_init(struct hisi_qm *qm, size_t dma_size, int id)
 	qp->qp_id = id;
 
 	return 0;
+
+err_free_qp_finish_id:
+	kfree(qm->poll_data[id].qp_finish_id);
+	return ret;
 }
 
 static void hisi_qm_pre_init(struct hisi_qm *qm)
@@ -6024,7 +6055,11 @@ err_disable_pcidev:
 
 static int hisi_qm_init_work(struct hisi_qm *qm)
 {
-	INIT_WORK(&qm->work, qm_work_process);
+	int i;
+
+	for (i = 0; i < qm->qp_num; i++)
+		INIT_WORK(&qm->poll_data[i].work, qm_work_process);
+
 	if (qm->fun_type == QM_HW_PF)
 		INIT_WORK(&qm->rst_work, hisi_qm_controller_reset);
 
@@ -6052,11 +6087,18 @@ static int hisi_qp_alloc_memory(struct hisi_qm *qm)
 	if (!qm->qp_array)
 		return -ENOMEM;
 
+	qm->poll_data = kcalloc(qm->qp_num, sizeof(struct hisi_qm_poll_data), GFP_KERNEL);
+	if (!qm->poll_data) {
+		kfree(qm->qp_array);
+		return -ENOMEM;
+	}
+
 	/* one more page for device or qp statuses */
 	qp_dma_size = qm->sqe_size * QM_Q_DEPTH +
 		      sizeof(struct qm_cqe) * QM_Q_DEPTH;
 	qp_dma_size = PAGE_ALIGN(qp_dma_size) + PAGE_SIZE;
 	for (i = 0; i < qm->qp_num; i++) {
+		qm->poll_data[i].qm = qm;
 		ret = hisi_qp_memory_init(qm, qp_dma_size, i);
 		if (ret)
 			goto err_init_qp_mem;
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index 6cabafffd0dd..116e8bd68c99 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -265,6 +265,12 @@ struct hisi_qm_list {
 	void (*unregister_from_crypto)(struct hisi_qm *qm);
 };
 
+struct hisi_qm_poll_data {
+	struct hisi_qm *qm;
+	struct work_struct work;
+	u16 *qp_finish_id;
+};
+
 struct hisi_qm {
 	enum qm_hw_ver ver;
 	enum qm_fun_type fun_type;
@@ -302,6 +308,7 @@ struct hisi_qm {
 	struct rw_semaphore qps_lock;
 	struct idr qp_idr;
 	struct hisi_qp *qp_array;
+	struct hisi_qm_poll_data *poll_data;
 
 	struct mutex mailbox_lock;
 
@@ -312,7 +319,6 @@ struct hisi_qm {
 	u32 error_mask;
 
 	struct workqueue_struct *wq;
-	struct work_struct work;
 	struct work_struct rst_work;
 	struct work_struct cmd_process;
 
-- 
cgit 


From fa9c562f9735d24c3253747eb21f3f0c0f6de48e Mon Sep 17 00:00:00 2001
From: Ong Boon Leong <boon.leong.ong@intel.com>
Date: Wed, 15 Jun 2022 16:39:04 +0800
Subject: net: make xpcs_do_config to accept advertising for pcs-xpcs and
 sja1105

xpcs_config() has 'advertising' input that is required for C37 1000BASE-X
AN in later patch series. So, we prepare xpcs_do_config() for it.

For sja1105, xpcs_do_config() is used for xpcs configuration without
depending on advertising input, so set to NULL.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Ong Boon Leong <boon.leong.ong@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/sja1105/sja1105_main.c | 2 +-
 drivers/net/pcs/pcs-xpcs.c             | 6 +++---
 include/linux/pcs/pcs-xpcs.h           | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index 72b6fc1932b5..b253e27bcfb4 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -2330,7 +2330,7 @@ int sja1105_static_config_reload(struct sja1105_private *priv,
 		else
 			mode = MLO_AN_PHY;
 
-		rc = xpcs_do_config(xpcs, priv->phy_mode[i], mode);
+		rc = xpcs_do_config(xpcs, priv->phy_mode[i], mode, NULL);
 		if (rc < 0)
 			goto out;
 
diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index 4cfd05c15aee..48d81c40aab7 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -795,7 +795,7 @@ static int xpcs_config_2500basex(struct dw_xpcs *xpcs)
 }
 
 int xpcs_do_config(struct dw_xpcs *xpcs, phy_interface_t interface,
-		   unsigned int mode)
+		   unsigned int mode, const unsigned long *advertising)
 {
 	const struct xpcs_compat *compat;
 	int ret;
@@ -843,7 +843,7 @@ static int xpcs_config(struct phylink_pcs *pcs, unsigned int mode,
 {
 	struct dw_xpcs *xpcs = phylink_pcs_to_xpcs(pcs);
 
-	return xpcs_do_config(xpcs, interface, mode);
+	return xpcs_do_config(xpcs, interface, mode, advertising);
 }
 
 static int xpcs_get_state_c73(struct dw_xpcs *xpcs,
@@ -864,7 +864,7 @@ static int xpcs_get_state_c73(struct dw_xpcs *xpcs,
 
 		state->link = 0;
 
-		return xpcs_do_config(xpcs, state->interface, MLO_AN_INBAND);
+		return xpcs_do_config(xpcs, state->interface, MLO_AN_INBAND, NULL);
 	}
 
 	if (state->an_enabled && xpcs_aneg_done_c73(xpcs, state, compat)) {
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index 266eb26fb029..37eb97cc2283 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -30,7 +30,7 @@ int xpcs_get_an_mode(struct dw_xpcs *xpcs, phy_interface_t interface);
 void xpcs_link_up(struct phylink_pcs *pcs, unsigned int mode,
 		  phy_interface_t interface, int speed, int duplex);
 int xpcs_do_config(struct dw_xpcs *xpcs, phy_interface_t interface,
-		   unsigned int mode);
+		   unsigned int mode, const unsigned long *advertising);
 void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces);
 int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns,
 		    int enable);
-- 
cgit 


From b47aec885bcd672ebca2108a8b7e9ce3e3982775 Mon Sep 17 00:00:00 2001
From: Ong Boon Leong <boon.leong.ong@intel.com>
Date: Wed, 15 Jun 2022 16:39:06 +0800
Subject: net: pcs: xpcs: add CL37 1000BASE-X AN support

For CL37 1000BASE-X AN, DW xPCS does not support C22 method but offers
C45 vendor-specific MII MMD for programming.

We also add the ability to disable Autoneg (through ethtool for certain
network switch that supports 1000BASE-X (1000Mbps and Full-Duplex) but
not Autoneg capability.

v4: Fixes to comment from Russell King. Thanks!
    https://patchwork.kernel.org/comment/24894239/
    Make xpcs_modify_changed() as private, change to use
    mdiodev_modify_changed() for cleaner code.

v3: Fixes to issues spotted by Russell King. Thanks!
    https://patchwork.kernel.org/comment/24890210/
    Use phylink_mii_c22_pcs_decode_state(), remove unnecessary
    interrupt clearing and skip speed & duplex setting if AN
    is enabled.

v2: Fixes to issues spotted by Russell King in v1. Thanks!
    https://patchwork.kernel.org/comment/24826650/
    Use phylink_mii_c22_pcs_encode_advertisement() and implement
    C45 MII ADV handling since IP only support C45 access.

Tested-by: Emilio Riva <emilio.riva@ericsson.com>
Signed-off-by: Ong Boon Leong <boon.leong.ong@intel.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/pcs/pcs-xpcs.c   | 170 +++++++++++++++++++++++++++++++++++++++++++
 drivers/net/pcs/pcs-xpcs.h   |   1 -
 include/linux/pcs/pcs-xpcs.h |   1 +
 3 files changed, 171 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index 48d81c40aab7..a5d520e34ea3 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -77,6 +77,14 @@ static const int xpcs_sgmii_features[] = {
 	__ETHTOOL_LINK_MODE_MASK_NBITS,
 };
 
+static const int xpcs_1000basex_features[] = {
+	ETHTOOL_LINK_MODE_Pause_BIT,
+	ETHTOOL_LINK_MODE_Asym_Pause_BIT,
+	ETHTOOL_LINK_MODE_Autoneg_BIT,
+	ETHTOOL_LINK_MODE_1000baseX_Full_BIT,
+	__ETHTOOL_LINK_MODE_MASK_NBITS,
+};
+
 static const int xpcs_2500basex_features[] = {
 	ETHTOOL_LINK_MODE_Pause_BIT,
 	ETHTOOL_LINK_MODE_Asym_Pause_BIT,
@@ -102,6 +110,10 @@ static const phy_interface_t xpcs_sgmii_interfaces[] = {
 	PHY_INTERFACE_MODE_SGMII,
 };
 
+static const phy_interface_t xpcs_1000basex_interfaces[] = {
+	PHY_INTERFACE_MODE_1000BASEX,
+};
+
 static const phy_interface_t xpcs_2500basex_interfaces[] = {
 	PHY_INTERFACE_MODE_2500BASEX,
 	PHY_INTERFACE_MODE_MAX,
@@ -112,6 +124,7 @@ enum {
 	DW_XPCS_10GKR,
 	DW_XPCS_XLGMII,
 	DW_XPCS_SGMII,
+	DW_XPCS_1000BASEX,
 	DW_XPCS_2500BASEX,
 	DW_XPCS_INTERFACE_MAX,
 };
@@ -189,6 +202,14 @@ int xpcs_write(struct dw_xpcs *xpcs, int dev, u32 reg, u16 val)
 	return mdiobus_c45_write(bus, addr, dev, reg, val);
 }
 
+static int xpcs_modify_changed(struct dw_xpcs *xpcs, int dev, u32 reg,
+			       u16 mask, u16 set)
+{
+	u32 reg_addr = mdiobus_c45_addr(dev, reg);
+
+	return mdiodev_modify_changed(xpcs->mdiodev, reg_addr, mask, set);
+}
+
 static int xpcs_read_vendor(struct dw_xpcs *xpcs, int dev, u32 reg)
 {
 	return xpcs_read(xpcs, dev, DW_VENDOR | reg);
@@ -237,6 +258,7 @@ static int xpcs_soft_reset(struct dw_xpcs *xpcs,
 		break;
 	case DW_AN_C37_SGMII:
 	case DW_2500BASEX:
+	case DW_AN_C37_1000BASEX:
 		dev = MDIO_MMD_VEND2;
 		break;
 	default:
@@ -772,6 +794,68 @@ static int xpcs_config_aneg_c37_sgmii(struct dw_xpcs *xpcs, unsigned int mode)
 	return ret;
 }
 
+static int xpcs_config_aneg_c37_1000basex(struct dw_xpcs *xpcs, unsigned int mode,
+					  const unsigned long *advertising)
+{
+	phy_interface_t interface = PHY_INTERFACE_MODE_1000BASEX;
+	int ret, mdio_ctrl, adv;
+	bool changed = 0;
+
+	/* According to Chap 7.12, to set 1000BASE-X C37 AN, AN must
+	 * be disabled first:-
+	 * 1) VR_MII_MMD_CTRL Bit(12)[AN_ENABLE] = 0b
+	 * 2) VR_MII_AN_CTRL Bit(2:1)[PCS_MODE] = 00b (1000BASE-X C37)
+	 */
+	mdio_ctrl = xpcs_read(xpcs, MDIO_MMD_VEND2, DW_VR_MII_MMD_CTRL);
+	if (mdio_ctrl < 0)
+		return mdio_ctrl;
+
+	if (mdio_ctrl & AN_CL37_EN) {
+		ret = xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_MMD_CTRL,
+				 mdio_ctrl & ~AN_CL37_EN);
+		if (ret < 0)
+			return ret;
+	}
+
+	ret = xpcs_read(xpcs, MDIO_MMD_VEND2, DW_VR_MII_AN_CTRL);
+	if (ret < 0)
+		return ret;
+
+	ret &= ~DW_VR_MII_PCS_MODE_MASK;
+	ret = xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_AN_CTRL, ret);
+	if (ret < 0)
+		return ret;
+
+	/* Check for advertising changes and update the C45 MII ADV
+	 * register accordingly.
+	 */
+	adv = phylink_mii_c22_pcs_encode_advertisement(interface,
+						       advertising);
+	if (adv >= 0) {
+		ret = xpcs_modify_changed(xpcs, MDIO_MMD_VEND2,
+					  MII_ADVERTISE, 0xffff, adv);
+		if (ret < 0)
+			return ret;
+
+		changed = ret;
+	}
+
+	/* Clear CL37 AN complete status */
+	ret = xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_AN_INTR_STS, 0);
+	if (ret < 0)
+		return ret;
+
+	if (phylink_autoneg_inband(mode) &&
+	    linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, advertising)) {
+		ret = xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_MMD_CTRL,
+				 mdio_ctrl | AN_CL37_EN);
+		if (ret < 0)
+			return ret;
+	}
+
+	return changed;
+}
+
 static int xpcs_config_2500basex(struct dw_xpcs *xpcs)
 {
 	int ret;
@@ -817,6 +901,12 @@ int xpcs_do_config(struct dw_xpcs *xpcs, phy_interface_t interface,
 		if (ret)
 			return ret;
 		break;
+	case DW_AN_C37_1000BASEX:
+		ret = xpcs_config_aneg_c37_1000basex(xpcs, mode,
+						     advertising);
+		if (ret)
+			return ret;
+		break;
 	case DW_2500BASEX:
 		ret = xpcs_config_2500basex(xpcs);
 		if (ret)
@@ -921,6 +1011,29 @@ static int xpcs_get_state_c37_sgmii(struct dw_xpcs *xpcs,
 	return 0;
 }
 
+static int xpcs_get_state_c37_1000basex(struct dw_xpcs *xpcs,
+					struct phylink_link_state *state)
+{
+	int lpa, bmsr;
+
+	if (state->an_enabled) {
+		/* Reset link state */
+		state->link = false;
+
+		lpa = xpcs_read(xpcs, MDIO_MMD_VEND2, MII_LPA);
+		if (lpa < 0 || lpa & LPA_RFAULT)
+			return lpa;
+
+		bmsr = xpcs_read(xpcs, MDIO_MMD_VEND2, MII_BMSR);
+		if (bmsr < 0)
+			return bmsr;
+
+		phylink_mii_c22_pcs_decode_state(state, bmsr, lpa);
+	}
+
+	return 0;
+}
+
 static void xpcs_get_state(struct phylink_pcs *pcs,
 			   struct phylink_link_state *state)
 {
@@ -948,6 +1061,13 @@ static void xpcs_get_state(struct phylink_pcs *pcs,
 			       ERR_PTR(ret));
 		}
 		break;
+	case DW_AN_C37_1000BASEX:
+		ret = xpcs_get_state_c37_1000basex(xpcs, state);
+		if (ret) {
+			pr_err("xpcs_get_state_c37_1000basex returned %pe\n",
+			       ERR_PTR(ret));
+		}
+		break;
 	default:
 		return;
 	}
@@ -983,6 +1103,35 @@ static void xpcs_link_up_sgmii(struct dw_xpcs *xpcs, unsigned int mode,
 		pr_err("%s: xpcs_write returned %pe\n", __func__, ERR_PTR(ret));
 }
 
+static void xpcs_link_up_1000basex(struct dw_xpcs *xpcs, unsigned int mode,
+				   int speed, int duplex)
+{
+	int val, ret;
+
+	if (phylink_autoneg_inband(mode))
+		return;
+
+	switch (speed) {
+	case SPEED_1000:
+		val = BMCR_SPEED1000;
+		break;
+	case SPEED_100:
+	case SPEED_10:
+	default:
+		pr_err("%s: speed = %d\n", __func__, speed);
+		return;
+	}
+
+	if (duplex == DUPLEX_FULL)
+		val |= BMCR_FULLDPLX;
+	else
+		pr_err("%s: half duplex not supported\n", __func__);
+
+	ret = xpcs_write(xpcs, MDIO_MMD_VEND2, MDIO_CTRL1, val);
+	if (ret)
+		pr_err("%s: xpcs_write returned %pe\n", __func__, ERR_PTR(ret));
+}
+
 void xpcs_link_up(struct phylink_pcs *pcs, unsigned int mode,
 		  phy_interface_t interface, int speed, int duplex)
 {
@@ -992,9 +1141,23 @@ void xpcs_link_up(struct phylink_pcs *pcs, unsigned int mode,
 		return xpcs_config_usxgmii(xpcs, speed);
 	if (interface == PHY_INTERFACE_MODE_SGMII)
 		return xpcs_link_up_sgmii(xpcs, mode, speed, duplex);
+	if (interface == PHY_INTERFACE_MODE_1000BASEX)
+		return xpcs_link_up_1000basex(xpcs, mode, speed, duplex);
 }
 EXPORT_SYMBOL_GPL(xpcs_link_up);
 
+static void xpcs_an_restart(struct phylink_pcs *pcs)
+{
+	struct dw_xpcs *xpcs = phylink_pcs_to_xpcs(pcs);
+	int ret;
+
+	ret = xpcs_read(xpcs, MDIO_MMD_VEND2, MDIO_CTRL1);
+	if (ret >= 0) {
+		ret |= BMCR_ANRESTART;
+		xpcs_write(xpcs, MDIO_MMD_VEND2, MDIO_CTRL1, ret);
+	}
+}
+
 static u32 xpcs_get_id(struct dw_xpcs *xpcs)
 {
 	int ret;
@@ -1060,6 +1223,12 @@ static const struct xpcs_compat synopsys_xpcs_compat[DW_XPCS_INTERFACE_MAX] = {
 		.num_interfaces = ARRAY_SIZE(xpcs_sgmii_interfaces),
 		.an_mode = DW_AN_C37_SGMII,
 	},
+	[DW_XPCS_1000BASEX] = {
+		.supported = xpcs_1000basex_features,
+		.interface = xpcs_1000basex_interfaces,
+		.num_interfaces = ARRAY_SIZE(xpcs_1000basex_interfaces),
+		.an_mode = DW_AN_C37_1000BASEX,
+	},
 	[DW_XPCS_2500BASEX] = {
 		.supported = xpcs_2500basex_features,
 		.interface = xpcs_2500basex_interfaces,
@@ -1115,6 +1284,7 @@ static const struct phylink_pcs_ops xpcs_phylink_ops = {
 	.pcs_validate = xpcs_validate,
 	.pcs_config = xpcs_config,
 	.pcs_get_state = xpcs_get_state,
+	.pcs_an_restart = xpcs_an_restart,
 	.pcs_link_up = xpcs_link_up,
 };
 
diff --git a/drivers/net/pcs/pcs-xpcs.h b/drivers/net/pcs/pcs-xpcs.h
index 35651d32a224..770df50323a0 100644
--- a/drivers/net/pcs/pcs-xpcs.h
+++ b/drivers/net/pcs/pcs-xpcs.h
@@ -109,7 +109,6 @@
 
 int xpcs_read(struct dw_xpcs *xpcs, int dev, u32 reg);
 int xpcs_write(struct dw_xpcs *xpcs, int dev, u32 reg, u16 val);
-
 int nxp_sja1105_sgmii_pma_config(struct dw_xpcs *xpcs);
 int nxp_sja1110_sgmii_pma_config(struct dw_xpcs *xpcs);
 int nxp_sja1110_2500basex_pma_config(struct dw_xpcs *xpcs);
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index 37eb97cc2283..d2da1e0b4a92 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -17,6 +17,7 @@
 #define DW_AN_C73			1
 #define DW_AN_C37_SGMII			2
 #define DW_2500BASEX			3
+#define DW_AN_C37_1000BASEX		4
 
 struct xpcs_id;
 
-- 
cgit 


From d0804085c5a7feb557bd3219777b51f5598dfdc6 Mon Sep 17 00:00:00 2001
From: Moudy Ho <moudy.ho@mediatek.com>
Date: Fri, 10 Jun 2022 14:34:18 +0800
Subject: soc: mediatek: mutex: add common interface for modules setting

In order to allow multiple modules to operate MUTEX hardware through
a common interfrace, two flexible indexes "mtk_mutex_mod_index" and
"mtk_mutex_sof_index" need to be added to replace original component
ID so that like DDP and MDP can add their own MOD table or SOF
settings independently.

In addition, 2 generic interface "mtk_mutex_write_mod" and
"mtk_mutex_write_sof" have been added, which is expected to replace
the "mtk_mutex_add_comp" and "mtk_mutex_remove_comp" pair originally
dedicated to DDP in the future.

Signed-off-by: Moudy Ho <moudy.ho@mediatek.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Reviewed-by: Rex-BC Chen <rex-bc.chen@mediatek.com>
Reviewed-by: CK Hu <ck.hu@mediatek.com>
Link: https://lore.kernel.org/r/20220610063424.7800-2-moudy.ho@mediatek.com
Signed-off-by: Matthias Brugger <matthias.bgg@gmail.com>
---
 drivers/soc/mediatek/mtk-mutex.c       | 53 ++++++++++++++++++++++++++++++++++
 include/linux/soc/mediatek/mtk-mutex.h | 25 ++++++++++++++++
 2 files changed, 78 insertions(+)

(limited to 'include')

diff --git a/drivers/soc/mediatek/mtk-mutex.c b/drivers/soc/mediatek/mtk-mutex.c
index b8d5c4a62542..56987e2ff906 100644
--- a/drivers/soc/mediatek/mtk-mutex.c
+++ b/drivers/soc/mediatek/mtk-mutex.c
@@ -199,6 +199,7 @@ struct mtk_mutex_data {
 	const unsigned int *mutex_sof;
 	const unsigned int mutex_mod_reg;
 	const unsigned int mutex_sof_reg;
+	const unsigned int *mutex_table_mod;
 	const bool no_clk;
 };
 
@@ -644,6 +645,58 @@ void mtk_mutex_release(struct mtk_mutex *mutex)
 }
 EXPORT_SYMBOL_GPL(mtk_mutex_release);
 
+int mtk_mutex_write_mod(struct mtk_mutex *mutex,
+			enum mtk_mutex_mod_index idx, bool clear)
+{
+	struct mtk_mutex_ctx *mtx = container_of(mutex, struct mtk_mutex_ctx,
+						 mutex[mutex->id]);
+	unsigned int reg;
+	unsigned int offset;
+
+	WARN_ON(&mtx->mutex[mutex->id] != mutex);
+
+	if (idx < MUTEX_MOD_IDX_MDP_RDMA0 ||
+	    idx >= MUTEX_MOD_IDX_MAX) {
+		dev_err(mtx->dev, "Not supported MOD table index : %d", idx);
+		return -EINVAL;
+	}
+
+	offset = DISP_REG_MUTEX_MOD(mtx->data->mutex_mod_reg,
+				    mutex->id);
+	reg = readl_relaxed(mtx->regs + offset);
+
+	if (clear)
+		reg &= ~BIT(mtx->data->mutex_table_mod[idx]);
+	else
+		reg |= BIT(mtx->data->mutex_table_mod[idx]);
+
+	writel_relaxed(reg, mtx->regs + offset);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mtk_mutex_write_mod);
+
+int mtk_mutex_write_sof(struct mtk_mutex *mutex,
+			enum mtk_mutex_sof_index idx)
+{
+	struct mtk_mutex_ctx *mtx = container_of(mutex, struct mtk_mutex_ctx,
+						 mutex[mutex->id]);
+
+	WARN_ON(&mtx->mutex[mutex->id] != mutex);
+
+	if (idx < MUTEX_SOF_IDX_SINGLE_MODE ||
+	    idx >= MUTEX_SOF_IDX_MAX) {
+		dev_err(mtx->dev, "Not supported SOF index : %d", idx);
+		return -EINVAL;
+	}
+
+	writel_relaxed(idx, mtx->regs +
+		       DISP_REG_MUTEX_SOF(mtx->data->mutex_sof_reg, mutex->id));
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mtk_mutex_write_sof);
+
 static int mtk_mutex_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
diff --git a/include/linux/soc/mediatek/mtk-mutex.h b/include/linux/soc/mediatek/mtk-mutex.h
index 6fe4ffbde290..2ddab9d2b85d 100644
--- a/include/linux/soc/mediatek/mtk-mutex.h
+++ b/include/linux/soc/mediatek/mtk-mutex.h
@@ -10,6 +10,26 @@ struct regmap;
 struct device;
 struct mtk_mutex;
 
+enum mtk_mutex_mod_index {
+	/* MDP table index */
+	MUTEX_MOD_IDX_MDP_RDMA0,
+	MUTEX_MOD_IDX_MDP_RSZ0,
+	MUTEX_MOD_IDX_MDP_RSZ1,
+	MUTEX_MOD_IDX_MDP_TDSHP0,
+	MUTEX_MOD_IDX_MDP_WROT0,
+	MUTEX_MOD_IDX_MDP_WDMA,
+	MUTEX_MOD_IDX_MDP_AAL0,
+	MUTEX_MOD_IDX_MDP_CCORR0,
+
+	MUTEX_MOD_IDX_MAX		/* ALWAYS keep at the end */
+};
+
+enum mtk_mutex_sof_index {
+	MUTEX_SOF_IDX_SINGLE_MODE,
+
+	MUTEX_SOF_IDX_MAX		/* ALWAYS keep at the end */
+};
+
 struct mtk_mutex *mtk_mutex_get(struct device *dev);
 int mtk_mutex_prepare(struct mtk_mutex *mutex);
 void mtk_mutex_add_comp(struct mtk_mutex *mutex,
@@ -22,5 +42,10 @@ void mtk_mutex_unprepare(struct mtk_mutex *mutex);
 void mtk_mutex_put(struct mtk_mutex *mutex);
 void mtk_mutex_acquire(struct mtk_mutex *mutex);
 void mtk_mutex_release(struct mtk_mutex *mutex);
+int mtk_mutex_write_mod(struct mtk_mutex *mutex,
+			enum mtk_mutex_mod_index idx,
+			bool clear);
+int mtk_mutex_write_sof(struct mtk_mutex *mutex,
+			enum mtk_mutex_sof_index idx);
 
 #endif /* MTK_MUTEX_H */
-- 
cgit 


From e5758850c2ea448dd750a280d128a3590d68b899 Mon Sep 17 00:00:00 2001
From: Moudy Ho <moudy.ho@mediatek.com>
Date: Fri, 10 Jun 2022 14:34:23 +0800
Subject: soc: mediatek: mutex: add functions that operate registers by CMDQ

Due to HW limitations, MDP3 is necessary to enable MUTEX in each frame
for SOF triggering and cooperate with CMDQ control to reduce the amount
of interrupts generated(also, reduce frame latency).

In response to the above situation, a new interface
"mtk_mutex_enable_by_cmdq" has been added to achieve the purpose.

Signed-off-by: Moudy Ho <moudy.ho@mediatek.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Reviewed-by: Rex-BC Chen <rex-bc.chen@mediatek.com>
Reviewed-by: CK Hu <ck.hu@mediatek.com>
Link: https://lore.kernel.org/r/20220610063424.7800-7-moudy.ho@mediatek.com
Signed-off-by: Matthias Brugger <matthias.bgg@gmail.com>
---
 drivers/soc/mediatek/mtk-mutex.c       | 45 +++++++++++++++++++++++++++++++++-
 include/linux/soc/mediatek/mtk-mutex.h |  2 ++
 2 files changed, 46 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/soc/mediatek/mtk-mutex.c b/drivers/soc/mediatek/mtk-mutex.c
index a566f4cf6372..fa8e0ba38803 100644
--- a/drivers/soc/mediatek/mtk-mutex.c
+++ b/drivers/soc/mediatek/mtk-mutex.c
@@ -7,10 +7,12 @@
 #include <linux/iopoll.h>
 #include <linux/module.h>
 #include <linux/of_device.h>
+#include <linux/of_address.h>
 #include <linux/platform_device.h>
 #include <linux/regmap.h>
 #include <linux/soc/mediatek/mtk-mmsys.h>
 #include <linux/soc/mediatek/mtk-mutex.h>
+#include <linux/soc/mediatek/mtk-cmdq.h>
 
 #define MT2701_MUTEX0_MOD0			0x2c
 #define MT2701_MUTEX0_SOF0			0x30
@@ -218,6 +220,8 @@ struct mtk_mutex_ctx {
 	void __iomem			*regs;
 	struct mtk_mutex		mutex[10];
 	const struct mtk_mutex_data	*data;
+	phys_addr_t			addr;
+	struct cmdq_client_reg		cmdq_reg;
 };
 
 static const unsigned int mt2701_mutex_mod[DDP_COMPONENT_ID_MAX] = {
@@ -632,6 +636,30 @@ void mtk_mutex_enable(struct mtk_mutex *mutex)
 }
 EXPORT_SYMBOL_GPL(mtk_mutex_enable);
 
+int mtk_mutex_enable_by_cmdq(struct mtk_mutex *mutex, void *pkt)
+{
+	struct mtk_mutex_ctx *mtx = container_of(mutex, struct mtk_mutex_ctx,
+						 mutex[mutex->id]);
+#if IS_REACHABLE(CONFIG_MTK_CMDQ)
+	struct cmdq_pkt *cmdq_pkt = (struct cmdq_pkt *)pkt;
+
+	WARN_ON(&mtx->mutex[mutex->id] != mutex);
+
+	if (!mtx->cmdq_reg.size) {
+		dev_err(mtx->dev, "mediatek,gce-client-reg hasn't been set");
+		return -EINVAL;
+	}
+
+	cmdq_pkt_write(cmdq_pkt, mtx->cmdq_reg.subsys,
+		       mtx->addr + DISP_REG_MUTEX_EN(mutex->id), 1);
+	return 0;
+#else
+	dev_err(mtx->dev, "Not support for enable MUTEX by CMDQ");
+	return -ENODEV;
+#endif
+}
+EXPORT_SYMBOL_GPL(mtk_mutex_enable_by_cmdq);
+
 void mtk_mutex_disable(struct mtk_mutex *mutex)
 {
 	struct mtk_mutex_ctx *mtx = container_of(mutex, struct mtk_mutex_ctx,
@@ -722,8 +750,11 @@ static int mtk_mutex_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
 	struct mtk_mutex_ctx *mtx;
-	struct resource *regs;
+	struct resource *regs, addr;
 	int i;
+#if IS_REACHABLE(CONFIG_MTK_CMDQ)
+	int ret;
+#endif
 
 	mtx = devm_kzalloc(dev, sizeof(*mtx), GFP_KERNEL);
 	if (!mtx)
@@ -743,6 +774,18 @@ static int mtk_mutex_probe(struct platform_device *pdev)
 		}
 	}
 
+	if (of_address_to_resource(dev->of_node, 0, &addr) < 0) {
+		dev_err(dev, "Failed to get addr\n");
+		return -EINVAL;
+	}
+	mtx->addr = addr.start;
+
+#if IS_REACHABLE(CONFIG_MTK_CMDQ)
+	ret = cmdq_dev_get_client_reg(dev, &mtx->cmdq_reg, 0);
+	if (ret)
+		dev_dbg(dev, "No mediatek,gce-client-reg!\n");
+#endif
+
 	regs = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	mtx->regs = devm_ioremap_resource(dev, regs);
 	if (IS_ERR(mtx->regs)) {
diff --git a/include/linux/soc/mediatek/mtk-mutex.h b/include/linux/soc/mediatek/mtk-mutex.h
index 2ddab9d2b85d..a0f4f51a3b45 100644
--- a/include/linux/soc/mediatek/mtk-mutex.h
+++ b/include/linux/soc/mediatek/mtk-mutex.h
@@ -35,6 +35,8 @@ int mtk_mutex_prepare(struct mtk_mutex *mutex);
 void mtk_mutex_add_comp(struct mtk_mutex *mutex,
 			enum mtk_ddp_comp_id id);
 void mtk_mutex_enable(struct mtk_mutex *mutex);
+int mtk_mutex_enable_by_cmdq(struct mtk_mutex *mutex,
+			     void *pkt);
 void mtk_mutex_disable(struct mtk_mutex *mutex);
 void mtk_mutex_remove_comp(struct mtk_mutex *mutex,
 			   enum mtk_ddp_comp_id id);
-- 
cgit 


From dc368e1c658e4f478a45e8d1d5b0c8392ca87506 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Thu, 16 Jun 2022 15:54:07 -0700
Subject: bpf: Fix non-static bpf_func_proto struct definitions

This patch does two things:

1) Marks the dynptr bpf_func_proto structs that were added in [1]
   as static, as pointed out by the kernel test robot in [2].

2) There are some bpf_func_proto structs marked as extern which can
   instead be statically defined.

  [1] https://lore.kernel.org/bpf/20220523210712.3641569-1-joannelkoong@gmail.com/
  [2] https://lore.kernel.org/bpf/62ab89f2.Pko7sI08RAKdF8R6%25lkp@intel.com/

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220616225407.1878436-1-joannelkoong@gmail.com
---
 include/linux/bpf.h  |  3 ---
 kernel/bpf/helpers.c | 12 ++++++------
 kernel/bpf/syscall.c |  2 +-
 3 files changed, 7 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a94531971a7a..0edd7d2c0064 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2363,12 +2363,9 @@ extern const struct bpf_func_proto bpf_for_each_map_elem_proto;
 extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto;
 extern const struct bpf_func_proto bpf_sk_setsockopt_proto;
 extern const struct bpf_func_proto bpf_sk_getsockopt_proto;
-extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto;
 extern const struct bpf_func_proto bpf_find_vma_proto;
 extern const struct bpf_func_proto bpf_loop_proto;
-extern const struct bpf_func_proto bpf_strncmp_proto;
 extern const struct bpf_func_proto bpf_copy_from_user_task_proto;
-extern const struct bpf_func_proto bpf_kptr_xchg_proto;
 
 const struct bpf_func_proto *tracing_prog_func_proto(
   enum bpf_func_id func_id, const struct bpf_prog *prog);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 225806a02efb..a1c84d256f83 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -584,7 +584,7 @@ BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
 	return strncmp(s1, s2, s1_sz);
 }
 
-const struct bpf_func_proto bpf_strncmp_proto = {
+static const struct bpf_func_proto bpf_strncmp_proto = {
 	.func		= bpf_strncmp,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
@@ -1402,7 +1402,7 @@ BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
  */
 #define BPF_PTR_POISON ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA))
 
-const struct bpf_func_proto bpf_kptr_xchg_proto = {
+static const struct bpf_func_proto bpf_kptr_xchg_proto = {
 	.func         = bpf_kptr_xchg,
 	.gpl_only     = false,
 	.ret_type     = RET_PTR_TO_BTF_ID_OR_NULL,
@@ -1487,7 +1487,7 @@ error:
 	return err;
 }
 
-const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
+static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
 	.func		= bpf_dynptr_from_mem,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
@@ -1513,7 +1513,7 @@ BPF_CALL_4(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src
 	return 0;
 }
 
-const struct bpf_func_proto bpf_dynptr_read_proto = {
+static const struct bpf_func_proto bpf_dynptr_read_proto = {
 	.func		= bpf_dynptr_read,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
@@ -1539,7 +1539,7 @@ BPF_CALL_4(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *,
 	return 0;
 }
 
-const struct bpf_func_proto bpf_dynptr_write_proto = {
+static const struct bpf_func_proto bpf_dynptr_write_proto = {
 	.func		= bpf_dynptr_write,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
@@ -1566,7 +1566,7 @@ BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len
 	return (unsigned long)(ptr->data + ptr->offset + offset);
 }
 
-const struct bpf_func_proto bpf_dynptr_data_proto = {
+static const struct bpf_func_proto bpf_dynptr_data_proto = {
 	.func		= bpf_dynptr_data,
 	.gpl_only	= false,
 	.ret_type	= RET_PTR_TO_DYNPTR_MEM_OR_NULL,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index aeb31137b2ed..7d5af5b99f0d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5131,7 +5131,7 @@ BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flag
 	return *res ? 0 : -ENOENT;
 }
 
-const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
+static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
 	.func		= bpf_kallsyms_lookup_name,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
-- 
cgit 


From 5994f58977e0123d5cb77617b3cc4676325028dd Mon Sep 17 00:00:00 2001
From: Claudiu Beznea <claudiu.beznea@microchip.com>
Date: Fri, 10 Jun 2022 12:24:09 +0300
Subject: dt-bindings: reset: add sama7g5 definitions

Add reset bindings for SAMA7G5. At the moment only USB PHYs are
included.

Signed-off-by: Claudiu Beznea <claudiu.beznea@microchip.com>
Acked-by: Philipp Zabel <p.zabel@pengutronix.de>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 include/dt-bindings/reset/sama7g5-reset.h | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 include/dt-bindings/reset/sama7g5-reset.h

(limited to 'include')

diff --git a/include/dt-bindings/reset/sama7g5-reset.h b/include/dt-bindings/reset/sama7g5-reset.h
new file mode 100644
index 000000000000..2116f41d04e0
--- /dev/null
+++ b/include/dt-bindings/reset/sama7g5-reset.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+
+#ifndef __DT_BINDINGS_RESET_SAMA7G5_H
+#define __DT_BINDINGS_RESET_SAMA7G5_H
+
+#define SAMA7G5_RESET_USB_PHY1		4
+#define SAMA7G5_RESET_USB_PHY2		5
+#define SAMA7G5_RESET_USB_PHY3		6
+
+#endif /* __DT_BINDINGS_RESET_SAMA7G5_H */
-- 
cgit 


From 9a2139c2912ea64e288749c21452930dc752d4fd Mon Sep 17 00:00:00 2001
From: Caleb Connolly <caleb.connolly@linaro.org>
Date: Fri, 29 Apr 2022 23:08:56 +0100
Subject: spmi: add a helper to look up an SPMI device from a device node

The helper function spmi_device_from_of() takes a device node and
returns the SPMI device associated with it.
This is like of_find_device_by_node but for SPMI devices.

Signed-off-by: Caleb Connolly <caleb.connolly@linaro.org>
Acked-by: Stephen Boyd <sboyd@kernel.org>
Link: https://lore.kernel.org/r/20220429220904.137297-2-caleb.connolly@linaro.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/spmi/spmi.c  | 17 +++++++++++++++++
 include/linux/spmi.h |  3 +++
 2 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/drivers/spmi/spmi.c b/drivers/spmi/spmi.c
index b37ead9e2fad..a456ce5141e1 100644
--- a/drivers/spmi/spmi.c
+++ b/drivers/spmi/spmi.c
@@ -386,6 +386,23 @@ static struct bus_type spmi_bus_type = {
 	.uevent		= spmi_drv_uevent,
 };
 
+/**
+ * spmi_device_from_of() - get the associated SPMI device from a device node
+ *
+ * @np:		device node
+ *
+ * Returns the struct spmi_device associated with a device node or NULL.
+ */
+struct spmi_device *spmi_device_from_of(struct device_node *np)
+{
+	struct device *dev = bus_find_device_by_of_node(&spmi_bus_type, np);
+
+	if (dev)
+		return to_spmi_device(dev);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(spmi_device_from_of);
+
 /**
  * spmi_controller_alloc() - Allocate a new SPMI device
  * @ctrl:	associated controller
diff --git a/include/linux/spmi.h b/include/linux/spmi.h
index 729bcbf9f5ad..eac1956a8727 100644
--- a/include/linux/spmi.h
+++ b/include/linux/spmi.h
@@ -164,6 +164,9 @@ static inline void spmi_driver_unregister(struct spmi_driver *sdrv)
 	module_driver(__spmi_driver, spmi_driver_register, \
 			spmi_driver_unregister)
 
+struct device_node;
+
+struct spmi_device *spmi_device_from_of(struct device_node *np);
 int spmi_register_read(struct spmi_device *sdev, u8 addr, u8 *buf);
 int spmi_ext_register_read(struct spmi_device *sdev, u8 addr, u8 *buf,
 			   size_t len);
-- 
cgit 


From e9c11c6e3a0e93903f5a13f8d2f97ae1bba512e1 Mon Sep 17 00:00:00 2001
From: Caleb Connolly <caleb.connolly@linaro.org>
Date: Fri, 29 Apr 2022 23:08:57 +0100
Subject: mfd: qcom-spmi-pmic: expose the PMIC revid information to clients

Some PMIC functions such as the RRADC need to be aware of the PMIC
chip revision information to implement errata or otherwise adjust
behaviour, export the PMIC information to enable this.

This is specifically required to enable the RRADC to adjust
coefficients based on which chip fab the PMIC was produced in,
this can vary per unique device and therefore has to be read at
runtime.

Signed-off-by: Caleb Connolly <caleb.connolly@linaro.org>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Tested-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Acked-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220429220904.137297-3-caleb.connolly@linaro.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/mfd/qcom-spmi-pmic.c      | 265 +++++++++++++++++++++++++-------------
 include/soc/qcom/qcom-spmi-pmic.h |  60 +++++++++
 2 files changed, 235 insertions(+), 90 deletions(-)
 create mode 100644 include/soc/qcom/qcom-spmi-pmic.h

(limited to 'include')

diff --git a/drivers/mfd/qcom-spmi-pmic.c b/drivers/mfd/qcom-spmi-pmic.c
index 1cacc00aa6c9..0f1f60e4daf7 100644
--- a/drivers/mfd/qcom-spmi-pmic.c
+++ b/drivers/mfd/qcom-spmi-pmic.c
@@ -3,11 +3,16 @@
  * Copyright (c) 2014, The Linux Foundation. All rights reserved.
  */
 
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/gfp.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/spmi.h>
+#include <linux/types.h>
 #include <linux/regmap.h>
 #include <linux/of_platform.h>
+#include <soc/qcom/qcom-spmi-pmic.h>
 
 #define PMIC_REV2		0x101
 #define PMIC_REV3		0x102
@@ -17,106 +22,140 @@
 
 #define PMIC_TYPE_VALUE		0x51
 
-#define COMMON_SUBTYPE		0x00
-#define PM8941_SUBTYPE		0x01
-#define PM8841_SUBTYPE		0x02
-#define PM8019_SUBTYPE		0x03
-#define PM8226_SUBTYPE		0x04
-#define PM8110_SUBTYPE		0x05
-#define PMA8084_SUBTYPE		0x06
-#define PMI8962_SUBTYPE		0x07
-#define PMD9635_SUBTYPE		0x08
-#define PM8994_SUBTYPE		0x09
-#define PMI8994_SUBTYPE		0x0a
-#define PM8916_SUBTYPE		0x0b
-#define PM8004_SUBTYPE		0x0c
-#define PM8909_SUBTYPE		0x0d
-#define PM8028_SUBTYPE		0x0e
-#define PM8901_SUBTYPE		0x0f
-#define PM8950_SUBTYPE		0x10
-#define PMI8950_SUBTYPE		0x11
-#define PM8998_SUBTYPE		0x14
-#define PMI8998_SUBTYPE		0x15
-#define PM8005_SUBTYPE		0x18
-#define PM660L_SUBTYPE		0x1A
-#define PM660_SUBTYPE		0x1B
-#define PM8150_SUBTYPE		0x1E
-#define PM8150L_SUBTYPE		0x1f
-#define PM8150B_SUBTYPE		0x20
-#define PMK8002_SUBTYPE		0x21
-#define PM8009_SUBTYPE		0x24
-#define PM8150C_SUBTYPE		0x26
-#define SMB2351_SUBTYPE		0x29
+#define PMIC_REV4_V2		0x02
+
+struct qcom_spmi_dev {
+	int num_usids;
+	struct qcom_spmi_pmic pmic;
+};
+
+#define N_USIDS(n)		((void *)n)
 
 static const struct of_device_id pmic_spmi_id_table[] = {
-	{ .compatible = "qcom,pm660",     .data = (void *)PM660_SUBTYPE },
-	{ .compatible = "qcom,pm660l",    .data = (void *)PM660L_SUBTYPE },
-	{ .compatible = "qcom,pm8004",    .data = (void *)PM8004_SUBTYPE },
-	{ .compatible = "qcom,pm8005",    .data = (void *)PM8005_SUBTYPE },
-	{ .compatible = "qcom,pm8019",    .data = (void *)PM8019_SUBTYPE },
-	{ .compatible = "qcom,pm8028",    .data = (void *)PM8028_SUBTYPE },
-	{ .compatible = "qcom,pm8110",    .data = (void *)PM8110_SUBTYPE },
-	{ .compatible = "qcom,pm8150",    .data = (void *)PM8150_SUBTYPE },
-	{ .compatible = "qcom,pm8150b",   .data = (void *)PM8150B_SUBTYPE },
-	{ .compatible = "qcom,pm8150c",   .data = (void *)PM8150C_SUBTYPE },
-	{ .compatible = "qcom,pm8150l",   .data = (void *)PM8150L_SUBTYPE },
-	{ .compatible = "qcom,pm8226",    .data = (void *)PM8226_SUBTYPE },
-	{ .compatible = "qcom,pm8841",    .data = (void *)PM8841_SUBTYPE },
-	{ .compatible = "qcom,pm8901",    .data = (void *)PM8901_SUBTYPE },
-	{ .compatible = "qcom,pm8909",    .data = (void *)PM8909_SUBTYPE },
-	{ .compatible = "qcom,pm8916",    .data = (void *)PM8916_SUBTYPE },
-	{ .compatible = "qcom,pm8941",    .data = (void *)PM8941_SUBTYPE },
-	{ .compatible = "qcom,pm8950",    .data = (void *)PM8950_SUBTYPE },
-	{ .compatible = "qcom,pm8994",    .data = (void *)PM8994_SUBTYPE },
-	{ .compatible = "qcom,pm8998",    .data = (void *)PM8998_SUBTYPE },
-	{ .compatible = "qcom,pma8084",   .data = (void *)PMA8084_SUBTYPE },
-	{ .compatible = "qcom,pmd9635",   .data = (void *)PMD9635_SUBTYPE },
-	{ .compatible = "qcom,pmi8950",   .data = (void *)PMI8950_SUBTYPE },
-	{ .compatible = "qcom,pmi8962",   .data = (void *)PMI8962_SUBTYPE },
-	{ .compatible = "qcom,pmi8994",   .data = (void *)PMI8994_SUBTYPE },
-	{ .compatible = "qcom,pmi8998",   .data = (void *)PMI8998_SUBTYPE },
-	{ .compatible = "qcom,pmk8002",   .data = (void *)PMK8002_SUBTYPE },
-	{ .compatible = "qcom,smb2351",   .data = (void *)SMB2351_SUBTYPE },
-	{ .compatible = "qcom,spmi-pmic", .data = (void *)COMMON_SUBTYPE },
+	{ .compatible = "qcom,pm660", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pm660l", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pm8004", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pm8005", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pm8019", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pm8028", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pm8110", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pm8150", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pm8150b", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pm8150c", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pm8150l", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pm8226", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pm8841", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pm8901", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pm8909", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pm8916", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pm8941", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pm8950", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pm8994", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pm8998", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pma8084", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pmd9635", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pmi8950", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pmi8962", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pmi8994", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pmi8998", .data = N_USIDS(2) },
+	{ .compatible = "qcom,pmk8002", .data = N_USIDS(2) },
+	{ .compatible = "qcom,smb2351", .data = N_USIDS(2) },
+	{ .compatible = "qcom,spmi-pmic", .data = N_USIDS(1) },
 	{ }
 };
 
-static void pmic_spmi_show_revid(struct regmap *map, struct device *dev)
+/*
+ * A PMIC can be represented by multiple SPMI devices, but
+ * only the base PMIC device will contain a reference to
+ * the revision information.
+ *
+ * This function takes a pointer to a pmic device and
+ * returns a pointer to the base PMIC device.
+ *
+ * This only supports PMICs with 1 or 2 USIDs.
+ */
+static struct spmi_device *qcom_pmic_get_base_usid(struct device *dev)
 {
-	unsigned int rev2, minor, major, type, subtype;
-	const char *name = "unknown";
-	int ret, i;
+	struct spmi_device *sdev;
+	struct qcom_spmi_dev *ctx;
+	struct device_node *spmi_bus;
+	struct device_node *other_usid = NULL;
+	int function_parent_usid, ret;
+	u32 pmic_addr;
 
-	ret = regmap_read(map, PMIC_TYPE, &type);
-	if (ret < 0)
-		return;
+	sdev = to_spmi_device(dev);
+	ctx = dev_get_drvdata(&sdev->dev);
+
+	/*
+	 * Quick return if the function device is already in the base
+	 * USID. This will always be hit for PMICs with only 1 USID.
+	 */
+	if (sdev->usid % ctx->num_usids == 0)
+		return sdev;
 
-	if (type != PMIC_TYPE_VALUE)
-		return;
+	function_parent_usid = sdev->usid;
+
+	/*
+	 * Walk through the list of PMICs until we find the sibling USID.
+	 * The goal is to find the first USID which is less than the
+	 * number of USIDs in the PMIC array, e.g. for a PMIC with 2 USIDs
+	 * where the function device is under USID 3, we want to find the
+	 * device for USID 2.
+	 */
+	spmi_bus = of_get_parent(sdev->dev.of_node);
+	do {
+		other_usid = of_get_next_child(spmi_bus, other_usid);
+
+		ret = of_property_read_u32_index(other_usid, "reg", 0, &pmic_addr);
+		if (ret)
+			return ERR_PTR(ret);
+
+		sdev = spmi_device_from_of(other_usid);
+		if (pmic_addr == function_parent_usid - (ctx->num_usids - 1)) {
+			if (!sdev)
+				/*
+				 * If the base USID for this PMIC hasn't probed yet
+				 * but the secondary USID has, then we need to defer
+				 * the function driver so that it will attempt to
+				 * probe again when the base USID is ready.
+				 */
+				return ERR_PTR(-EPROBE_DEFER);
+			return sdev;
+		}
+	} while (other_usid->sibling);
+
+	return ERR_PTR(-ENODATA);
+}
+
+static int pmic_spmi_load_revid(struct regmap *map, struct device *dev,
+				 struct qcom_spmi_pmic *pmic)
+{
+	int ret;
 
-	ret = regmap_read(map, PMIC_SUBTYPE, &subtype);
+	ret = regmap_read(map, PMIC_TYPE, &pmic->type);
 	if (ret < 0)
-		return;
+		return ret;
 
-	for (i = 0; i < ARRAY_SIZE(pmic_spmi_id_table); i++) {
-		if (subtype == (unsigned long)pmic_spmi_id_table[i].data)
-			break;
-	}
+	if (pmic->type != PMIC_TYPE_VALUE)
+		return ret;
+
+	ret = regmap_read(map, PMIC_SUBTYPE, &pmic->subtype);
+	if (ret < 0)
+		return ret;
 
-	if (i != ARRAY_SIZE(pmic_spmi_id_table))
-		name = pmic_spmi_id_table[i].compatible;
+	pmic->name = of_match_device(pmic_spmi_id_table, dev)->compatible;
 
-	ret = regmap_read(map, PMIC_REV2, &rev2);
+	ret = regmap_read(map, PMIC_REV2, &pmic->rev2);
 	if (ret < 0)
-		return;
+		return ret;
 
-	ret = regmap_read(map, PMIC_REV3, &minor);
+	ret = regmap_read(map, PMIC_REV3, &pmic->minor);
 	if (ret < 0)
-		return;
+		return ret;
 
-	ret = regmap_read(map, PMIC_REV4, &major);
+	ret = regmap_read(map, PMIC_REV4, &pmic->major);
 	if (ret < 0)
-		return;
+		return ret;
 
 	/*
 	 * In early versions of PM8941 and PM8226, the major revision number
@@ -124,15 +163,49 @@ static void pmic_spmi_show_revid(struct regmap *map, struct device *dev)
 	 * Increment the major revision number here if the chip is an early
 	 * version of PM8941 or PM8226.
 	 */
-	if ((subtype == PM8941_SUBTYPE || subtype == PM8226_SUBTYPE) &&
-	    major < 0x02)
-		major++;
+	if ((pmic->subtype == PM8941_SUBTYPE || pmic->subtype == PM8226_SUBTYPE) &&
+	    pmic->major < PMIC_REV4_V2)
+		pmic->major++;
+
+	if (pmic->subtype == PM8110_SUBTYPE)
+		pmic->minor = pmic->rev2;
+
+	dev_dbg(dev, "%x: %s v%d.%d\n",
+		pmic->subtype, pmic->name, pmic->major, pmic->minor);
+
+	return 0;
+}
+
+/**
+ * qcom_pmic_get() - Get a pointer to the base PMIC device
+ *
+ * This function takes a struct device for a driver which is a child of a PMIC.
+ * And locates the PMIC revision information for it.
+ *
+ * @dev: the pmic function device
+ * @return: the struct qcom_spmi_pmic* pointer associated with the function device
+ */
+const struct qcom_spmi_pmic *qcom_pmic_get(struct device *dev)
+{
+	struct spmi_device *sdev;
+	struct qcom_spmi_dev *spmi;
+
+	/*
+	 * Make sure the device is actually a child of a PMIC
+	 */
+	if (!of_match_device(pmic_spmi_id_table, dev->parent))
+		return ERR_PTR(-EINVAL);
+
+	sdev = qcom_pmic_get_base_usid(dev->parent);
 
-	if (subtype == PM8110_SUBTYPE)
-		minor = rev2;
+	if (IS_ERR(sdev))
+		return ERR_CAST(sdev);
 
-	dev_dbg(dev, "%x: %s v%d.%d\n", subtype, name, major, minor);
+	spmi = dev_get_drvdata(&sdev->dev);
+
+	return &spmi->pmic;
 }
+EXPORT_SYMBOL(qcom_pmic_get);
 
 static const struct regmap_config spmi_regmap_config = {
 	.reg_bits	= 16,
@@ -144,14 +217,26 @@ static const struct regmap_config spmi_regmap_config = {
 static int pmic_spmi_probe(struct spmi_device *sdev)
 {
 	struct regmap *regmap;
+	struct qcom_spmi_dev *ctx;
+	int ret;
 
 	regmap = devm_regmap_init_spmi_ext(sdev, &spmi_regmap_config);
 	if (IS_ERR(regmap))
 		return PTR_ERR(regmap);
 
+	ctx = devm_kzalloc(&sdev->dev, sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->num_usids = (uintptr_t)of_device_get_match_data(&sdev->dev);
+
 	/* Only the first slave id for a PMIC contains this information */
-	if (sdev->usid % 2 == 0)
-		pmic_spmi_show_revid(regmap, &sdev->dev);
+	if (sdev->usid % ctx->num_usids == 0) {
+		ret = pmic_spmi_load_revid(regmap, &sdev->dev, &ctx->pmic);
+		if (ret < 0)
+			return ret;
+	}
+	spmi_device_set_drvdata(sdev, ctx);
 
 	return devm_of_platform_populate(&sdev->dev);
 }
diff --git a/include/soc/qcom/qcom-spmi-pmic.h b/include/soc/qcom/qcom-spmi-pmic.h
new file mode 100644
index 000000000000..5894da3c7f6a
--- /dev/null
+++ b/include/soc/qcom/qcom-spmi-pmic.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2022 Linaro. All rights reserved.
+ * Author: Caleb Connolly <caleb.connolly@linaro.org>
+ */
+
+#ifndef __QCOM_SPMI_PMIC_H__
+#define __QCOM_SPMI_PMIC_H__
+
+#include <linux/device.h>
+
+#define COMMON_SUBTYPE		0x00
+#define PM8941_SUBTYPE		0x01
+#define PM8841_SUBTYPE		0x02
+#define PM8019_SUBTYPE		0x03
+#define PM8226_SUBTYPE		0x04
+#define PM8110_SUBTYPE		0x05
+#define PMA8084_SUBTYPE		0x06
+#define PMI8962_SUBTYPE		0x07
+#define PMD9635_SUBTYPE		0x08
+#define PM8994_SUBTYPE		0x09
+#define PMI8994_SUBTYPE		0x0a
+#define PM8916_SUBTYPE		0x0b
+#define PM8004_SUBTYPE		0x0c
+#define PM8909_SUBTYPE		0x0d
+#define PM8028_SUBTYPE		0x0e
+#define PM8901_SUBTYPE		0x0f
+#define PM8950_SUBTYPE		0x10
+#define PMI8950_SUBTYPE		0x11
+#define PM8998_SUBTYPE		0x14
+#define PMI8998_SUBTYPE		0x15
+#define PM8005_SUBTYPE		0x18
+#define PM660L_SUBTYPE		0x1A
+#define PM660_SUBTYPE		0x1B
+#define PM8150_SUBTYPE		0x1E
+#define PM8150L_SUBTYPE		0x1f
+#define PM8150B_SUBTYPE		0x20
+#define PMK8002_SUBTYPE		0x21
+#define PM8009_SUBTYPE		0x24
+#define PM8150C_SUBTYPE		0x26
+#define SMB2351_SUBTYPE		0x29
+
+#define PMI8998_FAB_ID_SMIC	0x11
+#define PMI8998_FAB_ID_GF	0x30
+
+#define PM660_FAB_ID_GF		0x0
+#define PM660_FAB_ID_TSMC	0x2
+#define PM660_FAB_ID_MX		0x3
+
+struct qcom_spmi_pmic {
+	unsigned int type;
+	unsigned int subtype;
+	unsigned int major;
+	unsigned int minor;
+	unsigned int rev2;
+	const char *name;
+};
+
+const struct qcom_spmi_pmic *qcom_pmic_get(struct device *dev);
+
+#endif /* __QCOM_SPMI_PMIC_H__ */
-- 
cgit 


From d23c3c085a95fddae5143823b7d5a81419e6f497 Mon Sep 17 00:00:00 2001
From: Caleb Connolly <caleb.connolly@linaro.org>
Date: Fri, 29 Apr 2022 23:08:58 +0100
Subject: mfd: qcom-spmi-pmic: read fab id on supported PMICs

The PMI8998 and PM660 expose the fab_id, this is needed by drivers like
the RRADC to calibrate ADC values.

Signed-off-by: Caleb Connolly <caleb.connolly@linaro.org>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Tested-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Acked-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220429220904.137297-4-caleb.connolly@linaro.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/mfd/qcom-spmi-pmic.c      | 7 +++++++
 include/soc/qcom/qcom-spmi-pmic.h | 1 +
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/drivers/mfd/qcom-spmi-pmic.c b/drivers/mfd/qcom-spmi-pmic.c
index 0f1f60e4daf7..00003a868d28 100644
--- a/drivers/mfd/qcom-spmi-pmic.c
+++ b/drivers/mfd/qcom-spmi-pmic.c
@@ -19,6 +19,7 @@
 #define PMIC_REV4		0x103
 #define PMIC_TYPE		0x104
 #define PMIC_SUBTYPE		0x105
+#define PMIC_FAB_ID		0x1f2
 
 #define PMIC_TYPE_VALUE		0x51
 
@@ -157,6 +158,12 @@ static int pmic_spmi_load_revid(struct regmap *map, struct device *dev,
 	if (ret < 0)
 		return ret;
 
+	if (pmic->subtype == PMI8998_SUBTYPE || pmic->subtype == PM660_SUBTYPE) {
+		ret = regmap_read(map, PMIC_FAB_ID, &pmic->fab_id);
+		if (ret < 0)
+			return ret;
+	}
+
 	/*
 	 * In early versions of PM8941 and PM8226, the major revision number
 	 * started incrementing from 0 (eg 0 = v1.0, 1 = v2.0).
diff --git a/include/soc/qcom/qcom-spmi-pmic.h b/include/soc/qcom/qcom-spmi-pmic.h
index 5894da3c7f6a..72398ff44719 100644
--- a/include/soc/qcom/qcom-spmi-pmic.h
+++ b/include/soc/qcom/qcom-spmi-pmic.h
@@ -52,6 +52,7 @@ struct qcom_spmi_pmic {
 	unsigned int major;
 	unsigned int minor;
 	unsigned int rev2;
+	unsigned int fab_id;
 	const char *name;
 };
 
-- 
cgit 


From 4a08069461ac9822855116d24fbf11d5fb223cd1 Mon Sep 17 00:00:00 2001
From: Dmitry Rokosov <DDRokosov@sberdevices.ru>
Date: Tue, 7 Jun 2022 18:39:18 +0000
Subject: iio: trigger: warn about non-registered iio trigger getting attempt

As a part of patch series about wrong trigger register() and get()
calls order in the some IIO drivers trigger initialization path:

https://lore.kernel.org/all/20220524181150.9240-1-ddrokosov@sberdevices.ru/

runtime WARN_ONCE() is added to alarm IIO driver authors who make such
a mistake.

When an IIO driver allocates a new IIO trigger, it should register it
before calling the get() operation. In other words, each IIO driver
must abide by IIO trigger alloc()/register()/get() calls order.

Signed-off-by: Dmitry Rokosov <ddrokosov@sberdevices.ru>
Link: https://lore.kernel.org/r/20220607183907.20017-1-ddrokosov@sberdevices.ru
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/industrialio-trigger.c | 2 ++
 include/linux/iio/trigger.h        | 5 +++++
 2 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/drivers/iio/industrialio-trigger.c b/drivers/iio/industrialio-trigger.c
index d89bf90650d5..26d610d4cbb8 100644
--- a/drivers/iio/industrialio-trigger.c
+++ b/drivers/iio/industrialio-trigger.c
@@ -576,6 +576,8 @@ struct iio_trigger *viio_trigger_alloc(struct device *parent,
 	if (trig->name == NULL)
 		goto free_descs;
 
+	INIT_LIST_HEAD(&trig->list);
+
 	trig->subirq_chip.name = trig->name;
 	trig->subirq_chip.irq_mask = &iio_trig_subirqmask;
 	trig->subirq_chip.irq_unmask = &iio_trig_subirqunmask;
diff --git a/include/linux/iio/trigger.h b/include/linux/iio/trigger.h
index 4c69b144677b..03b1d6863436 100644
--- a/include/linux/iio/trigger.h
+++ b/include/linux/iio/trigger.h
@@ -93,6 +93,11 @@ static inline void iio_trigger_put(struct iio_trigger *trig)
 static inline struct iio_trigger *iio_trigger_get(struct iio_trigger *trig)
 {
 	get_device(&trig->dev);
+
+	WARN_ONCE(list_empty(&trig->list),
+		  "Getting non-registered iio trigger %s is prohibited\n",
+		  trig->name);
+
 	__module_get(trig->owner);
 
 	return trig;
-- 
cgit 


From 8670dc33f48bab4d7bb4b8d0232f17f4dae419ec Mon Sep 17 00:00:00 2001
From: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
Date: Fri, 17 Jun 2022 11:24:23 +0800
Subject: net: dsa: felix: update base time of time-aware shaper when adjusting
 PTP time

When adjusting the PTP clock, the base time of the TAS configuration
will become unreliable. We need reset the TAS configuration by using a
new base time.

For example, if the driver gets a base time 0 of Qbv configuration from
user, and current time is 20000. The driver will set the TAS base time
to be 20000. After the PTP clock adjustment, the current time becomes
10000. If the TAS base time is still 20000, it will be a future time,
and TAS entry list will stop running. Another example, if the current
time becomes to be 10000000 after PTP clock adjust, a large time offset
can cause the hardware to hang.

This patch introduces a tas_clock_adjust() function to reset the TAS
module by using a new base time after the PTP clock adjustment. This can
avoid issues above.

Due to PTP clock adjustment can occur at any time, it may conflict with
the TAS configuration. We introduce a new TAS lock to serialize the
access to the TAS registers.

Signed-off-by: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/ocelot/felix_vsc9959.c | 83 +++++++++++++++++++++++++++++++---
 drivers/net/ethernet/mscc/ocelot.c     |  1 +
 drivers/net/ethernet/mscc/ocelot_ptp.c |  8 ++++
 include/soc/mscc/ocelot.h              |  7 +++
 4 files changed, 93 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c
index 570d0204b7be..dd9085ae0922 100644
--- a/drivers/net/dsa/ocelot/felix_vsc9959.c
+++ b/drivers/net/dsa/ocelot/felix_vsc9959.c
@@ -1196,10 +1196,13 @@ static void vsc9959_tas_gcl_set(struct ocelot *ocelot, const u32 gcl_ix,
 static int vsc9959_qos_port_tas_set(struct ocelot *ocelot, int port,
 				    struct tc_taprio_qopt_offload *taprio)
 {
+	struct ocelot_port *ocelot_port = ocelot->ports[port];
 	struct timespec64 base_ts;
 	int ret, i;
 	u32 val;
 
+	mutex_lock(&ocelot->tas_lock);
+
 	if (!taprio->enable) {
 		ocelot_rmw_rix(ocelot,
 			       QSYS_TAG_CONFIG_INIT_GATE_STATE(0xFF),
@@ -1207,15 +1210,20 @@ static int vsc9959_qos_port_tas_set(struct ocelot *ocelot, int port,
 			       QSYS_TAG_CONFIG_INIT_GATE_STATE_M,
 			       QSYS_TAG_CONFIG, port);
 
+		mutex_unlock(&ocelot->tas_lock);
 		return 0;
 	}
 
 	if (taprio->cycle_time > NSEC_PER_SEC ||
-	    taprio->cycle_time_extension >= NSEC_PER_SEC)
-		return -EINVAL;
+	    taprio->cycle_time_extension >= NSEC_PER_SEC) {
+		ret = -EINVAL;
+		goto err;
+	}
 
-	if (taprio->num_entries > VSC9959_TAS_GCL_ENTRY_MAX)
-		return -ERANGE;
+	if (taprio->num_entries > VSC9959_TAS_GCL_ENTRY_MAX) {
+		ret = -ERANGE;
+		goto err;
+	}
 
 	/* Enable guard band. The switch will schedule frames without taking
 	 * their length into account. Thus we'll always need to enable the
@@ -1236,8 +1244,10 @@ static int vsc9959_qos_port_tas_set(struct ocelot *ocelot, int port,
 	 * config is pending, need reset the TAS module
 	 */
 	val = ocelot_read(ocelot, QSYS_PARAM_STATUS_REG_8);
-	if (val & QSYS_PARAM_STATUS_REG_8_CONFIG_PENDING)
-		return  -EBUSY;
+	if (val & QSYS_PARAM_STATUS_REG_8_CONFIG_PENDING) {
+		ret = -EBUSY;
+		goto err;
+	}
 
 	ocelot_rmw_rix(ocelot,
 		       QSYS_TAG_CONFIG_ENABLE |
@@ -1248,6 +1258,8 @@ static int vsc9959_qos_port_tas_set(struct ocelot *ocelot, int port,
 		       QSYS_TAG_CONFIG_SCH_TRAFFIC_QUEUES_M,
 		       QSYS_TAG_CONFIG, port);
 
+	ocelot_port->base_time = taprio->base_time;
+
 	vsc9959_new_base_time(ocelot, taprio->base_time,
 			      taprio->cycle_time, &base_ts);
 	ocelot_write(ocelot, base_ts.tv_nsec, QSYS_PARAM_CFG_REG_1);
@@ -1271,9 +1283,67 @@ static int vsc9959_qos_port_tas_set(struct ocelot *ocelot, int port,
 				 !(val & QSYS_TAS_PARAM_CFG_CTRL_CONFIG_CHANGE),
 				 10, 100000);
 
+err:
+	mutex_unlock(&ocelot->tas_lock);
+
 	return ret;
 }
 
+static void vsc9959_tas_clock_adjust(struct ocelot *ocelot)
+{
+	struct ocelot_port *ocelot_port;
+	struct timespec64 base_ts;
+	u64 cycletime;
+	int port;
+	u32 val;
+
+	mutex_lock(&ocelot->tas_lock);
+
+	for (port = 0; port < ocelot->num_phys_ports; port++) {
+		val = ocelot_read_rix(ocelot, QSYS_TAG_CONFIG, port);
+		if (!(val & QSYS_TAG_CONFIG_ENABLE))
+			continue;
+
+		ocelot_rmw(ocelot,
+			   QSYS_TAS_PARAM_CFG_CTRL_PORT_NUM(port),
+			   QSYS_TAS_PARAM_CFG_CTRL_PORT_NUM_M,
+			   QSYS_TAS_PARAM_CFG_CTRL);
+
+		ocelot_rmw_rix(ocelot,
+			       QSYS_TAG_CONFIG_INIT_GATE_STATE(0xFF),
+			       QSYS_TAG_CONFIG_ENABLE |
+			       QSYS_TAG_CONFIG_INIT_GATE_STATE_M,
+			       QSYS_TAG_CONFIG, port);
+
+		cycletime = ocelot_read(ocelot, QSYS_PARAM_CFG_REG_4);
+		ocelot_port = ocelot->ports[port];
+
+		vsc9959_new_base_time(ocelot, ocelot_port->base_time,
+				      cycletime, &base_ts);
+
+		ocelot_write(ocelot, base_ts.tv_nsec, QSYS_PARAM_CFG_REG_1);
+		ocelot_write(ocelot, lower_32_bits(base_ts.tv_sec),
+			     QSYS_PARAM_CFG_REG_2);
+		val = upper_32_bits(base_ts.tv_sec);
+		ocelot_rmw(ocelot,
+			   QSYS_PARAM_CFG_REG_3_BASE_TIME_SEC_MSB(val),
+			   QSYS_PARAM_CFG_REG_3_BASE_TIME_SEC_MSB_M,
+			   QSYS_PARAM_CFG_REG_3);
+
+		ocelot_rmw(ocelot, QSYS_TAS_PARAM_CFG_CTRL_CONFIG_CHANGE,
+			   QSYS_TAS_PARAM_CFG_CTRL_CONFIG_CHANGE,
+			   QSYS_TAS_PARAM_CFG_CTRL);
+
+		ocelot_rmw_rix(ocelot,
+			       QSYS_TAG_CONFIG_INIT_GATE_STATE(0xFF) |
+			       QSYS_TAG_CONFIG_ENABLE,
+			       QSYS_TAG_CONFIG_ENABLE |
+			       QSYS_TAG_CONFIG_INIT_GATE_STATE_M,
+			       QSYS_TAG_CONFIG, port);
+	}
+	mutex_unlock(&ocelot->tas_lock);
+}
+
 static int vsc9959_qos_port_cbs_set(struct dsa_switch *ds, int port,
 				    struct tc_cbs_qopt_offload *cbs_qopt)
 {
@@ -2210,6 +2280,7 @@ static const struct ocelot_ops vsc9959_ops = {
 	.psfp_filter_del	= vsc9959_psfp_filter_del,
 	.psfp_stats_get		= vsc9959_psfp_stats_get,
 	.cut_through_fwd	= vsc9959_cut_through_fwd,
+	.tas_clock_adjust	= vsc9959_tas_clock_adjust,
 };
 
 static const struct felix_info felix_info_vsc9959 = {
diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 8da7e25a47c9..d4649e4ee0e7 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -3367,6 +3367,7 @@ int ocelot_init(struct ocelot *ocelot)
 	mutex_init(&ocelot->ptp_lock);
 	mutex_init(&ocelot->mact_lock);
 	mutex_init(&ocelot->fwd_domain_lock);
+	mutex_init(&ocelot->tas_lock);
 	spin_lock_init(&ocelot->ptp_clock_lock);
 	spin_lock_init(&ocelot->ts_id_lock);
 	snprintf(queue_name, sizeof(queue_name), "%s-stats",
diff --git a/drivers/net/ethernet/mscc/ocelot_ptp.c b/drivers/net/ethernet/mscc/ocelot_ptp.c
index 87ad2137ba06..09c703efe946 100644
--- a/drivers/net/ethernet/mscc/ocelot_ptp.c
+++ b/drivers/net/ethernet/mscc/ocelot_ptp.c
@@ -72,6 +72,10 @@ int ocelot_ptp_settime64(struct ptp_clock_info *ptp,
 	ocelot_write_rix(ocelot, val, PTP_PIN_CFG, TOD_ACC_PIN);
 
 	spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags);
+
+	if (ocelot->ops->tas_clock_adjust)
+		ocelot->ops->tas_clock_adjust(ocelot);
+
 	return 0;
 }
 EXPORT_SYMBOL(ocelot_ptp_settime64);
@@ -105,6 +109,9 @@ int ocelot_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
 		ocelot_write_rix(ocelot, val, PTP_PIN_CFG, TOD_ACC_PIN);
 
 		spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags);
+
+		if (ocelot->ops->tas_clock_adjust)
+			ocelot->ops->tas_clock_adjust(ocelot);
 	} else {
 		/* Fall back using ocelot_ptp_settime64 which is not exact. */
 		struct timespec64 ts;
@@ -117,6 +124,7 @@ int ocelot_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
 
 		ocelot_ptp_settime64(ptp, &ts);
 	}
+
 	return 0;
 }
 EXPORT_SYMBOL(ocelot_ptp_adjtime);
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 5f88385a7748..3737570116c3 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -575,6 +575,7 @@ struct ocelot_ops {
 	int (*psfp_stats_get)(struct ocelot *ocelot, struct flow_cls_offload *f,
 			      struct flow_stats *stats);
 	void (*cut_through_fwd)(struct ocelot *ocelot);
+	void (*tas_clock_adjust)(struct ocelot *ocelot);
 };
 
 struct ocelot_vcap_policer {
@@ -691,6 +692,9 @@ struct ocelot_port {
 	int				bridge_num;
 
 	int				speed;
+
+	/* Store the AdminBaseTime of EST fetched from userspace. */
+	s64				base_time;
 };
 
 struct ocelot {
@@ -757,6 +761,9 @@ struct ocelot {
 	/* Lock for serializing forwarding domain changes */
 	struct mutex			fwd_domain_lock;
 
+	/* Lock for serializing Time-Aware Shaper changes */
+	struct mutex			tas_lock;
+
 	struct workqueue_struct		*owq;
 
 	u8				ptp:1;
-- 
cgit 


From ba44f8182ec299c5d1c8a72fc0fde4ec127b5a6d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 17 Jun 2022 20:47:04 -0700
Subject: raw: use more conventional iterators

In order to prepare the following patch,
I change raw v4 & v6 code to use more conventional
iterators.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/raw.h   |   5 +--
 include/net/rawv6.h |   6 +--
 net/ipv4/raw.c      |  93 +++++++++++++++++-----------------------------
 net/ipv4/raw_diag.c |  33 ++++++++---------
 net/ipv6/raw.c      | 105 +++++++++++++++++++---------------------------------
 5 files changed, 92 insertions(+), 150 deletions(-)

(limited to 'include')

diff --git a/include/net/raw.h b/include/net/raw.h
index 8ad8df594853..719d3556fc0a 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -20,9 +20,8 @@
 extern struct proto raw_prot;
 
 extern struct raw_hashinfo raw_v4_hashinfo;
-struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
-			     unsigned short num, __be32 raddr,
-			     __be32 laddr, int dif, int sdif);
+bool raw_v4_match(struct net *net, struct sock *sk, unsigned short num,
+		  __be32 raddr, __be32 laddr, int dif, int sdif);
 
 int raw_abort(struct sock *sk, int err);
 void raw_icmp_error(struct sk_buff *, int, u32);
diff --git a/include/net/rawv6.h b/include/net/rawv6.h
index 53d86b6055e8..c48c1298699a 100644
--- a/include/net/rawv6.h
+++ b/include/net/rawv6.h
@@ -5,9 +5,9 @@
 #include <net/protocol.h>
 
 extern struct raw_hashinfo raw_v6_hashinfo;
-struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
-			     unsigned short num, const struct in6_addr *loc_addr,
-			     const struct in6_addr *rmt_addr, int dif, int sdif);
+bool raw_v6_match(struct net *net, struct sock *sk, unsigned short num,
+		  const struct in6_addr *loc_addr,
+		  const struct in6_addr *rmt_addr, int dif, int sdif);
 
 int raw_abort(struct sock *sk, int err);
 
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index bbd717805b10..05e0de4a7c7f 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -117,24 +117,19 @@ void raw_unhash_sk(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(raw_unhash_sk);
 
-struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
-			     unsigned short num, __be32 raddr, __be32 laddr,
-			     int dif, int sdif)
+bool raw_v4_match(struct net *net, struct sock *sk, unsigned short num,
+		  __be32 raddr, __be32 laddr, int dif, int sdif)
 {
-	sk_for_each_from(sk) {
-		struct inet_sock *inet = inet_sk(sk);
-
-		if (net_eq(sock_net(sk), net) && inet->inet_num == num	&&
-		    !(inet->inet_daddr && inet->inet_daddr != raddr) 	&&
-		    !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
-		    raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
-			goto found; /* gotcha */
-	}
-	sk = NULL;
-found:
-	return sk;
+	struct inet_sock *inet = inet_sk(sk);
+
+	if (net_eq(sock_net(sk), net) && inet->inet_num == num	&&
+	    !(inet->inet_daddr && inet->inet_daddr != raddr) 	&&
+	    !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
+	    raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
+		return true;
+	return false;
 }
-EXPORT_SYMBOL_GPL(__raw_v4_lookup);
+EXPORT_SYMBOL_GPL(raw_v4_match);
 
 /*
  *	0 - deliver
@@ -168,23 +163,21 @@ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)
  */
 static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
 {
+	struct net *net = dev_net(skb->dev);
 	int sdif = inet_sdif(skb);
 	int dif = inet_iif(skb);
-	struct sock *sk;
 	struct hlist_head *head;
 	int delivered = 0;
-	struct net *net;
+	struct sock *sk;
 
-	read_lock(&raw_v4_hashinfo.lock);
 	head = &raw_v4_hashinfo.ht[hash];
 	if (hlist_empty(head))
-		goto out;
-
-	net = dev_net(skb->dev);
-	sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol,
-			     iph->saddr, iph->daddr, dif, sdif);
-
-	while (sk) {
+		return 0;
+	read_lock(&raw_v4_hashinfo.lock);
+	sk_for_each(sk, head) {
+		if (!raw_v4_match(net, sk, iph->protocol,
+				  iph->saddr, iph->daddr, dif, sdif))
+			continue;
 		delivered = 1;
 		if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) &&
 		    ip_mc_sf_allow(sk, iph->daddr, iph->saddr,
@@ -195,31 +188,16 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
 			if (clone)
 				raw_rcv(sk, clone);
 		}
-		sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol,
-				     iph->saddr, iph->daddr,
-				     dif, sdif);
 	}
-out:
 	read_unlock(&raw_v4_hashinfo.lock);
 	return delivered;
 }
 
 int raw_local_deliver(struct sk_buff *skb, int protocol)
 {
-	int hash;
-	struct sock *raw_sk;
-
-	hash = protocol & (RAW_HTABLE_SIZE - 1);
-	raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);
-
-	/* If there maybe a raw socket we must check - if not we
-	 * don't care less
-	 */
-	if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash))
-		raw_sk = NULL;
-
-	return raw_sk != NULL;
+	int hash = protocol & (RAW_HTABLE_SIZE - 1);
 
+	return raw_v4_input(skb, ip_hdr(skb), hash);
 }
 
 static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
@@ -286,29 +264,24 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
 
 void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
 {
-	int hash;
-	struct sock *raw_sk;
+	struct net *net = dev_net(skb->dev);;
+	int dif = skb->dev->ifindex;
+	int sdif = inet_sdif(skb);
+	struct hlist_head *head;
 	const struct iphdr *iph;
-	struct net *net;
+	struct sock *sk;
+	int hash;
 
 	hash = protocol & (RAW_HTABLE_SIZE - 1);
+	head = &raw_v4_hashinfo.ht[hash];
 
 	read_lock(&raw_v4_hashinfo.lock);
-	raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);
-	if (raw_sk) {
-		int dif = skb->dev->ifindex;
-		int sdif = inet_sdif(skb);
-
+	sk_for_each(sk, head) {
 		iph = (const struct iphdr *)skb->data;
-		net = dev_net(skb->dev);
-
-		while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol,
-						iph->daddr, iph->saddr,
-						dif, sdif)) != NULL) {
-			raw_err(raw_sk, skb, info);
-			raw_sk = sk_next(raw_sk);
-			iph = (const struct iphdr *)skb->data;
-		}
+		if (!raw_v4_match(net, sk, iph->protocol,
+				  iph->saddr, iph->daddr, dif, sdif))
+			continue;
+		raw_err(sk, skb, info);
 	}
 	read_unlock(&raw_v4_hashinfo.lock);
 }
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
index ccacbde30a2c..b6d92dc7b051 100644
--- a/net/ipv4/raw_diag.c
+++ b/net/ipv4/raw_diag.c
@@ -34,31 +34,30 @@ raw_get_hashinfo(const struct inet_diag_req_v2 *r)
  * use helper to figure it out.
  */
 
-static struct sock *raw_lookup(struct net *net, struct sock *from,
-			       const struct inet_diag_req_v2 *req)
+static bool raw_lookup(struct net *net, struct sock *sk,
+		       const struct inet_diag_req_v2 *req)
 {
 	struct inet_diag_req_raw *r = (void *)req;
-	struct sock *sk = NULL;
 
 	if (r->sdiag_family == AF_INET)
-		sk = __raw_v4_lookup(net, from, r->sdiag_raw_protocol,
-				     r->id.idiag_dst[0],
-				     r->id.idiag_src[0],
-				     r->id.idiag_if, 0);
+		return raw_v4_match(net, sk, r->sdiag_raw_protocol,
+				    r->id.idiag_dst[0],
+				    r->id.idiag_src[0],
+				    r->id.idiag_if, 0);
 #if IS_ENABLED(CONFIG_IPV6)
 	else
-		sk = __raw_v6_lookup(net, from, r->sdiag_raw_protocol,
-				     (const struct in6_addr *)r->id.idiag_src,
-				     (const struct in6_addr *)r->id.idiag_dst,
-				     r->id.idiag_if, 0);
+		return raw_v6_match(net, sk, r->sdiag_raw_protocol,
+				    (const struct in6_addr *)r->id.idiag_src,
+				    (const struct in6_addr *)r->id.idiag_dst,
+				    r->id.idiag_if, 0);
 #endif
-	return sk;
+	return false;
 }
 
 static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r)
 {
 	struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
-	struct sock *sk = NULL, *s;
+	struct sock *sk;
 	int slot;
 
 	if (IS_ERR(hashinfo))
@@ -66,9 +65,8 @@ static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2
 
 	read_lock(&hashinfo->lock);
 	for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) {
-		sk_for_each(s, &hashinfo->ht[slot]) {
-			sk = raw_lookup(net, s, r);
-			if (sk) {
+		sk_for_each(sk, &hashinfo->ht[slot]) {
+			if (raw_lookup(net, sk, r)) {
 				/*
 				 * Grab it and keep until we fill
 				 * diag meaage to be reported, so
@@ -81,10 +79,11 @@ static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2
 			}
 		}
 	}
+	sk = ERR_PTR(-ENOENT);
 out_unlock:
 	read_unlock(&hashinfo->lock);
 
-	return sk ? sk : ERR_PTR(-ENOENT);
+	return sk;
 }
 
 static int raw_diag_dump_one(struct netlink_callback *cb,
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 3b7cbd522b54..c0f2e3475984 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -66,41 +66,27 @@ struct raw_hashinfo raw_v6_hashinfo = {
 };
 EXPORT_SYMBOL_GPL(raw_v6_hashinfo);
 
-struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
-		unsigned short num, const struct in6_addr *loc_addr,
-		const struct in6_addr *rmt_addr, int dif, int sdif)
+bool raw_v6_match(struct net *net, struct sock *sk, unsigned short num,
+		  const struct in6_addr *loc_addr,
+		  const struct in6_addr *rmt_addr, int dif, int sdif)
 {
-	bool is_multicast = ipv6_addr_is_multicast(loc_addr);
-
-	sk_for_each_from(sk)
-		if (inet_sk(sk)->inet_num == num) {
-
-			if (!net_eq(sock_net(sk), net))
-				continue;
-
-			if (!ipv6_addr_any(&sk->sk_v6_daddr) &&
-			    !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr))
-				continue;
-
-			if (!raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
-						 dif, sdif))
-				continue;
-
-			if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
-				if (ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr))
-					goto found;
-				if (is_multicast &&
-				    inet6_mc_check(sk, loc_addr, rmt_addr))
-					goto found;
-				continue;
-			}
-			goto found;
-		}
-	sk = NULL;
-found:
-	return sk;
+	if (inet_sk(sk)->inet_num != num ||
+	    !net_eq(sock_net(sk), net) ||
+	    (!ipv6_addr_any(&sk->sk_v6_daddr) &&
+	     !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) ||
+	    !raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+				 dif, sdif))
+		return false;
+
+	if (ipv6_addr_any(&sk->sk_v6_rcv_saddr) ||
+	    ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr) ||
+	    (ipv6_addr_is_multicast(loc_addr) &&
+	     inet6_mc_check(sk, loc_addr, rmt_addr)))
+		return true;
+
+	return false;
 }
-EXPORT_SYMBOL_GPL(__raw_v6_lookup);
+EXPORT_SYMBOL_GPL(raw_v6_match);
 
 /*
  *	0 - deliver
@@ -156,31 +142,28 @@ EXPORT_SYMBOL(rawv6_mh_filter_unregister);
  */
 static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
 {
+	struct net *net = dev_net(skb->dev);
 	const struct in6_addr *saddr;
 	const struct in6_addr *daddr;
+	struct hlist_head *head;
 	struct sock *sk;
 	bool delivered = false;
 	__u8 hash;
-	struct net *net;
 
 	saddr = &ipv6_hdr(skb)->saddr;
 	daddr = saddr + 1;
 
 	hash = nexthdr & (RAW_HTABLE_SIZE - 1);
-
+	head = &raw_v6_hashinfo.ht[hash];
+	if (hlist_empty(head))
+		return false;
 	read_lock(&raw_v6_hashinfo.lock);
-	sk = sk_head(&raw_v6_hashinfo.ht[hash]);
-
-	if (!sk)
-		goto out;
-
-	net = dev_net(skb->dev);
-	sk = __raw_v6_lookup(net, sk, nexthdr, daddr, saddr,
-			     inet6_iif(skb), inet6_sdif(skb));
-
-	while (sk) {
+	sk_for_each(sk, head) {
 		int filtered;
 
+		if (!raw_v6_match(net, sk, nexthdr, daddr, saddr,
+				  inet6_iif(skb), inet6_sdif(skb)))
+			continue;
 		delivered = true;
 		switch (nexthdr) {
 		case IPPROTO_ICMPV6:
@@ -219,23 +202,14 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
 				rawv6_rcv(sk, clone);
 			}
 		}
-		sk = __raw_v6_lookup(net, sk_next(sk), nexthdr, daddr, saddr,
-				     inet6_iif(skb), inet6_sdif(skb));
 	}
-out:
 	read_unlock(&raw_v6_hashinfo.lock);
 	return delivered;
 }
 
 bool raw6_local_deliver(struct sk_buff *skb, int nexthdr)
 {
-	struct sock *raw_sk;
-
-	raw_sk = sk_head(&raw_v6_hashinfo.ht[nexthdr & (RAW_HTABLE_SIZE - 1)]);
-	if (raw_sk && !ipv6_raw_deliver(skb, nexthdr))
-		raw_sk = NULL;
-
-	return raw_sk != NULL;
+	return ipv6_raw_deliver(skb, nexthdr);
 }
 
 /* This cleans up af_inet6 a bit. -DaveM */
@@ -361,28 +335,25 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb,
 void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
 		u8 type, u8 code, int inner_offset, __be32 info)
 {
+	const struct in6_addr *saddr, *daddr;
+	struct net *net = dev_net(skb->dev);
+	struct hlist_head *head;
 	struct sock *sk;
 	int hash;
-	const struct in6_addr *saddr, *daddr;
-	struct net *net;
 
 	hash = nexthdr & (RAW_HTABLE_SIZE - 1);
-
+	head = &raw_v6_hashinfo.ht[hash];
 	read_lock(&raw_v6_hashinfo.lock);
-	sk = sk_head(&raw_v6_hashinfo.ht[hash]);
-	if (sk) {
+	sk_for_each(sk, head) {
 		/* Note: ipv6_hdr(skb) != skb->data */
 		const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data;
 		saddr = &ip6h->saddr;
 		daddr = &ip6h->daddr;
-		net = dev_net(skb->dev);
 
-		while ((sk = __raw_v6_lookup(net, sk, nexthdr, saddr, daddr,
-					     inet6_iif(skb), inet6_iif(skb)))) {
-			rawv6_err(sk, skb, NULL, type, code,
-					inner_offset, info);
-			sk = sk_next(sk);
-		}
+		if (!raw_v6_match(net, sk, nexthdr, &ip6h->saddr, &ip6h->daddr,
+				  inet6_iif(skb), inet6_iif(skb)))
+			continue;
+		rawv6_err(sk, skb, NULL, type, code, inner_offset, info);
 	}
 	read_unlock(&raw_v6_hashinfo.lock);
 }
-- 
cgit 


From 0daf07e527095e64ee8927ce297ab626643e9f51 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 17 Jun 2022 20:47:05 -0700
Subject: raw: convert raw sockets to RCU

Using rwlock in networking code is extremely risky.
writers can starve if enough readers are constantly
grabing the rwlock.

I thought rwlock were at fault and sent this patch:

https://lkml.org/lkml/2022/6/17/272

But Peter and Linus essentially told me rwlock had to be unfair.

We need to get rid of rwlock in networking code.

Without this fix, following script triggers soft lockups:

for i in {1..48}
do
 ping -f -n -q 127.0.0.1 &
 sleep 0.1
done

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/raw.h   | 11 ++++++-
 include/net/rawv6.h |  1 +
 net/ipv4/af_inet.c  |  2 ++
 net/ipv4/raw.c      | 83 ++++++++++++++++++++++++-----------------------------
 net/ipv4/raw_diag.c | 22 ++++++++------
 net/ipv6/af_inet6.c |  3 ++
 net/ipv6/raw.c      | 28 +++++++++---------
 7 files changed, 80 insertions(+), 70 deletions(-)

(limited to 'include')

diff --git a/include/net/raw.h b/include/net/raw.h
index 719d3556fc0a..d81eeeb8f1e6 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -33,9 +33,18 @@ int raw_rcv(struct sock *, struct sk_buff *);
 
 struct raw_hashinfo {
 	rwlock_t lock;
-	struct hlist_head ht[RAW_HTABLE_SIZE];
+	struct hlist_nulls_head ht[RAW_HTABLE_SIZE];
 };
 
+static inline void raw_hashinfo_init(struct raw_hashinfo *hashinfo)
+{
+	int i;
+
+	rwlock_init(&hashinfo->lock);
+	for (i = 0; i < RAW_HTABLE_SIZE; i++)
+		INIT_HLIST_NULLS_HEAD(&hashinfo->ht[i], i);
+}
+
 #ifdef CONFIG_PROC_FS
 int raw_proc_init(void);
 void raw_proc_exit(void);
diff --git a/include/net/rawv6.h b/include/net/rawv6.h
index c48c1298699a..bc70909625f6 100644
--- a/include/net/rawv6.h
+++ b/include/net/rawv6.h
@@ -3,6 +3,7 @@
 #define _NET_RAWV6_H
 
 #include <net/protocol.h>
+#include <net/raw.h>
 
 extern struct raw_hashinfo raw_v6_hashinfo;
 bool raw_v6_match(struct net *net, struct sock *sk, unsigned short num,
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 30e0e8992085..da81f56fdd1c 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1929,6 +1929,8 @@ static int __init inet_init(void)
 
 	sock_skb_cb_check_size(sizeof(struct inet_skb_parm));
 
+	raw_hashinfo_init(&raw_v4_hashinfo);
+
 	rc = proto_register(&tcp_prot, 1);
 	if (rc)
 		goto out;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 05e0de4a7c7f..d28bf0b901a2 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -85,20 +85,19 @@ struct raw_frag_vec {
 	int hlen;
 };
 
-struct raw_hashinfo raw_v4_hashinfo = {
-	.lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
-};
+struct raw_hashinfo raw_v4_hashinfo;
 EXPORT_SYMBOL_GPL(raw_v4_hashinfo);
 
 int raw_hash_sk(struct sock *sk)
 {
 	struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
-	struct hlist_head *head;
+	struct hlist_nulls_head *hlist;
 
-	head = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)];
+	hlist = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)];
 
 	write_lock_bh(&h->lock);
-	sk_add_node(sk, head);
+	hlist_nulls_add_head_rcu(&sk->sk_nulls_node, hlist);
+	sock_set_flag(sk, SOCK_RCU_FREE);
 	write_unlock_bh(&h->lock);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 
@@ -111,7 +110,7 @@ void raw_unhash_sk(struct sock *sk)
 	struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
 
 	write_lock_bh(&h->lock);
-	if (sk_del_node_init(sk))
+	if (__sk_nulls_del_node_init_rcu(sk))
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 	write_unlock_bh(&h->lock);
 }
@@ -164,17 +163,16 @@ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)
 static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
 {
 	struct net *net = dev_net(skb->dev);
+	struct hlist_nulls_head *hlist;
+	struct hlist_nulls_node *hnode;
 	int sdif = inet_sdif(skb);
 	int dif = inet_iif(skb);
-	struct hlist_head *head;
 	int delivered = 0;
 	struct sock *sk;
 
-	head = &raw_v4_hashinfo.ht[hash];
-	if (hlist_empty(head))
-		return 0;
-	read_lock(&raw_v4_hashinfo.lock);
-	sk_for_each(sk, head) {
+	hlist = &raw_v4_hashinfo.ht[hash];
+	rcu_read_lock();
+	hlist_nulls_for_each_entry(sk, hnode, hlist, sk_nulls_node) {
 		if (!raw_v4_match(net, sk, iph->protocol,
 				  iph->saddr, iph->daddr, dif, sdif))
 			continue;
@@ -189,7 +187,7 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
 				raw_rcv(sk, clone);
 		}
 	}
-	read_unlock(&raw_v4_hashinfo.lock);
+	rcu_read_unlock();
 	return delivered;
 }
 
@@ -265,25 +263,26 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
 void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
 {
 	struct net *net = dev_net(skb->dev);;
+	struct hlist_nulls_head *hlist;
+	struct hlist_nulls_node *hnode;
 	int dif = skb->dev->ifindex;
 	int sdif = inet_sdif(skb);
-	struct hlist_head *head;
 	const struct iphdr *iph;
 	struct sock *sk;
 	int hash;
 
 	hash = protocol & (RAW_HTABLE_SIZE - 1);
-	head = &raw_v4_hashinfo.ht[hash];
+	hlist = &raw_v4_hashinfo.ht[hash];
 
-	read_lock(&raw_v4_hashinfo.lock);
-	sk_for_each(sk, head) {
+	rcu_read_lock();
+	hlist_nulls_for_each_entry(sk, hnode, hlist, sk_nulls_node) {
 		iph = (const struct iphdr *)skb->data;
 		if (!raw_v4_match(net, sk, iph->protocol,
 				  iph->saddr, iph->daddr, dif, sdif))
 			continue;
 		raw_err(sk, skb, info);
 	}
-	read_unlock(&raw_v4_hashinfo.lock);
+	rcu_read_unlock();
 }
 
 static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
@@ -944,44 +943,41 @@ struct proto raw_prot = {
 };
 
 #ifdef CONFIG_PROC_FS
-static struct sock *raw_get_first(struct seq_file *seq)
+static struct sock *raw_get_first(struct seq_file *seq, int bucket)
 {
-	struct sock *sk;
 	struct raw_hashinfo *h = pde_data(file_inode(seq->file));
 	struct raw_iter_state *state = raw_seq_private(seq);
+	struct hlist_nulls_head *hlist;
+	struct hlist_nulls_node *hnode;
+	struct sock *sk;
 
-	for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE;
+	for (state->bucket = bucket; state->bucket < RAW_HTABLE_SIZE;
 			++state->bucket) {
-		sk_for_each(sk, &h->ht[state->bucket])
+		hlist = &h->ht[state->bucket];
+		hlist_nulls_for_each_entry(sk, hnode, hlist, sk_nulls_node) {
 			if (sock_net(sk) == seq_file_net(seq))
-				goto found;
+				return sk;
+		}
 	}
-	sk = NULL;
-found:
-	return sk;
+	return NULL;
 }
 
 static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
 {
-	struct raw_hashinfo *h = pde_data(file_inode(seq->file));
 	struct raw_iter_state *state = raw_seq_private(seq);
 
 	do {
-		sk = sk_next(sk);
-try_again:
-		;
+		sk = sk_nulls_next(sk);
 	} while (sk && sock_net(sk) != seq_file_net(seq));
 
-	if (!sk && ++state->bucket < RAW_HTABLE_SIZE) {
-		sk = sk_head(&h->ht[state->bucket]);
-		goto try_again;
-	}
+	if (!sk)
+		return raw_get_first(seq, state->bucket + 1);
 	return sk;
 }
 
 static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
 {
-	struct sock *sk = raw_get_first(seq);
+	struct sock *sk = raw_get_first(seq, 0);
 
 	if (sk)
 		while (pos && (sk = raw_get_next(seq, sk)) != NULL)
@@ -990,11 +986,9 @@ static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
 }
 
 void *raw_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(&h->lock)
+	__acquires(RCU)
 {
-	struct raw_hashinfo *h = pde_data(file_inode(seq->file));
-
-	read_lock(&h->lock);
+	rcu_read_lock();
 	return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 }
 EXPORT_SYMBOL_GPL(raw_seq_start);
@@ -1004,7 +998,7 @@ void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	struct sock *sk;
 
 	if (v == SEQ_START_TOKEN)
-		sk = raw_get_first(seq);
+		sk = raw_get_first(seq, 0);
 	else
 		sk = raw_get_next(seq, v);
 	++*pos;
@@ -1013,11 +1007,9 @@ void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 EXPORT_SYMBOL_GPL(raw_seq_next);
 
 void raw_seq_stop(struct seq_file *seq, void *v)
-	__releases(&h->lock)
+	__releases(RCU)
 {
-	struct raw_hashinfo *h = pde_data(file_inode(seq->file));
-
-	read_unlock(&h->lock);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(raw_seq_stop);
 
@@ -1079,6 +1071,7 @@ static __net_initdata struct pernet_operations raw_net_ops = {
 
 int __init raw_proc_init(void)
 {
+
 	return register_pernet_subsys(&raw_net_ops);
 }
 
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
index b6d92dc7b051..5f208e840d85 100644
--- a/net/ipv4/raw_diag.c
+++ b/net/ipv4/raw_diag.c
@@ -57,31 +57,32 @@ static bool raw_lookup(struct net *net, struct sock *sk,
 static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r)
 {
 	struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
+	struct hlist_nulls_head *hlist;
+	struct hlist_nulls_node *hnode;
 	struct sock *sk;
 	int slot;
 
 	if (IS_ERR(hashinfo))
 		return ERR_CAST(hashinfo);
 
-	read_lock(&hashinfo->lock);
+	rcu_read_lock();
 	for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) {
-		sk_for_each(sk, &hashinfo->ht[slot]) {
+		hlist = &hashinfo->ht[slot];
+		hlist_nulls_for_each_entry(sk, hnode, hlist, sk_nulls_node) {
 			if (raw_lookup(net, sk, r)) {
 				/*
 				 * Grab it and keep until we fill
-				 * diag meaage to be reported, so
+				 * diag message to be reported, so
 				 * caller should call sock_put then.
-				 * We can do that because we're keeping
-				 * hashinfo->lock here.
 				 */
-				sock_hold(sk);
-				goto out_unlock;
+				if (refcount_inc_not_zero(&sk->sk_refcnt))
+					goto out_unlock;
 			}
 		}
 	}
 	sk = ERR_PTR(-ENOENT);
 out_unlock:
-	read_unlock(&hashinfo->lock);
+	rcu_read_unlock();
 
 	return sk;
 }
@@ -141,6 +142,8 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
 	struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
 	struct net *net = sock_net(skb->sk);
 	struct inet_diag_dump_data *cb_data;
+	struct hlist_nulls_head *hlist;
+	struct hlist_nulls_node *hnode;
 	int num, s_num, slot, s_slot;
 	struct sock *sk = NULL;
 	struct nlattr *bc;
@@ -157,7 +160,8 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
 	for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) {
 		num = 0;
 
-		sk_for_each(sk, &hashinfo->ht[slot]) {
+		hlist = &hashinfo->ht[slot];
+		hlist_nulls_for_each_entry(sk, hnode, hlist, sk_nulls_node) {
 			struct inet_sock *inet = inet_sk(sk);
 
 			if (!net_eq(sock_net(sk), net))
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 70564ddccc46..658823e91eca 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -63,6 +63,7 @@
 #include <net/compat.h>
 #include <net/xfrm.h>
 #include <net/ioam6.h>
+#include <net/rawv6.h>
 
 #include <linux/uaccess.h>
 #include <linux/mroute6.h>
@@ -1073,6 +1074,8 @@ static int __init inet6_init(void)
 		goto out;
 	}
 
+	raw_hashinfo_init(&raw_v6_hashinfo);
+
 	err = proto_register(&tcpv6_prot, 1);
 	if (err)
 		goto out;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index c0f2e3475984..f6119998700e 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -61,9 +61,7 @@
 
 #define	ICMPV6_HDRLEN	4	/* ICMPv6 header, RFC 4443 Section 2.1 */
 
-struct raw_hashinfo raw_v6_hashinfo = {
-	.lock = __RW_LOCK_UNLOCKED(raw_v6_hashinfo.lock),
-};
+struct raw_hashinfo raw_v6_hashinfo;
 EXPORT_SYMBOL_GPL(raw_v6_hashinfo);
 
 bool raw_v6_match(struct net *net, struct sock *sk, unsigned short num,
@@ -143,9 +141,10 @@ EXPORT_SYMBOL(rawv6_mh_filter_unregister);
 static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
 {
 	struct net *net = dev_net(skb->dev);
+	struct hlist_nulls_head *hlist;
+	struct hlist_nulls_node *hnode;
 	const struct in6_addr *saddr;
 	const struct in6_addr *daddr;
-	struct hlist_head *head;
 	struct sock *sk;
 	bool delivered = false;
 	__u8 hash;
@@ -154,11 +153,9 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
 	daddr = saddr + 1;
 
 	hash = nexthdr & (RAW_HTABLE_SIZE - 1);
-	head = &raw_v6_hashinfo.ht[hash];
-	if (hlist_empty(head))
-		return false;
-	read_lock(&raw_v6_hashinfo.lock);
-	sk_for_each(sk, head) {
+	hlist = &raw_v6_hashinfo.ht[hash];
+	rcu_read_lock();
+	hlist_nulls_for_each_entry(sk, hnode, hlist, sk_nulls_node) {
 		int filtered;
 
 		if (!raw_v6_match(net, sk, nexthdr, daddr, saddr,
@@ -203,7 +200,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
 			}
 		}
 	}
-	read_unlock(&raw_v6_hashinfo.lock);
+	rcu_read_unlock();
 	return delivered;
 }
 
@@ -337,14 +334,15 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
 {
 	const struct in6_addr *saddr, *daddr;
 	struct net *net = dev_net(skb->dev);
-	struct hlist_head *head;
+	struct hlist_nulls_head *hlist;
+	struct hlist_nulls_node *hnode;
 	struct sock *sk;
 	int hash;
 
 	hash = nexthdr & (RAW_HTABLE_SIZE - 1);
-	head = &raw_v6_hashinfo.ht[hash];
-	read_lock(&raw_v6_hashinfo.lock);
-	sk_for_each(sk, head) {
+	hlist = &raw_v6_hashinfo.ht[hash];
+	rcu_read_lock();
+	hlist_nulls_for_each_entry(sk, hnode, hlist, sk_nulls_node) {
 		/* Note: ipv6_hdr(skb) != skb->data */
 		const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data;
 		saddr = &ip6h->saddr;
@@ -355,7 +353,7 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
 			continue;
 		rawv6_err(sk, skb, NULL, type, code, inner_offset, info);
 	}
-	read_unlock(&raw_v6_hashinfo.lock);
+	rcu_read_unlock();
 }
 
 static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb)
-- 
cgit 


From bdb6cfe7512f7a214815a3092f0be50963dcacbc Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Sat, 18 Jun 2022 11:28:32 +0100
Subject: net: mii: add mii_bmcr_encode_fixed()

Add a function to encode a fixed speed/duplex to a BMCR value.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mii.h | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

(limited to 'include')

diff --git a/include/linux/mii.h b/include/linux/mii.h
index 5ee13083cec7..d5a959ce4877 100644
--- a/include/linux/mii.h
+++ b/include/linux/mii.h
@@ -545,4 +545,39 @@ static inline u8 mii_resolve_flowctrl_fdx(u16 lcladv, u16 rmtadv)
 	return cap;
 }
 
+/**
+ * mii_bmcr_encode_fixed - encode fixed speed/duplex settings to a BMCR value
+ * @speed: a SPEED_* value
+ * @duplex: a DUPLEX_* value
+ *
+ * Encode the speed and duplex to a BMCR value. 2500, 1000, 100 and 10 Mbps are
+ * supported. 2500Mbps is encoded to 1000Mbps. Other speeds are encoded as 10
+ * Mbps. Unknown duplex values are encoded to half-duplex.
+ */
+static inline u16 mii_bmcr_encode_fixed(int speed, int duplex)
+{
+	u16 bmcr;
+
+	switch (speed) {
+	case SPEED_2500:
+	case SPEED_1000:
+		bmcr = BMCR_SPEED1000;
+		break;
+
+	case SPEED_100:
+		bmcr = BMCR_SPEED100;
+		break;
+
+	case SPEED_10:
+	default:
+		bmcr = BMCR_SPEED10;
+		break;
+	}
+
+	if (duplex == DUPLEX_FULL)
+		bmcr |= BMCR_FULLDPLX;
+
+	return bmcr;
+}
+
 #endif /* __LINUX_MII_H__ */
-- 
cgit 


From 2e0aee8f0a22c60a1ae0876f7233e70ad9d026b8 Mon Sep 17 00:00:00 2001
From: Sergey Shtylyov <s.shtylyov@omp.ru>
Date: Thu, 16 Jun 2022 23:51:49 +0300
Subject: ata: make ata_port::fastdrain_cnt *unsigned int*

*unsigned long* ata_port::fastdrain_cnt (64-bit value in a 64-bit kernel)
is always assigned from the 32-bit *unsigned int* variables, thus could
also be made just *unsigned int*...

Signed-off-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 include/linux/libata.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index a8bc88b4fe07..0269ff114f5a 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -847,7 +847,7 @@ struct ata_port {
 	enum ata_lpm_policy	target_lpm_policy;
 
 	struct timer_list	fastdrain_timer;
-	unsigned long		fastdrain_cnt;
+	unsigned int		fastdrain_cnt;
 
 	async_cookie_t		cookie;
 
-- 
cgit 


From cd76175a2b204911a3cddef36b99e56945b6938c Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Fri, 17 Jun 2022 16:40:47 +0200
Subject: ALSA: rawmidi: Make internal functions local static

__snd_rawmidi_transmit_peek() and __snd_rawmidi_transmit_ack() are
never called from the outside.  Let's make them local static and
unexport them.

Link: https://lore.kernel.org/r/20220617144051.18985-2-tiwai@suse.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/rawmidi.h |  4 ----
 sound/core/rawmidi.c    | 13 ++++++-------
 2 files changed, 6 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/sound/rawmidi.h b/include/sound/rawmidi.h
index 7a08ed2acd60..9402c25ae9ba 100644
--- a/include/sound/rawmidi.h
+++ b/include/sound/rawmidi.h
@@ -156,10 +156,6 @@ int snd_rawmidi_transmit_peek(struct snd_rawmidi_substream *substream,
 int snd_rawmidi_transmit_ack(struct snd_rawmidi_substream *substream, int count);
 int snd_rawmidi_transmit(struct snd_rawmidi_substream *substream,
 			 unsigned char *buffer, int count);
-int __snd_rawmidi_transmit_peek(struct snd_rawmidi_substream *substream,
-			      unsigned char *buffer, int count);
-int __snd_rawmidi_transmit_ack(struct snd_rawmidi_substream *substream,
-			       int count);
 int snd_rawmidi_proceed(struct snd_rawmidi_substream *substream);
 
 /* main midi functions */
diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c
index befa9809ff00..82e8f656bbb2 100644
--- a/sound/core/rawmidi.c
+++ b/sound/core/rawmidi.c
@@ -1258,7 +1258,7 @@ int snd_rawmidi_transmit_empty(struct snd_rawmidi_substream *substream)
 }
 EXPORT_SYMBOL(snd_rawmidi_transmit_empty);
 
-/**
+/*
  * __snd_rawmidi_transmit_peek - copy data from the internal buffer
  * @substream: the rawmidi substream
  * @buffer: the buffer pointer
@@ -1266,8 +1266,8 @@ EXPORT_SYMBOL(snd_rawmidi_transmit_empty);
  *
  * This is a variant of snd_rawmidi_transmit_peek() without spinlock.
  */
-int __snd_rawmidi_transmit_peek(struct snd_rawmidi_substream *substream,
-			      unsigned char *buffer, int count)
+static int __snd_rawmidi_transmit_peek(struct snd_rawmidi_substream *substream,
+				       unsigned char *buffer, int count)
 {
 	int result, count1;
 	struct snd_rawmidi_runtime *runtime = substream->runtime;
@@ -1304,7 +1304,6 @@ int __snd_rawmidi_transmit_peek(struct snd_rawmidi_substream *substream,
       __skip:
 	return result;
 }
-EXPORT_SYMBOL(__snd_rawmidi_transmit_peek);
 
 /**
  * snd_rawmidi_transmit_peek - copy data from the internal buffer
@@ -1334,14 +1333,15 @@ int snd_rawmidi_transmit_peek(struct snd_rawmidi_substream *substream,
 }
 EXPORT_SYMBOL(snd_rawmidi_transmit_peek);
 
-/**
+/*
  * __snd_rawmidi_transmit_ack - acknowledge the transmission
  * @substream: the rawmidi substream
  * @count: the transferred count
  *
  * This is a variant of __snd_rawmidi_transmit_ack() without spinlock.
  */
-int __snd_rawmidi_transmit_ack(struct snd_rawmidi_substream *substream, int count)
+static int __snd_rawmidi_transmit_ack(struct snd_rawmidi_substream *substream,
+				      int count)
 {
 	struct snd_rawmidi_runtime *runtime = substream->runtime;
 
@@ -1361,7 +1361,6 @@ int __snd_rawmidi_transmit_ack(struct snd_rawmidi_substream *substream, int coun
 	}
 	return count;
 }
-EXPORT_SYMBOL(__snd_rawmidi_transmit_ack);
 
 /**
  * snd_rawmidi_transmit_ack - acknowledge the transmission
-- 
cgit 


From f1d40433352e5d4babd59c0dd50b5f9414073ddb Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Fri, 17 Jun 2022 16:40:48 +0200
Subject: ALSA: rawmidi: Move lock to snd_rawmidi_substream

Having a lock in snd_rawmidi_runtime can be a problem especially when
a substream is accessed from the outside, as the runtime creation
might be racy with the external calls.  As a first step for hardening,
move the spinlock from snd_rawmidi_runtime to snd_rawmidi_substream.

This patch just replaces the lock calls, no real functional change is
put yet.

Link: https://lore.kernel.org/r/20220617144051.18985-3-tiwai@suse.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/rawmidi.h |   2 +-
 sound/core/rawmidi.c    | 131 +++++++++++++++++++++++-------------------------
 2 files changed, 65 insertions(+), 68 deletions(-)

(limited to 'include')

diff --git a/include/sound/rawmidi.h b/include/sound/rawmidi.h
index 9402c25ae9ba..e1f59b2940af 100644
--- a/include/sound/rawmidi.h
+++ b/include/sound/rawmidi.h
@@ -63,7 +63,6 @@ struct snd_rawmidi_runtime {
 	size_t xruns;		/* over/underruns counter */
 	int buffer_ref;		/* buffer reference count */
 	/* misc */
-	spinlock_t lock;
 	wait_queue_head_t sleep;
 	/* event handler (new bytes, input only) */
 	void (*event)(struct snd_rawmidi_substream *substream);
@@ -85,6 +84,7 @@ struct snd_rawmidi_substream {
 	unsigned int clock_type;	/* clock source to use for input framing */
 	int use_count;			/* use counter (for output) */
 	size_t bytes;
+	spinlock_t lock;
 	struct snd_rawmidi *rmidi;
 	struct snd_rawmidi_str *pstr;
 	char name[32];
diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c
index 82e8f656bbb2..0a00f37d8c42 100644
--- a/sound/core/rawmidi.c
+++ b/sound/core/rawmidi.c
@@ -102,13 +102,12 @@ static inline bool __snd_rawmidi_ready(struct snd_rawmidi_runtime *runtime)
 
 static bool snd_rawmidi_ready(struct snd_rawmidi_substream *substream)
 {
-	struct snd_rawmidi_runtime *runtime = substream->runtime;
 	unsigned long flags;
 	bool ready;
 
-	spin_lock_irqsave(&runtime->lock, flags);
-	ready = __snd_rawmidi_ready(runtime);
-	spin_unlock_irqrestore(&runtime->lock, flags);
+	spin_lock_irqsave(&substream->lock, flags);
+	ready = __snd_rawmidi_ready(substream->runtime);
+	spin_unlock_irqrestore(&substream->lock, flags);
 	return ready;
 }
 
@@ -130,7 +129,7 @@ static void snd_rawmidi_input_event_work(struct work_struct *work)
 		runtime->event(runtime->substream);
 }
 
-/* buffer refcount management: call with runtime->lock held */
+/* buffer refcount management: call with substream->lock held */
 static inline void snd_rawmidi_buffer_ref(struct snd_rawmidi_runtime *runtime)
 {
 	runtime->buffer_ref++;
@@ -149,7 +148,6 @@ static int snd_rawmidi_runtime_create(struct snd_rawmidi_substream *substream)
 	if (!runtime)
 		return -ENOMEM;
 	runtime->substream = substream;
-	spin_lock_init(&runtime->lock);
 	init_waitqueue_head(&runtime->sleep);
 	INIT_WORK(&runtime->event_work, snd_rawmidi_input_event_work);
 	runtime->event = NULL;
@@ -203,20 +201,20 @@ static void __reset_runtime_ptrs(struct snd_rawmidi_runtime *runtime,
 	runtime->avail = is_input ? 0 : runtime->buffer_size;
 }
 
-static void reset_runtime_ptrs(struct snd_rawmidi_runtime *runtime,
+static void reset_runtime_ptrs(struct snd_rawmidi_substream *substream,
 			       bool is_input)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&runtime->lock, flags);
-	__reset_runtime_ptrs(runtime, is_input);
-	spin_unlock_irqrestore(&runtime->lock, flags);
+	spin_lock_irqsave(&substream->lock, flags);
+	__reset_runtime_ptrs(substream->runtime, is_input);
+	spin_unlock_irqrestore(&substream->lock, flags);
 }
 
 int snd_rawmidi_drop_output(struct snd_rawmidi_substream *substream)
 {
 	snd_rawmidi_output_trigger(substream, 0);
-	reset_runtime_ptrs(substream->runtime, false);
+	reset_runtime_ptrs(substream, false);
 	return 0;
 }
 EXPORT_SYMBOL(snd_rawmidi_drop_output);
@@ -256,7 +254,7 @@ EXPORT_SYMBOL(snd_rawmidi_drain_output);
 int snd_rawmidi_drain_input(struct snd_rawmidi_substream *substream)
 {
 	snd_rawmidi_input_trigger(substream, 0);
-	reset_runtime_ptrs(substream->runtime, true);
+	reset_runtime_ptrs(substream, true);
 	return 0;
 }
 EXPORT_SYMBOL(snd_rawmidi_drain_input);
@@ -676,10 +674,11 @@ static int snd_rawmidi_info_select_user(struct snd_card *card,
 	return 0;
 }
 
-static int resize_runtime_buffer(struct snd_rawmidi_runtime *runtime,
+static int resize_runtime_buffer(struct snd_rawmidi_substream *substream,
 				 struct snd_rawmidi_params *params,
 				 bool is_input)
 {
+	struct snd_rawmidi_runtime *runtime = substream->runtime;
 	char *newbuf, *oldbuf;
 	unsigned int framing = params->mode & SNDRV_RAWMIDI_MODE_FRAMING_MASK;
 
@@ -693,9 +692,9 @@ static int resize_runtime_buffer(struct snd_rawmidi_runtime *runtime,
 		newbuf = kvzalloc(params->buffer_size, GFP_KERNEL);
 		if (!newbuf)
 			return -ENOMEM;
-		spin_lock_irq(&runtime->lock);
+		spin_lock_irq(&substream->lock);
 		if (runtime->buffer_ref) {
-			spin_unlock_irq(&runtime->lock);
+			spin_unlock_irq(&substream->lock);
 			kvfree(newbuf);
 			return -EBUSY;
 		}
@@ -703,7 +702,7 @@ static int resize_runtime_buffer(struct snd_rawmidi_runtime *runtime,
 		runtime->buffer = newbuf;
 		runtime->buffer_size = params->buffer_size;
 		__reset_runtime_ptrs(runtime, is_input);
-		spin_unlock_irq(&runtime->lock);
+		spin_unlock_irq(&substream->lock);
 		kvfree(oldbuf);
 	}
 	runtime->avail_min = params->avail_min;
@@ -717,7 +716,7 @@ int snd_rawmidi_output_params(struct snd_rawmidi_substream *substream,
 		return -EBUSY;
 	snd_rawmidi_drain_output(substream);
 	substream->active_sensing = !params->no_active_sensing;
-	return resize_runtime_buffer(substream->runtime, params, false);
+	return resize_runtime_buffer(substream, params, false);
 }
 EXPORT_SYMBOL(snd_rawmidi_output_params);
 
@@ -735,7 +734,7 @@ int snd_rawmidi_input_params(struct snd_rawmidi_substream *substream,
 	if (framing > SNDRV_RAWMIDI_MODE_FRAMING_TSTAMP)
 		return -EINVAL;
 	snd_rawmidi_drain_input(substream);
-	err = resize_runtime_buffer(substream->runtime, params, true);
+	err = resize_runtime_buffer(substream, params, true);
 	if (err < 0)
 		return err;
 
@@ -752,9 +751,9 @@ static int snd_rawmidi_output_status(struct snd_rawmidi_substream *substream,
 
 	memset(status, 0, sizeof(*status));
 	status->stream = SNDRV_RAWMIDI_STREAM_OUTPUT;
-	spin_lock_irq(&runtime->lock);
+	spin_lock_irq(&substream->lock);
 	status->avail = runtime->avail;
-	spin_unlock_irq(&runtime->lock);
+	spin_unlock_irq(&substream->lock);
 	return 0;
 }
 
@@ -765,11 +764,11 @@ static int snd_rawmidi_input_status(struct snd_rawmidi_substream *substream,
 
 	memset(status, 0, sizeof(*status));
 	status->stream = SNDRV_RAWMIDI_STREAM_INPUT;
-	spin_lock_irq(&runtime->lock);
+	spin_lock_irq(&substream->lock);
 	status->avail = runtime->avail;
 	status->xruns = runtime->xruns;
 	runtime->xruns = 0;
-	spin_unlock_irq(&runtime->lock);
+	spin_unlock_irq(&substream->lock);
 	return 0;
 }
 
@@ -1074,7 +1073,7 @@ int snd_rawmidi_receive(struct snd_rawmidi_substream *substream,
 		return -EINVAL;
 	}
 
-	spin_lock_irqsave(&runtime->lock, flags);
+	spin_lock_irqsave(&substream->lock, flags);
 	if (substream->framing == SNDRV_RAWMIDI_MODE_FRAMING_TSTAMP) {
 		result = receive_with_tstamp_framing(substream, buffer, count, &ts64);
 	} else if (count == 1) {	/* special case, faster code */
@@ -1121,7 +1120,7 @@ int snd_rawmidi_receive(struct snd_rawmidi_substream *substream,
 		else if (__snd_rawmidi_ready(runtime))
 			wake_up(&runtime->sleep);
 	}
-	spin_unlock_irqrestore(&runtime->lock, flags);
+	spin_unlock_irqrestore(&substream->lock, flags);
 	return result;
 }
 EXPORT_SYMBOL(snd_rawmidi_receive);
@@ -1136,7 +1135,7 @@ static long snd_rawmidi_kernel_read1(struct snd_rawmidi_substream *substream,
 	unsigned long appl_ptr;
 	int err = 0;
 
-	spin_lock_irqsave(&runtime->lock, flags);
+	spin_lock_irqsave(&substream->lock, flags);
 	snd_rawmidi_buffer_ref(runtime);
 	while (count > 0 && runtime->avail) {
 		count1 = runtime->buffer_size - runtime->appl_ptr;
@@ -1154,11 +1153,11 @@ static long snd_rawmidi_kernel_read1(struct snd_rawmidi_substream *substream,
 		if (kernelbuf)
 			memcpy(kernelbuf + result, runtime->buffer + appl_ptr, count1);
 		if (userbuf) {
-			spin_unlock_irqrestore(&runtime->lock, flags);
+			spin_unlock_irqrestore(&substream->lock, flags);
 			if (copy_to_user(userbuf + result,
 					 runtime->buffer + appl_ptr, count1))
 				err = -EFAULT;
-			spin_lock_irqsave(&runtime->lock, flags);
+			spin_lock_irqsave(&substream->lock, flags);
 			if (err)
 				goto out;
 		}
@@ -1167,7 +1166,7 @@ static long snd_rawmidi_kernel_read1(struct snd_rawmidi_substream *substream,
 	}
  out:
 	snd_rawmidi_buffer_unref(runtime);
-	spin_unlock_irqrestore(&runtime->lock, flags);
+	spin_unlock_irqrestore(&substream->lock, flags);
 	return result > 0 ? result : err;
 }
 
@@ -1196,31 +1195,31 @@ static ssize_t snd_rawmidi_read(struct file *file, char __user *buf, size_t coun
 	snd_rawmidi_input_trigger(substream, 1);
 	result = 0;
 	while (count > 0) {
-		spin_lock_irq(&runtime->lock);
+		spin_lock_irq(&substream->lock);
 		while (!__snd_rawmidi_ready(runtime)) {
 			wait_queue_entry_t wait;
 
 			if ((file->f_flags & O_NONBLOCK) != 0 || result > 0) {
-				spin_unlock_irq(&runtime->lock);
+				spin_unlock_irq(&substream->lock);
 				return result > 0 ? result : -EAGAIN;
 			}
 			init_waitqueue_entry(&wait, current);
 			add_wait_queue(&runtime->sleep, &wait);
 			set_current_state(TASK_INTERRUPTIBLE);
-			spin_unlock_irq(&runtime->lock);
+			spin_unlock_irq(&substream->lock);
 			schedule();
 			remove_wait_queue(&runtime->sleep, &wait);
 			if (rfile->rmidi->card->shutdown)
 				return -ENODEV;
 			if (signal_pending(current))
 				return result > 0 ? result : -ERESTARTSYS;
-			spin_lock_irq(&runtime->lock);
+			spin_lock_irq(&substream->lock);
 			if (!runtime->avail) {
-				spin_unlock_irq(&runtime->lock);
+				spin_unlock_irq(&substream->lock);
 				return result > 0 ? result : -EIO;
 			}
 		}
-		spin_unlock_irq(&runtime->lock);
+		spin_unlock_irq(&substream->lock);
 		count1 = snd_rawmidi_kernel_read1(substream,
 						  (unsigned char __user *)buf,
 						  NULL/*kernelbuf*/,
@@ -1251,9 +1250,9 @@ int snd_rawmidi_transmit_empty(struct snd_rawmidi_substream *substream)
 			  "snd_rawmidi_transmit_empty: output is not active!!!\n");
 		return 1;
 	}
-	spin_lock_irqsave(&runtime->lock, flags);
+	spin_lock_irqsave(&substream->lock, flags);
 	result = runtime->avail >= runtime->buffer_size;
-	spin_unlock_irqrestore(&runtime->lock, flags);
+	spin_unlock_irqrestore(&substream->lock, flags);
 	return result;
 }
 EXPORT_SYMBOL(snd_rawmidi_transmit_empty);
@@ -1322,13 +1321,12 @@ static int __snd_rawmidi_transmit_peek(struct snd_rawmidi_substream *substream,
 int snd_rawmidi_transmit_peek(struct snd_rawmidi_substream *substream,
 			      unsigned char *buffer, int count)
 {
-	struct snd_rawmidi_runtime *runtime = substream->runtime;
 	int result;
 	unsigned long flags;
 
-	spin_lock_irqsave(&runtime->lock, flags);
+	spin_lock_irqsave(&substream->lock, flags);
 	result = __snd_rawmidi_transmit_peek(substream, buffer, count);
-	spin_unlock_irqrestore(&runtime->lock, flags);
+	spin_unlock_irqrestore(&substream->lock, flags);
 	return result;
 }
 EXPORT_SYMBOL(snd_rawmidi_transmit_peek);
@@ -1375,13 +1373,12 @@ static int __snd_rawmidi_transmit_ack(struct snd_rawmidi_substream *substream,
  */
 int snd_rawmidi_transmit_ack(struct snd_rawmidi_substream *substream, int count)
 {
-	struct snd_rawmidi_runtime *runtime = substream->runtime;
 	int result;
 	unsigned long flags;
 
-	spin_lock_irqsave(&runtime->lock, flags);
+	spin_lock_irqsave(&substream->lock, flags);
 	result = __snd_rawmidi_transmit_ack(substream, count);
-	spin_unlock_irqrestore(&runtime->lock, flags);
+	spin_unlock_irqrestore(&substream->lock, flags);
 	return result;
 }
 EXPORT_SYMBOL(snd_rawmidi_transmit_ack);
@@ -1399,11 +1396,10 @@ EXPORT_SYMBOL(snd_rawmidi_transmit_ack);
 int snd_rawmidi_transmit(struct snd_rawmidi_substream *substream,
 			 unsigned char *buffer, int count)
 {
-	struct snd_rawmidi_runtime *runtime = substream->runtime;
 	int result;
 	unsigned long flags;
 
-	spin_lock_irqsave(&runtime->lock, flags);
+	spin_lock_irqsave(&substream->lock, flags);
 	if (!substream->opened)
 		result = -EBADFD;
 	else {
@@ -1413,7 +1409,7 @@ int snd_rawmidi_transmit(struct snd_rawmidi_substream *substream,
 		else
 			result = __snd_rawmidi_transmit_ack(substream, count);
 	}
-	spin_unlock_irqrestore(&runtime->lock, flags);
+	spin_unlock_irqrestore(&substream->lock, flags);
 	return result;
 }
 EXPORT_SYMBOL(snd_rawmidi_transmit);
@@ -1430,12 +1426,12 @@ int snd_rawmidi_proceed(struct snd_rawmidi_substream *substream)
 	unsigned long flags;
 	int count = 0;
 
-	spin_lock_irqsave(&runtime->lock, flags);
+	spin_lock_irqsave(&substream->lock, flags);
 	if (runtime->avail < runtime->buffer_size) {
 		count = runtime->buffer_size - runtime->avail;
 		__snd_rawmidi_transmit_ack(substream, count);
 	}
-	spin_unlock_irqrestore(&runtime->lock, flags);
+	spin_unlock_irqrestore(&substream->lock, flags);
 	return count;
 }
 EXPORT_SYMBOL(snd_rawmidi_proceed);
@@ -1456,10 +1452,10 @@ static long snd_rawmidi_kernel_write1(struct snd_rawmidi_substream *substream,
 		return -EINVAL;
 
 	result = 0;
-	spin_lock_irqsave(&runtime->lock, flags);
+	spin_lock_irqsave(&substream->lock, flags);
 	if (substream->append) {
 		if ((long)runtime->avail < count) {
-			spin_unlock_irqrestore(&runtime->lock, flags);
+			spin_unlock_irqrestore(&substream->lock, flags);
 			return -EAGAIN;
 		}
 	}
@@ -1481,14 +1477,14 @@ static long snd_rawmidi_kernel_write1(struct snd_rawmidi_substream *substream,
 			memcpy(runtime->buffer + appl_ptr,
 			       kernelbuf + result, count1);
 		else if (userbuf) {
-			spin_unlock_irqrestore(&runtime->lock, flags);
+			spin_unlock_irqrestore(&substream->lock, flags);
 			if (copy_from_user(runtime->buffer + appl_ptr,
 					   userbuf + result, count1)) {
-				spin_lock_irqsave(&runtime->lock, flags);
+				spin_lock_irqsave(&substream->lock, flags);
 				result = result > 0 ? result : -EFAULT;
 				goto __end;
 			}
-			spin_lock_irqsave(&runtime->lock, flags);
+			spin_lock_irqsave(&substream->lock, flags);
 		}
 		result += count1;
 		count -= count1;
@@ -1496,7 +1492,7 @@ static long snd_rawmidi_kernel_write1(struct snd_rawmidi_substream *substream,
       __end:
 	count1 = runtime->avail < runtime->buffer_size;
 	snd_rawmidi_buffer_unref(runtime);
-	spin_unlock_irqrestore(&runtime->lock, flags);
+	spin_unlock_irqrestore(&substream->lock, flags);
 	if (count1)
 		snd_rawmidi_output_trigger(substream, 1);
 	return result;
@@ -1526,31 +1522,31 @@ static ssize_t snd_rawmidi_write(struct file *file, const char __user *buf,
 		return -EIO;
 	result = 0;
 	while (count > 0) {
-		spin_lock_irq(&runtime->lock);
+		spin_lock_irq(&substream->lock);
 		while (!snd_rawmidi_ready_append(substream, count)) {
 			wait_queue_entry_t wait;
 
 			if (file->f_flags & O_NONBLOCK) {
-				spin_unlock_irq(&runtime->lock);
+				spin_unlock_irq(&substream->lock);
 				return result > 0 ? result : -EAGAIN;
 			}
 			init_waitqueue_entry(&wait, current);
 			add_wait_queue(&runtime->sleep, &wait);
 			set_current_state(TASK_INTERRUPTIBLE);
-			spin_unlock_irq(&runtime->lock);
+			spin_unlock_irq(&substream->lock);
 			timeout = schedule_timeout(30 * HZ);
 			remove_wait_queue(&runtime->sleep, &wait);
 			if (rfile->rmidi->card->shutdown)
 				return -ENODEV;
 			if (signal_pending(current))
 				return result > 0 ? result : -ERESTARTSYS;
-			spin_lock_irq(&runtime->lock);
+			spin_lock_irq(&substream->lock);
 			if (!runtime->avail && !timeout) {
-				spin_unlock_irq(&runtime->lock);
+				spin_unlock_irq(&substream->lock);
 				return result > 0 ? result : -EIO;
 			}
 		}
-		spin_unlock_irq(&runtime->lock);
+		spin_unlock_irq(&substream->lock);
 		count1 = snd_rawmidi_kernel_write1(substream, buf, NULL, count);
 		if (count1 < 0)
 			return result > 0 ? result : count1;
@@ -1561,7 +1557,7 @@ static ssize_t snd_rawmidi_write(struct file *file, const char __user *buf,
 		count -= count1;
 	}
 	if (file->f_flags & O_DSYNC) {
-		spin_lock_irq(&runtime->lock);
+		spin_lock_irq(&substream->lock);
 		while (runtime->avail != runtime->buffer_size) {
 			wait_queue_entry_t wait;
 			unsigned int last_avail = runtime->avail;
@@ -1569,16 +1565,16 @@ static ssize_t snd_rawmidi_write(struct file *file, const char __user *buf,
 			init_waitqueue_entry(&wait, current);
 			add_wait_queue(&runtime->sleep, &wait);
 			set_current_state(TASK_INTERRUPTIBLE);
-			spin_unlock_irq(&runtime->lock);
+			spin_unlock_irq(&substream->lock);
 			timeout = schedule_timeout(30 * HZ);
 			remove_wait_queue(&runtime->sleep, &wait);
 			if (signal_pending(current))
 				return result > 0 ? result : -ERESTARTSYS;
 			if (runtime->avail == last_avail && !timeout)
 				return result > 0 ? result : -EIO;
-			spin_lock_irq(&runtime->lock);
+			spin_lock_irq(&substream->lock);
 		}
-		spin_unlock_irq(&runtime->lock);
+		spin_unlock_irq(&substream->lock);
 	}
 	return result;
 }
@@ -1649,10 +1645,10 @@ static void snd_rawmidi_proc_info_read(struct snd_info_entry *entry,
 				    "  Owner PID    : %d\n",
 				    pid_vnr(substream->pid));
 				runtime = substream->runtime;
-				spin_lock_irq(&runtime->lock);
+				spin_lock_irq(&substream->lock);
 				buffer_size = runtime->buffer_size;
 				avail = runtime->avail;
-				spin_unlock_irq(&runtime->lock);
+				spin_unlock_irq(&substream->lock);
 				snd_iprintf(buffer,
 				    "  Mode         : %s\n"
 				    "  Buffer size  : %lu\n"
@@ -1676,11 +1672,11 @@ static void snd_rawmidi_proc_info_read(struct snd_info_entry *entry,
 					    "  Owner PID    : %d\n",
 					    pid_vnr(substream->pid));
 				runtime = substream->runtime;
-				spin_lock_irq(&runtime->lock);
+				spin_lock_irq(&substream->lock);
 				buffer_size = runtime->buffer_size;
 				avail = runtime->avail;
 				xruns = runtime->xruns;
-				spin_unlock_irq(&runtime->lock);
+				spin_unlock_irq(&substream->lock);
 				snd_iprintf(buffer,
 					    "  Buffer size  : %lu\n"
 					    "  Avail        : %lu\n"
@@ -1732,6 +1728,7 @@ static int snd_rawmidi_alloc_substreams(struct snd_rawmidi *rmidi,
 		substream->number = idx;
 		substream->rmidi = rmidi;
 		substream->pstr = stream;
+		spin_lock_init(&substream->lock);
 		list_add_tail(&substream->list, &stream->substreams);
 		stream->substream_count++;
 	}
-- 
cgit 


From 5a531791edb249e41e00cf1cc580dbb09e2157ae Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Fri, 13 May 2022 13:53:06 +0100
Subject: media: v4l2-tpg: add HDMI Video Guard Band test pattern

This inserts 4 pixels of the RGB color 0xab55ab at the left hand side of
the image. This is only done for 3 or 4 byte RGB pixel formats. The HDMI
TMDS encoding of this pixel value equals the Video Guard Band value as
defined by HDMI (see section 5.2.2.1 in the HDMI 1.3 Specification) that
preceeds the first actual pixel of a video line. If an HDMI receiver
doesn't handle this correctly, then it might keep skipping these Video
Guard Band patterns and end up with a shorter video line. So this is a
nice pattern to test with.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/media/common/v4l2-tpg/v4l2-tpg-core.c | 38 +++++++++++++++++++++++++++
 include/media/tpg/v4l2-tpg.h                  | 16 +++++++++++
 2 files changed, 54 insertions(+)

(limited to 'include')

diff --git a/drivers/media/common/v4l2-tpg/v4l2-tpg-core.c b/drivers/media/common/v4l2-tpg/v4l2-tpg-core.c
index 7607b516a7c4..cb985de8ff72 100644
--- a/drivers/media/common/v4l2-tpg/v4l2-tpg-core.c
+++ b/drivers/media/common/v4l2-tpg/v4l2-tpg-core.c
@@ -2402,6 +2402,44 @@ static void tpg_fill_plane_extras(const struct tpg_data *tpg,
 			((params->sav_eav_f ^ vact) << 1) |
 			(hact ^ vact ^ params->sav_eav_f);
 	}
+	if (tpg->insert_hdmi_video_guard_band) {
+		unsigned int i;
+
+		switch (tpg->fourcc) {
+		case V4L2_PIX_FMT_BGR24:
+		case V4L2_PIX_FMT_RGB24:
+			for (i = 0; i < 3 * 4; i += 3) {
+				vbuf[i] = 0xab;
+				vbuf[i + 1] = 0x55;
+				vbuf[i + 2] = 0xab;
+			}
+			break;
+		case V4L2_PIX_FMT_RGB32:
+		case V4L2_PIX_FMT_ARGB32:
+		case V4L2_PIX_FMT_XRGB32:
+		case V4L2_PIX_FMT_BGRX32:
+		case V4L2_PIX_FMT_BGRA32:
+			for (i = 0; i < 4 * 4; i += 4) {
+				vbuf[i] = 0x00;
+				vbuf[i + 1] = 0xab;
+				vbuf[i + 2] = 0x55;
+				vbuf[i + 3] = 0xab;
+			}
+			break;
+		case V4L2_PIX_FMT_BGR32:
+		case V4L2_PIX_FMT_XBGR32:
+		case V4L2_PIX_FMT_ABGR32:
+		case V4L2_PIX_FMT_RGBX32:
+		case V4L2_PIX_FMT_RGBA32:
+			for (i = 0; i < 4 * 4; i += 4) {
+				vbuf[i] = 0xab;
+				vbuf[i + 1] = 0x55;
+				vbuf[i + 2] = 0xab;
+				vbuf[i + 3] = 0x00;
+			}
+			break;
+		}
+	}
 }
 
 static void tpg_fill_plane_pattern(const struct tpg_data *tpg,
diff --git a/include/media/tpg/v4l2-tpg.h b/include/media/tpg/v4l2-tpg.h
index 181dcbe777f3..a55088921d1d 100644
--- a/include/media/tpg/v4l2-tpg.h
+++ b/include/media/tpg/v4l2-tpg.h
@@ -210,6 +210,7 @@ struct tpg_data {
 	bool				show_square;
 	bool				insert_sav;
 	bool				insert_eav;
+	bool				insert_hdmi_video_guard_band;
 
 	/* Test pattern movement */
 	enum tpg_move_mode		mv_hor_mode;
@@ -591,6 +592,21 @@ static inline void tpg_s_insert_eav(struct tpg_data *tpg, bool insert_eav)
 	tpg->insert_eav = insert_eav;
 }
 
+/*
+ * This inserts 4 pixels of the RGB color 0xab55ab at the left hand side of the
+ * image. This is only done for 3 or 4 byte RGB pixel formats. This pixel value
+ * equals the Video Guard Band value as defined by HDMI (see section 5.2.2.1
+ * in the HDMI 1.3 Specification) that preceeds the first actual pixel. If the
+ * HDMI receiver doesn't handle this correctly, then it might keep skipping
+ * these Video Guard Band patterns and end up with a shorter video line. So this
+ * is a nice pattern to test with.
+ */
+static inline void tpg_s_insert_hdmi_video_guard_band(struct tpg_data *tpg,
+						      bool insert_hdmi_video_guard_band)
+{
+	tpg->insert_hdmi_video_guard_band = insert_hdmi_video_guard_band;
+}
+
 void tpg_update_mv_step(struct tpg_data *tpg);
 
 static inline void tpg_s_mv_hor_mode(struct tpg_data *tpg,
-- 
cgit 


From 5374d8fb75f313294c7d97e85c22bead34d63f2b Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Date: Thu, 19 May 2022 08:11:46 +0100
Subject: media: Add P010 video format

P010 is a YUV format with 10-bits per component with interleaved UV.

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Acked-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 .../userspace-api/media/v4l/pixfmt-yuv-planar.rst  | 54 ++++++++++++++++++++++
 drivers/media/v4l2-core/v4l2-common.c              |  1 +
 drivers/media/v4l2-core/v4l2-ioctl.c               |  1 +
 include/uapi/linux/videodev2.h                     |  1 +
 4 files changed, 57 insertions(+)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst b/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst
index 8dff5906639b..a900ff66911a 100644
--- a/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst
+++ b/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst
@@ -109,6 +109,13 @@ All components are stored with the same number of bits per component.
       - Cb, Cr
       - No
       - 16x16 tiles
+    * - V4L2_PIX_FMT_P010
+      - 'P010'
+      - 10
+      - 4:2:0
+      - Cb, Cr
+      - Yes
+      - Linear
     * - V4L2_PIX_FMT_NV16
       - 'NV16'
       - 8
@@ -171,6 +178,7 @@ horizontally.
 .. _V4L2-PIX-FMT-NV21:
 .. _V4L2-PIX-FMT-NV12M:
 .. _V4L2-PIX-FMT-NV21M:
+.. _V4L2-PIX-FMT-P010:
 
 NV12, NV21, NV12M and NV21M
 ---------------------------
@@ -519,6 +527,52 @@ number of lines as the luma plane.
       - Cb\ :sub:`33`
       - Cr\ :sub:`33`
 
+.. _V4L2_PIX_FMT_P010:
+
+P010
+----
+
+Like NV12 with 10 bits per component, expanded to 16 bits.
+Data in the 10 high bits, zeros in the 6 low bits, arranged in little endian order.
+
+.. flat-table:: Sample 4x4 P010 Image
+    :header-rows:  0
+    :stub-columns: 0
+
+    * - start + 0:
+      - Y'\ :sub:`00`
+      - Y'\ :sub:`01`
+      - Y'\ :sub:`02`
+      - Y'\ :sub:`03`
+    * - start + 8:
+      - Y'\ :sub:`10`
+      - Y'\ :sub:`11`
+      - Y'\ :sub:`12`
+      - Y'\ :sub:`13`
+    * - start + 16:
+      - Y'\ :sub:`20`
+      - Y'\ :sub:`21`
+      - Y'\ :sub:`22`
+      - Y'\ :sub:`23`
+    * - start + 24:
+      - Y'\ :sub:`30`
+      - Y'\ :sub:`31`
+      - Y'\ :sub:`32`
+      - Y'\ :sub:`33`
+    * - start + 32:
+      - Cb\ :sub:`00`
+      - Cr\ :sub:`00`
+      - Cb\ :sub:`01`
+      - Cr\ :sub:`01`
+    * - start + 40:
+      - Cb\ :sub:`10`
+      - Cr\ :sub:`10`
+      - Cb\ :sub:`11`
+      - Cr\ :sub:`11`
+
+.. raw:: latex
+
+    \endgroup
 
 Fully Planar YUV Formats
 ========================
diff --git a/drivers/media/v4l2-core/v4l2-common.c b/drivers/media/v4l2-core/v4l2-common.c
index df34b2a283bc..1e38ad8906a2 100644
--- a/drivers/media/v4l2-core/v4l2-common.c
+++ b/drivers/media/v4l2-core/v4l2-common.c
@@ -266,6 +266,7 @@ const struct v4l2_format_info *v4l2_format_info(u32 format)
 		{ .format = V4L2_PIX_FMT_NV61,    .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 2, .vdiv = 1 },
 		{ .format = V4L2_PIX_FMT_NV24,    .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 1, .vdiv = 1 },
 		{ .format = V4L2_PIX_FMT_NV42,    .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 1, .vdiv = 1 },
+		{ .format = V4L2_PIX_FMT_P010,    .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 2, 2, 0, 0 }, .hdiv = 2, .vdiv = 1 },
 
 		{ .format = V4L2_PIX_FMT_YUV410,  .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 3, .bpp = { 1, 1, 1, 0 }, .hdiv = 4, .vdiv = 4 },
 		{ .format = V4L2_PIX_FMT_YVU410,  .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 3, .bpp = { 1, 1, 1, 0 }, .hdiv = 4, .vdiv = 4 },
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 21470de62d72..e2526701294e 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1306,6 +1306,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_NV61:		descr = "Y/CrCb 4:2:2"; break;
 	case V4L2_PIX_FMT_NV24:		descr = "Y/CbCr 4:4:4"; break;
 	case V4L2_PIX_FMT_NV42:		descr = "Y/CrCb 4:4:4"; break;
+	case V4L2_PIX_FMT_P010:		descr = "10-bit Y/CrCb 4:2:0"; break;
 	case V4L2_PIX_FMT_NV12_4L4:	descr = "Y/CbCr 4:2:0 (4x4 Linear)"; break;
 	case V4L2_PIX_FMT_NV12_16L16:	descr = "Y/CbCr 4:2:0 (16x16 Linear)"; break;
 	case V4L2_PIX_FMT_NV12_32L32:   descr = "Y/CbCr 4:2:0 (32x32 Linear)"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 343b95107fce..5311ac4fde35 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -602,6 +602,7 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_NV61    v4l2_fourcc('N', 'V', '6', '1') /* 16  Y/CrCb 4:2:2  */
 #define V4L2_PIX_FMT_NV24    v4l2_fourcc('N', 'V', '2', '4') /* 24  Y/CbCr 4:4:4  */
 #define V4L2_PIX_FMT_NV42    v4l2_fourcc('N', 'V', '4', '2') /* 24  Y/CrCb 4:4:4  */
+#define V4L2_PIX_FMT_P010    v4l2_fourcc('P', '0', '1', '0') /* 24  Y/CbCr 4:2:0 10-bit per component */
 
 /* two non contiguous planes - one Y, one Cr + Cb interleaved  */
 #define V4L2_PIX_FMT_NV12M   v4l2_fourcc('N', 'M', '1', '2') /* 12  Y/CbCr 4:2:0  */
-- 
cgit 


From 8e1c69149f27189cff93a0cfe9402e576d89ce29 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 29 Apr 2022 01:04:10 +0000
Subject: KVM: Avoid pfn_to_page() and vice versa when releasing pages

Invert the order of KVM's page/pfn release helpers so that the "inner"
helper operates on a page instead of a pfn.  As pointed out by Linus[*],
converting between struct page and a pfn isn't necessarily cheap, and
that's not even counting the overhead of is_error_noslot_pfn() and
kvm_is_reserved_pfn().  Even if the checks were dirt cheap, there's no
reason to convert from a page to a pfn and back to a page, just to mark
the page dirty/accessed or to put a reference to the page.

Opportunistically drop a stale declaration of kvm_set_page_accessed()
from kvm_host.h (there was no implementation).

No functional change intended.

[*] https://lore.kernel.org/all/CAHk-=wifQimj2d6npq-wCi5onYPjzQg4vyO4tFcPJJZr268cRw@mail.gmail.com

Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220429010416.2788472-5-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h |  1 -
 virt/kvm/kvm_main.c      | 64 ++++++++++++++++++++++++++++++++----------------
 2 files changed, 43 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c20f2d55840c..af4b5c0bf04e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1139,7 +1139,6 @@ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn,
 				      bool *writable);
 void kvm_release_page_clean(struct page *page);
 void kvm_release_page_dirty(struct page *page);
-void kvm_set_page_accessed(struct page *page);
 
 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 351cbd121cf5..4732a99935f9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2820,18 +2820,40 @@ struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
 
+static bool kvm_is_ad_tracked_page(struct page *page)
+{
+	/*
+	 * Per page-flags.h, pages tagged PG_reserved "should in general not be
+	 * touched (e.g. set dirty) except by its owner".
+	 */
+	return !PageReserved(page);
+}
+
+static void kvm_set_page_dirty(struct page *page)
+{
+	if (kvm_is_ad_tracked_page(page))
+		SetPageDirty(page);
+}
+
+static void kvm_set_page_accessed(struct page *page)
+{
+	if (kvm_is_ad_tracked_page(page))
+		mark_page_accessed(page);
+}
+
 void kvm_release_page_clean(struct page *page)
 {
 	WARN_ON(is_error_page(page));
 
-	kvm_release_pfn_clean(page_to_pfn(page));
+	kvm_set_page_accessed(page);
+	put_page(page);
 }
 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
 
 void kvm_release_pfn_clean(kvm_pfn_t pfn)
 {
 	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
-		put_page(pfn_to_page(pfn));
+		kvm_release_page_clean(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
 
@@ -2839,40 +2861,40 @@ void kvm_release_page_dirty(struct page *page)
 {
 	WARN_ON(is_error_page(page));
 
-	kvm_release_pfn_dirty(page_to_pfn(page));
+	kvm_set_page_dirty(page);
+	kvm_release_page_clean(page);
 }
 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
 
 void kvm_release_pfn_dirty(kvm_pfn_t pfn)
 {
-	kvm_set_pfn_dirty(pfn);
-	kvm_release_pfn_clean(pfn);
+	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
+		kvm_release_page_dirty(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
 
-static bool kvm_is_ad_tracked_pfn(kvm_pfn_t pfn)
-{
-	if (!pfn_valid(pfn))
-		return false;
-
-	/*
-	 * Per page-flags.h, pages tagged PG_reserved "should in general not be
-	 * touched (e.g. set dirty) except by its owner".
-	 */
-	return !PageReserved(pfn_to_page(pfn));
-}
-
+/*
+ * Note, checking for an error/noslot pfn is the caller's responsibility when
+ * directly marking a page dirty/accessed.  Unlike the "release" helpers, the
+ * "set" helpers are not to be used when the pfn might point at garbage.
+ */
 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
 {
-	if (kvm_is_ad_tracked_pfn(pfn))
-		SetPageDirty(pfn_to_page(pfn));
+	if (WARN_ON(is_error_noslot_pfn(pfn)))
+		return;
+
+	if (pfn_valid(pfn))
+		kvm_set_page_dirty(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
 
 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
 {
-	if (kvm_is_ad_tracked_pfn(pfn))
-		mark_page_accessed(pfn_to_page(pfn));
+	if (WARN_ON(is_error_noslot_pfn(pfn)))
+		return;
+
+	if (pfn_valid(pfn))
+		kvm_set_page_accessed(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
 
-- 
cgit 


From b1624f99aa8fedafcabf1b92fa51ed88dde14acb Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 29 Apr 2022 01:04:13 +0000
Subject: KVM: Remove kvm_vcpu_gfn_to_page() and kvm_vcpu_gpa_to_page()

Drop helpers to convert a gfn/gpa to a 'struct page' in the context of a
vCPU.  KVM doesn't require that guests be backed by 'struct page' memory,
thus any use of helpers that assume 'struct page' is bound to be flawed,
as was the case for the recently removed last user in x86's nested VMX.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220429010416.2788472-8-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h |  7 -------
 virt/kvm/kvm_main.c      | 31 +++++++++++--------------------
 2 files changed, 11 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index af4b5c0bf04e..4334789409c0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1230,7 +1230,6 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn
 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn);
 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map);
-struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn);
 void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty);
 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable);
@@ -1718,12 +1717,6 @@ static inline hpa_t pfn_to_hpa(kvm_pfn_t pfn)
 	return (hpa_t)pfn << PAGE_SHIFT;
 }
 
-static inline struct page *kvm_vcpu_gpa_to_page(struct kvm_vcpu *vcpu,
-						gpa_t gpa)
-{
-	return kvm_vcpu_gfn_to_page(vcpu, gpa_to_gfn(gpa));
-}
-
 static inline bool kvm_is_error_gpa(struct kvm *kvm, gpa_t gpa)
 {
 	unsigned long hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 78d280e119b3..7cd0e3d67f9f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2720,8 +2720,18 @@ int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
 }
 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
 
-static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
+/*
+ * Do not use this helper unless you are absolutely certain the gfn _must_ be
+ * backed by 'struct page'.  A valid example is if the backing memslot is
+ * controlled by KVM.  Note, if the returned page is valid, it's refcount has
+ * been elevated by gfn_to_pfn().
+ */
+struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 {
+	kvm_pfn_t pfn;
+
+	pfn = gfn_to_pfn(kvm, gfn);
+
 	if (is_error_noslot_pfn(pfn))
 		return KVM_ERR_PTR_BAD_PAGE;
 
@@ -2730,15 +2740,6 @@ static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
 
 	return pfn_to_page(pfn);
 }
-
-struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
-{
-	kvm_pfn_t pfn;
-
-	pfn = gfn_to_pfn(kvm, gfn);
-
-	return kvm_pfn_to_page(pfn);
-}
 EXPORT_SYMBOL_GPL(gfn_to_page);
 
 void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
@@ -2808,16 +2809,6 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
 
-struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
-	kvm_pfn_t pfn;
-
-	pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
-
-	return kvm_pfn_to_page(pfn);
-}
-EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
-
 static bool kvm_is_ad_tracked_page(struct page *page)
 {
 	/*
-- 
cgit 


From 284dc49307738d2a897fd375431f741213cd0f27 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 29 Apr 2022 01:04:14 +0000
Subject: KVM: Take a 'struct page', not a pfn in kvm_is_zone_device_page()

Operate on a 'struct page' instead of a pfn when checking if a page is a
ZONE_DEVICE page, and rename the helper accordingly.  Generally speaking,
KVM doesn't actually care about ZONE_DEVICE memory, i.e. shouldn't do
anything special for ZONE_DEVICE memory.  Rather, KVM wants to treat
ZONE_DEVICE memory like regular memory, and the need to identify
ZONE_DEVICE memory only arises as an exception to PG_reserved pages. In
other words, KVM should only ever check for ZONE_DEVICE memory after KVM
has already verified that there is a struct page associated with the pfn.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220429010416.2788472-9-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c   | 5 +++--
 include/linux/kvm_host.h | 2 +-
 virt/kvm/kvm_main.c      | 8 ++++----
 3 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 2dcedf04ef85..b209aaf096df 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2788,15 +2788,16 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 				  const struct kvm_memory_slot *slot)
 {
+	struct page *page = pfn_to_page(pfn);
+	int level = PG_LEVEL_4K;
 	unsigned long hva;
 	unsigned long flags;
-	int level = PG_LEVEL_4K;
 	pgd_t pgd;
 	p4d_t p4d;
 	pud_t pud;
 	pmd_t pmd;
 
-	if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
+	if (!PageCompound(page) && !kvm_is_zone_device_page(page))
 		return PG_LEVEL_4K;
 
 	/*
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4334789409c0..6fb433ac2ba1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1571,7 +1571,7 @@ void kvm_arch_sync_events(struct kvm *kvm);
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
 
 bool kvm_is_reserved_pfn(kvm_pfn_t pfn);
-bool kvm_is_zone_device_pfn(kvm_pfn_t pfn);
+bool kvm_is_zone_device_page(struct page *page);
 
 struct kvm_irq_ack_notifier {
 	struct hlist_node link;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7cd0e3d67f9f..aa1bcd5f140d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -168,7 +168,7 @@ __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
 {
 }
 
-bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
+bool kvm_is_zone_device_page(struct page *page)
 {
 	/*
 	 * The metadata used by is_zone_device_page() to determine whether or
@@ -176,10 +176,10 @@ bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
 	 * the device has been pinned, e.g. by get_user_pages().  WARN if the
 	 * page_count() is zero to help detect bad usage of this helper.
 	 */
-	if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
+	if (WARN_ON_ONCE(!page_count(page)))
 		return false;
 
-	return is_zone_device_page(pfn_to_page(pfn));
+	return is_zone_device_page(page);
 }
 
 bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
@@ -192,7 +192,7 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
 	if (pfn_valid(pfn))
 		return PageReserved(pfn_to_page(pfn)) &&
 		       !is_zero_pfn(pfn) &&
-		       !kvm_is_zone_device_pfn(pfn);
+		       !kvm_is_zone_device_page(pfn_to_page(pfn));
 
 	return true;
 }
-- 
cgit 


From b14b2690c50e02145bb867dfcde8845eb17aa8a4 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 29 Apr 2022 01:04:15 +0000
Subject: KVM: Rename/refactor kvm_is_reserved_pfn() to
 kvm_pfn_to_refcounted_page()

Rename and refactor kvm_is_reserved_pfn() to kvm_pfn_to_refcounted_page()
to better reflect what KVM is actually checking, and to eliminate extra
pfn_to_page() lookups.  The kvm_release_pfn_*() an kvm_try_get_pfn()
helpers in particular benefit from "refouncted" nomenclature, as it's not
all that obvious why KVM needs to get/put refcounts for some PG_reserved
pages (ZERO_PAGE and ZONE_DEVICE).

Add a comment to call out that the list of exceptions to PG_reserved is
all but guaranteed to be incomplete.  The list has mostly been compiled
by people throwing noodles at KVM and finding out they stick a little too
well, e.g. the ZERO_PAGE's refcount overflowed and ZONE_DEVICE pages
didn't get freed.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220429010416.2788472-10-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c     | 15 ++++++-----
 arch/x86/kvm/mmu/tdp_mmu.c |  2 +-
 include/linux/kvm_host.h   |  2 +-
 virt/kvm/kvm_main.c        | 66 ++++++++++++++++++++++++++++++++++++----------
 4 files changed, 63 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index b209aaf096df..7ebb97c95da7 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -534,6 +534,7 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
 	kvm_pfn_t pfn;
 	u64 old_spte = *sptep;
 	int level = sptep_to_sp(sptep)->role.level;
+	struct page *page;
 
 	if (!is_shadow_present_pte(old_spte) ||
 	    !spte_has_volatile_bits(old_spte))
@@ -549,11 +550,13 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
 	pfn = spte_to_pfn(old_spte);
 
 	/*
-	 * KVM does not hold the refcount of the page used by
-	 * kvm mmu, before reclaiming the page, we should
-	 * unmap it from mmu first.
+	 * KVM doesn't hold a reference to any pages mapped into the guest, and
+	 * instead uses the mmu_notifier to ensure that KVM unmaps any pages
+	 * before they are reclaimed.  Sanity check that, if the pfn is backed
+	 * by a refcounted page, the refcount is elevated.
 	 */
-	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
+	page = kvm_pfn_to_refcounted_page(pfn);
+	WARN_ON(page && !page_count(page));
 
 	if (is_accessed_spte(old_spte))
 		kvm_set_pfn_accessed(pfn);
@@ -2881,7 +2884,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	if (unlikely(fault->max_level == PG_LEVEL_4K))
 		return;
 
-	if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn))
+	if (is_error_noslot_pfn(fault->pfn) || !kvm_pfn_to_refcounted_page(fault->pfn))
 		return;
 
 	if (kvm_slot_dirty_track_enabled(slot))
@@ -5993,7 +5996,7 @@ restart:
 		 * the guest, and the guest page table is using 4K page size
 		 * mapping if the indirect sp has level = 1.
 		 */
-		if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
+		if (sp->role.direct && kvm_pfn_to_refcounted_page(pfn) &&
 		    sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
 							       pfn, PG_LEVEL_NUM)) {
 			pte_list_remove(kvm, rmap_head, sptep);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 8e8d412fb65c..1e777b739ac4 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1751,7 +1751,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
 		 */
 		pfn = spte_to_pfn(iter.old_spte);
 
-		if (kvm_is_reserved_pfn(pfn))
+		if (!kvm_pfn_to_refcounted_page(pfn))
 			continue;
 
 		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6fb433ac2ba1..a2bbdf3ab086 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1570,7 +1570,7 @@ void kvm_arch_sync_events(struct kvm *kvm);
 
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
 
-bool kvm_is_reserved_pfn(kvm_pfn_t pfn);
+struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn);
 bool kvm_is_zone_device_page(struct page *page);
 
 struct kvm_irq_ack_notifier {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index aa1bcd5f140d..04196096f9e4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -182,19 +182,36 @@ bool kvm_is_zone_device_page(struct page *page)
 	return is_zone_device_page(page);
 }
 
-bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
+/*
+ * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
+ * page, NULL otherwise.  Note, the list of refcounted PG_reserved page types
+ * is likely incomplete, it has been compiled purely through people wanting to
+ * back guest with a certain type of memory and encountering issues.
+ */
+struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn)
 {
+	struct page *page;
+
+	if (!pfn_valid(pfn))
+		return NULL;
+
+	page = pfn_to_page(pfn);
+	if (!PageReserved(page))
+		return page;
+
+	/* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */
+	if (is_zero_pfn(pfn))
+		return page;
+
 	/*
 	 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
 	 * perspective they are "normal" pages, albeit with slightly different
 	 * usage rules.
 	 */
-	if (pfn_valid(pfn))
-		return PageReserved(pfn_to_page(pfn)) &&
-		       !is_zero_pfn(pfn) &&
-		       !kvm_is_zone_device_page(pfn_to_page(pfn));
+	if (kvm_is_zone_device_page(page))
+		return page;
 
-	return true;
+	return NULL;
 }
 
 /*
@@ -2501,9 +2518,12 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
 
 static int kvm_try_get_pfn(kvm_pfn_t pfn)
 {
-	if (kvm_is_reserved_pfn(pfn))
+	struct page *page = kvm_pfn_to_refcounted_page(pfn);
+
+	if (!page)
 		return 1;
-	return get_page_unless_zero(pfn_to_page(pfn));
+
+	return get_page_unless_zero(page);
 }
 
 static int hva_to_pfn_remapped(struct vm_area_struct *vma,
@@ -2728,6 +2748,7 @@ EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
  */
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 {
+	struct page *page;
 	kvm_pfn_t pfn;
 
 	pfn = gfn_to_pfn(kvm, gfn);
@@ -2735,10 +2756,11 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 	if (is_error_noslot_pfn(pfn))
 		return KVM_ERR_PTR_BAD_PAGE;
 
-	if (kvm_is_reserved_pfn(pfn))
+	page = kvm_pfn_to_refcounted_page(pfn);
+	if (!page)
 		return KVM_ERR_PTR_BAD_PAGE;
 
-	return pfn_to_page(pfn);
+	return page;
 }
 EXPORT_SYMBOL_GPL(gfn_to_page);
 
@@ -2841,8 +2863,16 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean);
 
 void kvm_release_pfn_clean(kvm_pfn_t pfn)
 {
-	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
-		kvm_release_page_clean(pfn_to_page(pfn));
+	struct page *page;
+
+	if (is_error_noslot_pfn(pfn))
+		return;
+
+	page = kvm_pfn_to_refcounted_page(pfn);
+	if (!page)
+		return;
+
+	kvm_release_page_clean(page);
 }
 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
 
@@ -2857,8 +2887,16 @@ EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
 
 void kvm_release_pfn_dirty(kvm_pfn_t pfn)
 {
-	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
-		kvm_release_page_dirty(pfn_to_page(pfn));
+	struct page *page;
+
+	if (is_error_noslot_pfn(pfn))
+		return;
+
+	page = kvm_pfn_to_refcounted_page(pfn);
+	if (!page)
+		return;
+
+	kvm_release_page_dirty(page);
 }
 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
 
-- 
cgit 


From 7b0a0e3c3a88260b6fcb017e49f198463aa62ed1 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 14 Apr 2022 16:50:57 +0200
Subject: wifi: cfg80211: do some rework towards MLO link APIs

In order to support multi-link operation with multiple links,
start adding some APIs. The notable addition here is to have
the link ID in a new nl80211 attribute, that will be used to
differentiate the links in many nl80211 operations.

So far, this patch adds the netlink NL80211_ATTR_MLO_LINK_ID
attribute (as well as the NL80211_ATTR_MLO_LINKS attribute)
and plugs it through the system in some places, checking the
validity etc. along with other infrastructure needed for it.

For now, I've decided to include only the over-the-air link
ID in the API. I know we discussed that we eventually need to
have to have other ways of identifying a link, but for local
AP mode and auth/assoc commands as well as set_key etc. we'll
use the OTA ID.

Also included in this patch is some refactoring of the data
structures in struct wireless_dev, splitting for the first
time the data into type dependent pieces, to make reasoning
about these things easier.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath6kl/cfg80211.c         |   6 +-
 drivers/net/wireless/ath/wil6210/cfg80211.c        |   9 +-
 .../broadcom/brcm80211/brcmfmac/cfg80211.c         |   4 +-
 drivers/net/wireless/marvell/libertas/mesh.c       |  10 +-
 drivers/net/wireless/marvell/mwifiex/11h.c         |   2 +-
 drivers/net/wireless/marvell/mwifiex/cfg80211.c    |  18 +-
 drivers/net/wireless/microchip/wilc1000/cfg80211.c |   3 +-
 drivers/net/wireless/quantenna/qtnfmac/cfg80211.c  |  14 +-
 drivers/net/wireless/quantenna/qtnfmac/commands.c  |   2 +-
 drivers/net/wireless/quantenna/qtnfmac/event.c     |  15 +-
 drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c  |   4 +-
 include/linux/ieee80211.h                          |   3 +
 include/net/cfg80211.h                             |  99 +++-
 include/uapi/linux/nl80211.h                       |  28 +
 net/mac80211/cfg.c                                 |   8 +-
 net/mac80211/mlme.c                                |   2 +-
 net/wireless/ap.c                                  |  46 +-
 net/wireless/chan.c                                | 196 +++++--
 net/wireless/core.c                                |  28 +-
 net/wireless/core.h                                |  13 +-
 net/wireless/ibss.c                                |  57 +-
 net/wireless/mesh.c                                |  31 +-
 net/wireless/mlme.c                                |  74 +--
 net/wireless/nl80211.c                             | 623 +++++++++++++++------
 net/wireless/ocb.c                                 |   5 +-
 net/wireless/rdev-ops.h                            |  32 +-
 net/wireless/reg.c                                 | 139 +++--
 net/wireless/scan.c                                |   8 +-
 net/wireless/sme.c                                 | 102 ++--
 net/wireless/trace.h                               |  86 ++-
 net/wireless/util.c                                |  44 +-
 net/wireless/wext-compat.c                         |  48 +-
 net/wireless/wext-sme.c                            |  29 +-
 33 files changed, 1255 insertions(+), 533 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c
index bd1183830e91..33ed54738d47 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
@@ -1119,7 +1119,7 @@ void ath6kl_cfg80211_ch_switch_notify(struct ath6kl_vif *vif, int freq,
 					NL80211_CHAN_HT20 : NL80211_CHAN_NO_HT);
 
 	mutex_lock(&vif->wdev.mtx);
-	cfg80211_ch_switch_notify(vif->ndev, &chandef);
+	cfg80211_ch_switch_notify(vif->ndev, &chandef, 0);
 	mutex_unlock(&vif->wdev.mtx);
 }
 
@@ -2967,7 +2967,8 @@ static int ath6kl_change_beacon(struct wiphy *wiphy, struct net_device *dev,
 	return ath6kl_set_ies(vif, beacon);
 }
 
-static int ath6kl_stop_ap(struct wiphy *wiphy, struct net_device *dev)
+static int ath6kl_stop_ap(struct wiphy *wiphy, struct net_device *dev,
+			  unsigned int link_id)
 {
 	struct ath6kl *ar = ath6kl_priv(dev);
 	struct ath6kl_vif *vif = netdev_priv(dev);
@@ -3368,6 +3369,7 @@ static int ath6kl_cfg80211_sscan_stop(struct wiphy *wiphy,
 
 static int ath6kl_cfg80211_set_bitrate(struct wiphy *wiphy,
 				       struct net_device *dev,
+				       unsigned int link_id,
 				       const u8 *addr,
 				       const struct cfg80211_bitrate_mask *mask)
 {
diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c
index 8f2638f5b87b..f93bdffa4d1d 100644
--- a/drivers/net/wireless/ath/wil6210/cfg80211.c
+++ b/drivers/net/wireless/ath/wil6210/cfg80211.c
@@ -2098,8 +2098,8 @@ static int wil_cfg80211_change_beacon(struct wiphy *wiphy,
 			     bcon->tail_len))
 		privacy = 1;
 
-	memcpy(vif->ssid, wdev->ssid, wdev->ssid_len);
-	vif->ssid_len = wdev->ssid_len;
+	memcpy(vif->ssid, wdev->u.ap.ssid, wdev->u.ap.ssid_len);
+	vif->ssid_len = wdev->u.ap.ssid_len;
 
 	/* in case privacy has changed, need to restart the AP */
 	if (vif->privacy != privacy) {
@@ -2108,7 +2108,7 @@ static int wil_cfg80211_change_beacon(struct wiphy *wiphy,
 
 		rc = _wil_cfg80211_start_ap(wiphy, ndev, vif->ssid,
 					    vif->ssid_len, privacy,
-					    wdev->beacon_interval,
+					    wdev->links[0].ap.beacon_interval,
 					    vif->channel,
 					    vif->wmi_edmg_channel, bcon,
 					    vif->hidden_ssid,
@@ -2186,7 +2186,8 @@ static int wil_cfg80211_start_ap(struct wiphy *wiphy,
 }
 
 static int wil_cfg80211_stop_ap(struct wiphy *wiphy,
-				struct net_device *ndev)
+				struct net_device *ndev,
+				unsigned int link_id)
 {
 	struct wil6210_priv *wil = wiphy_to_wil(wiphy);
 	struct wil6210_vif *vif = ndev_to_vif(ndev);
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
index 605206abe424..11e1f07f83e0 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
@@ -4965,7 +4965,8 @@ exit:
 	return err;
 }
 
-static int brcmf_cfg80211_stop_ap(struct wiphy *wiphy, struct net_device *ndev)
+static int brcmf_cfg80211_stop_ap(struct wiphy *wiphy, struct net_device *ndev,
+				  unsigned int link_id)
 {
 	struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy);
 	struct brcmf_if *ifp = netdev_priv(ndev);
@@ -5302,6 +5303,7 @@ exit:
 
 static int brcmf_cfg80211_get_channel(struct wiphy *wiphy,
 				      struct wireless_dev *wdev,
+				      unsigned int link_id,
 				      struct cfg80211_chan_def *chandef)
 {
 	struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy);
diff --git a/drivers/net/wireless/marvell/libertas/mesh.c b/drivers/net/wireless/marvell/libertas/mesh.c
index a58c1e141f2c..90ffe8d1e0e8 100644
--- a/drivers/net/wireless/marvell/libertas/mesh.c
+++ b/drivers/net/wireless/marvell/libertas/mesh.c
@@ -109,9 +109,9 @@ static int lbs_mesh_config(struct lbs_private *priv, uint16_t action,
 
 		if (priv->mesh_dev) {
 			mesh_wdev = priv->mesh_dev->ieee80211_ptr;
-			ie->val.mesh_id_len = mesh_wdev->mesh_id_up_len;
-			memcpy(ie->val.mesh_id, mesh_wdev->ssid,
-						mesh_wdev->mesh_id_up_len);
+			ie->val.mesh_id_len = mesh_wdev->u.mesh.id_up_len;
+			memcpy(ie->val.mesh_id, mesh_wdev->u.mesh.id,
+						mesh_wdev->u.mesh.id_up_len);
 		}
 
 		ie->len = sizeof(struct mrvl_meshie_val) -
@@ -986,8 +986,8 @@ static int lbs_add_mesh(struct lbs_private *priv)
 	mesh_wdev->wiphy = priv->wdev->wiphy;
 
 	if (priv->mesh_tlv) {
-		sprintf(mesh_wdev->ssid, "mesh");
-		mesh_wdev->mesh_id_up_len = 4;
+		sprintf(mesh_wdev->u.mesh.id, "mesh");
+		mesh_wdev->u.mesh.id_up_len = 4;
 	}
 
 	mesh_wdev->netdev = mesh_dev;
diff --git a/drivers/net/wireless/marvell/mwifiex/11h.c b/drivers/net/wireless/marvell/mwifiex/11h.c
index 3fa25cd64cda..4ca8d0135708 100644
--- a/drivers/net/wireless/marvell/mwifiex/11h.c
+++ b/drivers/net/wireless/marvell/mwifiex/11h.c
@@ -304,6 +304,6 @@ void mwifiex_dfs_chan_sw_work_queue(struct work_struct *work)
 	mwifiex_dbg(priv->adapter, MSG,
 		    "indicating channel switch completion to kernel\n");
 	mutex_lock(&priv->wdev.mtx);
-	cfg80211_ch_switch_notify(priv->netdev, &priv->dfs_chandef);
+	cfg80211_ch_switch_notify(priv->netdev, &priv->dfs_chandef, 0);
 	mutex_unlock(&priv->wdev.mtx);
 }
diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
index 6f23ec34e2e2..d68c40e0e122 100644
--- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c
+++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
@@ -1753,10 +1753,12 @@ mwifiex_mgmt_stypes[NUM_NL80211_IFTYPES] = {
  * Function configures data rates to firmware using bitrate mask
  * provided by cfg80211.
  */
-static int mwifiex_cfg80211_set_bitrate_mask(struct wiphy *wiphy,
-				struct net_device *dev,
-				const u8 *peer,
-				const struct cfg80211_bitrate_mask *mask)
+static int
+mwifiex_cfg80211_set_bitrate_mask(struct wiphy *wiphy,
+				  struct net_device *dev,
+				  unsigned int link_id,
+				  const u8 *peer,
+				  const struct cfg80211_bitrate_mask *mask)
 {
 	struct mwifiex_private *priv = mwifiex_netdev_get_priv(dev);
 	u16 bitmap_rates[MAX_BITMAP_RATES_SIZE];
@@ -1998,7 +2000,8 @@ mwifiex_cfg80211_get_antenna(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant)
 /* cfg80211 operation handler for stop ap.
  * Function stops BSS running at uAP interface.
  */
-static int mwifiex_cfg80211_stop_ap(struct wiphy *wiphy, struct net_device *dev)
+static int mwifiex_cfg80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
+				    unsigned int link_id)
 {
 	struct mwifiex_private *priv = mwifiex_netdev_get_priv(dev);
 
@@ -2421,7 +2424,7 @@ mwifiex_cfg80211_connect(struct wiphy *wiphy, struct net_device *dev,
 		return -EINVAL;
 	}
 
-	if (priv->wdev.current_bss) {
+	if (priv->wdev.connected) {
 		mwifiex_dbg(adapter, ERROR,
 			    "%s: already connected\n", dev->name);
 		return -EALREADY;
@@ -2649,7 +2652,7 @@ mwifiex_cfg80211_scan(struct wiphy *wiphy,
 		return -EBUSY;
 	}
 
-	if (!priv->wdev.current_bss && priv->scan_block)
+	if (!priv->wdev.connected && priv->scan_block)
 		priv->scan_block = false;
 
 	if (!mwifiex_stop_bg_scan(priv))
@@ -4025,6 +4028,7 @@ mwifiex_cfg80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 
 static int mwifiex_cfg80211_get_channel(struct wiphy *wiphy,
 					struct wireless_dev *wdev,
+					unsigned int link_id,
 					struct cfg80211_chan_def *chandef)
 {
 	struct mwifiex_private *priv = mwifiex_netdev_get_priv(wdev->netdev);
diff --git a/drivers/net/wireless/microchip/wilc1000/cfg80211.c b/drivers/net/wireless/microchip/wilc1000/cfg80211.c
index 1ac4684fab25..5c2c7f1dbffd 100644
--- a/drivers/net/wireless/microchip/wilc1000/cfg80211.c
+++ b/drivers/net/wireless/microchip/wilc1000/cfg80211.c
@@ -1426,7 +1426,8 @@ static int change_beacon(struct wiphy *wiphy, struct net_device *dev,
 	return wilc_add_beacon(vif, 0, 0, beacon);
 }
 
-static int stop_ap(struct wiphy *wiphy, struct net_device *dev)
+static int stop_ap(struct wiphy *wiphy, struct net_device *dev,
+		   unsigned int link_id)
 {
 	int ret;
 	struct wilc_vif *vif = netdev_priv(dev);
diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
index 84b15a655eab..1593e810b3ca 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
@@ -352,7 +352,8 @@ static int qtnf_start_ap(struct wiphy *wiphy, struct net_device *dev,
 	return ret;
 }
 
-static int qtnf_stop_ap(struct wiphy *wiphy, struct net_device *dev)
+static int qtnf_stop_ap(struct wiphy *wiphy, struct net_device *dev,
+			unsigned int link_id)
 {
 	struct qtnf_vif *vif = qtnf_netdev_get_priv(dev);
 	int ret;
@@ -500,7 +501,7 @@ qtnf_dump_station(struct wiphy *wiphy, struct net_device *dev,
 
 	switch (vif->wdev.iftype) {
 	case NL80211_IFTYPE_STATION:
-		if (idx != 0 || !vif->wdev.current_bss)
+		if (idx != 0 || !vif->wdev.connected)
 			return -ENOENT;
 
 		ether_addr_copy(mac, vif->bssid);
@@ -729,7 +730,7 @@ qtnf_disconnect(struct wiphy *wiphy, struct net_device *dev,
 		pr_err("VIF%u.%u: failed to disconnect\n",
 		       mac->macid, vif->vifid);
 
-	if (vif->wdev.current_bss) {
+	if (vif->wdev.connected) {
 		netif_carrier_off(vif->netdev);
 		cfg80211_disconnected(vif->netdev, reason_code,
 				      NULL, 0, true, GFP_KERNEL);
@@ -745,10 +746,11 @@ qtnf_dump_survey(struct wiphy *wiphy, struct net_device *dev,
 	struct qtnf_wmac *mac = wiphy_priv(wiphy);
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct ieee80211_supported_band *sband;
-	const struct cfg80211_chan_def *chandef = &wdev->chandef;
+	const struct cfg80211_chan_def *chandef = wdev_chandef(wdev, 0);
 	struct ieee80211_channel *chan;
 	int ret;
 
+
 	sband = wiphy->bands[NL80211_BAND_2GHZ];
 	if (sband && idx >= sband->n_channels) {
 		idx -= sband->n_channels;
@@ -765,7 +767,7 @@ qtnf_dump_survey(struct wiphy *wiphy, struct net_device *dev,
 	survey->channel = chan;
 	survey->filled = 0x0;
 
-	if (chan == chandef->chan)
+	if (chandef && chan == chandef->chan)
 		survey->filled = SURVEY_INFO_IN_USE;
 
 	ret = qtnf_cmd_get_chan_stats(mac, chan->center_freq, survey);
@@ -778,7 +780,7 @@ qtnf_dump_survey(struct wiphy *wiphy, struct net_device *dev,
 
 static int
 qtnf_get_channel(struct wiphy *wiphy, struct wireless_dev *wdev,
-		 struct cfg80211_chan_def *chandef)
+		 unsigned int link_id, struct cfg80211_chan_def *chandef)
 {
 	struct net_device *ndev = wdev->netdev;
 	struct qtnf_vif *vif;
diff --git a/drivers/net/wireless/quantenna/qtnfmac/commands.c b/drivers/net/wireless/quantenna/qtnfmac/commands.c
index c68563c83098..3d734a7a5ba8 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/commands.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/commands.c
@@ -2005,7 +2005,7 @@ int qtnf_cmd_send_scan(struct qtnf_wmac *mac)
 		dwell_active = scan_req->duration;
 		dwell_passive = scan_req->duration;
 	} else if (wdev->iftype == NL80211_IFTYPE_STATION &&
-		   wdev->current_bss) {
+		   wdev->connected) {
 		/* let device select dwell based on traffic conditions */
 		dwell_active = QTNF_SCAN_TIME_AUTO;
 		dwell_passive = QTNF_SCAN_TIME_AUTO;
diff --git a/drivers/net/wireless/quantenna/qtnfmac/event.c b/drivers/net/wireless/quantenna/qtnfmac/event.c
index 8dc80574d08d..4fafe370101a 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/event.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/event.c
@@ -189,7 +189,7 @@ qtnf_event_handle_bss_join(struct qtnf_vif *vif,
 			vif->mac->macid, vif->vifid,
 			join_info->bssid, chandef.chan->hw_value);
 
-		if (!vif->wdev.ssid_len) {
+		if (!vif->wdev.u.client.ssid_len) {
 			pr_warn("VIF%u.%u: SSID unknown for BSS:%pM\n",
 				vif->mac->macid, vif->vifid,
 				join_info->bssid);
@@ -197,7 +197,7 @@ qtnf_event_handle_bss_join(struct qtnf_vif *vif,
 			goto done;
 		}
 
-		ie = kzalloc(2 + vif->wdev.ssid_len, GFP_KERNEL);
+		ie = kzalloc(2 + vif->wdev.u.client.ssid_len, GFP_KERNEL);
 		if (!ie) {
 			pr_warn("VIF%u.%u: IE alloc failed for BSS:%pM\n",
 				vif->mac->macid, vif->vifid,
@@ -207,14 +207,15 @@ qtnf_event_handle_bss_join(struct qtnf_vif *vif,
 		}
 
 		ie[0] = WLAN_EID_SSID;
-		ie[1] = vif->wdev.ssid_len;
-		memcpy(ie + 2, vif->wdev.ssid, vif->wdev.ssid_len);
+		ie[1] = vif->wdev.u.client.ssid_len;
+		memcpy(ie + 2, vif->wdev.u.client.ssid,
+		       vif->wdev.u.client.ssid_len);
 
 		bss = cfg80211_inform_bss(wiphy, chandef.chan,
 					  CFG80211_BSS_FTYPE_UNKNOWN,
 					  join_info->bssid, 0,
 					  WLAN_CAPABILITY_ESS, 100,
-					  ie, 2 + vif->wdev.ssid_len,
+					  ie, 2 + vif->wdev.u.client.ssid_len,
 					  0, GFP_KERNEL);
 		if (!bss) {
 			pr_warn("VIF%u.%u: can't connect to unknown BSS: %pM\n",
@@ -470,14 +471,14 @@ qtnf_event_handle_freq_change(struct qtnf_wmac *mac,
 			continue;
 
 		if (vif->wdev.iftype == NL80211_IFTYPE_STATION &&
-		    !vif->wdev.current_bss)
+		    !vif->wdev.connected)
 			continue;
 
 		if (!vif->netdev)
 			continue;
 
 		mutex_lock(&vif->wdev.mtx);
-		cfg80211_ch_switch_notify(vif->netdev, &chandef);
+		cfg80211_ch_switch_notify(vif->netdev, &chandef, 0);
 		mutex_unlock(&vif->wdev.mtx);
 	}
 
diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c
index 43b5604c0bca..349aa3c4b668 100644
--- a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c
+++ b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c
@@ -2086,6 +2086,7 @@ static u8 rtw_get_chan_type(struct adapter *adapter)
 }
 
 static int cfg80211_rtw_get_channel(struct wiphy *wiphy, struct wireless_dev *wdev,
+				    unsigned int link_id,
 				    struct cfg80211_chan_def *chandef)
 {
 	struct adapter *adapter = wiphy_to_adapter(wiphy);
@@ -2446,7 +2447,8 @@ static int cfg80211_rtw_change_beacon(struct wiphy *wiphy, struct net_device *nd
 	return rtw_add_beacon(adapter, info->head, info->head_len, info->tail, info->tail_len);
 }
 
-static int cfg80211_rtw_stop_ap(struct wiphy *wiphy, struct net_device *ndev)
+static int cfg80211_rtw_stop_ap(struct wiphy *wiphy, struct net_device *ndev,
+				unsigned int link_id)
 {
 	return 0;
 }
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 5c65ae6b8154..e15771965916 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -4376,4 +4376,7 @@ enum ieee80211_range_params_max_total_ltf {
 	IEEE80211_RANGE_PARAMS_MAX_TOTAL_LTF_UNSPECIFIED,
 };
 
+/* multi-link device */
+#define IEEE80211_MLD_MAX_NUM_LINKS	15
+
 #endif /* LINUX_IEEE80211_H */
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 6d02e12e4702..772e099fc932 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1158,6 +1158,7 @@ struct cfg80211_mbssid_elems {
 
 /**
  * struct cfg80211_beacon_data - beacon data
+ * @link_id: the link ID for the AP MLD link sending this beacon
  * @head: head portion of beacon (before TIM IE)
  *	or %NULL if not changed
  * @tail: tail portion of beacon (after TIM IE)
@@ -1188,6 +1189,8 @@ struct cfg80211_mbssid_elems {
  *	attribute is present in beacon data or not.
  */
 struct cfg80211_beacon_data {
+	unsigned int link_id;
+
 	const u8 *head, *tail;
 	const u8 *beacon_ies;
 	const u8 *proberesp_ies;
@@ -4201,7 +4204,8 @@ struct cfg80211_ops {
 			    struct cfg80211_ap_settings *settings);
 	int	(*change_beacon)(struct wiphy *wiphy, struct net_device *dev,
 				 struct cfg80211_beacon_data *info);
-	int	(*stop_ap)(struct wiphy *wiphy, struct net_device *dev);
+	int	(*stop_ap)(struct wiphy *wiphy, struct net_device *dev,
+			   unsigned int link_id);
 
 
 	int	(*add_station)(struct wiphy *wiphy, struct net_device *dev,
@@ -4309,6 +4313,7 @@ struct cfg80211_ops {
 
 	int	(*set_bitrate_mask)(struct wiphy *wiphy,
 				    struct net_device *dev,
+				    unsigned int link_id,
 				    const u8 *peer,
 				    const struct cfg80211_bitrate_mask *mask);
 
@@ -4384,6 +4389,7 @@ struct cfg80211_ops {
 
 	int	(*get_channel)(struct wiphy *wiphy,
 			       struct wireless_dev *wdev,
+			       unsigned int link_id,
 			       struct cfg80211_chan_def *chandef);
 
 	int	(*start_p2p_device)(struct wiphy *wiphy,
@@ -4420,6 +4426,7 @@ struct cfg80211_ops {
 			       struct cfg80211_qos_map *qos_map);
 
 	int	(*set_ap_chanwidth)(struct wiphy *wiphy, struct net_device *dev,
+				    unsigned int link_id,
 				    struct cfg80211_chan_def *chandef);
 
 	int	(*add_tx_ts)(struct wiphy *wiphy, struct net_device *dev,
@@ -4545,10 +4552,14 @@ struct cfg80211_ops {
  * @WIPHY_FLAG_HAS_STATIC_WEP: The device supports static WEP key installation
  *	before connection.
  * @WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK: The device supports bigger kek and kck keys
+ * @WIPHY_FLAG_SUPPORTS_MLO: This is a temporary flag gating the MLO APIs,
+ *	in order to not have them reachable in normal drivers, until we have
+ *	complete feature/interface combinations/etc. advertisement. No driver
+ *	should set this flag for now.
  */
 enum wiphy_flags {
 	WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK		= BIT(0),
-	/* use hole at 1 */
+	WIPHY_FLAG_SUPPORTS_MLO			= BIT(1),
 	WIPHY_FLAG_SPLIT_SCAN_6GHZ		= BIT(2),
 	WIPHY_FLAG_NETNS_OK			= BIT(3),
 	WIPHY_FLAG_PS_ON_BY_DEFAULT		= BIT(4),
@@ -5505,6 +5516,8 @@ static inline void wiphy_unlock(struct wiphy *wiphy)
  * @netdev: (private) Used to reference back to the netdev, may be %NULL
  * @identifier: (private) Identifier used in nl80211 to identify this
  *	wireless device if it has no netdev
+ * @connected_addr: (private) BSSID or AP MLD address if connected
+ * @connected: indicates if connected or not (STA mode)
  * @current_bss: (private) Used by the internal configuration code
  * @chandef: (private) Used by the internal configuration code to track
  *	the user-set channel definition.
@@ -5585,8 +5598,6 @@ struct wireless_dev {
 	u8 address[ETH_ALEN] __aligned(sizeof(u16));
 
 	/* currently used for IBSS and SME - might be rearranged later */
-	u8 ssid[IEEE80211_MAX_SSID_LEN];
-	u8 ssid_len, mesh_id_len, mesh_id_up_len;
 	struct cfg80211_conn *conn;
 	struct cfg80211_cached_keys *connect_keys;
 	enum ieee80211_bss_type conn_bss_type;
@@ -5598,20 +5609,17 @@ struct wireless_dev {
 	struct list_head event_list;
 	spinlock_t event_lock;
 
-	struct cfg80211_internal_bss *current_bss; /* associated / joined */
-	struct cfg80211_chan_def preset_chandef;
-	struct cfg80211_chan_def chandef;
+	u8 connected:1;
 
 	bool ps;
 	int ps_timeout;
 
-	int beacon_interval;
-
 	u32 ap_unexpected_nlportid;
 
 	u32 owner_nlportid;
 	bool nl_owner_dead;
 
+	/* FIXME: need to rework radar detection for MLO */
 	bool cac_started;
 	unsigned long cac_start_time;
 	unsigned int cac_time_ms;
@@ -5639,6 +5647,50 @@ struct wireless_dev {
 	struct work_struct pmsr_free_wk;
 
 	unsigned long unprot_beacon_reported;
+
+	union {
+		struct {
+			u8 connected_addr[ETH_ALEN] __aligned(2);
+			u8 ssid[IEEE80211_MAX_SSID_LEN];
+			u8 ssid_len;
+		} client;
+		struct {
+			int beacon_interval;
+			struct cfg80211_chan_def preset_chandef;
+			struct cfg80211_chan_def chandef;
+			u8 id[IEEE80211_MAX_SSID_LEN];
+			u8 id_len, id_up_len;
+		} mesh;
+		struct {
+			struct cfg80211_chan_def preset_chandef;
+			u8 ssid[IEEE80211_MAX_SSID_LEN];
+			u8 ssid_len;
+		} ap;
+		struct {
+			struct cfg80211_internal_bss *current_bss;
+			struct cfg80211_chan_def chandef;
+			int beacon_interval;
+			u8 ssid[IEEE80211_MAX_SSID_LEN];
+			u8 ssid_len;
+		} ibss;
+		struct {
+			struct cfg80211_chan_def chandef;
+		} ocb;
+	} u;
+
+	struct {
+		u8 addr[ETH_ALEN] __aligned(2);
+		union {
+			struct {
+				unsigned int beacon_interval;
+				struct cfg80211_chan_def chandef;
+			} ap;
+			struct {
+				struct cfg80211_internal_bss *current_bss;
+			} client;
+		};
+	} links[IEEE80211_MLD_MAX_NUM_LINKS];
+	u16 valid_links;
 };
 
 static inline const u8 *wdev_address(struct wireless_dev *wdev)
@@ -5667,6 +5719,31 @@ static inline void *wdev_priv(struct wireless_dev *wdev)
 	return wiphy_priv(wdev->wiphy);
 }
 
+/**
+ * wdev_chandef - return chandef pointer from wireless_dev
+ * @wdev: the wdev
+ * @link_id: the link ID for MLO
+ *
+ * Return: The chandef depending on the mode, or %NULL.
+ */
+struct cfg80211_chan_def *wdev_chandef(struct wireless_dev *wdev,
+				       unsigned int link_id);
+
+static inline void WARN_INVALID_LINK_ID(struct wireless_dev *wdev,
+					unsigned int link_id)
+{
+	WARN_ON(link_id && !wdev->valid_links);
+	WARN_ON(wdev->valid_links &&
+		!(wdev->valid_links & BIT(link_id)));
+}
+
+#define for_each_valid_link(wdev, link_id)					\
+	for (link_id = 0;							\
+	     link_id < ((wdev)->valid_links ? ARRAY_SIZE((wdev)->links) : 1);	\
+	     link_id++)								\
+		if (!(wdev)->valid_links ||					\
+		    ((wdev)->valid_links & BIT(link_id)))
+
 /**
  * DOC: Utility functions
  *
@@ -7882,12 +7959,14 @@ bool cfg80211_reg_can_beacon_relax(struct wiphy *wiphy,
  * cfg80211_ch_switch_notify - update wdev channel and notify userspace
  * @dev: the device which switched channels
  * @chandef: the new channel definition
+ * @link_id: the link ID for MLO, must be 0 for non-MLO
  *
  * Caller must acquire wdev_lock, therefore must only be called from sleepable
  * driver context!
  */
 void cfg80211_ch_switch_notify(struct net_device *dev,
-			       struct cfg80211_chan_def *chandef);
+			       struct cfg80211_chan_def *chandef,
+			       unsigned int link_id);
 
 /*
  * cfg80211_ch_switch_started_notify - notify channel switch start
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 98f905f16411..a9a2c9fef295 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -323,6 +323,17 @@
  * Once the association is done, the driver cleans the FILS AAD data.
  */
 
+/**
+ * DOC: Multi-Link Operation
+ *
+ * In Multi-Link Operation, a connection between to MLDs utilizes multiple
+ * links. To use this in nl80211, various commands and responses now need
+ * to or will include the new %NL80211_ATTR_MLO_LINKS attribute.
+ * Additionally, various commands that need to operate on a specific link
+ * now need to be given the %NL80211_ATTR_MLO_LINK_ID attribute, e.g. to
+ * use %NL80211_CMD_START_AP or similar functions.
+ */
+
 /**
  * enum nl80211_commands - supported nl80211 commands
  *
@@ -1237,6 +1248,12 @@
  *      to describe the BSSID address of the AP and %NL80211_ATTR_TIMEOUT to
  *      specify the timeout value.
  *
+ * @NL80211_CMD_ADD_LINK: Add a new link to an interface. The
+ *	%NL80211_ATTR_MLO_LINK_ID attribute is used for the new link.
+ * @NL80211_CMD_REMOVE_LINK: Remove a link from an interface. This may come
+ *	without %NL80211_ATTR_MLO_LINK_ID as an easy way to remove all links
+ *	in preparation for e.g. roaming to a regular (non-MLO) AP.
+ *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
  */
@@ -1481,6 +1498,9 @@ enum nl80211_commands {
 
 	NL80211_CMD_ASSOC_COMEBACK,
 
+	NL80211_CMD_ADD_LINK,
+	NL80211_CMD_REMOVE_LINK,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
@@ -2663,6 +2683,11 @@ enum nl80211_commands {
  *	association request when used with NL80211_CMD_NEW_STATION). Can be set
  *	only if %NL80211_STA_FLAG_WME is set.
  *
+ * @NL80211_ATTR_MLO_LINK_ID: A (u8) link ID for use with MLO, to be used with
+ *	various commands that need a link ID to operate.
+ * @NL80211_ATTR_MLO_LINKS: A nested array of links, each containing some
+ *	per-link information and a link ID.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -3177,6 +3202,9 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_DISABLE_EHT,
 
+	NL80211_ATTR_MLO_LINKS,
+	NL80211_ATTR_MLO_LINK_ID,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 881efbfb96f6..362cac9e2135 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1342,7 +1342,8 @@ static void ieee80211_free_next_beacon(struct ieee80211_sub_if_data *sdata)
 	sdata->u.ap.next_beacon = NULL;
 }
 
-static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev)
+static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
+			     unsigned int link_id)
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_sub_if_data *vlan;
@@ -3049,6 +3050,7 @@ static int ieee80211_set_cqm_rssi_range_config(struct wiphy *wiphy,
 
 static int ieee80211_set_bitrate_mask(struct wiphy *wiphy,
 				      struct net_device *dev,
+				      unsigned int link_id,
 				      const u8 *addr,
 				      const struct cfg80211_bitrate_mask *mask)
 {
@@ -3390,7 +3392,7 @@ static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata)
 	if (err)
 		return err;
 
-	cfg80211_ch_switch_notify(sdata->dev, &sdata->csa_chandef);
+	cfg80211_ch_switch_notify(sdata->dev, &sdata->csa_chandef, 0);
 
 	return 0;
 }
@@ -3898,6 +3900,7 @@ unlock:
 
 static int ieee80211_cfg_get_channel(struct wiphy *wiphy,
 				     struct wireless_dev *wdev,
+				     unsigned int link_id,
 				     struct cfg80211_chan_def *chandef)
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
@@ -3958,6 +3961,7 @@ static int ieee80211_set_qos_map(struct wiphy *wiphy,
 
 static int ieee80211_set_ap_chanwidth(struct wiphy *wiphy,
 				      struct net_device *dev,
+				      unsigned int link_id,
 				      struct cfg80211_chan_def *chandef)
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 6d5ad71ef02c..e0a9b7d63071 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1314,7 +1314,7 @@ static void ieee80211_chswitch_post_beacon(struct ieee80211_sub_if_data *sdata)
 		return;
 	}
 
-	cfg80211_ch_switch_notify(sdata->dev, &sdata->reserved_chandef);
+	cfg80211_ch_switch_notify(sdata->dev, &sdata->reserved_chandef, 0);
 }
 
 void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success)
diff --git a/net/wireless/ap.c b/net/wireless/ap.c
index 550ac9d827fe..e68923200018 100644
--- a/net/wireless/ap.c
+++ b/net/wireless/ap.c
@@ -1,4 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
+/*
+ * Parts of this file are
+ * Copyright (C) 2022 Intel Corporation
+ */
 #include <linux/ieee80211.h>
 #include <linux/export.h>
 #include <net/cfg80211.h>
@@ -7,8 +11,9 @@
 #include "rdev-ops.h"
 
 
-int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
-		       struct net_device *dev, bool notify)
+static int ___cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
+			       struct net_device *dev, unsigned int link_id,
+			       bool notify)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	int err;
@@ -22,15 +27,16 @@ int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
 	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
 		return -EOPNOTSUPP;
 
-	if (!wdev->beacon_interval)
+	if (!wdev->links[link_id].ap.beacon_interval)
 		return -ENOENT;
 
-	err = rdev_stop_ap(rdev, dev);
+	err = rdev_stop_ap(rdev, dev, link_id);
 	if (!err) {
 		wdev->conn_owner_nlportid = 0;
-		wdev->beacon_interval = 0;
-		memset(&wdev->chandef, 0, sizeof(wdev->chandef));
-		wdev->ssid_len = 0;
+		wdev->links[link_id].ap.beacon_interval = 0;
+		memset(&wdev->links[link_id].ap.chandef, 0,
+		       sizeof(wdev->links[link_id].ap.chandef));
+		wdev->u.ap.ssid_len = 0;
 		rdev_set_qos_map(rdev, dev, NULL);
 		if (notify)
 			nl80211_send_ap_stopped(wdev);
@@ -46,14 +52,36 @@ int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
 	return err;
 }
 
+int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
+		       struct net_device *dev, int link_id,
+		       bool notify)
+{
+	unsigned int link;
+	int ret = 0;
+
+	if (link_id >= 0)
+		return ___cfg80211_stop_ap(rdev, dev, link_id, notify);
+
+	for_each_valid_link(dev->ieee80211_ptr, link) {
+		int ret1 = ___cfg80211_stop_ap(rdev, dev, link, notify);
+
+		if (ret1)
+			ret = ret1;
+		/* try the next one also if one errored */
+	}
+
+	return ret;
+}
+
 int cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
-		     struct net_device *dev, bool notify)
+		     struct net_device *dev, int link_id,
+		     bool notify)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	int err;
 
 	wdev_lock(wdev);
-	err = __cfg80211_stop_ap(rdev, dev, notify);
+	err = __cfg80211_stop_ap(rdev, dev, link_id, notify);
 	wdev_unlock(wdev);
 
 	return err;
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index f74f176e0d9d..efc2de4bab57 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -672,14 +672,21 @@ bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy,
  * range of chandef.
  */
 bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef,
-			  struct ieee80211_channel *chan)
+			  struct ieee80211_channel *chan,
+			  bool primary_only)
 {
 	int width;
 	u32 freq;
 
+	if (!chandef->chan)
+		return false;
+
 	if (chandef->chan->center_freq == chan->center_freq)
 		return true;
 
+	if (primary_only)
+		return false;
+
 	width = cfg80211_chandef_get_width(chandef);
 	if (width <= 20)
 		return false;
@@ -704,23 +711,25 @@ bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef,
 
 bool cfg80211_beaconing_iface_active(struct wireless_dev *wdev)
 {
-	bool active = false;
+	unsigned int link;
 
 	ASSERT_WDEV_LOCK(wdev);
 
-	if (!wdev->chandef.chan)
-		return false;
-
 	switch (wdev->iftype) {
 	case NL80211_IFTYPE_AP:
 	case NL80211_IFTYPE_P2P_GO:
-		active = wdev->beacon_interval != 0;
+		for_each_valid_link(wdev, link) {
+			if (wdev->links[link].ap.beacon_interval)
+				return true;
+		}
 		break;
 	case NL80211_IFTYPE_ADHOC:
-		active = wdev->ssid_len != 0;
+		if (wdev->u.ibss.ssid_len)
+			return true;
 		break;
 	case NL80211_IFTYPE_MESH_POINT:
-		active = wdev->mesh_id_len != 0;
+		if (wdev->u.mesh.id_len)
+			return true;
 		break;
 	case NL80211_IFTYPE_STATION:
 	case NL80211_IFTYPE_OCB:
@@ -737,7 +746,35 @@ bool cfg80211_beaconing_iface_active(struct wireless_dev *wdev)
 		WARN_ON(1);
 	}
 
-	return active;
+	return false;
+}
+
+bool cfg80211_wdev_on_sub_chan(struct wireless_dev *wdev,
+			       struct ieee80211_channel *chan,
+			       bool primary_only)
+{
+	unsigned int link;
+
+	switch (wdev->iftype) {
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_P2P_GO:
+		for_each_valid_link(wdev, link) {
+			if (cfg80211_is_sub_chan(&wdev->links[link].ap.chandef,
+						 chan, primary_only))
+				return true;
+		}
+		break;
+	case NL80211_IFTYPE_ADHOC:
+		return cfg80211_is_sub_chan(&wdev->u.ibss.chandef, chan,
+					    primary_only);
+	case NL80211_IFTYPE_MESH_POINT:
+		return cfg80211_is_sub_chan(&wdev->u.mesh.chandef, chan,
+					    primary_only);
+	default:
+		break;
+	}
+
+	return false;
 }
 
 static bool cfg80211_is_wiphy_oper_chan(struct wiphy *wiphy,
@@ -752,7 +789,7 @@ static bool cfg80211_is_wiphy_oper_chan(struct wiphy *wiphy,
 			continue;
 		}
 
-		if (cfg80211_is_sub_chan(&wdev->chandef, chan)) {
+		if (cfg80211_wdev_on_sub_chan(wdev, chan, false)) {
 			wdev_unlock(wdev);
 			return true;
 		}
@@ -772,7 +809,8 @@ cfg80211_offchan_chain_is_active(struct cfg80211_registered_device *rdev,
 	if (!cfg80211_chandef_valid(&rdev->background_radar_chandef))
 		return false;
 
-	return cfg80211_is_sub_chan(&rdev->background_radar_chandef, channel);
+	return cfg80211_is_sub_chan(&rdev->background_radar_chandef, channel,
+				    false);
 }
 
 bool cfg80211_any_wiphy_oper_chan(struct wiphy *wiphy,
@@ -1176,6 +1214,68 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy,
 }
 EXPORT_SYMBOL(cfg80211_chandef_usable);
 
+static bool cfg80211_ir_permissive_check_wdev(enum nl80211_iftype iftype,
+					      struct wireless_dev *wdev,
+					      struct ieee80211_channel *chan)
+{
+	struct ieee80211_channel *other_chan = NULL;
+	unsigned int link_id;
+	int r1, r2;
+
+	for_each_valid_link(wdev, link_id) {
+		if (wdev->iftype == NL80211_IFTYPE_STATION &&
+		    wdev->links[link_id].client.current_bss)
+			other_chan = wdev->links[link_id].client.current_bss->pub.channel;
+
+		/*
+		 * If a GO already operates on the same GO_CONCURRENT channel,
+		 * this one (maybe the same one) can beacon as well. We allow
+		 * the operation even if the station we relied on with
+		 * GO_CONCURRENT is disconnected now. But then we must make sure
+		 * we're not outdoor on an indoor-only channel.
+		 */
+		if (iftype == NL80211_IFTYPE_P2P_GO &&
+		    wdev->iftype == NL80211_IFTYPE_P2P_GO &&
+		    wdev->links[link_id].ap.beacon_interval &&
+		    !(chan->flags & IEEE80211_CHAN_INDOOR_ONLY))
+			other_chan = wdev->links[link_id].ap.chandef.chan;
+
+		if (!other_chan)
+			continue;
+
+		if (chan == other_chan)
+			return true;
+
+		if (chan->band != NL80211_BAND_5GHZ &&
+		    chan->band != NL80211_BAND_6GHZ)
+			continue;
+
+		r1 = cfg80211_get_unii(chan->center_freq);
+		r2 = cfg80211_get_unii(other_chan->center_freq);
+
+		if (r1 != -EINVAL && r1 == r2) {
+			/*
+			 * At some locations channels 149-165 are considered a
+			 * bundle, but at other locations, e.g., Indonesia,
+			 * channels 149-161 are considered a bundle while
+			 * channel 165 is left out and considered to be in a
+			 * different bundle. Thus, in case that there is a
+			 * station interface connected to an AP on channel 165,
+			 * it is assumed that channels 149-161 are allowed for
+			 * GO operations. However, having a station interface
+			 * connected to an AP on channels 149-161, does not
+			 * allow GO operation on channel 165.
+			 */
+			if (chan->center_freq == 5825 &&
+			    other_chan->center_freq != 5825)
+				continue;
+			return true;
+		}
+	}
+
+	return false;
+}
+
 /*
  * Check if the channel can be used under permissive conditions mandated by
  * some regulatory bodies, i.e., the channel is marked with
@@ -1219,59 +1319,14 @@ static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy,
 	 * the current registered device.
 	 */
 	list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
-		struct ieee80211_channel *other_chan = NULL;
-		int r1, r2;
+		bool ret;
 
 		wdev_lock(wdev);
-		if (wdev->iftype == NL80211_IFTYPE_STATION &&
-		    wdev->current_bss)
-			other_chan = wdev->current_bss->pub.channel;
-
-		/*
-		 * If a GO already operates on the same GO_CONCURRENT channel,
-		 * this one (maybe the same one) can beacon as well. We allow
-		 * the operation even if the station we relied on with
-		 * GO_CONCURRENT is disconnected now. But then we must make sure
-		 * we're not outdoor on an indoor-only channel.
-		 */
-		if (iftype == NL80211_IFTYPE_P2P_GO &&
-		    wdev->iftype == NL80211_IFTYPE_P2P_GO &&
-		    wdev->beacon_interval &&
-		    !(chan->flags & IEEE80211_CHAN_INDOOR_ONLY))
-			other_chan = wdev->chandef.chan;
+		ret = cfg80211_ir_permissive_check_wdev(iftype, wdev, chan);
 		wdev_unlock(wdev);
 
-		if (!other_chan)
-			continue;
-
-		if (chan == other_chan)
-			return true;
-
-		if (chan->band != NL80211_BAND_5GHZ &&
-		    chan->band != NL80211_BAND_6GHZ)
-			continue;
-
-		r1 = cfg80211_get_unii(chan->center_freq);
-		r2 = cfg80211_get_unii(other_chan->center_freq);
-
-		if (r1 != -EINVAL && r1 == r2) {
-			/*
-			 * At some locations channels 149-165 are considered a
-			 * bundle, but at other locations, e.g., Indonesia,
-			 * channels 149-161 are considered a bundle while
-			 * channel 165 is left out and considered to be in a
-			 * different bundle. Thus, in case that there is a
-			 * station interface connected to an AP on channel 165,
-			 * it is assumed that channels 149-161 are allowed for
-			 * GO operations. However, having a station interface
-			 * connected to an AP on channels 149-161, does not
-			 * allow GO operation on channel 165.
-			 */
-			if (chan->center_freq == 5825 &&
-			    other_chan->center_freq != 5825)
-				continue;
-			return true;
-		}
+		if (ret)
+			return ret;
 	}
 
 	return false;
@@ -1374,3 +1429,24 @@ bool cfg80211_any_usable_channels(struct wiphy *wiphy,
 	return false;
 }
 EXPORT_SYMBOL(cfg80211_any_usable_channels);
+
+struct cfg80211_chan_def *wdev_chandef(struct wireless_dev *wdev,
+				       unsigned int link_id)
+{
+	ASSERT_WDEV_LOCK(wdev);
+
+	switch (wdev->iftype) {
+	case NL80211_IFTYPE_MESH_POINT:
+		return &wdev->u.mesh.chandef;
+	case NL80211_IFTYPE_ADHOC:
+		return &wdev->u.ibss.chandef;
+	case NL80211_IFTYPE_OCB:
+		return &wdev->u.ocb.chandef;
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_P2P_GO:
+		return &wdev->links[link_id].ap.chandef;
+	default:
+		return NULL;
+	}
+}
+EXPORT_SYMBOL(wdev_chandef);
diff --git a/net/wireless/core.c b/net/wireless/core.c
index f08d4b3bb148..3e5d12040726 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -1118,6 +1118,7 @@ static void _cfg80211_unregister_wdev(struct wireless_dev *wdev,
 				      bool unregister_netdev)
 {
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+	unsigned int link_id;
 
 	ASSERT_RTNL();
 	lockdep_assert_held(&rdev->wiphy.mtx);
@@ -1167,11 +1168,22 @@ static void _cfg80211_unregister_wdev(struct wireless_dev *wdev,
 	 */
 	cfg80211_process_wdev_events(wdev);
 
-	if (WARN_ON(wdev->current_bss)) {
-		cfg80211_unhold_bss(wdev->current_bss);
-		cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub);
-		wdev->current_bss = NULL;
+	if (wdev->iftype == NL80211_IFTYPE_STATION ||
+	    wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) {
+		for (link_id = 0; link_id < ARRAY_SIZE(wdev->links); link_id++) {
+			struct cfg80211_internal_bss *curbss;
+
+			curbss = wdev->links[link_id].client.current_bss;
+
+			if (WARN_ON(curbss)) {
+				cfg80211_unhold_bss(curbss);
+				cfg80211_put_bss(wdev->wiphy, &curbss->pub);
+				wdev->links[link_id].client.current_bss = NULL;
+			}
+		}
 	}
+
+	wdev->connected = false;
 }
 
 void cfg80211_unregister_wdev(struct wireless_dev *wdev)
@@ -1233,7 +1245,7 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev,
 		break;
 	case NL80211_IFTYPE_AP:
 	case NL80211_IFTYPE_P2P_GO:
-		__cfg80211_stop_ap(rdev, dev, true);
+		__cfg80211_stop_ap(rdev, dev, -1, true);
 		break;
 	case NL80211_IFTYPE_OCB:
 		__cfg80211_leave_ocb(rdev, dev);
@@ -1463,9 +1475,9 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
 				memcpy(&setup, &default_mesh_setup,
 						sizeof(setup));
 				 /* back compat only needed for mesh_id */
-				setup.mesh_id = wdev->ssid;
-				setup.mesh_id_len = wdev->mesh_id_up_len;
-				if (wdev->mesh_id_up_len)
+				setup.mesh_id = wdev->u.mesh.id;
+				setup.mesh_id_len = wdev->u.mesh.id_up_len;
+				if (wdev->u.mesh.id_up_len)
 					__cfg80211_join_mesh(rdev, dev,
 							&setup,
 							&default_mesh_config);
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 5436ada91b1a..2c195067ddff 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -307,6 +307,7 @@ void cfg80211_bss_expire(struct cfg80211_registered_device *rdev);
 void cfg80211_bss_age(struct cfg80211_registered_device *rdev,
                       unsigned long age_secs);
 void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev,
+				     unsigned int link,
 				     struct ieee80211_channel *channel);
 
 /* IBSS */
@@ -353,9 +354,11 @@ int cfg80211_leave_ocb(struct cfg80211_registered_device *rdev,
 
 /* AP */
 int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
-		       struct net_device *dev, bool notify);
+		       struct net_device *dev, int link,
+		       bool notify);
 int cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
-		     struct net_device *dev, bool notify);
+		     struct net_device *dev, int link,
+		     bool notify);
 
 /* MLME */
 int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
@@ -507,7 +510,11 @@ bool cfg80211_any_wiphy_oper_chan(struct wiphy *wiphy,
 bool cfg80211_beaconing_iface_active(struct wireless_dev *wdev);
 
 bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef,
-			  struct ieee80211_channel *chan);
+			  struct ieee80211_channel *chan,
+			  bool primary_only);
+bool cfg80211_wdev_on_sub_chan(struct wireless_dev *wdev,
+			       struct ieee80211_channel *chan,
+			       bool primary_only);
 
 static inline unsigned int elapsed_jiffies_msecs(unsigned long start)
 {
diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c
index 5d89eec2869a..4935f94d1acc 100644
--- a/net/wireless/ibss.c
+++ b/net/wireless/ibss.c
@@ -28,7 +28,7 @@ void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid,
 	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC))
 		return;
 
-	if (!wdev->ssid_len)
+	if (!wdev->u.ibss.ssid_len)
 		return;
 
 	bss = cfg80211_get_bss(wdev->wiphy, channel, bssid, NULL, 0,
@@ -37,13 +37,13 @@ void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid,
 	if (WARN_ON(!bss))
 		return;
 
-	if (wdev->current_bss) {
-		cfg80211_unhold_bss(wdev->current_bss);
-		cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub);
+	if (wdev->u.ibss.current_bss) {
+		cfg80211_unhold_bss(wdev->u.ibss.current_bss);
+		cfg80211_put_bss(wdev->wiphy, &wdev->u.ibss.current_bss->pub);
 	}
 
 	cfg80211_hold_bss(bss_from_pub(bss));
-	wdev->current_bss = bss_from_pub(bss);
+	wdev->u.ibss.current_bss = bss_from_pub(bss);
 
 	if (!(wdev->wiphy->flags & WIPHY_FLAG_HAS_STATIC_WEP))
 		cfg80211_upload_connect_keys(wdev);
@@ -96,7 +96,7 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
 	lockdep_assert_held(&rdev->wiphy.mtx);
 	ASSERT_WDEV_LOCK(wdev);
 
-	if (wdev->ssid_len)
+	if (wdev->u.ibss.ssid_len)
 		return -EALREADY;
 
 	if (!params->basic_rates) {
@@ -131,7 +131,7 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
 		kfree_sensitive(wdev->connect_keys);
 	wdev->connect_keys = connkeys;
 
-	wdev->chandef = params->chandef;
+	wdev->u.ibss.chandef = params->chandef;
 	if (connkeys) {
 		params->wep_keys = connkeys->params;
 		params->wep_tx_key = connkeys->def;
@@ -146,8 +146,8 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
 		return err;
 	}
 
-	memcpy(wdev->ssid, params->ssid, params->ssid_len);
-	wdev->ssid_len = params->ssid_len;
+	memcpy(wdev->u.ibss.ssid, params->ssid, params->ssid_len);
+	wdev->u.ibss.ssid_len = params->ssid_len;
 
 	return 0;
 }
@@ -173,14 +173,14 @@ static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext)
 		for (i = 0; i < 6; i++)
 			rdev_del_key(rdev, dev, i, false, NULL);
 
-	if (wdev->current_bss) {
-		cfg80211_unhold_bss(wdev->current_bss);
-		cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub);
+	if (wdev->u.ibss.current_bss) {
+		cfg80211_unhold_bss(wdev->u.ibss.current_bss);
+		cfg80211_put_bss(wdev->wiphy, &wdev->u.ibss.current_bss->pub);
 	}
 
-	wdev->current_bss = NULL;
-	wdev->ssid_len = 0;
-	memset(&wdev->chandef, 0, sizeof(wdev->chandef));
+	wdev->u.ibss.current_bss = NULL;
+	wdev->u.ibss.ssid_len = 0;
+	memset(&wdev->u.ibss.chandef, 0, sizeof(wdev->u.ibss.chandef));
 #ifdef CONFIG_CFG80211_WEXT
 	if (!nowext)
 		wdev->wext.ibss.ssid_len = 0;
@@ -205,7 +205,7 @@ int __cfg80211_leave_ibss(struct cfg80211_registered_device *rdev,
 
 	ASSERT_WDEV_LOCK(wdev);
 
-	if (!wdev->ssid_len)
+	if (!wdev->u.ibss.ssid_len)
 		return -ENOLINK;
 
 	err = rdev_leave_ibss(rdev, dev);
@@ -339,7 +339,7 @@ int cfg80211_ibss_wext_siwfreq(struct net_device *dev,
 
 	wdev_lock(wdev);
 	err = 0;
-	if (wdev->ssid_len)
+	if (wdev->u.ibss.ssid_len)
 		err = __cfg80211_leave_ibss(rdev, dev, true);
 	wdev_unlock(wdev);
 
@@ -374,8 +374,8 @@ int cfg80211_ibss_wext_giwfreq(struct net_device *dev,
 		return -EINVAL;
 
 	wdev_lock(wdev);
-	if (wdev->current_bss)
-		chan = wdev->current_bss->pub.channel;
+	if (wdev->u.ibss.current_bss)
+		chan = wdev->u.ibss.current_bss->pub.channel;
 	else if (wdev->wext.ibss.chandef.chan)
 		chan = wdev->wext.ibss.chandef.chan;
 	wdev_unlock(wdev);
@@ -408,7 +408,7 @@ int cfg80211_ibss_wext_siwessid(struct net_device *dev,
 
 	wdev_lock(wdev);
 	err = 0;
-	if (wdev->ssid_len)
+	if (wdev->u.ibss.ssid_len)
 		err = __cfg80211_leave_ibss(rdev, dev, true);
 	wdev_unlock(wdev);
 
@@ -419,8 +419,8 @@ int cfg80211_ibss_wext_siwessid(struct net_device *dev,
 	if (len > 0 && ssid[len - 1] == '\0')
 		len--;
 
-	memcpy(wdev->ssid, ssid, len);
-	wdev->wext.ibss.ssid = wdev->ssid;
+	memcpy(wdev->u.ibss.ssid, ssid, len);
+	wdev->wext.ibss.ssid = wdev->u.ibss.ssid;
 	wdev->wext.ibss.ssid_len = len;
 
 	wdev_lock(wdev);
@@ -443,10 +443,10 @@ int cfg80211_ibss_wext_giwessid(struct net_device *dev,
 	data->flags = 0;
 
 	wdev_lock(wdev);
-	if (wdev->ssid_len) {
+	if (wdev->u.ibss.ssid_len) {
 		data->flags = 1;
-		data->length = wdev->ssid_len;
-		memcpy(ssid, wdev->ssid, data->length);
+		data->length = wdev->u.ibss.ssid_len;
+		memcpy(ssid, wdev->u.ibss.ssid, data->length);
 	} else if (wdev->wext.ibss.ssid && wdev->wext.ibss.ssid_len) {
 		data->flags = 1;
 		data->length = wdev->wext.ibss.ssid_len;
@@ -494,7 +494,7 @@ int cfg80211_ibss_wext_siwap(struct net_device *dev,
 
 	wdev_lock(wdev);
 	err = 0;
-	if (wdev->ssid_len)
+	if (wdev->u.ibss.ssid_len)
 		err = __cfg80211_leave_ibss(rdev, dev, true);
 	wdev_unlock(wdev);
 
@@ -527,8 +527,9 @@ int cfg80211_ibss_wext_giwap(struct net_device *dev,
 	ap_addr->sa_family = ARPHRD_ETHER;
 
 	wdev_lock(wdev);
-	if (wdev->current_bss)
-		memcpy(ap_addr->sa_data, wdev->current_bss->pub.bssid, ETH_ALEN);
+	if (wdev->u.ibss.current_bss)
+		memcpy(ap_addr->sa_data, wdev->u.ibss.current_bss->pub.bssid,
+		       ETH_ALEN);
 	else if (wdev->wext.ibss.bssid)
 		memcpy(ap_addr->sa_data, wdev->wext.ibss.bssid, ETH_ALEN);
 	else
diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c
index e4e363138279..59a3c5c092b1 100644
--- a/net/wireless/mesh.c
+++ b/net/wireless/mesh.c
@@ -1,4 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
+/*
+ * Portions
+ * Copyright (C) 2022 Intel Corporation
+ */
 #include <linux/ieee80211.h>
 #include <linux/export.h>
 #include <net/cfg80211.h>
@@ -114,7 +118,7 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
 	      setup->is_secure)
 		return -EOPNOTSUPP;
 
-	if (wdev->mesh_id_len)
+	if (wdev->u.mesh.id_len)
 		return -EALREADY;
 
 	if (!setup->mesh_id_len)
@@ -125,7 +129,7 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
 
 	if (!setup->chandef.chan) {
 		/* if no channel explicitly given, use preset channel */
-		setup->chandef = wdev->preset_chandef;
+		setup->chandef = wdev->u.mesh.preset_chandef;
 	}
 
 	if (!setup->chandef.chan) {
@@ -209,10 +213,10 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
 
 	err = rdev_join_mesh(rdev, dev, conf, setup);
 	if (!err) {
-		memcpy(wdev->ssid, setup->mesh_id, setup->mesh_id_len);
-		wdev->mesh_id_len = setup->mesh_id_len;
-		wdev->chandef = setup->chandef;
-		wdev->beacon_interval = setup->beacon_interval;
+		memcpy(wdev->u.mesh.id, setup->mesh_id, setup->mesh_id_len);
+		wdev->u.mesh.id_len = setup->mesh_id_len;
+		wdev->u.mesh.chandef = setup->chandef;
+		wdev->u.mesh.beacon_interval = setup->beacon_interval;
 	}
 
 	return err;
@@ -241,15 +245,15 @@ int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev,
 		err = rdev_libertas_set_mesh_channel(rdev, wdev->netdev,
 						     chandef->chan);
 		if (!err)
-			wdev->chandef = *chandef;
+			wdev->u.mesh.chandef = *chandef;
 
 		return err;
 	}
 
-	if (wdev->mesh_id_len)
+	if (wdev->u.mesh.id_len)
 		return -EBUSY;
 
-	wdev->preset_chandef = *chandef;
+	wdev->u.mesh.preset_chandef = *chandef;
 	return 0;
 }
 
@@ -267,15 +271,16 @@ int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
 	if (!rdev->ops->leave_mesh)
 		return -EOPNOTSUPP;
 
-	if (!wdev->mesh_id_len)
+	if (!wdev->u.mesh.id_len)
 		return -ENOTCONN;
 
 	err = rdev_leave_mesh(rdev, dev);
 	if (!err) {
 		wdev->conn_owner_nlportid = 0;
-		wdev->mesh_id_len = 0;
-		wdev->beacon_interval = 0;
-		memset(&wdev->chandef, 0, sizeof(wdev->chandef));
+		wdev->u.mesh.id_len = 0;
+		wdev->u.mesh.beacon_interval = 0;
+		memset(&wdev->u.mesh.chandef, 0,
+		       sizeof(wdev->u.mesh.chandef));
 		rdev_set_qos_map(rdev, dev, NULL);
 		cfg80211_sched_dfs_chan_update(rdev);
 	}
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index c8155a483ec2..fab2d6206cdd 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -92,8 +92,7 @@ static void cfg80211_process_deauth(struct wireless_dev *wdev,
 
 	nl80211_send_deauth(rdev, wdev->netdev, buf, len, reconnect, GFP_KERNEL);
 
-	if (!wdev->current_bss ||
-	    !ether_addr_equal(wdev->current_bss->pub.bssid, bssid))
+	if (!wdev->connected || !ether_addr_equal(wdev->u.client.connected_addr, bssid))
 		return;
 
 	__cfg80211_disconnected(wdev->netdev, NULL, 0, reason_code, from_ap);
@@ -113,8 +112,8 @@ static void cfg80211_process_disassoc(struct wireless_dev *wdev,
 	nl80211_send_disassoc(rdev, wdev->netdev, buf, len, reconnect,
 			      GFP_KERNEL);
 
-	if (WARN_ON(!wdev->current_bss ||
-		    !ether_addr_equal(wdev->current_bss->pub.bssid, bssid)))
+	if (WARN_ON(!wdev->connected ||
+		    !ether_addr_equal(wdev->u.client.connected_addr, bssid)))
 		return;
 
 	__cfg80211_disconnected(wdev->netdev, NULL, 0, reason_code, from_ap);
@@ -260,8 +259,8 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
 		if (!key || !key_len || key_idx < 0 || key_idx > 3)
 			return -EINVAL;
 
-	if (wdev->current_bss &&
-	    ether_addr_equal(bssid, wdev->current_bss->pub.bssid))
+	if (wdev->connected &&
+	    ether_addr_equal(bssid, wdev->u.client.connected_addr))
 		return -EALREADY;
 
 	req.bss = cfg80211_get_bss(&rdev->wiphy, chan, bssid, ssid, ssid_len,
@@ -322,9 +321,9 @@ int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
 
 	ASSERT_WDEV_LOCK(wdev);
 
-	if (wdev->current_bss &&
-	    (!req->prev_bssid || !ether_addr_equal(wdev->current_bss->pub.bssid,
-						   req->prev_bssid)))
+	if (wdev->connected &&
+	    (!req->prev_bssid ||
+	     !ether_addr_equal(wdev->u.client.connected_addr, req->prev_bssid)))
 		return -EALREADY;
 
 	cfg80211_oper_and_ht_capa(&req->ht_capa_mask,
@@ -364,13 +363,13 @@ int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev,
 	ASSERT_WDEV_LOCK(wdev);
 
 	if (local_state_change &&
-	    (!wdev->current_bss ||
-	     !ether_addr_equal(wdev->current_bss->pub.bssid, bssid)))
+	    (!wdev->connected ||
+	     !ether_addr_equal(wdev->u.client.connected_addr, bssid)))
 		return 0;
 
 	if (ether_addr_equal(wdev->disconnect_bssid, bssid) ||
-	    (wdev->current_bss &&
-	     ether_addr_equal(wdev->current_bss->pub.bssid, bssid)))
+	    (wdev->connected &&
+	     ether_addr_equal(wdev->u.client.connected_addr, bssid)))
 		wdev->conn_owner_nlportid = 0;
 
 	return rdev_deauth(rdev, dev, &req);
@@ -392,11 +391,12 @@ int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev,
 
 	ASSERT_WDEV_LOCK(wdev);
 
-	if (!wdev->current_bss)
+	if (!wdev->connected)
 		return -ENOTCONN;
 
-	if (ether_addr_equal(wdev->current_bss->pub.bssid, bssid))
-		req.bss = &wdev->current_bss->pub;
+	if (ether_addr_equal(wdev->links[0].client.current_bss->pub.bssid,
+			     bssid))
+		req.bss = &wdev->links[0].client.current_bss->pub;
 	else
 		return -ENOTCONN;
 
@@ -405,7 +405,7 @@ int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev,
 		return err;
 
 	/* driver should have reported the disassoc */
-	WARN_ON(wdev->current_bss);
+	WARN_ON(wdev->connected);
 	return 0;
 }
 
@@ -420,10 +420,10 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev,
 	if (!rdev->ops->deauth)
 		return;
 
-	if (!wdev->current_bss)
+	if (!wdev->connected)
 		return;
 
-	memcpy(bssid, wdev->current_bss->pub.bssid, ETH_ALEN);
+	memcpy(bssid, wdev->u.client.connected_addr, ETH_ALEN);
 	cfg80211_mlme_deauth(rdev, dev, bssid, NULL, 0,
 			     WLAN_REASON_DEAUTH_LEAVING, false);
 }
@@ -676,28 +676,34 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
 
 		switch (wdev->iftype) {
 		case NL80211_IFTYPE_ADHOC:
+			/*
+			 * check for IBSS DA must be done by driver as
+			 * cfg80211 doesn't track the stations
+			 */
+			if (!wdev->u.ibss.current_bss ||
+			    !ether_addr_equal(wdev->u.ibss.current_bss->pub.bssid,
+					      mgmt->bssid)) {
+				err = -ENOTCONN;
+				break;
+			}
+			break;
 		case NL80211_IFTYPE_STATION:
 		case NL80211_IFTYPE_P2P_CLIENT:
-			if (!wdev->current_bss) {
+			if (!wdev->connected) {
 				err = -ENOTCONN;
 				break;
 			}
 
-			if (!ether_addr_equal(wdev->current_bss->pub.bssid,
+			/* FIXME: MLD may address this differently */
+
+			if (!ether_addr_equal(wdev->u.client.connected_addr,
 					      mgmt->bssid)) {
 				err = -ENOTCONN;
 				break;
 			}
 
-			/*
-			 * check for IBSS DA must be done by driver as
-			 * cfg80211 doesn't track the stations
-			 */
-			if (wdev->iftype == NL80211_IFTYPE_ADHOC)
-				break;
-
 			/* for station, check that DA is the AP */
-			if (!ether_addr_equal(wdev->current_bss->pub.bssid,
+			if (!ether_addr_equal(wdev->u.client.connected_addr,
 					      mgmt->da)) {
 				err = -ENOTCONN;
 				break;
@@ -743,12 +749,12 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
 		if (!ieee80211_is_action(mgmt->frame_control) ||
 		    mgmt->u.action.category != WLAN_CATEGORY_PUBLIC)
 			return -EINVAL;
-		if (!wdev->current_bss &&
+		if (!wdev->connected &&
 		    !wiphy_ext_feature_isset(
 			    &rdev->wiphy,
 			    NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA))
 			return -EINVAL;
-		if (wdev->current_bss &&
+		if (wdev->connected &&
 		    !wiphy_ext_feature_isset(
 			    &rdev->wiphy,
 			    NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED))
@@ -940,12 +946,16 @@ void cfg80211_cac_event(struct net_device *netdev,
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
 	unsigned long timeout;
 
+	/* not yet supported */
+	if (wdev->valid_links)
+		return;
+
 	trace_cfg80211_cac_event(netdev, event);
 
 	if (WARN_ON(!wdev->cac_started && event != NL80211_RADAR_CAC_STARTED))
 		return;
 
-	if (WARN_ON(!wdev->chandef.chan))
+	if (WARN_ON(!wdev->links[0].ap.chandef.chan))
 		return;
 
 	switch (event) {
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 740b29481bc6..af31978fc9cc 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -792,6 +792,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 				 NL80211_EHT_MIN_CAPABILITY_LEN,
 				 NL80211_EHT_MAX_CAPABILITY_LEN),
 	[NL80211_ATTR_DISABLE_EHT] = { .type = NLA_FLAG },
+	[NL80211_ATTR_MLO_LINKS] =
+		NLA_POLICY_NESTED_ARRAY(nl80211_policy),
+	[NL80211_ATTR_MLO_LINK_ID] =
+		NLA_POLICY_RANGE(NLA_U8, 0, IEEE80211_MLD_MAX_NUM_LINKS),
 };
 
 /* policy for the key attributes */
@@ -1225,6 +1229,37 @@ static bool nl80211_put_txq_stats(struct sk_buff *msg,
 
 /* netlink command implementations */
 
+/**
+ * nl80211_link_id - return link ID
+ * @attrs: attributes to look at
+ *
+ * Returns: the link ID or 0 if not given
+ *
+ * Note this function doesn't do any validation of the link
+ * ID validity wrt. links that were actually added, so it must
+ * be called only from ops with %NL80211_FLAG_MLO_VALID_LINK_ID
+ * or if additional validation is done.
+ */
+static unsigned int nl80211_link_id(struct nlattr **attrs)
+{
+	struct nlattr *linkid = attrs[NL80211_ATTR_MLO_LINK_ID];
+
+	if (!linkid)
+		return 0;
+
+	return nla_get_u8(linkid);
+}
+
+static int nl80211_link_id_or_invalid(struct nlattr **attrs)
+{
+	struct nlattr *linkid = attrs[NL80211_ATTR_MLO_LINK_ID];
+
+	if (!linkid)
+		return -1;
+
+	return nla_get_u8(linkid);
+}
+
 struct key_parse {
 	struct key_params p;
 	int idx;
@@ -1496,11 +1531,15 @@ static int nl80211_key_allowed(struct wireless_dev *wdev)
 	case NL80211_IFTYPE_MESH_POINT:
 		break;
 	case NL80211_IFTYPE_ADHOC:
+		if (wdev->u.ibss.current_bss)
+			return 0;
+		return -ENOLINK;
 	case NL80211_IFTYPE_STATION:
 	case NL80211_IFTYPE_P2P_CLIENT:
-		if (!wdev->current_bss)
-			return -ENOLINK;
-		break;
+		/* for MLO, require driver validation of the link ID */
+		if (wdev->connected)
+			return 0;
+		return -ENOLINK;
 	case NL80211_IFTYPE_UNSPECIFIED:
 	case NL80211_IFTYPE_OCB:
 	case NL80211_IFTYPE_MONITOR:
@@ -3232,12 +3271,14 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
 
 static int __nl80211_set_channel(struct cfg80211_registered_device *rdev,
 				 struct net_device *dev,
-				 struct genl_info *info)
+				 struct genl_info *info,
+				 int _link_id)
 {
 	struct cfg80211_chan_def chandef;
 	int result;
 	enum nl80211_iftype iftype = NL80211_IFTYPE_MONITOR;
 	struct wireless_dev *wdev = NULL;
+	int link_id = _link_id;
 
 	if (dev)
 		wdev = dev->ieee80211_ptr;
@@ -3246,6 +3287,12 @@ static int __nl80211_set_channel(struct cfg80211_registered_device *rdev,
 	if (wdev)
 		iftype = wdev->iftype;
 
+	if (link_id < 0) {
+		if (wdev && wdev->valid_links)
+			return -EINVAL;
+		link_id = 0;
+	}
+
 	result = nl80211_parse_chandef(rdev, info, &chandef);
 	if (result)
 		return result;
@@ -3254,49 +3301,48 @@ static int __nl80211_set_channel(struct cfg80211_registered_device *rdev,
 	case NL80211_IFTYPE_AP:
 	case NL80211_IFTYPE_P2P_GO:
 		if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &chandef,
-						   iftype)) {
-			result = -EINVAL;
-			break;
-		}
-		if (wdev->beacon_interval) {
+						   iftype))
+			return -EINVAL;
+		if (wdev->links[link_id].ap.beacon_interval) {
+			struct ieee80211_channel *cur_chan;
+
 			if (!dev || !rdev->ops->set_ap_chanwidth ||
 			    !(rdev->wiphy.features &
-			      NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE)) {
-				result = -EBUSY;
-				break;
-			}
+			      NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE))
+				return -EBUSY;
 
 			/* Only allow dynamic channel width changes */
-			if (chandef.chan != wdev->preset_chandef.chan) {
-				result = -EBUSY;
-				break;
-			}
-			result = rdev_set_ap_chanwidth(rdev, dev, &chandef);
+			cur_chan = wdev->links[link_id].ap.chandef.chan;
+			if (chandef.chan != cur_chan)
+				return -EBUSY;
+
+			result = rdev_set_ap_chanwidth(rdev, dev, link_id,
+						       &chandef);
 			if (result)
-				break;
+				return result;
+			wdev->links[link_id].ap.chandef = chandef;
+		} else {
+			wdev->u.ap.preset_chandef = chandef;
 		}
-		wdev->preset_chandef = chandef;
-		result = 0;
-		break;
+		return 0;
 	case NL80211_IFTYPE_MESH_POINT:
-		result = cfg80211_set_mesh_channel(rdev, wdev, &chandef);
-		break;
+		return cfg80211_set_mesh_channel(rdev, wdev, &chandef);
 	case NL80211_IFTYPE_MONITOR:
-		result = cfg80211_set_monitor_channel(rdev, &chandef);
-		break;
+		return cfg80211_set_monitor_channel(rdev, &chandef);
 	default:
-		result = -EINVAL;
+		break;
 	}
 
-	return result;
+	return -EINVAL;
 }
 
 static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	int link_id = nl80211_link_id_or_invalid(info->attrs);
 	struct net_device *netdev = info->user_ptr[1];
 
-	return __nl80211_set_channel(rdev, netdev, info);
+	return __nl80211_set_channel(rdev, netdev, info, link_id);
 }
 
 static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
@@ -3411,7 +3457,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 		result = __nl80211_set_channel(
 			rdev,
 			nl80211_can_set_dev_channel(wdev) ? netdev : NULL,
-			info);
+			info, -1);
 		if (result)
 			goto out;
 	}
@@ -3696,15 +3742,13 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
 	    nla_put_u8(msg, NL80211_ATTR_4ADDR, wdev->use_4addr))
 		goto nla_put_failure;
 
-	if (rdev->ops->get_channel) {
-		int ret;
+	if (rdev->ops->get_channel && !wdev->valid_links) {
 		struct cfg80211_chan_def chandef = {};
+		int ret;
 
-		ret = rdev_get_channel(rdev, wdev, &chandef);
-		if (ret == 0) {
-			if (nl80211_send_chandef(msg, &chandef))
-				goto nla_put_failure;
-		}
+		ret = rdev_get_channel(rdev, wdev, 0, &chandef);
+		if (ret == 0 && nl80211_send_chandef(msg, &chandef))
+			goto nla_put_failure;
 	}
 
 	if (rdev->ops->get_tx_power) {
@@ -3721,27 +3765,24 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
 	switch (wdev->iftype) {
 	case NL80211_IFTYPE_AP:
 	case NL80211_IFTYPE_P2P_GO:
-		if (wdev->ssid_len &&
-		    nla_put(msg, NL80211_ATTR_SSID, wdev->ssid_len, wdev->ssid))
+		if (wdev->u.ap.ssid_len &&
+		    nla_put(msg, NL80211_ATTR_SSID, wdev->u.ap.ssid_len,
+			    wdev->u.ap.ssid))
 			goto nla_put_failure_locked;
 		break;
 	case NL80211_IFTYPE_STATION:
 	case NL80211_IFTYPE_P2P_CLIENT:
-	case NL80211_IFTYPE_ADHOC: {
-		const struct element *ssid_elem;
-
-		if (!wdev->current_bss)
-			break;
-		rcu_read_lock();
-		ssid_elem = ieee80211_bss_get_elem(&wdev->current_bss->pub,
-						   WLAN_EID_SSID);
-		if (ssid_elem &&
-		    nla_put(msg, NL80211_ATTR_SSID, ssid_elem->datalen,
-			    ssid_elem->data))
-			goto nla_put_failure_rcu_locked;
-		rcu_read_unlock();
+		if (wdev->u.client.ssid_len &&
+		    nla_put(msg, NL80211_ATTR_SSID, wdev->u.client.ssid_len,
+			    wdev->u.client.ssid))
+			goto nla_put_failure_locked;
+		break;
+	case NL80211_IFTYPE_ADHOC:
+		if (wdev->u.ibss.ssid_len &&
+		    nla_put(msg, NL80211_ATTR_SSID, wdev->u.ibss.ssid_len,
+			    wdev->u.ibss.ssid))
+			goto nla_put_failure_locked;
 		break;
-		}
 	default:
 		/* nothing */
 		break;
@@ -3761,8 +3802,6 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
 	genlmsg_end(msg, hdr);
 	return 0;
 
- nla_put_failure_rcu_locked:
-	rcu_read_unlock();
  nla_put_failure_locked:
 	wdev_unlock(wdev);
  nla_put_failure:
@@ -4014,10 +4053,11 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
 		wdev_lock(wdev);
 		BUILD_BUG_ON(IEEE80211_MAX_SSID_LEN !=
 			     IEEE80211_MAX_MESH_ID_LEN);
-		wdev->mesh_id_up_len =
+		wdev->u.mesh.id_up_len =
 			nla_len(info->attrs[NL80211_ATTR_MESH_ID]);
-		memcpy(wdev->ssid, nla_data(info->attrs[NL80211_ATTR_MESH_ID]),
-		       wdev->mesh_id_up_len);
+		memcpy(wdev->u.mesh.id,
+		       nla_data(info->attrs[NL80211_ATTR_MESH_ID]),
+		       wdev->u.mesh.id_up_len);
 		wdev_unlock(wdev);
 	}
 
@@ -4122,10 +4162,11 @@ static int _nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
 		wdev_lock(wdev);
 		BUILD_BUG_ON(IEEE80211_MAX_SSID_LEN !=
 			     IEEE80211_MAX_MESH_ID_LEN);
-		wdev->mesh_id_up_len =
+		wdev->u.mesh.id_up_len =
 			nla_len(info->attrs[NL80211_ATTR_MESH_ID]);
-		memcpy(wdev->ssid, nla_data(info->attrs[NL80211_ATTR_MESH_ID]),
-		       wdev->mesh_id_up_len);
+		memcpy(wdev->u.mesh.id,
+		       nla_data(info->attrs[NL80211_ATTR_MESH_ID]),
+		       wdev->u.mesh.id_up_len);
 		wdev_unlock(wdev);
 		break;
 	case NL80211_IFTYPE_NAN:
@@ -4662,7 +4703,7 @@ static int nl80211_set_mac_acl(struct sk_buff *skb, struct genl_info *info)
 	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
 		return -EOPNOTSUPP;
 
-	if (!dev->ieee80211_ptr->beacon_interval)
+	if (!dev->ieee80211_ptr->links[0].ap.beacon_interval)
 		return -EINVAL;
 
 	acl = parse_acl_data(&rdev->wiphy, info);
@@ -4818,14 +4859,24 @@ static void he_build_mcs_mask(u16 he_mcs_map,
 	}
 }
 
-static u16 he_get_txmcsmap(struct genl_info *info,
+static u16 he_get_txmcsmap(struct genl_info *info, unsigned int link_id,
 			   const struct ieee80211_sta_he_cap *he_cap)
 {
 	struct net_device *dev = info->user_ptr[1];
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
-	__le16	tx_mcs;
+	struct cfg80211_chan_def *chandef;
+	__le16 tx_mcs;
 
-	switch (wdev->chandef.width) {
+	chandef = wdev_chandef(wdev, link_id);
+	if (!chandef) {
+		/*
+		 * This is probably broken, but we never maintained
+		 * a chandef in these cases, so it always was.
+		 */
+		return le16_to_cpu(he_cap->he_mcs_nss_supp.tx_mcs_80);
+	}
+
+	switch (chandef->width) {
 	case NL80211_CHAN_WIDTH_80P80:
 		tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_80p80;
 		break;
@@ -4836,6 +4887,7 @@ static u16 he_get_txmcsmap(struct genl_info *info,
 		tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_80;
 		break;
 	}
+
 	return le16_to_cpu(tx_mcs);
 }
 
@@ -4843,7 +4895,8 @@ static bool he_set_mcs_mask(struct genl_info *info,
 			    struct wireless_dev *wdev,
 			    struct ieee80211_supported_band *sband,
 			    struct nl80211_txrate_he *txrate,
-			    u16 mcs[NL80211_HE_NSS_MAX])
+			    u16 mcs[NL80211_HE_NSS_MAX],
+			    unsigned int link_id)
 {
 	const struct ieee80211_sta_he_cap *he_cap;
 	u16 tx_mcs_mask[NL80211_HE_NSS_MAX] = {};
@@ -4856,7 +4909,7 @@ static bool he_set_mcs_mask(struct genl_info *info,
 
 	memset(mcs, 0, sizeof(u16) * NL80211_HE_NSS_MAX);
 
-	tx_mcs_map = he_get_txmcsmap(info, he_cap);
+	tx_mcs_map = he_get_txmcsmap(info, link_id, he_cap);
 
 	/* Build he_mcs_mask from HE capabilities */
 	he_build_mcs_mask(tx_mcs_map, tx_mcs_mask);
@@ -4876,7 +4929,8 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info,
 					 enum nl80211_attrs attr,
 					 struct cfg80211_bitrate_mask *mask,
 					 struct net_device *dev,
-					 bool default_all_enabled)
+					 bool default_all_enabled,
+					 unsigned int link_id)
 {
 	struct nlattr *tb[NL80211_TXRATE_MAX + 1];
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
@@ -4913,7 +4967,7 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info,
 		if (!he_cap)
 			continue;
 
-		he_tx_mcs_map = he_get_txmcsmap(info, he_cap);
+		he_tx_mcs_map = he_get_txmcsmap(info, link_id, he_cap);
 		he_build_mcs_mask(he_tx_mcs_map, mask->control[i].he_mcs);
 
 		mask->control[i].he_gi = 0xFF;
@@ -4978,7 +5032,8 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info,
 		if (tb[NL80211_TXRATE_HE] &&
 		    !he_set_mcs_mask(info, wdev, sband,
 				     nla_data(tb[NL80211_TXRATE_HE]),
-				     mask->control[band].he_mcs))
+				     mask->control[band].he_mcs,
+				     link_id))
 			return -EINVAL;
 
 		if (tb[NL80211_TXRATE_HE_GI])
@@ -5215,6 +5270,8 @@ static int nl80211_parse_beacon(struct cfg80211_registered_device *rdev,
 
 	memset(bcn, 0, sizeof(*bcn));
 
+	bcn->link_id = nl80211_link_id(attrs);
+
 	if (attrs[NL80211_ATTR_BEACON_HEAD]) {
 		bcn->head = nla_data(attrs[NL80211_ATTR_BEACON_HEAD]);
 		bcn->head_len = nla_len(attrs[NL80211_ATTR_BEACON_HEAD]);
@@ -5468,22 +5525,20 @@ static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev,
 				   struct cfg80211_ap_settings *params)
 {
 	struct wireless_dev *wdev;
-	bool ret = false;
 
 	list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
 		if (wdev->iftype != NL80211_IFTYPE_AP &&
 		    wdev->iftype != NL80211_IFTYPE_P2P_GO)
 			continue;
 
-		if (!wdev->preset_chandef.chan)
+		if (!wdev->u.ap.preset_chandef.chan)
 			continue;
 
-		params->chandef = wdev->preset_chandef;
-		ret = true;
-		break;
+		params->chandef = wdev->u.ap.preset_chandef;
+		return true;
 	}
 
-	return ret;
+	return false;
 }
 
 static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev,
@@ -5541,6 +5596,7 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev,
 static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	unsigned int link_id = nl80211_link_id(info->attrs);
 	struct net_device *dev = info->user_ptr[1];
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct cfg80211_ap_settings *params;
@@ -5553,7 +5609,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 	if (!rdev->ops->start_ap)
 		return -EOPNOTSUPP;
 
-	if (wdev->beacon_interval)
+	if (wdev->links[link_id].ap.beacon_interval)
 		return -EALREADY;
 
 	/* these are required for START_AP */
@@ -5595,6 +5651,18 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 			err = -EINVAL;
 			goto out;
 		}
+
+		if (wdev->u.ap.ssid_len &&
+		    (wdev->u.ap.ssid_len != params->ssid_len ||
+		     memcmp(wdev->u.ap.ssid, params->ssid, params->ssid_len))) {
+			/* require identical SSID for MLO */
+			err = -EINVAL;
+			goto out;
+		}
+	} else if (wdev->valid_links) {
+		/* require SSID for MLO */
+		err = -EINVAL;
+		goto out;
 	}
 
 	if (info->attrs[NL80211_ATTR_HIDDEN_SSID])
@@ -5662,8 +5730,12 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 		err = nl80211_parse_chandef(rdev, info, &params->chandef);
 		if (err)
 			goto out;
-	} else if (wdev->preset_chandef.chan) {
-		params->chandef = wdev->preset_chandef;
+	} else if (wdev->valid_links) {
+		/* with MLD need to specify the channel configuration */
+		err = -EINVAL;
+		goto out;
+	} else if (wdev->u.ap.preset_chandef.chan) {
+		params->chandef = wdev->u.ap.preset_chandef;
 	} else if (!nl80211_get_ap_channel(rdev, params)) {
 		err = -EINVAL;
 		goto out;
@@ -5679,7 +5751,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 		err = nl80211_parse_tx_bitrate_mask(info, info->attrs,
 						    NL80211_ATTR_TX_RATES,
 						    &params->beacon_rate,
-						    dev, false);
+						    dev, false, link_id);
 		if (err)
 			goto out;
 
@@ -5779,19 +5851,28 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 		params->flags |= NL80211_AP_SETTINGS_EXTERNAL_AUTH_SUPPORT;
 
 	wdev_lock(wdev);
+	if (wdev->conn_owner_nlportid &&
+	    info->attrs[NL80211_ATTR_SOCKET_OWNER] &&
+	    wdev->conn_owner_nlportid != info->snd_portid) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* FIXME: validate MLO/link-id against driver capabilities */
+
 	err = rdev_start_ap(rdev, dev, params);
 	if (!err) {
-		wdev->preset_chandef = params->chandef;
-		wdev->beacon_interval = params->beacon_interval;
-		wdev->chandef = params->chandef;
-		wdev->ssid_len = params->ssid_len;
-		memcpy(wdev->ssid, params->ssid, wdev->ssid_len);
+		wdev->links[link_id].ap.beacon_interval = params->beacon_interval;
+		wdev->links[link_id].ap.chandef = params->chandef;
+		wdev->u.ap.ssid_len = params->ssid_len;
+		memcpy(wdev->u.ap.ssid, params->ssid,
+		       params->ssid_len);
 
 		if (info->attrs[NL80211_ATTR_SOCKET_OWNER])
 			wdev->conn_owner_nlportid = info->snd_portid;
 	}
+out_unlock:
 	wdev_unlock(wdev);
-
 out:
 	kfree(params->acl);
 	kfree(params->beacon.mbssid_ies);
@@ -5807,6 +5888,7 @@ out:
 static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	unsigned int link_id = nl80211_link_id(info->attrs);
 	struct net_device *dev = info->user_ptr[1];
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct cfg80211_beacon_data params;
@@ -5819,7 +5901,7 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info)
 	if (!rdev->ops->change_beacon)
 		return -EOPNOTSUPP;
 
-	if (!wdev->beacon_interval)
+	if (!wdev->links[link_id].ap.beacon_interval)
 		return -EINVAL;
 
 	err = nl80211_parse_beacon(rdev, info->attrs, &params);
@@ -5838,9 +5920,10 @@ out:
 static int nl80211_stop_ap(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	unsigned int link_id = nl80211_link_id(info->attrs);
 	struct net_device *dev = info->user_ptr[1];
 
-	return cfg80211_stop_ap(rdev, dev, false);
+	return cfg80211_stop_ap(rdev, dev, link_id, false);
 }
 
 static const struct nla_policy sta_flags_policy[NL80211_STA_FLAG_MAX + 1] = {
@@ -7590,7 +7673,7 @@ static int nl80211_get_mesh_config(struct sk_buff *skb,
 
 	wdev_lock(wdev);
 	/* If not connected, get default parameters */
-	if (!wdev->mesh_id_len)
+	if (!wdev->u.mesh.id_len)
 		memcpy(&cur_params, &default_mesh_config, sizeof(cur_params));
 	else
 		err = rdev_get_mesh_config(rdev, dev, &cur_params);
@@ -7971,7 +8054,7 @@ static int nl80211_update_mesh_config(struct sk_buff *skb,
 		return err;
 
 	wdev_lock(wdev);
-	if (!wdev->mesh_id_len)
+	if (!wdev->u.mesh.id_len)
 		err = -ENOLINK;
 
 	if (!err)
@@ -8463,14 +8546,44 @@ int nl80211_parse_random_mac(struct nlattr **attrs,
 	return 0;
 }
 
-static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev)
+static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev,
+					      struct ieee80211_channel *chan)
 {
+	unsigned int link_id;
+	bool all_ok = true;
+
 	ASSERT_WDEV_LOCK(wdev);
 
 	if (!cfg80211_beaconing_iface_active(wdev))
 		return true;
 
-	if (!(wdev->chandef.chan->flags & IEEE80211_CHAN_RADAR))
+	/*
+	 * FIXME: check if we have a free HW resource/link for chan
+	 *
+	 * This, as well as the FIXME below, requires knowing the link
+	 * capabilities of the hardware.
+	 */
+
+	/* we cannot leave radar channels */
+	for_each_valid_link(wdev, link_id) {
+		struct cfg80211_chan_def *chandef;
+
+		chandef = wdev_chandef(wdev, link_id);
+		if (!chandef)
+			continue;
+
+		/*
+		 * FIXME: don't require all_ok, but rather check only the
+		 *	  correct HW resource/link onto which 'chan' falls,
+		 *	  as only that link leaves the channel for doing
+		 *	  the off-channel operation.
+		 */
+
+		if (chandef->chan->flags & IEEE80211_CHAN_RADAR)
+			all_ok = false;
+	}
+
+	if (all_ok)
 		return true;
 
 	return regulatory_pre_cac_allowed(wdev->wiphy);
@@ -8553,7 +8666,7 @@ nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev,
 		int err;
 
 		if (!(wiphy->features & randomness_flag) ||
-		    (wdev && wdev->current_bss))
+		    (wdev && wdev->connected))
 			return -EOPNOTSUPP;
 
 		err = nl80211_parse_random_mac(attrs, mac_addr, mac_addr_mask);
@@ -8690,17 +8803,14 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
 	request->n_channels = i;
 
 	wdev_lock(wdev);
-	if (!cfg80211_off_channel_oper_allowed(wdev)) {
-		struct ieee80211_channel *chan;
+	for (i = 0; i < request->n_channels; i++) {
+		struct ieee80211_channel *chan = request->channels[i];
 
-		if (request->n_channels != 1) {
-			wdev_unlock(wdev);
-			err = -EBUSY;
-			goto out_free;
-		}
+		/* if we can go off-channel to the target channel we're good */
+		if (cfg80211_off_channel_oper_allowed(wdev, chan))
+			continue;
 
-		chan = request->channels[0];
-		if (chan->center_freq != wdev->chandef.chan->center_freq) {
+		if (!cfg80211_wdev_on_sub_chan(wdev, chan, true)) {
 			wdev_unlock(wdev);
 			err = -EBUSY;
 			goto out_free;
@@ -9445,7 +9555,7 @@ static int nl80211_start_radar_detection(struct sk_buff *skb,
 
 	err = rdev_start_radar_detection(rdev, dev, &chandef, cac_time_ms);
 	if (!err) {
-		wdev->chandef = chandef;
+		wdev->links[0].ap.chandef = chandef;
 		wdev->cac_started = true;
 		wdev->cac_start_time = jiffies;
 		wdev->cac_time_ms = cac_time_ms;
@@ -9513,6 +9623,7 @@ static int nl80211_notify_radar_detection(struct sk_buff *skb,
 static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	unsigned int link_id = nl80211_link_id(info->attrs);
 	struct net_device *dev = info->user_ptr[1];
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct cfg80211_csa_settings params;
@@ -9539,15 +9650,15 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
 		need_handle_dfs_flag = false;
 
 		/* useless if AP is not running */
-		if (!wdev->beacon_interval)
+		if (!wdev->links[link_id].ap.beacon_interval)
 			return -ENOTCONN;
 		break;
 	case NL80211_IFTYPE_ADHOC:
-		if (!wdev->ssid_len)
+		if (!wdev->u.ibss.ssid_len)
 			return -ENOTCONN;
 		break;
 	case NL80211_IFTYPE_MESH_POINT:
-		if (!wdev->mesh_id_len)
+		if (!wdev->u.mesh.id_len)
 			return -ENOTCONN;
 		break;
 	default:
@@ -9718,6 +9829,7 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
 {
 	struct cfg80211_bss *res = &intbss->pub;
 	const struct cfg80211_bss_ies *ies;
+	unsigned int link_id;
 	void *hdr;
 	struct nlattr *bss;
 
@@ -9822,13 +9934,15 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
 	switch (wdev->iftype) {
 	case NL80211_IFTYPE_P2P_CLIENT:
 	case NL80211_IFTYPE_STATION:
-		if (intbss == wdev->current_bss &&
-		    nla_put_u32(msg, NL80211_BSS_STATUS,
-				NL80211_BSS_STATUS_ASSOCIATED))
-			goto nla_put_failure;
+		for_each_valid_link(wdev, link_id) {
+			if (intbss == wdev->links[link_id].client.current_bss &&
+			    nla_put_u32(msg, NL80211_BSS_STATUS,
+					NL80211_BSS_STATUS_ASSOCIATED))
+				goto nla_put_failure;
+		}
 		break;
 	case NL80211_IFTYPE_ADHOC:
-		if (intbss == wdev->current_bss &&
+		if (intbss == wdev->u.ibss.current_bss &&
 		    nla_put_u32(msg, NL80211_BSS_STATUS,
 				NL80211_BSS_STATUS_IBSS_JOINED))
 			goto nla_put_failure;
@@ -11362,7 +11476,7 @@ static int nl80211_update_connect_params(struct sk_buff *skb,
 	}
 
 	wdev_lock(dev->ieee80211_ptr);
-	if (!wdev->current_bss)
+	if (!wdev->connected)
 		ret = -ENOLINK;
 	else
 		ret = rdev_update_connect_params(rdev, dev, &connect, changed);
@@ -11575,9 +11689,9 @@ static int nl80211_remain_on_channel(struct sk_buff *skb,
 				     struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	unsigned int link_id = nl80211_link_id(info->attrs);
 	struct wireless_dev *wdev = info->user_ptr[1];
 	struct cfg80211_chan_def chandef;
-	const struct cfg80211_chan_def *compat_chandef;
 	struct sk_buff *msg;
 	void *hdr;
 	u64 cookie;
@@ -11607,10 +11721,22 @@ static int nl80211_remain_on_channel(struct sk_buff *skb,
 		return err;
 
 	wdev_lock(wdev);
-	if (!cfg80211_off_channel_oper_allowed(wdev) &&
-	    !cfg80211_chandef_identical(&wdev->chandef, &chandef)) {
-		compat_chandef = cfg80211_chandef_compatible(&wdev->chandef,
-							     &chandef);
+	if (!cfg80211_off_channel_oper_allowed(wdev, chandef.chan)) {
+		const struct cfg80211_chan_def *oper_chandef, *compat_chandef;
+
+		oper_chandef = wdev_chandef(wdev, link_id);
+
+		if (WARN_ON(!oper_chandef)) {
+			/* cannot happen since we must beacon to get here */
+			WARN_ON(1);
+			wdev_unlock(wdev);
+			return -EBUSY;
+		}
+
+		/* note: returns first one if identical chandefs */
+		compat_chandef = cfg80211_chandef_compatible(&chandef,
+							     oper_chandef);
+
 		if (compat_chandef != &chandef) {
 			wdev_unlock(wdev);
 			return -EBUSY;
@@ -11672,6 +11798,7 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb,
 				       struct genl_info *info)
 {
 	struct cfg80211_bitrate_mask mask;
+	unsigned int link_id = nl80211_link_id(info->attrs);
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct net_device *dev = info->user_ptr[1];
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
@@ -11683,11 +11810,11 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb,
 	wdev_lock(wdev);
 	err = nl80211_parse_tx_bitrate_mask(info, info->attrs,
 					    NL80211_ATTR_TX_RATES, &mask,
-					    dev, true);
+					    dev, true, link_id);
 	if (err)
 		goto out;
 
-	err = rdev_set_bitrate_mask(rdev, dev, NULL, &mask);
+	err = rdev_set_bitrate_mask(rdev, dev, link_id, NULL, &mask);
 out:
 	wdev_unlock(wdev);
 	return err;
@@ -11812,7 +11939,8 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info)
 		return -EINVAL;
 
 	wdev_lock(wdev);
-	if (params.offchan && !cfg80211_off_channel_oper_allowed(wdev)) {
+	if (params.offchan &&
+	    !cfg80211_off_channel_oper_allowed(wdev, chandef.chan)) {
 		wdev_unlock(wdev);
 		return -EBUSY;
 	}
@@ -12030,12 +12158,13 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev,
 	 * connection is established and enough beacons received to calculate
 	 * the average.
 	 */
-	if (!wdev->cqm_config->last_rssi_event_value && wdev->current_bss &&
+	if (!wdev->cqm_config->last_rssi_event_value &&
+	    wdev->links[0].client.current_bss &&
 	    rdev->ops->get_station) {
 		struct station_info sinfo = {};
 		u8 *mac_addr;
 
-		mac_addr = wdev->current_bss->pub.bssid;
+		mac_addr = wdev->links[0].client.current_bss->pub.bssid;
 
 		err = rdev_get_station(rdev, dev, mac_addr, &sinfo);
 		if (err)
@@ -12298,7 +12427,7 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info)
 		err = nl80211_parse_tx_bitrate_mask(info, info->attrs,
 						    NL80211_ATTR_TX_RATES,
 						    &setup.beacon_rate,
-						    dev, false);
+						    dev, false, 0);
 		if (err)
 			return err;
 
@@ -13268,7 +13397,7 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info)
 		rekey_data.akm = nla_get_u32(tb[NL80211_REKEY_DATA_AKM]);
 
 	wdev_lock(wdev);
-	if (!wdev->current_bss) {
+	if (!wdev->connected) {
 		err = -ENOTCONN;
 		goto out;
 	}
@@ -14537,7 +14666,7 @@ static int nl80211_add_tx_ts(struct sk_buff *skb, struct genl_info *info)
 	switch (wdev->iftype) {
 	case NL80211_IFTYPE_STATION:
 	case NL80211_IFTYPE_P2P_CLIENT:
-		if (wdev->current_bss)
+		if (wdev->connected)
 			break;
 		err = -ENOTCONN;
 		goto out;
@@ -14710,13 +14839,13 @@ static int nl80211_set_pmk(struct sk_buff *skb, struct genl_info *info)
 		return -EINVAL;
 
 	wdev_lock(wdev);
-	if (!wdev->current_bss) {
+	if (!wdev->connected) {
 		ret = -ENOTCONN;
 		goto out;
 	}
 
 	pmk_conf.aa = nla_data(info->attrs[NL80211_ATTR_MAC]);
-	if (memcmp(pmk_conf.aa, wdev->current_bss->pub.bssid, ETH_ALEN)) {
+	if (memcmp(pmk_conf.aa, wdev->u.client.connected_addr, ETH_ALEN)) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -14844,9 +14973,13 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info)
 	case NL80211_IFTYPE_MESH_POINT:
 		break;
 	case NL80211_IFTYPE_ADHOC:
+		if (wdev->u.ibss.current_bss)
+			break;
+		err = -ENOTCONN;
+		goto out;
 	case NL80211_IFTYPE_STATION:
 	case NL80211_IFTYPE_P2P_CLIENT:
-		if (wdev->current_bss)
+		if (wdev->connected)
 			break;
 		err = -ENOTCONN;
 		goto out;
@@ -14882,12 +15015,14 @@ static int nl80211_get_ftm_responder_stats(struct sk_buff *skb,
 	struct net_device *dev = info->user_ptr[1];
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct cfg80211_ftm_responder_stats ftm_stats = {};
+	unsigned int link_id = nl80211_link_id(info->attrs);
 	struct sk_buff *msg;
 	void *hdr;
 	struct nlattr *ftm_stats_attr;
 	int err;
 
-	if (wdev->iftype != NL80211_IFTYPE_AP || !wdev->beacon_interval)
+	if (wdev->iftype != NL80211_IFTYPE_AP ||
+	    !wdev->links[link_id].ap.beacon_interval)
 		return -EOPNOTSUPP;
 
 	err = rdev_get_ftm_responder_stats(rdev, dev, &ftm_stats);
@@ -15017,7 +15152,8 @@ static int nl80211_probe_mesh_link(struct sk_buff *skb, struct genl_info *info)
 static int parse_tid_conf(struct cfg80211_registered_device *rdev,
 			  struct nlattr *attrs[], struct net_device *dev,
 			  struct cfg80211_tid_cfg *tid_conf,
-			  struct genl_info *info, const u8 *peer)
+			  struct genl_info *info, const u8 *peer,
+			  unsigned int link_id)
 {
 	struct netlink_ext_ack *extack = info->extack;
 	u64 mask;
@@ -15092,7 +15228,7 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev,
 			attr = NL80211_TID_CONFIG_ATTR_TX_RATE;
 			err = nl80211_parse_tx_bitrate_mask(info, attrs, attr,
 						    &tid_conf->txrate_mask, dev,
-						    true);
+						    true, link_id);
 			if (err)
 				return err;
 
@@ -15119,6 +15255,7 @@ static int nl80211_set_tid_config(struct sk_buff *skb,
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct nlattr *attrs[NL80211_TID_CONFIG_ATTR_MAX + 1];
+	unsigned int link_id = nl80211_link_id(info->attrs);
 	struct net_device *dev = info->user_ptr[1];
 	struct cfg80211_tid_config *tid_config;
 	struct nlattr *tid;
@@ -15156,7 +15293,7 @@ static int nl80211_set_tid_config(struct sk_buff *skb,
 
 		ret = parse_tid_conf(rdev, attrs, dev,
 				     &tid_config->tid_conf[conf_idx],
-				     info, tid_config->peer);
+				     info, tid_config->peer, link_id);
 		if (ret)
 			goto bad_tid_conf;
 
@@ -15295,6 +15432,62 @@ static int nl80211_set_fils_aad(struct sk_buff *skb,
 	return rdev_set_fils_aad(rdev, dev, &fils_aad);
 }
 
+static int nl80211_add_link(struct sk_buff *skb, struct genl_info *info)
+{
+	unsigned int link_id = nl80211_link_id(info->attrs);
+	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	if (!(wdev->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO))
+		return -EINVAL;
+
+	switch (wdev->iftype) {
+	case NL80211_IFTYPE_AP:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (!info->attrs[NL80211_ATTR_MAC] ||
+	    !is_valid_ether_addr(nla_data(info->attrs[NL80211_ATTR_MAC])))
+		return -EINVAL;
+
+	wdev_lock(wdev);
+	wdev->valid_links |= BIT(link_id);
+	ether_addr_copy(wdev->links[link_id].addr,
+			nla_data(info->attrs[NL80211_ATTR_MAC]));
+	wdev_unlock(wdev);
+
+	return 0;
+}
+
+static int nl80211_remove_link(struct sk_buff *skb, struct genl_info *info)
+{
+	unsigned int link_id = nl80211_link_id(info->attrs);
+	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	/* cannot remove if there's no link */
+	if (!info->attrs[NL80211_ATTR_MLO_LINK_ID])
+		return -EINVAL;
+
+	switch (wdev->iftype) {
+	case NL80211_IFTYPE_AP:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* FIXME: stop the link operations first */
+
+	wdev_lock(wdev);
+	wdev->valid_links &= ~BIT(link_id);
+	eth_zero_addr(wdev->links[link_id].addr);
+	wdev_unlock(wdev);
+
+	return 0;
+}
+
 #define NL80211_FLAG_NEED_WIPHY		0x01
 #define NL80211_FLAG_NEED_NETDEV	0x02
 #define NL80211_FLAG_NEED_RTNL		0x04
@@ -15307,6 +15500,8 @@ static int nl80211_set_fils_aad(struct sk_buff *skb,
 					 NL80211_FLAG_CHECK_NETDEV_UP)
 #define NL80211_FLAG_CLEAR_SKB		0x20
 #define NL80211_FLAG_NO_WIPHY_MTX	0x40
+#define NL80211_FLAG_MLO_VALID_LINK_ID	0x80
+#define NL80211_FLAG_MLO_UNSUPPORTED	0x100
 
 #define INTERNAL_FLAG_SELECTORS(__sel)			\
 	SELECTOR(__sel, NONE, 0) /* must be first */	\
@@ -15316,6 +15511,12 @@ static int nl80211_set_fils_aad(struct sk_buff *skb,
 		 NL80211_FLAG_NEED_WDEV)		\
 	SELECTOR(__sel, NETDEV,				\
 		 NL80211_FLAG_NEED_NETDEV)		\
+	SELECTOR(__sel, NETDEV_LINK,			\
+		 NL80211_FLAG_NEED_NETDEV |		\
+		 NL80211_FLAG_MLO_VALID_LINK_ID)	\
+	SELECTOR(__sel, NETDEV_NO_MLO,			\
+		 NL80211_FLAG_NEED_NETDEV |		\
+		 NL80211_FLAG_MLO_UNSUPPORTED)	\
 	SELECTOR(__sel, WIPHY_RTNL,			\
 		 NL80211_FLAG_NEED_WIPHY |		\
 		 NL80211_FLAG_NEED_RTNL)		\
@@ -15331,14 +15532,31 @@ static int nl80211_set_fils_aad(struct sk_buff *skb,
 		 NL80211_FLAG_NEED_RTNL)		\
 	SELECTOR(__sel, NETDEV_UP,			\
 		 NL80211_FLAG_NEED_NETDEV_UP)		\
+	SELECTOR(__sel, NETDEV_UP_LINK,			\
+		 NL80211_FLAG_NEED_NETDEV_UP |		\
+		 NL80211_FLAG_MLO_VALID_LINK_ID)	\
+	SELECTOR(__sel, NETDEV_UP_NO_MLO,		\
+		 NL80211_FLAG_NEED_NETDEV_UP |		\
+		 NL80211_FLAG_MLO_UNSUPPORTED)		\
+	SELECTOR(__sel, NETDEV_UP_NO_MLO_CLEAR,		\
+		 NL80211_FLAG_NEED_NETDEV_UP |		\
+		 NL80211_FLAG_CLEAR_SKB |		\
+		 NL80211_FLAG_MLO_UNSUPPORTED)		\
 	SELECTOR(__sel, NETDEV_UP_NOTMX,		\
 		 NL80211_FLAG_NEED_NETDEV_UP |		\
 		 NL80211_FLAG_NO_WIPHY_MTX)		\
+	SELECTOR(__sel, NETDEV_UP_NOTMX_NOMLO,		\
+		 NL80211_FLAG_NEED_NETDEV_UP |		\
+		 NL80211_FLAG_NO_WIPHY_MTX |		\
+		 NL80211_FLAG_MLO_UNSUPPORTED)		\
 	SELECTOR(__sel, NETDEV_UP_CLEAR,		\
 		 NL80211_FLAG_NEED_NETDEV_UP |		\
 		 NL80211_FLAG_CLEAR_SKB)		\
 	SELECTOR(__sel, WDEV_UP,			\
 		 NL80211_FLAG_NEED_WDEV_UP)		\
+	SELECTOR(__sel, WDEV_UP_LINK,			\
+		 NL80211_FLAG_NEED_WDEV_UP |		\
+		 NL80211_FLAG_MLO_VALID_LINK_ID)	\
 	SELECTOR(__sel, WDEV_UP_RTNL,			\
 		 NL80211_FLAG_NEED_WDEV_UP |		\
 		 NL80211_FLAG_NEED_RTNL)		\
@@ -15362,9 +15580,10 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
 			    struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = NULL;
-	struct wireless_dev *wdev;
-	struct net_device *dev;
+	struct wireless_dev *wdev = NULL;
+	struct net_device *dev = NULL;
 	u32 internal_flags;
+	int err;
 
 	if (WARN_ON(ops->internal_flags >= ARRAY_SIZE(nl80211_internal_flags)))
 		return -EINVAL;
@@ -15375,8 +15594,8 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
 	if (internal_flags & NL80211_FLAG_NEED_WIPHY) {
 		rdev = cfg80211_get_dev_from_info(genl_info_net(info), info);
 		if (IS_ERR(rdev)) {
-			rtnl_unlock();
-			return PTR_ERR(rdev);
+			err = PTR_ERR(rdev);
+			goto out_unlock;
 		}
 		info->user_ptr[0] = rdev;
 	} else if (internal_flags & NL80211_FLAG_NEED_NETDEV ||
@@ -15384,17 +15603,18 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
 		wdev = __cfg80211_wdev_from_attrs(NULL, genl_info_net(info),
 						  info->attrs);
 		if (IS_ERR(wdev)) {
-			rtnl_unlock();
-			return PTR_ERR(wdev);
+			err = PTR_ERR(wdev);
+			goto out_unlock;
 		}
 
 		dev = wdev->netdev;
+		dev_hold(dev);
 		rdev = wiphy_to_rdev(wdev->wiphy);
 
 		if (internal_flags & NL80211_FLAG_NEED_NETDEV) {
 			if (!dev) {
-				rtnl_unlock();
-				return -EINVAL;
+				err = -EINVAL;
+				goto out_unlock;
 			}
 
 			info->user_ptr[1] = dev;
@@ -15404,14 +15624,44 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
 
 		if (internal_flags & NL80211_FLAG_CHECK_NETDEV_UP &&
 		    !wdev_running(wdev)) {
-			rtnl_unlock();
-			return -ENETDOWN;
+			err = -ENETDOWN;
+			goto out_unlock;
 		}
 
-		dev_hold(dev);
 		info->user_ptr[0] = rdev;
 	}
 
+	if (internal_flags & NL80211_FLAG_MLO_VALID_LINK_ID) {
+		struct nlattr *link_id = info->attrs[NL80211_ATTR_MLO_LINK_ID];
+
+		if (!wdev) {
+			err = -EINVAL;
+			goto out_unlock;
+		}
+
+		/* MLO -> require valid link ID */
+		if (wdev->valid_links &&
+		    (!link_id ||
+		     !(wdev->valid_links & BIT(nla_get_u16(link_id))))) {
+			err = -EINVAL;
+			goto out_unlock;
+		}
+
+		/* non-MLO -> no link ID attribute accepted */
+		if (!wdev->valid_links && link_id) {
+			err = -EINVAL;
+			goto out_unlock;
+		}
+	}
+
+	if (internal_flags & NL80211_FLAG_MLO_UNSUPPORTED) {
+		if (info->attrs[NL80211_ATTR_MLO_LINK_ID] ||
+		    (wdev && wdev->valid_links)) {
+			err = -EINVAL;
+			goto out_unlock;
+		}
+	}
+
 	if (rdev && !(internal_flags & NL80211_FLAG_NO_WIPHY_MTX)) {
 		wiphy_lock(&rdev->wiphy);
 		/* we keep the mutex locked until post_doit */
@@ -15421,6 +15671,10 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
 		rtnl_unlock();
 
 	return 0;
+out_unlock:
+	rtnl_unlock();
+	dev_put(dev);
+	return err;
 }
 
 static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
@@ -15636,6 +15890,7 @@ static const struct genl_small_ops nl80211_small_ops[] = {
 		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.doit = nl80211_set_key,
 		.flags = GENL_UNS_ADMIN_PERM,
+		/* cannot use NL80211_FLAG_MLO_VALID_LINK_ID, depends on key */
 		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
 					 NL80211_FLAG_CLEAR_SKB),
 	},
@@ -15659,21 +15914,24 @@ static const struct genl_small_ops nl80211_small_ops[] = {
 		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.flags = GENL_UNS_ADMIN_PERM,
 		.doit = nl80211_set_beacon,
-		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
+		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
+					 NL80211_FLAG_MLO_VALID_LINK_ID),
 	},
 	{
 		.cmd = NL80211_CMD_START_AP,
 		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.flags = GENL_UNS_ADMIN_PERM,
 		.doit = nl80211_start_ap,
-		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
+		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
+					 NL80211_FLAG_MLO_VALID_LINK_ID),
 	},
 	{
 		.cmd = NL80211_CMD_STOP_AP,
 		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.flags = GENL_UNS_ADMIN_PERM,
 		.doit = nl80211_stop_ap,
-		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
+		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
+					 NL80211_FLAG_MLO_VALID_LINK_ID),
 	},
 	{
 		.cmd = NL80211_CMD_GET_STATION,
@@ -15939,7 +16197,9 @@ static const struct genl_small_ops nl80211_small_ops[] = {
 		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.doit = nl80211_remain_on_channel,
 		.flags = GENL_UNS_ADMIN_PERM,
-		.internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP),
+		/* FIXME: requiring a link ID here is probably not good */
+		.internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP |
+					 NL80211_FLAG_MLO_VALID_LINK_ID),
 	},
 	{
 		.cmd = NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL,
@@ -15953,7 +16213,8 @@ static const struct genl_small_ops nl80211_small_ops[] = {
 		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.doit = nl80211_set_tx_bitrate_mask,
 		.flags = GENL_UNS_ADMIN_PERM,
-		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV),
+		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV |
+					 NL80211_FLAG_MLO_VALID_LINK_ID),
 	},
 	{
 		.cmd = NL80211_CMD_REGISTER_FRAME,
@@ -16002,7 +16263,8 @@ static const struct genl_small_ops nl80211_small_ops[] = {
 		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.doit = nl80211_set_channel,
 		.flags = GENL_UNS_ADMIN_PERM,
-		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV),
+		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV |
+					 NL80211_FLAG_MLO_VALID_LINK_ID),
 	},
 	{
 		.cmd = NL80211_CMD_JOIN_MESH,
@@ -16163,7 +16425,8 @@ static const struct genl_small_ops nl80211_small_ops[] = {
 		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.doit = nl80211_set_mac_acl,
 		.flags = GENL_UNS_ADMIN_PERM,
-		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV),
+		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV |
+					 NL80211_FLAG_MLO_UNSUPPORTED),
 	},
 	{
 		.cmd = NL80211_CMD_RADAR_DETECT,
@@ -16171,7 +16434,8 @@ static const struct genl_small_ops nl80211_small_ops[] = {
 		.doit = nl80211_start_radar_detection,
 		.flags = GENL_UNS_ADMIN_PERM,
 		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
-					 NL80211_FLAG_NO_WIPHY_MTX),
+					 NL80211_FLAG_NO_WIPHY_MTX |
+					 NL80211_FLAG_MLO_UNSUPPORTED),
 	},
 	{
 		.cmd = NL80211_CMD_GET_PROTOCOL_FEATURES,
@@ -16217,7 +16481,8 @@ static const struct genl_small_ops nl80211_small_ops[] = {
 		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.doit = nl80211_channel_switch,
 		.flags = GENL_UNS_ADMIN_PERM,
-		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
+		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
+					 NL80211_FLAG_MLO_VALID_LINK_ID),
 	},
 	{
 		.cmd = NL80211_CMD_VENDOR,
@@ -16240,7 +16505,8 @@ static const struct genl_small_ops nl80211_small_ops[] = {
 		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.doit = nl80211_add_tx_ts,
 		.flags = GENL_UNS_ADMIN_PERM,
-		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
+		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
+					 NL80211_FLAG_MLO_UNSUPPORTED),
 	},
 	{
 		.cmd = NL80211_CMD_DEL_TX_TS,
@@ -16301,7 +16567,8 @@ static const struct genl_small_ops nl80211_small_ops[] = {
 		.cmd = NL80211_CMD_GET_FTM_RESPONDER_STATS,
 		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.doit = nl80211_get_ftm_responder_stats,
-		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV),
+		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV |
+					 NL80211_FLAG_MLO_VALID_LINK_ID),
 	},
 	{
 		.cmd = NL80211_CMD_PEER_MEASUREMENT_START,
@@ -16333,7 +16600,8 @@ static const struct genl_small_ops nl80211_small_ops[] = {
 		.cmd = NL80211_CMD_SET_TID_CONFIG,
 		.doit = nl80211_set_tid_config,
 		.flags = GENL_UNS_ADMIN_PERM,
-		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV),
+		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV |
+					 NL80211_FLAG_MLO_VALID_LINK_ID),
 	},
 	{
 		.cmd = NL80211_CMD_SET_SAR_SPECS,
@@ -16357,6 +16625,19 @@ static const struct genl_small_ops nl80211_small_ops[] = {
 		.flags = GENL_UNS_ADMIN_PERM,
 		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
 	},
+	{
+		.cmd = NL80211_CMD_ADD_LINK,
+		.doit = nl80211_add_link,
+		.flags = GENL_UNS_ADMIN_PERM,
+		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
+	},
+	{
+		.cmd = NL80211_CMD_REMOVE_LINK,
+		.doit = nl80211_remove_link,
+		.flags = GENL_UNS_ADMIN_PERM,
+		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
+					 NL80211_FLAG_MLO_VALID_LINK_ID),
+	},
 };
 
 static struct genl_family nl80211_fam __ro_after_init = {
@@ -17984,23 +18265,37 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev,
 }
 
 void cfg80211_ch_switch_notify(struct net_device *dev,
-			       struct cfg80211_chan_def *chandef)
+			       struct cfg80211_chan_def *chandef,
+			       unsigned int link_id)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct wiphy *wiphy = wdev->wiphy;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
 
 	ASSERT_WDEV_LOCK(wdev);
+	WARN_INVALID_LINK_ID(wdev, link_id);
 
-	trace_cfg80211_ch_switch_notify(dev, chandef);
-
-	wdev->chandef = *chandef;
-	wdev->preset_chandef = *chandef;
+	trace_cfg80211_ch_switch_notify(dev, chandef, link_id);
 
-	if ((wdev->iftype == NL80211_IFTYPE_STATION ||
-	     wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) &&
-	    !WARN_ON(!wdev->current_bss))
-		cfg80211_update_assoc_bss_entry(wdev, chandef->chan);
+	switch (wdev->iftype) {
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_P2P_CLIENT:
+		if (!WARN_ON(!wdev->links[link_id].client.current_bss))
+			cfg80211_update_assoc_bss_entry(wdev, link_id,
+							chandef->chan);
+		break;
+	case NL80211_IFTYPE_MESH_POINT:
+		wdev->u.mesh.chandef = *chandef;
+		wdev->u.mesh.preset_chandef = *chandef;
+		break;
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_P2P_GO:
+		wdev->links[link_id].ap.chandef = *chandef;
+		break;
+	default:
+		WARN_ON(1);
+		break;
+	}
 
 	cfg80211_sched_dfs_chan_update(rdev);
 
diff --git a/net/wireless/ocb.c b/net/wireless/ocb.c
index 2d26a6d980bf..27a1732264f9 100644
--- a/net/wireless/ocb.c
+++ b/net/wireless/ocb.c
@@ -4,6 +4,7 @@
  *
  * Copyright: (c) 2014 Czech Technical University in Prague
  *            (c) 2014 Volkswagen Group Research
+ * Copyright (C) 2022 Intel Corporation
  * Author:    Rostislav Lisovy <rostislav.lisovy@fel.cvut.cz>
  * Funded by: Volkswagen Group Research
  */
@@ -34,7 +35,7 @@ int __cfg80211_join_ocb(struct cfg80211_registered_device *rdev,
 
 	err = rdev_join_ocb(rdev, dev, setup);
 	if (!err)
-		wdev->chandef = setup->chandef;
+		wdev->u.ocb.chandef = setup->chandef;
 
 	return err;
 }
@@ -69,7 +70,7 @@ int __cfg80211_leave_ocb(struct cfg80211_registered_device *rdev,
 
 	err = rdev_leave_ocb(rdev, dev);
 	if (!err)
-		memset(&wdev->chandef, 0, sizeof(wdev->chandef));
+		memset(&wdev->u.ocb.chandef, 0, sizeof(wdev->u.ocb.chandef));
 
 	return err;
 }
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 439bcf52369c..d2300eff03ae 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -1,4 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Portions of this file
+ * Copyright(c) 2016-2017 Intel Deutschland GmbH
+ * Copyright (C) 2018, 2021-2022 Intel Corporation
+ */
 #ifndef __CFG80211_RDEV_OPS
 #define __CFG80211_RDEV_OPS
 
@@ -172,11 +177,11 @@ static inline int rdev_change_beacon(struct cfg80211_registered_device *rdev,
 }
 
 static inline int rdev_stop_ap(struct cfg80211_registered_device *rdev,
-			       struct net_device *dev)
+			       struct net_device *dev, unsigned int link_id)
 {
 	int ret;
-	trace_rdev_stop_ap(&rdev->wiphy, dev);
-	ret = rdev->ops->stop_ap(&rdev->wiphy, dev);
+	trace_rdev_stop_ap(&rdev->wiphy, dev, link_id);
+	ret = rdev->ops->stop_ap(&rdev->wiphy, dev, link_id);
 	trace_rdev_return_int(&rdev->wiphy, ret);
 	return ret;
 }
@@ -651,12 +656,14 @@ static inline int rdev_testmode_dump(struct cfg80211_registered_device *rdev,
 
 static inline int
 rdev_set_bitrate_mask(struct cfg80211_registered_device *rdev,
-		      struct net_device *dev, const u8 *peer,
+		      struct net_device *dev, unsigned int link_id,
+		      const u8 *peer,
 		      const struct cfg80211_bitrate_mask *mask)
 {
 	int ret;
-	trace_rdev_set_bitrate_mask(&rdev->wiphy, dev, peer, mask);
-	ret = rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, peer, mask);
+	trace_rdev_set_bitrate_mask(&rdev->wiphy, dev, link_id, peer, mask);
+	ret = rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, link_id,
+					  peer, mask);
 	trace_rdev_return_int(&rdev->wiphy, ret);
 	return ret;
 }
@@ -944,12 +951,13 @@ static inline int rdev_set_noack_map(struct cfg80211_registered_device *rdev,
 static inline int
 rdev_get_channel(struct cfg80211_registered_device *rdev,
 		 struct wireless_dev *wdev,
+		 unsigned int link_id,
 		 struct cfg80211_chan_def *chandef)
 {
 	int ret;
 
-	trace_rdev_get_channel(&rdev->wiphy, wdev);
-	ret = rdev->ops->get_channel(&rdev->wiphy, wdev, chandef);
+	trace_rdev_get_channel(&rdev->wiphy, wdev, link_id);
+	ret = rdev->ops->get_channel(&rdev->wiphy, wdev, link_id, chandef);
 	trace_rdev_return_chandef(&rdev->wiphy, ret, chandef);
 
 	return ret;
@@ -1107,12 +1115,14 @@ static inline int rdev_set_qos_map(struct cfg80211_registered_device *rdev,
 
 static inline int
 rdev_set_ap_chanwidth(struct cfg80211_registered_device *rdev,
-		      struct net_device *dev, struct cfg80211_chan_def *chandef)
+		      struct net_device *dev,
+		      unsigned int link_id,
+		      struct cfg80211_chan_def *chandef)
 {
 	int ret;
 
-	trace_rdev_set_ap_chanwidth(&rdev->wiphy, dev, chandef);
-	ret = rdev->ops->set_ap_chanwidth(&rdev->wiphy, dev, chandef);
+	trace_rdev_set_ap_chanwidth(&rdev->wiphy, dev, link_id, chandef);
+	ret = rdev->ops->set_ap_chanwidth(&rdev->wiphy, dev, link_id, chandef);
 	trace_rdev_return_int(&rdev->wiphy, ret);
 
 	return ret;
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 58e83ce642ad..c7383ede794f 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -5,7 +5,7 @@
  * Copyright 2008-2011	Luis R. Rodriguez <mcgrof@qca.qualcomm.com>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright      2017  Intel Deutschland GmbH
- * Copyright (C) 2018 - 2021 Intel Corporation
+ * Copyright (C) 2018 - 2022 Intel Corporation
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -2370,6 +2370,7 @@ static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev)
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
 	enum nl80211_iftype iftype;
 	bool ret;
+	int link;
 
 	wdev_lock(wdev);
 	iftype = wdev->iftype;
@@ -2378,62 +2379,83 @@ static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev)
 	if (!wdev->netdev || !netif_running(wdev->netdev))
 		goto wdev_inactive_unlock;
 
-	switch (iftype) {
-	case NL80211_IFTYPE_AP:
-	case NL80211_IFTYPE_P2P_GO:
-	case NL80211_IFTYPE_MESH_POINT:
-		if (!wdev->beacon_interval)
-			goto wdev_inactive_unlock;
-		chandef = wdev->chandef;
-		break;
-	case NL80211_IFTYPE_ADHOC:
-		if (!wdev->ssid_len)
-			goto wdev_inactive_unlock;
-		chandef = wdev->chandef;
-		break;
-	case NL80211_IFTYPE_STATION:
-	case NL80211_IFTYPE_P2P_CLIENT:
-		if (!wdev->current_bss ||
-		    !wdev->current_bss->pub.channel)
-			goto wdev_inactive_unlock;
-
-		if (!rdev->ops->get_channel ||
-		    rdev_get_channel(rdev, wdev, &chandef))
-			cfg80211_chandef_create(&chandef,
-						wdev->current_bss->pub.channel,
-						NL80211_CHAN_NO_HT);
-		break;
-	case NL80211_IFTYPE_MONITOR:
-	case NL80211_IFTYPE_AP_VLAN:
-	case NL80211_IFTYPE_P2P_DEVICE:
-		/* no enforcement required */
-		break;
-	default:
-		/* others not implemented for now */
-		WARN_ON(1);
-		break;
-	}
+	for (link = 0; link < ARRAY_SIZE(wdev->links); link++) {
+		struct ieee80211_channel *chan;
 
-	wdev_unlock(wdev);
+		if (!wdev->valid_links && link > 0)
+			break;
+		if (!(wdev->valid_links & BIT(link)))
+			continue;
+		switch (iftype) {
+		case NL80211_IFTYPE_AP:
+		case NL80211_IFTYPE_P2P_GO:
+		case NL80211_IFTYPE_MESH_POINT:
+			if (!wdev->u.mesh.beacon_interval)
+				continue;
+			chandef = wdev->u.mesh.chandef;
+			break;
+		case NL80211_IFTYPE_ADHOC:
+			if (!wdev->u.ibss.ssid_len)
+				continue;
+			chandef = wdev->u.ibss.chandef;
+			break;
+		case NL80211_IFTYPE_STATION:
+		case NL80211_IFTYPE_P2P_CLIENT:
+			/* Maybe we could consider disabling that link only? */
+			if (!wdev->links[link].client.current_bss)
+				continue;
 
-	switch (iftype) {
-	case NL80211_IFTYPE_AP:
-	case NL80211_IFTYPE_P2P_GO:
-	case NL80211_IFTYPE_ADHOC:
-	case NL80211_IFTYPE_MESH_POINT:
-		wiphy_lock(wiphy);
-		ret = cfg80211_reg_can_beacon_relax(wiphy, &chandef, iftype);
-		wiphy_unlock(wiphy);
+			chan = wdev->links[link].client.current_bss->pub.channel;
+			if (!chan)
+				continue;
 
-		return ret;
-	case NL80211_IFTYPE_STATION:
-	case NL80211_IFTYPE_P2P_CLIENT:
-		return cfg80211_chandef_usable(wiphy, &chandef,
-					       IEEE80211_CHAN_DISABLED);
-	default:
-		break;
+			if (!rdev->ops->get_channel ||
+			    rdev_get_channel(rdev, wdev, link, &chandef))
+				cfg80211_chandef_create(&chandef, chan,
+							NL80211_CHAN_NO_HT);
+			break;
+		case NL80211_IFTYPE_MONITOR:
+		case NL80211_IFTYPE_AP_VLAN:
+		case NL80211_IFTYPE_P2P_DEVICE:
+			/* no enforcement required */
+			break;
+		default:
+			/* others not implemented for now */
+			WARN_ON(1);
+			break;
+		}
+
+		wdev_unlock(wdev);
+
+		switch (iftype) {
+		case NL80211_IFTYPE_AP:
+		case NL80211_IFTYPE_P2P_GO:
+		case NL80211_IFTYPE_ADHOC:
+		case NL80211_IFTYPE_MESH_POINT:
+			wiphy_lock(wiphy);
+			ret = cfg80211_reg_can_beacon_relax(wiphy, &chandef,
+							    iftype);
+			wiphy_unlock(wiphy);
+
+			if (!ret)
+				return ret;
+			break;
+		case NL80211_IFTYPE_STATION:
+		case NL80211_IFTYPE_P2P_CLIENT:
+			ret = cfg80211_chandef_usable(wiphy, &chandef,
+						      IEEE80211_CHAN_DISABLED);
+			if (!ret)
+				return ret;
+			break;
+		default:
+			break;
+		}
+
+		wdev_lock(wdev);
 	}
 
+	wdev_unlock(wdev);
+
 	return true;
 
 wdev_inactive_unlock:
@@ -4215,8 +4237,17 @@ static void cfg80211_check_and_end_cac(struct cfg80211_registered_device *rdev)
 	 * In both cases we should end the CAC on the wdev.
 	 */
 	list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
-		if (wdev->cac_started &&
-		    !cfg80211_chandef_dfs_usable(&rdev->wiphy, &wdev->chandef))
+		struct cfg80211_chan_def *chandef;
+
+		if (!wdev->cac_started)
+			continue;
+
+		/* FIXME: radar detection is tied to link 0 for now */
+		chandef = wdev_chandef(wdev, 0);
+		if (!chandef)
+			continue;
+
+		if (!cfg80211_chandef_dfs_usable(&rdev->wiphy, chandef))
 			rdev_end_cac(rdev, wdev->netdev);
 	}
 }
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 6d82bd9eaf8c..0134e5d5c81a 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -5,7 +5,7 @@
  * Copyright 2008 Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright 2016	Intel Deutschland GmbH
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
  */
 #include <linux/kernel.h>
 #include <linux/slab.h>
@@ -2617,7 +2617,8 @@ void cfg80211_bss_iter(struct wiphy *wiphy,
 	spin_lock_bh(&rdev->bss_lock);
 
 	list_for_each_entry(bss, &rdev->bss_list, list) {
-		if (!chandef || cfg80211_is_sub_chan(chandef, bss->pub.channel))
+		if (!chandef || cfg80211_is_sub_chan(chandef, bss->pub.channel,
+						     false))
 			iter(wiphy, &bss->pub, iter_data);
 	}
 
@@ -2626,11 +2627,12 @@ void cfg80211_bss_iter(struct wiphy *wiphy,
 EXPORT_SYMBOL(cfg80211_bss_iter);
 
 void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev,
+				     unsigned int link_id,
 				     struct ieee80211_channel *chan)
 {
 	struct wiphy *wiphy = wdev->wiphy;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
-	struct cfg80211_internal_bss *cbss = wdev->current_bss;
+	struct cfg80211_internal_bss *cbss = wdev->links[link_id].client.current_bss;
 	struct cfg80211_internal_bss *new = NULL;
 	struct cfg80211_internal_bss *bss;
 	struct cfg80211_bss *nontrans_bss;
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index ff4d48fcbfb2..35602201057b 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -5,7 +5,7 @@
  * (for nl80211's connect() and wext)
  *
  * Copyright 2009	Johannes Berg <johannes@sipsolutions.net>
- * Copyright (C) 2009, 2020 Intel Corporation. All rights reserved.
+ * Copyright (C) 2009, 2020, 2022 Intel Corporation. All rights reserved.
  * Copyright 2017	Intel Deutschland GmbH
  */
 
@@ -454,6 +454,20 @@ void cfg80211_sme_abandon_assoc(struct wireless_dev *wdev)
 	schedule_work(&rdev->conn_work);
 }
 
+static void cfg80211_wdev_release_bsses(struct wireless_dev *wdev)
+{
+	unsigned int link;
+
+	for_each_valid_link(wdev, link) {
+		if (!wdev->links[link].client.current_bss)
+			continue;
+		cfg80211_unhold_bss(wdev->links[link].client.current_bss);
+		cfg80211_put_bss(wdev->wiphy,
+				 &wdev->links[link].client.current_bss->pub);
+		wdev->links[link].client.current_bss = NULL;
+	}
+}
+
 static int cfg80211_sme_get_conn_ies(struct wireless_dev *wdev,
 				     const u8 *ies, size_t ies_len,
 				     const u8 **out_ies, size_t *out_ies_len)
@@ -521,12 +535,11 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev,
 	if (!rdev->ops->auth || !rdev->ops->assoc)
 		return -EOPNOTSUPP;
 
-	if (wdev->current_bss) {
-		cfg80211_unhold_bss(wdev->current_bss);
-		cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub);
-		wdev->current_bss = NULL;
+	cfg80211_wdev_release_bsses(wdev);
 
+	if (wdev->connected) {
 		cfg80211_sme_free(wdev);
+		wdev->connected = false;
 	}
 
 	if (wdev->conn)
@@ -563,8 +576,8 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev,
 		wdev->conn->auto_auth = false;
 	}
 
-	wdev->conn->params.ssid = wdev->ssid;
-	wdev->conn->params.ssid_len = wdev->ssid_len;
+	wdev->conn->params.ssid = wdev->u.client.ssid;
+	wdev->conn->params.ssid_len = wdev->u.client.ssid_len;
 
 	/* see if we have the bss already */
 	bss = cfg80211_get_conn_bss(wdev);
@@ -648,7 +661,7 @@ static bool cfg80211_is_all_idle(void)
 	list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
 		list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
 			wdev_lock(wdev);
-			if (wdev->conn || wdev->current_bss ||
+			if (wdev->conn || wdev->connected ||
 			    cfg80211_beaconing_iface_active(wdev))
 				is_all_idle = false;
 			wdev_unlock(wdev);
@@ -668,7 +681,6 @@ static void disconnect_work(struct work_struct *work)
 
 DECLARE_WORK(cfg80211_disconnect_work, disconnect_work);
 
-
 /*
  * API calls for drivers implementing connect/disconnect and
  * SME event handling
@@ -729,23 +741,19 @@ void __cfg80211_connect_result(struct net_device *dev,
 	if (!cr->bss && (cr->status == WLAN_STATUS_SUCCESS)) {
 		WARN_ON_ONCE(!wiphy_to_rdev(wdev->wiphy)->ops->connect);
 		cr->bss = cfg80211_get_bss(wdev->wiphy, NULL, cr->bssid,
-					   wdev->ssid, wdev->ssid_len,
+					   wdev->u.client.ssid, wdev->u.client.ssid_len,
 					   wdev->conn_bss_type,
 					   IEEE80211_PRIVACY_ANY);
 		if (cr->bss)
 			cfg80211_hold_bss(bss_from_pub(cr->bss));
 	}
 
-	if (wdev->current_bss) {
-		cfg80211_unhold_bss(wdev->current_bss);
-		cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub);
-		wdev->current_bss = NULL;
-	}
+	cfg80211_wdev_release_bsses(wdev);
 
 	if (cr->status != WLAN_STATUS_SUCCESS) {
 		kfree_sensitive(wdev->connect_keys);
 		wdev->connect_keys = NULL;
-		wdev->ssid_len = 0;
+		wdev->u.client.ssid_len = 0;
 		wdev->conn_owner_nlportid = 0;
 		if (cr->bss) {
 			cfg80211_unhold_bss(bss_from_pub(cr->bss));
@@ -758,7 +766,9 @@ void __cfg80211_connect_result(struct net_device *dev,
 	if (WARN_ON(!cr->bss))
 		return;
 
-	wdev->current_bss = bss_from_pub(cr->bss);
+	wdev->links[0].client.current_bss = bss_from_pub(cr->bss);
+	wdev->connected = true;
+	ether_addr_copy(wdev->u.client.connected_addr, cr->bss->bssid);
 
 	if (!(wdev->wiphy->flags & WIPHY_FLAG_HAS_STATIC_WEP))
 		cfg80211_upload_connect_keys(wdev);
@@ -801,7 +811,7 @@ void cfg80211_connect_done(struct net_device *dev,
 
 			found = cfg80211_get_bss(wdev->wiphy, NULL,
 						 params->bss->bssid,
-						 wdev->ssid, wdev->ssid_len,
+						 wdev->u.client.ssid, wdev->u.client.ssid_len,
 						 wdev->conn_bss_type,
 						 IEEE80211_PRIVACY_ANY);
 			if (found) {
@@ -906,18 +916,17 @@ void __cfg80211_roamed(struct wireless_dev *wdev,
 		    wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
 		goto out;
 
-	if (WARN_ON(!wdev->current_bss))
+	if (WARN_ON(!wdev->connected))
 		goto out;
 
-	cfg80211_unhold_bss(wdev->current_bss);
-	cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub);
-	wdev->current_bss = NULL;
+	cfg80211_wdev_release_bsses(wdev);
 
 	if (WARN_ON(!info->bss))
 		return;
 
 	cfg80211_hold_bss(bss_from_pub(info->bss));
-	wdev->current_bss = bss_from_pub(info->bss);
+	wdev->links[0].client.current_bss = bss_from_pub(info->bss);
+	ether_addr_copy(wdev->u.client.connected_addr, info->bss->bssid);
 
 	wdev->unprot_beacon_reported = 0;
 	nl80211_send_roamed(wiphy_to_rdev(wdev->wiphy),
@@ -963,8 +972,8 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info,
 
 	if (!info->bss) {
 		info->bss = cfg80211_get_bss(wdev->wiphy, info->channel,
-					     info->bssid, wdev->ssid,
-					     wdev->ssid_len,
+					     info->bssid, wdev->u.client.ssid,
+					     wdev->u.client.ssid_len,
 					     wdev->conn_bss_type,
 					     IEEE80211_PRIVACY_ANY);
 	}
@@ -1034,8 +1043,8 @@ void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid)
 	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION))
 		return;
 
-	if (WARN_ON(!wdev->current_bss) ||
-	    WARN_ON(!ether_addr_equal(wdev->current_bss->pub.bssid, bssid)))
+	if (WARN_ON(!wdev->connected) ||
+	    WARN_ON(!ether_addr_equal(wdev->u.client.connected_addr, bssid)))
 		return;
 
 	nl80211_send_port_authorized(wiphy_to_rdev(wdev->wiphy), wdev->netdev,
@@ -1087,13 +1096,9 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
 		    wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
 		return;
 
-	if (wdev->current_bss) {
-		cfg80211_unhold_bss(wdev->current_bss);
-		cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub);
-	}
-
-	wdev->current_bss = NULL;
-	wdev->ssid_len = 0;
+	cfg80211_wdev_release_bsses(wdev);
+	wdev->connected = false;
+	wdev->u.client.ssid_len = 0;
 	wdev->conn_owner_nlportid = 0;
 	kfree_sensitive(wdev->connect_keys);
 	wdev->connect_keys = NULL;
@@ -1182,19 +1187,20 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev,
 	 * already connected, so reject a new SSID unless it's the
 	 * same (which is the case for re-association.)
 	 */
-	if (wdev->ssid_len &&
-	    (wdev->ssid_len != connect->ssid_len ||
-	     memcmp(wdev->ssid, connect->ssid, wdev->ssid_len)))
+	if (wdev->u.client.ssid_len &&
+	    (wdev->u.client.ssid_len != connect->ssid_len ||
+	     memcmp(wdev->u.client.ssid, connect->ssid, wdev->u.client.ssid_len)))
 		return -EALREADY;
 
 	/*
 	 * If connected, reject (re-)association unless prev_bssid
 	 * matches the current BSSID.
 	 */
-	if (wdev->current_bss) {
+	if (wdev->connected) {
 		if (!prev_bssid)
 			return -EALREADY;
-		if (!ether_addr_equal(prev_bssid, wdev->current_bss->pub.bssid))
+		if (!ether_addr_equal(prev_bssid,
+				      wdev->u.client.connected_addr))
 			return -ENOTCONN;
 	}
 
@@ -1245,8 +1251,8 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev,
 	}
 
 	wdev->connect_keys = connkeys;
-	memcpy(wdev->ssid, connect->ssid, connect->ssid_len);
-	wdev->ssid_len = connect->ssid_len;
+	memcpy(wdev->u.client.ssid, connect->ssid, connect->ssid_len);
+	wdev->u.client.ssid_len = connect->ssid_len;
 
 	wdev->conn_bss_type = connect->pbss ? IEEE80211_BSS_TYPE_PBSS :
 					      IEEE80211_BSS_TYPE_ESS;
@@ -1262,8 +1268,8 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev,
 		 * This could be reassoc getting refused, don't clear
 		 * ssid_len in that case.
 		 */
-		if (!wdev->current_bss)
-			wdev->ssid_len = 0;
+		if (!wdev->connected)
+			wdev->u.client.ssid_len = 0;
 		return err;
 	}
 
@@ -1287,7 +1293,7 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
 		err = cfg80211_sme_disconnect(wdev, reason);
 	else if (!rdev->ops->disconnect)
 		cfg80211_mlme_down(rdev, dev);
-	else if (wdev->ssid_len)
+	else if (wdev->u.client.ssid_len)
 		err = rdev_disconnect(rdev, dev, reason);
 
 	/*
@@ -1295,8 +1301,8 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
 	 * in which case cfg80211_disconnected() will take care of
 	 * this later.
 	 */
-	if (!wdev->current_bss)
-		wdev->ssid_len = 0;
+	if (!wdev->connected)
+		wdev->u.client.ssid_len = 0;
 
 	return err;
 }
@@ -1320,7 +1326,7 @@ void cfg80211_autodisconnect_wk(struct work_struct *work)
 			break;
 		case NL80211_IFTYPE_AP:
 		case NL80211_IFTYPE_P2P_GO:
-			__cfg80211_stop_ap(rdev, wdev->netdev, false);
+			__cfg80211_stop_ap(rdev, wdev->netdev, -1, false);
 			break;
 		case NL80211_IFTYPE_MESH_POINT:
 			__cfg80211_leave_mesh(rdev, wdev->netdev);
@@ -1332,7 +1338,7 @@ void cfg80211_autodisconnect_wk(struct work_struct *work)
 			 * ops->disconnect not implemented.  Otherwise we can
 			 * use cfg80211_disconnect.
 			 */
-			if (rdev->ops->disconnect || wdev->current_bss)
+			if (rdev->ops->disconnect || wdev->connected)
 				cfg80211_disconnect(rdev, wdev->netdev,
 						    WLAN_REASON_DEAUTH_LEAVING,
 						    true);
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 228079d7690a..3b2c956b8d78 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -569,6 +569,7 @@ TRACE_EVENT(rdev_start_ap,
 		__field(bool, privacy)
 		__field(enum nl80211_auth_type, auth_type)
 		__field(int, inactivity_timeout)
+		__field(unsigned int, link_id)
 	),
 	TP_fast_assign(
 		WIPHY_ASSIGN;
@@ -583,16 +584,17 @@ TRACE_EVENT(rdev_start_ap,
 		__entry->inactivity_timeout = settings->inactivity_timeout;
 		memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1);
 		memcpy(__entry->ssid, settings->ssid, settings->ssid_len);
+		__entry->link_id = settings->beacon.link_id;
 	),
 	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", AP settings - ssid: %s, "
 		  CHAN_DEF_PR_FMT ", beacon interval: %d, dtim period: %d, "
 		  "hidden ssid: %d, wpa versions: %u, privacy: %s, "
-		  "auth type: %d, inactivity timeout: %d",
+		  "auth type: %d, inactivity timeout: %d, link_id: %d",
 		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->ssid, CHAN_DEF_PR_ARG,
 		  __entry->beacon_interval, __entry->dtim_period,
 		  __entry->hidden_ssid, __entry->wpa_ver,
 		  BOOL_TO_STR(__entry->privacy), __entry->auth_type,
-		  __entry->inactivity_timeout)
+		  __entry->inactivity_timeout, __entry->link_id)
 );
 
 TRACE_EVENT(rdev_change_beacon,
@@ -602,6 +604,7 @@ TRACE_EVENT(rdev_change_beacon,
 	TP_STRUCT__entry(
 		WIPHY_ENTRY
 		NETDEV_ENTRY
+		__field(int, link_id)
 		__dynamic_array(u8, head, info ? info->head_len : 0)
 		__dynamic_array(u8, tail, info ? info->tail_len : 0)
 		__dynamic_array(u8, beacon_ies, info ? info->beacon_ies_len : 0)
@@ -615,6 +618,7 @@ TRACE_EVENT(rdev_change_beacon,
 		WIPHY_ASSIGN;
 		NETDEV_ASSIGN;
 		if (info) {
+			__entry->link_id = info->link_id;
 			if (info->head)
 				memcpy(__get_dynamic_array(head), info->head,
 				       info->head_len);
@@ -635,9 +639,30 @@ TRACE_EVENT(rdev_change_beacon,
 			if (info->probe_resp)
 				memcpy(__get_dynamic_array(probe_resp),
 				       info->probe_resp, info->probe_resp_len);
+		} else {
+			__entry->link_id = -1;
 		}
 	),
-	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG)
+	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id:%d",
+		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id)
+);
+
+TRACE_EVENT(rdev_stop_ap,
+	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+		 unsigned int link_id),
+	TP_ARGS(wiphy, netdev, link_id),
+	TP_STRUCT__entry(
+		WIPHY_ENTRY
+		NETDEV_ENTRY
+		__field(unsigned int, link_id)
+	),
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		NETDEV_ASSIGN;
+		__entry->link_id = link_id;
+	),
+	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d",
+		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id)
 );
 
 DECLARE_EVENT_CLASS(wiphy_netdev_evt,
@@ -654,11 +679,6 @@ DECLARE_EVENT_CLASS(wiphy_netdev_evt,
 	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG)
 );
 
-DEFINE_EVENT(wiphy_netdev_evt, rdev_stop_ap,
-	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
-	TP_ARGS(wiphy, netdev)
-);
-
 DEFINE_EVENT(wiphy_netdev_evt, rdev_set_rekey_data,
 	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
 	TP_ARGS(wiphy, netdev)
@@ -1619,20 +1639,24 @@ TRACE_EVENT(rdev_testmode_dump,
 
 TRACE_EVENT(rdev_set_bitrate_mask,
 	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+		 unsigned int link_id,
 		 const u8 *peer, const struct cfg80211_bitrate_mask *mask),
-	TP_ARGS(wiphy, netdev, peer, mask),
+	TP_ARGS(wiphy, netdev, link_id, peer, mask),
 	TP_STRUCT__entry(
 		WIPHY_ENTRY
 		NETDEV_ENTRY
+		__field(unsigned int, link_id)
 		MAC_ENTRY(peer)
 	),
 	TP_fast_assign(
 		WIPHY_ASSIGN;
 		NETDEV_ASSIGN;
+		__entry->link_id = link_id;
 		MAC_ASSIGN(peer, peer);
 	),
-	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT,
-		  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer))
+	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, peer: " MAC_PR_FMT,
+		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id,
+		  MAC_PR_ARG(peer))
 );
 
 TRACE_EVENT(rdev_update_mgmt_frame_registrations,
@@ -2040,9 +2064,22 @@ TRACE_EVENT(rdev_set_noack_map,
 		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->noack_map)
 );
 
-DEFINE_EVENT(wiphy_wdev_evt, rdev_get_channel,
-	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
-	TP_ARGS(wiphy, wdev)
+TRACE_EVENT(rdev_get_channel,
+	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
+		 unsigned int link_id),
+	TP_ARGS(wiphy, wdev, link_id),
+	TP_STRUCT__entry(
+		WIPHY_ENTRY
+		WDEV_ENTRY
+		__field(unsigned int, link_id)
+	),
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		WDEV_ASSIGN;
+		__entry->link_id = link_id;
+	),
+	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", link_id: %u",
+		  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id)
 );
 
 TRACE_EVENT(rdev_return_chandef,
@@ -2296,20 +2333,24 @@ TRACE_EVENT(rdev_set_qos_map,
 
 TRACE_EVENT(rdev_set_ap_chanwidth,
 	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+		 unsigned int link_id,
 		 struct cfg80211_chan_def *chandef),
-	TP_ARGS(wiphy, netdev, chandef),
+	TP_ARGS(wiphy, netdev, link_id, chandef),
 	TP_STRUCT__entry(
 		WIPHY_ENTRY
 		NETDEV_ENTRY
 		CHAN_DEF_ENTRY
+		__field(unsigned int, link_id)
 	),
 	TP_fast_assign(
 		WIPHY_ASSIGN;
 		NETDEV_ASSIGN;
 		CHAN_DEF_ASSIGN(chandef);
+		__entry->link_id = link_id;
 	),
-	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT,
-		  WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG)
+	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%d",
+		  WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG,
+		  __entry->link_id)
 );
 
 TRACE_EVENT(rdev_add_tx_ts,
@@ -3022,18 +3063,21 @@ TRACE_EVENT(cfg80211_chandef_dfs_required,
 
 TRACE_EVENT(cfg80211_ch_switch_notify,
 	TP_PROTO(struct net_device *netdev,
-		 struct cfg80211_chan_def *chandef),
-	TP_ARGS(netdev, chandef),
+		 struct cfg80211_chan_def *chandef,
+		 unsigned int link_id),
+	TP_ARGS(netdev, chandef, link_id),
 	TP_STRUCT__entry(
 		NETDEV_ENTRY
 		CHAN_DEF_ENTRY
+		__field(unsigned int, link_id)
 	),
 	TP_fast_assign(
 		NETDEV_ASSIGN;
 		CHAN_DEF_ASSIGN(chandef);
+		__entry->link_id = link_id;
 	),
-	TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT,
-		  NETDEV_PR_ARG, CHAN_DEF_PR_ARG)
+	TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%d",
+		  NETDEV_PR_ARG, CHAN_DEF_PR_ARG, __entry->link_id)
 );
 
 TRACE_EVENT(cfg80211_ch_switch_started_notify,
diff --git a/net/wireless/util.c b/net/wireless/util.c
index a60d7d638e72..b7257862e0fe 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -5,7 +5,7 @@
  * Copyright 2007-2009	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright 2017	Intel Deutschland GmbH
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
  */
 #include <linux/export.h>
 #include <linux/bitops.h>
@@ -1041,7 +1041,6 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
 			return -EBUSY;
 
 		dev->ieee80211_ptr->use_4addr = false;
-		dev->ieee80211_ptr->mesh_id_up_len = 0;
 		wdev_lock(dev->ieee80211_ptr);
 		rdev_set_qos_map(rdev, dev, NULL);
 		wdev_unlock(dev->ieee80211_ptr);
@@ -1049,7 +1048,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
 		switch (otype) {
 		case NL80211_IFTYPE_AP:
 		case NL80211_IFTYPE_P2P_GO:
-			cfg80211_stop_ap(rdev, dev, true);
+			cfg80211_stop_ap(rdev, dev, -1, true);
 			break;
 		case NL80211_IFTYPE_ADHOC:
 			cfg80211_leave_ibss(rdev, dev, false);
@@ -1073,6 +1072,11 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
 
 		cfg80211_process_rdev_events(rdev);
 		cfg80211_mlme_purge_registrations(dev->ieee80211_ptr);
+
+		memset(&dev->ieee80211_ptr->u, 0,
+		       sizeof(dev->ieee80211_ptr->u));
+		memset(&dev->ieee80211_ptr->links, 0,
+		       sizeof(dev->ieee80211_ptr->links));
 	}
 
 	err = rdev_change_virtual_intf(rdev, dev, ntype, params);
@@ -1930,6 +1934,24 @@ bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef,
 }
 EXPORT_SYMBOL(ieee80211_chandef_to_operating_class);
 
+static int cfg80211_wdev_bi(struct wireless_dev *wdev)
+{
+	switch (wdev->iftype) {
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_P2P_GO:
+		WARN_ON(wdev->valid_links);
+		return wdev->links[0].ap.beacon_interval;
+	case NL80211_IFTYPE_MESH_POINT:
+		return wdev->u.mesh.beacon_interval;
+	case NL80211_IFTYPE_ADHOC:
+		return wdev->u.ibss.beacon_interval;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
 static void cfg80211_calculate_bi_data(struct wiphy *wiphy, u32 new_beacon_int,
 				       u32 *beacon_int_gcd,
 				       bool *beacon_int_different)
@@ -1940,19 +1962,27 @@ static void cfg80211_calculate_bi_data(struct wiphy *wiphy, u32 new_beacon_int,
 	*beacon_int_different = false;
 
 	list_for_each_entry(wdev, &wiphy->wdev_list, list) {
-		if (!wdev->beacon_interval)
+		int wdev_bi;
+
+		/* this feature isn't supported with MLO */
+		if (wdev->valid_links)
+			continue;
+
+		wdev_bi = cfg80211_wdev_bi(wdev);
+
+		if (!wdev_bi)
 			continue;
 
 		if (!*beacon_int_gcd) {
-			*beacon_int_gcd = wdev->beacon_interval;
+			*beacon_int_gcd = wdev_bi;
 			continue;
 		}
 
-		if (wdev->beacon_interval == *beacon_int_gcd)
+		if (wdev_bi == *beacon_int_gcd)
 			continue;
 
 		*beacon_int_different = true;
-		*beacon_int_gcd = gcd(*beacon_int_gcd, wdev->beacon_interval);
+		*beacon_int_gcd = gcd(*beacon_int_gcd, wdev_bi);
 	}
 
 	if (new_beacon_int && *beacon_int_gcd != new_beacon_int) {
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index a32065d600a1..a9767bfe7330 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -7,7 +7,7 @@
  * we directly assign the wireless handlers of wireless interfaces.
  *
  * Copyright 2008-2009	Johannes Berg <johannes@sipsolutions.net>
- * Copyright (C) 2019-2021 Intel Corporation
+ * Copyright (C) 2019-2022 Intel Corporation
  */
 
 #include <linux/export.h>
@@ -415,6 +415,9 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
 	int err, i;
 	bool rejoin = false;
 
+	if (wdev->valid_links)
+		return -EINVAL;
+
 	if (pairwise && !addr)
 		return -EINVAL;
 
@@ -437,7 +440,7 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
 		return -EOPNOTSUPP;
 
 	if (params->cipher == WLAN_CIPHER_SUITE_AES_CMAC) {
-		if (!wdev->current_bss)
+		if (!wdev->connected)
 			return -ENOLINK;
 
 		if (!rdev->ops->set_default_mgmt_key)
@@ -450,7 +453,9 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
 
 	if (remove) {
 		err = 0;
-		if (wdev->current_bss) {
+		if (wdev->connected ||
+		    (wdev->iftype == NL80211_IFTYPE_ADHOC &&
+		     wdev->u.ibss.current_bss)) {
 			/*
 			 * If removing the current TX key, we will need to
 			 * join a new IBSS without the privacy bit clear.
@@ -501,7 +506,9 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
 		return -EINVAL;
 
 	err = 0;
-	if (wdev->current_bss)
+	if (wdev->connected ||
+	    (wdev->iftype == NL80211_IFTYPE_ADHOC &&
+	     wdev->u.ibss.current_bss))
 		err = rdev_add_key(rdev, dev, idx, pairwise, addr, params);
 	else if (params->cipher != WLAN_CIPHER_SUITE_WEP40 &&
 		 params->cipher != WLAN_CIPHER_SUITE_WEP104)
@@ -526,7 +533,9 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
 	if ((params->cipher == WLAN_CIPHER_SUITE_WEP40 ||
 	     params->cipher == WLAN_CIPHER_SUITE_WEP104) &&
 	    (tx_key || (!addr && wdev->wext.default_key == -1))) {
-		if (wdev->current_bss) {
+		if (wdev->connected ||
+		    (wdev->iftype == NL80211_IFTYPE_ADHOC &&
+		     wdev->u.ibss.current_bss)) {
 			/*
 			 * If we are getting a new TX key from not having
 			 * had one before we need to join a new IBSS with
@@ -549,7 +558,9 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
 
 	if (params->cipher == WLAN_CIPHER_SUITE_AES_CMAC &&
 	    (tx_key || (!addr && wdev->wext.default_mgmt_key == -1))) {
-		if (wdev->current_bss)
+		if (wdev->connected ||
+		    (wdev->iftype == NL80211_IFTYPE_ADHOC &&
+		     wdev->u.ibss.current_bss))
 			err = rdev_set_default_mgmt_key(rdev, dev, idx);
 		if (!err)
 			wdev->wext.default_mgmt_key = idx;
@@ -595,6 +606,11 @@ static int cfg80211_wext_siwencode(struct net_device *dev,
 		return -EOPNOTSUPP;
 
 	wiphy_lock(&rdev->wiphy);
+	if (wdev->valid_links) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
 	idx = erq->flags & IW_ENCODE_INDEX;
 	if (idx == 0) {
 		idx = wdev->wext.default_key;
@@ -613,7 +629,9 @@ static int cfg80211_wext_siwencode(struct net_device *dev,
 		/* No key data - just set the default TX key index */
 		err = 0;
 		wdev_lock(wdev);
-		if (wdev->current_bss)
+		if (wdev->connected ||
+		    (wdev->iftype == NL80211_IFTYPE_ADHOC &&
+		     wdev->u.ibss.current_bss))
 			err = rdev_set_default_key(rdev, dev, idx, true,
 						   true);
 		if (!err)
@@ -865,7 +883,7 @@ static int cfg80211_wext_giwfreq(struct net_device *dev,
 			break;
 		}
 
-		ret = rdev_get_channel(rdev, wdev, &chandef);
+		ret = rdev_get_channel(rdev, wdev, 0, &chandef);
 		if (ret)
 			break;
 		freq->m = chandef.chan->center_freq;
@@ -1270,7 +1288,10 @@ static int cfg80211_wext_siwrate(struct net_device *dev,
 		return -EINVAL;
 
 	wiphy_lock(&rdev->wiphy);
-	ret = rdev_set_bitrate_mask(rdev, dev, NULL, &mask);
+	if (dev->ieee80211_ptr->valid_links)
+		ret = -EOPNOTSUPP;
+	else
+		ret = rdev_set_bitrate_mask(rdev, dev, 0, NULL, &mask);
 	wiphy_unlock(&rdev->wiphy);
 
 	return ret;
@@ -1294,8 +1315,9 @@ static int cfg80211_wext_giwrate(struct net_device *dev,
 
 	err = 0;
 	wdev_lock(wdev);
-	if (wdev->current_bss)
-		memcpy(addr, wdev->current_bss->pub.bssid, ETH_ALEN);
+	if (!wdev->valid_links && wdev->links[0].client.current_bss)
+		memcpy(addr, wdev->links[0].client.current_bss->pub.bssid,
+		       ETH_ALEN);
 	else
 		err = -EOPNOTSUPP;
 	wdev_unlock(wdev);
@@ -1339,11 +1361,11 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev)
 
 	/* Grab BSSID of current BSS, if any */
 	wdev_lock(wdev);
-	if (!wdev->current_bss) {
+	if (wdev->valid_links || !wdev->links[0].client.current_bss) {
 		wdev_unlock(wdev);
 		return NULL;
 	}
-	memcpy(bssid, wdev->current_bss->pub.bssid, ETH_ALEN);
+	memcpy(bssid, wdev->links[0].client.current_bss->pub.bssid, ETH_ALEN);
 	wdev_unlock(wdev);
 
 	memset(&sinfo, 0, sizeof(sinfo));
diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c
index cd09a9042261..68f45afc352d 100644
--- a/net/wireless/wext-sme.c
+++ b/net/wireless/wext-sme.c
@@ -3,7 +3,7 @@
  * cfg80211 wext compat for managed mode.
  *
  * Copyright 2009	Johannes Berg <johannes@sipsolutions.net>
- * Copyright (C) 2009, 2020-2021 Intel Corporation.
+ * Copyright (C) 2009, 2020-2022 Intel Corporation
  */
 
 #include <linux/export.h>
@@ -124,9 +124,12 @@ int cfg80211_mgd_wext_giwfreq(struct net_device *dev,
 	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION))
 		return -EINVAL;
 
+	if (wdev->valid_links)
+		return -EOPNOTSUPP;
+
 	wdev_lock(wdev);
-	if (wdev->current_bss)
-		chan = wdev->current_bss->pub.channel;
+	if (wdev->links[0].client.current_bss)
+		chan = wdev->links[0].client.current_bss->pub.channel;
 	else if (wdev->wext.connect.channel)
 		chan = wdev->wext.connect.channel;
 	wdev_unlock(wdev);
@@ -208,15 +211,19 @@ int cfg80211_mgd_wext_giwessid(struct net_device *dev,
 	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION))
 		return -EINVAL;
 
+	if (wdev->valid_links)
+		return -EINVAL;
+
 	data->flags = 0;
 
 	wdev_lock(wdev);
-	if (wdev->current_bss) {
+	if (wdev->links[0].client.current_bss) {
 		const struct element *ssid_elem;
 
 		rcu_read_lock();
-		ssid_elem = ieee80211_bss_get_elem(&wdev->current_bss->pub,
-						   WLAN_EID_SSID);
+		ssid_elem = ieee80211_bss_get_elem(
+				&wdev->links[0].client.current_bss->pub,
+				WLAN_EID_SSID);
 		if (ssid_elem) {
 			data->flags = 1;
 			data->length = ssid_elem->datalen;
@@ -300,8 +307,14 @@ int cfg80211_mgd_wext_giwap(struct net_device *dev,
 	ap_addr->sa_family = ARPHRD_ETHER;
 
 	wdev_lock(wdev);
-	if (wdev->current_bss)
-		memcpy(ap_addr->sa_data, wdev->current_bss->pub.bssid, ETH_ALEN);
+	if (wdev->valid_links) {
+		wdev_unlock(wdev);
+		return -EOPNOTSUPP;
+	}
+	if (wdev->links[0].client.current_bss)
+		memcpy(ap_addr->sa_data,
+		       wdev->links[0].client.current_bss->pub.bssid,
+		       ETH_ALEN);
 	else
 		eth_zero_addr(ap_addr->sa_data);
 	wdev_unlock(wdev);
-- 
cgit 


From d0a9123ef548def5c8880e83e5df948eb5b55c62 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 10 May 2022 13:26:44 +0200
Subject: wifi: mac80211: move some future per-link data to bss_conf

To add MLD, reuse the bss_conf structure later for per-link
information, so move some things into it that are per link.

Most transformations were done with the following spatch:

    @@
    expression sdata;
    identifier var = { chanctx_conf, mu_mimo_owner, csa_active, color_change_active, color_change_color };
    @@
    -sdata->vif.var
    +sdata->vif.bss_conf.var

    @@
    struct ieee80211_vif *vif;
    identifier var = { chanctx_conf, mu_mimo_owner, csa_active, color_change_active, color_change_color };
    @@
    -vif->var
    +vif->bss_conf.var

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath10k/htt_rx.c           |  2 +-
 drivers/net/wireless/ath/ath10k/mac.c              |  8 ++---
 drivers/net/wireless/ath/ath10k/wmi-tlv.c          |  2 +-
 drivers/net/wireless/ath/ath10k/wmi.c              |  2 +-
 drivers/net/wireless/ath/ath11k/mac.c              | 12 +++----
 drivers/net/wireless/ath/ath11k/wmi.c              |  4 +--
 drivers/net/wireless/ath/ath9k/beacon.c            |  2 +-
 drivers/net/wireless/ath/ath9k/htc_drv_beacon.c    |  2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/coex.c      |  6 ++--
 drivers/net/wireless/intel/iwlwifi/mvm/d3.c        |  2 +-
 .../net/wireless/intel/iwlwifi/mvm/debugfs-vif.c   |  4 +--
 .../net/wireless/intel/iwlwifi/mvm/ftm-responder.c |  4 +--
 drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c  | 10 +++---
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c  | 10 +++---
 drivers/net/wireless/intel/iwlwifi/mvm/power.c     |  2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/rs.c        |  2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/tdls.c      |  4 +--
 .../net/wireless/intel/iwlwifi/mvm/time-event.c    |  4 +--
 drivers/net/wireless/intel/iwlwifi/mvm/tx.c        |  2 +-
 drivers/net/wireless/mac80211_hwsim.c              | 22 +++++++-----
 drivers/net/wireless/mediatek/mt76/mac80211.c      |  4 +--
 drivers/net/wireless/mediatek/mt76/mt7615/mcu.c    |  2 +-
 drivers/net/wireless/mediatek/mt76/mt7915/mcu.c    | 10 +++---
 drivers/net/wireless/ti/wlcore/main.c              |  2 +-
 include/net/mac80211.h                             | 40 ++++++++++-----------
 net/mac80211/airtime.c                             |  4 +--
 net/mac80211/cfg.c                                 | 40 ++++++++++-----------
 net/mac80211/chan.c                                | 39 ++++++++++----------
 net/mac80211/driver-ops.h                          |  2 +-
 net/mac80211/ethtool.c                             |  4 +--
 net/mac80211/ibss.c                                | 10 +++---
 net/mac80211/ieee80211_i.h                         |  6 ++--
 net/mac80211/iface.c                               |  8 ++---
 net/mac80211/main.c                                |  4 +--
 net/mac80211/mesh.c                                | 14 ++++----
 net/mac80211/mlme.c                                | 42 +++++++++++-----------
 net/mac80211/ocb.c                                 |  3 +-
 net/mac80211/offchannel.c                          |  6 ++--
 net/mac80211/rate.c                                |  5 +--
 net/mac80211/rx.c                                  |  2 +-
 net/mac80211/sta_info.c                            |  2 +-
 net/mac80211/tdls.c                                |  6 ++--
 net/mac80211/tx.c                                  | 28 +++++++--------
 net/mac80211/util.c                                | 16 ++++-----
 net/mac80211/vht.c                                 |  6 ++--
 45 files changed, 209 insertions(+), 202 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/ath10k/htt_rx.c b/drivers/net/wireless/ath/ath10k/htt_rx.c
index b8461501221a..8a075a711b71 100644
--- a/drivers/net/wireless/ath/ath10k/htt_rx.c
+++ b/drivers/net/wireless/ath/ath10k/htt_rx.c
@@ -3840,7 +3840,7 @@ ath10k_update_per_peer_tx_stats(struct ath10k *ar,
 	switch (txrate.flags) {
 	case WMI_RATE_PREAMBLE_OFDM:
 		if (arsta->arvif && arsta->arvif->vif)
-			conf = rcu_dereference(arsta->arvif->vif->chanctx_conf);
+			conf = rcu_dereference(arsta->arvif->vif->bss_conf.chanctx_conf);
 		if (conf && conf->def.chan->band == NL80211_BAND_5GHZ)
 			arsta->tx_info.status.rates[0].idx = rate_idx - 4;
 		break;
diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index a04e18250dae..42c657b8b73e 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -659,7 +659,7 @@ int ath10k_mac_vif_chan(struct ieee80211_vif *vif,
 	struct ieee80211_chanctx_conf *conf;
 
 	rcu_read_lock();
-	conf = rcu_dereference(vif->chanctx_conf);
+	conf = rcu_dereference(vif->bss_conf.chanctx_conf);
 	if (!conf) {
 		rcu_read_unlock();
 		return -ENOENT;
@@ -2028,7 +2028,7 @@ static void ath10k_mac_vif_ap_csa_count_down(struct ath10k_vif *arvif)
 	if (arvif->vdev_type != WMI_VDEV_TYPE_AP)
 		return;
 
-	if (!vif->csa_active)
+	if (!vif->bss_conf.csa_active)
 		return;
 
 	if (!arvif->is_up)
@@ -8832,7 +8832,7 @@ ath10k_mac_change_chanctx_cnt_iter(void *data, u8 *mac,
 {
 	struct ath10k_mac_change_chanctx_arg *arg = data;
 
-	if (rcu_access_pointer(vif->chanctx_conf) != arg->ctx)
+	if (rcu_access_pointer(vif->bss_conf.chanctx_conf) != arg->ctx)
 		return;
 
 	arg->n_vifs++;
@@ -8845,7 +8845,7 @@ ath10k_mac_change_chanctx_fill_iter(void *data, u8 *mac,
 	struct ath10k_mac_change_chanctx_arg *arg = data;
 	struct ieee80211_chanctx_conf *ctx;
 
-	ctx = rcu_access_pointer(vif->chanctx_conf);
+	ctx = rcu_access_pointer(vif->bss_conf.chanctx_conf);
 	if (ctx != arg->ctx)
 		return;
 
diff --git a/drivers/net/wireless/ath/ath10k/wmi-tlv.c b/drivers/net/wireless/ath/ath10k/wmi-tlv.c
index 7efbe03fbca8..876410a47d1d 100644
--- a/drivers/net/wireless/ath/ath10k/wmi-tlv.c
+++ b/drivers/net/wireless/ath/ath10k/wmi-tlv.c
@@ -205,7 +205,7 @@ static int ath10k_wmi_tlv_event_bcn_tx_status(struct ath10k *ar,
 	}
 
 	arvif = ath10k_get_arvif(ar, vdev_id);
-	if (arvif && arvif->is_up && arvif->vif->csa_active)
+	if (arvif && arvif->is_up && arvif->vif->bss_conf.csa_active)
 		ieee80211_queue_work(ar->hw, &arvif->ap_csa_work);
 
 	kfree(tb);
diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c
index cd438f76f284..af19cab24c76 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.c
+++ b/drivers/net/wireless/ath/ath10k/wmi.c
@@ -3882,7 +3882,7 @@ void ath10k_wmi_event_host_swba(struct ath10k *ar, struct sk_buff *skb)
 		 * Once CSA counter is completed stop sending beacons until
 		 * actual channel switch is done
 		 */
-		if (arvif->vif->csa_active &&
+		if (arvif->vif->bss_conf.csa_active &&
 		    ieee80211_beacon_cntdwn_is_complete(arvif->vif)) {
 			ieee80211_csa_finish(arvif->vif);
 			continue;
diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c
index 42d2e8cf8125..f28e7feca318 100644
--- a/drivers/net/wireless/ath/ath11k/mac.c
+++ b/drivers/net/wireless/ath/ath11k/mac.c
@@ -505,7 +505,7 @@ static int ath11k_mac_vif_chan(struct ieee80211_vif *vif,
 	struct ieee80211_chanctx_conf *conf;
 
 	rcu_read_lock();
-	conf = rcu_dereference(vif->chanctx_conf);
+	conf = rcu_dereference(vif->bss_conf.chanctx_conf);
 	if (!conf) {
 		rcu_read_unlock();
 		return -ENOENT;
@@ -1398,10 +1398,10 @@ void ath11k_mac_bcn_tx_event(struct ath11k_vif *arvif)
 {
 	struct ieee80211_vif *vif = arvif->vif;
 
-	if (!vif->color_change_active && !arvif->bcca_zero_sent)
+	if (!vif->bss_conf.color_change_active && !arvif->bcca_zero_sent)
 		return;
 
-	if (vif->color_change_active && ieee80211_beacon_cntdwn_is_complete(vif)) {
+	if (vif->bss_conf.color_change_active && ieee80211_beacon_cntdwn_is_complete(vif)) {
 		arvif->bcca_zero_sent = true;
 		ieee80211_color_change_finish(vif);
 		return;
@@ -1409,7 +1409,7 @@ void ath11k_mac_bcn_tx_event(struct ath11k_vif *arvif)
 
 	arvif->bcca_zero_sent = false;
 
-	if (vif->color_change_active)
+	if (vif->bss_conf.color_change_active)
 		ieee80211_beacon_update_cntdwn(vif);
 	ath11k_mac_setup_bcn_tmpl(arvif);
 }
@@ -6849,7 +6849,7 @@ ath11k_mac_change_chanctx_cnt_iter(void *data, u8 *mac,
 {
 	struct ath11k_mac_change_chanctx_arg *arg = data;
 
-	if (rcu_access_pointer(vif->chanctx_conf) != arg->ctx)
+	if (rcu_access_pointer(vif->bss_conf.chanctx_conf) != arg->ctx)
 		return;
 
 	arg->n_vifs++;
@@ -6862,7 +6862,7 @@ ath11k_mac_change_chanctx_fill_iter(void *data, u8 *mac,
 	struct ath11k_mac_change_chanctx_arg *arg = data;
 	struct ieee80211_chanctx_conf *ctx;
 
-	ctx = rcu_access_pointer(vif->chanctx_conf);
+	ctx = rcu_access_pointer(vif->bss_conf.chanctx_conf);
 	if (ctx != arg->ctx)
 		return;
 
diff --git a/drivers/net/wireless/ath/ath11k/wmi.c b/drivers/net/wireless/ath/ath11k/wmi.c
index f69918b9452a..f2d5e07dc148 100644
--- a/drivers/net/wireless/ath/ath11k/wmi.c
+++ b/drivers/net/wireless/ath/ath11k/wmi.c
@@ -1700,7 +1700,7 @@ int ath11k_wmi_bcn_tmpl(struct ath11k *ar, u32 vdev_id,
 	cmd->vdev_id = vdev_id;
 	cmd->tim_ie_offset = offs->tim_offset;
 
-	if (vif->csa_active) {
+	if (vif->bss_conf.csa_active) {
 		cmd->csa_switch_count_offset = offs->cntdwn_counter_offs[0];
 		cmd->ext_csa_switch_count_offset = offs->cntdwn_counter_offs[1];
 	}
@@ -7475,7 +7475,7 @@ ath11k_wmi_process_csa_switch_count_event(struct ath11k_base *ab,
 			continue;
 		}
 
-		if (arvif->is_up && arvif->vif->csa_active)
+		if (arvif->is_up && arvif->vif->bss_conf.csa_active)
 			ieee80211_csa_finish(arvif->vif);
 	}
 	rcu_read_unlock();
diff --git a/drivers/net/wireless/ath/ath9k/beacon.c b/drivers/net/wireless/ath/ath9k/beacon.c
index 72e2e71aac0e..8b1b966bcef1 100644
--- a/drivers/net/wireless/ath/ath9k/beacon.c
+++ b/drivers/net/wireless/ath/ath9k/beacon.c
@@ -362,7 +362,7 @@ static void ath9k_set_tsfadjust(struct ath_softc *sc,
 
 bool ath9k_csa_is_finished(struct ath_softc *sc, struct ieee80211_vif *vif)
 {
-	if (!vif || !vif->csa_active)
+	if (!vif || !vif->bss_conf.csa_active)
 		return false;
 
 	if (!ieee80211_beacon_cntdwn_is_complete(vif))
diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_beacon.c b/drivers/net/wireless/ath/ath9k/htc_drv_beacon.c
index c745897aa3d6..468bc934d848 100644
--- a/drivers/net/wireless/ath/ath9k/htc_drv_beacon.c
+++ b/drivers/net/wireless/ath/ath9k/htc_drv_beacon.c
@@ -511,7 +511,7 @@ bool ath9k_htc_csa_is_finished(struct ath9k_htc_priv *priv)
 	struct ieee80211_vif *vif;
 
 	vif = priv->csa_vif;
-	if (!vif || !vif->csa_active)
+	if (!vif || !vif->bss_conf.csa_active)
 		return false;
 
 	if (!ieee80211_beacon_cntdwn_is_complete(vif))
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/coex.c b/drivers/net/wireless/intel/iwlwifi/mvm/coex.c
index 9b194cb8d65e..8760f2c73369 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/coex.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/coex.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 /*
- * Copyright (C) 2013-2014, 2018-2020 Intel Corporation
+ * Copyright (C) 2013-2014, 2018-2020, 2022 Intel Corporation
  * Copyright (C) 2013-2015 Intel Mobile Communications GmbH
  */
 #include <linux/ieee80211.h>
@@ -106,7 +106,7 @@ iwl_get_coex_type(struct iwl_mvm *mvm, const struct ieee80211_vif *vif)
 
 	rcu_read_lock();
 
-	chanctx_conf = rcu_dereference(vif->chanctx_conf);
+	chanctx_conf = rcu_dereference(vif->bss_conf.chanctx_conf);
 
 	if (!chanctx_conf ||
 	     chanctx_conf->def.chan->band != NL80211_BAND_2GHZ) {
@@ -283,7 +283,7 @@ static void iwl_mvm_bt_notif_iterator(void *_data, u8 *mac,
 		return;
 	}
 
-	chanctx_conf = rcu_dereference(vif->chanctx_conf);
+	chanctx_conf = rcu_dereference(vif->bss_conf.chanctx_conf);
 
 	/* If channel context is invalid or not on 2.4GHz .. */
 	if ((!chanctx_conf ||
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c
index 61f9136a333d..8edc8646a23a 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c
@@ -731,7 +731,7 @@ static int iwl_mvm_d3_reprogram(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 		return -EINVAL;
 
 	rcu_read_lock();
-	ctx = rcu_dereference(vif->chanctx_conf);
+	ctx = rcu_dereference(vif->bss_conf.chanctx_conf);
 	if (WARN_ON(!ctx)) {
 		rcu_read_unlock();
 		return -EINVAL;
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs-vif.c b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs-vif.c
index 7d9faeffd154..78d8b37eb71a 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs-vif.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs-vif.c
@@ -234,7 +234,7 @@ static ssize_t iwl_dbgfs_mac_params_read(struct file *file,
 	}
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(vif->chanctx_conf);
+	chanctx_conf = rcu_dereference(vif->bss_conf.chanctx_conf);
 	if (chanctx_conf)
 		pos += scnprintf(buf+pos, bufsz-pos,
 				 "idle rx chains %d, active rx chains: %d\n",
@@ -597,7 +597,7 @@ static ssize_t iwl_dbgfs_rx_phyinfo_write(struct ieee80211_vif *vif, char *buf,
 	mutex_lock(&mvm->mutex);
 	rcu_read_lock();
 
-	chanctx_conf = rcu_dereference(vif->chanctx_conf);
+	chanctx_conf = rcu_dereference(vif->bss_conf.chanctx_conf);
 	/* make sure the channel context is assigned */
 	if (!chanctx_conf) {
 		rcu_read_unlock();
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ftm-responder.c b/drivers/net/wireless/intel/iwlwifi/mvm/ftm-responder.c
index 9729680476fd..e862d1b43f21 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/ftm-responder.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/ftm-responder.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 /*
  * Copyright (C) 2015-2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
  */
 #include <net/cfg80211.h>
 #include <linux/etherdevice.h>
@@ -398,7 +398,7 @@ int iwl_mvm_ftm_start_responder(struct iwl_mvm *mvm, struct ieee80211_vif *vif)
 	}
 
 	rcu_read_lock();
-	pctx = rcu_dereference(vif->chanctx_conf);
+	pctx = rcu_dereference(vif->bss_conf.chanctx_conf);
 	/* Copy the ctx to unlock the rcu and send the phy ctxt. We don't care
 	 * about changes in the ctx after releasing the lock because the driver
 	 * is still protected by the mutex. */
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
index 56fa20596f16..7756ac0faf3f 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
@@ -481,7 +481,7 @@ static void iwl_mvm_mac_ctxt_cmd_common(struct iwl_mvm *mvm,
 		eth_broadcast_addr(cmd->bssid_addr);
 
 	rcu_read_lock();
-	chanctx = rcu_dereference(vif->chanctx_conf);
+	chanctx = rcu_dereference(vif->bss_conf.chanctx_conf);
 	iwl_mvm_ack_rates(mvm, vif, chanctx ? chanctx->def.chan->band
 					    : NL80211_BAND_2GHZ,
 			  &cck_ack_rates, &ofdm_ack_rates);
@@ -934,7 +934,7 @@ static int iwl_mvm_mac_ctxt_send_beacon_v9(struct iwl_mvm *mvm,
 
 	/* Enable FILS on PSC channels only */
 	rcu_read_lock();
-	ctx = rcu_dereference(vif->chanctx_conf);
+	ctx = rcu_dereference(vif->bss_conf.chanctx_conf);
 	channel = ieee80211_frequency_to_channel(ctx->def.chan->center_freq);
 	WARN_ON(channel == 0);
 	if (cfg80211_channel_is_psc(ctx->def.chan) &&
@@ -1335,7 +1335,7 @@ void iwl_mvm_rx_beacon_notif(struct iwl_mvm *mvm,
 
 	csa_vif = rcu_dereference_protected(mvm->csa_vif,
 					    lockdep_is_held(&mvm->mutex));
-	if (unlikely(csa_vif && csa_vif->csa_active))
+	if (unlikely(csa_vif && csa_vif->bss_conf.csa_active))
 		iwl_mvm_csa_count_down(mvm, csa_vif, mvm->ap_last_beacon_gp2,
 				       (status == TX_STATUS_SUCCESS));
 
@@ -1558,7 +1558,7 @@ void iwl_mvm_channel_switch_start_notif(struct iwl_mvm *mvm,
 	switch (vif->type) {
 	case NL80211_IFTYPE_AP:
 		csa_vif = rcu_dereference(mvm->csa_vif);
-		if (WARN_ON(!csa_vif || !csa_vif->csa_active ||
+		if (WARN_ON(!csa_vif || !csa_vif->bss_conf.csa_active ||
 			    csa_vif != vif))
 			goto out_unlock;
 
@@ -1587,7 +1587,7 @@ void iwl_mvm_channel_switch_start_notif(struct iwl_mvm *mvm,
 		 */
 		if (iwl_fw_lookup_notif_ver(mvm->fw, MAC_CONF_GROUP,
 					    CHANNEL_SWITCH_ERROR_NOTIF,
-					    0) && !vif->csa_active) {
+					    0) && !vif->bss_conf.csa_active) {
 			IWL_DEBUG_INFO(mvm, "Channel Switch was canceled\n");
 			iwl_mvm_cancel_channel_switch(mvm, vif, mac_id);
 			break;
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
index bb9bd2165355..c5626ff83805 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
@@ -1768,7 +1768,7 @@ static int iwl_mvm_update_mu_groups(struct iwl_mvm *mvm,
 static void iwl_mvm_mu_mimo_iface_iterator(void *_data, u8 *mac,
 					   struct ieee80211_vif *vif)
 {
-	if (vif->mu_mimo_owner) {
+	if (vif->bss_conf.mu_mimo_owner) {
 		struct iwl_mu_group_mgmt_notif *notif = _data;
 
 		/*
@@ -1965,7 +1965,7 @@ static void iwl_mvm_cfg_he_sta(struct iwl_mvm *mvm,
 
 	rcu_read_lock();
 
-	chanctx_conf = rcu_dereference(vif->chanctx_conf);
+	chanctx_conf = rcu_dereference(vif->bss_conf.chanctx_conf);
 	if (WARN_ON(!chanctx_conf)) {
 		rcu_read_unlock();
 		return;
@@ -2337,7 +2337,7 @@ static void iwl_mvm_bss_info_changed_station(struct iwl_mvm *mvm,
 		 * However, on HW restart we should restore this data.
 		 */
 		if (test_bit(IWL_MVM_STATUS_IN_HW_RESTART, &mvm->status) &&
-		    (changes & BSS_CHANGED_MU_GROUPS) && vif->mu_mimo_owner) {
+		    (changes & BSS_CHANGED_MU_GROUPS) && vif->bss_conf.mu_mimo_owner) {
 			ret = iwl_mvm_update_mu_groups(mvm, vif);
 			if (ret)
 				IWL_ERR(mvm,
@@ -4004,7 +4004,7 @@ static void iwl_mvm_ftm_responder_chanctx_iter(void *_data, u8 *mac,
 {
 	struct iwl_mvm_ftm_responder_iter_data *data = _data;
 
-	if (rcu_access_pointer(vif->chanctx_conf) == data->ctx &&
+	if (rcu_access_pointer(vif->bss_conf.chanctx_conf) == data->ctx &&
 	    vif->type == NL80211_IFTYPE_AP && vif->bss_conf.ftmr_params)
 		data->responder = true;
 }
@@ -4631,7 +4631,7 @@ static int iwl_mvm_pre_channel_switch(struct ieee80211_hw *hw,
 		csa_vif =
 			rcu_dereference_protected(mvm->csa_vif,
 						  lockdep_is_held(&mvm->mutex));
-		if (WARN_ONCE(csa_vif && csa_vif->csa_active,
+		if (WARN_ONCE(csa_vif && csa_vif->bss_conf.csa_active,
 			      "Another CSA is already in progress")) {
 			ret = -EBUSY;
 			goto out_unlock;
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/power.c b/drivers/net/wireless/intel/iwlwifi/mvm/power.c
index b9bd81242b21..afdf3bb523e9 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/power.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/power.c
@@ -283,7 +283,7 @@ static bool iwl_mvm_power_is_radar(struct ieee80211_vif *vif)
 	bool radar_detect = false;
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(vif->chanctx_conf);
+	chanctx_conf = rcu_dereference(vif->bss_conf.chanctx_conf);
 	WARN_ON(!chanctx_conf);
 	if (chanctx_conf) {
 		chan = chanctx_conf->def.chan;
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
index 974eeecc9153..303975f9e2b5 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
@@ -1980,7 +1980,7 @@ static bool rs_tpc_perform(struct iwl_mvm *mvm,
 #endif
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(vif->chanctx_conf);
+	chanctx_conf = rcu_dereference(vif->bss_conf.chanctx_conf);
 	if (WARN_ON(!chanctx_conf))
 		band = NUM_NL80211_BANDS;
 	else
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tdls.c b/drivers/net/wireless/intel/iwlwifi/mvm/tdls.c
index bf04326e35ff..674dd137fb9f 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/tdls.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/tdls.c
@@ -2,7 +2,7 @@
 /*
  * Copyright (C) 2014 Intel Mobile Communications GmbH
  * Copyright (C) 2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2020 Intel Corporation
+ * Copyright (C) 2018-2020, 2022 Intel Corporation
  */
 #include <linux/etherdevice.h>
 #include "mvm.h"
@@ -380,7 +380,7 @@ iwl_mvm_tdls_config_channel_switch(struct iwl_mvm *mvm,
 			   type == TDLS_MOVE_CH) {
 			/* we need to return to base channel */
 			struct ieee80211_chanctx_conf *chanctx =
-					rcu_dereference(vif->chanctx_conf);
+					rcu_dereference(vif->bss_conf.chanctx_conf);
 
 			if (WARN_ON_ONCE(!chanctx)) {
 				rcu_read_unlock();
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c
index 6edf2b79db43..4f0794a45bf5 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 /*
- * Copyright (C) 2012-2014, 2018-2021 Intel Corporation
+ * Copyright (C) 2012-2014, 2018-2022 Intel Corporation
  * Copyright (C) 2013-2015 Intel Mobile Communications GmbH
  * Copyright (C) 2017 Intel Deutschland GmbH
  */
@@ -123,7 +123,7 @@ static void iwl_mvm_csa_noa_start(struct iwl_mvm *mvm)
 	rcu_read_lock();
 
 	csa_vif = rcu_dereference(mvm->csa_vif);
-	if (!csa_vif || !csa_vif->csa_active)
+	if (!csa_vif || !csa_vif->bss_conf.csa_active)
 		goto out_unlock;
 
 	IWL_DEBUG_TE(mvm, "CSA NOA started\n");
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
index 8125bb76f59e..f9e08b339e0c 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
@@ -1959,7 +1959,7 @@ static void iwl_mvm_tx_reclaim(struct iwl_mvm *mvm, int sta_id, int tid,
 
 		if (mvmsta->vif)
 			chanctx_conf =
-				rcu_dereference(mvmsta->vif->chanctx_conf);
+				rcu_dereference(mvmsta->vif->bss_conf.chanctx_conf);
 
 		if (WARN_ON_ONCE(!chanctx_conf))
 			goto out;
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index bd408d260e9c..5905803893b8 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -888,7 +888,7 @@ static void hwsim_send_ps_poll(void *dat, u8 *mac, struct ieee80211_vif *vif)
 
 	rcu_read_lock();
 	mac80211_hwsim_tx_frame(data->hw, skb,
-				rcu_dereference(vif->chanctx_conf)->def.chan);
+				rcu_dereference(vif->bss_conf.chanctx_conf)->def.chan);
 	rcu_read_unlock();
 }
 
@@ -921,7 +921,7 @@ static void hwsim_send_nullfunc(struct mac80211_hwsim_data *data, u8 *mac,
 
 	rcu_read_lock();
 	mac80211_hwsim_tx_frame(data->hw, skb,
-				rcu_dereference(vif->chanctx_conf)->def.chan);
+				rcu_dereference(vif->bss_conf.chanctx_conf)->def.chan);
 	rcu_read_unlock();
 }
 
@@ -1464,11 +1464,11 @@ static void mac80211_hwsim_tx_iter(void *_data, u8 *addr,
 {
 	struct tx_iter_data *data = _data;
 
-	if (!vif->chanctx_conf)
+	if (!vif->bss_conf.chanctx_conf)
 		return;
 
 	if (!hwsim_chans_compat(data->channel,
-				rcu_dereference(vif->chanctx_conf)->def.chan))
+				rcu_dereference(vif->bss_conf.chanctx_conf)->def.chan))
 		return;
 
 	data->receive = true;
@@ -1686,7 +1686,11 @@ static void mac80211_hwsim_tx(struct ieee80211_hw *hw,
 	} else if (txi->hw_queue == 4) {
 		channel = data->tmp_chan;
 	} else {
-		chanctx_conf = rcu_dereference(txi->control.vif->chanctx_conf);
+		struct ieee80211_bss_conf *bss_conf;
+
+		bss_conf = &txi->control.vif->bss_conf;
+
+		chanctx_conf = rcu_dereference(bss_conf->chanctx_conf);
 		if (chanctx_conf) {
 			channel = chanctx_conf->def.chan;
 			confbw = chanctx_conf->def.width;
@@ -1935,14 +1939,14 @@ static void mac80211_hwsim_beacon_tx(void *arg, u8 *mac,
 	}
 
 	mac80211_hwsim_tx_frame(hw, skb,
-				rcu_dereference(vif->chanctx_conf)->def.chan);
+				rcu_dereference(vif->bss_conf.chanctx_conf)->def.chan);
 
 	while ((skb = ieee80211_get_buffered_bc(hw, vif)) != NULL) {
 		mac80211_hwsim_tx_frame(hw, skb,
-				rcu_dereference(vif->chanctx_conf)->def.chan);
+				rcu_dereference(vif->bss_conf.chanctx_conf)->def.chan);
 	}
 
-	if (vif->csa_active && ieee80211_beacon_cntdwn_is_complete(vif))
+	if (vif->bss_conf.csa_active && ieee80211_beacon_cntdwn_is_complete(vif))
 		ieee80211_csa_finish(vif);
 }
 
@@ -2204,7 +2208,7 @@ mac80211_hwsim_sta_rc_update(struct ieee80211_hw *hw,
 		struct ieee80211_chanctx_conf *chanctx_conf;
 
 		rcu_read_lock();
-		chanctx_conf = rcu_dereference(vif->chanctx_conf);
+		chanctx_conf = rcu_dereference(vif->bss_conf.chanctx_conf);
 
 		if (!WARN_ON(!chanctx_conf))
 			confbw = chanctx_conf->def.width;
diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c
index 18b5de55334c..5f75a8945a6e 100644
--- a/drivers/net/wireless/mediatek/mt76/mac80211.c
+++ b/drivers/net/wireless/mediatek/mt76/mac80211.c
@@ -1459,7 +1459,7 @@ EXPORT_SYMBOL_GPL(mt76_get_sar_power);
 static void
 __mt76_csa_finish(void *priv, u8 *mac, struct ieee80211_vif *vif)
 {
-	if (vif->csa_active && ieee80211_beacon_cntdwn_is_complete(vif))
+	if (vif->bss_conf.csa_active && ieee80211_beacon_cntdwn_is_complete(vif))
 		ieee80211_csa_finish(vif);
 }
 
@@ -1481,7 +1481,7 @@ __mt76_csa_check(void *priv, u8 *mac, struct ieee80211_vif *vif)
 {
 	struct mt76_dev *dev = priv;
 
-	if (!vif->csa_active)
+	if (!vif->bss_conf.csa_active)
 		return;
 
 	dev->csa_complete |= ieee80211_beacon_cntdwn_is_complete(vif);
diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c
index 97e2a85cb728..8fb6c9d735dc 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c
@@ -363,7 +363,7 @@ out:
 static void
 mt7615_mcu_csa_finish(void *priv, u8 *mac, struct ieee80211_vif *vif)
 {
-	if (vif->csa_active)
+	if (vif->bss_conf.csa_active)
 		ieee80211_csa_finish(vif);
 }
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
index b7e2b365356c..1b3d6634a72e 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
@@ -322,7 +322,7 @@ int mt7915_mcu_wa_cmd(struct mt7915_dev *dev, int cmd, u32 a1, u32 a2, u32 a3)
 static void
 mt7915_mcu_csa_finish(void *priv, u8 *mac, struct ieee80211_vif *vif)
 {
-	if (vif->csa_active)
+	if (vif->bss_conf.csa_active)
 		ieee80211_csa_finish(vif);
 }
 
@@ -409,7 +409,7 @@ mt7915_mcu_rx_log_message(struct mt7915_dev *dev, struct sk_buff *skb)
 static void
 mt7915_mcu_cca_finish(void *priv, u8 *mac, struct ieee80211_vif *vif)
 {
-	if (!vif->color_change_active)
+	if (!vif->bss_conf.color_change_active)
 		return;
 
 	ieee80211_color_change_finish(vif);
@@ -1818,7 +1818,7 @@ mt7915_mcu_beacon_cntdwn(struct ieee80211_vif *vif, struct sk_buff *rskb,
 	if (!offs->cntdwn_counter_offs[0])
 		return;
 
-	sub_tag = vif->csa_active ? BSS_INFO_BCN_CSA : BSS_INFO_BCN_BCC;
+	sub_tag = vif->bss_conf.csa_active ? BSS_INFO_BCN_CSA : BSS_INFO_BCN_BCC;
 	tlv = mt7915_mcu_add_nested_subtlv(rskb, sub_tag, sizeof(*info),
 					   &bcn->sub_ntlv, &bcn->len);
 	info = (struct bss_info_bcn_cntdwn *)tlv;
@@ -1903,9 +1903,9 @@ mt7915_mcu_beacon_cont(struct mt7915_dev *dev, struct ieee80211_vif *vif,
 	if (offs->cntdwn_counter_offs[0]) {
 		u16 offset = offs->cntdwn_counter_offs[0];
 
-		if (vif->csa_active)
+		if (vif->bss_conf.csa_active)
 			cont->csa_ofs = cpu_to_le16(offset - 4);
-		if (vif->color_change_active)
+		if (vif->bss_conf.color_change_active)
 			cont->bcc_ofs = cpu_to_le16(offset - 3);
 	}
 
diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c
index 6959efa4bfa9..21a9e3b0cbac 100644
--- a/drivers/net/wireless/ti/wlcore/main.c
+++ b/drivers/net/wireless/ti/wlcore/main.c
@@ -4675,7 +4675,7 @@ static void wlcore_op_change_chanctx(struct ieee80211_hw *hw,
 		struct ieee80211_vif *vif = wl12xx_wlvif_to_vif(wlvif);
 
 		rcu_read_lock();
-		if (rcu_access_pointer(vif->chanctx_conf) != ctx) {
+		if (rcu_access_pointer(vif->bss_conf.chanctx_conf) != ctx) {
 			rcu_read_unlock();
 			continue;
 		}
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 5c9e97eca739..e3ded46f70ac 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -636,6 +636,19 @@ struct ieee80211_fils_discovery {
  * @tx_pwr_env_num: number of @tx_pwr_env.
  * @pwr_reduction: power constraint of BSS.
  * @eht_support: does this BSS support EHT
+ * @csa_active: marks whether a channel switch is going on. Internally it is
+ *	write-protected by sdata_lock and local->mtx so holding either is fine
+ *	for read access.
+ * @mu_mimo_owner: indicates interface owns MU-MIMO capability
+ * @chanctx_conf: The channel context this interface is assigned to, or %NULL
+ *	when it is not assigned. This pointer is RCU-protected due to the TX
+ *	path needing to access it; even though the netdev carrier will always
+ *	be off when it is %NULL there can still be races and packets could be
+ *	processed after it switches back to %NULL.
+ * @color_change_active: marks whether a color change is ongoing. Internally it is
+ *	write-protected by sdata_lock and local->mtx so holding either is fine
+ *	for read access.
+ * @color_change_color: the bss color that will be used after the change.
  */
 struct ieee80211_bss_conf {
 	const u8 *bssid;
@@ -711,6 +724,13 @@ struct ieee80211_bss_conf {
 	u8 tx_pwr_env_num;
 	u8 pwr_reduction;
 	bool eht_support;
+
+	bool csa_active;
+	bool mu_mimo_owner;
+	struct ieee80211_chanctx_conf __rcu *chanctx_conf;
+
+	bool color_change_active;
+	u8 color_change_color;
 };
 
 /**
@@ -1713,10 +1733,6 @@ enum ieee80211_offload_flags {
  * @addr: address of this interface
  * @p2p: indicates whether this AP or STA interface is a p2p
  *	interface, i.e. a GO or p2p-sta respectively
- * @csa_active: marks whether a channel switch is going on. Internally it is
- *	write-protected by sdata_lock and local->mtx so holding either is fine
- *	for read access.
- * @mu_mimo_owner: indicates interface owns MU-MIMO capability
  * @driver_flags: flags/capabilities the driver has for this interface,
  *	these need to be set (or cleared) when the interface is added
  *	or, if supported by the driver, the interface type is changed
@@ -1728,11 +1744,6 @@ enum ieee80211_offload_flags {
  *	restrictions.
  * @hw_queue: hardware queue for each AC
  * @cab_queue: content-after-beacon (DTIM beacon really) queue, AP mode only
- * @chanctx_conf: The channel context this interface is assigned to, or %NULL
- *	when it is not assigned. This pointer is RCU-protected due to the TX
- *	path needing to access it; even though the netdev carrier will always
- *	be off when it is %NULL there can still be races and packets could be
- *	processed after it switches back to %NULL.
  * @debugfs_dir: debugfs dentry, can be used by drivers to create own per
  *	interface debug files. Note that it will be NULL for the virtual
  *	monitor interface (if that is requested.)
@@ -1747,10 +1758,6 @@ enum ieee80211_offload_flags {
  *	protected by fq->lock.
  * @offload_flags: 802.3 -> 802.11 enapsulation offload flags, see
  *	&enum ieee80211_offload_flags.
- * @color_change_active: marks whether a color change is ongoing. Internally it is
- *	write-protected by sdata_lock and local->mtx so holding either is fine
- *	for read access.
- * @color_change_color: the bss color that will be used after the change.
  * @mbssid_tx_vif: Pointer to the transmitting interface if MBSSID is enabled.
  */
 struct ieee80211_vif {
@@ -1758,16 +1765,12 @@ struct ieee80211_vif {
 	struct ieee80211_bss_conf bss_conf;
 	u8 addr[ETH_ALEN] __aligned(2);
 	bool p2p;
-	bool csa_active;
-	bool mu_mimo_owner;
 
 	u8 cab_queue;
 	u8 hw_queue[IEEE80211_NUM_ACS];
 
 	struct ieee80211_txq *txq;
 
-	struct ieee80211_chanctx_conf __rcu *chanctx_conf;
-
 	u32 driver_flags;
 	u32 offload_flags;
 
@@ -1780,9 +1783,6 @@ struct ieee80211_vif {
 
 	bool txqs_stopped[IEEE80211_NUM_ACS];
 
-	bool color_change_active;
-	u8 color_change_color;
-
 	struct ieee80211_vif *mbssid_tx_vif;
 
 	/* must be last */
diff --git a/net/mac80211/airtime.c b/net/mac80211/airtime.c
index 4bab1683652d..2e66598fac79 100644
--- a/net/mac80211/airtime.c
+++ b/net/mac80211/airtime.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: ISC
 /*
  * Copyright (C) 2019 Felix Fietkau <nbd@nbd.name>
- * Copyright (C) 2021 Intel Corporation
+ * Copyright (C) 2021-2022 Intel Corporation
  */
 
 #include <net/mac80211.h>
@@ -637,7 +637,7 @@ u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw,
 
 	len += 38; /* Ethernet header length */
 
-	conf = rcu_dereference(vif->chanctx_conf);
+	conf = rcu_dereference(vif->bss_conf.chanctx_conf);
 	if (conf) {
 		band = conf->def.chan->band;
 		shift = ieee80211_chandef_get_shift(&conf->def);
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 362cac9e2135..8a15c71e7c7d 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -53,7 +53,7 @@ static void ieee80211_set_mu_mimo_follow(struct ieee80211_sub_if_data *sdata,
 				params->vht_mumimo_follow_addr);
 	}
 
-	sdata->vif.mu_mimo_owner = mu_mimo_groups || mu_mimo_follow;
+	sdata->vif.bss_conf.mu_mimo_owner = mu_mimo_groups || mu_mimo_follow;
 }
 
 static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata,
@@ -1310,7 +1310,7 @@ static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev,
 	/* don't allow changing the beacon while a countdown is in place - offset
 	 * of channel switch counter may change
 	 */
-	if (sdata->vif.csa_active || sdata->vif.color_change_active)
+	if (sdata->vif.bss_conf.csa_active || sdata->vif.bss_conf.color_change_active)
 		return -EBUSY;
 
 	old = sdata_dereference(sdata->u.ap.beacon, sdata);
@@ -1368,7 +1368,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
 
 	/* abort any running channel switch */
 	mutex_lock(&local->mtx);
-	sdata->vif.csa_active = false;
+	sdata->vif.bss_conf.csa_active = false;
 	if (sdata->csa_block_tx) {
 		ieee80211_wake_vif_queues(local, sdata,
 					  IEEE80211_QUEUE_STOP_REASON_CSA);
@@ -3067,7 +3067,7 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy,
 	 * to send something, and if we're an AP we have to be able to do
 	 * so at a basic rate so that all clients can receive it.
 	 */
-	if (rcu_access_pointer(sdata->vif.chanctx_conf) &&
+	if (rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) &&
 	    sdata->vif.bss_conf.chandef.chan) {
 		u32 basic_rates = sdata->vif.bss_conf.basic_rates;
 		enum nl80211_band band = sdata->vif.bss_conf.chandef.chan->band;
@@ -3374,7 +3374,7 @@ static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata)
 					&sdata->csa_chandef))
 		return -EINVAL;
 
-	sdata->vif.csa_active = false;
+	sdata->vif.bss_conf.csa_active = false;
 
 	err = ieee80211_set_after_csa_beacon(sdata, &changed);
 	if (err)
@@ -3418,7 +3418,7 @@ void ieee80211_csa_finalize_work(struct work_struct *work)
 	mutex_lock(&local->chanctx_mtx);
 
 	/* AP might have been stopped while waiting for the lock. */
-	if (!sdata->vif.csa_active)
+	if (!sdata->vif.bss_conf.csa_active)
 		goto unlock;
 
 	if (!ieee80211_sdata_running(sdata))
@@ -3570,7 +3570,7 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata,
 
 static void ieee80211_color_change_abort(struct ieee80211_sub_if_data  *sdata)
 {
-	sdata->vif.color_change_active = false;
+	sdata->vif.bss_conf.color_change_active = false;
 
 	ieee80211_free_next_beacon(sdata);
 
@@ -3603,11 +3603,11 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 		return -EINVAL;
 
 	/* don't allow another channel switch if one is already active. */
-	if (sdata->vif.csa_active)
+	if (sdata->vif.bss_conf.csa_active)
 		return -EBUSY;
 
 	mutex_lock(&local->chanctx_mtx);
-	conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
+	conf = rcu_dereference_protected(sdata->vif.bss_conf.chanctx_conf,
 					 lockdep_is_held(&local->chanctx_mtx));
 	if (!conf) {
 		err = -EBUSY;
@@ -3646,7 +3646,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 	}
 
 	/* if there is a color change in progress, abort it */
-	if (sdata->vif.color_change_active)
+	if (sdata->vif.bss_conf.color_change_active)
 		ieee80211_color_change_abort(sdata);
 
 	err = ieee80211_set_csa_beacon(sdata, params, &changed);
@@ -3657,7 +3657,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 
 	sdata->csa_chandef = params->chandef;
 	sdata->csa_block_tx = params->block_tx;
-	sdata->vif.csa_active = true;
+	sdata->vif.bss_conf.csa_active = true;
 
 	if (sdata->csa_block_tx)
 		ieee80211_stop_vif_queues(local, sdata,
@@ -3826,7 +3826,7 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev,
 	mutex_lock(&local->mtx);
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 	if (WARN_ON(!chanctx_conf)) {
 		ret = -EINVAL;
 		goto unlock;
@@ -3909,7 +3909,7 @@ static int ieee80211_cfg_get_channel(struct wiphy *wiphy,
 	int ret = -ENODATA;
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 	if (chanctx_conf) {
 		*chandef = sdata->vif.bss_conf.chandef;
 		ret = 0;
@@ -4405,7 +4405,7 @@ static int ieee80211_color_change_finalize(struct ieee80211_sub_if_data *sdata)
 	sdata_assert_lock(sdata);
 	lockdep_assert_held(&local->mtx);
 
-	sdata->vif.color_change_active = false;
+	sdata->vif.bss_conf.color_change_active = false;
 
 	err = ieee80211_set_after_color_change_beacon(sdata, &changed);
 	if (err) {
@@ -4414,7 +4414,7 @@ static int ieee80211_color_change_finalize(struct ieee80211_sub_if_data *sdata)
 	}
 
 	ieee80211_color_change_bss_config_notify(sdata,
-						 sdata->vif.color_change_color,
+						 sdata->vif.bss_conf.color_change_color,
 						 1, changed);
 	cfg80211_color_change_notify(sdata->dev);
 
@@ -4432,7 +4432,7 @@ void ieee80211_color_change_finalize_work(struct work_struct *work)
 	mutex_lock(&local->mtx);
 
 	/* AP might have been stopped while waiting for the lock. */
-	if (!sdata->vif.color_change_active)
+	if (!sdata->vif.bss_conf.color_change_active)
 		goto unlock;
 
 	if (!ieee80211_sdata_running(sdata))
@@ -4460,7 +4460,7 @@ ieeee80211_obss_color_collision_notify(struct ieee80211_vif *vif,
 {
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
 
-	if (sdata->vif.color_change_active || sdata->vif.csa_active)
+	if (sdata->vif.bss_conf.color_change_active || sdata->vif.bss_conf.csa_active)
 		return;
 
 	cfg80211_obss_color_collision_notify(sdata->dev, color_bitmap);
@@ -4486,7 +4486,7 @@ ieee80211_color_change(struct wiphy *wiphy, struct net_device *dev,
 	/* don't allow another color change if one is already active or if csa
 	 * is active
 	 */
-	if (sdata->vif.color_change_active || sdata->vif.csa_active) {
+	if (sdata->vif.bss_conf.color_change_active || sdata->vif.bss_conf.csa_active) {
 		err = -EBUSY;
 		goto out;
 	}
@@ -4495,8 +4495,8 @@ ieee80211_color_change(struct wiphy *wiphy, struct net_device *dev,
 	if (err)
 		goto out;
 
-	sdata->vif.color_change_active = true;
-	sdata->vif.color_change_color = params->color;
+	sdata->vif.bss_conf.color_change_active = true;
+	sdata->vif.bss_conf.color_change_color = params->color;
 
 	cfg80211_color_change_started_notify(sdata->dev, params->count);
 
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index d8246e00a10b..eea400902133 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
  * mac80211 - channel management
- * Copyright 2020 - 2021 Intel Corporation
+ * Copyright 2020 - 2022 Intel Corporation
  */
 
 #include <linux/nl80211.h>
@@ -72,7 +72,7 @@ ieee80211_vif_get_chanctx(struct ieee80211_sub_if_data *sdata)
 	struct ieee80211_local *local __maybe_unused = sdata->local;
 	struct ieee80211_chanctx_conf *conf;
 
-	conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
+	conf = rcu_dereference_protected(sdata->vif.bss_conf.chanctx_conf,
 					 lockdep_is_held(&local->chanctx_mtx));
 	if (!conf)
 		return NULL;
@@ -260,7 +260,7 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local,
 		if (!ieee80211_sdata_running(sdata))
 			continue;
 
-		if (rcu_access_pointer(sdata->vif.chanctx_conf) != conf)
+		if (rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) != conf)
 			continue;
 
 		switch (vif->type) {
@@ -298,7 +298,7 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local,
 
 	/* use the configured bandwidth in case of monitor interface */
 	sdata = rcu_dereference(local->monitor_sdata);
-	if (sdata && rcu_access_pointer(sdata->vif.chanctx_conf) == conf)
+	if (sdata && rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) == conf)
 		max_bw = max(max_bw, conf->def.width);
 
 	rcu_read_unlock();
@@ -368,7 +368,7 @@ static void ieee80211_chan_bw_change(struct ieee80211_local *local,
 		if (!ieee80211_sdata_running(sta->sdata))
 			continue;
 
-		if (rcu_access_pointer(sta->sdata->vif.chanctx_conf) !=
+		if (rcu_access_pointer(sta->sdata->vif.bss_conf.chanctx_conf) !=
 		    &ctx->conf)
 			continue;
 
@@ -533,7 +533,7 @@ ieee80211_chanctx_radar_required(struct ieee80211_local *local,
 	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
 		if (!ieee80211_sdata_running(sdata))
 			continue;
-		if (rcu_access_pointer(sdata->vif.chanctx_conf) != conf)
+		if (rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) != conf)
 			continue;
 		if (!sdata->radar_required)
 			continue;
@@ -689,7 +689,7 @@ void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local,
 
 		if (!ieee80211_sdata_running(sdata))
 			continue;
-		if (rcu_access_pointer(sdata->vif.chanctx_conf) != conf)
+		if (rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) != conf)
 			continue;
 		if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
 			continue;
@@ -759,7 +759,7 @@ static int ieee80211_assign_vif_chanctx(struct ieee80211_sub_if_data *sdata,
 	if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_NAN))
 		return -ENOTSUPP;
 
-	conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
+	conf = rcu_dereference_protected(sdata->vif.bss_conf.chanctx_conf,
 					 lockdep_is_held(&local->chanctx_mtx));
 
 	if (conf) {
@@ -781,7 +781,7 @@ static int ieee80211_assign_vif_chanctx(struct ieee80211_sub_if_data *sdata,
 	}
 
 out:
-	rcu_assign_pointer(sdata->vif.chanctx_conf, conf);
+	rcu_assign_pointer(sdata->vif.bss_conf.chanctx_conf, conf);
 
 	sdata->vif.bss_conf.idle = !conf;
 
@@ -825,7 +825,7 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local,
 		if (!ieee80211_sdata_running(sdata))
 			continue;
 
-		if (rcu_access_pointer(sdata->vif.chanctx_conf) !=
+		if (rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) !=
 						&chanctx->conf)
 			continue;
 
@@ -874,7 +874,7 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local,
 	/* Disable SMPS for the monitor interface */
 	sdata = rcu_dereference(local->monitor_sdata);
 	if (sdata &&
-	    rcu_access_pointer(sdata->vif.chanctx_conf) == &chanctx->conf)
+	    rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) == &chanctx->conf)
 		rx_chains_dynamic = rx_chains_static = local->rx_chains;
 
 	rcu_read_unlock();
@@ -917,7 +917,7 @@ __ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata,
 	 * channel context pointer for a while, possibly pointing
 	 * to a channel context that has already been freed.
 	 */
-	conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
+	conf = rcu_dereference_protected(sdata->vif.bss_conf.chanctx_conf,
 					 lockdep_is_held(&local->chanctx_mtx));
 	WARN_ON(!conf);
 
@@ -925,7 +925,7 @@ __ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata,
 		conf = NULL;
 
 	list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
-		rcu_assign_pointer(vlan->vif.chanctx_conf, conf);
+		rcu_assign_pointer(vlan->vif.bss_conf.chanctx_conf, conf);
 }
 
 void ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata,
@@ -1173,7 +1173,7 @@ ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata)
 	}
 
 	list_move(&sdata->assigned_chanctx_list, &new_ctx->assigned_vifs);
-	rcu_assign_pointer(sdata->vif.chanctx_conf, &new_ctx->conf);
+	rcu_assign_pointer(sdata->vif.bss_conf.chanctx_conf, &new_ctx->conf);
 
 	if (sdata->vif.type == NL80211_IFTYPE_AP)
 		__ieee80211_vif_copy_chanctx_to_vlans(sdata, false);
@@ -1515,7 +1515,8 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
 			if (!ieee80211_vif_has_in_place_reservation(sdata))
 				continue;
 
-			rcu_assign_pointer(sdata->vif.chanctx_conf, &ctx->conf);
+			rcu_assign_pointer(sdata->vif.bss_conf.chanctx_conf,
+					   &ctx->conf);
 
 			if (sdata->vif.type == NL80211_IFTYPE_AP)
 				__ieee80211_vif_copy_chanctx_to_vlans(sdata,
@@ -1634,7 +1635,7 @@ static void __ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata)
 
 	lockdep_assert_held(&local->chanctx_mtx);
 
-	conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
+	conf = rcu_dereference_protected(sdata->vif.bss_conf.chanctx_conf,
 					 lockdep_is_held(&local->chanctx_mtx));
 	if (!conf)
 		return;
@@ -1809,7 +1810,7 @@ int ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata,
 		goto out;
 	}
 
-	conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
+	conf = rcu_dereference_protected(sdata->vif.bss_conf.chanctx_conf,
 					 lockdep_is_held(&local->chanctx_mtx));
 	if (!conf) {
 		ret = -EINVAL;
@@ -1879,9 +1880,9 @@ void ieee80211_vif_vlan_copy_chanctx(struct ieee80211_sub_if_data *sdata)
 
 	mutex_lock(&local->chanctx_mtx);
 
-	conf = rcu_dereference_protected(ap->vif.chanctx_conf,
+	conf = rcu_dereference_protected(ap->vif.bss_conf.chanctx_conf,
 					 lockdep_is_held(&local->chanctx_mtx));
-	rcu_assign_pointer(sdata->vif.chanctx_conf, conf);
+	rcu_assign_pointer(sdata->vif.bss_conf.chanctx_conf, conf);
 	mutex_unlock(&local->chanctx_mtx);
 }
 
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 4e2fc1a08681..fd2882348211 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -165,7 +165,7 @@ static inline void drv_bss_info_changed(struct ieee80211_local *local,
 	if (WARN_ON_ONCE(sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE ||
 			 sdata->vif.type == NL80211_IFTYPE_NAN ||
 			 (sdata->vif.type == NL80211_IFTYPE_MONITOR &&
-			  !sdata->vif.mu_mimo_owner &&
+			  !sdata->vif.bss_conf.mu_mimo_owner &&
 			  !(changed & BSS_CHANGED_TXPOWER))))
 		return;
 
diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c
index 31cd3c1ac07f..6e1fc8788101 100644
--- a/net/mac80211/ethtool.c
+++ b/net/mac80211/ethtool.c
@@ -5,7 +5,7 @@
  * Copied from cfg.c - originally
  * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2014	Intel Corporation (Author: Johannes Berg)
- * Copyright (C) 2018 Intel Corporation
+ * Copyright (C) 2018, 2022 Intel Corporation
  */
 #include <linux/types.h>
 #include <net/cfg80211.h>
@@ -150,7 +150,7 @@ do_survey:
 	survey.filled = 0;
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 	if (chanctx_conf)
 		channel = chanctx_conf->def.chan;
 	else
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 14c04fd48b7a..8ff547ff351e 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -9,7 +9,7 @@
  * Copyright 2009, Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright(c) 2016 Intel Deutschland GmbH
- * Copyright(c) 2018-2021 Intel Corporation
+ * Copyright(c) 2018-2022 Intel Corporation
  */
 
 #include <linux/delay.h>
@@ -622,7 +622,7 @@ ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid,
 	}
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 	if (WARN_ON_ONCE(!chanctx_conf))
 		return NULL;
 	band = chanctx_conf->def.chan->band;
@@ -923,7 +923,7 @@ ieee80211_rx_mgmt_spectrum_mgmt(struct ieee80211_sub_if_data *sdata,
 	if (len < required_len)
 		return;
 
-	if (!sdata->vif.csa_active)
+	if (!sdata->vif.bss_conf.csa_active)
 		ieee80211_ibss_process_chanswitch(sdata, elems, false);
 }
 
@@ -1143,7 +1143,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 		goto put_bss;
 
 	/* process channel switch */
-	if (sdata->vif.csa_active ||
+	if (sdata->vif.bss_conf.csa_active ||
 	    ieee80211_ibss_process_chanswitch(sdata, elems, true))
 		goto put_bss;
 
@@ -1220,7 +1220,7 @@ void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata,
 		return;
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 	if (WARN_ON_ONCE(!chanctx_conf)) {
 		rcu_read_unlock();
 		return;
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 1cf331572de1..5a1347727b4a 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1076,7 +1076,7 @@ ieee80211_vif_get_shift(struct ieee80211_vif *vif)
 	int shift = 0;
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(vif->chanctx_conf);
+	chanctx_conf = rcu_dereference(vif->bss_conf.chanctx_conf);
 	if (chanctx_conf)
 		shift = ieee80211_chandef_get_shift(&chanctx_conf->def);
 	rcu_read_unlock();
@@ -1527,7 +1527,7 @@ ieee80211_get_sband(struct ieee80211_sub_if_data *sdata)
 	enum nl80211_band band;
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 
 	if (!chanctx_conf) {
 		rcu_read_unlock();
@@ -2224,7 +2224,7 @@ static inline void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_chanctx_conf *chanctx_conf;
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 	if (WARN_ON(!chanctx_conf)) {
 		rcu_read_unlock();
 		kfree_skb(skb);
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index fb8d102fca48..9e0e71ca8068 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -51,7 +51,7 @@ bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata)
 	int power;
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 	if (!chanctx_conf) {
 		rcu_read_unlock();
 		return false;
@@ -275,7 +275,7 @@ static int ieee80211_check_concurrent_iface(struct ieee80211_sub_if_data *sdata,
 			 * will not add another interface while any channel
 			 * switch is active.
 			 */
-			if (nsdata->vif.csa_active)
+			if (nsdata->vif.bss_conf.csa_active)
 				return -EBUSY;
 
 			/*
@@ -450,7 +450,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do
 	cancel_work_sync(&sdata->recalc_smps);
 	sdata_lock(sdata);
 	mutex_lock(&local->mtx);
-	sdata->vif.csa_active = false;
+	sdata->vif.bss_conf.csa_active = false;
 	if (sdata->vif.type == NL80211_IFTYPE_STATION)
 		sdata->u.mgd.csa_waiting_bcn = false;
 	if (sdata->csa_block_tx) {
@@ -502,7 +502,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do
 		mutex_lock(&local->mtx);
 		list_del(&sdata->u.vlan.list);
 		mutex_unlock(&local->mtx);
-		RCU_INIT_POINTER(sdata->vif.chanctx_conf, NULL);
+		RCU_INIT_POINTER(sdata->vif.bss_conf.chanctx_conf, NULL);
 		/* see comment in the default case below */
 		ieee80211_free_keys(sdata, true);
 		/* no need to tell driver */
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 4f3e93c0819b..ebde131efcaa 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -147,7 +147,7 @@ static u32 ieee80211_hw_conf_chan(struct ieee80211_local *local)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
-		if (!rcu_access_pointer(sdata->vif.chanctx_conf))
+		if (!rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf))
 			continue;
 		if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
 			continue;
@@ -284,7 +284,7 @@ static void ieee80211_restart_work(struct work_struct *work)
 			 * Then we can have a race...
 			 */
 			cancel_work_sync(&sdata->u.mgd.csa_connection_drop_work);
-			if (sdata->vif.csa_active) {
+			if (sdata->vif.bss_conf.csa_active) {
 				sdata_lock(sdata);
 				ieee80211_sta_connection_lost(sdata,
 							      WLAN_REASON_UNSPECIFIED,
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 5275f4f32a78..f60e257cba95 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2008, 2009 open80211s Ltd.
- * Copyright (C) 2018 - 2021 Intel Corporation
+ * Copyright (C) 2018 - 2022 Intel Corporation
  * Authors:    Luis Carlos Cobo <luisca@cozybit.com>
  * 	       Javier Cardona <javier@cozybit.com>
  */
@@ -399,7 +399,7 @@ static int mesh_add_ds_params_ie(struct ieee80211_sub_if_data *sdata,
 		return -ENOMEM;
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 	if (WARN_ON(!chanctx_conf)) {
 		rcu_read_unlock();
 		return -EINVAL;
@@ -455,7 +455,7 @@ int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata,
 	u8 *pos;
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 	if (WARN_ON(!chanctx_conf)) {
 		rcu_read_unlock();
 		return -EINVAL;
@@ -527,7 +527,7 @@ int mesh_add_vht_oper_ie(struct ieee80211_sub_if_data *sdata,
 	u8 *pos;
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 	if (WARN_ON(!chanctx_conf)) {
 		rcu_read_unlock();
 		return -EINVAL;
@@ -820,7 +820,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh)
 
 	sdata = container_of(ifmsh, struct ieee80211_sub_if_data, u.mesh);
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 	band = chanctx_conf->def.chan->band;
 	rcu_read_unlock();
 
@@ -1357,7 +1357,7 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
 					      rx_status);
 
 		if (ifmsh->csa_role != IEEE80211_MESH_CSA_ROLE_INIT &&
-		    !sdata->vif.csa_active)
+		    !sdata->vif.bss_conf.csa_active)
 			ieee80211_mesh_process_chnswitch(sdata, elems, true);
 	}
 
@@ -1488,7 +1488,7 @@ static void mesh_rx_csa_frame(struct ieee80211_sub_if_data *sdata,
 
 	ifmsh->pre_value = pre_value;
 
-	if (!sdata->vif.csa_active &&
+	if (!sdata->vif.bss_conf.csa_active &&
 	    !ieee80211_mesh_process_chnswitch(sdata, elems, false)) {
 		mcsa_dbg(sdata, "Failed to process CSA action frame");
 		goto free;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index e0a9b7d63071..c526af66ff8d 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -624,7 +624,7 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata,
 		struct ieee80211_sub_if_data *other;
 
 		list_for_each_entry_rcu(other, &local->interfaces, list) {
-			if (other->vif.mu_mimo_owner) {
+			if (other->vif.bss_conf.mu_mimo_owner) {
 				disable_mu_mimo = true;
 				break;
 			}
@@ -632,7 +632,7 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata,
 		if (disable_mu_mimo)
 			cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE;
 		else
-			sdata->vif.mu_mimo_owner = true;
+			sdata->vif.bss_conf.mu_mimo_owner = true;
 	}
 
 	mask = IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK;
@@ -664,7 +664,7 @@ static void ieee80211_add_he_ie(struct ieee80211_sub_if_data *sdata,
 	bool reg_cap = false;
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 	if (!WARN_ON_ONCE(!chanctx_conf))
 		reg_cap = cfg80211_chandef_usable(sdata->wdev.wiphy,
 						  &chanctx_conf->def,
@@ -705,7 +705,7 @@ static void ieee80211_add_eht_ie(struct ieee80211_sub_if_data *sdata,
 	bool reg_cap = false;
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 	if (!WARN_ON_ONCE(!chanctx_conf))
 		reg_cap = cfg80211_chandef_usable(sdata->wdev.wiphy,
 						  &chanctx_conf->def,
@@ -766,7 +766,7 @@ static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
 	sdata_assert_lock(sdata);
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 	if (WARN_ON(!chanctx_conf)) {
 		rcu_read_unlock();
 		return -EINVAL;
@@ -1229,7 +1229,7 @@ static void ieee80211_chswitch_work(struct work_struct *work)
 	if (!ifmgd->associated)
 		goto out;
 
-	if (!sdata->vif.csa_active)
+	if (!sdata->vif.bss_conf.csa_active)
 		goto out;
 
 	/*
@@ -1289,7 +1289,7 @@ static void ieee80211_chswitch_post_beacon(struct ieee80211_sub_if_data *sdata)
 
 	sdata_assert_lock(sdata);
 
-	WARN_ON(!sdata->vif.csa_active);
+	WARN_ON(!sdata->vif.bss_conf.csa_active);
 
 	if (sdata->csa_block_tx) {
 		ieee80211_wake_vif_queues(local, sdata,
@@ -1297,7 +1297,7 @@ static void ieee80211_chswitch_post_beacon(struct ieee80211_sub_if_data *sdata)
 		sdata->csa_block_tx = false;
 	}
 
-	sdata->vif.csa_active = false;
+	sdata->vif.bss_conf.csa_active = false;
 	ifmgd->csa_waiting_bcn = false;
 	/*
 	 * If the CSA IE is still present on the beacon after the switch,
@@ -1361,7 +1361,7 @@ ieee80211_sta_abort_chanswitch(struct ieee80211_sub_if_data *sdata)
 					  IEEE80211_QUEUE_STOP_REASON_CSA);
 
 	sdata->csa_block_tx = false;
-	sdata->vif.csa_active = false;
+	sdata->vif.bss_conf.csa_active = false;
 
 	mutex_unlock(&local->mtx);
 
@@ -1412,13 +1412,13 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
 	if (res < 0)
 		goto lock_and_drop_connection;
 
-	if (beacon && sdata->vif.csa_active && !ifmgd->csa_waiting_bcn) {
+	if (beacon && sdata->vif.bss_conf.csa_active && !ifmgd->csa_waiting_bcn) {
 		if (res)
 			ieee80211_sta_abort_chanswitch(sdata);
 		else
 			drv_channel_switch_rx_beacon(sdata, &ch_switch);
 		return;
-	} else if (sdata->vif.csa_active || res) {
+	} else if (sdata->vif.bss_conf.csa_active || res) {
 		/* disregard subsequent announcements if already processing */
 		return;
 	}
@@ -1471,7 +1471,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
 
 	mutex_lock(&local->mtx);
 	mutex_lock(&local->chanctx_mtx);
-	conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
+	conf = rcu_dereference_protected(sdata->vif.bss_conf.chanctx_conf,
 					 lockdep_is_held(&local->chanctx_mtx));
 	if (!conf) {
 		sdata_info(sdata,
@@ -1504,7 +1504,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
 	}
 	mutex_unlock(&local->chanctx_mtx);
 
-	sdata->vif.csa_active = true;
+	sdata->vif.bss_conf.csa_active = true;
 	sdata->csa_chandef = csa_ie.chandef;
 	sdata->csa_block_tx = csa_ie.mode;
 	ifmgd->csa_ignored_same_chan = false;
@@ -1543,7 +1543,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
 	 * send a deauthentication frame. Those two fields will be
 	 * reset when the disconnection worker runs.
 	 */
-	sdata->vif.csa_active = true;
+	sdata->vif.bss_conf.csa_active = true;
 	sdata->csa_block_tx = csa_ie.mode;
 
 	ieee80211_queue_work(&local->hw, &ifmgd->csa_connection_drop_work);
@@ -2447,7 +2447,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 	memset(sdata->vif.bss_conf.mu_group.position, 0,
 	       sizeof(sdata->vif.bss_conf.mu_group.position));
 	changed |= BSS_CHANGED_MU_GROUPS;
-	sdata->vif.mu_mimo_owner = false;
+	sdata->vif.bss_conf.mu_mimo_owner = false;
 
 	sdata->ap_power_level = IEEE80211_UNSET_POWER_LEVEL;
 
@@ -2482,7 +2482,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 	mutex_lock(&local->mtx);
 	ieee80211_vif_release_channel(sdata);
 
-	sdata->vif.csa_active = false;
+	sdata->vif.bss_conf.csa_active = false;
 	ifmgd->csa_waiting_bcn = false;
 	ifmgd->csa_ignored_same_chan = false;
 	if (sdata->csa_block_tx) {
@@ -2808,7 +2808,7 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata)
 					WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY,
 			       tx, frame_buf);
 	mutex_lock(&local->mtx);
-	sdata->vif.csa_active = false;
+	sdata->vif.bss_conf.csa_active = false;
 	ifmgd->csa_waiting_bcn = false;
 	if (sdata->csa_block_tx) {
 		ieee80211_wake_vif_queues(local, sdata,
@@ -2948,7 +2948,7 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata,
 		eth_zero_addr(sdata->u.mgd.bssid);
 		ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID);
 		sdata->u.mgd.flags = 0;
-		sdata->vif.mu_mimo_owner = false;
+		sdata->vif.bss_conf.mu_mimo_owner = false;
 
 		mutex_lock(&sdata->local->mtx);
 		ieee80211_vif_release_channel(sdata);
@@ -4134,7 +4134,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 		return;
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 	if (!chanctx_conf) {
 		rcu_read_unlock();
 		return;
@@ -4803,7 +4803,7 @@ static void ieee80211_sta_bcn_mon_timer(struct timer_list *t)
 		from_timer(sdata, t, u.mgd.bcn_mon_timer);
 	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
 
-	if (sdata->vif.csa_active && !ifmgd->csa_waiting_bcn)
+	if (sdata->vif.bss_conf.csa_active && !ifmgd->csa_waiting_bcn)
 		return;
 
 	if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)
@@ -4823,7 +4823,7 @@ static void ieee80211_sta_conn_mon_timer(struct timer_list *t)
 	struct sta_info *sta;
 	unsigned long timeout;
 
-	if (sdata->vif.csa_active && !ifmgd->csa_waiting_bcn)
+	if (sdata->vif.bss_conf.csa_active && !ifmgd->csa_waiting_bcn)
 		return;
 
 	sta = sta_info_get(sdata, ifmgd->bssid);
diff --git a/net/mac80211/ocb.c b/net/mac80211/ocb.c
index f97cb4c453d3..d0f0d96b0948 100644
--- a/net/mac80211/ocb.c
+++ b/net/mac80211/ocb.c
@@ -4,6 +4,7 @@
  *
  * Copyright: (c) 2014 Czech Technical University in Prague
  *            (c) 2014 Volkswagen Group Research
+ * Copyright (C) 2022 Intel Corporation
  * Author:    Rostislav Lisovy <rostislav.lisovy@fel.cvut.cz>
  * Funded by: Volkswagen Group Research
  */
@@ -59,7 +60,7 @@ void ieee80211_ocb_rx_no_sta(struct ieee80211_sub_if_data *sdata,
 	ocb_dbg(sdata, "Adding new OCB station %pM\n", addr);
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 	if (WARN_ON_ONCE(!chanctx_conf)) {
 		rcu_read_unlock();
 		return;
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index c5d2ab9df1e7..a1fbd562cac1 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -8,7 +8,7 @@
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
  * Copyright 2009	Johannes Berg <johannes@sipsolutions.net>
- * Copyright (C) 2019 Intel Corporation
+ * Copyright (C) 2019, 2022 Intel Corporation
  */
 #include <linux/export.h>
 #include <net/mac80211.h>
@@ -845,7 +845,7 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
 		struct ieee80211_chanctx_conf *chanctx_conf;
 
 		rcu_read_lock();
-		chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+		chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 
 		if (chanctx_conf) {
 			need_offchan = params->chan &&
@@ -876,7 +876,7 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
 	data = skb_put_data(skb, params->buf, params->len);
 
 	/* Update CSA counters */
-	if (sdata->vif.csa_active &&
+	if (sdata->vif.bss_conf.csa_active &&
 	    (sdata->vif.type == NL80211_IFTYPE_AP ||
 	     sdata->vif.type == NL80211_IFTYPE_MESH_POINT ||
 	     sdata->vif.type == NL80211_IFTYPE_ADHOC) &&
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index ae9700e0a1a5..f22381127948 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -4,6 +4,7 @@
  * Copyright 2005-2006, Devicescape Software, Inc.
  * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
  * Copyright 2017	Intel Deutschland GmbH
+ * Copyright (C) 2022 Intel Corporation
  */
 
 #include <linux/kernel.h>
@@ -43,7 +44,7 @@ void rate_control_rate_init(struct sta_info *sta)
 
 	rcu_read_lock();
 
-	chanctx_conf = rcu_dereference(sta->sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sta->sdata->vif.bss_conf.chanctx_conf);
 	if (WARN_ON(!chanctx_conf)) {
 		rcu_read_unlock();
 		return;
@@ -100,7 +101,7 @@ void rate_control_rate_update(struct ieee80211_local *local,
 	if (ref && ref->ops->rate_update) {
 		rcu_read_lock();
 
-		chanctx_conf = rcu_dereference(sta->sdata->vif.chanctx_conf);
+		chanctx_conf = rcu_dereference(sta->sdata->vif.bss_conf.chanctx_conf);
 		if (WARN_ON(!chanctx_conf)) {
 			rcu_read_unlock();
 			return;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index a9f4e90ad893..9a179a18aafc 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3167,7 +3167,7 @@ ieee80211_rx_check_bss_color_collision(struct ieee80211_rx_data *rx)
 	if (ieee80211_hw_check(&rx->local->hw, DETECTS_COLOR_COLLISION))
 		return;
 
-	if (rx->sdata->vif.csa_active)
+	if (rx->sdata->vif.bss_conf.csa_active)
 		return;
 
 	baselen = mgmt->u.beacon.variable - rx->skb->data;
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index e04a0905e941..7eeb6919d2b7 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -1467,7 +1467,7 @@ static void ieee80211_send_null_response(struct sta_info *sta, int tid,
 	skb->dev = sdata->dev;
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 	if (WARN_ON(!chanctx_conf)) {
 		rcu_read_unlock();
 		kfree_skb(skb);
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index 4e2d22e47429..fa04021d4c0f 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -6,7 +6,7 @@
  * Copyright 2014, Intel Corporation
  * Copyright 2014  Intel Mobile Communications GmbH
  * Copyright 2015 - 2016 Intel Deutschland GmbH
- * Copyright (C) 2019, 2021 Intel Corporation
+ * Copyright (C) 2019, 2021-2022 Intel Corporation
  */
 
 #include <linux/ieee80211.h>
@@ -1254,7 +1254,7 @@ static void iee80211_tdls_recalc_chanctx(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_supported_band *sband;
 
 	mutex_lock(&local->chanctx_mtx);
-	conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
+	conf = rcu_dereference_protected(sdata->vif.bss_conf.chanctx_conf,
 					 lockdep_is_held(&local->chanctx_mtx));
 	if (conf) {
 		width = conf->def.width;
@@ -1372,7 +1372,7 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
 
 	switch (oper) {
 	case NL80211_TDLS_ENABLE_LINK:
-		if (sdata->vif.csa_active) {
+		if (sdata->vif.bss_conf.csa_active) {
 			tdls_dbg(sdata, "TDLS: disallow link during CSA\n");
 			ret = -EBUSY;
 			break;
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 37fe72bb5ab0..e5edc6fd21c0 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -57,7 +57,7 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
 		return 0;
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(tx->sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(tx->sdata->vif.bss_conf.chanctx_conf);
 	if (chanctx_conf) {
 		shift = ieee80211_chandef_get_shift(&chanctx_conf->def);
 		rate_flags = ieee80211_chandef_rate_flags(&chanctx_conf->def);
@@ -2345,12 +2345,12 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
 		}
 	}
 
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 	if (!chanctx_conf) {
 		tmp_sdata = rcu_dereference(local->monitor_sdata);
 		if (tmp_sdata)
 			chanctx_conf =
-				rcu_dereference(tmp_sdata->vif.chanctx_conf);
+				rcu_dereference(tmp_sdata->vif.bss_conf.chanctx_conf);
 	}
 
 	if (chanctx_conf)
@@ -2599,7 +2599,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
 		}
 		ap_sdata = container_of(sdata->bss, struct ieee80211_sub_if_data,
 					u.ap);
-		chanctx_conf = rcu_dereference(ap_sdata->vif.chanctx_conf);
+		chanctx_conf = rcu_dereference(ap_sdata->vif.bss_conf.chanctx_conf);
 		if (!chanctx_conf) {
 			ret = -ENOTCONN;
 			goto free;
@@ -2610,7 +2610,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
 		fallthrough;
 	case NL80211_IFTYPE_AP:
 		if (sdata->vif.type == NL80211_IFTYPE_AP)
-			chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+			chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 		if (!chanctx_conf) {
 			ret = -ENOTCONN;
 			goto free;
@@ -2689,7 +2689,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
 						skb->data + ETH_ALEN);
 
 		}
-		chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+		chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 		if (!chanctx_conf) {
 			ret = -ENOTCONN;
 			goto free;
@@ -2732,7 +2732,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
 			memcpy(hdr.addr3, skb->data, ETH_ALEN);
 			hdrlen = 24;
 		}
-		chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+		chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 		if (!chanctx_conf) {
 			ret = -ENOTCONN;
 			goto free;
@@ -2745,7 +2745,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
 		memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN);
 		eth_broadcast_addr(hdr.addr3);
 		hdrlen = 24;
-		chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+		chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 		if (!chanctx_conf) {
 			ret = -ENOTCONN;
 			goto free;
@@ -2758,7 +2758,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
 		memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN);
 		memcpy(hdr.addr3, sdata->u.ibss.bssid, ETH_ALEN);
 		hdrlen = 24;
-		chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+		chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 		if (!chanctx_conf) {
 			ret = -ENOTCONN;
 			goto free;
@@ -2981,7 +2981,7 @@ void ieee80211_check_fast_xmit(struct sta_info *sta)
 		goto out;
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 	if (!chanctx_conf) {
 		rcu_read_unlock();
 		goto out;
@@ -4604,7 +4604,7 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local,
 	sdata = vif_to_sdata(info->control.vif);
 
 	if (info->control.flags & IEEE80211_TX_INTCFL_NEED_TXPROCESSING) {
-		chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+		chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 		if (unlikely(!chanctx_conf)) {
 			dev_kfree_skb(skb);
 			return true;
@@ -4808,7 +4808,7 @@ static void ieee80211_set_beacon_cntdwn(struct ieee80211_sub_if_data *sdata,
 
 	bcn_offsets = beacon->cntdwn_counter_offsets;
 	count = beacon->cntdwn_current_counter;
-	if (sdata->vif.csa_active)
+	if (sdata->vif.bss_conf.csa_active)
 		max_count = IEEE80211_MAX_CNTDWN_COUNTERS_NUM;
 
 	for (i = 0; i < max_count; ++i) {
@@ -5119,7 +5119,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
 	rcu_read_lock();
 
 	sdata = vif_to_sdata(vif);
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 
 	if (!ieee80211_sdata_running(sdata) || !chanctx_conf)
 		goto out;
@@ -5536,7 +5536,7 @@ ieee80211_get_buffered_bc(struct ieee80211_hw *hw,
 	sdata = vif_to_sdata(vif);
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 
 	if (!chanctx_conf)
 		goto out;
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 9e6c4dcef280..48d8f0aad69f 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -1566,7 +1566,7 @@ void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata,
 		return;
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 	if (chanctx_conf)
 		center_freq = chanctx_conf->def.chan->center_freq;
 
@@ -1613,7 +1613,7 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
 	memset(&qparam, 0, sizeof(qparam));
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 	use_11b = (chanctx_conf &&
 		   chanctx_conf->def.chan->band == NL80211_BAND_2GHZ) &&
 		 !(sdata->flags & IEEE80211_SDATA_OPERATING_GMODE);
@@ -2264,7 +2264,7 @@ static void ieee80211_assign_chanctx(struct ieee80211_local *local,
 		return;
 
 	mutex_lock(&local->chanctx_mtx);
-	conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
+	conf = rcu_dereference_protected(sdata->vif.bss_conf.chanctx_conf,
 					 lockdep_is_held(&local->chanctx_mtx));
 	if (conf) {
 		ctx = container_of(conf, struct ieee80211_chanctx, conf);
@@ -2523,7 +2523,7 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 			  BSS_CHANGED_TXPOWER |
 			  BSS_CHANGED_MCAST_RATE;
 
-		if (sdata->vif.mu_mimo_owner)
+		if (sdata->vif.bss_conf.mu_mimo_owner)
 			changed |= BSS_CHANGED_MU_GROUPS;
 
 		switch (sdata->vif.type) {
@@ -2806,8 +2806,8 @@ void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata)
 
 	mutex_lock(&local->chanctx_mtx);
 
-	chanctx_conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
-					lockdep_is_held(&local->chanctx_mtx));
+	chanctx_conf = rcu_dereference_protected(sdata->vif.bss_conf.chanctx_conf,
+						 lockdep_is_held(&local->chanctx_mtx));
 
 	/*
 	 * This function can be called from a work, thus it may be possible
@@ -2832,8 +2832,8 @@ void ieee80211_recalc_min_chandef(struct ieee80211_sub_if_data *sdata)
 
 	mutex_lock(&local->chanctx_mtx);
 
-	chanctx_conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
-					lockdep_is_held(&local->chanctx_mtx));
+	chanctx_conf = rcu_dereference_protected(sdata->vif.bss_conf.chanctx_conf,
+						 lockdep_is_held(&local->chanctx_mtx));
 
 	if (WARN_ON_ONCE(!chanctx_conf))
 		goto unlock;
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index ff26e0c4787b..ac97584b3a0b 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -4,7 +4,7 @@
  *
  * Portions of this file
  * Copyright(c) 2015 - 2016 Intel Deutschland GmbH
- * Copyright (C) 2018 - 2021 Intel Corporation
+ * Copyright (C) 2018 - 2022 Intel Corporation
  */
 
 #include <linux/ieee80211.h>
@@ -649,7 +649,7 @@ void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata,
 {
 	struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
 
-	if (!sdata->vif.mu_mimo_owner)
+	if (!sdata->vif.bss_conf.mu_mimo_owner)
 		return;
 
 	if (!memcmp(mgmt->u.action.u.vht_group_notif.position,
@@ -673,7 +673,7 @@ void ieee80211_update_mu_groups(struct ieee80211_vif *vif,
 {
 	struct ieee80211_bss_conf *bss_conf = &vif->bss_conf;
 
-	if (WARN_ON_ONCE(!vif->mu_mimo_owner))
+	if (WARN_ON_ONCE(!vif->bss_conf.mu_mimo_owner))
 		return;
 
 	memcpy(bss_conf->mu_group.membership, membership, WLAN_MEMBERSHIP_LEN);
-- 
cgit 


From f276e20b182dbfc069d192fda259d85feea71143 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 10 May 2022 17:05:04 +0200
Subject: wifi: mac80211: move interface config to new struct

We'll use bss_conf for per-link configuration later, so
move out all the non-link-specific data out into a new
struct ieee80211_vif_cfg used in the vif.

Some adjustments were done with the following spatch:

    @@
    expression sdata;
    struct ieee80211_vif *vifp;
    identifier var = { assoc, ibss_joined, aid, arp_addr_list, arp_addr_cnt, ssid, ssid_len, s1g, ibss_creator };
    @@
    (
    -sdata->vif.bss_conf.var
    +sdata->vif.cfg.var
    |
    -vifp->bss_conf.var
    +vifp->cfg.var
    )

    @bss_conf@
    struct ieee80211_bss_conf *bss_conf;
    identifier var = { assoc, ibss_joined, aid, arp_addr_list, arp_addr_cnt, ssid, ssid_len, s1g, ibss_creator };
    @@
    -bss_conf->var
    +vif_cfg->var

(though more manual fixups were needed, e.g. replacing
"vif_cfg->" by "vif->cfg." in many files.)

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ar5523/ar5523.c           | 10 ++--
 drivers/net/wireless/ath/ath10k/mac.c              | 29 +++++-----
 drivers/net/wireless/ath/ath11k/mac.c              | 22 ++++----
 drivers/net/wireless/ath/ath5k/mac80211-ops.c      | 12 ++--
 drivers/net/wireless/ath/ath9k/beacon.c            | 11 ++--
 drivers/net/wireless/ath/ath9k/htc_drv_main.c      | 16 +++---
 drivers/net/wireless/ath/ath9k/main.c              |  8 +--
 drivers/net/wireless/ath/carl9170/main.c           |  2 +-
 drivers/net/wireless/ath/wcn36xx/main.c            | 16 +++---
 drivers/net/wireless/ath/wcn36xx/smd.c             |  2 +-
 .../broadcom/brcm80211/brcmsmac/mac80211_if.c      | 12 ++--
 drivers/net/wireless/intel/iwlegacy/3945-mac.c     |  6 +-
 drivers/net/wireless/intel/iwlegacy/4965.c         |  6 +-
 drivers/net/wireless/intel/iwlegacy/common.c       | 10 ++--
 drivers/net/wireless/intel/iwlwifi/dvm/rxon.c      | 22 ++++----
 drivers/net/wireless/intel/iwlwifi/mvm/coex.c      |  4 +-
 drivers/net/wireless/intel/iwlwifi/mvm/d3.c        |  2 +-
 .../net/wireless/intel/iwlwifi/mvm/ftm-initiator.c | 12 ++--
 drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c  | 12 ++--
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c  | 28 ++++-----
 .../net/wireless/intel/iwlwifi/mvm/offloading.c    |  6 +-
 drivers/net/wireless/intel/iwlwifi/mvm/ops.c       |  2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/power.c     |  2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/quota.c     |  4 +-
 drivers/net/wireless/intel/iwlwifi/mvm/scan.c      |  4 +-
 drivers/net/wireless/intel/iwlwifi/mvm/sf.c        |  6 +-
 drivers/net/wireless/intel/iwlwifi/mvm/sta.c       |  2 +-
 .../net/wireless/intel/iwlwifi/mvm/time-event.c    |  8 +--
 drivers/net/wireless/intel/iwlwifi/mvm/utils.c     |  4 +-
 drivers/net/wireless/intersil/p54/main.c           |  4 +-
 drivers/net/wireless/mac80211_hwsim.c              |  6 +-
 drivers/net/wireless/marvell/mwl8k.c               | 10 ++--
 drivers/net/wireless/mediatek/mt76/mt7603/main.c   |  2 +-
 drivers/net/wireless/mediatek/mt76/mt7615/main.c   |  2 +-
 drivers/net/wireless/mediatek/mt76/mt7615/mcu.c    |  2 +-
 .../net/wireless/mediatek/mt76/mt76_connac_mcu.c   | 10 ++--
 drivers/net/wireless/mediatek/mt76/mt7915/main.c   |  2 +-
 drivers/net/wireless/mediatek/mt76/mt7921/main.c   |  2 +-
 drivers/net/wireless/mediatek/mt76/mt7921/mcu.c    |  2 +-
 drivers/net/wireless/mediatek/mt7601u/phy.c        |  9 ++-
 drivers/net/wireless/ralink/rt2x00/rt2x00config.c  |  4 +-
 drivers/net/wireless/ralink/rt2x00/rt2x00mac.c     |  2 +-
 .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c  | 10 ++--
 drivers/net/wireless/realtek/rtlwifi/core.c        |  4 +-
 drivers/net/wireless/realtek/rtw88/bf.c            |  2 +-
 drivers/net/wireless/realtek/rtw88/mac80211.c      |  4 +-
 drivers/net/wireless/realtek/rtw88/main.c          | 13 +++--
 drivers/net/wireless/realtek/rtw89/cam.c           |  2 +-
 drivers/net/wireless/realtek/rtw89/core.c          |  2 +-
 drivers/net/wireless/realtek/rtw89/core.h          |  2 +-
 drivers/net/wireless/realtek/rtw89/mac80211.c      |  2 +-
 drivers/net/wireless/realtek/rtw89/phy.c           |  4 +-
 drivers/net/wireless/rsi/rsi_91x_core.c            |  3 +-
 drivers/net/wireless/rsi/rsi_91x_hal.c             |  7 +--
 drivers/net/wireless/rsi/rsi_91x_mac80211.c        | 31 +++++-----
 drivers/net/wireless/rsi/rsi_91x_mgmt.c            |  3 +-
 drivers/net/wireless/silabs/wfx/hif_tx.c           | 12 ++--
 drivers/net/wireless/silabs/wfx/sta.c              | 24 ++++----
 drivers/net/wireless/st/cw1200/sta.c               | 36 ++++++------
 drivers/net/wireless/st/cw1200/txrx.c              |  4 +-
 drivers/net/wireless/ti/wl1251/main.c              |  8 +--
 drivers/net/wireless/ti/wlcore/cmd.c               |  4 +-
 drivers/net/wireless/ti/wlcore/main.c              | 37 ++++++------
 drivers/staging/vt6655/device_main.c               |  6 +-
 drivers/staging/vt6656/main_usb.c                  |  4 +-
 include/net/mac80211.h                             | 66 +++++++++++++---------
 net/mac80211/cfg.c                                 | 10 ++--
 net/mac80211/chan.c                                |  2 +-
 net/mac80211/debugfs_netdev.c                      |  6 +-
 net/mac80211/ibss.c                                | 26 ++++-----
 net/mac80211/iface.c                               |  2 +-
 net/mac80211/main.c                                | 10 ++--
 net/mac80211/mlme.c                                | 26 +++++----
 net/mac80211/offchannel.c                          |  2 +-
 net/mac80211/scan.c                                |  2 +-
 net/mac80211/tdls.c                                |  4 +-
 net/mac80211/trace.h                               | 49 +++++++++-------
 net/mac80211/tx.c                                  |  2 +-
 net/mac80211/util.c                                |  2 +-
 79 files changed, 402 insertions(+), 368 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/ar5523/ar5523.c b/drivers/net/wireless/ath/ar5523/ar5523.c
index 9f84a6fde0c2..4cd06a0942d4 100644
--- a/drivers/net/wireless/ath/ar5523/ar5523.c
+++ b/drivers/net/wireless/ath/ar5523/ar5523.c
@@ -1256,14 +1256,14 @@ static int ar5523_create_connection(struct ar5523 *ar,
 				sizeof(create), 0);
 }
 
-static int ar5523_write_associd(struct ar5523 *ar,
-				struct ieee80211_bss_conf *bss)
+static int ar5523_write_associd(struct ar5523 *ar, struct ieee80211_vif *vif)
 {
+	struct ieee80211_bss_conf *bss = &vif->bss_conf;
 	struct ar5523_cmd_set_associd associd;
 
 	memset(&associd, 0, sizeof(associd));
 	associd.defaultrateix = cpu_to_be32(0);	/* XXX */
-	associd.associd = cpu_to_be32(bss->aid);
+	associd.associd = cpu_to_be32(vif->cfg.aid);
 	associd.timoffset = cpu_to_be32(0x3b);	/* XXX */
 	memcpy(associd.bssid, bss->bssid, ETH_ALEN);
 	return ar5523_cmd_write(ar, WDCMSG_WRITE_ASSOCID, &associd,
@@ -1284,7 +1284,7 @@ static void ar5523_bss_info_changed(struct ieee80211_hw *hw,
 	if (!(changed & BSS_CHANGED_ASSOC))
 		goto out_unlock;
 
-	if (bss->assoc) {
+	if (vif->cfg.assoc) {
 		error = ar5523_create_connection(ar, vif, bss);
 		if (error) {
 			ar5523_err(ar, "could not create connection\n");
@@ -1297,7 +1297,7 @@ static void ar5523_bss_info_changed(struct ieee80211_hw *hw,
 			goto out_unlock;
 		}
 
-		error = ar5523_write_associd(ar, bss);
+		error = ar5523_write_associd(ar, vif);
 		if (error) {
 			ar5523_err(ar, "could not set association\n");
 			goto out_unlock;
diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index 42c657b8b73e..4e7e2da3c859 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -1509,8 +1509,8 @@ static int ath10k_vdev_start_restart(struct ath10k_vif *arvif,
 		arg.channel.chan_radar =
 			!!(chandef->chan->flags & IEEE80211_CHAN_RADAR);
 	} else if (arvif->vdev_type == WMI_VDEV_TYPE_IBSS) {
-		arg.ssid = arvif->vif->bss_conf.ssid;
-		arg.ssid_len = arvif->vif->bss_conf.ssid_len;
+		arg.ssid = arvif->vif->cfg.ssid;
+		arg.ssid_len = arvif->vif->cfg.ssid_len;
 	}
 
 	ath10k_dbg(ar, ATH10K_DBG_MAC,
@@ -1823,8 +1823,7 @@ static void ath10k_control_beaconing(struct ath10k_vif *arvif,
 }
 
 static void ath10k_control_ibss(struct ath10k_vif *arvif,
-				struct ieee80211_bss_conf *info,
-				const u8 self_peer[ETH_ALEN])
+				struct ieee80211_vif *vif)
 {
 	struct ath10k *ar = arvif->ar;
 	u32 vdev_param;
@@ -1832,7 +1831,7 @@ static void ath10k_control_ibss(struct ath10k_vif *arvif,
 
 	lockdep_assert_held(&arvif->ar->conf_mutex);
 
-	if (!info->ibss_joined) {
+	if (!vif->cfg.ibss_joined) {
 		if (is_zero_ether_addr(arvif->bssid))
 			return;
 
@@ -2163,7 +2162,7 @@ static void ath10k_peer_assoc_h_basic(struct ath10k *ar,
 	lockdep_assert_held(&ar->conf_mutex);
 
 	if (vif->type == NL80211_IFTYPE_STATION)
-		aid = vif->bss_conf.aid;
+		aid = vif->cfg.aid;
 	else
 		aid = sta->aid;
 
@@ -2193,7 +2192,8 @@ static void ath10k_peer_assoc_h_crypto(struct ath10k *ar,
 		return;
 
 	bss = cfg80211_get_bss(ar->hw->wiphy, def.chan, info->bssid,
-			       info->ssid_len ? info->ssid : NULL, info->ssid_len,
+			       vif->cfg.ssid_len ? vif->cfg.ssid : NULL,
+			       vif->cfg.ssid_len,
 			       IEEE80211_BSS_TYPE_ANY, IEEE80211_PRIVACY_ANY);
 	if (bss) {
 		const struct cfg80211_bss_ies *ies;
@@ -3118,11 +3118,11 @@ static void ath10k_bss_assoc(struct ieee80211_hw *hw,
 
 	ath10k_dbg(ar, ATH10K_DBG_MAC,
 		   "mac vdev %d up (associated) bssid %pM aid %d\n",
-		   arvif->vdev_id, bss_conf->bssid, bss_conf->aid);
+		   arvif->vdev_id, bss_conf->bssid, vif->cfg.aid);
 
 	WARN_ON(arvif->is_up);
 
-	arvif->aid = bss_conf->aid;
+	arvif->aid = vif->cfg.aid;
 	ether_addr_copy(arvif->bssid, bss_conf->bssid);
 
 	ret = ath10k_wmi_pdev_set_param(ar,
@@ -6082,7 +6082,7 @@ static void ath10k_bss_info_changed(struct ieee80211_hw *hw,
 	mutex_lock(&ar->conf_mutex);
 
 	if (changed & BSS_CHANGED_IBSS)
-		ath10k_control_ibss(arvif, info, vif->addr);
+		ath10k_control_ibss(arvif, vif);
 
 	if (changed & BSS_CHANGED_BEACON_INT) {
 		arvif->beacon_interval = info->beacon_int;
@@ -6147,9 +6147,10 @@ static void ath10k_bss_info_changed(struct ieee80211_hw *hw,
 
 	if (changed & BSS_CHANGED_SSID &&
 	    vif->type == NL80211_IFTYPE_AP) {
-		arvif->u.ap.ssid_len = info->ssid_len;
-		if (info->ssid_len)
-			memcpy(arvif->u.ap.ssid, info->ssid, info->ssid_len);
+		arvif->u.ap.ssid_len = vif->cfg.ssid_len;
+		if (vif->cfg.ssid_len)
+			memcpy(arvif->u.ap.ssid, vif->cfg.ssid,
+			       vif->cfg.ssid_len);
 		arvif->u.ap.hidden_ssid = info->hidden_ssid;
 	}
 
@@ -6226,7 +6227,7 @@ static void ath10k_bss_info_changed(struct ieee80211_hw *hw,
 	}
 
 	if (changed & BSS_CHANGED_ASSOC) {
-		if (info->assoc) {
+		if (vif->cfg.assoc) {
 			/* Workaround: Make sure monitor vdev is not running
 			 * when associating to prevent some firmware revisions
 			 * (e.g. 10.1 and 10.2) from crashing.
diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c
index f28e7feca318..169161549361 100644
--- a/drivers/net/wireless/ath/ath11k/mac.c
+++ b/drivers/net/wireless/ath/ath11k/mac.c
@@ -1539,7 +1539,7 @@ static void ath11k_peer_assoc_h_basic(struct ath11k *ar,
 	lockdep_assert_held(&ar->conf_mutex);
 
 	if (vif->type == NL80211_IFTYPE_STATION)
-		aid = vif->bss_conf.aid;
+		aid = vif->cfg.aid;
 	else
 		aid = sta->aid;
 
@@ -2749,7 +2749,7 @@ static void ath11k_bss_assoc(struct ieee80211_hw *hw,
 
 	WARN_ON(arvif->is_up);
 
-	arvif->aid = bss_conf->aid;
+	arvif->aid = vif->cfg.aid;
 	ether_addr_copy(arvif->bssid, bss_conf->bssid);
 
 	ret = ath11k_wmi_vdev_up(ar, arvif->vdev_id, arvif->aid, arvif->bssid);
@@ -2764,7 +2764,7 @@ static void ath11k_bss_assoc(struct ieee80211_hw *hw,
 
 	ath11k_dbg(ar->ab, ATH11K_DBG_MAC,
 		   "mac vdev %d up (associated) bssid %pM aid %d\n",
-		   arvif->vdev_id, bss_conf->bssid, bss_conf->aid);
+		   arvif->vdev_id, bss_conf->bssid, vif->cfg.aid);
 
 	spin_lock_bh(&ar->ab->base_lock);
 
@@ -3185,9 +3185,10 @@ static void ath11k_mac_op_bss_info_changed(struct ieee80211_hw *hw,
 
 	if (changed & BSS_CHANGED_SSID &&
 	    vif->type == NL80211_IFTYPE_AP) {
-		arvif->u.ap.ssid_len = info->ssid_len;
-		if (info->ssid_len)
-			memcpy(arvif->u.ap.ssid, info->ssid, info->ssid_len);
+		arvif->u.ap.ssid_len = vif->cfg.ssid_len;
+		if (vif->cfg.ssid_len)
+			memcpy(arvif->u.ap.ssid, vif->cfg.ssid,
+			       vif->cfg.ssid_len);
 		arvif->u.ap.hidden_ssid = info->hidden_ssid;
 	}
 
@@ -3275,7 +3276,7 @@ static void ath11k_mac_op_bss_info_changed(struct ieee80211_hw *hw,
 	}
 
 	if (changed & BSS_CHANGED_ASSOC) {
-		if (info->assoc)
+		if (vif->cfg.assoc)
 			ath11k_bss_assoc(hw, vif, info);
 		else
 			ath11k_bss_disassoc(hw, vif);
@@ -3406,14 +3407,15 @@ static void ath11k_mac_op_bss_info_changed(struct ieee80211_hw *hw,
 		ath11k_mac_fils_discovery(arvif, info);
 
 	if (changed & BSS_CHANGED_ARP_FILTER) {
-		ipv4_cnt = min(info->arp_addr_cnt, ATH11K_IPV4_MAX_COUNT);
-		memcpy(arvif->arp_ns_offload.ipv4_addr, info->arp_addr_list,
+		ipv4_cnt = min(vif->cfg.arp_addr_cnt, ATH11K_IPV4_MAX_COUNT);
+		memcpy(arvif->arp_ns_offload.ipv4_addr,
+		       vif->cfg.arp_addr_list,
 		       ipv4_cnt * sizeof(u32));
 		memcpy(arvif->arp_ns_offload.mac_addr, vif->addr, ETH_ALEN);
 		arvif->arp_ns_offload.ipv4_count = ipv4_cnt;
 
 		ath11k_dbg(ar->ab, ATH11K_DBG_MAC, "mac arp_addr_cnt %d vif->addr %pM, offload_addr %pI4\n",
-			   info->arp_addr_cnt,
+			   vif->cfg.arp_addr_cnt,
 			   vif->addr, arvif->arp_ns_offload.ipv4_addr);
 	}
 
diff --git a/drivers/net/wireless/ath/ath5k/mac80211-ops.c b/drivers/net/wireless/ath/ath5k/mac80211-ops.c
index 532eeac9e83e..0df0fa1da181 100644
--- a/drivers/net/wireless/ath/ath5k/mac80211-ops.c
+++ b/drivers/net/wireless/ath/ath5k/mac80211-ops.c
@@ -278,9 +278,9 @@ ath5k_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	}
 
 	if (changes & BSS_CHANGED_ASSOC) {
-		avf->assoc = bss_conf->assoc;
-		if (bss_conf->assoc)
-			ah->assoc = bss_conf->assoc;
+		avf->assoc = vif->cfg.assoc;
+		if (vif->cfg.assoc)
+			ah->assoc = vif->cfg.assoc;
 		else
 			ah->assoc = ath5k_any_vif_assoc(ah);
 
@@ -288,11 +288,11 @@ ath5k_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 			ath5k_set_beacon_filter(hw, ah->assoc);
 		ath5k_hw_set_ledstate(ah, ah->assoc ?
 			AR5K_LED_ASSOC : AR5K_LED_INIT);
-		if (bss_conf->assoc) {
+		if (vif->cfg.assoc) {
 			ATH5K_DBG(ah, ATH5K_DEBUG_ANY,
 				  "Bss Info ASSOC %d, bssid: %pM\n",
-				  bss_conf->aid, common->curbssid);
-			common->curaid = bss_conf->aid;
+				  vif->cfg.aid, common->curbssid);
+			common->curaid = vif->cfg.aid;
 			ath5k_hw_set_bssid(ah);
 			/* Once ANI is available you would start it here */
 		}
diff --git a/drivers/net/wireless/ath/ath9k/beacon.c b/drivers/net/wireless/ath/ath9k/beacon.c
index 8b1b966bcef1..d41988f8ea3f 100644
--- a/drivers/net/wireless/ath/ath9k/beacon.c
+++ b/drivers/net/wireless/ath/ath9k/beacon.c
@@ -585,8 +585,9 @@ static void ath9k_beacon_config_adhoc(struct ath_softc *sc,
 
 static void ath9k_cache_beacon_config(struct ath_softc *sc,
 				      struct ath_chanctx *ctx,
-				      struct ieee80211_bss_conf *bss_conf)
+				      struct ieee80211_vif *vif)
 {
+	struct ieee80211_bss_conf *bss_conf = &vif->bss_conf;
 	struct ath_common *common = ath9k_hw_common(sc->sc_ah);
 	struct ath_beacon_config *cur_conf = &ctx->beacon;
 
@@ -596,7 +597,7 @@ static void ath9k_cache_beacon_config(struct ath_softc *sc,
 	cur_conf->beacon_interval = bss_conf->beacon_int;
 	cur_conf->dtim_period = bss_conf->dtim_period;
 	cur_conf->dtim_count = 1;
-	cur_conf->ibss_creator = bss_conf->ibss_creator;
+	cur_conf->ibss_creator = vif->cfg.ibss_creator;
 
 	/*
 	 * It looks like mac80211 may end up using beacon interval of zero in
@@ -649,7 +650,7 @@ void ath9k_beacon_config(struct ath_softc *sc, struct ieee80211_vif *main_vif,
 	cur_conf->enable_beacon = beacons;
 
 	if (sc->sc_ah->opmode == NL80211_IFTYPE_STATION) {
-		ath9k_cache_beacon_config(sc, ctx, &main_vif->bss_conf);
+		ath9k_cache_beacon_config(sc, ctx, main_vif);
 
 		ath9k_set_beacon(sc);
 		set_bit(ATH_OP_BEACONS, &common->op_flags);
@@ -657,7 +658,7 @@ void ath9k_beacon_config(struct ath_softc *sc, struct ieee80211_vif *main_vif,
 	}
 
 	/* Update the beacon configuration. */
-	ath9k_cache_beacon_config(sc, ctx, &main_vif->bss_conf);
+	ath9k_cache_beacon_config(sc, ctx, main_vif);
 
 	/*
 	 * Configure the HW beacon registers only when we have a valid
@@ -670,7 +671,7 @@ void ath9k_beacon_config(struct ath_softc *sc, struct ieee80211_vif *main_vif,
 		 * IBSS interface.
 		 */
 		if (sc->sc_ah->opmode == NL80211_IFTYPE_ADHOC &&
-		    !enabled && beacons && !main_vif->bss_conf.ibss_creator) {
+		    !enabled && beacons && !main_vif->cfg.ibss_creator) {
 			spin_lock_irqsave(&sc->sc_pm_lock, flags);
 			sc->ps_flags |= PS_BEACON_SYNC | PS_WAIT_FOR_BEACON;
 			spin_unlock_irqrestore(&sc->sc_pm_lock, flags);
diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_main.c b/drivers/net/wireless/ath/ath9k/htc_drv_main.c
index cfee732a89b1..3a5ec4da6a38 100644
--- a/drivers/net/wireless/ath/ath9k/htc_drv_main.c
+++ b/drivers/net/wireless/ath/ath9k/htc_drv_main.c
@@ -100,7 +100,7 @@ static void ath9k_htc_vif_iter(void *data, u8 *mac, struct ieee80211_vif *vif)
 		priv->rearm_ani = true;
 	}
 
-	if (bss_conf->assoc) {
+	if (vif->cfg.assoc) {
 		priv->rearm_ani = true;
 		priv->reconfig_beacon = true;
 	}
@@ -1488,8 +1488,8 @@ static void ath9k_htc_bss_iter(void *data, u8 *mac, struct ieee80211_vif *vif)
 	struct ath_common *common = ath9k_hw_common(priv->ah);
 	struct ieee80211_bss_conf *bss_conf = &vif->bss_conf;
 
-	if ((vif->type == NL80211_IFTYPE_STATION) && bss_conf->assoc) {
-		common->curaid = bss_conf->aid;
+	if ((vif->type == NL80211_IFTYPE_STATION) && vif->cfg.assoc) {
+		common->curaid = vif->cfg.aid;
 		common->last_rssi = ATH_RSSI_DUMMY_MARKER;
 		memcpy(common->curbssid, bss_conf->bssid, ETH_ALEN);
 		set_bit(ATH_OP_PRIM_STA_VIF, &common->op_flags);
@@ -1521,17 +1521,17 @@ static void ath9k_htc_bss_info_changed(struct ieee80211_hw *hw,
 
 	if (changed & BSS_CHANGED_ASSOC) {
 		ath_dbg(common, CONFIG, "BSS Changed ASSOC %d\n",
-			bss_conf->assoc);
+			vif->cfg.assoc);
 
-		bss_conf->assoc ?
+		vif->cfg.assoc ?
 			priv->num_sta_assoc_vif++ : priv->num_sta_assoc_vif--;
 
-		if (!bss_conf->assoc)
+		if (!vif->cfg.assoc)
 			clear_bit(ATH_OP_PRIM_STA_VIF, &common->op_flags);
 
 		if (priv->ah->opmode == NL80211_IFTYPE_STATION) {
 			ath9k_htc_choose_set_bssid(priv);
-			if (bss_conf->assoc && (priv->num_sta_assoc_vif == 1))
+			if (vif->cfg.assoc && (priv->num_sta_assoc_vif == 1))
 				ath9k_htc_start_ani(priv);
 			else if (priv->num_sta_assoc_vif == 0)
 				ath9k_htc_stop_ani(priv);
@@ -1540,7 +1540,7 @@ static void ath9k_htc_bss_info_changed(struct ieee80211_hw *hw,
 
 	if (changed & BSS_CHANGED_IBSS) {
 		if (priv->ah->opmode == NL80211_IFTYPE_ADHOC) {
-			common->curaid = bss_conf->aid;
+			common->curaid = vif->cfg.aid;
 			memcpy(common->curbssid, bss_conf->bssid, ETH_ALEN);
 			ath9k_htc_set_bssid(priv);
 		}
diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index 77144647f4fc..56c2681e5192 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -1881,11 +1881,11 @@ static void ath9k_bss_info_changed(struct ieee80211_hw *hw,
 
 	if (changed & BSS_CHANGED_ASSOC) {
 		ath_dbg(common, CONFIG, "BSSID %pM Changed ASSOC %d\n",
-			bss_conf->bssid, bss_conf->assoc);
+			bss_conf->bssid, vif->cfg.assoc);
 
 		memcpy(avp->bssid, bss_conf->bssid, ETH_ALEN);
-		avp->aid = bss_conf->aid;
-		avp->assoc = bss_conf->assoc;
+		avp->aid = vif->cfg.aid;
+		avp->assoc = vif->cfg.assoc;
 
 		ath9k_calculate_summary_state(sc, avp->chanctx);
 	}
@@ -1893,7 +1893,7 @@ static void ath9k_bss_info_changed(struct ieee80211_hw *hw,
 	if ((changed & BSS_CHANGED_IBSS) ||
 	      (changed & BSS_CHANGED_OCB)) {
 		memcpy(common->curbssid, bss_conf->bssid, ETH_ALEN);
-		common->curaid = bss_conf->aid;
+		common->curaid = vif->cfg.aid;
 		ath9k_hw_write_associd(sc->sc_ah);
 	}
 
diff --git a/drivers/net/wireless/ath/carl9170/main.c b/drivers/net/wireless/ath/carl9170/main.c
index 101295162967..e28a5f3085c0 100644
--- a/drivers/net/wireless/ath/carl9170/main.c
+++ b/drivers/net/wireless/ath/carl9170/main.c
@@ -1115,7 +1115,7 @@ static void carl9170_op_bss_info_changed(struct ieee80211_hw *hw,
 	}
 
 	if (changed & BSS_CHANGED_ASSOC) {
-		ar->common.curaid = bss_conf->aid;
+		ar->common.curaid = vif->cfg.aid;
 		err = carl9170_set_beacon_timers(ar);
 		if (err)
 			goto out;
diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c
index e34d3d0b7082..72ba2e2fc93a 100644
--- a/drivers/net/wireless/ath/wcn36xx/main.c
+++ b/drivers/net/wireless/ath/wcn36xx/main.c
@@ -919,17 +919,17 @@ static void wcn36xx_bss_info_changed(struct ieee80211_hw *hw,
 		wcn36xx_dbg(WCN36XX_DBG_MAC,
 			    "mac bss changed ssid\n");
 		wcn36xx_dbg_dump(WCN36XX_DBG_MAC, "ssid ",
-				 bss_conf->ssid, bss_conf->ssid_len);
+				 vif->cfg.ssid, vif->cfg.ssid_len);
 
-		vif_priv->ssid.length = bss_conf->ssid_len;
+		vif_priv->ssid.length = vif->cfg.ssid_len;
 		memcpy(&vif_priv->ssid.ssid,
-		       bss_conf->ssid,
-		       bss_conf->ssid_len);
+		       vif->cfg.ssid,
+		       vif->cfg.ssid_len);
 	}
 
 	if (changed & BSS_CHANGED_ASSOC) {
 		vif_priv->is_joining = false;
-		if (bss_conf->assoc) {
+		if (vif->cfg.assoc) {
 			struct ieee80211_sta *sta;
 			struct wcn36xx_sta *sta_priv;
 
@@ -937,7 +937,7 @@ static void wcn36xx_bss_info_changed(struct ieee80211_hw *hw,
 				    "mac assoc bss %pM vif %pM AID=%d\n",
 				     bss_conf->bssid,
 				     vif->addr,
-				     bss_conf->aid);
+				     vif->cfg.aid);
 
 			vif_priv->sta_assoc = true;
 
@@ -963,7 +963,7 @@ static void wcn36xx_bss_info_changed(struct ieee80211_hw *hw,
 			wcn36xx_smd_config_bss(wcn, vif, sta,
 					       bss_conf->bssid,
 					       true);
-			sta_priv->aid = bss_conf->aid;
+			sta_priv->aid = vif->cfg.aid;
 			/*
 			 * config_sta must be called from  because this is the
 			 * place where AID is available.
@@ -977,7 +977,7 @@ static void wcn36xx_bss_info_changed(struct ieee80211_hw *hw,
 				    "disassociated bss %pM vif %pM AID=%d\n",
 				    bss_conf->bssid,
 				    vif->addr,
-				    bss_conf->aid);
+				    vif->cfg.aid);
 			vif_priv->sta_assoc = false;
 			wcn36xx_smd_set_link_st(wcn,
 						bss_conf->bssid,
diff --git a/drivers/net/wireless/ath/wcn36xx/smd.c b/drivers/net/wireless/ath/wcn36xx/smd.c
index 7ac9a1e6f768..46ab21824d63 100644
--- a/drivers/net/wireless/ath/wcn36xx/smd.c
+++ b/drivers/net/wireless/ath/wcn36xx/smd.c
@@ -3005,7 +3005,7 @@ int wcn36xx_smd_arp_offload(struct wcn36xx *wcn, struct ieee80211_vif *vif,
 		msg_body.host_offload_params.enable =
 			WCN36XX_HAL_OFFLOAD_ARP_AND_BCAST_FILTER_ENABLE;
 		memcpy(&msg_body.host_offload_params.u,
-		       &vif->bss_conf.arp_addr_list[0], sizeof(__be32));
+		       &vif->cfg.arp_addr_list[0], sizeof(__be32));
 	}
 	msg_body.ns_offload_params.bss_index = vif_priv->bss_index;
 
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
index 8c741b98d8e5..e6cd638a85d6 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
@@ -507,7 +507,7 @@ brcms_ops_add_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
 		brcms_c_start_station(wl->wlc, vif->addr);
 	else if (vif->type == NL80211_IFTYPE_AP)
 		brcms_c_start_ap(wl->wlc, vif->addr, vif->bss_conf.bssid,
-				 vif->bss_conf.ssid, vif->bss_conf.ssid_len);
+				 vif->cfg.ssid, vif->cfg.ssid_len);
 	else if (vif->type == NL80211_IFTYPE_ADHOC)
 		brcms_c_start_adhoc(wl->wlc, vif->addr);
 	spin_unlock_bh(&wl->lock);
@@ -592,9 +592,9 @@ brcms_ops_bss_info_changed(struct ieee80211_hw *hw,
 		 * also implies a change in the AID.
 		 */
 		brcms_err(core, "%s: %s: %sassociated\n", KBUILD_MODNAME,
-			  __func__, info->assoc ? "" : "dis");
+			  __func__, vif->cfg.assoc ? "" : "dis");
 		spin_lock_bh(&wl->lock);
-		brcms_c_associate_upd(wl->wlc, info->assoc);
+		brcms_c_associate_upd(wl->wlc, vif->cfg.assoc);
 		spin_unlock_bh(&wl->lock);
 	}
 	if (changed & BSS_CHANGED_ERP_SLOT) {
@@ -669,7 +669,7 @@ brcms_ops_bss_info_changed(struct ieee80211_hw *hw,
 	if (changed & BSS_CHANGED_SSID) {
 		/* BSSID changed, for whatever reason (IBSS and managed mode) */
 		spin_lock_bh(&wl->lock);
-		brcms_c_set_ssid(wl->wlc, info->ssid, info->ssid_len);
+		brcms_c_set_ssid(wl->wlc, vif->cfg.ssid, vif->cfg.ssid_len);
 		spin_unlock_bh(&wl->lock);
 	}
 	if (changed & BSS_CHANGED_BEACON) {
@@ -715,13 +715,13 @@ brcms_ops_bss_info_changed(struct ieee80211_hw *hw,
 	if (changed & BSS_CHANGED_IBSS) {
 		/* IBSS join status changed */
 		brcms_err(core, "%s: IBSS joined: %s (implement)\n",
-			  __func__, info->ibss_joined ? "true" : "false");
+			  __func__, vif->cfg.ibss_joined ? "true" : "false");
 	}
 
 	if (changed & BSS_CHANGED_ARP_FILTER) {
 		/* Hardware ARP filter address list or state changed */
 		brcms_err(core, "%s: arp filtering: %d addresses"
-			  " (implement)\n", __func__, info->arp_addr_cnt);
+			  " (implement)\n", __func__, vif->cfg.arp_addr_cnt);
 	}
 
 	if (changed & BSS_CHANGED_QOS) {
diff --git a/drivers/net/wireless/intel/iwlegacy/3945-mac.c b/drivers/net/wireless/intel/iwlegacy/3945-mac.c
index bd4e7d752958..846138d6e33d 100644
--- a/drivers/net/wireless/intel/iwlegacy/3945-mac.c
+++ b/drivers/net/wireless/intel/iwlegacy/3945-mac.c
@@ -2701,7 +2701,7 @@ il3945_post_associate(struct il_priv *il)
 	if (!il->vif || !il->is_open)
 		return;
 
-	D_ASSOC("Associated as %d to: %pM\n", il->vif->bss_conf.aid,
+	D_ASSOC("Associated as %d to: %pM\n", il->vif->cfg.aid,
 		il->active.bssid_addr);
 
 	if (test_bit(S_EXIT_PENDING, &il->status))
@@ -2718,9 +2718,9 @@ il3945_post_associate(struct il_priv *il)
 
 	il->staging.filter_flags |= RXON_FILTER_ASSOC_MSK;
 
-	il->staging.assoc_id = cpu_to_le16(il->vif->bss_conf.aid);
+	il->staging.assoc_id = cpu_to_le16(il->vif->cfg.aid);
 
-	D_ASSOC("assoc id %d beacon interval %d\n", il->vif->bss_conf.aid,
+	D_ASSOC("assoc id %d beacon interval %d\n", il->vif->cfg.aid,
 		il->vif->bss_conf.beacon_int);
 
 	if (il->vif->bss_conf.use_short_preamble)
diff --git a/drivers/net/wireless/intel/iwlegacy/4965.c b/drivers/net/wireless/intel/iwlegacy/4965.c
index 9fa556486511..c34729f576cd 100644
--- a/drivers/net/wireless/intel/iwlegacy/4965.c
+++ b/drivers/net/wireless/intel/iwlegacy/4965.c
@@ -1756,9 +1756,9 @@ il4965_post_associate(struct il_priv *il)
 	if (il->ops->set_rxon_chain)
 		il->ops->set_rxon_chain(il);
 
-	il->staging.assoc_id = cpu_to_le16(vif->bss_conf.aid);
+	il->staging.assoc_id = cpu_to_le16(vif->cfg.aid);
 
-	D_ASSOC("assoc id %d beacon interval %d\n", vif->bss_conf.aid,
+	D_ASSOC("assoc id %d beacon interval %d\n", vif->cfg.aid,
 		vif->bss_conf.beacon_int);
 
 	if (vif->bss_conf.use_short_preamble)
@@ -1775,7 +1775,7 @@ il4965_post_associate(struct il_priv *il)
 
 	il_commit_rxon(il);
 
-	D_ASSOC("Associated as %d to: %pM\n", vif->bss_conf.aid,
+	D_ASSOC("Associated as %d to: %pM\n", vif->cfg.aid,
 		il->active.bssid_addr);
 
 	switch (vif->type) {
diff --git a/drivers/net/wireless/intel/iwlegacy/common.c b/drivers/net/wireless/intel/iwlegacy/common.c
index 8299d89e7505..b01945415be6 100644
--- a/drivers/net/wireless/intel/iwlegacy/common.c
+++ b/drivers/net/wireless/intel/iwlegacy/common.c
@@ -5427,8 +5427,8 @@ il_mac_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	}
 
 	if (changes & BSS_CHANGED_ASSOC) {
-		D_MAC80211("ASSOC %d\n", bss_conf->assoc);
-		if (bss_conf->assoc) {
+		D_MAC80211("ASSOC %d\n", vif->cfg.assoc);
+		if (vif->cfg.assoc) {
 			il->timestamp = bss_conf->sync_tsf;
 
 			if (!il_is_rfkill(il))
@@ -5437,7 +5437,7 @@ il_mac_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 			il_set_no_assoc(il, vif);
 	}
 
-	if (changes && il_is_associated(il) && bss_conf->aid) {
+	if (changes && il_is_associated(il) && vif->cfg.aid) {
 		D_MAC80211("Changes (%#x) while associated\n", changes);
 		ret = il_send_rxon_assoc(il);
 		if (!ret) {
@@ -5459,10 +5459,10 @@ il_mac_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 
 	if (changes & BSS_CHANGED_IBSS) {
 		ret = il->ops->manage_ibss_station(il, vif,
-						   bss_conf->ibss_joined);
+						   vif->cfg.ibss_joined);
 		if (ret)
 			IL_ERR("failed to %s IBSS station %pM\n",
-			       bss_conf->ibss_joined ? "add" : "remove",
+			       vif->cfg.ibss_joined ? "add" : "remove",
 			       bss_conf->bssid);
 	}
 
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c b/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c
index 5dd2d43a01d8..17a14970edec 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c
@@ -562,12 +562,12 @@ int iwlagn_set_pan_params(struct iwl_priv *priv)
 		slot1 = bcnint - slot0;
 
 		if (test_bit(STATUS_SCAN_HW, &priv->status) ||
-		    (!ctx_bss->vif->bss_conf.idle &&
-		     !ctx_bss->vif->bss_conf.assoc)) {
+		    (!ctx_bss->vif->cfg.idle &&
+		     !ctx_bss->vif->cfg.assoc)) {
 			slot0 = dtim * bcnint * 3 - IWL_MIN_SLOT_TIME;
 			slot1 = IWL_MIN_SLOT_TIME;
-		} else if (!ctx_pan->vif->bss_conf.idle &&
-			   !ctx_pan->vif->bss_conf.assoc) {
+		} else if (!ctx_pan->vif->cfg.idle &&
+			   !ctx_pan->vif->cfg.assoc) {
 			slot1 = dtim * bcnint * 3 - IWL_MIN_SLOT_TIME;
 			slot0 = IWL_MIN_SLOT_TIME;
 		}
@@ -1392,7 +1392,7 @@ void iwlagn_bss_info_changed(struct ieee80211_hw *hw,
 
 	mutex_lock(&priv->mutex);
 
-	if (changes & BSS_CHANGED_IDLE && bss_conf->idle) {
+	if (changes & BSS_CHANGED_IDLE && vif->cfg.idle) {
 		/*
 		 * If we go idle, then clearly no "passive-no-rx"
 		 * workaround is needed any more, this is a reset.
@@ -1420,14 +1420,14 @@ void iwlagn_bss_info_changed(struct ieee80211_hw *hw,
 		iwlagn_update_qos(priv, ctx);
 	}
 
-	ctx->staging.assoc_id = cpu_to_le16(vif->bss_conf.aid);
+	ctx->staging.assoc_id = cpu_to_le16(vif->cfg.aid);
 	if (vif->bss_conf.use_short_preamble)
 		ctx->staging.flags |= RXON_FLG_SHORT_PREAMBLE_MSK;
 	else
 		ctx->staging.flags &= ~RXON_FLG_SHORT_PREAMBLE_MSK;
 
 	if (changes & BSS_CHANGED_ASSOC) {
-		if (bss_conf->assoc) {
+		if (vif->cfg.assoc) {
 			priv->timestamp = bss_conf->sync_tsf;
 			ctx->staging.filter_flags |= RXON_FILTER_ASSOC_MSK;
 		} else {
@@ -1483,7 +1483,7 @@ void iwlagn_bss_info_changed(struct ieee80211_hw *hw,
 	 */
 
 	if (vif->type == NL80211_IFTYPE_STATION) {
-		if (!bss_conf->assoc)
+		if (!vif->cfg.assoc)
 			ctx->staging.filter_flags |= RXON_FILTER_BCON_AWARE_MSK;
 		else
 			ctx->staging.filter_flags &=
@@ -1493,7 +1493,7 @@ void iwlagn_bss_info_changed(struct ieee80211_hw *hw,
 	if (force || memcmp(&ctx->staging, &ctx->active, sizeof(ctx->staging)))
 		iwlagn_commit_rxon(priv, ctx);
 
-	if (changes & BSS_CHANGED_ASSOC && bss_conf->assoc) {
+	if (changes & BSS_CHANGED_ASSOC && vif->cfg.assoc) {
 		/*
 		 * The chain noise calibration will enable PM upon
 		 * completion. If calibration has already been run
@@ -1509,10 +1509,10 @@ void iwlagn_bss_info_changed(struct ieee80211_hw *hw,
 
 	if (changes & BSS_CHANGED_IBSS) {
 		ret = iwlagn_manage_ibss_station(priv, vif,
-						 bss_conf->ibss_joined);
+						 vif->cfg.ibss_joined);
 		if (ret)
 			IWL_ERR(priv, "failed to %s IBSS station %pM\n",
-				bss_conf->ibss_joined ? "add" : "remove",
+				vif->cfg.ibss_joined ? "add" : "remove",
 				bss_conf->bssid);
 	}
 
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/coex.c b/drivers/net/wireless/intel/iwlwifi/mvm/coex.c
index 8760f2c73369..ee3c8a786199 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/coex.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/coex.c
@@ -311,7 +311,7 @@ static void iwl_mvm_bt_notif_iterator(void *_data, u8 *mac,
 		smps_mode = IEEE80211_SMPS_DYNAMIC;
 
 	/* relax SMPS constraints for next association */
-	if (!vif->bss_conf.assoc)
+	if (!vif->cfg.assoc)
 		smps_mode = IEEE80211_SMPS_AUTOMATIC;
 
 	if (mvmvif->phy_ctxt &&
@@ -382,7 +382,7 @@ static void iwl_mvm_bt_notif_iterator(void *_data, u8 *mac,
 	 *  we are not associated
 	 */
 	if (iwl_get_coex_type(mvm, vif) == BT_COEX_LOOSE_LUT ||
-	    mvm->cfg->bt_shared_single_ant || !vif->bss_conf.assoc ||
+	    mvm->cfg->bt_shared_single_ant || !vif->cfg.assoc ||
 	    le32_to_cpu(mvm->last_bt_notif.bt_activity_grading) == BT_OFF) {
 		iwl_mvm_bt_coex_reduced_txp(mvm, mvmvif->ap_sta_id, false);
 		iwl_mvm_bt_coex_enable_rssi_event(mvm, vif, false, 0);
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c
index 8edc8646a23a..aeb0015b73d2 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c
@@ -749,7 +749,7 @@ static int iwl_mvm_d3_reprogram(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 	/* add back the MAC */
 	mvmvif->uploaded = false;
 
-	if (WARN_ON(!vif->bss_conf.assoc))
+	if (WARN_ON(!vif->cfg.assoc))
 		return -EINVAL;
 
 	ret = iwl_mvm_mac_ctxt_add(mvm, vif);
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c b/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c
index 430044bc4755..777c568f35a5 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 /*
  * Copyright (C) 2015-2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
  */
 #include <linux/etherdevice.h>
 #include <linux/math64.h>
@@ -67,7 +67,7 @@ int iwl_mvm_ftm_add_pasn_sta(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 	 * the TK is already configured for this station, so it
 	 * shouldn't be set again here.
 	 */
-	if (vif->bss_conf.assoc &&
+	if (vif->cfg.assoc &&
 	    !memcmp(addr, vif->bss_conf.bssid, ETH_ALEN)) {
 		struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif);
 		struct ieee80211_sta *sta;
@@ -222,7 +222,7 @@ static void iwl_mvm_ftm_cmd_v5(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 	for (i = 0; i < ETH_ALEN; i++)
 		cmd->macaddr_mask[i] = ~req->mac_addr_mask[i];
 
-	if (vif->bss_conf.assoc)
+	if (vif->cfg.assoc)
 		memcpy(cmd->range_req_bssid, vif->bss_conf.bssid, ETH_ALEN);
 	else
 		eth_broadcast_addr(cmd->range_req_bssid);
@@ -254,7 +254,7 @@ static void iwl_mvm_ftm_cmd_common(struct iwl_mvm *mvm,
 	for (i = 0; i < ETH_ALEN; i++)
 		cmd->macaddr_mask[i] = ~req->mac_addr_mask[i];
 
-	if (vif->bss_conf.assoc) {
+	if (vif->cfg.assoc) {
 		memcpy(cmd->range_req_bssid, vif->bss_conf.bssid, ETH_ALEN);
 
 		/* AP's TSF is only relevant if associated */
@@ -503,7 +503,7 @@ iwl_mvm_ftm_put_target(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 
 	iwl_mvm_ftm_put_target_common(mvm, peer, target);
 
-	if (vif->bss_conf.assoc &&
+	if (vif->cfg.assoc &&
 	    !memcmp(peer->addr, vif->bss_conf.bssid, ETH_ALEN)) {
 		struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif);
 		struct ieee80211_sta *sta;
@@ -693,7 +693,7 @@ iwl_mvm_ftm_set_secured_ranging(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 		target->cipher = entry->cipher;
 		memcpy(target->hltk, entry->hltk, sizeof(target->hltk));
 
-		if (vif->bss_conf.assoc &&
+		if (vif->cfg.assoc &&
 		    !memcmp(vif->bss_conf.bssid, target->bssid,
 			    sizeof(target->bssid)))
 			ieee80211_iter_keys(mvm->hw, vif, iter, target);
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
index 7756ac0faf3f..65df8cbb57d9 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
@@ -570,7 +570,7 @@ static int iwl_mvm_mac_ctxt_cmd_sta(struct iwl_mvm *mvm,
 	}
 
 	/* We need the dtim_period to set the MAC as associated */
-	if (vif->bss_conf.assoc && vif->bss_conf.dtim_period &&
+	if (vif->cfg.assoc && vif->bss_conf.dtim_period &&
 	    !force_assoc_off) {
 		struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif);
 		u32 dtim_offs;
@@ -628,9 +628,9 @@ static int iwl_mvm_mac_ctxt_cmd_sta(struct iwl_mvm *mvm,
 					      vif->bss_conf.dtim_period);
 
 	ctxt_sta->listen_interval = cpu_to_le32(mvm->hw->conf.listen_interval);
-	ctxt_sta->assoc_id = cpu_to_le32(vif->bss_conf.aid);
+	ctxt_sta->assoc_id = cpu_to_le32(vif->cfg.aid);
 
-	if (vif->probe_req_reg && vif->bss_conf.assoc && vif->p2p)
+	if (vif->probe_req_reg && vif->cfg.assoc && vif->p2p)
 		cmd.filter_flags |= cpu_to_le32(MAC_FILTER_IN_PROBE_REQUEST);
 
 	if (vif->bss_conf.he_support && !iwlwifi_mod_params.disable_11ax) {
@@ -944,8 +944,8 @@ static int iwl_mvm_mac_ctxt_send_beacon_v9(struct iwl_mvm *mvm,
 			IWL_MAC_BEACON_FILS :
 			IWL_MAC_BEACON_FILS_V1;
 		beacon_cmd.short_ssid =
-			cpu_to_le32(~crc32_le(~0, vif->bss_conf.ssid,
-					      vif->bss_conf.ssid_len));
+			cpu_to_le32(~crc32_le(~0, vif->cfg.ssid,
+					      vif->cfg.ssid_len));
 	}
 	rcu_read_unlock();
 
@@ -1031,7 +1031,7 @@ static void iwl_mvm_mac_ap_iterator(void *_data, u8 *mac,
 {
 	struct iwl_mvm_mac_ap_iterator_data *data = _data;
 
-	if (vif->type != NL80211_IFTYPE_STATION || !vif->bss_conf.assoc)
+	if (vif->type != NL80211_IFTYPE_STATION || !vif->cfg.assoc)
 		return;
 
 	/* Station client has higher priority over P2P client*/
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
index c5626ff83805..b5257b4fd000 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
@@ -1614,7 +1614,7 @@ static void iwl_mvm_mc_iface_iterator(void *_data, u8 *mac,
 		return;
 
 	if (vif->type != NL80211_IFTYPE_STATION ||
-	    !vif->bss_conf.assoc)
+	    !vif->cfg.assoc)
 		return;
 
 	cmd->port_id = data->port_id++;
@@ -1740,7 +1740,7 @@ static void iwl_mvm_config_iface_filter(struct ieee80211_hw *hw,
 		return;
 
 	/* Supported only for p2p client interfaces */
-	if (vif->type != NL80211_IFTYPE_STATION || !vif->bss_conf.assoc ||
+	if (vif->type != NL80211_IFTYPE_STATION || !vif->cfg.assoc ||
 	    !vif->p2p)
 		return;
 
@@ -2191,7 +2191,7 @@ static void iwl_mvm_bss_info_changed_station(struct iwl_mvm *mvm,
 	 * on the beacon interval, which was not known when the station
 	 * interface was added.
 	 */
-	if (changes & BSS_CHANGED_ASSOC && bss_conf->assoc) {
+	if (changes & BSS_CHANGED_ASSOC && vif->cfg.assoc) {
 		if (vif->bss_conf.he_support &&
 		    !iwlwifi_mod_params.disable_11ax)
 			iwl_mvm_cfg_he_sta(mvm, vif, mvmvif->ap_sta_id);
@@ -2201,7 +2201,7 @@ static void iwl_mvm_bss_info_changed_station(struct iwl_mvm *mvm,
 
 	/* Update MU EDCA params */
 	if (changes & BSS_CHANGED_QOS && mvmvif->associated &&
-	    bss_conf->assoc && vif->bss_conf.he_support &&
+	    vif->cfg.assoc && vif->bss_conf.he_support &&
 	    !iwlwifi_mod_params.disable_11ax)
 		iwl_mvm_cfg_he_sta(mvm, vif, mvmvif->ap_sta_id);
 
@@ -2220,10 +2220,10 @@ static void iwl_mvm_bss_info_changed_station(struct iwl_mvm *mvm,
 
 	/* after sending it once, adopt mac80211 data */
 	memcpy(mvmvif->bssid, bss_conf->bssid, ETH_ALEN);
-	mvmvif->associated = bss_conf->assoc;
+	mvmvif->associated = vif->cfg.assoc;
 
 	if (changes & BSS_CHANGED_ASSOC) {
-		if (bss_conf->assoc) {
+		if (vif->cfg.assoc) {
 			/* clear statistics to get clean beacon counter */
 			iwl_mvm_request_statistics(mvm, true);
 			memset(&mvmvif->beacon_stats, 0,
@@ -2627,7 +2627,7 @@ static void iwl_mvm_bss_info_changed(struct ieee80211_hw *hw,
 
 	mutex_lock(&mvm->mutex);
 
-	if (changes & BSS_CHANGED_IDLE && !bss_conf->idle)
+	if (changes & BSS_CHANGED_IDLE && !vif->cfg.idle)
 		iwl_mvm_scan_stop(mvm, IWL_MVM_SCAN_SCHED, true);
 
 	switch (vif->type) {
@@ -3020,7 +3020,7 @@ static void iwl_mvm_mei_host_associated(struct iwl_mvm *mvm,
 #if IS_ENABLED(CONFIG_IWLMEI)
 	struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif);
 	struct iwl_mei_conn_info conn_info = {
-		.ssid_len = vif->bss_conf.ssid_len,
+		.ssid_len = vif->cfg.ssid_len,
 		.channel = vif->bss_conf.chandef.chan->hw_value,
 	};
 
@@ -3068,7 +3068,7 @@ static void iwl_mvm_mei_host_associated(struct iwl_mvm *mvm,
 		return;
 	}
 
-	memcpy(conn_info.ssid, vif->bss_conf.ssid, vif->bss_conf.ssid_len);
+	memcpy(conn_info.ssid, vif->cfg.ssid, vif->cfg.ssid_len);
 	memcpy(conn_info.bssid,  vif->bss_conf.bssid, ETH_ALEN);
 
 	/* TODO: add support for collocated AP data */
@@ -3381,7 +3381,7 @@ static int iwl_mvm_mac_sched_scan_start(struct ieee80211_hw *hw,
 
 	mutex_lock(&mvm->mutex);
 
-	if (!vif->bss_conf.idle) {
+	if (!vif->cfg.idle) {
 		ret = -EBUSY;
 		goto out;
 	}
@@ -3747,7 +3747,7 @@ static int iwl_mvm_send_aux_roc_cmd(struct iwl_mvm *mvm,
 	 * like the delay to be for 2-3 dtim intervals, in case there are
 	 * other time events with higher priority.
 	 */
-	if (vif->bss_conf.assoc) {
+	if (vif->cfg.assoc) {
 		delay = min_t(u32, dtim_interval * 3, AUX_ROC_MAX_DELAY);
 		/* We cannot remain off-channel longer than the DTIM interval */
 		if (dtim_interval <= req_dur) {
@@ -4502,7 +4502,7 @@ static int __iwl_mvm_mac_testmode_cmd(struct iwl_mvm *mvm,
 	case IWL_MVM_TM_CMD_SET_BEACON_FILTER:
 		/* must be associated client vif - ignore authorized */
 		if (!vif || vif->type != NL80211_IFTYPE_STATION ||
-		    !vif->bss_conf.assoc || !vif->bss_conf.dtim_period ||
+		    !vif->cfg.assoc || !vif->bss_conf.dtim_period ||
 		    !tb[IWL_MVM_TM_ATTR_BEACON_FILTER_STATE])
 			return -EINVAL;
 
@@ -4670,7 +4670,7 @@ static int iwl_mvm_pre_channel_switch(struct ieee80211_hw *hw,
 		 * we don't know the dtim period. In this case, the firmware can't
 		 * track the beacons.
 		 */
-		if (!vif->bss_conf.assoc || !vif->bss_conf.dtim_period) {
+		if (!vif->cfg.assoc || !vif->bss_conf.dtim_period) {
 			ret = -EBUSY;
 			goto out_unlock;
 		}
@@ -5069,7 +5069,7 @@ static void iwl_mvm_mac_sta_statistics(struct ieee80211_hw *hw,
 	if (!(vif->driver_flags & IEEE80211_VIF_BEACON_FILTER))
 		return;
 
-	if (!vif->bss_conf.assoc)
+	if (!vif->cfg.assoc)
 		return;
 
 	mutex_lock(&mvm->mutex);
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/offloading.c b/drivers/net/wireless/intel/iwlwifi/mvm/offloading.c
index c7dabc6b3765..a8bd0f5f795c 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/offloading.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/offloading.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 /*
- * Copyright (C) 2012-2014, 2021 Intel Corporation
+ * Copyright (C) 2012-2014, 2021-2022 Intel Corporation
  * Copyright (C) 2013-2014 Intel Mobile Communications GmbH
  * Copyright (C) 2015 Intel Deutschland GmbH
  */
@@ -192,9 +192,9 @@ int iwl_mvm_send_proto_offload(struct iwl_mvm *mvm,
 		size = sizeof(cmd.v1);
 	}
 
-	if (vif->bss_conf.arp_addr_cnt) {
+	if (vif->cfg.arp_addr_cnt) {
 		enabled |= IWL_D3_PROTO_OFFLOAD_ARP | IWL_D3_PROTO_IPV4_VALID;
-		common->host_ipv4_addr = vif->bss_conf.arp_addr_list[0];
+		common->host_ipv4_addr = vif->cfg.arp_addr_list[0];
 		memcpy(common->arp_mac_addr, vif->addr, ETH_ALEN);
 	}
 
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c
index b2f33ebdf485..db43c8a83a31 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c
@@ -162,7 +162,7 @@ static void iwl_mvm_rx_monitor_notif(struct iwl_mvm *mvm,
 	    vif->bss_conf.chandef.width < NL80211_CHAN_WIDTH_40)
 		return;
 
-	if (!vif->bss_conf.assoc)
+	if (!vif->cfg.assoc)
 		return;
 
 	/* this shouldn't happen *again*, ignore it */
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/power.c b/drivers/net/wireless/intel/iwlwifi/mvm/power.c
index afdf3bb523e9..b49f265a421f 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/power.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/power.c
@@ -223,7 +223,7 @@ static void iwl_mvm_p2p_standalone_iterator(void *_data, u8 *mac,
 		*is_p2p_standalone = false;
 		break;
 	case NL80211_IFTYPE_STATION:
-		if (vif->bss_conf.assoc)
+		if (vif->cfg.assoc)
 			*is_p2p_standalone = false;
 		break;
 
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/quota.c b/drivers/net/wireless/intel/iwlwifi/mvm/quota.c
index c862bd243b55..cea1a34f9130 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/quota.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/quota.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 /*
- * Copyright (C) 2012-2014, 2018, 2021 Intel Corporation
+ * Copyright (C) 2012-2014, 2018, 2021-2022 Intel Corporation
  * Copyright (C) 2013-2014 Intel Mobile Communications GmbH
  * Copyright (C) 2016-2017 Intel Deutschland GmbH
  */
@@ -47,7 +47,7 @@ static void iwl_mvm_quota_iterator(void *_data, u8 *mac,
 
 	switch (vif->type) {
 	case NL80211_IFTYPE_STATION:
-		if (vif->bss_conf.assoc)
+		if (vif->cfg.assoc)
 			break;
 		return;
 	case NL80211_IFTYPE_AP:
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
index a4077053e374..582a95ffc7ab 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
@@ -1948,14 +1948,14 @@ static void iwl_mvm_scan_6ghz_passive_scan(struct iwl_mvm *mvm,
 	 * reset or resume flow, or while not associated and a large interval
 	 * has passed since the last 6GHz passive scan.
 	 */
-	if ((vif->bss_conf.assoc ||
+	if ((vif->cfg.assoc ||
 	     time_after(mvm->last_6ghz_passive_scan_jiffies +
 			(IWL_MVM_6GHZ_PASSIVE_SCAN_TIMEOUT * HZ), jiffies)) &&
 	    (time_before(mvm->last_reset_or_resume_time_jiffies +
 			 (IWL_MVM_6GHZ_PASSIVE_SCAN_ASSOC_TIMEOUT * HZ),
 			 jiffies))) {
 		IWL_DEBUG_SCAN(mvm, "6GHz passive scan: %s\n",
-			       vif->bss_conf.assoc ? "associated" :
+			       vif->cfg.assoc ? "associated" :
 			       "timeout did not expire");
 		return;
 	}
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sf.c b/drivers/net/wireless/intel/iwlwifi/mvm/sf.c
index 693752d8f65b..1f4ac1e93cee 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/sf.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/sf.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 /*
- * Copyright (C) 2013-2014, 2018-2019 Intel Corporation
+ * Copyright (C) 2013-2014, 2018-2019, 2022 Intel Corporation
  * Copyright (C) 2013-2014 Intel Mobile Communications GmbH
  */
 #include "mvm.h"
@@ -31,7 +31,7 @@ static void iwl_mvm_bound_iface_iterator(void *_data, u8 *mac,
 
 	if (vif->type == NL80211_IFTYPE_STATION) {
 		data->sta_vif_ap_sta_id = mvmvif->ap_sta_id;
-		if (vif->bss_conf.assoc)
+		if (vif->cfg.assoc)
 			data->sta_vif_state = SF_FULL_ON;
 		else
 			data->sta_vif_state = SF_INIT_OFF;
@@ -261,7 +261,7 @@ int iwl_mvm_sf_update(struct iwl_mvm *mvm, struct ieee80211_vif *changed_vif,
 				return -EINVAL;
 			if (changed_vif->type != NL80211_IFTYPE_STATION) {
 				new_state = SF_UNINIT;
-			} else if (changed_vif->bss_conf.assoc &&
+			} else if (changed_vif->cfg.assoc &&
 				   changed_vif->bss_conf.dtim_period) {
 				mvmvif = iwl_mvm_vif_from_mac80211(changed_vif);
 				sta_id = mvmvif->ap_sta_id;
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
index bbb1522e7280..b296f4965895 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
@@ -1948,7 +1948,7 @@ int iwl_mvm_rm_sta(struct iwl_mvm *mvm,
 	if (vif->type == NL80211_IFTYPE_STATION &&
 	    mvmvif->ap_sta_id == sta_id) {
 		/* if associated - we can't remove the AP STA now */
-		if (vif->bss_conf.assoc)
+		if (vif->cfg.assoc)
 			return ret;
 
 		/* unassoc - go ahead - remove the AP STA now */
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c
index 4f0794a45bf5..ed8ba81a6043 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c
@@ -160,7 +160,7 @@ static bool iwl_mvm_te_check_disconnect(struct iwl_mvm *mvm,
 	if (vif->type != NL80211_IFTYPE_STATION)
 		return false;
 
-	if (!mvmvif->csa_bcn_pending && vif->bss_conf.assoc &&
+	if (!mvmvif->csa_bcn_pending && vif->cfg.assoc &&
 	    vif->bss_conf.dtim_period)
 		return false;
 	if (errmsg)
@@ -176,7 +176,7 @@ static bool iwl_mvm_te_check_disconnect(struct iwl_mvm *mvm,
 		rcu_read_unlock();
 	}
 
-	if (vif->bss_conf.assoc) {
+	if (vif->cfg.assoc) {
 		/*
 		 * When not associated, this will be called from
 		 * iwl_mvm_event_mlme_callback_ini()
@@ -346,7 +346,7 @@ static void iwl_mvm_te_handle_notif(struct iwl_mvm *mvm,
 			 * and know the dtim period.
 			 */
 			iwl_mvm_te_check_disconnect(mvm, te_data->vif,
-				!te_data->vif->bss_conf.assoc ?
+				!te_data->vif->cfg.assoc ?
 				"Not associated and the time event is over already..." :
 				"No beacon heard and the time event is over already...");
 			break;
@@ -859,7 +859,7 @@ void iwl_mvm_rx_session_protect_notif(struct iwl_mvm *mvm,
 			 * and know the dtim period.
 			 */
 			iwl_mvm_te_check_disconnect(mvm, vif,
-						    !vif->bss_conf.assoc ?
+						    !vif->cfg.assoc ?
 						    "Not associated and the session protection is over already..." :
 						    "No beacon heard and the session protection is over already...");
 			spin_lock_bh(&mvm->time_event_lock);
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
index bc947733d982..3ee5ea3484be 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
@@ -604,7 +604,7 @@ static void iwl_mvm_sta_iface_iterator(void *_data, u8 *mac,
 	if (vif->type != NL80211_IFTYPE_STATION)
 		return;
 
-	if (vif->bss_conf.assoc)
+	if (vif->cfg.assoc)
 		data->assoc = true;
 }
 
@@ -816,7 +816,7 @@ static void iwl_mvm_uapsd_agg_disconnect(struct iwl_mvm *mvm,
 	if (vif->type != NL80211_IFTYPE_STATION)
 		return;
 
-	if (!vif->bss_conf.assoc)
+	if (!vif->cfg.assoc)
 		return;
 
 	if (!mvmvif->queue_params[IEEE80211_AC_VO].uapsd &&
diff --git a/drivers/net/wireless/intersil/p54/main.c b/drivers/net/wireless/intersil/p54/main.c
index a3ca6620dc0c..a9dfe6a3da0d 100644
--- a/drivers/net/wireless/intersil/p54/main.c
+++ b/drivers/net/wireless/intersil/p54/main.c
@@ -480,8 +480,8 @@ static void p54_bss_info_changed(struct ieee80211_hw *dev,
 			p54_scan(priv, P54_SCAN_EXIT, 0);
 	}
 	if (changed & BSS_CHANGED_ASSOC) {
-		if (info->assoc) {
-			priv->aid = info->aid;
+		if (vif->cfg.assoc) {
+			priv->aid = vif->cfg.aid;
 			priv->wakeup_timer = info->beacon_int *
 					     info->dtim_period * 5;
 			p54_setup_mac(priv);
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 5905803893b8..5aaf34f6f43b 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -2118,9 +2118,9 @@ static void mac80211_hwsim_bss_info_changed(struct ieee80211_hw *hw,
 
 	if (changed & BSS_CHANGED_ASSOC) {
 		wiphy_dbg(hw->wiphy, "  ASSOC: assoc=%d aid=%d\n",
-			  info->assoc, info->aid);
-		vp->assoc = info->assoc;
-		vp->aid = info->aid;
+			  vif->cfg.assoc, vif->cfg.aid);
+		vp->assoc = vif->cfg.assoc;
+		vp->aid = vif->cfg.aid;
 	}
 
 	if (changed & BSS_CHANGED_BEACON_ENABLED) {
diff --git a/drivers/net/wireless/marvell/mwl8k.c b/drivers/net/wireless/marvell/mwl8k.c
index 36c24d17136c..5f1bcfb5e3f6 100644
--- a/drivers/net/wireless/marvell/mwl8k.c
+++ b/drivers/net/wireless/marvell/mwl8k.c
@@ -3250,7 +3250,7 @@ mwl8k_cmd_set_aid(struct ieee80211_hw *hw,
 
 	cmd->header.code = cpu_to_le16(MWL8K_CMD_SET_AID);
 	cmd->header.length = cpu_to_le16(sizeof(*cmd));
-	cmd->aid = cpu_to_le16(vif->bss_conf.aid);
+	cmd->aid = cpu_to_le16(vif->cfg.aid);
 	memcpy(cmd->bssid, vif->bss_conf.bssid, ETH_ALEN);
 
 	if (vif->bss_conf.use_cts_prot) {
@@ -5013,13 +5013,13 @@ mwl8k_bss_info_changed_sta(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	/*
 	 * No need to capture a beacon if we're no longer associated.
 	 */
-	if ((changed & BSS_CHANGED_ASSOC) && !vif->bss_conf.assoc)
+	if ((changed & BSS_CHANGED_ASSOC) && !vif->cfg.assoc)
 		priv->capture_beacon = false;
 
 	/*
 	 * Get the AP's legacy and MCS rates.
 	 */
-	if (vif->bss_conf.assoc) {
+	if (vif->cfg.assoc) {
 		struct ieee80211_sta *ap;
 
 		rcu_read_lock();
@@ -5085,7 +5085,7 @@ mwl8k_bss_info_changed_sta(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 			goto out;
 	}
 
-	if (vif->bss_conf.assoc && !priv->ap_fw &&
+	if (vif->cfg.assoc && !priv->ap_fw &&
 	    (changed & (BSS_CHANGED_ASSOC | BSS_CHANGED_ERP_CTS_PROT |
 			BSS_CHANGED_HT))) {
 		rc = mwl8k_cmd_set_aid(hw, vif, ap_legacy_rates);
@@ -5093,7 +5093,7 @@ mwl8k_bss_info_changed_sta(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 			goto out;
 	}
 
-	if (vif->bss_conf.assoc &&
+	if (vif->cfg.assoc &&
 	    (changed & (BSS_CHANGED_ASSOC | BSS_CHANGED_BEACON_INT))) {
 		/*
 		 * Finalize the join.  Tell rx handler to process
diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/main.c b/drivers/net/wireless/mediatek/mt76/mt7603/main.c
index 91425b454cae..1f14ecda1f55 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7603/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7603/main.c
@@ -305,7 +305,7 @@ mt7603_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	mutex_lock(&dev->mt76.mutex);
 
 	if (changed & (BSS_CHANGED_ASSOC | BSS_CHANGED_BSSID)) {
-		if (info->assoc || info->ibss_joined) {
+		if (vif->cfg.assoc || vif->cfg.ibss_joined) {
 			mt76_wr(dev, MT_BSSID0(mvif->idx),
 				get_unaligned_le32(info->bssid));
 			mt76_wr(dev, MT_BSSID1(mvif->idx),
diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/main.c b/drivers/net/wireless/mediatek/mt76/mt7615/main.c
index a9c9b97d173e..d992fdea0f32 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7615/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7615/main.c
@@ -616,7 +616,7 @@ static void mt7615_bss_info_changed(struct ieee80211_hw *hw,
 	}
 
 	if (changed & BSS_CHANGED_ASSOC)
-		mt7615_mac_set_beacon_filter(phy, vif, info->assoc);
+		mt7615_mac_set_beacon_filter(phy, vif, vif->cfg.assoc);
 
 	mt7615_mutex_release(dev);
 }
diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c
index 8fb6c9d735dc..f58376733c67 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c
@@ -2530,7 +2530,7 @@ int mt7615_mcu_set_bss_pm(struct mt7615_dev *dev, struct ieee80211_vif *vif,
 		u8 pad;
 	} req = {
 		.bss_idx = mvif->mt76.idx,
-		.aid = cpu_to_le16(vif->bss_conf.aid),
+		.aid = cpu_to_le16(vif->cfg.aid),
 		.dtim_period = vif->bss_conf.dtim_period,
 		.bcn_interval = cpu_to_le16(vif->bss_conf.beacon_int),
 	};
diff --git a/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c b/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c
index faa279bbbcb2..d3da54e6670b 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c
@@ -402,7 +402,7 @@ void mt76_connac_mcu_sta_basic_tlv(struct sk_buff *skb,
 		else
 			conn_type = CONNECTION_INFRA_AP;
 		basic->conn_type = cpu_to_le32(conn_type);
-		basic->aid = cpu_to_le16(vif->bss_conf.aid);
+		basic->aid = cpu_to_le16(vif->cfg.aid);
 		break;
 	case NL80211_IFTYPE_ADHOC:
 		basic->conn_type = cpu_to_le32(CONNECTION_IBSS_ADHOC);
@@ -546,7 +546,7 @@ void mt76_connac_mcu_wtbl_generic_tlv(struct mt76_dev *dev,
 
 	if (sta) {
 		if (vif->type == NL80211_IFTYPE_STATION)
-			generic->partial_aid = cpu_to_le16(vif->bss_conf.aid);
+			generic->partial_aid = cpu_to_le16(vif->cfg.aid);
 		else
 			generic->partial_aid = cpu_to_le16(sta->aid);
 		memcpy(generic->peer_addr, sta->addr, ETH_ALEN);
@@ -2157,8 +2157,10 @@ int mt76_connac_mcu_update_arp_filter(struct mt76_dev *dev,
 				      struct mt76_vif *vif,
 				      struct ieee80211_bss_conf *info)
 {
+	struct ieee80211_vif *mvif = container_of(info, struct ieee80211_vif,
+						  bss_conf);
 	struct sk_buff *skb;
-	int i, len = min_t(int, info->arp_addr_cnt,
+	int i, len = min_t(int, mvif->cfg.arp_addr_cnt,
 			   IEEE80211_BSS_ARP_ADDR_LIST_LEN);
 	struct {
 		struct {
@@ -2186,7 +2188,7 @@ int mt76_connac_mcu_update_arp_filter(struct mt76_dev *dev,
 
 	skb_put_data(skb, &req_hdr, sizeof(req_hdr));
 	for (i = 0; i < len; i++)
-		skb_put_data(skb, &info->arp_addr_list[i], sizeof(__be32));
+		skb_put_data(skb, &mvif->cfg.arp_addr_list[i], sizeof(__be32));
 
 	return mt76_mcu_skb_send_msg(dev, skb, MCU_UNI_CMD(OFFLOAD), true);
 }
diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/main.c b/drivers/net/wireless/mediatek/mt76/mt7915/main.c
index 710ca757fb52..05327b0b6fc3 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7915/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7915/main.c
@@ -593,7 +593,7 @@ static void mt7915_bss_info_changed(struct ieee80211_hw *hw,
 	}
 
 	if (changed & BSS_CHANGED_ASSOC) {
-		mt7915_mcu_add_bss_info(phy, vif, info->assoc);
+		mt7915_mcu_add_bss_info(phy, vif, vif->cfg.assoc);
 		mt7915_mcu_add_obss_spr(dev, vif, info->he_obss_pd.enable);
 	}
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
index 80279f342109..8532033794bd 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
@@ -678,7 +678,7 @@ static void mt7921_bss_info_changed(struct ieee80211_hw *hw,
 		mt7921_mcu_sta_update(dev, NULL, vif, true,
 				      MT76_STA_INFO_STATE_ASSOC);
 		if (dev->pm.enable)
-			mt7921_mcu_set_beacon_filter(dev, vif, info->assoc);
+			mt7921_mcu_set_beacon_filter(dev, vif, vif->cfg.assoc);
 	}
 
 	if (changed & BSS_CHANGED_ARP_FILTER) {
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c
index 12bab18c4171..64fc400bd981 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c
@@ -1036,7 +1036,7 @@ mt7921_mcu_set_bss_pm(struct mt7921_dev *dev, struct ieee80211_vif *vif,
 		u8 pad;
 	} req = {
 		.bss_idx = mvif->mt76.idx,
-		.aid = cpu_to_le16(vif->bss_conf.aid),
+		.aid = cpu_to_le16(vif->cfg.aid),
 		.dtim_period = vif->bss_conf.dtim_period,
 		.bcn_interval = cpu_to_le16(vif->bss_conf.beacon_int),
 	};
diff --git a/drivers/net/wireless/mediatek/mt7601u/phy.c b/drivers/net/wireless/mediatek/mt7601u/phy.c
index 8a00f6a75ca9..d4cd2215aba9 100644
--- a/drivers/net/wireless/mediatek/mt7601u/phy.c
+++ b/drivers/net/wireless/mediatek/mt7601u/phy.c
@@ -1097,7 +1097,10 @@ static void mt7601u_phy_freq_cal(struct work_struct *work)
 void mt7601u_phy_con_cal_onoff(struct mt7601u_dev *dev,
 			       struct ieee80211_bss_conf *info)
 {
-	if (!info->assoc)
+	struct ieee80211_vif *vif = container_of(info, struct ieee80211_vif,
+						 bss_conf);
+
+	if (!vif->cfg.assoc)
 		cancel_delayed_work_sync(&dev->freq_cal.work);
 
 	/* Start/stop collecting beacon data */
@@ -1108,10 +1111,10 @@ void mt7601u_phy_con_cal_onoff(struct mt7601u_dev *dev,
 	spin_unlock_bh(&dev->con_mon_lock);
 
 	dev->freq_cal.freq = dev->ee->rf_freq_off;
-	dev->freq_cal.enabled = info->assoc;
+	dev->freq_cal.enabled = vif->cfg.assoc;
 	dev->freq_cal.adjusting = false;
 
-	if (info->assoc)
+	if (vif->cfg.assoc)
 		ieee80211_queue_delayed_work(dev->hw, &dev->freq_cal.work,
 					     MT_FREQ_CAL_INIT_DELAY);
 }
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00config.c b/drivers/net/wireless/ralink/rt2x00/rt2x00config.c
index 6bafdd991171..f895f560a185 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2x00config.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2x00config.c
@@ -70,6 +70,8 @@ void rt2x00lib_config_erp(struct rt2x00_dev *rt2x00dev,
 			  struct ieee80211_bss_conf *bss_conf,
 			  u32 changed)
 {
+	struct ieee80211_vif *vif = container_of(bss_conf, struct ieee80211_vif,
+						 bss_conf);
 	struct rt2x00lib_erp erp;
 
 	memset(&erp, 0, sizeof(erp));
@@ -87,7 +89,7 @@ void rt2x00lib_config_erp(struct rt2x00_dev *rt2x00dev,
 	erp.beacon_int = bss_conf->beacon_int;
 
 	/* Update the AID, this is needed for dynamic PS support */
-	rt2x00dev->aid = bss_conf->assoc ? bss_conf->aid : 0;
+	rt2x00dev->aid = vif->cfg.assoc ? vif->cfg.aid : 0;
 	rt2x00dev->last_beacon = bss_conf->sync_tsf;
 
 	/* Update global beacon interval time, this is needed for PS support */
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c b/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c
index dea5babd30fe..660554a01894 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c
@@ -645,7 +645,7 @@ void rt2x00mac_bss_info_changed(struct ieee80211_hw *hw,
 	if (changes & BSS_CHANGED_ASSOC) {
 		rt2x00dev->link.count = 0;
 
-		if (bss_conf->assoc)
+		if (vif->cfg.assoc)
 			rt2x00dev->intf_associated++;
 		else
 			rt2x00dev->intf_associated--;
diff --git a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c
index 8b2ca9e8eac6..0239e12ec8a5 100644
--- a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c
+++ b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c
@@ -4570,11 +4570,11 @@ rtl8xxxu_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	rarpt = &priv->ra_report;
 
 	if (changed & BSS_CHANGED_ASSOC) {
-		dev_dbg(dev, "Changed ASSOC: %i!\n", bss_conf->assoc);
+		dev_dbg(dev, "Changed ASSOC: %i!\n", vif->cfg.assoc);
 
 		rtl8xxxu_set_linktype(priv, vif->type);
 
-		if (bss_conf->assoc) {
+		if (vif->cfg.assoc) {
 			u32 ramask;
 			int sgi = 0;
 			u8 highest_rate;
@@ -4639,7 +4639,7 @@ rtl8xxxu_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 
 			/* joinbss sequence */
 			rtl8xxxu_write16(priv, REG_BCN_PSR_RPT,
-					 0xc000 | bss_conf->aid);
+					 0xc000 | vif->cfg.aid);
 
 			priv->fops->report_connect(priv, 0, true);
 		} else {
@@ -5405,7 +5405,7 @@ void rtl8723bu_handle_bt_inquiry(struct rtl8xxxu_priv *priv)
 
 	vif = priv->vif;
 	btcoex = &priv->bt_coex;
-	wifi_connected = (vif && vif->bss_conf.assoc);
+	wifi_connected = (vif && vif->cfg.assoc);
 
 	if (!wifi_connected) {
 		rtl8723bu_set_ps_tdma(priv, 0x8, 0x0, 0x0, 0x0, 0x0);
@@ -5431,7 +5431,7 @@ void rtl8723bu_handle_bt_info(struct rtl8xxxu_priv *priv)
 
 	vif = priv->vif;
 	btcoex = &priv->bt_coex;
-	wifi_connected = (vif && vif->bss_conf.assoc);
+	wifi_connected = (vif && vif->cfg.assoc);
 
 	if (wifi_connected) {
 		u32 val32 = 0;
diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c b/drivers/net/wireless/realtek/rtlwifi/core.c
index 99a1d91ced5a..5177eb02740e 100644
--- a/drivers/net/wireless/realtek/rtlwifi/core.c
+++ b/drivers/net/wireless/realtek/rtlwifi/core.c
@@ -1094,7 +1094,7 @@ static void rtl_op_bss_info_changed(struct ieee80211_hw *hw,
 	if (changed & BSS_CHANGED_ASSOC) {
 		u8 mstatus;
 
-		if (bss_conf->assoc) {
+		if (vif->cfg.assoc) {
 			struct ieee80211_sta *sta = NULL;
 			u8 keep_alive = 10;
 
@@ -1111,7 +1111,7 @@ static void rtl_op_bss_info_changed(struct ieee80211_hw *hw,
 
 			mac->link_state = MAC80211_LINKED;
 			mac->cnt_after_linked = 0;
-			mac->assoc_id = bss_conf->aid;
+			mac->assoc_id = vif->cfg.aid;
 			memcpy(mac->bssid, bss_conf->bssid, ETH_ALEN);
 
 			if (rtlpriv->cfg->ops->linked_set_reg)
diff --git a/drivers/net/wireless/realtek/rtw88/bf.c b/drivers/net/wireless/realtek/rtw88/bf.c
index e76841d3417b..76c7f3257dd3 100644
--- a/drivers/net/wireless/realtek/rtw88/bf.c
+++ b/drivers/net/wireless/realtek/rtw88/bf.c
@@ -67,7 +67,7 @@ void rtw_bf_assoc(struct rtw_dev *rtwdev, struct ieee80211_vif *vif,
 		ether_addr_copy(bfee->mac_addr, bssid);
 		bfee->role = RTW_BFEE_MU;
 		bfee->p_aid = (bssid[5] << 1) | (bssid[4] >> 7);
-		bfee->aid = bss_conf->aid;
+		bfee->aid = vif->cfg.aid;
 		bfinfo->bfer_mu_cnt++;
 
 		rtw_chip_config_bfee(rtwdev, rtwvif, bfee, true);
diff --git a/drivers/net/wireless/realtek/rtw88/mac80211.c b/drivers/net/wireless/realtek/rtw88/mac80211.c
index 4310362dc333..c5954c18862d 100644
--- a/drivers/net/wireless/realtek/rtw88/mac80211.c
+++ b/drivers/net/wireless/realtek/rtw88/mac80211.c
@@ -369,12 +369,12 @@ static void rtw_ops_bss_info_changed(struct ieee80211_hw *hw,
 
 	if (changed & BSS_CHANGED_ASSOC) {
 		rtw_vif_assoc_changed(rtwvif, conf);
-		if (conf->assoc) {
+		if (vif->cfg.assoc) {
 			rtw_coex_connect_notify(rtwdev, COEX_ASSOCIATE_FINISH);
 
 			rtw_fw_download_rsvd_page(rtwdev);
 			rtw_send_rsvd_page_h2c(rtwdev);
-			rtw_coex_media_status_notify(rtwdev, conf->assoc);
+			rtw_coex_media_status_notify(rtwdev, vif->cfg.assoc);
 			if (rtw_bf_support)
 				rtw_bf_assoc(rtwdev, vif, conf);
 			rtw_store_op_chan(rtwdev);
diff --git a/drivers/net/wireless/realtek/rtw88/main.c b/drivers/net/wireless/realtek/rtw88/main.c
index a44b1810165d..f78b9f28d5f6 100644
--- a/drivers/net/wireless/realtek/rtw88/main.c
+++ b/drivers/net/wireless/realtek/rtw88/main.c
@@ -171,7 +171,7 @@ static void rtw_vif_watch_dog_iter(void *data, u8 *mac,
 	struct rtw_vif *rtwvif = (struct rtw_vif *)vif->drv_priv;
 
 	if (vif->type == NL80211_IFTYPE_STATION)
-		if (vif->bss_conf.assoc)
+		if (vif->cfg.assoc)
 			iter_data->rtwvif = rtwvif;
 
 	rtw_dynamic_csi_rate(iter_data->rtwdev, rtwvif);
@@ -525,8 +525,13 @@ EXPORT_SYMBOL(rtw_dump_reg);
 void rtw_vif_assoc_changed(struct rtw_vif *rtwvif,
 			   struct ieee80211_bss_conf *conf)
 {
-	if (conf && conf->assoc) {
-		rtwvif->aid = conf->aid;
+	struct ieee80211_vif *vif = NULL;
+
+	if (conf)
+		vif = container_of(conf, struct ieee80211_vif, bss_conf);
+
+	if (conf && vif->cfg.assoc) {
+		rtwvif->aid = vif->cfg.aid;
 		rtwvif->net_type = RTW_NET_MGD_LINKED;
 	} else {
 		rtwvif->aid = 0;
@@ -1591,7 +1596,7 @@ static void rtw_vif_smps_iter(void *data, u8 *mac,
 {
 	struct rtw_dev *rtwdev = (struct rtw_dev *)data;
 
-	if (vif->type != NL80211_IFTYPE_STATION || !vif->bss_conf.assoc)
+	if (vif->type != NL80211_IFTYPE_STATION || !vif->cfg.assoc)
 		return;
 
 	if (rtwdev->hal.txrx_1ss)
diff --git a/drivers/net/wireless/realtek/rtw89/cam.c b/drivers/net/wireless/realtek/rtw89/cam.c
index db3c55f0ccd0..cfa939e26ffd 100644
--- a/drivers/net/wireless/realtek/rtw89/cam.c
+++ b/drivers/net/wireless/realtek/rtw89/cam.c
@@ -701,7 +701,7 @@ void rtw89_cam_fill_addr_cam_info(struct rtw89_dev *rtwdev,
 	FWCMD_SET_ADDR_FRM_TGT_IND(cmd, rtwvif->frm_tgt_ind);
 	FWCMD_SET_ADDR_MACID(cmd, rtwsta ? rtwsta->mac_id : rtwvif->mac_id);
 	if (rtwvif->net_type == RTW89_NET_TYPE_INFRA)
-		FWCMD_SET_ADDR_AID12(cmd, vif->bss_conf.aid & 0xfff);
+		FWCMD_SET_ADDR_AID12(cmd, vif->cfg.aid & 0xfff);
 	else if (rtwvif->net_type == RTW89_NET_TYPE_AP_MODE)
 		FWCMD_SET_ADDR_AID12(cmd, sta ? sta->aid & 0xfff : 0);
 	FWCMD_SET_ADDR_WOL_PATTERN(cmd, rtwvif->wowlan_pattern);
diff --git a/drivers/net/wireless/realtek/rtw89/core.c b/drivers/net/wireless/realtek/rtw89/core.c
index d2f2a3d65ef6..bd7b66e98480 100644
--- a/drivers/net/wireless/realtek/rtw89/core.c
+++ b/drivers/net/wireless/realtek/rtw89/core.c
@@ -1374,7 +1374,7 @@ static void rtw89_stats_trigger_frame(struct rtw89_dev *rtwdev,
 		if (aid == RTW89_TF_PAD)
 			break;
 
-		if (aid == vif->bss_conf.aid) {
+		if (aid == vif->cfg.aid) {
 			rtwvif->stats.rx_tf_acc++;
 			rtwdev->stats.rx_tf_acc++;
 			break;
diff --git a/drivers/net/wireless/realtek/rtw89/core.h b/drivers/net/wireless/realtek/rtw89/core.h
index 239d47d0ec6d..cf60adaed64a 100644
--- a/drivers/net/wireless/realtek/rtw89/core.h
+++ b/drivers/net/wireless/realtek/rtw89/core.h
@@ -3715,7 +3715,7 @@ void rtw89_chip_cfg_txpwr_ul_tb_offset(struct rtw89_dev *rtwdev,
 	struct rtw89_vif *rtwvif = (struct rtw89_vif *)vif->drv_priv;
 	const struct rtw89_chip_info *chip = rtwdev->chip;
 
-	if (!vif->bss_conf.he_support || !vif->bss_conf.assoc)
+	if (!vif->bss_conf.he_support || !vif->cfg.assoc)
 		return;
 
 	if (chip->ops->set_txpwr_ul_tb_offset)
diff --git a/drivers/net/wireless/realtek/rtw89/mac80211.c b/drivers/net/wireless/realtek/rtw89/mac80211.c
index 6d0c62c545a7..ac8b9bf2b0bb 100644
--- a/drivers/net/wireless/realtek/rtw89/mac80211.c
+++ b/drivers/net/wireless/realtek/rtw89/mac80211.c
@@ -345,7 +345,7 @@ static void rtw89_ops_bss_info_changed(struct ieee80211_hw *hw,
 	rtw89_leave_ps_mode(rtwdev);
 
 	if (changed & BSS_CHANGED_ASSOC) {
-		if (conf->assoc) {
+		if (vif->cfg.assoc) {
 			rtw89_station_mode_sta_assoc(rtwdev, vif, conf);
 			rtw89_phy_set_bss_color(rtwdev, vif);
 			rtw89_chip_cfg_txpwr_ul_tb_offset(rtwdev, vif);
diff --git a/drivers/net/wireless/realtek/rtw89/phy.c b/drivers/net/wireless/realtek/rtw89/phy.c
index 217aacb6e8c1..af89c011b0db 100644
--- a/drivers/net/wireless/realtek/rtw89/phy.c
+++ b/drivers/net/wireless/realtek/rtw89/phy.c
@@ -3630,7 +3630,7 @@ void rtw89_phy_set_bss_color(struct rtw89_dev *rtwdev, struct ieee80211_vif *vif
 	enum rtw89_phy_idx phy_idx = RTW89_PHY_0;
 	u8 bss_color;
 
-	if (!vif->bss_conf.he_support || !vif->bss_conf.assoc)
+	if (!vif->bss_conf.he_support || !vif->cfg.assoc)
 		return;
 
 	bss_color = vif->bss_conf.he_bss_color.color;
@@ -3640,7 +3640,7 @@ void rtw89_phy_set_bss_color(struct rtw89_dev *rtwdev, struct ieee80211_vif *vif
 	rtw89_phy_write32_idx(rtwdev, R_BSS_CLR_MAP, B_BSS_CLR_MAP_TGT, bss_color,
 			      phy_idx);
 	rtw89_phy_write32_idx(rtwdev, R_BSS_CLR_MAP, B_BSS_CLR_MAP_STAID,
-			      vif->bss_conf.aid, phy_idx);
+			      vif->cfg.aid, phy_idx);
 }
 
 static void
diff --git a/drivers/net/wireless/rsi/rsi_91x_core.c b/drivers/net/wireless/rsi/rsi_91x_core.c
index 6bfaab48b507..0f3a80f66b61 100644
--- a/drivers/net/wireless/rsi/rsi_91x_core.c
+++ b/drivers/net/wireless/rsi/rsi_91x_core.c
@@ -420,7 +420,8 @@ void rsi_core_xmit(struct rsi_common *common, struct sk_buff *skb)
 			rsi_hal_send_sta_notify_frame(common,
 						      RSI_IFTYPE_STATION,
 						      STA_CONNECTED, bss->bssid,
-						      bss->qos, bss->aid, 0,
+						      bss->qos, vif->cfg.aid,
+						      0,
 						      vif);
 		}
 
diff --git a/drivers/net/wireless/rsi/rsi_91x_hal.c b/drivers/net/wireless/rsi/rsi_91x_hal.c
index dca81a4bbdd7..d3634ad986ac 100644
--- a/drivers/net/wireless/rsi/rsi_91x_hal.c
+++ b/drivers/net/wireless/rsi/rsi_91x_hal.c
@@ -295,7 +295,6 @@ int rsi_send_data_pkt(struct rsi_common *common, struct sk_buff *skb)
 	struct rsi_hw *adapter = common->priv;
 	struct ieee80211_vif *vif;
 	struct ieee80211_tx_info *info;
-	struct ieee80211_bss_conf *bss;
 	int status = -EINVAL;
 
 	if (!skb)
@@ -307,11 +306,10 @@ int rsi_send_data_pkt(struct rsi_common *common, struct sk_buff *skb)
 	if (!info->control.vif)
 		goto err;
 	vif = info->control.vif;
-	bss = &vif->bss_conf;
 
 	if (((vif->type == NL80211_IFTYPE_STATION) ||
 	     (vif->type == NL80211_IFTYPE_P2P_CLIENT)) &&
-	    (!bss->assoc))
+	    (!vif->cfg.assoc))
 		goto err;
 
 	status = rsi_send_pkt_to_bus(common, skb);
@@ -367,7 +365,8 @@ int rsi_send_mgmt_pkt(struct rsi_common *common,
 	xtend_desc = (struct rsi_xtended_desc *)&skb->data[FRAME_DESC_SZ];
 
 	/* Indicate to firmware to give cfm for probe */
-	if (ieee80211_is_probe_req(wh->frame_control) && !bss->assoc) {
+	if (ieee80211_is_probe_req(wh->frame_control) &&
+	    !info->control.vif->cfg.assoc) {
 		rsi_dbg(INFO_ZONE,
 			"%s: blocking mgmt queue\n", __func__);
 		mgmt_desc->misc_flags = RSI_DESC_REQUIRE_CFM_TO_HOST;
diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
index f01e82b90c07..d4b3834388ab 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
@@ -237,7 +237,6 @@ static int rsi_mac80211_hw_scan_start(struct ieee80211_hw *hw,
 	struct cfg80211_scan_request *scan_req = &hw_req->req;
 	struct rsi_hw *adapter = hw->priv;
 	struct rsi_common *common = adapter->priv;
-	struct ieee80211_bss_conf *bss = &vif->bss_conf;
 
 	rsi_dbg(INFO_ZONE, "***** Hardware scan start *****\n");
 	common->mac_ops_resumed = false;
@@ -256,7 +255,7 @@ static int rsi_mac80211_hw_scan_start(struct ieee80211_hw *hw,
 	/* If STA is not connected, return with special value 1, in order
 	 * to start sw_scan in mac80211
 	 */
-	if (!bss->assoc)
+	if (!vif->cfg.assoc)
 		return 1;
 
 	mutex_lock(&common->mutex);
@@ -579,7 +578,6 @@ static int rsi_channel_change(struct ieee80211_hw *hw)
 	struct ieee80211_channel *curchan = hw->conf.chandef.chan;
 	u16 channel = curchan->hw_value;
 	struct ieee80211_vif *vif;
-	struct ieee80211_bss_conf *bss;
 	bool assoc = false;
 	int i;
 
@@ -593,8 +591,7 @@ static int rsi_channel_change(struct ieee80211_hw *hw)
 		if (!vif)
 			continue;
 		if (vif->type == NL80211_IFTYPE_STATION) {
-			bss = &vif->bss_conf;
-			if (bss->assoc) {
+			if (vif->cfg.assoc) {
 				assoc = true;
 				break;
 			}
@@ -700,7 +697,7 @@ static int rsi_mac80211_config(struct ieee80211_hw *hw,
 			}
 			if ((vif->type == NL80211_IFTYPE_STATION ||
 			     vif->type == NL80211_IFTYPE_P2P_CLIENT) &&
-			    (!sta_vif || vif->bss_conf.assoc))
+			    (!sta_vif || vif->cfg.assoc))
 				sta_vif = vif;
 		}
 		if (set_ps && sta_vif) {
@@ -797,8 +794,8 @@ static void rsi_mac80211_bss_info_changed(struct ieee80211_hw *hw,
 	mutex_lock(&common->mutex);
 	if (changed & BSS_CHANGED_ASSOC) {
 		rsi_dbg(INFO_ZONE, "%s: Changed Association status: %d\n",
-			__func__, bss_conf->assoc);
-		if (bss_conf->assoc) {
+			__func__, vif->cfg.assoc);
+		if (vif->cfg.assoc) {
 			/* Send the RX filter frame */
 			rx_filter_word = (ALLOW_DATA_ASSOC_PEER |
 					  ALLOW_CTRL_ASSOC_PEER |
@@ -807,17 +804,17 @@ static void rsi_mac80211_bss_info_changed(struct ieee80211_hw *hw,
 		}
 		rsi_inform_bss_status(common,
 				      RSI_OPMODE_STA,
-				      bss_conf->assoc,
+				      vif->cfg.assoc,
 				      bss_conf->bssid,
 				      bss_conf->qos,
-				      bss_conf->aid,
+				      vif->cfg.aid,
 				      NULL, 0,
 				      bss_conf->assoc_capability, vif);
 		adapter->ps_info.dtim_interval_duration = bss->dtim_period;
 		adapter->ps_info.listen_interval = conf->listen_interval;
 
 		/* If U-APSD is updated, send ps parameters to firmware */
-		if (bss->assoc) {
+		if (vif->cfg.assoc) {
 			if (common->uapsd_bitmap) {
 				rsi_dbg(INFO_ZONE, "Configuring UAPSD\n");
 				rsi_conf_uapsd(adapter, vif);
@@ -1359,7 +1356,7 @@ static void rsi_fill_rx_status(struct ieee80211_hw *hw,
 	if (!bss)
 		return;
 	/* CQM only for connected AP beacons, the RSSI is a weighted avg */
-	if (bss->assoc && !(memcmp(bss->bssid, hdr->addr2, ETH_ALEN))) {
+	if (vif->cfg.assoc && !(memcmp(bss->bssid, hdr->addr2, ETH_ALEN))) {
 		if (ieee80211_is_beacon(hdr->frame_control))
 			rsi_perform_cqm(common, hdr->addr2, rxs->signal, vif);
 	}
@@ -1737,7 +1734,7 @@ static void rsi_resume_conn_channel(struct rsi_common *common)
 		}
 		if (((vif->type == NL80211_IFTYPE_STATION) ||
 		     (vif->type == NL80211_IFTYPE_P2P_CLIENT)) &&
-		    vif->bss_conf.assoc) {
+		    vif->cfg.assoc) {
 			rsi_switch_channel(adapter, vif);
 			break;
 		}
@@ -1862,17 +1859,15 @@ static u16 rsi_wow_map_triggers(struct rsi_common *common,
 int rsi_config_wowlan(struct rsi_hw *adapter, struct cfg80211_wowlan *wowlan)
 {
 	struct rsi_common *common = adapter->priv;
+	struct ieee80211_vif *vif = adapter->vifs[0];
 	u16 triggers = 0;
 	u16 rx_filter_word = 0;
-	struct ieee80211_bss_conf *bss = NULL;
 
 	rsi_dbg(INFO_ZONE, "Config WoWLAN to device\n");
 
-	if (!adapter->vifs[0])
+	if (!vif)
 		return -EINVAL;
 
-	bss = &adapter->vifs[0]->bss_conf;
-
 	if (WARN_ON(!wowlan)) {
 		rsi_dbg(ERR_ZONE, "WoW triggers not enabled\n");
 		return -EINVAL;
@@ -1884,7 +1879,7 @@ int rsi_config_wowlan(struct rsi_hw *adapter, struct cfg80211_wowlan *wowlan)
 		rsi_dbg(ERR_ZONE, "%s:No valid WoW triggers\n", __func__);
 		return -EINVAL;
 	}
-	if (!bss->assoc) {
+	if (!vif->cfg.assoc) {
 		rsi_dbg(ERR_ZONE,
 			"Cannot configure WoWLAN (Station not connected)\n");
 		common->wow_flags |= RSI_WOW_NO_CONNECTION;
diff --git a/drivers/net/wireless/rsi/rsi_91x_mgmt.c b/drivers/net/wireless/rsi/rsi_91x_mgmt.c
index c14689266fec..1b309e47a1f1 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mgmt.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mgmt.c
@@ -1635,7 +1635,6 @@ int rsi_send_ps_request(struct rsi_hw *adapter, bool enable,
 			struct ieee80211_vif *vif)
 {
 	struct rsi_common *common = adapter->priv;
-	struct ieee80211_bss_conf *bss = &vif->bss_conf;
 	struct rsi_request_ps *ps;
 	struct rsi_ps_info *ps_info;
 	struct sk_buff *skb;
@@ -1669,7 +1668,7 @@ int rsi_send_ps_request(struct rsi_hw *adapter, bool enable,
 	ps->ps_sleep.sleep_duration =
 		cpu_to_le32(ps_info->deep_sleep_wakeup_period);
 
-	if (bss->assoc)
+	if (vif->cfg.assoc)
 		ps->ps_sleep.connected_sleep = RSI_CONNECTED_SLEEP;
 	else
 		ps->ps_sleep.connected_sleep = RSI_DEEP_SLEEP;
diff --git a/drivers/net/wireless/silabs/wfx/hif_tx.c b/drivers/net/wireless/silabs/wfx/hif_tx.c
index d35dd940d968..9402503fbde3 100644
--- a/drivers/net/wireless/silabs/wfx/hif_tx.c
+++ b/drivers/net/wireless/silabs/wfx/hif_tx.c
@@ -282,6 +282,8 @@ int wfx_hif_stop_scan(struct wfx_vif *wvif)
 int wfx_hif_join(struct wfx_vif *wvif, const struct ieee80211_bss_conf *conf,
 		 struct ieee80211_channel *channel, const u8 *ssid, int ssid_len)
 {
+	struct ieee80211_vif *vif = container_of(conf, struct ieee80211_vif,
+						 bss_conf);
 	int ret;
 	struct wfx_hif_msg *hif;
 	struct wfx_hif_req_join *body = wfx_alloc_hif(sizeof(*body), &hif);
@@ -289,10 +291,10 @@ int wfx_hif_join(struct wfx_vif *wvif, const struct ieee80211_bss_conf *conf,
 	WARN_ON(!conf->beacon_int);
 	WARN_ON(!conf->basic_rates);
 	WARN_ON(sizeof(body->ssid) < ssid_len);
-	WARN(!conf->ibss_joined && !ssid_len, "joining an unknown BSS");
+	WARN(!vif->cfg.ibss_joined && !ssid_len, "joining an unknown BSS");
 	if (!hif)
 		return -ENOMEM;
-	body->infrastructure_bss_mode = !conf->ibss_joined;
+	body->infrastructure_bss_mode = !vif->cfg.ibss_joined;
 	body->short_preamble = conf->use_short_preamble;
 	body->probe_for_join = !(channel->flags & IEEE80211_CHAN_NO_IR);
 	body->channel_number = channel->hw_value;
@@ -417,6 +419,8 @@ int wfx_hif_set_pm(struct wfx_vif *wvif, bool ps, int dynamic_ps_timeout)
 int wfx_hif_start(struct wfx_vif *wvif, const struct ieee80211_bss_conf *conf,
 		  const struct ieee80211_channel *channel)
 {
+        struct ieee80211_vif *vif = container_of(conf, struct ieee80211_vif,
+						 bss_conf);
 	int ret;
 	struct wfx_hif_msg *hif;
 	struct wfx_hif_req_start *body = wfx_alloc_hif(sizeof(*body), &hif);
@@ -429,8 +433,8 @@ int wfx_hif_start(struct wfx_vif *wvif, const struct ieee80211_bss_conf *conf,
 	body->channel_number = channel->hw_value;
 	body->beacon_interval = cpu_to_le32(conf->beacon_int);
 	body->basic_rate_set = cpu_to_le32(wfx_rate_mask_to_hw(wvif->wdev, conf->basic_rates));
-	body->ssid_length = conf->ssid_len;
-	memcpy(body->ssid, conf->ssid, conf->ssid_len);
+	body->ssid_length = vif->cfg.ssid_len;
+	memcpy(body->ssid, vif->cfg.ssid, vif->cfg.ssid_len);
 	wfx_fill_header(hif, wvif->id, HIF_REQ_ID_START, sizeof(*body));
 	ret = wfx_cmd_send(wvif->wdev, hif, NULL, 0, false);
 	kfree(hif);
diff --git a/drivers/net/wireless/silabs/wfx/sta.c b/drivers/net/wireless/silabs/wfx/sta.c
index 329d7f4a2b2e..79fafe8143d9 100644
--- a/drivers/net/wireless/silabs/wfx/sta.c
+++ b/drivers/net/wireless/silabs/wfx/sta.c
@@ -156,7 +156,7 @@ static int wfx_get_ps_timeout(struct wfx_vif *wvif, bool *enable_ps)
 	struct ieee80211_conf *conf = &wvif->wdev->hw->conf;
 	struct ieee80211_vif *vif = wvif_to_vif(wvif);
 
-	WARN(!vif->bss_conf.assoc && enable_ps,
+	WARN(!vif->cfg.assoc && enable_ps,
 	     "enable_ps is reliable only if associated");
 	if (wdev_to_wvif(wvif->wdev, 0)) {
 		struct wfx_vif *wvif_ch0 = wdev_to_wvif(wvif->wdev, 0);
@@ -175,7 +175,7 @@ static int wfx_get_ps_timeout(struct wfx_vif *wvif, bool *enable_ps)
 			/* It is useless to enable PS if channels are the same. */
 			if (enable_ps)
 				*enable_ps = false;
-			if (vif->bss_conf.assoc && vif->bss_conf.ps)
+			if (vif->cfg.assoc && vif->bss_conf.ps)
 				dev_info(wvif->wdev->dev, "ignoring requested PS mode");
 			return -1;
 		}
@@ -189,7 +189,7 @@ static int wfx_get_ps_timeout(struct wfx_vif *wvif, bool *enable_ps)
 	}
 	if (enable_ps)
 		*enable_ps = vif->bss_conf.ps;
-	if (vif->bss_conf.assoc && vif->bss_conf.ps)
+	if (vif->cfg.assoc && vif->bss_conf.ps)
 		return conf->dynamic_ps_timeout;
 	else
 		return -1;
@@ -201,7 +201,7 @@ int wfx_update_pm(struct wfx_vif *wvif)
 	int ps_timeout;
 	bool ps;
 
-	if (!vif->bss_conf.assoc)
+	if (!vif->cfg.assoc)
 		return 0;
 	ps_timeout = wfx_get_ps_timeout(wvif, &ps);
 	if (!ps)
@@ -417,7 +417,7 @@ static void wfx_join(struct wfx_vif *wvif)
 
 	bss = cfg80211_get_bss(wvif->wdev->hw->wiphy, wvif->channel, conf->bssid, NULL, 0,
 			       IEEE80211_BSS_TYPE_ANY, IEEE80211_PRIVACY_ANY);
-	if (!bss && !conf->ibss_joined) {
+	if (!bss && !vif->cfg.ibss_joined) {
 		wfx_tx_unlock(wvif->wdev);
 		return;
 	}
@@ -458,7 +458,7 @@ static void wfx_join_finalize(struct wfx_vif *wvif, struct ieee80211_bss_conf *i
 	bool greenfield = false;
 
 	rcu_read_lock(); /* protect sta */
-	if (info->bssid && !info->ibss_joined)
+	if (info->bssid && !vif->cfg.ibss_joined)
 		sta = ieee80211_find_sta(vif, info->bssid);
 	if (sta && sta->deflink.ht_cap.ht_supported)
 		ampdu_density = sta->deflink.ht_cap.ampdu_density;
@@ -471,7 +471,7 @@ static void wfx_join_finalize(struct wfx_vif *wvif, struct ieee80211_bss_conf *i
 	wfx_hif_set_association_mode(wvif, ampdu_density, greenfield, info->use_short_preamble);
 	wfx_hif_keep_alive_period(wvif, 0);
 	/* beacon_loss_count is defined to 7 in net/mac80211/mlme.c. Let's use the same value. */
-	wfx_hif_set_bss_params(wvif, info->aid, 7);
+	wfx_hif_set_bss_params(wvif, vif->cfg.aid, 7);
 	wfx_hif_set_beacon_wakeup_period(wvif, 1, 1);
 	wfx_update_pm(wvif);
 }
@@ -522,9 +522,9 @@ void wfx_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	}
 
 	if (changed & BSS_CHANGED_ASSOC) {
-		if (info->assoc || info->ibss_joined)
+		if (vif->cfg.assoc || vif->cfg.ibss_joined)
 			wfx_join_finalize(wvif, info);
-		else if (!info->assoc && vif->type == NL80211_IFTYPE_STATION)
+		else if (!vif->cfg.assoc && vif->type == NL80211_IFTYPE_STATION)
 			wfx_reset(wvif);
 		else
 			dev_warn(wdev->dev, "misunderstood change: ASSOC\n");
@@ -540,11 +540,11 @@ void wfx_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 
 	if (changed & BSS_CHANGED_ARP_FILTER) {
 		for (i = 0; i < HIF_MAX_ARP_IP_ADDRTABLE_ENTRIES; i++) {
-			__be32 *arp_addr = &info->arp_addr_list[i];
+			__be32 *arp_addr = &vif->cfg.arp_addr_list[i];
 
-			if (info->arp_addr_cnt > HIF_MAX_ARP_IP_ADDRTABLE_ENTRIES)
+			if (vif->cfg.arp_addr_cnt > HIF_MAX_ARP_IP_ADDRTABLE_ENTRIES)
 				arp_addr = NULL;
-			if (i >= info->arp_addr_cnt)
+			if (i >= vif->cfg.arp_addr_cnt)
 				arp_addr = NULL;
 			wfx_hif_set_arp_ipv4_filter(wvif, i, arp_addr);
 		}
diff --git a/drivers/net/wireless/st/cw1200/sta.c b/drivers/net/wireless/st/cw1200/sta.c
index 321df124d449..3f48def51ebc 100644
--- a/drivers/net/wireless/st/cw1200/sta.c
+++ b/drivers/net/wireless/st/cw1200/sta.c
@@ -1208,8 +1208,8 @@ static void cw1200_do_join(struct cw1200_common *priv)
 	struct cfg80211_bss *bss = NULL;
 	struct wsm_protected_mgmt_policy mgmt_policy;
 	struct wsm_join join = {
-		.mode = conf->ibss_joined ?
-				WSM_JOIN_MODE_IBSS : WSM_JOIN_MODE_BSS,
+		.mode = priv->vif->cfg.ibss_joined ?
+		WSM_JOIN_MODE_IBSS : WSM_JOIN_MODE_BSS,
 		.preamble_type = WSM_JOIN_PREAMBLE_LONG,
 		.probe_for_join = 1,
 		.atim_window = 0,
@@ -1230,7 +1230,7 @@ static void cw1200_do_join(struct cw1200_common *priv)
 	bss = cfg80211_get_bss(priv->hw->wiphy, priv->channel, bssid, NULL, 0,
 			       IEEE80211_BSS_TYPE_ANY, IEEE80211_PRIVACY_ANY);
 
-	if (!bss && !conf->ibss_joined) {
+	if (!bss && !priv->vif->cfg.ibss_joined) {
 		wsm_unlock_tx(priv);
 		return;
 	}
@@ -1284,7 +1284,7 @@ static void cw1200_do_join(struct cw1200_common *priv)
 		 join.bssid,
 		 join.dtim_period, priv->beacon_int);
 
-	if (!conf->ibss_joined) {
+	if (!priv->vif->cfg.ibss_joined) {
 		const u8 *ssidie;
 		rcu_read_lock();
 		ssidie = ieee80211_bss_get_ie(bss, WLAN_EID_SSID);
@@ -1302,7 +1302,7 @@ static void cw1200_do_join(struct cw1200_common *priv)
 	}
 
 	/* Enable asynchronous join calls */
-	if (!conf->ibss_joined) {
+	if (!priv->vif->cfg.ibss_joined) {
 		join.flags |= WSM_JOIN_FLAGS_FORCE;
 		join.flags |= WSM_JOIN_FLAGS_FORCE_WITH_COMPLETE_IND;
 	}
@@ -1813,15 +1813,15 @@ void cw1200_bss_info_changed(struct ieee80211_hw *dev,
 		int i;
 
 		pr_debug("[STA] BSS_CHANGED_ARP_FILTER cnt: %d\n",
-			 info->arp_addr_cnt);
+			 vif->cfg.arp_addr_cnt);
 
 		/* Currently only one IP address is supported by firmware.
 		 * In case of more IPs arp filtering will be disabled.
 		 */
-		if (info->arp_addr_cnt > 0 &&
-		    info->arp_addr_cnt <= WSM_MAX_ARP_IP_ADDRTABLE_ENTRIES) {
-			for (i = 0; i < info->arp_addr_cnt; i++) {
-				filter.ipv4addrs[i] = info->arp_addr_list[i];
+		if (vif->cfg.arp_addr_cnt > 0 &&
+		    vif->cfg.arp_addr_cnt <= WSM_MAX_ARP_IP_ADDRTABLE_ENTRIES) {
+			for (i = 0; i < vif->cfg.arp_addr_cnt; i++) {
+				filter.ipv4addrs[i] = vif->cfg.arp_addr_list[i];
 				pr_debug("[STA] addr[%d]: 0x%X\n",
 					 i, filter.ipv4addrs[i]);
 			}
@@ -1857,7 +1857,7 @@ void cw1200_bss_info_changed(struct ieee80211_hw *dev,
 
 	if (changed & BSS_CHANGED_BEACON_INT) {
 		pr_debug("CHANGED_BEACON_INT\n");
-		if (info->ibss_joined)
+		if (vif->cfg.ibss_joined)
 			do_join = true;
 		else if (priv->join_status == CW1200_JOIN_STATUS_AP)
 			cw1200_update_beaconing(priv);
@@ -1882,7 +1882,7 @@ void cw1200_bss_info_changed(struct ieee80211_hw *dev,
 	     BSS_CHANGED_BASIC_RATES |
 	     BSS_CHANGED_HT)) {
 		pr_debug("BSS_CHANGED_ASSOC\n");
-		if (info->assoc) {
+		if (vif->cfg.assoc) {
 			if (priv->join_status < CW1200_JOIN_STATUS_PRE_STA) {
 				ieee80211_connection_loss(vif);
 				mutex_unlock(&priv->conf_mutex);
@@ -1894,7 +1894,7 @@ void cw1200_bss_info_changed(struct ieee80211_hw *dev,
 			do_join = true;
 		}
 
-		if (info->assoc || info->ibss_joined) {
+		if (vif->cfg.assoc || vif->cfg.ibss_joined) {
 			struct ieee80211_sta *sta = NULL;
 			__le32 htprot = 0;
 
@@ -1904,7 +1904,7 @@ void cw1200_bss_info_changed(struct ieee80211_hw *dev,
 
 			rcu_read_lock();
 
-			if (info->bssid && !info->ibss_joined)
+			if (info->bssid && !vif->cfg.ibss_joined)
 				sta = ieee80211_find_sta(vif, info->bssid);
 			if (sta) {
 				priv->ht_info.ht_cap = sta->deflink.ht_cap;
@@ -1958,7 +1958,7 @@ void cw1200_bss_info_changed(struct ieee80211_hw *dev,
 			cancel_work_sync(&priv->unjoin_work);
 
 			priv->bss_params.beacon_lost_count = priv->cqm_beacon_loss_count;
-			priv->bss_params.aid = info->aid;
+			priv->bss_params.aid = vif->cfg.aid;
 
 			if (priv->join_dtim_period < 1)
 				priv->join_dtim_period = 1;
@@ -1973,7 +1973,7 @@ void cw1200_bss_info_changed(struct ieee80211_hw *dev,
 				 priv->association_mode.basic_rate_set);
 			wsm_set_association_mode(priv, &priv->association_mode);
 
-			if (!info->ibss_joined) {
+			if (!vif->cfg.ibss_joined) {
 				wsm_keep_alive_period(priv, 30 /* sec */);
 				wsm_set_bss_params(priv, &priv->bss_params);
 				priv->setbssparams_done = true;
@@ -2330,8 +2330,8 @@ static int cw1200_start_ap(struct cw1200_common *priv)
 
 	memset(start.ssid, 0, sizeof(start.ssid));
 	if (!conf->hidden_ssid) {
-		start.ssid_len = conf->ssid_len;
-		memcpy(start.ssid, conf->ssid, start.ssid_len);
+		start.ssid_len = priv->vif->cfg.ssid_len;
+		memcpy(start.ssid, priv->vif->cfg.ssid, start.ssid_len);
 	}
 
 	priv->beacon_int = conf->beacon_int;
diff --git a/drivers/net/wireless/st/cw1200/txrx.c b/drivers/net/wireless/st/cw1200/txrx.c
index 7de666b90ff5..fde21fca6c5e 100644
--- a/drivers/net/wireless/st/cw1200/txrx.c
+++ b/drivers/net/wireless/st/cw1200/txrx.c
@@ -1183,8 +1183,8 @@ void cw1200_rx_cb(struct cw1200_common *priv,
 
 		/* Disable beacon filter once we're associated... */
 		if (priv->disable_beacon_filter &&
-		    (priv->vif->bss_conf.assoc ||
-		     priv->vif->bss_conf.ibss_joined)) {
+		    (priv->vif->cfg.assoc ||
+		     priv->vif->cfg.ibss_joined)) {
 			priv->disable_beacon_filter = false;
 			queue_work(priv->workqueue,
 				   &priv->update_filtering_work);
diff --git a/drivers/net/wireless/ti/wl1251/main.c b/drivers/net/wireless/ti/wl1251/main.c
index a25a6143e65f..bdc93c4f54ba 100644
--- a/drivers/net/wireless/ti/wl1251/main.c
+++ b/drivers/net/wireless/ti/wl1251/main.c
@@ -1123,7 +1123,7 @@ static void wl1251_op_bss_info_changed(struct ieee80211_hw *hw,
 	}
 
 	if (changed & BSS_CHANGED_ASSOC) {
-		if (bss_conf->assoc) {
+		if (vif->cfg.assoc) {
 			wl->beacon_int = bss_conf->beacon_int;
 
 			skb = ieee80211_pspoll_get(wl->hw, wl->vif);
@@ -1137,7 +1137,7 @@ static void wl1251_op_bss_info_changed(struct ieee80211_hw *hw,
 			if (ret < 0)
 				goto out_sleep;
 
-			ret = wl1251_acx_aid(wl, bss_conf->aid);
+			ret = wl1251_acx_aid(wl, vif->cfg.aid);
 			if (ret < 0)
 				goto out_sleep;
 		} else {
@@ -1176,10 +1176,10 @@ static void wl1251_op_bss_info_changed(struct ieee80211_hw *hw,
 	}
 
 	if (changed & BSS_CHANGED_ARP_FILTER) {
-		__be32 addr = bss_conf->arp_addr_list[0];
+		__be32 addr = vif->cfg.arp_addr_list[0];
 		WARN_ON(wl->bss_type != BSS_TYPE_STA_BSS);
 
-		enable = bss_conf->arp_addr_cnt == 1 && bss_conf->assoc;
+		enable = vif->cfg.arp_addr_cnt == 1 && vif->cfg.assoc;
 		ret = wl1251_acx_arp_ip_filter(wl, enable, addr);
 		if (ret < 0)
 			goto out_sleep;
diff --git a/drivers/net/wireless/ti/wlcore/cmd.c b/drivers/net/wireless/ti/wlcore/cmd.c
index df6029ef6304..138edd28b0de 100644
--- a/drivers/net/wireless/ti/wlcore/cmd.c
+++ b/drivers/net/wireless/ti/wlcore/cmd.c
@@ -675,8 +675,8 @@ int wl12xx_cmd_role_start_ap(struct wl1271 *wl, struct wl12xx_vif *wlvif)
 		memcpy(cmd->ap.ssid, wlvif->ssid, wlvif->ssid_len);
 	} else {
 		cmd->ap.ssid_type = WL12XX_SSID_TYPE_HIDDEN;
-		cmd->ap.ssid_len = bss_conf->ssid_len;
-		memcpy(cmd->ap.ssid, bss_conf->ssid, bss_conf->ssid_len);
+		cmd->ap.ssid_len = vif->cfg.ssid_len;
+		memcpy(cmd->ap.ssid, vif->cfg.ssid, vif->cfg.ssid_len);
 	}
 
 	supported_rates = CONF_TX_ENABLED_RATES | CONF_TX_MCS_RATES |
diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c
index 21a9e3b0cbac..ad9560bd9512 100644
--- a/drivers/net/wireless/ti/wlcore/main.c
+++ b/drivers/net/wireless/ti/wlcore/main.c
@@ -2904,10 +2904,12 @@ static int wlcore_set_assoc(struct wl1271 *wl, struct wl12xx_vif *wlvif,
 			    struct ieee80211_bss_conf *bss_conf,
 			    u32 sta_rate_set)
 {
+	struct ieee80211_vif *vif = container_of(bss_conf, struct ieee80211_vif,
+						 bss_conf);
 	int ieoffset;
 	int ret;
 
-	wlvif->aid = bss_conf->aid;
+	wlvif->aid = vif->cfg.aid;
 	wlvif->channel_type = cfg80211_get_chandef_type(&bss_conf->chandef);
 	wlvif->beacon_int = bss_conf->beacon_int;
 	wlvif->wmm_enabled = bss_conf->qos;
@@ -3935,7 +3937,6 @@ static int wl1271_ap_set_probe_resp_tmpl_legacy(struct wl1271 *wl,
 					     u32 rates)
 {
 	struct wl12xx_vif *wlvif = wl12xx_vif_to_data(vif);
-	struct ieee80211_bss_conf *bss_conf = &vif->bss_conf;
 	u8 probe_rsp_templ[WL1271_CMD_TEMPL_MAX_SIZE];
 	int ssid_ie_offset, ie_offset, templ_len;
 	const u8 *ptr;
@@ -3948,7 +3949,7 @@ static int wl1271_ap_set_probe_resp_tmpl_legacy(struct wl1271 *wl,
 					       probe_rsp_len, 0,
 					       rates);
 
-	if (probe_rsp_len + bss_conf->ssid_len > WL1271_CMD_TEMPL_MAX_SIZE) {
+	if (probe_rsp_len + vif->cfg.ssid_len > WL1271_CMD_TEMPL_MAX_SIZE) {
 		wl1271_error("probe_rsp template too big");
 		return -EINVAL;
 	}
@@ -3970,12 +3971,12 @@ static int wl1271_ap_set_probe_resp_tmpl_legacy(struct wl1271 *wl,
 
 	/* insert SSID from bss_conf */
 	probe_rsp_templ[ssid_ie_offset] = WLAN_EID_SSID;
-	probe_rsp_templ[ssid_ie_offset + 1] = bss_conf->ssid_len;
+	probe_rsp_templ[ssid_ie_offset + 1] = vif->cfg.ssid_len;
 	memcpy(probe_rsp_templ + ssid_ie_offset + 2,
-	       bss_conf->ssid, bss_conf->ssid_len);
-	templ_len = ssid_ie_offset + 2 + bss_conf->ssid_len;
+	       vif->cfg.ssid, vif->cfg.ssid_len);
+	templ_len = ssid_ie_offset + 2 + vif->cfg.ssid_len;
 
-	memcpy(probe_rsp_templ + ssid_ie_offset + 2 + bss_conf->ssid_len,
+	memcpy(probe_rsp_templ + ssid_ie_offset + 2 + vif->cfg.ssid_len,
 	       ptr, probe_rsp_len - (ptr - probe_rsp_data));
 	templ_len += probe_rsp_len - (ptr - probe_rsp_data);
 
@@ -4255,15 +4256,15 @@ out:
 }
 
 static int wlcore_set_bssid(struct wl1271 *wl, struct wl12xx_vif *wlvif,
-			    struct ieee80211_bss_conf *bss_conf,
-			    u32 sta_rate_set)
+			    struct ieee80211_vif *vif, u32 sta_rate_set)
 {
+	struct ieee80211_bss_conf *bss_conf = &vif->bss_conf;
 	u32 rates;
 	int ret;
 
 	wl1271_debug(DEBUG_MAC80211,
 	     "changed_bssid: %pM, aid: %d, bcn_int: %d, brates: 0x%x sta_rate_set: 0x%x",
-	     bss_conf->bssid, bss_conf->aid,
+	     bss_conf->bssid, vif->cfg.aid,
 	     bss_conf->beacon_int,
 	     bss_conf->basic_rates, sta_rate_set);
 
@@ -4351,7 +4352,7 @@ static void wl1271_bss_info_changed_sta(struct wl1271 *wl,
 	}
 
 	if (changed & BSS_CHANGED_IBSS) {
-		if (bss_conf->ibss_joined) {
+		if (vif->cfg.ibss_joined) {
 			set_bit(WLVIF_FLAG_IBSS_JOINED, &wlvif->flags);
 			ibss_joined = true;
 		} else {
@@ -4375,7 +4376,7 @@ static void wl1271_bss_info_changed_sta(struct wl1271 *wl,
 	}
 
 	if (changed & BSS_CHANGED_IDLE && !is_ibss)
-		wl1271_sta_handle_idle(wl, wlvif, bss_conf->idle);
+		wl1271_sta_handle_idle(wl, wlvif, vif->cfg.idle);
 
 	if (changed & BSS_CHANGED_CQM) {
 		bool enable = false;
@@ -4411,7 +4412,7 @@ static void wl1271_bss_info_changed_sta(struct wl1271 *wl,
 
 	if (changed & BSS_CHANGED_BSSID) {
 		if (!is_zero_ether_addr(bss_conf->bssid)) {
-			ret = wlcore_set_bssid(wl, wlvif, bss_conf,
+			ret = wlcore_set_bssid(wl, wlvif, vif,
 					       sta_rate_set);
 			if (ret < 0)
 				goto out;
@@ -4427,9 +4428,9 @@ static void wl1271_bss_info_changed_sta(struct wl1271 *wl,
 
 	if (changed & BSS_CHANGED_IBSS) {
 		wl1271_debug(DEBUG_ADHOC, "ibss_joined: %d",
-			     bss_conf->ibss_joined);
+			     vif->cfg.ibss_joined);
 
-		if (bss_conf->ibss_joined) {
+		if (vif->cfg.ibss_joined) {
 			u32 rates = bss_conf->basic_rates;
 			wlvif->basic_rate_set =
 				wl1271_tx_enabled_rates_get(wl, rates,
@@ -4466,7 +4467,7 @@ static void wl1271_bss_info_changed_sta(struct wl1271 *wl,
 	}
 
 	if (changed & BSS_CHANGED_ASSOC) {
-		if (bss_conf->assoc) {
+		if (vif->cfg.assoc) {
 			ret = wlcore_set_assoc(wl, wlvif, bss_conf,
 					       sta_rate_set);
 			if (ret < 0)
@@ -4541,11 +4542,11 @@ static void wl1271_bss_info_changed_sta(struct wl1271 *wl,
 	/* Handle arp filtering. Done after join. */
 	if ((changed & BSS_CHANGED_ARP_FILTER) ||
 	    (!is_ibss && (changed & BSS_CHANGED_QOS))) {
-		__be32 addr = bss_conf->arp_addr_list[0];
+		__be32 addr = vif->cfg.arp_addr_list[0];
 		wlvif->sta.qos = bss_conf->qos;
 		WARN_ON(wlvif->bss_type != BSS_TYPE_STA_BSS);
 
-		if (bss_conf->arp_addr_cnt == 1 && bss_conf->assoc) {
+		if (vif->cfg.arp_addr_cnt == 1 && vif->cfg.assoc) {
 			wlvif->ip_addr = addr;
 			/*
 			 * The template should have been configured only upon
diff --git a/drivers/staging/vt6655/device_main.c b/drivers/staging/vt6655/device_main.c
index afaf331fe125..8c5de30478ec 100644
--- a/drivers/staging/vt6655/device_main.c
+++ b/drivers/staging/vt6655/device_main.c
@@ -979,7 +979,7 @@ static void vnt_check_bb_vga(struct vnt_private *priv)
 	if (priv->hw->conf.flags & IEEE80211_CONF_OFFCHANNEL)
 		return;
 
-	if (!(priv->vif->bss_conf.assoc && priv->current_rssi))
+	if (!(priv->vif->cfg.assoc && priv->current_rssi))
 		return;
 
 	RFvRSSITodBm(priv, (u8)priv->current_rssi, &dbm);
@@ -1399,7 +1399,7 @@ static void vnt_bss_info_changed(struct ieee80211_hw *hw,
 {
 	struct vnt_private *priv = hw->priv;
 
-	priv->current_aid = conf->aid;
+	priv->current_aid = vif->cfg.aid;
 
 	if (changed & BSS_CHANGED_BSSID && conf->bssid) {
 		unsigned long flags;
@@ -1468,7 +1468,7 @@ static void vnt_bss_info_changed(struct ieee80211_hw *hw,
 
 	if (changed & (BSS_CHANGED_ASSOC | BSS_CHANGED_BEACON_INFO) &&
 	    priv->op_mode != NL80211_IFTYPE_AP) {
-		if (conf->assoc && conf->beacon_rate) {
+		if (vif->cfg.assoc && conf->beacon_rate) {
 			CARDbUpdateTSF(priv, conf->beacon_rate->hw_value,
 				       conf->sync_tsf);
 
diff --git a/drivers/staging/vt6656/main_usb.c b/drivers/staging/vt6656/main_usb.c
index ae7f5916d4d6..3ab8a7bb9715 100644
--- a/drivers/staging/vt6656/main_usb.c
+++ b/drivers/staging/vt6656/main_usb.c
@@ -749,7 +749,7 @@ static void vnt_bss_info_changed(struct ieee80211_hw *hw,
 {
 	struct vnt_private *priv = hw->priv;
 
-	priv->current_aid = conf->aid;
+	priv->current_aid = vif->cfg.aid;
 
 	if (changed & BSS_CHANGED_BSSID && conf->bssid)
 		vnt_mac_set_bssid_addr(priv, (u8 *)conf->bssid);
@@ -811,7 +811,7 @@ static void vnt_bss_info_changed(struct ieee80211_hw *hw,
 
 	if (changed & (BSS_CHANGED_ASSOC | BSS_CHANGED_BEACON_INFO) &&
 	    priv->op_mode != NL80211_IFTYPE_AP) {
-		if (conf->assoc && conf->beacon_rate) {
+		if (vif->cfg.assoc && conf->beacon_rate) {
 			u16 ps_beacon_int = conf->beacon_int;
 
 			if (conf->dtim_period)
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index e3ded46f70ac..1520922d21a5 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -526,11 +526,6 @@ struct ieee80211_fils_discovery {
  *	mode only, set if the AP advertises TWT responder role)
  * @twt_protected: does this BSS support protected TWT frames
  * @twt_broadcast: does this BSS support broadcast TWT
- * @assoc: association status
- * @ibss_joined: indicates whether this station is part of an IBSS
- *	or not
- * @ibss_creator: indicates if a new IBSS network is being created
- * @aid: association ID number, valid only when @assoc is true
  * @use_cts_prot: use CTS protection
  * @use_short_preamble: use 802.11b short preamble
  * @use_short_slot: use short slot time (only relevant for ERP)
@@ -551,6 +546,8 @@ struct ieee80211_fils_discovery {
  *	IMPORTANT: These three sync_* parameters would possibly be out of sync
  *	by the time the driver will use them. The synchronized view is currently
  *	guaranteed only in certain callbacks.
+ *	Note also that this is not used with MLD associations, mac80211 doesn't
+ *	know how to track beacons for all of the links for this.
  * @beacon_int: beacon interval
  * @assoc_capability: capabilities taken from assoc resp
  * @basic_rates: bitmap of basic rates, each bit stands for an
@@ -576,21 +573,9 @@ struct ieee80211_fils_discovery {
  *	threshold event and can't be enabled simultaneously with it.
  * @cqm_rssi_high: Connection quality monitor RSSI upper threshold.
  * @cqm_rssi_hyst: Connection quality monitor RSSI hysteresis
- * @arp_addr_list: List of IPv4 addresses for hardware ARP filtering. The
- *	may filter ARP queries targeted for other addresses than listed here.
- *	The driver must allow ARP queries targeted for all address listed here
- *	to pass through. An empty list implies no ARP queries need to pass.
- * @arp_addr_cnt: Number of addresses currently on the list. Note that this
- *	may be larger than %IEEE80211_BSS_ARP_ADDR_LIST_LEN (the arp_addr_list
- *	array size), it's up to the driver what to do in that case.
  * @qos: This is a QoS-enabled BSS.
- * @idle: This interface is idle. There's also a global idle flag in the
- *	hardware config which may be more appropriate depending on what
- *	your driver/device needs to do.
  * @ps: power-save mode (STA only). This flag is NOT affected by
  *	offchannel/dynamic_ps operations.
- * @ssid: The SSID of the current vif. Valid in AP and IBSS mode.
- * @ssid_len: Length of SSID given in @ssid.
  * @hidden_ssid: The SSID of the current vif is hidden. Only valid in AP-mode.
  * @txpower: TX power in dBm.  INT_MIN means not configured.
  * @txpower_type: TX power adjustment used to control per packet Transmit
@@ -628,7 +613,6 @@ struct ieee80211_fils_discovery {
  * @fils_discovery: FILS discovery configuration
  * @unsol_bcast_probe_resp_interval: Unsolicited broadcast probe response
  *	interval.
- * @s1g: BSS is S1G BSS (affects Association Request format).
  * @beacon_tx_rate: The configured beacon transmit rate that needs to be passed
  *	to driver when rate control is offloaded to firmware.
  * @power_type: power type of BSS for 6 GHz
@@ -661,10 +645,6 @@ struct ieee80211_bss_conf {
 	bool twt_responder;
 	bool twt_protected;
 	bool twt_broadcast;
-	/* association related data */
-	bool assoc, ibss_joined;
-	bool ibss_creator;
-	u16 aid;
 	/* erp related data */
 	bool use_cts_prot;
 	bool use_short_preamble;
@@ -686,13 +666,8 @@ struct ieee80211_bss_conf {
 	s32 cqm_rssi_high;
 	struct cfg80211_chan_def chandef;
 	struct ieee80211_mu_group_data mu_group;
-	__be32 arp_addr_list[IEEE80211_BSS_ARP_ADDR_LIST_LEN];
-	int arp_addr_cnt;
 	bool qos;
-	bool idle;
 	bool ps;
-	u8 ssid[IEEE80211_MAX_SSID_LEN];
-	size_t ssid_len;
 	bool hidden_ssid;
 	int txpower;
 	enum nl80211_tx_power_setting txpower_type;
@@ -717,7 +692,6 @@ struct ieee80211_bss_conf {
 	struct cfg80211_he_bss_color he_bss_color;
 	struct ieee80211_fils_discovery fils_discovery;
 	u32 unsol_bcast_probe_resp_interval;
-	bool s1g;
 	struct cfg80211_bitrate_mask beacon_tx_rate;
 	enum ieee80211_ap_reg_power power_type;
 	struct ieee80211_tx_pwr_env tx_pwr_env[IEEE80211_TPE_MAX_IE_COUNT];
@@ -1721,6 +1695,40 @@ enum ieee80211_offload_flags {
 	IEEE80211_OFFLOAD_DECAP_ENABLED		= BIT(2),
 };
 
+/**
+ * struct ieee80211_vif_cfg - interface configuration
+ * @assoc: association status
+ * @ibss_joined: indicates whether this station is part of an IBSS or not
+ * @ibss_creator: indicates if a new IBSS network is being created
+ * @aid: association ID number, valid only when @assoc is true
+ * @arp_addr_list: List of IPv4 addresses for hardware ARP filtering. The
+ *	may filter ARP queries targeted for other addresses than listed here.
+ *	The driver must allow ARP queries targeted for all address listed here
+ *	to pass through. An empty list implies no ARP queries need to pass.
+ * @arp_addr_cnt: Number of addresses currently on the list. Note that this
+ *	may be larger than %IEEE80211_BSS_ARP_ADDR_LIST_LEN (the arp_addr_list
+ *	array size), it's up to the driver what to do in that case.
+ * @ssid: The SSID of the current vif. Valid in AP and IBSS mode.
+ * @ssid_len: Length of SSID given in @ssid.
+ * @s1g: BSS is S1G BSS (affects Association Request format).
+ * @idle: This interface is idle. There's also a global idle flag in the
+ *	hardware config which may be more appropriate depending on what
+ *	your driver/device needs to do.
+ */
+struct ieee80211_vif_cfg {
+	/* association related data */
+	bool assoc, ibss_joined;
+	bool ibss_creator;
+	u16 aid;
+
+	__be32 arp_addr_list[IEEE80211_BSS_ARP_ADDR_LIST_LEN];
+	int arp_addr_cnt;
+	u8 ssid[IEEE80211_MAX_SSID_LEN];
+	size_t ssid_len;
+	bool s1g;
+	bool idle;
+};
+
 /**
  * struct ieee80211_vif - per-interface data
  *
@@ -1728,6 +1736,7 @@ enum ieee80211_offload_flags {
  * use during the life of a virtual interface.
  *
  * @type: type of this virtual interface
+ * @cfg: vif configuration, see &struct ieee80211_vif_cfg
  * @bss_conf: BSS configuration for this interface, either our own
  *	or the BSS we're associated to
  * @addr: address of this interface
@@ -1762,6 +1771,7 @@ enum ieee80211_offload_flags {
  */
 struct ieee80211_vif {
 	enum nl80211_iftype type;
+	struct ieee80211_vif_cfg cfg;
 	struct ieee80211_bss_conf bss_conf;
 	u8 addr[ETH_ALEN] __aligned(2);
 	bool p2p;
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 8a15c71e7c7d..fd57bad38681 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1216,12 +1216,12 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 	sdata->vif.bss_conf.twt_responder = params->twt_responder;
 	sdata->vif.bss_conf.he_obss_pd = params->he_obss_pd;
 	sdata->vif.bss_conf.he_bss_color = params->beacon.he_bss_color;
-	sdata->vif.bss_conf.s1g = params->chandef.chan->band ==
+	sdata->vif.cfg.s1g = params->chandef.chan->band ==
 				  NL80211_BAND_S1GHZ;
 
-	sdata->vif.bss_conf.ssid_len = params->ssid_len;
+	sdata->vif.cfg.ssid_len = params->ssid_len;
 	if (params->ssid_len)
-		memcpy(sdata->vif.bss_conf.ssid, params->ssid,
+		memcpy(sdata->vif.cfg.ssid, params->ssid,
 		       params->ssid_len);
 	sdata->vif.bss_conf.hidden_ssid =
 		(params->hidden_ssid != NL80211_HIDDEN_SSID_NOT_IN_USE);
@@ -1405,7 +1405,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
 
 	sdata->vif.bss_conf.enable_beacon = false;
 	sdata->beacon_rate_set = false;
-	sdata->vif.bss_conf.ssid_len = 0;
+	sdata->vif.cfg.ssid_len = 0;
 	clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state);
 	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED);
 
@@ -3488,7 +3488,7 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata,
 
 		break;
 	case NL80211_IFTYPE_ADHOC:
-		if (!sdata->vif.bss_conf.ibss_joined)
+		if (!sdata->vif.cfg.ibss_joined)
 			return -EINVAL;
 
 		if (params->chandef.width != sdata->u.ibss.chandef.width)
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index eea400902133..deb358eef9d0 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -783,7 +783,7 @@ static int ieee80211_assign_vif_chanctx(struct ieee80211_sub_if_data *sdata,
 out:
 	rcu_assign_pointer(sdata->vif.bss_conf.chanctx_conf, conf);
 
-	sdata->vif.bss_conf.idle = !conf;
+	sdata->vif.cfg.idle = !conf;
 
 	if (curr_ctx && ieee80211_chanctx_num_assigned(local, curr_ctx) > 0) {
 		ieee80211_recalc_chanctx_chantype(local, curr_ctx);
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index cf71484658c6..59b33de0f3b0 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -2,7 +2,7 @@
 /*
  * Copyright (c) 2006	Jiri Benc <jbenc@suse.cz>
  * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
- * Copyright (C) 2020-2021 Intel Corporation
+ * Copyright (C) 2020-2022 Intel Corporation
  */
 
 #include <linux/kernel.h>
@@ -233,7 +233,7 @@ IEEE80211_IF_FILE_R(hw_queues);
 
 /* STA attributes */
 IEEE80211_IF_FILE(bssid, u.mgd.bssid, MAC);
-IEEE80211_IF_FILE(aid, vif.bss_conf.aid, DEC);
+IEEE80211_IF_FILE(aid, vif.cfg.aid, DEC);
 IEEE80211_IF_FILE(beacon_timeout, u.mgd.beacon_timeout, JIFFIES_TO_MS);
 
 static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata,
@@ -366,7 +366,7 @@ IEEE80211_IF_FILE_W(tkip_mic_test);
 static ssize_t ieee80211_if_parse_beacon_loss(
 	struct ieee80211_sub_if_data *sdata, const char *buf, int buflen)
 {
-	if (!ieee80211_sdata_running(sdata) || !sdata->vif.bss_conf.assoc)
+	if (!ieee80211_sdata_running(sdata) || !sdata->vif.cfg.assoc)
 		return -ENOTCONN;
 
 	ieee80211_beacon_loss(&sdata->vif);
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 8ff547ff351e..afb5982a1d42 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -244,9 +244,9 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 		sta_info_flush(sdata);
 
 	/* if merging, indicate to driver that we leave the old IBSS */
-	if (sdata->vif.bss_conf.ibss_joined) {
-		sdata->vif.bss_conf.ibss_joined = false;
-		sdata->vif.bss_conf.ibss_creator = false;
+	if (sdata->vif.cfg.ibss_joined) {
+		sdata->vif.cfg.ibss_joined = false;
+		sdata->vif.cfg.ibss_creator = false;
 		sdata->vif.bss_conf.enable_beacon = false;
 		netif_carrier_off(sdata->dev);
 		ieee80211_bss_info_change_notify(sdata,
@@ -326,8 +326,8 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	sdata->vif.bss_conf.enable_beacon = true;
 	sdata->vif.bss_conf.beacon_int = beacon_int;
 	sdata->vif.bss_conf.basic_rates = basic_rates;
-	sdata->vif.bss_conf.ssid_len = ifibss->ssid_len;
-	memcpy(sdata->vif.bss_conf.ssid, ifibss->ssid, ifibss->ssid_len);
+	sdata->vif.cfg.ssid_len = ifibss->ssid_len;
+	memcpy(sdata->vif.cfg.ssid, ifibss->ssid, ifibss->ssid_len);
 	bss_change = BSS_CHANGED_BEACON_INT;
 	bss_change |= ieee80211_reset_erp_info(sdata);
 	bss_change |= BSS_CHANGED_BSSID;
@@ -359,15 +359,15 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 
 	ieee80211_set_wmm_default(sdata, true, false);
 
-	sdata->vif.bss_conf.ibss_joined = true;
-	sdata->vif.bss_conf.ibss_creator = creator;
+	sdata->vif.cfg.ibss_joined = true;
+	sdata->vif.cfg.ibss_creator = creator;
 
 	err = drv_join_ibss(local, sdata);
 	if (err) {
-		sdata->vif.bss_conf.ibss_joined = false;
-		sdata->vif.bss_conf.ibss_creator = false;
+		sdata->vif.cfg.ibss_joined = false;
+		sdata->vif.cfg.ibss_creator = false;
 		sdata->vif.bss_conf.enable_beacon = false;
-		sdata->vif.bss_conf.ssid_len = 0;
+		sdata->vif.cfg.ssid_len = 0;
 		RCU_INIT_POINTER(ifibss->presp, NULL);
 		kfree_rcu(presp, rcu_head);
 		mutex_lock(&local->mtx);
@@ -708,10 +708,10 @@ static void ieee80211_ibss_disconnect(struct ieee80211_sub_if_data *sdata)
 
 	netif_carrier_off(sdata->dev);
 
-	sdata->vif.bss_conf.ibss_joined = false;
-	sdata->vif.bss_conf.ibss_creator = false;
+	sdata->vif.cfg.ibss_joined = false;
+	sdata->vif.cfg.ibss_creator = false;
 	sdata->vif.bss_conf.enable_beacon = false;
-	sdata->vif.bss_conf.ssid_len = 0;
+	sdata->vif.cfg.ssid_len = 0;
 
 	/* remove beacon */
 	presp = rcu_dereference_protected(ifibss->presp,
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 9e0e71ca8068..cce0a9aad2cd 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1642,7 +1642,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
 	sdata->control_port_no_encrypt = false;
 	sdata->control_port_over_nl80211 = false;
 	sdata->control_port_no_preauth = false;
-	sdata->vif.bss_conf.idle = true;
+	sdata->vif.cfg.idle = true;
 	sdata->vif.bss_conf.txpower = INT_MIN; /* unset */
 
 	sdata->noack_map = 0;
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index ebde131efcaa..0c81ae492df4 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -349,7 +349,7 @@ static int ieee80211_ifa_changed(struct notifier_block *nb,
 	struct wireless_dev *wdev = ndev->ieee80211_ptr;
 	struct in_device *idev;
 	struct ieee80211_sub_if_data *sdata;
-	struct ieee80211_bss_conf *bss_conf;
+	struct ieee80211_vif_cfg *vif_cfg;
 	struct ieee80211_if_managed *ifmgd;
 	int c = 0;
 
@@ -361,7 +361,7 @@ static int ieee80211_ifa_changed(struct notifier_block *nb,
 		return NOTIFY_DONE;
 
 	sdata = IEEE80211_DEV_TO_SUB_IF(ndev);
-	bss_conf = &sdata->vif.bss_conf;
+	vif_cfg = &sdata->vif.cfg;
 
 	/* ARP filtering is only supported in managed mode */
 	if (sdata->vif.type != NL80211_IFTYPE_STATION)
@@ -374,16 +374,16 @@ static int ieee80211_ifa_changed(struct notifier_block *nb,
 	ifmgd = &sdata->u.mgd;
 	sdata_lock(sdata);
 
-	/* Copy the addresses to the bss_conf list */
+	/* Copy the addresses to the vif config list */
 	ifa = rtnl_dereference(idev->ifa_list);
 	while (ifa) {
 		if (c < IEEE80211_BSS_ARP_ADDR_LIST_LEN)
-			bss_conf->arp_addr_list[c] = ifa->ifa_address;
+			vif_cfg->arp_addr_list[c] = ifa->ifa_address;
 		ifa = rtnl_dereference(ifa->ifa_next);
 		c++;
 	}
 
-	bss_conf->arp_addr_cnt = c;
+	vif_cfg->arp_addr_cnt = c;
 
 	/* Configure driver only if associated (which also implies it is up) */
 	if (ifmgd->associated)
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index c526af66ff8d..4a792e88568d 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2258,6 +2258,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_bss *bss = (void *)cbss->priv;
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
+	struct ieee80211_vif_cfg *vif_cfg = &sdata->vif.cfg;
 
 	bss_info_changed |= BSS_CHANGED_ASSOC;
 	bss_info_changed |= ieee80211_handle_bss_capability(sdata,
@@ -2317,7 +2318,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
 		bss_conf->dtim_period = 0;
 	}
 
-	bss_conf->assoc = 1;
+	vif_cfg->assoc = 1;
 
 	/* Tell the driver to monitor connection quality (if supported) */
 	if (sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI &&
@@ -2325,7 +2326,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
 		bss_info_changed |= BSS_CHANGED_CQM;
 
 	/* Enable ARP filtering */
-	if (bss_conf->arp_addr_cnt)
+	if (vif_cfg->arp_addr_cnt)
 		bss_info_changed |= BSS_CHANGED_ARP_FILTER;
 
 	ieee80211_bss_info_change_notify(sdata, bss_info_changed);
@@ -2419,7 +2420,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 	/* clear bssid only after building the needed mgmt frames */
 	eth_zero_addr(ifmgd->bssid);
 
-	sdata->vif.bss_conf.ssid_len = 0;
+	sdata->vif.cfg.ssid_len = 0;
 
 	/* remove AP and TDLS peers */
 	sta_info_flush(sdata);
@@ -2429,7 +2430,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 
 	ieee80211_led_assoc(local, 0);
 	changed |= BSS_CHANGED_ASSOC;
-	sdata->vif.bss_conf.assoc = false;
+	sdata->vif.cfg.assoc = false;
 
 	ifmgd->p2p_noa_index = -1;
 	memset(&sdata->vif.bss_conf.p2p_noa_attr, 0,
@@ -2455,7 +2456,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 	cancel_work_sync(&local->dynamic_ps_enable_work);
 
 	/* Disable ARP filtering */
-	if (sdata->vif.bss_conf.arp_addr_cnt)
+	if (sdata->vif.cfg.arp_addr_cnt)
 		changed |= BSS_CHANGED_ARP_FILTER;
 
 	sdata->vif.bss_conf.qos = false;
@@ -2642,8 +2643,8 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata)
 		ieee80211_send_nullfunc(sdata->local, sdata, false);
 	} else {
 		ieee80211_mlme_send_probe_req(sdata, sdata->vif.addr, dst,
-					      sdata->vif.bss_conf.ssid,
-					      sdata->vif.bss_conf.ssid_len,
+					      sdata->vif.cfg.ssid,
+					      sdata->vif.cfg.ssid_len,
 					      ifmgd->assoc_bss->channel);
 	}
 
@@ -3425,7 +3426,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
 		goto out;
 	}
 
-	sdata->vif.bss_conf.aid = aid;
+	sdata->vif.cfg.aid = aid;
 	ifmgd->tdls_chan_switch_prohibited =
 		elems->ext_capab && elems->ext_capab_len >= 5 &&
 		(elems->ext_capab[4] & WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED);
@@ -4102,6 +4103,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 {
 	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
 	struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
+	struct ieee80211_vif_cfg *vif_cfg = &sdata->vif.cfg;
 	struct ieee80211_mgmt *mgmt = (void *) hdr;
 	size_t baselen;
 	struct ieee802_11_elems *elems;
@@ -4226,7 +4228,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 	ncrc = elems->crc;
 
 	if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) &&
-	    ieee80211_check_tim(elems->tim, elems->tim_len, bss_conf->aid)) {
+	    ieee80211_check_tim(elems->tim, elems->tim_len, vif_cfg->aid)) {
 		if (local->hw.conf.dynamic_ps_timeout > 0) {
 			if (local->hw.conf.flags & IEEE80211_CONF_PS) {
 				local->hw.conf.flags &= ~IEEE80211_CONF_PS;
@@ -5877,7 +5879,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_mgd_assoc_data *assoc_data;
 	const struct cfg80211_bss_ies *beacon_ies;
 	struct ieee80211_supported_band *sband;
-	struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
+	struct ieee80211_vif_cfg *vif_cfg = &sdata->vif.cfg;
 	const struct element *ssid_elem, *ht_elem, *vht_elem;
 	int i, err;
 	bool override = false;
@@ -5895,8 +5897,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
 	}
 	memcpy(assoc_data->ssid, ssid_elem->data, ssid_elem->datalen);
 	assoc_data->ssid_len = ssid_elem->datalen;
-	memcpy(bss_conf->ssid, assoc_data->ssid, assoc_data->ssid_len);
-	bss_conf->ssid_len = assoc_data->ssid_len;
+	memcpy(vif_cfg->ssid, assoc_data->ssid, assoc_data->ssid_len);
+	vif_cfg->ssid_len = assoc_data->ssid_len;
 	rcu_read_unlock();
 
 	if (ifmgd->associated) {
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index a1fbd562cac1..7da6b49598ab 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -785,7 +785,7 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
 
 	switch (sdata->vif.type) {
 	case NL80211_IFTYPE_ADHOC:
-		if (!sdata->vif.bss_conf.ibss_joined)
+		if (!sdata->vif.cfg.ibss_joined)
 			need_offchan = true;
 #ifdef CONFIG_MAC80211_MESH
 		fallthrough;
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index b698756887eb..f80284eee055 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -177,7 +177,7 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
 	rcu_read_lock();
 	scan_sdata = rcu_dereference(local->scan_sdata);
 	if (scan_sdata && scan_sdata->vif.type == NL80211_IFTYPE_STATION &&
-	    scan_sdata->vif.bss_conf.assoc &&
+	    scan_sdata->vif.cfg.assoc &&
 	    ieee80211_have_rx_timestamp(rx_status)) {
 		bss_meta.parent_tsf =
 			ieee80211_calculate_rx_timestamp(local, rx_status,
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index fa04021d4c0f..7eb6d8c4f25c 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -230,7 +230,7 @@ ieee80211_tdls_add_aid(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb)
 
 	*pos++ = WLAN_EID_AID;
 	*pos++ = 2; /* len */
-	put_unaligned_le16(sdata->vif.bss_conf.aid, pos);
+	put_unaligned_le16(sdata->vif.cfg.aid, pos);
 }
 
 /* translate numbering in the WMM parameter IE to the mac80211 notation */
@@ -1444,7 +1444,7 @@ void ieee80211_tdls_oper_request(struct ieee80211_vif *vif, const u8 *peer,
 {
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
 
-	if (vif->type != NL80211_IFTYPE_STATION || !vif->bss_conf.assoc) {
+	if (vif->type != NL80211_IFTYPE_STATION || !vif->cfg.assoc) {
 		sdata_err(sdata, "Discarding TDLS oper %d - not STA or disconnected\n",
 			  oper);
 		return;
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 743adfbb9b15..74ef7e71e1ef 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -1,9 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
-* Portions of this file
-* Copyright(c) 2016-2017 Intel Deutschland GmbH
-* Copyright (C) 2018 - 2021 Intel Corporation
-*/
+ * Portions of this file
+ * Copyright(c) 2016-2017 Intel Deutschland GmbH
+ * Copyright (C) 2018 - 2022 Intel Corporation
+ */
 
 #if !defined(__MAC80211_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ)
 #define __MAC80211_DRIVER_TRACE
@@ -425,14 +425,14 @@ TRACE_EVENT(drv_bss_info_changed,
 		__field(u32, channel_cfreq1)
 		__field(u32, channel_cfreq1_offset)
 		__dynamic_array(u32, arp_addr_list,
-				info->arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ?
+				sdata->vif.cfg.arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ?
 					IEEE80211_BSS_ARP_ADDR_LIST_LEN :
-					info->arp_addr_cnt)
+					sdata->vif.cfg.arp_addr_cnt)
 		__field(int, arp_addr_cnt)
 		__field(bool, qos)
 		__field(bool, idle)
 		__field(bool, ps)
-		__dynamic_array(u8, ssid, info->ssid_len)
+		__dynamic_array(u8, ssid, sdata->vif.cfg.ssid_len)
 		__field(bool, hidden_ssid)
 		__field(int, txpower)
 		__field(u8, p2p_oppps_ctwindow)
@@ -442,10 +442,10 @@ TRACE_EVENT(drv_bss_info_changed,
 		LOCAL_ASSIGN;
 		VIF_ASSIGN;
 		__entry->changed = changed;
-		__entry->aid = info->aid;
-		__entry->assoc = info->assoc;
-		__entry->ibss_joined = info->ibss_joined;
-		__entry->ibss_creator = info->ibss_creator;
+		__entry->aid = sdata->vif.cfg.aid;
+		__entry->assoc = sdata->vif.cfg.assoc;
+		__entry->ibss_joined = sdata->vif.cfg.ibss_joined;
+		__entry->ibss_creator = sdata->vif.cfg.ibss_creator;
 		__entry->shortpre = info->use_short_preamble;
 		__entry->cts = info->use_cts_prot;
 		__entry->shortslot = info->use_short_slot;
@@ -465,15 +465,18 @@ TRACE_EVENT(drv_bss_info_changed,
 		__entry->channel_width = info->chandef.width;
 		__entry->channel_cfreq1 = info->chandef.center_freq1;
 		__entry->channel_cfreq1_offset = info->chandef.freq1_offset;
-		__entry->arp_addr_cnt = info->arp_addr_cnt;
-		memcpy(__get_dynamic_array(arp_addr_list), info->arp_addr_list,
-		       sizeof(u32) * (info->arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ?
+		__entry->arp_addr_cnt = sdata->vif.cfg.arp_addr_cnt;
+		memcpy(__get_dynamic_array(arp_addr_list),
+		       sdata->vif.cfg.arp_addr_list,
+		       sizeof(u32) * (sdata->vif.cfg.arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ?
 					IEEE80211_BSS_ARP_ADDR_LIST_LEN :
-					info->arp_addr_cnt));
+					sdata->vif.cfg.arp_addr_cnt));
 		__entry->qos = info->qos;
-		__entry->idle = info->idle;
+		__entry->idle = sdata->vif.cfg.idle;
 		__entry->ps = info->ps;
-		memcpy(__get_dynamic_array(ssid), info->ssid, info->ssid_len);
+		memcpy(__get_dynamic_array(ssid),
+		       sdata->vif.cfg.ssid,
+		       sdata->vif.cfg.ssid_len);
 		__entry->hidden_ssid = info->hidden_ssid;
 		__entry->txpower = info->txpower;
 		__entry->p2p_oppps_ctwindow = info->p2p_noa_attr.oppps_ctwindow;
@@ -1719,7 +1722,7 @@ TRACE_EVENT(drv_start_ap,
 		VIF_ENTRY
 		__field(u8, dtimper)
 		__field(u16, bcnint)
-		__dynamic_array(u8, ssid, info->ssid_len)
+		__dynamic_array(u8, ssid, sdata->vif.cfg.ssid_len)
 		__field(bool, hidden_ssid)
 	),
 
@@ -1728,7 +1731,9 @@ TRACE_EVENT(drv_start_ap,
 		VIF_ASSIGN;
 		__entry->dtimper = info->dtim_period;
 		__entry->bcnint = info->beacon_int;
-		memcpy(__get_dynamic_array(ssid), info->ssid, info->ssid_len);
+		memcpy(__get_dynamic_array(ssid),
+		       sdata->vif.cfg.ssid,
+		       sdata->vif.cfg.ssid_len);
 		__entry->hidden_ssid = info->hidden_ssid;
 	),
 
@@ -1786,7 +1791,7 @@ TRACE_EVENT(drv_join_ibss,
 		VIF_ENTRY
 		__field(u8, dtimper)
 		__field(u16, bcnint)
-		__dynamic_array(u8, ssid, info->ssid_len)
+		__dynamic_array(u8, ssid, sdata->vif.cfg.ssid_len)
 	),
 
 	TP_fast_assign(
@@ -1794,7 +1799,9 @@ TRACE_EVENT(drv_join_ibss,
 		VIF_ASSIGN;
 		__entry->dtimper = info->dtim_period;
 		__entry->bcnint = info->beacon_int;
-		memcpy(__get_dynamic_array(ssid), info->ssid, info->ssid_len);
+		memcpy(__get_dynamic_array(ssid),
+		       sdata->vif.cfg.ssid,
+		       sdata->vif.cfg.ssid_len);
 	),
 
 	TP_printk(
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index e5edc6fd21c0..a4270e9ec10e 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -5381,7 +5381,7 @@ struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw,
 	pspoll = skb_put_zero(skb, sizeof(*pspoll));
 	pspoll->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL |
 					    IEEE80211_STYPE_PSPOLL);
-	pspoll->aid = cpu_to_le16(sdata->vif.bss_conf.aid);
+	pspoll->aid = cpu_to_le16(sdata->vif.cfg.aid);
 
 	/* aid in PS-Poll has its two MSBs each set to 1 */
 	pspoll->aid |= cpu_to_le16(1 << 15 | 1 << 14);
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 48d8f0aad69f..dccc757baab2 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -2496,7 +2496,7 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 		case NL80211_IFTYPE_MONITOR:
 			break;
 		case NL80211_IFTYPE_ADHOC:
-			if (sdata->vif.bss_conf.ibss_joined)
+			if (sdata->vif.cfg.ibss_joined)
 				WARN_ON(drv_join_ibss(local, sdata));
 			fallthrough;
 		default:
-- 
cgit 


From 7b7090b4c6a906cc7c3e2a460335f705b93f4506 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 24 May 2022 10:55:56 +0200
Subject: wifi: mac80211: split bss_info_changed method

Split the bss_info_changed method to vif_cfg_changed and
link_info_changed, with the latter getting a link ID.
Also change the 'changed' parameter to u64 already, we
know we need that.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/admtek/adm8211.c              |   2 +-
 drivers/net/wireless/ath/ar5523/ar5523.c           |   2 +-
 drivers/net/wireless/ath/ath10k/mac.c              |   2 +-
 drivers/net/wireless/ath/ath11k/mac.c              |   2 +-
 drivers/net/wireless/ath/ath5k/mac80211-ops.c      |   2 +-
 drivers/net/wireless/ath/ath9k/htc_drv_main.c      |   2 +-
 drivers/net/wireless/ath/ath9k/main.c              |   2 +-
 drivers/net/wireless/ath/carl9170/main.c           |   2 +-
 drivers/net/wireless/ath/wcn36xx/main.c            |   4 +-
 drivers/net/wireless/atmel/at76c50x-usb.c          |   2 +-
 drivers/net/wireless/broadcom/b43/main.c           |   4 +-
 drivers/net/wireless/broadcom/b43legacy/main.c     |   2 +-
 .../broadcom/brcm80211/brcmsmac/mac80211_if.c      |   2 +-
 drivers/net/wireless/intel/iwlegacy/common.c       |   6 +-
 drivers/net/wireless/intel/iwlegacy/common.h       |   2 +-
 drivers/net/wireless/intel/iwlwifi/dvm/agn.h       |   2 +-
 drivers/net/wireless/intel/iwlwifi/dvm/rxon.c      |   2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c  |   6 +-
 drivers/net/wireless/intersil/p54/main.c           |   2 +-
 drivers/net/wireless/mac80211_hwsim.c              |   4 +-
 drivers/net/wireless/marvell/libertas_tf/main.c    |   2 +-
 drivers/net/wireless/marvell/mwl8k.c               |   2 +-
 drivers/net/wireless/mediatek/mt76/mt7603/main.c   |   2 +-
 drivers/net/wireless/mediatek/mt76/mt7615/main.c   |   2 +-
 drivers/net/wireless/mediatek/mt76/mt76x02.h       |   2 +-
 drivers/net/wireless/mediatek/mt76/mt76x02_util.c  |   2 +-
 drivers/net/wireless/mediatek/mt76/mt7915/main.c   |   2 +-
 drivers/net/wireless/mediatek/mt76/mt7921/main.c   |   2 +-
 drivers/net/wireless/mediatek/mt7601u/main.c       |   2 +-
 drivers/net/wireless/purelifi/plfxlc/mac.c         |   4 +-
 drivers/net/wireless/ralink/rt2x00/rt2x00.h        |   2 +-
 drivers/net/wireless/ralink/rt2x00/rt2x00mac.c     |   2 +-
 drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c |   2 +-
 drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c |   2 +-
 .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c  |   2 +-
 drivers/net/wireless/realtek/rtlwifi/core.c        |   2 +-
 drivers/net/wireless/realtek/rtw88/mac80211.c      |   2 +-
 drivers/net/wireless/realtek/rtw89/mac80211.c      |   2 +-
 drivers/net/wireless/rsi/rsi_91x_mac80211.c        |   2 +-
 drivers/net/wireless/silabs/wfx/sta.c              |   2 +-
 drivers/net/wireless/silabs/wfx/sta.h              |   2 +-
 drivers/net/wireless/st/cw1200/sta.c               |   4 +-
 drivers/net/wireless/st/cw1200/sta.h               |   2 +-
 drivers/net/wireless/ti/wl1251/main.c              |   2 +-
 drivers/net/wireless/ti/wlcore/main.c              |   2 +-
 drivers/net/wireless/zydas/zd1211rw/zd_mac.c       |   4 +-
 drivers/staging/vt6655/device_main.c               |   2 +-
 drivers/staging/vt6656/main_usb.c                  |   2 +-
 include/net/mac80211.h                             |  28 ++++-
 net/mac80211/cfg.c                                 |  31 ++---
 net/mac80211/chan.c                                |   9 +-
 net/mac80211/driver-ops.h                          |  35 ++++--
 net/mac80211/ibss.c                                |   2 +-
 net/mac80211/ieee80211_i.h                         |   6 +-
 net/mac80211/iface.c                               |   5 +-
 net/mac80211/main.c                                |  84 ++++++++++++-
 net/mac80211/mesh.c                                |   6 +-
 net/mac80211/mlme.c                                |  16 +--
 net/mac80211/offchannel.c                          |   8 +-
 net/mac80211/sta_info.c                            |   2 +-
 net/mac80211/tdls.c                                |   2 +-
 net/mac80211/trace.h                               | 138 +++++++++++++--------
 net/mac80211/util.c                                |   4 +-
 net/mac80211/vht.c                                 |   2 +-
 64 files changed, 324 insertions(+), 170 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/admtek/adm8211.c b/drivers/net/wireless/admtek/adm8211.c
index 2db9c948c0fc..6bee16b207d1 100644
--- a/drivers/net/wireless/admtek/adm8211.c
+++ b/drivers/net/wireless/admtek/adm8211.c
@@ -1311,7 +1311,7 @@ static int adm8211_config(struct ieee80211_hw *dev, u32 changed)
 static void adm8211_bss_info_changed(struct ieee80211_hw *dev,
 				     struct ieee80211_vif *vif,
 				     struct ieee80211_bss_conf *conf,
-				     u32 changes)
+				     u64 changes)
 {
 	struct adm8211_priv *priv = dev->priv;
 
diff --git a/drivers/net/wireless/ath/ar5523/ar5523.c b/drivers/net/wireless/ath/ar5523/ar5523.c
index 4cd06a0942d4..6f937d2cc126 100644
--- a/drivers/net/wireless/ath/ar5523/ar5523.c
+++ b/drivers/net/wireless/ath/ar5523/ar5523.c
@@ -1273,7 +1273,7 @@ static int ar5523_write_associd(struct ar5523 *ar, struct ieee80211_vif *vif)
 static void ar5523_bss_info_changed(struct ieee80211_hw *hw,
 				    struct ieee80211_vif *vif,
 				    struct ieee80211_bss_conf *bss,
-				    u32 changed)
+				    u64 changed)
 {
 	struct ar5523 *ar = hw->priv;
 	int error;
diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index 4e7e2da3c859..c1f8123d81fe 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -6068,7 +6068,7 @@ static void ath10k_recalculate_mgmt_rate(struct ath10k *ar,
 static void ath10k_bss_info_changed(struct ieee80211_hw *hw,
 				    struct ieee80211_vif *vif,
 				    struct ieee80211_bss_conf *info,
-				    u32 changed)
+				    u64 changed)
 {
 	struct ath10k *ar = hw->priv;
 	struct ath10k_vif *arvif = (void *)vif->drv_priv;
diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c
index 169161549361..3e07b1454fe4 100644
--- a/drivers/net/wireless/ath/ath11k/mac.c
+++ b/drivers/net/wireless/ath/ath11k/mac.c
@@ -3091,7 +3091,7 @@ static int ath11k_mac_config_obss_pd(struct ath11k *ar,
 static void ath11k_mac_op_bss_info_changed(struct ieee80211_hw *hw,
 					   struct ieee80211_vif *vif,
 					   struct ieee80211_bss_conf *info,
-					   u32 changed)
+					   u64 changed)
 {
 	struct ath11k *ar = hw->priv;
 	struct ath11k_vif *arvif = ath11k_vif_to_arvif(vif);
diff --git a/drivers/net/wireless/ath/ath5k/mac80211-ops.c b/drivers/net/wireless/ath/ath5k/mac80211-ops.c
index 0df0fa1da181..8da232e81518 100644
--- a/drivers/net/wireless/ath/ath5k/mac80211-ops.c
+++ b/drivers/net/wireless/ath/ath5k/mac80211-ops.c
@@ -250,7 +250,7 @@ unlock:
 
 static void
 ath5k_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-		       struct ieee80211_bss_conf *bss_conf, u32 changes)
+		       struct ieee80211_bss_conf *bss_conf, u64 changes)
 {
 	struct ath5k_vif *avf = (void *)vif->drv_priv;
 	struct ath5k_hw *ah = hw->priv;
diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_main.c b/drivers/net/wireless/ath/ath9k/htc_drv_main.c
index 3a5ec4da6a38..14d713e03872 100644
--- a/drivers/net/wireless/ath/ath9k/htc_drv_main.c
+++ b/drivers/net/wireless/ath/ath9k/htc_drv_main.c
@@ -1509,7 +1509,7 @@ static void ath9k_htc_choose_set_bssid(struct ath9k_htc_priv *priv)
 static void ath9k_htc_bss_info_changed(struct ieee80211_hw *hw,
 				       struct ieee80211_vif *vif,
 				       struct ieee80211_bss_conf *bss_conf,
-				       u32 changed)
+				       u64 changed)
 {
 	struct ath9k_htc_priv *priv = hw->priv;
 	struct ath_hw *ah = priv->ah;
diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index 56c2681e5192..729f8ee9644d 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -1863,7 +1863,7 @@ static int ath9k_set_key(struct ieee80211_hw *hw,
 static void ath9k_bss_info_changed(struct ieee80211_hw *hw,
 				   struct ieee80211_vif *vif,
 				   struct ieee80211_bss_conf *bss_conf,
-				   u32 changed)
+				   u64 changed)
 {
 #define CHECK_ANI				\
 	(BSS_CHANGED_ASSOC |			\
diff --git a/drivers/net/wireless/ath/carl9170/main.c b/drivers/net/wireless/ath/carl9170/main.c
index e28a5f3085c0..3d881028bd00 100644
--- a/drivers/net/wireless/ath/carl9170/main.c
+++ b/drivers/net/wireless/ath/carl9170/main.c
@@ -1032,7 +1032,7 @@ static void carl9170_op_configure_filter(struct ieee80211_hw *hw,
 static void carl9170_op_bss_info_changed(struct ieee80211_hw *hw,
 					 struct ieee80211_vif *vif,
 					 struct ieee80211_bss_conf *bss_conf,
-					 u32 changed)
+					 u64 changed)
 {
 	struct ar9170 *ar = hw->priv;
 	struct ath_common *common = &ar->common;
diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c
index 72ba2e2fc93a..04fb26cb8322 100644
--- a/drivers/net/wireless/ath/wcn36xx/main.c
+++ b/drivers/net/wireless/ath/wcn36xx/main.c
@@ -872,7 +872,7 @@ void wcn36xx_set_default_rates_v1(struct wcn36xx_hal_supported_rates_v1 *rates)
 static void wcn36xx_bss_info_changed(struct ieee80211_hw *hw,
 				     struct ieee80211_vif *vif,
 				     struct ieee80211_bss_conf *bss_conf,
-				     u32 changed)
+				     u64 changed)
 {
 	struct wcn36xx *wcn = hw->priv;
 	struct sk_buff *skb = NULL;
@@ -880,7 +880,7 @@ static void wcn36xx_bss_info_changed(struct ieee80211_hw *hw,
 	enum wcn36xx_hal_link_state link_state;
 	struct wcn36xx_vif *vif_priv = wcn36xx_vif_to_priv(vif);
 
-	wcn36xx_dbg(WCN36XX_DBG_MAC, "mac bss info changed vif %p changed 0x%08x\n",
+	wcn36xx_dbg(WCN36XX_DBG_MAC, "mac bss info changed vif %p changed 0x%llx\n",
 		    vif, changed);
 
 	mutex_lock(&wcn->conf_mutex);
diff --git a/drivers/net/wireless/atmel/at76c50x-usb.c b/drivers/net/wireless/atmel/at76c50x-usb.c
index 7582761c61e2..24e609c1f523 100644
--- a/drivers/net/wireless/atmel/at76c50x-usb.c
+++ b/drivers/net/wireless/atmel/at76c50x-usb.c
@@ -2033,7 +2033,7 @@ static int at76_config(struct ieee80211_hw *hw, u32 changed)
 static void at76_bss_info_changed(struct ieee80211_hw *hw,
 				  struct ieee80211_vif *vif,
 				  struct ieee80211_bss_conf *conf,
-				  u32 changed)
+				  u64 changed)
 {
 	struct at76_priv *priv = hw->priv;
 
diff --git a/drivers/net/wireless/broadcom/b43/main.c b/drivers/net/wireless/broadcom/b43/main.c
index 17bcec5f3ff7..e3121a9d0579 100644
--- a/drivers/net/wireless/broadcom/b43/main.c
+++ b/drivers/net/wireless/broadcom/b43/main.c
@@ -366,7 +366,7 @@ static int b43_wireless_core_start(struct b43_wldev *dev);
 static void b43_op_bss_info_changed(struct ieee80211_hw *hw,
 				    struct ieee80211_vif *vif,
 				    struct ieee80211_bss_conf *conf,
-				    u32 changed);
+				    u64 changed);
 
 static int b43_ratelimit(struct b43_wl *wl)
 {
@@ -4097,7 +4097,7 @@ static void b43_update_basic_rates(struct b43_wldev *dev, u32 brates)
 static void b43_op_bss_info_changed(struct ieee80211_hw *hw,
 				    struct ieee80211_vif *vif,
 				    struct ieee80211_bss_conf *conf,
-				    u32 changed)
+				    u64 changed)
 {
 	struct b43_wl *wl = hw_to_b43_wl(hw);
 	struct b43_wldev *dev;
diff --git a/drivers/net/wireless/broadcom/b43legacy/main.c b/drivers/net/wireless/broadcom/b43legacy/main.c
index eec3af9c3745..96d5a034c09b 100644
--- a/drivers/net/wireless/broadcom/b43legacy/main.c
+++ b/drivers/net/wireless/broadcom/b43legacy/main.c
@@ -2806,7 +2806,7 @@ static void b43legacy_update_basic_rates(struct b43legacy_wldev *dev, u32 brates
 static void b43legacy_op_bss_info_changed(struct ieee80211_hw *hw,
 				    struct ieee80211_vif *vif,
 				    struct ieee80211_bss_conf *conf,
-				    u32 changed)
+				    u64 changed)
 {
 	struct b43legacy_wl *wl = hw_to_b43legacy_wl(hw);
 	struct b43legacy_wldev *dev;
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
index e6cd638a85d6..fd3c131c622c 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
@@ -582,7 +582,7 @@ static int brcms_ops_config(struct ieee80211_hw *hw, u32 changed)
 static void
 brcms_ops_bss_info_changed(struct ieee80211_hw *hw,
 			struct ieee80211_vif *vif,
-			struct ieee80211_bss_conf *info, u32 changed)
+			struct ieee80211_bss_conf *info, u64 changed)
 {
 	struct brcms_info *wl = hw->priv;
 	struct bcma_device *core = wl->wlc->hw->d11core;
diff --git a/drivers/net/wireless/intel/iwlegacy/common.c b/drivers/net/wireless/intel/iwlegacy/common.c
index b01945415be6..affad34f70f0 100644
--- a/drivers/net/wireless/intel/iwlegacy/common.c
+++ b/drivers/net/wireless/intel/iwlegacy/common.c
@@ -5311,13 +5311,13 @@ il_beacon_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
 
 void
 il_mac_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-			struct ieee80211_bss_conf *bss_conf, u32 changes)
+			struct ieee80211_bss_conf *bss_conf, u64 changes)
 {
 	struct il_priv *il = hw->priv;
 	int ret;
 
 	mutex_lock(&il->mutex);
-	D_MAC80211("enter: changes 0x%x\n", changes);
+	D_MAC80211("enter: changes 0x%llx\n", changes);
 
 	if (!il_is_alive(il)) {
 		D_MAC80211("leave - not alive\n");
@@ -5438,7 +5438,7 @@ il_mac_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	}
 
 	if (changes && il_is_associated(il) && vif->cfg.aid) {
-		D_MAC80211("Changes (%#x) while associated\n", changes);
+		D_MAC80211("Changes (%#llx) while associated\n", changes);
 		ret = il_send_rxon_assoc(il);
 		if (!ret) {
 			/* Sync active_rxon with latest change. */
diff --git a/drivers/net/wireless/intel/iwlegacy/common.h b/drivers/net/wireless/intel/iwlegacy/common.h
index 40877ef1fbf2..d1383b4f0f05 100644
--- a/drivers/net/wireless/intel/iwlegacy/common.h
+++ b/drivers/net/wireless/intel/iwlegacy/common.h
@@ -1947,7 +1947,7 @@ il_get_hw_mode(struct il_priv *il, enum nl80211_band band)
 int il_mac_config(struct ieee80211_hw *hw, u32 changed);
 void il_mac_reset_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
 void il_mac_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-			     struct ieee80211_bss_conf *bss_conf, u32 changes);
+			     struct ieee80211_bss_conf *bss_conf, u64 changes);
 void il_tx_cmd_protection(struct il_priv *il, struct ieee80211_tx_info *info,
 			  __le16 fc, __le32 *tx_flags);
 
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/agn.h b/drivers/net/wireless/intel/iwlwifi/dvm/agn.h
index abb8696ba294..411a6f6638b4 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/agn.h
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/agn.h
@@ -92,7 +92,7 @@ int iwlagn_mac_config(struct ieee80211_hw *hw, u32 changed);
 void iwlagn_bss_info_changed(struct ieee80211_hw *hw,
 			     struct ieee80211_vif *vif,
 			     struct ieee80211_bss_conf *bss_conf,
-			     u32 changes);
+			     u64 changes);
 void iwlagn_config_ht40(struct ieee80211_conf *conf,
 			struct iwl_rxon_context *ctx);
 void iwl_set_rxon_ht(struct iwl_priv *priv, struct iwl_ht_config *ht_conf);
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c b/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c
index 17a14970edec..45e382fe45a2 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c
@@ -1383,7 +1383,7 @@ static void iwlagn_chain_noise_reset(struct iwl_priv *priv)
 void iwlagn_bss_info_changed(struct ieee80211_hw *hw,
 			     struct ieee80211_vif *vif,
 			     struct ieee80211_bss_conf *bss_conf,
-			     u32 changes)
+			     u64 changes)
 {
 	struct iwl_priv *priv = IWL_MAC80211_GET_DVM(hw);
 	struct iwl_rxon_context *ctx = iwl_rxon_ctx_from_vif(vif);
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
index b5257b4fd000..586208506275 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
@@ -2181,7 +2181,7 @@ static void iwl_mvm_protect_assoc(struct iwl_mvm *mvm,
 static void iwl_mvm_bss_info_changed_station(struct iwl_mvm *mvm,
 					     struct ieee80211_vif *vif,
 					     struct ieee80211_bss_conf *bss_conf,
-					     u32 changes)
+					     u64 changes)
 {
 	struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif);
 	int ret;
@@ -2590,7 +2590,7 @@ static void
 iwl_mvm_bss_info_changed_ap_ibss(struct iwl_mvm *mvm,
 				 struct ieee80211_vif *vif,
 				 struct ieee80211_bss_conf *bss_conf,
-				 u32 changes)
+				 u64 changes)
 {
 	struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif);
 
@@ -2621,7 +2621,7 @@ iwl_mvm_bss_info_changed_ap_ibss(struct iwl_mvm *mvm,
 static void iwl_mvm_bss_info_changed(struct ieee80211_hw *hw,
 				     struct ieee80211_vif *vif,
 				     struct ieee80211_bss_conf *bss_conf,
-				     u32 changes)
+				     u64 changes)
 {
 	struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw);
 
diff --git a/drivers/net/wireless/intersil/p54/main.c b/drivers/net/wireless/intersil/p54/main.c
index a9dfe6a3da0d..cc6769a4fec7 100644
--- a/drivers/net/wireless/intersil/p54/main.c
+++ b/drivers/net/wireless/intersil/p54/main.c
@@ -449,7 +449,7 @@ static int p54_get_stats(struct ieee80211_hw *dev,
 static void p54_bss_info_changed(struct ieee80211_hw *dev,
 				 struct ieee80211_vif *vif,
 				 struct ieee80211_bss_conf *info,
-				 u32 changed)
+				 u64 changed)
 {
 	struct p54_common *priv = dev->priv;
 
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 5aaf34f6f43b..de10a40d5306 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -2100,14 +2100,14 @@ static void mac80211_hwsim_bcn_en_iter(void *data, u8 *mac,
 static void mac80211_hwsim_bss_info_changed(struct ieee80211_hw *hw,
 					    struct ieee80211_vif *vif,
 					    struct ieee80211_bss_conf *info,
-					    u32 changed)
+					    u64 changed)
 {
 	struct hwsim_vif_priv *vp = (void *)vif->drv_priv;
 	struct mac80211_hwsim_data *data = hw->priv;
 
 	hwsim_check_magic(vif);
 
-	wiphy_dbg(hw->wiphy, "%s(changed=0x%x vif->addr=%pM)\n",
+	wiphy_dbg(hw->wiphy, "%s(changed=0x%llx vif->addr=%pM)\n",
 		  __func__, changed, vif->addr);
 
 	if (changed & BSS_CHANGED_BSSID) {
diff --git a/drivers/net/wireless/marvell/libertas_tf/main.c b/drivers/net/wireless/marvell/libertas_tf/main.c
index 02a1e1f547d8..21c3e0bdc444 100644
--- a/drivers/net/wireless/marvell/libertas_tf/main.c
+++ b/drivers/net/wireless/marvell/libertas_tf/main.c
@@ -417,7 +417,7 @@ static void lbtf_op_configure_filter(struct ieee80211_hw *hw,
 static void lbtf_op_bss_info_changed(struct ieee80211_hw *hw,
 			struct ieee80211_vif *vif,
 			struct ieee80211_bss_conf *bss_conf,
-			u32 changes)
+			u64 changes)
 {
 	struct lbtf_private *priv = hw->priv;
 	struct sk_buff *beacon;
diff --git a/drivers/net/wireless/marvell/mwl8k.c b/drivers/net/wireless/marvell/mwl8k.c
index 5f1bcfb5e3f6..7eef3a74d124 100644
--- a/drivers/net/wireless/marvell/mwl8k.c
+++ b/drivers/net/wireless/marvell/mwl8k.c
@@ -5163,7 +5163,7 @@ out:
 
 static void
 mwl8k_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-		       struct ieee80211_bss_conf *info, u32 changed)
+		       struct ieee80211_bss_conf *info, u64 changed)
 {
 	if (vif->type == NL80211_IFTYPE_STATION)
 		mwl8k_bss_info_changed_sta(hw, vif, info, changed);
diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/main.c b/drivers/net/wireless/mediatek/mt76/mt7603/main.c
index 1f14ecda1f55..088c0a4cf774 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7603/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7603/main.c
@@ -297,7 +297,7 @@ mt7603_configure_filter(struct ieee80211_hw *hw, unsigned int changed_flags,
 
 static void
 mt7603_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-			struct ieee80211_bss_conf *info, u32 changed)
+			struct ieee80211_bss_conf *info, u64 changed)
 {
 	struct mt7603_dev *dev = hw->priv;
 	struct mt7603_vif *mvif = (struct mt7603_vif *)vif->drv_priv;
diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/main.c b/drivers/net/wireless/mediatek/mt76/mt7615/main.c
index d992fdea0f32..277c22a4d049 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7615/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7615/main.c
@@ -576,7 +576,7 @@ static void mt7615_configure_filter(struct ieee80211_hw *hw,
 static void mt7615_bss_info_changed(struct ieee80211_hw *hw,
 				    struct ieee80211_vif *vif,
 				    struct ieee80211_bss_conf *info,
-				    u32 changed)
+				    u64 changed)
 {
 	struct mt7615_dev *dev = mt7615_hw_dev(hw);
 	struct mt7615_phy *phy = mt7615_hw_phy(hw);
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02.h b/drivers/net/wireless/mediatek/mt76/mt76x02.h
index f76fd22ee035..74ad418f5a70 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x02.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76x02.h
@@ -187,7 +187,7 @@ void mt76x02_sw_scan_complete(struct ieee80211_hw *hw,
 void mt76x02_sta_ps(struct mt76_dev *dev, struct ieee80211_sta *sta, bool ps);
 void mt76x02_bss_info_changed(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif,
-			      struct ieee80211_bss_conf *info, u32 changed);
+			      struct ieee80211_bss_conf *info, u64 changed);
 void mt76x02_reconfig_complete(struct ieee80211_hw *hw,
 			       enum ieee80211_reconfig_type reconfig_type);
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_util.c b/drivers/net/wireless/mediatek/mt76/mt76x02_util.c
index 5bd0a0bae688..a0e2d042751b 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x02_util.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x02_util.c
@@ -636,7 +636,7 @@ EXPORT_SYMBOL_GPL(mt76x02_sta_ps);
 void mt76x02_bss_info_changed(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif,
 			      struct ieee80211_bss_conf *info,
-			      u32 changed)
+			      u64 changed)
 {
 	struct mt76x02_vif *mvif = (struct mt76x02_vif *)vif->drv_priv;
 	struct mt76x02_dev *dev = hw->priv;
diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/main.c b/drivers/net/wireless/mediatek/mt76/mt7915/main.c
index 05327b0b6fc3..fbeac9aa2af6 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7915/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7915/main.c
@@ -573,7 +573,7 @@ mt7915_update_bss_color(struct ieee80211_hw *hw,
 static void mt7915_bss_info_changed(struct ieee80211_hw *hw,
 				    struct ieee80211_vif *vif,
 				    struct ieee80211_bss_conf *info,
-				    u32 changed)
+				    u64 changed)
 {
 	struct mt7915_phy *phy = mt7915_hw_phy(hw);
 	struct mt7915_dev *dev = mt7915_hw_dev(hw);
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
index 8532033794bd..63583605d1cd 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
@@ -637,7 +637,7 @@ static void mt7921_configure_filter(struct ieee80211_hw *hw,
 static void mt7921_bss_info_changed(struct ieee80211_hw *hw,
 				    struct ieee80211_vif *vif,
 				    struct ieee80211_bss_conf *info,
-				    u32 changed)
+				    u64 changed)
 {
 	struct mt7921_phy *phy = mt7921_hw_phy(hw);
 	struct mt7921_dev *dev = mt7921_hw_dev(hw);
diff --git a/drivers/net/wireless/mediatek/mt7601u/main.c b/drivers/net/wireless/mediatek/mt7601u/main.c
index 671d8897ae76..6c9c7a61c5c9 100644
--- a/drivers/net/wireless/mediatek/mt7601u/main.c
+++ b/drivers/net/wireless/mediatek/mt7601u/main.c
@@ -132,7 +132,7 @@ mt76_configure_filter(struct ieee80211_hw *hw, unsigned int changed_flags,
 
 static void
 mt7601u_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-			 struct ieee80211_bss_conf *info, u32 changed)
+			 struct ieee80211_bss_conf *info, u64 changed)
 {
 	struct mt7601u_dev *dev = hw->priv;
 
diff --git a/drivers/net/wireless/purelifi/plfxlc/mac.c b/drivers/net/wireless/purelifi/plfxlc/mac.c
index 90e552532701..1fc1682683e1 100644
--- a/drivers/net/wireless/purelifi/plfxlc/mac.c
+++ b/drivers/net/wireless/purelifi/plfxlc/mac.c
@@ -587,12 +587,12 @@ static void plfxlc_op_configure_filter(struct ieee80211_hw *hw,
 static void plfxlc_op_bss_info_changed(struct ieee80211_hw *hw,
 				       struct ieee80211_vif *vif,
 				       struct ieee80211_bss_conf *bss_conf,
-				       u32 changes)
+				       u64 changes)
 {
 	struct plfxlc_mac *mac = plfxlc_hw_mac(hw);
 	int associated;
 
-	dev_dbg(plfxlc_mac_dev(mac), "changes: %x\n", changes);
+	dev_dbg(plfxlc_mac_dev(mac), "changes: %llx\n", changes);
 
 	if (mac->type != NL80211_IFTYPE_ADHOC) { /* for STATION */
 		associated = is_valid_ether_addr(bss_conf->bssid);
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00.h b/drivers/net/wireless/ralink/rt2x00/rt2x00.h
index 9f6fc40649be..918e0477bb7d 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2x00.h
+++ b/drivers/net/wireless/ralink/rt2x00/rt2x00.h
@@ -1479,7 +1479,7 @@ int rt2x00mac_get_stats(struct ieee80211_hw *hw,
 void rt2x00mac_bss_info_changed(struct ieee80211_hw *hw,
 				struct ieee80211_vif *vif,
 				struct ieee80211_bss_conf *bss_conf,
-				u32 changes);
+				u64 changes);
 int rt2x00mac_conf_tx(struct ieee80211_hw *hw,
 		      struct ieee80211_vif *vif, u16 queue,
 		      const struct ieee80211_tx_queue_params *params);
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c b/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c
index 660554a01894..6205d22765c7 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c
@@ -574,7 +574,7 @@ EXPORT_SYMBOL_GPL(rt2x00mac_get_stats);
 void rt2x00mac_bss_info_changed(struct ieee80211_hw *hw,
 				struct ieee80211_vif *vif,
 				struct ieee80211_bss_conf *bss_conf,
-				u32 changes)
+				u64 changes)
 {
 	struct rt2x00_dev *rt2x00dev = hw->priv;
 	struct rt2x00_intf *intf = vif_to_intf(vif);
diff --git a/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c b/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c
index 025619cd14e8..10e8fdc31879 100644
--- a/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c
+++ b/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c
@@ -1500,7 +1500,7 @@ static void rtl8180_conf_erp(struct ieee80211_hw *dev,
 static void rtl8180_bss_info_changed(struct ieee80211_hw *dev,
 				     struct ieee80211_vif *vif,
 				     struct ieee80211_bss_conf *info,
-				     u32 changed)
+				     u64 changed)
 {
 	struct rtl8180_priv *priv = dev->priv;
 	struct rtl8180_vif *vif_priv;
diff --git a/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c b/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c
index eb68b2d3caa1..8ab65c888baf 100644
--- a/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c
+++ b/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c
@@ -1251,7 +1251,7 @@ static void rtl8187_conf_erp(struct rtl8187_priv *priv, bool use_short_slot,
 static void rtl8187_bss_info_changed(struct ieee80211_hw *dev,
 				     struct ieee80211_vif *vif,
 				     struct ieee80211_bss_conf *info,
-				     u32 changed)
+				     u64 changed)
 {
 	struct rtl8187_priv *priv = dev->priv;
 	struct rtl8187_vif *vif_priv;
diff --git a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c
index 0239e12ec8a5..65c4cb1e030c 100644
--- a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c
+++ b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c
@@ -4558,7 +4558,7 @@ rtl8xxxu_wireless_mode(struct ieee80211_hw *hw, struct ieee80211_sta *sta)
 
 static void
 rtl8xxxu_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-			  struct ieee80211_bss_conf *bss_conf, u32 changed)
+			  struct ieee80211_bss_conf *bss_conf, u64 changed)
 {
 	struct rtl8xxxu_priv *priv = hw->priv;
 	struct device *dev = &priv->udev->dev;
diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c b/drivers/net/wireless/realtek/rtlwifi/core.c
index 5177eb02740e..2f7fd8888d3a 100644
--- a/drivers/net/wireless/realtek/rtlwifi/core.c
+++ b/drivers/net/wireless/realtek/rtlwifi/core.c
@@ -1040,7 +1040,7 @@ EXPORT_SYMBOL_GPL(rtl_update_beacon_work_callback);
 static void rtl_op_bss_info_changed(struct ieee80211_hw *hw,
 				    struct ieee80211_vif *vif,
 				    struct ieee80211_bss_conf *bss_conf,
-				    u32 changed)
+				    u64 changed)
 {
 	struct rtl_priv *rtlpriv = rtl_priv(hw);
 	struct rtl_hal *rtlhal = rtl_hal(rtlpriv);
diff --git a/drivers/net/wireless/realtek/rtw88/mac80211.c b/drivers/net/wireless/realtek/rtw88/mac80211.c
index c5954c18862d..9e0b5692fbab 100644
--- a/drivers/net/wireless/realtek/rtw88/mac80211.c
+++ b/drivers/net/wireless/realtek/rtw88/mac80211.c
@@ -355,7 +355,7 @@ static void rtw_conf_tx(struct rtw_dev *rtwdev,
 static void rtw_ops_bss_info_changed(struct ieee80211_hw *hw,
 				     struct ieee80211_vif *vif,
 				     struct ieee80211_bss_conf *conf,
-				     u32 changed)
+				     u64 changed)
 {
 	struct rtw_dev *rtwdev = hw->priv;
 	struct rtw_vif *rtwvif = (struct rtw_vif *)vif->drv_priv;
diff --git a/drivers/net/wireless/realtek/rtw89/mac80211.c b/drivers/net/wireless/realtek/rtw89/mac80211.c
index ac8b9bf2b0bb..5afb8fe5708e 100644
--- a/drivers/net/wireless/realtek/rtw89/mac80211.c
+++ b/drivers/net/wireless/realtek/rtw89/mac80211.c
@@ -336,7 +336,7 @@ static void rtw89_station_mode_sta_assoc(struct rtw89_dev *rtwdev,
 static void rtw89_ops_bss_info_changed(struct ieee80211_hw *hw,
 				       struct ieee80211_vif *vif,
 				       struct ieee80211_bss_conf *conf,
-				       u32 changed)
+				       u64 changed)
 {
 	struct rtw89_dev *rtwdev = hw->priv;
 	struct rtw89_vif *rtwvif = (struct rtw89_vif *)vif->drv_priv;
diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
index d4b3834388ab..1dff3d263382 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
@@ -783,7 +783,7 @@ static void rsi_switch_channel(struct rsi_hw *adapter,
 static void rsi_mac80211_bss_info_changed(struct ieee80211_hw *hw,
 					  struct ieee80211_vif *vif,
 					  struct ieee80211_bss_conf *bss_conf,
-					  u32 changed)
+					  u64 changed)
 {
 	struct rsi_hw *adapter = hw->priv;
 	struct rsi_common *common = adapter->priv;
diff --git a/drivers/net/wireless/silabs/wfx/sta.c b/drivers/net/wireless/silabs/wfx/sta.c
index 79fafe8143d9..f4103a653e3b 100644
--- a/drivers/net/wireless/silabs/wfx/sta.c
+++ b/drivers/net/wireless/silabs/wfx/sta.c
@@ -506,7 +506,7 @@ static void wfx_enable_beacon(struct wfx_vif *wvif, bool enable)
 }
 
 void wfx_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-			  struct ieee80211_bss_conf *info, u32 changed)
+			  struct ieee80211_bss_conf *info, u64 changed)
 {
 	struct wfx_dev *wdev = hw->priv;
 	struct wfx_vif *wvif = (struct wfx_vif *)vif->drv_priv;
diff --git a/drivers/net/wireless/silabs/wfx/sta.h b/drivers/net/wireless/silabs/wfx/sta.h
index c69b2227e9ac..d9c6bd632b20 100644
--- a/drivers/net/wireless/silabs/wfx/sta.h
+++ b/drivers/net/wireless/silabs/wfx/sta.h
@@ -36,7 +36,7 @@ void wfx_leave_ibss(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
 int wfx_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 		u16 queue, const struct ieee80211_tx_queue_params *params);
 void wfx_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-			  struct ieee80211_bss_conf *info, u32 changed);
+			  struct ieee80211_bss_conf *info, u64 changed);
 int wfx_sta_add(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta);
 int wfx_sta_remove(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta);
 void wfx_sta_notify(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
diff --git a/drivers/net/wireless/st/cw1200/sta.c b/drivers/net/wireless/st/cw1200/sta.c
index 3f48def51ebc..5fa6878ee109 100644
--- a/drivers/net/wireless/st/cw1200/sta.c
+++ b/drivers/net/wireless/st/cw1200/sta.c
@@ -1796,14 +1796,14 @@ static int cw1200_set_btcoexinfo(struct cw1200_common *priv)
 void cw1200_bss_info_changed(struct ieee80211_hw *dev,
 			     struct ieee80211_vif *vif,
 			     struct ieee80211_bss_conf *info,
-			     u32 changed)
+			     u64 changed)
 {
 	struct cw1200_common *priv = dev->priv;
 	bool do_join = false;
 
 	mutex_lock(&priv->conf_mutex);
 
-	pr_debug("BSS CHANGED:  %08x\n", changed);
+	pr_debug("BSS CHANGED:  %llx\n", changed);
 
 	/* TODO: BSS_CHANGED_QOS */
 	/* TODO: BSS_CHANGED_TXPOWER */
diff --git a/drivers/net/wireless/st/cw1200/sta.h b/drivers/net/wireless/st/cw1200/sta.h
index 706dab8e73bf..05e3ab7ccef1 100644
--- a/drivers/net/wireless/st/cw1200/sta.h
+++ b/drivers/net/wireless/st/cw1200/sta.h
@@ -103,7 +103,7 @@ void cw1200_sta_notify(struct ieee80211_hw *dev, struct ieee80211_vif *vif,
 void cw1200_bss_info_changed(struct ieee80211_hw *dev,
 			     struct ieee80211_vif *vif,
 			     struct ieee80211_bss_conf *info,
-			     u32 changed);
+			     u64 changed);
 int cw1200_ampdu_action(struct ieee80211_hw *hw,
 			struct ieee80211_vif *vif,
 			struct ieee80211_ampdu_params *params);
diff --git a/drivers/net/wireless/ti/wl1251/main.c b/drivers/net/wireless/ti/wl1251/main.c
index bdc93c4f54ba..340ab4985fe2 100644
--- a/drivers/net/wireless/ti/wl1251/main.c
+++ b/drivers/net/wireless/ti/wl1251/main.c
@@ -1077,7 +1077,7 @@ out:
 static void wl1251_op_bss_info_changed(struct ieee80211_hw *hw,
 				       struct ieee80211_vif *vif,
 				       struct ieee80211_bss_conf *bss_conf,
-				       u32 changed)
+				       u64 changed)
 {
 	struct wl1251 *wl = hw->priv;
 	struct sk_buff *beacon, *skb;
diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c
index ad9560bd9512..d365bdce2a10 100644
--- a/drivers/net/wireless/ti/wlcore/main.c
+++ b/drivers/net/wireless/ti/wlcore/main.c
@@ -4580,7 +4580,7 @@ out:
 static void wl1271_op_bss_info_changed(struct ieee80211_hw *hw,
 				       struct ieee80211_vif *vif,
 				       struct ieee80211_bss_conf *bss_conf,
-				       u32 changed)
+				       u64 changed)
 {
 	struct wl1271 *wl = hw->priv;
 	struct wl12xx_vif *wlvif = wl12xx_vif_to_data(vif);
diff --git a/drivers/net/wireless/zydas/zd1211rw/zd_mac.c b/drivers/net/wireless/zydas/zd1211rw/zd_mac.c
index 3ef8533205f9..b42fedba519d 100644
--- a/drivers/net/wireless/zydas/zd1211rw/zd_mac.c
+++ b/drivers/net/wireless/zydas/zd1211rw/zd_mac.c
@@ -1278,12 +1278,12 @@ static void set_rts_cts(struct zd_mac *mac, unsigned int short_preamble)
 static void zd_op_bss_info_changed(struct ieee80211_hw *hw,
 				   struct ieee80211_vif *vif,
 				   struct ieee80211_bss_conf *bss_conf,
-				   u32 changes)
+				   u64 changes)
 {
 	struct zd_mac *mac = zd_hw_mac(hw);
 	int associated;
 
-	dev_dbg_f(zd_mac_dev(mac), "changes: %x\n", changes);
+	dev_dbg_f(zd_mac_dev(mac), "changes: %llx\n", changes);
 
 	if (mac->type == NL80211_IFTYPE_MESH_POINT ||
 	    mac->type == NL80211_IFTYPE_ADHOC ||
diff --git a/drivers/staging/vt6655/device_main.c b/drivers/staging/vt6655/device_main.c
index 8c5de30478ec..e8ac7b93b58c 100644
--- a/drivers/staging/vt6655/device_main.c
+++ b/drivers/staging/vt6655/device_main.c
@@ -1395,7 +1395,7 @@ static int vnt_config(struct ieee80211_hw *hw, u32 changed)
 
 static void vnt_bss_info_changed(struct ieee80211_hw *hw,
 				 struct ieee80211_vif *vif,
-				 struct ieee80211_bss_conf *conf, u32 changed)
+				 struct ieee80211_bss_conf *conf, u64 changed)
 {
 	struct vnt_private *priv = hw->priv;
 
diff --git a/drivers/staging/vt6656/main_usb.c b/drivers/staging/vt6656/main_usb.c
index 3ab8a7bb9715..897ee0f7fc6b 100644
--- a/drivers/staging/vt6656/main_usb.c
+++ b/drivers/staging/vt6656/main_usb.c
@@ -745,7 +745,7 @@ static int vnt_config(struct ieee80211_hw *hw, u32 changed)
 
 static void vnt_bss_info_changed(struct ieee80211_hw *hw,
 				 struct ieee80211_vif *vif,
-				 struct ieee80211_bss_conf *conf, u32 changed)
+				 struct ieee80211_bss_conf *conf, u64 changed)
 {
 	struct vnt_private *priv = hw->priv;
 
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 1520922d21a5..17b6eb426356 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -273,8 +273,8 @@ struct ieee80211_vif_chanctx_switch {
 /**
  * enum ieee80211_bss_change - BSS change notification flags
  *
- * These flags are used with the bss_info_changed() callback
- * to indicate which BSS parameter changed.
+ * These flags are used with the bss_info_changed(), link_info_changed()
+ * and vif_cfg_changed() callbacks to indicate which parameter(s) changed.
  *
  * @BSS_CHANGED_ASSOC: association status changed (associated/disassociated),
  *	also implies a change in the AID.
@@ -3524,6 +3524,22 @@ struct ieee80211_prep_tx_info {
  *	for association indication. The @changed parameter indicates which
  *	of the bss parameters has changed when a call is made. The callback
  *	can sleep.
+ *	Note: this callback is called if @vif_cfg_changed or @link_info_changed
+ *	are not implemented.
+ *
+ * @vif_cfg_changed: Handler for configuration requests related to interface
+ *	(MLD) parameters from &struct ieee80211_vif_cfg that vary during the
+ *	lifetime of the interface (e.g. assoc status, IP addresses, etc.)
+ *	The @changed parameter indicates which value changed.
+ *	The callback can sleep.
+ *
+ * @link_info_changed: Handler for configuration requests related to link
+ *	parameters from &struct ieee80211_bss_conf that are related to an
+ *	individual link. e.g. legacy/HT/VHT/... rate information.
+ *	The @changed parameter indicates which value changed, and the @link_id
+ *	parameter indicates the link ID. Note that the @link_id will be 0 for
+ *	non-MLO connections.
+ *	The callback can sleep.
  *
  * @prepare_multicast: Prepare for multicast filter configuration.
  *	This callback is optional, and its return value is passed
@@ -4032,7 +4048,13 @@ struct ieee80211_ops {
 	void (*bss_info_changed)(struct ieee80211_hw *hw,
 				 struct ieee80211_vif *vif,
 				 struct ieee80211_bss_conf *info,
-				 u32 changed);
+				 u64 changed);
+	void (*vif_cfg_changed)(struct ieee80211_hw *hw,
+				struct ieee80211_vif *vif,
+				u64 changed);
+	void (*link_info_changed)(struct ieee80211_hw *hw,
+				  struct ieee80211_vif *vif,
+				  unsigned int link_id, u64 changed);
 
 	int (*start_ap)(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
 	void (*stop_ap)(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 0a717cb95400..a1404fe5cfe4 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -39,7 +39,8 @@ static void ieee80211_set_mu_mimo_follow(struct ieee80211_sub_if_data *sdata,
 		memcpy(sdata->vif.bss_conf.mu_group.position,
 		       params->vht_mumimo_groups + WLAN_MEMBERSHIP_LEN,
 		       WLAN_USER_POSITION_LEN);
-		ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_MU_GROUPS);
+		ieee80211_link_info_change_notify(sdata, 0,
+						  BSS_CHANGED_MU_GROUPS);
 		/* don't care about endianness - just check for 0 */
 		memcpy(&membership, params->vht_mumimo_groups,
 		       WLAN_MEMBERSHIP_LEN);
@@ -1333,7 +1334,7 @@ static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev,
 		err |= BSS_CHANGED_HE_BSS_COLOR;
 	}
 
-	ieee80211_bss_info_change_notify(sdata, err);
+	ieee80211_link_info_change_notify(sdata, 0, err);
 	return 0;
 }
 
@@ -1414,7 +1415,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
 	sdata->beacon_rate_set = false;
 	sdata->vif.cfg.ssid_len = 0;
 	clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state);
-	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED);
+	ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_BEACON_ENABLED);
 
 	if (sdata->wdev.cac_started) {
 		chandef = sdata->vif.bss_conf.chandef;
@@ -2347,7 +2348,7 @@ static int ieee80211_update_mesh_config(struct wiphy *wiphy,
 	if (_chg_mesh_attr(NL80211_MESHCONF_HT_OPMODE, mask)) {
 		conf->ht_opmode = nconf->ht_opmode;
 		sdata->vif.bss_conf.ht_operation_mode = nconf->ht_opmode;
-		ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_HT);
+		ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_HT);
 	}
 	if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT, mask))
 		conf->dot11MeshHWMPactivePathToRootTimeout =
@@ -2502,7 +2503,7 @@ static int ieee80211_change_bss(struct wiphy *wiphy,
 		changed |= BSS_CHANGED_P2P_PS;
 	}
 
-	ieee80211_bss_info_change_notify(sdata, changed);
+	ieee80211_link_info_change_notify(sdata, 0, changed);
 
 	return 0;
 }
@@ -2543,7 +2544,7 @@ static int ieee80211_set_txq_params(struct wiphy *wiphy,
 		return -EINVAL;
 	}
 
-	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_QOS);
+	ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_QOS);
 
 	return 0;
 }
@@ -2692,7 +2693,7 @@ static int ieee80211_set_mcast_rate(struct wiphy *wiphy, struct net_device *dev,
 	memcpy(sdata->vif.bss_conf.mcast_rate, rate,
 	       sizeof(int) * NUM_NL80211_BANDS);
 
-	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_MCAST_RATE);
+	ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_MCAST_RATE);
 
 	return 0;
 }
@@ -3026,7 +3027,7 @@ static int ieee80211_set_cqm_rssi_config(struct wiphy *wiphy,
 	/* tell the driver upon association, unless already associated */
 	if (sdata->u.mgd.associated &&
 	    sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)
-		ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_CQM);
+		ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_CQM);
 
 	return 0;
 }
@@ -3051,7 +3052,7 @@ static int ieee80211_set_cqm_rssi_range_config(struct wiphy *wiphy,
 	/* tell the driver upon association, unless already associated */
 	if (sdata->u.mgd.associated &&
 	    sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)
-		ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_CQM);
+		ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_CQM);
 
 	return 0;
 }
@@ -3389,7 +3390,7 @@ static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata)
 	if (err)
 		return err;
 
-	ieee80211_bss_info_change_notify(sdata, changed);
+	ieee80211_link_info_change_notify(sdata, 0, changed);
 
 	if (sdata->deflink.csa_block_tx) {
 		ieee80211_wake_vif_queues(local, sdata,
@@ -3677,7 +3678,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 					  params->count, params->block_tx);
 
 	if (changed) {
-		ieee80211_bss_info_change_notify(sdata, changed);
+		ieee80211_link_info_change_notify(sdata, 0, changed);
 		drv_channel_switch_beacon(sdata, &params->chandef);
 	} else {
 		/* if the beacon didn't change, we can finalize immediately */
@@ -3980,7 +3981,7 @@ static int ieee80211_set_ap_chanwidth(struct wiphy *wiphy,
 
 	ret = ieee80211_vif_change_bandwidth(sdata, chandef, &changed);
 	if (ret == 0)
-		ieee80211_bss_info_change_notify(sdata, changed);
+		ieee80211_link_info_change_notify(sdata, 0, changed);
 
 	return ret;
 }
@@ -4389,7 +4390,7 @@ ieee80211_color_change_bss_config_notify(struct ieee80211_sub_if_data *sdata,
 	sdata->vif.bss_conf.he_bss_color.enabled = enable;
 	changed |= BSS_CHANGED_HE_BSS_COLOR;
 
-	ieee80211_bss_info_change_notify(sdata, changed);
+	ieee80211_link_info_change_notify(sdata, 0, changed);
 
 	if (!sdata->vif.bss_conf.nontransmitted && sdata->vif.mbssid_tx_vif) {
 		struct ieee80211_sub_if_data *child;
@@ -4399,8 +4400,8 @@ ieee80211_color_change_bss_config_notify(struct ieee80211_sub_if_data *sdata,
 			if (child != sdata && child->vif.mbssid_tx_vif == &sdata->vif) {
 				child->vif.bss_conf.he_bss_color.color = color;
 				child->vif.bss_conf.he_bss_color.enabled = enable;
-				ieee80211_bss_info_change_notify(child,
-								 BSS_CHANGED_HE_BSS_COLOR);
+				ieee80211_link_info_change_notify(child, 0,
+								  BSS_CHANGED_HE_BSS_COLOR);
 			}
 		}
 		mutex_unlock(&sdata->local->iflist_mtx);
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index 67131ca3f649..5d8b49f20198 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -799,8 +799,7 @@ out:
 
 	if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE &&
 	    sdata->vif.type != NL80211_IFTYPE_MONITOR)
-		ieee80211_bss_info_change_notify(sdata,
-						 BSS_CHANGED_IDLE);
+		ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_IDLE);
 
 	ieee80211_check_fast_xmit_iface(sdata);
 
@@ -1188,7 +1187,7 @@ ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata)
 	ieee80211_recalc_radar_chanctx(local, new_ctx);
 
 	if (changed)
-		ieee80211_bss_info_change_notify(sdata, changed);
+		ieee80211_link_info_change_notify(sdata, 0, changed);
 
 out:
 	ieee80211_vif_chanctx_reservation_complete(sdata);
@@ -1533,8 +1532,8 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
 			ieee80211_vif_update_chandef(sdata,
 						     &sdata->deflink.reserved_chandef);
 			if (changed)
-				ieee80211_bss_info_change_notify(sdata,
-								 changed);
+				ieee80211_link_info_change_notify(sdata, 0,
+								  changed);
 
 			ieee80211_recalc_txpower(sdata, false);
 		}
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index fd2882348211..15ab8d00815b 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -147,10 +147,27 @@ static inline int drv_config(struct ieee80211_local *local, u32 changed)
 	return ret;
 }
 
-static inline void drv_bss_info_changed(struct ieee80211_local *local,
-					struct ieee80211_sub_if_data *sdata,
-					struct ieee80211_bss_conf *info,
-					u32 changed)
+static inline void drv_vif_cfg_changed(struct ieee80211_local *local,
+				       struct ieee80211_sub_if_data *sdata,
+				       u64 changed)
+{
+	might_sleep();
+
+	if (!check_sdata_in_driver(sdata))
+		return;
+
+	trace_drv_vif_cfg_changed(local, sdata, changed);
+	if (local->ops->vif_cfg_changed)
+		local->ops->vif_cfg_changed(&local->hw, &sdata->vif, changed);
+	else if (local->ops->bss_info_changed)
+		local->ops->bss_info_changed(&local->hw, &sdata->vif,
+					     &sdata->vif.bss_conf, changed);
+	trace_drv_return_void(local);
+}
+
+static inline void drv_link_info_changed(struct ieee80211_local *local,
+					 struct ieee80211_sub_if_data *sdata,
+					 int link_id, u64 changed)
 {
 	might_sleep();
 
@@ -172,9 +189,13 @@ static inline void drv_bss_info_changed(struct ieee80211_local *local,
 	if (!check_sdata_in_driver(sdata))
 		return;
 
-	trace_drv_bss_info_changed(local, sdata, info, changed);
-	if (local->ops->bss_info_changed)
-		local->ops->bss_info_changed(&local->hw, &sdata->vif, info, changed);
+	trace_drv_link_info_changed(local, sdata, link_id, changed);
+	if (local->ops->link_info_changed)
+		local->ops->link_info_changed(&local->hw, &sdata->vif,
+					      link_id, changed);
+	else if (local->ops->bss_info_changed)
+		local->ops->bss_info_changed(&local->hw, &sdata->vif,
+					     &sdata->vif.bss_conf, changed);
 	trace_drv_return_void(local);
 }
 
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 066f7c5adeec..3b68e9f4345b 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -1851,7 +1851,7 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata,
 		| IEEE80211_HT_PARAM_RIFS_MODE;
 
 	changed |= BSS_CHANGED_HT | BSS_CHANGED_MCAST_RATE;
-	ieee80211_bss_info_change_notify(sdata, changed);
+	ieee80211_link_info_change_notify(sdata, 0, changed);
 
 	sdata->deflink.smps_mode = IEEE80211_SMPS_OFF;
 	sdata->deflink.needed_rx_chains = local->rx_chains;
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index f0a8bb444033..20153957cdee 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1836,7 +1836,11 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
 int ieee80211_hw_config(struct ieee80211_local *local, u32 changed);
 void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx);
 void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
-				      u32 changed);
+				      u64 changed);
+void ieee80211_vif_cfg_change_notify(struct ieee80211_sub_if_data *sdata,
+				     u64 changed);
+void ieee80211_link_info_change_notify(struct ieee80211_sub_if_data *sdata,
+				       int link_id, u64 changed);
 void ieee80211_configure_filter(struct ieee80211_local *local);
 u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata);
 
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 2ee34d898821..978dfa48e098 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -80,7 +80,8 @@ void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata,
 {
 	if (__ieee80211_recalc_txpower(sdata) ||
 	    (update_bss && ieee80211_sdata_running(sdata)))
-		ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_TXPOWER);
+		ieee80211_link_info_change_notify(sdata, 0,
+						  BSS_CHANGED_TXPOWER);
 }
 
 static u32 __ieee80211_idle_off(struct ieee80211_local *local)
@@ -1281,7 +1282,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
 		if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE &&
 		    sdata->vif.type != NL80211_IFTYPE_NAN)
 			changed |= ieee80211_reset_erp_info(sdata);
-		ieee80211_bss_info_change_notify(sdata, changed);
+		ieee80211_link_info_change_notify(sdata, 0, changed);
 
 		switch (sdata->vif.type) {
 		case NL80211_IFTYPE_STATION:
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 0c81ae492df4..5c71b522013f 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -199,15 +199,88 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed)
 	return ret;
 }
 
+#define BSS_CHANGED_VIF_CFG_FLAGS (BSS_CHANGED_ASSOC |\
+				   BSS_CHANGED_IDLE |\
+				   BSS_CHANGED_IBSS |\
+				   BSS_CHANGED_ARP_FILTER |\
+				   BSS_CHANGED_SSID)
+
 void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
-				      u32 changed)
+				      u64 changed)
 {
 	struct ieee80211_local *local = sdata->local;
 
+	might_sleep();
+
 	if (!changed || sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
 		return;
 
-	drv_bss_info_changed(local, sdata, &sdata->vif.bss_conf, changed);
+	if (WARN_ON_ONCE(changed & (BSS_CHANGED_BEACON |
+				    BSS_CHANGED_BEACON_ENABLED) &&
+			 sdata->vif.type != NL80211_IFTYPE_AP &&
+			 sdata->vif.type != NL80211_IFTYPE_ADHOC &&
+			 sdata->vif.type != NL80211_IFTYPE_MESH_POINT &&
+			 sdata->vif.type != NL80211_IFTYPE_OCB))
+		return;
+
+	if (WARN_ON_ONCE(sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE ||
+			 sdata->vif.type == NL80211_IFTYPE_NAN ||
+			 (sdata->vif.type == NL80211_IFTYPE_MONITOR &&
+			  !sdata->vif.bss_conf.mu_mimo_owner &&
+			  !(changed & BSS_CHANGED_TXPOWER))))
+		return;
+
+	if (!check_sdata_in_driver(sdata))
+		return;
+
+	if (changed & BSS_CHANGED_VIF_CFG_FLAGS) {
+		u64 ch = changed & BSS_CHANGED_VIF_CFG_FLAGS;
+
+		trace_drv_vif_cfg_changed(local, sdata, changed);
+		if (local->ops->vif_cfg_changed)
+			local->ops->vif_cfg_changed(&local->hw, &sdata->vif, ch);
+	}
+
+	if (changed & ~BSS_CHANGED_VIF_CFG_FLAGS) {
+		u64 ch = changed & ~BSS_CHANGED_VIF_CFG_FLAGS;
+
+		/* FIXME: should be for each link */
+		trace_drv_link_info_changed(local, sdata, 0, changed);
+		if (local->ops->link_info_changed)
+			local->ops->link_info_changed(&local->hw, &sdata->vif,
+						      0, ch);
+	}
+
+	if (local->ops->bss_info_changed)
+		local->ops->bss_info_changed(&local->hw, &sdata->vif,
+					     &sdata->vif.bss_conf, changed);
+	trace_drv_return_void(local);
+}
+
+void ieee80211_vif_cfg_change_notify(struct ieee80211_sub_if_data *sdata,
+				     u64 changed)
+{
+	struct ieee80211_local *local = sdata->local;
+
+	WARN_ON_ONCE(changed & ~BSS_CHANGED_VIF_CFG_FLAGS);
+
+	if (!changed || sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		return;
+
+	drv_vif_cfg_changed(local, sdata, changed);
+}
+
+void ieee80211_link_info_change_notify(struct ieee80211_sub_if_data *sdata,
+				       int link_id, u64 changed)
+{
+	struct ieee80211_local *local = sdata->local;
+
+	WARN_ON_ONCE(changed & BSS_CHANGED_VIF_CFG_FLAGS);
+
+	if (!changed || sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		return;
+
+	drv_link_info_changed(local, sdata, link_id, changed);
 }
 
 u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata)
@@ -387,8 +460,7 @@ static int ieee80211_ifa_changed(struct notifier_block *nb,
 
 	/* Configure driver only if associated (which also implies it is up) */
 	if (ifmgd->associated)
-		ieee80211_bss_info_change_notify(sdata,
-						 BSS_CHANGED_ARP_FILTER);
+		ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_ARP_FILTER);
 
 	sdata_unlock(sdata);
 
@@ -557,6 +629,10 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
 	if (WARN_ON(ops->sta_state && (ops->sta_add || ops->sta_remove)))
 		return NULL;
 
+	if (WARN_ON(!!ops->link_info_changed != !!ops->vif_cfg_changed ||
+		    (ops->link_info_changed && ops->bss_info_changed)))
+		return NULL;
+
 	/* check all or no channel context operations exist */
 	i = !!ops->add_chanctx + !!ops->remove_chanctx +
 	    !!ops->change_chanctx + !!ops->assign_vif_chanctx +
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index f60e257cba95..13722a7f2254 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -1057,7 +1057,7 @@ int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata)
 	}
 
 	ieee80211_recalc_dtim(local, sdata);
-	ieee80211_bss_info_change_notify(sdata, changed);
+	ieee80211_link_info_change_notify(sdata, 0, changed);
 
 	netif_carrier_on(sdata->dev);
 	return 0;
@@ -1081,7 +1081,7 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata)
 	sdata->vif.bss_conf.enable_beacon = false;
 	sdata->beacon_rate_set = false;
 	clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state);
-	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED);
+	ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_BEACON_ENABLED);
 
 	/* remove beacon */
 	bcn = rcu_dereference_protected(ifmsh->beacon,
@@ -1581,7 +1581,7 @@ static void mesh_bss_info_changed(struct ieee80211_sub_if_data *sdata)
 		if (ieee80211_mesh_rebuild_beacon(sdata))
 			return;
 
-	ieee80211_bss_info_change_notify(sdata, changed);
+	ieee80211_link_info_change_notify(sdata, 0, changed);
 }
 
 void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata)
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index fd1b97e1e990..e172bdfe9b0a 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1835,7 +1835,7 @@ void ieee80211_recalc_ps_vif(struct ieee80211_sub_if_data *sdata)
 
 	if (sdata->vif.bss_conf.ps != ps_allowed) {
 		sdata->vif.bss_conf.ps = ps_allowed;
-		ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_PS);
+		ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_PS);
 	}
 }
 
@@ -2031,7 +2031,7 @@ __ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata)
 void ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata)
 {
 	if (__ieee80211_sta_handle_tspec_ac_params(sdata))
-		ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_QOS);
+		ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_QOS);
 }
 
 static void ieee80211_sta_handle_tspec_ac_params_wk(struct work_struct *work)
@@ -2920,7 +2920,7 @@ static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata,
 		sta_info_destroy_addr(sdata, auth_data->bss->bssid);
 
 		eth_zero_addr(sdata->deflink.u.mgd.bssid);
-		ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID);
+		ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_BSSID);
 		sdata->u.mgd.flags = 0;
 		mutex_lock(&sdata->local->mtx);
 		ieee80211_vif_release_channel(sdata);
@@ -2949,7 +2949,7 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata,
 		sta_info_destroy_addr(sdata, assoc_data->bss->bssid);
 
 		eth_zero_addr(sdata->deflink.u.mgd.bssid);
-		ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID);
+		ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_BSSID);
 		sdata->u.mgd.flags = 0;
 		sdata->vif.bss_conf.mu_mimo_owner = false;
 
@@ -4387,7 +4387,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 					       elems->pwr_constr_elem,
 					       elems->cisco_dtpc_elem);
 
-	ieee80211_bss_info_change_notify(sdata, changed);
+	ieee80211_link_info_change_notify(sdata, 0, changed);
 free:
 	kfree(elems);
 }
@@ -5697,7 +5697,7 @@ skip_rates:
 		 * tell driver about BSSID, basic rates and timing
 		 * this was set up above, before setting the channel
 		 */
-		ieee80211_bss_info_change_notify(sdata,
+		ieee80211_link_info_change_notify(sdata, 0,
 			BSS_CHANGED_BSSID | BSS_CHANGED_BASIC_RATES |
 			BSS_CHANGED_BEACON_INT);
 
@@ -5865,7 +5865,7 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
 
  err_clear:
 	eth_zero_addr(sdata->deflink.u.mgd.bssid);
-	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID);
+	ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_BSSID);
 	ifmgd->auth_data = NULL;
 	mutex_lock(&sdata->local->mtx);
 	ieee80211_vif_release_channel(sdata);
@@ -6212,7 +6212,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
 	return 0;
  err_clear:
 	eth_zero_addr(sdata->deflink.u.mgd.bssid);
-	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID);
+	ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_BSSID);
 	ifmgd->assoc_data = NULL;
  err_free:
 	kfree(assoc_data);
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index 6cd3df1eb687..2ed4e2325914 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -118,8 +118,8 @@ void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local)
 			set_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED,
 				&sdata->state);
 			sdata->vif.bss_conf.enable_beacon = false;
-			ieee80211_bss_info_change_notify(
-				sdata, BSS_CHANGED_BEACON_ENABLED);
+			ieee80211_link_info_change_notify(
+				sdata, 0, BSS_CHANGED_BEACON_ENABLED);
 		}
 
 		if (sdata->vif.type == NL80211_IFTYPE_STATION &&
@@ -155,8 +155,8 @@ void ieee80211_offchannel_return(struct ieee80211_local *local)
 		if (test_and_clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED,
 				       &sdata->state)) {
 			sdata->vif.bss_conf.enable_beacon = true;
-			ieee80211_bss_info_change_notify(
-				sdata, BSS_CHANGED_BEACON_ENABLED);
+			ieee80211_link_info_change_notify(
+				sdata, 0, BSS_CHANGED_BEACON_ENABLED);
 		}
 	}
 	mutex_unlock(&local->iflist_mtx);
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 115efa830673..9a70d846d0dd 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -630,7 +630,7 @@ ieee80211_recalc_p2p_go_ps_allowed(struct ieee80211_sub_if_data *sdata)
 
 	if (allow_p2p_go_ps != sdata->vif.bss_conf.allow_p2p_go_ps) {
 		sdata->vif.bss_conf.allow_p2p_go_ps = allow_p2p_go_ps;
-		ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_P2P_PS);
+		ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_P2P_PS);
 	}
 }
 
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index 8a2ec9c31240..11a3b950b490 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -1335,7 +1335,7 @@ iee80211_tdls_recalc_ht_protection(struct ieee80211_sub_if_data *sdata,
 		return;
 
 	sdata->vif.bss_conf.ht_operation_mode = opmode;
-	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_HT);
+	ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_HT);
 }
 
 int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 74ef7e71e1ef..9417378b4fc0 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -390,22 +390,71 @@ TRACE_EVENT(drv_config,
 	)
 );
 
-TRACE_EVENT(drv_bss_info_changed,
+TRACE_EVENT(drv_vif_cfg_changed,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
-		 struct ieee80211_bss_conf *info,
-		 u32 changed),
+		 u64 changed),
 
-	TP_ARGS(local, sdata, info, changed),
+	TP_ARGS(local, sdata, changed),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
 		VIF_ENTRY
-		__field(u32, changed)
+		__field(u64, changed)
 		__field(bool, assoc)
 		__field(bool, ibss_joined)
 		__field(bool, ibss_creator)
 		__field(u16, aid)
+		__dynamic_array(u32, arp_addr_list,
+				sdata->vif.cfg.arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ?
+					IEEE80211_BSS_ARP_ADDR_LIST_LEN :
+					sdata->vif.cfg.arp_addr_cnt)
+		__field(int, arp_addr_cnt)
+		__dynamic_array(u8, ssid, sdata->vif.cfg.ssid_len)
+		__field(int, s1g)
+		__field(bool, idle)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		VIF_ASSIGN;
+		__entry->changed = changed;
+		__entry->aid = sdata->vif.cfg.aid;
+		__entry->assoc = sdata->vif.cfg.assoc;
+		__entry->ibss_joined = sdata->vif.cfg.ibss_joined;
+		__entry->ibss_creator = sdata->vif.cfg.ibss_creator;
+
+		__entry->arp_addr_cnt = sdata->vif.cfg.arp_addr_cnt;
+		memcpy(__get_dynamic_array(arp_addr_list),
+		       sdata->vif.cfg.arp_addr_list,
+		       sizeof(u32) * (sdata->vif.cfg.arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ?
+					IEEE80211_BSS_ARP_ADDR_LIST_LEN :
+					sdata->vif.cfg.arp_addr_cnt));
+		memcpy(__get_dynamic_array(ssid),
+		       sdata->vif.cfg.ssid,
+		       sdata->vif.cfg.ssid_len);
+		__entry->s1g = sdata->vif.cfg.s1g;
+		__entry->idle = sdata->vif.cfg.idle;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT  VIF_PR_FMT " changed:%#llx",
+		LOCAL_PR_ARG, VIF_PR_ARG, __entry->changed
+	)
+);
+
+TRACE_EVENT(drv_link_info_changed,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata,
+		 int link_id, u64 changed),
+
+	TP_ARGS(local, sdata, link_id, changed),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		VIF_ENTRY
+		__field(u64, changed)
+		__field(int, link_id)
 		__field(bool, cts)
 		__field(bool, shortpre)
 		__field(bool, shortslot)
@@ -424,15 +473,8 @@ TRACE_EVENT(drv_bss_info_changed,
 		__field(u32, channel_width)
 		__field(u32, channel_cfreq1)
 		__field(u32, channel_cfreq1_offset)
-		__dynamic_array(u32, arp_addr_list,
-				sdata->vif.cfg.arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ?
-					IEEE80211_BSS_ARP_ADDR_LIST_LEN :
-					sdata->vif.cfg.arp_addr_cnt)
-		__field(int, arp_addr_cnt)
 		__field(bool, qos)
-		__field(bool, idle)
 		__field(bool, ps)
-		__dynamic_array(u8, ssid, sdata->vif.cfg.ssid_len)
 		__field(bool, hidden_ssid)
 		__field(int, txpower)
 		__field(u8, p2p_oppps_ctwindow)
@@ -442,49 +484,37 @@ TRACE_EVENT(drv_bss_info_changed,
 		LOCAL_ASSIGN;
 		VIF_ASSIGN;
 		__entry->changed = changed;
-		__entry->aid = sdata->vif.cfg.aid;
-		__entry->assoc = sdata->vif.cfg.assoc;
-		__entry->ibss_joined = sdata->vif.cfg.ibss_joined;
-		__entry->ibss_creator = sdata->vif.cfg.ibss_creator;
-		__entry->shortpre = info->use_short_preamble;
-		__entry->cts = info->use_cts_prot;
-		__entry->shortslot = info->use_short_slot;
-		__entry->enable_beacon = info->enable_beacon;
-		__entry->dtimper = info->dtim_period;
-		__entry->bcnint = info->beacon_int;
-		__entry->assoc_cap = info->assoc_capability;
-		__entry->sync_tsf = info->sync_tsf;
-		__entry->sync_device_ts = info->sync_device_ts;
-		__entry->sync_dtim_count = info->sync_dtim_count;
-		__entry->basic_rates = info->basic_rates;
-		memcpy(__entry->mcast_rate, info->mcast_rate,
+		__entry->link_id = link_id;
+		__entry->shortpre = sdata->vif.bss_conf.use_short_preamble;
+		__entry->cts = sdata->vif.bss_conf.use_cts_prot;
+		__entry->shortslot = sdata->vif.bss_conf.use_short_slot;
+		__entry->enable_beacon = sdata->vif.bss_conf.enable_beacon;
+		__entry->dtimper = sdata->vif.bss_conf.dtim_period;
+		__entry->bcnint = sdata->vif.bss_conf.beacon_int;
+		__entry->assoc_cap = sdata->vif.bss_conf.assoc_capability;
+		__entry->sync_tsf = sdata->vif.bss_conf.sync_tsf;
+		__entry->sync_device_ts = sdata->vif.bss_conf.sync_device_ts;
+		__entry->sync_dtim_count = sdata->vif.bss_conf.sync_dtim_count;
+		__entry->basic_rates = sdata->vif.bss_conf.basic_rates;
+		memcpy(__entry->mcast_rate, sdata->vif.bss_conf.mcast_rate,
 		       sizeof(__entry->mcast_rate));
-		__entry->ht_operation_mode = info->ht_operation_mode;
-		__entry->cqm_rssi_thold = info->cqm_rssi_thold;
-		__entry->cqm_rssi_hyst = info->cqm_rssi_hyst;
-		__entry->channel_width = info->chandef.width;
-		__entry->channel_cfreq1 = info->chandef.center_freq1;
-		__entry->channel_cfreq1_offset = info->chandef.freq1_offset;
-		__entry->arp_addr_cnt = sdata->vif.cfg.arp_addr_cnt;
-		memcpy(__get_dynamic_array(arp_addr_list),
-		       sdata->vif.cfg.arp_addr_list,
-		       sizeof(u32) * (sdata->vif.cfg.arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ?
-					IEEE80211_BSS_ARP_ADDR_LIST_LEN :
-					sdata->vif.cfg.arp_addr_cnt));
-		__entry->qos = info->qos;
-		__entry->idle = sdata->vif.cfg.idle;
-		__entry->ps = info->ps;
-		memcpy(__get_dynamic_array(ssid),
-		       sdata->vif.cfg.ssid,
-		       sdata->vif.cfg.ssid_len);
-		__entry->hidden_ssid = info->hidden_ssid;
-		__entry->txpower = info->txpower;
-		__entry->p2p_oppps_ctwindow = info->p2p_noa_attr.oppps_ctwindow;
-	),
-
-	TP_printk(
-		LOCAL_PR_FMT  VIF_PR_FMT " changed:%#x",
-		LOCAL_PR_ARG, VIF_PR_ARG, __entry->changed
+		__entry->ht_operation_mode = sdata->vif.bss_conf.ht_operation_mode;
+		__entry->cqm_rssi_thold = sdata->vif.bss_conf.cqm_rssi_thold;
+		__entry->cqm_rssi_hyst = sdata->vif.bss_conf.cqm_rssi_hyst;
+		__entry->channel_width = sdata->vif.bss_conf.chandef.width;
+		__entry->channel_cfreq1 = sdata->vif.bss_conf.chandef.center_freq1;
+		__entry->channel_cfreq1_offset = sdata->vif.bss_conf.chandef.freq1_offset;
+		__entry->qos = sdata->vif.bss_conf.qos;
+		__entry->ps = sdata->vif.bss_conf.ps;
+		__entry->hidden_ssid = sdata->vif.bss_conf.hidden_ssid;
+		__entry->txpower = sdata->vif.bss_conf.txpower;
+		__entry->p2p_oppps_ctwindow = sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT  VIF_PR_FMT " link_id:%d, changed:%#llx",
+		LOCAL_PR_ARG, VIF_PR_ARG, __entry->link_id,
+		__entry->changed
 	)
 );
 
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 9cbc09e6d84e..2a279dc3e457 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -1699,8 +1699,8 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
 	    sdata->vif.type != NL80211_IFTYPE_NAN) {
 		sdata->vif.bss_conf.qos = enable_qos;
 		if (bss_notify)
-			ieee80211_bss_info_change_notify(sdata,
-							 BSS_CHANGED_QOS);
+			ieee80211_link_info_change_notify(sdata, 0,
+							  BSS_CHANGED_QOS);
 	}
 }
 
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index ac97584b3a0b..7daca8352deb 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -665,7 +665,7 @@ void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata,
 	       mgmt->u.action.u.vht_group_notif.position,
 	       WLAN_USER_POSITION_LEN);
 
-	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_MU_GROUPS);
+	ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_MU_GROUPS);
 }
 
 void ieee80211_update_mu_groups(struct ieee80211_vif *vif,
-- 
cgit 


From 8e14130d3faf7b6b0fc57b530bb601cd9d6a1dab Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 30 May 2022 13:09:28 +0200
Subject: wifi: mac80211: add per-link configuration pointer

Add pointers so we can start using link_id throughout the
code, even if for now only link ID 0 is valid, pointing
to the "built-in" bss_conf, which is used by drivers that
are not aware of MLD.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  3 +++
 net/mac80211/ieee80211_i.h |  1 +
 net/mac80211/iface.c       | 23 +++++++++++++++++++++--
 3 files changed, 25 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 17b6eb426356..9af283b1c3b5 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1739,6 +1739,8 @@ struct ieee80211_vif_cfg {
  * @cfg: vif configuration, see &struct ieee80211_vif_cfg
  * @bss_conf: BSS configuration for this interface, either our own
  *	or the BSS we're associated to
+ * @link_conf: in case of MLD, the per-link BSS configuration,
+ *	indexed by link ID
  * @addr: address of this interface
  * @p2p: indicates whether this AP or STA interface is a p2p
  *	interface, i.e. a GO or p2p-sta respectively
@@ -1773,6 +1775,7 @@ struct ieee80211_vif {
 	enum nl80211_iftype type;
 	struct ieee80211_vif_cfg cfg;
 	struct ieee80211_bss_conf bss_conf;
+	struct ieee80211_bss_conf *link_conf[IEEE80211_MLD_MAX_NUM_LINKS];
 	u8 addr[ETH_ALEN] __aligned(2);
 	bool p2p;
 
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 20153957cdee..397b111f006d 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1031,6 +1031,7 @@ struct ieee80211_sub_if_data {
 	} u;
 
 	struct ieee80211_link_data deflink;
+	struct ieee80211_link_data *link[IEEE80211_MLD_MAX_NUM_LINKS];
 
 #ifdef CONFIG_MAC80211_DEBUGFS
 	struct {
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 978dfa48e098..04ee525394e9 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1012,6 +1012,23 @@ static void ieee80211_set_default_queues(struct ieee80211_sub_if_data *sdata)
 	sdata->vif.cab_queue = IEEE80211_INVAL_HW_QUEUE;
 }
 
+static void ieee80211_sdata_init(struct ieee80211_local *local,
+				 struct ieee80211_sub_if_data *sdata)
+{
+	sdata->local = local;
+
+	/*
+	 * Initialize the default link, so we can use link_id 0 for non-MLD,
+	 * and that continues to work for non-MLD-aware drivers that use just
+	 * vif.bss_conf instead of vif.link_conf.
+	 *
+	 * Note that we never change this, so if link ID 0 isn't used in an
+	 * MLD connection, we get a separate allocation for it.
+	 */
+	sdata->vif.link_conf[0] = &sdata->vif.bss_conf;
+	sdata->link[0] = &sdata->deflink;
+}
+
 int ieee80211_add_virtual_monitor(struct ieee80211_local *local)
 {
 	struct ieee80211_sub_if_data *sdata;
@@ -1031,12 +1048,13 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local)
 		return -ENOMEM;
 
 	/* set up data */
-	sdata->local = local;
 	sdata->vif.type = NL80211_IFTYPE_MONITOR;
 	snprintf(sdata->name, IFNAMSIZ, "%s-monitor",
 		 wiphy_name(local->hw.wiphy));
 	sdata->wdev.iftype = NL80211_IFTYPE_MONITOR;
 
+	ieee80211_sdata_init(local, sdata);
+
 	ieee80211_set_default_queues(sdata);
 
 	ret = drv_add_interface(local, sdata);
@@ -2074,7 +2092,8 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
 
 	/* initialise type-independent data */
 	sdata->wdev.wiphy = local->hw.wiphy;
-	sdata->local = local;
+
+	ieee80211_sdata_init(local, sdata);
 
 	ieee80211_init_frag_cache(&sdata->frags);
 
-- 
cgit 


From b4f85443c17c7edb49c82fc1d28d26860c8c850d Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 30 May 2022 18:35:23 +0200
Subject: wifi: mac80211: make channel context code MLO-aware

Make the channel context code MLO aware, along with some
functions that it uses, so that the chan.c file is now
MLD-clean and no longer uses deflink/bss_conf/etc.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath10k/mac.c             |   2 +
 drivers/net/wireless/ath/ath11k/mac.c             |   2 +
 drivers/net/wireless/ath/ath9k/main.c             |   2 +
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c |   2 +
 drivers/net/wireless/mac80211_hwsim.c             |   2 +
 drivers/net/wireless/silabs/wfx/sta.c             |   2 +
 drivers/net/wireless/silabs/wfx/sta.h             |   2 +
 drivers/net/wireless/ti/wlcore/main.c             |   2 +
 include/net/mac80211.h                            |   4 +
 net/mac80211/cfg.c                                |  52 +-
 net/mac80211/chan.c                               | 642 +++++++++++++---------
 net/mac80211/debug.h                              |  14 +
 net/mac80211/driver-ops.h                         |   8 +-
 net/mac80211/eht.c                                |   4 +-
 net/mac80211/he.c                                 |   6 +-
 net/mac80211/ibss.c                               |   8 +-
 net/mac80211/ieee80211_i.h                        |  51 +-
 net/mac80211/iface.c                              |  15 +-
 net/mac80211/mesh_plink.c                         |   4 +-
 net/mac80211/mlme.c                               |  28 +-
 net/mac80211/ocb.c                                |   6 +-
 net/mac80211/rate.c                               |   8 +-
 net/mac80211/rate.h                               |   7 +-
 net/mac80211/rx.c                                 |   8 +-
 net/mac80211/tdls.c                               |   4 +-
 net/mac80211/trace.h                              |  17 +-
 net/mac80211/util.c                               |  39 +-
 net/mac80211/vht.c                                |  36 +-
 28 files changed, 586 insertions(+), 391 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index c1f8123d81fe..4808ed6a6e20 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -8919,6 +8919,7 @@ unlock:
 static int
 ath10k_mac_op_assign_vif_chanctx(struct ieee80211_hw *hw,
 				 struct ieee80211_vif *vif,
+				 unsigned int link_id,
 				 struct ieee80211_chanctx_conf *ctx)
 {
 	struct ath10k *ar = hw->priv;
@@ -8998,6 +8999,7 @@ err:
 static void
 ath10k_mac_op_unassign_vif_chanctx(struct ieee80211_hw *hw,
 				   struct ieee80211_vif *vif,
+				   unsigned int link_id,
 				   struct ieee80211_chanctx_conf *ctx)
 {
 	struct ath10k *ar = hw->priv;
diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c
index 3e07b1454fe4..96a33b49a52f 100644
--- a/drivers/net/wireless/ath/ath11k/mac.c
+++ b/drivers/net/wireless/ath/ath11k/mac.c
@@ -7072,6 +7072,7 @@ static int ath11k_start_vdev_delay(struct ieee80211_hw *hw,
 static int
 ath11k_mac_op_assign_vif_chanctx(struct ieee80211_hw *hw,
 				 struct ieee80211_vif *vif,
+				 unsigned int link_id,
 				 struct ieee80211_chanctx_conf *ctx)
 {
 	struct ath11k *ar = hw->priv;
@@ -7161,6 +7162,7 @@ out:
 static void
 ath11k_mac_op_unassign_vif_chanctx(struct ieee80211_hw *hw,
 				   struct ieee80211_vif *vif,
+				   unsigned int link_id,
 				   struct ieee80211_chanctx_conf *ctx)
 {
 	struct ath11k *ar = hw->priv;
diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index 729f8ee9644d..c3d5d9795424 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -2596,6 +2596,7 @@ static void ath9k_change_chanctx(struct ieee80211_hw *hw,
 
 static int ath9k_assign_vif_chanctx(struct ieee80211_hw *hw,
 				    struct ieee80211_vif *vif,
+				    unsigned int link_id,
 				    struct ieee80211_chanctx_conf *conf)
 {
 	struct ath_softc *sc = hw->priv;
@@ -2627,6 +2628,7 @@ static int ath9k_assign_vif_chanctx(struct ieee80211_hw *hw,
 
 static void ath9k_unassign_vif_chanctx(struct ieee80211_hw *hw,
 				       struct ieee80211_vif *vif,
+				       unsigned int link_id,
 				       struct ieee80211_chanctx_conf *conf)
 {
 	struct ath_softc *sc = hw->priv;
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
index 586208506275..5d3cedc146be 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
@@ -4235,6 +4235,7 @@ out:
 }
 static int iwl_mvm_assign_vif_chanctx(struct ieee80211_hw *hw,
 				      struct ieee80211_vif *vif,
+				      unsigned int link_id,
 				      struct ieee80211_chanctx_conf *ctx)
 {
 	struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw);
@@ -4308,6 +4309,7 @@ out:
 
 static void iwl_mvm_unassign_vif_chanctx(struct ieee80211_hw *hw,
 					 struct ieee80211_vif *vif,
+					 unsigned int link_id,
 					 struct ieee80211_chanctx_conf *ctx)
 {
 	struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw);
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index de10a40d5306..f9392bec85d9 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -2722,6 +2722,7 @@ static void mac80211_hwsim_change_chanctx(struct ieee80211_hw *hw,
 
 static int mac80211_hwsim_assign_vif_chanctx(struct ieee80211_hw *hw,
 					     struct ieee80211_vif *vif,
+					     unsigned int link_id,
 					     struct ieee80211_chanctx_conf *ctx)
 {
 	hwsim_check_magic(vif);
@@ -2732,6 +2733,7 @@ static int mac80211_hwsim_assign_vif_chanctx(struct ieee80211_hw *hw,
 
 static void mac80211_hwsim_unassign_vif_chanctx(struct ieee80211_hw *hw,
 						struct ieee80211_vif *vif,
+						unsigned int link_id,
 						struct ieee80211_chanctx_conf *ctx)
 {
 	hwsim_check_magic(vif);
diff --git a/drivers/net/wireless/silabs/wfx/sta.c b/drivers/net/wireless/silabs/wfx/sta.c
index f4103a653e3b..97a631ff1bfa 100644
--- a/drivers/net/wireless/silabs/wfx/sta.c
+++ b/drivers/net/wireless/silabs/wfx/sta.c
@@ -680,6 +680,7 @@ void wfx_change_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *
 }
 
 int wfx_assign_vif_chanctx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+			   unsigned int link_id,
 			   struct ieee80211_chanctx_conf *conf)
 {
 	struct wfx_vif *wvif = (struct wfx_vif *)vif->drv_priv;
@@ -692,6 +693,7 @@ int wfx_assign_vif_chanctx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 }
 
 void wfx_unassign_vif_chanctx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+			      unsigned int link_id,
 			      struct ieee80211_chanctx_conf *conf)
 {
 	struct wfx_vif *wvif = (struct wfx_vif *)vif->drv_priv;
diff --git a/drivers/net/wireless/silabs/wfx/sta.h b/drivers/net/wireless/silabs/wfx/sta.h
index d9c6bd632b20..3109d257fe94 100644
--- a/drivers/net/wireless/silabs/wfx/sta.h
+++ b/drivers/net/wireless/silabs/wfx/sta.h
@@ -48,8 +48,10 @@ int wfx_add_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *conf
 void wfx_remove_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *conf);
 void wfx_change_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *conf, u32 changed);
 int wfx_assign_vif_chanctx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+			   unsigned int link_id,
 			   struct ieee80211_chanctx_conf *conf);
 void wfx_unassign_vif_chanctx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+			      unsigned int link_id,
 			      struct ieee80211_chanctx_conf *conf);
 
 /* Hardware API Callbacks */
diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c
index d365bdce2a10..b476f244a20e 100644
--- a/drivers/net/wireless/ti/wlcore/main.c
+++ b/drivers/net/wireless/ti/wlcore/main.c
@@ -4701,6 +4701,7 @@ out:
 
 static int wlcore_op_assign_vif_chanctx(struct ieee80211_hw *hw,
 					struct ieee80211_vif *vif,
+					unsigned int link_id,
 					struct ieee80211_chanctx_conf *ctx)
 {
 	struct wl1271 *wl = hw->priv;
@@ -4751,6 +4752,7 @@ out:
 
 static void wlcore_op_unassign_vif_chanctx(struct ieee80211_hw *hw,
 					   struct ieee80211_vif *vif,
+					   unsigned int link_id,
 					   struct ieee80211_chanctx_conf *ctx)
 {
 	struct wl1271 *wl = hw->priv;
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 9af283b1c3b5..4dc51c52dd28 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -261,11 +261,13 @@ enum ieee80211_chanctx_switch_mode {
  * done.
  *
  * @vif: the vif that should be switched from old_ctx to new_ctx
+ * @link_id: the link ID that's switching
  * @old_ctx: the old context to which the vif was assigned
  * @new_ctx: the new context to which the vif must be assigned
  */
 struct ieee80211_vif_chanctx_switch {
 	struct ieee80211_vif *vif;
+	unsigned int link_id;
 	struct ieee80211_chanctx_conf *old_ctx;
 	struct ieee80211_chanctx_conf *new_ctx;
 };
@@ -4262,9 +4264,11 @@ struct ieee80211_ops {
 			       u32 changed);
 	int (*assign_vif_chanctx)(struct ieee80211_hw *hw,
 				  struct ieee80211_vif *vif,
+				  unsigned int link_id,
 				  struct ieee80211_chanctx_conf *ctx);
 	void (*unassign_vif_chanctx)(struct ieee80211_hw *hw,
 				     struct ieee80211_vif *vif,
+				     unsigned int link_id,
 				     struct ieee80211_chanctx_conf *ctx);
 	int (*switch_vif_chanctx)(struct ieee80211_hw *hw,
 				  struct ieee80211_vif_chanctx_switch *vifs,
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 69f540b0cbc6..884d2cfcec1e 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -840,9 +840,10 @@ static int ieee80211_set_monitor_channel(struct wiphy *wiphy,
 		sdata = wiphy_dereference(local->hw.wiphy,
 					  local->monitor_sdata);
 		if (sdata) {
-			ieee80211_vif_release_channel(sdata);
-			ret = ieee80211_vif_use_channel(sdata, chandef,
-					IEEE80211_CHANCTX_EXCLUSIVE);
+			ieee80211_link_release_channel(sdata->link[0]);
+			ret = ieee80211_link_use_channel(sdata->link[0],
+							 chandef,
+							 IEEE80211_CHANCTX_EXCLUSIVE);
 		}
 	} else if (local->open_count == local->monitors) {
 		local->_oper_chandef = *chandef;
@@ -1183,10 +1184,12 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 	}
 
 	mutex_lock(&local->mtx);
-	err = ieee80211_vif_use_channel(sdata, &params->chandef,
-					IEEE80211_CHANCTX_SHARED);
+	err = ieee80211_link_use_channel(sdata->link[params->beacon.link_id],
+					 &params->chandef,
+					 IEEE80211_CHANCTX_SHARED);
 	if (!err)
-		ieee80211_vif_copy_chanctx_to_vlans(sdata, false);
+		ieee80211_link_copy_chanctx_to_vlans(sdata->link[params->beacon.link_id],
+						     false);
 	mutex_unlock(&local->mtx);
 	if (err) {
 		sdata->vif.bss_conf.beacon_int = prev_beacon_int;
@@ -1296,7 +1299,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 
 error:
 	mutex_lock(&local->mtx);
-	ieee80211_vif_release_channel(sdata);
+	ieee80211_link_release_channel(sdata->link[params->beacon.link_id]);
 	mutex_unlock(&local->mtx);
 
 	return err;
@@ -1433,8 +1436,8 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
 	ieee80211_purge_tx_queue(&local->hw, &sdata->u.ap.ps.bc_buf);
 
 	mutex_lock(&local->mtx);
-	ieee80211_vif_copy_chanctx_to_vlans(sdata, true);
-	ieee80211_vif_release_channel(sdata);
+	ieee80211_link_copy_chanctx_to_vlans(sdata->link[link_id], true);
+	ieee80211_link_release_channel(sdata->link[link_id]);
 	mutex_unlock(&local->mtx);
 
 	return 0;
@@ -2401,8 +2404,8 @@ static int ieee80211_join_mesh(struct wiphy *wiphy, struct net_device *dev,
 	sdata->deflink.needed_rx_chains = sdata->local->rx_chains;
 
 	mutex_lock(&sdata->local->mtx);
-	err = ieee80211_vif_use_channel(sdata, &setup->chandef,
-					IEEE80211_CHANCTX_SHARED);
+	err = ieee80211_link_use_channel(sdata->link[0], &setup->chandef,
+					 IEEE80211_CHANCTX_SHARED);
 	mutex_unlock(&sdata->local->mtx);
 	if (err)
 		return err;
@@ -2416,7 +2419,7 @@ static int ieee80211_leave_mesh(struct wiphy *wiphy, struct net_device *dev)
 
 	ieee80211_stop_mesh(sdata);
 	mutex_lock(&sdata->local->mtx);
-	ieee80211_vif_release_channel(sdata);
+	ieee80211_link_release_channel(sdata->link[0]);
 	kfree(sdata->u.mesh.ie);
 	mutex_unlock(&sdata->local->mtx);
 
@@ -3145,8 +3148,8 @@ static int ieee80211_start_radar_detection(struct wiphy *wiphy,
 	sdata->deflink.smps_mode = IEEE80211_SMPS_OFF;
 	sdata->deflink.needed_rx_chains = local->rx_chains;
 
-	err = ieee80211_vif_use_channel(sdata, chandef,
-					IEEE80211_CHANCTX_SHARED);
+	err = ieee80211_link_use_channel(sdata->link[0], chandef,
+					 IEEE80211_CHANCTX_SHARED);
 	if (err)
 		goto out_unlock;
 
@@ -3174,7 +3177,7 @@ static void ieee80211_end_cac(struct wiphy *wiphy,
 		cancel_delayed_work(&sdata->deflink.dfs_cac_timer_work);
 
 		if (sdata->wdev.cac_started) {
-			ieee80211_vif_release_channel(sdata);
+			ieee80211_link_release_channel(sdata->link[0]);
 			sdata->wdev.cac_started = false;
 		}
 	}
@@ -3378,7 +3381,7 @@ static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata)
 		if (sdata->deflink.reserved_ready)
 			return 0;
 
-		return ieee80211_vif_use_reserved_context(sdata);
+		return ieee80211_link_use_reserved_context(sdata->link[0]);
 	}
 
 	if (!cfg80211_chandef_identical(&sdata->vif.bss_conf.chandef,
@@ -3643,16 +3646,16 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 	if (err)
 		goto out;
 
-	err = ieee80211_vif_reserve_chanctx(sdata, &params->chandef,
-					    chanctx->mode,
-					    params->radar_required);
+	err = ieee80211_link_reserve_chanctx(sdata->link[0], &params->chandef,
+					     chanctx->mode,
+					     params->radar_required);
 	if (err)
 		goto out;
 
 	/* if reservation is invalid then this will fail */
 	err = ieee80211_check_combinations(sdata, NULL, chanctx->mode, 0);
 	if (err) {
-		ieee80211_vif_unreserve_chanctx(sdata);
+		ieee80211_link_unreserve_chanctx(sdata->link[0]);
 		goto out;
 	}
 
@@ -3662,7 +3665,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 
 	err = ieee80211_set_csa_beacon(sdata, params, &changed);
 	if (err) {
-		ieee80211_vif_unreserve_chanctx(sdata);
+		ieee80211_link_unreserve_chanctx(sdata->link[0]);
 		goto out;
 	}
 
@@ -3921,9 +3924,9 @@ static int ieee80211_cfg_get_channel(struct wiphy *wiphy,
 	int ret = -ENODATA;
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
+	chanctx_conf = rcu_dereference(sdata->vif.link_conf[link_id]->chanctx_conf);
 	if (chanctx_conf) {
-		*chandef = sdata->vif.bss_conf.chandef;
+		*chandef = sdata->vif.link_conf[link_id]->chandef;
 		ret = 0;
 	} else if (local->open_count > 0 &&
 		   local->open_count == local->monitors &&
@@ -3980,7 +3983,8 @@ static int ieee80211_set_ap_chanwidth(struct wiphy *wiphy,
 	int ret;
 	u32 changed = 0;
 
-	ret = ieee80211_vif_change_bandwidth(sdata, chandef, &changed);
+	ret = ieee80211_link_change_bandwidth(sdata->link[link_id], chandef,
+					      &changed);
 	if (ret == 0)
 		ieee80211_link_info_change_notify(sdata, link_id, changed);
 
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index 5d8b49f20198..4f25660d0eeb 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -15,12 +15,12 @@
 static int ieee80211_chanctx_num_assigned(struct ieee80211_local *local,
 					  struct ieee80211_chanctx *ctx)
 {
-	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_link_data *link;
 	int num = 0;
 
 	lockdep_assert_held(&local->chanctx_mtx);
 
-	list_for_each_entry(sdata, &ctx->assigned_vifs, assigned_chanctx_list)
+	list_for_each_entry(link, &ctx->assigned_links, assigned_chanctx_list)
 		num++;
 
 	return num;
@@ -29,12 +29,12 @@ static int ieee80211_chanctx_num_assigned(struct ieee80211_local *local,
 static int ieee80211_chanctx_num_reserved(struct ieee80211_local *local,
 					  struct ieee80211_chanctx *ctx)
 {
-	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_link_data *link;
 	int num = 0;
 
 	lockdep_assert_held(&local->chanctx_mtx);
 
-	list_for_each_entry(sdata, &ctx->reserved_vifs, reserved_chanctx_list)
+	list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list)
 		num++;
 
 	return num;
@@ -67,12 +67,14 @@ static bool ieee80211_can_create_new_chanctx(struct ieee80211_local *local)
 }
 
 static struct ieee80211_chanctx *
-ieee80211_vif_get_chanctx(struct ieee80211_sub_if_data *sdata)
+ieee80211_vif_get_chanctx(struct ieee80211_sub_if_data *sdata,
+			  unsigned int link_id)
 {
+	struct ieee80211_bss_conf *link_conf = sdata->vif.link_conf[link_id];
 	struct ieee80211_local *local __maybe_unused = sdata->local;
 	struct ieee80211_chanctx_conf *conf;
 
-	conf = rcu_dereference_protected(sdata->vif.bss_conf.chanctx_conf,
+	conf = rcu_dereference_protected(link_conf->chanctx_conf,
 					 lockdep_is_held(&local->chanctx_mtx));
 	if (!conf)
 		return NULL;
@@ -80,21 +82,27 @@ ieee80211_vif_get_chanctx(struct ieee80211_sub_if_data *sdata)
 	return container_of(conf, struct ieee80211_chanctx, conf);
 }
 
+static struct ieee80211_chanctx *
+ieee80211_link_get_chanctx(struct ieee80211_link_data *link)
+{
+	return ieee80211_vif_get_chanctx(link->sdata, link->link_id);
+}
+
 static const struct cfg80211_chan_def *
 ieee80211_chanctx_reserved_chandef(struct ieee80211_local *local,
 				   struct ieee80211_chanctx *ctx,
 				   const struct cfg80211_chan_def *compat)
 {
-	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_link_data *link;
 
 	lockdep_assert_held(&local->chanctx_mtx);
 
-	list_for_each_entry(sdata, &ctx->reserved_vifs,
+	list_for_each_entry(link, &ctx->reserved_links,
 			    reserved_chanctx_list) {
 		if (!compat)
-			compat = &sdata->deflink.reserved_chandef;
+			compat = &link->reserved_chandef;
 
-		compat = cfg80211_chandef_compatible(&sdata->deflink.reserved_chandef,
+		compat = cfg80211_chandef_compatible(&link->reserved_chandef,
 						     compat);
 		if (!compat)
 			break;
@@ -108,20 +116,23 @@ ieee80211_chanctx_non_reserved_chandef(struct ieee80211_local *local,
 				       struct ieee80211_chanctx *ctx,
 				       const struct cfg80211_chan_def *compat)
 {
-	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_link_data *link;
 
 	lockdep_assert_held(&local->chanctx_mtx);
 
-	list_for_each_entry(sdata, &ctx->assigned_vifs,
+	list_for_each_entry(link, &ctx->assigned_links,
 			    assigned_chanctx_list) {
-		if (sdata->deflink.reserved_chanctx != NULL)
+		struct ieee80211_bss_conf *link_conf =
+			link->sdata->vif.link_conf[link->link_id];
+
+		if (link->reserved_chanctx)
 			continue;
 
 		if (!compat)
-			compat = &sdata->vif.bss_conf.chandef;
+			compat = &link_conf->chandef;
 
 		compat = cfg80211_chandef_compatible(
-				&sdata->vif.bss_conf.chandef, compat);
+				&link_conf->chandef, compat);
 		if (!compat)
 			break;
 	}
@@ -157,7 +168,7 @@ ieee80211_chanctx_can_reserve_chandef(struct ieee80211_local *local,
 	if (ieee80211_chanctx_combined_chandef(local, ctx, def))
 		return true;
 
-	if (!list_empty(&ctx->reserved_vifs) &&
+	if (!list_empty(&ctx->reserved_links) &&
 	    ieee80211_chanctx_reserved_chandef(local, ctx, def))
 		return true;
 
@@ -193,13 +204,19 @@ ieee80211_find_reservation_chanctx(struct ieee80211_local *local,
 	return NULL;
 }
 
-static enum nl80211_chan_width ieee80211_get_sta_bw(struct sta_info *sta)
+static enum nl80211_chan_width ieee80211_get_sta_bw(struct sta_info *sta,
+						    unsigned int link_id)
 {
-	enum ieee80211_sta_rx_bandwidth width = ieee80211_sta_cap_rx_bw(sta);
+	enum ieee80211_sta_rx_bandwidth width =
+		ieee80211_sta_cap_rx_bw(sta, link_id);
+
+	/* no effect if this STA has no presence on this link */
+	if (!sta->sta.link[link_id])
+		return NL80211_CHAN_WIDTH_20_NOHT;
 
 	switch (width) {
 	case IEEE80211_STA_RX_BW_20:
-		if (sta->sta.deflink.ht_cap.ht_supported)
+		if (sta->sta.link[link_id]->ht_cap.ht_supported)
 			return NL80211_CHAN_WIDTH_20;
 		else
 			return NL80211_CHAN_WIDTH_20_NOHT;
@@ -227,7 +244,8 @@ static enum nl80211_chan_width ieee80211_get_sta_bw(struct sta_info *sta)
 }
 
 static enum nl80211_chan_width
-ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata)
+ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata,
+			      unsigned int link_id)
 {
 	enum nl80211_chan_width max_bw = NL80211_CHAN_WIDTH_20_NOHT;
 	struct sta_info *sta;
@@ -238,7 +256,7 @@ ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata)
 		    !(sta->sdata->bss && sta->sdata->bss == sdata->bss))
 			continue;
 
-		max_bw = max(max_bw, ieee80211_get_sta_bw(sta));
+		max_bw = max(max_bw, ieee80211_get_sta_bw(sta, link_id));
 	}
 	rcu_read_unlock();
 
@@ -246,27 +264,28 @@ ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata)
 }
 
 static enum nl80211_chan_width
-ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local,
-				      struct ieee80211_chanctx_conf *conf)
+ieee80211_get_chanctx_vif_max_required_bw(struct ieee80211_sub_if_data *sdata,
+					  struct ieee80211_chanctx_conf *conf)
 {
-	struct ieee80211_sub_if_data *sdata;
 	enum nl80211_chan_width max_bw = NL80211_CHAN_WIDTH_20_NOHT;
+	struct ieee80211_vif *vif = &sdata->vif;
+	int link_id;
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
-		struct ieee80211_vif *vif = &sdata->vif;
+	for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) {
 		enum nl80211_chan_width width = NL80211_CHAN_WIDTH_20_NOHT;
+		struct ieee80211_bss_conf *link_conf =
+			sdata->vif.link_conf[link_id];
 
-		if (!ieee80211_sdata_running(sdata))
+		if (!link_conf)
 			continue;
 
-		if (rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) != conf)
+		if (rcu_access_pointer(link_conf->chanctx_conf) != conf)
 			continue;
 
 		switch (vif->type) {
 		case NL80211_IFTYPE_AP:
 		case NL80211_IFTYPE_AP_VLAN:
-			width = ieee80211_get_max_required_bw(sdata);
+			width = ieee80211_get_max_required_bw(sdata, link_id);
 			break;
 		case NL80211_IFTYPE_STATION:
 			/*
@@ -274,8 +293,8 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local,
 			 * point, so take the width from the chandef, but
 			 * account also for TDLS peers
 			 */
-			width = max(vif->bss_conf.chandef.width,
-				    ieee80211_get_max_required_bw(sdata));
+			width = max(link_conf->chandef.width,
+				    ieee80211_get_max_required_bw(sdata, link_id));
 			break;
 		case NL80211_IFTYPE_P2P_DEVICE:
 		case NL80211_IFTYPE_NAN:
@@ -283,7 +302,7 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local,
 		case NL80211_IFTYPE_ADHOC:
 		case NL80211_IFTYPE_MESH_POINT:
 		case NL80211_IFTYPE_OCB:
-			width = vif->bss_conf.chandef.width;
+			width = link_conf->chandef.width;
 			break;
 		case NL80211_IFTYPE_WDS:
 		case NL80211_IFTYPE_UNSPECIFIED:
@@ -293,12 +312,36 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local,
 		case NL80211_IFTYPE_P2P_GO:
 			WARN_ON_ONCE(1);
 		}
+
+		max_bw = max(max_bw, width);
+	}
+
+	return max_bw;
+}
+
+static enum nl80211_chan_width
+ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local,
+				      struct ieee80211_chanctx_conf *conf)
+{
+	struct ieee80211_sub_if_data *sdata;
+	enum nl80211_chan_width max_bw = NL80211_CHAN_WIDTH_20_NOHT;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+		enum nl80211_chan_width width;
+
+		if (!ieee80211_sdata_running(sdata))
+			continue;
+
+		width = ieee80211_get_chanctx_vif_max_required_bw(sdata, conf);
+
 		max_bw = max(max_bw, width);
 	}
 
 	/* use the configured bandwidth in case of monitor interface */
 	sdata = rcu_dereference(local->monitor_sdata);
-	if (sdata && rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) == conf)
+	if (sdata &&
+	    rcu_access_pointer(sdata->vif.link_conf[0]->chanctx_conf) == conf)
 		max_bw = max(max_bw, conf->def.width);
 
 	rcu_read_unlock();
@@ -350,7 +393,7 @@ static u32 _ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local,
 }
 
 /* calling this function is assuming that station vif is updated to
- * lates changes by calling ieee80211_vif_update_chandef
+ * lates changes by calling ieee80211_link_update_chandef
  */
 static void ieee80211_chan_bw_change(struct ieee80211_local *local,
 				     struct ieee80211_chanctx *ctx,
@@ -363,29 +406,38 @@ static void ieee80211_chan_bw_change(struct ieee80211_local *local,
 	rcu_read_lock();
 	list_for_each_entry_rcu(sta, &local->sta_list,
 				list) {
+		struct ieee80211_sub_if_data *sdata = sta->sdata;
 		enum ieee80211_sta_rx_bandwidth new_sta_bw;
+		unsigned int link_id;
 
 		if (!ieee80211_sdata_running(sta->sdata))
 			continue;
 
-		if (rcu_access_pointer(sta->sdata->vif.bss_conf.chanctx_conf) !=
-		    &ctx->conf)
-			continue;
+		for (link_id = 0; link_id < ARRAY_SIZE(sta->sdata->link); link_id++) {
+			struct ieee80211_bss_conf *link_conf =
+				sdata->vif.link_conf[link_id];
+
+			if (!link_conf)
+				continue;
 
-		new_sta_bw = ieee80211_sta_cur_vht_bw(sta);
+			if (rcu_access_pointer(link_conf->chanctx_conf) != &ctx->conf)
+				continue;
 
-		/* nothing change */
-		if (new_sta_bw == sta->sta.deflink.bandwidth)
-			continue;
+			new_sta_bw = ieee80211_sta_cur_vht_bw(sta, link_id);
 
-		/* vif changed to narrow BW and narrow BW for station wasn't
-		 * requested or vise versa */
-		if ((new_sta_bw < sta->sta.deflink.bandwidth) == !narrowed)
-			continue;
+			/* nothing change */
+			if (new_sta_bw == sta->sta.link[link_id]->bandwidth)
+				continue;
+
+			/* vif changed to narrow BW and narrow BW for station wasn't
+			 * requested or vise versa */
+			if ((new_sta_bw < sta->sta.link[link_id]->bandwidth) == !narrowed)
+				continue;
 
-		sta->sta.deflink.bandwidth = new_sta_bw;
-		rate_control_rate_update(local, sband, sta,
-					 IEEE80211_RC_BW_CHANGED);
+			sta->sta.link[link_id]->bandwidth = new_sta_bw;
+			rate_control_rate_update(local, sband, sta, link_id,
+						 IEEE80211_RC_BW_CHANGED);
+		}
 	}
 	rcu_read_unlock();
 }
@@ -508,9 +560,14 @@ bool ieee80211_is_radar_required(struct ieee80211_local *local)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
-		if (sdata->deflink.radar_required) {
-			rcu_read_unlock();
-			return true;
+		unsigned int link_id;
+
+		for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) {
+			if (sdata->link[link_id] &&
+			    sdata->link[link_id]->radar_required) {
+				rcu_read_unlock();
+				return true;
+			}
 		}
 	}
 	rcu_read_unlock();
@@ -531,15 +588,27 @@ ieee80211_chanctx_radar_required(struct ieee80211_local *local,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+		unsigned int link_id;
+
 		if (!ieee80211_sdata_running(sdata))
 			continue;
-		if (rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) != conf)
-			continue;
-		if (!sdata->deflink.radar_required)
-			continue;
+		for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) {
+			struct ieee80211_bss_conf *link_conf =
+				sdata->vif.link_conf[link_id];
 
-		required = true;
-		break;
+			if (!link_conf)
+				continue;
+
+			if (rcu_access_pointer(link_conf->chanctx_conf) != conf)
+				continue;
+			if (!sdata->link[link_id]->radar_required)
+				continue;
+			required = true;
+			break;
+		}
+
+		if (required)
+			break;
 	}
 	rcu_read_unlock();
 
@@ -559,8 +628,8 @@ ieee80211_alloc_chanctx(struct ieee80211_local *local,
 	if (!ctx)
 		return NULL;
 
-	INIT_LIST_HEAD(&ctx->assigned_vifs);
-	INIT_LIST_HEAD(&ctx->reserved_vifs);
+	INIT_LIST_HEAD(&ctx->assigned_links);
+	INIT_LIST_HEAD(&ctx->reserved_links);
 	ctx->conf.def = *chandef;
 	ctx->conf.rx_chains_static = 1;
 	ctx->conf.rx_chains_dynamic = 1;
@@ -686,21 +755,32 @@ void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+		int link_id;
 
 		if (!ieee80211_sdata_running(sdata))
 			continue;
-		if (rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) != conf)
-			continue;
+
 		if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
 			continue;
 
-		if (!compat)
-			compat = &sdata->vif.bss_conf.chandef;
+		for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) {
+			struct ieee80211_bss_conf *link_conf =
+				sdata->vif.link_conf[link_id];
 
-		compat = cfg80211_chandef_compatible(
-				&sdata->vif.bss_conf.chandef, compat);
-		if (WARN_ON_ONCE(!compat))
-			break;
+			if (!link_conf)
+				continue;
+
+			if (rcu_access_pointer(link_conf->chanctx_conf) != conf)
+				continue;
+
+			if (!compat)
+				compat = &link_conf->chandef;
+
+			compat = cfg80211_chandef_compatible(&link_conf->chandef,
+							     compat);
+			if (WARN_ON_ONCE(!compat))
+				break;
+		}
 	}
 
 	/* TDLS peers can sometimes affect the chandef width */
@@ -748,9 +828,11 @@ static void ieee80211_recalc_radar_chanctx(struct ieee80211_local *local,
 	drv_change_chanctx(local, chanctx, IEEE80211_CHANCTX_CHANGE_RADAR);
 }
 
-static int ieee80211_assign_vif_chanctx(struct ieee80211_sub_if_data *sdata,
-					struct ieee80211_chanctx *new_ctx)
+static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link,
+					 struct ieee80211_chanctx *new_ctx)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
+	unsigned int link_id = link->link_id;
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_chanctx_conf *conf;
 	struct ieee80211_chanctx *curr_ctx = NULL;
@@ -759,29 +841,29 @@ static int ieee80211_assign_vif_chanctx(struct ieee80211_sub_if_data *sdata,
 	if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_NAN))
 		return -ENOTSUPP;
 
-	conf = rcu_dereference_protected(sdata->vif.bss_conf.chanctx_conf,
+	conf = rcu_dereference_protected(sdata->vif.link_conf[link_id]->chanctx_conf,
 					 lockdep_is_held(&local->chanctx_mtx));
 
 	if (conf) {
 		curr_ctx = container_of(conf, struct ieee80211_chanctx, conf);
 
-		drv_unassign_vif_chanctx(local, sdata, curr_ctx);
+		drv_unassign_vif_chanctx(local, sdata, link_id, curr_ctx);
 		conf = NULL;
-		list_del(&sdata->assigned_chanctx_list);
+		list_del(&link->assigned_chanctx_list);
 	}
 
 	if (new_ctx) {
-		ret = drv_assign_vif_chanctx(local, sdata, new_ctx);
+		ret = drv_assign_vif_chanctx(local, sdata, link_id, new_ctx);
 		if (ret)
 			goto out;
 
 		conf = &new_ctx->conf;
-		list_add(&sdata->assigned_chanctx_list,
-			 &new_ctx->assigned_vifs);
+		list_add(&link->assigned_chanctx_list,
+			 &new_ctx->assigned_links);
 	}
 
 out:
-	rcu_assign_pointer(sdata->vif.bss_conf.chanctx_conf, conf);
+	rcu_assign_pointer(sdata->vif.link_conf[link_id]->chanctx_conf, conf);
 
 	sdata->vif.cfg.idle = !conf;
 
@@ -820,60 +902,64 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local,
 	rcu_read_lock();
 	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
 		u8 needed_static, needed_dynamic;
+		unsigned int link_id;
 
 		if (!ieee80211_sdata_running(sdata))
 			continue;
 
-		if (rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) !=
-						&chanctx->conf)
-			continue;
-
 		switch (sdata->vif.type) {
-		case NL80211_IFTYPE_P2P_DEVICE:
-		case NL80211_IFTYPE_NAN:
-			continue;
 		case NL80211_IFTYPE_STATION:
 			if (!sdata->u.mgd.associated)
 				continue;
 			break;
-		case NL80211_IFTYPE_AP_VLAN:
-			continue;
 		case NL80211_IFTYPE_AP:
 		case NL80211_IFTYPE_ADHOC:
 		case NL80211_IFTYPE_MESH_POINT:
 		case NL80211_IFTYPE_OCB:
 			break;
 		default:
-			WARN_ON_ONCE(1);
+			continue;
 		}
 
-		switch (sdata->deflink.smps_mode) {
-		default:
-			WARN_ONCE(1, "Invalid SMPS mode %d\n",
-				  sdata->deflink.smps_mode);
-			fallthrough;
-		case IEEE80211_SMPS_OFF:
-			needed_static = sdata->deflink.needed_rx_chains;
-			needed_dynamic = sdata->deflink.needed_rx_chains;
-			break;
-		case IEEE80211_SMPS_DYNAMIC:
-			needed_static = 1;
-			needed_dynamic = sdata->deflink.needed_rx_chains;
-			break;
-		case IEEE80211_SMPS_STATIC:
-			needed_static = 1;
-			needed_dynamic = 1;
-			break;
-		}
+		for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) {
+			struct ieee80211_link_data *link = sdata->link[link_id];
+			struct ieee80211_bss_conf *link_conf =
+				sdata->vif.link_conf[link_id];
+
+			if (!link_conf)
+				continue;
+
+			if (rcu_access_pointer(link_conf->chanctx_conf) != &chanctx->conf)
+				continue;
+
+			switch (link->smps_mode) {
+			default:
+				WARN_ONCE(1, "Invalid SMPS mode %d\n",
+					  link->smps_mode);
+				fallthrough;
+			case IEEE80211_SMPS_OFF:
+				needed_static = link->needed_rx_chains;
+				needed_dynamic = link->needed_rx_chains;
+				break;
+			case IEEE80211_SMPS_DYNAMIC:
+				needed_static = 1;
+				needed_dynamic = link->needed_rx_chains;
+				break;
+			case IEEE80211_SMPS_STATIC:
+				needed_static = 1;
+				needed_dynamic = 1;
+				break;
+			}
 
-		rx_chains_static = max(rx_chains_static, needed_static);
-		rx_chains_dynamic = max(rx_chains_dynamic, needed_dynamic);
+			rx_chains_static = max(rx_chains_static, needed_static);
+			rx_chains_dynamic = max(rx_chains_dynamic, needed_dynamic);
+		}
 	}
 
 	/* Disable SMPS for the monitor interface */
 	sdata = rcu_dereference(local->monitor_sdata);
 	if (sdata &&
-	    rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) == &chanctx->conf)
+	    rcu_access_pointer(sdata->vif.link_conf[0]->chanctx_conf) == &chanctx->conf)
 		rx_chains_dynamic = rx_chains_static = local->rx_chains;
 
 	rcu_read_unlock();
@@ -898,9 +984,12 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local,
 }
 
 static void
-__ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata,
-				      bool clear)
+__ieee80211_link_copy_chanctx_to_vlans(struct ieee80211_link_data *link,
+				       bool clear)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
+	unsigned int link_id = link->link_id;
+	struct ieee80211_bss_conf *link_conf = sdata->vif.link_conf[link_id];
 	struct ieee80211_local *local __maybe_unused = sdata->local;
 	struct ieee80211_sub_if_data *vlan;
 	struct ieee80211_chanctx_conf *conf;
@@ -916,7 +1005,7 @@ __ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata,
 	 * channel context pointer for a while, possibly pointing
 	 * to a channel context that has already been freed.
 	 */
-	conf = rcu_dereference_protected(sdata->vif.bss_conf.chanctx_conf,
+	conf = rcu_dereference_protected(link_conf->chanctx_conf,
 					 lockdep_is_held(&local->chanctx_mtx));
 	WARN_ON(!conf);
 
@@ -924,32 +1013,34 @@ __ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata,
 		conf = NULL;
 
 	list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
-		rcu_assign_pointer(vlan->vif.bss_conf.chanctx_conf, conf);
+		rcu_assign_pointer(vlan->vif.link_conf[link_id]->chanctx_conf,
+				   conf);
 }
 
-void ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata,
-					 bool clear)
+void ieee80211_link_copy_chanctx_to_vlans(struct ieee80211_link_data *link,
+					  bool clear)
 {
-	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_local *local = link->sdata->local;
 
 	mutex_lock(&local->chanctx_mtx);
 
-	__ieee80211_vif_copy_chanctx_to_vlans(sdata, clear);
+	__ieee80211_link_copy_chanctx_to_vlans(link, clear);
 
 	mutex_unlock(&local->chanctx_mtx);
 }
 
-int ieee80211_vif_unreserve_chanctx(struct ieee80211_sub_if_data *sdata)
+int ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link)
 {
-	struct ieee80211_chanctx *ctx = sdata->deflink.reserved_chanctx;
+	struct ieee80211_sub_if_data *sdata = link->sdata;
+	struct ieee80211_chanctx *ctx = link->reserved_chanctx;
 
 	lockdep_assert_held(&sdata->local->chanctx_mtx);
 
 	if (WARN_ON(!ctx))
 		return -EINVAL;
 
-	list_del(&sdata->reserved_chanctx_list);
-	sdata->deflink.reserved_chanctx = NULL;
+	list_del(&link->reserved_chanctx_list);
+	link->reserved_chanctx = NULL;
 
 	if (ieee80211_chanctx_refcount(sdata->local, ctx) == 0) {
 		if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) {
@@ -974,17 +1065,18 @@ int ieee80211_vif_unreserve_chanctx(struct ieee80211_sub_if_data *sdata)
 	return 0;
 }
 
-int ieee80211_vif_reserve_chanctx(struct ieee80211_sub_if_data *sdata,
-				  const struct cfg80211_chan_def *chandef,
-				  enum ieee80211_chanctx_mode mode,
-				  bool radar_required)
+int ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link,
+				   const struct cfg80211_chan_def *chandef,
+				   enum ieee80211_chanctx_mode mode,
+				   bool radar_required)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_chanctx *new_ctx, *curr_ctx, *ctx;
 
 	lockdep_assert_held(&local->chanctx_mtx);
 
-	curr_ctx = ieee80211_vif_get_chanctx(sdata);
+	curr_ctx = ieee80211_link_get_chanctx(link);
 	if (curr_ctx && local->use_chanctx && !local->ops->switch_vif_chanctx)
 		return -ENOTSUPP;
 
@@ -998,11 +1090,11 @@ int ieee80211_vif_reserve_chanctx(struct ieee80211_sub_if_data *sdata,
 			if (!curr_ctx ||
 			    (curr_ctx->replace_state ==
 			     IEEE80211_CHANCTX_WILL_BE_REPLACED) ||
-			    !list_empty(&curr_ctx->reserved_vifs)) {
+			    !list_empty(&curr_ctx->reserved_links)) {
 				/*
-				 * Another vif already requested this context
+				 * Another link already requested this context
 				 * for a reservation. Find another one hoping
-				 * all vifs assigned to it will also switch
+				 * all links assigned to it will also switch
 				 * soon enough.
 				 *
 				 * TODO: This needs a little more work as some
@@ -1011,13 +1103,13 @@ int ieee80211_vif_reserve_chanctx(struct ieee80211_sub_if_data *sdata,
 				 * provided some channel context juggling was
 				 * performed.
 				 *
-				 * Consider ctx1..3, vif1..6, each ctx has 2
-				 * vifs. vif1 and vif2 from ctx1 request new
+				 * Consider ctx1..3, link1..6, each ctx has 2
+				 * links. link1 and link2 from ctx1 request new
 				 * different chandefs starting 2 in-place
 				 * reserations with ctx4 and ctx5 replacing
-				 * ctx1 and ctx2 respectively. Next vif5 and
-				 * vif6 from ctx3 reserve ctx4. If vif3 and
-				 * vif4 remain on ctx2 as they are then this
+				 * ctx1 and ctx2 respectively. Next link5 and
+				 * link6 from ctx3 reserve ctx4. If link3 and
+				 * link4 remain on ctx2 as they are then this
 				 * fails unless `replace_ctx` from ctx5 is
 				 * replaced with ctx3.
 				 */
@@ -1027,7 +1119,7 @@ int ieee80211_vif_reserve_chanctx(struct ieee80211_sub_if_data *sdata,
 					    IEEE80211_CHANCTX_REPLACE_NONE)
 						continue;
 
-					if (!list_empty(&ctx->reserved_vifs))
+					if (!list_empty(&ctx->reserved_links))
 						continue;
 
 					curr_ctx = ctx;
@@ -1042,7 +1134,7 @@ int ieee80211_vif_reserve_chanctx(struct ieee80211_sub_if_data *sdata,
 			if (!curr_ctx ||
 			    (curr_ctx->replace_state ==
 			     IEEE80211_CHANCTX_WILL_BE_REPLACED) ||
-			    !list_empty(&curr_ctx->reserved_vifs))
+			    !list_empty(&curr_ctx->reserved_links))
 				return -EBUSY;
 
 			new_ctx = ieee80211_alloc_chanctx(local, chandef, mode);
@@ -1061,25 +1153,27 @@ int ieee80211_vif_reserve_chanctx(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
-	list_add(&sdata->reserved_chanctx_list, &new_ctx->reserved_vifs);
-	sdata->deflink.reserved_chanctx = new_ctx;
-	sdata->deflink.reserved_chandef = *chandef;
-	sdata->deflink.reserved_radar_required = radar_required;
-	sdata->deflink.reserved_ready = false;
+	list_add(&link->reserved_chanctx_list, &new_ctx->reserved_links);
+	link->reserved_chanctx = new_ctx;
+	link->reserved_chandef = *chandef;
+	link->reserved_radar_required = radar_required;
+	link->reserved_ready = false;
 
 	return 0;
 }
 
 static void
-ieee80211_vif_chanctx_reservation_complete(struct ieee80211_sub_if_data *sdata)
+ieee80211_link_chanctx_reservation_complete(struct ieee80211_link_data *link)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
+
 	switch (sdata->vif.type) {
 	case NL80211_IFTYPE_ADHOC:
 	case NL80211_IFTYPE_AP:
 	case NL80211_IFTYPE_MESH_POINT:
 	case NL80211_IFTYPE_OCB:
 		ieee80211_queue_work(&sdata->local->hw,
-				     &sdata->deflink.csa_finalize_work);
+				     &link->csa_finalize_work);
 		break;
 	case NL80211_IFTYPE_STATION:
 		ieee80211_queue_work(&sdata->local->hw,
@@ -1100,23 +1194,28 @@ ieee80211_vif_chanctx_reservation_complete(struct ieee80211_sub_if_data *sdata)
 }
 
 static void
-ieee80211_vif_update_chandef(struct ieee80211_sub_if_data *sdata,
-			     const struct cfg80211_chan_def *chandef)
+ieee80211_link_update_chandef(struct ieee80211_link_data *link,
+			      const struct cfg80211_chan_def *chandef)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
+	unsigned int link_id = link->link_id;
 	struct ieee80211_sub_if_data *vlan;
 
-	sdata->vif.bss_conf.chandef = *chandef;
+	sdata->vif.link_conf[link_id]->chandef = *chandef;
 
 	if (sdata->vif.type != NL80211_IFTYPE_AP)
 		return;
 
 	list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
-		vlan->vif.bss_conf.chandef = *chandef;
+		vlan->vif.link_conf[link_id]->chandef = *chandef;
 }
 
 static int
-ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata)
+ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
+	unsigned int link_id = link->link_id;
+	struct ieee80211_bss_conf *link_conf = sdata->vif.link_conf[link_id];
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_vif_chanctx_switch vif_chsw[1] = {};
 	struct ieee80211_chanctx *old_ctx, *new_ctx;
@@ -1127,10 +1226,10 @@ ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata)
 	lockdep_assert_held(&local->mtx);
 	lockdep_assert_held(&local->chanctx_mtx);
 
-	new_ctx = sdata->deflink.reserved_chanctx;
-	old_ctx = ieee80211_vif_get_chanctx(sdata);
+	new_ctx = link->reserved_chanctx;
+	old_ctx = ieee80211_link_get_chanctx(link);
 
-	if (WARN_ON(!sdata->deflink.reserved_ready))
+	if (WARN_ON(!link->reserved_ready))
 		return -EBUSY;
 
 	if (WARN_ON(!new_ctx))
@@ -1144,23 +1243,24 @@ ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata)
 		return -EINVAL;
 
 	chandef = ieee80211_chanctx_non_reserved_chandef(local, new_ctx,
-				&sdata->deflink.reserved_chandef);
+				&link->reserved_chandef);
 	if (WARN_ON(!chandef))
 		return -EINVAL;
 
-	if (sdata->vif.bss_conf.chandef.width != sdata->deflink.reserved_chandef.width)
+	if (link_conf->chandef.width != link->reserved_chandef.width)
 		changed = BSS_CHANGED_BANDWIDTH;
 
-	ieee80211_vif_update_chandef(sdata, &sdata->deflink.reserved_chandef);
+	ieee80211_link_update_chandef(link, &link->reserved_chandef);
 
 	ieee80211_change_chanctx(local, new_ctx, old_ctx, chandef);
 
 	vif_chsw[0].vif = &sdata->vif;
 	vif_chsw[0].old_ctx = &old_ctx->conf;
 	vif_chsw[0].new_ctx = &new_ctx->conf;
+	vif_chsw[0].link_id = link->link_id;
 
-	list_del(&sdata->reserved_chanctx_list);
-	sdata->deflink.reserved_chanctx = NULL;
+	list_del(&link->reserved_chanctx_list);
+	link->reserved_chanctx = NULL;
 
 	err = drv_switch_vif_chanctx(local, vif_chsw, 1,
 				     CHANCTX_SWMODE_REASSIGN_VIF);
@@ -1171,11 +1271,11 @@ ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata)
 		goto out;
 	}
 
-	list_move(&sdata->assigned_chanctx_list, &new_ctx->assigned_vifs);
-	rcu_assign_pointer(sdata->vif.bss_conf.chanctx_conf, &new_ctx->conf);
+	list_move(&link->assigned_chanctx_list, &new_ctx->assigned_links);
+	rcu_assign_pointer(link_conf->chanctx_conf, &new_ctx->conf);
 
 	if (sdata->vif.type == NL80211_IFTYPE_AP)
-		__ieee80211_vif_copy_chanctx_to_vlans(sdata, false);
+		__ieee80211_link_copy_chanctx_to_vlans(link, false);
 
 	ieee80211_check_fast_xmit_iface(sdata);
 
@@ -1187,25 +1287,27 @@ ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata)
 	ieee80211_recalc_radar_chanctx(local, new_ctx);
 
 	if (changed)
-		ieee80211_link_info_change_notify(sdata, 0, changed);
+		ieee80211_link_info_change_notify(sdata, link_id, changed);
 
 out:
-	ieee80211_vif_chanctx_reservation_complete(sdata);
+	ieee80211_link_chanctx_reservation_complete(link);
 	return err;
 }
 
 static int
-ieee80211_vif_use_reserved_assign(struct ieee80211_sub_if_data *sdata)
+ieee80211_link_use_reserved_assign(struct ieee80211_link_data *link)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
+	unsigned int link_id = link->link_id;
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_chanctx *old_ctx, *new_ctx;
 	const struct cfg80211_chan_def *chandef;
 	int err;
 
-	old_ctx = ieee80211_vif_get_chanctx(sdata);
-	new_ctx = sdata->deflink.reserved_chanctx;
+	old_ctx = ieee80211_vif_get_chanctx(sdata, link_id);
+	new_ctx = link->reserved_chanctx;
 
-	if (WARN_ON(!sdata->deflink.reserved_ready))
+	if (WARN_ON(!link->reserved_ready))
 		return -EINVAL;
 
 	if (WARN_ON(old_ctx))
@@ -1219,16 +1321,16 @@ ieee80211_vif_use_reserved_assign(struct ieee80211_sub_if_data *sdata)
 		return -EINVAL;
 
 	chandef = ieee80211_chanctx_non_reserved_chandef(local, new_ctx,
-				&sdata->deflink.reserved_chandef);
+				&link->reserved_chandef);
 	if (WARN_ON(!chandef))
 		return -EINVAL;
 
 	ieee80211_change_chanctx(local, new_ctx, new_ctx, chandef);
 
-	list_del(&sdata->reserved_chanctx_list);
-	sdata->deflink.reserved_chanctx = NULL;
+	list_del(&link->reserved_chanctx_list);
+	link->reserved_chanctx = NULL;
 
-	err = ieee80211_assign_vif_chanctx(sdata, new_ctx);
+	err = ieee80211_assign_link_chanctx(link, new_ctx);
 	if (err) {
 		if (ieee80211_chanctx_refcount(local, new_ctx) == 0)
 			ieee80211_free_chanctx(local, new_ctx);
@@ -1237,19 +1339,20 @@ ieee80211_vif_use_reserved_assign(struct ieee80211_sub_if_data *sdata)
 	}
 
 out:
-	ieee80211_vif_chanctx_reservation_complete(sdata);
+	ieee80211_link_chanctx_reservation_complete(link);
 	return err;
 }
 
 static bool
-ieee80211_vif_has_in_place_reservation(struct ieee80211_sub_if_data *sdata)
+ieee80211_link_has_in_place_reservation(struct ieee80211_link_data *link)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
 	struct ieee80211_chanctx *old_ctx, *new_ctx;
 
 	lockdep_assert_held(&sdata->local->chanctx_mtx);
 
-	new_ctx = sdata->deflink.reserved_chanctx;
-	old_ctx = ieee80211_vif_get_chanctx(sdata);
+	new_ctx = link->reserved_chanctx;
+	old_ctx = ieee80211_link_get_chanctx(link);
 
 	if (!old_ctx)
 		return false;
@@ -1289,7 +1392,7 @@ static int ieee80211_chsw_switch_vifs(struct ieee80211_local *local,
 				      int n_vifs)
 {
 	struct ieee80211_vif_chanctx_switch *vif_chsw;
-	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_link_data *link;
 	struct ieee80211_chanctx *ctx, *old_ctx;
 	int i, err;
 
@@ -1310,16 +1413,16 @@ static int ieee80211_chsw_switch_vifs(struct ieee80211_local *local,
 			goto out;
 		}
 
-		list_for_each_entry(sdata, &ctx->reserved_vifs,
+		list_for_each_entry(link, &ctx->reserved_links,
 				    reserved_chanctx_list) {
-			if (!ieee80211_vif_has_in_place_reservation(
-					sdata))
+			if (!ieee80211_link_has_in_place_reservation(link))
 				continue;
 
-			old_ctx = ieee80211_vif_get_chanctx(sdata);
-			vif_chsw[i].vif = &sdata->vif;
+			old_ctx = ieee80211_link_get_chanctx(link);
+			vif_chsw[i].vif = &link->sdata->vif;
 			vif_chsw[i].old_ctx = &old_ctx->conf;
 			vif_chsw[i].new_ctx = &ctx->conf;
+			vif_chsw[i].link_id = link->link_id;
 
 			i++;
 		}
@@ -1345,7 +1448,7 @@ static int ieee80211_chsw_switch_ctxs(struct ieee80211_local *local)
 		if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
 			continue;
 
-		if (!list_empty(&ctx->replace_ctx->assigned_vifs))
+		if (!list_empty(&ctx->replace_ctx->assigned_links))
 			continue;
 
 		ieee80211_del_chanctx(local, ctx->replace_ctx);
@@ -1362,7 +1465,7 @@ err:
 		if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
 			continue;
 
-		if (!list_empty(&ctx->replace_ctx->assigned_vifs))
+		if (!list_empty(&ctx->replace_ctx->assigned_links))
 			continue;
 
 		ieee80211_del_chanctx(local, ctx);
@@ -1374,7 +1477,6 @@ err:
 
 static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
 {
-	struct ieee80211_sub_if_data *sdata, *sdata_tmp;
 	struct ieee80211_chanctx *ctx, *ctx_tmp, *old_ctx;
 	struct ieee80211_chanctx *new_ctx = NULL;
 	int err, n_assigned, n_reserved, n_ready;
@@ -1400,6 +1502,8 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
 	 */
 
 	list_for_each_entry(ctx, &local->chanctx_list, list) {
+		struct ieee80211_link_data *link;
+
 		if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
 			continue;
 
@@ -1417,12 +1521,12 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
 		n_reserved = 0;
 		n_ready = 0;
 
-		list_for_each_entry(sdata, &ctx->replace_ctx->assigned_vifs,
+		list_for_each_entry(link, &ctx->replace_ctx->assigned_links,
 				    assigned_chanctx_list) {
 			n_assigned++;
-			if (sdata->deflink.reserved_chanctx) {
+			if (link->reserved_chanctx) {
 				n_reserved++;
-				if (sdata->deflink.reserved_ready)
+				if (link->reserved_ready)
 					n_ready++;
 			}
 		}
@@ -1439,13 +1543,13 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
 		}
 
 		ctx->conf.radar_enabled = false;
-		list_for_each_entry(sdata, &ctx->reserved_vifs,
+		list_for_each_entry(link, &ctx->reserved_links,
 				    reserved_chanctx_list) {
-			if (ieee80211_vif_has_in_place_reservation(sdata) &&
-			    !sdata->deflink.reserved_ready)
+			if (ieee80211_link_has_in_place_reservation(link) &&
+			    !link->reserved_ready)
 				return -EAGAIN;
 
-			old_ctx = ieee80211_vif_get_chanctx(sdata);
+			old_ctx = ieee80211_link_get_chanctx(link);
 			if (old_ctx) {
 				if (old_ctx->replace_state ==
 				    IEEE80211_CHANCTX_WILL_BE_REPLACED)
@@ -1456,7 +1560,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
 				n_vifs_ctxless++;
 			}
 
-			if (sdata->deflink.reserved_radar_required)
+			if (link->reserved_radar_required)
 				ctx->conf.radar_enabled = true;
 		}
 	}
@@ -1499,6 +1603,8 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
 	 * context(s).
 	 */
 	list_for_each_entry(ctx, &local->chanctx_list, list) {
+		struct ieee80211_link_data *link, *link_tmp;
+
 		if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
 			continue;
 
@@ -1507,32 +1613,34 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
 			goto err;
 		}
 
-		list_for_each_entry(sdata, &ctx->reserved_vifs,
+		list_for_each_entry(link, &ctx->reserved_links,
 				    reserved_chanctx_list) {
+			struct ieee80211_sub_if_data *sdata = link->sdata;
+			struct ieee80211_bss_conf *link_conf =
+				sdata->vif.link_conf[link->link_id];
 			u32 changed = 0;
 
-			if (!ieee80211_vif_has_in_place_reservation(sdata))
+			if (!ieee80211_link_has_in_place_reservation(link))
 				continue;
 
-			rcu_assign_pointer(sdata->vif.bss_conf.chanctx_conf,
+			rcu_assign_pointer(link_conf->chanctx_conf,
 					   &ctx->conf);
 
 			if (sdata->vif.type == NL80211_IFTYPE_AP)
-				__ieee80211_vif_copy_chanctx_to_vlans(sdata,
-								      false);
+				__ieee80211_link_copy_chanctx_to_vlans(link,
+								       false);
 
 			ieee80211_check_fast_xmit_iface(sdata);
 
-			sdata->deflink.radar_required = sdata->deflink.reserved_radar_required;
+			link->radar_required = link->reserved_radar_required;
 
-			if (sdata->vif.bss_conf.chandef.width !=
-			    sdata->deflink.reserved_chandef.width)
+			if (link_conf->chandef.width != link->reserved_chandef.width)
 				changed = BSS_CHANGED_BANDWIDTH;
 
-			ieee80211_vif_update_chandef(sdata,
-						     &sdata->deflink.reserved_chandef);
+			ieee80211_link_update_chandef(link, &link->reserved_chandef);
 			if (changed)
-				ieee80211_link_info_change_notify(sdata, 0,
+				ieee80211_link_info_change_notify(sdata,
+								  link->link_id,
 								  changed);
 
 			ieee80211_recalc_txpower(sdata, false);
@@ -1543,17 +1651,17 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
 		ieee80211_recalc_radar_chanctx(local, ctx);
 		ieee80211_recalc_chanctx_min_def(local, ctx);
 
-		list_for_each_entry_safe(sdata, sdata_tmp, &ctx->reserved_vifs,
+		list_for_each_entry_safe(link, link_tmp, &ctx->reserved_links,
 					 reserved_chanctx_list) {
-			if (ieee80211_vif_get_chanctx(sdata) != ctx)
+			if (ieee80211_link_get_chanctx(link) != ctx)
 				continue;
 
-			list_del(&sdata->reserved_chanctx_list);
-			list_move(&sdata->assigned_chanctx_list,
-				  &ctx->assigned_vifs);
-			sdata->deflink.reserved_chanctx = NULL;
+			list_del(&link->reserved_chanctx_list);
+			list_move(&link->assigned_chanctx_list,
+				  &ctx->assigned_links);
+			link->reserved_chanctx = NULL;
 
-			ieee80211_vif_chanctx_reservation_complete(sdata);
+			ieee80211_link_chanctx_reservation_complete(link);
 		}
 
 		/*
@@ -1563,31 +1671,29 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
 		 * reservation for originally requested interface has already
 		 * succeeded at this point.
 		 */
-		list_for_each_entry_safe(sdata, sdata_tmp, &ctx->reserved_vifs,
+		list_for_each_entry_safe(link, link_tmp, &ctx->reserved_links,
 					 reserved_chanctx_list) {
-			if (WARN_ON(ieee80211_vif_has_in_place_reservation(
-					sdata)))
+			if (WARN_ON(ieee80211_link_has_in_place_reservation(link)))
 				continue;
 
-			if (WARN_ON(sdata->deflink.reserved_chanctx != ctx))
+			if (WARN_ON(link->reserved_chanctx != ctx))
 				continue;
 
-			if (!sdata->deflink.reserved_ready)
+			if (!link->reserved_ready)
 				continue;
 
-			if (ieee80211_vif_get_chanctx(sdata))
-				err = ieee80211_vif_use_reserved_reassign(
-						sdata);
+			if (ieee80211_link_get_chanctx(link))
+				err = ieee80211_link_use_reserved_reassign(link);
 			else
-				err = ieee80211_vif_use_reserved_assign(sdata);
+				err = ieee80211_link_use_reserved_assign(link);
 
 			if (err) {
-				sdata_info(sdata,
-					   "failed to finalize (re-)assign reservation (err=%d)\n",
-					   err);
-				ieee80211_vif_unreserve_chanctx(sdata);
+				link_info(link,
+					  "failed to finalize (re-)assign reservation (err=%d)\n",
+					  err);
+				ieee80211_link_unreserve_chanctx(link);
 				cfg80211_stop_iface(local->hw.wiphy,
-						    &sdata->wdev,
+						    &link->sdata->wdev,
 						    GFP_KERNEL);
 			}
 		}
@@ -1613,21 +1719,26 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
 
 err:
 	list_for_each_entry(ctx, &local->chanctx_list, list) {
+		struct ieee80211_link_data *link, *link_tmp;
+
 		if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
 			continue;
 
-		list_for_each_entry_safe(sdata, sdata_tmp, &ctx->reserved_vifs,
+		list_for_each_entry_safe(link, link_tmp, &ctx->reserved_links,
 					 reserved_chanctx_list) {
-			ieee80211_vif_unreserve_chanctx(sdata);
-			ieee80211_vif_chanctx_reservation_complete(sdata);
+			ieee80211_link_unreserve_chanctx(link);
+			ieee80211_link_chanctx_reservation_complete(link);
 		}
 	}
 
 	return err;
 }
 
-static void __ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata)
+static void __ieee80211_link_release_channel(struct ieee80211_link_data *link)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
+	unsigned int link_id = link->link_id;
+	struct ieee80211_bss_conf *link_conf = sdata->vif.link_conf[link_id];
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_chanctx_conf *conf;
 	struct ieee80211_chanctx *ctx;
@@ -1635,38 +1746,38 @@ static void __ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata)
 
 	lockdep_assert_held(&local->chanctx_mtx);
 
-	conf = rcu_dereference_protected(sdata->vif.bss_conf.chanctx_conf,
+	conf = rcu_dereference_protected(link_conf->chanctx_conf,
 					 lockdep_is_held(&local->chanctx_mtx));
 	if (!conf)
 		return;
 
 	ctx = container_of(conf, struct ieee80211_chanctx, conf);
 
-	if (sdata->deflink.reserved_chanctx) {
-		if (sdata->deflink.reserved_chanctx->replace_state ==
-		    IEEE80211_CHANCTX_REPLACES_OTHER &&
-		    ieee80211_chanctx_num_reserved(local,
-						   sdata->deflink.reserved_chanctx) > 1)
+	if (link->reserved_chanctx) {
+		if (link->reserved_chanctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER &&
+		    ieee80211_chanctx_num_reserved(local, link->reserved_chanctx) > 1)
 			use_reserved_switch = true;
 
-		ieee80211_vif_unreserve_chanctx(sdata);
+		ieee80211_link_unreserve_chanctx(link);
 	}
 
-	ieee80211_assign_vif_chanctx(sdata, NULL);
+	ieee80211_assign_link_chanctx(link, NULL);
 	if (ieee80211_chanctx_refcount(local, ctx) == 0)
 		ieee80211_free_chanctx(local, ctx);
 
-	sdata->deflink.radar_required = false;
+	link->radar_required = false;
 
 	/* Unreserving may ready an in-place reservation. */
 	if (use_reserved_switch)
 		ieee80211_vif_use_reserved_switch(local);
 }
 
-int ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata,
-			      const struct cfg80211_chan_def *chandef,
-			      enum ieee80211_chanctx_mode mode)
+int ieee80211_link_use_channel(struct ieee80211_link_data *link,
+			       const struct cfg80211_chan_def *chandef,
+			       enum ieee80211_chanctx_mode mode)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
+	unsigned int link_id = link->link_id;
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_chanctx *ctx;
 	u8 radar_detect_width = 0;
@@ -1686,14 +1797,14 @@ int ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata,
 	if (ret > 0)
 		radar_detect_width = BIT(chandef->width);
 
-	sdata->deflink.radar_required = ret;
+	sdata->link[link_id]->radar_required = ret;
 
 	ret = ieee80211_check_combinations(sdata, chandef, mode,
 					   radar_detect_width);
 	if (ret < 0)
 		goto out;
 
-	__ieee80211_vif_release_channel(sdata);
+	__ieee80211_link_release_channel(link);
 
 	ctx = ieee80211_find_chanctx(local, chandef, mode);
 	if (!ctx)
@@ -1703,9 +1814,9 @@ int ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata,
 		goto out;
 	}
 
-	ieee80211_vif_update_chandef(sdata, chandef);
+	ieee80211_link_update_chandef(link, chandef);
 
-	ret = ieee80211_assign_vif_chanctx(sdata, ctx);
+	ret = ieee80211_assign_link_chanctx(link, ctx);
 	if (ret) {
 		/* if assign fails refcount stays the same */
 		if (ieee80211_chanctx_refcount(local, ctx) == 0)
@@ -1717,14 +1828,15 @@ int ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata,
 	ieee80211_recalc_radar_chanctx(local, ctx);
  out:
 	if (ret)
-		sdata->deflink.radar_required = false;
+		link->radar_required = false;
 
 	mutex_unlock(&local->chanctx_mtx);
 	return ret;
 }
 
-int ieee80211_vif_use_reserved_context(struct ieee80211_sub_if_data *sdata)
+int ieee80211_link_use_reserved_context(struct ieee80211_link_data *link)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_chanctx *new_ctx;
 	struct ieee80211_chanctx *old_ctx;
@@ -1733,8 +1845,8 @@ int ieee80211_vif_use_reserved_context(struct ieee80211_sub_if_data *sdata)
 	lockdep_assert_held(&local->mtx);
 	lockdep_assert_held(&local->chanctx_mtx);
 
-	new_ctx = sdata->deflink.reserved_chanctx;
-	old_ctx = ieee80211_vif_get_chanctx(sdata);
+	new_ctx = link->reserved_chanctx;
+	old_ctx = ieee80211_link_get_chanctx(link);
 
 	if (WARN_ON(!new_ctx))
 		return -EINVAL;
@@ -1743,16 +1855,16 @@ int ieee80211_vif_use_reserved_context(struct ieee80211_sub_if_data *sdata)
 		    IEEE80211_CHANCTX_WILL_BE_REPLACED))
 		return -EINVAL;
 
-	if (WARN_ON(sdata->deflink.reserved_ready))
+	if (WARN_ON(link->reserved_ready))
 		return -EINVAL;
 
-	sdata->deflink.reserved_ready = true;
+	link->reserved_ready = true;
 
 	if (new_ctx->replace_state == IEEE80211_CHANCTX_REPLACE_NONE) {
 		if (old_ctx)
-			return ieee80211_vif_use_reserved_reassign(sdata);
+			return ieee80211_link_use_reserved_reassign(link);
 
-		return ieee80211_vif_use_reserved_assign(sdata);
+		return ieee80211_link_use_reserved_assign(link);
 	}
 
 	/*
@@ -1784,10 +1896,13 @@ int ieee80211_vif_use_reserved_context(struct ieee80211_sub_if_data *sdata)
 	return 0;
 }
 
-int ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata,
-				   const struct cfg80211_chan_def *chandef,
-				   u32 *changed)
+int ieee80211_link_change_bandwidth(struct ieee80211_link_data *link,
+				    const struct cfg80211_chan_def *chandef,
+				    u32 *changed)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
+	unsigned int link_id = link->link_id;
+	struct ieee80211_bss_conf *link_conf = sdata->vif.link_conf[link_id];
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_chanctx_conf *conf;
 	struct ieee80211_chanctx *ctx;
@@ -1799,18 +1914,18 @@ int ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata,
 		return -EINVAL;
 
 	mutex_lock(&local->chanctx_mtx);
-	if (cfg80211_chandef_identical(chandef, &sdata->vif.bss_conf.chandef)) {
+	if (cfg80211_chandef_identical(chandef, &link_conf->chandef)) {
 		ret = 0;
 		goto out;
 	}
 
 	if (chandef->width == NL80211_CHAN_WIDTH_20_NOHT ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT) {
+	    link_conf->chandef.width == NL80211_CHAN_WIDTH_20_NOHT) {
 		ret = -EINVAL;
 		goto out;
 	}
 
-	conf = rcu_dereference_protected(sdata->vif.bss_conf.chanctx_conf,
+	conf = rcu_dereference_protected(link_conf->chanctx_conf,
 					 lockdep_is_held(&local->chanctx_mtx));
 	if (!conf) {
 		ret = -EINVAL;
@@ -1845,7 +1960,7 @@ int ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata,
 		break;
 	}
 
-	ieee80211_vif_update_chandef(sdata, chandef);
+	ieee80211_link_update_chandef(link, chandef);
 
 	ieee80211_recalc_chanctx_chantype(local, ctx);
 
@@ -1856,19 +1971,24 @@ int ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata,
 	return ret;
 }
 
-void ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata)
+void ieee80211_link_release_channel(struct ieee80211_link_data *link)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
+
 	WARN_ON(sdata->dev && netif_carrier_ok(sdata->dev));
 
 	lockdep_assert_held(&sdata->local->mtx);
 
 	mutex_lock(&sdata->local->chanctx_mtx);
-	__ieee80211_vif_release_channel(sdata);
+	__ieee80211_link_release_channel(link);
 	mutex_unlock(&sdata->local->chanctx_mtx);
 }
 
-void ieee80211_vif_vlan_copy_chanctx(struct ieee80211_sub_if_data *sdata)
+void ieee80211_link_vlan_copy_chanctx(struct ieee80211_link_data *link)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
+	unsigned int link_id = link->link_id;
+	struct ieee80211_bss_conf *link_conf = sdata->vif.link_conf[link_id];
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_sub_if_data *ap;
 	struct ieee80211_chanctx_conf *conf;
@@ -1880,9 +2000,9 @@ void ieee80211_vif_vlan_copy_chanctx(struct ieee80211_sub_if_data *sdata)
 
 	mutex_lock(&local->chanctx_mtx);
 
-	conf = rcu_dereference_protected(ap->vif.bss_conf.chanctx_conf,
+	conf = rcu_dereference_protected(ap->vif.link_conf[link_id]->chanctx_conf,
 					 lockdep_is_held(&local->chanctx_mtx));
-	rcu_assign_pointer(sdata->vif.bss_conf.chanctx_conf, conf);
+	rcu_assign_pointer(link_conf->chanctx_conf, conf);
 	mutex_unlock(&local->chanctx_mtx);
 }
 
diff --git a/net/mac80211/debug.h b/net/mac80211/debug.h
index d90a8f9cc3fd..3302e8da0314 100644
--- a/net/mac80211/debug.h
+++ b/net/mac80211/debug.h
@@ -1,4 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Portions
+ * Copyright (C) 2022 Intel Corporation
+ */
 #ifndef __MAC80211_DEBUG_H
 #define __MAC80211_DEBUG_H
 #include <net/cfg80211.h>
@@ -130,6 +134,16 @@ do {									\
 #define sdata_dbg(sdata, fmt, ...)					\
 	_sdata_dbg(1, sdata, fmt, ##__VA_ARGS__)
 
+#define link_info(link, fmt, ...)					\
+	_sdata_info((link)->sdata, "[link %d] " fmt, (link)->link_id,	\
+		    ##__VA_ARGS__)
+#define link_err(link, fmt, ...)					\
+	_sdata_err((link)->sdata, "[link %d] " fmt, (link)->link_id,	\
+		   ##__VA_ARGS__)
+#define link_dbg(link, fmt, ...)					\
+	_sdata_dbg(1, (link)->sdata, "[link %d] " fmt, (link)->link_id,	\
+		   ##__VA_ARGS__)
+
 #define ht_dbg(sdata, fmt, ...)						\
 	_sdata_dbg(MAC80211_HT_DEBUG,					\
 		   sdata, fmt, ##__VA_ARGS__)
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 15ab8d00815b..9238283a8237 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -938,6 +938,7 @@ static inline void drv_change_chanctx(struct ieee80211_local *local,
 
 static inline int drv_assign_vif_chanctx(struct ieee80211_local *local,
 					 struct ieee80211_sub_if_data *sdata,
+					 unsigned int link_id,
 					 struct ieee80211_chanctx *ctx)
 {
 	int ret = 0;
@@ -945,11 +946,12 @@ static inline int drv_assign_vif_chanctx(struct ieee80211_local *local,
 	if (!check_sdata_in_driver(sdata))
 		return -EIO;
 
-	trace_drv_assign_vif_chanctx(local, sdata, ctx);
+	trace_drv_assign_vif_chanctx(local, sdata, link_id, ctx);
 	if (local->ops->assign_vif_chanctx) {
 		WARN_ON_ONCE(!ctx->driver_present);
 		ret = local->ops->assign_vif_chanctx(&local->hw,
 						     &sdata->vif,
+						     link_id,
 						     &ctx->conf);
 	}
 	trace_drv_return_int(local, ret);
@@ -959,6 +961,7 @@ static inline int drv_assign_vif_chanctx(struct ieee80211_local *local,
 
 static inline void drv_unassign_vif_chanctx(struct ieee80211_local *local,
 					    struct ieee80211_sub_if_data *sdata,
+					    unsigned int link_id,
 					    struct ieee80211_chanctx *ctx)
 {
 	might_sleep();
@@ -966,11 +969,12 @@ static inline void drv_unassign_vif_chanctx(struct ieee80211_local *local,
 	if (!check_sdata_in_driver(sdata))
 		return;
 
-	trace_drv_unassign_vif_chanctx(local, sdata, ctx);
+	trace_drv_unassign_vif_chanctx(local, sdata, link_id, ctx);
 	if (local->ops->unassign_vif_chanctx) {
 		WARN_ON_ONCE(!ctx->driver_present);
 		local->ops->unassign_vif_chanctx(&local->hw,
 						 &sdata->vif,
+						 link_id,
 						 &ctx->conf);
 	}
 	trace_drv_return_void(local);
diff --git a/net/mac80211/eht.c b/net/mac80211/eht.c
index 96c9486bf2fe..2d9c6e845ce4 100644
--- a/net/mac80211/eht.c
+++ b/net/mac80211/eht.c
@@ -71,6 +71,6 @@ ieee80211_eht_cap_ie_to_sta_eht_cap(struct ieee80211_sub_if_data *sdata,
 
 	eht_cap->has_eht = true;
 
-	sta->deflink.cur_max_bandwidth = ieee80211_sta_cap_rx_bw(sta);
-	sta->sta.deflink.bandwidth = ieee80211_sta_cur_vht_bw(sta);
+	sta->deflink.cur_max_bandwidth = ieee80211_sta_cap_rx_bw(sta, 0);
+	sta->sta.deflink.bandwidth = ieee80211_sta_cur_vht_bw(sta, 0);
 }
diff --git a/net/mac80211/he.c b/net/mac80211/he.c
index 1a61f7552edd..20448dda8c4d 100644
--- a/net/mac80211/he.c
+++ b/net/mac80211/he.c
@@ -3,7 +3,7 @@
  * HE handling
  *
  * Copyright(c) 2017 Intel Deutschland GmbH
- * Copyright(c) 2019 - 2020 Intel Corporation
+ * Copyright(c) 2019 - 2022 Intel Corporation
  */
 
 #include "ieee80211_i.h"
@@ -153,8 +153,8 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata,
 
 	he_cap->has_he = true;
 
-	sta->deflink.cur_max_bandwidth = ieee80211_sta_cap_rx_bw(sta);
-	sta->sta.deflink.bandwidth = ieee80211_sta_cur_vht_bw(sta);
+	sta->deflink.cur_max_bandwidth = ieee80211_sta_cap_rx_bw(sta, 0);
+	sta->sta.deflink.bandwidth = ieee80211_sta_cur_vht_bw(sta, 0);
 
 	if (sband->band == NL80211_BAND_6GHZ && he_6ghz_capa)
 		ieee80211_update_from_he_6ghz_capa(he_6ghz_capa, sta);
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 3b68e9f4345b..2ca75f2d3023 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -301,8 +301,8 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	radar_required = err;
 
 	mutex_lock(&local->mtx);
-	if (ieee80211_vif_use_channel(sdata, &chandef,
-				      ifibss->fixed_channel ?
+	if (ieee80211_link_use_channel(sdata->link[0], &chandef,
+				       ifibss->fixed_channel ?
 					IEEE80211_CHANCTX_SHARED :
 					IEEE80211_CHANCTX_EXCLUSIVE)) {
 		sdata_info(sdata, "Failed to join IBSS, no channel context\n");
@@ -371,7 +371,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 		RCU_INIT_POINTER(ifibss->presp, NULL);
 		kfree_rcu(presp, rcu_head);
 		mutex_lock(&local->mtx);
-		ieee80211_vif_release_channel(sdata);
+		ieee80211_link_release_channel(sdata->link[0]);
 		mutex_unlock(&local->mtx);
 		sdata_info(sdata, "Failed to join IBSS, driver failure: %d\n",
 			   err);
@@ -725,7 +725,7 @@ static void ieee80211_ibss_disconnect(struct ieee80211_sub_if_data *sdata)
 						BSS_CHANGED_IBSS);
 	drv_leave_ibss(local, sdata);
 	mutex_lock(&local->mtx);
-	ieee80211_vif_release_channel(sdata);
+	ieee80211_link_release_channel(sdata->link[0]);
 	mutex_unlock(&local->mtx);
 }
 
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 397b111f006d..92ed1e3c2980 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -787,8 +787,8 @@ struct ieee80211_chanctx {
 	struct list_head list;
 	struct rcu_head rcu_head;
 
-	struct list_head assigned_vifs;
-	struct list_head reserved_vifs;
+	struct list_head assigned_links;
+	struct list_head reserved_links;
 
 	enum ieee80211_chanctx_replace_state replace_state;
 	struct ieee80211_chanctx *replace_ctx;
@@ -909,6 +909,12 @@ struct ieee80211_link_data_ap {
 };
 
 struct ieee80211_link_data {
+	struct ieee80211_sub_if_data *sdata;
+	unsigned int link_id;
+
+	struct list_head assigned_chanctx_list; /* protected by chanctx_mtx */
+	struct list_head reserved_chanctx_list; /* protected by chanctx_mtx */
+
 	/* multicast keys only */
 	struct ieee80211_key __rcu *gtk[NUM_DEFAULT_KEYS +
 					NUM_DEFAULT_MGMT_KEYS +
@@ -989,9 +995,6 @@ struct ieee80211_sub_if_data {
 	struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS];
 	struct mac80211_qos_map __rcu *qos_map;
 
-	struct list_head assigned_chanctx_list; /* protected by chanctx_mtx */
-	struct list_head reserved_chanctx_list; /* protected by chanctx_mtx */
-
 	/* used to reconfigure hardware SM PS */
 	struct work_struct recalc_smps;
 
@@ -2119,8 +2122,10 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
 				    struct ieee80211_supported_band *sband,
 				    const struct ieee80211_vht_cap *vht_cap_ie,
 				    struct sta_info *sta);
-enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta);
-enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta);
+enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta,
+							unsigned int link_id);
+enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta,
+							 unsigned int link_id);
 void ieee80211_sta_set_rx_nss(struct sta_info *sta);
 enum ieee80211_sta_rx_bandwidth
 ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width);
@@ -2470,26 +2475,26 @@ bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper,
 u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c);
 
 int __must_check
-ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata,
-			  const struct cfg80211_chan_def *chandef,
-			  enum ieee80211_chanctx_mode mode);
+ieee80211_link_use_channel(struct ieee80211_link_data *link,
+			   const struct cfg80211_chan_def *chandef,
+			   enum ieee80211_chanctx_mode mode);
 int __must_check
-ieee80211_vif_reserve_chanctx(struct ieee80211_sub_if_data *sdata,
-			      const struct cfg80211_chan_def *chandef,
-			      enum ieee80211_chanctx_mode mode,
-			      bool radar_required);
+ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link,
+			       const struct cfg80211_chan_def *chandef,
+			       enum ieee80211_chanctx_mode mode,
+			       bool radar_required);
 int __must_check
-ieee80211_vif_use_reserved_context(struct ieee80211_sub_if_data *sdata);
-int ieee80211_vif_unreserve_chanctx(struct ieee80211_sub_if_data *sdata);
+ieee80211_link_use_reserved_context(struct ieee80211_link_data *link);
+int ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link);
 
 int __must_check
-ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata,
-			       const struct cfg80211_chan_def *chandef,
-			       u32 *changed);
-void ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata);
-void ieee80211_vif_vlan_copy_chanctx(struct ieee80211_sub_if_data *sdata);
-void ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata,
-					 bool clear);
+ieee80211_link_change_bandwidth(struct ieee80211_link_data *link,
+				const struct cfg80211_chan_def *chandef,
+				u32 *changed);
+void ieee80211_link_release_channel(struct ieee80211_link_data *link);
+void ieee80211_link_vlan_copy_chanctx(struct ieee80211_link_data *link);
+void ieee80211_link_copy_chanctx_to_vlans(struct ieee80211_link_data *link,
+					  bool clear);
 int ieee80211_chanctx_refcount(struct ieee80211_local *local,
 			       struct ieee80211_chanctx *ctx);
 
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 04ee525394e9..1764068a18d1 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -471,7 +471,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do
 		chandef = sdata->vif.bss_conf.chandef;
 		WARN_ON(local->suspended);
 		mutex_lock(&local->mtx);
-		ieee80211_vif_release_channel(sdata);
+		ieee80211_link_release_channel(sdata->link[0]);
 		mutex_unlock(&local->mtx);
 		cfg80211_cac_event(sdata->dev, &chandef,
 				   NL80211_RADAR_CAC_ABORTED,
@@ -1027,6 +1027,7 @@ static void ieee80211_sdata_init(struct ieee80211_local *local,
 	 */
 	sdata->vif.link_conf[0] = &sdata->vif.bss_conf;
 	sdata->link[0] = &sdata->deflink;
+	sdata->deflink.sdata = sdata;
 }
 
 int ieee80211_add_virtual_monitor(struct ieee80211_local *local)
@@ -1077,8 +1078,8 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local)
 	mutex_unlock(&local->iflist_mtx);
 
 	mutex_lock(&local->mtx);
-	ret = ieee80211_vif_use_channel(sdata, &local->monitor_chandef,
-					IEEE80211_CHANCTX_EXCLUSIVE);
+	ret = ieee80211_link_use_channel(sdata->link[0], &local->monitor_chandef,
+					 IEEE80211_CHANCTX_EXCLUSIVE);
 	mutex_unlock(&local->mtx);
 	if (ret) {
 		mutex_lock(&local->iflist_mtx);
@@ -1122,7 +1123,7 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local)
 	synchronize_net();
 
 	mutex_lock(&local->mtx);
-	ieee80211_vif_release_channel(sdata);
+	ieee80211_link_release_channel(sdata->link[0]);
 	mutex_unlock(&local->mtx);
 
 	drv_remove_interface(local, sdata);
@@ -1228,7 +1229,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
 	case NL80211_IFTYPE_AP_VLAN:
 		/* no need to tell driver, but set carrier and chanctx */
 		if (sdata->bss->active) {
-			ieee80211_vif_vlan_copy_chanctx(sdata);
+			ieee80211_link_vlan_copy_chanctx(sdata->link[0]);
 			netif_carrier_on(dev);
 			ieee80211_set_vif_encap_ops(sdata);
 		} else {
@@ -1681,8 +1682,8 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
 		  ieee80211_csa_finalize_work);
 	INIT_WORK(&sdata->deflink.color_change_finalize_work,
 		  ieee80211_color_change_finalize_work);
-	INIT_LIST_HEAD(&sdata->assigned_chanctx_list);
-	INIT_LIST_HEAD(&sdata->reserved_chanctx_list);
+	INIT_LIST_HEAD(&sdata->deflink.assigned_chanctx_list);
+	INIT_LIST_HEAD(&sdata->deflink.reserved_chanctx_list);
 
 	switch (type) {
 	case NL80211_IFTYPE_P2P_GO:
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 42ba7424589e..a6cb1db50f1d 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2008, 2009 open80211s Ltd.
- * Copyright (C) 2019, 2021 Intel Corporation
+ * Copyright (C) 2019, 2021-2022 Intel Corporation
  * Author:     Luis Carlos Cobo <luisca@cozybit.com>
  */
 #include <linux/gfp.h>
@@ -464,7 +464,7 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata,
 	if (!test_sta_flag(sta, WLAN_STA_RATE_CONTROL))
 		rate_control_rate_init(sta);
 	else
-		rate_control_rate_update(local, sband, sta, changed);
+		rate_control_rate_update(local, sband, sta, 0, changed);
 out:
 	spin_unlock_bh(&sta->mesh->plink_lock);
 }
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index e172bdfe9b0a..ca9e0d83e2a4 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -479,7 +479,7 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata,
 		return -EINVAL;
 	}
 
-	ret = ieee80211_vif_change_bandwidth(sdata, &chandef, changed);
+	ret = ieee80211_link_change_bandwidth(&sdata->deflink, &chandef, changed);
 
 	if (ret) {
 		sdata_info(sdata,
@@ -1248,7 +1248,7 @@ static void ieee80211_chswitch_work(struct work_struct *work)
 		if (sdata->deflink.reserved_ready)
 			goto out;
 
-		ret = ieee80211_vif_use_reserved_context(sdata);
+		ret = ieee80211_link_use_reserved_context(&sdata->deflink);
 		if (ret) {
 			sdata_info(sdata,
 				   "failed to use reserved channel context, disconnecting (err=%d)\n",
@@ -1354,7 +1354,7 @@ ieee80211_sta_abort_chanswitch(struct ieee80211_sub_if_data *sdata)
 	mutex_lock(&local->mtx);
 
 	mutex_lock(&local->chanctx_mtx);
-	ieee80211_vif_unreserve_chanctx(sdata);
+	ieee80211_link_unreserve_chanctx(&sdata->deflink);
 	mutex_unlock(&local->chanctx_mtx);
 
 	if (sdata->deflink.csa_block_tx)
@@ -1496,8 +1496,8 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
 		goto drop_connection;
 	}
 
-	res = ieee80211_vif_reserve_chanctx(sdata, &csa_ie.chandef,
-					    chanctx->mode, false);
+	res = ieee80211_link_reserve_chanctx(&sdata->deflink, &csa_ie.chandef,
+					     chanctx->mode, false);
 	if (res) {
 		sdata_info(sdata,
 			   "failed to reserve channel context for channel switch, disconnecting (err=%d)\n",
@@ -1942,7 +1942,7 @@ void ieee80211_dfs_cac_timer_work(struct work_struct *work)
 
 	mutex_lock(&sdata->local->mtx);
 	if (sdata->wdev.cac_started) {
-		ieee80211_vif_release_channel(sdata);
+		ieee80211_link_release_channel(&sdata->deflink);
 		cfg80211_cac_event(sdata->dev, &chandef,
 				   NL80211_RADAR_CAC_FINISHED,
 				   GFP_KERNEL);
@@ -2483,7 +2483,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 
 	ifmgd->flags = 0;
 	mutex_lock(&local->mtx);
-	ieee80211_vif_release_channel(sdata);
+	ieee80211_link_release_channel(&sdata->deflink);
 
 	sdata->vif.bss_conf.csa_active = false;
 	sdata->deflink.u.mgd.csa_waiting_bcn = false;
@@ -2923,7 +2923,7 @@ static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata,
 		ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_BSSID);
 		sdata->u.mgd.flags = 0;
 		mutex_lock(&sdata->local->mtx);
-		ieee80211_vif_release_channel(sdata);
+		ieee80211_link_release_channel(&sdata->deflink);
 		mutex_unlock(&sdata->local->mtx);
 	}
 
@@ -2954,7 +2954,7 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata,
 		sdata->vif.bss_conf.mu_mimo_owner = false;
 
 		mutex_lock(&sdata->local->mtx);
-		ieee80211_vif_release_channel(sdata);
+		ieee80211_link_release_channel(&sdata->deflink);
 		mutex_unlock(&sdata->local->mtx);
 
 		if (abandon)
@@ -5497,8 +5497,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
 	 * on incompatible channels, e.g. 80+80 and 160 sharing the
 	 * same control channel) try to use a smaller bandwidth.
 	 */
-	ret = ieee80211_vif_use_channel(sdata, &chandef,
-					IEEE80211_CHANCTX_SHARED);
+	ret = ieee80211_link_use_channel(&sdata->deflink, &chandef,
+					 IEEE80211_CHANCTX_SHARED);
 
 	/* don't downgrade for 5 and 10 MHz channels, though. */
 	if (chandef.width == NL80211_CHAN_WIDTH_5 ||
@@ -5507,8 +5507,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
 
 	while (ret && chandef.width != NL80211_CHAN_WIDTH_20_NOHT) {
 		ifmgd->flags |= ieee80211_chandef_downgrade(&chandef);
-		ret = ieee80211_vif_use_channel(sdata, &chandef,
-						IEEE80211_CHANCTX_SHARED);
+		ret = ieee80211_link_use_channel(&sdata->deflink, &chandef,
+						 IEEE80211_CHANCTX_SHARED);
 	}
  out:
 	mutex_unlock(&local->mtx);
@@ -5868,7 +5868,7 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
 	ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_BSSID);
 	ifmgd->auth_data = NULL;
 	mutex_lock(&sdata->local->mtx);
-	ieee80211_vif_release_channel(sdata);
+	ieee80211_link_release_channel(&sdata->deflink);
 	mutex_unlock(&sdata->local->mtx);
 	kfree(auth_data);
 	return err;
diff --git a/net/mac80211/ocb.c b/net/mac80211/ocb.c
index ab2658ad73ce..468c741a9aeb 100644
--- a/net/mac80211/ocb.c
+++ b/net/mac80211/ocb.c
@@ -186,8 +186,8 @@ int ieee80211_ocb_join(struct ieee80211_sub_if_data *sdata,
 	sdata->deflink.needed_rx_chains = sdata->local->rx_chains;
 
 	mutex_lock(&sdata->local->mtx);
-	err = ieee80211_vif_use_channel(sdata, &setup->chandef,
-					IEEE80211_CHANCTX_SHARED);
+	err = ieee80211_link_use_channel(sdata->link[0], &setup->chandef,
+					 IEEE80211_CHANCTX_SHARED);
 	mutex_unlock(&sdata->local->mtx);
 	if (err)
 		return err;
@@ -229,7 +229,7 @@ int ieee80211_ocb_leave(struct ieee80211_sub_if_data *sdata)
 	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_OCB);
 
 	mutex_lock(&sdata->local->mtx);
-	ieee80211_vif_release_channel(sdata);
+	ieee80211_link_release_channel(sdata->link[0]);
 	mutex_unlock(&sdata->local->mtx);
 
 	skb_queue_purge(&sdata->skb_queue);
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index f22381127948..b268088585eb 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -90,14 +90,17 @@ void rate_control_tx_status(struct ieee80211_local *local,
 }
 
 void rate_control_rate_update(struct ieee80211_local *local,
-				    struct ieee80211_supported_band *sband,
-				    struct sta_info *sta, u32 changed)
+			      struct ieee80211_supported_band *sband,
+			      struct sta_info *sta, unsigned int link_id,
+			      u32 changed)
 {
 	struct rate_control_ref *ref = local->rate_ctrl;
 	struct ieee80211_sta *ista = &sta->sta;
 	void *priv_sta = sta->rate_ctrl_priv;
 	struct ieee80211_chanctx_conf *chanctx_conf;
 
+	WARN_ON(link_id != 0);
+
 	if (ref && ref->ops->rate_update) {
 		rcu_read_lock();
 
@@ -113,6 +116,7 @@ void rate_control_rate_update(struct ieee80211_local *local,
 		spin_unlock_bh(&sta->rate_ctrl_lock);
 		rcu_read_unlock();
 	}
+
 	drv_sta_rc_update(local, sta->sdata, &sta->sta, changed);
 }
 
diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h
index 79b44d3db171..fbc8bdb54c43 100644
--- a/net/mac80211/rate.h
+++ b/net/mac80211/rate.h
@@ -3,6 +3,7 @@
  * Copyright 2002-2005, Instant802 Networks, Inc.
  * Copyright 2005, Devicescape Software, Inc.
  * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
+ * Copyright (C) 2022 Intel Corporation
  */
 
 #ifndef IEEE80211_RATE_H
@@ -31,8 +32,10 @@ void rate_control_tx_status(struct ieee80211_local *local,
 
 void rate_control_rate_init(struct sta_info *sta);
 void rate_control_rate_update(struct ieee80211_local *local,
-				    struct ieee80211_supported_band *sband,
-				    struct sta_info *sta, u32 changed);
+			      struct ieee80211_supported_band *sband,
+			      struct sta_info *sta,
+			      unsigned int link_id,
+			      u32 changed);
 
 static inline void *rate_control_alloc_sta(struct rate_control_ref *ref,
 					   struct sta_info *sta, gfp_t gfp)
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 21e7424f261a..1774d0f9feaa 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3369,7 +3369,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 
 			sband = rx->local->hw.wiphy->bands[status->band];
 
-			rate_control_rate_update(local, sband, rx->sta,
+			rate_control_rate_update(local, sband, rx->sta, 0,
 						 IEEE80211_RC_SMPS_CHANGED);
 			cfg80211_sta_opmode_change_notify(sdata->dev,
 							  rx->sta->addr,
@@ -3391,11 +3391,11 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 			if (chanwidth == IEEE80211_HT_CHANWIDTH_20MHZ)
 				max_bw = IEEE80211_STA_RX_BW_20;
 			else
-				max_bw = ieee80211_sta_cap_rx_bw(rx->sta);
+				max_bw = ieee80211_sta_cap_rx_bw(rx->sta, 0);
 
 			/* set cur_max_bandwidth and recalc sta bw */
 			rx->sta->deflink.cur_max_bandwidth = max_bw;
-			new_bw = ieee80211_sta_cur_vht_bw(rx->sta);
+			new_bw = ieee80211_sta_cur_vht_bw(rx->sta, 0);
 
 			if (rx->sta->sta.deflink.bandwidth == new_bw)
 				goto handled;
@@ -3406,7 +3406,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 				ieee80211_sta_rx_bw_to_chan_width(rx->sta);
 			sta_opmode.changed = STA_OPMODE_MAX_BW_CHANGED;
 
-			rate_control_rate_update(local, sband, rx->sta,
+			rate_control_rate_update(local, sband, rx->sta, 0,
 						 IEEE80211_RC_BW_CHANGED);
 			cfg80211_sta_opmode_change_notify(sdata->dev,
 							  rx->sta->addr,
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index 11a3b950b490..86a13ef31ef1 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -1268,10 +1268,10 @@ static void iee80211_tdls_recalc_chanctx(struct ieee80211_sub_if_data *sdata,
 			enum ieee80211_sta_rx_bandwidth bw;
 
 			bw = ieee80211_chan_width_to_rx_bw(conf->def.width);
-			bw = min(bw, ieee80211_sta_cap_rx_bw(sta));
+			bw = min(bw, ieee80211_sta_cap_rx_bw(sta, 0));
 			if (bw != sta->sta.deflink.bandwidth) {
 				sta->sta.deflink.bandwidth = bw;
-				rate_control_rate_update(local, sband, sta,
+				rate_control_rate_update(local, sband, sta, 0,
 							 IEEE80211_RC_BW_CHANGED);
 				/*
 				 * if a TDLS peer BW was updated, we need to
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 9417378b4fc0..2f325eaf4576 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -1625,6 +1625,7 @@ struct trace_chandef_entry {
 
 struct trace_switch_entry {
 	struct trace_vif_entry vif;
+	unsigned int link_id;
 	struct trace_chandef_entry old_chandef;
 	struct trace_chandef_entry new_chandef;
 } __packed;
@@ -1664,6 +1665,7 @@ TRACE_EVENT(drv_switch_vif_chanctx,
 
 				SWITCH_ENTRY_ASSIGN(vif.vif_type, vif->type);
 				SWITCH_ENTRY_ASSIGN(vif.p2p, vif->p2p);
+				SWITCH_ENTRY_ASSIGN(link_id, link_id);
 				strncpy(local_vifs[i].vif.vif_name,
 					sdata->name,
 					sizeof(local_vifs[i].vif.vif_name));
@@ -1704,40 +1706,45 @@ TRACE_EVENT(drv_switch_vif_chanctx,
 DECLARE_EVENT_CLASS(local_sdata_chanctx,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
+		 unsigned int link_id,
 		 struct ieee80211_chanctx *ctx),
 
-	TP_ARGS(local, sdata, ctx),
+	TP_ARGS(local, sdata, link_id, ctx),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
 		VIF_ENTRY
 		CHANCTX_ENTRY
+		__field(unsigned int, link_id)
 	),
 
 	TP_fast_assign(
 		LOCAL_ASSIGN;
 		VIF_ASSIGN;
 		CHANCTX_ASSIGN;
+		__entry->link_id = link_id;
 	),
 
 	TP_printk(
-		LOCAL_PR_FMT VIF_PR_FMT CHANCTX_PR_FMT,
-		LOCAL_PR_ARG, VIF_PR_ARG, CHANCTX_PR_ARG
+		LOCAL_PR_FMT VIF_PR_FMT " link_id:%d" CHANCTX_PR_FMT,
+		LOCAL_PR_ARG, VIF_PR_ARG, __entry->link_id, CHANCTX_PR_ARG
 	)
 );
 
 DEFINE_EVENT(local_sdata_chanctx, drv_assign_vif_chanctx,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
+		 unsigned int link_id,
 		 struct ieee80211_chanctx *ctx),
-	TP_ARGS(local, sdata, ctx)
+	TP_ARGS(local, sdata, link_id, ctx)
 );
 
 DEFINE_EVENT(local_sdata_chanctx, drv_unassign_vif_chanctx,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
+		 unsigned int link_id,
 		 struct ieee80211_chanctx *ctx),
-	TP_ARGS(local, sdata, ctx)
+	TP_ARGS(local, sdata, link_id, ctx)
 );
 
 TRACE_EVENT(drv_start_ap,
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 2a279dc3e457..4ec96170a679 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -2255,7 +2255,8 @@ static void ieee80211_handle_reconfig_failure(struct ieee80211_local *local)
 }
 
 static void ieee80211_assign_chanctx(struct ieee80211_local *local,
-				     struct ieee80211_sub_if_data *sdata)
+				     struct ieee80211_sub_if_data *sdata,
+				     unsigned int link_id)
 {
 	struct ieee80211_chanctx_conf *conf;
 	struct ieee80211_chanctx *ctx;
@@ -2264,11 +2265,11 @@ static void ieee80211_assign_chanctx(struct ieee80211_local *local,
 		return;
 
 	mutex_lock(&local->chanctx_mtx);
-	conf = rcu_dereference_protected(sdata->vif.bss_conf.chanctx_conf,
+	conf = rcu_dereference_protected(sdata->vif.link_conf[link_id]->chanctx_conf,
 					 lockdep_is_held(&local->chanctx_mtx));
 	if (conf) {
 		ctx = container_of(conf, struct ieee80211_chanctx, conf);
-		drv_assign_vif_chanctx(local, sdata, ctx);
+		drv_assign_vif_chanctx(local, sdata, link_id, ctx);
 	}
 	mutex_unlock(&local->chanctx_mtx);
 }
@@ -2474,7 +2475,7 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 		sdata = wiphy_dereference(local->hw.wiphy,
 					  local->monitor_sdata);
 		if (sdata && ieee80211_sdata_running(sdata))
-			ieee80211_assign_chanctx(local, sdata);
+			ieee80211_assign_chanctx(local, sdata, 0);
 	}
 
 	/* reconfigure hardware */
@@ -2484,12 +2485,16 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 
 	/* Finally also reconfigure all the BSS information */
 	list_for_each_entry(sdata, &local->interfaces, list) {
+		unsigned int link;
 		u32 changed;
 
 		if (!ieee80211_sdata_running(sdata))
 			continue;
 
-		ieee80211_assign_chanctx(local, sdata);
+		for (link = 0; link < ARRAY_SIZE(sdata->vif.link_conf); link++) {
+			if (sdata->vif.link_conf[link])
+				ieee80211_assign_chanctx(local, sdata, link);
+		}
 
 		switch (sdata->vif.type) {
 		case NL80211_IFTYPE_AP_VLAN:
@@ -3975,7 +3980,7 @@ void ieee80211_dfs_cac_cancel(struct ieee80211_local *local)
 
 		if (sdata->wdev.cac_started) {
 			chandef = sdata->vif.bss_conf.chandef;
-			ieee80211_vif_release_channel(sdata);
+			ieee80211_link_release_channel(sdata->link[0]);
 			cfg80211_cac_event(sdata->dev,
 					   &chandef,
 					   NL80211_RADAR_CAC_ABORTED,
@@ -4401,7 +4406,7 @@ void ieee80211_recalc_dtim(struct ieee80211_local *local,
 static u8 ieee80211_chanctx_radar_detect(struct ieee80211_local *local,
 					 struct ieee80211_chanctx *ctx)
 {
-	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_link_data *link;
 	u8 radar_detect = 0;
 
 	lockdep_assert_held(&local->chanctx_mtx);
@@ -4409,20 +4414,26 @@ static u8 ieee80211_chanctx_radar_detect(struct ieee80211_local *local,
 	if (WARN_ON(ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED))
 		return 0;
 
-	list_for_each_entry(sdata, &ctx->reserved_vifs, reserved_chanctx_list)
-		if (sdata->deflink.reserved_radar_required)
-			radar_detect |= BIT(sdata->deflink.reserved_chandef.width);
+	list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list)
+		if (link->reserved_radar_required)
+			radar_detect |= BIT(link->reserved_chandef.width);
 
 	/*
 	 * An in-place reservation context should not have any assigned vifs
 	 * until it replaces the other context.
 	 */
 	WARN_ON(ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER &&
-		!list_empty(&ctx->assigned_vifs));
+		!list_empty(&ctx->assigned_links));
+
+	list_for_each_entry(link, &ctx->assigned_links, assigned_chanctx_list) {
+		struct ieee80211_sub_if_data *sdata = link->sdata;
 
-	list_for_each_entry(sdata, &ctx->assigned_vifs, assigned_chanctx_list)
-		if (sdata->deflink.radar_required)
-			radar_detect |= BIT(sdata->vif.bss_conf.chandef.width);
+		if (!link->radar_required)
+			continue;
+
+		radar_detect |=
+			BIT(sdata->vif.link_conf[link->link_id]->chandef.width);
+	}
 
 	return radar_detect;
 }
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index 7daca8352deb..27d260be9123 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -313,7 +313,7 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
 			sta->deflink.cur_max_bandwidth = IEEE80211_STA_RX_BW_160;
 	}
 
-	sta->sta.deflink.bandwidth = ieee80211_sta_cur_vht_bw(sta);
+	sta->sta.deflink.bandwidth = ieee80211_sta_cur_vht_bw(sta, 0);
 
 	switch (vht_cap->cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK) {
 	case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454:
@@ -330,19 +330,21 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
 }
 
 /* FIXME: move this to some better location - parses HE/EHT now */
-enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta)
+enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta,
+							unsigned int link_id)
 {
-	struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.deflink.vht_cap;
-	struct ieee80211_sta_he_cap *he_cap = &sta->sta.deflink.he_cap;
-	struct ieee80211_sta_eht_cap *eht_cap = &sta->sta.deflink.eht_cap;
+	struct ieee80211_bss_conf *link_conf = sta->sdata->vif.link_conf[link_id];
+	struct ieee80211_link_sta *link_sta = sta->sta.link[link_id];
+	struct ieee80211_sta_vht_cap *vht_cap = &link_sta->vht_cap;
+	struct ieee80211_sta_he_cap *he_cap = &link_sta->he_cap;
+	struct ieee80211_sta_eht_cap *eht_cap = &link_sta->eht_cap;
 	u32 cap_width;
 
 	if (he_cap->has_he) {
 		u8 info;
 
 		if (eht_cap->has_eht &&
-		    sta->sdata->vif.bss_conf.chandef.chan->band ==
-		    NL80211_BAND_6GHZ) {
+		    link_conf->chandef.chan->band == NL80211_BAND_6GHZ) {
 			info = eht_cap->eht_cap_elem.phy_cap_info[0];
 
 			if (info & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ)
@@ -351,8 +353,7 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta)
 
 		info = he_cap->he_cap_elem.phy_cap_info[0];
 
-		if (sta->sdata->vif.bss_conf.chandef.chan->band ==
-				NL80211_BAND_2GHZ) {
+		if (link_conf->chandef.chan->band == NL80211_BAND_2GHZ) {
 			if (info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G)
 				return IEEE80211_STA_RX_BW_40;
 			else
@@ -369,7 +370,7 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta)
 	}
 
 	if (!vht_cap->vht_supported)
-		return sta->sta.deflink.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ?
+		return link_sta->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ?
 				IEEE80211_STA_RX_BW_40 :
 				IEEE80211_STA_RX_BW_20;
 
@@ -466,14 +467,15 @@ ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width)
 }
 
 /* FIXME: rename/move - this deals with everything not just VHT */
-enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta)
+enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta,
+							 unsigned int link_id)
 {
-	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	struct ieee80211_bss_conf *link_conf = sta->sdata->vif.link_conf[link_id];
+	enum nl80211_chan_width bss_width = link_conf->chandef.width;
 	enum ieee80211_sta_rx_bandwidth bw;
-	enum nl80211_chan_width bss_width = sdata->vif.bss_conf.chandef.width;
 
-	bw = ieee80211_sta_cap_rx_bw(sta);
-	bw = min(bw, sta->deflink.cur_max_bandwidth);
+	bw = ieee80211_sta_cap_rx_bw(sta, link_id);
+	bw = min(bw, sta->link[link_id]->cur_max_bandwidth);
 
 	/* Don't consider AP's bandwidth for TDLS peers, section 11.23.1 of
 	 * IEEE80211-2016 specification makes higher bandwidth operation
@@ -629,7 +631,7 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
 		break;
 	}
 
-	new_bw = ieee80211_sta_cur_vht_bw(sta);
+	new_bw = ieee80211_sta_cur_vht_bw(sta, 0);
 	if (new_bw != sta->sta.deflink.bandwidth) {
 		sta->sta.deflink.bandwidth = new_bw;
 		sta_opmode.bw = ieee80211_sta_rx_bw_to_chan_width(sta);
@@ -692,7 +694,7 @@ void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
 
 	if (changed > 0) {
 		ieee80211_recalc_min_chandef(sdata);
-		rate_control_rate_update(local, sband, sta, changed);
+		rate_control_rate_update(local, sband, sta, 0, changed);
 	}
 }
 
-- 
cgit 


From ec7a04073d3b31c03a9ad00e0709ffd78b5cfd1b Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 30 May 2022 22:36:30 +0200
Subject: wifi: mac80211: use IEEE80211_MLD_MAX_NUM_LINKS

Remove MAX_STA_LINKS and use IEEE80211_MLD_MAX_NUM_LINKS
instead to unify between the station and other data structures.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h  | 4 +---
 net/mac80211/sta_info.h | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 4dc51c52dd28..902a10ee50f2 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2061,8 +2061,6 @@ struct ieee80211_sta_txpwr {
 	enum nl80211_tx_power_setting type;
 };
 
-#define MAX_STA_LINKS			15
-
 /**
  * struct ieee80211_link_sta - station Link specific info
  * All link specific info for a STA link for a non MLD STA(single)
@@ -2190,7 +2188,7 @@ struct ieee80211_sta {
 
 	bool multi_link_sta;
 	struct ieee80211_link_sta deflink;
-	struct ieee80211_link_sta *link[MAX_STA_LINKS];
+	struct ieee80211_link_sta *link[IEEE80211_MLD_MAX_NUM_LINKS];
 
 	/* must be last */
 	u8 drv_priv[] __aligned(sizeof(void *));
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 134a7dad0ac4..2ff3d5fd0cbf 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -710,7 +710,7 @@ struct sta_info {
 
 	bool multi_link_sta;
 	struct link_sta_info deflink;
-	struct link_sta_info *link[MAX_STA_LINKS];
+	struct link_sta_info *link[IEEE80211_MLD_MAX_NUM_LINKS];
 
 	/* keep last! */
 	struct ieee80211_sta sta;
-- 
cgit 


From afe0d181905ed03c2379fcb81d423704f59f8788 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 30 May 2022 23:22:19 +0200
Subject: wifi: mac80211: add link_id to vht.c code for MLO

Update the code in vht.c and add the link_id parameter where
necessary.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c |   2 +-
 include/net/mac80211.h                            |   3 +-
 net/mac80211/cfg.c                                |   5 +-
 net/mac80211/ibss.c                               |   2 +-
 net/mac80211/ieee80211_i.h                        |  18 ++--
 net/mac80211/iface.c                              |   6 +-
 net/mac80211/mesh_plink.c                         |   2 +-
 net/mac80211/mlme.c                               |   6 +-
 net/mac80211/rate.c                               |   2 +-
 net/mac80211/rx.c                                 |   2 +-
 net/mac80211/tdls.c                               |   2 +-
 net/mac80211/vht.c                                | 122 ++++++++++++----------
 12 files changed, 92 insertions(+), 80 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
index 5d3cedc146be..2539dc5aaa3f 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
@@ -1776,7 +1776,7 @@ static void iwl_mvm_mu_mimo_iface_iterator(void *_data, u8 *mac,
 		 * the data received from firmware as if it came from the
 		 * action frame, so no conversion is needed.
 		 */
-		ieee80211_update_mu_groups(vif,
+		ieee80211_update_mu_groups(vif, 0,
 					   (u8 *)&notif->membership_status,
 					   (u8 *)&notif->user_position);
 	}
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 902a10ee50f2..0a26cb2871d6 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -6548,6 +6548,7 @@ ieee80211_vif_type_p2p(struct ieee80211_vif *vif)
  * ieee80211_update_mu_groups - set the VHT MU-MIMO groud data
  *
  * @vif: the specified virtual interface
+ * @link_id: the link ID for MLO, otherwise 0
  * @membership: 64 bits array - a bit is set if station is member of the group
  * @position: 2 bits per group id indicating the position in the group
  *
@@ -6556,7 +6557,7 @@ ieee80211_vif_type_p2p(struct ieee80211_vif *vif)
  * matching GroupId management frame.
  * Calls to this function need to be serialized with RX path.
  */
-void ieee80211_update_mu_groups(struct ieee80211_vif *vif,
+void ieee80211_update_mu_groups(struct ieee80211_vif *vif, unsigned int link_id,
 				const u8 *membership, const u8 *position);
 
 void ieee80211_enable_rssi_reports(struct ieee80211_vif *vif,
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 884d2cfcec1e..53aac8f277e1 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1763,7 +1763,7 @@ static int sta_apply_parameters(struct ieee80211_local *local,
 	/* VHT can override some HT caps such as the A-MSDU max length */
 	if (params->vht_capa)
 		ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
-						    params->vht_capa, sta);
+						    params->vht_capa, sta, 0);
 
 	if (params->he_capa)
 		ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband,
@@ -1784,7 +1784,8 @@ static int sta_apply_parameters(struct ieee80211_local *local,
 		/* returned value is only needed for rc update, but the
 		 * rc isn't initialized here yet, so ignore it
 		 */
-		__ieee80211_vht_handle_opmode(sdata, sta, params->opmode_notif,
+		__ieee80211_vht_handle_opmode(sdata, sta, 0,
+					      params->opmode_notif,
 					      sband->band);
 	}
 
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 2ca75f2d3023..87815a3624f3 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -1068,7 +1068,7 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata,
 						   &chandef);
 			memcpy(&cap_ie, elems->vht_cap_elem, sizeof(cap_ie));
 			ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
-							    &cap_ie, sta);
+							    &cap_ie, sta, 0);
 			if (memcmp(&cap, &sta->sta.deflink.vht_cap, sizeof(cap)))
 				rates_updated |= true;
 		}
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 92ed1e3c2980..5be96b0bbd96 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -2121,29 +2121,31 @@ void
 ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
 				    struct ieee80211_supported_band *sband,
 				    const struct ieee80211_vht_cap *vht_cap_ie,
-				    struct sta_info *sta);
+				    struct sta_info *sta, unsigned int link_id);
 enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta,
 							unsigned int link_id);
 enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta,
 							 unsigned int link_id);
-void ieee80211_sta_set_rx_nss(struct sta_info *sta);
+void ieee80211_sta_set_rx_nss(struct sta_info *sta, unsigned int link_id);
 enum ieee80211_sta_rx_bandwidth
 ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width);
-enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta);
+enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta,
+						  unsigned int link_id);
 void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata,
+				 unsigned int link_id,
 				 struct ieee80211_mgmt *mgmt);
 u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
-                                  struct sta_info *sta, u8 opmode,
-				  enum nl80211_band band);
+				  struct sta_info *sta, unsigned int link_id,
+				  u8 opmode, enum nl80211_band band);
 void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
-				 struct sta_info *sta, u8 opmode,
-				 enum nl80211_band band);
+				 struct sta_info *sta, unsigned int link_id,
+				 u8 opmode, enum nl80211_band band);
 void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata,
 				      struct ieee80211_sta_vht_cap *vht_cap);
 void ieee80211_get_vht_mask_from_cap(__le16 vht_cap,
 				     u16 vht_mask[NL80211_VHT_NSS_MAX]);
 enum nl80211_chan_width
-ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta);
+ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta, unsigned int link_id);
 
 /* HE */
 void
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index fa684d76a169..c5be60ee12d8 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1496,14 +1496,14 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local,
 			sta = sta_info_get_bss(sdata, mgmt->sa);
 
 			if (sta)
-				ieee80211_vht_handle_opmode(sdata, sta, opmode,
-							    band);
+				ieee80211_vht_handle_opmode(sdata, sta, 0,
+							    opmode, band);
 
 			mutex_unlock(&local->sta_mtx);
 			break;
 		}
 		case WLAN_VHT_ACTION_GROUPID_MGMT:
-			ieee80211_process_mu_groups(sdata, mgmt);
+			ieee80211_process_mu_groups(sdata, 0, mgmt);
 			break;
 		default:
 			WARN_ON(1);
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index a6cb1db50f1d..fb7e8a9600ca 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -442,7 +442,7 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata,
 		changed |= IEEE80211_RC_BW_CHANGED;
 
 	ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
-					    elems->vht_cap_elem, sta);
+					    elems->vht_cap_elem, sta, 0);
 
 	ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, elems->he_cap,
 					  elems->he_cap_len,
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index ca9e0d83e2a4..643ef0ae786e 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -3570,7 +3570,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
 
 	if (elems->vht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))
 		ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
-						    elems->vht_cap_elem, sta);
+						    elems->vht_cap_elem,
+						    sta, 0);
 
 	if (elems->he_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) &&
 	    elems->he_cap) {
@@ -4377,7 +4378,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 	}
 
 	if (sta && elems->opmode_notif)
-		ieee80211_vht_handle_opmode(sdata, sta, *elems->opmode_notif,
+		ieee80211_vht_handle_opmode(sdata, sta, 0,
+					    *elems->opmode_notif,
 					    rx_status->band);
 	mutex_unlock(&local->sta_mtx);
 
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index b268088585eb..402e898b75c3 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -37,7 +37,7 @@ void rate_control_rate_init(struct sta_info *sta)
 	struct ieee80211_supported_band *sband;
 	struct ieee80211_chanctx_conf *chanctx_conf;
 
-	ieee80211_sta_set_rx_nss(sta);
+	ieee80211_sta_set_rx_nss(sta, 0);
 
 	if (!ref)
 		return;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 1774d0f9feaa..e6b16ebbdb48 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3403,7 +3403,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 			rx->sta->sta.deflink.bandwidth = new_bw;
 			sband = rx->local->hw.wiphy->bands[status->band];
 			sta_opmode.bw =
-				ieee80211_sta_rx_bw_to_chan_width(rx->sta);
+				ieee80211_sta_rx_bw_to_chan_width(rx->sta, 0);
 			sta_opmode.changed = STA_OPMODE_MAX_BW_CHANGED;
 
 			rate_control_rate_update(local, sband, rx->sta, 0,
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index 86a13ef31ef1..4fc120c64022 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -308,7 +308,7 @@ ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata,
 	/* IEEE802.11ac-2013 Table E-4 */
 	u16 centers_80mhz[] = { 5210, 5290, 5530, 5610, 5690, 5775 };
 	struct cfg80211_chan_def uc = sta->tdls_chandef;
-	enum nl80211_chan_width max_width = ieee80211_sta_cap_chan_bw(sta);
+	enum nl80211_chan_width max_width = ieee80211_sta_cap_chan_bw(sta, 0);
 	int i;
 
 	/* only support upgrading non-narrow channels up to 80Mhz */
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index 27d260be9123..acfe1459535f 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -116,16 +116,16 @@ void
 ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
 				    struct ieee80211_supported_band *sband,
 				    const struct ieee80211_vht_cap *vht_cap_ie,
-				    struct sta_info *sta)
+				    struct sta_info *sta, unsigned int link_id)
 {
-	struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.deflink.vht_cap;
+	struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.link[link_id]->vht_cap;
 	struct ieee80211_sta_vht_cap own_cap;
 	u32 cap_info, i;
 	bool have_80mhz;
 
 	memset(vht_cap, 0, sizeof(*vht_cap));
 
-	if (!sta->sta.deflink.ht_cap.ht_supported)
+	if (!sta->sta.link[link_id]->ht_cap.ht_supported)
 		return;
 
 	if (!vht_cap_ie || !sband->vht_cap.vht_supported)
@@ -295,10 +295,10 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
 	switch (vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) {
 	case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ:
 	case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ:
-		sta->deflink.cur_max_bandwidth = IEEE80211_STA_RX_BW_160;
+		sta->link[link_id]->cur_max_bandwidth = IEEE80211_STA_RX_BW_160;
 		break;
 	default:
-		sta->deflink.cur_max_bandwidth = IEEE80211_STA_RX_BW_80;
+		sta->link[link_id]->cur_max_bandwidth = IEEE80211_STA_RX_BW_80;
 
 		if (!(vht_cap->vht_mcs.tx_highest &
 				cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE)))
@@ -310,10 +310,11 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
 		 * above) between 160 and 80+80 yet.
 		 */
 		if (cap_info & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK)
-			sta->deflink.cur_max_bandwidth = IEEE80211_STA_RX_BW_160;
+			sta->link[link_id]->cur_max_bandwidth =
+				IEEE80211_STA_RX_BW_160;
 	}
 
-	sta->sta.deflink.bandwidth = ieee80211_sta_cur_vht_bw(sta, 0);
+	sta->sta.link[link_id]->bandwidth = ieee80211_sta_cur_vht_bw(sta, link_id);
 
 	switch (vht_cap->cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK) {
 	case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454:
@@ -391,16 +392,17 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta,
 	return IEEE80211_STA_RX_BW_80;
 }
 
-enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta)
+enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta,
+						  unsigned int link_id)
 {
-	struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.deflink.vht_cap;
+	struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.link[link_id]->vht_cap;
 	u32 cap_width;
 
 	if (!vht_cap->vht_supported) {
-		if (!sta->sta.deflink.ht_cap.ht_supported)
+		if (!sta->sta.link[link_id]->ht_cap.ht_supported)
 			return NL80211_CHAN_WIDTH_20_NOHT;
 
-		return sta->sta.deflink.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ?
+		return sta->sta.link[link_id]->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ?
 				NL80211_CHAN_WIDTH_40 : NL80211_CHAN_WIDTH_20;
 	}
 
@@ -415,15 +417,17 @@ enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta)
 }
 
 enum nl80211_chan_width
-ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta)
+ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta, unsigned int link_id)
 {
-	enum ieee80211_sta_rx_bandwidth cur_bw = sta->sta.deflink.bandwidth;
-	struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.deflink.vht_cap;
+	enum ieee80211_sta_rx_bandwidth cur_bw =
+		sta->sta.link[link_id]->bandwidth;
+	struct ieee80211_sta_vht_cap *vht_cap =
+		&sta->sta.link[link_id]->vht_cap;
 	u32 cap_width;
 
 	switch (cur_bw) {
 	case IEEE80211_STA_RX_BW_20:
-		if (!sta->sta.deflink.ht_cap.ht_supported)
+		if (!sta->sta.link[link_id]->ht_cap.ht_supported)
 			return NL80211_CHAN_WIDTH_20_NOHT;
 		else
 			return NL80211_CHAN_WIDTH_20;
@@ -497,18 +501,18 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta,
 	return bw;
 }
 
-void ieee80211_sta_set_rx_nss(struct sta_info *sta)
+void ieee80211_sta_set_rx_nss(struct sta_info *sta, unsigned int link_id)
 {
 	u8 ht_rx_nss = 0, vht_rx_nss = 0, he_rx_nss = 0, eht_rx_nss = 0, rx_nss;
 	bool support_160;
 
 	/* if we received a notification already don't overwrite it */
-	if (sta->sta.deflink.rx_nss)
+	if (sta->sta.link[link_id]->rx_nss)
 		return;
 
-	if (sta->sta.deflink.eht_cap.has_eht) {
+	if (sta->sta.link[link_id]->eht_cap.has_eht) {
 		int i;
-		const u8 *rx_nss_mcs = (void *)&sta->sta.deflink.eht_cap.eht_mcs_nss_supp;
+		const u8 *rx_nss_mcs = (void *)&sta->sta.link[link_id]->eht_cap.eht_mcs_nss_supp;
 
 		/* get the max nss for EHT over all possible bandwidths and mcs */
 		for (i = 0; i < sizeof(struct ieee80211_eht_mcs_nss_supp); i++)
@@ -517,10 +521,10 @@ void ieee80211_sta_set_rx_nss(struct sta_info *sta)
 						       IEEE80211_EHT_MCS_NSS_RX));
 	}
 
-	if (sta->sta.deflink.he_cap.has_he) {
+	if (sta->sta.link[link_id]->he_cap.has_he) {
 		int i;
 		u8 rx_mcs_80 = 0, rx_mcs_160 = 0;
-		const struct ieee80211_sta_he_cap *he_cap = &sta->sta.deflink.he_cap;
+		const struct ieee80211_sta_he_cap *he_cap = &sta->sta.link[link_id]->he_cap;
 		u16 mcs_160_map =
 			le16_to_cpu(he_cap->he_mcs_nss_supp.rx_mcs_160);
 		u16 mcs_80_map = le16_to_cpu(he_cap->he_mcs_nss_supp.rx_mcs_80);
@@ -551,23 +555,23 @@ void ieee80211_sta_set_rx_nss(struct sta_info *sta)
 			he_rx_nss = rx_mcs_80;
 	}
 
-	if (sta->sta.deflink.ht_cap.ht_supported) {
-		if (sta->sta.deflink.ht_cap.mcs.rx_mask[0])
+	if (sta->sta.link[link_id]->ht_cap.ht_supported) {
+		if (sta->sta.link[link_id]->ht_cap.mcs.rx_mask[0])
 			ht_rx_nss++;
-		if (sta->sta.deflink.ht_cap.mcs.rx_mask[1])
+		if (sta->sta.link[link_id]->ht_cap.mcs.rx_mask[1])
 			ht_rx_nss++;
-		if (sta->sta.deflink.ht_cap.mcs.rx_mask[2])
+		if (sta->sta.link[link_id]->ht_cap.mcs.rx_mask[2])
 			ht_rx_nss++;
-		if (sta->sta.deflink.ht_cap.mcs.rx_mask[3])
+		if (sta->sta.link[link_id]->ht_cap.mcs.rx_mask[3])
 			ht_rx_nss++;
 		/* FIXME: consider rx_highest? */
 	}
 
-	if (sta->sta.deflink.vht_cap.vht_supported) {
+	if (sta->sta.link[link_id]->vht_cap.vht_supported) {
 		int i;
 		u16 rx_mcs_map;
 
-		rx_mcs_map = le16_to_cpu(sta->sta.deflink.vht_cap.vht_mcs.rx_mcs_map);
+		rx_mcs_map = le16_to_cpu(sta->sta.link[link_id]->vht_cap.vht_mcs.rx_mcs_map);
 
 		for (i = 7; i >= 0; i--) {
 			u8 mcs = (rx_mcs_map >> (2 * i)) & 3;
@@ -583,12 +587,12 @@ void ieee80211_sta_set_rx_nss(struct sta_info *sta)
 	rx_nss = max(vht_rx_nss, ht_rx_nss);
 	rx_nss = max(he_rx_nss, rx_nss);
 	rx_nss = max(eht_rx_nss, rx_nss);
-	sta->sta.deflink.rx_nss = max_t(u8, 1, rx_nss);
+	sta->sta.link[link_id]->rx_nss = max_t(u8, 1, rx_nss);
 }
 
 u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
-				  struct sta_info *sta, u8 opmode,
-				  enum nl80211_band band)
+				  struct sta_info *sta, unsigned int link_id,
+				  u8 opmode, enum nl80211_band band)
 {
 	enum ieee80211_sta_rx_bandwidth new_bw;
 	struct sta_opmode_info sta_opmode = {};
@@ -603,8 +607,8 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
 	nss >>= IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT;
 	nss += 1;
 
-	if (sta->sta.deflink.rx_nss != nss) {
-		sta->sta.deflink.rx_nss = nss;
+	if (sta->sta.link[link_id]->rx_nss != nss) {
+		sta->sta.link[link_id]->rx_nss = nss;
 		sta_opmode.rx_nss = nss;
 		changed |= IEEE80211_RC_NSS_CHANGED;
 		sta_opmode.changed |= STA_OPMODE_N_SS_CHANGED;
@@ -613,28 +617,28 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
 	switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) {
 	case IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ:
 		/* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */
-		sta->deflink.cur_max_bandwidth = IEEE80211_STA_RX_BW_20;
+		sta->link[link_id]->cur_max_bandwidth = IEEE80211_STA_RX_BW_20;
 		break;
 	case IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ:
 		/* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */
-		sta->deflink.cur_max_bandwidth = IEEE80211_STA_RX_BW_40;
+		sta->link[link_id]->cur_max_bandwidth = IEEE80211_STA_RX_BW_40;
 		break;
 	case IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ:
 		if (opmode & IEEE80211_OPMODE_NOTIF_BW_160_80P80)
-			sta->deflink.cur_max_bandwidth = IEEE80211_STA_RX_BW_160;
+			sta->link[link_id]->cur_max_bandwidth = IEEE80211_STA_RX_BW_160;
 		else
-			sta->deflink.cur_max_bandwidth = IEEE80211_STA_RX_BW_80;
+			sta->link[link_id]->cur_max_bandwidth = IEEE80211_STA_RX_BW_80;
 		break;
 	case IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ:
 		/* legacy only, no longer used by newer spec */
-		sta->deflink.cur_max_bandwidth = IEEE80211_STA_RX_BW_160;
+		sta->link[link_id]->cur_max_bandwidth = IEEE80211_STA_RX_BW_160;
 		break;
 	}
 
-	new_bw = ieee80211_sta_cur_vht_bw(sta, 0);
-	if (new_bw != sta->sta.deflink.bandwidth) {
-		sta->sta.deflink.bandwidth = new_bw;
-		sta_opmode.bw = ieee80211_sta_rx_bw_to_chan_width(sta);
+	new_bw = ieee80211_sta_cur_vht_bw(sta, link_id);
+	if (new_bw != sta->sta.link[link_id]->bandwidth) {
+		sta->sta.link[link_id]->bandwidth = new_bw;
+		sta_opmode.bw = ieee80211_sta_rx_bw_to_chan_width(sta, link_id);
 		changed |= IEEE80211_RC_BW_CHANGED;
 		sta_opmode.changed |= STA_OPMODE_MAX_BW_CHANGED;
 	}
@@ -647,54 +651,56 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
 }
 
 void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata,
+				 unsigned int link_id,
 				 struct ieee80211_mgmt *mgmt)
 {
-	struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
+	struct ieee80211_bss_conf *link_conf = sdata->vif.link_conf[link_id];
 
-	if (!sdata->vif.bss_conf.mu_mimo_owner)
+	if (!link_conf->mu_mimo_owner)
 		return;
 
 	if (!memcmp(mgmt->u.action.u.vht_group_notif.position,
-		    bss_conf->mu_group.position, WLAN_USER_POSITION_LEN) &&
+		    link_conf->mu_group.position, WLAN_USER_POSITION_LEN) &&
 	    !memcmp(mgmt->u.action.u.vht_group_notif.membership,
-		    bss_conf->mu_group.membership, WLAN_MEMBERSHIP_LEN))
+		    link_conf->mu_group.membership, WLAN_MEMBERSHIP_LEN))
 		return;
 
-	memcpy(bss_conf->mu_group.membership,
+	memcpy(link_conf->mu_group.membership,
 	       mgmt->u.action.u.vht_group_notif.membership,
 	       WLAN_MEMBERSHIP_LEN);
-	memcpy(bss_conf->mu_group.position,
+	memcpy(link_conf->mu_group.position,
 	       mgmt->u.action.u.vht_group_notif.position,
 	       WLAN_USER_POSITION_LEN);
 
-	ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_MU_GROUPS);
+	ieee80211_link_info_change_notify(sdata, link_id, BSS_CHANGED_MU_GROUPS);
 }
 
-void ieee80211_update_mu_groups(struct ieee80211_vif *vif,
+void ieee80211_update_mu_groups(struct ieee80211_vif *vif, unsigned int link_id,
 				const u8 *membership, const u8 *position)
 {
-	struct ieee80211_bss_conf *bss_conf = &vif->bss_conf;
+	struct ieee80211_bss_conf *link_conf = vif->link_conf[link_id];
 
-	if (WARN_ON_ONCE(!vif->bss_conf.mu_mimo_owner))
+	if (WARN_ON_ONCE(!link_conf->mu_mimo_owner))
 		return;
 
-	memcpy(bss_conf->mu_group.membership, membership, WLAN_MEMBERSHIP_LEN);
-	memcpy(bss_conf->mu_group.position, position, WLAN_USER_POSITION_LEN);
+	memcpy(link_conf->mu_group.membership, membership, WLAN_MEMBERSHIP_LEN);
+	memcpy(link_conf->mu_group.position, position, WLAN_USER_POSITION_LEN);
 }
 EXPORT_SYMBOL_GPL(ieee80211_update_mu_groups);
 
 void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
-				 struct sta_info *sta, u8 opmode,
-				 enum nl80211_band band)
+				 struct sta_info *sta, unsigned int link_id,
+				 u8 opmode, enum nl80211_band band)
 {
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band];
 
-	u32 changed = __ieee80211_vht_handle_opmode(sdata, sta, opmode, band);
+	u32 changed = __ieee80211_vht_handle_opmode(sdata, sta, link_id,
+						    opmode, band);
 
 	if (changed > 0) {
 		ieee80211_recalc_min_chandef(sdata);
-		rate_control_rate_update(local, sband, sta, 0, changed);
+		rate_control_rate_update(local, sband, sta, link_id, changed);
 	}
 }
 
-- 
cgit 


From e9aac179ad4526afa3190856b71aa41decb6dc6a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 30 May 2022 23:45:04 +0200
Subject: wifi: mac80211: make some SMPS code MLD-aware

Start making some SMPS related code MLD-aware. This isn't
really done yet, but again cuts down our 'deflink' reliance.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/dvm/lib.c   |  4 ++--
 drivers/net/wireless/intel/iwlwifi/mvm/utils.c |  2 +-
 drivers/net/wireless/realtek/rtw88/main.c      |  4 ++--
 include/net/mac80211.h                         |  3 ++-
 net/mac80211/cfg.c                             | 19 +++++++++++------
 net/mac80211/debugfs_netdev.c                  |  2 +-
 net/mac80211/ht.c                              | 29 ++++++++++++++------------
 net/mac80211/ieee80211_i.h                     |  4 +++-
 net/mac80211/iface.c                           |  2 +-
 net/mac80211/mlme.c                            |  2 +-
 net/mac80211/util.c                            |  5 +++--
 11 files changed, 45 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/lib.c b/drivers/net/wireless/intel/iwlwifi/dvm/lib.c
index 40d790b36d85..1dc974e2c511 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/lib.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/lib.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /******************************************************************************
  *
- * Copyright(c) 2008 - 2014 Intel Corporation. All rights reserved.
+ * Copyright(c) 2008 - 2014, 2022 Intel Corporation. All rights reserved.
  *****************************************************************************/
 #include <linux/etherdevice.h>
 #include <linux/kernel.h>
@@ -441,7 +441,7 @@ static void iwlagn_bt_traffic_change_work(struct work_struct *work)
 		priv->current_ht_config.smps = smps_request;
 		for_each_context(priv, ctx) {
 			if (ctx->vif && ctx->vif->type == NL80211_IFTYPE_STATION)
-				ieee80211_request_smps(ctx->vif, smps_request);
+				ieee80211_request_smps(ctx->vif, 0, smps_request);
 		}
 	}
 
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
index 3ee5ea3484be..14b2de65bd84 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
@@ -304,7 +304,7 @@ void iwl_mvm_update_smps(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 			smps_mode = IEEE80211_SMPS_DYNAMIC;
 	}
 
-	ieee80211_request_smps(vif, smps_mode);
+	ieee80211_request_smps(vif, 0, smps_mode);
 }
 
 static bool iwl_wait_stats_complete(struct iwl_notif_wait_data *notif_wait,
diff --git a/drivers/net/wireless/realtek/rtw88/main.c b/drivers/net/wireless/realtek/rtw88/main.c
index f78b9f28d5f6..985ee36efc0f 100644
--- a/drivers/net/wireless/realtek/rtw88/main.c
+++ b/drivers/net/wireless/realtek/rtw88/main.c
@@ -1600,9 +1600,9 @@ static void rtw_vif_smps_iter(void *data, u8 *mac,
 		return;
 
 	if (rtwdev->hal.txrx_1ss)
-		ieee80211_request_smps(vif, IEEE80211_SMPS_STATIC);
+		ieee80211_request_smps(vif, 0, IEEE80211_SMPS_STATIC);
 	else
-		ieee80211_request_smps(vif, IEEE80211_SMPS_OFF);
+		ieee80211_request_smps(vif, 0, IEEE80211_SMPS_OFF);
 }
 
 void rtw_set_txrx_1ss(struct rtw_dev *rtwdev, bool txrx_1ss)
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 0a26cb2871d6..302340f6fa08 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -6210,13 +6210,14 @@ void ieee80211_channel_switch_disconnect(struct ieee80211_vif *vif,
 /**
  * ieee80211_request_smps - request SM PS transition
  * @vif: &struct ieee80211_vif pointer from the add_interface callback.
+ * @link_id: link ID for MLO, or 0
  * @smps_mode: new SM PS mode
  *
  * This allows the driver to request an SM PS transition in managed
  * mode. This is useful when the driver has more information than
  * the stack about possible interference, for example by bluetooth.
  */
-void ieee80211_request_smps(struct ieee80211_vif *vif,
+void ieee80211_request_smps(struct ieee80211_vif *vif, unsigned int link_id,
 			    enum ieee80211_smps_mode smps_mode);
 
 /**
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index d7f2eb7421c3..2c9554174bcf 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -2913,6 +2913,7 @@ static int ieee80211_testmode_dump(struct wiphy *wiphy,
 #endif
 
 int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
+				 unsigned int link_id,
 				 enum ieee80211_smps_mode smps_mode)
 {
 	const u8 *ap;
@@ -2926,8 +2927,8 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
 	if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION))
 		return -EINVAL;
 
-	old_req = sdata->deflink.u.mgd.req_smps;
-	sdata->deflink.u.mgd.req_smps = smps_mode;
+	old_req = sdata->link[link_id]->u.mgd.req_smps;
+	sdata->link[link_id]->u.mgd.req_smps = smps_mode;
 
 	if (old_req == smps_mode &&
 	    smps_mode != IEEE80211_SMPS_AUTOMATIC)
@@ -2939,10 +2940,10 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
 	 * the new value until we associate.
 	 */
 	if (!sdata->u.mgd.associated ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT)
+	    sdata->vif.link_conf[link_id]->chandef.width == NL80211_CHAN_WIDTH_20_NOHT)
 		return 0;
 
-	ap = sdata->deflink.u.mgd.bssid;
+	ap = sdata->link[link_id]->u.mgd.bssid;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) {
@@ -2966,7 +2967,7 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
 	err = ieee80211_send_smps_action(sdata, smps_mode,
 					 ap, ap);
 	if (err)
-		sdata->deflink.u.mgd.req_smps = old_req;
+		sdata->link[link_id]->u.mgd.req_smps = old_req;
 	else if (smps_mode != IEEE80211_SMPS_OFF && tdls_peer_found)
 		ieee80211_teardown_tdls_peers(sdata);
 
@@ -2978,6 +2979,7 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev,
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	unsigned int link_id;
 
 	if (sdata->vif.type != NL80211_IFTYPE_STATION)
 		return -EOPNOTSUPP;
@@ -2994,7 +2996,12 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev,
 
 	/* no change, but if automatic follow powersave */
 	sdata_lock(sdata);
-	__ieee80211_request_smps_mgd(sdata, sdata->deflink.u.mgd.req_smps);
+	for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) {
+		if (!sdata->link[link_id])
+			continue;
+		__ieee80211_request_smps_mgd(sdata, link_id,
+					     sdata->link[link_id]->u.mgd.req_smps);
+	}
 	sdata_unlock(sdata);
 
 	if (ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS))
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index bd398b631763..dfb194e15018 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -256,7 +256,7 @@ static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata,
 		return -EOPNOTSUPP;
 
 	sdata_lock(sdata);
-	err = __ieee80211_request_smps_mgd(sdata, smps_mode);
+	err = __ieee80211_request_smps_mgd(sdata, 0, smps_mode);
 	sdata_unlock(sdata);
 
 	return err;
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 22677df83ed8..111828b559a0 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -542,30 +542,33 @@ int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata,
 
 void ieee80211_request_smps_mgd_work(struct work_struct *work)
 {
-	struct ieee80211_sub_if_data *sdata =
-		container_of(work, struct ieee80211_sub_if_data,
-			     deflink.u.mgd.request_smps_work);
-
-	sdata_lock(sdata);
-	__ieee80211_request_smps_mgd(sdata,
-				     sdata->deflink.u.mgd.driver_smps_mode);
-	sdata_unlock(sdata);
+	struct ieee80211_link_data *link =
+		container_of(work, struct ieee80211_link_data,
+			     u.mgd.request_smps_work);
+
+	sdata_lock(link->sdata);
+	__ieee80211_request_smps_mgd(link->sdata, link->link_id,
+				     link->u.mgd.driver_smps_mode);
+	sdata_unlock(link->sdata);
 }
 
-void ieee80211_request_smps(struct ieee80211_vif *vif,
+void ieee80211_request_smps(struct ieee80211_vif *vif, unsigned int link_id,
 			    enum ieee80211_smps_mode smps_mode)
 {
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_link_data *link = sdata->link[link_id];
 
 	if (WARN_ON_ONCE(vif->type != NL80211_IFTYPE_STATION))
 		return;
 
-	if (sdata->deflink.u.mgd.driver_smps_mode == smps_mode)
+	if (WARN_ON(!link))
 		return;
 
-	sdata->deflink.u.mgd.driver_smps_mode = smps_mode;
-	ieee80211_queue_work(&sdata->local->hw,
-			     &sdata->deflink.u.mgd.request_smps_work);
+	if (link->u.mgd.driver_smps_mode == smps_mode)
+		return;
+
+	link->u.mgd.driver_smps_mode = smps_mode;
+	ieee80211_queue_work(&sdata->local->hw, &link->u.mgd.request_smps_work);
 }
 /* this might change ... don't want non-open drivers using it */
 EXPORT_SYMBOL_GPL(ieee80211_request_smps);
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index e75496a99299..0948d074c4b9 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -2422,8 +2422,10 @@ u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata,
 			    struct ieee802_11_elems *elems,
 			    enum nl80211_band band, u32 *basic_rates);
 int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
+				 unsigned int link_id,
 				 enum ieee80211_smps_mode smps_mode);
-void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata);
+void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata,
+			   unsigned int link_id);
 void ieee80211_recalc_min_chandef(struct ieee80211_sub_if_data *sdata);
 
 size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset);
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index c5be60ee12d8..ea75d5d5cb6a 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1657,7 +1657,7 @@ static void ieee80211_recalc_smps_work(struct work_struct *work)
 	struct ieee80211_sub_if_data *sdata =
 		container_of(work, struct ieee80211_sub_if_data, recalc_smps);
 
-	ieee80211_recalc_smps(sdata);
+	ieee80211_recalc_smps(sdata, 0);
 }
 
 /*
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 30423b2d4eee..cdbf7a5dc2ba 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2337,7 +2337,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
 	ieee80211_recalc_ps(local);
 	mutex_unlock(&local->iflist_mtx);
 
-	ieee80211_recalc_smps(sdata);
+	ieee80211_recalc_smps(sdata, 0);
 	ieee80211_recalc_ps_vif(sdata);
 
 	netif_carrier_on(sdata->dev);
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 4ec96170a679..ecda655e7481 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -2803,7 +2803,8 @@ void ieee80211_resume_disconnect(struct ieee80211_vif *vif)
 }
 EXPORT_SYMBOL_GPL(ieee80211_resume_disconnect);
 
-void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata)
+void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata,
+			   unsigned int link_id)
 {
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_chanctx_conf *chanctx_conf;
@@ -2811,7 +2812,7 @@ void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata)
 
 	mutex_lock(&local->chanctx_mtx);
 
-	chanctx_conf = rcu_dereference_protected(sdata->vif.bss_conf.chanctx_conf,
+	chanctx_conf = rcu_dereference_protected(sdata->vif.link_conf[link_id]->chanctx_conf,
 						 lockdep_is_held(&local->chanctx_mtx));
 
 	/*
-- 
cgit 


From 0f48b8b88aa9ed7b65d7cb55dbc57ec914ddada1 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 31 May 2022 14:03:38 +0200
Subject: wifi: ieee80211: add definitions for multi-link element

Add the definitions necessary to build and parse some of the
multi-link element, the per-STA profile isn't fully included.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 222 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 222 insertions(+)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index e15771965916..870f659087d6 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -4379,4 +4379,226 @@ enum ieee80211_range_params_max_total_ltf {
 /* multi-link device */
 #define IEEE80211_MLD_MAX_NUM_LINKS	15
 
+#define IEEE80211_ML_CONTROL_TYPE			0x0007
+#define IEEE80211_ML_CONTROL_TYPE_BASIC			0
+#define IEEE80211_ML_CONTROL_TYPE_PREQ			1
+#define IEEE80211_ML_CONTROL_TYPE_RECONF		2
+#define IEEE80211_ML_CONTROL_TYPE_TDLS			3
+#define IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS		4
+#define IEEE80211_ML_CONTROL_PRESENCE_MASK		0xfff0
+
+struct ieee80211_multi_link_elem {
+	__le16 control;
+	u8 variable[];
+} __packed;
+
+#define IEEE80211_MLC_BASIC_PRES_LINK_ID		0x0010
+#define IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT	0x0020
+#define IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY		0x0040
+#define IEEE80211_MLC_BASIC_PRES_EML_CAPA		0x0080
+#define IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP		0x0100
+#define IEEE80211_MLC_BASIC_PRES_MLD_ID			0x0200
+
+#define IEEE80211_MED_SYNC_DELAY_DURATION		0x00ff
+#define IEEE80211_MED_SYNC_DELAY_SYNC_OFDM_ED_THRESH	0x0f00
+#define IEEE80211_MED_SYNC_DELAY_SYNC_MAX_NUM_TXOPS	0xf000
+
+#define IEEE80211_EML_CAP_EMLSR_SUPP			0x0001
+#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY		0x000e
+#define  IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_0US		0
+#define  IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_32US		1
+#define  IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_64US		2
+#define  IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_128US		3
+#define  IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_256US		4
+#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY	0x0070
+#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_0US		0
+#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_16US		1
+#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_32US		2
+#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_64US		3
+#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_128US		4
+#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_256US		5
+#define IEEE80211_EML_CAP_EMLMR_SUPPORT			0x0080
+#define IEEE80211_EML_CAP_EMLMR_DELAY			0x0700
+#define  IEEE80211_EML_CAP_EMLMR_DELAY_0US			0
+#define  IEEE80211_EML_CAP_EMLMR_DELAY_32US			1
+#define  IEEE80211_EML_CAP_EMLMR_DELAY_64US			2
+#define  IEEE80211_EML_CAP_EMLMR_DELAY_128US			3
+#define  IEEE80211_EML_CAP_EMLMR_DELAY_256US			4
+#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT		0x7800
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_0			0
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128US		1
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_256US		2
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_512US		3
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_1TU		4
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_2TU		5
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_4TU		6
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_8TU		7
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_16TU		8
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_32TU		9
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_64TU		10
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128TU		11
+
+#define IEEE80211_MLD_CAP_OP_MAX_SIMUL_LINKS		0x000f
+#define IEEE80211_MLD_CAP_OP_SRS_SUPPORT		0x0010
+#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP	0x0060
+#define IEEE80211_MLD_CAP_OP_FREQ_SEP_TYPE_IND		0x0f80
+#define IEEE80211_MLD_CAP_OP_AAR_SUPPORT		0x1000
+
+struct ieee80211_mle_basic_common_info {
+	u8 len;
+	u8 mld_mac_addr[ETH_ALEN];
+	u8 variable[];
+} __packed;
+
+#define IEEE80211_MLC_PREQ_PRES_MLD_ID			0x0010
+
+struct ieee80211_mle_preq_common_info {
+	u8 len;
+	u8 variable[];
+} __packed;
+
+#define IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR		0x0010
+
+/* no fixed fields in RECONF */
+
+struct ieee80211_mle_tdls_common_info {
+	u8 len;
+	u8 ap_mld_mac_addr[ETH_ALEN];
+} __packed;
+
+#define IEEE80211_MLC_PRIO_ACCESS_PRES_AP_MLD_MAC_ADDR	0x0010
+
+/* no fixed fields in PRIO_ACCESS */
+
+/**
+ * ieee80211_mle_common_size - check multi-link element common size
+ * @data: multi-link element, must already be checked for size using
+ *	ieee80211_mle_size_ok()
+ */
+static inline u8 ieee80211_mle_common_size(const u8 *data)
+{
+	const struct ieee80211_multi_link_elem *mle = (const void *)data;
+	u16 control = le16_to_cpu(mle->control);
+	u8 common = 0;
+
+	switch (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE)) {
+	case IEEE80211_ML_CONTROL_TYPE_BASIC:
+		common += sizeof(struct ieee80211_mle_basic_common_info);
+		break;
+	case IEEE80211_ML_CONTROL_TYPE_PREQ:
+		common += sizeof(struct ieee80211_mle_preq_common_info);
+		break;
+	case IEEE80211_ML_CONTROL_TYPE_RECONF:
+		if (control & IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR)
+			common += ETH_ALEN;
+		return common;
+	case IEEE80211_ML_CONTROL_TYPE_TDLS:
+		common += sizeof(struct ieee80211_mle_tdls_common_info);
+		break;
+	case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS:
+		if (control & IEEE80211_MLC_PRIO_ACCESS_PRES_AP_MLD_MAC_ADDR)
+			common += ETH_ALEN;
+		return common;
+	default:
+		WARN_ON(1);
+		return 0;
+	}
+
+	return common + mle->variable[0];
+}
+
+/**
+ * ieee80211_mle_size_ok - validate multi-link element size
+ * @data: pointer to the element data
+ * @len: length of the containing element
+ */
+static inline bool ieee80211_mle_size_ok(const u8 *data, u8 len)
+{
+	const struct ieee80211_multi_link_elem *mle = (const void *)data;
+	u8 fixed = sizeof(*mle);
+	u8 common = 0;
+	bool check_common_len = false;
+	u16 control;
+
+	if (len < fixed)
+		return false;
+
+	control = le16_to_cpu(mle->control);
+
+	switch (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE)) {
+	case IEEE80211_ML_CONTROL_TYPE_BASIC:
+		common += sizeof(struct ieee80211_mle_basic_common_info);
+		check_common_len = true;
+		if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID)
+			common += 1;
+		if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT)
+			common += 1;
+		if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY)
+			common += 2;
+		if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA)
+			common += 2;
+		if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP)
+			common += 2;
+		if (control & IEEE80211_MLC_BASIC_PRES_MLD_ID)
+			common += 1;
+		break;
+	case IEEE80211_ML_CONTROL_TYPE_PREQ:
+		common += sizeof(struct ieee80211_mle_preq_common_info);
+		if (control & IEEE80211_MLC_PREQ_PRES_MLD_ID)
+			common += 1;
+		check_common_len = true;
+		break;
+	case IEEE80211_ML_CONTROL_TYPE_RECONF:
+		if (control & IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR)
+			common += ETH_ALEN;
+		break;
+	case IEEE80211_ML_CONTROL_TYPE_TDLS:
+		common += sizeof(struct ieee80211_mle_tdls_common_info);
+		check_common_len = true;
+		break;
+	case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS:
+		if (control & IEEE80211_MLC_PRIO_ACCESS_PRES_AP_MLD_MAC_ADDR)
+			common += ETH_ALEN;
+		break;
+	default:
+		/* we don't know this type */
+		return true;
+	}
+
+	if (len < fixed + common)
+		return false;
+
+	if (!check_common_len)
+		return true;
+
+	/* if present, common length is the first octet there */
+	return mle->variable[0] >= common;
+}
+
+enum ieee80211_mle_subelems {
+	IEEE80211_MLE_SUBELEM_PER_STA_PROFILE		= 0,
+};
+
+#define IEEE80211_MLE_STA_CONTROL_LINK_ID			0x000f
+#define IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE		0x0010
+#define IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT		0x0020
+#define IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT		0x0040
+#define IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT		0x0080
+#define IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT		0x0100
+#define IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT	0x0200
+#define IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE		0x0400
+#define IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT	0x0800
+
+struct ieee80211_mle_per_sta_profile {
+	__le16 control;
+	u8 sta_info_len;
+	u8 variable[];
+} __packed;
+
+#define for_each_mle_subelement(_elem, _data, _len)			\
+	if (ieee80211_mle_size_ok(_data, _len))				\
+		for_each_element(_elem,					\
+				 _data + ieee80211_mle_common_size(_data),\
+				 _len - ieee80211_mle_common_size(_data))
+
 #endif /* LINUX_IEEE80211_H */
-- 
cgit 


From d648c23024bd01333acd2fd5e34bcde0ffb66b16 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 31 May 2022 19:48:33 +0200
Subject: wifi: nl80211: support MLO in auth/assoc

For authentication, we need the BSS, the link_id and the AP
MLD address to create the link and station, (for now) the
driver assigns a link address and sends the frame, the MLD
address needs to be the address of the interface.

For association, pass the list of BSSes that were selected
for the MLO connection, along with extra per-STA profile
elements, the AP MLD address and the link ID on which the
association request should be sent.

Note that for now we don't have a proper way to pass the link
address(es) and so the driver/mac80211 will select one, but
depending on how that selection works it means that assoc w/o
auth data still being around (mac80211 implementation detail)
the association won't necessarily work - so this will need to
be extended in the future to sort out the link addressing.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 31 +++++++++++++++
 include/uapi/linux/nl80211.h |  3 ++
 net/wireless/mlme.c          | 30 +++++++++++++--
 net/wireless/nl80211.c       | 91 ++++++++++++++++++++++++++++++++++++++++++--
 net/wireless/sme.c           |  2 +
 5 files changed, 151 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 772e099fc932..a4f9e6094118 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2718,6 +2718,12 @@ static inline const u8 *ieee80211_bss_get_ie(struct cfg80211_bss *bss, u8 id)
  *	Authentication algorithm number, i.e., starting at the Authentication
  *	transaction sequence number field.
  * @auth_data_len: Length of auth_data buffer in octets
+ * @link_id: if >= 0, indicates authentication should be done as an MLD,
+ *	the interface address is included as the MLD address and the
+ *	necessary link (with the given link_id) will be created (and
+ *	given an MLD address) by the driver
+ * @ap_mld_addr: AP MLD address in case of authentication request with
+ *	an AP MLD, valid iff @link_id >= 0
  */
 struct cfg80211_auth_request {
 	struct cfg80211_bss *bss;
@@ -2728,6 +2734,21 @@ struct cfg80211_auth_request {
 	u8 key_len, key_idx;
 	const u8 *auth_data;
 	size_t auth_data_len;
+	s8 link_id;
+	const u8 *ap_mld_addr;
+};
+
+/**
+ * struct cfg80211_assoc_link - per-link information for MLO association
+ * @bss: the BSS pointer, see also &struct cfg80211_assoc_request::bss;
+ *	if this is %NULL for a link, that link is not requested
+ * @elems: extra elements for the per-STA profile for this link
+ * @elems_len: length of the elements
+ */
+struct cfg80211_assoc_link {
+	struct cfg80211_bss *bss;
+	const u8 *elems;
+	size_t elems_len;
 };
 
 /**
@@ -2761,6 +2782,8 @@ enum cfg80211_assoc_req_flags {
  *	given a reference that it must give back to cfg80211_send_rx_assoc()
  *	or to cfg80211_assoc_timeout(). To ensure proper refcounting, new
  *	association requests while already associating must be rejected.
+ *	This also applies to the @links.bss parameter, which is used instead
+ *	of this one (it is %NULL) for MLO associations.
  * @ie: Extra IEs to add to (Re)Association Request frame or %NULL
  * @ie_len: Length of ie buffer in octets
  * @use_mfp: Use management frame protection (IEEE 802.11w) in this association
@@ -2785,6 +2808,11 @@ enum cfg80211_assoc_req_flags {
  *	with 16 octets of STA Nonce followed by 16 octets of AP Nonce.
  * @s1g_capa: S1G capability override
  * @s1g_capa_mask: S1G capability override mask
+ * @links: per-link information for MLO connections
+ * @link_id: >= 0 for MLO connections, where links are given, and indicates
+ *	the link on which the association request should be sent
+ * @ap_mld_addr: AP MLD address in case of MLO association request,
+ *	valid iff @link_id >= 0
  */
 struct cfg80211_assoc_request {
 	struct cfg80211_bss *bss;
@@ -2800,6 +2828,9 @@ struct cfg80211_assoc_request {
 	size_t fils_kek_len;
 	const u8 *fils_nonces;
 	struct ieee80211_s1g_cap s1g_capa, s1g_capa_mask;
+	struct cfg80211_assoc_link links[IEEE80211_MLD_MAX_NUM_LINKS];
+	const u8 *ap_mld_addr;
+	s8 link_id;
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index a9a2c9fef295..60ad9a9f153d 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2687,6 +2687,8 @@ enum nl80211_commands {
  *	various commands that need a link ID to operate.
  * @NL80211_ATTR_MLO_LINKS: A nested array of links, each containing some
  *	per-link information and a link ID.
+ * @NL80211_ATTR_MLD_ADDR: An MLD address, used with various commands such as
+ *	authenticate/associate.
  *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
@@ -3204,6 +3206,7 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_MLO_LINKS,
 	NL80211_ATTR_MLO_LINK_ID,
+	NL80211_ATTR_MLD_ADDR,
 
 	/* add attributes here, update the policy in nl80211.c */
 
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 532113937469..d92eed0e52cd 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -241,6 +241,10 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
 	if (!req->bss)
 		return -ENOENT;
 
+	if (req->link_id >= 0 &&
+	    !(wdev->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO))
+		return -EINVAL;
+
 	if (req->auth_type == NL80211_AUTHTYPE_SHARED_KEY) {
 		if (!req->key || !req->key_len ||
 		    req->key_idx < 0 || req->key_idx > 3)
@@ -294,10 +298,19 @@ int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
 			struct cfg80211_assoc_request *req)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
-	int err;
+	int err, i, j;
 
 	ASSERT_WDEV_LOCK(wdev);
 
+	for (i = 1; i < ARRAY_SIZE(req->links); i++) {
+		if (!req->links[i].bss)
+			continue;
+		for (j = 0; j < i; j++) {
+			if (req->links[i].bss == req->links[j].bss)
+				return -EINVAL;
+		}
+	}
+
 	if (wdev->connected &&
 	    (!req->prev_bssid ||
 	     !ether_addr_equal(wdev->u.client.connected_addr, req->prev_bssid)))
@@ -310,8 +323,19 @@ int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
 
 	err = rdev_assoc(rdev, dev, req);
 	if (!err) {
-		cfg80211_ref_bss(&rdev->wiphy, req->bss);
-		cfg80211_hold_bss(bss_from_pub(req->bss));
+		int link_id;
+
+		if (req->bss) {
+			cfg80211_ref_bss(&rdev->wiphy, req->bss);
+			cfg80211_hold_bss(bss_from_pub(req->bss));
+		}
+
+		for (link_id = 0; link_id < ARRAY_SIZE(req->links); link_id++) {
+			if (!req->links[link_id].bss)
+				continue;
+			cfg80211_ref_bss(&rdev->wiphy, req->links[link_id].bss);
+			cfg80211_hold_bss(bss_from_pub(req->links[link_id].bss));
+		}
 	}
 	return err;
 }
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 5a4d3ddcdf80..9bc66a21ac3a 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -796,6 +796,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 		NLA_POLICY_NESTED_ARRAY(nl80211_policy),
 	[NL80211_ATTR_MLO_LINK_ID] =
 		NLA_POLICY_RANGE(NLA_U8, 0, IEEE80211_MLD_MAX_NUM_LINKS),
+	[NL80211_ATTR_MLD_ADDR] = NLA_POLICY_EXACT_LEN(ETH_ALEN),
 };
 
 /* policy for the key attributes */
@@ -10282,6 +10283,12 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
 	req.key = key.p.key;
 	req.key_len = key.p.key_len;
 	req.key_idx = key.idx;
+	req.link_id = nl80211_link_id_or_invalid(info->attrs);
+	if (req.link_id >= 0) {
+		if (!info->attrs[NL80211_ATTR_MLD_ADDR])
+			return -EINVAL;
+		req.ap_mld_addr = nla_data(info->attrs[NL80211_ATTR_MLD_ADDR]);
+	}
 
 	req.bss = cfg80211_get_bss(&rdev->wiphy, chan, bssid, ssid, ssid_len,
 				   IEEE80211_BSS_TYPE_ESS,
@@ -10475,7 +10482,9 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct net_device *dev = info->user_ptr[1];
 	struct cfg80211_assoc_request req = {};
+	struct nlattr **attrs = NULL;
 	const u8 *bssid, *ssid;
+	unsigned int link_id;
 	int err, ssid_len;
 
 	if (dev->ieee80211_ptr->conn_owner_nlportid &&
@@ -10585,9 +10594,81 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 		       sizeof(req.s1g_capa));
 	}
 
-	req.bss = nl80211_assoc_bss(rdev, ssid, ssid_len, info->attrs, &bssid);
-	if (IS_ERR(req.bss))
-		return PTR_ERR(req.bss);
+	req.link_id = nl80211_link_id_or_invalid(info->attrs);
+
+	if (info->attrs[NL80211_ATTR_MLO_LINKS]) {
+		unsigned int attrsize = NUM_NL80211_ATTR * sizeof(*attrs);
+		struct nlattr *link;
+		int rem = 0;
+
+		if (req.link_id < 0)
+			return -EINVAL;
+
+		if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_MLO))
+			return -EINVAL;
+
+		if (info->attrs[NL80211_ATTR_MAC] ||
+		    info->attrs[NL80211_ATTR_WIPHY_FREQ] ||
+		    !info->attrs[NL80211_ATTR_MLD_ADDR])
+			return -EINVAL;
+
+		req.ap_mld_addr = nla_data(info->attrs[NL80211_ATTR_MLD_ADDR]);
+
+		attrs = kzalloc(attrsize, GFP_KERNEL);
+		if (!attrs)
+			return -ENOMEM;
+
+		nla_for_each_nested(link,
+				    info->attrs[NL80211_ATTR_MLO_LINKS],
+				    rem) {
+			memset(attrs, 0, attrsize);
+
+			nla_parse_nested(attrs, NL80211_ATTR_MAX,
+					 link, NULL, NULL);
+
+			if (!attrs[NL80211_ATTR_MLO_LINK_ID]) {
+				err = -EINVAL;
+				goto free;
+			}
+
+			link_id = nla_get_u8(attrs[NL80211_ATTR_MLO_LINK_ID]);
+			/* cannot use the same link ID again */
+			if (req.links[link_id].bss) {
+				err = -EINVAL;
+				goto free;
+			}
+			req.links[link_id].bss =
+				nl80211_assoc_bss(rdev, ssid, ssid_len, attrs,
+						  &bssid);
+			if (IS_ERR(req.links[link_id].bss)) {
+				err = PTR_ERR(req.links[link_id].bss);
+				goto free;
+			}
+
+			if (attrs[NL80211_ATTR_IE]) {
+				req.links[link_id].elems =
+					nla_data(attrs[NL80211_ATTR_IE]);
+				req.links[link_id].elems_len =
+					nla_len(attrs[NL80211_ATTR_IE]);
+			}
+		}
+
+		if (!req.links[req.link_id].bss) {
+			err = -EINVAL;
+			goto free;
+		}
+
+		kfree(attrs);
+		attrs = NULL;
+	} else {
+		if (req.link_id >= 0)
+			return -EINVAL;
+
+		req.bss = nl80211_assoc_bss(rdev, ssid, ssid_len, info->attrs,
+					    &bssid);
+		if (IS_ERR(req.bss))
+			return PTR_ERR(req.bss);
+	}
 
 	err = nl80211_crypto_settings(rdev, info, &req.crypto, 1);
 	if (!err) {
@@ -10605,7 +10686,11 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 		wdev_unlock(dev->ieee80211_ptr);
 	}
 
+free:
+	for (link_id = 0; link_id < ARRAY_SIZE(req.links); link_id++)
+		cfg80211_put_bss(&rdev->wiphy, req.links[link_id].bss);
 	cfg80211_put_bss(&rdev->wiphy, req.bss);
+	kfree(attrs);
 
 	return err;
 }
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index c8a99b90723b..b3c6ce4f85ee 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -177,6 +177,7 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev,
 						params->ssid, params->ssid_len,
 						IEEE80211_BSS_TYPE_ESS,
 						IEEE80211_PRIVACY_ANY);
+		auth_req.link_id = -1;
 		err = cfg80211_mlme_auth(rdev, wdev->netdev, &auth_req);
 		cfg80211_put_bss(&rdev->wiphy, auth_req.bss);
 		return err;
@@ -198,6 +199,7 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev,
 		req.ht_capa_mask = params->ht_capa_mask;
 		req.vht_capa = params->vht_capa;
 		req.vht_capa_mask = params->vht_capa_mask;
+		req.link_id = -1;
 
 		req.bss = cfg80211_get_bss(&rdev->wiphy, params->channel,
 					   params->bssid,
-- 
cgit 


From d8787ec6b4ef1857b827699eca6f5978d0aecd74 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 31 May 2022 23:20:08 +0200
Subject: wifi: mac80211: add vif link addition/removal

Add the necessary infrastructure, including a new driver
method, to add/remove links to/from an interface.

Also add the missing link address to bss_conf (which we
use as link_conf too), and fill it, in station mode for
now just randomly, in AP mode we get the address from
cfg80211 since the link must be created with an address
first.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  15 +++++
 net/mac80211/driver-ops.h  |  21 +++++++
 net/mac80211/ieee80211_i.h |   3 +
 net/mac80211/iface.c       | 134 ++++++++++++++++++++++++++++++++++++++++++++-
 net/mac80211/trace.h       |  27 +++++++++
 5 files changed, 197 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 302340f6fa08..f221afdfa475 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -515,6 +515,7 @@ struct ieee80211_fils_discovery {
  * This structure keeps information about a BSS (and an association
  * to that BSS) that can change during the lifetime of the BSS.
  *
+ * @addr: (link) address used locally
  * @htc_trig_based_pkt_ext: default PE in 4us units, if BSS supports HE
  * @uora_exists: is the UORA element advertised by AP
  * @ack_enabled: indicates support to receive a multi-TID that solicits either
@@ -638,6 +639,7 @@ struct ieee80211_fils_discovery {
  */
 struct ieee80211_bss_conf {
 	const u8 *bssid;
+	u8 addr[ETH_ALEN] __aligned(2);
 	u8 htc_trig_based_pkt_ext;
 	bool uora_exists;
 	u8 uora_ocw_range;
@@ -1743,6 +1745,7 @@ struct ieee80211_vif_cfg {
  *	or the BSS we're associated to
  * @link_conf: in case of MLD, the per-link BSS configuration,
  *	indexed by link ID
+ * @valid_links: bitmap of valid links, or 0 for non-MLO.
  * @addr: address of this interface
  * @p2p: indicates whether this AP or STA interface is a p2p
  *	interface, i.e. a GO or p2p-sta respectively
@@ -1778,6 +1781,7 @@ struct ieee80211_vif {
 	struct ieee80211_vif_cfg cfg;
 	struct ieee80211_bss_conf bss_conf;
 	struct ieee80211_bss_conf *link_conf[IEEE80211_MLD_MAX_NUM_LINKS];
+	u16 valid_links;
 	u8 addr[ETH_ALEN] __aligned(2);
 	bool p2p;
 
@@ -4028,6 +4032,13 @@ struct ieee80211_prep_tx_info {
  *	disable background CAC/radar detection.
  * @net_fill_forward_path: Called from .ndo_fill_forward_path in order to
  *	resolve a path for hardware flow offloading
+ * @change_vif_links: Change the valid links on an interface, note that while
+ *	removing the old link information is still valid (link_conf pointer),
+ *	but may immediately disappear after the function returns. The old or
+ *	new links bitmaps may be 0 if going from/to a non-MLO situation.
+ *	The @old[] array contains pointers to the old bss_conf structures
+ *	that were already removed, in case they're needed.
+ *	This callback can sleep.
  */
 struct ieee80211_ops {
 	void (*tx)(struct ieee80211_hw *hw,
@@ -4371,6 +4382,10 @@ struct ieee80211_ops {
 				     struct ieee80211_sta *sta,
 				     struct net_device_path_ctx *ctx,
 				     struct net_device_path *path);
+	int (*change_vif_links)(struct ieee80211_hw *hw,
+				struct ieee80211_vif *vif,
+				u16 old_links, u16 new_links,
+				struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS]);
 };
 
 /**
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 9238283a8237..c8133c84d7d5 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1533,4 +1533,25 @@ static inline int drv_net_fill_forward_path(struct ieee80211_local *local,
 	return ret;
 }
 
+static inline int drv_change_vif_links(struct ieee80211_local *local,
+				       struct ieee80211_sub_if_data *sdata,
+				       u16 old_links, u16 new_links,
+				       struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS])
+{
+	int ret = -EOPNOTSUPP;
+
+	might_sleep();
+
+	if (!check_sdata_in_driver(sdata))
+		return -EIO;
+
+	trace_drv_change_vif_links(local, sdata, old_links, new_links);
+	if (local->ops->change_vif_links)
+		ret = local->ops->change_vif_links(&local->hw, &sdata->vif,
+						   old_links, new_links, old);
+	trace_drv_return_int(local, ret);
+
+	return ret;
+}
+
 #endif /* __MAC80211_DRIVER_OPS */
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 8286e607152c..22f17231dede 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -2006,6 +2006,9 @@ void ieee80211_sdata_stop(struct ieee80211_sub_if_data *sdata);
 int ieee80211_add_virtual_monitor(struct ieee80211_local *local);
 void ieee80211_del_virtual_monitor(struct ieee80211_local *local);
 
+int ieee80211_vif_set_links(struct ieee80211_sub_if_data *sdata,
+			    u16 new_links);
+
 bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata);
 void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata,
 			      bool update_bss);
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index ea75d5d5cb6a..f118e7710fb1 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -220,8 +220,10 @@ static int ieee80211_change_mac(struct net_device *dev, void *addr)
 
 	ret = eth_mac_addr(dev, sa);
 
-	if (ret == 0)
+	if (ret == 0) {
 		memcpy(sdata->vif.addr, sa->sa_data, ETH_ALEN);
+		ether_addr_copy(sdata->vif.bss_conf.addr, sdata->vif.addr);
+	}
 
 	return ret;
 }
@@ -449,7 +451,12 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do
 	cancel_work_sync(&local->dynamic_ps_enable_work);
 
 	cancel_work_sync(&sdata->recalc_smps);
+
 	sdata_lock(sdata);
+	WARN(sdata->vif.valid_links,
+	     "destroying interface with valid links 0x%04x\n",
+	     sdata->vif.valid_links);
+
 	mutex_lock(&local->mtx);
 	sdata->vif.bss_conf.csa_active = false;
 	if (sdata->vif.type == NL80211_IFTYPE_STATION)
@@ -1013,10 +1020,15 @@ static void ieee80211_set_default_queues(struct ieee80211_sub_if_data *sdata)
 }
 
 static void ieee80211_link_init(struct ieee80211_sub_if_data *sdata,
-				unsigned int link_id,
+				int link_id,
 				struct ieee80211_link_data *link,
 				struct ieee80211_bss_conf *link_conf)
 {
+	bool deflink = link_id < 0;
+
+	if (link_id < 0)
+		link_id = 0;
+
 	sdata->vif.link_conf[link_id] = link_conf;
 	sdata->link[link_id] = link;
 
@@ -1031,6 +1043,23 @@ static void ieee80211_link_init(struct ieee80211_sub_if_data *sdata,
 	INIT_LIST_HEAD(&link->reserved_chanctx_list);
 	INIT_DELAYED_WORK(&link->dfs_cac_timer_work,
 			  ieee80211_dfs_cac_timer_work);
+
+	if (!deflink) {
+		switch (sdata->vif.type) {
+		case NL80211_IFTYPE_AP:
+			ether_addr_copy(link_conf->addr,
+					sdata->wdev.links[link_id].addr);
+			WARN_ON(!(sdata->wdev.valid_links & BIT(link_id)));
+			break;
+		case NL80211_IFTYPE_STATION:
+			eth_random_addr(link_conf->addr);
+			ether_addr_copy(sdata->wdev.links[link_id].addr,
+					link_conf->addr);
+			break;
+		default:
+			WARN_ON(1);
+		}
+	}
 }
 
 static void ieee80211_sdata_init(struct ieee80211_local *local,
@@ -1046,7 +1075,7 @@ static void ieee80211_sdata_init(struct ieee80211_local *local,
 	 * Note that we never change this, so if link ID 0 isn't used in an
 	 * MLD connection, we get a separate allocation for it.
 	 */
-	ieee80211_link_init(sdata, 0, &sdata->deflink, &sdata->vif.bss_conf);
+	ieee80211_link_init(sdata, -1, &sdata->deflink, &sdata->vif.bss_conf);
 }
 
 int ieee80211_add_virtual_monitor(struct ieee80211_local *local)
@@ -1768,6 +1797,10 @@ static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata,
 	if (!local->ops->change_interface)
 		return -EBUSY;
 
+	/* for now, don't support changing while links exist */
+	if (sdata->vif.valid_links)
+		return -EBUSY;
+
 	switch (sdata->vif.type) {
 	case NL80211_IFTYPE_AP:
 		if (!list_empty(&sdata->u.ap.vlans))
@@ -2030,6 +2063,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
 		strlcpy(sdata->name, name, IFNAMSIZ);
 		ieee80211_assign_perm_addr(local, wdev->address, type);
 		memcpy(sdata->vif.addr, wdev->address, ETH_ALEN);
+		ether_addr_copy(sdata->vif.bss_conf.addr, sdata->vif.addr);
 	} else {
 		int size = ALIGN(sizeof(*sdata) + local->hw.vif_data_size,
 				 sizeof(void *));
@@ -2094,6 +2128,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
 		sdata = netdev_priv(ndev);
 		ndev->ieee80211_ptr = &sdata->wdev;
 		memcpy(sdata->vif.addr, ndev->dev_addr, ETH_ALEN);
+		ether_addr_copy(sdata->vif.bss_conf.addr, sdata->vif.addr);
 		memcpy(sdata->name, ndev->name, IFNAMSIZ);
 
 		if (txq_size) {
@@ -2318,3 +2353,96 @@ void ieee80211_vif_dec_num_mcast(struct ieee80211_sub_if_data *sdata)
 	else if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
 		atomic_dec(&sdata->u.vlan.num_mcast_sta);
 }
+
+int ieee80211_vif_set_links(struct ieee80211_sub_if_data *sdata,
+			    u16 new_links)
+{
+	u16 old_links = sdata->vif.valid_links;
+	unsigned long add = new_links & ~old_links;
+	unsigned long rem = old_links & ~new_links;
+	unsigned int link_id;
+	int ret;
+	struct {
+		struct ieee80211_link_data data;
+		struct ieee80211_bss_conf conf;
+	} *links[IEEE80211_MLD_MAX_NUM_LINKS] = {}, *link;
+	struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS];
+	struct ieee80211_link_data *old_data[IEEE80211_MLD_MAX_NUM_LINKS];
+	bool use_deflink = old_links == 0; /* set for error case */
+
+	sdata_assert_lock(sdata);
+
+	if (old_links == new_links)
+		return 0;
+
+	/* allocate new link structures first */
+	for_each_set_bit(link_id, &add, IEEE80211_MLD_MAX_NUM_LINKS) {
+		link = kzalloc(sizeof(*link), GFP_KERNEL);
+		if (!link) {
+			ret = -ENOMEM;
+			goto free;
+		}
+		links[link_id] = link;
+	}
+
+	/* keep track of the old pointers for the driver */
+	BUILD_BUG_ON(sizeof(old) != sizeof(sdata->vif.link_conf));
+	memcpy(old, sdata->vif.link_conf, sizeof(old));
+	/* and for us in error cases */
+	BUILD_BUG_ON(sizeof(old_data) != sizeof(sdata->link));
+	memcpy(old_data, sdata->link, sizeof(old_data));
+
+	/* link them into data structures */
+	for_each_set_bit(link_id, &add, IEEE80211_MLD_MAX_NUM_LINKS) {
+		WARN_ON(!use_deflink &&
+			sdata->link[link_id] == &sdata->deflink);
+
+		link = links[link_id];
+		ieee80211_link_init(sdata, link_id, &link->data, &link->conf);
+	}
+
+	for_each_set_bit(link_id, &rem, IEEE80211_MLD_MAX_NUM_LINKS) {
+		sdata->link[link_id] = NULL;
+		sdata->vif.link_conf[link_id] = NULL;
+	}
+
+	sdata->vif.valid_links = new_links;
+
+	/* tell the driver */
+	ret = drv_change_vif_links(sdata->local, sdata,
+				   old_links, new_links,
+				   old);
+	if (ret) {
+		/* restore config */
+		memcpy(sdata->link, old_data, sizeof(old_data));
+		memcpy(sdata->vif.link_conf, old, sizeof(old));
+		sdata->vif.valid_links = old_links;
+		/* and free the newly allocated links */
+		goto free;
+	}
+
+	/* use deflink/bss_conf again if and only if there are no more links */
+	use_deflink = new_links == 0;
+
+	/* now use this to free the old links */
+	memset(links, 0, sizeof(links));
+	for_each_set_bit(link_id, &rem, IEEE80211_MLD_MAX_NUM_LINKS) {
+		if (sdata->link[link_id] == &sdata->deflink)
+			continue;
+		/*
+		 * we must have allocated the data through this path so
+		 * we know we can free both at the same time
+		 */
+		links[link_id] = container_of(sdata->link[link_id],
+					      typeof(*links[link_id]),
+					      data);
+	}
+
+free:
+	for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++)
+		kfree(links[link_id]);
+	if (use_deflink)
+		ieee80211_link_init(sdata, -1, &sdata->deflink,
+				    &sdata->vif.bss_conf);
+	return ret;
+}
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 295db57a76a2..256ebab13cda 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -2455,6 +2455,33 @@ DEFINE_EVENT(sta_event, drv_net_fill_forward_path,
 	TP_ARGS(local, sdata, sta)
 );
 
+TRACE_EVENT(drv_change_vif_links,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata,
+		 u16 old_links, u16 new_links),
+
+	TP_ARGS(local, sdata, old_links, new_links),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		VIF_ENTRY
+		__field(u16, old_links)
+		__field(u16, new_links)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		VIF_ASSIGN;
+		__entry->old_links = old_links;
+		__entry->new_links = new_links;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT VIF_PR_FMT " old_links:0x%04x, new_links:0x%04x\n",
+		LOCAL_PR_ARG, VIF_PR_ARG, __entry->old_links, __entry->new_links
+	)
+);
+
 /*
  * Tracing for API calls that drivers call.
  */
-- 
cgit 


From eef25a6679adb3cc73b611cf6c78386f54df0e0f Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 1 Jun 2022 15:59:31 +0200
Subject: wifi: mac80211: remove band from TX info in MLO

If the interface is an MLD, then we don't know which band
the frame will be transmitted on, and we don't know how to
look up the band. Set the band information to zero in that
case, the driver cannot rely on it anyway.

No longer inline ieee80211_tx_skb_tid() since it's even
bigger now.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  4 ++-
 net/mac80211/ieee80211_i.h | 19 ++--------
 net/mac80211/tx.c          | 89 +++++++++++++++++++++++++++++++++++-----------
 3 files changed, 73 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index f221afdfa475..8222419c9ead 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1029,7 +1029,9 @@ ieee80211_rate_get_vht_nss(const struct ieee80211_tx_rate *rate)
  *  (3) TX status information - driver tells mac80211 what happened
  *
  * @flags: transmit info flags, defined above
- * @band: the band to transmit on (use for checking for races)
+ * @band: the band to transmit on (use e.g. for checking for races),
+ *	not valid if the interface is an MLD since we won't know which
+ *	link the frame will be transmitted on
  * @hw_queue: HW queue to put the frame on, skb_get_queue_mapping() gives the AC
  * @ack_frame_id: internal frame ID for TX status, used internally
  * @tx_time_est: TX time estimate in units of 4us, used internally
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 22f17231dede..0f69859e4e39 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -2251,23 +2251,8 @@ ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
 	rcu_read_unlock();
 }
 
-static inline void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata,
-					struct sk_buff *skb, int tid)
-{
-	struct ieee80211_chanctx_conf *chanctx_conf;
-
-	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
-	if (WARN_ON(!chanctx_conf)) {
-		rcu_read_unlock();
-		kfree_skb(skb);
-		return;
-	}
-
-	__ieee80211_tx_skb_tid_band(sdata, skb, tid,
-				    chanctx_conf->def.chan->band);
-	rcu_read_unlock();
-}
+void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata,
+			  struct sk_buff *skb, int tid);
 
 static inline void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata,
 				    struct sk_buff *skb)
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 0589d2d66fe6..e8a5e27a3dd1 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -2566,8 +2566,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
 	bool tdls_peer;
 	bool multicast;
 	u16 info_id = 0;
-	struct ieee80211_chanctx_conf *chanctx_conf;
-	struct ieee80211_sub_if_data *ap_sdata;
+	struct ieee80211_chanctx_conf *chanctx_conf = NULL;
 	enum nl80211_band band;
 	int ret;
 
@@ -2584,7 +2583,9 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
 	ethertype = (skb->data[12] << 8) | skb->data[13];
 	fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA);
 
-	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
+	if (!sdata->vif.valid_links)
+		chanctx_conf =
+			rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 
 	switch (sdata->vif.type) {
 	case NL80211_IFTYPE_AP_VLAN:
@@ -2599,10 +2600,16 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
 			authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED);
 			wme_sta = sta->sta.wme;
 		}
-		/* override chanctx_conf from AP (we don't have one) */
-		ap_sdata = container_of(sdata->bss, struct ieee80211_sub_if_data,
-					u.ap);
-		chanctx_conf = rcu_dereference(ap_sdata->vif.bss_conf.chanctx_conf);
+		if (!sdata->vif.valid_links) {
+			struct ieee80211_sub_if_data *ap_sdata;
+
+			/* override chanctx_conf from AP (we don't have one) */
+			ap_sdata = container_of(sdata->bss,
+						struct ieee80211_sub_if_data,
+						u.ap);
+			chanctx_conf =
+				rcu_dereference(ap_sdata->vif.bss_conf.chanctx_conf);
+		}
 		if (sdata->wdev.use_4addr)
 			break;
 		fallthrough;
@@ -2738,10 +2745,15 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
 	}
 
 	if (!chanctx_conf) {
-		ret = -ENOTCONN;
-		goto free;
+		if (!sdata->vif.valid_links) {
+			ret = -ENOTCONN;
+			goto free;
+		}
+		/* MLD transmissions must not rely on the band */
+		band = 0;
+	} else {
+		band = chanctx_conf->def.chan->band;
 	}
-	band = chanctx_conf->def.chan->band;
 
 	multicast = is_multicast_ether_addr(hdr.addr1);
 
@@ -2953,14 +2965,20 @@ void ieee80211_check_fast_xmit(struct sta_info *sta)
 	    !ieee80211_hw_check(&local->hw, SUPPORTS_TX_FRAG))
 		goto out;
 
-	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
-	if (!chanctx_conf) {
+	if (!sdata->vif.valid_links) {
+		rcu_read_lock();
+		chanctx_conf =
+			rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
+		if (!chanctx_conf) {
+			rcu_read_unlock();
+			goto out;
+		}
+		build.band = chanctx_conf->def.chan->band;
 		rcu_read_unlock();
-		goto out;
+	} else {
+		/* MLD transmissions must not rely on the band */
+		build.band = 0;
 	}
-	build.band = chanctx_conf->def.chan->band;
-	rcu_read_unlock();
 
 	fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA);
 
@@ -4577,12 +4595,16 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local,
 	sdata = vif_to_sdata(info->control.vif);
 
 	if (info->control.flags & IEEE80211_TX_INTCFL_NEED_TXPROCESSING) {
-		chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
-		if (unlikely(!chanctx_conf)) {
-			dev_kfree_skb(skb);
-			return true;
+		/* update band only for non-MLD */
+		if (!sdata->vif.valid_links) {
+			chanctx_conf =
+				rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
+			if (unlikely(!chanctx_conf)) {
+				dev_kfree_skb(skb);
+				return true;
+			}
+			info->band = chanctx_conf->def.chan->band;
 		}
-		info->band = chanctx_conf->def.chan->band;
 		result = ieee80211_tx(sdata, NULL, skb, true);
 	} else if (info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) {
 		if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) {
@@ -5665,6 +5687,31 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
 	local_bh_enable();
 }
 
+void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata,
+			  struct sk_buff *skb, int tid)
+{
+	struct ieee80211_chanctx_conf *chanctx_conf;
+	enum nl80211_band band;
+
+	rcu_read_lock();
+	if (!sdata->vif.valid_links) {
+		chanctx_conf =
+			rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
+		if (WARN_ON(!chanctx_conf)) {
+			rcu_read_unlock();
+			kfree_skb(skb);
+			return;
+		}
+		band = chanctx_conf->def.chan->band;
+	} else {
+		/* MLD transmissions must not rely on the band */
+		band = 0;
+	}
+
+	__ieee80211_tx_skb_tid_band(sdata, skb, tid, band);
+	rcu_read_unlock();
+}
+
 int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
 			      const u8 *buf, size_t len,
 			      const u8 *dest, __be16 proto, bool unencrypted,
-- 
cgit 


From 69d41b5a9c9d8d24c0faeb376fc2f52fc810d855 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 9 Jun 2022 22:05:07 +0200
Subject: wifi: mac80211: add MLO link ID to TX frame metadata

Take a few bits out of the control.flags to add the link ID
to TX frame metadata, so drivers don't need to look it up
by the address themselves. Implement that lookup where it's
needed, for internal frame TX, and set it to "unspecified"
for data transmissions.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h |  7 +++++++
 net/mac80211/tx.c      | 34 ++++++++++++++++++++++++++++++++--
 2 files changed, 39 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 8222419c9ead..1bea225c0d4d 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -867,6 +867,10 @@ enum mac80211_tx_info_flags {
  * @IEEE80211_TX_CTRL_DONT_REORDER: This frame should not be reordered
  *	relative to other frames that have this flag set, independent
  *	of their QoS TID or other priority field values.
+ * @IEEE80211_TX_CTRL_MLO_LINK: If not @IEEE80211_LINK_UNSPECIFIED, this
+ *	frame should be transmitted on the specific link. This really is
+ *	only relevant for frames that do not have data present, and is
+ *	also not used for 802.3 format frames.
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -880,8 +884,11 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_INTCFL_NEED_TXPROCESSING	= BIT(6),
 	IEEE80211_TX_CTRL_NO_SEQNO		= BIT(7),
 	IEEE80211_TX_CTRL_DONT_REORDER		= BIT(8),
+	IEEE80211_TX_CTRL_MLO_LINK		= 0xf0000000,
 };
 
+#define IEEE80211_LINK_UNSPECIFIED	0xf
+
 /**
  * enum mac80211_tx_status_flags - flags to describe transmit status
  *
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index e8a5e27a3dd1..17313954c628 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -2889,7 +2889,9 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
 	info->flags = info_flags;
 	info->ack_frame_id = info_id;
 	info->band = band;
-	info->control.flags = ctrl_flags;
+	info->control.flags = ctrl_flags |
+			      u32_encode_bits(IEEE80211_LINK_UNSPECIFIED,
+					      IEEE80211_TX_CTRL_MLO_LINK);
 
 	return skb;
  free:
@@ -3558,7 +3560,9 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT |
 		      IEEE80211_TX_CTL_DONTFRAG |
 		      (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
-	info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT;
+	info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT |
+			      u32_encode_bits(IEEE80211_LINK_UNSPECIFIED,
+					      IEEE80211_TX_CTRL_MLO_LINK);
 
 #ifdef CONFIG_MAC80211_DEBUGFS
 	if (local->force_tx_status)
@@ -5668,7 +5672,9 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
 				 struct sk_buff *skb, int tid,
 				 enum nl80211_band band)
 {
+	const struct ieee80211_hdr *hdr = (void *)skb->data;
 	int ac = ieee80211_ac_from_tid(tid);
+	unsigned int link;
 
 	skb_reset_mac_header(skb);
 	skb_set_queue_mapping(skb, ac);
@@ -5676,6 +5682,30 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
 
 	skb->dev = sdata->dev;
 
+	BUILD_BUG_ON(IEEE80211_LINK_UNSPECIFIED < IEEE80211_MLD_MAX_NUM_LINKS);
+	BUILD_BUG_ON(!FIELD_FIT(IEEE80211_TX_CTRL_MLO_LINK,
+				IEEE80211_LINK_UNSPECIFIED));
+
+	if (!sdata->vif.valid_links) {
+		link = 0;
+	} else if (memcmp(sdata->vif.addr, hdr->addr2, ETH_ALEN) == 0) {
+		/* address from the MLD */
+		link = IEEE80211_LINK_UNSPECIFIED;
+	} else {
+		/* otherwise must be addressed from a link */
+		for (link = 0; link < ARRAY_SIZE(sdata->vif.link_conf); link++) {
+			if (memcmp(sdata->vif.link_conf[link]->addr,
+				   hdr->addr2, ETH_ALEN) == 0)
+				break;
+		}
+
+		if (WARN_ON_ONCE(link == ARRAY_SIZE(sdata->vif.link_conf)))
+			link = ffs(sdata->vif.valid_links) - 1;
+	}
+
+	IEEE80211_SKB_CB(skb)->control.flags |=
+		u32_encode_bits(link, IEEE80211_TX_CTRL_MLO_LINK);
+
 	/*
 	 * The other path calling ieee80211_xmit is from the tasklet,
 	 * and while we can handle concurrent transmissions locking
-- 
cgit 


From cb71f1d136a635decf43c3b502ee34fb05640fcd Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 31 May 2022 23:20:08 +0200
Subject: wifi: mac80211: add sta link addition/removal

Add the necessary infrastructure, including a new driver
method, to add/remove links to/from a station. To do this,
refactor the link alloc/free a bit, splitting that so we
can do it without linking them, to handle failures better.

Note that a station entry must be created representing an
MLD or a non-MLD STA, it cannot change between the two.
When representing an MLD, the 'deflink' is used for the
first link, which might be removed later, in which case
the memory isn't reused.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h    |  13 +++-
 net/mac80211/cfg.c        |   2 +-
 net/mac80211/driver-ops.h |  21 +++++++
 net/mac80211/ibss.c       |   4 +-
 net/mac80211/mesh_plink.c |   2 +-
 net/mac80211/mlme.c       |   2 +-
 net/mac80211/ocb.c        |   2 +-
 net/mac80211/sta_info.c   | 156 +++++++++++++++++++++++++++++++++++++++-------
 net/mac80211/sta_info.h   |   8 ++-
 net/mac80211/trace.h      |  31 +++++++++
 10 files changed, 206 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 1bea225c0d4d..1bc9d1d9769a 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2150,7 +2150,6 @@ struct ieee80211_link_sta {
  * @max_tid_amsdu_len: Maximum A-MSDU size in bytes for this TID
  * @txq: per-TID data TX queues (if driver uses the TXQ abstraction); note that
  *	the last entry (%IEEE80211_NUM_TIDS) is used for non-data frames
- * @multi_link_sta: Identifies if this sta is a MLD STA
  * @deflink: This holds the default link STA information, for non MLO STA all link
  *	specific STA information is accessed through @deflink or through
  *	link[0] which points to address of @deflink. For MLO Link STA
@@ -2162,6 +2161,7 @@ struct ieee80211_link_sta {
  *	@deflink address and remaining would be allocated and the address
  *	would be assigned to link[link_id] where link_id is the id assigned
  *	by the AP.
+ * @valid_links: bitmap of valid links, or 0 for non-MLO
  */
 struct ieee80211_sta {
 	u8 addr[ETH_ALEN];
@@ -2199,7 +2199,7 @@ struct ieee80211_sta {
 
 	struct ieee80211_txq *txq[IEEE80211_NUM_TIDS + 1];
 
-	bool multi_link_sta;
+	u16 valid_links;
 	struct ieee80211_link_sta deflink;
 	struct ieee80211_link_sta *link[IEEE80211_MLD_MAX_NUM_LINKS];
 
@@ -4048,6 +4048,11 @@ struct ieee80211_prep_tx_info {
  *	The @old[] array contains pointers to the old bss_conf structures
  *	that were already removed, in case they're needed.
  *	This callback can sleep.
+ * @change_sta_links: Change the valid links of a station, similar to
+ *	@change_vif_links. This callback can sleep.
+ *	Note that a sta can also be inserted or removed with valid links,
+ *	i.e. passed to @sta_add/@sta_state with sta->valid_links not zero.
+ *	In fact, cannot change from having valid_links and not having them.
  */
 struct ieee80211_ops {
 	void (*tx)(struct ieee80211_hw *hw,
@@ -4395,6 +4400,10 @@ struct ieee80211_ops {
 				struct ieee80211_vif *vif,
 				u16 old_links, u16 new_links,
 				struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS]);
+	int (*change_sta_links)(struct ieee80211_hw *hw,
+				struct ieee80211_vif *vif,
+				struct ieee80211_sta *sta,
+				u16 old_links, u16 new_links);
 };
 
 /**
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index e30659b1242b..abc3fa5a8d7e 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1839,7 +1839,7 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev,
 	    !sdata->u.mgd.associated)
 		return -EINVAL;
 
-	sta = sta_info_alloc(sdata, mac, GFP_KERNEL);
+	sta = sta_info_alloc(sdata, mac, -1, GFP_KERNEL);
 	if (!sta)
 		return -ENOMEM;
 
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index c8133c84d7d5..52be89f8f0bc 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1554,4 +1554,25 @@ static inline int drv_change_vif_links(struct ieee80211_local *local,
 	return ret;
 }
 
+static inline int drv_change_sta_links(struct ieee80211_local *local,
+				       struct ieee80211_sub_if_data *sdata,
+				       struct ieee80211_sta *sta,
+				       u16 old_links, u16 new_links)
+{
+	int ret = -EOPNOTSUPP;
+
+	might_sleep();
+
+	if (!check_sdata_in_driver(sdata))
+		return -EIO;
+
+	trace_drv_change_sta_links(local, sdata, sta, old_links, new_links);
+	if (local->ops->change_sta_links)
+		ret = local->ops->change_sta_links(&local->hw, &sdata->vif, sta,
+						   old_links, new_links);
+	trace_drv_return_int(local, ret);
+
+	return ret;
+}
+
 #endif /* __MAC80211_DRIVER_OPS */
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 09c11c067cbf..65a3808dc92a 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -629,7 +629,7 @@ ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid,
 	scan_width = cfg80211_chandef_to_scan_width(&chanctx_conf->def);
 	rcu_read_unlock();
 
-	sta = sta_info_alloc(sdata, addr, GFP_KERNEL);
+	sta = sta_info_alloc(sdata, addr, -1, GFP_KERNEL);
 	if (!sta) {
 		rcu_read_lock();
 		return NULL;
@@ -1229,7 +1229,7 @@ void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata,
 	scan_width = cfg80211_chandef_to_scan_width(&chanctx_conf->def);
 	rcu_read_unlock();
 
-	sta = sta_info_alloc(sdata, addr, GFP_ATOMIC);
+	sta = sta_info_alloc(sdata, addr, -1, GFP_ATOMIC);
 	if (!sta)
 		return;
 
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index e24bd48bc915..fe54ac431202 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -510,7 +510,7 @@ __mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *hw_addr)
 	if (aid < 0)
 		return NULL;
 
-	sta = sta_info_alloc(sdata, hw_addr, GFP_KERNEL);
+	sta = sta_info_alloc(sdata, hw_addr, -1, GFP_KERNEL);
 	if (!sta)
 		return NULL;
 
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index b6ee8da07663..c2c997086553 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -5579,7 +5579,7 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
 	}
 
 	if (!have_sta) {
-		new_sta = sta_info_alloc(sdata, cbss->bssid, GFP_KERNEL);
+		new_sta = sta_info_alloc(sdata, cbss->bssid, -1, GFP_KERNEL);
 		if (!new_sta)
 			return -ENOMEM;
 	}
diff --git a/net/mac80211/ocb.c b/net/mac80211/ocb.c
index 468c741a9aeb..0fd29d9c496c 100644
--- a/net/mac80211/ocb.c
+++ b/net/mac80211/ocb.c
@@ -69,7 +69,7 @@ void ieee80211_ocb_rx_no_sta(struct ieee80211_sub_if_data *sdata,
 	scan_width = cfg80211_chandef_to_scan_width(&chanctx_conf->def);
 	rcu_read_unlock();
 
-	sta = sta_info_alloc(sdata, addr, GFP_ATOMIC);
+	sta = sta_info_alloc(sdata, addr, -1, GFP_ATOMIC);
 	if (!sta)
 		return;
 
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index f20ce7bbcb39..b1426a2459e8 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -64,6 +64,11 @@
  * freed before they are done using it.
  */
 
+struct sta_link_alloc {
+	struct link_sta_info info;
+	struct ieee80211_link_sta sta;
+};
+
 static const struct rhashtable_params sta_rht_params = {
 	.nelem_hint = 3, /* start small */
 	.automatic_shrinking = true,
@@ -245,17 +250,27 @@ struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata,
 	return NULL;
 }
 
-static void sta_info_free_links(struct sta_info *sta)
+static void sta_info_free_link(struct link_sta_info *link_sta)
 {
-	unsigned int link_id;
+	free_percpu(link_sta->pcpu_rx_stats);
+}
 
-	for (link_id = 0; link_id < ARRAY_SIZE(sta->link); link_id++) {
-		if (!sta->link[link_id])
-			continue;
-		free_percpu(sta->link[link_id]->pcpu_rx_stats);
+static void sta_remove_link(struct sta_info *sta, unsigned int link_id)
+{
+	struct sta_link_alloc *alloc = NULL;
 
-		if (sta->link[link_id] != &sta->deflink)
-			kfree(sta->link[link_id]);
+	if (WARN_ON(!sta->link[link_id]))
+		return;
+
+	if (sta->link[link_id] != &sta->deflink)
+		alloc = container_of(sta->link[link_id], typeof(*alloc), info);
+
+	sta->sta.valid_links &= ~BIT(link_id);
+	sta->link[link_id] = NULL;
+	sta->sta.link[link_id] = NULL;
+	if (alloc) {
+		sta_info_free_link(&alloc->info);
+		kfree(alloc);
 	}
 }
 
@@ -272,6 +287,15 @@ static void sta_info_free_links(struct sta_info *sta)
  */
 void sta_info_free(struct ieee80211_local *local, struct sta_info *sta)
 {
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(sta->link); i++) {
+		if (!(sta->sta.valid_links & BIT(i)))
+			continue;
+
+		sta_remove_link(sta, i);
+	}
+
 	/*
 	 * If we had used sta_info_pre_move_state() then we might not
 	 * have gone through the state transitions down again, so do
@@ -302,7 +326,7 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta)
 	kfree(sta->mesh);
 #endif
 
-	sta_info_free_links(sta);
+	sta_info_free_link(&sta->deflink);
 	kfree(sta);
 }
 
@@ -348,19 +372,13 @@ static int sta_prepare_rate_control(struct ieee80211_local *local,
 	return 0;
 }
 
-static int sta_info_init_link(struct sta_info *sta,
-			      unsigned int link_id,
-			      struct link_sta_info *link_info,
-			      struct ieee80211_link_sta *link_sta,
-			      gfp_t gfp)
+static int sta_info_alloc_link(struct ieee80211_local *local,
+			       struct link_sta_info *link_info,
+			       gfp_t gfp)
 {
-	struct ieee80211_local *local = sta->local;
 	struct ieee80211_hw *hw = &local->hw;
 	int i;
 
-	link_info->sta = sta;
-	link_info->link_id = link_id;
-
 	if (ieee80211_hw_check(hw, USES_RSS)) {
 		link_info->pcpu_rx_stats =
 			alloc_percpu_gfp(struct ieee80211_sta_rx_stats, gfp);
@@ -368,9 +386,6 @@ static int sta_info_init_link(struct sta_info *sta,
 			return -ENOMEM;
 	}
 
-	sta->link[link_id] = link_info;
-	sta->sta.link[link_id] = link_sta;
-
 	link_info->rx_stats.last_rx = jiffies;
 	u64_stats_init(&link_info->rx_stats.syncp);
 
@@ -382,8 +397,19 @@ static int sta_info_init_link(struct sta_info *sta,
 	return 0;
 }
 
+static void sta_info_add_link(struct sta_info *sta,
+			      unsigned int link_id,
+			      struct link_sta_info *link_info,
+			      struct ieee80211_link_sta *link_sta)
+{
+	link_info->sta = sta;
+	link_info->link_id = link_id;
+	sta->link[link_id] = link_info;
+	sta->sta.link[link_id] = link_sta;
+}
+
 struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
-				const u8 *addr, gfp_t gfp)
+				const u8 *addr, int link_id, gfp_t gfp)
 {
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_hw *hw = &local->hw;
@@ -397,9 +423,17 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
 	sta->local = local;
 	sta->sdata = sdata;
 
-	if (sta_info_init_link(sta, 0, &sta->deflink, &sta->sta.deflink, gfp))
+	if (sta_info_alloc_link(local, &sta->deflink, gfp))
 		return NULL;
 
+	if (link_id >= 0) {
+		sta_info_add_link(sta, link_id, &sta->deflink,
+				  &sta->sta.deflink);
+		sta->sta.valid_links = BIT(link_id);
+	} else {
+		sta_info_add_link(sta, 0, &sta->deflink, &sta->sta.deflink);
+	}
+
 	spin_lock_init(&sta->lock);
 	spin_lock_init(&sta->ps_lock);
 	INIT_WORK(&sta->drv_deliver_wk, sta_deliver_ps_frames);
@@ -565,7 +599,7 @@ free_txq:
 	if (sta->sta.txq[0])
 		kfree(to_txq_info(sta->sta.txq[0]));
 free:
-	sta_info_free_links(sta);
+	sta_info_free_link(&sta->deflink);
 #ifdef CONFIG_MAC80211_MESH
 	kfree(sta->mesh);
 #endif
@@ -2613,3 +2647,77 @@ void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta,
 
 	sta_update_codel_params(sta, thr);
 }
+
+int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id)
+{
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	struct sta_link_alloc *alloc;
+	int ret;
+
+	lockdep_assert_held(&sdata->local->sta_mtx);
+
+	/* must represent an MLD from the start */
+	if (WARN_ON(!sta->sta.valid_links))
+		return -EINVAL;
+
+	if (WARN_ON(sta->sta.valid_links & BIT(link_id) ||
+		    sta->link[link_id]))
+		return -EBUSY;
+
+	alloc = kzalloc(sizeof(*alloc), GFP_KERNEL);
+	if (!alloc)
+		return -ENOMEM;
+
+	ret = sta_info_alloc_link(sdata->local, &alloc->info, GFP_KERNEL);
+	if (ret) {
+		kfree(alloc);
+		return ret;
+	}
+
+	sta_info_add_link(sta, link_id, &alloc->info, &alloc->sta);
+
+	return 0;
+}
+
+int ieee80211_sta_activate_link(struct sta_info *sta, unsigned int link_id)
+{
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	u16 old_links = sta->sta.valid_links;
+	u16 new_links = old_links | BIT(link_id);
+	int ret;
+
+	lockdep_assert_held(&sdata->local->sta_mtx);
+
+	if (WARN_ON(old_links == new_links || !sta->link[link_id]))
+		return -EINVAL;
+
+	sta->sta.valid_links = new_links;
+
+	if (!test_sta_flag(sta, WLAN_STA_INSERTED))
+		return 0;
+
+	ret = drv_change_sta_links(sdata->local, sdata, &sta->sta,
+				   old_links, new_links);
+	if (ret) {
+		sta->sta.valid_links = old_links;
+		sta_remove_link(sta, link_id);
+	}
+
+	return ret;
+}
+
+void ieee80211_sta_remove_link(struct sta_info *sta, unsigned int link_id)
+{
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+
+	lockdep_assert_held(&sdata->local->sta_mtx);
+
+	sta->sta.valid_links &= ~BIT(link_id);
+
+	if (test_sta_flag(sta, WLAN_STA_INSERTED))
+		drv_change_sta_links(sdata->local, sdata, &sta->sta,
+				     sta->sta.valid_links,
+				     sta->sta.valid_links & ~BIT(link_id));
+
+	sta_remove_link(sta, link_id);
+}
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 2ff3d5fd0cbf..8ec65bb7d13e 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -623,7 +623,6 @@ struct link_sta_info {
  * @tdls_chandef: a TDLS peer can have a wider chandef that is compatible to
  *	the BSS one.
  * @frags: fragment cache
- * @multi_link_sta: Identifies if this sta is a MLD STA or regular STA
  * @deflink: This is the default link STA information, for non MLO STA all link
  *	specific STA information is accessed through @deflink or through
  *	link[0] which points to address of @deflink. For MLO Link STA
@@ -708,7 +707,6 @@ struct sta_info {
 
 	struct ieee80211_fragment_cache frags;
 
-	bool multi_link_sta;
 	struct link_sta_info deflink;
 	struct link_sta_info *link[IEEE80211_MLD_MAX_NUM_LINKS];
 
@@ -833,7 +831,7 @@ struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata,
  * until sta_info_insert().
  */
 struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
-				const u8 *addr, gfp_t gfp);
+				const u8 *addr, int link_id, gfp_t gfp);
 
 void sta_info_free(struct ieee80211_local *local, struct sta_info *sta);
 
@@ -892,6 +890,10 @@ u32 sta_get_expected_throughput(struct sta_info *sta);
 void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
 			  unsigned long exp_time);
 
+int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id);
+int ieee80211_sta_activate_link(struct sta_info *sta, unsigned int link_id);
+void ieee80211_sta_remove_link(struct sta_info *sta, unsigned int link_id);
+
 void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta);
 void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta);
 void ieee80211_sta_ps_deliver_uapsd(struct sta_info *sta);
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 256ebab13cda..9804634e7d99 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -2482,6 +2482,37 @@ TRACE_EVENT(drv_change_vif_links,
 	)
 );
 
+TRACE_EVENT(drv_change_sta_links,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata,
+		 struct ieee80211_sta *sta,
+		 u16 old_links, u16 new_links),
+
+	TP_ARGS(local, sdata, sta, old_links, new_links),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		VIF_ENTRY
+		STA_ENTRY
+		__field(u16, old_links)
+		__field(u16, new_links)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		VIF_ASSIGN;
+		STA_ASSIGN;
+		__entry->old_links = old_links;
+		__entry->new_links = new_links;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " old_links:0x%04x, new_links:0x%04x\n",
+		LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG,
+		__entry->old_links, __entry->new_links
+	)
+);
+
 /*
  * Tracing for API calls that drivers call.
  */
-- 
cgit 


From f2a0290b2df2b0e65ab78b71c4a15e732afd4458 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 10 Jun 2022 11:07:55 +0200
Subject: wifi: cfg80211: add optional link add/remove callbacks

Add some optional callbacks for link add/remove so that
drivers can react here. Initially, I thought it would be
sufficient to just create the link in start_ap etc., but
it turns out that's not so simple, since there are quite
a few callbacks that can be called: if they're erroneously
without start_ap, things might crash.

Thus it might be easier for drivers to allocate all the
necessary data structures immediately, to not have to
worry about it in each callback, since cfg80211 checks
that the link ID is valid (has been added.)

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h  | 12 ++++++++++++
 net/wireless/nl80211.c  | 14 +++++++++++++-
 net/wireless/rdev-ops.h | 26 ++++++++++++++++++++++++++
 net/wireless/trace.h    | 20 +++++++++++++++++++-
 4 files changed, 70 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index a4f9e6094118..5706f96b819a 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -3858,6 +3858,11 @@ struct mgmt_frame_regs {
  *	keep the struct wireless_dev's iftype updated.
  *	This additionally holds the RTNL to be able to do netdev changes.
  *
+ * @add_intf_link: Add a new MLO link to the given interface. Note that
+ *	the wdev->link[] data structure has been updated, so the new link
+ *	address is available.
+ * @del_intf_link: Remove an MLO link from the given interface.
+ *
  * @add_key: add a key with the given parameters. @mac_addr will be %NULL
  *	when adding a group key.
  *
@@ -4212,6 +4217,13 @@ struct cfg80211_ops {
 				       enum nl80211_iftype type,
 				       struct vif_params *params);
 
+	int	(*add_intf_link)(struct wiphy *wiphy,
+				 struct wireless_dev *wdev,
+				 unsigned int link_id);
+	void	(*del_intf_link)(struct wiphy *wiphy,
+				 struct wireless_dev *wdev,
+				 unsigned int link_id);
+
 	int	(*add_key)(struct wiphy *wiphy, struct net_device *netdev,
 			   u8 key_index, bool pairwise, const u8 *mac_addr,
 			   struct key_params *params);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 9bc66a21ac3a..0a69a5a6b74d 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -15556,9 +15556,11 @@ static int nl80211_set_fils_aad(struct sk_buff *skb,
 
 static int nl80211_add_link(struct sk_buff *skb, struct genl_info *info)
 {
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	unsigned int link_id = nl80211_link_id(info->attrs);
 	struct net_device *dev = info->user_ptr[1];
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	int ret;
 
 	if (!(wdev->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO))
 		return -EINVAL;
@@ -15578,13 +15580,20 @@ static int nl80211_add_link(struct sk_buff *skb, struct genl_info *info)
 	wdev->valid_links |= BIT(link_id);
 	ether_addr_copy(wdev->links[link_id].addr,
 			nla_data(info->attrs[NL80211_ATTR_MAC]));
+
+	ret = rdev_add_intf_link(rdev, wdev, link_id);
+	if (ret) {
+		wdev->valid_links &= ~BIT(link_id);
+		eth_zero_addr(wdev->links[link_id].addr);
+	}
 	wdev_unlock(wdev);
 
-	return 0;
+	return ret;
 }
 
 static int nl80211_remove_link(struct sk_buff *skb, struct genl_info *info)
 {
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	unsigned int link_id = nl80211_link_id(info->attrs);
 	struct net_device *dev = info->user_ptr[1];
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
@@ -15604,6 +15613,9 @@ static int nl80211_remove_link(struct sk_buff *skb, struct genl_info *info)
 
 	wdev_lock(wdev);
 	wdev->valid_links &= ~BIT(link_id);
+
+	rdev_del_intf_link(rdev, wdev, link_id);
+
 	eth_zero_addr(wdev->links[link_id].addr);
 	wdev_unlock(wdev);
 
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index d2300eff03ae..a329ba036989 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -1422,4 +1422,30 @@ rdev_set_radar_background(struct cfg80211_registered_device *rdev,
 	return ret;
 }
 
+static inline int
+rdev_add_intf_link(struct cfg80211_registered_device *rdev,
+		   struct wireless_dev *wdev,
+		   unsigned int link_id)
+{
+	int ret = 0;
+
+	trace_rdev_add_intf_link(&rdev->wiphy, wdev, link_id);
+	if (rdev->ops->add_intf_link)
+		ret = rdev->ops->add_intf_link(&rdev->wiphy, wdev, link_id);
+	trace_rdev_return_int(&rdev->wiphy, ret);
+
+	return ret;
+}
+
+static inline void
+rdev_del_intf_link(struct cfg80211_registered_device *rdev,
+		   struct wireless_dev *wdev,
+		   unsigned int link_id)
+{
+	trace_rdev_del_intf_link(&rdev->wiphy, wdev, link_id);
+	if (rdev->ops->add_intf_link)
+		rdev->ops->add_intf_link(&rdev->wiphy, wdev, link_id);
+	trace_rdev_return_void(&rdev->wiphy);
+}
+
 #endif /* __CFG80211_RDEV_OPS */
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 92742430958e..65f8b814ecd0 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2064,7 +2064,7 @@ TRACE_EVENT(rdev_set_noack_map,
 		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->noack_map)
 );
 
-TRACE_EVENT(rdev_get_channel,
+DECLARE_EVENT_CLASS(wiphy_wdev_link_evt,
 	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
 		 unsigned int link_id),
 	TP_ARGS(wiphy, wdev, link_id),
@@ -2082,6 +2082,12 @@ TRACE_EVENT(rdev_get_channel,
 		  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id)
 );
 
+DEFINE_EVENT(wiphy_wdev_link_evt, rdev_get_channel,
+	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
+		 unsigned int link_id),
+	TP_ARGS(wiphy, wdev, link_id)
+);
+
 TRACE_EVENT(rdev_return_chandef,
 	TP_PROTO(struct wiphy *wiphy, int ret,
 		 struct cfg80211_chan_def *chandef),
@@ -2823,6 +2829,18 @@ TRACE_EVENT(rdev_set_radar_background,
 		  WIPHY_PR_ARG, CHAN_DEF_PR_ARG)
 );
 
+DEFINE_EVENT(wiphy_wdev_link_evt, rdev_add_intf_link,
+	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
+		 unsigned int link_id),
+	TP_ARGS(wiphy, wdev, link_id)
+);
+
+DEFINE_EVENT(wiphy_wdev_link_evt, rdev_del_intf_link,
+	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
+		 unsigned int link_id),
+	TP_ARGS(wiphy, wdev, link_id)
+);
+
 /*************************************************************
  *	     cfg80211 exported functions traces		     *
  *************************************************************/
-- 
cgit 


From ae7ba17b49b6707e62f31643dda25592c29482f8 Mon Sep 17 00:00:00 2001
From: Shaul Triebitz <shaul.triebitz@intel.com>
Date: Thu, 2 Jun 2022 15:08:16 +0300
Subject: wifi: mac80211: pass the link id in start/stop ap

In start_ap and stop_ap mac80211 callbacks pass the link_id
to the drivers.

Signed-off-by: Shaul Triebitz <shaul.triebitz@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 40 +++++++++++++++++++----
 drivers/net/wireless/realtek/rtw88/mac80211.c     |  3 +-
 drivers/net/wireless/realtek/rtw89/mac80211.c     |  6 ++--
 drivers/net/wireless/silabs/wfx/sta.c             |  6 ++--
 drivers/net/wireless/silabs/wfx/sta.h             |  6 ++--
 include/net/mac80211.h                            |  6 ++--
 net/mac80211/cfg.c                                |  4 +--
 net/mac80211/driver-ops.h                         | 15 +++++----
 net/mac80211/trace.h                              | 36 ++++++++++++++++----
 net/mac80211/util.c                               |  2 +-
 10 files changed, 93 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
index 2539dc5aaa3f..eaffc3163bd1 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
@@ -2396,7 +2396,8 @@ static void iwl_mvm_bss_info_changed_station(struct iwl_mvm *mvm,
 }
 
 static int iwl_mvm_start_ap_ibss(struct ieee80211_hw *hw,
-				 struct ieee80211_vif *vif)
+				 struct ieee80211_vif *vif,
+				 unsigned int link_id)
 {
 	struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw);
 	struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif);
@@ -2522,8 +2523,22 @@ out_unlock:
 	return ret;
 }
 
+static int iwl_mvm_start_ap(struct ieee80211_hw *hw,
+			    struct ieee80211_vif *vif,
+			    unsigned int link_id)
+{
+	return iwl_mvm_start_ap_ibss(hw, vif, link_id);
+}
+
+static int iwl_mvm_start_ibss(struct ieee80211_hw *hw,
+			      struct ieee80211_vif *vif)
+{
+	return iwl_mvm_start_ap_ibss(hw, vif, 0);
+}
+
 static void iwl_mvm_stop_ap_ibss(struct ieee80211_hw *hw,
-				 struct ieee80211_vif *vif)
+				 struct ieee80211_vif *vif,
+				 unsigned int link_id)
 {
 	struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw);
 	struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif);
@@ -2586,6 +2601,19 @@ static void iwl_mvm_stop_ap_ibss(struct ieee80211_hw *hw,
 	mutex_unlock(&mvm->mutex);
 }
 
+static void iwl_mvm_stop_ap(struct ieee80211_hw *hw,
+			    struct ieee80211_vif *vif,
+			    unsigned int link_id)
+{
+	iwl_mvm_stop_ap_ibss(hw, vif, link_id);
+}
+
+static void iwl_mvm_stop_ibss(struct ieee80211_hw *hw,
+			      struct ieee80211_vif *vif)
+{
+	iwl_mvm_stop_ap_ibss(hw, vif, 0);
+}
+
 static void
 iwl_mvm_bss_info_changed_ap_ibss(struct iwl_mvm *mvm,
 				 struct ieee80211_vif *vif,
@@ -5408,10 +5436,10 @@ const struct ieee80211_ops iwl_mvm_hw_ops = {
 	.unassign_vif_chanctx = iwl_mvm_unassign_vif_chanctx,
 	.switch_vif_chanctx = iwl_mvm_switch_vif_chanctx,
 
-	.start_ap = iwl_mvm_start_ap_ibss,
-	.stop_ap = iwl_mvm_stop_ap_ibss,
-	.join_ibss = iwl_mvm_start_ap_ibss,
-	.leave_ibss = iwl_mvm_stop_ap_ibss,
+	.start_ap = iwl_mvm_start_ap,
+	.stop_ap = iwl_mvm_stop_ap,
+	.join_ibss = iwl_mvm_start_ibss,
+	.leave_ibss = iwl_mvm_stop_ibss,
 
 	.tx_last_beacon = iwl_mvm_tx_last_beacon,
 
diff --git a/drivers/net/wireless/realtek/rtw88/mac80211.c b/drivers/net/wireless/realtek/rtw88/mac80211.c
index 9e0b5692fbab..ba60ca7ecdbf 100644
--- a/drivers/net/wireless/realtek/rtw88/mac80211.c
+++ b/drivers/net/wireless/realtek/rtw88/mac80211.c
@@ -429,7 +429,8 @@ static void rtw_ops_bss_info_changed(struct ieee80211_hw *hw,
 	mutex_unlock(&rtwdev->mutex);
 }
 
-static int rtw_ops_start_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
+static int rtw_ops_start_ap(struct ieee80211_hw *hw,
+			    struct ieee80211_vif *vif, unsigned int link_id)
 {
 	struct rtw_dev *rtwdev = hw->priv;
 	struct rtw_chip_info *chip = rtwdev->chip;
diff --git a/drivers/net/wireless/realtek/rtw89/mac80211.c b/drivers/net/wireless/realtek/rtw89/mac80211.c
index 5afb8fe5708e..0e3c95cf4c99 100644
--- a/drivers/net/wireless/realtek/rtw89/mac80211.c
+++ b/drivers/net/wireless/realtek/rtw89/mac80211.c
@@ -381,7 +381,8 @@ static void rtw89_ops_bss_info_changed(struct ieee80211_hw *hw,
 	mutex_unlock(&rtwdev->mutex);
 }
 
-static int rtw89_ops_start_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
+static int rtw89_ops_start_ap(struct ieee80211_hw *hw,
+			      struct ieee80211_vif *vif, unsigned int link_id)
 {
 	struct rtw89_dev *rtwdev = hw->priv;
 	struct rtw89_vif *rtwvif = (struct rtw89_vif *)vif->drv_priv;
@@ -401,7 +402,8 @@ static int rtw89_ops_start_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif
 }
 
 static
-void rtw89_ops_stop_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
+void rtw89_ops_stop_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+		       unsigned int link_id)
 {
 	struct rtw89_dev *rtwdev = hw->priv;
 	struct rtw89_vif *rtwvif = (struct rtw89_vif *)vif->drv_priv;
diff --git a/drivers/net/wireless/silabs/wfx/sta.c b/drivers/net/wireless/silabs/wfx/sta.c
index 97a631ff1bfa..3ae93573f787 100644
--- a/drivers/net/wireless/silabs/wfx/sta.c
+++ b/drivers/net/wireless/silabs/wfx/sta.c
@@ -378,7 +378,8 @@ static void wfx_set_mfp_ap(struct wfx_vif *wvif)
 	}
 }
 
-int wfx_start_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
+int wfx_start_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+		 unsigned int link_id)
 {
 	struct wfx_vif *wvif = (struct wfx_vif *)vif->drv_priv;
 	struct wfx_dev *wdev = wvif->wdev;
@@ -396,7 +397,8 @@ int wfx_start_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
 	return ret;
 }
 
-void wfx_stop_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
+void wfx_stop_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+		 unsigned int link_id)
 {
 	struct wfx_vif *wvif = (struct wfx_vif *)vif->drv_priv;
 
diff --git a/drivers/net/wireless/silabs/wfx/sta.h b/drivers/net/wireless/silabs/wfx/sta.h
index 3109d257fe94..93ea992de1d7 100644
--- a/drivers/net/wireless/silabs/wfx/sta.h
+++ b/drivers/net/wireless/silabs/wfx/sta.h
@@ -29,8 +29,10 @@ void wfx_configure_filter(struct ieee80211_hw *hw, unsigned int changed_flags,
 
 int wfx_add_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
 void wfx_remove_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
-int wfx_start_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
-void wfx_stop_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
+int wfx_start_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+		 unsigned int link_id);
+void wfx_stop_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+		 unsigned int link_id);
 int wfx_join_ibss(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
 void wfx_leave_ibss(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
 int wfx_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 1bc9d1d9769a..48c688fdf23d 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4084,8 +4084,10 @@ struct ieee80211_ops {
 				  struct ieee80211_vif *vif,
 				  unsigned int link_id, u64 changed);
 
-	int (*start_ap)(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
-	void (*stop_ap)(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
+	int (*start_ap)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+			unsigned int link_id);
+	void (*stop_ap)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+			unsigned int link_id);
 
 	u64 (*prepare_multicast)(struct ieee80211_hw *hw,
 				 struct netdev_hw_addr_list *mc_list);
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index f1ee73d96dfb..009f1723c990 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1287,7 +1287,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 		changed |= BSS_CHANGED_UNSOL_BCAST_PROBE_RESP;
 	}
 
-	err = drv_start_ap(sdata->local, sdata);
+	err = drv_start_ap(sdata->local, sdata, link_id);
 	if (err) {
 		old = sdata_dereference(link->u.ap.beacon, sdata);
 
@@ -1442,7 +1442,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
 				   GFP_KERNEL);
 	}
 
-	drv_stop_ap(sdata->local, sdata);
+	drv_stop_ap(sdata->local, sdata, link_id);
 
 	/* free all potentially still buffered bcast frames */
 	local->total_ps_buffered -= skb_queue_len(&sdata->u.ap.ps.bc_buf);
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 52be89f8f0bc..db38c8cc9d8f 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -985,7 +985,8 @@ int drv_switch_vif_chanctx(struct ieee80211_local *local,
 			   int n_vifs, enum ieee80211_chanctx_switch_mode mode);
 
 static inline int drv_start_ap(struct ieee80211_local *local,
-			       struct ieee80211_sub_if_data *sdata)
+			       struct ieee80211_sub_if_data *sdata,
+			       unsigned int link_id)
 {
 	int ret = 0;
 
@@ -994,22 +995,24 @@ static inline int drv_start_ap(struct ieee80211_local *local,
 	if (!check_sdata_in_driver(sdata))
 		return -EIO;
 
-	trace_drv_start_ap(local, sdata, &sdata->vif.bss_conf);
+	trace_drv_start_ap(local, sdata, sdata->vif.link_conf[link_id],
+			   link_id);
 	if (local->ops->start_ap)
-		ret = local->ops->start_ap(&local->hw, &sdata->vif);
+		ret = local->ops->start_ap(&local->hw, &sdata->vif, link_id);
 	trace_drv_return_int(local, ret);
 	return ret;
 }
 
 static inline void drv_stop_ap(struct ieee80211_local *local,
-			       struct ieee80211_sub_if_data *sdata)
+			       struct ieee80211_sub_if_data *sdata,
+			       unsigned int link_id)
 {
 	if (!check_sdata_in_driver(sdata))
 		return;
 
-	trace_drv_stop_ap(local, sdata);
+	trace_drv_stop_ap(local, sdata, link_id);
 	if (local->ops->stop_ap)
-		local->ops->stop_ap(&local->hw, &sdata->vif);
+		local->ops->stop_ap(&local->hw, &sdata->vif, link_id);
 	trace_drv_return_void(local);
 }
 
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 9804634e7d99..f96e7cdca4c2 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -1752,13 +1752,15 @@ DEFINE_EVENT(local_sdata_chanctx, drv_unassign_vif_chanctx,
 TRACE_EVENT(drv_start_ap,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
-		 struct ieee80211_bss_conf *info),
+		 struct ieee80211_bss_conf *info,
+		 unsigned int link_id),
 
-	TP_ARGS(local, sdata, info),
+	TP_ARGS(local, sdata, info, link_id),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
 		VIF_ENTRY
+		__field(u32, link_id)
 		__field(u8, dtimper)
 		__field(u16, bcnint)
 		__dynamic_array(u8, ssid, sdata->vif.cfg.ssid_len)
@@ -1768,6 +1770,7 @@ TRACE_EVENT(drv_start_ap,
 	TP_fast_assign(
 		LOCAL_ASSIGN;
 		VIF_ASSIGN;
+		__entry->link_id = link_id;
 		__entry->dtimper = info->dtim_period;
 		__entry->bcnint = info->beacon_int;
 		memcpy(__get_dynamic_array(ssid),
@@ -1777,15 +1780,34 @@ TRACE_EVENT(drv_start_ap,
 	),
 
 	TP_printk(
-		LOCAL_PR_FMT  VIF_PR_FMT,
-		LOCAL_PR_ARG, VIF_PR_ARG
+		LOCAL_PR_FMT  VIF_PR_FMT " link id %u",
+		LOCAL_PR_ARG, VIF_PR_ARG, __entry->link_id
 	)
 );
 
-DEFINE_EVENT(local_sdata_evt, drv_stop_ap,
+TRACE_EVENT(drv_stop_ap,
 	TP_PROTO(struct ieee80211_local *local,
-		 struct ieee80211_sub_if_data *sdata),
-	TP_ARGS(local, sdata)
+		 struct ieee80211_sub_if_data *sdata,
+		 unsigned int link_id),
+
+	TP_ARGS(local, sdata, link_id),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		VIF_ENTRY
+		__field(u32, link_id)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		VIF_ASSIGN;
+		__entry->link_id = link_id;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT  VIF_PR_FMT " link id %u",
+		LOCAL_PR_ARG, VIF_PR_ARG, __entry->link_id
+	)
 );
 
 TRACE_EVENT(drv_reconfig_complete,
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index ecda655e7481..bccc3a309ed0 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -2568,7 +2568,7 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 				changed |= BSS_CHANGED_AP_PROBE_RESP;
 
 				if (rcu_access_pointer(sdata->deflink.u.ap.beacon))
-					drv_start_ap(local, sdata);
+					drv_start_ap(local, sdata, 0);
 			}
 			fallthrough;
 		case NL80211_IFTYPE_MESH_POINT:
-- 
cgit 


From 6e8912a503759bb8f1f01c5b761d0d45815fa6de Mon Sep 17 00:00:00 2001
From: Shaul Triebitz <shaul.triebitz@intel.com>
Date: Mon, 6 Jun 2022 14:25:54 +0300
Subject: wifi: mac80211: return a beacon for a specific link

Pass the link id through to the get_beacon and return
the beacon for a specific link id.

Signed-off-by: Shaul Triebitz <shaul.triebitz@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath10k/mac.c              |  2 +-
 drivers/net/wireless/ath/ath10k/wmi.c              |  2 +-
 drivers/net/wireless/ath/ath11k/mac.c              |  2 +-
 drivers/net/wireless/ath/ath5k/base.c              |  2 +-
 drivers/net/wireless/ath/ath9k/beacon.c            |  2 +-
 drivers/net/wireless/ath/ath9k/htc_drv_beacon.c    |  2 +-
 drivers/net/wireless/ath/carl9170/tx.c             |  2 +-
 drivers/net/wireless/ath/wcn36xx/main.c            |  2 +-
 drivers/net/wireless/broadcom/b43/main.c           |  2 +-
 drivers/net/wireless/broadcom/b43legacy/main.c     |  2 +-
 .../broadcom/brcm80211/brcmsmac/mac80211_if.c      |  4 +-
 drivers/net/wireless/intel/iwlegacy/common.c       |  2 +-
 drivers/net/wireless/intel/iwlwifi/dvm/main.c      |  4 +-
 drivers/net/wireless/intel/iwlwifi/dvm/rxon.c      |  2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c   |  2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c  |  2 +-
 drivers/net/wireless/intersil/p54/main.c           |  2 +-
 drivers/net/wireless/mac80211_hwsim.c              |  2 +-
 drivers/net/wireless/marvell/libertas_tf/main.c    |  4 +-
 drivers/net/wireless/marvell/mwl8k.c               |  2 +-
 drivers/net/wireless/mediatek/mt76/mt7603/beacon.c |  2 +-
 drivers/net/wireless/mediatek/mt76/mt7615/mcu.c    |  4 +-
 .../net/wireless/mediatek/mt76/mt76x02_beacon.c    |  2 +-
 drivers/net/wireless/mediatek/mt76/mt7915/mcu.c    |  2 +-
 drivers/net/wireless/mediatek/mt76/mt7921/mcu.c    |  2 +-
 drivers/net/wireless/purelifi/plfxlc/mac.c         |  4 +-
 drivers/net/wireless/ralink/rt2x00/rt2x00queue.c   |  2 +-
 drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c |  2 +-
 drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c |  2 +-
 drivers/net/wireless/realtek/rtlwifi/core.c        |  2 +-
 drivers/net/wireless/realtek/rtlwifi/pci.c         |  2 +-
 drivers/net/wireless/realtek/rtw88/fw.c            |  2 +-
 drivers/net/wireless/realtek/rtw89/fw.c            |  3 +-
 drivers/net/wireless/rsi/rsi_91x_hal.c             |  2 +-
 drivers/net/wireless/silabs/wfx/sta.c              |  6 +-
 drivers/net/wireless/st/cw1200/sta.c               |  4 +-
 drivers/net/wireless/ti/wl1251/main.c              |  2 +-
 drivers/net/wireless/ti/wlcore/main.c              |  4 +-
 drivers/net/wireless/zydas/zd1211rw/zd_mac.c       |  9 +--
 drivers/staging/vt6655/rxtx.c                      |  2 +-
 drivers/staging/vt6656/rxtx.c                      |  2 +-
 include/net/mac80211.h                             | 14 ++--
 net/mac80211/tx.c                                  | 75 +++++++++++++---------
 43 files changed, 109 insertions(+), 88 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index 4808ed6a6e20..3d111d6447f0 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -1630,7 +1630,7 @@ static int ath10k_mac_setup_bcn_tmpl(struct ath10k_vif *arvif)
 	    arvif->vdev_type != WMI_VDEV_TYPE_IBSS)
 		return 0;
 
-	bcn = ieee80211_beacon_get_template(hw, vif, &offs);
+	bcn = ieee80211_beacon_get_template(hw, vif, &offs, 0);
 	if (!bcn) {
 		ath10k_warn(ar, "failed to get beacon template from mac80211\n");
 		return -EPERM;
diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c
index af19cab24c76..074d8ba5072a 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.c
+++ b/drivers/net/wireless/ath/ath10k/wmi.c
@@ -3888,7 +3888,7 @@ void ath10k_wmi_event_host_swba(struct ath10k *ar, struct sk_buff *skb)
 			continue;
 		}
 
-		bcn = ieee80211_beacon_get(ar->hw, arvif->vif);
+		bcn = ieee80211_beacon_get(ar->hw, arvif->vif, 0);
 		if (!bcn) {
 			ath10k_warn(ar, "could not get mac80211 beacon\n");
 			continue;
diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c
index 96a33b49a52f..17dbc7d9cf29 100644
--- a/drivers/net/wireless/ath/ath11k/mac.c
+++ b/drivers/net/wireless/ath/ath11k/mac.c
@@ -1362,7 +1362,7 @@ static int ath11k_mac_setup_bcn_tmpl(struct ath11k_vif *arvif)
 	if (arvif->vdev_type != WMI_VDEV_TYPE_AP)
 		return 0;
 
-	bcn = ieee80211_beacon_get_template(hw, vif, &offs);
+	bcn = ieee80211_beacon_get_template(hw, vif, &offs, 0);
 	if (!bcn) {
 		ath11k_warn(ab, "failed to get beacon template from mac80211\n");
 		return -EPERM;
diff --git a/drivers/net/wireless/ath/ath5k/base.c b/drivers/net/wireless/ath/ath5k/base.c
index 66d123f48085..85c982e0a1cd 100644
--- a/drivers/net/wireless/ath/ath5k/base.c
+++ b/drivers/net/wireless/ath/ath5k/base.c
@@ -1946,7 +1946,7 @@ ath5k_beacon_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
 		goto out;
 	}
 
-	skb = ieee80211_beacon_get(hw, vif);
+	skb = ieee80211_beacon_get(hw, vif, 0);
 
 	if (!skb) {
 		ret = -ENOMEM;
diff --git a/drivers/net/wireless/ath/ath9k/beacon.c b/drivers/net/wireless/ath/ath9k/beacon.c
index d41988f8ea3f..ee72faac2f1d 100644
--- a/drivers/net/wireless/ath/ath9k/beacon.c
+++ b/drivers/net/wireless/ath/ath9k/beacon.c
@@ -135,7 +135,7 @@ static struct ath_buf *ath9k_beacon_generate(struct ieee80211_hw *hw,
 		bf->bf_mpdu = NULL;
 	}
 
-	skb = ieee80211_beacon_get(hw, vif);
+	skb = ieee80211_beacon_get(hw, vif, 0);
 	if (skb == NULL)
 		return NULL;
 
diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_beacon.c b/drivers/net/wireless/ath/ath9k/htc_drv_beacon.c
index 468bc934d848..533471e69400 100644
--- a/drivers/net/wireless/ath/ath9k/htc_drv_beacon.c
+++ b/drivers/net/wireless/ath/ath9k/htc_drv_beacon.c
@@ -215,7 +215,7 @@ static void ath9k_htc_send_beacon(struct ath9k_htc_priv *priv,
 	}
 
 	/* Get a new beacon */
-	beacon = ieee80211_beacon_get(priv->hw, vif);
+	beacon = ieee80211_beacon_get(priv->hw, vif, 0);
 	if (!beacon) {
 		spin_unlock_bh(&priv->beacon_lock);
 		return;
diff --git a/drivers/net/wireless/ath/carl9170/tx.c b/drivers/net/wireless/ath/carl9170/tx.c
index 514f568d9d07..6bb9aa2bfe65 100644
--- a/drivers/net/wireless/ath/carl9170/tx.c
+++ b/drivers/net/wireless/ath/carl9170/tx.c
@@ -1628,7 +1628,7 @@ int carl9170_update_beacon(struct ar9170 *ar, const bool submit)
 		goto out_unlock;
 
 	skb = ieee80211_beacon_get_tim(ar->hw, carl9170_get_vif(cvif),
-		NULL, NULL);
+				       NULL, NULL, 0);
 
 	if (!skb) {
 		err = -ENOMEM;
diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c
index 04fb26cb8322..ace8641909bd 100644
--- a/drivers/net/wireless/ath/wcn36xx/main.c
+++ b/drivers/net/wireless/ath/wcn36xx/main.c
@@ -1010,7 +1010,7 @@ static void wcn36xx_bss_info_changed(struct ieee80211_hw *hw,
 			wcn36xx_smd_config_bss(wcn, vif, NULL,
 					       vif->addr, false);
 			skb = ieee80211_beacon_get_tim(hw, vif, &tim_off,
-						       &tim_len);
+						       &tim_len, 0);
 			if (!skb) {
 				wcn36xx_err("failed to alloc beacon skb\n");
 				goto out;
diff --git a/drivers/net/wireless/broadcom/b43/main.c b/drivers/net/wireless/broadcom/b43/main.c
index e3121a9d0579..6b4188066f0b 100644
--- a/drivers/net/wireless/broadcom/b43/main.c
+++ b/drivers/net/wireless/broadcom/b43/main.c
@@ -1832,7 +1832,7 @@ static void b43_update_templates(struct b43_wl *wl)
 	 * the TIM field, but that would probably require resizing and
 	 * moving of data within the beacon template.
 	 * Simply request a new beacon and let mac80211 do the hard work. */
-	beacon = ieee80211_beacon_get(wl->hw, wl->vif);
+	beacon = ieee80211_beacon_get(wl->hw, wl->vif, 0);
 	if (unlikely(!beacon))
 		return;
 
diff --git a/drivers/net/wireless/broadcom/b43legacy/main.c b/drivers/net/wireless/broadcom/b43legacy/main.c
index 96d5a034c09b..532013184389 100644
--- a/drivers/net/wireless/broadcom/b43legacy/main.c
+++ b/drivers/net/wireless/broadcom/b43legacy/main.c
@@ -1241,7 +1241,7 @@ static void b43legacy_update_templates(struct b43legacy_wl *wl)
 	 * field, but that would probably require resizing and moving of data
 	 * within the beacon template. Simply request a new beacon and let
 	 * mac80211 do the hard work. */
-	beacon = ieee80211_beacon_get(wl->hw, wl->vif);
+	beacon = ieee80211_beacon_get(wl->hw, wl->vif, 0);
 	if (unlikely(!beacon))
 		return;
 
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
index fd3c131c622c..61d7404aded4 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
@@ -678,7 +678,7 @@ brcms_ops_bss_info_changed(struct ieee80211_hw *hw,
 		u16 tim_offset = 0;
 
 		spin_lock_bh(&wl->lock);
-		beacon = ieee80211_beacon_get_tim(hw, vif, &tim_offset, NULL);
+		beacon = ieee80211_beacon_get_tim(hw, vif, &tim_offset, NULL, 0);
 		brcms_c_set_new_beacon(wl->wlc, beacon, tim_offset,
 				       info->dtim_period);
 		spin_unlock_bh(&wl->lock);
@@ -950,7 +950,7 @@ static int brcms_ops_beacon_set_tim(struct ieee80211_hw *hw,
 	spin_lock_bh(&wl->lock);
 	if (wl->wlc->vif)
 		beacon = ieee80211_beacon_get_tim(hw, wl->wlc->vif,
-						  &tim_offset, NULL);
+						  &tim_offset, NULL, 0);
 	if (beacon)
 		brcms_c_set_new_beacon(wl->wlc, beacon, tim_offset,
 				       wl->wlc->vif->bss_conf.dtim_period);
diff --git a/drivers/net/wireless/intel/iwlegacy/common.c b/drivers/net/wireless/intel/iwlegacy/common.c
index affad34f70f0..6f007e63f0c2 100644
--- a/drivers/net/wireless/intel/iwlegacy/common.c
+++ b/drivers/net/wireless/intel/iwlegacy/common.c
@@ -5276,7 +5276,7 @@ il_beacon_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
 	struct il_priv *il = hw->priv;
 	unsigned long flags;
 	__le64 timestamp;
-	struct sk_buff *skb = ieee80211_beacon_get(hw, vif);
+	struct sk_buff *skb = ieee80211_beacon_get(hw, vif, 0);
 
 	if (!skb)
 		return;
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/main.c b/drivers/net/wireless/intel/iwlwifi/dvm/main.c
index caf452922dbd..a873be109f43 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/main.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/main.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /******************************************************************************
  *
- * Copyright(c) 2003 - 2014, 2018 - 2021  Intel Corporation. All rights reserved.
+ * Copyright(c) 2003 - 2014, 2018 - 2022  Intel Corporation. All rights reserved.
  * Copyright(c) 2015 Intel Deutschland GmbH
  *
  * Portions of this file are derived from the ipw3945 project, as well
@@ -284,7 +284,7 @@ static void iwl_bg_beacon_update(struct work_struct *work)
 	}
 
 	/* Pull updated AP beacon from mac80211. will fail if not in AP mode */
-	beacon = ieee80211_beacon_get(priv->hw, priv->beacon_ctx->vif);
+	beacon = ieee80211_beacon_get(priv->hw, priv->beacon_ctx->vif, 0);
 	if (!beacon) {
 		IWL_ERR(priv, "update beacon failed -- keeping old\n");
 		goto out;
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c b/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c
index 45e382fe45a2..f80cce37e2c0 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c
@@ -183,7 +183,7 @@ static int iwlagn_update_beacon(struct iwl_priv *priv,
 	lockdep_assert_held(&priv->mutex);
 
 	dev_kfree_skb(priv->beacon_skb);
-	priv->beacon_skb = ieee80211_beacon_get(priv->hw, vif);
+	priv->beacon_skb = ieee80211_beacon_get(priv->hw, vif, 0);
 	if (!priv->beacon_skb)
 		return -ENOMEM;
 	return iwlagn_send_beacon_cmd(priv);
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c
index 49898fd99594..c0bd697b080a 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c
@@ -1233,7 +1233,7 @@ static int _iwl_dbgfs_inject_beacon_ie(struct iwl_mvm *mvm, char *bin, int len)
 
 	mvm->hw->extra_beacon_tailroom = len;
 
-	beacon = ieee80211_beacon_get_template(mvm->hw, vif, NULL);
+	beacon = ieee80211_beacon_get_template(mvm->hw, vif, NULL, 0);
 	if (!beacon)
 		goto out_err;
 
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
index 65df8cbb57d9..ed586e6d7d64 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
@@ -1002,7 +1002,7 @@ int iwl_mvm_mac_ctxt_beacon_changed(struct iwl_mvm *mvm,
 	WARN_ON(vif->type != NL80211_IFTYPE_AP &&
 		vif->type != NL80211_IFTYPE_ADHOC);
 
-	beacon = ieee80211_beacon_get_template(mvm->hw, vif, NULL);
+	beacon = ieee80211_beacon_get_template(mvm->hw, vif, NULL, 0);
 	if (!beacon)
 		return -ENOMEM;
 
diff --git a/drivers/net/wireless/intersil/p54/main.c b/drivers/net/wireless/intersil/p54/main.c
index cc6769a4fec7..115be1f3f33d 100644
--- a/drivers/net/wireless/intersil/p54/main.c
+++ b/drivers/net/wireless/intersil/p54/main.c
@@ -139,7 +139,7 @@ static int p54_beacon_update(struct p54_common *priv,
 	struct sk_buff *beacon;
 	int ret;
 
-	beacon = ieee80211_beacon_get(priv->hw, vif);
+	beacon = ieee80211_beacon_get(priv->hw, vif, 0);
 	if (!beacon)
 		return -ENOMEM;
 	ret = p54_beacon_format_ie_tim(beacon);
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index be83904c642b..44174a4c2117 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -1908,7 +1908,7 @@ static void mac80211_hwsim_beacon_tx(void *arg, u8 *mac,
 	    vif->type != NL80211_IFTYPE_OCB)
 		return;
 
-	skb = ieee80211_beacon_get(hw, vif);
+	skb = ieee80211_beacon_get(hw, vif, 0);
 	if (skb == NULL)
 		return;
 	info = IEEE80211_SKB_CB(skb);
diff --git a/drivers/net/wireless/marvell/libertas_tf/main.c b/drivers/net/wireless/marvell/libertas_tf/main.c
index 21c3e0bdc444..74c4942b9a5a 100644
--- a/drivers/net/wireless/marvell/libertas_tf/main.c
+++ b/drivers/net/wireless/marvell/libertas_tf/main.c
@@ -427,7 +427,7 @@ static void lbtf_op_bss_info_changed(struct ieee80211_hw *hw,
 		switch (priv->vif->type) {
 		case NL80211_IFTYPE_AP:
 		case NL80211_IFTYPE_MESH_POINT:
-			beacon = ieee80211_beacon_get(hw, vif);
+			beacon = ieee80211_beacon_get(hw, vif, 0);
 			if (beacon) {
 				lbtf_beacon_set(priv, beacon);
 				kfree_skb(beacon);
@@ -691,7 +691,7 @@ void lbtf_bcn_sent(struct lbtf_private *priv)
 		}
 	}
 
-	skb = ieee80211_beacon_get(priv->hw, priv->vif);
+	skb = ieee80211_beacon_get(priv->hw, priv->vif, 0);
 
 	if (skb) {
 		lbtf_beacon_set(priv, skb);
diff --git a/drivers/net/wireless/marvell/mwl8k.c b/drivers/net/wireless/marvell/mwl8k.c
index 7eef3a74d124..293bec9bb8dd 100644
--- a/drivers/net/wireless/marvell/mwl8k.c
+++ b/drivers/net/wireless/marvell/mwl8k.c
@@ -5147,7 +5147,7 @@ mwl8k_bss_info_changed_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	if (changed & (BSS_CHANGED_BEACON_INT | BSS_CHANGED_BEACON)) {
 		struct sk_buff *skb;
 
-		skb = ieee80211_beacon_get(hw, vif);
+		skb = ieee80211_beacon_get(hw, vif, 0);
 		if (skb != NULL) {
 			mwl8k_cmd_set_beacon(hw, vif, skb->data, skb->len);
 			kfree_skb(skb);
diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/beacon.c b/drivers/net/wireless/mediatek/mt76/mt7603/beacon.c
index b5e8308e0cc7..c67fc29bb7c3 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7603/beacon.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7603/beacon.c
@@ -20,7 +20,7 @@ mt7603_update_beacon_iter(void *priv, u8 *mac, struct ieee80211_vif *vif)
 	if (!(mdev->beacon_mask & BIT(mvif->idx)))
 		return;
 
-	skb = ieee80211_beacon_get(mt76_hw(dev), vif);
+	skb = ieee80211_beacon_get(mt76_hw(dev), vif, 0);
 	if (!skb)
 		return;
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c
index f58376733c67..3f8f06d9628c 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c
@@ -706,7 +706,7 @@ mt7615_mcu_add_beacon_offload(struct mt7615_dev *dev,
 	if (!enable)
 		goto out;
 
-	skb = ieee80211_beacon_get_template(hw, vif, &offs);
+	skb = ieee80211_beacon_get_template(hw, vif, &offs, 0);
 	if (!skb)
 		return -EINVAL;
 
@@ -1076,7 +1076,7 @@ mt7615_mcu_uni_add_beacon_offload(struct mt7615_dev *dev,
 	if (!enable)
 		goto out;
 
-	skb = ieee80211_beacon_get_template(mt76_hw(dev), vif, &offs);
+	skb = ieee80211_beacon_get_template(mt76_hw(dev), vif, &offs, 0);
 	if (!skb)
 		return -EINVAL;
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_beacon.c b/drivers/net/wireless/mediatek/mt76/mt76x02_beacon.c
index 5d034cec191b..b72510949877 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x02_beacon.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x02_beacon.c
@@ -139,7 +139,7 @@ mt76x02_update_beacon_iter(void *priv, u8 *mac, struct ieee80211_vif *vif)
 	if (!(dev->mt76.beacon_mask & BIT(mvif->idx)))
 		return;
 
-	skb = ieee80211_beacon_get(mt76_hw(dev), vif);
+	skb = ieee80211_beacon_get(mt76_hw(dev), vif, 0);
 	if (!skb)
 		return;
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
index 1b3d6634a72e..8c91257ba1c9 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
@@ -2086,7 +2086,7 @@ int mt7915_mcu_add_beacon(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	if (!en)
 		goto out;
 
-	skb = ieee80211_beacon_get_template(hw, vif, &offs);
+	skb = ieee80211_beacon_get_template(hw, vif, &offs, 0);
 	if (!skb)
 		return -EINVAL;
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c
index 64fc400bd981..3b5b475b0875 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c
@@ -1258,7 +1258,7 @@ mt7921_mcu_uni_add_beacon_offload(struct mt7921_dev *dev,
 	if (!enable)
 		goto out;
 
-	skb = ieee80211_beacon_get_template(mt76_hw(dev), vif, &offs);
+	skb = ieee80211_beacon_get_template(mt76_hw(dev), vif, &offs, 0);
 	if (!skb)
 		return -EINVAL;
 
diff --git a/drivers/net/wireless/purelifi/plfxlc/mac.c b/drivers/net/wireless/purelifi/plfxlc/mac.c
index 1fc1682683e1..d3cdffbded69 100644
--- a/drivers/net/wireless/purelifi/plfxlc/mac.c
+++ b/drivers/net/wireless/purelifi/plfxlc/mac.c
@@ -133,7 +133,7 @@ int plfxlc_restore_settings(struct plfxlc_mac *mac)
 		return 0;
 
 	if (mac->vif) {
-		beacon = ieee80211_beacon_get(mac->hw, mac->vif);
+		beacon = ieee80211_beacon_get(mac->hw, mac->vif, 0);
 		if (beacon) {
 			/*beacon is hardcoded in firmware */
 			kfree_skb(beacon);
@@ -601,7 +601,7 @@ static void plfxlc_op_bss_info_changed(struct ieee80211_hw *hw,
 	/* for ADHOC */
 	associated = true;
 	if (changes & BSS_CHANGED_BEACON) {
-		struct sk_buff *beacon = ieee80211_beacon_get(hw, vif);
+		struct sk_buff *beacon = ieee80211_beacon_get(hw, vif, 0);
 
 		if (beacon) {
 			/*beacon is hardcoded in firmware */
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00queue.c b/drivers/net/wireless/ralink/rt2x00/rt2x00queue.c
index aa6b2f3d2eff..4d06038afd83 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2x00queue.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2x00queue.c
@@ -758,7 +758,7 @@ int rt2x00queue_update_beacon(struct rt2x00_dev *rt2x00dev,
 	 */
 	rt2x00queue_free_skb(intf->beacon);
 
-	intf->beacon->skb = ieee80211_beacon_get(rt2x00dev->hw, vif);
+	intf->beacon->skb = ieee80211_beacon_get(rt2x00dev->hw, vif, 0);
 	if (!intf->beacon->skb)
 		return -ENOMEM;
 
diff --git a/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c b/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c
index 10e8fdc31879..f66cc9083972 100644
--- a/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c
+++ b/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c
@@ -1300,7 +1300,7 @@ static void rtl8180_beacon_work(struct work_struct *work)
 		goto resched;
 
 	/* grab a fresh beacon */
-	skb = ieee80211_beacon_get(dev, vif);
+	skb = ieee80211_beacon_get(dev, vif, 0);
 	if (!skb)
 		goto resched;
 
diff --git a/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c b/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c
index 8ab65c888baf..edc84f9dc984 100644
--- a/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c
+++ b/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c
@@ -1075,7 +1075,7 @@ static void rtl8187_beacon_work(struct work_struct *work)
 		goto resched;
 
 	/* grab a fresh beacon */
-	skb = ieee80211_beacon_get(dev, vif);
+	skb = ieee80211_beacon_get(dev, vif, 0);
 	if (!skb)
 		goto resched;
 
diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c b/drivers/net/wireless/realtek/rtlwifi/core.c
index 2f7fd8888d3a..8537cc6d7a89 100644
--- a/drivers/net/wireless/realtek/rtlwifi/core.c
+++ b/drivers/net/wireless/realtek/rtlwifi/core.c
@@ -1009,7 +1009,7 @@ static void send_beacon_frame(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif)
 {
 	struct rtl_priv *rtlpriv = rtl_priv(hw);
-	struct sk_buff *skb = ieee80211_beacon_get(hw, vif);
+	struct sk_buff *skb = ieee80211_beacon_get(hw, vif, 0);
 	struct rtl_tcb_desc tcb_desc;
 
 	if (skb) {
diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c
index 8e4c15654746..ca79f652fef3 100644
--- a/drivers/net/wireless/realtek/rtlwifi/pci.c
+++ b/drivers/net/wireless/realtek/rtlwifi/pci.c
@@ -1100,7 +1100,7 @@ static void _rtl_pci_prepare_bcn_tasklet(struct tasklet_struct *t)
 	}
 
 	/*NB: the beacon data buffer must be 32-bit aligned. */
-	pskb = ieee80211_beacon_get(hw, mac->vif);
+	pskb = ieee80211_beacon_get(hw, mac->vif, 0);
 	if (!pskb)
 		return;
 	hdr = rtl_get_hdr(pskb);
diff --git a/drivers/net/wireless/realtek/rtw88/fw.c b/drivers/net/wireless/realtek/rtw88/fw.c
index c3ae631c2264..4fdab0329695 100644
--- a/drivers/net/wireless/realtek/rtw88/fw.c
+++ b/drivers/net/wireless/realtek/rtw88/fw.c
@@ -1070,7 +1070,7 @@ static struct sk_buff *rtw_get_rsvd_page_skb(struct ieee80211_hw *hw,
 
 	switch (rsvd_pkt->type) {
 	case RSVD_BEACON:
-		skb_new = ieee80211_beacon_get_tim(hw, vif, &tim_offset, NULL);
+		skb_new = ieee80211_beacon_get_tim(hw, vif, &tim_offset, NULL, 0);
 		rsvd_pkt->tim_offset = tim_offset;
 		break;
 	case RSVD_PS_POLL:
diff --git a/drivers/net/wireless/realtek/rtw89/fw.c b/drivers/net/wireless/realtek/rtw89/fw.c
index 2d9c3157d878..179ec9276e32 100644
--- a/drivers/net/wireless/realtek/rtw89/fw.c
+++ b/drivers/net/wireless/realtek/rtw89/fw.c
@@ -1043,7 +1043,8 @@ int rtw89_fw_h2c_update_beacon(struct rtw89_dev *rtwdev,
 	u16 tim_offset;
 	int bcn_total_len;
 
-	skb_beacon = ieee80211_beacon_get_tim(rtwdev->hw, vif, &tim_offset, NULL);
+	skb_beacon = ieee80211_beacon_get_tim(rtwdev->hw, vif, &tim_offset,
+					      NULL, 0);
 	if (!skb_beacon) {
 		rtw89_err(rtwdev, "failed to get beacon skb\n");
 		return -ENOMEM;
diff --git a/drivers/net/wireless/rsi/rsi_91x_hal.c b/drivers/net/wireless/rsi/rsi_91x_hal.c
index d3634ad986ac..40f9a31f9ca7 100644
--- a/drivers/net/wireless/rsi/rsi_91x_hal.c
+++ b/drivers/net/wireless/rsi/rsi_91x_hal.c
@@ -443,7 +443,7 @@ int rsi_prepare_beacon(struct rsi_common *common, struct sk_buff *skb)
 		return -EINVAL;
 	mac_bcn = ieee80211_beacon_get_tim(adapter->hw,
 					   vif,
-					   &tim_offset, NULL);
+					   &tim_offset, NULL, 0);
 	if (!mac_bcn) {
 		rsi_dbg(ERR_ZONE, "Failed to get beacon from mac80211\n");
 		return -EINVAL;
diff --git a/drivers/net/wireless/silabs/wfx/sta.c b/drivers/net/wireless/silabs/wfx/sta.c
index 3ae93573f787..0378144795ca 100644
--- a/drivers/net/wireless/silabs/wfx/sta.c
+++ b/drivers/net/wireless/silabs/wfx/sta.c
@@ -339,7 +339,7 @@ static int wfx_upload_ap_templates(struct wfx_vif *wvif)
 	struct ieee80211_vif *vif = wvif_to_vif(wvif);
 	struct sk_buff *skb;
 
-	skb = ieee80211_beacon_get(wvif->wdev->hw, vif);
+	skb = ieee80211_beacon_get(wvif->wdev->hw, vif, 0);
 	if (!skb)
 		return -ENOMEM;
 	wfx_hif_set_template_frame(wvif, skb, HIF_TMPLT_BCN, API_RATE_INDEX_B_1MBPS);
@@ -356,7 +356,7 @@ static int wfx_upload_ap_templates(struct wfx_vif *wvif)
 static void wfx_set_mfp_ap(struct wfx_vif *wvif)
 {
 	struct ieee80211_vif *vif = wvif_to_vif(wvif);
-	struct sk_buff *skb = ieee80211_beacon_get(wvif->wdev->hw, vif);
+	struct sk_buff *skb = ieee80211_beacon_get(wvif->wdev->hw, vif, 0);
 	const int ieoffset = offsetof(struct ieee80211_mgmt, u.beacon.variable);
 	const u16 *ptr = (u16 *)cfg80211_find_ie(WLAN_EID_RSN, skb->data + ieoffset,
 						 skb->len - ieoffset);
@@ -588,7 +588,7 @@ static int wfx_update_tim(struct wfx_vif *wvif)
 	u8 *tim_ptr;
 
 	skb = ieee80211_beacon_get_tim(wvif->wdev->hw, vif, &tim_offset,
-				       &tim_length);
+				       &tim_length, 0);
 	if (!skb)
 		return -ENOENT;
 	tim_ptr = skb->data + tim_offset;
diff --git a/drivers/net/wireless/st/cw1200/sta.c b/drivers/net/wireless/st/cw1200/sta.c
index 5fa6878ee109..a3f6942df0c1 100644
--- a/drivers/net/wireless/st/cw1200/sta.c
+++ b/drivers/net/wireless/st/cw1200/sta.c
@@ -1671,7 +1671,7 @@ static int cw1200_set_tim_impl(struct cw1200_common *priv, bool aid0_bit_set)
 	pr_debug("[AP] mcast: %s.\n", aid0_bit_set ? "ena" : "dis");
 
 	skb = ieee80211_beacon_get_tim(priv->hw, priv->vif,
-			&tim_offset, &tim_length);
+				       &tim_offset, &tim_length, 0);
 	if (!skb) {
 		if (!__cw1200_flush(priv, true))
 			wsm_unlock_tx(priv);
@@ -2203,7 +2203,7 @@ static int cw1200_upload_beacon(struct cw1200_common *priv)
 		frame.rate = WSM_TRANSMIT_RATE_6;
 
 	frame.skb = ieee80211_beacon_get_tim(priv->hw, priv->vif,
-					     &tim_offset, &tim_len);
+					     &tim_offset, &tim_len, 0);
 	if (!frame.skb)
 		return -ENOMEM;
 
diff --git a/drivers/net/wireless/ti/wl1251/main.c b/drivers/net/wireless/ti/wl1251/main.c
index 340ab4985fe2..d76558964420 100644
--- a/drivers/net/wireless/ti/wl1251/main.c
+++ b/drivers/net/wireless/ti/wl1251/main.c
@@ -1186,7 +1186,7 @@ static void wl1251_op_bss_info_changed(struct ieee80211_hw *hw,
 	}
 
 	if (changed & BSS_CHANGED_BEACON) {
-		beacon = ieee80211_beacon_get(hw, vif);
+		beacon = ieee80211_beacon_get(hw, vif, 0);
 		if (!beacon)
 			goto out_sleep;
 
diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c
index b476f244a20e..a1923ef52d55 100644
--- a/drivers/net/wireless/ti/wlcore/main.c
+++ b/drivers/net/wireless/ti/wlcore/main.c
@@ -4039,7 +4039,7 @@ static int wlcore_set_beacon_template(struct wl1271 *wl,
 	u32 min_rate;
 	int ret;
 	int ieoffset = offsetof(struct ieee80211_mgmt, u.beacon.variable);
-	struct sk_buff *beacon = ieee80211_beacon_get(wl->hw, vif);
+	struct sk_buff *beacon = ieee80211_beacon_get(wl->hw, vif, 0);
 	u16 tmpl_id;
 
 	if (!beacon) {
@@ -5493,7 +5493,7 @@ static const void *wlcore_get_beacon_ie(struct wl1271 *wl,
 {
 	int ieoffset = offsetof(struct ieee80211_mgmt, u.beacon.variable);
 	struct sk_buff *beacon =
-		ieee80211_beacon_get(wl->hw, wl12xx_wlvif_to_vif(wlvif));
+		ieee80211_beacon_get(wl->hw, wl12xx_wlvif_to_vif(wlvif), 0);
 
 	if (!beacon)
 		return NULL;
diff --git a/drivers/net/wireless/zydas/zd1211rw/zd_mac.c b/drivers/net/wireless/zydas/zd1211rw/zd_mac.c
index b42fedba519d..80b905d49954 100644
--- a/drivers/net/wireless/zydas/zd1211rw/zd_mac.c
+++ b/drivers/net/wireless/zydas/zd1211rw/zd_mac.c
@@ -398,7 +398,7 @@ int zd_restore_settings(struct zd_mac *mac)
 	    mac->type == NL80211_IFTYPE_ADHOC ||
 	    mac->type == NL80211_IFTYPE_AP) {
 		if (mac->vif != NULL) {
-			beacon = ieee80211_beacon_get(mac->hw, mac->vif);
+			beacon = ieee80211_beacon_get(mac->hw, mac->vif, 0);
 			if (beacon)
 				zd_mac_config_beacon(mac->hw, beacon, false);
 		}
@@ -1167,7 +1167,7 @@ static void zd_beacon_done(struct zd_mac *mac)
 	/*
 	 * Fetch next beacon so that tim_count is updated.
 	 */
-	beacon = ieee80211_beacon_get(mac->hw, mac->vif);
+	beacon = ieee80211_beacon_get(mac->hw, mac->vif, 0);
 	if (beacon)
 		zd_mac_config_beacon(mac->hw, beacon, true);
 
@@ -1290,7 +1290,8 @@ static void zd_op_bss_info_changed(struct ieee80211_hw *hw,
 	    mac->type == NL80211_IFTYPE_AP) {
 		associated = true;
 		if (changes & BSS_CHANGED_BEACON) {
-			struct sk_buff *beacon = ieee80211_beacon_get(hw, vif);
+			struct sk_buff *beacon = ieee80211_beacon_get(hw, vif,
+								      0);
 
 			if (beacon) {
 				zd_chip_disable_hwint(&mac->chip);
@@ -1447,7 +1448,7 @@ static void beacon_watchdog_handler(struct work_struct *work)
 
 		zd_chip_disable_hwint(&mac->chip);
 
-		beacon = ieee80211_beacon_get(mac->hw, mac->vif);
+		beacon = ieee80211_beacon_get(mac->hw, mac->vif, 0);
 		if (beacon) {
 			zd_mac_free_cur_beacon(mac);
 
diff --git a/drivers/staging/vt6655/rxtx.c b/drivers/staging/vt6655/rxtx.c
index 71cbfa607d96..85cd01c463e8 100644
--- a/drivers/staging/vt6655/rxtx.c
+++ b/drivers/staging/vt6655/rxtx.c
@@ -1435,7 +1435,7 @@ int vnt_beacon_make(struct vnt_private *priv, struct ieee80211_vif *vif)
 {
 	struct sk_buff *beacon;
 
-	beacon = ieee80211_beacon_get(priv->hw, vif);
+	beacon = ieee80211_beacon_get(priv->hw, vif, 0);
 	if (!beacon)
 		return -ENOMEM;
 
diff --git a/drivers/staging/vt6656/rxtx.c b/drivers/staging/vt6656/rxtx.c
index 4d29f8ebb393..cd99091c6c28 100644
--- a/drivers/staging/vt6656/rxtx.c
+++ b/drivers/staging/vt6656/rxtx.c
@@ -699,7 +699,7 @@ int vnt_beacon_make(struct vnt_private *priv, struct ieee80211_vif *vif)
 {
 	struct sk_buff *beacon;
 
-	beacon = ieee80211_beacon_get(priv->hw, vif);
+	beacon = ieee80211_beacon_get(priv->hw, vif, 0);
 	if (!beacon)
 		return -ENOMEM;
 
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 48c688fdf23d..06545ef1d41c 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -5069,6 +5069,7 @@ struct ieee80211_mutable_offsets {
  * @vif: &struct ieee80211_vif pointer from the add_interface callback.
  * @offs: &struct ieee80211_mutable_offsets pointer to struct that will
  *	receive the offsets that may be updated by the driver.
+ * @link_id: the link id to which the beacon belongs (or 0 for a non-MLD AP)
  *
  * If the driver implements beaconing modes, it must use this function to
  * obtain the beacon template.
@@ -5085,7 +5086,8 @@ struct ieee80211_mutable_offsets {
 struct sk_buff *
 ieee80211_beacon_get_template(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif,
-			      struct ieee80211_mutable_offsets *offs);
+			      struct ieee80211_mutable_offsets *offs,
+			      unsigned int link_id);
 
 /**
  * ieee80211_beacon_get_tim - beacon generation function
@@ -5096,6 +5098,7 @@ ieee80211_beacon_get_template(struct ieee80211_hw *hw,
  * @tim_length: pointer to variable that will receive the TIM IE length,
  *	(including the ID and length bytes!).
  *	Set to 0 if invalid (in non-AP modes).
+ * @link_id: the link id to which the beacon belongs (or 0 for a non-MLD AP)
  *
  * If the driver implements beaconing modes, it must use this function to
  * obtain the beacon frame.
@@ -5111,21 +5114,24 @@ ieee80211_beacon_get_template(struct ieee80211_hw *hw,
  */
 struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw,
 					 struct ieee80211_vif *vif,
-					 u16 *tim_offset, u16 *tim_length);
+					 u16 *tim_offset, u16 *tim_length,
+					 unsigned int link_id);
 
 /**
  * ieee80211_beacon_get - beacon generation function
  * @hw: pointer obtained from ieee80211_alloc_hw().
  * @vif: &struct ieee80211_vif pointer from the add_interface callback.
+ * @link_id: the link id to which the beacon belongs (or 0 for a non-MLD AP)
  *
  * See ieee80211_beacon_get_tim().
  *
  * Return: See ieee80211_beacon_get_tim().
  */
 static inline struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
-						   struct ieee80211_vif *vif)
+						   struct ieee80211_vif *vif,
+						   unsigned int link_id)
 {
-	return ieee80211_beacon_get_tim(hw, vif, NULL, NULL);
+	return ieee80211_beacon_get_tim(hw, vif, NULL, NULL, link_id);
 }
 
 /**
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 17313954c628..c3e14ef20c05 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -4689,11 +4689,12 @@ void ieee80211_tx_pending(struct tasklet_struct *t)
 
 static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata,
 				       struct ps_data *ps, struct sk_buff *skb,
-				       bool is_template)
+				       bool is_template, unsigned int link_id)
 {
 	u8 *pos, *tim;
 	int aid0 = 0;
 	int i, have_bits = 0, n1, n2;
+	struct ieee80211_bss_conf *link_conf = sdata->vif.link_conf[link_id];
 
 	/* Generate bitmap for TIM only if there are any STAs in power save
 	 * mode. */
@@ -4704,7 +4705,7 @@ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata,
 					  IEEE80211_MAX_AID+1);
 	if (!is_template) {
 		if (ps->dtim_count == 0)
-			ps->dtim_count = sdata->vif.bss_conf.dtim_period - 1;
+			ps->dtim_count = link_conf->dtim_period - 1;
 		else
 			ps->dtim_count--;
 	}
@@ -4713,7 +4714,7 @@ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata,
 	*pos++ = WLAN_EID_TIM;
 	*pos++ = 4;
 	*pos++ = ps->dtim_count;
-	*pos++ = sdata->vif.bss_conf.dtim_period;
+	*pos++ = link_conf->dtim_period;
 
 	if (ps->dtim_count == 0 && !skb_queue_empty(&ps->bc_buf))
 		aid0 = 1;
@@ -4754,7 +4755,7 @@ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata,
 
 static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata,
 				    struct ps_data *ps, struct sk_buff *skb,
-				    bool is_template)
+				    bool is_template, unsigned int link_id)
 {
 	struct ieee80211_local *local = sdata->local;
 
@@ -4766,10 +4767,12 @@ static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata,
 	 * of the tim bitmap in mac80211 and the driver.
 	 */
 	if (local->tim_in_locked_section) {
-		__ieee80211_beacon_add_tim(sdata, ps, skb, is_template);
+		__ieee80211_beacon_add_tim(sdata, ps, skb, is_template,
+					   link_id);
 	} else {
 		spin_lock_bh(&local->tim_lock);
-		__ieee80211_beacon_add_tim(sdata, ps, skb, is_template);
+		__ieee80211_beacon_add_tim(sdata, ps, skb, is_template,
+					   link_id);
 		spin_unlock_bh(&local->tim_lock);
 	}
 
@@ -4777,7 +4780,8 @@ static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata,
 }
 
 static void ieee80211_set_beacon_cntdwn(struct ieee80211_sub_if_data *sdata,
-					struct beacon_data *beacon)
+					struct beacon_data *beacon,
+					unsigned int link_id)
 {
 	u8 *beacon_data, count, max_count = 1;
 	struct probe_resp *resp;
@@ -4803,11 +4807,11 @@ static void ieee80211_set_beacon_cntdwn(struct ieee80211_sub_if_data *sdata,
 	}
 
 	rcu_read_lock();
-	resp = rcu_dereference(sdata->deflink.u.ap.probe_resp);
+	resp = rcu_dereference(sdata->link[link_id]->u.ap.probe_resp);
 
 	bcn_offsets = beacon->cntdwn_counter_offsets;
 	count = beacon->cntdwn_current_counter;
-	if (sdata->vif.bss_conf.csa_active)
+	if (sdata->vif.link_conf[link_id]->csa_active)
 		max_count = IEEE80211_MAX_CNTDWN_COUNTERS_NUM;
 
 	for (i = 0; i < max_count; ++i) {
@@ -4948,14 +4952,15 @@ EXPORT_SYMBOL(ieee80211_beacon_cntdwn_is_complete);
 
 static int ieee80211_beacon_protect(struct sk_buff *skb,
 				    struct ieee80211_local *local,
-				    struct ieee80211_sub_if_data *sdata)
+				    struct ieee80211_sub_if_data *sdata,
+				    unsigned int link_id)
 {
 	ieee80211_tx_result res;
 	struct ieee80211_tx_data tx;
 	struct sk_buff *check_skb;
 
 	memset(&tx, 0, sizeof(tx));
-	tx.key = rcu_dereference(sdata->deflink.default_beacon_key);
+	tx.key = rcu_dereference(sdata->link[link_id]->default_beacon_key);
 	if (!tx.key)
 		return 0;
 	tx.local = local;
@@ -4979,7 +4984,8 @@ ieee80211_beacon_get_finish(struct ieee80211_hw *hw,
 			    struct beacon_data *beacon,
 			    struct sk_buff *skb,
 			    struct ieee80211_chanctx_conf *chanctx_conf,
-			    u16 csa_off_base)
+			    u16 csa_off_base,
+			    unsigned int link_id)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
@@ -5010,7 +5016,7 @@ ieee80211_beacon_get_finish(struct ieee80211_hw *hw,
 	memset(&txrc, 0, sizeof(txrc));
 	txrc.hw = hw;
 	txrc.sband = local->hw.wiphy->bands[band];
-	txrc.bss_conf = &sdata->vif.bss_conf;
+	txrc.bss_conf = sdata->vif.link_conf[link_id];
 	txrc.skb = skb;
 	txrc.reported_rate.idx = -1;
 	if (sdata->beacon_rate_set && sdata->beacon_rateidx_mask[band])
@@ -5045,7 +5051,8 @@ ieee80211_beacon_get_ap(struct ieee80211_hw *hw,
 			struct ieee80211_mutable_offsets *offs,
 			bool is_template,
 			struct beacon_data *beacon,
-			struct ieee80211_chanctx_conf *chanctx_conf)
+			struct ieee80211_chanctx_conf *chanctx_conf,
+			unsigned int link_id)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
@@ -5058,7 +5065,7 @@ ieee80211_beacon_get_ap(struct ieee80211_hw *hw,
 		if (!is_template)
 			ieee80211_beacon_update_cntdwn(vif);
 
-		ieee80211_set_beacon_cntdwn(sdata, beacon);
+		ieee80211_set_beacon_cntdwn(sdata, beacon, link_id);
 	}
 
 	/* headroom, head length,
@@ -5074,7 +5081,7 @@ ieee80211_beacon_get_ap(struct ieee80211_hw *hw,
 	skb_reserve(skb, local->tx_headroom);
 	skb_put_data(skb, beacon->head, beacon->head_len);
 
-	ieee80211_beacon_add_tim(sdata, &ap->ps, skb, is_template);
+	ieee80211_beacon_add_tim(sdata, &ap->ps, skb, is_template, link_id);
 
 	if (offs) {
 		offs->tim_offset = beacon->head_len;
@@ -5093,11 +5100,11 @@ ieee80211_beacon_get_ap(struct ieee80211_hw *hw,
 	if (beacon->tail)
 		skb_put_data(skb, beacon->tail, beacon->tail_len);
 
-	if (ieee80211_beacon_protect(skb, local, sdata) < 0)
+	if (ieee80211_beacon_protect(skb, local, sdata, link_id) < 0)
 		return NULL;
 
 	ieee80211_beacon_get_finish(hw, vif, offs, beacon, skb, chanctx_conf,
-				    csa_off_base);
+				    csa_off_base, link_id);
 	return skb;
 }
 
@@ -5105,7 +5112,8 @@ static struct sk_buff *
 __ieee80211_beacon_get(struct ieee80211_hw *hw,
 		       struct ieee80211_vif *vif,
 		       struct ieee80211_mutable_offsets *offs,
-		       bool is_template)
+		       bool is_template,
+		       unsigned int link_id)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct beacon_data *beacon = NULL;
@@ -5116,7 +5124,8 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
 	rcu_read_lock();
 
 	sdata = vif_to_sdata(vif);
-	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
+	chanctx_conf =
+		rcu_dereference(sdata->vif.link_conf[link_id]->chanctx_conf);
 
 	if (!ieee80211_sdata_running(sdata) || !chanctx_conf)
 		goto out;
@@ -5125,12 +5134,12 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
 		memset(offs, 0, sizeof(*offs));
 
 	if (sdata->vif.type == NL80211_IFTYPE_AP) {
-		beacon = rcu_dereference(sdata->deflink.u.ap.beacon);
+		beacon = rcu_dereference(sdata->link[link_id]->u.ap.beacon);
 		if (!beacon)
 			goto out;
 
 		skb = ieee80211_beacon_get_ap(hw, vif, offs, is_template,
-					      beacon, chanctx_conf);
+					      beacon, chanctx_conf, link_id);
 	} else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
 		struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
 		struct ieee80211_hdr *hdr;
@@ -5143,7 +5152,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
 			if (!is_template)
 				__ieee80211_beacon_update_cntdwn(beacon);
 
-			ieee80211_set_beacon_cntdwn(sdata, beacon);
+			ieee80211_set_beacon_cntdwn(sdata, beacon, link_id);
 		}
 
 		skb = dev_alloc_skb(local->tx_headroom + beacon->head_len +
@@ -5158,7 +5167,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
 						 IEEE80211_STYPE_BEACON);
 
 		ieee80211_beacon_get_finish(hw, vif, offs, beacon, skb,
-					    chanctx_conf, 0);
+					    chanctx_conf, 0, link_id);
 	} else if (ieee80211_vif_is_mesh(&sdata->vif)) {
 		struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
 
@@ -5175,7 +5184,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
 				 */
 				__ieee80211_beacon_update_cntdwn(beacon);
 
-			ieee80211_set_beacon_cntdwn(sdata, beacon);
+			ieee80211_set_beacon_cntdwn(sdata, beacon, link_id);
 		}
 
 		if (ifmsh->sync_ops)
@@ -5190,7 +5199,8 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
 			goto out;
 		skb_reserve(skb, local->tx_headroom);
 		skb_put_data(skb, beacon->head, beacon->head_len);
-		ieee80211_beacon_add_tim(sdata, &ifmsh->ps, skb, is_template);
+		ieee80211_beacon_add_tim(sdata, &ifmsh->ps, skb, is_template,
+					 link_id);
 
 		if (offs) {
 			offs->tim_offset = beacon->head_len;
@@ -5199,7 +5209,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
 
 		skb_put_data(skb, beacon->tail, beacon->tail_len);
 		ieee80211_beacon_get_finish(hw, vif, offs, beacon, skb,
-					    chanctx_conf, 0);
+					    chanctx_conf, 0, link_id);
 	} else {
 		WARN_ON(1);
 		goto out;
@@ -5214,18 +5224,21 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
 struct sk_buff *
 ieee80211_beacon_get_template(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif,
-			      struct ieee80211_mutable_offsets *offs)
+			      struct ieee80211_mutable_offsets *offs,
+			      unsigned int link_id)
 {
-	return __ieee80211_beacon_get(hw, vif, offs, true);
+	return __ieee80211_beacon_get(hw, vif, offs, true, link_id);
 }
 EXPORT_SYMBOL(ieee80211_beacon_get_template);
 
 struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw,
 					 struct ieee80211_vif *vif,
-					 u16 *tim_offset, u16 *tim_length)
+					 u16 *tim_offset, u16 *tim_length,
+					 unsigned int link_id)
 {
 	struct ieee80211_mutable_offsets offs = {};
-	struct sk_buff *bcn = __ieee80211_beacon_get(hw, vif, &offs, false);
+	struct sk_buff *bcn = __ieee80211_beacon_get(hw, vif, &offs, false,
+						     link_id);
 	struct sk_buff *copy;
 	int shift;
 
-- 
cgit 


From c71420db653aba30a234d1e4cf86dde376e604fa Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 15 Jun 2022 09:20:45 +0200
Subject: wifi: mac80211: RCU-ify link STA pointers

We need to be able to access these in a race-free way under
traffic while adding/removing them, so RCU-ify the pointers.
This requires passing a link_sta to a lot of functions so
we don't have to do the RCU handling everywhere.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |   2 +-
 net/mac80211/cfg.c         |  12 ++--
 net/mac80211/chan.c        |  25 +++++---
 net/mac80211/eht.c         |  13 ++---
 net/mac80211/he.c          |  17 +++---
 net/mac80211/ht.c          |  14 ++---
 net/mac80211/ibss.c        |   5 +-
 net/mac80211/ieee80211_i.h |  30 +++++-----
 net/mac80211/iface.c       |   3 +-
 net/mac80211/mesh_plink.c  |   8 ++-
 net/mac80211/mlme.c        |  12 ++--
 net/mac80211/rate.c        |   2 +-
 net/mac80211/rx.c          |   6 +-
 net/mac80211/sta_info.c    |  28 +++++----
 net/mac80211/sta_info.h    |   5 +-
 net/mac80211/tdls.c        |   5 +-
 net/mac80211/vht.c         | 139 ++++++++++++++++++++++++---------------------
 17 files changed, 180 insertions(+), 146 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 06545ef1d41c..27f24ac0426d 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2201,7 +2201,7 @@ struct ieee80211_sta {
 
 	u16 valid_links;
 	struct ieee80211_link_sta deflink;
-	struct ieee80211_link_sta *link[IEEE80211_MLD_MAX_NUM_LINKS];
+	struct ieee80211_link_sta __rcu *link[IEEE80211_MLD_MAX_NUM_LINKS];
 
 	/* must be last */
 	u8 drv_priv[] __aligned(sizeof(void *));
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 009f1723c990..b387f5f4fef0 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1771,19 +1771,21 @@ static int sta_apply_parameters(struct ieee80211_local *local,
 
 	if (params->ht_capa)
 		ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
-						  params->ht_capa, sta, 0);
+						  params->ht_capa,
+						  &sta->deflink);
 
 	/* VHT can override some HT caps such as the A-MSDU max length */
 	if (params->vht_capa)
 		ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
-						    params->vht_capa, sta, 0);
+						    params->vht_capa,
+						    &sta->deflink);
 
 	if (params->he_capa)
 		ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband,
 						  (void *)params->he_capa,
 						  params->he_capa_len,
 						  (void *)params->he_6ghz_capa,
-						  sta, 0);
+						  &sta->deflink);
 
 	if (params->eht_capa)
 		ieee80211_eht_cap_ie_to_sta_eht_cap(sdata, sband,
@@ -1791,13 +1793,13 @@ static int sta_apply_parameters(struct ieee80211_local *local,
 						    params->he_capa_len,
 						    params->eht_capa,
 						    params->eht_capa_len,
-						    sta, 0);
+						    &sta->deflink);
 
 	if (params->opmode_notif_used) {
 		/* returned value is only needed for rc update, but the
 		 * rc isn't initialized here yet, so ignore it
 		 */
-		__ieee80211_vht_handle_opmode(sdata, sta, 0,
+		__ieee80211_vht_handle_opmode(sdata, &sta->deflink,
 					      params->opmode_notif,
 					      sband->band);
 	}
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index 4f25660d0eeb..6853b563fb6c 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -207,16 +207,20 @@ ieee80211_find_reservation_chanctx(struct ieee80211_local *local,
 static enum nl80211_chan_width ieee80211_get_sta_bw(struct sta_info *sta,
 						    unsigned int link_id)
 {
-	enum ieee80211_sta_rx_bandwidth width =
-		ieee80211_sta_cap_rx_bw(sta, link_id);
+	enum ieee80211_sta_rx_bandwidth width;
+	struct link_sta_info *link_sta;
+
+	link_sta = rcu_dereference(sta->link[link_id]);
 
 	/* no effect if this STA has no presence on this link */
-	if (!sta->sta.link[link_id])
+	if (!link_sta)
 		return NL80211_CHAN_WIDTH_20_NOHT;
 
+	width = ieee80211_sta_cap_rx_bw(link_sta);
+
 	switch (width) {
 	case IEEE80211_STA_RX_BW_20:
-		if (sta->sta.link[link_id]->ht_cap.ht_supported)
+		if (link_sta->pub->ht_cap.ht_supported)
 			return NL80211_CHAN_WIDTH_20;
 		else
 			return NL80211_CHAN_WIDTH_20_NOHT;
@@ -416,6 +420,7 @@ static void ieee80211_chan_bw_change(struct ieee80211_local *local,
 		for (link_id = 0; link_id < ARRAY_SIZE(sta->sdata->link); link_id++) {
 			struct ieee80211_bss_conf *link_conf =
 				sdata->vif.link_conf[link_id];
+			struct link_sta_info *link_sta;
 
 			if (!link_conf)
 				continue;
@@ -423,18 +428,22 @@ static void ieee80211_chan_bw_change(struct ieee80211_local *local,
 			if (rcu_access_pointer(link_conf->chanctx_conf) != &ctx->conf)
 				continue;
 
-			new_sta_bw = ieee80211_sta_cur_vht_bw(sta, link_id);
+			link_sta = rcu_dereference(sta->link[link_id]);
+			if (!link_sta)
+				continue;
+
+			new_sta_bw = ieee80211_sta_cur_vht_bw(link_sta);
 
 			/* nothing change */
-			if (new_sta_bw == sta->sta.link[link_id]->bandwidth)
+			if (new_sta_bw == link_sta->pub->bandwidth)
 				continue;
 
 			/* vif changed to narrow BW and narrow BW for station wasn't
 			 * requested or vise versa */
-			if ((new_sta_bw < sta->sta.link[link_id]->bandwidth) == !narrowed)
+			if ((new_sta_bw < link_sta->pub->bandwidth) == !narrowed)
 				continue;
 
-			sta->sta.link[link_id]->bandwidth = new_sta_bw;
+			link_sta->pub->bandwidth = new_sta_bw;
 			rate_control_rate_update(local, sband, sta, link_id,
 						 IEEE80211_RC_BW_CHANGED);
 		}
diff --git a/net/mac80211/eht.c b/net/mac80211/eht.c
index de762a803c38..31e20a342f21 100644
--- a/net/mac80211/eht.c
+++ b/net/mac80211/eht.c
@@ -12,11 +12,10 @@ ieee80211_eht_cap_ie_to_sta_eht_cap(struct ieee80211_sub_if_data *sdata,
 				    struct ieee80211_supported_band *sband,
 				    const u8 *he_cap_ie, u8 he_cap_len,
 				    const struct ieee80211_eht_cap_elem *eht_cap_ie_elem,
-				    u8 eht_cap_len, struct sta_info *sta,
-				    unsigned int link_id)
+				    u8 eht_cap_len,
+				    struct link_sta_info *link_sta)
 {
-	struct ieee80211_sta_eht_cap *eht_cap =
-		&sta->sta.link[link_id]->eht_cap;
+	struct ieee80211_sta_eht_cap *eht_cap = &link_sta->pub->eht_cap;
 	struct ieee80211_he_cap_elem *he_cap_ie_elem = (void *)he_cap_ie;
 	u8 eht_ppe_size = 0;
 	u8 mcs_nss_size;
@@ -73,8 +72,6 @@ ieee80211_eht_cap_ie_to_sta_eht_cap(struct ieee80211_sub_if_data *sdata,
 
 	eht_cap->has_eht = true;
 
-	sta->link[link_id]->cur_max_bandwidth =
-		ieee80211_sta_cap_rx_bw(sta, link_id);
-	sta->sta.link[link_id]->bandwidth =
-		ieee80211_sta_cur_vht_bw(sta, link_id);
+	link_sta->cur_max_bandwidth = ieee80211_sta_cap_rx_bw(link_sta);
+	link_sta->pub->bandwidth = ieee80211_sta_cur_vht_bw(link_sta);
 }
diff --git a/net/mac80211/he.c b/net/mac80211/he.c
index e48e9a021622..d9228fd3f77a 100644
--- a/net/mac80211/he.c
+++ b/net/mac80211/he.c
@@ -10,8 +10,9 @@
 
 static void
 ieee80211_update_from_he_6ghz_capa(const struct ieee80211_he_6ghz_capa *he_6ghz_capa,
-				   struct sta_info *sta, unsigned int link_id)
+				   struct link_sta_info *link_sta)
 {
+	struct sta_info *sta = link_sta->sta;
 	enum ieee80211_smps_mode smps_mode;
 
 	if (sta->sdata->vif.type == NL80211_IFTYPE_AP ||
@@ -49,7 +50,7 @@ ieee80211_update_from_he_6ghz_capa(const struct ieee80211_he_6ghz_capa *he_6ghz_
 		break;
 	}
 
-	sta->sta.link[link_id]->he_6ghz_capa = *he_6ghz_capa;
+	link_sta->pub->he_6ghz_capa = *he_6ghz_capa;
 }
 
 static void ieee80211_he_mcs_disable(__le16 *he_mcs)
@@ -108,9 +109,9 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata,
 				  struct ieee80211_supported_band *sband,
 				  const u8 *he_cap_ie, u8 he_cap_len,
 				  const struct ieee80211_he_6ghz_capa *he_6ghz_capa,
-				  struct sta_info *sta, unsigned int link_id)
+				  struct link_sta_info *link_sta)
 {
-	struct ieee80211_sta_he_cap *he_cap = &sta->sta.link[link_id]->he_cap;
+	struct ieee80211_sta_he_cap *he_cap = &link_sta->pub->he_cap;
 	struct ieee80211_sta_he_cap own_he_cap;
 	struct ieee80211_he_cap_elem *he_cap_ie_elem = (void *)he_cap_ie;
 	u8 he_ppe_size;
@@ -153,13 +154,11 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata,
 
 	he_cap->has_he = true;
 
-	sta->link[link_id]->cur_max_bandwidth =
-		ieee80211_sta_cap_rx_bw(sta, link_id);
-	sta->sta.link[link_id]->bandwidth =
-		ieee80211_sta_cur_vht_bw(sta, link_id);
+	link_sta->cur_max_bandwidth = ieee80211_sta_cap_rx_bw(link_sta);
+	link_sta->pub->bandwidth = ieee80211_sta_cur_vht_bw(link_sta);
 
 	if (sband->band == NL80211_BAND_6GHZ && he_6ghz_capa)
-		ieee80211_update_from_he_6ghz_capa(he_6ghz_capa, sta, link_id);
+		ieee80211_update_from_he_6ghz_capa(he_6ghz_capa, link_sta);
 
 	ieee80211_he_mcs_intersection(&own_he_cap.he_mcs_nss_supp.rx_mcs_80,
 				      &he_cap->he_mcs_nss_supp.rx_mcs_80,
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 111828b559a0..2eb3a409b70f 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -138,8 +138,9 @@ void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata,
 bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata,
 				       struct ieee80211_supported_band *sband,
 				       const struct ieee80211_ht_cap *ht_cap_ie,
-				       struct sta_info *sta, unsigned int link_id)
+				       struct link_sta_info *link_sta)
 {
+	struct sta_info *sta = link_sta->sta;
 	struct ieee80211_sta_ht_cap ht_cap, own_cap;
 	u8 ampdu_info, tx_mcs_set_cap;
 	int i, max_tx_streams;
@@ -243,12 +244,11 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata,
 		sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_3839;
 
  apply:
-	changed = memcmp(&sta->sta.link[link_id]->ht_cap,
-			 &ht_cap, sizeof(ht_cap));
+	changed = memcmp(&link_sta->pub->ht_cap, &ht_cap, sizeof(ht_cap));
 
-	memcpy(&sta->sta.link[link_id]->ht_cap, &ht_cap, sizeof(ht_cap));
+	memcpy(&link_sta->pub->ht_cap, &ht_cap, sizeof(ht_cap));
 
-	switch (sdata->vif.link_conf[link_id]->chandef.width) {
+	switch (sdata->vif.link_conf[link_sta->link_id]->chandef.width) {
 	default:
 		WARN_ON_ONCE(1);
 		fallthrough;
@@ -265,9 +265,9 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata,
 		break;
 	}
 
-	sta->sta.link[link_id]->bandwidth = bw;
+	link_sta->pub->bandwidth = bw;
 
-	sta->link[link_id]->cur_max_bandwidth =
+	link_sta->cur_max_bandwidth =
 		ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ?
 				IEEE80211_STA_RX_BW_40 : IEEE80211_STA_RX_BW_20;
 
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 65a3808dc92a..65b6255c6747 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -1051,7 +1051,7 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata,
 		memcpy(&htcap_ie, elems->ht_cap_elem, sizeof(htcap_ie));
 		rates_updated |= ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
 								   &htcap_ie,
-								   sta, 0);
+								   &sta->deflink);
 
 		if (elems->vht_operation && elems->vht_cap_elem &&
 		    sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_20 &&
@@ -1068,7 +1068,8 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata,
 						   &chandef);
 			memcpy(&cap_ie, elems->vht_cap_elem, sizeof(cap_ie));
 			ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
-							    &cap_ie, sta, 0);
+							    &cap_ie,
+							    &sta->deflink);
 			if (memcmp(&cap, &sta->sta.deflink.vht_cap, sizeof(cap)))
 				rates_updated |= true;
 		}
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index aa5e4c2734b7..46f4e89825a0 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -2061,7 +2061,7 @@ void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata,
 bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata,
 				       struct ieee80211_supported_band *sband,
 				       const struct ieee80211_ht_cap *ht_cap_ie,
-				       struct sta_info *sta, unsigned int link_id);
+				       struct link_sta_info *link_sta);
 void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata,
 			  const u8 *da, u16 tid,
 			  u16 initiator, u16 reason_code);
@@ -2117,31 +2117,31 @@ void
 ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
 				    struct ieee80211_supported_band *sband,
 				    const struct ieee80211_vht_cap *vht_cap_ie,
-				    struct sta_info *sta, unsigned int link_id);
-enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta,
-							unsigned int link_id);
-enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta,
-							 unsigned int link_id);
-void ieee80211_sta_set_rx_nss(struct sta_info *sta, unsigned int link_id);
+				    struct link_sta_info *link_sta);
+enum ieee80211_sta_rx_bandwidth
+ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta);
+enum ieee80211_sta_rx_bandwidth
+ieee80211_sta_cur_vht_bw(struct link_sta_info *link_sta);
+void ieee80211_sta_set_rx_nss(struct link_sta_info *link_sta);
 enum ieee80211_sta_rx_bandwidth
 ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width);
-enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta,
-						  unsigned int link_id);
+enum nl80211_chan_width
+ieee80211_sta_cap_chan_bw(struct link_sta_info *link_sta);
 void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata,
 				 unsigned int link_id,
 				 struct ieee80211_mgmt *mgmt);
 u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
-				  struct sta_info *sta, unsigned int link_id,
+				  struct link_sta_info *sta,
 				  u8 opmode, enum nl80211_band band);
 void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
-				 struct sta_info *sta, unsigned int link_id,
+				 struct link_sta_info *sta,
 				 u8 opmode, enum nl80211_band band);
 void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata,
 				      struct ieee80211_sta_vht_cap *vht_cap);
 void ieee80211_get_vht_mask_from_cap(__le16 vht_cap,
 				     u16 vht_mask[NL80211_VHT_NSS_MAX]);
 enum nl80211_chan_width
-ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta, unsigned int link_id);
+ieee80211_sta_rx_bw_to_chan_width(struct link_sta_info *sta);
 
 /* HE */
 void
@@ -2149,7 +2149,7 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata,
 				  struct ieee80211_supported_band *sband,
 				  const u8 *he_cap_ie, u8 he_cap_len,
 				  const struct ieee80211_he_6ghz_capa *he_6ghz_capa,
-				  struct sta_info *sta, unsigned int link_id);
+				  struct link_sta_info *link_sta);
 void
 ieee80211_he_spr_ie_to_bss_conf(struct ieee80211_vif *vif,
 				const struct ieee80211_he_spr *he_spr_ie_elem);
@@ -2560,6 +2560,6 @@ ieee80211_eht_cap_ie_to_sta_eht_cap(struct ieee80211_sub_if_data *sdata,
 				    struct ieee80211_supported_band *sband,
 				    const u8 *he_cap_ie, u8 he_cap_len,
 				    const struct ieee80211_eht_cap_elem *eht_cap_ie_elem,
-				    u8 eht_cap_len, struct sta_info *sta,
-				    unsigned int link_id);
+				    u8 eht_cap_len,
+				    struct link_sta_info *link_sta);
 #endif /* IEEE80211_I_H */
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index f118e7710fb1..0cf5a395d925 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1525,7 +1525,8 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local,
 			sta = sta_info_get_bss(sdata, mgmt->sa);
 
 			if (sta)
-				ieee80211_vht_handle_opmode(sdata, sta, 0,
+				ieee80211_vht_handle_opmode(sdata,
+							    &sta->deflink,
 							    opmode, band);
 
 			mutex_unlock(&local->sta_mtx);
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index fe54ac431202..55bed9ce98fe 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -438,16 +438,18 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata,
 	sta->sta.deflink.supp_rates[sband->band] = rates;
 
 	if (ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
-					      elems->ht_cap_elem, sta, 0))
+					      elems->ht_cap_elem,
+					      &sta->deflink))
 		changed |= IEEE80211_RC_BW_CHANGED;
 
 	ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
-					    elems->vht_cap_elem, sta, 0);
+					    elems->vht_cap_elem,
+					    &sta->deflink);
 
 	ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, elems->he_cap,
 					  elems->he_cap_len,
 					  elems->he_6ghz_capa,
-					  sta, 0);
+					  &sta->deflink);
 
 	if (bw != sta->sta.deflink.bandwidth)
 		changed |= IEEE80211_RC_BW_CHANGED;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index c2c997086553..01a72d1fcfcc 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -3566,12 +3566,13 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
 	/* Set up internal HT/VHT capabilities */
 	if (elems->ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT))
 		ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
-						  elems->ht_cap_elem, sta, 0);
+						  elems->ht_cap_elem,
+						  &sta->deflink);
 
 	if (elems->vht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))
 		ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
 						    elems->vht_cap_elem,
-						    sta, 0);
+						    &sta->deflink);
 
 	if (elems->he_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) &&
 	    elems->he_cap) {
@@ -3579,7 +3580,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
 						  elems->he_cap,
 						  elems->he_cap_len,
 						  elems->he_6ghz_capa,
-						  sta, 0);
+						  &sta->deflink);
 
 		bss_conf->he_support = sta->sta.deflink.he_cap.has_he;
 		if (elems->rsnx && elems->rsnx_len &&
@@ -3599,7 +3600,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
 							    elems->he_cap_len,
 							    elems->eht_cap,
 							    elems->eht_cap_len,
-							    sta, 0);
+							    &sta->deflink);
 
 			bss_conf->eht_support = sta->sta.deflink.eht_cap.has_eht;
 		} else {
@@ -4378,7 +4379,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 	}
 
 	if (sta && elems->opmode_notif)
-		ieee80211_vht_handle_opmode(sdata, sta, 0,
+		ieee80211_vht_handle_opmode(sdata,
+					    &sta->deflink,
 					    *elems->opmode_notif,
 					    rx_status->band);
 	mutex_unlock(&local->sta_mtx);
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index c58d9689f51f..7947e9a162a9 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -37,7 +37,7 @@ void rate_control_rate_init(struct sta_info *sta)
 	struct ieee80211_supported_band *sband;
 	struct ieee80211_chanctx_conf *chanctx_conf;
 
-	ieee80211_sta_set_rx_nss(sta, 0);
+	ieee80211_sta_set_rx_nss(&sta->deflink);
 
 	if (!ref)
 		return;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 0b5c34d00c14..d017ad14d7db 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3391,11 +3391,11 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 			if (chanwidth == IEEE80211_HT_CHANWIDTH_20MHZ)
 				max_bw = IEEE80211_STA_RX_BW_20;
 			else
-				max_bw = ieee80211_sta_cap_rx_bw(rx->sta, 0);
+				max_bw = ieee80211_sta_cap_rx_bw(&rx->sta->deflink);
 
 			/* set cur_max_bandwidth and recalc sta bw */
 			rx->sta->deflink.cur_max_bandwidth = max_bw;
-			new_bw = ieee80211_sta_cur_vht_bw(rx->sta, 0);
+			new_bw = ieee80211_sta_cur_vht_bw(&rx->sta->deflink);
 
 			if (rx->sta->sta.deflink.bandwidth == new_bw)
 				goto handled;
@@ -3403,7 +3403,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 			rx->sta->sta.deflink.bandwidth = new_bw;
 			sband = rx->local->hw.wiphy->bands[status->band];
 			sta_opmode.bw =
-				ieee80211_sta_rx_bw_to_chan_width(rx->sta, 0);
+				ieee80211_sta_rx_bw_to_chan_width(&rx->sta->deflink);
 			sta_opmode.changed = STA_OPMODE_MAX_BW_CHANGED;
 
 			rate_control_rate_update(local, sband, rx->sta, 0,
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index b1426a2459e8..1f4189e08675 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -67,6 +67,7 @@
 struct sta_link_alloc {
 	struct link_sta_info info;
 	struct ieee80211_link_sta sta;
+	struct rcu_head rcu_head;
 };
 
 static const struct rhashtable_params sta_rht_params = {
@@ -258,19 +259,23 @@ static void sta_info_free_link(struct link_sta_info *link_sta)
 static void sta_remove_link(struct sta_info *sta, unsigned int link_id)
 {
 	struct sta_link_alloc *alloc = NULL;
+	struct link_sta_info *link_sta;
 
-	if (WARN_ON(!sta->link[link_id]))
+	link_sta = rcu_dereference_protected(sta->link[link_id],
+					     lockdep_is_held(&sta->local->sta_mtx));
+
+	if (WARN_ON(!link_sta))
 		return;
 
-	if (sta->link[link_id] != &sta->deflink)
-		alloc = container_of(sta->link[link_id], typeof(*alloc), info);
+	if (link_sta != &sta->deflink)
+		alloc = container_of(link_sta, typeof(*alloc), info);
 
 	sta->sta.valid_links &= ~BIT(link_id);
-	sta->link[link_id] = NULL;
-	sta->sta.link[link_id] = NULL;
+	RCU_INIT_POINTER(sta->link[link_id], NULL);
+	RCU_INIT_POINTER(sta->sta.link[link_id], NULL);
 	if (alloc) {
 		sta_info_free_link(&alloc->info);
-		kfree(alloc);
+		kfree_rcu(alloc, rcu_head);
 	}
 }
 
@@ -404,8 +409,9 @@ static void sta_info_add_link(struct sta_info *sta,
 {
 	link_info->sta = sta;
 	link_info->link_id = link_id;
-	sta->link[link_id] = link_info;
-	sta->sta.link[link_id] = link_sta;
+	link_info->pub = link_sta;
+	rcu_assign_pointer(sta->link[link_id], link_info);
+	rcu_assign_pointer(sta->sta.link[link_id], link_sta);
 }
 
 struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
@@ -2682,13 +2688,15 @@ int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id)
 int ieee80211_sta_activate_link(struct sta_info *sta, unsigned int link_id)
 {
 	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	struct link_sta_info *link_sta;
 	u16 old_links = sta->sta.valid_links;
 	u16 new_links = old_links | BIT(link_id);
 	int ret;
 
-	lockdep_assert_held(&sdata->local->sta_mtx);
+	link_sta = rcu_dereference_protected(sta->link[link_id],
+					     lockdep_is_held(&sdata->local->sta_mtx));
 
-	if (WARN_ON(old_links == new_links || !sta->link[link_id]))
+	if (WARN_ON(old_links == new_links || !link_sta))
 		return -EINVAL;
 
 	sta->sta.valid_links = new_links;
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 8ec65bb7d13e..27c96c04b13f 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -516,6 +516,7 @@ struct ieee80211_fragment_cache {
  * @status_stats.last_ack_signal: last ACK signal
  * @status_stats.ack_signal_filled: last ACK signal validity
  * @status_stats.avg_ack_signal: average ACK signal
+ * @pub: public (driver visible) link STA data
  * TODO Move other link params from sta_info as required for MLD operation
  */
 struct link_sta_info {
@@ -561,6 +562,8 @@ struct link_sta_info {
 	} tx_stats;
 
 	enum ieee80211_sta_rx_bandwidth cur_max_bandwidth;
+
+	struct ieee80211_link_sta *pub;
 };
 
 /**
@@ -708,7 +711,7 @@ struct sta_info {
 	struct ieee80211_fragment_cache frags;
 
 	struct link_sta_info deflink;
-	struct link_sta_info *link[IEEE80211_MLD_MAX_NUM_LINKS];
+	struct link_sta_info __rcu *link[IEEE80211_MLD_MAX_NUM_LINKS];
 
 	/* keep last! */
 	struct ieee80211_sta sta;
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index 4fc120c64022..c531fa17f426 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -308,7 +308,8 @@ ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata,
 	/* IEEE802.11ac-2013 Table E-4 */
 	u16 centers_80mhz[] = { 5210, 5290, 5530, 5610, 5690, 5775 };
 	struct cfg80211_chan_def uc = sta->tdls_chandef;
-	enum nl80211_chan_width max_width = ieee80211_sta_cap_chan_bw(sta, 0);
+	enum nl80211_chan_width max_width =
+		ieee80211_sta_cap_chan_bw(&sta->deflink);
 	int i;
 
 	/* only support upgrading non-narrow channels up to 80Mhz */
@@ -1268,7 +1269,7 @@ static void iee80211_tdls_recalc_chanctx(struct ieee80211_sub_if_data *sdata,
 			enum ieee80211_sta_rx_bandwidth bw;
 
 			bw = ieee80211_chan_width_to_rx_bw(conf->def.width);
-			bw = min(bw, ieee80211_sta_cap_rx_bw(sta, 0));
+			bw = min(bw, ieee80211_sta_cap_rx_bw(&sta->deflink));
 			if (bw != sta->sta.deflink.bandwidth) {
 				sta->sta.deflink.bandwidth = bw;
 				rate_control_rate_update(local, sband, sta, 0,
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index acfe1459535f..fa14627b499a 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -116,16 +116,16 @@ void
 ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
 				    struct ieee80211_supported_band *sband,
 				    const struct ieee80211_vht_cap *vht_cap_ie,
-				    struct sta_info *sta, unsigned int link_id)
+				    struct link_sta_info *link_sta)
 {
-	struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.link[link_id]->vht_cap;
+	struct ieee80211_sta_vht_cap *vht_cap = &link_sta->pub->vht_cap;
 	struct ieee80211_sta_vht_cap own_cap;
 	u32 cap_info, i;
 	bool have_80mhz;
 
 	memset(vht_cap, 0, sizeof(*vht_cap));
 
-	if (!sta->sta.link[link_id]->ht_cap.ht_supported)
+	if (!link_sta->pub->ht_cap.ht_supported)
 		return;
 
 	if (!vht_cap_ie || !sband->vht_cap.vht_supported)
@@ -162,7 +162,7 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
 	 * our own capabilities and then use those below.
 	 */
 	if (sdata->vif.type == NL80211_IFTYPE_STATION &&
-	    !test_sta_flag(sta, WLAN_STA_TDLS_PEER))
+	    !test_sta_flag(link_sta->sta, WLAN_STA_TDLS_PEER))
 		ieee80211_apply_vhtcap_overrides(sdata, &own_cap);
 
 	/* take some capabilities as-is */
@@ -286,8 +286,9 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
 	 */
 	if (vht_cap->vht_mcs.rx_mcs_map == cpu_to_le16(0xFFFF)) {
 		vht_cap->vht_supported = false;
-		sdata_info(sdata, "Ignoring VHT IE from %pM due to invalid rx_mcs_map\n",
-			   sta->addr);
+		sdata_info(sdata,
+			   "Ignoring VHT IE from %pM (link:%pM) due to invalid rx_mcs_map\n",
+			   link_sta->sta->addr, link_sta->addr);
 		return;
 	}
 
@@ -295,10 +296,10 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
 	switch (vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) {
 	case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ:
 	case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ:
-		sta->link[link_id]->cur_max_bandwidth = IEEE80211_STA_RX_BW_160;
+		link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160;
 		break;
 	default:
-		sta->link[link_id]->cur_max_bandwidth = IEEE80211_STA_RX_BW_80;
+		link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80;
 
 		if (!(vht_cap->vht_mcs.tx_highest &
 				cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE)))
@@ -310,35 +311,40 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
 		 * above) between 160 and 80+80 yet.
 		 */
 		if (cap_info & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK)
-			sta->link[link_id]->cur_max_bandwidth =
+			link_sta->cur_max_bandwidth =
 				IEEE80211_STA_RX_BW_160;
 	}
 
-	sta->sta.link[link_id]->bandwidth = ieee80211_sta_cur_vht_bw(sta, link_id);
+	link_sta->pub->bandwidth = ieee80211_sta_cur_vht_bw(link_sta);
 
+	/*
+	 * FIXME - should the amsdu len be per link? store per link
+	 * and maintain a minimum?
+	 */
 	switch (vht_cap->cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK) {
 	case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454:
-		sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_11454;
+		link_sta->sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_11454;
 		break;
 	case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991:
-		sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_7991;
+		link_sta->sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_7991;
 		break;
 	case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895:
 	default:
-		sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_3895;
+		link_sta->sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_3895;
 		break;
 	}
 }
 
 /* FIXME: move this to some better location - parses HE/EHT now */
-enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta,
-							unsigned int link_id)
+enum ieee80211_sta_rx_bandwidth
+ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta)
 {
-	struct ieee80211_bss_conf *link_conf = sta->sdata->vif.link_conf[link_id];
-	struct ieee80211_link_sta *link_sta = sta->sta.link[link_id];
-	struct ieee80211_sta_vht_cap *vht_cap = &link_sta->vht_cap;
-	struct ieee80211_sta_he_cap *he_cap = &link_sta->he_cap;
-	struct ieee80211_sta_eht_cap *eht_cap = &link_sta->eht_cap;
+	unsigned int link_id = link_sta->link_id;
+	struct ieee80211_sub_if_data *sdata = link_sta->sta->sdata;
+	struct ieee80211_bss_conf *link_conf = sdata->vif.link_conf[link_id];
+	struct ieee80211_sta_vht_cap *vht_cap = &link_sta->pub->vht_cap;
+	struct ieee80211_sta_he_cap *he_cap = &link_sta->pub->he_cap;
+	struct ieee80211_sta_eht_cap *eht_cap = &link_sta->pub->eht_cap;
 	u32 cap_width;
 
 	if (he_cap->has_he) {
@@ -371,7 +377,7 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta,
 	}
 
 	if (!vht_cap->vht_supported)
-		return link_sta->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ?
+		return link_sta->pub->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ?
 				IEEE80211_STA_RX_BW_40 :
 				IEEE80211_STA_RX_BW_20;
 
@@ -392,17 +398,17 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta,
 	return IEEE80211_STA_RX_BW_80;
 }
 
-enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta,
-						  unsigned int link_id)
+enum nl80211_chan_width
+ieee80211_sta_cap_chan_bw(struct link_sta_info *link_sta)
 {
-	struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.link[link_id]->vht_cap;
+	struct ieee80211_sta_vht_cap *vht_cap = &link_sta->pub->vht_cap;
 	u32 cap_width;
 
 	if (!vht_cap->vht_supported) {
-		if (!sta->sta.link[link_id]->ht_cap.ht_supported)
+		if (!link_sta->pub->ht_cap.ht_supported)
 			return NL80211_CHAN_WIDTH_20_NOHT;
 
-		return sta->sta.link[link_id]->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ?
+		return link_sta->pub->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ?
 				NL80211_CHAN_WIDTH_40 : NL80211_CHAN_WIDTH_20;
 	}
 
@@ -417,17 +423,17 @@ enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta,
 }
 
 enum nl80211_chan_width
-ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta, unsigned int link_id)
+ieee80211_sta_rx_bw_to_chan_width(struct link_sta_info *link_sta)
 {
 	enum ieee80211_sta_rx_bandwidth cur_bw =
-		sta->sta.link[link_id]->bandwidth;
+		link_sta->pub->bandwidth;
 	struct ieee80211_sta_vht_cap *vht_cap =
-		&sta->sta.link[link_id]->vht_cap;
+		&link_sta->pub->vht_cap;
 	u32 cap_width;
 
 	switch (cur_bw) {
 	case IEEE80211_STA_RX_BW_20:
-		if (!sta->sta.link[link_id]->ht_cap.ht_supported)
+		if (!link_sta->pub->ht_cap.ht_supported)
 			return NL80211_CHAN_WIDTH_20_NOHT;
 		else
 			return NL80211_CHAN_WIDTH_20;
@@ -471,15 +477,17 @@ ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width)
 }
 
 /* FIXME: rename/move - this deals with everything not just VHT */
-enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta,
-							 unsigned int link_id)
+enum ieee80211_sta_rx_bandwidth
+ieee80211_sta_cur_vht_bw(struct link_sta_info *link_sta)
 {
-	struct ieee80211_bss_conf *link_conf = sta->sdata->vif.link_conf[link_id];
+	struct sta_info *sta = link_sta->sta;
+	struct ieee80211_bss_conf *link_conf =
+		sta->sdata->vif.link_conf[link_sta->link_id];
 	enum nl80211_chan_width bss_width = link_conf->chandef.width;
 	enum ieee80211_sta_rx_bandwidth bw;
 
-	bw = ieee80211_sta_cap_rx_bw(sta, link_id);
-	bw = min(bw, sta->link[link_id]->cur_max_bandwidth);
+	bw = ieee80211_sta_cap_rx_bw(link_sta);
+	bw = min(bw, link_sta->cur_max_bandwidth);
 
 	/* Don't consider AP's bandwidth for TDLS peers, section 11.23.1 of
 	 * IEEE80211-2016 specification makes higher bandwidth operation
@@ -501,18 +509,18 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta,
 	return bw;
 }
 
-void ieee80211_sta_set_rx_nss(struct sta_info *sta, unsigned int link_id)
+void ieee80211_sta_set_rx_nss(struct link_sta_info *link_sta)
 {
 	u8 ht_rx_nss = 0, vht_rx_nss = 0, he_rx_nss = 0, eht_rx_nss = 0, rx_nss;
 	bool support_160;
 
 	/* if we received a notification already don't overwrite it */
-	if (sta->sta.link[link_id]->rx_nss)
+	if (link_sta->pub->rx_nss)
 		return;
 
-	if (sta->sta.link[link_id]->eht_cap.has_eht) {
+	if (link_sta->pub->eht_cap.has_eht) {
 		int i;
-		const u8 *rx_nss_mcs = (void *)&sta->sta.link[link_id]->eht_cap.eht_mcs_nss_supp;
+		const u8 *rx_nss_mcs = (void *)&link_sta->pub->eht_cap.eht_mcs_nss_supp;
 
 		/* get the max nss for EHT over all possible bandwidths and mcs */
 		for (i = 0; i < sizeof(struct ieee80211_eht_mcs_nss_supp); i++)
@@ -521,10 +529,10 @@ void ieee80211_sta_set_rx_nss(struct sta_info *sta, unsigned int link_id)
 						       IEEE80211_EHT_MCS_NSS_RX));
 	}
 
-	if (sta->sta.link[link_id]->he_cap.has_he) {
+	if (link_sta->pub->he_cap.has_he) {
 		int i;
 		u8 rx_mcs_80 = 0, rx_mcs_160 = 0;
-		const struct ieee80211_sta_he_cap *he_cap = &sta->sta.link[link_id]->he_cap;
+		const struct ieee80211_sta_he_cap *he_cap = &link_sta->pub->he_cap;
 		u16 mcs_160_map =
 			le16_to_cpu(he_cap->he_mcs_nss_supp.rx_mcs_160);
 		u16 mcs_80_map = le16_to_cpu(he_cap->he_mcs_nss_supp.rx_mcs_80);
@@ -555,23 +563,23 @@ void ieee80211_sta_set_rx_nss(struct sta_info *sta, unsigned int link_id)
 			he_rx_nss = rx_mcs_80;
 	}
 
-	if (sta->sta.link[link_id]->ht_cap.ht_supported) {
-		if (sta->sta.link[link_id]->ht_cap.mcs.rx_mask[0])
+	if (link_sta->pub->ht_cap.ht_supported) {
+		if (link_sta->pub->ht_cap.mcs.rx_mask[0])
 			ht_rx_nss++;
-		if (sta->sta.link[link_id]->ht_cap.mcs.rx_mask[1])
+		if (link_sta->pub->ht_cap.mcs.rx_mask[1])
 			ht_rx_nss++;
-		if (sta->sta.link[link_id]->ht_cap.mcs.rx_mask[2])
+		if (link_sta->pub->ht_cap.mcs.rx_mask[2])
 			ht_rx_nss++;
-		if (sta->sta.link[link_id]->ht_cap.mcs.rx_mask[3])
+		if (link_sta->pub->ht_cap.mcs.rx_mask[3])
 			ht_rx_nss++;
 		/* FIXME: consider rx_highest? */
 	}
 
-	if (sta->sta.link[link_id]->vht_cap.vht_supported) {
+	if (link_sta->pub->vht_cap.vht_supported) {
 		int i;
 		u16 rx_mcs_map;
 
-		rx_mcs_map = le16_to_cpu(sta->sta.link[link_id]->vht_cap.vht_mcs.rx_mcs_map);
+		rx_mcs_map = le16_to_cpu(link_sta->pub->vht_cap.vht_mcs.rx_mcs_map);
 
 		for (i = 7; i >= 0; i--) {
 			u8 mcs = (rx_mcs_map >> (2 * i)) & 3;
@@ -587,11 +595,11 @@ void ieee80211_sta_set_rx_nss(struct sta_info *sta, unsigned int link_id)
 	rx_nss = max(vht_rx_nss, ht_rx_nss);
 	rx_nss = max(he_rx_nss, rx_nss);
 	rx_nss = max(eht_rx_nss, rx_nss);
-	sta->sta.link[link_id]->rx_nss = max_t(u8, 1, rx_nss);
+	link_sta->pub->rx_nss = max_t(u8, 1, rx_nss);
 }
 
 u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
-				  struct sta_info *sta, unsigned int link_id,
+				  struct link_sta_info *link_sta,
 				  u8 opmode, enum nl80211_band band)
 {
 	enum ieee80211_sta_rx_bandwidth new_bw;
@@ -607,8 +615,8 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
 	nss >>= IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT;
 	nss += 1;
 
-	if (sta->sta.link[link_id]->rx_nss != nss) {
-		sta->sta.link[link_id]->rx_nss = nss;
+	if (link_sta->pub->rx_nss != nss) {
+		link_sta->pub->rx_nss = nss;
 		sta_opmode.rx_nss = nss;
 		changed |= IEEE80211_RC_NSS_CHANGED;
 		sta_opmode.changed |= STA_OPMODE_N_SS_CHANGED;
@@ -617,34 +625,34 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
 	switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) {
 	case IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ:
 		/* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */
-		sta->link[link_id]->cur_max_bandwidth = IEEE80211_STA_RX_BW_20;
+		link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_20;
 		break;
 	case IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ:
 		/* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */
-		sta->link[link_id]->cur_max_bandwidth = IEEE80211_STA_RX_BW_40;
+		link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_40;
 		break;
 	case IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ:
 		if (opmode & IEEE80211_OPMODE_NOTIF_BW_160_80P80)
-			sta->link[link_id]->cur_max_bandwidth = IEEE80211_STA_RX_BW_160;
+			link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160;
 		else
-			sta->link[link_id]->cur_max_bandwidth = IEEE80211_STA_RX_BW_80;
+			link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80;
 		break;
 	case IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ:
 		/* legacy only, no longer used by newer spec */
-		sta->link[link_id]->cur_max_bandwidth = IEEE80211_STA_RX_BW_160;
+		link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160;
 		break;
 	}
 
-	new_bw = ieee80211_sta_cur_vht_bw(sta, link_id);
-	if (new_bw != sta->sta.link[link_id]->bandwidth) {
-		sta->sta.link[link_id]->bandwidth = new_bw;
-		sta_opmode.bw = ieee80211_sta_rx_bw_to_chan_width(sta, link_id);
+	new_bw = ieee80211_sta_cur_vht_bw(link_sta);
+	if (new_bw != link_sta->pub->bandwidth) {
+		link_sta->pub->bandwidth = new_bw;
+		sta_opmode.bw = ieee80211_sta_rx_bw_to_chan_width(link_sta);
 		changed |= IEEE80211_RC_BW_CHANGED;
 		sta_opmode.changed |= STA_OPMODE_MAX_BW_CHANGED;
 	}
 
 	if (sta_opmode.changed)
-		cfg80211_sta_opmode_change_notify(sdata->dev, sta->addr,
+		cfg80211_sta_opmode_change_notify(sdata->dev, link_sta->addr,
 						  &sta_opmode, GFP_KERNEL);
 
 	return changed;
@@ -689,18 +697,19 @@ void ieee80211_update_mu_groups(struct ieee80211_vif *vif, unsigned int link_id,
 EXPORT_SYMBOL_GPL(ieee80211_update_mu_groups);
 
 void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
-				 struct sta_info *sta, unsigned int link_id,
+				 struct link_sta_info *link_sta,
 				 u8 opmode, enum nl80211_band band)
 {
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band];
 
-	u32 changed = __ieee80211_vht_handle_opmode(sdata, sta, link_id,
+	u32 changed = __ieee80211_vht_handle_opmode(sdata, link_sta,
 						    opmode, band);
 
 	if (changed > 0) {
 		ieee80211_recalc_min_chandef(sdata);
-		rate_control_rate_update(local, sband, sta, link_id, changed);
+		rate_control_rate_update(local, sband, link_sta->sta,
+					 link_sta->link_id, changed);
 	}
 }
 
-- 
cgit 


From efbabc11650040c64884ff3019b88c7bcc0ceb1d Mon Sep 17 00:00:00 2001
From: Veerendranath Jakkam <quic_vjakkam@quicinc.com>
Date: Wed, 8 Jun 2022 14:46:37 +0530
Subject: cfg80211: Indicate MLO connection info in connect and roam callbacks

The MLO links used for connection with an MLD AP are decided by the
driver in case of SME offloaded to driver.

Add support for the drivers to indicate the information of links used
for MLO connection in connect and roam callbacks, update the connected
links information in wdev from connect/roam result sent by driver.
Also, send the connected links information to userspace.

Add a netlink flag attribute to indicate that userspace supports
handling of MLO connection. Drivers must not do MLO connection when this
flag is not set. This is to maintain backwards compatibility with older
supplicant versions which doesn't have support for MLO connection.

Signed-off-by: Veerendranath Jakkam <quic_vjakkam@quicinc.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath6kl/cfg80211.c         |   2 +-
 drivers/net/wireless/ath/wil6210/wmi.c             |   4 +-
 .../broadcom/brcm80211/brcmfmac/cfg80211.c         |   6 +-
 drivers/net/wireless/rndis_wlan.c                  |   5 +-
 drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c  |   4 +-
 drivers/staging/wlan-ng/cfg80211.c                 |   2 +-
 include/net/cfg80211.h                             |  84 +++--
 include/uapi/linux/nl80211.h                       |   6 +
 net/wireless/mlme.c                                |   4 +-
 net/wireless/nl80211.c                             | 120 ++++++-
 net/wireless/sme.c                                 | 391 +++++++++++++++------
 11 files changed, 480 insertions(+), 148 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c
index 33ed54738d47..e11c7e9accc0 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
@@ -807,7 +807,7 @@ void ath6kl_cfg80211_connect_event(struct ath6kl_vif *vif, u16 channel,
 		cfg80211_put_bss(ar->wiphy, bss);
 	} else if (vif->sme_state == SME_CONNECTED) {
 		struct cfg80211_roam_info roam_info = {
-			.bss = bss,
+			.links[0].bss = bss,
 			.req_ie = assoc_req_ie,
 			.req_ie_len = assoc_req_len,
 			.resp_ie = assoc_resp_ie,
diff --git a/drivers/net/wireless/ath/wil6210/wmi.c b/drivers/net/wireless/ath/wil6210/wmi.c
index 98b4c189eecc..ea7bd403e706 100644
--- a/drivers/net/wireless/ath/wil6210/wmi.c
+++ b/drivers/net/wireless/ath/wil6210/wmi.c
@@ -1822,8 +1822,8 @@ wmi_evt_reassoc_status(struct wil6210_vif *vif, int id, void *d, int len)
 	freq = ieee80211_channel_to_frequency(ch, NL80211_BAND_60GHZ);
 
 	memset(&info, 0, sizeof(info));
-	info.channel = ieee80211_get_channel(wiphy, freq);
-	info.bss = vif->bss;
+	info.links[0].channel = ieee80211_get_channel(wiphy, freq);
+	info.links[0].bss = vif->bss;
 	info.req_ie = assoc_req_ie;
 	info.req_ie_len = assoc_req_ie_len;
 	info.resp_ie = assoc_resp_ie;
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
index 11e1f07f83e0..3ae6779fe153 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
@@ -6017,8 +6017,8 @@ brcmf_bss_roaming_done(struct brcmf_cfg80211_info *cfg,
 done:
 	kfree(buf);
 
-	roam_info.channel = notify_channel;
-	roam_info.bssid = profile->bssid;
+	roam_info.links[0].channel = notify_channel;
+	roam_info.links[0].bssid = profile->bssid;
 	roam_info.req_ie = conn_info->req_ie;
 	roam_info.req_ie_len = conn_info->req_ie_len;
 	roam_info.resp_ie = conn_info->resp_ie;
@@ -6061,7 +6061,7 @@ brcmf_bss_connect_done(struct brcmf_cfg80211_info *cfg,
 		} else {
 			conn_params.status = WLAN_STATUS_AUTH_TIMEOUT;
 		}
-		conn_params.bssid = profile->bssid;
+		conn_params.links[0].bssid = profile->bssid;
 		conn_params.req_ie = conn_info->req_ie;
 		conn_params.req_ie_len = conn_info->req_ie_len;
 		conn_params.resp_ie = conn_info->resp_ie;
diff --git a/drivers/net/wireless/rndis_wlan.c b/drivers/net/wireless/rndis_wlan.c
index ff2448394a1e..05524291d60c 100644
--- a/drivers/net/wireless/rndis_wlan.c
+++ b/drivers/net/wireless/rndis_wlan.c
@@ -2813,8 +2813,9 @@ static void rndis_wlan_do_link_up_work(struct usbnet *usbdev)
 						resp_ie_len, 0, GFP_KERNEL);
 		} else {
 			struct cfg80211_roam_info roam_info = {
-				.channel = get_current_channel(usbdev, NULL),
-				.bssid = bssid,
+				.links[0].channel =
+					get_current_channel(usbdev, NULL),
+				.links[0].bssid = bssid,
 				.req_ie = req_ie,
 				.req_ie_len = req_ie_len,
 				.resp_ie = resp_ie,
diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c
index 349aa3c4b668..cf35125b7891 100644
--- a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c
+++ b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c
@@ -450,8 +450,8 @@ check_bss:
 
 		notify_channel = ieee80211_get_channel(wiphy, freq);
 
-		roam_info.channel = notify_channel;
-		roam_info.bssid = cur_network->network.mac_address;
+		roam_info.links[0].channel = notify_channel;
+		roam_info.links[0].bssid = cur_network->network.mac_address;
 		roam_info.req_ie =
 			pmlmepriv->assoc_req+sizeof(struct ieee80211_hdr_3addr)+2;
 		roam_info.req_ie_len =
diff --git a/drivers/staging/wlan-ng/cfg80211.c b/drivers/staging/wlan-ng/cfg80211.c
index 87379edce9a8..b7b56d8406d1 100644
--- a/drivers/staging/wlan-ng/cfg80211.c
+++ b/drivers/staging/wlan-ng/cfg80211.c
@@ -645,7 +645,7 @@ void prism2_disconnected(struct wlandevice *wlandev)
 void prism2_roamed(struct wlandevice *wlandev)
 {
 	struct cfg80211_roam_info roam_info = {
-		.bssid = wlandev->bssid,
+		.links[0].bssid = wlandev->bssid,
 	};
 
 	cfg80211_roamed(wlandev->netdev, &roam_info, GFP_KERNEL);
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 5706f96b819a..996782c44838 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2763,6 +2763,9 @@ struct cfg80211_assoc_link {
  *	request (connect callback).
  * @ASSOC_REQ_DISABLE_HE:  Disable HE
  * @ASSOC_REQ_DISABLE_EHT:  Disable EHT
+ * @CONNECT_REQ_MLO_SUPPORT: Userspace indicates support for handling MLD links.
+ *	Drivers shall disable MLO features for the current association if this
+ *	flag is not set.
  */
 enum cfg80211_assoc_req_flags {
 	ASSOC_REQ_DISABLE_HT			= BIT(0),
@@ -2771,6 +2774,7 @@ enum cfg80211_assoc_req_flags {
 	CONNECT_REQ_EXTERNAL_AUTH_SUPPORT	= BIT(3),
 	ASSOC_REQ_DISABLE_HE			= BIT(4),
 	ASSOC_REQ_DISABLE_EHT			= BIT(5),
+	CONNECT_REQ_MLO_SUPPORT			= BIT(6),
 };
 
 /**
@@ -5780,12 +5784,13 @@ static inline void WARN_INVALID_LINK_ID(struct wireless_dev *wdev,
 		!(wdev->valid_links & BIT(link_id)));
 }
 
-#define for_each_valid_link(wdev, link_id)					\
-	for (link_id = 0;							\
-	     link_id < ((wdev)->valid_links ? ARRAY_SIZE((wdev)->links) : 1);	\
-	     link_id++)								\
-		if (!(wdev)->valid_links ||					\
-		    ((wdev)->valid_links & BIT(link_id)))
+#define for_each_valid_link(link_info, link_id)			\
+	for (link_id = 0;					\
+	     link_id < ((link_info)->valid_links ?		\
+			ARRAY_SIZE((link_info)->links) : 1);	\
+	     link_id++)						\
+		if (!(link_info)->valid_links ||		\
+		    ((link_info)->valid_links & BIT(link_id)))
 
 /**
  * DOC: Utility functions
@@ -7296,13 +7301,6 @@ struct cfg80211_fils_resp_params {
  *	indicate that this is a failure, but without a status code.
  *	@timeout_reason is used to report the reason for the timeout in that
  *	case.
- * @bssid: The BSSID of the AP (may be %NULL)
- * @bss: Entry of bss to which STA got connected to, can be obtained through
- *	cfg80211_get_bss() (may be %NULL). But it is recommended to store the
- *	bss from the connect_request and hold a reference to it and return
- *	through this param to avoid a warning if the bss is expired during the
- *	connection, esp. for those drivers implementing connect op.
- *	Only one parameter among @bssid and @bss needs to be specified.
  * @req_ie: Association request IEs (may be %NULL)
  * @req_ie_len: Association request IEs length
  * @resp_ie: Association response IEs (may be %NULL)
@@ -7314,17 +7312,41 @@ struct cfg80211_fils_resp_params {
  *	not known. This value is used only if @status < 0 to indicate that the
  *	failure is due to a timeout and not due to explicit rejection by the AP.
  *	This value is ignored in other cases (@status >= 0).
+ * @valid_links: For MLO connection, BIT mask of the valid link ids. Otherwise
+ *	zero.
+ * @ap_mld_addr: For MLO connection, MLD address of the AP. Otherwise %NULL.
+ * @links : For MLO connection, contains link info for the valid links indicated
+ *	using @valid_links. For non-MLO connection, links[0] contains the
+ *	connected AP info.
+ * @links.addr: For MLO connection, MAC address of the STA link. Otherwise
+ *	%NULL.
+ * @links.bssid: For MLO connection, MAC address of the AP link. For non-MLO
+ *	connection, links[0].bssid points to the BSSID of the AP (may be %NULL).
+ * @links.bss: For MLO connection, entry of bss to which STA link is connected.
+ *	For non-MLO connection, links[0].bss points to entry of bss to which STA
+ *	is connected. It can be obtained through cfg80211_get_bss() (may be
+ *	%NULL). It is recommended to store the bss from the connect_request and
+ *	hold a reference to it and return through this param to avoid a warning
+ *	if the bss is expired during the connection, esp. for those drivers
+ *	implementing connect op. Only one parameter among @bssid and @bss needs
+ *	to be specified.
  */
 struct cfg80211_connect_resp_params {
 	int status;
-	const u8 *bssid;
-	struct cfg80211_bss *bss;
 	const u8 *req_ie;
 	size_t req_ie_len;
 	const u8 *resp_ie;
 	size_t resp_ie_len;
 	struct cfg80211_fils_resp_params fils;
 	enum nl80211_timeout_reason timeout_reason;
+
+	const u8 *ap_mld_addr;
+	u16 valid_links;
+	struct {
+		const u8 *addr;
+		const u8 *bssid;
+		struct cfg80211_bss *bss;
+	} links[IEEE80211_MLD_MAX_NUM_LINKS];
 };
 
 /**
@@ -7394,8 +7416,8 @@ cfg80211_connect_bss(struct net_device *dev, const u8 *bssid,
 
 	memset(&params, 0, sizeof(params));
 	params.status = status;
-	params.bssid = bssid;
-	params.bss = bss;
+	params.links[0].bssid = bssid;
+	params.links[0].bss = bss;
 	params.req_ie = req_ie;
 	params.req_ie_len = req_ie_len;
 	params.resp_ie = resp_ie;
@@ -7466,24 +7488,40 @@ cfg80211_connect_timeout(struct net_device *dev, const u8 *bssid,
 /**
  * struct cfg80211_roam_info - driver initiated roaming information
  *
- * @channel: the channel of the new AP
- * @bss: entry of bss to which STA got roamed (may be %NULL if %bssid is set)
- * @bssid: the BSSID of the new AP (may be %NULL if %bss is set)
  * @req_ie: association request IEs (maybe be %NULL)
  * @req_ie_len: association request IEs length
  * @resp_ie: association response IEs (may be %NULL)
  * @resp_ie_len: assoc response IEs length
  * @fils: FILS related roaming information.
+ * @valid_links: For MLO roaming, BIT mask of the new valid links is set.
+ *	Otherwise zero.
+ * @ap_mld_addr: For MLO roaming, MLD address of the new AP. Otherwise %NULL.
+ * @links : For MLO roaming, contains new link info for the valid links set in
+ *	@valid_links. For non-MLO roaming, links[0] contains the new AP info.
+ * @links.addr: For MLO roaming, MAC address of the STA link. Otherwise %NULL.
+ * @links.bssid: For MLO roaming, MAC address of the new AP link. For non-MLO
+ *	roaming, links[0].bssid points to the BSSID of the new AP. May be
+ *	%NULL if %links.bss is set.
+ * @links.channel: the channel of the new AP.
+ * @links.bss: For MLO roaming, entry of new bss to which STA link got
+ *	roamed. For non-MLO roaming, links[0].bss points to entry of bss to
+ *	which STA got roamed (may be %NULL if %links.bssid is set)
  */
 struct cfg80211_roam_info {
-	struct ieee80211_channel *channel;
-	struct cfg80211_bss *bss;
-	const u8 *bssid;
 	const u8 *req_ie;
 	size_t req_ie_len;
 	const u8 *resp_ie;
 	size_t resp_ie_len;
 	struct cfg80211_fils_resp_params fils;
+
+	const u8 *ap_mld_addr;
+	u16 valid_links;
+	struct {
+		const u8 *addr;
+		const u8 *bssid;
+		struct ieee80211_channel *channel;
+		struct cfg80211_bss *bss;
+	} links[IEEE80211_MLD_MAX_NUM_LINKS];
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 60ad9a9f153d..89f64f46b98d 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2690,6 +2690,10 @@ enum nl80211_commands {
  * @NL80211_ATTR_MLD_ADDR: An MLD address, used with various commands such as
  *	authenticate/associate.
  *
+ * @NL80211_ATTR_MLO_SUPPORT: Flag attribute to indicate user space supports MLO
+ *	connection. Used with %NL80211_CMD_CONNECT. If this attribute is not
+ *	included in NL80211_CMD_CONNECT drivers must not perform MLO connection.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -3208,6 +3212,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_MLO_LINK_ID,
 	NL80211_ATTR_MLD_ADDR,
 
+	NL80211_ATTR_MLO_SUPPORT,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index d92eed0e52cd..8a84cf77667c 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -42,8 +42,8 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss,
 
 	memset(&cr, 0, sizeof(cr));
 	cr.status = (int)le16_to_cpu(mgmt->u.assoc_resp.status_code);
-	cr.bssid = mgmt->bssid;
-	cr.bss = bss;
+	cr.links[0].bssid = mgmt->bssid;
+	cr.links[0].bss = bss;
 	cr.req_ie = req_ies;
 	cr.req_ie_len = req_ies_len;
 	cr.resp_ie = resp_ie;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index aca799b9971e..6a45801c783c 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -797,6 +797,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_MLO_LINK_ID] =
 		NLA_POLICY_RANGE(NLA_U8, 0, IEEE80211_MLD_MAX_NUM_LINKS),
 	[NL80211_ATTR_MLD_ADDR] = NLA_POLICY_EXACT_LEN(ETH_ALEN),
+	[NL80211_ATTR_MLO_SUPPORT] = { .type = NLA_FLAG },
 };
 
 /* policy for the key attributes */
@@ -11529,6 +11530,9 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
 		connect.flags |= CONNECT_REQ_EXTERNAL_AUTH_SUPPORT;
 	}
 
+	if (nla_get_flag(info->attrs[NL80211_ATTR_MLO_SUPPORT]))
+		connect.flags |= CONNECT_REQ_MLO_SUPPORT;
+
 	wdev_lock(dev->ieee80211_ptr);
 
 	err = cfg80211_connect(rdev, dev, &connect, connkeys,
@@ -17304,10 +17308,29 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
 {
 	struct sk_buff *msg;
 	void *hdr;
+	unsigned int link;
+	size_t link_info_size = 0;
+	const u8 *connected_addr = cr->valid_links ?
+				   cr->ap_mld_addr : cr->links[0].bssid;
+
+	if (cr->valid_links) {
+		for_each_valid_link(cr, link) {
+			/* Nested attribute header */
+			link_info_size += NLA_HDRLEN;
+			/* Link ID */
+			link_info_size += nla_total_size(sizeof(u8));
+			link_info_size += cr->links[link].addr ?
+					  nla_total_size(ETH_ALEN) : 0;
+			link_info_size += (cr->links[link].bssid ||
+					   cr->links[link].bss) ?
+					  nla_total_size(ETH_ALEN) : 0;
+		}
+	}
 
 	msg = nlmsg_new(100 + cr->req_ie_len + cr->resp_ie_len +
 			cr->fils.kek_len + cr->fils.pmk_len +
-			(cr->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp);
+			(cr->fils.pmkid ? WLAN_PMKID_LEN : 0) + link_info_size,
+			gfp);
 	if (!msg)
 		return;
 
@@ -17319,8 +17342,8 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
 
 	if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
 	    nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
-	    (cr->bssid &&
-	     nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, cr->bssid)) ||
+	    (connected_addr &&
+	     nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, connected_addr)) ||
 	    nla_put_u16(msg, NL80211_ATTR_STATUS_CODE,
 			cr->status < 0 ? WLAN_STATUS_UNSPECIFIED_FAILURE :
 			cr->status) ||
@@ -17346,6 +17369,38 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
 	       nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, cr->fils.pmkid)))))
 		goto nla_put_failure;
 
+	if (cr->valid_links) {
+		int i = 1;
+		struct nlattr *nested;
+
+		nested = nla_nest_start(msg, NL80211_ATTR_MLO_LINKS);
+		if (!nested)
+			goto nla_put_failure;
+
+		for_each_valid_link(cr, link) {
+			struct nlattr *nested_mlo_links;
+			const u8 *bssid = cr->links[link].bss ?
+					  cr->links[link].bss->bssid :
+					  cr->links[link].bssid;
+
+			nested_mlo_links = nla_nest_start(msg, i);
+			if (!nested_mlo_links)
+				goto nla_put_failure;
+
+			if (nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link) ||
+			    (bssid &&
+			     nla_put(msg, NL80211_ATTR_BSSID, ETH_ALEN, bssid)) ||
+			    (cr->links[link].addr &&
+			     nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN,
+				     cr->links[link].addr)))
+				goto nla_put_failure;
+
+			nla_nest_end(msg, nested_mlo_links);
+			i++;
+		}
+		nla_nest_end(msg, nested);
+	}
+
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
@@ -17362,11 +17417,32 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
 {
 	struct sk_buff *msg;
 	void *hdr;
-	const u8 *bssid = info->bss ? info->bss->bssid : info->bssid;
+	size_t link_info_size = 0;
+	unsigned int link;
+	const u8 *connected_addr = info->ap_mld_addr ?
+				   info->ap_mld_addr :
+				   (info->links[0].bss ?
+				    info->links[0].bss->bssid :
+				    info->links[0].bssid);
+
+	if (info->valid_links) {
+		for_each_valid_link(info, link) {
+			/* Nested attribute header */
+			link_info_size += NLA_HDRLEN;
+			/* Link ID */
+			link_info_size += nla_total_size(sizeof(u8));
+			link_info_size += info->links[link].addr ?
+					  nla_total_size(ETH_ALEN) : 0;
+			link_info_size += (info->links[link].bssid ||
+					   info->links[link].bss) ?
+					  nla_total_size(ETH_ALEN) : 0;
+		}
+	}
 
 	msg = nlmsg_new(100 + info->req_ie_len + info->resp_ie_len +
 			info->fils.kek_len + info->fils.pmk_len +
-			(info->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp);
+			(info->fils.pmkid ? WLAN_PMKID_LEN : 0) +
+			link_info_size, gfp);
 	if (!msg)
 		return;
 
@@ -17378,7 +17454,7 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
 
 	if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
 	    nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
-	    nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid) ||
+	    nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, connected_addr) ||
 	    (info->req_ie &&
 	     nla_put(msg, NL80211_ATTR_REQ_IE, info->req_ie_len,
 		     info->req_ie)) ||
@@ -17397,6 +17473,38 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
 	     nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, info->fils.pmkid)))
 		goto nla_put_failure;
 
+	if (info->valid_links) {
+		int i = 1;
+		struct nlattr *nested;
+
+		nested = nla_nest_start(msg, NL80211_ATTR_MLO_LINKS);
+		if (!nested)
+			goto nla_put_failure;
+
+		for_each_valid_link(info, link) {
+			struct nlattr *nested_mlo_links;
+			const u8 *bssid = info->links[link].bss ?
+					  info->links[link].bss->bssid :
+					  info->links[link].bssid;
+
+			nested_mlo_links = nla_nest_start(msg, i);
+			if (!nested_mlo_links)
+				goto nla_put_failure;
+
+			if (nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link) ||
+			    (bssid &&
+			     nla_put(msg, NL80211_ATTR_BSSID, ETH_ALEN, bssid)) ||
+			    (info->links[link].addr &&
+			     nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN,
+				     info->links[link].addr)))
+				goto nla_put_failure;
+
+			nla_nest_end(msg, nested_mlo_links);
+			i++;
+		}
+		nla_nest_end(msg, nested);
+	}
+
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index b3c6ce4f85ee..00be498aab2e 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -275,7 +275,7 @@ void cfg80211_conn_work(struct work_struct *work)
 
 			memset(&cr, 0, sizeof(cr));
 			cr.status = -1;
-			cr.bssid = bssid;
+			cr.links[0].bssid = bssid;
 			cr.timeout_reason = treason;
 			__cfg80211_connect_result(wdev->netdev, &cr, false);
 		}
@@ -384,7 +384,7 @@ void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len)
 
 		memset(&cr, 0, sizeof(cr));
 		cr.status = status_code;
-		cr.bssid = mgmt->bssid;
+		cr.links[0].bssid = mgmt->bssid;
 		cr.timeout_reason = NL80211_TIMEOUT_UNSPECIFIED;
 		__cfg80211_connect_result(wdev->netdev, &cr, false);
 	} else if (wdev->conn->state == CFG80211_CONN_AUTHENTICATING) {
@@ -698,6 +698,20 @@ static void disconnect_work(struct work_struct *work)
 
 DECLARE_WORK(cfg80211_disconnect_work, disconnect_work);
 
+static void
+cfg80211_connect_result_release_bsses(struct wireless_dev *wdev,
+				      struct cfg80211_connect_resp_params *cr)
+{
+	unsigned int link;
+
+	for_each_valid_link(cr, link) {
+		if (!cr->links[link].bss)
+			continue;
+		cfg80211_unhold_bss(bss_from_pub(cr->links[link].bss));
+		cfg80211_put_bss(wdev->wiphy, cr->links[link].bss);
+	}
+}
+
 /*
  * API calls for drivers implementing connect/disconnect and
  * SME event handling
@@ -715,21 +729,33 @@ void __cfg80211_connect_result(struct net_device *dev,
 #ifdef CONFIG_CFG80211_WEXT
 	union iwreq_data wrqu;
 #endif
+	unsigned int link;
+	const u8 *connected_addr;
+	bool bss_not_found = false;
 
 	ASSERT_WDEV_LOCK(wdev);
 
 	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
-		    wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) {
-		cfg80211_put_bss(wdev->wiphy, cr->bss);
-		return;
+		    wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
+		goto out;
+
+	if (cr->valid_links) {
+		if (WARN_ON(!cr->ap_mld_addr))
+			goto out;
+
+		for_each_valid_link(cr, link) {
+			if (WARN_ON(!cr->links[link].addr))
+				goto out;
+		}
 	}
 
 	wdev->unprot_beacon_reported = 0;
 	nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev, cr,
 				    GFP_KERNEL);
+	connected_addr = cr->valid_links ? cr->ap_mld_addr : cr->links[0].bssid;
 
 #ifdef CONFIG_CFG80211_WEXT
-	if (wextev) {
+	if (wextev && !cr->valid_links) {
 		if (cr->req_ie && cr->status == WLAN_STATUS_SUCCESS) {
 			memset(&wrqu, 0, sizeof(wrqu));
 			wrqu.data.length = cr->req_ie_len;
@@ -746,23 +772,38 @@ void __cfg80211_connect_result(struct net_device *dev,
 
 		memset(&wrqu, 0, sizeof(wrqu));
 		wrqu.ap_addr.sa_family = ARPHRD_ETHER;
-		if (cr->bssid && cr->status == WLAN_STATUS_SUCCESS) {
-			memcpy(wrqu.ap_addr.sa_data, cr->bssid, ETH_ALEN);
-			memcpy(wdev->wext.prev_bssid, cr->bssid, ETH_ALEN);
+		if (connected_addr && cr->status == WLAN_STATUS_SUCCESS) {
+			memcpy(wrqu.ap_addr.sa_data, connected_addr, ETH_ALEN);
+			memcpy(wdev->wext.prev_bssid, connected_addr, ETH_ALEN);
 			wdev->wext.prev_bssid_valid = true;
 		}
 		wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL);
 	}
 #endif
 
-	if (!cr->bss && (cr->status == WLAN_STATUS_SUCCESS)) {
-		WARN_ON_ONCE(!wiphy_to_rdev(wdev->wiphy)->ops->connect);
-		cr->bss = cfg80211_get_bss(wdev->wiphy, NULL, cr->bssid,
-					   wdev->u.client.ssid, wdev->u.client.ssid_len,
-					   wdev->conn_bss_type,
-					   IEEE80211_PRIVACY_ANY);
-		if (cr->bss)
-			cfg80211_hold_bss(bss_from_pub(cr->bss));
+	if (cr->status == WLAN_STATUS_SUCCESS) {
+		for_each_valid_link(cr, link) {
+			if (WARN_ON_ONCE(!cr->links[link].bss))
+				break;
+		}
+
+		for_each_valid_link(cr, link) {
+			if (cr->links[link].bss)
+				continue;
+
+			cr->links[link].bss =
+				cfg80211_get_bss(wdev->wiphy, NULL,
+						 cr->links[link].bssid,
+						 wdev->u.client.ssid,
+						 wdev->u.client.ssid_len,
+						 wdev->conn_bss_type,
+						 IEEE80211_PRIVACY_ANY);
+			if (!cr->links[link].bss) {
+				bss_not_found = true;
+				break;
+			}
+			cfg80211_hold_bss(bss_from_pub(cr->links[link].bss));
+		}
 	}
 
 	cfg80211_wdev_release_bsses(wdev);
@@ -772,26 +813,40 @@ void __cfg80211_connect_result(struct net_device *dev,
 		wdev->connect_keys = NULL;
 		wdev->u.client.ssid_len = 0;
 		wdev->conn_owner_nlportid = 0;
-		if (cr->bss) {
-			cfg80211_unhold_bss(bss_from_pub(cr->bss));
-			cfg80211_put_bss(wdev->wiphy, cr->bss);
-		}
+		cfg80211_connect_result_release_bsses(wdev, cr);
 		cfg80211_sme_free(wdev);
 		return;
 	}
 
-	if (WARN_ON(!cr->bss))
+	if (WARN_ON(bss_not_found)) {
+		cfg80211_connect_result_release_bsses(wdev, cr);
 		return;
+	}
 
-	wdev->links[0].client.current_bss = bss_from_pub(cr->bss);
+	memset(wdev->links, 0, sizeof(wdev->links));
+	wdev->valid_links = cr->valid_links;
+	for_each_valid_link(cr, link)
+		wdev->links[link].client.current_bss =
+			bss_from_pub(cr->links[link].bss);
 	wdev->connected = true;
-	ether_addr_copy(wdev->u.client.connected_addr, cr->bss->bssid);
+	ether_addr_copy(wdev->u.client.connected_addr, connected_addr);
+	if (cr->valid_links) {
+		for_each_valid_link(cr, link)
+			memcpy(wdev->links[link].addr, cr->links[link].addr,
+			       ETH_ALEN);
+	}
 
 	if (!(wdev->wiphy->flags & WIPHY_FLAG_HAS_STATIC_WEP))
 		cfg80211_upload_connect_keys(wdev);
 
 	rcu_read_lock();
-	country_elem = ieee80211_bss_get_elem(cr->bss, WLAN_EID_COUNTRY);
+	for_each_valid_link(cr, link) {
+		country_elem =
+			ieee80211_bss_get_elem(cr->links[link].bss,
+					       WLAN_EID_COUNTRY);
+		if (country_elem)
+			break;
+	}
 	if (!country_elem) {
 		rcu_read_unlock();
 		return;
@@ -804,12 +859,60 @@ void __cfg80211_connect_result(struct net_device *dev,
 	if (!country_data)
 		return;
 
-	regulatory_hint_country_ie(wdev->wiphy, cr->bss->channel->band,
+	regulatory_hint_country_ie(wdev->wiphy,
+				   cr->links[link].bss->channel->band,
 				   country_data, country_datalen);
 	kfree(country_data);
+
+	return;
+out:
+	for_each_valid_link(cr, link)
+		cfg80211_put_bss(wdev->wiphy, cr->links[link].bss);
 }
 
-/* Consumes bss object one way or another */
+static void cfg80211_update_link_bss(struct wireless_dev *wdev,
+				     struct cfg80211_bss **bss)
+{
+	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+	struct cfg80211_internal_bss *ibss;
+
+	if (!*bss)
+		return;
+
+	ibss = bss_from_pub(*bss);
+	if (list_empty(&ibss->list)) {
+		struct cfg80211_bss *found = NULL, *tmp = *bss;
+
+		found = cfg80211_get_bss(wdev->wiphy, NULL,
+					 (*bss)->bssid,
+					 wdev->u.client.ssid,
+					 wdev->u.client.ssid_len,
+					 wdev->conn_bss_type,
+					 IEEE80211_PRIVACY_ANY);
+		if (found) {
+			/* The same BSS is already updated so use it
+			 * instead, as it has latest info.
+			 */
+			*bss = found;
+		} else {
+			/* Update with BSS provided by driver, it will
+			 * be freshly added and ref cnted, we can free
+			 * the old one.
+			 *
+			 * signal_valid can be false, as we are not
+			 * expecting the BSS to be found.
+			 *
+			 * keep the old timestamp to avoid confusion
+			 */
+			cfg80211_bss_update(rdev, ibss, false,
+					    ibss->ts);
+		}
+
+		cfg80211_put_bss(wdev->wiphy, tmp);
+	}
+}
+
+/* Consumes bss object(s) one way or another */
 void cfg80211_connect_done(struct net_device *dev,
 			   struct cfg80211_connect_resp_params *params,
 			   gfp_t gfp)
@@ -819,55 +922,34 @@ void cfg80211_connect_done(struct net_device *dev,
 	struct cfg80211_event *ev;
 	unsigned long flags;
 	u8 *next;
+	size_t link_info_size = 0;
+	unsigned int link;
 
-	if (params->bss) {
-		struct cfg80211_internal_bss *ibss = bss_from_pub(params->bss);
-
-		if (list_empty(&ibss->list)) {
-			struct cfg80211_bss *found = NULL, *tmp = params->bss;
-
-			found = cfg80211_get_bss(wdev->wiphy, NULL,
-						 params->bss->bssid,
-						 wdev->u.client.ssid, wdev->u.client.ssid_len,
-						 wdev->conn_bss_type,
-						 IEEE80211_PRIVACY_ANY);
-			if (found) {
-				/* The same BSS is already updated so use it
-				 * instead, as it has latest info.
-				 */
-				params->bss = found;
-			} else {
-				/* Update with BSS provided by driver, it will
-				 * be freshly added and ref cnted, we can free
-				 * the old one.
-				 *
-				 * signal_valid can be false, as we are not
-				 * expecting the BSS to be found.
-				 *
-				 * keep the old timestamp to avoid confusion
-				 */
-				cfg80211_bss_update(rdev, ibss, false,
-						    ibss->ts);
-			}
-
-			cfg80211_put_bss(wdev->wiphy, tmp);
-		}
+	for_each_valid_link(params, link) {
+		cfg80211_update_link_bss(wdev, &params->links[link].bss);
+		link_info_size += params->links[link].bssid ? ETH_ALEN : 0;
+		link_info_size += params->links[link].addr ? ETH_ALEN : 0;
 	}
 
-	ev = kzalloc(sizeof(*ev) + (params->bssid ? ETH_ALEN : 0) +
+	ev = kzalloc(sizeof(*ev) + (params->ap_mld_addr ? ETH_ALEN : 0) +
 		     params->req_ie_len + params->resp_ie_len +
 		     params->fils.kek_len + params->fils.pmk_len +
-		     (params->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp);
+		     (params->fils.pmkid ? WLAN_PMKID_LEN : 0) + link_info_size,
+		     gfp);
+
 	if (!ev) {
-		cfg80211_put_bss(wdev->wiphy, params->bss);
+		for_each_valid_link(params, link)
+			cfg80211_put_bss(wdev->wiphy,
+					 params->links[link].bss);
 		return;
 	}
 
 	ev->type = EVENT_CONNECT_RESULT;
 	next = ((u8 *)ev) + sizeof(*ev);
-	if (params->bssid) {
-		ev->cr.bssid = next;
-		memcpy((void *)ev->cr.bssid, params->bssid, ETH_ALEN);
+	if (params->ap_mld_addr) {
+		ev->cr.ap_mld_addr = next;
+		memcpy((void *)ev->cr.ap_mld_addr, params->ap_mld_addr,
+		       ETH_ALEN);
 		next += ETH_ALEN;
 	}
 	if (params->req_ie_len) {
@@ -907,9 +989,28 @@ void cfg80211_connect_done(struct net_device *dev,
 	ev->cr.fils.update_erp_next_seq_num = params->fils.update_erp_next_seq_num;
 	if (params->fils.update_erp_next_seq_num)
 		ev->cr.fils.erp_next_seq_num = params->fils.erp_next_seq_num;
-	if (params->bss)
-		cfg80211_hold_bss(bss_from_pub(params->bss));
-	ev->cr.bss = params->bss;
+	ev->cr.valid_links = params->valid_links;
+	for_each_valid_link(params, link) {
+		if (params->links[link].bss)
+			cfg80211_hold_bss(
+				bss_from_pub(params->links[link].bss));
+		ev->cr.links[link].bss = params->links[link].bss;
+
+		if (params->links[link].addr) {
+			ev->cr.links[link].addr = next;
+			memcpy((void *)ev->cr.links[link].addr,
+			       params->links[link].addr,
+			       ETH_ALEN);
+			next += ETH_ALEN;
+		}
+		if (params->links[link].bssid) {
+			ev->cr.links[link].bssid = next;
+			memcpy((void *)ev->cr.links[link].bssid,
+			       params->links[link].bssid,
+			       ETH_ALEN);
+			next += ETH_ALEN;
+		}
+	}
 	ev->cr.status = params->status;
 	ev->cr.timeout_reason = params->timeout_reason;
 
@@ -927,6 +1028,9 @@ void __cfg80211_roamed(struct wireless_dev *wdev,
 #ifdef CONFIG_CFG80211_WEXT
 	union iwreq_data wrqu;
 #endif
+	unsigned int link;
+	const u8 *connected_addr;
+
 	ASSERT_WDEV_LOCK(wdev);
 
 	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
@@ -936,48 +1040,76 @@ void __cfg80211_roamed(struct wireless_dev *wdev,
 	if (WARN_ON(!wdev->connected))
 		goto out;
 
+	if (info->valid_links) {
+		if (WARN_ON(!info->ap_mld_addr))
+			goto out;
+
+		for_each_valid_link(info, link) {
+			if (WARN_ON(!info->links[link].addr))
+				goto out;
+		}
+	}
+
 	cfg80211_wdev_release_bsses(wdev);
 
-	if (WARN_ON(!info->bss))
-		return;
+	for_each_valid_link(info, link) {
+		if (WARN_ON(!info->links[link].bss))
+			goto out;
+	}
 
-	cfg80211_hold_bss(bss_from_pub(info->bss));
-	wdev->links[0].client.current_bss = bss_from_pub(info->bss);
-	ether_addr_copy(wdev->u.client.connected_addr, info->bss->bssid);
+	memset(wdev->links, 0, sizeof(wdev->links));
+	wdev->valid_links = info->valid_links;
+	for_each_valid_link(info, link) {
+		cfg80211_hold_bss(bss_from_pub(info->links[link].bss));
+		wdev->links[link].client.current_bss =
+			bss_from_pub(info->links[link].bss);
+	}
 
+	connected_addr = info->valid_links ?
+			 info->ap_mld_addr :
+			 info->links[0].bss->bssid;
+	ether_addr_copy(wdev->u.client.connected_addr, connected_addr);
+	if (info->valid_links) {
+		for_each_valid_link(info, link)
+			memcpy(wdev->links[link].addr, info->links[link].addr,
+			       ETH_ALEN);
+	}
 	wdev->unprot_beacon_reported = 0;
 	nl80211_send_roamed(wiphy_to_rdev(wdev->wiphy),
 			    wdev->netdev, info, GFP_KERNEL);
 
 #ifdef CONFIG_CFG80211_WEXT
-	if (info->req_ie) {
-		memset(&wrqu, 0, sizeof(wrqu));
-		wrqu.data.length = info->req_ie_len;
-		wireless_send_event(wdev->netdev, IWEVASSOCREQIE,
-				    &wrqu, info->req_ie);
-	}
+	if (!info->valid_links) {
+		if (info->req_ie) {
+			memset(&wrqu, 0, sizeof(wrqu));
+			wrqu.data.length = info->req_ie_len;
+			wireless_send_event(wdev->netdev, IWEVASSOCREQIE,
+					    &wrqu, info->req_ie);
+		}
+
+		if (info->resp_ie) {
+			memset(&wrqu, 0, sizeof(wrqu));
+			wrqu.data.length = info->resp_ie_len;
+			wireless_send_event(wdev->netdev, IWEVASSOCRESPIE,
+					    &wrqu, info->resp_ie);
+		}
 
-	if (info->resp_ie) {
 		memset(&wrqu, 0, sizeof(wrqu));
-		wrqu.data.length = info->resp_ie_len;
-		wireless_send_event(wdev->netdev, IWEVASSOCRESPIE,
-				    &wrqu, info->resp_ie);
+		wrqu.ap_addr.sa_family = ARPHRD_ETHER;
+		memcpy(wrqu.ap_addr.sa_data, connected_addr, ETH_ALEN);
+		memcpy(wdev->wext.prev_bssid, connected_addr, ETH_ALEN);
+		wdev->wext.prev_bssid_valid = true;
+		wireless_send_event(wdev->netdev, SIOCGIWAP, &wrqu, NULL);
 	}
-
-	memset(&wrqu, 0, sizeof(wrqu));
-	wrqu.ap_addr.sa_family = ARPHRD_ETHER;
-	memcpy(wrqu.ap_addr.sa_data, info->bss->bssid, ETH_ALEN);
-	memcpy(wdev->wext.prev_bssid, info->bss->bssid, ETH_ALEN);
-	wdev->wext.prev_bssid_valid = true;
-	wireless_send_event(wdev->netdev, SIOCGIWAP, &wrqu, NULL);
 #endif
 
 	return;
 out:
-	cfg80211_put_bss(wdev->wiphy, info->bss);
+	for_each_valid_link(info, link)
+		cfg80211_put_bss(wdev->wiphy, info->links[link].bss);
 }
 
-/* Consumes info->bss object one way or another */
+/* Consumes info->links.bss object(s) one way or another */
 void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info,
 		     gfp_t gfp)
 {
@@ -986,25 +1118,41 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info,
 	struct cfg80211_event *ev;
 	unsigned long flags;
 	u8 *next;
+	unsigned int link;
+	size_t link_info_size = 0;
+	bool bss_not_found = false;
+
+	for_each_valid_link(info, link) {
+		link_info_size += info->links[link].addr ? ETH_ALEN : 0;
+		link_info_size += info->links[link].bssid ? ETH_ALEN : 0;
 
-	if (!info->bss) {
-		info->bss = cfg80211_get_bss(wdev->wiphy, info->channel,
-					     info->bssid, wdev->u.client.ssid,
-					     wdev->u.client.ssid_len,
-					     wdev->conn_bss_type,
-					     IEEE80211_PRIVACY_ANY);
+		if (info->links[link].bss)
+			continue;
+
+		info->links[link].bss =
+			cfg80211_get_bss(wdev->wiphy,
+					 info->links[link].channel,
+					 info->links[link].bssid,
+					 wdev->u.client.ssid,
+					 wdev->u.client.ssid_len,
+					 wdev->conn_bss_type,
+					 IEEE80211_PRIVACY_ANY);
+
+		if (!info->links[link].bss) {
+			bss_not_found = true;
+			break;
+		}
 	}
 
-	if (WARN_ON(!info->bss))
-		return;
+	if (WARN_ON(bss_not_found))
+		goto out;
 
 	ev = kzalloc(sizeof(*ev) + info->req_ie_len + info->resp_ie_len +
 		     info->fils.kek_len + info->fils.pmk_len +
-		     (info->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp);
-	if (!ev) {
-		cfg80211_put_bss(wdev->wiphy, info->bss);
-		return;
-	}
+		     (info->fils.pmkid ? WLAN_PMKID_LEN : 0) +
+		     (info->ap_mld_addr ? ETH_ALEN : 0) + link_info_size, gfp);
+	if (!ev)
+		goto out;
 
 	ev->type = EVENT_ROAMED;
 	next = ((u8 *)ev) + sizeof(*ev);
@@ -1044,12 +1192,43 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info,
 	ev->rm.fils.update_erp_next_seq_num = info->fils.update_erp_next_seq_num;
 	if (info->fils.update_erp_next_seq_num)
 		ev->rm.fils.erp_next_seq_num = info->fils.erp_next_seq_num;
-	ev->rm.bss = info->bss;
+	if (info->ap_mld_addr) {
+		ev->rm.ap_mld_addr = next;
+		memcpy((void *)ev->rm.ap_mld_addr, info->ap_mld_addr,
+		       ETH_ALEN);
+		next += ETH_ALEN;
+	}
+	ev->rm.valid_links = info->valid_links;
+	for_each_valid_link(info, link) {
+		ev->rm.links[link].bss = info->links[link].bss;
+
+		if (info->links[link].addr) {
+			ev->rm.links[link].addr = next;
+			memcpy((void *)ev->rm.links[link].addr,
+			       info->links[link].addr,
+			       ETH_ALEN);
+			next += ETH_ALEN;
+		}
+
+		if (info->links[link].bssid) {
+			ev->rm.links[link].bssid = next;
+			memcpy((void *)ev->rm.links[link].bssid,
+			       info->links[link].bssid,
+			       ETH_ALEN);
+			next += ETH_ALEN;
+		}
+	}
 
 	spin_lock_irqsave(&wdev->event_lock, flags);
 	list_add_tail(&ev->list, &wdev->event_list);
 	spin_unlock_irqrestore(&wdev->event_lock, flags);
 	queue_work(cfg80211_wq, &rdev->event_work);
+
+	return;
+out:
+	for_each_valid_link(info, link)
+		cfg80211_put_bss(wdev->wiphy, info->links[link].bss);
+
 }
 EXPORT_SYMBOL(cfg80211_roamed);
 
-- 
cgit 


From 394fb16954793d387e375b645d44fcfd8602bfda Mon Sep 17 00:00:00 2001
From: Luca Weiss <luca.weiss@fairphone.com>
Date: Wed, 25 May 2022 16:43:59 +0200
Subject: dt-bindings: interconnect: Add Qualcomm SM6350 NoC support

Add bindings for Qualcomm SM6350 Network-On-Chip interconnect devices.

As SM6350 has two pairs of NoCs sharing the same reg, allow this in the
binding documentation, as was done for qcm2290.

Signed-off-by: Luca Weiss <luca.weiss@fairphone.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20220525144404.200390-4-luca.weiss@fairphone.com
Signed-off-by: Georgi Djakov <djakov@kernel.org>
---
 .../bindings/interconnect/qcom,sm6350-rpmh.yaml    |  82 ++++++++++++
 include/dt-bindings/interconnect/qcom,sm6350.h     | 148 +++++++++++++++++++++
 2 files changed, 230 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/interconnect/qcom,sm6350-rpmh.yaml
 create mode 100644 include/dt-bindings/interconnect/qcom,sm6350.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/interconnect/qcom,sm6350-rpmh.yaml b/Documentation/devicetree/bindings/interconnect/qcom,sm6350-rpmh.yaml
new file mode 100644
index 000000000000..49eb156b08e0
--- /dev/null
+++ b/Documentation/devicetree/bindings/interconnect/qcom,sm6350-rpmh.yaml
@@ -0,0 +1,82 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/interconnect/qcom,sm6350-rpmh.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm SM6350 RPMh Network-On-Chip Interconnect
+
+maintainers:
+  - Luca Weiss <luca.weiss@fairphone.com>
+
+description:
+  Qualcomm RPMh-based interconnect provider on SM6350.
+
+allOf:
+  - $ref: qcom,rpmh-common.yaml#
+
+properties:
+  compatible:
+    enum:
+      - qcom,sm6350-aggre1-noc
+      - qcom,sm6350-aggre2-noc
+      - qcom,sm6350-config-noc
+      - qcom,sm6350-dc-noc
+      - qcom,sm6350-gem-noc
+      - qcom,sm6350-mmss-noc
+      - qcom,sm6350-npu-noc
+      - qcom,sm6350-system-noc
+
+  reg:
+    maxItems: 1
+
+  '#interconnect-cells': true
+
+patternProperties:
+  '^interconnect-[a-z0-9\-]+$':
+    type: object
+    description:
+      The interconnect providers do not have a separate QoS register space,
+      but share parent's space.
+    $ref: qcom,rpmh-common.yaml#
+
+    properties:
+      compatible:
+        enum:
+          - qcom,sm6350-clk-virt
+          - qcom,sm6350-compute-noc
+
+      '#interconnect-cells': true
+
+    required:
+      - compatible
+
+    unevaluatedProperties: false
+
+required:
+  - compatible
+  - reg
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    config_noc: interconnect@1500000 {
+        compatible = "qcom,sm6350-config-noc";
+        reg = <0x01500000 0x28000>;
+        #interconnect-cells = <2>;
+        qcom,bcm-voters = <&apps_bcm_voter>;
+    };
+
+    system_noc: interconnect@1620000 {
+        compatible = "qcom,sm6350-system-noc";
+        reg = <0x01620000 0x17080>;
+        #interconnect-cells = <2>;
+        qcom,bcm-voters = <&apps_bcm_voter>;
+
+        clk_virt: interconnect-clk-virt {
+            compatible = "qcom,sm6350-clk-virt";
+            #interconnect-cells = <2>;
+            qcom,bcm-voters = <&apps_bcm_voter>;
+        };
+    };
diff --git a/include/dt-bindings/interconnect/qcom,sm6350.h b/include/dt-bindings/interconnect/qcom,sm6350.h
new file mode 100644
index 000000000000..e662cede9aaa
--- /dev/null
+++ b/include/dt-bindings/interconnect/qcom,sm6350.h
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Qualcomm SM6350 interconnect IDs
+ *
+ * Copyright (C) 2022 Luca Weiss <luca.weiss@fairphone.com>
+ */
+
+#ifndef __DT_BINDINGS_INTERCONNECT_QCOM_SM6350_H
+#define __DT_BINDINGS_INTERCONNECT_QCOM_SM6350_H
+
+#define MASTER_A1NOC_CFG		0
+#define MASTER_QUP_0			1
+#define MASTER_EMMC			2
+#define MASTER_UFS_MEM			3
+#define A1NOC_SNOC_SLV			4
+#define SLAVE_SERVICE_A1NOC		5
+
+#define MASTER_A2NOC_CFG		0
+#define MASTER_QDSS_BAM			1
+#define MASTER_QUP_1			2
+#define MASTER_CRYPTO_CORE_0		3
+#define MASTER_IPA			4
+#define MASTER_QDSS_ETR			5
+#define MASTER_SDCC_2			6
+#define MASTER_USB3			7
+#define A2NOC_SNOC_SLV			8
+#define SLAVE_SERVICE_A2NOC		9
+
+#define MASTER_CAMNOC_HF0_UNCOMP	0
+#define MASTER_CAMNOC_ICP_UNCOMP	1
+#define MASTER_CAMNOC_SF_UNCOMP		2
+#define MASTER_QUP_CORE_0		3
+#define MASTER_QUP_CORE_1		4
+#define MASTER_LLCC			5
+#define SLAVE_CAMNOC_UNCOMP		6
+#define SLAVE_QUP_CORE_0		7
+#define SLAVE_QUP_CORE_1		8
+#define SLAVE_EBI_CH0			9
+
+#define MASTER_NPU			0
+#define MASTER_NPU_PROC			1
+#define SLAVE_CDSP_GEM_NOC		2
+
+#define SNOC_CNOC_MAS			0
+#define MASTER_QDSS_DAP			1
+#define SLAVE_A1NOC_CFG			2
+#define SLAVE_A2NOC_CFG			3
+#define SLAVE_AHB2PHY			4
+#define SLAVE_AHB2PHY_2			5
+#define SLAVE_AOSS			6
+#define SLAVE_BOOT_ROM			7
+#define SLAVE_CAMERA_CFG		8
+#define SLAVE_CAMERA_NRT_THROTTLE_CFG	9
+#define SLAVE_CAMERA_RT_THROTTLE_CFG	10
+#define SLAVE_CLK_CTL			11
+#define SLAVE_RBCPR_CX_CFG		12
+#define SLAVE_RBCPR_MX_CFG		13
+#define SLAVE_CRYPTO_0_CFG		14
+#define SLAVE_DCC_CFG			15
+#define SLAVE_CNOC_DDRSS		16
+#define SLAVE_DISPLAY_CFG		17
+#define SLAVE_DISPLAY_THROTTLE_CFG	18
+#define SLAVE_EMMC_CFG			19
+#define SLAVE_GLM			20
+#define SLAVE_GRAPHICS_3D_CFG		21
+#define SLAVE_IMEM_CFG			22
+#define SLAVE_IPA_CFG			23
+#define SLAVE_CNOC_MNOC_CFG		24
+#define SLAVE_CNOC_MSS			25
+#define SLAVE_NPU_CFG			26
+#define SLAVE_PDM			27
+#define SLAVE_PIMEM_CFG			28
+#define SLAVE_PRNG			29
+#define SLAVE_QDSS_CFG			30
+#define SLAVE_QM_CFG			31
+#define SLAVE_QM_MPU_CFG		32
+#define SLAVE_QUP_0			33
+#define SLAVE_QUP_1			34
+#define SLAVE_SDCC_2			35
+#define SLAVE_SECURITY			36
+#define SLAVE_SNOC_CFG			37
+#define SLAVE_TCSR			38
+#define SLAVE_UFS_MEM_CFG		39
+#define SLAVE_USB3			40
+#define SLAVE_VENUS_CFG			41
+#define SLAVE_VENUS_THROTTLE_CFG	42
+#define SLAVE_VSENSE_CTRL_CFG		43
+#define SLAVE_SERVICE_CNOC		44
+
+#define MASTER_CNOC_DC_NOC		0
+#define SLAVE_GEM_NOC_CFG		1
+#define SLAVE_LLCC_CFG			2
+
+#define MASTER_AMPSS_M0			0
+#define MASTER_SYS_TCU			1
+#define MASTER_GEM_NOC_CFG		2
+#define MASTER_COMPUTE_NOC		3
+#define MASTER_MNOC_HF_MEM_NOC		4
+#define MASTER_MNOC_SF_MEM_NOC		5
+#define MASTER_SNOC_GC_MEM_NOC		6
+#define MASTER_SNOC_SF_MEM_NOC		7
+#define MASTER_GRAPHICS_3D		8
+#define SLAVE_MCDMA_MS_MPU_CFG		9
+#define SLAVE_MSS_PROC_MS_MPU_CFG	10
+#define SLAVE_GEM_NOC_SNOC		11
+#define SLAVE_LLCC			12
+#define SLAVE_SERVICE_GEM_NOC		13
+
+#define MASTER_CNOC_MNOC_CFG		0
+#define MASTER_VIDEO_P0			1
+#define MASTER_VIDEO_PROC		2
+#define MASTER_CAMNOC_HF		3
+#define MASTER_CAMNOC_ICP		4
+#define MASTER_CAMNOC_SF		5
+#define MASTER_MDP_PORT0		6
+#define SLAVE_MNOC_HF_MEM_NOC		7
+#define SLAVE_MNOC_SF_MEM_NOC		8
+#define SLAVE_SERVICE_MNOC		9
+
+#define MASTER_NPU_SYS			0
+#define MASTER_NPU_NOC_CFG		1
+#define SLAVE_NPU_CAL_DP0		2
+#define SLAVE_NPU_CP			3
+#define SLAVE_NPU_INT_DMA_BWMON_CFG	4
+#define SLAVE_NPU_DPM			5
+#define SLAVE_ISENSE_CFG		6
+#define SLAVE_NPU_LLM_CFG		7
+#define SLAVE_NPU_TCM			8
+#define SLAVE_NPU_COMPUTE_NOC		9
+#define SLAVE_SERVICE_NPU_NOC		10
+
+#define MASTER_SNOC_CFG			0
+#define A1NOC_SNOC_MAS			1
+#define A2NOC_SNOC_MAS			2
+#define MASTER_GEM_NOC_SNOC		3
+#define MASTER_PIMEM			4
+#define MASTER_GIC			5
+#define SLAVE_APPSS			6
+#define SNOC_CNOC_SLV			7
+#define SLAVE_SNOC_GEM_NOC_GC		8
+#define SLAVE_SNOC_GEM_NOC_SF		9
+#define SLAVE_OCIMEM			10
+#define SLAVE_PIMEM			11
+#define SLAVE_SERVICE_SNOC		12
+#define SLAVE_QDSS_STM			13
+#define SLAVE_TCU			14
+
+#endif
-- 
cgit 


From cd268e309c29e9a8b15a47f684d848c1d57fe150 Mon Sep 17 00:00:00 2001
From: David Virag <virag.david003@gmail.com>
Date: Thu, 2 Jun 2022 01:37:39 +0200
Subject: dt-bindings: clock: Add bindings for Exynos7885 CMU_FSYS

CMU_FSYS clock domain provides clocks for MMC (MMC_CARD, MMC_EMBD,
MMC_SDIO), and USB30DRD.

Add clock indices and bindings documentation for CMU_FSYS domain.

Signed-off-by: David Virag <virag.david003@gmail.com>
Reviewed-by: Sylwester Nawrocki <s.nawrocki@samsung.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20220601233743.56317-2-virag.david003@gmail.com
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 .../bindings/clock/samsung,exynos7885-clock.yaml   | 27 +++++++++++++++++++
 include/dt-bindings/clock/exynos7885.h             | 31 +++++++++++++++++++++-
 2 files changed, 57 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/samsung,exynos7885-clock.yaml b/Documentation/devicetree/bindings/clock/samsung,exynos7885-clock.yaml
index 5073e569a47f..006d33a9e0f1 100644
--- a/Documentation/devicetree/bindings/clock/samsung,exynos7885-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/samsung,exynos7885-clock.yaml
@@ -33,6 +33,7 @@ properties:
     enum:
       - samsung,exynos7885-cmu-top
       - samsung,exynos7885-cmu-core
+      - samsung,exynos7885-cmu-fsys
       - samsung,exynos7885-cmu-peri
 
   clocks:
@@ -88,6 +89,32 @@ allOf:
             - const: dout_core_cci
             - const: dout_core_g3d
 
+  - if:
+      properties:
+        compatible:
+          contains:
+            const: samsung,exynos7885-cmu-fsys
+
+    then:
+      properties:
+        clocks:
+          items:
+            - description: External reference clock (26 MHz)
+            - description: CMU_FSYS bus clock (from CMU_TOP)
+            - description: MMC_CARD clock (from CMU_TOP)
+            - description: MMC_EMBD clock (from CMU_TOP)
+            - description: MMC_SDIO clock (from CMU_TOP)
+            - description: USB30DRD clock (from CMU_TOP)
+
+        clock-names:
+          items:
+            - const: oscclk
+            - const: dout_fsys_bus
+            - const: dout_fsys_mmc_card
+            - const: dout_fsys_mmc_embd
+            - const: dout_fsys_mmc_sdio
+            - const: dout_fsys_usb30drd
+
   - if:
       properties:
         compatible:
diff --git a/include/dt-bindings/clock/exynos7885.h b/include/dt-bindings/clock/exynos7885.h
index 1f8701691d62..d2e1483f93e4 100644
--- a/include/dt-bindings/clock/exynos7885.h
+++ b/include/dt-bindings/clock/exynos7885.h
@@ -54,7 +54,22 @@
 #define CLK_GOUT_PERI_USI0		43
 #define CLK_GOUT_PERI_USI1		44
 #define CLK_GOUT_PERI_USI2		45
-#define TOP_NR_CLK			46
+#define CLK_MOUT_FSYS_BUS		46
+#define CLK_MOUT_FSYS_MMC_CARD		47
+#define CLK_MOUT_FSYS_MMC_EMBD		48
+#define CLK_MOUT_FSYS_MMC_SDIO		49
+#define CLK_MOUT_FSYS_USB30DRD		50
+#define CLK_DOUT_FSYS_BUS		51
+#define CLK_DOUT_FSYS_MMC_CARD		52
+#define CLK_DOUT_FSYS_MMC_EMBD		53
+#define CLK_DOUT_FSYS_MMC_SDIO		54
+#define CLK_DOUT_FSYS_USB30DRD		55
+#define CLK_GOUT_FSYS_BUS		56
+#define CLK_GOUT_FSYS_MMC_CARD		57
+#define CLK_GOUT_FSYS_MMC_EMBD		58
+#define CLK_GOUT_FSYS_MMC_SDIO		59
+#define CLK_GOUT_FSYS_USB30DRD		60
+#define TOP_NR_CLK			61
 
 /* CMU_CORE */
 #define CLK_MOUT_CORE_BUS_USER		1
@@ -112,4 +127,18 @@
 #define CLK_GOUT_WDT1_PCLK		43
 #define PERI_NR_CLK			44
 
+/* CMU_FSYS */
+#define CLK_MOUT_FSYS_BUS_USER		1
+#define CLK_MOUT_FSYS_MMC_CARD_USER	2
+#define CLK_MOUT_FSYS_MMC_EMBD_USER	3
+#define CLK_MOUT_FSYS_MMC_SDIO_USER	4
+#define CLK_MOUT_FSYS_USB30DRD_USER	4
+#define CLK_GOUT_MMC_CARD_ACLK		5
+#define CLK_GOUT_MMC_CARD_SDCLKIN	6
+#define CLK_GOUT_MMC_EMBD_ACLK		7
+#define CLK_GOUT_MMC_EMBD_SDCLKIN	8
+#define CLK_GOUT_MMC_SDIO_ACLK		9
+#define CLK_GOUT_MMC_SDIO_SDCLKIN	10
+#define FSYS_NR_CLK			11
+
 #endif /* _DT_BINDINGS_CLOCK_EXYNOS_7885_H */
-- 
cgit 


From e756e932a3a16418cd8bad757b028bfb337b4a51 Mon Sep 17 00:00:00 2001
From: David Virag <virag.david003@gmail.com>
Date: Thu, 2 Jun 2022 01:37:40 +0200
Subject: dt-bindings: clock: Add indices for Exynos7885 TREX clocks

TREX D Core and P core clocks seem to be related to the BTS (Bus Traffic
Shaper) inside the Exynos7885 SoC, and are needed for the SoC to
function correctly.

Add indices for these clocks.

Signed-off-by: David Virag <virag.david003@gmail.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20220601233743.56317-3-virag.david003@gmail.com
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 include/dt-bindings/clock/exynos7885.h | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/exynos7885.h b/include/dt-bindings/clock/exynos7885.h
index d2e1483f93e4..8256e7430b63 100644
--- a/include/dt-bindings/clock/exynos7885.h
+++ b/include/dt-bindings/clock/exynos7885.h
@@ -72,14 +72,21 @@
 #define TOP_NR_CLK			61
 
 /* CMU_CORE */
-#define CLK_MOUT_CORE_BUS_USER		1
-#define CLK_MOUT_CORE_CCI_USER		2
-#define CLK_MOUT_CORE_G3D_USER		3
-#define CLK_MOUT_CORE_GIC		4
-#define CLK_DOUT_CORE_BUSP		5
-#define CLK_GOUT_CCI_ACLK		6
-#define CLK_GOUT_GIC400_CLK		7
-#define CORE_NR_CLK			8
+#define CLK_MOUT_CORE_BUS_USER			1
+#define CLK_MOUT_CORE_CCI_USER			2
+#define CLK_MOUT_CORE_G3D_USER			3
+#define CLK_MOUT_CORE_GIC			4
+#define CLK_DOUT_CORE_BUSP			5
+#define CLK_GOUT_CCI_ACLK			6
+#define CLK_GOUT_GIC400_CLK			7
+#define CLK_GOUT_TREX_D_CORE_ACLK		8
+#define CLK_GOUT_TREX_D_CORE_GCLK		9
+#define CLK_GOUT_TREX_D_CORE_PCLK		10
+#define CLK_GOUT_TREX_P_CORE_ACLK_P_CORE	11
+#define CLK_GOUT_TREX_P_CORE_CCLK_P_CORE	12
+#define CLK_GOUT_TREX_P_CORE_PCLK		13
+#define CLK_GOUT_TREX_P_CORE_PCLK_P_CORE	14
+#define CORE_NR_CLK				15
 
 /* CMU_PERI */
 #define CLK_MOUT_PERI_BUS_USER		1
-- 
cgit 


From 04919bed948dc22a0032a9da867b7dcb8aece4ca Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Wed, 15 Jun 2022 09:20:11 -0700
Subject: tcp: Introduce tcp_read_skb()

This patch inroduces tcp_read_skb() based on tcp_read_sock(),
a preparation for the next patch which actually introduces
a new sock ops.

TCP is special here, because it has tcp_read_sock() which is
mainly used by splice(). tcp_read_sock() supports partial read
and arbitrary offset, neither of them is needed for sockmap.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20220615162014.89193-2-xiyou.wangcong@gmail.com
---
 include/net/tcp.h |  2 ++
 net/ipv4/tcp.c    | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index c21a9b516f1e..7547d90fbb57 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -672,6 +672,8 @@ void tcp_get_info(struct sock *, struct tcp_info *);
 /* Read 'sendfile()'-style from a TCP socket */
 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 		  sk_read_actor_t recv_actor);
+int tcp_read_skb(struct sock *sk, read_descriptor_t *desc,
+		 sk_read_actor_t recv_actor);
 
 void tcp_initialize_rcv_mss(struct sock *sk);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f7309452bdce..124f384f8695 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1734,6 +1734,53 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 }
 EXPORT_SYMBOL(tcp_read_sock);
 
+int tcp_read_skb(struct sock *sk, read_descriptor_t *desc,
+		 sk_read_actor_t recv_actor)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 seq = tp->copied_seq;
+	struct sk_buff *skb;
+	int copied = 0;
+	u32 offset;
+
+	if (sk->sk_state == TCP_LISTEN)
+		return -ENOTCONN;
+
+	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
+		int used;
+
+		__skb_unlink(skb, &sk->sk_receive_queue);
+		used = recv_actor(desc, skb, 0, skb->len);
+		if (used <= 0) {
+			if (!copied)
+				copied = used;
+			break;
+		}
+		seq += used;
+		copied += used;
+
+		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
+			consume_skb(skb);
+			++seq;
+			break;
+		}
+		consume_skb(skb);
+		if (!desc->count)
+			break;
+		WRITE_ONCE(tp->copied_seq, seq);
+	}
+	WRITE_ONCE(tp->copied_seq, seq);
+
+	tcp_rcv_space_adjust(sk);
+
+	/* Clean up data we have read: This will do ACK frames. */
+	if (copied > 0)
+		tcp_cleanup_rbuf(sk, copied);
+
+	return copied;
+}
+EXPORT_SYMBOL(tcp_read_skb);
+
 int tcp_peek_len(struct socket *sock)
 {
 	return tcp_inq(sock->sk);
-- 
cgit 


From 965b57b469a589d64d81b1688b38dcb537011bb0 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Wed, 15 Jun 2022 09:20:12 -0700
Subject: net: Introduce a new proto_ops ->read_skb()

Currently both splice() and sockmap use ->read_sock() to
read skb from receive queue, but for sockmap we only read
one entire skb at a time, so ->read_sock() is too conservative
to use. Introduce a new proto_ops ->read_skb() which supports
this sematic, with this we can finally pass the ownership of
skb to recv actors.

For non-TCP protocols, all ->read_sock() can be simply
converted to ->read_skb().

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20220615162014.89193-3-xiyou.wangcong@gmail.com
---
 include/linux/net.h |  4 ++++
 include/net/tcp.h   |  3 +--
 include/net/udp.h   |  3 +--
 net/core/skmsg.c    | 20 +++++---------------
 net/ipv4/af_inet.c  |  3 ++-
 net/ipv4/tcp.c      |  9 +++------
 net/ipv4/udp.c      | 10 ++++------
 net/ipv6/af_inet6.c |  3 ++-
 net/unix/af_unix.c  | 23 +++++++++--------------
 9 files changed, 31 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/include/linux/net.h b/include/linux/net.h
index 12093f4db50c..a03485e8cbb2 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -152,6 +152,8 @@ struct module;
 struct sk_buff;
 typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *,
 			       unsigned int, size_t);
+typedef int (*skb_read_actor_t)(struct sock *, struct sk_buff *);
+
 
 struct proto_ops {
 	int		family;
@@ -214,6 +216,8 @@ struct proto_ops {
 	 */
 	int		(*read_sock)(struct sock *sk, read_descriptor_t *desc,
 				     sk_read_actor_t recv_actor);
+	/* This is different from read_sock(), it reads an entire skb at a time. */
+	int		(*read_skb)(struct sock *sk, skb_read_actor_t recv_actor);
 	int		(*sendpage_locked)(struct sock *sk, struct page *page,
 					   int offset, size_t size, int flags);
 	int		(*sendmsg_locked)(struct sock *sk, struct msghdr *msg,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7547d90fbb57..8e48dc56837b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -672,8 +672,7 @@ void tcp_get_info(struct sock *, struct tcp_info *);
 /* Read 'sendfile()'-style from a TCP socket */
 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 		  sk_read_actor_t recv_actor);
-int tcp_read_skb(struct sock *sk, read_descriptor_t *desc,
-		 sk_read_actor_t recv_actor);
+int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
 
 void tcp_initialize_rcv_mss(struct sock *sk);
 
diff --git a/include/net/udp.h b/include/net/udp.h
index b60eea2e3fae..987f7fc7c0aa 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -306,8 +306,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
 			       struct sk_buff *skb);
 struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
 				 __be16 sport, __be16 dport);
-int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
-		  sk_read_actor_t recv_actor);
+int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
 
 /* UDP uses skb->dev_scratch to cache as much information as possible and avoid
  * possibly multiple cache miss on dequeue()
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 7e03f96e441b..f7f63b7d990c 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -1160,21 +1160,17 @@ static void sk_psock_done_strp(struct sk_psock *psock)
 }
 #endif /* CONFIG_BPF_STREAM_PARSER */
 
-static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
-				 unsigned int offset, size_t orig_len)
+static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb)
 {
-	struct sock *sk = (struct sock *)desc->arg.data;
 	struct sk_psock *psock;
 	struct bpf_prog *prog;
 	int ret = __SK_DROP;
-	int len = orig_len;
+	int len = skb->len;
 
 	/* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */
 	skb = skb_clone(skb, GFP_ATOMIC);
-	if (!skb) {
-		desc->error = -ENOMEM;
+	if (!skb)
 		return 0;
-	}
 
 	rcu_read_lock();
 	psock = sk_psock(sk);
@@ -1204,16 +1200,10 @@ out:
 static void sk_psock_verdict_data_ready(struct sock *sk)
 {
 	struct socket *sock = sk->sk_socket;
-	read_descriptor_t desc;
 
-	if (unlikely(!sock || !sock->ops || !sock->ops->read_sock))
+	if (unlikely(!sock || !sock->ops || !sock->ops->read_skb))
 		return;
-
-	desc.arg.data = sk;
-	desc.error = 0;
-	desc.count = 1;
-
-	sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv);
+	sock->ops->read_skb(sk, sk_psock_verdict_recv);
 }
 
 void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index da81f56fdd1c..7abd652a558f 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1040,6 +1040,7 @@ const struct proto_ops inet_stream_ops = {
 	.sendpage	   = inet_sendpage,
 	.splice_read	   = tcp_splice_read,
 	.read_sock	   = tcp_read_sock,
+	.read_skb	   = tcp_read_skb,
 	.sendmsg_locked    = tcp_sendmsg_locked,
 	.sendpage_locked   = tcp_sendpage_locked,
 	.peek_len	   = tcp_peek_len,
@@ -1067,7 +1068,7 @@ const struct proto_ops inet_dgram_ops = {
 	.setsockopt	   = sock_common_setsockopt,
 	.getsockopt	   = sock_common_getsockopt,
 	.sendmsg	   = inet_sendmsg,
-	.read_sock	   = udp_read_sock,
+	.read_skb	   = udp_read_skb,
 	.recvmsg	   = inet_recvmsg,
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = inet_sendpage,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 124f384f8695..9d2fd3ced21b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1734,8 +1734,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 }
 EXPORT_SYMBOL(tcp_read_sock);
 
-int tcp_read_skb(struct sock *sk, read_descriptor_t *desc,
-		 sk_read_actor_t recv_actor)
+int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 seq = tp->copied_seq;
@@ -1750,7 +1749,7 @@ int tcp_read_skb(struct sock *sk, read_descriptor_t *desc,
 		int used;
 
 		__skb_unlink(skb, &sk->sk_receive_queue);
-		used = recv_actor(desc, skb, 0, skb->len);
+		used = recv_actor(sk, skb);
 		if (used <= 0) {
 			if (!copied)
 				copied = used;
@@ -1765,9 +1764,7 @@ int tcp_read_skb(struct sock *sk, read_descriptor_t *desc,
 			break;
 		}
 		consume_skb(skb);
-		if (!desc->count)
-			break;
-		WRITE_ONCE(tp->copied_seq, seq);
+		break;
 	}
 	WRITE_ONCE(tp->copied_seq, seq);
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 6172b4750a88..c660b0bc4d14 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1797,8 +1797,7 @@ busy_check:
 }
 EXPORT_SYMBOL(__skb_recv_udp);
 
-int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
-		  sk_read_actor_t recv_actor)
+int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 {
 	int copied = 0;
 
@@ -1820,7 +1819,7 @@ int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
 			continue;
 		}
 
-		used = recv_actor(desc, skb, 0, skb->len);
+		used = recv_actor(sk, skb);
 		if (used <= 0) {
 			if (!copied)
 				copied = used;
@@ -1831,13 +1830,12 @@ int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
 		}
 
 		kfree_skb(skb);
-		if (!desc->count)
-			break;
+		break;
 	}
 
 	return copied;
 }
-EXPORT_SYMBOL(udp_read_sock);
+EXPORT_SYMBOL(udp_read_skb);
 
 /*
  * 	This should be easy, if there is something there we
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 658823e91eca..0ee0770e79aa 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -702,6 +702,7 @@ const struct proto_ops inet6_stream_ops = {
 	.sendpage_locked   = tcp_sendpage_locked,
 	.splice_read	   = tcp_splice_read,
 	.read_sock	   = tcp_read_sock,
+	.read_skb	   = tcp_read_skb,
 	.peek_len	   = tcp_peek_len,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	   = inet6_compat_ioctl,
@@ -727,7 +728,7 @@ const struct proto_ops inet6_dgram_ops = {
 	.getsockopt	   = sock_common_getsockopt,	/* ok		*/
 	.sendmsg	   = inet6_sendmsg,		/* retpoline's sake */
 	.recvmsg	   = inet6_recvmsg,		/* retpoline's sake */
-	.read_sock	   = udp_read_sock,
+	.read_skb	   = udp_read_skb,
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = sock_no_sendpage,
 	.set_peek_off	   = sk_set_peek_off,
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 3453e0053f76..1bed3739768c 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -741,10 +741,8 @@ static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
 				       unsigned int flags);
 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
-static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
-			  sk_read_actor_t recv_actor);
-static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc,
-				 sk_read_actor_t recv_actor);
+static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
+static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
 static int unix_dgram_connect(struct socket *, struct sockaddr *,
 			      int, int);
 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
@@ -798,7 +796,7 @@ static const struct proto_ops unix_stream_ops = {
 	.shutdown =	unix_shutdown,
 	.sendmsg =	unix_stream_sendmsg,
 	.recvmsg =	unix_stream_recvmsg,
-	.read_sock =	unix_stream_read_sock,
+	.read_skb =	unix_stream_read_skb,
 	.mmap =		sock_no_mmap,
 	.sendpage =	unix_stream_sendpage,
 	.splice_read =	unix_stream_splice_read,
@@ -823,7 +821,7 @@ static const struct proto_ops unix_dgram_ops = {
 	.listen =	sock_no_listen,
 	.shutdown =	unix_shutdown,
 	.sendmsg =	unix_dgram_sendmsg,
-	.read_sock =	unix_read_sock,
+	.read_skb =	unix_read_skb,
 	.recvmsg =	unix_dgram_recvmsg,
 	.mmap =		sock_no_mmap,
 	.sendpage =	sock_no_sendpage,
@@ -2487,8 +2485,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t si
 	return __unix_dgram_recvmsg(sk, msg, size, flags);
 }
 
-static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
-			  sk_read_actor_t recv_actor)
+static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 {
 	int copied = 0;
 
@@ -2503,7 +2500,7 @@ static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
 		if (!skb)
 			return err;
 
-		used = recv_actor(desc, skb, 0, skb->len);
+		used = recv_actor(sk, skb);
 		if (used <= 0) {
 			if (!copied)
 				copied = used;
@@ -2514,8 +2511,7 @@ static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
 		}
 
 		kfree_skb(skb);
-		if (!desc->count)
-			break;
+		break;
 	}
 
 	return copied;
@@ -2650,13 +2646,12 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
 }
 #endif
 
-static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc,
-				 sk_read_actor_t recv_actor)
+static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 {
 	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
 		return -ENOTCONN;
 
-	return unix_read_sock(sk, desc, recv_actor);
+	return unix_read_skb(sk, recv_actor);
 }
 
 static int unix_stream_read_generic(struct unix_stream_read_state *state,
-- 
cgit 


From 414c12385d4741e35d88670c6cc2f40a77809734 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 13 Apr 2022 15:17:25 -0700
Subject: rcu: Provide a get_completed_synchronize_rcu() function

It is currently up to the caller to handle stale return values from
get_state_synchronize_rcu().  If poll_state_synchronize_rcu() returned
true once, a grace period has elapsed, regardless of the fact that counter
wrap might cause some future poll_state_synchronize_rcu() invocation to
return false.  For example, the caller might store a separate flag that
indicates whether some previous call to poll_state_synchronize_rcu()
determined that the relevant grace period had already ended.

This approach works, but it requires extra storage and is easy to get
wrong.  This commit therefore introduces a get_completed_synchronize_rcu()
that returns a cookie that causes poll_state_synchronize_rcu() to always
return true.  This already-completed cookie can be stored in place of the
cookie that previously caused poll_state_synchronize_rcu() to return true.
It can also be used to flag a given structure as not having been exposed
to readers, and thus not requiring a grace period to elapse.

This commit is in preparation for polled expedited grace periods.

Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/
Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing
Cc: Brian Foster <bfoster@redhat.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Ian Kent <raven@themaw.net>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate.h |  1 +
 kernel/rcu/rcu.h         |  3 +++
 kernel/rcu/tiny.c        |  4 ++--
 kernel/rcu/tree.c        |  3 ++-
 kernel/rcu/update.c      | 13 +++++++++++++
 5 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 1a32036c918c..7f12daa4708b 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -41,6 +41,7 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
 void rcu_barrier_tasks(void);
 void rcu_barrier_tasks_rude(void);
 void synchronize_rcu(void);
+unsigned long get_completed_synchronize_rcu(void);
 
 #ifdef CONFIG_PREEMPT_RCU
 
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 0adb55941aeb..32291f4eefde 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -23,6 +23,9 @@
 #define RCU_SEQ_CTR_SHIFT	2
 #define RCU_SEQ_STATE_MASK	((1 << RCU_SEQ_CTR_SHIFT) - 1)
 
+/* Low-order bit definition for polled grace-period APIs. */
+#define RCU_GET_STATE_COMPLETED	0x1
+
 extern int sysctl_sched_rt_runtime;
 
 /*
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 340b3f8b090d..dbee6bea6726 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -58,7 +58,7 @@ void rcu_qs(void)
 		rcu_ctrlblk.donetail = rcu_ctrlblk.curtail;
 		raise_softirq_irqoff(RCU_SOFTIRQ);
 	}
-	WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 1);
+	WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2);
 	local_irq_restore(flags);
 }
 
@@ -213,7 +213,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
  */
 bool poll_state_synchronize_rcu(unsigned long oldstate)
 {
-	return READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate;
+	return oldstate == RCU_GET_STATE_COMPLETED || READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate;
 }
 EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
 
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index ec28e259774e..46cfceea8784 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3924,7 +3924,8 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
  */
 bool poll_state_synchronize_rcu(unsigned long oldstate)
 {
-	if (rcu_seq_done_exact(&rcu_state.gp_seq, oldstate)) {
+	if (oldstate == RCU_GET_STATE_COMPLETED ||
+	    rcu_seq_done_exact(&rcu_state.gp_seq, oldstate)) {
 		smp_mb(); /* Ensure GP ends before subsequent accesses. */
 		return true;
 	}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index fc7fef575606..2e93acad1e31 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -516,6 +516,19 @@ int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls.
 EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot);
 module_param(rcu_cpu_stall_suppress_at_boot, int, 0444);
 
+/**
+ * get_completed_synchronize_rcu - Return a pre-completed polled state cookie
+ *
+ * Returns a value that will always be treated by functions like
+ * poll_state_synchronize_rcu() as a cookie whose grace period has already
+ * completed.
+ */
+unsigned long get_completed_synchronize_rcu(void)
+{
+	return RCU_GET_STATE_COMPLETED;
+}
+EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu);
+
 #ifdef CONFIG_PROVE_RCU
 
 /*
-- 
cgit 


From 3847b64570b1753e9863a4106aec03f4d68e7b17 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 23 May 2022 20:50:11 -0700
Subject: rcu-tasks: Merge state into .b.need_qs and atomically update

This commit gets rid of the task_struct structure's ->trc_reader_checked
field, making it instead be a bit within the task_struct structure's
existing ->trc_reader_special.b.need_qs field.  This commit also
atomically loads, stores, and checks the resulting combination of the
reader-checked and need-quiescent state flags.  This will in turn allow
significant simplification of the rcu_tasks_trace_postgp() function
as well as elimination of the trc_n_readers_need_end counter in later
commits.  These changes will in turn simplify later elimination of the
RCU Tasks Trace scan of the task list, which will make RCU Tasks Trace
grace periods less CPU-intensive.

[ paulmck: Apply kernel test robot feedback. ]

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: KP Singh <kpsingh@kernel.org>
---
 include/linux/rcupdate.h |  18 +++++----
 include/linux/sched.h    |   1 -
 kernel/rcu/tasks.h       | 103 ++++++++++++++++++++++++++++++++---------------
 3 files changed, 82 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 1a32036c918c..1e728d544fc1 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -169,13 +169,17 @@ void synchronize_rcu_tasks(void);
 # endif
 
 # ifdef CONFIG_TASKS_TRACE_RCU
-# define rcu_tasks_trace_qs(t)						\
-	do {								\
-		if (!likely(READ_ONCE((t)->trc_reader_checked)) &&	\
-		    !unlikely(READ_ONCE((t)->trc_reader_nesting))) {	\
-			smp_store_release(&(t)->trc_reader_checked, true); \
-			smp_mb(); /* Readers partitioned by store. */	\
-		}							\
+// Bits for ->trc_reader_special.b.need_qs field.
+#define TRC_NEED_QS		0x1  // Task needs a quiescent state.
+#define TRC_NEED_QS_CHECKED	0x2  // Task has been checked for needing quiescent state.
+
+u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new);
+
+# define rcu_tasks_trace_qs(t)							\
+	do {									\
+		if (likely(!READ_ONCE((t)->trc_reader_special.b.need_qs)) &&	\
+		    likely(!READ_ONCE((t)->trc_reader_nesting)))		\
+			rcu_trc_cmpxchg_need_qs((t), 0,	TRC_NEED_QS_CHECKED);	\
 	} while (0)
 # else
 # define rcu_tasks_trace_qs(t) do { } while (0)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c46f3a63b758..e6eb5871593e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -843,7 +843,6 @@ struct task_struct {
 	int				trc_reader_nesting;
 	int				trc_ipi_to_cpu;
 	union rcu_special		trc_reader_special;
-	bool				trc_reader_checked;
 	struct list_head		trc_holdout_list;
 #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
 
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index bd9f2e24f5c7..7bdc62606816 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -1208,6 +1208,39 @@ void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func);
 DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace,
 		 "RCU Tasks Trace");
 
+/* Load from ->trc_reader_special.b.need_qs with proper ordering. */
+static u8 rcu_ld_need_qs(struct task_struct *t)
+{
+	smp_mb(); // Enforce full grace-period ordering.
+	return smp_load_acquire(&t->trc_reader_special.b.need_qs);
+}
+
+/* Store to ->trc_reader_special.b.need_qs with proper ordering. */
+static void rcu_st_need_qs(struct task_struct *t, u8 v)
+{
+	smp_store_release(&t->trc_reader_special.b.need_qs, v);
+	smp_mb(); // Enforce full grace-period ordering.
+}
+
+/*
+ * Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for
+ * the four-byte operand-size restriction of some platforms.
+ * Returns the old value, which is often ignored.
+ */
+u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new)
+{
+	union rcu_special ret;
+	union rcu_special trs_old = READ_ONCE(t->trc_reader_special);
+	union rcu_special trs_new = trs_old;
+
+	if (trs_old.b.need_qs != old)
+		return trs_old.b.need_qs;
+	trs_new.b.need_qs = new;
+	ret.s = cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s);
+	return ret.b.need_qs;
+}
+EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs);
+
 /*
  * This irq_work handler allows rcu_read_unlock_trace() to be invoked
  * while the scheduler locks are held.
@@ -1221,16 +1254,20 @@ static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw);
 /* If we are the last reader, wake up the grace-period kthread. */
 void rcu_read_unlock_trace_special(struct task_struct *t)
 {
-	int nq = READ_ONCE(t->trc_reader_special.b.need_qs);
+	int nqs = (rcu_ld_need_qs(t) == (TRC_NEED_QS_CHECKED | TRC_NEED_QS));
 
-	if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) &&
-	    t->trc_reader_special.b.need_mb)
+	if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb)
 		smp_mb(); // Pairs with update-side barriers.
 	// Update .need_qs before ->trc_reader_nesting for irq/NMI handlers.
-	if (nq)
-		WRITE_ONCE(t->trc_reader_special.b.need_qs, false);
+	if (nqs) {
+		u8 result = rcu_trc_cmpxchg_need_qs(t, TRC_NEED_QS_CHECKED | TRC_NEED_QS,
+						       TRC_NEED_QS_CHECKED);
+
+		WARN_ONCE(result != (TRC_NEED_QS_CHECKED | TRC_NEED_QS),
+			  "%s: result = %d", __func__, result);
+	}
 	WRITE_ONCE(t->trc_reader_nesting, 0);
-	if (nq && atomic_dec_and_test(&trc_n_readers_need_end))
+	if (nqs && atomic_dec_and_test(&trc_n_readers_need_end))
 		irq_work_queue(&rcu_tasks_trace_iw);
 }
 EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special);
@@ -1260,27 +1297,24 @@ static void trc_read_check_handler(void *t_in)
 	struct task_struct *texp = t_in;
 
 	// If the task is no longer running on this CPU, leave.
-	if (unlikely(texp != t)) {
+	if (unlikely(texp != t))
 		goto reset_ipi; // Already on holdout list, so will check later.
-	}
 
 	// If the task is not in a read-side critical section, and
 	// if this is the last reader, awaken the grace-period kthread.
 	if (likely(!READ_ONCE(t->trc_reader_nesting))) {
-		WRITE_ONCE(t->trc_reader_checked, true);
+		rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
 		goto reset_ipi;
 	}
 	// If we are racing with an rcu_read_unlock_trace(), try again later.
 	if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0))
 		goto reset_ipi;
-	WRITE_ONCE(t->trc_reader_checked, true);
 
 	// Get here if the task is in a read-side critical section.  Set
 	// its state so that it will awaken the grace-period kthread upon
 	// exit from that critical section.
-	atomic_inc(&trc_n_readers_need_end); // One more to wait on.
-	WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs));
-	WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
+	if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED))
+		atomic_inc(&trc_n_readers_need_end); // One more to wait on.
 
 reset_ipi:
 	// Allow future IPIs to be sent on CPU and for task.
@@ -1291,8 +1325,9 @@ reset_ipi:
 }
 
 /* Callback function for scheduler to check locked-down task.  */
-static int trc_inspect_reader(struct task_struct *t, void *arg)
+static int trc_inspect_reader(struct task_struct *t, void *bhp_in)
 {
+	struct list_head *bhp = bhp_in;
 	int cpu = task_cpu(t);
 	int nesting;
 	bool ofl = cpu_is_offline(cpu);
@@ -1323,16 +1358,19 @@ static int trc_inspect_reader(struct task_struct *t, void *arg)
 	// If not exiting a read-side critical section, mark as checked
 	// so that the grace-period kthread will remove it from the
 	// holdout list.
-	t->trc_reader_checked = nesting >= 0;
-	if (nesting <= 0)
+	if (nesting <= 0) {
+		if (!nesting)
+			rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
 		return nesting ? -EINVAL : 0;  // If in QS, done, otherwise try again later.
+	}
 
 	// The task is in a read-side critical section, so set up its
 	// state so that it will awaken the grace-period kthread upon exit
 	// from that critical section.
-	atomic_inc(&trc_n_readers_need_end); // One more to wait on.
-	WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs));
-	WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
+	if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED)) {
+		atomic_inc(&trc_n_readers_need_end); // One more to wait on.
+		trc_add_holdout(t, bhp);
+	}
 	return 0;
 }
 
@@ -1348,14 +1386,14 @@ static void trc_wait_for_one_reader(struct task_struct *t,
 
 	// The current task had better be in a quiescent state.
 	if (t == current) {
-		t->trc_reader_checked = true;
+		rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
 		WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
 		return;
 	}
 
 	// Attempt to nail down the task for inspection.
 	get_task_struct(t);
-	if (!task_call_func(t, trc_inspect_reader, NULL)) {
+	if (!task_call_func(t, trc_inspect_reader, bhp)) {
 		put_task_struct(t);
 		return;
 	}
@@ -1419,8 +1457,7 @@ static void rcu_tasks_trace_pertask(struct task_struct *t,
 	if (unlikely(t == NULL))
 		return;
 
-	WRITE_ONCE(t->trc_reader_special.b.need_qs, false);
-	WRITE_ONCE(t->trc_reader_checked, false);
+	rcu_st_need_qs(t, 0);
 	t->trc_ipi_to_cpu = -1;
 	trc_wait_for_one_reader(t, hop);
 }
@@ -1442,7 +1479,8 @@ static void rcu_tasks_trace_postscan(struct list_head *hop)
 	// Wait for late-stage exiting tasks to finish exiting.
 	// These might have passed the call to exit_tasks_rcu_finish().
 	synchronize_rcu();
-	// Any tasks that exit after this point will set ->trc_reader_checked.
+	// Any tasks that exit after this point will set
+	// TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs.
 }
 
 /* Communicate task state back to the RCU tasks trace stall warning request. */
@@ -1460,7 +1498,7 @@ static int trc_check_slow_task(struct task_struct *t, void *arg)
 		return false; // It is running, so decline to inspect it.
 	trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting);
 	trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu);
-	trc_rdrp->needqs = READ_ONCE(t->trc_reader_special.b.need_qs);
+	trc_rdrp->needqs = rcu_ld_need_qs(t);
 	return true;
 }
 
@@ -1514,12 +1552,12 @@ static void check_all_holdout_tasks_trace(struct list_head *hop,
 	list_for_each_entry_safe(t, g, hop, trc_holdout_list) {
 		// If safe and needed, try to check the current task.
 		if (READ_ONCE(t->trc_ipi_to_cpu) == -1 &&
-		    !READ_ONCE(t->trc_reader_checked))
+		    !(rcu_ld_need_qs(t) & TRC_NEED_QS_CHECKED))
 			trc_wait_for_one_reader(t, hop);
 
 		// If check succeeded, remove this task from the list.
 		if (smp_load_acquire(&t->trc_ipi_to_cpu) == -1 &&
-		    READ_ONCE(t->trc_reader_checked))
+		    rcu_ld_need_qs(t) == TRC_NEED_QS_CHECKED)
 			trc_del_holdout(t);
 		else if (needreport)
 			show_stalled_task_trace(t, firstreport);
@@ -1574,12 +1612,12 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
 		// Stall warning time, so make a list of the offenders.
 		rcu_read_lock();
 		for_each_process_thread(g, t)
-			if (READ_ONCE(t->trc_reader_special.b.need_qs))
+			if (rcu_ld_need_qs(t) & TRC_NEED_QS)
 				trc_add_holdout(t, &holdouts);
 		rcu_read_unlock();
 		firstreport = true;
 		list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) {
-			if (READ_ONCE(t->trc_reader_special.b.need_qs))
+			if (rcu_ld_need_qs(t) & TRC_NEED_QS)
 				show_stalled_task_trace(t, &firstreport);
 			trc_del_holdout(t); // Release task_struct reference.
 		}
@@ -1595,11 +1633,12 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
 /* Report any needed quiescent state for this exiting task. */
 static void exit_tasks_rcu_finish_trace(struct task_struct *t)
 {
-	WRITE_ONCE(t->trc_reader_checked, true);
+	rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
 	WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
-	WRITE_ONCE(t->trc_reader_nesting, 0);
-	if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)))
+	if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS))
 		rcu_read_unlock_trace_special(t);
+	else
+		WRITE_ONCE(t->trc_reader_nesting, 0);
 }
 
 /**
-- 
cgit 


From 434c9eefb959c36331a93617ea95df903469b99f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 16 May 2022 17:56:16 -0700
Subject: rcu-tasks: Add data structures for lightweight grace periods

This commit adds fields to task_struct and to rcu_tasks_percpu that will
be used to avoid the task-list scan for RCU Tasks Trace grace periods,
and also initializes these fields.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: KP Singh <kpsingh@kernel.org>
---
 include/linux/sched.h | 2 ++
 init/init_task.c      | 1 +
 kernel/fork.c         | 1 +
 kernel/rcu/tasks.h    | 4 ++++
 4 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e6eb5871593e..b88caf54e168 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -844,6 +844,8 @@ struct task_struct {
 	int				trc_ipi_to_cpu;
 	union rcu_special		trc_reader_special;
 	struct list_head		trc_holdout_list;
+	struct list_head		trc_blkd_node;
+	int				trc_blkd_cpu;
 #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
 
 	struct sched_info		sched_info;
diff --git a/init/init_task.c b/init/init_task.c
index 73cc8f03511a..ff6c4b9bfe6b 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -157,6 +157,7 @@ struct task_struct init_task
 	.trc_reader_nesting = 0,
 	.trc_reader_special.s = 0,
 	.trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list),
+	.trc_blkd_node = LIST_HEAD_INIT(init_task.trc_blkd_node),
 #endif
 #ifdef CONFIG_CPUSETS
 	.mems_allowed_seq = SEQCNT_SPINLOCK_ZERO(init_task.mems_allowed_seq,
diff --git a/kernel/fork.c b/kernel/fork.c
index 9d44f2d46c69..1950eb870244 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1814,6 +1814,7 @@ static inline void rcu_copy_process(struct task_struct *p)
 	p->trc_reader_nesting = 0;
 	p->trc_reader_special.s = 0;
 	INIT_LIST_HEAD(&p->trc_holdout_list);
+	INIT_LIST_HEAD(&p->trc_blkd_node);
 #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
 }
 
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 64eb4d7b142e..fd4508af055e 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -29,6 +29,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
  * @rtp_work: Work queue for invoking callbacks.
  * @rtp_irq_work: IRQ work queue for deferred wakeups.
  * @barrier_q_head: RCU callback for barrier operation.
+ * @rtp_blkd_tasks: List of tasks blocked as readers.
  * @cpu: CPU number corresponding to this entry.
  * @rtpp: Pointer to the rcu_tasks structure.
  */
@@ -40,6 +41,7 @@ struct rcu_tasks_percpu {
 	struct work_struct rtp_work;
 	struct irq_work rtp_irq_work;
 	struct rcu_head barrier_q_head;
+	struct list_head rtp_blkd_tasks;
 	int cpu;
 	struct rcu_tasks *rtpp;
 };
@@ -256,6 +258,8 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
 		INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq);
 		rtpcp->cpu = cpu;
 		rtpcp->rtpp = rtp;
+		if (!rtpcp->rtp_blkd_tasks.next)
+			INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
 		raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled.
 	}
 	raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
-- 
cgit 


From 0356d4e66214569de674ab2684f2e0b440a466ab Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 17 May 2022 11:30:32 -0700
Subject: rcu-tasks: Track blocked RCU Tasks Trace readers

This commit places any task that has ever blocked within its current
RCU Tasks Trace read-side critical section on a per-CPU list within the
rcu_tasks_percpu structure.  Tasks are removed from this list when they
exit by the exit_tasks_rcu_finish_trace() function.  The purpose of this
commit is to provide the information needed to eliminate the current
scan of the full task list.

This commit offsets the INT_MIN value for ->trc_reader_nesting with the
new nesting level in order to avoid queueing tasks that are exiting
their read-side critical sections.

[ paulmck: Apply kernel test robot feedback. ]
[ paulmck: Apply feedback from syzbot+9bb26e7c5e8e4fa7e641@syzkaller.appspotmail.com ]

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Tested-by: syzbot <syzbot+9bb26e7c5e8e4fa7e641@syzkaller.appspotmail.com>
Tested-by: "Zhang, Qiang1" <qiang1.zhang@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: KP Singh <kpsingh@kernel.org>
---
 include/linux/rcupdate.h       | 11 +++++++++--
 include/linux/rcupdate_trace.h |  2 +-
 kernel/rcu/tasks.h             | 22 +++++++++++++++++++++-
 3 files changed, 31 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 1e728d544fc1..ebdfeead44e5 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -174,12 +174,19 @@ void synchronize_rcu_tasks(void);
 #define TRC_NEED_QS_CHECKED	0x2  // Task has been checked for needing quiescent state.
 
 u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new);
+void rcu_tasks_trace_qs_blkd(struct task_struct *t);
 
 # define rcu_tasks_trace_qs(t)							\
 	do {									\
+		int ___rttq_nesting = READ_ONCE((t)->trc_reader_nesting);	\
+										\
 		if (likely(!READ_ONCE((t)->trc_reader_special.b.need_qs)) &&	\
-		    likely(!READ_ONCE((t)->trc_reader_nesting)))		\
+		    likely(!___rttq_nesting)) {					\
 			rcu_trc_cmpxchg_need_qs((t), 0,	TRC_NEED_QS_CHECKED);	\
+		} else if (___rttq_nesting && ___rttq_nesting != INT_MIN &&	\
+			   !READ_ONCE((t)->trc_reader_special.b.blocked)) {	\
+			rcu_tasks_trace_qs_blkd(t);				\
+		}								\
 	} while (0)
 # else
 # define rcu_tasks_trace_qs(t) do { } while (0)
@@ -188,7 +195,7 @@ u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new);
 #define rcu_tasks_qs(t, preempt)					\
 do {									\
 	rcu_tasks_classic_qs((t), (preempt));				\
-	rcu_tasks_trace_qs((t));					\
+	rcu_tasks_trace_qs(t);						\
 } while (0)
 
 # ifdef CONFIG_TASKS_RUDE_RCU
diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h
index 6f9c35817398..9bc8cbb33340 100644
--- a/include/linux/rcupdate_trace.h
+++ b/include/linux/rcupdate_trace.h
@@ -75,7 +75,7 @@ static inline void rcu_read_unlock_trace(void)
 	nesting = READ_ONCE(t->trc_reader_nesting) - 1;
 	barrier(); // Critical section before disabling.
 	// Disable IPI-based setting of .need_qs.
-	WRITE_ONCE(t->trc_reader_nesting, INT_MIN);
+	WRITE_ONCE(t->trc_reader_nesting, INT_MIN + nesting);
 	if (likely(!READ_ONCE(t->trc_reader_special.s)) || nesting) {
 		WRITE_ONCE(t->trc_reader_nesting, nesting);
 		return;  // We assume shallow reader nesting.
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index fd4508af055e..bab75ec26bdb 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -1261,6 +1261,24 @@ void rcu_read_unlock_trace_special(struct task_struct *t)
 }
 EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special);
 
+/* Add a newly blocked reader task to its CPU's list. */
+void rcu_tasks_trace_qs_blkd(struct task_struct *t)
+{
+	unsigned long flags;
+	struct rcu_tasks_percpu *rtpcp;
+
+	local_irq_save(flags);
+	rtpcp = this_cpu_ptr(rcu_tasks_trace.rtpcpu);
+	raw_spin_lock_rcu_node(rtpcp); // irqs already disabled
+	t->trc_blkd_cpu = smp_processor_id();
+	if (!rtpcp->rtp_blkd_tasks.next)
+		INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
+	list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks);
+	t->trc_reader_special.b.blocked = true;
+	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_trace_qs_blkd);
+
 /* Add a task to the holdout list, if it is not already on the list. */
 static void trc_add_holdout(struct task_struct *t, struct list_head *bhp)
 {
@@ -1586,9 +1604,11 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
 /* Report any needed quiescent state for this exiting task. */
 static void exit_tasks_rcu_finish_trace(struct task_struct *t)
 {
+	union rcu_special trs = READ_ONCE(t->trc_reader_special);
+
 	rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
 	WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
-	if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS))
+	if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS) || trs.b.blocked)
 		rcu_read_unlock_trace_special(t);
 	else
 		WRITE_ONCE(t->trc_reader_nesting, 0);
-- 
cgit 


From 5f8a62af527ad8a615d68889d816b72c3f5f227f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 8 Jun 2022 16:40:18 +0200
Subject: context_tracking: Remove unused context_tracking_in_user()

This function is not used and CT_WARN_ON() coupled with ct_state() is
the preferred way to assert context tracking state values.

Reported-by: Nicolas Saenz Julienne <nsaenz@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Nicolas Saenz Julienne <nsaenz@kernel.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Cc: Yu Liao <liaoyu15@huawei.com>
Cc: Phil Auld <pauld@redhat.com>
Cc: Paul Gortmaker<paul.gortmaker@windriver.com>
Cc: Alex Belits <abelits@marvell.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/context_tracking_state.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index ae1e63e26947..edc7b46376a6 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -41,12 +41,7 @@ static inline bool context_tracking_enabled_this_cpu(void)
 	return context_tracking_enabled() && __this_cpu_read(context_tracking.active);
 }
 
-static __always_inline bool context_tracking_in_user(void)
-{
-	return __this_cpu_read(context_tracking.state) == CONTEXT_USER;
-}
 #else
-static __always_inline bool context_tracking_in_user(void) { return false; }
 static __always_inline bool context_tracking_enabled(void) { return false; }
 static __always_inline bool context_tracking_enabled_cpu(int cpu) { return false; }
 static __always_inline bool context_tracking_enabled_this_cpu(void) { return false; }
-- 
cgit 


From 2f6fe93fede802acfe010752db461ccd34745745 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 13 Jun 2022 20:10:03 +0200
Subject: ACPI: glue: Introduce acpi_find_child_by_adr()

Rearrange the ACPI device lookup code used internally by
acpi_find_child_device() so it can avoid extra checks after finding
one object with a matching _ADR and use it for defining
acpi_find_child_by_adr() that will allow the callers to find a given
ACPI device's child matching a given bus address without doing any
other checks in check_one_child().

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/acpi/glue.c     | 28 ++++++++++++++++++++++++----
 include/acpi/acpi_bus.h |  2 ++
 2 files changed, 26 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/glue.c b/drivers/acpi/glue.c
index 5203a7c60daa..204fe94c7e45 100644
--- a/drivers/acpi/glue.c
+++ b/drivers/acpi/glue.c
@@ -119,6 +119,7 @@ struct find_child_walk_data {
 	struct acpi_device *adev;
 	u64 address;
 	int score;
+	bool check_sta;
 	bool check_children;
 };
 
@@ -131,9 +132,13 @@ static int check_one_child(struct acpi_device *adev, void *data)
 		return 0;
 
 	if (!wd->adev) {
-		/* This is the first matching object.  Save it and continue. */
+		/*
+		 * This is the first matching object, so save it.  If it is not
+		 * necessary to look for any other matching objects, stop the
+		 * search.
+		 */
 		wd->adev = adev;
-		return 0;
+		return !(wd->check_sta || wd->check_children);
 	}
 
 	/*
@@ -169,12 +174,14 @@ static int check_one_child(struct acpi_device *adev, void *data)
 	return 0;
 }
 
-struct acpi_device *acpi_find_child_device(struct acpi_device *parent,
-					   u64 address, bool check_children)
+static struct acpi_device *acpi_find_child(struct acpi_device *parent,
+					   u64 address, bool check_children,
+					   bool check_sta)
 {
 	struct find_child_walk_data wd = {
 		.address = address,
 		.check_children = check_children,
+		.check_sta = check_sta,
 		.adev = NULL,
 		.score = 0,
 	};
@@ -184,8 +191,21 @@ struct acpi_device *acpi_find_child_device(struct acpi_device *parent,
 
 	return wd.adev;
 }
+
+struct acpi_device *acpi_find_child_device(struct acpi_device *parent,
+					   u64 address, bool check_children)
+{
+	return acpi_find_child(parent, address, check_children, true);
+}
 EXPORT_SYMBOL_GPL(acpi_find_child_device);
 
+struct acpi_device *acpi_find_child_by_adr(struct acpi_device *adev,
+					   acpi_bus_address adr)
+{
+	return acpi_find_child(adev, adr, false, false);
+}
+EXPORT_SYMBOL_GPL(acpi_find_child_by_adr);
+
 static void acpi_physnode_link_name(char *buf, unsigned int node_id)
 {
 	if (node_id > 0)
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 0dc1ea0b52f5..597961969ca1 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -622,6 +622,8 @@ static inline int acpi_dma_configure(struct device *dev,
 }
 struct acpi_device *acpi_find_child_device(struct acpi_device *parent,
 					   u64 address, bool check_children);
+struct acpi_device *acpi_find_child_by_adr(struct acpi_device *adev,
+					   acpi_bus_address adr);
 int acpi_is_root_bridge(acpi_handle);
 struct acpi_pci_root *acpi_pci_find_root(acpi_handle handle);
 
-- 
cgit 


From ff32e59947c87b01dc25a8b5763d609c1a8f56eb Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 13 Jun 2022 20:26:47 +0200
Subject: ACPI: bus: Introduce acpi_dev_for_each_child_reverse()

Make it possible to walk the children of an ACPI device in the revese
order by defining acpi_dev_for_each_child_reverse() in analogy with
acpi_dev_for_each_child().

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/acpi/bus.c      | 12 ++++++++++++
 include/acpi/acpi_bus.h |  3 +++
 2 files changed, 15 insertions(+)

(limited to 'include')

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 7027ff2edfba..4d7c51a33b01 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -1115,6 +1115,18 @@ int acpi_dev_for_each_child(struct acpi_device *adev,
 	return device_for_each_child(&adev->dev, &adwc, acpi_dev_for_one_check);
 }
 
+int acpi_dev_for_each_child_reverse(struct acpi_device *adev,
+				    int (*fn)(struct acpi_device *, void *),
+				    void *data)
+{
+	struct acpi_dev_walk_context adwc = {
+		.fn = fn,
+		.data = data,
+	};
+
+	return device_for_each_child_reverse(&adev->dev, &adwc, acpi_dev_for_one_check);
+}
+
 /* --------------------------------------------------------------------------
                              Initialization/Cleanup
    -------------------------------------------------------------------------- */
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 597961969ca1..76de1aa5d221 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -483,6 +483,9 @@ extern struct bus_type acpi_bus_type;
 int acpi_bus_for_each_dev(int (*fn)(struct device *, void *), void *data);
 int acpi_dev_for_each_child(struct acpi_device *adev,
 			    int (*fn)(struct acpi_device *, void *), void *data);
+int acpi_dev_for_each_child_reverse(struct acpi_device *adev,
+				    int (*fn)(struct acpi_device *, void *),
+				    void *data);
 
 /*
  * Events
-- 
cgit 


From 1ade23711971b0eececf0d7fedc29d3c1d2fce01 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 21 Jun 2022 02:53:42 +0300
Subject: bpf: Inline calls to bpf_loop when callback is known

Calls to `bpf_loop` are replaced with direct loops to avoid
indirection. E.g. the following:

  bpf_loop(10, foo, NULL, 0);

Is replaced by equivalent of the following:

  for (int i = 0; i < 10; ++i)
    foo(i, NULL);

This transformation could be applied when:
- callback is known and does not change during program execution;
- flags passed to `bpf_loop` are always zero.

Inlining logic works as follows:

- During execution simulation function `update_loop_inline_state`
  tracks the following information for each `bpf_loop` call
  instruction:
  - is callback known and constant?
  - are flags constant and zero?
- Function `optimize_bpf_loop` increases stack depth for functions
  where `bpf_loop` calls can be inlined and invokes `inline_bpf_loop`
  to apply the inlining. The additional stack space is used to spill
  registers R6, R7 and R8. These registers are used as loop counter,
  loop maximal bound and callback context parameter;

Measurements using `benchs/run_bench_bpf_loop.sh` inside QEMU / KVM on
i7-4710HQ CPU show a drop in latency from 14 ns/op to 2 ns/op.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/r/20220620235344.569325-4-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          |   3 +
 include/linux/bpf_verifier.h |  12 +++
 kernel/bpf/bpf_iter.c        |   9 ++-
 kernel/bpf/verifier.c        | 180 +++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 195 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0edd7d2c0064..d05e1495a06e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1286,6 +1286,9 @@ struct bpf_array {
 #define BPF_COMPLEXITY_LIMIT_INSNS      1000000 /* yes. 1M insns */
 #define MAX_TAIL_CALL_CNT 33
 
+/* Maximum number of loops for bpf_loop */
+#define BPF_MAX_LOOPS	BIT(23)
+
 #define BPF_F_ACCESS_MASK	(BPF_F_RDONLY |		\
 				 BPF_F_RDONLY_PROG |	\
 				 BPF_F_WRONLY |		\
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 3930c963fa67..81b19669efba 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -344,6 +344,14 @@ struct bpf_verifier_state_list {
 	int miss_cnt, hit_cnt;
 };
 
+struct bpf_loop_inline_state {
+	int initialized:1; /* set to true upon first entry */
+	int fit_for_inline:1; /* true if callback function is the same
+			       * at each call and flags are always zero
+			       */
+	u32 callback_subprogno; /* valid when fit_for_inline is true */
+};
+
 /* Possible states for alu_state member. */
 #define BPF_ALU_SANITIZE_SRC		(1U << 0)
 #define BPF_ALU_SANITIZE_DST		(1U << 1)
@@ -373,6 +381,10 @@ struct bpf_insn_aux_data {
 				u32 mem_size;	/* mem_size for non-struct typed var */
 			};
 		} btf_var;
+		/* if instruction is a call to bpf_loop this field tracks
+		 * the state of the relevant registers to make decision about inlining
+		 */
+		struct bpf_loop_inline_state loop_inline_state;
 	};
 	u64 map_key_state; /* constant (32 bit) key tracking for maps */
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index d5d96ceca105..7e8fd49406f6 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -723,9 +723,6 @@ const struct bpf_func_proto bpf_for_each_map_elem_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
-/* maximum number of loops */
-#define MAX_LOOPS	BIT(23)
-
 BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
 	   u64, flags)
 {
@@ -733,9 +730,13 @@ BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
 	u64 ret;
 	u32 i;
 
+	/* Note: these safety checks are also verified when bpf_loop
+	 * is inlined, be careful to modify this code in sync. See
+	 * function verifier.c:inline_bpf_loop.
+	 */
 	if (flags)
 		return -EINVAL;
-	if (nr_loops > MAX_LOOPS)
+	if (nr_loops > BPF_MAX_LOOPS)
 		return -E2BIG;
 
 	for (i = 0; i < nr_loops; i++) {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2859901ffbe3..bf72dc511df6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7124,6 +7124,41 @@ static int check_get_func_ip(struct bpf_verifier_env *env)
 	return -ENOTSUPP;
 }
 
+static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
+{
+	return &env->insn_aux_data[env->insn_idx];
+}
+
+static bool loop_flag_is_zero(struct bpf_verifier_env *env)
+{
+	struct bpf_reg_state *regs = cur_regs(env);
+	struct bpf_reg_state *reg = &regs[BPF_REG_4];
+	bool reg_is_null = register_is_null(reg);
+
+	if (reg_is_null)
+		mark_chain_precision(env, BPF_REG_4);
+
+	return reg_is_null;
+}
+
+static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
+{
+	struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
+
+	if (!state->initialized) {
+		state->initialized = 1;
+		state->fit_for_inline = loop_flag_is_zero(env);
+		state->callback_subprogno = subprogno;
+		return;
+	}
+
+	if (!state->fit_for_inline)
+		return;
+
+	state->fit_for_inline = (loop_flag_is_zero(env) &&
+				 state->callback_subprogno == subprogno);
+}
+
 static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			     int *insn_idx_p)
 {
@@ -7276,6 +7311,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		err = check_bpf_snprintf_call(env, regs);
 		break;
 	case BPF_FUNC_loop:
+		update_loop_inline_state(env, meta.subprogno);
 		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
 					set_loop_callback_state);
 		break;
@@ -7682,11 +7718,6 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
 	return true;
 }
 
-static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
-{
-	return &env->insn_aux_data[env->insn_idx];
-}
-
 enum {
 	REASON_BOUNDS	= -1,
 	REASON_TYPE	= -2,
@@ -14315,6 +14346,142 @@ patch_call_imm:
 	return 0;
 }
 
+static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
+					int position,
+					s32 stack_base,
+					u32 callback_subprogno,
+					u32 *cnt)
+{
+	s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
+	s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
+	s32 r8_offset = stack_base + 2 * BPF_REG_SIZE;
+	int reg_loop_max = BPF_REG_6;
+	int reg_loop_cnt = BPF_REG_7;
+	int reg_loop_ctx = BPF_REG_8;
+
+	struct bpf_prog *new_prog;
+	u32 callback_start;
+	u32 call_insn_offset;
+	s32 callback_offset;
+
+	/* This represents an inlined version of bpf_iter.c:bpf_loop,
+	 * be careful to modify this code in sync.
+	 */
+	struct bpf_insn insn_buf[] = {
+		/* Return error and jump to the end of the patch if
+		 * expected number of iterations is too big.
+		 */
+		BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2),
+		BPF_MOV32_IMM(BPF_REG_0, -E2BIG),
+		BPF_JMP_IMM(BPF_JA, 0, 0, 16),
+		/* spill R6, R7, R8 to use these as loop vars */
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset),
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset),
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset),
+		/* initialize loop vars */
+		BPF_MOV64_REG(reg_loop_max, BPF_REG_1),
+		BPF_MOV32_IMM(reg_loop_cnt, 0),
+		BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3),
+		/* loop header,
+		 * if reg_loop_cnt >= reg_loop_max skip the loop body
+		 */
+		BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5),
+		/* callback call,
+		 * correct callback offset would be set after patching
+		 */
+		BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt),
+		BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx),
+		BPF_CALL_REL(0),
+		/* increment loop counter */
+		BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1),
+		/* jump to loop header if callback returned 0 */
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6),
+		/* return value of bpf_loop,
+		 * set R0 to the number of iterations
+		 */
+		BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt),
+		/* restore original values of R6, R7, R8 */
+		BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset),
+	};
+
+	*cnt = ARRAY_SIZE(insn_buf);
+	new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt);
+	if (!new_prog)
+		return new_prog;
+
+	/* callback start is known only after patching */
+	callback_start = env->subprog_info[callback_subprogno].start;
+	/* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */
+	call_insn_offset = position + 12;
+	callback_offset = callback_start - call_insn_offset - 1;
+	env->prog->insnsi[call_insn_offset].imm = callback_offset;
+
+	return new_prog;
+}
+
+static bool is_bpf_loop_call(struct bpf_insn *insn)
+{
+	return insn->code == (BPF_JMP | BPF_CALL) &&
+		insn->src_reg == 0 &&
+		insn->imm == BPF_FUNC_loop;
+}
+
+/* For all sub-programs in the program (including main) check
+ * insn_aux_data to see if there are bpf_loop calls that require
+ * inlining. If such calls are found the calls are replaced with a
+ * sequence of instructions produced by `inline_bpf_loop` function and
+ * subprog stack_depth is increased by the size of 3 registers.
+ * This stack space is used to spill values of the R6, R7, R8.  These
+ * registers are used to store the loop bound, counter and context
+ * variables.
+ */
+static int optimize_bpf_loop(struct bpf_verifier_env *env)
+{
+	struct bpf_subprog_info *subprogs = env->subprog_info;
+	int i, cur_subprog = 0, cnt, delta = 0;
+	struct bpf_insn *insn = env->prog->insnsi;
+	int insn_cnt = env->prog->len;
+	u16 stack_depth = subprogs[cur_subprog].stack_depth;
+	u16 stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
+	u16 stack_depth_extra = 0;
+
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		struct bpf_loop_inline_state *inline_state =
+			&env->insn_aux_data[i + delta].loop_inline_state;
+
+		if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) {
+			struct bpf_prog *new_prog;
+
+			stack_depth_extra = BPF_REG_SIZE * 3 + stack_depth_roundup;
+			new_prog = inline_bpf_loop(env,
+						   i + delta,
+						   -(stack_depth + stack_depth_extra),
+						   inline_state->callback_subprogno,
+						   &cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta     += cnt - 1;
+			env->prog  = new_prog;
+			insn       = new_prog->insnsi + i + delta;
+		}
+
+		if (subprogs[cur_subprog + 1].start == i + delta + 1) {
+			subprogs[cur_subprog].stack_depth += stack_depth_extra;
+			cur_subprog++;
+			stack_depth = subprogs[cur_subprog].stack_depth;
+			stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
+			stack_depth_extra = 0;
+		}
+	}
+
+	env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
+
+	return 0;
+}
+
 static void free_states(struct bpf_verifier_env *env)
 {
 	struct bpf_verifier_state_list *sl, *sln;
@@ -15052,6 +15219,9 @@ skip_full_check:
 		ret = check_max_stack_depth(env);
 
 	/* instruction rewrites happen after this point */
+	if (ret == 0)
+		ret = optimize_bpf_loop(env);
+
 	if (is_priv) {
 		if (ret == 0)
 			opt_hard_wire_dead_code_branches(env);
-- 
cgit 


From f9aefd6b2aa38b9787d2705f0f1161dfd23cdb8f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 20 Jun 2022 02:30:17 -0700
Subject: net: warn if mac header was not set

Make sure skb_mac_header(), skb_mac_offset() and skb_mac_header_len() uses
are not fooled if the mac header has not been set.

These checks are enabled if CONFIG_DEBUG_NET=y

This commit will likely expose existing bugs in linux networking stacks.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20220620093017.3366713-1-eric.dumazet@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/skbuff.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 82edf0359ab3..cd4a8268894a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2763,8 +2763,14 @@ static inline void skb_set_network_header(struct sk_buff *skb, const int offset)
 	skb->network_header += offset;
 }
 
+static inline int skb_mac_header_was_set(const struct sk_buff *skb)
+{
+	return skb->mac_header != (typeof(skb->mac_header))~0U;
+}
+
 static inline unsigned char *skb_mac_header(const struct sk_buff *skb)
 {
+	DEBUG_NET_WARN_ON_ONCE(!skb_mac_header_was_set(skb));
 	return skb->head + skb->mac_header;
 }
 
@@ -2775,14 +2781,10 @@ static inline int skb_mac_offset(const struct sk_buff *skb)
 
 static inline u32 skb_mac_header_len(const struct sk_buff *skb)
 {
+	DEBUG_NET_WARN_ON_ONCE(!skb_mac_header_was_set(skb));
 	return skb->network_header - skb->mac_header;
 }
 
-static inline int skb_mac_header_was_set(const struct sk_buff *skb)
-{
-	return skb->mac_header != (typeof(skb->mac_header))~0U;
-}
-
 static inline void skb_unset_mac_header(struct sk_buff *skb)
 {
 	skb->mac_header = (typeof(skb->mac_header))~0U;
-- 
cgit 


From af185d8c76333daa877678e0166a7b45e63bf3c4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 20 Jun 2022 03:05:09 -0700
Subject: raw: complete rcu conversion

raw_diag_dump() can use rcu_read_lock() instead of read_lock()

Now the hashinfo lock is only used from process context,
in write mode only, we can convert it to a spinlock,
and we do not need to block BH anymore.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20220620100509.3493504-1-eric.dumazet@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/raw.h   | 4 ++--
 net/ipv4/raw.c      | 8 ++++----
 net/ipv4/raw_diag.c | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/raw.h b/include/net/raw.h
index d81eeeb8f1e6..d224376360e1 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -32,7 +32,7 @@ int raw_rcv(struct sock *, struct sk_buff *);
 #define RAW_HTABLE_SIZE	MAX_INET_PROTOS
 
 struct raw_hashinfo {
-	rwlock_t lock;
+	spinlock_t lock;
 	struct hlist_nulls_head ht[RAW_HTABLE_SIZE];
 };
 
@@ -40,7 +40,7 @@ static inline void raw_hashinfo_init(struct raw_hashinfo *hashinfo)
 {
 	int i;
 
-	rwlock_init(&hashinfo->lock);
+	spin_lock_init(&hashinfo->lock);
 	for (i = 0; i < RAW_HTABLE_SIZE; i++)
 		INIT_HLIST_NULLS_HEAD(&hashinfo->ht[i], i);
 }
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 959bea12dc48..027389969915 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -95,10 +95,10 @@ int raw_hash_sk(struct sock *sk)
 
 	hlist = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)];
 
-	write_lock_bh(&h->lock);
+	spin_lock(&h->lock);
 	__sk_nulls_add_node_rcu(sk, hlist);
 	sock_set_flag(sk, SOCK_RCU_FREE);
-	write_unlock_bh(&h->lock);
+	spin_unlock(&h->lock);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 
 	return 0;
@@ -109,10 +109,10 @@ void raw_unhash_sk(struct sock *sk)
 {
 	struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
 
-	write_lock_bh(&h->lock);
+	spin_lock(&h->lock);
 	if (__sk_nulls_del_node_init_rcu(sk))
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-	write_unlock_bh(&h->lock);
+	spin_unlock(&h->lock);
 }
 EXPORT_SYMBOL_GPL(raw_unhash_sk);
 
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
index ac4b6525d3c6..999321834b94 100644
--- a/net/ipv4/raw_diag.c
+++ b/net/ipv4/raw_diag.c
@@ -156,7 +156,7 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
 	s_slot = cb->args[0];
 	num = s_num = cb->args[1];
 
-	read_lock(&hashinfo->lock);
+	rcu_read_lock();
 	for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) {
 		num = 0;
 
@@ -184,7 +184,7 @@ next:
 	}
 
 out_unlock:
-	read_unlock(&hashinfo->lock);
+	rcu_read_unlock();
 
 	cb->args[0] = slot;
 	cb->args[1] = num;
-- 
cgit 


From 6b183919f7051294dc5fc331bb608d5d7f29f5da Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Tue, 21 Jun 2022 11:20:41 +0100
Subject: ASoC: core: Add new SOC_DOUBLE_SX_TLV macro

Currently macros only exist for SX style (implicit sign bit 2's
compliment) volume controls where the volumes for left and right
are in separate registers. Some future Cirrus devices will have
both volumes in the same register, as such add a new macro to
support this.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20220621102041.1713504-4-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/sound/soc.h b/include/sound/soc.h
index 8909cc7d311e..76ee3c2b8b56 100644
--- a/include/sound/soc.h
+++ b/include/sound/soc.h
@@ -136,6 +136,18 @@
 	.put = snd_soc_put_volsw, \
 	.private_value = SOC_DOUBLE_VALUE(reg, shift_left, shift_right, \
 					  max, invert, 0) }
+#define SOC_DOUBLE_SX_TLV(xname, xreg, shift_left, shift_right, xmin, xmax, tlv_array) \
+{       .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = (xname), \
+	.access = SNDRV_CTL_ELEM_ACCESS_TLV_READ | \
+	SNDRV_CTL_ELEM_ACCESS_READWRITE, \
+	.tlv.p  = (tlv_array), \
+	.info = snd_soc_info_volsw_sx, \
+	.get = snd_soc_get_volsw_sx, \
+	.put = snd_soc_put_volsw_sx, \
+	.private_value = (unsigned long)&(struct soc_mixer_control) \
+		{.reg = xreg, .rreg = xreg, \
+		.shift = shift_left, .rshift = shift_right, \
+		.max = xmax, .min = xmin} }
 #define SOC_DOUBLE_R_TLV(xname, reg_left, reg_right, xshift, xmax, xinvert, tlv_array) \
 {	.iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = (xname),\
 	.access = SNDRV_CTL_ELEM_ACCESS_TLV_READ |\
-- 
cgit 


From a37599ebfb656c2af4ca119de556eba29b6926d6 Mon Sep 17 00:00:00 2001
From: Prashant Malani <pmalani@chromium.org>
Date: Wed, 15 Jun 2022 17:20:18 +0000
Subject: usb: typec: mux: Add CONFIG guards for functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are some drivers that can use the Type C mux API, but don't have
to. Introduce CONFIG guards for the mux functions so that drivers can
include the header file and not run into compilation errors on systems
which don't have CONFIG_TYPEC enabled. When CONFIG_TYPEC is not enabled,
the Type C mux functions will be stub versions of the original calls.

Reported-by: kernel test robot <lkp@intel.com>
Reviewed-by: Nícolas F. R. A. Prado <nfraprado@collabora.com>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Tested-by: Nícolas F. R. A. Prado <nfraprado@collabora.com>
Signed-off-by: Prashant Malani <pmalani@chromium.org>
Link: https://lore.kernel.org/r/20220615172129.1314056-3-pmalani@chromium.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/typec_mux.h | 44 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 38 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/usb/typec_mux.h b/include/linux/usb/typec_mux.h
index ee57781dcf28..9292f0e07846 100644
--- a/include/linux/usb/typec_mux.h
+++ b/include/linux/usb/typec_mux.h
@@ -58,17 +58,13 @@ struct typec_mux_desc {
 	void *drvdata;
 };
 
+#if IS_ENABLED(CONFIG_TYPEC)
+
 struct typec_mux *fwnode_typec_mux_get(struct fwnode_handle *fwnode,
 				       const struct typec_altmode_desc *desc);
 void typec_mux_put(struct typec_mux *mux);
 int typec_mux_set(struct typec_mux *mux, struct typec_mux_state *state);
 
-static inline struct typec_mux *
-typec_mux_get(struct device *dev, const struct typec_altmode_desc *desc)
-{
-	return fwnode_typec_mux_get(dev_fwnode(dev), desc);
-}
-
 struct typec_mux_dev *
 typec_mux_register(struct device *parent, const struct typec_mux_desc *desc);
 void typec_mux_unregister(struct typec_mux_dev *mux);
@@ -76,4 +72,40 @@ void typec_mux_unregister(struct typec_mux_dev *mux);
 void typec_mux_set_drvdata(struct typec_mux_dev *mux, void *data);
 void *typec_mux_get_drvdata(struct typec_mux_dev *mux);
 
+#else
+
+static inline struct typec_mux *fwnode_typec_mux_get(struct fwnode_handle *fwnode,
+				       const struct typec_altmode_desc *desc)
+{
+	return NULL;
+}
+
+static inline void typec_mux_put(struct typec_mux *mux) {}
+
+static inline int typec_mux_set(struct typec_mux *mux, struct typec_mux_state *state)
+{
+	return 0;
+}
+
+static inline struct typec_mux_dev *
+typec_mux_register(struct device *parent, const struct typec_mux_desc *desc)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+static inline void typec_mux_unregister(struct typec_mux_dev *mux) {}
+
+static inline void typec_mux_set_drvdata(struct typec_mux_dev *mux, void *data) {}
+static inline void *typec_mux_get_drvdata(struct typec_mux_dev *mux)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+#endif /* CONFIG_TYPEC */
+
+static inline struct typec_mux *
+typec_mux_get(struct device *dev, const struct typec_altmode_desc *desc)
+{
+	return fwnode_typec_mux_get(dev_fwnode(dev), desc);
+}
+
 #endif /* __USB_TYPEC_MUX */
-- 
cgit 


From 95acd8817e66d031d2e6ee7def3f1e1874819317 Mon Sep 17 00:00:00 2001
From: Tony Ambardar <tony.ambardar@gmail.com>
Date: Fri, 17 Jun 2022 12:57:34 +0200
Subject: bpf, x64: Add predicate for bpf2bpf with tailcalls support in JIT

The BPF core/verifier is hard-coded to permit mixing bpf2bpf and tail
calls for only x86-64. Change the logic to instead rely on a new weak
function 'bool bpf_jit_supports_subprog_tailcalls(void)', which a capable
JIT backend can override.

Update the x86-64 eBPF JIT to reflect this.

Signed-off-by: Tony Ambardar <Tony.Ambardar@gmail.com>
[jakub: drop MIPS bits and tweak patch subject]
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220617105735.733938-2-jakub@cloudflare.com
---
 arch/x86/net/bpf_jit_comp.c | 6 ++++++
 include/linux/filter.h      | 1 +
 kernel/bpf/core.c           | 6 ++++++
 kernel/bpf/verifier.c       | 3 ++-
 4 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index f298b18a9a3d..2c51ca9f7cec 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -2491,3 +2491,9 @@ void *bpf_arch_text_copy(void *dst, void *src, size_t len)
 		return ERR_PTR(-EINVAL);
 	return dst;
 }
+
+/* Indicate the JIT backend supports mixing bpf2bpf and tailcalls. */
+bool bpf_jit_supports_subprog_tailcalls(void)
+{
+	return true;
+}
diff --git a/include/linux/filter.h b/include/linux/filter.h
index d0cbb31b1b4d..4c1a8b247545 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -914,6 +914,7 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
 void bpf_jit_compile(struct bpf_prog *prog);
 bool bpf_jit_needs_zext(void);
+bool bpf_jit_supports_subprog_tailcalls(void);
 bool bpf_jit_supports_kfunc_call(void);
 bool bpf_helper_changes_pkt_data(void *func);
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b5ffebcce6cc..f023cb399e3f 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2729,6 +2729,12 @@ bool __weak bpf_jit_needs_zext(void)
 	return false;
 }
 
+/* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */
+bool __weak bpf_jit_supports_subprog_tailcalls(void)
+{
+	return false;
+}
+
 bool __weak bpf_jit_supports_kfunc_call(void)
 {
 	return false;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bf72dc511df6..a20d7736a5b2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6154,7 +6154,8 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id)
 
 static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env)
 {
-	return env->prog->jit_requested && IS_ENABLED(CONFIG_X86_64);
+	return env->prog->jit_requested &&
+	       bpf_jit_supports_subprog_tailcalls();
 }
 
 static int check_map_func_compatibility(struct bpf_verifier_env *env,
-- 
cgit 


From 77515ebaf01920e2db49e04672ef669a7c2907f2 Mon Sep 17 00:00:00 2001
From: Duoming Zhou <duoming@zju.edu.cn>
Date: Tue, 7 Jun 2022 11:26:25 +0800
Subject: devcoredump: remove the useless gfp_t parameter in dev_coredumpv and
 dev_coredumpm

The dev_coredumpv() and dev_coredumpm() could not be used in atomic
context, because they call kvasprintf_const() and kstrdup() with
GFP_KERNEL parameter. The process is shown below:

dev_coredumpv(.., gfp_t gfp)
  dev_coredumpm(.., gfp_t gfp)
    dev_set_name
      kobject_set_name_vargs
        kvasprintf_const(GFP_KERNEL, ...); //may sleep
          kstrdup(s, GFP_KERNEL); //may sleep

This patch removes gfp_t parameter of dev_coredumpv() and dev_coredumpm()
and changes the gfp_t parameter of kzalloc() in dev_coredumpm() to
GFP_KERNEL in order to show they could not be used in atomic context.

Fixes: 833c95456a70 ("device coredump: add new device coredump class")
Reviewed-by: Brian Norris <briannorris@chromium.org>
Reviewed-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Duoming Zhou <duoming@zju.edu.cn>
Link: https://lore.kernel.org/r/df72af3b1862bac7d8e793d1f3931857d3779dfd.1654569290.git.duoming@zju.edu.cn
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/devcoredump.c                               | 16 ++++++----------
 drivers/bluetooth/btmrvl_sdio.c                          |  2 +-
 drivers/bluetooth/hci_qca.c                              |  2 +-
 drivers/gpu/drm/etnaviv/etnaviv_dump.c                   |  2 +-
 drivers/gpu/drm/msm/disp/msm_disp_snapshot.c             |  4 ++--
 drivers/gpu/drm/msm/msm_gpu.c                            |  4 ++--
 drivers/media/platform/qcom/venus/core.c                 |  2 +-
 drivers/net/can/spi/mcp251xfd/mcp251xfd-dump.c           |  2 +-
 drivers/net/wireless/ath/ath10k/coredump.c               |  2 +-
 drivers/net/wireless/ath/wil6210/wil_crash_dump.c        |  2 +-
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/debug.c |  2 +-
 drivers/net/wireless/intel/iwlwifi/fw/dbg.c              |  6 ++----
 drivers/net/wireless/marvell/mwifiex/main.c              |  3 +--
 drivers/net/wireless/mediatek/mt76/mt7615/mac.c          |  3 +--
 drivers/net/wireless/mediatek/mt76/mt7921/mac.c          |  3 +--
 drivers/net/wireless/realtek/rtw88/main.c                |  2 +-
 drivers/net/wireless/realtek/rtw89/ser.c                 |  2 +-
 drivers/remoteproc/qcom_q6v5_mss.c                       |  2 +-
 drivers/remoteproc/remoteproc_coredump.c                 |  8 ++++----
 include/drm/drm_print.h                                  |  2 +-
 include/linux/devcoredump.h                              | 13 ++++++-------
 sound/soc/intel/avs/apl.c                                |  2 +-
 sound/soc/intel/avs/skl.c                                |  2 +-
 sound/soc/intel/catpt/dsp.c                              |  2 +-
 24 files changed, 40 insertions(+), 50 deletions(-)

(limited to 'include')

diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c
index f4d794d6bb85..8535f0bd5dfb 100644
--- a/drivers/base/devcoredump.c
+++ b/drivers/base/devcoredump.c
@@ -173,15 +173,13 @@ static void devcd_freev(void *data)
  * @dev: the struct device for the crashed device
  * @data: vmalloc data containing the device coredump
  * @datalen: length of the data
- * @gfp: allocation flags
  *
  * This function takes ownership of the vmalloc'ed data and will free
  * it when it is no longer used. See dev_coredumpm() for more information.
  */
-void dev_coredumpv(struct device *dev, void *data, size_t datalen,
-		   gfp_t gfp)
+void dev_coredumpv(struct device *dev, void *data, size_t datalen)
 {
-	dev_coredumpm(dev, NULL, data, datalen, gfp, devcd_readv, devcd_freev);
+	dev_coredumpm(dev, NULL, data, datalen, devcd_readv, devcd_freev);
 }
 EXPORT_SYMBOL_GPL(dev_coredumpv);
 
@@ -236,7 +234,6 @@ static ssize_t devcd_read_from_sgtable(char *buffer, loff_t offset,
  * @owner: the module that contains the read/free functions, use %THIS_MODULE
  * @data: data cookie for the @read/@free functions
  * @datalen: length of the data
- * @gfp: allocation flags
  * @read: function to read from the given buffer
  * @free: function to free the given buffer
  *
@@ -246,7 +243,7 @@ static ssize_t devcd_read_from_sgtable(char *buffer, loff_t offset,
  * function will be called to free the data.
  */
 void dev_coredumpm(struct device *dev, struct module *owner,
-		   void *data, size_t datalen, gfp_t gfp,
+		   void *data, size_t datalen,
 		   ssize_t (*read)(char *buffer, loff_t offset, size_t count,
 				   void *data, size_t datalen),
 		   void (*free)(void *data))
@@ -268,7 +265,7 @@ void dev_coredumpm(struct device *dev, struct module *owner,
 	if (!try_module_get(owner))
 		goto free;
 
-	devcd = kzalloc(sizeof(*devcd), gfp);
+	devcd = kzalloc(sizeof(*devcd), GFP_KERNEL);
 	if (!devcd)
 		goto put_module;
 
@@ -318,7 +315,6 @@ EXPORT_SYMBOL_GPL(dev_coredumpm);
  * @dev: the struct device for the crashed device
  * @table: the dump data
  * @datalen: length of the data
- * @gfp: allocation flags
  *
  * Creates a new device coredump for the given device. If a previous one hasn't
  * been read yet, the new coredump is discarded. The data lifetime is determined
@@ -326,9 +322,9 @@ EXPORT_SYMBOL_GPL(dev_coredumpm);
  * it will free the data.
  */
 void dev_coredumpsg(struct device *dev, struct scatterlist *table,
-		    size_t datalen, gfp_t gfp)
+		    size_t datalen)
 {
-	dev_coredumpm(dev, NULL, table, datalen, gfp, devcd_read_from_sgtable,
+	dev_coredumpm(dev, NULL, table, datalen, devcd_read_from_sgtable,
 		      devcd_free_sgtable);
 }
 EXPORT_SYMBOL_GPL(dev_coredumpsg);
diff --git a/drivers/bluetooth/btmrvl_sdio.c b/drivers/bluetooth/btmrvl_sdio.c
index b8ef66f89fc1..9b9728719db2 100644
--- a/drivers/bluetooth/btmrvl_sdio.c
+++ b/drivers/bluetooth/btmrvl_sdio.c
@@ -1515,7 +1515,7 @@ done:
 	/* fw_dump_data will be free in device coredump release function
 	 * after 5 min
 	 */
-	dev_coredumpv(&card->func->dev, fw_dump_data, fw_dump_len, GFP_KERNEL);
+	dev_coredumpv(&card->func->dev, fw_dump_data, fw_dump_len);
 	BT_INFO("== btmrvl firmware dump to /sys/class/devcoredump end");
 }
 
diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c
index eab34e24d944..2e4074211ae9 100644
--- a/drivers/bluetooth/hci_qca.c
+++ b/drivers/bluetooth/hci_qca.c
@@ -1120,7 +1120,7 @@ static void qca_controller_memdump(struct work_struct *work)
 				    qca_memdump->ram_dump_size);
 			memdump_buf = qca_memdump->memdump_buf_head;
 			dev_coredumpv(&hu->serdev->dev, memdump_buf,
-				      qca_memdump->received_dump, GFP_KERNEL);
+				      qca_memdump->received_dump);
 			cancel_delayed_work(&qca->ctrl_memdump_timeout);
 			kfree(qca->qca_memdump);
 			qca->qca_memdump = NULL;
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_dump.c b/drivers/gpu/drm/etnaviv/etnaviv_dump.c
index f418e0b75772..519fcb234b3e 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_dump.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_dump.c
@@ -225,5 +225,5 @@ void etnaviv_core_dump(struct etnaviv_gem_submit *submit)
 
 	etnaviv_core_dump_header(&iter, ETDUMP_BUF_END, iter.data);
 
-	dev_coredumpv(gpu->dev, iter.start, iter.data - iter.start, GFP_KERNEL);
+	dev_coredumpv(gpu->dev, iter.start, iter.data - iter.start);
 }
diff --git a/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c b/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c
index e75b97127c0d..f057d294c30b 100644
--- a/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c
+++ b/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c
@@ -74,8 +74,8 @@ static void _msm_disp_snapshot_work(struct kthread_work *work)
 	 * If there is a codedump pending for the device, the dev_coredumpm()
 	 * will also free new coredump state.
 	 */
-	dev_coredumpm(disp_state->dev, THIS_MODULE, disp_state, 0, GFP_KERNEL,
-			disp_devcoredump_read, msm_disp_state_free);
+	dev_coredumpm(disp_state->dev, THIS_MODULE, disp_state, 0,
+		      disp_devcoredump_read, msm_disp_state_free);
 }
 
 void msm_disp_snapshot_state(struct drm_device *drm_dev)
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index eb8a6663f309..30576ced0a0a 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -317,8 +317,8 @@ static void msm_gpu_crashstate_capture(struct msm_gpu *gpu,
 	gpu->crashstate = state;
 
 	/* FIXME: Release the crashstate if this errors out? */
-	dev_coredumpm(gpu->dev->dev, THIS_MODULE, gpu, 0, GFP_KERNEL,
-		msm_gpu_devcoredump_read, msm_gpu_devcoredump_free);
+	dev_coredumpm(gpu->dev->dev, THIS_MODULE, gpu, 0,
+		      msm_gpu_devcoredump_read, msm_gpu_devcoredump_free);
 }
 #else
 static void msm_gpu_crashstate_capture(struct msm_gpu *gpu,
diff --git a/drivers/media/platform/qcom/venus/core.c b/drivers/media/platform/qcom/venus/core.c
index 877eca125803..db84dfb3fb11 100644
--- a/drivers/media/platform/qcom/venus/core.c
+++ b/drivers/media/platform/qcom/venus/core.c
@@ -49,7 +49,7 @@ static void venus_coredump(struct venus_core *core)
 
 	memcpy(data, mem_va, mem_size);
 	memunmap(mem_va);
-	dev_coredumpv(dev, data, mem_size, GFP_KERNEL);
+	dev_coredumpv(dev, data, mem_size);
 }
 
 static void venus_event_notify(struct venus_core *core, u32 event)
diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-dump.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-dump.c
index c991b30bc9f0..fa520ab7c960 100644
--- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-dump.c
+++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-dump.c
@@ -281,5 +281,5 @@ void mcp251xfd_dump(const struct mcp251xfd_priv *priv)
 	mcp251xfd_dump_end(priv, &iter);
 
 	dev_coredumpv(&priv->spi->dev, iter.start,
-		      iter.data - iter.start, GFP_KERNEL);
+		      iter.data - iter.start);
 }
diff --git a/drivers/net/wireless/ath/ath10k/coredump.c b/drivers/net/wireless/ath/ath10k/coredump.c
index fe6b6f97a916..dc9237069921 100644
--- a/drivers/net/wireless/ath/ath10k/coredump.c
+++ b/drivers/net/wireless/ath/ath10k/coredump.c
@@ -1607,7 +1607,7 @@ int ath10k_coredump_submit(struct ath10k *ar)
 		return -ENODATA;
 	}
 
-	dev_coredumpv(ar->dev, dump, le32_to_cpu(dump->len), GFP_KERNEL);
+	dev_coredumpv(ar->dev, dump, le32_to_cpu(dump->len));
 
 	return 0;
 }
diff --git a/drivers/net/wireless/ath/wil6210/wil_crash_dump.c b/drivers/net/wireless/ath/wil6210/wil_crash_dump.c
index 89c12cb2aaab..79299609dd62 100644
--- a/drivers/net/wireless/ath/wil6210/wil_crash_dump.c
+++ b/drivers/net/wireless/ath/wil6210/wil_crash_dump.c
@@ -117,6 +117,6 @@ void wil_fw_core_dump(struct wil6210_priv *wil)
 	/* fw_dump_data will be free in device coredump release function
 	 * after 5 min
 	 */
-	dev_coredumpv(wil_to_dev(wil), fw_dump_data, fw_dump_size, GFP_KERNEL);
+	dev_coredumpv(wil_to_dev(wil), fw_dump_data, fw_dump_size);
 	wil_info(wil, "fw core dumped, size %d bytes\n", fw_dump_size);
 }
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/debug.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/debug.c
index eecf8a38d94a..87f3652ef3bd 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/debug.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/debug.c
@@ -37,7 +37,7 @@ int brcmf_debug_create_memdump(struct brcmf_bus *bus, const void *data,
 		return err;
 	}
 
-	dev_coredumpv(bus->dev, dump, len + ramsize, GFP_KERNEL);
+	dev_coredumpv(bus->dev, dump, len + ramsize);
 
 	return 0;
 }
diff --git a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c
index abf49022edbe..f2f7cf494a8c 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c
+++ b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c
@@ -2601,8 +2601,7 @@ static void iwl_fw_error_dump(struct iwl_fw_runtime *fwrt,
 					     fw_error_dump.trans_ptr->data,
 					     fw_error_dump.trans_ptr->len,
 					     fw_error_dump.fwrt_len);
-		dev_coredumpsg(fwrt->trans->dev, sg_dump_data, file_len,
-			       GFP_KERNEL);
+		dev_coredumpsg(fwrt->trans->dev, sg_dump_data, file_len);
 	}
 	vfree(fw_error_dump.fwrt_ptr);
 	vfree(fw_error_dump.trans_ptr);
@@ -2647,8 +2646,7 @@ static void iwl_fw_error_ini_dump(struct iwl_fw_runtime *fwrt,
 					     entry->data, entry->size, offs);
 			offs += entry->size;
 		}
-		dev_coredumpsg(fwrt->trans->dev, sg_dump_data, file_len,
-			       GFP_KERNEL);
+		dev_coredumpsg(fwrt->trans->dev, sg_dump_data, file_len);
 	}
 	iwl_dump_ini_list_free(&dump_list);
 }
diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c
index ace7371c4773..26fef0ab1b0d 100644
--- a/drivers/net/wireless/marvell/mwifiex/main.c
+++ b/drivers/net/wireless/marvell/mwifiex/main.c
@@ -1115,8 +1115,7 @@ void mwifiex_upload_device_dump(struct mwifiex_adapter *adapter)
 	 */
 	mwifiex_dbg(adapter, MSG,
 		    "== mwifiex dump information to /sys/class/devcoredump start\n");
-	dev_coredumpv(adapter->dev, adapter->devdump_data, adapter->devdump_len,
-		      GFP_KERNEL);
+	dev_coredumpv(adapter->dev, adapter->devdump_data, adapter->devdump_len);
 	mwifiex_dbg(adapter, MSG,
 		    "== mwifiex dump information to /sys/class/devcoredump end\n");
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mac.c b/drivers/net/wireless/mediatek/mt76/mt7615/mac.c
index bd687f7de628..5336fe8c668d 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7615/mac.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7615/mac.c
@@ -2421,6 +2421,5 @@ void mt7615_coredump_work(struct work_struct *work)
 
 		dev_kfree_skb(skb);
 	}
-	dev_coredumpv(dev->mt76.dev, dump, MT76_CONNAC_COREDUMP_SZ,
-		      GFP_KERNEL);
+	dev_coredumpv(dev->mt76.dev, dump, MT76_CONNAC_COREDUMP_SZ);
 }
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/mac.c b/drivers/net/wireless/mediatek/mt76/mt7921/mac.c
index a630ddbf19e5..cac284f95ce0 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/mac.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/mac.c
@@ -1630,8 +1630,7 @@ void mt7921_coredump_work(struct work_struct *work)
 	}
 
 	if (dump)
-		dev_coredumpv(dev->mt76.dev, dump, MT76_CONNAC_COREDUMP_SZ,
-			      GFP_KERNEL);
+		dev_coredumpv(dev->mt76.dev, dump, MT76_CONNAC_COREDUMP_SZ);
 
 	mt7921_reset(&dev->mt76);
 }
diff --git a/drivers/net/wireless/realtek/rtw88/main.c b/drivers/net/wireless/realtek/rtw88/main.c
index efabd5b1bf5b..a276544cecdd 100644
--- a/drivers/net/wireless/realtek/rtw88/main.c
+++ b/drivers/net/wireless/realtek/rtw88/main.c
@@ -414,7 +414,7 @@ static void rtw_fwcd_dump(struct rtw_dev *rtwdev)
 	 * framework. Note that a new dump will be discarded if a previous one
 	 * hasn't been released yet.
 	 */
-	dev_coredumpv(rtwdev->dev, desc->data, desc->size, GFP_KERNEL);
+	dev_coredumpv(rtwdev->dev, desc->data, desc->size);
 }
 
 static void rtw_fwcd_free(struct rtw_dev *rtwdev, bool free_self)
diff --git a/drivers/net/wireless/realtek/rtw89/ser.c b/drivers/net/wireless/realtek/rtw89/ser.c
index 9e95ed972710..d28fe01ad729 100644
--- a/drivers/net/wireless/realtek/rtw89/ser.c
+++ b/drivers/net/wireless/realtek/rtw89/ser.c
@@ -127,7 +127,7 @@ static void rtw89_ser_cd_send(struct rtw89_dev *rtwdev,
 	 * will be discarded if a previous one hasn't been released by
 	 * framework yet.
 	 */
-	dev_coredumpv(rtwdev->dev, buf, sizeof(*buf), GFP_KERNEL);
+	dev_coredumpv(rtwdev->dev, buf, sizeof(*buf));
 }
 
 static void rtw89_ser_cd_free(struct rtw89_dev *rtwdev,
diff --git a/drivers/remoteproc/qcom_q6v5_mss.c b/drivers/remoteproc/qcom_q6v5_mss.c
index af217de75e4d..813d87faef6c 100644
--- a/drivers/remoteproc/qcom_q6v5_mss.c
+++ b/drivers/remoteproc/qcom_q6v5_mss.c
@@ -597,7 +597,7 @@ static void q6v5_dump_mba_logs(struct q6v5 *qproc)
 	data = vmalloc(MBA_LOG_SIZE);
 	if (data) {
 		memcpy(data, mba_region, MBA_LOG_SIZE);
-		dev_coredumpv(&rproc->dev, data, MBA_LOG_SIZE, GFP_KERNEL);
+		dev_coredumpv(&rproc->dev, data, MBA_LOG_SIZE);
 	}
 	memunmap(mba_region);
 }
diff --git a/drivers/remoteproc/remoteproc_coredump.c b/drivers/remoteproc/remoteproc_coredump.c
index 4b093420d98a..cd55c2abd227 100644
--- a/drivers/remoteproc/remoteproc_coredump.c
+++ b/drivers/remoteproc/remoteproc_coredump.c
@@ -309,7 +309,7 @@ void rproc_coredump(struct rproc *rproc)
 		phdr += elf_size_of_phdr(class);
 	}
 	if (dump_conf == RPROC_COREDUMP_ENABLED) {
-		dev_coredumpv(&rproc->dev, data, data_size, GFP_KERNEL);
+		dev_coredumpv(&rproc->dev, data, data_size);
 		return;
 	}
 
@@ -318,7 +318,7 @@ void rproc_coredump(struct rproc *rproc)
 	dump_state.header = data;
 	init_completion(&dump_state.dump_done);
 
-	dev_coredumpm(&rproc->dev, NULL, &dump_state, data_size, GFP_KERNEL,
+	dev_coredumpm(&rproc->dev, NULL, &dump_state, data_size,
 		      rproc_coredump_read, rproc_coredump_free);
 
 	/*
@@ -449,7 +449,7 @@ void rproc_coredump_using_sections(struct rproc *rproc)
 	}
 
 	if (dump_conf == RPROC_COREDUMP_ENABLED) {
-		dev_coredumpv(&rproc->dev, data, data_size, GFP_KERNEL);
+		dev_coredumpv(&rproc->dev, data, data_size);
 		return;
 	}
 
@@ -458,7 +458,7 @@ void rproc_coredump_using_sections(struct rproc *rproc)
 	dump_state.header = data;
 	init_completion(&dump_state.dump_done);
 
-	dev_coredumpm(&rproc->dev, NULL, &dump_state, data_size, GFP_KERNEL,
+	dev_coredumpm(&rproc->dev, NULL, &dump_state, data_size,
 		      rproc_coredump_read, rproc_coredump_free);
 
 	/* Wait until the dump is read and free is called. Data is freed
diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
index 22fabdeed297..b41850366bcc 100644
--- a/include/drm/drm_print.h
+++ b/include/drm/drm_print.h
@@ -162,7 +162,7 @@ struct drm_print_iterator {
  *	void makecoredump(...)
  *	{
  *		...
- *		dev_coredumpm(dev, THIS_MODULE, data, 0, GFP_KERNEL,
+ *		dev_coredumpm(dev, THIS_MODULE, data, 0,
  *			coredump_read, ...)
  *	}
  *
diff --git a/include/linux/devcoredump.h b/include/linux/devcoredump.h
index c008169ed2c6..c7d840d824c3 100644
--- a/include/linux/devcoredump.h
+++ b/include/linux/devcoredump.h
@@ -52,27 +52,26 @@ static inline void _devcd_free_sgtable(struct scatterlist *table)
 
 
 #ifdef CONFIG_DEV_COREDUMP
-void dev_coredumpv(struct device *dev, void *data, size_t datalen,
-		   gfp_t gfp);
+void dev_coredumpv(struct device *dev, void *data, size_t datalen);
 
 void dev_coredumpm(struct device *dev, struct module *owner,
-		   void *data, size_t datalen, gfp_t gfp,
+		   void *data, size_t datalen,
 		   ssize_t (*read)(char *buffer, loff_t offset, size_t count,
 				   void *data, size_t datalen),
 		   void (*free)(void *data));
 
 void dev_coredumpsg(struct device *dev, struct scatterlist *table,
-		    size_t datalen, gfp_t gfp);
+		    size_t datalen);
 #else
 static inline void dev_coredumpv(struct device *dev, void *data,
-				 size_t datalen, gfp_t gfp)
+				 size_t datalen)
 {
 	vfree(data);
 }
 
 static inline void
 dev_coredumpm(struct device *dev, struct module *owner,
-	      void *data, size_t datalen, gfp_t gfp,
+	      void *data, size_t datalen,
 	      ssize_t (*read)(char *buffer, loff_t offset, size_t count,
 			      void *data, size_t datalen),
 	      void (*free)(void *data))
@@ -81,7 +80,7 @@ dev_coredumpm(struct device *dev, struct module *owner,
 }
 
 static inline void dev_coredumpsg(struct device *dev, struct scatterlist *table,
-				  size_t datalen, gfp_t gfp)
+				  size_t datalen)
 {
 	_devcd_free_sgtable(table);
 }
diff --git a/sound/soc/intel/avs/apl.c b/sound/soc/intel/avs/apl.c
index b8e2b23c9f64..1ff57f1a483d 100644
--- a/sound/soc/intel/avs/apl.c
+++ b/sound/soc/intel/avs/apl.c
@@ -164,7 +164,7 @@ static int apl_coredump(struct avs_dev *adev, union avs_notify_msg *msg)
 	} while (offset < msg->ext.coredump.stack_dump_size);
 
 exit:
-	dev_coredumpv(adev->dev, dump, dump_size, GFP_KERNEL);
+	dev_coredumpv(adev->dev, dump, dump_size);
 
 	return 0;
 }
diff --git a/sound/soc/intel/avs/skl.c b/sound/soc/intel/avs/skl.c
index bda5ec7510fe..3413162768dc 100644
--- a/sound/soc/intel/avs/skl.c
+++ b/sound/soc/intel/avs/skl.c
@@ -88,7 +88,7 @@ static int skl_coredump(struct avs_dev *adev, union avs_notify_msg *msg)
 		return -ENOMEM;
 
 	memcpy_fromio(dump, avs_sram_addr(adev, AVS_FW_REGS_WINDOW), AVS_FW_REGS_SIZE);
-	dev_coredumpv(adev->dev, dump, AVS_FW_REGS_SIZE, GFP_KERNEL);
+	dev_coredumpv(adev->dev, dump, AVS_FW_REGS_SIZE);
 
 	return 0;
 }
diff --git a/sound/soc/intel/catpt/dsp.c b/sound/soc/intel/catpt/dsp.c
index 346bec000306..d2afe9ff1e3a 100644
--- a/sound/soc/intel/catpt/dsp.c
+++ b/sound/soc/intel/catpt/dsp.c
@@ -539,7 +539,7 @@ int catpt_coredump(struct catpt_dev *cdev)
 		pos += CATPT_DMA_REGS_SIZE;
 	}
 
-	dev_coredumpv(cdev->dev, dump, dump_size, GFP_KERNEL);
+	dev_coredumpv(cdev->dev, dump, dump_size);
 
 	return 0;
 }
-- 
cgit 


From e386b6725798eec07facedf4d4bb710c079fd25c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 2 Jun 2022 17:30:01 -0700
Subject: rcu-tasks: Eliminate RCU Tasks Trace IPIs to online CPUs

Currently, the RCU Tasks Trace grace-period kthread IPIs each online CPU
using smp_call_function_single() in order to track any tasks currently in
RCU Tasks Trace read-side critical sections during which the corresponding
task has neither blocked nor been preempted.  These IPIs are annoying
and are also not strictly necessary because any task that blocks or is
preempted within its current RCU Tasks Trace read-side critical section
will be tracked on one of the per-CPU rcu_tasks_percpu structure's
->rtp_blkd_tasks list.  So the only time that this is a problem is if
one of the CPUs runs through a long-duration RCU Tasks Trace read-side
critical section without a context switch.

Note that the task_call_func() function cannot help here because there is
no safe way to identify the target task.  Of course, the task_call_func()
function will be very useful later, when processing the list of tasks,
but it needs to know the task.

This commit therefore creates a cpu_curr_snapshot() function that returns
a pointer the task_struct structure of some task that happened to be
running on the specified CPU more or less during the time that the
cpu_curr_snapshot() function was executing.  If there was no context
switch during this time, this function will return a pointer to the
task_struct structure of the task that was running throughout.  If there
was a context switch, then the outgoing task will be taken care of by
RCU's context-switch hook, and the incoming task was either already taken
care during some previous context switch, or it is not currently within an
RCU Tasks Trace read-side critical section.  And in this latter case, the
grace period already started, so there is no need to wait on this task.

This new cpu_curr_snapshot() function is invoked on each CPU early in
the RCU Tasks Trace grace-period processing, and the resulting tasks
are queued for later quiescent-state inspection.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: KP Singh <kpsingh@kernel.org>
---
 include/linux/sched.h |  1 +
 kernel/rcu/tasks.h    | 24 +++++++-----------------
 kernel/sched/core.c   | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 40 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b88caf54e168..72242bc73d85 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2224,6 +2224,7 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 extern bool sched_task_on_rq(struct task_struct *p);
 extern unsigned long get_wchan(struct task_struct *p);
+extern struct task_struct *cpu_curr_snapshot(int cpu);
 
 /*
  * In order to reduce various lock holder preemption latencies provide an
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 9d7d6fd4b8a7..c2aae2643a0b 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -1479,21 +1479,6 @@ static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop
 		trc_wait_for_one_reader(t, hop);
 }
 
-/*
- * Get the current CPU's current task on the holdout list.
- * Calls to this function must be serialized.
- */
-static void rcu_tasks_trace_pertask_handler(void *hop_in)
-{
-	struct list_head *hop = hop_in;
-	struct task_struct *t = current;
-
-	// Pull in the currently running task, but only if it is currently
-	// in an RCU tasks trace read-side critical section.
-	if (rcu_tasks_trace_pertask_prep(t, false))
-		trc_add_holdout(t, hop);
-}
-
 /* Initialize for a new RCU-tasks-trace grace period. */
 static void rcu_tasks_trace_pregp_step(struct list_head *hop)
 {
@@ -1513,8 +1498,13 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop)
 
 	// These smp_call_function_single() calls are serialized to
 	// allow safe access to the hop list.
-	for_each_online_cpu(cpu)
-		smp_call_function_single(cpu, rcu_tasks_trace_pertask_handler, hop, 1);
+	for_each_online_cpu(cpu) {
+		rcu_read_lock();
+		t = cpu_curr_snapshot(cpu);
+		if (rcu_tasks_trace_pertask_prep(t, true))
+			trc_add_holdout(t, hop);
+		rcu_read_unlock();
+	}
 
 	// Only after all running tasks have been accounted for is it
 	// safe to take care of the tasks that have blocked within their
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index da0bf6fe9ecd..9568019be124 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4263,6 +4263,38 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg)
 	return ret;
 }
 
+/**
+ * cpu_curr_snapshot - Return a snapshot of the currently running task
+ * @cpu: The CPU on which to snapshot the task.
+ *
+ * Returns the task_struct pointer of the task "currently" running on
+ * the specified CPU.  If the same task is running on that CPU throughout,
+ * the return value will be a pointer to that task's task_struct structure.
+ * If the CPU did any context switches even vaguely concurrently with the
+ * execution of this function, the return value will be a pointer to the
+ * task_struct structure of a randomly chosen task that was running on
+ * that CPU somewhere around the time that this function was executing.
+ *
+ * If the specified CPU was offline, the return value is whatever it
+ * is, perhaps a pointer to the task_struct structure of that CPU's idle
+ * task, but there is no guarantee.  Callers wishing a useful return
+ * value must take some action to ensure that the specified CPU remains
+ * online throughout.
+ *
+ * This function executes full memory barriers before and after fetching
+ * the pointer, which permits the caller to confine this function's fetch
+ * with respect to the caller's accesses to other shared variables.
+ */
+struct task_struct *cpu_curr_snapshot(int cpu)
+{
+	struct task_struct *t;
+
+	smp_mb(); /* Pairing determined by caller's synchronization design. */
+	t = rcu_dereference(cpu_curr(cpu));
+	smp_mb(); /* Pairing determined by caller's synchronization design. */
+	return t;
+}
+
 /**
  * wake_up_process - Wake up a specific process
  * @p: The process to be woken up.
-- 
cgit 


From 0ffc781a19ed3030c792ad1a0e44e6e047bb9adc Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 8 Jun 2022 16:40:20 +0200
Subject: context_tracking: Rename __context_tracking_enter/exit() to
 __ct_user_enter/exit()

The context tracking namespace is going to expand and some new functions
will require even longer names. Start shrinking the context_tracking
prefix to "ct" as is already the case for some existing macros, this
will make the introduction of new functions easier.

Acked-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Nicolas Saenz Julienne <nsaenz@kernel.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Cc: Yu Liao <liaoyu15@huawei.com>
Cc: Phil Auld <pauld@redhat.com>
Cc: Paul Gortmaker<paul.gortmaker@windriver.com>
Cc: Alex Belits <abelits@marvell.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Tested-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
---
 include/linux/context_tracking.h | 12 ++++++------
 kernel/context_tracking.c        | 20 ++++++++++----------
 2 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 7a14807c9d1a..773035124bad 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -14,8 +14,8 @@
 extern void context_tracking_cpu_set(int cpu);
 
 /* Called with interrupts disabled.  */
-extern void __context_tracking_enter(enum ctx_state state);
-extern void __context_tracking_exit(enum ctx_state state);
+extern void __ct_user_enter(enum ctx_state state);
+extern void __ct_user_exit(enum ctx_state state);
 
 extern void context_tracking_enter(enum ctx_state state);
 extern void context_tracking_exit(enum ctx_state state);
@@ -38,13 +38,13 @@ static inline void user_exit(void)
 static __always_inline void user_enter_irqoff(void)
 {
 	if (context_tracking_enabled())
-		__context_tracking_enter(CONTEXT_USER);
+		__ct_user_enter(CONTEXT_USER);
 
 }
 static __always_inline void user_exit_irqoff(void)
 {
 	if (context_tracking_enabled())
-		__context_tracking_exit(CONTEXT_USER);
+		__ct_user_exit(CONTEXT_USER);
 }
 
 static inline enum ctx_state exception_enter(void)
@@ -74,7 +74,7 @@ static inline void exception_exit(enum ctx_state prev_ctx)
 static __always_inline bool context_tracking_guest_enter(void)
 {
 	if (context_tracking_enabled())
-		__context_tracking_enter(CONTEXT_GUEST);
+		__ct_user_enter(CONTEXT_GUEST);
 
 	return context_tracking_enabled_this_cpu();
 }
@@ -82,7 +82,7 @@ static __always_inline bool context_tracking_guest_enter(void)
 static __always_inline void context_tracking_guest_exit(void)
 {
 	if (context_tracking_enabled())
-		__context_tracking_exit(CONTEXT_GUEST);
+		__ct_user_exit(CONTEXT_GUEST);
 }
 
 /**
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 3082332f6476..88c60ab39fbb 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -51,15 +51,15 @@ static __always_inline void context_tracking_recursion_exit(void)
 }
 
 /**
- * context_tracking_enter - Inform the context tracking that the CPU is going
- *                          enter user or guest space mode.
+ * __ct_user_enter - Inform the context tracking that the CPU is going
+ *		     to enter user or guest space mode.
  *
  * This function must be called right before we switch from the kernel
  * to user or guest space, when it's guaranteed the remaining kernel
  * instructions to execute won't use any RCU read side critical section
  * because this function sets RCU in extended quiescent state.
  */
-void noinstr __context_tracking_enter(enum ctx_state state)
+void noinstr __ct_user_enter(enum ctx_state state)
 {
 	/* Kernel threads aren't supposed to go to userspace */
 	WARN_ON_ONCE(!current->mm);
@@ -101,7 +101,7 @@ void noinstr __context_tracking_enter(enum ctx_state state)
 	}
 	context_tracking_recursion_exit();
 }
-EXPORT_SYMBOL_GPL(__context_tracking_enter);
+EXPORT_SYMBOL_GPL(__ct_user_enter);
 
 /*
  * OBSOLETE:
@@ -128,7 +128,7 @@ void context_tracking_enter(enum ctx_state state)
 		return;
 
 	local_irq_save(flags);
-	__context_tracking_enter(state);
+	__ct_user_enter(state);
 	local_irq_restore(flags);
 }
 NOKPROBE_SYMBOL(context_tracking_enter);
@@ -149,8 +149,8 @@ void context_tracking_user_enter(void)
 NOKPROBE_SYMBOL(context_tracking_user_enter);
 
 /**
- * context_tracking_exit - Inform the context tracking that the CPU is
- *                         exiting user or guest mode and entering the kernel.
+ * __ct_user_exit - Inform the context tracking that the CPU is
+ *		    exiting user or guest mode and entering the kernel.
  *
  * This function must be called after we entered the kernel from user or
  * guest space before any use of RCU read side critical section. This
@@ -160,7 +160,7 @@ NOKPROBE_SYMBOL(context_tracking_user_enter);
  * This call supports re-entrancy. This way it can be called from any exception
  * handler without needing to know if we came from userspace or not.
  */
-void noinstr __context_tracking_exit(enum ctx_state state)
+void noinstr __ct_user_exit(enum ctx_state state)
 {
 	if (!context_tracking_recursion_enter())
 		return;
@@ -183,7 +183,7 @@ void noinstr __context_tracking_exit(enum ctx_state state)
 	}
 	context_tracking_recursion_exit();
 }
-EXPORT_SYMBOL_GPL(__context_tracking_exit);
+EXPORT_SYMBOL_GPL(__ct_user_exit);
 
 /*
  * OBSOLETE:
@@ -202,7 +202,7 @@ void context_tracking_exit(enum ctx_state state)
 		return;
 
 	local_irq_save(flags);
-	__context_tracking_exit(state);
+	__ct_user_exit(state);
 	local_irq_restore(flags);
 }
 NOKPROBE_SYMBOL(context_tracking_exit);
-- 
cgit 


From bb42856bfd54fda1cbc7c470fcf5db1596938f4f Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Thu, 16 Jun 2022 17:27:36 -0500
Subject: scsi: iscsi: Add helper to remove a session from the kernel

During qedi shutdown we need to stop the iSCSI layer from sending new nops
as pings and from responding to target ones and make sure there is no
running connection cleanups. Commit d1f2ce77638d ("scsi: qedi: Fix host
removal with running sessions") converted the driver to use the libicsi
helper to drive session removal, so the above issues could be handled. The
problem is that during system shutdown iscsid will not be running so when
we try to remove the root session we will hang waiting for userspace to
reply.

Add a helper that will drive the destruction of sessions like these during
system shutdown.

Link: https://lore.kernel.org/r/20220616222738.5722-5-michael.christie@oracle.com
Tested-by: Nilesh Javali <njavali@marvell.com>
Reviewed-by: Nilesh Javali <njavali@marvell.com>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/scsi_transport_iscsi.c | 49 +++++++++++++++++++++++++++++++++++++
 include/scsi/scsi_transport_iscsi.h |  1 +
 2 files changed, 50 insertions(+)

(limited to 'include')

diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index c9de0acdccee..8be99d044425 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -2334,6 +2334,55 @@ static void iscsi_cleanup_conn_work_fn(struct work_struct *work)
 	ISCSI_DBG_TRANS_CONN(conn, "cleanup done.\n");
 }
 
+static int iscsi_iter_force_destroy_conn_fn(struct device *dev, void *data)
+{
+	struct iscsi_transport *transport;
+	struct iscsi_cls_conn *conn;
+
+	if (!iscsi_is_conn_dev(dev))
+		return 0;
+
+	conn = iscsi_dev_to_conn(dev);
+	transport = conn->transport;
+
+	if (READ_ONCE(conn->state) != ISCSI_CONN_DOWN)
+		iscsi_if_stop_conn(conn, STOP_CONN_TERM);
+
+	transport->destroy_conn(conn);
+	return 0;
+}
+
+/**
+ * iscsi_force_destroy_session - destroy a session from the kernel
+ * @session: session to destroy
+ *
+ * Force the destruction of a session from the kernel. This should only be
+ * used when userspace is no longer running during system shutdown.
+ */
+void iscsi_force_destroy_session(struct iscsi_cls_session *session)
+{
+	struct iscsi_transport *transport = session->transport;
+	unsigned long flags;
+
+	WARN_ON_ONCE(system_state == SYSTEM_RUNNING);
+
+	spin_lock_irqsave(&sesslock, flags);
+	if (list_empty(&session->sess_list)) {
+		spin_unlock_irqrestore(&sesslock, flags);
+		/*
+		 * Conn/ep is already freed. Session is being torn down via
+		 * async path. For shutdown we don't care about it so return.
+		 */
+		return;
+	}
+	spin_unlock_irqrestore(&sesslock, flags);
+
+	device_for_each_child(&session->dev, NULL,
+			      iscsi_iter_force_destroy_conn_fn);
+	transport->destroy_session(session);
+}
+EXPORT_SYMBOL_GPL(iscsi_force_destroy_session);
+
 void iscsi_free_session(struct iscsi_cls_session *session)
 {
 	ISCSI_DBG_TRANS_SESSION(session, "Freeing session\n");
diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h
index 695396a5f607..cab52b0f11d0 100644
--- a/include/scsi/scsi_transport_iscsi.h
+++ b/include/scsi/scsi_transport_iscsi.h
@@ -442,6 +442,7 @@ extern struct iscsi_cls_session *iscsi_create_session(struct Scsi_Host *shost,
 						struct iscsi_transport *t,
 						int dd_size,
 						unsigned int target_id);
+extern void iscsi_force_destroy_session(struct iscsi_cls_session *session);
 extern void iscsi_remove_session(struct iscsi_cls_session *session);
 extern void iscsi_free_session(struct iscsi_cls_session *session);
 extern struct iscsi_cls_conn *iscsi_alloc_conn(struct iscsi_cls_session *sess,
-- 
cgit 


From 31500e902759322ba3c64b60dabae2704e738df8 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Thu, 16 Jun 2022 17:27:38 -0500
Subject: scsi: iscsi: Fix session removal on shutdown

When the system is shutting down, iscsid is not running so we will not get
a response to the ISCSI_ERR_INVALID_HOST error event. The system shutdown
will then hang waiting on userspace to remove the session.

This has libiscsi force the destruction of the session from the kernel when
iscsi_host_remove() is called from a driver's shutdown callout.

This fixes a regression added in qedi boot with commit d1f2ce77638d ("scsi:
qedi: Fix host removal with running sessions") which made qedi use the
common session removal function that waits on userspace instead of rolling
its own kernel based removal.

Link: https://lore.kernel.org/r/20220616222738.5722-7-michael.christie@oracle.com
Fixes: d1f2ce77638d ("scsi: qedi: Fix host removal with running sessions")
Tested-by: Nilesh Javali <njavali@marvell.com>
Reviewed-by: Lee Duncan <lduncan@suse.com>
Reviewed-by: Nilesh Javali <njavali@marvell.com>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/infiniband/ulp/iser/iscsi_iser.c | 4 ++--
 drivers/scsi/be2iscsi/be_main.c          | 2 +-
 drivers/scsi/bnx2i/bnx2i_iscsi.c         | 2 +-
 drivers/scsi/cxgbi/libcxgbi.c            | 2 +-
 drivers/scsi/iscsi_tcp.c                 | 4 ++--
 drivers/scsi/libiscsi.c                  | 9 +++++++--
 drivers/scsi/qedi/qedi_main.c            | 9 ++++++---
 include/scsi/libiscsi.h                  | 2 +-
 8 files changed, 21 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 321949a570ed..620ae5b2d80d 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -568,7 +568,7 @@ static void iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session)
 	struct Scsi_Host *shost = iscsi_session_to_shost(cls_session);
 
 	iscsi_session_teardown(cls_session);
-	iscsi_host_remove(shost);
+	iscsi_host_remove(shost, false);
 	iscsi_host_free(shost);
 }
 
@@ -685,7 +685,7 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
 	return cls_session;
 
 remove_host:
-	iscsi_host_remove(shost);
+	iscsi_host_remove(shost, false);
 free_host:
 	iscsi_host_free(shost);
 	return NULL;
diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c
index 3bb0adefbe06..02026476c39c 100644
--- a/drivers/scsi/be2iscsi/be_main.c
+++ b/drivers/scsi/be2iscsi/be_main.c
@@ -5745,7 +5745,7 @@ static void beiscsi_remove(struct pci_dev *pcidev)
 	cancel_work_sync(&phba->sess_work);
 
 	beiscsi_iface_destroy_default(phba);
-	iscsi_host_remove(phba->shost);
+	iscsi_host_remove(phba->shost, false);
 	beiscsi_disable_port(phba, 1);
 
 	/* after cancelling boot_work */
diff --git a/drivers/scsi/bnx2i/bnx2i_iscsi.c b/drivers/scsi/bnx2i/bnx2i_iscsi.c
index 15fbd09baa94..a3c800e04a2e 100644
--- a/drivers/scsi/bnx2i/bnx2i_iscsi.c
+++ b/drivers/scsi/bnx2i/bnx2i_iscsi.c
@@ -909,7 +909,7 @@ void bnx2i_free_hba(struct bnx2i_hba *hba)
 {
 	struct Scsi_Host *shost = hba->shost;
 
-	iscsi_host_remove(shost);
+	iscsi_host_remove(shost, false);
 	INIT_LIST_HEAD(&hba->ep_ofld_list);
 	INIT_LIST_HEAD(&hba->ep_active_list);
 	INIT_LIST_HEAD(&hba->ep_destroy_list);
diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c
index 4365d52c6430..32abdf0fa9aa 100644
--- a/drivers/scsi/cxgbi/libcxgbi.c
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@ -328,7 +328,7 @@ void cxgbi_hbas_remove(struct cxgbi_device *cdev)
 		chba = cdev->hbas[i];
 		if (chba) {
 			cdev->hbas[i] = NULL;
-			iscsi_host_remove(chba->shost);
+			iscsi_host_remove(chba->shost, false);
 			pci_dev_put(cdev->pdev);
 			iscsi_host_free(chba->shost);
 		}
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index 9fee70d6434a..52c6f70d60ec 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -898,7 +898,7 @@ iscsi_sw_tcp_session_create(struct iscsi_endpoint *ep, uint16_t cmds_max,
 remove_session:
 	iscsi_session_teardown(cls_session);
 remove_host:
-	iscsi_host_remove(shost);
+	iscsi_host_remove(shost, false);
 free_host:
 	iscsi_host_free(shost);
 	return NULL;
@@ -915,7 +915,7 @@ static void iscsi_sw_tcp_session_destroy(struct iscsi_cls_session *cls_session)
 	iscsi_tcp_r2tpool_free(cls_session->dd_data);
 	iscsi_session_teardown(cls_session);
 
-	iscsi_host_remove(shost);
+	iscsi_host_remove(shost, false);
 	iscsi_host_free(shost);
 }
 
diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index 8d78559ae94a..3894decf8fbf 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -2832,11 +2832,12 @@ static void iscsi_notify_host_removed(struct iscsi_cls_session *cls_session)
 /**
  * iscsi_host_remove - remove host and sessions
  * @shost: scsi host
+ * @is_shutdown: true if called from a driver shutdown callout
  *
  * If there are any sessions left, this will initiate the removal and wait
  * for the completion.
  */
-void iscsi_host_remove(struct Scsi_Host *shost)
+void iscsi_host_remove(struct Scsi_Host *shost, bool is_shutdown)
 {
 	struct iscsi_host *ihost = shost_priv(shost);
 	unsigned long flags;
@@ -2845,7 +2846,11 @@ void iscsi_host_remove(struct Scsi_Host *shost)
 	ihost->state = ISCSI_HOST_REMOVED;
 	spin_unlock_irqrestore(&ihost->lock, flags);
 
-	iscsi_host_for_each_session(shost, iscsi_notify_host_removed);
+	if (!is_shutdown)
+		iscsi_host_for_each_session(shost, iscsi_notify_host_removed);
+	else
+		iscsi_host_for_each_session(shost, iscsi_force_destroy_session);
+
 	wait_event_interruptible(ihost->session_removal_wq,
 				 ihost->num_sessions == 0);
 	if (signal_pending(current))
diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c
index deebe62e2b41..cecfb2cb4c7b 100644
--- a/drivers/scsi/qedi/qedi_main.c
+++ b/drivers/scsi/qedi/qedi_main.c
@@ -2414,9 +2414,12 @@ static void __qedi_remove(struct pci_dev *pdev, int mode)
 	int rval;
 	u16 retry = 10;
 
-	if (mode == QEDI_MODE_NORMAL || mode == QEDI_MODE_SHUTDOWN) {
-		iscsi_host_remove(qedi->shost);
+	if (mode == QEDI_MODE_NORMAL)
+		iscsi_host_remove(qedi->shost, false);
+	else if (mode == QEDI_MODE_SHUTDOWN)
+		iscsi_host_remove(qedi->shost, true);
 
+	if (mode == QEDI_MODE_NORMAL || mode == QEDI_MODE_SHUTDOWN) {
 		if (qedi->tmf_thread) {
 			destroy_workqueue(qedi->tmf_thread);
 			qedi->tmf_thread = NULL;
@@ -2791,7 +2794,7 @@ remove_host:
 #ifdef CONFIG_DEBUG_FS
 	qedi_dbg_host_exit(&qedi->dbg_ctx);
 #endif
-	iscsi_host_remove(qedi->shost);
+	iscsi_host_remove(qedi->shost, false);
 stop_iscsi_func:
 	qedi_ops->stop(qedi->cdev);
 stop_slowpath:
diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h
index c0703cd20a99..9758a4a9923f 100644
--- a/include/scsi/libiscsi.h
+++ b/include/scsi/libiscsi.h
@@ -411,7 +411,7 @@ extern int iscsi_host_add(struct Scsi_Host *shost, struct device *pdev);
 extern struct Scsi_Host *iscsi_host_alloc(struct scsi_host_template *sht,
 					  int dd_data_size,
 					  bool xmit_can_sleep);
-extern void iscsi_host_remove(struct Scsi_Host *shost);
+extern void iscsi_host_remove(struct Scsi_Host *shost, bool is_shutdown);
 extern void iscsi_host_free(struct Scsi_Host *shost);
 extern int iscsi_target_alloc(struct scsi_target *starget);
 extern int iscsi_host_get_max_scsi_cmds(struct Scsi_Host *shost,
-- 
cgit 


From 4b9f8ce4d5e823e42944c5a0a4842b0f936365ad Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Thu, 16 Jun 2022 17:45:49 -0500
Subject: scsi: iscsi: Rename iscsi_conn_queue_work()

Rename iscsi_conn_queue_work() to iscsi_conn_queue_xmit() to reflect that
it handles queueing of xmits only.

Link: https://lore.kernel.org/r/20220616224557.115234-2-michael.christie@oracle.com
Reviewed-by: Lee Duncan <lduncan@suse.com>
Reviewed-by: Wu Bo <wubo40@huawei.com>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/cxgbi/libcxgbi.c |  2 +-
 drivers/scsi/iscsi_tcp.c      |  2 +-
 drivers/scsi/libiscsi.c       | 12 ++++++------
 include/scsi/libiscsi.h       |  2 +-
 4 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c
index 32abdf0fa9aa..af281e271f88 100644
--- a/drivers/scsi/cxgbi/libcxgbi.c
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@ -1455,7 +1455,7 @@ void cxgbi_conn_tx_open(struct cxgbi_sock *csk)
 	if (conn) {
 		log_debug(1 << CXGBI_DBG_SOCK,
 			"csk 0x%p, cid %d.\n", csk, conn->id);
-		iscsi_conn_queue_work(conn);
+		iscsi_conn_queue_xmit(conn);
 	}
 }
 EXPORT_SYMBOL_GPL(cxgbi_conn_tx_open);
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index 52c6f70d60ec..da1dc345b873 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -205,7 +205,7 @@ static void iscsi_sw_tcp_write_space(struct sock *sk)
 	old_write_space(sk);
 
 	ISCSI_SW_TCP_DBG(conn, "iscsi_write_space\n");
-	iscsi_conn_queue_work(conn);
+	iscsi_conn_queue_xmit(conn);
 }
 
 static void iscsi_sw_tcp_conn_set_callbacks(struct iscsi_conn *conn)
diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index 3894decf8fbf..a6859805d530 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -83,7 +83,7 @@ MODULE_PARM_DESC(debug_libiscsi_eh,
 				"%s " dbg_fmt, __func__, ##arg);	\
 	} while (0);
 
-inline void iscsi_conn_queue_work(struct iscsi_conn *conn)
+inline void iscsi_conn_queue_xmit(struct iscsi_conn *conn)
 {
 	struct Scsi_Host *shost = conn->session->host;
 	struct iscsi_host *ihost = shost_priv(shost);
@@ -91,7 +91,7 @@ inline void iscsi_conn_queue_work(struct iscsi_conn *conn)
 	if (ihost->workq)
 		queue_work(ihost->workq, &conn->xmitwork);
 }
-EXPORT_SYMBOL_GPL(iscsi_conn_queue_work);
+EXPORT_SYMBOL_GPL(iscsi_conn_queue_xmit);
 
 static void __iscsi_update_cmdsn(struct iscsi_session *session,
 				 uint32_t exp_cmdsn, uint32_t max_cmdsn)
@@ -765,7 +765,7 @@ __iscsi_conn_send_pdu(struct iscsi_conn *conn, struct iscsi_hdr *hdr,
 			goto free_task;
 	} else {
 		list_add_tail(&task->running, &conn->mgmtqueue);
-		iscsi_conn_queue_work(conn);
+		iscsi_conn_queue_xmit(conn);
 	}
 
 	return task;
@@ -1513,7 +1513,7 @@ void iscsi_requeue_task(struct iscsi_task *task)
 		 */
 		iscsi_put_task(task);
 	}
-	iscsi_conn_queue_work(conn);
+	iscsi_conn_queue_xmit(conn);
 	spin_unlock_bh(&conn->session->frwd_lock);
 }
 EXPORT_SYMBOL_GPL(iscsi_requeue_task);
@@ -1786,7 +1786,7 @@ int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc)
 		}
 	} else {
 		list_add_tail(&task->running, &conn->cmdqueue);
-		iscsi_conn_queue_work(conn);
+		iscsi_conn_queue_xmit(conn);
 	}
 
 	session->queued_cmdsn++;
@@ -1967,7 +1967,7 @@ EXPORT_SYMBOL_GPL(iscsi_suspend_tx);
 static void iscsi_start_tx(struct iscsi_conn *conn)
 {
 	clear_bit(ISCSI_CONN_FLAG_SUSPEND_TX, &conn->flags);
-	iscsi_conn_queue_work(conn);
+	iscsi_conn_queue_xmit(conn);
 }
 
 /*
diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h
index 9758a4a9923f..51fb0c17815e 100644
--- a/include/scsi/libiscsi.h
+++ b/include/scsi/libiscsi.h
@@ -453,7 +453,7 @@ extern int iscsi_conn_get_addr_param(struct sockaddr_storage *addr,
 				     enum iscsi_param param, char *buf);
 extern void iscsi_suspend_tx(struct iscsi_conn *conn);
 extern void iscsi_suspend_queue(struct iscsi_conn *conn);
-extern void iscsi_conn_queue_work(struct iscsi_conn *conn);
+extern void iscsi_conn_queue_xmit(struct iscsi_conn *conn);
 
 #define iscsi_conn_printk(prefix, _c, fmt, a...) \
 	iscsi_cls_conn_printk(prefix, ((struct iscsi_conn *)_c)->cls_conn, \
-- 
cgit 


From 8af809966c0b34cfacd8da9a412689b8e9910354 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Thu, 16 Jun 2022 17:45:50 -0500
Subject: scsi: iscsi: Add recv workqueue helpers

Add helpers to allow the drivers to run their recv paths from libiscsi's
workqueue.

Link: https://lore.kernel.org/r/20220616224557.115234-3-michael.christie@oracle.com
Reviewed-by: Lee Duncan <lduncan@suse.com>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/libiscsi.c | 29 +++++++++++++++++++++++++++--
 include/scsi/libiscsi.h |  4 ++++
 2 files changed, 31 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index a6859805d530..20156fb33646 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -93,6 +93,16 @@ inline void iscsi_conn_queue_xmit(struct iscsi_conn *conn)
 }
 EXPORT_SYMBOL_GPL(iscsi_conn_queue_xmit);
 
+inline void iscsi_conn_queue_recv(struct iscsi_conn *conn)
+{
+	struct Scsi_Host *shost = conn->session->host;
+	struct iscsi_host *ihost = shost_priv(shost);
+
+	if (ihost->workq && !test_bit(ISCSI_CONN_FLAG_SUSPEND_RX, &conn->flags))
+		queue_work(ihost->workq, &conn->recvwork);
+}
+EXPORT_SYMBOL_GPL(iscsi_conn_queue_recv);
+
 static void __iscsi_update_cmdsn(struct iscsi_session *session,
 				 uint32_t exp_cmdsn, uint32_t max_cmdsn)
 {
@@ -1947,7 +1957,7 @@ EXPORT_SYMBOL_GPL(iscsi_suspend_queue);
 
 /**
  * iscsi_suspend_tx - suspend iscsi_data_xmit
- * @conn: iscsi conn tp stop processing IO on.
+ * @conn: iscsi conn to stop processing IO on.
  *
  * This function sets the suspend bit to prevent iscsi_data_xmit
  * from sending new IO, and if work is queued on the xmit thread
@@ -1960,7 +1970,7 @@ void iscsi_suspend_tx(struct iscsi_conn *conn)
 
 	set_bit(ISCSI_CONN_FLAG_SUSPEND_TX, &conn->flags);
 	if (ihost->workq)
-		flush_workqueue(ihost->workq);
+		flush_work(&conn->xmitwork);
 }
 EXPORT_SYMBOL_GPL(iscsi_suspend_tx);
 
@@ -1970,6 +1980,21 @@ static void iscsi_start_tx(struct iscsi_conn *conn)
 	iscsi_conn_queue_xmit(conn);
 }
 
+/**
+ * iscsi_suspend_rx - Prevent recvwork from running again.
+ * @conn: iscsi conn to stop.
+ */
+void iscsi_suspend_rx(struct iscsi_conn *conn)
+{
+	struct Scsi_Host *shost = conn->session->host;
+	struct iscsi_host *ihost = shost_priv(shost);
+
+	set_bit(ISCSI_CONN_FLAG_SUSPEND_RX, &conn->flags);
+	if (ihost->workq)
+		flush_work(&conn->recvwork);
+}
+EXPORT_SYMBOL_GPL(iscsi_suspend_rx);
+
 /*
  * We want to make sure a ping is in flight. It has timed out.
  * And we are not busy processing a pdu that is making
diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h
index 51fb0c17815e..5a10e5acfad2 100644
--- a/include/scsi/libiscsi.h
+++ b/include/scsi/libiscsi.h
@@ -213,6 +213,8 @@ struct iscsi_conn {
 	struct list_head	cmdqueue;	/* data-path cmd queue */
 	struct list_head	requeue;	/* tasks needing another run */
 	struct work_struct	xmitwork;	/* per-conn. xmit workqueue */
+	/* recv */
+	struct work_struct	recvwork;
 	unsigned long		flags;		/* ISCSI_CONN_FLAGs */
 
 	/* negotiated params */
@@ -452,8 +454,10 @@ extern int iscsi_conn_get_param(struct iscsi_cls_conn *cls_conn,
 extern int iscsi_conn_get_addr_param(struct sockaddr_storage *addr,
 				     enum iscsi_param param, char *buf);
 extern void iscsi_suspend_tx(struct iscsi_conn *conn);
+extern void iscsi_suspend_rx(struct iscsi_conn *conn);
 extern void iscsi_suspend_queue(struct iscsi_conn *conn);
 extern void iscsi_conn_queue_xmit(struct iscsi_conn *conn);
+extern void iscsi_conn_queue_recv(struct iscsi_conn *conn);
 
 #define iscsi_conn_printk(prefix, _c, fmt, a...) \
 	iscsi_cls_conn_printk(prefix, ((struct iscsi_conn *)_c)->cls_conn, \
-- 
cgit 


From e1c6a7ec14290a0e371b09300685638f9009f2ab Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Thu, 16 Jun 2022 17:45:55 -0500
Subject: scsi: iscsi: Remove iscsi_get_task back_lock requirement

We currently require that the back_lock is held when calling the functions
that manipulate the iscsi_task refcount. The only reason for this is to
handle races where we are handling SCSI-ml EH callbacks and the cmd is
completing at the same time the normal completion path is running, and we
can't return from the EH callback until the driver has stopped accessing
the cmd. Holding the back_lock while also accessing the task->state made it
simple to check that a cmd is completing and also get/put a refcount at the
same time, and at the time we were not as concerned about performance.

The problem is that we don't want to take the back_lock from the xmit path
for normal I/O since it causes contention with the completion path if the
user has chosen to try and split those paths on different CPUs (in this
case abusing the CPUs and ignoring caching improves perf for some uses).

Begins to remove the back_lock requirement for iscsi_get/put_task by
removing the requirement for the get path. Instead of always holding the
back_lock we detect if something has done the last put and is about to call
iscsi_free_task(). A subsequent commit will then allow iSCSI code to do the
last put on a task and only grab the back_lock if the refcount is now zero
and it's going to call iscsi_free_task().

Link: https://lore.kernel.org/r/20220616224557.115234-8-michael.christie@oracle.com
Reviewed-by: Lee Duncan <lduncan@suse.com>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/be2iscsi/be_main.c | 19 ++++++++-
 drivers/scsi/libiscsi.c         | 95 ++++++++++++++++++++++++++++-------------
 drivers/scsi/libiscsi_tcp.c     |  6 ++-
 include/scsi/libiscsi.h         |  2 +-
 4 files changed, 89 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c
index 02026476c39c..50a577ac3bb4 100644
--- a/drivers/scsi/be2iscsi/be_main.c
+++ b/drivers/scsi/be2iscsi/be_main.c
@@ -231,6 +231,7 @@ static int beiscsi_eh_abort(struct scsi_cmnd *sc)
 	cls_session = starget_to_session(scsi_target(sc->device));
 	session = cls_session->dd_data;
 
+completion_check:
 	/* check if we raced, task just got cleaned up under us */
 	spin_lock_bh(&session->back_lock);
 	if (!abrt_task || !abrt_task->sc) {
@@ -238,7 +239,13 @@ static int beiscsi_eh_abort(struct scsi_cmnd *sc)
 		return SUCCESS;
 	}
 	/* get a task ref till FW processes the req for the ICD used */
-	__iscsi_get_task(abrt_task);
+	if (!iscsi_get_task(abrt_task)) {
+		spin_unlock(&session->back_lock);
+		/* We are just about to call iscsi_free_task so wait for it. */
+		udelay(5);
+		goto completion_check;
+	}
+
 	abrt_io_task = abrt_task->dd_data;
 	conn = abrt_task->conn;
 	beiscsi_conn = conn->dd_data;
@@ -323,7 +330,15 @@ static int beiscsi_eh_device_reset(struct scsi_cmnd *sc)
 		}
 
 		/* get a task ref till FW processes the req for the ICD used */
-		__iscsi_get_task(task);
+		if (!iscsi_get_task(task)) {
+			/*
+			 * The task has completed in the driver and is
+			 * completing in libiscsi. Just ignore it here. When we
+			 * call iscsi_eh_device_reset, it will wait for us.
+			 */
+			continue;
+		}
+
 		io_task = task->dd_data;
 		/* mark WRB invalid which have been not processed by FW yet */
 		if (is_chip_be2_be3r(phba)) {
diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index fce9f9e5b00b..3a6b827496dd 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -83,6 +83,8 @@ MODULE_PARM_DESC(debug_libiscsi_eh,
 				"%s " dbg_fmt, __func__, ##arg);	\
 	} while (0);
 
+#define ISCSI_CMD_COMPL_WAIT 5
+
 inline void iscsi_conn_queue_xmit(struct iscsi_conn *conn)
 {
 	struct Scsi_Host *shost = conn->session->host;
@@ -482,11 +484,11 @@ static void iscsi_free_task(struct iscsi_task *task)
 	}
 }
 
-void __iscsi_get_task(struct iscsi_task *task)
+bool iscsi_get_task(struct iscsi_task *task)
 {
-	refcount_inc(&task->refcount);
+	return refcount_inc_not_zero(&task->refcount);
 }
-EXPORT_SYMBOL_GPL(__iscsi_get_task);
+EXPORT_SYMBOL_GPL(iscsi_get_task);
 
 void __iscsi_put_task(struct iscsi_task *task)
 {
@@ -600,20 +602,17 @@ static bool cleanup_queued_task(struct iscsi_task *task)
 }
 
 /*
- * session frwd lock must be held and if not called for a task that is still
- * pending or from the xmit thread, then xmit thread must be suspended
+ * session back and frwd lock must be held and if not called for a task that
+ * is still pending or from the xmit thread, then xmit thread must be suspended
  */
-static void fail_scsi_task(struct iscsi_task *task, int err)
+static void __fail_scsi_task(struct iscsi_task *task, int err)
 {
 	struct iscsi_conn *conn = task->conn;
 	struct scsi_cmnd *sc;
 	int state;
 
-	spin_lock_bh(&conn->session->back_lock);
-	if (cleanup_queued_task(task)) {
-		spin_unlock_bh(&conn->session->back_lock);
+	if (cleanup_queued_task(task))
 		return;
-	}
 
 	if (task->state == ISCSI_TASK_PENDING) {
 		/*
@@ -632,7 +631,15 @@ static void fail_scsi_task(struct iscsi_task *task, int err)
 	sc->result = err << 16;
 	scsi_set_resid(sc, scsi_bufflen(sc));
 	iscsi_complete_task(task, state);
-	spin_unlock_bh(&conn->session->back_lock);
+}
+
+static void fail_scsi_task(struct iscsi_task *task, int err)
+{
+	struct iscsi_session *session = task->conn->session;
+
+	spin_lock_bh(&session->back_lock);
+	__fail_scsi_task(task, err);
+	spin_unlock_bh(&session->back_lock);
 }
 
 static int iscsi_prep_mgmt_task(struct iscsi_conn *conn,
@@ -1450,8 +1457,17 @@ static int iscsi_xmit_task(struct iscsi_conn *conn, struct iscsi_task *task,
 	spin_lock_bh(&conn->session->back_lock);
 
 	if (!conn->task) {
-		/* Take a ref so we can access it after xmit_task() */
-		__iscsi_get_task(task);
+		/*
+		 * Take a ref so we can access it after xmit_task().
+		 *
+		 * This should never fail because the failure paths will have
+		 * stopped the xmit thread.
+		 */
+		if (!iscsi_get_task(task)) {
+			spin_unlock_bh(&conn->session->back_lock);
+			WARN_ON_ONCE(1);
+			return 0;
+		}
 	} else {
 		/* Already have a ref from when we failed to send it last call */
 		conn->task = NULL;
@@ -1493,7 +1509,7 @@ static int iscsi_xmit_task(struct iscsi_conn *conn, struct iscsi_task *task,
 		 * get an extra ref that is released next time we access it
 		 * as conn->task above.
 		 */
-		__iscsi_get_task(task);
+		iscsi_get_task(task);
 		conn->task = task;
 	}
 
@@ -1912,6 +1928,7 @@ static void fail_scsi_tasks(struct iscsi_conn *conn, u64 lun, int error)
 	struct iscsi_task *task;
 	int i;
 
+restart_cmd_loop:
 	spin_lock_bh(&session->back_lock);
 	for (i = 0; i < session->cmds_max; i++) {
 		task = session->cmds[i];
@@ -1920,22 +1937,25 @@ static void fail_scsi_tasks(struct iscsi_conn *conn, u64 lun, int error)
 
 		if (lun != -1 && lun != task->sc->device->lun)
 			continue;
-
-		__iscsi_get_task(task);
-		spin_unlock_bh(&session->back_lock);
+		/*
+		 * The cmd is completing but if this is called from an eh
+		 * callout path then when we return scsi-ml owns the cmd. Wait
+		 * for the completion path to finish freeing the cmd.
+		 */
+		if (!iscsi_get_task(task)) {
+			spin_unlock_bh(&session->back_lock);
+			spin_unlock_bh(&session->frwd_lock);
+			udelay(ISCSI_CMD_COMPL_WAIT);
+			spin_lock_bh(&session->frwd_lock);
+			goto restart_cmd_loop;
+		}
 
 		ISCSI_DBG_SESSION(session,
 				  "failing sc %p itt 0x%x state %d\n",
 				  task->sc, task->itt, task->state);
-		fail_scsi_task(task, error);
-
-		spin_unlock_bh(&session->frwd_lock);
-		iscsi_put_task(task);
-		spin_lock_bh(&session->frwd_lock);
-
-		spin_lock_bh(&session->back_lock);
+		__fail_scsi_task(task, error);
+		__iscsi_put_task(task);
 	}
-
 	spin_unlock_bh(&session->back_lock);
 }
 
@@ -2040,7 +2060,16 @@ enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc)
 		spin_unlock(&session->back_lock);
 		goto done;
 	}
-	__iscsi_get_task(task);
+	if (!iscsi_get_task(task)) {
+		/*
+		 * Racing with the completion path right now, so give it more
+		 * time so that path can complete it like normal.
+		 */
+		rc = BLK_EH_RESET_TIMER;
+		task = NULL;
+		spin_unlock(&session->back_lock);
+		goto done;
+	}
 	spin_unlock(&session->back_lock);
 
 	if (session->state != ISCSI_STATE_LOGGED_IN) {
@@ -2289,6 +2318,7 @@ int iscsi_eh_abort(struct scsi_cmnd *sc)
 
 	ISCSI_DBG_EH(session, "aborting sc %p\n", sc);
 
+completion_check:
 	mutex_lock(&session->eh_mutex);
 	spin_lock_bh(&session->frwd_lock);
 	/*
@@ -2328,13 +2358,20 @@ int iscsi_eh_abort(struct scsi_cmnd *sc)
 		return SUCCESS;
 	}
 
+	if (!iscsi_get_task(task)) {
+		spin_unlock(&session->back_lock);
+		spin_unlock_bh(&session->frwd_lock);
+		mutex_unlock(&session->eh_mutex);
+		/* We are just about to call iscsi_free_task so wait for it. */
+		udelay(ISCSI_CMD_COMPL_WAIT);
+		goto completion_check;
+	}
+
+	ISCSI_DBG_EH(session, "aborting [sc %p itt 0x%x]\n", sc, task->itt);
 	conn = session->leadconn;
 	iscsi_get_conn(conn->cls_conn);
 	conn->eh_abort_cnt++;
 	age = session->age;
-
-	ISCSI_DBG_EH(session, "aborting [sc %p itt 0x%x]\n", sc, task->itt);
-	__iscsi_get_task(task);
 	spin_unlock(&session->back_lock);
 
 	if (task->state == ISCSI_TASK_PENDING) {
diff --git a/drivers/scsi/libiscsi_tcp.c b/drivers/scsi/libiscsi_tcp.c
index 883005757ddb..c182aa83f2c9 100644
--- a/drivers/scsi/libiscsi_tcp.c
+++ b/drivers/scsi/libiscsi_tcp.c
@@ -558,7 +558,11 @@ static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_hdr *hdr)
 		return 0;
 	}
 	task->last_xfer = jiffies;
-	__iscsi_get_task(task);
+	if (!iscsi_get_task(task)) {
+		spin_unlock(&session->back_lock);
+		/* Let the path that got the early rsp complete it */
+		return 0;
+	}
 
 	tcp_conn = conn->dd_data;
 	rhdr = (struct iscsi_r2t_rsp *)tcp_conn->in.hdr;
diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h
index 5a10e5acfad2..ec15e820f62b 100644
--- a/include/scsi/libiscsi.h
+++ b/include/scsi/libiscsi.h
@@ -482,7 +482,7 @@ extern struct iscsi_task *iscsi_itt_to_task(struct iscsi_conn *, itt_t);
 extern void iscsi_requeue_task(struct iscsi_task *task);
 extern void iscsi_put_task(struct iscsi_task *task);
 extern void __iscsi_put_task(struct iscsi_task *task);
-extern void __iscsi_get_task(struct iscsi_task *task);
+extern bool iscsi_get_task(struct iscsi_task *task);
 extern void iscsi_complete_scsi_task(struct iscsi_task *task,
 				     uint32_t exp_cmdsn, uint32_t max_cmdsn);
 
-- 
cgit 


From 6e637b723d8277cac4d347ea671ab798b70d8fc9 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Thu, 16 Jun 2022 17:45:57 -0500
Subject: scsi: libiscsi: Improve conn_send_pdu API

The conn_send_pdu API is evil in that it returns a pointer to an
iscsi_task, but that task might have been freed already so you can't touch
it. This patch splits the task allocation and transmission, so functions
like iscsi_send_nopout() can access the task before its sent and do
whatever bookkeeping is needed before it is sent.

Link: https://lore.kernel.org/r/20220616224557.115234-10-michael.christie@oracle.com
Reviewed-by: Lee Duncan <lduncan@suse.com>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/libiscsi.c | 85 ++++++++++++++++++++++++++++++++++++-------------
 include/scsi/libiscsi.h |  3 --
 2 files changed, 62 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index 163e914084e0..d95f4bcdeb2e 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -695,12 +695,18 @@ static int iscsi_prep_mgmt_task(struct iscsi_conn *conn,
 	return 0;
 }
 
+/**
+ * iscsi_alloc_mgmt_task - allocate and setup a mgmt task.
+ * @conn: iscsi conn that the task will be sent on.
+ * @hdr: iscsi pdu that will be sent.
+ * @data: buffer for data segment if needed.
+ * @data_size: length of data in bytes.
+ */
 static struct iscsi_task *
-__iscsi_conn_send_pdu(struct iscsi_conn *conn, struct iscsi_hdr *hdr,
+iscsi_alloc_mgmt_task(struct iscsi_conn *conn, struct iscsi_hdr *hdr,
 		      char *data, uint32_t data_size)
 {
 	struct iscsi_session *session = conn->session;
-	struct iscsi_host *ihost = shost_priv(session->host);
 	uint8_t opcode = hdr->opcode & ISCSI_OPCODE_MASK;
 	struct iscsi_task *task;
 	itt_t itt;
@@ -781,28 +787,57 @@ __iscsi_conn_send_pdu(struct iscsi_conn *conn, struct iscsi_hdr *hdr,
 						   task->conn->session->age);
 	}
 
-	if (unlikely(READ_ONCE(conn->ping_task) == INVALID_SCSI_TASK))
-		WRITE_ONCE(conn->ping_task, task);
+	return task;
+
+free_task:
+	iscsi_put_task(task);
+	return NULL;
+}
+
+/**
+ * iscsi_send_mgmt_task - Send task created with iscsi_alloc_mgmt_task.
+ * @task: iscsi task to send.
+ *
+ * On failure this returns a non-zero error code, and the driver must free
+ * the task with iscsi_put_task;
+ */
+static int iscsi_send_mgmt_task(struct iscsi_task *task)
+{
+	struct iscsi_conn *conn = task->conn;
+	struct iscsi_session *session = conn->session;
+	struct iscsi_host *ihost = shost_priv(conn->session->host);
+	int rc = 0;
 
 	if (!ihost->workq) {
-		if (iscsi_prep_mgmt_task(conn, task))
-			goto free_task;
+		rc = iscsi_prep_mgmt_task(conn, task);
+		if (rc)
+			return rc;
 
-		if (session->tt->xmit_task(task))
-			goto free_task;
+		rc = session->tt->xmit_task(task);
+		if (rc)
+			return rc;
 	} else {
 		list_add_tail(&task->running, &conn->mgmtqueue);
 		iscsi_conn_queue_xmit(conn);
 	}
 
-	return task;
+	return 0;
+}
 
-free_task:
-	/* regular RX path uses back_lock */
-	spin_lock(&session->back_lock);
-	__iscsi_put_task(task);
-	spin_unlock(&session->back_lock);
-	return NULL;
+static int __iscsi_conn_send_pdu(struct iscsi_conn *conn, struct iscsi_hdr *hdr,
+				 char *data, uint32_t data_size)
+{
+	struct iscsi_task *task;
+	int rc;
+
+	task = iscsi_alloc_mgmt_task(conn, hdr, data, data_size);
+	if (!task)
+		return -ENOMEM;
+
+	rc = iscsi_send_mgmt_task(task);
+	if (rc)
+		iscsi_put_task(task);
+	return rc;
 }
 
 int iscsi_conn_send_pdu(struct iscsi_cls_conn *cls_conn, struct iscsi_hdr *hdr,
@@ -813,7 +848,7 @@ int iscsi_conn_send_pdu(struct iscsi_cls_conn *cls_conn, struct iscsi_hdr *hdr,
 	int err = 0;
 
 	spin_lock_bh(&session->frwd_lock);
-	if (!__iscsi_conn_send_pdu(conn, hdr, data, data_size))
+	if (__iscsi_conn_send_pdu(conn, hdr, data, data_size))
 		err = -EPERM;
 	spin_unlock_bh(&session->frwd_lock);
 	return err;
@@ -986,7 +1021,6 @@ static int iscsi_send_nopout(struct iscsi_conn *conn, struct iscsi_nopin *rhdr)
 	if (!rhdr) {
 		if (READ_ONCE(conn->ping_task))
 			return -EINVAL;
-		WRITE_ONCE(conn->ping_task, INVALID_SCSI_TASK);
 	}
 
 	memset(&hdr, 0, sizeof(struct iscsi_nopout));
@@ -1000,10 +1034,18 @@ static int iscsi_send_nopout(struct iscsi_conn *conn, struct iscsi_nopin *rhdr)
 	} else
 		hdr.ttt = RESERVED_ITT;
 
-	task = __iscsi_conn_send_pdu(conn, (struct iscsi_hdr *)&hdr, NULL, 0);
-	if (!task) {
+	task = iscsi_alloc_mgmt_task(conn, (struct iscsi_hdr *)&hdr, NULL, 0);
+	if (!task)
+		return -ENOMEM;
+
+	if (!rhdr)
+		WRITE_ONCE(conn->ping_task, task);
+
+	if (iscsi_send_mgmt_task(task)) {
 		if (!rhdr)
 			WRITE_ONCE(conn->ping_task, NULL);
+		iscsi_put_task(task);
+
 		iscsi_conn_printk(KERN_ERR, conn, "Could not send nopout\n");
 		return -EIO;
 	} else if (!rhdr) {
@@ -1874,11 +1916,8 @@ static int iscsi_exec_task_mgmt_fn(struct iscsi_conn *conn,
 	__must_hold(&session->frwd_lock)
 {
 	struct iscsi_session *session = conn->session;
-	struct iscsi_task *task;
 
-	task = __iscsi_conn_send_pdu(conn, (struct iscsi_hdr *)hdr,
-				      NULL, 0);
-	if (!task) {
+	if (__iscsi_conn_send_pdu(conn, (struct iscsi_hdr *)hdr, NULL, 0)) {
 		spin_unlock_bh(&session->frwd_lock);
 		iscsi_conn_printk(KERN_ERR, conn, "Could not send TMF.\n");
 		iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED);
diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h
index ec15e820f62b..654cc3918c94 100644
--- a/include/scsi/libiscsi.h
+++ b/include/scsi/libiscsi.h
@@ -135,9 +135,6 @@ struct iscsi_task {
 	void			*dd_data;	/* driver/transport data */
 };
 
-/* invalid scsi_task pointer */
-#define	INVALID_SCSI_TASK	(struct iscsi_task *)-1l
-
 static inline int iscsi_task_has_unsol_data(struct iscsi_task *task)
 {
 	return task->unsol_r2t.data_length > task->unsol_r2t.sent;
-- 
cgit 


From cc06af0bbc21bcee40d57e3ee569d3a09741dafd Mon Sep 17 00:00:00 2001
From: Changyuan Lyu <changyuanl@google.com>
Date: Tue, 21 Jun 2022 18:11:25 +0000
Subject: scsi: trace: Print driver_tag and scheduler_tag in SCSI trace

Trace events like scsi_dispatch_cmd_start and scsi_dispatch_cmd_done are
useful for tracking a command throughout its lifetime. But for some ATA
passthrough commands, the information printed in current logs is not enough
to identify and match them. For example, if two threads send SMART cmd to
the same disk at the same time, their trace logs may look the same, which
makes it hard to match scsi_dispatch_cmd_done and scsi_dispatch_cmd_start.

Printing tags can help us solve the problem.  Further, if a command failed
for some reason and then is retried, its driver_tag will change. So
scheduler_tag is also included such that we can track the retries of a
command.

Link: https://lore.kernel.org/r/20220621181125.3211399-1-changyuanl@google.com
Reviewed-by: Vishakha Channapattan <vishakhavc@google.com>
Reviewed-by: Jolly Shah <jollys@google.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Changyuan Lyu <changyuanl@google.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/trace/events/scsi.h | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/scsi.h b/include/trace/events/scsi.h
index 370ade0d4093..a2c7befd451a 100644
--- a/include/trace/events/scsi.h
+++ b/include/trace/events/scsi.h
@@ -166,6 +166,8 @@ TRACE_EVENT(scsi_dispatch_cmd_start,
 		__field( unsigned int,	lun	)
 		__field( unsigned int,	opcode	)
 		__field( unsigned int,	cmd_len )
+		__field( int,	driver_tag)
+		__field( int,	scheduler_tag)
 		__field( unsigned int,	data_sglen )
 		__field( unsigned int,	prot_sglen )
 		__field( unsigned char,	prot_op )
@@ -179,6 +181,8 @@ TRACE_EVENT(scsi_dispatch_cmd_start,
 		__entry->lun		= cmd->device->lun;
 		__entry->opcode		= cmd->cmnd[0];
 		__entry->cmd_len	= cmd->cmd_len;
+		__entry->driver_tag	= scsi_cmd_to_rq(cmd)->tag;
+		__entry->scheduler_tag	= scsi_cmd_to_rq(cmd)->internal_tag;
 		__entry->data_sglen	= scsi_sg_count(cmd);
 		__entry->prot_sglen	= scsi_prot_sg_count(cmd);
 		__entry->prot_op	= scsi_get_prot_op(cmd);
@@ -186,11 +190,11 @@ TRACE_EVENT(scsi_dispatch_cmd_start,
 	),
 
 	TP_printk("host_no=%u channel=%u id=%u lun=%u data_sgl=%u prot_sgl=%u" \
-		  " prot_op=%s cmnd=(%s %s raw=%s)",
+		  " prot_op=%s driver_tag=%d scheduler_tag=%d cmnd=(%s %s raw=%s)",
 		  __entry->host_no, __entry->channel, __entry->id,
 		  __entry->lun, __entry->data_sglen, __entry->prot_sglen,
-		  show_prot_op_name(__entry->prot_op),
-		  show_opcode_name(__entry->opcode),
+		  show_prot_op_name(__entry->prot_op), __entry->driver_tag,
+		  __entry->scheduler_tag, show_opcode_name(__entry->opcode),
 		  __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len),
 		  __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len))
 );
@@ -209,6 +213,8 @@ TRACE_EVENT(scsi_dispatch_cmd_error,
 		__field( int,		rtn	)
 		__field( unsigned int,	opcode	)
 		__field( unsigned int,	cmd_len )
+		__field( int,	driver_tag)
+		__field( int,	scheduler_tag)
 		__field( unsigned int,	data_sglen )
 		__field( unsigned int,	prot_sglen )
 		__field( unsigned char,	prot_op )
@@ -223,6 +229,8 @@ TRACE_EVENT(scsi_dispatch_cmd_error,
 		__entry->rtn		= rtn;
 		__entry->opcode		= cmd->cmnd[0];
 		__entry->cmd_len	= cmd->cmd_len;
+		__entry->driver_tag	= scsi_cmd_to_rq(cmd)->tag;
+		__entry->scheduler_tag	= scsi_cmd_to_rq(cmd)->internal_tag;
 		__entry->data_sglen	= scsi_sg_count(cmd);
 		__entry->prot_sglen	= scsi_prot_sg_count(cmd);
 		__entry->prot_op	= scsi_get_prot_op(cmd);
@@ -230,11 +238,12 @@ TRACE_EVENT(scsi_dispatch_cmd_error,
 	),
 
 	TP_printk("host_no=%u channel=%u id=%u lun=%u data_sgl=%u prot_sgl=%u" \
-		  " prot_op=%s cmnd=(%s %s raw=%s) rtn=%d",
+		  " prot_op=%s driver_tag=%d scheduler_tag=%d cmnd=(%s %s raw=%s)" \
+		  " rtn=%d",
 		  __entry->host_no, __entry->channel, __entry->id,
 		  __entry->lun, __entry->data_sglen, __entry->prot_sglen,
-		  show_prot_op_name(__entry->prot_op),
-		  show_opcode_name(__entry->opcode),
+		  show_prot_op_name(__entry->prot_op), __entry->driver_tag,
+		  __entry->scheduler_tag, show_opcode_name(__entry->opcode),
 		  __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len),
 		  __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len),
 		  __entry->rtn)
@@ -254,6 +263,8 @@ DECLARE_EVENT_CLASS(scsi_cmd_done_timeout_template,
 		__field( int,		result	)
 		__field( unsigned int,	opcode	)
 		__field( unsigned int,	cmd_len )
+		__field( int,	driver_tag)
+		__field( int,	scheduler_tag)
 		__field( unsigned int,	data_sglen )
 		__field( unsigned int,	prot_sglen )
 		__field( unsigned char,	prot_op )
@@ -268,19 +279,21 @@ DECLARE_EVENT_CLASS(scsi_cmd_done_timeout_template,
 		__entry->result		= cmd->result;
 		__entry->opcode		= cmd->cmnd[0];
 		__entry->cmd_len	= cmd->cmd_len;
+		__entry->driver_tag	= scsi_cmd_to_rq(cmd)->tag;
+		__entry->scheduler_tag	= scsi_cmd_to_rq(cmd)->internal_tag;
 		__entry->data_sglen	= scsi_sg_count(cmd);
 		__entry->prot_sglen	= scsi_prot_sg_count(cmd);
 		__entry->prot_op	= scsi_get_prot_op(cmd);
 		memcpy(__get_dynamic_array(cmnd), cmd->cmnd, cmd->cmd_len);
 	),
 
-	TP_printk("host_no=%u channel=%u id=%u lun=%u data_sgl=%u " \
-		  "prot_sgl=%u prot_op=%s cmnd=(%s %s raw=%s) result=(driver=" \
-		  "%s host=%s message=%s status=%s)",
+	TP_printk("host_no=%u channel=%u id=%u lun=%u data_sgl=%u prot_sgl=%u " \
+		  "prot_op=%s driver_tag=%d scheduler_tag=%d cmnd=(%s %s raw=%s) " \
+		  "result=(driver=%s host=%s message=%s status=%s)",
 		  __entry->host_no, __entry->channel, __entry->id,
 		  __entry->lun, __entry->data_sglen, __entry->prot_sglen,
-		  show_prot_op_name(__entry->prot_op),
-		  show_opcode_name(__entry->opcode),
+		  show_prot_op_name(__entry->prot_op), __entry->driver_tag,
+		  __entry->scheduler_tag, show_opcode_name(__entry->opcode),
 		  __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len),
 		  __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len),
 		  "DRIVER_OK",
-- 
cgit 


From e244a46a529a9a4c43ae3a2b4bf613e260ec8f81 Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Tue, 14 Jun 2022 21:41:17 +0200
Subject: platform/surface: aggregator: Reserve more event- and
 target-categories

With the introduction of the Surface Laptop Studio, more event- and
target categories have been added. Therefore, increase the number of
reserved events and extend the enum of know target categories to
accommodate this.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Link: https://lore.kernel.org/r/20220614194117.4118897-1-luzmaximilian@gmail.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/platform/surface/aggregator/trace.h   | 80 +++++++++++++++------------
 include/linux/surface_aggregator/serial_hub.h | 75 +++++++++++++------------
 2 files changed, 85 insertions(+), 70 deletions(-)

(limited to 'include')

diff --git a/drivers/platform/surface/aggregator/trace.h b/drivers/platform/surface/aggregator/trace.h
index de64cf169060..cc9e73fbc18e 100644
--- a/drivers/platform/surface/aggregator/trace.h
+++ b/drivers/platform/surface/aggregator/trace.h
@@ -76,7 +76,7 @@ TRACE_DEFINE_ENUM(SSAM_SSH_TC_HID);
 TRACE_DEFINE_ENUM(SSAM_SSH_TC_TCH);
 TRACE_DEFINE_ENUM(SSAM_SSH_TC_BKL);
 TRACE_DEFINE_ENUM(SSAM_SSH_TC_TAM);
-TRACE_DEFINE_ENUM(SSAM_SSH_TC_ACC);
+TRACE_DEFINE_ENUM(SSAM_SSH_TC_ACC0);
 TRACE_DEFINE_ENUM(SSAM_SSH_TC_UFI);
 TRACE_DEFINE_ENUM(SSAM_SSH_TC_USC);
 TRACE_DEFINE_ENUM(SSAM_SSH_TC_PEN);
@@ -85,6 +85,11 @@ TRACE_DEFINE_ENUM(SSAM_SSH_TC_AUD);
 TRACE_DEFINE_ENUM(SSAM_SSH_TC_SMC);
 TRACE_DEFINE_ENUM(SSAM_SSH_TC_KPD);
 TRACE_DEFINE_ENUM(SSAM_SSH_TC_REG);
+TRACE_DEFINE_ENUM(SSAM_SSH_TC_SPT);
+TRACE_DEFINE_ENUM(SSAM_SSH_TC_SYS);
+TRACE_DEFINE_ENUM(SSAM_SSH_TC_ACC1);
+TRACE_DEFINE_ENUM(SSAM_SSH_TC_SHB);
+TRACE_DEFINE_ENUM(SSAM_SSH_TC_POS);
 
 #define SSAM_PTR_UID_LEN		9
 #define SSAM_U8_FIELD_NOT_APPLICABLE	((u16)-1)
@@ -229,40 +234,45 @@ static inline u32 ssam_trace_get_request_tc(const struct ssh_packet *p)
 
 #define ssam_show_ssh_tc(rqid)						\
 	__print_symbolic(rqid,						\
-		{ SSAM_SSH_TC_NOT_APPLICABLE,		"N/A" },	\
-		{ SSAM_SSH_TC_SAM,			"SAM" },	\
-		{ SSAM_SSH_TC_BAT,			"BAT" },	\
-		{ SSAM_SSH_TC_TMP,			"TMP" },	\
-		{ SSAM_SSH_TC_PMC,			"PMC" },	\
-		{ SSAM_SSH_TC_FAN,			"FAN" },	\
-		{ SSAM_SSH_TC_PoM,			"PoM" },	\
-		{ SSAM_SSH_TC_DBG,			"DBG" },	\
-		{ SSAM_SSH_TC_KBD,			"KBD" },	\
-		{ SSAM_SSH_TC_FWU,			"FWU" },	\
-		{ SSAM_SSH_TC_UNI,			"UNI" },	\
-		{ SSAM_SSH_TC_LPC,			"LPC" },	\
-		{ SSAM_SSH_TC_TCL,			"TCL" },	\
-		{ SSAM_SSH_TC_SFL,			"SFL" },	\
-		{ SSAM_SSH_TC_KIP,			"KIP" },	\
-		{ SSAM_SSH_TC_EXT,			"EXT" },	\
-		{ SSAM_SSH_TC_BLD,			"BLD" },	\
-		{ SSAM_SSH_TC_BAS,			"BAS" },	\
-		{ SSAM_SSH_TC_SEN,			"SEN" },	\
-		{ SSAM_SSH_TC_SRQ,			"SRQ" },	\
-		{ SSAM_SSH_TC_MCU,			"MCU" },	\
-		{ SSAM_SSH_TC_HID,			"HID" },	\
-		{ SSAM_SSH_TC_TCH,			"TCH" },	\
-		{ SSAM_SSH_TC_BKL,			"BKL" },	\
-		{ SSAM_SSH_TC_TAM,			"TAM" },	\
-		{ SSAM_SSH_TC_ACC,			"ACC" },	\
-		{ SSAM_SSH_TC_UFI,			"UFI" },	\
-		{ SSAM_SSH_TC_USC,			"USC" },	\
-		{ SSAM_SSH_TC_PEN,			"PEN" },	\
-		{ SSAM_SSH_TC_VID,			"VID" },	\
-		{ SSAM_SSH_TC_AUD,			"AUD" },	\
-		{ SSAM_SSH_TC_SMC,			"SMC" },	\
-		{ SSAM_SSH_TC_KPD,			"KPD" },	\
-		{ SSAM_SSH_TC_REG,			"REG" }		\
+		{ SSAM_SSH_TC_NOT_APPLICABLE,		"N/A"  },	\
+		{ SSAM_SSH_TC_SAM,			"SAM"  },	\
+		{ SSAM_SSH_TC_BAT,			"BAT"  },	\
+		{ SSAM_SSH_TC_TMP,			"TMP"  },	\
+		{ SSAM_SSH_TC_PMC,			"PMC"  },	\
+		{ SSAM_SSH_TC_FAN,			"FAN"  },	\
+		{ SSAM_SSH_TC_PoM,			"PoM"  },	\
+		{ SSAM_SSH_TC_DBG,			"DBG"  },	\
+		{ SSAM_SSH_TC_KBD,			"KBD"  },	\
+		{ SSAM_SSH_TC_FWU,			"FWU"  },	\
+		{ SSAM_SSH_TC_UNI,			"UNI"  },	\
+		{ SSAM_SSH_TC_LPC,			"LPC"  },	\
+		{ SSAM_SSH_TC_TCL,			"TCL"  },	\
+		{ SSAM_SSH_TC_SFL,			"SFL"  },	\
+		{ SSAM_SSH_TC_KIP,			"KIP"  },	\
+		{ SSAM_SSH_TC_EXT,			"EXT"  },	\
+		{ SSAM_SSH_TC_BLD,			"BLD"  },	\
+		{ SSAM_SSH_TC_BAS,			"BAS"  },	\
+		{ SSAM_SSH_TC_SEN,			"SEN"  },	\
+		{ SSAM_SSH_TC_SRQ,			"SRQ"  },	\
+		{ SSAM_SSH_TC_MCU,			"MCU"  },	\
+		{ SSAM_SSH_TC_HID,			"HID"  },	\
+		{ SSAM_SSH_TC_TCH,			"TCH"  },	\
+		{ SSAM_SSH_TC_BKL,			"BKL"  },	\
+		{ SSAM_SSH_TC_TAM,			"TAM"  },	\
+		{ SSAM_SSH_TC_ACC0,			"ACC0" },	\
+		{ SSAM_SSH_TC_UFI,			"UFI"  },	\
+		{ SSAM_SSH_TC_USC,			"USC"  },	\
+		{ SSAM_SSH_TC_PEN,			"PEN"  },	\
+		{ SSAM_SSH_TC_VID,			"VID"  },	\
+		{ SSAM_SSH_TC_AUD,			"AUD"  },	\
+		{ SSAM_SSH_TC_SMC,			"SMC"  },	\
+		{ SSAM_SSH_TC_KPD,			"KPD"  },	\
+		{ SSAM_SSH_TC_REG,			"REG"  },	\
+		{ SSAM_SSH_TC_SPT,			"SPT"  },	\
+		{ SSAM_SSH_TC_SYS,			"SYS"  },	\
+		{ SSAM_SSH_TC_ACC1,			"ACC1" },	\
+		{ SSAM_SSH_TC_SHB,			"SMB"  },	\
+		{ SSAM_SSH_TC_POS,			"POS"  }	\
 	)
 
 DECLARE_EVENT_CLASS(ssam_frame_class,
diff --git a/include/linux/surface_aggregator/serial_hub.h b/include/linux/surface_aggregator/serial_hub.h
index 26b95ec12733..45501b6e54e8 100644
--- a/include/linux/surface_aggregator/serial_hub.h
+++ b/include/linux/surface_aggregator/serial_hub.h
@@ -201,7 +201,7 @@ static inline u16 ssh_crc(const u8 *buf, size_t len)
  * exception of zero, which is not an event ID. Thus, this is also the
  * absolute maximum number of event handlers that can be registered.
  */
-#define SSH_NUM_EVENTS		34
+#define SSH_NUM_EVENTS		38
 
 /*
  * SSH_NUM_TARGETS - The number of communication targets used in the protocol.
@@ -292,40 +292,45 @@ struct ssam_span {
  * Windows driver.
  */
 enum ssam_ssh_tc {
-				/* Category 0x00 is invalid for EC use. */
-	SSAM_SSH_TC_SAM = 0x01,	/* Generic system functionality, real-time clock. */
-	SSAM_SSH_TC_BAT = 0x02,	/* Battery/power subsystem. */
-	SSAM_SSH_TC_TMP = 0x03,	/* Thermal subsystem. */
-	SSAM_SSH_TC_PMC = 0x04,
-	SSAM_SSH_TC_FAN = 0x05,
-	SSAM_SSH_TC_PoM = 0x06,
-	SSAM_SSH_TC_DBG = 0x07,
-	SSAM_SSH_TC_KBD = 0x08,	/* Legacy keyboard (Laptop 1/2). */
-	SSAM_SSH_TC_FWU = 0x09,
-	SSAM_SSH_TC_UNI = 0x0a,
-	SSAM_SSH_TC_LPC = 0x0b,
-	SSAM_SSH_TC_TCL = 0x0c,
-	SSAM_SSH_TC_SFL = 0x0d,
-	SSAM_SSH_TC_KIP = 0x0e,	/* Manages detachable peripherals (Pro X/8 keyboard cover) */
-	SSAM_SSH_TC_EXT = 0x0f,
-	SSAM_SSH_TC_BLD = 0x10,
-	SSAM_SSH_TC_BAS = 0x11,	/* Detachment system (Surface Book 2/3). */
-	SSAM_SSH_TC_SEN = 0x12,
-	SSAM_SSH_TC_SRQ = 0x13,
-	SSAM_SSH_TC_MCU = 0x14,
-	SSAM_SSH_TC_HID = 0x15,	/* Generic HID input subsystem. */
-	SSAM_SSH_TC_TCH = 0x16,
-	SSAM_SSH_TC_BKL = 0x17,
-	SSAM_SSH_TC_TAM = 0x18,
-	SSAM_SSH_TC_ACC = 0x19,
-	SSAM_SSH_TC_UFI = 0x1a,
-	SSAM_SSH_TC_USC = 0x1b,
-	SSAM_SSH_TC_PEN = 0x1c,
-	SSAM_SSH_TC_VID = 0x1d,
-	SSAM_SSH_TC_AUD = 0x1e,
-	SSAM_SSH_TC_SMC = 0x1f,
-	SSAM_SSH_TC_KPD = 0x20,
-	SSAM_SSH_TC_REG = 0x21,	/* Extended event registry. */
+				  /* Category 0x00 is invalid for EC use. */
+	SSAM_SSH_TC_SAM  = 0x01,  /* Generic system functionality, real-time clock. */
+	SSAM_SSH_TC_BAT  = 0x02,  /* Battery/power subsystem. */
+	SSAM_SSH_TC_TMP  = 0x03,  /* Thermal subsystem. */
+	SSAM_SSH_TC_PMC  = 0x04,
+	SSAM_SSH_TC_FAN  = 0x05,
+	SSAM_SSH_TC_PoM  = 0x06,
+	SSAM_SSH_TC_DBG  = 0x07,
+	SSAM_SSH_TC_KBD  = 0x08,  /* Legacy keyboard (Laptop 1/2). */
+	SSAM_SSH_TC_FWU  = 0x09,
+	SSAM_SSH_TC_UNI  = 0x0a,
+	SSAM_SSH_TC_LPC  = 0x0b,
+	SSAM_SSH_TC_TCL  = 0x0c,
+	SSAM_SSH_TC_SFL  = 0x0d,
+	SSAM_SSH_TC_KIP  = 0x0e,  /* Manages detachable peripherals (Pro X/8 keyboard cover) */
+	SSAM_SSH_TC_EXT  = 0x0f,
+	SSAM_SSH_TC_BLD  = 0x10,
+	SSAM_SSH_TC_BAS  = 0x11,  /* Detachment system (Surface Book 2/3). */
+	SSAM_SSH_TC_SEN  = 0x12,
+	SSAM_SSH_TC_SRQ  = 0x13,
+	SSAM_SSH_TC_MCU  = 0x14,
+	SSAM_SSH_TC_HID  = 0x15,  /* Generic HID input subsystem. */
+	SSAM_SSH_TC_TCH  = 0x16,
+	SSAM_SSH_TC_BKL  = 0x17,
+	SSAM_SSH_TC_TAM  = 0x18,
+	SSAM_SSH_TC_ACC0 = 0x19,
+	SSAM_SSH_TC_UFI  = 0x1a,
+	SSAM_SSH_TC_USC  = 0x1b,
+	SSAM_SSH_TC_PEN  = 0x1c,
+	SSAM_SSH_TC_VID  = 0x1d,
+	SSAM_SSH_TC_AUD  = 0x1e,
+	SSAM_SSH_TC_SMC  = 0x1f,
+	SSAM_SSH_TC_KPD  = 0x20,
+	SSAM_SSH_TC_REG  = 0x21,  /* Extended event registry. */
+	SSAM_SSH_TC_SPT  = 0x22,
+	SSAM_SSH_TC_SYS  = 0x23,
+	SSAM_SSH_TC_ACC1 = 0x24,
+	SSAM_SSH_TC_SHB  = 0x25,
+	SSAM_SSH_TC_POS  = 0x26,  /* For obtaining Laptop Studio screen position. */
 };
 
 
-- 
cgit 


From 1a3c7d0841ae24d28a15bbf87ee7d08614cec957 Mon Sep 17 00:00:00 2001
From: Dongli Zhang <dongli.zhang@oracle.com>
Date: Sat, 11 Jun 2022 01:25:11 -0700
Subject: swiotlb: remove the unused swiotlb_force declaration

The 'swiotlb_force' is removed since commit c6af2aa9ffc9 ("swiotlb: make
the swiotlb_init interface more useful").

Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/swiotlb.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 7ed35dd3de6e..bdc58a0e20b1 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -60,7 +60,6 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys,
 		size_t size, enum dma_data_direction dir, unsigned long attrs);
 
 #ifdef CONFIG_SWIOTLB
-extern enum swiotlb_force swiotlb_force;
 
 /**
  * struct io_tlb_mem - IO TLB Memory Pool Descriptor
-- 
cgit 


From f302d180c6d430ea99643b9b2b3407aedaa36703 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Tue, 21 Jun 2022 10:19:09 -0700
Subject: af_unix: Include the whole hash table size in UNIX_HASH_SIZE.

Currently, the size of AF_UNIX hash table is UNIX_HASH_SIZE * 2,
the first half for bind()ed sockets and the second half for unbound
ones.  UNIX_HASH_SIZE * 2 is used to define the table and iterate
over it.

In some places, we use ARRAY_SIZE(unix_socket_table) instead of
UNIX_HASH_SIZE * 2.  However, we cannot use it anymore because we
will allocate the hash table dynamically.  Then, we would have to
add UNIX_HASH_SIZE * 2 in many places, which would be troublesome.

This patch adapts the UNIX_HASH_SIZE definition to include bound
and unbound sockets and defines a new UNIX_HASH_MOD macro to ease
calculations.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/af_unix.h |  7 ++++---
 net/unix/af_unix.c    | 18 +++++++++---------
 net/unix/diag.c       |  6 ++----
 3 files changed, 15 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index a7ef624ed726..acb56e463db1 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -16,12 +16,13 @@ void wait_for_unix_gc(void);
 struct sock *unix_get_socket(struct file *filp);
 struct sock *unix_peer_get(struct sock *sk);
 
-#define UNIX_HASH_SIZE	256
+#define UNIX_HASH_MOD	(256 - 1)
+#define UNIX_HASH_SIZE	(256 * 2)
 #define UNIX_HASH_BITS	8
 
 extern unsigned int unix_tot_inflight;
-extern spinlock_t unix_table_locks[2 * UNIX_HASH_SIZE];
-extern struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
+extern spinlock_t unix_table_locks[UNIX_HASH_SIZE];
+extern struct hlist_head unix_socket_table[UNIX_HASH_SIZE];
 
 struct unix_address {
 	refcount_t	refcnt;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 990257f02e7c..c0804ae9c96a 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -118,9 +118,9 @@
 
 #include "scm.h"
 
-spinlock_t unix_table_locks[2 * UNIX_HASH_SIZE];
+spinlock_t unix_table_locks[UNIX_HASH_SIZE];
 EXPORT_SYMBOL_GPL(unix_table_locks);
-struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
+struct hlist_head unix_socket_table[UNIX_HASH_SIZE];
 EXPORT_SYMBOL_GPL(unix_socket_table);
 static atomic_long_t unix_nr_socks;
 
@@ -137,12 +137,12 @@ static unsigned int unix_unbound_hash(struct sock *sk)
 	hash ^= hash >> 8;
 	hash ^= sk->sk_type;
 
-	return UNIX_HASH_SIZE + (hash & (UNIX_HASH_SIZE - 1));
+	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
 }
 
 static unsigned int unix_bsd_hash(struct inode *i)
 {
-	return i->i_ino & (UNIX_HASH_SIZE - 1);
+	return i->i_ino & UNIX_HASH_MOD;
 }
 
 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
@@ -155,14 +155,14 @@ static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
 	hash ^= hash >> 8;
 	hash ^= type;
 
-	return hash & (UNIX_HASH_SIZE - 1);
+	return hash & UNIX_HASH_MOD;
 }
 
 static void unix_table_double_lock(unsigned int hash1, unsigned int hash2)
 {
 	/* hash1 and hash2 is never the same because
-	 * one is between 0 and UNIX_HASH_SIZE - 1, and
-	 * another is between UNIX_HASH_SIZE and UNIX_HASH_SIZE * 2.
+	 * one is between 0 and UNIX_HASH_MOD, and
+	 * another is between UNIX_HASH_MOD + 1 and UNIX_HASH_SIZE - 1.
 	 */
 	if (hash1 > hash2)
 		swap(hash1, hash2);
@@ -3239,7 +3239,7 @@ static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
 	unsigned long bucket = get_bucket(*pos);
 	struct sock *sk;
 
-	while (bucket < ARRAY_SIZE(unix_socket_table)) {
+	while (bucket < UNIX_HASH_SIZE) {
 		spin_lock(&unix_table_locks[bucket]);
 
 		sk = unix_from_bucket(seq, pos);
@@ -3666,7 +3666,7 @@ static int __init af_unix_init(void)
 
 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
 
-	for (i = 0; i < 2 * UNIX_HASH_SIZE; i++)
+	for (i = 0; i < UNIX_HASH_SIZE; i++)
 		spin_lock_init(&unix_table_locks[i]);
 
 	rc = proto_register(&unix_dgram_proto, 1);
diff --git a/net/unix/diag.c b/net/unix/diag.c
index 4e3dc8179fa4..c5d1cca72aa5 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -204,9 +204,7 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	s_slot = cb->args[0];
 	num = s_num = cb->args[1];
 
-	for (slot = s_slot;
-	     slot < ARRAY_SIZE(unix_socket_table);
-	     s_num = 0, slot++) {
+	for (slot = s_slot; slot < UNIX_HASH_SIZE; s_num = 0, slot++) {
 		struct sock *sk;
 
 		num = 0;
@@ -242,7 +240,7 @@ static struct sock *unix_lookup_by_ino(unsigned int ino)
 	struct sock *sk;
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(unix_socket_table); i++) {
+	for (i = 0; i < UNIX_HASH_SIZE; i++) {
 		spin_lock(&unix_table_locks[i]);
 		sk_for_each(sk, &unix_socket_table[i])
 			if (ino == sock_i_ino(sk)) {
-- 
cgit 


From b6e811383062f88212082714db849127fa95142c Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Tue, 21 Jun 2022 10:19:10 -0700
Subject: af_unix: Define a per-netns hash table.

This commit adds a per netns hash table for AF_UNIX, which size is fixed
as UNIX_HASH_SIZE for now.

The first implementation defines a per-netns hash table as a single array
of lock and list:

	struct unix_hashbucket {
		spinlock_t		lock;
		struct hlist_head	head;
	};

	struct netns_unix {
		struct unix_hashbucket	*hash;
		...
	};

But, Eric pointed out memory cost that the structure has holes because of
sizeof(spinlock_t), which is 4 (or more if LOCKDEP is enabled). [0]  It
could be expensive on a host with thousands of netns and few AF_UNIX
sockets.  For this reason, a per-netns hash table uses two dense arrays.

	struct unix_table {
		spinlock_t		*locks;
		struct hlist_head	*buckets;
	};

	struct netns_unix {
		struct unix_table	table;
		...
	};

Note the length of the list has a significant impact rather than lock
contention, so having shared locks can be an option.  But, per-netns
locks and lists still perform better than the global locks and per-netns
lists. [1]

Also, this patch adds a change so that struct netns_unix disappears from
struct net if CONFIG_UNIX is disabled.

[0]: https://lore.kernel.org/netdev/CANn89iLVxO5aqx16azNU7p7Z-nz5NrnM5QTqOzueVxEnkVTxyg@mail.gmail.com/
[1]: https://lore.kernel.org/netdev/20220617175215.1769-1-kuniyu@amazon.com/

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/net_namespace.h |  2 ++
 include/net/netns/unix.h    |  6 ++++++
 net/unix/af_unix.c          | 38 ++++++++++++++++++++++++++++++++------
 3 files changed, 40 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index c4f5601f6e32..20a2992901c2 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -120,7 +120,9 @@ struct net {
 	struct netns_core	core;
 	struct netns_mib	mib;
 	struct netns_packet	packet;
+#if IS_ENABLED(CONFIG_UNIX)
 	struct netns_unix	unx;
+#endif
 	struct netns_nexthop	nexthop;
 	struct netns_ipv4	ipv4;
 #if IS_ENABLED(CONFIG_IPV6)
diff --git a/include/net/netns/unix.h b/include/net/netns/unix.h
index 91a3d7e39198..6f1a33df061d 100644
--- a/include/net/netns/unix.h
+++ b/include/net/netns/unix.h
@@ -5,8 +5,14 @@
 #ifndef __NETNS_UNIX_H__
 #define __NETNS_UNIX_H__
 
+struct unix_table {
+	spinlock_t		*locks;
+	struct hlist_head	*buckets;
+};
+
 struct ctl_table_header;
 struct netns_unix {
+	struct unix_table	table;
 	int			sysctl_max_dgram_qlen;
 	struct ctl_table_header	*ctl;
 };
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index c0804ae9c96a..cdd12881a39d 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -3559,7 +3559,7 @@ static const struct net_proto_family unix_family_ops = {
 
 static int __net_init unix_net_init(struct net *net)
 {
-	int error = -ENOMEM;
+	int i;
 
 	net->unx.sysctl_max_dgram_qlen = 10;
 	if (unix_sysctl_register(net))
@@ -3567,18 +3567,44 @@ static int __net_init unix_net_init(struct net *net)
 
 #ifdef CONFIG_PROC_FS
 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
-			sizeof(struct seq_net_private))) {
-		unix_sysctl_unregister(net);
-		goto out;
+			     sizeof(struct seq_net_private)))
+		goto err_sysctl;
+#endif
+
+	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
+					      sizeof(spinlock_t), GFP_KERNEL);
+	if (!net->unx.table.locks)
+		goto err_proc;
+
+	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
+						sizeof(struct hlist_head),
+						GFP_KERNEL);
+	if (!net->unx.table.buckets)
+		goto free_locks;
+
+	for (i = 0; i < UNIX_HASH_SIZE; i++) {
+		spin_lock_init(&net->unx.table.locks[i]);
+		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
 	}
+
+	return 0;
+
+free_locks:
+	kvfree(net->unx.table.locks);
+err_proc:
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry("unix", net->proc_net);
+err_sysctl:
 #endif
-	error = 0;
+	unix_sysctl_unregister(net);
 out:
-	return error;
+	return -ENOMEM;
 }
 
 static void __net_exit unix_net_exit(struct net *net)
 {
+	kvfree(net->unx.table.buckets);
+	kvfree(net->unx.table.locks);
 	unix_sysctl_unregister(net);
 	remove_proc_entry("unix", net->proc_net);
 }
-- 
cgit 


From cf2f225e2653734e66e91c09e1cbe004bfd3d4a7 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Tue, 21 Jun 2022 10:19:12 -0700
Subject: af_unix: Put a socket into a per-netns hash table.

This commit replaces the global hash table with a per-netns one and removes
the global one.

We now link a socket in each netns's hash table so we can save some netns
comparisons when iterating through a hash bucket.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/af_unix.h |  1 -
 net/unix/af_unix.c    | 50 ++++++++++++++++++++------------------------------
 net/unix/diag.c       |  9 +++------
 3 files changed, 23 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index acb56e463db1..b1748c9b6db2 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -22,7 +22,6 @@ struct sock *unix_peer_get(struct sock *sk);
 
 extern unsigned int unix_tot_inflight;
 extern spinlock_t unix_table_locks[UNIX_HASH_SIZE];
-extern struct hlist_head unix_socket_table[UNIX_HASH_SIZE];
 
 struct unix_address {
 	refcount_t	refcnt;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 79f8fc5cdce8..9d0b07235dbc 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -120,8 +120,6 @@
 
 spinlock_t unix_table_locks[UNIX_HASH_SIZE];
 EXPORT_SYMBOL_GPL(unix_table_locks);
-struct hlist_head unix_socket_table[UNIX_HASH_SIZE];
-EXPORT_SYMBOL_GPL(unix_socket_table);
 static atomic_long_t unix_nr_socks;
 
 /* SMP locking strategy:
@@ -308,20 +306,20 @@ static void __unix_remove_socket(struct sock *sk)
 	sk_del_node_init(sk);
 }
 
-static void __unix_insert_socket(struct sock *sk)
+static void __unix_insert_socket(struct net *net, struct sock *sk)
 {
 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
-	sk_add_node(sk, &unix_socket_table[sk->sk_hash]);
+	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
 }
 
-static void __unix_set_addr_hash(struct sock *sk, struct unix_address *addr,
-				 unsigned int hash)
+static void __unix_set_addr_hash(struct net *net, struct sock *sk,
+				 struct unix_address *addr, unsigned int hash)
 {
 	__unix_remove_socket(sk);
 	smp_store_release(&unix_sk(sk)->addr, addr);
 
 	sk->sk_hash = hash;
-	__unix_insert_socket(sk);
+	__unix_insert_socket(net, sk);
 }
 
 static void unix_remove_socket(struct net *net, struct sock *sk)
@@ -337,7 +335,7 @@ static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
 {
 	spin_lock(&unix_table_locks[sk->sk_hash]);
 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
-	__unix_insert_socket(sk);
+	__unix_insert_socket(net, sk);
 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
 	spin_unlock(&unix_table_locks[sk->sk_hash]);
 }
@@ -348,12 +346,9 @@ static struct sock *__unix_find_socket_byname(struct net *net,
 {
 	struct sock *s;
 
-	sk_for_each(s, &unix_socket_table[hash]) {
+	sk_for_each(s, &net->unx.table.buckets[hash]) {
 		struct unix_sock *u = unix_sk(s);
 
-		if (!net_eq(sock_net(s), net))
-			continue;
-
 		if (u->addr->len == len &&
 		    !memcmp(u->addr->name, sunname, len))
 			return s;
@@ -384,7 +379,7 @@ static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i)
 
 	spin_lock(&unix_table_locks[hash]);
 	spin_lock(&net->unx.table.locks[hash]);
-	sk_for_each(s, &unix_socket_table[hash]) {
+	sk_for_each(s, &net->unx.table.buckets[hash]) {
 		struct dentry *dentry = unix_sk(s)->path.dentry;
 
 		if (dentry && d_backing_inode(dentry) == i) {
@@ -1140,7 +1135,7 @@ retry:
 		goto retry;
 	}
 
-	__unix_set_addr_hash(sk, addr, new_hash);
+	__unix_set_addr_hash(net, sk, addr, new_hash);
 	unix_table_double_unlock(net, old_hash, new_hash);
 	err = 0;
 
@@ -1199,7 +1194,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
 	unix_table_double_lock(net, old_hash, new_hash);
 	u->path.mnt = mntget(parent.mnt);
 	u->path.dentry = dget(dentry);
-	__unix_set_addr_hash(sk, addr, new_hash);
+	__unix_set_addr_hash(net, sk, addr, new_hash);
 	unix_table_double_unlock(net, old_hash, new_hash);
 	mutex_unlock(&u->bindlock);
 	done_path_create(&parent, dentry);
@@ -1246,7 +1241,7 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
 		goto out_spin;
 
-	__unix_set_addr_hash(sk, addr, new_hash);
+	__unix_set_addr_hash(net, sk, addr, new_hash);
 	unix_table_double_unlock(net, old_hash, new_hash);
 	mutex_unlock(&u->bindlock);
 	return 0;
@@ -3239,12 +3234,11 @@ static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
 {
 	unsigned long offset = get_offset(*pos);
 	unsigned long bucket = get_bucket(*pos);
-	struct sock *sk;
 	unsigned long count = 0;
+	struct sock *sk;
 
-	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
-		if (sock_net(sk) != seq_file_net(seq))
-			continue;
+	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
+	     sk; sk = sk_next(sk)) {
 		if (++count == offset)
 			break;
 	}
@@ -3279,13 +3273,13 @@ static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
 				  loff_t *pos)
 {
 	unsigned long bucket = get_bucket(*pos);
-	struct net *net = seq_file_net(seq);
 
-	for (sk = sk_next(sk); sk; sk = sk_next(sk))
-		if (sock_net(sk) == net)
-			return sk;
+	sk = sk_next(sk);
+	if (sk)
+		return sk;
+
 
-	spin_unlock(&net->unx.table.locks[bucket]);
+	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
 	spin_unlock(&unix_table_locks[bucket]);
 
 	*pos = set_bucket_offset(++bucket, 1);
@@ -3406,7 +3400,6 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
 
 {
 	struct bpf_unix_iter_state *iter = seq->private;
-	struct net *net = seq_file_net(seq);
 	unsigned int expected = 1;
 	struct sock *sk;
 
@@ -3414,9 +3407,6 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
 	iter->batch[iter->end_sk++] = start_sk;
 
 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
-		if (sock_net(sk) != net)
-			continue;
-
 		if (iter->end_sk < iter->max_sk) {
 			sock_hold(sk);
 			iter->batch[iter->end_sk++] = sk;
@@ -3425,7 +3415,7 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
 		expected++;
 	}
 
-	spin_unlock(&net->unx.table.locks[start_sk->sk_hash]);
+	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
 	spin_unlock(&unix_table_locks[start_sk->sk_hash]);
 
 	return expected;
diff --git a/net/unix/diag.c b/net/unix/diag.c
index 7fc377435114..4d0f0ca6a1eb 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -210,9 +210,7 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 		num = 0;
 		spin_lock(&unix_table_locks[slot]);
 		spin_lock(&net->unx.table.locks[slot]);
-		sk_for_each(sk, &unix_socket_table[slot]) {
-			if (!net_eq(sock_net(sk), net))
-				continue;
+		sk_for_each(sk, &net->unx.table.buckets[slot]) {
 			if (num < s_num)
 				goto next;
 			if (!(req->udiag_states & (1 << sk->sk_state)))
@@ -246,13 +244,14 @@ static struct sock *unix_lookup_by_ino(struct net *net, unsigned int ino)
 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
 		spin_lock(&unix_table_locks[i]);
 		spin_lock(&net->unx.table.locks[i]);
-		sk_for_each(sk, &unix_socket_table[i])
+		sk_for_each(sk, &net->unx.table.buckets[i]) {
 			if (ino == sock_i_ino(sk)) {
 				sock_hold(sk);
 				spin_unlock(&net->unx.table.locks[i]);
 				spin_unlock(&unix_table_locks[i]);
 				return sk;
 			}
+		}
 		spin_unlock(&net->unx.table.locks[i]);
 		spin_unlock(&unix_table_locks[i]);
 	}
@@ -277,8 +276,6 @@ static int unix_diag_get_exact(struct sk_buff *in_skb,
 	err = -ENOENT;
 	if (sk == NULL)
 		goto out_nosk;
-	if (!net_eq(sock_net(sk), net))
-		goto out;
 
 	err = sock_diag_check_cookie(sk, req->udiag_cookie);
 	if (err)
-- 
cgit 


From 2f7ca90a0188b57a54d3b1159eb7874427a7e07a Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Tue, 21 Jun 2022 10:19:13 -0700
Subject: af_unix: Remove unix_table_locks.

unix_table_locks are to protect the global hash table, unix_socket_table.
The previous commit removed it, so let's clean up the unnecessary locks.

Here is a test result on EC2 c5.9xlarge where 10 processes run concurrently
in different netns and bind 100,000 sockets for each.

  without this series : 1m 38s
  with this series    :    11s

It is ~10x faster because the global hash table is split into 10 netns in
this case.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/af_unix.h |  1 -
 net/unix/af_unix.c    | 42 ++++++++----------------------------------
 net/unix/diag.c       |  8 +-------
 3 files changed, 9 insertions(+), 42 deletions(-)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index b1748c9b6db2..480fa579787e 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -21,7 +21,6 @@ struct sock *unix_peer_get(struct sock *sk);
 #define UNIX_HASH_BITS	8
 
 extern unsigned int unix_tot_inflight;
-extern spinlock_t unix_table_locks[UNIX_HASH_SIZE];
 
 struct unix_address {
 	refcount_t	refcnt;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 9d0b07235dbc..49f6626330c3 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -118,13 +118,11 @@
 
 #include "scm.h"
 
-spinlock_t unix_table_locks[UNIX_HASH_SIZE];
-EXPORT_SYMBOL_GPL(unix_table_locks);
 static atomic_long_t unix_nr_socks;
 
 /* SMP locking strategy:
- *    hash table is protected with spinlock unix_table_locks
- *    each socket state is protected by separate spin lock.
+ *    hash table is protected with spinlock.
+ *    each socket state is protected by separate spinlock.
  */
 
 static unsigned int unix_unbound_hash(struct sock *sk)
@@ -166,9 +164,6 @@ static void unix_table_double_lock(struct net *net,
 	if (hash1 > hash2)
 		swap(hash1, hash2);
 
-	spin_lock(&unix_table_locks[hash1]);
-	spin_lock_nested(&unix_table_locks[hash2], SINGLE_DEPTH_NESTING);
-
 	spin_lock(&net->unx.table.locks[hash1]);
 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
 }
@@ -178,9 +173,6 @@ static void unix_table_double_unlock(struct net *net,
 {
 	spin_unlock(&net->unx.table.locks[hash1]);
 	spin_unlock(&net->unx.table.locks[hash2]);
-
-	spin_unlock(&unix_table_locks[hash1]);
-	spin_unlock(&unix_table_locks[hash2]);
 }
 
 #ifdef CONFIG_SECURITY_NETWORK
@@ -324,20 +316,16 @@ static void __unix_set_addr_hash(struct net *net, struct sock *sk,
 
 static void unix_remove_socket(struct net *net, struct sock *sk)
 {
-	spin_lock(&unix_table_locks[sk->sk_hash]);
 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
 	__unix_remove_socket(sk);
 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
-	spin_unlock(&unix_table_locks[sk->sk_hash]);
 }
 
 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
 {
-	spin_lock(&unix_table_locks[sk->sk_hash]);
 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
 	__unix_insert_socket(net, sk);
 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
-	spin_unlock(&unix_table_locks[sk->sk_hash]);
 }
 
 static struct sock *__unix_find_socket_byname(struct net *net,
@@ -362,13 +350,11 @@ static inline struct sock *unix_find_socket_byname(struct net *net,
 {
 	struct sock *s;
 
-	spin_lock(&unix_table_locks[hash]);
 	spin_lock(&net->unx.table.locks[hash]);
 	s = __unix_find_socket_byname(net, sunname, len, hash);
 	if (s)
 		sock_hold(s);
 	spin_unlock(&net->unx.table.locks[hash]);
-	spin_unlock(&unix_table_locks[hash]);
 	return s;
 }
 
@@ -377,7 +363,6 @@ static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i)
 	unsigned int hash = unix_bsd_hash(i);
 	struct sock *s;
 
-	spin_lock(&unix_table_locks[hash]);
 	spin_lock(&net->unx.table.locks[hash]);
 	sk_for_each(s, &net->unx.table.buckets[hash]) {
 		struct dentry *dentry = unix_sk(s)->path.dentry;
@@ -385,12 +370,10 @@ static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i)
 		if (dentry && d_backing_inode(dentry) == i) {
 			sock_hold(s);
 			spin_unlock(&net->unx.table.locks[hash]);
-			spin_unlock(&unix_table_locks[hash]);
 			return s;
 		}
 	}
 	spin_unlock(&net->unx.table.locks[hash]);
-	spin_unlock(&unix_table_locks[hash]);
 	return NULL;
 }
 
@@ -1551,9 +1534,9 @@ restart:
 	 *
 	 * The contents of *(otheru->addr) and otheru->path
 	 * are seen fully set up here, since we have found
-	 * otheru in hash under unix_table_locks.  Insertion
-	 * into the hash chain we'd found it in had been done
-	 * in an earlier critical area protected by unix_table_locks,
+	 * otheru in hash under its lock.  Insertion into the
+	 * hash chain we'd found it in had been done in an
+	 * earlier critical area protected by the chain's lock,
 	 * the same one where we'd set *(otheru->addr) contents,
 	 * as well as otheru->path and otheru->addr itself.
 	 *
@@ -3253,7 +3236,6 @@ static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
 	struct sock *sk;
 
 	while (bucket < UNIX_HASH_SIZE) {
-		spin_lock(&unix_table_locks[bucket]);
 		spin_lock(&net->unx.table.locks[bucket]);
 
 		sk = unix_from_bucket(seq, pos);
@@ -3261,7 +3243,6 @@ static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
 			return sk;
 
 		spin_unlock(&net->unx.table.locks[bucket]);
-		spin_unlock(&unix_table_locks[bucket]);
 
 		*pos = set_bucket_offset(++bucket, 1);
 	}
@@ -3280,7 +3261,6 @@ static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
 
 
 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
-	spin_unlock(&unix_table_locks[bucket]);
 
 	*pos = set_bucket_offset(++bucket, 1);
 
@@ -3309,10 +3289,8 @@ static void unix_seq_stop(struct seq_file *seq, void *v)
 {
 	struct sock *sk = v;
 
-	if (sk) {
+	if (sk)
 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
-		spin_unlock(&unix_table_locks[sk->sk_hash]);
-	}
 }
 
 static int unix_seq_show(struct seq_file *seq, void *v)
@@ -3337,7 +3315,7 @@ static int unix_seq_show(struct seq_file *seq, void *v)
 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
 			sock_i_ino(s));
 
-		if (u->addr) {	// under unix_table_locks here
+		if (u->addr) {	// under a hash table lock here
 			int i, len;
 			seq_putc(seq, ' ');
 
@@ -3416,7 +3394,6 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
 	}
 
 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
-	spin_unlock(&unix_table_locks[start_sk->sk_hash]);
 
 	return expected;
 }
@@ -3705,13 +3682,10 @@ static void __init bpf_iter_register(void)
 
 static int __init af_unix_init(void)
 {
-	int i, rc = -1;
+	int rc = -1;
 
 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
 
-	for (i = 0; i < UNIX_HASH_SIZE; i++)
-		spin_lock_init(&unix_table_locks[i]);
-
 	rc = proto_register(&unix_dgram_proto, 1);
 	if (rc != 0) {
 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
diff --git a/net/unix/diag.c b/net/unix/diag.c
index 4d0f0ca6a1eb..105f522a89fe 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -13,7 +13,7 @@
 
 static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb)
 {
-	/* might or might not have unix_table_locks */
+	/* might or might not have a hash table lock */
 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
 
 	if (!addr)
@@ -208,7 +208,6 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 		struct sock *sk;
 
 		num = 0;
-		spin_lock(&unix_table_locks[slot]);
 		spin_lock(&net->unx.table.locks[slot]);
 		sk_for_each(sk, &net->unx.table.buckets[slot]) {
 			if (num < s_num)
@@ -220,14 +219,12 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 					 cb->nlh->nlmsg_seq,
 					 NLM_F_MULTI) < 0) {
 				spin_unlock(&net->unx.table.locks[slot]);
-				spin_unlock(&unix_table_locks[slot]);
 				goto done;
 			}
 next:
 			num++;
 		}
 		spin_unlock(&net->unx.table.locks[slot]);
-		spin_unlock(&unix_table_locks[slot]);
 	}
 done:
 	cb->args[0] = slot;
@@ -242,18 +239,15 @@ static struct sock *unix_lookup_by_ino(struct net *net, unsigned int ino)
 	int i;
 
 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
-		spin_lock(&unix_table_locks[i]);
 		spin_lock(&net->unx.table.locks[i]);
 		sk_for_each(sk, &net->unx.table.buckets[i]) {
 			if (ino == sock_i_ino(sk)) {
 				sock_hold(sk);
 				spin_unlock(&net->unx.table.locks[i]);
-				spin_unlock(&unix_table_locks[i]);
 				return sk;
 			}
 		}
 		spin_unlock(&net->unx.table.locks[i]);
-		spin_unlock(&unix_table_locks[i]);
 	}
 	return NULL;
 }
-- 
cgit 


From 1892a991886ace2c3450bec801df2cf4028a803a Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Tue, 21 Jun 2022 16:58:34 +0200
Subject: ASoC: core: Make snd_soc_unregister_card() return void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The function snd_soc_unregister_card() returned 0 unconditionally and most
callers don't care to check the return value. Make it return void and
adapt the callers that didn't ignore the return value before.

This is a preparation for making platform remove callbacks return void.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/20220621145834.198519-1-u.kleine-koenig@pengutronix.de
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc.h                 | 2 +-
 sound/soc/atmel/mikroe-proto.c      | 4 +++-
 sound/soc/fsl/pcm030-audio-fabric.c | 5 ++---
 sound/soc/soc-core.c                | 4 +---
 sound/soc/soc-topology-test.c       | 4 +---
 5 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc.h b/include/sound/soc.h
index 76ee3c2b8b56..aad24a1d3276 100644
--- a/include/sound/soc.h
+++ b/include/sound/soc.h
@@ -426,7 +426,7 @@ enum snd_soc_pcm_subclass {
 };
 
 int snd_soc_register_card(struct snd_soc_card *card);
-int snd_soc_unregister_card(struct snd_soc_card *card);
+void snd_soc_unregister_card(struct snd_soc_card *card);
 int devm_snd_soc_register_card(struct device *dev, struct snd_soc_card *card);
 #ifdef CONFIG_PM_SLEEP
 int snd_soc_suspend(struct device *dev);
diff --git a/sound/soc/atmel/mikroe-proto.c b/sound/soc/atmel/mikroe-proto.c
index ce46d8a0b7e4..954460719aa3 100644
--- a/sound/soc/atmel/mikroe-proto.c
+++ b/sound/soc/atmel/mikroe-proto.c
@@ -157,7 +157,9 @@ put_codec_node:
 
 static int snd_proto_remove(struct platform_device *pdev)
 {
-	return snd_soc_unregister_card(&snd_proto);
+	snd_soc_unregister_card(&snd_proto);
+
+	return 0;
 }
 
 static const struct of_device_id snd_proto_of_match[] = {
diff --git a/sound/soc/fsl/pcm030-audio-fabric.c b/sound/soc/fsl/pcm030-audio-fabric.c
index 83b4a22bf15a..e4c805acc349 100644
--- a/sound/soc/fsl/pcm030-audio-fabric.c
+++ b/sound/soc/fsl/pcm030-audio-fabric.c
@@ -113,12 +113,11 @@ static int pcm030_fabric_probe(struct platform_device *op)
 static int pcm030_fabric_remove(struct platform_device *op)
 {
 	struct pcm030_audio_data *pdata = platform_get_drvdata(op);
-	int ret;
 
-	ret = snd_soc_unregister_card(pdata->card);
+	snd_soc_unregister_card(pdata->card);
 	platform_device_unregister(pdata->codec_device);
 
-	return ret;
+	return 0;
 }
 
 static const struct of_device_id pcm030_audio_match[] = {
diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c
index 57f7105c12b7..30f0da711ca9 100644
--- a/sound/soc/soc-core.c
+++ b/sound/soc/soc-core.c
@@ -2319,14 +2319,12 @@ EXPORT_SYMBOL_GPL(snd_soc_register_card);
  * @card: Card to unregister
  *
  */
-int snd_soc_unregister_card(struct snd_soc_card *card)
+void snd_soc_unregister_card(struct snd_soc_card *card)
 {
 	mutex_lock(&client_mutex);
 	snd_soc_unbind_card(card, true);
 	mutex_unlock(&client_mutex);
 	dev_dbg(card->dev, "ASoC: Unregistered card '%s'\n", card->name);
-
-	return 0;
 }
 EXPORT_SYMBOL_GPL(snd_soc_unregister_card);
 
diff --git a/sound/soc/soc-topology-test.c b/sound/soc/soc-topology-test.c
index ae3968161509..225d74355974 100644
--- a/sound/soc/soc-topology-test.c
+++ b/sound/soc/soc-topology-test.c
@@ -271,9 +271,7 @@ static void snd_soc_tplg_test_load_with_null_comp(struct kunit *test)
 	KUNIT_EXPECT_EQ(test, 0, ret);
 
 	/* cleanup */
-	ret = snd_soc_unregister_card(&kunit_comp->card);
-	KUNIT_EXPECT_EQ(test, 0, ret);
-
+	snd_soc_unregister_card(&kunit_comp->card);
 	snd_soc_unregister_component(test_dev);
 }
 
-- 
cgit 


From 0829c35dc5346e90f428de61896362b51ab58296 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Sat, 21 May 2022 13:02:03 +0200
Subject: pwm: Drop support for legacy drivers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are no drivers left providing the legacy callbacks. So drop
support for these.

If this commit breaks your out-of-tree pwm driver, look at e.g. commit
ec00cd5e63f0 ("pwm: renesas-tpu: Implement .apply() callback") for an
example of the needed conversion for your driver.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/core.c  | 82 +----------------------------------------------------
 include/linux/pwm.h | 12 --------
 2 files changed, 1 insertion(+), 93 deletions(-)

(limited to 'include')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index c7552df32082..0e042410f6b9 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -235,18 +235,8 @@ EXPORT_SYMBOL_GPL(pwm_get_chip_data);
 
 static bool pwm_ops_check(const struct pwm_chip *chip)
 {
-
 	const struct pwm_ops *ops = chip->ops;
 
-	/* driver supports legacy, non-atomic operation */
-	if (ops->config && ops->enable && ops->disable) {
-		if (IS_ENABLED(CONFIG_PWM_DEBUG))
-			dev_warn(chip->dev,
-				 "Driver needs updating to atomic API\n");
-
-		return true;
-	}
-
 	if (!ops->apply)
 		return false;
 
@@ -548,73 +538,6 @@ static void pwm_apply_state_debug(struct pwm_device *pwm,
 	}
 }
 
-static int pwm_apply_legacy(struct pwm_chip *chip, struct pwm_device *pwm,
-			    const struct pwm_state *state)
-{
-	int err;
-	struct pwm_state initial_state = pwm->state;
-
-	if (state->polarity != pwm->state.polarity) {
-		if (!chip->ops->set_polarity)
-			return -EINVAL;
-
-		/*
-		 * Changing the polarity of a running PWM is only allowed when
-		 * the PWM driver implements ->apply().
-		 */
-		if (pwm->state.enabled) {
-			chip->ops->disable(chip, pwm);
-
-			/*
-			 * Update pwm->state already here in case
-			 * .set_polarity() or another callback depend on that.
-			 */
-			pwm->state.enabled = false;
-		}
-
-		err = chip->ops->set_polarity(chip, pwm, state->polarity);
-		if (err)
-			goto rollback;
-
-		pwm->state.polarity = state->polarity;
-	}
-
-	if (!state->enabled) {
-		if (pwm->state.enabled)
-			chip->ops->disable(chip, pwm);
-
-		return 0;
-	}
-
-	/*
-	 * We cannot skip calling ->config even if state->period ==
-	 * pwm->state.period && state->duty_cycle == pwm->state.duty_cycle
-	 * because we might have exited early in the last call to
-	 * pwm_apply_state because of !state->enabled and so the two values in
-	 * pwm->state might not be configured in hardware.
-	 */
-	err = chip->ops->config(pwm->chip, pwm,
-				state->duty_cycle,
-				state->period);
-	if (err)
-		goto rollback;
-
-	pwm->state.period = state->period;
-	pwm->state.duty_cycle = state->duty_cycle;
-
-	if (!pwm->state.enabled) {
-		err = chip->ops->enable(chip, pwm);
-		if (err)
-			goto rollback;
-	}
-
-	return 0;
-
-rollback:
-	pwm->state = initial_state;
-	return err;
-}
-
 /**
  * pwm_apply_state() - atomically apply a new state to a PWM device
  * @pwm: PWM device
@@ -647,10 +570,7 @@ int pwm_apply_state(struct pwm_device *pwm, const struct pwm_state *state)
 	    state->usage_power == pwm->state.usage_power)
 		return 0;
 
-	if (chip->ops->apply)
-		err = chip->ops->apply(chip, pwm, state);
-	else
-		err = pwm_apply_legacy(chip, pwm, state);
+	err = chip->ops->apply(chip, pwm, state);
 	if (err)
 		return err;
 
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 9771a0761a40..76463b71ad4e 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -261,10 +261,6 @@ pwm_set_relative_duty_cycle(struct pwm_state *state, unsigned int duty_cycle,
  *	       called once per PWM device when the PWM chip is
  *	       registered.
  * @owner: helps prevent removal of modules exporting active PWMs
- * @config: configure duty cycles and period length for this PWM
- * @set_polarity: configure the polarity of this PWM
- * @enable: enable PWM output toggling
- * @disable: disable PWM output toggling
  */
 struct pwm_ops {
 	int (*request)(struct pwm_chip *chip, struct pwm_device *pwm);
@@ -276,14 +272,6 @@ struct pwm_ops {
 	void (*get_state)(struct pwm_chip *chip, struct pwm_device *pwm,
 			  struct pwm_state *state);
 	struct module *owner;
-
-	/* Only used by legacy drivers */
-	int (*config)(struct pwm_chip *chip, struct pwm_device *pwm,
-		      int duty_ns, int period_ns);
-	int (*set_polarity)(struct pwm_chip *chip, struct pwm_device *pwm,
-			    enum pwm_polarity polarity);
-	int (*enable)(struct pwm_chip *chip, struct pwm_device *pwm);
-	void (*disable)(struct pwm_chip *chip, struct pwm_device *pwm);
 };
 
 /**
-- 
cgit 


From ef2e35d90890932ee4546231c775326ffd7a3909 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Mon, 23 May 2022 19:45:00 +0200
Subject: pwm: Reorder header file to get rid of struct pwm_capture forward
 declaration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is no cyclic dependency, so by reordering the forward declaration
can be dropped.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 76463b71ad4e..6d6946dec771 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -6,7 +6,6 @@
 #include <linux/mutex.h>
 #include <linux/of.h>
 
-struct pwm_capture;
 struct seq_file;
 
 struct pwm_chip;
@@ -251,6 +250,16 @@ pwm_set_relative_duty_cycle(struct pwm_state *state, unsigned int duty_cycle,
 	return 0;
 }
 
+/**
+ * struct pwm_capture - PWM capture data
+ * @period: period of the PWM signal (in nanoseconds)
+ * @duty_cycle: duty cycle of the PWM signal (in nanoseconds)
+ */
+struct pwm_capture {
+	unsigned int period;
+	unsigned int duty_cycle;
+};
+
 /**
  * struct pwm_ops - PWM controller operations
  * @request: optional hook for requesting a PWM
@@ -300,16 +309,6 @@ struct pwm_chip {
 	struct pwm_device *pwms;
 };
 
-/**
- * struct pwm_capture - PWM capture data
- * @period: period of the PWM signal (in nanoseconds)
- * @duty_cycle: duty cycle of the PWM signal (in nanoseconds)
- */
-struct pwm_capture {
-	unsigned int period;
-	unsigned int duty_cycle;
-};
-
 #if IS_ENABLED(CONFIG_PWM)
 /* PWM user APIs */
 struct pwm_device *pwm_request(int pwm_id, const char *label);
-- 
cgit 


From 5c8dca97404bf23f6531456ae4d14c862f6871a3 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Mon, 23 May 2022 19:45:01 +0200
Subject: pwm: Drop unused forward declaration from pwm.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The declaration was necessary until commit cc2d22477779 ("pwm: Drop
per-chip dbg_show callback").

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 6d6946dec771..9429930c5566 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -6,8 +6,6 @@
 #include <linux/mutex.h>
 #include <linux/of.h>
 
-struct seq_file;
-
 struct pwm_chip;
 
 /**
-- 
cgit 


From 62c0aff64c8d6594357b6217db019552ea664b90 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 22 Jun 2022 20:11:47 +0300
Subject: clk: Remove never used devm_clk_*unregister()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For the entire history of the devm_clk_*unregister() existence they were
used only once (*) in 2015. Remove them.

*) The commit 264e3b75de4e ("clk: s2mps11: Simplify s2mps11_clk_probe unwind
   paths") exactly supports the point of the change proposed here.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20220622171147.85603-1-andriy.shevchenko@linux.intel.com
Acked-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/clk.c            | 48 --------------------------------------------
 include/linux/clk-provider.h |  2 --
 2 files changed, 50 deletions(-)

(limited to 'include')

diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index f00d4c1158d7..7fc191c15507 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -4279,54 +4279,6 @@ int devm_clk_hw_register(struct device *dev, struct clk_hw *hw)
 }
 EXPORT_SYMBOL_GPL(devm_clk_hw_register);
 
-static int devm_clk_match(struct device *dev, void *res, void *data)
-{
-	struct clk *c = res;
-	if (WARN_ON(!c))
-		return 0;
-	return c == data;
-}
-
-static int devm_clk_hw_match(struct device *dev, void *res, void *data)
-{
-	struct clk_hw *hw = res;
-
-	if (WARN_ON(!hw))
-		return 0;
-	return hw == data;
-}
-
-/**
- * devm_clk_unregister - resource managed clk_unregister()
- * @dev: device that is unregistering the clock data
- * @clk: clock to unregister
- *
- * Deallocate a clock allocated with devm_clk_register(). Normally
- * this function will not need to be called and the resource management
- * code will ensure that the resource is freed.
- */
-void devm_clk_unregister(struct device *dev, struct clk *clk)
-{
-	WARN_ON(devres_release(dev, devm_clk_unregister_cb, devm_clk_match, clk));
-}
-EXPORT_SYMBOL_GPL(devm_clk_unregister);
-
-/**
- * devm_clk_hw_unregister - resource managed clk_hw_unregister()
- * @dev: device that is unregistering the hardware-specific clock data
- * @hw: link to hardware-specific clock data
- *
- * Unregister a clk_hw registered with devm_clk_hw_register(). Normally
- * this function will not need to be called and the resource management
- * code will ensure that the resource is freed.
- */
-void devm_clk_hw_unregister(struct device *dev, struct clk_hw *hw)
-{
-	WARN_ON(devres_release(dev, devm_clk_hw_unregister_cb, devm_clk_hw_match,
-				hw));
-}
-EXPORT_SYMBOL_GPL(devm_clk_hw_unregister);
-
 static void devm_clk_release(struct device *dev, void *res)
 {
 	clk_put(*(struct clk **)res);
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index c10dc4c659e2..72d937c03a3e 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -1176,10 +1176,8 @@ int __must_check devm_clk_hw_register(struct device *dev, struct clk_hw *hw);
 int __must_check of_clk_hw_register(struct device_node *node, struct clk_hw *hw);
 
 void clk_unregister(struct clk *clk);
-void devm_clk_unregister(struct device *dev, struct clk *clk);
 
 void clk_hw_unregister(struct clk_hw *hw);
-void devm_clk_hw_unregister(struct device *dev, struct clk_hw *hw);
 
 /* helper functions */
 const char *__clk_get_name(const struct clk *clk);
-- 
cgit 


From ed59dfd9509d172e4920994ed9cbebf93b0050cc Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Mon, 23 May 2022 19:31:25 +0800
Subject: asm-generic: Add memory barrier dma_mb()

The memory barrier dma_mb() is introduced by commit a76a37777f2c
("iommu/arm-smmu-v3: Ensure queue is read after updating prod pointer"),
which is used to ensure that prior (both reads and writes) accesses
to memory by a CPU are ordered w.r.t. a subsequent MMIO write.

Reviewed-by: Arnd Bergmann <arnd@arndb.de> # for asm-generic
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Marco Elver <elver@google.com>
Link: https://lore.kernel.org/r/20220523113126.171714-2-wangkefeng.wang@huawei.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 Documentation/memory-barriers.txt | 11 ++++++-----
 include/asm-generic/barrier.h     |  8 ++++++++
 2 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index b12df9137e1c..832b5d36e279 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -1894,6 +1894,7 @@ There are some more advanced barrier functions:
 
  (*) dma_wmb();
  (*) dma_rmb();
+ (*) dma_mb();
 
      These are for use with consistent memory to guarantee the ordering
      of writes or reads of shared memory accessible to both the CPU and a
@@ -1925,11 +1926,11 @@ There are some more advanced barrier functions:
      The dma_rmb() allows us guarantee the device has released ownership
      before we read the data from the descriptor, and the dma_wmb() allows
      us to guarantee the data is written to the descriptor before the device
-     can see it now has ownership.  Note that, when using writel(), a prior
-     wmb() is not needed to guarantee that the cache coherent memory writes
-     have completed before writing to the MMIO region.  The cheaper
-     writel_relaxed() does not provide this guarantee and must not be used
-     here.
+     can see it now has ownership.  The dma_mb() implies both a dma_rmb() and
+     a dma_wmb().  Note that, when using writel(), a prior wmb() is not needed
+     to guarantee that the cache coherent memory writes have completed before
+     writing to the MMIO region.  The cheaper writel_relaxed() does not provide
+     this guarantee and must not be used here.
 
      See the subsection "Kernel I/O barrier effects" for more information on
      relaxed I/O accessors and the Documentation/core-api/dma-api.rst file for
diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index fd7e8fbaeef1..961f4d88f9ef 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -38,6 +38,10 @@
 #define wmb()	do { kcsan_wmb(); __wmb(); } while (0)
 #endif
 
+#ifdef __dma_mb
+#define dma_mb()	do { kcsan_mb(); __dma_mb(); } while (0)
+#endif
+
 #ifdef __dma_rmb
 #define dma_rmb()	do { kcsan_rmb(); __dma_rmb(); } while (0)
 #endif
@@ -65,6 +69,10 @@
 #define wmb()	mb()
 #endif
 
+#ifndef dma_mb
+#define dma_mb()	mb()
+#endif
+
 #ifndef dma_rmb
 #define dma_rmb()	rmb()
 #endif
-- 
cgit 


From a22f18bddd824e96db839ccda75ff7e035e938ca Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 13 Jun 2022 20:36:06 +0200
Subject: ACPI / MMC: PM: Unify fixing up device power

Introduce acpi_device_fix_up_power_extended() for fixing up power of
a device having an ACPI companion in a manner that takes the device's
children into account and make the MMC code use it in two places
instead of walking the list of the device ACPI companion's children
directly.

This will help to eliminate the children list head from struct
acpi_device as it is redundant and it is used in questionable ways
in some places (in particular, locking is needed for walking the
list pointed to it safely, but it is often missing).

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Adrian Hunter <adrian.hunter@intel.com>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/acpi/device_pm.c          | 22 ++++++++++++++++++++++
 drivers/mmc/host/sdhci-acpi.c     |  7 ++-----
 drivers/mmc/host/sdhci-pci-core.c | 11 +++--------
 include/acpi/acpi_bus.h           |  1 +
 4 files changed, 28 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c
index 130b5f4a50a3..9dce1245689c 100644
--- a/drivers/acpi/device_pm.c
+++ b/drivers/acpi/device_pm.c
@@ -369,6 +369,28 @@ int acpi_device_fix_up_power(struct acpi_device *device)
 }
 EXPORT_SYMBOL_GPL(acpi_device_fix_up_power);
 
+static int fix_up_power_if_applicable(struct acpi_device *adev, void *not_used)
+{
+	if (adev->status.present && adev->status.enabled)
+		acpi_device_fix_up_power(adev);
+
+	return 0;
+}
+
+/**
+ * acpi_device_fix_up_power_extended - Force device and its children into D0.
+ * @adev: Parent device object whose power state is to be fixed up.
+ *
+ * Call acpi_device_fix_up_power() for @adev and its children so long as they
+ * are reported as present and enabled.
+ */
+void acpi_device_fix_up_power_extended(struct acpi_device *adev)
+{
+	acpi_device_fix_up_power(adev);
+	acpi_dev_for_each_child(adev, fix_up_power_if_applicable, NULL);
+}
+EXPORT_SYMBOL_GPL(acpi_device_fix_up_power_extended);
+
 int acpi_device_update_power(struct acpi_device *device, int *state_p)
 {
 	int state;
diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c
index c0350e9c03f3..4cca4c90769b 100644
--- a/drivers/mmc/host/sdhci-acpi.c
+++ b/drivers/mmc/host/sdhci-acpi.c
@@ -775,8 +775,8 @@ static int sdhci_acpi_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
 	const struct sdhci_acpi_slot *slot;
-	struct acpi_device *device, *child;
 	const struct dmi_system_id *id;
+	struct acpi_device *device;
 	struct sdhci_acpi_host *c;
 	struct sdhci_host *host;
 	struct resource *iomem;
@@ -796,10 +796,7 @@ static int sdhci_acpi_probe(struct platform_device *pdev)
 	slot = sdhci_acpi_get_slot(device);
 
 	/* Power on the SDHCI controller and its children */
-	acpi_device_fix_up_power(device);
-	list_for_each_entry(child, &device->children, node)
-		if (child->status.present && child->status.enabled)
-			acpi_device_fix_up_power(child);
+	acpi_device_fix_up_power_extended(device);
 
 	if (sdhci_acpi_byt_defer(dev))
 		return -EPROBE_DEFER;
diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c
index ed53276f6ad9..622b7de96c7f 100644
--- a/drivers/mmc/host/sdhci-pci-core.c
+++ b/drivers/mmc/host/sdhci-pci-core.c
@@ -1240,16 +1240,11 @@ static const struct sdhci_pci_fixes sdhci_intel_byt_sd = {
 #ifdef CONFIG_ACPI
 static void intel_mrfld_mmc_fix_up_power_slot(struct sdhci_pci_slot *slot)
 {
-	struct acpi_device *device, *child;
+	struct acpi_device *device;
 
 	device = ACPI_COMPANION(&slot->chip->pdev->dev);
-	if (!device)
-		return;
-
-	acpi_device_fix_up_power(device);
-	list_for_each_entry(child, &device->children, node)
-		if (child->status.present && child->status.enabled)
-			acpi_device_fix_up_power(child);
+	if (device)
+		acpi_device_fix_up_power_extended(device);
 }
 #else
 static inline void intel_mrfld_mmc_fix_up_power_slot(struct sdhci_pci_slot *slot) {}
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 76de1aa5d221..54c5566df9fe 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -524,6 +524,7 @@ const char *acpi_power_state_string(int state);
 int acpi_device_set_power(struct acpi_device *device, int state);
 int acpi_bus_init_power(struct acpi_device *device);
 int acpi_device_fix_up_power(struct acpi_device *device);
+void acpi_device_fix_up_power_extended(struct acpi_device *adev);
 int acpi_bus_update_power(acpi_handle handle, int *state_p);
 int acpi_device_update_power(struct acpi_device *device, int *state_p);
 bool acpi_bus_power_manageable(acpi_handle handle);
-- 
cgit 


From 203184571388a988283543f0fd7da1a0da7c3f91 Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Tue, 24 May 2022 10:21:53 -0500
Subject: dmaengine: dw-edma: Detach the private data and chip info structures

"struct dw_edma_chip" contains an internal structure "struct dw_edma" that
is used by the eDMA core internally and should not be touched by the eDMA
controller drivers themselves. But currently, the eDMA controller drivers
like "dw-edma-pci" allocate and populate this internal structure before
passing it on to the eDMA core. The eDMA core further populates the
structure and uses it. This is wrong!

Hence, move all the "struct dw_edma" specifics from controller drivers to
the eDMA core.

Link: https://lore.kernel.org/r/20220524152159.2370739-3-Frank.Li@nxp.com
Tested-by: Serge Semin <fancer.lancer@gmail.com>
Tested-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Frank Li <Frank.Li@nxp.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Acked-By: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/dw-edma/dw-edma-core.c       | 90 +++++++++++++++++---------------
 drivers/dma/dw-edma/dw-edma-core.h       | 31 ++---------
 drivers/dma/dw-edma/dw-edma-pcie.c       | 82 ++++++++++++-----------------
 drivers/dma/dw-edma/dw-edma-v0-core.c    | 32 ++++++------
 drivers/dma/dw-edma/dw-edma-v0-core.h    |  4 +-
 drivers/dma/dw-edma/dw-edma-v0-debugfs.c | 18 +++----
 drivers/dma/dw-edma/dw-edma-v0-debugfs.h |  8 +--
 include/linux/dma/edma.h                 | 48 ++++++++++++++++-
 8 files changed, 164 insertions(+), 149 deletions(-)

(limited to 'include')

diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c
index 468d1097a1ec..9a4c96f7d9d9 100644
--- a/drivers/dma/dw-edma/dw-edma-core.c
+++ b/drivers/dma/dw-edma/dw-edma-core.c
@@ -64,8 +64,8 @@ static struct dw_edma_burst *dw_edma_alloc_burst(struct dw_edma_chunk *chunk)
 
 static struct dw_edma_chunk *dw_edma_alloc_chunk(struct dw_edma_desc *desc)
 {
+	struct dw_edma_chip *chip = desc->chan->dw->chip;
 	struct dw_edma_chan *chan = desc->chan;
-	struct dw_edma *dw = chan->chip->dw;
 	struct dw_edma_chunk *chunk;
 
 	chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT);
@@ -82,11 +82,11 @@ static struct dw_edma_chunk *dw_edma_alloc_chunk(struct dw_edma_desc *desc)
 	 */
 	chunk->cb = !(desc->chunks_alloc % 2);
 	if (chan->dir == EDMA_DIR_WRITE) {
-		chunk->ll_region.paddr = dw->ll_region_wr[chan->id].paddr;
-		chunk->ll_region.vaddr = dw->ll_region_wr[chan->id].vaddr;
+		chunk->ll_region.paddr = chip->ll_region_wr[chan->id].paddr;
+		chunk->ll_region.vaddr = chip->ll_region_wr[chan->id].vaddr;
 	} else {
-		chunk->ll_region.paddr = dw->ll_region_rd[chan->id].paddr;
-		chunk->ll_region.vaddr = dw->ll_region_rd[chan->id].vaddr;
+		chunk->ll_region.paddr = chip->ll_region_rd[chan->id].paddr;
+		chunk->ll_region.vaddr = chip->ll_region_rd[chan->id].vaddr;
 	}
 
 	if (desc->chunk) {
@@ -663,7 +663,7 @@ static int dw_edma_alloc_chan_resources(struct dma_chan *dchan)
 	if (chan->status != EDMA_ST_IDLE)
 		return -EBUSY;
 
-	pm_runtime_get(chan->chip->dev);
+	pm_runtime_get(chan->dw->chip->dev);
 
 	return 0;
 }
@@ -685,15 +685,15 @@ static void dw_edma_free_chan_resources(struct dma_chan *dchan)
 		cpu_relax();
 	}
 
-	pm_runtime_put(chan->chip->dev);
+	pm_runtime_put(chan->dw->chip->dev);
 }
 
-static int dw_edma_channel_setup(struct dw_edma_chip *chip, bool write,
+static int dw_edma_channel_setup(struct dw_edma *dw, bool write,
 				 u32 wr_alloc, u32 rd_alloc)
 {
+	struct dw_edma_chip *chip = dw->chip;
 	struct dw_edma_region *dt_region;
 	struct device *dev = chip->dev;
-	struct dw_edma *dw = chip->dw;
 	struct dw_edma_chan *chan;
 	struct dw_edma_irq *irq;
 	struct dma_device *dma;
@@ -726,7 +726,7 @@ static int dw_edma_channel_setup(struct dw_edma_chip *chip, bool write,
 
 		chan->vc.chan.private = dt_region;
 
-		chan->chip = chip;
+		chan->dw = dw;
 		chan->id = j;
 		chan->dir = write ? EDMA_DIR_WRITE : EDMA_DIR_READ;
 		chan->configured = false;
@@ -734,9 +734,9 @@ static int dw_edma_channel_setup(struct dw_edma_chip *chip, bool write,
 		chan->status = EDMA_ST_IDLE;
 
 		if (write)
-			chan->ll_max = (dw->ll_region_wr[j].sz / EDMA_LL_SZ);
+			chan->ll_max = (chip->ll_region_wr[j].sz / EDMA_LL_SZ);
 		else
-			chan->ll_max = (dw->ll_region_rd[j].sz / EDMA_LL_SZ);
+			chan->ll_max = (chip->ll_region_rd[j].sz / EDMA_LL_SZ);
 		chan->ll_max -= 1;
 
 		dev_vdbg(dev, "L. List:\tChannel %s[%u] max_cnt=%u\n",
@@ -766,13 +766,13 @@ static int dw_edma_channel_setup(struct dw_edma_chip *chip, bool write,
 		vchan_init(&chan->vc, dma);
 
 		if (write) {
-			dt_region->paddr = dw->dt_region_wr[j].paddr;
-			dt_region->vaddr = dw->dt_region_wr[j].vaddr;
-			dt_region->sz = dw->dt_region_wr[j].sz;
+			dt_region->paddr = chip->dt_region_wr[j].paddr;
+			dt_region->vaddr = chip->dt_region_wr[j].vaddr;
+			dt_region->sz = chip->dt_region_wr[j].sz;
 		} else {
-			dt_region->paddr = dw->dt_region_rd[j].paddr;
-			dt_region->vaddr = dw->dt_region_rd[j].vaddr;
-			dt_region->sz = dw->dt_region_rd[j].sz;
+			dt_region->paddr = chip->dt_region_rd[j].paddr;
+			dt_region->vaddr = chip->dt_region_rd[j].vaddr;
+			dt_region->sz = chip->dt_region_rd[j].sz;
 		}
 
 		dw_edma_v0_core_device_config(chan);
@@ -826,11 +826,11 @@ static inline void dw_edma_add_irq_mask(u32 *mask, u32 alloc, u16 cnt)
 		(*mask)++;
 }
 
-static int dw_edma_irq_request(struct dw_edma_chip *chip,
+static int dw_edma_irq_request(struct dw_edma *dw,
 			       u32 *wr_alloc, u32 *rd_alloc)
 {
-	struct device *dev = chip->dev;
-	struct dw_edma *dw = chip->dw;
+	struct dw_edma_chip *chip = dw->chip;
+	struct device *dev = dw->chip->dev;
 	u32 wr_mask = 1;
 	u32 rd_mask = 1;
 	int i, err = 0;
@@ -839,12 +839,16 @@ static int dw_edma_irq_request(struct dw_edma_chip *chip,
 
 	ch_cnt = dw->wr_ch_cnt + dw->rd_ch_cnt;
 
-	if (dw->nr_irqs < 1)
+	if (chip->nr_irqs < 1 || !chip->ops->irq_vector)
 		return -EINVAL;
 
-	if (dw->nr_irqs == 1) {
+	dw->irq = devm_kcalloc(dev, chip->nr_irqs, sizeof(*dw->irq), GFP_KERNEL);
+	if (!dw->irq)
+		return -ENOMEM;
+
+	if (chip->nr_irqs == 1) {
 		/* Common IRQ shared among all channels */
-		irq = dw->ops->irq_vector(dev, 0);
+		irq = chip->ops->irq_vector(dev, 0);
 		err = request_irq(irq, dw_edma_interrupt_common,
 				  IRQF_SHARED, dw->name, &dw->irq[0]);
 		if (err) {
@@ -854,9 +858,11 @@ static int dw_edma_irq_request(struct dw_edma_chip *chip,
 
 		if (irq_get_msi_desc(irq))
 			get_cached_msi_msg(irq, &dw->irq[0].msi);
+
+		dw->nr_irqs = 1;
 	} else {
 		/* Distribute IRQs equally among all channels */
-		int tmp = dw->nr_irqs;
+		int tmp = chip->nr_irqs;
 
 		while (tmp && (*wr_alloc + *rd_alloc) < ch_cnt) {
 			dw_edma_dec_irq_alloc(&tmp, wr_alloc, dw->wr_ch_cnt);
@@ -867,7 +873,7 @@ static int dw_edma_irq_request(struct dw_edma_chip *chip,
 		dw_edma_add_irq_mask(&rd_mask, *rd_alloc, dw->rd_ch_cnt);
 
 		for (i = 0; i < (*wr_alloc + *rd_alloc); i++) {
-			irq = dw->ops->irq_vector(dev, i);
+			irq = chip->ops->irq_vector(dev, i);
 			err = request_irq(irq,
 					  i < *wr_alloc ?
 						dw_edma_interrupt_write :
@@ -901,20 +907,22 @@ int dw_edma_probe(struct dw_edma_chip *chip)
 		return -EINVAL;
 
 	dev = chip->dev;
-	if (!dev)
+	if (!dev || !chip->ops)
 		return -EINVAL;
 
-	dw = chip->dw;
-	if (!dw || !dw->irq || !dw->ops || !dw->ops->irq_vector)
-		return -EINVAL;
+	dw = devm_kzalloc(dev, sizeof(*dw), GFP_KERNEL);
+	if (!dw)
+		return -ENOMEM;
+
+	dw->chip = chip;
 
 	raw_spin_lock_init(&dw->lock);
 
-	dw->wr_ch_cnt = min_t(u16, dw->wr_ch_cnt,
+	dw->wr_ch_cnt = min_t(u16, chip->wr_ch_cnt,
 			      dw_edma_v0_core_ch_count(dw, EDMA_DIR_WRITE));
 	dw->wr_ch_cnt = min_t(u16, dw->wr_ch_cnt, EDMA_MAX_WR_CH);
 
-	dw->rd_ch_cnt = min_t(u16, dw->rd_ch_cnt,
+	dw->rd_ch_cnt = min_t(u16, chip->rd_ch_cnt,
 			      dw_edma_v0_core_ch_count(dw, EDMA_DIR_READ));
 	dw->rd_ch_cnt = min_t(u16, dw->rd_ch_cnt, EDMA_MAX_RD_CH);
 
@@ -936,17 +944,17 @@ int dw_edma_probe(struct dw_edma_chip *chip)
 	dw_edma_v0_core_off(dw);
 
 	/* Request IRQs */
-	err = dw_edma_irq_request(chip, &wr_alloc, &rd_alloc);
+	err = dw_edma_irq_request(dw, &wr_alloc, &rd_alloc);
 	if (err)
 		return err;
 
 	/* Setup write channels */
-	err = dw_edma_channel_setup(chip, true, wr_alloc, rd_alloc);
+	err = dw_edma_channel_setup(dw, true, wr_alloc, rd_alloc);
 	if (err)
 		goto err_irq_free;
 
 	/* Setup read channels */
-	err = dw_edma_channel_setup(chip, false, wr_alloc, rd_alloc);
+	err = dw_edma_channel_setup(dw, false, wr_alloc, rd_alloc);
 	if (err)
 		goto err_irq_free;
 
@@ -954,15 +962,15 @@ int dw_edma_probe(struct dw_edma_chip *chip)
 	pm_runtime_enable(dev);
 
 	/* Turn debugfs on */
-	dw_edma_v0_core_debugfs_on(chip);
+	dw_edma_v0_core_debugfs_on(dw);
+
+	chip->dw = dw;
 
 	return 0;
 
 err_irq_free:
 	for (i = (dw->nr_irqs - 1); i >= 0; i--)
-		free_irq(dw->ops->irq_vector(dev, i), &dw->irq[i]);
-
-	dw->nr_irqs = 0;
+		free_irq(chip->ops->irq_vector(dev, i), &dw->irq[i]);
 
 	return err;
 }
@@ -980,7 +988,7 @@ int dw_edma_remove(struct dw_edma_chip *chip)
 
 	/* Free irqs */
 	for (i = (dw->nr_irqs - 1); i >= 0; i--)
-		free_irq(dw->ops->irq_vector(dev, i), &dw->irq[i]);
+		free_irq(chip->ops->irq_vector(dev, i), &dw->irq[i]);
 
 	/* Power management */
 	pm_runtime_disable(dev);
@@ -1001,7 +1009,7 @@ int dw_edma_remove(struct dw_edma_chip *chip)
 	}
 
 	/* Turn debugfs off */
-	dw_edma_v0_core_debugfs_off(chip);
+	dw_edma_v0_core_debugfs_off(dw);
 
 	return 0;
 }
diff --git a/drivers/dma/dw-edma/dw-edma-core.h b/drivers/dma/dw-edma/dw-edma-core.h
index 60316d408c3e..85df2d511907 100644
--- a/drivers/dma/dw-edma/dw-edma-core.h
+++ b/drivers/dma/dw-edma/dw-edma-core.h
@@ -15,20 +15,12 @@
 #include "../virt-dma.h"
 
 #define EDMA_LL_SZ					24
-#define EDMA_MAX_WR_CH					8
-#define EDMA_MAX_RD_CH					8
 
 enum dw_edma_dir {
 	EDMA_DIR_WRITE = 0,
 	EDMA_DIR_READ
 };
 
-enum dw_edma_map_format {
-	EDMA_MF_EDMA_LEGACY = 0x0,
-	EDMA_MF_EDMA_UNROLL = 0x1,
-	EDMA_MF_HDMA_COMPAT = 0x5
-};
-
 enum dw_edma_request {
 	EDMA_REQ_NONE = 0,
 	EDMA_REQ_STOP,
@@ -57,12 +49,6 @@ struct dw_edma_burst {
 	u32				sz;
 };
 
-struct dw_edma_region {
-	phys_addr_t			paddr;
-	void				__iomem *vaddr;
-	size_t				sz;
-};
-
 struct dw_edma_chunk {
 	struct list_head		list;
 	struct dw_edma_chan		*chan;
@@ -87,7 +73,7 @@ struct dw_edma_desc {
 
 struct dw_edma_chan {
 	struct virt_dma_chan		vc;
-	struct dw_edma_chip		*chip;
+	struct dw_edma			*dw;
 	int				id;
 	enum dw_edma_dir		dir;
 
@@ -109,10 +95,6 @@ struct dw_edma_irq {
 	struct dw_edma			*dw;
 };
 
-struct dw_edma_core_ops {
-	int	(*irq_vector)(struct device *dev, unsigned int nr);
-};
-
 struct dw_edma {
 	char				name[20];
 
@@ -122,21 +104,14 @@ struct dw_edma {
 	struct dma_device		rd_edma;
 	u16				rd_ch_cnt;
 
-	struct dw_edma_region		rg_region;	/* Registers */
-	struct dw_edma_region		ll_region_wr[EDMA_MAX_WR_CH];
-	struct dw_edma_region		ll_region_rd[EDMA_MAX_RD_CH];
-	struct dw_edma_region		dt_region_wr[EDMA_MAX_WR_CH];
-	struct dw_edma_region		dt_region_rd[EDMA_MAX_RD_CH];
-
 	struct dw_edma_irq		*irq;
 	int				nr_irqs;
 
-	enum dw_edma_map_format		mf;
-
 	struct dw_edma_chan		*chan;
-	const struct dw_edma_core_ops	*ops;
 
 	raw_spinlock_t			lock;		/* Only for legacy */
+
+	struct dw_edma_chip             *chip;
 #ifdef CONFIG_DEBUG_FS
 	struct dentry			*debugfs;
 #endif /* CONFIG_DEBUG_FS */
diff --git a/drivers/dma/dw-edma/dw-edma-pcie.c b/drivers/dma/dw-edma/dw-edma-pcie.c
index bc07923c3bc0..965513698f3d 100644
--- a/drivers/dma/dw-edma/dw-edma-pcie.c
+++ b/drivers/dma/dw-edma/dw-edma-pcie.c
@@ -148,7 +148,6 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 	struct dw_edma_pcie_data vsec_data;
 	struct device *dev = &pdev->dev;
 	struct dw_edma_chip *chip;
-	struct dw_edma *dw;
 	int err, nr_irqs;
 	int i, mask;
 
@@ -197,10 +196,6 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 	if (!chip)
 		return -ENOMEM;
 
-	dw = devm_kzalloc(dev, sizeof(*dw), GFP_KERNEL);
-	if (!dw)
-		return -ENOMEM;
-
 	/* IRQs allocation */
 	nr_irqs = pci_alloc_irq_vectors(pdev, 1, vsec_data.irqs,
 					PCI_IRQ_MSI | PCI_IRQ_MSIX);
@@ -211,28 +206,23 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 	}
 
 	/* Data structure initialization */
-	chip->dw = dw;
 	chip->dev = dev;
 	chip->id = pdev->devfn;
 
-	dw->mf = vsec_data.mf;
-	dw->nr_irqs = nr_irqs;
-	dw->ops = &dw_edma_pcie_core_ops;
-	dw->wr_ch_cnt = vsec_data.wr_ch_cnt;
-	dw->rd_ch_cnt = vsec_data.rd_ch_cnt;
+	chip->mf = vsec_data.mf;
+	chip->nr_irqs = nr_irqs;
+	chip->ops = &dw_edma_pcie_core_ops;
 
-	dw->rg_region.vaddr = pcim_iomap_table(pdev)[vsec_data.rg.bar];
-	if (!dw->rg_region.vaddr)
-		return -ENOMEM;
+	chip->wr_ch_cnt = vsec_data.wr_ch_cnt;
+	chip->rd_ch_cnt = vsec_data.rd_ch_cnt;
 
-	dw->rg_region.vaddr += vsec_data.rg.off;
-	dw->rg_region.paddr = pdev->resource[vsec_data.rg.bar].start;
-	dw->rg_region.paddr += vsec_data.rg.off;
-	dw->rg_region.sz = vsec_data.rg.sz;
+	chip->rg_region.vaddr = pcim_iomap_table(pdev)[vsec_data.rg.bar];
+	if (!chip->rg_region.vaddr)
+		return -ENOMEM;
 
-	for (i = 0; i < dw->wr_ch_cnt; i++) {
-		struct dw_edma_region *ll_region = &dw->ll_region_wr[i];
-		struct dw_edma_region *dt_region = &dw->dt_region_wr[i];
+	for (i = 0; i < chip->wr_ch_cnt; i++) {
+		struct dw_edma_region *ll_region = &chip->ll_region_wr[i];
+		struct dw_edma_region *dt_region = &chip->dt_region_wr[i];
 		struct dw_edma_block *ll_block = &vsec_data.ll_wr[i];
 		struct dw_edma_block *dt_block = &vsec_data.dt_wr[i];
 
@@ -255,9 +245,9 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 		dt_region->sz = dt_block->sz;
 	}
 
-	for (i = 0; i < dw->rd_ch_cnt; i++) {
-		struct dw_edma_region *ll_region = &dw->ll_region_rd[i];
-		struct dw_edma_region *dt_region = &dw->dt_region_rd[i];
+	for (i = 0; i < chip->rd_ch_cnt; i++) {
+		struct dw_edma_region *ll_region = &chip->ll_region_rd[i];
+		struct dw_edma_region *dt_region = &chip->dt_region_rd[i];
 		struct dw_edma_block *ll_block = &vsec_data.ll_rd[i];
 		struct dw_edma_block *dt_block = &vsec_data.dt_rd[i];
 
@@ -281,45 +271,45 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 	}
 
 	/* Debug info */
-	if (dw->mf == EDMA_MF_EDMA_LEGACY)
-		pci_dbg(pdev, "Version:\teDMA Port Logic (0x%x)\n", dw->mf);
-	else if (dw->mf == EDMA_MF_EDMA_UNROLL)
-		pci_dbg(pdev, "Version:\teDMA Unroll (0x%x)\n", dw->mf);
-	else if (dw->mf == EDMA_MF_HDMA_COMPAT)
-		pci_dbg(pdev, "Version:\tHDMA Compatible (0x%x)\n", dw->mf);
+	if (chip->mf == EDMA_MF_EDMA_LEGACY)
+		pci_dbg(pdev, "Version:\teDMA Port Logic (0x%x)\n", chip->mf);
+	else if (chip->mf == EDMA_MF_EDMA_UNROLL)
+		pci_dbg(pdev, "Version:\teDMA Unroll (0x%x)\n", chip->mf);
+	else if (chip->mf == EDMA_MF_HDMA_COMPAT)
+		pci_dbg(pdev, "Version:\tHDMA Compatible (0x%x)\n", chip->mf);
 	else
-		pci_dbg(pdev, "Version:\tUnknown (0x%x)\n", dw->mf);
+		pci_dbg(pdev, "Version:\tUnknown (0x%x)\n", chip->mf);
 
-	pci_dbg(pdev, "Registers:\tBAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n",
+	pci_dbg(pdev, "Registers:\tBAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p)\n",
 		vsec_data.rg.bar, vsec_data.rg.off, vsec_data.rg.sz,
-		dw->rg_region.vaddr, &dw->rg_region.paddr);
+		chip->rg_region.vaddr);
 
 
-	for (i = 0; i < dw->wr_ch_cnt; i++) {
+	for (i = 0; i < chip->wr_ch_cnt; i++) {
 		pci_dbg(pdev, "L. List:\tWRITE CH%.2u, BAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n",
 			i, vsec_data.ll_wr[i].bar,
-			vsec_data.ll_wr[i].off, dw->ll_region_wr[i].sz,
-			dw->ll_region_wr[i].vaddr, &dw->ll_region_wr[i].paddr);
+			vsec_data.ll_wr[i].off, chip->ll_region_wr[i].sz,
+			chip->ll_region_wr[i].vaddr, &chip->ll_region_wr[i].paddr);
 
 		pci_dbg(pdev, "Data:\tWRITE CH%.2u, BAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n",
 			i, vsec_data.dt_wr[i].bar,
-			vsec_data.dt_wr[i].off, dw->dt_region_wr[i].sz,
-			dw->dt_region_wr[i].vaddr, &dw->dt_region_wr[i].paddr);
+			vsec_data.dt_wr[i].off, chip->dt_region_wr[i].sz,
+			chip->dt_region_wr[i].vaddr, &chip->dt_region_wr[i].paddr);
 	}
 
-	for (i = 0; i < dw->rd_ch_cnt; i++) {
+	for (i = 0; i < chip->rd_ch_cnt; i++) {
 		pci_dbg(pdev, "L. List:\tREAD CH%.2u, BAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n",
 			i, vsec_data.ll_rd[i].bar,
-			vsec_data.ll_rd[i].off, dw->ll_region_rd[i].sz,
-			dw->ll_region_rd[i].vaddr, &dw->ll_region_rd[i].paddr);
+			vsec_data.ll_rd[i].off, chip->ll_region_rd[i].sz,
+			chip->ll_region_rd[i].vaddr, &chip->ll_region_rd[i].paddr);
 
 		pci_dbg(pdev, "Data:\tREAD CH%.2u, BAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n",
 			i, vsec_data.dt_rd[i].bar,
-			vsec_data.dt_rd[i].off, dw->dt_region_rd[i].sz,
-			dw->dt_region_rd[i].vaddr, &dw->dt_region_rd[i].paddr);
+			vsec_data.dt_rd[i].off, chip->dt_region_rd[i].sz,
+			chip->dt_region_rd[i].vaddr, &chip->dt_region_rd[i].paddr);
 	}
 
-	pci_dbg(pdev, "Nr. IRQs:\t%u\n", dw->nr_irqs);
+	pci_dbg(pdev, "Nr. IRQs:\t%u\n", chip->nr_irqs);
 
 	/* Validating if PCI interrupts were enabled */
 	if (!pci_dev_msi_enabled(pdev)) {
@@ -327,10 +317,6 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 		return -EPERM;
 	}
 
-	dw->irq = devm_kcalloc(dev, nr_irqs, sizeof(*dw->irq), GFP_KERNEL);
-	if (!dw->irq)
-		return -ENOMEM;
-
 	/* Starting eDMA driver */
 	err = dw_edma_probe(chip);
 	if (err) {
diff --git a/drivers/dma/dw-edma/dw-edma-v0-core.c b/drivers/dma/dw-edma/dw-edma-v0-core.c
index 33bc1e6c4cf2..999e03896186 100644
--- a/drivers/dma/dw-edma/dw-edma-v0-core.c
+++ b/drivers/dma/dw-edma/dw-edma-v0-core.c
@@ -25,7 +25,7 @@ enum dw_edma_control {
 
 static inline struct dw_edma_v0_regs __iomem *__dw_regs(struct dw_edma *dw)
 {
-	return dw->rg_region.vaddr;
+	return dw->chip->rg_region.vaddr;
 }
 
 #define SET_32(dw, name, value)				\
@@ -96,7 +96,7 @@ static inline struct dw_edma_v0_regs __iomem *__dw_regs(struct dw_edma *dw)
 static inline struct dw_edma_v0_ch_regs __iomem *
 __dw_ch_regs(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch)
 {
-	if (dw->mf == EDMA_MF_EDMA_LEGACY)
+	if (dw->chip->mf == EDMA_MF_EDMA_LEGACY)
 		return &(__dw_regs(dw)->type.legacy.ch);
 
 	if (dir == EDMA_DIR_WRITE)
@@ -108,7 +108,7 @@ __dw_ch_regs(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch)
 static inline void writel_ch(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch,
 			     u32 value, void __iomem *addr)
 {
-	if (dw->mf == EDMA_MF_EDMA_LEGACY) {
+	if (dw->chip->mf == EDMA_MF_EDMA_LEGACY) {
 		u32 viewport_sel;
 		unsigned long flags;
 
@@ -133,7 +133,7 @@ static inline u32 readl_ch(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch,
 {
 	u32 value;
 
-	if (dw->mf == EDMA_MF_EDMA_LEGACY) {
+	if (dw->chip->mf == EDMA_MF_EDMA_LEGACY) {
 		u32 viewport_sel;
 		unsigned long flags;
 
@@ -169,7 +169,7 @@ static inline u32 readl_ch(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch,
 static inline void writeq_ch(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch,
 			     u64 value, void __iomem *addr)
 {
-	if (dw->mf == EDMA_MF_EDMA_LEGACY) {
+	if (dw->chip->mf == EDMA_MF_EDMA_LEGACY) {
 		u32 viewport_sel;
 		unsigned long flags;
 
@@ -194,7 +194,7 @@ static inline u64 readq_ch(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch,
 {
 	u32 value;
 
-	if (dw->mf == EDMA_MF_EDMA_LEGACY) {
+	if (dw->chip->mf == EDMA_MF_EDMA_LEGACY) {
 		u32 viewport_sel;
 		unsigned long flags;
 
@@ -256,7 +256,7 @@ u16 dw_edma_v0_core_ch_count(struct dw_edma *dw, enum dw_edma_dir dir)
 
 enum dma_status dw_edma_v0_core_ch_status(struct dw_edma_chan *chan)
 {
-	struct dw_edma *dw = chan->chip->dw;
+	struct dw_edma *dw = chan->dw;
 	u32 tmp;
 
 	tmp = FIELD_GET(EDMA_V0_CH_STATUS_MASK,
@@ -272,7 +272,7 @@ enum dma_status dw_edma_v0_core_ch_status(struct dw_edma_chan *chan)
 
 void dw_edma_v0_core_clear_done_int(struct dw_edma_chan *chan)
 {
-	struct dw_edma *dw = chan->chip->dw;
+	struct dw_edma *dw = chan->dw;
 
 	SET_RW_32(dw, chan->dir, int_clear,
 		  FIELD_PREP(EDMA_V0_DONE_INT_MASK, BIT(chan->id)));
@@ -280,7 +280,7 @@ void dw_edma_v0_core_clear_done_int(struct dw_edma_chan *chan)
 
 void dw_edma_v0_core_clear_abort_int(struct dw_edma_chan *chan)
 {
-	struct dw_edma *dw = chan->chip->dw;
+	struct dw_edma *dw = chan->dw;
 
 	SET_RW_32(dw, chan->dir, int_clear,
 		  FIELD_PREP(EDMA_V0_ABORT_INT_MASK, BIT(chan->id)));
@@ -357,7 +357,7 @@ static void dw_edma_v0_core_write_chunk(struct dw_edma_chunk *chunk)
 void dw_edma_v0_core_start(struct dw_edma_chunk *chunk, bool first)
 {
 	struct dw_edma_chan *chan = chunk->chan;
-	struct dw_edma *dw = chan->chip->dw;
+	struct dw_edma *dw = chan->dw;
 	u32 tmp;
 
 	dw_edma_v0_core_write_chunk(chunk);
@@ -365,7 +365,7 @@ void dw_edma_v0_core_start(struct dw_edma_chunk *chunk, bool first)
 	if (first) {
 		/* Enable engine */
 		SET_RW_32(dw, chan->dir, engine_en, BIT(0));
-		if (dw->mf == EDMA_MF_HDMA_COMPAT) {
+		if (dw->chip->mf == EDMA_MF_HDMA_COMPAT) {
 			switch (chan->id) {
 			case 0:
 				SET_RW_COMPAT(dw, chan->dir, ch0_pwr_en,
@@ -435,7 +435,7 @@ void dw_edma_v0_core_start(struct dw_edma_chunk *chunk, bool first)
 
 int dw_edma_v0_core_device_config(struct dw_edma_chan *chan)
 {
-	struct dw_edma *dw = chan->chip->dw;
+	struct dw_edma *dw = chan->dw;
 	u32 tmp = 0;
 
 	/* MSI done addr - low, high */
@@ -505,12 +505,12 @@ int dw_edma_v0_core_device_config(struct dw_edma_chan *chan)
 }
 
 /* eDMA debugfs callbacks */
-void dw_edma_v0_core_debugfs_on(struct dw_edma_chip *chip)
+void dw_edma_v0_core_debugfs_on(struct dw_edma *dw)
 {
-	dw_edma_v0_debugfs_on(chip);
+	dw_edma_v0_debugfs_on(dw);
 }
 
-void dw_edma_v0_core_debugfs_off(struct dw_edma_chip *chip)
+void dw_edma_v0_core_debugfs_off(struct dw_edma *dw)
 {
-	dw_edma_v0_debugfs_off(chip);
+	dw_edma_v0_debugfs_off(dw);
 }
diff --git a/drivers/dma/dw-edma/dw-edma-v0-core.h b/drivers/dma/dw-edma/dw-edma-v0-core.h
index 2afa626b8300..75aec6d31b21 100644
--- a/drivers/dma/dw-edma/dw-edma-v0-core.h
+++ b/drivers/dma/dw-edma/dw-edma-v0-core.h
@@ -22,7 +22,7 @@ u32 dw_edma_v0_core_status_abort_int(struct dw_edma *chan, enum dw_edma_dir dir)
 void dw_edma_v0_core_start(struct dw_edma_chunk *chunk, bool first);
 int dw_edma_v0_core_device_config(struct dw_edma_chan *chan);
 /* eDMA debug fs callbacks */
-void dw_edma_v0_core_debugfs_on(struct dw_edma_chip *chip);
-void dw_edma_v0_core_debugfs_off(struct dw_edma_chip *chip);
+void dw_edma_v0_core_debugfs_on(struct dw_edma *dw);
+void dw_edma_v0_core_debugfs_off(struct dw_edma *dw);
 
 #endif /* _DW_EDMA_V0_CORE_H */
diff --git a/drivers/dma/dw-edma/dw-edma-v0-debugfs.c b/drivers/dma/dw-edma/dw-edma-v0-debugfs.c
index 4b3bcffd15ef..b765adb96999 100644
--- a/drivers/dma/dw-edma/dw-edma-v0-debugfs.c
+++ b/drivers/dma/dw-edma/dw-edma-v0-debugfs.c
@@ -54,7 +54,7 @@ struct debugfs_entries {
 static int dw_edma_debugfs_u32_get(void *data, u64 *val)
 {
 	void __iomem *reg = (void __force __iomem *)data;
-	if (dw->mf == EDMA_MF_EDMA_LEGACY &&
+	if (dw->chip->mf == EDMA_MF_EDMA_LEGACY &&
 	    reg >= (void __iomem *)&regs->type.legacy.ch) {
 		void __iomem *ptr = &regs->type.legacy.ch;
 		u32 viewport_sel = 0;
@@ -173,7 +173,7 @@ static void dw_edma_debugfs_regs_wr(struct dentry *dir)
 	nr_entries = ARRAY_SIZE(debugfs_regs);
 	dw_edma_debugfs_create_x32(debugfs_regs, nr_entries, regs_dir);
 
-	if (dw->mf == EDMA_MF_HDMA_COMPAT) {
+	if (dw->chip->mf == EDMA_MF_HDMA_COMPAT) {
 		nr_entries = ARRAY_SIZE(debugfs_unroll_regs);
 		dw_edma_debugfs_create_x32(debugfs_unroll_regs, nr_entries,
 					   regs_dir);
@@ -242,7 +242,7 @@ static void dw_edma_debugfs_regs_rd(struct dentry *dir)
 	nr_entries = ARRAY_SIZE(debugfs_regs);
 	dw_edma_debugfs_create_x32(debugfs_regs, nr_entries, regs_dir);
 
-	if (dw->mf == EDMA_MF_HDMA_COMPAT) {
+	if (dw->chip->mf == EDMA_MF_HDMA_COMPAT) {
 		nr_entries = ARRAY_SIZE(debugfs_unroll_regs);
 		dw_edma_debugfs_create_x32(debugfs_unroll_regs, nr_entries,
 					   regs_dir);
@@ -282,13 +282,13 @@ static void dw_edma_debugfs_regs(void)
 	dw_edma_debugfs_regs_rd(regs_dir);
 }
 
-void dw_edma_v0_debugfs_on(struct dw_edma_chip *chip)
+void dw_edma_v0_debugfs_on(struct dw_edma *_dw)
 {
-	dw = chip->dw;
+	dw = _dw;
 	if (!dw)
 		return;
 
-	regs = dw->rg_region.vaddr;
+	regs = dw->chip->rg_region.vaddr;
 	if (!regs)
 		return;
 
@@ -296,16 +296,16 @@ void dw_edma_v0_debugfs_on(struct dw_edma_chip *chip)
 	if (!dw->debugfs)
 		return;
 
-	debugfs_create_u32("mf", 0444, dw->debugfs, &dw->mf);
+	debugfs_create_u32("mf", 0444, dw->debugfs, &dw->chip->mf);
 	debugfs_create_u16("wr_ch_cnt", 0444, dw->debugfs, &dw->wr_ch_cnt);
 	debugfs_create_u16("rd_ch_cnt", 0444, dw->debugfs, &dw->rd_ch_cnt);
 
 	dw_edma_debugfs_regs();
 }
 
-void dw_edma_v0_debugfs_off(struct dw_edma_chip *chip)
+void dw_edma_v0_debugfs_off(struct dw_edma *_dw)
 {
-	dw = chip->dw;
+	dw = _dw;
 	if (!dw)
 		return;
 
diff --git a/drivers/dma/dw-edma/dw-edma-v0-debugfs.h b/drivers/dma/dw-edma/dw-edma-v0-debugfs.h
index d0ff25a9ea5c..3391b86edf5a 100644
--- a/drivers/dma/dw-edma/dw-edma-v0-debugfs.h
+++ b/drivers/dma/dw-edma/dw-edma-v0-debugfs.h
@@ -12,14 +12,14 @@
 #include <linux/dma/edma.h>
 
 #ifdef CONFIG_DEBUG_FS
-void dw_edma_v0_debugfs_on(struct dw_edma_chip *chip);
-void dw_edma_v0_debugfs_off(struct dw_edma_chip *chip);
+void dw_edma_v0_debugfs_on(struct dw_edma *dw);
+void dw_edma_v0_debugfs_off(struct dw_edma *dw);
 #else
-static inline void dw_edma_v0_debugfs_on(struct dw_edma_chip *chip)
+static inline void dw_edma_v0_debugfs_on(struct dw_edma *dw)
 {
 }
 
-static inline void dw_edma_v0_debugfs_off(struct dw_edma_chip *chip)
+static inline void dw_edma_v0_debugfs_off(struct dw_edma *dw)
 {
 }
 #endif /* CONFIG_DEBUG_FS */
diff --git a/include/linux/dma/edma.h b/include/linux/dma/edma.h
index d4333e721588..6f64e90d5c38 100644
--- a/include/linux/dma/edma.h
+++ b/include/linux/dma/edma.h
@@ -12,17 +12,63 @@
 #include <linux/device.h>
 #include <linux/dmaengine.h>
 
+#define EDMA_MAX_WR_CH                                  8
+#define EDMA_MAX_RD_CH                                  8
+
 struct dw_edma;
 
+struct dw_edma_region {
+	phys_addr_t	paddr;
+	void __iomem	*vaddr;
+	size_t		sz;
+};
+
+struct dw_edma_core_ops {
+	int (*irq_vector)(struct device *dev, unsigned int nr);
+};
+
+enum dw_edma_map_format {
+	EDMA_MF_EDMA_LEGACY = 0x0,
+	EDMA_MF_EDMA_UNROLL = 0x1,
+	EDMA_MF_HDMA_COMPAT = 0x5
+};
+
 /**
  * struct dw_edma_chip - representation of DesignWare eDMA controller hardware
  * @dev:		 struct device of the eDMA controller
  * @id:			 instance ID
- * @dw:			 struct dw_edma that is filed by dw_edma_probe()
+ * @nr_irqs:		 total number of DMA IRQs
+ * @ops			 DMA channel to IRQ number mapping
+ * @wr_ch_cnt		 DMA write channel number
+ * @rd_ch_cnt		 DMA read channel number
+ * @rg_region		 DMA register region
+ * @ll_region_wr	 DMA descriptor link list memory for write channel
+ * @ll_region_rd	 DMA descriptor link list memory for read channel
+ * @dt_region_wr	 DMA data memory for write channel
+ * @dt_region_rd	 DMA data memory for read channel
+ * @mf			 DMA register map format
+ * @dw:			 struct dw_edma that is filled by dw_edma_probe()
  */
 struct dw_edma_chip {
 	struct device		*dev;
 	int			id;
+	int			nr_irqs;
+	const struct dw_edma_core_ops   *ops;
+
+	struct dw_edma_region	rg_region;
+
+	u16			wr_ch_cnt;
+	u16			rd_ch_cnt;
+	/* link list address */
+	struct dw_edma_region	ll_region_wr[EDMA_MAX_WR_CH];
+	struct dw_edma_region	ll_region_rd[EDMA_MAX_RD_CH];
+
+	/* data region */
+	struct dw_edma_region	dt_region_wr[EDMA_MAX_WR_CH];
+	struct dw_edma_region	dt_region_rd[EDMA_MAX_RD_CH];
+
+	enum dw_edma_map_format	mf;
+
 	struct dw_edma		*dw;
 };
 
-- 
cgit 


From e51b3048116a6e10b96bd5298cbcb209b6d729cd Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Tue, 24 May 2022 10:21:54 -0500
Subject: dmaengine: dw-edma: Change rg_region to reg_base in struct
 dw_edma_chip

struct dw_edma_region rg_region included virtual address, physical address
and size information. But only the virtual address is used by EDMA driver.
Change it to void __iomem *reg_base to clean up code.

Link: https://lore.kernel.org/r/20220524152159.2370739-4-Frank.Li@nxp.com
Tested-by: Serge Semin <fancer.lancer@gmail.com>
Tested-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Frank Li <Frank.Li@nxp.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Acked-By: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/dw-edma/dw-edma-pcie.c       | 6 +++---
 drivers/dma/dw-edma/dw-edma-v0-core.c    | 2 +-
 drivers/dma/dw-edma/dw-edma-v0-debugfs.c | 2 +-
 include/linux/dma/edma.h                 | 3 ++-
 4 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/dma/dw-edma/dw-edma-pcie.c b/drivers/dma/dw-edma/dw-edma-pcie.c
index 965513698f3d..9553fb75091d 100644
--- a/drivers/dma/dw-edma/dw-edma-pcie.c
+++ b/drivers/dma/dw-edma/dw-edma-pcie.c
@@ -216,8 +216,8 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 	chip->wr_ch_cnt = vsec_data.wr_ch_cnt;
 	chip->rd_ch_cnt = vsec_data.rd_ch_cnt;
 
-	chip->rg_region.vaddr = pcim_iomap_table(pdev)[vsec_data.rg.bar];
-	if (!chip->rg_region.vaddr)
+	chip->reg_base = pcim_iomap_table(pdev)[vsec_data.rg.bar];
+	if (!chip->reg_base)
 		return -ENOMEM;
 
 	for (i = 0; i < chip->wr_ch_cnt; i++) {
@@ -282,7 +282,7 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 
 	pci_dbg(pdev, "Registers:\tBAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p)\n",
 		vsec_data.rg.bar, vsec_data.rg.off, vsec_data.rg.sz,
-		chip->rg_region.vaddr);
+		chip->reg_base);
 
 
 	for (i = 0; i < chip->wr_ch_cnt; i++) {
diff --git a/drivers/dma/dw-edma/dw-edma-v0-core.c b/drivers/dma/dw-edma/dw-edma-v0-core.c
index 999e03896186..403ade40c1b1 100644
--- a/drivers/dma/dw-edma/dw-edma-v0-core.c
+++ b/drivers/dma/dw-edma/dw-edma-v0-core.c
@@ -25,7 +25,7 @@ enum dw_edma_control {
 
 static inline struct dw_edma_v0_regs __iomem *__dw_regs(struct dw_edma *dw)
 {
-	return dw->chip->rg_region.vaddr;
+	return dw->chip->reg_base;
 }
 
 #define SET_32(dw, name, value)				\
diff --git a/drivers/dma/dw-edma/dw-edma-v0-debugfs.c b/drivers/dma/dw-edma/dw-edma-v0-debugfs.c
index b765adb96999..5226c9014703 100644
--- a/drivers/dma/dw-edma/dw-edma-v0-debugfs.c
+++ b/drivers/dma/dw-edma/dw-edma-v0-debugfs.c
@@ -288,7 +288,7 @@ void dw_edma_v0_debugfs_on(struct dw_edma *_dw)
 	if (!dw)
 		return;
 
-	regs = dw->chip->rg_region.vaddr;
+	regs = dw->chip->reg_base;
 	if (!regs)
 		return;
 
diff --git a/include/linux/dma/edma.h b/include/linux/dma/edma.h
index 6f64e90d5c38..df9ba3ecb437 100644
--- a/include/linux/dma/edma.h
+++ b/include/linux/dma/edma.h
@@ -39,6 +39,7 @@ enum dw_edma_map_format {
  * @id:			 instance ID
  * @nr_irqs:		 total number of DMA IRQs
  * @ops			 DMA channel to IRQ number mapping
+ * @reg_base		 DMA register base address
  * @wr_ch_cnt		 DMA write channel number
  * @rd_ch_cnt		 DMA read channel number
  * @rg_region		 DMA register region
@@ -55,7 +56,7 @@ struct dw_edma_chip {
 	int			nr_irqs;
 	const struct dw_edma_core_ops   *ops;
 
-	struct dw_edma_region	rg_region;
+	void __iomem		*reg_base;
 
 	u16			wr_ch_cnt;
 	u16			rd_ch_cnt;
-- 
cgit 


From 6951ee96c649f6e963b98c11b2b1a92697d3c45c Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Tue, 24 May 2022 10:21:55 -0500
Subject: dmaengine: dw-edma: Rename wr(rd)_ch_cnt to ll_wr(rd)_cnt in struct
 dw_edma_chip

The struct dw_edma contains wr(rd)_ch_cnt fields. The EDMA driver gets
write(read) channel number from register, then saves these into dw_edma.
The wr(rd)_ch_cnt in dw_edma_chip actually means how many link list memory
are available in ll_region_wr(rd)[EDMA_MAX_WR_CH]. Rename it to
ll_wr(rd)_cnt to indicate actual usage.

Link: https://lore.kernel.org/r/20220524152159.2370739-5-Frank.Li@nxp.com
Tested-by: Serge Semin <fancer.lancer@gmail.com>
Tested-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Frank Li <Frank.Li@nxp.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Acked-By: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/dw-edma/dw-edma-core.c |  4 ++--
 drivers/dma/dw-edma/dw-edma-pcie.c | 12 ++++++------
 include/linux/dma/edma.h           |  8 ++++----
 3 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c
index 9a4c96f7d9d9..af037ec61a86 100644
--- a/drivers/dma/dw-edma/dw-edma-core.c
+++ b/drivers/dma/dw-edma/dw-edma-core.c
@@ -918,11 +918,11 @@ int dw_edma_probe(struct dw_edma_chip *chip)
 
 	raw_spin_lock_init(&dw->lock);
 
-	dw->wr_ch_cnt = min_t(u16, chip->wr_ch_cnt,
+	dw->wr_ch_cnt = min_t(u16, chip->ll_wr_cnt,
 			      dw_edma_v0_core_ch_count(dw, EDMA_DIR_WRITE));
 	dw->wr_ch_cnt = min_t(u16, dw->wr_ch_cnt, EDMA_MAX_WR_CH);
 
-	dw->rd_ch_cnt = min_t(u16, chip->rd_ch_cnt,
+	dw->rd_ch_cnt = min_t(u16, chip->ll_rd_cnt,
 			      dw_edma_v0_core_ch_count(dw, EDMA_DIR_READ));
 	dw->rd_ch_cnt = min_t(u16, dw->rd_ch_cnt, EDMA_MAX_RD_CH);
 
diff --git a/drivers/dma/dw-edma/dw-edma-pcie.c b/drivers/dma/dw-edma/dw-edma-pcie.c
index 9553fb75091d..d6b5e2463884 100644
--- a/drivers/dma/dw-edma/dw-edma-pcie.c
+++ b/drivers/dma/dw-edma/dw-edma-pcie.c
@@ -213,14 +213,14 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 	chip->nr_irqs = nr_irqs;
 	chip->ops = &dw_edma_pcie_core_ops;
 
-	chip->wr_ch_cnt = vsec_data.wr_ch_cnt;
-	chip->rd_ch_cnt = vsec_data.rd_ch_cnt;
+	chip->ll_wr_cnt = vsec_data.wr_ch_cnt;
+	chip->ll_rd_cnt = vsec_data.rd_ch_cnt;
 
 	chip->reg_base = pcim_iomap_table(pdev)[vsec_data.rg.bar];
 	if (!chip->reg_base)
 		return -ENOMEM;
 
-	for (i = 0; i < chip->wr_ch_cnt; i++) {
+	for (i = 0; i < chip->ll_wr_cnt; i++) {
 		struct dw_edma_region *ll_region = &chip->ll_region_wr[i];
 		struct dw_edma_region *dt_region = &chip->dt_region_wr[i];
 		struct dw_edma_block *ll_block = &vsec_data.ll_wr[i];
@@ -245,7 +245,7 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 		dt_region->sz = dt_block->sz;
 	}
 
-	for (i = 0; i < chip->rd_ch_cnt; i++) {
+	for (i = 0; i < chip->ll_rd_cnt; i++) {
 		struct dw_edma_region *ll_region = &chip->ll_region_rd[i];
 		struct dw_edma_region *dt_region = &chip->dt_region_rd[i];
 		struct dw_edma_block *ll_block = &vsec_data.ll_rd[i];
@@ -285,7 +285,7 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 		chip->reg_base);
 
 
-	for (i = 0; i < chip->wr_ch_cnt; i++) {
+	for (i = 0; i < chip->ll_wr_cnt; i++) {
 		pci_dbg(pdev, "L. List:\tWRITE CH%.2u, BAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n",
 			i, vsec_data.ll_wr[i].bar,
 			vsec_data.ll_wr[i].off, chip->ll_region_wr[i].sz,
@@ -297,7 +297,7 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 			chip->dt_region_wr[i].vaddr, &chip->dt_region_wr[i].paddr);
 	}
 
-	for (i = 0; i < chip->rd_ch_cnt; i++) {
+	for (i = 0; i < chip->ll_rd_cnt; i++) {
 		pci_dbg(pdev, "L. List:\tREAD CH%.2u, BAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n",
 			i, vsec_data.ll_rd[i].bar,
 			vsec_data.ll_rd[i].off, chip->ll_region_rd[i].sz,
diff --git a/include/linux/dma/edma.h b/include/linux/dma/edma.h
index df9ba3ecb437..fdbbeda170a9 100644
--- a/include/linux/dma/edma.h
+++ b/include/linux/dma/edma.h
@@ -40,8 +40,8 @@ enum dw_edma_map_format {
  * @nr_irqs:		 total number of DMA IRQs
  * @ops			 DMA channel to IRQ number mapping
  * @reg_base		 DMA register base address
- * @wr_ch_cnt		 DMA write channel number
- * @rd_ch_cnt		 DMA read channel number
+ * @ll_wr_cnt		 DMA write link list count
+ * @ll_rd_cnt		 DMA read link list count
  * @rg_region		 DMA register region
  * @ll_region_wr	 DMA descriptor link list memory for write channel
  * @ll_region_rd	 DMA descriptor link list memory for read channel
@@ -58,8 +58,8 @@ struct dw_edma_chip {
 
 	void __iomem		*reg_base;
 
-	u16			wr_ch_cnt;
-	u16			rd_ch_cnt;
+	u16			ll_wr_cnt;
+	u16			ll_rd_cnt;
 	/* link list address */
 	struct dw_edma_region	ll_region_wr[EDMA_MAX_WR_CH];
 	struct dw_edma_region	ll_region_rd[EDMA_MAX_RD_CH];
-- 
cgit 


From d6b03171f9fc8127b3a7adfd4e74ee5d4dae5d14 Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Tue, 24 May 2022 10:21:58 -0500
Subject: dmaengine: dw-edma: Add support for chip-specific flags

Add a "flags" field to the "struct dw_edma_chip" so that the controller
drivers can pass flags that are relevant to the platform.

DW_EDMA_CHIP_LOCAL - Used by the controller drivers accessing eDMA
locally. Local eDMA access doesn't require generating MSIs to the remote.

Link: https://lore.kernel.org/r/20220524152159.2370739-8-Frank.Li@nxp.com
Tested-by: Serge Semin <fancer.lancer@gmail.com>
Tested-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Frank Li <Frank.Li@nxp.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Acked-By: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/dw-edma/dw-edma-v0-core.c |  9 ++++++---
 include/linux/dma/edma.h              | 10 ++++++++++
 2 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/dma/dw-edma/dw-edma-v0-core.c b/drivers/dma/dw-edma/dw-edma-v0-core.c
index 403ade40c1b1..607647dacc29 100644
--- a/drivers/dma/dw-edma/dw-edma-v0-core.c
+++ b/drivers/dma/dw-edma/dw-edma-v0-core.c
@@ -301,6 +301,7 @@ u32 dw_edma_v0_core_status_abort_int(struct dw_edma *dw, enum dw_edma_dir dir)
 static void dw_edma_v0_core_write_chunk(struct dw_edma_chunk *chunk)
 {
 	struct dw_edma_burst *child;
+	struct dw_edma_chan *chan = chunk->chan;
 	struct dw_edma_v0_lli __iomem *lli;
 	struct dw_edma_v0_llp __iomem *llp;
 	u32 control = 0, i = 0;
@@ -314,9 +315,11 @@ static void dw_edma_v0_core_write_chunk(struct dw_edma_chunk *chunk)
 	j = chunk->bursts_alloc;
 	list_for_each_entry(child, &chunk->burst->list, list) {
 		j--;
-		if (!j)
-			control |= (DW_EDMA_V0_LIE | DW_EDMA_V0_RIE);
-
+		if (!j) {
+			control |= DW_EDMA_V0_LIE;
+			if (!(chan->dw->chip->flags & DW_EDMA_CHIP_LOCAL))
+				control |= DW_EDMA_V0_RIE;
+		}
 		/* Channel control */
 		SET_LL_32(&lli[i].control, control);
 		/* Transfer size */
diff --git a/include/linux/dma/edma.h b/include/linux/dma/edma.h
index fdbbeda170a9..7d8062e9c544 100644
--- a/include/linux/dma/edma.h
+++ b/include/linux/dma/edma.h
@@ -33,12 +33,21 @@ enum dw_edma_map_format {
 	EDMA_MF_HDMA_COMPAT = 0x5
 };
 
+/**
+ * enum dw_edma_chip_flags - Flags specific to an eDMA chip
+ * @DW_EDMA_CHIP_LOCAL:		eDMA is used locally by an endpoint
+ */
+enum dw_edma_chip_flags {
+	DW_EDMA_CHIP_LOCAL	= BIT(0),
+};
+
 /**
  * struct dw_edma_chip - representation of DesignWare eDMA controller hardware
  * @dev:		 struct device of the eDMA controller
  * @id:			 instance ID
  * @nr_irqs:		 total number of DMA IRQs
  * @ops			 DMA channel to IRQ number mapping
+ * @flags		 dw_edma_chip_flags
  * @reg_base		 DMA register base address
  * @ll_wr_cnt		 DMA write link list count
  * @ll_rd_cnt		 DMA read link list count
@@ -55,6 +64,7 @@ struct dw_edma_chip {
 	int			id;
 	int			nr_irqs;
 	const struct dw_edma_core_ops   *ops;
+	u32			flags;
 
 	void __iomem		*reg_base;
 
-- 
cgit 


From 3cc624beba6310a8a534fb00841f22445a200d54 Mon Sep 17 00:00:00 2001
From: Ivan Bornyakov <i.bornyakov@metrotek.ru>
Date: Thu, 23 Jun 2022 19:32:44 +0300
Subject: fpga: fpga-mgr: support bitstream offset in image buffer

At the moment FPGA manager core loads to the device entire image
provided to fpga_mgr_load(). But it is not always whole FPGA image
buffer meant to be written to the device. In particular, .dat formatted
image for Microchip MPF contains meta info in the header that is not
meant to be written to the device. This is issue for those low level
drivers that loads data to the device with write() fpga_manager_ops
callback, since write() can be called in iterator over scatter-gather
table, not only linear image buffer. On the other hand, write_sg()
callback is provided with whole image in scatter-gather form and can
decide itself which part should be sent to the device.

Add header_size and data_size to the fpga_image_info struct, add
skip_header to the fpga_manager_ops struct and adjust fpga_mgr_write()
callers with respect to them.

  * info->header_size indicates part at the beginning of image buffer
    that contains some meta info. It is optional and can be 0,
    initialized with mops->initial_header_size.

  * mops->skip_header tells fpga-mgr core whether write should start
    from the beginning of image buffer or at the offset of header_size.

  * info->data_size is the size of bitstream data that is meant to be
    written to the device. It is also optional and can be 0, which
    means bitstream data is up to the end of image buffer.

Also add parse_header() callback to fpga_manager_ops, which purpose is
to set info->header_size and info->data_size. At least
initial_header_size bytes of image buffer will be passed into
parse_header() first time. If it is not enough, parse_header() should
set desired size into info->header_size and return -EAGAIN, then it will
be called again with greater part of image buffer on the input.

Suggested-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Ivan Bornyakov <i.bornyakov@metrotek.ru>
Acked-by: Xu Yilun <yilun.xu@intel.com>
Link: https://lore.kernel.org/r/20220623163248.3672-2-i.bornyakov@metrotek.ru
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
---
 drivers/fpga/fpga-mgr.c       | 223 +++++++++++++++++++++++++++++++++++++-----
 include/linux/fpga/fpga-mgr.h |  24 ++++-
 2 files changed, 220 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/drivers/fpga/fpga-mgr.c b/drivers/fpga/fpga-mgr.c
index 08dc85fcd511..a0fa0a2cb8af 100644
--- a/drivers/fpga/fpga-mgr.c
+++ b/drivers/fpga/fpga-mgr.c
@@ -74,6 +74,15 @@ static inline int fpga_mgr_write_complete(struct fpga_manager *mgr,
 	return 0;
 }
 
+static inline int fpga_mgr_parse_header(struct fpga_manager *mgr,
+					struct fpga_image_info *info,
+					const char *buf, size_t count)
+{
+	if (mgr->mops->parse_header)
+		return mgr->mops->parse_header(mgr, info, buf, count);
+	return 0;
+}
+
 static inline int fpga_mgr_write_init(struct fpga_manager *mgr,
 				      struct fpga_image_info *info,
 				      const char *buf, size_t count)
@@ -136,24 +145,141 @@ void fpga_image_info_free(struct fpga_image_info *info)
 EXPORT_SYMBOL_GPL(fpga_image_info_free);
 
 /*
- * Call the low level driver's write_init function.  This will do the
+ * Call the low level driver's parse_header function with entire FPGA image
+ * buffer on the input. This will set info->header_size and info->data_size.
+ */
+static int fpga_mgr_parse_header_mapped(struct fpga_manager *mgr,
+					struct fpga_image_info *info,
+					const char *buf, size_t count)
+{
+	int ret;
+
+	mgr->state = FPGA_MGR_STATE_PARSE_HEADER;
+	ret = fpga_mgr_parse_header(mgr, info, buf, count);
+
+	if (info->header_size + info->data_size > count) {
+		dev_err(&mgr->dev, "Bitsream data outruns FPGA image\n");
+		ret = -EINVAL;
+	}
+
+	if (ret) {
+		dev_err(&mgr->dev, "Error while parsing FPGA image header\n");
+		mgr->state = FPGA_MGR_STATE_PARSE_HEADER_ERR;
+	}
+
+	return ret;
+}
+
+/*
+ * Call the low level driver's parse_header function with first fragment of
+ * scattered FPGA image on the input. If header fits first fragment,
+ * parse_header will set info->header_size and info->data_size. If it is not,
+ * parse_header will set desired size to info->header_size and -EAGAIN will be
+ * returned.
+ */
+static int fpga_mgr_parse_header_sg_first(struct fpga_manager *mgr,
+					  struct fpga_image_info *info,
+					  struct sg_table *sgt)
+{
+	struct sg_mapping_iter miter;
+	int ret;
+
+	mgr->state = FPGA_MGR_STATE_PARSE_HEADER;
+
+	sg_miter_start(&miter, sgt->sgl, sgt->nents, SG_MITER_FROM_SG);
+	if (sg_miter_next(&miter) &&
+	    miter.length >= info->header_size)
+		ret = fpga_mgr_parse_header(mgr, info, miter.addr, miter.length);
+	else
+		ret = -EAGAIN;
+	sg_miter_stop(&miter);
+
+	if (ret && ret != -EAGAIN) {
+		dev_err(&mgr->dev, "Error while parsing FPGA image header\n");
+		mgr->state = FPGA_MGR_STATE_PARSE_HEADER_ERR;
+	}
+
+	return ret;
+}
+
+/*
+ * Copy scattered FPGA image fragments to temporary buffer and call the
+ * low level driver's parse_header function. This should be called after
+ * fpga_mgr_parse_header_sg_first() returned -EAGAIN. In case of success,
+ * pointer to the newly allocated image header copy will be returned and
+ * its size will be set into *ret_size. Returned buffer needs to be freed.
+ */
+static void *fpga_mgr_parse_header_sg(struct fpga_manager *mgr,
+				      struct fpga_image_info *info,
+				      struct sg_table *sgt, size_t *ret_size)
+{
+	size_t len, new_header_size, header_size = 0;
+	char *new_buf, *buf = NULL;
+	int ret;
+
+	do {
+		new_header_size = info->header_size;
+		if (new_header_size <= header_size) {
+			dev_err(&mgr->dev, "Requested invalid header size\n");
+			ret = -EFAULT;
+			break;
+		}
+
+		new_buf = krealloc(buf, new_header_size, GFP_KERNEL);
+		if (!new_buf) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		buf = new_buf;
+
+		len = sg_pcopy_to_buffer(sgt->sgl, sgt->nents,
+					 buf + header_size,
+					 new_header_size - header_size,
+					 header_size);
+		if (len != new_header_size - header_size) {
+			ret = -EFAULT;
+			break;
+		}
+
+		header_size = new_header_size;
+		ret = fpga_mgr_parse_header(mgr, info, buf, header_size);
+	} while (ret == -EAGAIN);
+
+	if (ret) {
+		dev_err(&mgr->dev, "Error while parsing FPGA image header\n");
+		mgr->state = FPGA_MGR_STATE_PARSE_HEADER_ERR;
+		kfree(buf);
+		buf = ERR_PTR(ret);
+	}
+
+	*ret_size = header_size;
+
+	return buf;
+}
+
+/*
+ * Call the low level driver's write_init function. This will do the
  * device-specific things to get the FPGA into the state where it is ready to
- * receive an FPGA image. The low level driver only gets to see the first
- * initial_header_size bytes in the buffer.
+ * receive an FPGA image. The low level driver gets to see at least first
+ * info->header_size bytes in the buffer. If info->header_size is 0,
+ * write_init will not get any bytes of image buffer.
  */
 static int fpga_mgr_write_init_buf(struct fpga_manager *mgr,
 				   struct fpga_image_info *info,
 				   const char *buf, size_t count)
 {
+	size_t header_size = info->header_size;
 	int ret;
 
 	mgr->state = FPGA_MGR_STATE_WRITE_INIT;
-	if (!mgr->mops->initial_header_size) {
+
+	if (header_size > count)
+		ret = -EINVAL;
+	else if (!header_size)
 		ret = fpga_mgr_write_init(mgr, info, NULL, 0);
-	} else {
-		count = min(mgr->mops->initial_header_size, count);
+	else
 		ret = fpga_mgr_write_init(mgr, info, buf, count);
-	}
 
 	if (ret) {
 		dev_err(&mgr->dev, "Error preparing FPGA for writing\n");
@@ -164,39 +290,50 @@ static int fpga_mgr_write_init_buf(struct fpga_manager *mgr,
 	return 0;
 }
 
-static int fpga_mgr_write_init_sg(struct fpga_manager *mgr,
-				  struct fpga_image_info *info,
-				  struct sg_table *sgt)
+static int fpga_mgr_prepare_sg(struct fpga_manager *mgr,
+			       struct fpga_image_info *info,
+			       struct sg_table *sgt)
 {
 	struct sg_mapping_iter miter;
 	size_t len;
 	char *buf;
 	int ret;
 
-	if (!mgr->mops->initial_header_size)
+	/* Short path. Low level driver don't care about image header. */
+	if (!mgr->mops->initial_header_size && !mgr->mops->parse_header)
 		return fpga_mgr_write_init_buf(mgr, info, NULL, 0);
 
 	/*
 	 * First try to use miter to map the first fragment to access the
 	 * header, this is the typical path.
 	 */
-	sg_miter_start(&miter, sgt->sgl, sgt->nents, SG_MITER_FROM_SG);
-	if (sg_miter_next(&miter) &&
-	    miter.length >= mgr->mops->initial_header_size) {
-		ret = fpga_mgr_write_init_buf(mgr, info, miter.addr,
-					      miter.length);
+	ret = fpga_mgr_parse_header_sg_first(mgr, info, sgt);
+	/* If 0, header fits first fragment, call write_init on it */
+	if (!ret) {
+		sg_miter_start(&miter, sgt->sgl, sgt->nents, SG_MITER_FROM_SG);
+		if (sg_miter_next(&miter)) {
+			ret = fpga_mgr_write_init_buf(mgr, info, miter.addr,
+						      miter.length);
+			sg_miter_stop(&miter);
+			return ret;
+		}
 		sg_miter_stop(&miter);
+	/*
+	 * If -EAGAIN, more sg buffer is needed,
+	 * otherwise an error has occurred.
+	 */
+	} else if (ret != -EAGAIN) {
 		return ret;
 	}
-	sg_miter_stop(&miter);
 
-	/* Otherwise copy the fragments into temporary memory. */
-	buf = kmalloc(mgr->mops->initial_header_size, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
+	/*
+	 * Copy the fragments into temporary memory.
+	 * Copying is done inside fpga_mgr_parse_header_sg().
+	 */
+	buf = fpga_mgr_parse_header_sg(mgr, info, sgt, &len);
+	if (IS_ERR(buf))
+		return PTR_ERR(buf);
 
-	len = sg_copy_to_buffer(sgt->sgl, sgt->nents, buf,
-				mgr->mops->initial_header_size);
 	ret = fpga_mgr_write_init_buf(mgr, info, buf, len);
 
 	kfree(buf);
@@ -227,7 +364,7 @@ static int fpga_mgr_buf_load_sg(struct fpga_manager *mgr,
 {
 	int ret;
 
-	ret = fpga_mgr_write_init_sg(mgr, info, sgt);
+	ret = fpga_mgr_prepare_sg(mgr, info, sgt);
 	if (ret)
 		return ret;
 
@@ -236,17 +373,35 @@ static int fpga_mgr_buf_load_sg(struct fpga_manager *mgr,
 	if (mgr->mops->write_sg) {
 		ret = fpga_mgr_write_sg(mgr, sgt);
 	} else {
+		size_t length, count = 0, data_size = info->data_size;
 		struct sg_mapping_iter miter;
 
 		sg_miter_start(&miter, sgt->sgl, sgt->nents, SG_MITER_FROM_SG);
+
+		if (mgr->mops->skip_header &&
+		    !sg_miter_skip(&miter, info->header_size)) {
+			ret = -EINVAL;
+			goto out;
+		}
+
 		while (sg_miter_next(&miter)) {
-			ret = fpga_mgr_write(mgr, miter.addr, miter.length);
+			if (data_size)
+				length = min(miter.length, data_size - count);
+			else
+				length = miter.length;
+
+			ret = fpga_mgr_write(mgr, miter.addr, length);
 			if (ret)
 				break;
+
+			count += length;
+			if (data_size && count >= data_size)
+				break;
 		}
 		sg_miter_stop(&miter);
 	}
 
+out:
 	if (ret) {
 		dev_err(&mgr->dev, "Error while writing image data to FPGA\n");
 		mgr->state = FPGA_MGR_STATE_WRITE_ERR;
@@ -262,10 +417,22 @@ static int fpga_mgr_buf_load_mapped(struct fpga_manager *mgr,
 {
 	int ret;
 
+	ret = fpga_mgr_parse_header_mapped(mgr, info, buf, count);
+	if (ret)
+		return ret;
+
 	ret = fpga_mgr_write_init_buf(mgr, info, buf, count);
 	if (ret)
 		return ret;
 
+	if (mgr->mops->skip_header) {
+		buf += info->header_size;
+		count -= info->header_size;
+	}
+
+	if (info->data_size)
+		count = info->data_size;
+
 	/*
 	 * Write the FPGA image to the FPGA.
 	 */
@@ -404,6 +571,8 @@ static int fpga_mgr_firmware_load(struct fpga_manager *mgr,
  */
 int fpga_mgr_load(struct fpga_manager *mgr, struct fpga_image_info *info)
 {
+	info->header_size = mgr->mops->initial_header_size;
+
 	if (info->sgt)
 		return fpga_mgr_buf_load_sg(mgr, info, info->sgt);
 	if (info->buf && info->count)
@@ -424,6 +593,10 @@ static const char * const state_str[] = {
 	[FPGA_MGR_STATE_FIRMWARE_REQ] =		"firmware request",
 	[FPGA_MGR_STATE_FIRMWARE_REQ_ERR] =	"firmware request error",
 
+	/* Parse FPGA image header */
+	[FPGA_MGR_STATE_PARSE_HEADER] =		"parse header",
+	[FPGA_MGR_STATE_PARSE_HEADER_ERR] =	"parse header error",
+
 	/* Preparing FPGA to receive image */
 	[FPGA_MGR_STATE_WRITE_INIT] =		"write init",
 	[FPGA_MGR_STATE_WRITE_INIT_ERR] =	"write init error",
diff --git a/include/linux/fpga/fpga-mgr.h b/include/linux/fpga/fpga-mgr.h
index 0f9468771bb9..54f63459efd6 100644
--- a/include/linux/fpga/fpga-mgr.h
+++ b/include/linux/fpga/fpga-mgr.h
@@ -22,6 +22,8 @@ struct sg_table;
  * @FPGA_MGR_STATE_RESET: FPGA in reset state
  * @FPGA_MGR_STATE_FIRMWARE_REQ: firmware request in progress
  * @FPGA_MGR_STATE_FIRMWARE_REQ_ERR: firmware request failed
+ * @FPGA_MGR_STATE_PARSE_HEADER: parse FPGA image header
+ * @FPGA_MGR_STATE_PARSE_HEADER_ERR: Error during PARSE_HEADER stage
  * @FPGA_MGR_STATE_WRITE_INIT: preparing FPGA for programming
  * @FPGA_MGR_STATE_WRITE_INIT_ERR: Error during WRITE_INIT stage
  * @FPGA_MGR_STATE_WRITE: writing image to FPGA
@@ -41,7 +43,9 @@ enum fpga_mgr_states {
 	FPGA_MGR_STATE_FIRMWARE_REQ,
 	FPGA_MGR_STATE_FIRMWARE_REQ_ERR,
 
-	/* write sequence: init, write, complete */
+	/* write sequence: parse header, init, write, complete */
+	FPGA_MGR_STATE_PARSE_HEADER,
+	FPGA_MGR_STATE_PARSE_HEADER_ERR,
 	FPGA_MGR_STATE_WRITE_INIT,
 	FPGA_MGR_STATE_WRITE_INIT_ERR,
 	FPGA_MGR_STATE_WRITE,
@@ -85,6 +89,9 @@ enum fpga_mgr_states {
  * @sgt: scatter/gather table containing FPGA image
  * @buf: contiguous buffer containing FPGA image
  * @count: size of buf
+ * @header_size: size of image header.
+ * @data_size: size of image data to be sent to the device. If not specified,
+ *	whole image will be used. Header may be skipped in either case.
  * @region_id: id of target region
  * @dev: device that owns this
  * @overlay: Device Tree overlay
@@ -98,6 +105,8 @@ struct fpga_image_info {
 	struct sg_table *sgt;
 	const char *buf;
 	size_t count;
+	size_t header_size;
+	size_t data_size;
 	int region_id;
 	struct device *dev;
 #ifdef CONFIG_OF
@@ -137,9 +146,16 @@ struct fpga_manager_info {
 
 /**
  * struct fpga_manager_ops - ops for low level fpga manager drivers
- * @initial_header_size: Maximum number of bytes that should be passed into write_init
+ * @initial_header_size: minimum number of bytes that should be passed into
+ *	parse_header and write_init.
+ * @skip_header: bool flag to tell fpga-mgr core whether it should skip
+ *	info->header_size part at the beginning of the image when invoking
+ *	write callback.
  * @state: returns an enum value of the FPGA's state
  * @status: returns status of the FPGA, including reconfiguration error code
+ * @parse_header: parse FPGA image header to set info->header_size and
+ *	info->data_size. In case the input buffer is not large enough, set
+ *	required size to info->header_size and return -EAGAIN.
  * @write_init: prepare the FPGA to receive configuration data
  * @write: write count bytes of configuration data to the FPGA
  * @write_sg: write the scatter list of configuration data to the FPGA
@@ -153,8 +169,12 @@ struct fpga_manager_info {
  */
 struct fpga_manager_ops {
 	size_t initial_header_size;
+	bool skip_header;
 	enum fpga_mgr_states (*state)(struct fpga_manager *mgr);
 	u64 (*status)(struct fpga_manager *mgr);
+	int (*parse_header)(struct fpga_manager *mgr,
+			    struct fpga_image_info *info,
+			    const char *buf, size_t count);
 	int (*write_init)(struct fpga_manager *mgr,
 			  struct fpga_image_info *info,
 			  const char *buf, size_t count);
-- 
cgit 


From fdfd42892f311e2b3695852036e5be23661dc590 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Wed, 15 Jun 2022 17:41:41 +0200
Subject: jump_label: mips: move module NOP patching into arch code

MIPS is the only remaining architecture that needs to patch jump label
NOP encodings to initialize them at load time. So let's move the module
patching part of that from generic code into arch/mips, and drop it from
the others.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220615154142.1574619-3-ardb@kernel.org
---
 arch/mips/kernel/jump_label.c | 19 +++++++++++++++++++
 arch/mips/kernel/module.c     |  5 +++--
 arch/s390/kernel/module.c     |  1 -
 arch/sparc/kernel/module.c    |  3 ---
 arch/x86/kernel/module.c      |  3 ---
 include/linux/jump_label.h    |  7 +------
 kernel/jump_label.c           | 27 +--------------------------
 7 files changed, 24 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/arch/mips/kernel/jump_label.c b/arch/mips/kernel/jump_label.c
index 662c8db9f45b..71a882c8c6eb 100644
--- a/arch/mips/kernel/jump_label.c
+++ b/arch/mips/kernel/jump_label.c
@@ -88,3 +88,22 @@ void arch_jump_label_transform(struct jump_entry *e,
 
 	mutex_unlock(&text_mutex);
 }
+
+#ifdef CONFIG_MODULES
+void jump_label_apply_nops(struct module *mod)
+{
+	struct jump_entry *iter_start = mod->jump_entries;
+	struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
+	struct jump_entry *iter;
+
+	/* if the module doesn't have jump label entries, just return */
+	if (iter_start == iter_stop)
+		return;
+
+	for (iter = iter_start; iter < iter_stop; iter++) {
+		/* Only write NOPs for arch_branch_static(). */
+		if (jump_label_init_type(iter) == JUMP_LABEL_NOP)
+			arch_jump_label_transform(iter, JUMP_LABEL_NOP);
+	}
+}
+#endif
diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c
index 14f46d17500a..0c936cbf20c5 100644
--- a/arch/mips/kernel/module.c
+++ b/arch/mips/kernel/module.c
@@ -21,6 +21,7 @@
 #include <linux/spinlock.h>
 #include <linux/jump_label.h>
 
+extern void jump_label_apply_nops(struct module *mod);
 
 struct mips_hi16 {
 	struct mips_hi16 *next;
@@ -428,8 +429,8 @@ int module_finalize(const Elf_Ehdr *hdr,
 	const Elf_Shdr *s;
 	char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
 
-	/* Make jump label nops. */
-	jump_label_apply_nops(me);
+	if (IS_ENABLED(CONFIG_JUMP_LABEL))
+		jump_label_apply_nops(me);
 
 	INIT_LIST_HEAD(&me->arch.dbe_list);
 	for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index 26125a9c436d..2d159b32885b 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -548,6 +548,5 @@ int module_finalize(const Elf_Ehdr *hdr,
 #endif /* CONFIG_FUNCTION_TRACER */
 	}
 
-	jump_label_apply_nops(me);
 	return 0;
 }
diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c
index df39580f398d..66c45a2764bc 100644
--- a/arch/sparc/kernel/module.c
+++ b/arch/sparc/kernel/module.c
@@ -208,9 +208,6 @@ int module_finalize(const Elf_Ehdr *hdr,
 		    const Elf_Shdr *sechdrs,
 		    struct module *me)
 {
-	/* make jump label nops */
-	jump_label_apply_nops(me);
-
 	do_patch_sections(hdr, sechdrs);
 
 	/* Cheetah's I-cache is fully coherent.  */
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index b98ffcf4d250..95b9cf25d4bd 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -304,9 +304,6 @@ int module_finalize(const Elf_Ehdr *hdr,
 					    tseg, tseg + text->sh_size);
 	}
 
-	/* make jump label nops */
-	jump_label_apply_nops(me);
-
 	if (orc && orc_ip)
 		unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size,
 				   (void *)orc->sh_addr, orc->sh_size);
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index bf1eef337a07..2003a0935478 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -230,12 +230,12 @@ extern void static_key_slow_inc(struct static_key *key);
 extern void static_key_slow_dec(struct static_key *key);
 extern void static_key_slow_inc_cpuslocked(struct static_key *key);
 extern void static_key_slow_dec_cpuslocked(struct static_key *key);
-extern void jump_label_apply_nops(struct module *mod);
 extern int static_key_count(struct static_key *key);
 extern void static_key_enable(struct static_key *key);
 extern void static_key_disable(struct static_key *key);
 extern void static_key_enable_cpuslocked(struct static_key *key);
 extern void static_key_disable_cpuslocked(struct static_key *key);
+extern enum jump_label_type jump_label_init_type(struct jump_entry *entry);
 
 /*
  * We should be using ATOMIC_INIT() for initializing .enabled, but
@@ -303,11 +303,6 @@ static inline int jump_label_text_reserved(void *start, void *end)
 static inline void jump_label_lock(void) {}
 static inline void jump_label_unlock(void) {}
 
-static inline int jump_label_apply_nops(struct module *mod)
-{
-	return 0;
-}
-
 static inline void static_key_enable(struct static_key *key)
 {
 	STATIC_KEY_CHECK_USE(key);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index b156e152d6b4..b1ac2948be79 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -508,7 +508,7 @@ void __init jump_label_init(void)
 
 #ifdef CONFIG_MODULES
 
-static enum jump_label_type jump_label_init_type(struct jump_entry *entry)
+enum jump_label_type jump_label_init_type(struct jump_entry *entry)
 {
 	struct static_key *key = jump_entry_key(entry);
 	bool type = static_key_type(key);
@@ -596,31 +596,6 @@ static void __jump_label_mod_update(struct static_key *key)
 	}
 }
 
-/***
- * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
- * @mod: module to patch
- *
- * Allow for run-time selection of the optimal nops. Before the module
- * loads patch these with arch_get_jump_label_nop(), which is specified by
- * the arch specific jump label code.
- */
-void jump_label_apply_nops(struct module *mod)
-{
-	struct jump_entry *iter_start = mod->jump_entries;
-	struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
-	struct jump_entry *iter;
-
-	/* if the module doesn't have jump label entries, just return */
-	if (iter_start == iter_stop)
-		return;
-
-	for (iter = iter_start; iter < iter_stop; iter++) {
-		/* Only write NOPs for arch_branch_static(). */
-		if (jump_label_init_type(iter) == JUMP_LABEL_NOP)
-			arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
-	}
-}
-
 static int jump_label_add_module(struct module *mod)
 {
 	struct jump_entry *iter_start = mod->jump_entries;
-- 
cgit 


From 7e6b9db27de9f69a705c1a046d45882c768e16c3 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Wed, 15 Jun 2022 17:41:42 +0200
Subject: jump_label: make initial NOP patching the special case

Instead of defaulting to patching NOP opcodes at init time, and leaving
it to the architectures to override this if this is not needed, switch
to a model where doing nothing is the default. This is the common case
by far, as only MIPS requires NOP patching at init time. On all other
architectures, the correct encodings are emitted by the compiler and so
no initial patching is needed.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220615154142.1574619-4-ardb@kernel.org
---
 Documentation/staging/static-keys.rst |  3 ---
 arch/arc/kernel/jump_label.c          | 13 -------------
 arch/arm/kernel/jump_label.c          |  6 ------
 arch/arm64/kernel/jump_label.c        | 11 -----------
 arch/mips/include/asm/jump_label.h    |  2 ++
 arch/parisc/kernel/jump_label.c       | 11 -----------
 arch/riscv/kernel/jump_label.c        | 12 ------------
 arch/s390/kernel/jump_label.c         |  5 -----
 arch/x86/kernel/jump_label.c          | 13 -------------
 include/linux/jump_label.h            |  2 --
 kernel/jump_label.c                   | 14 +++++---------
 11 files changed, 7 insertions(+), 85 deletions(-)

(limited to 'include')

diff --git a/Documentation/staging/static-keys.rst b/Documentation/staging/static-keys.rst
index 38290b9f25eb..b0a519f456cf 100644
--- a/Documentation/staging/static-keys.rst
+++ b/Documentation/staging/static-keys.rst
@@ -201,9 +201,6 @@ static_key->entry field makes use of the two least significant bits.
 * ``void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type)``,
     see: arch/x86/kernel/jump_label.c
 
-* ``__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type)``,
-    see: arch/x86/kernel/jump_label.c
-
 * ``struct jump_entry``,
     see: arch/x86/include/asm/jump_label.h
 
diff --git a/arch/arc/kernel/jump_label.c b/arch/arc/kernel/jump_label.c
index b8600dc325b5..70b74a5d047b 100644
--- a/arch/arc/kernel/jump_label.c
+++ b/arch/arc/kernel/jump_label.c
@@ -96,19 +96,6 @@ void arch_jump_label_transform(struct jump_entry *entry,
 	flush_icache_range(entry->code, entry->code + JUMP_LABEL_NOP_SIZE);
 }
 
-void arch_jump_label_transform_static(struct jump_entry *entry,
-				      enum jump_label_type type)
-{
-	/*
-	 * We use only one NOP type (1x, 4 byte) in arch_static_branch, so
-	 * there's no need to patch an identical NOP over the top of it here.
-	 * The generic code calls 'arch_jump_label_transform' if the NOP needs
-	 * to be replaced by a branch, so 'arch_jump_label_transform_static' is
-	 * never called with type other than JUMP_LABEL_NOP.
-	 */
-	BUG_ON(type != JUMP_LABEL_NOP);
-}
-
 #ifdef CONFIG_ARC_DBG_JUMP_LABEL
 #define SELFTEST_MSG	"ARC: instruction generation self-test: "
 
diff --git a/arch/arm/kernel/jump_label.c b/arch/arm/kernel/jump_label.c
index 303b3ab87f7e..eb9c24b6e8e2 100644
--- a/arch/arm/kernel/jump_label.c
+++ b/arch/arm/kernel/jump_label.c
@@ -27,9 +27,3 @@ void arch_jump_label_transform(struct jump_entry *entry,
 {
 	__arch_jump_label_transform(entry, type, false);
 }
-
-void arch_jump_label_transform_static(struct jump_entry *entry,
-				      enum jump_label_type type)
-{
-	__arch_jump_label_transform(entry, type, true);
-}
diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c
index fc98037e1220..faf88ec9c48e 100644
--- a/arch/arm64/kernel/jump_label.c
+++ b/arch/arm64/kernel/jump_label.c
@@ -26,14 +26,3 @@ void arch_jump_label_transform(struct jump_entry *entry,
 
 	aarch64_insn_patch_text_nosync(addr, insn);
 }
-
-void arch_jump_label_transform_static(struct jump_entry *entry,
-				      enum jump_label_type type)
-{
-	/*
-	 * We use the architected A64 NOP in arch_static_branch, so there's no
-	 * need to patch an identical A64 NOP over the top of it here. The core
-	 * will call arch_jump_label_transform from a module notifier if the
-	 * NOP needs to be replaced by a branch.
-	 */
-}
diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h
index 3185fd3220ec..c5c6864e64bc 100644
--- a/arch/mips/include/asm/jump_label.h
+++ b/arch/mips/include/asm/jump_label.h
@@ -8,6 +8,8 @@
 #ifndef _ASM_MIPS_JUMP_LABEL_H
 #define _ASM_MIPS_JUMP_LABEL_H
 
+#define arch_jump_label_transform_static arch_jump_label_transform
+
 #ifndef __ASSEMBLY__
 
 #include <linux/types.h>
diff --git a/arch/parisc/kernel/jump_label.c b/arch/parisc/kernel/jump_label.c
index d2f3cb12e282..e253b134500d 100644
--- a/arch/parisc/kernel/jump_label.c
+++ b/arch/parisc/kernel/jump_label.c
@@ -42,14 +42,3 @@ void arch_jump_label_transform(struct jump_entry *entry,
 
 	patch_text(addr, insn);
 }
-
-void arch_jump_label_transform_static(struct jump_entry *entry,
-				      enum jump_label_type type)
-{
-	/*
-	 * We use the architected NOP in arch_static_branch, so there's no
-	 * need to patch an identical NOP over the top of it here. The core
-	 * will call arch_jump_label_transform from a module notifier if the
-	 * NOP needs to be replaced by a branch.
-	 */
-}
diff --git a/arch/riscv/kernel/jump_label.c b/arch/riscv/kernel/jump_label.c
index 20e09056d141..e6694759dbd0 100644
--- a/arch/riscv/kernel/jump_label.c
+++ b/arch/riscv/kernel/jump_label.c
@@ -39,15 +39,3 @@ void arch_jump_label_transform(struct jump_entry *entry,
 	patch_text_nosync(addr, &insn, sizeof(insn));
 	mutex_unlock(&text_mutex);
 }
-
-void arch_jump_label_transform_static(struct jump_entry *entry,
-				      enum jump_label_type type)
-{
-	/*
-	 * We use the same instructions in the arch_static_branch and
-	 * arch_static_branch_jump inline functions, so there's no
-	 * need to patch them up here.
-	 * The core will call arch_jump_label_transform  when those
-	 * instructions need to be replaced.
-	 */
-}
diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c
index d764f0d229ab..e808bb8bc0da 100644
--- a/arch/s390/kernel/jump_label.c
+++ b/arch/s390/kernel/jump_label.c
@@ -80,8 +80,3 @@ void arch_jump_label_transform_apply(void)
 {
 	text_poke_sync();
 }
-
-void __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
-						       enum jump_label_type type)
-{
-}
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 68f091ba8443..f5b8ef02d172 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -146,16 +146,3 @@ void arch_jump_label_transform_apply(void)
 	text_poke_finish();
 	mutex_unlock(&text_mutex);
 }
-
-static enum {
-	JL_STATE_START,
-	JL_STATE_NO_UPDATE,
-	JL_STATE_UPDATE,
-} jlstate __initdata_or_module = JL_STATE_START;
-
-__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
-				      enum jump_label_type type)
-{
-	if (jlstate == JL_STATE_UPDATE)
-		jump_label_transform(entry, type, 1);
-}
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 2003a0935478..570831ca9951 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -220,8 +220,6 @@ extern void jump_label_lock(void);
 extern void jump_label_unlock(void);
 extern void arch_jump_label_transform(struct jump_entry *entry,
 				      enum jump_label_type type);
-extern void arch_jump_label_transform_static(struct jump_entry *entry,
-					     enum jump_label_type type);
 extern bool arch_jump_label_transform_queue(struct jump_entry *entry,
 					    enum jump_label_type type);
 extern void arch_jump_label_transform_apply(void);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index b1ac2948be79..714ac4c3b556 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -332,17 +332,13 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
 	return 0;
 }
 
-/*
- * Update code which is definitely not currently executing.
- * Architectures which need heavyweight synchronization to modify
- * running code can override this to make the non-live update case
- * cheaper.
- */
-void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
-					    enum jump_label_type type)
+#ifndef arch_jump_label_transform_static
+static void arch_jump_label_transform_static(struct jump_entry *entry,
+					     enum jump_label_type type)
 {
-	arch_jump_label_transform(entry, type);
+	/* nothing to do on most architectures */
 }
+#endif
 
 static inline struct jump_entry *static_key_entries(struct static_key *key)
 {
-- 
cgit 


From eae6d58d67d9739be5f7ae2dbead1d0ef6528243 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 17 Jun 2022 15:26:06 +0200
Subject: locking/lockdep: Fix lockdep_init_map_*() confusion

Commit dfd5e3f5fe27 ("locking/lockdep: Mark local_lock_t") added yet
another lockdep_init_map_*() variant, but forgot to update all the
existing users of the most complicated version.

This could lead to a loss of lock_type and hence an incorrect report.
Given the relative rarity of both local_lock and these annotations,
this is unlikely to happen in practise, still, best fix things.

Fixes: dfd5e3f5fe27 ("locking/lockdep: Mark local_lock_t")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/YqyEDtoan20K0CVD@worktop.programming.kicks-ass.net
---
 include/linux/lockdep.h  | 30 +++++++++++++++++-------------
 kernel/locking/lockdep.c |  7 ++++---
 2 files changed, 21 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index b6829b970093..1f1099dac3f0 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -188,7 +188,7 @@ static inline void
 lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
 		       struct lock_class_key *key, int subclass, u8 inner, u8 outer)
 {
-	lockdep_init_map_type(lock, name, key, subclass, inner, LD_WAIT_INV, LD_LOCK_NORMAL);
+	lockdep_init_map_type(lock, name, key, subclass, inner, outer, LD_LOCK_NORMAL);
 }
 
 static inline void
@@ -211,24 +211,28 @@ static inline void lockdep_init_map(struct lockdep_map *lock, const char *name,
  * or they are too narrow (they suffer from a false class-split):
  */
 #define lockdep_set_class(lock, key)				\
-	lockdep_init_map_waits(&(lock)->dep_map, #key, key, 0,	\
-			       (lock)->dep_map.wait_type_inner,	\
-			       (lock)->dep_map.wait_type_outer)
+	lockdep_init_map_type(&(lock)->dep_map, #key, key, 0,	\
+			      (lock)->dep_map.wait_type_inner,	\
+			      (lock)->dep_map.wait_type_outer,	\
+			      (lock)->dep_map.lock_type)
 
 #define lockdep_set_class_and_name(lock, key, name)		\
-	lockdep_init_map_waits(&(lock)->dep_map, name, key, 0,	\
-			       (lock)->dep_map.wait_type_inner,	\
-			       (lock)->dep_map.wait_type_outer)
+	lockdep_init_map_type(&(lock)->dep_map, name, key, 0,	\
+			      (lock)->dep_map.wait_type_inner,	\
+			      (lock)->dep_map.wait_type_outer,	\
+			      (lock)->dep_map.lock_type)
 
 #define lockdep_set_class_and_subclass(lock, key, sub)		\
-	lockdep_init_map_waits(&(lock)->dep_map, #key, key, sub,\
-			       (lock)->dep_map.wait_type_inner,	\
-			       (lock)->dep_map.wait_type_outer)
+	lockdep_init_map_type(&(lock)->dep_map, #key, key, sub,	\
+			      (lock)->dep_map.wait_type_inner,	\
+			      (lock)->dep_map.wait_type_outer,	\
+			      (lock)->dep_map.lock_type)
 
 #define lockdep_set_subclass(lock, sub)					\
-	lockdep_init_map_waits(&(lock)->dep_map, #lock, (lock)->dep_map.key, sub,\
-			       (lock)->dep_map.wait_type_inner,		\
-			       (lock)->dep_map.wait_type_outer)
+	lockdep_init_map_type(&(lock)->dep_map, #lock, (lock)->dep_map.key, sub,\
+			      (lock)->dep_map.wait_type_inner,		\
+			      (lock)->dep_map.wait_type_outer,		\
+			      (lock)->dep_map.lock_type)
 
 #define lockdep_set_novalidate_class(lock) \
 	lockdep_set_class_and_name(lock, &__lockdep_no_validate__, #lock)
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index f06b91ca6482..e2f179491b08 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -5238,9 +5238,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
 		return 0;
 	}
 
-	lockdep_init_map_waits(lock, name, key, 0,
-			       lock->wait_type_inner,
-			       lock->wait_type_outer);
+	lockdep_init_map_type(lock, name, key, 0,
+			      lock->wait_type_inner,
+			      lock->wait_type_outer,
+			      lock->lock_type);
 	class = register_lock_class(lock, subclass, 0);
 	hlock->class_idx = class - lock_classes;
 
-- 
cgit 


From f41b284a2c187c299f496f6fa1914ec986bdf0ee Mon Sep 17 00:00:00 2001
From: Zhengchao Shao <shaozhengchao@huawei.com>
Date: Wed, 15 Jun 2022 09:55:19 +0800
Subject: xfrm: change the type of xfrm_register_km and xfrm_unregister_km

Functions xfrm_register_km and xfrm_unregister_km do always return 0,
change the type of functions to void.

Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h    | 4 ++--
 net/key/af_key.c      | 6 +-----
 net/xfrm/xfrm_state.c | 6 ++----
 net/xfrm/xfrm_user.c  | 6 ++----
 4 files changed, 7 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 9287712ad977..afb5940919f8 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -583,8 +583,8 @@ struct xfrm_mgr {
 	bool			(*is_alive)(const struct km_event *c);
 };
 
-int xfrm_register_km(struct xfrm_mgr *km);
-int xfrm_unregister_km(struct xfrm_mgr *km);
+void xfrm_register_km(struct xfrm_mgr *km);
+void xfrm_unregister_km(struct xfrm_mgr *km);
 
 struct xfrm_tunnel_skb_cb {
 	union {
diff --git a/net/key/af_key.c b/net/key/af_key.c
index fb16d7c4e1b8..fda2dcc8a383 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -3894,14 +3894,10 @@ static int __init ipsec_pfkey_init(void)
 	err = sock_register(&pfkey_family_ops);
 	if (err != 0)
 		goto out_unregister_pernet;
-	err = xfrm_register_km(&pfkeyv2_mgr);
-	if (err != 0)
-		goto out_sock_unregister;
+	xfrm_register_km(&pfkeyv2_mgr);
 out:
 	return err;
 
-out_sock_unregister:
-	sock_unregister(PF_KEY);
 out_unregister_pernet:
 	unregister_pernet_subsys(&pfkey_net_ops);
 out_unregister_key_proto:
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 08564e0eef20..03b180878e61 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -2481,22 +2481,20 @@ EXPORT_SYMBOL(xfrm_user_policy);
 
 static DEFINE_SPINLOCK(xfrm_km_lock);
 
-int xfrm_register_km(struct xfrm_mgr *km)
+void xfrm_register_km(struct xfrm_mgr *km)
 {
 	spin_lock_bh(&xfrm_km_lock);
 	list_add_tail_rcu(&km->list, &xfrm_km_list);
 	spin_unlock_bh(&xfrm_km_lock);
-	return 0;
 }
 EXPORT_SYMBOL(xfrm_register_km);
 
-int xfrm_unregister_km(struct xfrm_mgr *km)
+void xfrm_unregister_km(struct xfrm_mgr *km)
 {
 	spin_lock_bh(&xfrm_km_lock);
 	list_del_rcu(&km->list);
 	spin_unlock_bh(&xfrm_km_lock);
 	synchronize_rcu();
-	return 0;
 }
 EXPORT_SYMBOL(xfrm_unregister_km);
 
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 6a58fec6a1fb..2ff017117730 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -3633,10 +3633,8 @@ static int __init xfrm_user_init(void)
 	rv = register_pernet_subsys(&xfrm_user_net_ops);
 	if (rv < 0)
 		return rv;
-	rv = xfrm_register_km(&netlink_mgr);
-	if (rv < 0)
-		unregister_pernet_subsys(&xfrm_user_net_ops);
-	return rv;
+	xfrm_register_km(&netlink_mgr);
+	return 0;
 }
 
 static void __exit xfrm_user_exit(void)
-- 
cgit 


From 084cc29f8bbb034cf30a7ee07a816c115e0c28df Mon Sep 17 00:00:00 2001
From: Ben Gardon <bgardon@google.com>
Date: Mon, 13 Jun 2022 21:25:21 +0000
Subject: KVM: x86/MMU: Allow NX huge pages to be disabled on a per-vm basis

In some cases, the NX hugepage mitigation for iTLB multihit is not
needed for all guests on a host. Allow disabling the mitigation on a
per-VM basis to avoid the performance hit of NX hugepages on trusted
workloads.

In order to disable NX hugepages on a VM, ensure that the userspace
actor has permission to reboot the system. Since disabling NX hugepages
would allow a guest to crash the system, it is similar to reboot
permissions.

Ideally, KVM would require userspace to prove it has access to KVM's
nx_huge_pages module param, e.g. so that userspace can opt out without
needing full reboot permissions.  But getting access to the module param
file info is difficult because it is buried in layers of sysfs and module
glue. Requiring CAP_SYS_BOOT is sufficient for all known use cases.

Suggested-by: Jim Mattson <jmattson@google.com>
Reviewed-by: David Matlack <dmatlack@google.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Ben Gardon <bgardon@google.com>
Message-Id: <20220613212523.3436117-9-bgardon@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst  | 16 ++++++++++++++++
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/mmu/mmu_internal.h |  7 ++++---
 arch/x86/kvm/mmu/spte.c         |  7 ++++---
 arch/x86/kvm/mmu/spte.h         |  3 ++-
 arch/x86/kvm/mmu/tdp_mmu.c      |  2 +-
 arch/x86/kvm/x86.c              | 30 ++++++++++++++++++++++++++++++
 include/uapi/linux/kvm.h        |  1 +
 8 files changed, 60 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 320cb04f7bd9..bafaeedd455c 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -8206,6 +8206,22 @@ PV guests. The `KVM_PV_DUMP` command is available for the
 dump related UV data. Also the vcpu ioctl `KVM_S390_PV_CPU_COMMAND` is
 available and supports the `KVM_PV_DUMP_CPU` subcommand.
 
+8.38 KVM_CAP_VM_DISABLE_NX_HUGE_PAGES
+---------------------------
+
+:Capability KVM_CAP_VM_DISABLE_NX_HUGE_PAGES
+:Architectures: x86
+:Type: vm
+:Parameters: arg[0] must be 0.
+:Returns 0 on success, -EPERM if the userspace process does not
+	 have CAP_SYS_BOOT, -EINVAL if args[0] is not 0 or any vCPUs have been
+	 created.
+
+This capability disables the NX huge pages mitigation for iTLB MULTIHIT.
+
+The capability has no effect if the nx_huge_pages module parameter is not set.
+
+This capability may only be set before any vCPUs are created.
 
 9. Known KVM API problems
 =========================
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e37727a74d0a..7e4c31b57a75 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1336,6 +1336,8 @@ struct kvm_arch {
 	 * the global KVM_MAX_VCPU_IDS may lead to significant memory waste.
 	 */
 	u32 max_vcpu_ids;
+
+	bool disable_nx_huge_pages;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 5e1e3c8f8aaa..bb9d12ac0db3 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -155,9 +155,9 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
 unsigned int pte_list_count(struct kvm_rmap_head *rmap_head);
 
 extern int nx_huge_pages;
-static inline bool is_nx_huge_page_enabled(void)
+static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
 {
-	return READ_ONCE(nx_huge_pages);
+	return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages;
 }
 
 struct kvm_page_fault {
@@ -256,7 +256,8 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 		.user = err & PFERR_USER_MASK,
 		.prefetch = prefetch,
 		.is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault),
-		.nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(),
+		.nx_huge_page_workaround_enabled =
+			is_nx_huge_page_enabled(vcpu->kvm),
 
 		.max_level = KVM_MAX_HUGEPAGE_LEVEL,
 		.req_level = PG_LEVEL_4K,
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 242e4828d7df..db294c1beea2 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -147,7 +147,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 		spte |= spte_shadow_accessed_mask(spte);
 
 	if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
-	    is_nx_huge_page_enabled()) {
+	    is_nx_huge_page_enabled(vcpu->kvm)) {
 		pte_access &= ~ACC_EXEC_MASK;
 	}
 
@@ -246,7 +246,8 @@ static u64 make_spte_executable(u64 spte)
  * This is used during huge page splitting to build the SPTEs that make up the
  * new page table.
  */
-u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index)
+u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, int huge_level,
+			      int index)
 {
 	u64 child_spte;
 	int child_level;
@@ -274,7 +275,7 @@ u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index)
 		 * When splitting to a 4K page, mark the page executable as the
 		 * NX hugepage mitigation no longer applies.
 		 */
-		if (is_nx_huge_page_enabled())
+		if (is_nx_huge_page_enabled(kvm))
 			child_spte = make_spte_executable(child_spte);
 	}
 
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 121c5eaaec77..256f90587e8d 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -421,7 +421,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	       unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
 	       u64 old_spte, bool prefetch, bool can_unsync,
 	       bool host_writable, u64 *new_spte);
-u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index);
+u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, int huge_level,
+			      int index);
 u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
 u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
 u64 mark_spte_for_access_track(u64 spte);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 1ea40809ef1f..522e2532343b 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1478,7 +1478,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
 	 * not been linked in yet and thus is not reachable from any other CPU.
 	 */
 	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
-		sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);
+		sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, level, i);
 
 	/*
 	 * Replace the huge spte with a pointer to the populated lower level
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c1b3b2ea8ee0..7ce0c6fe166d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4324,6 +4324,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_SYS_ATTRIBUTES:
 	case KVM_CAP_VAPIC:
 	case KVM_CAP_ENABLE_CAP:
+	case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
 		r = 1;
 		break;
 	case KVM_CAP_EXIT_HYPERCALL:
@@ -6184,6 +6185,35 @@ split_irqchip_unlock:
 		}
 		mutex_unlock(&kvm->lock);
 		break;
+	case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
+		r = -EINVAL;
+
+		/*
+		 * Since the risk of disabling NX hugepages is a guest crashing
+		 * the system, ensure the userspace process has permission to
+		 * reboot the system.
+		 *
+		 * Note that unlike the reboot() syscall, the process must have
+		 * this capability in the root namespace because exposing
+		 * /dev/kvm into a container does not limit the scope of the
+		 * iTLB multihit bug to that container. In other words,
+		 * this must use capable(), not ns_capable().
+		 */
+		if (!capable(CAP_SYS_BOOT)) {
+			r = -EPERM;
+			break;
+		}
+
+		if (cap->args[0])
+			break;
+
+		mutex_lock(&kvm->lock);
+		if (!kvm->created_vcpus) {
+			kvm->arch.disable_nx_huge_pages = true;
+			r = 0;
+		}
+		mutex_unlock(&kvm->lock);
+		break;
 	default:
 		r = -EINVAL;
 		break;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 7569b4ec199c..a36e78710382 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1166,6 +1166,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_S390_PROTECTED_DUMP 217
 #define KVM_CAP_X86_TRIPLE_FAULT_EVENT 218
 #define KVM_CAP_X86_NOTIFY_VMEXIT 219
+#define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit 


From 837f66c71207542283831d0762c5dca3db5b397a Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack@google.com>
Date: Wed, 22 Jun 2022 15:27:08 -0400
Subject: KVM: Allow for different capacities in kvm_mmu_memory_cache structs

Allow the capacity of the kvm_mmu_memory_cache struct to be chosen at
declaration time rather than being fixed for all declarations. This will
be used in a follow-up commit to declare an cache in x86 with a capacity
of 512+ objects without having to increase the capacity of all caches in
KVM.

This change requires each cache now specify its capacity at runtime,
since the cache struct itself no longer has a fixed capacity known at
compile time. To protect against someone accidentally defining a
kvm_mmu_memory_cache struct directly (without the extra storage), this
commit includes a WARN_ON() in kvm_mmu_topup_memory_cache().

In order to support different capacities, this commit changes the
objects pointer array to be dynamically allocated the first time the
cache is topped-up.

While here, opportunistically clean up the stack-allocated
kvm_mmu_memory_cache structs in riscv and arm64 to use designated
initializers.

No functional change intended.

Reviewed-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: David Matlack <dmatlack@google.com>
Message-Id: <20220516232138.1783324-22-dmatlack@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm64/kvm/mmu.c      |  2 +-
 arch/riscv/kvm/mmu.c      |  5 +----
 include/linux/kvm_host.h  |  1 +
 include/linux/kvm_types.h |  6 +++++-
 virt/kvm/kvm_main.c       | 33 ++++++++++++++++++++++++++++++---
 5 files changed, 38 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index f5651a05b6a8..87f1cd0df36e 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -786,7 +786,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 {
 	phys_addr_t addr;
 	int ret = 0;
-	struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
+	struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
 	struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
 				     KVM_PGTABLE_PROT_R |
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index 1c00695ebee7..081f8d2b9cf3 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -350,10 +350,7 @@ static int gstage_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
 	int ret = 0;
 	unsigned long pfn;
 	phys_addr_t addr, end;
-	struct kvm_mmu_memory_cache pcache;
-
-	memset(&pcache, 0, sizeof(pcache));
-	pcache.gfp_zero = __GFP_ZERO;
+	struct kvm_mmu_memory_cache pcache = { .gfp_zero = __GFP_ZERO };
 
 	end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK;
 	pfn = __phys_to_pfn(hpa);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a2bbdf3ab086..3554e48406e4 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1356,6 +1356,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm);
 
 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
 int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min);
+int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min);
 int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc);
 void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc);
 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index f328a01db4fe..4d933518060f 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -85,12 +85,16 @@ struct gfn_to_pfn_cache {
  * MMU flows is problematic, as is triggering reclaim, I/O, etc... while
  * holding MMU locks.  Note, these caches act more like prefetch buffers than
  * classical caches, i.e. objects are not returned to the cache on being freed.
+ *
+ * The @capacity field and @objects array are lazily initialized when the cache
+ * is topped up (__kvm_mmu_topup_memory_cache()).
  */
 struct kvm_mmu_memory_cache {
 	int nobjs;
 	gfp_t gfp_zero;
 	struct kmem_cache *kmem_cache;
-	void *objects[KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE];
+	int capacity;
+	void **objects;
 };
 #endif
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5b8ae83e09d7..45188d11812c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -396,14 +396,31 @@ static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
 		return (void *)__get_free_page(gfp_flags);
 }
 
-int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
+int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
 {
+	gfp_t gfp = GFP_KERNEL_ACCOUNT;
 	void *obj;
 
 	if (mc->nobjs >= min)
 		return 0;
-	while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
-		obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
+
+	if (unlikely(!mc->objects)) {
+		if (WARN_ON_ONCE(!capacity))
+			return -EIO;
+
+		mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
+		if (!mc->objects)
+			return -ENOMEM;
+
+		mc->capacity = capacity;
+	}
+
+	/* It is illegal to request a different capacity across topups. */
+	if (WARN_ON_ONCE(mc->capacity != capacity))
+		return -EIO;
+
+	while (mc->nobjs < mc->capacity) {
+		obj = mmu_memory_cache_alloc_obj(mc, gfp);
 		if (!obj)
 			return mc->nobjs >= min ? 0 : -ENOMEM;
 		mc->objects[mc->nobjs++] = obj;
@@ -411,6 +428,11 @@ int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
 	return 0;
 }
 
+int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
+{
+	return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
+}
+
 int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
 {
 	return mc->nobjs;
@@ -424,6 +446,11 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
 		else
 			free_page((unsigned long)mc->objects[--mc->nobjs]);
 	}
+
+	kvfree(mc->objects);
+
+	mc->objects = NULL;
+	mc->capacity = 0;
 }
 
 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
-- 
cgit 


From f2b3b28ce5237a4995a17d6f6aac9e05243d7a0b Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Tue, 21 Jun 2022 15:49:18 +0800
Subject: bonding: add slave_dev field for bond_opt_value

Currently, bond_opt_value are mostly used for bonding option settings. If
we want to set a value for slave, we need to re-alloc a string to store
both slave name and vlaue, like bond_option_queue_id_set() does, which
is complex and dumb.

As Jon suggested, let's add a union field slave_dev for bond_opt_value,
which will be benefit for future slave option setting. In function
__bond_opt_init(), we will always check the extra field and set it
if it's not NULL.

Suggested-by: Jonathan Toppins <jtoppins@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Acked-by: Jonathan Toppins <jtoppins@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/bond_options.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/bond_options.h b/include/net/bond_options.h
index 1618b76f4903..eade8236a4df 100644
--- a/include/net/bond_options.h
+++ b/include/net/bond_options.h
@@ -83,7 +83,10 @@ struct bond_opt_value {
 	char *string;
 	u64 value;
 	u32 flags;
-	char extra[BOND_OPT_EXTRA_MAXLEN];
+	union {
+		char extra[BOND_OPT_EXTRA_MAXLEN];
+		struct net_device *slave_dev;
+	};
 };
 
 struct bonding;
@@ -133,13 +136,16 @@ static inline void __bond_opt_init(struct bond_opt_value *optval,
 		optval->value = value;
 	else if (string)
 		optval->string = string;
-	else if (extra_len <= BOND_OPT_EXTRA_MAXLEN)
+
+	if (extra && extra_len <= BOND_OPT_EXTRA_MAXLEN)
 		memcpy(optval->extra, extra, extra_len);
 }
 #define bond_opt_initval(optval, value) __bond_opt_init(optval, NULL, value, NULL, 0)
 #define bond_opt_initstr(optval, str) __bond_opt_init(optval, str, ULLONG_MAX, NULL, 0)
 #define bond_opt_initextra(optval, extra, extra_len) \
 	__bond_opt_init(optval, NULL, ULLONG_MAX, extra, extra_len)
+#define bond_opt_slave_initval(optval, slave_dev, value) \
+	__bond_opt_init(optval, NULL, value, slave_dev, sizeof(struct net_device *))
 
 void bond_option_arp_ip_targets_clear(struct bonding *bond);
 #if IS_ENABLED(CONFIG_IPV6)
-- 
cgit 


From 0a2ff7cc8ad48a86939a91bd3457f38e59e741a1 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Tue, 21 Jun 2022 15:49:19 +0800
Subject: Bonding: add per-port priority for failover re-selection

Add per port priority support for bonding active slave re-selection during
failover. A higher number means higher priority in selection. The primary
slave still has the highest priority. This option also follows the
primary_reselect rules.

This option could only be configured via netlink.

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Acked-by: Jonathan Toppins <jtoppins@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/bonding.rst | 11 +++++++++++
 drivers/net/bonding/bond_main.c      | 27 +++++++++++++++++++++++++++
 drivers/net/bonding/bond_netlink.c   | 15 +++++++++++++++
 drivers/net/bonding/bond_options.c   | 33 +++++++++++++++++++++++++++++++++
 include/net/bond_options.h           |  1 +
 include/net/bonding.h                |  1 +
 include/uapi/linux/if_link.h         |  1 +
 tools/include/uapi/linux/if_link.h   |  1 +
 8 files changed, 90 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst
index 43be3782e5df..53a18ff7cf23 100644
--- a/Documentation/networking/bonding.rst
+++ b/Documentation/networking/bonding.rst
@@ -780,6 +780,17 @@ peer_notif_delay
 	value is 0 which means to match the value of the link monitor
 	interval.
 
+prio
+	Slave priority. A higher number means higher priority.
+	The primary slave has the highest priority. This option also
+	follows the primary_reselect rules.
+
+	This option could only be configured via netlink, and is only valid
+	for active-backup(1), balance-tlb (5) and balance-alb (6) mode.
+	The valid value range is a signed 32 bit integer.
+
+	The default value is 0.
+
 primary
 
 	A string (eth0, eth2, etc) specifying which slave is the
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index dc3e9a06e1aa..e75acb14d066 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1026,12 +1026,38 @@ out:
 
 }
 
+/**
+ * bond_choose_primary_or_current - select the primary or high priority slave
+ * @bond: our bonding struct
+ *
+ * - Check if there is a primary link. If the primary link was set and is up,
+ *   go on and do link reselection.
+ *
+ * - If primary link is not set or down, find the highest priority link.
+ *   If the highest priority link is not current slave, set it as primary
+ *   link and do link reselection.
+ */
 static struct slave *bond_choose_primary_or_current(struct bonding *bond)
 {
 	struct slave *prim = rtnl_dereference(bond->primary_slave);
 	struct slave *curr = rtnl_dereference(bond->curr_active_slave);
+	struct slave *slave, *hprio = NULL;
+	struct list_head *iter;
 
 	if (!prim || prim->link != BOND_LINK_UP) {
+		bond_for_each_slave(bond, slave, iter) {
+			if (slave->link == BOND_LINK_UP) {
+				hprio = hprio ?: slave;
+				if (slave->prio > hprio->prio)
+					hprio = slave;
+			}
+		}
+
+		if (hprio && hprio != curr) {
+			prim = hprio;
+			goto link_reselect;
+		}
+
 		if (!curr || curr->link != BOND_LINK_UP)
 			return NULL;
 		return curr;
@@ -1042,6 +1068,7 @@ static struct slave *bond_choose_primary_or_current(struct bonding *bond)
 		return prim;
 	}
 
+link_reselect:
 	if (!curr || curr->link != BOND_LINK_UP)
 		return prim;
 
diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c
index 5a6f44455b95..c2d080fc4fc4 100644
--- a/drivers/net/bonding/bond_netlink.c
+++ b/drivers/net/bonding/bond_netlink.c
@@ -27,6 +27,7 @@ static size_t bond_get_slave_size(const struct net_device *bond_dev,
 		nla_total_size(sizeof(u16)) +	/* IFLA_BOND_SLAVE_AD_AGGREGATOR_ID */
 		nla_total_size(sizeof(u8)) +	/* IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE */
 		nla_total_size(sizeof(u16)) +	/* IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE */
+		nla_total_size(sizeof(s32)) +	/* IFLA_BOND_SLAVE_PRIO */
 		0;
 }
 
@@ -53,6 +54,9 @@ static int bond_fill_slave_info(struct sk_buff *skb,
 	if (nla_put_u16(skb, IFLA_BOND_SLAVE_QUEUE_ID, slave->queue_id))
 		goto nla_put_failure;
 
+	if (nla_put_s32(skb, IFLA_BOND_SLAVE_PRIO, slave->prio))
+		goto nla_put_failure;
+
 	if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) {
 		const struct aggregator *agg;
 		const struct port *ad_port;
@@ -117,6 +121,7 @@ static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = {
 
 static const struct nla_policy bond_slave_policy[IFLA_BOND_SLAVE_MAX + 1] = {
 	[IFLA_BOND_SLAVE_QUEUE_ID]	= { .type = NLA_U16 },
+	[IFLA_BOND_SLAVE_PRIO]		= { .type = NLA_S32 },
 };
 
 static int bond_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -157,6 +162,16 @@ static int bond_slave_changelink(struct net_device *bond_dev,
 			return err;
 	}
 
+	if (data[IFLA_BOND_SLAVE_PRIO]) {
+		int prio = nla_get_s32(data[IFLA_BOND_SLAVE_PRIO]);
+
+		bond_opt_slave_initval(&newval, &slave_dev, prio);
+		err = __bond_opt_set(bond, BOND_OPT_PRIO, &newval,
+				     data[IFLA_BOND_SLAVE_PRIO], extack);
+		if (err)
+			return err;
+	}
+
 	return 0;
 }
 
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index 96eef19cffc4..3498db1c1b3c 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -40,6 +40,8 @@ static int bond_option_arp_validate_set(struct bonding *bond,
 					const struct bond_opt_value *newval);
 static int bond_option_arp_all_targets_set(struct bonding *bond,
 					   const struct bond_opt_value *newval);
+static int bond_option_prio_set(struct bonding *bond,
+				const struct bond_opt_value *newval);
 static int bond_option_primary_set(struct bonding *bond,
 				   const struct bond_opt_value *newval);
 static int bond_option_primary_reselect_set(struct bonding *bond,
@@ -365,6 +367,16 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = {
 		.values = bond_intmax_tbl,
 		.set = bond_option_miimon_set
 	},
+	[BOND_OPT_PRIO] = {
+		.id = BOND_OPT_PRIO,
+		.name = "prio",
+		.desc = "Link priority for failover re-selection",
+		.flags = BOND_OPTFLAG_RAWVAL,
+		.unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_ACTIVEBACKUP) |
+						BIT(BOND_MODE_TLB) |
+						BIT(BOND_MODE_ALB)),
+		.set = bond_option_prio_set
+	},
 	[BOND_OPT_PRIMARY] = {
 		.id = BOND_OPT_PRIMARY,
 		.name = "primary",
@@ -1306,6 +1318,27 @@ static int bond_option_missed_max_set(struct bonding *bond,
 	return 0;
 }
 
+static int bond_option_prio_set(struct bonding *bond,
+				const struct bond_opt_value *newval)
+{
+	struct slave *slave;
+
+	slave = bond_slave_get_rtnl(newval->slave_dev);
+	if (!slave) {
+		netdev_dbg(newval->slave_dev, "%s called on NULL slave\n", __func__);
+		return -ENODEV;
+	}
+	slave->prio = newval->value;
+
+	if (rtnl_dereference(bond->primary_slave))
+		slave_warn(bond->dev, slave->dev,
+			   "prio updated, but will not affect failover re-selection as primary slave have been set\n");
+	else
+		bond_select_active_slave(bond);
+
+	return 0;
+}
+
 static int bond_option_primary_set(struct bonding *bond,
 				   const struct bond_opt_value *newval)
 {
diff --git a/include/net/bond_options.h b/include/net/bond_options.h
index eade8236a4df..d2aea5cf1e41 100644
--- a/include/net/bond_options.h
+++ b/include/net/bond_options.h
@@ -67,6 +67,7 @@ enum {
 	BOND_OPT_LACP_ACTIVE,
 	BOND_OPT_MISSED_MAX,
 	BOND_OPT_NS_TARGETS,
+	BOND_OPT_PRIO,
 	BOND_OPT_LAST
 };
 
diff --git a/include/net/bonding.h b/include/net/bonding.h
index cb904d356e31..6e78d657aa05 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -178,6 +178,7 @@ struct slave {
 	u32    speed;
 	u16    queue_id;
 	u8     perm_hwaddr[MAX_ADDR_LEN];
+	int    prio;
 	struct ad_slave_info *ad_info;
 	struct tlb_slave_info tlb_info;
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 5f58dcfe2787..e36d9d2c65a7 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -963,6 +963,7 @@ enum {
 	IFLA_BOND_SLAVE_AD_AGGREGATOR_ID,
 	IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE,
 	IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE,
+	IFLA_BOND_SLAVE_PRIO,
 	__IFLA_BOND_SLAVE_MAX,
 };
 
diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index b339bf2196ca..0242f31e339c 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -890,6 +890,7 @@ enum {
 	IFLA_BOND_SLAVE_AD_AGGREGATOR_ID,
 	IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE,
 	IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE,
+	IFLA_BOND_SLAVE_PRIO,
 	__IFLA_BOND_SLAVE_MAX,
 };
 
-- 
cgit 


From ebc3197963fc42841b183d0280bd3f9f85f13c30 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 23 Jun 2022 04:34:32 +0000
Subject: ipmr: add rcu protection over (struct vif_device)->dev

We will soon use RCU instead of rwlock in ipmr & ip6mr

This preliminary patch adds proper rcu verbs to read/write
(struct vif_device)->dev

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute_base.h | 11 +++++---
 net/ipv4/ipmr.c             | 65 ++++++++++++++++++++++++++-------------------
 net/ipv4/ipmr_base.c        | 49 +++++++++++++++++++++-------------
 net/ipv6/ip6mr.c            | 63 ++++++++++++++++++++++++-------------------
 4 files changed, 111 insertions(+), 77 deletions(-)

(limited to 'include')

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index e05ee9f001ff..10d1e4fb4e9f 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -26,7 +26,7 @@
  * @remote: Remote address for tunnels
  */
 struct vif_device {
-	struct net_device *dev;
+	struct net_device __rcu *dev;
 	netdevice_tracker dev_tracker;
 	unsigned long bytes_in, bytes_out;
 	unsigned long pkt_in, pkt_out;
@@ -52,6 +52,7 @@ static inline int mr_call_vif_notifier(struct notifier_block *nb,
 				       unsigned short family,
 				       enum fib_event_type event_type,
 				       struct vif_device *vif,
+				       struct net_device *vif_dev,
 				       unsigned short vif_index, u32 tb_id,
 				       struct netlink_ext_ack *extack)
 {
@@ -60,7 +61,7 @@ static inline int mr_call_vif_notifier(struct notifier_block *nb,
 			.family = family,
 			.extack = extack,
 		},
-		.dev = vif->dev,
+		.dev = vif_dev,
 		.vif_index = vif_index,
 		.vif_flags = vif->flags,
 		.tb_id = tb_id,
@@ -73,6 +74,7 @@ static inline int mr_call_vif_notifiers(struct net *net,
 					unsigned short family,
 					enum fib_event_type event_type,
 					struct vif_device *vif,
+					struct net_device *vif_dev,
 					unsigned short vif_index, u32 tb_id,
 					unsigned int *ipmr_seq)
 {
@@ -80,7 +82,7 @@ static inline int mr_call_vif_notifiers(struct net *net,
 		.info = {
 			.family = family,
 		},
-		.dev = vif->dev,
+		.dev = vif_dev,
 		.vif_index = vif_index,
 		.vif_flags = vif->flags,
 		.tb_id = tb_id,
@@ -98,7 +100,8 @@ static inline int mr_call_vif_notifiers(struct net *net,
 #define MAXVIFS	32
 #endif
 
-#define VIF_EXISTS(_mrt, _idx) (!!((_mrt)->vif_table[_idx].dev))
+/* Note: This helper is deprecated. */
+#define VIF_EXISTS(_mrt, _idx) (!!rcu_access_pointer((_mrt)->vif_table[_idx].dev))
 
 /* mfc_flags:
  * MFC_STATIC - the entry was added statically (not by a routing daemon)
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 8324e541d193..10371a9e78fc 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -79,6 +79,12 @@ struct ipmr_result {
 
 static DEFINE_RWLOCK(mrt_lock);
 
+static struct net_device *vif_dev_read(const struct vif_device *vif)
+{
+	return rcu_dereference_check(vif->dev,
+				     lockdep_is_held(&mrt_lock));
+}
+
 /* Multicast router control variables */
 
 /* Special spinlock for queue of unresolved entries */
@@ -586,7 +592,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
 
 	read_lock(&mrt_lock);
 	if (mrt->mroute_reg_vif_num >= 0)
-		reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
+		reg_dev = vif_dev_read(&mrt->vif_table[mrt->mroute_reg_vif_num]);
 	read_unlock(&mrt_lock);
 
 	if (!reg_dev)
@@ -614,10 +620,11 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
 static int call_ipmr_vif_entry_notifiers(struct net *net,
 					 enum fib_event_type event_type,
 					 struct vif_device *vif,
+					 struct net_device *vif_dev,
 					 vifi_t vif_index, u32 tb_id)
 {
 	return mr_call_vif_notifiers(net, RTNL_FAMILY_IPMR, event_type,
-				     vif, vif_index, tb_id,
+				     vif, vif_dev, vif_index, tb_id,
 				     &net->ipv4.ipmr_seq);
 }
 
@@ -649,18 +656,14 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
 
 	v = &mrt->vif_table[vifi];
 
-	if (VIF_EXISTS(mrt, vifi))
-		call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, vifi,
-					      mrt->id);
+	dev = rtnl_dereference(v->dev);
+	if (!dev)
+		return -EADDRNOTAVAIL;
 
 	write_lock_bh(&mrt_lock);
-	dev = v->dev;
-	v->dev = NULL;
-
-	if (!dev) {
-		write_unlock_bh(&mrt_lock);
-		return -EADDRNOTAVAIL;
-	}
+	call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, dev,
+				      vifi, mrt->id);
+	RCU_INIT_POINTER(v->dev, NULL);
 
 	if (vifi == mrt->mroute_reg_vif_num)
 		mrt->mroute_reg_vif_num = -1;
@@ -890,14 +893,15 @@ static int vif_add(struct net *net, struct mr_table *mrt,
 
 	/* And finish update writing critical data */
 	write_lock_bh(&mrt_lock);
-	v->dev = dev;
+	rcu_assign_pointer(v->dev, dev);
 	netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC);
 	if (v->flags & VIFF_REGISTER)
 		mrt->mroute_reg_vif_num = vifi;
 	if (vifi+1 > mrt->maxvif)
 		mrt->maxvif = vifi+1;
 	write_unlock_bh(&mrt_lock);
-	call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, vifi, mrt->id);
+	call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, dev,
+				      vifi, mrt->id);
 	return 0;
 }
 
@@ -1726,7 +1730,7 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
 	ipmr_for_each_table(mrt, net) {
 		v = &mrt->vif_table[0];
 		for (ct = 0; ct < mrt->maxvif; ct++, v++) {
-			if (v->dev == dev)
+			if (rcu_access_pointer(v->dev) == dev)
 				vif_delete(mrt, ct, 1, NULL);
 		}
 	}
@@ -1811,19 +1815,21 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
 {
 	const struct iphdr *iph = ip_hdr(skb);
 	struct vif_device *vif = &mrt->vif_table[vifi];
+	struct net_device *vif_dev;
 	struct net_device *dev;
 	struct rtable *rt;
 	struct flowi4 fl4;
 	int    encap = 0;
 
-	if (!vif->dev)
+	vif_dev = vif_dev_read(vif);
+	if (!vif_dev)
 		goto out_free;
 
 	if (vif->flags & VIFF_REGISTER) {
 		vif->pkt_out++;
 		vif->bytes_out += skb->len;
-		vif->dev->stats.tx_bytes += skb->len;
-		vif->dev->stats.tx_packets++;
+		vif_dev->stats.tx_bytes += skb->len;
+		vif_dev->stats.tx_packets++;
 		ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
 		goto out_free;
 	}
@@ -1881,8 +1887,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
 	if (vif->flags & VIFF_TUNNEL) {
 		ip_encap(net, skb, vif->local, vif->remote);
 		/* FIXME: extra output firewall step used to be here. --RR */
-		vif->dev->stats.tx_packets++;
-		vif->dev->stats.tx_bytes += skb->len;
+		vif_dev->stats.tx_packets++;
+		vif_dev->stats.tx_bytes += skb->len;
 	}
 
 	IPCB(skb)->flags |= IPSKB_FORWARDED;
@@ -1911,7 +1917,7 @@ static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
 	int ct;
 
 	for (ct = mrt->maxvif-1; ct >= 0; ct--) {
-		if (mrt->vif_table[ct].dev == dev)
+		if (rcu_access_pointer(mrt->vif_table[ct].dev) == dev)
 			break;
 	}
 	return ct;
@@ -1944,7 +1950,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
 	}
 
 	/* Wrong interface: drop packet and (maybe) send PIM assert. */
-	if (mrt->vif_table[vif].dev != dev) {
+	if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) {
 		if (rt_is_output_route(skb_rtable(skb))) {
 			/* It is our own packet, looped back.
 			 * Very complicated situation...
@@ -2744,18 +2750,21 @@ static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb)
 
 static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb)
 {
+	struct net_device *vif_dev;
 	struct nlattr *vif_nest;
 	struct vif_device *vif;
 
+	vif = &mrt->vif_table[vifid];
+	vif_dev = vif_dev_read(vif);
 	/* if the VIF doesn't exist just continue */
-	if (!VIF_EXISTS(mrt, vifid))
+	if (!vif_dev)
 		return true;
 
-	vif = &mrt->vif_table[vifid];
 	vif_nest = nla_nest_start_noflag(skb, IPMRA_VIF);
 	if (!vif_nest)
 		return false;
-	if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif->dev->ifindex) ||
+
+	if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif_dev->ifindex) ||
 	    nla_put_u32(skb, IPMRA_VIFA_VIF_ID, vifid) ||
 	    nla_put_u16(skb, IPMRA_VIFA_FLAGS, vif->flags) ||
 	    nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, vif->bytes_in,
@@ -2919,9 +2928,11 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
 			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
 	} else {
 		const struct vif_device *vif = v;
-		const char *name =  vif->dev ?
-				    vif->dev->name : "none";
+		const struct net_device *vif_dev;
+		const char *name;
 
+		vif_dev = vif_dev_read(vif);
+		name = vif_dev ? vif_dev->name : "none";
 		seq_printf(seq,
 			   "%2td %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
 			   vif - mrt->vif_table,
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index aa8738a91210..59f62b938472 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -13,7 +13,7 @@ void vif_device_init(struct vif_device *v,
 		     unsigned short flags,
 		     unsigned short get_iflink_mask)
 {
-	v->dev = NULL;
+	RCU_INIT_POINTER(v->dev, NULL);
 	v->bytes_in = 0;
 	v->bytes_out = 0;
 	v->pkt_in = 0;
@@ -208,6 +208,7 @@ EXPORT_SYMBOL(mr_mfc_seq_next);
 int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 		   struct mr_mfc *c, struct rtmsg *rtm)
 {
+	struct net_device *vif_dev;
 	struct rta_mfc_stats mfcs;
 	struct nlattr *mp_attr;
 	struct rtnexthop *nhp;
@@ -220,10 +221,13 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 		return -ENOENT;
 	}
 
-	if (VIF_EXISTS(mrt, c->mfc_parent) &&
-	    nla_put_u32(skb, RTA_IIF,
-			mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
+	rcu_read_lock();
+	vif_dev = rcu_dereference(mrt->vif_table[c->mfc_parent].dev);
+	if (vif_dev && nla_put_u32(skb, RTA_IIF, vif_dev->ifindex) < 0) {
+		rcu_read_unlock();
 		return -EMSGSIZE;
+	}
+	rcu_read_unlock();
 
 	if (c->mfc_flags & MFC_OFFLOAD)
 		rtm->rtm_flags |= RTNH_F_OFFLOAD;
@@ -232,23 +236,27 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 	if (!mp_attr)
 		return -EMSGSIZE;
 
+	rcu_read_lock();
 	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
-		if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
-			struct vif_device *vif;
+		struct vif_device *vif = &mrt->vif_table[ct];
+
+		vif_dev = rcu_dereference(vif->dev);
+		if (vif_dev && c->mfc_un.res.ttls[ct] < 255) {
 
 			nhp = nla_reserve_nohdr(skb, sizeof(*nhp));
 			if (!nhp) {
+				rcu_read_unlock();
 				nla_nest_cancel(skb, mp_attr);
 				return -EMSGSIZE;
 			}
 
 			nhp->rtnh_flags = 0;
 			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
-			vif = &mrt->vif_table[ct];
-			nhp->rtnh_ifindex = vif->dev->ifindex;
+			nhp->rtnh_ifindex = vif_dev->ifindex;
 			nhp->rtnh_len = sizeof(*nhp);
 		}
 	}
+	rcu_read_unlock();
 
 	nla_nest_end(skb, mp_attr);
 
@@ -275,13 +283,14 @@ static bool mr_mfc_uses_dev(const struct mr_table *mrt,
 	int ct;
 
 	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
-		if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
-			const struct vif_device *vif;
-
-			vif = &mrt->vif_table[ct];
-			if (vif->dev == dev)
-				return true;
-		}
+		const struct net_device *vif_dev;
+		const struct vif_device *vif;
+
+		vif = &mrt->vif_table[ct];
+		vif_dev = rcu_access_pointer(vif->dev);
+		if (vif_dev && c->mfc_un.res.ttls[ct] < 255 &&
+		    vif_dev == dev)
+			return true;
 	}
 	return false;
 }
@@ -402,18 +411,22 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
 
 	for (mrt = mr_iter(net, NULL); mrt; mrt = mr_iter(net, mrt)) {
 		struct vif_device *v = &mrt->vif_table[0];
+		struct net_device *vif_dev;
 		struct mr_mfc *mfc;
 		int vifi;
 
 		/* Notifiy on table VIF entries */
 		read_lock(mrt_lock);
 		for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) {
-			if (!v->dev)
+			vif_dev = rcu_dereference_check(v->dev,
+							lockdep_is_held(mrt_lock));
+			if (!vif_dev)
 				continue;
 
 			err = mr_call_vif_notifier(nb, family,
-						   FIB_EVENT_VIF_ADD,
-						   v, vifi, mrt->id, extack);
+						   FIB_EVENT_VIF_ADD, v,
+						   vif_dev, vifi,
+						   mrt->id, extack);
 			if (err)
 				break;
 		}
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index aa66c032ba97..44cb3d88bbd6 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -64,6 +64,12 @@ struct ip6mr_result {
 
 static DEFINE_RWLOCK(mrt_lock);
 
+static struct net_device *vif_dev_read(const struct vif_device *vif)
+{
+	return rcu_dereference_check(vif->dev,
+				     lockdep_is_held(&mrt_lock));
+}
+
 /* Multicast router control variables */
 
 /* Special spinlock for queue of unresolved entries */
@@ -430,7 +436,11 @@ static int ip6mr_vif_seq_show(struct seq_file *seq, void *v)
 			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags\n");
 	} else {
 		const struct vif_device *vif = v;
-		const char *name = vif->dev ? vif->dev->name : "none";
+		const struct net_device *vif_dev;
+		const char *name;
+
+		vif_dev = vif_dev_read(vif);
+		name = vif_dev ? vif_dev->name : "none";
 
 		seq_printf(seq,
 			   "%2td %-10s %8ld %7ld  %8ld %7ld %05X\n",
@@ -553,7 +563,7 @@ static int pim6_rcv(struct sk_buff *skb)
 
 	read_lock(&mrt_lock);
 	if (reg_vif_num >= 0)
-		reg_dev = mrt->vif_table[reg_vif_num].dev;
+		reg_dev = vif_dev_read(&mrt->vif_table[reg_vif_num]);
 	read_unlock(&mrt_lock);
 
 	if (!reg_dev)
@@ -668,10 +678,11 @@ failure:
 static int call_ip6mr_vif_entry_notifiers(struct net *net,
 					  enum fib_event_type event_type,
 					  struct vif_device *vif,
+					  struct net_device *vif_dev,
 					  mifi_t vif_index, u32 tb_id)
 {
 	return mr_call_vif_notifiers(net, RTNL_FAMILY_IP6MR, event_type,
-				     vif, vif_index, tb_id,
+				     vif, vif_dev, vif_index, tb_id,
 				     &net->ipv6.ipmr_seq);
 }
 
@@ -696,19 +707,15 @@ static int mif6_delete(struct mr_table *mrt, int vifi, int notify,
 
 	v = &mrt->vif_table[vifi];
 
-	if (VIF_EXISTS(mrt, vifi))
-		call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net),
-					       FIB_EVENT_VIF_DEL, v, vifi,
-					       mrt->id);
+	dev = rtnl_dereference(v->dev);
+	if (!dev)
+		return -EADDRNOTAVAIL;
 
+	call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net),
+				       FIB_EVENT_VIF_DEL, v, dev,
+				       vifi, mrt->id);
 	write_lock_bh(&mrt_lock);
-	dev = v->dev;
-	v->dev = NULL;
-
-	if (!dev) {
-		write_unlock_bh(&mrt_lock);
-		return -EADDRNOTAVAIL;
-	}
+	RCU_INIT_POINTER(v->dev, NULL);
 
 #ifdef CONFIG_IPV6_PIMSM_V2
 	if (vifi == mrt->mroute_reg_vif_num)
@@ -911,7 +918,7 @@ static int mif6_add(struct net *net, struct mr_table *mrt,
 
 	/* And finish update writing critical data */
 	write_lock_bh(&mrt_lock);
-	v->dev = dev;
+	rcu_assign_pointer(v->dev, dev);
 	netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC);
 #ifdef CONFIG_IPV6_PIMSM_V2
 	if (v->flags & MIFF_REGISTER)
@@ -921,7 +928,7 @@ static int mif6_add(struct net *net, struct mr_table *mrt,
 		mrt->maxvif = vifi + 1;
 	write_unlock_bh(&mrt_lock);
 	call_ip6mr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD,
-				       v, vifi, mrt->id);
+				       v, dev, vifi, mrt->id);
 	return 0;
 }
 
@@ -1241,7 +1248,7 @@ static int ip6mr_device_event(struct notifier_block *this,
 	ip6mr_for_each_table(mrt, net) {
 		v = &mrt->vif_table[0];
 		for (ct = 0; ct < mrt->maxvif; ct++, v++) {
-			if (v->dev == dev)
+			if (rcu_access_pointer(v->dev) == dev)
 				mif6_delete(mrt, ct, 1, NULL);
 		}
 	}
@@ -2019,21 +2026,22 @@ static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct
 static int ip6mr_forward2(struct net *net, struct mr_table *mrt,
 			  struct sk_buff *skb, int vifi)
 {
-	struct ipv6hdr *ipv6h;
 	struct vif_device *vif = &mrt->vif_table[vifi];
-	struct net_device *dev;
+	struct net_device *vif_dev;
+	struct ipv6hdr *ipv6h;
 	struct dst_entry *dst;
 	struct flowi6 fl6;
 
-	if (!vif->dev)
+	vif_dev = vif_dev_read(vif);
+	if (!vif_dev)
 		goto out_free;
 
 #ifdef CONFIG_IPV6_PIMSM_V2
 	if (vif->flags & MIFF_REGISTER) {
 		vif->pkt_out++;
 		vif->bytes_out += skb->len;
-		vif->dev->stats.tx_bytes += skb->len;
-		vif->dev->stats.tx_packets++;
+		vif_dev->stats.tx_bytes += skb->len;
+		vif_dev->stats.tx_packets++;
 		ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT);
 		goto out_free;
 	}
@@ -2066,14 +2074,13 @@ static int ip6mr_forward2(struct net *net, struct mr_table *mrt,
 	 * not mrouter) cannot join to more than one interface - it will
 	 * result in receiving multiple packets.
 	 */
-	dev = vif->dev;
-	skb->dev = dev;
+	skb->dev = vif_dev;
 	vif->pkt_out++;
 	vif->bytes_out += skb->len;
 
 	/* We are about to write */
 	/* XXX: extension headers? */
-	if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(dev)))
+	if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(vif_dev)))
 		goto out_free;
 
 	ipv6h = ipv6_hdr(skb);
@@ -2082,7 +2089,7 @@ static int ip6mr_forward2(struct net *net, struct mr_table *mrt,
 	IP6CB(skb)->flags |= IP6SKB_FORWARDED;
 
 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
-		       net, NULL, skb, skb->dev, dev,
+		       net, NULL, skb, skb->dev, vif_dev,
 		       ip6mr_forward2_finish);
 
 out_free:
@@ -2095,7 +2102,7 @@ static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev)
 	int ct;
 
 	for (ct = mrt->maxvif - 1; ct >= 0; ct--) {
-		if (mrt->vif_table[ct].dev == dev)
+		if (rcu_access_pointer(mrt->vif_table[ct].dev) == dev)
 			break;
 	}
 	return ct;
@@ -2133,7 +2140,7 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
 	/*
 	 * Wrong interface: drop packet and (maybe) send PIM assert.
 	 */
-	if (mrt->vif_table[vif].dev != dev) {
+	if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) {
 		c->_c.mfc_un.res.wrong_if++;
 
 		if (true_vifi >= 0 && mrt->mroute_do_assert &&
-- 
cgit 


From 194366b28b8306b7a24596c57c09635ab2891252 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 23 Jun 2022 04:34:46 +0000
Subject: ipmr: adopt rcu_read_lock() in mr_dump()

We no longer need to acquire mrt_lock() in mr_dump,
using rcu_read_lock() is enough.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute_base.h | 4 ++--
 net/ipv4/ipmr.c             | 2 +-
 net/ipv4/ipmr_base.c        | 8 +++-----
 net/ipv6/ip6mr.c            | 2 +-
 4 files changed, 7 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 10d1e4fb4e9f..9dd4bf157255 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -308,7 +308,7 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
 			      struct netlink_ext_ack *extack),
 	    struct mr_table *(*mr_iter)(struct net *net,
 					struct mr_table *mrt),
-	    rwlock_t *mrt_lock, struct netlink_ext_ack *extack);
+	    struct netlink_ext_ack *extack);
 #else
 static inline void vif_device_init(struct vif_device *v,
 				   struct net_device *dev,
@@ -363,7 +363,7 @@ static inline int mr_dump(struct net *net, struct notifier_block *nb,
 					    struct netlink_ext_ack *extack),
 			  struct mr_table *(*mr_iter)(struct net *net,
 						      struct mr_table *mrt),
-			  rwlock_t *mrt_lock, struct netlink_ext_ack *extack)
+			  struct netlink_ext_ack *extack)
 {
 	return -EINVAL;
 }
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 69ccd3d7c655..38963b8de7af 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -3027,7 +3027,7 @@ static int ipmr_dump(struct net *net, struct notifier_block *nb,
 		     struct netlink_ext_ack *extack)
 {
 	return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump,
-		       ipmr_mr_table_iter, &mrt_lock, extack);
+		       ipmr_mr_table_iter, extack);
 }
 
 static const struct fib_notifier_ops ipmr_notifier_ops_template = {
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 59f62b938472..271dc03fc6db 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -399,7 +399,6 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
 			      struct netlink_ext_ack *extack),
 	    struct mr_table *(*mr_iter)(struct net *net,
 					struct mr_table *mrt),
-	    rwlock_t *mrt_lock,
 	    struct netlink_ext_ack *extack)
 {
 	struct mr_table *mrt;
@@ -416,10 +415,9 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
 		int vifi;
 
 		/* Notifiy on table VIF entries */
-		read_lock(mrt_lock);
+		rcu_read_lock();
 		for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) {
-			vif_dev = rcu_dereference_check(v->dev,
-							lockdep_is_held(mrt_lock));
+			vif_dev = rcu_dereference(v->dev);
 			if (!vif_dev)
 				continue;
 
@@ -430,7 +428,7 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
 			if (err)
 				break;
 		}
-		read_unlock(mrt_lock);
+		rcu_read_unlock();
 
 		if (err)
 			return err;
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 08ac177fe30c..f0a9bceb8e3c 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1267,7 +1267,7 @@ static int ip6mr_dump(struct net *net, struct notifier_block *nb,
 		      struct netlink_ext_ack *extack)
 {
 	return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump,
-		       ip6mr_mr_table_iter, &mrt_lock, extack);
+		       ip6mr_mr_table_iter, extack);
 }
 
 static struct notifier_block ip6_mr_notifier = {
-- 
cgit 


From ba1afa676d0babf99e99f5415db43fdd7ecef104 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Fri, 24 Jun 2022 17:31:47 +0800
Subject: lib: bitmap: fix the duplicated comments on bitmap_to_arr64()

Thanks to the recent commit 0a97953fd221 ("lib: add
bitmap_{from,to}_arr64") now we can directly convert a U64 value into a
bitmap and vice verse.

However when checking the header there is duplicated helper for
bitmap_to_arr64(), but no bitmap_from_arr64().

Just fix the copy-n-paste error.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/bitmap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 2e6cd5681040..f091a1664bf1 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -71,9 +71,9 @@ struct device;
  *  bitmap_release_region(bitmap, pos, order)   Free specified bit region
  *  bitmap_allocate_region(bitmap, pos, order)  Allocate specified bit region
  *  bitmap_from_arr32(dst, buf, nbits)          Copy nbits from u32[] buf to dst
+ *  bitmap_from_arr64(dst, buf, nbits)          Copy nbits from u64[] buf to dst
  *  bitmap_to_arr32(buf, src, nbits)            Copy nbits from buf to u32[] dst
  *  bitmap_to_arr64(buf, src, nbits)            Copy nbits from buf to u64[] dst
- *  bitmap_to_arr64(buf, src, nbits)            Copy nbits from buf to u64[] dst
  *  bitmap_get_value8(map, start)               Get 8bit value from map at start
  *  bitmap_set_value8(map, value, start)        Set 8bit value to map at start
  *
-- 
cgit 


From e61c451476e61450f6771ce03bbc01210a09be16 Mon Sep 17 00:00:00 2001
From: Mark-PK Tsai <mark-pk.tsai@mediatek.com>
Date: Fri, 22 Apr 2022 14:24:35 +0800
Subject: dma-mapping: Add dma_release_coherent_memory to DMA API

Add dma_release_coherent_memory to DMA API to allow dma
user call it to release dev->dma_mem when the device is
removed.

Signed-off-by: Mark-PK Tsai <mark-pk.tsai@mediatek.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20220422062436.14384-2-mark-pk.tsai@mediatek.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
---
 include/linux/dma-map-ops.h |  3 +++
 kernel/dma/coherent.c       | 10 ++++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 0d5b06b3a4a6..53db9655efe9 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -166,6 +166,7 @@ static inline void dma_pernuma_cma_reserve(void) { }
 #ifdef CONFIG_DMA_DECLARE_COHERENT
 int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
 		dma_addr_t device_addr, size_t size);
+void dma_release_coherent_memory(struct device *dev);
 int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size,
 		dma_addr_t *dma_handle, void **ret);
 int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr);
@@ -177,6 +178,8 @@ static inline int dma_declare_coherent_memory(struct device *dev,
 {
 	return -ENOSYS;
 }
+
+#define dma_release_coherent_memory(dev) (0)
 #define dma_alloc_from_dev_coherent(dev, size, handle, ret) (0)
 #define dma_release_from_dev_coherent(dev, order, vaddr) (0)
 #define dma_mmap_from_dev_coherent(dev, vma, vaddr, order, ret) (0)
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 375fb3c9538d..c21abc77c53e 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -74,7 +74,7 @@ out_unmap_membase:
 	return ERR_PTR(-ENOMEM);
 }
 
-static void dma_release_coherent_memory(struct dma_coherent_mem *mem)
+static void _dma_release_coherent_memory(struct dma_coherent_mem *mem)
 {
 	if (!mem)
 		return;
@@ -126,10 +126,16 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
 
 	ret = dma_assign_coherent_memory(dev, mem);
 	if (ret)
-		dma_release_coherent_memory(mem);
+		_dma_release_coherent_memory(mem);
 	return ret;
 }
 
+void dma_release_coherent_memory(struct device *dev)
+{
+	if (dev)
+		_dma_release_coherent_memory(dev->dma_mem);
+}
+
 static void *__dma_alloc_from_coherent(struct device *dev,
 				       struct dma_coherent_mem *mem,
 				       ssize_t size, dma_addr_t *dma_handle)
-- 
cgit 


From e36de87d34a7f2f26b2e2129c6dc18a0024663eb Mon Sep 17 00:00:00 2001
From: Vineeth Pillai <vineeth@bitbyteword.org>
Date: Mon, 23 May 2022 15:03:27 -0400
Subject: KVM: debugfs: expose pid of vcpu threads

Add a new debugfs file to expose the pid of each vcpu threads. This
is very helpful for userland tools to get the vcpu pids without
worrying about thread naming conventions of the VMM.

Signed-off-by: Vineeth Pillai (Google) <vineeth@bitbyteword.org>
Message-Id: <20220523190327.2658-1-vineeth@bitbyteword.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h |  2 ++
 virt/kvm/kvm_main.c      | 15 +++++++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3554e48406e4..3b40f8d68fbb 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1435,6 +1435,8 @@ int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state);
 
 #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
 void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry);
+#else
+static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {}
 #endif
 
 int kvm_arch_hardware_enable(void);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 45188d11812c..da263c370d00 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3822,9 +3822,18 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
 	return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
 }
 
+#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
+static int vcpu_get_pid(void *data, u64 *val)
+{
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+	*val = pid_nr(rcu_access_pointer(vcpu->pid));
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n");
+
 static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
 {
-#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
 	struct dentry *debugfs_dentry;
 	char dir_name[ITOA_MAX_LEN * 2];
 
@@ -3834,10 +3843,12 @@ static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
 	snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
 	debugfs_dentry = debugfs_create_dir(dir_name,
 					    vcpu->kvm->debugfs_dentry);
+	debugfs_create_file("pid", 0444, debugfs_dentry, vcpu,
+			    &vcpu_get_pid_fops);
 
 	kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
-#endif
 }
+#endif
 
 /*
  * Creates some virtual cpus.  Good luck creating more than one.
-- 
cgit 


From 0c4c516fa206d6e8a3986eebd3016a96022792c6 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Wed, 22 Jun 2022 14:45:11 -0400
Subject: fs: dlm: remove additional dereference of lksb

This patch removes a dereference of lksb of lkb when calling ast
tracepoint. First it reduces additional overhead, even if traces
are not active. Second we can deference it in TP_fast_assign from
the existing lkb parameter.

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/ast.c               | 2 +-
 include/trace/events/dlm.h | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index df25c3e785cf..19ef136f9e4f 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -260,7 +260,7 @@ void dlm_callback_work(struct work_struct *work)
 		} else if (callbacks[i].flags & DLM_CB_CAST) {
 			lkb->lkb_lksb->sb_status = callbacks[i].sb_status;
 			lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags;
-			trace_dlm_ast(ls, lkb, lkb->lkb_lksb);
+			trace_dlm_ast(ls, lkb);
 			castfn(lkb->lkb_astparam);
 		}
 	}
diff --git a/include/trace/events/dlm.h b/include/trace/events/dlm.h
index 32088c603244..e333176ecfaf 100644
--- a/include/trace/events/dlm.h
+++ b/include/trace/events/dlm.h
@@ -138,9 +138,9 @@ TRACE_EVENT(dlm_bast,
 
 TRACE_EVENT(dlm_ast,
 
-	TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_lksb *lksb),
+	TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb),
 
-	TP_ARGS(ls, lkb, lksb),
+	TP_ARGS(ls, lkb),
 
 	TP_STRUCT__entry(
 		__field(__u32, ls_id)
@@ -152,8 +152,8 @@ TRACE_EVENT(dlm_ast,
 	TP_fast_assign(
 		__entry->ls_id = ls->ls_global_id;
 		__entry->lkb_id = lkb->lkb_id;
-		__entry->sb_flags = lksb->sb_flags;
-		__entry->sb_status = lksb->sb_status;
+		__entry->sb_flags = lkb->lkb_lksb->sb_flags;
+		__entry->sb_status = lkb->lkb_lksb->sb_status;
 	),
 
 	TP_printk("ls_id=%u lkb_id=%x sb_flags=%s sb_status=%d",
-- 
cgit 


From 5d92a30e900dc97221e36f09ae740457d560d281 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Wed, 22 Jun 2022 14:45:12 -0400
Subject: fs: dlm: add resource name to tracepoints

This patch adds the resource name to dlm tracepoints.  The name
usually comes through the lkb_resource, but in some cases a resource
may not yet be associated with an lkb, in which case the name and
namelen parameters are used.

It should be okay to access the lkb_resource and the res_name field at
the time when the tracepoint is invoked. The resource is assigned to a
lkb and it's reference is being held during the tracepoint call. During
this time the resource cannot be freed. Also a lkb will never switch
its assigned resource. The name of a dlm_rsb is assigned at creation
time and should never be changed during runtime as well.

The TP_printk() call uses always a hexadecimal string array
representation for the resource name (which is not necessarily ascii.)

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c              |   4 +-
 include/trace/events/dlm.h | 110 +++++++++++++++++++++++++++++++++++++--------
 2 files changed, 94 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 226822f49d30..e80d42ba64ae 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -3472,7 +3472,7 @@ int dlm_lock(dlm_lockspace_t *lockspace,
 	if (error)
 		goto out;
 
-	trace_dlm_lock_start(ls, lkb, mode, flags);
+	trace_dlm_lock_start(ls, lkb, name, namelen, mode, flags);
 
 	error = set_lock_args(mode, lksb, flags, namelen, 0, ast,
 			      astarg, bast, &args);
@@ -3487,7 +3487,7 @@ int dlm_lock(dlm_lockspace_t *lockspace,
 	if (error == -EINPROGRESS)
 		error = 0;
  out_put:
-	trace_dlm_lock_end(ls, lkb, mode, flags, error);
+	trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error);
 
 	if (convert || error)
 		__put_lkb(ls, lkb);
diff --git a/include/trace/events/dlm.h b/include/trace/events/dlm.h
index e333176ecfaf..bad21222130e 100644
--- a/include/trace/events/dlm.h
+++ b/include/trace/events/dlm.h
@@ -49,38 +49,52 @@
 /* note: we begin tracing dlm_lock_start() only if ls and lkb are found */
 TRACE_EVENT(dlm_lock_start,
 
-	TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, int mode,
-		 __u32 flags),
+	TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, void *name,
+		 unsigned int namelen, int mode, __u32 flags),
 
-	TP_ARGS(ls, lkb, mode, flags),
+	TP_ARGS(ls, lkb, name, namelen, mode, flags),
 
 	TP_STRUCT__entry(
 		__field(__u32, ls_id)
 		__field(__u32, lkb_id)
 		__field(int, mode)
 		__field(__u32, flags)
+		__dynamic_array(unsigned char, res_name,
+				lkb->lkb_resource ? lkb->lkb_resource->res_length : namelen)
 	),
 
 	TP_fast_assign(
+		struct dlm_rsb *r;
+
 		__entry->ls_id = ls->ls_global_id;
 		__entry->lkb_id = lkb->lkb_id;
 		__entry->mode = mode;
 		__entry->flags = flags;
+
+		r = lkb->lkb_resource;
+		if (r)
+			memcpy(__get_dynamic_array(res_name), r->res_name,
+			       __get_dynamic_array_len(res_name));
+		else if (name)
+			memcpy(__get_dynamic_array(res_name), name,
+			       __get_dynamic_array_len(res_name));
 	),
 
-	TP_printk("ls_id=%u lkb_id=%x mode=%s flags=%s",
+	TP_printk("ls_id=%u lkb_id=%x mode=%s flags=%s res_name=%s",
 		  __entry->ls_id, __entry->lkb_id,
 		  show_lock_mode(__entry->mode),
-		  show_lock_flags(__entry->flags))
+		  show_lock_flags(__entry->flags),
+		  __print_hex_str(__get_dynamic_array(res_name),
+				  __get_dynamic_array_len(res_name)))
 
 );
 
 TRACE_EVENT(dlm_lock_end,
 
-	TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, int mode, __u32 flags,
-		 int error),
+	TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, void *name,
+		 unsigned int namelen, int mode, __u32 flags, int error),
 
-	TP_ARGS(ls, lkb, mode, flags, error),
+	TP_ARGS(ls, lkb, name, namelen, mode, flags, error),
 
 	TP_STRUCT__entry(
 		__field(__u32, ls_id)
@@ -88,14 +102,26 @@ TRACE_EVENT(dlm_lock_end,
 		__field(int, mode)
 		__field(__u32, flags)
 		__field(int, error)
+		__dynamic_array(unsigned char, res_name,
+				lkb->lkb_resource ? lkb->lkb_resource->res_length : namelen)
 	),
 
 	TP_fast_assign(
+		struct dlm_rsb *r;
+
 		__entry->ls_id = ls->ls_global_id;
 		__entry->lkb_id = lkb->lkb_id;
 		__entry->mode = mode;
 		__entry->flags = flags;
 
+		r = lkb->lkb_resource;
+		if (r)
+			memcpy(__get_dynamic_array(res_name), r->res_name,
+			       __get_dynamic_array_len(res_name));
+		else if (name)
+			memcpy(__get_dynamic_array(res_name), name,
+			       __get_dynamic_array_len(res_name));
+
 		/* return value will be zeroed in those cases by dlm_lock()
 		 * we do it here again to not introduce more overhead if
 		 * trace isn't running and error reflects the return value.
@@ -104,12 +130,15 @@ TRACE_EVENT(dlm_lock_end,
 			__entry->error = 0;
 		else
 			__entry->error = error;
+
 	),
 
-	TP_printk("ls_id=%u lkb_id=%x mode=%s flags=%s error=%d",
+	TP_printk("ls_id=%u lkb_id=%x mode=%s flags=%s error=%d res_name=%s",
 		  __entry->ls_id, __entry->lkb_id,
 		  show_lock_mode(__entry->mode),
-		  show_lock_flags(__entry->flags), __entry->error)
+		  show_lock_flags(__entry->flags), __entry->error,
+		  __print_hex_str(__get_dynamic_array(res_name),
+				  __get_dynamic_array_len(res_name)))
 
 );
 
@@ -123,16 +152,28 @@ TRACE_EVENT(dlm_bast,
 		__field(__u32, ls_id)
 		__field(__u32, lkb_id)
 		__field(int, mode)
+		__dynamic_array(unsigned char, res_name,
+				lkb->lkb_resource ? lkb->lkb_resource->res_length : 0)
 	),
 
 	TP_fast_assign(
+		struct dlm_rsb *r;
+
 		__entry->ls_id = ls->ls_global_id;
 		__entry->lkb_id = lkb->lkb_id;
 		__entry->mode = mode;
+
+		r = lkb->lkb_resource;
+		if (r)
+			memcpy(__get_dynamic_array(res_name), r->res_name,
+			       __get_dynamic_array_len(res_name));
 	),
 
-	TP_printk("ls_id=%u lkb_id=%x mode=%s", __entry->ls_id,
-		  __entry->lkb_id, show_lock_mode(__entry->mode))
+	TP_printk("ls_id=%u lkb_id=%x mode=%s res_name=%s",
+		  __entry->ls_id, __entry->lkb_id,
+		  show_lock_mode(__entry->mode),
+		  __print_hex_str(__get_dynamic_array(res_name),
+				  __get_dynamic_array_len(res_name)))
 
 );
 
@@ -147,18 +188,29 @@ TRACE_EVENT(dlm_ast,
 		__field(__u32, lkb_id)
 		__field(u8, sb_flags)
 		__field(int, sb_status)
+		__dynamic_array(unsigned char, res_name,
+				lkb->lkb_resource ? lkb->lkb_resource->res_length : 0)
 	),
 
 	TP_fast_assign(
+		struct dlm_rsb *r;
+
 		__entry->ls_id = ls->ls_global_id;
 		__entry->lkb_id = lkb->lkb_id;
 		__entry->sb_flags = lkb->lkb_lksb->sb_flags;
 		__entry->sb_status = lkb->lkb_lksb->sb_status;
+
+		r = lkb->lkb_resource;
+		if (r)
+			memcpy(__get_dynamic_array(res_name), r->res_name,
+			       __get_dynamic_array_len(res_name));
 	),
 
-	TP_printk("ls_id=%u lkb_id=%x sb_flags=%s sb_status=%d",
+	TP_printk("ls_id=%u lkb_id=%x sb_flags=%s sb_status=%d res_name=%s",
 		  __entry->ls_id, __entry->lkb_id,
-		  show_dlm_sb_flags(__entry->sb_flags), __entry->sb_status)
+		  show_dlm_sb_flags(__entry->sb_flags), __entry->sb_status,
+		  __print_hex_str(__get_dynamic_array(res_name),
+				  __get_dynamic_array_len(res_name)))
 
 );
 
@@ -173,17 +225,28 @@ TRACE_EVENT(dlm_unlock_start,
 		__field(__u32, ls_id)
 		__field(__u32, lkb_id)
 		__field(__u32, flags)
+		__dynamic_array(unsigned char, res_name,
+				lkb->lkb_resource ? lkb->lkb_resource->res_length : 0)
 	),
 
 	TP_fast_assign(
+		struct dlm_rsb *r;
+
 		__entry->ls_id = ls->ls_global_id;
 		__entry->lkb_id = lkb->lkb_id;
 		__entry->flags = flags;
+
+		r = lkb->lkb_resource;
+		if (r)
+			memcpy(__get_dynamic_array(res_name), r->res_name,
+			       __get_dynamic_array_len(res_name));
 	),
 
-	TP_printk("ls_id=%u lkb_id=%x flags=%s",
+	TP_printk("ls_id=%u lkb_id=%x flags=%s res_name=%s",
 		  __entry->ls_id, __entry->lkb_id,
-		  show_lock_flags(__entry->flags))
+		  show_lock_flags(__entry->flags),
+		  __print_hex_str(__get_dynamic_array(res_name),
+				  __get_dynamic_array_len(res_name)))
 
 );
 
@@ -199,18 +262,29 @@ TRACE_EVENT(dlm_unlock_end,
 		__field(__u32, lkb_id)
 		__field(__u32, flags)
 		__field(int, error)
+		__dynamic_array(unsigned char, res_name,
+				lkb->lkb_resource ? lkb->lkb_resource->res_length : 0)
 	),
 
 	TP_fast_assign(
+		struct dlm_rsb *r;
+
 		__entry->ls_id = ls->ls_global_id;
 		__entry->lkb_id = lkb->lkb_id;
 		__entry->flags = flags;
 		__entry->error = error;
+
+		r = lkb->lkb_resource;
+		if (r)
+			memcpy(__get_dynamic_array(res_name), r->res_name,
+			       __get_dynamic_array_len(res_name));
 	),
 
-	TP_printk("ls_id=%u lkb_id=%x flags=%s error=%d",
+	TP_printk("ls_id=%u lkb_id=%x flags=%s error=%d res_name=%s",
 		  __entry->ls_id, __entry->lkb_id,
-		  show_lock_flags(__entry->flags), __entry->error)
+		  show_lock_flags(__entry->flags), __entry->error,
+		  __print_hex_str(__get_dynamic_array(res_name),
+				  __get_dynamic_array_len(res_name)))
 
 );
 
-- 
cgit 


From 8ca869b24538a7b5501af368e87e4a59b0c04117 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Wed, 22 Jun 2022 09:31:31 +0200
Subject: pstore: Add priv field to pstore_record for backend specific use

The EFI pstore backend will need to store per-record variable name data
when we switch away from the efivars layer. Add a priv field to struct
pstore_record, and document it as holding a backend specific pointer
that is assumed to be a kmalloc()d buffer, and will be kfree()d when the
entire record is freed.

Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 fs/pstore/inode.c      | 1 +
 fs/pstore/platform.c   | 1 +
 include/linux/pstore.h | 4 ++++
 3 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 14658b009f1b..ffbadb8b3032 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -55,6 +55,7 @@ static void free_pstore_private(struct pstore_private *private)
 		return;
 	if (private->record) {
 		kfree(private->record->buf);
+		kfree(private->record->priv);
 		kfree(private->record);
 	}
 	kfree(private);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index e26162f102ff..0c034ea39954 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -769,6 +769,7 @@ void pstore_get_backend_records(struct pstore_info *psi,
 		if (rc) {
 			/* pstore_mkfile() did not take record, so free it. */
 			kfree(record->buf);
+			kfree(record->priv);
 			kfree(record);
 			if (rc != -EEXIST || !quiet)
 				failed++;
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index e97a8188f0fd..638507a3c8ff 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -57,6 +57,9 @@ struct pstore_info;
  * @size:	size of @buf
  * @ecc_notice_size:
  *		ECC information for @buf
+ * @priv:	pointer for backend specific use, will be
+ *		kfree()d by the pstore core if non-NULL
+ *		when the record is freed.
  *
  * Valid for PSTORE_TYPE_DMESG @type:
  *
@@ -74,6 +77,7 @@ struct pstore_record {
 	char			*buf;
 	ssize_t			size;
 	ssize_t			ecc_notice_size;
+	void			*priv;
 
 	int			count;
 	enum kmsg_dump_reason	reason;
-- 
cgit 


From ec3507b2ca51286de6ecd85fdac8e722219cdef8 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Mon, 20 Jun 2022 17:04:32 +0200
Subject: efi: vars: Don't drop lock in the middle of efivar_init()

Even though the efivars_lock lock is documented as protecting the
efivars->ops pointer (among other things), efivar_init() happily
releases and reacquires the lock for every EFI variable that it
enumerates. This used to be needed because the lock was originally a
spinlock, which prevented the callback that is invoked for every
variable from being able to sleep. However, releasing the lock could
potentially invalidate the ops pointer, but more importantly, it might
allow a SetVariable() runtime service call to take place concurrently,
and the UEFI spec does not define how this affects an enumeration that
is running in parallel using the GetNextVariable() runtime service,
which is what efivar_init() uses.

In the meantime, the lock has been converted into a semaphore, and the
only reason we need to drop the lock is because the efivarfs pseudo
filesystem driver will otherwise deadlock when it invokes the efivars
API from the callback to create the efivar_entry items and insert them
into the linked list. (EFI pstore is affected in a similar way)

So let's switch to helpers that can be used while the lock is already
taken. This way, we can hold on to the lock throughout the enumeration.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/firmware/efi/efi-pstore.c |  7 ++-----
 drivers/firmware/efi/efivars.c    |  5 +----
 drivers/firmware/efi/vars.c       | 22 +++++++++++-----------
 fs/efivarfs/super.c               |  6 ++----
 include/linux/efi.h               |  1 +
 5 files changed, 17 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c
index 7e771c56c13c..0d80cc7ff6ca 100644
--- a/drivers/firmware/efi/efi-pstore.c
+++ b/drivers/firmware/efi/efi-pstore.c
@@ -364,7 +364,6 @@ static int efi_pstore_callback(efi_char16_t *name, efi_guid_t vendor,
 			       unsigned long name_size, void *data)
 {
 	struct efivar_entry *entry;
-	int ret;
 
 	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
 	if (!entry)
@@ -373,11 +372,9 @@ static int efi_pstore_callback(efi_char16_t *name, efi_guid_t vendor,
 	memcpy(entry->var.VariableName, name, name_size);
 	entry->var.VendorGuid = vendor;
 
-	ret = efivar_entry_add(entry, &efi_pstore_list);
-	if (ret)
-		kfree(entry);
+	__efivar_entry_add(entry, &efi_pstore_list);
 
-	return ret;
+	return 0;
 }
 
 static int efi_pstore_update_entry(efi_char16_t *name, efi_guid_t vendor,
diff --git a/drivers/firmware/efi/efivars.c b/drivers/firmware/efi/efivars.c
index ea0bc39dc965..c19db0b35c0d 100644
--- a/drivers/firmware/efi/efivars.c
+++ b/drivers/firmware/efi/efivars.c
@@ -527,10 +527,7 @@ efivar_create_sysfs_entry(struct efivar_entry *new_var)
 	}
 
 	kobject_uevent(&new_var->kobj, KOBJ_ADD);
-	if (efivar_entry_add(new_var, &efivar_sysfs_list)) {
-		efivar_unregister(new_var);
-		return -EINTR;
-	}
+	__efivar_entry_add(new_var, &efivar_sysfs_list);
 
 	return 0;
 }
diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c
index cae590bd08f2..146360e2f1cb 100644
--- a/drivers/firmware/efi/vars.c
+++ b/drivers/firmware/efi/vars.c
@@ -450,9 +450,6 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
 						&vendor_guid);
 		switch (status) {
 		case EFI_SUCCESS:
-			if (duplicates)
-				up(&efivars_lock);
-
 			variable_name_size = var_name_strnsize(variable_name,
 							       variable_name_size);
 
@@ -476,14 +473,6 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
 				if (err)
 					status = EFI_NOT_FOUND;
 			}
-
-			if (duplicates) {
-				if (down_interruptible(&efivars_lock)) {
-					err = -EINTR;
-					goto free;
-				}
-			}
-
 			break;
 		case EFI_UNSUPPORTED:
 			err = -EOPNOTSUPP;
@@ -526,6 +515,17 @@ int efivar_entry_add(struct efivar_entry *entry, struct list_head *head)
 }
 EXPORT_SYMBOL_GPL(efivar_entry_add);
 
+/**
+ * __efivar_entry_add - add entry to variable list
+ * @entry: entry to add to list
+ * @head: list head
+ */
+void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head)
+{
+	list_add(&entry->list, head);
+}
+EXPORT_SYMBOL_GPL(__efivar_entry_add);
+
 /**
  * efivar_entry_remove - remove entry from variable list
  * @entry: entry to remove from list
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 15880a68faad..09dfa8362f50 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -155,10 +155,8 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
 		goto fail_inode;
 	}
 
-	efivar_entry_size(entry, &size);
-	err = efivar_entry_add(entry, &efivarfs_list);
-	if (err)
-		goto fail_inode;
+	__efivar_entry_get(entry, NULL, &size, NULL);
+	__efivar_entry_add(entry, &efivarfs_list);
 
 	/* copied by the above to local storage in the dentry. */
 	kfree(name);
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 53f64c14a525..56f04b6daeb0 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1064,6 +1064,7 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
 		void *data, bool duplicates, struct list_head *head);
 
 int efivar_entry_add(struct efivar_entry *entry, struct list_head *head);
+void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head);
 int efivar_entry_remove(struct efivar_entry *entry);
 
 int __efivar_entry_delete(struct efivar_entry *entry);
-- 
cgit 


From 472831d4c4b2d8eac783b256e5c829487d5310df Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Mon, 20 Jun 2022 13:17:20 +0200
Subject: efi: vars: Add thin wrapper around EFI get/set variable interface

The current efivars layer is a jumble of list iterators, shadow data
structures and safe variable manipulation helpers that really belong in
the efivarfs pseudo file system once the obsolete sysfs access method to
EFI variables is removed.

So split off a minimal efivar get/set variable API that reuses the
existing efivars_lock semaphore to mediate access to the various runtime
services, primarily to ensure that performing a SetVariable() on one CPU
while another is calling GetNextVariable() in a loop to enumerate the
contents of the EFI variable store does not result in surprises.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/firmware/efi/vars.c | 154 +++++++++++++++++++++++++++++++++++++++++---
 include/linux/efi.h         |  20 ++++++
 2 files changed, 164 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c
index 146360e2f1cb..41c82614a4b2 100644
--- a/drivers/firmware/efi/vars.c
+++ b/drivers/firmware/efi/vars.c
@@ -298,14 +298,10 @@ efivar_variable_is_removable(efi_guid_t vendor, const char *var_name,
 }
 EXPORT_SYMBOL_GPL(efivar_variable_is_removable);
 
-static efi_status_t
-check_var_size(u32 attributes, unsigned long size)
+efi_status_t check_var_size(u32 attributes, unsigned long size)
 {
 	const struct efivar_operations *fops;
 
-	if (!__efivars)
-		return EFI_UNSUPPORTED;
-
 	fops = __efivars->ops;
 
 	if (!fops->query_variable_store)
@@ -313,15 +309,12 @@ check_var_size(u32 attributes, unsigned long size)
 
 	return fops->query_variable_store(attributes, size, false);
 }
+EXPORT_SYMBOL_NS_GPL(check_var_size, EFIVAR);
 
-static efi_status_t
-check_var_size_nonblocking(u32 attributes, unsigned long size)
+efi_status_t check_var_size_nonblocking(u32 attributes, unsigned long size)
 {
 	const struct efivar_operations *fops;
 
-	if (!__efivars)
-		return EFI_UNSUPPORTED;
-
 	fops = __efivars->ops;
 
 	if (!fops->query_variable_store)
@@ -329,6 +322,7 @@ check_var_size_nonblocking(u32 attributes, unsigned long size)
 
 	return fops->query_variable_store(attributes, size, true);
 }
+EXPORT_SYMBOL_NS_GPL(check_var_size_nonblocking, EFIVAR);
 
 static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor,
 				struct list_head *head)
@@ -1220,3 +1214,143 @@ int efivar_supports_writes(void)
 	return __efivars && __efivars->ops->set_variable;
 }
 EXPORT_SYMBOL_GPL(efivar_supports_writes);
+
+/*
+ * efivar_lock() - obtain the efivar lock, wait for it if needed
+ * @return 0 on success, error code on failure
+ */
+int efivar_lock(void)
+{
+	if (down_interruptible(&efivars_lock))
+		return -EINTR;
+	if (!__efivars->ops) {
+		up(&efivars_lock);
+		return -ENODEV;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(efivar_lock, EFIVAR);
+
+/*
+ * efivar_lock() - obtain the efivar lock if it is free
+ * @return 0 on success, error code on failure
+ */
+int efivar_trylock(void)
+{
+	if (down_trylock(&efivars_lock))
+		 return -EBUSY;
+	if (!__efivars->ops) {
+		up(&efivars_lock);
+		return -ENODEV;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(efivar_trylock, EFIVAR);
+
+/*
+ * efivar_unlock() - release the efivar lock
+ */
+void efivar_unlock(void)
+{
+	up(&efivars_lock);
+}
+EXPORT_SYMBOL_NS_GPL(efivar_unlock, EFIVAR);
+
+/*
+ * efivar_get_variable() - retrieve a variable identified by name/vendor
+ *
+ * Must be called with efivars_lock held.
+ */
+efi_status_t efivar_get_variable(efi_char16_t *name, efi_guid_t *vendor,
+				 u32 *attr, unsigned long *size, void *data)
+{
+	return __efivars->ops->get_variable(name, vendor, attr, size, data);
+}
+EXPORT_SYMBOL_NS_GPL(efivar_get_variable, EFIVAR);
+
+/*
+ * efivar_get_next_variable() - enumerate the next name/vendor pair
+ *
+ * Must be called with efivars_lock held.
+ */
+efi_status_t efivar_get_next_variable(unsigned long *name_size,
+				      efi_char16_t *name, efi_guid_t *vendor)
+{
+	return __efivars->ops->get_next_variable(name_size, name, vendor);
+}
+EXPORT_SYMBOL_NS_GPL(efivar_get_next_variable, EFIVAR);
+
+/*
+ * efivar_set_variable_blocking() - local helper function for set_variable
+ *
+ * Must be called with efivars_lock held.
+ */
+static efi_status_t
+efivar_set_variable_blocking(efi_char16_t *name, efi_guid_t *vendor,
+			     u32 attr, unsigned long data_size, void *data)
+{
+	efi_status_t status;
+
+	if (data_size > 0) {
+		status = check_var_size(attr, data_size +
+					      ucs2_strsize(name, 1024));
+		if (status != EFI_SUCCESS)
+			return status;
+	}
+	return __efivars->ops->set_variable(name, vendor, attr, data_size, data);
+}
+
+/*
+ * efivar_set_variable_locked() - set a variable identified by name/vendor
+ *
+ * Must be called with efivars_lock held. If @nonblocking is set, it will use
+ * non-blocking primitives so it is guaranteed not to sleep.
+ */
+efi_status_t efivar_set_variable_locked(efi_char16_t *name, efi_guid_t *vendor,
+					u32 attr, unsigned long data_size,
+					void *data, bool nonblocking)
+{
+	efi_set_variable_t *setvar;
+	efi_status_t status;
+
+	if (!nonblocking)
+		return efivar_set_variable_blocking(name, vendor, attr,
+						    data_size, data);
+
+	/*
+	 * If no _nonblocking variant exists, the ordinary one
+	 * is assumed to be non-blocking.
+	 */
+	setvar = __efivars->ops->set_variable_nonblocking ?:
+		 __efivars->ops->set_variable;
+
+	if (data_size > 0) {
+		status = check_var_size_nonblocking(attr, data_size +
+							  ucs2_strsize(name, 1024));
+		if (status != EFI_SUCCESS)
+			return status;
+	}
+	return setvar(name, vendor, attr, data_size, data);
+}
+EXPORT_SYMBOL_NS_GPL(efivar_set_variable_locked, EFIVAR);
+
+/*
+ * efivar_set_variable() - set a variable identified by name/vendor
+ *
+ * Can be called without holding the efivars_lock. Will sleep on obtaining the
+ * lock, or on obtaining other locks that are needed in order to complete the
+ * call.
+ */
+efi_status_t efivar_set_variable(efi_char16_t *name, efi_guid_t *vendor,
+				 u32 attr, unsigned long data_size, void *data)
+{
+	efi_status_t status;
+
+	if (efivar_lock())
+		return EFI_ABORTED;
+
+	status = efivar_set_variable_blocking(name, vendor, attr, data_size, data);
+	efivar_unlock();
+	return status;
+}
+EXPORT_SYMBOL_NS_GPL(efivar_set_variable, EFIVAR);
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 56f04b6daeb0..c828ab6f0e2a 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1099,6 +1099,26 @@ bool efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data,
 bool efivar_variable_is_removable(efi_guid_t vendor, const char *name,
 				  size_t len);
 
+int efivar_lock(void);
+int efivar_trylock(void);
+void efivar_unlock(void);
+
+efi_status_t efivar_get_variable(efi_char16_t *name, efi_guid_t *vendor,
+				 u32 *attr, unsigned long *size, void *data);
+
+efi_status_t efivar_get_next_variable(unsigned long *name_size,
+				      efi_char16_t *name, efi_guid_t *vendor);
+
+efi_status_t efivar_set_variable_locked(efi_char16_t *name, efi_guid_t *vendor,
+					u32 attr, unsigned long data_size,
+					void *data, bool nonblocking);
+
+efi_status_t efivar_set_variable(efi_char16_t *name, efi_guid_t *vendor,
+				 u32 attr, unsigned long data_size, void *data);
+
+efi_status_t check_var_size(u32 attributes, unsigned long size);
+efi_status_t check_var_size_nonblocking(u32 attributes, unsigned long size);
+
 #if IS_ENABLED(CONFIG_EFI_CAPSULE_LOADER)
 extern bool efi_capsule_pending(int *reset_type);
 
-- 
cgit 


From 859748255b43460685e93a1f8a40b8cdc3be02f2 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Mon, 20 Jun 2022 13:21:26 +0200
Subject: efi: pstore: Omit efivars caching EFI varstore access layer

Avoid the efivars layer and simply call the newly introduced EFI
varstore helpers instead. This simplifies the code substantially, and
also allows us to remove some hacks in the shared efivars layer that
were added for efi-pstore specifically.

In order to be able to delete the EFI variable associated with a record,
store the UTF-16 name of the variable in the pstore record's priv field.
That way, we don't have to make guesses regarding which variable the
record may have been loaded from.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/firmware/efi/Kconfig      |   1 +
 drivers/firmware/efi/efi-pstore.c | 374 +++++++++-----------------------------
 drivers/firmware/efi/efivars.c    |  12 +-
 drivers/firmware/efi/vars.c       |  12 +-
 include/linux/efi.h               |   2 -
 5 files changed, 91 insertions(+), 310 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index 6fd4414c4836..7fe8b5c686d2 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -22,6 +22,7 @@ config EFI_ESRT
 config EFI_VARS_PSTORE
 	tristate "Register efivars backend for pstore"
 	depends on PSTORE
+	select UCS2_STRING
 	default y
 	help
 	  Say Y here to enable use efivars as a backend to pstore. This
diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c
index 0d80cc7ff6ca..3bddc152fcd4 100644
--- a/drivers/firmware/efi/efi-pstore.c
+++ b/drivers/firmware/efi/efi-pstore.c
@@ -6,6 +6,8 @@
 #include <linux/slab.h>
 #include <linux/ucs2_string.h>
 
+MODULE_IMPORT_NS(EFIVAR);
+
 #define DUMP_NAME_LEN 66
 
 #define EFIVARS_DATA_SIZE_MAX 1024
@@ -20,18 +22,25 @@ module_param_named(pstore_disable, efivars_pstore_disable, bool, 0644);
 	 EFI_VARIABLE_BOOTSERVICE_ACCESS | \
 	 EFI_VARIABLE_RUNTIME_ACCESS)
 
-static LIST_HEAD(efi_pstore_list);
-static DECLARE_WORK(efivar_work, NULL);
-
 static int efi_pstore_open(struct pstore_info *psi)
 {
-	psi->data = NULL;
+	int err;
+
+	err = efivar_lock();
+	if (err)
+		return err;
+
+	psi->data = kzalloc(EFIVARS_DATA_SIZE_MAX, GFP_KERNEL);
+	if (!psi->data)
+		return -ENOMEM;
+
 	return 0;
 }
 
 static int efi_pstore_close(struct pstore_info *psi)
 {
-	psi->data = NULL;
+	efivar_unlock();
+	kfree(psi->data);
 	return 0;
 }
 
@@ -40,22 +49,17 @@ static inline u64 generic_id(u64 timestamp, unsigned int part, int count)
 	return (timestamp * 100 + part) * 1000 + count;
 }
 
-static int efi_pstore_read_func(struct efivar_entry *entry,
-				struct pstore_record *record)
+static int efi_pstore_read_func(struct pstore_record *record,
+				efi_char16_t *varname)
 {
-	efi_guid_t vendor = LINUX_EFI_CRASH_GUID;
+	unsigned long wlen, size = EFIVARS_DATA_SIZE_MAX;
 	char name[DUMP_NAME_LEN], data_type;
-	int i;
+	efi_status_t status;
 	int cnt;
 	unsigned int part;
-	unsigned long size;
 	u64 time;
 
-	if (efi_guidcmp(entry->var.VendorGuid, vendor))
-		return 0;
-
-	for (i = 0; i < DUMP_NAME_LEN; i++)
-		name[i] = entry->var.VariableName[i];
+	ucs2_as_utf8(name, varname, DUMP_NAME_LEN);
 
 	if (sscanf(name, "dump-type%u-%u-%d-%llu-%c",
 		   &record->type, &part, &cnt, &time, &data_type) == 5) {
@@ -95,161 +99,75 @@ static int efi_pstore_read_func(struct efivar_entry *entry,
 	} else
 		return 0;
 
-	entry->var.DataSize = 1024;
-	__efivar_entry_get(entry, &entry->var.Attributes,
-			   &entry->var.DataSize, entry->var.Data);
-	size = entry->var.DataSize;
-	memcpy(record->buf, entry->var.Data,
-	       (size_t)min_t(unsigned long, EFIVARS_DATA_SIZE_MAX, size));
-
-	return size;
-}
-
-/**
- * efi_pstore_scan_sysfs_enter
- * @pos: scanning entry
- * @next: next entry
- * @head: list head
- */
-static void efi_pstore_scan_sysfs_enter(struct efivar_entry *pos,
-					struct efivar_entry *next,
-					struct list_head *head)
-{
-	pos->scanning = true;
-	if (&next->list != head)
-		next->scanning = true;
-}
-
-/**
- * __efi_pstore_scan_sysfs_exit
- * @entry: deleting entry
- * @turn_off_scanning: Check if a scanning flag should be turned off
- */
-static inline int __efi_pstore_scan_sysfs_exit(struct efivar_entry *entry,
-						bool turn_off_scanning)
-{
-	if (entry->deleting) {
-		list_del(&entry->list);
-		efivar_entry_iter_end();
-		kfree(entry);
-		if (efivar_entry_iter_begin())
-			return -EINTR;
-	} else if (turn_off_scanning)
-		entry->scanning = false;
-
-	return 0;
-}
-
-/**
- * efi_pstore_scan_sysfs_exit
- * @pos: scanning entry
- * @next: next entry
- * @head: list head
- * @stop: a flag checking if scanning will stop
- */
-static int efi_pstore_scan_sysfs_exit(struct efivar_entry *pos,
-				       struct efivar_entry *next,
-				       struct list_head *head, bool stop)
-{
-	int ret = __efi_pstore_scan_sysfs_exit(pos, true);
-
-	if (ret)
-		return ret;
-
-	if (stop)
-		ret = __efi_pstore_scan_sysfs_exit(next, &next->list != head);
-	return ret;
-}
+	record->buf = kmalloc(size, GFP_KERNEL);
+	if (!record->buf)
+		return -ENOMEM;
 
-/**
- * efi_pstore_sysfs_entry_iter
- *
- * @record: pstore record to pass to callback
- *
- * You MUST call efivar_entry_iter_begin() before this function, and
- * efivar_entry_iter_end() afterwards.
- *
- */
-static int efi_pstore_sysfs_entry_iter(struct pstore_record *record)
-{
-	struct efivar_entry **pos = (struct efivar_entry **)&record->psi->data;
-	struct efivar_entry *entry, *n;
-	struct list_head *head = &efi_pstore_list;
-	int size = 0;
-	int ret;
-
-	if (!*pos) {
-		list_for_each_entry_safe(entry, n, head, list) {
-			efi_pstore_scan_sysfs_enter(entry, n, head);
-
-			size = efi_pstore_read_func(entry, record);
-			ret = efi_pstore_scan_sysfs_exit(entry, n, head,
-							 size < 0);
-			if (ret)
-				return ret;
-			if (size)
-				break;
-		}
-		*pos = n;
-		return size;
+	status = efivar_get_variable(varname, &LINUX_EFI_CRASH_GUID, NULL,
+				     &size, record->buf);
+	if (status != EFI_SUCCESS) {
+		kfree(record->buf);
+		return -EIO;
 	}
 
-	list_for_each_entry_safe_from((*pos), n, head, list) {
-		efi_pstore_scan_sysfs_enter((*pos), n, head);
-
-		size = efi_pstore_read_func((*pos), record);
-		ret = efi_pstore_scan_sysfs_exit((*pos), n, head, size < 0);
-		if (ret)
-			return ret;
-		if (size)
-			break;
+	/*
+	 * Store the name of the variable in the pstore_record priv field, so
+	 * we can reuse it later if we need to delete the EFI variable from the
+	 * variable store.
+	 */
+	wlen = (ucs2_strnlen(varname, DUMP_NAME_LEN) + 1) * sizeof(efi_char16_t);
+	record->priv = kmemdup(varname, wlen, GFP_KERNEL);
+	if (!record->priv) {
+		kfree(record->buf);
+		return -ENOMEM;
 	}
-	*pos = n;
+
 	return size;
 }
 
-/**
- * efi_pstore_read
- *
- * This function returns a size of NVRAM entry logged via efi_pstore_write().
- * The meaning and behavior of efi_pstore/pstore are as below.
- *
- * size > 0: Got data of an entry logged via efi_pstore_write() successfully,
- *           and pstore filesystem will continue reading subsequent entries.
- * size == 0: Entry was not logged via efi_pstore_write(),
- *            and efi_pstore driver will continue reading subsequent entries.
- * size < 0: Failed to get data of entry logging via efi_pstore_write(),
- *           and pstore will stop reading entry.
- */
 static ssize_t efi_pstore_read(struct pstore_record *record)
 {
-	ssize_t size;
+	efi_char16_t *varname = record->psi->data;
+	efi_guid_t guid = LINUX_EFI_CRASH_GUID;
+	unsigned long varname_size;
+	efi_status_t status;
 
-	record->buf = kzalloc(EFIVARS_DATA_SIZE_MAX, GFP_KERNEL);
-	if (!record->buf)
-		return -ENOMEM;
+	for (;;) {
+		varname_size = EFIVARS_DATA_SIZE_MAX;
 
-	if (efivar_entry_iter_begin()) {
-		size = -EINTR;
-		goto out;
-	}
-	size = efi_pstore_sysfs_entry_iter(record);
-	efivar_entry_iter_end();
+		/*
+		 * If this is the first read() call in the pstore enumeration,
+		 * varname will be the empty string, and the GetNextVariable()
+		 * runtime service call will return the first EFI variable in
+		 * its own enumeration order, ignoring the guid argument.
+		 *
+		 * Subsequent calls to GetNextVariable() must pass the name and
+		 * guid values returned by the previous call, which is why we
+		 * store varname in record->psi->data. Given that we only
+		 * enumerate variables with the efi-pstore GUID, there is no
+		 * need to record the guid return value.
+		 */
+		status = efivar_get_next_variable(&varname_size, varname, &guid);
+		if (status == EFI_NOT_FOUND)
+			return 0;
 
-out:
-	if (size <= 0) {
-		kfree(record->buf);
-		record->buf = NULL;
+		if (status != EFI_SUCCESS)
+			return -EIO;
+
+		/* skip variables that don't concern us */
+		if (efi_guidcmp(guid, LINUX_EFI_CRASH_GUID))
+			continue;
+
+		return efi_pstore_read_func(record, varname);
 	}
-	return size;
 }
 
 static int efi_pstore_write(struct pstore_record *record)
 {
 	char name[DUMP_NAME_LEN];
 	efi_char16_t efi_name[DUMP_NAME_LEN];
-	efi_guid_t vendor = LINUX_EFI_CRASH_GUID;
-	int i, ret = 0;
+	efi_status_t status;
+	int i;
 
 	record->id = generic_id(record->time.tv_sec, record->part,
 				record->count);
@@ -265,88 +183,26 @@ static int efi_pstore_write(struct pstore_record *record)
 	for (i = 0; i < DUMP_NAME_LEN; i++)
 		efi_name[i] = name[i];
 
-	ret = efivar_entry_set_safe(efi_name, vendor, PSTORE_EFI_ATTRIBUTES,
-			      false, record->size, record->psi->buf);
-
-	if (record->reason == KMSG_DUMP_OOPS && try_module_get(THIS_MODULE))
-		if (!schedule_work(&efivar_work))
-			module_put(THIS_MODULE);
-
-	return ret;
+	if (efivar_trylock())
+		return -EBUSY;
+	status = efivar_set_variable_locked(efi_name, &LINUX_EFI_CRASH_GUID,
+					    PSTORE_EFI_ATTRIBUTES,
+					    record->size, record->psi->buf,
+					    true);
+	efivar_unlock();
+	return status == EFI_SUCCESS ? 0 : -EIO;
 };
 
-/*
- * Clean up an entry with the same name
- */
-static int efi_pstore_erase_func(struct efivar_entry *entry, void *data)
-{
-	efi_char16_t *efi_name = data;
-	efi_guid_t vendor = LINUX_EFI_CRASH_GUID;
-	unsigned long ucs2_len = ucs2_strlen(efi_name);
-
-	if (efi_guidcmp(entry->var.VendorGuid, vendor))
-		return 0;
-
-	if (ucs2_strncmp(entry->var.VariableName, efi_name, (size_t)ucs2_len))
-		return 0;
-
-	if (entry->scanning) {
-		/*
-		 * Skip deletion because this entry will be deleted
-		 * after scanning is completed.
-		 */
-		entry->deleting = true;
-	} else
-		list_del(&entry->list);
-
-	/* found */
-	__efivar_entry_delete(entry);
-
-	return 1;
-}
-
-static int efi_pstore_erase_name(const char *name)
-{
-	struct efivar_entry *entry = NULL;
-	efi_char16_t efi_name[DUMP_NAME_LEN];
-	int found, i;
-
-	for (i = 0; i < DUMP_NAME_LEN; i++) {
-		efi_name[i] = name[i];
-		if (name[i] == '\0')
-			break;
-	}
-
-	if (efivar_entry_iter_begin())
-		return -EINTR;
-
-	found = __efivar_entry_iter(efi_pstore_erase_func, &efi_pstore_list,
-				    efi_name, &entry);
-	efivar_entry_iter_end();
-
-	if (found && !entry->scanning)
-		kfree(entry);
-
-	return found ? 0 : -ENOENT;
-}
-
 static int efi_pstore_erase(struct pstore_record *record)
 {
-	char name[DUMP_NAME_LEN];
-	int ret;
+	efi_status_t status;
 
-	snprintf(name, sizeof(name), "dump-type%u-%u-%d-%lld",
-		 record->type, record->part, record->count,
-		 (long long)record->time.tv_sec);
-	ret = efi_pstore_erase_name(name);
-	if (ret != -ENOENT)
-		return ret;
-
-	snprintf(name, sizeof(name), "dump-type%u-%u-%lld",
-		record->type, record->part, (long long)record->time.tv_sec);
-	ret = efi_pstore_erase_name(name);
+	status = efivar_set_variable(record->priv, &LINUX_EFI_CRASH_GUID,
+				     PSTORE_EFI_ATTRIBUTES, 0, NULL);
 
-	return ret;
+	if (status != EFI_SUCCESS && status != EFI_NOT_FOUND)
+		return -EIO;
+	return 0;
 }
 
 static struct pstore_info efi_pstore_info = {
@@ -360,74 +216,14 @@ static struct pstore_info efi_pstore_info = {
 	.erase		= efi_pstore_erase,
 };
 
-static int efi_pstore_callback(efi_char16_t *name, efi_guid_t vendor,
-			       unsigned long name_size, void *data)
-{
-	struct efivar_entry *entry;
-
-	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
-	if (!entry)
-		return -ENOMEM;
-
-	memcpy(entry->var.VariableName, name, name_size);
-	entry->var.VendorGuid = vendor;
-
-	__efivar_entry_add(entry, &efi_pstore_list);
-
-	return 0;
-}
-
-static int efi_pstore_update_entry(efi_char16_t *name, efi_guid_t vendor,
-				   unsigned long name_size, void *data)
-{
-	struct efivar_entry *entry = data;
-
-	if (efivar_entry_find(name, vendor, &efi_pstore_list, false))
-		return 0;
-
-	memcpy(entry->var.VariableName, name, name_size);
-	memcpy(&(entry->var.VendorGuid), &vendor, sizeof(efi_guid_t));
-
-	return 1;
-}
-
-static void efi_pstore_update_entries(struct work_struct *work)
-{
-	struct efivar_entry *entry;
-	int err;
-
-	/* Add new sysfs entries */
-	while (1) {
-		entry = kzalloc(sizeof(*entry), GFP_KERNEL);
-		if (!entry)
-			return;
-
-		err = efivar_init(efi_pstore_update_entry, entry,
-				  false, &efi_pstore_list);
-		if (!err)
-			break;
-
-		efivar_entry_add(entry, &efi_pstore_list);
-	}
-
-	kfree(entry);
-	module_put(THIS_MODULE);
-}
-
 static __init int efivars_pstore_init(void)
 {
-	int ret;
-
-	if (!efivars_kobject() || !efivar_supports_writes())
+	if (!efivar_supports_writes())
 		return 0;
 
 	if (efivars_pstore_disable)
 		return 0;
 
-	ret = efivar_init(efi_pstore_callback, NULL, true, &efi_pstore_list);
-	if (ret)
-		return ret;
-
 	efi_pstore_info.buf = kmalloc(4096, GFP_KERNEL);
 	if (!efi_pstore_info.buf)
 		return -ENOMEM;
@@ -440,8 +236,6 @@ static __init int efivars_pstore_init(void)
 		efi_pstore_info.bufsize = 0;
 	}
 
-	INIT_WORK(&efivar_work, efi_pstore_update_entries);
-
 	return 0;
 }
 
diff --git a/drivers/firmware/efi/efivars.c b/drivers/firmware/efi/efivars.c
index c19db0b35c0d..8341fb15f62e 100644
--- a/drivers/firmware/efi/efivars.c
+++ b/drivers/firmware/efi/efivars.c
@@ -467,16 +467,12 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj,
 	else if (__efivar_entry_delete(entry))
 		err = -EIO;
 
-	if (err) {
-		efivar_entry_iter_end();
+	efivar_entry_iter_end();
+
+	if (err)
 		return err;
-	}
 
-	if (!entry->scanning) {
-		efivar_entry_iter_end();
-		efivar_unregister(entry);
-	} else
-		efivar_entry_iter_end();
+	efivar_unregister(entry);
 
 	/* It's dead Jim.... */
 	return count;
diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c
index 41c82614a4b2..5640ffa81544 100644
--- a/drivers/firmware/efi/vars.c
+++ b/drivers/firmware/efi/vars.c
@@ -821,16 +821,8 @@ struct efivar_entry *efivar_entry_find(efi_char16_t *name, efi_guid_t guid,
 	if (!found)
 		return NULL;
 
-	if (remove) {
-		if (entry->scanning) {
-			/*
-			 * The entry will be deleted
-			 * after scanning is completed.
-			 */
-			entry->deleting = true;
-		} else
-			list_del(&entry->list);
-	}
+	if (remove)
+		list_del(&entry->list);
 
 	return entry;
 }
diff --git a/include/linux/efi.h b/include/linux/efi.h
index c828ab6f0e2a..08bc6215e3b4 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1043,8 +1043,6 @@ struct efivar_entry {
 	struct efi_variable var;
 	struct list_head list;
 	struct kobject kobj;
-	bool scanning;
-	bool deleting;
 };
 
 static inline void
-- 
cgit 


From 0f5b2c69a4cbe4166ca24b76d5ada98ed2867741 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Mon, 20 Jun 2022 13:34:03 +0200
Subject: efi: vars: Remove deprecated 'efivars' sysfs interface

Commit 5d9db883761a ("efi: Add support for a UEFI variable filesystem")
dated Oct 5, 2012, introduced a new efivarfs pseudo-filesystem to
replace the efivars sysfs interface that was used up to that point to
expose EFI variables to user space.

The main problem with the sysfs interface was that it only supported up
to 1024 bytes of payload per file, whereas the underlying variables
themselves are only bounded by a platform specific per-variable and
global limit that is typically much higher than 1024 bytes.

The deprecated sysfs interface is only enabled on x86 and Itanium, other
EFI enabled architectures only support the efivarfs pseudo-filesystem.

So let's finally rip off the band aid, and drop the old interface
entirely. This will make it easier to refactor and clean up the
underlying infrastructure that is shared between efivars, efivarfs and
efi-pstore, and is long overdue for a makeover.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 Documentation/x86/x86_64/uefi.rst        |   2 +-
 arch/arm/configs/milbeaut_m10v_defconfig |   1 -
 arch/ia64/configs/bigsur_defconfig       |   1 -
 arch/ia64/configs/generic_defconfig      |   1 -
 arch/ia64/configs/gensparse_defconfig    |   1 -
 arch/ia64/configs/tiger_defconfig        |   1 -
 arch/ia64/configs/zx1_defconfig          |   1 -
 arch/x86/configs/i386_defconfig          |   1 -
 arch/x86/configs/x86_64_defconfig        |   1 -
 drivers/firmware/efi/Kconfig             |  12 -
 drivers/firmware/efi/Makefile            |   1 -
 drivers/firmware/efi/efivars.c           | 660 -------------------------------
 drivers/firmware/efi/vars.c              | 265 +------------
 include/linux/efi.h                      |  18 -
 14 files changed, 3 insertions(+), 963 deletions(-)
 delete mode 100644 drivers/firmware/efi/efivars.c

(limited to 'include')

diff --git a/Documentation/x86/x86_64/uefi.rst b/Documentation/x86/x86_64/uefi.rst
index 3b894103a734..fbc30c9a071d 100644
--- a/Documentation/x86/x86_64/uefi.rst
+++ b/Documentation/x86/x86_64/uefi.rst
@@ -29,7 +29,7 @@ Mechanics
   be selected::
 
 	CONFIG_EFI=y
-	CONFIG_EFI_VARS=y or m		# optional
+	CONFIG_EFIVAR_FS=y or m		# optional
 
 - Create a VFAT partition on the disk
 - Copy the following to the VFAT partition:
diff --git a/arch/arm/configs/milbeaut_m10v_defconfig b/arch/arm/configs/milbeaut_m10v_defconfig
index 7c07f9893a0f..9b4789af0201 100644
--- a/arch/arm/configs/milbeaut_m10v_defconfig
+++ b/arch/arm/configs/milbeaut_m10v_defconfig
@@ -44,7 +44,6 @@ CONFIG_ARM_CPUIDLE=y
 CONFIG_VFP=y
 CONFIG_NEON=y
 CONFIG_KERNEL_MODE_NEON=y
-CONFIG_EFI_VARS=m
 CONFIG_EFI_CAPSULE_LOADER=m
 CONFIG_ARM_CRYPTO=y
 CONFIG_CRYPTO_SHA1_ARM_NEON=m
diff --git a/arch/ia64/configs/bigsur_defconfig b/arch/ia64/configs/bigsur_defconfig
index 0341a67cc1bf..a3724882295c 100644
--- a/arch/ia64/configs/bigsur_defconfig
+++ b/arch/ia64/configs/bigsur_defconfig
@@ -10,7 +10,6 @@ CONFIG_SMP=y
 CONFIG_NR_CPUS=2
 CONFIG_PREEMPT=y
 CONFIG_IA64_PALINFO=y
-CONFIG_EFI_VARS=y
 CONFIG_BINFMT_MISC=m
 CONFIG_ACPI_BUTTON=m
 CONFIG_ACPI_FAN=m
diff --git a/arch/ia64/configs/generic_defconfig b/arch/ia64/configs/generic_defconfig
index 8916a2850c48..a3dff482a3d7 100644
--- a/arch/ia64/configs/generic_defconfig
+++ b/arch/ia64/configs/generic_defconfig
@@ -21,7 +21,6 @@ CONFIG_IA64_MCA_RECOVERY=y
 CONFIG_IA64_PALINFO=y
 CONFIG_KEXEC=y
 CONFIG_CRASH_DUMP=y
-CONFIG_EFI_VARS=y
 CONFIG_BINFMT_MISC=m
 CONFIG_ACPI_BUTTON=m
 CONFIG_ACPI_FAN=m
diff --git a/arch/ia64/configs/gensparse_defconfig b/arch/ia64/configs/gensparse_defconfig
index 281eb9c544f9..4cd46105b020 100644
--- a/arch/ia64/configs/gensparse_defconfig
+++ b/arch/ia64/configs/gensparse_defconfig
@@ -18,7 +18,6 @@ CONFIG_HOTPLUG_CPU=y
 CONFIG_SPARSEMEM_MANUAL=y
 CONFIG_IA64_MCA_RECOVERY=y
 CONFIG_IA64_PALINFO=y
-CONFIG_EFI_VARS=y
 CONFIG_BINFMT_MISC=m
 CONFIG_ACPI_BUTTON=m
 CONFIG_ACPI_FAN=m
diff --git a/arch/ia64/configs/tiger_defconfig b/arch/ia64/configs/tiger_defconfig
index b4f9819a1a45..a2045d73adfa 100644
--- a/arch/ia64/configs/tiger_defconfig
+++ b/arch/ia64/configs/tiger_defconfig
@@ -23,7 +23,6 @@ CONFIG_FORCE_CPEI_RETARGET=y
 CONFIG_IA64_MCA_RECOVERY=y
 CONFIG_IA64_PALINFO=y
 CONFIG_KEXEC=y
-CONFIG_EFI_VARS=y
 CONFIG_BINFMT_MISC=m
 CONFIG_ACPI_BUTTON=m
 CONFIG_ACPI_FAN=m
diff --git a/arch/ia64/configs/zx1_defconfig b/arch/ia64/configs/zx1_defconfig
index 851d8594cdb8..99f8b2a0332b 100644
--- a/arch/ia64/configs/zx1_defconfig
+++ b/arch/ia64/configs/zx1_defconfig
@@ -12,7 +12,6 @@ CONFIG_FLATMEM_MANUAL=y
 CONFIG_IA64_MCA_RECOVERY=y
 CONFIG_IA64_PALINFO=y
 CONFIG_CRASH_DUMP=y
-CONFIG_EFI_VARS=y
 CONFIG_BINFMT_MISC=y
 CONFIG_HOTPLUG_PCI=y
 CONFIG_HOTPLUG_PCI_ACPI=y
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 98a4852ed6a0..7207219509f6 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -135,7 +135,6 @@ CONFIG_DEVTMPFS=y
 CONFIG_DEVTMPFS_MOUNT=y
 CONFIG_DEBUG_DEVRES=y
 CONFIG_CONNECTOR=y
-CONFIG_EFI_VARS=y
 CONFIG_EFI_CAPSULE_LOADER=y
 CONFIG_BLK_DEV_LOOP=y
 CONFIG_VIRTIO_BLK=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 69784505a7a8..5ce67b73e218 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -134,7 +134,6 @@ CONFIG_DEVTMPFS=y
 CONFIG_DEVTMPFS_MOUNT=y
 CONFIG_DEBUG_DEVRES=y
 CONFIG_CONNECTOR=y
-CONFIG_EFI_VARS=y
 CONFIG_BLK_DEV_LOOP=y
 CONFIG_VIRTIO_BLK=y
 CONFIG_BLK_DEV_SD=y
diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index 7fe8b5c686d2..6cb7384ad2ac 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -2,18 +2,6 @@
 menu "EFI (Extensible Firmware Interface) Support"
 	depends on EFI
 
-config EFI_VARS
-	tristate "EFI Variable Support via sysfs"
-	depends on EFI && (X86 || IA64)
-	default n
-	help
-	  If you say Y here, you are able to get EFI (Extensible Firmware
-	  Interface) variable information via sysfs.  You may read,
-	  write, create, and destroy EFI variables through this interface.
-	  Note that this driver is only retained for compatibility with
-	  legacy users: new users should use the efivarfs filesystem
-	  instead.
-
 config EFI_ESRT
 	bool
 	depends on EFI && !IA64
diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile
index c02ff25dd477..8d151e332584 100644
--- a/drivers/firmware/efi/Makefile
+++ b/drivers/firmware/efi/Makefile
@@ -17,7 +17,6 @@ ifneq ($(CONFIG_EFI_CAPSULE_LOADER),)
 obj-$(CONFIG_EFI)			+= capsule.o
 endif
 obj-$(CONFIG_EFI_PARAMS_FROM_FDT)	+= fdtparams.o
-obj-$(CONFIG_EFI_VARS)			+= efivars.o
 obj-$(CONFIG_EFI_ESRT)			+= esrt.o
 obj-$(CONFIG_EFI_VARS_PSTORE)		+= efi-pstore.o
 obj-$(CONFIG_UEFI_CPER)			+= cper.o
diff --git a/drivers/firmware/efi/efivars.c b/drivers/firmware/efi/efivars.c
deleted file mode 100644
index 801a65582172..000000000000
--- a/drivers/firmware/efi/efivars.c
+++ /dev/null
@@ -1,660 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * Originally from efivars.c,
- *
- * Copyright (C) 2001,2003,2004 Dell <Matt_Domsch@dell.com>
- * Copyright (C) 2004 Intel Corporation <matthew.e.tolentino@intel.com>
- *
- * This code takes all variables accessible from EFI runtime and
- *  exports them via sysfs
- */
-
-#include <linux/efi.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/ucs2_string.h>
-#include <linux/compat.h>
-
-#define EFIVARS_VERSION "0.08"
-#define EFIVARS_DATE "2004-May-17"
-
-MODULE_AUTHOR("Matt Domsch <Matt_Domsch@Dell.com>");
-MODULE_DESCRIPTION("sysfs interface to EFI Variables");
-MODULE_LICENSE("GPL");
-MODULE_VERSION(EFIVARS_VERSION);
-
-static LIST_HEAD(efivar_sysfs_list);
-
-static struct kset *efivars_kset;
-
-static struct bin_attribute *efivars_new_var;
-static struct bin_attribute *efivars_del_var;
-
-struct compat_efi_variable {
-	efi_char16_t  VariableName[EFI_VAR_NAME_LEN/sizeof(efi_char16_t)];
-	efi_guid_t    VendorGuid;
-	__u32         DataSize;
-	__u8          Data[1024];
-	__u32         Status;
-	__u32         Attributes;
-} __packed;
-
-struct efivar_attribute {
-	struct attribute attr;
-	ssize_t (*show) (struct efivar_entry *entry, char *buf);
-	ssize_t (*store)(struct efivar_entry *entry, const char *buf, size_t count);
-};
-
-#define EFIVAR_ATTR(_name, _mode, _show, _store) \
-struct efivar_attribute efivar_attr_##_name = { \
-	.attr = {.name = __stringify(_name), .mode = _mode}, \
-	.show = _show, \
-	.store = _store, \
-};
-
-#define to_efivar_attr(_attr) container_of(_attr, struct efivar_attribute, attr)
-#define to_efivar_entry(obj)  container_of(obj, struct efivar_entry, kobj)
-
-/*
- * Prototype for sysfs creation function
- */
-static int
-efivar_create_sysfs_entry(struct efivar_entry *new_var);
-
-static ssize_t
-efivar_guid_read(struct efivar_entry *entry, char *buf)
-{
-	struct efi_variable *var = &entry->var;
-	char *str = buf;
-
-	if (!entry || !buf)
-		return 0;
-
-	efi_guid_to_str(&var->VendorGuid, str);
-	str += strlen(str);
-	str += sprintf(str, "\n");
-
-	return str - buf;
-}
-
-static ssize_t
-efivar_attr_read(struct efivar_entry *entry, char *buf)
-{
-	struct efi_variable *var = &entry->var;
-	unsigned long size = sizeof(var->Data);
-	char *str = buf;
-	int ret;
-
-	if (!entry || !buf)
-		return -EINVAL;
-
-	ret = efivar_entry_get(entry, &var->Attributes, &size, var->Data);
-	var->DataSize = size;
-	if (ret)
-		return -EIO;
-
-	if (var->Attributes & EFI_VARIABLE_NON_VOLATILE)
-		str += sprintf(str, "EFI_VARIABLE_NON_VOLATILE\n");
-	if (var->Attributes & EFI_VARIABLE_BOOTSERVICE_ACCESS)
-		str += sprintf(str, "EFI_VARIABLE_BOOTSERVICE_ACCESS\n");
-	if (var->Attributes & EFI_VARIABLE_RUNTIME_ACCESS)
-		str += sprintf(str, "EFI_VARIABLE_RUNTIME_ACCESS\n");
-	if (var->Attributes & EFI_VARIABLE_HARDWARE_ERROR_RECORD)
-		str += sprintf(str, "EFI_VARIABLE_HARDWARE_ERROR_RECORD\n");
-	if (var->Attributes & EFI_VARIABLE_AUTHENTICATED_WRITE_ACCESS)
-		str += sprintf(str,
-			"EFI_VARIABLE_AUTHENTICATED_WRITE_ACCESS\n");
-	if (var->Attributes &
-			EFI_VARIABLE_TIME_BASED_AUTHENTICATED_WRITE_ACCESS)
-		str += sprintf(str,
-			"EFI_VARIABLE_TIME_BASED_AUTHENTICATED_WRITE_ACCESS\n");
-	if (var->Attributes & EFI_VARIABLE_APPEND_WRITE)
-		str += sprintf(str, "EFI_VARIABLE_APPEND_WRITE\n");
-	return str - buf;
-}
-
-static ssize_t
-efivar_size_read(struct efivar_entry *entry, char *buf)
-{
-	struct efi_variable *var = &entry->var;
-	unsigned long size = sizeof(var->Data);
-	char *str = buf;
-	int ret;
-
-	if (!entry || !buf)
-		return -EINVAL;
-
-	ret = efivar_entry_get(entry, &var->Attributes, &size, var->Data);
-	var->DataSize = size;
-	if (ret)
-		return -EIO;
-
-	str += sprintf(str, "0x%lx\n", var->DataSize);
-	return str - buf;
-}
-
-static ssize_t
-efivar_data_read(struct efivar_entry *entry, char *buf)
-{
-	struct efi_variable *var = &entry->var;
-	unsigned long size = sizeof(var->Data);
-	int ret;
-
-	if (!entry || !buf)
-		return -EINVAL;
-
-	ret = efivar_entry_get(entry, &var->Attributes, &size, var->Data);
-	var->DataSize = size;
-	if (ret)
-		return -EIO;
-
-	memcpy(buf, var->Data, var->DataSize);
-	return var->DataSize;
-}
-
-static inline int
-sanity_check(struct efi_variable *var, efi_char16_t *name, efi_guid_t vendor,
-	     unsigned long size, u32 attributes, u8 *data)
-{
-	/*
-	 * If only updating the variable data, then the name
-	 * and guid should remain the same
-	 */
-	if (memcmp(name, var->VariableName, sizeof(var->VariableName)) ||
-		efi_guidcmp(vendor, var->VendorGuid)) {
-		printk(KERN_ERR "efivars: Cannot edit the wrong variable!\n");
-		return -EINVAL;
-	}
-
-	if ((size <= 0) || (attributes == 0)){
-		printk(KERN_ERR "efivars: DataSize & Attributes must be valid!\n");
-		return -EINVAL;
-	}
-
-	if ((attributes & ~EFI_VARIABLE_MASK) != 0 ||
-	    efivar_validate(vendor, name, data, size) == false) {
-		printk(KERN_ERR "efivars: Malformed variable content\n");
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static void
-copy_out_compat(struct efi_variable *dst, struct compat_efi_variable *src)
-{
-	memcpy(dst->VariableName, src->VariableName, EFI_VAR_NAME_LEN);
-	memcpy(dst->Data, src->Data, sizeof(src->Data));
-
-	dst->VendorGuid = src->VendorGuid;
-	dst->DataSize = src->DataSize;
-	dst->Attributes = src->Attributes;
-}
-
-/*
- * We allow each variable to be edited via rewriting the
- * entire efi variable structure.
- */
-static ssize_t
-efivar_store_raw(struct efivar_entry *entry, const char *buf, size_t count)
-{
-	struct efi_variable *new_var, *var = &entry->var;
-	efi_char16_t *name;
-	unsigned long size;
-	efi_guid_t vendor;
-	u32 attributes;
-	u8 *data;
-	int err;
-
-	if (!entry || !buf)
-		return -EINVAL;
-
-	if (in_compat_syscall()) {
-		struct compat_efi_variable *compat;
-
-		if (count != sizeof(*compat))
-			return -EINVAL;
-
-		compat = (struct compat_efi_variable *)buf;
-		attributes = compat->Attributes;
-		vendor = compat->VendorGuid;
-		name = compat->VariableName;
-		size = compat->DataSize;
-		data = compat->Data;
-
-		err = sanity_check(var, name, vendor, size, attributes, data);
-		if (err)
-			return err;
-
-		copy_out_compat(&entry->var, compat);
-	} else {
-		if (count != sizeof(struct efi_variable))
-			return -EINVAL;
-
-		new_var = (struct efi_variable *)buf;
-
-		attributes = new_var->Attributes;
-		vendor = new_var->VendorGuid;
-		name = new_var->VariableName;
-		size = new_var->DataSize;
-		data = new_var->Data;
-
-		err = sanity_check(var, name, vendor, size, attributes, data);
-		if (err)
-			return err;
-
-		memcpy(&entry->var, new_var, count);
-	}
-
-	err = efivar_entry_set(entry, attributes, size, data, NULL);
-	if (err) {
-		printk(KERN_WARNING "efivars: set_variable() failed: status=%d\n", err);
-		return -EIO;
-	}
-
-	return count;
-}
-
-static ssize_t
-efivar_show_raw(struct efivar_entry *entry, char *buf)
-{
-	struct efi_variable *var = &entry->var;
-	struct compat_efi_variable *compat;
-	unsigned long datasize = sizeof(var->Data);
-	size_t size;
-	int ret;
-
-	if (!entry || !buf)
-		return 0;
-
-	ret = efivar_entry_get(entry, &var->Attributes, &datasize, var->Data);
-	var->DataSize = datasize;
-	if (ret)
-		return -EIO;
-
-	if (in_compat_syscall()) {
-		compat = (struct compat_efi_variable *)buf;
-
-		size = sizeof(*compat);
-		memcpy(compat->VariableName, var->VariableName,
-			EFI_VAR_NAME_LEN);
-		memcpy(compat->Data, var->Data, sizeof(compat->Data));
-
-		compat->VendorGuid = var->VendorGuid;
-		compat->DataSize = var->DataSize;
-		compat->Attributes = var->Attributes;
-	} else {
-		size = sizeof(*var);
-		memcpy(buf, var, size);
-	}
-
-	return size;
-}
-
-/*
- * Generic read/write functions that call the specific functions of
- * the attributes...
- */
-static ssize_t efivar_attr_show(struct kobject *kobj, struct attribute *attr,
-				char *buf)
-{
-	struct efivar_entry *var = to_efivar_entry(kobj);
-	struct efivar_attribute *efivar_attr = to_efivar_attr(attr);
-	ssize_t ret = -EIO;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
-
-	if (efivar_attr->show) {
-		ret = efivar_attr->show(var, buf);
-	}
-	return ret;
-}
-
-static ssize_t efivar_attr_store(struct kobject *kobj, struct attribute *attr,
-				const char *buf, size_t count)
-{
-	struct efivar_entry *var = to_efivar_entry(kobj);
-	struct efivar_attribute *efivar_attr = to_efivar_attr(attr);
-	ssize_t ret = -EIO;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
-
-	if (efivar_attr->store)
-		ret = efivar_attr->store(var, buf, count);
-
-	return ret;
-}
-
-static const struct sysfs_ops efivar_attr_ops = {
-	.show = efivar_attr_show,
-	.store = efivar_attr_store,
-};
-
-static void efivar_release(struct kobject *kobj)
-{
-	struct efivar_entry *var = to_efivar_entry(kobj);
-	kfree(var);
-}
-
-static EFIVAR_ATTR(guid, 0400, efivar_guid_read, NULL);
-static EFIVAR_ATTR(attributes, 0400, efivar_attr_read, NULL);
-static EFIVAR_ATTR(size, 0400, efivar_size_read, NULL);
-static EFIVAR_ATTR(data, 0400, efivar_data_read, NULL);
-static EFIVAR_ATTR(raw_var, 0600, efivar_show_raw, efivar_store_raw);
-
-static struct attribute *def_attrs[] = {
-	&efivar_attr_guid.attr,
-	&efivar_attr_size.attr,
-	&efivar_attr_attributes.attr,
-	&efivar_attr_data.attr,
-	&efivar_attr_raw_var.attr,
-	NULL,
-};
-ATTRIBUTE_GROUPS(def);
-
-static struct kobj_type efivar_ktype = {
-	.release = efivar_release,
-	.sysfs_ops = &efivar_attr_ops,
-	.default_groups = def_groups,
-};
-
-static ssize_t efivar_create(struct file *filp, struct kobject *kobj,
-			     struct bin_attribute *bin_attr,
-			     char *buf, loff_t pos, size_t count)
-{
-	struct compat_efi_variable *compat = (struct compat_efi_variable *)buf;
-	struct efi_variable *new_var = (struct efi_variable *)buf;
-	struct efivar_entry *new_entry;
-	bool need_compat = in_compat_syscall();
-	efi_char16_t *name;
-	unsigned long size;
-	u32 attributes;
-	u8 *data;
-	int err;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
-
-	if (need_compat) {
-		if (count != sizeof(*compat))
-			return -EINVAL;
-
-		attributes = compat->Attributes;
-		name = compat->VariableName;
-		size = compat->DataSize;
-		data = compat->Data;
-	} else {
-		if (count != sizeof(*new_var))
-			return -EINVAL;
-
-		attributes = new_var->Attributes;
-		name = new_var->VariableName;
-		size = new_var->DataSize;
-		data = new_var->Data;
-	}
-
-	if ((attributes & ~EFI_VARIABLE_MASK) != 0 ||
-	    efivar_validate(new_var->VendorGuid, name, data,
-			    size) == false) {
-		printk(KERN_ERR "efivars: Malformed variable content\n");
-		return -EINVAL;
-	}
-
-	new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL);
-	if (!new_entry)
-		return -ENOMEM;
-
-	if (need_compat)
-		copy_out_compat(&new_entry->var, compat);
-	else
-		memcpy(&new_entry->var, new_var, sizeof(*new_var));
-
-	err = efivar_entry_set(new_entry, attributes, size,
-			       data, &efivar_sysfs_list);
-	if (err) {
-		if (err == -EEXIST)
-			err = -EINVAL;
-		goto out;
-	}
-
-	if (efivar_create_sysfs_entry(new_entry)) {
-		printk(KERN_WARNING "efivars: failed to create sysfs entry.\n");
-		kfree(new_entry);
-	}
-	return count;
-
-out:
-	kfree(new_entry);
-	return err;
-}
-
-static ssize_t efivar_delete(struct file *filp, struct kobject *kobj,
-			     struct bin_attribute *bin_attr,
-			     char *buf, loff_t pos, size_t count)
-{
-	struct efi_variable *del_var = (struct efi_variable *)buf;
-	struct compat_efi_variable *compat;
-	struct efivar_entry *entry;
-	efi_char16_t *name;
-	efi_guid_t vendor;
-	int err = 0;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
-
-	if (in_compat_syscall()) {
-		if (count != sizeof(*compat))
-			return -EINVAL;
-
-		compat = (struct compat_efi_variable *)buf;
-		name = compat->VariableName;
-		vendor = compat->VendorGuid;
-	} else {
-		if (count != sizeof(*del_var))
-			return -EINVAL;
-
-		name = del_var->VariableName;
-		vendor = del_var->VendorGuid;
-	}
-
-	if (efivar_entry_iter_begin())
-		return -EINTR;
-	entry = efivar_entry_find(name, vendor, &efivar_sysfs_list, true);
-	if (!entry)
-		err = -EINVAL;
-	else if (__efivar_entry_delete(entry))
-		err = -EIO;
-
-	efivar_entry_iter_end();
-
-	if (err)
-		return err;
-
-	efivar_unregister(entry);
-
-	/* It's dead Jim.... */
-	return count;
-}
-
-/**
- * efivar_create_sysfs_entry - create a new entry in sysfs
- * @new_var: efivar entry to create
- *
- * Returns 0 on success, negative error code on failure
- */
-static int
-efivar_create_sysfs_entry(struct efivar_entry *new_var)
-{
-	int short_name_size;
-	char *short_name;
-	unsigned long utf8_name_size;
-	efi_char16_t *variable_name = new_var->var.VariableName;
-	int ret;
-
-	/*
-	 * Length of the variable bytes in UTF8, plus the '-' separator,
-	 * plus the GUID, plus trailing NUL
-	 */
-	utf8_name_size = ucs2_utf8size(variable_name);
-	short_name_size = utf8_name_size + 1 + EFI_VARIABLE_GUID_LEN + 1;
-
-	short_name = kmalloc(short_name_size, GFP_KERNEL);
-	if (!short_name)
-		return -ENOMEM;
-
-	ucs2_as_utf8(short_name, variable_name, short_name_size);
-
-	/* This is ugly, but necessary to separate one vendor's
-	   private variables from another's.         */
-	short_name[utf8_name_size] = '-';
-	efi_guid_to_str(&new_var->var.VendorGuid,
-			 short_name + utf8_name_size + 1);
-
-	new_var->kobj.kset = efivars_kset;
-
-	ret = kobject_init_and_add(&new_var->kobj, &efivar_ktype,
-				   NULL, "%s", short_name);
-	kfree(short_name);
-	if (ret) {
-		kobject_put(&new_var->kobj);
-		return ret;
-	}
-
-	kobject_uevent(&new_var->kobj, KOBJ_ADD);
-	__efivar_entry_add(new_var, &efivar_sysfs_list);
-
-	return 0;
-}
-
-static int
-create_efivars_bin_attributes(void)
-{
-	struct bin_attribute *attr;
-	int error;
-
-	/* new_var */
-	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
-	if (!attr)
-		return -ENOMEM;
-
-	attr->attr.name = "new_var";
-	attr->attr.mode = 0200;
-	attr->write = efivar_create;
-	efivars_new_var = attr;
-
-	/* del_var */
-	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
-	if (!attr) {
-		error = -ENOMEM;
-		goto out_free;
-	}
-	attr->attr.name = "del_var";
-	attr->attr.mode = 0200;
-	attr->write = efivar_delete;
-	efivars_del_var = attr;
-
-	sysfs_bin_attr_init(efivars_new_var);
-	sysfs_bin_attr_init(efivars_del_var);
-
-	/* Register */
-	error = sysfs_create_bin_file(&efivars_kset->kobj, efivars_new_var);
-	if (error) {
-		printk(KERN_ERR "efivars: unable to create new_var sysfs file"
-			" due to error %d\n", error);
-		goto out_free;
-	}
-
-	error = sysfs_create_bin_file(&efivars_kset->kobj, efivars_del_var);
-	if (error) {
-		printk(KERN_ERR "efivars: unable to create del_var sysfs file"
-			" due to error %d\n", error);
-		sysfs_remove_bin_file(&efivars_kset->kobj, efivars_new_var);
-		goto out_free;
-	}
-
-	return 0;
-out_free:
-	kfree(efivars_del_var);
-	efivars_del_var = NULL;
-	kfree(efivars_new_var);
-	efivars_new_var = NULL;
-	return error;
-}
-
-static int efivars_sysfs_callback(efi_char16_t *name, efi_guid_t vendor,
-				  unsigned long name_size, void *data)
-{
-	struct efivar_entry *entry;
-
-	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
-	if (!entry)
-		return -ENOMEM;
-
-	memcpy(entry->var.VariableName, name, name_size);
-	memcpy(&(entry->var.VendorGuid), &vendor, sizeof(efi_guid_t));
-
-	efivar_create_sysfs_entry(entry);
-
-	return 0;
-}
-
-static int efivar_sysfs_destroy(struct efivar_entry *entry, void *data)
-{
-	efivar_entry_remove(entry);
-	efivar_unregister(entry);
-	return 0;
-}
-
-static void efivars_sysfs_exit(void)
-{
-	/* Remove all entries and destroy */
-	int err;
-
-	err = efivar_entry_iter(efivar_sysfs_destroy, &efivar_sysfs_list, NULL);
-	if (err) {
-		pr_err("efivars: Failed to destroy sysfs entries\n");
-		return;
-	}
-
-	if (efivars_new_var)
-		sysfs_remove_bin_file(&efivars_kset->kobj, efivars_new_var);
-	if (efivars_del_var)
-		sysfs_remove_bin_file(&efivars_kset->kobj, efivars_del_var);
-	kfree(efivars_new_var);
-	kfree(efivars_del_var);
-	kset_unregister(efivars_kset);
-}
-
-static int efivars_sysfs_init(void)
-{
-	struct kobject *parent_kobj = efivars_kobject();
-	int error = 0;
-
-	/* No efivars has been registered yet */
-	if (!parent_kobj || !efivar_supports_writes())
-		return 0;
-
-	printk(KERN_INFO "EFI Variables Facility v%s %s\n", EFIVARS_VERSION,
-	       EFIVARS_DATE);
-
-	efivars_kset = kset_create_and_add("vars", NULL, parent_kobj);
-	if (!efivars_kset) {
-		printk(KERN_ERR "efivars: Subsystem registration failed.\n");
-		return -ENOMEM;
-	}
-
-	efivar_init(efivars_sysfs_callback, NULL, true, &efivar_sysfs_list);
-
-	error = create_efivars_bin_attributes();
-	if (error) {
-		efivars_sysfs_exit();
-		return error;
-	}
-
-	return 0;
-}
-
-module_init(efivars_sysfs_init);
-module_exit(efivars_sysfs_exit);
diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c
index 932435945c85..94de1d0cb4e7 100644
--- a/drivers/firmware/efi/vars.c
+++ b/drivers/firmware/efi/vars.c
@@ -547,36 +547,6 @@ static void efivar_entry_list_del_unlock(struct efivar_entry *entry)
 	up(&efivars_lock);
 }
 
-/**
- * __efivar_entry_delete - delete an EFI variable
- * @entry: entry containing EFI variable to delete
- *
- * Delete the variable from the firmware but leave @entry on the
- * variable list.
- *
- * This function differs from efivar_entry_delete() because it does
- * not remove @entry from the variable list. Also, it is safe to be
- * called from within a efivar_entry_iter_begin() and
- * efivar_entry_iter_end() region, unlike efivar_entry_delete().
- *
- * Returns 0 on success, or a converted EFI status code if
- * set_variable() fails.
- */
-int __efivar_entry_delete(struct efivar_entry *entry)
-{
-	efi_status_t status;
-
-	if (!__efivars)
-		return -EINVAL;
-
-	status = __efivars->ops->set_variable(entry->var.VariableName,
-					      &entry->var.VendorGuid,
-					      0, 0, NULL);
-
-	return efi_status_to_err(status);
-}
-EXPORT_SYMBOL_GPL(__efivar_entry_delete);
-
 /**
  * efivar_entry_delete - delete variable and remove entry from list
  * @entry: entry containing variable to delete
@@ -614,213 +584,6 @@ int efivar_entry_delete(struct efivar_entry *entry)
 }
 EXPORT_SYMBOL_GPL(efivar_entry_delete);
 
-/**
- * efivar_entry_set - call set_variable()
- * @entry: entry containing the EFI variable to write
- * @attributes: variable attributes
- * @size: size of @data buffer
- * @data: buffer containing variable data
- * @head: head of variable list
- *
- * Calls set_variable() for an EFI variable. If creating a new EFI
- * variable, this function is usually followed by efivar_entry_add().
- *
- * Before writing the variable, the remaining EFI variable storage
- * space is checked to ensure there is enough room available.
- *
- * If @head is not NULL a lookup is performed to determine whether
- * the entry is already on the list.
- *
- * Returns 0 on success, -EINTR if we can't grab the semaphore,
- * -EEXIST if a lookup is performed and the entry already exists on
- * the list, or a converted EFI status code if set_variable() fails.
- */
-int efivar_entry_set(struct efivar_entry *entry, u32 attributes,
-		     unsigned long size, void *data, struct list_head *head)
-{
-	const struct efivar_operations *ops;
-	efi_status_t status;
-	efi_char16_t *name = entry->var.VariableName;
-	efi_guid_t vendor = entry->var.VendorGuid;
-
-	if (down_interruptible(&efivars_lock))
-		return -EINTR;
-
-	if (!__efivars) {
-		up(&efivars_lock);
-		return -EINVAL;
-	}
-	ops = __efivars->ops;
-	if (head && efivar_entry_find(name, vendor, head, false)) {
-		up(&efivars_lock);
-		return -EEXIST;
-	}
-
-	status = check_var_size(attributes, size + ucs2_strsize(name, 1024));
-	if (status == EFI_SUCCESS || status == EFI_UNSUPPORTED)
-		status = ops->set_variable(name, &vendor,
-					   attributes, size, data);
-
-	up(&efivars_lock);
-
-	return efi_status_to_err(status);
-
-}
-EXPORT_SYMBOL_GPL(efivar_entry_set);
-
-/*
- * efivar_entry_set_nonblocking - call set_variable_nonblocking()
- *
- * This function is guaranteed to not block and is suitable for calling
- * from crash/panic handlers.
- *
- * Crucially, this function will not block if it cannot acquire
- * efivars_lock. Instead, it returns -EBUSY.
- */
-static int
-efivar_entry_set_nonblocking(efi_char16_t *name, efi_guid_t vendor,
-			     u32 attributes, unsigned long size, void *data)
-{
-	const struct efivar_operations *ops;
-	efi_status_t status;
-
-	if (down_trylock(&efivars_lock))
-		return -EBUSY;
-
-	if (!__efivars) {
-		up(&efivars_lock);
-		return -EINVAL;
-	}
-
-	status = check_var_size_nonblocking(attributes,
-					    size + ucs2_strsize(name, 1024));
-	if (status != EFI_SUCCESS) {
-		up(&efivars_lock);
-		return -ENOSPC;
-	}
-
-	ops = __efivars->ops;
-	status = ops->set_variable_nonblocking(name, &vendor, attributes,
-					       size, data);
-
-	up(&efivars_lock);
-	return efi_status_to_err(status);
-}
-
-/**
- * efivar_entry_set_safe - call set_variable() if enough space in firmware
- * @name: buffer containing the variable name
- * @vendor: variable vendor guid
- * @attributes: variable attributes
- * @block: can we block in this context?
- * @size: size of @data buffer
- * @data: buffer containing variable data
- *
- * Ensures there is enough free storage in the firmware for this variable, and
- * if so, calls set_variable(). If creating a new EFI variable, this function
- * is usually followed by efivar_entry_add().
- *
- * Returns 0 on success, -ENOSPC if the firmware does not have enough
- * space for set_variable() to succeed, or a converted EFI status code
- * if set_variable() fails.
- */
-int efivar_entry_set_safe(efi_char16_t *name, efi_guid_t vendor, u32 attributes,
-			  bool block, unsigned long size, void *data)
-{
-	const struct efivar_operations *ops;
-	efi_status_t status;
-	unsigned long varsize;
-
-	if (!__efivars)
-		return -EINVAL;
-
-	ops = __efivars->ops;
-	if (!ops->query_variable_store)
-		return -ENOSYS;
-
-	/*
-	 * If the EFI variable backend provides a non-blocking
-	 * ->set_variable() operation and we're in a context where we
-	 * cannot block, then we need to use it to avoid live-locks,
-	 * since the implication is that the regular ->set_variable()
-	 * will block.
-	 *
-	 * If no ->set_variable_nonblocking() is provided then
-	 * ->set_variable() is assumed to be non-blocking.
-	 */
-	if (!block && ops->set_variable_nonblocking)
-		return efivar_entry_set_nonblocking(name, vendor, attributes,
-						    size, data);
-
-	varsize = size + ucs2_strsize(name, 1024);
-	if (!block) {
-		if (down_trylock(&efivars_lock))
-			return -EBUSY;
-		status = check_var_size_nonblocking(attributes, varsize);
-	} else {
-		if (down_interruptible(&efivars_lock))
-			return -EINTR;
-		status = check_var_size(attributes, varsize);
-	}
-
-	if (status != EFI_SUCCESS) {
-		up(&efivars_lock);
-		return -ENOSPC;
-	}
-
-	status = ops->set_variable(name, &vendor, attributes, size, data);
-
-	up(&efivars_lock);
-
-	return efi_status_to_err(status);
-}
-EXPORT_SYMBOL_GPL(efivar_entry_set_safe);
-
-/**
- * efivar_entry_find - search for an entry
- * @name: the EFI variable name
- * @guid: the EFI variable vendor's guid
- * @head: head of the variable list
- * @remove: should we remove the entry from the list?
- *
- * Search for an entry on the variable list that has the EFI variable
- * name @name and vendor guid @guid. If an entry is found on the list
- * and @remove is true, the entry is removed from the list.
- *
- * The caller MUST call efivar_entry_iter_begin() and
- * efivar_entry_iter_end() before and after the invocation of this
- * function, respectively.
- *
- * Returns the entry if found on the list, %NULL otherwise.
- */
-struct efivar_entry *efivar_entry_find(efi_char16_t *name, efi_guid_t guid,
-				       struct list_head *head, bool remove)
-{
-	struct efivar_entry *entry, *n;
-	int strsize1, strsize2;
-	bool found = false;
-
-	list_for_each_entry_safe(entry, n, head, list) {
-		strsize1 = ucs2_strsize(name, 1024);
-		strsize2 = ucs2_strsize(entry->var.VariableName, 1024);
-		if (strsize1 == strsize2 &&
-		    !memcmp(name, &(entry->var.VariableName), strsize1) &&
-		    !efi_guidcmp(guid, entry->var.VendorGuid)) {
-			found = true;
-			break;
-		}
-	}
-
-	if (!found)
-		return NULL;
-
-	if (remove)
-		list_del(&entry->list);
-
-	return entry;
-}
-EXPORT_SYMBOL_GPL(efivar_entry_find);
-
 /**
  * efivar_entry_size - obtain the size of a variable
  * @entry: entry for this variable
@@ -1010,30 +773,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(efivar_entry_set_get_size);
 
-/**
- * efivar_entry_iter_begin - begin iterating the variable list
- *
- * Lock the variable list to prevent entry insertion and removal until
- * efivar_entry_iter_end() is called. This function is usually used in
- * conjunction with __efivar_entry_iter() or efivar_entry_iter().
- */
-int efivar_entry_iter_begin(void)
-{
-	return down_interruptible(&efivars_lock);
-}
-EXPORT_SYMBOL_GPL(efivar_entry_iter_begin);
-
-/**
- * efivar_entry_iter_end - finish iterating the variable list
- *
- * Unlock the variable list and allow modifications to the list again.
- */
-void efivar_entry_iter_end(void)
-{
-	up(&efivars_lock);
-}
-EXPORT_SYMBOL_GPL(efivar_entry_iter_end);
-
 /**
  * efivar_entry_iter - iterate over variable list
  * @func: callback function
@@ -1054,7 +793,7 @@ int efivar_entry_iter(int (*func)(struct efivar_entry *, void *),
 	struct efivar_entry *entry, *n;
 	int err = 0;
 
-	err = efivar_entry_iter_begin();
+	err = down_interruptible(&efivars_lock);
 	if (err)
 		return err;
 
@@ -1063,7 +802,7 @@ int efivar_entry_iter(int (*func)(struct efivar_entry *, void *),
 		if (err)
 			break;
 	}
-	efivar_entry_iter_end();
+	up(&efivars_lock);
 
 	return err;
 }
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 93ce85a14a46..10ef0a0d5e9a 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1045,12 +1045,6 @@ struct efivar_entry {
 	struct kobject kobj;
 };
 
-static inline void
-efivar_unregister(struct efivar_entry *var)
-{
-	kobject_put(&var->kobj);
-}
-
 int efivars_register(struct efivars *efivars,
 		     const struct efivar_operations *ops,
 		     struct kobject *kobject);
@@ -1064,8 +1058,6 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
 int efivar_entry_add(struct efivar_entry *entry, struct list_head *head);
 void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head);
 void efivar_entry_remove(struct efivar_entry *entry);
-
-int __efivar_entry_delete(struct efivar_entry *entry);
 int efivar_entry_delete(struct efivar_entry *entry);
 
 int efivar_entry_size(struct efivar_entry *entry, unsigned long *size);
@@ -1073,22 +1065,12 @@ int __efivar_entry_get(struct efivar_entry *entry, u32 *attributes,
 		       unsigned long *size, void *data);
 int efivar_entry_get(struct efivar_entry *entry, u32 *attributes,
 		     unsigned long *size, void *data);
-int efivar_entry_set(struct efivar_entry *entry, u32 attributes,
-		     unsigned long size, void *data, struct list_head *head);
 int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes,
 			      unsigned long *size, void *data, bool *set);
-int efivar_entry_set_safe(efi_char16_t *name, efi_guid_t vendor, u32 attributes,
-			  bool block, unsigned long size, void *data);
-
-int efivar_entry_iter_begin(void);
-void efivar_entry_iter_end(void);
 
 int efivar_entry_iter(int (*func)(struct efivar_entry *, void *),
 		      struct list_head *head, void *data);
 
-struct efivar_entry *efivar_entry_find(efi_char16_t *name, efi_guid_t guid,
-				       struct list_head *head, bool remove);
-
 bool efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data,
 		     unsigned long data_size);
 bool efivar_variable_is_removable(efi_guid_t vendor, const char *name,
-- 
cgit 


From 3a75f9f2f9ad19bb9a0f566373ae91d8f09db85e Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Tue, 21 Jun 2022 15:48:29 +0200
Subject: efi: vars: Use locking version to iterate over efivars linked lists

Both efivars and efivarfs uses __efivar_entry_iter() to go over the
linked list that shadows the list of EFI variables held by the firmware,
but fail to call the begin/end helpers that are documented as a
prerequisite.

So switch to the proper version, which is efivar_entry_iter(). Given
that in both cases, efivar_entry_remove() is invoked with the lock held
already, don't take the lock there anymore.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/firmware/efi/efivars.c | 8 ++------
 drivers/firmware/efi/vars.c    | 9 +--------
 fs/efivarfs/super.c            | 9 +++------
 include/linux/efi.h            | 2 +-
 4 files changed, 7 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/efi/efivars.c b/drivers/firmware/efi/efivars.c
index 8341fb15f62e..801a65582172 100644
--- a/drivers/firmware/efi/efivars.c
+++ b/drivers/firmware/efi/efivars.c
@@ -602,10 +602,7 @@ static int efivars_sysfs_callback(efi_char16_t *name, efi_guid_t vendor,
 
 static int efivar_sysfs_destroy(struct efivar_entry *entry, void *data)
 {
-	int err = efivar_entry_remove(entry);
-
-	if (err)
-		return err;
+	efivar_entry_remove(entry);
 	efivar_unregister(entry);
 	return 0;
 }
@@ -615,8 +612,7 @@ static void efivars_sysfs_exit(void)
 	/* Remove all entries and destroy */
 	int err;
 
-	err = __efivar_entry_iter(efivar_sysfs_destroy, &efivar_sysfs_list,
-				  NULL, NULL);
+	err = efivar_entry_iter(efivar_sysfs_destroy, &efivar_sysfs_list, NULL);
 	if (err) {
 		pr_err("efivars: Failed to destroy sysfs entries\n");
 		return;
diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c
index 5640ffa81544..29540013b358 100644
--- a/drivers/firmware/efi/vars.c
+++ b/drivers/firmware/efi/vars.c
@@ -523,17 +523,10 @@ EXPORT_SYMBOL_GPL(__efivar_entry_add);
 /**
  * efivar_entry_remove - remove entry from variable list
  * @entry: entry to remove from list
- *
- * Returns 0 on success, or a kernel error code on failure.
  */
-int efivar_entry_remove(struct efivar_entry *entry)
+void efivar_entry_remove(struct efivar_entry *entry)
 {
-	if (down_interruptible(&efivars_lock))
-		return -EINTR;
 	list_del(&entry->list);
-	up(&efivars_lock);
-
-	return 0;
 }
 EXPORT_SYMBOL_GPL(efivar_entry_remove);
 
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 09dfa8362f50..6780fc81cc11 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -180,10 +180,7 @@ fail:
 
 static int efivarfs_destroy(struct efivar_entry *entry, void *data)
 {
-	int err = efivar_entry_remove(entry);
-
-	if (err)
-		return err;
+	efivar_entry_remove(entry);
 	kfree(entry);
 	return 0;
 }
@@ -219,7 +216,7 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc)
 
 	err = efivar_init(efivarfs_callback, (void *)sb, true, &efivarfs_list);
 	if (err)
-		__efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL);
+		efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL);
 
 	return err;
 }
@@ -244,7 +241,7 @@ static void efivarfs_kill_sb(struct super_block *sb)
 	kill_litter_super(sb);
 
 	/* Remove all entries and destroy */
-	__efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL);
+	efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL);
 }
 
 static struct file_system_type efivarfs_type = {
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 08bc6215e3b4..54ca2d6b6c78 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1063,7 +1063,7 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
 
 int efivar_entry_add(struct efivar_entry *entry, struct list_head *head);
 void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head);
-int efivar_entry_remove(struct efivar_entry *entry);
+void efivar_entry_remove(struct efivar_entry *entry);
 
 int __efivar_entry_delete(struct efivar_entry *entry);
 int efivar_entry_delete(struct efivar_entry *entry);
-- 
cgit 


From 5ac941367a6f85777ef34ec15d60e17ea8e446d4 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Tue, 21 Jun 2022 15:54:53 +0200
Subject: efi: vars: Drop __efivar_entry_iter() helper which is no longer used

__efivar_entry_iter() uses a list iterator in a dubious way, i.e., it
assumes that the iteration variable always points to an object of the
appropriate type, even if the list traversal exhausts the list
completely, in which case it will point somewhere in the vicinity of the
list's anchor instead.

Fortunately, we no longer use this function so we can just get rid of it
entirely.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/firmware/efi/vars.c | 61 ++++++---------------------------------------
 include/linux/efi.h         |  3 ---
 2 files changed, 7 insertions(+), 57 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c
index 29540013b358..932435945c85 100644
--- a/drivers/firmware/efi/vars.c
+++ b/drivers/firmware/efi/vars.c
@@ -1034,59 +1034,6 @@ void efivar_entry_iter_end(void)
 }
 EXPORT_SYMBOL_GPL(efivar_entry_iter_end);
 
-/**
- * __efivar_entry_iter - iterate over variable list
- * @func: callback function
- * @head: head of the variable list
- * @data: function-specific data to pass to callback
- * @prev: entry to begin iterating from
- *
- * Iterate over the list of EFI variables and call @func with every
- * entry on the list. It is safe for @func to remove entries in the
- * list via efivar_entry_delete().
- *
- * You MUST call efivar_entry_iter_begin() before this function, and
- * efivar_entry_iter_end() afterwards.
- *
- * It is possible to begin iteration from an arbitrary entry within
- * the list by passing @prev. @prev is updated on return to point to
- * the last entry passed to @func. To begin iterating from the
- * beginning of the list @prev must be %NULL.
- *
- * The restrictions for @func are the same as documented for
- * efivar_entry_iter().
- */
-int __efivar_entry_iter(int (*func)(struct efivar_entry *, void *),
-			struct list_head *head, void *data,
-			struct efivar_entry **prev)
-{
-	struct efivar_entry *entry, *n;
-	int err = 0;
-
-	if (!prev || !*prev) {
-		list_for_each_entry_safe(entry, n, head, list) {
-			err = func(entry, data);
-			if (err)
-				break;
-		}
-
-		if (prev)
-			*prev = entry;
-
-		return err;
-	}
-
-
-	list_for_each_entry_safe_continue((*prev), n, head, list) {
-		err = func(*prev, data);
-		if (err)
-			break;
-	}
-
-	return err;
-}
-EXPORT_SYMBOL_GPL(__efivar_entry_iter);
-
 /**
  * efivar_entry_iter - iterate over variable list
  * @func: callback function
@@ -1104,12 +1051,18 @@ EXPORT_SYMBOL_GPL(__efivar_entry_iter);
 int efivar_entry_iter(int (*func)(struct efivar_entry *, void *),
 		      struct list_head *head, void *data)
 {
+	struct efivar_entry *entry, *n;
 	int err = 0;
 
 	err = efivar_entry_iter_begin();
 	if (err)
 		return err;
-	err = __efivar_entry_iter(func, head, data, NULL);
+
+	list_for_each_entry_safe(entry, n, head, list) {
+		err = func(entry, data);
+		if (err)
+			break;
+	}
 	efivar_entry_iter_end();
 
 	return err;
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 54ca2d6b6c78..93ce85a14a46 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1083,9 +1083,6 @@ int efivar_entry_set_safe(efi_char16_t *name, efi_guid_t vendor, u32 attributes,
 int efivar_entry_iter_begin(void);
 void efivar_entry_iter_end(void);
 
-int __efivar_entry_iter(int (*func)(struct efivar_entry *, void *),
-			struct list_head *head, void *data,
-			struct efivar_entry **prev);
 int efivar_entry_iter(int (*func)(struct efivar_entry *, void *),
 		      struct list_head *head, void *data);
 
-- 
cgit 


From 2d82e6227ea189c0589e7383a36616ac2a2d248c Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Mon, 20 Jun 2022 18:19:43 +0200
Subject: efi: vars: Move efivar caching layer into efivarfs

Move the fiddly bits of the efivar layer into its only remaining user,
efivarfs, and confine its use to that particular module. All other uses
of the EFI variable store have no need for this additional layer of
complexity, given that they either only read variables, or read and
write variables into a separate GUIDed namespace, and cannot be used to
manipulate EFI variables that are covered by the EFI spec and/or affect
the boot flow.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/firmware/efi/efi.c  |   1 +
 drivers/firmware/efi/vars.c | 733 -------------------------------------------
 fs/efivarfs/Makefile        |   2 +-
 fs/efivarfs/internal.h      |  40 +++
 fs/efivarfs/vars.c          | 738 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/efi.h         |  38 ---
 6 files changed, 780 insertions(+), 772 deletions(-)
 create mode 100644 fs/efivarfs/vars.c

(limited to 'include')

diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 7f06065d3eb0..e4080ad96089 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -899,6 +899,7 @@ int efi_status_to_err(efi_status_t status)
 
 	return err;
 }
+EXPORT_SYMBOL_GPL(efi_status_to_err);
 
 static DEFINE_SPINLOCK(efi_mem_reserve_persistent_lock);
 static struct linux_efi_memreserve *efi_memreserve_root __ro_after_init;
diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c
index cafc128a5774..dd74d2ad3184 100644
--- a/drivers/firmware/efi/vars.c
+++ b/drivers/firmware/efi/vars.c
@@ -6,298 +6,20 @@
  * Copyright (C) 2004 Intel Corporation <matthew.e.tolentino@intel.com>
  */
 
-#include <linux/capability.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/init.h>
-#include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/smp.h>
 #include <linux/efi.h>
-#include <linux/sysfs.h>
-#include <linux/device.h>
-#include <linux/slab.h>
-#include <linux/ctype.h>
 #include <linux/ucs2_string.h>
 
 /* Private pointer to registered efivars */
 static struct efivars *__efivars;
 
-/*
- * efivars_lock protects three things:
- * 1) efivarfs_list and efivars_sysfs_list
- * 2) ->ops calls
- * 3) (un)registration of __efivars
- */
 static DEFINE_SEMAPHORE(efivars_lock);
 
-static bool
-validate_device_path(efi_char16_t *var_name, int match, u8 *buffer,
-		     unsigned long len)
-{
-	struct efi_generic_dev_path *node;
-	int offset = 0;
-
-	node = (struct efi_generic_dev_path *)buffer;
-
-	if (len < sizeof(*node))
-		return false;
-
-	while (offset <= len - sizeof(*node) &&
-	       node->length >= sizeof(*node) &&
-		node->length <= len - offset) {
-		offset += node->length;
-
-		if ((node->type == EFI_DEV_END_PATH ||
-		     node->type == EFI_DEV_END_PATH2) &&
-		    node->sub_type == EFI_DEV_END_ENTIRE)
-			return true;
-
-		node = (struct efi_generic_dev_path *)(buffer + offset);
-	}
-
-	/*
-	 * If we're here then either node->length pointed past the end
-	 * of the buffer or we reached the end of the buffer without
-	 * finding a device path end node.
-	 */
-	return false;
-}
-
-static bool
-validate_boot_order(efi_char16_t *var_name, int match, u8 *buffer,
-		    unsigned long len)
-{
-	/* An array of 16-bit integers */
-	if ((len % 2) != 0)
-		return false;
-
-	return true;
-}
-
-static bool
-validate_load_option(efi_char16_t *var_name, int match, u8 *buffer,
-		     unsigned long len)
-{
-	u16 filepathlength;
-	int i, desclength = 0, namelen;
-
-	namelen = ucs2_strnlen(var_name, EFI_VAR_NAME_LEN);
-
-	/* Either "Boot" or "Driver" followed by four digits of hex */
-	for (i = match; i < match+4; i++) {
-		if (var_name[i] > 127 ||
-		    hex_to_bin(var_name[i] & 0xff) < 0)
-			return true;
-	}
-
-	/* Reject it if there's 4 digits of hex and then further content */
-	if (namelen > match + 4)
-		return false;
-
-	/* A valid entry must be at least 8 bytes */
-	if (len < 8)
-		return false;
-
-	filepathlength = buffer[4] | buffer[5] << 8;
-
-	/*
-	 * There's no stored length for the description, so it has to be
-	 * found by hand
-	 */
-	desclength = ucs2_strsize((efi_char16_t *)(buffer + 6), len - 6) + 2;
-
-	/* Each boot entry must have a descriptor */
-	if (!desclength)
-		return false;
-
-	/*
-	 * If the sum of the length of the description, the claimed filepath
-	 * length and the original header are greater than the length of the
-	 * variable, it's malformed
-	 */
-	if ((desclength + filepathlength + 6) > len)
-		return false;
-
-	/*
-	 * And, finally, check the filepath
-	 */
-	return validate_device_path(var_name, match, buffer + desclength + 6,
-				    filepathlength);
-}
-
-static bool
-validate_uint16(efi_char16_t *var_name, int match, u8 *buffer,
-		unsigned long len)
-{
-	/* A single 16-bit integer */
-	if (len != 2)
-		return false;
-
-	return true;
-}
-
-static bool
-validate_ascii_string(efi_char16_t *var_name, int match, u8 *buffer,
-		      unsigned long len)
-{
-	int i;
-
-	for (i = 0; i < len; i++) {
-		if (buffer[i] > 127)
-			return false;
-
-		if (buffer[i] == 0)
-			return true;
-	}
-
-	return false;
-}
-
-struct variable_validate {
-	efi_guid_t vendor;
-	char *name;
-	bool (*validate)(efi_char16_t *var_name, int match, u8 *data,
-			 unsigned long len);
-};
-
-/*
- * This is the list of variables we need to validate, as well as the
- * whitelist for what we think is safe not to default to immutable.
- *
- * If it has a validate() method that's not NULL, it'll go into the
- * validation routine.  If not, it is assumed valid, but still used for
- * whitelisting.
- *
- * Note that it's sorted by {vendor,name}, but globbed names must come after
- * any other name with the same prefix.
- */
-static const struct variable_validate variable_validate[] = {
-	{ EFI_GLOBAL_VARIABLE_GUID, "BootNext", validate_uint16 },
-	{ EFI_GLOBAL_VARIABLE_GUID, "BootOrder", validate_boot_order },
-	{ EFI_GLOBAL_VARIABLE_GUID, "Boot*", validate_load_option },
-	{ EFI_GLOBAL_VARIABLE_GUID, "DriverOrder", validate_boot_order },
-	{ EFI_GLOBAL_VARIABLE_GUID, "Driver*", validate_load_option },
-	{ EFI_GLOBAL_VARIABLE_GUID, "ConIn", validate_device_path },
-	{ EFI_GLOBAL_VARIABLE_GUID, "ConInDev", validate_device_path },
-	{ EFI_GLOBAL_VARIABLE_GUID, "ConOut", validate_device_path },
-	{ EFI_GLOBAL_VARIABLE_GUID, "ConOutDev", validate_device_path },
-	{ EFI_GLOBAL_VARIABLE_GUID, "ErrOut", validate_device_path },
-	{ EFI_GLOBAL_VARIABLE_GUID, "ErrOutDev", validate_device_path },
-	{ EFI_GLOBAL_VARIABLE_GUID, "Lang", validate_ascii_string },
-	{ EFI_GLOBAL_VARIABLE_GUID, "OsIndications", NULL },
-	{ EFI_GLOBAL_VARIABLE_GUID, "PlatformLang", validate_ascii_string },
-	{ EFI_GLOBAL_VARIABLE_GUID, "Timeout", validate_uint16 },
-	{ LINUX_EFI_CRASH_GUID, "*", NULL },
-	{ NULL_GUID, "", NULL },
-};
-
-/*
- * Check if @var_name matches the pattern given in @match_name.
- *
- * @var_name: an array of @len non-NUL characters.
- * @match_name: a NUL-terminated pattern string, optionally ending in "*". A
- *              final "*" character matches any trailing characters @var_name,
- *              including the case when there are none left in @var_name.
- * @match: on output, the number of non-wildcard characters in @match_name
- *         that @var_name matches, regardless of the return value.
- * @return: whether @var_name fully matches @match_name.
- */
-static bool
-variable_matches(const char *var_name, size_t len, const char *match_name,
-		 int *match)
-{
-	for (*match = 0; ; (*match)++) {
-		char c = match_name[*match];
-
-		switch (c) {
-		case '*':
-			/* Wildcard in @match_name means we've matched. */
-			return true;
-
-		case '\0':
-			/* @match_name has ended. Has @var_name too? */
-			return (*match == len);
-
-		default:
-			/*
-			 * We've reached a non-wildcard char in @match_name.
-			 * Continue only if there's an identical character in
-			 * @var_name.
-			 */
-			if (*match < len && c == var_name[*match])
-				continue;
-			return false;
-		}
-	}
-}
-
-bool
-efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data,
-		unsigned long data_size)
-{
-	int i;
-	unsigned long utf8_size;
-	u8 *utf8_name;
-
-	utf8_size = ucs2_utf8size(var_name);
-	utf8_name = kmalloc(utf8_size + 1, GFP_KERNEL);
-	if (!utf8_name)
-		return false;
-
-	ucs2_as_utf8(utf8_name, var_name, utf8_size);
-	utf8_name[utf8_size] = '\0';
-
-	for (i = 0; variable_validate[i].name[0] != '\0'; i++) {
-		const char *name = variable_validate[i].name;
-		int match = 0;
-
-		if (efi_guidcmp(vendor, variable_validate[i].vendor))
-			continue;
-
-		if (variable_matches(utf8_name, utf8_size+1, name, &match)) {
-			if (variable_validate[i].validate == NULL)
-				break;
-			kfree(utf8_name);
-			return variable_validate[i].validate(var_name, match,
-							     data, data_size);
-		}
-	}
-	kfree(utf8_name);
-	return true;
-}
-EXPORT_SYMBOL_GPL(efivar_validate);
-
-bool
-efivar_variable_is_removable(efi_guid_t vendor, const char *var_name,
-			     size_t len)
-{
-	int i;
-	bool found = false;
-	int match = 0;
-
-	/*
-	 * Check if our variable is in the validated variables list
-	 */
-	for (i = 0; variable_validate[i].name[0] != '\0'; i++) {
-		if (efi_guidcmp(variable_validate[i].vendor, vendor))
-			continue;
-
-		if (variable_matches(var_name, len,
-				     variable_validate[i].name, &match)) {
-			found = true;
-			break;
-		}
-	}
-
-	/*
-	 * If it's in our list, it is removable.
-	 */
-	return found;
-}
-EXPORT_SYMBOL_GPL(efivar_variable_is_removable);
-
 efi_status_t check_var_size(u32 attributes, unsigned long size)
 {
 	const struct efivar_operations *fops;
@@ -324,461 +46,6 @@ efi_status_t check_var_size_nonblocking(u32 attributes, unsigned long size)
 }
 EXPORT_SYMBOL_NS_GPL(check_var_size_nonblocking, EFIVAR);
 
-static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor,
-				struct list_head *head)
-{
-	struct efivar_entry *entry, *n;
-	unsigned long strsize1, strsize2;
-	bool found = false;
-
-	strsize1 = ucs2_strsize(variable_name, 1024);
-	list_for_each_entry_safe(entry, n, head, list) {
-		strsize2 = ucs2_strsize(entry->var.VariableName, 1024);
-		if (strsize1 == strsize2 &&
-			!memcmp(variable_name, &(entry->var.VariableName),
-				strsize2) &&
-			!efi_guidcmp(entry->var.VendorGuid,
-				*vendor)) {
-			found = true;
-			break;
-		}
-	}
-	return found;
-}
-
-/*
- * Returns the size of variable_name, in bytes, including the
- * terminating NULL character, or variable_name_size if no NULL
- * character is found among the first variable_name_size bytes.
- */
-static unsigned long var_name_strnsize(efi_char16_t *variable_name,
-				       unsigned long variable_name_size)
-{
-	unsigned long len;
-	efi_char16_t c;
-
-	/*
-	 * The variable name is, by definition, a NULL-terminated
-	 * string, so make absolutely sure that variable_name_size is
-	 * the value we expect it to be. If not, return the real size.
-	 */
-	for (len = 2; len <= variable_name_size; len += sizeof(c)) {
-		c = variable_name[(len / sizeof(c)) - 1];
-		if (!c)
-			break;
-	}
-
-	return min(len, variable_name_size);
-}
-
-/*
- * Print a warning when duplicate EFI variables are encountered and
- * disable the sysfs workqueue since the firmware is buggy.
- */
-static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid,
-			     unsigned long len16)
-{
-	size_t i, len8 = len16 / sizeof(efi_char16_t);
-	char *str8;
-
-	str8 = kzalloc(len8, GFP_KERNEL);
-	if (!str8)
-		return;
-
-	for (i = 0; i < len8; i++)
-		str8[i] = str16[i];
-
-	printk(KERN_WARNING "efivars: duplicate variable: %s-%pUl\n",
-	       str8, vendor_guid);
-	kfree(str8);
-}
-
-/**
- * efivar_init - build the initial list of EFI variables
- * @func: callback function to invoke for every variable
- * @data: function-specific data to pass to @func
- * @duplicates: error if we encounter duplicates on @head?
- * @head: initialised head of variable list
- *
- * Get every EFI variable from the firmware and invoke @func. @func
- * should call efivar_entry_add() to build the list of variables.
- *
- * Returns 0 on success, or a kernel error code on failure.
- */
-int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
-		void *data, bool duplicates, struct list_head *head)
-{
-	unsigned long variable_name_size = 1024;
-	efi_char16_t *variable_name;
-	efi_status_t status;
-	efi_guid_t vendor_guid;
-	int err = 0;
-
-	variable_name = kzalloc(variable_name_size, GFP_KERNEL);
-	if (!variable_name) {
-		printk(KERN_ERR "efivars: Memory allocation failed.\n");
-		return -ENOMEM;
-	}
-
-	err = efivar_lock();
-	if (err)
-		goto free;
-
-	/*
-	 * Per EFI spec, the maximum storage allocated for both
-	 * the variable name and variable data is 1024 bytes.
-	 */
-
-	do {
-		variable_name_size = 1024;
-
-		status = efivar_get_next_variable(&variable_name_size,
-						  variable_name,
-						  &vendor_guid);
-		switch (status) {
-		case EFI_SUCCESS:
-			variable_name_size = var_name_strnsize(variable_name,
-							       variable_name_size);
-
-			/*
-			 * Some firmware implementations return the
-			 * same variable name on multiple calls to
-			 * get_next_variable(). Terminate the loop
-			 * immediately as there is no guarantee that
-			 * we'll ever see a different variable name,
-			 * and may end up looping here forever.
-			 */
-			if (duplicates &&
-			    variable_is_present(variable_name, &vendor_guid,
-						head)) {
-				dup_variable_bug(variable_name, &vendor_guid,
-						 variable_name_size);
-				status = EFI_NOT_FOUND;
-			} else {
-				err = func(variable_name, vendor_guid,
-					   variable_name_size, data);
-				if (err)
-					status = EFI_NOT_FOUND;
-			}
-			break;
-		case EFI_UNSUPPORTED:
-			err = -EOPNOTSUPP;
-			status = EFI_NOT_FOUND;
-			break;
-		case EFI_NOT_FOUND:
-			break;
-		default:
-			printk(KERN_WARNING "efivars: get_next_variable: status=%lx\n",
-				status);
-			status = EFI_NOT_FOUND;
-			break;
-		}
-
-	} while (status != EFI_NOT_FOUND);
-
-	efivar_unlock();
-free:
-	kfree(variable_name);
-
-	return err;
-}
-EXPORT_SYMBOL_GPL(efivar_init);
-
-/**
- * efivar_entry_add - add entry to variable list
- * @entry: entry to add to list
- * @head: list head
- *
- * Returns 0 on success, or a kernel error code on failure.
- */
-int efivar_entry_add(struct efivar_entry *entry, struct list_head *head)
-{
-	int err;
-
-	err = efivar_lock();
-	if (err)
-		return err;
-	list_add(&entry->list, head);
-	efivar_unlock();
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(efivar_entry_add);
-
-/**
- * __efivar_entry_add - add entry to variable list
- * @entry: entry to add to list
- * @head: list head
- */
-void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head)
-{
-	list_add(&entry->list, head);
-}
-EXPORT_SYMBOL_GPL(__efivar_entry_add);
-
-/**
- * efivar_entry_remove - remove entry from variable list
- * @entry: entry to remove from list
- */
-void efivar_entry_remove(struct efivar_entry *entry)
-{
-	list_del(&entry->list);
-}
-EXPORT_SYMBOL_GPL(efivar_entry_remove);
-
-/*
- * efivar_entry_list_del_unlock - remove entry from variable list
- * @entry: entry to remove
- *
- * Remove @entry from the variable list and release the list lock.
- *
- * NOTE: slightly weird locking semantics here - we expect to be
- * called with the efivars lock already held, and we release it before
- * returning. This is because this function is usually called after
- * set_variable() while the lock is still held.
- */
-static void efivar_entry_list_del_unlock(struct efivar_entry *entry)
-{
-	list_del(&entry->list);
-	efivar_unlock();
-}
-
-/**
- * efivar_entry_delete - delete variable and remove entry from list
- * @entry: entry containing variable to delete
- *
- * Delete the variable from the firmware and remove @entry from the
- * variable list. It is the caller's responsibility to free @entry
- * once we return.
- *
- * Returns 0 on success, -EINTR if we can't grab the semaphore,
- * converted EFI status code if set_variable() fails.
- */
-int efivar_entry_delete(struct efivar_entry *entry)
-{
-	efi_status_t status;
-	int err;
-
-	err = efivar_lock();
-	if (err)
-		return err;
-
-	status = efivar_set_variable_locked(entry->var.VariableName,
-					    &entry->var.VendorGuid,
-					    0, 0, NULL, false);
-	if (!(status == EFI_SUCCESS || status == EFI_NOT_FOUND)) {
-		efivar_unlock();
-		return efi_status_to_err(status);
-	}
-
-	efivar_entry_list_del_unlock(entry);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(efivar_entry_delete);
-
-/**
- * efivar_entry_size - obtain the size of a variable
- * @entry: entry for this variable
- * @size: location to store the variable's size
- */
-int efivar_entry_size(struct efivar_entry *entry, unsigned long *size)
-{
-	efi_status_t status;
-	int err;
-
-	*size = 0;
-
-	err = efivar_lock();
-	if (err)
-		return err;
-
-	status = efivar_get_variable(entry->var.VariableName,
-				     &entry->var.VendorGuid, NULL, size, NULL);
-	efivar_unlock();
-
-	if (status != EFI_BUFFER_TOO_SMALL)
-		return efi_status_to_err(status);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(efivar_entry_size);
-
-/**
- * __efivar_entry_get - call get_variable()
- * @entry: read data for this variable
- * @attributes: variable attributes
- * @size: size of @data buffer
- * @data: buffer to store variable data
- *
- * The caller MUST hold the efivar lock when calling this function.
- */
-int __efivar_entry_get(struct efivar_entry *entry, u32 *attributes,
-		       unsigned long *size, void *data)
-{
-	efi_status_t status;
-
-	status = efivar_get_variable(entry->var.VariableName,
-				     &entry->var.VendorGuid,
-				     attributes, size, data);
-
-	return efi_status_to_err(status);
-}
-EXPORT_SYMBOL_GPL(__efivar_entry_get);
-
-/**
- * efivar_entry_get - call get_variable()
- * @entry: read data for this variable
- * @attributes: variable attributes
- * @size: size of @data buffer
- * @data: buffer to store variable data
- */
-int efivar_entry_get(struct efivar_entry *entry, u32 *attributes,
-		     unsigned long *size, void *data)
-{
-	int err;
-
-	err = efivar_lock();
-	if (err)
-		return err;
-	err = __efivar_entry_get(entry, attributes, size, data);
-	efivar_unlock();
-
-	return err;
-}
-EXPORT_SYMBOL_GPL(efivar_entry_get);
-
-/**
- * efivar_entry_set_get_size - call set_variable() and get new size (atomic)
- * @entry: entry containing variable to set and get
- * @attributes: attributes of variable to be written
- * @size: size of data buffer
- * @data: buffer containing data to write
- * @set: did the set_variable() call succeed?
- *
- * This is a pretty special (complex) function. See efivarfs_file_write().
- *
- * Atomically call set_variable() for @entry and if the call is
- * successful, return the new size of the variable from get_variable()
- * in @size. The success of set_variable() is indicated by @set.
- *
- * Returns 0 on success, -EINVAL if the variable data is invalid,
- * -ENOSPC if the firmware does not have enough available space, or a
- * converted EFI status code if either of set_variable() or
- * get_variable() fail.
- *
- * If the EFI variable does not exist when calling set_variable()
- * (EFI_NOT_FOUND), @entry is removed from the variable list.
- */
-int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes,
-			      unsigned long *size, void *data, bool *set)
-{
-	efi_char16_t *name = entry->var.VariableName;
-	efi_guid_t *vendor = &entry->var.VendorGuid;
-	efi_status_t status;
-	int err;
-
-	*set = false;
-
-	if (efivar_validate(*vendor, name, data, *size) == false)
-		return -EINVAL;
-
-	/*
-	 * The lock here protects the get_variable call, the conditional
-	 * set_variable call, and removal of the variable from the efivars
-	 * list (in the case of an authenticated delete).
-	 */
-	err = efivar_lock();
-	if (err)
-		return err;
-
-	/*
-	 * Ensure that the available space hasn't shrunk below the safe level
-	 */
-	status = check_var_size(attributes, *size + ucs2_strsize(name, 1024));
-	if (status != EFI_SUCCESS) {
-		if (status != EFI_UNSUPPORTED) {
-			err = efi_status_to_err(status);
-			goto out;
-		}
-
-		if (*size > 65536) {
-			err = -ENOSPC;
-			goto out;
-		}
-	}
-
-	status = efivar_set_variable_locked(name, vendor, attributes, *size,
-					    data, false);
-	if (status != EFI_SUCCESS) {
-		err = efi_status_to_err(status);
-		goto out;
-	}
-
-	*set = true;
-
-	/*
-	 * Writing to the variable may have caused a change in size (which
-	 * could either be an append or an overwrite), or the variable to be
-	 * deleted. Perform a GetVariable() so we can tell what actually
-	 * happened.
-	 */
-	*size = 0;
-	status = efivar_get_variable(entry->var.VariableName,
-				    &entry->var.VendorGuid,
-				    NULL, size, NULL);
-
-	if (status == EFI_NOT_FOUND)
-		efivar_entry_list_del_unlock(entry);
-	else
-		efivar_unlock();
-
-	if (status && status != EFI_BUFFER_TOO_SMALL)
-		return efi_status_to_err(status);
-
-	return 0;
-
-out:
-	efivar_unlock();
-	return err;
-
-}
-EXPORT_SYMBOL_GPL(efivar_entry_set_get_size);
-
-/**
- * efivar_entry_iter - iterate over variable list
- * @func: callback function
- * @head: head of variable list
- * @data: function-specific data to pass to callback
- *
- * Iterate over the list of EFI variables and call @func with every
- * entry on the list. It is safe for @func to remove entries in the
- * list via efivar_entry_delete() while iterating.
- *
- * Some notes for the callback function:
- *  - a non-zero return value indicates an error and terminates the loop
- *  - @func is called from atomic context
- */
-int efivar_entry_iter(int (*func)(struct efivar_entry *, void *),
-		      struct list_head *head, void *data)
-{
-	struct efivar_entry *entry, *n;
-	int err = 0;
-
-	err = efivar_lock();
-	if (err)
-		return err;
-
-	list_for_each_entry_safe(entry, n, head, list) {
-		err = func(entry, data);
-		if (err)
-			break;
-	}
-	efivar_unlock();
-
-	return err;
-}
-EXPORT_SYMBOL_GPL(efivar_entry_iter);
-
 /**
  * efivars_kobject - get the kobject for the registered efivars
  *
diff --git a/fs/efivarfs/Makefile b/fs/efivarfs/Makefile
index 0b1c5e63eb71..7bfc2f9754a8 100644
--- a/fs/efivarfs/Makefile
+++ b/fs/efivarfs/Makefile
@@ -5,4 +5,4 @@
 
 obj-$(CONFIG_EFIVAR_FS)		+= efivarfs.o
 
-efivarfs-objs			:= inode.o file.o super.o
+efivarfs-objs			:= inode.o file.o super.o vars.o
diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h
index 30ae44cb7453..8ebf3a6a8aa2 100644
--- a/fs/efivarfs/internal.h
+++ b/fs/efivarfs/internal.h
@@ -7,6 +7,46 @@
 #define EFIVAR_FS_INTERNAL_H
 
 #include <linux/list.h>
+#include <linux/efi.h>
+
+struct efi_variable {
+	efi_char16_t  VariableName[EFI_VAR_NAME_LEN/sizeof(efi_char16_t)];
+	efi_guid_t    VendorGuid;
+	unsigned long DataSize;
+	__u8          Data[1024];
+	efi_status_t  Status;
+	__u32         Attributes;
+} __attribute__((packed));
+
+struct efivar_entry {
+	struct efi_variable var;
+	struct list_head list;
+	struct kobject kobj;
+};
+
+int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
+		void *data, bool duplicates, struct list_head *head);
+
+int efivar_entry_add(struct efivar_entry *entry, struct list_head *head);
+void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head);
+void efivar_entry_remove(struct efivar_entry *entry);
+int efivar_entry_delete(struct efivar_entry *entry);
+
+int efivar_entry_size(struct efivar_entry *entry, unsigned long *size);
+int __efivar_entry_get(struct efivar_entry *entry, u32 *attributes,
+		       unsigned long *size, void *data);
+int efivar_entry_get(struct efivar_entry *entry, u32 *attributes,
+		     unsigned long *size, void *data);
+int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes,
+			      unsigned long *size, void *data, bool *set);
+
+int efivar_entry_iter(int (*func)(struct efivar_entry *, void *),
+		      struct list_head *head, void *data);
+
+bool efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data,
+		     unsigned long data_size);
+bool efivar_variable_is_removable(efi_guid_t vendor, const char *name,
+				  size_t len);
 
 extern const struct file_operations efivarfs_file_operations;
 extern const struct inode_operations efivarfs_dir_inode_operations;
diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c
new file mode 100644
index 000000000000..a0ef63cfcecb
--- /dev/null
+++ b/fs/efivarfs/vars.c
@@ -0,0 +1,738 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Originally from efivars.c
+ *
+ * Copyright (C) 2001,2003,2004 Dell <Matt_Domsch@dell.com>
+ * Copyright (C) 2004 Intel Corporation <matthew.e.tolentino@intel.com>
+ */
+
+#include <linux/capability.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/smp.h>
+#include <linux/efi.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/ucs2_string.h>
+
+#include "internal.h"
+
+MODULE_IMPORT_NS(EFIVAR);
+
+static bool
+validate_device_path(efi_char16_t *var_name, int match, u8 *buffer,
+		     unsigned long len)
+{
+	struct efi_generic_dev_path *node;
+	int offset = 0;
+
+	node = (struct efi_generic_dev_path *)buffer;
+
+	if (len < sizeof(*node))
+		return false;
+
+	while (offset <= len - sizeof(*node) &&
+	       node->length >= sizeof(*node) &&
+		node->length <= len - offset) {
+		offset += node->length;
+
+		if ((node->type == EFI_DEV_END_PATH ||
+		     node->type == EFI_DEV_END_PATH2) &&
+		    node->sub_type == EFI_DEV_END_ENTIRE)
+			return true;
+
+		node = (struct efi_generic_dev_path *)(buffer + offset);
+	}
+
+	/*
+	 * If we're here then either node->length pointed past the end
+	 * of the buffer or we reached the end of the buffer without
+	 * finding a device path end node.
+	 */
+	return false;
+}
+
+static bool
+validate_boot_order(efi_char16_t *var_name, int match, u8 *buffer,
+		    unsigned long len)
+{
+	/* An array of 16-bit integers */
+	if ((len % 2) != 0)
+		return false;
+
+	return true;
+}
+
+static bool
+validate_load_option(efi_char16_t *var_name, int match, u8 *buffer,
+		     unsigned long len)
+{
+	u16 filepathlength;
+	int i, desclength = 0, namelen;
+
+	namelen = ucs2_strnlen(var_name, EFI_VAR_NAME_LEN);
+
+	/* Either "Boot" or "Driver" followed by four digits of hex */
+	for (i = match; i < match+4; i++) {
+		if (var_name[i] > 127 ||
+		    hex_to_bin(var_name[i] & 0xff) < 0)
+			return true;
+	}
+
+	/* Reject it if there's 4 digits of hex and then further content */
+	if (namelen > match + 4)
+		return false;
+
+	/* A valid entry must be at least 8 bytes */
+	if (len < 8)
+		return false;
+
+	filepathlength = buffer[4] | buffer[5] << 8;
+
+	/*
+	 * There's no stored length for the description, so it has to be
+	 * found by hand
+	 */
+	desclength = ucs2_strsize((efi_char16_t *)(buffer + 6), len - 6) + 2;
+
+	/* Each boot entry must have a descriptor */
+	if (!desclength)
+		return false;
+
+	/*
+	 * If the sum of the length of the description, the claimed filepath
+	 * length and the original header are greater than the length of the
+	 * variable, it's malformed
+	 */
+	if ((desclength + filepathlength + 6) > len)
+		return false;
+
+	/*
+	 * And, finally, check the filepath
+	 */
+	return validate_device_path(var_name, match, buffer + desclength + 6,
+				    filepathlength);
+}
+
+static bool
+validate_uint16(efi_char16_t *var_name, int match, u8 *buffer,
+		unsigned long len)
+{
+	/* A single 16-bit integer */
+	if (len != 2)
+		return false;
+
+	return true;
+}
+
+static bool
+validate_ascii_string(efi_char16_t *var_name, int match, u8 *buffer,
+		      unsigned long len)
+{
+	int i;
+
+	for (i = 0; i < len; i++) {
+		if (buffer[i] > 127)
+			return false;
+
+		if (buffer[i] == 0)
+			return true;
+	}
+
+	return false;
+}
+
+struct variable_validate {
+	efi_guid_t vendor;
+	char *name;
+	bool (*validate)(efi_char16_t *var_name, int match, u8 *data,
+			 unsigned long len);
+};
+
+/*
+ * This is the list of variables we need to validate, as well as the
+ * whitelist for what we think is safe not to default to immutable.
+ *
+ * If it has a validate() method that's not NULL, it'll go into the
+ * validation routine.  If not, it is assumed valid, but still used for
+ * whitelisting.
+ *
+ * Note that it's sorted by {vendor,name}, but globbed names must come after
+ * any other name with the same prefix.
+ */
+static const struct variable_validate variable_validate[] = {
+	{ EFI_GLOBAL_VARIABLE_GUID, "BootNext", validate_uint16 },
+	{ EFI_GLOBAL_VARIABLE_GUID, "BootOrder", validate_boot_order },
+	{ EFI_GLOBAL_VARIABLE_GUID, "Boot*", validate_load_option },
+	{ EFI_GLOBAL_VARIABLE_GUID, "DriverOrder", validate_boot_order },
+	{ EFI_GLOBAL_VARIABLE_GUID, "Driver*", validate_load_option },
+	{ EFI_GLOBAL_VARIABLE_GUID, "ConIn", validate_device_path },
+	{ EFI_GLOBAL_VARIABLE_GUID, "ConInDev", validate_device_path },
+	{ EFI_GLOBAL_VARIABLE_GUID, "ConOut", validate_device_path },
+	{ EFI_GLOBAL_VARIABLE_GUID, "ConOutDev", validate_device_path },
+	{ EFI_GLOBAL_VARIABLE_GUID, "ErrOut", validate_device_path },
+	{ EFI_GLOBAL_VARIABLE_GUID, "ErrOutDev", validate_device_path },
+	{ EFI_GLOBAL_VARIABLE_GUID, "Lang", validate_ascii_string },
+	{ EFI_GLOBAL_VARIABLE_GUID, "OsIndications", NULL },
+	{ EFI_GLOBAL_VARIABLE_GUID, "PlatformLang", validate_ascii_string },
+	{ EFI_GLOBAL_VARIABLE_GUID, "Timeout", validate_uint16 },
+	{ LINUX_EFI_CRASH_GUID, "*", NULL },
+	{ NULL_GUID, "", NULL },
+};
+
+/*
+ * Check if @var_name matches the pattern given in @match_name.
+ *
+ * @var_name: an array of @len non-NUL characters.
+ * @match_name: a NUL-terminated pattern string, optionally ending in "*". A
+ *              final "*" character matches any trailing characters @var_name,
+ *              including the case when there are none left in @var_name.
+ * @match: on output, the number of non-wildcard characters in @match_name
+ *         that @var_name matches, regardless of the return value.
+ * @return: whether @var_name fully matches @match_name.
+ */
+static bool
+variable_matches(const char *var_name, size_t len, const char *match_name,
+		 int *match)
+{
+	for (*match = 0; ; (*match)++) {
+		char c = match_name[*match];
+
+		switch (c) {
+		case '*':
+			/* Wildcard in @match_name means we've matched. */
+			return true;
+
+		case '\0':
+			/* @match_name has ended. Has @var_name too? */
+			return (*match == len);
+
+		default:
+			/*
+			 * We've reached a non-wildcard char in @match_name.
+			 * Continue only if there's an identical character in
+			 * @var_name.
+			 */
+			if (*match < len && c == var_name[*match])
+				continue;
+			return false;
+		}
+	}
+}
+
+bool
+efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data,
+		unsigned long data_size)
+{
+	int i;
+	unsigned long utf8_size;
+	u8 *utf8_name;
+
+	utf8_size = ucs2_utf8size(var_name);
+	utf8_name = kmalloc(utf8_size + 1, GFP_KERNEL);
+	if (!utf8_name)
+		return false;
+
+	ucs2_as_utf8(utf8_name, var_name, utf8_size);
+	utf8_name[utf8_size] = '\0';
+
+	for (i = 0; variable_validate[i].name[0] != '\0'; i++) {
+		const char *name = variable_validate[i].name;
+		int match = 0;
+
+		if (efi_guidcmp(vendor, variable_validate[i].vendor))
+			continue;
+
+		if (variable_matches(utf8_name, utf8_size+1, name, &match)) {
+			if (variable_validate[i].validate == NULL)
+				break;
+			kfree(utf8_name);
+			return variable_validate[i].validate(var_name, match,
+							     data, data_size);
+		}
+	}
+	kfree(utf8_name);
+	return true;
+}
+
+bool
+efivar_variable_is_removable(efi_guid_t vendor, const char *var_name,
+			     size_t len)
+{
+	int i;
+	bool found = false;
+	int match = 0;
+
+	/*
+	 * Check if our variable is in the validated variables list
+	 */
+	for (i = 0; variable_validate[i].name[0] != '\0'; i++) {
+		if (efi_guidcmp(variable_validate[i].vendor, vendor))
+			continue;
+
+		if (variable_matches(var_name, len,
+				     variable_validate[i].name, &match)) {
+			found = true;
+			break;
+		}
+	}
+
+	/*
+	 * If it's in our list, it is removable.
+	 */
+	return found;
+}
+
+static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor,
+				struct list_head *head)
+{
+	struct efivar_entry *entry, *n;
+	unsigned long strsize1, strsize2;
+	bool found = false;
+
+	strsize1 = ucs2_strsize(variable_name, 1024);
+	list_for_each_entry_safe(entry, n, head, list) {
+		strsize2 = ucs2_strsize(entry->var.VariableName, 1024);
+		if (strsize1 == strsize2 &&
+			!memcmp(variable_name, &(entry->var.VariableName),
+				strsize2) &&
+			!efi_guidcmp(entry->var.VendorGuid,
+				*vendor)) {
+			found = true;
+			break;
+		}
+	}
+	return found;
+}
+
+/*
+ * Returns the size of variable_name, in bytes, including the
+ * terminating NULL character, or variable_name_size if no NULL
+ * character is found among the first variable_name_size bytes.
+ */
+static unsigned long var_name_strnsize(efi_char16_t *variable_name,
+				       unsigned long variable_name_size)
+{
+	unsigned long len;
+	efi_char16_t c;
+
+	/*
+	 * The variable name is, by definition, a NULL-terminated
+	 * string, so make absolutely sure that variable_name_size is
+	 * the value we expect it to be. If not, return the real size.
+	 */
+	for (len = 2; len <= variable_name_size; len += sizeof(c)) {
+		c = variable_name[(len / sizeof(c)) - 1];
+		if (!c)
+			break;
+	}
+
+	return min(len, variable_name_size);
+}
+
+/*
+ * Print a warning when duplicate EFI variables are encountered and
+ * disable the sysfs workqueue since the firmware is buggy.
+ */
+static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid,
+			     unsigned long len16)
+{
+	size_t i, len8 = len16 / sizeof(efi_char16_t);
+	char *str8;
+
+	str8 = kzalloc(len8, GFP_KERNEL);
+	if (!str8)
+		return;
+
+	for (i = 0; i < len8; i++)
+		str8[i] = str16[i];
+
+	printk(KERN_WARNING "efivars: duplicate variable: %s-%pUl\n",
+	       str8, vendor_guid);
+	kfree(str8);
+}
+
+/**
+ * efivar_init - build the initial list of EFI variables
+ * @func: callback function to invoke for every variable
+ * @data: function-specific data to pass to @func
+ * @duplicates: error if we encounter duplicates on @head?
+ * @head: initialised head of variable list
+ *
+ * Get every EFI variable from the firmware and invoke @func. @func
+ * should call efivar_entry_add() to build the list of variables.
+ *
+ * Returns 0 on success, or a kernel error code on failure.
+ */
+int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
+		void *data, bool duplicates, struct list_head *head)
+{
+	unsigned long variable_name_size = 1024;
+	efi_char16_t *variable_name;
+	efi_status_t status;
+	efi_guid_t vendor_guid;
+	int err = 0;
+
+	variable_name = kzalloc(variable_name_size, GFP_KERNEL);
+	if (!variable_name) {
+		printk(KERN_ERR "efivars: Memory allocation failed.\n");
+		return -ENOMEM;
+	}
+
+	err = efivar_lock();
+	if (err)
+		goto free;
+
+	/*
+	 * Per EFI spec, the maximum storage allocated for both
+	 * the variable name and variable data is 1024 bytes.
+	 */
+
+	do {
+		variable_name_size = 1024;
+
+		status = efivar_get_next_variable(&variable_name_size,
+						  variable_name,
+						  &vendor_guid);
+		switch (status) {
+		case EFI_SUCCESS:
+			variable_name_size = var_name_strnsize(variable_name,
+							       variable_name_size);
+
+			/*
+			 * Some firmware implementations return the
+			 * same variable name on multiple calls to
+			 * get_next_variable(). Terminate the loop
+			 * immediately as there is no guarantee that
+			 * we'll ever see a different variable name,
+			 * and may end up looping here forever.
+			 */
+			if (duplicates &&
+			    variable_is_present(variable_name, &vendor_guid,
+						head)) {
+				dup_variable_bug(variable_name, &vendor_guid,
+						 variable_name_size);
+				status = EFI_NOT_FOUND;
+			} else {
+				err = func(variable_name, vendor_guid,
+					   variable_name_size, data);
+				if (err)
+					status = EFI_NOT_FOUND;
+			}
+			break;
+		case EFI_UNSUPPORTED:
+			err = -EOPNOTSUPP;
+			status = EFI_NOT_FOUND;
+			break;
+		case EFI_NOT_FOUND:
+			break;
+		default:
+			printk(KERN_WARNING "efivars: get_next_variable: status=%lx\n",
+				status);
+			status = EFI_NOT_FOUND;
+			break;
+		}
+
+	} while (status != EFI_NOT_FOUND);
+
+	efivar_unlock();
+free:
+	kfree(variable_name);
+
+	return err;
+}
+
+/**
+ * efivar_entry_add - add entry to variable list
+ * @entry: entry to add to list
+ * @head: list head
+ *
+ * Returns 0 on success, or a kernel error code on failure.
+ */
+int efivar_entry_add(struct efivar_entry *entry, struct list_head *head)
+{
+	int err;
+
+	err = efivar_lock();
+	if (err)
+		return err;
+	list_add(&entry->list, head);
+	efivar_unlock();
+
+	return 0;
+}
+
+/**
+ * __efivar_entry_add - add entry to variable list
+ * @entry: entry to add to list
+ * @head: list head
+ */
+void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head)
+{
+	list_add(&entry->list, head);
+}
+
+/**
+ * efivar_entry_remove - remove entry from variable list
+ * @entry: entry to remove from list
+ *
+ * Returns 0 on success, or a kernel error code on failure.
+ */
+void efivar_entry_remove(struct efivar_entry *entry)
+{
+	list_del(&entry->list);
+}
+
+/*
+ * efivar_entry_list_del_unlock - remove entry from variable list
+ * @entry: entry to remove
+ *
+ * Remove @entry from the variable list and release the list lock.
+ *
+ * NOTE: slightly weird locking semantics here - we expect to be
+ * called with the efivars lock already held, and we release it before
+ * returning. This is because this function is usually called after
+ * set_variable() while the lock is still held.
+ */
+static void efivar_entry_list_del_unlock(struct efivar_entry *entry)
+{
+	list_del(&entry->list);
+	efivar_unlock();
+}
+
+/**
+ * efivar_entry_delete - delete variable and remove entry from list
+ * @entry: entry containing variable to delete
+ *
+ * Delete the variable from the firmware and remove @entry from the
+ * variable list. It is the caller's responsibility to free @entry
+ * once we return.
+ *
+ * Returns 0 on success, -EINTR if we can't grab the semaphore,
+ * converted EFI status code if set_variable() fails.
+ */
+int efivar_entry_delete(struct efivar_entry *entry)
+{
+	efi_status_t status;
+	int err;
+
+	err = efivar_lock();
+	if (err)
+		return err;
+
+	status = efivar_set_variable_locked(entry->var.VariableName,
+					    &entry->var.VendorGuid,
+					    0, 0, NULL, false);
+	if (!(status == EFI_SUCCESS || status == EFI_NOT_FOUND)) {
+		efivar_unlock();
+		return efi_status_to_err(status);
+	}
+
+	efivar_entry_list_del_unlock(entry);
+	return 0;
+}
+
+/**
+ * efivar_entry_size - obtain the size of a variable
+ * @entry: entry for this variable
+ * @size: location to store the variable's size
+ */
+int efivar_entry_size(struct efivar_entry *entry, unsigned long *size)
+{
+	efi_status_t status;
+	int err;
+
+	*size = 0;
+
+	err = efivar_lock();
+	if (err)
+		return err;
+
+	status = efivar_get_variable(entry->var.VariableName,
+				     &entry->var.VendorGuid, NULL, size, NULL);
+	efivar_unlock();
+
+	if (status != EFI_BUFFER_TOO_SMALL)
+		return efi_status_to_err(status);
+
+	return 0;
+}
+
+/**
+ * __efivar_entry_get - call get_variable()
+ * @entry: read data for this variable
+ * @attributes: variable attributes
+ * @size: size of @data buffer
+ * @data: buffer to store variable data
+ *
+ * The caller MUST call efivar_entry_iter_begin() and
+ * efivar_entry_iter_end() before and after the invocation of this
+ * function, respectively.
+ */
+int __efivar_entry_get(struct efivar_entry *entry, u32 *attributes,
+		       unsigned long *size, void *data)
+{
+	efi_status_t status;
+
+	status = efivar_get_variable(entry->var.VariableName,
+				     &entry->var.VendorGuid,
+				     attributes, size, data);
+
+	return efi_status_to_err(status);
+}
+
+/**
+ * efivar_entry_get - call get_variable()
+ * @entry: read data for this variable
+ * @attributes: variable attributes
+ * @size: size of @data buffer
+ * @data: buffer to store variable data
+ */
+int efivar_entry_get(struct efivar_entry *entry, u32 *attributes,
+		     unsigned long *size, void *data)
+{
+	int err;
+
+	err = efivar_lock();
+	if (err)
+		return err;
+	err = __efivar_entry_get(entry, attributes, size, data);
+	efivar_unlock();
+
+	return 0;
+}
+
+/**
+ * efivar_entry_set_get_size - call set_variable() and get new size (atomic)
+ * @entry: entry containing variable to set and get
+ * @attributes: attributes of variable to be written
+ * @size: size of data buffer
+ * @data: buffer containing data to write
+ * @set: did the set_variable() call succeed?
+ *
+ * This is a pretty special (complex) function. See efivarfs_file_write().
+ *
+ * Atomically call set_variable() for @entry and if the call is
+ * successful, return the new size of the variable from get_variable()
+ * in @size. The success of set_variable() is indicated by @set.
+ *
+ * Returns 0 on success, -EINVAL if the variable data is invalid,
+ * -ENOSPC if the firmware does not have enough available space, or a
+ * converted EFI status code if either of set_variable() or
+ * get_variable() fail.
+ *
+ * If the EFI variable does not exist when calling set_variable()
+ * (EFI_NOT_FOUND), @entry is removed from the variable list.
+ */
+int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes,
+			      unsigned long *size, void *data, bool *set)
+{
+	efi_char16_t *name = entry->var.VariableName;
+	efi_guid_t *vendor = &entry->var.VendorGuid;
+	efi_status_t status;
+	int err;
+
+	*set = false;
+
+	if (efivar_validate(*vendor, name, data, *size) == false)
+		return -EINVAL;
+
+	/*
+	 * The lock here protects the get_variable call, the conditional
+	 * set_variable call, and removal of the variable from the efivars
+	 * list (in the case of an authenticated delete).
+	 */
+	err = efivar_lock();
+	if (err)
+		return err;
+
+	/*
+	 * Ensure that the available space hasn't shrunk below the safe level
+	 */
+	status = check_var_size(attributes, *size + ucs2_strsize(name, 1024));
+	if (status != EFI_SUCCESS) {
+		if (status != EFI_UNSUPPORTED) {
+			err = efi_status_to_err(status);
+			goto out;
+		}
+
+		if (*size > 65536) {
+			err = -ENOSPC;
+			goto out;
+		}
+	}
+
+	status = efivar_set_variable_locked(name, vendor, attributes, *size,
+					    data, false);
+	if (status != EFI_SUCCESS) {
+		err = efi_status_to_err(status);
+		goto out;
+	}
+
+	*set = true;
+
+	/*
+	 * Writing to the variable may have caused a change in size (which
+	 * could either be an append or an overwrite), or the variable to be
+	 * deleted. Perform a GetVariable() so we can tell what actually
+	 * happened.
+	 */
+	*size = 0;
+	status = efivar_get_variable(entry->var.VariableName,
+				    &entry->var.VendorGuid,
+				    NULL, size, NULL);
+
+	if (status == EFI_NOT_FOUND)
+		efivar_entry_list_del_unlock(entry);
+	else
+		efivar_unlock();
+
+	if (status && status != EFI_BUFFER_TOO_SMALL)
+		return efi_status_to_err(status);
+
+	return 0;
+
+out:
+	efivar_unlock();
+	return err;
+
+}
+
+/**
+ * efivar_entry_iter - iterate over variable list
+ * @func: callback function
+ * @head: head of variable list
+ * @data: function-specific data to pass to callback
+ *
+ * Iterate over the list of EFI variables and call @func with every
+ * entry on the list. It is safe for @func to remove entries in the
+ * list via efivar_entry_delete() while iterating.
+ *
+ * Some notes for the callback function:
+ *  - a non-zero return value indicates an error and terminates the loop
+ *  - @func is called from atomic context
+ */
+int efivar_entry_iter(int (*func)(struct efivar_entry *, void *),
+		      struct list_head *head, void *data)
+{
+	struct efivar_entry *entry, *n;
+	int err = 0;
+
+	err = efivar_lock();
+	if (err)
+		return err;
+
+	list_for_each_entry_safe(entry, n, head, list) {
+		err = func(entry, data);
+		if (err)
+			break;
+	}
+	efivar_unlock();
+
+	return err;
+}
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 10ef0a0d5e9a..8122c2ed505c 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1030,21 +1030,6 @@ struct efivars {
 
 #define EFI_VAR_NAME_LEN	1024
 
-struct efi_variable {
-	efi_char16_t  VariableName[EFI_VAR_NAME_LEN/sizeof(efi_char16_t)];
-	efi_guid_t    VendorGuid;
-	unsigned long DataSize;
-	__u8          Data[1024];
-	efi_status_t  Status;
-	__u32         Attributes;
-} __attribute__((packed));
-
-struct efivar_entry {
-	struct efi_variable var;
-	struct list_head list;
-	struct kobject kobj;
-};
-
 int efivars_register(struct efivars *efivars,
 		     const struct efivar_operations *ops,
 		     struct kobject *kobject);
@@ -1052,29 +1037,6 @@ int efivars_unregister(struct efivars *efivars);
 struct kobject *efivars_kobject(void);
 
 int efivar_supports_writes(void);
-int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
-		void *data, bool duplicates, struct list_head *head);
-
-int efivar_entry_add(struct efivar_entry *entry, struct list_head *head);
-void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head);
-void efivar_entry_remove(struct efivar_entry *entry);
-int efivar_entry_delete(struct efivar_entry *entry);
-
-int efivar_entry_size(struct efivar_entry *entry, unsigned long *size);
-int __efivar_entry_get(struct efivar_entry *entry, u32 *attributes,
-		       unsigned long *size, void *data);
-int efivar_entry_get(struct efivar_entry *entry, u32 *attributes,
-		     unsigned long *size, void *data);
-int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes,
-			      unsigned long *size, void *data, bool *set);
-
-int efivar_entry_iter(int (*func)(struct efivar_entry *, void *),
-		      struct list_head *head, void *data);
-
-bool efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data,
-		     unsigned long data_size);
-bool efivar_variable_is_removable(efi_guid_t vendor, const char *name,
-				  size_t len);
 
 int efivar_lock(void);
 int efivar_trylock(void);
-- 
cgit 


From ede57d58e6f38d5bc66137368e4a1e68a157af6e Mon Sep 17 00:00:00 2001
From: Richard Gobert <richardbgobert@gmail.com>
Date: Wed, 22 Jun 2022 18:09:03 +0200
Subject: net: helper function skb_len_add

Move the len fields manipulation in the skbs to a helper function.
There is a comment specifically requesting this and there are several
other areas in the code displaying the same pattern which can be
refactored.
This improves code readability.

Signed-off-by: Richard Gobert <richardbgobert@gmail.com>
Link: https://lore.kernel.org/r/20220622160853.GA6478@debian
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 12 ++++++++++++
 include/net/sock.h     |  4 +---
 net/core/skbuff.c      | 13 +++----------
 net/ipv4/esp4.c        |  4 +---
 net/ipv4/ip_output.c   |  8 ++------
 5 files changed, 19 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index cd4a8268894a..f6a27ab19202 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2351,6 +2351,18 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb)
 	return skb_headlen(skb) + __skb_pagelen(skb);
 }
 
+/**
+ * skb_len_add - adds a number to len fields of skb
+ * @skb: buffer to add len to
+ * @delta: number of bytes to add
+ */
+static inline void skb_len_add(struct sk_buff *skb, int delta)
+{
+	skb->len += delta;
+	skb->data_len += delta;
+	skb->truesize += delta;
+}
+
 /**
  * __skb_fill_page_desc - initialise a paged fragment in an skb
  * @skb: buffer containing fragment to be initialised
diff --git a/include/net/sock.h b/include/net/sock.h
index 5bed1ea7a722..40bbd0e8925b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2219,9 +2219,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro
 	if (err)
 		return err;
 
-	skb->len	     += copy;
-	skb->data_len	     += copy;
-	skb->truesize	     += copy;
+	skb_len_add(skb, copy);
 	sk_wmem_queued_add(sk, copy);
 	sk_mem_charge(sk, copy);
 	return 0;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 00bf35ee8205..c62e42d0c531 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3195,9 +3195,7 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
 		}
 	}
 
-	to->truesize += len + plen;
-	to->len += len + plen;
-	to->data_len += len + plen;
+	skb_len_add(to, len + plen);
 
 	if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
 		skb_tx_error(from);
@@ -3634,13 +3632,8 @@ onlymerged:
 	tgt->ip_summed = CHECKSUM_PARTIAL;
 	skb->ip_summed = CHECKSUM_PARTIAL;
 
-	/* Yak, is it really working this way? Some helper please? */
-	skb->len -= shiftlen;
-	skb->data_len -= shiftlen;
-	skb->truesize -= shiftlen;
-	tgt->len += shiftlen;
-	tgt->data_len += shiftlen;
-	tgt->truesize += shiftlen;
+	skb_len_add(skb, -shiftlen);
+	skb_len_add(tgt, shiftlen);
 
 	return shiftlen;
 }
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index b21238df3301..7eae8d686e20 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -502,9 +502,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
 
 			nfrags++;
 
-			skb->len += tailen;
-			skb->data_len += tailen;
-			skb->truesize += tailen;
+			skb_len_add(skb, tailen);
 			if (sk && sk_fullsock(sk))
 				refcount_add(tailen, &sk->sk_wmem_alloc);
 
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 00b4bf26fd93..5e32a2f86fbd 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1214,9 +1214,7 @@ alloc_new_skb:
 
 			pfrag->offset += copy;
 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
-			skb->len += copy;
-			skb->data_len += copy;
-			skb->truesize += copy;
+			skb_len_add(skb, copy);
 			wmem_alloc_delta += copy;
 		} else {
 			err = skb_zerocopy_iter_dgram(skb, from, copy);
@@ -1443,9 +1441,7 @@ ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
 			skb->csum = csum_block_add(skb->csum, csum, skb->len);
 		}
 
-		skb->len += len;
-		skb->data_len += len;
-		skb->truesize += len;
+		skb_len_add(skb, len);
 		refcount_add(len, &sk->sk_wmem_alloc);
 		offset += len;
 		size -= len;
-- 
cgit 


From 1e5267cd0895183e09c5bb76da85c674014285d0 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 21 Jun 2022 16:14:47 +0200
Subject: mnt_idmapping: add vfs{g,u}id_t

Introduces new vfs{g,u}id_t types. Similar to k{g,u}id_t the new types
are just simple wrapper structs around regular {g,u}id_t types.

They allows to establish a type safety boundary between {g,u}ids on
idmapped mounts and {g,u}ids as they are represented in filesystems
themselves.

A vfs{g,u}id_t is always created from a k{g,u}id_t, never directly from
a {g,u}id_t as idmapped mounts remap a given {g,u}id according to the
mount's idmapping. This is expressed in the VFS{G,U}IDT_INIT() macros.

A vfs{g,u}id_t may be used as a k{g,u}id_t via AS_K{G,U}IDT(). This
often happens when we need to check whether a {g,u}id mapped according
to an idmapped mount is identical to a given k{g,u}id_t. For an example,
see vfsgid_in_group_p() which determines whether the value of vfsgid_t
matches the value of any of the caller's groups. Similar logic is
expressed in the k{g,u}id_eq_vfs{g,u}id().

The from_vfs{g,u}id() helpers map a given vfs{g,u}id_t from the mount's
idmapping into the filesystem idmapping. They make it possible to update
a filesystem object such as inode->i_{g,u}id with the correct value.

This makes it harder to accidently write a wrong {g,u}id anwywhere. The
vfs{g,u}id_has_fsmapping() helpers check whether a given vfs{g,u}id_t
can be mapped into the filesystem idmapping.

All new helpers are nops on non-idmapped mounts.

I've done work on this roughly 7 months ago but dropped it to focus on
the testsuite. Linus brought this up independently just last week and
it's time to move this along (see [1]).

[1]: https://lore.kernel.org/lkml/CAHk-=win6+ahs1EwLkcq8apqLi_1wXFWbrPf340zYEhObpz4jA@mail.gmail.com

Link: https://lore.kernel.org/r/20220621141454.2914719-2-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
CC: linux-fsdevel@vger.kernel.org
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 include/linux/mnt_idmapping.h | 262 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 234 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h
index ee5a217de2a8..71b4cd5a7466 100644
--- a/include/linux/mnt_idmapping.h
+++ b/include/linux/mnt_idmapping.h
@@ -13,6 +13,122 @@ struct user_namespace;
  */
 extern struct user_namespace init_user_ns;
 
+typedef struct {
+	uid_t val;
+} vfsuid_t;
+
+typedef struct {
+	gid_t val;
+} vfsgid_t;
+
+#ifdef CONFIG_MULTIUSER
+static inline uid_t __vfsuid_val(vfsuid_t uid)
+{
+	return uid.val;
+}
+
+static inline gid_t __vfsgid_val(vfsgid_t gid)
+{
+	return gid.val;
+}
+#else
+static inline uid_t __vfsuid_val(vfsuid_t uid)
+{
+	return 0;
+}
+
+static inline gid_t __vfsgid_val(vfsgid_t gid)
+{
+	return 0;
+}
+#endif
+
+static inline bool vfsuid_valid(vfsuid_t uid)
+{
+	return __vfsuid_val(uid) != (uid_t)-1;
+}
+
+static inline bool vfsgid_valid(vfsgid_t gid)
+{
+	return __vfsgid_val(gid) != (gid_t)-1;
+}
+
+static inline bool vfsuid_eq(vfsuid_t left, vfsuid_t right)
+{
+	return __vfsuid_val(left) == __vfsuid_val(right);
+}
+
+static inline bool vfsgid_eq(vfsgid_t left, vfsgid_t right)
+{
+	return __vfsgid_val(left) == __vfsgid_val(right);
+}
+
+/**
+ * vfsuid_eq_kuid - check whether kuid and vfsuid have the same value
+ * @kuid: the kuid to compare
+ * @vfsuid: the vfsuid to compare
+ *
+ * Check whether @kuid and @vfsuid have the same values.
+ *
+ * Return: true if @kuid and @vfsuid have the same value, false if not.
+ */
+static inline bool vfsuid_eq_kuid(vfsuid_t vfsuid, kuid_t kuid)
+{
+	return __vfsuid_val(vfsuid) == __kuid_val(kuid);
+}
+
+/**
+ * vfsgid_eq_kgid - check whether kgid and vfsgid have the same value
+ * @kgid: the kgid to compare
+ * @vfsgid: the vfsgid to compare
+ *
+ * Check whether @kgid and @vfsgid have the same values.
+ *
+ * Return: true if @kgid and @vfsgid have the same value, false if not.
+ */
+static inline bool vfsgid_eq_kgid(kgid_t kgid, vfsgid_t vfsgid)
+{
+	return __vfsgid_val(vfsgid) == __kgid_val(kgid);
+}
+
+/*
+ * vfs{g,u}ids are created from k{g,u}ids.
+ * We don't allow them to be created from regular {u,g}id.
+ */
+#define VFSUIDT_INIT(val) (vfsuid_t){ __kuid_val(val) }
+#define VFSGIDT_INIT(val) (vfsgid_t){ __kgid_val(val) }
+
+#define INVALID_VFSUID VFSUIDT_INIT(INVALID_UID)
+#define INVALID_VFSGID VFSGIDT_INIT(INVALID_GID)
+
+/*
+ * Allow a vfs{g,u}id to be used as a k{g,u}id where we want to compare
+ * whether the mapped value is identical to value of a k{g,u}id.
+ */
+#define AS_KUIDT(val) (kuid_t){ __vfsuid_val(val) }
+#define AS_KGIDT(val) (kgid_t){ __vfsgid_val(val) }
+
+#ifdef CONFIG_MULTIUSER
+/**
+ * vfsgid_in_group_p() - check whether a vfsuid matches the caller's groups
+ * @vfsgid: the mnt gid to match
+ *
+ * This function can be used to determine whether @vfsuid matches any of the
+ * caller's groups.
+ *
+ * Return: 1 if vfsuid matches caller's groups, 0 if not.
+ */
+static inline int vfsgid_in_group_p(vfsgid_t vfsgid)
+{
+	return in_group_p(AS_KGIDT(vfsgid));
+}
+#else
+static inline int vfsgid_in_group_p(vfsgid_t vfsgid)
+{
+	return 1;
+}
+#endif
+
 /**
  * initial_idmapping - check whether this is the initial mapping
  * @ns: idmapping to check
@@ -67,21 +183,29 @@ static inline bool no_idmapping(const struct user_namespace *mnt_userns,
  * If @kuid has no mapping in either @mnt_userns or @fs_userns INVALID_UID is
  * returned.
  */
-static inline kuid_t mapped_kuid_fs(struct user_namespace *mnt_userns,
-				    struct user_namespace *fs_userns,
-				    kuid_t kuid)
+
+static inline vfsuid_t make_vfsuid(struct user_namespace *mnt_userns,
+				   struct user_namespace *fs_userns,
+				   kuid_t kuid)
 {
 	uid_t uid;
 
 	if (no_idmapping(mnt_userns, fs_userns))
-		return kuid;
+		return VFSUIDT_INIT(kuid);
 	if (initial_idmapping(fs_userns))
 		uid = __kuid_val(kuid);
 	else
 		uid = from_kuid(fs_userns, kuid);
 	if (uid == (uid_t)-1)
-		return INVALID_UID;
-	return make_kuid(mnt_userns, uid);
+		return INVALID_VFSUID;
+	return VFSUIDT_INIT(make_kuid(mnt_userns, uid));
+}
+
+static inline kuid_t mapped_kuid_fs(struct user_namespace *mnt_userns,
+				    struct user_namespace *fs_userns,
+				    kuid_t kuid)
+{
+	return AS_KUIDT(make_vfsuid(mnt_userns, fs_userns, kuid));
 }
 
 /**
@@ -104,21 +228,56 @@ static inline kuid_t mapped_kuid_fs(struct user_namespace *mnt_userns,
  * If @kgid has no mapping in either @mnt_userns or @fs_userns INVALID_GID is
  * returned.
  */
-static inline kgid_t mapped_kgid_fs(struct user_namespace *mnt_userns,
-				    struct user_namespace *fs_userns,
-				    kgid_t kgid)
+
+static inline vfsgid_t make_vfsgid(struct user_namespace *mnt_userns,
+				   struct user_namespace *fs_userns,
+				   kgid_t kgid)
 {
 	gid_t gid;
 
 	if (no_idmapping(mnt_userns, fs_userns))
-		return kgid;
+		return VFSGIDT_INIT(kgid);
 	if (initial_idmapping(fs_userns))
 		gid = __kgid_val(kgid);
 	else
 		gid = from_kgid(fs_userns, kgid);
 	if (gid == (gid_t)-1)
-		return INVALID_GID;
-	return make_kgid(mnt_userns, gid);
+		return INVALID_VFSGID;
+	return VFSGIDT_INIT(make_kgid(mnt_userns, gid));
+}
+
+static inline kgid_t mapped_kgid_fs(struct user_namespace *mnt_userns,
+				    struct user_namespace *fs_userns,
+				    kgid_t kgid)
+{
+	return AS_KGIDT(make_vfsgid(mnt_userns, fs_userns, kgid));
+}
+
+/**
+ * from_vfsuid - map a vfsuid into the filesystem idmapping
+ * @mnt_userns: the mount's idmapping
+ * @fs_userns: the filesystem's idmapping
+ * @vfsuid : vfsuid to be mapped
+ *
+ * Map @vfsuid into the filesystem idmapping. This function has to be used in
+ * order to e.g. write @vfsuid to inode->i_uid.
+ *
+ * Return: @vfsuid mapped into the filesystem idmapping
+ */
+static inline kuid_t from_vfsuid(struct user_namespace *mnt_userns,
+				 struct user_namespace *fs_userns,
+				 vfsuid_t vfsuid)
+{
+	uid_t uid;
+
+	if (no_idmapping(mnt_userns, fs_userns))
+		return AS_KUIDT(vfsuid);
+	uid = from_kuid(mnt_userns, AS_KUIDT(vfsuid));
+	if (uid == (uid_t)-1)
+		return INVALID_UID;
+	if (initial_idmapping(fs_userns))
+		return KUIDT_INIT(uid);
+	return make_kuid(fs_userns, uid);
 }
 
 /**
@@ -145,16 +304,53 @@ static inline kuid_t mapped_kuid_user(struct user_namespace *mnt_userns,
 				      struct user_namespace *fs_userns,
 				      kuid_t kuid)
 {
-	uid_t uid;
+	return from_vfsuid(mnt_userns, fs_userns, VFSUIDT_INIT(kuid));
+}
+
+/**
+ * vfsuid_has_fsmapping - check whether a vfsuid maps into the filesystem
+ * @mnt_userns: the mount's idmapping
+ * @fs_userns: the filesystem's idmapping
+ * @vfsuid: vfsuid to be mapped
+ *
+ * Check whether @vfsuid has a mapping in the filesystem idmapping. Use this
+ * function to check whether the filesystem idmapping has a mapping for
+ * @vfsuid.
+ *
+ * Return: true if @vfsuid has a mapping in the filesystem, false if not.
+ */
+static inline bool vfsuid_has_fsmapping(struct user_namespace *mnt_userns,
+					struct user_namespace *fs_userns,
+					vfsuid_t vfsuid)
+{
+	return uid_valid(from_vfsuid(mnt_userns, fs_userns, vfsuid));
+}
+
+/**
+ * from_vfsgid - map a vfsgid into the filesystem idmapping
+ * @mnt_userns: the mount's idmapping
+ * @fs_userns: the filesystem's idmapping
+ * @vfsgid : vfsgid to be mapped
+ *
+ * Map @vfsgid into the filesystem idmapping. This function has to be used in
+ * order to e.g. write @vfsgid to inode->i_gid.
+ *
+ * Return: @vfsgid mapped into the filesystem idmapping
+ */
+static inline kgid_t from_vfsgid(struct user_namespace *mnt_userns,
+				 struct user_namespace *fs_userns,
+				 vfsgid_t vfsgid)
+{
+	gid_t gid;
 
 	if (no_idmapping(mnt_userns, fs_userns))
-		return kuid;
-	uid = from_kuid(mnt_userns, kuid);
-	if (uid == (uid_t)-1)
-		return INVALID_UID;
+		return AS_KGIDT(vfsgid);
+	gid = from_kgid(mnt_userns, AS_KGIDT(vfsgid));
+	if (gid == (gid_t)-1)
+		return INVALID_GID;
 	if (initial_idmapping(fs_userns))
-		return KUIDT_INIT(uid);
-	return make_kuid(fs_userns, uid);
+		return KGIDT_INIT(gid);
+	return make_kgid(fs_userns, gid);
 }
 
 /**
@@ -181,16 +377,26 @@ static inline kgid_t mapped_kgid_user(struct user_namespace *mnt_userns,
 				      struct user_namespace *fs_userns,
 				      kgid_t kgid)
 {
-	gid_t gid;
+	return from_vfsgid(mnt_userns, fs_userns, VFSGIDT_INIT(kgid));
+}
 
-	if (no_idmapping(mnt_userns, fs_userns))
-		return kgid;
-	gid = from_kgid(mnt_userns, kgid);
-	if (gid == (gid_t)-1)
-		return INVALID_GID;
-	if (initial_idmapping(fs_userns))
-		return KGIDT_INIT(gid);
-	return make_kgid(fs_userns, gid);
+/**
+ * vfsgid_has_fsmapping - check whether a vfsgid maps into the filesystem
+ * @mnt_userns: the mount's idmapping
+ * @fs_userns: the filesystem's idmapping
+ * @vfsgid: vfsgid to be mapped
+ *
+ * Check whether @vfsgid has a mapping in the filesystem idmapping. Use this
+ * function to check whether the filesystem idmapping has a mapping for
+ * @vfsgid.
+ *
+ * Return: true if @vfsgid has a mapping in the filesystem, false if not.
+ */
+static inline bool vfsgid_has_fsmapping(struct user_namespace *mnt_userns,
+					struct user_namespace *fs_userns,
+					vfsgid_t vfsgid)
+{
+	return gid_valid(from_vfsgid(mnt_userns, fs_userns, vfsgid));
 }
 
 /**
-- 
cgit 


From 234a3113f28d02973ecf501f83d796ea89db295f Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 21 Jun 2022 16:14:48 +0200
Subject: fs: add two type safe mapping helpers

Introduce i_{g,u}id_into_vfs{g,u}id(). They return vfs{g,u}id_t. This
makes it way harder to confused idmapped mount {g,u}ids with filesystem
{g,u}ids.

The two helpers will eventually replace the old non type safe
i_{g,u}id_into_mnt() helpers once we finished converting all places. Add
a comment noting that they will be removed in the future.

All new helpers are nops on non-idmapped mounts.

Link: https://lore.kernel.org/r/20220621141454.2914719-3-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
CC: linux-fsdevel@vger.kernel.org
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 include/linux/fs.h | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ad5e3520fae..afcffa251cb9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1600,13 +1600,30 @@ static inline void i_gid_write(struct inode *inode, gid_t gid)
  * @mnt_userns: user namespace of the mount the inode was found from
  * @inode: inode to map
  *
+ * Note, this will eventually be removed completely in favor of the type-safe
+ * i_uid_into_vfsuid().
+ *
  * Return: the inode's i_uid mapped down according to @mnt_userns.
  * If the inode's i_uid has no mapping INVALID_UID is returned.
  */
 static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns,
 				    const struct inode *inode)
 {
-	return mapped_kuid_fs(mnt_userns, i_user_ns(inode), inode->i_uid);
+	return AS_KUIDT(make_vfsuid(mnt_userns, i_user_ns(inode), inode->i_uid));
+}
+
+/**
+ * i_uid_into_vfsuid - map an inode's i_uid down into a mnt_userns
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @inode: inode to map
+ *
+ * Return: whe inode's i_uid mapped down according to @mnt_userns.
+ * If the inode's i_uid has no mapping INVALID_VFSUID is returned.
+ */
+static inline vfsuid_t i_uid_into_vfsuid(struct user_namespace *mnt_userns,
+					 const struct inode *inode)
+{
+	return make_vfsuid(mnt_userns, i_user_ns(inode), inode->i_uid);
 }
 
 /**
@@ -1614,13 +1631,30 @@ static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns,
  * @mnt_userns: user namespace of the mount the inode was found from
  * @inode: inode to map
  *
+ * Note, this will eventually be removed completely in favor of the type-safe
+ * i_gid_into_vfsgid().
+ *
  * Return: the inode's i_gid mapped down according to @mnt_userns.
  * If the inode's i_gid has no mapping INVALID_GID is returned.
  */
 static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns,
 				    const struct inode *inode)
 {
-	return mapped_kgid_fs(mnt_userns, i_user_ns(inode), inode->i_gid);
+	return AS_KGIDT(make_vfsgid(mnt_userns, i_user_ns(inode), inode->i_gid));
+}
+
+/**
+ * i_gid_into_vfsgid - map an inode's i_gid down into a mnt_userns
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @inode: inode to map
+ *
+ * Return: the inode's i_gid mapped down according to @mnt_userns.
+ * If the inode's i_gid has no mapping INVALID_VFSGID is returned.
+ */
+static inline vfsgid_t i_gid_into_vfsgid(struct user_namespace *mnt_userns,
+					 const struct inode *inode)
+{
+	return make_vfsgid(mnt_userns, i_user_ns(inode), inode->i_gid);
 }
 
 /**
-- 
cgit 


From 45c311501c77217c50d08ed08aa722c812d92ab5 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 21 Jun 2022 16:14:49 +0200
Subject: fs: use mount types in iattr

Add ia_vfs{g,u}id members of type vfs{g,u}id_t to struct iattr. We use
an anonymous union (similar to what we do in struct file) around
ia_{g,u}id and ia_vfs{g,u}id.

At the end of this series ia_{g,u}id and ia_vfs{g,u}id will always
contain the same value independent of whether struct iattr is
initialized from an idmapped mount. This is a change from how this is
done today.

Wrapping this in a anonymous unions has a few advantages. It allows us
to avoid needlessly increasing struct iattr. Since the types for
ia_{g,u}id and ia_vfs{g,u}id are structures with overlapping/identical
members they are covered by 6.5.2.3/6 of the C standard and it is safe
to initialize and access them.

Filesystems that raise FS_ALLOW_IDMAP and thus support idmapped mounts
will have to use ia_vfs{g,u}id and the associated helpers. And will be
ported at the end of this series. They will immediately benefit from the
type safe new helpers.

Filesystems that do not support FS_ALLOW_IDMAP can continue to use
ia_{g,u}id for now. The aim is to convert every filesystem to always use
ia_vfs{g,u}id and thus ultimately remove the ia_{g,u}id members.

Link: https://lore.kernel.org/r/20220621141454.2914719-4-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
CC: linux-fsdevel@vger.kernel.org
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 include/linux/fs.h            | 18 ++++++++++++++++--
 include/linux/mnt_idmapping.h |  5 +++++
 2 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index afcffa251cb9..54ffcdce3ccb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -221,8 +221,22 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 struct iattr {
 	unsigned int	ia_valid;
 	umode_t		ia_mode;
-	kuid_t		ia_uid;
-	kgid_t		ia_gid;
+	/*
+	 * The two anonymous unions wrap structures with the same member.
+	 *
+	 * Filesystems raising FS_ALLOW_IDMAP need to use ia_vfs{g,u}id which
+	 * are a dedicated type requiring the filesystem to use the dedicated
+	 * helpers. Other filesystem can continue to use ia_{g,u}id until they
+	 * have been ported.
+	 */
+	union {
+		kuid_t		ia_uid;
+		vfsuid_t	ia_vfsuid;
+	};
+	union {
+		kgid_t		ia_gid;
+		vfsgid_t	ia_vfsgid;
+	};
 	loff_t		ia_size;
 	struct timespec64 ia_atime;
 	struct timespec64 ia_mtime;
diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h
index 71b4cd5a7466..6a752b61088b 100644
--- a/include/linux/mnt_idmapping.h
+++ b/include/linux/mnt_idmapping.h
@@ -21,6 +21,11 @@ typedef struct {
 	gid_t val;
 } vfsgid_t;
 
+static_assert(sizeof(vfsuid_t) == sizeof(kuid_t));
+static_assert(sizeof(vfsgid_t) == sizeof(kgid_t));
+static_assert(offsetof(vfsuid_t, val) == offsetof(kuid_t, val));
+static_assert(offsetof(vfsgid_t, val) == offsetof(kgid_t, val));
+
 #ifdef CONFIG_MULTIUSER
 static inline uid_t __vfsuid_val(vfsuid_t uid)
 {
-- 
cgit 


From 1f36146a5a3dc6098566d34a9886f9e97c88d93e Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 21 Jun 2022 16:14:50 +0200
Subject: fs: introduce tiny iattr ownership update helpers

Nearly all fileystems currently open-code the same checks for
determining whether the i_{g,u}id fields of an inode need to be updated
and then updating the fields.

Introduce tiny helpers i_{g,u}id_needs_update() and i_{g,u}id_update()
that wrap this logic. This allows filesystems to not care about updating
inode->i_{g,u}id with the correct values themselves instead leaving this
to the helpers.

We also get rid of a lot of code duplication and make it easier to
change struct iattr in the future since changes can be localized to
these helpers.

And finally we make it hard to conflate k{g,u}id_t types with
vfs{g,u}id_t types for filesystems that support idmapped mounts.

In the following patch we will port all filesystems that raise
FS_ALLOW_IDMAP to use the new helpers. However, the ultimate goal is to
convert all filesystems to make use of these helpers.

All new helpers are nops on non-idmapped mounts.

Link: https://lore.kernel.org/r/20220621141454.2914719-5-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
CC: linux-fsdevel@vger.kernel.org
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 include/linux/fs.h | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 54ffcdce3ccb..0f8cc7f2665a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1640,6 +1640,44 @@ static inline vfsuid_t i_uid_into_vfsuid(struct user_namespace *mnt_userns,
 	return make_vfsuid(mnt_userns, i_user_ns(inode), inode->i_uid);
 }
 
+/**
+ * i_uid_needs_update - check whether inode's i_uid needs to be updated
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @attr: the new attributes of @inode
+ * @inode: the inode to update
+ *
+ * Check whether the $inode's i_uid field needs to be updated taking idmapped
+ * mounts into account if the filesystem supports it.
+ *
+ * Return: true if @inode's i_uid field needs to be updated, false if not.
+ */
+static inline bool i_uid_needs_update(struct user_namespace *mnt_userns,
+				      const struct iattr *attr,
+				      const struct inode *inode)
+{
+	return ((attr->ia_valid & ATTR_UID) &&
+		!vfsuid_eq(attr->ia_vfsuid,
+			   i_uid_into_vfsuid(mnt_userns, inode)));
+}
+
+/**
+ * i_uid_update - update @inode's i_uid field
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @attr: the new attributes of @inode
+ * @inode: the inode to update
+ *
+ * Safely update @inode's i_uid field translating the vfsuid of any idmapped
+ * mount into the filesystem kuid.
+ */
+static inline void i_uid_update(struct user_namespace *mnt_userns,
+				const struct iattr *attr,
+				struct inode *inode)
+{
+	if (attr->ia_valid & ATTR_UID)
+		inode->i_uid = from_vfsuid(mnt_userns, i_user_ns(inode),
+					   attr->ia_vfsuid);
+}
+
 /**
  * i_gid_into_mnt - map an inode's i_gid down into a mnt_userns
  * @mnt_userns: user namespace of the mount the inode was found from
@@ -1671,6 +1709,44 @@ static inline vfsgid_t i_gid_into_vfsgid(struct user_namespace *mnt_userns,
 	return make_vfsgid(mnt_userns, i_user_ns(inode), inode->i_gid);
 }
 
+/**
+ * i_gid_needs_update - check whether inode's i_gid needs to be updated
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @attr: the new attributes of @inode
+ * @inode: the inode to update
+ *
+ * Check whether the $inode's i_gid field needs to be updated taking idmapped
+ * mounts into account if the filesystem supports it.
+ *
+ * Return: true if @inode's i_gid field needs to be updated, false if not.
+ */
+static inline bool i_gid_needs_update(struct user_namespace *mnt_userns,
+				      const struct iattr *attr,
+				      const struct inode *inode)
+{
+	return ((attr->ia_valid & ATTR_GID) &&
+		!vfsgid_eq(attr->ia_vfsgid,
+			   i_gid_into_vfsgid(mnt_userns, inode)));
+}
+
+/**
+ * i_gid_update - update @inode's i_gid field
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @attr: the new attributes of @inode
+ * @inode: the inode to update
+ *
+ * Safely update @inode's i_gid field translating the vfsgid of any idmapped
+ * mount into the filesystem kgid.
+ */
+static inline void i_gid_update(struct user_namespace *mnt_userns,
+				const struct iattr *attr,
+				struct inode *inode)
+{
+	if (attr->ia_valid & ATTR_GID)
+		inode->i_gid = from_vfsgid(mnt_userns, i_user_ns(inode),
+					   attr->ia_vfsgid);
+}
+
 /**
  * inode_fsuid_set - initialize inode's i_uid field with callers fsuid
  * @inode: inode to initialize
-- 
cgit 


From 35faf3109a78516f60ca13f957083d5e5535fde0 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 21 Jun 2022 16:14:51 +0200
Subject: fs: port to iattr ownership update helpers

Earlier we introduced new helpers to abstract ownership update and
remove code duplication. This converts all filesystems supporting
idmapped mounts to make use of these new helpers.

For now we always pass the initial idmapping which makes the idmapping
functions these helpers call nops.

This is done because we currently always pass the actual value to be
written to i_{g,u}id via struct iattr. While this allowed us to treat
the {g,u}id values in struct iattr as values that can be directly
written to inode->i_{g,u}id it also increases the potential for
confusion for filesystems.

Now that we are have dedicated types to prevent this confusion we will
ultimately only map the value from the idmapped mount into a filesystem
value that can be written to inode->i_{g,u}id when the filesystem
actually updates the inode. So pass down the initial idmapping until we
finished that conversion at which point we pass down the mount's
idmapping.

No functional changes intended.

Link: https://lore.kernel.org/r/20220621141454.2914719-6-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
CC: linux-fsdevel@vger.kernel.org
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 fs/attr.c                         |  6 ++----
 fs/ext2/inode.c                   |  4 ++--
 fs/ext4/inode.c                   | 10 ++++------
 fs/f2fs/file.c                    | 18 ++++++------------
 fs/quota/dquot.c                  |  4 ++--
 fs/xfs/xfs_iops.c                 |  8 ++++----
 include/linux/quotaops.h          |  6 +++---
 security/integrity/evm/evm_main.c |  4 ++--
 8 files changed, 25 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/fs/attr.c b/fs/attr.c
index dbe996b0dedf..2e180dd9460f 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -242,10 +242,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode,
 {
 	unsigned int ia_valid = attr->ia_valid;
 
-	if (ia_valid & ATTR_UID)
-		inode->i_uid = attr->ia_uid;
-	if (ia_valid & ATTR_GID)
-		inode->i_gid = attr->ia_gid;
+	i_uid_update(&init_user_ns, attr, inode);
+	i_gid_update(&init_user_ns, attr, inode);
 	if (ia_valid & ATTR_ATIME)
 		inode->i_atime = attr->ia_atime;
 	if (ia_valid & ATTR_MTIME)
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e6b932219803..6dc66ab97d20 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1684,8 +1684,8 @@ int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 		if (error)
 			return error;
 	}
-	if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
-	    (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
+	if (i_uid_needs_update(&init_user_ns, iattr, inode) ||
+	    i_gid_needs_update(&init_user_ns, iattr, inode)) {
 		error = dquot_transfer(inode, iattr);
 		if (error)
 			return error;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 84c0eb55071d..05d932f81c53 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5356,8 +5356,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 			return error;
 	}
 
-	if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
-	    (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
+	if (i_uid_needs_update(&init_user_ns, attr, inode) ||
+	    i_gid_needs_update(&init_user_ns, attr, inode)) {
 		handle_t *handle;
 
 		/* (user+group)*(old+new) structure, inode write (sb,
@@ -5383,10 +5383,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 		}
 		/* Update corresponding info in inode so that everything is in
 		 * one transaction */
-		if (attr->ia_valid & ATTR_UID)
-			inode->i_uid = attr->ia_uid;
-		if (attr->ia_valid & ATTR_GID)
-			inode->i_gid = attr->ia_gid;
+		i_uid_update(&init_user_ns, attr, inode);
+		i_gid_update(&init_user_ns, attr, inode);
 		error = ext4_mark_inode_dirty(handle, inode);
 		ext4_journal_stop(handle);
 		if (unlikely(error)) {
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index bd14cef1b08f..a35d6b12bd63 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -861,10 +861,8 @@ static void __setattr_copy(struct user_namespace *mnt_userns,
 {
 	unsigned int ia_valid = attr->ia_valid;
 
-	if (ia_valid & ATTR_UID)
-		inode->i_uid = attr->ia_uid;
-	if (ia_valid & ATTR_GID)
-		inode->i_gid = attr->ia_gid;
+	i_uid_update(&init_user_ns, attr, inode);
+	i_gid_update(&init_user_ns, attr, inode);
 	if (ia_valid & ATTR_ATIME)
 		inode->i_atime = attr->ia_atime;
 	if (ia_valid & ATTR_MTIME)
@@ -922,10 +920,8 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 		if (err)
 			return err;
 	}
-	if ((attr->ia_valid & ATTR_UID &&
-		!uid_eq(attr->ia_uid, inode->i_uid)) ||
-		(attr->ia_valid & ATTR_GID &&
-		!gid_eq(attr->ia_gid, inode->i_gid))) {
+	if (i_uid_needs_update(&init_user_ns, attr, inode) ||
+	    i_gid_needs_update(&init_user_ns, attr, inode)) {
 		f2fs_lock_op(F2FS_I_SB(inode));
 		err = dquot_transfer(inode, attr);
 		if (err) {
@@ -938,10 +934,8 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 		 * update uid/gid under lock_op(), so that dquot and inode can
 		 * be updated atomically.
 		 */
-		if (attr->ia_valid & ATTR_UID)
-			inode->i_uid = attr->ia_uid;
-		if (attr->ia_valid & ATTR_GID)
-			inode->i_gid = attr->ia_gid;
+		i_uid_update(&init_user_ns, attr, inode);
+		i_gid_update(&init_user_ns, attr, inode);
 		f2fs_mark_inode_dirty_sync(inode, true);
 		f2fs_unlock_op(F2FS_I_SB(inode));
 	}
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 09d1307959d0..6cec2bfbf51b 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2095,7 +2095,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 	if (!dquot_active(inode))
 		return 0;
 
-	if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)){
+	if (i_uid_needs_update(&init_user_ns, iattr, inode)) {
 		dquot = dqget(sb, make_kqid_uid(iattr->ia_uid));
 		if (IS_ERR(dquot)) {
 			if (PTR_ERR(dquot) != -ESRCH) {
@@ -2106,7 +2106,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 		}
 		transfer_to[USRQUOTA] = dquot;
 	}
-	if (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid)){
+	if (i_gid_needs_update(&init_user_ns, iattr, inode)) {
 		dquot = dqget(sb, make_kqid_gid(iattr->ia_gid));
 		if (IS_ERR(dquot)) {
 			if (PTR_ERR(dquot) != -ESRCH) {
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 29f5b8b8aca6..31ec29565fb4 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -704,13 +704,13 @@ xfs_setattr_nonsize(
 	 * didn't have the inode locked, inode's dquot(s) would have changed
 	 * also.
 	 */
-	if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp) &&
-	    !uid_eq(inode->i_uid, iattr->ia_uid)) {
+	if (XFS_IS_UQUOTA_ON(mp) &&
+	    i_uid_needs_update(&init_user_ns, iattr, inode)) {
 		ASSERT(udqp);
 		old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp);
 	}
-	if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp) &&
-	    !gid_eq(inode->i_gid, iattr->ia_gid)) {
+	if (XFS_IS_GQUOTA_ON(mp) &&
+	    i_gid_needs_update(&init_user_ns, iattr, inode)) {
 		ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp));
 		ASSERT(gdqp);
 		old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp);
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index a0f6668924d3..61ee34861ca2 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -22,9 +22,9 @@ static inline struct quota_info *sb_dqopt(struct super_block *sb)
 /* i_mutex must being held */
 static inline bool is_quota_modification(struct inode *inode, struct iattr *ia)
 {
-	return (ia->ia_valid & ATTR_SIZE) ||
-		(ia->ia_valid & ATTR_UID && !uid_eq(ia->ia_uid, inode->i_uid)) ||
-		(ia->ia_valid & ATTR_GID && !gid_eq(ia->ia_gid, inode->i_gid));
+	return ((ia->ia_valid & ATTR_SIZE) ||
+		i_uid_needs_update(&init_user_ns, ia, inode) ||
+		i_gid_needs_update(&init_user_ns, ia, inode));
 }
 
 #if defined(CONFIG_QUOTA)
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index cc88f02c7562..bcde6bc2a2ce 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -760,8 +760,8 @@ static int evm_attr_change(struct dentry *dentry, struct iattr *attr)
 	struct inode *inode = d_backing_inode(dentry);
 	unsigned int ia_valid = attr->ia_valid;
 
-	if ((!(ia_valid & ATTR_UID) || uid_eq(attr->ia_uid, inode->i_uid)) &&
-	    (!(ia_valid & ATTR_GID) || gid_eq(attr->ia_gid, inode->i_gid)) &&
+	if (!i_uid_needs_update(&init_user_ns, attr, inode) &&
+	    !i_gid_needs_update(&init_user_ns, attr, inode) &&
 	    (!(ia_valid & ATTR_MODE) || attr->ia_mode == inode->i_mode))
 		return 0;
 
-- 
cgit 


From 71e7b535b8900d7ce7d5279fa472711db5251ae5 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 21 Jun 2022 16:14:52 +0200
Subject: quota: port quota helpers mount ids

Port the is_quota_modification() and dqout_transfer() helper to type
safe vfs{g,u}id_t. Since these helpers are only called by a few
filesystems don't introduce a new helper but simply extend the existing
helpers to pass down the mount's idmapping.

Note, that this is a non-functional change, i.e. nothing will have
happened here or at the end of this series to how quota are done! This
a change necessary because we will at the end of this series make
ownership changes easier to reason about by keeping the original value
in struct iattr for both non-idmapped and idmapped mounts.

For now we always pass the initial idmapping which makes the idmapping
functions these helpers call nops.

This is done because we currently always pass the actual value to be
written to i_{g,u}id via struct iattr. While this allowed us to treat
the {g,u}id values in struct iattr as values that can be directly
written to inode->i_{g,u}id it also increases the potential for
confusion for filesystems.

Now that we are have dedicated types to prevent this confusion we will
ultimately only map the value from the idmapped mount into a filesystem
value that can be written to inode->i_{g,u}id when the filesystem
actually updates the inode. So pass down the initial idmapping until we
finished that conversion at which point we pass down the mount's
idmapping.

Since struct iattr uses an anonymous union with overlapping types as
supported by the C standard, filesystems that haven't converted to
ia_vfs{g,u}id won't see any difference and things will continue to work
as before. In other words, no functional changes intended with this
change.

Link: https://lore.kernel.org/r/20220621141454.2914719-7-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Jan Kara <jack@suse.cz>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
CC: linux-fsdevel@vger.kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 fs/ext2/inode.c          | 4 ++--
 fs/ext4/inode.c          | 4 ++--
 fs/f2fs/file.c           | 4 ++--
 fs/f2fs/recovery.c       | 2 +-
 fs/jfs/file.c            | 4 ++--
 fs/ocfs2/file.c          | 2 +-
 fs/quota/dquot.c         | 3 ++-
 fs/reiserfs/inode.c      | 4 ++--
 fs/zonefs/super.c        | 2 +-
 include/linux/quotaops.h | 9 ++++++---
 10 files changed, 21 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 6dc66ab97d20..593b79416e0e 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1679,14 +1679,14 @@ int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	if (error)
 		return error;
 
-	if (is_quota_modification(inode, iattr)) {
+	if (is_quota_modification(&init_user_ns, inode, iattr)) {
 		error = dquot_initialize(inode);
 		if (error)
 			return error;
 	}
 	if (i_uid_needs_update(&init_user_ns, iattr, inode) ||
 	    i_gid_needs_update(&init_user_ns, iattr, inode)) {
-		error = dquot_transfer(inode, iattr);
+		error = dquot_transfer(&init_user_ns, inode, iattr);
 		if (error)
 			return error;
 	}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 05d932f81c53..72f08c184768 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5350,7 +5350,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	if (error)
 		return error;
 
-	if (is_quota_modification(inode, attr)) {
+	if (is_quota_modification(&init_user_ns, inode, attr)) {
 		error = dquot_initialize(inode);
 		if (error)
 			return error;
@@ -5374,7 +5374,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 		 * counts xattr inode references.
 		 */
 		down_read(&EXT4_I(inode)->xattr_sem);
-		error = dquot_transfer(inode, attr);
+		error = dquot_transfer(&init_user_ns, inode, attr);
 		up_read(&EXT4_I(inode)->xattr_sem);
 
 		if (error) {
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index a35d6b12bd63..02b2d56d4edc 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -915,7 +915,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	if (err)
 		return err;
 
-	if (is_quota_modification(inode, attr)) {
+	if (is_quota_modification(&init_user_ns, inode, attr)) {
 		err = f2fs_dquot_initialize(inode);
 		if (err)
 			return err;
@@ -923,7 +923,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	if (i_uid_needs_update(&init_user_ns, attr, inode) ||
 	    i_gid_needs_update(&init_user_ns, attr, inode)) {
 		f2fs_lock_op(F2FS_I_SB(inode));
-		err = dquot_transfer(inode, attr);
+		err = dquot_transfer(&init_user_ns, inode, attr);
 		if (err) {
 			set_sbi_flag(F2FS_I_SB(inode),
 					SBI_QUOTA_NEED_REPAIR);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 3cb7f8a43b4d..8e5a089f1ac8 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -266,7 +266,7 @@ static int recover_quota_data(struct inode *inode, struct page *page)
 	if (!attr.ia_valid)
 		return 0;
 
-	err = dquot_transfer(inode, &attr);
+	err = dquot_transfer(&init_user_ns, inode, &attr);
 	if (err)
 		set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR);
 	return err;
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 1d732fd223d4..c18569b9895d 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -95,14 +95,14 @@ int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	if (rc)
 		return rc;
 
-	if (is_quota_modification(inode, iattr)) {
+	if (is_quota_modification(&init_user_ns, inode, iattr)) {
 		rc = dquot_initialize(inode);
 		if (rc)
 			return rc;
 	}
 	if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
 	    (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
-		rc = dquot_transfer(inode, iattr);
+		rc = dquot_transfer(&init_user_ns, inode, iattr);
 		if (rc)
 			return rc;
 	}
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7497cd592258..0e09cd8911da 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1146,7 +1146,7 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	if (status)
 		return status;
 
-	if (is_quota_modification(inode, attr)) {
+	if (is_quota_modification(&init_user_ns, inode, attr)) {
 		status = dquot_initialize(inode);
 		if (status)
 			return status;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 6cec2bfbf51b..df9af1ce2851 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2085,7 +2085,8 @@ EXPORT_SYMBOL(__dquot_transfer);
 /* Wrapper for transferring ownership of an inode for uid/gid only
  * Called from FSXXX_setattr()
  */
-int dquot_transfer(struct inode *inode, struct iattr *iattr)
+int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode,
+		   struct iattr *iattr)
 {
 	struct dquot *transfer_to[MAXQUOTAS] = {};
 	struct dquot *dquot;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0cffe054b78e..1e89e76972a0 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3284,7 +3284,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	/* must be turned off for recursive notify_change calls */
 	ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
 
-	if (is_quota_modification(inode, attr)) {
+	if (is_quota_modification(&init_user_ns, inode, attr)) {
 		error = dquot_initialize(inode);
 		if (error)
 			return error;
@@ -3367,7 +3367,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 		reiserfs_write_unlock(inode->i_sb);
 		if (error)
 			goto out;
-		error = dquot_transfer(inode, attr);
+		error = dquot_transfer(&init_user_ns, inode, attr);
 		reiserfs_write_lock(inode->i_sb);
 		if (error) {
 			journal_end(&th);
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 053299758deb..dd422b1d7320 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -616,7 +616,7 @@ static int zonefs_inode_setattr(struct user_namespace *mnt_userns,
 	     !uid_eq(iattr->ia_uid, inode->i_uid)) ||
 	    ((iattr->ia_valid & ATTR_GID) &&
 	     !gid_eq(iattr->ia_gid, inode->i_gid))) {
-		ret = dquot_transfer(inode, iattr);
+		ret = dquot_transfer(&init_user_ns, inode, iattr);
 		if (ret)
 			return ret;
 	}
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 61ee34861ca2..0342ff6584fd 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -20,7 +20,8 @@ static inline struct quota_info *sb_dqopt(struct super_block *sb)
 }
 
 /* i_mutex must being held */
-static inline bool is_quota_modification(struct inode *inode, struct iattr *ia)
+static inline bool is_quota_modification(struct user_namespace *mnt_userns,
+					 struct inode *inode, struct iattr *ia)
 {
 	return ((ia->ia_valid & ATTR_SIZE) ||
 		i_uid_needs_update(&init_user_ns, ia, inode) ||
@@ -115,7 +116,8 @@ int dquot_set_dqblk(struct super_block *sb, struct kqid id,
 		struct qc_dqblk *di);
 
 int __dquot_transfer(struct inode *inode, struct dquot **transfer_to);
-int dquot_transfer(struct inode *inode, struct iattr *iattr);
+int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode,
+		   struct iattr *iattr);
 
 static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type)
 {
@@ -234,7 +236,8 @@ static inline void dquot_free_inode(struct inode *inode)
 {
 }
 
-static inline int dquot_transfer(struct inode *inode, struct iattr *iattr)
+static inline int dquot_transfer(struct user_namespace *mnt_userns,
+				 struct inode *inode, struct iattr *iattr)
 {
 	return 0;
 }
-- 
cgit 


From 0e363cf3fa598c69340794da068d4d9cbc869322 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 21 Jun 2022 16:14:53 +0200
Subject: security: pass down mount idmapping to setattr hook

Before this change we used to take a shortcut and place the actual
values that would be written to inode->i_{g,u}id into struct iattr. This
had the advantage that we moved idmappings mostly out of the picture
early on but it made reasoning about changes more difficult than it
should be.

The filesystem was never explicitly told that it dealt with an idmapped
mount. The transition to the value that needed to be stored in
inode->i_{g,u}id appeared way too early and increased the probability of
bugs in various codepaths.

We know place the same value in struct iattr no matter if this is an
idmapped mount or not. The vfs will only deal with type safe
vfs{g,u}id_t. This makes it massively safer to perform permission checks
as the type will tell us what checks we need to perform and what helpers
we need to use.

Adapt the security_inode_setattr() helper to pass down the mount's
idmapping to account for that change.

Link: https://lore.kernel.org/r/20220621141454.2914719-8-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
CC: linux-fsdevel@vger.kernel.org
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 fs/attr.c                         | 2 +-
 fs/fat/file.c                     | 3 ++-
 include/linux/evm.h               | 6 ++++--
 include/linux/security.h          | 8 +++++---
 security/integrity/evm/evm_main.c | 8 +++++---
 security/security.c               | 5 +++--
 6 files changed, 20 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/fs/attr.c b/fs/attr.c
index 2e180dd9460f..88e2ca30d42e 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -411,7 +411,7 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
 	    !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
 		return -EOVERFLOW;
 
-	error = security_inode_setattr(dentry, attr);
+	error = security_inode_setattr(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 	error = try_break_deleg(inode, delegated_inode);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 3dae3ed60f3a..530f18173db2 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -90,7 +90,8 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
 	 * out the RO attribute for checking by the security
 	 * module, just because it maps to a file mode.
 	 */
-	err = security_inode_setattr(file->f_path.dentry, &ia);
+	err = security_inode_setattr(&init_user_ns,
+				     file->f_path.dentry, &ia);
 	if (err)
 		goto out_unlock_inode;
 
diff --git a/include/linux/evm.h b/include/linux/evm.h
index 4c374be70247..aa63e0b3c0a2 100644
--- a/include/linux/evm.h
+++ b/include/linux/evm.h
@@ -21,7 +21,8 @@ extern enum integrity_status evm_verifyxattr(struct dentry *dentry,
 					     void *xattr_value,
 					     size_t xattr_value_len,
 					     struct integrity_iint_cache *iint);
-extern int evm_inode_setattr(struct dentry *dentry, struct iattr *attr);
+extern int evm_inode_setattr(struct user_namespace *mnt_userns,
+			     struct dentry *dentry, struct iattr *attr);
 extern void evm_inode_post_setattr(struct dentry *dentry, int ia_valid);
 extern int evm_inode_setxattr(struct user_namespace *mnt_userns,
 			      struct dentry *dentry, const char *name,
@@ -68,7 +69,8 @@ static inline enum integrity_status evm_verifyxattr(struct dentry *dentry,
 }
 #endif
 
-static inline int evm_inode_setattr(struct dentry *dentry, struct iattr *attr)
+static inline int evm_inode_setattr(struct user_namespace *mnt_userns,
+				    struct dentry *dentry, struct iattr *attr)
 {
 	return 0;
 }
diff --git a/include/linux/security.h b/include/linux/security.h
index 7fc4e9f49f54..4d0baf30266e 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -353,7 +353,8 @@ int security_inode_readlink(struct dentry *dentry);
 int security_inode_follow_link(struct dentry *dentry, struct inode *inode,
 			       bool rcu);
 int security_inode_permission(struct inode *inode, int mask);
-int security_inode_setattr(struct dentry *dentry, struct iattr *attr);
+int security_inode_setattr(struct user_namespace *mnt_userns,
+			   struct dentry *dentry, struct iattr *attr);
 int security_inode_getattr(const struct path *path);
 int security_inode_setxattr(struct user_namespace *mnt_userns,
 			    struct dentry *dentry, const char *name,
@@ -848,8 +849,9 @@ static inline int security_inode_permission(struct inode *inode, int mask)
 	return 0;
 }
 
-static inline int security_inode_setattr(struct dentry *dentry,
-					  struct iattr *attr)
+static inline int security_inode_setattr(struct user_namespace *mnt_userns,
+					 struct dentry *dentry,
+					 struct iattr *attr)
 {
 	return 0;
 }
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index bcde6bc2a2ce..7f4af5b58583 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -755,7 +755,8 @@ void evm_inode_post_removexattr(struct dentry *dentry, const char *xattr_name)
 	evm_update_evmxattr(dentry, xattr_name, NULL, 0);
 }
 
-static int evm_attr_change(struct dentry *dentry, struct iattr *attr)
+static int evm_attr_change(struct user_namespace *mnt_userns,
+			   struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = d_backing_inode(dentry);
 	unsigned int ia_valid = attr->ia_valid;
@@ -775,7 +776,8 @@ static int evm_attr_change(struct dentry *dentry, struct iattr *attr)
  * Permit update of file attributes when files have a valid EVM signature,
  * except in the case of them having an immutable portable signature.
  */
-int evm_inode_setattr(struct dentry *dentry, struct iattr *attr)
+int evm_inode_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		      struct iattr *attr)
 {
 	unsigned int ia_valid = attr->ia_valid;
 	enum integrity_status evm_status;
@@ -801,7 +803,7 @@ int evm_inode_setattr(struct dentry *dentry, struct iattr *attr)
 		return 0;
 
 	if (evm_status == INTEGRITY_PASS_IMMUTABLE &&
-	    !evm_attr_change(dentry, attr))
+	    !evm_attr_change(mnt_userns, dentry, attr))
 		return 0;
 
 	integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry),
diff --git a/security/security.c b/security/security.c
index 188b8f782220..f85afb02ea1c 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1324,7 +1324,8 @@ int security_inode_permission(struct inode *inode, int mask)
 	return call_int_hook(inode_permission, 0, inode, mask);
 }
 
-int security_inode_setattr(struct dentry *dentry, struct iattr *attr)
+int security_inode_setattr(struct user_namespace *mnt_userns,
+			   struct dentry *dentry, struct iattr *attr)
 {
 	int ret;
 
@@ -1333,7 +1334,7 @@ int security_inode_setattr(struct dentry *dentry, struct iattr *attr)
 	ret = call_int_hook(inode_setattr, 0, dentry, attr);
 	if (ret)
 		return ret;
-	return evm_inode_setattr(dentry, attr);
+	return evm_inode_setattr(mnt_userns, dentry, attr);
 }
 EXPORT_SYMBOL_GPL(security_inode_setattr);
 
-- 
cgit 


From b27c82e1296572cfa3997e58db3118a33915f85c Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 21 Jun 2022 16:14:54 +0200
Subject: attr: port attribute changes to new types

Now that we introduced new infrastructure to increase the type safety
for filesystems supporting idmapped mounts port the first part of the
vfs over to them.

This ports the attribute changes codepaths to rely on the new better
helpers using a dedicated type.

Before this change we used to take a shortcut and place the actual
values that would be written to inode->i_{g,u}id into struct iattr. This
had the advantage that we moved idmappings mostly out of the picture
early on but it made reasoning about changes more difficult than it
should be.

The filesystem was never explicitly told that it dealt with an idmapped
mount. The transition to the value that needed to be stored in
inode->i_{g,u}id appeared way too early and increased the probability of
bugs in various codepaths.

We know place the same value in struct iattr no matter if this is an
idmapped mount or not. The vfs will only deal with type safe
vfs{g,u}id_t. This makes it massively safer to perform permission checks
as the type will tell us what checks we need to perform and what helpers
we need to use.

Fileystems raising FS_ALLOW_IDMAP can't simply write ia_vfs{g,u}id to
inode->i_{g,u}id since they are different types. Instead they need to
use the dedicated vfs{g,u}id_to_k{g,u}id() helpers that map the
vfs{g,u}id into the filesystem.

The other nice effect is that filesystems like overlayfs don't need to
care about idmappings explicitly anymore and can simply set up struct
iattr accordingly directly.

Link: https://lore.kernel.org/lkml/CAHk-=win6+ahs1EwLkcq8apqLi_1wXFWbrPf340zYEhObpz4jA@mail.gmail.com [1]
Link: https://lore.kernel.org/r/20220621141454.2914719-9-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
CC: linux-fsdevel@vger.kernel.org
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 fs/attr.c                         | 68 ++++++++++++++++++---------------------
 fs/ext2/inode.c                   |  8 ++---
 fs/ext4/inode.c                   | 12 +++----
 fs/f2fs/file.c                    | 16 ++++-----
 fs/f2fs/recovery.c                |  8 ++---
 fs/fat/file.c                     |  8 +++--
 fs/jfs/file.c                     |  4 +--
 fs/ocfs2/file.c                   |  2 +-
 fs/open.c                         | 60 +++++++++++++++++++++++++---------
 fs/overlayfs/copy_up.c            |  4 +--
 fs/overlayfs/overlayfs.h          | 12 +------
 fs/quota/dquot.c                  | 14 +++++---
 fs/reiserfs/inode.c               |  4 +--
 fs/xfs/xfs_iops.c                 | 10 +++---
 fs/zonefs/super.c                 |  2 +-
 include/linux/fs.h                |  4 +++
 include/linux/quotaops.h          |  4 +--
 security/integrity/evm/evm_main.c |  4 +--
 18 files changed, 137 insertions(+), 107 deletions(-)

(limited to 'include')

diff --git a/fs/attr.c b/fs/attr.c
index 88e2ca30d42e..1ba7ddef537f 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -31,15 +31,15 @@
  * performed on the raw inode simply passs init_user_ns.
  */
 static bool chown_ok(struct user_namespace *mnt_userns,
-		     const struct inode *inode,
-		     kuid_t uid)
+		     const struct inode *inode, vfsuid_t ia_vfsuid)
 {
-	kuid_t kuid = i_uid_into_mnt(mnt_userns, inode);
-	if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, inode->i_uid))
+	vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+	if (vfsuid_eq_kuid(vfsuid, current_fsuid()) &&
+	    vfsuid_eq(ia_vfsuid, vfsuid))
 		return true;
 	if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))
 		return true;
-	if (uid_eq(kuid, INVALID_UID) &&
+	if (!vfsuid_valid(vfsuid) &&
 	    ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
 		return true;
 	return false;
@@ -58,21 +58,19 @@ static bool chown_ok(struct user_namespace *mnt_userns,
  * performed on the raw inode simply passs init_user_ns.
  */
 static bool chgrp_ok(struct user_namespace *mnt_userns,
-		     const struct inode *inode, kgid_t gid)
+		     const struct inode *inode, vfsgid_t ia_vfsgid)
 {
-	kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
-	if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) {
-		kgid_t mapped_gid;
-
-		if (gid_eq(gid, inode->i_gid))
+	vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+	vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+	if (vfsuid_eq_kuid(vfsuid, current_fsuid())) {
+		if (vfsgid_eq(ia_vfsgid, vfsgid))
 			return true;
-		mapped_gid = mapped_kgid_fs(mnt_userns, i_user_ns(inode), gid);
-		if (in_group_p(mapped_gid))
+		if (vfsgid_in_group_p(ia_vfsgid))
 			return true;
 	}
 	if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))
 		return true;
-	if (gid_eq(kgid, INVALID_GID) &&
+	if (!vfsgid_valid(vfsgid) &&
 	    ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
 		return true;
 	return false;
@@ -120,28 +118,29 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry,
 		goto kill_priv;
 
 	/* Make sure a caller can chown. */
-	if ((ia_valid & ATTR_UID) && !chown_ok(mnt_userns, inode, attr->ia_uid))
+	if ((ia_valid & ATTR_UID) &&
+	    !chown_ok(mnt_userns, inode, attr->ia_vfsuid))
 		return -EPERM;
 
 	/* Make sure caller can chgrp. */
-	if ((ia_valid & ATTR_GID) && !chgrp_ok(mnt_userns, inode, attr->ia_gid))
+	if ((ia_valid & ATTR_GID) &&
+	    !chgrp_ok(mnt_userns, inode, attr->ia_vfsgid))
 		return -EPERM;
 
 	/* Make sure a caller can chmod. */
 	if (ia_valid & ATTR_MODE) {
-		kgid_t mapped_gid;
+		vfsgid_t vfsgid;
 
 		if (!inode_owner_or_capable(mnt_userns, inode))
 			return -EPERM;
 
 		if (ia_valid & ATTR_GID)
-			mapped_gid = mapped_kgid_fs(mnt_userns,
-						i_user_ns(inode), attr->ia_gid);
+			vfsgid = attr->ia_vfsgid;
 		else
-			mapped_gid = i_gid_into_mnt(mnt_userns, inode);
+			vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
 
 		/* Also check the setgid bit! */
-		if (!in_group_p(mapped_gid) &&
+		if (!vfsgid_in_group_p(vfsgid) &&
 		    !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
 			attr->ia_mode &= ~S_ISGID;
 	}
@@ -219,9 +218,7 @@ EXPORT_SYMBOL(inode_newsize_ok);
  * setattr_copy must be called with i_mutex held.
  *
  * setattr_copy updates the inode's metadata with that specified
- * in attr on idmapped mounts. If file ownership is changed setattr_copy
- * doesn't map ia_uid and ia_gid. It will asssume the caller has already
- * provided the intended values. Necessary permission checks to determine
+ * in attr on idmapped mounts. Necessary permission checks to determine
  * whether or not the S_ISGID property needs to be removed are performed with
  * the correct idmapped mount permission helpers.
  * Noticeably missing is inode size update, which is more complex
@@ -242,8 +239,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode,
 {
 	unsigned int ia_valid = attr->ia_valid;
 
-	i_uid_update(&init_user_ns, attr, inode);
-	i_gid_update(&init_user_ns, attr, inode);
+	i_uid_update(mnt_userns, attr, inode);
+	i_gid_update(mnt_userns, attr, inode);
 	if (ia_valid & ATTR_ATIME)
 		inode->i_atime = attr->ia_atime;
 	if (ia_valid & ATTR_MTIME)
@@ -252,8 +249,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode,
 		inode->i_ctime = attr->ia_ctime;
 	if (ia_valid & ATTR_MODE) {
 		umode_t mode = attr->ia_mode;
-		kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
-		if (!in_group_p(kgid) &&
+		vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+		if (!vfsgid_in_group_p(vfsgid) &&
 		    !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
 			mode &= ~S_ISGID;
 		inode->i_mode = mode;
@@ -304,9 +301,6 @@ EXPORT_SYMBOL(may_setattr);
  * retry.  Because breaking a delegation may take a long time, the
  * caller should drop the i_mutex before doing so.
  *
- * If file ownership is changed notify_change() doesn't map ia_uid and
- * ia_gid. It will asssume the caller has already provided the intended values.
- *
  * Alternatively, a caller may pass NULL for delegated_inode.  This may
  * be appropriate for callers that expect the underlying filesystem not
  * to be NFS exported.  Also, passing NULL is fine for callers holding
@@ -395,23 +389,25 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
 	 * namespace of the superblock.
 	 */
 	if (ia_valid & ATTR_UID &&
-	    !kuid_has_mapping(inode->i_sb->s_user_ns, attr->ia_uid))
+	    !vfsuid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns,
+				  attr->ia_vfsuid))
 		return -EOVERFLOW;
 	if (ia_valid & ATTR_GID &&
-	    !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid))
+	    !vfsgid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns,
+				  attr->ia_vfsgid))
 		return -EOVERFLOW;
 
 	/* Don't allow modifications of files with invalid uids or
 	 * gids unless those uids & gids are being made valid.
 	 */
 	if (!(ia_valid & ATTR_UID) &&
-	    !uid_valid(i_uid_into_mnt(mnt_userns, inode)))
+	    !vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)))
 		return -EOVERFLOW;
 	if (!(ia_valid & ATTR_GID) &&
-	    !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
+	    !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode)))
 		return -EOVERFLOW;
 
-	error = security_inode_setattr(&init_user_ns, dentry, attr);
+	error = security_inode_setattr(mnt_userns, dentry, attr);
 	if (error)
 		return error;
 	error = try_break_deleg(inode, delegated_inode);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 593b79416e0e..7a192e4e7fa9 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1679,14 +1679,14 @@ int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	if (error)
 		return error;
 
-	if (is_quota_modification(&init_user_ns, inode, iattr)) {
+	if (is_quota_modification(mnt_userns, inode, iattr)) {
 		error = dquot_initialize(inode);
 		if (error)
 			return error;
 	}
-	if (i_uid_needs_update(&init_user_ns, iattr, inode) ||
-	    i_gid_needs_update(&init_user_ns, iattr, inode)) {
-		error = dquot_transfer(&init_user_ns, inode, iattr);
+	if (i_uid_needs_update(mnt_userns, iattr, inode) ||
+	    i_gid_needs_update(mnt_userns, iattr, inode)) {
+		error = dquot_transfer(mnt_userns, inode, iattr);
 		if (error)
 			return error;
 	}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 72f08c184768..3dcc1dd1f179 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5350,14 +5350,14 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	if (error)
 		return error;
 
-	if (is_quota_modification(&init_user_ns, inode, attr)) {
+	if (is_quota_modification(mnt_userns, inode, attr)) {
 		error = dquot_initialize(inode);
 		if (error)
 			return error;
 	}
 
-	if (i_uid_needs_update(&init_user_ns, attr, inode) ||
-	    i_gid_needs_update(&init_user_ns, attr, inode)) {
+	if (i_uid_needs_update(mnt_userns, attr, inode) ||
+	    i_gid_needs_update(mnt_userns, attr, inode)) {
 		handle_t *handle;
 
 		/* (user+group)*(old+new) structure, inode write (sb,
@@ -5374,7 +5374,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 		 * counts xattr inode references.
 		 */
 		down_read(&EXT4_I(inode)->xattr_sem);
-		error = dquot_transfer(&init_user_ns, inode, attr);
+		error = dquot_transfer(mnt_userns, inode, attr);
 		up_read(&EXT4_I(inode)->xattr_sem);
 
 		if (error) {
@@ -5383,8 +5383,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 		}
 		/* Update corresponding info in inode so that everything is in
 		 * one transaction */
-		i_uid_update(&init_user_ns, attr, inode);
-		i_gid_update(&init_user_ns, attr, inode);
+		i_uid_update(mnt_userns, attr, inode);
+		i_gid_update(mnt_userns, attr, inode);
 		error = ext4_mark_inode_dirty(handle, inode);
 		ext4_journal_stop(handle);
 		if (unlikely(error)) {
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 02b2d56d4edc..d66e37d80a2d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -861,8 +861,8 @@ static void __setattr_copy(struct user_namespace *mnt_userns,
 {
 	unsigned int ia_valid = attr->ia_valid;
 
-	i_uid_update(&init_user_ns, attr, inode);
-	i_gid_update(&init_user_ns, attr, inode);
+	i_uid_update(mnt_userns, attr, inode);
+	i_gid_update(mnt_userns, attr, inode);
 	if (ia_valid & ATTR_ATIME)
 		inode->i_atime = attr->ia_atime;
 	if (ia_valid & ATTR_MTIME)
@@ -915,15 +915,15 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	if (err)
 		return err;
 
-	if (is_quota_modification(&init_user_ns, inode, attr)) {
+	if (is_quota_modification(mnt_userns, inode, attr)) {
 		err = f2fs_dquot_initialize(inode);
 		if (err)
 			return err;
 	}
-	if (i_uid_needs_update(&init_user_ns, attr, inode) ||
-	    i_gid_needs_update(&init_user_ns, attr, inode)) {
+	if (i_uid_needs_update(mnt_userns, attr, inode) ||
+	    i_gid_needs_update(mnt_userns, attr, inode)) {
 		f2fs_lock_op(F2FS_I_SB(inode));
-		err = dquot_transfer(&init_user_ns, inode, attr);
+		err = dquot_transfer(mnt_userns, inode, attr);
 		if (err) {
 			set_sbi_flag(F2FS_I_SB(inode),
 					SBI_QUOTA_NEED_REPAIR);
@@ -934,8 +934,8 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 		 * update uid/gid under lock_op(), so that dquot and inode can
 		 * be updated atomically.
 		 */
-		i_uid_update(&init_user_ns, attr, inode);
-		i_gid_update(&init_user_ns, attr, inode);
+		i_uid_update(mnt_userns, attr, inode);
+		i_gid_update(mnt_userns, attr, inode);
 		f2fs_mark_inode_dirty_sync(inode, true);
 		f2fs_unlock_op(F2FS_I_SB(inode));
 	}
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 8e5a089f1ac8..dcd0a1e35095 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -255,12 +255,12 @@ static int recover_quota_data(struct inode *inode, struct page *page)
 
 	memset(&attr, 0, sizeof(attr));
 
-	attr.ia_uid = make_kuid(inode->i_sb->s_user_ns, i_uid);
-	attr.ia_gid = make_kgid(inode->i_sb->s_user_ns, i_gid);
+	attr.ia_vfsuid = VFSUIDT_INIT(make_kuid(inode->i_sb->s_user_ns, i_uid));
+	attr.ia_vfsgid = VFSGIDT_INIT(make_kgid(inode->i_sb->s_user_ns, i_gid));
 
-	if (!uid_eq(attr.ia_uid, inode->i_uid))
+	if (!vfsuid_eq(attr.ia_vfsuid, i_uid_into_vfsuid(&init_user_ns, inode)))
 		attr.ia_valid |= ATTR_UID;
-	if (!gid_eq(attr.ia_gid, inode->i_gid))
+	if (!vfsgid_eq(attr.ia_vfsgid, i_gid_into_vfsgid(&init_user_ns, inode)))
 		attr.ia_valid |= ATTR_GID;
 
 	if (!attr.ia_valid)
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 530f18173db2..3e4eb3467cb4 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -90,7 +90,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
 	 * out the RO attribute for checking by the security
 	 * module, just because it maps to a file mode.
 	 */
-	err = security_inode_setattr(&init_user_ns,
+	err = security_inode_setattr(file_mnt_user_ns(file),
 				     file->f_path.dentry, &ia);
 	if (err)
 		goto out_unlock_inode;
@@ -517,9 +517,11 @@ int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	}
 
 	if (((attr->ia_valid & ATTR_UID) &&
-	     (!uid_eq(attr->ia_uid, sbi->options.fs_uid))) ||
+	     (!uid_eq(from_vfsuid(mnt_userns, i_user_ns(inode), attr->ia_vfsuid),
+		      sbi->options.fs_uid))) ||
 	    ((attr->ia_valid & ATTR_GID) &&
-	     (!gid_eq(attr->ia_gid, sbi->options.fs_gid))) ||
+	     (!gid_eq(from_vfsgid(mnt_userns, i_user_ns(inode), attr->ia_vfsgid),
+		      sbi->options.fs_gid))) ||
 	    ((attr->ia_valid & ATTR_MODE) &&
 	     (attr->ia_mode & ~FAT_VALID_MODE)))
 		error = -EPERM;
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index c18569b9895d..332dc9ac47a9 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -95,14 +95,14 @@ int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	if (rc)
 		return rc;
 
-	if (is_quota_modification(&init_user_ns, inode, iattr)) {
+	if (is_quota_modification(mnt_userns, inode, iattr)) {
 		rc = dquot_initialize(inode);
 		if (rc)
 			return rc;
 	}
 	if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
 	    (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
-		rc = dquot_transfer(&init_user_ns, inode, iattr);
+		rc = dquot_transfer(mnt_userns, inode, iattr);
 		if (rc)
 			return rc;
 	}
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 0e09cd8911da..9c67edd215d5 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1146,7 +1146,7 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	if (status)
 		return status;
 
-	if (is_quota_modification(&init_user_ns, inode, attr)) {
+	if (is_quota_modification(mnt_userns, inode, attr)) {
 		status = dquot_initialize(inode);
 		if (status)
 			return status;
diff --git a/fs/open.c b/fs/open.c
index 1d57fbde2feb..2790aac66e58 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -663,6 +663,42 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
 	return do_fchmodat(AT_FDCWD, filename, mode);
 }
 
+/**
+ * setattr_vfsuid - check and set ia_fsuid attribute
+ * @kuid: new inode owner
+ *
+ * Check whether @kuid is valid and if so generate and set vfsuid_t in
+ * ia_vfsuid.
+ *
+ * Return: true if @kuid is valid, false if not.
+ */
+static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid)
+{
+	if (!uid_valid(kuid))
+		return false;
+	attr->ia_valid |= ATTR_UID;
+	attr->ia_vfsuid = VFSUIDT_INIT(kuid);
+	return true;
+}
+
+/**
+ * setattr_vfsgid - check and set ia_fsgid attribute
+ * @kgid: new inode owner
+ *
+ * Check whether @kgid is valid and if so generate and set vfsgid_t in
+ * ia_vfsgid.
+ *
+ * Return: true if @kgid is valid, false if not.
+ */
+static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid)
+{
+	if (!gid_valid(kgid))
+		return false;
+	attr->ia_valid |= ATTR_GID;
+	attr->ia_vfsgid = VFSGIDT_INIT(kgid);
+	return true;
+}
+
 int chown_common(const struct path *path, uid_t user, gid_t group)
 {
 	struct user_namespace *mnt_userns, *fs_userns;
@@ -678,28 +714,22 @@ int chown_common(const struct path *path, uid_t user, gid_t group)
 
 	mnt_userns = mnt_user_ns(path->mnt);
 	fs_userns = i_user_ns(inode);
-	uid = mapped_kuid_user(mnt_userns, fs_userns, uid);
-	gid = mapped_kgid_user(mnt_userns, fs_userns, gid);
 
 retry_deleg:
 	newattrs.ia_valid =  ATTR_CTIME;
-	if (user != (uid_t) -1) {
-		if (!uid_valid(uid))
-			return -EINVAL;
-		newattrs.ia_valid |= ATTR_UID;
-		newattrs.ia_uid = uid;
-	}
-	if (group != (gid_t) -1) {
-		if (!gid_valid(gid))
-			return -EINVAL;
-		newattrs.ia_valid |= ATTR_GID;
-		newattrs.ia_gid = gid;
-	}
+	if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid))
+		return -EINVAL;
+	if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid))
+		return -EINVAL;
 	if (!S_ISDIR(inode->i_mode))
 		newattrs.ia_valid |=
 			ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
 	inode_lock(inode);
-	error = security_path_chown(path, uid, gid);
+	/* Continue to send actual fs values, not the mount values. */
+	error = security_path_chown(
+		path,
+		from_vfsuid(mnt_userns, fs_userns, newattrs.ia_vfsuid),
+		from_vfsgid(mnt_userns, fs_userns, newattrs.ia_vfsgid));
 	if (!error)
 		error = notify_change(mnt_userns, path->dentry, &newattrs,
 				      &delegated_inode);
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 714ec569d25b..245e2cb62708 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -331,8 +331,8 @@ int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upperdentry,
 	if (!err) {
 		struct iattr attr = {
 			.ia_valid = ATTR_UID | ATTR_GID,
-			.ia_uid = stat->uid,
-			.ia_gid = stat->gid,
+			.ia_vfsuid = VFSUIDT_INIT(stat->uid),
+			.ia_vfsgid = VFSGIDT_INIT(stat->gid),
 		};
 		err = ovl_do_notify_change(ofs, upperdentry, &attr);
 	}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 4f34b7e02eee..e22e20f4811a 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -139,17 +139,7 @@ static inline int ovl_do_notify_change(struct ovl_fs *ofs,
 				       struct dentry *upperdentry,
 				       struct iattr *attr)
 {
-	struct user_namespace *upper_mnt_userns = ovl_upper_mnt_userns(ofs);
-	struct user_namespace *fs_userns = i_user_ns(d_inode(upperdentry));
-
-	if (attr->ia_valid & ATTR_UID)
-		attr->ia_uid = mapped_kuid_user(upper_mnt_userns,
-						fs_userns, attr->ia_uid);
-	if (attr->ia_valid & ATTR_GID)
-		attr->ia_gid = mapped_kgid_user(upper_mnt_userns,
-						fs_userns, attr->ia_gid);
-
-	return notify_change(upper_mnt_userns, upperdentry, attr, NULL);
+	return notify_change(ovl_upper_mnt_userns(ofs), upperdentry, attr, NULL);
 }
 
 static inline int ovl_do_rmdir(struct ovl_fs *ofs,
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index df9af1ce2851..28966da7834e 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2096,8 +2096,11 @@ int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode,
 	if (!dquot_active(inode))
 		return 0;
 
-	if (i_uid_needs_update(&init_user_ns, iattr, inode)) {
-		dquot = dqget(sb, make_kqid_uid(iattr->ia_uid));
+	if (i_uid_needs_update(mnt_userns, iattr, inode)) {
+		kuid_t kuid = from_vfsuid(mnt_userns, i_user_ns(inode),
+					  iattr->ia_vfsuid);
+
+		dquot = dqget(sb, make_kqid_uid(kuid));
 		if (IS_ERR(dquot)) {
 			if (PTR_ERR(dquot) != -ESRCH) {
 				ret = PTR_ERR(dquot);
@@ -2107,8 +2110,11 @@ int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode,
 		}
 		transfer_to[USRQUOTA] = dquot;
 	}
-	if (i_gid_needs_update(&init_user_ns, iattr, inode)) {
-		dquot = dqget(sb, make_kqid_gid(iattr->ia_gid));
+	if (i_gid_needs_update(mnt_userns, iattr, inode)) {
+		kgid_t kgid = from_vfsgid(mnt_userns, i_user_ns(inode),
+					  iattr->ia_vfsgid);
+
+		dquot = dqget(sb, make_kqid_gid(kgid));
 		if (IS_ERR(dquot)) {
 			if (PTR_ERR(dquot) != -ESRCH) {
 				ret = PTR_ERR(dquot);
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 1e89e76972a0..1141053b96ed 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3284,7 +3284,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	/* must be turned off for recursive notify_change calls */
 	ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
 
-	if (is_quota_modification(&init_user_ns, inode, attr)) {
+	if (is_quota_modification(mnt_userns, inode, attr)) {
 		error = dquot_initialize(inode);
 		if (error)
 			return error;
@@ -3367,7 +3367,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 		reiserfs_write_unlock(inode->i_sb);
 		if (error)
 			goto out;
-		error = dquot_transfer(&init_user_ns, inode, attr);
+		error = dquot_transfer(mnt_userns, inode, attr);
 		reiserfs_write_lock(inode->i_sb);
 		if (error) {
 			journal_end(&th);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 31ec29565fb4..a7402f6ea510 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -667,13 +667,15 @@ xfs_setattr_nonsize(
 		uint	qflags = 0;
 
 		if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) {
-			uid = iattr->ia_uid;
+			uid = from_vfsuid(mnt_userns, i_user_ns(inode),
+					  iattr->ia_vfsuid);
 			qflags |= XFS_QMOPT_UQUOTA;
 		} else {
 			uid = inode->i_uid;
 		}
 		if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
-			gid = iattr->ia_gid;
+			gid = from_vfsgid(mnt_userns, i_user_ns(inode),
+					  iattr->ia_vfsgid);
 			qflags |= XFS_QMOPT_GQUOTA;
 		}  else {
 			gid = inode->i_gid;
@@ -705,12 +707,12 @@ xfs_setattr_nonsize(
 	 * also.
 	 */
 	if (XFS_IS_UQUOTA_ON(mp) &&
-	    i_uid_needs_update(&init_user_ns, iattr, inode)) {
+	    i_uid_needs_update(mnt_userns, iattr, inode)) {
 		ASSERT(udqp);
 		old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp);
 	}
 	if (XFS_IS_GQUOTA_ON(mp) &&
-	    i_gid_needs_update(&init_user_ns, iattr, inode)) {
+	    i_gid_needs_update(mnt_userns, iattr, inode)) {
 		ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp));
 		ASSERT(gdqp);
 		old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp);
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index dd422b1d7320..f5d8338967cb 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -616,7 +616,7 @@ static int zonefs_inode_setattr(struct user_namespace *mnt_userns,
 	     !uid_eq(iattr->ia_uid, inode->i_uid)) ||
 	    ((iattr->ia_valid & ATTR_GID) &&
 	     !gid_eq(iattr->ia_gid, inode->i_gid))) {
-		ret = dquot_transfer(&init_user_ns, inode, iattr);
+		ret = dquot_transfer(mnt_userns, inode, iattr);
 		if (ret)
 			return ret;
 	}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0f8cc7f2665a..d6e3347cbf69 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -228,6 +228,10 @@ struct iattr {
 	 * are a dedicated type requiring the filesystem to use the dedicated
 	 * helpers. Other filesystem can continue to use ia_{g,u}id until they
 	 * have been ported.
+	 *
+	 * They always contain the same value. In other words FS_ALLOW_IDMAP
+	 * pass down the same value on idmapped mounts as they would on regular
+	 * mounts.
 	 */
 	union {
 		kuid_t		ia_uid;
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 0342ff6584fd..0d8625d71733 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -24,8 +24,8 @@ static inline bool is_quota_modification(struct user_namespace *mnt_userns,
 					 struct inode *inode, struct iattr *ia)
 {
 	return ((ia->ia_valid & ATTR_SIZE) ||
-		i_uid_needs_update(&init_user_ns, ia, inode) ||
-		i_gid_needs_update(&init_user_ns, ia, inode));
+		i_uid_needs_update(mnt_userns, ia, inode) ||
+		i_gid_needs_update(mnt_userns, ia, inode));
 }
 
 #if defined(CONFIG_QUOTA)
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index 7f4af5b58583..93e8bc047a73 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -761,8 +761,8 @@ static int evm_attr_change(struct user_namespace *mnt_userns,
 	struct inode *inode = d_backing_inode(dentry);
 	unsigned int ia_valid = attr->ia_valid;
 
-	if (!i_uid_needs_update(&init_user_ns, attr, inode) &&
-	    !i_gid_needs_update(&init_user_ns, attr, inode) &&
+	if (!i_uid_needs_update(mnt_userns, attr, inode) &&
+	    !i_gid_needs_update(mnt_userns, attr, inode) &&
 	    (!(ia_valid & ATTR_MODE) || attr->ia_mode == inode->i_mode))
 		return 0;
 
-- 
cgit 


From 0cae04373b77a117830e5f7d7aaa7eaf01f950d5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 6 Jun 2022 09:47:33 +0200
Subject: dmaengine: remove DMA_MEMCPY_SG once again

This was removed before due to the complete lack of users, but
3218910fd585 ("dmaengine: Add core function and capability check for
DMA_MEMCPY_SG") and 29cf37fa6dd9 ("dmaengine: Add consumer for the new
DMA_MEMCPY_SG API function.") added it back despite still not having
any users whatsoever.

Fixes: 3218910fd585 ("dmaengine: Add core function and capability check for DMA_MEMCPY_SG")
Fixes: 29cf37fa6dd9 ("dmaengine: Add consumer for the new DMA_MEMCPY_SG API function.")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Michal Simek <michal.simek@amd.com>
Link: https://lore.kernel.org/r/20220606074733.622616-1-hch@lst.de
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 Documentation/driver-api/dmaengine/provider.rst |  10 --
 drivers/dma/dmaengine.c                         |   7 --
 drivers/dma/xilinx/xilinx_dma.c                 | 122 ------------------------
 include/linux/dmaengine.h                       |  20 ----
 4 files changed, 159 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/dmaengine/provider.rst b/Documentation/driver-api/dmaengine/provider.rst
index 1e0f1f85d10e..ceac2a300e32 100644
--- a/Documentation/driver-api/dmaengine/provider.rst
+++ b/Documentation/driver-api/dmaengine/provider.rst
@@ -162,16 +162,6 @@ Currently, the types available are:
 
   - The device is able to do memory to memory copies
 
-- - DMA_MEMCPY_SG
-
-  - The device supports memory to memory scatter-gather transfers.
-
-  - Even though a plain memcpy can look like a particular case of a
-    scatter-gather transfer, with a single chunk to copy, it's a distinct
-    transaction type in the mem2mem transfer case. This is because some very
-    simple devices might be able to do contiguous single-chunk memory copies,
-    but have no support for more complex SG transfers.
-
   - No matter what the overall size of the combined chunks for source and
     destination is, only as many bytes as the smallest of the two will be
     transmitted. That means the number and size of the scatter-gather buffers in
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index e80feeea0e01..c741b6431958 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -1153,13 +1153,6 @@ int dma_async_device_register(struct dma_device *device)
 		return -EIO;
 	}
 
-	if (dma_has_cap(DMA_MEMCPY_SG, device->cap_mask) && !device->device_prep_dma_memcpy_sg) {
-		dev_err(device->dev,
-			"Device claims capability %s, but op is not defined\n",
-			"DMA_MEMCPY_SG");
-		return -EIO;
-	}
-
 	if (dma_has_cap(DMA_XOR, device->cap_mask) && !device->device_prep_dma_xor) {
 		dev_err(device->dev,
 			"Device claims capability %s, but op is not defined\n",
diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c
index cd62bbb50e8b..6276934d4d2b 100644
--- a/drivers/dma/xilinx/xilinx_dma.c
+++ b/drivers/dma/xilinx/xilinx_dma.c
@@ -2127,126 +2127,6 @@ error:
 	return NULL;
 }
 
-/**
- * xilinx_cdma_prep_memcpy_sg - prepare descriptors for a memcpy_sg transaction
- * @dchan: DMA channel
- * @dst_sg: Destination scatter list
- * @dst_sg_len: Number of entries in destination scatter list
- * @src_sg: Source scatter list
- * @src_sg_len: Number of entries in source scatter list
- * @flags: transfer ack flags
- *
- * Return: Async transaction descriptor on success and NULL on failure
- */
-static struct dma_async_tx_descriptor *xilinx_cdma_prep_memcpy_sg(
-			struct dma_chan *dchan, struct scatterlist *dst_sg,
-			unsigned int dst_sg_len, struct scatterlist *src_sg,
-			unsigned int src_sg_len, unsigned long flags)
-{
-	struct xilinx_dma_chan *chan = to_xilinx_chan(dchan);
-	struct xilinx_dma_tx_descriptor *desc;
-	struct xilinx_cdma_tx_segment *segment, *prev = NULL;
-	struct xilinx_cdma_desc_hw *hw;
-	size_t len, dst_avail, src_avail;
-	dma_addr_t dma_dst, dma_src;
-
-	if (unlikely(dst_sg_len == 0 || src_sg_len == 0))
-		return NULL;
-
-	if (unlikely(!dst_sg  || !src_sg))
-		return NULL;
-
-	desc = xilinx_dma_alloc_tx_descriptor(chan);
-	if (!desc)
-		return NULL;
-
-	dma_async_tx_descriptor_init(&desc->async_tx, &chan->common);
-	desc->async_tx.tx_submit = xilinx_dma_tx_submit;
-
-	dst_avail = sg_dma_len(dst_sg);
-	src_avail = sg_dma_len(src_sg);
-	/*
-	 * loop until there is either no more source or no more destination
-	 * scatterlist entry
-	 */
-	while (true) {
-		len = min_t(size_t, src_avail, dst_avail);
-		len = min_t(size_t, len, chan->xdev->max_buffer_len);
-		if (len == 0)
-			goto fetch;
-
-		/* Allocate the link descriptor from DMA pool */
-		segment = xilinx_cdma_alloc_tx_segment(chan);
-		if (!segment)
-			goto error;
-
-		dma_dst = sg_dma_address(dst_sg) + sg_dma_len(dst_sg) -
-			dst_avail;
-		dma_src = sg_dma_address(src_sg) + sg_dma_len(src_sg) -
-			src_avail;
-		hw = &segment->hw;
-		hw->control = len;
-		hw->src_addr = dma_src;
-		hw->dest_addr = dma_dst;
-		if (chan->ext_addr) {
-			hw->src_addr_msb = upper_32_bits(dma_src);
-			hw->dest_addr_msb = upper_32_bits(dma_dst);
-		}
-
-		if (prev) {
-			prev->hw.next_desc = segment->phys;
-			if (chan->ext_addr)
-				prev->hw.next_desc_msb =
-					upper_32_bits(segment->phys);
-		}
-
-		prev = segment;
-		dst_avail -= len;
-		src_avail -= len;
-		list_add_tail(&segment->node, &desc->segments);
-
-fetch:
-		/* Fetch the next dst scatterlist entry */
-		if (dst_avail == 0) {
-			if (dst_sg_len == 0)
-				break;
-			dst_sg = sg_next(dst_sg);
-			if (dst_sg == NULL)
-				break;
-			dst_sg_len--;
-			dst_avail = sg_dma_len(dst_sg);
-		}
-		/* Fetch the next src scatterlist entry */
-		if (src_avail == 0) {
-			if (src_sg_len == 0)
-				break;
-			src_sg = sg_next(src_sg);
-			if (src_sg == NULL)
-				break;
-			src_sg_len--;
-			src_avail = sg_dma_len(src_sg);
-		}
-	}
-
-	if (list_empty(&desc->segments)) {
-		dev_err(chan->xdev->dev,
-			"%s: Zero-size SG transfer requested\n", __func__);
-		goto error;
-	}
-
-	/* Link the last hardware descriptor with the first. */
-	segment = list_first_entry(&desc->segments,
-				struct xilinx_cdma_tx_segment, node);
-	desc->async_tx.phys = segment->phys;
-	prev->hw.next_desc = segment->phys;
-
-	return &desc->async_tx;
-
-error:
-	xilinx_dma_free_tx_descriptor(chan, desc);
-	return NULL;
-}
-
 /**
  * xilinx_dma_prep_slave_sg - prepare descriptors for a DMA_SLAVE transaction
  * @dchan: DMA channel
@@ -3240,9 +3120,7 @@ static int xilinx_dma_probe(struct platform_device *pdev)
 					  DMA_RESIDUE_GRANULARITY_SEGMENT;
 	} else if (xdev->dma_config->dmatype == XDMA_TYPE_CDMA) {
 		dma_cap_set(DMA_MEMCPY, xdev->common.cap_mask);
-		dma_cap_set(DMA_MEMCPY_SG, xdev->common.cap_mask);
 		xdev->common.device_prep_dma_memcpy = xilinx_cdma_prep_memcpy;
-		xdev->common.device_prep_dma_memcpy_sg = xilinx_cdma_prep_memcpy_sg;
 		/* Residue calculation is supported by only AXI DMA and CDMA */
 		xdev->common.residue_granularity =
 					  DMA_RESIDUE_GRANULARITY_SEGMENT;
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index b46b88e6aa0d..c923f4e60f24 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -50,7 +50,6 @@ enum dma_status {
  */
 enum dma_transaction_type {
 	DMA_MEMCPY,
-	DMA_MEMCPY_SG,
 	DMA_XOR,
 	DMA_PQ,
 	DMA_XOR_VAL,
@@ -887,11 +886,6 @@ struct dma_device {
 	struct dma_async_tx_descriptor *(*device_prep_dma_memcpy)(
 		struct dma_chan *chan, dma_addr_t dst, dma_addr_t src,
 		size_t len, unsigned long flags);
-	struct dma_async_tx_descriptor *(*device_prep_dma_memcpy_sg)(
-		struct dma_chan *chan,
-		struct scatterlist *dst_sg, unsigned int dst_nents,
-		struct scatterlist *src_sg, unsigned int src_nents,
-		unsigned long flags);
 	struct dma_async_tx_descriptor *(*device_prep_dma_xor)(
 		struct dma_chan *chan, dma_addr_t dst, dma_addr_t *src,
 		unsigned int src_cnt, size_t len, unsigned long flags);
@@ -1060,20 +1054,6 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memcpy(
 						    len, flags);
 }
 
-static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memcpy_sg(
-		struct dma_chan *chan,
-		struct scatterlist *dst_sg, unsigned int dst_nents,
-		struct scatterlist *src_sg, unsigned int src_nents,
-		unsigned long flags)
-{
-	if (!chan || !chan->device || !chan->device->device_prep_dma_memcpy_sg)
-		return NULL;
-
-	return chan->device->device_prep_dma_memcpy_sg(chan, dst_sg, dst_nents,
-						       src_sg, src_nents,
-						       flags);
-}
-
 static inline bool dmaengine_is_metadata_mode_supported(struct dma_chan *chan,
 		enum dma_desc_metadata_mode mode)
 {
-- 
cgit 


From 742ab6df974ae8384a2dd213db1a3a06cf6d8936 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 14 Jun 2022 23:15:32 +0200
Subject: x86/kvm/vmx: Make noinstr clean

The recent mmio_stale_data fixes broke the noinstr constraints:

  vmlinux.o: warning: objtool: vmx_vcpu_enter_exit+0x15b: call to wrmsrl.constprop.0() leaves .noinstr.text section
  vmlinux.o: warning: objtool: vmx_vcpu_enter_exit+0x1bf: call to kvm_arch_has_assigned_device() leaves .noinstr.text section

make it all happy again.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kvm/vmx/vmx.c   | 6 +++---
 arch/x86/kvm/x86.c       | 4 ++--
 include/linux/kvm_host.h | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 3a919e49129b..009bbae9ad66 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -383,9 +383,9 @@ static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
 	if (!vmx->disable_fb_clear)
 		return;
 
-	rdmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
+	msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL);
 	msr |= FB_CLEAR_DIS;
-	wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
+	native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
 	/* Cache the MSR value to avoid reading it later */
 	vmx->msr_ia32_mcu_opt_ctrl = msr;
 }
@@ -396,7 +396,7 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
 		return;
 
 	vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
-	wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
+	native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
 }
 
 static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1910e1e78b15..26d0cac32f73 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12631,9 +12631,9 @@ void kvm_arch_end_assignment(struct kvm *kvm)
 }
 EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
 
-bool kvm_arch_has_assigned_device(struct kvm *kvm)
+bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
 {
-	return atomic_read(&kvm->arch.assigned_device_count);
+	return arch_atomic_read(&kvm->arch.assigned_device_count);
 }
 EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c20f2d55840c..83cf7fd842e0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1513,7 +1513,7 @@ static inline void kvm_arch_end_assignment(struct kvm *kvm)
 {
 }
 
-static inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
+static __always_inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
 {
 	return false;
 }
-- 
cgit 


From 6b80b59b3555706508008f1f127b5412c89c7fd8 Mon Sep 17 00:00:00 2001
From: Alexandre Chartre <alexandre.chartre@oracle.com>
Date: Tue, 14 Jun 2022 23:15:49 +0200
Subject: x86/bugs: Report AMD retbleed vulnerability

Report that AMD x86 CPUs are vulnerable to the RETBleed (Arbitrary
Speculative Code Execution with Return Instructions) attack.

  [peterz: add hygon]
  [kim: invert parity; fam15h]

Co-developed-by: Kim Phillips <kim.phillips@amd.com>
Signed-off-by: Kim Phillips <kim.phillips@amd.com>
Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/cpufeatures.h |  1 +
 arch/x86/kernel/cpu/bugs.c         | 13 +++++++++++++
 arch/x86/kernel/cpu/common.c       | 19 +++++++++++++++++++
 drivers/base/cpu.c                 |  8 ++++++++
 include/linux/cpu.h                |  2 ++
 5 files changed, 43 insertions(+)

(limited to 'include')

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index fa3d0db1470e..c16503ca3b75 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -453,5 +453,6 @@
 #define X86_BUG_ITLB_MULTIHIT		X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */
 #define X86_BUG_SRBDS			X86_BUG(24) /* CPU may leak RNG bits if not mitigated */
 #define X86_BUG_MMIO_STALE_DATA		X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */
+#define X86_BUG_RETBLEED		X86_BUG(26) /* CPU is affected by RETBleed */
 
 #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 74c62cc47a5f..74fdd21e416b 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1994,6 +1994,11 @@ static ssize_t srbds_show_state(char *buf)
 	return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]);
 }
 
+static ssize_t retbleed_show_state(char *buf)
+{
+	return sprintf(buf, "Vulnerable\n");
+}
+
 static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
 			       char *buf, unsigned int bug)
 {
@@ -2039,6 +2044,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
 	case X86_BUG_MMIO_STALE_DATA:
 		return mmio_stale_data_show_state(buf);
 
+	case X86_BUG_RETBLEED:
+		return retbleed_show_state(buf);
+
 	default:
 		break;
 	}
@@ -2095,4 +2103,9 @@ ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *at
 {
 	return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA);
 }
+
+ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED);
+}
 #endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4730b0a58f24..4089c173c6ae 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1205,16 +1205,27 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
 	{}
 };
 
+#define VULNBL(vendor, family, model, blacklist)	\
+	X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist)
+
 #define VULNBL_INTEL_STEPPINGS(model, steppings, issues)		   \
 	X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6,		   \
 					    INTEL_FAM6_##model, steppings, \
 					    X86_FEATURE_ANY, issues)
 
+#define VULNBL_AMD(family, blacklist)		\
+	VULNBL(AMD, family, X86_MODEL_ANY, blacklist)
+
+#define VULNBL_HYGON(family, blacklist)		\
+	VULNBL(HYGON, family, X86_MODEL_ANY, blacklist)
+
 #define SRBDS		BIT(0)
 /* CPU is affected by X86_BUG_MMIO_STALE_DATA */
 #define MMIO		BIT(1)
 /* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */
 #define MMIO_SBDS	BIT(2)
+/* CPU is affected by RETbleed, speculating where you would not expect it */
+#define RETBLEED	BIT(3)
 
 static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
 	VULNBL_INTEL_STEPPINGS(IVYBRIDGE,	X86_STEPPING_ANY,		SRBDS),
@@ -1247,6 +1258,11 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
 	VULNBL_INTEL_STEPPINGS(ATOM_TREMONT,	X86_STEPPINGS(0x1, 0x1),	MMIO | MMIO_SBDS),
 	VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D,	X86_STEPPING_ANY,		MMIO),
 	VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L,	X86_STEPPINGS(0x0, 0x0),	MMIO | MMIO_SBDS),
+
+	VULNBL_AMD(0x15, RETBLEED),
+	VULNBL_AMD(0x16, RETBLEED),
+	VULNBL_AMD(0x17, RETBLEED),
+	VULNBL_HYGON(0x18, RETBLEED),
 	{}
 };
 
@@ -1348,6 +1364,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 	    !arch_cap_mmio_immune(ia32_cap))
 		setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA);
 
+	if (cpu_matches(cpu_vuln_blacklist, RETBLEED))
+		setup_force_cpu_bug(X86_BUG_RETBLEED);
+
 	if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
 		return;
 
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index a97776ea9d99..4c98849577d4 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -570,6 +570,12 @@ ssize_t __weak cpu_show_mmio_stale_data(struct device *dev,
 	return sysfs_emit(buf, "Not affected\n");
 }
 
+ssize_t __weak cpu_show_retbleed(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "Not affected\n");
+}
+
 static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
 static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
 static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
@@ -580,6 +586,7 @@ static DEVICE_ATTR(tsx_async_abort, 0444, cpu_show_tsx_async_abort, NULL);
 static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL);
 static DEVICE_ATTR(srbds, 0444, cpu_show_srbds, NULL);
 static DEVICE_ATTR(mmio_stale_data, 0444, cpu_show_mmio_stale_data, NULL);
+static DEVICE_ATTR(retbleed, 0444, cpu_show_retbleed, NULL);
 
 static struct attribute *cpu_root_vulnerabilities_attrs[] = {
 	&dev_attr_meltdown.attr,
@@ -592,6 +599,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = {
 	&dev_attr_itlb_multihit.attr,
 	&dev_attr_srbds.attr,
 	&dev_attr_mmio_stale_data.attr,
+	&dev_attr_retbleed.attr,
 	NULL
 };
 
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 2c7477354744..314802f98b9d 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -68,6 +68,8 @@ extern ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr,
 extern ssize_t cpu_show_mmio_stale_data(struct device *dev,
 					struct device_attribute *attr,
 					char *buf);
+extern ssize_t cpu_show_retbleed(struct device *dev,
+				 struct device_attribute *attr, char *buf);
 
 extern __printf(4, 5)
 struct device *cpu_device_create(struct device *parent, void *drvdata,
-- 
cgit 


From a09a6e2399ba0595c3042b3164f3ca68a3cff33e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 14 Jun 2022 23:16:03 +0200
Subject: objtool: Add entry UNRET validation

Since entry asm is tricky, add a validation pass that ensures the
retbleed mitigation has been done before the first actual RET
instruction.

Entry points are those that either have UNWIND_HINT_ENTRY, which acts
as UNWIND_HINT_EMPTY but marks the instruction as an entry point, or
those that have UWIND_HINT_IRET_REGS at +0.

This is basically a variant of validate_branch() that is
intra-function and it will simply follow all branches from marked
entry points and ensures that all paths lead to ANNOTATE_UNRET_END.

If a path hits RET or an indirection the path is a fail and will be
reported.

There are 3 ANNOTATE_UNRET_END instances:

 - UNTRAIN_RET itself
 - exception from-kernel; this path doesn't need UNTRAIN_RET
 - all early exceptions; these also don't need UNTRAIN_RET

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/entry/entry_64.S               |   3 +-
 arch/x86/entry/entry_64_compat.S        |   6 +-
 arch/x86/include/asm/nospec-branch.h    |  12 +++
 arch/x86/include/asm/unwind_hints.h     |   4 +
 arch/x86/kernel/head_64.S               |   5 +
 arch/x86/xen/xen-asm.S                  |  10 +-
 include/linux/objtool.h                 |   3 +
 scripts/Makefile.vmlinux_o              |   2 +-
 tools/include/linux/objtool.h           |   3 +
 tools/objtool/builtin-check.c           |   6 ++
 tools/objtool/check.c                   | 177 ++++++++++++++++++++++++++++++--
 tools/objtool/include/objtool/builtin.h |   1 +
 tools/objtool/include/objtool/check.h   |  11 +-
 13 files changed, 222 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 0c88802e1155..65e3b8b7cbe5 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -85,7 +85,7 @@
  */
 
 SYM_CODE_START(entry_SYSCALL_64)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_ENTRY
 	ENDBR
 
 	swapgs
@@ -1095,6 +1095,7 @@ SYM_CODE_START_LOCAL(error_entry)
 .Lerror_entry_done_lfence:
 	FENCE_SWAPGS_KERNEL_ENTRY
 	leaq	8(%rsp), %rax			/* return pt_regs pointer */
+	ANNOTATE_UNRET_END
 	RET
 
 .Lbstep_iret:
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index bcb89d23ac0e..682338e7e2a3 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -49,7 +49,7 @@
  * 0(%ebp) arg6
  */
 SYM_CODE_START(entry_SYSENTER_compat)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_ENTRY
 	ENDBR
 	/* Interrupts are off on entry. */
 	swapgs
@@ -179,7 +179,7 @@ SYM_CODE_END(entry_SYSENTER_compat)
  * 0(%esp) arg6
  */
 SYM_CODE_START(entry_SYSCALL_compat)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_ENTRY
 	ENDBR
 	/* Interrupts are off on entry. */
 	swapgs
@@ -305,7 +305,7 @@ SYM_CODE_END(entry_SYSCALL_compat)
  * ebp  arg6
  */
 SYM_CODE_START(entry_INT80_compat)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_ENTRY
 	ENDBR
 	/*
 	 * Interrupts are off on entry.
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 05dd75478d7b..bba42bd78edf 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -81,6 +81,17 @@
  */
 #define ANNOTATE_UNRET_SAFE ANNOTATE_RETPOLINE_SAFE
 
+/*
+ * Abuse ANNOTATE_RETPOLINE_SAFE on a NOP to indicate UNRET_END, should
+ * eventually turn into it's own annotation.
+ */
+.macro ANNOTATE_UNRET_END
+#ifdef CONFIG_DEBUG_ENTRY
+	ANNOTATE_RETPOLINE_SAFE
+	nop
+#endif
+.endm
+
 /*
  * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple
  * indirect jmp/call which may be susceptible to the Spectre variant 2
@@ -131,6 +142,7 @@
  */
 .macro UNTRAIN_RET
 #ifdef CONFIG_RETPOLINE
+	ANNOTATE_UNRET_END
 	ALTERNATIVE_2 "",						\
 	              "call zen_untrain_ret", X86_FEATURE_UNRET,	\
 		      "call entry_ibpb", X86_FEATURE_ENTRY_IBPB
diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
index 8b33674288ea..6f70fe4c93f2 100644
--- a/arch/x86/include/asm/unwind_hints.h
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -11,6 +11,10 @@
 	UNWIND_HINT sp_reg=ORC_REG_UNDEFINED type=UNWIND_HINT_TYPE_CALL end=1
 .endm
 
+.macro UNWIND_HINT_ENTRY
+	UNWIND_HINT sp_reg=ORC_REG_UNDEFINED type=UNWIND_HINT_TYPE_ENTRY end=1
+.endm
+
 .macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0
 	.if \base == %rsp
 		.if \indirect
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 92c4afa2b729..d860d437631b 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -389,6 +389,8 @@ SYM_CODE_START_NOALIGN(vc_boot_ghcb)
 	UNWIND_HINT_IRET_REGS offset=8
 	ENDBR
 
+	ANNOTATE_UNRET_END
+
 	/* Build pt_regs */
 	PUSH_AND_CLEAR_REGS
 
@@ -448,6 +450,7 @@ SYM_CODE_END(early_idt_handler_array)
 
 SYM_CODE_START_LOCAL(early_idt_handler_common)
 	UNWIND_HINT_IRET_REGS offset=16
+	ANNOTATE_UNRET_END
 	/*
 	 * The stack is the hardware frame, an error code or zero, and the
 	 * vector number.
@@ -497,6 +500,8 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb)
 	UNWIND_HINT_IRET_REGS offset=8
 	ENDBR
 
+	ANNOTATE_UNRET_END
+
 	/* Build pt_regs */
 	PUSH_AND_CLEAR_REGS
 
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 6bf9d45b9178..6b4fdf6b9542 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -121,7 +121,7 @@ SYM_FUNC_END(xen_read_cr2_direct);
 
 .macro xen_pv_trap name
 SYM_CODE_START(xen_\name)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_ENTRY
 	ENDBR
 	pop %rcx
 	pop %r11
@@ -235,7 +235,7 @@ SYM_CODE_END(xenpv_restore_regs_and_return_to_usermode)
 
 /* Normal 64-bit system call target */
 SYM_CODE_START(xen_entry_SYSCALL_64)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_ENTRY
 	ENDBR
 	popq %rcx
 	popq %r11
@@ -255,7 +255,7 @@ SYM_CODE_END(xen_entry_SYSCALL_64)
 
 /* 32-bit compat syscall target */
 SYM_CODE_START(xen_entry_SYSCALL_compat)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_ENTRY
 	ENDBR
 	popq %rcx
 	popq %r11
@@ -273,7 +273,7 @@ SYM_CODE_END(xen_entry_SYSCALL_compat)
 
 /* 32-bit compat sysenter target */
 SYM_CODE_START(xen_entry_SYSENTER_compat)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_ENTRY
 	ENDBR
 	/*
 	 * NB: Xen is polite and clears TF from EFLAGS for us.  This means
@@ -297,7 +297,7 @@ SYM_CODE_END(xen_entry_SYSENTER_compat)
 
 SYM_CODE_START(xen_entry_SYSCALL_compat)
 SYM_CODE_START(xen_entry_SYSENTER_compat)
-	UNWIND_HINT_EMPTY
+	UNWIND_HINT_ENTRY
 	ENDBR
 	lea 16(%rsp), %rsp	/* strip %rcx, %r11 */
 	mov $-ENOSYS, %rax
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 15b940ec1eac..b026f1ae39c6 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -32,11 +32,14 @@ struct unwind_hint {
  *
  * UNWIND_HINT_FUNC: Generate the unwind metadata of a callable function.
  * Useful for code which doesn't have an ELF function annotation.
+ *
+ * UNWIND_HINT_ENTRY: machine entry without stack, SYSCALL/SYSENTER etc.
  */
 #define UNWIND_HINT_TYPE_CALL		0
 #define UNWIND_HINT_TYPE_REGS		1
 #define UNWIND_HINT_TYPE_REGS_PARTIAL	2
 #define UNWIND_HINT_TYPE_FUNC		3
+#define UNWIND_HINT_TYPE_ENTRY		4
 
 #ifdef CONFIG_OBJTOOL
 
diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o
index 3c97a1564947..bc67748044a6 100644
--- a/scripts/Makefile.vmlinux_o
+++ b/scripts/Makefile.vmlinux_o
@@ -44,7 +44,7 @@ objtool-enabled := $(or $(delay-objtool),$(CONFIG_NOINSTR_VALIDATION))
 
 objtool_args := \
 	$(if $(delay-objtool),$(objtool_args)) \
-	$(if $(CONFIG_NOINSTR_VALIDATION), --noinstr) \
+	$(if $(CONFIG_NOINSTR_VALIDATION), --noinstr $(if $(CONFIG_RETPOLINE), --unret)) \
 	$(if $(CONFIG_GCOV_KERNEL), --no-unreachable) \
 	--link
 
diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h
index 15b940ec1eac..b026f1ae39c6 100644
--- a/tools/include/linux/objtool.h
+++ b/tools/include/linux/objtool.h
@@ -32,11 +32,14 @@ struct unwind_hint {
  *
  * UNWIND_HINT_FUNC: Generate the unwind metadata of a callable function.
  * Useful for code which doesn't have an ELF function annotation.
+ *
+ * UNWIND_HINT_ENTRY: machine entry without stack, SYSCALL/SYSENTER etc.
  */
 #define UNWIND_HINT_TYPE_CALL		0
 #define UNWIND_HINT_TYPE_REGS		1
 #define UNWIND_HINT_TYPE_REGS_PARTIAL	2
 #define UNWIND_HINT_TYPE_FUNC		3
+#define UNWIND_HINT_TYPE_ENTRY		4
 
 #ifdef CONFIG_OBJTOOL
 
diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c
index f4c3a5091737..c063e1ff96b2 100644
--- a/tools/objtool/builtin-check.c
+++ b/tools/objtool/builtin-check.c
@@ -68,6 +68,7 @@ const struct option check_options[] = {
 	OPT_BOOLEAN('n', "noinstr", &opts.noinstr, "validate noinstr rules"),
 	OPT_BOOLEAN('o', "orc", &opts.orc, "generate ORC metadata"),
 	OPT_BOOLEAN('r', "retpoline", &opts.retpoline, "validate and annotate retpoline usage"),
+	OPT_BOOLEAN(0,   "unret", &opts.unret, "validate entry unret placement"),
 	OPT_BOOLEAN('l', "sls", &opts.sls, "validate straight-line-speculation mitigations"),
 	OPT_BOOLEAN('s', "stackval", &opts.stackval, "validate frame pointer rules"),
 	OPT_BOOLEAN('t', "static-call", &opts.static_call, "annotate static calls"),
@@ -163,6 +164,11 @@ static bool link_opts_valid(struct objtool_file *file)
 		return false;
 	}
 
+	if (opts.unret) {
+		ERROR("--unret requires --link");
+		return false;
+	}
+
 	return true;
 }
 
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 7dc378156a63..822a490e6d87 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -2032,16 +2032,24 @@ static int read_unwind_hints(struct objtool_file *file)
 
 		insn->hint = true;
 
-		if (opts.ibt && hint->type == UNWIND_HINT_TYPE_REGS_PARTIAL) {
+		if (hint->type == UNWIND_HINT_TYPE_REGS_PARTIAL) {
 			struct symbol *sym = find_symbol_by_offset(insn->sec, insn->offset);
 
-			if (sym && sym->bind == STB_GLOBAL &&
-			    insn->type != INSN_ENDBR && !insn->noendbr) {
-				WARN_FUNC("UNWIND_HINT_IRET_REGS without ENDBR",
-					  insn->sec, insn->offset);
+			if (sym && sym->bind == STB_GLOBAL) {
+				if (opts.ibt && insn->type != INSN_ENDBR && !insn->noendbr) {
+					WARN_FUNC("UNWIND_HINT_IRET_REGS without ENDBR",
+						  insn->sec, insn->offset);
+				}
+
+				insn->entry = 1;
 			}
 		}
 
+		if (hint->type == UNWIND_HINT_TYPE_ENTRY) {
+			hint->type = UNWIND_HINT_TYPE_CALL;
+			insn->entry = 1;
+		}
+
 		if (hint->type == UNWIND_HINT_TYPE_FUNC) {
 			insn->cfi = &func_cfi;
 			continue;
@@ -2116,8 +2124,9 @@ static int read_retpoline_hints(struct objtool_file *file)
 
 		if (insn->type != INSN_JUMP_DYNAMIC &&
 		    insn->type != INSN_CALL_DYNAMIC &&
-		    insn->type != INSN_RETURN) {
-			WARN_FUNC("retpoline_safe hint not an indirect jump/call/ret",
+		    insn->type != INSN_RETURN &&
+		    insn->type != INSN_NOP) {
+			WARN_FUNC("retpoline_safe hint not an indirect jump/call/ret/nop",
 				  insn->sec, insn->offset);
 			return -1;
 		}
@@ -3305,8 +3314,8 @@ static int validate_branch(struct objtool_file *file, struct symbol *func,
 			return 1;
 		}
 
-		visited = 1 << state.uaccess;
-		if (insn->visited) {
+		visited = VISITED_BRANCH << state.uaccess;
+		if (insn->visited & VISITED_BRANCH_MASK) {
 			if (!insn->hint && !insn_cfi_match(insn, &state.cfi))
 				return 1;
 
@@ -3520,6 +3529,145 @@ static int validate_unwind_hints(struct objtool_file *file, struct section *sec)
 	return warnings;
 }
 
+/*
+ * Validate rethunk entry constraint: must untrain RET before the first RET.
+ *
+ * Follow every branch (intra-function) and ensure ANNOTATE_UNRET_END comes
+ * before an actual RET instruction.
+ */
+static int validate_entry(struct objtool_file *file, struct instruction *insn)
+{
+	struct instruction *next, *dest;
+	int ret, warnings = 0;
+
+	for (;;) {
+		next = next_insn_to_validate(file, insn);
+
+		if (insn->visited & VISITED_ENTRY)
+			return 0;
+
+		insn->visited |= VISITED_ENTRY;
+
+		if (!insn->ignore_alts && !list_empty(&insn->alts)) {
+			struct alternative *alt;
+			bool skip_orig = false;
+
+			list_for_each_entry(alt, &insn->alts, list) {
+				if (alt->skip_orig)
+					skip_orig = true;
+
+				ret = validate_entry(file, alt->insn);
+				if (ret) {
+				        if (opts.backtrace)
+						BT_FUNC("(alt)", insn);
+					return ret;
+				}
+			}
+
+			if (skip_orig)
+				return 0;
+		}
+
+		switch (insn->type) {
+
+		case INSN_CALL_DYNAMIC:
+		case INSN_JUMP_DYNAMIC:
+		case INSN_JUMP_DYNAMIC_CONDITIONAL:
+			WARN_FUNC("early indirect call", insn->sec, insn->offset);
+			return 1;
+
+		case INSN_JUMP_UNCONDITIONAL:
+		case INSN_JUMP_CONDITIONAL:
+			if (!is_sibling_call(insn)) {
+				if (!insn->jump_dest) {
+					WARN_FUNC("unresolved jump target after linking?!?",
+						  insn->sec, insn->offset);
+					return -1;
+				}
+				ret = validate_entry(file, insn->jump_dest);
+				if (ret) {
+					if (opts.backtrace) {
+						BT_FUNC("(branch%s)", insn,
+							insn->type == INSN_JUMP_CONDITIONAL ? "-cond" : "");
+					}
+					return ret;
+				}
+
+				if (insn->type == INSN_JUMP_UNCONDITIONAL)
+					return 0;
+
+				break;
+			}
+
+			/* fallthrough */
+		case INSN_CALL:
+			dest = find_insn(file, insn->call_dest->sec,
+					 insn->call_dest->offset);
+			if (!dest) {
+				WARN("Unresolved function after linking!?: %s",
+				     insn->call_dest->name);
+				return -1;
+			}
+
+			ret = validate_entry(file, dest);
+			if (ret) {
+				if (opts.backtrace)
+					BT_FUNC("(call)", insn);
+				return ret;
+			}
+			/*
+			 * If a call returns without error, it must have seen UNTRAIN_RET.
+			 * Therefore any non-error return is a success.
+			 */
+			return 0;
+
+		case INSN_RETURN:
+			WARN_FUNC("RET before UNTRAIN", insn->sec, insn->offset);
+			return 1;
+
+		case INSN_NOP:
+			if (insn->retpoline_safe)
+				return 0;
+			break;
+
+		default:
+			break;
+		}
+
+		if (!next) {
+			WARN_FUNC("teh end!", insn->sec, insn->offset);
+			return -1;
+		}
+		insn = next;
+	}
+
+	return warnings;
+}
+
+/*
+ * Validate that all branches starting at 'insn->entry' encounter UNRET_END
+ * before RET.
+ */
+static int validate_unret(struct objtool_file *file)
+{
+	struct instruction *insn;
+	int ret, warnings = 0;
+
+	for_each_insn(file, insn) {
+		if (!insn->entry)
+			continue;
+
+		ret = validate_entry(file, insn);
+		if (ret < 0) {
+			WARN_FUNC("Failed UNRET validation", insn->sec, insn->offset);
+			return ret;
+		}
+		warnings += ret;
+	}
+
+	return warnings;
+}
+
 static int validate_retpoline(struct objtool_file *file)
 {
 	struct instruction *insn;
@@ -4039,6 +4187,17 @@ int check(struct objtool_file *file)
 		warnings += ret;
 	}
 
+	if (opts.unret) {
+		/*
+		 * Must be after validate_branch() and friends, it plays
+		 * further games with insn->visited.
+		 */
+		ret = validate_unret(file);
+		if (ret < 0)
+			return ret;
+		warnings += ret;
+	}
+
 	if (opts.ibt) {
 		ret = validate_ibt(file);
 		if (ret < 0)
diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h
index 280ea18b7f2b..0c476b0b40a3 100644
--- a/tools/objtool/include/objtool/builtin.h
+++ b/tools/objtool/include/objtool/builtin.h
@@ -19,6 +19,7 @@ struct opts {
 	bool noinstr;
 	bool orc;
 	bool retpoline;
+	bool unret;
 	bool sls;
 	bool stackval;
 	bool static_call;
diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h
index f10d7374f388..0eeedeacbefb 100644
--- a/tools/objtool/include/objtool/check.h
+++ b/tools/objtool/include/objtool/check.h
@@ -51,8 +51,10 @@ struct instruction {
 	   ignore_alts	: 1,
 	   hint		: 1,
 	   retpoline_safe : 1,
-	   noendbr	: 1;
-		/* 2 bit hole */
+	   noendbr	: 1,
+	   entry	: 1;
+		/* 1 bit hole */
+
 	s8 instr;
 	u8 visited;
 	/* u8 hole */
@@ -69,6 +71,11 @@ struct instruction {
 	struct cfi_state *cfi;
 };
 
+#define VISITED_BRANCH		0x01
+#define VISITED_BRANCH_UACCESS	0x02
+#define VISITED_BRANCH_MASK	0x03
+#define VISITED_ENTRY		0x04
+
 static inline bool is_static_jump(struct instruction *insn)
 {
 	return insn->type == INSN_JUMP_CONDITIONAL ||
-- 
cgit 


From 8faea26e611189e933ea2281975ff4dc7c1106b6 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Fri, 24 Jun 2022 12:52:40 +0200
Subject: objtool: Re-add UNWIND_HINT_{SAVE_RESTORE}

Commit

  c536ed2fffd5 ("objtool: Remove SAVE/RESTORE hints")

removed the save/restore unwind hints because they were no longer
needed. Now they're going to be needed again so re-add them.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/unwind_hints.h   | 12 +++++++++--
 include/linux/objtool.h               |  6 ++++--
 tools/include/linux/objtool.h         |  6 ++++--
 tools/objtool/check.c                 | 40 +++++++++++++++++++++++++++++++++++
 tools/objtool/include/objtool/check.h | 19 +++++++++--------
 5 files changed, 68 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
index 6f70fe4c93f2..f66fbe6537dd 100644
--- a/arch/x86/include/asm/unwind_hints.h
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -8,11 +8,11 @@
 #ifdef __ASSEMBLY__
 
 .macro UNWIND_HINT_EMPTY
-	UNWIND_HINT sp_reg=ORC_REG_UNDEFINED type=UNWIND_HINT_TYPE_CALL end=1
+	UNWIND_HINT type=UNWIND_HINT_TYPE_CALL end=1
 .endm
 
 .macro UNWIND_HINT_ENTRY
-	UNWIND_HINT sp_reg=ORC_REG_UNDEFINED type=UNWIND_HINT_TYPE_ENTRY end=1
+	UNWIND_HINT type=UNWIND_HINT_TYPE_ENTRY end=1
 .endm
 
 .macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0
@@ -56,6 +56,14 @@
 	UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=8 type=UNWIND_HINT_TYPE_FUNC
 .endm
 
+.macro UNWIND_HINT_SAVE
+	UNWIND_HINT type=UNWIND_HINT_TYPE_SAVE
+.endm
+
+.macro UNWIND_HINT_RESTORE
+	UNWIND_HINT type=UNWIND_HINT_TYPE_RESTORE
+.endm
+
 #else
 
 #define UNWIND_HINT_FUNC \
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index b026f1ae39c6..10bc88cc3bf6 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -40,6 +40,8 @@ struct unwind_hint {
 #define UNWIND_HINT_TYPE_REGS_PARTIAL	2
 #define UNWIND_HINT_TYPE_FUNC		3
 #define UNWIND_HINT_TYPE_ENTRY		4
+#define UNWIND_HINT_TYPE_SAVE		5
+#define UNWIND_HINT_TYPE_RESTORE	6
 
 #ifdef CONFIG_OBJTOOL
 
@@ -127,7 +129,7 @@ struct unwind_hint {
  * the debuginfo as necessary.  It will also warn if it sees any
  * inconsistencies.
  */
-.macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0
+.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0
 .Lunwind_hint_ip_\@:
 	.pushsection .discard.unwind_hints
 		/* struct unwind_hint */
@@ -180,7 +182,7 @@ struct unwind_hint {
 #define ASM_REACHABLE
 #else
 #define ANNOTATE_INTRA_FUNCTION_CALL
-.macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0
+.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0
 .endm
 .macro STACK_FRAME_NON_STANDARD func:req
 .endm
diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h
index b026f1ae39c6..10bc88cc3bf6 100644
--- a/tools/include/linux/objtool.h
+++ b/tools/include/linux/objtool.h
@@ -40,6 +40,8 @@ struct unwind_hint {
 #define UNWIND_HINT_TYPE_REGS_PARTIAL	2
 #define UNWIND_HINT_TYPE_FUNC		3
 #define UNWIND_HINT_TYPE_ENTRY		4
+#define UNWIND_HINT_TYPE_SAVE		5
+#define UNWIND_HINT_TYPE_RESTORE	6
 
 #ifdef CONFIG_OBJTOOL
 
@@ -127,7 +129,7 @@ struct unwind_hint {
  * the debuginfo as necessary.  It will also warn if it sees any
  * inconsistencies.
  */
-.macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0
+.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0
 .Lunwind_hint_ip_\@:
 	.pushsection .discard.unwind_hints
 		/* struct unwind_hint */
@@ -180,7 +182,7 @@ struct unwind_hint {
 #define ASM_REACHABLE
 #else
 #define ANNOTATE_INTRA_FUNCTION_CALL
-.macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0
+.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0
 .endm
 .macro STACK_FRAME_NON_STANDARD func:req
 .endm
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 822a490e6d87..ddfdd138cc2a 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -2032,6 +2032,17 @@ static int read_unwind_hints(struct objtool_file *file)
 
 		insn->hint = true;
 
+		if (hint->type == UNWIND_HINT_TYPE_SAVE) {
+			insn->hint = false;
+			insn->save = true;
+			continue;
+		}
+
+		if (hint->type == UNWIND_HINT_TYPE_RESTORE) {
+			insn->restore = true;
+			continue;
+		}
+
 		if (hint->type == UNWIND_HINT_TYPE_REGS_PARTIAL) {
 			struct symbol *sym = find_symbol_by_offset(insn->sec, insn->offset);
 
@@ -3329,6 +3340,35 @@ static int validate_branch(struct objtool_file *file, struct symbol *func,
 			state.instr += insn->instr;
 
 		if (insn->hint) {
+			if (insn->restore) {
+				struct instruction *save_insn, *i;
+
+				i = insn;
+				save_insn = NULL;
+
+				sym_for_each_insn_continue_reverse(file, func, i) {
+					if (i->save) {
+						save_insn = i;
+						break;
+					}
+				}
+
+				if (!save_insn) {
+					WARN_FUNC("no corresponding CFI save for CFI restore",
+						  sec, insn->offset);
+					return 1;
+				}
+
+				if (!save_insn->visited) {
+					WARN_FUNC("objtool isn't smart enough to handle this CFI save/restore combo",
+						  sec, insn->offset);
+					return 1;
+				}
+
+				insn->cfi = save_insn->cfi;
+				nr_cfi_reused++;
+			}
+
 			state.cfi = *insn->cfi;
 		} else {
 			/* XXX track if we actually changed state.cfi */
diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h
index 0eeedeacbefb..036129cebeee 100644
--- a/tools/objtool/include/objtool/check.h
+++ b/tools/objtool/include/objtool/check.h
@@ -46,18 +46,19 @@ struct instruction {
 	enum insn_type type;
 	unsigned long immediate;
 
-	u8 dead_end	: 1,
-	   ignore	: 1,
-	   ignore_alts	: 1,
-	   hint		: 1,
-	   retpoline_safe : 1,
-	   noendbr	: 1,
-	   entry	: 1;
-		/* 1 bit hole */
+	u16 dead_end		: 1,
+	   ignore		: 1,
+	   ignore_alts		: 1,
+	   hint			: 1,
+	   save			: 1,
+	   restore		: 1,
+	   retpoline_safe	: 1,
+	   noendbr		: 1,
+	   entry		: 1;
+		/* 7 bit hole */
 
 	s8 instr;
 	u8 visited;
-	/* u8 hole */
 
 	struct alt_group *alt_group;
 	struct symbol *call_dest;
-- 
cgit 


From 67f38b1c7324a13113e23f7177b1f1c223da3f55 Mon Sep 17 00:00:00 2001
From: Clément Léger <clement.leger@bootlin.com>
Date: Fri, 24 Jun 2022 16:39:47 +0200
Subject: net: dsa: add support for ethtool get_rmon_stats()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support to allow dsa drivers to specify the .get_rmon_stats()
operation.

Signed-off-by: Clément Léger <clement.leger@bootlin.com>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h |  3 +++
 net/dsa/slave.c   | 13 +++++++++++++
 2 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 14f07275852b..64da5ed27fdc 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -888,6 +888,9 @@ struct dsa_switch_ops {
 				     struct ethtool_eth_mac_stats *mac_stats);
 	void	(*get_eth_ctrl_stats)(struct dsa_switch *ds, int port,
 				      struct ethtool_eth_ctrl_stats *ctrl_stats);
+	void	(*get_rmon_stats)(struct dsa_switch *ds, int port,
+				  struct ethtool_rmon_stats *rmon_stats,
+				  const struct ethtool_rmon_hist_range **ranges);
 	void	(*get_stats64)(struct dsa_switch *ds, int port,
 				   struct rtnl_link_stats64 *s);
 	void	(*self_test)(struct dsa_switch *ds, int port,
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 509b98dd9954..760ca58307a3 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1002,6 +1002,18 @@ dsa_slave_get_eth_ctrl_stats(struct net_device *dev,
 		ds->ops->get_eth_ctrl_stats(ds, dp->index, ctrl_stats);
 }
 
+static void
+dsa_slave_get_rmon_stats(struct net_device *dev,
+			 struct ethtool_rmon_stats *rmon_stats,
+			 const struct ethtool_rmon_hist_range **ranges)
+{
+	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_rmon_stats)
+		ds->ops->get_rmon_stats(ds, dp->index, rmon_stats, ranges);
+}
+
 static void dsa_slave_net_selftest(struct net_device *ndev,
 				   struct ethtool_test *etest, u64 *buf)
 {
@@ -2081,6 +2093,7 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
 	.get_eth_phy_stats	= dsa_slave_get_eth_phy_stats,
 	.get_eth_mac_stats	= dsa_slave_get_eth_mac_stats,
 	.get_eth_ctrl_stats	= dsa_slave_get_eth_ctrl_stats,
+	.get_rmon_stats		= dsa_slave_get_rmon_stats,
 	.set_wol		= dsa_slave_set_wol,
 	.get_wol		= dsa_slave_get_wol,
 	.set_eee		= dsa_slave_set_eee,
-- 
cgit 


From a08d6a6dc82036cbd889fe3d53f9c69dc13436eb Mon Sep 17 00:00:00 2001
From: Clément Léger <clement.leger@bootlin.com>
Date: Fri, 24 Jun 2022 16:39:48 +0200
Subject: net: dsa: add Renesas RZ/N1 switch tag driver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The switch that is present on the Renesas RZ/N1 SoC uses a specific
VLAN value followed by 6 bytes which contains forwarding configuration.

Signed-off-by: Clément Léger <clement.leger@bootlin.com>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h             |   2 +
 include/uapi/linux/if_ether.h |   1 +
 net/dsa/Kconfig               |   7 +++
 net/dsa/Makefile              |   1 +
 net/dsa/tag_rzn1_a5psw.c      | 113 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 124 insertions(+)
 create mode 100644 net/dsa/tag_rzn1_a5psw.c

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 64da5ed27fdc..33283eeda697 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -53,6 +53,7 @@ struct phylink_link_state;
 #define DSA_TAG_PROTO_SJA1110_VALUE		23
 #define DSA_TAG_PROTO_RTL8_4_VALUE		24
 #define DSA_TAG_PROTO_RTL8_4T_VALUE		25
+#define DSA_TAG_PROTO_RZN1_A5PSW_VALUE		26
 
 enum dsa_tag_protocol {
 	DSA_TAG_PROTO_NONE		= DSA_TAG_PROTO_NONE_VALUE,
@@ -81,6 +82,7 @@ enum dsa_tag_protocol {
 	DSA_TAG_PROTO_SJA1110		= DSA_TAG_PROTO_SJA1110_VALUE,
 	DSA_TAG_PROTO_RTL8_4		= DSA_TAG_PROTO_RTL8_4_VALUE,
 	DSA_TAG_PROTO_RTL8_4T		= DSA_TAG_PROTO_RTL8_4T_VALUE,
+	DSA_TAG_PROTO_RZN1_A5PSW	= DSA_TAG_PROTO_RZN1_A5PSW_VALUE,
 };
 
 struct dsa_switch;
diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index 1d0bccc3fa54..d370165bc621 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -116,6 +116,7 @@
 #define ETH_P_QINQ3	0x9300		/* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */
 #define ETH_P_EDSA	0xDADA		/* Ethertype DSA [ NOT AN OFFICIALLY REGISTERED ID ] */
 #define ETH_P_DSA_8021Q	0xDADB		/* Fake VLAN Header for DSA [ NOT AN OFFICIALLY REGISTERED ID ] */
+#define ETH_P_DSA_A5PSW	0xE001		/* A5PSW Tag Value [ NOT AN OFFICIALLY REGISTERED ID ] */
 #define ETH_P_IFE	0xED3E		/* ForCES inter-FE LFB type */
 #define ETH_P_AF_IUCV   0xFBFB		/* IBM af_iucv [ NOT AN OFFICIALLY REGISTERED ID ] */
 
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 8cb87b5067ee..63853fff4e2f 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -132,6 +132,13 @@ config NET_DSA_TAG_RTL8_4
 	  Say Y or M if you want to enable support for tagging frames for Realtek
 	  switches with 8 byte protocol 4 tags, such as the Realtek RTL8365MB-VC.
 
+config NET_DSA_TAG_RZN1_A5PSW
+	tristate "Tag driver for Renesas RZ/N1 A5PSW switch"
+	help
+	  Say Y or M if you want to enable support for tagging frames for
+	  Renesas RZ/N1 embedded switch that uses an 8 byte tag located after
+	  destination MAC address.
+
 config NET_DSA_TAG_LAN9303
 	tristate "Tag driver for SMSC/Microchip LAN9303 family of switches"
 	help
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 9f75820e7c98..af28c24ead18 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_NET_DSA_TAG_OCELOT_8021Q) += tag_ocelot_8021q.o
 obj-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o
 obj-$(CONFIG_NET_DSA_TAG_RTL4_A) += tag_rtl4_a.o
 obj-$(CONFIG_NET_DSA_TAG_RTL8_4) += tag_rtl8_4.o
+obj-$(CONFIG_NET_DSA_TAG_RZN1_A5PSW) += tag_rzn1_a5psw.o
 obj-$(CONFIG_NET_DSA_TAG_SJA1105) += tag_sja1105.o
 obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o
 obj-$(CONFIG_NET_DSA_TAG_XRS700X) += tag_xrs700x.o
diff --git a/net/dsa/tag_rzn1_a5psw.c b/net/dsa/tag_rzn1_a5psw.c
new file mode 100644
index 000000000000..e2a5ee6ae688
--- /dev/null
+++ b/net/dsa/tag_rzn1_a5psw.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2022 Schneider Electric
+ *
+ * Clément Léger <clement.leger@bootlin.com>
+ */
+
+#include <linux/bitfield.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <net/dsa.h>
+
+#include "dsa_priv.h"
+
+/* To define the outgoing port and to discover the incoming port a TAG is
+ * inserted after Src MAC :
+ *
+ *       Dest MAC       Src MAC           TAG         Type
+ * ...| 1 2 3 4 5 6 | 1 2 3 4 5 6 | 1 2 3 4 5 6 7 8 | 1 2 |...
+ *                                |<--------------->|
+ *
+ * See struct a5psw_tag for layout
+ */
+
+#define ETH_P_DSA_A5PSW			0xE001
+#define A5PSW_TAG_LEN			8
+#define A5PSW_CTRL_DATA_FORCE_FORWARD	BIT(0)
+/* This is both used for xmit tag and rcv tagging */
+#define A5PSW_CTRL_DATA_PORT		GENMASK(3, 0)
+
+struct a5psw_tag {
+	__be16 ctrl_tag;
+	__be16 ctrl_data;
+	__be16 ctrl_data2_hi;
+	__be16 ctrl_data2_lo;
+};
+
+static struct sk_buff *a5psw_tag_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct a5psw_tag *ptag;
+	u32 data2_val;
+
+	BUILD_BUG_ON(sizeof(*ptag) != A5PSW_TAG_LEN);
+
+	/* The Ethernet switch we are interfaced with needs packets to be at
+	 * least 60 bytes otherwise they will be discarded when they enter the
+	 * switch port logic.
+	 */
+	if (__skb_put_padto(skb, ETH_ZLEN, false))
+		return NULL;
+
+	/* provide 'A5PSW_TAG_LEN' bytes additional space */
+	skb_push(skb, A5PSW_TAG_LEN);
+
+	/* make room between MACs and Ether-Type to insert tag */
+	dsa_alloc_etype_header(skb, A5PSW_TAG_LEN);
+
+	ptag = dsa_etype_header_pos_tx(skb);
+
+	data2_val = FIELD_PREP(A5PSW_CTRL_DATA_PORT, BIT(dp->index));
+	ptag->ctrl_tag = htons(ETH_P_DSA_A5PSW);
+	ptag->ctrl_data = htons(A5PSW_CTRL_DATA_FORCE_FORWARD);
+	ptag->ctrl_data2_lo = htons(data2_val);
+	ptag->ctrl_data2_hi = 0;
+
+	return skb;
+}
+
+static struct sk_buff *a5psw_tag_rcv(struct sk_buff *skb,
+				     struct net_device *dev)
+{
+	struct a5psw_tag *tag;
+	int port;
+
+	if (unlikely(!pskb_may_pull(skb, A5PSW_TAG_LEN))) {
+		dev_warn_ratelimited(&dev->dev,
+				     "Dropping packet, cannot pull\n");
+		return NULL;
+	}
+
+	tag = dsa_etype_header_pos_rx(skb);
+
+	if (tag->ctrl_tag != htons(ETH_P_DSA_A5PSW)) {
+		dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid TAG marker\n");
+		return NULL;
+	}
+
+	port = FIELD_GET(A5PSW_CTRL_DATA_PORT, ntohs(tag->ctrl_data));
+
+	skb->dev = dsa_master_find_slave(dev, 0, port);
+	if (!skb->dev)
+		return NULL;
+
+	skb_pull_rcsum(skb, A5PSW_TAG_LEN);
+	dsa_strip_etype_header(skb, A5PSW_TAG_LEN);
+
+	dsa_default_offload_fwd_mark(skb);
+
+	return skb;
+}
+
+static const struct dsa_device_ops a5psw_netdev_ops = {
+	.name	= "a5psw",
+	.proto	= DSA_TAG_PROTO_RZN1_A5PSW,
+	.xmit	= a5psw_tag_xmit,
+	.rcv	= a5psw_tag_rcv,
+	.needed_headroom = A5PSW_TAG_LEN,
+};
+
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_A5PSW);
+module_dsa_tag_driver(a5psw_netdev_ops);
-- 
cgit 


From c823c2bf91568f3e473479e25dcc225a5be4b7b1 Mon Sep 17 00:00:00 2001
From: Clément Léger <clement.leger@bootlin.com>
Date: Fri, 24 Jun 2022 16:39:49 +0200
Subject: dt-bindings: net: pcs: add bindings for Renesas RZ/N1 MII converter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This MII converter can be found on the RZ/N1 processor family. The MII
converter ports are declared as subnodes which are then referenced by
users of the PCS driver such as the switch.

Signed-off-by: Clément Léger <clement.leger@bootlin.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../bindings/net/pcs/renesas,rzn1-miic.yaml        | 171 +++++++++++++++++++++
 include/dt-bindings/net/pcs-rzn1-miic.h            |  33 ++++
 2 files changed, 204 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/pcs/renesas,rzn1-miic.yaml
 create mode 100644 include/dt-bindings/net/pcs-rzn1-miic.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/net/pcs/renesas,rzn1-miic.yaml b/Documentation/devicetree/bindings/net/pcs/renesas,rzn1-miic.yaml
new file mode 100644
index 000000000000..2d33bbab7163
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/pcs/renesas,rzn1-miic.yaml
@@ -0,0 +1,171 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/pcs/renesas,rzn1-miic.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Renesas RZ/N1 MII converter
+
+maintainers:
+  - Clément Léger <clement.leger@bootlin.com>
+
+description: |
+  This MII converter is present on the Renesas RZ/N1 SoC family. It is
+  responsible to do MII passthrough or convert it to RMII/RGMII.
+
+properties:
+  '#address-cells':
+    const: 1
+
+  '#size-cells':
+    const: 0
+
+  compatible:
+    items:
+      - enum:
+          - renesas,r9a06g032-miic
+      - const: renesas,rzn1-miic
+
+  reg:
+    maxItems: 1
+
+  clocks:
+    items:
+      - description: MII reference clock
+      - description: RGMII reference clock
+      - description: RMII reference clock
+      - description: AHB clock used for the MII converter register interface
+
+  clock-names:
+    items:
+      - const: mii_ref
+      - const: rgmii_ref
+      - const: rmii_ref
+      - const: hclk
+
+  renesas,miic-switch-portin:
+    description: MII Switch PORTIN configuration. This value should use one of
+      the values defined in dt-bindings/net/pcs-rzn1-miic.h.
+    $ref: /schemas/types.yaml#/definitions/uint32
+    enum: [1, 2]
+
+  power-domains:
+    maxItems: 1
+
+patternProperties:
+  "^mii-conv@[0-5]$":
+    type: object
+    description: MII converter port
+
+    properties:
+      reg:
+        description: MII Converter port number.
+        enum: [1, 2, 3, 4, 5]
+
+      renesas,miic-input:
+        description: Converter input port configuration. This value should use
+          one of the values defined in dt-bindings/net/pcs-rzn1-miic.h.
+        $ref: /schemas/types.yaml#/definitions/uint32
+
+    required:
+      - reg
+      - renesas,miic-input
+
+    additionalProperties: false
+
+    allOf:
+      - if:
+          properties:
+            reg:
+              const: 1
+        then:
+          properties:
+            renesas,miic-input:
+              const: 0
+      - if:
+          properties:
+            reg:
+              const: 2
+        then:
+          properties:
+            renesas,miic-input:
+              enum: [1, 11]
+      - if:
+          properties:
+            reg:
+              const: 3
+        then:
+          properties:
+            renesas,miic-input:
+              enum: [7, 10]
+      - if:
+          properties:
+            reg:
+              const: 4
+        then:
+          properties:
+            renesas,miic-input:
+              enum: [4, 6, 9, 13]
+      - if:
+          properties:
+            reg:
+              const: 5
+        then:
+          properties:
+            renesas,miic-input:
+              enum: [3, 5, 8, 12]
+
+required:
+  - '#address-cells'
+  - '#size-cells'
+  - compatible
+  - reg
+  - clocks
+  - clock-names
+  - power-domains
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/net/pcs-rzn1-miic.h>
+    #include <dt-bindings/clock/r9a06g032-sysctrl.h>
+
+    eth-miic@44030000 {
+      #address-cells = <1>;
+      #size-cells = <0>;
+      compatible = "renesas,r9a06g032-miic", "renesas,rzn1-miic";
+      reg = <0x44030000 0x10000>;
+      clocks = <&sysctrl R9A06G032_CLK_MII_REF>,
+              <&sysctrl R9A06G032_CLK_RGMII_REF>,
+              <&sysctrl R9A06G032_CLK_RMII_REF>,
+              <&sysctrl R9A06G032_HCLK_SWITCH_RG>;
+      clock-names = "mii_ref", "rgmii_ref", "rmii_ref", "hclk";
+      renesas,miic-switch-portin = <MIIC_GMAC2_PORT>;
+      power-domains = <&sysctrl>;
+
+      mii_conv1: mii-conv@1 {
+        renesas,miic-input = <MIIC_GMAC1_PORT>;
+        reg = <1>;
+      };
+
+      mii_conv2: mii-conv@2 {
+        renesas,miic-input = <MIIC_SWITCH_PORTD>;
+        reg = <2>;
+      };
+
+      mii_conv3: mii-conv@3 {
+        renesas,miic-input = <MIIC_SWITCH_PORTC>;
+        reg = <3>;
+      };
+
+      mii_conv4: mii-conv@4 {
+        renesas,miic-input = <MIIC_SWITCH_PORTB>;
+        reg = <4>;
+      };
+
+      mii_conv5: mii-conv@5 {
+        renesas,miic-input = <MIIC_SWITCH_PORTA>;
+        reg = <5>;
+      };
+    };
diff --git a/include/dt-bindings/net/pcs-rzn1-miic.h b/include/dt-bindings/net/pcs-rzn1-miic.h
new file mode 100644
index 000000000000..784782eaec9e
--- /dev/null
+++ b/include/dt-bindings/net/pcs-rzn1-miic.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (C) 2022 Schneider-Electric
+ *
+ * Clément Léger <clement.leger@bootlin.com>
+ */
+
+#ifndef _DT_BINDINGS_PCS_RZN1_MIIC
+#define _DT_BINDINGS_PCS_RZN1_MIIC
+
+/*
+ * Reefer to the datasheet [1] section 8.2.1, Internal Connection of Ethernet
+ * Ports to check the available combination
+ *
+ * [1] REN_r01uh0750ej0140-rzn1-introduction_MAT_20210228.pdf
+ */
+
+#define MIIC_GMAC1_PORT			0
+#define MIIC_GMAC2_PORT			1
+#define MIIC_RTOS_PORT			2
+#define MIIC_SERCOS_PORTA		3
+#define MIIC_SERCOS_PORTB		4
+#define MIIC_ETHERCAT_PORTA		5
+#define MIIC_ETHERCAT_PORTB		6
+#define MIIC_ETHERCAT_PORTC		7
+#define MIIC_SWITCH_PORTA		8
+#define MIIC_SWITCH_PORTB		9
+#define MIIC_SWITCH_PORTC		10
+#define MIIC_SWITCH_PORTD		11
+#define MIIC_HSR_PORTA			12
+#define MIIC_HSR_PORTB			13
+
+#endif
-- 
cgit 


From 7dc54d3b8d9100c65b0860f76342fc9f3b4694d9 Mon Sep 17 00:00:00 2001
From: Clément Léger <clement.leger@bootlin.com>
Date: Fri, 24 Jun 2022 16:39:50 +0200
Subject: net: pcs: add Renesas MII converter driver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a PCS driver for the MII converter that is present on the Renesas
RZ/N1 SoC. This MII converter is reponsible for converting MII to
RMII/RGMII or act as a MII pass-trough. Exposing it as a PCS allows to
reuse it in both the switch driver and the stmmac driver. Currently,
this driver only allows the PCS to be used by the dual Cortex-A7
subsystem since the register locking system is not used.

Signed-off-by: Clément Léger <clement.leger@bootlin.com>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/pcs/Kconfig         |   8 +
 drivers/net/pcs/Makefile        |   1 +
 drivers/net/pcs/pcs-rzn1-miic.c | 520 ++++++++++++++++++++++++++++++++++++++++
 include/linux/pcs-rzn1-miic.h   |  18 ++
 4 files changed, 547 insertions(+)
 create mode 100644 drivers/net/pcs/pcs-rzn1-miic.c
 create mode 100644 include/linux/pcs-rzn1-miic.h

(limited to 'include')

diff --git a/drivers/net/pcs/Kconfig b/drivers/net/pcs/Kconfig
index f778e5155fae..6289b7c765f1 100644
--- a/drivers/net/pcs/Kconfig
+++ b/drivers/net/pcs/Kconfig
@@ -18,4 +18,12 @@ config PCS_LYNX
 	  This module provides helpers to phylink for managing the Lynx PCS
 	  which is part of the Layerscape and QorIQ Ethernet SERDES.
 
+config PCS_RZN1_MIIC
+	tristate "Renesas RZ/N1 MII converter"
+	depends on OF && (ARCH_RZN1 || COMPILE_TEST)
+	help
+	  This module provides a driver for the MII converter that is available
+	  on RZ/N1 SoCs. This PCS converts MII to RMII/RGMII or can be set in
+	  pass-through mode for MII.
+
 endmenu
diff --git a/drivers/net/pcs/Makefile b/drivers/net/pcs/Makefile
index 0603d469bd57..0ff5388fcdea 100644
--- a/drivers/net/pcs/Makefile
+++ b/drivers/net/pcs/Makefile
@@ -5,3 +5,4 @@ pcs_xpcs-$(CONFIG_PCS_XPCS)	:= pcs-xpcs.o pcs-xpcs-nxp.o
 
 obj-$(CONFIG_PCS_XPCS)		+= pcs_xpcs.o
 obj-$(CONFIG_PCS_LYNX)		+= pcs-lynx.o
+obj-$(CONFIG_PCS_RZN1_MIIC)	+= pcs-rzn1-miic.o
diff --git a/drivers/net/pcs/pcs-rzn1-miic.c b/drivers/net/pcs/pcs-rzn1-miic.c
new file mode 100644
index 000000000000..8f5e910f443d
--- /dev/null
+++ b/drivers/net/pcs/pcs-rzn1-miic.c
@@ -0,0 +1,520 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022 Schneider Electric
+ *
+ * Clément Léger <clement.leger@bootlin.com>
+ */
+
+#include <linux/clk.h>
+#include <linux/device.h>
+#include <linux/mdio.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/pcs-rzn1-miic.h>
+#include <linux/phylink.h>
+#include <linux/pm_runtime.h>
+#include <dt-bindings/net/pcs-rzn1-miic.h>
+
+#define MIIC_PRCMD			0x0
+#define MIIC_ESID_CODE			0x4
+
+#define MIIC_MODCTRL			0x20
+#define MIIC_MODCTRL_SW_MODE		GENMASK(4, 0)
+
+#define MIIC_CONVCTRL(port)		(0x100 + (port) * 4)
+
+#define MIIC_CONVCTRL_CONV_SPEED	GENMASK(1, 0)
+#define CONV_MODE_10MBPS		0
+#define CONV_MODE_100MBPS		1
+#define CONV_MODE_1000MBPS		2
+
+#define MIIC_CONVCTRL_CONV_MODE		GENMASK(3, 2)
+#define CONV_MODE_MII			0
+#define CONV_MODE_RMII			1
+#define CONV_MODE_RGMII			2
+
+#define MIIC_CONVCTRL_FULLD		BIT(8)
+#define MIIC_CONVCTRL_RGMII_LINK	BIT(12)
+#define MIIC_CONVCTRL_RGMII_DUPLEX	BIT(13)
+#define MIIC_CONVCTRL_RGMII_SPEED	GENMASK(15, 14)
+
+#define MIIC_CONVRST			0x114
+#define MIIC_CONVRST_PHYIF_RST(port)	BIT(port)
+#define MIIC_CONVRST_PHYIF_RST_MASK	GENMASK(4, 0)
+
+#define MIIC_SWCTRL			0x304
+#define MIIC_SWDUPC			0x308
+
+#define MIIC_MAX_NR_PORTS		5
+
+#define MIIC_MODCTRL_CONF_CONV_NUM	6
+#define MIIC_MODCTRL_CONF_NONE		-1
+
+/**
+ * struct modctrl_match - Matching table entry for  convctrl configuration
+ *			  See section 8.2.1 of manual.
+ * @mode_cfg: Configuration value for convctrl
+ * @conv: Configuration of ethernet port muxes. First index is SWITCH_PORTIN,
+ *	  then index 1 - 5 are CONV1 - CONV5.
+ */
+struct modctrl_match {
+	u32 mode_cfg;
+	u8 conv[MIIC_MODCTRL_CONF_CONV_NUM];
+};
+
+static struct modctrl_match modctrl_match_table[] = {
+	{0x0, {MIIC_RTOS_PORT, MIIC_GMAC1_PORT, MIIC_SWITCH_PORTD,
+	       MIIC_SWITCH_PORTC, MIIC_SERCOS_PORTB, MIIC_SERCOS_PORTA}},
+	{0x1, {MIIC_RTOS_PORT, MIIC_GMAC1_PORT, MIIC_SWITCH_PORTD,
+	       MIIC_SWITCH_PORTC, MIIC_ETHERCAT_PORTB, MIIC_ETHERCAT_PORTA}},
+	{0x2, {MIIC_RTOS_PORT, MIIC_GMAC1_PORT, MIIC_SWITCH_PORTD,
+	       MIIC_ETHERCAT_PORTC, MIIC_ETHERCAT_PORTB, MIIC_ETHERCAT_PORTA}},
+	{0x3, {MIIC_RTOS_PORT, MIIC_GMAC1_PORT, MIIC_SWITCH_PORTD,
+	       MIIC_SWITCH_PORTC, MIIC_SWITCH_PORTB, MIIC_SWITCH_PORTA}},
+
+	{0x8, {MIIC_RTOS_PORT, MIIC_GMAC1_PORT, MIIC_SWITCH_PORTD,
+	       MIIC_SWITCH_PORTC, MIIC_SERCOS_PORTB, MIIC_SERCOS_PORTA}},
+	{0x9, {MIIC_RTOS_PORT, MIIC_GMAC1_PORT, MIIC_SWITCH_PORTD,
+	       MIIC_SWITCH_PORTC, MIIC_ETHERCAT_PORTB, MIIC_ETHERCAT_PORTA}},
+	{0xA, {MIIC_RTOS_PORT, MIIC_GMAC1_PORT, MIIC_SWITCH_PORTD,
+	       MIIC_ETHERCAT_PORTC, MIIC_ETHERCAT_PORTB, MIIC_ETHERCAT_PORTA}},
+	{0xB, {MIIC_RTOS_PORT, MIIC_GMAC1_PORT, MIIC_SWITCH_PORTD,
+	       MIIC_SWITCH_PORTC, MIIC_SWITCH_PORTB, MIIC_SWITCH_PORTA}},
+
+	{0x10, {MIIC_GMAC2_PORT, MIIC_GMAC1_PORT, MIIC_SWITCH_PORTD,
+		MIIC_SWITCH_PORTC, MIIC_SERCOS_PORTB, MIIC_SERCOS_PORTA}},
+	{0x11, {MIIC_GMAC2_PORT, MIIC_GMAC1_PORT, MIIC_SWITCH_PORTD,
+		MIIC_SWITCH_PORTC, MIIC_ETHERCAT_PORTB, MIIC_ETHERCAT_PORTA}},
+	{0x12, {MIIC_GMAC2_PORT, MIIC_GMAC1_PORT, MIIC_SWITCH_PORTD,
+		MIIC_ETHERCAT_PORTC, MIIC_ETHERCAT_PORTB, MIIC_ETHERCAT_PORTA}},
+	{0x13, {MIIC_GMAC2_PORT, MIIC_GMAC1_PORT, MIIC_SWITCH_PORTD,
+		MIIC_SWITCH_PORTC, MIIC_SWITCH_PORTB, MIIC_SWITCH_PORTA}}
+};
+
+static const char * const conf_to_string[] = {
+	[MIIC_GMAC1_PORT]	= "GMAC1_PORT",
+	[MIIC_GMAC2_PORT]	= "GMAC2_PORT",
+	[MIIC_RTOS_PORT]	= "RTOS_PORT",
+	[MIIC_SERCOS_PORTA]	= "SERCOS_PORTA",
+	[MIIC_SERCOS_PORTB]	= "SERCOS_PORTB",
+	[MIIC_ETHERCAT_PORTA]	= "ETHERCAT_PORTA",
+	[MIIC_ETHERCAT_PORTB]	= "ETHERCAT_PORTB",
+	[MIIC_ETHERCAT_PORTC]	= "ETHERCAT_PORTC",
+	[MIIC_SWITCH_PORTA]	= "SWITCH_PORTA",
+	[MIIC_SWITCH_PORTB]	= "SWITCH_PORTB",
+	[MIIC_SWITCH_PORTC]	= "SWITCH_PORTC",
+	[MIIC_SWITCH_PORTD]	= "SWITCH_PORTD",
+	[MIIC_HSR_PORTA]	= "HSR_PORTA",
+	[MIIC_HSR_PORTB]	= "HSR_PORTB",
+};
+
+static const char *index_to_string[MIIC_MODCTRL_CONF_CONV_NUM] = {
+	"SWITCH_PORTIN",
+	"CONV1",
+	"CONV2",
+	"CONV3",
+	"CONV4",
+	"CONV5",
+};
+
+/**
+ * struct miic - MII converter structure
+ * @base: base address of the MII converter
+ * @dev: Device associated to the MII converter
+ * @clks: Clocks used for this device
+ * @nclk: Number of clocks
+ * @lock: Lock used for read-modify-write access
+ */
+struct miic {
+	void __iomem *base;
+	struct device *dev;
+	struct clk_bulk_data *clks;
+	int nclk;
+	spinlock_t lock;
+};
+
+/**
+ * struct miic_port - Per port MII converter struct
+ * @miic: backiling to MII converter structure
+ * @pcs: PCS structure associated to the port
+ * @port: port number
+ */
+struct miic_port {
+	struct miic *miic;
+	struct phylink_pcs pcs;
+	int port;
+};
+
+static struct miic_port *phylink_pcs_to_miic_port(struct phylink_pcs *pcs)
+{
+	return container_of(pcs, struct miic_port, pcs);
+}
+
+static void miic_reg_writel(struct miic *miic, int offset, u32 value)
+{
+	writel(value, miic->base + offset);
+}
+
+static u32 miic_reg_readl(struct miic *miic, int offset)
+{
+	return readl(miic->base + offset);
+}
+
+static void miic_reg_rmw(struct miic *miic, int offset, u32 mask, u32 val)
+{
+	u32 reg;
+
+	spin_lock(&miic->lock);
+
+	reg = miic_reg_readl(miic, offset);
+	reg &= ~mask;
+	reg |= val;
+	miic_reg_writel(miic, offset, reg);
+
+	spin_unlock(&miic->lock);
+}
+
+static void miic_converter_enable(struct miic *miic, int port, int enable)
+{
+	u32 val = 0;
+
+	if (enable)
+		val = MIIC_CONVRST_PHYIF_RST(port);
+
+	miic_reg_rmw(miic, MIIC_CONVRST, MIIC_CONVRST_PHYIF_RST(port), val);
+}
+
+static int miic_config(struct phylink_pcs *pcs, unsigned int mode,
+		       phy_interface_t interface,
+		       const unsigned long *advertising, bool permit)
+{
+	struct miic_port *miic_port = phylink_pcs_to_miic_port(pcs);
+	struct miic *miic = miic_port->miic;
+	int port = miic_port->port;
+	u32 speed, conv_mode, val;
+
+	switch (interface) {
+	case PHY_INTERFACE_MODE_RMII:
+		conv_mode = CONV_MODE_RMII;
+		speed = CONV_MODE_100MBPS;
+		break;
+	case PHY_INTERFACE_MODE_RGMII:
+	case PHY_INTERFACE_MODE_RGMII_ID:
+	case PHY_INTERFACE_MODE_RGMII_TXID:
+	case PHY_INTERFACE_MODE_RGMII_RXID:
+		conv_mode = CONV_MODE_RGMII;
+		speed = CONV_MODE_1000MBPS;
+		break;
+	case PHY_INTERFACE_MODE_MII:
+		conv_mode = CONV_MODE_MII;
+		/* When in MII mode, speed should be set to 0 (which is actually
+		 * CONV_MODE_10MBPS)
+		 */
+		speed = CONV_MODE_10MBPS;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	val = FIELD_PREP(MIIC_CONVCTRL_CONV_MODE, conv_mode) |
+	      FIELD_PREP(MIIC_CONVCTRL_CONV_SPEED, speed);
+
+	miic_reg_rmw(miic, MIIC_CONVCTRL(port),
+		     MIIC_CONVCTRL_CONV_MODE | MIIC_CONVCTRL_CONV_SPEED, val);
+	miic_converter_enable(miic_port->miic, miic_port->port, 1);
+
+	return 0;
+}
+
+static void miic_link_up(struct phylink_pcs *pcs, unsigned int mode,
+			 phy_interface_t interface, int speed, int duplex)
+{
+	struct miic_port *miic_port = phylink_pcs_to_miic_port(pcs);
+	struct miic *miic = miic_port->miic;
+	u32 conv_speed = 0, val = 0;
+	int port = miic_port->port;
+
+	if (duplex == DUPLEX_FULL)
+		val |= MIIC_CONVCTRL_FULLD;
+
+	/* No speed in MII through-mode */
+	if (interface != PHY_INTERFACE_MODE_MII) {
+		switch (speed) {
+		case SPEED_1000:
+			conv_speed = CONV_MODE_1000MBPS;
+			break;
+		case SPEED_100:
+			conv_speed = CONV_MODE_100MBPS;
+			break;
+		case SPEED_10:
+			conv_speed = CONV_MODE_10MBPS;
+			break;
+		default:
+			return;
+		}
+	}
+
+	val |= FIELD_PREP(MIIC_CONVCTRL_CONV_SPEED, conv_speed);
+
+	miic_reg_rmw(miic, MIIC_CONVCTRL(port),
+		     (MIIC_CONVCTRL_CONV_SPEED | MIIC_CONVCTRL_FULLD), val);
+}
+
+static int miic_validate(struct phylink_pcs *pcs, unsigned long *supported,
+			 const struct phylink_link_state *state)
+{
+	if (phy_interface_mode_is_rgmii(state->interface) ||
+	    state->interface == PHY_INTERFACE_MODE_RMII ||
+	    state->interface == PHY_INTERFACE_MODE_MII)
+		return 1;
+
+	return -EINVAL;
+}
+
+static const struct phylink_pcs_ops miic_phylink_ops = {
+	.pcs_validate = miic_validate,
+	.pcs_config = miic_config,
+	.pcs_link_up = miic_link_up,
+};
+
+struct phylink_pcs *miic_create(struct device *dev, struct device_node *np)
+{
+	struct platform_device *pdev;
+	struct miic_port *miic_port;
+	struct device_node *pcs_np;
+	struct miic *miic;
+	u32 port;
+
+	if (!of_device_is_available(np))
+		return ERR_PTR(-ENODEV);
+
+	if (of_property_read_u32(np, "reg", &port))
+		return ERR_PTR(-EINVAL);
+
+	if (port > MIIC_MAX_NR_PORTS || port < 1)
+		return ERR_PTR(-EINVAL);
+
+	/* The PCS pdev is attached to the parent node */
+	pcs_np = of_get_parent(np);
+	if (!pcs_np)
+		return ERR_PTR(-ENODEV);
+
+	if (!of_device_is_available(pcs_np)) {
+		of_node_put(pcs_np);
+		return ERR_PTR(-ENODEV);
+	}
+
+	pdev = of_find_device_by_node(pcs_np);
+	of_node_put(pcs_np);
+	if (!pdev || !platform_get_drvdata(pdev))
+		return ERR_PTR(-EPROBE_DEFER);
+
+	miic_port = kzalloc(sizeof(*miic_port), GFP_KERNEL);
+	if (!miic_port)
+		return ERR_PTR(-ENOMEM);
+
+	miic = platform_get_drvdata(pdev);
+	device_link_add(dev, miic->dev, DL_FLAG_AUTOREMOVE_CONSUMER);
+
+	miic_port->miic = miic;
+	miic_port->port = port - 1;
+	miic_port->pcs.ops = &miic_phylink_ops;
+
+	return &miic_port->pcs;
+}
+EXPORT_SYMBOL(miic_create);
+
+void miic_destroy(struct phylink_pcs *pcs)
+{
+	struct miic_port *miic_port = phylink_pcs_to_miic_port(pcs);
+
+	miic_converter_enable(miic_port->miic, miic_port->port, 0);
+	kfree(miic_port);
+}
+EXPORT_SYMBOL(miic_destroy);
+
+static int miic_init_hw(struct miic *miic, u32 cfg_mode)
+{
+	int port;
+
+	/* Unlock write access to accessory registers (cf datasheet). If this
+	 * is going to be used in conjunction with the Cortex-M3, this sequence
+	 * will have to be moved in register write
+	 */
+	miic_reg_writel(miic, MIIC_PRCMD, 0x00A5);
+	miic_reg_writel(miic, MIIC_PRCMD, 0x0001);
+	miic_reg_writel(miic, MIIC_PRCMD, 0xFFFE);
+	miic_reg_writel(miic, MIIC_PRCMD, 0x0001);
+
+	miic_reg_writel(miic, MIIC_MODCTRL,
+			FIELD_PREP(MIIC_MODCTRL_SW_MODE, cfg_mode));
+
+	for (port = 0; port < MIIC_MAX_NR_PORTS; port++) {
+		miic_converter_enable(miic, port, 0);
+		/* Disable speed/duplex control from these registers, datasheet
+		 * says switch registers should be used to setup switch port
+		 * speed and duplex.
+		 */
+		miic_reg_writel(miic, MIIC_SWCTRL, 0x0);
+		miic_reg_writel(miic, MIIC_SWDUPC, 0x0);
+	}
+
+	return 0;
+}
+
+static bool miic_modctrl_match(s8 table_val[MIIC_MODCTRL_CONF_CONV_NUM],
+			       s8 dt_val[MIIC_MODCTRL_CONF_CONV_NUM])
+{
+	int i;
+
+	for (i = 0; i < MIIC_MODCTRL_CONF_CONV_NUM; i++) {
+		if (dt_val[i] == MIIC_MODCTRL_CONF_NONE)
+			continue;
+
+		if (dt_val[i] != table_val[i])
+			return false;
+	}
+
+	return true;
+}
+
+static void miic_dump_conf(struct device *dev,
+			   s8 conf[MIIC_MODCTRL_CONF_CONV_NUM])
+{
+	const char *conf_name;
+	int i;
+
+	for (i = 0; i < MIIC_MODCTRL_CONF_CONV_NUM; i++) {
+		if (conf[i] != MIIC_MODCTRL_CONF_NONE)
+			conf_name = conf_to_string[conf[i]];
+		else
+			conf_name = "NONE";
+
+		dev_err(dev, "%s: %s\n", index_to_string[i], conf_name);
+	}
+}
+
+static int miic_match_dt_conf(struct device *dev,
+			      s8 dt_val[MIIC_MODCTRL_CONF_CONV_NUM],
+			      u32 *mode_cfg)
+{
+	struct modctrl_match *table_entry;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(modctrl_match_table); i++) {
+		table_entry = &modctrl_match_table[i];
+
+		if (miic_modctrl_match(table_entry->conv, dt_val)) {
+			*mode_cfg = table_entry->mode_cfg;
+			return 0;
+		}
+	}
+
+	dev_err(dev, "Failed to apply requested configuration\n");
+	miic_dump_conf(dev, dt_val);
+
+	return -EINVAL;
+}
+
+static int miic_parse_dt(struct device *dev, u32 *mode_cfg)
+{
+	s8 dt_val[MIIC_MODCTRL_CONF_CONV_NUM];
+	struct device_node *np = dev->of_node;
+	struct device_node *conv;
+	u32 conf;
+	int port;
+
+	memset(dt_val, MIIC_MODCTRL_CONF_NONE, sizeof(dt_val));
+
+	if (of_property_read_u32(np, "renesas,miic-switch-portin", &conf) == 0)
+		dt_val[0] = conf;
+
+	for_each_child_of_node(np, conv) {
+		if (of_property_read_u32(conv, "reg", &port))
+			continue;
+
+		if (!of_device_is_available(conv))
+			continue;
+
+		if (of_property_read_u32(conv, "renesas,miic-input", &conf) == 0)
+			dt_val[port] = conf;
+	}
+
+	return miic_match_dt_conf(dev, dt_val, mode_cfg);
+}
+
+static int miic_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct miic *miic;
+	u32 mode_cfg;
+	int ret;
+
+	ret = miic_parse_dt(dev, &mode_cfg);
+	if (ret < 0)
+		return ret;
+
+	miic = devm_kzalloc(dev, sizeof(*miic), GFP_KERNEL);
+	if (!miic)
+		return -ENOMEM;
+
+	spin_lock_init(&miic->lock);
+	miic->dev = dev;
+	miic->base = devm_platform_ioremap_resource(pdev, 0);
+	if (!miic->base)
+		return -EINVAL;
+
+	ret = devm_pm_runtime_enable(dev);
+	if (ret < 0)
+		return ret;
+
+	ret = pm_runtime_resume_and_get(dev);
+	if (ret < 0)
+		return ret;
+
+	ret = miic_init_hw(miic, mode_cfg);
+	if (ret)
+		goto disable_runtime_pm;
+
+	/* miic_create() relies on that fact that data are attached to the
+	 * platform device to determine if the driver is ready so this needs to
+	 * be the last thing to be done after everything is initialized
+	 * properly.
+	 */
+	platform_set_drvdata(pdev, miic);
+
+	return 0;
+
+disable_runtime_pm:
+	pm_runtime_put(dev);
+
+	return ret;
+}
+
+static int miic_remove(struct platform_device *pdev)
+{
+	pm_runtime_put(&pdev->dev);
+
+	return 0;
+}
+
+static const struct of_device_id miic_of_mtable[] = {
+	{ .compatible = "renesas,rzn1-miic" },
+	{ /* sentinel */ },
+};
+MODULE_DEVICE_TABLE(of, miic_of_mtable);
+
+static struct platform_driver miic_driver = {
+	.driver = {
+		.name	 = "rzn1_miic",
+		.suppress_bind_attrs = true,
+		.of_match_table = miic_of_mtable,
+	},
+	.probe = miic_probe,
+	.remove = miic_remove,
+};
+module_platform_driver(miic_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Renesas MII converter PCS driver");
+MODULE_AUTHOR("Clément Léger <clement.leger@bootlin.com>");
diff --git a/include/linux/pcs-rzn1-miic.h b/include/linux/pcs-rzn1-miic.h
new file mode 100644
index 000000000000..56d12b21365d
--- /dev/null
+++ b/include/linux/pcs-rzn1-miic.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2022 Schneider Electric
+ *
+ * Clément Léger <clement.leger@bootlin.com>
+ */
+
+#ifndef __LINUX_PCS_MIIC_H
+#define __LINUX_PCS_MIIC_H
+
+struct phylink;
+struct device_node;
+
+struct phylink_pcs *miic_create(struct device *dev, struct device_node *np);
+
+void miic_destroy(struct phylink_pcs *pcs);
+
+#endif /* __LINUX_PCS_MIIC_H */
-- 
cgit 


From 6deb209dc6b0952a460da17ee61223ee3b3429d5 Mon Sep 17 00:00:00 2001
From: Subash Abhinov Kasiviswanathan <quic_subashab@quicinc.com>
Date: Fri, 24 Jun 2022 15:34:25 -0600
Subject: net: Print hashed skb addresses for all net and qdisc events

The following commits added support for printing the real address-
65875073eddd ("net: use %px to print skb address in trace_netif_receive_skb")
70713dddf3d2 ("net_sched: introduce tracepoint trace_qdisc_enqueue()")
851f36e40962 ("net_sched: use %px to print skb address in trace_qdisc_dequeue()")

However, tracing the packet traversal shows a mix of hashes and real
addresses. Pasting a sample trace for reference-

ping-14249   [002] .....  3424.046612: netif_rx_entry: dev=lo napi_id=0x3 queue_mapping=0
skbaddr=00000000dcbed83e vlan_tagged=0 vlan_proto=0x0000 vlan_tci=0x0000 protocol=0x0800
ip_summed=0 hash=0x00000000 l4_hash=0 len=84 data_len=0 truesize=768 mac_header_valid=1
mac_header=-14 nr_frags=0 gso_size=0 gso_type=0x0
ping-14249   [002] .....  3424.046615: netif_rx: dev=lo skbaddr=ffffff888e5d1000 len=84

Switch the trace print formats to %p for all the events to have a
consistent format of printing the hashed addresses in all cases.

Signed-off-by: Sean Tranchetti <quic_stranche@quicinc.com>
Signed-off-by: Subash Abhinov Kasiviswanathan <quic_subashab@quicinc.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/net.h   | 2 +-
 include/trace/events/qdisc.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/net.h b/include/trace/events/net.h
index 032b431b987b..da611a7aaf97 100644
--- a/include/trace/events/net.h
+++ b/include/trace/events/net.h
@@ -136,7 +136,7 @@ DECLARE_EVENT_CLASS(net_dev_template,
 		__assign_str(name, skb->dev->name);
 	),
 
-	TP_printk("dev=%s skbaddr=%px len=%u",
+	TP_printk("dev=%s skbaddr=%p len=%u",
 		__get_str(name), __entry->skbaddr, __entry->len)
 )
 
diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h
index 59c945b66f9c..a3995925cb05 100644
--- a/include/trace/events/qdisc.h
+++ b/include/trace/events/qdisc.h
@@ -41,7 +41,7 @@ TRACE_EVENT(qdisc_dequeue,
 		__entry->txq_state	= txq->state;
 	),
 
-	TP_printk("dequeue ifindex=%d qdisc handle=0x%X parent=0x%X txq_state=0x%lX packets=%d skbaddr=%px",
+	TP_printk("dequeue ifindex=%d qdisc handle=0x%X parent=0x%X txq_state=0x%lX packets=%d skbaddr=%p",
 		  __entry->ifindex, __entry->handle, __entry->parent,
 		  __entry->txq_state, __entry->packets, __entry->skbaddr )
 );
@@ -70,7 +70,7 @@ TRACE_EVENT(qdisc_enqueue,
 		__entry->parent	 = qdisc->parent;
 	),
 
-	TP_printk("enqueue ifindex=%d qdisc handle=0x%X parent=0x%X skbaddr=%px",
+	TP_printk("enqueue ifindex=%d qdisc handle=0x%X parent=0x%X skbaddr=%p",
 		  __entry->ifindex, __entry->handle, __entry->parent, __entry->skbaddr)
 );
 
-- 
cgit 


From abc5992b9dd0ba599f7a91d5a0f4569a78ae88ac Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Tue, 7 Jun 2022 20:50:23 +0800
Subject: mm: ioremap: Use more sensible name in ioremap_prot()

Use more meaningful and sensible naming phys_addr
instead addr in ioremap_prot().

Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20220610092255.32445-1-wangkefeng.wang@huawei.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/asm-generic/io.h |  3 ++-
 mm/ioremap.c             | 14 ++++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index 7ce93aaf69f8..b76379628a02 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -964,7 +964,8 @@ static inline void iounmap(volatile void __iomem *addr)
 #elif defined(CONFIG_GENERIC_IOREMAP)
 #include <linux/pgtable.h>
 
-void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot);
+void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
+			   unsigned long prot);
 void iounmap(volatile void __iomem *addr);
 
 static inline void __iomem *ioremap(phys_addr_t addr, size_t size)
diff --git a/mm/ioremap.c b/mm/ioremap.c
index 5fe598ecd9b7..2d754b48d230 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -11,20 +11,21 @@
 #include <linux/io.h>
 #include <linux/export.h>
 
-void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot)
+void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
+			   unsigned long prot)
 {
 	unsigned long offset, vaddr;
 	phys_addr_t last_addr;
 	struct vm_struct *area;
 
 	/* Disallow wrap-around or zero size */
-	last_addr = addr + size - 1;
-	if (!size || last_addr < addr)
+	last_addr = phys_addr + size - 1;
+	if (!size || last_addr < phys_addr)
 		return NULL;
 
 	/* Page-align mappings */
-	offset = addr & (~PAGE_MASK);
-	addr -= offset;
+	offset = phys_addr & (~PAGE_MASK);
+	phys_addr -= offset;
 	size = PAGE_ALIGN(size + offset);
 
 	area = get_vm_area_caller(size, VM_IOREMAP,
@@ -33,7 +34,8 @@ void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot)
 		return NULL;
 	vaddr = (unsigned long)area->addr;
 
-	if (ioremap_page_range(vaddr, vaddr + size, addr, __pgprot(prot))) {
+	if (ioremap_page_range(vaddr, vaddr + size, phys_addr,
+			       __pgprot(prot))) {
 		free_vm_area(area);
 		return NULL;
 	}
-- 
cgit 


From 18e780b4e6ab89a3a10f46d151971863562c1c91 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Tue, 7 Jun 2022 20:50:25 +0800
Subject: mm: ioremap: Add ioremap/iounmap_allowed()

Add special hook for architecture to verify addr, size or prot
when ioremap() or iounmap(), which will make the generic ioremap
more useful.

  ioremap_allowed() return a bool,
    - true means continue to remap
    - false means skip remap and return directly
  iounmap_allowed() return a bool,
    - true means continue to vunmap
    - false code means skip vunmap and return directly

Meanwhile, only vunmap the address when it is in vmalloc area
as the generic ioremap only returns vmalloc addresses.

Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Baoquan He <bhe@redhat.com>
Link: https://lore.kernel.org/r/20220607125027.44946-5-wangkefeng.wang@huawei.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/asm-generic/io.h | 26 ++++++++++++++++++++++++++
 mm/ioremap.c             | 11 ++++++++++-
 2 files changed, 36 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index b76379628a02..db5b890eaff7 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -964,6 +964,32 @@ static inline void iounmap(volatile void __iomem *addr)
 #elif defined(CONFIG_GENERIC_IOREMAP)
 #include <linux/pgtable.h>
 
+/*
+ * Arch code can implement the following two hooks when using GENERIC_IOREMAP
+ * ioremap_allowed() return a bool,
+ *   - true means continue to remap
+ *   - false means skip remap and return directly
+ * iounmap_allowed() return a bool,
+ *   - true means continue to vunmap
+ *   - false means skip vunmap and return directly
+ */
+#ifndef ioremap_allowed
+#define ioremap_allowed ioremap_allowed
+static inline bool ioremap_allowed(phys_addr_t phys_addr, size_t size,
+				   unsigned long prot)
+{
+	return true;
+}
+#endif
+
+#ifndef iounmap_allowed
+#define iounmap_allowed iounmap_allowed
+static inline bool iounmap_allowed(void *addr)
+{
+	return true;
+}
+#endif
+
 void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
 			   unsigned long prot);
 void iounmap(volatile void __iomem *addr);
diff --git a/mm/ioremap.c b/mm/ioremap.c
index e1d008e8f87f..8652426282cc 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -28,6 +28,9 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
 	phys_addr -= offset;
 	size = PAGE_ALIGN(size + offset);
 
+	if (!ioremap_allowed(phys_addr, size, prot))
+		return NULL;
+
 	area = get_vm_area_caller(size, VM_IOREMAP,
 			__builtin_return_address(0));
 	if (!area)
@@ -47,6 +50,12 @@ EXPORT_SYMBOL(ioremap_prot);
 
 void iounmap(volatile void __iomem *addr)
 {
-	vunmap((void *)((unsigned long)addr & PAGE_MASK));
+	void *vaddr = (void *)((unsigned long)addr & PAGE_MASK);
+
+	if (!iounmap_allowed(vaddr))
+		return;
+
+	if (is_vmalloc_addr(vaddr))
+		vunmap(vaddr);
 }
 EXPORT_SYMBOL(iounmap);
-- 
cgit 


From 1c348f748b4dd7711c5564a8fce0842529498dff Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Thu, 23 Jun 2022 13:51:15 +0100
Subject: ASoC: soc-component: Add legacy_dai_naming flag

Historically, the legacy DAI naming scheme was applied to platform
drivers and the newer scheme to CODEC drivers. During componentisation
the core lost the knowledge of if a driver was a CODEC or platform, they
were all now components. To continue to support the legacy naming on
older platform drivers a flag was added to the snd_soc_component_driver
structure, non_legacy_dai_naming, to indicate to use the new scheme and
this was applied to all CODECs as part of the migration.

However, a slight issue appears to be developing with respect to this
flag being opt in for the non-legacy scheme, which presumably we want to
be the primary scheme used. Many codec drivers appear to forget to
include this flag:

  grep -l -r "snd_soc_component_driver" sound/soc/codecs/*.c |
  xargs grep -L "non_legacy_dai_naming" | wc
     48      48    556

It would seem more sensible to change the flag to legacy_dai_naming
making the new scheme opt out. As a first step this patch adds a new
flag for this so that the users can be updated.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20220623125250.2355471-2-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-component.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/sound/soc-component.h b/include/sound/soc-component.h
index 5c4cfa70b018..96c2f5fffc51 100644
--- a/include/sound/soc-component.h
+++ b/include/sound/soc-component.h
@@ -179,6 +179,7 @@ struct snd_soc_component_driver {
 	 * analogue).
 	 */
 	unsigned int endianness:1;
+	unsigned int legacy_dai_naming:1;
 	unsigned int non_legacy_dai_naming:1;
 
 	/* this component uses topology and ignore machine driver FEs */
-- 
cgit 


From 8da443b1a4036b5863fd2ec7e0251164b704c726 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 14 Jun 2022 11:05:34 +0200
Subject: tty/vt: consolemap: rename struct vc_data::vc_uni_pagedir*
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As a follow-up to the commit 4173f018aae1 (tty/vt: consolemap: rename
and document struct uni_pagedir), rename also the members of struct
vc_data. I.e. pagedir -> pagedict. And while touching all the places,
remove also the unnecessary vc_ prefix.

Suggested-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20220614090537.15557-5-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/vt/consolemap.c             | 46 ++++++++++++++++-----------------
 drivers/tty/vt/vt.c                     |  8 +++---
 drivers/usb/misc/sisusbvga/sisusb_con.c |  2 +-
 drivers/video/console/vgacon.c          |  8 +++---
 drivers/video/fbdev/core/fbcon.c        |  8 +++---
 include/linux/console_struct.h          |  4 +--
 6 files changed, 38 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/drivers/tty/vt/consolemap.c b/drivers/tty/vt/consolemap.c
index 3d0e10dac6d9..16d0d8f04f0e 100644
--- a/drivers/tty/vt/consolemap.c
+++ b/drivers/tty/vt/consolemap.c
@@ -296,7 +296,7 @@ u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode)
 	if (glyph >= MAX_GLYPH)
 		return 0;
 
-	p = *conp->vc_uni_pagedir_loc;
+	p = *conp->uni_pagedict_loc;
 	if (!p)
 		return glyph;
 
@@ -323,7 +323,7 @@ static void update_user_maps(void)
 	for (i = 0; i < MAX_NR_CONSOLES; i++) {
 		if (!vc_cons_allocated(i))
 			continue;
-		p = *vc_cons[i].d->vc_uni_pagedir_loc;
+		p = *vc_cons[i].d->uni_pagedict_loc;
 		if (p && p != q) {
 			set_inverse_transl(vc_cons[i].d, p, USER_MAP);
 			set_inverse_trans_unicode(p);
@@ -445,10 +445,10 @@ void con_free_unimap(struct vc_data *vc)
 {
 	struct uni_pagedict *p;
 
-	p = *vc->vc_uni_pagedir_loc;
+	p = *vc->uni_pagedict_loc;
 	if (!p)
 		return;
-	*vc->vc_uni_pagedir_loc = NULL;
+	*vc->uni_pagedict_loc = NULL;
 	if (--p->refcount)
 		return;
 	con_release_unimap(p);
@@ -463,7 +463,7 @@ static int con_unify_unimap(struct vc_data *conp, struct uni_pagedict *dict1)
 	for (cons = 0; cons < MAX_NR_CONSOLES; cons++) {
 		if (!vc_cons_allocated(cons))
 			continue;
-		dict2 = *vc_cons[cons].d->vc_uni_pagedir_loc;
+		dict2 = *vc_cons[cons].d->uni_pagedict_loc;
 		if (!dict2 || dict2 == dict1 || dict2->sum != dict1->sum)
 			continue;
 		for (d = 0; d < UNI_DIRS; d++) {
@@ -487,7 +487,7 @@ static int con_unify_unimap(struct vc_data *conp, struct uni_pagedict *dict1)
 		}
 		if (d == UNI_DIRS) {
 			dict2->refcount++;
-			*conp->vc_uni_pagedir_loc = dict2;
+			*conp->uni_pagedict_loc = dict2;
 			con_release_unimap(dict1);
 			kfree(dict1);
 			return 1;
@@ -531,14 +531,14 @@ con_insert_unipair(struct uni_pagedict *p, u_short unicode, u_short fontpos)
 
 static int con_allocate_new(struct vc_data *vc)
 {
-	struct uni_pagedict *new, *old = *vc->vc_uni_pagedir_loc;
+	struct uni_pagedict *new, *old = *vc->uni_pagedict_loc;
 
 	new = kzalloc(sizeof(*new), GFP_KERNEL);
 	if (!new)
 		return -ENOMEM;
 
 	new->refcount = 1;
-	*vc->vc_uni_pagedir_loc = new;
+	*vc->uni_pagedict_loc = new;
 
 	if (old)
 		old->refcount--;
@@ -549,7 +549,7 @@ static int con_allocate_new(struct vc_data *vc)
 /* Caller must hold the lock */
 static int con_do_clear_unimap(struct vc_data *vc)
 {
-	struct uni_pagedict *old = *vc->vc_uni_pagedir_loc;
+	struct uni_pagedict *old = *vc->uni_pagedict_loc;
 
 	if (!old || old->refcount > 1)
 		return con_allocate_new(vc);
@@ -583,7 +583,7 @@ static struct uni_pagedict *con_unshare_unimap(struct vc_data *vc,
 	if (ret)
 		return ERR_PTR(ret);
 
-	new = *vc->vc_uni_pagedir_loc;
+	new = *vc->uni_pagedict_loc;
 
 	/*
 	 * uni_pgdir is a 32*32*64 table with rows allocated when its first
@@ -616,7 +616,7 @@ static struct uni_pagedict *con_unshare_unimap(struct vc_data *vc,
 				ret = con_insert_unipair(new, uni, row[g]);
 				if (ret) {
 					old->refcount++;
-					*vc->vc_uni_pagedir_loc = old;
+					*vc->uni_pagedict_loc = old;
 					con_release_unimap(new);
 					kfree(new);
 					return ERR_PTR(ret);
@@ -644,7 +644,7 @@ int con_set_unimap(struct vc_data *vc, ushort ct, struct unipair __user *list)
 	console_lock();
 
 	/* Save original vc_unipagdir_loc in case we allocate a new one */
-	dict = *vc->vc_uni_pagedir_loc;
+	dict = *vc->uni_pagedict_loc;
 	if (!dict) {
 		err = -EINVAL;
 		goto out_unlock;
@@ -704,12 +704,12 @@ int con_set_default_unimap(struct vc_data *vc)
 	u16 *dfont;
 
 	if (dflt) {
-		dict = *vc->vc_uni_pagedir_loc;
+		dict = *vc->uni_pagedict_loc;
 		if (dict == dflt)
 			return 0;
 
 		dflt->refcount++;
-		*vc->vc_uni_pagedir_loc = dflt;
+		*vc->uni_pagedict_loc = dflt;
 		if (dict && !--dict->refcount) {
 			con_release_unimap(dict);
 			kfree(dict);
@@ -723,7 +723,7 @@ int con_set_default_unimap(struct vc_data *vc)
 	if (err)
 		return err;
 
-	dict = *vc->vc_uni_pagedir_loc;
+	dict = *vc->uni_pagedict_loc;
 	dfont = dfont_unitable;
 
 	for (fontpos = 0; fontpos < 256U; fontpos++)
@@ -734,7 +734,7 @@ int con_set_default_unimap(struct vc_data *vc)
 		}
 
 	if (con_unify_unimap(vc, dict)) {
-		dflt = *vc->vc_uni_pagedir_loc;
+		dflt = *vc->uni_pagedict_loc;
 		return err;
 	}
 
@@ -757,14 +757,14 @@ int con_copy_unimap(struct vc_data *dst_vc, struct vc_data *src_vc)
 {
 	struct uni_pagedict *src;
 
-	if (!*src_vc->vc_uni_pagedir_loc)
+	if (!*src_vc->uni_pagedict_loc)
 		return -EINVAL;
-	if (*dst_vc->vc_uni_pagedir_loc == *src_vc->vc_uni_pagedir_loc)
+	if (*dst_vc->uni_pagedict_loc == *src_vc->uni_pagedict_loc)
 		return 0;
 	con_free_unimap(dst_vc);
-	src = *src_vc->vc_uni_pagedir_loc;
+	src = *src_vc->uni_pagedict_loc;
 	src->refcount++;
-	*dst_vc->vc_uni_pagedir_loc = src;
+	*dst_vc->uni_pagedict_loc = src;
 	return 0;
 }
 EXPORT_SYMBOL(con_copy_unimap);
@@ -791,7 +791,7 @@ int con_get_unimap(struct vc_data *vc, ushort ct, ushort __user *uct,
 	console_lock();
 
 	ect = 0;
-	dict = *vc->vc_uni_pagedir_loc;
+	dict = *vc->uni_pagedict_loc;
 	if (!dict)
 		goto unlock;
 
@@ -873,7 +873,7 @@ int conv_uni_to_pc(struct vc_data *conp, long ucs)
 	else if ((ucs & ~UNI_DIRECT_MASK) == UNI_DIRECT_BASE)
 		return ucs & UNI_DIRECT_MASK;
 
-	dict = *conp->vc_uni_pagedir_loc;
+	dict = *conp->uni_pagedict_loc;
 	if (!dict)
 		return -3;
 
@@ -903,7 +903,7 @@ console_map_init(void)
 	int i;
 
 	for (i = 0; i < MAX_NR_CONSOLES; i++)
-		if (vc_cons_allocated(i) && !*vc_cons[i].d->vc_uni_pagedir_loc)
+		if (vc_cons_allocated(i) && !*vc_cons[i].d->uni_pagedict_loc)
 			con_set_default_unimap(vc_cons[i].d);
 }
 
diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index c718b0d01e3d..1899b8a5d73e 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -1063,10 +1063,10 @@ static void visual_init(struct vc_data *vc, int num, int init)
 	__module_get(vc->vc_sw->owner);
 	vc->vc_num = num;
 	vc->vc_display_fg = &master_display_fg;
-	if (vc->vc_uni_pagedir_loc)
+	if (vc->uni_pagedict_loc)
 		con_free_unimap(vc);
-	vc->vc_uni_pagedir_loc = &vc->vc_uni_pagedir;
-	vc->vc_uni_pagedir = NULL;
+	vc->uni_pagedict_loc = &vc->uni_pagedict;
+	vc->uni_pagedict = NULL;
 	vc->vc_hi_font_mask = 0;
 	vc->vc_complement_mask = 0;
 	vc->vc_can_do_color = 0;
@@ -1136,7 +1136,7 @@ int vc_allocate(unsigned int currcons)	/* return 0 on success */
 
 	visual_init(vc, currcons, 1);
 
-	if (!*vc->vc_uni_pagedir_loc)
+	if (!*vc->uni_pagedict_loc)
 		con_set_default_unimap(vc);
 
 	err = -EINVAL;
diff --git a/drivers/usb/misc/sisusbvga/sisusb_con.c b/drivers/usb/misc/sisusbvga/sisusb_con.c
index dfa0d5ce6012..fcb95fb639e0 100644
--- a/drivers/usb/misc/sisusbvga/sisusb_con.c
+++ b/drivers/usb/misc/sisusbvga/sisusb_con.c
@@ -248,7 +248,7 @@ sisusbcon_init(struct vc_data *c, int init)
 	 */
 	kref_get(&sisusb->kref);
 
-	if (!*c->vc_uni_pagedir_loc)
+	if (!*c->uni_pagedict_loc)
 		con_set_default_unimap(c);
 
 	mutex_unlock(&sisusb->lock);
diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c
index 058a78b8dbcf..fcdf017e2665 100644
--- a/drivers/video/console/vgacon.c
+++ b/drivers/video/console/vgacon.c
@@ -367,10 +367,10 @@ static void vgacon_init(struct vc_data *c, int init)
 	c->vc_complement_mask = 0x7700;
 	if (vga_512_chars)
 		c->vc_hi_font_mask = 0x0800;
-	p = *c->vc_uni_pagedir_loc;
-	if (c->vc_uni_pagedir_loc != &vgacon_uni_pagedir) {
+	p = *c->uni_pagedict_loc;
+	if (c->uni_pagedict_loc != &vgacon_uni_pagedir) {
 		con_free_unimap(c);
-		c->vc_uni_pagedir_loc = &vgacon_uni_pagedir;
+		c->uni_pagedict_loc = &vgacon_uni_pagedir;
 		vgacon_refcount++;
 	}
 	if (!vgacon_uni_pagedir && p)
@@ -392,7 +392,7 @@ static void vgacon_deinit(struct vc_data *c)
 
 	if (!--vgacon_refcount)
 		con_free_unimap(c);
-	c->vc_uni_pagedir_loc = &c->vc_uni_pagedir;
+	c->uni_pagedict_loc = &c->uni_pagedict;
 	con_set_default_unimap(c);
 }
 
diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index c4e91715ef00..b10236fe5886 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -1058,9 +1058,9 @@ static void fbcon_init(struct vc_data *vc, int init)
 			vc->vc_complement_mask <<= 1;
 	}
 
-	if (!*svc->vc_uni_pagedir_loc)
+	if (!*svc->uni_pagedict_loc)
 		con_set_default_unimap(svc);
-	if (!*vc->vc_uni_pagedir_loc)
+	if (!*vc->uni_pagedict_loc)
 		con_copy_unimap(vc, svc);
 
 	ops = info->fbcon_par;
@@ -1382,9 +1382,9 @@ static void fbcon_set_disp(struct fb_info *info, struct fb_var_screeninfo *var,
 			vc->vc_complement_mask <<= 1;
 	}
 
-	if (!*svc->vc_uni_pagedir_loc)
+	if (!*svc->uni_pagedict_loc)
 		con_set_default_unimap(svc);
-	if (!*vc->vc_uni_pagedir_loc)
+	if (!*vc->uni_pagedict_loc)
 		con_copy_unimap(vc, svc);
 
 	cols = FBCON_SWAP(ops->rotate, info->var.xres, info->var.yres);
diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h
index f75033f0277f..1518568aaf0f 100644
--- a/include/linux/console_struct.h
+++ b/include/linux/console_struct.h
@@ -157,8 +157,8 @@ struct vc_data {
 	unsigned int	vc_bell_duration;	/* Console bell duration */
 	unsigned short	vc_cur_blink_ms;	/* Cursor blink duration */
 	struct vc_data **vc_display_fg;		/* [!] Ptr to var holding fg console for this display */
-	struct uni_pagedict *vc_uni_pagedir;
-	struct uni_pagedict **vc_uni_pagedir_loc; /* [!] Location of uni_pagedict variable for this console */
+	struct uni_pagedict *uni_pagedict;
+	struct uni_pagedict **uni_pagedict_loc; /* [!] Location of uni_pagedict variable for this console */
 	struct uni_screen *vc_uni_screen;	/* unicode screen content */
 	/* additional information is in vt_kern.h */
 };
-- 
cgit 


From 01936221278c5af60d82b8e78ca74caa491c0d31 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Thu, 23 Jun 2022 13:52:50 +0100
Subject: ASoC: soc-component: Remove non_legacy_dai_naming flag

Now all the users are moved over to the new legacy_dai_naming flag,
remove the now unused old flag.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20220623125250.2355471-97-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-component.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/sound/soc-component.h b/include/sound/soc-component.h
index 96c2f5fffc51..c26ffb033777 100644
--- a/include/sound/soc-component.h
+++ b/include/sound/soc-component.h
@@ -180,7 +180,6 @@ struct snd_soc_component_driver {
 	 */
 	unsigned int endianness:1;
 	unsigned int legacy_dai_naming:1;
-	unsigned int non_legacy_dai_naming:1;
 
 	/* this component uses topology and ignore machine driver FEs */
 	const char *ignore_machine;
-- 
cgit 


From 1714582a3a087eda8786d5a1b32b2ec86ca8a303 Mon Sep 17 00:00:00 2001
From: David Jander <david@protonic.nl>
Date: Tue, 21 Jun 2022 08:12:24 +0200
Subject: spi: Move ctlr->cur_msg_prepared to struct spi_message

This enables the possibility to transfer a message that is not at the
current tip of the async message queue.
This is in preparation of the next patch(es) which enable spi_sync messages
to skip the queue altogether.

Signed-off-by: David Jander <david@protonic.nl>
Link: https://lore.kernel.org/r/20220621061234.3626638-2-david@protonic.nl
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi.c       | 7 ++++---
 include/linux/spi/spi.h | 7 ++++---
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index c78d1ceeaa42..eb6360153fa1 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -1684,7 +1684,7 @@ static void __spi_pump_messages(struct spi_controller *ctlr, bool in_kthread)
 			spi_finalize_current_message(ctlr);
 			goto out;
 		}
-		ctlr->cur_msg_prepared = true;
+		msg->prepared = true;
 	}
 
 	ret = spi_map_msg(ctlr, msg);
@@ -1926,7 +1926,7 @@ void spi_finalize_current_message(struct spi_controller *ctlr)
 	 */
 	spi_res_release(ctlr, mesg);
 
-	if (ctlr->cur_msg_prepared && ctlr->unprepare_message) {
+	if (mesg->prepared && ctlr->unprepare_message) {
 		ret = ctlr->unprepare_message(ctlr, mesg);
 		if (ret) {
 			dev_err(&ctlr->dev, "failed to unprepare message: %d\n",
@@ -1934,9 +1934,10 @@ void spi_finalize_current_message(struct spi_controller *ctlr)
 		}
 	}
 
+	mesg->prepared = false;
+
 	spin_lock_irqsave(&ctlr->queue_lock, flags);
 	ctlr->cur_msg = NULL;
-	ctlr->cur_msg_prepared = false;
 	ctlr->fallback = false;
 	kthread_queue_work(ctlr->kworker, &ctlr->pump_messages);
 	spin_unlock_irqrestore(&ctlr->queue_lock, flags);
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index c96f526d9a20..1a75c26742f2 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -385,8 +385,6 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch
  * @queue: message queue
  * @idling: the device is entering idle state
  * @cur_msg: the currently in-flight message
- * @cur_msg_prepared: spi_prepare_message was called for the currently
- *                    in-flight message
  * @cur_msg_mapped: message has been mapped for DMA
  * @last_cs: the last chip_select that is recorded by set_cs, -1 on non chip
  *           selected
@@ -621,7 +619,6 @@ struct spi_controller {
 	bool				running;
 	bool				rt;
 	bool				auto_runtime_pm;
-	bool                            cur_msg_prepared;
 	bool				cur_msg_mapped;
 	char				last_cs;
 	bool				last_cs_mode_high;
@@ -988,6 +985,7 @@ struct spi_transfer {
  * @queue: for use by whichever driver currently owns the message
  * @state: for use by whichever driver currently owns the message
  * @resources: for resource management when the spi message is processed
+ * @prepared: spi_prepare_message was called for the this message
  *
  * A @spi_message is used to execute an atomic sequence of data transfers,
  * each represented by a struct spi_transfer.  The sequence is "atomic"
@@ -1037,6 +1035,9 @@ struct spi_message {
 
 	/* list of spi_res reources when the spi message is processed */
 	struct list_head        resources;
+
+	/* spi_prepare_message was called for this message */
+	bool                    prepared;
 };
 
 static inline void spi_message_init_no_memset(struct spi_message *m)
-- 
cgit 


From ae7d2346dc89ae89a6e0aabe6037591a11e593c0 Mon Sep 17 00:00:00 2001
From: David Jander <david@protonic.nl>
Date: Tue, 21 Jun 2022 08:12:25 +0200
Subject: spi: Don't use the message queue if possible in spi_sync

The interaction with the controller message queue and its corresponding
auxiliary flags and variables requires the use of the queue_lock which is
costly. Since spi_sync will transfer the complete message anyway, and not
return until it is finished, there is no need to put the message into the
queue if the queue is empty. This can save a lot of overhead.

As an example of how significant this is, when using the MCP2518FD SPI CAN
controller on a i.MX8MM SoC, the time during which the interrupt line
stays active (during 3 relatively short spi_sync messages), is reduced
from 98us to 72us by this patch.

Signed-off-by: David Jander <david@protonic.nl>
Link: https://lore.kernel.org/r/20220621061234.3626638-3-david@protonic.nl
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi.c       | 246 +++++++++++++++++++++++++++++-------------------
 include/linux/spi/spi.h |  11 ++-
 2 files changed, 159 insertions(+), 98 deletions(-)

(limited to 'include')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index eb6360153fa1..2d057d03c4f7 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -1549,6 +1549,80 @@ static void spi_idle_runtime_pm(struct spi_controller *ctlr)
 	}
 }
 
+static int __spi_pump_transfer_message(struct spi_controller *ctlr,
+		struct spi_message *msg, bool was_busy)
+{
+	struct spi_transfer *xfer;
+	int ret;
+
+	if (!was_busy && ctlr->auto_runtime_pm) {
+		ret = pm_runtime_get_sync(ctlr->dev.parent);
+		if (ret < 0) {
+			pm_runtime_put_noidle(ctlr->dev.parent);
+			dev_err(&ctlr->dev, "Failed to power device: %d\n",
+				ret);
+			return ret;
+		}
+	}
+
+	if (!was_busy)
+		trace_spi_controller_busy(ctlr);
+
+	if (!was_busy && ctlr->prepare_transfer_hardware) {
+		ret = ctlr->prepare_transfer_hardware(ctlr);
+		if (ret) {
+			dev_err(&ctlr->dev,
+				"failed to prepare transfer hardware: %d\n",
+				ret);
+
+			if (ctlr->auto_runtime_pm)
+				pm_runtime_put(ctlr->dev.parent);
+
+			msg->status = ret;
+			spi_finalize_current_message(ctlr);
+
+			return ret;
+		}
+	}
+
+	trace_spi_message_start(msg);
+
+	if (ctlr->prepare_message) {
+		ret = ctlr->prepare_message(ctlr, msg);
+		if (ret) {
+			dev_err(&ctlr->dev, "failed to prepare message: %d\n",
+				ret);
+			msg->status = ret;
+			spi_finalize_current_message(ctlr);
+			return ret;
+		}
+		msg->prepared = true;
+	}
+
+	ret = spi_map_msg(ctlr, msg);
+	if (ret) {
+		msg->status = ret;
+		spi_finalize_current_message(ctlr);
+		return ret;
+	}
+
+	if (!ctlr->ptp_sts_supported && !ctlr->transfer_one) {
+		list_for_each_entry(xfer, &msg->transfers, transfer_list) {
+			xfer->ptp_sts_word_pre = 0;
+			ptp_read_system_prets(xfer->ptp_sts);
+		}
+	}
+
+	ret = ctlr->transfer_one_message(ctlr, msg);
+	if (ret) {
+		dev_err(&ctlr->dev,
+			"failed to transfer one message from queue\n");
+		return ret;
+	}
+
+	return 0;
+}
+
 /**
  * __spi_pump_messages - function which processes spi message queue
  * @ctlr: controller to process queue for
@@ -1564,7 +1638,6 @@ static void spi_idle_runtime_pm(struct spi_controller *ctlr)
  */
 static void __spi_pump_messages(struct spi_controller *ctlr, bool in_kthread)
 {
-	struct spi_transfer *xfer;
 	struct spi_message *msg;
 	bool was_busy = false;
 	unsigned long flags;
@@ -1599,6 +1672,7 @@ static void __spi_pump_messages(struct spi_controller *ctlr, bool in_kthread)
 			    !ctlr->unprepare_transfer_hardware) {
 				spi_idle_runtime_pm(ctlr);
 				ctlr->busy = false;
+				ctlr->queue_empty = true;
 				trace_spi_controller_idle(ctlr);
 			} else {
 				kthread_queue_work(ctlr->kworker,
@@ -1625,6 +1699,7 @@ static void __spi_pump_messages(struct spi_controller *ctlr, bool in_kthread)
 
 		spin_lock_irqsave(&ctlr->queue_lock, flags);
 		ctlr->idling = false;
+		ctlr->queue_empty = true;
 		spin_unlock_irqrestore(&ctlr->queue_lock, flags);
 		return;
 	}
@@ -1641,75 +1716,7 @@ static void __spi_pump_messages(struct spi_controller *ctlr, bool in_kthread)
 	spin_unlock_irqrestore(&ctlr->queue_lock, flags);
 
 	mutex_lock(&ctlr->io_mutex);
-
-	if (!was_busy && ctlr->auto_runtime_pm) {
-		ret = pm_runtime_resume_and_get(ctlr->dev.parent);
-		if (ret < 0) {
-			dev_err(&ctlr->dev, "Failed to power device: %d\n",
-				ret);
-			mutex_unlock(&ctlr->io_mutex);
-			return;
-		}
-	}
-
-	if (!was_busy)
-		trace_spi_controller_busy(ctlr);
-
-	if (!was_busy && ctlr->prepare_transfer_hardware) {
-		ret = ctlr->prepare_transfer_hardware(ctlr);
-		if (ret) {
-			dev_err(&ctlr->dev,
-				"failed to prepare transfer hardware: %d\n",
-				ret);
-
-			if (ctlr->auto_runtime_pm)
-				pm_runtime_put(ctlr->dev.parent);
-
-			msg->status = ret;
-			spi_finalize_current_message(ctlr);
-
-			mutex_unlock(&ctlr->io_mutex);
-			return;
-		}
-	}
-
-	trace_spi_message_start(msg);
-
-	if (ctlr->prepare_message) {
-		ret = ctlr->prepare_message(ctlr, msg);
-		if (ret) {
-			dev_err(&ctlr->dev, "failed to prepare message: %d\n",
-				ret);
-			msg->status = ret;
-			spi_finalize_current_message(ctlr);
-			goto out;
-		}
-		msg->prepared = true;
-	}
-
-	ret = spi_map_msg(ctlr, msg);
-	if (ret) {
-		msg->status = ret;
-		spi_finalize_current_message(ctlr);
-		goto out;
-	}
-
-	if (!ctlr->ptp_sts_supported && !ctlr->transfer_one) {
-		list_for_each_entry(xfer, &msg->transfers, transfer_list) {
-			xfer->ptp_sts_word_pre = 0;
-			ptp_read_system_prets(xfer->ptp_sts);
-		}
-	}
-
-	ret = ctlr->transfer_one_message(ctlr, msg);
-	if (ret) {
-		dev_err(&ctlr->dev,
-			"failed to transfer one message from queue: %d\n",
-			ret);
-		goto out;
-	}
-
-out:
+	ret = __spi_pump_transfer_message(ctlr, msg, was_busy);
 	mutex_unlock(&ctlr->io_mutex);
 
 	/* Prod the scheduler in case transfer_one() was busy waiting */
@@ -1839,6 +1846,7 @@ static int spi_init_queue(struct spi_controller *ctlr)
 {
 	ctlr->running = false;
 	ctlr->busy = false;
+	ctlr->queue_empty = true;
 
 	ctlr->kworker = kthread_create_worker(0, dev_name(&ctlr->dev));
 	if (IS_ERR(ctlr->kworker)) {
@@ -1936,11 +1944,20 @@ void spi_finalize_current_message(struct spi_controller *ctlr)
 
 	mesg->prepared = false;
 
-	spin_lock_irqsave(&ctlr->queue_lock, flags);
-	ctlr->cur_msg = NULL;
-	ctlr->fallback = false;
-	kthread_queue_work(ctlr->kworker, &ctlr->pump_messages);
-	spin_unlock_irqrestore(&ctlr->queue_lock, flags);
+	if (!mesg->sync) {
+		/*
+		 * This message was sent via the async message queue. Handle
+		 * the queue and kick the worker thread to do the
+		 * idling/shutdown or send the next message if needed.
+		 */
+		spin_lock_irqsave(&ctlr->queue_lock, flags);
+		WARN(ctlr->cur_msg != mesg,
+			"Finalizing queued message that is not the current head of queue!");
+		ctlr->cur_msg = NULL;
+		ctlr->fallback = false;
+		kthread_queue_work(ctlr->kworker, &ctlr->pump_messages);
+		spin_unlock_irqrestore(&ctlr->queue_lock, flags);
+	}
 
 	trace_spi_message_done(mesg);
 
@@ -2043,6 +2060,7 @@ static int __spi_queued_transfer(struct spi_device *spi,
 	msg->status = -EINPROGRESS;
 
 	list_add_tail(&msg->queue, &ctlr->queue);
+	ctlr->queue_empty = false;
 	if (!ctlr->busy && need_pump)
 		kthread_queue_work(ctlr->kworker, &ctlr->pump_messages);
 
@@ -3938,6 +3956,39 @@ static int spi_async_locked(struct spi_device *spi, struct spi_message *message)
 
 }
 
+static void __spi_transfer_message_noqueue(struct spi_controller *ctlr, struct spi_message *msg)
+{
+	bool was_busy;
+	int ret;
+
+	mutex_lock(&ctlr->io_mutex);
+
+	/* If another context is idling the device then wait */
+	while (ctlr->idling)
+		usleep_range(10000, 11000);
+
+	was_busy = READ_ONCE(ctlr->busy);
+
+	ret = __spi_pump_transfer_message(ctlr, msg, was_busy);
+	if (ret)
+		goto out;
+
+	if (!was_busy) {
+		kfree(ctlr->dummy_rx);
+		ctlr->dummy_rx = NULL;
+		kfree(ctlr->dummy_tx);
+		ctlr->dummy_tx = NULL;
+		if (ctlr->unprepare_transfer_hardware &&
+		    ctlr->unprepare_transfer_hardware(ctlr))
+			dev_err(&ctlr->dev,
+				"failed to unprepare transfer hardware\n");
+		spi_idle_runtime_pm(ctlr);
+	}
+
+out:
+	mutex_unlock(&ctlr->io_mutex);
+}
+
 /*-------------------------------------------------------------------------*/
 
 /*
@@ -3956,51 +4007,52 @@ static int __spi_sync(struct spi_device *spi, struct spi_message *message)
 	DECLARE_COMPLETION_ONSTACK(done);
 	int status;
 	struct spi_controller *ctlr = spi->controller;
-	unsigned long flags;
 
 	status = __spi_validate(spi, message);
 	if (status != 0)
 		return status;
 
-	message->complete = spi_complete;
-	message->context = &done;
 	message->spi = spi;
 
 	SPI_STATISTICS_INCREMENT_FIELD(ctlr->pcpu_statistics, spi_sync);
 	SPI_STATISTICS_INCREMENT_FIELD(spi->pcpu_statistics, spi_sync);
 
 	/*
-	 * If we're not using the legacy transfer method then we will
-	 * try to transfer in the calling context so special case.
-	 * This code would be less tricky if we could remove the
-	 * support for driver implemented message queues.
+	 * Checking queue_empty here only guarantees async/sync message
+	 * ordering when coming from the same context. It does not need to
+	 * guard against reentrancy from a different context. The io_mutex
+	 * will catch those cases.
 	 */
-	if (ctlr->transfer == spi_queued_transfer) {
-		spin_lock_irqsave(&ctlr->bus_lock_spinlock, flags);
+	if (READ_ONCE(ctlr->queue_empty)) {
+		message->sync = true;
+		message->actual_length = 0;
+		message->status = -EINPROGRESS;
 
 		trace_spi_message_submit(message);
 
-		status = __spi_queued_transfer(spi, message, false);
+		SPI_STATISTICS_INCREMENT_FIELD(ctlr->pcpu_statistics, spi_sync_immediate);
+		SPI_STATISTICS_INCREMENT_FIELD(spi->pcpu_statistics, spi_sync_immediate);
 
-		spin_unlock_irqrestore(&ctlr->bus_lock_spinlock, flags);
-	} else {
-		status = spi_async_locked(spi, message);
+		__spi_transfer_message_noqueue(ctlr, message);
+
+		return message->status;
 	}
 
+	/*
+	 * There are messages in the async queue that could have originated
+	 * from the same context, so we need to preserve ordering.
+	 * Therefor we send the message to the async queue and wait until they
+	 * are completed.
+	 */
+	message->complete = spi_complete;
+	message->context = &done;
+	status = spi_async_locked(spi, message);
 	if (status == 0) {
-		/* Push out the messages in the calling context if we can */
-		if (ctlr->transfer == spi_queued_transfer) {
-			SPI_STATISTICS_INCREMENT_FIELD(ctlr->pcpu_statistics,
-						       spi_sync_immediate);
-			SPI_STATISTICS_INCREMENT_FIELD(spi->pcpu_statistics,
-						       spi_sync_immediate);
-			__spi_pump_messages(ctlr, false);
-		}
-
 		wait_for_completion(&done);
 		status = message->status;
 	}
 	message->context = NULL;
+
 	return status;
 }
 
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 1a75c26742f2..74261a83b5fa 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -461,6 +461,8 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch
  * @irq_flags: Interrupt enable state during PTP system timestamping
  * @fallback: fallback to pio if dma transfer return failure with
  *	SPI_TRANS_FAIL_NO_START.
+ * @queue_empty: signal green light for opportunistically skipping the queue
+ *	for spi_sync transfers.
  *
  * Each SPI controller can communicate with one or more @spi_device
  * children.  These make a small bus, sharing MOSI, MISO and SCK signals
@@ -677,6 +679,9 @@ struct spi_controller {
 
 	/* Interrupt enable state during PTP system timestamping */
 	unsigned long		irq_flags;
+
+	/* Flag for enabling opportunistic skipping of the queue in spi_sync */
+	bool			queue_empty;
 };
 
 static inline void *spi_controller_get_devdata(struct spi_controller *ctlr)
@@ -986,6 +991,7 @@ struct spi_transfer {
  * @state: for use by whichever driver currently owns the message
  * @resources: for resource management when the spi message is processed
  * @prepared: spi_prepare_message was called for the this message
+ * @sync: this message took the direct sync path skipping the async queue
  *
  * A @spi_message is used to execute an atomic sequence of data transfers,
  * each represented by a struct spi_transfer.  The sequence is "atomic"
@@ -1037,7 +1043,10 @@ struct spi_message {
 	struct list_head        resources;
 
 	/* spi_prepare_message was called for this message */
-	bool                    prepared;
+	bool			prepared;
+
+	/* this message is skipping the async queue */
+	bool			sync;
 };
 
 static inline void spi_message_init_no_memset(struct spi_message *m)
-- 
cgit 


From 66a221593cb26dd6aabba63bcd18173f4e69c7ab Mon Sep 17 00:00:00 2001
From: David Jander <david@protonic.nl>
Date: Tue, 21 Jun 2022 08:12:30 +0200
Subject: spi: Remove the now unused ctlr->idling flag

The ctlr->idling flag is never checked now, so we don't need to set it
either.

Signed-off-by: David Jander <david@protonic.nl>
Link: https://lore.kernel.org/r/20220621061234.3626638-8-david@protonic.nl
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi.c       | 2 --
 include/linux/spi/spi.h | 2 --
 2 files changed, 4 deletions(-)

(limited to 'include')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 71b767a9ad77..52736e339645 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -1674,7 +1674,6 @@ static void __spi_pump_messages(struct spi_controller *ctlr, bool in_kthread)
 		}
 
 		ctlr->busy = false;
-		ctlr->idling = true;
 		spin_unlock_irqrestore(&ctlr->queue_lock, flags);
 
 		kfree(ctlr->dummy_rx);
@@ -1689,7 +1688,6 @@ static void __spi_pump_messages(struct spi_controller *ctlr, bool in_kthread)
 		trace_spi_controller_idle(ctlr);
 
 		spin_lock_irqsave(&ctlr->queue_lock, flags);
-		ctlr->idling = false;
 		ctlr->queue_empty = true;
 		goto out_unlock;
 	}
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 74261a83b5fa..c58f46be762f 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -383,7 +383,6 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch
  * @pump_messages: work struct for scheduling work to the message pump
  * @queue_lock: spinlock to syncronise access to message queue
  * @queue: message queue
- * @idling: the device is entering idle state
  * @cur_msg: the currently in-flight message
  * @cur_msg_mapped: message has been mapped for DMA
  * @last_cs: the last chip_select that is recorded by set_cs, -1 on non chip
@@ -616,7 +615,6 @@ struct spi_controller {
 	spinlock_t			queue_lock;
 	struct list_head		queue;
 	struct spi_message		*cur_msg;
-	bool				idling;
 	bool				busy;
 	bool				running;
 	bool				rt;
-- 
cgit 


From 69fa95905d40846756d22402690ddf5361a9d13b Mon Sep 17 00:00:00 2001
From: David Jander <david@protonic.nl>
Date: Tue, 21 Jun 2022 08:12:33 +0200
Subject: spi: Ensure the io_mutex is held until spi_finalize_current_message()

This patch introduces a completion that is completed in
spi_finalize_current_message() and waited for in
__spi_pump_transfer_message(). This way all manipulation of ctlr->cur_msg
is done with the io_mutex held and strictly ordered:
__spi_pump_transfer_message() will not return until
spi_finalize_current_message() is done using ctlr->cur_msg, and its
calling context is only touching ctlr->cur_msg after returning.
Due to this, we can safely drop the spin-locks around ctlr->cur_msg.

Signed-off-by: David Jander <david@protonic.nl>
Link: https://lore.kernel.org/r/20220621061234.3626638-11-david@protonic.nl
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi.c       | 32 ++++++++++++++------------------
 include/linux/spi/spi.h |  6 ++----
 2 files changed, 16 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 3df84f43918c..db08cb868652 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -1613,11 +1613,14 @@ static int __spi_pump_transfer_message(struct spi_controller *ctlr,
 		}
 	}
 
+	reinit_completion(&ctlr->cur_msg_completion);
 	ret = ctlr->transfer_one_message(ctlr, msg);
 	if (ret) {
 		dev_err(&ctlr->dev,
 			"failed to transfer one message from queue\n");
 		return ret;
+	} else {
+		wait_for_completion(&ctlr->cur_msg_completion);
 	}
 
 	return 0;
@@ -1704,6 +1707,12 @@ static void __spi_pump_messages(struct spi_controller *ctlr, bool in_kthread)
 	spin_unlock_irqrestore(&ctlr->queue_lock, flags);
 
 	ret = __spi_pump_transfer_message(ctlr, msg, was_busy);
+
+	if (!ret)
+		kthread_queue_work(ctlr->kworker, &ctlr->pump_messages);
+	ctlr->cur_msg = NULL;
+	ctlr->fallback = false;
+
 	mutex_unlock(&ctlr->io_mutex);
 
 	/* Prod the scheduler in case transfer_one() was busy waiting */
@@ -1897,12 +1906,9 @@ void spi_finalize_current_message(struct spi_controller *ctlr)
 {
 	struct spi_transfer *xfer;
 	struct spi_message *mesg;
-	unsigned long flags;
 	int ret;
 
-	spin_lock_irqsave(&ctlr->queue_lock, flags);
 	mesg = ctlr->cur_msg;
-	spin_unlock_irqrestore(&ctlr->queue_lock, flags);
 
 	if (!ctlr->ptp_sts_supported && !ctlr->transfer_one) {
 		list_for_each_entry(xfer, &mesg->transfers, transfer_list) {
@@ -1936,20 +1942,7 @@ void spi_finalize_current_message(struct spi_controller *ctlr)
 
 	mesg->prepared = false;
 
-	if (!mesg->sync) {
-		/*
-		 * This message was sent via the async message queue. Handle
-		 * the queue and kick the worker thread to do the
-		 * idling/shutdown or send the next message if needed.
-		 */
-		spin_lock_irqsave(&ctlr->queue_lock, flags);
-		WARN(ctlr->cur_msg != mesg,
-			"Finalizing queued message that is not the current head of queue!");
-		ctlr->cur_msg = NULL;
-		ctlr->fallback = false;
-		kthread_queue_work(ctlr->kworker, &ctlr->pump_messages);
-		spin_unlock_irqrestore(&ctlr->queue_lock, flags);
-	}
+	complete(&ctlr->cur_msg_completion);
 
 	trace_spi_message_done(mesg);
 
@@ -3036,6 +3029,7 @@ int spi_register_controller(struct spi_controller *ctlr)
 	}
 	ctlr->bus_lock_flag = 0;
 	init_completion(&ctlr->xfer_completion);
+	init_completion(&ctlr->cur_msg_completion);
 	if (!ctlr->max_dma_len)
 		ctlr->max_dma_len = INT_MAX;
 
@@ -3962,6 +3956,9 @@ static void __spi_transfer_message_noqueue(struct spi_controller *ctlr, struct s
 	if (ret)
 		goto out;
 
+	ctlr->cur_msg = NULL;
+	ctlr->fallback = false;
+
 	if (!was_busy) {
 		kfree(ctlr->dummy_rx);
 		ctlr->dummy_rx = NULL;
@@ -4013,7 +4010,6 @@ static int __spi_sync(struct spi_device *spi, struct spi_message *message)
 	 * will catch those cases.
 	 */
 	if (READ_ONCE(ctlr->queue_empty)) {
-		message->sync = true;
 		message->actual_length = 0;
 		message->status = -EINPROGRESS;
 
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index c58f46be762f..c56e0d240a58 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -384,6 +384,7 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch
  * @queue_lock: spinlock to syncronise access to message queue
  * @queue: message queue
  * @cur_msg: the currently in-flight message
+ * @cur_msg_completion: a completion for the current in-flight message
  * @cur_msg_mapped: message has been mapped for DMA
  * @last_cs: the last chip_select that is recorded by set_cs, -1 on non chip
  *           selected
@@ -615,6 +616,7 @@ struct spi_controller {
 	spinlock_t			queue_lock;
 	struct list_head		queue;
 	struct spi_message		*cur_msg;
+	struct completion               cur_msg_completion;
 	bool				busy;
 	bool				running;
 	bool				rt;
@@ -989,7 +991,6 @@ struct spi_transfer {
  * @state: for use by whichever driver currently owns the message
  * @resources: for resource management when the spi message is processed
  * @prepared: spi_prepare_message was called for the this message
- * @sync: this message took the direct sync path skipping the async queue
  *
  * A @spi_message is used to execute an atomic sequence of data transfers,
  * each represented by a struct spi_transfer.  The sequence is "atomic"
@@ -1042,9 +1043,6 @@ struct spi_message {
 
 	/* spi_prepare_message was called for this message */
 	bool			prepared;
-
-	/* this message is skipping the async queue */
-	bool			sync;
 };
 
 static inline void spi_message_init_no_memset(struct spi_message *m)
-- 
cgit 


From dc3029056b02414c29b6627e3dd7b16624725ae9 Mon Sep 17 00:00:00 2001
From: David Jander <david@protonic.nl>
Date: Tue, 21 Jun 2022 08:12:34 +0200
Subject: spi: opportunistically skip ctlr->cur_msg_completion

There are only a few drivers that do not call
spi_finalize_current_message() in the context of transfer_one_message(),
and even for those cases the completion ctlr->cur_msg_completion is not
needed always. The calls to complete() and wait_for_completion() each
take a spin-lock, which is costly. This patch makes it possible to avoid
those calls in the big majority of cases, by introducing two flags that
with the help of ordering via barriers can avoid using the completion
safely. In case of a race with the context calling
spi_finalize_current_message(), the scheme errs on the safe side and takes
the completion.
The impact of this patch is worth the effort: On a i.MX8MM SoC, the time
the SPI bus is idle between two consecutive calls to spi_sync(), is
reduced from 19.6us to 16.8us... roughly 15%.

Signed-off-by: David Jander <david@protonic.nl>
Link: https://lore.kernel.org/r/20220621061234.3626638-12-david@protonic.nl
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi.c       | 27 +++++++++++++++++++++++++--
 include/linux/spi/spi.h |  8 ++++++++
 2 files changed, 33 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index db08cb868652..ef37f043fd17 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -1613,14 +1613,34 @@ static int __spi_pump_transfer_message(struct spi_controller *ctlr,
 		}
 	}
 
+	/*
+	 * Drivers implementation of transfer_one_message() must arrange for
+	 * spi_finalize_current_message() to get called. Most drivers will do
+	 * this in the calling context, but some don't. For those cases, a
+	 * completion is used to guarantee that this function does not return
+	 * until spi_finalize_current_message() is done accessing
+	 * ctlr->cur_msg.
+	 * Use of the following two flags enable to opportunistically skip the
+	 * use of the completion since its use involves expensive spin locks.
+	 * In case of a race with the context that calls
+	 * spi_finalize_current_message() the completion will always be used,
+	 * due to strict ordering of these flags using barriers.
+	 */
+	WRITE_ONCE(ctlr->cur_msg_incomplete, true);
+	WRITE_ONCE(ctlr->cur_msg_need_completion, false);
 	reinit_completion(&ctlr->cur_msg_completion);
+	smp_wmb(); /* make these available to spi_finalize_current_message */
+
 	ret = ctlr->transfer_one_message(ctlr, msg);
 	if (ret) {
 		dev_err(&ctlr->dev,
 			"failed to transfer one message from queue\n");
 		return ret;
 	} else {
-		wait_for_completion(&ctlr->cur_msg_completion);
+		WRITE_ONCE(ctlr->cur_msg_need_completion, true);
+		smp_mb(); /* see spi_finalize_current_message()... */
+		if (READ_ONCE(ctlr->cur_msg_incomplete))
+			wait_for_completion(&ctlr->cur_msg_completion);
 	}
 
 	return 0;
@@ -1942,7 +1962,10 @@ void spi_finalize_current_message(struct spi_controller *ctlr)
 
 	mesg->prepared = false;
 
-	complete(&ctlr->cur_msg_completion);
+	WRITE_ONCE(ctlr->cur_msg_incomplete, false);
+	smp_mb(); /* See __spi_pump_transfer_message()... */
+	if (READ_ONCE(ctlr->cur_msg_need_completion))
+		complete(&ctlr->cur_msg_completion);
 
 	trace_spi_message_done(mesg);
 
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index c56e0d240a58..eb0d316e3c36 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -385,6 +385,12 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch
  * @queue: message queue
  * @cur_msg: the currently in-flight message
  * @cur_msg_completion: a completion for the current in-flight message
+ * @cur_msg_incomplete: Flag used internally to opportunistically skip
+ *	the @cur_msg_completion. This flag is used to check if the driver has
+ *	already called spi_finalize_current_message().
+ * @cur_msg_need_completion: Flag used internally to opportunistically skip
+ *	the @cur_msg_completion. This flag is used to signal the context that
+ *	is running spi_finalize_current_message() that it needs to complete()
  * @cur_msg_mapped: message has been mapped for DMA
  * @last_cs: the last chip_select that is recorded by set_cs, -1 on non chip
  *           selected
@@ -617,6 +623,8 @@ struct spi_controller {
 	struct list_head		queue;
 	struct spi_message		*cur_msg;
 	struct completion               cur_msg_completion;
+	bool				cur_msg_incomplete;
+	bool				cur_msg_need_completion;
 	bool				busy;
 	bool				running;
 	bool				rt;
-- 
cgit 


From 4a2dcc35911324d6fcde09b1760cf4f2962699ef Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Fri, 10 Jun 2022 12:58:23 -0700
Subject: block: introduce bdev_dma_alignment helper

Preparing for upcoming dma_alignment users that have a block_device, but
don't need the request_queue.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20220610195830.3574005-5-kbusch@fb.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2f7b43444c5f..2556fcdb645b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1365,6 +1365,11 @@ static inline int queue_dma_alignment(const struct request_queue *q)
 	return q ? q->dma_alignment : 511;
 }
 
+static inline unsigned int bdev_dma_alignment(struct block_device *bdev)
+{
+	return queue_dma_alignment(bdev_get_queue(bdev));
+}
+
 static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr,
 				 unsigned int len)
 {
-- 
cgit 


From cfa320f72882f0e944e2237287db84b0f7df877d Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Fri, 10 Jun 2022 12:58:27 -0700
Subject: iov: introduce iov_iter_aligned

The existing iov_iter_alignment() function returns the logical OR of
address and length. For cases where address and length need to be
considered separately, introduce a helper function that a caller can
specificy length and address masks that indicate if the iov is
unaligned.

Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20220610195830.3574005-9-kbusch@fb.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/uio.h |  2 ++
 lib/iov_iter.c      | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+)

(limited to 'include')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 739285fe5a2f..34ba4a731179 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -219,6 +219,8 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
 #endif
 
 size_t iov_iter_zero(size_t bytes, struct iov_iter *);
+bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
+			unsigned len_mask);
 unsigned long iov_iter_alignment(const struct iov_iter *i);
 unsigned long iov_iter_gap_alignment(const struct iov_iter *i);
 void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov,
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 0b64695ab632..507e732ef7cf 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1268,6 +1268,98 @@ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
 }
 EXPORT_SYMBOL(iov_iter_discard);
 
+static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
+				   unsigned len_mask)
+{
+	size_t size = i->count;
+	size_t skip = i->iov_offset;
+	unsigned k;
+
+	for (k = 0; k < i->nr_segs; k++, skip = 0) {
+		size_t len = i->iov[k].iov_len - skip;
+
+		if (len > size)
+			len = size;
+		if (len & len_mask)
+			return false;
+		if ((unsigned long)(i->iov[k].iov_base + skip) & addr_mask)
+			return false;
+
+		size -= len;
+		if (!size)
+			break;
+	}
+	return true;
+}
+
+static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask,
+				  unsigned len_mask)
+{
+	size_t size = i->count;
+	unsigned skip = i->iov_offset;
+	unsigned k;
+
+	for (k = 0; k < i->nr_segs; k++, skip = 0) {
+		size_t len = i->bvec[k].bv_len - skip;
+
+		if (len > size)
+			len = size;
+		if (len & len_mask)
+			return false;
+		if ((unsigned long)(i->bvec[k].bv_offset + skip) & addr_mask)
+			return false;
+
+		size -= len;
+		if (!size)
+			break;
+	}
+	return true;
+}
+
+/**
+ * iov_iter_is_aligned() - Check if the addresses and lengths of each segments
+ * 	are aligned to the parameters.
+ *
+ * @i: &struct iov_iter to restore
+ * @addr_mask: bit mask to check against the iov element's addresses
+ * @len_mask: bit mask to check against the iov element's lengths
+ *
+ * Return: false if any addresses or lengths intersect with the provided masks
+ */
+bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
+			 unsigned len_mask)
+{
+	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
+		return iov_iter_aligned_iovec(i, addr_mask, len_mask);
+
+	if (iov_iter_is_bvec(i))
+		return iov_iter_aligned_bvec(i, addr_mask, len_mask);
+
+	if (iov_iter_is_pipe(i)) {
+		unsigned int p_mask = i->pipe->ring_size - 1;
+		size_t size = i->count;
+
+		if (size & len_mask)
+			return false;
+		if (size && allocated(&i->pipe->bufs[i->head & p_mask])) {
+			if (i->iov_offset & addr_mask)
+				return false;
+		}
+
+		return true;
+	}
+
+	if (iov_iter_is_xarray(i)) {
+		if (i->count & len_mask)
+			return false;
+		if ((i->xarray_start + i->iov_offset) & addr_mask)
+			return false;
+	}
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(iov_iter_is_aligned);
+
 static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
 {
 	unsigned long res = 0;
-- 
cgit 


From 5debd9691c3ac64c3acd6867c264ad38bbe48cdc Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Fri, 10 Jun 2022 12:58:28 -0700
Subject: block: introduce bdev_iter_is_aligned helper

Provide a convenient function for this repeatable coding pattern.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20220610195830.3574005-10-kbusch@fb.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2556fcdb645b..0b8bc1fe0b2c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1370,6 +1370,13 @@ static inline unsigned int bdev_dma_alignment(struct block_device *bdev)
 	return queue_dma_alignment(bdev_get_queue(bdev));
 }
 
+static inline bool bdev_iter_is_aligned(struct block_device *bdev,
+					struct iov_iter *iter)
+{
+	return iov_iter_is_aligned(iter, bdev_dma_alignment(bdev),
+				   bdev_logical_block_size(bdev) - 1);
+}
+
 static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr,
 				 unsigned int len)
 {
-- 
cgit 


From b1a000d3b8ec582da64bb644be633e5a0beffcbf Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Fri, 10 Jun 2022 12:58:29 -0700
Subject: block: relax direct io memory alignment

Use the address alignment requirements from the block_device for direct
io instead of requiring addresses be aligned to the block size. User
space can discover the alignment requirements from the dma_alignment
queue attribute.

User space can specify any hardware compatible DMA offset for each
segment, but every segment length is still required to be a multiple of
the block size.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20220610195830.3574005-11-kbusch@fb.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c            | 9 +++++++++
 block/fops.c           | 4 ++--
 include/linux/blkdev.h | 5 +++++
 3 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/block/bio.c b/block/bio.c
index ee5fe1bb015e..933ea3210954 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1220,7 +1220,16 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
 	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
 
+	/*
+	 * Each segment in the iov is required to be a block size multiple.
+	 * However, we may not be able to get the entire segment if it spans
+	 * more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the
+	 * result to ensure the bio's total size is correct. The remainder of
+	 * the iov data will be picked up in the next bio iteration.
+	 */
 	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
+	if (size > 0)
+		size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
 	if (unlikely(size <= 0))
 		return size ? size : -EFAULT;
 
diff --git a/block/fops.c b/block/fops.c
index 9d32df6fc315..86d3cab9bf93 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -45,8 +45,8 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb)
 static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos,
 			      struct iov_iter *iter)
 {
-	return ((pos | iov_iter_alignment(iter)) &
-	    (bdev_logical_block_size(bdev) - 1));
+	return pos & (bdev_logical_block_size(bdev) - 1) ||
+		!bdev_iter_is_aligned(bdev, iter);
 }
 
 #define DIO_INLINE_BIO_VECS 4
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0b8bc1fe0b2c..886c44e97434 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -424,6 +424,11 @@ struct request_queue {
 	unsigned long		nr_requests;	/* Max # of requests */
 
 	unsigned int		dma_pad_mask;
+	/*
+	 * Drivers that set dma_alignment to less than 511 must be prepared to
+	 * handle individual bvec's that are not a multiple of a SECTOR_SIZE
+	 * due to possible offsets.
+	 */
 	unsigned int		dma_alignment;
 
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
-- 
cgit 


From 8689461be3f1ce6686bc26f1f379790bb0fc7a8c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 14 Jun 2022 11:09:29 +0200
Subject: block: factor out a chunk_size_left helper

Factor out a helper from blk_max_size_offset so that it can be reused
independently.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Pankaj Raghav <p.raghav@samsung.com>
Link: https://lore.kernel.org/r/20220614090934.570632-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 886c44e97434..283961257cc9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -934,6 +934,17 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
 	return q->limits.max_sectors;
 }
 
+/*
+ * Return how much of the chunk is left to be used for I/O at a given offset.
+ */
+static inline unsigned int blk_chunk_sectors_left(sector_t offset,
+		unsigned int chunk_sectors)
+{
+	if (unlikely(!is_power_of_2(chunk_sectors)))
+		return chunk_sectors - sector_div(offset, chunk_sectors);
+	return chunk_sectors - (offset & (chunk_sectors - 1));
+}
+
 /*
  * Return maximum size of a request at given offset. Only valid for
  * file system requests.
@@ -949,12 +960,8 @@ static inline unsigned int blk_max_size_offset(struct request_queue *q,
 			return q->limits.max_sectors;
 	}
 
-	if (likely(is_power_of_2(chunk_sectors)))
-		chunk_sectors -= offset & (chunk_sectors - 1);
-	else
-		chunk_sectors -= sector_div(offset, chunk_sectors);
-
-	return min(q->limits.max_sectors, chunk_sectors);
+	return min(q->limits.max_sectors,
+			blk_chunk_sectors_left(offset, chunk_sectors));
 }
 
 /*
-- 
cgit 


From efef739d5f37dc998b113fb965aea68d42a9eddc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 14 Jun 2022 11:09:33 +0200
Subject: block: fold blk_max_size_offset into get_max_io_size

Now that blk_max_size_offset has a single caller left, fold it into that
and clean up the naming convention for the local variables there.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Pankaj Raghav <p.raghav@samsung.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20220614090934.570632-6-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-merge.c      |  9 +++++++--
 include/linux/blkdev.h | 19 -------------------
 2 files changed, 7 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4da981efddee..0f5f42ebd0bb 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -166,9 +166,14 @@ static inline unsigned get_max_io_size(struct request_queue *q,
 {
 	unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT;
 	unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT;
-	unsigned max_sectors, start, end;
+	unsigned max_sectors = queue_max_sectors(q), start, end;
+
+	if (q->limits.chunk_sectors) {
+		max_sectors = min(max_sectors,
+			blk_chunk_sectors_left(bio->bi_iter.bi_sector,
+					       q->limits.chunk_sectors));
+	}
 
-	max_sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector, 0);
 	start = bio->bi_iter.bi_sector & (pbs - 1);
 	end = (start + max_sectors) & ~(pbs - 1);
 	if (end > start)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 283961257cc9..652c357dafb9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -945,25 +945,6 @@ static inline unsigned int blk_chunk_sectors_left(sector_t offset,
 	return chunk_sectors - (offset & (chunk_sectors - 1));
 }
 
-/*
- * Return maximum size of a request at given offset. Only valid for
- * file system requests.
- */
-static inline unsigned int blk_max_size_offset(struct request_queue *q,
-					       sector_t offset,
-					       unsigned int chunk_sectors)
-{
-	if (!chunk_sectors) {
-		if (q->limits.chunk_sectors)
-			chunk_sectors = q->limits.chunk_sectors;
-		else
-			return q->limits.max_sectors;
-	}
-
-	return min(q->limits.max_sectors,
-			blk_chunk_sectors_left(offset, chunk_sectors));
-}
-
 /*
  * Access functions for manipulating queue properties
  */
-- 
cgit 


From 2a9336c42a6abdcef564d522648a684a474a3483 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 14 Jun 2022 11:09:34 +0200
Subject: block: move blk_queue_get_max_sectors to blk.h

blk_queue_get_max_sectors is private to the block layer, so move it out
of blkdev.h.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20220614090934.570632-7-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk.h            | 13 +++++++++++++
 include/linux/blkdev.h | 13 -------------
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/block/blk.h b/block/blk.h
index 434017701403..8e79296ee97a 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -159,6 +159,19 @@ static inline bool blk_discard_mergable(struct request *req)
 	return false;
 }
 
+static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
+						     int op)
+{
+	if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE))
+		return min(q->limits.max_discard_sectors,
+			   UINT_MAX >> SECTOR_SHIFT);
+
+	if (unlikely(op == REQ_OP_WRITE_ZEROES))
+		return q->limits.max_write_zeroes_sectors;
+
+	return q->limits.max_sectors;
+}
+
 #ifdef CONFIG_BLK_DEV_INTEGRITY
 void blk_flush_integrity(void);
 bool __bio_integrity_endio(struct bio *);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 652c357dafb9..b2d42201bd5d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -921,19 +921,6 @@ static inline unsigned int bio_zone_is_seq(struct bio *bio)
 }
 #endif /* CONFIG_BLK_DEV_ZONED */
 
-static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
-						     int op)
-{
-	if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE))
-		return min(q->limits.max_discard_sectors,
-			   UINT_MAX >> SECTOR_SHIFT);
-
-	if (unlikely(op == REQ_OP_WRITE_ZEROES))
-		return q->limits.max_write_zeroes_sectors;
-
-	return q->limits.max_sectors;
-}
-
 /*
  * Return how much of the chunk is left to be used for I/O at a given offset.
  */
-- 
cgit 


From e589f46445960c274cc813a1cc8e2fc73b2a1849 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 23 Jun 2022 09:48:26 +0200
Subject: block: fix default IO priority handling again

Commit e70344c05995 ("block: fix default IO priority handling")
introduced an inconsistency in get_current_ioprio() that tasks without
IO context return IOPRIO_DEFAULT priority while tasks with freshly
allocated IO context will return 0 (IOPRIO_CLASS_NONE/0) IO priority.
Tasks without IO context used to be rare before 5a9d041ba2f6 ("block:
move io_context creation into where it's needed") but after this commit
they became common because now only BFQ IO scheduler setups task's IO
context. Similar inconsistency is there for get_task_ioprio() so this
inconsistency is now exposed to userspace and userspace will see
different IO priority for tasks operating on devices with BFQ compared
to devices without BFQ. Furthemore the changes done by commit
e70344c05995 change the behavior when no IO priority is set for BFQ IO
scheduler which is also documented in ioprio_set(2) manpage:

"If no I/O scheduler has been set for a thread, then by default the I/O
priority will follow the CPU nice value (setpriority(2)).  In Linux
kernels before version 2.6.24, once an I/O priority had been set using
ioprio_set(), there was no way to reset the I/O scheduling behavior to
the default. Since Linux 2.6.24, specifying ioprio as 0 can be used to
reset to the default I/O scheduling behavior."

So make sure we default to IOPRIO_CLASS_NONE as used to be the case
before commit e70344c05995. Also cleanup alloc_io_context() to
explicitely set this IO priority for the allocated IO context to avoid
future surprises. Note that we tweak ioprio_best() to maintain
ioprio_get(2) behavior and make this commit easily backportable.

CC: stable@vger.kernel.org
Fixes: e70344c05995 ("block: fix default IO priority handling")
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Tested-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20220623074840.5960-1-jack@suse.cz
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-ioc.c        | 2 ++
 block/ioprio.c         | 4 ++--
 include/linux/ioprio.h | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index df9cfe4ca532..63fc02042408 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -247,6 +247,8 @@ static struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
 	INIT_HLIST_HEAD(&ioc->icq_list);
 	INIT_WORK(&ioc->release_work, ioc_release_fn);
 #endif
+	ioc->ioprio = IOPRIO_DEFAULT;
+
 	return ioc;
 }
 
diff --git a/block/ioprio.c b/block/ioprio.c
index 2fe068fcaad5..2a34cbca18ae 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -157,9 +157,9 @@ out:
 int ioprio_best(unsigned short aprio, unsigned short bprio)
 {
 	if (!ioprio_valid(aprio))
-		aprio = IOPRIO_DEFAULT;
+		aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NORM);
 	if (!ioprio_valid(bprio))
-		bprio = IOPRIO_DEFAULT;
+		bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NORM);
 
 	return min(aprio, bprio);
 }
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 3f53bc27a19b..3d088a88f832 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -11,7 +11,7 @@
 /*
  * Default IO priority.
  */
-#define IOPRIO_DEFAULT	IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NORM)
+#define IOPRIO_DEFAULT	IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0)
 
 /*
  * Check that a priority value has a valid class.
-- 
cgit 


From f7eda402878b12bc0884c5bc1192a9e76ad121fb Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 23 Jun 2022 09:48:27 +0200
Subject: block: Return effective IO priority from get_current_ioprio()

get_current_ioprio() is used to initialize IO priority of various
requests. As such it should be returning the effective IO priority of
the task (i.e., reflecting the fact that unset IO priority should get
set based on task's CPU priority) so that the conversion is concentrated
in one place.

Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Tested-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20220623074840.5960-2-jack@suse.cz
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/ioprio.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 3d088a88f832..61ed6bb4998e 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -53,10 +53,17 @@ static inline int task_nice_ioclass(struct task_struct *task)
 static inline int get_current_ioprio(void)
 {
 	struct io_context *ioc = current->io_context;
+	int prio;
 
 	if (ioc)
-		return ioc->ioprio;
-	return IOPRIO_DEFAULT;
+		prio = ioc->ioprio;
+	else
+		prio = IOPRIO_DEFAULT;
+
+	if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE)
+		prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
+					 task_nice_ioprio(current));
+	return prio;
 }
 
 /*
-- 
cgit 


From 893e5d32d5832674bcf6465f27958e883b72b346 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 23 Jun 2022 09:48:28 +0200
Subject: block: Generalize get_current_ioprio() for any task

get_current_ioprio() operates only on current task. We will need the
same functionality for other tasks as well. Generalize
get_current_ioprio() for that and also move the bulk out of the header
file because it is large enough.

Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Tested-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20220623074840.5960-3-jack@suse.cz
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/ioprio.c         | 26 ++++++++++++++++++++++++++
 include/linux/ioprio.h | 26 ++++++++++----------------
 2 files changed, 36 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/block/ioprio.c b/block/ioprio.c
index 2a34cbca18ae..c4e3476155a1 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -138,6 +138,32 @@ out:
 	return ret;
 }
 
+/*
+ * If the task has set an I/O priority, use that. Otherwise, return
+ * the default I/O priority.
+ *
+ * Expected to be called for current task or with task_lock() held to keep
+ * io_context stable.
+ */
+int __get_task_ioprio(struct task_struct *p)
+{
+	struct io_context *ioc = p->io_context;
+	int prio;
+
+	if (p != current)
+		lockdep_assert_held(&p->alloc_lock);
+	if (ioc)
+		prio = ioc->ioprio;
+	else
+		prio = IOPRIO_DEFAULT;
+
+	if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE)
+		prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
+					 task_nice_ioprio(p));
+	return prio;
+}
+EXPORT_SYMBOL_GPL(__get_task_ioprio);
+
 static int get_task_ioprio(struct task_struct *p)
 {
 	int ret;
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 61ed6bb4998e..9752cf4a9c7c 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -46,24 +46,18 @@ static inline int task_nice_ioclass(struct task_struct *task)
 		return IOPRIO_CLASS_BE;
 }
 
-/*
- * If the calling process has set an I/O priority, use that. Otherwise, return
- * the default I/O priority.
- */
-static inline int get_current_ioprio(void)
+#ifdef CONFIG_BLOCK
+int __get_task_ioprio(struct task_struct *p);
+#else
+static inline int __get_task_ioprio(struct task_struct *p)
 {
-	struct io_context *ioc = current->io_context;
-	int prio;
-
-	if (ioc)
-		prio = ioc->ioprio;
-	else
-		prio = IOPRIO_DEFAULT;
+	return IOPRIO_DEFAULT;
+}
+#endif /* CONFIG_BLOCK */
 
-	if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE)
-		prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
-					 task_nice_ioprio(current));
-	return prio;
+static inline int get_current_ioprio(void)
+{
+	return __get_task_ioprio(current);
 }
 
 /*
-- 
cgit 


From fc25545e17bd74befe0b8ab2c65ac84936be5066 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 23 Jun 2022 09:48:29 +0200
Subject: block: Make ioprio_best() static

Nobody outside of block/ioprio.c uses it.

Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Tested-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20220623074840.5960-4-jack@suse.cz
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/ioprio.c         | 2 +-
 include/linux/ioprio.h | 5 -----
 2 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'include')

diff --git a/block/ioprio.c b/block/ioprio.c
index c4e3476155a1..8c46f672a0ba 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -180,7 +180,7 @@ out:
 	return ret;
 }
 
-int ioprio_best(unsigned short aprio, unsigned short bprio)
+static int ioprio_best(unsigned short aprio, unsigned short bprio)
 {
 	if (!ioprio_valid(aprio))
 		aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NORM);
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 9752cf4a9c7c..7578d4f6a969 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -60,11 +60,6 @@ static inline int get_current_ioprio(void)
 	return __get_task_ioprio(current);
 }
 
-/*
- * For inheritance, return the highest of the two given priorities
- */
-extern int ioprio_best(unsigned short aprio, unsigned short bprio);
-
 extern int set_task_ioprio(struct task_struct *task, int ioprio);
 
 #ifdef CONFIG_BLOCK
-- 
cgit 


From ab24a01b276508dc884761bcb8e2759c36702377 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Tue, 14 Jun 2022 10:50:54 +0300
Subject: tty: Add closing marker into comment in tty_ldisc.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The closing `` is missing. Add it.

Fixes: 6bb6fa6908eb ("tty: Implement lookahead to process XON/XOFF timely")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/9bc6d45d-48c8-519-1646-78ba22505b1f@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty_ldisc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h
index 33678e1936f6..ede6f2157f32 100644
--- a/include/linux/tty_ldisc.h
+++ b/include/linux/tty_ldisc.h
@@ -187,7 +187,7 @@ int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass,
  *	function for automatic flow control.
  *
  * @lookahead_buf: [DRV] ``void ()(struct tty_struct *tty,
- *			const unsigned char *cp, const char *fp, int count)
+ *			const unsigned char *cp, const char *fp, int count)``
  *
  *	This function is called by the low-level tty driver for characters
  *	not eaten by ->receive_buf() or ->receive_buf2(). It is useful for
-- 
cgit 


From f9008285bb69e4713918a665250ab2d356b731ba Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Mon, 13 Jun 2022 14:39:05 +0300
Subject: serial: Drop timeout from uart_port
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since commit 31f6bd7fad3b ("serial: Store character timing information
to uart_port"), per frame timing information is available on uart_port.
Uart port's timeout can be derived from frame_time by multiplying with
fifosize.

Most callers of uart_poll_timeout are not made under port's lock. To be
on the safe side, make sure frame_time is only accessed once. As
fifo_size is effectively a constant, it shouldn't cause any issues.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20220613113905.22962-1-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/driver-api/serial/driver.rst |  5 +++--
 drivers/tty/serial/mux.c                   |  6 ------
 drivers/tty/serial/serial_core.c           | 25 ++++++++++---------------
 include/linux/serial_core.h                | 16 ++++++++++++++--
 4 files changed, 27 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/serial/driver.rst b/Documentation/driver-api/serial/driver.rst
index 7ef83fd3917b..1e7ab4142d49 100644
--- a/Documentation/driver-api/serial/driver.rst
+++ b/Documentation/driver-api/serial/driver.rst
@@ -422,8 +422,9 @@ Other functions
 ---------------
 
 uart_update_timeout(port,cflag,baud)
-	Update the FIFO drain timeout, port->timeout, according to the
-	number of bits, parity, stop bits and baud rate.
+	Update the frame timing information according to the number of bits,
+	parity, stop bits and baud rate. The FIFO drain timeout is derived
+	from the frame timing information.
 
 	Locking: caller is expected to take port->lock
 
diff --git a/drivers/tty/serial/mux.c b/drivers/tty/serial/mux.c
index 643dfbcc43f9..0ba0f4d9459d 100644
--- a/drivers/tty/serial/mux.c
+++ b/drivers/tty/serial/mux.c
@@ -481,12 +481,6 @@ static int __init mux_probe(struct parisc_device *dev)
 		port->line	= port_cnt;
 		port->has_sysrq = IS_ENABLED(CONFIG_SERIAL_MUX_CONSOLE);
 
-		/* The port->timeout needs to match what is present in
-		 * uart_wait_until_sent in serial_core.c.  Otherwise
-		 * the time spent in msleep_interruptable will be very
-		 * long, causing the appearance of a console hang.
-		 */
-		port->timeout   = HZ / 50;
 		spin_lock_init(&port->lock);
 
 		status = uart_add_one_port(&mux_driver, port);
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 1368b0ef7d7f..75ece750bedc 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -327,13 +327,14 @@ static void uart_shutdown(struct tty_struct *tty, struct uart_state *state)
 }
 
 /**
- *	uart_update_timeout - update per-port FIFO timeout.
+ *	uart_update_timeout - update per-port frame timing information.
  *	@port:  uart_port structure describing the port
  *	@cflag: termios cflag value
  *	@baud:  speed of the port
  *
- *	Set the port FIFO timeout value.  The @cflag value should
- *	reflect the actual hardware settings.
+ *	Set the port frame timing information from which the FIFO timeout
+ *	value is derived. The @cflag value should reflect the actual hardware
+ *	settings.
  */
 void
 uart_update_timeout(struct uart_port *port, unsigned int cflag,
@@ -343,13 +344,6 @@ uart_update_timeout(struct uart_port *port, unsigned int cflag,
 	u64 frame_time;
 
 	frame_time = (u64)size * NSEC_PER_SEC;
-	size *= port->fifosize;
-
-	/*
-	 * Figure the timeout to send the above number of bits.
-	 * Add .02 seconds of slop
-	 */
-	port->timeout = (HZ * size) / baud + HZ/50;
 	port->frame_time = DIV64_U64_ROUND_UP(frame_time, baud);
 }
 EXPORT_SYMBOL(uart_update_timeout);
@@ -1698,7 +1692,7 @@ static void uart_wait_until_sent(struct tty_struct *tty, int timeout)
 {
 	struct uart_state *state = tty->driver_data;
 	struct uart_port *port;
-	unsigned long char_time, expire;
+	unsigned long char_time, expire, fifo_timeout;
 
 	port = uart_port_ref(state);
 	if (!port)
@@ -1728,12 +1722,13 @@ static void uart_wait_until_sent(struct tty_struct *tty, int timeout)
 		 * amount of time to send the entire FIFO, it probably won't
 		 * ever clear.  This assumes the UART isn't doing flow
 		 * control, which is currently the case.  Hence, if it ever
-		 * takes longer than port->timeout, this is probably due to a
+		 * takes longer than FIFO timeout, this is probably due to a
 		 * UART bug of some kind.  So, we clamp the timeout parameter at
-		 * 2*port->timeout.
+		 * 2 * FIFO timeout.
 		 */
-		if (timeout == 0 || timeout > 2 * port->timeout)
-			timeout = 2 * port->timeout;
+		fifo_timeout = uart_fifo_timeout(port);
+		if (timeout == 0 || timeout > 2 * fifo_timeout)
+			timeout = 2 * fifo_timeout;
 	}
 
 	expire = jiffies + timeout;
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 8032ffa741ed..faaf2372c60d 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -232,7 +232,6 @@ struct uart_port {
 
 	int			hw_stopped;		/* sw-assisted CTS flow state */
 	unsigned int		mctrl;			/* current modem ctrl settings */
-	unsigned int		timeout;		/* character-based timeout */
 	unsigned int		frame_time;		/* frame timing in ns */
 	unsigned int		type;			/* port type */
 	const struct uart_ops	*ops;
@@ -335,10 +334,23 @@ unsigned int uart_get_baud_rate(struct uart_port *port, struct ktermios *termios
 				unsigned int max);
 unsigned int uart_get_divisor(struct uart_port *port, unsigned int baud);
 
+/*
+ * Calculates FIFO drain time.
+ */
+static inline unsigned long uart_fifo_timeout(struct uart_port *port)
+{
+	u64 fifo_timeout = (u64)READ_ONCE(port->frame_time) * port->fifosize;
+
+	/* Add .02 seconds of slop */
+	fifo_timeout += 20 * NSEC_PER_MSEC;
+
+	return max(nsecs_to_jiffies(fifo_timeout), 1UL);
+}
+
 /* Base timer interval for polling */
 static inline int uart_poll_timeout(struct uart_port *port)
 {
-	int timeout = port->timeout;
+	int timeout = uart_fifo_timeout(port);
 
 	return timeout > 6 ? (timeout / 2 - 2) : 1;
 }
-- 
cgit 


From e23ee9d2c4ccb08fdfee3aea0a04a27bccd77433 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Fri, 24 Jun 2022 23:54:21 +0300
Subject: serial: Use bits for UART_LSR_BRK_ERROR_BITS/MSR_ANY_DELTA
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of listing the bits for UART_LSR_BRK_ERROR_BITS and
UART_MSR_ANY_DELTA in comment, use them to define instead.

Reviewed-by: Jiri Slaby <jirislaby@kernel.org>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20220624205424.12686-4-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/serial_reg.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/serial_reg.h b/include/uapi/linux/serial_reg.h
index f51bc8f36813..bab3b39266cc 100644
--- a/include/uapi/linux/serial_reg.h
+++ b/include/uapi/linux/serial_reg.h
@@ -139,7 +139,7 @@
 #define UART_LSR_PE		0x04 /* Parity error indicator */
 #define UART_LSR_OE		0x02 /* Overrun error indicator */
 #define UART_LSR_DR		0x01 /* Receiver data ready */
-#define UART_LSR_BRK_ERROR_BITS	0x1E /* BI, FE, PE, OE bits */
+#define UART_LSR_BRK_ERROR_BITS	(UART_LSR_BI|UART_LSR_FE|UART_LSR_PE|UART_LSR_OE)
 
 #define UART_MSR	6	/* In:  Modem Status Register */
 #define UART_MSR_DCD		0x80 /* Data Carrier Detect */
@@ -150,7 +150,7 @@
 #define UART_MSR_TERI		0x04 /* Trailing edge ring indicator */
 #define UART_MSR_DDSR		0x02 /* Delta DSR */
 #define UART_MSR_DCTS		0x01 /* Delta CTS */
-#define UART_MSR_ANY_DELTA	0x0F /* Any of the delta bits! */
+#define UART_MSR_ANY_DELTA	(UART_MSR_DDCD|UART_MSR_TERI|UART_MSR_DDSR|UART_MSR_DCTS)
 
 #define UART_SCR	7	/* I/O: Scratch Register */
 
-- 
cgit 


From eb47b59afb7e46c952d7b03884245364990d4910 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Fri, 24 Jun 2022 23:54:23 +0300
Subject: serial: Convert SERIAL_XMIT_SIZE to UART_XMIT_SIZE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both UART_XMIT_SIZE and SERIAL_XMIT_SIZE are defined. Make them all
UART_XMIT_SIZE.

Reviewed-by: Jiri Slaby <jirislaby@kernel.org>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20220624205424.12686-6-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/amiserial.c         | 18 +++++++++---------
 drivers/tty/mips_ejtag_fdc.c    |  2 +-
 drivers/tty/serial/meson_uart.c |  2 +-
 drivers/tty/serial/owl-uart.c   |  2 +-
 drivers/tty/serial/rda-uart.c   |  2 +-
 include/linux/serial.h          |  6 ------
 6 files changed, 13 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c
index afb2d373dd47..5458e2b1c125 100644
--- a/drivers/tty/amiserial.c
+++ b/drivers/tty/amiserial.c
@@ -51,6 +51,7 @@
 #include <linux/seq_file.h>
 #include <linux/serial.h>
 #include <linux/serial_reg.h>
+#include <linux/serial_core.h>
 #include <linux/sched.h>
 #include <linux/signal.h>
 #include <linux/slab.h>
@@ -283,12 +284,12 @@ static void transmit_chars(struct serial_state *info)
 
 	amiga_custom.serdat = info->xmit.buf[info->xmit.tail++] | 0x100;
 	mb();
-	info->xmit.tail = info->xmit.tail & (SERIAL_XMIT_SIZE-1);
+	info->xmit.tail = info->xmit.tail & (UART_XMIT_SIZE - 1);
 	info->icount.tx++;
 
 	if (CIRC_CNT(info->xmit.head,
 		     info->xmit.tail,
-		     SERIAL_XMIT_SIZE) < WAKEUP_CHARS)
+		     UART_XMIT_SIZE) < WAKEUP_CHARS)
 		tty_wakeup(info->tport.tty);
 
 #ifdef SERIAL_DEBUG_INTR
@@ -708,13 +709,13 @@ static int rs_put_char(struct tty_struct *tty, unsigned char ch)
 	local_irq_save(flags);
 	if (CIRC_SPACE(info->xmit.head,
 		       info->xmit.tail,
-		       SERIAL_XMIT_SIZE) == 0) {
+		       UART_XMIT_SIZE) == 0) {
 		local_irq_restore(flags);
 		return 0;
 	}
 
 	info->xmit.buf[info->xmit.head++] = ch;
-	info->xmit.head &= SERIAL_XMIT_SIZE-1;
+	info->xmit.head &= UART_XMIT_SIZE - 1;
 	local_irq_restore(flags);
 	return 1;
 }
@@ -753,15 +754,14 @@ static int rs_write(struct tty_struct * tty, const unsigned char *buf, int count
 	while (1) {
 		c = CIRC_SPACE_TO_END(info->xmit.head,
 				      info->xmit.tail,
-				      SERIAL_XMIT_SIZE);
+				      UART_XMIT_SIZE);
 		if (count < c)
 			c = count;
 		if (c <= 0) {
 			break;
 		}
 		memcpy(info->xmit.buf + info->xmit.head, buf, c);
-		info->xmit.head = ((info->xmit.head + c) &
-				   (SERIAL_XMIT_SIZE-1));
+		info->xmit.head = (info->xmit.head + c) & (UART_XMIT_SIZE - 1);
 		buf += c;
 		count -= c;
 		ret += c;
@@ -788,14 +788,14 @@ static unsigned int rs_write_room(struct tty_struct *tty)
 {
 	struct serial_state *info = tty->driver_data;
 
-	return CIRC_SPACE(info->xmit.head, info->xmit.tail, SERIAL_XMIT_SIZE);
+	return CIRC_SPACE(info->xmit.head, info->xmit.tail, UART_XMIT_SIZE);
 }
 
 static unsigned int rs_chars_in_buffer(struct tty_struct *tty)
 {
 	struct serial_state *info = tty->driver_data;
 
-	return CIRC_CNT(info->xmit.head, info->xmit.tail, SERIAL_XMIT_SIZE);
+	return CIRC_CNT(info->xmit.head, info->xmit.tail, UART_XMIT_SIZE);
 }
 
 static void rs_flush_buffer(struct tty_struct *tty)
diff --git a/drivers/tty/mips_ejtag_fdc.c b/drivers/tty/mips_ejtag_fdc.c
index 49907427a165..e81701a66429 100644
--- a/drivers/tty/mips_ejtag_fdc.c
+++ b/drivers/tty/mips_ejtag_fdc.c
@@ -916,7 +916,7 @@ static int mips_ejtag_fdc_tty_probe(struct mips_cdmm_device *dev)
 	mips_ejtag_fdc_write(priv, REG_FDCFG, cfg);
 
 	/* Make each port's xmit FIFO big enough to fill FDC TX FIFO */
-	priv->xmit_size = min(tx_fifo * 4, (unsigned int)SERIAL_XMIT_SIZE);
+	priv->xmit_size = min(tx_fifo * 4, (unsigned int)UART_XMIT_SIZE);
 
 	driver = tty_alloc_driver(NUM_TTY_CHANNELS, TTY_DRIVER_REAL_RAW);
 	if (IS_ERR(driver))
diff --git a/drivers/tty/serial/meson_uart.c b/drivers/tty/serial/meson_uart.c
index 4869c0059c98..6c8db19fd572 100644
--- a/drivers/tty/serial/meson_uart.c
+++ b/drivers/tty/serial/meson_uart.c
@@ -162,7 +162,7 @@ static void meson_uart_start_tx(struct uart_port *port)
 
 		ch = xmit->buf[xmit->tail];
 		writel(ch, port->membase + AML_UART_WFIFO);
-		xmit->tail = (xmit->tail+1) & (SERIAL_XMIT_SIZE - 1);
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
 		port->icount.tx++;
 	}
 
diff --git a/drivers/tty/serial/owl-uart.c b/drivers/tty/serial/owl-uart.c
index 44d20e5a7dd3..888e17e3f25f 100644
--- a/drivers/tty/serial/owl-uart.c
+++ b/drivers/tty/serial/owl-uart.c
@@ -201,7 +201,7 @@ static void owl_uart_send_chars(struct uart_port *port)
 
 		ch = xmit->buf[xmit->tail];
 		owl_uart_write(port, ch, OWL_UART_TXDAT);
-		xmit->tail = (xmit->tail + 1) & (SERIAL_XMIT_SIZE - 1);
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
 		port->icount.tx++;
 	}
 
diff --git a/drivers/tty/serial/rda-uart.c b/drivers/tty/serial/rda-uart.c
index f556b4955f59..feb2054aba37 100644
--- a/drivers/tty/serial/rda-uart.c
+++ b/drivers/tty/serial/rda-uart.c
@@ -353,7 +353,7 @@ static void rda_uart_send_chars(struct uart_port *port)
 
 		ch = xmit->buf[xmit->tail];
 		rda_uart_write(port, ch, RDA_UART_RXTX_BUFFER);
-		xmit->tail = (xmit->tail + 1) & (SERIAL_XMIT_SIZE - 1);
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
 		port->icount.tx++;
 	}
 
diff --git a/include/linux/serial.h b/include/linux/serial.h
index 0b8b7d7c8f33..70a9866e4abb 100644
--- a/include/linux/serial.h
+++ b/include/linux/serial.h
@@ -9,7 +9,6 @@
 #ifndef _LINUX_SERIAL_H
 #define _LINUX_SERIAL_H
 
-#include <asm/page.h>
 #include <uapi/linux/serial.h>
 
 /* Helper for dealing with UART_LCR_WLEN* defines */
@@ -25,11 +24,6 @@ struct async_icount {
 	__u32	buf_overrun;
 };
 
-/*
- * The size of the serial xmit buffer is 1 page, or 4096 bytes
- */
-#define SERIAL_XMIT_SIZE PAGE_SIZE
-
 #include <linux/compiler.h>
 
 #endif /* _LINUX_SERIAL_H */
-- 
cgit 


From 34619de1b8cb52afa90bbeb3b4fbad34c28f19cf Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Fri, 24 Jun 2022 23:54:24 +0300
Subject: serial: Consolidate BOTH_EMPTY use
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per file BOTH_EMPTY defines are littering our source code here and
there. Define once in serial.h and create helper for the check
too.

Reviewed-by: Jiri Slaby <jirislaby@kernel.org>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20220624205424.12686-7-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/mips/ath79/early_printk.c           |  9 +++++----
 drivers/accessibility/speakup/serialio.h |  3 +--
 drivers/tty/serial/8250/8250_early.c     |  4 +---
 drivers/tty/serial/8250/8250_port.c      | 12 +++++-------
 drivers/tty/serial/omap-serial.c         |  7 +++----
 drivers/tty/serial/pch_uart.c            |  7 +++----
 drivers/tty/serial/pxa.c                 |  5 ++---
 drivers/tty/serial/sunsu.c               |  4 +---
 drivers/tty/serial/vr41xx_siu.c          |  4 +---
 include/linux/serial.h                   |  9 +++++++++
 10 files changed, 31 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/arch/mips/ath79/early_printk.c b/arch/mips/ath79/early_printk.c
index 8751d067f98f..f6d02b425a10 100644
--- a/arch/mips/ath79/early_printk.c
+++ b/arch/mips/ath79/early_printk.c
@@ -8,6 +8,7 @@
 
 #include <linux/io.h>
 #include <linux/errno.h>
+#include <linux/serial.h>
 #include <linux/serial_reg.h>
 #include <asm/addrspace.h>
 #include <asm/setup.h>
@@ -29,15 +30,15 @@ static inline void prom_putchar_wait(void __iomem *reg, u32 mask, u32 val)
 	} while (1);
 }
 
-#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
-
 static void prom_putchar_ar71xx(char ch)
 {
 	void __iomem *base = (void __iomem *)(KSEG1ADDR(AR71XX_UART_BASE));
 
-	prom_putchar_wait(base + UART_LSR * 4, BOTH_EMPTY, BOTH_EMPTY);
+	prom_putchar_wait(base + UART_LSR * 4, UART_LSR_BOTH_EMPTY,
+			  UART_LSR_BOTH_EMPTY);
 	__raw_writel((unsigned char)ch, base + UART_TX * 4);
-	prom_putchar_wait(base + UART_LSR * 4, BOTH_EMPTY, BOTH_EMPTY);
+	prom_putchar_wait(base + UART_LSR * 4, UART_LSR_BOTH_EMPTY,
+			  UART_LSR_BOTH_EMPTY);
 }
 
 static void prom_putchar_ar933x(char ch)
diff --git a/drivers/accessibility/speakup/serialio.h b/drivers/accessibility/speakup/serialio.h
index 6f8f86f161bb..b4f9a1925b81 100644
--- a/drivers/accessibility/speakup/serialio.h
+++ b/drivers/accessibility/speakup/serialio.h
@@ -33,9 +33,8 @@ struct old_serial_port {
 #define NUM_DISABLE_TIMEOUTS 3
 /* buffer timeout in ms */
 #define SPK_TIMEOUT 100
-#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
 
 #define spk_serial_tx_busy() \
-	((inb(speakup_info.port_tts + UART_LSR) & BOTH_EMPTY) != BOTH_EMPTY)
+	(!uart_lsr_tx_empty(inb(speakup_info.port_tts + UART_LSR)))
 
 #endif
diff --git a/drivers/tty/serial/8250/8250_early.c b/drivers/tty/serial/8250/8250_early.c
index e52585064565..f271becfc46c 100644
--- a/drivers/tty/serial/8250/8250_early.c
+++ b/drivers/tty/serial/8250/8250_early.c
@@ -84,8 +84,6 @@ static void serial8250_early_out(struct uart_port *port, int offset, int value)
 	}
 }
 
-#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
-
 static void serial_putc(struct uart_port *port, unsigned char c)
 {
 	unsigned int status;
@@ -94,7 +92,7 @@ static void serial_putc(struct uart_port *port, unsigned char c)
 
 	for (;;) {
 		status = serial8250_early_in(port, UART_LSR);
-		if ((status & BOTH_EMPTY) == BOTH_EMPTY)
+		if (uart_lsr_tx_empty(status))
 			break;
 		cpu_relax();
 	}
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index 1311b00f8194..55b252954a92 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -50,8 +50,6 @@
 #define DEBUG_AUTOCONF(fmt...)	do { } while (0)
 #endif
 
-#define BOTH_EMPTY	(UART_LSR_TEMT | UART_LSR_THRE)
-
 /*
  * Here we define the default xmit fifo size used for each type of UART.
  */
@@ -1843,7 +1841,7 @@ void serial8250_tx_chars(struct uart_8250_port *up)
 		if (uart_circ_empty(xmit))
 			break;
 		if ((up->capabilities & UART_CAP_HFIFO) &&
-		    (serial_in(up, UART_LSR) & BOTH_EMPTY) != BOTH_EMPTY)
+		    !uart_lsr_tx_empty(serial_in(up, UART_LSR)))
 			break;
 		/* The BCM2835 MINI UART THRE bit is really a not-full bit. */
 		if ((up->capabilities & UART_CAP_MINI) &&
@@ -2003,7 +2001,7 @@ static unsigned int serial8250_tx_empty(struct uart_port *port)
 
 	serial8250_rpm_put(up);
 
-	return (lsr & BOTH_EMPTY) == BOTH_EMPTY ? TIOCSER_TEMT : 0;
+	return uart_lsr_tx_empty(lsr) ? TIOCSER_TEMT : 0;
 }
 
 unsigned int serial8250_do_get_mctrl(struct uart_port *port)
@@ -2151,7 +2149,7 @@ static void serial8250_put_poll_char(struct uart_port *port,
 	else
 		serial_port_out(port, UART_IER, 0);
 
-	wait_for_xmitr(up, BOTH_EMPTY);
+	wait_for_xmitr(up, UART_LSR_BOTH_EMPTY);
 	/*
 	 *	Send the character out.
 	 */
@@ -2161,7 +2159,7 @@ static void serial8250_put_poll_char(struct uart_port *port,
 	 *	Finally, wait for transmitter to become empty
 	 *	and restore the IER
 	 */
-	wait_for_xmitr(up, BOTH_EMPTY);
+	wait_for_xmitr(up, UART_LSR_BOTH_EMPTY);
 	serial_port_out(port, UART_IER, ier);
 	serial8250_rpm_put(up);
 }
@@ -3431,7 +3429,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
 	 *	Finally, wait for transmitter to become empty
 	 *	and restore the IER
 	 */
-	wait_for_xmitr(up, BOTH_EMPTY);
+	wait_for_xmitr(up, UART_LSR_BOTH_EMPTY);
 
 	if (em485) {
 		mdelay(port->rs485.delay_rts_after_send);
diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
index 98622c35d896..52cb1a68b053 100644
--- a/drivers/tty/serial/omap-serial.c
+++ b/drivers/tty/serial/omap-serial.c
@@ -19,6 +19,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/console.h>
+#include <linux/serial.h>
 #include <linux/serial_reg.h>
 #include <linux/delay.h>
 #include <linux/slab.h>
@@ -1102,8 +1103,6 @@ serial_omap_type(struct uart_port *port)
 	return up->name;
 }
 
-#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
-
 static void __maybe_unused wait_for_xmitr(struct uart_omap_port *up)
 {
 	unsigned int status, tmout = 10000;
@@ -1118,7 +1117,7 @@ static void __maybe_unused wait_for_xmitr(struct uart_omap_port *up)
 		if (--tmout == 0)
 			break;
 		udelay(1);
-	} while ((status & BOTH_EMPTY) != BOTH_EMPTY);
+	} while (!uart_lsr_tx_empty(status));
 
 	/* Wait up to 1s for flow control if necessary */
 	if (up->port.flags & UPF_CONS_FLOW) {
@@ -1186,7 +1185,7 @@ static void omap_serial_early_putc(struct uart_port *port, unsigned char c)
 
 	for (;;) {
 		status = omap_serial_early_in(port, UART_LSR);
-		if ((status & BOTH_EMPTY) == BOTH_EMPTY)
+		if (uart_lsr_tx_empty(status))
 			break;
 		cpu_relax();
 	}
diff --git a/drivers/tty/serial/pch_uart.c b/drivers/tty/serial/pch_uart.c
index 3b26524d48e3..8a9065e4a903 100644
--- a/drivers/tty/serial/pch_uart.c
+++ b/drivers/tty/serial/pch_uart.c
@@ -3,6 +3,7 @@
  *Copyright (C) 2011 LAPIS Semiconductor Co., Ltd.
  */
 #include <linux/kernel.h>
+#include <linux/serial.h>
 #include <linux/serial_reg.h>
 #include <linux/slab.h>
 #include <linux/module.h>
@@ -189,8 +190,6 @@ enum {
 #define PCH_UART_HAL_LOOP		(PCH_UART_MCR_LOOP)
 #define PCH_UART_HAL_AFE		(PCH_UART_MCR_AFE)
 
-#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
-
 #define DEFAULT_UARTCLK   1843200 /*   1.8432 MHz */
 #define CMITC_UARTCLK   192000000 /* 192.0000 MHz */
 #define FRI2_64_UARTCLK  64000000 /*  64.0000 MHz */
@@ -1516,7 +1515,7 @@ static void pch_uart_put_poll_char(struct uart_port *port,
 	 * Finally, wait for transmitter to become empty
 	 * and restore the IER
 	 */
-	wait_for_xmitr(priv, BOTH_EMPTY);
+	wait_for_xmitr(priv, UART_LSR_BOTH_EMPTY);
 	iowrite8(ier, priv->membase + UART_IER);
 }
 #endif /* CONFIG_CONSOLE_POLL */
@@ -1602,7 +1601,7 @@ pch_console_write(struct console *co, const char *s, unsigned int count)
 	 *	Finally, wait for transmitter to become empty
 	 *	and restore the IER
 	 */
-	wait_for_xmitr(priv, BOTH_EMPTY);
+	wait_for_xmitr(priv, UART_LSR_BOTH_EMPTY);
 	iowrite8(ier, priv->membase + UART_IER);
 
 	if (port_locked)
diff --git a/drivers/tty/serial/pxa.c b/drivers/tty/serial/pxa.c
index e80ba8e10407..9309ffd87c8e 100644
--- a/drivers/tty/serial/pxa.c
+++ b/drivers/tty/serial/pxa.c
@@ -23,6 +23,7 @@
 #include <linux/init.h>
 #include <linux/console.h>
 #include <linux/sysrq.h>
+#include <linux/serial.h>
 #include <linux/serial_reg.h>
 #include <linux/circ_buf.h>
 #include <linux/delay.h>
@@ -575,8 +576,6 @@ static struct uart_driver serial_pxa_reg;
 
 #ifdef CONFIG_SERIAL_PXA_CONSOLE
 
-#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
-
 /*
  *	Wait for transmitter & holding register to empty
  */
@@ -594,7 +593,7 @@ static void wait_for_xmitr(struct uart_pxa_port *up)
 		if (--tmout == 0)
 			break;
 		udelay(1);
-	} while ((status & BOTH_EMPTY) != BOTH_EMPTY);
+	} while (!uart_lsr_tx_empty(status));
 
 	/* Wait up to 1s for flow control if necessary */
 	if (up->port.flags & UPF_CONS_FLOW) {
diff --git a/drivers/tty/serial/sunsu.c b/drivers/tty/serial/sunsu.c
index fff50b5b82eb..84d545e5a8c7 100644
--- a/drivers/tty/serial/sunsu.c
+++ b/drivers/tty/serial/sunsu.c
@@ -1249,8 +1249,6 @@ static int sunsu_kbd_ms_init(struct uart_sunsu_port *up)
 
 #ifdef CONFIG_SERIAL_SUNSU_CONSOLE
 
-#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
-
 /*
  *	Wait for transmitter & holding register to empty
  */
@@ -1268,7 +1266,7 @@ static void wait_for_xmitr(struct uart_sunsu_port *up)
 		if (--tmout == 0)
 			break;
 		udelay(1);
-	} while ((status & BOTH_EMPTY) != BOTH_EMPTY);
+	} while (!uart_lsr_tx_empty(status));
 
 	/* Wait up to 1s for flow control if necessary */
 	if (up->port.flags & UPF_CONS_FLOW) {
diff --git a/drivers/tty/serial/vr41xx_siu.c b/drivers/tty/serial/vr41xx_siu.c
index e0bf003ca3a1..1ba689a81abd 100644
--- a/drivers/tty/serial/vr41xx_siu.c
+++ b/drivers/tty/serial/vr41xx_siu.c
@@ -703,8 +703,6 @@ static int siu_init_ports(struct platform_device *pdev)
 
 #ifdef CONFIG_SERIAL_VR41XX_CONSOLE
 
-#define BOTH_EMPTY	(UART_LSR_TEMT | UART_LSR_THRE)
-
 static void wait_for_xmitr(struct uart_port *port)
 {
 	int timeout = 10000;
@@ -715,7 +713,7 @@ static void wait_for_xmitr(struct uart_port *port)
 		if (lsr & UART_LSR_BI)
 			lsr_break_flag[port->line] = UART_LSR_BI;
 
-		if ((lsr & BOTH_EMPTY) == BOTH_EMPTY)
+		if (uart_lsr_tx_empty(lsr))
 			break;
 	} while (timeout-- > 0);
 
diff --git a/include/linux/serial.h b/include/linux/serial.h
index 70a9866e4abb..3d6fe3ef92cf 100644
--- a/include/linux/serial.h
+++ b/include/linux/serial.h
@@ -10,10 +10,19 @@
 #define _LINUX_SERIAL_H
 
 #include <uapi/linux/serial.h>
+#include <uapi/linux/serial_reg.h>
 
 /* Helper for dealing with UART_LCR_WLEN* defines */
 #define UART_LCR_WLEN(x)	((x) - 5)
 
+/* FIFO and shifting register empty */
+#define UART_LSR_BOTH_EMPTY	(UART_LSR_TEMT | UART_LSR_THRE)
+
+static inline bool uart_lsr_tx_empty(u16 lsr)
+{
+	return (lsr & UART_LSR_BOTH_EMPTY) == UART_LSR_BOTH_EMPTY;
+}
+
 /*
  * Counters of the input lines (CTS, DSR, RI, CD) interrupts
  */
-- 
cgit 


From f8ba5680a56be696b3f4343ed0a591abab807da4 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Fri, 24 Jun 2022 23:42:05 +0300
Subject: serial: 8250: make saved LSR larger
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

DW flags address received as BIT(8) in LSR. In order to not lose that
on read, enlarge lsr_saved_flags to u16.

Adjust lsr/status variables and related call chains to use u16.
Technically, some of these type conversion would not be needed but it
doesn't hurt to be consistent.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20220624204210.11112-2-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250.h         |  4 ++--
 drivers/tty/serial/8250/8250_exar.c    |  2 +-
 drivers/tty/serial/8250/8250_fsl.c     |  2 +-
 drivers/tty/serial/8250/8250_ingenic.c |  2 +-
 drivers/tty/serial/8250/8250_omap.c    |  7 +++----
 drivers/tty/serial/8250/8250_port.c    | 17 +++++++++--------
 include/linux/serial_8250.h            |  6 +++---
 7 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h
index b120da57c61f..0ff5688ba90c 100644
--- a/drivers/tty/serial/8250/8250.h
+++ b/drivers/tty/serial/8250/8250.h
@@ -133,9 +133,9 @@ static inline void serial_out(struct uart_8250_port *up, int offset, int value)
  *
  *	Returns LSR value or'ed with the preserved flags (if any).
  */
-static inline unsigned int serial_lsr_in(struct uart_8250_port *up)
+static inline u16 serial_lsr_in(struct uart_8250_port *up)
 {
-	unsigned int lsr = up->lsr_saved_flags;
+	u16 lsr = up->lsr_saved_flags;
 
 	lsr |= serial_in(up, UART_LSR);
 	up->lsr_saved_flags = lsr & LSR_SAVE_FLAGS;
diff --git a/drivers/tty/serial/8250/8250_exar.c b/drivers/tty/serial/8250/8250_exar.c
index 528779b40049..3d999eec4087 100644
--- a/drivers/tty/serial/8250/8250_exar.c
+++ b/drivers/tty/serial/8250/8250_exar.c
@@ -195,11 +195,11 @@ static int xr17v35x_startup(struct uart_port *port)
 
 static void exar_shutdown(struct uart_port *port)
 {
-	unsigned char lsr;
 	bool tx_complete = false;
 	struct uart_8250_port *up = up_to_u8250p(port);
 	struct circ_buf *xmit = &port->state->xmit;
 	int i = 0;
+	u16 lsr;
 
 	do {
 		lsr = serial_in(up, UART_LSR);
diff --git a/drivers/tty/serial/8250/8250_fsl.c b/drivers/tty/serial/8250/8250_fsl.c
index 9c01c531349d..fd4005fcd0d6 100644
--- a/drivers/tty/serial/8250/8250_fsl.c
+++ b/drivers/tty/serial/8250/8250_fsl.c
@@ -25,8 +25,8 @@
 
 int fsl8250_handle_irq(struct uart_port *port)
 {
-	unsigned char lsr, orig_lsr;
 	unsigned long flags;
+	u16 lsr, orig_lsr;
 	unsigned int iir;
 	struct uart_8250_port *up = up_to_u8250p(port);
 
diff --git a/drivers/tty/serial/8250/8250_ingenic.c b/drivers/tty/serial/8250/8250_ingenic.c
index cff91aa03f29..2b2f5d8d24b9 100644
--- a/drivers/tty/serial/8250/8250_ingenic.c
+++ b/drivers/tty/serial/8250/8250_ingenic.c
@@ -54,7 +54,7 @@ static void early_out(struct uart_port *port, int offset, uint8_t value)
 
 static void ingenic_early_console_putc(struct uart_port *port, unsigned char c)
 {
-	uint8_t lsr;
+	u16 lsr;
 
 	do {
 		lsr = early_in(port, UART_LSR);
diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c
index ac8bfa042391..0dcecbbc3967 100644
--- a/drivers/tty/serial/8250/8250_omap.c
+++ b/drivers/tty/serial/8250/8250_omap.c
@@ -1115,8 +1115,7 @@ static bool handle_rx_dma(struct uart_8250_port *up, unsigned int iir)
 	return omap_8250_rx_dma(up);
 }
 
-static unsigned char omap_8250_handle_rx_dma(struct uart_8250_port *up,
-					     u8 iir, unsigned char status)
+static u16 omap_8250_handle_rx_dma(struct uart_8250_port *up, u8 iir, u16 status)
 {
 	if ((status & (UART_LSR_DR | UART_LSR_BI)) &&
 	    (iir & UART_IIR_RDI)) {
@@ -1130,7 +1129,7 @@ static unsigned char omap_8250_handle_rx_dma(struct uart_8250_port *up,
 }
 
 static void am654_8250_handle_rx_dma(struct uart_8250_port *up, u8 iir,
-				     unsigned char status)
+				     u16 status)
 {
 	/*
 	 * Queue a new transfer if FIFO has data.
@@ -1164,7 +1163,7 @@ static int omap_8250_dma_handle_irq(struct uart_port *port)
 {
 	struct uart_8250_port *up = up_to_u8250p(port);
 	struct omap8250_priv *priv = up->port.private_data;
-	unsigned char status;
+	u16 status;
 	u8 iir;
 
 	serial8250_rpm_get(up);
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index 55b252954a92..c8ae0e8376d4 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -1502,7 +1502,7 @@ static inline void __stop_tx(struct uart_8250_port *p)
 	struct uart_8250_em485 *em485 = p->em485;
 
 	if (em485) {
-		unsigned char lsr = serial_lsr_in(p);
+		u16 lsr = serial_lsr_in(p);
 		u64 stop_delay = 0;
 
 		p->lsr_saved_flags |= lsr & LSR_SAVE_FLAGS;
@@ -1563,7 +1563,7 @@ static inline void __start_tx(struct uart_port *port)
 
 	if (serial8250_set_THRI(up)) {
 		if (up->bugs & UART_BUG_TXEN) {
-			unsigned char lsr = serial_lsr_in(up);
+			u16 lsr = serial_lsr_in(up);
 
 			if (lsr & UART_LSR_THRE)
 				serial8250_tx_chars(up);
@@ -1716,7 +1716,7 @@ static void serial8250_enable_ms(struct uart_port *port)
 	serial8250_rpm_put(up);
 }
 
-void serial8250_read_char(struct uart_8250_port *up, unsigned char lsr)
+void serial8250_read_char(struct uart_8250_port *up, u16 lsr)
 {
 	struct uart_port *port = &up->port;
 	unsigned char ch;
@@ -1785,7 +1785,7 @@ EXPORT_SYMBOL_GPL(serial8250_read_char);
  * (such as THRE) because the LSR value might come from an already consumed
  * character.
  */
-unsigned char serial8250_rx_chars(struct uart_8250_port *up, unsigned char lsr)
+u16 serial8250_rx_chars(struct uart_8250_port *up, u16 lsr)
 {
 	struct uart_port *port = &up->port;
 	int max_count = 256;
@@ -1905,10 +1905,10 @@ static bool handle_rx_dma(struct uart_8250_port *up, unsigned int iir)
  */
 int serial8250_handle_irq(struct uart_port *port, unsigned int iir)
 {
-	unsigned char status;
 	struct uart_8250_port *up = up_to_u8250p(port);
 	bool skip_rx = false;
 	unsigned long flags;
+	u16 status;
 
 	if (iir & UART_IIR_NO_INT)
 		return 0;
@@ -1991,7 +1991,7 @@ static unsigned int serial8250_tx_empty(struct uart_port *port)
 {
 	struct uart_8250_port *up = up_to_u8250p(port);
 	unsigned long flags;
-	unsigned int lsr;
+	u16 lsr;
 
 	serial8250_rpm_get(up);
 
@@ -2114,8 +2114,8 @@ static void wait_for_xmitr(struct uart_8250_port *up, int bits)
 static int serial8250_get_poll_char(struct uart_port *port)
 {
 	struct uart_8250_port *up = up_to_u8250p(port);
-	unsigned char lsr;
 	int status;
+	u16 lsr;
 
 	serial8250_rpm_get(up);
 
@@ -2170,8 +2170,9 @@ int serial8250_do_startup(struct uart_port *port)
 {
 	struct uart_8250_port *up = up_to_u8250p(port);
 	unsigned long flags;
-	unsigned char lsr, iir;
+	unsigned char iir;
 	int retval;
+	u16 lsr;
 
 	if (!port->fifosize)
 		port->fifosize = uart_config[port->type].fifo_size;
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index ff84a3ed10ea..4565f25ba9a2 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -119,7 +119,7 @@ struct uart_8250_port {
 	 * be immediately processed.
 	 */
 #define LSR_SAVE_FLAGS UART_LSR_BRK_ERROR_BITS
-	unsigned char		lsr_saved_flags;
+	u16			lsr_saved_flags;
 #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA
 	unsigned char		msr_saved_flags;
 
@@ -170,8 +170,8 @@ extern void serial8250_do_set_divisor(struct uart_port *port, unsigned int baud,
 				      unsigned int quot_frac);
 extern int fsl8250_handle_irq(struct uart_port *port);
 int serial8250_handle_irq(struct uart_port *port, unsigned int iir);
-unsigned char serial8250_rx_chars(struct uart_8250_port *up, unsigned char lsr);
-void serial8250_read_char(struct uart_8250_port *up, unsigned char lsr);
+u16 serial8250_rx_chars(struct uart_8250_port *up, u16 lsr);
+void serial8250_read_char(struct uart_8250_port *up, u16 lsr);
 void serial8250_tx_chars(struct uart_8250_port *up);
 unsigned int serial8250_modem_status(struct uart_8250_port *up);
 void serial8250_init_port(struct uart_8250_port *up);
-- 
cgit 


From 507bd6fbaaefcb8dd89bd00baddf00b439d30c51 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Fri, 24 Jun 2022 23:42:06 +0300
Subject: serial: 8250: create lsr_save_mask
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Allow drivers to alter LSR save mask.

Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20220624204210.11112-3-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250.h      | 2 +-
 drivers/tty/serial/8250/8250_core.c | 4 ++++
 drivers/tty/serial/8250/8250_dw.c   | 2 +-
 include/linux/serial_8250.h         | 1 +
 4 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h
index 0ff5688ba90c..5cc967fe3b59 100644
--- a/drivers/tty/serial/8250/8250.h
+++ b/drivers/tty/serial/8250/8250.h
@@ -138,7 +138,7 @@ static inline u16 serial_lsr_in(struct uart_8250_port *up)
 	u16 lsr = up->lsr_saved_flags;
 
 	lsr |= serial_in(up, UART_LSR);
-	up->lsr_saved_flags = lsr & LSR_SAVE_FLAGS;
+	up->lsr_saved_flags = lsr & up->lsr_save_mask;
 
 	return lsr;
 }
diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
index 90ddc8924811..57e86133af4f 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -1007,6 +1007,7 @@ int serial8250_register_8250_port(const struct uart_8250_port *up)
 		uart->port.rs485	= up->port.rs485;
 		uart->rs485_start_tx	= up->rs485_start_tx;
 		uart->rs485_stop_tx	= up->rs485_stop_tx;
+		uart->lsr_save_mask	= up->lsr_save_mask;
 		uart->dma		= up->dma;
 
 		/* Take tx_loadsz from fifosize if it wasn't set separately */
@@ -1094,6 +1095,9 @@ int serial8250_register_8250_port(const struct uart_8250_port *up)
 			ret = 0;
 		}
 
+		if (!uart->lsr_save_mask)
+			uart->lsr_save_mask = LSR_SAVE_FLAGS;	/* Use default LSR mask */
+
 		/* Initialise interrupt backoff work if required */
 		if (up->overrun_backoff_time_ms > 0) {
 			uart->overrun_backoff_time_ms =
diff --git a/drivers/tty/serial/8250/8250_dw.c b/drivers/tty/serial/8250/8250_dw.c
index 4cc69bb612ab..167a691c7b19 100644
--- a/drivers/tty/serial/8250/8250_dw.c
+++ b/drivers/tty/serial/8250/8250_dw.c
@@ -129,7 +129,7 @@ static void dw8250_tx_wait_empty(struct uart_port *p)
 
 	while (tries--) {
 		lsr = readb (p->membase + (UART_LSR << p->regshift));
-		up->lsr_saved_flags |= lsr & LSR_SAVE_FLAGS;
+		up->lsr_saved_flags |= lsr & up->lsr_save_mask;
 
 		if (lsr & UART_LSR_TEMT)
 			break;
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 4565f25ba9a2..8c7b793aa4d7 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -120,6 +120,7 @@ struct uart_8250_port {
 	 */
 #define LSR_SAVE_FLAGS UART_LSR_BRK_ERROR_BITS
 	u16			lsr_saved_flags;
+	u16			lsr_save_mask;
 #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA
 	unsigned char		msr_saved_flags;
 
-- 
cgit 


From ae50bb2752836277ae15aa4e9d99074d6d947946 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Fri, 24 Jun 2022 23:42:08 +0300
Subject: serial: take termios_rwsem for ->rs485_config() & pass termios as
 param
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To be able to alter ADDRB within ->rs485_config(), take termios_rwsem
before calling ->rs485_config() and pass termios.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20220624204210.11112-5-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250.h         |  3 ++-
 drivers/tty/serial/8250/8250_dwlib.c   |  3 ++-
 drivers/tty/serial/8250/8250_exar.c    |  9 +++++----
 drivers/tty/serial/8250/8250_fintek.c  |  2 +-
 drivers/tty/serial/8250/8250_lpc18xx.c |  2 +-
 drivers/tty/serial/8250/8250_pci.c     |  2 +-
 drivers/tty/serial/8250/8250_port.c    |  3 ++-
 drivers/tty/serial/amba-pl011.c        |  2 +-
 drivers/tty/serial/ar933x_uart.c       |  2 +-
 drivers/tty/serial/atmel_serial.c      |  2 +-
 drivers/tty/serial/fsl_lpuart.c        |  4 ++--
 drivers/tty/serial/imx.c               |  2 +-
 drivers/tty/serial/max310x.c           |  2 +-
 drivers/tty/serial/mcf.c               |  3 ++-
 drivers/tty/serial/omap-serial.c       |  3 ++-
 drivers/tty/serial/sc16is7xx.c         |  2 +-
 drivers/tty/serial/serial_core.c       | 14 ++++++++++----
 drivers/tty/serial/stm32-usart.c       |  2 +-
 include/linux/serial_core.h            |  1 +
 19 files changed, 38 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h
index 5cc967fe3b59..287153d32536 100644
--- a/drivers/tty/serial/8250/8250.h
+++ b/drivers/tty/serial/8250/8250.h
@@ -203,7 +203,8 @@ void serial8250_rpm_put(struct uart_8250_port *p);
 void serial8250_rpm_get_tx(struct uart_8250_port *p);
 void serial8250_rpm_put_tx(struct uart_8250_port *p);
 
-int serial8250_em485_config(struct uart_port *port, struct serial_rs485 *rs485);
+int serial8250_em485_config(struct uart_port *port, struct ktermios *termios,
+			    struct serial_rs485 *rs485);
 void serial8250_em485_start_tx(struct uart_8250_port *p);
 void serial8250_em485_stop_tx(struct uart_8250_port *p);
 void serial8250_em485_destroy(struct uart_8250_port *p);
diff --git a/drivers/tty/serial/8250/8250_dwlib.c b/drivers/tty/serial/8250/8250_dwlib.c
index c83e7eaf3877..d1ff3daeb0ba 100644
--- a/drivers/tty/serial/8250/8250_dwlib.c
+++ b/drivers/tty/serial/8250/8250_dwlib.c
@@ -85,7 +85,8 @@ void dw8250_do_set_termios(struct uart_port *p, struct ktermios *termios, struct
 }
 EXPORT_SYMBOL_GPL(dw8250_do_set_termios);
 
-static int dw8250_rs485_config(struct uart_port *p, struct serial_rs485 *rs485)
+static int dw8250_rs485_config(struct uart_port *p, struct ktermios *termios,
+			       struct serial_rs485 *rs485)
 {
 	u32 tcr;
 
diff --git a/drivers/tty/serial/8250/8250_exar.c b/drivers/tty/serial/8250/8250_exar.c
index 3d999eec4087..f5344cfe303c 100644
--- a/drivers/tty/serial/8250/8250_exar.c
+++ b/drivers/tty/serial/8250/8250_exar.c
@@ -112,7 +112,8 @@
 struct exar8250;
 
 struct exar8250_platform {
-	int (*rs485_config)(struct uart_port *, struct serial_rs485 *);
+	int (*rs485_config)(struct uart_port *port, struct ktermios *termios,
+			    struct serial_rs485 *rs485);
 	const struct serial_rs485 *rs485_supported;
 	int (*register_gpio)(struct pci_dev *, struct uart_8250_port *);
 	void (*unregister_gpio)(struct uart_8250_port *);
@@ -409,7 +410,7 @@ static void xr17v35x_unregister_gpio(struct uart_8250_port *port)
 	port->port.private_data = NULL;
 }
 
-static int generic_rs485_config(struct uart_port *port,
+static int generic_rs485_config(struct uart_port *port, struct ktermios *termios,
 				struct serial_rs485 *rs485)
 {
 	bool is_rs485 = !!(rs485->flags & SER_RS485_ENABLED);
@@ -441,7 +442,7 @@ static const struct exar8250_platform exar8250_default_platform = {
 	.rs485_supported = &generic_rs485_supported,
 };
 
-static int iot2040_rs485_config(struct uart_port *port,
+static int iot2040_rs485_config(struct uart_port *port, struct ktermios *termios,
 				struct serial_rs485 *rs485)
 {
 	bool is_rs485 = !!(rs485->flags & SER_RS485_ENABLED);
@@ -471,7 +472,7 @@ static int iot2040_rs485_config(struct uart_port *port,
 	value |= mode;
 	writeb(value, p + UART_EXAR_MPIOLVL_7_0);
 
-	return generic_rs485_config(port, rs485);
+	return generic_rs485_config(port, termios, rs485);
 }
 
 static const struct serial_rs485 iot2040_rs485_supported = {
diff --git a/drivers/tty/serial/8250/8250_fintek.c b/drivers/tty/serial/8250/8250_fintek.c
index 1fb86c73786c..eea693f5b577 100644
--- a/drivers/tty/serial/8250/8250_fintek.c
+++ b/drivers/tty/serial/8250/8250_fintek.c
@@ -191,7 +191,7 @@ static int fintek_8250_get_ldn_range(struct fintek_8250 *pdata, int *min,
 	return -ENODEV;
 }
 
-static int fintek_8250_rs485_config(struct uart_port *port,
+static int fintek_8250_rs485_config(struct uart_port *port, struct ktermios *termios,
 			      struct serial_rs485 *rs485)
 {
 	uint8_t config = 0;
diff --git a/drivers/tty/serial/8250/8250_lpc18xx.c b/drivers/tty/serial/8250/8250_lpc18xx.c
index 3a1cb51cbc91..d7cb3bb52069 100644
--- a/drivers/tty/serial/8250/8250_lpc18xx.c
+++ b/drivers/tty/serial/8250/8250_lpc18xx.c
@@ -32,7 +32,7 @@ struct lpc18xx_uart_data {
 	int line;
 };
 
-static int lpc18xx_rs485_config(struct uart_port *port,
+static int lpc18xx_rs485_config(struct uart_port *port, struct ktermios *termios,
 				struct serial_rs485 *rs485)
 {
 	struct uart_8250_port *up = up_to_u8250p(port);
diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c
index b6d71268aa7d..d31d2350a9db 100644
--- a/drivers/tty/serial/8250/8250_pci.c
+++ b/drivers/tty/serial/8250/8250_pci.c
@@ -1553,7 +1553,7 @@ pci_brcm_trumanage_setup(struct serial_private *priv,
 #define FINTEK_RTS_INVERT		BIT(5)
 
 /* We should do proper H/W transceiver setting before change to RS485 mode */
-static int pci_fintek_rs485_config(struct uart_port *port,
+static int pci_fintek_rs485_config(struct uart_port *port, struct ktermios *termios,
 			       struct serial_rs485 *rs485)
 {
 	struct pci_dev *pci_dev = to_pci_dev(port->dev);
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index c8ae0e8376d4..d4337d8346c8 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -664,7 +664,8 @@ EXPORT_SYMBOL_GPL(serial8250_em485_supported);
  * if the uart is incapable of driving RTS as a Transmit Enable signal in
  * hardware, relying on software emulation instead.
  */
-int serial8250_em485_config(struct uart_port *port, struct serial_rs485 *rs485)
+int serial8250_em485_config(struct uart_port *port, struct ktermios *termios,
+			    struct serial_rs485 *rs485)
 {
 	struct uart_8250_port *up = up_to_u8250p(port);
 
diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
index eccd66625d25..c8f52945a4aa 100644
--- a/drivers/tty/serial/amba-pl011.c
+++ b/drivers/tty/serial/amba-pl011.c
@@ -2197,7 +2197,7 @@ static int pl011_verify_port(struct uart_port *port, struct serial_struct *ser)
 	return ret;
 }
 
-static int pl011_rs485_config(struct uart_port *port,
+static int pl011_rs485_config(struct uart_port *port, struct ktermios *termios,
 			      struct serial_rs485 *rs485)
 {
 	struct uart_amba_port *uap =
diff --git a/drivers/tty/serial/ar933x_uart.c b/drivers/tty/serial/ar933x_uart.c
index ab2c5b2a1ce8..b73ce13683db 100644
--- a/drivers/tty/serial/ar933x_uart.c
+++ b/drivers/tty/serial/ar933x_uart.c
@@ -580,7 +580,7 @@ static const struct uart_ops ar933x_uart_ops = {
 	.verify_port	= ar933x_uart_verify_port,
 };
 
-static int ar933x_config_rs485(struct uart_port *port,
+static int ar933x_config_rs485(struct uart_port *port, struct ktermios *termios,
 				struct serial_rs485 *rs485conf)
 {
 	struct ar933x_uart_port *up =
diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c
index 3a94c2bdda72..bc6004679585 100644
--- a/drivers/tty/serial/atmel_serial.c
+++ b/drivers/tty/serial/atmel_serial.c
@@ -283,7 +283,7 @@ static void atmel_tasklet_schedule(struct atmel_uart_port *atmel_port,
 }
 
 /* Enable or disable the rs485 support */
-static int atmel_config_rs485(struct uart_port *port,
+static int atmel_config_rs485(struct uart_port *port, struct ktermios *termios,
 			      struct serial_rs485 *rs485conf)
 {
 	struct atmel_uart_port *atmel_port = to_atmel_uart_port(port);
diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c
index d35414cb3e4e..8fe0494d4057 100644
--- a/drivers/tty/serial/fsl_lpuart.c
+++ b/drivers/tty/serial/fsl_lpuart.c
@@ -1355,7 +1355,7 @@ static void lpuart_dma_rx_free(struct uart_port *port)
 	sport->dma_rx_cookie = -EINVAL;
 }
 
-static int lpuart_config_rs485(struct uart_port *port,
+static int lpuart_config_rs485(struct uart_port *port, struct ktermios *termios,
 			struct serial_rs485 *rs485)
 {
 	struct lpuart_port *sport = container_of(port,
@@ -1385,7 +1385,7 @@ static int lpuart_config_rs485(struct uart_port *port,
 	return 0;
 }
 
-static int lpuart32_config_rs485(struct uart_port *port,
+static int lpuart32_config_rs485(struct uart_port *port, struct ktermios *termios,
 			struct serial_rs485 *rs485)
 {
 	struct lpuart_port *sport = container_of(port,
diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c
index f4edde54175f..3457006cea3f 100644
--- a/drivers/tty/serial/imx.c
+++ b/drivers/tty/serial/imx.c
@@ -1907,7 +1907,7 @@ static void imx_uart_poll_put_char(struct uart_port *port, unsigned char c)
 #endif
 
 /* called with port.lock taken and irqs off or from .probe without locking */
-static int imx_uart_rs485_config(struct uart_port *port,
+static int imx_uart_rs485_config(struct uart_port *port, struct ktermios *termios,
 				 struct serial_rs485 *rs485conf)
 {
 	struct imx_port *sport = (struct imx_port *)port;
diff --git a/drivers/tty/serial/max310x.c b/drivers/tty/serial/max310x.c
index 4915a786e315..e162bfb44080 100644
--- a/drivers/tty/serial/max310x.c
+++ b/drivers/tty/serial/max310x.c
@@ -1036,7 +1036,7 @@ static void max310x_rs_proc(struct work_struct *ws)
 			MAX310X_MODE2_ECHOSUPR_BIT, mode2);
 }
 
-static int max310x_rs485_config(struct uart_port *port,
+static int max310x_rs485_config(struct uart_port *port, struct ktermios *termios,
 				struct serial_rs485 *rs485)
 {
 	struct max310x_one *one = to_max310x_port(port);
diff --git a/drivers/tty/serial/mcf.c b/drivers/tty/serial/mcf.c
index 036f178e3d66..73c5287b8e5e 100644
--- a/drivers/tty/serial/mcf.c
+++ b/drivers/tty/serial/mcf.c
@@ -431,7 +431,8 @@ static int mcf_verify_port(struct uart_port *port, struct serial_struct *ser)
 /****************************************************************************/
 
 /* Enable or disable the RS485 support */
-static int mcf_config_rs485(struct uart_port *port, struct serial_rs485 *rs485)
+static int mcf_config_rs485(struct uart_port *port, struct ktermios *termios,
+			    struct serial_rs485 *rs485)
 {
 	unsigned char mr1, mr2;
 
diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
index 52cb1a68b053..196bae704f85 100644
--- a/drivers/tty/serial/omap-serial.c
+++ b/drivers/tty/serial/omap-serial.c
@@ -1324,7 +1324,8 @@ static inline void serial_omap_add_console_port(struct uart_omap_port *up)
 
 /* Enable or disable the rs485 support */
 static int
-serial_omap_config_rs485(struct uart_port *port, struct serial_rs485 *rs485)
+serial_omap_config_rs485(struct uart_port *port, struct ktermios *termios,
+			 struct serial_rs485 *rs485)
 {
 	struct uart_omap_port *up = to_uart_omap_port(port);
 	unsigned int mode;
diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c
index 2ceecaa4a478..8cb92a3b3fb8 100644
--- a/drivers/tty/serial/sc16is7xx.c
+++ b/drivers/tty/serial/sc16is7xx.c
@@ -1127,7 +1127,7 @@ static void sc16is7xx_set_termios(struct uart_port *port,
 	spin_unlock_irqrestore(&port->lock, flags);
 }
 
-static int sc16is7xx_config_rs485(struct uart_port *port,
+static int sc16is7xx_config_rs485(struct uart_port *port, struct ktermios *termios,
 				  struct serial_rs485 *rs485)
 {
 	struct sc16is7xx_port *s = dev_get_drvdata(port->dev);
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 75ece750bedc..2529153c8979 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -1353,7 +1353,7 @@ int uart_rs485_config(struct uart_port *port)
 
 	uart_sanitize_serial_rs485(port, rs485);
 
-	ret = port->rs485_config(port, rs485);
+	ret = port->rs485_config(port, NULL, rs485);
 	if (ret)
 		memset(rs485, 0, sizeof(*rs485));
 
@@ -1377,7 +1377,7 @@ static int uart_get_rs485_config(struct uart_port *port,
 	return 0;
 }
 
-static int uart_set_rs485_config(struct uart_port *port,
+static int uart_set_rs485_config(struct tty_struct *tty, struct uart_port *port,
 			 struct serial_rs485 __user *rs485_user)
 {
 	struct serial_rs485 rs485;
@@ -1396,7 +1396,7 @@ static int uart_set_rs485_config(struct uart_port *port,
 	uart_sanitize_serial_rs485(port, &rs485);
 
 	spin_lock_irqsave(&port->lock, flags);
-	ret = port->rs485_config(port, &rs485);
+	ret = port->rs485_config(port, &tty->termios, &rs485);
 	if (!ret)
 		port->rs485 = rs485;
 	spin_unlock_irqrestore(&port->lock, flags);
@@ -1505,6 +1505,10 @@ uart_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg)
 	if (ret != -ENOIOCTLCMD)
 		goto out;
 
+	/* rs485_config requires more locking than others */
+	if (cmd == TIOCGRS485)
+		down_write(&tty->termios_rwsem);
+
 	mutex_lock(&port->mutex);
 	uport = uart_port_check(state);
 
@@ -1528,7 +1532,7 @@ uart_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg)
 		break;
 
 	case TIOCSRS485:
-		ret = uart_set_rs485_config(uport, uarg);
+		ret = uart_set_rs485_config(tty, uport, uarg);
 		break;
 
 	case TIOCSISO7816:
@@ -1545,6 +1549,8 @@ uart_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg)
 	}
 out_up:
 	mutex_unlock(&port->mutex);
+	if (cmd == TIOCGRS485)
+		up_write(&tty->termios_rwsem);
 out:
 	return ret;
 }
diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c
index db3dd9731ee1..13992e64a7df 100644
--- a/drivers/tty/serial/stm32-usart.c
+++ b/drivers/tty/serial/stm32-usart.c
@@ -97,7 +97,7 @@ static void stm32_usart_config_reg_rs485(u32 *cr1, u32 *cr3, u32 delay_ADE,
 	*cr1 |= rs485_deat_dedt;
 }
 
-static int stm32_usart_config_rs485(struct uart_port *port,
+static int stm32_usart_config_rs485(struct uart_port *port, struct ktermios *termios,
 				    struct serial_rs485 *rs485conf)
 {
 	struct stm32_port *stm32_port = to_stm32_port(port);
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index faaf2372c60d..b7b86ee3cb12 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -133,6 +133,7 @@ struct uart_port {
 				      unsigned int old);
 	void			(*handle_break)(struct uart_port *);
 	int			(*rs485_config)(struct uart_port *,
+						struct ktermios *termios,
 						struct serial_rs485 *rs485);
 	int			(*iso7816_config)(struct uart_port *,
 						  struct serial_iso7816 *iso7816);
-- 
cgit 


From 4f768e94774c58c9f7f54ebd38dadf172970046a Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Fri, 24 Jun 2022 23:42:09 +0300
Subject: serial: Support for RS-485 multipoint addresses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for RS-485 multipoint addressing using 9th bit [*]. The
addressing mode is configured through ->rs485_config().

ADDRB in termios indicates 9th bit addressing mode is enabled. In this
mode, 9th bit is used to indicate an address (byte) within the
communication line. ADDRB can only be enabled/disabled through
->rs485_config() that is also responsible for setting the destination and
receiver (filter) addresses.

Add traps to detect unwanted changes to struct serial_rs485 layout using
static_assert().

[*] Technically, RS485 is just an electronic spec and does not itself
specify the 9th bit addressing mode but 9th bit seems at least
"semi-standard" way to do addressing with RS485.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20220624204210.11112-6-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/driver-api/serial/driver.rst       |  2 ++
 Documentation/driver-api/serial/serial-rs485.rst | 26 +++++++++++++++++++++++-
 drivers/tty/serial/serial_core.c                 | 22 +++++++++++++++++++-
 drivers/tty/tty_ioctl.c                          |  4 ++++
 include/uapi/asm-generic/termbits-common.h       |  1 +
 include/uapi/linux/serial.h                      | 20 ++++++++++++++++--
 6 files changed, 71 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/serial/driver.rst b/Documentation/driver-api/serial/driver.rst
index 1e7ab4142d49..ee1679858aa2 100644
--- a/Documentation/driver-api/serial/driver.rst
+++ b/Documentation/driver-api/serial/driver.rst
@@ -261,6 +261,8 @@ hardware.
 			- parity enable
 		PARODD
 			- odd parity (when PARENB is in force)
+		ADDRB
+			- address bit (changed through .rs485_config()).
 		CREAD
 			- enable reception of characters (if not set,
 			  still receive characters from the port, but
diff --git a/Documentation/driver-api/serial/serial-rs485.rst b/Documentation/driver-api/serial/serial-rs485.rst
index 00b5d333acba..6ebad75c74ed 100644
--- a/Documentation/driver-api/serial/serial-rs485.rst
+++ b/Documentation/driver-api/serial/serial-rs485.rst
@@ -99,7 +99,31 @@ RS485 Serial Communications
 		/* Error handling. See errno. */
 	}
 
-5. References
+5. Multipoint Addressing
+========================
+
+   The Linux kernel provides addressing mode for multipoint RS-485 serial
+   communications line. The addressing mode is enabled with SER_RS485_ADDRB
+   flag in serial_rs485. Struct serial_rs485 has two additional flags and
+   fields for enabling receive and destination addresses.
+
+   Address mode flags:
+	- SER_RS485_ADDRB: Enabled addressing mode (sets also ADDRB in termios).
+	- SER_RS485_ADDR_RECV: Receive (filter) address enabled.
+	- SER_RS485_ADDR_DEST: Set destination address.
+
+   Address fields (enabled with corresponding SER_RS485_ADDR_* flag):
+	- addr_recv: Receive address.
+	- addr_dest: Destination address.
+
+   Once a receive address is set, the communication can occur only with the
+   particular device and other peers are filtered out. It is left up to the
+   receiver side to enforce the filtering. Receive address will be cleared
+   if SER_RS485_ADDR_RECV is not set.
+
+   Note: not all devices supporting RS485 support multipoint addressing.
+
+6. References
 =============
 
  [1]	include/uapi/linux/serial.h
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 2529153c8979..85ef7ef00b82 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -1288,6 +1288,17 @@ static int uart_check_rs485_flags(struct uart_port *port, struct serial_rs485 *r
 	if (flags & ~port->rs485_supported->flags)
 		return -EINVAL;
 
+	/* Asking for address w/o addressing mode? */
+	if (!(rs485->flags & SER_RS485_ADDRB) &&
+	    (rs485->flags & (SER_RS485_ADDR_RECV|SER_RS485_ADDR_DEST)))
+		return -EINVAL;
+
+	/* Address given but not enabled? */
+	if (!(rs485->flags & SER_RS485_ADDR_RECV) && rs485->addr_recv)
+		return -EINVAL;
+	if (!(rs485->flags & SER_RS485_ADDR_DEST) && rs485->addr_dest)
+		return -EINVAL;
+
 	return 0;
 }
 
@@ -1343,7 +1354,8 @@ static void uart_sanitize_serial_rs485(struct uart_port *port, struct serial_rs4
 	rs485->flags &= supported_flags;
 
 	/* Return clean padding area to userspace */
-	memset(rs485->padding, 0, sizeof(rs485->padding));
+	memset(rs485->padding0, 0, sizeof(rs485->padding0));
+	memset(rs485->padding1, 0, sizeof(rs485->padding1));
 }
 
 int uart_rs485_config(struct uart_port *port)
@@ -3402,5 +3414,13 @@ int uart_get_rs485_mode(struct uart_port *port)
 }
 EXPORT_SYMBOL_GPL(uart_get_rs485_mode);
 
+/* Compile-time assertions for serial_rs485 layout */
+static_assert(offsetof(struct serial_rs485, padding) ==
+              (offsetof(struct serial_rs485, delay_rts_after_send) + sizeof(__u32)));
+static_assert(offsetof(struct serial_rs485, padding1) ==
+	      offsetof(struct serial_rs485, padding[1]));
+static_assert((offsetof(struct serial_rs485, padding[4]) + sizeof(__u32)) ==
+	      sizeof(struct serial_rs485));
+
 MODULE_DESCRIPTION("Serial driver core");
 MODULE_LICENSE("GPL");
diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c
index adae687f654b..2a76b330e108 100644
--- a/drivers/tty/tty_ioctl.c
+++ b/drivers/tty/tty_ioctl.c
@@ -319,6 +319,8 @@ unsigned char tty_get_frame_size(unsigned int cflag)
 		bits++;
 	if (cflag & PARENB)
 		bits++;
+	if (cflag & ADDRB)
+		bits++;
 
 	return bits;
 }
@@ -353,6 +355,8 @@ int tty_set_termios(struct tty_struct *tty, struct ktermios *new_termios)
 	old_termios = tty->termios;
 	tty->termios = *new_termios;
 	unset_locked_termios(tty, &old_termios);
+	/* Reset any ADDRB changes, ADDRB is changed through ->rs485_config() */
+	tty->termios.c_cflag ^= (tty->termios.c_cflag ^ old_termios.c_cflag) & ADDRB;
 
 	if (tty->ops->set_termios)
 		tty->ops->set_termios(tty, &old_termios);
diff --git a/include/uapi/asm-generic/termbits-common.h b/include/uapi/asm-generic/termbits-common.h
index 4d084fe8def5..4a6a79f28b21 100644
--- a/include/uapi/asm-generic/termbits-common.h
+++ b/include/uapi/asm-generic/termbits-common.h
@@ -46,6 +46,7 @@ typedef unsigned int	speed_t;
 #define EXTA		B19200
 #define EXTB		B38400
 
+#define ADDRB		0x20000000	/* address bit */
 #define CMSPAR		0x40000000	/* mark or space (stick) parity */
 #define CRTSCTS		0x80000000	/* flow control */
 
diff --git a/include/uapi/linux/serial.h b/include/uapi/linux/serial.h
index fa6b16e5fdd8..cea06924b295 100644
--- a/include/uapi/linux/serial.h
+++ b/include/uapi/linux/serial.h
@@ -126,10 +126,26 @@ struct serial_rs485 {
 #define SER_RS485_TERMINATE_BUS		(1 << 5)	/* Enable bus
 							   termination
 							   (if supported) */
+
+/* RS-485 addressing mode */
+#define SER_RS485_ADDRB			(1 << 6)	/* Enable addressing mode */
+#define SER_RS485_ADDR_RECV		(1 << 7)	/* Receive address filter */
+#define SER_RS485_ADDR_DEST		(1 << 8)	/* Destination address */
+
 	__u32	delay_rts_before_send;	/* Delay before send (milliseconds) */
 	__u32	delay_rts_after_send;	/* Delay after send (milliseconds) */
-	__u32	padding[5];		/* Memory is cheap, new structs
-					   are a royal PITA .. */
+
+	/* The fields below are defined by flags */
+	union {
+		__u32	padding[5];		/* Memory is cheap, new structs are a pain */
+
+		struct {
+			__u8	addr_recv;
+			__u8	addr_dest;
+			__u8	padding0[2];
+			__u32	padding1[4];
+		};
+	};
 };
 
 /*
-- 
cgit 


From ec5ad331680c96ef3dd30dc297b206988023b9e1 Mon Sep 17 00:00:00 2001
From: Max Staudt <max@enpas.org>
Date: Sat, 18 Jun 2022 20:01:34 +0200
Subject: tty: Add N_CAN327 line discipline ID for ELM327 based CAN driver

The actual driver will be added via the CAN tree.

Acked-by: Marc Kleine-Budde <mkl@pengutronix.de>
Signed-off-by: Max Staudt <max@enpas.org>
Link: https://lore.kernel.org/r/20220618180134.9890-1-max@enpas.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/tty.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/tty.h b/include/uapi/linux/tty.h
index 9d0f06bfbac3..68aeae2addec 100644
--- a/include/uapi/linux/tty.h
+++ b/include/uapi/linux/tty.h
@@ -38,8 +38,9 @@
 #define N_NULL		27	/* Null ldisc used for error handling */
 #define N_MCTP		28	/* MCTP-over-serial */
 #define N_DEVELOPMENT	29	/* Manual out-of-tree testing */
+#define N_CAN327	30	/* ELM327 based OBD-II interfaces */
 
 /* Always the newest line discipline + 1 */
-#define NR_LDISCS	30
+#define NR_LDISCS	31
 
 #endif /* _UAPI_LINUX_TTY_H */
-- 
cgit 


From 4d0548a7b806a78ba253f1389b9ecdcaca47d583 Mon Sep 17 00:00:00 2001
From: Seth Forshee <sforshee@digitalocean.com>
Date: Mon, 27 Jun 2022 08:01:58 -0500
Subject: mnt_idmapping: return false when comparing two invalid ids

INVALID_VFS{U,G}ID represent ids which have no mapping in the target
mnt_usersns. This can happen for a couple of different reasons -- the
source id might be valid but has no mapping in mnt_userns, or the source
id might have been invalid (either due to a failed mapping or because it
was set to invalid to indicate it is uninitialized).

This means that two arbitrary vfs{u,g}ids which are both invalid could
represent two different underlying ids, or they could represent a failed
mapping and an uninitialized value. In these situation the vfs{u,g}id
equality functions evaluate these ids as equal, and care must be taken
when comparing ids to avoid problems. It would be less error prone to
always evaluate two invalid ids as not equal to each other, and to check
explicitly for vfs{u,g}id validity when that is needed.

Change all vfs{u,g}id equality functions to return false when both ids
are invalid. Functions for checking whether an id is valid exist and are
already being used by code which needs to check this.

Link: https://lore.kernel.org/linux-fsdevel/YrIMZirGoE0VIO45@do-x1extreme
Signed-off-by: Seth Forshee <sforshee@digitalocean.com>
Reviewed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 include/linux/mnt_idmapping.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h
index 6a752b61088b..21bd22a7b326 100644
--- a/include/linux/mnt_idmapping.h
+++ b/include/linux/mnt_idmapping.h
@@ -60,12 +60,12 @@ static inline bool vfsgid_valid(vfsgid_t gid)
 
 static inline bool vfsuid_eq(vfsuid_t left, vfsuid_t right)
 {
-	return __vfsuid_val(left) == __vfsuid_val(right);
+	return vfsuid_valid(left) && __vfsuid_val(left) == __vfsuid_val(right);
 }
 
 static inline bool vfsgid_eq(vfsgid_t left, vfsgid_t right)
 {
-	return __vfsgid_val(left) == __vfsgid_val(right);
+	return vfsgid_valid(left) && __vfsgid_val(left) == __vfsgid_val(right);
 }
 
 /**
@@ -76,10 +76,11 @@ static inline bool vfsgid_eq(vfsgid_t left, vfsgid_t right)
  * Check whether @kuid and @vfsuid have the same values.
  *
  * Return: true if @kuid and @vfsuid have the same value, false if not.
+ * Comparison between two invalid uids returns false.
  */
 static inline bool vfsuid_eq_kuid(vfsuid_t vfsuid, kuid_t kuid)
 {
-	return __vfsuid_val(vfsuid) == __kuid_val(kuid);
+	return vfsuid_valid(vfsuid) && __vfsuid_val(vfsuid) == __kuid_val(kuid);
 }
 
 /**
@@ -90,10 +91,11 @@ static inline bool vfsuid_eq_kuid(vfsuid_t vfsuid, kuid_t kuid)
  * Check whether @kgid and @vfsgid have the same values.
  *
  * Return: true if @kgid and @vfsgid have the same value, false if not.
+ * Comparison between two invalid gids returns false.
  */
 static inline bool vfsgid_eq_kgid(kgid_t kgid, vfsgid_t vfsgid)
 {
-	return __vfsgid_val(vfsgid) == __kgid_val(kgid);
+	return vfsgid_valid(vfsgid) && __vfsgid_val(vfsgid) == __kgid_val(kgid);
 }
 
 /*
-- 
cgit 


From 9864bb4801331daa48514face9d0f4861e4d485b Mon Sep 17 00:00:00 2001
From: Li Li <dualli@google.com>
Date: Thu, 26 May 2022 15:00:18 -0700
Subject: Binder: add TF_UPDATE_TXN to replace outdated txn

When the target process is busy, incoming oneway transactions are
queued in the async_todo list. If the clients continue sending extra
oneway transactions while the target process is frozen, this queue can
become too large to accommodate new transactions. That's why binder
driver introduced ONEWAY_SPAM_DETECTION to detect this situation. It's
helpful to debug the async binder buffer exhausting issue, but the
issue itself isn't solved directly.

In real cases applications are designed to send oneway transactions
repeatedly, delivering updated inforamtion to the target process.
Typical examples are Wi-Fi signal strength and some real time sensor
data. Even if the apps might only care about the lastet information,
all outdated oneway transactions are still accumulated there until the
frozen process is thawed later. For this kind of situations, there's
no existing method to skip those outdated transactions and deliver the
latest one only.

This patch introduces a new transaction flag TF_UPDATE_TXN. To use it,
use apps can set this new flag along with TF_ONE_WAY. When such an
oneway transaction is to be queued into the async_todo list of a frozen
process, binder driver will check if any previous pending transactions
can be superseded by comparing their code, flags and target node. If
such an outdated pending transaction is found, the latest transaction
will supersede that outdated one. This effectively prevents the async
binder buffer running out and saves unnecessary binder read workloads.

Acked-by: Todd Kjos <tkjos@google.com>
Signed-off-by: Li Li <dualli@google.com>
Link: https://lore.kernel.org/r/20220526220018.3334775-2-dualli@chromium.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/android/binder.c            | 85 +++++++++++++++++++++++++++++++++++--
 drivers/android/binder_trace.h      |  4 ++
 include/uapi/linux/android/binder.h |  1 +
 3 files changed, 87 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 362c0deb65f1..d4f84f25c30b 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2626,6 +2626,56 @@ static int binder_fixup_parent(struct list_head *pf_head,
 	return binder_add_fixup(pf_head, buffer_offset, bp->buffer, 0);
 }
 
+/**
+ * binder_can_update_transaction() - Can a txn be superseded by an updated one?
+ * @t1: the pending async txn in the frozen process
+ * @t2: the new async txn to supersede the outdated pending one
+ *
+ * Return:  true if t2 can supersede t1
+ *          false if t2 can not supersede t1
+ */
+static bool binder_can_update_transaction(struct binder_transaction *t1,
+					  struct binder_transaction *t2)
+{
+	if ((t1->flags & t2->flags & (TF_ONE_WAY | TF_UPDATE_TXN)) !=
+	    (TF_ONE_WAY | TF_UPDATE_TXN) || !t1->to_proc || !t2->to_proc)
+		return false;
+	if (t1->to_proc->tsk == t2->to_proc->tsk && t1->code == t2->code &&
+	    t1->flags == t2->flags && t1->buffer->pid == t2->buffer->pid &&
+	    t1->buffer->target_node->ptr == t2->buffer->target_node->ptr &&
+	    t1->buffer->target_node->cookie == t2->buffer->target_node->cookie)
+		return true;
+	return false;
+}
+
+/**
+ * binder_find_outdated_transaction_ilocked() - Find the outdated transaction
+ * @t:		 new async transaction
+ * @target_list: list to find outdated transaction
+ *
+ * Return: the outdated transaction if found
+ *         NULL if no outdated transacton can be found
+ *
+ * Requires the proc->inner_lock to be held.
+ */
+static struct binder_transaction *
+binder_find_outdated_transaction_ilocked(struct binder_transaction *t,
+					 struct list_head *target_list)
+{
+	struct binder_work *w;
+
+	list_for_each_entry(w, target_list, entry) {
+		struct binder_transaction *t_queued;
+
+		if (w->type != BINDER_WORK_TRANSACTION)
+			continue;
+		t_queued = container_of(w, struct binder_transaction, work);
+		if (binder_can_update_transaction(t_queued, t))
+			return t_queued;
+	}
+	return NULL;
+}
+
 /**
  * binder_proc_transaction() - sends a transaction to a process and wakes it up
  * @t:		transaction to send
@@ -2651,6 +2701,7 @@ static int binder_proc_transaction(struct binder_transaction *t,
 	struct binder_node *node = t->buffer->target_node;
 	bool oneway = !!(t->flags & TF_ONE_WAY);
 	bool pending_async = false;
+	struct binder_transaction *t_outdated = NULL;
 
 	BUG_ON(!node);
 	binder_node_lock(node);
@@ -2678,12 +2729,24 @@ static int binder_proc_transaction(struct binder_transaction *t,
 	if (!thread && !pending_async)
 		thread = binder_select_thread_ilocked(proc);
 
-	if (thread)
+	if (thread) {
 		binder_enqueue_thread_work_ilocked(thread, &t->work);
-	else if (!pending_async)
+	} else if (!pending_async) {
 		binder_enqueue_work_ilocked(&t->work, &proc->todo);
-	else
+	} else {
+		if ((t->flags & TF_UPDATE_TXN) && proc->is_frozen) {
+			t_outdated = binder_find_outdated_transaction_ilocked(t,
+									      &node->async_todo);
+			if (t_outdated) {
+				binder_debug(BINDER_DEBUG_TRANSACTION,
+					     "txn %d supersedes %d\n",
+					     t->debug_id, t_outdated->debug_id);
+				list_del_init(&t_outdated->work.entry);
+				proc->outstanding_txns--;
+			}
+		}
 		binder_enqueue_work_ilocked(&t->work, &node->async_todo);
+	}
 
 	if (!pending_async)
 		binder_wakeup_thread_ilocked(proc, thread, !oneway /* sync */);
@@ -2692,6 +2755,22 @@ static int binder_proc_transaction(struct binder_transaction *t,
 	binder_inner_proc_unlock(proc);
 	binder_node_unlock(node);
 
+	/*
+	 * To reduce potential contention, free the outdated transaction and
+	 * buffer after releasing the locks.
+	 */
+	if (t_outdated) {
+		struct binder_buffer *buffer = t_outdated->buffer;
+
+		t_outdated->buffer = NULL;
+		buffer->transaction = NULL;
+		trace_binder_transaction_update_buffer_release(buffer);
+		binder_transaction_buffer_release(proc, NULL, buffer, 0, 0);
+		binder_alloc_free_buf(&proc->alloc, buffer);
+		kfree(t_outdated);
+		binder_stats_deleted(BINDER_STAT_TRANSACTION);
+	}
+
 	return 0;
 }
 
diff --git a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h
index 8eeccdc64724..8cc07e6a4273 100644
--- a/drivers/android/binder_trace.h
+++ b/drivers/android/binder_trace.h
@@ -311,6 +311,10 @@ DEFINE_EVENT(binder_buffer_class, binder_transaction_failed_buffer_release,
 	TP_PROTO(struct binder_buffer *buffer),
 	TP_ARGS(buffer));
 
+DEFINE_EVENT(binder_buffer_class, binder_transaction_update_buffer_release,
+	     TP_PROTO(struct binder_buffer *buffer),
+	     TP_ARGS(buffer));
+
 TRACE_EVENT(binder_update_page_range,
 	TP_PROTO(struct binder_alloc *alloc, bool allocate,
 		 void __user *start, void __user *end),
diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h
index 986333cf5bbe..e72e4de8f452 100644
--- a/include/uapi/linux/android/binder.h
+++ b/include/uapi/linux/android/binder.h
@@ -287,6 +287,7 @@ enum transaction_flags {
 	TF_STATUS_CODE	= 0x08,	/* contents are a 32-bit status code */
 	TF_ACCEPT_FDS	= 0x10,	/* allow replies with file descriptors */
 	TF_CLEAR_BUF	= 0x20,	/* clear buffer on txn complete */
+	TF_UPDATE_TXN	= 0x40,	/* update the outdated pending async txn */
 };
 
 struct binder_transaction_data {
-- 
cgit 


From 713eb3c1261a1f89e35bdf233265aa5a2c46e9b2 Mon Sep 17 00:00:00 2001
From: Max Staudt <max@enpas.org>
Date: Sat, 18 Jun 2022 20:01:34 +0200
Subject: tty: Add N_CAN327 line discipline ID for ELM327 based CAN driver

The actual driver will be added via the CAN tree.

Link: https://lore.kernel.org/all/20220618180134.9890-1-max@enpas.org
Link: https://lore.kernel.org/all/Yrm9Ezlw1dLmIxyS@kroah.com
Signed-off-by: Max Staudt <max@enpas.org>
Acked-by: Marc Kleine-Budde <mkl@pengutronix.de>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/uapi/linux/tty.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/tty.h b/include/uapi/linux/tty.h
index 9d0f06bfbac3..68aeae2addec 100644
--- a/include/uapi/linux/tty.h
+++ b/include/uapi/linux/tty.h
@@ -38,8 +38,9 @@
 #define N_NULL		27	/* Null ldisc used for error handling */
 #define N_MCTP		28	/* MCTP-over-serial */
 #define N_DEVELOPMENT	29	/* Manual out-of-tree testing */
+#define N_CAN327	30	/* ELM327 based OBD-II interfaces */
 
 /* Always the newest line discipline + 1 */
-#define NR_LDISCS	30
+#define NR_LDISCS	31
 
 #endif /* _UAPI_LINUX_TTY_H */
-- 
cgit 


From 38a523a2946d3a0961d141d477a1ee2b1f3bdbb1 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 27 Jun 2022 16:36:57 +0200
Subject: Revert "devcoredump: remove the useless gfp_t parameter in
 dev_coredumpv and dev_coredumpm"

This reverts commit 77515ebaf01920e2db49e04672ef669a7c2907f2 as it
causes build problems in linux-next.  It needs to be reintroduced in a
way that can allow the api to evolve and not require a "flag day" to
catch all users.

Link: https://lore.kernel.org/r/20220623160723.7a44b573@canb.auug.org.au
Cc: Duoming Zhou <duoming@zju.edu.cn>
Cc: Brian Norris <briannorris@chromium.org>
Cc: Johannes Berg <johannes@sipsolutions.net>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/devcoredump.c                               | 16 ++++++++++------
 drivers/bluetooth/btmrvl_sdio.c                          |  2 +-
 drivers/bluetooth/hci_qca.c                              |  2 +-
 drivers/gpu/drm/etnaviv/etnaviv_dump.c                   |  2 +-
 drivers/gpu/drm/msm/disp/msm_disp_snapshot.c             |  4 ++--
 drivers/gpu/drm/msm/msm_gpu.c                            |  4 ++--
 drivers/media/platform/qcom/venus/core.c                 |  2 +-
 drivers/net/can/spi/mcp251xfd/mcp251xfd-dump.c           |  2 +-
 drivers/net/wireless/ath/ath10k/coredump.c               |  2 +-
 drivers/net/wireless/ath/wil6210/wil_crash_dump.c        |  2 +-
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/debug.c |  2 +-
 drivers/net/wireless/intel/iwlwifi/fw/dbg.c              |  6 ++++--
 drivers/net/wireless/marvell/mwifiex/main.c              |  3 ++-
 drivers/net/wireless/mediatek/mt76/mt7615/mac.c          |  3 ++-
 drivers/net/wireless/mediatek/mt76/mt7921/mac.c          |  3 ++-
 drivers/net/wireless/realtek/rtw88/main.c                |  2 +-
 drivers/net/wireless/realtek/rtw89/ser.c                 |  2 +-
 drivers/remoteproc/qcom_q6v5_mss.c                       |  2 +-
 drivers/remoteproc/remoteproc_coredump.c                 |  8 ++++----
 include/drm/drm_print.h                                  |  2 +-
 include/linux/devcoredump.h                              | 13 +++++++------
 sound/soc/intel/avs/apl.c                                |  2 +-
 sound/soc/intel/avs/skl.c                                |  2 +-
 sound/soc/intel/catpt/dsp.c                              |  2 +-
 24 files changed, 50 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c
index 8535f0bd5dfb..f4d794d6bb85 100644
--- a/drivers/base/devcoredump.c
+++ b/drivers/base/devcoredump.c
@@ -173,13 +173,15 @@ static void devcd_freev(void *data)
  * @dev: the struct device for the crashed device
  * @data: vmalloc data containing the device coredump
  * @datalen: length of the data
+ * @gfp: allocation flags
  *
  * This function takes ownership of the vmalloc'ed data and will free
  * it when it is no longer used. See dev_coredumpm() for more information.
  */
-void dev_coredumpv(struct device *dev, void *data, size_t datalen)
+void dev_coredumpv(struct device *dev, void *data, size_t datalen,
+		   gfp_t gfp)
 {
-	dev_coredumpm(dev, NULL, data, datalen, devcd_readv, devcd_freev);
+	dev_coredumpm(dev, NULL, data, datalen, gfp, devcd_readv, devcd_freev);
 }
 EXPORT_SYMBOL_GPL(dev_coredumpv);
 
@@ -234,6 +236,7 @@ static ssize_t devcd_read_from_sgtable(char *buffer, loff_t offset,
  * @owner: the module that contains the read/free functions, use %THIS_MODULE
  * @data: data cookie for the @read/@free functions
  * @datalen: length of the data
+ * @gfp: allocation flags
  * @read: function to read from the given buffer
  * @free: function to free the given buffer
  *
@@ -243,7 +246,7 @@ static ssize_t devcd_read_from_sgtable(char *buffer, loff_t offset,
  * function will be called to free the data.
  */
 void dev_coredumpm(struct device *dev, struct module *owner,
-		   void *data, size_t datalen,
+		   void *data, size_t datalen, gfp_t gfp,
 		   ssize_t (*read)(char *buffer, loff_t offset, size_t count,
 				   void *data, size_t datalen),
 		   void (*free)(void *data))
@@ -265,7 +268,7 @@ void dev_coredumpm(struct device *dev, struct module *owner,
 	if (!try_module_get(owner))
 		goto free;
 
-	devcd = kzalloc(sizeof(*devcd), GFP_KERNEL);
+	devcd = kzalloc(sizeof(*devcd), gfp);
 	if (!devcd)
 		goto put_module;
 
@@ -315,6 +318,7 @@ EXPORT_SYMBOL_GPL(dev_coredumpm);
  * @dev: the struct device for the crashed device
  * @table: the dump data
  * @datalen: length of the data
+ * @gfp: allocation flags
  *
  * Creates a new device coredump for the given device. If a previous one hasn't
  * been read yet, the new coredump is discarded. The data lifetime is determined
@@ -322,9 +326,9 @@ EXPORT_SYMBOL_GPL(dev_coredumpm);
  * it will free the data.
  */
 void dev_coredumpsg(struct device *dev, struct scatterlist *table,
-		    size_t datalen)
+		    size_t datalen, gfp_t gfp)
 {
-	dev_coredumpm(dev, NULL, table, datalen, devcd_read_from_sgtable,
+	dev_coredumpm(dev, NULL, table, datalen, gfp, devcd_read_from_sgtable,
 		      devcd_free_sgtable);
 }
 EXPORT_SYMBOL_GPL(dev_coredumpsg);
diff --git a/drivers/bluetooth/btmrvl_sdio.c b/drivers/bluetooth/btmrvl_sdio.c
index 9b9728719db2..b8ef66f89fc1 100644
--- a/drivers/bluetooth/btmrvl_sdio.c
+++ b/drivers/bluetooth/btmrvl_sdio.c
@@ -1515,7 +1515,7 @@ done:
 	/* fw_dump_data will be free in device coredump release function
 	 * after 5 min
 	 */
-	dev_coredumpv(&card->func->dev, fw_dump_data, fw_dump_len);
+	dev_coredumpv(&card->func->dev, fw_dump_data, fw_dump_len, GFP_KERNEL);
 	BT_INFO("== btmrvl firmware dump to /sys/class/devcoredump end");
 }
 
diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c
index 2e4074211ae9..eab34e24d944 100644
--- a/drivers/bluetooth/hci_qca.c
+++ b/drivers/bluetooth/hci_qca.c
@@ -1120,7 +1120,7 @@ static void qca_controller_memdump(struct work_struct *work)
 				    qca_memdump->ram_dump_size);
 			memdump_buf = qca_memdump->memdump_buf_head;
 			dev_coredumpv(&hu->serdev->dev, memdump_buf,
-				      qca_memdump->received_dump);
+				      qca_memdump->received_dump, GFP_KERNEL);
 			cancel_delayed_work(&qca->ctrl_memdump_timeout);
 			kfree(qca->qca_memdump);
 			qca->qca_memdump = NULL;
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_dump.c b/drivers/gpu/drm/etnaviv/etnaviv_dump.c
index 519fcb234b3e..f418e0b75772 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_dump.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_dump.c
@@ -225,5 +225,5 @@ void etnaviv_core_dump(struct etnaviv_gem_submit *submit)
 
 	etnaviv_core_dump_header(&iter, ETDUMP_BUF_END, iter.data);
 
-	dev_coredumpv(gpu->dev, iter.start, iter.data - iter.start);
+	dev_coredumpv(gpu->dev, iter.start, iter.data - iter.start, GFP_KERNEL);
 }
diff --git a/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c b/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c
index f057d294c30b..e75b97127c0d 100644
--- a/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c
+++ b/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c
@@ -74,8 +74,8 @@ static void _msm_disp_snapshot_work(struct kthread_work *work)
 	 * If there is a codedump pending for the device, the dev_coredumpm()
 	 * will also free new coredump state.
 	 */
-	dev_coredumpm(disp_state->dev, THIS_MODULE, disp_state, 0,
-		      disp_devcoredump_read, msm_disp_state_free);
+	dev_coredumpm(disp_state->dev, THIS_MODULE, disp_state, 0, GFP_KERNEL,
+			disp_devcoredump_read, msm_disp_state_free);
 }
 
 void msm_disp_snapshot_state(struct drm_device *drm_dev)
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index 30576ced0a0a..eb8a6663f309 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -317,8 +317,8 @@ static void msm_gpu_crashstate_capture(struct msm_gpu *gpu,
 	gpu->crashstate = state;
 
 	/* FIXME: Release the crashstate if this errors out? */
-	dev_coredumpm(gpu->dev->dev, THIS_MODULE, gpu, 0,
-		      msm_gpu_devcoredump_read, msm_gpu_devcoredump_free);
+	dev_coredumpm(gpu->dev->dev, THIS_MODULE, gpu, 0, GFP_KERNEL,
+		msm_gpu_devcoredump_read, msm_gpu_devcoredump_free);
 }
 #else
 static void msm_gpu_crashstate_capture(struct msm_gpu *gpu,
diff --git a/drivers/media/platform/qcom/venus/core.c b/drivers/media/platform/qcom/venus/core.c
index db84dfb3fb11..877eca125803 100644
--- a/drivers/media/platform/qcom/venus/core.c
+++ b/drivers/media/platform/qcom/venus/core.c
@@ -49,7 +49,7 @@ static void venus_coredump(struct venus_core *core)
 
 	memcpy(data, mem_va, mem_size);
 	memunmap(mem_va);
-	dev_coredumpv(dev, data, mem_size);
+	dev_coredumpv(dev, data, mem_size, GFP_KERNEL);
 }
 
 static void venus_event_notify(struct venus_core *core, u32 event)
diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-dump.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-dump.c
index fa520ab7c960..c991b30bc9f0 100644
--- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-dump.c
+++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-dump.c
@@ -281,5 +281,5 @@ void mcp251xfd_dump(const struct mcp251xfd_priv *priv)
 	mcp251xfd_dump_end(priv, &iter);
 
 	dev_coredumpv(&priv->spi->dev, iter.start,
-		      iter.data - iter.start);
+		      iter.data - iter.start, GFP_KERNEL);
 }
diff --git a/drivers/net/wireless/ath/ath10k/coredump.c b/drivers/net/wireless/ath/ath10k/coredump.c
index dc9237069921..fe6b6f97a916 100644
--- a/drivers/net/wireless/ath/ath10k/coredump.c
+++ b/drivers/net/wireless/ath/ath10k/coredump.c
@@ -1607,7 +1607,7 @@ int ath10k_coredump_submit(struct ath10k *ar)
 		return -ENODATA;
 	}
 
-	dev_coredumpv(ar->dev, dump, le32_to_cpu(dump->len));
+	dev_coredumpv(ar->dev, dump, le32_to_cpu(dump->len), GFP_KERNEL);
 
 	return 0;
 }
diff --git a/drivers/net/wireless/ath/wil6210/wil_crash_dump.c b/drivers/net/wireless/ath/wil6210/wil_crash_dump.c
index 79299609dd62..89c12cb2aaab 100644
--- a/drivers/net/wireless/ath/wil6210/wil_crash_dump.c
+++ b/drivers/net/wireless/ath/wil6210/wil_crash_dump.c
@@ -117,6 +117,6 @@ void wil_fw_core_dump(struct wil6210_priv *wil)
 	/* fw_dump_data will be free in device coredump release function
 	 * after 5 min
 	 */
-	dev_coredumpv(wil_to_dev(wil), fw_dump_data, fw_dump_size);
+	dev_coredumpv(wil_to_dev(wil), fw_dump_data, fw_dump_size, GFP_KERNEL);
 	wil_info(wil, "fw core dumped, size %d bytes\n", fw_dump_size);
 }
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/debug.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/debug.c
index 87f3652ef3bd..eecf8a38d94a 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/debug.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/debug.c
@@ -37,7 +37,7 @@ int brcmf_debug_create_memdump(struct brcmf_bus *bus, const void *data,
 		return err;
 	}
 
-	dev_coredumpv(bus->dev, dump, len + ramsize);
+	dev_coredumpv(bus->dev, dump, len + ramsize, GFP_KERNEL);
 
 	return 0;
 }
diff --git a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c
index f2f7cf494a8c..abf49022edbe 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c
+++ b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c
@@ -2601,7 +2601,8 @@ static void iwl_fw_error_dump(struct iwl_fw_runtime *fwrt,
 					     fw_error_dump.trans_ptr->data,
 					     fw_error_dump.trans_ptr->len,
 					     fw_error_dump.fwrt_len);
-		dev_coredumpsg(fwrt->trans->dev, sg_dump_data, file_len);
+		dev_coredumpsg(fwrt->trans->dev, sg_dump_data, file_len,
+			       GFP_KERNEL);
 	}
 	vfree(fw_error_dump.fwrt_ptr);
 	vfree(fw_error_dump.trans_ptr);
@@ -2646,7 +2647,8 @@ static void iwl_fw_error_ini_dump(struct iwl_fw_runtime *fwrt,
 					     entry->data, entry->size, offs);
 			offs += entry->size;
 		}
-		dev_coredumpsg(fwrt->trans->dev, sg_dump_data, file_len);
+		dev_coredumpsg(fwrt->trans->dev, sg_dump_data, file_len,
+			       GFP_KERNEL);
 	}
 	iwl_dump_ini_list_free(&dump_list);
 }
diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c
index 26fef0ab1b0d..ace7371c4773 100644
--- a/drivers/net/wireless/marvell/mwifiex/main.c
+++ b/drivers/net/wireless/marvell/mwifiex/main.c
@@ -1115,7 +1115,8 @@ void mwifiex_upload_device_dump(struct mwifiex_adapter *adapter)
 	 */
 	mwifiex_dbg(adapter, MSG,
 		    "== mwifiex dump information to /sys/class/devcoredump start\n");
-	dev_coredumpv(adapter->dev, adapter->devdump_data, adapter->devdump_len);
+	dev_coredumpv(adapter->dev, adapter->devdump_data, adapter->devdump_len,
+		      GFP_KERNEL);
 	mwifiex_dbg(adapter, MSG,
 		    "== mwifiex dump information to /sys/class/devcoredump end\n");
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mac.c b/drivers/net/wireless/mediatek/mt76/mt7615/mac.c
index 5336fe8c668d..bd687f7de628 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7615/mac.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7615/mac.c
@@ -2421,5 +2421,6 @@ void mt7615_coredump_work(struct work_struct *work)
 
 		dev_kfree_skb(skb);
 	}
-	dev_coredumpv(dev->mt76.dev, dump, MT76_CONNAC_COREDUMP_SZ);
+	dev_coredumpv(dev->mt76.dev, dump, MT76_CONNAC_COREDUMP_SZ,
+		      GFP_KERNEL);
 }
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/mac.c b/drivers/net/wireless/mediatek/mt76/mt7921/mac.c
index cac284f95ce0..a630ddbf19e5 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/mac.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/mac.c
@@ -1630,7 +1630,8 @@ void mt7921_coredump_work(struct work_struct *work)
 	}
 
 	if (dump)
-		dev_coredumpv(dev->mt76.dev, dump, MT76_CONNAC_COREDUMP_SZ);
+		dev_coredumpv(dev->mt76.dev, dump, MT76_CONNAC_COREDUMP_SZ,
+			      GFP_KERNEL);
 
 	mt7921_reset(&dev->mt76);
 }
diff --git a/drivers/net/wireless/realtek/rtw88/main.c b/drivers/net/wireless/realtek/rtw88/main.c
index a276544cecdd..efabd5b1bf5b 100644
--- a/drivers/net/wireless/realtek/rtw88/main.c
+++ b/drivers/net/wireless/realtek/rtw88/main.c
@@ -414,7 +414,7 @@ static void rtw_fwcd_dump(struct rtw_dev *rtwdev)
 	 * framework. Note that a new dump will be discarded if a previous one
 	 * hasn't been released yet.
 	 */
-	dev_coredumpv(rtwdev->dev, desc->data, desc->size);
+	dev_coredumpv(rtwdev->dev, desc->data, desc->size, GFP_KERNEL);
 }
 
 static void rtw_fwcd_free(struct rtw_dev *rtwdev, bool free_self)
diff --git a/drivers/net/wireless/realtek/rtw89/ser.c b/drivers/net/wireless/realtek/rtw89/ser.c
index d28fe01ad729..9e95ed972710 100644
--- a/drivers/net/wireless/realtek/rtw89/ser.c
+++ b/drivers/net/wireless/realtek/rtw89/ser.c
@@ -127,7 +127,7 @@ static void rtw89_ser_cd_send(struct rtw89_dev *rtwdev,
 	 * will be discarded if a previous one hasn't been released by
 	 * framework yet.
 	 */
-	dev_coredumpv(rtwdev->dev, buf, sizeof(*buf));
+	dev_coredumpv(rtwdev->dev, buf, sizeof(*buf), GFP_KERNEL);
 }
 
 static void rtw89_ser_cd_free(struct rtw89_dev *rtwdev,
diff --git a/drivers/remoteproc/qcom_q6v5_mss.c b/drivers/remoteproc/qcom_q6v5_mss.c
index 813d87faef6c..af217de75e4d 100644
--- a/drivers/remoteproc/qcom_q6v5_mss.c
+++ b/drivers/remoteproc/qcom_q6v5_mss.c
@@ -597,7 +597,7 @@ static void q6v5_dump_mba_logs(struct q6v5 *qproc)
 	data = vmalloc(MBA_LOG_SIZE);
 	if (data) {
 		memcpy(data, mba_region, MBA_LOG_SIZE);
-		dev_coredumpv(&rproc->dev, data, MBA_LOG_SIZE);
+		dev_coredumpv(&rproc->dev, data, MBA_LOG_SIZE, GFP_KERNEL);
 	}
 	memunmap(mba_region);
 }
diff --git a/drivers/remoteproc/remoteproc_coredump.c b/drivers/remoteproc/remoteproc_coredump.c
index cd55c2abd227..4b093420d98a 100644
--- a/drivers/remoteproc/remoteproc_coredump.c
+++ b/drivers/remoteproc/remoteproc_coredump.c
@@ -309,7 +309,7 @@ void rproc_coredump(struct rproc *rproc)
 		phdr += elf_size_of_phdr(class);
 	}
 	if (dump_conf == RPROC_COREDUMP_ENABLED) {
-		dev_coredumpv(&rproc->dev, data, data_size);
+		dev_coredumpv(&rproc->dev, data, data_size, GFP_KERNEL);
 		return;
 	}
 
@@ -318,7 +318,7 @@ void rproc_coredump(struct rproc *rproc)
 	dump_state.header = data;
 	init_completion(&dump_state.dump_done);
 
-	dev_coredumpm(&rproc->dev, NULL, &dump_state, data_size,
+	dev_coredumpm(&rproc->dev, NULL, &dump_state, data_size, GFP_KERNEL,
 		      rproc_coredump_read, rproc_coredump_free);
 
 	/*
@@ -449,7 +449,7 @@ void rproc_coredump_using_sections(struct rproc *rproc)
 	}
 
 	if (dump_conf == RPROC_COREDUMP_ENABLED) {
-		dev_coredumpv(&rproc->dev, data, data_size);
+		dev_coredumpv(&rproc->dev, data, data_size, GFP_KERNEL);
 		return;
 	}
 
@@ -458,7 +458,7 @@ void rproc_coredump_using_sections(struct rproc *rproc)
 	dump_state.header = data;
 	init_completion(&dump_state.dump_done);
 
-	dev_coredumpm(&rproc->dev, NULL, &dump_state, data_size,
+	dev_coredumpm(&rproc->dev, NULL, &dump_state, data_size, GFP_KERNEL,
 		      rproc_coredump_read, rproc_coredump_free);
 
 	/* Wait until the dump is read and free is called. Data is freed
diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
index b41850366bcc..22fabdeed297 100644
--- a/include/drm/drm_print.h
+++ b/include/drm/drm_print.h
@@ -162,7 +162,7 @@ struct drm_print_iterator {
  *	void makecoredump(...)
  *	{
  *		...
- *		dev_coredumpm(dev, THIS_MODULE, data, 0,
+ *		dev_coredumpm(dev, THIS_MODULE, data, 0, GFP_KERNEL,
  *			coredump_read, ...)
  *	}
  *
diff --git a/include/linux/devcoredump.h b/include/linux/devcoredump.h
index c7d840d824c3..c008169ed2c6 100644
--- a/include/linux/devcoredump.h
+++ b/include/linux/devcoredump.h
@@ -52,26 +52,27 @@ static inline void _devcd_free_sgtable(struct scatterlist *table)
 
 
 #ifdef CONFIG_DEV_COREDUMP
-void dev_coredumpv(struct device *dev, void *data, size_t datalen);
+void dev_coredumpv(struct device *dev, void *data, size_t datalen,
+		   gfp_t gfp);
 
 void dev_coredumpm(struct device *dev, struct module *owner,
-		   void *data, size_t datalen,
+		   void *data, size_t datalen, gfp_t gfp,
 		   ssize_t (*read)(char *buffer, loff_t offset, size_t count,
 				   void *data, size_t datalen),
 		   void (*free)(void *data));
 
 void dev_coredumpsg(struct device *dev, struct scatterlist *table,
-		    size_t datalen);
+		    size_t datalen, gfp_t gfp);
 #else
 static inline void dev_coredumpv(struct device *dev, void *data,
-				 size_t datalen)
+				 size_t datalen, gfp_t gfp)
 {
 	vfree(data);
 }
 
 static inline void
 dev_coredumpm(struct device *dev, struct module *owner,
-	      void *data, size_t datalen,
+	      void *data, size_t datalen, gfp_t gfp,
 	      ssize_t (*read)(char *buffer, loff_t offset, size_t count,
 			      void *data, size_t datalen),
 	      void (*free)(void *data))
@@ -80,7 +81,7 @@ dev_coredumpm(struct device *dev, struct module *owner,
 }
 
 static inline void dev_coredumpsg(struct device *dev, struct scatterlist *table,
-				  size_t datalen)
+				  size_t datalen, gfp_t gfp)
 {
 	_devcd_free_sgtable(table);
 }
diff --git a/sound/soc/intel/avs/apl.c b/sound/soc/intel/avs/apl.c
index 1ff57f1a483d..b8e2b23c9f64 100644
--- a/sound/soc/intel/avs/apl.c
+++ b/sound/soc/intel/avs/apl.c
@@ -164,7 +164,7 @@ static int apl_coredump(struct avs_dev *adev, union avs_notify_msg *msg)
 	} while (offset < msg->ext.coredump.stack_dump_size);
 
 exit:
-	dev_coredumpv(adev->dev, dump, dump_size);
+	dev_coredumpv(adev->dev, dump, dump_size, GFP_KERNEL);
 
 	return 0;
 }
diff --git a/sound/soc/intel/avs/skl.c b/sound/soc/intel/avs/skl.c
index 3413162768dc..bda5ec7510fe 100644
--- a/sound/soc/intel/avs/skl.c
+++ b/sound/soc/intel/avs/skl.c
@@ -88,7 +88,7 @@ static int skl_coredump(struct avs_dev *adev, union avs_notify_msg *msg)
 		return -ENOMEM;
 
 	memcpy_fromio(dump, avs_sram_addr(adev, AVS_FW_REGS_WINDOW), AVS_FW_REGS_SIZE);
-	dev_coredumpv(adev->dev, dump, AVS_FW_REGS_SIZE);
+	dev_coredumpv(adev->dev, dump, AVS_FW_REGS_SIZE, GFP_KERNEL);
 
 	return 0;
 }
diff --git a/sound/soc/intel/catpt/dsp.c b/sound/soc/intel/catpt/dsp.c
index d2afe9ff1e3a..346bec000306 100644
--- a/sound/soc/intel/catpt/dsp.c
+++ b/sound/soc/intel/catpt/dsp.c
@@ -539,7 +539,7 @@ int catpt_coredump(struct catpt_dev *cdev)
 		pos += CATPT_DMA_REGS_SIZE;
 	}
 
-	dev_coredumpv(cdev->dev, dump, dump_size);
+	dev_coredumpv(cdev->dev, dump, dump_size, GFP_KERNEL);
 
 	return 0;
 }
-- 
cgit 


From 086c00c71fc8d47db6983f419a45f9ee167de03f Mon Sep 17 00:00:00 2001
From: Imran Khan <imran.f.khan@oracle.com>
Date: Wed, 15 Jun 2022 12:10:56 +1000
Subject: kernfs: make ->attr.open RCU protected.

After removal of kernfs_open_node->refcnt in the previous patch,
kernfs_open_node_lock can be removed as well by making ->attr.open
RCU protected. kernfs_put_open_node can delegate freeing to ->attr.open
to RCU and other readers of ->attr.open can do so under rcu_read_(un)lock.

Suggested by: Al Viro <viro@zeniv.linux.org.uk>

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Imran Khan <imran.f.khan@oracle.com>
Link: https://lore.kernel.org/r/20220615021059.862643-2-imran.f.khan@oracle.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/kernfs/file.c       | 147 +++++++++++++++++++++++++++++++++----------------
 include/linux/kernfs.h |   2 +-
 2 files changed, 102 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 54b2a13ac9a2..22e7481e7b63 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -23,16 +23,16 @@
  * for each kernfs_node with one or more open files.
  *
  * kernfs_node->attr.open points to kernfs_open_node.  attr.open is
- * protected by kernfs_open_node_lock.
+ * RCU protected.
  *
  * filp->private_data points to seq_file whose ->private points to
  * kernfs_open_file.  kernfs_open_files are chained at
  * kernfs_open_node->files, which is protected by kernfs_open_file_mutex.
  */
-static DEFINE_SPINLOCK(kernfs_open_node_lock);
 static DEFINE_MUTEX(kernfs_open_file_mutex);
 
 struct kernfs_open_node {
+	struct rcu_head		rcu_head;
 	atomic_t		event;
 	wait_queue_head_t	poll;
 	struct list_head	files; /* goes through kernfs_open_file.list */
@@ -51,6 +51,52 @@ struct kernfs_open_node {
 static DEFINE_SPINLOCK(kernfs_notify_lock);
 static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
 
+/**
+ * kernfs_deref_open_node - Get kernfs_open_node corresponding to @kn.
+ *
+ * @of: associated kernfs_open_file instance.
+ * @kn: target kernfs_node.
+ *
+ * Fetch and return ->attr.open of @kn if @of->list is non empty.
+ * If @of->list is not empty we can safely assume that @of is on
+ * @kn->attr.open->files list and this guarantees that @kn->attr.open
+ * will not vanish i.e. dereferencing outside RCU read-side critical
+ * section is safe here.
+ *
+ * The caller needs to make sure that @of->list is not empty.
+ */
+static struct kernfs_open_node *
+kernfs_deref_open_node(struct kernfs_open_file *of, struct kernfs_node *kn)
+{
+	struct kernfs_open_node *on;
+
+	on = rcu_dereference_check(kn->attr.open, !list_empty(&of->list));
+
+	return on;
+}
+
+/**
+ * kernfs_deref_open_node_protected - Get kernfs_open_node corresponding to @kn
+ *
+ * @kn: target kernfs_node.
+ *
+ * Fetch and return ->attr.open of @kn when caller holds the
+ * kernfs_open_file_mutex.
+ *
+ * Update of ->attr.open happens under kernfs_open_file_mutex. So when
+ * the caller guarantees that this mutex is being held, other updaters can't
+ * change ->attr.open and this means that we can safely deref ->attr.open
+ * outside RCU read-side critical section.
+ *
+ * The caller needs to make sure that kernfs_open_file_mutex is held.
+ */
+static struct kernfs_open_node *
+kernfs_deref_open_node_protected(struct kernfs_node *kn)
+{
+	return rcu_dereference_protected(kn->attr.open,
+				lockdep_is_held(&kernfs_open_file_mutex));
+}
+
 static struct kernfs_open_file *kernfs_of(struct file *file)
 {
 	return ((struct seq_file *)file->private_data)->private;
@@ -156,8 +202,12 @@ static void kernfs_seq_stop(struct seq_file *sf, void *v)
 static int kernfs_seq_show(struct seq_file *sf, void *v)
 {
 	struct kernfs_open_file *of = sf->private;
+	struct kernfs_open_node *on = kernfs_deref_open_node(of, of->kn);
 
-	of->event = atomic_read(&of->kn->attr.open->event);
+	if (!on)
+		return -EINVAL;
+
+	of->event = atomic_read(&on->event);
 
 	return of->kn->attr.ops->seq_show(sf, v);
 }
@@ -180,6 +230,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 	struct kernfs_open_file *of = kernfs_of(iocb->ki_filp);
 	ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE);
 	const struct kernfs_ops *ops;
+	struct kernfs_open_node *on;
 	char *buf;
 
 	buf = of->prealloc_buf;
@@ -201,7 +252,15 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 		goto out_free;
 	}
 
-	of->event = atomic_read(&of->kn->attr.open->event);
+	on = kernfs_deref_open_node(of, of->kn);
+	if (!on) {
+		len = -EINVAL;
+		mutex_unlock(&of->mutex);
+		goto out_free;
+	}
+
+	of->event = atomic_read(&on->event);
+
 	ops = kernfs_ops(of->kn);
 	if (ops->read)
 		len = ops->read(of, buf, len, iocb->ki_pos);
@@ -518,36 +577,29 @@ static int kernfs_get_open_node(struct kernfs_node *kn,
 {
 	struct kernfs_open_node *on, *new_on = NULL;
 
- retry:
 	mutex_lock(&kernfs_open_file_mutex);
-	spin_lock_irq(&kernfs_open_node_lock);
-
-	if (!kn->attr.open && new_on) {
-		kn->attr.open = new_on;
-		new_on = NULL;
-	}
-
-	on = kn->attr.open;
-	if (on)
-		list_add_tail(&of->list, &on->files);
-
-	spin_unlock_irq(&kernfs_open_node_lock);
-	mutex_unlock(&kernfs_open_file_mutex);
+	on = kernfs_deref_open_node_protected(kn);
 
 	if (on) {
-		kfree(new_on);
+		list_add_tail(&of->list, &on->files);
+		mutex_unlock(&kernfs_open_file_mutex);
 		return 0;
+	} else {
+		/* not there, initialize a new one */
+		new_on = kmalloc(sizeof(*new_on), GFP_KERNEL);
+		if (!new_on) {
+			mutex_unlock(&kernfs_open_file_mutex);
+			return -ENOMEM;
+		}
+		atomic_set(&new_on->event, 1);
+		init_waitqueue_head(&new_on->poll);
+		INIT_LIST_HEAD(&new_on->files);
+		list_add_tail(&of->list, &new_on->files);
+		rcu_assign_pointer(kn->attr.open, new_on);
 	}
+	mutex_unlock(&kernfs_open_file_mutex);
 
-	/* not there, initialize a new one and retry */
-	new_on = kmalloc(sizeof(*new_on), GFP_KERNEL);
-	if (!new_on)
-		return -ENOMEM;
-
-	atomic_set(&new_on->event, 1);
-	init_waitqueue_head(&new_on->poll);
-	INIT_LIST_HEAD(&new_on->files);
-	goto retry;
+	return 0;
 }
 
 /**
@@ -566,24 +618,25 @@ static int kernfs_get_open_node(struct kernfs_node *kn,
 static void kernfs_unlink_open_file(struct kernfs_node *kn,
 				 struct kernfs_open_file *of)
 {
-	struct kernfs_open_node *on = kn->attr.open;
-	unsigned long flags;
+	struct kernfs_open_node *on;
 
 	mutex_lock(&kernfs_open_file_mutex);
-	spin_lock_irqsave(&kernfs_open_node_lock, flags);
+
+	on = kernfs_deref_open_node_protected(kn);
+	if (!on) {
+		mutex_unlock(&kernfs_open_file_mutex);
+		return;
+	}
 
 	if (of)
 		list_del(&of->list);
 
-	if (list_empty(&on->files))
-		kn->attr.open = NULL;
-	else
-		on = NULL;
+	if (list_empty(&on->files)) {
+		rcu_assign_pointer(kn->attr.open, NULL);
+		kfree_rcu(on, rcu_head);
+	}
 
-	spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
 	mutex_unlock(&kernfs_open_file_mutex);
-
-	kfree(on);
 }
 
 static int kernfs_fop_open(struct inode *inode, struct file *file)
@@ -773,17 +826,16 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
 	 * check under kernfs_open_file_mutex will ensure bailing out if
 	 * ->attr.open became NULL while waiting for the mutex.
 	 */
-	if (!kn->attr.open)
+	if (!rcu_access_pointer(kn->attr.open))
 		return;
 
 	mutex_lock(&kernfs_open_file_mutex);
-	if (!kn->attr.open) {
+	on = kernfs_deref_open_node_protected(kn);
+	if (!on) {
 		mutex_unlock(&kernfs_open_file_mutex);
 		return;
 	}
 
-	on = kn->attr.open;
-
 	list_for_each_entry(of, &on->files, list) {
 		struct inode *inode = file_inode(of->file);
 
@@ -814,7 +866,10 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
 __poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait)
 {
 	struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry);
-	struct kernfs_open_node *on = kn->attr.open;
+	struct kernfs_open_node *on = kernfs_deref_open_node(of, kn);
+
+	if (!on)
+		return EPOLLERR;
 
 	poll_wait(of->file, &on->poll, wait);
 
@@ -921,13 +976,13 @@ void kernfs_notify(struct kernfs_node *kn)
 		return;
 
 	/* kick poll immediately */
-	spin_lock_irqsave(&kernfs_open_node_lock, flags);
-	on = kn->attr.open;
+	rcu_read_lock();
+	on = rcu_dereference(kn->attr.open);
 	if (on) {
 		atomic_inc(&on->event);
 		wake_up_interruptible(&on->poll);
 	}
-	spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
+	rcu_read_unlock();
 
 	/* schedule work to kick fsnotify */
 	spin_lock_irqsave(&kernfs_notify_lock, flags);
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index e2ae15a6225e..13f54f078a52 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -114,7 +114,7 @@ struct kernfs_elem_symlink {
 
 struct kernfs_elem_attr {
 	const struct kernfs_ops	*ops;
-	struct kernfs_open_node	*open;
+	struct kernfs_open_node __rcu	*open;
 	loff_t			size;
 	struct kernfs_node	*notify_next;	/* for kernfs_notify() */
 };
-- 
cgit 


From b8f35fa1188b84035c59d4842826c4e93a1b1c9f Mon Sep 17 00:00:00 2001
From: Imran Khan <imran.f.khan@oracle.com>
Date: Wed, 15 Jun 2022 12:10:57 +1000
Subject: kernfs: Change kernfs_notify_list to llist.

At present kernfs_notify_list is implemented as a singly linked
list of kernfs_node(s), where last element points to itself and
value of ->attr.next tells if node is present on the list or not.
Both addition and deletion to list happen under kernfs_notify_lock.

Change kernfs_notify_list to llist so that addition to list can heppen
locklessly.

Suggested by: Al Viro <viro@zeniv.linux.org.uk>

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Imran Khan <imran.f.khan@oracle.com>
Link: https://lore.kernel.org/r/20220615021059.862643-3-imran.f.khan@oracle.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/kernfs/file.c       | 47 ++++++++++++++++++++---------------------------
 include/linux/kernfs.h |  2 +-
 2 files changed, 21 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 22e7481e7b63..7d9138d2be3b 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -38,18 +38,16 @@ struct kernfs_open_node {
 	struct list_head	files; /* goes through kernfs_open_file.list */
 };
 
-/*
- * kernfs_notify() may be called from any context and bounces notifications
- * through a work item.  To minimize space overhead in kernfs_node, the
- * pending queue is implemented as a singly linked list of kernfs_nodes.
- * The list is terminated with the self pointer so that whether a
- * kernfs_node is on the list or not can be determined by testing the next
- * pointer for NULL.
+/**
+ * attribute_to_node - get kernfs_node object corresponding to a kernfs attribute
+ * @ptr:	&struct kernfs_elem_attr
+ * @type:	struct kernfs_node
+ * @member:	name of member (i.e attr)
  */
-#define KERNFS_NOTIFY_EOL			((void *)&kernfs_notify_list)
+#define attribute_to_node(ptr, type, member)	\
+	container_of(ptr, type, member)
 
-static DEFINE_SPINLOCK(kernfs_notify_lock);
-static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
+static LLIST_HEAD(kernfs_notify_list);
 
 /**
  * kernfs_deref_open_node - Get kernfs_open_node corresponding to @kn.
@@ -902,18 +900,16 @@ static void kernfs_notify_workfn(struct work_struct *work)
 	struct kernfs_node *kn;
 	struct kernfs_super_info *info;
 	struct kernfs_root *root;
+	struct llist_node *free;
+	struct kernfs_elem_attr *attr;
 repeat:
 	/* pop one off the notify_list */
-	spin_lock_irq(&kernfs_notify_lock);
-	kn = kernfs_notify_list;
-	if (kn == KERNFS_NOTIFY_EOL) {
-		spin_unlock_irq(&kernfs_notify_lock);
+	free = llist_del_first(&kernfs_notify_list);
+	if (free == NULL)
 		return;
-	}
-	kernfs_notify_list = kn->attr.notify_next;
-	kn->attr.notify_next = NULL;
-	spin_unlock_irq(&kernfs_notify_lock);
 
+	attr = llist_entry(free, struct kernfs_elem_attr, notify_next);
+	kn = attribute_to_node(attr, struct kernfs_node, attr);
 	root = kernfs_root(kn);
 	/* kick fsnotify */
 	down_write(&root->kernfs_rwsem);
@@ -969,12 +965,14 @@ repeat:
 void kernfs_notify(struct kernfs_node *kn)
 {
 	static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn);
-	unsigned long flags;
 	struct kernfs_open_node *on;
 
 	if (WARN_ON(kernfs_type(kn) != KERNFS_FILE))
 		return;
 
+	/* Because we are using llist for kernfs_notify_list */
+	WARN_ON_ONCE(in_nmi());
+
 	/* kick poll immediately */
 	rcu_read_lock();
 	on = rcu_dereference(kn->attr.open);
@@ -985,14 +983,9 @@ void kernfs_notify(struct kernfs_node *kn)
 	rcu_read_unlock();
 
 	/* schedule work to kick fsnotify */
-	spin_lock_irqsave(&kernfs_notify_lock, flags);
-	if (!kn->attr.notify_next) {
-		kernfs_get(kn);
-		kn->attr.notify_next = kernfs_notify_list;
-		kernfs_notify_list = kn;
-		schedule_work(&kernfs_notify_work);
-	}
-	spin_unlock_irqrestore(&kernfs_notify_lock, flags);
+	kernfs_get(kn);
+	llist_add(&kn->attr.notify_next, &kernfs_notify_list);
+	schedule_work(&kernfs_notify_work);
 }
 EXPORT_SYMBOL_GPL(kernfs_notify);
 
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 13f54f078a52..2dd9c8df0f4f 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -116,7 +116,7 @@ struct kernfs_elem_attr {
 	const struct kernfs_ops	*ops;
 	struct kernfs_open_node __rcu	*open;
 	loff_t			size;
-	struct kernfs_node	*notify_next;	/* for kernfs_notify() */
+	struct llist_node	notify_next;	/* for kernfs_notify() */
 };
 
 /*
-- 
cgit 


From 1d25b84e444ad66313c473407979ea9cd33deb3f Mon Sep 17 00:00:00 2001
From: Imran Khan <imran.f.khan@oracle.com>
Date: Wed, 15 Jun 2022 12:10:59 +1000
Subject: kernfs: Replace global kernfs_open_file_mutex with hashed mutexes.

In current kernfs design a single mutex, kernfs_open_file_mutex, protects
the list of kernfs_open_file instances corresponding to a sysfs attribute.
So even if different tasks are opening or closing different sysfs files
they can contend on osq_lock of this mutex. The contention is more apparent
in large scale systems with few hundred CPUs where most of the CPUs have
running tasks that are opening, accessing or closing sysfs files at any
point of time.

Using hashed mutexes in place of a single global mutex, can significantly
reduce contention around global mutex and hence can provide better
scalability. Moreover as these hashed mutexes are not part of kernfs_node
objects we will not see any singnificant change in memory utilization of
kernfs based file systems like sysfs, cgroupfs etc.

Modify interface introduced in previous patch to make use of hashed
mutexes. Use kernfs_node address as hashing key.

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Imran Khan <imran.f.khan@oracle.com>
Link: https://lore.kernel.org/r/20220615021059.862643-5-imran.f.khan@oracle.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/kernfs/file.c            | 17 +++-----------
 fs/kernfs/kernfs-internal.h |  4 ++++
 fs/kernfs/mount.c           | 19 +++++++++++++++
 include/linux/kernfs.h      | 57 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 83 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 3b354caad6b5..bb933221b4ba 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -18,19 +18,6 @@
 
 #include "kernfs-internal.h"
 
-/*
- * There's one kernfs_open_file for each open file and one kernfs_open_node
- * for each kernfs_node with one or more open files.
- *
- * kernfs_node->attr.open points to kernfs_open_node.  attr.open is
- * RCU protected.
- *
- * filp->private_data points to seq_file whose ->private points to
- * kernfs_open_file.  kernfs_open_files are chained at
- * kernfs_open_node->files, which is protected by kernfs_open_file_mutex.
- */
-static DEFINE_MUTEX(kernfs_open_file_mutex);
-
 struct kernfs_open_node {
 	struct rcu_head		rcu_head;
 	atomic_t		event;
@@ -51,7 +38,9 @@ static LLIST_HEAD(kernfs_notify_list);
 
 static inline struct mutex *kernfs_open_file_mutex_ptr(struct kernfs_node *kn)
 {
-	return &kernfs_open_file_mutex;
+	int idx = hash_ptr(kn, NR_KERNFS_LOCK_BITS);
+
+	return &kernfs_locks->open_file_mutex[idx];
 }
 
 static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn)
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index eeaa779b929c..3ae214d02d44 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -164,4 +164,8 @@ void kernfs_drain_open_files(struct kernfs_node *kn);
  */
 extern const struct inode_operations kernfs_symlink_iops;
 
+/*
+ * kernfs locks
+ */
+extern struct kernfs_global_locks *kernfs_locks;
 #endif	/* __KERNFS_INTERNAL_H */
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index cfa79715fc1a..d0859f72d2d6 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -20,6 +20,7 @@
 #include "kernfs-internal.h"
 
 struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
+struct kernfs_global_locks *kernfs_locks;
 
 static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
 {
@@ -387,6 +388,22 @@ void kernfs_kill_sb(struct super_block *sb)
 	kfree(info);
 }
 
+static void __init kernfs_mutex_init(void)
+{
+	int count;
+
+	for (count = 0; count < NR_KERNFS_LOCKS; count++)
+		mutex_init(&kernfs_locks->open_file_mutex[count]);
+}
+
+static void __init kernfs_lock_init(void)
+{
+	kernfs_locks = kmalloc(sizeof(struct kernfs_global_locks), GFP_KERNEL);
+	WARN_ON(!kernfs_locks);
+
+	kernfs_mutex_init();
+}
+
 void __init kernfs_init(void)
 {
 	kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
@@ -397,4 +414,6 @@ void __init kernfs_init(void)
 	kernfs_iattrs_cache  = kmem_cache_create("kernfs_iattrs_cache",
 					      sizeof(struct kernfs_iattrs),
 					      0, SLAB_PANIC, NULL);
+
+	kernfs_lock_init();
 }
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 2dd9c8df0f4f..13e703f615f7 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -18,6 +18,7 @@
 #include <linux/uidgid.h>
 #include <linux/wait.h>
 #include <linux/rwsem.h>
+#include <linux/cache.h>
 
 struct file;
 struct dentry;
@@ -34,6 +35,62 @@ struct kernfs_fs_context;
 struct kernfs_open_node;
 struct kernfs_iattrs;
 
+/*
+ * NR_KERNFS_LOCK_BITS determines size (NR_KERNFS_LOCKS) of hash
+ * table of locks.
+ * Having a small hash table would impact scalability, since
+ * more and more kernfs_node objects will end up using same lock
+ * and having a very large hash table would waste memory.
+ *
+ * At the moment size of hash table of locks is being set based on
+ * the number of CPUs as follows:
+ *
+ * NR_CPU      NR_KERNFS_LOCK_BITS      NR_KERNFS_LOCKS
+ *   1                  1                       2
+ *  2-3                 2                       4
+ *  4-7                 4                       16
+ *  8-15                6                       64
+ *  16-31               8                       256
+ *  32 and more         10                      1024
+ *
+ * The above relation between NR_CPU and number of locks is based
+ * on some internal experimentation which involved booting qemu
+ * with different values of smp, performing some sysfs operations
+ * on all CPUs and observing how increase in number of locks impacts
+ * completion time of these sysfs operations on each CPU.
+ */
+#ifdef CONFIG_SMP
+#define NR_KERNFS_LOCK_BITS (2 * (ilog2(NR_CPUS < 32 ? NR_CPUS : 32)))
+#else
+#define NR_KERNFS_LOCK_BITS     1
+#endif
+
+#define NR_KERNFS_LOCKS     (1 << NR_KERNFS_LOCK_BITS)
+
+/*
+ * There's one kernfs_open_file for each open file and one kernfs_open_node
+ * for each kernfs_node with one or more open files.
+ *
+ * filp->private_data points to seq_file whose ->private points to
+ * kernfs_open_file.
+ *
+ * kernfs_open_files are chained at kernfs_open_node->files, which is
+ * protected by kernfs_global_locks.open_file_mutex[i].
+ *
+ * To reduce possible contention in sysfs access, arising due to single
+ * locks, use an array of locks (e.g. open_file_mutex) and use kernfs_node
+ * object address as hash keys to get the index of these locks.
+ *
+ * Hashed mutexes are safe to use here because operations using these don't
+ * rely on global exclusion.
+ *
+ * In future we intend to replace other global locks with hashed ones as well.
+ * kernfs_global_locks acts as a holder for all such hash tables.
+ */
+struct kernfs_global_locks {
+	struct mutex open_file_mutex[NR_KERNFS_LOCKS];
+};
+
 enum kernfs_node_type {
 	KERNFS_DIR		= 0x0001,
 	KERNFS_FILE		= 0x0002,
-- 
cgit 


From 8f486cab263ca1b761c1aab9b36975afd355b799 Mon Sep 17 00:00:00 2001
From: Saravana Kannan <saravanak@google.com>
Date: Thu, 23 Jun 2022 01:03:42 -0700
Subject: driver core: fw_devlink: Allow firmware to mark devices as best
 effort

When firmware sets the FWNODE_FLAG_BEST_EFFORT flag for a fwnode,
fw_devlink will do a best effort ordering for that device where it'll
only enforce the probe/suspend/resume ordering of that device with
suppliers that have drivers. The driver of that device can then decide
if it wants to defer probe or probe without the suppliers.

This will be useful for avoid probe delays of the console device that
were caused by commit 71066545b48e ("driver core: Set
fw_devlink.strict=1 by default").

Fixes: 71066545b48e ("driver core: Set fw_devlink.strict=1 by default")
Reported-by: Sascha Hauer <sha@pengutronix.de>
Reported-by: Peng Fan <peng.fan@nxp.com>
Tested-by: Peng Fan <peng.fan@nxp.com>
Signed-off-by: Saravana Kannan <saravanak@google.com>
Link: https://lore.kernel.org/r/20220623080344.783549-2-saravanak@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c    | 3 ++-
 include/linux/fwnode.h | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 839f64485a55..ccdd5b4295de 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -968,7 +968,8 @@ static void device_links_missing_supplier(struct device *dev)
 
 static bool dev_is_best_effort(struct device *dev)
 {
-	return fw_devlink_best_effort && dev->can_match;
+	return (fw_devlink_best_effort && dev->can_match) ||
+		(dev->fwnode && (dev->fwnode->flags & FWNODE_FLAG_BEST_EFFORT));
 }
 
 /**
diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h
index 9a81c4410b9f..89b9bdfca925 100644
--- a/include/linux/fwnode.h
+++ b/include/linux/fwnode.h
@@ -27,11 +27,15 @@ struct device;
  *			     driver needs its child devices to be bound with
  *			     their respective drivers as soon as they are
  *			     added.
+ * BEST_EFFORT: The fwnode/device needs to probe early and might be missing some
+ *		suppliers. Only enforce ordering with suppliers that have
+ *		drivers.
  */
 #define FWNODE_FLAG_LINKS_ADDED			BIT(0)
 #define FWNODE_FLAG_NOT_DEVICE			BIT(1)
 #define FWNODE_FLAG_INITIALIZED			BIT(2)
 #define FWNODE_FLAG_NEEDS_CHILD_BOUND_ON_ADD	BIT(3)
+#define FWNODE_FLAG_BEST_EFFORT			BIT(4)
 
 struct fwnode_handle {
 	struct fwnode_handle *secondary;
-- 
cgit 


From d1877e639bc6bf1c3131eda3f9ede73f8da96c22 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Wed, 8 Jun 2022 12:55:13 -0600
Subject: vfio: de-extern-ify function prototypes

The use of 'extern' in function prototypes has been disrecommended in
the kernel coding style for several years now, remove them from all vfio
related files so contributors no longer need to decide between style and
consistency.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Eric Farman <farman@linux.ibm.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/165471414407.203056.474032786990662279.stgit@omen
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 Documentation/driver-api/vfio-mediated-device.rst | 10 ++--
 drivers/s390/cio/vfio_ccw_cp.h                    | 12 ++--
 drivers/s390/cio/vfio_ccw_private.h               |  6 +-
 drivers/vfio/fsl-mc/vfio_fsl_mc_private.h         |  2 +-
 drivers/vfio/platform/vfio_platform_private.h     | 21 ++++---
 include/linux/vfio.h                              | 70 +++++++++++------------
 include/linux/vfio_pci_core.h                     | 65 +++++++++++----------
 7 files changed, 91 insertions(+), 95 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst
index eded8719180f..1c57815619fd 100644
--- a/Documentation/driver-api/vfio-mediated-device.rst
+++ b/Documentation/driver-api/vfio-mediated-device.rst
@@ -114,11 +114,11 @@ to register and unregister itself with the core driver:
 
 * Register::
 
-    extern int  mdev_register_driver(struct mdev_driver *drv);
+    int mdev_register_driver(struct mdev_driver *drv);
 
 * Unregister::
 
-    extern void mdev_unregister_driver(struct mdev_driver *drv);
+    void mdev_unregister_driver(struct mdev_driver *drv);
 
 The mediated bus driver's probe function should create a vfio_device on top of
 the mdev_device and connect it to an appropriate implementation of
@@ -127,8 +127,8 @@ vfio_device_ops.
 When a driver wants to add the GUID creation sysfs to an existing device it has
 probe'd to then it should call::
 
-	extern int  mdev_register_device(struct device *dev,
-	                                 struct mdev_driver *mdev_driver);
+    int mdev_register_device(struct device *dev,
+                             struct mdev_driver *mdev_driver);
 
 This will provide the 'mdev_supported_types/XX/create' files which can then be
 used to trigger the creation of a mdev_device. The created mdev_device will be
@@ -136,7 +136,7 @@ attached to the specified driver.
 
 When the driver needs to remove itself it calls::
 
-	extern void mdev_unregister_device(struct device *dev);
+    void mdev_unregister_device(struct device *dev);
 
 Which will unbind and destroy all the created mdevs and remove the sysfs files.
 
diff --git a/drivers/s390/cio/vfio_ccw_cp.h b/drivers/s390/cio/vfio_ccw_cp.h
index e4c436199b4c..3194d887e08e 100644
--- a/drivers/s390/cio/vfio_ccw_cp.h
+++ b/drivers/s390/cio/vfio_ccw_cp.h
@@ -41,11 +41,11 @@ struct channel_program {
 	struct ccw1 *guest_cp;
 };
 
-extern int cp_init(struct channel_program *cp, union orb *orb);
-extern void cp_free(struct channel_program *cp);
-extern int cp_prefetch(struct channel_program *cp);
-extern union orb *cp_get_orb(struct channel_program *cp, u32 intparm, u8 lpm);
-extern void cp_update_scsw(struct channel_program *cp, union scsw *scsw);
-extern bool cp_iova_pinned(struct channel_program *cp, u64 iova);
+int cp_init(struct channel_program *cp, union orb *orb);
+void cp_free(struct channel_program *cp);
+int cp_prefetch(struct channel_program *cp);
+union orb *cp_get_orb(struct channel_program *cp, u32 intparm, u8 lpm);
+void cp_update_scsw(struct channel_program *cp, union scsw *scsw);
+bool cp_iova_pinned(struct channel_program *cp, u64 iova);
 
 #endif
diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h
index 7272eb788612..b7163bac8cc7 100644
--- a/drivers/s390/cio/vfio_ccw_private.h
+++ b/drivers/s390/cio/vfio_ccw_private.h
@@ -119,10 +119,10 @@ struct vfio_ccw_private {
 	struct work_struct	crw_work;
 } __aligned(8);
 
-extern int vfio_ccw_mdev_reg(struct subchannel *sch);
-extern void vfio_ccw_mdev_unreg(struct subchannel *sch);
+int vfio_ccw_mdev_reg(struct subchannel *sch);
+void vfio_ccw_mdev_unreg(struct subchannel *sch);
 
-extern int vfio_ccw_sch_quiesce(struct subchannel *sch);
+int vfio_ccw_sch_quiesce(struct subchannel *sch);
 
 extern struct mdev_driver vfio_ccw_mdev_driver;
 
diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h b/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h
index 4ad63ececb91..7a29f572f93d 100644
--- a/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h
+++ b/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h
@@ -39,7 +39,7 @@ struct vfio_fsl_mc_device {
 	struct vfio_fsl_mc_irq      *mc_irqs;
 };
 
-extern int vfio_fsl_mc_set_irqs_ioctl(struct vfio_fsl_mc_device *vdev,
+int vfio_fsl_mc_set_irqs_ioctl(struct vfio_fsl_mc_device *vdev,
 			       u32 flags, unsigned int index,
 			       unsigned int start, unsigned int count,
 			       void *data);
diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h
index 520d2a8e8375..691b43f4b2b2 100644
--- a/drivers/vfio/platform/vfio_platform_private.h
+++ b/drivers/vfio/platform/vfio_platform_private.h
@@ -78,21 +78,20 @@ struct vfio_platform_reset_node {
 	vfio_platform_reset_fn_t of_reset;
 };
 
-extern int vfio_platform_probe_common(struct vfio_platform_device *vdev,
-				      struct device *dev);
+int vfio_platform_probe_common(struct vfio_platform_device *vdev,
+			       struct device *dev);
 void vfio_platform_remove_common(struct vfio_platform_device *vdev);
 
-extern int vfio_platform_irq_init(struct vfio_platform_device *vdev);
-extern void vfio_platform_irq_cleanup(struct vfio_platform_device *vdev);
+int vfio_platform_irq_init(struct vfio_platform_device *vdev);
+void vfio_platform_irq_cleanup(struct vfio_platform_device *vdev);
 
-extern int vfio_platform_set_irqs_ioctl(struct vfio_platform_device *vdev,
-					uint32_t flags, unsigned index,
-					unsigned start, unsigned count,
-					void *data);
+int vfio_platform_set_irqs_ioctl(struct vfio_platform_device *vdev,
+				 uint32_t flags, unsigned index,
+				 unsigned start, unsigned count, void *data);
 
-extern void __vfio_platform_register_reset(struct vfio_platform_reset_node *n);
-extern void vfio_platform_unregister_reset(const char *compat,
-					   vfio_platform_reset_fn_t fn);
+void __vfio_platform_register_reset(struct vfio_platform_reset_node *n);
+void vfio_platform_unregister_reset(const char *compat,
+				    vfio_platform_reset_fn_t fn);
 #define vfio_platform_register_reset(__compat, __reset)		\
 static struct vfio_platform_reset_node __reset ## _node = {	\
 	.owner = THIS_MODULE,					\
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index aa888cc51757..49580fa2073a 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -140,19 +140,19 @@ int vfio_mig_get_next_state(struct vfio_device *device,
 /*
  * External user API
  */
-extern struct iommu_group *vfio_file_iommu_group(struct file *file);
-extern bool vfio_file_enforced_coherent(struct file *file);
-extern void vfio_file_set_kvm(struct file *file, struct kvm *kvm);
-extern bool vfio_file_has_dev(struct file *file, struct vfio_device *device);
+struct iommu_group *vfio_file_iommu_group(struct file *file);
+bool vfio_file_enforced_coherent(struct file *file);
+void vfio_file_set_kvm(struct file *file, struct kvm *kvm);
+bool vfio_file_has_dev(struct file *file, struct vfio_device *device);
 
 #define VFIO_PIN_PAGES_MAX_ENTRIES	(PAGE_SIZE/sizeof(unsigned long))
 
-extern int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
-			  int npage, int prot, unsigned long *phys_pfn);
-extern int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
-			    int npage);
-extern int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova,
-		       void *data, size_t len, bool write);
+int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
+		   int npage, int prot, unsigned long *phys_pfn);
+int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
+		     int npage);
+int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova,
+		void *data, size_t len, bool write);
 
 /* each type has independent events */
 enum vfio_notify_type {
@@ -162,13 +162,13 @@ enum vfio_notify_type {
 /* events for VFIO_IOMMU_NOTIFY */
 #define VFIO_IOMMU_NOTIFY_DMA_UNMAP	BIT(0)
 
-extern int vfio_register_notifier(struct vfio_device *device,
-				  enum vfio_notify_type type,
-				  unsigned long *required_events,
-				  struct notifier_block *nb);
-extern int vfio_unregister_notifier(struct vfio_device *device,
-				    enum vfio_notify_type type,
-				    struct notifier_block *nb);
+int vfio_register_notifier(struct vfio_device *device,
+			   enum vfio_notify_type type,
+			   unsigned long *required_events,
+			   struct notifier_block *nb);
+int vfio_unregister_notifier(struct vfio_device *device,
+			     enum vfio_notify_type type,
+			     struct notifier_block *nb);
 
 
 /*
@@ -178,25 +178,24 @@ struct vfio_info_cap {
 	struct vfio_info_cap_header *buf;
 	size_t size;
 };
-extern struct vfio_info_cap_header *vfio_info_cap_add(
-		struct vfio_info_cap *caps, size_t size, u16 id, u16 version);
-extern void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset);
+struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
+					       size_t size, u16 id,
+					       u16 version);
+void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset);
 
-extern int vfio_info_add_capability(struct vfio_info_cap *caps,
-				    struct vfio_info_cap_header *cap,
-				    size_t size);
+int vfio_info_add_capability(struct vfio_info_cap *caps,
+			     struct vfio_info_cap_header *cap, size_t size);
 
-extern int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr,
-					      int num_irqs, int max_irq_type,
-					      size_t *data_size);
+int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr,
+				       int num_irqs, int max_irq_type,
+				       size_t *data_size);
 
 struct pci_dev;
 #if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH)
-extern void vfio_spapr_pci_eeh_open(struct pci_dev *pdev);
-extern void vfio_spapr_pci_eeh_release(struct pci_dev *pdev);
-extern long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
-				       unsigned int cmd,
-				       unsigned long arg);
+void vfio_spapr_pci_eeh_open(struct pci_dev *pdev);
+void vfio_spapr_pci_eeh_release(struct pci_dev *pdev);
+long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, unsigned int cmd,
+				unsigned long arg);
 #else
 static inline void vfio_spapr_pci_eeh_open(struct pci_dev *pdev)
 {
@@ -230,10 +229,9 @@ struct virqfd {
 	struct virqfd		**pvirqfd;
 };
 
-extern int vfio_virqfd_enable(void *opaque,
-			      int (*handler)(void *, void *),
-			      void (*thread)(void *, void *),
-			      void *data, struct virqfd **pvirqfd, int fd);
-extern void vfio_virqfd_disable(struct virqfd **pvirqfd);
+int vfio_virqfd_enable(void *opaque, int (*handler)(void *, void *),
+		       void (*thread)(void *, void *), void *data,
+		       struct virqfd **pvirqfd, int fd);
+void vfio_virqfd_disable(struct virqfd **pvirqfd);
 
 #endif /* VFIO_H */
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 23c176d4b073..22de2bce6394 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -147,23 +147,23 @@ struct vfio_pci_core_device {
 #define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev)))
 #define irq_is(vdev, type) (vdev->irq_type == type)
 
-extern void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev);
-extern void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev);
+void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev);
+void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev);
 
-extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev,
-				   uint32_t flags, unsigned index,
-				   unsigned start, unsigned count, void *data);
+int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev,
+			    uint32_t flags, unsigned index,
+			    unsigned start, unsigned count, void *data);
 
-extern ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev,
-				  char __user *buf, size_t count,
-				  loff_t *ppos, bool iswrite);
+ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev,
+			   char __user *buf, size_t count,
+			   loff_t *ppos, bool iswrite);
 
-extern ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
-			       size_t count, loff_t *ppos, bool iswrite);
+ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
+			size_t count, loff_t *ppos, bool iswrite);
 
 #ifdef CONFIG_VFIO_PCI_VGA
-extern ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf,
-			       size_t count, loff_t *ppos, bool iswrite);
+ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf,
+			size_t count, loff_t *ppos, bool iswrite);
 #else
 static inline ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev,
 				      char __user *buf, size_t count,
@@ -173,32 +173,31 @@ static inline ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev,
 }
 #endif
 
-extern long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset,
-			       uint64_t data, int count, int fd);
+long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset,
+			uint64_t data, int count, int fd);
 
-extern int vfio_pci_init_perm_bits(void);
-extern void vfio_pci_uninit_perm_bits(void);
+int vfio_pci_init_perm_bits(void);
+void vfio_pci_uninit_perm_bits(void);
 
-extern int vfio_config_init(struct vfio_pci_core_device *vdev);
-extern void vfio_config_free(struct vfio_pci_core_device *vdev);
+int vfio_config_init(struct vfio_pci_core_device *vdev);
+void vfio_config_free(struct vfio_pci_core_device *vdev);
 
-extern int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev,
-					unsigned int type, unsigned int subtype,
-					const struct vfio_pci_regops *ops,
-					size_t size, u32 flags, void *data);
+int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev,
+				 unsigned int type, unsigned int subtype,
+				 const struct vfio_pci_regops *ops,
+				 size_t size, u32 flags, void *data);
 
-extern int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev,
-				    pci_power_t state);
+int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev,
+			     pci_power_t state);
 
-extern bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev);
-extern void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device
-						    *vdev);
-extern u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev);
-extern void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev,
-					       u16 cmd);
+bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev);
+void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev);
+u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev);
+void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev,
+					u16 cmd);
 
 #ifdef CONFIG_VFIO_PCI_IGD
-extern int vfio_pci_igd_init(struct vfio_pci_core_device *vdev);
+int vfio_pci_igd_init(struct vfio_pci_core_device *vdev);
 #else
 static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev)
 {
@@ -207,8 +206,8 @@ static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev)
 #endif
 
 #ifdef CONFIG_S390
-extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
-				       struct vfio_info_cap *caps);
+int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
+				struct vfio_info_cap *caps);
 #else
 static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
 					      struct vfio_info_cap *caps)
-- 
cgit 


From ee65728e103bb7dd99d8604bf6c7aa89c7d7e446 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Mon, 27 Jun 2022 09:00:26 +0300
Subject: docs: rename Documentation/vm to Documentation/mm

so it will be consistent with code mm directory and with
Documentation/admin-guide/mm and won't be confused with virtual machines.

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Suggested-by: Matthew Wilcox <willy@infradead.org>
Tested-by: Ira Weiny <ira.weiny@intel.com>
Acked-by: Jonathan Corbet <corbet@lwn.net>
Acked-by: Wu XiangCheng <bobwxc@email.cn>
---
 Documentation/ABI/testing/sysfs-kernel-mm-ksm      |   2 +-
 Documentation/ABI/testing/sysfs-kernel-slab        |   4 +-
 Documentation/admin-guide/kernel-parameters.txt    |  10 +-
 Documentation/admin-guide/mm/concepts.rst          |   2 +-
 Documentation/admin-guide/mm/damon/index.rst       |   2 +-
 Documentation/admin-guide/mm/damon/reclaim.rst     |   2 +-
 Documentation/admin-guide/mm/damon/usage.rst       |   8 +-
 Documentation/admin-guide/sysctl/vm.rst            |   2 +-
 Documentation/core-api/index.rst                   |   2 +-
 Documentation/filesystems/proc.rst                 |   2 +-
 Documentation/index.rst                            |   2 +-
 Documentation/mm/active_mm.rst                     |  91 ++++
 Documentation/mm/arch_pgtable_helpers.rst          | 260 +++++++++
 Documentation/mm/balance.rst                       | 102 ++++
 Documentation/mm/bootmem.rst                       |   5 +
 Documentation/mm/damon/api.rst                     |  20 +
 Documentation/mm/damon/design.rst                  | 176 ++++++
 Documentation/mm/damon/faq.rst                     |  50 ++
 Documentation/mm/damon/index.rst                   |  29 +
 Documentation/mm/free_page_reporting.rst           |  40 ++
 Documentation/mm/frontswap.rst                     | 266 +++++++++
 Documentation/mm/highmem.rst                       | 167 ++++++
 Documentation/mm/hmm.rst                           | 452 ++++++++++++++++
 Documentation/mm/hugetlbfs_reserv.rst              | 596 +++++++++++++++++++++
 Documentation/mm/hwpoison.rst                      | 184 +++++++
 Documentation/mm/index.rst                         |  68 +++
 Documentation/mm/ksm.rst                           |  87 +++
 Documentation/mm/memory-model.rst                  | 177 ++++++
 Documentation/mm/mmu_notifier.rst                  |  99 ++++
 Documentation/mm/numa.rst                          | 150 ++++++
 Documentation/mm/oom.rst                           |   5 +
 Documentation/mm/overcommit-accounting.rst         |  88 +++
 Documentation/mm/page_allocation.rst               |   5 +
 Documentation/mm/page_cache.rst                    |   5 +
 Documentation/mm/page_frags.rst                    |  45 ++
 Documentation/mm/page_migration.rst                | 288 ++++++++++
 Documentation/mm/page_owner.rst                    | 196 +++++++
 Documentation/mm/page_reclaim.rst                  |   5 +
 Documentation/mm/page_table_check.rst              |  56 ++
 Documentation/mm/page_tables.rst                   |   5 +
 Documentation/mm/physical_memory.rst               |   5 +
 Documentation/mm/process_addrs.rst                 |   5 +
 Documentation/mm/remap_file_pages.rst              |  33 ++
 Documentation/mm/shmfs.rst                         |   5 +
 Documentation/mm/slab.rst                          |   5 +
 Documentation/mm/slub.rst                          | 452 ++++++++++++++++
 Documentation/mm/split_page_table_lock.rst         | 100 ++++
 Documentation/mm/swap.rst                          |   5 +
 Documentation/mm/transhuge.rst                     | 187 +++++++
 Documentation/mm/unevictable-lru.rst               | 554 +++++++++++++++++++
 Documentation/mm/vmalloc.rst                       |   5 +
 Documentation/mm/vmalloced-kernel-stacks.rst       | 153 ++++++
 Documentation/mm/vmemmap_dedup.rst                 | 223 ++++++++
 Documentation/mm/z3fold.rst                        |  30 ++
 Documentation/mm/zsmalloc.rst                      |  82 +++
 .../zh_CN/admin-guide/mm/damon/index.rst           |   2 +-
 .../zh_CN/admin-guide/mm/damon/reclaim.rst         |   2 +-
 .../zh_CN/admin-guide/mm/damon/usage.rst           |   8 +-
 .../translations/zh_CN/core-api/index.rst          |   2 +-
 Documentation/translations/zh_CN/index.rst         |   2 +-
 Documentation/translations/zh_CN/mm/active_mm.rst  |  85 +++
 Documentation/translations/zh_CN/mm/balance.rst    |  81 +++
 Documentation/translations/zh_CN/mm/damon/api.rst  |  32 ++
 .../translations/zh_CN/mm/damon/design.rst         | 140 +++++
 Documentation/translations/zh_CN/mm/damon/faq.rst  |  48 ++
 .../translations/zh_CN/mm/damon/index.rst          |  32 ++
 .../translations/zh_CN/mm/free_page_reporting.rst  |  38 ++
 Documentation/translations/zh_CN/mm/frontswap.rst  | 196 +++++++
 Documentation/translations/zh_CN/mm/highmem.rst    | 128 +++++
 Documentation/translations/zh_CN/mm/hmm.rst        | 361 +++++++++++++
 .../translations/zh_CN/mm/hugetlbfs_reserv.rst     | 436 +++++++++++++++
 Documentation/translations/zh_CN/mm/hwpoison.rst   | 166 ++++++
 Documentation/translations/zh_CN/mm/index.rst      |  54 ++
 Documentation/translations/zh_CN/mm/ksm.rst        |  70 +++
 .../translations/zh_CN/mm/memory-model.rst         | 135 +++++
 .../translations/zh_CN/mm/mmu_notifier.rst         |  97 ++++
 Documentation/translations/zh_CN/mm/numa.rst       | 101 ++++
 .../zh_CN/mm/overcommit-accounting.rst             |  86 +++
 Documentation/translations/zh_CN/mm/page_frags.rst |  38 ++
 Documentation/translations/zh_CN/mm/page_owner.rst | 116 ++++
 .../translations/zh_CN/mm/page_table_check.rst     |  56 ++
 .../translations/zh_CN/mm/remap_file_pages.rst     |  32 ++
 .../zh_CN/mm/split_page_table_lock.rst             |  96 ++++
 Documentation/translations/zh_CN/mm/z3fold.rst     |  31 ++
 Documentation/translations/zh_CN/mm/zsmalloc.rst   |  78 +++
 Documentation/translations/zh_CN/vm/active_mm.rst  |  85 ---
 Documentation/translations/zh_CN/vm/balance.rst    |  81 ---
 Documentation/translations/zh_CN/vm/damon/api.rst  |  32 --
 .../translations/zh_CN/vm/damon/design.rst         | 140 -----
 Documentation/translations/zh_CN/vm/damon/faq.rst  |  48 --
 .../translations/zh_CN/vm/damon/index.rst          |  33 --
 .../translations/zh_CN/vm/free_page_reporting.rst  |  38 --
 Documentation/translations/zh_CN/vm/frontswap.rst  | 196 -------
 Documentation/translations/zh_CN/vm/highmem.rst    | 128 -----
 Documentation/translations/zh_CN/vm/hmm.rst        | 361 -------------
 .../translations/zh_CN/vm/hugetlbfs_reserv.rst     | 436 ---------------
 Documentation/translations/zh_CN/vm/hwpoison.rst   | 166 ------
 Documentation/translations/zh_CN/vm/index.rst      |  54 --
 Documentation/translations/zh_CN/vm/ksm.rst        |  70 ---
 .../translations/zh_CN/vm/memory-model.rst         | 135 -----
 .../translations/zh_CN/vm/mmu_notifier.rst         |  97 ----
 Documentation/translations/zh_CN/vm/numa.rst       | 101 ----
 .../zh_CN/vm/overcommit-accounting.rst             |  86 ---
 Documentation/translations/zh_CN/vm/page_frags.rst |  38 --
 Documentation/translations/zh_CN/vm/page_owner.rst | 116 ----
 .../translations/zh_CN/vm/page_table_check.rst     |  56 --
 .../translations/zh_CN/vm/remap_file_pages.rst     |  32 --
 .../zh_CN/vm/split_page_table_lock.rst             |  96 ----
 Documentation/translations/zh_CN/vm/z3fold.rst     |  31 --
 Documentation/translations/zh_CN/vm/zsmalloc.rst   |  78 ---
 Documentation/translations/zh_TW/index.rst         |   2 +-
 Documentation/vm/.gitignore                        |   3 -
 Documentation/vm/active_mm.rst                     |  91 ----
 Documentation/vm/arch_pgtable_helpers.rst          | 260 ---------
 Documentation/vm/balance.rst                       | 102 ----
 Documentation/vm/bootmem.rst                       |   5 -
 Documentation/vm/damon/api.rst                     |  20 -
 Documentation/vm/damon/design.rst                  | 176 ------
 Documentation/vm/damon/faq.rst                     |  50 --
 Documentation/vm/damon/index.rst                   |  29 -
 Documentation/vm/free_page_reporting.rst           |  40 --
 Documentation/vm/frontswap.rst                     | 266 ---------
 Documentation/vm/highmem.rst                       | 167 ------
 Documentation/vm/hmm.rst                           | 452 ----------------
 Documentation/vm/hugetlbfs_reserv.rst              | 596 ---------------------
 Documentation/vm/hwpoison.rst                      | 184 -------
 Documentation/vm/index.rst                         |  68 ---
 Documentation/vm/ksm.rst                           |  87 ---
 Documentation/vm/memory-model.rst                  | 177 ------
 Documentation/vm/mmu_notifier.rst                  |  99 ----
 Documentation/vm/numa.rst                          | 150 ------
 Documentation/vm/oom.rst                           |   5 -
 Documentation/vm/overcommit-accounting.rst         |  88 ---
 Documentation/vm/page_allocation.rst               |   5 -
 Documentation/vm/page_cache.rst                    |   5 -
 Documentation/vm/page_frags.rst                    |  45 --
 Documentation/vm/page_migration.rst                | 288 ----------
 Documentation/vm/page_owner.rst                    | 196 -------
 Documentation/vm/page_reclaim.rst                  |   5 -
 Documentation/vm/page_table_check.rst              |  56 --
 Documentation/vm/page_tables.rst                   |   5 -
 Documentation/vm/physical_memory.rst               |   5 -
 Documentation/vm/process_addrs.rst                 |   5 -
 Documentation/vm/remap_file_pages.rst              |  33 --
 Documentation/vm/shmfs.rst                         |   5 -
 Documentation/vm/slab.rst                          |   5 -
 Documentation/vm/slub.rst                          | 452 ----------------
 Documentation/vm/split_page_table_lock.rst         | 100 ----
 Documentation/vm/swap.rst                          |   5 -
 Documentation/vm/transhuge.rst                     | 187 -------
 Documentation/vm/unevictable-lru.rst               | 554 -------------------
 Documentation/vm/vmalloc.rst                       |   5 -
 Documentation/vm/vmalloced-kernel-stacks.rst       | 153 ------
 Documentation/vm/vmemmap_dedup.rst                 | 223 --------
 Documentation/vm/z3fold.rst                        |  30 --
 Documentation/vm/zsmalloc.rst                      |  82 ---
 MAINTAINERS                                        |  12 +-
 arch/loongarch/Kconfig                             |   2 +-
 arch/powerpc/include/asm/book3s/64/pgtable.h       |   2 +-
 include/linux/hmm.h                                |   4 +-
 include/linux/memremap.h                           |   2 +-
 include/linux/mmu_notifier.h                       |   2 +-
 include/linux/sched/mm.h                           |   4 +-
 include/linux/swap.h                               |   2 +-
 mm/Kconfig                                         |   2 +-
 mm/debug_vm_pgtable.c                              |   2 +-
 mm/frontswap.c                                     |   2 +-
 mm/huge_memory.c                                   |   2 +-
 mm/hugetlb.c                                       |   6 +-
 mm/hugetlb_vmemmap.c                               |   2 +-
 mm/ksm.c                                           |   4 +-
 mm/mmap.c                                          |   2 +-
 mm/rmap.c                                          |   8 +-
 mm/sparse-vmemmap.c                                |   2 +-
 mm/util.c                                          |   2 +-
 tools/vm/page_owner_sort.c                         |   2 +-
 176 files changed, 8355 insertions(+), 8359 deletions(-)
 create mode 100644 Documentation/mm/active_mm.rst
 create mode 100644 Documentation/mm/arch_pgtable_helpers.rst
 create mode 100644 Documentation/mm/balance.rst
 create mode 100644 Documentation/mm/bootmem.rst
 create mode 100644 Documentation/mm/damon/api.rst
 create mode 100644 Documentation/mm/damon/design.rst
 create mode 100644 Documentation/mm/damon/faq.rst
 create mode 100644 Documentation/mm/damon/index.rst
 create mode 100644 Documentation/mm/free_page_reporting.rst
 create mode 100644 Documentation/mm/frontswap.rst
 create mode 100644 Documentation/mm/highmem.rst
 create mode 100644 Documentation/mm/hmm.rst
 create mode 100644 Documentation/mm/hugetlbfs_reserv.rst
 create mode 100644 Documentation/mm/hwpoison.rst
 create mode 100644 Documentation/mm/index.rst
 create mode 100644 Documentation/mm/ksm.rst
 create mode 100644 Documentation/mm/memory-model.rst
 create mode 100644 Documentation/mm/mmu_notifier.rst
 create mode 100644 Documentation/mm/numa.rst
 create mode 100644 Documentation/mm/oom.rst
 create mode 100644 Documentation/mm/overcommit-accounting.rst
 create mode 100644 Documentation/mm/page_allocation.rst
 create mode 100644 Documentation/mm/page_cache.rst
 create mode 100644 Documentation/mm/page_frags.rst
 create mode 100644 Documentation/mm/page_migration.rst
 create mode 100644 Documentation/mm/page_owner.rst
 create mode 100644 Documentation/mm/page_reclaim.rst
 create mode 100644 Documentation/mm/page_table_check.rst
 create mode 100644 Documentation/mm/page_tables.rst
 create mode 100644 Documentation/mm/physical_memory.rst
 create mode 100644 Documentation/mm/process_addrs.rst
 create mode 100644 Documentation/mm/remap_file_pages.rst
 create mode 100644 Documentation/mm/shmfs.rst
 create mode 100644 Documentation/mm/slab.rst
 create mode 100644 Documentation/mm/slub.rst
 create mode 100644 Documentation/mm/split_page_table_lock.rst
 create mode 100644 Documentation/mm/swap.rst
 create mode 100644 Documentation/mm/transhuge.rst
 create mode 100644 Documentation/mm/unevictable-lru.rst
 create mode 100644 Documentation/mm/vmalloc.rst
 create mode 100644 Documentation/mm/vmalloced-kernel-stacks.rst
 create mode 100644 Documentation/mm/vmemmap_dedup.rst
 create mode 100644 Documentation/mm/z3fold.rst
 create mode 100644 Documentation/mm/zsmalloc.rst
 create mode 100644 Documentation/translations/zh_CN/mm/active_mm.rst
 create mode 100644 Documentation/translations/zh_CN/mm/balance.rst
 create mode 100644 Documentation/translations/zh_CN/mm/damon/api.rst
 create mode 100644 Documentation/translations/zh_CN/mm/damon/design.rst
 create mode 100644 Documentation/translations/zh_CN/mm/damon/faq.rst
 create mode 100644 Documentation/translations/zh_CN/mm/damon/index.rst
 create mode 100644 Documentation/translations/zh_CN/mm/free_page_reporting.rst
 create mode 100644 Documentation/translations/zh_CN/mm/frontswap.rst
 create mode 100644 Documentation/translations/zh_CN/mm/highmem.rst
 create mode 100644 Documentation/translations/zh_CN/mm/hmm.rst
 create mode 100644 Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst
 create mode 100644 Documentation/translations/zh_CN/mm/hwpoison.rst
 create mode 100644 Documentation/translations/zh_CN/mm/index.rst
 create mode 100644 Documentation/translations/zh_CN/mm/ksm.rst
 create mode 100644 Documentation/translations/zh_CN/mm/memory-model.rst
 create mode 100644 Documentation/translations/zh_CN/mm/mmu_notifier.rst
 create mode 100644 Documentation/translations/zh_CN/mm/numa.rst
 create mode 100644 Documentation/translations/zh_CN/mm/overcommit-accounting.rst
 create mode 100644 Documentation/translations/zh_CN/mm/page_frags.rst
 create mode 100644 Documentation/translations/zh_CN/mm/page_owner.rst
 create mode 100644 Documentation/translations/zh_CN/mm/page_table_check.rst
 create mode 100644 Documentation/translations/zh_CN/mm/remap_file_pages.rst
 create mode 100644 Documentation/translations/zh_CN/mm/split_page_table_lock.rst
 create mode 100644 Documentation/translations/zh_CN/mm/z3fold.rst
 create mode 100644 Documentation/translations/zh_CN/mm/zsmalloc.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/active_mm.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/balance.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/damon/api.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/damon/design.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/damon/faq.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/damon/index.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/free_page_reporting.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/frontswap.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/highmem.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/hmm.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/hwpoison.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/index.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/ksm.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/memory-model.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/mmu_notifier.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/numa.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/overcommit-accounting.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/page_frags.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/page_owner.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/page_table_check.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/remap_file_pages.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/split_page_table_lock.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/z3fold.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/zsmalloc.rst
 delete mode 100644 Documentation/vm/.gitignore
 delete mode 100644 Documentation/vm/active_mm.rst
 delete mode 100644 Documentation/vm/arch_pgtable_helpers.rst
 delete mode 100644 Documentation/vm/balance.rst
 delete mode 100644 Documentation/vm/bootmem.rst
 delete mode 100644 Documentation/vm/damon/api.rst
 delete mode 100644 Documentation/vm/damon/design.rst
 delete mode 100644 Documentation/vm/damon/faq.rst
 delete mode 100644 Documentation/vm/damon/index.rst
 delete mode 100644 Documentation/vm/free_page_reporting.rst
 delete mode 100644 Documentation/vm/frontswap.rst
 delete mode 100644 Documentation/vm/highmem.rst
 delete mode 100644 Documentation/vm/hmm.rst
 delete mode 100644 Documentation/vm/hugetlbfs_reserv.rst
 delete mode 100644 Documentation/vm/hwpoison.rst
 delete mode 100644 Documentation/vm/index.rst
 delete mode 100644 Documentation/vm/ksm.rst
 delete mode 100644 Documentation/vm/memory-model.rst
 delete mode 100644 Documentation/vm/mmu_notifier.rst
 delete mode 100644 Documentation/vm/numa.rst
 delete mode 100644 Documentation/vm/oom.rst
 delete mode 100644 Documentation/vm/overcommit-accounting.rst
 delete mode 100644 Documentation/vm/page_allocation.rst
 delete mode 100644 Documentation/vm/page_cache.rst
 delete mode 100644 Documentation/vm/page_frags.rst
 delete mode 100644 Documentation/vm/page_migration.rst
 delete mode 100644 Documentation/vm/page_owner.rst
 delete mode 100644 Documentation/vm/page_reclaim.rst
 delete mode 100644 Documentation/vm/page_table_check.rst
 delete mode 100644 Documentation/vm/page_tables.rst
 delete mode 100644 Documentation/vm/physical_memory.rst
 delete mode 100644 Documentation/vm/process_addrs.rst
 delete mode 100644 Documentation/vm/remap_file_pages.rst
 delete mode 100644 Documentation/vm/shmfs.rst
 delete mode 100644 Documentation/vm/slab.rst
 delete mode 100644 Documentation/vm/slub.rst
 delete mode 100644 Documentation/vm/split_page_table_lock.rst
 delete mode 100644 Documentation/vm/swap.rst
 delete mode 100644 Documentation/vm/transhuge.rst
 delete mode 100644 Documentation/vm/unevictable-lru.rst
 delete mode 100644 Documentation/vm/vmalloc.rst
 delete mode 100644 Documentation/vm/vmalloced-kernel-stacks.rst
 delete mode 100644 Documentation/vm/vmemmap_dedup.rst
 delete mode 100644 Documentation/vm/z3fold.rst
 delete mode 100644 Documentation/vm/zsmalloc.rst

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-ksm b/Documentation/ABI/testing/sysfs-kernel-mm-ksm
index 1c9bed5595f5..d244674a9480 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-ksm
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-ksm
@@ -41,7 +41,7 @@ Description:	Kernel Samepage Merging daemon sysfs interface
 		sleep_millisecs: how many milliseconds ksm should sleep between
 		scans.
 
-		See Documentation/vm/ksm.rst for more information.
+		See Documentation/mm/ksm.rst for more information.
 
 What:		/sys/kernel/mm/ksm/merge_across_nodes
 Date:		January 2013
diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab
index c440f4946e12..cd5fb8fa3ddf 100644
--- a/Documentation/ABI/testing/sysfs-kernel-slab
+++ b/Documentation/ABI/testing/sysfs-kernel-slab
@@ -37,7 +37,7 @@ Description:
 		The alloc_calls file is read-only and lists the kernel code
 		locations from which allocations for this cache were performed.
 		The alloc_calls file only contains information if debugging is
-		enabled for that cache (see Documentation/vm/slub.rst).
+		enabled for that cache (see Documentation/mm/slub.rst).
 
 What:		/sys/kernel/slab/<cache>/alloc_fastpath
 Date:		February 2008
@@ -219,7 +219,7 @@ Contact:	Pekka Enberg <penberg@cs.helsinki.fi>,
 Description:
 		The free_calls file is read-only and lists the locations of
 		object frees if slab debugging is enabled (see
-		Documentation/vm/slub.rst).
+		Documentation/mm/slub.rst).
 
 What:		/sys/kernel/slab/<cache>/free_fastpath
 Date:		February 2008
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 2522b11e593f..8c0ea6b6c6a9 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5442,7 +5442,7 @@
 			cache (risks via metadata attacks are mostly
 			unchanged). Debug options disable merging on their
 			own.
-			For more information see Documentation/vm/slub.rst.
+			For more information see Documentation/mm/slub.rst.
 
 	slab_max_order=	[MM, SLAB]
 			Determines the maximum allowed order for slabs.
@@ -5456,13 +5456,13 @@
 			slub_debug can create guard zones around objects and
 			may poison objects when not in use. Also tracks the
 			last alloc / free. For more information see
-			Documentation/vm/slub.rst.
+			Documentation/mm/slub.rst.
 
 	slub_max_order= [MM, SLUB]
 			Determines the maximum allowed order for slabs.
 			A high setting may cause OOMs due to memory
 			fragmentation. For more information see
-			Documentation/vm/slub.rst.
+			Documentation/mm/slub.rst.
 
 	slub_min_objects=	[MM, SLUB]
 			The minimum number of objects per slab. SLUB will
@@ -5471,12 +5471,12 @@
 			the number of objects indicated. The higher the number
 			of objects the smaller the overhead of tracking slabs
 			and the less frequently locks need to be acquired.
-			For more information see Documentation/vm/slub.rst.
+			For more information see Documentation/mm/slub.rst.
 
 	slub_min_order=	[MM, SLUB]
 			Determines the minimum page order for slabs. Must be
 			lower than slub_max_order.
-			For more information see Documentation/vm/slub.rst.
+			For more information see Documentation/mm/slub.rst.
 
 	slub_merge	[MM, SLUB]
 			Same with slab_merge.
diff --git a/Documentation/admin-guide/mm/concepts.rst b/Documentation/admin-guide/mm/concepts.rst
index b966fcff993b..c79f1e336222 100644
--- a/Documentation/admin-guide/mm/concepts.rst
+++ b/Documentation/admin-guide/mm/concepts.rst
@@ -125,7 +125,7 @@ processor. Each bank is referred to as a `node` and for each node Linux
 constructs an independent memory management subsystem. A node has its
 own set of zones, lists of free and used pages and various statistics
 counters. You can find more details about NUMA in
-:ref:`Documentation/vm/numa.rst <numa>` and in
+:ref:`Documentation/mm/numa.rst <numa>` and in
 :ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`.
 
 Page cache
diff --git a/Documentation/admin-guide/mm/damon/index.rst b/Documentation/admin-guide/mm/damon/index.rst
index 61aff88347f3..c4681fa69b9c 100644
--- a/Documentation/admin-guide/mm/damon/index.rst
+++ b/Documentation/admin-guide/mm/damon/index.rst
@@ -4,7 +4,7 @@
 Monitoring Data Accesses
 ========================
 
-:doc:`DAMON </vm/damon/index>` allows light-weight data access monitoring.
+:doc:`DAMON </mm/damon/index>` allows light-weight data access monitoring.
 Using DAMON, users can analyze the memory access patterns of their systems and
 optimize those.
 
diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst
index 46306f1f34b1..a8bd3bd29959 100644
--- a/Documentation/admin-guide/mm/damon/reclaim.rst
+++ b/Documentation/admin-guide/mm/damon/reclaim.rst
@@ -268,4 +268,4 @@ granularity reclamation. ::
 
 .. [1] https://research.google/pubs/pub48551/
 .. [2] https://lwn.net/Articles/787611/
-.. [3] https://www.kernel.org/doc/html/latest/vm/free_page_reporting.html
+.. [3] https://www.kernel.org/doc/html/latest/mm/free_page_reporting.html
diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst
index 1bb7b72414b2..5540a3a40fc9 100644
--- a/Documentation/admin-guide/mm/damon/usage.rst
+++ b/Documentation/admin-guide/mm/damon/usage.rst
@@ -30,11 +30,11 @@ DAMON provides below interfaces for different users.
   <sysfs_interface>`.  This will be removed after next LTS kernel is released,
   so users should move to the :ref:`sysfs interface <sysfs_interface>`.
 - *Kernel Space Programming Interface.*
-  :doc:`This </vm/damon/api>` is for kernel space programmers.  Using this,
+  :doc:`This </mm/damon/api>` is for kernel space programmers.  Using this,
   users can utilize every feature of DAMON most flexibly and efficiently by
   writing kernel space DAMON application programs for you.  You can even extend
   DAMON for various address spaces.  For detail, please refer to the interface
-  :doc:`document </vm/damon/api>`.
+  :doc:`document </mm/damon/api>`.
 
 .. _sysfs_interface:
 
@@ -185,7 +185,7 @@ controls the monitoring overhead, exist.  You can set and get the values by
 writing to and rading from the files.
 
 For more details about the intervals and monitoring regions range, please refer
-to the Design document (:doc:`/vm/damon/design`).
+to the Design document (:doc:`/mm/damon/design`).
 
 contexts/<N>/targets/
 ---------------------
@@ -402,7 +402,7 @@ Attributes
 Users can get and set the ``sampling interval``, ``aggregation interval``,
 ``update interval``, and min/max number of monitoring target regions by
 reading from and writing to the ``attrs`` file.  To know about the monitoring
-attributes in detail, please refer to the :doc:`/vm/damon/design`.  For
+attributes in detail, please refer to the :doc:`/mm/damon/design`.  For
 example, below commands set those values to 5 ms, 100 ms, 1,000 ms, 10 and
 1000, and then check it again::
 
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 5c9aa171a0d3..4a440a7cfeb0 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -760,7 +760,7 @@ and don't use much of it.
 
 The default value is 0.
 
-See Documentation/vm/overcommit-accounting.rst and
+See Documentation/mm/overcommit-accounting.rst and
 mm/util.c::__vm_enough_memory() for more information.
 
 
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index dedd4d853329..5b1188494bcd 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -87,7 +87,7 @@ Memory management
 =================
 
 How to allocate and use memory in the kernel.  Note that there is a lot
-more memory-management documentation in Documentation/vm/index.rst.
+more memory-management documentation in Documentation/mm/index.rst.
 
 .. toctree::
    :maxdepth: 1
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 1bc91fb8c321..8543a59f288f 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -1109,7 +1109,7 @@ CommitLimit
               yield a CommitLimit of 7.3G.
 
               For more details, see the memory overcommit documentation
-              in vm/overcommit-accounting.
+              in mm/overcommit-accounting.
 Committed_AS
               The amount of memory presently allocated on the system.
               The committed memory is a sum of all of the memory which
diff --git a/Documentation/index.rst b/Documentation/index.rst
index 67036a05b771..4737c18c97ff 100644
--- a/Documentation/index.rst
+++ b/Documentation/index.rst
@@ -128,7 +128,7 @@ needed).
    sound/index
    crypto/index
    filesystems/index
-   vm/index
+   mm/index
    bpf/index
    usb/index
    PCI/index
diff --git a/Documentation/mm/active_mm.rst b/Documentation/mm/active_mm.rst
new file mode 100644
index 000000000000..6f8269c284ed
--- /dev/null
+++ b/Documentation/mm/active_mm.rst
@@ -0,0 +1,91 @@
+.. _active_mm:
+
+=========
+Active MM
+=========
+
+::
+
+ List:       linux-kernel
+ Subject:    Re: active_mm
+ From:       Linus Torvalds <torvalds () transmeta ! com>
+ Date:       1999-07-30 21:36:24
+
+ Cc'd to linux-kernel, because I don't write explanations all that often,
+ and when I do I feel better about more people reading them.
+
+ On Fri, 30 Jul 1999, David Mosberger wrote:
+ >
+ > Is there a brief description someplace on how "mm" vs. "active_mm" in
+ > the task_struct are supposed to be used?  (My apologies if this was
+ > discussed on the mailing lists---I just returned from vacation and
+ > wasn't able to follow linux-kernel for a while).
+
+ Basically, the new setup is:
+
+  - we have "real address spaces" and "anonymous address spaces". The
+    difference is that an anonymous address space doesn't care about the
+    user-level page tables at all, so when we do a context switch into an
+    anonymous address space we just leave the previous address space
+    active.
+
+    The obvious use for a "anonymous address space" is any thread that
+    doesn't need any user mappings - all kernel threads basically fall into
+    this category, but even "real" threads can temporarily say that for
+    some amount of time they are not going to be interested in user space,
+    and that the scheduler might as well try to avoid wasting time on
+    switching the VM state around. Currently only the old-style bdflush
+    sync does that.
+
+  - "tsk->mm" points to the "real address space". For an anonymous process,
+    tsk->mm will be NULL, for the logical reason that an anonymous process
+    really doesn't _have_ a real address space at all.
+
+  - however, we obviously need to keep track of which address space we
+    "stole" for such an anonymous user. For that, we have "tsk->active_mm",
+    which shows what the currently active address space is.
+
+    The rule is that for a process with a real address space (ie tsk->mm is
+    non-NULL) the active_mm obviously always has to be the same as the real
+    one.
+
+    For a anonymous process, tsk->mm == NULL, and tsk->active_mm is the
+    "borrowed" mm while the anonymous process is running. When the
+    anonymous process gets scheduled away, the borrowed address space is
+    returned and cleared.
+
+ To support all that, the "struct mm_struct" now has two counters: a
+ "mm_users" counter that is how many "real address space users" there are,
+ and a "mm_count" counter that is the number of "lazy" users (ie anonymous
+ users) plus one if there are any real users.
+
+ Usually there is at least one real user, but it could be that the real
+ user exited on another CPU while a lazy user was still active, so you do
+ actually get cases where you have a address space that is _only_ used by
+ lazy users. That is often a short-lived state, because once that thread
+ gets scheduled away in favour of a real thread, the "zombie" mm gets
+ released because "mm_count" becomes zero.
+
+ Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any
+ more. "init_mm" should be considered just a "lazy context when no other
+ context is available", and in fact it is mainly used just at bootup when
+ no real VM has yet been created. So code that used to check
+
+ 	if (current->mm == &init_mm)
+
+ should generally just do
+
+ 	if (!current->mm)
+
+ instead (which makes more sense anyway - the test is basically one of "do
+ we have a user context", and is generally done by the page fault handler
+ and things like that).
+
+ Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago,
+ because it slightly changes the interfaces to accommodate the alpha (who
+ would have thought it, but the alpha actually ends up having one of the
+ ugliest context switch codes - unlike the other architectures where the MM
+ and register state is separate, the alpha PALcode joins the two, and you
+ need to switch both together).
+
+ (From http://marc.info/?l=linux-kernel&m=93337278602211&w=2)
diff --git a/Documentation/mm/arch_pgtable_helpers.rst b/Documentation/mm/arch_pgtable_helpers.rst
new file mode 100644
index 000000000000..cbaee9e59241
--- /dev/null
+++ b/Documentation/mm/arch_pgtable_helpers.rst
@@ -0,0 +1,260 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. _arch_page_table_helpers:
+
+===============================
+Architecture Page Table Helpers
+===============================
+
+Generic MM expects architectures (with MMU) to provide helpers to create, access
+and modify page table entries at various level for different memory functions.
+These page table helpers need to conform to a common semantics across platforms.
+Following tables describe the expected semantics which can also be tested during
+boot via CONFIG_DEBUG_VM_PGTABLE option. All future changes in here or the debug
+test need to be in sync.
+
+
+PTE Page Table Helpers
+======================
+
++---------------------------+--------------------------------------------------+
+| pte_same                  | Tests whether both PTE entries are the same      |
++---------------------------+--------------------------------------------------+
+| pte_bad                   | Tests a non-table mapped PTE                     |
++---------------------------+--------------------------------------------------+
+| pte_present               | Tests a valid mapped PTE                         |
++---------------------------+--------------------------------------------------+
+| pte_young                 | Tests a young PTE                                |
++---------------------------+--------------------------------------------------+
+| pte_dirty                 | Tests a dirty PTE                                |
++---------------------------+--------------------------------------------------+
+| pte_write                 | Tests a writable PTE                             |
++---------------------------+--------------------------------------------------+
+| pte_special               | Tests a special PTE                              |
++---------------------------+--------------------------------------------------+
+| pte_protnone              | Tests a PROT_NONE PTE                            |
++---------------------------+--------------------------------------------------+
+| pte_devmap                | Tests a ZONE_DEVICE mapped PTE                   |
++---------------------------+--------------------------------------------------+
+| pte_soft_dirty            | Tests a soft dirty PTE                           |
++---------------------------+--------------------------------------------------+
+| pte_swp_soft_dirty        | Tests a soft dirty swapped PTE                   |
++---------------------------+--------------------------------------------------+
+| pte_mkyoung               | Creates a young PTE                              |
++---------------------------+--------------------------------------------------+
+| pte_mkold                 | Creates an old PTE                               |
++---------------------------+--------------------------------------------------+
+| pte_mkdirty               | Creates a dirty PTE                              |
++---------------------------+--------------------------------------------------+
+| pte_mkclean               | Creates a clean PTE                              |
++---------------------------+--------------------------------------------------+
+| pte_mkwrite               | Creates a writable PTE                           |
++---------------------------+--------------------------------------------------+
+| pte_wrprotect             | Creates a write protected PTE                    |
++---------------------------+--------------------------------------------------+
+| pte_mkspecial             | Creates a special PTE                            |
++---------------------------+--------------------------------------------------+
+| pte_mkdevmap              | Creates a ZONE_DEVICE mapped PTE                 |
++---------------------------+--------------------------------------------------+
+| pte_mksoft_dirty          | Creates a soft dirty PTE                         |
++---------------------------+--------------------------------------------------+
+| pte_clear_soft_dirty      | Clears a soft dirty PTE                          |
++---------------------------+--------------------------------------------------+
+| pte_swp_mksoft_dirty      | Creates a soft dirty swapped PTE                 |
++---------------------------+--------------------------------------------------+
+| pte_swp_clear_soft_dirty  | Clears a soft dirty swapped PTE                  |
++---------------------------+--------------------------------------------------+
+| pte_mknotpresent          | Invalidates a mapped PTE                         |
++---------------------------+--------------------------------------------------+
+| ptep_clear                | Clears a PTE                                     |
++---------------------------+--------------------------------------------------+
+| ptep_get_and_clear        | Clears and returns PTE                           |
++---------------------------+--------------------------------------------------+
+| ptep_get_and_clear_full   | Clears and returns PTE (batched PTE unmap)       |
++---------------------------+--------------------------------------------------+
+| ptep_test_and_clear_young | Clears young from a PTE                          |
++---------------------------+--------------------------------------------------+
+| ptep_set_wrprotect        | Converts into a write protected PTE              |
++---------------------------+--------------------------------------------------+
+| ptep_set_access_flags     | Converts into a more permissive PTE              |
++---------------------------+--------------------------------------------------+
+
+
+PMD Page Table Helpers
+======================
+
++---------------------------+--------------------------------------------------+
+| pmd_same                  | Tests whether both PMD entries are the same      |
++---------------------------+--------------------------------------------------+
+| pmd_bad                   | Tests a non-table mapped PMD                     |
++---------------------------+--------------------------------------------------+
+| pmd_leaf                  | Tests a leaf mapped PMD                          |
++---------------------------+--------------------------------------------------+
+| pmd_huge                  | Tests a HugeTLB mapped PMD                       |
++---------------------------+--------------------------------------------------+
+| pmd_trans_huge            | Tests a Transparent Huge Page (THP) at PMD       |
++---------------------------+--------------------------------------------------+
+| pmd_present               | Tests a valid mapped PMD                         |
++---------------------------+--------------------------------------------------+
+| pmd_young                 | Tests a young PMD                                |
++---------------------------+--------------------------------------------------+
+| pmd_dirty                 | Tests a dirty PMD                                |
++---------------------------+--------------------------------------------------+
+| pmd_write                 | Tests a writable PMD                             |
++---------------------------+--------------------------------------------------+
+| pmd_special               | Tests a special PMD                              |
++---------------------------+--------------------------------------------------+
+| pmd_protnone              | Tests a PROT_NONE PMD                            |
++---------------------------+--------------------------------------------------+
+| pmd_devmap                | Tests a ZONE_DEVICE mapped PMD                   |
++---------------------------+--------------------------------------------------+
+| pmd_soft_dirty            | Tests a soft dirty PMD                           |
++---------------------------+--------------------------------------------------+
+| pmd_swp_soft_dirty        | Tests a soft dirty swapped PMD                   |
++---------------------------+--------------------------------------------------+
+| pmd_mkyoung               | Creates a young PMD                              |
++---------------------------+--------------------------------------------------+
+| pmd_mkold                 | Creates an old PMD                               |
++---------------------------+--------------------------------------------------+
+| pmd_mkdirty               | Creates a dirty PMD                              |
++---------------------------+--------------------------------------------------+
+| pmd_mkclean               | Creates a clean PMD                              |
++---------------------------+--------------------------------------------------+
+| pmd_mkwrite               | Creates a writable PMD                           |
++---------------------------+--------------------------------------------------+
+| pmd_wrprotect             | Creates a write protected PMD                    |
++---------------------------+--------------------------------------------------+
+| pmd_mkspecial             | Creates a special PMD                            |
++---------------------------+--------------------------------------------------+
+| pmd_mkdevmap              | Creates a ZONE_DEVICE mapped PMD                 |
++---------------------------+--------------------------------------------------+
+| pmd_mksoft_dirty          | Creates a soft dirty PMD                         |
++---------------------------+--------------------------------------------------+
+| pmd_clear_soft_dirty      | Clears a soft dirty PMD                          |
++---------------------------+--------------------------------------------------+
+| pmd_swp_mksoft_dirty      | Creates a soft dirty swapped PMD                 |
++---------------------------+--------------------------------------------------+
+| pmd_swp_clear_soft_dirty  | Clears a soft dirty swapped PMD                  |
++---------------------------+--------------------------------------------------+
+| pmd_mkinvalid             | Invalidates a mapped PMD [1]                     |
++---------------------------+--------------------------------------------------+
+| pmd_set_huge              | Creates a PMD huge mapping                       |
++---------------------------+--------------------------------------------------+
+| pmd_clear_huge            | Clears a PMD huge mapping                        |
++---------------------------+--------------------------------------------------+
+| pmdp_get_and_clear        | Clears a PMD                                     |
++---------------------------+--------------------------------------------------+
+| pmdp_get_and_clear_full   | Clears a PMD                                     |
++---------------------------+--------------------------------------------------+
+| pmdp_test_and_clear_young | Clears young from a PMD                          |
++---------------------------+--------------------------------------------------+
+| pmdp_set_wrprotect        | Converts into a write protected PMD              |
++---------------------------+--------------------------------------------------+
+| pmdp_set_access_flags     | Converts into a more permissive PMD              |
++---------------------------+--------------------------------------------------+
+
+
+PUD Page Table Helpers
+======================
+
++---------------------------+--------------------------------------------------+
+| pud_same                  | Tests whether both PUD entries are the same      |
++---------------------------+--------------------------------------------------+
+| pud_bad                   | Tests a non-table mapped PUD                     |
++---------------------------+--------------------------------------------------+
+| pud_leaf                  | Tests a leaf mapped PUD                          |
++---------------------------+--------------------------------------------------+
+| pud_huge                  | Tests a HugeTLB mapped PUD                       |
++---------------------------+--------------------------------------------------+
+| pud_trans_huge            | Tests a Transparent Huge Page (THP) at PUD       |
++---------------------------+--------------------------------------------------+
+| pud_present               | Tests a valid mapped PUD                         |
++---------------------------+--------------------------------------------------+
+| pud_young                 | Tests a young PUD                                |
++---------------------------+--------------------------------------------------+
+| pud_dirty                 | Tests a dirty PUD                                |
++---------------------------+--------------------------------------------------+
+| pud_write                 | Tests a writable PUD                             |
++---------------------------+--------------------------------------------------+
+| pud_devmap                | Tests a ZONE_DEVICE mapped PUD                   |
++---------------------------+--------------------------------------------------+
+| pud_mkyoung               | Creates a young PUD                              |
++---------------------------+--------------------------------------------------+
+| pud_mkold                 | Creates an old PUD                               |
++---------------------------+--------------------------------------------------+
+| pud_mkdirty               | Creates a dirty PUD                              |
++---------------------------+--------------------------------------------------+
+| pud_mkclean               | Creates a clean PUD                              |
++---------------------------+--------------------------------------------------+
+| pud_mkwrite               | Creates a writable PUD                           |
++---------------------------+--------------------------------------------------+
+| pud_wrprotect             | Creates a write protected PUD                    |
++---------------------------+--------------------------------------------------+
+| pud_mkdevmap              | Creates a ZONE_DEVICE mapped PUD                 |
++---------------------------+--------------------------------------------------+
+| pud_mkinvalid             | Invalidates a mapped PUD [1]                     |
++---------------------------+--------------------------------------------------+
+| pud_set_huge              | Creates a PUD huge mapping                       |
++---------------------------+--------------------------------------------------+
+| pud_clear_huge            | Clears a PUD huge mapping                        |
++---------------------------+--------------------------------------------------+
+| pudp_get_and_clear        | Clears a PUD                                     |
++---------------------------+--------------------------------------------------+
+| pudp_get_and_clear_full   | Clears a PUD                                     |
++---------------------------+--------------------------------------------------+
+| pudp_test_and_clear_young | Clears young from a PUD                          |
++---------------------------+--------------------------------------------------+
+| pudp_set_wrprotect        | Converts into a write protected PUD              |
++---------------------------+--------------------------------------------------+
+| pudp_set_access_flags     | Converts into a more permissive PUD              |
++---------------------------+--------------------------------------------------+
+
+
+HugeTLB Page Table Helpers
+==========================
+
++---------------------------+--------------------------------------------------+
+| pte_huge                  | Tests a HugeTLB                                  |
++---------------------------+--------------------------------------------------+
+| pte_mkhuge                | Creates a HugeTLB                                |
++---------------------------+--------------------------------------------------+
+| huge_pte_dirty            | Tests a dirty HugeTLB                            |
++---------------------------+--------------------------------------------------+
+| huge_pte_write            | Tests a writable HugeTLB                         |
++---------------------------+--------------------------------------------------+
+| huge_pte_mkdirty          | Creates a dirty HugeTLB                          |
++---------------------------+--------------------------------------------------+
+| huge_pte_mkwrite          | Creates a writable HugeTLB                       |
++---------------------------+--------------------------------------------------+
+| huge_pte_wrprotect        | Creates a write protected HugeTLB                |
++---------------------------+--------------------------------------------------+
+| huge_ptep_get_and_clear   | Clears a HugeTLB                                 |
++---------------------------+--------------------------------------------------+
+| huge_ptep_set_wrprotect   | Converts into a write protected HugeTLB          |
++---------------------------+--------------------------------------------------+
+| huge_ptep_set_access_flags  | Converts into a more permissive HugeTLB        |
++---------------------------+--------------------------------------------------+
+
+
+SWAP Page Table Helpers
+========================
+
++---------------------------+--------------------------------------------------+
+| __pte_to_swp_entry        | Creates a swapped entry (arch) from a mapped PTE |
++---------------------------+--------------------------------------------------+
+| __swp_to_pte_entry        | Creates a mapped PTE from a swapped entry (arch) |
++---------------------------+--------------------------------------------------+
+| __pmd_to_swp_entry        | Creates a swapped entry (arch) from a mapped PMD |
++---------------------------+--------------------------------------------------+
+| __swp_to_pmd_entry        | Creates a mapped PMD from a swapped entry (arch) |
++---------------------------+--------------------------------------------------+
+| is_migration_entry        | Tests a migration (read or write) swapped entry  |
++-------------------------------+----------------------------------------------+
+| is_writable_migration_entry   | Tests a write migration swapped entry        |
++-------------------------------+----------------------------------------------+
+| make_readable_migration_entry | Creates a read migration swapped entry       |
++-------------------------------+----------------------------------------------+
+| make_writable_migration_entry | Creates a write migration swapped entry      |
++-------------------------------+----------------------------------------------+
+
+[1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/
diff --git a/Documentation/mm/balance.rst b/Documentation/mm/balance.rst
new file mode 100644
index 000000000000..6a1fadf3e173
--- /dev/null
+++ b/Documentation/mm/balance.rst
@@ -0,0 +1,102 @@
+.. _balance:
+
+================
+Memory Balancing
+================
+
+Started Jan 2000 by Kanoj Sarcar <kanoj@sgi.com>
+
+Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as
+well as for non __GFP_IO allocations.
+
+The first reason why a caller may avoid reclaim is that the caller can not
+sleep due to holding a spinlock or is in interrupt context. The second may
+be that the caller is willing to fail the allocation without incurring the
+overhead of page reclaim. This may happen for opportunistic high-order
+allocation requests that have order-0 fallback options. In such cases,
+the caller may also wish to avoid waking kswapd.
+
+__GFP_IO allocation requests are made to prevent file system deadlocks.
+
+In the absence of non sleepable allocation requests, it seems detrimental
+to be doing balancing. Page reclamation can be kicked off lazily, that
+is, only when needed (aka zone free memory is 0), instead of making it
+a proactive process.
+
+That being said, the kernel should try to fulfill requests for direct
+mapped pages from the direct mapped pool, instead of falling back on
+the dma pool, so as to keep the dma pool filled for dma requests (atomic
+or not). A similar argument applies to highmem and direct mapped pages.
+OTOH, if there is a lot of free dma pages, it is preferable to satisfy
+regular memory requests by allocating one from the dma pool, instead
+of incurring the overhead of regular zone balancing.
+
+In 2.2, memory balancing/page reclamation would kick off only when the
+_total_ number of free pages fell below 1/64 th of total memory. With the
+right ratio of dma and regular memory, it is quite possible that balancing
+would not be done even when the dma zone was completely empty. 2.2 has
+been running production machines of varying memory sizes, and seems to be
+doing fine even with the presence of this problem. In 2.3, due to
+HIGHMEM, this problem is aggravated.
+
+In 2.3, zone balancing can be done in one of two ways: depending on the
+zone size (and possibly of the size of lower class zones), we can decide
+at init time how many free pages we should aim for while balancing any
+zone. The good part is, while balancing, we do not need to look at sizes
+of lower class zones, the bad part is, we might do too frequent balancing
+due to ignoring possibly lower usage in the lower class zones. Also,
+with a slight change in the allocation routine, it is possible to reduce
+the memclass() macro to be a simple equality.
+
+Another possible solution is that we balance only when the free memory
+of a zone _and_ all its lower class zones falls below 1/64th of the
+total memory in the zone and its lower class zones. This fixes the 2.2
+balancing problem, and stays as close to 2.2 behavior as possible. Also,
+the balancing algorithm works the same way on the various architectures,
+which have different numbers and types of zones. If we wanted to get
+fancy, we could assign different weights to free pages in different
+zones in the future.
+
+Note that if the size of the regular zone is huge compared to dma zone,
+it becomes less significant to consider the free dma pages while
+deciding whether to balance the regular zone. The first solution
+becomes more attractive then.
+
+The appended patch implements the second solution. It also "fixes" two
+problems: first, kswapd is woken up as in 2.2 on low memory conditions
+for non-sleepable allocations. Second, the HIGHMEM zone is also balanced,
+so as to give a fighting chance for replace_with_highmem() to get a
+HIGHMEM page, as well as to ensure that HIGHMEM allocations do not
+fall back into regular zone. This also makes sure that HIGHMEM pages
+are not leaked (for example, in situations where a HIGHMEM page is in
+the swapcache but is not being used by anyone)
+
+kswapd also needs to know about the zones it should balance. kswapd is
+primarily needed in a situation where balancing can not be done,
+probably because all allocation requests are coming from intr context
+and all process contexts are sleeping. For 2.3, kswapd does not really
+need to balance the highmem zone, since intr context does not request
+highmem pages. kswapd looks at the zone_wake_kswapd field in the zone
+structure to decide whether a zone needs balancing.
+
+Page stealing from process memory and shm is done if stealing the page would
+alleviate memory pressure on any zone in the page's node that has fallen below
+its watermark.
+
+watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These
+are per-zone fields, used to determine when a zone needs to be balanced. When
+the number of pages falls below watermark[WMARK_MIN], the hysteric field
+low_on_memory gets set. This stays set till the number of free pages becomes
+watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will
+try to free some pages in the zone (providing GFP_WAIT is set in the request).
+Orthogonal to this, is the decision to poke kswapd to free some zone pages.
+That decision is not hysteresis based, and is done when the number of free
+pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set.
+
+
+(Good) Ideas that I have heard:
+
+1. Dynamic experience should influence balancing: number of failed requests
+   for a zone can be tracked and fed into the balancing scheme (jalvo@mbay.net)
+2. Implement a replace_with_highmem()-like replace_with_regular() to preserve
+   dma pages. (lkd@tantalophile.demon.co.uk)
diff --git a/Documentation/mm/bootmem.rst b/Documentation/mm/bootmem.rst
new file mode 100644
index 000000000000..eb2b31eedfa1
--- /dev/null
+++ b/Documentation/mm/bootmem.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========
+Boot Memory
+===========
diff --git a/Documentation/mm/damon/api.rst b/Documentation/mm/damon/api.rst
new file mode 100644
index 000000000000..08f34df45523
--- /dev/null
+++ b/Documentation/mm/damon/api.rst
@@ -0,0 +1,20 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=============
+API Reference
+=============
+
+Kernel space programs can use every feature of DAMON using below APIs.  All you
+need to do is including ``damon.h``, which is located in ``include/linux/`` of
+the source tree.
+
+Structures
+==========
+
+.. kernel-doc:: include/linux/damon.h
+
+
+Functions
+=========
+
+.. kernel-doc:: mm/damon/core.c
diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst
new file mode 100644
index 000000000000..0cff6fac6b7e
--- /dev/null
+++ b/Documentation/mm/damon/design.rst
@@ -0,0 +1,176 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======
+Design
+======
+
+Configurable Layers
+===================
+
+DAMON provides data access monitoring functionality while making the accuracy
+and the overhead controllable.  The fundamental access monitorings require
+primitives that dependent on and optimized for the target address space.  On
+the other hand, the accuracy and overhead tradeoff mechanism, which is the core
+of DAMON, is in the pure logic space.  DAMON separates the two parts in
+different layers and defines its interface to allow various low level
+primitives implementations configurable with the core logic.  We call the low
+level primitives implementations monitoring operations.
+
+Due to this separated design and the configurable interface, users can extend
+DAMON for any address space by configuring the core logics with appropriate
+monitoring operations.  If appropriate one is not provided, users can implement
+the operations on their own.
+
+For example, physical memory, virtual memory, swap space, those for specific
+processes, NUMA nodes, files, and backing memory devices would be supportable.
+Also, if some architectures or devices support special optimized access check
+primitives, those will be easily configurable.
+
+
+Reference Implementations of Address Space Specific Monitoring Operations
+=========================================================================
+
+The monitoring operations are defined in two parts:
+
+1. Identification of the monitoring target address range for the address space.
+2. Access check of specific address range in the target space.
+
+DAMON currently provides the implementations of the operations for the physical
+and virtual address spaces. Below two subsections describe how those work.
+
+
+VMA-based Target Address Range Construction
+-------------------------------------------
+
+This is only for the virtual address space monitoring operations
+implementation.  That for the physical address space simply asks users to
+manually set the monitoring target address ranges.
+
+Only small parts in the super-huge virtual address space of the processes are
+mapped to the physical memory and accessed.  Thus, tracking the unmapped
+address regions is just wasteful.  However, because DAMON can deal with some
+level of noise using the adaptive regions adjustment mechanism, tracking every
+mapping is not strictly required but could even incur a high overhead in some
+cases.  That said, too huge unmapped areas inside the monitoring target should
+be removed to not take the time for the adaptive mechanism.
+
+For the reason, this implementation converts the complex mappings to three
+distinct regions that cover every mapped area of the address space.  The two
+gaps between the three regions are the two biggest unmapped areas in the given
+address space.  The two biggest unmapped areas would be the gap between the
+heap and the uppermost mmap()-ed region, and the gap between the lowermost
+mmap()-ed region and the stack in most of the cases.  Because these gaps are
+exceptionally huge in usual address spaces, excluding these will be sufficient
+to make a reasonable trade-off.  Below shows this in detail::
+
+    <heap>
+    <BIG UNMAPPED REGION 1>
+    <uppermost mmap()-ed region>
+    (small mmap()-ed regions and munmap()-ed regions)
+    <lowermost mmap()-ed region>
+    <BIG UNMAPPED REGION 2>
+    <stack>
+
+
+PTE Accessed-bit Based Access Check
+-----------------------------------
+
+Both of the implementations for physical and virtual address spaces use PTE
+Accessed-bit for basic access checks.  Only one difference is the way of
+finding the relevant PTE Accessed bit(s) from the address.  While the
+implementation for the virtual address walks the page table for the target task
+of the address, the implementation for the physical address walks every page
+table having a mapping to the address.  In this way, the implementations find
+and clear the bit(s) for next sampling target address and checks whether the
+bit(s) set again after one sampling period.  This could disturb other kernel
+subsystems using the Accessed bits, namely Idle page tracking and the reclaim
+logic.  DAMON does nothing to avoid disturbing Idle page tracking, so handling
+the interference is the responsibility of sysadmins.  However, it solves the
+conflict with the reclaim logic using ``PG_idle`` and ``PG_young`` page flags,
+as Idle page tracking does.
+
+
+Address Space Independent Core Mechanisms
+=========================================
+
+Below four sections describe each of the DAMON core mechanisms and the five
+monitoring attributes, ``sampling interval``, ``aggregation interval``,
+``update interval``, ``minimum number of regions``, and ``maximum number of
+regions``.
+
+
+Access Frequency Monitoring
+---------------------------
+
+The output of DAMON says what pages are how frequently accessed for a given
+duration.  The resolution of the access frequency is controlled by setting
+``sampling interval`` and ``aggregation interval``.  In detail, DAMON checks
+access to each page per ``sampling interval`` and aggregates the results.  In
+other words, counts the number of the accesses to each page.  After each
+``aggregation interval`` passes, DAMON calls callback functions that previously
+registered by users so that users can read the aggregated results and then
+clears the results.  This can be described in below simple pseudo-code::
+
+    while monitoring_on:
+        for page in monitoring_target:
+            if accessed(page):
+                nr_accesses[page] += 1
+        if time() % aggregation_interval == 0:
+            for callback in user_registered_callbacks:
+                callback(monitoring_target, nr_accesses)
+            for page in monitoring_target:
+                nr_accesses[page] = 0
+        sleep(sampling interval)
+
+The monitoring overhead of this mechanism will arbitrarily increase as the
+size of the target workload grows.
+
+
+Region Based Sampling
+---------------------
+
+To avoid the unbounded increase of the overhead, DAMON groups adjacent pages
+that assumed to have the same access frequencies into a region.  As long as the
+assumption (pages in a region have the same access frequencies) is kept, only
+one page in the region is required to be checked.  Thus, for each ``sampling
+interval``, DAMON randomly picks one page in each region, waits for one
+``sampling interval``, checks whether the page is accessed meanwhile, and
+increases the access frequency of the region if so.  Therefore, the monitoring
+overhead is controllable by setting the number of regions.  DAMON allows users
+to set the minimum and the maximum number of regions for the trade-off.
+
+This scheme, however, cannot preserve the quality of the output if the
+assumption is not guaranteed.
+
+
+Adaptive Regions Adjustment
+---------------------------
+
+Even somehow the initial monitoring target regions are well constructed to
+fulfill the assumption (pages in same region have similar access frequencies),
+the data access pattern can be dynamically changed.  This will result in low
+monitoring quality.  To keep the assumption as much as possible, DAMON
+adaptively merges and splits each region based on their access frequency.
+
+For each ``aggregation interval``, it compares the access frequencies of
+adjacent regions and merges those if the frequency difference is small.  Then,
+after it reports and clears the aggregated access frequency of each region, it
+splits each region into two or three regions if the total number of regions
+will not exceed the user-specified maximum number of regions after the split.
+
+In this way, DAMON provides its best-effort quality and minimal overhead while
+keeping the bounds users set for their trade-off.
+
+
+Dynamic Target Space Updates Handling
+-------------------------------------
+
+The monitoring target address range could dynamically changed.  For example,
+virtual memory could be dynamically mapped and unmapped.  Physical memory could
+be hot-plugged.
+
+As the changes could be quite frequent in some cases, DAMON allows the
+monitoring operations to check dynamic changes including memory mapping changes
+and applies it to monitoring operations-related data structures such as the
+abstracted monitoring target memory area only for each of a user-specified time
+interval (``update interval``).
diff --git a/Documentation/mm/damon/faq.rst b/Documentation/mm/damon/faq.rst
new file mode 100644
index 000000000000..dde7e2414ee6
--- /dev/null
+++ b/Documentation/mm/damon/faq.rst
@@ -0,0 +1,50 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================
+Frequently Asked Questions
+==========================
+
+Why a new subsystem, instead of extending perf or other user space tools?
+=========================================================================
+
+First, because it needs to be lightweight as much as possible so that it can be
+used online, any unnecessary overhead such as kernel - user space context
+switching cost should be avoided.  Second, DAMON aims to be used by other
+programs including the kernel.  Therefore, having a dependency on specific
+tools like perf is not desirable.  These are the two biggest reasons why DAMON
+is implemented in the kernel space.
+
+
+Can 'idle pages tracking' or 'perf mem' substitute DAMON?
+=========================================================
+
+Idle page tracking is a low level primitive for access check of the physical
+address space.  'perf mem' is similar, though it can use sampling to minimize
+the overhead.  On the other hand, DAMON is a higher-level framework for the
+monitoring of various address spaces.  It is focused on memory management
+optimization and provides sophisticated accuracy/overhead handling mechanisms.
+Therefore, 'idle pages tracking' and 'perf mem' could provide a subset of
+DAMON's output, but cannot substitute DAMON.
+
+
+Does DAMON support virtual memory only?
+=======================================
+
+No.  The core of the DAMON is address space independent.  The address space
+specific monitoring operations including monitoring target regions
+constructions and actual access checks can be implemented and configured on the
+DAMON core by the users.  In this way, DAMON users can monitor any address
+space with any access check technique.
+
+Nonetheless, DAMON provides vma/rmap tracking and PTE Accessed bit check based
+implementations of the address space dependent functions for the virtual memory
+and the physical memory by default, for a reference and convenient use.
+
+
+Can I simply monitor page granularity?
+======================================
+
+Yes.  You can do so by setting the ``min_nr_regions`` attribute higher than the
+working set size divided by the page size.  Because the monitoring target
+regions size is forced to be ``>=page size``, the region split will make no
+effect.
diff --git a/Documentation/mm/damon/index.rst b/Documentation/mm/damon/index.rst
new file mode 100644
index 000000000000..48c0bbff98b2
--- /dev/null
+++ b/Documentation/mm/damon/index.rst
@@ -0,0 +1,29 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================
+DAMON: Data Access MONitor
+==========================
+
+DAMON is a data access monitoring framework subsystem for the Linux kernel.
+The core mechanisms of DAMON (refer to :doc:`design` for the detail) make it
+
+ - *accurate* (the monitoring output is useful enough for DRAM level memory
+   management; It might not appropriate for CPU Cache levels, though),
+ - *light-weight* (the monitoring overhead is low enough to be applied online),
+   and
+ - *scalable* (the upper-bound of the overhead is in constant range regardless
+   of the size of target workloads).
+
+Using this framework, therefore, the kernel's memory management mechanisms can
+make advanced decisions.  Experimental memory management optimization works
+that incurring high data accesses monitoring overhead could implemented again.
+In user space, meanwhile, users who have some special workloads can write
+personalized applications for better understanding and optimizations of their
+workloads and systems.
+
+.. toctree::
+   :maxdepth: 2
+
+   faq
+   design
+   api
diff --git a/Documentation/mm/free_page_reporting.rst b/Documentation/mm/free_page_reporting.rst
new file mode 100644
index 000000000000..8c05e62d8b2b
--- /dev/null
+++ b/Documentation/mm/free_page_reporting.rst
@@ -0,0 +1,40 @@
+.. _free_page_reporting:
+
+=====================
+Free Page Reporting
+=====================
+
+Free page reporting is an API by which a device can register to receive
+lists of pages that are currently unused by the system. This is useful in
+the case of virtualization where a guest is then able to use this data to
+notify the hypervisor that it is no longer using certain pages in memory.
+
+For the driver, typically a balloon driver, to use of this functionality
+it will allocate and initialize a page_reporting_dev_info structure. The
+field within the structure it will populate is the "report" function
+pointer used to process the scatterlist. It must also guarantee that it can
+handle at least PAGE_REPORTING_CAPACITY worth of scatterlist entries per
+call to the function. A call to page_reporting_register will register the
+page reporting interface with the reporting framework assuming no other
+page reporting devices are already registered.
+
+Once registered the page reporting API will begin reporting batches of
+pages to the driver. The API will start reporting pages 2 seconds after
+the interface is registered and will continue to do so 2 seconds after any
+page of a sufficiently high order is freed.
+
+Pages reported will be stored in the scatterlist passed to the reporting
+function with the final entry having the end bit set in entry nent - 1.
+While pages are being processed by the report function they will not be
+accessible to the allocator. Once the report function has been completed
+the pages will be returned to the free area from which they were obtained.
+
+Prior to removing a driver that is making use of free page reporting it
+is necessary to call page_reporting_unregister to have the
+page_reporting_dev_info structure that is currently in use by free page
+reporting removed. Doing this will prevent further reports from being
+issued via the interface. If another driver or the same driver is
+registered it is possible for it to resume where the previous driver had
+left off in terms of reporting free pages.
+
+Alexander Duyck, Dec 04, 2019
diff --git a/Documentation/mm/frontswap.rst b/Documentation/mm/frontswap.rst
new file mode 100644
index 000000000000..feecc5e24477
--- /dev/null
+++ b/Documentation/mm/frontswap.rst
@@ -0,0 +1,266 @@
+.. _frontswap:
+
+=========
+Frontswap
+=========
+
+Frontswap provides a "transcendent memory" interface for swap pages.
+In some environments, dramatic performance savings may be obtained because
+swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk.
+
+.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/
+
+Frontswap is so named because it can be thought of as the opposite of
+a "backing" store for a swap device.  The storage is assumed to be
+a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming
+to the requirements of transcendent memory (such as Xen's "tmem", or
+in-kernel compressed memory, aka "zcache", or future RAM-like devices);
+this pseudo-RAM device is not directly accessible or addressable by the
+kernel and is of unknown and possibly time-varying size.  The driver
+links itself to frontswap by calling frontswap_register_ops to set the
+frontswap_ops funcs appropriately and the functions it provides must
+conform to certain policies as follows:
+
+An "init" prepares the device to receive frontswap pages associated
+with the specified swap device number (aka "type").  A "store" will
+copy the page to transcendent memory and associate it with the type and
+offset associated with the page. A "load" will copy the page, if found,
+from transcendent memory into kernel memory, but will NOT remove the page
+from transcendent memory.  An "invalidate_page" will remove the page
+from transcendent memory and an "invalidate_area" will remove ALL pages
+associated with the swap type (e.g., like swapoff) and notify the "device"
+to refuse further stores with that swap type.
+
+Once a page is successfully stored, a matching load on the page will normally
+succeed.  So when the kernel finds itself in a situation where it needs
+to swap out a page, it first attempts to use frontswap.  If the store returns
+success, the data has been successfully saved to transcendent memory and
+a disk write and, if the data is later read back, a disk read are avoided.
+If a store returns failure, transcendent memory has rejected the data, and the
+page can be written to swap as usual.
+
+Note that if a page is stored and the page already exists in transcendent memory
+(a "duplicate" store), either the store succeeds and the data is overwritten,
+or the store fails AND the page is invalidated.  This ensures stale data may
+never be obtained from frontswap.
+
+If properly configured, monitoring of frontswap is done via debugfs in
+the `/sys/kernel/debug/frontswap` directory.  The effectiveness of
+frontswap can be measured (across all swap devices) with:
+
+``failed_stores``
+	how many store attempts have failed
+
+``loads``
+	how many loads were attempted (all should succeed)
+
+``succ_stores``
+	how many store attempts have succeeded
+
+``invalidates``
+	how many invalidates were attempted
+
+A backend implementation may provide additional metrics.
+
+FAQ
+===
+
+* Where's the value?
+
+When a workload starts swapping, performance falls through the floor.
+Frontswap significantly increases performance in many such workloads by
+providing a clean, dynamic interface to read and write swap pages to
+"transcendent memory" that is otherwise not directly addressable to the kernel.
+This interface is ideal when data is transformed to a different form
+and size (such as with compression) or secretly moved (as might be
+useful for write-balancing for some RAM-like devices).  Swap pages (and
+evicted page-cache pages) are a great use for this kind of slower-than-RAM-
+but-much-faster-than-disk "pseudo-RAM device".
+
+Frontswap with a fairly small impact on the kernel,
+provides a huge amount of flexibility for more dynamic, flexible RAM
+utilization in various system configurations:
+
+In the single kernel case, aka "zcache", pages are compressed and
+stored in local memory, thus increasing the total anonymous pages
+that can be safely kept in RAM.  Zcache essentially trades off CPU
+cycles used in compression/decompression for better memory utilization.
+Benchmarks have shown little or no impact when memory pressure is
+low while providing a significant performance improvement (25%+)
+on some workloads under high memory pressure.
+
+"RAMster" builds on zcache by adding "peer-to-peer" transcendent memory
+support for clustered systems.  Frontswap pages are locally compressed
+as in zcache, but then "remotified" to another system's RAM.  This
+allows RAM to be dynamically load-balanced back-and-forth as needed,
+i.e. when system A is overcommitted, it can swap to system B, and
+vice versa.  RAMster can also be configured as a memory server so
+many servers in a cluster can swap, dynamically as needed, to a single
+server configured with a large amount of RAM... without pre-configuring
+how much of the RAM is available for each of the clients!
+
+In the virtual case, the whole point of virtualization is to statistically
+multiplex physical resources across the varying demands of multiple
+virtual machines.  This is really hard to do with RAM and efforts to do
+it well with no kernel changes have essentially failed (except in some
+well-publicized special-case workloads).
+Specifically, the Xen Transcendent Memory backend allows otherwise
+"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
+virtual machines, but the pages can be compressed and deduplicated to
+optimize RAM utilization.  And when guest OS's are induced to surrender
+underutilized RAM (e.g. with "selfballooning"), sudden unexpected
+memory pressure may result in swapping; frontswap allows those pages
+to be swapped to and from hypervisor RAM (if overall host system memory
+conditions allow), thus mitigating the potentially awful performance impact
+of unplanned swapping.
+
+A KVM implementation is underway and has been RFC'ed to lkml.  And,
+using frontswap, investigation is also underway on the use of NVM as
+a memory extension technology.
+
+* Sure there may be performance advantages in some situations, but
+  what's the space/time overhead of frontswap?
+
+If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into
+nothingness and the only overhead is a few extra bytes per swapon'ed
+swap device.  If CONFIG_FRONTSWAP is enabled but no frontswap "backend"
+registers, there is one extra global variable compared to zero for
+every swap page read or written.  If CONFIG_FRONTSWAP is enabled
+AND a frontswap backend registers AND the backend fails every "store"
+request (i.e. provides no memory despite claiming it might),
+CPU overhead is still negligible -- and since every frontswap fail
+precedes a swap page write-to-disk, the system is highly likely
+to be I/O bound and using a small fraction of a percent of a CPU
+will be irrelevant anyway.
+
+As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend
+registers, one bit is allocated for every swap page for every swap
+device that is swapon'd.  This is added to the EIGHT bits (which
+was sixteen until about 2.6.34) that the kernel already allocates
+for every swap page for every swap device that is swapon'd.  (Hugh
+Dickins has observed that frontswap could probably steal one of
+the existing eight bits, but let's worry about that minor optimization
+later.)  For very large swap disks (which are rare) on a standard
+4K pagesize, this is 1MB per 32GB swap.
+
+When swap pages are stored in transcendent memory instead of written
+out to disk, there is a side effect that this may create more memory
+pressure that can potentially outweigh the other advantages.  A
+backend, such as zcache, must implement policies to carefully (but
+dynamically) manage memory limits to ensure this doesn't happen.
+
+* OK, how about a quick overview of what this frontswap patch does
+  in terms that a kernel hacker can grok?
+
+Let's assume that a frontswap "backend" has registered during
+kernel initialization; this registration indicates that this
+frontswap backend has access to some "memory" that is not directly
+accessible by the kernel.  Exactly how much memory it provides is
+entirely dynamic and random.
+
+Whenever a swap-device is swapon'd frontswap_init() is called,
+passing the swap device number (aka "type") as a parameter.
+This notifies frontswap to expect attempts to "store" swap pages
+associated with that number.
+
+Whenever the swap subsystem is readying a page to write to a swap
+device (c.f swap_writepage()), frontswap_store is called.  Frontswap
+consults with the frontswap backend and if the backend says it does NOT
+have room, frontswap_store returns -1 and the kernel swaps the page
+to the swap device as normal.  Note that the response from the frontswap
+backend is unpredictable to the kernel; it may choose to never accept a
+page, it could accept every ninth page, or it might accept every
+page.  But if the backend does accept a page, the data from the page
+has already been copied and associated with the type and offset,
+and the backend guarantees the persistence of the data.  In this case,
+frontswap sets a bit in the "frontswap_map" for the swap device
+corresponding to the page offset on the swap device to which it would
+otherwise have written the data.
+
+When the swap subsystem needs to swap-in a page (swap_readpage()),
+it first calls frontswap_load() which checks the frontswap_map to
+see if the page was earlier accepted by the frontswap backend.  If
+it was, the page of data is filled from the frontswap backend and
+the swap-in is complete.  If not, the normal swap-in code is
+executed to obtain the page of data from the real swap device.
+
+So every time the frontswap backend accepts a page, a swap device read
+and (potentially) a swap device write are replaced by a "frontswap backend
+store" and (possibly) a "frontswap backend loads", which are presumably much
+faster.
+
+* Can't frontswap be configured as a "special" swap device that is
+  just higher priority than any real swap device (e.g. like zswap,
+  or maybe swap-over-nbd/NFS)?
+
+No.  First, the existing swap subsystem doesn't allow for any kind of
+swap hierarchy.  Perhaps it could be rewritten to accommodate a hierarchy,
+but this would require fairly drastic changes.  Even if it were
+rewritten, the existing swap subsystem uses the block I/O layer which
+assumes a swap device is fixed size and any page in it is linearly
+addressable.  Frontswap barely touches the existing swap subsystem,
+and works around the constraints of the block I/O subsystem to provide
+a great deal of flexibility and dynamicity.
+
+For example, the acceptance of any swap page by the frontswap backend is
+entirely unpredictable. This is critical to the definition of frontswap
+backends because it grants completely dynamic discretion to the
+backend.  In zcache, one cannot know a priori how compressible a page is.
+"Poorly" compressible pages can be rejected, and "poorly" can itself be
+defined dynamically depending on current memory constraints.
+
+Further, frontswap is entirely synchronous whereas a real swap
+device is, by definition, asynchronous and uses block I/O.  The
+block I/O layer is not only unnecessary, but may perform "optimizations"
+that are inappropriate for a RAM-oriented device including delaying
+the write of some pages for a significant amount of time.  Synchrony is
+required to ensure the dynamicity of the backend and to avoid thorny race
+conditions that would unnecessarily and greatly complicate frontswap
+and/or the block I/O subsystem.  That said, only the initial "store"
+and "load" operations need be synchronous.  A separate asynchronous thread
+is free to manipulate the pages stored by frontswap.  For example,
+the "remotification" thread in RAMster uses standard asynchronous
+kernel sockets to move compressed frontswap pages to a remote machine.
+Similarly, a KVM guest-side implementation could do in-guest compression
+and use "batched" hypercalls.
+
+In a virtualized environment, the dynamicity allows the hypervisor
+(or host OS) to do "intelligent overcommit".  For example, it can
+choose to accept pages only until host-swapping might be imminent,
+then force guests to do their own swapping.
+
+There is a downside to the transcendent memory specifications for
+frontswap:  Since any "store" might fail, there must always be a real
+slot on a real swap device to swap the page.  Thus frontswap must be
+implemented as a "shadow" to every swapon'd device with the potential
+capability of holding every page that the swap device might have held
+and the possibility that it might hold no pages at all.  This means
+that frontswap cannot contain more pages than the total of swapon'd
+swap devices.  For example, if NO swap device is configured on some
+installation, frontswap is useless.  Swapless portable devices
+can still use frontswap but a backend for such devices must configure
+some kind of "ghost" swap device and ensure that it is never used.
+
+* Why this weird definition about "duplicate stores"?  If a page
+  has been previously successfully stored, can't it always be
+  successfully overwritten?
+
+Nearly always it can, but no, sometimes it cannot.  Consider an example
+where data is compressed and the original 4K page has been compressed
+to 1K.  Now an attempt is made to overwrite the page with data that
+is non-compressible and so would take the entire 4K.  But the backend
+has no more space.  In this case, the store must be rejected.  Whenever
+frontswap rejects a store that would overwrite, it also must invalidate
+the old data and ensure that it is no longer accessible.  Since the
+swap subsystem then writes the new data to the read swap device,
+this is the correct course of action to ensure coherency.
+
+* Why does the frontswap patch create the new include file swapfile.h?
+
+The frontswap code depends on some swap-subsystem-internal data
+structures that have, over the years, moved back and forth between
+static and global.  This seemed a reasonable compromise:  Define
+them as global but declare them in a new include file that isn't
+included by the large number of source files that include swap.h.
+
+Dan Magenheimer, last updated April 9, 2012
diff --git a/Documentation/mm/highmem.rst b/Documentation/mm/highmem.rst
new file mode 100644
index 000000000000..c9887f241c6c
--- /dev/null
+++ b/Documentation/mm/highmem.rst
@@ -0,0 +1,167 @@
+.. _highmem:
+
+====================
+High Memory Handling
+====================
+
+By: Peter Zijlstra <a.p.zijlstra@chello.nl>
+
+.. contents:: :local:
+
+What Is High Memory?
+====================
+
+High memory (highmem) is used when the size of physical memory approaches or
+exceeds the maximum size of virtual memory.  At that point it becomes
+impossible for the kernel to keep all of the available physical memory mapped
+at all times.  This means the kernel needs to start using temporary mappings of
+the pieces of physical memory that it wants to access.
+
+The part of (physical) memory not covered by a permanent mapping is what we
+refer to as 'highmem'.  There are various architecture dependent constraints on
+where exactly that border lies.
+
+In the i386 arch, for example, we choose to map the kernel into every process's
+VM space so that we don't have to pay the full TLB invalidation costs for
+kernel entry/exit.  This means the available virtual memory space (4GiB on
+i386) has to be divided between user and kernel space.
+
+The traditional split for architectures using this approach is 3:1, 3GiB for
+userspace and the top 1GiB for kernel space::
+
+		+--------+ 0xffffffff
+		| Kernel |
+		+--------+ 0xc0000000
+		|        |
+		| User   |
+		|        |
+		+--------+ 0x00000000
+
+This means that the kernel can at most map 1GiB of physical memory at any one
+time, but because we need virtual address space for other things - including
+temporary maps to access the rest of the physical memory - the actual direct
+map will typically be less (usually around ~896MiB).
+
+Other architectures that have mm context tagged TLBs can have separate kernel
+and user maps.  Some hardware (like some ARMs), however, have limited virtual
+space when they use mm context tags.
+
+
+Temporary Virtual Mappings
+==========================
+
+The kernel contains several ways of creating temporary mappings. The following
+list shows them in order of preference of use.
+
+* kmap_local_page().  This function is used to require short term mappings.
+  It can be invoked from any context (including interrupts) but the mappings
+  can only be used in the context which acquired them.
+
+  This function should be preferred, where feasible, over all the others.
+
+  These mappings are thread-local and CPU-local, meaning that the mapping
+  can only be accessed from within this thread and the thread is bound the
+  CPU while the mapping is active. Even if the thread is preempted (since
+  preemption is never disabled by the function) the CPU can not be
+  unplugged from the system via CPU-hotplug until the mapping is disposed.
+
+  It's valid to take pagefaults in a local kmap region, unless the context
+  in which the local mapping is acquired does not allow it for other reasons.
+
+  kmap_local_page() always returns a valid virtual address and it is assumed
+  that kunmap_local() will never fail.
+
+  Nesting kmap_local_page() and kmap_atomic() mappings is allowed to a certain
+  extent (up to KMAP_TYPE_NR) but their invocations have to be strictly ordered
+  because the map implementation is stack based. See kmap_local_page() kdocs
+  (included in the "Functions" section) for details on how to manage nested
+  mappings.
+
+* kmap_atomic().  This permits a very short duration mapping of a single
+  page.  Since the mapping is restricted to the CPU that issued it, it
+  performs well, but the issuing task is therefore required to stay on that
+  CPU until it has finished, lest some other task displace its mappings.
+
+  kmap_atomic() may also be used by interrupt contexts, since it does not
+  sleep and the callers too may not sleep until after kunmap_atomic() is
+  called.
+
+  Each call of kmap_atomic() in the kernel creates a non-preemptible section
+  and disable pagefaults. This could be a source of unwanted latency. Therefore
+  users should prefer kmap_local_page() instead of kmap_atomic().
+
+  It is assumed that k[un]map_atomic() won't fail.
+
+* kmap().  This should be used to make short duration mapping of a single
+  page with no restrictions on preemption or migration. It comes with an
+  overhead as mapping space is restricted and protected by a global lock
+  for synchronization. When mapping is no longer needed, the address that
+  the page was mapped to must be released with kunmap().
+
+  Mapping changes must be propagated across all the CPUs. kmap() also
+  requires global TLB invalidation when the kmap's pool wraps and it might
+  block when the mapping space is fully utilized until a slot becomes
+  available. Therefore, kmap() is only callable from preemptible context.
+
+  All the above work is necessary if a mapping must last for a relatively
+  long time but the bulk of high-memory mappings in the kernel are
+  short-lived and only used in one place. This means that the cost of
+  kmap() is mostly wasted in such cases. kmap() was not intended for long
+  term mappings but it has morphed in that direction and its use is
+  strongly discouraged in newer code and the set of the preceding functions
+  should be preferred.
+
+  On 64-bit systems, calls to kmap_local_page(), kmap_atomic() and kmap() have
+  no real work to do because a 64-bit address space is more than sufficient to
+  address all the physical memory whose pages are permanently mapped.
+
+* vmap().  This can be used to make a long duration mapping of multiple
+  physical pages into a contiguous virtual space.  It needs global
+  synchronization to unmap.
+
+
+Cost of Temporary Mappings
+==========================
+
+The cost of creating temporary mappings can be quite high.  The arch has to
+manipulate the kernel's page tables, the data TLB and/or the MMU's registers.
+
+If CONFIG_HIGHMEM is not set, then the kernel will try and create a mapping
+simply with a bit of arithmetic that will convert the page struct address into
+a pointer to the page contents rather than juggling mappings about.  In such a
+case, the unmap operation may be a null operation.
+
+If CONFIG_MMU is not set, then there can be no temporary mappings and no
+highmem.  In such a case, the arithmetic approach will also be used.
+
+
+i386 PAE
+========
+
+The i386 arch, under some circumstances, will permit you to stick up to 64GiB
+of RAM into your 32-bit machine.  This has a number of consequences:
+
+* Linux needs a page-frame structure for each page in the system and the
+  pageframes need to live in the permanent mapping, which means:
+
+* you can have 896M/sizeof(struct page) page-frames at most; with struct
+  page being 32-bytes that would end up being something in the order of 112G
+  worth of pages; the kernel, however, needs to store more than just
+  page-frames in that memory...
+
+* PAE makes your page tables larger - which slows the system down as more
+  data has to be accessed to traverse in TLB fills and the like.  One
+  advantage is that PAE has more PTE bits and can provide advanced features
+  like NX and PAT.
+
+The general recommendation is that you don't use more than 8GiB on a 32-bit
+machine - although more might work for you and your workload, you're pretty
+much on your own - don't expect kernel developers to really care much if things
+come apart.
+
+
+Functions
+=========
+
+.. kernel-doc:: include/linux/highmem.h
+.. kernel-doc:: include/linux/highmem-internal.h
diff --git a/Documentation/mm/hmm.rst b/Documentation/mm/hmm.rst
new file mode 100644
index 000000000000..f2a59ed82ed3
--- /dev/null
+++ b/Documentation/mm/hmm.rst
@@ -0,0 +1,452 @@
+.. _hmm:
+
+=====================================
+Heterogeneous Memory Management (HMM)
+=====================================
+
+Provide infrastructure and helpers to integrate non-conventional memory (device
+memory like GPU on board memory) into regular kernel path, with the cornerstone
+of this being specialized struct page for such memory (see sections 5 to 7 of
+this document).
+
+HMM also provides optional helpers for SVM (Share Virtual Memory), i.e.,
+allowing a device to transparently access program addresses coherently with
+the CPU meaning that any valid pointer on the CPU is also a valid pointer
+for the device. This is becoming mandatory to simplify the use of advanced
+heterogeneous computing where GPU, DSP, or FPGA are used to perform various
+computations on behalf of a process.
+
+This document is divided as follows: in the first section I expose the problems
+related to using device specific memory allocators. In the second section, I
+expose the hardware limitations that are inherent to many platforms. The third
+section gives an overview of the HMM design. The fourth section explains how
+CPU page-table mirroring works and the purpose of HMM in this context. The
+fifth section deals with how device memory is represented inside the kernel.
+Finally, the last section presents a new migration helper that allows
+leveraging the device DMA engine.
+
+.. contents:: :local:
+
+Problems of using a device specific memory allocator
+====================================================
+
+Devices with a large amount of on board memory (several gigabytes) like GPUs
+have historically managed their memory through dedicated driver specific APIs.
+This creates a disconnect between memory allocated and managed by a device
+driver and regular application memory (private anonymous, shared memory, or
+regular file backed memory). From here on I will refer to this aspect as split
+address space. I use shared address space to refer to the opposite situation:
+i.e., one in which any application memory region can be used by a device
+transparently.
+
+Split address space happens because devices can only access memory allocated
+through a device specific API. This implies that all memory objects in a program
+are not equal from the device point of view which complicates large programs
+that rely on a wide set of libraries.
+
+Concretely, this means that code that wants to leverage devices like GPUs needs
+to copy objects between generically allocated memory (malloc, mmap private, mmap
+share) and memory allocated through the device driver API (this still ends up
+with an mmap but of the device file).
+
+For flat data sets (array, grid, image, ...) this isn't too hard to achieve but
+for complex data sets (list, tree, ...) it's hard to get right. Duplicating a
+complex data set needs to re-map all the pointer relations between each of its
+elements. This is error prone and programs get harder to debug because of the
+duplicate data set and addresses.
+
+Split address space also means that libraries cannot transparently use data
+they are getting from the core program or another library and thus each library
+might have to duplicate its input data set using the device specific memory
+allocator. Large projects suffer from this and waste resources because of the
+various memory copies.
+
+Duplicating each library API to accept as input or output memory allocated by
+each device specific allocator is not a viable option. It would lead to a
+combinatorial explosion in the library entry points.
+
+Finally, with the advance of high level language constructs (in C++ but in
+other languages too) it is now possible for the compiler to leverage GPUs and
+other devices without programmer knowledge. Some compiler identified patterns
+are only do-able with a shared address space. It is also more reasonable to use
+a shared address space for all other patterns.
+
+
+I/O bus, device memory characteristics
+======================================
+
+I/O buses cripple shared address spaces due to a few limitations. Most I/O
+buses only allow basic memory access from device to main memory; even cache
+coherency is often optional. Access to device memory from a CPU is even more
+limited. More often than not, it is not cache coherent.
+
+If we only consider the PCIE bus, then a device can access main memory (often
+through an IOMMU) and be cache coherent with the CPUs. However, it only allows
+a limited set of atomic operations from the device on main memory. This is worse
+in the other direction: the CPU can only access a limited range of the device
+memory and cannot perform atomic operations on it. Thus device memory cannot
+be considered the same as regular memory from the kernel point of view.
+
+Another crippling factor is the limited bandwidth (~32GBytes/s with PCIE 4.0
+and 16 lanes). This is 33 times less than the fastest GPU memory (1 TBytes/s).
+The final limitation is latency. Access to main memory from the device has an
+order of magnitude higher latency than when the device accesses its own memory.
+
+Some platforms are developing new I/O buses or additions/modifications to PCIE
+to address some of these limitations (OpenCAPI, CCIX). They mainly allow
+two-way cache coherency between CPU and device and allow all atomic operations the
+architecture supports. Sadly, not all platforms are following this trend and
+some major architectures are left without hardware solutions to these problems.
+
+So for shared address space to make sense, not only must we allow devices to
+access any memory but we must also permit any memory to be migrated to device
+memory while the device is using it (blocking CPU access while it happens).
+
+
+Shared address space and migration
+==================================
+
+HMM intends to provide two main features. The first one is to share the address
+space by duplicating the CPU page table in the device page table so the same
+address points to the same physical memory for any valid main memory address in
+the process address space.
+
+To achieve this, HMM offers a set of helpers to populate the device page table
+while keeping track of CPU page table updates. Device page table updates are
+not as easy as CPU page table updates. To update the device page table, you must
+allocate a buffer (or use a pool of pre-allocated buffers) and write GPU
+specific commands in it to perform the update (unmap, cache invalidations, and
+flush, ...). This cannot be done through common code for all devices. Hence
+why HMM provides helpers to factor out everything that can be while leaving the
+hardware specific details to the device driver.
+
+The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that
+allows allocating a struct page for each page of device memory. Those pages
+are special because the CPU cannot map them. However, they allow migrating
+main memory to device memory using existing migration mechanisms and everything
+looks like a page that is swapped out to disk from the CPU point of view. Using a
+struct page gives the easiest and cleanest integration with existing mm
+mechanisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE
+memory for the device memory and second to perform migration. Policy decisions
+of what and when to migrate is left to the device driver.
+
+Note that any CPU access to a device page triggers a page fault and a migration
+back to main memory. For example, when a page backing a given CPU address A is
+migrated from a main memory page to a device page, then any CPU access to
+address A triggers a page fault and initiates a migration back to main memory.
+
+With these two features, HMM not only allows a device to mirror process address
+space and keeps both CPU and device page tables synchronized, but also
+leverages device memory by migrating the part of the data set that is actively being
+used by the device.
+
+
+Address space mirroring implementation and API
+==============================================
+
+Address space mirroring's main objective is to allow duplication of a range of
+CPU page table into a device page table; HMM helps keep both synchronized. A
+device driver that wants to mirror a process address space must start with the
+registration of a mmu_interval_notifier::
+
+ int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
+				  struct mm_struct *mm, unsigned long start,
+				  unsigned long length,
+				  const struct mmu_interval_notifier_ops *ops);
+
+During the ops->invalidate() callback the device driver must perform the
+update action to the range (mark range read only, or fully unmap, etc.). The
+device must complete the update before the driver callback returns.
+
+When the device driver wants to populate a range of virtual addresses, it can
+use::
+
+  int hmm_range_fault(struct hmm_range *range);
+
+It will trigger a page fault on missing or read-only entries if write access is
+requested (see below). Page faults use the generic mm page fault code path just
+like a CPU page fault.
+
+Both functions copy CPU page table entries into their pfns array argument. Each
+entry in that array corresponds to an address in the virtual range. HMM
+provides a set of flags to help the driver identify special CPU page table
+entries.
+
+Locking within the sync_cpu_device_pagetables() callback is the most important
+aspect the driver must respect in order to keep things properly synchronized.
+The usage pattern is::
+
+ int driver_populate_range(...)
+ {
+      struct hmm_range range;
+      ...
+
+      range.notifier = &interval_sub;
+      range.start = ...;
+      range.end = ...;
+      range.hmm_pfns = ...;
+
+      if (!mmget_not_zero(interval_sub->notifier.mm))
+          return -EFAULT;
+
+ again:
+      range.notifier_seq = mmu_interval_read_begin(&interval_sub);
+      mmap_read_lock(mm);
+      ret = hmm_range_fault(&range);
+      if (ret) {
+          mmap_read_unlock(mm);
+          if (ret == -EBUSY)
+                 goto again;
+          return ret;
+      }
+      mmap_read_unlock(mm);
+
+      take_lock(driver->update);
+      if (mmu_interval_read_retry(&ni, range.notifier_seq) {
+          release_lock(driver->update);
+          goto again;
+      }
+
+      /* Use pfns array content to update device page table,
+       * under the update lock */
+
+      release_lock(driver->update);
+      return 0;
+ }
+
+The driver->update lock is the same lock that the driver takes inside its
+invalidate() callback. That lock must be held before calling
+mmu_interval_read_retry() to avoid any race with a concurrent CPU page table
+update.
+
+Leverage default_flags and pfn_flags_mask
+=========================================
+
+The hmm_range struct has 2 fields, default_flags and pfn_flags_mask, that specify
+fault or snapshot policy for the whole range instead of having to set them
+for each entry in the pfns array.
+
+For instance if the device driver wants pages for a range with at least read
+permission, it sets::
+
+    range->default_flags = HMM_PFN_REQ_FAULT;
+    range->pfn_flags_mask = 0;
+
+and calls hmm_range_fault() as described above. This will fill fault all pages
+in the range with at least read permission.
+
+Now let's say the driver wants to do the same except for one page in the range for
+which it wants to have write permission. Now driver set::
+
+    range->default_flags = HMM_PFN_REQ_FAULT;
+    range->pfn_flags_mask = HMM_PFN_REQ_WRITE;
+    range->pfns[index_of_write] = HMM_PFN_REQ_WRITE;
+
+With this, HMM will fault in all pages with at least read (i.e., valid) and for the
+address == range->start + (index_of_write << PAGE_SHIFT) it will fault with
+write permission i.e., if the CPU pte does not have write permission set then HMM
+will call handle_mm_fault().
+
+After hmm_range_fault completes the flag bits are set to the current state of
+the page tables, ie HMM_PFN_VALID | HMM_PFN_WRITE will be set if the page is
+writable.
+
+
+Represent and manage device memory from core kernel point of view
+=================================================================
+
+Several different designs were tried to support device memory. The first one
+used a device specific data structure to keep information about migrated memory
+and HMM hooked itself in various places of mm code to handle any access to
+addresses that were backed by device memory. It turns out that this ended up
+replicating most of the fields of struct page and also needed many kernel code
+paths to be updated to understand this new kind of memory.
+
+Most kernel code paths never try to access the memory behind a page
+but only care about struct page contents. Because of this, HMM switched to
+directly using struct page for device memory which left most kernel code paths
+unaware of the difference. We only need to make sure that no one ever tries to
+map those pages from the CPU side.
+
+Migration to and from device memory
+===================================
+
+Because the CPU cannot access device memory directly, the device driver must
+use hardware DMA or device specific load/store instructions to migrate data.
+The migrate_vma_setup(), migrate_vma_pages(), and migrate_vma_finalize()
+functions are designed to make drivers easier to write and to centralize common
+code across drivers.
+
+Before migrating pages to device private memory, special device private
+``struct page`` need to be created. These will be used as special "swap"
+page table entries so that a CPU process will fault if it tries to access
+a page that has been migrated to device private memory.
+
+These can be allocated and freed with::
+
+    struct resource *res;
+    struct dev_pagemap pagemap;
+
+    res = request_free_mem_region(&iomem_resource, /* number of bytes */,
+                                  "name of driver resource");
+    pagemap.type = MEMORY_DEVICE_PRIVATE;
+    pagemap.range.start = res->start;
+    pagemap.range.end = res->end;
+    pagemap.nr_range = 1;
+    pagemap.ops = &device_devmem_ops;
+    memremap_pages(&pagemap, numa_node_id());
+
+    memunmap_pages(&pagemap);
+    release_mem_region(pagemap.range.start, range_len(&pagemap.range));
+
+There are also devm_request_free_mem_region(), devm_memremap_pages(),
+devm_memunmap_pages(), and devm_release_mem_region() when the resources can
+be tied to a ``struct device``.
+
+The overall migration steps are similar to migrating NUMA pages within system
+memory (see :ref:`Page migration <page_migration>`) but the steps are split
+between device driver specific code and shared common code:
+
+1. ``mmap_read_lock()``
+
+   The device driver has to pass a ``struct vm_area_struct`` to
+   migrate_vma_setup() so the mmap_read_lock() or mmap_write_lock() needs to
+   be held for the duration of the migration.
+
+2. ``migrate_vma_setup(struct migrate_vma *args)``
+
+   The device driver initializes the ``struct migrate_vma`` fields and passes
+   the pointer to migrate_vma_setup(). The ``args->flags`` field is used to
+   filter which source pages should be migrated. For example, setting
+   ``MIGRATE_VMA_SELECT_SYSTEM`` will only migrate system memory and
+   ``MIGRATE_VMA_SELECT_DEVICE_PRIVATE`` will only migrate pages residing in
+   device private memory. If the latter flag is set, the ``args->pgmap_owner``
+   field is used to identify device private pages owned by the driver. This
+   avoids trying to migrate device private pages residing in other devices.
+   Currently only anonymous private VMA ranges can be migrated to or from
+   system memory and device private memory.
+
+   One of the first steps migrate_vma_setup() does is to invalidate other
+   device's MMUs with the ``mmu_notifier_invalidate_range_start(()`` and
+   ``mmu_notifier_invalidate_range_end()`` calls around the page table
+   walks to fill in the ``args->src`` array with PFNs to be migrated.
+   The ``invalidate_range_start()`` callback is passed a
+   ``struct mmu_notifier_range`` with the ``event`` field set to
+   ``MMU_NOTIFY_MIGRATE`` and the ``owner`` field set to
+   the ``args->pgmap_owner`` field passed to migrate_vma_setup(). This is
+   allows the device driver to skip the invalidation callback and only
+   invalidate device private MMU mappings that are actually migrating.
+   This is explained more in the next section.
+
+   While walking the page tables, a ``pte_none()`` or ``is_zero_pfn()``
+   entry results in a valid "zero" PFN stored in the ``args->src`` array.
+   This lets the driver allocate device private memory and clear it instead
+   of copying a page of zeros. Valid PTE entries to system memory or
+   device private struct pages will be locked with ``lock_page()``, isolated
+   from the LRU (if system memory since device private pages are not on
+   the LRU), unmapped from the process, and a special migration PTE is
+   inserted in place of the original PTE.
+   migrate_vma_setup() also clears the ``args->dst`` array.
+
+3. The device driver allocates destination pages and copies source pages to
+   destination pages.
+
+   The driver checks each ``src`` entry to see if the ``MIGRATE_PFN_MIGRATE``
+   bit is set and skips entries that are not migrating. The device driver
+   can also choose to skip migrating a page by not filling in the ``dst``
+   array for that page.
+
+   The driver then allocates either a device private struct page or a
+   system memory page, locks the page with ``lock_page()``, and fills in the
+   ``dst`` array entry with::
+
+     dst[i] = migrate_pfn(page_to_pfn(dpage));
+
+   Now that the driver knows that this page is being migrated, it can
+   invalidate device private MMU mappings and copy device private memory
+   to system memory or another device private page. The core Linux kernel
+   handles CPU page table invalidations so the device driver only has to
+   invalidate its own MMU mappings.
+
+   The driver can use ``migrate_pfn_to_page(src[i])`` to get the
+   ``struct page`` of the source and either copy the source page to the
+   destination or clear the destination device private memory if the pointer
+   is ``NULL`` meaning the source page was not populated in system memory.
+
+4. ``migrate_vma_pages()``
+
+   This step is where the migration is actually "committed".
+
+   If the source page was a ``pte_none()`` or ``is_zero_pfn()`` page, this
+   is where the newly allocated page is inserted into the CPU's page table.
+   This can fail if a CPU thread faults on the same page. However, the page
+   table is locked and only one of the new pages will be inserted.
+   The device driver will see that the ``MIGRATE_PFN_MIGRATE`` bit is cleared
+   if it loses the race.
+
+   If the source page was locked, isolated, etc. the source ``struct page``
+   information is now copied to destination ``struct page`` finalizing the
+   migration on the CPU side.
+
+5. Device driver updates device MMU page tables for pages still migrating,
+   rolling back pages not migrating.
+
+   If the ``src`` entry still has ``MIGRATE_PFN_MIGRATE`` bit set, the device
+   driver can update the device MMU and set the write enable bit if the
+   ``MIGRATE_PFN_WRITE`` bit is set.
+
+6. ``migrate_vma_finalize()``
+
+   This step replaces the special migration page table entry with the new
+   page's page table entry and releases the reference to the source and
+   destination ``struct page``.
+
+7. ``mmap_read_unlock()``
+
+   The lock can now be released.
+
+Exclusive access memory
+=======================
+
+Some devices have features such as atomic PTE bits that can be used to implement
+atomic access to system memory. To support atomic operations to a shared virtual
+memory page such a device needs access to that page which is exclusive of any
+userspace access from the CPU. The ``make_device_exclusive_range()`` function
+can be used to make a memory range inaccessible from userspace.
+
+This replaces all mappings for pages in the given range with special swap
+entries. Any attempt to access the swap entry results in a fault which is
+resovled by replacing the entry with the original mapping. A driver gets
+notified that the mapping has been changed by MMU notifiers, after which point
+it will no longer have exclusive access to the page. Exclusive access is
+guranteed to last until the driver drops the page lock and page reference, at
+which point any CPU faults on the page may proceed as described.
+
+Memory cgroup (memcg) and rss accounting
+========================================
+
+For now, device memory is accounted as any regular page in rss counters (either
+anonymous if device page is used for anonymous, file if device page is used for
+file backed page, or shmem if device page is used for shared memory). This is a
+deliberate choice to keep existing applications, that might start using device
+memory without knowing about it, running unimpacted.
+
+A drawback is that the OOM killer might kill an application using a lot of
+device memory and not a lot of regular system memory and thus not freeing much
+system memory. We want to gather more real world experience on how applications
+and system react under memory pressure in the presence of device memory before
+deciding to account device memory differently.
+
+
+Same decision was made for memory cgroup. Device memory pages are accounted
+against same memory cgroup a regular page would be accounted to. This does
+simplify migration to and from device memory. This also means that migration
+back from device memory to regular memory cannot fail because it would
+go above memory cgroup limit. We might revisit this choice latter on once we
+get more experience in how device memory is used and its impact on memory
+resource control.
+
+
+Note that device memory can never be pinned by a device driver nor through GUP
+and thus such memory is always free upon process exit. Or when last reference
+is dropped in case of shared memory or file backed memory.
diff --git a/Documentation/mm/hugetlbfs_reserv.rst b/Documentation/mm/hugetlbfs_reserv.rst
new file mode 100644
index 000000000000..f143954e0d05
--- /dev/null
+++ b/Documentation/mm/hugetlbfs_reserv.rst
@@ -0,0 +1,596 @@
+.. _hugetlbfs_reserve:
+
+=====================
+Hugetlbfs Reservation
+=====================
+
+Overview
+========
+
+Huge pages as described at :ref:`hugetlbpage` are typically
+preallocated for application use.  These huge pages are instantiated in a
+task's address space at page fault time if the VMA indicates huge pages are
+to be used.  If no huge page exists at page fault time, the task is sent
+a SIGBUS and often dies an unhappy death.  Shortly after huge page support
+was added, it was determined that it would be better to detect a shortage
+of huge pages at mmap() time.  The idea is that if there were not enough
+huge pages to cover the mapping, the mmap() would fail.  This was first
+done with a simple check in the code at mmap() time to determine if there
+were enough free huge pages to cover the mapping.  Like most things in the
+kernel, the code has evolved over time.  However, the basic idea was to
+'reserve' huge pages at mmap() time to ensure that huge pages would be
+available for page faults in that mapping.  The description below attempts to
+describe how huge page reserve processing is done in the v4.10 kernel.
+
+
+Audience
+========
+This description is primarily targeted at kernel developers who are modifying
+hugetlbfs code.
+
+
+The Data Structures
+===================
+
+resv_huge_pages
+	This is a global (per-hstate) count of reserved huge pages.  Reserved
+	huge pages are only available to the task which reserved them.
+	Therefore, the number of huge pages generally available is computed
+	as (``free_huge_pages - resv_huge_pages``).
+Reserve Map
+	A reserve map is described by the structure::
+
+		struct resv_map {
+			struct kref refs;
+			spinlock_t lock;
+			struct list_head regions;
+			long adds_in_progress;
+			struct list_head region_cache;
+			long region_cache_count;
+		};
+
+	There is one reserve map for each huge page mapping in the system.
+	The regions list within the resv_map describes the regions within
+	the mapping.  A region is described as::
+
+		struct file_region {
+			struct list_head link;
+			long from;
+			long to;
+		};
+
+	The 'from' and 'to' fields of the file region structure are huge page
+	indices into the mapping.  Depending on the type of mapping, a
+	region in the reserv_map may indicate reservations exist for the
+	range, or reservations do not exist.
+Flags for MAP_PRIVATE Reservations
+	These are stored in the bottom bits of the reservation map pointer.
+
+	``#define HPAGE_RESV_OWNER    (1UL << 0)``
+		Indicates this task is the owner of the reservations
+		associated with the mapping.
+	``#define HPAGE_RESV_UNMAPPED (1UL << 1)``
+		Indicates task originally mapping this range (and creating
+		reserves) has unmapped a page from this task (the child)
+		due to a failed COW.
+Page Flags
+	The PagePrivate page flag is used to indicate that a huge page
+	reservation must be restored when the huge page is freed.  More
+	details will be discussed in the "Freeing huge pages" section.
+
+
+Reservation Map Location (Private or Shared)
+============================================
+
+A huge page mapping or segment is either private or shared.  If private,
+it is typically only available to a single address space (task).  If shared,
+it can be mapped into multiple address spaces (tasks).  The location and
+semantics of the reservation map is significantly different for the two types
+of mappings.  Location differences are:
+
+- For private mappings, the reservation map hangs off the VMA structure.
+  Specifically, vma->vm_private_data.  This reserve map is created at the
+  time the mapping (mmap(MAP_PRIVATE)) is created.
+- For shared mappings, the reservation map hangs off the inode.  Specifically,
+  inode->i_mapping->private_data.  Since shared mappings are always backed
+  by files in the hugetlbfs filesystem, the hugetlbfs code ensures each inode
+  contains a reservation map.  As a result, the reservation map is allocated
+  when the inode is created.
+
+
+Creating Reservations
+=====================
+Reservations are created when a huge page backed shared memory segment is
+created (shmget(SHM_HUGETLB)) or a mapping is created via mmap(MAP_HUGETLB).
+These operations result in a call to the routine hugetlb_reserve_pages()::
+
+	int hugetlb_reserve_pages(struct inode *inode,
+				  long from, long to,
+				  struct vm_area_struct *vma,
+				  vm_flags_t vm_flags)
+
+The first thing hugetlb_reserve_pages() does is check if the NORESERVE
+flag was specified in either the shmget() or mmap() call.  If NORESERVE
+was specified, then this routine returns immediately as no reservations
+are desired.
+
+The arguments 'from' and 'to' are huge page indices into the mapping or
+underlying file.  For shmget(), 'from' is always 0 and 'to' corresponds to
+the length of the segment/mapping.  For mmap(), the offset argument could
+be used to specify the offset into the underlying file.  In such a case,
+the 'from' and 'to' arguments have been adjusted by this offset.
+
+One of the big differences between PRIVATE and SHARED mappings is the way
+in which reservations are represented in the reservation map.
+
+- For shared mappings, an entry in the reservation map indicates a reservation
+  exists or did exist for the corresponding page.  As reservations are
+  consumed, the reservation map is not modified.
+- For private mappings, the lack of an entry in the reservation map indicates
+  a reservation exists for the corresponding page.  As reservations are
+  consumed, entries are added to the reservation map.  Therefore, the
+  reservation map can also be used to determine which reservations have
+  been consumed.
+
+For private mappings, hugetlb_reserve_pages() creates the reservation map and
+hangs it off the VMA structure.  In addition, the HPAGE_RESV_OWNER flag is set
+to indicate this VMA owns the reservations.
+
+The reservation map is consulted to determine how many huge page reservations
+are needed for the current mapping/segment.  For private mappings, this is
+always the value (to - from).  However, for shared mappings it is possible that
+some reservations may already exist within the range (to - from).  See the
+section :ref:`Reservation Map Modifications <resv_map_modifications>`
+for details on how this is accomplished.
+
+The mapping may be associated with a subpool.  If so, the subpool is consulted
+to ensure there is sufficient space for the mapping.  It is possible that the
+subpool has set aside reservations that can be used for the mapping.  See the
+section :ref:`Subpool Reservations <sub_pool_resv>` for more details.
+
+After consulting the reservation map and subpool, the number of needed new
+reservations is known.  The routine hugetlb_acct_memory() is called to check
+for and take the requested number of reservations.  hugetlb_acct_memory()
+calls into routines that potentially allocate and adjust surplus page counts.
+However, within those routines the code is simply checking to ensure there
+are enough free huge pages to accommodate the reservation.  If there are,
+the global reservation count resv_huge_pages is adjusted something like the
+following::
+
+	if (resv_needed <= (resv_huge_pages - free_huge_pages))
+		resv_huge_pages += resv_needed;
+
+Note that the global lock hugetlb_lock is held when checking and adjusting
+these counters.
+
+If there were enough free huge pages and the global count resv_huge_pages
+was adjusted, then the reservation map associated with the mapping is
+modified to reflect the reservations.  In the case of a shared mapping, a
+file_region will exist that includes the range 'from' - 'to'.  For private
+mappings, no modifications are made to the reservation map as lack of an
+entry indicates a reservation exists.
+
+If hugetlb_reserve_pages() was successful, the global reservation count and
+reservation map associated with the mapping will be modified as required to
+ensure reservations exist for the range 'from' - 'to'.
+
+.. _consume_resv:
+
+Consuming Reservations/Allocating a Huge Page
+=============================================
+
+Reservations are consumed when huge pages associated with the reservations
+are allocated and instantiated in the corresponding mapping.  The allocation
+is performed within the routine alloc_huge_page()::
+
+	struct page *alloc_huge_page(struct vm_area_struct *vma,
+				     unsigned long addr, int avoid_reserve)
+
+alloc_huge_page is passed a VMA pointer and a virtual address, so it can
+consult the reservation map to determine if a reservation exists.  In addition,
+alloc_huge_page takes the argument avoid_reserve which indicates reserves
+should not be used even if it appears they have been set aside for the
+specified address.  The avoid_reserve argument is most often used in the case
+of Copy on Write and Page Migration where additional copies of an existing
+page are being allocated.
+
+The helper routine vma_needs_reservation() is called to determine if a
+reservation exists for the address within the mapping(vma).  See the section
+:ref:`Reservation Map Helper Routines <resv_map_helpers>` for detailed
+information on what this routine does.
+The value returned from vma_needs_reservation() is generally
+0 or 1.  0 if a reservation exists for the address, 1 if no reservation exists.
+If a reservation does not exist, and there is a subpool associated with the
+mapping the subpool is consulted to determine if it contains reservations.
+If the subpool contains reservations, one can be used for this allocation.
+However, in every case the avoid_reserve argument overrides the use of
+a reservation for the allocation.  After determining whether a reservation
+exists and can be used for the allocation, the routine dequeue_huge_page_vma()
+is called.  This routine takes two arguments related to reservations:
+
+- avoid_reserve, this is the same value/argument passed to alloc_huge_page()
+- chg, even though this argument is of type long only the values 0 or 1 are
+  passed to dequeue_huge_page_vma.  If the value is 0, it indicates a
+  reservation exists (see the section "Memory Policy and Reservations" for
+  possible issues).  If the value is 1, it indicates a reservation does not
+  exist and the page must be taken from the global free pool if possible.
+
+The free lists associated with the memory policy of the VMA are searched for
+a free page.  If a page is found, the value free_huge_pages is decremented
+when the page is removed from the free list.  If there was a reservation
+associated with the page, the following adjustments are made::
+
+	SetPagePrivate(page);	/* Indicates allocating this page consumed
+				 * a reservation, and if an error is
+				 * encountered such that the page must be
+				 * freed, the reservation will be restored. */
+	resv_huge_pages--;	/* Decrement the global reservation count */
+
+Note, if no huge page can be found that satisfies the VMA's memory policy
+an attempt will be made to allocate one using the buddy allocator.  This
+brings up the issue of surplus huge pages and overcommit which is beyond
+the scope reservations.  Even if a surplus page is allocated, the same
+reservation based adjustments as above will be made: SetPagePrivate(page) and
+resv_huge_pages--.
+
+After obtaining a new huge page, (page)->private is set to the value of
+the subpool associated with the page if it exists.  This will be used for
+subpool accounting when the page is freed.
+
+The routine vma_commit_reservation() is then called to adjust the reserve
+map based on the consumption of the reservation.  In general, this involves
+ensuring the page is represented within a file_region structure of the region
+map.  For shared mappings where the reservation was present, an entry
+in the reserve map already existed so no change is made.  However, if there
+was no reservation in a shared mapping or this was a private mapping a new
+entry must be created.
+
+It is possible that the reserve map could have been changed between the call
+to vma_needs_reservation() at the beginning of alloc_huge_page() and the
+call to vma_commit_reservation() after the page was allocated.  This would
+be possible if hugetlb_reserve_pages was called for the same page in a shared
+mapping.  In such cases, the reservation count and subpool free page count
+will be off by one.  This rare condition can be identified by comparing the
+return value from vma_needs_reservation and vma_commit_reservation.  If such
+a race is detected, the subpool and global reserve counts are adjusted to
+compensate.  See the section
+:ref:`Reservation Map Helper Routines <resv_map_helpers>` for more
+information on these routines.
+
+
+Instantiate Huge Pages
+======================
+
+After huge page allocation, the page is typically added to the page tables
+of the allocating task.  Before this, pages in a shared mapping are added
+to the page cache and pages in private mappings are added to an anonymous
+reverse mapping.  In both cases, the PagePrivate flag is cleared.  Therefore,
+when a huge page that has been instantiated is freed no adjustment is made
+to the global reservation count (resv_huge_pages).
+
+
+Freeing Huge Pages
+==================
+
+Huge page freeing is performed by the routine free_huge_page().  This routine
+is the destructor for hugetlbfs compound pages.  As a result, it is only
+passed a pointer to the page struct.  When a huge page is freed, reservation
+accounting may need to be performed.  This would be the case if the page was
+associated with a subpool that contained reserves, or the page is being freed
+on an error path where a global reserve count must be restored.
+
+The page->private field points to any subpool associated with the page.
+If the PagePrivate flag is set, it indicates the global reserve count should
+be adjusted (see the section
+:ref:`Consuming Reservations/Allocating a Huge Page <consume_resv>`
+for information on how these are set).
+
+The routine first calls hugepage_subpool_put_pages() for the page.  If this
+routine returns a value of 0 (which does not equal the value passed 1) it
+indicates reserves are associated with the subpool, and this newly free page
+must be used to keep the number of subpool reserves above the minimum size.
+Therefore, the global resv_huge_pages counter is incremented in this case.
+
+If the PagePrivate flag was set in the page, the global resv_huge_pages counter
+will always be incremented.
+
+.. _sub_pool_resv:
+
+Subpool Reservations
+====================
+
+There is a struct hstate associated with each huge page size.  The hstate
+tracks all huge pages of the specified size.  A subpool represents a subset
+of pages within a hstate that is associated with a mounted hugetlbfs
+filesystem.
+
+When a hugetlbfs filesystem is mounted a min_size option can be specified
+which indicates the minimum number of huge pages required by the filesystem.
+If this option is specified, the number of huge pages corresponding to
+min_size are reserved for use by the filesystem.  This number is tracked in
+the min_hpages field of a struct hugepage_subpool.  At mount time,
+hugetlb_acct_memory(min_hpages) is called to reserve the specified number of
+huge pages.  If they can not be reserved, the mount fails.
+
+The routines hugepage_subpool_get/put_pages() are called when pages are
+obtained from or released back to a subpool.  They perform all subpool
+accounting, and track any reservations associated with the subpool.
+hugepage_subpool_get/put_pages are passed the number of huge pages by which
+to adjust the subpool 'used page' count (down for get, up for put).  Normally,
+they return the same value that was passed or an error if not enough pages
+exist in the subpool.
+
+However, if reserves are associated with the subpool a return value less
+than the passed value may be returned.  This return value indicates the
+number of additional global pool adjustments which must be made.  For example,
+suppose a subpool contains 3 reserved huge pages and someone asks for 5.
+The 3 reserved pages associated with the subpool can be used to satisfy part
+of the request.  But, 2 pages must be obtained from the global pools.  To
+relay this information to the caller, the value 2 is returned.  The caller
+is then responsible for attempting to obtain the additional two pages from
+the global pools.
+
+
+COW and Reservations
+====================
+
+Since shared mappings all point to and use the same underlying pages, the
+biggest reservation concern for COW is private mappings.  In this case,
+two tasks can be pointing at the same previously allocated page.  One task
+attempts to write to the page, so a new page must be allocated so that each
+task points to its own page.
+
+When the page was originally allocated, the reservation for that page was
+consumed.  When an attempt to allocate a new page is made as a result of
+COW, it is possible that no free huge pages are free and the allocation
+will fail.
+
+When the private mapping was originally created, the owner of the mapping
+was noted by setting the HPAGE_RESV_OWNER bit in the pointer to the reservation
+map of the owner.  Since the owner created the mapping, the owner owns all
+the reservations associated with the mapping.  Therefore, when a write fault
+occurs and there is no page available, different action is taken for the owner
+and non-owner of the reservation.
+
+In the case where the faulting task is not the owner, the fault will fail and
+the task will typically receive a SIGBUS.
+
+If the owner is the faulting task, we want it to succeed since it owned the
+original reservation.  To accomplish this, the page is unmapped from the
+non-owning task.  In this way, the only reference is from the owning task.
+In addition, the HPAGE_RESV_UNMAPPED bit is set in the reservation map pointer
+of the non-owning task.  The non-owning task may receive a SIGBUS if it later
+faults on a non-present page.  But, the original owner of the
+mapping/reservation will behave as expected.
+
+
+.. _resv_map_modifications:
+
+Reservation Map Modifications
+=============================
+
+The following low level routines are used to make modifications to a
+reservation map.  Typically, these routines are not called directly.  Rather,
+a reservation map helper routine is called which calls one of these low level
+routines.  These low level routines are fairly well documented in the source
+code (mm/hugetlb.c).  These routines are::
+
+	long region_chg(struct resv_map *resv, long f, long t);
+	long region_add(struct resv_map *resv, long f, long t);
+	void region_abort(struct resv_map *resv, long f, long t);
+	long region_count(struct resv_map *resv, long f, long t);
+
+Operations on the reservation map typically involve two operations:
+
+1) region_chg() is called to examine the reserve map and determine how
+   many pages in the specified range [f, t) are NOT currently represented.
+
+   The calling code performs global checks and allocations to determine if
+   there are enough huge pages for the operation to succeed.
+
+2)
+  a) If the operation can succeed, region_add() is called to actually modify
+     the reservation map for the same range [f, t) previously passed to
+     region_chg().
+  b) If the operation can not succeed, region_abort is called for the same
+     range [f, t) to abort the operation.
+
+Note that this is a two step process where region_add() and region_abort()
+are guaranteed to succeed after a prior call to region_chg() for the same
+range.  region_chg() is responsible for pre-allocating any data structures
+necessary to ensure the subsequent operations (specifically region_add()))
+will succeed.
+
+As mentioned above, region_chg() determines the number of pages in the range
+which are NOT currently represented in the map.  This number is returned to
+the caller.  region_add() returns the number of pages in the range added to
+the map.  In most cases, the return value of region_add() is the same as the
+return value of region_chg().  However, in the case of shared mappings it is
+possible for changes to the reservation map to be made between the calls to
+region_chg() and region_add().  In this case, the return value of region_add()
+will not match the return value of region_chg().  It is likely that in such
+cases global counts and subpool accounting will be incorrect and in need of
+adjustment.  It is the responsibility of the caller to check for this condition
+and make the appropriate adjustments.
+
+The routine region_del() is called to remove regions from a reservation map.
+It is typically called in the following situations:
+
+- When a file in the hugetlbfs filesystem is being removed, the inode will
+  be released and the reservation map freed.  Before freeing the reservation
+  map, all the individual file_region structures must be freed.  In this case
+  region_del is passed the range [0, LONG_MAX).
+- When a hugetlbfs file is being truncated.  In this case, all allocated pages
+  after the new file size must be freed.  In addition, any file_region entries
+  in the reservation map past the new end of file must be deleted.  In this
+  case, region_del is passed the range [new_end_of_file, LONG_MAX).
+- When a hole is being punched in a hugetlbfs file.  In this case, huge pages
+  are removed from the middle of the file one at a time.  As the pages are
+  removed, region_del() is called to remove the corresponding entry from the
+  reservation map.  In this case, region_del is passed the range
+  [page_idx, page_idx + 1).
+
+In every case, region_del() will return the number of pages removed from the
+reservation map.  In VERY rare cases, region_del() can fail.  This can only
+happen in the hole punch case where it has to split an existing file_region
+entry and can not allocate a new structure.  In this error case, region_del()
+will return -ENOMEM.  The problem here is that the reservation map will
+indicate that there is a reservation for the page.  However, the subpool and
+global reservation counts will not reflect the reservation.  To handle this
+situation, the routine hugetlb_fix_reserve_counts() is called to adjust the
+counters so that they correspond with the reservation map entry that could
+not be deleted.
+
+region_count() is called when unmapping a private huge page mapping.  In
+private mappings, the lack of a entry in the reservation map indicates that
+a reservation exists.  Therefore, by counting the number of entries in the
+reservation map we know how many reservations were consumed and how many are
+outstanding (outstanding = (end - start) - region_count(resv, start, end)).
+Since the mapping is going away, the subpool and global reservation counts
+are decremented by the number of outstanding reservations.
+
+.. _resv_map_helpers:
+
+Reservation Map Helper Routines
+===============================
+
+Several helper routines exist to query and modify the reservation maps.
+These routines are only interested with reservations for a specific huge
+page, so they just pass in an address instead of a range.  In addition,
+they pass in the associated VMA.  From the VMA, the type of mapping (private
+or shared) and the location of the reservation map (inode or VMA) can be
+determined.  These routines simply call the underlying routines described
+in the section "Reservation Map Modifications".  However, they do take into
+account the 'opposite' meaning of reservation map entries for private and
+shared mappings and hide this detail from the caller::
+
+	long vma_needs_reservation(struct hstate *h,
+				   struct vm_area_struct *vma,
+				   unsigned long addr)
+
+This routine calls region_chg() for the specified page.  If no reservation
+exists, 1 is returned.  If a reservation exists, 0 is returned::
+
+	long vma_commit_reservation(struct hstate *h,
+				    struct vm_area_struct *vma,
+				    unsigned long addr)
+
+This calls region_add() for the specified page.  As in the case of region_chg
+and region_add, this routine is to be called after a previous call to
+vma_needs_reservation.  It will add a reservation entry for the page.  It
+returns 1 if the reservation was added and 0 if not.  The return value should
+be compared with the return value of the previous call to
+vma_needs_reservation.  An unexpected difference indicates the reservation
+map was modified between calls::
+
+	void vma_end_reservation(struct hstate *h,
+				 struct vm_area_struct *vma,
+				 unsigned long addr)
+
+This calls region_abort() for the specified page.  As in the case of region_chg
+and region_abort, this routine is to be called after a previous call to
+vma_needs_reservation.  It will abort/end the in progress reservation add
+operation::
+
+	long vma_add_reservation(struct hstate *h,
+				 struct vm_area_struct *vma,
+				 unsigned long addr)
+
+This is a special wrapper routine to help facilitate reservation cleanup
+on error paths.  It is only called from the routine restore_reserve_on_error().
+This routine is used in conjunction with vma_needs_reservation in an attempt
+to add a reservation to the reservation map.  It takes into account the
+different reservation map semantics for private and shared mappings.  Hence,
+region_add is called for shared mappings (as an entry present in the map
+indicates a reservation), and region_del is called for private mappings (as
+the absence of an entry in the map indicates a reservation).  See the section
+"Reservation cleanup in error paths" for more information on what needs to
+be done on error paths.
+
+
+Reservation Cleanup in Error Paths
+==================================
+
+As mentioned in the section
+:ref:`Reservation Map Helper Routines <resv_map_helpers>`, reservation
+map modifications are performed in two steps.  First vma_needs_reservation
+is called before a page is allocated.  If the allocation is successful,
+then vma_commit_reservation is called.  If not, vma_end_reservation is called.
+Global and subpool reservation counts are adjusted based on success or failure
+of the operation and all is well.
+
+Additionally, after a huge page is instantiated the PagePrivate flag is
+cleared so that accounting when the page is ultimately freed is correct.
+
+However, there are several instances where errors are encountered after a huge
+page is allocated but before it is instantiated.  In this case, the page
+allocation has consumed the reservation and made the appropriate subpool,
+reservation map and global count adjustments.  If the page is freed at this
+time (before instantiation and clearing of PagePrivate), then free_huge_page
+will increment the global reservation count.  However, the reservation map
+indicates the reservation was consumed.  This resulting inconsistent state
+will cause the 'leak' of a reserved huge page.  The global reserve count will
+be  higher than it should and prevent allocation of a pre-allocated page.
+
+The routine restore_reserve_on_error() attempts to handle this situation.  It
+is fairly well documented.  The intention of this routine is to restore
+the reservation map to the way it was before the page allocation.   In this
+way, the state of the reservation map will correspond to the global reservation
+count after the page is freed.
+
+The routine restore_reserve_on_error itself may encounter errors while
+attempting to restore the reservation map entry.  In this case, it will
+simply clear the PagePrivate flag of the page.  In this way, the global
+reserve count will not be incremented when the page is freed.  However, the
+reservation map will continue to look as though the reservation was consumed.
+A page can still be allocated for the address, but it will not use a reserved
+page as originally intended.
+
+There is some code (most notably userfaultfd) which can not call
+restore_reserve_on_error.  In this case, it simply modifies the PagePrivate
+so that a reservation will not be leaked when the huge page is freed.
+
+
+Reservations and Memory Policy
+==============================
+Per-node huge page lists existed in struct hstate when git was first used
+to manage Linux code.  The concept of reservations was added some time later.
+When reservations were added, no attempt was made to take memory policy
+into account.  While cpusets are not exactly the same as memory policy, this
+comment in hugetlb_acct_memory sums up the interaction between reservations
+and cpusets/memory policy::
+
+	/*
+	 * When cpuset is configured, it breaks the strict hugetlb page
+	 * reservation as the accounting is done on a global variable. Such
+	 * reservation is completely rubbish in the presence of cpuset because
+	 * the reservation is not checked against page availability for the
+	 * current cpuset. Application can still potentially OOM'ed by kernel
+	 * with lack of free htlb page in cpuset that the task is in.
+	 * Attempt to enforce strict accounting with cpuset is almost
+	 * impossible (or too ugly) because cpuset is too fluid that
+	 * task or memory node can be dynamically moved between cpusets.
+	 *
+	 * The change of semantics for shared hugetlb mapping with cpuset is
+	 * undesirable. However, in order to preserve some of the semantics,
+	 * we fall back to check against current free page availability as
+	 * a best attempt and hopefully to minimize the impact of changing
+	 * semantics that cpuset has.
+	 */
+
+Huge page reservations were added to prevent unexpected page allocation
+failures (OOM) at page fault time.  However, if an application makes use
+of cpusets or memory policy there is no guarantee that huge pages will be
+available on the required nodes.  This is true even if there are a sufficient
+number of global reservations.
+
+Hugetlbfs regression testing
+============================
+
+The most complete set of hugetlb tests are in the libhugetlbfs repository.
+If you modify any hugetlb related code, use the libhugetlbfs test suite
+to check for regressions.  In addition, if you add any new hugetlb
+functionality, please add appropriate tests to libhugetlbfs.
+
+--
+Mike Kravetz, 7 April 2017
diff --git a/Documentation/mm/hwpoison.rst b/Documentation/mm/hwpoison.rst
new file mode 100644
index 000000000000..b9d5253c1305
--- /dev/null
+++ b/Documentation/mm/hwpoison.rst
@@ -0,0 +1,184 @@
+.. hwpoison:
+
+========
+hwpoison
+========
+
+What is hwpoison?
+=================
+
+Upcoming Intel CPUs have support for recovering from some memory errors
+(``MCA recovery``). This requires the OS to declare a page "poisoned",
+kill the processes associated with it and avoid using it in the future.
+
+This patchkit implements the necessary infrastructure in the VM.
+
+To quote the overview comment::
+
+	High level machine check handler. Handles pages reported by the
+	hardware as being corrupted usually due to a 2bit ECC memory or cache
+	failure.
+
+	This focusses on pages detected as corrupted in the background.
+	When the current CPU tries to consume corruption the currently
+	running process can just be killed directly instead. This implies
+	that if the error cannot be handled for some reason it's safe to
+	just ignore it because no corruption has been consumed yet. Instead
+	when that happens another machine check will happen.
+
+	Handles page cache pages in various states. The tricky part
+	here is that we can access any page asynchronous to other VM
+	users, because memory failures could happen anytime and anywhere,
+	possibly violating some of their assumptions. This is why this code
+	has to be extremely careful. Generally it tries to use normal locking
+	rules, as in get the standard locks, even if that means the
+	error handling takes potentially a long time.
+
+	Some of the operations here are somewhat inefficient and have non
+	linear algorithmic complexity, because the data structures have not
+	been optimized for this case. This is in particular the case
+	for the mapping from a vma to a process. Since this case is expected
+	to be rare we hope we can get away with this.
+
+The code consists of a the high level handler in mm/memory-failure.c,
+a new page poison bit and various checks in the VM to handle poisoned
+pages.
+
+The main target right now is KVM guests, but it works for all kinds
+of applications. KVM support requires a recent qemu-kvm release.
+
+For the KVM use there was need for a new signal type so that
+KVM can inject the machine check into the guest with the proper
+address. This in theory allows other applications to handle
+memory failures too. The expection is that near all applications
+won't do that, but some very specialized ones might.
+
+Failure recovery modes
+======================
+
+There are two (actually three) modes memory failure recovery can be in:
+
+vm.memory_failure_recovery sysctl set to zero:
+	All memory failures cause a panic. Do not attempt recovery.
+
+early kill
+	(can be controlled globally and per process)
+	Send SIGBUS to the application as soon as the error is detected
+	This allows applications who can process memory errors in a gentle
+	way (e.g. drop affected object)
+	This is the mode used by KVM qemu.
+
+late kill
+	Send SIGBUS when the application runs into the corrupted page.
+	This is best for memory error unaware applications and default
+	Note some pages are always handled as late kill.
+
+User control
+============
+
+vm.memory_failure_recovery
+	See sysctl.txt
+
+vm.memory_failure_early_kill
+	Enable early kill mode globally
+
+PR_MCE_KILL
+	Set early/late kill mode/revert to system default
+
+	arg1: PR_MCE_KILL_CLEAR:
+		Revert to system default
+	arg1: PR_MCE_KILL_SET:
+		arg2 defines thread specific mode
+
+		PR_MCE_KILL_EARLY:
+			Early kill
+		PR_MCE_KILL_LATE:
+			Late kill
+		PR_MCE_KILL_DEFAULT
+			Use system global default
+
+	Note that if you want to have a dedicated thread which handles
+	the SIGBUS(BUS_MCEERR_AO) on behalf of the process, you should
+	call prctl(PR_MCE_KILL_EARLY) on the designated thread. Otherwise,
+	the SIGBUS is sent to the main thread.
+
+PR_MCE_KILL_GET
+	return current mode
+
+Testing
+=======
+
+* madvise(MADV_HWPOISON, ....) (as root) - Poison a page in the
+  process for testing
+
+* hwpoison-inject module through debugfs ``/sys/kernel/debug/hwpoison/``
+
+  corrupt-pfn
+	Inject hwpoison fault at PFN echoed into this file. This does
+	some early filtering to avoid corrupted unintended pages in test suites.
+
+  unpoison-pfn
+	Software-unpoison page at PFN echoed into this file. This way
+	a page can be reused again.  This only works for Linux
+	injected failures, not for real memory failures. Once any hardware
+	memory failure happens, this feature is disabled.
+
+  Note these injection interfaces are not stable and might change between
+  kernel versions
+
+  corrupt-filter-dev-major, corrupt-filter-dev-minor
+	Only handle memory failures to pages associated with the file
+	system defined by block device major/minor.  -1U is the
+	wildcard value.  This should be only used for testing with
+	artificial injection.
+
+  corrupt-filter-memcg
+	Limit injection to pages owned by memgroup. Specified by inode
+	number of the memcg.
+
+	Example::
+
+		mkdir /sys/fs/cgroup/mem/hwpoison
+
+	        usemem -m 100 -s 1000 &
+		echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks
+
+		memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ')
+		echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg
+
+		page-types -p `pidof init`   --hwpoison  # shall do nothing
+		page-types -p `pidof usemem` --hwpoison  # poison its pages
+
+  corrupt-filter-flags-mask, corrupt-filter-flags-value
+	When specified, only poison pages if ((page_flags & mask) ==
+	value).  This allows stress testing of many kinds of
+	pages. The page_flags are the same as in /proc/kpageflags. The
+	flag bits are defined in include/linux/kernel-page-flags.h and
+	documented in Documentation/admin-guide/mm/pagemap.rst
+
+* Architecture specific MCE injector
+
+  x86 has mce-inject, mce-test
+
+  Some portable hwpoison test programs in mce-test, see below.
+
+References
+==========
+
+http://halobates.de/mce-lc09-2.pdf
+	Overview presentation from LinuxCon 09
+
+git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git
+	Test suite (hwpoison specific portable tests in tsrc)
+
+git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git
+	x86 specific injector
+
+
+Limitations
+===========
+- Not all page types are supported and never will. Most kernel internal
+  objects cannot be recovered, only LRU pages for now.
+
+---
+Andi Kleen, Oct 2009
diff --git a/Documentation/mm/index.rst b/Documentation/mm/index.rst
new file mode 100644
index 000000000000..575ccd40e30c
--- /dev/null
+++ b/Documentation/mm/index.rst
@@ -0,0 +1,68 @@
+=====================================
+Linux Memory Management Documentation
+=====================================
+
+Memory Management Guide
+=======================
+
+This is a guide to understanding the memory management subsystem
+of Linux.  If you are looking for advice on simply allocating memory,
+see the :ref:`memory_allocation`.  For controlling and tuning guides,
+see the :doc:`admin guide <../admin-guide/mm/index>`.
+
+.. toctree::
+   :maxdepth: 1
+
+   physical_memory
+   page_tables
+   process_addrs
+   bootmem
+   page_allocation
+   vmalloc
+   slab
+   highmem
+   page_reclaim
+   swap
+   page_cache
+   shmfs
+   oom
+
+Legacy Documentation
+====================
+
+This is a collection of older documents about the Linux memory management
+(MM) subsystem internals with different level of details ranging from
+notes and mailing list responses for elaborating descriptions of data
+structures and algorithms.  It should all be integrated nicely into the
+above structured documentation, or deleted if it has served its purpose.
+
+.. toctree::
+   :maxdepth: 1
+
+   active_mm
+   arch_pgtable_helpers
+   balance
+   damon/index
+   free_page_reporting
+   frontswap
+   hmm
+   hwpoison
+   hugetlbfs_reserv
+   ksm
+   memory-model
+   mmu_notifier
+   numa
+   overcommit-accounting
+   page_migration
+   page_frags
+   page_owner
+   page_table_check
+   remap_file_pages
+   slub
+   split_page_table_lock
+   transhuge
+   unevictable-lru
+   vmalloced-kernel-stacks
+   vmemmap_dedup
+   z3fold
+   zsmalloc
diff --git a/Documentation/mm/ksm.rst b/Documentation/mm/ksm.rst
new file mode 100644
index 000000000000..9e37add068e6
--- /dev/null
+++ b/Documentation/mm/ksm.rst
@@ -0,0 +1,87 @@
+.. _ksm:
+
+=======================
+Kernel Samepage Merging
+=======================
+
+KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y,
+added to the Linux kernel in 2.6.32.  See ``mm/ksm.c`` for its implementation,
+and http://lwn.net/Articles/306704/ and https://lwn.net/Articles/330589/
+
+The userspace interface of KSM is described in :ref:`Documentation/admin-guide/mm/ksm.rst <admin_guide_ksm>`
+
+Design
+======
+
+Overview
+--------
+
+.. kernel-doc:: mm/ksm.c
+   :DOC: Overview
+
+Reverse mapping
+---------------
+KSM maintains reverse mapping information for KSM pages in the stable
+tree.
+
+If a KSM page is shared between less than ``max_page_sharing`` VMAs,
+the node of the stable tree that represents such KSM page points to a
+list of struct rmap_item and the ``page->mapping`` of the
+KSM page points to the stable tree node.
+
+When the sharing passes this threshold, KSM adds a second dimension to
+the stable tree. The tree node becomes a "chain" that links one or
+more "dups". Each "dup" keeps reverse mapping information for a KSM
+page with ``page->mapping`` pointing to that "dup".
+
+Every "chain" and all "dups" linked into a "chain" enforce the
+invariant that they represent the same write protected memory content,
+even if each "dup" will be pointed by a different KSM page copy of
+that content.
+
+This way the stable tree lookup computational complexity is unaffected
+if compared to an unlimited list of reverse mappings. It is still
+enforced that there cannot be KSM page content duplicates in the
+stable tree itself.
+
+The deduplication limit enforced by ``max_page_sharing`` is required
+to avoid the virtual memory rmap lists to grow too large. The rmap
+walk has O(N) complexity where N is the number of rmap_items
+(i.e. virtual mappings) that are sharing the page, which is in turn
+capped by ``max_page_sharing``. So this effectively spreads the linear
+O(N) computational complexity from rmap walk context over different
+KSM pages. The ksmd walk over the stable_node "chains" is also O(N),
+but N is the number of stable_node "dups", not the number of
+rmap_items, so it has not a significant impact on ksmd performance. In
+practice the best stable_node "dup" candidate will be kept and found
+at the head of the "dups" list.
+
+High values of ``max_page_sharing`` result in faster memory merging
+(because there will be fewer stable_node dups queued into the
+stable_node chain->hlist to check for pruning) and higher
+deduplication factor at the expense of slower worst case for rmap
+walks for any KSM page which can happen during swapping, compaction,
+NUMA balancing and page migration.
+
+The ``stable_node_dups/stable_node_chains`` ratio is also affected by the
+``max_page_sharing`` tunable, and an high ratio may indicate fragmentation
+in the stable_node dups, which could be solved by introducing
+fragmentation algorithms in ksmd which would refile rmap_items from
+one stable_node dup to another stable_node dup, in order to free up
+stable_node "dups" with few rmap_items in them, but that may increase
+the ksmd CPU usage and possibly slowdown the readonly computations on
+the KSM pages of the applications.
+
+The whole list of stable_node "dups" linked in the stable_node
+"chains" is scanned periodically in order to prune stale stable_nodes.
+The frequency of such scans is defined by
+``stable_node_chains_prune_millisecs`` sysfs tunable.
+
+Reference
+---------
+.. kernel-doc:: mm/ksm.c
+   :functions: mm_slot ksm_scan stable_node rmap_item
+
+--
+Izik Eidus,
+Hugh Dickins, 17 Nov 2009
diff --git a/Documentation/mm/memory-model.rst b/Documentation/mm/memory-model.rst
new file mode 100644
index 000000000000..3779e562dc76
--- /dev/null
+++ b/Documentation/mm/memory-model.rst
@@ -0,0 +1,177 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. _physical_memory_model:
+
+=====================
+Physical Memory Model
+=====================
+
+Physical memory in a system may be addressed in different ways. The
+simplest case is when the physical memory starts at address 0 and
+spans a contiguous range up to the maximal address. It could be,
+however, that this range contains small holes that are not accessible
+for the CPU. Then there could be several contiguous ranges at
+completely distinct addresses. And, don't forget about NUMA, where
+different memory banks are attached to different CPUs.
+
+Linux abstracts this diversity using one of the two memory models:
+FLATMEM and SPARSEMEM. Each architecture defines what
+memory models it supports, what the default memory model is and
+whether it is possible to manually override that default.
+
+All the memory models track the status of physical page frames using
+struct page arranged in one or more arrays.
+
+Regardless of the selected memory model, there exists one-to-one
+mapping between the physical page frame number (PFN) and the
+corresponding `struct page`.
+
+Each memory model defines :c:func:`pfn_to_page` and :c:func:`page_to_pfn`
+helpers that allow the conversion from PFN to `struct page` and vice
+versa.
+
+FLATMEM
+=======
+
+The simplest memory model is FLATMEM. This model is suitable for
+non-NUMA systems with contiguous, or mostly contiguous, physical
+memory.
+
+In the FLATMEM memory model, there is a global `mem_map` array that
+maps the entire physical memory. For most architectures, the holes
+have entries in the `mem_map` array. The `struct page` objects
+corresponding to the holes are never fully initialized.
+
+To allocate the `mem_map` array, architecture specific setup code should
+call :c:func:`free_area_init` function. Yet, the mappings array is not
+usable until the call to :c:func:`memblock_free_all` that hands all the
+memory to the page allocator.
+
+An architecture may free parts of the `mem_map` array that do not cover the
+actual physical pages. In such case, the architecture specific
+:c:func:`pfn_valid` implementation should take the holes in the
+`mem_map` into account.
+
+With FLATMEM, the conversion between a PFN and the `struct page` is
+straightforward: `PFN - ARCH_PFN_OFFSET` is an index to the
+`mem_map` array.
+
+The `ARCH_PFN_OFFSET` defines the first page frame number for
+systems with physical memory starting at address different from 0.
+
+SPARSEMEM
+=========
+
+SPARSEMEM is the most versatile memory model available in Linux and it
+is the only memory model that supports several advanced features such
+as hot-plug and hot-remove of the physical memory, alternative memory
+maps for non-volatile memory devices and deferred initialization of
+the memory map for larger systems.
+
+The SPARSEMEM model presents the physical memory as a collection of
+sections. A section is represented with struct mem_section
+that contains `section_mem_map` that is, logically, a pointer to an
+array of struct pages. However, it is stored with some other magic
+that aids the sections management. The section size and maximal number
+of section is specified using `SECTION_SIZE_BITS` and
+`MAX_PHYSMEM_BITS` constants defined by each architecture that
+supports SPARSEMEM. While `MAX_PHYSMEM_BITS` is an actual width of a
+physical address that an architecture supports, the
+`SECTION_SIZE_BITS` is an arbitrary value.
+
+The maximal number of sections is denoted `NR_MEM_SECTIONS` and
+defined as
+
+.. math::
+
+   NR\_MEM\_SECTIONS = 2 ^ {(MAX\_PHYSMEM\_BITS - SECTION\_SIZE\_BITS)}
+
+The `mem_section` objects are arranged in a two-dimensional array
+called `mem_sections`. The size and placement of this array depend
+on `CONFIG_SPARSEMEM_EXTREME` and the maximal possible number of
+sections:
+
+* When `CONFIG_SPARSEMEM_EXTREME` is disabled, the `mem_sections`
+  array is static and has `NR_MEM_SECTIONS` rows. Each row holds a
+  single `mem_section` object.
+* When `CONFIG_SPARSEMEM_EXTREME` is enabled, the `mem_sections`
+  array is dynamically allocated. Each row contains PAGE_SIZE worth of
+  `mem_section` objects and the number of rows is calculated to fit
+  all the memory sections.
+
+The architecture setup code should call sparse_init() to
+initialize the memory sections and the memory maps.
+
+With SPARSEMEM there are two possible ways to convert a PFN to the
+corresponding `struct page` - a "classic sparse" and "sparse
+vmemmap". The selection is made at build time and it is determined by
+the value of `CONFIG_SPARSEMEM_VMEMMAP`.
+
+The classic sparse encodes the section number of a page in page->flags
+and uses high bits of a PFN to access the section that maps that page
+frame. Inside a section, the PFN is the index to the array of pages.
+
+The sparse vmemmap uses a virtually mapped memory map to optimize
+pfn_to_page and page_to_pfn operations. There is a global `struct
+page *vmemmap` pointer that points to a virtually contiguous array of
+`struct page` objects. A PFN is an index to that array and the
+offset of the `struct page` from `vmemmap` is the PFN of that
+page.
+
+To use vmemmap, an architecture has to reserve a range of virtual
+addresses that will map the physical pages containing the memory
+map and make sure that `vmemmap` points to that range. In addition,
+the architecture should implement :c:func:`vmemmap_populate` method
+that will allocate the physical memory and create page tables for the
+virtual memory map. If an architecture does not have any special
+requirements for the vmemmap mappings, it can use default
+:c:func:`vmemmap_populate_basepages` provided by the generic memory
+management.
+
+The virtually mapped memory map allows storing `struct page` objects
+for persistent memory devices in pre-allocated storage on those
+devices. This storage is represented with struct vmem_altmap
+that is eventually passed to vmemmap_populate() through a long chain
+of function calls. The vmemmap_populate() implementation may use the
+`vmem_altmap` along with :c:func:`vmemmap_alloc_block_buf` helper to
+allocate memory map on the persistent memory device.
+
+ZONE_DEVICE
+===========
+The `ZONE_DEVICE` facility builds upon `SPARSEMEM_VMEMMAP` to offer
+`struct page` `mem_map` services for device driver identified physical
+address ranges. The "device" aspect of `ZONE_DEVICE` relates to the fact
+that the page objects for these address ranges are never marked online,
+and that a reference must be taken against the device, not just the page
+to keep the memory pinned for active use. `ZONE_DEVICE`, via
+:c:func:`devm_memremap_pages`, performs just enough memory hotplug to
+turn on :c:func:`pfn_to_page`, :c:func:`page_to_pfn`, and
+:c:func:`get_user_pages` service for the given range of pfns. Since the
+page reference count never drops below 1 the page is never tracked as
+free memory and the page's `struct list_head lru` space is repurposed
+for back referencing to the host device / driver that mapped the memory.
+
+While `SPARSEMEM` presents memory as a collection of sections,
+optionally collected into memory blocks, `ZONE_DEVICE` users have a need
+for smaller granularity of populating the `mem_map`. Given that
+`ZONE_DEVICE` memory is never marked online it is subsequently never
+subject to its memory ranges being exposed through the sysfs memory
+hotplug api on memory block boundaries. The implementation relies on
+this lack of user-api constraint to allow sub-section sized memory
+ranges to be specified to :c:func:`arch_add_memory`, the top-half of
+memory hotplug. Sub-section support allows for 2MB as the cross-arch
+common alignment granularity for :c:func:`devm_memremap_pages`.
+
+The users of `ZONE_DEVICE` are:
+
+* pmem: Map platform persistent memory to be used as a direct-I/O target
+  via DAX mappings.
+
+* hmm: Extend `ZONE_DEVICE` with `->page_fault()` and `->page_free()`
+  event callbacks to allow a device-driver to coordinate memory management
+  events related to device-memory, typically GPU memory. See
+  Documentation/mm/hmm.rst.
+
+* p2pdma: Create `struct page` objects to allow peer devices in a
+  PCI/-E topology to coordinate direct-DMA operations between themselves,
+  i.e. bypass host memory.
diff --git a/Documentation/mm/mmu_notifier.rst b/Documentation/mm/mmu_notifier.rst
new file mode 100644
index 000000000000..df5d7777fc6b
--- /dev/null
+++ b/Documentation/mm/mmu_notifier.rst
@@ -0,0 +1,99 @@
+.. _mmu_notifier:
+
+When do you need to notify inside page table lock ?
+===================================================
+
+When clearing a pte/pmd we are given a choice to notify the event through
+(notify version of \*_clear_flush call mmu_notifier_invalidate_range) under
+the page table lock. But that notification is not necessary in all cases.
+
+For secondary TLB (non CPU TLB) like IOMMU TLB or device TLB (when device use
+thing like ATS/PASID to get the IOMMU to walk the CPU page table to access a
+process virtual address space). There is only 2 cases when you need to notify
+those secondary TLB while holding page table lock when clearing a pte/pmd:
+
+  A) page backing address is free before mmu_notifier_invalidate_range_end()
+  B) a page table entry is updated to point to a new page (COW, write fault
+     on zero page, __replace_page(), ...)
+
+Case A is obvious you do not want to take the risk for the device to write to
+a page that might now be used by some completely different task.
+
+Case B is more subtle. For correctness it requires the following sequence to
+happen:
+
+  - take page table lock
+  - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify())
+  - set page table entry to point to new page
+
+If clearing the page table entry is not followed by a notify before setting
+the new pte/pmd value then you can break memory model like C11 or C++11 for
+the device.
+
+Consider the following scenario (device use a feature similar to ATS/PASID):
+
+Two address addrA and addrB such that \|addrA - addrB\| >= PAGE_SIZE we assume
+they are write protected for COW (other case of B apply too).
+
+::
+
+ [Time N] --------------------------------------------------------------------
+ CPU-thread-0  {try to write to addrA}
+ CPU-thread-1  {try to write to addrB}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {read addrA and populate device TLB}
+ DEV-thread-2  {read addrB and populate device TLB}
+ [Time N+1] ------------------------------------------------------------------
+ CPU-thread-0  {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}}
+ CPU-thread-1  {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+2] ------------------------------------------------------------------
+ CPU-thread-0  {COW_step1: {update page table to point to new page for addrA}}
+ CPU-thread-1  {COW_step1: {update page table to point to new page for addrB}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+3] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {preempted}
+ CPU-thread-2  {write to addrA which is a write to new page}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+3] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {preempted}
+ CPU-thread-2  {}
+ CPU-thread-3  {write to addrB which is a write to new page}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+4] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+5] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {read addrA from old page}
+ DEV-thread-2  {read addrB from new page}
+
+So here because at time N+2 the clear page table entry was not pair with a
+notification to invalidate the secondary TLB, the device see the new value for
+addrB before seeing the new value for addrA. This break total memory ordering
+for the device.
+
+When changing a pte to write protect or to point to a new write protected page
+with same content (KSM) it is fine to delay the mmu_notifier_invalidate_range
+call to mmu_notifier_invalidate_range_end() outside the page table lock. This
+is true even if the thread doing the page table update is preempted right after
+releasing page table lock but before call mmu_notifier_invalidate_range_end().
diff --git a/Documentation/mm/numa.rst b/Documentation/mm/numa.rst
new file mode 100644
index 000000000000..99fdeca917ca
--- /dev/null
+++ b/Documentation/mm/numa.rst
@@ -0,0 +1,150 @@
+.. _numa:
+
+Started Nov 1999 by Kanoj Sarcar <kanoj@sgi.com>
+
+=============
+What is NUMA?
+=============
+
+This question can be answered from a couple of perspectives:  the
+hardware view and the Linux software view.
+
+From the hardware perspective, a NUMA system is a computer platform that
+comprises multiple components or assemblies each of which may contain 0
+or more CPUs, local memory, and/or IO buses.  For brevity and to
+disambiguate the hardware view of these physical components/assemblies
+from the software abstraction thereof, we'll call the components/assemblies
+'cells' in this document.
+
+Each of the 'cells' may be viewed as an SMP [symmetric multi-processor] subset
+of the system--although some components necessary for a stand-alone SMP system
+may not be populated on any given cell.   The cells of the NUMA system are
+connected together with some sort of system interconnect--e.g., a crossbar or
+point-to-point link are common types of NUMA system interconnects.  Both of
+these types of interconnects can be aggregated to create NUMA platforms with
+cells at multiple distances from other cells.
+
+For Linux, the NUMA platforms of interest are primarily what is known as Cache
+Coherent NUMA or ccNUMA systems.   With ccNUMA systems, all memory is visible
+to and accessible from any CPU attached to any cell and cache coherency
+is handled in hardware by the processor caches and/or the system interconnect.
+
+Memory access time and effective memory bandwidth varies depending on how far
+away the cell containing the CPU or IO bus making the memory access is from the
+cell containing the target memory.  For example, access to memory by CPUs
+attached to the same cell will experience faster access times and higher
+bandwidths than accesses to memory on other, remote cells.  NUMA platforms
+can have cells at multiple remote distances from any given cell.
+
+Platform vendors don't build NUMA systems just to make software developers'
+lives interesting.  Rather, this architecture is a means to provide scalable
+memory bandwidth.  However, to achieve scalable memory bandwidth, system and
+application software must arrange for a large majority of the memory references
+[cache misses] to be to "local" memory--memory on the same cell, if any--or
+to the closest cell with memory.
+
+This leads to the Linux software view of a NUMA system:
+
+Linux divides the system's hardware resources into multiple software
+abstractions called "nodes".  Linux maps the nodes onto the physical cells
+of the hardware platform, abstracting away some of the details for some
+architectures.  As with physical cells, software nodes may contain 0 or more
+CPUs, memory and/or IO buses.  And, again, memory accesses to memory on
+"closer" nodes--nodes that map to closer cells--will generally experience
+faster access times and higher effective bandwidth than accesses to more
+remote cells.
+
+For some architectures, such as x86, Linux will "hide" any node representing a
+physical cell that has no memory attached, and reassign any CPUs attached to
+that cell to a node representing a cell that does have memory.  Thus, on
+these architectures, one cannot assume that all CPUs that Linux associates with
+a given node will see the same local memory access times and bandwidth.
+
+In addition, for some architectures, again x86 is an example, Linux supports
+the emulation of additional nodes.  For NUMA emulation, linux will carve up
+the existing nodes--or the system memory for non-NUMA platforms--into multiple
+nodes.  Each emulated node will manage a fraction of the underlying cells'
+physical memory.  NUMA emluation is useful for testing NUMA kernel and
+application features on non-NUMA platforms, and as a sort of memory resource
+management mechanism when used together with cpusets.
+[see Documentation/admin-guide/cgroup-v1/cpusets.rst]
+
+For each node with memory, Linux constructs an independent memory management
+subsystem, complete with its own free page lists, in-use page lists, usage
+statistics and locks to mediate access.  In addition, Linux constructs for
+each memory zone [one or more of DMA, DMA32, NORMAL, HIGH_MEMORY, MOVABLE],
+an ordered "zonelist".  A zonelist specifies the zones/nodes to visit when a
+selected zone/node cannot satisfy the allocation request.  This situation,
+when a zone has no available memory to satisfy a request, is called
+"overflow" or "fallback".
+
+Because some nodes contain multiple zones containing different types of
+memory, Linux must decide whether to order the zonelists such that allocations
+fall back to the same zone type on a different node, or to a different zone
+type on the same node.  This is an important consideration because some zones,
+such as DMA or DMA32, represent relatively scarce resources.  Linux chooses
+a default Node ordered zonelist. This means it tries to fallback to other zones
+from the same node before using remote nodes which are ordered by NUMA distance.
+
+By default, Linux will attempt to satisfy memory allocation requests from the
+node to which the CPU that executes the request is assigned.  Specifically,
+Linux will attempt to allocate from the first node in the appropriate zonelist
+for the node where the request originates.  This is called "local allocation."
+If the "local" node cannot satisfy the request, the kernel will examine other
+nodes' zones in the selected zonelist looking for the first zone in the list
+that can satisfy the request.
+
+Local allocation will tend to keep subsequent access to the allocated memory
+"local" to the underlying physical resources and off the system interconnect--
+as long as the task on whose behalf the kernel allocated some memory does not
+later migrate away from that memory.  The Linux scheduler is aware of the
+NUMA topology of the platform--embodied in the "scheduling domains" data
+structures [see Documentation/scheduler/sched-domains.rst]--and the scheduler
+attempts to minimize task migration to distant scheduling domains.  However,
+the scheduler does not take a task's NUMA footprint into account directly.
+Thus, under sufficient imbalance, tasks can migrate between nodes, remote
+from their initial node and kernel data structures.
+
+System administrators and application designers can restrict a task's migration
+to improve NUMA locality using various CPU affinity command line interfaces,
+such as taskset(1) and numactl(1), and program interfaces such as
+sched_setaffinity(2).  Further, one can modify the kernel's default local
+allocation behavior using Linux NUMA memory policy. [see
+:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`].
+
+System administrators can restrict the CPUs and nodes' memories that a non-
+privileged user can specify in the scheduling or NUMA commands and functions
+using control groups and CPUsets.  [see Documentation/admin-guide/cgroup-v1/cpusets.rst]
+
+On architectures that do not hide memoryless nodes, Linux will include only
+zones [nodes] with memory in the zonelists.  This means that for a memoryless
+node the "local memory node"--the node of the first zone in CPU's node's
+zonelist--will not be the node itself.  Rather, it will be the node that the
+kernel selected as the nearest node with memory when it built the zonelists.
+So, default, local allocations will succeed with the kernel supplying the
+closest available memory.  This is a consequence of the same mechanism that
+allows such allocations to fallback to other nearby nodes when a node that
+does contain memory overflows.
+
+Some kernel allocations do not want or cannot tolerate this allocation fallback
+behavior.  Rather they want to be sure they get memory from the specified node
+or get notified that the node has no free memory.  This is usually the case when
+a subsystem allocates per CPU memory resources, for example.
+
+A typical model for making such an allocation is to obtain the node id of the
+node to which the "current CPU" is attached using one of the kernel's
+numa_node_id() or CPU_to_node() functions and then request memory from only
+the node id returned.  When such an allocation fails, the requesting subsystem
+may revert to its own fallback path.  The slab kernel memory allocator is an
+example of this.  Or, the subsystem may choose to disable or not to enable
+itself on allocation failure.  The kernel profiling subsystem is an example of
+this.
+
+If the architecture supports--does not hide--memoryless nodes, then CPUs
+attached to memoryless nodes would always incur the fallback path overhead
+or some subsystems would fail to initialize if they attempted to allocated
+memory exclusively from a node without memory.  To support such
+architectures transparently, kernel subsystems can use the numa_mem_id()
+or cpu_to_mem() function to locate the "local memory node" for the calling or
+specified CPU.  Again, this is the same node from which default, local page
+allocations will be attempted.
diff --git a/Documentation/mm/oom.rst b/Documentation/mm/oom.rst
new file mode 100644
index 000000000000..18e9e40c1ec1
--- /dev/null
+++ b/Documentation/mm/oom.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======================
+Out Of Memory Handling
+======================
diff --git a/Documentation/mm/overcommit-accounting.rst b/Documentation/mm/overcommit-accounting.rst
new file mode 100644
index 000000000000..1addb0c374a4
--- /dev/null
+++ b/Documentation/mm/overcommit-accounting.rst
@@ -0,0 +1,88 @@
+.. _overcommit_accounting:
+
+=====================
+Overcommit Accounting
+=====================
+
+The Linux kernel supports the following overcommit handling modes
+
+0
+	Heuristic overcommit handling. Obvious overcommits of address
+	space are refused. Used for a typical system. It ensures a
+	seriously wild allocation fails while allowing overcommit to
+	reduce swap usage.  root is allowed to allocate slightly more
+	memory in this mode. This is the default.
+
+1
+	Always overcommit. Appropriate for some scientific
+	applications. Classic example is code using sparse arrays and
+	just relying on the virtual memory consisting almost entirely
+	of zero pages.
+
+2
+	Don't overcommit. The total address space commit for the
+	system is not permitted to exceed swap + a configurable amount
+	(default is 50%) of physical RAM.  Depending on the amount you
+	use, in most situations this means a process will not be
+	killed while accessing pages but will receive errors on memory
+	allocation as appropriate.
+
+	Useful for applications that want to guarantee their memory
+	allocations will be available in the future without having to
+	initialize every page.
+
+The overcommit policy is set via the sysctl ``vm.overcommit_memory``.
+
+The overcommit amount can be set via ``vm.overcommit_ratio`` (percentage)
+or ``vm.overcommit_kbytes`` (absolute value). These only have an effect
+when ``vm.overcommit_memory`` is set to 2.
+
+The current overcommit limit and amount committed are viewable in
+``/proc/meminfo`` as CommitLimit and Committed_AS respectively.
+
+Gotchas
+=======
+
+The C language stack growth does an implicit mremap. If you want absolute
+guarantees and run close to the edge you MUST mmap your stack for the
+largest size you think you will need. For typical stack usage this does
+not matter much but it's a corner case if you really really care
+
+In mode 2 the MAP_NORESERVE flag is ignored.
+
+
+How It Works
+============
+
+The overcommit is based on the following rules
+
+For a file backed map
+	| SHARED or READ-only	-	0 cost (the file is the map not swap)
+	| PRIVATE WRITABLE	-	size of mapping per instance
+
+For an anonymous or ``/dev/zero`` map
+	| SHARED			-	size of mapping
+	| PRIVATE READ-only	-	0 cost (but of little use)
+	| PRIVATE WRITABLE	-	size of mapping per instance
+
+Additional accounting
+	| Pages made writable copies by mmap
+	| shmfs memory drawn from the same pool
+
+Status
+======
+
+*	We account mmap memory mappings
+*	We account mprotect changes in commit
+*	We account mremap changes in size
+*	We account brk
+*	We account munmap
+*	We report the commit status in /proc
+*	Account and check on fork
+*	Review stack handling/building on exec
+*	SHMfs accounting
+*	Implement actual limit enforcement
+
+To Do
+=====
+*	Account ptrace pages (this is hard)
diff --git a/Documentation/mm/page_allocation.rst b/Documentation/mm/page_allocation.rst
new file mode 100644
index 000000000000..d9b4495561f1
--- /dev/null
+++ b/Documentation/mm/page_allocation.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+Page Allocation
+===============
diff --git a/Documentation/mm/page_cache.rst b/Documentation/mm/page_cache.rst
new file mode 100644
index 000000000000..75eba7c431b2
--- /dev/null
+++ b/Documentation/mm/page_cache.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========
+Page Cache
+==========
diff --git a/Documentation/mm/page_frags.rst b/Documentation/mm/page_frags.rst
new file mode 100644
index 000000000000..7d6f9385d129
--- /dev/null
+++ b/Documentation/mm/page_frags.rst
@@ -0,0 +1,45 @@
+.. _page_frags:
+
+==============
+Page fragments
+==============
+
+A page fragment is an arbitrary-length arbitrary-offset area of memory
+which resides within a 0 or higher order compound page.  Multiple
+fragments within that page are individually refcounted, in the page's
+reference counter.
+
+The page_frag functions, page_frag_alloc and page_frag_free, provide a
+simple allocation framework for page fragments.  This is used by the
+network stack and network device drivers to provide a backing region of
+memory for use as either an sk_buff->head, or to be used in the "frags"
+portion of skb_shared_info.
+
+In order to make use of the page fragment APIs a backing page fragment
+cache is needed.  This provides a central point for the fragment allocation
+and tracks allows multiple calls to make use of a cached page.  The
+advantage to doing this is that multiple calls to get_page can be avoided
+which can be expensive at allocation time.  However due to the nature of
+this caching it is required that any calls to the cache be protected by
+either a per-cpu limitation, or a per-cpu limitation and forcing interrupts
+to be disabled when executing the fragment allocation.
+
+The network stack uses two separate caches per CPU to handle fragment
+allocation.  The netdev_alloc_cache is used by callers making use of the
+netdev_alloc_frag and __netdev_alloc_skb calls.  The napi_alloc_cache is
+used by callers of the __napi_alloc_frag and __napi_alloc_skb calls.  The
+main difference between these two calls is the context in which they may be
+called.  The "netdev" prefixed functions are usable in any context as these
+functions will disable interrupts, while the "napi" prefixed functions are
+only usable within the softirq context.
+
+Many network device drivers use a similar methodology for allocating page
+fragments, but the page fragments are cached at the ring or descriptor
+level.  In order to enable these cases it is necessary to provide a generic
+way of tearing down a page cache.  For this reason __page_frag_cache_drain
+was implemented.  It allows for freeing multiple references from a single
+page via a single call.  The advantage to doing this is that it allows for
+cleaning up the multiple references that were added to a page in order to
+avoid calling get_page per allocation.
+
+Alexander Duyck, Nov 29, 2016.
diff --git a/Documentation/mm/page_migration.rst b/Documentation/mm/page_migration.rst
new file mode 100644
index 000000000000..8c5cb8147e55
--- /dev/null
+++ b/Documentation/mm/page_migration.rst
@@ -0,0 +1,288 @@
+.. _page_migration:
+
+==============
+Page migration
+==============
+
+Page migration allows moving the physical location of pages between
+nodes in a NUMA system while the process is running. This means that the
+virtual addresses that the process sees do not change. However, the
+system rearranges the physical location of those pages.
+
+Also see :ref:`Heterogeneous Memory Management (HMM) <hmm>`
+for migrating pages to or from device private memory.
+
+The main intent of page migration is to reduce the latency of memory accesses
+by moving pages near to the processor where the process accessing that memory
+is running.
+
+Page migration allows a process to manually relocate the node on which its
+pages are located through the MF_MOVE and MF_MOVE_ALL options while setting
+a new memory policy via mbind(). The pages of a process can also be relocated
+from another process using the sys_migrate_pages() function call. The
+migrate_pages() function call takes two sets of nodes and moves pages of a
+process that are located on the from nodes to the destination nodes.
+Page migration functions are provided by the numactl package by Andi Kleen
+(a version later than 0.9.3 is required. Get it from
+https://github.com/numactl/numactl.git). numactl provides libnuma
+which provides an interface similar to other NUMA functionality for page
+migration.  cat ``/proc/<pid>/numa_maps`` allows an easy review of where the
+pages of a process are located. See also the numa_maps documentation in the
+proc(5) man page.
+
+Manual migration is useful if for example the scheduler has relocated
+a process to a processor on a distant node. A batch scheduler or an
+administrator may detect the situation and move the pages of the process
+nearer to the new processor. The kernel itself only provides
+manual page migration support. Automatic page migration may be implemented
+through user space processes that move pages. A special function call
+"move_pages" allows the moving of individual pages within a process.
+For example, A NUMA profiler may obtain a log showing frequent off-node
+accesses and may use the result to move pages to more advantageous
+locations.
+
+Larger installations usually partition the system using cpusets into
+sections of nodes. Paul Jackson has equipped cpusets with the ability to
+move pages when a task is moved to another cpuset (See
+:ref:`CPUSETS <cpusets>`).
+Cpusets allow the automation of process locality. If a task is moved to
+a new cpuset then also all its pages are moved with it so that the
+performance of the process does not sink dramatically. Also the pages
+of processes in a cpuset are moved if the allowed memory nodes of a
+cpuset are changed.
+
+Page migration allows the preservation of the relative location of pages
+within a group of nodes for all migration techniques which will preserve a
+particular memory allocation pattern generated even after migrating a
+process. This is necessary in order to preserve the memory latencies.
+Processes will run with similar performance after migration.
+
+Page migration occurs in several steps. First a high level
+description for those trying to use migrate_pages() from the kernel
+(for userspace usage see the Andi Kleen's numactl package mentioned above)
+and then a low level description of how the low level details work.
+
+In kernel use of migrate_pages()
+================================
+
+1. Remove pages from the LRU.
+
+   Lists of pages to be migrated are generated by scanning over
+   pages and moving them into lists. This is done by
+   calling isolate_lru_page().
+   Calling isolate_lru_page() increases the references to the page
+   so that it cannot vanish while the page migration occurs.
+   It also prevents the swapper or other scans from encountering
+   the page.
+
+2. We need to have a function of type new_page_t that can be
+   passed to migrate_pages(). This function should figure out
+   how to allocate the correct new page given the old page.
+
+3. The migrate_pages() function is called which attempts
+   to do the migration. It will call the function to allocate
+   the new page for each page that is considered for
+   moving.
+
+How migrate_pages() works
+=========================
+
+migrate_pages() does several passes over its list of pages. A page is moved
+if all references to a page are removable at the time. The page has
+already been removed from the LRU via isolate_lru_page() and the refcount
+is increased so that the page cannot be freed while page migration occurs.
+
+Steps:
+
+1. Lock the page to be migrated.
+
+2. Ensure that writeback is complete.
+
+3. Lock the new page that we want to move to. It is locked so that accesses to
+   this (not yet up-to-date) page immediately block while the move is in progress.
+
+4. All the page table references to the page are converted to migration
+   entries. This decreases the mapcount of a page. If the resulting
+   mapcount is not zero then we do not migrate the page. All user space
+   processes that attempt to access the page will now wait on the page lock
+   or wait for the migration page table entry to be removed.
+
+5. The i_pages lock is taken. This will cause all processes trying
+   to access the page via the mapping to block on the spinlock.
+
+6. The refcount of the page is examined and we back out if references remain.
+   Otherwise, we know that we are the only one referencing this page.
+
+7. The radix tree is checked and if it does not contain the pointer to this
+   page then we back out because someone else modified the radix tree.
+
+8. The new page is prepped with some settings from the old page so that
+   accesses to the new page will discover a page with the correct settings.
+
+9. The radix tree is changed to point to the new page.
+
+10. The reference count of the old page is dropped because the address space
+    reference is gone. A reference to the new page is established because
+    the new page is referenced by the address space.
+
+11. The i_pages lock is dropped. With that lookups in the mapping
+    become possible again. Processes will move from spinning on the lock
+    to sleeping on the locked new page.
+
+12. The page contents are copied to the new page.
+
+13. The remaining page flags are copied to the new page.
+
+14. The old page flags are cleared to indicate that the page does
+    not provide any information anymore.
+
+15. Queued up writeback on the new page is triggered.
+
+16. If migration entries were inserted into the page table, then replace them
+    with real ptes. Doing so will enable access for user space processes not
+    already waiting for the page lock.
+
+17. The page locks are dropped from the old and new page.
+    Processes waiting on the page lock will redo their page faults
+    and will reach the new page.
+
+18. The new page is moved to the LRU and can be scanned by the swapper,
+    etc. again.
+
+Non-LRU page migration
+======================
+
+Although migration originally aimed for reducing the latency of memory accesses
+for NUMA, compaction also uses migration to create high-order pages.
+
+Current problem of the implementation is that it is designed to migrate only
+*LRU* pages. However, there are potential non-LRU pages which can be migrated
+in drivers, for example, zsmalloc, virtio-balloon pages.
+
+For virtio-balloon pages, some parts of migration code path have been hooked
+up and added virtio-balloon specific functions to intercept migration logics.
+It's too specific to a driver so other drivers who want to make their pages
+movable would have to add their own specific hooks in the migration path.
+
+To overcome the problem, VM supports non-LRU page migration which provides
+generic functions for non-LRU movable pages without driver specific hooks
+in the migration path.
+
+If a driver wants to make its pages movable, it should define three functions
+which are function pointers of struct address_space_operations.
+
+1. ``bool (*isolate_page) (struct page *page, isolate_mode_t mode);``
+
+   What VM expects from isolate_page() function of driver is to return *true*
+   if driver isolates the page successfully. On returning true, VM marks the page
+   as PG_isolated so concurrent isolation in several CPUs skip the page
+   for isolation. If a driver cannot isolate the page, it should return *false*.
+
+   Once page is successfully isolated, VM uses page.lru fields so driver
+   shouldn't expect to preserve values in those fields.
+
+2. ``int (*migratepage) (struct address_space *mapping,``
+|	``struct page *newpage, struct page *oldpage, enum migrate_mode);``
+
+   After isolation, VM calls migratepage() of driver with the isolated page.
+   The function of migratepage() is to move the contents of the old page to the
+   new page
+   and set up fields of struct page newpage. Keep in mind that you should
+   indicate to the VM the oldpage is no longer movable via __ClearPageMovable()
+   under page_lock if you migrated the oldpage successfully and returned
+   MIGRATEPAGE_SUCCESS. If driver cannot migrate the page at the moment, driver
+   can return -EAGAIN. On -EAGAIN, VM will retry page migration in a short time
+   because VM interprets -EAGAIN as "temporary migration failure". On returning
+   any error except -EAGAIN, VM will give up the page migration without
+   retrying.
+
+   Driver shouldn't touch the page.lru field while in the migratepage() function.
+
+3. ``void (*putback_page)(struct page *);``
+
+   If migration fails on the isolated page, VM should return the isolated page
+   to the driver so VM calls the driver's putback_page() with the isolated page.
+   In this function, the driver should put the isolated page back into its own data
+   structure.
+
+Non-LRU movable page flags
+
+   There are two page flags for supporting non-LRU movable page.
+
+   * PG_movable
+
+     Driver should use the function below to make page movable under page_lock::
+
+	void __SetPageMovable(struct page *page, struct address_space *mapping)
+
+     It needs argument of address_space for registering migration
+     family functions which will be called by VM. Exactly speaking,
+     PG_movable is not a real flag of struct page. Rather, VM
+     reuses the page->mapping's lower bits to represent it::
+
+	#define PAGE_MAPPING_MOVABLE 0x2
+	page->mapping = page->mapping | PAGE_MAPPING_MOVABLE;
+
+     so driver shouldn't access page->mapping directly. Instead, driver should
+     use page_mapping() which masks off the low two bits of page->mapping under
+     page lock so it can get the right struct address_space.
+
+     For testing of non-LRU movable pages, VM supports __PageMovable() function.
+     However, it doesn't guarantee to identify non-LRU movable pages because
+     the page->mapping field is unified with other variables in struct page.
+     If the driver releases the page after isolation by VM, page->mapping
+     doesn't have a stable value although it has PAGE_MAPPING_MOVABLE set
+     (look at __ClearPageMovable). But __PageMovable() is cheap to call whether
+     page is LRU or non-LRU movable once the page has been isolated because LRU
+     pages can never have PAGE_MAPPING_MOVABLE set in page->mapping. It is also
+     good for just peeking to test non-LRU movable pages before more expensive
+     checking with lock_page() in pfn scanning to select a victim.
+
+     For guaranteeing non-LRU movable page, VM provides PageMovable() function.
+     Unlike __PageMovable(), PageMovable() validates page->mapping and
+     mapping->a_ops->isolate_page under lock_page(). The lock_page() prevents
+     sudden destroying of page->mapping.
+
+     Drivers using __SetPageMovable() should clear the flag via
+     __ClearMovablePage() under page_lock() before the releasing the page.
+
+   * PG_isolated
+
+     To prevent concurrent isolation among several CPUs, VM marks isolated page
+     as PG_isolated under lock_page(). So if a CPU encounters PG_isolated
+     non-LRU movable page, it can skip it. Driver doesn't need to manipulate the
+     flag because VM will set/clear it automatically. Keep in mind that if the
+     driver sees a PG_isolated page, it means the page has been isolated by the
+     VM so it shouldn't touch the page.lru field.
+     The PG_isolated flag is aliased with the PG_reclaim flag so drivers
+     shouldn't use PG_isolated for its own purposes.
+
+Monitoring Migration
+=====================
+
+The following events (counters) can be used to monitor page migration.
+
+1. PGMIGRATE_SUCCESS: Normal page migration success. Each count means that a
+   page was migrated. If the page was a non-THP and non-hugetlb page, then
+   this counter is increased by one. If the page was a THP or hugetlb, then
+   this counter is increased by the number of THP or hugetlb subpages.
+   For example, migration of a single 2MB THP that has 4KB-size base pages
+   (subpages) will cause this counter to increase by 512.
+
+2. PGMIGRATE_FAIL: Normal page migration failure. Same counting rules as for
+   PGMIGRATE_SUCCESS, above: this will be increased by the number of subpages,
+   if it was a THP or hugetlb.
+
+3. THP_MIGRATION_SUCCESS: A THP was migrated without being split.
+
+4. THP_MIGRATION_FAIL: A THP could not be migrated nor it could be split.
+
+5. THP_MIGRATION_SPLIT: A THP was migrated, but not as such: first, the THP had
+   to be split. After splitting, a migration retry was used for it's sub-pages.
+
+THP_MIGRATION_* events also update the appropriate PGMIGRATE_SUCCESS or
+PGMIGRATE_FAIL events. For example, a THP migration failure will cause both
+THP_MIGRATION_FAIL and PGMIGRATE_FAIL to increase.
+
+Christoph Lameter, May 8, 2006.
+Minchan Kim, Mar 28, 2016.
diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst
new file mode 100644
index 000000000000..f5c954afe97c
--- /dev/null
+++ b/Documentation/mm/page_owner.rst
@@ -0,0 +1,196 @@
+.. _page_owner:
+
+==================================================
+page owner: Tracking about who allocated each page
+==================================================
+
+Introduction
+============
+
+page owner is for the tracking about who allocated each page.
+It can be used to debug memory leak or to find a memory hogger.
+When allocation happens, information about allocation such as call stack
+and order of pages is stored into certain storage for each page.
+When we need to know about status of all pages, we can get and analyze
+this information.
+
+Although we already have tracepoint for tracing page allocation/free,
+using it for analyzing who allocate each page is rather complex. We need
+to enlarge the trace buffer for preventing overlapping until userspace
+program launched. And, launched program continually dump out the trace
+buffer for later analysis and it would change system behaviour with more
+possibility rather than just keeping it in memory, so bad for debugging.
+
+page owner can also be used for various purposes. For example, accurate
+fragmentation statistics can be obtained through gfp flag information of
+each page. It is already implemented and activated if page owner is
+enabled. Other usages are more than welcome.
+
+page owner is disabled by default. So, if you'd like to use it, you need
+to add "page_owner=on" to your boot cmdline. If the kernel is built
+with page owner and page owner is disabled in runtime due to not enabling
+boot option, runtime overhead is marginal. If disabled in runtime, it
+doesn't require memory to store owner information, so there is no runtime
+memory overhead. And, page owner inserts just two unlikely branches into
+the page allocator hotpath and if not enabled, then allocation is done
+like as the kernel without page owner. These two unlikely branches should
+not affect to allocation performance, especially if the static keys jump
+label patching functionality is available. Following is the kernel's code
+size change due to this facility.
+
+- Without page owner::
+
+   text    data     bss     dec     hex filename
+   48392   2333     644   51369    c8a9 mm/page_alloc.o
+
+- With page owner::
+
+   text    data     bss     dec     hex filename
+   48800   2445     644   51889    cab1 mm/page_alloc.o
+   6662     108      29    6799    1a8f mm/page_owner.o
+   1025       8       8    1041     411 mm/page_ext.o
+
+Although, roughly, 8 KB code is added in total, page_alloc.o increase by
+520 bytes and less than half of it is in hotpath. Building the kernel with
+page owner and turning it on if needed would be great option to debug
+kernel memory problem.
+
+There is one notice that is caused by implementation detail. page owner
+stores information into the memory from struct page extension. This memory
+is initialized some time later than that page allocator starts in sparse
+memory system, so, until initialization, many pages can be allocated and
+they would have no owner information. To fix it up, these early allocated
+pages are investigated and marked as allocated in initialization phase.
+Although it doesn't mean that they have the right owner information,
+at least, we can tell whether the page is allocated or not,
+more accurately. On 2GB memory x86-64 VM box, 13343 early allocated pages
+are catched and marked, although they are mostly allocated from struct
+page extension feature. Anyway, after that, no page is left in
+un-tracking state.
+
+Usage
+=====
+
+1) Build user-space helper::
+
+	cd tools/vm
+	make page_owner_sort
+
+2) Enable page owner: add "page_owner=on" to boot cmdline.
+
+3) Do the job that you want to debug.
+
+4) Analyze information from page owner::
+
+	cat /sys/kernel/debug/page_owner > page_owner_full.txt
+	./page_owner_sort page_owner_full.txt sorted_page_owner.txt
+
+   The general output of ``page_owner_full.txt`` is as follows::
+
+	Page allocated via order XXX, ...
+	PFN XXX ...
+	// Detailed stack
+
+	Page allocated via order XXX, ...
+	PFN XXX ...
+	// Detailed stack
+
+   The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows
+   in buf, uses regexp to extract the page order value, counts the times
+   and pages of buf, and finally sorts them according to the parameter(s).
+
+   See the result about who allocated each page
+   in the ``sorted_page_owner.txt``. General output::
+
+	XXX times, XXX pages:
+	Page allocated via order XXX, ...
+	// Detailed stack
+
+   By default, ``page_owner_sort`` is sorted according to the times of buf.
+   If you want to sort by the page nums of buf, use the ``-m`` parameter.
+   The detailed parameters are:
+
+   fundamental function::
+
+	Sort:
+		-a		Sort by memory allocation time.
+		-m		Sort by total memory.
+		-p		Sort by pid.
+		-P		Sort by tgid.
+		-n		Sort by task command name.
+		-r		Sort by memory release time.
+		-s		Sort by stack trace.
+		-t		Sort by times (default).
+		--sort <order>	Specify sorting order.  Sorting syntax is [+|-]key[,[+|-]key[,...]].
+				Choose a key from the **STANDARD FORMAT SPECIFIERS** section. The "+" is
+				optional since default direction is increasing numerical or lexicographic
+				order. Mixed use of abbreviated and complete-form of keys is allowed.
+
+		Examples:
+				./page_owner_sort <input> <output> --sort=n,+pid,-tgid
+				./page_owner_sort <input> <output> --sort=at
+
+   additional function::
+
+	Cull:
+		--cull <rules>
+				Specify culling rules.Culling syntax is key[,key[,...]].Choose a
+				multi-letter key from the **STANDARD FORMAT SPECIFIERS** section.
+
+		<rules> is a single argument in the form of a comma-separated list,
+		which offers a way to specify individual culling rules.  The recognized
+		keywords are described in the **STANDARD FORMAT SPECIFIERS** section below.
+		<rules> can be specified by the sequence of keys k1,k2, ..., as described in
+		the STANDARD SORT KEYS section below. Mixed use of abbreviated and
+		complete-form of keys is allowed.
+
+		Examples:
+				./page_owner_sort <input> <output> --cull=stacktrace
+				./page_owner_sort <input> <output> --cull=st,pid,name
+				./page_owner_sort <input> <output> --cull=n,f
+
+	Filter:
+		-f		Filter out the information of blocks whose memory has been released.
+
+	Select:
+		--pid <pidlist>		Select by pid. This selects the blocks whose process ID
+					numbers appear in <pidlist>.
+		--tgid <tgidlist>	Select by tgid. This selects the blocks whose thread
+					group ID numbers appear in <tgidlist>.
+		--name <cmdlist>	Select by task command name. This selects the blocks whose
+					task command name appear in <cmdlist>.
+
+		<pidlist>, <tgidlist>, <cmdlist> are single arguments in the form of a comma-separated list,
+		which offers a way to specify individual selecting rules.
+
+
+		Examples:
+				./page_owner_sort <input> <output> --pid=1
+				./page_owner_sort <input> <output> --tgid=1,2,3
+				./page_owner_sort <input> <output> --name name1,name2
+
+STANDARD FORMAT SPECIFIERS
+==========================
+::
+
+  For --sort option:
+
+	KEY		LONG		DESCRIPTION
+	p		pid		process ID
+	tg		tgid		thread group ID
+	n		name		task command name
+	st		stacktrace	stack trace of the page allocation
+	T		txt		full text of block
+	ft		free_ts		timestamp of the page when it was released
+	at		alloc_ts	timestamp of the page when it was allocated
+	ator		allocator	memory allocator for pages
+
+  For --curl option:
+
+	KEY		LONG		DESCRIPTION
+	p		pid		process ID
+	tg		tgid		thread group ID
+	n		name		task command name
+	f		free		whether the page has been released or not
+	st		stacktrace	stack trace of the page allocation
+	ator		allocator	memory allocator for pages
diff --git a/Documentation/mm/page_reclaim.rst b/Documentation/mm/page_reclaim.rst
new file mode 100644
index 000000000000..50a30b7f8ac3
--- /dev/null
+++ b/Documentation/mm/page_reclaim.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============
+Page Reclaim
+============
diff --git a/Documentation/mm/page_table_check.rst b/Documentation/mm/page_table_check.rst
new file mode 100644
index 000000000000..1a09472f10a3
--- /dev/null
+++ b/Documentation/mm/page_table_check.rst
@@ -0,0 +1,56 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. _page_table_check:
+
+================
+Page Table Check
+================
+
+Introduction
+============
+
+Page table check allows to harden the kernel by ensuring that some types of
+the memory corruptions are prevented.
+
+Page table check performs extra verifications at the time when new pages become
+accessible from the userspace by getting their page table entries (PTEs PMDs
+etc.) added into the table.
+
+In case of detected corruption, the kernel is crashed. There is a small
+performance and memory overhead associated with the page table check. Therefore,
+it is disabled by default, but can be optionally enabled on systems where the
+extra hardening outweighs the performance costs. Also, because page table check
+is synchronous, it can help with debugging double map memory corruption issues,
+by crashing kernel at the time wrong mapping occurs instead of later which is
+often the case with memory corruptions bugs.
+
+Double mapping detection logic
+==============================
+
++-------------------+-------------------+-------------------+------------------+
+| Current Mapping   | New mapping       | Permissions       | Rule             |
++===================+===================+===================+==================+
+| Anonymous         | Anonymous         | Read              | Allow            |
++-------------------+-------------------+-------------------+------------------+
+| Anonymous         | Anonymous         | Read / Write      | Prohibit         |
++-------------------+-------------------+-------------------+------------------+
+| Anonymous         | Named             | Any               | Prohibit         |
++-------------------+-------------------+-------------------+------------------+
+| Named             | Anonymous         | Any               | Prohibit         |
++-------------------+-------------------+-------------------+------------------+
+| Named             | Named             | Any               | Allow            |
++-------------------+-------------------+-------------------+------------------+
+
+Enabling Page Table Check
+=========================
+
+Build kernel with:
+
+- PAGE_TABLE_CHECK=y
+  Note, it can only be enabled on platforms where ARCH_SUPPORTS_PAGE_TABLE_CHECK
+  is available.
+
+- Boot with 'page_table_check=on' kernel parameter.
+
+Optionally, build kernel with PAGE_TABLE_CHECK_ENFORCED in order to have page
+table support without extra kernel parameter.
diff --git a/Documentation/mm/page_tables.rst b/Documentation/mm/page_tables.rst
new file mode 100644
index 000000000000..96939571d7bc
--- /dev/null
+++ b/Documentation/mm/page_tables.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========
+Page Tables
+===========
diff --git a/Documentation/mm/physical_memory.rst b/Documentation/mm/physical_memory.rst
new file mode 100644
index 000000000000..2ab7b8c1c863
--- /dev/null
+++ b/Documentation/mm/physical_memory.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+Physical Memory
+===============
diff --git a/Documentation/mm/process_addrs.rst b/Documentation/mm/process_addrs.rst
new file mode 100644
index 000000000000..e8618fbc62c9
--- /dev/null
+++ b/Documentation/mm/process_addrs.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================
+Process Addresses
+=================
diff --git a/Documentation/mm/remap_file_pages.rst b/Documentation/mm/remap_file_pages.rst
new file mode 100644
index 000000000000..7bef6718e3a9
--- /dev/null
+++ b/Documentation/mm/remap_file_pages.rst
@@ -0,0 +1,33 @@
+.. _remap_file_pages:
+
+==============================
+remap_file_pages() system call
+==============================
+
+The remap_file_pages() system call is used to create a nonlinear mapping,
+that is, a mapping in which the pages of the file are mapped into a
+nonsequential order in memory. The advantage of using remap_file_pages()
+over using repeated calls to mmap(2) is that the former approach does not
+require the kernel to create additional VMA (Virtual Memory Area) data
+structures.
+
+Supporting of nonlinear mapping requires significant amount of non-trivial
+code in kernel virtual memory subsystem including hot paths. Also to get
+nonlinear mapping work kernel need a way to distinguish normal page table
+entries from entries with file offset (pte_file). Kernel reserves flag in
+PTE for this purpose. PTE flags are scarce resource especially on some CPU
+architectures. It would be nice to free up the flag for other usage.
+
+Fortunately, there are not many users of remap_file_pages() in the wild.
+It's only known that one enterprise RDBMS implementation uses the syscall
+on 32-bit systems to map files bigger than can linearly fit into 32-bit
+virtual address space. This use-case is not critical anymore since 64-bit
+systems are widely available.
+
+The syscall is deprecated and replaced it with an emulation now. The
+emulation creates new VMAs instead of nonlinear mappings. It's going to
+work slower for rare users of remap_file_pages() but ABI is preserved.
+
+One side effect of emulation (apart from performance) is that user can hit
+vm.max_map_count limit more easily due to additional VMAs. See comment for
+DEFAULT_MAX_MAP_COUNT for more details on the limit.
diff --git a/Documentation/mm/shmfs.rst b/Documentation/mm/shmfs.rst
new file mode 100644
index 000000000000..8b01ebb4c30e
--- /dev/null
+++ b/Documentation/mm/shmfs.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========================
+Shared Memory Filesystem
+========================
diff --git a/Documentation/mm/slab.rst b/Documentation/mm/slab.rst
new file mode 100644
index 000000000000..87d5a5bb172f
--- /dev/null
+++ b/Documentation/mm/slab.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+Slab Allocation
+===============
diff --git a/Documentation/mm/slub.rst b/Documentation/mm/slub.rst
new file mode 100644
index 000000000000..43063ade737a
--- /dev/null
+++ b/Documentation/mm/slub.rst
@@ -0,0 +1,452 @@
+.. _slub:
+
+==========================
+Short users guide for SLUB
+==========================
+
+The basic philosophy of SLUB is very different from SLAB. SLAB
+requires rebuilding the kernel to activate debug options for all
+slab caches. SLUB always includes full debugging but it is off by default.
+SLUB can enable debugging only for selected slabs in order to avoid
+an impact on overall system performance which may make a bug more
+difficult to find.
+
+In order to switch debugging on one can add an option ``slub_debug``
+to the kernel command line. That will enable full debugging for
+all slabs.
+
+Typically one would then use the ``slabinfo`` command to get statistical
+data and perform operation on the slabs. By default ``slabinfo`` only lists
+slabs that have data in them. See "slabinfo -h" for more options when
+running the command. ``slabinfo`` can be compiled with
+::
+
+	gcc -o slabinfo tools/vm/slabinfo.c
+
+Some of the modes of operation of ``slabinfo`` require that slub debugging
+be enabled on the command line. F.e. no tracking information will be
+available without debugging on and validation can only partially
+be performed if debugging was not switched on.
+
+Some more sophisticated uses of slub_debug:
+-------------------------------------------
+
+Parameters may be given to ``slub_debug``. If none is specified then full
+debugging is enabled. Format:
+
+slub_debug=<Debug-Options>
+	Enable options for all slabs
+
+slub_debug=<Debug-Options>,<slab name1>,<slab name2>,...
+	Enable options only for select slabs (no spaces
+	after a comma)
+
+Multiple blocks of options for all slabs or selected slabs can be given, with
+blocks of options delimited by ';'. The last of "all slabs" blocks is applied
+to all slabs except those that match one of the "select slabs" block. Options
+of the first "select slabs" blocks that matches the slab's name are applied.
+
+Possible debug options are::
+
+	F		Sanity checks on (enables SLAB_DEBUG_CONSISTENCY_CHECKS
+			Sorry SLAB legacy issues)
+	Z		Red zoning
+	P		Poisoning (object and padding)
+	U		User tracking (free and alloc)
+	T		Trace (please only use on single slabs)
+	A		Enable failslab filter mark for the cache
+	O		Switch debugging off for caches that would have
+			caused higher minimum slab orders
+	-		Switch all debugging off (useful if the kernel is
+			configured with CONFIG_SLUB_DEBUG_ON)
+
+F.e. in order to boot just with sanity checks and red zoning one would specify::
+
+	slub_debug=FZ
+
+Trying to find an issue in the dentry cache? Try::
+
+	slub_debug=,dentry
+
+to only enable debugging on the dentry cache.  You may use an asterisk at the
+end of the slab name, in order to cover all slabs with the same prefix.  For
+example, here's how you can poison the dentry cache as well as all kmalloc
+slabs::
+
+	slub_debug=P,kmalloc-*,dentry
+
+Red zoning and tracking may realign the slab.  We can just apply sanity checks
+to the dentry cache with::
+
+	slub_debug=F,dentry
+
+Debugging options may require the minimum possible slab order to increase as
+a result of storing the metadata (for example, caches with PAGE_SIZE object
+sizes).  This has a higher liklihood of resulting in slab allocation errors
+in low memory situations or if there's high fragmentation of memory.  To
+switch off debugging for such caches by default, use::
+
+	slub_debug=O
+
+You can apply different options to different list of slab names, using blocks
+of options. This will enable red zoning for dentry and user tracking for
+kmalloc. All other slabs will not get any debugging enabled::
+
+	slub_debug=Z,dentry;U,kmalloc-*
+
+You can also enable options (e.g. sanity checks and poisoning) for all caches
+except some that are deemed too performance critical and don't need to be
+debugged by specifying global debug options followed by a list of slab names
+with "-" as options::
+
+	slub_debug=FZ;-,zs_handle,zspage
+
+The state of each debug option for a slab can be found in the respective files
+under::
+
+	/sys/kernel/slab/<slab name>/
+
+If the file contains 1, the option is enabled, 0 means disabled. The debug
+options from the ``slub_debug`` parameter translate to the following files::
+
+	F	sanity_checks
+	Z	red_zone
+	P	poison
+	U	store_user
+	T	trace
+	A	failslab
+
+Careful with tracing: It may spew out lots of information and never stop if
+used on the wrong slab.
+
+Slab merging
+============
+
+If no debug options are specified then SLUB may merge similar slabs together
+in order to reduce overhead and increase cache hotness of objects.
+``slabinfo -a`` displays which slabs were merged together.
+
+Slab validation
+===============
+
+SLUB can validate all object if the kernel was booted with slub_debug. In
+order to do so you must have the ``slabinfo`` tool. Then you can do
+::
+
+	slabinfo -v
+
+which will test all objects. Output will be generated to the syslog.
+
+This also works in a more limited way if boot was without slab debug.
+In that case ``slabinfo -v`` simply tests all reachable objects. Usually
+these are in the cpu slabs and the partial slabs. Full slabs are not
+tracked by SLUB in a non debug situation.
+
+Getting more performance
+========================
+
+To some degree SLUB's performance is limited by the need to take the
+list_lock once in a while to deal with partial slabs. That overhead is
+governed by the order of the allocation for each slab. The allocations
+can be influenced by kernel parameters:
+
+.. slub_min_objects=x		(default 4)
+.. slub_min_order=x		(default 0)
+.. slub_max_order=x		(default 3 (PAGE_ALLOC_COSTLY_ORDER))
+
+``slub_min_objects``
+	allows to specify how many objects must at least fit into one
+	slab in order for the allocation order to be acceptable.  In
+	general slub will be able to perform this number of
+	allocations on a slab without consulting centralized resources
+	(list_lock) where contention may occur.
+
+``slub_min_order``
+	specifies a minimum order of slabs. A similar effect like
+	``slub_min_objects``.
+
+``slub_max_order``
+	specified the order at which ``slub_min_objects`` should no
+	longer be checked. This is useful to avoid SLUB trying to
+	generate super large order pages to fit ``slub_min_objects``
+	of a slab cache with large object sizes into one high order
+	page. Setting command line parameter
+	``debug_guardpage_minorder=N`` (N > 0), forces setting
+	``slub_max_order`` to 0, what cause minimum possible order of
+	slabs allocation.
+
+SLUB Debug output
+=================
+
+Here is a sample of slub debug output::
+
+ ====================================================================
+ BUG kmalloc-8: Right Redzone overwritten
+ --------------------------------------------------------------------
+
+ INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc
+ INFO: Slab 0xc528c530 flags=0x400000c3 inuse=61 fp=0xc90f6d58
+ INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58
+ INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554
+
+ Bytes b4 (0xc90f6d10): 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ
+ Object   (0xc90f6d20): 31 30 31 39 2e 30 30 35                         1019.005
+ Redzone  (0xc90f6d28): 00 cc cc cc                                     .
+ Padding  (0xc90f6d50): 5a 5a 5a 5a 5a 5a 5a 5a                         ZZZZZZZZ
+
+   [<c010523d>] dump_trace+0x63/0x1eb
+   [<c01053df>] show_trace_log_lvl+0x1a/0x2f
+   [<c010601d>] show_trace+0x12/0x14
+   [<c0106035>] dump_stack+0x16/0x18
+   [<c017e0fa>] object_err+0x143/0x14b
+   [<c017e2cc>] check_object+0x66/0x234
+   [<c017eb43>] __slab_free+0x239/0x384
+   [<c017f446>] kfree+0xa6/0xc6
+   [<c02e2335>] get_modalias+0xb9/0xf5
+   [<c02e23b7>] dmi_dev_uevent+0x27/0x3c
+   [<c027866a>] dev_uevent+0x1ad/0x1da
+   [<c0205024>] kobject_uevent_env+0x20a/0x45b
+   [<c020527f>] kobject_uevent+0xa/0xf
+   [<c02779f1>] store_uevent+0x4f/0x58
+   [<c027758e>] dev_attr_store+0x29/0x2f
+   [<c01bec4f>] sysfs_write_file+0x16e/0x19c
+   [<c0183ba7>] vfs_write+0xd1/0x15a
+   [<c01841d7>] sys_write+0x3d/0x72
+   [<c0104112>] sysenter_past_esp+0x5f/0x99
+   [<b7f7b410>] 0xb7f7b410
+   =======================
+
+ FIX kmalloc-8: Restoring Redzone 0xc90f6d28-0xc90f6d2b=0xcc
+
+If SLUB encounters a corrupted object (full detection requires the kernel
+to be booted with slub_debug) then the following output will be dumped
+into the syslog:
+
+1. Description of the problem encountered
+
+   This will be a message in the system log starting with::
+
+     ===============================================
+     BUG <slab cache affected>: <What went wrong>
+     -----------------------------------------------
+
+     INFO: <corruption start>-<corruption_end> <more info>
+     INFO: Slab <address> <slab information>
+     INFO: Object <address> <object information>
+     INFO: Allocated in <kernel function> age=<jiffies since alloc> cpu=<allocated by
+	cpu> pid=<pid of the process>
+     INFO: Freed in <kernel function> age=<jiffies since free> cpu=<freed by cpu>
+	pid=<pid of the process>
+
+   (Object allocation / free information is only available if SLAB_STORE_USER is
+   set for the slab. slub_debug sets that option)
+
+2. The object contents if an object was involved.
+
+   Various types of lines can follow the BUG SLUB line:
+
+   Bytes b4 <address> : <bytes>
+	Shows a few bytes before the object where the problem was detected.
+	Can be useful if the corruption does not stop with the start of the
+	object.
+
+   Object <address> : <bytes>
+	The bytes of the object. If the object is inactive then the bytes
+	typically contain poison values. Any non-poison value shows a
+	corruption by a write after free.
+
+   Redzone <address> : <bytes>
+	The Redzone following the object. The Redzone is used to detect
+	writes after the object. All bytes should always have the same
+	value. If there is any deviation then it is due to a write after
+	the object boundary.
+
+	(Redzone information is only available if SLAB_RED_ZONE is set.
+	slub_debug sets that option)
+
+   Padding <address> : <bytes>
+	Unused data to fill up the space in order to get the next object
+	properly aligned. In the debug case we make sure that there are
+	at least 4 bytes of padding. This allows the detection of writes
+	before the object.
+
+3. A stackdump
+
+   The stackdump describes the location where the error was detected. The cause
+   of the corruption is may be more likely found by looking at the function that
+   allocated or freed the object.
+
+4. Report on how the problem was dealt with in order to ensure the continued
+   operation of the system.
+
+   These are messages in the system log beginning with::
+
+	FIX <slab cache affected>: <corrective action taken>
+
+   In the above sample SLUB found that the Redzone of an active object has
+   been overwritten. Here a string of 8 characters was written into a slab that
+   has the length of 8 characters. However, a 8 character string needs a
+   terminating 0. That zero has overwritten the first byte of the Redzone field.
+   After reporting the details of the issue encountered the FIX SLUB message
+   tells us that SLUB has restored the Redzone to its proper value and then
+   system operations continue.
+
+Emergency operations
+====================
+
+Minimal debugging (sanity checks alone) can be enabled by booting with::
+
+	slub_debug=F
+
+This will be generally be enough to enable the resiliency features of slub
+which will keep the system running even if a bad kernel component will
+keep corrupting objects. This may be important for production systems.
+Performance will be impacted by the sanity checks and there will be a
+continual stream of error messages to the syslog but no additional memory
+will be used (unlike full debugging).
+
+No guarantees. The kernel component still needs to be fixed. Performance
+may be optimized further by locating the slab that experiences corruption
+and enabling debugging only for that cache
+
+I.e.::
+
+	slub_debug=F,dentry
+
+If the corruption occurs by writing after the end of the object then it
+may be advisable to enable a Redzone to avoid corrupting the beginning
+of other objects::
+
+	slub_debug=FZ,dentry
+
+Extended slabinfo mode and plotting
+===================================
+
+The ``slabinfo`` tool has a special 'extended' ('-X') mode that includes:
+ - Slabcache Totals
+ - Slabs sorted by size (up to -N <num> slabs, default 1)
+ - Slabs sorted by loss (up to -N <num> slabs, default 1)
+
+Additionally, in this mode ``slabinfo`` does not dynamically scale
+sizes (G/M/K) and reports everything in bytes (this functionality is
+also available to other slabinfo modes via '-B' option) which makes
+reporting more precise and accurate. Moreover, in some sense the `-X'
+mode also simplifies the analysis of slabs' behaviour, because its
+output can be plotted using the ``slabinfo-gnuplot.sh`` script. So it
+pushes the analysis from looking through the numbers (tons of numbers)
+to something easier -- visual analysis.
+
+To generate plots:
+
+a) collect slabinfo extended records, for example::
+
+	while [ 1 ]; do slabinfo -X >> FOO_STATS; sleep 1; done
+
+b) pass stats file(-s) to ``slabinfo-gnuplot.sh`` script::
+
+	slabinfo-gnuplot.sh FOO_STATS [FOO_STATS2 .. FOO_STATSN]
+
+   The ``slabinfo-gnuplot.sh`` script will pre-processes the collected records
+   and generates 3 png files (and 3 pre-processing cache files) per STATS
+   file:
+   - Slabcache Totals: FOO_STATS-totals.png
+   - Slabs sorted by size: FOO_STATS-slabs-by-size.png
+   - Slabs sorted by loss: FOO_STATS-slabs-by-loss.png
+
+Another use case, when ``slabinfo-gnuplot.sh`` can be useful, is when you
+need to compare slabs' behaviour "prior to" and "after" some code
+modification.  To help you out there, ``slabinfo-gnuplot.sh`` script
+can 'merge' the `Slabcache Totals` sections from different
+measurements. To visually compare N plots:
+
+a) Collect as many STATS1, STATS2, .. STATSN files as you need::
+
+	while [ 1 ]; do slabinfo -X >> STATS<X>; sleep 1; done
+
+b) Pre-process those STATS files::
+
+	slabinfo-gnuplot.sh STATS1 STATS2 .. STATSN
+
+c) Execute ``slabinfo-gnuplot.sh`` in '-t' mode, passing all of the
+   generated pre-processed \*-totals::
+
+	slabinfo-gnuplot.sh -t STATS1-totals STATS2-totals .. STATSN-totals
+
+   This will produce a single plot (png file).
+
+   Plots, expectedly, can be large so some fluctuations or small spikes
+   can go unnoticed. To deal with that, ``slabinfo-gnuplot.sh`` has two
+   options to 'zoom-in'/'zoom-out':
+
+   a) ``-s %d,%d`` -- overwrites the default image width and height
+   b) ``-r %d,%d`` -- specifies a range of samples to use (for example,
+      in ``slabinfo -X >> FOO_STATS; sleep 1;`` case, using a ``-r
+      40,60`` range will plot only samples collected between 40th and
+      60th seconds).
+
+
+DebugFS files for SLUB
+======================
+
+For more information about current state of SLUB caches with the user tracking
+debug option enabled, debugfs files are available, typically under
+/sys/kernel/debug/slab/<cache>/ (created only for caches with enabled user
+tracking). There are 2 types of these files with the following debug
+information:
+
+1. alloc_traces::
+
+    Prints information about unique allocation traces of the currently
+    allocated objects. The output is sorted by frequency of each trace.
+
+    Information in the output:
+    Number of objects, allocating function, minimal/average/maximal jiffies since alloc,
+    pid range of the allocating processes, cpu mask of allocating cpus, and stack trace.
+
+    Example:::
+
+    1085 populate_error_injection_list+0x97/0x110 age=166678/166680/166682 pid=1 cpus=1::
+	__slab_alloc+0x6d/0x90
+	kmem_cache_alloc_trace+0x2eb/0x300
+	populate_error_injection_list+0x97/0x110
+	init_error_injection+0x1b/0x71
+	do_one_initcall+0x5f/0x2d0
+	kernel_init_freeable+0x26f/0x2d7
+	kernel_init+0xe/0x118
+	ret_from_fork+0x22/0x30
+
+
+2. free_traces::
+
+    Prints information about unique freeing traces of the currently allocated
+    objects. The freeing traces thus come from the previous life-cycle of the
+    objects and are reported as not available for objects allocated for the first
+    time. The output is sorted by frequency of each trace.
+
+    Information in the output:
+    Number of objects, freeing function, minimal/average/maximal jiffies since free,
+    pid range of the freeing processes, cpu mask of freeing cpus, and stack trace.
+
+    Example:::
+
+    1980 <not-available> age=4294912290 pid=0 cpus=0
+    51 acpi_ut_update_ref_count+0x6a6/0x782 age=236886/237027/237772 pid=1 cpus=1
+	kfree+0x2db/0x420
+	acpi_ut_update_ref_count+0x6a6/0x782
+	acpi_ut_update_object_reference+0x1ad/0x234
+	acpi_ut_remove_reference+0x7d/0x84
+	acpi_rs_get_prt_method_data+0x97/0xd6
+	acpi_get_irq_routing_table+0x82/0xc4
+	acpi_pci_irq_find_prt_entry+0x8e/0x2e0
+	acpi_pci_irq_lookup+0x3a/0x1e0
+	acpi_pci_irq_enable+0x77/0x240
+	pcibios_enable_device+0x39/0x40
+	do_pci_enable_device.part.0+0x5d/0xe0
+	pci_enable_device_flags+0xfc/0x120
+	pci_enable_device+0x13/0x20
+	virtio_pci_probe+0x9e/0x170
+	local_pci_probe+0x48/0x80
+	pci_device_probe+0x105/0x1c0
+
+Christoph Lameter, May 30, 2007
+Sergey Senozhatsky, October 23, 2015
diff --git a/Documentation/mm/split_page_table_lock.rst b/Documentation/mm/split_page_table_lock.rst
new file mode 100644
index 000000000000..c08919662704
--- /dev/null
+++ b/Documentation/mm/split_page_table_lock.rst
@@ -0,0 +1,100 @@
+.. _split_page_table_lock:
+
+=====================
+Split page table lock
+=====================
+
+Originally, mm->page_table_lock spinlock protected all page tables of the
+mm_struct. But this approach leads to poor page fault scalability of
+multi-threaded applications due high contention on the lock. To improve
+scalability, split page table lock was introduced.
+
+With split page table lock we have separate per-table lock to serialize
+access to the table. At the moment we use split lock for PTE and PMD
+tables. Access to higher level tables protected by mm->page_table_lock.
+
+There are helpers to lock/unlock a table and other accessor functions:
+
+ - pte_offset_map_lock()
+	maps pte and takes PTE table lock, returns pointer to the taken
+	lock;
+ - pte_unmap_unlock()
+	unlocks and unmaps PTE table;
+ - pte_alloc_map_lock()
+	allocates PTE table if needed and take the lock, returns pointer
+	to taken lock or NULL if allocation failed;
+ - pte_lockptr()
+	returns pointer to PTE table lock;
+ - pmd_lock()
+	takes PMD table lock, returns pointer to taken lock;
+ - pmd_lockptr()
+	returns pointer to PMD table lock;
+
+Split page table lock for PTE tables is enabled compile-time if
+CONFIG_SPLIT_PTLOCK_CPUS (usually 4) is less or equal to NR_CPUS.
+If split lock is disabled, all tables are guarded by mm->page_table_lock.
+
+Split page table lock for PMD tables is enabled, if it's enabled for PTE
+tables and the architecture supports it (see below).
+
+Hugetlb and split page table lock
+=================================
+
+Hugetlb can support several page sizes. We use split lock only for PMD
+level, but not for PUD.
+
+Hugetlb-specific helpers:
+
+ - huge_pte_lock()
+	takes pmd split lock for PMD_SIZE page, mm->page_table_lock
+	otherwise;
+ - huge_pte_lockptr()
+	returns pointer to table lock;
+
+Support of split page table lock by an architecture
+===================================================
+
+There's no need in special enabling of PTE split page table lock: everything
+required is done by pgtable_pte_page_ctor() and pgtable_pte_page_dtor(), which
+must be called on PTE table allocation / freeing.
+
+Make sure the architecture doesn't use slab allocator for page table
+allocation: slab uses page->slab_cache for its pages.
+This field shares storage with page->ptl.
+
+PMD split lock only makes sense if you have more than two page table
+levels.
+
+PMD split lock enabling requires pgtable_pmd_page_ctor() call on PMD table
+allocation and pgtable_pmd_page_dtor() on freeing.
+
+Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and
+pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing
+paths: i.e X86_PAE preallocate few PMDs on pgd_alloc().
+
+With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK.
+
+NOTE: pgtable_pte_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must
+be handled properly.
+
+page->ptl
+=========
+
+page->ptl is used to access split page table lock, where 'page' is struct
+page of page containing the table. It shares storage with page->private
+(and few other fields in union).
+
+To avoid increasing size of struct page and have best performance, we use a
+trick:
+
+ - if spinlock_t fits into long, we use page->ptr as spinlock, so we
+   can avoid indirect access and save a cache line.
+ - if size of spinlock_t is bigger then size of long, we use page->ptl as
+   pointer to spinlock_t and allocate it dynamically. This allows to use
+   split lock with enabled DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC, but costs
+   one more cache line for indirect access;
+
+The spinlock_t allocated in pgtable_pte_page_ctor() for PTE table and in
+pgtable_pmd_page_ctor() for PMD table.
+
+Please, never access page->ptl directly -- use appropriate helper.
diff --git a/Documentation/mm/swap.rst b/Documentation/mm/swap.rst
new file mode 100644
index 000000000000..78819bd4d745
--- /dev/null
+++ b/Documentation/mm/swap.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====
+Swap
+====
diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst
new file mode 100644
index 000000000000..216db1d67d04
--- /dev/null
+++ b/Documentation/mm/transhuge.rst
@@ -0,0 +1,187 @@
+.. _transhuge:
+
+============================
+Transparent Hugepage Support
+============================
+
+This document describes design principles for Transparent Hugepage (THP)
+support and its interaction with other parts of the memory management
+system.
+
+Design principles
+=================
+
+- "graceful fallback": mm components which don't have transparent hugepage
+  knowledge fall back to breaking huge pmd mapping into table of ptes and,
+  if necessary, split a transparent hugepage. Therefore these components
+  can continue working on the regular pages or regular pte mappings.
+
+- if a hugepage allocation fails because of memory fragmentation,
+  regular pages should be gracefully allocated instead and mixed in
+  the same vma without any failure or significant delay and without
+  userland noticing
+
+- if some task quits and more hugepages become available (either
+  immediately in the buddy or through the VM), guest physical memory
+  backed by regular pages should be relocated on hugepages
+  automatically (with khugepaged)
+
+- it doesn't require memory reservation and in turn it uses hugepages
+  whenever possible (the only possible reservation here is kernelcore=
+  to avoid unmovable pages to fragment all the memory but such a tweak
+  is not specific to transparent hugepage support and it's a generic
+  feature that applies to all dynamic high order allocations in the
+  kernel)
+
+get_user_pages and follow_page
+==============================
+
+get_user_pages and follow_page if run on a hugepage, will return the
+head or tail pages as usual (exactly as they would do on
+hugetlbfs). Most GUP users will only care about the actual physical
+address of the page and its temporary pinning to release after the I/O
+is complete, so they won't ever notice the fact the page is huge. But
+if any driver is going to mangle over the page structure of the tail
+page (like for checking page->mapping or other bits that are relevant
+for the head page and not the tail page), it should be updated to jump
+to check head page instead. Taking a reference on any head/tail page would
+prevent the page from being split by anyone.
+
+.. note::
+   these aren't new constraints to the GUP API, and they match the
+   same constraints that apply to hugetlbfs too, so any driver capable
+   of handling GUP on hugetlbfs will also work fine on transparent
+   hugepage backed mappings.
+
+Graceful fallback
+=================
+
+Code walking pagetables but unaware about huge pmds can simply call
+split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by
+pmd_offset. It's trivial to make the code transparent hugepage aware
+by just grepping for "pmd_offset" and adding split_huge_pmd where
+missing after pmd_offset returns the pmd. Thanks to the graceful
+fallback design, with a one liner change, you can avoid to write
+hundreds if not thousands of lines of complex code to make your code
+hugepage aware.
+
+If you're not walking pagetables but you run into a physical hugepage
+that you can't handle natively in your code, you can split it by
+calling split_huge_page(page). This is what the Linux VM does before
+it tries to swapout the hugepage for example. split_huge_page() can fail
+if the page is pinned and you must handle this correctly.
+
+Example to make mremap.c transparent hugepage aware with a one liner
+change::
+
+	diff --git a/mm/mremap.c b/mm/mremap.c
+	--- a/mm/mremap.c
+	+++ b/mm/mremap.c
+	@@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru
+			return NULL;
+
+		pmd = pmd_offset(pud, addr);
+	+	split_huge_pmd(vma, pmd, addr);
+		if (pmd_none_or_clear_bad(pmd))
+			return NULL;
+
+Locking in hugepage aware code
+==============================
+
+We want as much code as possible hugepage aware, as calling
+split_huge_page() or split_huge_pmd() has a cost.
+
+To make pagetable walks huge pmd aware, all you need to do is to call
+pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
+mmap_lock in read (or write) mode to be sure a huge pmd cannot be
+created from under you by khugepaged (khugepaged collapse_huge_page
+takes the mmap_lock in write mode in addition to the anon_vma lock). If
+pmd_trans_huge returns false, you just fallback in the old code
+paths. If instead pmd_trans_huge returns true, you have to take the
+page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the
+page table lock will prevent the huge pmd being converted into a
+regular pmd from under you (split_huge_pmd can run in parallel to the
+pagetable walk). If the second pmd_trans_huge returns false, you
+should just drop the page table lock and fallback to the old code as
+before. Otherwise, you can proceed to process the huge pmd and the
+hugepage natively. Once finished, you can drop the page table lock.
+
+Refcounts and transparent huge pages
+====================================
+
+Refcounting on THP is mostly consistent with refcounting on other compound
+pages:
+
+  - get_page()/put_page() and GUP operate on head page's ->_refcount.
+
+  - ->_refcount in tail pages is always zero: get_page_unless_zero() never
+    succeeds on tail pages.
+
+  - map/unmap of the pages with PTE entry increment/decrement ->_mapcount
+    on relevant sub-page of the compound page.
+
+  - map/unmap of the whole compound page is accounted for in compound_mapcount
+    (stored in first tail page). For file huge pages, we also increment
+    ->_mapcount of all sub-pages in order to have race-free detection of
+    last unmap of subpages.
+
+PageDoubleMap() indicates that the page is *possibly* mapped with PTEs.
+
+For anonymous pages, PageDoubleMap() also indicates ->_mapcount in all
+subpages is offset up by one. This additional reference is required to
+get race-free detection of unmap of subpages when we have them mapped with
+both PMDs and PTEs.
+
+This optimization is required to lower the overhead of per-subpage mapcount
+tracking. The alternative is to alter ->_mapcount in all subpages on each
+map/unmap of the whole compound page.
+
+For anonymous pages, we set PG_double_map when a PMD of the page is split
+for the first time, but still have a PMD mapping. The additional references
+go away with the last compound_mapcount.
+
+File pages get PG_double_map set on the first map of the page with PTE and
+goes away when the page gets evicted from the page cache.
+
+split_huge_page internally has to distribute the refcounts in the head
+page to the tail pages before clearing all PG_head/tail bits from the page
+structures. It can be done easily for refcounts taken by page table
+entries, but we don't have enough information on how to distribute any
+additional pins (i.e. from get_user_pages). split_huge_page() fails any
+requests to split pinned huge pages: it expects page count to be equal to
+the sum of mapcount of all sub-pages plus one (split_huge_page caller must
+have a reference to the head page).
+
+split_huge_page uses migration entries to stabilize page->_refcount and
+page->_mapcount of anonymous pages. File pages just get unmapped.
+
+We are safe against physical memory scanners too: the only legitimate way
+a scanner can get a reference to a page is get_page_unless_zero().
+
+All tail pages have zero ->_refcount until atomic_add(). This prevents the
+scanner from getting a reference to the tail page up to that point. After the
+atomic_add() we don't care about the ->_refcount value. We already know how
+many references should be uncharged from the head page.
+
+For head page get_page_unless_zero() will succeed and we don't mind. It's
+clear where references should go after split: it will stay on the head page.
+
+Note that split_huge_pmd() doesn't have any limitations on refcounting:
+pmd can be split at any point and never fails.
+
+Partial unmap and deferred_split_huge_page()
+============================================
+
+Unmapping part of THP (with munmap() or other way) is not going to free
+memory immediately. Instead, we detect that a subpage of THP is not in use
+in page_remove_rmap() and queue the THP for splitting if memory pressure
+comes. Splitting will free up unused subpages.
+
+Splitting the page right away is not an option due to locking context in
+the place where we can detect partial unmap. It also might be
+counterproductive since in many cases partial unmap happens during exit(2) if
+a THP crosses a VMA boundary.
+
+The function deferred_split_huge_page() is used to queue a page for splitting.
+The splitting itself will happen when we get memory pressure via shrinker
+interface.
diff --git a/Documentation/mm/unevictable-lru.rst b/Documentation/mm/unevictable-lru.rst
new file mode 100644
index 000000000000..b280367d6a44
--- /dev/null
+++ b/Documentation/mm/unevictable-lru.rst
@@ -0,0 +1,554 @@
+.. _unevictable_lru:
+
+==============================
+Unevictable LRU Infrastructure
+==============================
+
+.. contents:: :local:
+
+
+Introduction
+============
+
+This document describes the Linux memory manager's "Unevictable LRU"
+infrastructure and the use of this to manage several types of "unevictable"
+pages.
+
+The document attempts to provide the overall rationale behind this mechanism
+and the rationale for some of the design decisions that drove the
+implementation.  The latter design rationale is discussed in the context of an
+implementation description.  Admittedly, one can obtain the implementation
+details - the "what does it do?" - by reading the code.  One hopes that the
+descriptions below add value by provide the answer to "why does it do that?".
+
+
+
+The Unevictable LRU
+===================
+
+The Unevictable LRU facility adds an additional LRU list to track unevictable
+pages and to hide these pages from vmscan.  This mechanism is based on a patch
+by Larry Woodman of Red Hat to address several scalability problems with page
+reclaim in Linux.  The problems have been observed at customer sites on large
+memory x86_64 systems.
+
+To illustrate this with an example, a non-NUMA x86_64 platform with 128GB of
+main memory will have over 32 million 4k pages in a single node.  When a large
+fraction of these pages are not evictable for any reason [see below], vmscan
+will spend a lot of time scanning the LRU lists looking for the small fraction
+of pages that are evictable.  This can result in a situation where all CPUs are
+spending 100% of their time in vmscan for hours or days on end, with the system
+completely unresponsive.
+
+The unevictable list addresses the following classes of unevictable pages:
+
+ * Those owned by ramfs.
+
+ * Those mapped into SHM_LOCK'd shared memory regions.
+
+ * Those mapped into VM_LOCKED [mlock()ed] VMAs.
+
+The infrastructure may also be able to handle other conditions that make pages
+unevictable, either by definition or by circumstance, in the future.
+
+
+The Unevictable LRU Page List
+-----------------------------
+
+The Unevictable LRU page list is a lie.  It was never an LRU-ordered list, but a
+companion to the LRU-ordered anonymous and file, active and inactive page lists;
+and now it is not even a page list.  But following familiar convention, here in
+this document and in the source, we often imagine it as a fifth LRU page list.
+
+The Unevictable LRU infrastructure consists of an additional, per-node, LRU list
+called the "unevictable" list and an associated page flag, PG_unevictable, to
+indicate that the page is being managed on the unevictable list.
+
+The PG_unevictable flag is analogous to, and mutually exclusive with, the
+PG_active flag in that it indicates on which LRU list a page resides when
+PG_lru is set.
+
+The Unevictable LRU infrastructure maintains unevictable pages as if they were
+on an additional LRU list for a few reasons:
+
+ (1) We get to "treat unevictable pages just like we treat other pages in the
+     system - which means we get to use the same code to manipulate them, the
+     same code to isolate them (for migrate, etc.), the same code to keep track
+     of the statistics, etc..." [Rik van Riel]
+
+ (2) We want to be able to migrate unevictable pages between nodes for memory
+     defragmentation, workload management and memory hotplug.  The Linux kernel
+     can only migrate pages that it can successfully isolate from the LRU
+     lists (or "Movable" pages: outside of consideration here).  If we were to
+     maintain pages elsewhere than on an LRU-like list, where they can be
+     detected by isolate_lru_page(), we would prevent their migration.
+
+The unevictable list does not differentiate between file-backed and anonymous,
+swap-backed pages.  This differentiation is only important while the pages are,
+in fact, evictable.
+
+The unevictable list benefits from the "arrayification" of the per-node LRU
+lists and statistics originally proposed and posted by Christoph Lameter.
+
+
+Memory Control Group Interaction
+--------------------------------
+
+The unevictable LRU facility interacts with the memory control group [aka
+memory controller; see Documentation/admin-guide/cgroup-v1/memory.rst] by
+extending the lru_list enum.
+
+The memory controller data structure automatically gets a per-node unevictable
+list as a result of the "arrayification" of the per-node LRU lists (one per
+lru_list enum element).  The memory controller tracks the movement of pages to
+and from the unevictable list.
+
+When a memory control group comes under memory pressure, the controller will
+not attempt to reclaim pages on the unevictable list.  This has a couple of
+effects:
+
+ (1) Because the pages are "hidden" from reclaim on the unevictable list, the
+     reclaim process can be more efficient, dealing only with pages that have a
+     chance of being reclaimed.
+
+ (2) On the other hand, if too many of the pages charged to the control group
+     are unevictable, the evictable portion of the working set of the tasks in
+     the control group may not fit into the available memory.  This can cause
+     the control group to thrash or to OOM-kill tasks.
+
+
+.. _mark_addr_space_unevict:
+
+Marking Address Spaces Unevictable
+----------------------------------
+
+For facilities such as ramfs none of the pages attached to the address space
+may be evicted.  To prevent eviction of any such pages, the AS_UNEVICTABLE
+address space flag is provided, and this can be manipulated by a filesystem
+using a number of wrapper functions:
+
+ * ``void mapping_set_unevictable(struct address_space *mapping);``
+
+	Mark the address space as being completely unevictable.
+
+ * ``void mapping_clear_unevictable(struct address_space *mapping);``
+
+	Mark the address space as being evictable.
+
+ * ``int mapping_unevictable(struct address_space *mapping);``
+
+	Query the address space, and return true if it is completely
+	unevictable.
+
+These are currently used in three places in the kernel:
+
+ (1) By ramfs to mark the address spaces of its inodes when they are created,
+     and this mark remains for the life of the inode.
+
+ (2) By SYSV SHM to mark SHM_LOCK'd address spaces until SHM_UNLOCK is called.
+     Note that SHM_LOCK is not required to page in the locked pages if they're
+     swapped out; the application must touch the pages manually if it wants to
+     ensure they're in memory.
+
+ (3) By the i915 driver to mark pinned address space until it's unpinned. The
+     amount of unevictable memory marked by i915 driver is roughly the bounded
+     object size in debugfs/dri/0/i915_gem_objects.
+
+
+Detecting Unevictable Pages
+---------------------------
+
+The function page_evictable() in mm/internal.h determines whether a page is
+evictable or not using the query function outlined above [see section
+:ref:`Marking address spaces unevictable <mark_addr_space_unevict>`]
+to check the AS_UNEVICTABLE flag.
+
+For address spaces that are so marked after being populated (as SHM regions
+might be), the lock action (e.g. SHM_LOCK) can be lazy, and need not populate
+the page tables for the region as does, for example, mlock(), nor need it make
+any special effort to push any pages in the SHM_LOCK'd area to the unevictable
+list.  Instead, vmscan will do this if and when it encounters the pages during
+a reclamation scan.
+
+On an unlock action (such as SHM_UNLOCK), the unlocker (e.g. shmctl()) must scan
+the pages in the region and "rescue" them from the unevictable list if no other
+condition is keeping them unevictable.  If an unevictable region is destroyed,
+the pages are also "rescued" from the unevictable list in the process of
+freeing them.
+
+page_evictable() also checks for mlocked pages by testing an additional page
+flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a page is
+faulted into a VM_LOCKED VMA, or found in a VMA being VM_LOCKED.
+
+
+Vmscan's Handling of Unevictable Pages
+--------------------------------------
+
+If unevictable pages are culled in the fault path, or moved to the unevictable
+list at mlock() or mmap() time, vmscan will not encounter the pages until they
+have become evictable again (via munlock() for example) and have been "rescued"
+from the unevictable list.  However, there may be situations where we decide,
+for the sake of expediency, to leave an unevictable page on one of the regular
+active/inactive LRU lists for vmscan to deal with.  vmscan checks for such
+pages in all of the shrink_{active|inactive|page}_list() functions and will
+"cull" such pages that it encounters: that is, it diverts those pages to the
+unevictable list for the memory cgroup and node being scanned.
+
+There may be situations where a page is mapped into a VM_LOCKED VMA, but the
+page is not marked as PG_mlocked.  Such pages will make it all the way to
+shrink_active_list() or shrink_page_list() where they will be detected when
+vmscan walks the reverse map in page_referenced() or try_to_unmap().  The page
+is culled to the unevictable list when it is released by the shrinker.
+
+To "cull" an unevictable page, vmscan simply puts the page back on the LRU list
+using putback_lru_page() - the inverse operation to isolate_lru_page() - after
+dropping the page lock.  Because the condition which makes the page unevictable
+may change once the page is unlocked, __pagevec_lru_add_fn() will recheck the
+unevictable state of a page before placing it on the unevictable list.
+
+
+MLOCKED Pages
+=============
+
+The unevictable page list is also useful for mlock(), in addition to ramfs and
+SYSV SHM.  Note that mlock() is only available in CONFIG_MMU=y situations; in
+NOMMU situations, all mappings are effectively mlocked.
+
+
+History
+-------
+
+The "Unevictable mlocked Pages" infrastructure is based on work originally
+posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU".
+Nick posted his patch as an alternative to a patch posted by Christoph Lameter
+to achieve the same objective: hiding mlocked pages from vmscan.
+
+In Nick's patch, he used one of the struct page LRU list link fields as a count
+of VM_LOCKED VMAs that map the page (Rik van Riel had the same idea three years
+earlier).  But this use of the link field for a count prevented the management
+of the pages on an LRU list, and thus mlocked pages were not migratable as
+isolate_lru_page() could not detect them, and the LRU list link field was not
+available to the migration subsystem.
+
+Nick resolved this by putting mlocked pages back on the LRU list before
+attempting to isolate them, thus abandoning the count of VM_LOCKED VMAs.  When
+Nick's patch was integrated with the Unevictable LRU work, the count was
+replaced by walking the reverse map when munlocking, to determine whether any
+other VM_LOCKED VMAs still mapped the page.
+
+However, walking the reverse map for each page when munlocking was ugly and
+inefficient, and could lead to catastrophic contention on a file's rmap lock,
+when many processes which had it mlocked were trying to exit.  In 5.18, the
+idea of keeping mlock_count in Unevictable LRU list link field was revived and
+put to work, without preventing the migration of mlocked pages.  This is why
+the "Unevictable LRU list" cannot be a linked list of pages now; but there was
+no use for that linked list anyway - though its size is maintained for meminfo.
+
+
+Basic Management
+----------------
+
+mlocked pages - pages mapped into a VM_LOCKED VMA - are a class of unevictable
+pages.  When such a page has been "noticed" by the memory management subsystem,
+the page is marked with the PG_mlocked flag.  This can be manipulated using the
+PageMlocked() functions.
+
+A PG_mlocked page will be placed on the unevictable list when it is added to
+the LRU.  Such pages can be "noticed" by memory management in several places:
+
+ (1) in the mlock()/mlock2()/mlockall() system call handlers;
+
+ (2) in the mmap() system call handler when mmapping a region with the
+     MAP_LOCKED flag;
+
+ (3) mmapping a region in a task that has called mlockall() with the MCL_FUTURE
+     flag;
+
+ (4) in the fault path and when a VM_LOCKED stack segment is expanded; or
+
+ (5) as mentioned above, in vmscan:shrink_page_list() when attempting to
+     reclaim a page in a VM_LOCKED VMA by page_referenced() or try_to_unmap().
+
+mlocked pages become unlocked and rescued from the unevictable list when:
+
+ (1) mapped in a range unlocked via the munlock()/munlockall() system calls;
+
+ (2) munmap()'d out of the last VM_LOCKED VMA that maps the page, including
+     unmapping at task exit;
+
+ (3) when the page is truncated from the last VM_LOCKED VMA of an mmapped file;
+     or
+
+ (4) before a page is COW'd in a VM_LOCKED VMA.
+
+
+mlock()/mlock2()/mlockall() System Call Handling
+------------------------------------------------
+
+mlock(), mlock2() and mlockall() system call handlers proceed to mlock_fixup()
+for each VMA in the range specified by the call.  In the case of mlockall(),
+this is the entire active address space of the task.  Note that mlock_fixup()
+is used for both mlocking and munlocking a range of memory.  A call to mlock()
+an already VM_LOCKED VMA, or to munlock() a VMA that is not VM_LOCKED, is
+treated as a no-op and mlock_fixup() simply returns.
+
+If the VMA passes some filtering as described in "Filtering Special VMAs"
+below, mlock_fixup() will attempt to merge the VMA with its neighbors or split
+off a subset of the VMA if the range does not cover the entire VMA.  Any pages
+already present in the VMA are then marked as mlocked by mlock_page() via
+mlock_pte_range() via walk_page_range() via mlock_vma_pages_range().
+
+Before returning from the system call, do_mlock() or mlockall() will call
+__mm_populate() to fault in the remaining pages via get_user_pages() and to
+mark those pages as mlocked as they are faulted.
+
+Note that the VMA being mlocked might be mapped with PROT_NONE.  In this case,
+get_user_pages() will be unable to fault in the pages.  That's okay.  If pages
+do end up getting faulted into this VM_LOCKED VMA, they will be handled in the
+fault path - which is also how mlock2()'s MLOCK_ONFAULT areas are handled.
+
+For each PTE (or PMD) being faulted into a VMA, the page add rmap function
+calls mlock_vma_page(), which calls mlock_page() when the VMA is VM_LOCKED
+(unless it is a PTE mapping of a part of a transparent huge page).  Or when
+it is a newly allocated anonymous page, lru_cache_add_inactive_or_unevictable()
+calls mlock_new_page() instead: similar to mlock_page(), but can make better
+judgments, since this page is held exclusively and known not to be on LRU yet.
+
+mlock_page() sets PageMlocked immediately, then places the page on the CPU's
+mlock pagevec, to batch up the rest of the work to be done under lru_lock by
+__mlock_page().  __mlock_page() sets PageUnevictable, initializes mlock_count
+and moves the page to unevictable state ("the unevictable LRU", but with
+mlock_count in place of LRU threading).  Or if the page was already PageLRU
+and PageUnevictable and PageMlocked, it simply increments the mlock_count.
+
+But in practice that may not work ideally: the page may not yet be on an LRU, or
+it may have been temporarily isolated from LRU.  In such cases the mlock_count
+field cannot be touched, but will be set to 0 later when __pagevec_lru_add_fn()
+returns the page to "LRU".  Races prohibit mlock_count from being set to 1 then:
+rather than risk stranding a page indefinitely as unevictable, always err with
+mlock_count on the low side, so that when munlocked the page will be rescued to
+an evictable LRU, then perhaps be mlocked again later if vmscan finds it in a
+VM_LOCKED VMA.
+
+
+Filtering Special VMAs
+----------------------
+
+mlock_fixup() filters several classes of "special" VMAs:
+
+1) VMAs with VM_IO or VM_PFNMAP set are skipped entirely.  The pages behind
+   these mappings are inherently pinned, so we don't need to mark them as
+   mlocked.  In any case, most of the pages have no struct page in which to so
+   mark the page.  Because of this, get_user_pages() will fail for these VMAs,
+   so there is no sense in attempting to visit them.
+
+2) VMAs mapping hugetlbfs page are already effectively pinned into memory.  We
+   neither need nor want to mlock() these pages.  But __mm_populate() includes
+   hugetlbfs ranges, allocating the huge pages and populating the PTEs.
+
+3) VMAs with VM_DONTEXPAND are generally userspace mappings of kernel pages,
+   such as the VDSO page, relay channel pages, etc.  These pages are inherently
+   unevictable and are not managed on the LRU lists.  __mm_populate() includes
+   these ranges, populating the PTEs if not already populated.
+
+4) VMAs with VM_MIXEDMAP set are not marked VM_LOCKED, but __mm_populate()
+   includes these ranges, populating the PTEs if not already populated.
+
+Note that for all of these special VMAs, mlock_fixup() does not set the
+VM_LOCKED flag.  Therefore, we won't have to deal with them later during
+munlock(), munmap() or task exit.  Neither does mlock_fixup() account these
+VMAs against the task's "locked_vm".
+
+
+munlock()/munlockall() System Call Handling
+-------------------------------------------
+
+The munlock() and munlockall() system calls are handled by the same
+mlock_fixup() function as mlock(), mlock2() and mlockall() system calls are.
+If called to munlock an already munlocked VMA, mlock_fixup() simply returns.
+Because of the VMA filtering discussed above, VM_LOCKED will not be set in
+any "special" VMAs.  So, those VMAs will be ignored for munlock.
+
+If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the
+specified range.  All pages in the VMA are then munlocked by munlock_page() via
+mlock_pte_range() via walk_page_range() via mlock_vma_pages_range() - the same
+function used when mlocking a VMA range, with new flags for the VMA indicating
+that it is munlock() being performed.
+
+munlock_page() uses the mlock pagevec to batch up work to be done under
+lru_lock by  __munlock_page().  __munlock_page() decrements the page's
+mlock_count, and when that reaches 0 it clears PageMlocked and clears
+PageUnevictable, moving the page from unevictable state to inactive LRU.
+
+But in practice that may not work ideally: the page may not yet have reached
+"the unevictable LRU", or it may have been temporarily isolated from it.  In
+those cases its mlock_count field is unusable and must be assumed to be 0: so
+that the page will be rescued to an evictable LRU, then perhaps be mlocked
+again later if vmscan finds it in a VM_LOCKED VMA.
+
+
+Migrating MLOCKED Pages
+-----------------------
+
+A page that is being migrated has been isolated from the LRU lists and is held
+locked across unmapping of the page, updating the page's address space entry
+and copying the contents and state, until the page table entry has been
+replaced with an entry that refers to the new page.  Linux supports migration
+of mlocked pages and other unevictable pages.  PG_mlocked is cleared from the
+the old page when it is unmapped from the last VM_LOCKED VMA, and set when the
+new page is mapped in place of migration entry in a VM_LOCKED VMA.  If the page
+was unevictable because mlocked, PG_unevictable follows PG_mlocked; but if the
+page was unevictable for other reasons, PG_unevictable is copied explicitly.
+
+Note that page migration can race with mlocking or munlocking of the same page.
+There is mostly no problem since page migration requires unmapping all PTEs of
+the old page (including munlock where VM_LOCKED), then mapping in the new page
+(including mlock where VM_LOCKED).  The page table locks provide sufficient
+synchronization.
+
+However, since mlock_vma_pages_range() starts by setting VM_LOCKED on a VMA,
+before mlocking any pages already present, if one of those pages were migrated
+before mlock_pte_range() reached it, it would get counted twice in mlock_count.
+To prevent that, mlock_vma_pages_range() temporarily marks the VMA as VM_IO,
+so that mlock_vma_page() will skip it.
+
+To complete page migration, we place the old and new pages back onto the LRU
+afterwards.  The "unneeded" page - old page on success, new page on failure -
+is freed when the reference count held by the migration process is released.
+
+
+Compacting MLOCKED Pages
+------------------------
+
+The memory map can be scanned for compactable regions and the default behavior
+is to let unevictable pages be moved.  /proc/sys/vm/compact_unevictable_allowed
+controls this behavior (see Documentation/admin-guide/sysctl/vm.rst).  The work
+of compaction is mostly handled by the page migration code and the same work
+flow as described in Migrating MLOCKED Pages will apply.
+
+
+MLOCKING Transparent Huge Pages
+-------------------------------
+
+A transparent huge page is represented by a single entry on an LRU list.
+Therefore, we can only make unevictable an entire compound page, not
+individual subpages.
+
+If a user tries to mlock() part of a huge page, and no user mlock()s the
+whole of the huge page, we want the rest of the page to be reclaimable.
+
+We cannot just split the page on partial mlock() as split_huge_page() can
+fail and a new intermittent failure mode for the syscall is undesirable.
+
+We handle this by keeping PTE-mlocked huge pages on evictable LRU lists:
+the PMD on the border of a VM_LOCKED VMA will be split into a PTE table.
+
+This way the huge page is accessible for vmscan.  Under memory pressure the
+page will be split, subpages which belong to VM_LOCKED VMAs will be moved
+to the unevictable LRU and the rest can be reclaimed.
+
+/proc/meminfo's Unevictable and Mlocked amounts do not include those parts
+of a transparent huge page which are mapped only by PTEs in VM_LOCKED VMAs.
+
+
+mmap(MAP_LOCKED) System Call Handling
+-------------------------------------
+
+In addition to the mlock(), mlock2() and mlockall() system calls, an application
+can request that a region of memory be mlocked by supplying the MAP_LOCKED flag
+to the mmap() call.  There is one important and subtle difference here, though.
+mmap() + mlock() will fail if the range cannot be faulted in (e.g. because
+mm_populate fails) and returns with ENOMEM while mmap(MAP_LOCKED) will not fail.
+The mmaped area will still have properties of the locked area - pages will not
+get swapped out - but major page faults to fault memory in might still happen.
+
+Furthermore, any mmap() call or brk() call that expands the heap by a task
+that has previously called mlockall() with the MCL_FUTURE flag will result
+in the newly mapped memory being mlocked.  Before the unevictable/mlock
+changes, the kernel simply called make_pages_present() to allocate pages
+and populate the page table.
+
+To mlock a range of memory under the unevictable/mlock infrastructure,
+the mmap() handler and task address space expansion functions call
+populate_vma_page_range() specifying the vma and the address range to mlock.
+
+
+munmap()/exit()/exec() System Call Handling
+-------------------------------------------
+
+When unmapping an mlocked region of memory, whether by an explicit call to
+munmap() or via an internal unmap from exit() or exec() processing, we must
+munlock the pages if we're removing the last VM_LOCKED VMA that maps the pages.
+Before the unevictable/mlock changes, mlocking did not mark the pages in any
+way, so unmapping them required no processing.
+
+For each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls
+munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED
+(unless it was a PTE mapping of a part of a transparent huge page).
+
+munlock_page() uses the mlock pagevec to batch up work to be done under
+lru_lock by  __munlock_page().  __munlock_page() decrements the page's
+mlock_count, and when that reaches 0 it clears PageMlocked and clears
+PageUnevictable, moving the page from unevictable state to inactive LRU.
+
+But in practice that may not work ideally: the page may not yet have reached
+"the unevictable LRU", or it may have been temporarily isolated from it.  In
+those cases its mlock_count field is unusable and must be assumed to be 0: so
+that the page will be rescued to an evictable LRU, then perhaps be mlocked
+again later if vmscan finds it in a VM_LOCKED VMA.
+
+
+Truncating MLOCKED Pages
+------------------------
+
+File truncation or hole punching forcibly unmaps the deleted pages from
+userspace; truncation even unmaps and deletes any private anonymous pages
+which had been Copied-On-Write from the file pages now being truncated.
+
+Mlocked pages can be munlocked and deleted in this way: like with munmap(),
+for each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls
+munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED
+(unless it was a PTE mapping of a part of a transparent huge page).
+
+However, if there is a racing munlock(), since mlock_vma_pages_range() starts
+munlocking by clearing VM_LOCKED from a VMA, before munlocking all the pages
+present, if one of those pages were unmapped by truncation or hole punch before
+mlock_pte_range() reached it, it would not be recognized as mlocked by this VMA,
+and would not be counted out of mlock_count.  In this rare case, a page may
+still appear as PageMlocked after it has been fully unmapped: and it is left to
+release_pages() (or __page_cache_release()) to clear it and update statistics
+before freeing (this event is counted in /proc/vmstat unevictable_pgs_cleared,
+which is usually 0).
+
+
+Page Reclaim in shrink_*_list()
+-------------------------------
+
+vmscan's shrink_active_list() culls any obviously unevictable pages -
+i.e. !page_evictable(page) pages - diverting those to the unevictable list.
+However, shrink_active_list() only sees unevictable pages that made it onto the
+active/inactive LRU lists.  Note that these pages do not have PageUnevictable
+set - otherwise they would be on the unevictable list and shrink_active_list()
+would never see them.
+
+Some examples of these unevictable pages on the LRU lists are:
+
+ (1) ramfs pages that have been placed on the LRU lists when first allocated.
+
+ (2) SHM_LOCK'd shared memory pages.  shmctl(SHM_LOCK) does not attempt to
+     allocate or fault in the pages in the shared memory region.  This happens
+     when an application accesses the page the first time after SHM_LOCK'ing
+     the segment.
+
+ (3) pages still mapped into VM_LOCKED VMAs, which should be marked mlocked,
+     but events left mlock_count too low, so they were munlocked too early.
+
+vmscan's shrink_inactive_list() and shrink_page_list() also divert obviously
+unevictable pages found on the inactive lists to the appropriate memory cgroup
+and node unevictable list.
+
+rmap's page_referenced_one(), called via vmscan's shrink_active_list() or
+shrink_page_list(), and rmap's try_to_unmap_one() called via shrink_page_list(),
+check for (3) pages still mapped into VM_LOCKED VMAs, and call mlock_vma_page()
+to correct them.  Such pages are culled to the unevictable list when released
+by the shrinker.
diff --git a/Documentation/mm/vmalloc.rst b/Documentation/mm/vmalloc.rst
new file mode 100644
index 000000000000..363fe20d6b9f
--- /dev/null
+++ b/Documentation/mm/vmalloc.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======================================
+Virtually Contiguous Memory Allocation
+======================================
diff --git a/Documentation/mm/vmalloced-kernel-stacks.rst b/Documentation/mm/vmalloced-kernel-stacks.rst
new file mode 100644
index 000000000000..fc8c67833af6
--- /dev/null
+++ b/Documentation/mm/vmalloced-kernel-stacks.rst
@@ -0,0 +1,153 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================================
+Virtually Mapped Kernel Stack Support
+=====================================
+
+:Author: Shuah Khan <skhan@linuxfoundation.org>
+
+.. contents:: :local:
+
+Overview
+--------
+
+This is a compilation of information from the code and original patch
+series that introduced the `Virtually Mapped Kernel Stacks feature
+<https://lwn.net/Articles/694348/>`
+
+Introduction
+------------
+
+Kernel stack overflows are often hard to debug and make the kernel
+susceptible to exploits. Problems could show up at a later time making
+it difficult to isolate and root-cause.
+
+Virtually-mapped kernel stacks with guard pages causes kernel stack
+overflows to be caught immediately rather than causing difficult to
+diagnose corruptions.
+
+HAVE_ARCH_VMAP_STACK and VMAP_STACK configuration options enable
+support for virtually mapped stacks with guard pages. This feature
+causes reliable faults when the stack overflows. The usability of
+the stack trace after overflow and response to the overflow itself
+is architecture dependent.
+
+.. note::
+        As of this writing, arm64, powerpc, riscv, s390, um, and x86 have
+        support for VMAP_STACK.
+
+HAVE_ARCH_VMAP_STACK
+--------------------
+
+Architectures that can support Virtually Mapped Kernel Stacks should
+enable this bool configuration option. The requirements are:
+
+- vmalloc space must be large enough to hold many kernel stacks. This
+  may rule out many 32-bit architectures.
+- Stacks in vmalloc space need to work reliably.  For example, if
+  vmap page tables are created on demand, either this mechanism
+  needs to work while the stack points to a virtual address with
+  unpopulated page tables or arch code (switch_to() and switch_mm(),
+  most likely) needs to ensure that the stack's page table entries
+  are populated before running on a possibly unpopulated stack.
+- If the stack overflows into a guard page, something reasonable
+  should happen. The definition of "reasonable" is flexible, but
+  instantly rebooting without logging anything would be unfriendly.
+
+VMAP_STACK
+----------
+
+VMAP_STACK bool configuration option when enabled allocates virtually
+mapped task stacks. This option depends on HAVE_ARCH_VMAP_STACK.
+
+- Enable this if you want the use virtually-mapped kernel stacks
+  with guard pages. This causes kernel stack overflows to be caught
+  immediately rather than causing difficult-to-diagnose corruption.
+
+.. note::
+
+        Using this feature with KASAN requires architecture support
+        for backing virtual mappings with real shadow memory, and
+        KASAN_VMALLOC must be enabled.
+
+.. note::
+
+        VMAP_STACK is enabled, it is not possible to run DMA on stack
+        allocated data.
+
+Kernel configuration options and dependencies keep changing. Refer to
+the latest code base:
+
+`Kconfig <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/Kconfig>`
+
+Allocation
+-----------
+
+When a new kernel thread is created, thread stack is allocated from
+virtually contiguous memory pages from the page level allocator. These
+pages are mapped into contiguous kernel virtual space with PAGE_KERNEL
+protections.
+
+alloc_thread_stack_node() calls __vmalloc_node_range() to allocate stack
+with PAGE_KERNEL protections.
+
+- Allocated stacks are cached and later reused by new threads, so memcg
+  accounting is performed manually on assigning/releasing stacks to tasks.
+  Hence, __vmalloc_node_range is called without __GFP_ACCOUNT.
+- vm_struct is cached to be able to find when thread free is initiated
+  in interrupt context. free_thread_stack() can be called in interrupt
+  context.
+- On arm64, all VMAP's stacks need to have the same alignment to ensure
+  that VMAP'd stack overflow detection works correctly. Arch specific
+  vmap stack allocator takes care of this detail.
+- This does not address interrupt stacks - according to the original patch
+
+Thread stack allocation is initiated from clone(), fork(), vfork(),
+kernel_thread() via kernel_clone(). Leaving a few hints for searching
+the code base to understand when and how thread stack is allocated.
+
+Bulk of the code is in:
+`kernel/fork.c <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/fork.c>`.
+
+stack_vm_area pointer in task_struct keeps track of the virtually allocated
+stack and a non-null stack_vm_area pointer serves as a indication that the
+virtually mapped kernel stacks are enabled.
+
+::
+
+        struct vm_struct *stack_vm_area;
+
+Stack overflow handling
+-----------------------
+
+Leading and trailing guard pages help detect stack overflows. When stack
+overflows into the guard pages, handlers have to be careful not overflow
+the stack again. When handlers are called, it is likely that very little
+stack space is left.
+
+On x86, this is done by handling the page fault indicating the kernel
+stack overflow on the double-fault stack.
+
+Testing VMAP allocation with guard pages
+----------------------------------------
+
+How do we ensure that VMAP_STACK is actually allocating with a leading
+and trailing guard page? The following lkdtm tests can help detect any
+regressions.
+
+::
+
+        void lkdtm_STACK_GUARD_PAGE_LEADING()
+        void lkdtm_STACK_GUARD_PAGE_TRAILING()
+
+Conclusions
+-----------
+
+- A percpu cache of vmalloced stacks appears to be a bit faster than a
+  high-order stack allocation, at least when the cache hits.
+- THREAD_INFO_IN_TASK gets rid of arch-specific thread_info entirely and
+  simply embed the thread_info (containing only flags) and 'int cpu' into
+  task_struct.
+- The thread stack can be free'ed as soon as the task is dead (without
+  waiting for RCU) and then, if vmapped stacks are in use, cache the
+  entire stack for reuse on the same cpu.
diff --git a/Documentation/mm/vmemmap_dedup.rst b/Documentation/mm/vmemmap_dedup.rst
new file mode 100644
index 000000000000..c9c495f62d12
--- /dev/null
+++ b/Documentation/mm/vmemmap_dedup.rst
@@ -0,0 +1,223 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=========================================
+A vmemmap diet for HugeTLB and Device DAX
+=========================================
+
+HugeTLB
+=======
+
+The struct page structures (page structs) are used to describe a physical
+page frame. By default, there is a one-to-one mapping from a page frame to
+it's corresponding page struct.
+
+HugeTLB pages consist of multiple base page size pages and is supported by many
+architectures. See Documentation/admin-guide/mm/hugetlbpage.rst for more
+details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB are
+currently supported. Since the base page size on x86 is 4KB, a 2MB HugeTLB page
+consists of 512 base pages and a 1GB HugeTLB page consists of 4096 base pages.
+For each base page, there is a corresponding page struct.
+
+Within the HugeTLB subsystem, only the first 4 page structs are used to
+contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides
+this upper limit. The only 'useful' information in the remaining page structs
+is the compound_head field, and this field is the same for all tail pages.
+
+By removing redundant page structs for HugeTLB pages, memory can be returned
+to the buddy allocator for other uses.
+
+Different architectures support different HugeTLB pages. For example, the
+following table is the HugeTLB page size supported by x86 and arm64
+architectures. Because arm64 supports 4k, 16k, and 64k base pages and
+supports contiguous entries, so it supports many kinds of sizes of HugeTLB
+page.
+
++--------------+-----------+-----------------------------------------------+
+| Architecture | Page Size |                HugeTLB Page Size              |
++--------------+-----------+-----------+-----------+-----------+-----------+
+|    x86-64    |    4KB    |    2MB    |    1GB    |           |           |
++--------------+-----------+-----------+-----------+-----------+-----------+
+|              |    4KB    |   64KB    |    2MB    |    32MB   |    1GB    |
+|              +-----------+-----------+-----------+-----------+-----------+
+|    arm64     |   16KB    |    2MB    |   32MB    |     1GB   |           |
+|              +-----------+-----------+-----------+-----------+-----------+
+|              |   64KB    |    2MB    |  512MB    |    16GB   |           |
++--------------+-----------+-----------+-----------+-----------+-----------+
+
+When the system boot up, every HugeTLB page has more than one struct page
+structs which size is (unit: pages)::
+
+   struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
+
+Where HugeTLB_Size is the size of the HugeTLB page. We know that the size
+of the HugeTLB page is always n times PAGE_SIZE. So we can get the following
+relationship::
+
+   HugeTLB_Size = n * PAGE_SIZE
+
+Then::
+
+   struct_size = n * PAGE_SIZE / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
+               = n * sizeof(struct page) / PAGE_SIZE
+
+We can use huge mapping at the pud/pmd level for the HugeTLB page.
+
+For the HugeTLB page of the pmd level mapping, then::
+
+   struct_size = n * sizeof(struct page) / PAGE_SIZE
+               = PAGE_SIZE / sizeof(pte_t) * sizeof(struct page) / PAGE_SIZE
+               = sizeof(struct page) / sizeof(pte_t)
+               = 64 / 8
+               = 8 (pages)
+
+Where n is how many pte entries which one page can contains. So the value of
+n is (PAGE_SIZE / sizeof(pte_t)).
+
+This optimization only supports 64-bit system, so the value of sizeof(pte_t)
+is 8. And this optimization also applicable only when the size of struct page
+is a power of two. In most cases, the size of struct page is 64 bytes (e.g.
+x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the
+size of struct page structs of it is 8 page frames which size depends on the
+size of the base page.
+
+For the HugeTLB page of the pud level mapping, then::
+
+   struct_size = PAGE_SIZE / sizeof(pmd_t) * struct_size(pmd)
+               = PAGE_SIZE / 8 * 8 (pages)
+               = PAGE_SIZE (pages)
+
+Where the struct_size(pmd) is the size of the struct page structs of a
+HugeTLB page of the pmd level mapping.
+
+E.g.: A 2MB HugeTLB page on x86_64 consists in 8 page frames while 1GB
+HugeTLB page consists in 4096.
+
+Next, we take the pmd level mapping of the HugeTLB page as an example to
+show the internal implementation of this optimization. There are 8 pages
+struct page structs associated with a HugeTLB page which is pmd mapped.
+
+Here is how things look before optimization::
+
+    HugeTLB                  struct pages(8 pages)         page frame(8 pages)
+ +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
+ |           |                     |     0     | -------------> |     0     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     1     | -------------> |     1     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     2     | -------------> |     2     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     3     | -------------> |     3     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     4     | -------------> |     4     |
+ |    PMD    |                     +-----------+                +-----------+
+ |   level   |                     |     5     | -------------> |     5     |
+ |  mapping  |                     +-----------+                +-----------+
+ |           |                     |     6     | -------------> |     6     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     7     | -------------> |     7     |
+ |           |                     +-----------+                +-----------+
+ |           |
+ |           |
+ |           |
+ +-----------+
+
+The value of page->compound_head is the same for all tail pages. The first
+page of page structs (page 0) associated with the HugeTLB page contains the 4
+page structs necessary to describe the HugeTLB. The only use of the remaining
+pages of page structs (page 1 to page 7) is to point to page->compound_head.
+Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs
+will be used for each HugeTLB page. This will allow us to free the remaining
+7 pages to the buddy allocator.
+
+Here is how things look after remapping::
+
+    HugeTLB                  struct pages(8 pages)         page frame(8 pages)
+ +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
+ |           |                     |     0     | -------------> |     0     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     1     | ---------------^ ^ ^ ^ ^ ^ ^
+ |           |                     +-----------+                  | | | | | |
+ |           |                     |     2     | -----------------+ | | | | |
+ |           |                     +-----------+                    | | | | |
+ |           |                     |     3     | -------------------+ | | | |
+ |           |                     +-----------+                      | | | |
+ |           |                     |     4     | ---------------------+ | | |
+ |    PMD    |                     +-----------+                        | | |
+ |   level   |                     |     5     | -----------------------+ | |
+ |  mapping  |                     +-----------+                          | |
+ |           |                     |     6     | -------------------------+ |
+ |           |                     +-----------+                            |
+ |           |                     |     7     | ---------------------------+
+ |           |                     +-----------+
+ |           |
+ |           |
+ |           |
+ +-----------+
+
+When a HugeTLB is freed to the buddy system, we should allocate 7 pages for
+vmemmap pages and restore the previous mapping relationship.
+
+For the HugeTLB page of the pud level mapping. It is similar to the former.
+We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages.
+
+Apart from the HugeTLB page of the pmd/pud level mapping, some architectures
+(e.g. aarch64) provides a contiguous bit in the translation table entries
+that hints to the MMU to indicate that it is one of a contiguous set of
+entries that can be cached in a single TLB entry.
+
+The contiguous bit is used to increase the mapping size at the pmd and pte
+(last) level. So this type of HugeTLB page can be optimized only when its
+size of the struct page structs is greater than 1 page.
+
+Notice: The head vmemmap page is not freed to the buddy allocator and all
+tail vmemmap pages are mapped to the head vmemmap page frame. So we can see
+more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page)
+associated with each HugeTLB page. The compound_head() can handle this
+correctly (more details refer to the comment above compound_head()).
+
+Device DAX
+==========
+
+The device-dax interface uses the same tail deduplication technique explained
+in the previous chapter, except when used with the vmemmap in
+the device (altmap).
+
+The following page sizes are supported in DAX: PAGE_SIZE (4K on x86_64),
+PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64).
+
+The differences with HugeTLB are relatively minor.
+
+It only use 3 page structs for storing all information as opposed
+to 4 on HugeTLB pages.
+
+There's no remapping of vmemmap given that device-dax memory is not part of
+System RAM ranges initialized at boot. Thus the tail page deduplication
+happens at a later stage when we populate the sections. HugeTLB reuses the
+the head vmemmap page representing, whereas device-dax reuses the tail
+vmemmap page. This results in only half of the savings compared to HugeTLB.
+
+Deduplicated tail pages are not mapped read-only.
+
+Here's how things look like on device-dax after the sections are populated::
+
+ +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
+ |           |                     |     0     | -------------> |     0     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     1     | -------------> |     1     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
+ |           |                     +-----------+                   | | | | |
+ |           |                     |     3     | ------------------+ | | | |
+ |           |                     +-----------+                     | | | |
+ |           |                     |     4     | --------------------+ | | |
+ |    PMD    |                     +-----------+                       | | |
+ |   level   |                     |     5     | ----------------------+ | |
+ |  mapping  |                     +-----------+                         | |
+ |           |                     |     6     | ------------------------+ |
+ |           |                     +-----------+                           |
+ |           |                     |     7     | --------------------------+
+ |           |                     +-----------+
+ |           |
+ |           |
+ |           |
+ +-----------+
diff --git a/Documentation/mm/z3fold.rst b/Documentation/mm/z3fold.rst
new file mode 100644
index 000000000000..224e3c61d686
--- /dev/null
+++ b/Documentation/mm/z3fold.rst
@@ -0,0 +1,30 @@
+.. _z3fold:
+
+======
+z3fold
+======
+
+z3fold is a special purpose allocator for storing compressed pages.
+It is designed to store up to three compressed pages per physical page.
+It is a zbud derivative which allows for higher compression
+ratio keeping the simplicity and determinism of its predecessor.
+
+The main differences between z3fold and zbud are:
+
+* unlike zbud, z3fold allows for up to PAGE_SIZE allocations
+* z3fold can hold up to 3 compressed pages in its page
+* z3fold doesn't export any API itself and is thus intended to be used
+  via the zpool API.
+
+To keep the determinism and simplicity, z3fold, just like zbud, always
+stores an integral number of compressed pages per page, but it can store
+up to 3 pages unlike zbud which can store at most 2. Therefore the
+compression ratio goes to around 2.7x while zbud's one is around 1.7x.
+
+Unlike zbud (but like zsmalloc for that matter) z3fold_alloc() does not
+return a dereferenceable pointer. Instead, it returns an unsigned long
+handle which encodes actual location of the allocated object.
+
+Keeping effective compression ratio close to zsmalloc's, z3fold doesn't
+depend on MMU enabled and provides more predictable reclaim behavior
+which makes it a better fit for small and response-critical systems.
diff --git a/Documentation/mm/zsmalloc.rst b/Documentation/mm/zsmalloc.rst
new file mode 100644
index 000000000000..6e79893d6132
--- /dev/null
+++ b/Documentation/mm/zsmalloc.rst
@@ -0,0 +1,82 @@
+.. _zsmalloc:
+
+========
+zsmalloc
+========
+
+This allocator is designed for use with zram. Thus, the allocator is
+supposed to work well under low memory conditions. In particular, it
+never attempts higher order page allocation which is very likely to
+fail under memory pressure. On the other hand, if we just use single
+(0-order) pages, it would suffer from very high fragmentation --
+any object of size PAGE_SIZE/2 or larger would occupy an entire page.
+This was one of the major issues with its predecessor (xvmalloc).
+
+To overcome these issues, zsmalloc allocates a bunch of 0-order pages
+and links them together using various 'struct page' fields. These linked
+pages act as a single higher-order page i.e. an object can span 0-order
+page boundaries. The code refers to these linked pages as a single entity
+called zspage.
+
+For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
+since this satisfies the requirements of all its current users (in the
+worst case, page is incompressible and is thus stored "as-is" i.e. in
+uncompressed form). For allocation requests larger than this size, failure
+is returned (see zs_malloc).
+
+Additionally, zs_malloc() does not return a dereferenceable pointer.
+Instead, it returns an opaque handle (unsigned long) which encodes actual
+location of the allocated object. The reason for this indirection is that
+zsmalloc does not keep zspages permanently mapped since that would cause
+issues on 32-bit systems where the VA region for kernel space mappings
+is very small. So, before using the allocating memory, the object has to
+be mapped using zs_map_object() to get a usable pointer and subsequently
+unmapped using zs_unmap_object().
+
+stat
+====
+
+With CONFIG_ZSMALLOC_STAT, we could see zsmalloc internal information via
+``/sys/kernel/debug/zsmalloc/<user name>``. Here is a sample of stat output::
+
+ # cat /sys/kernel/debug/zsmalloc/zram0/classes
+
+ class  size almost_full almost_empty obj_allocated   obj_used pages_used pages_per_zspage
+    ...
+    ...
+     9   176           0            1           186        129          8                4
+    10   192           1            0          2880       2872        135                3
+    11   208           0            1           819        795         42                2
+    12   224           0            1           219        159         12                4
+    ...
+    ...
+
+
+class
+	index
+size
+	object size zspage stores
+almost_empty
+	the number of ZS_ALMOST_EMPTY zspages(see below)
+almost_full
+	the number of ZS_ALMOST_FULL zspages(see below)
+obj_allocated
+	the number of objects allocated
+obj_used
+	the number of objects allocated to the user
+pages_used
+	the number of pages allocated for the class
+pages_per_zspage
+	the number of 0-order pages to make a zspage
+
+We assign a zspage to ZS_ALMOST_EMPTY fullness group when n <= N / f, where
+
+* n = number of allocated objects
+* N = total number of objects zspage can store
+* f = fullness_threshold_frac(ie, 4 at the moment)
+
+Similarly, we assign zspage to:
+
+* ZS_ALMOST_FULL  when n > N / f
+* ZS_EMPTY        when n == 0
+* ZS_FULL         when n == N
diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst
index 0c8276109fc0..30c69e1f44fe 100644
--- a/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst
+++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst
@@ -13,7 +13,7 @@
 监测数据访问
 ============
 
-:doc:`DAMON </vm/damon/index>` 允许轻量级的数据访问监测。使用DAMON，
+:doc:`DAMON </mm/damon/index>` 允许轻量级的数据访问监测。使用DAMON，
 用户可以分析他们系统的内存访问模式，并优化它们。
 
 .. toctree::
diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst
index 1500bdbf338a..c976f3e33ffd 100644
--- a/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst
+++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst
@@ -229,4 +229,4 @@ DAMON_RECLAIM再次什么都不做，这样我们就可以退回到基于LRU列
 
 .. [1] https://research.google/pubs/pub48551/
 .. [2] https://lwn.net/Articles/787611/
-.. [3] https://www.kernel.org/doc/html/latest/vm/free_page_reporting.html
+.. [3] https://www.kernel.org/doc/html/latest/mm/free_page_reporting.html
diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst
index eee0e8c5c368..cd41ada4fdad 100644
--- a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst
+++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst
@@ -33,9 +33,9 @@ DAMON 为不同的用户提供了下面这些接口。
   口相同。这将在下一个LTS内核发布后被移除，所以用户应该转移到
   :ref:`sysfs interface <sysfs_interface>`。
 - *内核空间编程接口。*
-  :doc:`这 </vm/damon/api>` 这是为内核空间程序员准备的。使用它，用户可以通过为你编写内
+  :doc:`这 </mm/damon/api>` 这是为内核空间程序员准备的。使用它，用户可以通过为你编写内
   核空间的DAMON应用程序，最灵活有效地利用DAMON的每一个功能。你甚至可以为各种地址空间扩展DAMON。
-  详细情况请参考接口 :doc:`文件 </vm/damon/api>`。
+  详细情况请参考接口 :doc:`文件 </mm/damon/api>`。
 
 sysfs接口
 =========
@@ -148,7 +148,7 @@ contexts/<N>/monitoring_attrs/
 在 ``nr_regions`` 目录下，有两个文件分别用于DAMON监测区域的下限和上限（``min`` 和 ``max`` ），
 这两个文件控制着监测的开销。你可以通过向这些文件的写入和读出来设置和获取这些值。
 
-关于间隔和监测区域范围的更多细节，请参考设计文件 (:doc:`/vm/damon/design`)。
+关于间隔和监测区域范围的更多细节，请参考设计文件 (:doc:`/mm/damon/design`)。
 
 contexts/<N>/targets/
 ---------------------
@@ -318,7 +318,7 @@ DAMON导出了八个文件, ``attrs``, ``target_ids``, ``init_regions``,
 ----
 
 用户可以通过读取和写入 ``attrs`` 文件获得和设置 ``采样间隔`` 、 ``聚集间隔`` 、 ``更新间隔``
-以及监测目标区域的最小/最大数量。要详细了解监测属性，请参考 `:doc:/vm/damon/design` 。例如，
+以及监测目标区域的最小/最大数量。要详细了解监测属性，请参考 `:doc:/mm/damon/design` 。例如，
 下面的命令将这些值设置为5ms、100ms、1000ms、10和1000，然后再次检查::
 
     # cd <debugfs>/damon
diff --git a/Documentation/translations/zh_CN/core-api/index.rst b/Documentation/translations/zh_CN/core-api/index.rst
index 26d9913fc8b6..b03020c8b2ab 100644
--- a/Documentation/translations/zh_CN/core-api/index.rst
+++ b/Documentation/translations/zh_CN/core-api/index.rst
@@ -101,7 +101,7 @@ Todolist:
 ========
 
 如何在内核中分配和使用内存。请注意，在
-:doc:`/vm/index` 中有更多的内存管理文档。
+:doc:`/mm/index` 中有更多的内存管理文档。
 
 .. toctree::
    :maxdepth: 1
diff --git a/Documentation/translations/zh_CN/index.rst b/Documentation/translations/zh_CN/index.rst
index ad7bb8c17562..bf85baca8b3e 100644
--- a/Documentation/translations/zh_CN/index.rst
+++ b/Documentation/translations/zh_CN/index.rst
@@ -118,7 +118,7 @@ TODOList:
    sound/index
    filesystems/index
    scheduler/index
-   vm/index
+   mm/index
    peci/index
 
 TODOList:
diff --git a/Documentation/translations/zh_CN/mm/active_mm.rst b/Documentation/translations/zh_CN/mm/active_mm.rst
new file mode 100644
index 000000000000..c2816f523bd7
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/active_mm.rst
@@ -0,0 +1,85 @@
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/mm/active_mm.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+=========
+Active MM
+=========
+
+这是一封linux之父回复开发者的一封邮件，所以翻译时我尽量保持邮件格式的完整。
+
+::
+
+ List:       linux-kernel
+ Subject:    Re: active_mm
+ From:       Linus Torvalds <torvalds () transmeta ! com>
+ Date:       1999-07-30 21:36:24
+
+ 因为我并不经常写解释，所以已经抄送到linux-kernel邮件列表，而当我做这些，
+ 且更多的人在阅读它们时，我觉得棒极了。
+
+ 1999年7月30日 星期五， David Mosberger 写道：
+ >
+ > 是否有一个简短的描述，说明task_struct中的
+ >  "mm" 和 "active_mm"应该如何使用？ (如果
+ > 这个问题在邮件列表中讨论过，我表示歉意--我刚
+ > 刚度假回来，有一段时间没能关注linux-kernel了）。
+
+ 基本上，新的设定是：
+
+  - 我们有“真实地址空间”和“匿名地址空间”。区别在于，匿名地址空间根本不关心用
+    户级页表，所以当我们做上下文切换到匿名地址空间时，我们只是让以前的地址空间
+    处于活动状态。
+
+    一个“匿名地址空间”的明显用途是任何不需要任何用户映射的线程--所有的内核线
+    程基本上都属于这一类，但即使是“真正的”线程也可以暂时说在一定时间内它们不
+    会对用户空间感兴趣，调度器不妨试着避免在切换VM状态上浪费时间。目前只有老
+    式的bdflush sync能做到这一点。
+
+  - “tsk->mm” 指向 “真实地址空间”。对于一个匿名进程来说，tsk->mm将是NULL，
+    其逻辑原因是匿名进程实际上根本就 “没有” 真正的地址空间。
+
+  - 然而，我们显然需要跟踪我们为这样的匿名用户“偷用”了哪个地址空间。为此，我们
+    有 “tsk->active_mm”，它显示了当前活动的地址空间是什么。
+
+    规则是，对于一个有真实地址空间的进程（即tsk->mm是 non-NULL），active_mm
+    显然必须与真实的mm相同。
+
+    对于一个匿名进程，tsk->mm == NULL，而tsk->active_mm是匿名进程运行时
+    “借用”的mm。当匿名进程被调度走时，借用的地址空间被返回并清除。
+
+ 为了支持所有这些，“struct mm_struct”现在有两个计数器：一个是 “mm_users”
+ 计数器，即有多少 “真正的地址空间用户”，另一个是 “mm_count”计数器，即 “lazy”
+ 用户（即匿名用户）的数量，如果有任何真正的用户，则加1。
+
+ 通常情况下，至少有一个真正的用户，但也可能是真正的用户在另一个CPU上退出，而
+ 一个lazy的用户仍在活动，所以你实际上得到的情况是，你有一个地址空间 **只**
+ 被lazy的用户使用。这通常是一个短暂的生命周期状态，因为一旦这个线程被安排给一
+ 个真正的线程，这个 “僵尸” mm就会被释放，因为 “mm_count”变成了零。
+
+ 另外，一个新的规则是，**没有人** 再把 “init_mm” 作为一个真正的MM了。
+ “init_mm”应该被认为只是一个 “没有其他上下文时的lazy上下文”，事实上，它主
+ 要是在启动时使用，当时还没有真正的VM被创建。因此，用来检查的代码
+
+   if (current->mm == &init_mm)
+
+ 一般来说，应该用
+
+   if (!current->mm)
+
+ 取代上面的写法（这更有意义--测试基本上是 “我们是否有一个用户环境”，并且通常
+ 由缺页异常处理程序和类似的东西来完成）。
+
+ 总之，我刚才在ftp.kernel.org上放了一个pre-patch-2.3.13-1，因为它稍微改
+ 变了接口以适配alpha（谁会想到呢，但alpha体系结构上下文切换代码实际上最终是
+ 最丑陋的之一--不像其他架构的MM和寄存器状态是分开的，alpha的PALcode将两者
+ 连接起来，你需要同时切换两者）。
+
+ (文档来源 http://marc.info/?l=linux-kernel&m=93337278602211&w=2)
diff --git a/Documentation/translations/zh_CN/mm/balance.rst b/Documentation/translations/zh_CN/mm/balance.rst
new file mode 100644
index 000000000000..6fd79209c307
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/balance.rst
@@ -0,0 +1,81 @@
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/mm/balance.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+========
+内存平衡
+========
+
+2000年1月开始，作者：Kanoj Sarcar <kanoj@sgi.com>
+
+对于 !__GFP_HIGH 和 !__GFP_KSWAPD_RECLAIM 以及非 __GFP_IO 的分配，需要进行
+内存平衡。
+
+调用者避免回收的第一个原因是调用者由于持有自旋锁或处于中断环境中而无法睡眠。第二个
+原因可能是，调用者愿意在不产生页面回收开销的情况下分配失败。这可能发生在有0阶回退
+选项的机会主义高阶分配请求中。在这种情况下，调用者可能也希望避免唤醒kswapd。
+
+__GFP_IO分配请求是为了防止文件系统死锁。
+
+在没有非睡眠分配请求的情况下，做平衡似乎是有害的。页面回收可以被懒散地启动，也就是
+说，只有在需要的时候（也就是区域的空闲内存为0），而不是让它成为一个主动的过程。
+
+也就是说，内核应该尝试从直接映射池中满足对直接映射页的请求，而不是回退到dma池中，
+这样就可以保持dma池为dma请求（不管是不是原子的）所填充。类似的争论也适用于高内存
+和直接映射的页面。相反，如果有很多空闲的dma页，最好是通过从dma池中分配一个来满足
+常规的内存请求，而不是产生常规区域平衡的开销。
+
+在2.2中，只有当空闲页总数低于总内存的1/64时，才会启动内存平衡/页面回收。如果dma
+和常规内存的比例合适，即使dma区完全空了，也很可能不会进行平衡。2.2已经在不同内存
+大小的生产机器上运行，即使有这个问题存在，似乎也做得不错。在2.3中，由于HIGHMEM的
+存在，这个问题变得更加严重。
+
+在2.3中，区域平衡可以用两种方式之一来完成：根据区域的大小（可能是低级区域的大小），
+我们可以在初始化阶段决定在平衡任何区域时应该争取多少空闲页。好的方面是，在平衡的时
+候，我们不需要看低级区的大小，坏的方面是，我们可能会因为忽略低级区可能较低的使用率
+而做过于频繁的平衡。另外，只要对分配程序稍作修改，就有可能将memclass()宏简化为一
+个简单的等式。
+
+另一个可能的解决方案是，我们只在一个区 **和** 其所有低级区的空闲内存低于该区及其
+低级区总内存的1/64时进行平衡。这就解决了2.2的平衡问题，并尽可能地保持了与2.2行为
+的接近。另外，平衡算法在各种架构上的工作方式也是一样的，这些架构有不同数量和类型的
+内存区。如果我们想变得更花哨一点，我们可以在未来为不同区域的自由页面分配不同的权重。
+
+请注意，如果普通区的大小与dma区相比是巨大的，那么在决定是否平衡普通区的时候，考虑
+空闲的dma页就变得不那么重要了。那么第一个解决方案就变得更有吸引力。
+
+所附的补丁实现了第二个解决方案。它还 “修复”了两个问题：首先，在低内存条件下，kswapd
+被唤醒，就像2.2中的非睡眠分配。第二，HIGHMEM区也被平衡了，以便给replace_with_highmem()
+一个争取获得HIGHMEM页的机会，同时确保HIGHMEM分配不会落回普通区。这也确保了HIGHMEM
+页不会被泄露（例如，在一个HIGHMEM页在交换缓存中但没有被任何人使用的情况下）。
+
+kswapd还需要知道它应该平衡哪些区。kswapd主要是在无法进行平衡的情况下需要的，可能
+是因为所有的分配请求都来自中断上下文，而所有的进程上下文都在睡眠。对于2.3，
+kswapd并不真正需要平衡高内存区，因为中断上下文并不请求高内存页。kswapd看zone
+结构体中的zone_wake_kswapd字段来决定一个区是否需要平衡。
+
+如果从进程内存和shm中偷取页面可以减轻该页面节点中任何区的内存压力，而该区的内存压力
+已经低于其水位，则会进行偷取。
+
+watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd：
+这些是每个区的字段，用于确定一个区何时需要平衡。当页面数低于水位[WMARK_MIN]时，
+hysteric 的字段low_on_memory被设置。这个字段会一直被设置，直到空闲页数变成水位
+[WMARK_HIGH]。当low_on_memory被设置时，页面分配请求将尝试释放该区域的一些页面（如果
+请求中设置了GFP_WAIT）。与此相反的是，决定唤醒kswapd以释放一些区的页。这个决定不是基于
+hysteresis 的，而是当空闲页的数量低于watermark[WMARK_LOW]时就会进行；在这种情况下，
+zone_wake_kswapd也被设置。
+
+
+我所听到的（超棒的）想法：
+
+1. 动态经历应该影响平衡：可以跟踪一个区的失败请求的数量，并反馈到平衡方案中（jalvo@mbay.net）。
+
+2. 实现一个类似于replace_with_highmem()的replace_with_regular()，以保留dma页面。
+   (lkd@tantalophile.demon.co.uk)
diff --git a/Documentation/translations/zh_CN/mm/damon/api.rst b/Documentation/translations/zh_CN/mm/damon/api.rst
new file mode 100644
index 000000000000..5593a83c86bc
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/damon/api.rst
@@ -0,0 +1,32 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+:Original: Documentation/mm/damon/api.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+=======
+API参考
+=======
+
+内核空间的程序可以使用下面的API来使用DAMON的每个功能。你所需要做的就是引用 ``damon.h`` ，
+它位于源代码树的include/linux/。
+
+结构体
+======
+
+该API在以下内核代码中:
+
+include/linux/damon.h
+
+
+函数
+====
+
+该API在以下内核代码中:
+
+mm/damon/core.c
diff --git a/Documentation/translations/zh_CN/mm/damon/design.rst b/Documentation/translations/zh_CN/mm/damon/design.rst
new file mode 100644
index 000000000000..16e3db34a7dd
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/damon/design.rst
@@ -0,0 +1,140 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+:Original: Documentation/mm/damon/design.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+====
+设计
+====
+
+可配置的层
+==========
+
+DAMON提供了数据访问监控功能，同时使其准确性和开销可控。基本的访问监控需要依赖于目标地址空间
+并为之优化的基元。另一方面，作为DAMON的核心，准确性和开销的权衡机制是在纯逻辑空间中。DAMON
+将这两部分分离在不同的层中，并定义了它的接口，以允许各种低层次的基元实现与核心逻辑的配置。
+
+由于这种分离的设计和可配置的接口，用户可以通过配置核心逻辑和适当的低级基元实现来扩展DAMON的
+任何地址空间。如果没有提供合适的，用户可以自己实现基元。
+
+例如，物理内存、虚拟内存、交换空间、那些特定的进程、NUMA节点、文件和支持的内存设备将被支持。
+另外，如果某些架构或设备支持特殊的优化访问检查基元，这些基元将很容易被配置。
+
+
+特定地址空间基元的参考实现
+==========================
+
+基本访问监测的低级基元被定义为两部分。:
+
+1. 确定地址空间的监测目标地址范围
+2. 目标空间中特定地址范围的访问检查。
+
+DAMON目前为物理和虚拟地址空间提供了基元的实现。下面两个小节描述了这些工作的方式。
+
+
+基于VMA的目标地址范围构造
+-------------------------
+
+这仅仅是针对虚拟地址空间基元的实现。对于物理地址空间，只是要求用户手动设置监控目标地址范围。
+
+在进程的超级巨大的虚拟地址空间中，只有小部分被映射到物理内存并被访问。因此，跟踪未映射的地
+址区域只是一种浪费。然而，由于DAMON可以使用自适应区域调整机制来处理一定程度的噪声，所以严
+格来说，跟踪每一个映射并不是必须的，但在某些情况下甚至会产生很高的开销。也就是说，监测目标
+内部过于巨大的未映射区域应该被移除，以不占用自适应机制的时间。
+
+出于这个原因，这个实现将复杂的映射转换为三个不同的区域，覆盖地址空间的每个映射区域。这三个
+区域之间的两个空隙是给定地址空间中两个最大的未映射区域。这两个最大的未映射区域是堆和最上面
+的mmap()区域之间的间隙，以及在大多数情况下最下面的mmap()区域和堆之间的间隙。因为这些间隙
+在通常的地址空间中是异常巨大的，排除这些间隙就足以做出合理的权衡。下面详细说明了这一点::
+
+    <heap>
+    <BIG UNMAPPED REGION 1>
+    <uppermost mmap()-ed region>
+    (small mmap()-ed regions and munmap()-ed regions)
+    <lowermost mmap()-ed region>
+    <BIG UNMAPPED REGION 2>
+    <stack>
+
+
+基于PTE访问位的访问检查
+-----------------------
+
+物理和虚拟地址空间的实现都使用PTE Accessed-bit进行基本访问检查。唯一的区别在于从地址中
+找到相关的PTE访问位的方式。虚拟地址的实现是为该地址的目标任务查找页表，而物理地址的实现则
+是查找与该地址有映射关系的每一个页表。通过这种方式，实现者找到并清除下一个采样目标地址的位，
+并检查该位是否在一个采样周期后再次设置。这可能会干扰其他使用访问位的内核子系统，即空闲页跟
+踪和回收逻辑。为了避免这种干扰，DAMON使其与空闲页面跟踪相互排斥，并使用 ``PG_idle`` 和
+``PG_young`` 页面标志来解决与回收逻辑的冲突，就像空闲页面跟踪那样。
+
+
+独立于地址空间的核心机制
+========================
+
+下面四个部分分别描述了DAMON的核心机制和五个监测属性，即 ``采样间隔`` 、 ``聚集间隔`` 、
+``更新间隔`` 、 ``最小区域数`` 和 ``最大区域数`` 。
+
+
+访问频率监测
+------------
+
+DAMON的输出显示了在给定的时间内哪些页面的访问频率是多少。访问频率的分辨率是通过设置
+``采样间隔`` 和 ``聚集间隔`` 来控制的。详细地说，DAMON检查每个 ``采样间隔`` 对每
+个页面的访问，并将结果汇总。换句话说，计算每个页面的访问次数。在每个 ``聚合间隔`` 过
+去后，DAMON调用先前由用户注册的回调函数，以便用户可以阅读聚合的结果，然后再清除这些结
+果。这可以用以下简单的伪代码来描述::
+
+    while monitoring_on:
+        for page in monitoring_target:
+            if accessed(page):
+                nr_accesses[page] += 1
+        if time() % aggregation_interval == 0:
+            for callback in user_registered_callbacks:
+                callback(monitoring_target, nr_accesses)
+            for page in monitoring_target:
+                nr_accesses[page] = 0
+        sleep(sampling interval)
+
+这种机制的监测开销将随着目标工作负载规模的增长而任意增加。
+
+
+基于区域的抽样调查
+------------------
+
+为了避免开销的无限制增加，DAMON将假定具有相同访问频率的相邻页面归入一个区域。只要保持
+这个假设（一个区域内的页面具有相同的访问频率），该区域内就只需要检查一个页面。因此，对
+于每个 ``采样间隔`` ，DAMON在每个区域中随机挑选一个页面，等待一个 ``采样间隔`` ，检
+查该页面是否同时被访问，如果被访问则增加该区域的访问频率。因此，监测开销是可以通过设置
+区域的数量来控制的。DAMON允许用户设置最小和最大的区域数量来进行权衡。
+
+然而，如果假设没有得到保证，这个方案就不能保持输出的质量。
+
+
+适应性区域调整
+--------------
+
+即使最初的监测目标区域被很好地构建以满足假设（同一区域内的页面具有相似的访问频率），数
+据访问模式也会被动态地改变。这将导致监测质量下降。为了尽可能地保持假设，DAMON根据每个
+区域的访问频率自适应地进行合并和拆分。
+
+对于每个 ``聚集区间`` ，它比较相邻区域的访问频率，如果频率差异较小，就合并这些区域。
+然后，在它报告并清除每个区域的聚合接入频率后，如果区域总数不超过用户指定的最大区域数，
+它将每个区域拆分为两个或三个区域。
+
+通过这种方式，DAMON提供了其最佳的质量和最小的开销，同时保持了用户为其权衡设定的界限。
+
+
+动态目标空间更新处理
+--------------------
+
+监测目标地址范围可以动态改变。例如，虚拟内存可以动态地被映射和解映射。物理内存可以被
+热插拔。
+
+由于在某些情况下变化可能相当频繁，DAMON允许监控操作检查动态变化，包括内存映射变化，
+并仅在用户指定的时间间隔（ ``更新间隔`` ）中的每个时间段，将其应用于监控操作相关的
+数据结构，如抽象的监控目标内存区。
\ No newline at end of file
diff --git a/Documentation/translations/zh_CN/mm/damon/faq.rst b/Documentation/translations/zh_CN/mm/damon/faq.rst
new file mode 100644
index 000000000000..de4be417494a
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/damon/faq.rst
@@ -0,0 +1,48 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+:Original: Documentation/mm/damon/faq.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+========
+常见问题
+========
+
+为什么是一个新的子系统，而不是扩展perf或其他用户空间工具？
+==========================================================
+
+首先，因为它需要尽可能的轻量级，以便可以在线使用，所以应该避免任何不必要的开销，如内核-用户
+空间的上下文切换成本。第二，DAMON的目标是被包括内核在内的其他程序所使用。因此，对特定工具
+（如perf）的依赖性是不可取的。这就是DAMON在内核空间实现的两个最大的原因。
+
+
+“闲置页面跟踪” 或 “perf mem” 可以替代DAMON吗？
+==============================================
+
+闲置页跟踪是物理地址空间访问检查的一个低层次的原始方法。“perf mem”也是类似的，尽管它可以
+使用采样来减少开销。另一方面，DAMON是一个更高层次的框架，用于监控各种地址空间。它专注于内
+存管理优化，并提供复杂的精度/开销处理机制。因此，“空闲页面跟踪” 和 “perf mem” 可以提供
+DAMON输出的一个子集，但不能替代DAMON。
+
+
+DAMON是否只支持虚拟内存？
+=========================
+
+不，DAMON的核心是独立于地址空间的。用户可以在DAMON核心上实现和配置特定地址空间的低级原始
+部分，包括监测目标区域的构造和实际的访问检查。通过这种方式，DAMON用户可以用任何访问检查技
+术来监测任何地址空间。
+
+尽管如此，DAMON默认为虚拟内存和物理内存提供了基于vma/rmap跟踪和PTE访问位检查的地址空间
+相关功能的实现，以供参考和方便使用。
+
+
+我可以简单地监测页面的粒度吗？
+==============================
+
+是的，你可以通过设置 ``min_nr_regions`` 属性高于工作集大小除以页面大小的值来实现。
+因为监视目标区域的大小被强制为 ``>=page size`` ，所以区域分割不会产生任何影响。
diff --git a/Documentation/translations/zh_CN/mm/damon/index.rst b/Documentation/translations/zh_CN/mm/damon/index.rst
new file mode 100644
index 000000000000..b03bf307204f
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/damon/index.rst
@@ -0,0 +1,32 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+:Original: Documentation/mm/damon/index.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+==========================
+DAMON:数据访问监视器
+==========================
+
+DAMON是Linux内核的一个数据访问监控框架子系统。DAMON的核心机制使其成为
+（该核心机制详见(Documentation/translations/zh_CN/mm/damon/design.rst)）
+
+ - *准确度* （监测输出对DRAM级别的内存管理足够有用；但可能不适合CPU Cache级别），
+ - *轻量级* （监控开销低到可以在线应用），以及
+ - *可扩展* （无论目标工作负载的大小，开销的上限值都在恒定范围内）。
+
+因此，利用这个框架，内核的内存管理机制可以做出高级决策。会导致高数据访问监控开销的实
+验性内存管理优化工作可以再次进行。同时，在用户空间，有一些特殊工作负载的用户可以编写
+个性化的应用程序，以便更好地了解和优化他们的工作负载和系统。
+
+.. toctree::
+   :maxdepth: 2
+
+   faq
+   design
+   api
diff --git a/Documentation/translations/zh_CN/mm/free_page_reporting.rst b/Documentation/translations/zh_CN/mm/free_page_reporting.rst
new file mode 100644
index 000000000000..83b14cce9adf
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/free_page_reporting.rst
@@ -0,0 +1,38 @@
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/mm/_free_page_reporting.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+==========
+空闲页报告
+==========
+
+空闲页报告是一个API，设备可以通过它来注册接收系统当前未使用的页面列表。这在虚拟
+化的情况下是很有用的，客户机能够使用这些数据来通知管理器它不再使用内存中的某些页
+面。
+
+对于驱动，通常是气球驱动要使用这个功能，它将分配和初始化一个page_reporting_dev_info
+结构体。它要填充的结构体中的字段是用于处理散点列表的 "report" 函数指针。它还必
+须保证每次调用该函数时能处理至少相当于PAGE_REPORTING_CAPACITY的散点列表条目。
+假设没有其他页面报告设备已经注册， 对page_reporting_register的调用将向报告框
+架注册页面报告接口。
+
+一旦注册，页面报告API将开始向驱动报告成批的页面。API将在接口被注册后2秒开始报告
+页面，并在任何足够高的页面被释放之后2秒继续报告。
+
+报告的页面将被存储在传递给报告函数的散列表中，最后一个条目的结束位被设置在条目
+nent-1中。 当页面被报告函数处理时，分配器将无法访问它们。一旦报告函数完成，这些
+页将被返回到它们所获得的自由区域。
+
+在移除使用空闲页报告的驱动之前，有必要调用page_reporting_unregister，以移除
+目前被空闲页报告使用的page_reporting_dev_info结构体。这样做将阻止进一步的报
+告通过该接口发出。如果另一个驱动或同一驱动被注册，它就有可能恢复前一个驱动在报告
+空闲页方面的工作。
+
+
+Alexander Duyck, 2019年12月04日
diff --git a/Documentation/translations/zh_CN/mm/frontswap.rst b/Documentation/translations/zh_CN/mm/frontswap.rst
new file mode 100644
index 000000000000..5c18ea2be04f
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/frontswap.rst
@@ -0,0 +1,196 @@
+:Original: Documentation/mm/_free_page_reporting.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+=========
+Frontswap
+=========
+
+Frontswap为交换页提供了一个 “transcendent memory” 的接口。在一些环境中，由
+于交换页被保存在RAM（或类似RAM的设备）中，而不是交换磁盘，因此可以获得巨大的性能
+节省（提高）。
+
+.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/
+
+Frontswap之所以这么命名，是因为它可以被认为是与swap设备的“back”存储相反。存
+储器被认为是一个同步并发安全的面向页面的“伪RAM设备”，符合transcendent memory
+（如Xen的“tmem”，或内核内压缩内存，又称“zcache”，或未来的类似RAM的设备）的要
+求；这个伪RAM设备不能被内核直接访问或寻址，其大小未知且可能随时间变化。驱动程序通过
+调用frontswap_register_ops将自己与frontswap链接起来，以适当地设置frontswap_ops
+的功能，它提供的功能必须符合某些策略，如下所示:
+
+一个 “init” 将设备准备好接收与指定的交换设备编号（又称“类型”）相关的frontswap
+交换页。一个 “store” 将把该页复制到transcendent memory，并与该页的类型和偏移
+量相关联。一个 “load” 将把该页，如果找到的话，从transcendent memory复制到内核
+内存，但不会从transcendent memory中删除该页。一个 “invalidate_page” 将从
+transcendent memory中删除该页，一个 “invalidate_area” 将删除所有与交换类型
+相关的页（例如，像swapoff）并通知 “device” 拒绝进一步存储该交换类型。
+
+一旦一个页面被成功存储，在该页面上的匹配加载通常会成功。因此，当内核发现自己处于需
+要交换页面的情况时，它首先尝试使用frontswap。如果存储的结果是成功的，那么数据就已
+经成功的保存到了transcendent memory中，并且避免了磁盘写入，如果后来再读回数据，
+也避免了磁盘读取。如果存储返回失败，transcendent memory已经拒绝了该数据，且该页
+可以像往常一样被写入交换空间。
+
+请注意，如果一个页面被存储，而该页面已经存在于transcendent memory中（一个 “重复”
+的存储），要么存储成功，数据被覆盖，要么存储失败，该页面被废止。这确保了旧的数据永远
+不会从frontswap中获得。
+
+如果配置正确，对frontswap的监控是通过 `/sys/kernel/debug/frontswap` 目录下的
+debugfs完成的。frontswap的有效性可以通过以下方式测量（在所有交换设备中）:
+
+``failed_stores``
+	有多少次存储的尝试是失败的
+
+``loads``
+	尝试了多少次加载（应该全部成功）
+
+``succ_stores``
+	有多少次存储的尝试是成功的
+
+``invalidates``
+	尝试了多少次作废
+
+后台实现可以提供额外的指标。
+
+经常问到的问题
+==============
+
+* 价值在哪里?
+
+当一个工作负载开始交换时，性能就会下降。Frontswap通过提供一个干净的、动态的接口来
+读取和写入交换页到 “transcendent memory”，从而大大增加了许多这样的工作负载的性
+能，否则内核是无法直接寻址的。当数据被转换为不同的形式和大小（比如压缩）或者被秘密
+移动（对于一些类似RAM的设备来说，这可能对写平衡很有用）时，这个接口是理想的。交换
+页（和被驱逐的页面缓存页）是这种比RAM慢但比磁盘快得多的“伪RAM设备”的一大用途。
+
+Frontswap对内核的影响相当小，为各种系统配置中更动态、更灵活的RAM利用提供了巨大的
+灵活性：
+
+在单一内核的情况下，又称“zcache”，页面被压缩并存储在本地内存中，从而增加了可以安
+全保存在RAM中的匿名页面总数。Zcache本质上是用压缩/解压缩的CPU周期换取更好的内存利
+用率。Benchmarks测试显示，当内存压力较低时，几乎没有影响，而在高内存压力下的一些
+工作负载上，则有明显的性能改善（25%以上）。
+
+“RAMster” 在zcache的基础上增加了对集群系统的 “peer-to-peer” transcendent memory
+的支持。Frontswap页面像zcache一样被本地压缩，但随后被“remotified” 到另一个系
+统的RAM。这使得RAM可以根据需要动态地来回负载平衡，也就是说，当系统A超载时，它可以
+交换到系统B，反之亦然。RAMster也可以被配置成一个内存服务器，因此集群中的许多服务器
+可以根据需要动态地交换到配置有大量内存的单一服务器上......而不需要预先配置每个客户
+有多少内存可用
+
+在虚拟情况下，虚拟化的全部意义在于统计地将物理资源在多个虚拟机的不同需求之间进行复
+用。对于RAM来说，这真的很难做到，而且在不改变内核的情况下，要做好这一点的努力基本上
+是失败的（除了一些广为人知的特殊情况下的工作负载）。具体来说，Xen Transcendent Memory
+后端允许管理器拥有的RAM “fallow”，不仅可以在多个虚拟机之间进行“time-shared”，
+而且页面可以被压缩和重复利用，以优化RAM的利用率。当客户操作系统被诱导交出未充分利用
+的RAM时（如 “selfballooning”），突然出现的意外内存压力可能会导致交换；frontswap
+允许这些页面被交换到管理器RAM中或从管理器RAM中交换（如果整体主机系统内存条件允许），
+从而减轻计划外交换可能带来的可怕的性能影响。
+
+一个KVM的实现正在进行中，并且已经被RFC'ed到lkml。而且，利用frontswap，对NVM作为
+内存扩展技术的调查也在进行中。
+
+* 当然，在某些情况下可能有性能上的优势，但frontswap的空间/时间开销是多少？
+
+如果 CONFIG_FRONTSWAP 被禁用，每个 frontswap 钩子都会编译成空，唯一的开销是每
+个 swapon'ed swap 设备的几个额外字节。如果 CONFIG_FRONTSWAP 被启用，但没有
+frontswap的 “backend” 寄存器，每读或写一个交换页就会有一个额外的全局变量，而不
+是零。如果 CONFIG_FRONTSWAP 被启用，并且有一个frontswap的backend寄存器，并且
+后端每次 “store” 请求都失败（即尽管声称可能，但没有提供内存），CPU 的开销仍然可以
+忽略不计 - 因为每次frontswap失败都是在交换页写到磁盘之前，系统很可能是 I/O 绑定
+的，无论如何使用一小部分的 CPU 都是不相关的。
+
+至于空间，如果CONFIG_FRONTSWAP被启用，并且有一个frontswap的backend注册，那么
+每个交换设备的每个交换页都会被分配一个比特。这是在内核已经为每个交换设备的每个交换
+页分配的8位（在2.6.34之前是16位）上增加的。(Hugh Dickins观察到，frontswap可能
+会偷取现有的8个比特，但是我们以后再来担心这个小的优化问题)。对于标准的4K页面大小的
+非常大的交换盘（这很罕见），这是每32GB交换盘1MB开销。
+
+当交换页存储在transcendent memory中而不是写到磁盘上时，有一个副作用，即这可能会
+产生更多的内存压力，有可能超过其他的优点。一个backend，比如zcache，必须实现策略
+来仔细（但动态地）管理内存限制，以确保这种情况不会发生。
+
+* 好吧，那就用内核骇客能理解的术语来快速概述一下这个frontswap补丁的作用如何？
+
+我们假设在内核初始化过程中，一个frontswap 的 “backend” 已经注册了；这个注册表
+明这个frontswap 的 “backend” 可以访问一些不被内核直接访问的“内存”。它到底提
+供了多少内存是完全动态和随机的。
+
+每当一个交换设备被交换时，就会调用frontswap_init()，把交换设备的编号（又称“类
+型”）作为一个参数传给它。这就通知了frontswap，以期待 “store” 与该号码相关的交
+换页的尝试。
+
+每当交换子系统准备将一个页面写入交换设备时（参见swap_writepage()），就会调用
+frontswap_store。Frontswap与frontswap backend协商，如果backend说它没有空
+间，frontswap_store返回-1，内核就会照常把页换到交换设备上。注意，来自frontswap
+backend的响应对内核来说是不可预测的；它可能选择从不接受一个页面，可能接受每九个
+页面，也可能接受每一个页面。但是如果backend确实接受了一个页面，那么这个页面的数
+据已经被复制并与类型和偏移量相关联了，而且backend保证了数据的持久性。在这种情况
+下，frontswap在交换设备的“frontswap_map” 中设置了一个位，对应于交换设备上的
+页面偏移量，否则它就会将数据写入该设备。
+
+当交换子系统需要交换一个页面时（swap_readpage()），它首先调用frontswap_load()，
+检查frontswap_map，看这个页面是否早先被frontswap backend接受。如果是，该页
+的数据就会从frontswap后端填充，换入就完成了。如果不是，正常的交换代码将被执行，
+以便从真正的交换设备上获得这一页的数据。
+
+所以每次frontswap backend接受一个页面时，交换设备的读取和（可能）交换设备的写
+入都被 “frontswap backend store” 和（可能）“frontswap backend loads”
+所取代，这可能会快得多。
+
+* frontswap不能被配置为一个 “特殊的” 交换设备，它的优先级要高于任何真正的交换
+  设备（例如像zswap，或者可能是swap-over-nbd/NFS）？
+
+首先，现有的交换子系统不允许有任何种类的交换层次结构。也许它可以被重写以适应层次
+结构，但这将需要相当大的改变。即使它被重写，现有的交换子系统也使用了块I/O层，它
+假定交换设备是固定大小的，其中的任何页面都是可线性寻址的。Frontswap几乎没有触
+及现有的交换子系统，而是围绕着块I/O子系统的限制，提供了大量的灵活性和动态性。
+
+例如，frontswap backend对任何交换页的接受是完全不可预测的。这对frontswap backend
+的定义至关重要，因为它赋予了backend完全动态的决定权。在zcache中，人们无法预
+先知道一个页面的可压缩性如何。可压缩性 “差” 的页面会被拒绝，而 “差” 本身也可
+以根据当前的内存限制动态地定义。
+
+此外，frontswap是完全同步的，而真正的交换设备，根据定义，是异步的，并且使用
+块I/O。块I/O层不仅是不必要的，而且可能进行 “优化”，这对面向RAM的设备来说是
+不合适的，包括将一些页面的写入延迟相当长的时间。同步是必须的，以确保后端的动
+态性，并避免棘手的竞争条件，这将不必要地大大增加frontswap和/或块I/O子系统的
+复杂性。也就是说，只有最初的 “store” 和 “load” 操作是需要同步的。一个独立
+的异步线程可以自由地操作由frontswap存储的页面。例如，RAMster中的 “remotification”
+线程使用标准的异步内核套接字，将压缩的frontswap页面移动到远程机器。同样，
+KVM的客户方实现可以进行客户内压缩，并使用 “batched” hypercalls。
+
+在虚拟化环境中，动态性允许管理程序（或主机操作系统）做“intelligent overcommit”。
+例如，它可以选择只接受页面，直到主机交换可能即将发生，然后强迫客户机做他们
+自己的交换。
+
+transcendent memory规格的frontswap有一个坏处。因为任何 “store” 都可
+能失败，所以必须在一个真正的交换设备上有一个真正的插槽来交换页面。因此，
+frontswap必须作为每个交换设备的 “影子” 来实现，它有可能容纳交换设备可能
+容纳的每一个页面，也有可能根本不容纳任何页面。这意味着frontswap不能包含比
+swap设备总数更多的页面。例如，如果在某些安装上没有配置交换设备，frontswap
+就没有用。无交换设备的便携式设备仍然可以使用frontswap，但是这种设备的
+backend必须配置某种 “ghost” 交换设备，并确保它永远不会被使用。
+
+
+* 为什么会有这种关于 “重复存储” 的奇怪定义？如果一个页面以前被成功地存储过，
+  难道它不能总是被成功地覆盖吗？
+
+几乎总是可以的，不，有时不能。考虑一个例子，数据被压缩了，原来的4K页面被压
+缩到了1K。现在，有人试图用不可压缩的数据覆盖该页，因此会占用整个4K。但是
+backend没有更多的空间了。在这种情况下，这个存储必须被拒绝。每当frontswap
+拒绝一个会覆盖的存储时，它也必须使旧的数据作废，并确保它不再被访问。因为交
+换子系统会把新的数据写到读交换设备上，这是确保一致性的正确做法。
+
+* 为什么frontswap补丁会创建新的头文件swapfile.h？
+
+frontswap代码依赖于一些swap子系统内部的数据结构，这些数据结构多年来一直
+在静态和全局之间来回移动。这似乎是一个合理的妥协：将它们定义为全局，但在一
+个新的包含文件中声明它们，该文件不被包含swap.h的大量源文件所包含。
+
+Dan Magenheimer，最后更新于2012年4月9日
diff --git a/Documentation/translations/zh_CN/mm/highmem.rst b/Documentation/translations/zh_CN/mm/highmem.rst
new file mode 100644
index 000000000000..81202c65e000
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/highmem.rst
@@ -0,0 +1,128 @@
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/mm/highmem.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+==========
+高内存处理
+==========
+
+作者: Peter Zijlstra <a.p.zijlstra@chello.nl>
+
+.. contents:: :local:
+
+高内存是什么？
+==============
+
+当物理内存的大小接近或超过虚拟内存的最大大小时，就会使用高内存（highmem）。在这一点上，内
+核不可能在任何时候都保持所有可用的物理内存的映射。这意味着内核需要开始使用它想访问的物理内
+存的临时映射。
+
+没有被永久映射覆盖的那部分（物理）内存就是我们所说的 "高内存"。对于这个边界的确切位置，有
+各种架构上的限制。
+
+例如，在i386架构中，我们选择将内核映射到每个进程的虚拟空间，这样我们就不必为内核的进入/退
+出付出全部的TLB作废代价。这意味着可用的虚拟内存空间（i386上为4GiB）必须在用户和内核空间之
+间进行划分。
+
+使用这种方法的架构的传统分配方式是3:1，3GiB用于用户空间，顶部的1GiB用于内核空间。::
+
+		+--------+ 0xffffffff
+		| Kernel |
+		+--------+ 0xc0000000
+		|        |
+		| User   |
+		|        |
+		+--------+ 0x00000000
+
+这意味着内核在任何时候最多可以映射1GiB的物理内存，但是由于我们需要虚拟地址空间来做其他事
+情--包括访问其余物理内存的临时映射--实际的直接映射通常会更少（通常在~896MiB左右）。
+
+其他有mm上下文标签的TLB的架构可以有独立的内核和用户映射。然而，一些硬件（如一些ARM）在使
+用mm上下文标签时，其虚拟空间有限。
+
+
+临时虚拟映射
+============
+
+内核包含几种创建临时映射的方法。:
+
+* vmap().  这可以用来将多个物理页长期映射到一个连续的虚拟空间。它需要synchronization
+  来解除映射。
+
+* kmap().  这允许对单个页面进行短期映射。它需要synchronization，但在一定程度上被摊销。
+  当以嵌套方式使用时，它也很容易出现死锁，因此不建议在新代码中使用它。
+
+* kmap_atomic().  这允许对单个页面进行非常短的时间映射。由于映射被限制在发布它的CPU上，
+  它表现得很好，但发布任务因此被要求留在该CPU上直到它完成，以免其他任务取代它的映射。
+
+  kmap_atomic() 也可以由中断上下文使用，因为它不睡眠，而且调用者可能在调用kunmap_atomic()
+  之后才睡眠。
+
+  可以假设k[un]map_atomic()不会失败。
+
+
+使用kmap_atomic
+===============
+
+何时何地使用 kmap_atomic() 是很直接的。当代码想要访问一个可能从高内存（见__GFP_HIGHMEM）
+分配的页面的内容时，例如在页缓存中的页面，就会使用它。该API有两个函数，它们的使用方式与
+下面类似::
+
+	/* 找到感兴趣的页面。 */
+	struct page *page = find_get_page(mapping, offset);
+
+	/* 获得对该页内容的访问权。 */
+	void *vaddr = kmap_atomic(page);
+
+	/* 对该页的内容做一些处理。 */
+	memset(vaddr, 0, PAGE_SIZE);
+
+	/* 解除该页面的映射。 */
+	kunmap_atomic(vaddr);
+
+注意，kunmap_atomic()调用的是kmap_atomic()调用的结果而不是参数。
+
+如果你需要映射两个页面，因为你想从一个页面复制到另一个页面，你需要保持kmap_atomic调用严
+格嵌套，如::
+
+	vaddr1 = kmap_atomic(page1);
+	vaddr2 = kmap_atomic(page2);
+
+	memcpy(vaddr1, vaddr2, PAGE_SIZE);
+
+	kunmap_atomic(vaddr2);
+	kunmap_atomic(vaddr1);
+
+
+临时映射的成本
+==============
+
+创建临时映射的代价可能相当高。体系架构必须操作内核的页表、数据TLB和/或MMU的寄存器。
+
+如果CONFIG_HIGHMEM没有被设置，那么内核会尝试用一点计算来创建映射，将页面结构地址转换成
+指向页面内容的指针，而不是去捣鼓映射。在这种情况下，解映射操作可能是一个空操作。
+
+如果CONFIG_MMU没有被设置，那么就不可能有临时映射和高内存。在这种情况下，也将使用计算方法。
+
+
+i386 PAE
+========
+
+在某些情况下，i386 架构将允许你在 32 位机器上安装多达 64GiB 的内存。但这有一些后果:
+
+* Linux需要为系统中的每个页面建立一个页帧结构，而且页帧需要驻在永久映射中，这意味着：
+
+* 你最多可以有896M/sizeof(struct page)页帧；由于页结构体是32字节的，所以最终会有
+  112G的页；然而，内核需要在内存中存储更多的页帧......
+
+* PAE使你的页表变大--这使系统变慢，因为更多的数据需要在TLB填充等方面被访问。一个好处
+  是，PAE有更多的PTE位，可以提供像NX和PAT这样的高级功能。
+
+一般的建议是，你不要在32位机器上使用超过8GiB的空间--尽管更多的空间可能对你和你的工作
+量有用，但你几乎是靠你自己--不要指望内核开发者真的会很关心事情的进展情况。
diff --git a/Documentation/translations/zh_CN/mm/hmm.rst b/Documentation/translations/zh_CN/mm/hmm.rst
new file mode 100644
index 000000000000..5024a8a15516
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/hmm.rst
@@ -0,0 +1,361 @@
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/mm/hmm.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+==================
+异构内存管理 (HMM)
+==================
+
+提供基础设施和帮助程序以将非常规内存（设备内存，如板上 GPU 内存）集成到常规内核路径中，其
+基石是此类内存的专用struct page（请参阅本文档的第 5 至 7 节）。
+
+HMM 还为 SVM（共享虚拟内存）提供了可选的帮助程序，即允许设备透明地访问与 CPU 一致的程序
+地址，这意味着 CPU 上的任何有效指针也是该设备的有效指针。这对于简化高级异构计算的使用变得
+必不可少，其中 GPU、DSP 或 FPGA 用于代表进程执行各种计算。
+
+本文档分为以下部分：在第一部分中，我揭示了与使用特定于设备的内存分配器相关的问题。在第二
+部分中，我揭示了许多平台固有的硬件限制。第三部分概述了 HMM 设计。第四部分解释了 CPU 页
+表镜像的工作原理以及 HMM 在这种情况下的目的。第五部分处理内核中如何表示设备内存。最后，
+最后一节介绍了一个新的迁移助手，它允许利用设备 DMA 引擎。
+
+.. contents:: :local:
+
+使用特定于设备的内存分配器的问题
+================================
+
+具有大量板载内存（几 GB）的设备（如 GPU）历来通过专用驱动程序特定 API 管理其内存。这会
+造成设备驱动程序分配和管理的内存与常规应用程序内存（私有匿名、共享内存或常规文件支持内存）
+之间的隔断。从这里开始，我将把这个方面称为分割的地址空间。我使用共享地址空间来指代相反的情况：
+即，设备可以透明地使用任何应用程序内存区域。
+
+分割的地址空间的发生是因为设备只能访问通过设备特定 API 分配的内存。这意味着从设备的角度来
+看，程序中的所有内存对象并不平等，这使得依赖于广泛的库的大型程序变得复杂。
+
+具体来说，这意味着想要利用像 GPU 这样的设备的代码需要在通用分配的内存（malloc、mmap
+私有、mmap 共享）和通过设备驱动程序 API 分配的内存之间复制对象（这仍然以 mmap 结束，
+但是是设备文件）。
+
+对于平面数据集（数组、网格、图像……），这并不难实现，但对于复杂数据集（列表、树……），
+很难做到正确。复制一个复杂的数据集需要重新映射其每个元素之间的所有指针关系。这很容易出错，
+而且由于数据集和地址的重复，程序更难调试。
+
+分割地址空间也意味着库不能透明地使用它们从核心程序或另一个库中获得的数据，因此每个库可能
+不得不使用设备特定的内存分配器来重复其输入数据集。大型项目会因此受到影响，并因为各种内存
+拷贝而浪费资源。
+
+复制每个库的API以接受每个设备特定分配器分配的内存作为输入或输出，并不是一个可行的选择。
+这将导致库入口点的组合爆炸。
+
+最后，随着高级语言结构（在 C++ 中，当然也在其他语言中）的进步，编译器现在有可能在没有程
+序员干预的情况下利用 GPU 和其他设备。某些编译器识别的模式仅适用于共享地址空间。对所有
+其他模式，使用共享地址空间也更合理。
+
+
+I/O 总线、设备内存特性
+======================
+
+由于一些限制，I/O 总线削弱了共享地址空间。大多数 I/O 总线只允许从设备到主内存的基本
+内存访问；甚至缓存一致性通常是可选的。从 CPU 访问设备内存甚至更加有限。通常情况下，它
+不是缓存一致的。
+
+如果我们只考虑 PCIE 总线，那么设备可以访问主内存（通常通过 IOMMU）并与 CPU 缓存一
+致。但是，它只允许设备对主存储器进行一组有限的原子操作。这在另一个方向上更糟：CPU
+只能访问有限范围的设备内存，而不能对其执行原子操作。因此，从内核的角度来看，设备内存不
+能被视为与常规内存等同。
+
+另一个严重的因素是带宽有限（约 32GBytes/s，PCIE 4.0 和 16 通道）。这比最快的 GPU
+内存 (1 TBytes/s) 慢 33 倍。最后一个限制是延迟。从设备访问主内存的延迟比设备访问自
+己的内存时高一个数量级。
+
+一些平台正在开发新的 I/O 总线或对 PCIE 的添加/修改以解决其中一些限制
+（OpenCAPI、CCIX）。它们主要允许 CPU 和设备之间的双向缓存一致性，并允许架构支持的所
+有原子操作。遗憾的是，并非所有平台都遵循这一趋势，并且一些主要架构没有针对这些问题的硬
+件解决方案。
+
+因此，为了使共享地址空间有意义，我们不仅必须允许设备访问任何内存，而且还必须允许任何内
+存在设备使用时迁移到设备内存（在迁移时阻止 CPU 访问）。
+
+
+共享地址空间和迁移
+==================
+
+HMM 打算提供两个主要功能。第一个是通过复制cpu页表到设备页表中来共享地址空间，因此对
+于进程地址空间中的任何有效主内存地址，相同的地址指向相同的物理内存。
+
+为了实现这一点，HMM 提供了一组帮助程序来填充设备页表，同时跟踪 CPU 页表更新。设备页表
+更新不像 CPU 页表更新那么容易。要更新设备页表，您必须分配一个缓冲区（或使用预先分配的
+缓冲区池）并在其中写入 GPU 特定命令以执行更新（取消映射、缓存失效和刷新等）。这不能通
+过所有设备的通用代码来完成。因此，为什么HMM提供了帮助器，在把硬件的具体细节留给设备驱
+动程序的同时，把一切可以考虑的因素都考虑进去了。
+
+HMM 提供的第二种机制是一种新的 ZONE_DEVICE 内存，它允许为设备内存的每个页面分配一个
+struct page。这些页面很特殊，因为 CPU 无法映射它们。然而，它们允许使用现有的迁移机
+制将主内存迁移到设备内存，从 CPU 的角度来看，一切看起来都像是换出到磁盘的页面。使用
+struct page可以与现有的 mm 机制进行最简单、最干净的集成。再次，HMM 仅提供帮助程序，
+首先为设备内存热插拔新的 ZONE_DEVICE 内存，然后执行迁移。迁移内容和时间的策略决定留
+给设备驱动程序。
+
+请注意，任何 CPU 对设备页面的访问都会触发缺页异常并迁移回主内存。例如，当支持给定CPU
+地址 A 的页面从主内存页面迁移到设备页面时，对地址 A 的任何 CPU 访问都会触发缺页异常
+并启动向主内存的迁移。
+
+凭借这两个特性，HMM 不仅允许设备镜像进程地址空间并保持 CPU 和设备页表同步，而且还通
+过迁移设备正在使用的数据集部分来利用设备内存。
+
+
+地址空间镜像实现和API
+=====================
+
+地址空间镜像的主要目标是允许将一定范围的 CPU 页表复制到一个设备页表中；HMM 有助于
+保持两者同步。想要镜像进程地址空间的设备驱动程序必须从注册 mmu_interval_notifier
+开始::
+
+ int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
+				  struct mm_struct *mm, unsigned long start,
+				  unsigned long length,
+				  const struct mmu_interval_notifier_ops *ops);
+
+在 ops->invalidate() 回调期间，设备驱动程序必须对范围执行更新操作（将范围标记为只
+读，或完全取消映射等）。设备必须在驱动程序回调返回之前完成更新。
+
+当设备驱动程序想要填充一个虚拟地址范围时，它可以使用::
+
+  int hmm_range_fault(struct hmm_range *range);
+
+如果请求写访问，它将在丢失或只读条目上触发缺页异常（见下文）。缺页异常使用通用的 mm 缺
+页异常代码路径，就像 CPU 缺页异常一样。
+
+这两个函数都将 CPU 页表条目复制到它们的 pfns 数组参数中。该数组中的每个条目对应于虚拟
+范围中的一个地址。HMM 提供了一组标志来帮助驱动程序识别特殊的 CPU 页表项。
+
+在 sync_cpu_device_pagetables() 回调中锁定是驱动程序必须尊重的最重要的方面，以保
+持事物正确同步。使用模式是::
+
+ int driver_populate_range(...)
+ {
+      struct hmm_range range;
+      ...
+
+      range.notifier = &interval_sub;
+      range.start = ...;
+      range.end = ...;
+      range.hmm_pfns = ...;
+
+      if (!mmget_not_zero(interval_sub->notifier.mm))
+          return -EFAULT;
+
+ again:
+      range.notifier_seq = mmu_interval_read_begin(&interval_sub);
+      mmap_read_lock(mm);
+      ret = hmm_range_fault(&range);
+      if (ret) {
+          mmap_read_unlock(mm);
+          if (ret == -EBUSY)
+                 goto again;
+          return ret;
+      }
+      mmap_read_unlock(mm);
+
+      take_lock(driver->update);
+      if (mmu_interval_read_retry(&ni, range.notifier_seq) {
+          release_lock(driver->update);
+          goto again;
+      }
+
+      /* Use pfns array content to update device page table,
+       * under the update lock */
+
+      release_lock(driver->update);
+      return 0;
+ }
+
+driver->update 锁与驱动程序在其 invalidate() 回调中使用的锁相同。该锁必须在调用
+mmu_interval_read_retry() 之前保持，以避免与并发 CPU 页表更新发生任何竞争。
+
+利用 default_flags 和 pfn_flags_mask
+====================================
+
+hmm_range 结构有 2 个字段，default_flags 和 pfn_flags_mask，它们指定整个范围
+的故障或快照策略，而不必为 pfns 数组中的每个条目设置它们。
+
+例如，如果设备驱动程序需要至少具有读取权限的范围的页面，它会设置::
+
+    range->default_flags = HMM_PFN_REQ_FAULT;
+    range->pfn_flags_mask = 0;
+
+并如上所述调用 hmm_range_fault()。这将填充至少具有读取权限的范围内的所有页面。
+
+现在假设驱动程序想要做同样的事情，除了它想要拥有写权限的范围内的一页。现在驱动程序设
+置::
+
+    range->default_flags = HMM_PFN_REQ_FAULT;
+    range->pfn_flags_mask = HMM_PFN_REQ_WRITE;
+    range->pfns[index_of_write] = HMM_PFN_REQ_WRITE;
+
+有了这个，HMM 将在至少读取（即有效）的所有页面中异常，并且对于地址
+== range->start + (index_of_write << PAGE_SHIFT) 它将异常写入权限，即，如果
+CPU pte 没有设置写权限，那么HMM将调用handle_mm_fault()。
+
+hmm_range_fault 完成后，标志位被设置为页表的当前状态，即 HMM_PFN_VALID | 如果页
+面可写，将设置 HMM_PFN_WRITE。
+
+
+从核心内核的角度表示和管理设备内存
+==================================
+
+尝试了几种不同的设计来支持设备内存。第一个使用特定于设备的数据结构来保存有关迁移内存
+的信息，HMM 将自身挂接到 mm 代码的各个位置，以处理对设备内存支持的地址的任何访问。
+事实证明，这最终复制了 struct page 的大部分字段，并且还需要更新许多内核代码路径才
+能理解这种新的内存类型。
+
+大多数内核代码路径从不尝试访问页面后面的内存，而只关心struct page的内容。正因为如此，
+HMM 切换到直接使用 struct page 用于设备内存，这使得大多数内核代码路径不知道差异。
+我们只需要确保没有人试图从 CPU 端映射这些页面。
+
+移入和移出设备内存
+==================
+
+由于 CPU 无法直接访问设备内存，因此设备驱动程序必须使用硬件 DMA 或设备特定的加载/存
+储指令来迁移数据。migrate_vma_setup()、migrate_vma_pages() 和
+migrate_vma_finalize() 函数旨在使驱动程序更易于编写并集中跨驱动程序的通用代码。
+
+在将页面迁移到设备私有内存之前，需要创建特殊的设备私有 ``struct page`` 。这些将用
+作特殊的“交换”页表条目，以便 CPU 进程在尝试访问已迁移到设备专用内存的页面时会发生异常。
+
+这些可以通过以下方式分配和释放::
+
+    struct resource *res;
+    struct dev_pagemap pagemap;
+
+    res = request_free_mem_region(&iomem_resource, /* number of bytes */,
+                                  "name of driver resource");
+    pagemap.type = MEMORY_DEVICE_PRIVATE;
+    pagemap.range.start = res->start;
+    pagemap.range.end = res->end;
+    pagemap.nr_range = 1;
+    pagemap.ops = &device_devmem_ops;
+    memremap_pages(&pagemap, numa_node_id());
+
+    memunmap_pages(&pagemap);
+    release_mem_region(pagemap.range.start, range_len(&pagemap.range));
+
+还有devm_request_free_mem_region(), devm_memremap_pages(),
+devm_memunmap_pages() 和 devm_release_mem_region() 当资源可以绑定到 ``struct device``.
+
+整体迁移步骤类似于在系统内存中迁移 NUMA 页面(see :ref:`Page migration <page_migration>`) ，
+但这些步骤分为设备驱动程序特定代码和共享公共代码:
+
+1. ``mmap_read_lock()``
+
+   设备驱动程序必须将 ``struct vm_area_struct`` 传递给migrate_vma_setup()，
+   因此需要在迁移期间保留 mmap_read_lock() 或 mmap_write_lock()。
+
+2. ``migrate_vma_setup(struct migrate_vma *args)``
+
+   设备驱动初始化了 ``struct migrate_vma`` 的字段，并将该指针传递给
+   migrate_vma_setup()。``args->flags`` 字段是用来过滤哪些源页面应该被迁移。
+   例如，设置 ``MIGRATE_VMA_SELECT_SYSTEM`` 将只迁移系统内存，设置
+   ``MIGRATE_VMA_SELECT_DEVICE_PRIVATE`` 将只迁移驻留在设备私有内存中的页
+   面。如果后者被设置， ``args->pgmap_owner`` 字段被用来识别驱动所拥有的设备
+   私有页。这就避免了试图迁移驻留在其他设备中的设备私有页。目前，只有匿名的私有VMA
+   范围可以被迁移到系统内存和设备私有内存。
+
+   migrate_vma_setup()所做的第一步是用 ``mmu_notifier_invalidate_range_start()``
+   和 ``mmu_notifier_invalidate_range_end()`` 调用来遍历设备周围的页表，使
+   其他设备的MMU无效，以便在 ``args->src`` 数组中填写要迁移的PFN。
+   ``invalidate_range_start()`` 回调传递给一个``struct mmu_notifier_range`` ，
+   其 ``event`` 字段设置为MMU_NOTIFY_MIGRATE， ``owner`` 字段设置为传递给
+   migrate_vma_setup()的 ``args->pgmap_owner`` 字段。这允许设备驱动跳过无
+   效化回调，只无效化那些实际正在迁移的设备私有MMU映射。这一点将在下一节详细解释。
+
+
+   在遍历页表时，一个 ``pte_none()`` 或 ``is_zero_pfn()`` 条目导致一个有效
+   的  “zero” PFN 存储在 ``args->src`` 阵列中。这让驱动分配设备私有内存并清
+   除它，而不是复制一个零页。到系统内存或设备私有结构页的有效PTE条目将被
+   ``lock_page()``锁定，与LRU隔离（如果系统内存和设备私有页不在LRU上），从进
+   程中取消映射，并插入一个特殊的迁移PTE来代替原来的PTE。 migrate_vma_setup()
+   还清除了 ``args->dst`` 数组。
+
+3. 设备驱动程序分配目标页面并将源页面复制到目标页面。
+
+   驱动程序检查每个 ``src`` 条目以查看该 ``MIGRATE_PFN_MIGRATE`` 位是否已
+   设置并跳过未迁移的条目。设备驱动程序还可以通过不填充页面的 ``dst`` 数组来选
+   择跳过页面迁移。
+
+   然后，驱动程序分配一个设备私有 struct page 或一个系统内存页，用 ``lock_page()``
+   锁定该页，并将 ``dst`` 数组条目填入::
+
+     dst[i] = migrate_pfn(page_to_pfn(dpage));
+
+   现在驱动程序知道这个页面正在被迁移，它可以使设备私有 MMU 映射无效并将设备私有
+   内存复制到系统内存或另一个设备私有页面。由于核心 Linux 内核会处理 CPU 页表失
+   效，因此设备驱动程序只需使其自己的 MMU 映射失效。
+
+   驱动程序可以使用 ``migrate_pfn_to_page(src[i])`` 来获取源设备的
+   ``struct page`` 面，并将源页面复制到目标设备上，如果指针为 ``NULL`` ，意
+   味着源页面没有被填充到系统内存中，则清除目标设备的私有内存。
+
+4. ``migrate_vma_pages()``
+
+   这一步是实际“提交”迁移的地方。
+
+   如果源页是 ``pte_none()`` 或 ``is_zero_pfn()`` 页，这时新分配的页会被插
+   入到CPU的页表中。如果一个CPU线程在同一页面上发生异常，这可能会失败。然而，页
+   表被锁定，只有一个新页会被插入。如果它失去了竞争，设备驱动将看到
+   ``MIGRATE_PFN_MIGRATE`` 位被清除。
+
+   如果源页被锁定、隔离等，源 ``struct page`` 信息现在被复制到目标
+   ``struct page`` ，最终完成CPU端的迁移。
+
+5. 设备驱动为仍在迁移的页面更新设备MMU页表，回滚未迁移的页面。
+
+   如果 ``src`` 条目仍然有 ``MIGRATE_PFN_MIGRATE`` 位被设置，设备驱动可以
+   更新设备MMU，如果 ``MIGRATE_PFN_WRITE`` 位被设置，则设置写启用位。
+
+6. ``migrate_vma_finalize()``
+
+   这一步用新页的页表项替换特殊的迁移页表项，并释放对源和目的 ``struct page``
+   的引用。
+
+7. ``mmap_read_unlock()``
+
+   现在可以释放锁了。
+
+独占访问存储器
+==============
+
+一些设备具有诸如原子PTE位的功能，可以用来实现对系统内存的原子访问。为了支持对一
+个共享的虚拟内存页的原子操作，这样的设备需要对该页的访问是排他的，而不是来自CPU
+的任何用户空间访问。  ``make_device_exclusive_range()`` 函数可以用来使一
+个内存范围不能从用户空间访问。
+
+这将用特殊的交换条目替换给定范围内的所有页的映射。任何试图访问交换条目的行为都会
+导致一个异常，该异常会通过用原始映射替换该条目而得到恢复。驱动程序会被通知映射已
+经被MMU通知器改变，之后它将不再有对该页的独占访问。独占访问被保证持续到驱动程序
+放弃页面锁和页面引用为止，这时页面上的任何CPU异常都可以按所述进行。
+
+内存 cgroup (memcg) 和 rss 统计
+===============================
+
+目前，设备内存被视为 rss 计数器中的任何常规页面（如果设备页面用于匿名，则为匿名，
+如果设备页面用于文件支持页面，则为文件，如果设备页面用于共享内存，则为 shmem）。
+这是为了保持现有应用程序的故意选择，这些应用程序可能在不知情的情况下开始使用设备
+内存，运行不受影响。
+
+一个缺点是 OOM 杀手可能会杀死使用大量设备内存而不是大量常规系统内存的应用程序，
+因此不会释放太多系统内存。在决定以不同方式计算设备内存之前，我们希望收集更多关
+于应用程序和系统在存在设备内存的情况下在内存压力下如何反应的实际经验。
+
+对内存 cgroup 做出了相同的决定。设备内存页面根据相同的内存 cgroup 计算，常规
+页面将被计算在内。这确实简化了进出设备内存的迁移。这也意味着从设备内存迁移回常规
+内存不会失败，因为它会超过内存 cgroup 限制。一旦我们对设备内存的使用方式及其对
+内存资源控制的影响有了更多的了解，我们可能会在后面重新考虑这个选择。
+
+请注意，设备内存永远不能由设备驱动程序或通过 GUP 固定，因此此类内存在进程退出时
+总是被释放的。或者在共享内存或文件支持内存的情况下，当删除最后一个引用时。
diff --git a/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst
new file mode 100644
index 000000000000..752e5696cd47
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst
@@ -0,0 +1,436 @@
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/mm/hugetlbfs_reserv.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+==============
+Hugetlbfs 预留
+==============
+
+概述
+====
+
+:ref:`hugetlbpage` 中描述的巨页通常是预先分配给应用程序使用的。如果VMA指
+示要使用巨页，这些巨页会在缺页异常时被实例化到任务的地址空间。如果在缺页异常
+时没有巨页存在，任务就会被发送一个SIGBUS，并经常不高兴地死去。在加入巨页支
+持后不久，人们决定，在mmap()时检测巨页的短缺情况会更好。这个想法是，如果
+没有足够的巨页来覆盖映射，mmap()将失败。这首先是在mmap()时在代码中做一个
+简单的检查，以确定是否有足够的空闲巨页来覆盖映射。就像内核中的大多数东西一
+样，代码随着时间的推移而不断发展。然而，基本的想法是在mmap()时 “预留”
+巨页，以确保巨页可以用于该映射中的缺页异常。下面的描述试图描述在v4.10内核
+中是如何进行巨页预留处理的。
+
+
+读者
+====
+这个描述主要是针对正在修改hugetlbfs代码的内核开发者。
+
+
+数据结构
+========
+
+resv_huge_pages
+	这是一个全局的（per-hstate）预留的巨页的计数。预留的巨页只对预留它们的任
+	务可用。因此，一般可用的巨页的数量被计算为(``free_huge_pages - resv_huge_pages``)。
+Reserve Map
+	预留映射由以下结构体描述::
+
+		struct resv_map {
+			struct kref refs;
+			spinlock_t lock;
+			struct list_head regions;
+			long adds_in_progress;
+			struct list_head region_cache;
+			long region_cache_count;
+		};
+
+	系统中每个巨页映射都有一个预留映射。resv_map中的regions列表描述了映射中的
+	区域。一个区域被描述为::
+
+		struct file_region {
+			struct list_head link;
+			long from;
+			long to;
+		};
+
+	file_region结构体的 ‘from’ 和 ‘to’ 字段是进入映射的巨页索引。根据映射的类型，在
+	reserv_map 中的一个区域可能表示该范围存在预留，或预留不存在。
+Flags for MAP_PRIVATE Reservations
+	这些被存储在预留的映射指针的底部。
+
+	``#define HPAGE_RESV_OWNER    (1UL << 0)``
+		表示该任务是与该映射相关的预留的所有者。
+	``#define HPAGE_RESV_UNMAPPED (1UL << 1)``
+		表示最初映射此范围（并创建储备）的任务由于COW失败而从该任务（子任务）中取消映
+		射了一个页面。
+Page Flags
+	PagePrivate页面标志是用来指示在释放巨页时必须恢复巨页的预留。更多细节将在
+	“释放巨页” 一节中讨论。
+
+
+预留映射位置（私有或共享）
+==========================
+
+一个巨页映射或段要么是私有的，要么是共享的。如果是私有的，它通常只对一个地址空间
+（任务）可用。如果是共享的，它可以被映射到多个地址空间（任务）。对于这两种类型的映射，
+预留映射的位置和语义是明显不同的。位置的差异是：
+
+- 对于私有映射，预留映射挂在VMA结构体上。具体来说，就是vma->vm_private_data。这个保
+  留映射是在创建映射（mmap(MAP_PRIVATE)）时创建的。
+- 对于共享映射，预留映射挂在inode上。具体来说，就是inode->i_mapping->private_data。
+  由于共享映射总是由hugetlbfs文件系统中的文件支持，hugetlbfs代码确保每个节点包含一个预
+  留映射。因此，预留映射在创建节点时被分配。
+
+
+创建预留
+========
+当创建一个巨大的有页面支持的共享内存段（shmget(SHM_HUGETLB)）或通过mmap(MAP_HUGETLB)
+创建一个映射时，就会创建预留。这些操作会导致对函数hugetlb_reserve_pages()的调用::
+
+	int hugetlb_reserve_pages(struct inode *inode,
+				  long from, long to,
+				  struct vm_area_struct *vma,
+				  vm_flags_t vm_flags)
+
+hugetlb_reserve_pages()做的第一件事是检查在调用shmget()或mmap()时是否指定了NORESERVE
+标志。如果指定了NORESERVE，那么这个函数立即返回，因为不需要预留。
+
+参数'from'和'to'是映射或基础文件的巨页索引。对于shmget()，'from'总是0，'to'对应于段/映射
+的长度。对于mmap()，offset参数可以用来指定进入底层文件的偏移量。在这种情况下，'from'和'to'
+参数已经被这个偏移量所调整。
+
+PRIVATE和SHARED映射之间的一个很大的区别是预留在预留映射中的表示方式。
+
+- 对于共享映射，预留映射中的条目表示对应页面的预留存在或曾经存在。当预留被消耗时，预留映射不被
+  修改。
+- 对于私有映射，预留映射中没有条目表示相应页面存在预留。随着预留被消耗，条目被添加到预留映射中。
+  因此，预留映射也可用于确定哪些预留已被消耗。
+
+对于私有映射，hugetlb_reserve_pages()创建预留映射并将其挂在VMA结构体上。此外，
+HPAGE_RESV_OWNER标志被设置，以表明该VMA拥有预留。
+
+预留映射被查阅以确定当前映射/段需要多少巨页预留。对于私有映射，这始终是一个值（to - from）。
+然而，对于共享映射来说，一些预留可能已经存在于(to - from)的范围内。关于如何实现这一点的细节，
+请参见 :ref:`预留映射的修改 <resv_map_modifications>` 一节。
+
+该映射可能与一个子池（subpool）相关联。如果是这样，将查询子池以确保有足够的空间用于映射。子池
+有可能已经预留了可用于映射的预留空间。更多细节请参见 :ref: `子池预留 <sub_pool_resv>`
+一节。
+
+在咨询了预留映射和子池之后，就知道了需要的新预留数量。hugetlb_acct_memory()函数被调用以检查
+并获取所要求的预留数量。hugetlb_acct_memory()调用到可能分配和调整剩余页数的函数。然而，在这
+些函数中，代码只是检查以确保有足够的空闲的巨页来容纳预留。如果有的话，全局预留计数resv_huge_pages
+会被调整，如下所示::
+
+	if (resv_needed <= (resv_huge_pages - free_huge_pages))
+		resv_huge_pages += resv_needed;
+
+注意，在检查和调整这些计数器时，全局锁hugetlb_lock会被预留。
+
+如果有足够的空闲的巨页，并且全局计数resv_huge_pages被调整，那么与映射相关的预留映射被修改以
+反映预留。在共享映射的情况下，将存在一个file_region，包括'from'-'to'范围。对于私有映射，
+不对预留映射进行修改，因为没有条目表示存在预留。
+
+如果hugetlb_reserve_pages()成功，全局预留数和与映射相关的预留映射将根据需要被修改，以确保
+在'from'-'to'范围内存在预留。
+
+消耗预留/分配一个巨页
+===========================
+
+当与预留相关的巨页在相应的映射中被分配和实例化时，预留就被消耗了。该分配是在函数alloc_huge_page()
+中进行的::
+
+	struct page *alloc_huge_page(struct vm_area_struct *vma,
+				     unsigned long addr, int avoid_reserve)
+
+alloc_huge_page被传递给一个VMA指针和一个虚拟地址，因此它可以查阅预留映射以确定是否存在预留。
+此外，alloc_huge_page需要一个参数avoid_reserve，该参数表示即使看起来已经为指定的地址预留了
+预留，也不应该使用预留。avoid_reserve参数最常被用于写时拷贝和页面迁移的情况下，即现有页面的额
+外拷贝被分配。
+
+
+调用辅助函数vma_needs_reservation()来确定是否存在对映射(vma)中地址的预留。关于这个函数的详
+细内容，请参见 :ref:`预留映射帮助函数 <resv_map_helpers>` 一节。从
+vma_needs_reservation()返回的值通常为0或1。如果该地址存在预留，则为0，如果不存在预留，则为1。
+如果不存在预留，并且有一个与映射相关联的子池，则查询子池以确定它是否包含预留。如果子池包含预留，
+则可将其中一个用于该分配。然而，在任何情况下，avoid_reserve参数都会优先考虑为分配使用预留。在
+确定预留是否存在并可用于分配后，调用dequeue_huge_page_vma()函数。这个函数需要两个与预留有关
+的参数：
+
+- avoid_reserve，这是传递给alloc_huge_page()的同一个值/参数。
+- chg，尽管这个参数的类型是long，但只有0或1的值被传递给dequeue_huge_page_vma。如果该值为0，
+  则表明存在预留（关于可能的问题，请参见 “预留和内存策略” 一节）。如果值
+  为1，则表示不存在预留，如果可能的话，必须从全局空闲池中取出该页。
+
+与VMA的内存策略相关的空闲列表被搜索到一个空闲页。如果找到了一个页面，当该页面从空闲列表中移除时，
+free_huge_pages的值被递减。如果有一个与该页相关的预留，将进行以下调整::
+
+	SetPagePrivate(page);	/* 表示分配这个页面消耗了一个预留，
+				 * 如果遇到错误，以至于必须释放这个页面，预留将被
+				 * 恢复。 */
+	resv_huge_pages--;	/* 减少全局预留计数 */
+
+注意，如果找不到满足VMA内存策略的巨页，将尝试使用伙伴分配器分配一个。这就带来了超出预留范围
+的剩余巨页和超额分配的问题。即使分配了一个多余的页面，也会进行与上面一样的基于预留的调整:
+SetPagePrivate(page) 和 resv_huge_pages--.
+
+在获得一个新的巨页后，(page)->private被设置为与该页面相关的子池的值，如果它存在的话。当页
+面被释放时，这将被用于子池的计数。
+
+然后调用函数vma_commit_reservation()，根据预留的消耗情况调整预留映射。一般来说，这涉及
+到确保页面在区域映射的file_region结构体中被表示。对于预留存在的共享映射，预留映射中的条目
+已经存在，所以不做任何改变。然而，如果共享映射中没有预留，或者这是一个私有映射，则必须创建一
+个新的条目。
+
+注意，如果找不到满足VMA内存策略的巨页，将尝试使用伙伴分配器分配一个。这就带来了超出预留范围
+的剩余巨页和过度分配的问题。即使分配了一个多余的页面，也会进行与上面一样的基于预留的调整。
+SetPagePrivate(page)和resv_huge_pages-。
+
+在获得一个新的巨页后，(page)->private被设置为与该页面相关的子池的值，如果它存在的话。当页
+面被释放时，这将被用于子池的计数。
+
+然后调用函数vma_commit_reservation()，根据预留的消耗情况调整预留映射。一般来说，这涉及
+到确保页面在区域映射的file_region结构体中被表示。对于预留存在的共享映射，预留映射中的条目
+已经存在，所以不做任何改变。然而，如果共享映射中没有预留，或者这是一个私有映射，则必须创建
+一个新的条目。
+
+在alloc_huge_page()开始调用vma_needs_reservation()和页面分配后调用
+vma_commit_reservation()之间，预留映射有可能被改变。如果hugetlb_reserve_pages在共
+享映射中为同一页面被调用，这将是可能的。在这种情况下，预留计数和子池空闲页计数会有一个偏差。
+这种罕见的情况可以通过比较vma_needs_reservation和vma_commit_reservation的返回值来
+识别。如果检测到这种竞争，子池和全局预留计数将被调整以进行补偿。关于这些函数的更多信息，请
+参见 :ref:`预留映射帮助函数 <resv_map_helpers>` 一节。
+
+
+实例化巨页
+==========
+
+在巨页分配之后，页面通常被添加到分配任务的页表中。在此之前，共享映射中的页面被添加到页面缓
+存中，私有映射中的页面被添加到匿名反向映射中。在这两种情况下，PagePrivate标志被清除。因此，
+当一个已经实例化的巨页被释放时，不会对全局预留计数（resv_huge_pages）进行调整。
+
+
+释放巨页
+========
+
+巨页释放是由函数free_huge_page()执行的。这个函数是hugetlbfs复合页的析构器。因此，它只传
+递一个指向页面结构体的指针。当一个巨页被释放时，可能需要进行预留计算。如果该页与包含保
+留的子池相关联，或者该页在错误路径上被释放，必须恢复全局预留计数，就会出现这种情况。
+
+page->private字段指向与该页相关的任何子池。如果PagePrivate标志被设置，它表明全局预留计数
+应该被调整（关于如何设置这些标志的信息，请参见
+:ref: `消耗预留/分配一个巨页 <consume_resv>` ）。
+
+
+该函数首先调用hugepage_subpool_put_pages()来处理该页。如果这个函数返回一个0的值（不等于
+传递的1的值），它表明预留与子池相关联，这个新释放的页面必须被用来保持子池预留的数量超过最小值。
+因此，在这种情况下，全局resv_huge_pages计数器被递增。
+
+如果页面中设置了PagePrivate标志，那么全局resv_huge_pages计数器将永远被递增。
+
+子池预留
+========
+
+有一个结构体hstate与每个巨页尺寸相关联。hstate跟踪所有指定大小的巨页。一个子池代表一
+个hstate中的页面子集，它与一个已挂载的hugetlbfs文件系统相关
+
+当一个hugetlbfs文件系统被挂载时，可以指定min_size选项，它表示文件系统所需的最小的巨页数量。
+如果指定了这个选项，与min_size相对应的巨页的数量将被预留给文件系统使用。这个数字在结构体
+hugepage_subpool的min_hpages字段中被跟踪。在挂载时，hugetlb_acct_memory(min_hpages)
+被调用以预留指定数量的巨页。如果它们不能被预留，挂载就会失败。
+
+当从子池中获取或释放页面时，会调用hugepage_subpool_get/put_pages()函数。
+hugepage_subpool_get/put_pages被传递给巨页数量，以此来调整子池的 “已用页面” 计数
+（get为下降，put为上升）。通常情况下，如果子池中没有足够的页面，它们会返回与传递的相同的值或
+一个错误。
+
+然而，如果预留与子池相关联，可能会返回一个小于传递值的返回值。这个返回值表示必须进行的额外全局
+池调整的数量。例如，假设一个子池包含3个预留的巨页，有人要求5个。与子池相关的3个预留页可以用来
+满足部分请求。但是，必须从全局池中获得2个页面。为了向调用者转达这一信息，将返回值2。然后，调用
+者要负责从全局池中获取另外两个页面。
+
+
+COW和预留
+==========
+
+由于共享映射都指向并使用相同的底层页面，COW最大的预留问题是私有映射。在这种情况下，两个任务可
+以指向同一个先前分配的页面。一个任务试图写到该页，所以必须分配一个新的页，以便每个任务都指向它
+自己的页。
+
+当该页最初被分配时，该页的预留被消耗了。当由于COW而试图分配一个新的页面时，有可能没有空闲的巨
+页，分配会失败。
+
+当最初创建私有映射时，通过设置所有者的预留映射指针中的HPAGE_RESV_OWNER位来标记映射的所有者。
+由于所有者创建了映射，所有者拥有与映射相关的所有预留。因此，当一个写异常发生并且没有可用的页面
+时，对预留的所有者和非所有者采取不同的行动。
+
+在发生异常的任务不是所有者的情况下，异常将失败，该任务通常会收到一个SIGBUS。
+
+如果所有者是发生异常的任务，我们希望它能够成功，因为它拥有原始的预留。为了达到这个目的，该页被
+从非所有者任务中解映射出来。这样一来，唯一的引用就是来自拥有者的任务。此外，HPAGE_RESV_UNMAPPED
+位被设置在非拥有任务的预留映射指针中。如果非拥有者任务后来在一个不存在的页面上发生异常，它可能
+会收到一个SIGBUS。但是，映射/预留的原始拥有者的行为将与预期一致。
+
+预留映射的修改
+==============
+
+以下低级函数用于对预留映射进行修改。通常情况下，这些函数不会被直接调用。而是调用一个预留映射辅
+助函数，该函数调用这些低级函数中的一个。这些低级函数在源代码（mm/hugetlb.c）中得到了相当好的
+记录。这些函数是::
+
+	long region_chg(struct resv_map *resv, long f, long t);
+	long region_add(struct resv_map *resv, long f, long t);
+	void region_abort(struct resv_map *resv, long f, long t);
+	long region_count(struct resv_map *resv, long f, long t);
+
+在预留映射上的操作通常涉及两个操作:
+
+1) region_chg()被调用来检查预留映射，并确定在指定的范围[f, t]内有多少页目前没有被代表。
+
+   调用代码执行全局检查和分配，以确定是否有足够的巨页使操作成功。
+
+2)
+  a) 如果操作能够成功，regi_add()将被调用，以实际修改先前传递给regi_chg()的相同范围
+     [f, t]的预留映射。
+  b) 如果操作不能成功，region_abort被调用，在相同的范围[f, t]内中止操作。
+
+注意，这是一个两步的过程， region_add()和 region_abort()在事先调用 region_chg()后保证
+成功。 region_chg()负责预先分配任何必要的数据结构以确保后续操作（特别是 region_add()）的
+成功。
+
+如上所述，region_chg()确定该范围内当前没有在映射中表示的页面的数量。region_add()返回添加
+到映射中的范围内的页数。在大多数情况下， region_add() 的返回值与 region_chg() 的返回值相
+同。然而，在共享映射的情况下，有可能在调用 region_chg() 和 region_add() 之间对预留映射进
+行更改。在这种情况下，regi_add()的返回值将与regi_chg()的返回值不符。在这种情况下，全局计数
+和子池计数很可能是不正确的，需要调整。检查这种情况并进行适当的调整是调用者的责任。
+
+函数region_del()被调用以从预留映射中移除区域。
+它通常在以下情况下被调用:
+
+- 当hugetlbfs文件系统中的一个文件被删除时，该节点将被释放，预留映射也被释放。在释放预留映射
+  之前，所有单独的file_region结构体必须被释放。在这种情况下，region_del的范围是[0, LONG_MAX]。
+- 当一个hugetlbfs文件正在被截断时。在这种情况下，所有在新文件大小之后分配的页面必须被释放。
+  此外，预留映射中任何超过新文件大小的file_region条目必须被删除。在这种情况下，region_del
+  的范围是[new_end_of_file, LONG_MAX]。
+- 当在一个hugetlbfs文件中打洞时。在这种情况下，巨页被一次次从文件的中间移除。当这些页被移除
+  时，region_del()被调用以从预留映射中移除相应的条目。在这种情况下，region_del被传递的范
+  围是[page_idx, page_idx + 1]。
+
+在任何情况下，region_del()都会返回从预留映射中删除的页面数量。在非常罕见的情况下，region_del()
+会失败。这只能发生在打洞的情况下，即它必须分割一个现有的file_region条目，而不能分配一个新的
+结构体。在这种错误情况下，region_del()将返回-ENOMEM。这里的问题是，预留映射将显示对该页有
+预留。然而，子池和全局预留计数将不反映该预留。为了处理这种情况，调用函数hugetlb_fix_reserve_counts()
+来调整计数器，使其与不能被删除的预留映射条目相对应。
+
+region_count()在解除私有巨页映射时被调用。在私有映射中，预留映射中没有条目表明存在一个预留。
+因此，通过计算预留映射中的条目数，我们知道有多少预留被消耗了，有多少预留是未完成的
+（Outstanding = (end - start) - region_count（resv, start, end））。由于映射正在消
+失，子池和全局预留计数被未完成的预留数量所减去。
+
+预留映射帮助函数
+================
+
+有几个辅助函数可以查询和修改预留映射。这些函数只对特定的巨页的预留感兴趣，所以它们只是传入一个
+地址而不是一个范围。此外，它们还传入相关的VMA。从VMA中，可以确定映射的类型（私有或共享）和预留
+映射的位置（inode或VMA）。这些函数只是调用 “预留映射的修改” 一节中描述的基础函数。然而，
+它们确实考虑到了私有和共享映射的预留映射条目的 “相反” 含义，并向调用者隐藏了这个细节::
+
+	long vma_needs_reservation(struct hstate *h,
+				   struct vm_area_struct *vma,
+				   unsigned long addr)
+
+该函数为指定的页面调用 region_chg()。如果不存在预留，则返回1。如果存在预留，则返回0::
+
+	long vma_commit_reservation(struct hstate *h,
+				    struct vm_area_struct *vma,
+				    unsigned long addr)
+
+这将调用 region_add()，用于指定的页面。与region_chg和region_add的情况一样，该函数应在
+先前调用的vma_needs_reservation后调用。它将为该页添加一个预留条目。如果预留被添加，它将
+返回1，如果没有则返回0。返回值应与之前调用vma_needs_reservation的返回值进行比较。如果出
+现意外的差异，说明在两次调用之间修改了预留映射::
+
+	void vma_end_reservation(struct hstate *h,
+				 struct vm_area_struct *vma,
+				 unsigned long addr)
+
+这将调用指定页面的 region_abort()。与region_chg和region_abort的情况一样，该函数应在
+先前调用的vma_needs_reservation后被调用。它将中止/结束正在进行的预留添加操作::
+
+	long vma_add_reservation(struct hstate *h,
+				 struct vm_area_struct *vma,
+				 unsigned long addr)
+
+这是一个特殊的包装函数，有助于在错误路径上清理预留。它只从repare_reserve_on_error()函数
+中调用。该函数与vma_needs_reservation一起使用，试图将一个预留添加到预留映射中。它考虑到
+了私有和共享映射的不同预留映射语义。因此，region_add被调用用于共享映射（因为映射中的条目表
+示预留），而region_del被调用用于私有映射（因为映射中没有条目表示预留）。关于在错误路径上需
+要做什么的更多信息，请参见  “错误路径中的预留清理”  。
+
+
+错误路径中的预留清理
+====================
+
+正如在:ref:`预留映射帮助函数<resv_map_helpers>` 一节中提到的，预留的修改分两步进行。首
+先，在分配页面之前调用vma_needs_reservation。如果分配成功，则调用vma_commit_reservation。
+如果不是，则调用vma_end_reservation。全局和子池的预留计数根据操作的成功或失败进行调整，
+一切都很好。
+
+此外，在一个巨页被实例化后，PagePrivate标志被清空，这样，当页面最终被释放时，计数是
+正确的。
+
+然而，有几种情况是，在一个巨页被分配后，但在它被实例化之前，就遇到了错误。在这种情况下，
+页面分配已经消耗了预留，并进行了适当的子池、预留映射和全局计数调整。如果页面在这个时候被释放
+（在实例化和清除PagePrivate之前），那么free_huge_page将增加全局预留计数。然而，预留映射
+显示报留被消耗了。这种不一致的状态将导致预留的巨页的 “泄漏” 。全局预留计数将比它原本的要高，
+并阻止分配一个预先分配的页面。
+
+函数 restore_reserve_on_error() 试图处理这种情况。它有相当完善的文档。这个函数的目的
+是将预留映射恢复到页面分配前的状态。通过这种方式，预留映射的状态将与页面释放后的全局预留计
+数相对应。
+
+函数restore_reserve_on_error本身在试图恢复预留映射条目时可能会遇到错误。在这种情况下，
+它将简单地清除该页的PagePrivate标志。这样一来，当页面被释放时，全局预留计数将不会被递增。
+然而，预留映射将继续看起来像预留被消耗了一样。一个页面仍然可以被分配到该地址，但它不会像最
+初设想的那样使用一个预留页。
+
+有一些代码（最明显的是userfaultfd）不能调用restore_reserve_on_error。在这种情况下，
+它简单地修改了PagePrivate，以便在释放巨页时不会泄露预留。
+
+
+预留和内存策略
+==============
+当git第一次被用来管理Linux代码时，每个节点的巨页列表就存在于hstate结构中。预留的概念是
+在一段时间后加入的。当预留被添加时，没有尝试将内存策略考虑在内。虽然cpusets与内存策略不
+完全相同，但hugetlb_acct_memory中的这个注释总结了预留和cpusets/内存策略之间的相互作
+用::
+
+
+	/*
+	 * 当cpuset被配置时，它打破了严格的hugetlb页面预留，因为计数是在一个全局变量上完
+	 * 成的。在有cpuset的情况下，这样的预留完全是垃圾，因为预留没有根据当前cpuset的
+	 * 页面可用性来检查。在任务所在的cpuset中缺乏空闲的htlb页面时，应用程序仍然有可能
+	 * 被内核OOM'ed。试图用cpuset来执行严格的计数几乎是不可能的（或者说太难看了），因
+	 * 为cpuset太不稳定了，任务或内存节点可以在cpuset之间动态移动。与cpuset共享
+	 * hugetlb映射的语义变化是不可取的。然而，为了预留一些语义，我们退回到检查当前空闲
+	 * 页的可用性，作为一种最好的尝试，希望能将cpuset改变语义的影响降到最低。
+	 */
+
+添加巨页预留是为了防止在缺页异常时出现意外的页面分配失败（OOM）。然而，如果一个应用
+程序使用cpusets或内存策略，就不能保证在所需的节点上有巨页可用。即使有足够数量的全局
+预留，也是如此。
+
+Hugetlbfs回归测试
+=================
+
+最完整的hugetlb测试集在libhugetlbfs仓库。如果你修改了任何hugetlb相关的代码，请使用
+libhugetlbfs测试套件来检查回归情况。此外，如果你添加了任何新的hugetlb功能，请在
+libhugetlbfs中添加适当的测试。
+
+--
+Mike Kravetz，2017年4月7日
diff --git a/Documentation/translations/zh_CN/mm/hwpoison.rst b/Documentation/translations/zh_CN/mm/hwpoison.rst
new file mode 100644
index 000000000000..310862edc937
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/hwpoison.rst
@@ -0,0 +1,166 @@
+
+:Original: Documentation/mm/hwpoison.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+========
+hwpoison
+========
+
+什么是hwpoison?
+===============
+
+
+即将推出的英特尔CPU支持从一些内存错误中恢复（ ``MCA恢复`` ）。这需要操作系统宣布
+一个页面"poisoned"，杀死与之相关的进程，并避免在未来使用它。
+
+这个补丁包在虚拟机中实现了必要的(编程)框架。
+
+引用概述中的评论::
+
+	高级机器的检查与处理。处理方法是损坏的页面被硬件报告，通常是由于2位ECC内
+	存或高速缓存故障。
+
+	这主要是针对在后台检测到的损坏的页面。当当前的CPU试图访问它时，当前运行的进程
+	可以直接被杀死。因为还没有访问损坏的页面, 如果错误由于某种原因不能被处理，就可
+	以安全地忽略它. 而不是用另外一个机器检查去处理它。
+
+	处理不同状态的页面缓存页。这里棘手的部分是，相对于其他虚拟内存用户， 我们可以异
+	步访问任何页面。因为内存故障可能随时随地发生，可能违反了他们的一些假设。这就是
+	为什么这段代码必须非常小心。一般来说，它试图使用正常的锁规则，如获得标准锁，即使
+	这意味着错误处理可能需要很长的时间。
+
+	这里的一些操作有点低效，并且具有非线性的算法复杂性，因为数据结构没有针对这种情
+	况进行优化。特别是从vma到进程的映射就是这种情况。由于这种情况大概率是罕见的，所
+	以我们希望我们可以摆脱这种情况。
+
+该代码由mm/memory-failure.c中的高级处理程序、一个新的页面poison位和虚拟机中的
+各种检查组成，用来处理poison的页面。
+
+现在主要目标是KVM客户机，但它适用于所有类型的应用程序。支持KVM需要最近的qemu-kvm
+版本。
+
+对于KVM的使用，需要一个新的信号类型，这样KVM就可以用适当的地址将机器检查注入到客户
+机中。这在理论上也允许其他应用程序处理内存故障。我们的期望是，所有的应用程序都不要这
+样做，但一些非常专业的应用程序可能会这样做。
+
+故障恢复模式
+============
+
+有两种（实际上是三种）模式的内存故障恢复可以在。
+
+vm.memory_failure_recovery sysctl 置零:
+	所有的内存故障都会导致panic。请不要尝试恢复。
+
+早期处理
+	(可以在全局和每个进程中控制) 一旦检测到错误，立即向应用程序发送SIGBUS这允许
+	应用程序以温和的方式处理内存错误（例如，放弃受影响的对象） 这是KVM qemu使用的
+	模式。
+
+推迟处理
+	当应用程序运行到损坏的页面时，发送SIGBUS。这对不知道内存错误的应用程序来说是
+	最好的，默认情况下注意一些页面总是被当作late kill处理。
+
+用户控制
+========
+
+vm.memory_failure_recovery
+	参阅 sysctl.txt
+
+vm.memory_failure_early_kill
+	全局启用early kill
+
+PR_MCE_KILL
+	设置early/late kill mode/revert 到系统默认值。
+
+	arg1: PR_MCE_KILL_CLEAR:
+		恢复到系统默认值
+	arg1: PR_MCE_KILL_SET:
+		arg2定义了线程特定模式
+
+		PR_MCE_KILL_EARLY:
+			Early kill
+		PR_MCE_KILL_LATE:
+			Late kill
+		PR_MCE_KILL_DEFAULT
+			使用系统全局默认值
+
+	注意，如果你想有一个专门的线程代表进程处理SIGBUS(BUS_MCEERR_AO)，你应该在
+	指定线程上调用prctl(PR_MCE_KILL_EARLY)。否则，SIGBUS将被发送到主线程。
+
+PR_MCE_KILL_GET
+	返回当前模式
+
+测试
+====
+
+* madvise(MADV_HWPOISON, ....) (as root) - 在测试过程中Poison一个页面
+
+* 通过debugfs ``/sys/kernel/debug/hwpoison/`` hwpoison-inject模块
+
+  corrupt-pfn
+	在PFN处注入hwpoison故障，并echoed到这个文件。这做了一些早期过滤，以避
+	免在测试套件中损坏非预期页面。
+  unpoison-pfn
+	在PFN的Software-unpoison页面对应到这个文件。这样，一个页面可以再次被
+	复用。这只对Linux注入的故障起作用，对真正的内存故障不起作用。
+
+  注意这些注入接口并不稳定，可能会在不同的内核版本中发生变化
+
+  corrupt-filter-dev-major, corrupt-filter-dev-minor
+	只处理与块设备major/minor定义的文件系统相关的页面的内存故障。-1U是通
+	配符值。这应该只用于人工注入的测试。
+
+  corrupt-filter-memcg
+	限制注入到memgroup拥有的页面。由memcg的inode号指定。
+
+	Example::
+
+		mkdir /sys/fs/cgroup/mem/hwpoison
+
+	        usemem -m 100 -s 1000 &
+		echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks
+
+		memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ')
+		echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg
+
+		page-types -p `pidof init`   --hwpoison  # shall do nothing
+		page-types -p `pidof usemem` --hwpoison  # poison its pages
+
+  corrupt-filter-flags-mask, corrupt-filter-flags-value
+	当指定时，只有在((page_flags & mask) == value)的情况下才会poison页面。
+	这允许对许多种类的页面进行压力测试。page_flags与/proc/kpageflags中的相
+	同。这些标志位在include/linux/kernel-page-flags.h中定义，并在
+	Documentation/admin-guide/mm/pagemap.rst中记录。
+
+* 架构特定的MCE注入器
+
+  x86 有 mce-inject, mce-test
+
+  在mce-test中的一些便携式hwpoison测试程序，见下文。
+
+引用
+====
+
+http://halobates.de/mce-lc09-2.pdf
+	09年LinuxCon的概述演讲
+
+git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git
+	测试套件（在tsrc中的hwpoison特定可移植测试）。
+
+git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git
+	x86特定的注入器
+
+
+限制
+====
+- 不是所有的页面类型都被支持，而且永远不会。大多数内核内部对象不能被恢
+  复，目前只有LRU页。
+
+---
+Andi Kleen, 2009年10月
diff --git a/Documentation/translations/zh_CN/mm/index.rst b/Documentation/translations/zh_CN/mm/index.rst
new file mode 100644
index 000000000000..4c8c6b7b72a3
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/index.rst
@@ -0,0 +1,54 @@
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/mm/index.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+=================
+Linux内存管理文档
+=================
+
+这是一个关于Linux内存管理（mm）子系统内部的文档集，其中有不同层次的细节，包括注释
+和邮件列表的回复，用于阐述数据结构和算法的基本情况。如果你正在寻找关于简单分配内存的建
+议，请参阅(Documentation/translations/zh_CN/core-api/memory-allocation.rst)。
+对于控制和调整指南，请参阅(Documentation/admin-guide/mm/index)。
+TODO:待引用文档集被翻译完毕后请及时修改此处）
+
+.. toctree::
+   :maxdepth: 1
+
+   active_mm
+   balance
+   damon/index
+   free_page_reporting
+   highmem
+   ksm
+   frontswap
+   hmm
+   hwpoison
+   hugetlbfs_reserv
+   memory-model
+   mmu_notifier
+   numa
+   overcommit-accounting
+   page_frags
+   page_owner
+   page_table_check
+   remap_file_pages
+   split_page_table_lock
+   z3fold
+   zsmalloc
+
+TODOLIST:
+* arch_pgtable_helpers
+* free_page_reporting
+* hugetlbfs_reserv
+* page_migration
+* slub
+* transhuge
+* unevictable-lru
+* vmalloced-kernel-stacks
diff --git a/Documentation/translations/zh_CN/mm/ksm.rst b/Documentation/translations/zh_CN/mm/ksm.rst
new file mode 100644
index 000000000000..d1f82e857ad7
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/ksm.rst
@@ -0,0 +1,70 @@
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/mm/ksm.rst
+
+:翻译:
+
+   徐鑫 xu xin <xu.xin16@zte.com.cn>
+
+============
+内核同页合并
+============
+
+KSM 是一种节省内存的数据去重功能，由CONFIG_KSM=y启用，并在2.6.32版本时被添加
+到Linux内核。详见 ``mm/ksm.c`` 的实现，以及http://lwn.net/Articles/306704和
+https://lwn.net/Articles/330589
+
+KSM的用户空间的接口在Documentation/translations/zh_CN/admin-guide/mm/ksm.rst
+文档中有描述。
+
+设计
+====
+
+概述
+----
+
+概述内容请见mm/ksm.c文档中的“DOC: Overview”
+
+逆映射
+------
+KSM维护着稳定树中的KSM页的逆映射信息。
+
+当KSM页面的共享数小于 ``max_page_sharing`` 的虚拟内存区域(VMAs)时，则代表了
+KSM页的稳定树其中的节点指向了一个rmap_item结构体类型的列表。同时，这个KSM页
+的 ``page->mapping`` 指向了该稳定树节点。
+
+如果共享数超过了阈值，KSM将给稳定树添加第二个维度。稳定树就变成链接一个或多
+个稳定树"副本"的"链"。每个副本都保留KSM页的逆映射信息，其中 ``page->mapping``
+指向该"副本"。
+
+每个链以及链接到该链中的所有"副本"强制不变的是，它们代表了相同的写保护内存
+内容，尽管任中一个"副本"是由同一片内存区的不同的KSM复制页所指向的。
+
+这样一来，相比与无限的逆映射链表，稳定树的查找计算复杂性不受影响。但在稳定树
+本身中不能有重复的KSM页面内容仍然是强制要求。
+
+由 ``max_page_sharing`` 强制决定的数据去重限制是必要的，以此来避免虚拟内存
+rmap链表变得过大。rmap的遍历具有O(N)的复杂度，其中N是共享页面的rmap_项（即
+虚拟映射）的数量，而这个共享页面的节点数量又被 ``max_page_sharing`` 所限制。
+因此，这有效地将线性O(N)计算复杂度从rmap遍历中分散到不同的KSM页面上。ksmd进
+程在稳定节点"链"上的遍历也是O(N)，但这个N是稳定树"副本"的数量，而不是rmap项
+的数量，因此它对ksmd性能没有显著影响。实际上，最佳稳定树"副本"的候选节点将
+保留在"副本"列表的开头。
+
+``max_page_sharing`` 的值设置得高了会促使更快的内存合并（因为将有更少的稳定
+树副本排队进入稳定节点chain->hlist）和更高的数据去重系数，但代价是在交换、压
+缩、NUMA平衡和页面迁移过程中可能导致KSM页的最大rmap遍历速度较慢。
+
+``stable_node_dups/stable_node_chains`` 的比值还受 ``max_page_sharing`` 调控
+的影响，高比值可能意味着稳定节点dup中存在碎片，这可以通过在ksmd中引入碎片算
+法来解决，该算法将rmap项从一个稳定节点dup重定位到另一个稳定节点dup，以便释放
+那些仅包含极少rmap项的稳定节点"dup"，但这可能会增加ksmd进程的CPU使用率，并可
+能会减慢应用程序在KSM页面上的只读计算。
+
+KSM会定期扫描稳定节点"链"中链接的所有稳定树"副本"，以便删减过时了的稳定节点。
+这种扫描的频率由 ``stable_node_chains_prune_millisecs`` 这个sysfs 接口定义。
+
+参考
+====
+内核代码请见mm/ksm.c。
+涉及的函数(mm_slot  ksm_scan  stable_node  rmap_item)。
diff --git a/Documentation/translations/zh_CN/mm/memory-model.rst b/Documentation/translations/zh_CN/mm/memory-model.rst
new file mode 100644
index 000000000000..77ec149a970c
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/memory-model.rst
@@ -0,0 +1,135 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+:Original: Documentation/mm/memory-model.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+============
+物理内存模型
+============
+
+系统中的物理内存可以用不同的方式进行寻址。最简单的情况是，物理内存从地址0开
+始，跨越一个连续的范围，直到最大的地址。然而，这个范围可能包含CPU无法访问的
+小孔隙。那么，在完全不同的地址可能有几个连续的范围。而且，别忘了NUMA，即不
+同的内存库连接到不同的CPU。
+
+Linux使用两种内存模型中的一种对这种多样性进行抽象。FLATMEM和SPARSEM。每
+个架构都定义了它所支持的内存模型，默认的内存模型是什么，以及是否有可能手动
+覆盖该默认值。
+
+所有的内存模型都使用排列在一个或多个数组中的 `struct page` 来跟踪物理页
+帧的状态。
+
+无论选择哪种内存模型，物理页框号（PFN）和相应的 `struct page` 之间都存
+在一对一的映射关系。
+
+每个内存模型都定义了 :c:func:`pfn_to_page` 和 :c:func:`page_to_pfn`
+帮助函数，允许从PFN到 `struct page` 的转换，反之亦然。
+
+FLATMEM
+=======
+
+最简单的内存模型是FLATMEM。这个模型适用于非NUMA系统的连续或大部分连续的
+物理内存。
+
+在FLATMEM内存模型中，有一个全局的 `mem_map` 数组来映射整个物理内存。对
+于大多数架构，孔隙在 `mem_map` 数组中都有条目。与孔洞相对应的 `struct page`
+对象从未被完全初始化。
+
+为了分配 `mem_map` 数组，架构特定的设置代码应该调用free_area_init()函数。
+然而，在调用memblock_free_all()函数之前，映射数组是不能使用的，该函数
+将所有的内存交给页分配器。
+
+一个架构可能会释放 `mem_map` 数组中不包括实际物理页的部分。在这种情况下，特
+定架构的 :c:func:`pfn_valid` 实现应该考虑到 `mem_map` 中的孔隙。
+
+使用FLATMEM，PFN和 `struct page` 之间的转换是直接的。 `PFN - ARCH_PFN_OFFSET`
+是 `mem_map` 数组的一个索引。
+
+`ARCH_PFN_OFFSET` 定义了物理内存起始地址不同于0的系统的第一个页框号。
+
+SPARSEMEM
+=========
+
+SPARSEMEM是Linux中最通用的内存模型，它是唯一支持若干高级功能的内存模型，
+如物理内存的热插拔、非易失性内存设备的替代内存图和较大系统的内存图的延迟
+初始化。
+
+SPARSEMEM模型将物理内存显示为一个部分的集合。一个区段用mem_section结构
+体表示，它包含 `section_mem_map` ，从逻辑上讲，它是一个指向 `struct page`
+阵列的指针。然而，它被存储在一些其他的magic中，以帮助分区管理。区段的大小
+和最大区段数是使用 `SECTION_SIZE_BITS` 和 `MAX_PHYSMEM_BITS` 常量
+来指定的，这两个常量是由每个支持SPARSEMEM的架构定义的。 `MAX_PHYSMEM_BITS`
+是一个架构所支持的物理地址的实际宽度，而 `SECTION_SIZE_BITS` 是一个任
+意的值。
+
+最大的段数表示为 `NR_MEM_SECTIONS` ，定义为
+
+.. math::
+
+   NR\_MEM\_SECTIONS = 2 ^ {(MAX\_PHYSMEM\_BITS - SECTION\_SIZE\_BITS)}
+
+`mem_section` 对象被安排在一个叫做 `mem_sections` 的二维数组中。这个数组的
+大小和位置取决于 `CONFIG_SPARSEM_EXTREME` 和可能的最大段数:
+
+* 当 `CONFIG_SPARSEMEM_EXTREME` 被禁用时， `mem_sections` 数组是静态的，有
+  `NR_MEM_SECTIONS` 行。每一行持有一个 `mem_section` 对象。
+* 当 `CONFIG_SPARSEMEM_EXTREME` 被启用时， `mem_sections` 数组被动态分配。
+  每一行包含价值 `PAGE_SIZE` 的 `mem_section` 对象，行数的计算是为了适应所有的
+  内存区。
+
+架构设置代码应该调用sparse_init()来初始化内存区和内存映射。
+
+通过SPARSEMEM，有两种可能的方式将PFN转换为相应的 `struct page` --"classic sparse"和
+ "sparse vmemmap"。选择是在构建时进行的，它由 `CONFIG_SPARSEMEM_VMEMMAP` 的
+ 值决定。
+
+Classic sparse在page->flags中编码了一个页面的段号，并使用PFN的高位来访问映射该页
+框的段。在一个区段内，PFN是指向页数组的索引。
+
+Sparse vmemmapvmemmap使用虚拟映射的内存映射来优化pfn_to_page和page_to_pfn操
+作。有一个全局的 `struct page *vmemmap` 指针，指向一个虚拟连续的 `struct page`
+对象阵列。PFN是该数组的一个索引，`struct page` 从 `vmemmap` 的偏移量是该页的PFN。
+
+为了使用vmemmap，一个架构必须保留一个虚拟地址的范围，以映射包含内存映射的物理页，并
+确保 `vmemmap`指向该范围。此外，架构应该实现 :c:func:`vmemmap_populate` 方法，
+它将分配物理内存并为虚拟内存映射创建页表。如果一个架构对vmemmap映射没有任何特殊要求，
+它可以使用通用内存管理提供的默认 :c:func:`vmemmap_populate_basepages`。
+
+虚拟映射的内存映射允许将持久性内存设备的 `struct page` 对象存储在这些设备上预先分
+配的存储中。这种存储用vmem_altmap结构表示，最终通过一长串的函数调用传递给
+vmemmap_populate()。vmemmap_populate()实现可以使用 `vmem_altmap` 和
+:c:func:`vmemmap_alloc_block_buf` 助手来分配持久性内存设备上的内存映射。
+
+ZONE_DEVICE
+===========
+`ZONE_DEVICE` 设施建立在 `SPARSEM_VMEMMAP` 之上，为设备驱动识别的物理地址范
+围提供 `struct page` `mem_map` 服务。 `ZONE_DEVICE` 的 "设备" 方面与以下
+事实有关：这些地址范围的页面对象从未被在线标记过，而且必须对设备进行引用，而不仅仅
+是页面，以保持内存被“锁定”以便使用。 `ZONE_DEVICE` ，通过 :c:func:`devm_memremap_pages` ，
+为给定的pfns范围执行足够的内存热插拔来开启 :c:func:`pfn_to_page`，
+:c:func:`page_to_pfn`, ，和 :c:func:`get_user_pages` 服务。由于页面引
+用计数永远不会低于1，所以页面永远不会被追踪为空闲内存，页面的 `struct list_head lru`
+空间被重新利用，用于向映射该内存的主机设备/驱动程序进行反向引用。
+
+虽然 `SPARSEMEM` 将内存作为一个区段的集合，可以选择收集并合成内存块，但
+`ZONE_DEVICE` 用户需要更小的颗粒度来填充 `mem_map` 。鉴于 `ZONE_DEVICE`
+内存从未被在线标记，因此它的内存范围从未通过sysfs内存热插拔api暴露在内存块边界
+上。这个实现依赖于这种缺乏用户接口的约束，允许子段大小的内存范围被指定给
+:c:func:`arch_add_memory` ，即内存热插拔的上半部分。子段支持允许2MB作为
+:c:func:`devm_memremap_pages` 的跨架构通用对齐颗粒度。
+
+`ZONE_DEVICE` 的用户是:
+
+* pmem: 通过DAX映射将平台持久性内存作为直接I/O目标使用。
+
+* hmm: 用 `->page_fault()` 和 `->page_free()` 事件回调扩展 `ZONE_DEVICE` ，
+  以允许设备驱动程序协调与设备内存相关的内存管理事件，通常是GPU内存。参见Documentation/mm/hmm.rst。
+
+* p2pdma: 创建 `struct page` 对象，允许PCI/E拓扑结构中的peer设备协调它们之间的
+  直接DMA操作，即绕过主机内存。
diff --git a/Documentation/translations/zh_CN/mm/mmu_notifier.rst b/Documentation/translations/zh_CN/mm/mmu_notifier.rst
new file mode 100644
index 000000000000..ce3664d1a410
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/mmu_notifier.rst
@@ -0,0 +1,97 @@
+:Original: Documentation/mm/mmu_notifier.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+
+什么时候需要页表锁内通知？
+==========================
+
+当清除一个pte/pmd时，我们可以选择通过在页表锁下（通知版的\*_clear_flush调用
+mmu_notifier_invalidate_range）通知事件。但这种通知并不是在所有情况下都需要的。
+
+对于二级TLB（非CPU TLB），如IOMMU TLB或设备TLB（当设备使用类似ATS/PASID的东西让
+IOMMU走CPU页表来访问进程的虚拟地址空间）。只有两种情况需要在清除pte/pmd时在持有页
+表锁的同时通知这些二级TLB：
+
+  A) 在mmu_notifier_invalidate_range_end()之前，支持页的地址被释放。
+  B) 一个页表项被更新以指向一个新的页面（COW，零页上的写异常，__replace_page()，...）。
+
+情况A很明显，你不想冒风险让设备写到一个现在可能被一些完全不同的任务使用的页面。
+
+情况B更加微妙。为了正确起见，它需要按照以下序列发生:
+
+  - 上页表锁
+  - 清除页表项并通知 ([pmd/pte]p_huge_clear_flush_notify())
+  - 设置页表项以指向新页
+
+如果在设置新的pte/pmd值之前，清除页表项之后没有进行通知，那么你就会破坏设备的C11或
+C++11等内存模型。
+
+考虑以下情况（设备使用类似于ATS/PASID的功能）。
+
+两个地址addrA和addrB，这样|addrA - addrB| >= PAGE_SIZE，我们假设它们是COW的
+写保护（B的其他情况也适用）。
+
+::
+
+ [Time N] --------------------------------------------------------------------
+ CPU-thread-0  {尝试写到addrA}
+ CPU-thread-1  {尝试写到addrB}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {读取addrA并填充设备TLB}
+ DEV-thread-2  {读取addrB并填充设备TLB}
+ [Time N+1] ------------------------------------------------------------------
+ CPU-thread-0  {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}}
+ CPU-thread-1  {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+2] ------------------------------------------------------------------
+ CPU-thread-0  {COW_step1: {更新页表以指向addrA的新页}}
+ CPU-thread-1  {COW_step1: {更新页表以指向addrB的新页}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+3] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {preempted}
+ CPU-thread-2  {写入addrA，这是对新页面的写入}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+3] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {preempted}
+ CPU-thread-2  {}
+ CPU-thread-3  {写入addrB，这是一个写入新页的过程}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+4] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+5] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {从旧页中读取addrA}
+ DEV-thread-2  {从新页面读取addrB}
+
+所以在这里，因为在N+2的时候，清空页表项没有和通知一起作废二级TLB，设备在看到addrA的新值之前
+就看到了addrB的新值。这就破坏了设备的总内存序。
+
+当改变一个pte的写保护或指向一个新的具有相同内容的写保护页（KSM）时，将mmu_notifier_invalidate_range
+调用延迟到页表锁外的mmu_notifier_invalidate_range_end()是可以的。即使做页表更新的线程
+在释放页表锁后但在调用mmu_notifier_invalidate_range_end()前被抢占，也是如此。
diff --git a/Documentation/translations/zh_CN/mm/numa.rst b/Documentation/translations/zh_CN/mm/numa.rst
new file mode 100644
index 000000000000..b15cfeeb6dfb
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/numa.rst
@@ -0,0 +1,101 @@
+:Original: Documentation/mm/numa.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+始于1999年11月，作者： <kanoj@sgi.com>
+
+==========================
+何为非统一内存访问(NUMA)？
+==========================
+
+这个问题可以从几个视角来回答：硬件观点和Linux软件视角。
+
+从硬件角度看，NUMA系统是一个由多个组件或装配组成的计算机平台，每个组件可能包含0个或更多的CPU、
+本地内存和/或IO总线。为了简洁起见，并将这些物理组件/装配的硬件视角与软件抽象区分开来，我们在
+本文中称这些组件/装配为“单元”。
+
+每个“单元”都可以看作是系统的一个SMP[对称多处理器]子集——尽管独立的SMP系统所需的一些组件可能
+不会在任何给定的单元上填充。NUMA系统的单元通过某种系统互连连接在一起——例如，交叉开关或点对点
+链接是NUMA系统互连的常见类型。这两种类型的互连都可以聚合起来，以创建NUMA平台，其中的单元与其
+他单元有多个距离。
+
+对于Linux，感兴趣的NUMA平台主要是所谓的缓存相干NUMA--简称ccNUMA系统系统。在ccNUMA系统中，
+所有的内存都是可见的，并且可以从连接到任何单元的任何CPU中访问，缓存一致性是由处理器缓存和/或
+系统互连在硬件中处理。
+
+内存访问时间和有效的内存带宽取决于包含CPU的单元或进行内存访问的IO总线距离包含目标内存的单元
+有多远。例如，连接到同一单元的CPU对内存的访问将比访问其他远程单元的内存经历更快的访问时间和
+更高的带宽。 NUMA平台可以在任何给定单元上访问多种远程距离的（其他）单元。
+
+平台供应商建立NUMA系统并不只是为了让软件开发人员的生活变得有趣。相反，这种架构是提供可扩展
+内存带宽的一种手段。然而，为了实现可扩展的内存带宽，系统和应用软件必须安排大部分的内存引用
+[cache misses]到“本地”内存——同一单元的内存，如果有的话——或者到最近的有内存的单元。
+
+这就自然而然有了Linux软件对NUMA系统的视角:
+
+Linux将系统的硬件资源划分为多个软件抽象，称为“节点”。Linux将节点映射到硬件平台的物理单元
+上，对一些架构的细节进行了抽象。与物理单元一样，软件节点可能包含0或更多的CPU、内存和/或IO
+总线。同样，对“较近”节点的内存访问——映射到较近单元的节点——通常会比对较远单元的访问经历更快
+的访问时间和更高的有效带宽。
+
+对于一些架构，如x86，Linux将“隐藏”任何代表没有内存连接的物理单元的节点，并将连接到该单元
+的任何CPU重新分配到代表有内存的单元的节点上。因此，在这些架构上，我们不能假设Linux将所有
+的CPU与一个给定的节点相关联，会看到相同的本地内存访问时间和带宽。
+
+此外，对于某些架构，同样以x86为例，Linux支持对额外节点的仿真。对于NUMA仿真，Linux会将现
+有的节点或者非NUMA平台的系统内存分割成多个节点。每个模拟的节点将管理底层单元物理内存的一部
+分。NUMA仿真对于在非NUMA平台上测试NUMA内核和应用功能是非常有用的，当与cpusets一起使用时，
+可以作为一种内存资源管理机制。[见 Documentation/admin-guide/cgroup-v1/cpusets.rst]
+
+对于每个有内存的节点，Linux构建了一个独立的内存管理子系统，有自己的空闲页列表、使用中页列表、
+使用统计和锁来调解访问。此外，Linux为每个内存区[DMA、DMA32、NORMAL、HIGH_MEMORY、MOVABLE
+中的一个或多个]构建了一个有序的“区列表”。zonelist指定了当一个选定的区/节点不能满足分配请求
+时要访问的区/节点。当一个区没有可用的内存来满足请求时，这种情况被称为“overflow 溢出”或
+“fallback 回退”。
+
+由于一些节点包含多个包含不同类型内存的区，Linux必须决定是否对区列表进行排序，使分配回退到不同
+节点上的相同区类型，或同一节点上的不同区类型。这是一个重要的考虑因素，因为有些区，如DMA或DMA32，
+代表了相对稀缺的资源。Linux选择了一个默认的Node ordered zonelist。这意味着在使用按NUMA距
+离排序的远程节点之前，它会尝试回退到同一节点的其他分区。
+
+默认情况下，Linux会尝试从执行请求的CPU被分配到的节点中满足内存分配请求。具体来说，Linux将试
+图从请求来源的节点的适当分区列表中的第一个节点进行分配。这被称为“本地分配”。如果“本地”节点不能
+满足请求，内核将检查所选分区列表中其他节点的区域，寻找列表中第一个能满足请求的区域。
+
+本地分配将倾向于保持对分配的内存的后续访问 “本地”的底层物理资源和系统互连——只要内核代表其分配
+一些内存的任务后来不从该内存迁移。Linux调度器知道平台的NUMA拓扑结构——体现在“调度域”数据结构
+中[见 Documentation/scheduler/sched-domains.rst]——并且调度器试图尽量减少任务迁移到遥
+远的调度域中。然而，调度器并没有直接考虑到任务的NUMA足迹。因此，在充分不平衡的情况下，任务可
+以在节点之间迁移，远离其初始节点和内核数据结构。
+
+系统管理员和应用程序设计者可以使用各种CPU亲和命令行接口，如taskset(1)和numactl(1)，以及程
+序接口，如sched_setaffinity(2)，来限制任务的迁移，以改善NUMA定位。此外，人们可以使用
+Linux NUMA内存策略修改内核的默认本地分配行为。 [见
+:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`].
+
+系统管理员可以使用控制组和CPUsets限制非特权用户在调度或NUMA命令和功能中可以指定的CPU和节点
+的内存。 [见 Documentation/admin-guide/cgroup-v1/cpusets.rst]
+
+在不隐藏无内存节点的架构上，Linux会在分区列表中只包括有内存的区域[节点]。这意味着对于一个无
+内存的节点，“本地内存节点”——CPU节点的分区列表中的第一个区域的节点——将不是节点本身。相反，它
+将是内核在建立分区列表时选择的离它最近的有内存的节点。所以，默认情况下，本地分配将由内核提供
+最近的可用内存来完成。这是同一机制的结果，该机制允许这种分配在一个包含内存的节点溢出时回退到
+其他附近的节点。
+
+一些内核分配不希望或不能容忍这种分配回退行为。相反，他们想确保他们从指定的节点获得内存，或者
+得到通知说该节点没有空闲内存。例如，当一个子系统分配每个CPU的内存资源时，通常是这种情况。
+
+一个典型的分配模式是使用内核的numa_node_id()或CPU_to_node()函数获得“当前CPU”所在节点的
+节点ID，然后只从返回的节点ID请求内存。当这样的分配失败时，请求的子系统可以恢复到它自己的回退
+路径。板块内核内存分配器就是这样的一个例子。或者，子系统可以选择在分配失败时禁用或不启用自己。
+内核分析子系统就是这样的一个例子。
+
+如果架构支持——不隐藏无内存节点，那么连接到无内存节点的CPU将总是产生回退路径的开销，或者一些
+子系统如果试图完全从无内存的节点分配内存，将无法初始化。为了透明地支持这种架构，内核子系统可
+以使用numa_mem_id()或cpu_to_mem()函数来定位调用或指定CPU的“本地内存节点”。同样，这是同
+一个节点，默认的本地页分配将从这个节点开始尝试。
diff --git a/Documentation/translations/zh_CN/mm/overcommit-accounting.rst b/Documentation/translations/zh_CN/mm/overcommit-accounting.rst
new file mode 100644
index 000000000000..d8452d8b7fbb
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/overcommit-accounting.rst
@@ -0,0 +1,86 @@
+:Original: Documentation/mm/overcommit-accounting.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+
+==============
+超量使用审计
+==============
+
+Linux内核支持下列超量使用处理模式
+
+0
+	启发式超量使用处理。拒绝明显的地址空间超量使用。用于一个典型的系统。
+	它确保严重的疯狂分配失败，同时允许超量使用以减少swap的使用。在这种模式下，
+	允许root分配稍多的内存。这是默认的。
+1
+	总是超量使用。适用于一些科学应用。经典的例子是使用稀疏数组的代码，只是依赖
+	几乎完全由零页组成的虚拟内存
+
+2
+	不超量使用。系统提交的总地址空间不允许超过swap+一个可配置的物理RAM的数量
+	（默认为50%）。根据你使用的数量，在大多数情况下，这意味着一个进程在访问页面时
+	不会被杀死，但会在内存分配上收到相应的错误。
+
+	对于那些想保证他们的内存分配在未来可用而又不需要初始化每一个页面的应用程序来说
+	是很有用的。
+
+超量使用策略是通过sysctl  `vm.overcommit_memory` 设置的。
+
+可以通过 `vm.overcommit_ratio` （百分比）或 `vm.overcommit_kbytes` （绝对值）
+来设置超限数量。这些只有在 `vm.overcommit_memory` 被设置为2时才有效果。
+
+在 ``/proc/meminfo`` 中可以分别以CommitLimit和Committed_AS的形式查看当前
+的超量使用和提交量。
+
+陷阱
+====
+
+C语言的堆栈增长是一个隐含的mremap。如果你想得到绝对的保证，并在接近边缘的地方运行，
+你 **必须** 为你认为你需要的最大尺寸的堆栈进行mmap。对于典型的堆栈使用来说，这并
+不重要，但如果你真的非常关心的话，这就是一个值得关注的案例。
+
+
+在模式2中，MAP_NORESERVE标志被忽略。
+
+
+它是如何工作的
+==============
+
+超量使用是基于以下规则
+
+对于文件映射
+	| SHARED or READ-only	-	0 cost (该文件是映射而不是交换)
+	| PRIVATE WRITABLE	-	每个实例的映射大小
+
+对于匿名或者 ``/dev/zero`` 映射
+	| SHARED			-	映射的大小
+	| PRIVATE READ-only	-	0 cost (但作用不大)
+	| PRIVATE WRITABLE	-	每个实例的映射大小
+
+额外的计数
+	| 通过mmap制作可写副本的页面
+	| 从同一池中提取的shmfs内存
+
+状态
+====
+
+*	我们核算mmap内存映射
+*	我们核算mprotect在提交中的变化
+*	我们核算mremap的大小变化
+*	我们的审计 brk
+*	审计munmap
+*	我们在/proc中报告commit 状态
+*	核对并检查分叉的情况
+*	审查堆栈处理/执行中的构建
+*	叙述SHMfs的情况
+*	实现实际限制的执行
+
+待续
+====
+*	ptrace 页计数（这很难）。
diff --git a/Documentation/translations/zh_CN/mm/page_frags.rst b/Documentation/translations/zh_CN/mm/page_frags.rst
new file mode 100644
index 000000000000..320952ca93af
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/page_frags.rst
@@ -0,0 +1,38 @@
+:Original: Documentation/mm/page_frag.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+========
+页面片段
+========
+
+一个页面片段是一个任意长度的任意偏移的内存区域，它位于一个0或更高阶的复合页面中。
+该页中的多个碎片在该页的引用计数器中被单独计算。
+
+page_frag函数，page_frag_alloc和page_frag_free，为页面片段提供了一个简单
+的分配框架。这被网络堆栈和网络设备驱动使用，以提供一个内存的支持区域，作为
+sk_buff->head使用，或者用于skb_shared_info的 “frags” 部分。
+
+为了使用页面片段API，需要一个支持页面片段的缓冲区。这为碎片分配提供了一个中心点，
+并允许多个调用使用一个缓存的页面。这样做的好处是可以避免对get_page的多次调用，
+这在分配时开销可能会很大。然而，由于这种缓存的性质，要求任何对缓存的调用都要受到每
+个CPU的限制，或者每个CPU的限制，并在执行碎片分配时强制禁止中断。
+
+网络堆栈在每个CPU使用两个独立的缓存来处理碎片分配。netdev_alloc_cache被使用
+netdev_alloc_frag和__netdev_alloc_skb调用的调用者使用。napi_alloc_cache
+被调用__napi_alloc_frag和__napi_alloc_skb的调用者使用。这两个调用的主要区别是
+它们可能被调用的环境。“netdev” 前缀的函数可以在任何上下文中使用，因为这些函数
+将禁用中断，而 ”napi“ 前缀的函数只可以在softirq上下文中使用。
+
+许多网络设备驱动程序使用类似的方法来分配页面片段，但页面片段是在环或描述符级别上
+缓存的。为了实现这些情况，有必要提供一种拆解页面缓存的通用方法。出于这个原因，
+__page_frag_cache_drain被实现了。它允许通过一次调用从一个页面释放多个引用。
+这样做的好处是，它允许清理被添加到一个页面的多个引用，以避免每次分配都调用
+get_page。
+
+Alexander Duyck，2016年11月29日。
diff --git a/Documentation/translations/zh_CN/mm/page_owner.rst b/Documentation/translations/zh_CN/mm/page_owner.rst
new file mode 100644
index 000000000000..03d9e613094a
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/page_owner.rst
@@ -0,0 +1,116 @@
+:Original: Documentation/mm/page_owner.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+================================
+page owner: 跟踪谁分配的每个页面
+================================
+
+概述
+====
+
+page owner是用来追踪谁分配的每一个页面。它可以用来调试内存泄漏或找到内存占用者。
+当分配发生时，有关分配的信息，如调用堆栈和页面的顺序被存储到每个页面的特定存储中。
+当我们需要了解所有页面的状态时，我们可以获得并分析这些信息。
+
+尽管我们已经有了追踪页面分配/释放的tracepoint，但用它来分析谁分配的每个页面是
+相当复杂的。我们需要扩大跟踪缓冲区，以防止在用户空间程序启动前出现重叠。而且，启
+动的程序会不断地将跟踪缓冲区转出，供以后分析，这将会改变系统的行为，会产生更多的
+可能性，而不是仅仅保留在内存中，所以不利于调试。
+
+页面所有者也可以用于各种目的。例如，可以通过每个页面的gfp标志信息获得精确的碎片
+统计。如果启用了page owner，它就已经实现并激活了。我们非常欢迎其他用途。
+
+page owner在默认情况下是禁用的。所以，如果你想使用它，你需要在你的启动cmdline
+中加入"page_owner=on"。如果内核是用page owner构建的，并且由于没有启用启动
+选项而在运行时禁用page owner，那么运行时的开销是很小的。如果在运行时禁用，它不
+需要内存来存储所有者信息，所以没有运行时内存开销。而且，页面所有者在页面分配器的
+热路径中只插入了两个不可能的分支，如果不启用，那么分配就会像没有页面所有者的内核
+一样进行。这两个不可能的分支应该不会影响到分配的性能，特别是在静态键跳转标签修补
+功能可用的情况下。以下是由于这个功能而导致的内核代码大小的变化。
+
+- 没有page owner::
+
+   text    data     bss     dec     hex filename
+   48392   2333     644   51369    c8a9 mm/page_alloc.o
+
+- 有page owner::
+
+   text    data     bss     dec     hex filename
+   48800   2445     644   51889    cab1 mm/page_alloc.o
+   6662     108      29    6799    1a8f mm/page_owner.o
+   1025       8       8    1041     411 mm/page_ext.o
+
+虽然总共增加了8KB的代码，但page_alloc.o增加了520字节，其中不到一半是在hotpath
+中。构建带有page owner的内核，并在需要时打开它，将是调试内核内存问题的最佳选择。
+
+有一个问题是由实现细节引起的。页所有者将信息存储到struct page扩展的内存中。这
+个内存的初始化时间比稀疏内存系统中的页面分配器启动的时间要晚一些，所以，在初始化
+之前，许多页面可以被分配，但它们没有所有者信息。为了解决这个问题，这些早期分配的
+页面在初始化阶段被调查并标记为分配。虽然这并不意味着它们有正确的所有者信息，但至
+少，我们可以更准确地判断该页是否被分配。在2GB内存的x86-64虚拟机上，有13343
+个早期分配的页面被捕捉和标记，尽管它们大部分是由结构页扩展功能分配的。总之，在这
+之后，没有任何页面处于未追踪状态。
+
+使用方法
+========
+
+1) 构建用户空间的帮助::
+
+	cd tools/vm
+	make page_owner_sort
+
+2) 启用page owner: 添加 "page_owner=on" 到 boot cmdline.
+
+3) 做你想调试的工作。
+
+4) 分析来自页面所有者的信息::
+
+	cat /sys/kernel/debug/page_owner > page_owner_full.txt
+	./page_owner_sort page_owner_full.txt sorted_page_owner.txt
+
+   ``page_owner_full.txt`` 的一般输出情况如下(输出信息无翻译价值)::
+
+	Page allocated via order XXX, ...
+	PFN XXX ...
+	// Detailed stack
+
+	Page allocated via order XXX, ...
+	PFN XXX ...
+	// Detailed stack
+
+   ``page_owner_sort`` 工具忽略了 ``PFN`` 行，将剩余的行放在buf中，使用regexp提
+   取页序值，计算buf的次数和页数，最后根据参数进行排序。
+
+   在 ``sorted_page_owner.txt`` 中可以看到关于谁分配了每个页面的结果。一般输出::
+
+	XXX times, XXX pages:
+	Page allocated via order XXX, ...
+	// Detailed stack
+
+   默认情况下， ``page_owner_sort`` 是根据buf的时间来排序的。如果你想
+   按buf的页数排序，请使用-m参数。详细的参数是:
+
+   基本函数:
+
+	Sort:
+		-a		按内存分配时间排序
+		-m		按总内存排序
+		-p		按pid排序。
+		-P		按tgid排序。
+		-r		按内存释放时间排序。
+		-s		按堆栈跟踪排序。
+		-t		按时间排序（默认）。
+
+   其它函数:
+
+	Cull:
+		-c		通过比较堆栈跟踪而不是总块来进行剔除。
+
+	Filter:
+		-f		过滤掉内存已被释放的块的信息。
diff --git a/Documentation/translations/zh_CN/mm/page_table_check.rst b/Documentation/translations/zh_CN/mm/page_table_check.rst
new file mode 100644
index 000000000000..e8077310a76c
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/page_table_check.rst
@@ -0,0 +1,56 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+:Original: Documentation/mm/page_table_check.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+========
+页表检查
+========
+
+概述
+====
+
+页表检查允许通过确保防止某些类型的内存损坏来强化内核。
+
+当新的页面可以从用户空间访问时，页表检查通过将它们的页表项（PTEs PMD等）添加到页表中来执行额外
+的验证。
+
+在检测到损坏的情况下，内核会被崩溃。页表检查有一个小的性能和内存开销。因此，它在默认情况下是禁用
+的，但是在额外的加固超过性能成本的系统上，可以选择启用。另外，由于页表检查是同步的，它可以帮助调
+试双映射内存损坏问题，在错误的映射发生时崩溃内核，而不是在内存损坏错误发生后内核崩溃。
+
+双重映射检测逻辑
+================
+
++-------------------+-------------------+-------------------+------------------+
+| Current Mapping   | New mapping       | Permissions       | Rule             |
++===================+===================+===================+==================+
+| Anonymous         | Anonymous         | Read              | Allow            |
++-------------------+-------------------+-------------------+------------------+
+| Anonymous         | Anonymous         | Read / Write      | Prohibit         |
++-------------------+-------------------+-------------------+------------------+
+| Anonymous         | Named             | Any               | Prohibit         |
++-------------------+-------------------+-------------------+------------------+
+| Named             | Anonymous         | Any               | Prohibit         |
++-------------------+-------------------+-------------------+------------------+
+| Named             | Named             | Any               | Allow            |
++-------------------+-------------------+-------------------+------------------+
+
+启用页表检查
+============
+
+用以下方法构建内核:
+
+- PAGE_TABLE_CHECK=y
+  注意，它只能在ARCH_SUPPORTS_PAGE_TABLE_CHECK可用的平台上启用。
+
+- 使用 "page_table_check=on" 内核参数启动。
+
+可以选择用PAGE_TABLE_CHECK_ENFORCED来构建内核，以便在没有额外的内核参数的情况下获得页表
+支持。
diff --git a/Documentation/translations/zh_CN/mm/remap_file_pages.rst b/Documentation/translations/zh_CN/mm/remap_file_pages.rst
new file mode 100644
index 000000000000..31e0c54dc36f
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/remap_file_pages.rst
@@ -0,0 +1,32 @@
+:Original: Documentation/mm/remap_file_pages.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+==============================
+remap_file_pages()系统调用
+==============================
+
+remap_file_pages()系统调用被用来创建一个非线性映射，也就是说，在这个映射中，
+文件的页面被无序映射到内存中。使用remap_file_pages()比重复调用mmap(2)的好
+处是，前者不需要内核创建额外的VMA（虚拟内存区）数据结构。
+
+支持非线性映射需要在内核虚拟内存子系统中编写大量的non-trivial的代码，包括热
+路径。另外，为了使非线性映射工作，内核需要一种方法来区分正常的页表项和带有文件
+偏移的项（pte_file）。内核为达到这个目的在PTE中保留了标志。PTE标志是稀缺资
+源，特别是在某些CPU架构上。如果能腾出这个标志用于其他用途就更好了。
+
+幸运的是，在生活中并没有很多remap_file_pages()的用户。只知道有一个企业的RDBMS
+实现在32位系统上使用这个系统调用来映射比32位虚拟地址空间线性尺寸更大的文件。
+由于64位系统的广泛使用，这种使用情况已经不重要了。
+
+syscall被废弃了，现在用一个模拟来代替它。仿真会创建新的VMA，而不是非线性映射。
+对于remap_file_pages()的少数用户来说，它的工作速度会变慢，但ABI被保留了。
+
+仿真的一个副作用（除了性能之外）是，由于额外的VMA，用户可以更容易达到
+vm.max_map_count的限制。关于限制的更多细节，请参见DEFAULT_MAX_MAP_COUNT
+的注释。
diff --git a/Documentation/translations/zh_CN/mm/split_page_table_lock.rst b/Documentation/translations/zh_CN/mm/split_page_table_lock.rst
new file mode 100644
index 000000000000..4fb7aa666037
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/split_page_table_lock.rst
@@ -0,0 +1,96 @@
+:Original: Documentation/mm/split_page_table_lock.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+=================================
+分页表锁（split page table lock）
+=================================
+
+最初，mm->page_table_lock spinlock保护了mm_struct的所有页表。但是这种方
+法导致了多线程应用程序的缺页异常可扩展性差，因为对锁的争夺很激烈。为了提高可扩
+展性，我们引入了分页表锁。
+
+有了分页表锁，我们就有了单独的每张表锁来顺序化对表的访问。目前，我们对PTE和
+PMD表使用分页锁。对高层表的访问由mm->page_table_lock保护。
+
+有一些辅助工具来锁定/解锁一个表和其他访问器函数:
+
+ - pte_offset_map_lock()
+	映射pte并获取PTE表锁，返回所取锁的指针；
+ - pte_unmap_unlock()
+	解锁和解映射PTE表；
+ - pte_alloc_map_lock()
+	如果需要的话，分配PTE表并获取锁，如果分配失败，返回已获取的锁的指针
+	或NULL；
+ - pte_lockptr()
+	返回指向PTE表锁的指针；
+ - pmd_lock()
+	取得PMD表锁，返回所取锁的指针。
+ - pmd_lockptr()
+	返回指向PMD表锁的指针；
+
+如果CONFIG_SPLIT_PTLOCK_CPUS（通常为4）小于或等于NR_CPUS，则在编译
+时启用PTE表的分页表锁。如果分页锁被禁用，所有的表都由mm->page_table_lock
+来保护。
+
+如果PMD表启用了分页锁，并且架构支持它，那么PMD表的分页锁就会被启用（见
+下文）。
+
+Hugetlb 和分页表锁
+==================
+
+Hugetlb可以支持多种页面大小。我们只对PMD级别使用分页锁，但不对PUD使用。
+
+Hugetlb特定的辅助函数:
+
+ - huge_pte_lock()
+	对PMD_SIZE页面采取pmd分割锁，否则mm->page_table_lock；
+ - huge_pte_lockptr()
+	返回指向表锁的指针。
+
+架构对分页表锁的支持
+====================
+
+没有必要特别启用PTE分页表锁：所有需要的东西都由pgtable_pte_page_ctor()
+和pgtable_pte_page_dtor()完成，它们必须在PTE表分配/释放时被调用。
+
+确保架构不使用slab分配器来分配页表：slab使用page->slab_cache来分配其页
+面。这个区域与page->ptl共享存储。
+
+PMD分页锁只有在你有两个以上的页表级别时才有意义。
+
+启用PMD分页锁需要在PMD表分配时调用pgtable_pmd_page_ctor()，在释放时调
+用pgtable_pmd_page_dtor()。
+
+分配通常发生在pmd_alloc_one()中，释放发生在pmd_free()和pmd_free_tlb()
+中，但要确保覆盖所有的PMD表分配/释放路径：即X86_PAE在pgd_alloc()中预先
+分配一些PMD。
+
+一切就绪后，你可以设置CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK。
+
+注意：pgtable_pte_page_ctor()和pgtable_pmd_page_ctor()可能失败--必
+须正确处理。
+
+page->ptl
+=========
+
+page->ptl用于访问分割页表锁，其中'page'是包含该表的页面struct page。它
+与page->private（以及union中的其他几个字段）共享存储。
+
+为了避免增加struct page的大小并获得最佳性能，我们使用了一个技巧:
+
+ - 如果spinlock_t适合于long，我们使用page->ptr作为spinlock，这样我们
+   就可以避免间接访问并节省一个缓存行。
+ - 如果spinlock_t的大小大于long的大小，我们使用page->ptl作为spinlock_t
+   的指针并动态分配它。这允许在启用DEBUG_SPINLOCK或DEBUG_LOCK_ALLOC的
+   情况下使用分页锁，但由于间接访问而多花了一个缓存行。
+
+PTE表的spinlock_t分配在pgtable_pte_page_ctor()中，PMD表的spinlock_t
+分配在pgtable_pmd_page_ctor()中。
+
+请不要直接访问page->ptl - -使用适当的辅助函数。
diff --git a/Documentation/translations/zh_CN/mm/z3fold.rst b/Documentation/translations/zh_CN/mm/z3fold.rst
new file mode 100644
index 000000000000..9569a6d88270
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/z3fold.rst
@@ -0,0 +1,31 @@
+:Original: Documentation/mm/z3fold.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+======
+z3fold
+======
+
+z3fold是一个专门用于存储压缩页的分配器。它被设计为每个物理页最多可以存储三个压缩页。
+它是zbud的衍生物，允许更高的压缩率，保持其前辈的简单性和确定性。
+
+z3fold和zbud的主要区别是:
+
+* 与zbud不同的是，z3fold允许最大的PAGE_SIZE分配。
+* z3fold在其页面中最多可以容纳3个压缩页面
+* z3fold本身没有输出任何API，因此打算通过zpool的API来使用
+
+为了保持确定性和简单性，z3fold，就像zbud一样，总是在每页存储一个整数的压缩页，但是
+它最多可以存储3页，不像zbud最多可以存储2页。因此压缩率达到2.7倍左右，而zbud的压缩
+率是1.7倍左右。
+
+不像zbud（但也像zsmalloc），z3fold_alloc()那样不返回一个可重复引用的指针。相反，它
+返回一个无符号长句柄，它编码了被分配对象的实际位置。
+
+保持有效的压缩率接近于zsmalloc，z3fold不依赖于MMU的启用，并提供更可预测的回收行
+为，这使得它更适合于小型和反应迅速的系统。
diff --git a/Documentation/translations/zh_CN/mm/zsmalloc.rst b/Documentation/translations/zh_CN/mm/zsmalloc.rst
new file mode 100644
index 000000000000..b5596ea08ae4
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/zsmalloc.rst
@@ -0,0 +1,78 @@
+:Original: Documentation/mm/zs_malloc.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+========
+zsmalloc
+========
+
+这个分配器是为与zram一起使用而设计的。因此，该分配器应该在低内存条件下工作良好。特别是，
+它从未尝试过higher order页面的分配，这在内存压力下很可能会失败。另一方面，如果我们只
+是使用单（0-order）页，它将遭受非常高的碎片化 - 任何大小为PAGE_SIZE/2或更大的对象将
+占据整个页面。这是其前身（xvmalloc）的主要问题之一。
+
+为了克服这些问题，zsmalloc分配了一堆0-order页面，并使用各种"struct page"字段将它
+们链接起来。这些链接的页面作为一个单一的higher order页面，即一个对象可以跨越0-order
+页面的边界。代码将这些链接的页面作为一个实体，称为zspage。
+
+为了简单起见，zsmalloc只能分配大小不超过PAGE_SIZE的对象，因为这满足了所有当前用户的
+要求（在最坏的情况下，页面是不可压缩的，因此以"原样"即未压缩的形式存储）。对于大于这
+个大小的分配请求，会返回失败（见zs_malloc）。
+
+此外，zs_malloc()并不返回一个可重复引用的指针。相反，它返回一个不透明的句柄（无符号
+长），它编码了被分配对象的实际位置。这种间接性的原因是zsmalloc并不保持zspages的永久
+映射，因为这在32位系统上会导致问题，因为内核空间映射的VA区域非常小。因此，在使用分配
+的内存之前，对象必须使用zs_map_object()进行映射以获得一个可用的指针，随后使用
+zs_unmap_object()解除映射。
+
+stat
+====
+
+通过CONFIG_ZSMALLOC_STAT，我们可以通过 ``/sys/kernel/debug/zsmalloc/<user name>``
+看到zsmalloc内部信息。下面是一个统计输出的例子。::
+
+ # cat /sys/kernel/debug/zsmalloc/zram0/classes
+
+ class  size almost_full almost_empty obj_allocated   obj_used pages_used pages_per_zspage
+    ...
+    ...
+     9   176           0            1           186        129          8                4
+    10   192           1            0          2880       2872        135                3
+    11   208           0            1           819        795         42                2
+    12   224           0            1           219        159         12                4
+    ...
+    ...
+
+
+class
+	索引
+size
+	zspage存储对象大小
+almost_empty
+	ZS_ALMOST_EMPTY zspage的数量（见下文）。
+almost_full
+	ZS_ALMOST_FULL zspage的数量(见下图)
+obj_allocated
+	已分配对象的数量
+obj_used
+	分配给用户的对象的数量
+pages_used
+	为该类分配的页数
+pages_per_zspage
+	组成一个zspage的0-order页面的数量
+
+当n <= N / f时，我们将一个zspage分配给ZS_ALMOST_EMPTYfullness组，其中
+
+* n = 已分配对象的数量
+* N = zspage可以存储的对象总数
+* f = fullness_threshold_frac(即，目前是4个)
+
+同样地，我们将zspage分配给:
+
+* ZS_ALMOST_FULL  when n > N / f
+* ZS_EMPTY        when n == 0
+* ZS_FULL         when n == N
diff --git a/Documentation/translations/zh_CN/vm/active_mm.rst b/Documentation/translations/zh_CN/vm/active_mm.rst
deleted file mode 100644
index 366609ea4f37..000000000000
--- a/Documentation/translations/zh_CN/vm/active_mm.rst
+++ /dev/null
@@ -1,85 +0,0 @@
-.. include:: ../disclaimer-zh_CN.rst
-
-:Original: Documentation/vm/active_mm.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-=========
-Active MM
-=========
-
-这是一封linux之父回复开发者的一封邮件，所以翻译时我尽量保持邮件格式的完整。
-
-::
-
- List:       linux-kernel
- Subject:    Re: active_mm
- From:       Linus Torvalds <torvalds () transmeta ! com>
- Date:       1999-07-30 21:36:24
-
- 因为我并不经常写解释，所以已经抄送到linux-kernel邮件列表，而当我做这些，
- 且更多的人在阅读它们时，我觉得棒极了。
-
- 1999年7月30日 星期五， David Mosberger 写道：
- >
- > 是否有一个简短的描述，说明task_struct中的
- >  "mm" 和 "active_mm"应该如何使用？ (如果
- > 这个问题在邮件列表中讨论过，我表示歉意--我刚
- > 刚度假回来，有一段时间没能关注linux-kernel了）。
-
- 基本上，新的设定是：
-
-  - 我们有“真实地址空间”和“匿名地址空间”。区别在于，匿名地址空间根本不关心用
-    户级页表，所以当我们做上下文切换到匿名地址空间时，我们只是让以前的地址空间
-    处于活动状态。
-
-    一个“匿名地址空间”的明显用途是任何不需要任何用户映射的线程--所有的内核线
-    程基本上都属于这一类，但即使是“真正的”线程也可以暂时说在一定时间内它们不
-    会对用户空间感兴趣，调度器不妨试着避免在切换VM状态上浪费时间。目前只有老
-    式的bdflush sync能做到这一点。
-
-  - “tsk->mm” 指向 “真实地址空间”。对于一个匿名进程来说，tsk->mm将是NULL，
-    其逻辑原因是匿名进程实际上根本就 “没有” 真正的地址空间。
-
-  - 然而，我们显然需要跟踪我们为这样的匿名用户“偷用”了哪个地址空间。为此，我们
-    有 “tsk->active_mm”，它显示了当前活动的地址空间是什么。
-
-    规则是，对于一个有真实地址空间的进程（即tsk->mm是 non-NULL），active_mm
-    显然必须与真实的mm相同。
-
-    对于一个匿名进程，tsk->mm == NULL，而tsk->active_mm是匿名进程运行时
-    “借用”的mm。当匿名进程被调度走时，借用的地址空间被返回并清除。
-
- 为了支持所有这些，“struct mm_struct”现在有两个计数器：一个是 “mm_users”
- 计数器，即有多少 “真正的地址空间用户”，另一个是 “mm_count”计数器，即 “lazy”
- 用户（即匿名用户）的数量，如果有任何真正的用户，则加1。
-
- 通常情况下，至少有一个真正的用户，但也可能是真正的用户在另一个CPU上退出，而
- 一个lazy的用户仍在活动，所以你实际上得到的情况是，你有一个地址空间 **只**
- 被lazy的用户使用。这通常是一个短暂的生命周期状态，因为一旦这个线程被安排给一
- 个真正的线程，这个 “僵尸” mm就会被释放，因为 “mm_count”变成了零。
-
- 另外，一个新的规则是，**没有人** 再把 “init_mm” 作为一个真正的MM了。
- “init_mm”应该被认为只是一个 “没有其他上下文时的lazy上下文”，事实上，它主
- 要是在启动时使用，当时还没有真正的VM被创建。因此，用来检查的代码
-
-   if (current->mm == &init_mm)
-
- 一般来说，应该用
-
-   if (!current->mm)
-
- 取代上面的写法（这更有意义--测试基本上是 “我们是否有一个用户环境”，并且通常
- 由缺页异常处理程序和类似的东西来完成）。
-
- 总之，我刚才在ftp.kernel.org上放了一个pre-patch-2.3.13-1，因为它稍微改
- 变了接口以适配alpha（谁会想到呢，但alpha体系结构上下文切换代码实际上最终是
- 最丑陋的之一--不像其他架构的MM和寄存器状态是分开的，alpha的PALcode将两者
- 连接起来，你需要同时切换两者）。
-
- (文档来源 http://marc.info/?l=linux-kernel&m=93337278602211&w=2)
diff --git a/Documentation/translations/zh_CN/vm/balance.rst b/Documentation/translations/zh_CN/vm/balance.rst
deleted file mode 100644
index e98a47ef24a8..000000000000
--- a/Documentation/translations/zh_CN/vm/balance.rst
+++ /dev/null
@@ -1,81 +0,0 @@
-.. include:: ../disclaimer-zh_CN.rst
-
-:Original: Documentation/vm/balance.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-========
-内存平衡
-========
-
-2000年1月开始，作者：Kanoj Sarcar <kanoj@sgi.com>
-
-对于 !__GFP_HIGH 和 !__GFP_KSWAPD_RECLAIM 以及非 __GFP_IO 的分配，需要进行
-内存平衡。
-
-调用者避免回收的第一个原因是调用者由于持有自旋锁或处于中断环境中而无法睡眠。第二个
-原因可能是，调用者愿意在不产生页面回收开销的情况下分配失败。这可能发生在有0阶回退
-选项的机会主义高阶分配请求中。在这种情况下，调用者可能也希望避免唤醒kswapd。
-
-__GFP_IO分配请求是为了防止文件系统死锁。
-
-在没有非睡眠分配请求的情况下，做平衡似乎是有害的。页面回收可以被懒散地启动，也就是
-说，只有在需要的时候（也就是区域的空闲内存为0），而不是让它成为一个主动的过程。
-
-也就是说，内核应该尝试从直接映射池中满足对直接映射页的请求，而不是回退到dma池中，
-这样就可以保持dma池为dma请求（不管是不是原子的）所填充。类似的争论也适用于高内存
-和直接映射的页面。相反，如果有很多空闲的dma页，最好是通过从dma池中分配一个来满足
-常规的内存请求，而不是产生常规区域平衡的开销。
-
-在2.2中，只有当空闲页总数低于总内存的1/64时，才会启动内存平衡/页面回收。如果dma
-和常规内存的比例合适，即使dma区完全空了，也很可能不会进行平衡。2.2已经在不同内存
-大小的生产机器上运行，即使有这个问题存在，似乎也做得不错。在2.3中，由于HIGHMEM的
-存在，这个问题变得更加严重。
-
-在2.3中，区域平衡可以用两种方式之一来完成：根据区域的大小（可能是低级区域的大小），
-我们可以在初始化阶段决定在平衡任何区域时应该争取多少空闲页。好的方面是，在平衡的时
-候，我们不需要看低级区的大小，坏的方面是，我们可能会因为忽略低级区可能较低的使用率
-而做过于频繁的平衡。另外，只要对分配程序稍作修改，就有可能将memclass()宏简化为一
-个简单的等式。
-
-另一个可能的解决方案是，我们只在一个区 **和** 其所有低级区的空闲内存低于该区及其
-低级区总内存的1/64时进行平衡。这就解决了2.2的平衡问题，并尽可能地保持了与2.2行为
-的接近。另外，平衡算法在各种架构上的工作方式也是一样的，这些架构有不同数量和类型的
-内存区。如果我们想变得更花哨一点，我们可以在未来为不同区域的自由页面分配不同的权重。
-
-请注意，如果普通区的大小与dma区相比是巨大的，那么在决定是否平衡普通区的时候，考虑
-空闲的dma页就变得不那么重要了。那么第一个解决方案就变得更有吸引力。
-
-所附的补丁实现了第二个解决方案。它还 “修复”了两个问题：首先，在低内存条件下，kswapd
-被唤醒，就像2.2中的非睡眠分配。第二，HIGHMEM区也被平衡了，以便给replace_with_highmem()
-一个争取获得HIGHMEM页的机会，同时确保HIGHMEM分配不会落回普通区。这也确保了HIGHMEM
-页不会被泄露（例如，在一个HIGHMEM页在交换缓存中但没有被任何人使用的情况下）。
-
-kswapd还需要知道它应该平衡哪些区。kswapd主要是在无法进行平衡的情况下需要的，可能
-是因为所有的分配请求都来自中断上下文，而所有的进程上下文都在睡眠。对于2.3，
-kswapd并不真正需要平衡高内存区，因为中断上下文并不请求高内存页。kswapd看zone
-结构体中的zone_wake_kswapd字段来决定一个区是否需要平衡。
-
-如果从进程内存和shm中偷取页面可以减轻该页面节点中任何区的内存压力，而该区的内存压力
-已经低于其水位，则会进行偷取。
-
-watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd：
-这些是每个区的字段，用于确定一个区何时需要平衡。当页面数低于水位[WMARK_MIN]时，
-hysteric 的字段low_on_memory被设置。这个字段会一直被设置，直到空闲页数变成水位
-[WMARK_HIGH]。当low_on_memory被设置时，页面分配请求将尝试释放该区域的一些页面（如果
-请求中设置了GFP_WAIT）。与此相反的是，决定唤醒kswapd以释放一些区的页。这个决定不是基于
-hysteresis 的，而是当空闲页的数量低于watermark[WMARK_LOW]时就会进行；在这种情况下，
-zone_wake_kswapd也被设置。
-
-
-我所听到的（超棒的）想法：
-
-1. 动态经历应该影响平衡：可以跟踪一个区的失败请求的数量，并反馈到平衡方案中（jalvo@mbay.net）。
-
-2. 实现一个类似于replace_with_highmem()的replace_with_regular()，以保留dma页面。
-   (lkd@tantalophile.demon.co.uk)
diff --git a/Documentation/translations/zh_CN/vm/damon/api.rst b/Documentation/translations/zh_CN/vm/damon/api.rst
deleted file mode 100644
index 21143eea4ebe..000000000000
--- a/Documentation/translations/zh_CN/vm/damon/api.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-:Original: Documentation/vm/damon/api.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-=======
-API参考
-=======
-
-内核空间的程序可以使用下面的API来使用DAMON的每个功能。你所需要做的就是引用 ``damon.h`` ，
-它位于源代码树的include/linux/。
-
-结构体
-======
-
-该API在以下内核代码中:
-
-include/linux/damon.h
-
-
-函数
-====
-
-该API在以下内核代码中:
-
-mm/damon/core.c
diff --git a/Documentation/translations/zh_CN/vm/damon/design.rst b/Documentation/translations/zh_CN/vm/damon/design.rst
deleted file mode 100644
index 46128b77c2b3..000000000000
--- a/Documentation/translations/zh_CN/vm/damon/design.rst
+++ /dev/null
@@ -1,140 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-:Original: Documentation/vm/damon/design.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-====
-设计
-====
-
-可配置的层
-==========
-
-DAMON提供了数据访问监控功能，同时使其准确性和开销可控。基本的访问监控需要依赖于目标地址空间
-并为之优化的基元。另一方面，作为DAMON的核心，准确性和开销的权衡机制是在纯逻辑空间中。DAMON
-将这两部分分离在不同的层中，并定义了它的接口，以允许各种低层次的基元实现与核心逻辑的配置。
-
-由于这种分离的设计和可配置的接口，用户可以通过配置核心逻辑和适当的低级基元实现来扩展DAMON的
-任何地址空间。如果没有提供合适的，用户可以自己实现基元。
-
-例如，物理内存、虚拟内存、交换空间、那些特定的进程、NUMA节点、文件和支持的内存设备将被支持。
-另外，如果某些架构或设备支持特殊的优化访问检查基元，这些基元将很容易被配置。
-
-
-特定地址空间基元的参考实现
-==========================
-
-基本访问监测的低级基元被定义为两部分。:
-
-1. 确定地址空间的监测目标地址范围
-2. 目标空间中特定地址范围的访问检查。
-
-DAMON目前为物理和虚拟地址空间提供了基元的实现。下面两个小节描述了这些工作的方式。
-
-
-基于VMA的目标地址范围构造
--------------------------
-
-这仅仅是针对虚拟地址空间基元的实现。对于物理地址空间，只是要求用户手动设置监控目标地址范围。
-
-在进程的超级巨大的虚拟地址空间中，只有小部分被映射到物理内存并被访问。因此，跟踪未映射的地
-址区域只是一种浪费。然而，由于DAMON可以使用自适应区域调整机制来处理一定程度的噪声，所以严
-格来说，跟踪每一个映射并不是必须的，但在某些情况下甚至会产生很高的开销。也就是说，监测目标
-内部过于巨大的未映射区域应该被移除，以不占用自适应机制的时间。
-
-出于这个原因，这个实现将复杂的映射转换为三个不同的区域，覆盖地址空间的每个映射区域。这三个
-区域之间的两个空隙是给定地址空间中两个最大的未映射区域。这两个最大的未映射区域是堆和最上面
-的mmap()区域之间的间隙，以及在大多数情况下最下面的mmap()区域和堆之间的间隙。因为这些间隙
-在通常的地址空间中是异常巨大的，排除这些间隙就足以做出合理的权衡。下面详细说明了这一点::
-
-    <heap>
-    <BIG UNMAPPED REGION 1>
-    <uppermost mmap()-ed region>
-    (small mmap()-ed regions and munmap()-ed regions)
-    <lowermost mmap()-ed region>
-    <BIG UNMAPPED REGION 2>
-    <stack>
-
-
-基于PTE访问位的访问检查
------------------------
-
-物理和虚拟地址空间的实现都使用PTE Accessed-bit进行基本访问检查。唯一的区别在于从地址中
-找到相关的PTE访问位的方式。虚拟地址的实现是为该地址的目标任务查找页表，而物理地址的实现则
-是查找与该地址有映射关系的每一个页表。通过这种方式，实现者找到并清除下一个采样目标地址的位，
-并检查该位是否在一个采样周期后再次设置。这可能会干扰其他使用访问位的内核子系统，即空闲页跟
-踪和回收逻辑。为了避免这种干扰，DAMON使其与空闲页面跟踪相互排斥，并使用 ``PG_idle`` 和
-``PG_young`` 页面标志来解决与回收逻辑的冲突，就像空闲页面跟踪那样。
-
-
-独立于地址空间的核心机制
-========================
-
-下面四个部分分别描述了DAMON的核心机制和五个监测属性，即 ``采样间隔`` 、 ``聚集间隔`` 、
-``更新间隔`` 、 ``最小区域数`` 和 ``最大区域数`` 。
-
-
-访问频率监测
-------------
-
-DAMON的输出显示了在给定的时间内哪些页面的访问频率是多少。访问频率的分辨率是通过设置
-``采样间隔`` 和 ``聚集间隔`` 来控制的。详细地说，DAMON检查每个 ``采样间隔`` 对每
-个页面的访问，并将结果汇总。换句话说，计算每个页面的访问次数。在每个 ``聚合间隔`` 过
-去后，DAMON调用先前由用户注册的回调函数，以便用户可以阅读聚合的结果，然后再清除这些结
-果。这可以用以下简单的伪代码来描述::
-
-    while monitoring_on:
-        for page in monitoring_target:
-            if accessed(page):
-                nr_accesses[page] += 1
-        if time() % aggregation_interval == 0:
-            for callback in user_registered_callbacks:
-                callback(monitoring_target, nr_accesses)
-            for page in monitoring_target:
-                nr_accesses[page] = 0
-        sleep(sampling interval)
-
-这种机制的监测开销将随着目标工作负载规模的增长而任意增加。
-
-
-基于区域的抽样调查
-------------------
-
-为了避免开销的无限制增加，DAMON将假定具有相同访问频率的相邻页面归入一个区域。只要保持
-这个假设（一个区域内的页面具有相同的访问频率），该区域内就只需要检查一个页面。因此，对
-于每个 ``采样间隔`` ，DAMON在每个区域中随机挑选一个页面，等待一个 ``采样间隔`` ，检
-查该页面是否同时被访问，如果被访问则增加该区域的访问频率。因此，监测开销是可以通过设置
-区域的数量来控制的。DAMON允许用户设置最小和最大的区域数量来进行权衡。
-
-然而，如果假设没有得到保证，这个方案就不能保持输出的质量。
-
-
-适应性区域调整
---------------
-
-即使最初的监测目标区域被很好地构建以满足假设（同一区域内的页面具有相似的访问频率），数
-据访问模式也会被动态地改变。这将导致监测质量下降。为了尽可能地保持假设，DAMON根据每个
-区域的访问频率自适应地进行合并和拆分。
-
-对于每个 ``聚集区间`` ，它比较相邻区域的访问频率，如果频率差异较小，就合并这些区域。
-然后，在它报告并清除每个区域的聚合接入频率后，如果区域总数不超过用户指定的最大区域数，
-它将每个区域拆分为两个或三个区域。
-
-通过这种方式，DAMON提供了其最佳的质量和最小的开销，同时保持了用户为其权衡设定的界限。
-
-
-动态目标空间更新处理
---------------------
-
-监测目标地址范围可以动态改变。例如，虚拟内存可以动态地被映射和解映射。物理内存可以被
-热插拔。
-
-由于在某些情况下变化可能相当频繁，DAMON允许监控操作检查动态变化，包括内存映射变化，
-并仅在用户指定的时间间隔（ ``更新间隔`` ）中的每个时间段，将其应用于监控操作相关的
-数据结构，如抽象的监控目标内存区。
\ No newline at end of file
diff --git a/Documentation/translations/zh_CN/vm/damon/faq.rst b/Documentation/translations/zh_CN/vm/damon/faq.rst
deleted file mode 100644
index 07b4ac19407d..000000000000
--- a/Documentation/translations/zh_CN/vm/damon/faq.rst
+++ /dev/null
@@ -1,48 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-:Original: Documentation/vm/damon/faq.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-========
-常见问题
-========
-
-为什么是一个新的子系统，而不是扩展perf或其他用户空间工具？
-==========================================================
-
-首先，因为它需要尽可能的轻量级，以便可以在线使用，所以应该避免任何不必要的开销，如内核-用户
-空间的上下文切换成本。第二，DAMON的目标是被包括内核在内的其他程序所使用。因此，对特定工具
-（如perf）的依赖性是不可取的。这就是DAMON在内核空间实现的两个最大的原因。
-
-
-“闲置页面跟踪” 或 “perf mem” 可以替代DAMON吗？
-==============================================
-
-闲置页跟踪是物理地址空间访问检查的一个低层次的原始方法。“perf mem”也是类似的，尽管它可以
-使用采样来减少开销。另一方面，DAMON是一个更高层次的框架，用于监控各种地址空间。它专注于内
-存管理优化，并提供复杂的精度/开销处理机制。因此，“空闲页面跟踪” 和 “perf mem” 可以提供
-DAMON输出的一个子集，但不能替代DAMON。
-
-
-DAMON是否只支持虚拟内存？
-=========================
-
-不，DAMON的核心是独立于地址空间的。用户可以在DAMON核心上实现和配置特定地址空间的低级原始
-部分，包括监测目标区域的构造和实际的访问检查。通过这种方式，DAMON用户可以用任何访问检查技
-术来监测任何地址空间。
-
-尽管如此，DAMON默认为虚拟内存和物理内存提供了基于vma/rmap跟踪和PTE访问位检查的地址空间
-相关功能的实现，以供参考和方便使用。
-
-
-我可以简单地监测页面的粒度吗？
-==============================
-
-是的，你可以通过设置 ``min_nr_regions`` 属性高于工作集大小除以页面大小的值来实现。
-因为监视目标区域的大小被强制为 ``>=page size`` ，所以区域分割不会产生任何影响。
diff --git a/Documentation/translations/zh_CN/vm/damon/index.rst b/Documentation/translations/zh_CN/vm/damon/index.rst
deleted file mode 100644
index 84d36d90c9b0..000000000000
--- a/Documentation/translations/zh_CN/vm/damon/index.rst
+++ /dev/null
@@ -1,33 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-:Original: Documentation/vm/damon/index.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-==========================
-DAMON:数据访问监视器
-==========================
-
-DAMON是Linux内核的一个数据访问监控框架子系统。DAMON的核心机制使其成为
-（该核心机制详见(Documentation/translations/zh_CN/vm/damon/design.rst)）
-
- - *准确度* （监测输出对DRAM级别的内存管理足够有用；但可能不适合CPU Cache级别），
- - *轻量级* （监控开销低到可以在线应用），以及
- - *可扩展* （无论目标工作负载的大小，开销的上限值都在恒定范围内）。
-
-因此，利用这个框架，内核的内存管理机制可以做出高级决策。会导致高数据访问监控开销的实
-验性内存管理优化工作可以再次进行。同时，在用户空间，有一些特殊工作负载的用户可以编写
-个性化的应用程序，以便更好地了解和优化他们的工作负载和系统。
-
-.. toctree::
-   :maxdepth: 2
-
-   faq
-   design
-   api
-
diff --git a/Documentation/translations/zh_CN/vm/free_page_reporting.rst b/Documentation/translations/zh_CN/vm/free_page_reporting.rst
deleted file mode 100644
index 31d6c34b956b..000000000000
--- a/Documentation/translations/zh_CN/vm/free_page_reporting.rst
+++ /dev/null
@@ -1,38 +0,0 @@
-.. include:: ../disclaimer-zh_CN.rst
-
-:Original: Documentation/vm/_free_page_reporting.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-==========
-空闲页报告
-==========
-
-空闲页报告是一个API，设备可以通过它来注册接收系统当前未使用的页面列表。这在虚拟
-化的情况下是很有用的，客户机能够使用这些数据来通知管理器它不再使用内存中的某些页
-面。
-
-对于驱动，通常是气球驱动要使用这个功能，它将分配和初始化一个page_reporting_dev_info
-结构体。它要填充的结构体中的字段是用于处理散点列表的 "report" 函数指针。它还必
-须保证每次调用该函数时能处理至少相当于PAGE_REPORTING_CAPACITY的散点列表条目。
-假设没有其他页面报告设备已经注册， 对page_reporting_register的调用将向报告框
-架注册页面报告接口。
-
-一旦注册，页面报告API将开始向驱动报告成批的页面。API将在接口被注册后2秒开始报告
-页面，并在任何足够高的页面被释放之后2秒继续报告。
-
-报告的页面将被存储在传递给报告函数的散列表中，最后一个条目的结束位被设置在条目
-nent-1中。 当页面被报告函数处理时，分配器将无法访问它们。一旦报告函数完成，这些
-页将被返回到它们所获得的自由区域。
-
-在移除使用空闲页报告的驱动之前，有必要调用page_reporting_unregister，以移除
-目前被空闲页报告使用的page_reporting_dev_info结构体。这样做将阻止进一步的报
-告通过该接口发出。如果另一个驱动或同一驱动被注册，它就有可能恢复前一个驱动在报告
-空闲页方面的工作。
-
-
-Alexander Duyck, 2019年12月04日
diff --git a/Documentation/translations/zh_CN/vm/frontswap.rst b/Documentation/translations/zh_CN/vm/frontswap.rst
deleted file mode 100644
index 3eb07870e2ef..000000000000
--- a/Documentation/translations/zh_CN/vm/frontswap.rst
+++ /dev/null
@@ -1,196 +0,0 @@
-:Original: Documentation/vm/_free_page_reporting.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-=========
-Frontswap
-=========
-
-Frontswap为交换页提供了一个 “transcendent memory” 的接口。在一些环境中，由
-于交换页被保存在RAM（或类似RAM的设备）中，而不是交换磁盘，因此可以获得巨大的性能
-节省（提高）。
-
-.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/
-
-Frontswap之所以这么命名，是因为它可以被认为是与swap设备的“back”存储相反。存
-储器被认为是一个同步并发安全的面向页面的“伪RAM设备”，符合transcendent memory
-（如Xen的“tmem”，或内核内压缩内存，又称“zcache”，或未来的类似RAM的设备）的要
-求；这个伪RAM设备不能被内核直接访问或寻址，其大小未知且可能随时间变化。驱动程序通过
-调用frontswap_register_ops将自己与frontswap链接起来，以适当地设置frontswap_ops
-的功能，它提供的功能必须符合某些策略，如下所示:
-
-一个 “init” 将设备准备好接收与指定的交换设备编号（又称“类型”）相关的frontswap
-交换页。一个 “store” 将把该页复制到transcendent memory，并与该页的类型和偏移
-量相关联。一个 “load” 将把该页，如果找到的话，从transcendent memory复制到内核
-内存，但不会从transcendent memory中删除该页。一个 “invalidate_page” 将从
-transcendent memory中删除该页，一个 “invalidate_area” 将删除所有与交换类型
-相关的页（例如，像swapoff）并通知 “device” 拒绝进一步存储该交换类型。
-
-一旦一个页面被成功存储，在该页面上的匹配加载通常会成功。因此，当内核发现自己处于需
-要交换页面的情况时，它首先尝试使用frontswap。如果存储的结果是成功的，那么数据就已
-经成功的保存到了transcendent memory中，并且避免了磁盘写入，如果后来再读回数据，
-也避免了磁盘读取。如果存储返回失败，transcendent memory已经拒绝了该数据，且该页
-可以像往常一样被写入交换空间。
-
-请注意，如果一个页面被存储，而该页面已经存在于transcendent memory中（一个 “重复”
-的存储），要么存储成功，数据被覆盖，要么存储失败，该页面被废止。这确保了旧的数据永远
-不会从frontswap中获得。
-
-如果配置正确，对frontswap的监控是通过 `/sys/kernel/debug/frontswap` 目录下的
-debugfs完成的。frontswap的有效性可以通过以下方式测量（在所有交换设备中）:
-
-``failed_stores``
-	有多少次存储的尝试是失败的
-
-``loads``
-	尝试了多少次加载（应该全部成功）
-
-``succ_stores``
-	有多少次存储的尝试是成功的
-
-``invalidates``
-	尝试了多少次作废
-
-后台实现可以提供额外的指标。
-
-经常问到的问题
-==============
-
-* 价值在哪里?
-
-当一个工作负载开始交换时，性能就会下降。Frontswap通过提供一个干净的、动态的接口来
-读取和写入交换页到 “transcendent memory”，从而大大增加了许多这样的工作负载的性
-能，否则内核是无法直接寻址的。当数据被转换为不同的形式和大小（比如压缩）或者被秘密
-移动（对于一些类似RAM的设备来说，这可能对写平衡很有用）时，这个接口是理想的。交换
-页（和被驱逐的页面缓存页）是这种比RAM慢但比磁盘快得多的“伪RAM设备”的一大用途。
-
-Frontswap对内核的影响相当小，为各种系统配置中更动态、更灵活的RAM利用提供了巨大的
-灵活性：
-
-在单一内核的情况下，又称“zcache”，页面被压缩并存储在本地内存中，从而增加了可以安
-全保存在RAM中的匿名页面总数。Zcache本质上是用压缩/解压缩的CPU周期换取更好的内存利
-用率。Benchmarks测试显示，当内存压力较低时，几乎没有影响，而在高内存压力下的一些
-工作负载上，则有明显的性能改善（25%以上）。
-
-“RAMster” 在zcache的基础上增加了对集群系统的 “peer-to-peer” transcendent memory
-的支持。Frontswap页面像zcache一样被本地压缩，但随后被“remotified” 到另一个系
-统的RAM。这使得RAM可以根据需要动态地来回负载平衡，也就是说，当系统A超载时，它可以
-交换到系统B，反之亦然。RAMster也可以被配置成一个内存服务器，因此集群中的许多服务器
-可以根据需要动态地交换到配置有大量内存的单一服务器上......而不需要预先配置每个客户
-有多少内存可用
-
-在虚拟情况下，虚拟化的全部意义在于统计地将物理资源在多个虚拟机的不同需求之间进行复
-用。对于RAM来说，这真的很难做到，而且在不改变内核的情况下，要做好这一点的努力基本上
-是失败的（除了一些广为人知的特殊情况下的工作负载）。具体来说，Xen Transcendent Memory
-后端允许管理器拥有的RAM “fallow”，不仅可以在多个虚拟机之间进行“time-shared”，
-而且页面可以被压缩和重复利用，以优化RAM的利用率。当客户操作系统被诱导交出未充分利用
-的RAM时（如 “selfballooning”），突然出现的意外内存压力可能会导致交换；frontswap
-允许这些页面被交换到管理器RAM中或从管理器RAM中交换（如果整体主机系统内存条件允许），
-从而减轻计划外交换可能带来的可怕的性能影响。
-
-一个KVM的实现正在进行中，并且已经被RFC'ed到lkml。而且，利用frontswap，对NVM作为
-内存扩展技术的调查也在进行中。
-
-* 当然，在某些情况下可能有性能上的优势，但frontswap的空间/时间开销是多少？
-
-如果 CONFIG_FRONTSWAP 被禁用，每个 frontswap 钩子都会编译成空，唯一的开销是每
-个 swapon'ed swap 设备的几个额外字节。如果 CONFIG_FRONTSWAP 被启用，但没有
-frontswap的 “backend” 寄存器，每读或写一个交换页就会有一个额外的全局变量，而不
-是零。如果 CONFIG_FRONTSWAP 被启用，并且有一个frontswap的backend寄存器，并且
-后端每次 “store” 请求都失败（即尽管声称可能，但没有提供内存），CPU 的开销仍然可以
-忽略不计 - 因为每次frontswap失败都是在交换页写到磁盘之前，系统很可能是 I/O 绑定
-的，无论如何使用一小部分的 CPU 都是不相关的。
-
-至于空间，如果CONFIG_FRONTSWAP被启用，并且有一个frontswap的backend注册，那么
-每个交换设备的每个交换页都会被分配一个比特。这是在内核已经为每个交换设备的每个交换
-页分配的8位（在2.6.34之前是16位）上增加的。(Hugh Dickins观察到，frontswap可能
-会偷取现有的8个比特，但是我们以后再来担心这个小的优化问题)。对于标准的4K页面大小的
-非常大的交换盘（这很罕见），这是每32GB交换盘1MB开销。
-
-当交换页存储在transcendent memory中而不是写到磁盘上时，有一个副作用，即这可能会
-产生更多的内存压力，有可能超过其他的优点。一个backend，比如zcache，必须实现策略
-来仔细（但动态地）管理内存限制，以确保这种情况不会发生。
-
-* 好吧，那就用内核骇客能理解的术语来快速概述一下这个frontswap补丁的作用如何？
-
-我们假设在内核初始化过程中，一个frontswap 的 “backend” 已经注册了；这个注册表
-明这个frontswap 的 “backend” 可以访问一些不被内核直接访问的“内存”。它到底提
-供了多少内存是完全动态和随机的。
-
-每当一个交换设备被交换时，就会调用frontswap_init()，把交换设备的编号（又称“类
-型”）作为一个参数传给它。这就通知了frontswap，以期待 “store” 与该号码相关的交
-换页的尝试。
-
-每当交换子系统准备将一个页面写入交换设备时（参见swap_writepage()），就会调用
-frontswap_store。Frontswap与frontswap backend协商，如果backend说它没有空
-间，frontswap_store返回-1，内核就会照常把页换到交换设备上。注意，来自frontswap
-backend的响应对内核来说是不可预测的；它可能选择从不接受一个页面，可能接受每九个
-页面，也可能接受每一个页面。但是如果backend确实接受了一个页面，那么这个页面的数
-据已经被复制并与类型和偏移量相关联了，而且backend保证了数据的持久性。在这种情况
-下，frontswap在交换设备的“frontswap_map” 中设置了一个位，对应于交换设备上的
-页面偏移量，否则它就会将数据写入该设备。
-
-当交换子系统需要交换一个页面时（swap_readpage()），它首先调用frontswap_load()，
-检查frontswap_map，看这个页面是否早先被frontswap backend接受。如果是，该页
-的数据就会从frontswap后端填充，换入就完成了。如果不是，正常的交换代码将被执行，
-以便从真正的交换设备上获得这一页的数据。
-
-所以每次frontswap backend接受一个页面时，交换设备的读取和（可能）交换设备的写
-入都被 “frontswap backend store” 和（可能）“frontswap backend loads”
-所取代，这可能会快得多。
-
-* frontswap不能被配置为一个 “特殊的” 交换设备，它的优先级要高于任何真正的交换
-  设备（例如像zswap，或者可能是swap-over-nbd/NFS）？
-
-首先，现有的交换子系统不允许有任何种类的交换层次结构。也许它可以被重写以适应层次
-结构，但这将需要相当大的改变。即使它被重写，现有的交换子系统也使用了块I/O层，它
-假定交换设备是固定大小的，其中的任何页面都是可线性寻址的。Frontswap几乎没有触
-及现有的交换子系统，而是围绕着块I/O子系统的限制，提供了大量的灵活性和动态性。
-
-例如，frontswap backend对任何交换页的接受是完全不可预测的。这对frontswap backend
-的定义至关重要，因为它赋予了backend完全动态的决定权。在zcache中，人们无法预
-先知道一个页面的可压缩性如何。可压缩性 “差” 的页面会被拒绝，而 “差” 本身也可
-以根据当前的内存限制动态地定义。
-
-此外，frontswap是完全同步的，而真正的交换设备，根据定义，是异步的，并且使用
-块I/O。块I/O层不仅是不必要的，而且可能进行 “优化”，这对面向RAM的设备来说是
-不合适的，包括将一些页面的写入延迟相当长的时间。同步是必须的，以确保后端的动
-态性，并避免棘手的竞争条件，这将不必要地大大增加frontswap和/或块I/O子系统的
-复杂性。也就是说，只有最初的 “store” 和 “load” 操作是需要同步的。一个独立
-的异步线程可以自由地操作由frontswap存储的页面。例如，RAMster中的 “remotification”
-线程使用标准的异步内核套接字，将压缩的frontswap页面移动到远程机器。同样，
-KVM的客户方实现可以进行客户内压缩，并使用 “batched” hypercalls。
-
-在虚拟化环境中，动态性允许管理程序（或主机操作系统）做“intelligent overcommit”。
-例如，它可以选择只接受页面，直到主机交换可能即将发生，然后强迫客户机做他们
-自己的交换。
-
-transcendent memory规格的frontswap有一个坏处。因为任何 “store” 都可
-能失败，所以必须在一个真正的交换设备上有一个真正的插槽来交换页面。因此，
-frontswap必须作为每个交换设备的 “影子” 来实现，它有可能容纳交换设备可能
-容纳的每一个页面，也有可能根本不容纳任何页面。这意味着frontswap不能包含比
-swap设备总数更多的页面。例如，如果在某些安装上没有配置交换设备，frontswap
-就没有用。无交换设备的便携式设备仍然可以使用frontswap，但是这种设备的
-backend必须配置某种 “ghost” 交换设备，并确保它永远不会被使用。
-
-
-* 为什么会有这种关于 “重复存储” 的奇怪定义？如果一个页面以前被成功地存储过，
-  难道它不能总是被成功地覆盖吗？
-
-几乎总是可以的，不，有时不能。考虑一个例子，数据被压缩了，原来的4K页面被压
-缩到了1K。现在，有人试图用不可压缩的数据覆盖该页，因此会占用整个4K。但是
-backend没有更多的空间了。在这种情况下，这个存储必须被拒绝。每当frontswap
-拒绝一个会覆盖的存储时，它也必须使旧的数据作废，并确保它不再被访问。因为交
-换子系统会把新的数据写到读交换设备上，这是确保一致性的正确做法。
-
-* 为什么frontswap补丁会创建新的头文件swapfile.h？
-
-frontswap代码依赖于一些swap子系统内部的数据结构，这些数据结构多年来一直
-在静态和全局之间来回移动。这似乎是一个合理的妥协：将它们定义为全局，但在一
-个新的包含文件中声明它们，该文件不被包含swap.h的大量源文件所包含。
-
-Dan Magenheimer，最后更新于2012年4月9日
diff --git a/Documentation/translations/zh_CN/vm/highmem.rst b/Documentation/translations/zh_CN/vm/highmem.rst
deleted file mode 100644
index 018838e58c3e..000000000000
--- a/Documentation/translations/zh_CN/vm/highmem.rst
+++ /dev/null
@@ -1,128 +0,0 @@
-.. include:: ../disclaimer-zh_CN.rst
-
-:Original: Documentation/vm/highmem.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-==========
-高内存处理
-==========
-
-作者: Peter Zijlstra <a.p.zijlstra@chello.nl>
-
-.. contents:: :local:
-
-高内存是什么？
-==============
-
-当物理内存的大小接近或超过虚拟内存的最大大小时，就会使用高内存（highmem）。在这一点上，内
-核不可能在任何时候都保持所有可用的物理内存的映射。这意味着内核需要开始使用它想访问的物理内
-存的临时映射。
-
-没有被永久映射覆盖的那部分（物理）内存就是我们所说的 "高内存"。对于这个边界的确切位置，有
-各种架构上的限制。
-
-例如，在i386架构中，我们选择将内核映射到每个进程的虚拟空间，这样我们就不必为内核的进入/退
-出付出全部的TLB作废代价。这意味着可用的虚拟内存空间（i386上为4GiB）必须在用户和内核空间之
-间进行划分。
-
-使用这种方法的架构的传统分配方式是3:1，3GiB用于用户空间，顶部的1GiB用于内核空间。::
-
-		+--------+ 0xffffffff
-		| Kernel |
-		+--------+ 0xc0000000
-		|        |
-		| User   |
-		|        |
-		+--------+ 0x00000000
-
-这意味着内核在任何时候最多可以映射1GiB的物理内存，但是由于我们需要虚拟地址空间来做其他事
-情--包括访问其余物理内存的临时映射--实际的直接映射通常会更少（通常在~896MiB左右）。
-
-其他有mm上下文标签的TLB的架构可以有独立的内核和用户映射。然而，一些硬件（如一些ARM）在使
-用mm上下文标签时，其虚拟空间有限。
-
-
-临时虚拟映射
-============
-
-内核包含几种创建临时映射的方法。:
-
-* vmap().  这可以用来将多个物理页长期映射到一个连续的虚拟空间。它需要synchronization
-  来解除映射。
-
-* kmap().  这允许对单个页面进行短期映射。它需要synchronization，但在一定程度上被摊销。
-  当以嵌套方式使用时，它也很容易出现死锁，因此不建议在新代码中使用它。
-
-* kmap_atomic().  这允许对单个页面进行非常短的时间映射。由于映射被限制在发布它的CPU上，
-  它表现得很好，但发布任务因此被要求留在该CPU上直到它完成，以免其他任务取代它的映射。
-
-  kmap_atomic() 也可以由中断上下文使用，因为它不睡眠，而且调用者可能在调用kunmap_atomic()
-  之后才睡眠。
-
-  可以假设k[un]map_atomic()不会失败。
-
-
-使用kmap_atomic
-===============
-
-何时何地使用 kmap_atomic() 是很直接的。当代码想要访问一个可能从高内存（见__GFP_HIGHMEM）
-分配的页面的内容时，例如在页缓存中的页面，就会使用它。该API有两个函数，它们的使用方式与
-下面类似::
-
-	/* 找到感兴趣的页面。 */
-	struct page *page = find_get_page(mapping, offset);
-
-	/* 获得对该页内容的访问权。 */
-	void *vaddr = kmap_atomic(page);
-
-	/* 对该页的内容做一些处理。 */
-	memset(vaddr, 0, PAGE_SIZE);
-
-	/* 解除该页面的映射。 */
-	kunmap_atomic(vaddr);
-
-注意，kunmap_atomic()调用的是kmap_atomic()调用的结果而不是参数。
-
-如果你需要映射两个页面，因为你想从一个页面复制到另一个页面，你需要保持kmap_atomic调用严
-格嵌套，如::
-
-	vaddr1 = kmap_atomic(page1);
-	vaddr2 = kmap_atomic(page2);
-
-	memcpy(vaddr1, vaddr2, PAGE_SIZE);
-
-	kunmap_atomic(vaddr2);
-	kunmap_atomic(vaddr1);
-
-
-临时映射的成本
-==============
-
-创建临时映射的代价可能相当高。体系架构必须操作内核的页表、数据TLB和/或MMU的寄存器。
-
-如果CONFIG_HIGHMEM没有被设置，那么内核会尝试用一点计算来创建映射，将页面结构地址转换成
-指向页面内容的指针，而不是去捣鼓映射。在这种情况下，解映射操作可能是一个空操作。
-
-如果CONFIG_MMU没有被设置，那么就不可能有临时映射和高内存。在这种情况下，也将使用计算方法。
-
-
-i386 PAE
-========
-
-在某些情况下，i386 架构将允许你在 32 位机器上安装多达 64GiB 的内存。但这有一些后果:
-
-* Linux需要为系统中的每个页面建立一个页帧结构，而且页帧需要驻在永久映射中，这意味着：
-
-* 你最多可以有896M/sizeof(struct page)页帧；由于页结构体是32字节的，所以最终会有
-  112G的页；然而，内核需要在内存中存储更多的页帧......
-
-* PAE使你的页表变大--这使系统变慢，因为更多的数据需要在TLB填充等方面被访问。一个好处
-  是，PAE有更多的PTE位，可以提供像NX和PAT这样的高级功能。
-
-一般的建议是，你不要在32位机器上使用超过8GiB的空间--尽管更多的空间可能对你和你的工作
-量有用，但你几乎是靠你自己--不要指望内核开发者真的会很关心事情的进展情况。
diff --git a/Documentation/translations/zh_CN/vm/hmm.rst b/Documentation/translations/zh_CN/vm/hmm.rst
deleted file mode 100644
index 2379df95aa58..000000000000
--- a/Documentation/translations/zh_CN/vm/hmm.rst
+++ /dev/null
@@ -1,361 +0,0 @@
-.. include:: ../disclaimer-zh_CN.rst
-
-:Original: Documentation/vm/hmm.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-==================
-异构内存管理 (HMM)
-==================
-
-提供基础设施和帮助程序以将非常规内存（设备内存，如板上 GPU 内存）集成到常规内核路径中，其
-基石是此类内存的专用struct page（请参阅本文档的第 5 至 7 节）。
-
-HMM 还为 SVM（共享虚拟内存）提供了可选的帮助程序，即允许设备透明地访问与 CPU 一致的程序
-地址，这意味着 CPU 上的任何有效指针也是该设备的有效指针。这对于简化高级异构计算的使用变得
-必不可少，其中 GPU、DSP 或 FPGA 用于代表进程执行各种计算。
-
-本文档分为以下部分：在第一部分中，我揭示了与使用特定于设备的内存分配器相关的问题。在第二
-部分中，我揭示了许多平台固有的硬件限制。第三部分概述了 HMM 设计。第四部分解释了 CPU 页
-表镜像的工作原理以及 HMM 在这种情况下的目的。第五部分处理内核中如何表示设备内存。最后，
-最后一节介绍了一个新的迁移助手，它允许利用设备 DMA 引擎。
-
-.. contents:: :local:
-
-使用特定于设备的内存分配器的问题
-================================
-
-具有大量板载内存（几 GB）的设备（如 GPU）历来通过专用驱动程序特定 API 管理其内存。这会
-造成设备驱动程序分配和管理的内存与常规应用程序内存（私有匿名、共享内存或常规文件支持内存）
-之间的隔断。从这里开始，我将把这个方面称为分割的地址空间。我使用共享地址空间来指代相反的情况：
-即，设备可以透明地使用任何应用程序内存区域。
-
-分割的地址空间的发生是因为设备只能访问通过设备特定 API 分配的内存。这意味着从设备的角度来
-看，程序中的所有内存对象并不平等，这使得依赖于广泛的库的大型程序变得复杂。
-
-具体来说，这意味着想要利用像 GPU 这样的设备的代码需要在通用分配的内存（malloc、mmap
-私有、mmap 共享）和通过设备驱动程序 API 分配的内存之间复制对象（这仍然以 mmap 结束，
-但是是设备文件）。
-
-对于平面数据集（数组、网格、图像……），这并不难实现，但对于复杂数据集（列表、树……），
-很难做到正确。复制一个复杂的数据集需要重新映射其每个元素之间的所有指针关系。这很容易出错，
-而且由于数据集和地址的重复，程序更难调试。
-
-分割地址空间也意味着库不能透明地使用它们从核心程序或另一个库中获得的数据，因此每个库可能
-不得不使用设备特定的内存分配器来重复其输入数据集。大型项目会因此受到影响，并因为各种内存
-拷贝而浪费资源。
-
-复制每个库的API以接受每个设备特定分配器分配的内存作为输入或输出，并不是一个可行的选择。
-这将导致库入口点的组合爆炸。
-
-最后，随着高级语言结构（在 C++ 中，当然也在其他语言中）的进步，编译器现在有可能在没有程
-序员干预的情况下利用 GPU 和其他设备。某些编译器识别的模式仅适用于共享地址空间。对所有
-其他模式，使用共享地址空间也更合理。
-
-
-I/O 总线、设备内存特性
-======================
-
-由于一些限制，I/O 总线削弱了共享地址空间。大多数 I/O 总线只允许从设备到主内存的基本
-内存访问；甚至缓存一致性通常是可选的。从 CPU 访问设备内存甚至更加有限。通常情况下，它
-不是缓存一致的。
-
-如果我们只考虑 PCIE 总线，那么设备可以访问主内存（通常通过 IOMMU）并与 CPU 缓存一
-致。但是，它只允许设备对主存储器进行一组有限的原子操作。这在另一个方向上更糟：CPU
-只能访问有限范围的设备内存，而不能对其执行原子操作。因此，从内核的角度来看，设备内存不
-能被视为与常规内存等同。
-
-另一个严重的因素是带宽有限（约 32GBytes/s，PCIE 4.0 和 16 通道）。这比最快的 GPU
-内存 (1 TBytes/s) 慢 33 倍。最后一个限制是延迟。从设备访问主内存的延迟比设备访问自
-己的内存时高一个数量级。
-
-一些平台正在开发新的 I/O 总线或对 PCIE 的添加/修改以解决其中一些限制
-（OpenCAPI、CCIX）。它们主要允许 CPU 和设备之间的双向缓存一致性，并允许架构支持的所
-有原子操作。遗憾的是，并非所有平台都遵循这一趋势，并且一些主要架构没有针对这些问题的硬
-件解决方案。
-
-因此，为了使共享地址空间有意义，我们不仅必须允许设备访问任何内存，而且还必须允许任何内
-存在设备使用时迁移到设备内存（在迁移时阻止 CPU 访问）。
-
-
-共享地址空间和迁移
-==================
-
-HMM 打算提供两个主要功能。第一个是通过复制cpu页表到设备页表中来共享地址空间，因此对
-于进程地址空间中的任何有效主内存地址，相同的地址指向相同的物理内存。
-
-为了实现这一点，HMM 提供了一组帮助程序来填充设备页表，同时跟踪 CPU 页表更新。设备页表
-更新不像 CPU 页表更新那么容易。要更新设备页表，您必须分配一个缓冲区（或使用预先分配的
-缓冲区池）并在其中写入 GPU 特定命令以执行更新（取消映射、缓存失效和刷新等）。这不能通
-过所有设备的通用代码来完成。因此，为什么HMM提供了帮助器，在把硬件的具体细节留给设备驱
-动程序的同时，把一切可以考虑的因素都考虑进去了。
-
-HMM 提供的第二种机制是一种新的 ZONE_DEVICE 内存，它允许为设备内存的每个页面分配一个
-struct page。这些页面很特殊，因为 CPU 无法映射它们。然而，它们允许使用现有的迁移机
-制将主内存迁移到设备内存，从 CPU 的角度来看，一切看起来都像是换出到磁盘的页面。使用
-struct page可以与现有的 mm 机制进行最简单、最干净的集成。再次，HMM 仅提供帮助程序，
-首先为设备内存热插拔新的 ZONE_DEVICE 内存，然后执行迁移。迁移内容和时间的策略决定留
-给设备驱动程序。
-
-请注意，任何 CPU 对设备页面的访问都会触发缺页异常并迁移回主内存。例如，当支持给定CPU
-地址 A 的页面从主内存页面迁移到设备页面时，对地址 A 的任何 CPU 访问都会触发缺页异常
-并启动向主内存的迁移。
-
-凭借这两个特性，HMM 不仅允许设备镜像进程地址空间并保持 CPU 和设备页表同步，而且还通
-过迁移设备正在使用的数据集部分来利用设备内存。
-
-
-地址空间镜像实现和API
-=====================
-
-地址空间镜像的主要目标是允许将一定范围的 CPU 页表复制到一个设备页表中；HMM 有助于
-保持两者同步。想要镜像进程地址空间的设备驱动程序必须从注册 mmu_interval_notifier
-开始::
-
- int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
-				  struct mm_struct *mm, unsigned long start,
-				  unsigned long length,
-				  const struct mmu_interval_notifier_ops *ops);
-
-在 ops->invalidate() 回调期间，设备驱动程序必须对范围执行更新操作（将范围标记为只
-读，或完全取消映射等）。设备必须在驱动程序回调返回之前完成更新。
-
-当设备驱动程序想要填充一个虚拟地址范围时，它可以使用::
-
-  int hmm_range_fault(struct hmm_range *range);
-
-如果请求写访问，它将在丢失或只读条目上触发缺页异常（见下文）。缺页异常使用通用的 mm 缺
-页异常代码路径，就像 CPU 缺页异常一样。
-
-这两个函数都将 CPU 页表条目复制到它们的 pfns 数组参数中。该数组中的每个条目对应于虚拟
-范围中的一个地址。HMM 提供了一组标志来帮助驱动程序识别特殊的 CPU 页表项。
-
-在 sync_cpu_device_pagetables() 回调中锁定是驱动程序必须尊重的最重要的方面，以保
-持事物正确同步。使用模式是::
-
- int driver_populate_range(...)
- {
-      struct hmm_range range;
-      ...
-
-      range.notifier = &interval_sub;
-      range.start = ...;
-      range.end = ...;
-      range.hmm_pfns = ...;
-
-      if (!mmget_not_zero(interval_sub->notifier.mm))
-          return -EFAULT;
-
- again:
-      range.notifier_seq = mmu_interval_read_begin(&interval_sub);
-      mmap_read_lock(mm);
-      ret = hmm_range_fault(&range);
-      if (ret) {
-          mmap_read_unlock(mm);
-          if (ret == -EBUSY)
-                 goto again;
-          return ret;
-      }
-      mmap_read_unlock(mm);
-
-      take_lock(driver->update);
-      if (mmu_interval_read_retry(&ni, range.notifier_seq) {
-          release_lock(driver->update);
-          goto again;
-      }
-
-      /* Use pfns array content to update device page table,
-       * under the update lock */
-
-      release_lock(driver->update);
-      return 0;
- }
-
-driver->update 锁与驱动程序在其 invalidate() 回调中使用的锁相同。该锁必须在调用
-mmu_interval_read_retry() 之前保持，以避免与并发 CPU 页表更新发生任何竞争。
-
-利用 default_flags 和 pfn_flags_mask
-====================================
-
-hmm_range 结构有 2 个字段，default_flags 和 pfn_flags_mask，它们指定整个范围
-的故障或快照策略，而不必为 pfns 数组中的每个条目设置它们。
-
-例如，如果设备驱动程序需要至少具有读取权限的范围的页面，它会设置::
-
-    range->default_flags = HMM_PFN_REQ_FAULT;
-    range->pfn_flags_mask = 0;
-
-并如上所述调用 hmm_range_fault()。这将填充至少具有读取权限的范围内的所有页面。
-
-现在假设驱动程序想要做同样的事情，除了它想要拥有写权限的范围内的一页。现在驱动程序设
-置::
-
-    range->default_flags = HMM_PFN_REQ_FAULT;
-    range->pfn_flags_mask = HMM_PFN_REQ_WRITE;
-    range->pfns[index_of_write] = HMM_PFN_REQ_WRITE;
-
-有了这个，HMM 将在至少读取（即有效）的所有页面中异常，并且对于地址
-== range->start + (index_of_write << PAGE_SHIFT) 它将异常写入权限，即，如果
-CPU pte 没有设置写权限，那么HMM将调用handle_mm_fault()。
-
-hmm_range_fault 完成后，标志位被设置为页表的当前状态，即 HMM_PFN_VALID | 如果页
-面可写，将设置 HMM_PFN_WRITE。
-
-
-从核心内核的角度表示和管理设备内存
-==================================
-
-尝试了几种不同的设计来支持设备内存。第一个使用特定于设备的数据结构来保存有关迁移内存
-的信息，HMM 将自身挂接到 mm 代码的各个位置，以处理对设备内存支持的地址的任何访问。
-事实证明，这最终复制了 struct page 的大部分字段，并且还需要更新许多内核代码路径才
-能理解这种新的内存类型。
-
-大多数内核代码路径从不尝试访问页面后面的内存，而只关心struct page的内容。正因为如此，
-HMM 切换到直接使用 struct page 用于设备内存，这使得大多数内核代码路径不知道差异。
-我们只需要确保没有人试图从 CPU 端映射这些页面。
-
-移入和移出设备内存
-==================
-
-由于 CPU 无法直接访问设备内存，因此设备驱动程序必须使用硬件 DMA 或设备特定的加载/存
-储指令来迁移数据。migrate_vma_setup()、migrate_vma_pages() 和
-migrate_vma_finalize() 函数旨在使驱动程序更易于编写并集中跨驱动程序的通用代码。
-
-在将页面迁移到设备私有内存之前，需要创建特殊的设备私有 ``struct page`` 。这些将用
-作特殊的“交换”页表条目，以便 CPU 进程在尝试访问已迁移到设备专用内存的页面时会发生异常。
-
-这些可以通过以下方式分配和释放::
-
-    struct resource *res;
-    struct dev_pagemap pagemap;
-
-    res = request_free_mem_region(&iomem_resource, /* number of bytes */,
-                                  "name of driver resource");
-    pagemap.type = MEMORY_DEVICE_PRIVATE;
-    pagemap.range.start = res->start;
-    pagemap.range.end = res->end;
-    pagemap.nr_range = 1;
-    pagemap.ops = &device_devmem_ops;
-    memremap_pages(&pagemap, numa_node_id());
-
-    memunmap_pages(&pagemap);
-    release_mem_region(pagemap.range.start, range_len(&pagemap.range));
-
-还有devm_request_free_mem_region(), devm_memremap_pages(),
-devm_memunmap_pages() 和 devm_release_mem_region() 当资源可以绑定到 ``struct device``.
-
-整体迁移步骤类似于在系统内存中迁移 NUMA 页面(see :ref:`Page migration <page_migration>`) ，
-但这些步骤分为设备驱动程序特定代码和共享公共代码:
-
-1. ``mmap_read_lock()``
-
-   设备驱动程序必须将 ``struct vm_area_struct`` 传递给migrate_vma_setup()，
-   因此需要在迁移期间保留 mmap_read_lock() 或 mmap_write_lock()。
-
-2. ``migrate_vma_setup(struct migrate_vma *args)``
-
-   设备驱动初始化了 ``struct migrate_vma`` 的字段，并将该指针传递给
-   migrate_vma_setup()。``args->flags`` 字段是用来过滤哪些源页面应该被迁移。
-   例如，设置 ``MIGRATE_VMA_SELECT_SYSTEM`` 将只迁移系统内存，设置
-   ``MIGRATE_VMA_SELECT_DEVICE_PRIVATE`` 将只迁移驻留在设备私有内存中的页
-   面。如果后者被设置， ``args->pgmap_owner`` 字段被用来识别驱动所拥有的设备
-   私有页。这就避免了试图迁移驻留在其他设备中的设备私有页。目前，只有匿名的私有VMA
-   范围可以被迁移到系统内存和设备私有内存。
-
-   migrate_vma_setup()所做的第一步是用 ``mmu_notifier_invalidate_range_start()``
-   和 ``mmu_notifier_invalidate_range_end()`` 调用来遍历设备周围的页表，使
-   其他设备的MMU无效，以便在 ``args->src`` 数组中填写要迁移的PFN。
-   ``invalidate_range_start()`` 回调传递给一个``struct mmu_notifier_range`` ，
-   其 ``event`` 字段设置为MMU_NOTIFY_MIGRATE， ``owner`` 字段设置为传递给
-   migrate_vma_setup()的 ``args->pgmap_owner`` 字段。这允许设备驱动跳过无
-   效化回调，只无效化那些实际正在迁移的设备私有MMU映射。这一点将在下一节详细解释。
-
-
-   在遍历页表时，一个 ``pte_none()`` 或 ``is_zero_pfn()`` 条目导致一个有效
-   的  “zero” PFN 存储在 ``args->src`` 阵列中。这让驱动分配设备私有内存并清
-   除它，而不是复制一个零页。到系统内存或设备私有结构页的有效PTE条目将被
-   ``lock_page()``锁定，与LRU隔离（如果系统内存和设备私有页不在LRU上），从进
-   程中取消映射，并插入一个特殊的迁移PTE来代替原来的PTE。 migrate_vma_setup()
-   还清除了 ``args->dst`` 数组。
-
-3. 设备驱动程序分配目标页面并将源页面复制到目标页面。
-
-   驱动程序检查每个 ``src`` 条目以查看该 ``MIGRATE_PFN_MIGRATE`` 位是否已
-   设置并跳过未迁移的条目。设备驱动程序还可以通过不填充页面的 ``dst`` 数组来选
-   择跳过页面迁移。
-
-   然后，驱动程序分配一个设备私有 struct page 或一个系统内存页，用 ``lock_page()``
-   锁定该页，并将 ``dst`` 数组条目填入::
-
-     dst[i] = migrate_pfn(page_to_pfn(dpage));
-
-   现在驱动程序知道这个页面正在被迁移，它可以使设备私有 MMU 映射无效并将设备私有
-   内存复制到系统内存或另一个设备私有页面。由于核心 Linux 内核会处理 CPU 页表失
-   效，因此设备驱动程序只需使其自己的 MMU 映射失效。
-
-   驱动程序可以使用 ``migrate_pfn_to_page(src[i])`` 来获取源设备的
-   ``struct page`` 面，并将源页面复制到目标设备上，如果指针为 ``NULL`` ，意
-   味着源页面没有被填充到系统内存中，则清除目标设备的私有内存。
-
-4. ``migrate_vma_pages()``
-
-   这一步是实际“提交”迁移的地方。
-
-   如果源页是 ``pte_none()`` 或 ``is_zero_pfn()`` 页，这时新分配的页会被插
-   入到CPU的页表中。如果一个CPU线程在同一页面上发生异常，这可能会失败。然而，页
-   表被锁定，只有一个新页会被插入。如果它失去了竞争，设备驱动将看到
-   ``MIGRATE_PFN_MIGRATE`` 位被清除。
-
-   如果源页被锁定、隔离等，源 ``struct page`` 信息现在被复制到目标
-   ``struct page`` ，最终完成CPU端的迁移。
-
-5. 设备驱动为仍在迁移的页面更新设备MMU页表，回滚未迁移的页面。
-
-   如果 ``src`` 条目仍然有 ``MIGRATE_PFN_MIGRATE`` 位被设置，设备驱动可以
-   更新设备MMU，如果 ``MIGRATE_PFN_WRITE`` 位被设置，则设置写启用位。
-
-6. ``migrate_vma_finalize()``
-
-   这一步用新页的页表项替换特殊的迁移页表项，并释放对源和目的 ``struct page``
-   的引用。
-
-7. ``mmap_read_unlock()``
-
-   现在可以释放锁了。
-
-独占访问存储器
-==============
-
-一些设备具有诸如原子PTE位的功能，可以用来实现对系统内存的原子访问。为了支持对一
-个共享的虚拟内存页的原子操作，这样的设备需要对该页的访问是排他的，而不是来自CPU
-的任何用户空间访问。  ``make_device_exclusive_range()`` 函数可以用来使一
-个内存范围不能从用户空间访问。
-
-这将用特殊的交换条目替换给定范围内的所有页的映射。任何试图访问交换条目的行为都会
-导致一个异常，该异常会通过用原始映射替换该条目而得到恢复。驱动程序会被通知映射已
-经被MMU通知器改变，之后它将不再有对该页的独占访问。独占访问被保证持续到驱动程序
-放弃页面锁和页面引用为止，这时页面上的任何CPU异常都可以按所述进行。
-
-内存 cgroup (memcg) 和 rss 统计
-===============================
-
-目前，设备内存被视为 rss 计数器中的任何常规页面（如果设备页面用于匿名，则为匿名，
-如果设备页面用于文件支持页面，则为文件，如果设备页面用于共享内存，则为 shmem）。
-这是为了保持现有应用程序的故意选择，这些应用程序可能在不知情的情况下开始使用设备
-内存，运行不受影响。
-
-一个缺点是 OOM 杀手可能会杀死使用大量设备内存而不是大量常规系统内存的应用程序，
-因此不会释放太多系统内存。在决定以不同方式计算设备内存之前，我们希望收集更多关
-于应用程序和系统在存在设备内存的情况下在内存压力下如何反应的实际经验。
-
-对内存 cgroup 做出了相同的决定。设备内存页面根据相同的内存 cgroup 计算，常规
-页面将被计算在内。这确实简化了进出设备内存的迁移。这也意味着从设备内存迁移回常规
-内存不会失败，因为它会超过内存 cgroup 限制。一旦我们对设备内存的使用方式及其对
-内存资源控制的影响有了更多的了解，我们可能会在后面重新考虑这个选择。
-
-请注意，设备内存永远不能由设备驱动程序或通过 GUP 固定，因此此类内存在进程退出时
-总是被释放的。或者在共享内存或文件支持内存的情况下，当删除最后一个引用时。
diff --git a/Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst b/Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst
deleted file mode 100644
index c6d471ce2131..000000000000
--- a/Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst
+++ /dev/null
@@ -1,436 +0,0 @@
-.. include:: ../disclaimer-zh_CN.rst
-
-:Original: Documentation/vm/hugetlbfs_reserv.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-==============
-Hugetlbfs 预留
-==============
-
-概述
-====
-
-:ref:`hugetlbpage` 中描述的巨页通常是预先分配给应用程序使用的。如果VMA指
-示要使用巨页，这些巨页会在缺页异常时被实例化到任务的地址空间。如果在缺页异常
-时没有巨页存在，任务就会被发送一个SIGBUS，并经常不高兴地死去。在加入巨页支
-持后不久，人们决定，在mmap()时检测巨页的短缺情况会更好。这个想法是，如果
-没有足够的巨页来覆盖映射，mmap()将失败。这首先是在mmap()时在代码中做一个
-简单的检查，以确定是否有足够的空闲巨页来覆盖映射。就像内核中的大多数东西一
-样，代码随着时间的推移而不断发展。然而，基本的想法是在mmap()时 “预留”
-巨页，以确保巨页可以用于该映射中的缺页异常。下面的描述试图描述在v4.10内核
-中是如何进行巨页预留处理的。
-
-
-读者
-====
-这个描述主要是针对正在修改hugetlbfs代码的内核开发者。
-
-
-数据结构
-========
-
-resv_huge_pages
-	这是一个全局的（per-hstate）预留的巨页的计数。预留的巨页只对预留它们的任
-	务可用。因此，一般可用的巨页的数量被计算为(``free_huge_pages - resv_huge_pages``)。
-Reserve Map
-	预留映射由以下结构体描述::
-
-		struct resv_map {
-			struct kref refs;
-			spinlock_t lock;
-			struct list_head regions;
-			long adds_in_progress;
-			struct list_head region_cache;
-			long region_cache_count;
-		};
-
-	系统中每个巨页映射都有一个预留映射。resv_map中的regions列表描述了映射中的
-	区域。一个区域被描述为::
-
-		struct file_region {
-			struct list_head link;
-			long from;
-			long to;
-		};
-
-	file_region结构体的 ‘from’ 和 ‘to’ 字段是进入映射的巨页索引。根据映射的类型，在
-	reserv_map 中的一个区域可能表示该范围存在预留，或预留不存在。
-Flags for MAP_PRIVATE Reservations
-	这些被存储在预留的映射指针的底部。
-
-	``#define HPAGE_RESV_OWNER    (1UL << 0)``
-		表示该任务是与该映射相关的预留的所有者。
-	``#define HPAGE_RESV_UNMAPPED (1UL << 1)``
-		表示最初映射此范围（并创建储备）的任务由于COW失败而从该任务（子任务）中取消映
-		射了一个页面。
-Page Flags
-	PagePrivate页面标志是用来指示在释放巨页时必须恢复巨页的预留。更多细节将在
-	“释放巨页” 一节中讨论。
-
-
-预留映射位置（私有或共享）
-==========================
-
-一个巨页映射或段要么是私有的，要么是共享的。如果是私有的，它通常只对一个地址空间
-（任务）可用。如果是共享的，它可以被映射到多个地址空间（任务）。对于这两种类型的映射，
-预留映射的位置和语义是明显不同的。位置的差异是：
-
-- 对于私有映射，预留映射挂在VMA结构体上。具体来说，就是vma->vm_private_data。这个保
-  留映射是在创建映射（mmap(MAP_PRIVATE)）时创建的。
-- 对于共享映射，预留映射挂在inode上。具体来说，就是inode->i_mapping->private_data。
-  由于共享映射总是由hugetlbfs文件系统中的文件支持，hugetlbfs代码确保每个节点包含一个预
-  留映射。因此，预留映射在创建节点时被分配。
-
-
-创建预留
-========
-当创建一个巨大的有页面支持的共享内存段（shmget(SHM_HUGETLB)）或通过mmap(MAP_HUGETLB)
-创建一个映射时，就会创建预留。这些操作会导致对函数hugetlb_reserve_pages()的调用::
-
-	int hugetlb_reserve_pages(struct inode *inode,
-				  long from, long to,
-				  struct vm_area_struct *vma,
-				  vm_flags_t vm_flags)
-
-hugetlb_reserve_pages()做的第一件事是检查在调用shmget()或mmap()时是否指定了NORESERVE
-标志。如果指定了NORESERVE，那么这个函数立即返回，因为不需要预留。
-
-参数'from'和'to'是映射或基础文件的巨页索引。对于shmget()，'from'总是0，'to'对应于段/映射
-的长度。对于mmap()，offset参数可以用来指定进入底层文件的偏移量。在这种情况下，'from'和'to'
-参数已经被这个偏移量所调整。
-
-PRIVATE和SHARED映射之间的一个很大的区别是预留在预留映射中的表示方式。
-
-- 对于共享映射，预留映射中的条目表示对应页面的预留存在或曾经存在。当预留被消耗时，预留映射不被
-  修改。
-- 对于私有映射，预留映射中没有条目表示相应页面存在预留。随着预留被消耗，条目被添加到预留映射中。
-  因此，预留映射也可用于确定哪些预留已被消耗。
-
-对于私有映射，hugetlb_reserve_pages()创建预留映射并将其挂在VMA结构体上。此外，
-HPAGE_RESV_OWNER标志被设置，以表明该VMA拥有预留。
-
-预留映射被查阅以确定当前映射/段需要多少巨页预留。对于私有映射，这始终是一个值（to - from）。
-然而，对于共享映射来说，一些预留可能已经存在于(to - from)的范围内。关于如何实现这一点的细节，
-请参见 :ref:`预留映射的修改 <resv_map_modifications>` 一节。
-
-该映射可能与一个子池（subpool）相关联。如果是这样，将查询子池以确保有足够的空间用于映射。子池
-有可能已经预留了可用于映射的预留空间。更多细节请参见 :ref: `子池预留 <sub_pool_resv>`
-一节。
-
-在咨询了预留映射和子池之后，就知道了需要的新预留数量。hugetlb_acct_memory()函数被调用以检查
-并获取所要求的预留数量。hugetlb_acct_memory()调用到可能分配和调整剩余页数的函数。然而，在这
-些函数中，代码只是检查以确保有足够的空闲的巨页来容纳预留。如果有的话，全局预留计数resv_huge_pages
-会被调整，如下所示::
-
-	if (resv_needed <= (resv_huge_pages - free_huge_pages))
-		resv_huge_pages += resv_needed;
-
-注意，在检查和调整这些计数器时，全局锁hugetlb_lock会被预留。
-
-如果有足够的空闲的巨页，并且全局计数resv_huge_pages被调整，那么与映射相关的预留映射被修改以
-反映预留。在共享映射的情况下，将存在一个file_region，包括'from'-'to'范围。对于私有映射，
-不对预留映射进行修改，因为没有条目表示存在预留。
-
-如果hugetlb_reserve_pages()成功，全局预留数和与映射相关的预留映射将根据需要被修改，以确保
-在'from'-'to'范围内存在预留。
-
-消耗预留/分配一个巨页
-===========================
-
-当与预留相关的巨页在相应的映射中被分配和实例化时，预留就被消耗了。该分配是在函数alloc_huge_page()
-中进行的::
-
-	struct page *alloc_huge_page(struct vm_area_struct *vma,
-				     unsigned long addr, int avoid_reserve)
-
-alloc_huge_page被传递给一个VMA指针和一个虚拟地址，因此它可以查阅预留映射以确定是否存在预留。
-此外，alloc_huge_page需要一个参数avoid_reserve，该参数表示即使看起来已经为指定的地址预留了
-预留，也不应该使用预留。avoid_reserve参数最常被用于写时拷贝和页面迁移的情况下，即现有页面的额
-外拷贝被分配。
-
-
-调用辅助函数vma_needs_reservation()来确定是否存在对映射(vma)中地址的预留。关于这个函数的详
-细内容，请参见 :ref:`预留映射帮助函数 <resv_map_helpers>` 一节。从
-vma_needs_reservation()返回的值通常为0或1。如果该地址存在预留，则为0，如果不存在预留，则为1。
-如果不存在预留，并且有一个与映射相关联的子池，则查询子池以确定它是否包含预留。如果子池包含预留，
-则可将其中一个用于该分配。然而，在任何情况下，avoid_reserve参数都会优先考虑为分配使用预留。在
-确定预留是否存在并可用于分配后，调用dequeue_huge_page_vma()函数。这个函数需要两个与预留有关
-的参数：
-
-- avoid_reserve，这是传递给alloc_huge_page()的同一个值/参数。
-- chg，尽管这个参数的类型是long，但只有0或1的值被传递给dequeue_huge_page_vma。如果该值为0，
-  则表明存在预留（关于可能的问题，请参见 “预留和内存策略” 一节）。如果值
-  为1，则表示不存在预留，如果可能的话，必须从全局空闲池中取出该页。
-
-与VMA的内存策略相关的空闲列表被搜索到一个空闲页。如果找到了一个页面，当该页面从空闲列表中移除时，
-free_huge_pages的值被递减。如果有一个与该页相关的预留，将进行以下调整::
-
-	SetPagePrivate(page);	/* 表示分配这个页面消耗了一个预留，
-				 * 如果遇到错误，以至于必须释放这个页面，预留将被
-				 * 恢复。 */
-	resv_huge_pages--;	/* 减少全局预留计数 */
-
-注意，如果找不到满足VMA内存策略的巨页，将尝试使用伙伴分配器分配一个。这就带来了超出预留范围
-的剩余巨页和超额分配的问题。即使分配了一个多余的页面，也会进行与上面一样的基于预留的调整:
-SetPagePrivate(page) 和 resv_huge_pages--.
-
-在获得一个新的巨页后，(page)->private被设置为与该页面相关的子池的值，如果它存在的话。当页
-面被释放时，这将被用于子池的计数。
-
-然后调用函数vma_commit_reservation()，根据预留的消耗情况调整预留映射。一般来说，这涉及
-到确保页面在区域映射的file_region结构体中被表示。对于预留存在的共享映射，预留映射中的条目
-已经存在，所以不做任何改变。然而，如果共享映射中没有预留，或者这是一个私有映射，则必须创建一
-个新的条目。
-
-注意，如果找不到满足VMA内存策略的巨页，将尝试使用伙伴分配器分配一个。这就带来了超出预留范围
-的剩余巨页和过度分配的问题。即使分配了一个多余的页面，也会进行与上面一样的基于预留的调整。
-SetPagePrivate(page)和resv_huge_pages-。
-
-在获得一个新的巨页后，(page)->private被设置为与该页面相关的子池的值，如果它存在的话。当页
-面被释放时，这将被用于子池的计数。
-
-然后调用函数vma_commit_reservation()，根据预留的消耗情况调整预留映射。一般来说，这涉及
-到确保页面在区域映射的file_region结构体中被表示。对于预留存在的共享映射，预留映射中的条目
-已经存在，所以不做任何改变。然而，如果共享映射中没有预留，或者这是一个私有映射，则必须创建
-一个新的条目。
-
-在alloc_huge_page()开始调用vma_needs_reservation()和页面分配后调用
-vma_commit_reservation()之间，预留映射有可能被改变。如果hugetlb_reserve_pages在共
-享映射中为同一页面被调用，这将是可能的。在这种情况下，预留计数和子池空闲页计数会有一个偏差。
-这种罕见的情况可以通过比较vma_needs_reservation和vma_commit_reservation的返回值来
-识别。如果检测到这种竞争，子池和全局预留计数将被调整以进行补偿。关于这些函数的更多信息，请
-参见 :ref:`预留映射帮助函数 <resv_map_helpers>` 一节。
-
-
-实例化巨页
-==========
-
-在巨页分配之后，页面通常被添加到分配任务的页表中。在此之前，共享映射中的页面被添加到页面缓
-存中，私有映射中的页面被添加到匿名反向映射中。在这两种情况下，PagePrivate标志被清除。因此，
-当一个已经实例化的巨页被释放时，不会对全局预留计数（resv_huge_pages）进行调整。
-
-
-释放巨页
-========
-
-巨页释放是由函数free_huge_page()执行的。这个函数是hugetlbfs复合页的析构器。因此，它只传
-递一个指向页面结构体的指针。当一个巨页被释放时，可能需要进行预留计算。如果该页与包含保
-留的子池相关联，或者该页在错误路径上被释放，必须恢复全局预留计数，就会出现这种情况。
-
-page->private字段指向与该页相关的任何子池。如果PagePrivate标志被设置，它表明全局预留计数
-应该被调整（关于如何设置这些标志的信息，请参见
-:ref: `消耗预留/分配一个巨页 <consume_resv>` ）。
-
-
-该函数首先调用hugepage_subpool_put_pages()来处理该页。如果这个函数返回一个0的值（不等于
-传递的1的值），它表明预留与子池相关联，这个新释放的页面必须被用来保持子池预留的数量超过最小值。
-因此，在这种情况下，全局resv_huge_pages计数器被递增。
-
-如果页面中设置了PagePrivate标志，那么全局resv_huge_pages计数器将永远被递增。
-
-子池预留
-========
-
-有一个结构体hstate与每个巨页尺寸相关联。hstate跟踪所有指定大小的巨页。一个子池代表一
-个hstate中的页面子集，它与一个已挂载的hugetlbfs文件系统相关
-
-当一个hugetlbfs文件系统被挂载时，可以指定min_size选项，它表示文件系统所需的最小的巨页数量。
-如果指定了这个选项，与min_size相对应的巨页的数量将被预留给文件系统使用。这个数字在结构体
-hugepage_subpool的min_hpages字段中被跟踪。在挂载时，hugetlb_acct_memory(min_hpages)
-被调用以预留指定数量的巨页。如果它们不能被预留，挂载就会失败。
-
-当从子池中获取或释放页面时，会调用hugepage_subpool_get/put_pages()函数。
-hugepage_subpool_get/put_pages被传递给巨页数量，以此来调整子池的 “已用页面” 计数
-（get为下降，put为上升）。通常情况下，如果子池中没有足够的页面，它们会返回与传递的相同的值或
-一个错误。
-
-然而，如果预留与子池相关联，可能会返回一个小于传递值的返回值。这个返回值表示必须进行的额外全局
-池调整的数量。例如，假设一个子池包含3个预留的巨页，有人要求5个。与子池相关的3个预留页可以用来
-满足部分请求。但是，必须从全局池中获得2个页面。为了向调用者转达这一信息，将返回值2。然后，调用
-者要负责从全局池中获取另外两个页面。
-
-
-COW和预留
-==========
-
-由于共享映射都指向并使用相同的底层页面，COW最大的预留问题是私有映射。在这种情况下，两个任务可
-以指向同一个先前分配的页面。一个任务试图写到该页，所以必须分配一个新的页，以便每个任务都指向它
-自己的页。
-
-当该页最初被分配时，该页的预留被消耗了。当由于COW而试图分配一个新的页面时，有可能没有空闲的巨
-页，分配会失败。
-
-当最初创建私有映射时，通过设置所有者的预留映射指针中的HPAGE_RESV_OWNER位来标记映射的所有者。
-由于所有者创建了映射，所有者拥有与映射相关的所有预留。因此，当一个写异常发生并且没有可用的页面
-时，对预留的所有者和非所有者采取不同的行动。
-
-在发生异常的任务不是所有者的情况下，异常将失败，该任务通常会收到一个SIGBUS。
-
-如果所有者是发生异常的任务，我们希望它能够成功，因为它拥有原始的预留。为了达到这个目的，该页被
-从非所有者任务中解映射出来。这样一来，唯一的引用就是来自拥有者的任务。此外，HPAGE_RESV_UNMAPPED
-位被设置在非拥有任务的预留映射指针中。如果非拥有者任务后来在一个不存在的页面上发生异常，它可能
-会收到一个SIGBUS。但是，映射/预留的原始拥有者的行为将与预期一致。
-
-预留映射的修改
-==============
-
-以下低级函数用于对预留映射进行修改。通常情况下，这些函数不会被直接调用。而是调用一个预留映射辅
-助函数，该函数调用这些低级函数中的一个。这些低级函数在源代码（mm/hugetlb.c）中得到了相当好的
-记录。这些函数是::
-
-	long region_chg(struct resv_map *resv, long f, long t);
-	long region_add(struct resv_map *resv, long f, long t);
-	void region_abort(struct resv_map *resv, long f, long t);
-	long region_count(struct resv_map *resv, long f, long t);
-
-在预留映射上的操作通常涉及两个操作:
-
-1) region_chg()被调用来检查预留映射，并确定在指定的范围[f, t]内有多少页目前没有被代表。
-
-   调用代码执行全局检查和分配，以确定是否有足够的巨页使操作成功。
-
-2)
-  a) 如果操作能够成功，regi_add()将被调用，以实际修改先前传递给regi_chg()的相同范围
-     [f, t]的预留映射。
-  b) 如果操作不能成功，region_abort被调用，在相同的范围[f, t]内中止操作。
-
-注意，这是一个两步的过程， region_add()和 region_abort()在事先调用 region_chg()后保证
-成功。 region_chg()负责预先分配任何必要的数据结构以确保后续操作（特别是 region_add()）的
-成功。
-
-如上所述，region_chg()确定该范围内当前没有在映射中表示的页面的数量。region_add()返回添加
-到映射中的范围内的页数。在大多数情况下， region_add() 的返回值与 region_chg() 的返回值相
-同。然而，在共享映射的情况下，有可能在调用 region_chg() 和 region_add() 之间对预留映射进
-行更改。在这种情况下，regi_add()的返回值将与regi_chg()的返回值不符。在这种情况下，全局计数
-和子池计数很可能是不正确的，需要调整。检查这种情况并进行适当的调整是调用者的责任。
-
-函数region_del()被调用以从预留映射中移除区域。
-它通常在以下情况下被调用:
-
-- 当hugetlbfs文件系统中的一个文件被删除时，该节点将被释放，预留映射也被释放。在释放预留映射
-  之前，所有单独的file_region结构体必须被释放。在这种情况下，region_del的范围是[0, LONG_MAX]。
-- 当一个hugetlbfs文件正在被截断时。在这种情况下，所有在新文件大小之后分配的页面必须被释放。
-  此外，预留映射中任何超过新文件大小的file_region条目必须被删除。在这种情况下，region_del
-  的范围是[new_end_of_file, LONG_MAX]。
-- 当在一个hugetlbfs文件中打洞时。在这种情况下，巨页被一次次从文件的中间移除。当这些页被移除
-  时，region_del()被调用以从预留映射中移除相应的条目。在这种情况下，region_del被传递的范
-  围是[page_idx, page_idx + 1]。
-
-在任何情况下，region_del()都会返回从预留映射中删除的页面数量。在非常罕见的情况下，region_del()
-会失败。这只能发生在打洞的情况下，即它必须分割一个现有的file_region条目，而不能分配一个新的
-结构体。在这种错误情况下，region_del()将返回-ENOMEM。这里的问题是，预留映射将显示对该页有
-预留。然而，子池和全局预留计数将不反映该预留。为了处理这种情况，调用函数hugetlb_fix_reserve_counts()
-来调整计数器，使其与不能被删除的预留映射条目相对应。
-
-region_count()在解除私有巨页映射时被调用。在私有映射中，预留映射中没有条目表明存在一个预留。
-因此，通过计算预留映射中的条目数，我们知道有多少预留被消耗了，有多少预留是未完成的
-（Outstanding = (end - start) - region_count（resv, start, end））。由于映射正在消
-失，子池和全局预留计数被未完成的预留数量所减去。
-
-预留映射帮助函数
-================
-
-有几个辅助函数可以查询和修改预留映射。这些函数只对特定的巨页的预留感兴趣，所以它们只是传入一个
-地址而不是一个范围。此外，它们还传入相关的VMA。从VMA中，可以确定映射的类型（私有或共享）和预留
-映射的位置（inode或VMA）。这些函数只是调用 “预留映射的修改” 一节中描述的基础函数。然而，
-它们确实考虑到了私有和共享映射的预留映射条目的 “相反” 含义，并向调用者隐藏了这个细节::
-
-	long vma_needs_reservation(struct hstate *h,
-				   struct vm_area_struct *vma,
-				   unsigned long addr)
-
-该函数为指定的页面调用 region_chg()。如果不存在预留，则返回1。如果存在预留，则返回0::
-
-	long vma_commit_reservation(struct hstate *h,
-				    struct vm_area_struct *vma,
-				    unsigned long addr)
-
-这将调用 region_add()，用于指定的页面。与region_chg和region_add的情况一样，该函数应在
-先前调用的vma_needs_reservation后调用。它将为该页添加一个预留条目。如果预留被添加，它将
-返回1，如果没有则返回0。返回值应与之前调用vma_needs_reservation的返回值进行比较。如果出
-现意外的差异，说明在两次调用之间修改了预留映射::
-
-	void vma_end_reservation(struct hstate *h,
-				 struct vm_area_struct *vma,
-				 unsigned long addr)
-
-这将调用指定页面的 region_abort()。与region_chg和region_abort的情况一样，该函数应在
-先前调用的vma_needs_reservation后被调用。它将中止/结束正在进行的预留添加操作::
-
-	long vma_add_reservation(struct hstate *h,
-				 struct vm_area_struct *vma,
-				 unsigned long addr)
-
-这是一个特殊的包装函数，有助于在错误路径上清理预留。它只从repare_reserve_on_error()函数
-中调用。该函数与vma_needs_reservation一起使用，试图将一个预留添加到预留映射中。它考虑到
-了私有和共享映射的不同预留映射语义。因此，region_add被调用用于共享映射（因为映射中的条目表
-示预留），而region_del被调用用于私有映射（因为映射中没有条目表示预留）。关于在错误路径上需
-要做什么的更多信息，请参见  “错误路径中的预留清理”  。
-
-
-错误路径中的预留清理
-====================
-
-正如在:ref:`预留映射帮助函数<resv_map_helpers>` 一节中提到的，预留的修改分两步进行。首
-先，在分配页面之前调用vma_needs_reservation。如果分配成功，则调用vma_commit_reservation。
-如果不是，则调用vma_end_reservation。全局和子池的预留计数根据操作的成功或失败进行调整，
-一切都很好。
-
-此外，在一个巨页被实例化后，PagePrivate标志被清空，这样，当页面最终被释放时，计数是
-正确的。
-
-然而，有几种情况是，在一个巨页被分配后，但在它被实例化之前，就遇到了错误。在这种情况下，
-页面分配已经消耗了预留，并进行了适当的子池、预留映射和全局计数调整。如果页面在这个时候被释放
-（在实例化和清除PagePrivate之前），那么free_huge_page将增加全局预留计数。然而，预留映射
-显示报留被消耗了。这种不一致的状态将导致预留的巨页的 “泄漏” 。全局预留计数将比它原本的要高，
-并阻止分配一个预先分配的页面。
-
-函数 restore_reserve_on_error() 试图处理这种情况。它有相当完善的文档。这个函数的目的
-是将预留映射恢复到页面分配前的状态。通过这种方式，预留映射的状态将与页面释放后的全局预留计
-数相对应。
-
-函数restore_reserve_on_error本身在试图恢复预留映射条目时可能会遇到错误。在这种情况下，
-它将简单地清除该页的PagePrivate标志。这样一来，当页面被释放时，全局预留计数将不会被递增。
-然而，预留映射将继续看起来像预留被消耗了一样。一个页面仍然可以被分配到该地址，但它不会像最
-初设想的那样使用一个预留页。
-
-有一些代码（最明显的是userfaultfd）不能调用restore_reserve_on_error。在这种情况下，
-它简单地修改了PagePrivate，以便在释放巨页时不会泄露预留。
-
-
-预留和内存策略
-==============
-当git第一次被用来管理Linux代码时，每个节点的巨页列表就存在于hstate结构中。预留的概念是
-在一段时间后加入的。当预留被添加时，没有尝试将内存策略考虑在内。虽然cpusets与内存策略不
-完全相同，但hugetlb_acct_memory中的这个注释总结了预留和cpusets/内存策略之间的相互作
-用::
-
-
-	/*
-	 * 当cpuset被配置时，它打破了严格的hugetlb页面预留，因为计数是在一个全局变量上完
-	 * 成的。在有cpuset的情况下，这样的预留完全是垃圾，因为预留没有根据当前cpuset的
-	 * 页面可用性来检查。在任务所在的cpuset中缺乏空闲的htlb页面时，应用程序仍然有可能
-	 * 被内核OOM'ed。试图用cpuset来执行严格的计数几乎是不可能的（或者说太难看了），因
-	 * 为cpuset太不稳定了，任务或内存节点可以在cpuset之间动态移动。与cpuset共享
-	 * hugetlb映射的语义变化是不可取的。然而，为了预留一些语义，我们退回到检查当前空闲
-	 * 页的可用性，作为一种最好的尝试，希望能将cpuset改变语义的影响降到最低。
-	 */
-
-添加巨页预留是为了防止在缺页异常时出现意外的页面分配失败（OOM）。然而，如果一个应用
-程序使用cpusets或内存策略，就不能保证在所需的节点上有巨页可用。即使有足够数量的全局
-预留，也是如此。
-
-Hugetlbfs回归测试
-=================
-
-最完整的hugetlb测试集在libhugetlbfs仓库。如果你修改了任何hugetlb相关的代码，请使用
-libhugetlbfs测试套件来检查回归情况。此外，如果你添加了任何新的hugetlb功能，请在
-libhugetlbfs中添加适当的测试。
-
---
-Mike Kravetz，2017年4月7日
diff --git a/Documentation/translations/zh_CN/vm/hwpoison.rst b/Documentation/translations/zh_CN/vm/hwpoison.rst
deleted file mode 100644
index c6e1e7bdb05b..000000000000
--- a/Documentation/translations/zh_CN/vm/hwpoison.rst
+++ /dev/null
@@ -1,166 +0,0 @@
-
-:Original: Documentation/vm/hwpoison.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-========
-hwpoison
-========
-
-什么是hwpoison?
-===============
-
-
-即将推出的英特尔CPU支持从一些内存错误中恢复（ ``MCA恢复`` ）。这需要操作系统宣布
-一个页面"poisoned"，杀死与之相关的进程，并避免在未来使用它。
-
-这个补丁包在虚拟机中实现了必要的(编程)框架。
-
-引用概述中的评论::
-
-	高级机器的检查与处理。处理方法是损坏的页面被硬件报告，通常是由于2位ECC内
-	存或高速缓存故障。
-
-	这主要是针对在后台检测到的损坏的页面。当当前的CPU试图访问它时，当前运行的进程
-	可以直接被杀死。因为还没有访问损坏的页面, 如果错误由于某种原因不能被处理，就可
-	以安全地忽略它. 而不是用另外一个机器检查去处理它。
-
-	处理不同状态的页面缓存页。这里棘手的部分是，相对于其他虚拟内存用户， 我们可以异
-	步访问任何页面。因为内存故障可能随时随地发生，可能违反了他们的一些假设。这就是
-	为什么这段代码必须非常小心。一般来说，它试图使用正常的锁规则，如获得标准锁，即使
-	这意味着错误处理可能需要很长的时间。
-
-	这里的一些操作有点低效，并且具有非线性的算法复杂性，因为数据结构没有针对这种情
-	况进行优化。特别是从vma到进程的映射就是这种情况。由于这种情况大概率是罕见的，所
-	以我们希望我们可以摆脱这种情况。
-
-该代码由mm/memory-failure.c中的高级处理程序、一个新的页面poison位和虚拟机中的
-各种检查组成，用来处理poison的页面。
-
-现在主要目标是KVM客户机，但它适用于所有类型的应用程序。支持KVM需要最近的qemu-kvm
-版本。
-
-对于KVM的使用，需要一个新的信号类型，这样KVM就可以用适当的地址将机器检查注入到客户
-机中。这在理论上也允许其他应用程序处理内存故障。我们的期望是，所有的应用程序都不要这
-样做，但一些非常专业的应用程序可能会这样做。
-
-故障恢复模式
-============
-
-有两种（实际上是三种）模式的内存故障恢复可以在。
-
-vm.memory_failure_recovery sysctl 置零:
-	所有的内存故障都会导致panic。请不要尝试恢复。
-
-早期处理
-	(可以在全局和每个进程中控制) 一旦检测到错误，立即向应用程序发送SIGBUS这允许
-	应用程序以温和的方式处理内存错误（例如，放弃受影响的对象） 这是KVM qemu使用的
-	模式。
-
-推迟处理
-	当应用程序运行到损坏的页面时，发送SIGBUS。这对不知道内存错误的应用程序来说是
-	最好的，默认情况下注意一些页面总是被当作late kill处理。
-
-用户控制
-========
-
-vm.memory_failure_recovery
-	参阅 sysctl.txt
-
-vm.memory_failure_early_kill
-	全局启用early kill
-
-PR_MCE_KILL
-	设置early/late kill mode/revert 到系统默认值。
-
-	arg1: PR_MCE_KILL_CLEAR:
-		恢复到系统默认值
-	arg1: PR_MCE_KILL_SET:
-		arg2定义了线程特定模式
-
-		PR_MCE_KILL_EARLY:
-			Early kill
-		PR_MCE_KILL_LATE:
-			Late kill
-		PR_MCE_KILL_DEFAULT
-			使用系统全局默认值
-
-	注意，如果你想有一个专门的线程代表进程处理SIGBUS(BUS_MCEERR_AO)，你应该在
-	指定线程上调用prctl(PR_MCE_KILL_EARLY)。否则，SIGBUS将被发送到主线程。
-
-PR_MCE_KILL_GET
-	返回当前模式
-
-测试
-====
-
-* madvise(MADV_HWPOISON, ....) (as root) - 在测试过程中Poison一个页面
-
-* 通过debugfs ``/sys/kernel/debug/hwpoison/`` hwpoison-inject模块
-
-  corrupt-pfn
-	在PFN处注入hwpoison故障，并echoed到这个文件。这做了一些早期过滤，以避
-	免在测试套件中损坏非预期页面。
-  unpoison-pfn
-	在PFN的Software-unpoison页面对应到这个文件。这样，一个页面可以再次被
-	复用。这只对Linux注入的故障起作用，对真正的内存故障不起作用。
-
-  注意这些注入接口并不稳定，可能会在不同的内核版本中发生变化
-
-  corrupt-filter-dev-major, corrupt-filter-dev-minor
-	只处理与块设备major/minor定义的文件系统相关的页面的内存故障。-1U是通
-	配符值。这应该只用于人工注入的测试。
-
-  corrupt-filter-memcg
-	限制注入到memgroup拥有的页面。由memcg的inode号指定。
-
-	Example::
-
-		mkdir /sys/fs/cgroup/mem/hwpoison
-
-	        usemem -m 100 -s 1000 &
-		echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks
-
-		memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ')
-		echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg
-
-		page-types -p `pidof init`   --hwpoison  # shall do nothing
-		page-types -p `pidof usemem` --hwpoison  # poison its pages
-
-  corrupt-filter-flags-mask, corrupt-filter-flags-value
-	当指定时，只有在((page_flags & mask) == value)的情况下才会poison页面。
-	这允许对许多种类的页面进行压力测试。page_flags与/proc/kpageflags中的相
-	同。这些标志位在include/linux/kernel-page-flags.h中定义，并在
-	Documentation/admin-guide/mm/pagemap.rst中记录。
-
-* 架构特定的MCE注入器
-
-  x86 有 mce-inject, mce-test
-
-  在mce-test中的一些便携式hwpoison测试程序，见下文。
-
-引用
-====
-
-http://halobates.de/mce-lc09-2.pdf
-	09年LinuxCon的概述演讲
-
-git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git
-	测试套件（在tsrc中的hwpoison特定可移植测试）。
-
-git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git
-	x86特定的注入器
-
-
-限制
-====
-- 不是所有的页面类型都被支持，而且永远不会。大多数内核内部对象不能被恢
-  复，目前只有LRU页。
-
----
-Andi Kleen, 2009年10月
diff --git a/Documentation/translations/zh_CN/vm/index.rst b/Documentation/translations/zh_CN/vm/index.rst
deleted file mode 100644
index a1c6d529b6ff..000000000000
--- a/Documentation/translations/zh_CN/vm/index.rst
+++ /dev/null
@@ -1,54 +0,0 @@
-.. include:: ../disclaimer-zh_CN.rst
-
-:Original: Documentation/vm/index.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-=================
-Linux内存管理文档
-=================
-
-这是一个关于Linux内存管理（mm）子系统内部的文档集，其中有不同层次的细节，包括注释
-和邮件列表的回复，用于阐述数据结构和算法的基本情况。如果你正在寻找关于简单分配内存的建
-议，请参阅(Documentation/translations/zh_CN/core-api/memory-allocation.rst)。
-对于控制和调整指南，请参阅(Documentation/admin-guide/mm/index)。
-TODO:待引用文档集被翻译完毕后请及时修改此处）
-
-.. toctree::
-   :maxdepth: 1
-
-   active_mm
-   balance
-   damon/index
-   free_page_reporting
-   highmem
-   ksm
-   frontswap
-   hmm
-   hwpoison
-   hugetlbfs_reserv
-   memory-model
-   mmu_notifier
-   numa
-   overcommit-accounting
-   page_frags
-   page_owner
-   page_table_check
-   remap_file_pages
-   split_page_table_lock
-   z3fold
-   zsmalloc
-
-TODOLIST:
-* arch_pgtable_helpers
-* free_page_reporting
-* hugetlbfs_reserv
-* page_migration
-* slub
-* transhuge
-* unevictable-lru
-* vmalloced-kernel-stacks
diff --git a/Documentation/translations/zh_CN/vm/ksm.rst b/Documentation/translations/zh_CN/vm/ksm.rst
deleted file mode 100644
index 83b0c73984da..000000000000
--- a/Documentation/translations/zh_CN/vm/ksm.rst
+++ /dev/null
@@ -1,70 +0,0 @@
-.. include:: ../disclaimer-zh_CN.rst
-
-:Original: Documentation/vm/ksm.rst
-
-:翻译:
-
-   徐鑫 xu xin <xu.xin16@zte.com.cn>
-
-============
-内核同页合并
-============
-
-KSM 是一种节省内存的数据去重功能，由CONFIG_KSM=y启用，并在2.6.32版本时被添加
-到Linux内核。详见 ``mm/ksm.c`` 的实现，以及http://lwn.net/Articles/306704和
-https://lwn.net/Articles/330589
-
-KSM的用户空间的接口在Documentation/translations/zh_CN/admin-guide/mm/ksm.rst
-文档中有描述。
-
-设计
-====
-
-概述
-----
-
-概述内容请见mm/ksm.c文档中的“DOC: Overview”
-
-逆映射
-------
-KSM维护着稳定树中的KSM页的逆映射信息。
-
-当KSM页面的共享数小于 ``max_page_sharing`` 的虚拟内存区域(VMAs)时，则代表了
-KSM页的稳定树其中的节点指向了一个rmap_item结构体类型的列表。同时，这个KSM页
-的 ``page->mapping`` 指向了该稳定树节点。
-
-如果共享数超过了阈值，KSM将给稳定树添加第二个维度。稳定树就变成链接一个或多
-个稳定树"副本"的"链"。每个副本都保留KSM页的逆映射信息，其中 ``page->mapping``
-指向该"副本"。
-
-每个链以及链接到该链中的所有"副本"强制不变的是，它们代表了相同的写保护内存
-内容，尽管任中一个"副本"是由同一片内存区的不同的KSM复制页所指向的。
-
-这样一来，相比与无限的逆映射链表，稳定树的查找计算复杂性不受影响。但在稳定树
-本身中不能有重复的KSM页面内容仍然是强制要求。
-
-由 ``max_page_sharing`` 强制决定的数据去重限制是必要的，以此来避免虚拟内存
-rmap链表变得过大。rmap的遍历具有O(N)的复杂度，其中N是共享页面的rmap_项（即
-虚拟映射）的数量，而这个共享页面的节点数量又被 ``max_page_sharing`` 所限制。
-因此，这有效地将线性O(N)计算复杂度从rmap遍历中分散到不同的KSM页面上。ksmd进
-程在稳定节点"链"上的遍历也是O(N)，但这个N是稳定树"副本"的数量，而不是rmap项
-的数量，因此它对ksmd性能没有显著影响。实际上，最佳稳定树"副本"的候选节点将
-保留在"副本"列表的开头。
-
-``max_page_sharing`` 的值设置得高了会促使更快的内存合并（因为将有更少的稳定
-树副本排队进入稳定节点chain->hlist）和更高的数据去重系数，但代价是在交换、压
-缩、NUMA平衡和页面迁移过程中可能导致KSM页的最大rmap遍历速度较慢。
-
-``stable_node_dups/stable_node_chains`` 的比值还受 ``max_page_sharing`` 调控
-的影响，高比值可能意味着稳定节点dup中存在碎片，这可以通过在ksmd中引入碎片算
-法来解决，该算法将rmap项从一个稳定节点dup重定位到另一个稳定节点dup，以便释放
-那些仅包含极少rmap项的稳定节点"dup"，但这可能会增加ksmd进程的CPU使用率，并可
-能会减慢应用程序在KSM页面上的只读计算。
-
-KSM会定期扫描稳定节点"链"中链接的所有稳定树"副本"，以便删减过时了的稳定节点。
-这种扫描的频率由 ``stable_node_chains_prune_millisecs`` 这个sysfs 接口定义。
-
-参考
-====
-内核代码请见mm/ksm.c。
-涉及的函数(mm_slot  ksm_scan  stable_node  rmap_item)。
diff --git a/Documentation/translations/zh_CN/vm/memory-model.rst b/Documentation/translations/zh_CN/vm/memory-model.rst
deleted file mode 100644
index 013e30c88d72..000000000000
--- a/Documentation/translations/zh_CN/vm/memory-model.rst
+++ /dev/null
@@ -1,135 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-:Original: Documentation/vm/memory-model.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-============
-物理内存模型
-============
-
-系统中的物理内存可以用不同的方式进行寻址。最简单的情况是，物理内存从地址0开
-始，跨越一个连续的范围，直到最大的地址。然而，这个范围可能包含CPU无法访问的
-小孔隙。那么，在完全不同的地址可能有几个连续的范围。而且，别忘了NUMA，即不
-同的内存库连接到不同的CPU。
-
-Linux使用两种内存模型中的一种对这种多样性进行抽象。FLATMEM和SPARSEM。每
-个架构都定义了它所支持的内存模型，默认的内存模型是什么，以及是否有可能手动
-覆盖该默认值。
-
-所有的内存模型都使用排列在一个或多个数组中的 `struct page` 来跟踪物理页
-帧的状态。
-
-无论选择哪种内存模型，物理页框号（PFN）和相应的 `struct page` 之间都存
-在一对一的映射关系。
-
-每个内存模型都定义了 :c:func:`pfn_to_page` 和 :c:func:`page_to_pfn`
-帮助函数，允许从PFN到 `struct page` 的转换，反之亦然。
-
-FLATMEM
-=======
-
-最简单的内存模型是FLATMEM。这个模型适用于非NUMA系统的连续或大部分连续的
-物理内存。
-
-在FLATMEM内存模型中，有一个全局的 `mem_map` 数组来映射整个物理内存。对
-于大多数架构，孔隙在 `mem_map` 数组中都有条目。与孔洞相对应的 `struct page`
-对象从未被完全初始化。
-
-为了分配 `mem_map` 数组，架构特定的设置代码应该调用free_area_init()函数。
-然而，在调用memblock_free_all()函数之前，映射数组是不能使用的，该函数
-将所有的内存交给页分配器。
-
-一个架构可能会释放 `mem_map` 数组中不包括实际物理页的部分。在这种情况下，特
-定架构的 :c:func:`pfn_valid` 实现应该考虑到 `mem_map` 中的孔隙。
-
-使用FLATMEM，PFN和 `struct page` 之间的转换是直接的。 `PFN - ARCH_PFN_OFFSET`
-是 `mem_map` 数组的一个索引。
-
-`ARCH_PFN_OFFSET` 定义了物理内存起始地址不同于0的系统的第一个页框号。
-
-SPARSEMEM
-=========
-
-SPARSEMEM是Linux中最通用的内存模型，它是唯一支持若干高级功能的内存模型，
-如物理内存的热插拔、非易失性内存设备的替代内存图和较大系统的内存图的延迟
-初始化。
-
-SPARSEMEM模型将物理内存显示为一个部分的集合。一个区段用mem_section结构
-体表示，它包含 `section_mem_map` ，从逻辑上讲，它是一个指向 `struct page`
-阵列的指针。然而，它被存储在一些其他的magic中，以帮助分区管理。区段的大小
-和最大区段数是使用 `SECTION_SIZE_BITS` 和 `MAX_PHYSMEM_BITS` 常量
-来指定的，这两个常量是由每个支持SPARSEMEM的架构定义的。 `MAX_PHYSMEM_BITS`
-是一个架构所支持的物理地址的实际宽度，而 `SECTION_SIZE_BITS` 是一个任
-意的值。
-
-最大的段数表示为 `NR_MEM_SECTIONS` ，定义为
-
-.. math::
-
-   NR\_MEM\_SECTIONS = 2 ^ {(MAX\_PHYSMEM\_BITS - SECTION\_SIZE\_BITS)}
-
-`mem_section` 对象被安排在一个叫做 `mem_sections` 的二维数组中。这个数组的
-大小和位置取决于 `CONFIG_SPARSEM_EXTREME` 和可能的最大段数:
-
-* 当 `CONFIG_SPARSEMEM_EXTREME` 被禁用时， `mem_sections` 数组是静态的，有
-  `NR_MEM_SECTIONS` 行。每一行持有一个 `mem_section` 对象。
-* 当 `CONFIG_SPARSEMEM_EXTREME` 被启用时， `mem_sections` 数组被动态分配。
-  每一行包含价值 `PAGE_SIZE` 的 `mem_section` 对象，行数的计算是为了适应所有的
-  内存区。
-
-架构设置代码应该调用sparse_init()来初始化内存区和内存映射。
-
-通过SPARSEMEM，有两种可能的方式将PFN转换为相应的 `struct page` --"classic sparse"和
- "sparse vmemmap"。选择是在构建时进行的，它由 `CONFIG_SPARSEMEM_VMEMMAP` 的
- 值决定。
-
-Classic sparse在page->flags中编码了一个页面的段号，并使用PFN的高位来访问映射该页
-框的段。在一个区段内，PFN是指向页数组的索引。
-
-Sparse vmemmapvmemmap使用虚拟映射的内存映射来优化pfn_to_page和page_to_pfn操
-作。有一个全局的 `struct page *vmemmap` 指针，指向一个虚拟连续的 `struct page`
-对象阵列。PFN是该数组的一个索引，`struct page` 从 `vmemmap` 的偏移量是该页的PFN。
-
-为了使用vmemmap，一个架构必须保留一个虚拟地址的范围，以映射包含内存映射的物理页，并
-确保 `vmemmap`指向该范围。此外，架构应该实现 :c:func:`vmemmap_populate` 方法，
-它将分配物理内存并为虚拟内存映射创建页表。如果一个架构对vmemmap映射没有任何特殊要求，
-它可以使用通用内存管理提供的默认 :c:func:`vmemmap_populate_basepages`。
-
-虚拟映射的内存映射允许将持久性内存设备的 `struct page` 对象存储在这些设备上预先分
-配的存储中。这种存储用vmem_altmap结构表示，最终通过一长串的函数调用传递给
-vmemmap_populate()。vmemmap_populate()实现可以使用 `vmem_altmap` 和
-:c:func:`vmemmap_alloc_block_buf` 助手来分配持久性内存设备上的内存映射。
-
-ZONE_DEVICE
-===========
-`ZONE_DEVICE` 设施建立在 `SPARSEM_VMEMMAP` 之上，为设备驱动识别的物理地址范
-围提供 `struct page` `mem_map` 服务。 `ZONE_DEVICE` 的 "设备" 方面与以下
-事实有关：这些地址范围的页面对象从未被在线标记过，而且必须对设备进行引用，而不仅仅
-是页面，以保持内存被“锁定”以便使用。 `ZONE_DEVICE` ，通过 :c:func:`devm_memremap_pages` ，
-为给定的pfns范围执行足够的内存热插拔来开启 :c:func:`pfn_to_page`，
-:c:func:`page_to_pfn`, ，和 :c:func:`get_user_pages` 服务。由于页面引
-用计数永远不会低于1，所以页面永远不会被追踪为空闲内存，页面的 `struct list_head lru`
-空间被重新利用，用于向映射该内存的主机设备/驱动程序进行反向引用。
-
-虽然 `SPARSEMEM` 将内存作为一个区段的集合，可以选择收集并合成内存块，但
-`ZONE_DEVICE` 用户需要更小的颗粒度来填充 `mem_map` 。鉴于 `ZONE_DEVICE`
-内存从未被在线标记，因此它的内存范围从未通过sysfs内存热插拔api暴露在内存块边界
-上。这个实现依赖于这种缺乏用户接口的约束，允许子段大小的内存范围被指定给
-:c:func:`arch_add_memory` ，即内存热插拔的上半部分。子段支持允许2MB作为
-:c:func:`devm_memremap_pages` 的跨架构通用对齐颗粒度。
-
-`ZONE_DEVICE` 的用户是:
-
-* pmem: 通过DAX映射将平台持久性内存作为直接I/O目标使用。
-
-* hmm: 用 `->page_fault()` 和 `->page_free()` 事件回调扩展 `ZONE_DEVICE` ，
-  以允许设备驱动程序协调与设备内存相关的内存管理事件，通常是GPU内存。参见/vm/hmm.rst。
-
-* p2pdma: 创建 `struct page` 对象，允许PCI/E拓扑结构中的peer设备协调它们之间的
-  直接DMA操作，即绕过主机内存。
diff --git a/Documentation/translations/zh_CN/vm/mmu_notifier.rst b/Documentation/translations/zh_CN/vm/mmu_notifier.rst
deleted file mode 100644
index b29a37b33628..000000000000
--- a/Documentation/translations/zh_CN/vm/mmu_notifier.rst
+++ /dev/null
@@ -1,97 +0,0 @@
-:Original: Documentation/vm/mmu_notifier.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-
-什么时候需要页表锁内通知？
-==========================
-
-当清除一个pte/pmd时，我们可以选择通过在页表锁下（通知版的\*_clear_flush调用
-mmu_notifier_invalidate_range）通知事件。但这种通知并不是在所有情况下都需要的。
-
-对于二级TLB（非CPU TLB），如IOMMU TLB或设备TLB（当设备使用类似ATS/PASID的东西让
-IOMMU走CPU页表来访问进程的虚拟地址空间）。只有两种情况需要在清除pte/pmd时在持有页
-表锁的同时通知这些二级TLB：
-
-  A) 在mmu_notifier_invalidate_range_end()之前，支持页的地址被释放。
-  B) 一个页表项被更新以指向一个新的页面（COW，零页上的写异常，__replace_page()，...）。
-
-情况A很明显，你不想冒风险让设备写到一个现在可能被一些完全不同的任务使用的页面。
-
-情况B更加微妙。为了正确起见，它需要按照以下序列发生:
-
-  - 上页表锁
-  - 清除页表项并通知 ([pmd/pte]p_huge_clear_flush_notify())
-  - 设置页表项以指向新页
-
-如果在设置新的pte/pmd值之前，清除页表项之后没有进行通知，那么你就会破坏设备的C11或
-C++11等内存模型。
-
-考虑以下情况（设备使用类似于ATS/PASID的功能）。
-
-两个地址addrA和addrB，这样|addrA - addrB| >= PAGE_SIZE，我们假设它们是COW的
-写保护（B的其他情况也适用）。
-
-::
-
- [Time N] --------------------------------------------------------------------
- CPU-thread-0  {尝试写到addrA}
- CPU-thread-1  {尝试写到addrB}
- CPU-thread-2  {}
- CPU-thread-3  {}
- DEV-thread-0  {读取addrA并填充设备TLB}
- DEV-thread-2  {读取addrB并填充设备TLB}
- [Time N+1] ------------------------------------------------------------------
- CPU-thread-0  {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}}
- CPU-thread-1  {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}}
- CPU-thread-2  {}
- CPU-thread-3  {}
- DEV-thread-0  {}
- DEV-thread-2  {}
- [Time N+2] ------------------------------------------------------------------
- CPU-thread-0  {COW_step1: {更新页表以指向addrA的新页}}
- CPU-thread-1  {COW_step1: {更新页表以指向addrB的新页}}
- CPU-thread-2  {}
- CPU-thread-3  {}
- DEV-thread-0  {}
- DEV-thread-2  {}
- [Time N+3] ------------------------------------------------------------------
- CPU-thread-0  {preempted}
- CPU-thread-1  {preempted}
- CPU-thread-2  {写入addrA，这是对新页面的写入}
- CPU-thread-3  {}
- DEV-thread-0  {}
- DEV-thread-2  {}
- [Time N+3] ------------------------------------------------------------------
- CPU-thread-0  {preempted}
- CPU-thread-1  {preempted}
- CPU-thread-2  {}
- CPU-thread-3  {写入addrB，这是一个写入新页的过程}
- DEV-thread-0  {}
- DEV-thread-2  {}
- [Time N+4] ------------------------------------------------------------------
- CPU-thread-0  {preempted}
- CPU-thread-1  {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}}
- CPU-thread-2  {}
- CPU-thread-3  {}
- DEV-thread-0  {}
- DEV-thread-2  {}
- [Time N+5] ------------------------------------------------------------------
- CPU-thread-0  {preempted}
- CPU-thread-1  {}
- CPU-thread-2  {}
- CPU-thread-3  {}
- DEV-thread-0  {从旧页中读取addrA}
- DEV-thread-2  {从新页面读取addrB}
-
-所以在这里，因为在N+2的时候，清空页表项没有和通知一起作废二级TLB，设备在看到addrA的新值之前
-就看到了addrB的新值。这就破坏了设备的总内存序。
-
-当改变一个pte的写保护或指向一个新的具有相同内容的写保护页（KSM）时，将mmu_notifier_invalidate_range
-调用延迟到页表锁外的mmu_notifier_invalidate_range_end()是可以的。即使做页表更新的线程
-在释放页表锁后但在调用mmu_notifier_invalidate_range_end()前被抢占，也是如此。
diff --git a/Documentation/translations/zh_CN/vm/numa.rst b/Documentation/translations/zh_CN/vm/numa.rst
deleted file mode 100644
index 6af412b924ad..000000000000
--- a/Documentation/translations/zh_CN/vm/numa.rst
+++ /dev/null
@@ -1,101 +0,0 @@
-:Original: Documentation/vm/numa.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-始于1999年11月，作者： <kanoj@sgi.com>
-
-==========================
-何为非统一内存访问(NUMA)？
-==========================
-
-这个问题可以从几个视角来回答：硬件观点和Linux软件视角。
-
-从硬件角度看，NUMA系统是一个由多个组件或装配组成的计算机平台，每个组件可能包含0个或更多的CPU、
-本地内存和/或IO总线。为了简洁起见，并将这些物理组件/装配的硬件视角与软件抽象区分开来，我们在
-本文中称这些组件/装配为“单元”。
-
-每个“单元”都可以看作是系统的一个SMP[对称多处理器]子集——尽管独立的SMP系统所需的一些组件可能
-不会在任何给定的单元上填充。NUMA系统的单元通过某种系统互连连接在一起——例如，交叉开关或点对点
-链接是NUMA系统互连的常见类型。这两种类型的互连都可以聚合起来，以创建NUMA平台，其中的单元与其
-他单元有多个距离。
-
-对于Linux，感兴趣的NUMA平台主要是所谓的缓存相干NUMA--简称ccNUMA系统系统。在ccNUMA系统中，
-所有的内存都是可见的，并且可以从连接到任何单元的任何CPU中访问，缓存一致性是由处理器缓存和/或
-系统互连在硬件中处理。
-
-内存访问时间和有效的内存带宽取决于包含CPU的单元或进行内存访问的IO总线距离包含目标内存的单元
-有多远。例如，连接到同一单元的CPU对内存的访问将比访问其他远程单元的内存经历更快的访问时间和
-更高的带宽。 NUMA平台可以在任何给定单元上访问多种远程距离的（其他）单元。
-
-平台供应商建立NUMA系统并不只是为了让软件开发人员的生活变得有趣。相反，这种架构是提供可扩展
-内存带宽的一种手段。然而，为了实现可扩展的内存带宽，系统和应用软件必须安排大部分的内存引用
-[cache misses]到“本地”内存——同一单元的内存，如果有的话——或者到最近的有内存的单元。
-
-这就自然而然有了Linux软件对NUMA系统的视角:
-
-Linux将系统的硬件资源划分为多个软件抽象，称为“节点”。Linux将节点映射到硬件平台的物理单元
-上，对一些架构的细节进行了抽象。与物理单元一样，软件节点可能包含0或更多的CPU、内存和/或IO
-总线。同样，对“较近”节点的内存访问——映射到较近单元的节点——通常会比对较远单元的访问经历更快
-的访问时间和更高的有效带宽。
-
-对于一些架构，如x86，Linux将“隐藏”任何代表没有内存连接的物理单元的节点，并将连接到该单元
-的任何CPU重新分配到代表有内存的单元的节点上。因此，在这些架构上，我们不能假设Linux将所有
-的CPU与一个给定的节点相关联，会看到相同的本地内存访问时间和带宽。
-
-此外，对于某些架构，同样以x86为例，Linux支持对额外节点的仿真。对于NUMA仿真，Linux会将现
-有的节点或者非NUMA平台的系统内存分割成多个节点。每个模拟的节点将管理底层单元物理内存的一部
-分。NUMA仿真对于在非NUMA平台上测试NUMA内核和应用功能是非常有用的，当与cpusets一起使用时，
-可以作为一种内存资源管理机制。[见 Documentation/admin-guide/cgroup-v1/cpusets.rst]
-
-对于每个有内存的节点，Linux构建了一个独立的内存管理子系统，有自己的空闲页列表、使用中页列表、
-使用统计和锁来调解访问。此外，Linux为每个内存区[DMA、DMA32、NORMAL、HIGH_MEMORY、MOVABLE
-中的一个或多个]构建了一个有序的“区列表”。zonelist指定了当一个选定的区/节点不能满足分配请求
-时要访问的区/节点。当一个区没有可用的内存来满足请求时，这种情况被称为“overflow 溢出”或
-“fallback 回退”。
-
-由于一些节点包含多个包含不同类型内存的区，Linux必须决定是否对区列表进行排序，使分配回退到不同
-节点上的相同区类型，或同一节点上的不同区类型。这是一个重要的考虑因素，因为有些区，如DMA或DMA32，
-代表了相对稀缺的资源。Linux选择了一个默认的Node ordered zonelist。这意味着在使用按NUMA距
-离排序的远程节点之前，它会尝试回退到同一节点的其他分区。
-
-默认情况下，Linux会尝试从执行请求的CPU被分配到的节点中满足内存分配请求。具体来说，Linux将试
-图从请求来源的节点的适当分区列表中的第一个节点进行分配。这被称为“本地分配”。如果“本地”节点不能
-满足请求，内核将检查所选分区列表中其他节点的区域，寻找列表中第一个能满足请求的区域。
-
-本地分配将倾向于保持对分配的内存的后续访问 “本地”的底层物理资源和系统互连——只要内核代表其分配
-一些内存的任务后来不从该内存迁移。Linux调度器知道平台的NUMA拓扑结构——体现在“调度域”数据结构
-中[见 Documentation/scheduler/sched-domains.rst]——并且调度器试图尽量减少任务迁移到遥
-远的调度域中。然而，调度器并没有直接考虑到任务的NUMA足迹。因此，在充分不平衡的情况下，任务可
-以在节点之间迁移，远离其初始节点和内核数据结构。
-
-系统管理员和应用程序设计者可以使用各种CPU亲和命令行接口，如taskset(1)和numactl(1)，以及程
-序接口，如sched_setaffinity(2)，来限制任务的迁移，以改善NUMA定位。此外，人们可以使用
-Linux NUMA内存策略修改内核的默认本地分配行为。 [见
-:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`].
-
-系统管理员可以使用控制组和CPUsets限制非特权用户在调度或NUMA命令和功能中可以指定的CPU和节点
-的内存。 [见 Documentation/admin-guide/cgroup-v1/cpusets.rst]
-
-在不隐藏无内存节点的架构上，Linux会在分区列表中只包括有内存的区域[节点]。这意味着对于一个无
-内存的节点，“本地内存节点”——CPU节点的分区列表中的第一个区域的节点——将不是节点本身。相反，它
-将是内核在建立分区列表时选择的离它最近的有内存的节点。所以，默认情况下，本地分配将由内核提供
-最近的可用内存来完成。这是同一机制的结果，该机制允许这种分配在一个包含内存的节点溢出时回退到
-其他附近的节点。
-
-一些内核分配不希望或不能容忍这种分配回退行为。相反，他们想确保他们从指定的节点获得内存，或者
-得到通知说该节点没有空闲内存。例如，当一个子系统分配每个CPU的内存资源时，通常是这种情况。
-
-一个典型的分配模式是使用内核的numa_node_id()或CPU_to_node()函数获得“当前CPU”所在节点的
-节点ID，然后只从返回的节点ID请求内存。当这样的分配失败时，请求的子系统可以恢复到它自己的回退
-路径。板块内核内存分配器就是这样的一个例子。或者，子系统可以选择在分配失败时禁用或不启用自己。
-内核分析子系统就是这样的一个例子。
-
-如果架构支持——不隐藏无内存节点，那么连接到无内存节点的CPU将总是产生回退路径的开销，或者一些
-子系统如果试图完全从无内存的节点分配内存，将无法初始化。为了透明地支持这种架构，内核子系统可
-以使用numa_mem_id()或cpu_to_mem()函数来定位调用或指定CPU的“本地内存节点”。同样，这是同
-一个节点，默认的本地页分配将从这个节点开始尝试。
diff --git a/Documentation/translations/zh_CN/vm/overcommit-accounting.rst b/Documentation/translations/zh_CN/vm/overcommit-accounting.rst
deleted file mode 100644
index 8765cb118f24..000000000000
--- a/Documentation/translations/zh_CN/vm/overcommit-accounting.rst
+++ /dev/null
@@ -1,86 +0,0 @@
-:Original: Documentation/vm/overcommit-accounting.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-
-==============
-超量使用审计
-==============
-
-Linux内核支持下列超量使用处理模式
-
-0
-	启发式超量使用处理。拒绝明显的地址空间超量使用。用于一个典型的系统。
-	它确保严重的疯狂分配失败，同时允许超量使用以减少swap的使用。在这种模式下，
-	允许root分配稍多的内存。这是默认的。
-1
-	总是超量使用。适用于一些科学应用。经典的例子是使用稀疏数组的代码，只是依赖
-	几乎完全由零页组成的虚拟内存
-
-2
-	不超量使用。系统提交的总地址空间不允许超过swap+一个可配置的物理RAM的数量
-	（默认为50%）。根据你使用的数量，在大多数情况下，这意味着一个进程在访问页面时
-	不会被杀死，但会在内存分配上收到相应的错误。
-
-	对于那些想保证他们的内存分配在未来可用而又不需要初始化每一个页面的应用程序来说
-	是很有用的。
-
-超量使用策略是通过sysctl  `vm.overcommit_memory` 设置的。
-
-可以通过 `vm.overcommit_ratio` （百分比）或 `vm.overcommit_kbytes` （绝对值）
-来设置超限数量。这些只有在 `vm.overcommit_memory` 被设置为2时才有效果。
-
-在 ``/proc/meminfo`` 中可以分别以CommitLimit和Committed_AS的形式查看当前
-的超量使用和提交量。
-
-陷阱
-====
-
-C语言的堆栈增长是一个隐含的mremap。如果你想得到绝对的保证，并在接近边缘的地方运行，
-你 **必须** 为你认为你需要的最大尺寸的堆栈进行mmap。对于典型的堆栈使用来说，这并
-不重要，但如果你真的非常关心的话，这就是一个值得关注的案例。
-
-
-在模式2中，MAP_NORESERVE标志被忽略。
-
-
-它是如何工作的
-==============
-
-超量使用是基于以下规则
-
-对于文件映射
-	| SHARED or READ-only	-	0 cost (该文件是映射而不是交换)
-	| PRIVATE WRITABLE	-	每个实例的映射大小
-
-对于匿名或者 ``/dev/zero`` 映射
-	| SHARED			-	映射的大小
-	| PRIVATE READ-only	-	0 cost (但作用不大)
-	| PRIVATE WRITABLE	-	每个实例的映射大小
-
-额外的计数
-	| 通过mmap制作可写副本的页面
-	| 从同一池中提取的shmfs内存
-
-状态
-====
-
-*	我们核算mmap内存映射
-*	我们核算mprotect在提交中的变化
-*	我们核算mremap的大小变化
-*	我们的审计 brk
-*	审计munmap
-*	我们在/proc中报告commit 状态
-*	核对并检查分叉的情况
-*	审查堆栈处理/执行中的构建
-*	叙述SHMfs的情况
-*	实现实际限制的执行
-
-待续
-====
-*	ptrace 页计数（这很难）。
diff --git a/Documentation/translations/zh_CN/vm/page_frags.rst b/Documentation/translations/zh_CN/vm/page_frags.rst
deleted file mode 100644
index ad27fed33634..000000000000
--- a/Documentation/translations/zh_CN/vm/page_frags.rst
+++ /dev/null
@@ -1,38 +0,0 @@
-:Original: Documentation/vm/page_frag.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-========
-页面片段
-========
-
-一个页面片段是一个任意长度的任意偏移的内存区域，它位于一个0或更高阶的复合页面中。
-该页中的多个碎片在该页的引用计数器中被单独计算。
-
-page_frag函数，page_frag_alloc和page_frag_free，为页面片段提供了一个简单
-的分配框架。这被网络堆栈和网络设备驱动使用，以提供一个内存的支持区域，作为
-sk_buff->head使用，或者用于skb_shared_info的 “frags” 部分。
-
-为了使用页面片段API，需要一个支持页面片段的缓冲区。这为碎片分配提供了一个中心点，
-并允许多个调用使用一个缓存的页面。这样做的好处是可以避免对get_page的多次调用，
-这在分配时开销可能会很大。然而，由于这种缓存的性质，要求任何对缓存的调用都要受到每
-个CPU的限制，或者每个CPU的限制，并在执行碎片分配时强制禁止中断。
-
-网络堆栈在每个CPU使用两个独立的缓存来处理碎片分配。netdev_alloc_cache被使用
-netdev_alloc_frag和__netdev_alloc_skb调用的调用者使用。napi_alloc_cache
-被调用__napi_alloc_frag和__napi_alloc_skb的调用者使用。这两个调用的主要区别是
-它们可能被调用的环境。“netdev” 前缀的函数可以在任何上下文中使用，因为这些函数
-将禁用中断，而 ”napi“ 前缀的函数只可以在softirq上下文中使用。
-
-许多网络设备驱动程序使用类似的方法来分配页面片段，但页面片段是在环或描述符级别上
-缓存的。为了实现这些情况，有必要提供一种拆解页面缓存的通用方法。出于这个原因，
-__page_frag_cache_drain被实现了。它允许通过一次调用从一个页面释放多个引用。
-这样做的好处是，它允许清理被添加到一个页面的多个引用，以避免每次分配都调用
-get_page。
-
-Alexander Duyck，2016年11月29日。
diff --git a/Documentation/translations/zh_CN/vm/page_owner.rst b/Documentation/translations/zh_CN/vm/page_owner.rst
deleted file mode 100644
index 9e951fabba9d..000000000000
--- a/Documentation/translations/zh_CN/vm/page_owner.rst
+++ /dev/null
@@ -1,116 +0,0 @@
-:Original: Documentation/vm/page_owner.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-================================
-page owner: 跟踪谁分配的每个页面
-================================
-
-概述
-====
-
-page owner是用来追踪谁分配的每一个页面。它可以用来调试内存泄漏或找到内存占用者。
-当分配发生时，有关分配的信息，如调用堆栈和页面的顺序被存储到每个页面的特定存储中。
-当我们需要了解所有页面的状态时，我们可以获得并分析这些信息。
-
-尽管我们已经有了追踪页面分配/释放的tracepoint，但用它来分析谁分配的每个页面是
-相当复杂的。我们需要扩大跟踪缓冲区，以防止在用户空间程序启动前出现重叠。而且，启
-动的程序会不断地将跟踪缓冲区转出，供以后分析，这将会改变系统的行为，会产生更多的
-可能性，而不是仅仅保留在内存中，所以不利于调试。
-
-页面所有者也可以用于各种目的。例如，可以通过每个页面的gfp标志信息获得精确的碎片
-统计。如果启用了page owner，它就已经实现并激活了。我们非常欢迎其他用途。
-
-page owner在默认情况下是禁用的。所以，如果你想使用它，你需要在你的启动cmdline
-中加入"page_owner=on"。如果内核是用page owner构建的，并且由于没有启用启动
-选项而在运行时禁用page owner，那么运行时的开销是很小的。如果在运行时禁用，它不
-需要内存来存储所有者信息，所以没有运行时内存开销。而且，页面所有者在页面分配器的
-热路径中只插入了两个不可能的分支，如果不启用，那么分配就会像没有页面所有者的内核
-一样进行。这两个不可能的分支应该不会影响到分配的性能，特别是在静态键跳转标签修补
-功能可用的情况下。以下是由于这个功能而导致的内核代码大小的变化。
-
-- 没有page owner::
-
-   text    data     bss     dec     hex filename
-   48392   2333     644   51369    c8a9 mm/page_alloc.o
-
-- 有page owner::
-
-   text    data     bss     dec     hex filename
-   48800   2445     644   51889    cab1 mm/page_alloc.o
-   6662     108      29    6799    1a8f mm/page_owner.o
-   1025       8       8    1041     411 mm/page_ext.o
-
-虽然总共增加了8KB的代码，但page_alloc.o增加了520字节，其中不到一半是在hotpath
-中。构建带有page owner的内核，并在需要时打开它，将是调试内核内存问题的最佳选择。
-
-有一个问题是由实现细节引起的。页所有者将信息存储到struct page扩展的内存中。这
-个内存的初始化时间比稀疏内存系统中的页面分配器启动的时间要晚一些，所以，在初始化
-之前，许多页面可以被分配，但它们没有所有者信息。为了解决这个问题，这些早期分配的
-页面在初始化阶段被调查并标记为分配。虽然这并不意味着它们有正确的所有者信息，但至
-少，我们可以更准确地判断该页是否被分配。在2GB内存的x86-64虚拟机上，有13343
-个早期分配的页面被捕捉和标记，尽管它们大部分是由结构页扩展功能分配的。总之，在这
-之后，没有任何页面处于未追踪状态。
-
-使用方法
-========
-
-1) 构建用户空间的帮助::
-
-	cd tools/vm
-	make page_owner_sort
-
-2) 启用page owner: 添加 "page_owner=on" 到 boot cmdline.
-
-3) 做你想调试的工作。
-
-4) 分析来自页面所有者的信息::
-
-	cat /sys/kernel/debug/page_owner > page_owner_full.txt
-	./page_owner_sort page_owner_full.txt sorted_page_owner.txt
-
-   ``page_owner_full.txt`` 的一般输出情况如下(输出信息无翻译价值)::
-
-	Page allocated via order XXX, ...
-	PFN XXX ...
-	// Detailed stack
-
-	Page allocated via order XXX, ...
-	PFN XXX ...
-	// Detailed stack
-
-   ``page_owner_sort`` 工具忽略了 ``PFN`` 行，将剩余的行放在buf中，使用regexp提
-   取页序值，计算buf的次数和页数，最后根据参数进行排序。
-
-   在 ``sorted_page_owner.txt`` 中可以看到关于谁分配了每个页面的结果。一般输出::
-
-	XXX times, XXX pages:
-	Page allocated via order XXX, ...
-	// Detailed stack
-
-   默认情况下， ``page_owner_sort`` 是根据buf的时间来排序的。如果你想
-   按buf的页数排序，请使用-m参数。详细的参数是:
-
-   基本函数:
-
-	Sort:
-		-a		按内存分配时间排序
-		-m		按总内存排序
-		-p		按pid排序。
-		-P		按tgid排序。
-		-r		按内存释放时间排序。
-		-s		按堆栈跟踪排序。
-		-t		按时间排序（默认）。
-
-   其它函数:
-
-	Cull:
-		-c		通过比较堆栈跟踪而不是总块来进行剔除。
-
-	Filter:
-		-f		过滤掉内存已被释放的块的信息。
diff --git a/Documentation/translations/zh_CN/vm/page_table_check.rst b/Documentation/translations/zh_CN/vm/page_table_check.rst
deleted file mode 100644
index a29fc1b360e6..000000000000
--- a/Documentation/translations/zh_CN/vm/page_table_check.rst
+++ /dev/null
@@ -1,56 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-:Original: Documentation/vm/page_table_check.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-========
-页表检查
-========
-
-概述
-====
-
-页表检查允许通过确保防止某些类型的内存损坏来强化内核。
-
-当新的页面可以从用户空间访问时，页表检查通过将它们的页表项（PTEs PMD等）添加到页表中来执行额外
-的验证。
-
-在检测到损坏的情况下，内核会被崩溃。页表检查有一个小的性能和内存开销。因此，它在默认情况下是禁用
-的，但是在额外的加固超过性能成本的系统上，可以选择启用。另外，由于页表检查是同步的，它可以帮助调
-试双映射内存损坏问题，在错误的映射发生时崩溃内核，而不是在内存损坏错误发生后内核崩溃。
-
-双重映射检测逻辑
-================
-
-+-------------------+-------------------+-------------------+------------------+
-| Current Mapping   | New mapping       | Permissions       | Rule             |
-+===================+===================+===================+==================+
-| Anonymous         | Anonymous         | Read              | Allow            |
-+-------------------+-------------------+-------------------+------------------+
-| Anonymous         | Anonymous         | Read / Write      | Prohibit         |
-+-------------------+-------------------+-------------------+------------------+
-| Anonymous         | Named             | Any               | Prohibit         |
-+-------------------+-------------------+-------------------+------------------+
-| Named             | Anonymous         | Any               | Prohibit         |
-+-------------------+-------------------+-------------------+------------------+
-| Named             | Named             | Any               | Allow            |
-+-------------------+-------------------+-------------------+------------------+
-
-启用页表检查
-============
-
-用以下方法构建内核:
-
-- PAGE_TABLE_CHECK=y
-  注意，它只能在ARCH_SUPPORTS_PAGE_TABLE_CHECK可用的平台上启用。
-
-- 使用 "page_table_check=on" 内核参数启动。
-
-可以选择用PAGE_TABLE_CHECK_ENFORCED来构建内核，以便在没有额外的内核参数的情况下获得页表
-支持。
diff --git a/Documentation/translations/zh_CN/vm/remap_file_pages.rst b/Documentation/translations/zh_CN/vm/remap_file_pages.rst
deleted file mode 100644
index af6b7e28af23..000000000000
--- a/Documentation/translations/zh_CN/vm/remap_file_pages.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-:Original: Documentation/vm/remap_file_pages.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-==============================
-remap_file_pages()系统调用
-==============================
-
-remap_file_pages()系统调用被用来创建一个非线性映射，也就是说，在这个映射中，
-文件的页面被无序映射到内存中。使用remap_file_pages()比重复调用mmap(2)的好
-处是，前者不需要内核创建额外的VMA（虚拟内存区）数据结构。
-
-支持非线性映射需要在内核虚拟内存子系统中编写大量的non-trivial的代码，包括热
-路径。另外，为了使非线性映射工作，内核需要一种方法来区分正常的页表项和带有文件
-偏移的项（pte_file）。内核为达到这个目的在PTE中保留了标志。PTE标志是稀缺资
-源，特别是在某些CPU架构上。如果能腾出这个标志用于其他用途就更好了。
-
-幸运的是，在生活中并没有很多remap_file_pages()的用户。只知道有一个企业的RDBMS
-实现在32位系统上使用这个系统调用来映射比32位虚拟地址空间线性尺寸更大的文件。
-由于64位系统的广泛使用，这种使用情况已经不重要了。
-
-syscall被废弃了，现在用一个模拟来代替它。仿真会创建新的VMA，而不是非线性映射。
-对于remap_file_pages()的少数用户来说，它的工作速度会变慢，但ABI被保留了。
-
-仿真的一个副作用（除了性能之外）是，由于额外的VMA，用户可以更容易达到
-vm.max_map_count的限制。关于限制的更多细节，请参见DEFAULT_MAX_MAP_COUNT
-的注释。
diff --git a/Documentation/translations/zh_CN/vm/split_page_table_lock.rst b/Documentation/translations/zh_CN/vm/split_page_table_lock.rst
deleted file mode 100644
index 50694d97c426..000000000000
--- a/Documentation/translations/zh_CN/vm/split_page_table_lock.rst
+++ /dev/null
@@ -1,96 +0,0 @@
-:Original: Documentation/vm/split_page_table_lock.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-=================================
-分页表锁（split page table lock）
-=================================
-
-最初，mm->page_table_lock spinlock保护了mm_struct的所有页表。但是这种方
-法导致了多线程应用程序的缺页异常可扩展性差，因为对锁的争夺很激烈。为了提高可扩
-展性，我们引入了分页表锁。
-
-有了分页表锁，我们就有了单独的每张表锁来顺序化对表的访问。目前，我们对PTE和
-PMD表使用分页锁。对高层表的访问由mm->page_table_lock保护。
-
-有一些辅助工具来锁定/解锁一个表和其他访问器函数:
-
- - pte_offset_map_lock()
-	映射pte并获取PTE表锁，返回所取锁的指针；
- - pte_unmap_unlock()
-	解锁和解映射PTE表；
- - pte_alloc_map_lock()
-	如果需要的话，分配PTE表并获取锁，如果分配失败，返回已获取的锁的指针
-	或NULL；
- - pte_lockptr()
-	返回指向PTE表锁的指针；
- - pmd_lock()
-	取得PMD表锁，返回所取锁的指针。
- - pmd_lockptr()
-	返回指向PMD表锁的指针；
-
-如果CONFIG_SPLIT_PTLOCK_CPUS（通常为4）小于或等于NR_CPUS，则在编译
-时启用PTE表的分页表锁。如果分页锁被禁用，所有的表都由mm->page_table_lock
-来保护。
-
-如果PMD表启用了分页锁，并且架构支持它，那么PMD表的分页锁就会被启用（见
-下文）。
-
-Hugetlb 和分页表锁
-==================
-
-Hugetlb可以支持多种页面大小。我们只对PMD级别使用分页锁，但不对PUD使用。
-
-Hugetlb特定的辅助函数:
-
- - huge_pte_lock()
-	对PMD_SIZE页面采取pmd分割锁，否则mm->page_table_lock；
- - huge_pte_lockptr()
-	返回指向表锁的指针。
-
-架构对分页表锁的支持
-====================
-
-没有必要特别启用PTE分页表锁：所有需要的东西都由pgtable_pte_page_ctor()
-和pgtable_pte_page_dtor()完成，它们必须在PTE表分配/释放时被调用。
-
-确保架构不使用slab分配器来分配页表：slab使用page->slab_cache来分配其页
-面。这个区域与page->ptl共享存储。
-
-PMD分页锁只有在你有两个以上的页表级别时才有意义。
-
-启用PMD分页锁需要在PMD表分配时调用pgtable_pmd_page_ctor()，在释放时调
-用pgtable_pmd_page_dtor()。
-
-分配通常发生在pmd_alloc_one()中，释放发生在pmd_free()和pmd_free_tlb()
-中，但要确保覆盖所有的PMD表分配/释放路径：即X86_PAE在pgd_alloc()中预先
-分配一些PMD。
-
-一切就绪后，你可以设置CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK。
-
-注意：pgtable_pte_page_ctor()和pgtable_pmd_page_ctor()可能失败--必
-须正确处理。
-
-page->ptl
-=========
-
-page->ptl用于访问分割页表锁，其中'page'是包含该表的页面struct page。它
-与page->private（以及union中的其他几个字段）共享存储。
-
-为了避免增加struct page的大小并获得最佳性能，我们使用了一个技巧:
-
- - 如果spinlock_t适合于long，我们使用page->ptr作为spinlock，这样我们
-   就可以避免间接访问并节省一个缓存行。
- - 如果spinlock_t的大小大于long的大小，我们使用page->ptl作为spinlock_t
-   的指针并动态分配它。这允许在启用DEBUG_SPINLOCK或DEBUG_LOCK_ALLOC的
-   情况下使用分页锁，但由于间接访问而多花了一个缓存行。
-
-PTE表的spinlock_t分配在pgtable_pte_page_ctor()中，PMD表的spinlock_t
-分配在pgtable_pmd_page_ctor()中。
-
-请不要直接访问page->ptl - -使用适当的辅助函数。
diff --git a/Documentation/translations/zh_CN/vm/z3fold.rst b/Documentation/translations/zh_CN/vm/z3fold.rst
deleted file mode 100644
index 57204aa08caa..000000000000
--- a/Documentation/translations/zh_CN/vm/z3fold.rst
+++ /dev/null
@@ -1,31 +0,0 @@
-:Original: Documentation/vm/z3fold.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-======
-z3fold
-======
-
-z3fold是一个专门用于存储压缩页的分配器。它被设计为每个物理页最多可以存储三个压缩页。
-它是zbud的衍生物，允许更高的压缩率，保持其前辈的简单性和确定性。
-
-z3fold和zbud的主要区别是:
-
-* 与zbud不同的是，z3fold允许最大的PAGE_SIZE分配。
-* z3fold在其页面中最多可以容纳3个压缩页面
-* z3fold本身没有输出任何API，因此打算通过zpool的API来使用
-
-为了保持确定性和简单性，z3fold，就像zbud一样，总是在每页存储一个整数的压缩页，但是
-它最多可以存储3页，不像zbud最多可以存储2页。因此压缩率达到2.7倍左右，而zbud的压缩
-率是1.7倍左右。
-
-不像zbud（但也像zsmalloc），z3fold_alloc()那样不返回一个可重复引用的指针。相反，它
-返回一个无符号长句柄，它编码了被分配对象的实际位置。
-
-保持有效的压缩率接近于zsmalloc，z3fold不依赖于MMU的启用，并提供更可预测的回收行
-为，这使得它更适合于小型和反应迅速的系统。
diff --git a/Documentation/translations/zh_CN/vm/zsmalloc.rst b/Documentation/translations/zh_CN/vm/zsmalloc.rst
deleted file mode 100644
index 29e9c70a8eb6..000000000000
--- a/Documentation/translations/zh_CN/vm/zsmalloc.rst
+++ /dev/null
@@ -1,78 +0,0 @@
-:Original: Documentation/vm/zs_malloc.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-========
-zsmalloc
-========
-
-这个分配器是为与zram一起使用而设计的。因此，该分配器应该在低内存条件下工作良好。特别是，
-它从未尝试过higher order页面的分配，这在内存压力下很可能会失败。另一方面，如果我们只
-是使用单（0-order）页，它将遭受非常高的碎片化 - 任何大小为PAGE_SIZE/2或更大的对象将
-占据整个页面。这是其前身（xvmalloc）的主要问题之一。
-
-为了克服这些问题，zsmalloc分配了一堆0-order页面，并使用各种"struct page"字段将它
-们链接起来。这些链接的页面作为一个单一的higher order页面，即一个对象可以跨越0-order
-页面的边界。代码将这些链接的页面作为一个实体，称为zspage。
-
-为了简单起见，zsmalloc只能分配大小不超过PAGE_SIZE的对象，因为这满足了所有当前用户的
-要求（在最坏的情况下，页面是不可压缩的，因此以"原样"即未压缩的形式存储）。对于大于这
-个大小的分配请求，会返回失败（见zs_malloc）。
-
-此外，zs_malloc()并不返回一个可重复引用的指针。相反，它返回一个不透明的句柄（无符号
-长），它编码了被分配对象的实际位置。这种间接性的原因是zsmalloc并不保持zspages的永久
-映射，因为这在32位系统上会导致问题，因为内核空间映射的VA区域非常小。因此，在使用分配
-的内存之前，对象必须使用zs_map_object()进行映射以获得一个可用的指针，随后使用
-zs_unmap_object()解除映射。
-
-stat
-====
-
-通过CONFIG_ZSMALLOC_STAT，我们可以通过 ``/sys/kernel/debug/zsmalloc/<user name>``
-看到zsmalloc内部信息。下面是一个统计输出的例子。::
-
- # cat /sys/kernel/debug/zsmalloc/zram0/classes
-
- class  size almost_full almost_empty obj_allocated   obj_used pages_used pages_per_zspage
-    ...
-    ...
-     9   176           0            1           186        129          8                4
-    10   192           1            0          2880       2872        135                3
-    11   208           0            1           819        795         42                2
-    12   224           0            1           219        159         12                4
-    ...
-    ...
-
-
-class
-	索引
-size
-	zspage存储对象大小
-almost_empty
-	ZS_ALMOST_EMPTY zspage的数量（见下文）。
-almost_full
-	ZS_ALMOST_FULL zspage的数量(见下图)
-obj_allocated
-	已分配对象的数量
-obj_used
-	分配给用户的对象的数量
-pages_used
-	为该类分配的页数
-pages_per_zspage
-	组成一个zspage的0-order页面的数量
-
-当n <= N / f时，我们将一个zspage分配给ZS_ALMOST_EMPTYfullness组，其中
-
-* n = 已分配对象的数量
-* N = zspage可以存储的对象总数
-* f = fullness_threshold_frac(即，目前是4个)
-
-同样地，我们将zspage分配给:
-
-* ZS_ALMOST_FULL  when n > N / f
-* ZS_EMPTY        when n == 0
-* ZS_FULL         when n == N
diff --git a/Documentation/translations/zh_TW/index.rst b/Documentation/translations/zh_TW/index.rst
index e1ce9d8c06f8..e97d7d578751 100644
--- a/Documentation/translations/zh_TW/index.rst
+++ b/Documentation/translations/zh_TW/index.rst
@@ -128,7 +128,7 @@ TODOList:
 * security/index
 * sound/index
 * crypto/index
-* vm/index
+* mm/index
 * bpf/index
 * usb/index
 * PCI/index
diff --git a/Documentation/vm/.gitignore b/Documentation/vm/.gitignore
deleted file mode 100644
index bc74f5643008..000000000000
--- a/Documentation/vm/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-page-types
-slabinfo
diff --git a/Documentation/vm/active_mm.rst b/Documentation/vm/active_mm.rst
deleted file mode 100644
index 6f8269c284ed..000000000000
--- a/Documentation/vm/active_mm.rst
+++ /dev/null
@@ -1,91 +0,0 @@
-.. _active_mm:
-
-=========
-Active MM
-=========
-
-::
-
- List:       linux-kernel
- Subject:    Re: active_mm
- From:       Linus Torvalds <torvalds () transmeta ! com>
- Date:       1999-07-30 21:36:24
-
- Cc'd to linux-kernel, because I don't write explanations all that often,
- and when I do I feel better about more people reading them.
-
- On Fri, 30 Jul 1999, David Mosberger wrote:
- >
- > Is there a brief description someplace on how "mm" vs. "active_mm" in
- > the task_struct are supposed to be used?  (My apologies if this was
- > discussed on the mailing lists---I just returned from vacation and
- > wasn't able to follow linux-kernel for a while).
-
- Basically, the new setup is:
-
-  - we have "real address spaces" and "anonymous address spaces". The
-    difference is that an anonymous address space doesn't care about the
-    user-level page tables at all, so when we do a context switch into an
-    anonymous address space we just leave the previous address space
-    active.
-
-    The obvious use for a "anonymous address space" is any thread that
-    doesn't need any user mappings - all kernel threads basically fall into
-    this category, but even "real" threads can temporarily say that for
-    some amount of time they are not going to be interested in user space,
-    and that the scheduler might as well try to avoid wasting time on
-    switching the VM state around. Currently only the old-style bdflush
-    sync does that.
-
-  - "tsk->mm" points to the "real address space". For an anonymous process,
-    tsk->mm will be NULL, for the logical reason that an anonymous process
-    really doesn't _have_ a real address space at all.
-
-  - however, we obviously need to keep track of which address space we
-    "stole" for such an anonymous user. For that, we have "tsk->active_mm",
-    which shows what the currently active address space is.
-
-    The rule is that for a process with a real address space (ie tsk->mm is
-    non-NULL) the active_mm obviously always has to be the same as the real
-    one.
-
-    For a anonymous process, tsk->mm == NULL, and tsk->active_mm is the
-    "borrowed" mm while the anonymous process is running. When the
-    anonymous process gets scheduled away, the borrowed address space is
-    returned and cleared.
-
- To support all that, the "struct mm_struct" now has two counters: a
- "mm_users" counter that is how many "real address space users" there are,
- and a "mm_count" counter that is the number of "lazy" users (ie anonymous
- users) plus one if there are any real users.
-
- Usually there is at least one real user, but it could be that the real
- user exited on another CPU while a lazy user was still active, so you do
- actually get cases where you have a address space that is _only_ used by
- lazy users. That is often a short-lived state, because once that thread
- gets scheduled away in favour of a real thread, the "zombie" mm gets
- released because "mm_count" becomes zero.
-
- Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any
- more. "init_mm" should be considered just a "lazy context when no other
- context is available", and in fact it is mainly used just at bootup when
- no real VM has yet been created. So code that used to check
-
- 	if (current->mm == &init_mm)
-
- should generally just do
-
- 	if (!current->mm)
-
- instead (which makes more sense anyway - the test is basically one of "do
- we have a user context", and is generally done by the page fault handler
- and things like that).
-
- Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago,
- because it slightly changes the interfaces to accommodate the alpha (who
- would have thought it, but the alpha actually ends up having one of the
- ugliest context switch codes - unlike the other architectures where the MM
- and register state is separate, the alpha PALcode joins the two, and you
- need to switch both together).
-
- (From http://marc.info/?l=linux-kernel&m=93337278602211&w=2)
diff --git a/Documentation/vm/arch_pgtable_helpers.rst b/Documentation/vm/arch_pgtable_helpers.rst
deleted file mode 100644
index cbaee9e59241..000000000000
--- a/Documentation/vm/arch_pgtable_helpers.rst
+++ /dev/null
@@ -1,260 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-.. _arch_page_table_helpers:
-
-===============================
-Architecture Page Table Helpers
-===============================
-
-Generic MM expects architectures (with MMU) to provide helpers to create, access
-and modify page table entries at various level for different memory functions.
-These page table helpers need to conform to a common semantics across platforms.
-Following tables describe the expected semantics which can also be tested during
-boot via CONFIG_DEBUG_VM_PGTABLE option. All future changes in here or the debug
-test need to be in sync.
-
-
-PTE Page Table Helpers
-======================
-
-+---------------------------+--------------------------------------------------+
-| pte_same                  | Tests whether both PTE entries are the same      |
-+---------------------------+--------------------------------------------------+
-| pte_bad                   | Tests a non-table mapped PTE                     |
-+---------------------------+--------------------------------------------------+
-| pte_present               | Tests a valid mapped PTE                         |
-+---------------------------+--------------------------------------------------+
-| pte_young                 | Tests a young PTE                                |
-+---------------------------+--------------------------------------------------+
-| pte_dirty                 | Tests a dirty PTE                                |
-+---------------------------+--------------------------------------------------+
-| pte_write                 | Tests a writable PTE                             |
-+---------------------------+--------------------------------------------------+
-| pte_special               | Tests a special PTE                              |
-+---------------------------+--------------------------------------------------+
-| pte_protnone              | Tests a PROT_NONE PTE                            |
-+---------------------------+--------------------------------------------------+
-| pte_devmap                | Tests a ZONE_DEVICE mapped PTE                   |
-+---------------------------+--------------------------------------------------+
-| pte_soft_dirty            | Tests a soft dirty PTE                           |
-+---------------------------+--------------------------------------------------+
-| pte_swp_soft_dirty        | Tests a soft dirty swapped PTE                   |
-+---------------------------+--------------------------------------------------+
-| pte_mkyoung               | Creates a young PTE                              |
-+---------------------------+--------------------------------------------------+
-| pte_mkold                 | Creates an old PTE                               |
-+---------------------------+--------------------------------------------------+
-| pte_mkdirty               | Creates a dirty PTE                              |
-+---------------------------+--------------------------------------------------+
-| pte_mkclean               | Creates a clean PTE                              |
-+---------------------------+--------------------------------------------------+
-| pte_mkwrite               | Creates a writable PTE                           |
-+---------------------------+--------------------------------------------------+
-| pte_wrprotect             | Creates a write protected PTE                    |
-+---------------------------+--------------------------------------------------+
-| pte_mkspecial             | Creates a special PTE                            |
-+---------------------------+--------------------------------------------------+
-| pte_mkdevmap              | Creates a ZONE_DEVICE mapped PTE                 |
-+---------------------------+--------------------------------------------------+
-| pte_mksoft_dirty          | Creates a soft dirty PTE                         |
-+---------------------------+--------------------------------------------------+
-| pte_clear_soft_dirty      | Clears a soft dirty PTE                          |
-+---------------------------+--------------------------------------------------+
-| pte_swp_mksoft_dirty      | Creates a soft dirty swapped PTE                 |
-+---------------------------+--------------------------------------------------+
-| pte_swp_clear_soft_dirty  | Clears a soft dirty swapped PTE                  |
-+---------------------------+--------------------------------------------------+
-| pte_mknotpresent          | Invalidates a mapped PTE                         |
-+---------------------------+--------------------------------------------------+
-| ptep_clear                | Clears a PTE                                     |
-+---------------------------+--------------------------------------------------+
-| ptep_get_and_clear        | Clears and returns PTE                           |
-+---------------------------+--------------------------------------------------+
-| ptep_get_and_clear_full   | Clears and returns PTE (batched PTE unmap)       |
-+---------------------------+--------------------------------------------------+
-| ptep_test_and_clear_young | Clears young from a PTE                          |
-+---------------------------+--------------------------------------------------+
-| ptep_set_wrprotect        | Converts into a write protected PTE              |
-+---------------------------+--------------------------------------------------+
-| ptep_set_access_flags     | Converts into a more permissive PTE              |
-+---------------------------+--------------------------------------------------+
-
-
-PMD Page Table Helpers
-======================
-
-+---------------------------+--------------------------------------------------+
-| pmd_same                  | Tests whether both PMD entries are the same      |
-+---------------------------+--------------------------------------------------+
-| pmd_bad                   | Tests a non-table mapped PMD                     |
-+---------------------------+--------------------------------------------------+
-| pmd_leaf                  | Tests a leaf mapped PMD                          |
-+---------------------------+--------------------------------------------------+
-| pmd_huge                  | Tests a HugeTLB mapped PMD                       |
-+---------------------------+--------------------------------------------------+
-| pmd_trans_huge            | Tests a Transparent Huge Page (THP) at PMD       |
-+---------------------------+--------------------------------------------------+
-| pmd_present               | Tests a valid mapped PMD                         |
-+---------------------------+--------------------------------------------------+
-| pmd_young                 | Tests a young PMD                                |
-+---------------------------+--------------------------------------------------+
-| pmd_dirty                 | Tests a dirty PMD                                |
-+---------------------------+--------------------------------------------------+
-| pmd_write                 | Tests a writable PMD                             |
-+---------------------------+--------------------------------------------------+
-| pmd_special               | Tests a special PMD                              |
-+---------------------------+--------------------------------------------------+
-| pmd_protnone              | Tests a PROT_NONE PMD                            |
-+---------------------------+--------------------------------------------------+
-| pmd_devmap                | Tests a ZONE_DEVICE mapped PMD                   |
-+---------------------------+--------------------------------------------------+
-| pmd_soft_dirty            | Tests a soft dirty PMD                           |
-+---------------------------+--------------------------------------------------+
-| pmd_swp_soft_dirty        | Tests a soft dirty swapped PMD                   |
-+---------------------------+--------------------------------------------------+
-| pmd_mkyoung               | Creates a young PMD                              |
-+---------------------------+--------------------------------------------------+
-| pmd_mkold                 | Creates an old PMD                               |
-+---------------------------+--------------------------------------------------+
-| pmd_mkdirty               | Creates a dirty PMD                              |
-+---------------------------+--------------------------------------------------+
-| pmd_mkclean               | Creates a clean PMD                              |
-+---------------------------+--------------------------------------------------+
-| pmd_mkwrite               | Creates a writable PMD                           |
-+---------------------------+--------------------------------------------------+
-| pmd_wrprotect             | Creates a write protected PMD                    |
-+---------------------------+--------------------------------------------------+
-| pmd_mkspecial             | Creates a special PMD                            |
-+---------------------------+--------------------------------------------------+
-| pmd_mkdevmap              | Creates a ZONE_DEVICE mapped PMD                 |
-+---------------------------+--------------------------------------------------+
-| pmd_mksoft_dirty          | Creates a soft dirty PMD                         |
-+---------------------------+--------------------------------------------------+
-| pmd_clear_soft_dirty      | Clears a soft dirty PMD                          |
-+---------------------------+--------------------------------------------------+
-| pmd_swp_mksoft_dirty      | Creates a soft dirty swapped PMD                 |
-+---------------------------+--------------------------------------------------+
-| pmd_swp_clear_soft_dirty  | Clears a soft dirty swapped PMD                  |
-+---------------------------+--------------------------------------------------+
-| pmd_mkinvalid             | Invalidates a mapped PMD [1]                     |
-+---------------------------+--------------------------------------------------+
-| pmd_set_huge              | Creates a PMD huge mapping                       |
-+---------------------------+--------------------------------------------------+
-| pmd_clear_huge            | Clears a PMD huge mapping                        |
-+---------------------------+--------------------------------------------------+
-| pmdp_get_and_clear        | Clears a PMD                                     |
-+---------------------------+--------------------------------------------------+
-| pmdp_get_and_clear_full   | Clears a PMD                                     |
-+---------------------------+--------------------------------------------------+
-| pmdp_test_and_clear_young | Clears young from a PMD                          |
-+---------------------------+--------------------------------------------------+
-| pmdp_set_wrprotect        | Converts into a write protected PMD              |
-+---------------------------+--------------------------------------------------+
-| pmdp_set_access_flags     | Converts into a more permissive PMD              |
-+---------------------------+--------------------------------------------------+
-
-
-PUD Page Table Helpers
-======================
-
-+---------------------------+--------------------------------------------------+
-| pud_same                  | Tests whether both PUD entries are the same      |
-+---------------------------+--------------------------------------------------+
-| pud_bad                   | Tests a non-table mapped PUD                     |
-+---------------------------+--------------------------------------------------+
-| pud_leaf                  | Tests a leaf mapped PUD                          |
-+---------------------------+--------------------------------------------------+
-| pud_huge                  | Tests a HugeTLB mapped PUD                       |
-+---------------------------+--------------------------------------------------+
-| pud_trans_huge            | Tests a Transparent Huge Page (THP) at PUD       |
-+---------------------------+--------------------------------------------------+
-| pud_present               | Tests a valid mapped PUD                         |
-+---------------------------+--------------------------------------------------+
-| pud_young                 | Tests a young PUD                                |
-+---------------------------+--------------------------------------------------+
-| pud_dirty                 | Tests a dirty PUD                                |
-+---------------------------+--------------------------------------------------+
-| pud_write                 | Tests a writable PUD                             |
-+---------------------------+--------------------------------------------------+
-| pud_devmap                | Tests a ZONE_DEVICE mapped PUD                   |
-+---------------------------+--------------------------------------------------+
-| pud_mkyoung               | Creates a young PUD                              |
-+---------------------------+--------------------------------------------------+
-| pud_mkold                 | Creates an old PUD                               |
-+---------------------------+--------------------------------------------------+
-| pud_mkdirty               | Creates a dirty PUD                              |
-+---------------------------+--------------------------------------------------+
-| pud_mkclean               | Creates a clean PUD                              |
-+---------------------------+--------------------------------------------------+
-| pud_mkwrite               | Creates a writable PUD                           |
-+---------------------------+--------------------------------------------------+
-| pud_wrprotect             | Creates a write protected PUD                    |
-+---------------------------+--------------------------------------------------+
-| pud_mkdevmap              | Creates a ZONE_DEVICE mapped PUD                 |
-+---------------------------+--------------------------------------------------+
-| pud_mkinvalid             | Invalidates a mapped PUD [1]                     |
-+---------------------------+--------------------------------------------------+
-| pud_set_huge              | Creates a PUD huge mapping                       |
-+---------------------------+--------------------------------------------------+
-| pud_clear_huge            | Clears a PUD huge mapping                        |
-+---------------------------+--------------------------------------------------+
-| pudp_get_and_clear        | Clears a PUD                                     |
-+---------------------------+--------------------------------------------------+
-| pudp_get_and_clear_full   | Clears a PUD                                     |
-+---------------------------+--------------------------------------------------+
-| pudp_test_and_clear_young | Clears young from a PUD                          |
-+---------------------------+--------------------------------------------------+
-| pudp_set_wrprotect        | Converts into a write protected PUD              |
-+---------------------------+--------------------------------------------------+
-| pudp_set_access_flags     | Converts into a more permissive PUD              |
-+---------------------------+--------------------------------------------------+
-
-
-HugeTLB Page Table Helpers
-==========================
-
-+---------------------------+--------------------------------------------------+
-| pte_huge                  | Tests a HugeTLB                                  |
-+---------------------------+--------------------------------------------------+
-| pte_mkhuge                | Creates a HugeTLB                                |
-+---------------------------+--------------------------------------------------+
-| huge_pte_dirty            | Tests a dirty HugeTLB                            |
-+---------------------------+--------------------------------------------------+
-| huge_pte_write            | Tests a writable HugeTLB                         |
-+---------------------------+--------------------------------------------------+
-| huge_pte_mkdirty          | Creates a dirty HugeTLB                          |
-+---------------------------+--------------------------------------------------+
-| huge_pte_mkwrite          | Creates a writable HugeTLB                       |
-+---------------------------+--------------------------------------------------+
-| huge_pte_wrprotect        | Creates a write protected HugeTLB                |
-+---------------------------+--------------------------------------------------+
-| huge_ptep_get_and_clear   | Clears a HugeTLB                                 |
-+---------------------------+--------------------------------------------------+
-| huge_ptep_set_wrprotect   | Converts into a write protected HugeTLB          |
-+---------------------------+--------------------------------------------------+
-| huge_ptep_set_access_flags  | Converts into a more permissive HugeTLB        |
-+---------------------------+--------------------------------------------------+
-
-
-SWAP Page Table Helpers
-========================
-
-+---------------------------+--------------------------------------------------+
-| __pte_to_swp_entry        | Creates a swapped entry (arch) from a mapped PTE |
-+---------------------------+--------------------------------------------------+
-| __swp_to_pte_entry        | Creates a mapped PTE from a swapped entry (arch) |
-+---------------------------+--------------------------------------------------+
-| __pmd_to_swp_entry        | Creates a swapped entry (arch) from a mapped PMD |
-+---------------------------+--------------------------------------------------+
-| __swp_to_pmd_entry        | Creates a mapped PMD from a swapped entry (arch) |
-+---------------------------+--------------------------------------------------+
-| is_migration_entry        | Tests a migration (read or write) swapped entry  |
-+-------------------------------+----------------------------------------------+
-| is_writable_migration_entry   | Tests a write migration swapped entry        |
-+-------------------------------+----------------------------------------------+
-| make_readable_migration_entry | Creates a read migration swapped entry       |
-+-------------------------------+----------------------------------------------+
-| make_writable_migration_entry | Creates a write migration swapped entry      |
-+-------------------------------+----------------------------------------------+
-
-[1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/
diff --git a/Documentation/vm/balance.rst b/Documentation/vm/balance.rst
deleted file mode 100644
index 6a1fadf3e173..000000000000
--- a/Documentation/vm/balance.rst
+++ /dev/null
@@ -1,102 +0,0 @@
-.. _balance:
-
-================
-Memory Balancing
-================
-
-Started Jan 2000 by Kanoj Sarcar <kanoj@sgi.com>
-
-Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as
-well as for non __GFP_IO allocations.
-
-The first reason why a caller may avoid reclaim is that the caller can not
-sleep due to holding a spinlock or is in interrupt context. The second may
-be that the caller is willing to fail the allocation without incurring the
-overhead of page reclaim. This may happen for opportunistic high-order
-allocation requests that have order-0 fallback options. In such cases,
-the caller may also wish to avoid waking kswapd.
-
-__GFP_IO allocation requests are made to prevent file system deadlocks.
-
-In the absence of non sleepable allocation requests, it seems detrimental
-to be doing balancing. Page reclamation can be kicked off lazily, that
-is, only when needed (aka zone free memory is 0), instead of making it
-a proactive process.
-
-That being said, the kernel should try to fulfill requests for direct
-mapped pages from the direct mapped pool, instead of falling back on
-the dma pool, so as to keep the dma pool filled for dma requests (atomic
-or not). A similar argument applies to highmem and direct mapped pages.
-OTOH, if there is a lot of free dma pages, it is preferable to satisfy
-regular memory requests by allocating one from the dma pool, instead
-of incurring the overhead of regular zone balancing.
-
-In 2.2, memory balancing/page reclamation would kick off only when the
-_total_ number of free pages fell below 1/64 th of total memory. With the
-right ratio of dma and regular memory, it is quite possible that balancing
-would not be done even when the dma zone was completely empty. 2.2 has
-been running production machines of varying memory sizes, and seems to be
-doing fine even with the presence of this problem. In 2.3, due to
-HIGHMEM, this problem is aggravated.
-
-In 2.3, zone balancing can be done in one of two ways: depending on the
-zone size (and possibly of the size of lower class zones), we can decide
-at init time how many free pages we should aim for while balancing any
-zone. The good part is, while balancing, we do not need to look at sizes
-of lower class zones, the bad part is, we might do too frequent balancing
-due to ignoring possibly lower usage in the lower class zones. Also,
-with a slight change in the allocation routine, it is possible to reduce
-the memclass() macro to be a simple equality.
-
-Another possible solution is that we balance only when the free memory
-of a zone _and_ all its lower class zones falls below 1/64th of the
-total memory in the zone and its lower class zones. This fixes the 2.2
-balancing problem, and stays as close to 2.2 behavior as possible. Also,
-the balancing algorithm works the same way on the various architectures,
-which have different numbers and types of zones. If we wanted to get
-fancy, we could assign different weights to free pages in different
-zones in the future.
-
-Note that if the size of the regular zone is huge compared to dma zone,
-it becomes less significant to consider the free dma pages while
-deciding whether to balance the regular zone. The first solution
-becomes more attractive then.
-
-The appended patch implements the second solution. It also "fixes" two
-problems: first, kswapd is woken up as in 2.2 on low memory conditions
-for non-sleepable allocations. Second, the HIGHMEM zone is also balanced,
-so as to give a fighting chance for replace_with_highmem() to get a
-HIGHMEM page, as well as to ensure that HIGHMEM allocations do not
-fall back into regular zone. This also makes sure that HIGHMEM pages
-are not leaked (for example, in situations where a HIGHMEM page is in
-the swapcache but is not being used by anyone)
-
-kswapd also needs to know about the zones it should balance. kswapd is
-primarily needed in a situation where balancing can not be done,
-probably because all allocation requests are coming from intr context
-and all process contexts are sleeping. For 2.3, kswapd does not really
-need to balance the highmem zone, since intr context does not request
-highmem pages. kswapd looks at the zone_wake_kswapd field in the zone
-structure to decide whether a zone needs balancing.
-
-Page stealing from process memory and shm is done if stealing the page would
-alleviate memory pressure on any zone in the page's node that has fallen below
-its watermark.
-
-watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These
-are per-zone fields, used to determine when a zone needs to be balanced. When
-the number of pages falls below watermark[WMARK_MIN], the hysteric field
-low_on_memory gets set. This stays set till the number of free pages becomes
-watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will
-try to free some pages in the zone (providing GFP_WAIT is set in the request).
-Orthogonal to this, is the decision to poke kswapd to free some zone pages.
-That decision is not hysteresis based, and is done when the number of free
-pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set.
-
-
-(Good) Ideas that I have heard:
-
-1. Dynamic experience should influence balancing: number of failed requests
-   for a zone can be tracked and fed into the balancing scheme (jalvo@mbay.net)
-2. Implement a replace_with_highmem()-like replace_with_regular() to preserve
-   dma pages. (lkd@tantalophile.demon.co.uk)
diff --git a/Documentation/vm/bootmem.rst b/Documentation/vm/bootmem.rst
deleted file mode 100644
index eb2b31eedfa1..000000000000
--- a/Documentation/vm/bootmem.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-===========
-Boot Memory
-===========
diff --git a/Documentation/vm/damon/api.rst b/Documentation/vm/damon/api.rst
deleted file mode 100644
index 08f34df45523..000000000000
--- a/Documentation/vm/damon/api.rst
+++ /dev/null
@@ -1,20 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-=============
-API Reference
-=============
-
-Kernel space programs can use every feature of DAMON using below APIs.  All you
-need to do is including ``damon.h``, which is located in ``include/linux/`` of
-the source tree.
-
-Structures
-==========
-
-.. kernel-doc:: include/linux/damon.h
-
-
-Functions
-=========
-
-.. kernel-doc:: mm/damon/core.c
diff --git a/Documentation/vm/damon/design.rst b/Documentation/vm/damon/design.rst
deleted file mode 100644
index 0cff6fac6b7e..000000000000
--- a/Documentation/vm/damon/design.rst
+++ /dev/null
@@ -1,176 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-======
-Design
-======
-
-Configurable Layers
-===================
-
-DAMON provides data access monitoring functionality while making the accuracy
-and the overhead controllable.  The fundamental access monitorings require
-primitives that dependent on and optimized for the target address space.  On
-the other hand, the accuracy and overhead tradeoff mechanism, which is the core
-of DAMON, is in the pure logic space.  DAMON separates the two parts in
-different layers and defines its interface to allow various low level
-primitives implementations configurable with the core logic.  We call the low
-level primitives implementations monitoring operations.
-
-Due to this separated design and the configurable interface, users can extend
-DAMON for any address space by configuring the core logics with appropriate
-monitoring operations.  If appropriate one is not provided, users can implement
-the operations on their own.
-
-For example, physical memory, virtual memory, swap space, those for specific
-processes, NUMA nodes, files, and backing memory devices would be supportable.
-Also, if some architectures or devices support special optimized access check
-primitives, those will be easily configurable.
-
-
-Reference Implementations of Address Space Specific Monitoring Operations
-=========================================================================
-
-The monitoring operations are defined in two parts:
-
-1. Identification of the monitoring target address range for the address space.
-2. Access check of specific address range in the target space.
-
-DAMON currently provides the implementations of the operations for the physical
-and virtual address spaces. Below two subsections describe how those work.
-
-
-VMA-based Target Address Range Construction
--------------------------------------------
-
-This is only for the virtual address space monitoring operations
-implementation.  That for the physical address space simply asks users to
-manually set the monitoring target address ranges.
-
-Only small parts in the super-huge virtual address space of the processes are
-mapped to the physical memory and accessed.  Thus, tracking the unmapped
-address regions is just wasteful.  However, because DAMON can deal with some
-level of noise using the adaptive regions adjustment mechanism, tracking every
-mapping is not strictly required but could even incur a high overhead in some
-cases.  That said, too huge unmapped areas inside the monitoring target should
-be removed to not take the time for the adaptive mechanism.
-
-For the reason, this implementation converts the complex mappings to three
-distinct regions that cover every mapped area of the address space.  The two
-gaps between the three regions are the two biggest unmapped areas in the given
-address space.  The two biggest unmapped areas would be the gap between the
-heap and the uppermost mmap()-ed region, and the gap between the lowermost
-mmap()-ed region and the stack in most of the cases.  Because these gaps are
-exceptionally huge in usual address spaces, excluding these will be sufficient
-to make a reasonable trade-off.  Below shows this in detail::
-
-    <heap>
-    <BIG UNMAPPED REGION 1>
-    <uppermost mmap()-ed region>
-    (small mmap()-ed regions and munmap()-ed regions)
-    <lowermost mmap()-ed region>
-    <BIG UNMAPPED REGION 2>
-    <stack>
-
-
-PTE Accessed-bit Based Access Check
------------------------------------
-
-Both of the implementations for physical and virtual address spaces use PTE
-Accessed-bit for basic access checks.  Only one difference is the way of
-finding the relevant PTE Accessed bit(s) from the address.  While the
-implementation for the virtual address walks the page table for the target task
-of the address, the implementation for the physical address walks every page
-table having a mapping to the address.  In this way, the implementations find
-and clear the bit(s) for next sampling target address and checks whether the
-bit(s) set again after one sampling period.  This could disturb other kernel
-subsystems using the Accessed bits, namely Idle page tracking and the reclaim
-logic.  DAMON does nothing to avoid disturbing Idle page tracking, so handling
-the interference is the responsibility of sysadmins.  However, it solves the
-conflict with the reclaim logic using ``PG_idle`` and ``PG_young`` page flags,
-as Idle page tracking does.
-
-
-Address Space Independent Core Mechanisms
-=========================================
-
-Below four sections describe each of the DAMON core mechanisms and the five
-monitoring attributes, ``sampling interval``, ``aggregation interval``,
-``update interval``, ``minimum number of regions``, and ``maximum number of
-regions``.
-
-
-Access Frequency Monitoring
----------------------------
-
-The output of DAMON says what pages are how frequently accessed for a given
-duration.  The resolution of the access frequency is controlled by setting
-``sampling interval`` and ``aggregation interval``.  In detail, DAMON checks
-access to each page per ``sampling interval`` and aggregates the results.  In
-other words, counts the number of the accesses to each page.  After each
-``aggregation interval`` passes, DAMON calls callback functions that previously
-registered by users so that users can read the aggregated results and then
-clears the results.  This can be described in below simple pseudo-code::
-
-    while monitoring_on:
-        for page in monitoring_target:
-            if accessed(page):
-                nr_accesses[page] += 1
-        if time() % aggregation_interval == 0:
-            for callback in user_registered_callbacks:
-                callback(monitoring_target, nr_accesses)
-            for page in monitoring_target:
-                nr_accesses[page] = 0
-        sleep(sampling interval)
-
-The monitoring overhead of this mechanism will arbitrarily increase as the
-size of the target workload grows.
-
-
-Region Based Sampling
----------------------
-
-To avoid the unbounded increase of the overhead, DAMON groups adjacent pages
-that assumed to have the same access frequencies into a region.  As long as the
-assumption (pages in a region have the same access frequencies) is kept, only
-one page in the region is required to be checked.  Thus, for each ``sampling
-interval``, DAMON randomly picks one page in each region, waits for one
-``sampling interval``, checks whether the page is accessed meanwhile, and
-increases the access frequency of the region if so.  Therefore, the monitoring
-overhead is controllable by setting the number of regions.  DAMON allows users
-to set the minimum and the maximum number of regions for the trade-off.
-
-This scheme, however, cannot preserve the quality of the output if the
-assumption is not guaranteed.
-
-
-Adaptive Regions Adjustment
----------------------------
-
-Even somehow the initial monitoring target regions are well constructed to
-fulfill the assumption (pages in same region have similar access frequencies),
-the data access pattern can be dynamically changed.  This will result in low
-monitoring quality.  To keep the assumption as much as possible, DAMON
-adaptively merges and splits each region based on their access frequency.
-
-For each ``aggregation interval``, it compares the access frequencies of
-adjacent regions and merges those if the frequency difference is small.  Then,
-after it reports and clears the aggregated access frequency of each region, it
-splits each region into two or three regions if the total number of regions
-will not exceed the user-specified maximum number of regions after the split.
-
-In this way, DAMON provides its best-effort quality and minimal overhead while
-keeping the bounds users set for their trade-off.
-
-
-Dynamic Target Space Updates Handling
--------------------------------------
-
-The monitoring target address range could dynamically changed.  For example,
-virtual memory could be dynamically mapped and unmapped.  Physical memory could
-be hot-plugged.
-
-As the changes could be quite frequent in some cases, DAMON allows the
-monitoring operations to check dynamic changes including memory mapping changes
-and applies it to monitoring operations-related data structures such as the
-abstracted monitoring target memory area only for each of a user-specified time
-interval (``update interval``).
diff --git a/Documentation/vm/damon/faq.rst b/Documentation/vm/damon/faq.rst
deleted file mode 100644
index dde7e2414ee6..000000000000
--- a/Documentation/vm/damon/faq.rst
+++ /dev/null
@@ -1,50 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-==========================
-Frequently Asked Questions
-==========================
-
-Why a new subsystem, instead of extending perf or other user space tools?
-=========================================================================
-
-First, because it needs to be lightweight as much as possible so that it can be
-used online, any unnecessary overhead such as kernel - user space context
-switching cost should be avoided.  Second, DAMON aims to be used by other
-programs including the kernel.  Therefore, having a dependency on specific
-tools like perf is not desirable.  These are the two biggest reasons why DAMON
-is implemented in the kernel space.
-
-
-Can 'idle pages tracking' or 'perf mem' substitute DAMON?
-=========================================================
-
-Idle page tracking is a low level primitive for access check of the physical
-address space.  'perf mem' is similar, though it can use sampling to minimize
-the overhead.  On the other hand, DAMON is a higher-level framework for the
-monitoring of various address spaces.  It is focused on memory management
-optimization and provides sophisticated accuracy/overhead handling mechanisms.
-Therefore, 'idle pages tracking' and 'perf mem' could provide a subset of
-DAMON's output, but cannot substitute DAMON.
-
-
-Does DAMON support virtual memory only?
-=======================================
-
-No.  The core of the DAMON is address space independent.  The address space
-specific monitoring operations including monitoring target regions
-constructions and actual access checks can be implemented and configured on the
-DAMON core by the users.  In this way, DAMON users can monitor any address
-space with any access check technique.
-
-Nonetheless, DAMON provides vma/rmap tracking and PTE Accessed bit check based
-implementations of the address space dependent functions for the virtual memory
-and the physical memory by default, for a reference and convenient use.
-
-
-Can I simply monitor page granularity?
-======================================
-
-Yes.  You can do so by setting the ``min_nr_regions`` attribute higher than the
-working set size divided by the page size.  Because the monitoring target
-regions size is forced to be ``>=page size``, the region split will make no
-effect.
diff --git a/Documentation/vm/damon/index.rst b/Documentation/vm/damon/index.rst
deleted file mode 100644
index 48c0bbff98b2..000000000000
--- a/Documentation/vm/damon/index.rst
+++ /dev/null
@@ -1,29 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-==========================
-DAMON: Data Access MONitor
-==========================
-
-DAMON is a data access monitoring framework subsystem for the Linux kernel.
-The core mechanisms of DAMON (refer to :doc:`design` for the detail) make it
-
- - *accurate* (the monitoring output is useful enough for DRAM level memory
-   management; It might not appropriate for CPU Cache levels, though),
- - *light-weight* (the monitoring overhead is low enough to be applied online),
-   and
- - *scalable* (the upper-bound of the overhead is in constant range regardless
-   of the size of target workloads).
-
-Using this framework, therefore, the kernel's memory management mechanisms can
-make advanced decisions.  Experimental memory management optimization works
-that incurring high data accesses monitoring overhead could implemented again.
-In user space, meanwhile, users who have some special workloads can write
-personalized applications for better understanding and optimizations of their
-workloads and systems.
-
-.. toctree::
-   :maxdepth: 2
-
-   faq
-   design
-   api
diff --git a/Documentation/vm/free_page_reporting.rst b/Documentation/vm/free_page_reporting.rst
deleted file mode 100644
index 8c05e62d8b2b..000000000000
--- a/Documentation/vm/free_page_reporting.rst
+++ /dev/null
@@ -1,40 +0,0 @@
-.. _free_page_reporting:
-
-=====================
-Free Page Reporting
-=====================
-
-Free page reporting is an API by which a device can register to receive
-lists of pages that are currently unused by the system. This is useful in
-the case of virtualization where a guest is then able to use this data to
-notify the hypervisor that it is no longer using certain pages in memory.
-
-For the driver, typically a balloon driver, to use of this functionality
-it will allocate and initialize a page_reporting_dev_info structure. The
-field within the structure it will populate is the "report" function
-pointer used to process the scatterlist. It must also guarantee that it can
-handle at least PAGE_REPORTING_CAPACITY worth of scatterlist entries per
-call to the function. A call to page_reporting_register will register the
-page reporting interface with the reporting framework assuming no other
-page reporting devices are already registered.
-
-Once registered the page reporting API will begin reporting batches of
-pages to the driver. The API will start reporting pages 2 seconds after
-the interface is registered and will continue to do so 2 seconds after any
-page of a sufficiently high order is freed.
-
-Pages reported will be stored in the scatterlist passed to the reporting
-function with the final entry having the end bit set in entry nent - 1.
-While pages are being processed by the report function they will not be
-accessible to the allocator. Once the report function has been completed
-the pages will be returned to the free area from which they were obtained.
-
-Prior to removing a driver that is making use of free page reporting it
-is necessary to call page_reporting_unregister to have the
-page_reporting_dev_info structure that is currently in use by free page
-reporting removed. Doing this will prevent further reports from being
-issued via the interface. If another driver or the same driver is
-registered it is possible for it to resume where the previous driver had
-left off in terms of reporting free pages.
-
-Alexander Duyck, Dec 04, 2019
diff --git a/Documentation/vm/frontswap.rst b/Documentation/vm/frontswap.rst
deleted file mode 100644
index feecc5e24477..000000000000
--- a/Documentation/vm/frontswap.rst
+++ /dev/null
@@ -1,266 +0,0 @@
-.. _frontswap:
-
-=========
-Frontswap
-=========
-
-Frontswap provides a "transcendent memory" interface for swap pages.
-In some environments, dramatic performance savings may be obtained because
-swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk.
-
-.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/
-
-Frontswap is so named because it can be thought of as the opposite of
-a "backing" store for a swap device.  The storage is assumed to be
-a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming
-to the requirements of transcendent memory (such as Xen's "tmem", or
-in-kernel compressed memory, aka "zcache", or future RAM-like devices);
-this pseudo-RAM device is not directly accessible or addressable by the
-kernel and is of unknown and possibly time-varying size.  The driver
-links itself to frontswap by calling frontswap_register_ops to set the
-frontswap_ops funcs appropriately and the functions it provides must
-conform to certain policies as follows:
-
-An "init" prepares the device to receive frontswap pages associated
-with the specified swap device number (aka "type").  A "store" will
-copy the page to transcendent memory and associate it with the type and
-offset associated with the page. A "load" will copy the page, if found,
-from transcendent memory into kernel memory, but will NOT remove the page
-from transcendent memory.  An "invalidate_page" will remove the page
-from transcendent memory and an "invalidate_area" will remove ALL pages
-associated with the swap type (e.g., like swapoff) and notify the "device"
-to refuse further stores with that swap type.
-
-Once a page is successfully stored, a matching load on the page will normally
-succeed.  So when the kernel finds itself in a situation where it needs
-to swap out a page, it first attempts to use frontswap.  If the store returns
-success, the data has been successfully saved to transcendent memory and
-a disk write and, if the data is later read back, a disk read are avoided.
-If a store returns failure, transcendent memory has rejected the data, and the
-page can be written to swap as usual.
-
-Note that if a page is stored and the page already exists in transcendent memory
-(a "duplicate" store), either the store succeeds and the data is overwritten,
-or the store fails AND the page is invalidated.  This ensures stale data may
-never be obtained from frontswap.
-
-If properly configured, monitoring of frontswap is done via debugfs in
-the `/sys/kernel/debug/frontswap` directory.  The effectiveness of
-frontswap can be measured (across all swap devices) with:
-
-``failed_stores``
-	how many store attempts have failed
-
-``loads``
-	how many loads were attempted (all should succeed)
-
-``succ_stores``
-	how many store attempts have succeeded
-
-``invalidates``
-	how many invalidates were attempted
-
-A backend implementation may provide additional metrics.
-
-FAQ
-===
-
-* Where's the value?
-
-When a workload starts swapping, performance falls through the floor.
-Frontswap significantly increases performance in many such workloads by
-providing a clean, dynamic interface to read and write swap pages to
-"transcendent memory" that is otherwise not directly addressable to the kernel.
-This interface is ideal when data is transformed to a different form
-and size (such as with compression) or secretly moved (as might be
-useful for write-balancing for some RAM-like devices).  Swap pages (and
-evicted page-cache pages) are a great use for this kind of slower-than-RAM-
-but-much-faster-than-disk "pseudo-RAM device".
-
-Frontswap with a fairly small impact on the kernel,
-provides a huge amount of flexibility for more dynamic, flexible RAM
-utilization in various system configurations:
-
-In the single kernel case, aka "zcache", pages are compressed and
-stored in local memory, thus increasing the total anonymous pages
-that can be safely kept in RAM.  Zcache essentially trades off CPU
-cycles used in compression/decompression for better memory utilization.
-Benchmarks have shown little or no impact when memory pressure is
-low while providing a significant performance improvement (25%+)
-on some workloads under high memory pressure.
-
-"RAMster" builds on zcache by adding "peer-to-peer" transcendent memory
-support for clustered systems.  Frontswap pages are locally compressed
-as in zcache, but then "remotified" to another system's RAM.  This
-allows RAM to be dynamically load-balanced back-and-forth as needed,
-i.e. when system A is overcommitted, it can swap to system B, and
-vice versa.  RAMster can also be configured as a memory server so
-many servers in a cluster can swap, dynamically as needed, to a single
-server configured with a large amount of RAM... without pre-configuring
-how much of the RAM is available for each of the clients!
-
-In the virtual case, the whole point of virtualization is to statistically
-multiplex physical resources across the varying demands of multiple
-virtual machines.  This is really hard to do with RAM and efforts to do
-it well with no kernel changes have essentially failed (except in some
-well-publicized special-case workloads).
-Specifically, the Xen Transcendent Memory backend allows otherwise
-"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
-virtual machines, but the pages can be compressed and deduplicated to
-optimize RAM utilization.  And when guest OS's are induced to surrender
-underutilized RAM (e.g. with "selfballooning"), sudden unexpected
-memory pressure may result in swapping; frontswap allows those pages
-to be swapped to and from hypervisor RAM (if overall host system memory
-conditions allow), thus mitigating the potentially awful performance impact
-of unplanned swapping.
-
-A KVM implementation is underway and has been RFC'ed to lkml.  And,
-using frontswap, investigation is also underway on the use of NVM as
-a memory extension technology.
-
-* Sure there may be performance advantages in some situations, but
-  what's the space/time overhead of frontswap?
-
-If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into
-nothingness and the only overhead is a few extra bytes per swapon'ed
-swap device.  If CONFIG_FRONTSWAP is enabled but no frontswap "backend"
-registers, there is one extra global variable compared to zero for
-every swap page read or written.  If CONFIG_FRONTSWAP is enabled
-AND a frontswap backend registers AND the backend fails every "store"
-request (i.e. provides no memory despite claiming it might),
-CPU overhead is still negligible -- and since every frontswap fail
-precedes a swap page write-to-disk, the system is highly likely
-to be I/O bound and using a small fraction of a percent of a CPU
-will be irrelevant anyway.
-
-As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend
-registers, one bit is allocated for every swap page for every swap
-device that is swapon'd.  This is added to the EIGHT bits (which
-was sixteen until about 2.6.34) that the kernel already allocates
-for every swap page for every swap device that is swapon'd.  (Hugh
-Dickins has observed that frontswap could probably steal one of
-the existing eight bits, but let's worry about that minor optimization
-later.)  For very large swap disks (which are rare) on a standard
-4K pagesize, this is 1MB per 32GB swap.
-
-When swap pages are stored in transcendent memory instead of written
-out to disk, there is a side effect that this may create more memory
-pressure that can potentially outweigh the other advantages.  A
-backend, such as zcache, must implement policies to carefully (but
-dynamically) manage memory limits to ensure this doesn't happen.
-
-* OK, how about a quick overview of what this frontswap patch does
-  in terms that a kernel hacker can grok?
-
-Let's assume that a frontswap "backend" has registered during
-kernel initialization; this registration indicates that this
-frontswap backend has access to some "memory" that is not directly
-accessible by the kernel.  Exactly how much memory it provides is
-entirely dynamic and random.
-
-Whenever a swap-device is swapon'd frontswap_init() is called,
-passing the swap device number (aka "type") as a parameter.
-This notifies frontswap to expect attempts to "store" swap pages
-associated with that number.
-
-Whenever the swap subsystem is readying a page to write to a swap
-device (c.f swap_writepage()), frontswap_store is called.  Frontswap
-consults with the frontswap backend and if the backend says it does NOT
-have room, frontswap_store returns -1 and the kernel swaps the page
-to the swap device as normal.  Note that the response from the frontswap
-backend is unpredictable to the kernel; it may choose to never accept a
-page, it could accept every ninth page, or it might accept every
-page.  But if the backend does accept a page, the data from the page
-has already been copied and associated with the type and offset,
-and the backend guarantees the persistence of the data.  In this case,
-frontswap sets a bit in the "frontswap_map" for the swap device
-corresponding to the page offset on the swap device to which it would
-otherwise have written the data.
-
-When the swap subsystem needs to swap-in a page (swap_readpage()),
-it first calls frontswap_load() which checks the frontswap_map to
-see if the page was earlier accepted by the frontswap backend.  If
-it was, the page of data is filled from the frontswap backend and
-the swap-in is complete.  If not, the normal swap-in code is
-executed to obtain the page of data from the real swap device.
-
-So every time the frontswap backend accepts a page, a swap device read
-and (potentially) a swap device write are replaced by a "frontswap backend
-store" and (possibly) a "frontswap backend loads", which are presumably much
-faster.
-
-* Can't frontswap be configured as a "special" swap device that is
-  just higher priority than any real swap device (e.g. like zswap,
-  or maybe swap-over-nbd/NFS)?
-
-No.  First, the existing swap subsystem doesn't allow for any kind of
-swap hierarchy.  Perhaps it could be rewritten to accommodate a hierarchy,
-but this would require fairly drastic changes.  Even if it were
-rewritten, the existing swap subsystem uses the block I/O layer which
-assumes a swap device is fixed size and any page in it is linearly
-addressable.  Frontswap barely touches the existing swap subsystem,
-and works around the constraints of the block I/O subsystem to provide
-a great deal of flexibility and dynamicity.
-
-For example, the acceptance of any swap page by the frontswap backend is
-entirely unpredictable. This is critical to the definition of frontswap
-backends because it grants completely dynamic discretion to the
-backend.  In zcache, one cannot know a priori how compressible a page is.
-"Poorly" compressible pages can be rejected, and "poorly" can itself be
-defined dynamically depending on current memory constraints.
-
-Further, frontswap is entirely synchronous whereas a real swap
-device is, by definition, asynchronous and uses block I/O.  The
-block I/O layer is not only unnecessary, but may perform "optimizations"
-that are inappropriate for a RAM-oriented device including delaying
-the write of some pages for a significant amount of time.  Synchrony is
-required to ensure the dynamicity of the backend and to avoid thorny race
-conditions that would unnecessarily and greatly complicate frontswap
-and/or the block I/O subsystem.  That said, only the initial "store"
-and "load" operations need be synchronous.  A separate asynchronous thread
-is free to manipulate the pages stored by frontswap.  For example,
-the "remotification" thread in RAMster uses standard asynchronous
-kernel sockets to move compressed frontswap pages to a remote machine.
-Similarly, a KVM guest-side implementation could do in-guest compression
-and use "batched" hypercalls.
-
-In a virtualized environment, the dynamicity allows the hypervisor
-(or host OS) to do "intelligent overcommit".  For example, it can
-choose to accept pages only until host-swapping might be imminent,
-then force guests to do their own swapping.
-
-There is a downside to the transcendent memory specifications for
-frontswap:  Since any "store" might fail, there must always be a real
-slot on a real swap device to swap the page.  Thus frontswap must be
-implemented as a "shadow" to every swapon'd device with the potential
-capability of holding every page that the swap device might have held
-and the possibility that it might hold no pages at all.  This means
-that frontswap cannot contain more pages than the total of swapon'd
-swap devices.  For example, if NO swap device is configured on some
-installation, frontswap is useless.  Swapless portable devices
-can still use frontswap but a backend for such devices must configure
-some kind of "ghost" swap device and ensure that it is never used.
-
-* Why this weird definition about "duplicate stores"?  If a page
-  has been previously successfully stored, can't it always be
-  successfully overwritten?
-
-Nearly always it can, but no, sometimes it cannot.  Consider an example
-where data is compressed and the original 4K page has been compressed
-to 1K.  Now an attempt is made to overwrite the page with data that
-is non-compressible and so would take the entire 4K.  But the backend
-has no more space.  In this case, the store must be rejected.  Whenever
-frontswap rejects a store that would overwrite, it also must invalidate
-the old data and ensure that it is no longer accessible.  Since the
-swap subsystem then writes the new data to the read swap device,
-this is the correct course of action to ensure coherency.
-
-* Why does the frontswap patch create the new include file swapfile.h?
-
-The frontswap code depends on some swap-subsystem-internal data
-structures that have, over the years, moved back and forth between
-static and global.  This seemed a reasonable compromise:  Define
-them as global but declare them in a new include file that isn't
-included by the large number of source files that include swap.h.
-
-Dan Magenheimer, last updated April 9, 2012
diff --git a/Documentation/vm/highmem.rst b/Documentation/vm/highmem.rst
deleted file mode 100644
index c9887f241c6c..000000000000
--- a/Documentation/vm/highmem.rst
+++ /dev/null
@@ -1,167 +0,0 @@
-.. _highmem:
-
-====================
-High Memory Handling
-====================
-
-By: Peter Zijlstra <a.p.zijlstra@chello.nl>
-
-.. contents:: :local:
-
-What Is High Memory?
-====================
-
-High memory (highmem) is used when the size of physical memory approaches or
-exceeds the maximum size of virtual memory.  At that point it becomes
-impossible for the kernel to keep all of the available physical memory mapped
-at all times.  This means the kernel needs to start using temporary mappings of
-the pieces of physical memory that it wants to access.
-
-The part of (physical) memory not covered by a permanent mapping is what we
-refer to as 'highmem'.  There are various architecture dependent constraints on
-where exactly that border lies.
-
-In the i386 arch, for example, we choose to map the kernel into every process's
-VM space so that we don't have to pay the full TLB invalidation costs for
-kernel entry/exit.  This means the available virtual memory space (4GiB on
-i386) has to be divided between user and kernel space.
-
-The traditional split for architectures using this approach is 3:1, 3GiB for
-userspace and the top 1GiB for kernel space::
-
-		+--------+ 0xffffffff
-		| Kernel |
-		+--------+ 0xc0000000
-		|        |
-		| User   |
-		|        |
-		+--------+ 0x00000000
-
-This means that the kernel can at most map 1GiB of physical memory at any one
-time, but because we need virtual address space for other things - including
-temporary maps to access the rest of the physical memory - the actual direct
-map will typically be less (usually around ~896MiB).
-
-Other architectures that have mm context tagged TLBs can have separate kernel
-and user maps.  Some hardware (like some ARMs), however, have limited virtual
-space when they use mm context tags.
-
-
-Temporary Virtual Mappings
-==========================
-
-The kernel contains several ways of creating temporary mappings. The following
-list shows them in order of preference of use.
-
-* kmap_local_page().  This function is used to require short term mappings.
-  It can be invoked from any context (including interrupts) but the mappings
-  can only be used in the context which acquired them.
-
-  This function should be preferred, where feasible, over all the others.
-
-  These mappings are thread-local and CPU-local, meaning that the mapping
-  can only be accessed from within this thread and the thread is bound the
-  CPU while the mapping is active. Even if the thread is preempted (since
-  preemption is never disabled by the function) the CPU can not be
-  unplugged from the system via CPU-hotplug until the mapping is disposed.
-
-  It's valid to take pagefaults in a local kmap region, unless the context
-  in which the local mapping is acquired does not allow it for other reasons.
-
-  kmap_local_page() always returns a valid virtual address and it is assumed
-  that kunmap_local() will never fail.
-
-  Nesting kmap_local_page() and kmap_atomic() mappings is allowed to a certain
-  extent (up to KMAP_TYPE_NR) but their invocations have to be strictly ordered
-  because the map implementation is stack based. See kmap_local_page() kdocs
-  (included in the "Functions" section) for details on how to manage nested
-  mappings.
-
-* kmap_atomic().  This permits a very short duration mapping of a single
-  page.  Since the mapping is restricted to the CPU that issued it, it
-  performs well, but the issuing task is therefore required to stay on that
-  CPU until it has finished, lest some other task displace its mappings.
-
-  kmap_atomic() may also be used by interrupt contexts, since it does not
-  sleep and the callers too may not sleep until after kunmap_atomic() is
-  called.
-
-  Each call of kmap_atomic() in the kernel creates a non-preemptible section
-  and disable pagefaults. This could be a source of unwanted latency. Therefore
-  users should prefer kmap_local_page() instead of kmap_atomic().
-
-  It is assumed that k[un]map_atomic() won't fail.
-
-* kmap().  This should be used to make short duration mapping of a single
-  page with no restrictions on preemption or migration. It comes with an
-  overhead as mapping space is restricted and protected by a global lock
-  for synchronization. When mapping is no longer needed, the address that
-  the page was mapped to must be released with kunmap().
-
-  Mapping changes must be propagated across all the CPUs. kmap() also
-  requires global TLB invalidation when the kmap's pool wraps and it might
-  block when the mapping space is fully utilized until a slot becomes
-  available. Therefore, kmap() is only callable from preemptible context.
-
-  All the above work is necessary if a mapping must last for a relatively
-  long time but the bulk of high-memory mappings in the kernel are
-  short-lived and only used in one place. This means that the cost of
-  kmap() is mostly wasted in such cases. kmap() was not intended for long
-  term mappings but it has morphed in that direction and its use is
-  strongly discouraged in newer code and the set of the preceding functions
-  should be preferred.
-
-  On 64-bit systems, calls to kmap_local_page(), kmap_atomic() and kmap() have
-  no real work to do because a 64-bit address space is more than sufficient to
-  address all the physical memory whose pages are permanently mapped.
-
-* vmap().  This can be used to make a long duration mapping of multiple
-  physical pages into a contiguous virtual space.  It needs global
-  synchronization to unmap.
-
-
-Cost of Temporary Mappings
-==========================
-
-The cost of creating temporary mappings can be quite high.  The arch has to
-manipulate the kernel's page tables, the data TLB and/or the MMU's registers.
-
-If CONFIG_HIGHMEM is not set, then the kernel will try and create a mapping
-simply with a bit of arithmetic that will convert the page struct address into
-a pointer to the page contents rather than juggling mappings about.  In such a
-case, the unmap operation may be a null operation.
-
-If CONFIG_MMU is not set, then there can be no temporary mappings and no
-highmem.  In such a case, the arithmetic approach will also be used.
-
-
-i386 PAE
-========
-
-The i386 arch, under some circumstances, will permit you to stick up to 64GiB
-of RAM into your 32-bit machine.  This has a number of consequences:
-
-* Linux needs a page-frame structure for each page in the system and the
-  pageframes need to live in the permanent mapping, which means:
-
-* you can have 896M/sizeof(struct page) page-frames at most; with struct
-  page being 32-bytes that would end up being something in the order of 112G
-  worth of pages; the kernel, however, needs to store more than just
-  page-frames in that memory...
-
-* PAE makes your page tables larger - which slows the system down as more
-  data has to be accessed to traverse in TLB fills and the like.  One
-  advantage is that PAE has more PTE bits and can provide advanced features
-  like NX and PAT.
-
-The general recommendation is that you don't use more than 8GiB on a 32-bit
-machine - although more might work for you and your workload, you're pretty
-much on your own - don't expect kernel developers to really care much if things
-come apart.
-
-
-Functions
-=========
-
-.. kernel-doc:: include/linux/highmem.h
-.. kernel-doc:: include/linux/highmem-internal.h
diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
deleted file mode 100644
index f2a59ed82ed3..000000000000
--- a/Documentation/vm/hmm.rst
+++ /dev/null
@@ -1,452 +0,0 @@
-.. _hmm:
-
-=====================================
-Heterogeneous Memory Management (HMM)
-=====================================
-
-Provide infrastructure and helpers to integrate non-conventional memory (device
-memory like GPU on board memory) into regular kernel path, with the cornerstone
-of this being specialized struct page for such memory (see sections 5 to 7 of
-this document).
-
-HMM also provides optional helpers for SVM (Share Virtual Memory), i.e.,
-allowing a device to transparently access program addresses coherently with
-the CPU meaning that any valid pointer on the CPU is also a valid pointer
-for the device. This is becoming mandatory to simplify the use of advanced
-heterogeneous computing where GPU, DSP, or FPGA are used to perform various
-computations on behalf of a process.
-
-This document is divided as follows: in the first section I expose the problems
-related to using device specific memory allocators. In the second section, I
-expose the hardware limitations that are inherent to many platforms. The third
-section gives an overview of the HMM design. The fourth section explains how
-CPU page-table mirroring works and the purpose of HMM in this context. The
-fifth section deals with how device memory is represented inside the kernel.
-Finally, the last section presents a new migration helper that allows
-leveraging the device DMA engine.
-
-.. contents:: :local:
-
-Problems of using a device specific memory allocator
-====================================================
-
-Devices with a large amount of on board memory (several gigabytes) like GPUs
-have historically managed their memory through dedicated driver specific APIs.
-This creates a disconnect between memory allocated and managed by a device
-driver and regular application memory (private anonymous, shared memory, or
-regular file backed memory). From here on I will refer to this aspect as split
-address space. I use shared address space to refer to the opposite situation:
-i.e., one in which any application memory region can be used by a device
-transparently.
-
-Split address space happens because devices can only access memory allocated
-through a device specific API. This implies that all memory objects in a program
-are not equal from the device point of view which complicates large programs
-that rely on a wide set of libraries.
-
-Concretely, this means that code that wants to leverage devices like GPUs needs
-to copy objects between generically allocated memory (malloc, mmap private, mmap
-share) and memory allocated through the device driver API (this still ends up
-with an mmap but of the device file).
-
-For flat data sets (array, grid, image, ...) this isn't too hard to achieve but
-for complex data sets (list, tree, ...) it's hard to get right. Duplicating a
-complex data set needs to re-map all the pointer relations between each of its
-elements. This is error prone and programs get harder to debug because of the
-duplicate data set and addresses.
-
-Split address space also means that libraries cannot transparently use data
-they are getting from the core program or another library and thus each library
-might have to duplicate its input data set using the device specific memory
-allocator. Large projects suffer from this and waste resources because of the
-various memory copies.
-
-Duplicating each library API to accept as input or output memory allocated by
-each device specific allocator is not a viable option. It would lead to a
-combinatorial explosion in the library entry points.
-
-Finally, with the advance of high level language constructs (in C++ but in
-other languages too) it is now possible for the compiler to leverage GPUs and
-other devices without programmer knowledge. Some compiler identified patterns
-are only do-able with a shared address space. It is also more reasonable to use
-a shared address space for all other patterns.
-
-
-I/O bus, device memory characteristics
-======================================
-
-I/O buses cripple shared address spaces due to a few limitations. Most I/O
-buses only allow basic memory access from device to main memory; even cache
-coherency is often optional. Access to device memory from a CPU is even more
-limited. More often than not, it is not cache coherent.
-
-If we only consider the PCIE bus, then a device can access main memory (often
-through an IOMMU) and be cache coherent with the CPUs. However, it only allows
-a limited set of atomic operations from the device on main memory. This is worse
-in the other direction: the CPU can only access a limited range of the device
-memory and cannot perform atomic operations on it. Thus device memory cannot
-be considered the same as regular memory from the kernel point of view.
-
-Another crippling factor is the limited bandwidth (~32GBytes/s with PCIE 4.0
-and 16 lanes). This is 33 times less than the fastest GPU memory (1 TBytes/s).
-The final limitation is latency. Access to main memory from the device has an
-order of magnitude higher latency than when the device accesses its own memory.
-
-Some platforms are developing new I/O buses or additions/modifications to PCIE
-to address some of these limitations (OpenCAPI, CCIX). They mainly allow
-two-way cache coherency between CPU and device and allow all atomic operations the
-architecture supports. Sadly, not all platforms are following this trend and
-some major architectures are left without hardware solutions to these problems.
-
-So for shared address space to make sense, not only must we allow devices to
-access any memory but we must also permit any memory to be migrated to device
-memory while the device is using it (blocking CPU access while it happens).
-
-
-Shared address space and migration
-==================================
-
-HMM intends to provide two main features. The first one is to share the address
-space by duplicating the CPU page table in the device page table so the same
-address points to the same physical memory for any valid main memory address in
-the process address space.
-
-To achieve this, HMM offers a set of helpers to populate the device page table
-while keeping track of CPU page table updates. Device page table updates are
-not as easy as CPU page table updates. To update the device page table, you must
-allocate a buffer (or use a pool of pre-allocated buffers) and write GPU
-specific commands in it to perform the update (unmap, cache invalidations, and
-flush, ...). This cannot be done through common code for all devices. Hence
-why HMM provides helpers to factor out everything that can be while leaving the
-hardware specific details to the device driver.
-
-The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that
-allows allocating a struct page for each page of device memory. Those pages
-are special because the CPU cannot map them. However, they allow migrating
-main memory to device memory using existing migration mechanisms and everything
-looks like a page that is swapped out to disk from the CPU point of view. Using a
-struct page gives the easiest and cleanest integration with existing mm
-mechanisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE
-memory for the device memory and second to perform migration. Policy decisions
-of what and when to migrate is left to the device driver.
-
-Note that any CPU access to a device page triggers a page fault and a migration
-back to main memory. For example, when a page backing a given CPU address A is
-migrated from a main memory page to a device page, then any CPU access to
-address A triggers a page fault and initiates a migration back to main memory.
-
-With these two features, HMM not only allows a device to mirror process address
-space and keeps both CPU and device page tables synchronized, but also
-leverages device memory by migrating the part of the data set that is actively being
-used by the device.
-
-
-Address space mirroring implementation and API
-==============================================
-
-Address space mirroring's main objective is to allow duplication of a range of
-CPU page table into a device page table; HMM helps keep both synchronized. A
-device driver that wants to mirror a process address space must start with the
-registration of a mmu_interval_notifier::
-
- int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
-				  struct mm_struct *mm, unsigned long start,
-				  unsigned long length,
-				  const struct mmu_interval_notifier_ops *ops);
-
-During the ops->invalidate() callback the device driver must perform the
-update action to the range (mark range read only, or fully unmap, etc.). The
-device must complete the update before the driver callback returns.
-
-When the device driver wants to populate a range of virtual addresses, it can
-use::
-
-  int hmm_range_fault(struct hmm_range *range);
-
-It will trigger a page fault on missing or read-only entries if write access is
-requested (see below). Page faults use the generic mm page fault code path just
-like a CPU page fault.
-
-Both functions copy CPU page table entries into their pfns array argument. Each
-entry in that array corresponds to an address in the virtual range. HMM
-provides a set of flags to help the driver identify special CPU page table
-entries.
-
-Locking within the sync_cpu_device_pagetables() callback is the most important
-aspect the driver must respect in order to keep things properly synchronized.
-The usage pattern is::
-
- int driver_populate_range(...)
- {
-      struct hmm_range range;
-      ...
-
-      range.notifier = &interval_sub;
-      range.start = ...;
-      range.end = ...;
-      range.hmm_pfns = ...;
-
-      if (!mmget_not_zero(interval_sub->notifier.mm))
-          return -EFAULT;
-
- again:
-      range.notifier_seq = mmu_interval_read_begin(&interval_sub);
-      mmap_read_lock(mm);
-      ret = hmm_range_fault(&range);
-      if (ret) {
-          mmap_read_unlock(mm);
-          if (ret == -EBUSY)
-                 goto again;
-          return ret;
-      }
-      mmap_read_unlock(mm);
-
-      take_lock(driver->update);
-      if (mmu_interval_read_retry(&ni, range.notifier_seq) {
-          release_lock(driver->update);
-          goto again;
-      }
-
-      /* Use pfns array content to update device page table,
-       * under the update lock */
-
-      release_lock(driver->update);
-      return 0;
- }
-
-The driver->update lock is the same lock that the driver takes inside its
-invalidate() callback. That lock must be held before calling
-mmu_interval_read_retry() to avoid any race with a concurrent CPU page table
-update.
-
-Leverage default_flags and pfn_flags_mask
-=========================================
-
-The hmm_range struct has 2 fields, default_flags and pfn_flags_mask, that specify
-fault or snapshot policy for the whole range instead of having to set them
-for each entry in the pfns array.
-
-For instance if the device driver wants pages for a range with at least read
-permission, it sets::
-
-    range->default_flags = HMM_PFN_REQ_FAULT;
-    range->pfn_flags_mask = 0;
-
-and calls hmm_range_fault() as described above. This will fill fault all pages
-in the range with at least read permission.
-
-Now let's say the driver wants to do the same except for one page in the range for
-which it wants to have write permission. Now driver set::
-
-    range->default_flags = HMM_PFN_REQ_FAULT;
-    range->pfn_flags_mask = HMM_PFN_REQ_WRITE;
-    range->pfns[index_of_write] = HMM_PFN_REQ_WRITE;
-
-With this, HMM will fault in all pages with at least read (i.e., valid) and for the
-address == range->start + (index_of_write << PAGE_SHIFT) it will fault with
-write permission i.e., if the CPU pte does not have write permission set then HMM
-will call handle_mm_fault().
-
-After hmm_range_fault completes the flag bits are set to the current state of
-the page tables, ie HMM_PFN_VALID | HMM_PFN_WRITE will be set if the page is
-writable.
-
-
-Represent and manage device memory from core kernel point of view
-=================================================================
-
-Several different designs were tried to support device memory. The first one
-used a device specific data structure to keep information about migrated memory
-and HMM hooked itself in various places of mm code to handle any access to
-addresses that were backed by device memory. It turns out that this ended up
-replicating most of the fields of struct page and also needed many kernel code
-paths to be updated to understand this new kind of memory.
-
-Most kernel code paths never try to access the memory behind a page
-but only care about struct page contents. Because of this, HMM switched to
-directly using struct page for device memory which left most kernel code paths
-unaware of the difference. We only need to make sure that no one ever tries to
-map those pages from the CPU side.
-
-Migration to and from device memory
-===================================
-
-Because the CPU cannot access device memory directly, the device driver must
-use hardware DMA or device specific load/store instructions to migrate data.
-The migrate_vma_setup(), migrate_vma_pages(), and migrate_vma_finalize()
-functions are designed to make drivers easier to write and to centralize common
-code across drivers.
-
-Before migrating pages to device private memory, special device private
-``struct page`` need to be created. These will be used as special "swap"
-page table entries so that a CPU process will fault if it tries to access
-a page that has been migrated to device private memory.
-
-These can be allocated and freed with::
-
-    struct resource *res;
-    struct dev_pagemap pagemap;
-
-    res = request_free_mem_region(&iomem_resource, /* number of bytes */,
-                                  "name of driver resource");
-    pagemap.type = MEMORY_DEVICE_PRIVATE;
-    pagemap.range.start = res->start;
-    pagemap.range.end = res->end;
-    pagemap.nr_range = 1;
-    pagemap.ops = &device_devmem_ops;
-    memremap_pages(&pagemap, numa_node_id());
-
-    memunmap_pages(&pagemap);
-    release_mem_region(pagemap.range.start, range_len(&pagemap.range));
-
-There are also devm_request_free_mem_region(), devm_memremap_pages(),
-devm_memunmap_pages(), and devm_release_mem_region() when the resources can
-be tied to a ``struct device``.
-
-The overall migration steps are similar to migrating NUMA pages within system
-memory (see :ref:`Page migration <page_migration>`) but the steps are split
-between device driver specific code and shared common code:
-
-1. ``mmap_read_lock()``
-
-   The device driver has to pass a ``struct vm_area_struct`` to
-   migrate_vma_setup() so the mmap_read_lock() or mmap_write_lock() needs to
-   be held for the duration of the migration.
-
-2. ``migrate_vma_setup(struct migrate_vma *args)``
-
-   The device driver initializes the ``struct migrate_vma`` fields and passes
-   the pointer to migrate_vma_setup(). The ``args->flags`` field is used to
-   filter which source pages should be migrated. For example, setting
-   ``MIGRATE_VMA_SELECT_SYSTEM`` will only migrate system memory and
-   ``MIGRATE_VMA_SELECT_DEVICE_PRIVATE`` will only migrate pages residing in
-   device private memory. If the latter flag is set, the ``args->pgmap_owner``
-   field is used to identify device private pages owned by the driver. This
-   avoids trying to migrate device private pages residing in other devices.
-   Currently only anonymous private VMA ranges can be migrated to or from
-   system memory and device private memory.
-
-   One of the first steps migrate_vma_setup() does is to invalidate other
-   device's MMUs with the ``mmu_notifier_invalidate_range_start(()`` and
-   ``mmu_notifier_invalidate_range_end()`` calls around the page table
-   walks to fill in the ``args->src`` array with PFNs to be migrated.
-   The ``invalidate_range_start()`` callback is passed a
-   ``struct mmu_notifier_range`` with the ``event`` field set to
-   ``MMU_NOTIFY_MIGRATE`` and the ``owner`` field set to
-   the ``args->pgmap_owner`` field passed to migrate_vma_setup(). This is
-   allows the device driver to skip the invalidation callback and only
-   invalidate device private MMU mappings that are actually migrating.
-   This is explained more in the next section.
-
-   While walking the page tables, a ``pte_none()`` or ``is_zero_pfn()``
-   entry results in a valid "zero" PFN stored in the ``args->src`` array.
-   This lets the driver allocate device private memory and clear it instead
-   of copying a page of zeros. Valid PTE entries to system memory or
-   device private struct pages will be locked with ``lock_page()``, isolated
-   from the LRU (if system memory since device private pages are not on
-   the LRU), unmapped from the process, and a special migration PTE is
-   inserted in place of the original PTE.
-   migrate_vma_setup() also clears the ``args->dst`` array.
-
-3. The device driver allocates destination pages and copies source pages to
-   destination pages.
-
-   The driver checks each ``src`` entry to see if the ``MIGRATE_PFN_MIGRATE``
-   bit is set and skips entries that are not migrating. The device driver
-   can also choose to skip migrating a page by not filling in the ``dst``
-   array for that page.
-
-   The driver then allocates either a device private struct page or a
-   system memory page, locks the page with ``lock_page()``, and fills in the
-   ``dst`` array entry with::
-
-     dst[i] = migrate_pfn(page_to_pfn(dpage));
-
-   Now that the driver knows that this page is being migrated, it can
-   invalidate device private MMU mappings and copy device private memory
-   to system memory or another device private page. The core Linux kernel
-   handles CPU page table invalidations so the device driver only has to
-   invalidate its own MMU mappings.
-
-   The driver can use ``migrate_pfn_to_page(src[i])`` to get the
-   ``struct page`` of the source and either copy the source page to the
-   destination or clear the destination device private memory if the pointer
-   is ``NULL`` meaning the source page was not populated in system memory.
-
-4. ``migrate_vma_pages()``
-
-   This step is where the migration is actually "committed".
-
-   If the source page was a ``pte_none()`` or ``is_zero_pfn()`` page, this
-   is where the newly allocated page is inserted into the CPU's page table.
-   This can fail if a CPU thread faults on the same page. However, the page
-   table is locked and only one of the new pages will be inserted.
-   The device driver will see that the ``MIGRATE_PFN_MIGRATE`` bit is cleared
-   if it loses the race.
-
-   If the source page was locked, isolated, etc. the source ``struct page``
-   information is now copied to destination ``struct page`` finalizing the
-   migration on the CPU side.
-
-5. Device driver updates device MMU page tables for pages still migrating,
-   rolling back pages not migrating.
-
-   If the ``src`` entry still has ``MIGRATE_PFN_MIGRATE`` bit set, the device
-   driver can update the device MMU and set the write enable bit if the
-   ``MIGRATE_PFN_WRITE`` bit is set.
-
-6. ``migrate_vma_finalize()``
-
-   This step replaces the special migration page table entry with the new
-   page's page table entry and releases the reference to the source and
-   destination ``struct page``.
-
-7. ``mmap_read_unlock()``
-
-   The lock can now be released.
-
-Exclusive access memory
-=======================
-
-Some devices have features such as atomic PTE bits that can be used to implement
-atomic access to system memory. To support atomic operations to a shared virtual
-memory page such a device needs access to that page which is exclusive of any
-userspace access from the CPU. The ``make_device_exclusive_range()`` function
-can be used to make a memory range inaccessible from userspace.
-
-This replaces all mappings for pages in the given range with special swap
-entries. Any attempt to access the swap entry results in a fault which is
-resovled by replacing the entry with the original mapping. A driver gets
-notified that the mapping has been changed by MMU notifiers, after which point
-it will no longer have exclusive access to the page. Exclusive access is
-guranteed to last until the driver drops the page lock and page reference, at
-which point any CPU faults on the page may proceed as described.
-
-Memory cgroup (memcg) and rss accounting
-========================================
-
-For now, device memory is accounted as any regular page in rss counters (either
-anonymous if device page is used for anonymous, file if device page is used for
-file backed page, or shmem if device page is used for shared memory). This is a
-deliberate choice to keep existing applications, that might start using device
-memory without knowing about it, running unimpacted.
-
-A drawback is that the OOM killer might kill an application using a lot of
-device memory and not a lot of regular system memory and thus not freeing much
-system memory. We want to gather more real world experience on how applications
-and system react under memory pressure in the presence of device memory before
-deciding to account device memory differently.
-
-
-Same decision was made for memory cgroup. Device memory pages are accounted
-against same memory cgroup a regular page would be accounted to. This does
-simplify migration to and from device memory. This also means that migration
-back from device memory to regular memory cannot fail because it would
-go above memory cgroup limit. We might revisit this choice latter on once we
-get more experience in how device memory is used and its impact on memory
-resource control.
-
-
-Note that device memory can never be pinned by a device driver nor through GUP
-and thus such memory is always free upon process exit. Or when last reference
-is dropped in case of shared memory or file backed memory.
diff --git a/Documentation/vm/hugetlbfs_reserv.rst b/Documentation/vm/hugetlbfs_reserv.rst
deleted file mode 100644
index f143954e0d05..000000000000
--- a/Documentation/vm/hugetlbfs_reserv.rst
+++ /dev/null
@@ -1,596 +0,0 @@
-.. _hugetlbfs_reserve:
-
-=====================
-Hugetlbfs Reservation
-=====================
-
-Overview
-========
-
-Huge pages as described at :ref:`hugetlbpage` are typically
-preallocated for application use.  These huge pages are instantiated in a
-task's address space at page fault time if the VMA indicates huge pages are
-to be used.  If no huge page exists at page fault time, the task is sent
-a SIGBUS and often dies an unhappy death.  Shortly after huge page support
-was added, it was determined that it would be better to detect a shortage
-of huge pages at mmap() time.  The idea is that if there were not enough
-huge pages to cover the mapping, the mmap() would fail.  This was first
-done with a simple check in the code at mmap() time to determine if there
-were enough free huge pages to cover the mapping.  Like most things in the
-kernel, the code has evolved over time.  However, the basic idea was to
-'reserve' huge pages at mmap() time to ensure that huge pages would be
-available for page faults in that mapping.  The description below attempts to
-describe how huge page reserve processing is done in the v4.10 kernel.
-
-
-Audience
-========
-This description is primarily targeted at kernel developers who are modifying
-hugetlbfs code.
-
-
-The Data Structures
-===================
-
-resv_huge_pages
-	This is a global (per-hstate) count of reserved huge pages.  Reserved
-	huge pages are only available to the task which reserved them.
-	Therefore, the number of huge pages generally available is computed
-	as (``free_huge_pages - resv_huge_pages``).
-Reserve Map
-	A reserve map is described by the structure::
-
-		struct resv_map {
-			struct kref refs;
-			spinlock_t lock;
-			struct list_head regions;
-			long adds_in_progress;
-			struct list_head region_cache;
-			long region_cache_count;
-		};
-
-	There is one reserve map for each huge page mapping in the system.
-	The regions list within the resv_map describes the regions within
-	the mapping.  A region is described as::
-
-		struct file_region {
-			struct list_head link;
-			long from;
-			long to;
-		};
-
-	The 'from' and 'to' fields of the file region structure are huge page
-	indices into the mapping.  Depending on the type of mapping, a
-	region in the reserv_map may indicate reservations exist for the
-	range, or reservations do not exist.
-Flags for MAP_PRIVATE Reservations
-	These are stored in the bottom bits of the reservation map pointer.
-
-	``#define HPAGE_RESV_OWNER    (1UL << 0)``
-		Indicates this task is the owner of the reservations
-		associated with the mapping.
-	``#define HPAGE_RESV_UNMAPPED (1UL << 1)``
-		Indicates task originally mapping this range (and creating
-		reserves) has unmapped a page from this task (the child)
-		due to a failed COW.
-Page Flags
-	The PagePrivate page flag is used to indicate that a huge page
-	reservation must be restored when the huge page is freed.  More
-	details will be discussed in the "Freeing huge pages" section.
-
-
-Reservation Map Location (Private or Shared)
-============================================
-
-A huge page mapping or segment is either private or shared.  If private,
-it is typically only available to a single address space (task).  If shared,
-it can be mapped into multiple address spaces (tasks).  The location and
-semantics of the reservation map is significantly different for the two types
-of mappings.  Location differences are:
-
-- For private mappings, the reservation map hangs off the VMA structure.
-  Specifically, vma->vm_private_data.  This reserve map is created at the
-  time the mapping (mmap(MAP_PRIVATE)) is created.
-- For shared mappings, the reservation map hangs off the inode.  Specifically,
-  inode->i_mapping->private_data.  Since shared mappings are always backed
-  by files in the hugetlbfs filesystem, the hugetlbfs code ensures each inode
-  contains a reservation map.  As a result, the reservation map is allocated
-  when the inode is created.
-
-
-Creating Reservations
-=====================
-Reservations are created when a huge page backed shared memory segment is
-created (shmget(SHM_HUGETLB)) or a mapping is created via mmap(MAP_HUGETLB).
-These operations result in a call to the routine hugetlb_reserve_pages()::
-
-	int hugetlb_reserve_pages(struct inode *inode,
-				  long from, long to,
-				  struct vm_area_struct *vma,
-				  vm_flags_t vm_flags)
-
-The first thing hugetlb_reserve_pages() does is check if the NORESERVE
-flag was specified in either the shmget() or mmap() call.  If NORESERVE
-was specified, then this routine returns immediately as no reservations
-are desired.
-
-The arguments 'from' and 'to' are huge page indices into the mapping or
-underlying file.  For shmget(), 'from' is always 0 and 'to' corresponds to
-the length of the segment/mapping.  For mmap(), the offset argument could
-be used to specify the offset into the underlying file.  In such a case,
-the 'from' and 'to' arguments have been adjusted by this offset.
-
-One of the big differences between PRIVATE and SHARED mappings is the way
-in which reservations are represented in the reservation map.
-
-- For shared mappings, an entry in the reservation map indicates a reservation
-  exists or did exist for the corresponding page.  As reservations are
-  consumed, the reservation map is not modified.
-- For private mappings, the lack of an entry in the reservation map indicates
-  a reservation exists for the corresponding page.  As reservations are
-  consumed, entries are added to the reservation map.  Therefore, the
-  reservation map can also be used to determine which reservations have
-  been consumed.
-
-For private mappings, hugetlb_reserve_pages() creates the reservation map and
-hangs it off the VMA structure.  In addition, the HPAGE_RESV_OWNER flag is set
-to indicate this VMA owns the reservations.
-
-The reservation map is consulted to determine how many huge page reservations
-are needed for the current mapping/segment.  For private mappings, this is
-always the value (to - from).  However, for shared mappings it is possible that
-some reservations may already exist within the range (to - from).  See the
-section :ref:`Reservation Map Modifications <resv_map_modifications>`
-for details on how this is accomplished.
-
-The mapping may be associated with a subpool.  If so, the subpool is consulted
-to ensure there is sufficient space for the mapping.  It is possible that the
-subpool has set aside reservations that can be used for the mapping.  See the
-section :ref:`Subpool Reservations <sub_pool_resv>` for more details.
-
-After consulting the reservation map and subpool, the number of needed new
-reservations is known.  The routine hugetlb_acct_memory() is called to check
-for and take the requested number of reservations.  hugetlb_acct_memory()
-calls into routines that potentially allocate and adjust surplus page counts.
-However, within those routines the code is simply checking to ensure there
-are enough free huge pages to accommodate the reservation.  If there are,
-the global reservation count resv_huge_pages is adjusted something like the
-following::
-
-	if (resv_needed <= (resv_huge_pages - free_huge_pages))
-		resv_huge_pages += resv_needed;
-
-Note that the global lock hugetlb_lock is held when checking and adjusting
-these counters.
-
-If there were enough free huge pages and the global count resv_huge_pages
-was adjusted, then the reservation map associated with the mapping is
-modified to reflect the reservations.  In the case of a shared mapping, a
-file_region will exist that includes the range 'from' - 'to'.  For private
-mappings, no modifications are made to the reservation map as lack of an
-entry indicates a reservation exists.
-
-If hugetlb_reserve_pages() was successful, the global reservation count and
-reservation map associated with the mapping will be modified as required to
-ensure reservations exist for the range 'from' - 'to'.
-
-.. _consume_resv:
-
-Consuming Reservations/Allocating a Huge Page
-=============================================
-
-Reservations are consumed when huge pages associated with the reservations
-are allocated and instantiated in the corresponding mapping.  The allocation
-is performed within the routine alloc_huge_page()::
-
-	struct page *alloc_huge_page(struct vm_area_struct *vma,
-				     unsigned long addr, int avoid_reserve)
-
-alloc_huge_page is passed a VMA pointer and a virtual address, so it can
-consult the reservation map to determine if a reservation exists.  In addition,
-alloc_huge_page takes the argument avoid_reserve which indicates reserves
-should not be used even if it appears they have been set aside for the
-specified address.  The avoid_reserve argument is most often used in the case
-of Copy on Write and Page Migration where additional copies of an existing
-page are being allocated.
-
-The helper routine vma_needs_reservation() is called to determine if a
-reservation exists for the address within the mapping(vma).  See the section
-:ref:`Reservation Map Helper Routines <resv_map_helpers>` for detailed
-information on what this routine does.
-The value returned from vma_needs_reservation() is generally
-0 or 1.  0 if a reservation exists for the address, 1 if no reservation exists.
-If a reservation does not exist, and there is a subpool associated with the
-mapping the subpool is consulted to determine if it contains reservations.
-If the subpool contains reservations, one can be used for this allocation.
-However, in every case the avoid_reserve argument overrides the use of
-a reservation for the allocation.  After determining whether a reservation
-exists and can be used for the allocation, the routine dequeue_huge_page_vma()
-is called.  This routine takes two arguments related to reservations:
-
-- avoid_reserve, this is the same value/argument passed to alloc_huge_page()
-- chg, even though this argument is of type long only the values 0 or 1 are
-  passed to dequeue_huge_page_vma.  If the value is 0, it indicates a
-  reservation exists (see the section "Memory Policy and Reservations" for
-  possible issues).  If the value is 1, it indicates a reservation does not
-  exist and the page must be taken from the global free pool if possible.
-
-The free lists associated with the memory policy of the VMA are searched for
-a free page.  If a page is found, the value free_huge_pages is decremented
-when the page is removed from the free list.  If there was a reservation
-associated with the page, the following adjustments are made::
-
-	SetPagePrivate(page);	/* Indicates allocating this page consumed
-				 * a reservation, and if an error is
-				 * encountered such that the page must be
-				 * freed, the reservation will be restored. */
-	resv_huge_pages--;	/* Decrement the global reservation count */
-
-Note, if no huge page can be found that satisfies the VMA's memory policy
-an attempt will be made to allocate one using the buddy allocator.  This
-brings up the issue of surplus huge pages and overcommit which is beyond
-the scope reservations.  Even if a surplus page is allocated, the same
-reservation based adjustments as above will be made: SetPagePrivate(page) and
-resv_huge_pages--.
-
-After obtaining a new huge page, (page)->private is set to the value of
-the subpool associated with the page if it exists.  This will be used for
-subpool accounting when the page is freed.
-
-The routine vma_commit_reservation() is then called to adjust the reserve
-map based on the consumption of the reservation.  In general, this involves
-ensuring the page is represented within a file_region structure of the region
-map.  For shared mappings where the reservation was present, an entry
-in the reserve map already existed so no change is made.  However, if there
-was no reservation in a shared mapping or this was a private mapping a new
-entry must be created.
-
-It is possible that the reserve map could have been changed between the call
-to vma_needs_reservation() at the beginning of alloc_huge_page() and the
-call to vma_commit_reservation() after the page was allocated.  This would
-be possible if hugetlb_reserve_pages was called for the same page in a shared
-mapping.  In such cases, the reservation count and subpool free page count
-will be off by one.  This rare condition can be identified by comparing the
-return value from vma_needs_reservation and vma_commit_reservation.  If such
-a race is detected, the subpool and global reserve counts are adjusted to
-compensate.  See the section
-:ref:`Reservation Map Helper Routines <resv_map_helpers>` for more
-information on these routines.
-
-
-Instantiate Huge Pages
-======================
-
-After huge page allocation, the page is typically added to the page tables
-of the allocating task.  Before this, pages in a shared mapping are added
-to the page cache and pages in private mappings are added to an anonymous
-reverse mapping.  In both cases, the PagePrivate flag is cleared.  Therefore,
-when a huge page that has been instantiated is freed no adjustment is made
-to the global reservation count (resv_huge_pages).
-
-
-Freeing Huge Pages
-==================
-
-Huge page freeing is performed by the routine free_huge_page().  This routine
-is the destructor for hugetlbfs compound pages.  As a result, it is only
-passed a pointer to the page struct.  When a huge page is freed, reservation
-accounting may need to be performed.  This would be the case if the page was
-associated with a subpool that contained reserves, or the page is being freed
-on an error path where a global reserve count must be restored.
-
-The page->private field points to any subpool associated with the page.
-If the PagePrivate flag is set, it indicates the global reserve count should
-be adjusted (see the section
-:ref:`Consuming Reservations/Allocating a Huge Page <consume_resv>`
-for information on how these are set).
-
-The routine first calls hugepage_subpool_put_pages() for the page.  If this
-routine returns a value of 0 (which does not equal the value passed 1) it
-indicates reserves are associated with the subpool, and this newly free page
-must be used to keep the number of subpool reserves above the minimum size.
-Therefore, the global resv_huge_pages counter is incremented in this case.
-
-If the PagePrivate flag was set in the page, the global resv_huge_pages counter
-will always be incremented.
-
-.. _sub_pool_resv:
-
-Subpool Reservations
-====================
-
-There is a struct hstate associated with each huge page size.  The hstate
-tracks all huge pages of the specified size.  A subpool represents a subset
-of pages within a hstate that is associated with a mounted hugetlbfs
-filesystem.
-
-When a hugetlbfs filesystem is mounted a min_size option can be specified
-which indicates the minimum number of huge pages required by the filesystem.
-If this option is specified, the number of huge pages corresponding to
-min_size are reserved for use by the filesystem.  This number is tracked in
-the min_hpages field of a struct hugepage_subpool.  At mount time,
-hugetlb_acct_memory(min_hpages) is called to reserve the specified number of
-huge pages.  If they can not be reserved, the mount fails.
-
-The routines hugepage_subpool_get/put_pages() are called when pages are
-obtained from or released back to a subpool.  They perform all subpool
-accounting, and track any reservations associated with the subpool.
-hugepage_subpool_get/put_pages are passed the number of huge pages by which
-to adjust the subpool 'used page' count (down for get, up for put).  Normally,
-they return the same value that was passed or an error if not enough pages
-exist in the subpool.
-
-However, if reserves are associated with the subpool a return value less
-than the passed value may be returned.  This return value indicates the
-number of additional global pool adjustments which must be made.  For example,
-suppose a subpool contains 3 reserved huge pages and someone asks for 5.
-The 3 reserved pages associated with the subpool can be used to satisfy part
-of the request.  But, 2 pages must be obtained from the global pools.  To
-relay this information to the caller, the value 2 is returned.  The caller
-is then responsible for attempting to obtain the additional two pages from
-the global pools.
-
-
-COW and Reservations
-====================
-
-Since shared mappings all point to and use the same underlying pages, the
-biggest reservation concern for COW is private mappings.  In this case,
-two tasks can be pointing at the same previously allocated page.  One task
-attempts to write to the page, so a new page must be allocated so that each
-task points to its own page.
-
-When the page was originally allocated, the reservation for that page was
-consumed.  When an attempt to allocate a new page is made as a result of
-COW, it is possible that no free huge pages are free and the allocation
-will fail.
-
-When the private mapping was originally created, the owner of the mapping
-was noted by setting the HPAGE_RESV_OWNER bit in the pointer to the reservation
-map of the owner.  Since the owner created the mapping, the owner owns all
-the reservations associated with the mapping.  Therefore, when a write fault
-occurs and there is no page available, different action is taken for the owner
-and non-owner of the reservation.
-
-In the case where the faulting task is not the owner, the fault will fail and
-the task will typically receive a SIGBUS.
-
-If the owner is the faulting task, we want it to succeed since it owned the
-original reservation.  To accomplish this, the page is unmapped from the
-non-owning task.  In this way, the only reference is from the owning task.
-In addition, the HPAGE_RESV_UNMAPPED bit is set in the reservation map pointer
-of the non-owning task.  The non-owning task may receive a SIGBUS if it later
-faults on a non-present page.  But, the original owner of the
-mapping/reservation will behave as expected.
-
-
-.. _resv_map_modifications:
-
-Reservation Map Modifications
-=============================
-
-The following low level routines are used to make modifications to a
-reservation map.  Typically, these routines are not called directly.  Rather,
-a reservation map helper routine is called which calls one of these low level
-routines.  These low level routines are fairly well documented in the source
-code (mm/hugetlb.c).  These routines are::
-
-	long region_chg(struct resv_map *resv, long f, long t);
-	long region_add(struct resv_map *resv, long f, long t);
-	void region_abort(struct resv_map *resv, long f, long t);
-	long region_count(struct resv_map *resv, long f, long t);
-
-Operations on the reservation map typically involve two operations:
-
-1) region_chg() is called to examine the reserve map and determine how
-   many pages in the specified range [f, t) are NOT currently represented.
-
-   The calling code performs global checks and allocations to determine if
-   there are enough huge pages for the operation to succeed.
-
-2)
-  a) If the operation can succeed, region_add() is called to actually modify
-     the reservation map for the same range [f, t) previously passed to
-     region_chg().
-  b) If the operation can not succeed, region_abort is called for the same
-     range [f, t) to abort the operation.
-
-Note that this is a two step process where region_add() and region_abort()
-are guaranteed to succeed after a prior call to region_chg() for the same
-range.  region_chg() is responsible for pre-allocating any data structures
-necessary to ensure the subsequent operations (specifically region_add()))
-will succeed.
-
-As mentioned above, region_chg() determines the number of pages in the range
-which are NOT currently represented in the map.  This number is returned to
-the caller.  region_add() returns the number of pages in the range added to
-the map.  In most cases, the return value of region_add() is the same as the
-return value of region_chg().  However, in the case of shared mappings it is
-possible for changes to the reservation map to be made between the calls to
-region_chg() and region_add().  In this case, the return value of region_add()
-will not match the return value of region_chg().  It is likely that in such
-cases global counts and subpool accounting will be incorrect and in need of
-adjustment.  It is the responsibility of the caller to check for this condition
-and make the appropriate adjustments.
-
-The routine region_del() is called to remove regions from a reservation map.
-It is typically called in the following situations:
-
-- When a file in the hugetlbfs filesystem is being removed, the inode will
-  be released and the reservation map freed.  Before freeing the reservation
-  map, all the individual file_region structures must be freed.  In this case
-  region_del is passed the range [0, LONG_MAX).
-- When a hugetlbfs file is being truncated.  In this case, all allocated pages
-  after the new file size must be freed.  In addition, any file_region entries
-  in the reservation map past the new end of file must be deleted.  In this
-  case, region_del is passed the range [new_end_of_file, LONG_MAX).
-- When a hole is being punched in a hugetlbfs file.  In this case, huge pages
-  are removed from the middle of the file one at a time.  As the pages are
-  removed, region_del() is called to remove the corresponding entry from the
-  reservation map.  In this case, region_del is passed the range
-  [page_idx, page_idx + 1).
-
-In every case, region_del() will return the number of pages removed from the
-reservation map.  In VERY rare cases, region_del() can fail.  This can only
-happen in the hole punch case where it has to split an existing file_region
-entry and can not allocate a new structure.  In this error case, region_del()
-will return -ENOMEM.  The problem here is that the reservation map will
-indicate that there is a reservation for the page.  However, the subpool and
-global reservation counts will not reflect the reservation.  To handle this
-situation, the routine hugetlb_fix_reserve_counts() is called to adjust the
-counters so that they correspond with the reservation map entry that could
-not be deleted.
-
-region_count() is called when unmapping a private huge page mapping.  In
-private mappings, the lack of a entry in the reservation map indicates that
-a reservation exists.  Therefore, by counting the number of entries in the
-reservation map we know how many reservations were consumed and how many are
-outstanding (outstanding = (end - start) - region_count(resv, start, end)).
-Since the mapping is going away, the subpool and global reservation counts
-are decremented by the number of outstanding reservations.
-
-.. _resv_map_helpers:
-
-Reservation Map Helper Routines
-===============================
-
-Several helper routines exist to query and modify the reservation maps.
-These routines are only interested with reservations for a specific huge
-page, so they just pass in an address instead of a range.  In addition,
-they pass in the associated VMA.  From the VMA, the type of mapping (private
-or shared) and the location of the reservation map (inode or VMA) can be
-determined.  These routines simply call the underlying routines described
-in the section "Reservation Map Modifications".  However, they do take into
-account the 'opposite' meaning of reservation map entries for private and
-shared mappings and hide this detail from the caller::
-
-	long vma_needs_reservation(struct hstate *h,
-				   struct vm_area_struct *vma,
-				   unsigned long addr)
-
-This routine calls region_chg() for the specified page.  If no reservation
-exists, 1 is returned.  If a reservation exists, 0 is returned::
-
-	long vma_commit_reservation(struct hstate *h,
-				    struct vm_area_struct *vma,
-				    unsigned long addr)
-
-This calls region_add() for the specified page.  As in the case of region_chg
-and region_add, this routine is to be called after a previous call to
-vma_needs_reservation.  It will add a reservation entry for the page.  It
-returns 1 if the reservation was added and 0 if not.  The return value should
-be compared with the return value of the previous call to
-vma_needs_reservation.  An unexpected difference indicates the reservation
-map was modified between calls::
-
-	void vma_end_reservation(struct hstate *h,
-				 struct vm_area_struct *vma,
-				 unsigned long addr)
-
-This calls region_abort() for the specified page.  As in the case of region_chg
-and region_abort, this routine is to be called after a previous call to
-vma_needs_reservation.  It will abort/end the in progress reservation add
-operation::
-
-	long vma_add_reservation(struct hstate *h,
-				 struct vm_area_struct *vma,
-				 unsigned long addr)
-
-This is a special wrapper routine to help facilitate reservation cleanup
-on error paths.  It is only called from the routine restore_reserve_on_error().
-This routine is used in conjunction with vma_needs_reservation in an attempt
-to add a reservation to the reservation map.  It takes into account the
-different reservation map semantics for private and shared mappings.  Hence,
-region_add is called for shared mappings (as an entry present in the map
-indicates a reservation), and region_del is called for private mappings (as
-the absence of an entry in the map indicates a reservation).  See the section
-"Reservation cleanup in error paths" for more information on what needs to
-be done on error paths.
-
-
-Reservation Cleanup in Error Paths
-==================================
-
-As mentioned in the section
-:ref:`Reservation Map Helper Routines <resv_map_helpers>`, reservation
-map modifications are performed in two steps.  First vma_needs_reservation
-is called before a page is allocated.  If the allocation is successful,
-then vma_commit_reservation is called.  If not, vma_end_reservation is called.
-Global and subpool reservation counts are adjusted based on success or failure
-of the operation and all is well.
-
-Additionally, after a huge page is instantiated the PagePrivate flag is
-cleared so that accounting when the page is ultimately freed is correct.
-
-However, there are several instances where errors are encountered after a huge
-page is allocated but before it is instantiated.  In this case, the page
-allocation has consumed the reservation and made the appropriate subpool,
-reservation map and global count adjustments.  If the page is freed at this
-time (before instantiation and clearing of PagePrivate), then free_huge_page
-will increment the global reservation count.  However, the reservation map
-indicates the reservation was consumed.  This resulting inconsistent state
-will cause the 'leak' of a reserved huge page.  The global reserve count will
-be  higher than it should and prevent allocation of a pre-allocated page.
-
-The routine restore_reserve_on_error() attempts to handle this situation.  It
-is fairly well documented.  The intention of this routine is to restore
-the reservation map to the way it was before the page allocation.   In this
-way, the state of the reservation map will correspond to the global reservation
-count after the page is freed.
-
-The routine restore_reserve_on_error itself may encounter errors while
-attempting to restore the reservation map entry.  In this case, it will
-simply clear the PagePrivate flag of the page.  In this way, the global
-reserve count will not be incremented when the page is freed.  However, the
-reservation map will continue to look as though the reservation was consumed.
-A page can still be allocated for the address, but it will not use a reserved
-page as originally intended.
-
-There is some code (most notably userfaultfd) which can not call
-restore_reserve_on_error.  In this case, it simply modifies the PagePrivate
-so that a reservation will not be leaked when the huge page is freed.
-
-
-Reservations and Memory Policy
-==============================
-Per-node huge page lists existed in struct hstate when git was first used
-to manage Linux code.  The concept of reservations was added some time later.
-When reservations were added, no attempt was made to take memory policy
-into account.  While cpusets are not exactly the same as memory policy, this
-comment in hugetlb_acct_memory sums up the interaction between reservations
-and cpusets/memory policy::
-
-	/*
-	 * When cpuset is configured, it breaks the strict hugetlb page
-	 * reservation as the accounting is done on a global variable. Such
-	 * reservation is completely rubbish in the presence of cpuset because
-	 * the reservation is not checked against page availability for the
-	 * current cpuset. Application can still potentially OOM'ed by kernel
-	 * with lack of free htlb page in cpuset that the task is in.
-	 * Attempt to enforce strict accounting with cpuset is almost
-	 * impossible (or too ugly) because cpuset is too fluid that
-	 * task or memory node can be dynamically moved between cpusets.
-	 *
-	 * The change of semantics for shared hugetlb mapping with cpuset is
-	 * undesirable. However, in order to preserve some of the semantics,
-	 * we fall back to check against current free page availability as
-	 * a best attempt and hopefully to minimize the impact of changing
-	 * semantics that cpuset has.
-	 */
-
-Huge page reservations were added to prevent unexpected page allocation
-failures (OOM) at page fault time.  However, if an application makes use
-of cpusets or memory policy there is no guarantee that huge pages will be
-available on the required nodes.  This is true even if there are a sufficient
-number of global reservations.
-
-Hugetlbfs regression testing
-============================
-
-The most complete set of hugetlb tests are in the libhugetlbfs repository.
-If you modify any hugetlb related code, use the libhugetlbfs test suite
-to check for regressions.  In addition, if you add any new hugetlb
-functionality, please add appropriate tests to libhugetlbfs.
-
---
-Mike Kravetz, 7 April 2017
diff --git a/Documentation/vm/hwpoison.rst b/Documentation/vm/hwpoison.rst
deleted file mode 100644
index b9d5253c1305..000000000000
--- a/Documentation/vm/hwpoison.rst
+++ /dev/null
@@ -1,184 +0,0 @@
-.. hwpoison:
-
-========
-hwpoison
-========
-
-What is hwpoison?
-=================
-
-Upcoming Intel CPUs have support for recovering from some memory errors
-(``MCA recovery``). This requires the OS to declare a page "poisoned",
-kill the processes associated with it and avoid using it in the future.
-
-This patchkit implements the necessary infrastructure in the VM.
-
-To quote the overview comment::
-
-	High level machine check handler. Handles pages reported by the
-	hardware as being corrupted usually due to a 2bit ECC memory or cache
-	failure.
-
-	This focusses on pages detected as corrupted in the background.
-	When the current CPU tries to consume corruption the currently
-	running process can just be killed directly instead. This implies
-	that if the error cannot be handled for some reason it's safe to
-	just ignore it because no corruption has been consumed yet. Instead
-	when that happens another machine check will happen.
-
-	Handles page cache pages in various states. The tricky part
-	here is that we can access any page asynchronous to other VM
-	users, because memory failures could happen anytime and anywhere,
-	possibly violating some of their assumptions. This is why this code
-	has to be extremely careful. Generally it tries to use normal locking
-	rules, as in get the standard locks, even if that means the
-	error handling takes potentially a long time.
-
-	Some of the operations here are somewhat inefficient and have non
-	linear algorithmic complexity, because the data structures have not
-	been optimized for this case. This is in particular the case
-	for the mapping from a vma to a process. Since this case is expected
-	to be rare we hope we can get away with this.
-
-The code consists of a the high level handler in mm/memory-failure.c,
-a new page poison bit and various checks in the VM to handle poisoned
-pages.
-
-The main target right now is KVM guests, but it works for all kinds
-of applications. KVM support requires a recent qemu-kvm release.
-
-For the KVM use there was need for a new signal type so that
-KVM can inject the machine check into the guest with the proper
-address. This in theory allows other applications to handle
-memory failures too. The expection is that near all applications
-won't do that, but some very specialized ones might.
-
-Failure recovery modes
-======================
-
-There are two (actually three) modes memory failure recovery can be in:
-
-vm.memory_failure_recovery sysctl set to zero:
-	All memory failures cause a panic. Do not attempt recovery.
-
-early kill
-	(can be controlled globally and per process)
-	Send SIGBUS to the application as soon as the error is detected
-	This allows applications who can process memory errors in a gentle
-	way (e.g. drop affected object)
-	This is the mode used by KVM qemu.
-
-late kill
-	Send SIGBUS when the application runs into the corrupted page.
-	This is best for memory error unaware applications and default
-	Note some pages are always handled as late kill.
-
-User control
-============
-
-vm.memory_failure_recovery
-	See sysctl.txt
-
-vm.memory_failure_early_kill
-	Enable early kill mode globally
-
-PR_MCE_KILL
-	Set early/late kill mode/revert to system default
-
-	arg1: PR_MCE_KILL_CLEAR:
-		Revert to system default
-	arg1: PR_MCE_KILL_SET:
-		arg2 defines thread specific mode
-
-		PR_MCE_KILL_EARLY:
-			Early kill
-		PR_MCE_KILL_LATE:
-			Late kill
-		PR_MCE_KILL_DEFAULT
-			Use system global default
-
-	Note that if you want to have a dedicated thread which handles
-	the SIGBUS(BUS_MCEERR_AO) on behalf of the process, you should
-	call prctl(PR_MCE_KILL_EARLY) on the designated thread. Otherwise,
-	the SIGBUS is sent to the main thread.
-
-PR_MCE_KILL_GET
-	return current mode
-
-Testing
-=======
-
-* madvise(MADV_HWPOISON, ....) (as root) - Poison a page in the
-  process for testing
-
-* hwpoison-inject module through debugfs ``/sys/kernel/debug/hwpoison/``
-
-  corrupt-pfn
-	Inject hwpoison fault at PFN echoed into this file. This does
-	some early filtering to avoid corrupted unintended pages in test suites.
-
-  unpoison-pfn
-	Software-unpoison page at PFN echoed into this file. This way
-	a page can be reused again.  This only works for Linux
-	injected failures, not for real memory failures. Once any hardware
-	memory failure happens, this feature is disabled.
-
-  Note these injection interfaces are not stable and might change between
-  kernel versions
-
-  corrupt-filter-dev-major, corrupt-filter-dev-minor
-	Only handle memory failures to pages associated with the file
-	system defined by block device major/minor.  -1U is the
-	wildcard value.  This should be only used for testing with
-	artificial injection.
-
-  corrupt-filter-memcg
-	Limit injection to pages owned by memgroup. Specified by inode
-	number of the memcg.
-
-	Example::
-
-		mkdir /sys/fs/cgroup/mem/hwpoison
-
-	        usemem -m 100 -s 1000 &
-		echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks
-
-		memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ')
-		echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg
-
-		page-types -p `pidof init`   --hwpoison  # shall do nothing
-		page-types -p `pidof usemem` --hwpoison  # poison its pages
-
-  corrupt-filter-flags-mask, corrupt-filter-flags-value
-	When specified, only poison pages if ((page_flags & mask) ==
-	value).  This allows stress testing of many kinds of
-	pages. The page_flags are the same as in /proc/kpageflags. The
-	flag bits are defined in include/linux/kernel-page-flags.h and
-	documented in Documentation/admin-guide/mm/pagemap.rst
-
-* Architecture specific MCE injector
-
-  x86 has mce-inject, mce-test
-
-  Some portable hwpoison test programs in mce-test, see below.
-
-References
-==========
-
-http://halobates.de/mce-lc09-2.pdf
-	Overview presentation from LinuxCon 09
-
-git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git
-	Test suite (hwpoison specific portable tests in tsrc)
-
-git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git
-	x86 specific injector
-
-
-Limitations
-===========
-- Not all page types are supported and never will. Most kernel internal
-  objects cannot be recovered, only LRU pages for now.
-
----
-Andi Kleen, Oct 2009
diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst
deleted file mode 100644
index 575ccd40e30c..000000000000
--- a/Documentation/vm/index.rst
+++ /dev/null
@@ -1,68 +0,0 @@
-=====================================
-Linux Memory Management Documentation
-=====================================
-
-Memory Management Guide
-=======================
-
-This is a guide to understanding the memory management subsystem
-of Linux.  If you are looking for advice on simply allocating memory,
-see the :ref:`memory_allocation`.  For controlling and tuning guides,
-see the :doc:`admin guide <../admin-guide/mm/index>`.
-
-.. toctree::
-   :maxdepth: 1
-
-   physical_memory
-   page_tables
-   process_addrs
-   bootmem
-   page_allocation
-   vmalloc
-   slab
-   highmem
-   page_reclaim
-   swap
-   page_cache
-   shmfs
-   oom
-
-Legacy Documentation
-====================
-
-This is a collection of older documents about the Linux memory management
-(MM) subsystem internals with different level of details ranging from
-notes and mailing list responses for elaborating descriptions of data
-structures and algorithms.  It should all be integrated nicely into the
-above structured documentation, or deleted if it has served its purpose.
-
-.. toctree::
-   :maxdepth: 1
-
-   active_mm
-   arch_pgtable_helpers
-   balance
-   damon/index
-   free_page_reporting
-   frontswap
-   hmm
-   hwpoison
-   hugetlbfs_reserv
-   ksm
-   memory-model
-   mmu_notifier
-   numa
-   overcommit-accounting
-   page_migration
-   page_frags
-   page_owner
-   page_table_check
-   remap_file_pages
-   slub
-   split_page_table_lock
-   transhuge
-   unevictable-lru
-   vmalloced-kernel-stacks
-   vmemmap_dedup
-   z3fold
-   zsmalloc
diff --git a/Documentation/vm/ksm.rst b/Documentation/vm/ksm.rst
deleted file mode 100644
index 9e37add068e6..000000000000
--- a/Documentation/vm/ksm.rst
+++ /dev/null
@@ -1,87 +0,0 @@
-.. _ksm:
-
-=======================
-Kernel Samepage Merging
-=======================
-
-KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y,
-added to the Linux kernel in 2.6.32.  See ``mm/ksm.c`` for its implementation,
-and http://lwn.net/Articles/306704/ and https://lwn.net/Articles/330589/
-
-The userspace interface of KSM is described in :ref:`Documentation/admin-guide/mm/ksm.rst <admin_guide_ksm>`
-
-Design
-======
-
-Overview
---------
-
-.. kernel-doc:: mm/ksm.c
-   :DOC: Overview
-
-Reverse mapping
----------------
-KSM maintains reverse mapping information for KSM pages in the stable
-tree.
-
-If a KSM page is shared between less than ``max_page_sharing`` VMAs,
-the node of the stable tree that represents such KSM page points to a
-list of struct rmap_item and the ``page->mapping`` of the
-KSM page points to the stable tree node.
-
-When the sharing passes this threshold, KSM adds a second dimension to
-the stable tree. The tree node becomes a "chain" that links one or
-more "dups". Each "dup" keeps reverse mapping information for a KSM
-page with ``page->mapping`` pointing to that "dup".
-
-Every "chain" and all "dups" linked into a "chain" enforce the
-invariant that they represent the same write protected memory content,
-even if each "dup" will be pointed by a different KSM page copy of
-that content.
-
-This way the stable tree lookup computational complexity is unaffected
-if compared to an unlimited list of reverse mappings. It is still
-enforced that there cannot be KSM page content duplicates in the
-stable tree itself.
-
-The deduplication limit enforced by ``max_page_sharing`` is required
-to avoid the virtual memory rmap lists to grow too large. The rmap
-walk has O(N) complexity where N is the number of rmap_items
-(i.e. virtual mappings) that are sharing the page, which is in turn
-capped by ``max_page_sharing``. So this effectively spreads the linear
-O(N) computational complexity from rmap walk context over different
-KSM pages. The ksmd walk over the stable_node "chains" is also O(N),
-but N is the number of stable_node "dups", not the number of
-rmap_items, so it has not a significant impact on ksmd performance. In
-practice the best stable_node "dup" candidate will be kept and found
-at the head of the "dups" list.
-
-High values of ``max_page_sharing`` result in faster memory merging
-(because there will be fewer stable_node dups queued into the
-stable_node chain->hlist to check for pruning) and higher
-deduplication factor at the expense of slower worst case for rmap
-walks for any KSM page which can happen during swapping, compaction,
-NUMA balancing and page migration.
-
-The ``stable_node_dups/stable_node_chains`` ratio is also affected by the
-``max_page_sharing`` tunable, and an high ratio may indicate fragmentation
-in the stable_node dups, which could be solved by introducing
-fragmentation algorithms in ksmd which would refile rmap_items from
-one stable_node dup to another stable_node dup, in order to free up
-stable_node "dups" with few rmap_items in them, but that may increase
-the ksmd CPU usage and possibly slowdown the readonly computations on
-the KSM pages of the applications.
-
-The whole list of stable_node "dups" linked in the stable_node
-"chains" is scanned periodically in order to prune stale stable_nodes.
-The frequency of such scans is defined by
-``stable_node_chains_prune_millisecs`` sysfs tunable.
-
-Reference
----------
-.. kernel-doc:: mm/ksm.c
-   :functions: mm_slot ksm_scan stable_node rmap_item
-
---
-Izik Eidus,
-Hugh Dickins, 17 Nov 2009
diff --git a/Documentation/vm/memory-model.rst b/Documentation/vm/memory-model.rst
deleted file mode 100644
index 30e8fbed6914..000000000000
--- a/Documentation/vm/memory-model.rst
+++ /dev/null
@@ -1,177 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-.. _physical_memory_model:
-
-=====================
-Physical Memory Model
-=====================
-
-Physical memory in a system may be addressed in different ways. The
-simplest case is when the physical memory starts at address 0 and
-spans a contiguous range up to the maximal address. It could be,
-however, that this range contains small holes that are not accessible
-for the CPU. Then there could be several contiguous ranges at
-completely distinct addresses. And, don't forget about NUMA, where
-different memory banks are attached to different CPUs.
-
-Linux abstracts this diversity using one of the two memory models:
-FLATMEM and SPARSEMEM. Each architecture defines what
-memory models it supports, what the default memory model is and
-whether it is possible to manually override that default.
-
-All the memory models track the status of physical page frames using
-struct page arranged in one or more arrays.
-
-Regardless of the selected memory model, there exists one-to-one
-mapping between the physical page frame number (PFN) and the
-corresponding `struct page`.
-
-Each memory model defines :c:func:`pfn_to_page` and :c:func:`page_to_pfn`
-helpers that allow the conversion from PFN to `struct page` and vice
-versa.
-
-FLATMEM
-=======
-
-The simplest memory model is FLATMEM. This model is suitable for
-non-NUMA systems with contiguous, or mostly contiguous, physical
-memory.
-
-In the FLATMEM memory model, there is a global `mem_map` array that
-maps the entire physical memory. For most architectures, the holes
-have entries in the `mem_map` array. The `struct page` objects
-corresponding to the holes are never fully initialized.
-
-To allocate the `mem_map` array, architecture specific setup code should
-call :c:func:`free_area_init` function. Yet, the mappings array is not
-usable until the call to :c:func:`memblock_free_all` that hands all the
-memory to the page allocator.
-
-An architecture may free parts of the `mem_map` array that do not cover the
-actual physical pages. In such case, the architecture specific
-:c:func:`pfn_valid` implementation should take the holes in the
-`mem_map` into account.
-
-With FLATMEM, the conversion between a PFN and the `struct page` is
-straightforward: `PFN - ARCH_PFN_OFFSET` is an index to the
-`mem_map` array.
-
-The `ARCH_PFN_OFFSET` defines the first page frame number for
-systems with physical memory starting at address different from 0.
-
-SPARSEMEM
-=========
-
-SPARSEMEM is the most versatile memory model available in Linux and it
-is the only memory model that supports several advanced features such
-as hot-plug and hot-remove of the physical memory, alternative memory
-maps for non-volatile memory devices and deferred initialization of
-the memory map for larger systems.
-
-The SPARSEMEM model presents the physical memory as a collection of
-sections. A section is represented with struct mem_section
-that contains `section_mem_map` that is, logically, a pointer to an
-array of struct pages. However, it is stored with some other magic
-that aids the sections management. The section size and maximal number
-of section is specified using `SECTION_SIZE_BITS` and
-`MAX_PHYSMEM_BITS` constants defined by each architecture that
-supports SPARSEMEM. While `MAX_PHYSMEM_BITS` is an actual width of a
-physical address that an architecture supports, the
-`SECTION_SIZE_BITS` is an arbitrary value.
-
-The maximal number of sections is denoted `NR_MEM_SECTIONS` and
-defined as
-
-.. math::
-
-   NR\_MEM\_SECTIONS = 2 ^ {(MAX\_PHYSMEM\_BITS - SECTION\_SIZE\_BITS)}
-
-The `mem_section` objects are arranged in a two-dimensional array
-called `mem_sections`. The size and placement of this array depend
-on `CONFIG_SPARSEMEM_EXTREME` and the maximal possible number of
-sections:
-
-* When `CONFIG_SPARSEMEM_EXTREME` is disabled, the `mem_sections`
-  array is static and has `NR_MEM_SECTIONS` rows. Each row holds a
-  single `mem_section` object.
-* When `CONFIG_SPARSEMEM_EXTREME` is enabled, the `mem_sections`
-  array is dynamically allocated. Each row contains PAGE_SIZE worth of
-  `mem_section` objects and the number of rows is calculated to fit
-  all the memory sections.
-
-The architecture setup code should call sparse_init() to
-initialize the memory sections and the memory maps.
-
-With SPARSEMEM there are two possible ways to convert a PFN to the
-corresponding `struct page` - a "classic sparse" and "sparse
-vmemmap". The selection is made at build time and it is determined by
-the value of `CONFIG_SPARSEMEM_VMEMMAP`.
-
-The classic sparse encodes the section number of a page in page->flags
-and uses high bits of a PFN to access the section that maps that page
-frame. Inside a section, the PFN is the index to the array of pages.
-
-The sparse vmemmap uses a virtually mapped memory map to optimize
-pfn_to_page and page_to_pfn operations. There is a global `struct
-page *vmemmap` pointer that points to a virtually contiguous array of
-`struct page` objects. A PFN is an index to that array and the
-offset of the `struct page` from `vmemmap` is the PFN of that
-page.
-
-To use vmemmap, an architecture has to reserve a range of virtual
-addresses that will map the physical pages containing the memory
-map and make sure that `vmemmap` points to that range. In addition,
-the architecture should implement :c:func:`vmemmap_populate` method
-that will allocate the physical memory and create page tables for the
-virtual memory map. If an architecture does not have any special
-requirements for the vmemmap mappings, it can use default
-:c:func:`vmemmap_populate_basepages` provided by the generic memory
-management.
-
-The virtually mapped memory map allows storing `struct page` objects
-for persistent memory devices in pre-allocated storage on those
-devices. This storage is represented with struct vmem_altmap
-that is eventually passed to vmemmap_populate() through a long chain
-of function calls. The vmemmap_populate() implementation may use the
-`vmem_altmap` along with :c:func:`vmemmap_alloc_block_buf` helper to
-allocate memory map on the persistent memory device.
-
-ZONE_DEVICE
-===========
-The `ZONE_DEVICE` facility builds upon `SPARSEMEM_VMEMMAP` to offer
-`struct page` `mem_map` services for device driver identified physical
-address ranges. The "device" aspect of `ZONE_DEVICE` relates to the fact
-that the page objects for these address ranges are never marked online,
-and that a reference must be taken against the device, not just the page
-to keep the memory pinned for active use. `ZONE_DEVICE`, via
-:c:func:`devm_memremap_pages`, performs just enough memory hotplug to
-turn on :c:func:`pfn_to_page`, :c:func:`page_to_pfn`, and
-:c:func:`get_user_pages` service for the given range of pfns. Since the
-page reference count never drops below 1 the page is never tracked as
-free memory and the page's `struct list_head lru` space is repurposed
-for back referencing to the host device / driver that mapped the memory.
-
-While `SPARSEMEM` presents memory as a collection of sections,
-optionally collected into memory blocks, `ZONE_DEVICE` users have a need
-for smaller granularity of populating the `mem_map`. Given that
-`ZONE_DEVICE` memory is never marked online it is subsequently never
-subject to its memory ranges being exposed through the sysfs memory
-hotplug api on memory block boundaries. The implementation relies on
-this lack of user-api constraint to allow sub-section sized memory
-ranges to be specified to :c:func:`arch_add_memory`, the top-half of
-memory hotplug. Sub-section support allows for 2MB as the cross-arch
-common alignment granularity for :c:func:`devm_memremap_pages`.
-
-The users of `ZONE_DEVICE` are:
-
-* pmem: Map platform persistent memory to be used as a direct-I/O target
-  via DAX mappings.
-
-* hmm: Extend `ZONE_DEVICE` with `->page_fault()` and `->page_free()`
-  event callbacks to allow a device-driver to coordinate memory management
-  events related to device-memory, typically GPU memory. See
-  Documentation/vm/hmm.rst.
-
-* p2pdma: Create `struct page` objects to allow peer devices in a
-  PCI/-E topology to coordinate direct-DMA operations between themselves,
-  i.e. bypass host memory.
diff --git a/Documentation/vm/mmu_notifier.rst b/Documentation/vm/mmu_notifier.rst
deleted file mode 100644
index df5d7777fc6b..000000000000
--- a/Documentation/vm/mmu_notifier.rst
+++ /dev/null
@@ -1,99 +0,0 @@
-.. _mmu_notifier:
-
-When do you need to notify inside page table lock ?
-===================================================
-
-When clearing a pte/pmd we are given a choice to notify the event through
-(notify version of \*_clear_flush call mmu_notifier_invalidate_range) under
-the page table lock. But that notification is not necessary in all cases.
-
-For secondary TLB (non CPU TLB) like IOMMU TLB or device TLB (when device use
-thing like ATS/PASID to get the IOMMU to walk the CPU page table to access a
-process virtual address space). There is only 2 cases when you need to notify
-those secondary TLB while holding page table lock when clearing a pte/pmd:
-
-  A) page backing address is free before mmu_notifier_invalidate_range_end()
-  B) a page table entry is updated to point to a new page (COW, write fault
-     on zero page, __replace_page(), ...)
-
-Case A is obvious you do not want to take the risk for the device to write to
-a page that might now be used by some completely different task.
-
-Case B is more subtle. For correctness it requires the following sequence to
-happen:
-
-  - take page table lock
-  - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify())
-  - set page table entry to point to new page
-
-If clearing the page table entry is not followed by a notify before setting
-the new pte/pmd value then you can break memory model like C11 or C++11 for
-the device.
-
-Consider the following scenario (device use a feature similar to ATS/PASID):
-
-Two address addrA and addrB such that \|addrA - addrB\| >= PAGE_SIZE we assume
-they are write protected for COW (other case of B apply too).
-
-::
-
- [Time N] --------------------------------------------------------------------
- CPU-thread-0  {try to write to addrA}
- CPU-thread-1  {try to write to addrB}
- CPU-thread-2  {}
- CPU-thread-3  {}
- DEV-thread-0  {read addrA and populate device TLB}
- DEV-thread-2  {read addrB and populate device TLB}
- [Time N+1] ------------------------------------------------------------------
- CPU-thread-0  {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}}
- CPU-thread-1  {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}}
- CPU-thread-2  {}
- CPU-thread-3  {}
- DEV-thread-0  {}
- DEV-thread-2  {}
- [Time N+2] ------------------------------------------------------------------
- CPU-thread-0  {COW_step1: {update page table to point to new page for addrA}}
- CPU-thread-1  {COW_step1: {update page table to point to new page for addrB}}
- CPU-thread-2  {}
- CPU-thread-3  {}
- DEV-thread-0  {}
- DEV-thread-2  {}
- [Time N+3] ------------------------------------------------------------------
- CPU-thread-0  {preempted}
- CPU-thread-1  {preempted}
- CPU-thread-2  {write to addrA which is a write to new page}
- CPU-thread-3  {}
- DEV-thread-0  {}
- DEV-thread-2  {}
- [Time N+3] ------------------------------------------------------------------
- CPU-thread-0  {preempted}
- CPU-thread-1  {preempted}
- CPU-thread-2  {}
- CPU-thread-3  {write to addrB which is a write to new page}
- DEV-thread-0  {}
- DEV-thread-2  {}
- [Time N+4] ------------------------------------------------------------------
- CPU-thread-0  {preempted}
- CPU-thread-1  {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}}
- CPU-thread-2  {}
- CPU-thread-3  {}
- DEV-thread-0  {}
- DEV-thread-2  {}
- [Time N+5] ------------------------------------------------------------------
- CPU-thread-0  {preempted}
- CPU-thread-1  {}
- CPU-thread-2  {}
- CPU-thread-3  {}
- DEV-thread-0  {read addrA from old page}
- DEV-thread-2  {read addrB from new page}
-
-So here because at time N+2 the clear page table entry was not pair with a
-notification to invalidate the secondary TLB, the device see the new value for
-addrB before seeing the new value for addrA. This break total memory ordering
-for the device.
-
-When changing a pte to write protect or to point to a new write protected page
-with same content (KSM) it is fine to delay the mmu_notifier_invalidate_range
-call to mmu_notifier_invalidate_range_end() outside the page table lock. This
-is true even if the thread doing the page table update is preempted right after
-releasing page table lock but before call mmu_notifier_invalidate_range_end().
diff --git a/Documentation/vm/numa.rst b/Documentation/vm/numa.rst
deleted file mode 100644
index 99fdeca917ca..000000000000
--- a/Documentation/vm/numa.rst
+++ /dev/null
@@ -1,150 +0,0 @@
-.. _numa:
-
-Started Nov 1999 by Kanoj Sarcar <kanoj@sgi.com>
-
-=============
-What is NUMA?
-=============
-
-This question can be answered from a couple of perspectives:  the
-hardware view and the Linux software view.
-
-From the hardware perspective, a NUMA system is a computer platform that
-comprises multiple components or assemblies each of which may contain 0
-or more CPUs, local memory, and/or IO buses.  For brevity and to
-disambiguate the hardware view of these physical components/assemblies
-from the software abstraction thereof, we'll call the components/assemblies
-'cells' in this document.
-
-Each of the 'cells' may be viewed as an SMP [symmetric multi-processor] subset
-of the system--although some components necessary for a stand-alone SMP system
-may not be populated on any given cell.   The cells of the NUMA system are
-connected together with some sort of system interconnect--e.g., a crossbar or
-point-to-point link are common types of NUMA system interconnects.  Both of
-these types of interconnects can be aggregated to create NUMA platforms with
-cells at multiple distances from other cells.
-
-For Linux, the NUMA platforms of interest are primarily what is known as Cache
-Coherent NUMA or ccNUMA systems.   With ccNUMA systems, all memory is visible
-to and accessible from any CPU attached to any cell and cache coherency
-is handled in hardware by the processor caches and/or the system interconnect.
-
-Memory access time and effective memory bandwidth varies depending on how far
-away the cell containing the CPU or IO bus making the memory access is from the
-cell containing the target memory.  For example, access to memory by CPUs
-attached to the same cell will experience faster access times and higher
-bandwidths than accesses to memory on other, remote cells.  NUMA platforms
-can have cells at multiple remote distances from any given cell.
-
-Platform vendors don't build NUMA systems just to make software developers'
-lives interesting.  Rather, this architecture is a means to provide scalable
-memory bandwidth.  However, to achieve scalable memory bandwidth, system and
-application software must arrange for a large majority of the memory references
-[cache misses] to be to "local" memory--memory on the same cell, if any--or
-to the closest cell with memory.
-
-This leads to the Linux software view of a NUMA system:
-
-Linux divides the system's hardware resources into multiple software
-abstractions called "nodes".  Linux maps the nodes onto the physical cells
-of the hardware platform, abstracting away some of the details for some
-architectures.  As with physical cells, software nodes may contain 0 or more
-CPUs, memory and/or IO buses.  And, again, memory accesses to memory on
-"closer" nodes--nodes that map to closer cells--will generally experience
-faster access times and higher effective bandwidth than accesses to more
-remote cells.
-
-For some architectures, such as x86, Linux will "hide" any node representing a
-physical cell that has no memory attached, and reassign any CPUs attached to
-that cell to a node representing a cell that does have memory.  Thus, on
-these architectures, one cannot assume that all CPUs that Linux associates with
-a given node will see the same local memory access times and bandwidth.
-
-In addition, for some architectures, again x86 is an example, Linux supports
-the emulation of additional nodes.  For NUMA emulation, linux will carve up
-the existing nodes--or the system memory for non-NUMA platforms--into multiple
-nodes.  Each emulated node will manage a fraction of the underlying cells'
-physical memory.  NUMA emluation is useful for testing NUMA kernel and
-application features on non-NUMA platforms, and as a sort of memory resource
-management mechanism when used together with cpusets.
-[see Documentation/admin-guide/cgroup-v1/cpusets.rst]
-
-For each node with memory, Linux constructs an independent memory management
-subsystem, complete with its own free page lists, in-use page lists, usage
-statistics and locks to mediate access.  In addition, Linux constructs for
-each memory zone [one or more of DMA, DMA32, NORMAL, HIGH_MEMORY, MOVABLE],
-an ordered "zonelist".  A zonelist specifies the zones/nodes to visit when a
-selected zone/node cannot satisfy the allocation request.  This situation,
-when a zone has no available memory to satisfy a request, is called
-"overflow" or "fallback".
-
-Because some nodes contain multiple zones containing different types of
-memory, Linux must decide whether to order the zonelists such that allocations
-fall back to the same zone type on a different node, or to a different zone
-type on the same node.  This is an important consideration because some zones,
-such as DMA or DMA32, represent relatively scarce resources.  Linux chooses
-a default Node ordered zonelist. This means it tries to fallback to other zones
-from the same node before using remote nodes which are ordered by NUMA distance.
-
-By default, Linux will attempt to satisfy memory allocation requests from the
-node to which the CPU that executes the request is assigned.  Specifically,
-Linux will attempt to allocate from the first node in the appropriate zonelist
-for the node where the request originates.  This is called "local allocation."
-If the "local" node cannot satisfy the request, the kernel will examine other
-nodes' zones in the selected zonelist looking for the first zone in the list
-that can satisfy the request.
-
-Local allocation will tend to keep subsequent access to the allocated memory
-"local" to the underlying physical resources and off the system interconnect--
-as long as the task on whose behalf the kernel allocated some memory does not
-later migrate away from that memory.  The Linux scheduler is aware of the
-NUMA topology of the platform--embodied in the "scheduling domains" data
-structures [see Documentation/scheduler/sched-domains.rst]--and the scheduler
-attempts to minimize task migration to distant scheduling domains.  However,
-the scheduler does not take a task's NUMA footprint into account directly.
-Thus, under sufficient imbalance, tasks can migrate between nodes, remote
-from their initial node and kernel data structures.
-
-System administrators and application designers can restrict a task's migration
-to improve NUMA locality using various CPU affinity command line interfaces,
-such as taskset(1) and numactl(1), and program interfaces such as
-sched_setaffinity(2).  Further, one can modify the kernel's default local
-allocation behavior using Linux NUMA memory policy. [see
-:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`].
-
-System administrators can restrict the CPUs and nodes' memories that a non-
-privileged user can specify in the scheduling or NUMA commands and functions
-using control groups and CPUsets.  [see Documentation/admin-guide/cgroup-v1/cpusets.rst]
-
-On architectures that do not hide memoryless nodes, Linux will include only
-zones [nodes] with memory in the zonelists.  This means that for a memoryless
-node the "local memory node"--the node of the first zone in CPU's node's
-zonelist--will not be the node itself.  Rather, it will be the node that the
-kernel selected as the nearest node with memory when it built the zonelists.
-So, default, local allocations will succeed with the kernel supplying the
-closest available memory.  This is a consequence of the same mechanism that
-allows such allocations to fallback to other nearby nodes when a node that
-does contain memory overflows.
-
-Some kernel allocations do not want or cannot tolerate this allocation fallback
-behavior.  Rather they want to be sure they get memory from the specified node
-or get notified that the node has no free memory.  This is usually the case when
-a subsystem allocates per CPU memory resources, for example.
-
-A typical model for making such an allocation is to obtain the node id of the
-node to which the "current CPU" is attached using one of the kernel's
-numa_node_id() or CPU_to_node() functions and then request memory from only
-the node id returned.  When such an allocation fails, the requesting subsystem
-may revert to its own fallback path.  The slab kernel memory allocator is an
-example of this.  Or, the subsystem may choose to disable or not to enable
-itself on allocation failure.  The kernel profiling subsystem is an example of
-this.
-
-If the architecture supports--does not hide--memoryless nodes, then CPUs
-attached to memoryless nodes would always incur the fallback path overhead
-or some subsystems would fail to initialize if they attempted to allocated
-memory exclusively from a node without memory.  To support such
-architectures transparently, kernel subsystems can use the numa_mem_id()
-or cpu_to_mem() function to locate the "local memory node" for the calling or
-specified CPU.  Again, this is the same node from which default, local page
-allocations will be attempted.
diff --git a/Documentation/vm/oom.rst b/Documentation/vm/oom.rst
deleted file mode 100644
index 18e9e40c1ec1..000000000000
--- a/Documentation/vm/oom.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-======================
-Out Of Memory Handling
-======================
diff --git a/Documentation/vm/overcommit-accounting.rst b/Documentation/vm/overcommit-accounting.rst
deleted file mode 100644
index 1addb0c374a4..000000000000
--- a/Documentation/vm/overcommit-accounting.rst
+++ /dev/null
@@ -1,88 +0,0 @@
-.. _overcommit_accounting:
-
-=====================
-Overcommit Accounting
-=====================
-
-The Linux kernel supports the following overcommit handling modes
-
-0
-	Heuristic overcommit handling. Obvious overcommits of address
-	space are refused. Used for a typical system. It ensures a
-	seriously wild allocation fails while allowing overcommit to
-	reduce swap usage.  root is allowed to allocate slightly more
-	memory in this mode. This is the default.
-
-1
-	Always overcommit. Appropriate for some scientific
-	applications. Classic example is code using sparse arrays and
-	just relying on the virtual memory consisting almost entirely
-	of zero pages.
-
-2
-	Don't overcommit. The total address space commit for the
-	system is not permitted to exceed swap + a configurable amount
-	(default is 50%) of physical RAM.  Depending on the amount you
-	use, in most situations this means a process will not be
-	killed while accessing pages but will receive errors on memory
-	allocation as appropriate.
-
-	Useful for applications that want to guarantee their memory
-	allocations will be available in the future without having to
-	initialize every page.
-
-The overcommit policy is set via the sysctl ``vm.overcommit_memory``.
-
-The overcommit amount can be set via ``vm.overcommit_ratio`` (percentage)
-or ``vm.overcommit_kbytes`` (absolute value). These only have an effect
-when ``vm.overcommit_memory`` is set to 2.
-
-The current overcommit limit and amount committed are viewable in
-``/proc/meminfo`` as CommitLimit and Committed_AS respectively.
-
-Gotchas
-=======
-
-The C language stack growth does an implicit mremap. If you want absolute
-guarantees and run close to the edge you MUST mmap your stack for the
-largest size you think you will need. For typical stack usage this does
-not matter much but it's a corner case if you really really care
-
-In mode 2 the MAP_NORESERVE flag is ignored.
-
-
-How It Works
-============
-
-The overcommit is based on the following rules
-
-For a file backed map
-	| SHARED or READ-only	-	0 cost (the file is the map not swap)
-	| PRIVATE WRITABLE	-	size of mapping per instance
-
-For an anonymous or ``/dev/zero`` map
-	| SHARED			-	size of mapping
-	| PRIVATE READ-only	-	0 cost (but of little use)
-	| PRIVATE WRITABLE	-	size of mapping per instance
-
-Additional accounting
-	| Pages made writable copies by mmap
-	| shmfs memory drawn from the same pool
-
-Status
-======
-
-*	We account mmap memory mappings
-*	We account mprotect changes in commit
-*	We account mremap changes in size
-*	We account brk
-*	We account munmap
-*	We report the commit status in /proc
-*	Account and check on fork
-*	Review stack handling/building on exec
-*	SHMfs accounting
-*	Implement actual limit enforcement
-
-To Do
-=====
-*	Account ptrace pages (this is hard)
diff --git a/Documentation/vm/page_allocation.rst b/Documentation/vm/page_allocation.rst
deleted file mode 100644
index d9b4495561f1..000000000000
--- a/Documentation/vm/page_allocation.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-===============
-Page Allocation
-===============
diff --git a/Documentation/vm/page_cache.rst b/Documentation/vm/page_cache.rst
deleted file mode 100644
index 75eba7c431b2..000000000000
--- a/Documentation/vm/page_cache.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-==========
-Page Cache
-==========
diff --git a/Documentation/vm/page_frags.rst b/Documentation/vm/page_frags.rst
deleted file mode 100644
index 7d6f9385d129..000000000000
--- a/Documentation/vm/page_frags.rst
+++ /dev/null
@@ -1,45 +0,0 @@
-.. _page_frags:
-
-==============
-Page fragments
-==============
-
-A page fragment is an arbitrary-length arbitrary-offset area of memory
-which resides within a 0 or higher order compound page.  Multiple
-fragments within that page are individually refcounted, in the page's
-reference counter.
-
-The page_frag functions, page_frag_alloc and page_frag_free, provide a
-simple allocation framework for page fragments.  This is used by the
-network stack and network device drivers to provide a backing region of
-memory for use as either an sk_buff->head, or to be used in the "frags"
-portion of skb_shared_info.
-
-In order to make use of the page fragment APIs a backing page fragment
-cache is needed.  This provides a central point for the fragment allocation
-and tracks allows multiple calls to make use of a cached page.  The
-advantage to doing this is that multiple calls to get_page can be avoided
-which can be expensive at allocation time.  However due to the nature of
-this caching it is required that any calls to the cache be protected by
-either a per-cpu limitation, or a per-cpu limitation and forcing interrupts
-to be disabled when executing the fragment allocation.
-
-The network stack uses two separate caches per CPU to handle fragment
-allocation.  The netdev_alloc_cache is used by callers making use of the
-netdev_alloc_frag and __netdev_alloc_skb calls.  The napi_alloc_cache is
-used by callers of the __napi_alloc_frag and __napi_alloc_skb calls.  The
-main difference between these two calls is the context in which they may be
-called.  The "netdev" prefixed functions are usable in any context as these
-functions will disable interrupts, while the "napi" prefixed functions are
-only usable within the softirq context.
-
-Many network device drivers use a similar methodology for allocating page
-fragments, but the page fragments are cached at the ring or descriptor
-level.  In order to enable these cases it is necessary to provide a generic
-way of tearing down a page cache.  For this reason __page_frag_cache_drain
-was implemented.  It allows for freeing multiple references from a single
-page via a single call.  The advantage to doing this is that it allows for
-cleaning up the multiple references that were added to a page in order to
-avoid calling get_page per allocation.
-
-Alexander Duyck, Nov 29, 2016.
diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst
deleted file mode 100644
index 8c5cb8147e55..000000000000
--- a/Documentation/vm/page_migration.rst
+++ /dev/null
@@ -1,288 +0,0 @@
-.. _page_migration:
-
-==============
-Page migration
-==============
-
-Page migration allows moving the physical location of pages between
-nodes in a NUMA system while the process is running. This means that the
-virtual addresses that the process sees do not change. However, the
-system rearranges the physical location of those pages.
-
-Also see :ref:`Heterogeneous Memory Management (HMM) <hmm>`
-for migrating pages to or from device private memory.
-
-The main intent of page migration is to reduce the latency of memory accesses
-by moving pages near to the processor where the process accessing that memory
-is running.
-
-Page migration allows a process to manually relocate the node on which its
-pages are located through the MF_MOVE and MF_MOVE_ALL options while setting
-a new memory policy via mbind(). The pages of a process can also be relocated
-from another process using the sys_migrate_pages() function call. The
-migrate_pages() function call takes two sets of nodes and moves pages of a
-process that are located on the from nodes to the destination nodes.
-Page migration functions are provided by the numactl package by Andi Kleen
-(a version later than 0.9.3 is required. Get it from
-https://github.com/numactl/numactl.git). numactl provides libnuma
-which provides an interface similar to other NUMA functionality for page
-migration.  cat ``/proc/<pid>/numa_maps`` allows an easy review of where the
-pages of a process are located. See also the numa_maps documentation in the
-proc(5) man page.
-
-Manual migration is useful if for example the scheduler has relocated
-a process to a processor on a distant node. A batch scheduler or an
-administrator may detect the situation and move the pages of the process
-nearer to the new processor. The kernel itself only provides
-manual page migration support. Automatic page migration may be implemented
-through user space processes that move pages. A special function call
-"move_pages" allows the moving of individual pages within a process.
-For example, A NUMA profiler may obtain a log showing frequent off-node
-accesses and may use the result to move pages to more advantageous
-locations.
-
-Larger installations usually partition the system using cpusets into
-sections of nodes. Paul Jackson has equipped cpusets with the ability to
-move pages when a task is moved to another cpuset (See
-:ref:`CPUSETS <cpusets>`).
-Cpusets allow the automation of process locality. If a task is moved to
-a new cpuset then also all its pages are moved with it so that the
-performance of the process does not sink dramatically. Also the pages
-of processes in a cpuset are moved if the allowed memory nodes of a
-cpuset are changed.
-
-Page migration allows the preservation of the relative location of pages
-within a group of nodes for all migration techniques which will preserve a
-particular memory allocation pattern generated even after migrating a
-process. This is necessary in order to preserve the memory latencies.
-Processes will run with similar performance after migration.
-
-Page migration occurs in several steps. First a high level
-description for those trying to use migrate_pages() from the kernel
-(for userspace usage see the Andi Kleen's numactl package mentioned above)
-and then a low level description of how the low level details work.
-
-In kernel use of migrate_pages()
-================================
-
-1. Remove pages from the LRU.
-
-   Lists of pages to be migrated are generated by scanning over
-   pages and moving them into lists. This is done by
-   calling isolate_lru_page().
-   Calling isolate_lru_page() increases the references to the page
-   so that it cannot vanish while the page migration occurs.
-   It also prevents the swapper or other scans from encountering
-   the page.
-
-2. We need to have a function of type new_page_t that can be
-   passed to migrate_pages(). This function should figure out
-   how to allocate the correct new page given the old page.
-
-3. The migrate_pages() function is called which attempts
-   to do the migration. It will call the function to allocate
-   the new page for each page that is considered for
-   moving.
-
-How migrate_pages() works
-=========================
-
-migrate_pages() does several passes over its list of pages. A page is moved
-if all references to a page are removable at the time. The page has
-already been removed from the LRU via isolate_lru_page() and the refcount
-is increased so that the page cannot be freed while page migration occurs.
-
-Steps:
-
-1. Lock the page to be migrated.
-
-2. Ensure that writeback is complete.
-
-3. Lock the new page that we want to move to. It is locked so that accesses to
-   this (not yet up-to-date) page immediately block while the move is in progress.
-
-4. All the page table references to the page are converted to migration
-   entries. This decreases the mapcount of a page. If the resulting
-   mapcount is not zero then we do not migrate the page. All user space
-   processes that attempt to access the page will now wait on the page lock
-   or wait for the migration page table entry to be removed.
-
-5. The i_pages lock is taken. This will cause all processes trying
-   to access the page via the mapping to block on the spinlock.
-
-6. The refcount of the page is examined and we back out if references remain.
-   Otherwise, we know that we are the only one referencing this page.
-
-7. The radix tree is checked and if it does not contain the pointer to this
-   page then we back out because someone else modified the radix tree.
-
-8. The new page is prepped with some settings from the old page so that
-   accesses to the new page will discover a page with the correct settings.
-
-9. The radix tree is changed to point to the new page.
-
-10. The reference count of the old page is dropped because the address space
-    reference is gone. A reference to the new page is established because
-    the new page is referenced by the address space.
-
-11. The i_pages lock is dropped. With that lookups in the mapping
-    become possible again. Processes will move from spinning on the lock
-    to sleeping on the locked new page.
-
-12. The page contents are copied to the new page.
-
-13. The remaining page flags are copied to the new page.
-
-14. The old page flags are cleared to indicate that the page does
-    not provide any information anymore.
-
-15. Queued up writeback on the new page is triggered.
-
-16. If migration entries were inserted into the page table, then replace them
-    with real ptes. Doing so will enable access for user space processes not
-    already waiting for the page lock.
-
-17. The page locks are dropped from the old and new page.
-    Processes waiting on the page lock will redo their page faults
-    and will reach the new page.
-
-18. The new page is moved to the LRU and can be scanned by the swapper,
-    etc. again.
-
-Non-LRU page migration
-======================
-
-Although migration originally aimed for reducing the latency of memory accesses
-for NUMA, compaction also uses migration to create high-order pages.
-
-Current problem of the implementation is that it is designed to migrate only
-*LRU* pages. However, there are potential non-LRU pages which can be migrated
-in drivers, for example, zsmalloc, virtio-balloon pages.
-
-For virtio-balloon pages, some parts of migration code path have been hooked
-up and added virtio-balloon specific functions to intercept migration logics.
-It's too specific to a driver so other drivers who want to make their pages
-movable would have to add their own specific hooks in the migration path.
-
-To overcome the problem, VM supports non-LRU page migration which provides
-generic functions for non-LRU movable pages without driver specific hooks
-in the migration path.
-
-If a driver wants to make its pages movable, it should define three functions
-which are function pointers of struct address_space_operations.
-
-1. ``bool (*isolate_page) (struct page *page, isolate_mode_t mode);``
-
-   What VM expects from isolate_page() function of driver is to return *true*
-   if driver isolates the page successfully. On returning true, VM marks the page
-   as PG_isolated so concurrent isolation in several CPUs skip the page
-   for isolation. If a driver cannot isolate the page, it should return *false*.
-
-   Once page is successfully isolated, VM uses page.lru fields so driver
-   shouldn't expect to preserve values in those fields.
-
-2. ``int (*migratepage) (struct address_space *mapping,``
-|	``struct page *newpage, struct page *oldpage, enum migrate_mode);``
-
-   After isolation, VM calls migratepage() of driver with the isolated page.
-   The function of migratepage() is to move the contents of the old page to the
-   new page
-   and set up fields of struct page newpage. Keep in mind that you should
-   indicate to the VM the oldpage is no longer movable via __ClearPageMovable()
-   under page_lock if you migrated the oldpage successfully and returned
-   MIGRATEPAGE_SUCCESS. If driver cannot migrate the page at the moment, driver
-   can return -EAGAIN. On -EAGAIN, VM will retry page migration in a short time
-   because VM interprets -EAGAIN as "temporary migration failure". On returning
-   any error except -EAGAIN, VM will give up the page migration without
-   retrying.
-
-   Driver shouldn't touch the page.lru field while in the migratepage() function.
-
-3. ``void (*putback_page)(struct page *);``
-
-   If migration fails on the isolated page, VM should return the isolated page
-   to the driver so VM calls the driver's putback_page() with the isolated page.
-   In this function, the driver should put the isolated page back into its own data
-   structure.
-
-Non-LRU movable page flags
-
-   There are two page flags for supporting non-LRU movable page.
-
-   * PG_movable
-
-     Driver should use the function below to make page movable under page_lock::
-
-	void __SetPageMovable(struct page *page, struct address_space *mapping)
-
-     It needs argument of address_space for registering migration
-     family functions which will be called by VM. Exactly speaking,
-     PG_movable is not a real flag of struct page. Rather, VM
-     reuses the page->mapping's lower bits to represent it::
-
-	#define PAGE_MAPPING_MOVABLE 0x2
-	page->mapping = page->mapping | PAGE_MAPPING_MOVABLE;
-
-     so driver shouldn't access page->mapping directly. Instead, driver should
-     use page_mapping() which masks off the low two bits of page->mapping under
-     page lock so it can get the right struct address_space.
-
-     For testing of non-LRU movable pages, VM supports __PageMovable() function.
-     However, it doesn't guarantee to identify non-LRU movable pages because
-     the page->mapping field is unified with other variables in struct page.
-     If the driver releases the page after isolation by VM, page->mapping
-     doesn't have a stable value although it has PAGE_MAPPING_MOVABLE set
-     (look at __ClearPageMovable). But __PageMovable() is cheap to call whether
-     page is LRU or non-LRU movable once the page has been isolated because LRU
-     pages can never have PAGE_MAPPING_MOVABLE set in page->mapping. It is also
-     good for just peeking to test non-LRU movable pages before more expensive
-     checking with lock_page() in pfn scanning to select a victim.
-
-     For guaranteeing non-LRU movable page, VM provides PageMovable() function.
-     Unlike __PageMovable(), PageMovable() validates page->mapping and
-     mapping->a_ops->isolate_page under lock_page(). The lock_page() prevents
-     sudden destroying of page->mapping.
-
-     Drivers using __SetPageMovable() should clear the flag via
-     __ClearMovablePage() under page_lock() before the releasing the page.
-
-   * PG_isolated
-
-     To prevent concurrent isolation among several CPUs, VM marks isolated page
-     as PG_isolated under lock_page(). So if a CPU encounters PG_isolated
-     non-LRU movable page, it can skip it. Driver doesn't need to manipulate the
-     flag because VM will set/clear it automatically. Keep in mind that if the
-     driver sees a PG_isolated page, it means the page has been isolated by the
-     VM so it shouldn't touch the page.lru field.
-     The PG_isolated flag is aliased with the PG_reclaim flag so drivers
-     shouldn't use PG_isolated for its own purposes.
-
-Monitoring Migration
-=====================
-
-The following events (counters) can be used to monitor page migration.
-
-1. PGMIGRATE_SUCCESS: Normal page migration success. Each count means that a
-   page was migrated. If the page was a non-THP and non-hugetlb page, then
-   this counter is increased by one. If the page was a THP or hugetlb, then
-   this counter is increased by the number of THP or hugetlb subpages.
-   For example, migration of a single 2MB THP that has 4KB-size base pages
-   (subpages) will cause this counter to increase by 512.
-
-2. PGMIGRATE_FAIL: Normal page migration failure. Same counting rules as for
-   PGMIGRATE_SUCCESS, above: this will be increased by the number of subpages,
-   if it was a THP or hugetlb.
-
-3. THP_MIGRATION_SUCCESS: A THP was migrated without being split.
-
-4. THP_MIGRATION_FAIL: A THP could not be migrated nor it could be split.
-
-5. THP_MIGRATION_SPLIT: A THP was migrated, but not as such: first, the THP had
-   to be split. After splitting, a migration retry was used for it's sub-pages.
-
-THP_MIGRATION_* events also update the appropriate PGMIGRATE_SUCCESS or
-PGMIGRATE_FAIL events. For example, a THP migration failure will cause both
-THP_MIGRATION_FAIL and PGMIGRATE_FAIL to increase.
-
-Christoph Lameter, May 8, 2006.
-Minchan Kim, Mar 28, 2016.
diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst
deleted file mode 100644
index f5c954afe97c..000000000000
--- a/Documentation/vm/page_owner.rst
+++ /dev/null
@@ -1,196 +0,0 @@
-.. _page_owner:
-
-==================================================
-page owner: Tracking about who allocated each page
-==================================================
-
-Introduction
-============
-
-page owner is for the tracking about who allocated each page.
-It can be used to debug memory leak or to find a memory hogger.
-When allocation happens, information about allocation such as call stack
-and order of pages is stored into certain storage for each page.
-When we need to know about status of all pages, we can get and analyze
-this information.
-
-Although we already have tracepoint for tracing page allocation/free,
-using it for analyzing who allocate each page is rather complex. We need
-to enlarge the trace buffer for preventing overlapping until userspace
-program launched. And, launched program continually dump out the trace
-buffer for later analysis and it would change system behaviour with more
-possibility rather than just keeping it in memory, so bad for debugging.
-
-page owner can also be used for various purposes. For example, accurate
-fragmentation statistics can be obtained through gfp flag information of
-each page. It is already implemented and activated if page owner is
-enabled. Other usages are more than welcome.
-
-page owner is disabled by default. So, if you'd like to use it, you need
-to add "page_owner=on" to your boot cmdline. If the kernel is built
-with page owner and page owner is disabled in runtime due to not enabling
-boot option, runtime overhead is marginal. If disabled in runtime, it
-doesn't require memory to store owner information, so there is no runtime
-memory overhead. And, page owner inserts just two unlikely branches into
-the page allocator hotpath and if not enabled, then allocation is done
-like as the kernel without page owner. These two unlikely branches should
-not affect to allocation performance, especially if the static keys jump
-label patching functionality is available. Following is the kernel's code
-size change due to this facility.
-
-- Without page owner::
-
-   text    data     bss     dec     hex filename
-   48392   2333     644   51369    c8a9 mm/page_alloc.o
-
-- With page owner::
-
-   text    data     bss     dec     hex filename
-   48800   2445     644   51889    cab1 mm/page_alloc.o
-   6662     108      29    6799    1a8f mm/page_owner.o
-   1025       8       8    1041     411 mm/page_ext.o
-
-Although, roughly, 8 KB code is added in total, page_alloc.o increase by
-520 bytes and less than half of it is in hotpath. Building the kernel with
-page owner and turning it on if needed would be great option to debug
-kernel memory problem.
-
-There is one notice that is caused by implementation detail. page owner
-stores information into the memory from struct page extension. This memory
-is initialized some time later than that page allocator starts in sparse
-memory system, so, until initialization, many pages can be allocated and
-they would have no owner information. To fix it up, these early allocated
-pages are investigated and marked as allocated in initialization phase.
-Although it doesn't mean that they have the right owner information,
-at least, we can tell whether the page is allocated or not,
-more accurately. On 2GB memory x86-64 VM box, 13343 early allocated pages
-are catched and marked, although they are mostly allocated from struct
-page extension feature. Anyway, after that, no page is left in
-un-tracking state.
-
-Usage
-=====
-
-1) Build user-space helper::
-
-	cd tools/vm
-	make page_owner_sort
-
-2) Enable page owner: add "page_owner=on" to boot cmdline.
-
-3) Do the job that you want to debug.
-
-4) Analyze information from page owner::
-
-	cat /sys/kernel/debug/page_owner > page_owner_full.txt
-	./page_owner_sort page_owner_full.txt sorted_page_owner.txt
-
-   The general output of ``page_owner_full.txt`` is as follows::
-
-	Page allocated via order XXX, ...
-	PFN XXX ...
-	// Detailed stack
-
-	Page allocated via order XXX, ...
-	PFN XXX ...
-	// Detailed stack
-
-   The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows
-   in buf, uses regexp to extract the page order value, counts the times
-   and pages of buf, and finally sorts them according to the parameter(s).
-
-   See the result about who allocated each page
-   in the ``sorted_page_owner.txt``. General output::
-
-	XXX times, XXX pages:
-	Page allocated via order XXX, ...
-	// Detailed stack
-
-   By default, ``page_owner_sort`` is sorted according to the times of buf.
-   If you want to sort by the page nums of buf, use the ``-m`` parameter.
-   The detailed parameters are:
-
-   fundamental function::
-
-	Sort:
-		-a		Sort by memory allocation time.
-		-m		Sort by total memory.
-		-p		Sort by pid.
-		-P		Sort by tgid.
-		-n		Sort by task command name.
-		-r		Sort by memory release time.
-		-s		Sort by stack trace.
-		-t		Sort by times (default).
-		--sort <order>	Specify sorting order.  Sorting syntax is [+|-]key[,[+|-]key[,...]].
-				Choose a key from the **STANDARD FORMAT SPECIFIERS** section. The "+" is
-				optional since default direction is increasing numerical or lexicographic
-				order. Mixed use of abbreviated and complete-form of keys is allowed.
-
-		Examples:
-				./page_owner_sort <input> <output> --sort=n,+pid,-tgid
-				./page_owner_sort <input> <output> --sort=at
-
-   additional function::
-
-	Cull:
-		--cull <rules>
-				Specify culling rules.Culling syntax is key[,key[,...]].Choose a
-				multi-letter key from the **STANDARD FORMAT SPECIFIERS** section.
-
-		<rules> is a single argument in the form of a comma-separated list,
-		which offers a way to specify individual culling rules.  The recognized
-		keywords are described in the **STANDARD FORMAT SPECIFIERS** section below.
-		<rules> can be specified by the sequence of keys k1,k2, ..., as described in
-		the STANDARD SORT KEYS section below. Mixed use of abbreviated and
-		complete-form of keys is allowed.
-
-		Examples:
-				./page_owner_sort <input> <output> --cull=stacktrace
-				./page_owner_sort <input> <output> --cull=st,pid,name
-				./page_owner_sort <input> <output> --cull=n,f
-
-	Filter:
-		-f		Filter out the information of blocks whose memory has been released.
-
-	Select:
-		--pid <pidlist>		Select by pid. This selects the blocks whose process ID
-					numbers appear in <pidlist>.
-		--tgid <tgidlist>	Select by tgid. This selects the blocks whose thread
-					group ID numbers appear in <tgidlist>.
-		--name <cmdlist>	Select by task command name. This selects the blocks whose
-					task command name appear in <cmdlist>.
-
-		<pidlist>, <tgidlist>, <cmdlist> are single arguments in the form of a comma-separated list,
-		which offers a way to specify individual selecting rules.
-
-
-		Examples:
-				./page_owner_sort <input> <output> --pid=1
-				./page_owner_sort <input> <output> --tgid=1,2,3
-				./page_owner_sort <input> <output> --name name1,name2
-
-STANDARD FORMAT SPECIFIERS
-==========================
-::
-
-  For --sort option:
-
-	KEY		LONG		DESCRIPTION
-	p		pid		process ID
-	tg		tgid		thread group ID
-	n		name		task command name
-	st		stacktrace	stack trace of the page allocation
-	T		txt		full text of block
-	ft		free_ts		timestamp of the page when it was released
-	at		alloc_ts	timestamp of the page when it was allocated
-	ator		allocator	memory allocator for pages
-
-  For --curl option:
-
-	KEY		LONG		DESCRIPTION
-	p		pid		process ID
-	tg		tgid		thread group ID
-	n		name		task command name
-	f		free		whether the page has been released or not
-	st		stacktrace	stack trace of the page allocation
-	ator		allocator	memory allocator for pages
diff --git a/Documentation/vm/page_reclaim.rst b/Documentation/vm/page_reclaim.rst
deleted file mode 100644
index 50a30b7f8ac3..000000000000
--- a/Documentation/vm/page_reclaim.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-============
-Page Reclaim
-============
diff --git a/Documentation/vm/page_table_check.rst b/Documentation/vm/page_table_check.rst
deleted file mode 100644
index 1a09472f10a3..000000000000
--- a/Documentation/vm/page_table_check.rst
+++ /dev/null
@@ -1,56 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-.. _page_table_check:
-
-================
-Page Table Check
-================
-
-Introduction
-============
-
-Page table check allows to harden the kernel by ensuring that some types of
-the memory corruptions are prevented.
-
-Page table check performs extra verifications at the time when new pages become
-accessible from the userspace by getting their page table entries (PTEs PMDs
-etc.) added into the table.
-
-In case of detected corruption, the kernel is crashed. There is a small
-performance and memory overhead associated with the page table check. Therefore,
-it is disabled by default, but can be optionally enabled on systems where the
-extra hardening outweighs the performance costs. Also, because page table check
-is synchronous, it can help with debugging double map memory corruption issues,
-by crashing kernel at the time wrong mapping occurs instead of later which is
-often the case with memory corruptions bugs.
-
-Double mapping detection logic
-==============================
-
-+-------------------+-------------------+-------------------+------------------+
-| Current Mapping   | New mapping       | Permissions       | Rule             |
-+===================+===================+===================+==================+
-| Anonymous         | Anonymous         | Read              | Allow            |
-+-------------------+-------------------+-------------------+------------------+
-| Anonymous         | Anonymous         | Read / Write      | Prohibit         |
-+-------------------+-------------------+-------------------+------------------+
-| Anonymous         | Named             | Any               | Prohibit         |
-+-------------------+-------------------+-------------------+------------------+
-| Named             | Anonymous         | Any               | Prohibit         |
-+-------------------+-------------------+-------------------+------------------+
-| Named             | Named             | Any               | Allow            |
-+-------------------+-------------------+-------------------+------------------+
-
-Enabling Page Table Check
-=========================
-
-Build kernel with:
-
-- PAGE_TABLE_CHECK=y
-  Note, it can only be enabled on platforms where ARCH_SUPPORTS_PAGE_TABLE_CHECK
-  is available.
-
-- Boot with 'page_table_check=on' kernel parameter.
-
-Optionally, build kernel with PAGE_TABLE_CHECK_ENFORCED in order to have page
-table support without extra kernel parameter.
diff --git a/Documentation/vm/page_tables.rst b/Documentation/vm/page_tables.rst
deleted file mode 100644
index 96939571d7bc..000000000000
--- a/Documentation/vm/page_tables.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-===========
-Page Tables
-===========
diff --git a/Documentation/vm/physical_memory.rst b/Documentation/vm/physical_memory.rst
deleted file mode 100644
index 2ab7b8c1c863..000000000000
--- a/Documentation/vm/physical_memory.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-===============
-Physical Memory
-===============
diff --git a/Documentation/vm/process_addrs.rst b/Documentation/vm/process_addrs.rst
deleted file mode 100644
index e8618fbc62c9..000000000000
--- a/Documentation/vm/process_addrs.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-=================
-Process Addresses
-=================
diff --git a/Documentation/vm/remap_file_pages.rst b/Documentation/vm/remap_file_pages.rst
deleted file mode 100644
index 7bef6718e3a9..000000000000
--- a/Documentation/vm/remap_file_pages.rst
+++ /dev/null
@@ -1,33 +0,0 @@
-.. _remap_file_pages:
-
-==============================
-remap_file_pages() system call
-==============================
-
-The remap_file_pages() system call is used to create a nonlinear mapping,
-that is, a mapping in which the pages of the file are mapped into a
-nonsequential order in memory. The advantage of using remap_file_pages()
-over using repeated calls to mmap(2) is that the former approach does not
-require the kernel to create additional VMA (Virtual Memory Area) data
-structures.
-
-Supporting of nonlinear mapping requires significant amount of non-trivial
-code in kernel virtual memory subsystem including hot paths. Also to get
-nonlinear mapping work kernel need a way to distinguish normal page table
-entries from entries with file offset (pte_file). Kernel reserves flag in
-PTE for this purpose. PTE flags are scarce resource especially on some CPU
-architectures. It would be nice to free up the flag for other usage.
-
-Fortunately, there are not many users of remap_file_pages() in the wild.
-It's only known that one enterprise RDBMS implementation uses the syscall
-on 32-bit systems to map files bigger than can linearly fit into 32-bit
-virtual address space. This use-case is not critical anymore since 64-bit
-systems are widely available.
-
-The syscall is deprecated and replaced it with an emulation now. The
-emulation creates new VMAs instead of nonlinear mappings. It's going to
-work slower for rare users of remap_file_pages() but ABI is preserved.
-
-One side effect of emulation (apart from performance) is that user can hit
-vm.max_map_count limit more easily due to additional VMAs. See comment for
-DEFAULT_MAX_MAP_COUNT for more details on the limit.
diff --git a/Documentation/vm/shmfs.rst b/Documentation/vm/shmfs.rst
deleted file mode 100644
index 8b01ebb4c30e..000000000000
--- a/Documentation/vm/shmfs.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-========================
-Shared Memory Filesystem
-========================
diff --git a/Documentation/vm/slab.rst b/Documentation/vm/slab.rst
deleted file mode 100644
index 87d5a5bb172f..000000000000
--- a/Documentation/vm/slab.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-===============
-Slab Allocation
-===============
diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst
deleted file mode 100644
index 43063ade737a..000000000000
--- a/Documentation/vm/slub.rst
+++ /dev/null
@@ -1,452 +0,0 @@
-.. _slub:
-
-==========================
-Short users guide for SLUB
-==========================
-
-The basic philosophy of SLUB is very different from SLAB. SLAB
-requires rebuilding the kernel to activate debug options for all
-slab caches. SLUB always includes full debugging but it is off by default.
-SLUB can enable debugging only for selected slabs in order to avoid
-an impact on overall system performance which may make a bug more
-difficult to find.
-
-In order to switch debugging on one can add an option ``slub_debug``
-to the kernel command line. That will enable full debugging for
-all slabs.
-
-Typically one would then use the ``slabinfo`` command to get statistical
-data and perform operation on the slabs. By default ``slabinfo`` only lists
-slabs that have data in them. See "slabinfo -h" for more options when
-running the command. ``slabinfo`` can be compiled with
-::
-
-	gcc -o slabinfo tools/vm/slabinfo.c
-
-Some of the modes of operation of ``slabinfo`` require that slub debugging
-be enabled on the command line. F.e. no tracking information will be
-available without debugging on and validation can only partially
-be performed if debugging was not switched on.
-
-Some more sophisticated uses of slub_debug:
--------------------------------------------
-
-Parameters may be given to ``slub_debug``. If none is specified then full
-debugging is enabled. Format:
-
-slub_debug=<Debug-Options>
-	Enable options for all slabs
-
-slub_debug=<Debug-Options>,<slab name1>,<slab name2>,...
-	Enable options only for select slabs (no spaces
-	after a comma)
-
-Multiple blocks of options for all slabs or selected slabs can be given, with
-blocks of options delimited by ';'. The last of "all slabs" blocks is applied
-to all slabs except those that match one of the "select slabs" block. Options
-of the first "select slabs" blocks that matches the slab's name are applied.
-
-Possible debug options are::
-
-	F		Sanity checks on (enables SLAB_DEBUG_CONSISTENCY_CHECKS
-			Sorry SLAB legacy issues)
-	Z		Red zoning
-	P		Poisoning (object and padding)
-	U		User tracking (free and alloc)
-	T		Trace (please only use on single slabs)
-	A		Enable failslab filter mark for the cache
-	O		Switch debugging off for caches that would have
-			caused higher minimum slab orders
-	-		Switch all debugging off (useful if the kernel is
-			configured with CONFIG_SLUB_DEBUG_ON)
-
-F.e. in order to boot just with sanity checks and red zoning one would specify::
-
-	slub_debug=FZ
-
-Trying to find an issue in the dentry cache? Try::
-
-	slub_debug=,dentry
-
-to only enable debugging on the dentry cache.  You may use an asterisk at the
-end of the slab name, in order to cover all slabs with the same prefix.  For
-example, here's how you can poison the dentry cache as well as all kmalloc
-slabs::
-
-	slub_debug=P,kmalloc-*,dentry
-
-Red zoning and tracking may realign the slab.  We can just apply sanity checks
-to the dentry cache with::
-
-	slub_debug=F,dentry
-
-Debugging options may require the minimum possible slab order to increase as
-a result of storing the metadata (for example, caches with PAGE_SIZE object
-sizes).  This has a higher liklihood of resulting in slab allocation errors
-in low memory situations or if there's high fragmentation of memory.  To
-switch off debugging for such caches by default, use::
-
-	slub_debug=O
-
-You can apply different options to different list of slab names, using blocks
-of options. This will enable red zoning for dentry and user tracking for
-kmalloc. All other slabs will not get any debugging enabled::
-
-	slub_debug=Z,dentry;U,kmalloc-*
-
-You can also enable options (e.g. sanity checks and poisoning) for all caches
-except some that are deemed too performance critical and don't need to be
-debugged by specifying global debug options followed by a list of slab names
-with "-" as options::
-
-	slub_debug=FZ;-,zs_handle,zspage
-
-The state of each debug option for a slab can be found in the respective files
-under::
-
-	/sys/kernel/slab/<slab name>/
-
-If the file contains 1, the option is enabled, 0 means disabled. The debug
-options from the ``slub_debug`` parameter translate to the following files::
-
-	F	sanity_checks
-	Z	red_zone
-	P	poison
-	U	store_user
-	T	trace
-	A	failslab
-
-Careful with tracing: It may spew out lots of information and never stop if
-used on the wrong slab.
-
-Slab merging
-============
-
-If no debug options are specified then SLUB may merge similar slabs together
-in order to reduce overhead and increase cache hotness of objects.
-``slabinfo -a`` displays which slabs were merged together.
-
-Slab validation
-===============
-
-SLUB can validate all object if the kernel was booted with slub_debug. In
-order to do so you must have the ``slabinfo`` tool. Then you can do
-::
-
-	slabinfo -v
-
-which will test all objects. Output will be generated to the syslog.
-
-This also works in a more limited way if boot was without slab debug.
-In that case ``slabinfo -v`` simply tests all reachable objects. Usually
-these are in the cpu slabs and the partial slabs. Full slabs are not
-tracked by SLUB in a non debug situation.
-
-Getting more performance
-========================
-
-To some degree SLUB's performance is limited by the need to take the
-list_lock once in a while to deal with partial slabs. That overhead is
-governed by the order of the allocation for each slab. The allocations
-can be influenced by kernel parameters:
-
-.. slub_min_objects=x		(default 4)
-.. slub_min_order=x		(default 0)
-.. slub_max_order=x		(default 3 (PAGE_ALLOC_COSTLY_ORDER))
-
-``slub_min_objects``
-	allows to specify how many objects must at least fit into one
-	slab in order for the allocation order to be acceptable.  In
-	general slub will be able to perform this number of
-	allocations on a slab without consulting centralized resources
-	(list_lock) where contention may occur.
-
-``slub_min_order``
-	specifies a minimum order of slabs. A similar effect like
-	``slub_min_objects``.
-
-``slub_max_order``
-	specified the order at which ``slub_min_objects`` should no
-	longer be checked. This is useful to avoid SLUB trying to
-	generate super large order pages to fit ``slub_min_objects``
-	of a slab cache with large object sizes into one high order
-	page. Setting command line parameter
-	``debug_guardpage_minorder=N`` (N > 0), forces setting
-	``slub_max_order`` to 0, what cause minimum possible order of
-	slabs allocation.
-
-SLUB Debug output
-=================
-
-Here is a sample of slub debug output::
-
- ====================================================================
- BUG kmalloc-8: Right Redzone overwritten
- --------------------------------------------------------------------
-
- INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc
- INFO: Slab 0xc528c530 flags=0x400000c3 inuse=61 fp=0xc90f6d58
- INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58
- INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554
-
- Bytes b4 (0xc90f6d10): 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ
- Object   (0xc90f6d20): 31 30 31 39 2e 30 30 35                         1019.005
- Redzone  (0xc90f6d28): 00 cc cc cc                                     .
- Padding  (0xc90f6d50): 5a 5a 5a 5a 5a 5a 5a 5a                         ZZZZZZZZ
-
-   [<c010523d>] dump_trace+0x63/0x1eb
-   [<c01053df>] show_trace_log_lvl+0x1a/0x2f
-   [<c010601d>] show_trace+0x12/0x14
-   [<c0106035>] dump_stack+0x16/0x18
-   [<c017e0fa>] object_err+0x143/0x14b
-   [<c017e2cc>] check_object+0x66/0x234
-   [<c017eb43>] __slab_free+0x239/0x384
-   [<c017f446>] kfree+0xa6/0xc6
-   [<c02e2335>] get_modalias+0xb9/0xf5
-   [<c02e23b7>] dmi_dev_uevent+0x27/0x3c
-   [<c027866a>] dev_uevent+0x1ad/0x1da
-   [<c0205024>] kobject_uevent_env+0x20a/0x45b
-   [<c020527f>] kobject_uevent+0xa/0xf
-   [<c02779f1>] store_uevent+0x4f/0x58
-   [<c027758e>] dev_attr_store+0x29/0x2f
-   [<c01bec4f>] sysfs_write_file+0x16e/0x19c
-   [<c0183ba7>] vfs_write+0xd1/0x15a
-   [<c01841d7>] sys_write+0x3d/0x72
-   [<c0104112>] sysenter_past_esp+0x5f/0x99
-   [<b7f7b410>] 0xb7f7b410
-   =======================
-
- FIX kmalloc-8: Restoring Redzone 0xc90f6d28-0xc90f6d2b=0xcc
-
-If SLUB encounters a corrupted object (full detection requires the kernel
-to be booted with slub_debug) then the following output will be dumped
-into the syslog:
-
-1. Description of the problem encountered
-
-   This will be a message in the system log starting with::
-
-     ===============================================
-     BUG <slab cache affected>: <What went wrong>
-     -----------------------------------------------
-
-     INFO: <corruption start>-<corruption_end> <more info>
-     INFO: Slab <address> <slab information>
-     INFO: Object <address> <object information>
-     INFO: Allocated in <kernel function> age=<jiffies since alloc> cpu=<allocated by
-	cpu> pid=<pid of the process>
-     INFO: Freed in <kernel function> age=<jiffies since free> cpu=<freed by cpu>
-	pid=<pid of the process>
-
-   (Object allocation / free information is only available if SLAB_STORE_USER is
-   set for the slab. slub_debug sets that option)
-
-2. The object contents if an object was involved.
-
-   Various types of lines can follow the BUG SLUB line:
-
-   Bytes b4 <address> : <bytes>
-	Shows a few bytes before the object where the problem was detected.
-	Can be useful if the corruption does not stop with the start of the
-	object.
-
-   Object <address> : <bytes>
-	The bytes of the object. If the object is inactive then the bytes
-	typically contain poison values. Any non-poison value shows a
-	corruption by a write after free.
-
-   Redzone <address> : <bytes>
-	The Redzone following the object. The Redzone is used to detect
-	writes after the object. All bytes should always have the same
-	value. If there is any deviation then it is due to a write after
-	the object boundary.
-
-	(Redzone information is only available if SLAB_RED_ZONE is set.
-	slub_debug sets that option)
-
-   Padding <address> : <bytes>
-	Unused data to fill up the space in order to get the next object
-	properly aligned. In the debug case we make sure that there are
-	at least 4 bytes of padding. This allows the detection of writes
-	before the object.
-
-3. A stackdump
-
-   The stackdump describes the location where the error was detected. The cause
-   of the corruption is may be more likely found by looking at the function that
-   allocated or freed the object.
-
-4. Report on how the problem was dealt with in order to ensure the continued
-   operation of the system.
-
-   These are messages in the system log beginning with::
-
-	FIX <slab cache affected>: <corrective action taken>
-
-   In the above sample SLUB found that the Redzone of an active object has
-   been overwritten. Here a string of 8 characters was written into a slab that
-   has the length of 8 characters. However, a 8 character string needs a
-   terminating 0. That zero has overwritten the first byte of the Redzone field.
-   After reporting the details of the issue encountered the FIX SLUB message
-   tells us that SLUB has restored the Redzone to its proper value and then
-   system operations continue.
-
-Emergency operations
-====================
-
-Minimal debugging (sanity checks alone) can be enabled by booting with::
-
-	slub_debug=F
-
-This will be generally be enough to enable the resiliency features of slub
-which will keep the system running even if a bad kernel component will
-keep corrupting objects. This may be important for production systems.
-Performance will be impacted by the sanity checks and there will be a
-continual stream of error messages to the syslog but no additional memory
-will be used (unlike full debugging).
-
-No guarantees. The kernel component still needs to be fixed. Performance
-may be optimized further by locating the slab that experiences corruption
-and enabling debugging only for that cache
-
-I.e.::
-
-	slub_debug=F,dentry
-
-If the corruption occurs by writing after the end of the object then it
-may be advisable to enable a Redzone to avoid corrupting the beginning
-of other objects::
-
-	slub_debug=FZ,dentry
-
-Extended slabinfo mode and plotting
-===================================
-
-The ``slabinfo`` tool has a special 'extended' ('-X') mode that includes:
- - Slabcache Totals
- - Slabs sorted by size (up to -N <num> slabs, default 1)
- - Slabs sorted by loss (up to -N <num> slabs, default 1)
-
-Additionally, in this mode ``slabinfo`` does not dynamically scale
-sizes (G/M/K) and reports everything in bytes (this functionality is
-also available to other slabinfo modes via '-B' option) which makes
-reporting more precise and accurate. Moreover, in some sense the `-X'
-mode also simplifies the analysis of slabs' behaviour, because its
-output can be plotted using the ``slabinfo-gnuplot.sh`` script. So it
-pushes the analysis from looking through the numbers (tons of numbers)
-to something easier -- visual analysis.
-
-To generate plots:
-
-a) collect slabinfo extended records, for example::
-
-	while [ 1 ]; do slabinfo -X >> FOO_STATS; sleep 1; done
-
-b) pass stats file(-s) to ``slabinfo-gnuplot.sh`` script::
-
-	slabinfo-gnuplot.sh FOO_STATS [FOO_STATS2 .. FOO_STATSN]
-
-   The ``slabinfo-gnuplot.sh`` script will pre-processes the collected records
-   and generates 3 png files (and 3 pre-processing cache files) per STATS
-   file:
-   - Slabcache Totals: FOO_STATS-totals.png
-   - Slabs sorted by size: FOO_STATS-slabs-by-size.png
-   - Slabs sorted by loss: FOO_STATS-slabs-by-loss.png
-
-Another use case, when ``slabinfo-gnuplot.sh`` can be useful, is when you
-need to compare slabs' behaviour "prior to" and "after" some code
-modification.  To help you out there, ``slabinfo-gnuplot.sh`` script
-can 'merge' the `Slabcache Totals` sections from different
-measurements. To visually compare N plots:
-
-a) Collect as many STATS1, STATS2, .. STATSN files as you need::
-
-	while [ 1 ]; do slabinfo -X >> STATS<X>; sleep 1; done
-
-b) Pre-process those STATS files::
-
-	slabinfo-gnuplot.sh STATS1 STATS2 .. STATSN
-
-c) Execute ``slabinfo-gnuplot.sh`` in '-t' mode, passing all of the
-   generated pre-processed \*-totals::
-
-	slabinfo-gnuplot.sh -t STATS1-totals STATS2-totals .. STATSN-totals
-
-   This will produce a single plot (png file).
-
-   Plots, expectedly, can be large so some fluctuations or small spikes
-   can go unnoticed. To deal with that, ``slabinfo-gnuplot.sh`` has two
-   options to 'zoom-in'/'zoom-out':
-
-   a) ``-s %d,%d`` -- overwrites the default image width and height
-   b) ``-r %d,%d`` -- specifies a range of samples to use (for example,
-      in ``slabinfo -X >> FOO_STATS; sleep 1;`` case, using a ``-r
-      40,60`` range will plot only samples collected between 40th and
-      60th seconds).
-
-
-DebugFS files for SLUB
-======================
-
-For more information about current state of SLUB caches with the user tracking
-debug option enabled, debugfs files are available, typically under
-/sys/kernel/debug/slab/<cache>/ (created only for caches with enabled user
-tracking). There are 2 types of these files with the following debug
-information:
-
-1. alloc_traces::
-
-    Prints information about unique allocation traces of the currently
-    allocated objects. The output is sorted by frequency of each trace.
-
-    Information in the output:
-    Number of objects, allocating function, minimal/average/maximal jiffies since alloc,
-    pid range of the allocating processes, cpu mask of allocating cpus, and stack trace.
-
-    Example:::
-
-    1085 populate_error_injection_list+0x97/0x110 age=166678/166680/166682 pid=1 cpus=1::
-	__slab_alloc+0x6d/0x90
-	kmem_cache_alloc_trace+0x2eb/0x300
-	populate_error_injection_list+0x97/0x110
-	init_error_injection+0x1b/0x71
-	do_one_initcall+0x5f/0x2d0
-	kernel_init_freeable+0x26f/0x2d7
-	kernel_init+0xe/0x118
-	ret_from_fork+0x22/0x30
-
-
-2. free_traces::
-
-    Prints information about unique freeing traces of the currently allocated
-    objects. The freeing traces thus come from the previous life-cycle of the
-    objects and are reported as not available for objects allocated for the first
-    time. The output is sorted by frequency of each trace.
-
-    Information in the output:
-    Number of objects, freeing function, minimal/average/maximal jiffies since free,
-    pid range of the freeing processes, cpu mask of freeing cpus, and stack trace.
-
-    Example:::
-
-    1980 <not-available> age=4294912290 pid=0 cpus=0
-    51 acpi_ut_update_ref_count+0x6a6/0x782 age=236886/237027/237772 pid=1 cpus=1
-	kfree+0x2db/0x420
-	acpi_ut_update_ref_count+0x6a6/0x782
-	acpi_ut_update_object_reference+0x1ad/0x234
-	acpi_ut_remove_reference+0x7d/0x84
-	acpi_rs_get_prt_method_data+0x97/0xd6
-	acpi_get_irq_routing_table+0x82/0xc4
-	acpi_pci_irq_find_prt_entry+0x8e/0x2e0
-	acpi_pci_irq_lookup+0x3a/0x1e0
-	acpi_pci_irq_enable+0x77/0x240
-	pcibios_enable_device+0x39/0x40
-	do_pci_enable_device.part.0+0x5d/0xe0
-	pci_enable_device_flags+0xfc/0x120
-	pci_enable_device+0x13/0x20
-	virtio_pci_probe+0x9e/0x170
-	local_pci_probe+0x48/0x80
-	pci_device_probe+0x105/0x1c0
-
-Christoph Lameter, May 30, 2007
-Sergey Senozhatsky, October 23, 2015
diff --git a/Documentation/vm/split_page_table_lock.rst b/Documentation/vm/split_page_table_lock.rst
deleted file mode 100644
index c08919662704..000000000000
--- a/Documentation/vm/split_page_table_lock.rst
+++ /dev/null
@@ -1,100 +0,0 @@
-.. _split_page_table_lock:
-
-=====================
-Split page table lock
-=====================
-
-Originally, mm->page_table_lock spinlock protected all page tables of the
-mm_struct. But this approach leads to poor page fault scalability of
-multi-threaded applications due high contention on the lock. To improve
-scalability, split page table lock was introduced.
-
-With split page table lock we have separate per-table lock to serialize
-access to the table. At the moment we use split lock for PTE and PMD
-tables. Access to higher level tables protected by mm->page_table_lock.
-
-There are helpers to lock/unlock a table and other accessor functions:
-
- - pte_offset_map_lock()
-	maps pte and takes PTE table lock, returns pointer to the taken
-	lock;
- - pte_unmap_unlock()
-	unlocks and unmaps PTE table;
- - pte_alloc_map_lock()
-	allocates PTE table if needed and take the lock, returns pointer
-	to taken lock or NULL if allocation failed;
- - pte_lockptr()
-	returns pointer to PTE table lock;
- - pmd_lock()
-	takes PMD table lock, returns pointer to taken lock;
- - pmd_lockptr()
-	returns pointer to PMD table lock;
-
-Split page table lock for PTE tables is enabled compile-time if
-CONFIG_SPLIT_PTLOCK_CPUS (usually 4) is less or equal to NR_CPUS.
-If split lock is disabled, all tables are guarded by mm->page_table_lock.
-
-Split page table lock for PMD tables is enabled, if it's enabled for PTE
-tables and the architecture supports it (see below).
-
-Hugetlb and split page table lock
-=================================
-
-Hugetlb can support several page sizes. We use split lock only for PMD
-level, but not for PUD.
-
-Hugetlb-specific helpers:
-
- - huge_pte_lock()
-	takes pmd split lock for PMD_SIZE page, mm->page_table_lock
-	otherwise;
- - huge_pte_lockptr()
-	returns pointer to table lock;
-
-Support of split page table lock by an architecture
-===================================================
-
-There's no need in special enabling of PTE split page table lock: everything
-required is done by pgtable_pte_page_ctor() and pgtable_pte_page_dtor(), which
-must be called on PTE table allocation / freeing.
-
-Make sure the architecture doesn't use slab allocator for page table
-allocation: slab uses page->slab_cache for its pages.
-This field shares storage with page->ptl.
-
-PMD split lock only makes sense if you have more than two page table
-levels.
-
-PMD split lock enabling requires pgtable_pmd_page_ctor() call on PMD table
-allocation and pgtable_pmd_page_dtor() on freeing.
-
-Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and
-pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing
-paths: i.e X86_PAE preallocate few PMDs on pgd_alloc().
-
-With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK.
-
-NOTE: pgtable_pte_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must
-be handled properly.
-
-page->ptl
-=========
-
-page->ptl is used to access split page table lock, where 'page' is struct
-page of page containing the table. It shares storage with page->private
-(and few other fields in union).
-
-To avoid increasing size of struct page and have best performance, we use a
-trick:
-
- - if spinlock_t fits into long, we use page->ptr as spinlock, so we
-   can avoid indirect access and save a cache line.
- - if size of spinlock_t is bigger then size of long, we use page->ptl as
-   pointer to spinlock_t and allocate it dynamically. This allows to use
-   split lock with enabled DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC, but costs
-   one more cache line for indirect access;
-
-The spinlock_t allocated in pgtable_pte_page_ctor() for PTE table and in
-pgtable_pmd_page_ctor() for PMD table.
-
-Please, never access page->ptl directly -- use appropriate helper.
diff --git a/Documentation/vm/swap.rst b/Documentation/vm/swap.rst
deleted file mode 100644
index 78819bd4d745..000000000000
--- a/Documentation/vm/swap.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-====
-Swap
-====
diff --git a/Documentation/vm/transhuge.rst b/Documentation/vm/transhuge.rst
deleted file mode 100644
index 216db1d67d04..000000000000
--- a/Documentation/vm/transhuge.rst
+++ /dev/null
@@ -1,187 +0,0 @@
-.. _transhuge:
-
-============================
-Transparent Hugepage Support
-============================
-
-This document describes design principles for Transparent Hugepage (THP)
-support and its interaction with other parts of the memory management
-system.
-
-Design principles
-=================
-
-- "graceful fallback": mm components which don't have transparent hugepage
-  knowledge fall back to breaking huge pmd mapping into table of ptes and,
-  if necessary, split a transparent hugepage. Therefore these components
-  can continue working on the regular pages or regular pte mappings.
-
-- if a hugepage allocation fails because of memory fragmentation,
-  regular pages should be gracefully allocated instead and mixed in
-  the same vma without any failure or significant delay and without
-  userland noticing
-
-- if some task quits and more hugepages become available (either
-  immediately in the buddy or through the VM), guest physical memory
-  backed by regular pages should be relocated on hugepages
-  automatically (with khugepaged)
-
-- it doesn't require memory reservation and in turn it uses hugepages
-  whenever possible (the only possible reservation here is kernelcore=
-  to avoid unmovable pages to fragment all the memory but such a tweak
-  is not specific to transparent hugepage support and it's a generic
-  feature that applies to all dynamic high order allocations in the
-  kernel)
-
-get_user_pages and follow_page
-==============================
-
-get_user_pages and follow_page if run on a hugepage, will return the
-head or tail pages as usual (exactly as they would do on
-hugetlbfs). Most GUP users will only care about the actual physical
-address of the page and its temporary pinning to release after the I/O
-is complete, so they won't ever notice the fact the page is huge. But
-if any driver is going to mangle over the page structure of the tail
-page (like for checking page->mapping or other bits that are relevant
-for the head page and not the tail page), it should be updated to jump
-to check head page instead. Taking a reference on any head/tail page would
-prevent the page from being split by anyone.
-
-.. note::
-   these aren't new constraints to the GUP API, and they match the
-   same constraints that apply to hugetlbfs too, so any driver capable
-   of handling GUP on hugetlbfs will also work fine on transparent
-   hugepage backed mappings.
-
-Graceful fallback
-=================
-
-Code walking pagetables but unaware about huge pmds can simply call
-split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by
-pmd_offset. It's trivial to make the code transparent hugepage aware
-by just grepping for "pmd_offset" and adding split_huge_pmd where
-missing after pmd_offset returns the pmd. Thanks to the graceful
-fallback design, with a one liner change, you can avoid to write
-hundreds if not thousands of lines of complex code to make your code
-hugepage aware.
-
-If you're not walking pagetables but you run into a physical hugepage
-that you can't handle natively in your code, you can split it by
-calling split_huge_page(page). This is what the Linux VM does before
-it tries to swapout the hugepage for example. split_huge_page() can fail
-if the page is pinned and you must handle this correctly.
-
-Example to make mremap.c transparent hugepage aware with a one liner
-change::
-
-	diff --git a/mm/mremap.c b/mm/mremap.c
-	--- a/mm/mremap.c
-	+++ b/mm/mremap.c
-	@@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru
-			return NULL;
-
-		pmd = pmd_offset(pud, addr);
-	+	split_huge_pmd(vma, pmd, addr);
-		if (pmd_none_or_clear_bad(pmd))
-			return NULL;
-
-Locking in hugepage aware code
-==============================
-
-We want as much code as possible hugepage aware, as calling
-split_huge_page() or split_huge_pmd() has a cost.
-
-To make pagetable walks huge pmd aware, all you need to do is to call
-pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
-mmap_lock in read (or write) mode to be sure a huge pmd cannot be
-created from under you by khugepaged (khugepaged collapse_huge_page
-takes the mmap_lock in write mode in addition to the anon_vma lock). If
-pmd_trans_huge returns false, you just fallback in the old code
-paths. If instead pmd_trans_huge returns true, you have to take the
-page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the
-page table lock will prevent the huge pmd being converted into a
-regular pmd from under you (split_huge_pmd can run in parallel to the
-pagetable walk). If the second pmd_trans_huge returns false, you
-should just drop the page table lock and fallback to the old code as
-before. Otherwise, you can proceed to process the huge pmd and the
-hugepage natively. Once finished, you can drop the page table lock.
-
-Refcounts and transparent huge pages
-====================================
-
-Refcounting on THP is mostly consistent with refcounting on other compound
-pages:
-
-  - get_page()/put_page() and GUP operate on head page's ->_refcount.
-
-  - ->_refcount in tail pages is always zero: get_page_unless_zero() never
-    succeeds on tail pages.
-
-  - map/unmap of the pages with PTE entry increment/decrement ->_mapcount
-    on relevant sub-page of the compound page.
-
-  - map/unmap of the whole compound page is accounted for in compound_mapcount
-    (stored in first tail page). For file huge pages, we also increment
-    ->_mapcount of all sub-pages in order to have race-free detection of
-    last unmap of subpages.
-
-PageDoubleMap() indicates that the page is *possibly* mapped with PTEs.
-
-For anonymous pages, PageDoubleMap() also indicates ->_mapcount in all
-subpages is offset up by one. This additional reference is required to
-get race-free detection of unmap of subpages when we have them mapped with
-both PMDs and PTEs.
-
-This optimization is required to lower the overhead of per-subpage mapcount
-tracking. The alternative is to alter ->_mapcount in all subpages on each
-map/unmap of the whole compound page.
-
-For anonymous pages, we set PG_double_map when a PMD of the page is split
-for the first time, but still have a PMD mapping. The additional references
-go away with the last compound_mapcount.
-
-File pages get PG_double_map set on the first map of the page with PTE and
-goes away when the page gets evicted from the page cache.
-
-split_huge_page internally has to distribute the refcounts in the head
-page to the tail pages before clearing all PG_head/tail bits from the page
-structures. It can be done easily for refcounts taken by page table
-entries, but we don't have enough information on how to distribute any
-additional pins (i.e. from get_user_pages). split_huge_page() fails any
-requests to split pinned huge pages: it expects page count to be equal to
-the sum of mapcount of all sub-pages plus one (split_huge_page caller must
-have a reference to the head page).
-
-split_huge_page uses migration entries to stabilize page->_refcount and
-page->_mapcount of anonymous pages. File pages just get unmapped.
-
-We are safe against physical memory scanners too: the only legitimate way
-a scanner can get a reference to a page is get_page_unless_zero().
-
-All tail pages have zero ->_refcount until atomic_add(). This prevents the
-scanner from getting a reference to the tail page up to that point. After the
-atomic_add() we don't care about the ->_refcount value. We already know how
-many references should be uncharged from the head page.
-
-For head page get_page_unless_zero() will succeed and we don't mind. It's
-clear where references should go after split: it will stay on the head page.
-
-Note that split_huge_pmd() doesn't have any limitations on refcounting:
-pmd can be split at any point and never fails.
-
-Partial unmap and deferred_split_huge_page()
-============================================
-
-Unmapping part of THP (with munmap() or other way) is not going to free
-memory immediately. Instead, we detect that a subpage of THP is not in use
-in page_remove_rmap() and queue the THP for splitting if memory pressure
-comes. Splitting will free up unused subpages.
-
-Splitting the page right away is not an option due to locking context in
-the place where we can detect partial unmap. It also might be
-counterproductive since in many cases partial unmap happens during exit(2) if
-a THP crosses a VMA boundary.
-
-The function deferred_split_huge_page() is used to queue a page for splitting.
-The splitting itself will happen when we get memory pressure via shrinker
-interface.
diff --git a/Documentation/vm/unevictable-lru.rst b/Documentation/vm/unevictable-lru.rst
deleted file mode 100644
index b280367d6a44..000000000000
--- a/Documentation/vm/unevictable-lru.rst
+++ /dev/null
@@ -1,554 +0,0 @@
-.. _unevictable_lru:
-
-==============================
-Unevictable LRU Infrastructure
-==============================
-
-.. contents:: :local:
-
-
-Introduction
-============
-
-This document describes the Linux memory manager's "Unevictable LRU"
-infrastructure and the use of this to manage several types of "unevictable"
-pages.
-
-The document attempts to provide the overall rationale behind this mechanism
-and the rationale for some of the design decisions that drove the
-implementation.  The latter design rationale is discussed in the context of an
-implementation description.  Admittedly, one can obtain the implementation
-details - the "what does it do?" - by reading the code.  One hopes that the
-descriptions below add value by provide the answer to "why does it do that?".
-
-
-
-The Unevictable LRU
-===================
-
-The Unevictable LRU facility adds an additional LRU list to track unevictable
-pages and to hide these pages from vmscan.  This mechanism is based on a patch
-by Larry Woodman of Red Hat to address several scalability problems with page
-reclaim in Linux.  The problems have been observed at customer sites on large
-memory x86_64 systems.
-
-To illustrate this with an example, a non-NUMA x86_64 platform with 128GB of
-main memory will have over 32 million 4k pages in a single node.  When a large
-fraction of these pages are not evictable for any reason [see below], vmscan
-will spend a lot of time scanning the LRU lists looking for the small fraction
-of pages that are evictable.  This can result in a situation where all CPUs are
-spending 100% of their time in vmscan for hours or days on end, with the system
-completely unresponsive.
-
-The unevictable list addresses the following classes of unevictable pages:
-
- * Those owned by ramfs.
-
- * Those mapped into SHM_LOCK'd shared memory regions.
-
- * Those mapped into VM_LOCKED [mlock()ed] VMAs.
-
-The infrastructure may also be able to handle other conditions that make pages
-unevictable, either by definition or by circumstance, in the future.
-
-
-The Unevictable LRU Page List
------------------------------
-
-The Unevictable LRU page list is a lie.  It was never an LRU-ordered list, but a
-companion to the LRU-ordered anonymous and file, active and inactive page lists;
-and now it is not even a page list.  But following familiar convention, here in
-this document and in the source, we often imagine it as a fifth LRU page list.
-
-The Unevictable LRU infrastructure consists of an additional, per-node, LRU list
-called the "unevictable" list and an associated page flag, PG_unevictable, to
-indicate that the page is being managed on the unevictable list.
-
-The PG_unevictable flag is analogous to, and mutually exclusive with, the
-PG_active flag in that it indicates on which LRU list a page resides when
-PG_lru is set.
-
-The Unevictable LRU infrastructure maintains unevictable pages as if they were
-on an additional LRU list for a few reasons:
-
- (1) We get to "treat unevictable pages just like we treat other pages in the
-     system - which means we get to use the same code to manipulate them, the
-     same code to isolate them (for migrate, etc.), the same code to keep track
-     of the statistics, etc..." [Rik van Riel]
-
- (2) We want to be able to migrate unevictable pages between nodes for memory
-     defragmentation, workload management and memory hotplug.  The Linux kernel
-     can only migrate pages that it can successfully isolate from the LRU
-     lists (or "Movable" pages: outside of consideration here).  If we were to
-     maintain pages elsewhere than on an LRU-like list, where they can be
-     detected by isolate_lru_page(), we would prevent their migration.
-
-The unevictable list does not differentiate between file-backed and anonymous,
-swap-backed pages.  This differentiation is only important while the pages are,
-in fact, evictable.
-
-The unevictable list benefits from the "arrayification" of the per-node LRU
-lists and statistics originally proposed and posted by Christoph Lameter.
-
-
-Memory Control Group Interaction
---------------------------------
-
-The unevictable LRU facility interacts with the memory control group [aka
-memory controller; see Documentation/admin-guide/cgroup-v1/memory.rst] by
-extending the lru_list enum.
-
-The memory controller data structure automatically gets a per-node unevictable
-list as a result of the "arrayification" of the per-node LRU lists (one per
-lru_list enum element).  The memory controller tracks the movement of pages to
-and from the unevictable list.
-
-When a memory control group comes under memory pressure, the controller will
-not attempt to reclaim pages on the unevictable list.  This has a couple of
-effects:
-
- (1) Because the pages are "hidden" from reclaim on the unevictable list, the
-     reclaim process can be more efficient, dealing only with pages that have a
-     chance of being reclaimed.
-
- (2) On the other hand, if too many of the pages charged to the control group
-     are unevictable, the evictable portion of the working set of the tasks in
-     the control group may not fit into the available memory.  This can cause
-     the control group to thrash or to OOM-kill tasks.
-
-
-.. _mark_addr_space_unevict:
-
-Marking Address Spaces Unevictable
-----------------------------------
-
-For facilities such as ramfs none of the pages attached to the address space
-may be evicted.  To prevent eviction of any such pages, the AS_UNEVICTABLE
-address space flag is provided, and this can be manipulated by a filesystem
-using a number of wrapper functions:
-
- * ``void mapping_set_unevictable(struct address_space *mapping);``
-
-	Mark the address space as being completely unevictable.
-
- * ``void mapping_clear_unevictable(struct address_space *mapping);``
-
-	Mark the address space as being evictable.
-
- * ``int mapping_unevictable(struct address_space *mapping);``
-
-	Query the address space, and return true if it is completely
-	unevictable.
-
-These are currently used in three places in the kernel:
-
- (1) By ramfs to mark the address spaces of its inodes when they are created,
-     and this mark remains for the life of the inode.
-
- (2) By SYSV SHM to mark SHM_LOCK'd address spaces until SHM_UNLOCK is called.
-     Note that SHM_LOCK is not required to page in the locked pages if they're
-     swapped out; the application must touch the pages manually if it wants to
-     ensure they're in memory.
-
- (3) By the i915 driver to mark pinned address space until it's unpinned. The
-     amount of unevictable memory marked by i915 driver is roughly the bounded
-     object size in debugfs/dri/0/i915_gem_objects.
-
-
-Detecting Unevictable Pages
----------------------------
-
-The function page_evictable() in mm/internal.h determines whether a page is
-evictable or not using the query function outlined above [see section
-:ref:`Marking address spaces unevictable <mark_addr_space_unevict>`]
-to check the AS_UNEVICTABLE flag.
-
-For address spaces that are so marked after being populated (as SHM regions
-might be), the lock action (e.g. SHM_LOCK) can be lazy, and need not populate
-the page tables for the region as does, for example, mlock(), nor need it make
-any special effort to push any pages in the SHM_LOCK'd area to the unevictable
-list.  Instead, vmscan will do this if and when it encounters the pages during
-a reclamation scan.
-
-On an unlock action (such as SHM_UNLOCK), the unlocker (e.g. shmctl()) must scan
-the pages in the region and "rescue" them from the unevictable list if no other
-condition is keeping them unevictable.  If an unevictable region is destroyed,
-the pages are also "rescued" from the unevictable list in the process of
-freeing them.
-
-page_evictable() also checks for mlocked pages by testing an additional page
-flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a page is
-faulted into a VM_LOCKED VMA, or found in a VMA being VM_LOCKED.
-
-
-Vmscan's Handling of Unevictable Pages
---------------------------------------
-
-If unevictable pages are culled in the fault path, or moved to the unevictable
-list at mlock() or mmap() time, vmscan will not encounter the pages until they
-have become evictable again (via munlock() for example) and have been "rescued"
-from the unevictable list.  However, there may be situations where we decide,
-for the sake of expediency, to leave an unevictable page on one of the regular
-active/inactive LRU lists for vmscan to deal with.  vmscan checks for such
-pages in all of the shrink_{active|inactive|page}_list() functions and will
-"cull" such pages that it encounters: that is, it diverts those pages to the
-unevictable list for the memory cgroup and node being scanned.
-
-There may be situations where a page is mapped into a VM_LOCKED VMA, but the
-page is not marked as PG_mlocked.  Such pages will make it all the way to
-shrink_active_list() or shrink_page_list() where they will be detected when
-vmscan walks the reverse map in page_referenced() or try_to_unmap().  The page
-is culled to the unevictable list when it is released by the shrinker.
-
-To "cull" an unevictable page, vmscan simply puts the page back on the LRU list
-using putback_lru_page() - the inverse operation to isolate_lru_page() - after
-dropping the page lock.  Because the condition which makes the page unevictable
-may change once the page is unlocked, __pagevec_lru_add_fn() will recheck the
-unevictable state of a page before placing it on the unevictable list.
-
-
-MLOCKED Pages
-=============
-
-The unevictable page list is also useful for mlock(), in addition to ramfs and
-SYSV SHM.  Note that mlock() is only available in CONFIG_MMU=y situations; in
-NOMMU situations, all mappings are effectively mlocked.
-
-
-History
--------
-
-The "Unevictable mlocked Pages" infrastructure is based on work originally
-posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU".
-Nick posted his patch as an alternative to a patch posted by Christoph Lameter
-to achieve the same objective: hiding mlocked pages from vmscan.
-
-In Nick's patch, he used one of the struct page LRU list link fields as a count
-of VM_LOCKED VMAs that map the page (Rik van Riel had the same idea three years
-earlier).  But this use of the link field for a count prevented the management
-of the pages on an LRU list, and thus mlocked pages were not migratable as
-isolate_lru_page() could not detect them, and the LRU list link field was not
-available to the migration subsystem.
-
-Nick resolved this by putting mlocked pages back on the LRU list before
-attempting to isolate them, thus abandoning the count of VM_LOCKED VMAs.  When
-Nick's patch was integrated with the Unevictable LRU work, the count was
-replaced by walking the reverse map when munlocking, to determine whether any
-other VM_LOCKED VMAs still mapped the page.
-
-However, walking the reverse map for each page when munlocking was ugly and
-inefficient, and could lead to catastrophic contention on a file's rmap lock,
-when many processes which had it mlocked were trying to exit.  In 5.18, the
-idea of keeping mlock_count in Unevictable LRU list link field was revived and
-put to work, without preventing the migration of mlocked pages.  This is why
-the "Unevictable LRU list" cannot be a linked list of pages now; but there was
-no use for that linked list anyway - though its size is maintained for meminfo.
-
-
-Basic Management
-----------------
-
-mlocked pages - pages mapped into a VM_LOCKED VMA - are a class of unevictable
-pages.  When such a page has been "noticed" by the memory management subsystem,
-the page is marked with the PG_mlocked flag.  This can be manipulated using the
-PageMlocked() functions.
-
-A PG_mlocked page will be placed on the unevictable list when it is added to
-the LRU.  Such pages can be "noticed" by memory management in several places:
-
- (1) in the mlock()/mlock2()/mlockall() system call handlers;
-
- (2) in the mmap() system call handler when mmapping a region with the
-     MAP_LOCKED flag;
-
- (3) mmapping a region in a task that has called mlockall() with the MCL_FUTURE
-     flag;
-
- (4) in the fault path and when a VM_LOCKED stack segment is expanded; or
-
- (5) as mentioned above, in vmscan:shrink_page_list() when attempting to
-     reclaim a page in a VM_LOCKED VMA by page_referenced() or try_to_unmap().
-
-mlocked pages become unlocked and rescued from the unevictable list when:
-
- (1) mapped in a range unlocked via the munlock()/munlockall() system calls;
-
- (2) munmap()'d out of the last VM_LOCKED VMA that maps the page, including
-     unmapping at task exit;
-
- (3) when the page is truncated from the last VM_LOCKED VMA of an mmapped file;
-     or
-
- (4) before a page is COW'd in a VM_LOCKED VMA.
-
-
-mlock()/mlock2()/mlockall() System Call Handling
-------------------------------------------------
-
-mlock(), mlock2() and mlockall() system call handlers proceed to mlock_fixup()
-for each VMA in the range specified by the call.  In the case of mlockall(),
-this is the entire active address space of the task.  Note that mlock_fixup()
-is used for both mlocking and munlocking a range of memory.  A call to mlock()
-an already VM_LOCKED VMA, or to munlock() a VMA that is not VM_LOCKED, is
-treated as a no-op and mlock_fixup() simply returns.
-
-If the VMA passes some filtering as described in "Filtering Special VMAs"
-below, mlock_fixup() will attempt to merge the VMA with its neighbors or split
-off a subset of the VMA if the range does not cover the entire VMA.  Any pages
-already present in the VMA are then marked as mlocked by mlock_page() via
-mlock_pte_range() via walk_page_range() via mlock_vma_pages_range().
-
-Before returning from the system call, do_mlock() or mlockall() will call
-__mm_populate() to fault in the remaining pages via get_user_pages() and to
-mark those pages as mlocked as they are faulted.
-
-Note that the VMA being mlocked might be mapped with PROT_NONE.  In this case,
-get_user_pages() will be unable to fault in the pages.  That's okay.  If pages
-do end up getting faulted into this VM_LOCKED VMA, they will be handled in the
-fault path - which is also how mlock2()'s MLOCK_ONFAULT areas are handled.
-
-For each PTE (or PMD) being faulted into a VMA, the page add rmap function
-calls mlock_vma_page(), which calls mlock_page() when the VMA is VM_LOCKED
-(unless it is a PTE mapping of a part of a transparent huge page).  Or when
-it is a newly allocated anonymous page, lru_cache_add_inactive_or_unevictable()
-calls mlock_new_page() instead: similar to mlock_page(), but can make better
-judgments, since this page is held exclusively and known not to be on LRU yet.
-
-mlock_page() sets PageMlocked immediately, then places the page on the CPU's
-mlock pagevec, to batch up the rest of the work to be done under lru_lock by
-__mlock_page().  __mlock_page() sets PageUnevictable, initializes mlock_count
-and moves the page to unevictable state ("the unevictable LRU", but with
-mlock_count in place of LRU threading).  Or if the page was already PageLRU
-and PageUnevictable and PageMlocked, it simply increments the mlock_count.
-
-But in practice that may not work ideally: the page may not yet be on an LRU, or
-it may have been temporarily isolated from LRU.  In such cases the mlock_count
-field cannot be touched, but will be set to 0 later when __pagevec_lru_add_fn()
-returns the page to "LRU".  Races prohibit mlock_count from being set to 1 then:
-rather than risk stranding a page indefinitely as unevictable, always err with
-mlock_count on the low side, so that when munlocked the page will be rescued to
-an evictable LRU, then perhaps be mlocked again later if vmscan finds it in a
-VM_LOCKED VMA.
-
-
-Filtering Special VMAs
-----------------------
-
-mlock_fixup() filters several classes of "special" VMAs:
-
-1) VMAs with VM_IO or VM_PFNMAP set are skipped entirely.  The pages behind
-   these mappings are inherently pinned, so we don't need to mark them as
-   mlocked.  In any case, most of the pages have no struct page in which to so
-   mark the page.  Because of this, get_user_pages() will fail for these VMAs,
-   so there is no sense in attempting to visit them.
-
-2) VMAs mapping hugetlbfs page are already effectively pinned into memory.  We
-   neither need nor want to mlock() these pages.  But __mm_populate() includes
-   hugetlbfs ranges, allocating the huge pages and populating the PTEs.
-
-3) VMAs with VM_DONTEXPAND are generally userspace mappings of kernel pages,
-   such as the VDSO page, relay channel pages, etc.  These pages are inherently
-   unevictable and are not managed on the LRU lists.  __mm_populate() includes
-   these ranges, populating the PTEs if not already populated.
-
-4) VMAs with VM_MIXEDMAP set are not marked VM_LOCKED, but __mm_populate()
-   includes these ranges, populating the PTEs if not already populated.
-
-Note that for all of these special VMAs, mlock_fixup() does not set the
-VM_LOCKED flag.  Therefore, we won't have to deal with them later during
-munlock(), munmap() or task exit.  Neither does mlock_fixup() account these
-VMAs against the task's "locked_vm".
-
-
-munlock()/munlockall() System Call Handling
--------------------------------------------
-
-The munlock() and munlockall() system calls are handled by the same
-mlock_fixup() function as mlock(), mlock2() and mlockall() system calls are.
-If called to munlock an already munlocked VMA, mlock_fixup() simply returns.
-Because of the VMA filtering discussed above, VM_LOCKED will not be set in
-any "special" VMAs.  So, those VMAs will be ignored for munlock.
-
-If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the
-specified range.  All pages in the VMA are then munlocked by munlock_page() via
-mlock_pte_range() via walk_page_range() via mlock_vma_pages_range() - the same
-function used when mlocking a VMA range, with new flags for the VMA indicating
-that it is munlock() being performed.
-
-munlock_page() uses the mlock pagevec to batch up work to be done under
-lru_lock by  __munlock_page().  __munlock_page() decrements the page's
-mlock_count, and when that reaches 0 it clears PageMlocked and clears
-PageUnevictable, moving the page from unevictable state to inactive LRU.
-
-But in practice that may not work ideally: the page may not yet have reached
-"the unevictable LRU", or it may have been temporarily isolated from it.  In
-those cases its mlock_count field is unusable and must be assumed to be 0: so
-that the page will be rescued to an evictable LRU, then perhaps be mlocked
-again later if vmscan finds it in a VM_LOCKED VMA.
-
-
-Migrating MLOCKED Pages
------------------------
-
-A page that is being migrated has been isolated from the LRU lists and is held
-locked across unmapping of the page, updating the page's address space entry
-and copying the contents and state, until the page table entry has been
-replaced with an entry that refers to the new page.  Linux supports migration
-of mlocked pages and other unevictable pages.  PG_mlocked is cleared from the
-the old page when it is unmapped from the last VM_LOCKED VMA, and set when the
-new page is mapped in place of migration entry in a VM_LOCKED VMA.  If the page
-was unevictable because mlocked, PG_unevictable follows PG_mlocked; but if the
-page was unevictable for other reasons, PG_unevictable is copied explicitly.
-
-Note that page migration can race with mlocking or munlocking of the same page.
-There is mostly no problem since page migration requires unmapping all PTEs of
-the old page (including munlock where VM_LOCKED), then mapping in the new page
-(including mlock where VM_LOCKED).  The page table locks provide sufficient
-synchronization.
-
-However, since mlock_vma_pages_range() starts by setting VM_LOCKED on a VMA,
-before mlocking any pages already present, if one of those pages were migrated
-before mlock_pte_range() reached it, it would get counted twice in mlock_count.
-To prevent that, mlock_vma_pages_range() temporarily marks the VMA as VM_IO,
-so that mlock_vma_page() will skip it.
-
-To complete page migration, we place the old and new pages back onto the LRU
-afterwards.  The "unneeded" page - old page on success, new page on failure -
-is freed when the reference count held by the migration process is released.
-
-
-Compacting MLOCKED Pages
-------------------------
-
-The memory map can be scanned for compactable regions and the default behavior
-is to let unevictable pages be moved.  /proc/sys/vm/compact_unevictable_allowed
-controls this behavior (see Documentation/admin-guide/sysctl/vm.rst).  The work
-of compaction is mostly handled by the page migration code and the same work
-flow as described in Migrating MLOCKED Pages will apply.
-
-
-MLOCKING Transparent Huge Pages
--------------------------------
-
-A transparent huge page is represented by a single entry on an LRU list.
-Therefore, we can only make unevictable an entire compound page, not
-individual subpages.
-
-If a user tries to mlock() part of a huge page, and no user mlock()s the
-whole of the huge page, we want the rest of the page to be reclaimable.
-
-We cannot just split the page on partial mlock() as split_huge_page() can
-fail and a new intermittent failure mode for the syscall is undesirable.
-
-We handle this by keeping PTE-mlocked huge pages on evictable LRU lists:
-the PMD on the border of a VM_LOCKED VMA will be split into a PTE table.
-
-This way the huge page is accessible for vmscan.  Under memory pressure the
-page will be split, subpages which belong to VM_LOCKED VMAs will be moved
-to the unevictable LRU and the rest can be reclaimed.
-
-/proc/meminfo's Unevictable and Mlocked amounts do not include those parts
-of a transparent huge page which are mapped only by PTEs in VM_LOCKED VMAs.
-
-
-mmap(MAP_LOCKED) System Call Handling
--------------------------------------
-
-In addition to the mlock(), mlock2() and mlockall() system calls, an application
-can request that a region of memory be mlocked by supplying the MAP_LOCKED flag
-to the mmap() call.  There is one important and subtle difference here, though.
-mmap() + mlock() will fail if the range cannot be faulted in (e.g. because
-mm_populate fails) and returns with ENOMEM while mmap(MAP_LOCKED) will not fail.
-The mmaped area will still have properties of the locked area - pages will not
-get swapped out - but major page faults to fault memory in might still happen.
-
-Furthermore, any mmap() call or brk() call that expands the heap by a task
-that has previously called mlockall() with the MCL_FUTURE flag will result
-in the newly mapped memory being mlocked.  Before the unevictable/mlock
-changes, the kernel simply called make_pages_present() to allocate pages
-and populate the page table.
-
-To mlock a range of memory under the unevictable/mlock infrastructure,
-the mmap() handler and task address space expansion functions call
-populate_vma_page_range() specifying the vma and the address range to mlock.
-
-
-munmap()/exit()/exec() System Call Handling
--------------------------------------------
-
-When unmapping an mlocked region of memory, whether by an explicit call to
-munmap() or via an internal unmap from exit() or exec() processing, we must
-munlock the pages if we're removing the last VM_LOCKED VMA that maps the pages.
-Before the unevictable/mlock changes, mlocking did not mark the pages in any
-way, so unmapping them required no processing.
-
-For each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls
-munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED
-(unless it was a PTE mapping of a part of a transparent huge page).
-
-munlock_page() uses the mlock pagevec to batch up work to be done under
-lru_lock by  __munlock_page().  __munlock_page() decrements the page's
-mlock_count, and when that reaches 0 it clears PageMlocked and clears
-PageUnevictable, moving the page from unevictable state to inactive LRU.
-
-But in practice that may not work ideally: the page may not yet have reached
-"the unevictable LRU", or it may have been temporarily isolated from it.  In
-those cases its mlock_count field is unusable and must be assumed to be 0: so
-that the page will be rescued to an evictable LRU, then perhaps be mlocked
-again later if vmscan finds it in a VM_LOCKED VMA.
-
-
-Truncating MLOCKED Pages
-------------------------
-
-File truncation or hole punching forcibly unmaps the deleted pages from
-userspace; truncation even unmaps and deletes any private anonymous pages
-which had been Copied-On-Write from the file pages now being truncated.
-
-Mlocked pages can be munlocked and deleted in this way: like with munmap(),
-for each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls
-munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED
-(unless it was a PTE mapping of a part of a transparent huge page).
-
-However, if there is a racing munlock(), since mlock_vma_pages_range() starts
-munlocking by clearing VM_LOCKED from a VMA, before munlocking all the pages
-present, if one of those pages were unmapped by truncation or hole punch before
-mlock_pte_range() reached it, it would not be recognized as mlocked by this VMA,
-and would not be counted out of mlock_count.  In this rare case, a page may
-still appear as PageMlocked after it has been fully unmapped: and it is left to
-release_pages() (or __page_cache_release()) to clear it and update statistics
-before freeing (this event is counted in /proc/vmstat unevictable_pgs_cleared,
-which is usually 0).
-
-
-Page Reclaim in shrink_*_list()
--------------------------------
-
-vmscan's shrink_active_list() culls any obviously unevictable pages -
-i.e. !page_evictable(page) pages - diverting those to the unevictable list.
-However, shrink_active_list() only sees unevictable pages that made it onto the
-active/inactive LRU lists.  Note that these pages do not have PageUnevictable
-set - otherwise they would be on the unevictable list and shrink_active_list()
-would never see them.
-
-Some examples of these unevictable pages on the LRU lists are:
-
- (1) ramfs pages that have been placed on the LRU lists when first allocated.
-
- (2) SHM_LOCK'd shared memory pages.  shmctl(SHM_LOCK) does not attempt to
-     allocate or fault in the pages in the shared memory region.  This happens
-     when an application accesses the page the first time after SHM_LOCK'ing
-     the segment.
-
- (3) pages still mapped into VM_LOCKED VMAs, which should be marked mlocked,
-     but events left mlock_count too low, so they were munlocked too early.
-
-vmscan's shrink_inactive_list() and shrink_page_list() also divert obviously
-unevictable pages found on the inactive lists to the appropriate memory cgroup
-and node unevictable list.
-
-rmap's page_referenced_one(), called via vmscan's shrink_active_list() or
-shrink_page_list(), and rmap's try_to_unmap_one() called via shrink_page_list(),
-check for (3) pages still mapped into VM_LOCKED VMAs, and call mlock_vma_page()
-to correct them.  Such pages are culled to the unevictable list when released
-by the shrinker.
diff --git a/Documentation/vm/vmalloc.rst b/Documentation/vm/vmalloc.rst
deleted file mode 100644
index 363fe20d6b9f..000000000000
--- a/Documentation/vm/vmalloc.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-======================================
-Virtually Contiguous Memory Allocation
-======================================
diff --git a/Documentation/vm/vmalloced-kernel-stacks.rst b/Documentation/vm/vmalloced-kernel-stacks.rst
deleted file mode 100644
index fc8c67833af6..000000000000
--- a/Documentation/vm/vmalloced-kernel-stacks.rst
+++ /dev/null
@@ -1,153 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-=====================================
-Virtually Mapped Kernel Stack Support
-=====================================
-
-:Author: Shuah Khan <skhan@linuxfoundation.org>
-
-.. contents:: :local:
-
-Overview
---------
-
-This is a compilation of information from the code and original patch
-series that introduced the `Virtually Mapped Kernel Stacks feature
-<https://lwn.net/Articles/694348/>`
-
-Introduction
-------------
-
-Kernel stack overflows are often hard to debug and make the kernel
-susceptible to exploits. Problems could show up at a later time making
-it difficult to isolate and root-cause.
-
-Virtually-mapped kernel stacks with guard pages causes kernel stack
-overflows to be caught immediately rather than causing difficult to
-diagnose corruptions.
-
-HAVE_ARCH_VMAP_STACK and VMAP_STACK configuration options enable
-support for virtually mapped stacks with guard pages. This feature
-causes reliable faults when the stack overflows. The usability of
-the stack trace after overflow and response to the overflow itself
-is architecture dependent.
-
-.. note::
-        As of this writing, arm64, powerpc, riscv, s390, um, and x86 have
-        support for VMAP_STACK.
-
-HAVE_ARCH_VMAP_STACK
---------------------
-
-Architectures that can support Virtually Mapped Kernel Stacks should
-enable this bool configuration option. The requirements are:
-
-- vmalloc space must be large enough to hold many kernel stacks. This
-  may rule out many 32-bit architectures.
-- Stacks in vmalloc space need to work reliably.  For example, if
-  vmap page tables are created on demand, either this mechanism
-  needs to work while the stack points to a virtual address with
-  unpopulated page tables or arch code (switch_to() and switch_mm(),
-  most likely) needs to ensure that the stack's page table entries
-  are populated before running on a possibly unpopulated stack.
-- If the stack overflows into a guard page, something reasonable
-  should happen. The definition of "reasonable" is flexible, but
-  instantly rebooting without logging anything would be unfriendly.
-
-VMAP_STACK
-----------
-
-VMAP_STACK bool configuration option when enabled allocates virtually
-mapped task stacks. This option depends on HAVE_ARCH_VMAP_STACK.
-
-- Enable this if you want the use virtually-mapped kernel stacks
-  with guard pages. This causes kernel stack overflows to be caught
-  immediately rather than causing difficult-to-diagnose corruption.
-
-.. note::
-
-        Using this feature with KASAN requires architecture support
-        for backing virtual mappings with real shadow memory, and
-        KASAN_VMALLOC must be enabled.
-
-.. note::
-
-        VMAP_STACK is enabled, it is not possible to run DMA on stack
-        allocated data.
-
-Kernel configuration options and dependencies keep changing. Refer to
-the latest code base:
-
-`Kconfig <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/Kconfig>`
-
-Allocation
------------
-
-When a new kernel thread is created, thread stack is allocated from
-virtually contiguous memory pages from the page level allocator. These
-pages are mapped into contiguous kernel virtual space with PAGE_KERNEL
-protections.
-
-alloc_thread_stack_node() calls __vmalloc_node_range() to allocate stack
-with PAGE_KERNEL protections.
-
-- Allocated stacks are cached and later reused by new threads, so memcg
-  accounting is performed manually on assigning/releasing stacks to tasks.
-  Hence, __vmalloc_node_range is called without __GFP_ACCOUNT.
-- vm_struct is cached to be able to find when thread free is initiated
-  in interrupt context. free_thread_stack() can be called in interrupt
-  context.
-- On arm64, all VMAP's stacks need to have the same alignment to ensure
-  that VMAP'd stack overflow detection works correctly. Arch specific
-  vmap stack allocator takes care of this detail.
-- This does not address interrupt stacks - according to the original patch
-
-Thread stack allocation is initiated from clone(), fork(), vfork(),
-kernel_thread() via kernel_clone(). Leaving a few hints for searching
-the code base to understand when and how thread stack is allocated.
-
-Bulk of the code is in:
-`kernel/fork.c <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/fork.c>`.
-
-stack_vm_area pointer in task_struct keeps track of the virtually allocated
-stack and a non-null stack_vm_area pointer serves as a indication that the
-virtually mapped kernel stacks are enabled.
-
-::
-
-        struct vm_struct *stack_vm_area;
-
-Stack overflow handling
------------------------
-
-Leading and trailing guard pages help detect stack overflows. When stack
-overflows into the guard pages, handlers have to be careful not overflow
-the stack again. When handlers are called, it is likely that very little
-stack space is left.
-
-On x86, this is done by handling the page fault indicating the kernel
-stack overflow on the double-fault stack.
-
-Testing VMAP allocation with guard pages
-----------------------------------------
-
-How do we ensure that VMAP_STACK is actually allocating with a leading
-and trailing guard page? The following lkdtm tests can help detect any
-regressions.
-
-::
-
-        void lkdtm_STACK_GUARD_PAGE_LEADING()
-        void lkdtm_STACK_GUARD_PAGE_TRAILING()
-
-Conclusions
------------
-
-- A percpu cache of vmalloced stacks appears to be a bit faster than a
-  high-order stack allocation, at least when the cache hits.
-- THREAD_INFO_IN_TASK gets rid of arch-specific thread_info entirely and
-  simply embed the thread_info (containing only flags) and 'int cpu' into
-  task_struct.
-- The thread stack can be free'ed as soon as the task is dead (without
-  waiting for RCU) and then, if vmapped stacks are in use, cache the
-  entire stack for reuse on the same cpu.
diff --git a/Documentation/vm/vmemmap_dedup.rst b/Documentation/vm/vmemmap_dedup.rst
deleted file mode 100644
index c9c495f62d12..000000000000
--- a/Documentation/vm/vmemmap_dedup.rst
+++ /dev/null
@@ -1,223 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-=========================================
-A vmemmap diet for HugeTLB and Device DAX
-=========================================
-
-HugeTLB
-=======
-
-The struct page structures (page structs) are used to describe a physical
-page frame. By default, there is a one-to-one mapping from a page frame to
-it's corresponding page struct.
-
-HugeTLB pages consist of multiple base page size pages and is supported by many
-architectures. See Documentation/admin-guide/mm/hugetlbpage.rst for more
-details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB are
-currently supported. Since the base page size on x86 is 4KB, a 2MB HugeTLB page
-consists of 512 base pages and a 1GB HugeTLB page consists of 4096 base pages.
-For each base page, there is a corresponding page struct.
-
-Within the HugeTLB subsystem, only the first 4 page structs are used to
-contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides
-this upper limit. The only 'useful' information in the remaining page structs
-is the compound_head field, and this field is the same for all tail pages.
-
-By removing redundant page structs for HugeTLB pages, memory can be returned
-to the buddy allocator for other uses.
-
-Different architectures support different HugeTLB pages. For example, the
-following table is the HugeTLB page size supported by x86 and arm64
-architectures. Because arm64 supports 4k, 16k, and 64k base pages and
-supports contiguous entries, so it supports many kinds of sizes of HugeTLB
-page.
-
-+--------------+-----------+-----------------------------------------------+
-| Architecture | Page Size |                HugeTLB Page Size              |
-+--------------+-----------+-----------+-----------+-----------+-----------+
-|    x86-64    |    4KB    |    2MB    |    1GB    |           |           |
-+--------------+-----------+-----------+-----------+-----------+-----------+
-|              |    4KB    |   64KB    |    2MB    |    32MB   |    1GB    |
-|              +-----------+-----------+-----------+-----------+-----------+
-|    arm64     |   16KB    |    2MB    |   32MB    |     1GB   |           |
-|              +-----------+-----------+-----------+-----------+-----------+
-|              |   64KB    |    2MB    |  512MB    |    16GB   |           |
-+--------------+-----------+-----------+-----------+-----------+-----------+
-
-When the system boot up, every HugeTLB page has more than one struct page
-structs which size is (unit: pages)::
-
-   struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
-
-Where HugeTLB_Size is the size of the HugeTLB page. We know that the size
-of the HugeTLB page is always n times PAGE_SIZE. So we can get the following
-relationship::
-
-   HugeTLB_Size = n * PAGE_SIZE
-
-Then::
-
-   struct_size = n * PAGE_SIZE / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
-               = n * sizeof(struct page) / PAGE_SIZE
-
-We can use huge mapping at the pud/pmd level for the HugeTLB page.
-
-For the HugeTLB page of the pmd level mapping, then::
-
-   struct_size = n * sizeof(struct page) / PAGE_SIZE
-               = PAGE_SIZE / sizeof(pte_t) * sizeof(struct page) / PAGE_SIZE
-               = sizeof(struct page) / sizeof(pte_t)
-               = 64 / 8
-               = 8 (pages)
-
-Where n is how many pte entries which one page can contains. So the value of
-n is (PAGE_SIZE / sizeof(pte_t)).
-
-This optimization only supports 64-bit system, so the value of sizeof(pte_t)
-is 8. And this optimization also applicable only when the size of struct page
-is a power of two. In most cases, the size of struct page is 64 bytes (e.g.
-x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the
-size of struct page structs of it is 8 page frames which size depends on the
-size of the base page.
-
-For the HugeTLB page of the pud level mapping, then::
-
-   struct_size = PAGE_SIZE / sizeof(pmd_t) * struct_size(pmd)
-               = PAGE_SIZE / 8 * 8 (pages)
-               = PAGE_SIZE (pages)
-
-Where the struct_size(pmd) is the size of the struct page structs of a
-HugeTLB page of the pmd level mapping.
-
-E.g.: A 2MB HugeTLB page on x86_64 consists in 8 page frames while 1GB
-HugeTLB page consists in 4096.
-
-Next, we take the pmd level mapping of the HugeTLB page as an example to
-show the internal implementation of this optimization. There are 8 pages
-struct page structs associated with a HugeTLB page which is pmd mapped.
-
-Here is how things look before optimization::
-
-    HugeTLB                  struct pages(8 pages)         page frame(8 pages)
- +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
- |           |                     |     0     | -------------> |     0     |
- |           |                     +-----------+                +-----------+
- |           |                     |     1     | -------------> |     1     |
- |           |                     +-----------+                +-----------+
- |           |                     |     2     | -------------> |     2     |
- |           |                     +-----------+                +-----------+
- |           |                     |     3     | -------------> |     3     |
- |           |                     +-----------+                +-----------+
- |           |                     |     4     | -------------> |     4     |
- |    PMD    |                     +-----------+                +-----------+
- |   level   |                     |     5     | -------------> |     5     |
- |  mapping  |                     +-----------+                +-----------+
- |           |                     |     6     | -------------> |     6     |
- |           |                     +-----------+                +-----------+
- |           |                     |     7     | -------------> |     7     |
- |           |                     +-----------+                +-----------+
- |           |
- |           |
- |           |
- +-----------+
-
-The value of page->compound_head is the same for all tail pages. The first
-page of page structs (page 0) associated with the HugeTLB page contains the 4
-page structs necessary to describe the HugeTLB. The only use of the remaining
-pages of page structs (page 1 to page 7) is to point to page->compound_head.
-Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs
-will be used for each HugeTLB page. This will allow us to free the remaining
-7 pages to the buddy allocator.
-
-Here is how things look after remapping::
-
-    HugeTLB                  struct pages(8 pages)         page frame(8 pages)
- +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
- |           |                     |     0     | -------------> |     0     |
- |           |                     +-----------+                +-----------+
- |           |                     |     1     | ---------------^ ^ ^ ^ ^ ^ ^
- |           |                     +-----------+                  | | | | | |
- |           |                     |     2     | -----------------+ | | | | |
- |           |                     +-----------+                    | | | | |
- |           |                     |     3     | -------------------+ | | | |
- |           |                     +-----------+                      | | | |
- |           |                     |     4     | ---------------------+ | | |
- |    PMD    |                     +-----------+                        | | |
- |   level   |                     |     5     | -----------------------+ | |
- |  mapping  |                     +-----------+                          | |
- |           |                     |     6     | -------------------------+ |
- |           |                     +-----------+                            |
- |           |                     |     7     | ---------------------------+
- |           |                     +-----------+
- |           |
- |           |
- |           |
- +-----------+
-
-When a HugeTLB is freed to the buddy system, we should allocate 7 pages for
-vmemmap pages and restore the previous mapping relationship.
-
-For the HugeTLB page of the pud level mapping. It is similar to the former.
-We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages.
-
-Apart from the HugeTLB page of the pmd/pud level mapping, some architectures
-(e.g. aarch64) provides a contiguous bit in the translation table entries
-that hints to the MMU to indicate that it is one of a contiguous set of
-entries that can be cached in a single TLB entry.
-
-The contiguous bit is used to increase the mapping size at the pmd and pte
-(last) level. So this type of HugeTLB page can be optimized only when its
-size of the struct page structs is greater than 1 page.
-
-Notice: The head vmemmap page is not freed to the buddy allocator and all
-tail vmemmap pages are mapped to the head vmemmap page frame. So we can see
-more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page)
-associated with each HugeTLB page. The compound_head() can handle this
-correctly (more details refer to the comment above compound_head()).
-
-Device DAX
-==========
-
-The device-dax interface uses the same tail deduplication technique explained
-in the previous chapter, except when used with the vmemmap in
-the device (altmap).
-
-The following page sizes are supported in DAX: PAGE_SIZE (4K on x86_64),
-PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64).
-
-The differences with HugeTLB are relatively minor.
-
-It only use 3 page structs for storing all information as opposed
-to 4 on HugeTLB pages.
-
-There's no remapping of vmemmap given that device-dax memory is not part of
-System RAM ranges initialized at boot. Thus the tail page deduplication
-happens at a later stage when we populate the sections. HugeTLB reuses the
-the head vmemmap page representing, whereas device-dax reuses the tail
-vmemmap page. This results in only half of the savings compared to HugeTLB.
-
-Deduplicated tail pages are not mapped read-only.
-
-Here's how things look like on device-dax after the sections are populated::
-
- +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
- |           |                     |     0     | -------------> |     0     |
- |           |                     +-----------+                +-----------+
- |           |                     |     1     | -------------> |     1     |
- |           |                     +-----------+                +-----------+
- |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
- |           |                     +-----------+                   | | | | |
- |           |                     |     3     | ------------------+ | | | |
- |           |                     +-----------+                     | | | |
- |           |                     |     4     | --------------------+ | | |
- |    PMD    |                     +-----------+                       | | |
- |   level   |                     |     5     | ----------------------+ | |
- |  mapping  |                     +-----------+                         | |
- |           |                     |     6     | ------------------------+ |
- |           |                     +-----------+                           |
- |           |                     |     7     | --------------------------+
- |           |                     +-----------+
- |           |
- |           |
- |           |
- +-----------+
diff --git a/Documentation/vm/z3fold.rst b/Documentation/vm/z3fold.rst
deleted file mode 100644
index 224e3c61d686..000000000000
--- a/Documentation/vm/z3fold.rst
+++ /dev/null
@@ -1,30 +0,0 @@
-.. _z3fold:
-
-======
-z3fold
-======
-
-z3fold is a special purpose allocator for storing compressed pages.
-It is designed to store up to three compressed pages per physical page.
-It is a zbud derivative which allows for higher compression
-ratio keeping the simplicity and determinism of its predecessor.
-
-The main differences between z3fold and zbud are:
-
-* unlike zbud, z3fold allows for up to PAGE_SIZE allocations
-* z3fold can hold up to 3 compressed pages in its page
-* z3fold doesn't export any API itself and is thus intended to be used
-  via the zpool API.
-
-To keep the determinism and simplicity, z3fold, just like zbud, always
-stores an integral number of compressed pages per page, but it can store
-up to 3 pages unlike zbud which can store at most 2. Therefore the
-compression ratio goes to around 2.7x while zbud's one is around 1.7x.
-
-Unlike zbud (but like zsmalloc for that matter) z3fold_alloc() does not
-return a dereferenceable pointer. Instead, it returns an unsigned long
-handle which encodes actual location of the allocated object.
-
-Keeping effective compression ratio close to zsmalloc's, z3fold doesn't
-depend on MMU enabled and provides more predictable reclaim behavior
-which makes it a better fit for small and response-critical systems.
diff --git a/Documentation/vm/zsmalloc.rst b/Documentation/vm/zsmalloc.rst
deleted file mode 100644
index 6e79893d6132..000000000000
--- a/Documentation/vm/zsmalloc.rst
+++ /dev/null
@@ -1,82 +0,0 @@
-.. _zsmalloc:
-
-========
-zsmalloc
-========
-
-This allocator is designed for use with zram. Thus, the allocator is
-supposed to work well under low memory conditions. In particular, it
-never attempts higher order page allocation which is very likely to
-fail under memory pressure. On the other hand, if we just use single
-(0-order) pages, it would suffer from very high fragmentation --
-any object of size PAGE_SIZE/2 or larger would occupy an entire page.
-This was one of the major issues with its predecessor (xvmalloc).
-
-To overcome these issues, zsmalloc allocates a bunch of 0-order pages
-and links them together using various 'struct page' fields. These linked
-pages act as a single higher-order page i.e. an object can span 0-order
-page boundaries. The code refers to these linked pages as a single entity
-called zspage.
-
-For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
-since this satisfies the requirements of all its current users (in the
-worst case, page is incompressible and is thus stored "as-is" i.e. in
-uncompressed form). For allocation requests larger than this size, failure
-is returned (see zs_malloc).
-
-Additionally, zs_malloc() does not return a dereferenceable pointer.
-Instead, it returns an opaque handle (unsigned long) which encodes actual
-location of the allocated object. The reason for this indirection is that
-zsmalloc does not keep zspages permanently mapped since that would cause
-issues on 32-bit systems where the VA region for kernel space mappings
-is very small. So, before using the allocating memory, the object has to
-be mapped using zs_map_object() to get a usable pointer and subsequently
-unmapped using zs_unmap_object().
-
-stat
-====
-
-With CONFIG_ZSMALLOC_STAT, we could see zsmalloc internal information via
-``/sys/kernel/debug/zsmalloc/<user name>``. Here is a sample of stat output::
-
- # cat /sys/kernel/debug/zsmalloc/zram0/classes
-
- class  size almost_full almost_empty obj_allocated   obj_used pages_used pages_per_zspage
-    ...
-    ...
-     9   176           0            1           186        129          8                4
-    10   192           1            0          2880       2872        135                3
-    11   208           0            1           819        795         42                2
-    12   224           0            1           219        159         12                4
-    ...
-    ...
-
-
-class
-	index
-size
-	object size zspage stores
-almost_empty
-	the number of ZS_ALMOST_EMPTY zspages(see below)
-almost_full
-	the number of ZS_ALMOST_FULL zspages(see below)
-obj_allocated
-	the number of objects allocated
-obj_used
-	the number of objects allocated to the user
-pages_used
-	the number of pages allocated for the class
-pages_per_zspage
-	the number of 0-order pages to make a zspage
-
-We assign a zspage to ZS_ALMOST_EMPTY fullness group when n <= N / f, where
-
-* n = number of allocated objects
-* N = total number of objects zspage can store
-* f = fullness_threshold_frac(ie, 4 at the moment)
-
-Similarly, we assign zspage to:
-
-* ZS_ALMOST_FULL  when n > N / f
-* ZS_EMPTY        when n == 0
-* ZS_FULL         when n == N
diff --git a/MAINTAINERS b/MAINTAINERS
index fe5daf141501..55fb1daa9057 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5526,7 +5526,7 @@ L:	linux-mm@kvack.org
 S:	Maintained
 F:	Documentation/ABI/testing/sysfs-kernel-mm-damon
 F:	Documentation/admin-guide/mm/damon/
-F:	Documentation/vm/damon/
+F:	Documentation/mm/damon/
 F:	include/linux/damon.h
 F:	include/trace/events/damon.h
 F:	mm/damon/
@@ -9037,7 +9037,7 @@ HMM - Heterogeneous Memory Management
 M:	Jérôme Glisse <jglisse@redhat.com>
 L:	linux-mm@kvack.org
 S:	Maintained
-F:	Documentation/vm/hmm.rst
+F:	Documentation/mm/hmm.rst
 F:	include/linux/hmm*
 F:	lib/test_hmm*
 F:	mm/hmm*
@@ -9135,8 +9135,8 @@ L:	linux-mm@kvack.org
 S:	Maintained
 F:	Documentation/ABI/testing/sysfs-kernel-mm-hugepages
 F:	Documentation/admin-guide/mm/hugetlbpage.rst
-F:	Documentation/vm/hugetlbfs_reserv.rst
-F:	Documentation/vm/vmemmap_dedup.rst
+F:	Documentation/mm/hugetlbfs_reserv.rst
+F:	Documentation/mm/vmemmap_dedup.rst
 F:	fs/hugetlbfs/
 F:	include/linux/hugetlb.h
 F:	mm/hugetlb.c
@@ -15072,7 +15072,7 @@ M:	Pasha Tatashin <pasha.tatashin@soleen.com>
 M:	Andrew Morton <akpm@linux-foundation.org>
 L:	linux-mm@kvack.org
 S:	Maintained
-F:	Documentation/vm/page_table_check.rst
+F:	Documentation/mm/page_table_check.rst
 F:	include/linux/page_table_check.h
 F:	mm/page_table_check.c
 
@@ -22158,7 +22158,7 @@ M:	Nitin Gupta <ngupta@vflare.org>
 R:	Sergey Senozhatsky <senozhatsky@chromium.org>
 L:	linux-mm@kvack.org
 S:	Maintained
-F:	Documentation/vm/zsmalloc.rst
+F:	Documentation/mm/zsmalloc.rst
 F:	include/linux/zsmalloc.h
 F:	mm/zsmalloc.c
 
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 1920d52653b4..db2838cf8c02 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -410,7 +410,7 @@ config ARCH_SPARSEMEM_ENABLE
 	  Say Y to support efficient handling of sparse physical memory,
 	  for architectures which are either NUMA (Non-Uniform Memory Access)
 	  or have huge holes in the physical address space for other reasons.
-	  See <file:Documentation/vm/numa.rst> for more.
+	  See <file:Documentation/mm/numa.rst> for more.
 
 config ARCH_ENABLE_THP_MIGRATION
 	def_bool y
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index cb9d5fd39d7f..392ff48f77df 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1273,7 +1273,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
  * should return true.
  * We should not call this on a hugetlb entry. We should check for HugeTLB
  * entry using vma->vm_flags
- * The page table walk rule is explained in Documentation/vm/transhuge.rst
+ * The page table walk rule is explained in Documentation/mm/transhuge.rst
  */
 static inline int pmd_trans_huge(pmd_t pmd)
 {
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index d5a6f101f843..126a36571667 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -4,7 +4,7 @@
  *
  * Authors: Jérôme Glisse <jglisse@redhat.com>
  *
- * See Documentation/vm/hmm.rst for reasons and overview of what HMM is.
+ * See Documentation/mm/hmm.rst for reasons and overview of what HMM is.
  */
 #ifndef LINUX_HMM_H
 #define LINUX_HMM_H
@@ -100,7 +100,7 @@ struct hmm_range {
 };
 
 /*
- * Please see Documentation/vm/hmm.rst for how to use the range API.
+ * Please see Documentation/mm/hmm.rst for how to use the range API.
  */
 int hmm_range_fault(struct hmm_range *range);
 
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 8af304f6b504..9f5ee49482de 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -39,7 +39,7 @@ struct vmem_altmap {
  * must be treated as an opaque object, rather than a "normal" struct page.
  *
  * A more complete discussion of unaddressable memory may be found in
- * include/linux/hmm.h and Documentation/vm/hmm.rst.
+ * include/linux/hmm.h and Documentation/mm/hmm.rst.
  *
  * MEMORY_DEVICE_FS_DAX:
  * Host memory that has similar access semantics as System RAM i.e. DMA
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 45fc2c81e370..d6c06e140277 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -198,7 +198,7 @@ struct mmu_notifier_ops {
 	 * invalidate_range_start()/end() notifiers, as
 	 * invalidate_range() already catches the points in time when an
 	 * external TLB range needs to be flushed. For more in depth
-	 * discussion on this see Documentation/vm/mmu_notifier.rst
+	 * discussion on this see Documentation/mm/mmu_notifier.rst
 	 *
 	 * Note that this function might be called with just a sub-range
 	 * of what was passed to invalidate_range_start()/end(), if
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 8cd975a8bfeb..2a243616f222 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -29,7 +29,7 @@ extern struct mm_struct *mm_alloc(void);
  *
  * Use mmdrop() to release the reference acquired by mmgrab().
  *
- * See also <Documentation/vm/active_mm.rst> for an in-depth explanation
+ * See also <Documentation/mm/active_mm.rst> for an in-depth explanation
  * of &mm_struct.mm_count vs &mm_struct.mm_users.
  */
 static inline void mmgrab(struct mm_struct *mm)
@@ -92,7 +92,7 @@ static inline void mmdrop_sched(struct mm_struct *mm)
  *
  * Use mmput() to release the reference acquired by mmget().
  *
- * See also <Documentation/vm/active_mm.rst> for an in-depth explanation
+ * See also <Documentation/mm/active_mm.rst> for an in-depth explanation
  * of &mm_struct.mm_count vs &mm_struct.mm_users.
  */
 static inline void mmget(struct mm_struct *mm)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0c0fed1b348f..95a5b7aa1ae9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -74,7 +74,7 @@ static inline int current_is_kswapd(void)
 
 /*
  * Unaddressable device memory support. See include/linux/hmm.h and
- * Documentation/vm/hmm.rst. Short description is we need struct pages for
+ * Documentation/mm/hmm.rst. Short description is we need struct pages for
  * device memory that is unaddressable (inaccessible) by CPU, so that we can
  * migrate part of a process memory to device memory.
  *
diff --git a/mm/Kconfig b/mm/Kconfig
index 169e64192e48..c1fa4993a56f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -663,7 +663,7 @@ config KSM
 	  the many instances by a single page with that content, so
 	  saving memory until one or another app needs to modify the content.
 	  Recommended for use with KVM, or with other duplicative applications.
-	  See Documentation/vm/ksm.rst for more information: KSM is inactive
+	  See Documentation/mm/ksm.rst for more information: KSM is inactive
 	  until a program has madvised that an area is MADV_MERGEABLE, and
 	  root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
 
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 1ab091f49fc0..dc7df1254f0a 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -35,7 +35,7 @@
 #include <asm/tlbflush.h>
 
 /*
- * Please refer Documentation/vm/arch_pgtable_helpers.rst for the semantics
+ * Please refer Documentation/mm/arch_pgtable_helpers.rst for the semantics
  * expectations that are being validated here. All future changes in here
  * or the documentation need to be in sync.
  */
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 6f69b044a8cc..1a97610308cb 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -4,7 +4,7 @@
  *
  * This code provides the generic "frontend" layer to call a matching
  * "backend" driver implementation of frontswap.  See
- * Documentation/vm/frontswap.rst for more information.
+ * Documentation/mm/frontswap.rst for more information.
  *
  * Copyright (C) 2009-2012 Oracle Corp.  All rights reserved.
  * Author: Dan Magenheimer
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 834f288b3769..f9b90a8d7dfa 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1937,7 +1937,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	 * replacing a zero pmd write protected page with a zero pte write
 	 * protected page.
 	 *
-	 * See Documentation/vm/mmu_notifier.rst
+	 * See Documentation/mm/mmu_notifier.rst
 	 */
 	pmdp_huge_clear_flush(vma, haddr, pmd);
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a57e1be41401..b36a4ef87a2e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4875,7 +4875,7 @@ again:
 				 * table protection not changing it to point
 				 * to a new page.
 				 *
-				 * See Documentation/vm/mmu_notifier.rst
+				 * See Documentation/mm/mmu_notifier.rst
 				 */
 				huge_ptep_set_wrprotect(src, addr, src_pte);
 				entry = huge_pte_wrprotect(entry);
@@ -6403,7 +6403,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	 * No need to call mmu_notifier_invalidate_range() we are downgrading
 	 * page table protection not changing it to point to a new page.
 	 *
-	 * See Documentation/vm/mmu_notifier.rst
+	 * See Documentation/mm/mmu_notifier.rst
 	 */
 	i_mmap_unlock_write(vma->vm_file->f_mapping);
 	mmu_notifier_invalidate_range_end(&range);
@@ -7102,7 +7102,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 	i_mmap_unlock_write(vma->vm_file->f_mapping);
 	/*
 	 * No need to call mmu_notifier_invalidate_range(), see
-	 * Documentation/vm/mmu_notifier.rst.
+	 * Documentation/mm/mmu_notifier.rst.
 	 */
 	mmu_notifier_invalidate_range_end(&range);
 }
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 1089ea8a9c98..ba29c15c53d6 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -6,7 +6,7 @@
  *
  *     Author: Muchun Song <songmuchun@bytedance.com>
  *
- * See Documentation/vm/vmemmap_dedup.rst
+ * See Documentation/mm/vmemmap_dedup.rst
  */
 #define pr_fmt(fmt)	"HugeTLB: " fmt
 
diff --git a/mm/ksm.c b/mm/ksm.c
index 54f78c9eecae..8d2dc501c92c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1083,7 +1083,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 		 * No need to notify as we are downgrading page table to read
 		 * only not changing it to point to a new page.
 		 *
-		 * See Documentation/vm/mmu_notifier.rst
+		 * See Documentation/mm/mmu_notifier.rst
 		 */
 		entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
 		/*
@@ -1186,7 +1186,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 	 * No need to notify as we are replacing a read only page with another
 	 * read only page with the same content.
 	 *
-	 * See Documentation/vm/mmu_notifier.rst
+	 * See Documentation/mm/mmu_notifier.rst
 	 */
 	ptep_clear_flush(vma, addr, ptep);
 	set_pte_at_notify(mm, addr, ptep, newpte);
diff --git a/mm/mmap.c b/mm/mmap.c
index 61e6135c54ef..c14d7286a379 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2944,7 +2944,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 	unsigned long ret = -EINVAL;
 	struct file *file;
 
-	pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.rst.\n",
+	pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n",
 		     current->comm, current->pid);
 
 	if (prot)
diff --git a/mm/rmap.c b/mm/rmap.c
index 5bcb334cd6f2..65e0a767b837 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -999,7 +999,7 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
 		 * downgrading page table protection not changing it to point
 		 * to a new page.
 		 *
-		 * See Documentation/vm/mmu_notifier.rst
+		 * See Documentation/mm/mmu_notifier.rst
 		 */
 		if (ret)
 			cleaned++;
@@ -1765,7 +1765,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			 * to point at a new folio while a device is
 			 * still using this folio.
 			 *
-			 * See Documentation/vm/mmu_notifier.rst
+			 * See Documentation/mm/mmu_notifier.rst
 			 */
 			dec_mm_counter(mm, mm_counter_file(&folio->page));
 		}
@@ -1775,7 +1775,7 @@ discard:
 		 * done above for all cases requiring it to happen under page
 		 * table lock before mmu_notifier_invalidate_range_end()
 		 *
-		 * See Documentation/vm/mmu_notifier.rst
+		 * See Documentation/mm/mmu_notifier.rst
 		 */
 		page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
 		if (vma->vm_flags & VM_LOCKED)
@@ -2093,7 +2093,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 		 * done above for all cases requiring it to happen under page
 		 * table lock before mmu_notifier_invalidate_range_end()
 		 *
-		 * See Documentation/vm/mmu_notifier.rst
+		 * See Documentation/mm/mmu_notifier.rst
 		 */
 		page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
 		if (vma->vm_flags & VM_LOCKED)
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 652f11a05749..3ff88a2eefb8 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -752,7 +752,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
 
 		/*
 		 * Reuse the previous page for the rest of tail pages
-		 * See layout diagram in Documentation/vm/vmemmap_dedup.rst
+		 * See layout diagram in Documentation/mm/vmemmap_dedup.rst
 		 */
 		next += PAGE_SIZE;
 		rc = vmemmap_populate_range(next, last, node, NULL,
diff --git a/mm/util.c b/mm/util.c
index 0837570c9225..5df8f2db7ca9 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1005,7 +1005,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
  * succeed and -ENOMEM implies there is not.
  *
  * We currently support three overcommit policies, which are set via the
- * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting.rst
+ * vm.overcommit_memory sysctl.  See Documentation/mm/overcommit-accounting.rst
  *
  * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
  * Additional code 2002 Jul 20 by Robert Love.
diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c
index c149427eb1c9..74c3dcecf64d 100644
--- a/tools/vm/page_owner_sort.c
+++ b/tools/vm/page_owner_sort.c
@@ -8,7 +8,7 @@
  * Or sort by total memory:
  * ./page_owner_sort -m page_owner_full.txt sorted_page_owner.txt
  *
- * See Documentation/vm/page_owner.rst
+ * See Documentation/mm/page_owner.rst
 */
 
 #include <stdio.h>
-- 
cgit 


From 35d11ec239e0996291b140a61a677210ff854f11 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Date: Thu, 23 Jun 2022 12:24:32 +0200
Subject: scsi: ufs: ufshcd: Constify pointed data

For code safety, constify arrays and pointers to data which is not
modified.

Link: https://lore.kernel.org/r/20220623102432.108059-4-krzysztof.kozlowski@linaro.org
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufshcd-priv.h |  6 +++---
 drivers/ufs/core/ufshcd.c      | 42 ++++++++++++++++++++++--------------------
 include/ufs/ufshcd.h           |  6 +++---
 3 files changed, 28 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/drivers/ufs/core/ufshcd-priv.h b/drivers/ufs/core/ufshcd-priv.h
index ffb01fc6de75..8f67db202d7b 100644
--- a/drivers/ufs/core/ufshcd-priv.h
+++ b/drivers/ufs/core/ufshcd-priv.h
@@ -215,7 +215,7 @@ static inline void ufshcd_vops_config_scaling_param(struct ufs_hba *hba,
 		hba->vops->config_scaling_param(hba, p, data);
 }
 
-extern struct ufs_pm_lvl_states ufs_pm_lvl_states[];
+extern const struct ufs_pm_lvl_states ufs_pm_lvl_states[];
 
 /**
  * ufshcd_scsi_to_upiu_lun - maps scsi LUN to UPIU LUN
@@ -234,8 +234,8 @@ static inline u8 ufshcd_scsi_to_upiu_lun(unsigned int scsi_lun)
 
 int __ufshcd_write_ee_control(struct ufs_hba *hba, u32 ee_ctrl_mask);
 int ufshcd_write_ee_control(struct ufs_hba *hba);
-int ufshcd_update_ee_control(struct ufs_hba *hba, u16 *mask, u16 *other_mask,
-			     u16 set, u16 clr);
+int ufshcd_update_ee_control(struct ufs_hba *hba, u16 *mask,
+			     const u16 *other_mask, u16 set, u16 clr);
 
 static inline int ufshcd_update_ee_drv_mask(struct ufs_hba *hba,
 					    u16 set, u16 clr)
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index f5d5dde497ac..1d3214e6b364 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -175,7 +175,7 @@ enum {
 #define ufshcd_clear_eh_in_progress(h) \
 	((h)->eh_flags &= ~UFSHCD_EH_IN_PROGRESS)
 
-struct ufs_pm_lvl_states ufs_pm_lvl_states[] = {
+const struct ufs_pm_lvl_states ufs_pm_lvl_states[] = {
 	[UFS_PM_LVL_0] = {UFS_ACTIVE_PWR_MODE, UIC_LINK_ACTIVE_STATE},
 	[UFS_PM_LVL_1] = {UFS_ACTIVE_PWR_MODE, UIC_LINK_HIBERN8_STATE},
 	[UFS_PM_LVL_2] = {UFS_SLEEP_PWR_MODE, UIC_LINK_ACTIVE_STATE},
@@ -363,7 +363,7 @@ static void ufshcd_add_tm_upiu_trace(struct ufs_hba *hba, unsigned int tag,
 }
 
 static void ufshcd_add_uic_command_trace(struct ufs_hba *hba,
-					 struct uic_command *ucmd,
+					 const struct uic_command *ucmd,
 					 enum ufs_trace_str_t str_t)
 {
 	u32 cmd;
@@ -443,11 +443,11 @@ static void ufshcd_print_clk_freqs(struct ufs_hba *hba)
 }
 
 static void ufshcd_print_evt(struct ufs_hba *hba, u32 id,
-			     char *err_name)
+			     const char *err_name)
 {
 	int i;
 	bool found = false;
-	struct ufs_event_hist *e;
+	const struct ufs_event_hist *e;
 
 	if (id >= UFS_EVT_CNT)
 		return;
@@ -497,7 +497,7 @@ static void ufshcd_print_evt_hist(struct ufs_hba *hba)
 static
 void ufshcd_print_trs(struct ufs_hba *hba, unsigned long bitmap, bool pr_prdt)
 {
-	struct ufshcd_lrb *lrbp;
+	const struct ufshcd_lrb *lrbp;
 	int prdt_length;
 	int tag;
 
@@ -553,7 +553,7 @@ static void ufshcd_print_tmrs(struct ufs_hba *hba, unsigned long bitmap)
 
 static void ufshcd_print_host_state(struct ufs_hba *hba)
 {
-	struct scsi_device *sdev_ufs = hba->ufs_device_wlun;
+	const struct scsi_device *sdev_ufs = hba->ufs_device_wlun;
 
 	dev_err(hba->dev, "UFS Host state=%d\n", hba->ufshcd_state);
 	dev_err(hba->dev, "outstanding reqs=0x%lx tasks=0x%lx\n",
@@ -1098,7 +1098,7 @@ static bool ufshcd_is_devfreq_scaling_required(struct ufs_hba *hba,
  */
 static u32 ufshcd_pending_cmds(struct ufs_hba *hba)
 {
-	struct scsi_device *sdev;
+	const struct scsi_device *sdev;
 	u32 pending = 0;
 
 	lockdep_assert_held(hba->host->host_lock);
@@ -2069,14 +2069,15 @@ static inline int ufshcd_monitor_opcode2dir(u8 opcode)
 static inline bool ufshcd_should_inform_monitor(struct ufs_hba *hba,
 						struct ufshcd_lrb *lrbp)
 {
-	struct ufs_hba_monitor *m = &hba->monitor;
+	const struct ufs_hba_monitor *m = &hba->monitor;
 
 	return (m->enabled && lrbp && lrbp->cmd &&
 		(!m->chunk_size || m->chunk_size == lrbp->cmd->sdb.length) &&
 		ktime_before(hba->monitor.enabled_ts, lrbp->issue_time_stamp));
 }
 
-static void ufshcd_start_monitor(struct ufs_hba *hba, struct ufshcd_lrb *lrbp)
+static void ufshcd_start_monitor(struct ufs_hba *hba,
+				 const struct ufshcd_lrb *lrbp)
 {
 	int dir = ufshcd_monitor_opcode2dir(*lrbp->cmd->cmnd);
 	unsigned long flags;
@@ -2087,14 +2088,14 @@ static void ufshcd_start_monitor(struct ufs_hba *hba, struct ufshcd_lrb *lrbp)
 	spin_unlock_irqrestore(hba->host->host_lock, flags);
 }
 
-static void ufshcd_update_monitor(struct ufs_hba *hba, struct ufshcd_lrb *lrbp)
+static void ufshcd_update_monitor(struct ufs_hba *hba, const struct ufshcd_lrb *lrbp)
 {
 	int dir = ufshcd_monitor_opcode2dir(*lrbp->cmd->cmnd);
 	unsigned long flags;
 
 	spin_lock_irqsave(hba->host->host_lock, flags);
 	if (dir >= 0 && hba->monitor.nr_queued[dir] > 0) {
-		struct request *req = scsi_cmd_to_rq(lrbp->cmd);
+		const struct request *req = scsi_cmd_to_rq(lrbp->cmd);
 		struct ufs_hba_monitor *m = &hba->monitor;
 		ktime_t now, inc, lat;
 
@@ -4907,7 +4908,7 @@ static int ufshcd_get_lu_wp(struct ufs_hba *hba,
  *
  */
 static inline void ufshcd_get_lu_power_on_wp_status(struct ufs_hba *hba,
-						    struct scsi_device *sdev)
+						    const struct scsi_device *sdev)
 {
 	if (hba->dev_info.f_power_on_wp_en &&
 	    !hba->dev_info.is_lu_power_on_wp) {
@@ -5426,8 +5427,8 @@ int ufshcd_write_ee_control(struct ufs_hba *hba)
 	return err;
 }
 
-int ufshcd_update_ee_control(struct ufs_hba *hba, u16 *mask, u16 *other_mask,
-			     u16 set, u16 clr)
+int ufshcd_update_ee_control(struct ufs_hba *hba, u16 *mask,
+			     const u16 *other_mask, u16 set, u16 clr)
 {
 	u16 new_mask, ee_ctrl_mask;
 	int err = 0;
@@ -7354,7 +7355,8 @@ static int ufshcd_eh_host_reset_handler(struct scsi_cmnd *cmd)
  *
  * Returns calculated max ICC level for specific regulator
  */
-static u32 ufshcd_get_max_icc_level(int sup_curr_uA, u32 start_scan, char *buff)
+static u32 ufshcd_get_max_icc_level(int sup_curr_uA, u32 start_scan,
+				    const char *buff)
 {
 	int i;
 	int curr_uA;
@@ -7401,7 +7403,7 @@ static u32 ufshcd_get_max_icc_level(int sup_curr_uA, u32 start_scan, char *buff)
  * Returns calculated ICC level
  */
 static u32 ufshcd_find_max_sup_active_icc_level(struct ufs_hba *hba,
-							u8 *desc_buf, int len)
+						const u8 *desc_buf, int len)
 {
 	u32 icc_level = 0;
 
@@ -7551,7 +7553,7 @@ out:
 	return ret;
 }
 
-static void ufshcd_wb_probe(struct ufs_hba *hba, u8 *desc_buf)
+static void ufshcd_wb_probe(struct ufs_hba *hba, const u8 *desc_buf)
 {
 	struct ufs_dev_info *dev_info = &hba->dev_info;
 	u8 lun;
@@ -7622,7 +7624,7 @@ wb_disabled:
 	hba->caps &= ~UFSHCD_CAP_WB_EN;
 }
 
-static void ufshcd_temp_notif_probe(struct ufs_hba *hba, u8 *desc_buf)
+static void ufshcd_temp_notif_probe(struct ufs_hba *hba, const u8 *desc_buf)
 {
 	struct ufs_dev_info *dev_info = &hba->dev_info;
 	u32 ext_ufs_feature;
@@ -7856,7 +7858,7 @@ static int ufshcd_quirk_tune_host_pa_tactivate(struct ufs_hba *hba)
 	u32 granularity, peer_granularity;
 	u32 pa_tactivate, peer_pa_tactivate;
 	u32 pa_tactivate_us, peer_pa_tactivate_us;
-	u8 gran_to_us_table[] = {1, 4, 8, 16, 32, 100};
+	static const u8 gran_to_us_table[] = {1, 4, 8, 16, 32, 100};
 
 	ret = ufshcd_dme_get(hba, UIC_ARG_MIB(PA_GRANULARITY),
 				  &granularity);
@@ -7973,7 +7975,7 @@ struct ufs_ref_clk {
 	enum ufs_ref_clk_freq val;
 };
 
-static struct ufs_ref_clk ufs_ref_clk_freqs[] = {
+static const struct ufs_ref_clk ufs_ref_clk_freqs[] = {
 	{19200000, REF_CLK_FREQ_19_2_MHZ},
 	{26000000, REF_CLK_FREQ_26_MHZ},
 	{38400000, REF_CLK_FREQ_38_4_MHZ},
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index b5c9064a11d9..7fe1a926cd99 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -1232,14 +1232,14 @@ static inline int ufshcd_vops_phy_initialization(struct ufs_hba *hba)
 	return 0;
 }
 
-extern struct ufs_pm_lvl_states ufs_pm_lvl_states[];
+extern const struct ufs_pm_lvl_states ufs_pm_lvl_states[];
 
 int ufshcd_dump_regs(struct ufs_hba *hba, size_t offset, size_t len,
 		     const char *prefix);
 
 int __ufshcd_write_ee_control(struct ufs_hba *hba, u32 ee_ctrl_mask);
 int ufshcd_write_ee_control(struct ufs_hba *hba);
-int ufshcd_update_ee_control(struct ufs_hba *hba, u16 *mask, u16 *other_mask,
-			     u16 set, u16 clr);
+int ufshcd_update_ee_control(struct ufs_hba *hba, u16 *mask,
+			     const u16 *other_mask, u16 set, u16 clr);
 
 #endif /* End of Header */
-- 
cgit 


From 70fb5ccf2ebb09a0c8ebba775041567812d45f86 Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Mon, 13 Jun 2022 00:34:28 +0800
Subject: sched/fair: Introduce SIS_UTIL to search idle CPU based on sum of
 util_avg

[Problem Statement]
select_idle_cpu() might spend too much time searching for an idle CPU,
when the system is overloaded.

The following histogram is the time spent in select_idle_cpu(),
when running 224 instances of netperf on a system with 112 CPUs
per LLC domain:

@usecs:
[0]                  533 |                                                    |
[1]                 5495 |                                                    |
[2, 4)             12008 |                                                    |
[4, 8)            239252 |                                                    |
[8, 16)          4041924 |@@@@@@@@@@@@@@                                      |
[16, 32)        12357398 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@         |
[32, 64)        14820255 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[64, 128)       13047682 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@       |
[128, 256)       8235013 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@                        |
[256, 512)       4507667 |@@@@@@@@@@@@@@@                                     |
[512, 1K)        2600472 |@@@@@@@@@                                           |
[1K, 2K)          927912 |@@@                                                 |
[2K, 4K)          218720 |                                                    |
[4K, 8K)           98161 |                                                    |
[8K, 16K)          37722 |                                                    |
[16K, 32K)          6715 |                                                    |
[32K, 64K)           477 |                                                    |
[64K, 128K)            7 |                                                    |

netperf latency usecs:
=======
case            	load    	    Lat_99th	    std%
TCP_RR          	thread-224	      257.39	(  0.21)

The time spent in select_idle_cpu() is visible to netperf and might have a negative
impact.

[Symptom analysis]
The patch [1] from Mel Gorman has been applied to track the efficiency
of select_idle_sibling. Copy the indicators here:

SIS Search Efficiency(se_eff%):
        A ratio expressed as a percentage of runqueues scanned versus
        idle CPUs found. A 100% efficiency indicates that the target,
        prev or recent CPU of a task was idle at wakeup. The lower the
        efficiency, the more runqueues were scanned before an idle CPU
        was found.

SIS Domain Search Efficiency(dom_eff%):
        Similar, except only for the slower SIS
	patch.

SIS Fast Success Rate(fast_rate%):
        Percentage of SIS that used target, prev or
	recent CPUs.

SIS Success rate(success_rate%):
        Percentage of scans that found an idle CPU.

The test is based on Aubrey's schedtests tool, including netperf, hackbench,
schbench and tbench.

Test on vanilla kernel:
schedstat_parse.py -f netperf_vanilla.log
case	        load	    se_eff%	    dom_eff%	  fast_rate%	success_rate%
TCP_RR	   28 threads	     99.978	      18.535	      99.995	     100.000
TCP_RR	   56 threads	     99.397	       5.671	      99.964	     100.000
TCP_RR	   84 threads	     21.721	       6.818	      73.632	     100.000
TCP_RR	  112 threads	     12.500	       5.533	      59.000	     100.000
TCP_RR	  140 threads	      8.524	       4.535	      49.020	     100.000
TCP_RR	  168 threads	      6.438	       3.945	      40.309	      99.999
TCP_RR	  196 threads	      5.397	       3.718	      32.320	      99.982
TCP_RR	  224 threads	      4.874	       3.661	      25.775	      99.767
UDP_RR	   28 threads	     99.988	      17.704	      99.997	     100.000
UDP_RR	   56 threads	     99.528	       5.977	      99.970	     100.000
UDP_RR	   84 threads	     24.219	       6.992	      76.479	     100.000
UDP_RR	  112 threads	     13.907	       5.706	      62.538	     100.000
UDP_RR	  140 threads	      9.408	       4.699	      52.519	     100.000
UDP_RR	  168 threads	      7.095	       4.077	      44.352	     100.000
UDP_RR	  196 threads	      5.757	       3.775	      35.764	      99.991
UDP_RR	  224 threads	      5.124	       3.704	      28.748	      99.860

schedstat_parse.py -f schbench_vanilla.log
(each group has 28 tasks)
case	        load	    se_eff%	    dom_eff%	  fast_rate%	success_rate%
normal	   1   mthread	     99.152	       6.400	      99.941	     100.000
normal	   2   mthreads	     97.844	       4.003	      99.908	     100.000
normal	   3   mthreads	     96.395	       2.118	      99.917	      99.998
normal	   4   mthreads	     55.288	       1.451	      98.615	      99.804
normal	   5   mthreads	      7.004	       1.870	      45.597	      61.036
normal	   6   mthreads	      3.354	       1.346	      20.777	      34.230
normal	   7   mthreads	      2.183	       1.028	      11.257	      21.055
normal	   8   mthreads	      1.653	       0.825	       7.849	      15.549

schedstat_parse.py -f hackbench_vanilla.log
(each group has 28 tasks)
case			load	        se_eff%	    dom_eff%	  fast_rate%	success_rate%
process-pipe	     1 group	         99.991	       7.692	      99.999	     100.000
process-pipe	    2 groups	         99.934	       4.615	      99.997	     100.000
process-pipe	    3 groups	         99.597	       3.198	      99.987	     100.000
process-pipe	    4 groups	         98.378	       2.464	      99.958	     100.000
process-pipe	    5 groups	         27.474	       3.653	      89.811	      99.800
process-pipe	    6 groups	         20.201	       4.098	      82.763	      99.570
process-pipe	    7 groups	         16.423	       4.156	      77.398	      99.316
process-pipe	    8 groups	         13.165	       3.920	      72.232	      98.828
process-sockets	     1 group	         99.977	       5.882	      99.999	     100.000
process-sockets	    2 groups	         99.927	       5.505	      99.996	     100.000
process-sockets	    3 groups	         99.397	       3.250	      99.980	     100.000
process-sockets	    4 groups	         79.680	       4.258	      98.864	      99.998
process-sockets	    5 groups	          7.673	       2.503	      63.659	      92.115
process-sockets	    6 groups	          4.642	       1.584	      58.946	      88.048
process-sockets	    7 groups	          3.493	       1.379	      49.816	      81.164
process-sockets	    8 groups	          3.015	       1.407	      40.845	      75.500
threads-pipe	     1 group	         99.997	       0.000	     100.000	     100.000
threads-pipe	    2 groups	         99.894	       2.932	      99.997	     100.000
threads-pipe	    3 groups	         99.611	       4.117	      99.983	     100.000
threads-pipe	    4 groups	         97.703	       2.624	      99.937	     100.000
threads-pipe	    5 groups	         22.919	       3.623	      87.150	      99.764
threads-pipe	    6 groups	         18.016	       4.038	      80.491	      99.557
threads-pipe	    7 groups	         14.663	       3.991	      75.239	      99.247
threads-pipe	    8 groups	         12.242	       3.808	      70.651	      98.644
threads-sockets	     1 group	         99.990	       6.667	      99.999	     100.000
threads-sockets	    2 groups	         99.940	       5.114	      99.997	     100.000
threads-sockets	    3 groups	         99.469	       4.115	      99.977	     100.000
threads-sockets	    4 groups	         87.528	       4.038	      99.400	     100.000
threads-sockets	    5 groups	          6.942	       2.398	      59.244	      88.337
threads-sockets	    6 groups	          4.359	       1.954	      49.448	      87.860
threads-sockets	    7 groups	          2.845	       1.345	      41.198	      77.102
threads-sockets	    8 groups	          2.871	       1.404	      38.512	      74.312

schedstat_parse.py -f tbench_vanilla.log
case			load	      se_eff%	    dom_eff%	  fast_rate%	success_rate%
loopback	  28 threads	       99.976	      18.369	      99.995	     100.000
loopback	  56 threads	       99.222	       7.799	      99.934	     100.000
loopback	  84 threads	       19.723	       6.819	      70.215	     100.000
loopback	 112 threads	       11.283	       5.371	      55.371	      99.999
loopback	 140 threads	        0.000	       0.000	       0.000	       0.000
loopback	 168 threads	        0.000	       0.000	       0.000	       0.000
loopback	 196 threads	        0.000	       0.000	       0.000	       0.000
loopback	 224 threads	        0.000	       0.000	       0.000	       0.000

According to the test above, if the system becomes busy, the
SIS Search Efficiency(se_eff%) drops significantly. Although some
benchmarks would finally find an idle CPU(success_rate% = 100%), it is
doubtful whether it is worth it to search the whole LLC domain.

[Proposal]
It would be ideal to have a crystal ball to answer this question:
How many CPUs must a wakeup path walk down, before it can find an idle
CPU? Many potential metrics could be used to predict the number.
One candidate is the sum of util_avg in this LLC domain. The benefit
of choosing util_avg is that it is a metric of accumulated historic
activity, which seems to be smoother than instantaneous metrics
(such as rq->nr_running). Besides, choosing the sum of util_avg
would help predict the load of the LLC domain more precisely, because
SIS_PROP uses one CPU's idle time to estimate the total LLC domain idle
time.

In summary, the lower the util_avg is, the more select_idle_cpu()
should scan for idle CPU, and vice versa. When the sum of util_avg
in this LLC domain hits 85% or above, the scan stops. The reason to
choose 85% as the threshold is that this is the imbalance_pct(117)
when a LLC sched group is overloaded.

Introduce the quadratic function:

y = SCHED_CAPACITY_SCALE - p * x^2
and y'= y / SCHED_CAPACITY_SCALE

x is the ratio of sum_util compared to the CPU capacity:
x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE)
y' is the ratio of CPUs to be scanned in the LLC domain,
and the number of CPUs to scan is calculated by:

nr_scan = llc_weight * y'

Choosing quadratic function is because:
[1] Compared to the linear function, it scans more aggressively when the
    sum_util is low.
[2] Compared to the exponential function, it is easier to calculate.
[3] It seems that there is no accurate mapping between the sum of util_avg
    and the number of CPUs to be scanned. Use heuristic scan for now.

For a platform with 112 CPUs per LLC, the number of CPUs to scan is:
sum_util%   0    5   15   25  35  45  55   65   75   85   86 ...
scan_nr   112  111  108  102  93  81  65   47   25    1    0 ...

For a platform with 16 CPUs per LLC, the number of CPUs to scan is:
sum_util%   0    5   15   25  35  45  55   65   75   85   86 ...
scan_nr    16   15   15   14  13  11   9    6    3    0    0 ...

Furthermore, to minimize the overhead of calculating the metrics in
select_idle_cpu(), borrow the statistics from periodic load balance.
As mentioned by Abel, on a platform with 112 CPUs per LLC, the
sum_util calculated by periodic load balance after 112 ms would
decay to about 0.5 * 0.5 * 0.5 * 0.7 = 8.75%, thus bringing a delay
in reflecting the latest utilization. But it is a trade-off.
Checking the util_avg in newidle load balance would be more frequent,
but it brings overhead - multiple CPUs write/read the per-LLC shared
variable and introduces cache contention. Tim also mentioned that,
it is allowed to be non-optimal in terms of scheduling for the
short-term variations, but if there is a long-term trend in the load
behavior, the scheduler can adjust for that.

When SIS_UTIL is enabled, the select_idle_cpu() uses the nr_scan
calculated by SIS_UTIL instead of the one from SIS_PROP. As Peter and
Mel suggested, SIS_UTIL should be enabled by default.

This patch is based on the util_avg, which is very sensitive to the
CPU frequency invariance. There is an issue that, when the max frequency
has been clamp, the util_avg would decay insanely fast when
the CPU is idle. Commit addca285120b ("cpufreq: intel_pstate: Handle no_turbo
in frequency invariance") could be used to mitigate this symptom, by adjusting
the arch_max_freq_ratio when turbo is disabled. But this issue is still
not thoroughly fixed, because the current code is unaware of the user-specified
max CPU frequency.

[Test result]

netperf and tbench were launched with 25% 50% 75% 100% 125% 150%
175% 200% of CPU number respectively. Hackbench and schbench were launched
by 1, 2 ,4, 8 groups. Each test lasts for 100 seconds and repeats 3 times.

The following is the benchmark result comparison between
baseline:vanilla v5.19-rc1 and compare:patched kernel. Positive compare%
indicates better performance.

Each netperf test is a:
netperf -4 -H 127.0.1 -t TCP/UDP_RR -c -C -l 100
netperf.throughput
=======
case            	load    	baseline(std%)	compare%( std%)
TCP_RR          	28 threads	 1.00 (  0.34)	 -0.16 (  0.40)
TCP_RR          	56 threads	 1.00 (  0.19)	 -0.02 (  0.20)
TCP_RR          	84 threads	 1.00 (  0.39)	 -0.47 (  0.40)
TCP_RR          	112 threads	 1.00 (  0.21)	 -0.66 (  0.22)
TCP_RR          	140 threads	 1.00 (  0.19)	 -0.69 (  0.19)
TCP_RR          	168 threads	 1.00 (  0.18)	 -0.48 (  0.18)
TCP_RR          	196 threads	 1.00 (  0.16)	+194.70 ( 16.43)
TCP_RR          	224 threads	 1.00 (  0.16)	+197.30 (  7.85)
UDP_RR          	28 threads	 1.00 (  0.37)	 +0.35 (  0.33)
UDP_RR          	56 threads	 1.00 ( 11.18)	 -0.32 (  0.21)
UDP_RR          	84 threads	 1.00 (  1.46)	 -0.98 (  0.32)
UDP_RR          	112 threads	 1.00 ( 28.85)	 -2.48 ( 19.61)
UDP_RR          	140 threads	 1.00 (  0.70)	 -0.71 ( 14.04)
UDP_RR          	168 threads	 1.00 ( 14.33)	 -0.26 ( 11.16)
UDP_RR          	196 threads	 1.00 ( 12.92)	+186.92 ( 20.93)
UDP_RR          	224 threads	 1.00 ( 11.74)	+196.79 ( 18.62)

Take the 224 threads as an example, the SIS search metrics changes are
illustrated below:

    vanilla                    patched
   4544492          +237.5%   15338634        sched_debug.cpu.sis_domain_search.avg
     38539        +39686.8%   15333634        sched_debug.cpu.sis_failed.avg
  128300000          -87.9%   15551326        sched_debug.cpu.sis_scanned.avg
   5842896          +162.7%   15347978        sched_debug.cpu.sis_search.avg

There is -87.9% less CPU scans after patched, which indicates lower overhead.
Besides, with this patch applied, there is -13% less rq lock contention
in perf-profile.calltrace.cycles-pp._raw_spin_lock.raw_spin_rq_lock_nested
.try_to_wake_up.default_wake_function.woken_wake_function.
This might help explain the performance improvement - Because this patch allows
the waking task to remain on the previous CPU, rather than grabbing other CPUs'
lock.

Each hackbench test is a:
hackbench -g $job --process/threads --pipe/sockets -l 1000000 -s 100
hackbench.throughput
=========
case            	load    	baseline(std%)	compare%( std%)
process-pipe    	1 group 	 1.00 (  1.29)	 +0.57 (  0.47)
process-pipe    	2 groups 	 1.00 (  0.27)	 +0.77 (  0.81)
process-pipe    	4 groups 	 1.00 (  0.26)	 +1.17 (  0.02)
process-pipe    	8 groups 	 1.00 (  0.15)	 -4.79 (  0.02)
process-sockets 	1 group 	 1.00 (  0.63)	 -0.92 (  0.13)
process-sockets 	2 groups 	 1.00 (  0.03)	 -0.83 (  0.14)
process-sockets 	4 groups 	 1.00 (  0.40)	 +5.20 (  0.26)
process-sockets 	8 groups 	 1.00 (  0.04)	 +3.52 (  0.03)
threads-pipe    	1 group 	 1.00 (  1.28)	 +0.07 (  0.14)
threads-pipe    	2 groups 	 1.00 (  0.22)	 -0.49 (  0.74)
threads-pipe    	4 groups 	 1.00 (  0.05)	 +1.88 (  0.13)
threads-pipe    	8 groups 	 1.00 (  0.09)	 -4.90 (  0.06)
threads-sockets 	1 group 	 1.00 (  0.25)	 -0.70 (  0.53)
threads-sockets 	2 groups 	 1.00 (  0.10)	 -0.63 (  0.26)
threads-sockets 	4 groups 	 1.00 (  0.19)	+11.92 (  0.24)
threads-sockets 	8 groups 	 1.00 (  0.08)	 +4.31 (  0.11)

Each tbench test is a:
tbench -t 100 $job 127.0.0.1
tbench.throughput
======
case            	load    	baseline(std%)	compare%( std%)
loopback        	28 threads	 1.00 (  0.06)	 -0.14 (  0.09)
loopback        	56 threads	 1.00 (  0.03)	 -0.04 (  0.17)
loopback        	84 threads	 1.00 (  0.05)	 +0.36 (  0.13)
loopback        	112 threads	 1.00 (  0.03)	 +0.51 (  0.03)
loopback        	140 threads	 1.00 (  0.02)	 -1.67 (  0.19)
loopback        	168 threads	 1.00 (  0.38)	 +1.27 (  0.27)
loopback        	196 threads	 1.00 (  0.11)	 +1.34 (  0.17)
loopback        	224 threads	 1.00 (  0.11)	 +1.67 (  0.22)

Each schbench test is a:
schbench -m $job -t 28 -r 100 -s 30000 -c 30000
schbench.latency_90%_us
========
case            	load    	baseline(std%)	compare%( std%)
normal          	1 mthread	 1.00 ( 31.22)	 -7.36 ( 20.25)*
normal          	2 mthreads	 1.00 (  2.45)	 -0.48 (  1.79)
normal          	4 mthreads	 1.00 (  1.69)	 +0.45 (  0.64)
normal          	8 mthreads	 1.00 (  5.47)	 +9.81 ( 14.28)

*Consider the Standard Deviation, this -7.36% regression might not be valid.

Also, a OLTP workload with a commercial RDBMS has been tested, and there
is no significant change.

There were concerns that unbalanced tasks among CPUs would cause problems.
For example, suppose the LLC domain is composed of 8 CPUs, and 7 tasks are
bound to CPU0~CPU6, while CPU7 is idle:

          CPU0    CPU1    CPU2    CPU3    CPU4    CPU5    CPU6    CPU7
util_avg  1024    1024    1024    1024    1024    1024    1024    0

Since the util_avg ratio is 87.5%( = 7/8 ), which is higher than 85%,
select_idle_cpu() will not scan, thus CPU7 is undetected during scan.
But according to Mel, it is unlikely the CPU7 will be idle all the time
because CPU7 could pull some tasks via CPU_NEWLY_IDLE.

lkp(kernel test robot) has reported a regression on stress-ng.sock on a
very busy system. According to the sched_debug statistics, it might be caused
by SIS_UTIL terminates the scan and chooses a previous CPU earlier, and this
might introduce more context switch, especially involuntary preemption, which
impacts a busy stress-ng. This regression has shown that, not all benchmarks
in every scenario benefit from idle CPU scan limit, and it needs further
investigation.

Besides, there is slight regression in hackbench's 16 groups case when the
LLC domain has 16 CPUs. Prateek mentioned that we should scan aggressively
in an LLC domain with 16 CPUs. Because the cost to search for an idle one
among 16 CPUs is negligible. The current patch aims to propose a generic
solution and only considers the util_avg. Something like the below could
be applied on top of the current patch to fulfill the requirement:

	if (llc_weight <= 16)
		nr_scan = nr_scan * 32 / llc_weight;

For LLC domain with 16 CPUs, the nr_scan will be expanded to 2 times large.
The smaller the CPU number this LLC domain has, the larger nr_scan will be
expanded. This needs further investigation.

There is also ongoing work[2] from Abel to filter out the busy CPUs during
wakeup, to further speed up the idle CPU scan. And it could be a following-up
optimization on top of this change.

Suggested-by: Tim Chen <tim.c.chen@intel.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Yicong Yang <yangyicong@hisilicon.com>
Tested-by: Mohini Narkhede <mohini.narkhede@intel.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lore.kernel.org/r/20220612163428.849378-1-yu.c.chen@intel.com
---
 include/linux/sched/topology.h |  1 +
 kernel/sched/fair.c            | 87 ++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/features.h        |  3 +-
 3 files changed, 90 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 56cffe42abbc..816df6cc444e 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -81,6 +81,7 @@ struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
+	int		nr_idle_scan;
 };
 
 struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7400600b4db6..f80ae86bb404 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6332,6 +6332,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 {
 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
 	int i, cpu, idle_cpu = -1, nr = INT_MAX;
+	struct sched_domain_shared *sd_share;
 	struct rq *this_rq = this_rq();
 	int this = smp_processor_id();
 	struct sched_domain *this_sd;
@@ -6371,6 +6372,17 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 		time = cpu_clock(this);
 	}
 
+	if (sched_feat(SIS_UTIL)) {
+		sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
+		if (sd_share) {
+			/* because !--nr is the condition to stop scan */
+			nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
+			/* overloaded LLC is unlikely to have idle cpu/core */
+			if (nr == 1)
+				return -1;
+		}
+	}
+
 	for_each_cpu_wrap(cpu, cpus, target + 1) {
 		if (has_idle_core) {
 			i = select_idle_core(p, cpu, cpus, &idle_cpu);
@@ -9224,6 +9236,77 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 	return idlest;
 }
 
+static void update_idle_cpu_scan(struct lb_env *env,
+				 unsigned long sum_util)
+{
+	struct sched_domain_shared *sd_share;
+	int llc_weight, pct;
+	u64 x, y, tmp;
+	/*
+	 * Update the number of CPUs to scan in LLC domain, which could
+	 * be used as a hint in select_idle_cpu(). The update of sd_share
+	 * could be expensive because it is within a shared cache line.
+	 * So the write of this hint only occurs during periodic load
+	 * balancing, rather than CPU_NEWLY_IDLE, because the latter
+	 * can fire way more frequently than the former.
+	 */
+	if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE)
+		return;
+
+	llc_weight = per_cpu(sd_llc_size, env->dst_cpu);
+	if (env->sd->span_weight != llc_weight)
+		return;
+
+	sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu));
+	if (!sd_share)
+		return;
+
+	/*
+	 * The number of CPUs to search drops as sum_util increases, when
+	 * sum_util hits 85% or above, the scan stops.
+	 * The reason to choose 85% as the threshold is because this is the
+	 * imbalance_pct(117) when a LLC sched group is overloaded.
+	 *
+	 * let y = SCHED_CAPACITY_SCALE - p * x^2                       [1]
+	 * and y'= y / SCHED_CAPACITY_SCALE
+	 *
+	 * x is the ratio of sum_util compared to the CPU capacity:
+	 * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE)
+	 * y' is the ratio of CPUs to be scanned in the LLC domain,
+	 * and the number of CPUs to scan is calculated by:
+	 *
+	 * nr_scan = llc_weight * y'                                    [2]
+	 *
+	 * When x hits the threshold of overloaded, AKA, when
+	 * x = 100 / pct, y drops to 0. According to [1],
+	 * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000
+	 *
+	 * Scale x by SCHED_CAPACITY_SCALE:
+	 * x' = sum_util / llc_weight;                                  [3]
+	 *
+	 * and finally [1] becomes:
+	 * y = SCHED_CAPACITY_SCALE -
+	 *     x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE)            [4]
+	 *
+	 */
+	/* equation [3] */
+	x = sum_util;
+	do_div(x, llc_weight);
+
+	/* equation [4] */
+	pct = env->sd->imbalance_pct;
+	tmp = x * x * pct * pct;
+	do_div(tmp, 10000 * SCHED_CAPACITY_SCALE);
+	tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE);
+	y = SCHED_CAPACITY_SCALE - tmp;
+
+	/* equation [2] */
+	y *= llc_weight;
+	do_div(y, SCHED_CAPACITY_SCALE);
+	if ((int)y != sd_share->nr_idle_scan)
+		WRITE_ONCE(sd_share->nr_idle_scan, (int)y);
+}
+
 /**
  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @env: The load balancing environment.
@@ -9236,6 +9319,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
 	struct sg_lb_stats tmp_sgs;
+	unsigned long sum_util = 0;
 	int sg_status = 0;
 
 	do {
@@ -9268,6 +9352,7 @@ next_group:
 		sds->total_load += sgs->group_load;
 		sds->total_capacity += sgs->group_capacity;
 
+		sum_util += sgs->group_util;
 		sg = sg->next;
 	} while (sg != env->sd->groups);
 
@@ -9293,6 +9378,8 @@ next_group:
 		WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
 		trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
 	}
+
+	update_idle_cpu_scan(env, sum_util);
 }
 
 /**
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1cf435bbcd9c..ee7f23c76bd3 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -60,7 +60,8 @@ SCHED_FEAT(TTWU_QUEUE, true)
 /*
  * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
  */
-SCHED_FEAT(SIS_PROP, true)
+SCHED_FEAT(SIS_PROP, false)
+SCHED_FEAT(SIS_UTIL, true)
 
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls
-- 
cgit 


From 119a784c81270eb88e573174ed2209225d646656 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Thu, 16 Jun 2022 11:06:23 -0700
Subject: perf/core: Add a new read format to get a number of lost samples

Sometimes we want to know an accurate number of samples even if it's
lost.  Currenlty PERF_RECORD_LOST is generated for a ring-buffer which
might be shared with other events.  So it's hard to know per-event
lost count.

Add event->lost_samples field and PERF_FORMAT_LOST to retrieve it from
userspace.

Original-patch-by: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20220616180623.1358843-1-namhyung@kernel.org
---
 include/linux/perf_event.h      |  2 ++
 include/uapi/linux/perf_event.h |  5 ++++-
 kernel/events/core.c            | 21 ++++++++++++++++++---
 kernel/events/ring_buffer.c     |  5 ++++-
 4 files changed, 28 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index da759560eec5..ee8b9ecdc03b 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -759,6 +759,8 @@ struct perf_event {
 	struct pid_namespace		*ns;
 	u64				id;
 
+	atomic64_t			lost_samples;
+
 	u64				(*clock)(void);
 	perf_overflow_handler_t		overflow_handler;
 	void				*overflow_handler_context;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index d37629dbad72..0474ee362151 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -301,6 +301,7 @@ enum {
  *	  { u64		time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
  *	  { u64		time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
  *	  { u64		id;           } && PERF_FORMAT_ID
+ *	  { u64		lost;         } && PERF_FORMAT_LOST
  *	} && !PERF_FORMAT_GROUP
  *
  *	{ u64		nr;
@@ -308,6 +309,7 @@ enum {
  *	  { u64		time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
  *	  { u64		value;
  *	    { u64	id;           } && PERF_FORMAT_ID
+ *	    { u64	lost;         } && PERF_FORMAT_LOST
  *	  }		cntr[nr];
  *	} && PERF_FORMAT_GROUP
  * };
@@ -317,8 +319,9 @@ enum perf_event_read_format {
 	PERF_FORMAT_TOTAL_TIME_RUNNING		= 1U << 1,
 	PERF_FORMAT_ID				= 1U << 2,
 	PERF_FORMAT_GROUP			= 1U << 3,
+	PERF_FORMAT_LOST			= 1U << 4,
 
-	PERF_FORMAT_MAX = 1U << 4,		/* non-ABI */
+	PERF_FORMAT_MAX = 1U << 5,		/* non-ABI */
 };
 
 #define PERF_ATTR_SIZE_VER0	64	/* sizeof first published struct */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 80782cddb1da..4d8c335a07db 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1819,6 +1819,9 @@ static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
 	if (event->attr.read_format & PERF_FORMAT_ID)
 		entry += sizeof(u64);
 
+	if (event->attr.read_format & PERF_FORMAT_LOST)
+		entry += sizeof(u64);
+
 	if (event->attr.read_format & PERF_FORMAT_GROUP) {
 		nr += nr_siblings;
 		size += sizeof(u64);
@@ -5260,11 +5263,15 @@ static int __perf_read_group_add(struct perf_event *leader,
 	values[n++] += perf_event_count(leader);
 	if (read_format & PERF_FORMAT_ID)
 		values[n++] = primary_event_id(leader);
+	if (read_format & PERF_FORMAT_LOST)
+		values[n++] = atomic64_read(&leader->lost_samples);
 
 	for_each_sibling_event(sub, leader) {
 		values[n++] += perf_event_count(sub);
 		if (read_format & PERF_FORMAT_ID)
 			values[n++] = primary_event_id(sub);
+		if (read_format & PERF_FORMAT_LOST)
+			values[n++] = atomic64_read(&sub->lost_samples);
 	}
 
 	raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -5321,7 +5328,7 @@ static int perf_read_one(struct perf_event *event,
 				 u64 read_format, char __user *buf)
 {
 	u64 enabled, running;
-	u64 values[4];
+	u64 values[5];
 	int n = 0;
 
 	values[n++] = __perf_event_read_value(event, &enabled, &running);
@@ -5331,6 +5338,8 @@ static int perf_read_one(struct perf_event *event,
 		values[n++] = running;
 	if (read_format & PERF_FORMAT_ID)
 		values[n++] = primary_event_id(event);
+	if (read_format & PERF_FORMAT_LOST)
+		values[n++] = atomic64_read(&event->lost_samples);
 
 	if (copy_to_user(buf, values, n * sizeof(u64)))
 		return -EFAULT;
@@ -6858,7 +6867,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
 				 u64 enabled, u64 running)
 {
 	u64 read_format = event->attr.read_format;
-	u64 values[4];
+	u64 values[5];
 	int n = 0;
 
 	values[n++] = perf_event_count(event);
@@ -6872,6 +6881,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
 	}
 	if (read_format & PERF_FORMAT_ID)
 		values[n++] = primary_event_id(event);
+	if (read_format & PERF_FORMAT_LOST)
+		values[n++] = atomic64_read(&event->lost_samples);
 
 	__output_copy(handle, values, n * sizeof(u64));
 }
@@ -6882,7 +6893,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 {
 	struct perf_event *leader = event->group_leader, *sub;
 	u64 read_format = event->attr.read_format;
-	u64 values[5];
+	u64 values[6];
 	int n = 0;
 
 	values[n++] = 1 + leader->nr_siblings;
@@ -6900,6 +6911,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 	values[n++] = perf_event_count(leader);
 	if (read_format & PERF_FORMAT_ID)
 		values[n++] = primary_event_id(leader);
+	if (read_format & PERF_FORMAT_LOST)
+		values[n++] = atomic64_read(&leader->lost_samples);
 
 	__output_copy(handle, values, n * sizeof(u64));
 
@@ -6913,6 +6926,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 		values[n++] = perf_event_count(sub);
 		if (read_format & PERF_FORMAT_ID)
 			values[n++] = primary_event_id(sub);
+		if (read_format & PERF_FORMAT_LOST)
+			values[n++] = atomic64_read(&sub->lost_samples);
 
 		__output_copy(handle, values, n * sizeof(u64));
 	}
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index fb35b926024c..726132039c38 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -172,8 +172,10 @@ __perf_output_begin(struct perf_output_handle *handle,
 		goto out;
 
 	if (unlikely(rb->paused)) {
-		if (rb->nr_pages)
+		if (rb->nr_pages) {
 			local_inc(&rb->lost);
+			atomic64_inc(&event->lost_samples);
+		}
 		goto out;
 	}
 
@@ -254,6 +256,7 @@ __perf_output_begin(struct perf_output_handle *handle,
 
 fail:
 	local_inc(&rb->lost);
+	atomic64_inc(&event->lost_samples);
 	perf_output_put_handle(handle);
 out:
 	rcu_read_unlock();
-- 
cgit 


From bb4479994945e9170534389a7762eb56149320ac Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Tue, 21 Jun 2022 10:04:10 +0100
Subject: sched, drivers: Remove max param from
 effective_cpu_util()/sched_cpu_util()

effective_cpu_util() already has a `int cpu' parameter which allows to
retrieve the CPU capacity scale factor (or maximum CPU capacity) inside
this function via an arch_scale_cpu_capacity(cpu).

A lot of code calling effective_cpu_util() (or the shim
sched_cpu_util()) needs the maximum CPU capacity, i.e. it will call
arch_scale_cpu_capacity() already.
But not having to pass it into effective_cpu_util() will make the EAS
wake-up code easier, especially when the maximum CPU capacity reduced
by the thermal pressure is passed through the EAS wake-up functions.

Due to the asymmetric CPU capacity support of arm/arm64 architectures,
arch_scale_cpu_capacity(int cpu) is a per-CPU variable read access via
per_cpu(cpu_scale, cpu) on such a system.
On all other architectures it is a a compile-time constant
(SCHED_CAPACITY_SCALE).

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Link: https://lkml.kernel.org/r/20220621090414.433602-4-vdonnefort@google.com
---
 drivers/powercap/dtpm_cpu.c       | 33 +++++++++------------------------
 drivers/thermal/cpufreq_cooling.c |  6 ++----
 include/linux/sched.h             |  2 +-
 kernel/sched/core.c               | 11 ++++++-----
 kernel/sched/cpufreq_schedutil.c  |  5 ++---
 kernel/sched/fair.c               | 21 ++++++++++-----------
 kernel/sched/sched.h              |  2 +-
 7 files changed, 31 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c
index f5eced0842b3..6a88eb7e9f75 100644
--- a/drivers/powercap/dtpm_cpu.c
+++ b/drivers/powercap/dtpm_cpu.c
@@ -71,34 +71,19 @@ static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit)
 
 static u64 scale_pd_power_uw(struct cpumask *pd_mask, u64 power)
 {
-	unsigned long max = 0, sum_util = 0;
+	unsigned long max, sum_util = 0;
 	int cpu;
 
-	for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
-
-		/*
-		 * The capacity is the same for all CPUs belonging to
-		 * the same perf domain, so a single call to
-		 * arch_scale_cpu_capacity() is enough. However, we
-		 * need the CPU parameter to be initialized by the
-		 * loop, so the call ends up in this block.
-		 *
-		 * We can initialize 'max' with a cpumask_first() call
-		 * before the loop but the bits computation is not
-		 * worth given the arch_scale_cpu_capacity() just
-		 * returns a value where the resulting assembly code
-		 * will be optimized by the compiler.
-		 */
-		max = arch_scale_cpu_capacity(cpu);
-		sum_util += sched_cpu_util(cpu, max);
-	}
-
 	/*
-	 * In the improbable case where all the CPUs of the perf
-	 * domain are offline, 'max' will be zero and will lead to an
-	 * illegal operation with a zero division.
+	 * The capacity is the same for all CPUs belonging to
+	 * the same perf domain.
 	 */
-	return max ? (power * ((sum_util << 10) / max)) >> 10 : 0;
+	max = arch_scale_cpu_capacity(cpumask_first(pd_mask));
+
+	for_each_cpu_and(cpu, pd_mask, cpu_online_mask)
+		sum_util += sched_cpu_util(cpu);
+
+	return (power * ((sum_util << 10) / max)) >> 10;
 }
 
 static u64 get_pd_power_uw(struct dtpm *dtpm)
diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index b8151d95a806..b263b0fde03c 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -137,11 +137,9 @@ static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev,
 static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,
 		    int cpu_idx)
 {
-	unsigned long max = arch_scale_cpu_capacity(cpu);
-	unsigned long util;
+	unsigned long util = sched_cpu_util(cpu);
 
-	util = sched_cpu_util(cpu, max);
-	return (util * 100) / max;
+	return (util * 100) / arch_scale_cpu_capacity(cpu);
 }
 #else /* !CONFIG_SMP */
 static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c46f3a63b758..88b8817b827d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2257,7 +2257,7 @@ static inline bool owner_on_cpu(struct task_struct *owner)
 }
 
 /* Returns effective CPU energy utilization, as seen by the scheduler */
-unsigned long sched_cpu_util(int cpu, unsigned long max);
+unsigned long sched_cpu_util(int cpu);
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_RSEQ
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d3e2c5a7c1b7..c538a0ac4617 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7125,12 +7125,14 @@ struct task_struct *idle_task(int cpu)
  * required to meet deadlines.
  */
 unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
-				 unsigned long max, enum cpu_util_type type,
+				 enum cpu_util_type type,
 				 struct task_struct *p)
 {
-	unsigned long dl_util, util, irq;
+	unsigned long dl_util, util, irq, max;
 	struct rq *rq = cpu_rq(cpu);
 
+	max = arch_scale_cpu_capacity(cpu);
+
 	if (!uclamp_is_used() &&
 	    type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
 		return max;
@@ -7210,10 +7212,9 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
 	return min(max, util);
 }
 
-unsigned long sched_cpu_util(int cpu, unsigned long max)
+unsigned long sched_cpu_util(int cpu)
 {
-	return effective_cpu_util(cpu, cpu_util_cfs(cpu), max,
-				  ENERGY_UTIL, NULL);
+	return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL);
 }
 #endif /* CONFIG_SMP */
 
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 3dbf351d12d5..1207c78f85c1 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -157,11 +157,10 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
 static void sugov_get_util(struct sugov_cpu *sg_cpu)
 {
 	struct rq *rq = cpu_rq(sg_cpu->cpu);
-	unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
 
-	sg_cpu->max = max;
+	sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu);
 	sg_cpu->bw_dl = cpu_bw_dl(rq);
-	sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), max,
+	sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
 					  FREQUENCY_UTIL, NULL);
 }
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 80be1f1a5620..6de09b26b455 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6712,12 +6712,11 @@ static long
 compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
 {
 	struct cpumask *pd_mask = perf_domain_span(pd);
-	unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
-	unsigned long max_util = 0, sum_util = 0;
-	unsigned long _cpu_cap = cpu_cap;
+	unsigned long max_util = 0, sum_util = 0, cpu_cap;
 	int cpu;
 
-	_cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask));
+	cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
+	cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask));
 
 	/*
 	 * The capacity state of CPUs of the current rd can be driven by CPUs
@@ -6754,10 +6753,10 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
 		 * is already enough to scale the EM reported power
 		 * consumption at the (eventually clamped) cpu_capacity.
 		 */
-		cpu_util = effective_cpu_util(cpu, util_running, cpu_cap,
-					      ENERGY_UTIL, NULL);
+		cpu_util = effective_cpu_util(cpu, util_running, ENERGY_UTIL,
+					      NULL);
 
-		sum_util += min(cpu_util, _cpu_cap);
+		sum_util += min(cpu_util, cpu_cap);
 
 		/*
 		 * Performance domain frequency: utilization clamping
@@ -6766,12 +6765,12 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
 		 * NOTE: in case RT tasks are running, by default the
 		 * FREQUENCY_UTIL's utilization can be max OPP.
 		 */
-		cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap,
-					      FREQUENCY_UTIL, tsk);
-		max_util = max(max_util, min(cpu_util, _cpu_cap));
+		cpu_util = effective_cpu_util(cpu, util_freq, FREQUENCY_UTIL,
+					      tsk);
+		max_util = max(max_util, min(cpu_util, cpu_cap));
 	}
 
-	return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap);
+	return em_cpu_energy(pd->em_pd, max_util, sum_util, cpu_cap);
 }
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 76b0027fd0c8..73ae32898f25 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2886,7 +2886,7 @@ enum cpu_util_type {
 };
 
 unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
-				 unsigned long max, enum cpu_util_type type,
+				 enum cpu_util_type type,
 				 struct task_struct *p);
 
 static inline unsigned long cpu_bw_dl(struct rq *rq)
-- 
cgit 


From 586b3b7600e44c1cad52c683ccfbb76fb2c10cc8 Mon Sep 17 00:00:00 2001
From: Sai Krishna Potthuri <lakshmi.sai.krishna.potthuri@xilinx.com>
Date: Fri, 17 Jun 2022 16:16:56 +0530
Subject: firmware: xilinx: Add configuration values for tri-state

Add configuration values(enable/disable) for tri-state parameter.

Signed-off-by: Sai Krishna Potthuri <lakshmi.sai.krishna.potthuri@xilinx.com>
Link: https://lore.kernel.org/r/1655462819-28801-2-git-send-email-lakshmi.sai.krishna.potthuri@xilinx.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/firmware/xlnx-zynqmp.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 1ec73d5352c3..4901fb31de3e 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -368,6 +368,11 @@ enum pm_pinctrl_drive_strength {
 	PM_PINCTRL_DRIVE_STRENGTH_12MA = 3,
 };
 
+enum pm_pinctrl_tri_state {
+	PM_PINCTRL_TRI_STATE_DISABLE = 0,
+	PM_PINCTRL_TRI_STATE_ENABLE = 1,
+};
+
 enum zynqmp_pm_shutdown_type {
 	ZYNQMP_PM_SHUTDOWN_TYPE_SHUTDOWN = 0,
 	ZYNQMP_PM_SHUTDOWN_TYPE_RESET = 1,
-- 
cgit 


From 4313a24985f00340eeb591fd66aa2b257b9e0a69 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 23 May 2022 21:59:02 +0200
Subject: arch/*/: remove CONFIG_VIRT_TO_BUS

All architecture-independent users of virt_to_bus() and bus_to_virt()
have been fixed to use the dma mapping interfaces or have been
removed now.  This means the definitions on most architectures, and the
CONFIG_VIRT_TO_BUS symbol are now obsolete and can be removed.

The only exceptions to this are a few network and scsi drivers for m68k
Amiga and VME machines and ppc32 Macintosh. These drivers work correctly
with the old interfaces and are probably not worth changing.

On alpha and parisc, virt_to_bus() were still used in asm/floppy.h.
alpha can use isa_virt_to_bus() like x86 does, and parisc can just
open-code the virt_to_phys() here, as this is architecture specific
code.

I tried updating the bus-virt-phys-mapping.rst documentation, which
started as an email from Linus to explain some details of the Linux-2.0
driver interfaces. The bits about virt_to_bus() were declared obsolete
backin 2000, and the rest is not all that relevant any more, so in the
end I just decided to remove the file completely.

Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
Acked-by: Helge Deller <deller@gmx.de> # parisc
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 Documentation/core-api/bus-virt-phys-mapping.rst   | 220 ---------------------
 Documentation/core-api/dma-api-howto.rst           |  14 --
 Documentation/core-api/index.rst                   |   1 -
 .../translations/zh_CN/core-api/index.rst          |   1 -
 arch/alpha/Kconfig                                 |   1 -
 arch/alpha/include/asm/floppy.h                    |   2 +-
 arch/alpha/include/asm/io.h                        |   8 +-
 arch/ia64/Kconfig                                  |   1 -
 arch/ia64/include/asm/io.h                         |   8 -
 arch/m68k/Kconfig                                  |   1 -
 arch/m68k/include/asm/virtconvert.h                |   4 +-
 arch/microblaze/Kconfig                            |   1 -
 arch/microblaze/include/asm/io.h                   |   2 -
 arch/mips/Kconfig                                  |   1 -
 arch/mips/include/asm/io.h                         |   9 -
 arch/parisc/Kconfig                                |   1 -
 arch/parisc/include/asm/floppy.h                   |   4 +-
 arch/parisc/include/asm/io.h                       |   2 -
 arch/powerpc/Kconfig                               |   1 -
 arch/powerpc/include/asm/io.h                      |   2 -
 arch/riscv/include/asm/page.h                      |   1 -
 arch/x86/Kconfig                                   |   1 -
 arch/x86/include/asm/io.h                          |   9 -
 arch/xtensa/Kconfig                                |   1 -
 arch/xtensa/include/asm/io.h                       |   3 -
 include/asm-generic/io.h                           |  14 --
 mm/Kconfig                                         |   8 -
 27 files changed, 10 insertions(+), 311 deletions(-)
 delete mode 100644 Documentation/core-api/bus-virt-phys-mapping.rst

(limited to 'include')

diff --git a/Documentation/core-api/bus-virt-phys-mapping.rst b/Documentation/core-api/bus-virt-phys-mapping.rst
deleted file mode 100644
index c72b24a7d52c..000000000000
--- a/Documentation/core-api/bus-virt-phys-mapping.rst
+++ /dev/null
@@ -1,220 +0,0 @@
-==========================================================
-How to access I/O mapped memory from within device drivers
-==========================================================
-
-:Author: Linus
-
-.. warning::
-
-	The virt_to_bus() and bus_to_virt() functions have been
-	superseded by the functionality provided by the PCI DMA interface
-	(see Documentation/core-api/dma-api-howto.rst).  They continue
-	to be documented below for historical purposes, but new code
-	must not use them. --davidm 00/12/12
-
-::
-
-  [ This is a mail message in response to a query on IO mapping, thus the
-    strange format for a "document" ]
-
-The AHA-1542 is a bus-master device, and your patch makes the driver give the
-controller the physical address of the buffers, which is correct on x86
-(because all bus master devices see the physical memory mappings directly). 
-
-However, on many setups, there are actually **three** different ways of looking
-at memory addresses, and in this case we actually want the third, the
-so-called "bus address". 
-
-Essentially, the three ways of addressing memory are (this is "real memory",
-that is, normal RAM--see later about other details): 
-
- - CPU untranslated.  This is the "physical" address.  Physical address 
-   0 is what the CPU sees when it drives zeroes on the memory bus.
-
- - CPU translated address. This is the "virtual" address, and is 
-   completely internal to the CPU itself with the CPU doing the appropriate
-   translations into "CPU untranslated". 
-
- - bus address. This is the address of memory as seen by OTHER devices, 
-   not the CPU. Now, in theory there could be many different bus 
-   addresses, with each device seeing memory in some device-specific way, but
-   happily most hardware designers aren't actually actively trying to make
-   things any more complex than necessary, so you can assume that all 
-   external hardware sees the memory the same way. 
-
-Now, on normal PCs the bus address is exactly the same as the physical
-address, and things are very simple indeed. However, they are that simple
-because the memory and the devices share the same address space, and that is
-not generally necessarily true on other PCI/ISA setups. 
-
-Now, just as an example, on the PReP (PowerPC Reference Platform), the 
-CPU sees a memory map something like this (this is from memory)::
-
-	0-2 GB		"real memory"
-	2 GB-3 GB	"system IO" (inb/out and similar accesses on x86)
-	3 GB-4 GB 	"IO memory" (shared memory over the IO bus)
-
-Now, that looks simple enough. However, when you look at the same thing from
-the viewpoint of the devices, you have the reverse, and the physical memory
-address 0 actually shows up as address 2 GB for any IO master.
-
-So when the CPU wants any bus master to write to physical memory 0, it 
-has to give the master address 0x80000000 as the memory address.
-
-So, for example, depending on how the kernel is actually mapped on the 
-PPC, you can end up with a setup like this::
-
- physical address:	0
- virtual address:	0xC0000000
- bus address:		0x80000000
-
-where all the addresses actually point to the same thing.  It's just seen 
-through different translations..
-
-Similarly, on the Alpha, the normal translation is::
-
- physical address:	0
- virtual address:	0xfffffc0000000000
- bus address:		0x40000000
-
-(but there are also Alphas where the physical address and the bus address
-are the same). 
-
-Anyway, the way to look up all these translations, you do::
-
-	#include <asm/io.h>
-
-	phys_addr = virt_to_phys(virt_addr);
-	virt_addr = phys_to_virt(phys_addr);
-	 bus_addr = virt_to_bus(virt_addr);
-	virt_addr = bus_to_virt(bus_addr);
-
-Now, when do you need these?
-
-You want the **virtual** address when you are actually going to access that
-pointer from the kernel. So you can have something like this::
-
-	/*
-	 * this is the hardware "mailbox" we use to communicate with
-	 * the controller. The controller sees this directly.
-	 */
-	struct mailbox {
-		__u32 status;
-		__u32 bufstart;
-		__u32 buflen;
-		..
-	} mbox;
-
-		unsigned char * retbuffer;
-
-		/* get the address from the controller */
-		retbuffer = bus_to_virt(mbox.bufstart);
-		switch (retbuffer[0]) {
-			case STATUS_OK:
-				...
-
-on the other hand, you want the bus address when you have a buffer that 
-you want to give to the controller::
-
-	/* ask the controller to read the sense status into "sense_buffer" */
-	mbox.bufstart = virt_to_bus(&sense_buffer);
-	mbox.buflen = sizeof(sense_buffer);
-	mbox.status = 0;
-	notify_controller(&mbox);
-
-And you generally **never** want to use the physical address, because you can't
-use that from the CPU (the CPU only uses translated virtual addresses), and
-you can't use it from the bus master. 
-
-So why do we care about the physical address at all? We do need the physical
-address in some cases, it's just not very often in normal code.  The physical
-address is needed if you use memory mappings, for example, because the
-"remap_pfn_range()" mm function wants the physical address of the memory to
-be remapped as measured in units of pages, a.k.a. the pfn (the memory
-management layer doesn't know about devices outside the CPU, so it
-shouldn't need to know about "bus addresses" etc).
-
-.. note::
-
-	The above is only one part of the whole equation. The above
-	only talks about "real memory", that is, CPU memory (RAM).
-
-There is a completely different type of memory too, and that's the "shared
-memory" on the PCI or ISA bus. That's generally not RAM (although in the case
-of a video graphics card it can be normal DRAM that is just used for a frame
-buffer), but can be things like a packet buffer in a network card etc. 
-
-This memory is called "PCI memory" or "shared memory" or "IO memory" or
-whatever, and there is only one way to access it: the readb/writeb and
-related functions. You should never take the address of such memory, because
-there is really nothing you can do with such an address: it's not
-conceptually in the same memory space as "real memory" at all, so you cannot
-just dereference a pointer. (Sadly, on x86 it **is** in the same memory space,
-so on x86 it actually works to just deference a pointer, but it's not
-portable). 
-
-For such memory, you can do things like:
-
- - reading::
-
-	/*
-	 * read first 32 bits from ISA memory at 0xC0000, aka
-	 * C000:0000 in DOS terms
-	 */
-	unsigned int signature = isa_readl(0xC0000);
-
- - remapping and writing::
-
-	/*
-	 * remap framebuffer PCI memory area at 0xFC000000,
-	 * size 1MB, so that we can access it: We can directly
-	 * access only the 640k-1MB area, so anything else
-	 * has to be remapped.
-	 */
-	void __iomem *baseptr = ioremap(0xFC000000, 1024*1024);
-
-	/* write a 'A' to the offset 10 of the area */
-	writeb('A',baseptr+10);
-
-	/* unmap when we unload the driver */
-	iounmap(baseptr);
-
- - copying and clearing::
-
-	/* get the 6-byte Ethernet address at ISA address E000:0040 */
-	memcpy_fromio(kernel_buffer, 0xE0040, 6);
-	/* write a packet to the driver */
-	memcpy_toio(0xE1000, skb->data, skb->len);
-	/* clear the frame buffer */
-	memset_io(0xA0000, 0, 0x10000);
-
-OK, that just about covers the basics of accessing IO portably.  Questions?
-Comments? You may think that all the above is overly complex, but one day you
-might find yourself with a 500 MHz Alpha in front of you, and then you'll be
-happy that your driver works ;)
-
-Note that kernel versions 2.0.x (and earlier) mistakenly called the
-ioremap() function "vremap()".  ioremap() is the proper name, but I
-didn't think straight when I wrote it originally.  People who have to
-support both can do something like::
- 
-	/* support old naming silliness */
-	#if LINUX_VERSION_CODE < 0x020100
-	#define ioremap vremap
-	#define iounmap vfree                                                     
-	#endif
- 
-at the top of their source files, and then they can use the right names
-even on 2.0.x systems. 
-
-And the above sounds worse than it really is.  Most real drivers really
-don't do all that complex things (or rather: the complexity is not so
-much in the actual IO accesses as in error handling and timeouts etc). 
-It's generally not hard to fix drivers, and in many cases the code
-actually looks better afterwards::
-
-	unsigned long signature = *(unsigned int *) 0xC0000;
-		vs
-	unsigned long signature = readl(0xC0000);
-
-I think the second version actually is more readable, no?
diff --git a/Documentation/core-api/dma-api-howto.rst b/Documentation/core-api/dma-api-howto.rst
index 358d495456d1..828846804e25 100644
--- a/Documentation/core-api/dma-api-howto.rst
+++ b/Documentation/core-api/dma-api-howto.rst
@@ -707,20 +707,6 @@ to use the dma_sync_*() interfaces::
 		}
 	}
 
-Drivers converted fully to this interface should not use virt_to_bus() any
-longer, nor should they use bus_to_virt(). Some drivers have to be changed a
-little bit, because there is no longer an equivalent to bus_to_virt() in the
-dynamic DMA mapping scheme - you have to always store the DMA addresses
-returned by the dma_alloc_coherent(), dma_pool_alloc(), and dma_map_single()
-calls (dma_map_sg() stores them in the scatterlist itself if the platform
-supports dynamic DMA mapping in hardware) in your driver structures and/or
-in the card registers.
-
-All drivers should be using these interfaces with no exceptions.  It
-is planned to completely remove virt_to_bus() and bus_to_virt() as
-they are entirely deprecated.  Some ports already do not provide these
-as it is impossible to correctly support them.
-
 Handling Errors
 ===============
 
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index dedd4d853329..726065a3095e 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -41,7 +41,6 @@ Library functionality that is used throughout the kernel.
    rbtree
    generic-radix-tree
    packing
-   bus-virt-phys-mapping
    this_cpu_ops
    timekeeping
    errseq
diff --git a/Documentation/translations/zh_CN/core-api/index.rst b/Documentation/translations/zh_CN/core-api/index.rst
index 26d9913fc8b6..c52175fc1b61 100644
--- a/Documentation/translations/zh_CN/core-api/index.rst
+++ b/Documentation/translations/zh_CN/core-api/index.rst
@@ -52,7 +52,6 @@ Todolist:
    circular-buffers
    generic-radix-tree
    packing
-   bus-virt-phys-mapping
    this_cpu_ops
    timekeeping
    errseq
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 7d0d26b5b3f5..97fce7386b00 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -17,7 +17,6 @@ config ALPHA
 	select HAVE_PERF_EVENTS
 	select NEED_DMA_MAP_STATE
 	select NEED_SG_DMA_LENGTH
-	select VIRT_TO_BUS
 	select GENERIC_IRQ_PROBE
 	select GENERIC_PCI_IOMAP
 	select AUTO_IRQ_AFFINITY if SMP
diff --git a/arch/alpha/include/asm/floppy.h b/arch/alpha/include/asm/floppy.h
index 588758685439..64b42d9591fc 100644
--- a/arch/alpha/include/asm/floppy.h
+++ b/arch/alpha/include/asm/floppy.h
@@ -20,7 +20,7 @@
 #define fd_free_dma()           free_dma(FLOPPY_DMA)
 #define fd_clear_dma_ff()       clear_dma_ff(FLOPPY_DMA)
 #define fd_set_dma_mode(mode)   set_dma_mode(FLOPPY_DMA,mode)
-#define fd_set_dma_addr(addr)   set_dma_addr(FLOPPY_DMA,virt_to_bus(addr))
+#define fd_set_dma_addr(addr)   set_dma_addr(FLOPPY_DMA,isa_virt_to_bus(addr))
 #define fd_set_dma_count(count) set_dma_count(FLOPPY_DMA,count)
 #define fd_enable_irq()         enable_irq(FLOPPY_IRQ)
 #define fd_disable_irq()        disable_irq(FLOPPY_IRQ)
diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h
index c9cb554fbe54..d277189b2677 100644
--- a/arch/alpha/include/asm/io.h
+++ b/arch/alpha/include/asm/io.h
@@ -106,15 +106,15 @@ static inline void * phys_to_virt(unsigned long address)
 extern unsigned long __direct_map_base;
 extern unsigned long __direct_map_size;
 
-static inline unsigned long __deprecated virt_to_bus(volatile void *address)
+static inline unsigned long __deprecated isa_virt_to_bus(volatile void *address)
 {
 	unsigned long phys = virt_to_phys(address);
 	unsigned long bus = phys + __direct_map_base;
 	return phys <= __direct_map_size ? bus : 0;
 }
-#define isa_virt_to_bus virt_to_bus
+#define isa_virt_to_bus isa_virt_to_bus
 
-static inline void * __deprecated bus_to_virt(unsigned long address)
+static inline void * __deprecated isa_bus_to_virt(unsigned long address)
 {
 	void *virt;
 
@@ -125,7 +125,7 @@ static inline void * __deprecated bus_to_virt(unsigned long address)
 	virt = phys_to_virt(address);
 	return (long)address <= 0 ? NULL : virt;
 }
-#define isa_bus_to_virt bus_to_virt
+#define isa_bus_to_virt isa_bus_to_virt
 
 /*
  * There are different chipsets to interface the Alpha CPUs to the world.
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index cb93769a9f2a..26ac8ea15a9e 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -39,7 +39,6 @@ config IA64
 	select HAVE_FUNCTION_DESCRIPTORS
 	select HAVE_VIRT_CPU_ACCOUNTING
 	select HUGETLB_PAGE_SIZE_VARIABLE if HUGETLB_PAGE
-	select VIRT_TO_BUS
 	select GENERIC_IRQ_PROBE
 	select GENERIC_PENDING_IRQ if SMP
 	select GENERIC_IRQ_SHOW
diff --git a/arch/ia64/include/asm/io.h b/arch/ia64/include/asm/io.h
index 6d93b923b379..ce66dfc0e719 100644
--- a/arch/ia64/include/asm/io.h
+++ b/arch/ia64/include/asm/io.h
@@ -96,14 +96,6 @@ extern u64 kern_mem_attribute (unsigned long phys_addr, unsigned long size);
 extern int valid_phys_addr_range (phys_addr_t addr, size_t count); /* efi.c */
 extern int valid_mmap_phys_addr_range (unsigned long pfn, size_t count);
 
-/*
- * The following two macros are deprecated and scheduled for removal.
- * Please use the PCI-DMA interface defined in <asm/pci.h> instead.
- */
-#define bus_to_virt	phys_to_virt
-#define virt_to_bus	virt_to_phys
-#define page_to_bus	page_to_phys
-
 # endif /* KERNEL */
 
 /*
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 936cce42ae9a..b06faf6c0b27 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -30,7 +30,6 @@ config M68K
 	select OLD_SIGACTION
 	select OLD_SIGSUSPEND3
 	select UACCESS_MEMCPY if !MMU
-	select VIRT_TO_BUS
 	select ZONE_DMA
 
 config CPU_BIG_ENDIAN
diff --git a/arch/m68k/include/asm/virtconvert.h b/arch/m68k/include/asm/virtconvert.h
index ca91b32dc6ef..0a27905b0036 100644
--- a/arch/m68k/include/asm/virtconvert.h
+++ b/arch/m68k/include/asm/virtconvert.h
@@ -33,9 +33,11 @@ static inline void *phys_to_virt(unsigned long address)
 
 /*
  * IO bus memory addresses are 1:1 with the physical address,
+ * deprecated globally but still used on two machines.
  */
+#if defined(CONFIG_AMIGA) || defined(CONFIG_VME)
 #define virt_to_bus virt_to_phys
-#define bus_to_virt phys_to_virt
+#endif
 
 #endif
 #endif
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 8cf429ad1c84..415182eeb082 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -38,7 +38,6 @@ config MICROBLAZE
 	select OF_EARLY_FLATTREE
 	select PCI_DOMAINS_GENERIC if PCI
 	select PCI_SYSCALL if PCI
-	select VIRT_TO_BUS
 	select CPU_NO_EFFICIENT_FFS
 	select MMU_GATHER_NO_RANGE
 	select SPARSE_IRQ
diff --git a/arch/microblaze/include/asm/io.h b/arch/microblaze/include/asm/io.h
index b6a57f8468f0..c1d78b8977a6 100644
--- a/arch/microblaze/include/asm/io.h
+++ b/arch/microblaze/include/asm/io.h
@@ -30,8 +30,6 @@ extern resource_size_t isa_mem_base;
 #define PCI_IOBASE	((void __iomem *)_IO_BASE)
 #define IO_SPACE_LIMIT (0xFFFFFFFF)
 
-#define page_to_bus(page)	(page_to_phys(page))
-
 extern void iounmap(volatile void __iomem *addr);
 
 extern void __iomem *ioremap(phys_addr_t address, unsigned long size);
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index db09d45d59ec..ff0c76dfbbad 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -100,7 +100,6 @@ config MIPS
 	select RTC_LIB
 	select SYSCTL_EXCEPTION_TRACE
 	select TRACE_IRQFLAGS_SUPPORT
-	select VIRT_TO_BUS
 	select ARCH_HAS_ELFCORE_COMPAT
 	select HAVE_ARCH_KCSAN if 64BIT
 
diff --git a/arch/mips/include/asm/io.h b/arch/mips/include/asm/io.h
index 6f5c86d2bab4..cd9168f34fb7 100644
--- a/arch/mips/include/asm/io.h
+++ b/arch/mips/include/asm/io.h
@@ -147,15 +147,6 @@ static inline void *isa_bus_to_virt(unsigned long address)
 	return phys_to_virt(address);
 }
 
-/*
- * However PCI ones are not necessarily 1:1 and therefore these interfaces
- * are forbidden in portable PCI drivers.
- *
- * Allow them for x86 for legacy drivers, though.
- */
-#define virt_to_bus virt_to_phys
-#define bus_to_virt phys_to_virt
-
 /*
  * Change "struct page" to physical address.
  */
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 5f2448dc5a2b..b0d68e9e2df0 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -43,7 +43,6 @@ config PARISC
 	select SYSCTL_ARCH_UNALIGN_ALLOW
 	select SYSCTL_EXCEPTION_TRACE
 	select HAVE_MOD_ARCH_SPECIFIC
-	select VIRT_TO_BUS
 	select MODULES_USE_ELF_RELA
 	select CLONE_BACKWARDS
 	select TTY # Needed for pdc_cons.c
diff --git a/arch/parisc/include/asm/floppy.h b/arch/parisc/include/asm/floppy.h
index 762cfe7778c0..b318a7df52f6 100644
--- a/arch/parisc/include/asm/floppy.h
+++ b/arch/parisc/include/asm/floppy.h
@@ -179,7 +179,7 @@ static void _fd_chose_dma_mode(char *addr, unsigned long size)
 {
 	if(can_use_virtual_dma == 2) {
 		if((unsigned int) addr >= (unsigned int) high_memory ||
-		   virt_to_bus(addr) >= 0x1000000 ||
+		   virt_to_phys(addr) >= 0x1000000 ||
 		   _CROSS_64KB(addr, size, 0))
 			use_virtual_dma = 1;
 		else
@@ -215,7 +215,7 @@ static int hard_dma_setup(char *addr, unsigned long size, int mode, int io)
 	doing_pdma = 0;
 	clear_dma_ff(FLOPPY_DMA);
 	set_dma_mode(FLOPPY_DMA,mode);
-	set_dma_addr(FLOPPY_DMA,virt_to_bus(addr));
+	set_dma_addr(FLOPPY_DMA,virt_to_phys(addr));
 	set_dma_count(FLOPPY_DMA,size);
 	enable_dma(FLOPPY_DMA);
 	return 0;
diff --git a/arch/parisc/include/asm/io.h b/arch/parisc/include/asm/io.h
index 837ddddbac6a..42ffb60a6ea9 100644
--- a/arch/parisc/include/asm/io.h
+++ b/arch/parisc/include/asm/io.h
@@ -7,8 +7,6 @@
 
 #define virt_to_phys(a) ((unsigned long)__pa(a))
 #define phys_to_virt(a) __va(a)
-#define virt_to_bus virt_to_phys
-#define bus_to_virt phys_to_virt
 
 static inline unsigned long isa_bus_to_virt(unsigned long addr) {
 	BUG();
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index c2ce2e60c8f0..ae8ab95c4389 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -276,7 +276,6 @@ config PPC
 	select SYSCTL_EXCEPTION_TRACE
 	select THREAD_INFO_IN_TASK
 	select TRACE_IRQFLAGS_SUPPORT
-	select VIRT_TO_BUS			if !PPC64
 	#
 	# Please keep this list sorted alphabetically.
 	#
diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index c5a5f7c9b231..73fcd5cdb662 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -985,8 +985,6 @@ static inline void * bus_to_virt(unsigned long address)
 }
 #define bus_to_virt bus_to_virt
 
-#define page_to_bus(page)	(page_to_phys(page) + PCI_DRAM_OFFSET)
-
 #endif /* CONFIG_PPC32 */
 
 /* access ports */
diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
index 1526e410e802..ac70b0fd9a9a 100644
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -167,7 +167,6 @@ extern phys_addr_t __phys_addr_symbol(unsigned long x);
 #define page_to_virt(page)	(pfn_to_virt(page_to_pfn(page)))
 
 #define page_to_phys(page)	(pfn_to_phys(page_to_pfn(page)))
-#define page_to_bus(page)	(page_to_phys(page))
 #define phys_to_page(paddr)	(pfn_to_page(phys_to_pfn(paddr)))
 
 #define sym_to_pfn(x)           __phys_to_pfn(__pa_symbol(x))
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index be0b95e51df6..9a81b867c3bc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -278,7 +278,6 @@ config X86
 	select THREAD_INFO_IN_TASK
 	select TRACE_IRQFLAGS_SUPPORT
 	select USER_STACKTRACE_SUPPORT
-	select VIRT_TO_BUS
 	select HAVE_ARCH_KCSAN			if X86_64
 	select X86_FEATURE_NAMES		if PROC_FS
 	select PROC_PID_ARCH_STATUS		if PROC_FS
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 1870b99c3356..e9025640f634 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -169,15 +169,6 @@ static inline unsigned int isa_virt_to_bus(volatile void *address)
 }
 #define isa_bus_to_virt		phys_to_virt
 
-/*
- * However PCI ones are not necessarily 1:1 and therefore these interfaces
- * are forbidden in portable PCI drivers.
- *
- * Allow them on x86 for legacy drivers, though.
- */
-#define virt_to_bus virt_to_phys
-#define bus_to_virt phys_to_virt
-
 /*
  * The default ioremap() behavior is non-cached; if you need something
  * else, you probably want one of the following.
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 0b0f0172cced..92a24ed738a5 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -50,7 +50,6 @@ config XTENSA
 	select MODULES_USE_ELF_RELA
 	select PERF_USE_VMALLOC
 	select TRACE_IRQFLAGS_SUPPORT
-	select VIRT_TO_BUS
 	help
 	  Xtensa processors are 32-bit RISC machines designed by Tensilica
 	  primarily for embedded systems.  These processors are both
diff --git a/arch/xtensa/include/asm/io.h b/arch/xtensa/include/asm/io.h
index 54188e69b988..a5b707e1c0f4 100644
--- a/arch/xtensa/include/asm/io.h
+++ b/arch/xtensa/include/asm/io.h
@@ -63,9 +63,6 @@ static inline void iounmap(volatile void __iomem *addr)
 		xtensa_iounmap(addr);
 }
 
-#define virt_to_bus     virt_to_phys
-#define bus_to_virt     phys_to_virt
-
 #endif /* CONFIG_MMU */
 
 #include <asm-generic/io.h>
diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index 9c5114335a2d..7197e0642757 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -1142,20 +1142,6 @@ static inline void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
 }
 #endif
 
-#ifdef CONFIG_VIRT_TO_BUS
-#ifndef virt_to_bus
-static inline unsigned long virt_to_bus(void *address)
-{
-	return (unsigned long)address;
-}
-
-static inline void *bus_to_virt(unsigned long address)
-{
-	return (void *)address;
-}
-#endif
-#endif
-
 #ifndef memset_io
 #define memset_io memset_io
 /**
diff --git a/mm/Kconfig b/mm/Kconfig
index 169e64192e48..b7a44b17c79f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -639,14 +639,6 @@ config BOUNCE
 	  memory available to the CPU. Enabled by default when HIGHMEM is
 	  selected, but you may say n to override this.
 
-config VIRT_TO_BUS
-	bool
-	help
-	  An architecture should select this if it implements the
-	  deprecated interface virt_to_bus().  All new architectures
-	  should probably not select this.
-
-
 config MMU_NOTIFIER
 	bool
 	select SRCU
-- 
cgit 


From 0e584d46218e3b9dc12a98e18e81a0cd3e0d5419 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@kernel.org>
Date: Tue, 28 Jun 2022 10:46:22 +0100
Subject: regulator: fix a kernel-doc warning

document n_ramp_values field at struct regulator_desc, in order
to solve this warning:

	include/linux/regulator/driver.h:434: warning: Function parameter or member 'n_ramp_values' not described in 'regulator_desc'

Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
Link: https://lore.kernel.org/r/15efc16e878aa327aa2769023bcdf959a795f41d.1656409369.git.mchehab@kernel.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/driver.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 0228caaa6741..f9a7461e72b8 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -348,6 +348,7 @@ enum regulator_type {
  * @ramp_delay_table:	Table for mapping the regulator ramp-rate values. Values
  *			should be given in units of V/S (uV/uS). See the
  *			regulator_set_ramp_delay_regmap().
+ * @n_ramp_values:	number of elements at @ramp_delay_table.
  *
  * @enable_time: Time taken for initial enable of regulator (in uS).
  * @off_on_delay: guard time (in uS), before re-enabling a regulator
-- 
cgit 


From 1f90307e5f0d7bc9a336ead528f616a5df8e5944 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 19 Jun 2022 08:05:49 +0200
Subject: block: remove QUEUE_FLAG_DEAD

Disallow setting the blk-mq state on any queue that is already dying as
setting the state even then is a bad idea, and remove the now unused
QUEUE_FLAG_DEAD flag.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Link: https://lore.kernel.org/r/20220619060552.1850436-4-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       | 3 ---
 block/blk-mq-debugfs.c | 8 +++-----
 include/linux/blkdev.h | 2 --
 3 files changed, 3 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/block/blk-core.c b/block/blk-core.c
index c2cec402d01c..f86df390afad 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -313,9 +313,6 @@ void blk_cleanup_queue(struct request_queue *q)
 	 * after draining finished.
 	 */
 	blk_freeze_queue(q);
-
-	blk_queue_flag_set(QUEUE_FLAG_DEAD, q);
-
 	blk_sync_queue(q);
 	if (queue_is_mq(q)) {
 		blk_mq_cancel_work_sync(q);
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 4d1ce9ef4318..b80fae7ab1d9 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -116,7 +116,6 @@ static const char *const blk_queue_flag_name[] = {
 	QUEUE_FLAG_NAME(NOXMERGES),
 	QUEUE_FLAG_NAME(ADD_RANDOM),
 	QUEUE_FLAG_NAME(SAME_FORCE),
-	QUEUE_FLAG_NAME(DEAD),
 	QUEUE_FLAG_NAME(INIT_DONE),
 	QUEUE_FLAG_NAME(STABLE_WRITES),
 	QUEUE_FLAG_NAME(POLL),
@@ -151,11 +150,10 @@ static ssize_t queue_state_write(void *data, const char __user *buf,
 	char opbuf[16] = { }, *op;
 
 	/*
-	 * The "state" attribute is removed after blk_cleanup_queue() has called
-	 * blk_mq_free_queue(). Return if QUEUE_FLAG_DEAD has been set to avoid
-	 * triggering a use-after-free.
+	 * The "state" attribute is removed when the queue is removed.  Don't
+	 * allow setting the state on a dying queue to avoid a use-after-free.
 	 */
-	if (blk_queue_dead(q))
+	if (blk_queue_dying(q))
 		return -ENOENT;
 
 	if (count >= sizeof(opbuf)) {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b2d42201bd5d..f4632f4fe884 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -564,7 +564,6 @@ struct request_queue {
 #define QUEUE_FLAG_NOXMERGES	9	/* No extended merges */
 #define QUEUE_FLAG_ADD_RANDOM	10	/* Contributes to random pool */
 #define QUEUE_FLAG_SAME_FORCE	12	/* force complete on same CPU */
-#define QUEUE_FLAG_DEAD		13	/* queue tear-down finished */
 #define QUEUE_FLAG_INIT_DONE	14	/* queue is initialized */
 #define QUEUE_FLAG_STABLE_WRITES 15	/* don't modify blks until WB is done */
 #define QUEUE_FLAG_POLL		16	/* IO polling enabled if set */
@@ -592,7 +591,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_dying(q)	test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
 #define blk_queue_has_srcu(q)	test_bit(QUEUE_FLAG_HAS_SRCU, &(q)->queue_flags)
-#define blk_queue_dead(q)	test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
 #define blk_queue_init_done(q)	test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags)
 #define blk_queue_nomerges(q)	test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
 #define blk_queue_noxmerges(q)	\
-- 
cgit 


From 6f8191fdf41d3a53cc1d63fe2234e812c55a0092 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 19 Jun 2022 08:05:51 +0200
Subject: block: simplify disk shutdown

Set the queue dying flag and call blk_mq_exit_queue from del_gendisk for
all disks that do not have separately allocated queues, and thus remove
the need to call blk_cleanup_queue for them.

Rename blk_cleanup_disk to blk_mq_destroy_queue to make it clear that
this function is intended only for separately allocated blk-mq queues.

This saves an extra queue freeze for devices without a separately
allocated queue.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Link: https://lore.kernel.org/r/20220619060552.1850436-6-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c                    | 37 -------------------------------
 block/blk-mq.c                      | 43 +++++++++++++++++++++++++++++++++++--
 block/blk-sysfs.c                   |  5 -----
 block/blk.h                         |  3 +++
 block/bsg-lib.c                     |  4 ++--
 block/genhd.c                       | 23 +++++++++++---------
 drivers/block/ataflop.c             |  1 -
 drivers/block/loop.c                |  1 -
 drivers/block/mtip32xx/mtip32xx.c   |  2 --
 drivers/block/rnbd/rnbd-clt.c       |  2 +-
 drivers/block/sx8.c                 |  4 ++--
 drivers/block/virtio_blk.c          |  1 -
 drivers/block/z2ram.c               |  1 -
 drivers/cdrom/gdrom.c               |  1 -
 drivers/memstick/core/ms_block.c    |  1 -
 drivers/memstick/core/mspro_block.c |  1 -
 drivers/mmc/core/block.c            |  1 -
 drivers/mmc/core/queue.c            |  1 -
 drivers/nvme/host/apple.c           |  2 +-
 drivers/nvme/host/core.c            |  1 -
 drivers/nvme/host/fc.c              | 12 +++++------
 drivers/nvme/host/pci.c             |  2 +-
 drivers/nvme/host/rdma.c            | 12 +++++------
 drivers/nvme/host/tcp.c             | 12 +++++------
 drivers/nvme/target/loop.c          | 12 +++++------
 drivers/s390/block/dasd.c           |  2 +-
 drivers/s390/block/dasd_genhd.c     |  4 ++--
 drivers/scsi/scsi_lib.c             |  6 +++---
 drivers/scsi/scsi_sysfs.c           |  2 +-
 drivers/scsi/sd.c                   |  4 ++--
 drivers/scsi/sr.c                   |  4 ++--
 drivers/ufs/core/ufshcd.c           |  4 ++--
 include/linux/blk-mq.h              |  3 +++
 include/linux/blkdev.h              |  4 +---
 34 files changed, 105 insertions(+), 113 deletions(-)

(limited to 'include')

diff --git a/block/blk-core.c b/block/blk-core.c
index 04029ffea031..5ad7bd93077c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -284,43 +284,6 @@ void blk_queue_start_drain(struct request_queue *q)
 	wake_up_all(&q->mq_freeze_wq);
 }
 
-/**
- * blk_cleanup_queue - shutdown a request queue
- * @q: request queue to shutdown
- *
- * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and
- * put it.  All future requests will be failed immediately with -ENODEV.
- *
- * Context: can sleep
- */
-void blk_cleanup_queue(struct request_queue *q)
-{
-	/* cannot be called from atomic context */
-	might_sleep();
-
-	WARN_ON_ONCE(blk_queue_registered(q));
-
-	/* mark @q DYING, no new request or merges will be allowed afterwards */
-	blk_queue_flag_set(QUEUE_FLAG_DYING, q);
-	blk_queue_start_drain(q);
-
-	/*
-	 * Drain all requests queued before DYING marking. Set DEAD flag to
-	 * prevent that blk_mq_run_hw_queues() accesses the hardware queues
-	 * after draining finished.
-	 */
-	blk_freeze_queue(q);
-	blk_sync_queue(q);
-	if (queue_is_mq(q)) {
-		blk_mq_cancel_work_sync(q);
-		blk_mq_exit_queue(q);
-	}
-
-	/* @q is and will stay empty, shutdown and put */
-	blk_put_queue(q);
-}
-EXPORT_SYMBOL(blk_cleanup_queue);
-
 /**
  * blk_queue_enter() - try to increase q->q_usage_counter
  * @q: request queue pointer
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 92aae03103b7..b1dbc4b2c2c9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3902,7 +3902,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
 	q->queuedata = queuedata;
 	ret = blk_mq_init_allocated_queue(set, q);
 	if (ret) {
-		blk_cleanup_queue(q);
+		blk_put_queue(q);
 		return ERR_PTR(ret);
 	}
 	return q;
@@ -3914,6 +3914,35 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 }
 EXPORT_SYMBOL(blk_mq_init_queue);
 
+/**
+ * blk_mq_destroy_queue - shutdown a request queue
+ * @q: request queue to shutdown
+ *
+ * This shuts down a request queue allocated by blk_mq_init_queue() and drops
+ * the initial reference.  All future requests will failed with -ENODEV.
+ *
+ * Context: can sleep
+ */
+void blk_mq_destroy_queue(struct request_queue *q)
+{
+	WARN_ON_ONCE(!queue_is_mq(q));
+	WARN_ON_ONCE(blk_queue_registered(q));
+
+	might_sleep();
+
+	blk_queue_flag_set(QUEUE_FLAG_DYING, q);
+	blk_queue_start_drain(q);
+	blk_freeze_queue(q);
+
+	blk_sync_queue(q);
+	blk_mq_cancel_work_sync(q);
+	blk_mq_exit_queue(q);
+
+	/* @q is and will stay empty, shutdown and put */
+	blk_put_queue(q);
+}
+EXPORT_SYMBOL(blk_mq_destroy_queue);
+
 struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
 		struct lock_class_key *lkclass)
 {
@@ -3926,13 +3955,23 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
 
 	disk = __alloc_disk_node(q, set->numa_node, lkclass);
 	if (!disk) {
-		blk_cleanup_queue(q);
+		blk_put_queue(q);
 		return ERR_PTR(-ENOMEM);
 	}
+	set_bit(GD_OWNS_QUEUE, &disk->state);
 	return disk;
 }
 EXPORT_SYMBOL(__blk_mq_alloc_disk);
 
+struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
+		struct lock_class_key *lkclass)
+{
+	if (!blk_get_queue(q))
+		return NULL;
+	return __alloc_disk_node(q, NUMA_NO_NODE, lkclass);
+}
+EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue);
+
 static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
 		struct blk_mq_tag_set *set, struct request_queue *q,
 		int hctx_idx, int node)
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 69e53d1a4f0e..9b211e519de8 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -755,11 +755,6 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
  * decremented with blk_put_queue(). Once the refcount reaches 0 this function
  * is called.
  *
- * For drivers that have a request_queue on a gendisk and added with
- * __device_add_disk() the refcount to request_queue will reach 0 with
- * the last put_disk() called by the driver. For drivers which don't use
- * __device_add_disk() this happens with blk_cleanup_queue().
- *
  * Drivers exist which depend on the release of the request_queue to be
  * synchronous, it should not be deferred.
  *
diff --git a/block/blk.h b/block/blk.h
index 8e79296ee97a..1a0d3e6a4a63 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -424,6 +424,9 @@ int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start,
 		sector_t length);
 void blk_drop_partitions(struct gendisk *disk);
 
+struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
+		struct lock_class_key *lkclass);
+
 int bio_add_hw_page(struct request_queue *q, struct bio *bio,
 		struct page *page, unsigned int len, unsigned int offset,
 		unsigned int max_sectors, bool *same_page);
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index acfe1357bf6c..fd4cd5e68282 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -324,7 +324,7 @@ void bsg_remove_queue(struct request_queue *q)
 			container_of(q->tag_set, struct bsg_set, tag_set);
 
 		bsg_unregister_queue(bset->bd);
-		blk_cleanup_queue(q);
+		blk_mq_destroy_queue(q);
 		blk_mq_free_tag_set(&bset->tag_set);
 		kfree(bset);
 	}
@@ -399,7 +399,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
 
 	return q;
 out_cleanup_queue:
-	blk_cleanup_queue(q);
+	blk_mq_destroy_queue(q);
 out_queue:
 	blk_mq_free_tag_set(set);
 out_tag_set:
diff --git a/block/genhd.c b/block/genhd.c
index 278227ba1d53..4d15f828c449 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -617,6 +617,8 @@ void del_gendisk(struct gendisk *disk)
 	 * Fail any new I/O.
 	 */
 	set_bit(GD_DEAD, &disk->state);
+	if (test_bit(GD_OWNS_QUEUE, &disk->state))
+		blk_queue_flag_set(QUEUE_FLAG_DYING, q);
 	set_capacity(disk, 0);
 
 	/*
@@ -663,11 +665,16 @@ void del_gendisk(struct gendisk *disk)
 	blk_mq_unquiesce_queue(q);
 
 	/*
-	 * Allow using passthrough request again after the queue is torn down.
+	 * If the disk does not own the queue, allow using passthrough requests
+	 * again.  Else leave the queue frozen to fail all I/O.
 	 */
-	blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q);
-	__blk_mq_unfreeze_queue(q, true);
-
+	if (!test_bit(GD_OWNS_QUEUE, &disk->state)) {
+		blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q);
+		__blk_mq_unfreeze_queue(q, true);
+	} else {
+		if (queue_is_mq(q))
+			blk_mq_exit_queue(q);
+	}
 }
 EXPORT_SYMBOL(del_gendisk);
 
@@ -1338,9 +1345,6 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
 {
 	struct gendisk *disk;
 
-	if (!blk_get_queue(q))
-		return NULL;
-
 	disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
 	if (!disk)
 		goto out_put_queue;
@@ -1391,7 +1395,6 @@ out_put_queue:
 	blk_put_queue(q);
 	return NULL;
 }
-EXPORT_SYMBOL(__alloc_disk_node);
 
 struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
 {
@@ -1404,9 +1407,10 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
 
 	disk = __alloc_disk_node(q, node, lkclass);
 	if (!disk) {
-		blk_cleanup_queue(q);
+		blk_put_queue(q);
 		return NULL;
 	}
+	set_bit(GD_OWNS_QUEUE, &disk->state);
 	return disk;
 }
 EXPORT_SYMBOL(__blk_alloc_disk);
@@ -1439,7 +1443,6 @@ EXPORT_SYMBOL(put_disk);
  */
 void blk_cleanup_disk(struct gendisk *disk)
 {
-	blk_cleanup_queue(disk->queue);
 	put_disk(disk);
 }
 EXPORT_SYMBOL(blk_cleanup_disk);
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index e232cc4fd444..c6e41ee18aaa 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -2045,7 +2045,6 @@ static void atari_floppy_cleanup(void)
 			if (!unit[i].disk[type])
 				continue;
 			del_gendisk(unit[i].disk[type]);
-			blk_cleanup_queue(unit[i].disk[type]->queue);
 			put_disk(unit[i].disk[type]);
 		}
 		blk_mq_free_tag_set(&unit[i].tag_set);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 084f9b8a0ba3..cc608226c8c7 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -2057,7 +2057,6 @@ static void loop_remove(struct loop_device *lo)
 {
 	/* Make this loop device unreachable from pathname. */
 	del_gendisk(lo->lo_disk);
-	blk_cleanup_queue(lo->lo_disk->queue);
 	blk_mq_free_tag_set(&lo->tag_set);
 
 	mutex_lock(&loop_ctl_mutex);
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index e7604b3bf8a7..1d0e0a9fdd7c 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3565,7 +3565,6 @@ static int mtip_block_shutdown(struct driver_data *dd)
 	if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag))
 		del_gendisk(dd->disk);
 
-	blk_cleanup_queue(dd->queue);
 	blk_mq_free_tag_set(&dd->tags);
 	put_disk(dd->disk);
 	return 0;
@@ -3914,7 +3913,6 @@ static void mtip_pci_remove(struct pci_dev *pdev)
 		dev_info(&dd->pdev->dev, "device %s surprise removal\n",
 						dd->disk->disk_name);
 
-	blk_cleanup_queue(dd->queue);
 	blk_mq_free_tag_set(&dd->tags);
 
 	/* De-initialize the protocol layer. */
diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c
index 409c76b81aed..a4470374f54f 100644
--- a/drivers/block/rnbd/rnbd-clt.c
+++ b/drivers/block/rnbd/rnbd-clt.c
@@ -1755,7 +1755,7 @@ static void rnbd_destroy_sessions(void)
 		list_for_each_entry_safe(dev, tn, &sess->devs_list, list) {
 			/*
 			 * Here unmap happens in parallel for only one reason:
-			 * blk_cleanup_queue() takes around half a second, so
+			 * del_gendisk() takes around half a second, so
 			 * on huge amount of devices the whole module unload
 			 * procedure takes minutes.
 			 */
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 63b4f6431d2e..75057dbbcfbe 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -1536,7 +1536,7 @@ err_out_free_majors:
 		clear_bit(0, &carm_major_alloc);
 	else if (host->major == 161)
 		clear_bit(1, &carm_major_alloc);
-	blk_cleanup_queue(host->oob_q);
+	blk_mq_destroy_queue(host->oob_q);
 	blk_mq_free_tag_set(&host->tag_set);
 err_out_dma_free:
 	dma_free_coherent(&pdev->dev, CARM_SHM_SIZE, host->shm, host->shm_dma);
@@ -1570,7 +1570,7 @@ static void carm_remove_one (struct pci_dev *pdev)
 		clear_bit(0, &carm_major_alloc);
 	else if (host->major == 161)
 		clear_bit(1, &carm_major_alloc);
-	blk_cleanup_queue(host->oob_q);
+	blk_mq_destroy_queue(host->oob_q);
 	blk_mq_free_tag_set(&host->tag_set);
 	dma_free_coherent(&pdev->dev, CARM_SHM_SIZE, host->shm, host->shm_dma);
 	iounmap(host->mmio);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 6fc7850c2b0a..cff1b6f6b054 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -1111,7 +1111,6 @@ static void virtblk_remove(struct virtio_device *vdev)
 	flush_work(&vblk->config_work);
 
 	del_gendisk(vblk->disk);
-	blk_cleanup_queue(vblk->disk->queue);
 	blk_mq_free_tag_set(&vblk->tag_set);
 
 	mutex_lock(&vblk->vdev_mutex);
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index 7a6ed83481b8..18ad43d9933e 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -384,7 +384,6 @@ static void __exit z2_exit(void)
 
 	for (i = 0; i < Z2MINOR_COUNT; i++) {
 		del_gendisk(z2ram_gendisk[i]);
-		blk_cleanup_queue(z2ram_gendisk[i]->queue);
 		put_disk(z2ram_gendisk[i]);
 	}
 	blk_mq_free_tag_set(&tag_set);
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 8e78b37d0f6a..f4cc90ea6198 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -831,7 +831,6 @@ probe_fail_no_mem:
 
 static int remove_gdrom(struct platform_device *devptr)
 {
-	blk_cleanup_queue(gd.gdrom_rq);
 	blk_mq_free_tag_set(&gd.tag_set);
 	free_irq(HW_EVENT_GDROM_CMD, &gd);
 	free_irq(HW_EVENT_GDROM_DMA, &gd);
diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c
index 3993bdd4b519..ba7e7249a3db 100644
--- a/drivers/memstick/core/ms_block.c
+++ b/drivers/memstick/core/ms_block.c
@@ -2187,7 +2187,6 @@ static void msb_remove(struct memstick_dev *card)
 
 	/* Remove the disk */
 	del_gendisk(msb->disk);
-	blk_cleanup_queue(msb->queue);
 	blk_mq_free_tag_set(&msb->tag_set);
 	msb->queue = NULL;
 
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 725ba74ded30..72e91c06c618 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -1294,7 +1294,6 @@ static void mspro_block_remove(struct memstick_dev *card)
 	del_gendisk(msb->disk);
 	dev_dbg(&card->dev, "mspro block remove\n");
 
-	blk_cleanup_queue(msb->queue);
 	blk_mq_free_tag_set(&msb->tag_set);
 	msb->queue = NULL;
 
diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index f4a1281658db..bda6c67ce93f 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -2509,7 +2509,6 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card,
 	return md;
 
  err_cleanup_queue:
-	blk_cleanup_queue(md->disk->queue);
 	blk_mq_free_tag_set(&md->queue.tag_set);
  err_kfree:
 	kfree(md);
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index fa5324ceeebe..f824cfdab75a 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -494,7 +494,6 @@ void mmc_cleanup_queue(struct mmc_queue *mq)
 	if (blk_queue_quiesced(q))
 		blk_mq_unquiesce_queue(q);
 
-	blk_cleanup_queue(q);
 	blk_mq_free_tag_set(&mq->tag_set);
 
 	/*
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
index d702d7d60235..2d23b7d41f7e 100644
--- a/drivers/nvme/host/apple.c
+++ b/drivers/nvme/host/apple.c
@@ -1502,7 +1502,7 @@ static int apple_nvme_probe(struct platform_device *pdev)
 
 	if (!blk_get_queue(anv->ctrl.admin_q)) {
 		nvme_start_admin_queue(&anv->ctrl);
-		blk_cleanup_queue(anv->ctrl.admin_q);
+		blk_mq_destroy_queue(anv->ctrl.admin_q);
 		anv->ctrl.admin_q = NULL;
 		ret = -ENODEV;
 		goto put_dev;
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index b3d9c29aba1e..4e3a0f7bfc9c 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4103,7 +4103,6 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 	if (!nvme_ns_head_multipath(ns->head))
 		nvme_cdev_del(&ns->cdev, &ns->cdev_device);
 	del_gendisk(ns->disk);
-	blk_cleanup_queue(ns->queue);
 
 	down_write(&ns->ctrl->namespaces_rwsem);
 	list_del_init(&ns->list);
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 3c778bb0c294..a96aa831684c 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2392,7 +2392,7 @@ nvme_fc_ctrl_free(struct kref *ref)
 	unsigned long flags;
 
 	if (ctrl->ctrl.tagset) {
-		blk_cleanup_queue(ctrl->ctrl.connect_q);
+		blk_mq_destroy_queue(ctrl->ctrl.connect_q);
 		blk_mq_free_tag_set(&ctrl->tag_set);
 	}
 
@@ -2402,8 +2402,8 @@ nvme_fc_ctrl_free(struct kref *ref)
 	spin_unlock_irqrestore(&ctrl->rport->lock, flags);
 
 	nvme_start_admin_queue(&ctrl->ctrl);
-	blk_cleanup_queue(ctrl->ctrl.admin_q);
-	blk_cleanup_queue(ctrl->ctrl.fabrics_q);
+	blk_mq_destroy_queue(ctrl->ctrl.admin_q);
+	blk_mq_destroy_queue(ctrl->ctrl.fabrics_q);
 	blk_mq_free_tag_set(&ctrl->admin_tag_set);
 
 	kfree(ctrl->queues);
@@ -2953,7 +2953,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
 out_delete_hw_queues:
 	nvme_fc_delete_hw_io_queues(ctrl);
 out_cleanup_blk_queue:
-	blk_cleanup_queue(ctrl->ctrl.connect_q);
+	blk_mq_destroy_queue(ctrl->ctrl.connect_q);
 out_free_tag_set:
 	blk_mq_free_tag_set(&ctrl->tag_set);
 	nvme_fc_free_io_queues(ctrl);
@@ -3642,9 +3642,9 @@ fail_ctrl:
 	return ERR_PTR(-EIO);
 
 out_cleanup_admin_q:
-	blk_cleanup_queue(ctrl->ctrl.admin_q);
+	blk_mq_destroy_queue(ctrl->ctrl.admin_q);
 out_cleanup_fabrics_q:
-	blk_cleanup_queue(ctrl->ctrl.fabrics_q);
+	blk_mq_destroy_queue(ctrl->ctrl.fabrics_q);
 out_free_admin_tag_set:
 	blk_mq_free_tag_set(&ctrl->admin_tag_set);
 out_free_queues:
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d7b24ee17285..247a74aba336 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1760,7 +1760,7 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev)
 		 * queue to flush these to completion.
 		 */
 		nvme_start_admin_queue(&dev->ctrl);
-		blk_cleanup_queue(dev->ctrl.admin_q);
+		blk_mq_destroy_queue(dev->ctrl.admin_q);
 		blk_mq_free_tag_set(&dev->admin_tagset);
 	}
 }
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index f2a5e1ea508a..0fb7c8e7ab0b 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -840,8 +840,8 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
 		bool remove)
 {
 	if (remove) {
-		blk_cleanup_queue(ctrl->ctrl.admin_q);
-		blk_cleanup_queue(ctrl->ctrl.fabrics_q);
+		blk_mq_destroy_queue(ctrl->ctrl.admin_q);
+		blk_mq_destroy_queue(ctrl->ctrl.fabrics_q);
 		blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
 	}
 	if (ctrl->async_event_sqe.data) {
@@ -935,10 +935,10 @@ out_stop_queue:
 	nvme_cancel_admin_tagset(&ctrl->ctrl);
 out_cleanup_queue:
 	if (new)
-		blk_cleanup_queue(ctrl->ctrl.admin_q);
+		blk_mq_destroy_queue(ctrl->ctrl.admin_q);
 out_cleanup_fabrics_q:
 	if (new)
-		blk_cleanup_queue(ctrl->ctrl.fabrics_q);
+		blk_mq_destroy_queue(ctrl->ctrl.fabrics_q);
 out_free_tagset:
 	if (new)
 		blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
@@ -957,7 +957,7 @@ static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl,
 		bool remove)
 {
 	if (remove) {
-		blk_cleanup_queue(ctrl->ctrl.connect_q);
+		blk_mq_destroy_queue(ctrl->ctrl.connect_q);
 		blk_mq_free_tag_set(ctrl->ctrl.tagset);
 	}
 	nvme_rdma_free_io_queues(ctrl);
@@ -1012,7 +1012,7 @@ out_wait_freeze_timed_out:
 out_cleanup_connect_q:
 	nvme_cancel_tagset(&ctrl->ctrl);
 	if (new)
-		blk_cleanup_queue(ctrl->ctrl.connect_q);
+		blk_mq_destroy_queue(ctrl->ctrl.connect_q);
 out_free_tag_set:
 	if (new)
 		blk_mq_free_tag_set(ctrl->ctrl.tagset);
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index bb67538d241b..b81942fa5f95 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1885,7 +1885,7 @@ static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
 {
 	nvme_tcp_stop_io_queues(ctrl);
 	if (remove) {
-		blk_cleanup_queue(ctrl->connect_q);
+		blk_mq_destroy_queue(ctrl->connect_q);
 		blk_mq_free_tag_set(ctrl->tagset);
 	}
 	nvme_tcp_free_io_queues(ctrl);
@@ -1940,7 +1940,7 @@ out_wait_freeze_timed_out:
 out_cleanup_connect_q:
 	nvme_cancel_tagset(ctrl);
 	if (new)
-		blk_cleanup_queue(ctrl->connect_q);
+		blk_mq_destroy_queue(ctrl->connect_q);
 out_free_tag_set:
 	if (new)
 		blk_mq_free_tag_set(ctrl->tagset);
@@ -1953,8 +1953,8 @@ static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
 {
 	nvme_tcp_stop_queue(ctrl, 0);
 	if (remove) {
-		blk_cleanup_queue(ctrl->admin_q);
-		blk_cleanup_queue(ctrl->fabrics_q);
+		blk_mq_destroy_queue(ctrl->admin_q);
+		blk_mq_destroy_queue(ctrl->fabrics_q);
 		blk_mq_free_tag_set(ctrl->admin_tagset);
 	}
 	nvme_tcp_free_admin_queue(ctrl);
@@ -2012,10 +2012,10 @@ out_stop_queue:
 	nvme_cancel_admin_tagset(ctrl);
 out_cleanup_queue:
 	if (new)
-		blk_cleanup_queue(ctrl->admin_q);
+		blk_mq_destroy_queue(ctrl->admin_q);
 out_cleanup_fabrics_q:
 	if (new)
-		blk_cleanup_queue(ctrl->fabrics_q);
+		blk_mq_destroy_queue(ctrl->fabrics_q);
 out_free_tagset:
 	if (new)
 		blk_mq_free_tag_set(ctrl->admin_tagset);
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 59024af2da2e..0f5c77e22a0a 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -266,8 +266,8 @@ static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl)
 	if (!test_and_clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags))
 		return;
 	nvmet_sq_destroy(&ctrl->queues[0].nvme_sq);
-	blk_cleanup_queue(ctrl->ctrl.admin_q);
-	blk_cleanup_queue(ctrl->ctrl.fabrics_q);
+	blk_mq_destroy_queue(ctrl->ctrl.admin_q);
+	blk_mq_destroy_queue(ctrl->ctrl.fabrics_q);
 	blk_mq_free_tag_set(&ctrl->admin_tag_set);
 }
 
@@ -283,7 +283,7 @@ static void nvme_loop_free_ctrl(struct nvme_ctrl *nctrl)
 	mutex_unlock(&nvme_loop_ctrl_mutex);
 
 	if (nctrl->tagset) {
-		blk_cleanup_queue(ctrl->ctrl.connect_q);
+		blk_mq_destroy_queue(ctrl->ctrl.connect_q);
 		blk_mq_free_tag_set(&ctrl->tag_set);
 	}
 	kfree(ctrl->queues);
@@ -410,9 +410,9 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
 
 out_cleanup_queue:
 	clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags);
-	blk_cleanup_queue(ctrl->ctrl.admin_q);
+	blk_mq_destroy_queue(ctrl->ctrl.admin_q);
 out_cleanup_fabrics_q:
-	blk_cleanup_queue(ctrl->ctrl.fabrics_q);
+	blk_mq_destroy_queue(ctrl->ctrl.fabrics_q);
 out_free_tagset:
 	blk_mq_free_tag_set(&ctrl->admin_tag_set);
 out_free_sq:
@@ -554,7 +554,7 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl)
 	return 0;
 
 out_cleanup_connect_q:
-	blk_cleanup_queue(ctrl->ctrl.connect_q);
+	blk_mq_destroy_queue(ctrl->ctrl.connect_q);
 out_free_tagset:
 	blk_mq_free_tag_set(&ctrl->tag_set);
 out_destroy_queues:
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index ba6d78789660..e8489331f12b 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -3280,7 +3280,7 @@ static int dasd_alloc_queue(struct dasd_block *block)
 static void dasd_free_queue(struct dasd_block *block)
 {
 	if (block->request_queue) {
-		blk_cleanup_queue(block->request_queue);
+		blk_mq_destroy_queue(block->request_queue);
 		blk_mq_free_tag_set(&block->tag_set);
 		block->request_queue = NULL;
 	}
diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c
index a7a33ebf4bbe..5a83f0a39901 100644
--- a/drivers/s390/block/dasd_genhd.c
+++ b/drivers/s390/block/dasd_genhd.c
@@ -41,8 +41,8 @@ int dasd_gendisk_alloc(struct dasd_block *block)
 	if (base->devindex >= DASD_PER_MAJOR)
 		return -EBUSY;
 
-	gdp = __alloc_disk_node(block->request_queue, NUMA_NO_NODE,
-				&dasd_bio_compl_lkclass);
+	gdp = blk_mq_alloc_disk_for_queue(block->request_queue,
+					  &dasd_bio_compl_lkclass);
 	if (!gdp)
 		return -ENOMEM;
 
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 6ffc9e4258a8..cdf0056582d5 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -163,7 +163,7 @@ static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, bool unbusy)
 	 * Requeue this command.  It will go before all other commands
 	 * that are already in the queue. Schedule requeue work under
 	 * lock such that the kblockd_schedule_work() call happens
-	 * before blk_cleanup_queue() finishes.
+	 * before blk_mq_destroy_queue() finishes.
 	 */
 	cmd->result = 0;
 
@@ -424,9 +424,9 @@ static void scsi_starved_list_run(struct Scsi_Host *shost)
 		 * it and the queue.  Mitigate by taking a reference to the
 		 * queue and never touching the sdev again after we drop the
 		 * host lock.  Note: if __scsi_remove_device() invokes
-		 * blk_cleanup_queue() before the queue is run from this
+		 * blk_mq_destroy_queue() before the queue is run from this
 		 * function then blk_run_queue() will return immediately since
-		 * blk_cleanup_queue() marks the queue with QUEUE_FLAG_DYING.
+		 * blk_mq_destroy_queue() marks the queue with QUEUE_FLAG_DYING.
 		 */
 		slq = sdev->request_queue;
 		if (!blk_get_queue(slq))
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 43949798a2e4..aa70d9282161 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -1475,7 +1475,7 @@ void __scsi_remove_device(struct scsi_device *sdev)
 	scsi_device_set_state(sdev, SDEV_DEL);
 	mutex_unlock(&sdev->state_mutex);
 
-	blk_cleanup_queue(sdev->request_queue);
+	blk_mq_destroy_queue(sdev->request_queue);
 	cancel_work_sync(&sdev->requeue_work);
 
 	if (sdev->host->hostt->slave_destroy)
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index a1a2ac09066f..cb587e488601 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -3440,8 +3440,8 @@ static int sd_probe(struct device *dev)
 	if (!sdkp)
 		goto out;
 
-	gd = __alloc_disk_node(sdp->request_queue, NUMA_NO_NODE,
-			       &sd_bio_compl_lkclass);
+	gd = blk_mq_alloc_disk_for_queue(sdp->request_queue,
+					 &sd_bio_compl_lkclass);
 	if (!gd)
 		goto out_free;
 
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 32d3b8274f14..a278b739d0c5 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -624,8 +624,8 @@ static int sr_probe(struct device *dev)
 	if (!cd)
 		goto fail;
 
-	disk = __alloc_disk_node(sdev->request_queue, NUMA_NO_NODE,
-				 &sr_bio_compl_lkclass);
+	disk = blk_mq_alloc_disk_for_queue(sdev->request_queue,
+					   &sr_bio_compl_lkclass);
 	if (!disk)
 		goto fail_free;
 	mutex_init(&cd->lock);
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index ce86d1b790c0..91d8852daaa9 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -9487,7 +9487,7 @@ void ufshcd_remove(struct ufs_hba *hba)
 	ufs_bsg_remove(hba);
 	ufshpb_remove(hba);
 	ufs_sysfs_remove_nodes(hba->dev);
-	blk_cleanup_queue(hba->tmf_queue);
+	blk_mq_destroy_queue(hba->tmf_queue);
 	blk_mq_free_tag_set(&hba->tmf_tag_set);
 	scsi_remove_host(hba->host);
 	/* disable interrupts */
@@ -9783,7 +9783,7 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq)
 	return 0;
 
 free_tmf_queue:
-	blk_cleanup_queue(hba->tmf_queue);
+	blk_mq_destroy_queue(hba->tmf_queue);
 free_tmf_tag_set:
 	blk_mq_free_tag_set(&hba->tmf_tag_set);
 out_remove_scsi_host:
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index e2d9daf7e8dd..0fd96e92c6c6 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -686,10 +686,13 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
 									\
 	__blk_mq_alloc_disk(set, queuedata, &__key);			\
 })
+struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
+		struct lock_class_key *lkclass);
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
 int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 		struct request_queue *q);
 void blk_mq_unregister_dev(struct device *, struct request_queue *);
+void blk_mq_destroy_queue(struct request_queue *);
 
 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
 int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f4632f4fe884..530eeccffda3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -148,6 +148,7 @@ struct gendisk {
 #define GD_NATIVE_CAPACITY		3
 #define GD_ADDED			4
 #define GD_SUPPRESS_PART_SCAN		5
+#define GD_OWNS_QUEUE			6
 
 	struct mutex open_mutex;	/* open/close mutex */
 	unsigned open_partitions;	/* number of open partitions */
@@ -815,8 +816,6 @@ static inline u64 sb_bdev_nr_blocks(struct super_block *sb)
 
 int bdev_disk_changed(struct gendisk *disk, bool invalidate);
 
-struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
-		struct lock_class_key *lkclass);
 void put_disk(struct gendisk *disk);
 struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass);
 
@@ -933,7 +932,6 @@ static inline unsigned int blk_chunk_sectors_left(sector_t offset,
 /*
  * Access functions for manipulating queue properties
  */
-extern void blk_cleanup_queue(struct request_queue *);
 void blk_queue_bounce_limit(struct request_queue *q, enum blk_bounce limit);
 extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int);
-- 
cgit 


From 8b9ab62662048a3274361c7e5f64037c2c133e2c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 19 Jun 2022 08:05:52 +0200
Subject: block: remove blk_cleanup_disk

blk_cleanup_disk is nothing but a trivial wrapper for put_disk now,
so remove it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Link: https://lore.kernel.org/r/20220619060552.1850436-7-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 arch/m68k/emu/nfblock.c             |  4 ++--
 arch/um/drivers/ubd_kern.c          |  4 ++--
 arch/xtensa/platforms/iss/simdisk.c |  4 ++--
 block/genhd.c                       | 15 ---------------
 drivers/block/amiflop.c             |  2 +-
 drivers/block/aoe/aoeblk.c          |  2 +-
 drivers/block/aoe/aoedev.c          |  2 +-
 drivers/block/ataflop.c             |  4 ++--
 drivers/block/brd.c                 |  4 ++--
 drivers/block/drbd/drbd_main.c      |  4 ++--
 drivers/block/floppy.c              |  6 +++---
 drivers/block/loop.c                |  2 +-
 drivers/block/mtip32xx/mtip32xx.c   |  2 +-
 drivers/block/n64cart.c             |  2 +-
 drivers/block/nbd.c                 |  4 ++--
 drivers/block/null_blk/main.c       |  4 ++--
 drivers/block/paride/pcd.c          |  4 ++--
 drivers/block/paride/pd.c           |  4 ++--
 drivers/block/paride/pf.c           |  4 ++--
 drivers/block/pktcdvd.c             |  4 ++--
 drivers/block/ps3disk.c             |  4 ++--
 drivers/block/ps3vram.c             |  4 ++--
 drivers/block/rbd.c                 |  2 +-
 drivers/block/rnbd/rnbd-clt.c       |  4 ++--
 drivers/block/sunvdc.c              |  4 ++--
 drivers/block/swim.c                |  2 +-
 drivers/block/swim3.c               |  2 +-
 drivers/block/sx8.c                 |  2 +-
 drivers/block/virtio_blk.c          |  2 +-
 drivers/block/xen-blkfront.c        |  4 ++--
 drivers/block/z2ram.c               |  2 +-
 drivers/block/zram/zram_drv.c       |  4 ++--
 drivers/cdrom/gdrom.c               |  2 +-
 drivers/md/bcache/super.c           |  2 +-
 drivers/md/dm.c                     |  2 +-
 drivers/md/md.c                     |  4 ++--
 drivers/memstick/core/ms_block.c    |  2 +-
 drivers/memstick/core/mspro_block.c |  2 +-
 drivers/mtd/mtd_blkdevs.c           |  4 ++--
 drivers/mtd/ubi/block.c             |  4 ++--
 drivers/nvdimm/btt.c                |  4 ++--
 drivers/nvdimm/pmem.c               |  4 ++--
 drivers/nvme/host/core.c            |  2 +-
 drivers/nvme/host/multipath.c       |  2 +-
 drivers/s390/block/dcssblk.c        |  8 ++++----
 drivers/s390/block/scm_blk.c        |  4 ++--
 include/linux/blkdev.h              |  1 -
 47 files changed, 74 insertions(+), 90 deletions(-)

(limited to 'include')

diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c
index 267b02cc5655..a708fbd5a844 100644
--- a/arch/m68k/emu/nfblock.c
+++ b/arch/m68k/emu/nfblock.c
@@ -138,7 +138,7 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize)
 	return 0;
 
 out_cleanup_disk:
-	blk_cleanup_disk(dev->disk);
+	put_disk(dev->disk);
 free_dev:
 	kfree(dev);
 out:
@@ -180,7 +180,7 @@ static void __exit nfhd_exit(void)
 	list_for_each_entry_safe(dev, next, &nfhd_list, list) {
 		list_del(&dev->list);
 		del_gendisk(dev->disk);
-		blk_cleanup_disk(dev->disk);
+		put_disk(dev->disk);
 		kfree(dev);
 	}
 	unregister_blkdev(major_num, "nfhd");
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index c4344b67628d..479b79e11442 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -925,7 +925,7 @@ static int ubd_add(int n, char **error_out)
 	return 0;
 
 out_cleanup_disk:
-	blk_cleanup_disk(disk);
+	put_disk(disk);
 out_cleanup_tags:
 	blk_mq_free_tag_set(&ubd_dev->tag_set);
 out:
@@ -1032,7 +1032,7 @@ static int ubd_remove(int n, char **error_out)
 	ubd_gendisk[n] = NULL;
 	if(disk != NULL){
 		del_gendisk(disk);
-		blk_cleanup_disk(disk);
+		put_disk(disk);
 	}
 
 	err = 0;
diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c
index 4255b92fa3eb..f50caaa1c249 100644
--- a/arch/xtensa/platforms/iss/simdisk.c
+++ b/arch/xtensa/platforms/iss/simdisk.c
@@ -290,7 +290,7 @@ static int __init simdisk_setup(struct simdisk *dev, int which,
 	return 0;
 
 out_cleanup_disk:
-	blk_cleanup_disk(dev->gd);
+	put_disk(dev->gd);
 out:
 	return err;
 }
@@ -344,7 +344,7 @@ static void simdisk_teardown(struct simdisk *dev, int which,
 	simdisk_detach(dev);
 	if (dev->gd) {
 		del_gendisk(dev->gd);
-		blk_cleanup_disk(dev->gd);
+		put_disk(dev->gd);
 	}
 	remove_proc_entry(tmp, procdir);
 }
diff --git a/block/genhd.c b/block/genhd.c
index 4d15f828c449..bf9be06af2c8 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1432,21 +1432,6 @@ void put_disk(struct gendisk *disk)
 }
 EXPORT_SYMBOL(put_disk);
 
-/**
- * blk_cleanup_disk - shutdown a gendisk allocated by blk_alloc_disk
- * @disk: gendisk to shutdown
- *
- * Mark the queue hanging off @disk DYING, drain all pending requests, then mark
- * the queue DEAD, destroy and put it and the gendisk structure.
- *
- * Context: can sleep
- */
-void blk_cleanup_disk(struct gendisk *disk)
-{
-	put_disk(disk);
-}
-EXPORT_SYMBOL(blk_cleanup_disk);
-
 static void set_disk_ro_uevent(struct gendisk *gd, int ro)
 {
 	char event[] = "DISK_RO=1";
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 5a566f2fd533..4c8b2ba579ee 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1802,7 +1802,7 @@ static int fd_alloc_disk(int drive, int system)
 	unit[drive].gendisk[system] = disk;
 	err = add_disk(disk);
 	if (err)
-		blk_cleanup_disk(disk);
+		put_disk(disk);
 	return err;
 }
 
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 348adf335217..12b3ca8f6f4a 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -427,7 +427,7 @@ aoeblk_gdalloc(void *vp)
 	return;
 
 out_disk_cleanup:
-	blk_cleanup_disk(gd);
+	put_disk(gd);
 err_tagset:
 	blk_mq_free_tag_set(set);
 err_mempool:
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index b381d1c3ef32..3523dd82d7a0 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -277,7 +277,7 @@ freedev(struct aoedev *d)
 	if (d->gd) {
 		aoedisk_rm_debugfs(d);
 		del_gendisk(d->gd);
-		blk_cleanup_disk(d->gd);
+		put_disk(d->gd);
 		blk_mq_free_tag_set(&d->tag_set);
 	}
 	t = d->targets;
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index c6e41ee18aaa..9deb4df6bdb8 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -2031,7 +2031,7 @@ static void ataflop_probe(dev_t dev)
 	return;
 
 cleanup_disk:
-	blk_cleanup_disk(unit[drive].disk[type]);
+	put_disk(unit[drive].disk[type]);
 	unit[drive].disk[type] = NULL;
 }
 
@@ -2063,7 +2063,7 @@ static void atari_cleanup_floppy_disk(struct atari_floppy_struct *fs)
 			continue;
 		if (fs->registered[type])
 			del_gendisk(fs->disk[type]);
-		blk_cleanup_disk(fs->disk[type]);
+		put_disk(fs->disk[type]);
 	}
 	blk_mq_free_tag_set(&fs->tag_set);
 }
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 6e3f2f0d2352..9e26d5e769f3 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -419,7 +419,7 @@ static int brd_alloc(int i)
 	return 0;
 
 out_cleanup_disk:
-	blk_cleanup_disk(disk);
+	put_disk(disk);
 out_free_dev:
 	list_del(&brd->brd_list);
 	kfree(brd);
@@ -439,7 +439,7 @@ static void brd_cleanup(void)
 
 	list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
 		del_gendisk(brd->brd_disk);
-		blk_cleanup_disk(brd->brd_disk);
+		put_disk(brd->brd_disk);
 		brd_free_pages(brd);
 		list_del(&brd->brd_list);
 		kfree(brd);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 2887350ae010..f3e4db16fd07 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2207,7 +2207,7 @@ void drbd_destroy_device(struct kref *kref)
 	if (device->bitmap) /* should no longer be there. */
 		drbd_bm_cleanup(device);
 	__free_page(device->md_io.page);
-	blk_cleanup_disk(device->vdisk);
+	put_disk(device->vdisk);
 	kfree(device->rs_plan_s);
 
 	/* not for_each_connection(connection, resource):
@@ -2807,7 +2807,7 @@ out_no_minor_idr:
 out_no_bitmap:
 	__free_page(device->md_io.page);
 out_no_io_page:
-	blk_cleanup_disk(disk);
+	put_disk(disk);
 out_no_disk:
 	kref_put(&resource->kref, drbd_destroy_resource);
 	kfree(device);
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 015841f50f4e..491e7205a0db 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4557,7 +4557,7 @@ out:
 	return;
 
 cleanup_disk:
-	blk_cleanup_disk(disks[drive][type]);
+	put_disk(disks[drive][type]);
 	disks[drive][type] = NULL;
 	mutex_unlock(&floppy_probe_lock);
 }
@@ -4753,7 +4753,7 @@ out_put_disk:
 		if (!disks[drive][0])
 			break;
 		del_timer_sync(&motor_off_timer[drive]);
-		blk_cleanup_disk(disks[drive][0]);
+		put_disk(disks[drive][0]);
 		blk_mq_free_tag_set(&tag_sets[drive]);
 	}
 	return err;
@@ -4985,7 +4985,7 @@ static void __exit floppy_module_exit(void)
 		}
 		for (i = 0; i < ARRAY_SIZE(floppy_type); i++) {
 			if (disks[drive][i])
-				blk_cleanup_disk(disks[drive][i]);
+				put_disk(disks[drive][i]);
 		}
 		blk_mq_free_tag_set(&tag_sets[drive]);
 	}
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index cc608226c8c7..e3c0ba93c1a3 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -2040,7 +2040,7 @@ static int loop_add(int i)
 	return i;
 
 out_cleanup_disk:
-	blk_cleanup_disk(disk);
+	put_disk(disk);
 out_cleanup_tags:
 	blk_mq_free_tag_set(&lo->tag_set);
 out_free_idr:
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 1d0e0a9fdd7c..e116c6cf56f5 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3534,7 +3534,7 @@ init_hw_cmds_error:
 disk_index_error:
 	ida_free(&rssd_index_ida, index);
 ida_get_error:
-	blk_cleanup_disk(dd->disk);
+	put_disk(dd->disk);
 block_queue_alloc_init_error:
 	blk_mq_free_tag_set(&dd->tags);
 block_queue_alloc_tag_error:
diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c
index e094d2b8b5a9..d914156db2d8 100644
--- a/drivers/block/n64cart.c
+++ b/drivers/block/n64cart.c
@@ -157,7 +157,7 @@ static int __init n64cart_probe(struct platform_device *pdev)
 	return 0;
 
 out_cleanup_disk:
-	blk_cleanup_disk(disk);
+	put_disk(disk);
 out:
 	return err;
 }
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 07f3c139a3d7..5c4c9c45c6ac 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -250,7 +250,7 @@ static void nbd_dev_remove(struct nbd_device *nbd)
 	struct gendisk *disk = nbd->disk;
 
 	del_gendisk(disk);
-	blk_cleanup_disk(disk);
+	put_disk(disk);
 	blk_mq_free_tag_set(&nbd->tag_set);
 
 	/*
@@ -1833,7 +1833,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
 out_free_work:
 	destroy_workqueue(nbd->recv_workq);
 out_err_disk:
-	blk_cleanup_disk(disk);
+	put_disk(disk);
 out_free_idr:
 	mutex_lock(&nbd_index_mutex);
 	idr_remove(&nbd_index_idr, index);
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 6b67088f4ea7..d695ea29efa6 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -1737,7 +1737,7 @@ static void null_del_dev(struct nullb *nullb)
 		null_restart_queue_async(nullb);
 	}
 
-	blk_cleanup_disk(nullb->disk);
+	put_disk(nullb->disk);
 	if (dev->queue_mode == NULL_Q_MQ &&
 	    nullb->tag_set == &nullb->__tag_set)
 		blk_mq_free_tag_set(nullb->tag_set);
@@ -2082,7 +2082,7 @@ static int null_add_dev(struct nullb_device *dev)
 out_cleanup_zone:
 	null_free_zoned_dev(dev);
 out_cleanup_disk:
-	blk_cleanup_disk(nullb->disk);
+	put_disk(nullb->disk);
 out_cleanup_tags:
 	if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set)
 		blk_mq_free_tag_set(nullb->tag_set);
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index f462ad67931a..a5ab40784119 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -956,7 +956,7 @@ out_unreg_cdrom:
 out_pi_release:
 	pi_release(cd->pi);
 out_free_disk:
-	blk_cleanup_disk(cd->disk);
+	put_disk(cd->disk);
 out_free_tag_set:
 	blk_mq_free_tag_set(&cd->tag_set);
 	return ret;
@@ -1029,7 +1029,7 @@ static void __exit pcd_exit(void)
 		unregister_cdrom(&cd->info);
 		del_gendisk(cd->disk);
 		pi_release(cd->pi);
-		blk_cleanup_disk(cd->disk);
+		put_disk(cd->disk);
 
 		blk_mq_free_tag_set(&cd->tag_set);
 	}
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 3637c38c72f9..c8c14c6f5c3a 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -943,7 +943,7 @@ static int pd_probe_drive(struct pd_unit *disk, int autoprobe, int port,
 		goto cleanup_disk;
 	return 0;
 cleanup_disk:
-	blk_cleanup_disk(disk->gd);
+	put_disk(disk->gd);
 put_disk:
 	put_disk(p);
 	disk->gd = NULL;
@@ -1018,7 +1018,7 @@ static void __exit pd_exit(void)
 		if (p) {
 			disk->gd = NULL;
 			del_gendisk(p);
-			blk_cleanup_disk(p);
+			put_disk(p);
 			blk_mq_free_tag_set(&disk->tag_set);
 			pi_release(disk->pi);
 		}
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index 292e9a4ce1b9..eec1b9fde245 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -975,7 +975,7 @@ static int __init pf_init_unit(struct pf_unit *pf, bool autoprobe, int port,
 out_pi_release:
 	pi_release(pf->pi);
 out_free_disk:
-	blk_cleanup_disk(pf->disk);
+	put_disk(pf->disk);
 out_free_tag_set:
 	blk_mq_free_tag_set(&pf->tag_set);
 	return ret;
@@ -1044,7 +1044,7 @@ static void __exit pf_exit(void)
 		if (!pf->present)
 			continue;
 		del_gendisk(pf->disk);
-		blk_cleanup_disk(pf->disk);
+		put_disk(pf->disk);
 		blk_mq_free_tag_set(&pf->tag_set);
 		pi_release(pf->pi);
 	}
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 789093375344..653d24231483 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2733,7 +2733,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
 	return 0;
 
 out_mem2:
-	blk_cleanup_disk(disk);
+	put_disk(disk);
 out_mem:
 	mempool_exit(&pd->rb_pool);
 	kfree(pd);
@@ -2783,7 +2783,7 @@ static int pkt_remove_dev(dev_t pkt_dev)
 	pkt_dbg(1, pd, "writer unmapped\n");
 
 	del_gendisk(pd->disk);
-	blk_cleanup_disk(pd->disk);
+	put_disk(pd->disk);
 
 	mempool_exit(&pd->rb_pool);
 	kfree(pd);
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 3054adf77460..36d7b36c60c7 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -473,7 +473,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
 
 	return 0;
 fail_cleanup_disk:
-	blk_cleanup_disk(gendisk);
+	put_disk(gendisk);
 fail_free_tag_set:
 	blk_mq_free_tag_set(&priv->tag_set);
 fail_teardown:
@@ -500,7 +500,7 @@ static void ps3disk_remove(struct ps3_system_bus_device *_dev)
 		    &ps3disk_mask);
 	mutex_unlock(&ps3disk_mask_mutex);
 	del_gendisk(priv->gendisk);
-	blk_cleanup_disk(priv->gendisk);
+	put_disk(priv->gendisk);
 	blk_mq_free_tag_set(&priv->tag_set);
 	dev_notice(&dev->sbd.core, "Synchronizing disk cache\n");
 	ps3disk_sync_cache(dev);
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index 4f90819e245e..d1e0fefec90b 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -761,7 +761,7 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev)
 	return 0;
 
 out_cleanup_disk:
-	blk_cleanup_disk(gendisk);
+	put_disk(gendisk);
 out_cache_cleanup:
 	remove_proc_entry(DEVICE_NAME, NULL);
 	ps3vram_cache_cleanup(dev);
@@ -792,7 +792,7 @@ static void ps3vram_remove(struct ps3_system_bus_device *dev)
 	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 
 	del_gendisk(priv->gendisk);
-	blk_cleanup_disk(priv->gendisk);
+	put_disk(priv->gendisk);
 	remove_proc_entry(DEVICE_NAME, NULL);
 	ps3vram_cache_cleanup(dev);
 	iounmap(priv->reports);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index ef9bc62e9afd..0d8ec2fe5740 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -4729,7 +4729,7 @@ static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 static void rbd_free_disk(struct rbd_device *rbd_dev)
 {
-	blk_cleanup_disk(rbd_dev->disk);
+	put_disk(rbd_dev->disk);
 	blk_mq_free_tag_set(&rbd_dev->tag_set);
 	rbd_dev->disk = NULL;
 }
diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c
index a4470374f54f..b8d9e2824d9c 100644
--- a/drivers/block/rnbd/rnbd-clt.c
+++ b/drivers/block/rnbd/rnbd-clt.c
@@ -1408,7 +1408,7 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx)
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue);
 	err = add_disk(dev->gd);
 	if (err)
-		blk_cleanup_disk(dev->gd);
+		put_disk(dev->gd);
 
 	return err;
 }
@@ -1630,7 +1630,7 @@ put_sess:
 static void destroy_gen_disk(struct rnbd_clt_dev *dev)
 {
 	del_gendisk(dev->gd);
-	blk_cleanup_disk(dev->gd);
+	put_disk(dev->gd);
 }
 
 static void destroy_sysfs(struct rnbd_clt_dev *dev,
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index dd0a1a6fed29..fb855da971ee 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -886,7 +886,7 @@ static int probe_disk(struct vdc_port *port)
 	return 0;
 
 out_cleanup_disk:
-	blk_cleanup_disk(g);
+	put_disk(g);
 out_free_tag:
 	blk_mq_free_tag_set(&port->tag_set);
 	return err;
@@ -1070,7 +1070,7 @@ static void vdc_port_remove(struct vio_dev *vdev)
 		del_timer_sync(&port->vio.timer);
 
 		del_gendisk(port->disk);
-		blk_cleanup_disk(port->disk);
+		put_disk(port->disk);
 		blk_mq_free_tag_set(&port->tag_set);
 
 		vdc_free_tx_ring(port);
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index fef65a18d56f..42b4b6828690 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -783,7 +783,7 @@ static void swim_cleanup_floppy_disk(struct floppy_state *fs)
 	if (fs->registered)
 		del_gendisk(fs->disk);
 
-	blk_cleanup_disk(disk);
+	put_disk(disk);
 	blk_mq_free_tag_set(&fs->tag_set);
 }
 
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index 6c39f2c9f806..da811a7da03f 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -1238,7 +1238,7 @@ static int swim3_attach(struct macio_dev *mdev,
 	return 0;
 
 out_cleanup_disk:
-	blk_cleanup_disk(disk);
+	put_disk(disk);
 out_free_tag_set:
 	blk_mq_free_tag_set(&fs->tag_set);
 out_unregister:
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 75057dbbcfbe..0e1a484cab0b 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -1377,7 +1377,7 @@ static void carm_free_disk(struct carm_host *host, unsigned int port_no)
 
 	if (host->state > HST_DEV_ACTIVATE)
 		del_gendisk(disk);
-	blk_cleanup_disk(disk);
+	put_disk(disk);
 }
 
 static int carm_init_shm(struct carm_host *host)
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index cff1b6f6b054..d7d72e8f6e55 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -1089,7 +1089,7 @@ static int virtblk_probe(struct virtio_device *vdev)
 	return 0;
 
 out_cleanup_disk:
-	blk_cleanup_disk(vblk->disk);
+	put_disk(vblk->disk);
 out_free_tags:
 	blk_mq_free_tag_set(&vblk->tag_set);
 out_free_vq:
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 33f04ef78984..3bc80f35418b 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -2384,7 +2384,7 @@ static void blkfront_connect(struct blkfront_info *info)
 
 	err = device_add_disk(&info->xbdev->dev, info->gd, NULL);
 	if (err) {
-		blk_cleanup_disk(info->gd);
+		put_disk(info->gd);
 		blk_mq_free_tag_set(&info->tag_set);
 		info->rq = NULL;
 		goto fail;
@@ -2469,7 +2469,7 @@ static int blkfront_remove(struct xenbus_device *xbdev)
 	blkif_free(info, 0);
 	if (info->gd) {
 		xlbd_release_minors(info->gd->first_minor, info->gd->minors);
-		blk_cleanup_disk(info->gd);
+		put_disk(info->gd);
 		blk_mq_free_tag_set(&info->tag_set);
 	}
 
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index 18ad43d9933e..c1e85f356e4d 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -337,7 +337,7 @@ static int z2ram_register_disk(int minor)
 	z2ram_gendisk[minor] = disk;
 	err = add_disk(disk);
 	if (err)
-		blk_cleanup_disk(disk);
+		put_disk(disk);
 	return err;
 }
 
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index b8549c61ff2c..e5233c911e43 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1957,7 +1957,7 @@ static int zram_add(void)
 	return device_id;
 
 out_cleanup_disk:
-	blk_cleanup_disk(zram->disk);
+	put_disk(zram->disk);
 out_free_idr:
 	idr_remove(&zram_index_idr, device_id);
 out_free_dev:
@@ -2008,7 +2008,7 @@ static int zram_remove(struct zram *zram)
 	 */
 	zram_reset_device(zram);
 
-	blk_cleanup_disk(zram->disk);
+	put_disk(zram->disk);
 	kfree(zram);
 	return 0;
 }
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index f4cc90ea6198..ceded5772aac 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -817,7 +817,7 @@ probe_fail_free_irqs:
 	free_irq(HW_EVENT_GDROM_DMA, &gd);
 	free_irq(HW_EVENT_GDROM_CMD, &gd);
 probe_fail_cleanup_disk:
-	blk_cleanup_disk(gd.disk);
+	put_disk(gd.disk);
 probe_fail_free_tag_set:
 	blk_mq_free_tag_set(&gd.tag_set);
 probe_fail_free_cd_info:
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 3563d15dbaf2..9dd752d272f6 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -884,7 +884,7 @@ static void bcache_device_free(struct bcache_device *d)
 	if (disk) {
 		ida_simple_remove(&bcache_device_idx,
 				  first_minor_to_idx(disk->first_minor));
-		blk_cleanup_disk(disk);
+		put_disk(disk);
 	}
 
 	bioset_exit(&d->bio_split);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4c04a980fcd9..8872f9c63688 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1894,7 +1894,7 @@ static void cleanup_mapped_device(struct mapped_device *md)
 			del_gendisk(md->disk);
 		}
 		dm_queue_destroy_crypto_profile(md->queue);
-		blk_cleanup_disk(md->disk);
+		put_disk(md->disk);
 	}
 
 	if (md->pending_io) {
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c7ecb0bffda0..076255ec9ba1 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5579,7 +5579,7 @@ static void md_free(struct kobject *ko)
 
 	if (mddev->gendisk) {
 		del_gendisk(mddev->gendisk);
-		blk_cleanup_disk(mddev->gendisk);
+		put_disk(mddev->gendisk);
 	}
 	percpu_ref_exit(&mddev->writes_pending);
 
@@ -5718,7 +5718,7 @@ static int md_alloc(dev_t dev, char *name)
 out_del_gendisk:
 	del_gendisk(disk);
 out_cleanup_disk:
-	blk_cleanup_disk(disk);
+	put_disk(disk);
 out_unlock_disks_mutex:
 	mutex_unlock(&disks_mutex);
 	mddev_put(mddev);
diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c
index ba7e7249a3db..ed9a683b3ca8 100644
--- a/drivers/memstick/core/ms_block.c
+++ b/drivers/memstick/core/ms_block.c
@@ -2129,7 +2129,7 @@ static int msb_init_disk(struct memstick_dev *card)
 	return 0;
 
 out_cleanup_disk:
-	blk_cleanup_disk(msb->disk);
+	put_disk(msb->disk);
 out_free_tag_set:
 	blk_mq_free_tag_set(&msb->tag_set);
 out_release_id:
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 72e91c06c618..61cf75d4a01e 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -1209,7 +1209,7 @@ static int mspro_block_init_disk(struct memstick_dev *card)
 	return 0;
 
 out_cleanup_disk:
-	blk_cleanup_disk(msb->disk);
+	put_disk(msb->disk);
 out_free_tag_set:
 	blk_mq_free_tag_set(&msb->tag_set);
 out_release_id:
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index f73172111465..60b222799871 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -29,7 +29,7 @@ static void blktrans_dev_release(struct kref *kref)
 	struct mtd_blktrans_dev *dev =
 		container_of(kref, struct mtd_blktrans_dev, ref);
 
-	blk_cleanup_disk(dev->disk);
+	put_disk(dev->disk);
 	blk_mq_free_tag_set(dev->tag_set);
 	kfree(dev->tag_set);
 	list_del(&dev->list);
@@ -398,7 +398,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
 	return 0;
 
 out_cleanup_disk:
-	blk_cleanup_disk(new->disk);
+	put_disk(new->disk);
 out_free_tag_set:
 	blk_mq_free_tag_set(new->tag_set);
 out_kfree_tag_set:
diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c
index a78fdf3b30f7..4cf67a2a0d04 100644
--- a/drivers/mtd/ubi/block.c
+++ b/drivers/mtd/ubi/block.c
@@ -467,7 +467,7 @@ out_destroy_wq:
 out_remove_minor:
 	idr_remove(&ubiblock_minor_idr, gd->first_minor);
 out_cleanup_disk:
-	blk_cleanup_disk(dev->gd);
+	put_disk(dev->gd);
 out_free_tags:
 	blk_mq_free_tag_set(&dev->tag_set);
 out_free_dev:
@@ -486,7 +486,7 @@ static void ubiblock_cleanup(struct ubiblock *dev)
 	destroy_workqueue(dev->wq);
 	/* Finally destroy the blk queue */
 	dev_info(disk_to_dev(dev->gd), "released");
-	blk_cleanup_disk(dev->gd);
+	put_disk(dev->gd);
 	blk_mq_free_tag_set(&dev->tag_set);
 	idr_remove(&ubiblock_minor_idr, dev->gd->first_minor);
 }
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 9613e54c7a67..5e622c0d4b66 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1548,14 +1548,14 @@ static int btt_blk_init(struct btt *btt)
 	return 0;
 
 out_cleanup_disk:
-	blk_cleanup_disk(btt->btt_disk);
+	put_disk(btt->btt_disk);
 	return rc;
 }
 
 static void btt_blk_cleanup(struct btt *btt)
 {
 	del_gendisk(btt->btt_disk);
-	blk_cleanup_disk(btt->btt_disk);
+	put_disk(btt->btt_disk);
 }
 
 /**
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 629d10fcf53b..a72b81fa3242 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -450,7 +450,7 @@ static void pmem_release_disk(void *__pmem)
 	put_dax(pmem->dax_dev);
 	del_gendisk(pmem->disk);
 
-	blk_cleanup_disk(pmem->disk);
+	put_disk(pmem->disk);
 }
 
 static int pmem_attach_disk(struct device *dev,
@@ -596,7 +596,7 @@ out_cleanup_dax:
 	kill_dax(pmem->dax_dev);
 	put_dax(pmem->dax_dev);
 out:
-	blk_cleanup_disk(pmem->disk);
+	put_disk(pmem->disk);
 	return rc;
 }
 
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 4e3a0f7bfc9c..b5b24998a5ab 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4061,7 +4061,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 	mutex_unlock(&ctrl->subsys->lock);
 	nvme_put_ns_head(ns->head);
  out_cleanup_disk:
-	blk_cleanup_disk(disk);
+	put_disk(disk);
  out_free_ns:
 	kfree(ns);
  out_free_id:
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index d3e2440d8abb..ccf9a6da8f6e 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -853,7 +853,7 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head)
 	/* make sure all pending bios are cleaned up */
 	kblockd_schedule_work(&head->requeue_work);
 	flush_work(&head->requeue_work);
-	blk_cleanup_disk(head->disk);
+	put_disk(head->disk);
 }
 
 void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 8d0d0eaa3059..4d8d1759775a 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -414,7 +414,7 @@ removeseg:
 	kill_dax(dev_info->dax_dev);
 	put_dax(dev_info->dax_dev);
 	del_gendisk(dev_info->gd);
-	blk_cleanup_disk(dev_info->gd);
+	put_disk(dev_info->gd);
 	up_write(&dcssblk_devices_sem);
 
 	if (device_remove_file_self(dev, attr)) {
@@ -712,7 +712,7 @@ out_dax:
 	put_dax(dev_info->dax_dev);
 put_dev:
 	list_del(&dev_info->lh);
-	blk_cleanup_disk(dev_info->gd);
+	put_disk(dev_info->gd);
 	list_for_each_entry(seg_info, &dev_info->seg_list, lh) {
 		segment_unload(seg_info->segment_name);
 	}
@@ -722,7 +722,7 @@ put_dev:
 dev_list_del:
 	list_del(&dev_info->lh);
 release_gd:
-	blk_cleanup_disk(dev_info->gd);
+	put_disk(dev_info->gd);
 	up_write(&dcssblk_devices_sem);
 seg_list_del:
 	if (dev_info == NULL)
@@ -790,7 +790,7 @@ dcssblk_remove_store(struct device *dev, struct device_attribute *attr, const ch
 	kill_dax(dev_info->dax_dev);
 	put_dax(dev_info->dax_dev);
 	del_gendisk(dev_info->gd);
-	blk_cleanup_disk(dev_info->gd);
+	put_disk(dev_info->gd);
 
 	/* unload all related segments */
 	list_for_each_entry(entry, &dev_info->seg_list, lh)
diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c
index 2a9c0ddcade5..0c1df1d5f1ac 100644
--- a/drivers/s390/block/scm_blk.c
+++ b/drivers/s390/block/scm_blk.c
@@ -501,7 +501,7 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev)
 	return 0;
 
 out_cleanup_disk:
-	blk_cleanup_disk(bdev->gendisk);
+	put_disk(bdev->gendisk);
 out_tag:
 	blk_mq_free_tag_set(&bdev->tag_set);
 out:
@@ -512,7 +512,7 @@ out:
 void scm_blk_dev_cleanup(struct scm_blk_dev *bdev)
 {
 	del_gendisk(bdev->gendisk);
-	blk_cleanup_disk(bdev->gendisk);
+	put_disk(bdev->gendisk);
 	blk_mq_free_tag_set(&bdev->tag_set);
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 530eeccffda3..22b12531aeb7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -834,7 +834,6 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass);
 									\
 	__blk_alloc_disk(node_id, &__key);				\
 })
-void blk_cleanup_disk(struct gendisk *disk);
 
 int __register_blkdev(unsigned int major, const char *name,
 		void (*probe)(dev_t devt));
-- 
cgit 


From cc5c516df028b221d94c65c47c5ae8d20f61b6f9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 28 Jun 2022 19:18:45 +0200
Subject: block: simplify blktrace sysfs attribute creation

Add the trace attributes to the default gendisk attributes, just like
we already do for partitions.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20220628171850.1313069-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-sysfs.c            | 11 +----------
 block/blk.h                  |  2 ++
 block/genhd.c                |  3 +++
 block/partitions/core.c      |  1 -
 include/linux/blktrace_api.h | 10 ----------
 kernel/trace/blktrace.c      | 11 -----------
 6 files changed, 6 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 9b211e519de8..5f3f73115988 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -810,21 +810,14 @@ int blk_register_queue(struct gendisk *disk)
 	struct device *dev = disk_to_dev(disk);
 	struct request_queue *q = disk->queue;
 
-	ret = blk_trace_init_sysfs(dev);
-	if (ret)
-		return ret;
-
 	mutex_lock(&q->sysfs_dir_lock);
 
 	ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
-	if (ret < 0) {
-		blk_trace_remove_sysfs(dev);
+	if (ret < 0)
 		goto unlock;
-	}
 
 	ret = sysfs_create_group(&q->kobj, &queue_attr_group);
 	if (ret) {
-		blk_trace_remove_sysfs(dev);
 		kobject_del(&q->kobj);
 		kobject_put(&dev->kobj);
 		goto unlock;
@@ -890,7 +883,6 @@ put_dev:
 	mutex_unlock(&q->sysfs_lock);
 	mutex_unlock(&q->sysfs_dir_lock);
 	kobject_del(&q->kobj);
-	blk_trace_remove_sysfs(dev);
 	kobject_put(&dev->kobj);
 
 	return ret;
@@ -931,7 +923,6 @@ void blk_unregister_queue(struct gendisk *disk)
 	if (queue_is_mq(q))
 		blk_mq_unregister_dev(disk_to_dev(disk), q);
 	blk_crypto_sysfs_unregister(q);
-	blk_trace_remove_sysfs(disk_to_dev(disk));
 
 	mutex_lock(&q->sysfs_lock);
 	elv_unregister_queue(q);
diff --git a/block/blk.h b/block/blk.h
index 1a0d3e6a4a63..74d59435870c 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -452,6 +452,8 @@ extern struct device_attribute dev_attr_events;
 extern struct device_attribute dev_attr_events_async;
 extern struct device_attribute dev_attr_events_poll_msecs;
 
+extern struct attribute_group blk_trace_attr_group;
+
 long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
 long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
 
diff --git a/block/genhd.c b/block/genhd.c
index bf9be06af2c8..b1fb7e058b9c 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1134,6 +1134,9 @@ static struct attribute_group disk_attr_group = {
 
 static const struct attribute_group *disk_attr_groups[] = {
 	&disk_attr_group,
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+	&blk_trace_attr_group,
+#endif
 	NULL
 };
 
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 8a0ec929023b..7dc487f5b03c 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -9,7 +9,6 @@
 #include <linux/slab.h>
 #include <linux/ctype.h>
 #include <linux/vmalloc.h>
-#include <linux/blktrace_api.h>
 #include <linux/raid/detect.h>
 #include "check.h"
 
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 623e22492afa..f6f9b544365a 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -77,10 +77,6 @@ extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 			   char __user *arg);
 extern int blk_trace_startstop(struct request_queue *q, int start);
 extern int blk_trace_remove(struct request_queue *q);
-extern void blk_trace_remove_sysfs(struct device *dev);
-extern int blk_trace_init_sysfs(struct device *dev);
-
-extern struct attribute_group blk_trace_attr_group;
 
 #else /* !CONFIG_BLK_DEV_IO_TRACE */
 # define blk_trace_ioctl(bdev, cmd, arg)		(-ENOTTY)
@@ -91,13 +87,7 @@ extern struct attribute_group blk_trace_attr_group;
 # define blk_trace_remove(q)				(-ENOTTY)
 # define blk_add_trace_msg(q, fmt, ...)			do { } while (0)
 # define blk_add_cgroup_trace_msg(q, cg, fmt, ...)	do { } while (0)
-# define blk_trace_remove_sysfs(dev)			do { } while (0)
 # define blk_trace_note_message_enabled(q)		(false)
-static inline int blk_trace_init_sysfs(struct device *dev)
-{
-	return 0;
-}
-
 #endif /* CONFIG_BLK_DEV_IO_TRACE */
 
 #ifdef CONFIG_COMPAT
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index fe04c6f96ca5..c584effe5fe9 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1867,17 +1867,6 @@ out_unlock_bdev:
 out:
 	return ret ? ret : count;
 }
-
-int blk_trace_init_sysfs(struct device *dev)
-{
-	return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
-}
-
-void blk_trace_remove_sysfs(struct device *dev)
-{
-	sysfs_remove_group(&dev->kobj, &blk_trace_attr_group);
-}
-
 #endif /* CONFIG_BLK_DEV_IO_TRACE */
 
 #ifdef CONFIG_EVENT_TRACING
-- 
cgit 


From 8682b92e5ab852b93739a0f2b261fff4c733be57 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 28 Jun 2022 19:18:50 +0200
Subject: blk-mq: cleanup disk sysfs registration

Pass a gendisk to the sysfs register/unregister functions and give
them descriptive names.  Also move the unregistration helper next
to the one doing the registration.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20220628171850.1313069-7-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-sysfs.c   | 39 ++++++++++++++++++++-------------------
 block/blk-mq.h         |  3 ++-
 block/blk-sysfs.c      |  9 ++++-----
 include/linux/blk-mq.h |  1 -
 4 files changed, 26 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index ee6efe2b250d..93997d297d42 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -203,22 +203,6 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
 	return ret;
 }
 
-void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
-{
-	struct blk_mq_hw_ctx *hctx;
-	unsigned long i;
-
-	lockdep_assert_held(&q->sysfs_dir_lock);
-
-	queue_for_each_hw_ctx(q, hctx, i)
-		blk_mq_unregister_hctx(hctx);
-
-	kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
-	kobject_del(q->mq_kobj);
-
-	q->mq_sysfs_init_done = false;
-}
-
 void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
 {
 	kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
@@ -251,16 +235,16 @@ void blk_mq_sysfs_init(struct request_queue *q)
 	}
 }
 
-int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
+int blk_mq_sysfs_register(struct gendisk *disk)
 {
+	struct request_queue *q = disk->queue;
 	struct blk_mq_hw_ctx *hctx;
 	unsigned long i, j;
 	int ret;
 
-	WARN_ON_ONCE(!q->kobj.parent);
 	lockdep_assert_held(&q->sysfs_dir_lock);
 
-	ret = kobject_add(q->mq_kobj, &dev->kobj, "%s", "mq");
+	ret = kobject_add(q->mq_kobj, &disk_to_dev(disk)->kobj, "mq");
 	if (ret < 0)
 		goto out;
 
@@ -288,6 +272,23 @@ unreg:
 	return ret;
 }
 
+void blk_mq_sysfs_unregister(struct gendisk *disk)
+{
+	struct request_queue *q = disk->queue;
+	struct blk_mq_hw_ctx *hctx;
+	unsigned long i;
+
+	lockdep_assert_held(&q->sysfs_dir_lock);
+
+	queue_for_each_hw_ctx(q, hctx, i)
+		blk_mq_unregister_hctx(hctx);
+
+	kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
+	kobject_del(q->mq_kobj);
+
+	q->mq_sysfs_init_done = false;
+}
+
 void blk_mq_sysfs_unregister_hctxs(struct request_queue *q)
 {
 	struct blk_mq_hw_ctx *hctx;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index a92639f2bfd2..54e20edf0da3 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -118,7 +118,8 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
  */
 extern void blk_mq_sysfs_init(struct request_queue *q);
 extern void blk_mq_sysfs_deinit(struct request_queue *q);
-extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q);
+int blk_mq_sysfs_register(struct gendisk *disk);
+void blk_mq_sysfs_unregister(struct gendisk *disk);
 int blk_mq_sysfs_register_hctxs(struct request_queue *q);
 void blk_mq_sysfs_unregister_hctxs(struct request_queue *q);
 extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b72506770b97..85ea43eff094 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -812,18 +812,17 @@ struct kobj_type blk_queue_ktype = {
  */
 int blk_register_queue(struct gendisk *disk)
 {
-	int ret;
-	struct device *dev = disk_to_dev(disk);
 	struct request_queue *q = disk->queue;
+	int ret;
 
 	mutex_lock(&q->sysfs_dir_lock);
 
-	ret = kobject_add(&q->kobj, &dev->kobj, "%s", "queue");
+	ret = kobject_add(&q->kobj, &disk_to_dev(disk)->kobj, "queue");
 	if (ret < 0)
 		goto unlock;
 
 	if (queue_is_mq(q))
-		__blk_mq_register_dev(dev, q);
+		blk_mq_sysfs_register(disk);
 	mutex_lock(&q->sysfs_lock);
 
 	mutex_lock(&q->debugfs_mutex);
@@ -919,7 +918,7 @@ void blk_unregister_queue(struct gendisk *disk)
 	 * structures that can be modified through sysfs.
 	 */
 	if (queue_is_mq(q))
-		blk_mq_unregister_dev(disk_to_dev(disk), q);
+		blk_mq_sysfs_unregister(disk);
 	blk_crypto_sysfs_unregister(q);
 
 	mutex_lock(&q->sysfs_lock);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 0fd96e92c6c6..43aad0da3305 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -691,7 +691,6 @@ struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
 int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 		struct request_queue *q);
-void blk_mq_unregister_dev(struct device *, struct request_queue *);
 void blk_mq_destroy_queue(struct request_queue *);
 
 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
-- 
cgit 


From 8add9a3a2243166f8f60fc20e876caaf30a333f7 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Tue, 28 Jun 2022 15:18:21 +0100
Subject: efi: Simplify arch_efi_call_virt() macro

Currently, the arch_efi_call_virt() assumes all users of it will have
defined a type 'efi_##f##_t' to make use of it.

Simplify the arch_efi_call_virt() macro by eliminating the explicit
need for efi_##f##_t type for every user of this macro.

Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Acked-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
[ardb: apply Sudeep's ARM fix to i686, Loongarch and RISC-V too]
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm/include/asm/efi.h       |  7 -------
 arch/arm64/include/asm/efi.h     |  7 ++-----
 arch/loongarch/include/asm/efi.h | 16 ++--------------
 arch/riscv/include/asm/efi.h     |  2 --
 arch/x86/include/asm/efi.h       |  3 +--
 include/linux/efi.h              |  2 ++
 6 files changed, 7 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h
index 27218eabbf9a..3088ef72704e 100644
--- a/arch/arm/include/asm/efi.h
+++ b/arch/arm/include/asm/efi.h
@@ -24,13 +24,6 @@ int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md);
 #define arch_efi_call_virt_setup()	efi_virtmap_load()
 #define arch_efi_call_virt_teardown()	efi_virtmap_unload()
 
-#define arch_efi_call_virt(p, f, args...)				\
-({									\
-	efi_##f##_t *__f;						\
-	__f = p->f;							\
-	__f(args);							\
-})
-
 #define ARCH_EFI_IRQ_FLAGS_MASK \
 	(PSR_J_BIT | PSR_E_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | \
 	 PSR_T_BIT | MODE_MASK)
diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
index ad55079abe47..439e2bc5d5d8 100644
--- a/arch/arm64/include/asm/efi.h
+++ b/arch/arm64/include/asm/efi.h
@@ -27,12 +27,9 @@ int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md);
 	__efi_fpsimd_begin();						\
 })
 
+#undef arch_efi_call_virt
 #define arch_efi_call_virt(p, f, args...)				\
-({									\
-	efi_##f##_t *__f;						\
-	__f = p->f;							\
-	__efi_rt_asm_wrapper(__f, #f, args);				\
-})
+	__efi_rt_asm_wrapper((p)->f, #f, args)
 
 #define arch_efi_call_virt_teardown()					\
 ({									\
diff --git a/arch/loongarch/include/asm/efi.h b/arch/loongarch/include/asm/efi.h
index 0127d84d5e1d..9d44c6948be1 100644
--- a/arch/loongarch/include/asm/efi.h
+++ b/arch/loongarch/include/asm/efi.h
@@ -13,20 +13,8 @@ void efifb_setup_from_dmi(struct screen_info *si, const char *opt);
 
 #define ARCH_EFI_IRQ_FLAGS_MASK  0x00000004  /* Bit 2: CSR.CRMD.IE */
 
-#define arch_efi_call_virt_setup()               \
-({                                               \
-})
-
-#define arch_efi_call_virt(p, f, args...)        \
-({                                               \
-	efi_##f##_t * __f;                       \
-	__f = p->f;                              \
-	__f(args);                               \
-})
-
-#define arch_efi_call_virt_teardown()            \
-({                                               \
-})
+#define arch_efi_call_virt_setup()
+#define arch_efi_call_virt_teardown()
 
 #define EFI_ALLOC_ALIGN		SZ_64K
 
diff --git a/arch/riscv/include/asm/efi.h b/arch/riscv/include/asm/efi.h
index cc4f6787f937..f74879a8f1ea 100644
--- a/arch/riscv/include/asm/efi.h
+++ b/arch/riscv/include/asm/efi.h
@@ -23,8 +23,6 @@ int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md);
 #define arch_efi_call_virt_setup()      efi_virtmap_load()
 #define arch_efi_call_virt_teardown()   efi_virtmap_unload()
 
-#define arch_efi_call_virt(p, f, args...) p->f(args)
-
 #define ARCH_EFI_IRQ_FLAGS_MASK (SR_IE | SR_SPIE)
 
 /* Load initrd anywhere in system RAM */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index eb90206eae80..9a63dd1b794c 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -100,8 +100,6 @@ static inline void efi_fpu_end(void)
 	efi_fpu_end();							\
 })
 
-#define arch_efi_call_virt(p, f, args...)	p->f(args)
-
 #else /* !CONFIG_X86_32 */
 
 #define EFI_LOADER_SIGNATURE	"EL64"
@@ -121,6 +119,7 @@ extern asmlinkage u64 __efi_call(void *fp, ...);
 	efi_enter_mm();							\
 })
 
+#undef arch_efi_call_virt
 #define arch_efi_call_virt(p, f, args...) ({				\
 	u64 ret, ibt = ibt_save();					\
 	ret = efi_call((void *)p->f, args);				\
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 93ce85a14a46..9ff63acef1ec 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1200,6 +1200,8 @@ static inline void efi_check_for_embedded_firmwares(void) { }
 
 efi_status_t efi_random_get_seed(void);
 
+#define arch_efi_call_virt(p, f, args...)	((p)->f(args))
+
 /*
  * Arch code can implement the following three template macros, avoiding
  * reptition for the void/non-void return cases of {__,}efi_call_virt():
-- 
cgit 


From 94dfc73e7cf4a31da66b8843f0b9283ddd6b8381 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Wed, 6 Apr 2022 19:36:51 -0500
Subject: treewide: uapi: Replace zero-length arrays with flexible-array
 members
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is a regular need in the kernel to provide a way to declare
having a dynamically sized set of trailing elements in a structure.
Kernel code should always use “flexible array members”[1] for these
cases. The older style of one-element or zero-length arrays should
no longer be used[2].

This code was transformed with the help of Coccinelle:
(linux-5.19-rc2$ spatch --jobs $(getconf _NPROCESSORS_ONLN) --sp-file script.cocci --include-headers --dir . > output.patch)

@@
identifier S, member, array;
type T1, T2;
@@

struct S {
  ...
  T1 member;
  T2 array[
- 0
  ];
};

-fstrict-flex-arrays=3 is coming and we need to land these changes
to prevent issues like these in the short future:

../fs/minix/dir.c:337:3: warning: 'strcpy' will always overflow; destination buffer has size 0,
but the source string has length 2 (including NUL byte) [-Wfortify-source]
		strcpy(de3->name, ".");
		^

Since these are all [0] to [] changes, the risk to UAPI is nearly zero. If
this breaks anything, we can use a union with a new member name.

[1] https://en.wikipedia.org/wiki/Flexible_array_member
[2] https://www.kernel.org/doc/html/v5.16/process/deprecated.html#zero-length-and-one-element-arrays

Link: https://github.com/KSPP/linux/issues/78
Build-tested-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/lkml/62b675ec.wKX6AOZ6cbE71vtF%25lkp@intel.com/
Acked-by: Dan Williams <dan.j.williams@intel.com> # For ndctl.h
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
---
 arch/m68k/include/uapi/asm/bootinfo.h           |  4 +-
 arch/mips/include/uapi/asm/ucontext.h           |  2 +-
 arch/s390/include/uapi/asm/hwctrset.h           |  6 +--
 arch/x86/include/uapi/asm/bootparam.h           |  2 +-
 arch/x86/include/uapi/asm/kvm.h                 | 12 ++---
 include/uapi/drm/i915_drm.h                     |  6 +--
 include/uapi/linux/blkzoned.h                   |  2 +-
 include/uapi/linux/bpf.h                        |  2 +-
 include/uapi/linux/btrfs.h                      | 10 ++--
 include/uapi/linux/btrfs_tree.h                 |  2 +-
 include/uapi/linux/can/bcm.h                    |  2 +-
 include/uapi/linux/connector.h                  |  2 +-
 include/uapi/linux/cycx_cfm.h                   |  2 +-
 include/uapi/linux/dm-ioctl.h                   |  8 +--
 include/uapi/linux/dm-log-userspace.h           |  2 +-
 include/uapi/linux/ethtool.h                    | 28 +++++-----
 include/uapi/linux/fanotify.h                   |  2 +-
 include/uapi/linux/fiemap.h                     |  2 +-
 include/uapi/linux/firewire-cdev.h              | 12 ++---
 include/uapi/linux/fs.h                         |  2 +-
 include/uapi/linux/if_alg.h                     |  2 +-
 include/uapi/linux/if_arcnet.h                  |  6 +--
 include/uapi/linux/if_pppox.h                   |  4 +-
 include/uapi/linux/if_tun.h                     |  2 +-
 include/uapi/linux/igmp.h                       |  6 +--
 include/uapi/linux/inet_diag.h                  |  2 +-
 include/uapi/linux/inotify.h                    |  2 +-
 include/uapi/linux/ip.h                         |  4 +-
 include/uapi/linux/ip_vs.h                      |  4 +-
 include/uapi/linux/iso_fs.h                     |  4 +-
 include/uapi/linux/jffs2.h                      |  8 +--
 include/uapi/linux/kcov.h                       |  2 +-
 include/uapi/linux/kvm.h                        |  8 +--
 include/uapi/linux/minix_fs.h                   |  4 +-
 include/uapi/linux/mmc/ioctl.h                  |  2 +-
 include/uapi/linux/ndctl.h                      | 10 ++--
 include/uapi/linux/net_dropmon.h                |  4 +-
 include/uapi/linux/netfilter/x_tables.h         |  4 +-
 include/uapi/linux/netfilter_arp/arp_tables.h   |  6 +--
 include/uapi/linux/netfilter_bridge/ebt_among.h |  2 +-
 include/uapi/linux/netfilter_ipv4/ip_tables.h   |  6 +--
 include/uapi/linux/netfilter_ipv6/ip6_tables.h  |  4 +-
 include/uapi/linux/perf_event.h                 |  2 +-
 include/uapi/linux/pkt_cls.h                    |  4 +-
 include/uapi/linux/raid/md_p.h                  |  2 +-
 include/uapi/linux/random.h                     |  2 +-
 include/uapi/linux/romfs_fs.h                   |  4 +-
 include/uapi/linux/rtnetlink.h                  |  2 +-
 include/uapi/linux/sctp.h                       | 10 ++--
 include/uapi/linux/seg6.h                       |  2 +-
 include/uapi/linux/seg6_iptunnel.h              |  2 +-
 include/uapi/linux/stm.h                        |  2 +-
 include/uapi/linux/target_core_user.h           |  2 +-
 include/uapi/linux/usb/audio.h                  |  2 +-
 include/uapi/linux/usb/cdc.h                    |  6 +--
 include/uapi/linux/usb/ch9.h                    |  2 +-
 include/uapi/linux/usb/raw_gadget.h             |  4 +-
 include/uapi/linux/usbdevice_fs.h               |  4 +-
 include/uapi/linux/vhost_types.h                |  4 +-
 include/uapi/linux/virtio_9p.h                  |  2 +-
 include/uapi/linux/xfrm.h                       | 10 ++--
 include/uapi/rdma/hfi/hfi1_user.h               |  2 +-
 include/uapi/rdma/ib_user_verbs.h               | 72 ++++++++++++-------------
 include/uapi/rdma/rdma_user_cm.h                |  2 +-
 include/uapi/rdma/rdma_user_ioctl_cmds.h        |  2 +-
 include/uapi/scsi/fc/fc_els.h                   | 18 +++----
 include/uapi/scsi/scsi_bsg_fc.h                 |  2 +-
 include/uapi/sound/asound.h                     |  2 +-
 include/uapi/sound/firewire.h                   |  6 +--
 include/uapi/sound/skl-tplg-interface.h         |  2 +-
 include/uapi/sound/sof/header.h                 |  2 +-
 include/uapi/sound/usb_stream.h                 |  2 +-
 tools/arch/x86/include/uapi/asm/kvm.h           | 12 ++---
 tools/include/uapi/drm/i915_drm.h               |  6 +--
 tools/include/uapi/linux/fs.h                   |  2 +-
 tools/include/uapi/linux/if_tun.h               |  2 +-
 tools/include/uapi/linux/kvm.h                  |  8 +--
 tools/include/uapi/linux/perf_event.h           |  2 +-
 tools/include/uapi/linux/pkt_cls.h              |  4 +-
 tools/include/uapi/linux/seg6.h                 |  4 +-
 tools/include/uapi/linux/usbdevice_fs.h         |  4 +-
 tools/include/uapi/sound/asound.h               |  2 +-
 82 files changed, 216 insertions(+), 216 deletions(-)

(limited to 'include')

diff --git a/arch/m68k/include/uapi/asm/bootinfo.h b/arch/m68k/include/uapi/asm/bootinfo.h
index 203d9cbf9630..95ecf3ae4c49 100644
--- a/arch/m68k/include/uapi/asm/bootinfo.h
+++ b/arch/m68k/include/uapi/asm/bootinfo.h
@@ -34,7 +34,7 @@
 struct bi_record {
 	__be16 tag;			/* tag ID */
 	__be16 size;			/* size of record (in bytes) */
-	__be32 data[0];			/* data */
+	__be32 data[];			/* data */
 };
 
 
@@ -168,7 +168,7 @@ struct bootversion {
 	struct {
 		__be32 machtype;
 		__be32 version;
-	} machversions[0];
+	} machversions[];
 } __packed;
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/mips/include/uapi/asm/ucontext.h b/arch/mips/include/uapi/asm/ucontext.h
index 2d3bf8eebf1f..6122ef97c6ff 100644
--- a/arch/mips/include/uapi/asm/ucontext.h
+++ b/arch/mips/include/uapi/asm/ucontext.h
@@ -60,7 +60,7 @@ struct ucontext {
 	sigset_t		uc_sigmask;
 
 	/* Extended context structures may follow ucontext */
-	unsigned long long	uc_extcontext[0];
+	unsigned long long	uc_extcontext[];
 };
 
 #endif /* __MIPS_UAPI_ASM_UCONTEXT_H */
diff --git a/arch/s390/include/uapi/asm/hwctrset.h b/arch/s390/include/uapi/asm/hwctrset.h
index 3d8284b95f87..e56b9dd23a4b 100644
--- a/arch/s390/include/uapi/asm/hwctrset.h
+++ b/arch/s390/include/uapi/asm/hwctrset.h
@@ -30,18 +30,18 @@ struct s390_ctrset_start {		/* Set CPUs to operate on */
 struct s390_ctrset_setdata {		/* Counter set data */
 	__u32 set;			/* Counter set number */
 	__u32 no_cnts;			/* # of counters stored in cv[] */
-	__u64 cv[0];			/* Counter values (variable length) */
+	__u64 cv[];			/* Counter values (variable length) */
 };
 
 struct s390_ctrset_cpudata {		/* Counter set data per CPU */
 	__u32 cpu_nr;			/* CPU number */
 	__u32 no_sets;			/* # of counters sets in data[] */
-	struct s390_ctrset_setdata data[0];
+	struct s390_ctrset_setdata data[];
 };
 
 struct s390_ctrset_read {		/* Structure to get all ctr sets */
 	__u64 no_cpus;			/* Total # of CPUs data taken from */
-	struct s390_ctrset_cpudata data[0];
+	struct s390_ctrset_cpudata data[];
 };
 
 #define S390_HWCTR_MAGIC	'C'	/* Random magic # for ioctls */
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index bea5cdcdf532..cdd6c7f6cfa6 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -52,7 +52,7 @@ struct setup_data {
 	__u64 next;
 	__u32 type;
 	__u32 len;
-	__u8 data[0];
+	__u8 data[];
 };
 
 /* extensible setup indirect data node */
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 21614807a2cb..ec53c9fa1da9 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -198,13 +198,13 @@ struct kvm_msrs {
 	__u32 nmsrs; /* number of msrs in entries */
 	__u32 pad;
 
-	struct kvm_msr_entry entries[0];
+	struct kvm_msr_entry entries[];
 };
 
 /* for KVM_GET_MSR_INDEX_LIST */
 struct kvm_msr_list {
 	__u32 nmsrs; /* number of msrs in entries */
-	__u32 indices[0];
+	__u32 indices[];
 };
 
 /* Maximum size of any access bitmap in bytes */
@@ -241,7 +241,7 @@ struct kvm_cpuid_entry {
 struct kvm_cpuid {
 	__u32 nent;
 	__u32 padding;
-	struct kvm_cpuid_entry entries[0];
+	struct kvm_cpuid_entry entries[];
 };
 
 struct kvm_cpuid_entry2 {
@@ -263,7 +263,7 @@ struct kvm_cpuid_entry2 {
 struct kvm_cpuid2 {
 	__u32 nent;
 	__u32 padding;
-	struct kvm_cpuid_entry2 entries[0];
+	struct kvm_cpuid_entry2 entries[];
 };
 
 /* for KVM_GET_PIT and KVM_SET_PIT */
@@ -389,7 +389,7 @@ struct kvm_xsave {
 	 * the contents of CPUID leaf 0xD on the host.
 	 */
 	__u32 region[1024];
-	__u32 extra[0];
+	__u32 extra[];
 };
 
 #define KVM_MAX_XCRS	16
@@ -516,7 +516,7 @@ struct kvm_pmu_event_filter {
 	__u32 fixed_counter_bitmap;
 	__u32 flags;
 	__u32 pad[4];
-	__u64 events[0];
+	__u64 events[];
 };
 
 #define KVM_PMU_EVENT_ALLOW 0
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index a2def7b27009..b28ff5d88145 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -2123,7 +2123,7 @@ struct i915_context_engines_load_balance {
 
 	__u64 mbz64; /* reserved for future use; must be zero */
 
-	struct i915_engine_class_instance engines[0];
+	struct i915_engine_class_instance engines[];
 } __attribute__((packed));
 
 #define I915_DEFINE_CONTEXT_ENGINES_LOAD_BALANCE(name__, N__) struct { \
@@ -2161,7 +2161,7 @@ struct i915_context_engines_bond {
 	__u64 flags; /* all undefined flags must be zero */
 	__u64 mbz64[4]; /* reserved for future use; must be zero */
 
-	struct i915_engine_class_instance engines[0];
+	struct i915_engine_class_instance engines[];
 } __attribute__((packed));
 
 #define I915_DEFINE_CONTEXT_ENGINES_BOND(name__, N__) struct { \
@@ -2288,7 +2288,7 @@ struct i915_context_engines_parallel_submit {
 	 * length = width (i) * num_siblings (j)
 	 * index = j + i * num_siblings
 	 */
-	struct i915_engine_class_instance engines[0];
+	struct i915_engine_class_instance engines[];
 
 } __packed;
 
diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
index 656a326821a2..b80fcc9ea525 100644
--- a/include/uapi/linux/blkzoned.h
+++ b/include/uapi/linux/blkzoned.h
@@ -130,7 +130,7 @@ struct blk_zone_report {
 	__u64		sector;
 	__u32		nr_zones;
 	__u32		flags;
-	struct blk_zone zones[0];
+	struct blk_zone zones[];
 };
 
 /**
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f4009dbdf62d..e4b33ba06f00 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -79,7 +79,7 @@ struct bpf_insn {
 /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */
 struct bpf_lpm_trie_key {
 	__u32	prefixlen;	/* up to 32 for AF_INET, 128 for AF_INET6 */
-	__u8	data[0];	/* Arbitrary size */
+	__u8	data[];	/* Arbitrary size */
 };
 
 struct bpf_cgroup_storage_key {
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index d956b2993970..3d0edbe3b991 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -93,7 +93,7 @@ struct btrfs_qgroup_inherit {
 	__u64	num_ref_copies;
 	__u64	num_excl_copies;
 	struct btrfs_qgroup_limit lim;
-	__u64	qgroups[0];
+	__u64	qgroups[];
 };
 
 struct btrfs_ioctl_qgroup_limit_args {
@@ -561,7 +561,7 @@ struct btrfs_ioctl_search_args_v2 {
 	__u64 buf_size;		   /* in - size of buffer
 					    * out - on EOVERFLOW: needed size
 					    *       to store item */
-	__u64 buf[0];                       /* out - found items */
+	__u64 buf[];                       /* out - found items */
 };
 
 struct btrfs_ioctl_clone_range_args {
@@ -632,7 +632,7 @@ struct btrfs_ioctl_same_args {
 	__u16 dest_count;	/* in - total elements in info array */
 	__u16 reserved1;
 	__u32 reserved2;
-	struct btrfs_ioctl_same_extent_info info[0];
+	struct btrfs_ioctl_same_extent_info info[];
 };
 
 struct btrfs_ioctl_space_info {
@@ -644,7 +644,7 @@ struct btrfs_ioctl_space_info {
 struct btrfs_ioctl_space_args {
 	__u64 space_slots;
 	__u64 total_spaces;
-	struct btrfs_ioctl_space_info spaces[0];
+	struct btrfs_ioctl_space_info spaces[];
 };
 
 struct btrfs_data_container {
@@ -652,7 +652,7 @@ struct btrfs_data_container {
 	__u32	bytes_missing;	/* out -- additional bytes needed for result */
 	__u32	elem_cnt;	/* out */
 	__u32	elem_missed;	/* out */
-	__u64	val[0];		/* out */
+	__u64	val[];		/* out */
 };
 
 struct btrfs_ioctl_ino_path_args {
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index d4117152d907..5f32a2a495dc 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -575,7 +575,7 @@ struct btrfs_inode_extref {
 	__le64 parent_objectid;
 	__le64 index;
 	__le16 name_len;
-	__u8   name[0];
+	__u8   name[];
 	/* name goes here */
 } __attribute__ ((__packed__));
 
diff --git a/include/uapi/linux/can/bcm.h b/include/uapi/linux/can/bcm.h
index dd2b925b09ac..f1e45f533a72 100644
--- a/include/uapi/linux/can/bcm.h
+++ b/include/uapi/linux/can/bcm.h
@@ -71,7 +71,7 @@ struct bcm_msg_head {
 	struct bcm_timeval ival1, ival2;
 	canid_t can_id;
 	__u32 nframes;
-	struct can_frame frames[0];
+	struct can_frame frames[];
 };
 
 enum {
diff --git a/include/uapi/linux/connector.h b/include/uapi/linux/connector.h
index 3738936149a2..5ae131c3f145 100644
--- a/include/uapi/linux/connector.h
+++ b/include/uapi/linux/connector.h
@@ -75,7 +75,7 @@ struct cn_msg {
 
 	__u16 len;		/* Length of the following data */
 	__u16 flags;
-	__u8 data[0];
+	__u8 data[];
 };
 
 #endif /* _UAPI__CONNECTOR_H */
diff --git a/include/uapi/linux/cycx_cfm.h b/include/uapi/linux/cycx_cfm.h
index 51f541942ff9..91778c8024b1 100644
--- a/include/uapi/linux/cycx_cfm.h
+++ b/include/uapi/linux/cycx_cfm.h
@@ -91,7 +91,7 @@ struct cycx_firmware {
 	unsigned short	    reserved[6];
 	char		    descr[CFM_DESCR_LEN];
 	struct cycx_fw_info info;
-	unsigned char	    image[0];
+	unsigned char	    image[];
 };
 
 struct cycx_fw_header {
diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index 2e9550fef90f..8c97d75f3104 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -182,7 +182,7 @@ struct dm_target_spec {
 struct dm_target_deps {
 	__u32 count;	/* Array size */
 	__u32 padding;	/* unused */
-	__u64 dev[0];	/* out */
+	__u64 dev[];	/* out */
 };
 
 /*
@@ -192,7 +192,7 @@ struct dm_name_list {
 	__u64 dev;
 	__u32 next;		/* offset to the next record from
 				   the _start_ of this */
-	char name[0];
+	char name[];
 
 	/*
 	 * The following members can be accessed by taking a pointer that
@@ -216,7 +216,7 @@ struct dm_target_versions {
         __u32 next;
         __u32 version[3];
 
-        char name[0];
+        char name[];
 };
 
 /*
@@ -225,7 +225,7 @@ struct dm_target_versions {
 struct dm_target_msg {
 	__u64 sector;	/* Device sector */
 
-	char message[0];
+	char message[];
 };
 
 /*
diff --git a/include/uapi/linux/dm-log-userspace.h b/include/uapi/linux/dm-log-userspace.h
index 5c47a8603376..23dad9565e46 100644
--- a/include/uapi/linux/dm-log-userspace.h
+++ b/include/uapi/linux/dm-log-userspace.h
@@ -426,7 +426,7 @@ struct dm_ulog_request {
 	__u32 request_type;  /* DM_ULOG_* defined above */
 	__u32 data_size;     /* How much data (not including this struct) */
 
-	char data[0];
+	char data[];
 };
 
 #endif /* __DM_LOG_USERSPACE_H__ */
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index e0f0ee9bc89e..2d5741fd44bb 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -257,7 +257,7 @@ struct ethtool_tunable {
 	__u32	id;
 	__u32	type_id;
 	__u32	len;
-	void	*data[0];
+	void	*data[];
 };
 
 #define DOWNSHIFT_DEV_DEFAULT_COUNT	0xff
@@ -322,7 +322,7 @@ struct ethtool_regs {
 	__u32	cmd;
 	__u32	version;
 	__u32	len;
-	__u8	data[0];
+	__u8	data[];
 };
 
 /**
@@ -348,7 +348,7 @@ struct ethtool_eeprom {
 	__u32	magic;
 	__u32	offset;
 	__u32	len;
-	__u8	data[0];
+	__u8	data[];
 };
 
 /**
@@ -752,7 +752,7 @@ struct ethtool_gstrings {
 	__u32	cmd;
 	__u32	string_set;
 	__u32	len;
-	__u8	data[0];
+	__u8	data[];
 };
 
 /**
@@ -777,7 +777,7 @@ struct ethtool_sset_info {
 	__u32	cmd;
 	__u32	reserved;
 	__u64	sset_mask;
-	__u32	data[0];
+	__u32	data[];
 };
 
 /**
@@ -817,7 +817,7 @@ struct ethtool_test {
 	__u32	flags;
 	__u32	reserved;
 	__u32	len;
-	__u64	data[0];
+	__u64	data[];
 };
 
 /**
@@ -834,7 +834,7 @@ struct ethtool_test {
 struct ethtool_stats {
 	__u32	cmd;
 	__u32	n_stats;
-	__u64	data[0];
+	__u64	data[];
 };
 
 /**
@@ -851,7 +851,7 @@ struct ethtool_stats {
 struct ethtool_perm_addr {
 	__u32	cmd;
 	__u32	size;
-	__u8	data[0];
+	__u8	data[];
 };
 
 /* boolean flags controlling per-interface behavior characteristics.
@@ -1160,7 +1160,7 @@ struct ethtool_rxnfc {
 struct ethtool_rxfh_indir {
 	__u32	cmd;
 	__u32	size;
-	__u32	ring_index[0];
+	__u32	ring_index[];
 };
 
 /**
@@ -1201,7 +1201,7 @@ struct ethtool_rxfh {
 	__u8	hfunc;
 	__u8	rsvd8[3];
 	__u32	rsvd32;
-	__u32   rss_config[0];
+	__u32   rss_config[];
 };
 #define ETH_RXFH_CONTEXT_ALLOC		0xffffffff
 #define ETH_RXFH_INDIR_NO_CHANGE	0xffffffff
@@ -1286,7 +1286,7 @@ struct ethtool_dump {
 	__u32	version;
 	__u32	flag;
 	__u32	len;
-	__u8	data[0];
+	__u8	data[];
 };
 
 #define ETH_FW_DUMP_DISABLE 0
@@ -1318,7 +1318,7 @@ struct ethtool_get_features_block {
 struct ethtool_gfeatures {
 	__u32	cmd;
 	__u32	size;
-	struct ethtool_get_features_block features[0];
+	struct ethtool_get_features_block features[];
 };
 
 /**
@@ -1340,7 +1340,7 @@ struct ethtool_set_features_block {
 struct ethtool_sfeatures {
 	__u32	cmd;
 	__u32	size;
-	struct ethtool_set_features_block features[0];
+	struct ethtool_set_features_block features[];
 };
 
 /**
@@ -2087,7 +2087,7 @@ struct ethtool_link_settings {
 	__u8	master_slave_state;
 	__u8	reserved1[1];
 	__u32	reserved[7];
-	__u32	link_mode_masks[0];
+	__u32	link_mode_masks[];
 	/* layout of link_mode_masks fields:
 	 * __u32 map_supported[link_mode_masks_nwords];
 	 * __u32 map_advertising[link_mode_masks_nwords];
diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
index f1f89132d60e..197df344307d 100644
--- a/include/uapi/linux/fanotify.h
+++ b/include/uapi/linux/fanotify.h
@@ -162,7 +162,7 @@ struct fanotify_event_info_fid {
 	 * Following is an opaque struct file_handle that can be passed as
 	 * an argument to open_by_handle_at(2).
 	 */
-	unsigned char handle[0];
+	unsigned char handle[];
 };
 
 /*
diff --git a/include/uapi/linux/fiemap.h b/include/uapi/linux/fiemap.h
index 07c1cdcb715e..24ca0c00cae3 100644
--- a/include/uapi/linux/fiemap.h
+++ b/include/uapi/linux/fiemap.h
@@ -34,7 +34,7 @@ struct fiemap {
 	__u32 fm_mapped_extents;/* number of extents that were mapped (out) */
 	__u32 fm_extent_count;  /* size of fm_extents array (in) */
 	__u32 fm_reserved;
-	struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
+	struct fiemap_extent fm_extents[]; /* array of mapped extents (out) */
 };
 
 #define FIEMAP_MAX_OFFSET	(~0ULL)
diff --git a/include/uapi/linux/firewire-cdev.h b/include/uapi/linux/firewire-cdev.h
index 5effa9832802..92be3ea3c6e0 100644
--- a/include/uapi/linux/firewire-cdev.h
+++ b/include/uapi/linux/firewire-cdev.h
@@ -118,7 +118,7 @@ struct fw_cdev_event_response {
 	__u32 type;
 	__u32 rcode;
 	__u32 length;
-	__u32 data[0];
+	__u32 data[];
 };
 
 /**
@@ -142,7 +142,7 @@ struct fw_cdev_event_request {
 	__u64 offset;
 	__u32 handle;
 	__u32 length;
-	__u32 data[0];
+	__u32 data[];
 };
 
 /**
@@ -205,7 +205,7 @@ struct fw_cdev_event_request2 {
 	__u32 generation;
 	__u32 handle;
 	__u32 length;
-	__u32 data[0];
+	__u32 data[];
 };
 
 /**
@@ -265,7 +265,7 @@ struct fw_cdev_event_iso_interrupt {
 	__u32 type;
 	__u32 cycle;
 	__u32 header_length;
-	__u32 header[0];
+	__u32 header[];
 };
 
 /**
@@ -355,7 +355,7 @@ struct fw_cdev_event_phy_packet {
 	__u32 type;
 	__u32 rcode;
 	__u32 length;
-	__u32 data[0];
+	__u32 data[];
 };
 
 /**
@@ -803,7 +803,7 @@ struct fw_cdev_set_iso_channels {
  */
 struct fw_cdev_iso_packet {
 	__u32 control;
-	__u32 header[0];
+	__u32 header[];
 };
 
 /**
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index bdf7b404b3e7..b7b56871029c 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -90,7 +90,7 @@ struct file_dedupe_range {
 	__u16 dest_count;	/* in - total elements in info array */
 	__u16 reserved1;	/* must be zero */
 	__u32 reserved2;	/* must be zero */
-	struct file_dedupe_range_info info[0];
+	struct file_dedupe_range_info info[];
 };
 
 /* And dynamically-tunable limits and defaults: */
diff --git a/include/uapi/linux/if_alg.h b/include/uapi/linux/if_alg.h
index dc52a11ba6d1..578b18aab821 100644
--- a/include/uapi/linux/if_alg.h
+++ b/include/uapi/linux/if_alg.h
@@ -42,7 +42,7 @@ struct sockaddr_alg_new {
 
 struct af_alg_iv {
 	__u32	ivlen;
-	__u8	iv[0];
+	__u8	iv[];
 };
 
 /* Socket options */
diff --git a/include/uapi/linux/if_arcnet.h b/include/uapi/linux/if_arcnet.h
index 683878036d76..b122cfac7128 100644
--- a/include/uapi/linux/if_arcnet.h
+++ b/include/uapi/linux/if_arcnet.h
@@ -60,7 +60,7 @@ struct arc_rfc1201 {
 	__u8  proto;		/* protocol ID field - varies		*/
 	__u8  split_flag;	/* for use with split packets		*/
 	__be16   sequence;	/* sequence number			*/
-	__u8  payload[0];	/* space remaining in packet (504 bytes)*/
+	__u8  payload[];	/* space remaining in packet (504 bytes)*/
 };
 #define RFC1201_HDR_SIZE 4
 
@@ -69,7 +69,7 @@ struct arc_rfc1201 {
  */
 struct arc_rfc1051 {
 	__u8 proto;		/* ARC_P_RFC1051_ARP/RFC1051_IP	*/
-	__u8 payload[0];	/* 507 bytes			*/
+	__u8 payload[];	/* 507 bytes			*/
 };
 #define RFC1051_HDR_SIZE 1
 
@@ -80,7 +80,7 @@ struct arc_rfc1051 {
 struct arc_eth_encap {
 	__u8 proto;		/* Always ARC_P_ETHER			*/
 	struct ethhdr eth;	/* standard ethernet header (yuck!)	*/
-	__u8 payload[0];	/* 493 bytes				*/
+	__u8 payload[];	/* 493 bytes				*/
 };
 #define ETH_ENCAP_HDR_SIZE 14
 
diff --git a/include/uapi/linux/if_pppox.h b/include/uapi/linux/if_pppox.h
index e7a693c28f16..9abd80dcc46f 100644
--- a/include/uapi/linux/if_pppox.h
+++ b/include/uapi/linux/if_pppox.h
@@ -122,7 +122,7 @@ struct sockaddr_pppol2tpv3in6 {
 struct pppoe_tag {
 	__be16 tag_type;
 	__be16 tag_len;
-	char tag_data[0];
+	char tag_data[];
 } __attribute__ ((packed));
 
 /* Tag identifiers */
@@ -150,7 +150,7 @@ struct pppoe_hdr {
 	__u8 code;
 	__be16 sid;
 	__be16 length;
-	struct pppoe_tag tag[0];
+	struct pppoe_tag tag[];
 } __packed;
 
 /* Length of entire PPPoE + PPP header */
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 454ae31b93c7..2ec07de1d73b 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -108,7 +108,7 @@ struct tun_pi {
 struct tun_filter {
 	__u16  flags; /* TUN_FLT_ flags see above */
 	__u16  count; /* Number of addresses */
-	__u8   addr[0][ETH_ALEN];
+	__u8   addr[][ETH_ALEN];
 };
 
 #endif /* _UAPI__IF_TUN_H */
diff --git a/include/uapi/linux/igmp.h b/include/uapi/linux/igmp.h
index 90c28bc466c6..5930f2437cd1 100644
--- a/include/uapi/linux/igmp.h
+++ b/include/uapi/linux/igmp.h
@@ -48,7 +48,7 @@ struct igmpv3_grec {
 	__u8	grec_auxwords;
 	__be16	grec_nsrcs;
 	__be32	grec_mca;
-	__be32	grec_src[0];
+	__be32	grec_src[];
 };
 
 struct igmpv3_report {
@@ -57,7 +57,7 @@ struct igmpv3_report {
 	__sum16 csum;
 	__be16 resv2;
 	__be16 ngrec;
-	struct igmpv3_grec grec[0];
+	struct igmpv3_grec grec[];
 };
 
 struct igmpv3_query {
@@ -78,7 +78,7 @@ struct igmpv3_query {
 #endif
 	__u8 qqic;
 	__be16 nsrcs;
-	__be32 srcs[0];
+	__be32 srcs[];
 };
 
 #define IGMP_HOST_MEMBERSHIP_QUERY	0x11	/* From RFC1112 */
diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index 20ee93f0f876..50655de04c9b 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -104,7 +104,7 @@ struct inet_diag_hostcond {
 	__u8	family;
 	__u8	prefix_len;
 	int	port;
-	__be32	addr[0];
+	__be32	addr[];
 };
 
 struct inet_diag_markcond {
diff --git a/include/uapi/linux/inotify.h b/include/uapi/linux/inotify.h
index 884b4846b630..b3e165853d5b 100644
--- a/include/uapi/linux/inotify.h
+++ b/include/uapi/linux/inotify.h
@@ -23,7 +23,7 @@ struct inotify_event {
 	__u32		mask;		/* watch mask */
 	__u32		cookie;		/* cookie to synchronize two events */
 	__u32		len;		/* length (including nulls) of name */
-	char		name[0];	/* stub for possible name */
+	char		name[];	/* stub for possible name */
 };
 
 /* the following are legal, implemented events that user-space can watch for */
diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h
index e00bbb9c47bb..961ec16a26b8 100644
--- a/include/uapi/linux/ip.h
+++ b/include/uapi/linux/ip.h
@@ -112,13 +112,13 @@ struct ip_auth_hdr {
 	__be16 reserved;
 	__be32 spi;
 	__be32 seq_no;		/* Sequence number */
-	__u8  auth_data[0];	/* Variable len but >=4. Mind the 64 bit alignment! */
+	__u8  auth_data[];	/* Variable len but >=4. Mind the 64 bit alignment! */
 };
 
 struct ip_esp_hdr {
 	__be32 spi;
 	__be32 seq_no;		/* Sequence number */
-	__u8  enc_data[0];	/* Variable len but >=8. Mind the 64 bit alignment! */
+	__u8  enc_data[];	/* Variable len but >=8. Mind the 64 bit alignment! */
 };
 
 struct ip_comp_hdr {
diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
index 4102ddcb4e14..1ed234e7f251 100644
--- a/include/uapi/linux/ip_vs.h
+++ b/include/uapi/linux/ip_vs.h
@@ -254,7 +254,7 @@ struct ip_vs_get_dests {
 	unsigned int		num_dests;
 
 	/* the real servers */
-	struct ip_vs_dest_entry	entrytable[0];
+	struct ip_vs_dest_entry	entrytable[];
 };
 
 
@@ -264,7 +264,7 @@ struct ip_vs_get_services {
 	unsigned int		num_services;
 
 	/* service table */
-	struct ip_vs_service_entry entrytable[0];
+	struct ip_vs_service_entry entrytable[];
 };
 
 
diff --git a/include/uapi/linux/iso_fs.h b/include/uapi/linux/iso_fs.h
index a2555176f6d1..758178f5b52d 100644
--- a/include/uapi/linux/iso_fs.h
+++ b/include/uapi/linux/iso_fs.h
@@ -137,7 +137,7 @@ struct iso_path_table{
 	__u8  name_len[2];	/* 721 */
 	__u8  extent[4];	/* 731 */
 	__u8  parent[2];	/* 721 */
-	char name[0];
+	char name[];
 } __attribute__((packed));
 
 /* high sierra is identical to iso, except that the date is only 6 bytes, and
@@ -154,7 +154,7 @@ struct iso_directory_record {
 	__u8 interleave			[ISODCL (28, 28)]; /* 711 */
 	__u8 volume_sequence_number	[ISODCL (29, 32)]; /* 723 */
 	__u8 name_len			[ISODCL (33, 33)]; /* 711 */
-	char name			[0];
+	char name			[];
 } __attribute__((packed));
 
 #define ISOFS_BLOCK_BITS 11
diff --git a/include/uapi/linux/jffs2.h b/include/uapi/linux/jffs2.h
index 784ba0b9690a..637ee4a793cf 100644
--- a/include/uapi/linux/jffs2.h
+++ b/include/uapi/linux/jffs2.h
@@ -123,7 +123,7 @@ struct jffs2_raw_dirent
 	__u8 unused[2];
 	jint32_t node_crc;
 	jint32_t name_crc;
-	__u8 name[0];
+	__u8 name[];
 };
 
 /* The JFFS2 raw inode structure: Used for storage on physical media.  */
@@ -155,7 +155,7 @@ struct jffs2_raw_inode
 	jint16_t flags;	     /* See JFFS2_INO_FLAG_* */
 	jint32_t data_crc;   /* CRC for the (compressed) data.  */
 	jint32_t node_crc;   /* CRC for the raw inode (excluding data)  */
-	__u8 data[0];
+	__u8 data[];
 };
 
 struct jffs2_raw_xattr {
@@ -170,7 +170,7 @@ struct jffs2_raw_xattr {
 	jint16_t value_len;
 	jint32_t data_crc;
 	jint32_t node_crc;
-	__u8 data[0];
+	__u8 data[];
 } __attribute__((packed));
 
 struct jffs2_raw_xref
@@ -196,7 +196,7 @@ struct jffs2_raw_summary
 	jint32_t padded;	/* sum of the size of padding nodes */
 	jint32_t sum_crc;	/* summary information crc */
 	jint32_t node_crc; 	/* node crc */
-	jint32_t sum[0]; 	/* inode summary info */
+	jint32_t sum[]; 	/* inode summary info */
 };
 
 union jffs2_node_union
diff --git a/include/uapi/linux/kcov.h b/include/uapi/linux/kcov.h
index 1d0350e44ae3..ed95dba9fa37 100644
--- a/include/uapi/linux/kcov.h
+++ b/include/uapi/linux/kcov.h
@@ -13,7 +13,7 @@ struct kcov_remote_arg {
 	__u32		area_size;	/* Length of coverage buffer in words */
 	__u32		num_handles;	/* Size of handles array */
 	__aligned_u64	common_handle;
-	__aligned_u64	handles[0];
+	__aligned_u64	handles[];
 };
 
 #define KCOV_REMOTE_MAX_HANDLES		0x100
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 5088bd9f1922..74dc8bafcb9e 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -542,7 +542,7 @@ struct kvm_coalesced_mmio {
 
 struct kvm_coalesced_mmio_ring {
 	__u32 first, last;
-	struct kvm_coalesced_mmio coalesced_mmio[0];
+	struct kvm_coalesced_mmio coalesced_mmio[];
 };
 
 #define KVM_COALESCED_MMIO_MAX \
@@ -621,7 +621,7 @@ struct kvm_clear_dirty_log {
 /* for KVM_SET_SIGNAL_MASK */
 struct kvm_signal_mask {
 	__u32 len;
-	__u8  sigset[0];
+	__u8  sigset[];
 };
 
 /* for KVM_TPR_ACCESS_REPORTING */
@@ -1221,7 +1221,7 @@ struct kvm_irq_routing_entry {
 struct kvm_irq_routing {
 	__u32 nr;
 	__u32 flags;
-	struct kvm_irq_routing_entry entries[0];
+	struct kvm_irq_routing_entry entries[];
 };
 
 #endif
@@ -1341,7 +1341,7 @@ struct kvm_dirty_tlb {
 
 struct kvm_reg_list {
 	__u64 n; /* number of regs */
-	__u64 reg[0];
+	__u64 reg[];
 };
 
 struct kvm_one_reg {
diff --git a/include/uapi/linux/minix_fs.h b/include/uapi/linux/minix_fs.h
index 95dbcb17eacd..8d9ca8b2c357 100644
--- a/include/uapi/linux/minix_fs.h
+++ b/include/uapi/linux/minix_fs.h
@@ -97,11 +97,11 @@ struct minix3_super_block {
 
 struct minix_dir_entry {
 	__u16 inode;
-	char name[0];
+	char name[];
 };
 
 struct minix3_dir_entry {
 	__u32 inode;
-	char name[0];
+	char name[];
 };
 #endif
diff --git a/include/uapi/linux/mmc/ioctl.h b/include/uapi/linux/mmc/ioctl.h
index 27a39847d55c..e7401ade6822 100644
--- a/include/uapi/linux/mmc/ioctl.h
+++ b/include/uapi/linux/mmc/ioctl.h
@@ -58,7 +58,7 @@ struct mmc_ioc_cmd {
  */
 struct mmc_ioc_multi_cmd {
 	__u64 num_of_cmds;
-	struct mmc_ioc_cmd cmds[0];
+	struct mmc_ioc_cmd cmds[];
 };
 
 #define MMC_IOC_CMD _IOWR(MMC_BLOCK_MAJOR, 0, struct mmc_ioc_cmd)
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index 17e02b64ea2e..73516e263627 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -30,25 +30,25 @@ struct nd_cmd_get_config_data_hdr {
 	__u32 in_offset;
 	__u32 in_length;
 	__u32 status;
-	__u8 out_buf[0];
+	__u8 out_buf[];
 } __packed;
 
 struct nd_cmd_set_config_hdr {
 	__u32 in_offset;
 	__u32 in_length;
-	__u8 in_buf[0];
+	__u8 in_buf[];
 } __packed;
 
 struct nd_cmd_vendor_hdr {
 	__u32 opcode;
 	__u32 in_length;
-	__u8 in_buf[0];
+	__u8 in_buf[];
 } __packed;
 
 struct nd_cmd_vendor_tail {
 	__u32 status;
 	__u32 out_length;
-	__u8 out_buf[0];
+	__u8 out_buf[];
 } __packed;
 
 struct nd_cmd_ars_cap {
@@ -86,7 +86,7 @@ struct nd_cmd_ars_status {
 		__u32 reserved;
 		__u64 err_address;
 		__u64 length;
-	} __packed records[0];
+	} __packed records[];
 } __packed;
 
 struct nd_cmd_clear_error {
diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h
index 1bbea8f0681e..84f622a66a7a 100644
--- a/include/uapi/linux/net_dropmon.h
+++ b/include/uapi/linux/net_dropmon.h
@@ -29,12 +29,12 @@ struct net_dm_config_entry {
 
 struct net_dm_config_msg {
 	__u32 entries;
-	struct net_dm_config_entry options[0];
+	struct net_dm_config_entry options[];
 };
 
 struct net_dm_alert_msg {
 	__u32 entries;
-	struct net_dm_drop_point points[0];
+	struct net_dm_drop_point points[];
 };
 
 struct net_dm_user_msg {
diff --git a/include/uapi/linux/netfilter/x_tables.h b/include/uapi/linux/netfilter/x_tables.h
index b8c6bb233ac1..796af83a963a 100644
--- a/include/uapi/linux/netfilter/x_tables.h
+++ b/include/uapi/linux/netfilter/x_tables.h
@@ -28,7 +28,7 @@ struct xt_entry_match {
 		__u16 match_size;
 	} u;
 
-	unsigned char data[0];
+	unsigned char data[];
 };
 
 struct xt_entry_target {
@@ -119,7 +119,7 @@ struct xt_counters_info {
 	unsigned int num_counters;
 
 	/* The counters (actually `number' of these). */
-	struct xt_counters counters[0];
+	struct xt_counters counters[];
 };
 
 #define XT_INV_PROTO		0x40	/* Invert the sense of PROTO. */
diff --git a/include/uapi/linux/netfilter_arp/arp_tables.h b/include/uapi/linux/netfilter_arp/arp_tables.h
index bbf5af2b67a8..a6ac2463f787 100644
--- a/include/uapi/linux/netfilter_arp/arp_tables.h
+++ b/include/uapi/linux/netfilter_arp/arp_tables.h
@@ -109,7 +109,7 @@ struct arpt_entry
 	struct xt_counters counters;
 
 	/* The matches (if any), then the target. */
-	unsigned char elems[0];
+	unsigned char elems[];
 };
 
 /*
@@ -181,7 +181,7 @@ struct arpt_replace {
 	struct xt_counters __user *counters;
 
 	/* The entries (hang off end: not really an array). */
-	struct arpt_entry entries[0];
+	struct arpt_entry entries[];
 };
 
 /* The argument to ARPT_SO_GET_ENTRIES. */
@@ -193,7 +193,7 @@ struct arpt_get_entries {
 	unsigned int size;
 
 	/* The entries. */
-	struct arpt_entry entrytable[0];
+	struct arpt_entry entrytable[];
 };
 
 /* Helper functions */
diff --git a/include/uapi/linux/netfilter_bridge/ebt_among.h b/include/uapi/linux/netfilter_bridge/ebt_among.h
index 9acf757bc1f7..73b26a280c4f 100644
--- a/include/uapi/linux/netfilter_bridge/ebt_among.h
+++ b/include/uapi/linux/netfilter_bridge/ebt_among.h
@@ -40,7 +40,7 @@ struct ebt_mac_wormhash_tuple {
 struct ebt_mac_wormhash {
 	int table[257];
 	int poolsize;
-	struct ebt_mac_wormhash_tuple pool[0];
+	struct ebt_mac_wormhash_tuple pool[];
 };
 
 #define ebt_mac_wormhash_size(x) ((x) ? sizeof(struct ebt_mac_wormhash) \
diff --git a/include/uapi/linux/netfilter_ipv4/ip_tables.h b/include/uapi/linux/netfilter_ipv4/ip_tables.h
index 50c7fee625ae..1485df28b239 100644
--- a/include/uapi/linux/netfilter_ipv4/ip_tables.h
+++ b/include/uapi/linux/netfilter_ipv4/ip_tables.h
@@ -121,7 +121,7 @@ struct ipt_entry {
 	struct xt_counters counters;
 
 	/* The matches (if any), then the target. */
-	unsigned char elems[0];
+	unsigned char elems[];
 };
 
 /*
@@ -203,7 +203,7 @@ struct ipt_replace {
 	struct xt_counters __user *counters;
 
 	/* The entries (hang off end: not really an array). */
-	struct ipt_entry entries[0];
+	struct ipt_entry entries[];
 };
 
 /* The argument to IPT_SO_GET_ENTRIES. */
@@ -215,7 +215,7 @@ struct ipt_get_entries {
 	unsigned int size;
 
 	/* The entries. */
-	struct ipt_entry entrytable[0];
+	struct ipt_entry entrytable[];
 };
 
 /* Helper functions */
diff --git a/include/uapi/linux/netfilter_ipv6/ip6_tables.h b/include/uapi/linux/netfilter_ipv6/ip6_tables.h
index d9e364f96a5c..766e8e0bcc68 100644
--- a/include/uapi/linux/netfilter_ipv6/ip6_tables.h
+++ b/include/uapi/linux/netfilter_ipv6/ip6_tables.h
@@ -243,7 +243,7 @@ struct ip6t_replace {
 	struct xt_counters __user *counters;
 
 	/* The entries (hang off end: not really an array). */
-	struct ip6t_entry entries[0];
+	struct ip6t_entry entries[];
 };
 
 /* The argument to IP6T_SO_GET_ENTRIES. */
@@ -255,7 +255,7 @@ struct ip6t_get_entries {
 	unsigned int size;
 
 	/* The entries. */
-	struct ip6t_entry entrytable[0];
+	struct ip6t_entry entrytable[];
 };
 
 /* Helper functions */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index d37629dbad72..4653834f078f 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -491,7 +491,7 @@ struct perf_event_query_bpf {
 	/*
 	 * User provided buffer to store program ids
 	 */
-	__u32	ids[0];
+	__u32	ids[];
 };
 
 /*
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 9a2ee1e39fad..ffbe230ef90b 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -256,7 +256,7 @@ struct tc_u32_sel {
 
 	short			hoff;
 	__be32			hmask;
-	struct tc_u32_key	keys[0];
+	struct tc_u32_key	keys[];
 };
 
 struct tc_u32_mark {
@@ -268,7 +268,7 @@ struct tc_u32_mark {
 struct tc_u32_pcnt {
 	__u64 rcnt;
 	__u64 rhit;
-	__u64 kcnts[0];
+	__u64 kcnts[];
 };
 
 /* Flags */
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index e5a98a16f9b0..6c0aa577730f 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -303,7 +303,7 @@ struct mdp_superblock_1 {
 	 * into the 'roles' value.  If a device is spare or faulty, then it doesn't
 	 * have a meaningful role.
 	 */
-	__le16	dev_roles[0];	/* role in array, or 0xffff for a spare, or 0xfffe for faulty */
+	__le16	dev_roles[];	/* role in array, or 0xffff for a spare, or 0xfffe for faulty */
 };
 
 /* feature_map bits */
diff --git a/include/uapi/linux/random.h b/include/uapi/linux/random.h
index dcc1b3e6106f..e744c23582eb 100644
--- a/include/uapi/linux/random.h
+++ b/include/uapi/linux/random.h
@@ -41,7 +41,7 @@
 struct rand_pool_info {
 	int	entropy_count;
 	int	buf_size;
-	__u32	buf[0];
+	__u32	buf[];
 };
 
 /*
diff --git a/include/uapi/linux/romfs_fs.h b/include/uapi/linux/romfs_fs.h
index a7f1585accef..6aa05e792454 100644
--- a/include/uapi/linux/romfs_fs.h
+++ b/include/uapi/linux/romfs_fs.h
@@ -27,7 +27,7 @@ struct romfs_super_block {
 	__be32 word1;
 	__be32 size;
 	__be32 checksum;
-	char name[0];		/* volume name */
+	char name[];		/* volume name */
 };
 
 /* On disk inode */
@@ -37,7 +37,7 @@ struct romfs_inode {
 	__be32 spec;
 	__be32 size;
 	__be32 checksum;
-	char name[0];
+	char name[];
 };
 
 #define ROMFH_TYPE 7
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 83849a37db5b..eb2747d58a81 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -440,7 +440,7 @@ struct rtnexthop {
 /* RTA_VIA */
 struct rtvia {
 	__kernel_sa_family_t	rtvia_family;
-	__u8			rtvia_addr[0];
+	__u8			rtvia_addr[];
 };
 
 /* RTM_CACHEINFO */
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index c4ff1ebd8bcc..ed7d4ecbf53d 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -365,7 +365,7 @@ struct sctp_assoc_change {
 	__u16 sac_outbound_streams;
 	__u16 sac_inbound_streams;
 	sctp_assoc_t sac_assoc_id;
-	__u8 sac_info[0];
+	__u8 sac_info[];
 };
 
 /*
@@ -436,7 +436,7 @@ struct sctp_remote_error {
 	__u32 sre_length;
 	__be16 sre_error;
 	sctp_assoc_t sre_assoc_id;
-	__u8 sre_data[0];
+	__u8 sre_data[];
 };
 
 
@@ -453,7 +453,7 @@ struct sctp_send_failed {
 	__u32 ssf_error;
 	struct sctp_sndrcvinfo ssf_info;
 	sctp_assoc_t ssf_assoc_id;
-	__u8 ssf_data[0];
+	__u8 ssf_data[];
 };
 
 struct sctp_send_failed_event {
@@ -463,7 +463,7 @@ struct sctp_send_failed_event {
 	__u32 ssf_error;
 	struct sctp_sndinfo ssfe_info;
 	sctp_assoc_t ssf_assoc_id;
-	__u8 ssf_data[0];
+	__u8 ssf_data[];
 };
 
 /*
@@ -1029,7 +1029,7 @@ struct sctp_getaddrs_old {
 struct sctp_getaddrs {
 	sctp_assoc_t		assoc_id; /*input*/
 	__u32			addr_num; /*output*/
-	__u8			addrs[0]; /*output, variable size*/
+	__u8			addrs[]; /*output, variable size*/
 };
 
 /* A socket user request obtained via SCTP_GET_ASSOC_STATS that retrieves
diff --git a/include/uapi/linux/seg6.h b/include/uapi/linux/seg6.h
index 286e8d6a8e98..13bcbc8bba32 100644
--- a/include/uapi/linux/seg6.h
+++ b/include/uapi/linux/seg6.h
@@ -30,7 +30,7 @@ struct ipv6_sr_hdr {
 	__u8	flags;
 	__u16	tag;
 
-	struct in6_addr segments[0];
+	struct in6_addr segments[];
 };
 
 #define SR6_FLAG1_PROTECTED	(1 << 6)
diff --git a/include/uapi/linux/seg6_iptunnel.h b/include/uapi/linux/seg6_iptunnel.h
index eb815e0d0ac3..a74294211290 100644
--- a/include/uapi/linux/seg6_iptunnel.h
+++ b/include/uapi/linux/seg6_iptunnel.h
@@ -26,7 +26,7 @@ enum {
 
 struct seg6_iptunnel_encap {
 	int mode;
-	struct ipv6_sr_hdr srh[0];
+	struct ipv6_sr_hdr srh[];
 };
 
 #define SEG6_IPTUN_ENCAP_SIZE(x) ((sizeof(*x)) + (((x)->srh->hdrlen + 1) << 3))
diff --git a/include/uapi/linux/stm.h b/include/uapi/linux/stm.h
index 7bac318b4440..de3579c2cff0 100644
--- a/include/uapi/linux/stm.h
+++ b/include/uapi/linux/stm.h
@@ -36,7 +36,7 @@ struct stp_policy_id {
 	/* padding */
 	__u16		__reserved_0;
 	__u32		__reserved_1;
-	char		id[0];
+	char		id[];
 };
 
 #define STP_POLICY_ID_SET	_IOWR('%', 0, struct stp_policy_id)
diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h
index 27ace512babd..fbd8ca67e107 100644
--- a/include/uapi/linux/target_core_user.h
+++ b/include/uapi/linux/target_core_user.h
@@ -152,7 +152,7 @@ struct tcmu_tmr_entry {
 	__u32 cmd_cnt;
 	__u64 __pad3;
 	__u64 __pad4;
-	__u16 cmd_ids[0];
+	__u16 cmd_ids[];
 } __packed;
 
 #define TCMU_OP_ALIGN_SIZE sizeof(__u64)
diff --git a/include/uapi/linux/usb/audio.h b/include/uapi/linux/usb/audio.h
index 76b7c3f6cd0d..c917c53070d5 100644
--- a/include/uapi/linux/usb/audio.h
+++ b/include/uapi/linux/usb/audio.h
@@ -341,7 +341,7 @@ struct uac_feature_unit_descriptor {
 	__u8 bUnitID;
 	__u8 bSourceID;
 	__u8 bControlSize;
-	__u8 bmaControls[0]; /* variable length */
+	__u8 bmaControls[]; /* variable length */
 } __attribute__((packed));
 
 static inline __u8 uac_feature_unit_iFeature(struct uac_feature_unit_descriptor *desc)
diff --git a/include/uapi/linux/usb/cdc.h b/include/uapi/linux/usb/cdc.h
index 6d61550959ef..acf3852bb676 100644
--- a/include/uapi/linux/usb/cdc.h
+++ b/include/uapi/linux/usb/cdc.h
@@ -171,7 +171,7 @@ struct usb_cdc_mdlm_detail_desc {
 
 	/* type is associated with mdlm_desc.bGUID */
 	__u8	bGuidDescriptorType;
-	__u8	bDetailData[0];
+	__u8	bDetailData[];
 } __attribute__ ((packed));
 
 /* "OBEX Control Model Functional Descriptor" */
@@ -379,7 +379,7 @@ struct usb_cdc_ncm_ndp16 {
 	__le32	dwSignature;
 	__le16	wLength;
 	__le16	wNextNdpIndex;
-	struct	usb_cdc_ncm_dpe16 dpe16[0];
+	struct	usb_cdc_ncm_dpe16 dpe16[];
 } __attribute__ ((packed));
 
 /* 32-bit NCM Datagram Pointer Entry */
@@ -395,7 +395,7 @@ struct usb_cdc_ncm_ndp32 {
 	__le16	wReserved6;
 	__le32	dwNextNdpIndex;
 	__le32	dwReserved12;
-	struct	usb_cdc_ncm_dpe32 dpe32[0];
+	struct	usb_cdc_ncm_dpe32 dpe32[];
 } __attribute__ ((packed));
 
 /* CDC NCM subclass 3.2.1 and 3.2.2 */
diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h
index 17ce56198c9a..31fcfa084e63 100644
--- a/include/uapi/linux/usb/ch9.h
+++ b/include/uapi/linux/usb/ch9.h
@@ -818,7 +818,7 @@ struct usb_key_descriptor {
 
 	__u8  tTKID[3];
 	__u8  bReserved;
-	__u8  bKeyData[0];
+	__u8  bKeyData[];
 } __attribute__((packed));
 
 /*-------------------------------------------------------------------------*/
diff --git a/include/uapi/linux/usb/raw_gadget.h b/include/uapi/linux/usb/raw_gadget.h
index 0be685272eb1..c7d2199134d7 100644
--- a/include/uapi/linux/usb/raw_gadget.h
+++ b/include/uapi/linux/usb/raw_gadget.h
@@ -60,7 +60,7 @@ enum usb_raw_event_type {
 struct usb_raw_event {
 	__u32		type;
 	__u32		length;
-	__u8		data[0];
+	__u8		data[];
 };
 
 #define USB_RAW_IO_FLAGS_ZERO	0x0001
@@ -90,7 +90,7 @@ struct usb_raw_ep_io {
 	__u16		ep;
 	__u16		flags;
 	__u32		length;
-	__u8		data[0];
+	__u8		data[];
 };
 
 /* Maximum number of non-control endpoints in struct usb_raw_eps_info. */
diff --git a/include/uapi/linux/usbdevice_fs.h b/include/uapi/linux/usbdevice_fs.h
index cf525cddeb94..74a84e02422a 100644
--- a/include/uapi/linux/usbdevice_fs.h
+++ b/include/uapi/linux/usbdevice_fs.h
@@ -131,7 +131,7 @@ struct usbdevfs_urb {
 	unsigned int signr;	/* signal to be sent on completion,
 				  or 0 if none should be sent. */
 	void __user *usercontext;
-	struct usbdevfs_iso_packet_desc iso_frame_desc[0];
+	struct usbdevfs_iso_packet_desc iso_frame_desc[];
 };
 
 /* ioctls for talking directly to drivers */
@@ -176,7 +176,7 @@ struct usbdevfs_disconnect_claim {
 struct usbdevfs_streams {
 	unsigned int num_streams; /* Not used by USBDEVFS_FREE_STREAMS */
 	unsigned int num_eps;
-	unsigned char eps[0];
+	unsigned char eps[];
 };
 
 /*
diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
index 634cee485abb..391331a10879 100644
--- a/include/uapi/linux/vhost_types.h
+++ b/include/uapi/linux/vhost_types.h
@@ -107,7 +107,7 @@ struct vhost_memory_region {
 struct vhost_memory {
 	__u32 nregions;
 	__u32 padding;
-	struct vhost_memory_region regions[0];
+	struct vhost_memory_region regions[];
 };
 
 /* VHOST_SCSI specific definitions */
@@ -135,7 +135,7 @@ struct vhost_scsi_target {
 struct vhost_vdpa_config {
 	__u32 off;
 	__u32 len;
-	__u8 buf[0];
+	__u8 buf[];
 };
 
 /* vhost vdpa IOVA range
diff --git a/include/uapi/linux/virtio_9p.h b/include/uapi/linux/virtio_9p.h
index 441047432258..374b68f8ac6e 100644
--- a/include/uapi/linux/virtio_9p.h
+++ b/include/uapi/linux/virtio_9p.h
@@ -38,7 +38,7 @@ struct virtio_9p_config {
 	/* length of the tag name */
 	__virtio16 tag_len;
 	/* non-NULL terminated tag name */
-	__u8 tag[0];
+	__u8 tag[];
 } __attribute__((packed));
 
 #endif /* _LINUX_VIRTIO_9P_H */
diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h
index 65e13a099b1a..e8191e0c3b56 100644
--- a/include/uapi/linux/xfrm.h
+++ b/include/uapi/linux/xfrm.h
@@ -33,7 +33,7 @@ struct xfrm_sec_ctx {
 	__u8	ctx_alg;
 	__u16	ctx_len;
 	__u32	ctx_sid;
-	char	ctx_str[0];
+	char	ctx_str[];
 };
 
 /* Security Context Domains of Interpretation */
@@ -96,27 +96,27 @@ struct xfrm_replay_state_esn {
 	__u32		oseq_hi;
 	__u32		seq_hi;
 	__u32		replay_window;
-	__u32		bmp[0];
+	__u32		bmp[];
 };
 
 struct xfrm_algo {
 	char		alg_name[64];
 	unsigned int	alg_key_len;    /* in bits */
-	char		alg_key[0];
+	char		alg_key[];
 };
 
 struct xfrm_algo_auth {
 	char		alg_name[64];
 	unsigned int	alg_key_len;    /* in bits */
 	unsigned int	alg_trunc_len;  /* in bits */
-	char		alg_key[0];
+	char		alg_key[];
 };
 
 struct xfrm_algo_aead {
 	char		alg_name[64];
 	unsigned int	alg_key_len;	/* in bits */
 	unsigned int	alg_icv_len;	/* in bits */
-	char		alg_key[0];
+	char		alg_key[];
 };
 
 struct xfrm_stats {
diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h
index d95ef9a2b032..1106a7c90b29 100644
--- a/include/uapi/rdma/hfi/hfi1_user.h
+++ b/include/uapi/rdma/hfi/hfi1_user.h
@@ -180,7 +180,7 @@ struct hfi1_sdma_comp_entry {
 struct hfi1_status {
 	__aligned_u64 dev;      /* device/hw status bits */
 	__aligned_u64 port;     /* port state and status bits */
-	char freezemsg[0];
+	char freezemsg[];
 };
 
 enum sdma_req_opcode {
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
index 7dd903d932e5..43672cb1fd57 100644
--- a/include/uapi/rdma/ib_user_verbs.h
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -158,18 +158,18 @@ struct ib_uverbs_ex_cmd_hdr {
 
 struct ib_uverbs_get_context {
 	__aligned_u64 response;
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 struct ib_uverbs_get_context_resp {
 	__u32 async_fd;
 	__u32 num_comp_vectors;
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 struct ib_uverbs_query_device {
 	__aligned_u64 response;
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 struct ib_uverbs_query_device_resp {
@@ -278,7 +278,7 @@ struct ib_uverbs_query_port {
 	__aligned_u64 response;
 	__u8  port_num;
 	__u8  reserved[7];
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 struct ib_uverbs_query_port_resp {
@@ -308,12 +308,12 @@ struct ib_uverbs_query_port_resp {
 
 struct ib_uverbs_alloc_pd {
 	__aligned_u64 response;
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 struct ib_uverbs_alloc_pd_resp {
 	__u32 pd_handle;
-	__u32 driver_data[0];
+	__u32 driver_data[];
 };
 
 struct ib_uverbs_dealloc_pd {
@@ -324,12 +324,12 @@ struct ib_uverbs_open_xrcd {
 	__aligned_u64 response;
 	__u32 fd;
 	__u32 oflags;
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 struct ib_uverbs_open_xrcd_resp {
 	__u32 xrcd_handle;
-	__u32 driver_data[0];
+	__u32 driver_data[];
 };
 
 struct ib_uverbs_close_xrcd {
@@ -343,14 +343,14 @@ struct ib_uverbs_reg_mr {
 	__aligned_u64 hca_va;
 	__u32 pd_handle;
 	__u32 access_flags;
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 struct ib_uverbs_reg_mr_resp {
 	__u32 mr_handle;
 	__u32 lkey;
 	__u32 rkey;
-	__u32 driver_data[0];
+	__u32 driver_data[];
 };
 
 struct ib_uverbs_rereg_mr {
@@ -362,13 +362,13 @@ struct ib_uverbs_rereg_mr {
 	__aligned_u64 hca_va;
 	__u32 pd_handle;
 	__u32 access_flags;
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 struct ib_uverbs_rereg_mr_resp {
 	__u32 lkey;
 	__u32 rkey;
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 struct ib_uverbs_dereg_mr {
@@ -380,13 +380,13 @@ struct ib_uverbs_alloc_mw {
 	__u32 pd_handle;
 	__u8  mw_type;
 	__u8  reserved[3];
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 struct ib_uverbs_alloc_mw_resp {
 	__u32 mw_handle;
 	__u32 rkey;
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 struct ib_uverbs_dealloc_mw {
@@ -408,7 +408,7 @@ struct ib_uverbs_create_cq {
 	__u32 comp_vector;
 	__s32 comp_channel;
 	__u32 reserved;
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 enum ib_uverbs_ex_create_cq_flags {
@@ -442,13 +442,13 @@ struct ib_uverbs_resize_cq {
 	__aligned_u64 response;
 	__u32 cq_handle;
 	__u32 cqe;
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 struct ib_uverbs_resize_cq_resp {
 	__u32 cqe;
 	__u32 reserved;
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 struct ib_uverbs_poll_cq {
@@ -492,7 +492,7 @@ struct ib_uverbs_wc {
 struct ib_uverbs_poll_cq_resp {
 	__u32 count;
 	__u32 reserved;
-	struct ib_uverbs_wc wc[0];
+	struct ib_uverbs_wc wc[];
 };
 
 struct ib_uverbs_req_notify_cq {
@@ -585,7 +585,7 @@ struct ib_uverbs_create_qp {
 	__u8  qp_type;
 	__u8  is_srq;
 	__u8  reserved;
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 enum ib_uverbs_create_qp_mask {
@@ -624,7 +624,7 @@ struct ib_uverbs_open_qp {
 	__u32 qpn;
 	__u8  qp_type;
 	__u8  reserved[7];
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 /* also used for open response */
@@ -669,7 +669,7 @@ struct ib_uverbs_query_qp {
 	__aligned_u64 response;
 	__u32 qp_handle;
 	__u32 attr_mask;
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 struct ib_uverbs_query_qp_resp {
@@ -703,7 +703,7 @@ struct ib_uverbs_query_qp_resp {
 	__u8  alt_timeout;
 	__u8  sq_sig_all;
 	__u8  reserved[5];
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 struct ib_uverbs_modify_qp {
@@ -824,7 +824,7 @@ struct ib_uverbs_post_send {
 	__u32 wr_count;
 	__u32 sge_count;
 	__u32 wqe_size;
-	struct ib_uverbs_send_wr send_wr[0];
+	struct ib_uverbs_send_wr send_wr[];
 };
 
 struct ib_uverbs_post_send_resp {
@@ -843,7 +843,7 @@ struct ib_uverbs_post_recv {
 	__u32 wr_count;
 	__u32 sge_count;
 	__u32 wqe_size;
-	struct ib_uverbs_recv_wr recv_wr[0];
+	struct ib_uverbs_recv_wr recv_wr[];
 };
 
 struct ib_uverbs_post_recv_resp {
@@ -856,7 +856,7 @@ struct ib_uverbs_post_srq_recv {
 	__u32 wr_count;
 	__u32 sge_count;
 	__u32 wqe_size;
-	struct ib_uverbs_recv_wr recv[0];
+	struct ib_uverbs_recv_wr recv[];
 };
 
 struct ib_uverbs_post_srq_recv_resp {
@@ -869,12 +869,12 @@ struct ib_uverbs_create_ah {
 	__u32 pd_handle;
 	__u32 reserved;
 	struct ib_uverbs_ah_attr attr;
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 struct ib_uverbs_create_ah_resp {
 	__u32 ah_handle;
-	__u32 driver_data[0];
+	__u32 driver_data[];
 };
 
 struct ib_uverbs_destroy_ah {
@@ -886,7 +886,7 @@ struct ib_uverbs_attach_mcast {
 	__u32 qp_handle;
 	__u16 mlid;
 	__u16 reserved;
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 struct ib_uverbs_detach_mcast {
@@ -894,7 +894,7 @@ struct ib_uverbs_detach_mcast {
 	__u32 qp_handle;
 	__u16 mlid;
 	__u16 reserved;
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 struct ib_uverbs_flow_spec_hdr {
@@ -1135,7 +1135,7 @@ struct ib_uverbs_flow_attr {
 	 * struct ib_flow_spec_xxx
 	 * struct ib_flow_spec_yyy
 	 */
-	struct ib_uverbs_flow_spec_hdr flow_specs[0];
+	struct ib_uverbs_flow_spec_hdr flow_specs[];
 };
 
 struct ib_uverbs_create_flow  {
@@ -1161,7 +1161,7 @@ struct ib_uverbs_create_srq {
 	__u32 max_wr;
 	__u32 max_sge;
 	__u32 srq_limit;
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 struct ib_uverbs_create_xsrq {
@@ -1175,7 +1175,7 @@ struct ib_uverbs_create_xsrq {
 	__u32 max_num_tags;
 	__u32 xrcd_handle;
 	__u32 cq_handle;
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 struct ib_uverbs_create_srq_resp {
@@ -1183,7 +1183,7 @@ struct ib_uverbs_create_srq_resp {
 	__u32 max_wr;
 	__u32 max_sge;
 	__u32 srqn;
-	__u32 driver_data[0];
+	__u32 driver_data[];
 };
 
 struct ib_uverbs_modify_srq {
@@ -1191,14 +1191,14 @@ struct ib_uverbs_modify_srq {
 	__u32 attr_mask;
 	__u32 max_wr;
 	__u32 srq_limit;
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 struct ib_uverbs_query_srq {
 	__aligned_u64 response;
 	__u32 srq_handle;
 	__u32 reserved;
-	__aligned_u64 driver_data[0];
+	__aligned_u64 driver_data[];
 };
 
 struct ib_uverbs_query_srq_resp {
@@ -1269,7 +1269,7 @@ struct ib_uverbs_ex_create_rwq_ind_table  {
 	 * wq_handle1
 	 * wq_handle2
 	 */
-	__u32 wq_handles[0];
+	__u32 wq_handles[];
 };
 
 struct ib_uverbs_ex_create_rwq_ind_table_resp {
diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h
index ed5a514305c1..7cea03581f79 100644
--- a/include/uapi/rdma/rdma_user_cm.h
+++ b/include/uapi/rdma/rdma_user_cm.h
@@ -184,7 +184,7 @@ struct rdma_ucm_query_addr_resp {
 struct rdma_ucm_query_path_resp {
 	__u32 num_paths;
 	__u32 reserved;
-	struct ib_path_rec_data path_data[0];
+	struct ib_path_rec_data path_data[];
 };
 
 struct rdma_ucm_conn_param {
diff --git a/include/uapi/rdma/rdma_user_ioctl_cmds.h b/include/uapi/rdma/rdma_user_ioctl_cmds.h
index 38ab7accb7be..ab1aef17feb1 100644
--- a/include/uapi/rdma/rdma_user_ioctl_cmds.h
+++ b/include/uapi/rdma/rdma_user_ioctl_cmds.h
@@ -81,7 +81,7 @@ struct ib_uverbs_ioctl_hdr {
 	__aligned_u64 reserved1;
 	__u32 driver_id;
 	__u32 reserved2;
-	struct ib_uverbs_attr  attrs[0];
+	struct ib_uverbs_attr  attrs[];
 };
 
 #endif
diff --git a/include/uapi/scsi/fc/fc_els.h b/include/uapi/scsi/fc/fc_els.h
index c9812c5c2fc4..16782c360de3 100644
--- a/include/uapi/scsi/fc/fc_els.h
+++ b/include/uapi/scsi/fc/fc_els.h
@@ -264,7 +264,7 @@ struct fc_tlv_desc {
 					 * Size of descriptor excluding
 					 * desc_tag and desc_len fields.
 					 */
-	__u8		desc_value[0];  /* Descriptor Value */
+	__u8		desc_value[];  /* Descriptor Value */
 };
 
 /* Descriptor tag and len fields are considered the mandatory header
@@ -1027,7 +1027,7 @@ struct fc_fn_li_desc {
 					 * threshold to caause the LI event
 					 */
 	__be32		pname_count;	/* number of portname_list elements */
-	__be64		pname_list[0];	/* list of N_Port_Names accessible
+	__be64		pname_list[];	/* list of N_Port_Names accessible
 					 * through the attached port
 					 */
 };
@@ -1069,7 +1069,7 @@ struct fc_fn_peer_congn_desc {
 					 * congestion event
 					 */
 	__be32		pname_count;	/* number of portname_list elements */
-	__be64		pname_list[0];	/* list of N_Port_Names accessible
+	__be64		pname_list[];	/* list of N_Port_Names accessible
 					 * through the attached port
 					 */
 };
@@ -1104,7 +1104,7 @@ struct fc_els_fpin {
 					 * Size of ELS excluding fpin_cmd,
 					 * fpin_zero and desc_len fields.
 					 */
-	struct fc_tlv_desc	fpin_desc[0];	/* Descriptor list */
+	struct fc_tlv_desc	fpin_desc[];	/* Descriptor list */
 };
 
 /* Diagnostic Function Descriptor - FPIN Registration */
@@ -1115,7 +1115,7 @@ struct fc_df_desc_fpin_reg {
 					 * desc_tag and desc_len fields.
 					 */
 	__be32		count;		/* Number of desc_tags elements */
-	__be32		desc_tags[0];	/* Array of Descriptor Tags.
+	__be32		desc_tags[];	/* Array of Descriptor Tags.
 					 * Each tag indicates a function
 					 * supported by the N_Port (request)
 					 * or by the  N_Port and Fabric
@@ -1135,7 +1135,7 @@ struct fc_els_rdf {
 					 * Size of ELS excluding fpin_cmd,
 					 * fpin_zero and desc_len fields.
 					 */
-	struct fc_tlv_desc	desc[0];	/* Descriptor list */
+	struct fc_tlv_desc	desc[];	/* Descriptor list */
 };
 
 /*
@@ -1148,7 +1148,7 @@ struct fc_els_rdf_resp {
 						 * and desc_list_len fields.
 						 */
 	struct fc_els_lsri_desc	lsri;
-	struct fc_tlv_desc	desc[0];	/* Supported Descriptor list */
+	struct fc_tlv_desc	desc[];	/* Supported Descriptor list */
 };
 
 
@@ -1231,7 +1231,7 @@ struct fc_els_edc {
 					 * Size of ELS excluding edc_cmd,
 					 * edc_zero and desc_len fields.
 					 */
-	struct fc_tlv_desc	desc[0];
+	struct fc_tlv_desc	desc[];
 					/* Diagnostic Descriptor list */
 };
 
@@ -1245,7 +1245,7 @@ struct fc_els_edc_resp {
 						 * and desc_list_len fields.
 						 */
 	struct fc_els_lsri_desc	lsri;
-	struct fc_tlv_desc	desc[0];
+	struct fc_tlv_desc	desc[];
 				    /* Supported Diagnostic Descriptor list */
 };
 
diff --git a/include/uapi/scsi/scsi_bsg_fc.h b/include/uapi/scsi/scsi_bsg_fc.h
index 3ae65e93235c..7f5930801f72 100644
--- a/include/uapi/scsi/scsi_bsg_fc.h
+++ b/include/uapi/scsi/scsi_bsg_fc.h
@@ -209,7 +209,7 @@ struct fc_bsg_host_vendor {
 	__u64 vendor_id;
 
 	/* start of vendor command area */
-	__u32 vendor_cmd[0];
+	__u32 vendor_cmd[];
 };
 
 /* Response:
diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h
index 2d3e5df39a59..3974a2a911cc 100644
--- a/include/uapi/sound/asound.h
+++ b/include/uapi/sound/asound.h
@@ -1106,7 +1106,7 @@ struct snd_ctl_elem_value {
 struct snd_ctl_tlv {
 	unsigned int numid;	/* control element numeric identification */
 	unsigned int length;	/* in bytes aligned to 4 */
-	unsigned int tlv[0];	/* first TLV */
+	unsigned int tlv[];	/* first TLV */
 };
 
 #define SNDRV_CTL_IOCTL_PVERSION	_IOR('U', 0x00, int)
diff --git a/include/uapi/sound/firewire.h b/include/uapi/sound/firewire.h
index 39cf6eb75940..3532ac7046d7 100644
--- a/include/uapi/sound/firewire.h
+++ b/include/uapi/sound/firewire.h
@@ -38,11 +38,11 @@ struct snd_efw_transaction {
 	__be32 category;
 	__be32 command;
 	__be32 status;
-	__be32 params[0];
+	__be32 params[];
 };
 struct snd_firewire_event_efw_response {
 	unsigned int type;
-	__be32 response[0];	/* some responses */
+	__be32 response[];	/* some responses */
 };
 
 struct snd_firewire_event_digi00x_message {
@@ -63,7 +63,7 @@ struct snd_firewire_tascam_change {
 
 struct snd_firewire_event_tascam_control {
 	unsigned int type;
-	struct snd_firewire_tascam_change changes[0];
+	struct snd_firewire_tascam_change changes[];
 };
 
 struct snd_firewire_event_motu_register_dsp_change {
diff --git a/include/uapi/sound/skl-tplg-interface.h b/include/uapi/sound/skl-tplg-interface.h
index a93c0decfdd5..f29899b179a6 100644
--- a/include/uapi/sound/skl-tplg-interface.h
+++ b/include/uapi/sound/skl-tplg-interface.h
@@ -151,7 +151,7 @@ struct skl_dfw_algo_data {
 	__u32 rsvd:30;
 	__u32 param_id;
 	__u32 max;
-	char params[0];
+	char params[];
 } __packed;
 
 enum skl_tkn_dir {
diff --git a/include/uapi/sound/sof/header.h b/include/uapi/sound/sof/header.h
index 5f4518e7a972..dbf137516522 100644
--- a/include/uapi/sound/sof/header.h
+++ b/include/uapi/sound/sof/header.h
@@ -23,7 +23,7 @@ struct sof_abi_hdr {
 	__u32 size;		/**< size in bytes of data excl. this struct */
 	__u32 abi;		/**< SOF ABI version */
 	__u32 reserved[4];	/**< reserved for future use */
-	__u32 data[0];		/**< Component data - opaque to core */
+	__u32 data[];		/**< Component data - opaque to core */
 }  __packed;
 
 #endif
diff --git a/include/uapi/sound/usb_stream.h b/include/uapi/sound/usb_stream.h
index 95419d8bbc16..ffdd3ea1e31d 100644
--- a/include/uapi/sound/usb_stream.h
+++ b/include/uapi/sound/usb_stream.h
@@ -61,7 +61,7 @@ struct usb_stream {
 	unsigned		 inpacket_split_at;
 	unsigned		 next_inpacket_split;
 	unsigned		 next_inpacket_split_at;
-	struct usb_stream_packet inpacket[0];
+	struct usb_stream_packet inpacket[];
 };
 
 enum usb_stream_state {
diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
index bf6e96011dfe..e135f4dcb19d 100644
--- a/tools/arch/x86/include/uapi/asm/kvm.h
+++ b/tools/arch/x86/include/uapi/asm/kvm.h
@@ -198,13 +198,13 @@ struct kvm_msrs {
 	__u32 nmsrs; /* number of msrs in entries */
 	__u32 pad;
 
-	struct kvm_msr_entry entries[0];
+	struct kvm_msr_entry entries[];
 };
 
 /* for KVM_GET_MSR_INDEX_LIST */
 struct kvm_msr_list {
 	__u32 nmsrs; /* number of msrs in entries */
-	__u32 indices[0];
+	__u32 indices[];
 };
 
 /* Maximum size of any access bitmap in bytes */
@@ -241,7 +241,7 @@ struct kvm_cpuid_entry {
 struct kvm_cpuid {
 	__u32 nent;
 	__u32 padding;
-	struct kvm_cpuid_entry entries[0];
+	struct kvm_cpuid_entry entries[];
 };
 
 struct kvm_cpuid_entry2 {
@@ -263,7 +263,7 @@ struct kvm_cpuid_entry2 {
 struct kvm_cpuid2 {
 	__u32 nent;
 	__u32 padding;
-	struct kvm_cpuid_entry2 entries[0];
+	struct kvm_cpuid_entry2 entries[];
 };
 
 /* for KVM_GET_PIT and KVM_SET_PIT */
@@ -389,7 +389,7 @@ struct kvm_xsave {
 	 * the contents of CPUID leaf 0xD on the host.
 	 */
 	__u32 region[1024];
-	__u32 extra[0];
+	__u32 extra[];
 };
 
 #define KVM_MAX_XCRS	16
@@ -515,7 +515,7 @@ struct kvm_pmu_event_filter {
 	__u32 fixed_counter_bitmap;
 	__u32 flags;
 	__u32 pad[4];
-	__u64 events[0];
+	__u64 events[];
 };
 
 #define KVM_PMU_EVENT_ALLOW 0
diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h
index 05c3642aaece..239b91b13c60 100644
--- a/tools/include/uapi/drm/i915_drm.h
+++ b/tools/include/uapi/drm/i915_drm.h
@@ -2060,7 +2060,7 @@ struct i915_context_engines_load_balance {
 
 	__u64 mbz64; /* reserved for future use; must be zero */
 
-	struct i915_engine_class_instance engines[0];
+	struct i915_engine_class_instance engines[];
 } __attribute__((packed));
 
 #define I915_DEFINE_CONTEXT_ENGINES_LOAD_BALANCE(name__, N__) struct { \
@@ -2098,7 +2098,7 @@ struct i915_context_engines_bond {
 	__u64 flags; /* all undefined flags must be zero */
 	__u64 mbz64[4]; /* reserved for future use; must be zero */
 
-	struct i915_engine_class_instance engines[0];
+	struct i915_engine_class_instance engines[];
 } __attribute__((packed));
 
 #define I915_DEFINE_CONTEXT_ENGINES_BOND(name__, N__) struct { \
@@ -2225,7 +2225,7 @@ struct i915_context_engines_parallel_submit {
 	 * length = width (i) * num_siblings (j)
 	 * index = j + i * num_siblings
 	 */
-	struct i915_engine_class_instance engines[0];
+	struct i915_engine_class_instance engines[];
 
 } __packed;
 
diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h
index bdf7b404b3e7..b7b56871029c 100644
--- a/tools/include/uapi/linux/fs.h
+++ b/tools/include/uapi/linux/fs.h
@@ -90,7 +90,7 @@ struct file_dedupe_range {
 	__u16 dest_count;	/* in - total elements in info array */
 	__u16 reserved1;	/* must be zero */
 	__u32 reserved2;	/* must be zero */
-	struct file_dedupe_range_info info[0];
+	struct file_dedupe_range_info info[];
 };
 
 /* And dynamically-tunable limits and defaults: */
diff --git a/tools/include/uapi/linux/if_tun.h b/tools/include/uapi/linux/if_tun.h
index 454ae31b93c7..2ec07de1d73b 100644
--- a/tools/include/uapi/linux/if_tun.h
+++ b/tools/include/uapi/linux/if_tun.h
@@ -108,7 +108,7 @@ struct tun_pi {
 struct tun_filter {
 	__u16  flags; /* TUN_FLT_ flags see above */
 	__u16  count; /* Number of addresses */
-	__u8   addr[0][ETH_ALEN];
+	__u8   addr[][ETH_ALEN];
 };
 
 #endif /* _UAPI__IF_TUN_H */
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index 6a184d260c7f..37ce8cbac322 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -539,7 +539,7 @@ struct kvm_coalesced_mmio {
 
 struct kvm_coalesced_mmio_ring {
 	__u32 first, last;
-	struct kvm_coalesced_mmio coalesced_mmio[0];
+	struct kvm_coalesced_mmio coalesced_mmio[];
 };
 
 #define KVM_COALESCED_MMIO_MAX \
@@ -618,7 +618,7 @@ struct kvm_clear_dirty_log {
 /* for KVM_SET_SIGNAL_MASK */
 struct kvm_signal_mask {
 	__u32 len;
-	__u8  sigset[0];
+	__u8  sigset[];
 };
 
 /* for KVM_TPR_ACCESS_REPORTING */
@@ -1216,7 +1216,7 @@ struct kvm_irq_routing_entry {
 struct kvm_irq_routing {
 	__u32 nr;
 	__u32 flags;
-	struct kvm_irq_routing_entry entries[0];
+	struct kvm_irq_routing_entry entries[];
 };
 
 #endif
@@ -1335,7 +1335,7 @@ struct kvm_dirty_tlb {
 
 struct kvm_reg_list {
 	__u64 n; /* number of regs */
-	__u64 reg[0];
+	__u64 reg[];
 };
 
 struct kvm_one_reg {
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index d37629dbad72..4653834f078f 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -491,7 +491,7 @@ struct perf_event_query_bpf {
 	/*
 	 * User provided buffer to store program ids
 	 */
-	__u32	ids[0];
+	__u32	ids[];
 };
 
 /*
diff --git a/tools/include/uapi/linux/pkt_cls.h b/tools/include/uapi/linux/pkt_cls.h
index 12153771396a..3faee0199a9b 100644
--- a/tools/include/uapi/linux/pkt_cls.h
+++ b/tools/include/uapi/linux/pkt_cls.h
@@ -180,7 +180,7 @@ struct tc_u32_sel {
 
 	short			hoff;
 	__be32			hmask;
-	struct tc_u32_key	keys[0];
+	struct tc_u32_key	keys[];
 };
 
 struct tc_u32_mark {
@@ -192,7 +192,7 @@ struct tc_u32_mark {
 struct tc_u32_pcnt {
 	__u64 rcnt;
 	__u64 rhit;
-	__u64 kcnts[0];
+	__u64 kcnts[];
 };
 
 /* Flags */
diff --git a/tools/include/uapi/linux/seg6.h b/tools/include/uapi/linux/seg6.h
index 286e8d6a8e98..f94baf154c47 100644
--- a/tools/include/uapi/linux/seg6.h
+++ b/tools/include/uapi/linux/seg6.h
@@ -30,7 +30,7 @@ struct ipv6_sr_hdr {
 	__u8	flags;
 	__u16	tag;
 
-	struct in6_addr segments[0];
+	struct in6_addr segments[];
 };
 
 #define SR6_FLAG1_PROTECTED	(1 << 6)
@@ -49,7 +49,7 @@ struct ipv6_sr_hdr {
 struct sr6_tlv {
 	__u8 type;
 	__u8 len;
-	__u8 data[0];
+	__u8 data[];
 };
 
 #endif
diff --git a/tools/include/uapi/linux/usbdevice_fs.h b/tools/include/uapi/linux/usbdevice_fs.h
index cf525cddeb94..74a84e02422a 100644
--- a/tools/include/uapi/linux/usbdevice_fs.h
+++ b/tools/include/uapi/linux/usbdevice_fs.h
@@ -131,7 +131,7 @@ struct usbdevfs_urb {
 	unsigned int signr;	/* signal to be sent on completion,
 				  or 0 if none should be sent. */
 	void __user *usercontext;
-	struct usbdevfs_iso_packet_desc iso_frame_desc[0];
+	struct usbdevfs_iso_packet_desc iso_frame_desc[];
 };
 
 /* ioctls for talking directly to drivers */
@@ -176,7 +176,7 @@ struct usbdevfs_disconnect_claim {
 struct usbdevfs_streams {
 	unsigned int num_streams; /* Not used by USBDEVFS_FREE_STREAMS */
 	unsigned int num_eps;
-	unsigned char eps[0];
+	unsigned char eps[];
 };
 
 /*
diff --git a/tools/include/uapi/sound/asound.h b/tools/include/uapi/sound/asound.h
index 2d3e5df39a59..3974a2a911cc 100644
--- a/tools/include/uapi/sound/asound.h
+++ b/tools/include/uapi/sound/asound.h
@@ -1106,7 +1106,7 @@ struct snd_ctl_elem_value {
 struct snd_ctl_tlv {
 	unsigned int numid;	/* control element numeric identification */
 	unsigned int length;	/* in bytes aligned to 4 */
-	unsigned int tlv[0];	/* first TLV */
+	unsigned int tlv[];	/* first TLV */
 };
 
 #define SNDRV_CTL_IOCTL_PVERSION	_IOR('U', 0x00, int)
-- 
cgit 


From c3497fd009ef2c59eea60d21c3ac22de3585ed7d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 12 Jun 2022 19:50:29 -0400
Subject: fix short copy handling in copy_mc_pipe_to_iter()

Unlike other copying operations on ITER_PIPE, copy_mc_to_iter() can
result in a short copy.  In that case we need to trim the unused
buffers, as well as the length of partially filled one - it's not
enough to set ->head, ->iov_offset and ->count to reflect how
much had we copied.  Not hard to fix, fortunately...

I'd put a helper (pipe_discard_from(pipe, head)) into pipe_fs_i.h,
rather than iov_iter.c - it has nothing to do with iov_iter and
having it will allow us to avoid an ugly kludge in fs/splice.c.
We could put it into lib/iov_iter.c for now and move it later,
but I don't see the point going that way...

Cc: stable@kernel.org # 4.19+
Fixes: ca146f6f091e "lib/iov_iter: Fix pipe handling in _copy_to_iter_mcsafe()"
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/pipe_fs_i.h |  9 +++++++++
 lib/iov_iter.c            | 15 +++++++++++----
 2 files changed, 20 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index cb0fd633a610..4ea496924106 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -229,6 +229,15 @@ static inline bool pipe_buf_try_steal(struct pipe_inode_info *pipe,
 	return buf->ops->try_steal(pipe, buf);
 }
 
+static inline void pipe_discard_from(struct pipe_inode_info *pipe,
+		unsigned int old_head)
+{
+	unsigned int mask = pipe->ring_size - 1;
+
+	while (pipe->head > old_head)
+		pipe_buf_release(pipe, &pipe->bufs[--pipe->head & mask]);
+}
+
 /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual
    memory allocation, whereas PIPE_BUF makes atomicity guarantees.  */
 #define PIPE_SIZE		PAGE_SIZE
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 0b64695ab632..2bf20b48a04a 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -689,6 +689,7 @@ static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
 	struct pipe_inode_info *pipe = i->pipe;
 	unsigned int p_mask = pipe->ring_size - 1;
 	unsigned int i_head;
+	unsigned int valid = pipe->head;
 	size_t n, off, xfer = 0;
 
 	if (!sanity(i))
@@ -702,11 +703,17 @@ static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
 		rem = copy_mc_to_kernel(p + off, addr + xfer, chunk);
 		chunk -= rem;
 		kunmap_local(p);
-		i->head = i_head;
-		i->iov_offset = off + chunk;
-		xfer += chunk;
-		if (rem)
+		if (chunk) {
+			i->head = i_head;
+			i->iov_offset = off + chunk;
+			xfer += chunk;
+			valid = i_head + 1;
+		}
+		if (rem) {
+			pipe->bufs[i_head & p_mask].len -= rem;
+			pipe_discard_from(pipe, valid);
 			break;
+		}
 		n -= chunk;
 		off = 0;
 		i_head++;
-- 
cgit 


From 03895c8414d748747900ede2cb603d0ed3eeae1c Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri, 17 Jun 2022 00:42:12 +0200
Subject: wifi: mac80211: add gfp_t parameter to
 ieeee80211_obss_color_collision_notify

Introduce the capability to specify gfp_t parameter to
ieeee80211_obss_color_collision_notify routine since it runs in
interrupt context in ieee80211_rx_check_bss_color_collision().

Fixes: 6d945a33f2b0a ("mac80211: introduce BSS color collision detection")
Co-developed-by: Ryder Lee <ryder.lee@mediatek.com>
Signed-off-by: Ryder Lee <ryder.lee@mediatek.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/02c990fb3fbd929c8548a656477d20d6c0427a13.1655419135.git.lorenzo@kernel.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath11k/wmi.c | 3 ++-
 include/net/cfg80211.h                | 5 +++--
 include/net/mac80211.h                | 3 ++-
 net/mac80211/cfg.c                    | 4 ++--
 net/mac80211/rx.c                     | 3 ++-
 5 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/ath11k/wmi.c b/drivers/net/wireless/ath/ath11k/wmi.c
index 84d1c7054013..7b1dc19c565e 100644
--- a/drivers/net/wireless/ath/ath11k/wmi.c
+++ b/drivers/net/wireless/ath/ath11k/wmi.c
@@ -3822,7 +3822,8 @@ ath11k_wmi_obss_color_collision_event(struct ath11k_base *ab, struct sk_buff *sk
 
 	switch (ev->evt_type) {
 	case WMI_BSS_COLOR_COLLISION_DETECTION:
-		ieeee80211_obss_color_collision_notify(arvif->vif, ev->obss_color_bitmap);
+		ieeee80211_obss_color_collision_notify(arvif->vif, ev->obss_color_bitmap,
+						       GFP_KERNEL);
 		ath11k_dbg(ab, ATH11K_DBG_WMI,
 			   "OBSS color collision detected vdev:%d, event:%d, bitmap:%08llx\n",
 			   ev->vdev_id, ev->evt_type, ev->obss_color_bitmap);
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 6d02e12e4702..80f41446b1f0 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -8462,11 +8462,12 @@ int cfg80211_bss_color_notify(struct net_device *dev, gfp_t gfp,
  * cfg80211_obss_color_collision_notify - notify about bss color collision
  * @dev: network device
  * @color_bitmap: representations of the colors that the local BSS is aware of
+ * @gfp: allocation flags
  */
 static inline int cfg80211_obss_color_collision_notify(struct net_device *dev,
-						       u64 color_bitmap)
+						       u64 color_bitmap, gfp_t gfp)
 {
-	return cfg80211_bss_color_notify(dev, GFP_KERNEL,
+	return cfg80211_bss_color_notify(dev, gfp,
 					 NL80211_CMD_OBSS_COLOR_COLLISION,
 					 0, color_bitmap);
 }
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index ebadb2103968..47642b020706 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -6960,10 +6960,11 @@ ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw,
  * @vif: &struct ieee80211_vif pointer from the add_interface callback.
  * @color_bitmap: a 64 bit bitmap representing the colors that the local BSS is
  *	aware of.
+ * @gfp: allocation flags
  */
 void
 ieeee80211_obss_color_collision_notify(struct ieee80211_vif *vif,
-				       u64 color_bitmap);
+				       u64 color_bitmap, gfp_t gfp);
 
 /**
  * ieee80211_is_tx_data - check if frame is a data frame
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index f7896f257e1b..4ddf297f40f2 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -4468,14 +4468,14 @@ EXPORT_SYMBOL_GPL(ieee80211_color_change_finish);
 
 void
 ieeee80211_obss_color_collision_notify(struct ieee80211_vif *vif,
-				       u64 color_bitmap)
+				       u64 color_bitmap, gfp_t gfp)
 {
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
 
 	if (sdata->vif.color_change_active || sdata->vif.csa_active)
 		return;
 
-	cfg80211_obss_color_collision_notify(sdata->dev, color_bitmap);
+	cfg80211_obss_color_collision_notify(sdata->dev, color_bitmap, gfp);
 }
 EXPORT_SYMBOL_GPL(ieeee80211_obss_color_collision_notify);
 
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 3c08ae04ddbc..1675f8cb87f1 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3217,7 +3217,8 @@ ieee80211_rx_check_bss_color_collision(struct ieee80211_rx_data *rx)
 				      IEEE80211_HE_OPERATION_BSS_COLOR_MASK);
 		if (color == bss_conf->he_bss_color.color)
 			ieeee80211_obss_color_collision_notify(&rx->sdata->vif,
-							       BIT_ULL(color));
+							       BIT_ULL(color),
+							       GFP_ATOMIC);
 	}
 }
 
-- 
cgit 


From 9adf24a40978c19f57f44572b292b38938da7686 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 29 Jun 2022 12:25:53 +0200
Subject: fs: port HAS_UNMAPPED_ID() to vfs{g,u}id_t

The HAS_UNMAPPED_ID() helper is fully self contained so we can port it
to vfs{g,u}id_t without much effort.

Cc: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 include/linux/fs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index d6e3347cbf69..ec2e35886779 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2323,8 +2323,8 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags
 static inline bool HAS_UNMAPPED_ID(struct user_namespace *mnt_userns,
 				   struct inode *inode)
 {
-	return !uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
-	       !gid_valid(i_gid_into_mnt(mnt_userns, inode));
+	return !vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) ||
+	       !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode));
 }
 
 static inline int iocb_flags(struct file *file);
-- 
cgit 


From fc04dafd263d8c0b0251b63d47f35b29373d50f2 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 29 Jun 2022 13:15:10 +0200
Subject: mnt_idmapping: use new helpers in mapped_fs{g,u}id()

The old non-type safe helpers will soon be removed.

Cc: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 include/linux/mnt_idmapping.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h
index 21bd22a7b326..be83643cadff 100644
--- a/include/linux/mnt_idmapping.h
+++ b/include/linux/mnt_idmapping.h
@@ -422,7 +422,8 @@ static inline bool vfsgid_has_fsmapping(struct user_namespace *mnt_userns,
 static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns,
 				  struct user_namespace *fs_userns)
 {
-	return mapped_kuid_user(mnt_userns, fs_userns, current_fsuid());
+	return from_vfsuid(mnt_userns, fs_userns,
+			   VFSUIDT_INIT(current_fsuid()));
 }
 
 /**
@@ -441,7 +442,8 @@ static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns,
 static inline kgid_t mapped_fsgid(struct user_namespace *mnt_userns,
 				  struct user_namespace *fs_userns)
 {
-	return mapped_kgid_user(mnt_userns, fs_userns, current_fsgid());
+	return from_vfsgid(mnt_userns, fs_userns,
+			   VFSGIDT_INIT(current_fsgid()));
 }
 
 #endif /* _LINUX_MNT_IDMAPPING_H */
-- 
cgit 


From acd6510dd7ab3664b69eb99e37c4fd6325a7d442 Mon Sep 17 00:00:00 2001
From: Tanmay Shah <tanmay.shah@xilinx.com>
Date: Tue, 7 Jun 2022 15:42:54 -0700
Subject: firmware: xilinx: Add TF_A_PM_REGISTER_SGI SMC call

SGI interrupt register and reset is performed by EEMI ioctl
IOCTL_REGISTER_SGI. However, this is not correct use of EEMI call.
SGI registration functionality does not qualify as energy management
activity and so shouldn't be mapped to EEMI call.

This new call will replace IOCTL_REGISTER_SGI and will  be handled by TF-A
specific handler in TF-A. To maintain backward compatibility for a while
firmware driver will still use IOCTL_REGISTER_SGI as fallback strategy if
new call fails or is not supported by TF-A.

This new design also helps to make TF-A as pass through layer for EEMI
calls. So we don't have to maintain PM_IOCTL as EEMI API ID in TF-A.

Signed-off-by: Tanmay Shah <tanmay.shah@xilinx.com>
Acked-by: Michal Simek <michal.simek@amd.com>
Link: https://lore.kernel.org/r/20220607224253.54919-1-tanmay.shah@xilinx.com
Signed-off-by: Michal Simek <michal.simek@amd.com>
---
 drivers/firmware/xilinx/zynqmp.c        | 16 +++++++++++++++-
 drivers/soc/xilinx/xlnx_event_manager.c |  5 ++---
 include/linux/firmware/xlnx-zynqmp.h    |  7 +++++++
 3 files changed, 24 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c
index 7977a494a651..d1f652802181 100644
--- a/drivers/firmware/xilinx/zynqmp.c
+++ b/drivers/firmware/xilinx/zynqmp.c
@@ -2,7 +2,7 @@
 /*
  * Xilinx Zynq MPSoC Firmware layer
  *
- *  Copyright (C) 2014-2021 Xilinx, Inc.
+ *  Copyright (C) 2014-2022 Xilinx, Inc.
  *
  *  Michal Simek <michal.simek@xilinx.com>
  *  Davorin Mista <davorin.mista@aggios.com>
@@ -340,6 +340,20 @@ int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 arg0, u32 arg1,
 static u32 pm_api_version;
 static u32 pm_tz_version;
 
+int zynqmp_pm_register_sgi(u32 sgi_num, u32 reset)
+{
+	int ret;
+
+	ret = zynqmp_pm_invoke_fn(TF_A_PM_REGISTER_SGI, sgi_num, reset, 0, 0,
+				  NULL);
+	if (!ret)
+		return ret;
+
+	/* try old implementation as fallback strategy if above fails */
+	return zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_REGISTER_SGI, sgi_num,
+				   reset, NULL);
+}
+
 /**
  * zynqmp_pm_get_api_version() - Get version number of PMU PM firmware
  * @version:	Returned version value
diff --git a/drivers/soc/xilinx/xlnx_event_manager.c b/drivers/soc/xilinx/xlnx_event_manager.c
index 5dcb7665fe22..2de082765bef 100644
--- a/drivers/soc/xilinx/xlnx_event_manager.c
+++ b/drivers/soc/xilinx/xlnx_event_manager.c
@@ -647,8 +647,7 @@ static int xlnx_event_manager_probe(struct platform_device *pdev)
 	cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "soc/event:starting",
 			  xlnx_event_cpuhp_start, xlnx_event_cpuhp_down);
 
-	ret = zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_REGISTER_SGI, sgi_num,
-				  0, NULL);
+	ret = zynqmp_pm_register_sgi(sgi_num, 0);
 	if (ret) {
 		dev_err(&pdev->dev, "SGI %d Registration over TF-A failed with %d\n", sgi_num, ret);
 		xlnx_event_cleanup_sgi(pdev);
@@ -681,7 +680,7 @@ static int xlnx_event_manager_remove(struct platform_device *pdev)
 		kfree(eve_data);
 	}
 
-	ret = zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_REGISTER_SGI, 0, 1, NULL);
+	ret = zynqmp_pm_register_sgi(0, 1);
 	if (ret)
 		dev_err(&pdev->dev, "SGI unregistration over TF-A failed with %d\n", ret);
 
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 1ec73d5352c3..cbde3b1fa414 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -34,6 +34,7 @@
 #define PM_API_VERSION_2	2
 
 /* ATF only commands */
+#define TF_A_PM_REGISTER_SGI		0xa04
 #define PM_GET_TRUSTZONE_VERSION	0xa03
 #define PM_SET_SUSPEND_MODE		0xa02
 #define GET_CALLBACK_DATA		0xa01
@@ -468,6 +469,7 @@ int zynqmp_pm_feature(const u32 api_id);
 int zynqmp_pm_is_function_supported(const u32 api_id, const u32 id);
 int zynqmp_pm_set_feature_config(enum pm_feature_config_id id, u32 value);
 int zynqmp_pm_get_feature_config(enum pm_feature_config_id id, u32 *payload);
+int zynqmp_pm_register_sgi(u32 sgi_num, u32 reset);
 #else
 static inline int zynqmp_pm_get_api_version(u32 *version)
 {
@@ -733,6 +735,11 @@ static inline int zynqmp_pm_get_feature_config(enum pm_feature_config_id id,
 {
 	return -ENODEV;
 }
+
+static inline int zynqmp_pm_register_sgi(u32 sgi_num, u32 reset)
+{
+	return -ENODEV;
+}
 #endif
 
 #endif /* __FIRMWARE_ZYNQMP_H__ */
-- 
cgit 


From 6ffcd825e7d0416d78fd41cd5b7856a78122cc8c Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 28 Jun 2022 20:41:40 -0400
Subject: mm: Remove __delete_from_page_cache()

This wrapper is no longer used.  Remove it and all references to it.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 fs/inode.c              | 2 +-
 include/linux/pagemap.h | 4 ----
 mm/memory-failure.c     | 2 +-
 mm/shmem.c              | 2 +-
 mm/truncate.c           | 2 +-
 5 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/fs/inode.c b/fs/inode.c
index bd4da9c5207e..d2bdc407c94b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -604,7 +604,7 @@ void clear_inode(struct inode *inode)
 {
 	/*
 	 * We have to cycle the i_pages lock here because reclaim can be in the
-	 * process of removing the last page (in __delete_from_page_cache())
+	 * process of removing the last page (in __filemap_remove_folio())
 	 * and we must not free the mapping under it.
 	 */
 	xa_lock_irq(&inode->i_data.i_pages);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index ce96866fbec4..44b9c265a234 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1107,10 +1107,6 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
 void filemap_remove_folio(struct folio *folio);
 void delete_from_page_cache(struct page *page);
 void __filemap_remove_folio(struct folio *folio, void *shadow);
-static inline void __delete_from_page_cache(struct page *page, void *shadow)
-{
-	__filemap_remove_folio(page_folio(page), shadow);
-}
 void replace_page_cache_page(struct page *old, struct page *new);
 void delete_from_page_cache_batch(struct address_space *mapping,
 				  struct folio_batch *fbatch);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b85661cbdc4a..a859486ddda9 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1935,7 +1935,7 @@ try_again:
 
 	/*
 	 * Now take care of user space mappings.
-	 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
+	 * Abort on fail: __filemap_remove_folio() assumes unmapped page.
 	 */
 	if (!hwpoison_user_mappings(p, pfn, flags, p)) {
 		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
diff --git a/mm/shmem.c b/mm/shmem.c
index a6f565308133..b42dfb01c634 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -392,7 +392,7 @@ void shmem_uncharge(struct inode *inode, long pages)
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	unsigned long flags;
 
-	/* nrpages adjustment done by __delete_from_page_cache() or caller */
+	/* nrpages adjustment done by __filemap_remove_folio() or caller */
 
 	spin_lock_irqsave(&info->lock, flags);
 	info->alloced -= pages;
diff --git a/mm/truncate.c b/mm/truncate.c
index ab50d0d59a2a..0b0708bf935f 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -443,7 +443,7 @@ EXPORT_SYMBOL(truncate_inode_pages_range);
  * mapping->invalidate_lock.
  *
  * Note: When this function returns, there can be a page in the process of
- * deletion (inside __delete_from_page_cache()) in the specified range.  Thus
+ * deletion (inside __filemap_remove_folio()) in the specified range.  Thus
  * mapping->nrpages can be non-zero when this function returns even after
  * truncation of the whole mapping.
  */
-- 
cgit 


From 2bb876b58d593d7f2522ec0f41f20a74fde76822 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 1 Jun 2022 15:13:59 -0400
Subject: filemap: Remove add_to_page_cache() and add_to_page_cache_locked()

These functions have no more users, so delete them.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
---
 Documentation/admin-guide/cgroup-v1/memcg_test.rst |  2 +-
 include/linux/pagemap.h                            | 18 ------------------
 mm/filemap.c                                       | 20 --------------------
 mm/shmem.c                                         |  2 +-
 mm/swap_state.c                                    |  2 +-
 5 files changed, 3 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/cgroup-v1/memcg_test.rst b/Documentation/admin-guide/cgroup-v1/memcg_test.rst
index 45b94f7b3beb..a402359abb99 100644
--- a/Documentation/admin-guide/cgroup-v1/memcg_test.rst
+++ b/Documentation/admin-guide/cgroup-v1/memcg_test.rst
@@ -97,7 +97,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
 =============
 
 	Page Cache is charged at
-	- add_to_page_cache_locked().
+	- filemap_add_folio().
 
 	The logic is very clear. (About migration, see below)
 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 44b9c265a234..dee519ef7436 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1098,8 +1098,6 @@ size_t fault_in_subpage_writeable(char __user *uaddr, size_t size);
 size_t fault_in_safe_writeable(const char __user *uaddr, size_t size);
 size_t fault_in_readable(const char __user *uaddr, size_t size);
 
-int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
-		pgoff_t index, gfp_t gfp);
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 		pgoff_t index, gfp_t gfp);
 int filemap_add_folio(struct address_space *mapping, struct folio *folio,
@@ -1115,22 +1113,6 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp);
 loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end,
 		int whence);
 
-/*
- * Like add_to_page_cache_locked, but used to add newly allocated pages:
- * the page is new, so we can just run __SetPageLocked() against it.
- */
-static inline int add_to_page_cache(struct page *page,
-		struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
-{
-	int error;
-
-	__SetPageLocked(page);
-	error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
-	if (unlikely(error))
-		__ClearPageLocked(page);
-	return error;
-}
-
 /* Must be non-static for BPF error injection */
 int __filemap_add_folio(struct address_space *mapping, struct folio *folio,
 		pgoff_t index, gfp_t gfp, void **shadowp);
diff --git a/mm/filemap.c b/mm/filemap.c
index ffdfbc8b0e3c..22a17ab256f7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -929,26 +929,6 @@ error:
 }
 ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
 
-/**
- * add_to_page_cache_locked - add a locked page to the pagecache
- * @page:	page to add
- * @mapping:	the page's address_space
- * @offset:	page index
- * @gfp_mask:	page allocation mode
- *
- * This function is used to add a page to the pagecache. It must be locked.
- * This function does not add the page to the LRU.  The caller must do that.
- *
- * Return: %0 on success, negative error code otherwise.
- */
-int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
-		pgoff_t offset, gfp_t gfp_mask)
-{
-	return __filemap_add_folio(mapping, page_folio(page), offset,
-					  gfp_mask, NULL);
-}
-EXPORT_SYMBOL(add_to_page_cache_locked);
-
 int filemap_add_folio(struct address_space *mapping, struct folio *folio,
 				pgoff_t index, gfp_t gfp)
 {
diff --git a/mm/shmem.c b/mm/shmem.c
index b42dfb01c634..6a5e46f1a326 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -693,7 +693,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 /*
- * Like add_to_page_cache_locked, but error if expected item has gone.
+ * Like filemap_add_folio, but error if expected item has gone.
  */
 static int shmem_add_to_page_cache(struct folio *folio,
 				   struct address_space *mapping,
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 778d57d2d92d..f5b6f5638908 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -95,7 +95,7 @@ void *get_shadow_from_swap_cache(swp_entry_t entry)
 }
 
 /*
- * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
+ * add_to_swap_cache resembles filemap_add_folio on swapper_space,
  * but sets SwapCache flag and private instead of mapping and index.
  */
 int add_to_swap_cache(struct page *page, swp_entry_t entry,
-- 
cgit 


From be0ced5e9cb81a4c2edefd081a7794a1814fb60d Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 3 Jun 2022 15:30:25 -0400
Subject: filemap: Add filemap_get_folios()

This is the equivalent of find_get_pages() but fills a folio_batch
instead of an array of pages.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 include/linux/pagemap.h |  2 ++
 mm/filemap.c            | 59 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+)

(limited to 'include')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index dee519ef7436..cfd0e8001b3b 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -718,6 +718,8 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index)
 	return head + (index & (thp_nr_pages(head) - 1));
 }
 
+unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
+		pgoff_t end, struct folio_batch *fbatch);
 unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
 			pgoff_t end, unsigned int nr_pages,
 			struct page **pages);
diff --git a/mm/filemap.c b/mm/filemap.c
index 22a17ab256f7..ce3209a8ad49 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2127,6 +2127,65 @@ put:
 	return folio_batch_count(fbatch);
 }
 
+/**
+ * filemap_get_folios - Get a batch of folios
+ * @mapping:	The address_space to search
+ * @start:	The starting page index
+ * @end:	The final page index (inclusive)
+ * @fbatch:	The batch to fill.
+ *
+ * Search for and return a batch of folios in the mapping starting at
+ * index @start and up to index @end (inclusive).  The folios are returned
+ * in @fbatch with an elevated reference count.
+ *
+ * The first folio may start before @start; if it does, it will contain
+ * @start.  The final folio may extend beyond @end; if it does, it will
+ * contain @end.  The folios have ascending indices.  There may be gaps
+ * between the folios if there are indices which have no folio in the
+ * page cache.  If folios are added to or removed from the page cache
+ * while this is running, they may or may not be found by this call.
+ *
+ * Return: The number of folios which were found.
+ * We also update @start to index the next folio for the traversal.
+ */
+unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
+		pgoff_t end, struct folio_batch *fbatch)
+{
+	XA_STATE(xas, &mapping->i_pages, *start);
+	struct folio *folio;
+
+	rcu_read_lock();
+	while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
+		/* Skip over shadow, swap and DAX entries */
+		if (xa_is_value(folio))
+			continue;
+		if (!folio_batch_add(fbatch, folio)) {
+			unsigned long nr = folio_nr_pages(folio);
+
+			if (folio_test_hugetlb(folio))
+				nr = 1;
+			*start = folio->index + nr;
+			goto out;
+		}
+	}
+
+	/*
+	 * We come here when there is no page beyond @end. We take care to not
+	 * overflow the index @start as it confuses some of the callers. This
+	 * breaks the iteration when there is a page at index -1 but that is
+	 * already broken anyway.
+	 */
+	if (end == (pgoff_t)-1)
+		*start = (pgoff_t)-1;
+	else
+		*start = end + 1;
+out:
+	rcu_read_unlock();
+
+	return folio_batch_count(fbatch);
+}
+EXPORT_SYMBOL(filemap_get_folios);
+
 static inline
 bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max)
 {
-- 
cgit 


From 77414d195f905dd43f58bce82118775ffa59575c Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sat, 4 Jun 2022 17:39:09 -0400
Subject: vmscan: Add check_move_unevictable_folios()

Change the guts of check_move_unevictable_pages() over to use folios
and add check_move_unevictable_pages() as a wrapper.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 include/linux/swap.h |  3 ++-
 mm/vmscan.c          | 56 +++++++++++++++++++++++++++++++---------------------
 2 files changed, 36 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0c0fed1b348f..8672a7123ccd 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -438,7 +438,8 @@ static inline bool node_reclaim_enabled(void)
 	return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP);
 }
 
-extern void check_move_unevictable_pages(struct pagevec *pvec);
+void check_move_unevictable_folios(struct folio_batch *fbatch);
+void check_move_unevictable_pages(struct pagevec *pvec);
 
 extern void kswapd_run(int nid);
 extern void kswapd_stop(int nid);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f7d9a683e3a7..04f8671caad9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4790,45 +4790,57 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
 }
 #endif
 
+void check_move_unevictable_pages(struct pagevec *pvec)
+{
+	struct folio_batch fbatch;
+	unsigned i;
+
+	folio_batch_init(&fbatch);
+	for (i = 0; i < pvec->nr; i++) {
+		struct page *page = pvec->pages[i];
+
+		if (PageTransTail(page))
+			continue;
+		folio_batch_add(&fbatch, page_folio(page));
+	}
+	check_move_unevictable_folios(&fbatch);
+}
+EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
+
 /**
- * check_move_unevictable_pages - check pages for evictability and move to
- * appropriate zone lru list
- * @pvec: pagevec with lru pages to check
+ * check_move_unevictable_folios - Move evictable folios to appropriate zone
+ * lru list
+ * @fbatch: Batch of lru folios to check.
  *
- * Checks pages for evictability, if an evictable page is in the unevictable
+ * Checks folios for evictability, if an evictable folio is in the unevictable
  * lru list, moves it to the appropriate evictable lru list. This function
- * should be only used for lru pages.
+ * should be only used for lru folios.
  */
-void check_move_unevictable_pages(struct pagevec *pvec)
+void check_move_unevictable_folios(struct folio_batch *fbatch)
 {
 	struct lruvec *lruvec = NULL;
 	int pgscanned = 0;
 	int pgrescued = 0;
 	int i;
 
-	for (i = 0; i < pvec->nr; i++) {
-		struct page *page = pvec->pages[i];
-		struct folio *folio = page_folio(page);
-		int nr_pages;
-
-		if (PageTransTail(page))
-			continue;
+	for (i = 0; i < fbatch->nr; i++) {
+		struct folio *folio = fbatch->folios[i];
+		int nr_pages = folio_nr_pages(folio);
 
-		nr_pages = thp_nr_pages(page);
 		pgscanned += nr_pages;
 
-		/* block memcg migration during page moving between lru */
-		if (!TestClearPageLRU(page))
+		/* block memcg migration while the folio moves between lrus */
+		if (!folio_test_clear_lru(folio))
 			continue;
 
 		lruvec = folio_lruvec_relock_irq(folio, lruvec);
-		if (page_evictable(page) && PageUnevictable(page)) {
-			del_page_from_lru_list(page, lruvec);
-			ClearPageUnevictable(page);
-			add_page_to_lru_list(page, lruvec);
+		if (folio_evictable(folio) && folio_test_unevictable(folio)) {
+			lruvec_del_folio(lruvec, folio);
+			folio_clear_unevictable(folio);
+			lruvec_add_folio(lruvec, folio);
 			pgrescued += nr_pages;
 		}
-		SetPageLRU(page);
+		folio_set_lru(folio);
 	}
 
 	if (lruvec) {
@@ -4839,4 +4851,4 @@ void check_move_unevictable_pages(struct pagevec *pvec)
 		count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
 	}
 }
-EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
+EXPORT_SYMBOL_GPL(check_move_unevictable_folios);
-- 
cgit 


From bb4b42ba926224e26ed699e51def164b4b163935 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sat, 4 Jun 2022 17:46:02 -0400
Subject: filemap: Remove find_get_pages_range() and associated functions

All callers of find_get_pages_range(), pagevec_lookup_range() and
pagevec_lookup() have now been removed.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 include/linux/pagemap.h |  3 ---
 include/linux/pagevec.h | 10 --------
 mm/filemap.c            | 67 -------------------------------------------------
 mm/swap.c               | 29 ---------------------
 4 files changed, 109 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index cfd0e8001b3b..87d4ea571240 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -720,9 +720,6 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index)
 
 unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
 		pgoff_t end, struct folio_batch *fbatch);
-unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
-			pgoff_t end, unsigned int nr_pages,
-			struct page **pages);
 unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
 			       unsigned int nr_pages, struct page **pages);
 unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 67b1246f136b..6649154a2115 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -27,16 +27,6 @@ struct pagevec {
 
 void __pagevec_release(struct pagevec *pvec);
 void __pagevec_lru_add(struct pagevec *pvec);
-unsigned pagevec_lookup_range(struct pagevec *pvec,
-			      struct address_space *mapping,
-			      pgoff_t *start, pgoff_t end);
-static inline unsigned pagevec_lookup(struct pagevec *pvec,
-				      struct address_space *mapping,
-				      pgoff_t *start)
-{
-	return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1);
-}
-
 unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
 		struct address_space *mapping, pgoff_t *index, pgoff_t end,
 		xa_mark_t tag);
diff --git a/mm/filemap.c b/mm/filemap.c
index ce3209a8ad49..15399e8cd281 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2196,73 +2196,6 @@ bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max)
 	return index < folio->index + folio_nr_pages(folio) - 1;
 }
 
-/**
- * find_get_pages_range - gang pagecache lookup
- * @mapping:	The address_space to search
- * @start:	The starting page index
- * @end:	The final page index (inclusive)
- * @nr_pages:	The maximum number of pages
- * @pages:	Where the resulting pages are placed
- *
- * find_get_pages_range() will search for and return a group of up to @nr_pages
- * pages in the mapping starting at index @start and up to index @end
- * (inclusive).  The pages are placed at @pages.  find_get_pages_range() takes
- * a reference against the returned pages.
- *
- * The search returns a group of mapping-contiguous pages with ascending
- * indexes.  There may be holes in the indices due to not-present pages.
- * We also update @start to index the next page for the traversal.
- *
- * Return: the number of pages which were found. If this number is
- * smaller than @nr_pages, the end of specified range has been
- * reached.
- */
-unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
-			      pgoff_t end, unsigned int nr_pages,
-			      struct page **pages)
-{
-	XA_STATE(xas, &mapping->i_pages, *start);
-	struct folio *folio;
-	unsigned ret = 0;
-
-	if (unlikely(!nr_pages))
-		return 0;
-
-	rcu_read_lock();
-	while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
-		/* Skip over shadow, swap and DAX entries */
-		if (xa_is_value(folio))
-			continue;
-
-again:
-		pages[ret] = folio_file_page(folio, xas.xa_index);
-		if (++ret == nr_pages) {
-			*start = xas.xa_index + 1;
-			goto out;
-		}
-		if (folio_more_pages(folio, xas.xa_index, end)) {
-			xas.xa_index++;
-			folio_ref_inc(folio);
-			goto again;
-		}
-	}
-
-	/*
-	 * We come here when there is no page beyond @end. We take care to not
-	 * overflow the index @start as it confuses some of the callers. This
-	 * breaks the iteration when there is a page at index -1 but that is
-	 * already broken anyway.
-	 */
-	if (end == (pgoff_t)-1)
-		*start = (pgoff_t)-1;
-	else
-		*start = end + 1;
-out:
-	rcu_read_unlock();
-
-	return ret;
-}
-
 /**
  * find_get_pages_contig - gang contiguous pagecache lookup
  * @mapping:	The address_space to search
diff --git a/mm/swap.c b/mm/swap.c
index f3922a96b2e9..f65e284247b2 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1086,35 +1086,6 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
 	fbatch->nr = j;
 }
 
-/**
- * pagevec_lookup_range - gang pagecache lookup
- * @pvec:	Where the resulting pages are placed
- * @mapping:	The address_space to search
- * @start:	The starting page index
- * @end:	The final page index
- *
- * pagevec_lookup_range() will search for & return a group of up to PAGEVEC_SIZE
- * pages in the mapping starting from index @start and upto index @end
- * (inclusive).  The pages are placed in @pvec.  pagevec_lookup() takes a
- * reference against the pages in @pvec.
- *
- * The search returns a group of mapping-contiguous pages with ascending
- * indexes.  There may be holes in the indices due to not-present pages. We
- * also update @start to index the next page for the traversal.
- *
- * pagevec_lookup_range() returns the number of pages which were found. If this
- * number is smaller than PAGEVEC_SIZE, the end of specified range has been
- * reached.
- */
-unsigned pagevec_lookup_range(struct pagevec *pvec,
-		struct address_space *mapping, pgoff_t *start, pgoff_t end)
-{
-	pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE,
-					pvec->pages);
-	return pagevec_count(pvec);
-}
-EXPORT_SYMBOL(pagevec_lookup_range);
-
 unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
 		struct address_space *mapping, pgoff_t *index, pgoff_t end,
 		xa_mark_t tag)
-- 
cgit 


From 0e8e08cca5e3256a6209f02b482bee96fb91ba1b Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 29 Apr 2022 08:49:28 -0400
Subject: netfs: Remove extern from function prototypes

The 'extern' keyword is not necessary and removing it lets us shorten
some lines.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/netfs.h | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 1773e5df8e65..11df38d03359 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -276,19 +276,18 @@ struct netfs_cache_ops {
 };
 
 struct readahead_control;
-extern void netfs_readahead(struct readahead_control *);
+void netfs_readahead(struct readahead_control *);
 int netfs_read_folio(struct file *, struct folio *);
-extern int netfs_write_begin(struct netfs_inode *,
-			     struct file *, struct address_space *,
-			     loff_t, unsigned int, struct folio **,
-			     void **);
-
-extern void netfs_subreq_terminated(struct netfs_io_subrequest *, ssize_t, bool);
-extern void netfs_get_subrequest(struct netfs_io_subrequest *subreq,
-				 enum netfs_sreq_ref_trace what);
-extern void netfs_put_subrequest(struct netfs_io_subrequest *subreq,
-				 bool was_async, enum netfs_sreq_ref_trace what);
-extern void netfs_stats_show(struct seq_file *);
+int netfs_write_begin(struct netfs_inode *, struct file *,
+		struct address_space *, loff_t pos, unsigned int len,
+		struct folio **, void **fsdata);
+
+void netfs_subreq_terminated(struct netfs_io_subrequest *, ssize_t, bool);
+void netfs_get_subrequest(struct netfs_io_subrequest *subreq,
+			  enum netfs_sreq_ref_trace what);
+void netfs_put_subrequest(struct netfs_io_subrequest *subreq,
+			  bool was_async, enum netfs_sreq_ref_trace what);
+void netfs_stats_show(struct seq_file *);
 
 /**
  * netfs_inode - Get the netfs inode context from the inode
-- 
cgit 


From a71a62dd5e012e3786e54d43b260ed9d74e75a1a Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 15 Jun 2022 15:58:40 +0200
Subject: dt-bindings: pinctrl: renesas: Remove spaces before #define

Remove spaces at the beginning of lines with #defines.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/5188ef93a911ce3781b16530fdebbf0f0af462b6.1655301264.git.geert+renesas@glider.be
---
 include/dt-bindings/pinctrl/r7s9210-pinctrl.h | 2 +-
 include/dt-bindings/pinctrl/rzg2l-pinctrl.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/dt-bindings/pinctrl/r7s9210-pinctrl.h b/include/dt-bindings/pinctrl/r7s9210-pinctrl.h
index 2d0c23e5d3a7..8736ce038eca 100644
--- a/include/dt-bindings/pinctrl/r7s9210-pinctrl.h
+++ b/include/dt-bindings/pinctrl/r7s9210-pinctrl.h
@@ -42,6 +42,6 @@
 /*
  * Convert a port and pin label to its global pin index
  */
- #define RZA2_PIN(port, pin)	((port) * RZA2_PINS_PER_PORT + (pin))
+#define RZA2_PIN(port, pin)	((port) * RZA2_PINS_PER_PORT + (pin))
 
 #endif /* __DT_BINDINGS_PINCTRL_RENESAS_RZA2_H */
diff --git a/include/dt-bindings/pinctrl/rzg2l-pinctrl.h b/include/dt-bindings/pinctrl/rzg2l-pinctrl.h
index b48f8c7a5556..c78ed5e5efb7 100644
--- a/include/dt-bindings/pinctrl/rzg2l-pinctrl.h
+++ b/include/dt-bindings/pinctrl/rzg2l-pinctrl.h
@@ -18,6 +18,6 @@
 #define RZG2L_PORT_PINMUX(b, p, f)	((b) * RZG2L_PINS_PER_PORT + (p) | ((f) << 16))
 
 /* Convert a port and pin label to its global pin index */
- #define RZG2L_GPIO(port, pin)	((port) * RZG2L_PINS_PER_PORT + (pin))
+#define RZG2L_GPIO(port, pin)	((port) * RZG2L_PINS_PER_PORT + (pin))
 
 #endif /* __DT_BINDINGS_RZG2L_PINCTRL_H */
-- 
cgit 


From a57f68ddc8865d59a19783080cc52fb4a11dc209 Mon Sep 17 00:00:00 2001
From: Serge Semin <Sergey.Semin@baikalelectronics.ru>
Date: Fri, 24 Jun 2022 17:18:45 +0300
Subject: reset: Fix devm bulk optional exclusive control getter

Most likely due to copy-paste mistake the device managed version of the
denoted reset control getter has been implemented with invalid semantic,
which can be immediately spotted by having "WARN_ON(shared && acquired)"
warning in the system log as soon as the method is called. Anyway let's
fix it by altering the boolean arguments passed to the
__devm_reset_control_bulk_get() method from
- shared = true, optional = false, acquired = true
to
+ shared = false, optional = true, acquired = true
That's what they were supposed to be in the first place (see the non-devm
version of the same method: reset_control_bulk_get_optional_exclusive()).

Fixes: 48d71395896d ("reset: Add reset_control_bulk API")
Signed-off-by: Serge Semin <Sergey.Semin@baikalelectronics.ru>
Reviewed-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
Link: https://lore.kernel.org/r/20220624141853.7417-2-Sergey.Semin@baikalelectronics.ru
---
 include/linux/reset.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/reset.h b/include/linux/reset.h
index 8a21b5756c3e..514ddf003efc 100644
--- a/include/linux/reset.h
+++ b/include/linux/reset.h
@@ -731,7 +731,7 @@ static inline int __must_check
 devm_reset_control_bulk_get_optional_exclusive(struct device *dev, int num_rstcs,
 					       struct reset_control_bulk_data *rstcs)
 {
-	return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, true, false, true);
+	return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, false, true, true);
 }
 
 /**
-- 
cgit 


From 45ed13d9b40cc711a996192a8ef98c0f773238e6 Mon Sep 17 00:00:00 2001
From: Clément Léger <clement.leger@bootlin.com>
Date: Fri, 24 Jun 2022 16:39:49 +0200
Subject: dt-bindings: net: pcs: add bindings for Renesas RZ/N1 MII converter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This MII converter can be found on the RZ/N1 processor family. The MII
converter ports are declared as subnodes which are then referenced by
users of the PCS driver such as the switch.

Signed-off-by: Clément Léger <clement.leger@bootlin.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Link: https://lore.kernel.org/r/20220624144001.95518-5-clement.leger@bootlin.com
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 .../bindings/net/pcs/renesas,rzn1-miic.yaml        | 171 +++++++++++++++++++++
 include/dt-bindings/net/pcs-rzn1-miic.h            |  33 ++++
 2 files changed, 204 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/pcs/renesas,rzn1-miic.yaml
 create mode 100644 include/dt-bindings/net/pcs-rzn1-miic.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/net/pcs/renesas,rzn1-miic.yaml b/Documentation/devicetree/bindings/net/pcs/renesas,rzn1-miic.yaml
new file mode 100644
index 000000000000..2d33bbab7163
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/pcs/renesas,rzn1-miic.yaml
@@ -0,0 +1,171 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/pcs/renesas,rzn1-miic.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Renesas RZ/N1 MII converter
+
+maintainers:
+  - Clément Léger <clement.leger@bootlin.com>
+
+description: |
+  This MII converter is present on the Renesas RZ/N1 SoC family. It is
+  responsible to do MII passthrough or convert it to RMII/RGMII.
+
+properties:
+  '#address-cells':
+    const: 1
+
+  '#size-cells':
+    const: 0
+
+  compatible:
+    items:
+      - enum:
+          - renesas,r9a06g032-miic
+      - const: renesas,rzn1-miic
+
+  reg:
+    maxItems: 1
+
+  clocks:
+    items:
+      - description: MII reference clock
+      - description: RGMII reference clock
+      - description: RMII reference clock
+      - description: AHB clock used for the MII converter register interface
+
+  clock-names:
+    items:
+      - const: mii_ref
+      - const: rgmii_ref
+      - const: rmii_ref
+      - const: hclk
+
+  renesas,miic-switch-portin:
+    description: MII Switch PORTIN configuration. This value should use one of
+      the values defined in dt-bindings/net/pcs-rzn1-miic.h.
+    $ref: /schemas/types.yaml#/definitions/uint32
+    enum: [1, 2]
+
+  power-domains:
+    maxItems: 1
+
+patternProperties:
+  "^mii-conv@[0-5]$":
+    type: object
+    description: MII converter port
+
+    properties:
+      reg:
+        description: MII Converter port number.
+        enum: [1, 2, 3, 4, 5]
+
+      renesas,miic-input:
+        description: Converter input port configuration. This value should use
+          one of the values defined in dt-bindings/net/pcs-rzn1-miic.h.
+        $ref: /schemas/types.yaml#/definitions/uint32
+
+    required:
+      - reg
+      - renesas,miic-input
+
+    additionalProperties: false
+
+    allOf:
+      - if:
+          properties:
+            reg:
+              const: 1
+        then:
+          properties:
+            renesas,miic-input:
+              const: 0
+      - if:
+          properties:
+            reg:
+              const: 2
+        then:
+          properties:
+            renesas,miic-input:
+              enum: [1, 11]
+      - if:
+          properties:
+            reg:
+              const: 3
+        then:
+          properties:
+            renesas,miic-input:
+              enum: [7, 10]
+      - if:
+          properties:
+            reg:
+              const: 4
+        then:
+          properties:
+            renesas,miic-input:
+              enum: [4, 6, 9, 13]
+      - if:
+          properties:
+            reg:
+              const: 5
+        then:
+          properties:
+            renesas,miic-input:
+              enum: [3, 5, 8, 12]
+
+required:
+  - '#address-cells'
+  - '#size-cells'
+  - compatible
+  - reg
+  - clocks
+  - clock-names
+  - power-domains
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/net/pcs-rzn1-miic.h>
+    #include <dt-bindings/clock/r9a06g032-sysctrl.h>
+
+    eth-miic@44030000 {
+      #address-cells = <1>;
+      #size-cells = <0>;
+      compatible = "renesas,r9a06g032-miic", "renesas,rzn1-miic";
+      reg = <0x44030000 0x10000>;
+      clocks = <&sysctrl R9A06G032_CLK_MII_REF>,
+              <&sysctrl R9A06G032_CLK_RGMII_REF>,
+              <&sysctrl R9A06G032_CLK_RMII_REF>,
+              <&sysctrl R9A06G032_HCLK_SWITCH_RG>;
+      clock-names = "mii_ref", "rgmii_ref", "rmii_ref", "hclk";
+      renesas,miic-switch-portin = <MIIC_GMAC2_PORT>;
+      power-domains = <&sysctrl>;
+
+      mii_conv1: mii-conv@1 {
+        renesas,miic-input = <MIIC_GMAC1_PORT>;
+        reg = <1>;
+      };
+
+      mii_conv2: mii-conv@2 {
+        renesas,miic-input = <MIIC_SWITCH_PORTD>;
+        reg = <2>;
+      };
+
+      mii_conv3: mii-conv@3 {
+        renesas,miic-input = <MIIC_SWITCH_PORTC>;
+        reg = <3>;
+      };
+
+      mii_conv4: mii-conv@4 {
+        renesas,miic-input = <MIIC_SWITCH_PORTB>;
+        reg = <4>;
+      };
+
+      mii_conv5: mii-conv@5 {
+        renesas,miic-input = <MIIC_SWITCH_PORTA>;
+        reg = <5>;
+      };
+    };
diff --git a/include/dt-bindings/net/pcs-rzn1-miic.h b/include/dt-bindings/net/pcs-rzn1-miic.h
new file mode 100644
index 000000000000..784782eaec9e
--- /dev/null
+++ b/include/dt-bindings/net/pcs-rzn1-miic.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (C) 2022 Schneider-Electric
+ *
+ * Clément Léger <clement.leger@bootlin.com>
+ */
+
+#ifndef _DT_BINDINGS_PCS_RZN1_MIIC
+#define _DT_BINDINGS_PCS_RZN1_MIIC
+
+/*
+ * Reefer to the datasheet [1] section 8.2.1, Internal Connection of Ethernet
+ * Ports to check the available combination
+ *
+ * [1] REN_r01uh0750ej0140-rzn1-introduction_MAT_20210228.pdf
+ */
+
+#define MIIC_GMAC1_PORT			0
+#define MIIC_GMAC2_PORT			1
+#define MIIC_RTOS_PORT			2
+#define MIIC_SERCOS_PORTA		3
+#define MIIC_SERCOS_PORTB		4
+#define MIIC_ETHERCAT_PORTA		5
+#define MIIC_ETHERCAT_PORTB		6
+#define MIIC_ETHERCAT_PORTC		7
+#define MIIC_SWITCH_PORTA		8
+#define MIIC_SWITCH_PORTB		9
+#define MIIC_SWITCH_PORTC		10
+#define MIIC_SWITCH_PORTD		11
+#define MIIC_HSR_PORTA			12
+#define MIIC_HSR_PORTB			13
+
+#endif
-- 
cgit 


From 77940f0d96cd2ec9fe2125f74f513a7254bcdd7f Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 29 Jun 2022 16:31:12 +0200
Subject: mnt_idmapping: align kernel doc and parameter order

The kernel documentation added for the new helpers had a few tiny
ordering issues. Fix them.

Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 include/linux/mnt_idmapping.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h
index be83643cadff..41dc80f8b67c 100644
--- a/include/linux/mnt_idmapping.h
+++ b/include/linux/mnt_idmapping.h
@@ -70,12 +70,12 @@ static inline bool vfsgid_eq(vfsgid_t left, vfsgid_t right)
 
 /**
  * vfsuid_eq_kuid - check whether kuid and vfsuid have the same value
- * @kuid: the kuid to compare
  * @vfsuid: the vfsuid to compare
+ * @kuid: the kuid to compare
  *
- * Check whether @kuid and @vfsuid have the same values.
+ * Check whether @vfsuid and @kuid have the same values.
  *
- * Return: true if @kuid and @vfsuid have the same value, false if not.
+ * Return: true if @vfsuid and @kuid have the same value, false if not.
  * Comparison between two invalid uids returns false.
  */
 static inline bool vfsuid_eq_kuid(vfsuid_t vfsuid, kuid_t kuid)
@@ -85,15 +85,15 @@ static inline bool vfsuid_eq_kuid(vfsuid_t vfsuid, kuid_t kuid)
 
 /**
  * vfsgid_eq_kgid - check whether kgid and vfsgid have the same value
- * @kgid: the kgid to compare
  * @vfsgid: the vfsgid to compare
+ * @kgid: the kgid to compare
  *
- * Check whether @kgid and @vfsgid have the same values.
+ * Check whether @vfsgid and @kgid have the same values.
  *
- * Return: true if @kgid and @vfsgid have the same value, false if not.
+ * Return: true if @vfsgid and @kgid have the same value, false if not.
  * Comparison between two invalid gids returns false.
  */
-static inline bool vfsgid_eq_kgid(kgid_t kgid, vfsgid_t vfsgid)
+static inline bool vfsgid_eq_kgid(vfsgid_t vfsgid, kgid_t kgid)
 {
 	return vfsgid_valid(vfsgid) && __vfsgid_val(vfsgid) == __kgid_val(kgid);
 }
@@ -171,7 +171,7 @@ static inline bool no_idmapping(const struct user_namespace *mnt_userns,
 }
 
 /**
- * mapped_kuid_fs - map a filesystem kuid into a mnt_userns
+ * make_vfsuid - map a filesystem kuid into a mnt_userns
  * @mnt_userns: the mount's idmapping
  * @fs_userns: the filesystem's idmapping
  * @kuid : kuid to be mapped
@@ -216,7 +216,7 @@ static inline kuid_t mapped_kuid_fs(struct user_namespace *mnt_userns,
 }
 
 /**
- * mapped_kgid_fs - map a filesystem kgid into a mnt_userns
+ * make_vfsgid - map a filesystem kgid into a mnt_userns
  * @mnt_userns: the mount's idmapping
  * @fs_userns: the filesystem's idmapping
  * @kgid : kgid to be mapped
-- 
cgit 


From 6a27d28c81bc5843de2490688a04ee5baa6615e7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 29 Jun 2022 08:20:12 +0200
Subject: block: move ->ia_ranges from the request_queue to the gendisk

Independent access ranges only matter for file system I/O and are only
valid with a registered gendisk, so move them there.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Tested-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Link: https://lore.kernel.org/r/20220629062013.1331068-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-ia-ranges.c  | 18 +++++++++---------
 include/linux/blkdev.h | 12 ++++++------
 2 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c
index 47c89e65b57f..c1bf14bcd15f 100644
--- a/block/blk-ia-ranges.c
+++ b/block/blk-ia-ranges.c
@@ -106,7 +106,7 @@ static struct kobj_type blk_ia_ranges_ktype = {
  *
  * Register with sysfs a set of independent access ranges for @disk.
  * If @new_iars is not NULL, this set of ranges is registered and the old set
- * specified by q->ia_ranges is unregistered. Otherwise, q->ia_ranges is
+ * specified by disk->ia_ranges is unregistered. Otherwise, disk->ia_ranges is
  * registered if it is not already.
  */
 int disk_register_independent_access_ranges(struct gendisk *disk,
@@ -121,12 +121,12 @@ int disk_register_independent_access_ranges(struct gendisk *disk,
 
 	/* If a new range set is specified, unregister the old one */
 	if (new_iars) {
-		if (q->ia_ranges)
+		if (disk->ia_ranges)
 			disk_unregister_independent_access_ranges(disk);
-		q->ia_ranges = new_iars;
+		disk->ia_ranges = new_iars;
 	}
 
-	iars = q->ia_ranges;
+	iars = disk->ia_ranges;
 	if (!iars)
 		return 0;
 
@@ -138,7 +138,7 @@ int disk_register_independent_access_ranges(struct gendisk *disk,
 	ret = kobject_init_and_add(&iars->kobj, &blk_ia_ranges_ktype,
 				   &q->kobj, "%s", "independent_access_ranges");
 	if (ret) {
-		q->ia_ranges = NULL;
+		disk->ia_ranges = NULL;
 		kobject_put(&iars->kobj);
 		return ret;
 	}
@@ -164,7 +164,7 @@ int disk_register_independent_access_ranges(struct gendisk *disk,
 void disk_unregister_independent_access_ranges(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
-	struct blk_independent_access_ranges *iars = q->ia_ranges;
+	struct blk_independent_access_ranges *iars = disk->ia_ranges;
 	int i;
 
 	lockdep_assert_held(&q->sysfs_dir_lock);
@@ -182,7 +182,7 @@ void disk_unregister_independent_access_ranges(struct gendisk *disk)
 		kfree(iars);
 	}
 
-	q->ia_ranges = NULL;
+	disk->ia_ranges = NULL;
 }
 
 static struct blk_independent_access_range *
@@ -242,7 +242,7 @@ static bool disk_check_ia_ranges(struct gendisk *disk,
 static bool disk_ia_ranges_changed(struct gendisk *disk,
 				   struct blk_independent_access_ranges *new)
 {
-	struct blk_independent_access_ranges *old = disk->queue->ia_ranges;
+	struct blk_independent_access_ranges *old = disk->ia_ranges;
 	int i;
 
 	if (!old)
@@ -331,7 +331,7 @@ reg:
 	if (blk_queue_registered(q)) {
 		disk_register_independent_access_ranges(disk, iars);
 	} else {
-		swap(q->ia_ranges, iars);
+		swap(disk->ia_ranges, iars);
 		kfree(iars);
 	}
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 22b12531aeb7..b9a94c53c6cd 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -171,6 +171,12 @@ struct gendisk {
 	struct badblocks *bb;
 	struct lockdep_map lockdep_map;
 	u64 diskseq;
+
+	/*
+	 * Independent sector access ranges. This is always NULL for
+	 * devices that do not have multiple independent access ranges.
+	 */
+	struct blk_independent_access_ranges *ia_ranges;
 };
 
 static inline bool disk_live(struct gendisk *disk)
@@ -539,12 +545,6 @@ struct request_queue {
 
 	bool			mq_sysfs_init_done;
 
-	/*
-	 * Independent sector access ranges. This is always NULL for
-	 * devices that do not have multiple independent access ranges.
-	 */
-	struct blk_independent_access_ranges *ia_ranges;
-
 	/**
 	 * @srcu: Sleepable RCU. Use as lock when type of the request queue
 	 * is blocking (BLK_MQ_F_BLOCKING). Must be the last member
-- 
cgit 


From 7fdc74da940ddf8f2eb0dd1202cbfbfe08342cbb Mon Sep 17 00:00:00 2001
From: Riwen Lu <luriwen@kylinos.cn>
Date: Fri, 17 Jun 2022 10:51:51 +0800
Subject: ACPI: processor: Split out thermal initialization from ACPI PSS

Commit 239708a3af44 ("ACPI: Split out ACPI PSS from ACPI Processor
driver"), moves processor thermal registration to acpi_pss_perf_init(),
which doesn't get executed if ACPI_CPU_FREQ_PSS is not enabled.

As ARM64 supports P-states using CPPC, it should be possible to also
support processor passive cooling even if PSS is not enabled. Split
out the processor thermal cooling register from ACPI PSS to support
this, and move it into a separate function in processor_thermal.c.

Signed-off-by: Riwen Lu <luriwen@kylinos.cn>
Reviewed-by: Punit Agrawal <punit.agrawal@bytedance.com>
[ rjw: Subject edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/Kconfig             |  2 +-
 drivers/acpi/Makefile            |  5 ++-
 drivers/acpi/processor_driver.c  | 72 +++++-----------------------------------
 drivers/acpi/processor_thermal.c | 54 ++++++++++++++++++++++++++++++
 include/acpi/processor.h         |  8 +++--
 5 files changed, 71 insertions(+), 70 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index 1e34f846508f..2457ade3f82d 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -255,7 +255,6 @@ config ACPI_DOCK
 
 config ACPI_CPU_FREQ_PSS
 	bool
-	select THERMAL
 
 config ACPI_PROCESSOR_CSTATE
 	def_bool y
@@ -287,6 +286,7 @@ config ACPI_PROCESSOR
 	depends on X86 || IA64 || ARM64 || LOONGARCH
 	select ACPI_PROCESSOR_IDLE
 	select ACPI_CPU_FREQ_PSS if X86 || IA64 || LOONGARCH
+	select THERMAL
 	default y
 	help
 	  This driver adds support for the ACPI Processor package. It is required
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index b5a8d3e00a52..0002eecbf870 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -109,10 +109,9 @@ obj-$(CONFIG_ACPI_PPTT) 	+= pptt.o
 obj-$(CONFIG_ACPI_PFRUT)	+= pfr_update.o pfr_telemetry.o
 
 # processor has its own "processor." module_param namespace
-processor-y			:= processor_driver.o
+processor-y			:= processor_driver.o processor_thermal.o
 processor-$(CONFIG_ACPI_PROCESSOR_IDLE) += processor_idle.o
-processor-$(CONFIG_ACPI_CPU_FREQ_PSS)	+= processor_throttling.o	\
-	processor_thermal.o
+processor-$(CONFIG_ACPI_CPU_FREQ_PSS)	+= processor_throttling.o
 processor-$(CONFIG_CPU_FREQ)	+= processor_perflib.o
 
 obj-$(CONFIG_ACPI_PROCESSOR_AGGREGATOR) += acpi_pad.o
diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c
index 368a9edefd0c..1278969eec1f 100644
--- a/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@ -139,75 +139,17 @@ static int acpi_soft_cpu_dead(unsigned int cpu)
 }
 
 #ifdef CONFIG_ACPI_CPU_FREQ_PSS
-static int acpi_pss_perf_init(struct acpi_processor *pr,
-		struct acpi_device *device)
+static void acpi_pss_perf_init(struct acpi_processor *pr)
 {
-	int result = 0;
-
 	acpi_processor_ppc_has_changed(pr, 0);
 
 	acpi_processor_get_throttling_info(pr);
 
 	if (pr->flags.throttling)
 		pr->flags.limit = 1;
-
-	pr->cdev = thermal_cooling_device_register("Processor", device,
-						   &processor_cooling_ops);
-	if (IS_ERR(pr->cdev)) {
-		result = PTR_ERR(pr->cdev);
-		return result;
-	}
-
-	dev_dbg(&device->dev, "registered as cooling_device%d\n",
-		pr->cdev->id);
-
-	result = sysfs_create_link(&device->dev.kobj,
-				   &pr->cdev->device.kobj,
-				   "thermal_cooling");
-	if (result) {
-		dev_err(&device->dev,
-			"Failed to create sysfs link 'thermal_cooling'\n");
-		goto err_thermal_unregister;
-	}
-
-	result = sysfs_create_link(&pr->cdev->device.kobj,
-				   &device->dev.kobj,
-				   "device");
-	if (result) {
-		dev_err(&pr->cdev->device,
-			"Failed to create sysfs link 'device'\n");
-		goto err_remove_sysfs_thermal;
-	}
-
-	return 0;
-
- err_remove_sysfs_thermal:
-	sysfs_remove_link(&device->dev.kobj, "thermal_cooling");
- err_thermal_unregister:
-	thermal_cooling_device_unregister(pr->cdev);
-
-	return result;
-}
-
-static void acpi_pss_perf_exit(struct acpi_processor *pr,
-		struct acpi_device *device)
-{
-	if (pr->cdev) {
-		sysfs_remove_link(&device->dev.kobj, "thermal_cooling");
-		sysfs_remove_link(&pr->cdev->device.kobj, "device");
-		thermal_cooling_device_unregister(pr->cdev);
-		pr->cdev = NULL;
-	}
 }
 #else
-static inline int acpi_pss_perf_init(struct acpi_processor *pr,
-		struct acpi_device *device)
-{
-	return 0;
-}
-
-static inline void acpi_pss_perf_exit(struct acpi_processor *pr,
-		struct acpi_device *device) {}
+static inline void acpi_pss_perf_init(struct acpi_processor *pr) {}
 #endif /* CONFIG_ACPI_CPU_FREQ_PSS */
 
 static int __acpi_processor_start(struct acpi_device *device)
@@ -229,7 +171,9 @@ static int __acpi_processor_start(struct acpi_device *device)
 	if (!cpuidle_get_driver() || cpuidle_get_driver() == &acpi_idle_driver)
 		acpi_processor_power_init(pr);
 
-	result = acpi_pss_perf_init(pr, device);
+	acpi_pss_perf_init(pr);
+
+	result = acpi_processor_thermal_init(pr, device);
 	if (result)
 		goto err_power_exit;
 
@@ -239,7 +183,7 @@ static int __acpi_processor_start(struct acpi_device *device)
 		return 0;
 
 	result = -ENODEV;
-	acpi_pss_perf_exit(pr, device);
+	acpi_processor_thermal_exit(pr, device);
 
 err_power_exit:
 	acpi_processor_power_exit(pr);
@@ -277,10 +221,10 @@ static int acpi_processor_stop(struct device *dev)
 		return 0;
 	acpi_processor_power_exit(pr);
 
-	acpi_pss_perf_exit(pr, device);
-
 	acpi_cppc_processor_exit(pr);
 
+	acpi_processor_thermal_exit(pr, device);
+
 	return 0;
 }
 
diff --git a/drivers/acpi/processor_thermal.c b/drivers/acpi/processor_thermal.c
index d8b2dfcd59b5..db6ac540e924 100644
--- a/drivers/acpi/processor_thermal.c
+++ b/drivers/acpi/processor_thermal.c
@@ -266,3 +266,57 @@ const struct thermal_cooling_device_ops processor_cooling_ops = {
 	.get_cur_state = processor_get_cur_state,
 	.set_cur_state = processor_set_cur_state,
 };
+
+int acpi_processor_thermal_init(struct acpi_processor *pr,
+				struct acpi_device *device)
+{
+	int result = 0;
+
+	pr->cdev = thermal_cooling_device_register("Processor", device,
+						   &processor_cooling_ops);
+	if (IS_ERR(pr->cdev)) {
+		result = PTR_ERR(pr->cdev);
+		return result;
+	}
+
+	dev_dbg(&device->dev, "registered as cooling_device%d\n",
+		pr->cdev->id);
+
+	result = sysfs_create_link(&device->dev.kobj,
+				   &pr->cdev->device.kobj,
+				   "thermal_cooling");
+	if (result) {
+		dev_err(&device->dev,
+			"Failed to create sysfs link 'thermal_cooling'\n");
+		goto err_thermal_unregister;
+	}
+
+	result = sysfs_create_link(&pr->cdev->device.kobj,
+				   &device->dev.kobj,
+				   "device");
+	if (result) {
+		dev_err(&pr->cdev->device,
+			"Failed to create sysfs link 'device'\n");
+		goto err_remove_sysfs_thermal;
+	}
+
+	return 0;
+
+err_remove_sysfs_thermal:
+	sysfs_remove_link(&device->dev.kobj, "thermal_cooling");
+err_thermal_unregister:
+	thermal_cooling_device_unregister(pr->cdev);
+
+	return result;
+}
+
+void acpi_processor_thermal_exit(struct acpi_processor *pr,
+				 struct acpi_device *device)
+{
+	if (pr->cdev) {
+		sysfs_remove_link(&device->dev.kobj, "thermal_cooling");
+		sysfs_remove_link(&pr->cdev->device.kobj, "device");
+		thermal_cooling_device_unregister(pr->cdev);
+		pr->cdev = NULL;
+	}
+}
diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index 194027371928..ba1e3ed98d3d 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -442,8 +442,12 @@ static inline int acpi_processor_hotplug(struct acpi_processor *pr)
 
 /* in processor_thermal.c */
 int acpi_processor_get_limit_info(struct acpi_processor *pr);
+int acpi_processor_thermal_init(struct acpi_processor *pr,
+				struct acpi_device *device);
+void acpi_processor_thermal_exit(struct acpi_processor *pr,
+				 struct acpi_device *device);
 extern const struct thermal_cooling_device_ops processor_cooling_ops;
-#if defined(CONFIG_ACPI_CPU_FREQ_PSS) & defined(CONFIG_CPU_FREQ)
+#ifdef CONFIG_CPU_FREQ
 void acpi_thermal_cpufreq_init(struct cpufreq_policy *policy);
 void acpi_thermal_cpufreq_exit(struct cpufreq_policy *policy);
 #else
@@ -455,6 +459,6 @@ static inline void acpi_thermal_cpufreq_exit(struct cpufreq_policy *policy)
 {
 	return;
 }
-#endif	/* CONFIG_ACPI_CPU_FREQ_PSS */
+#endif	/* CONFIG_CPU_FREQ */
 
 #endif
-- 
cgit 


From e414207d12f38bc42222037dbcdbe6b4649418d6 Mon Sep 17 00:00:00 2001
From: Riwen Lu <luriwen@kylinos.cn>
Date: Fri, 17 Jun 2022 10:51:52 +0800
Subject: ACPI: processor: Drop leftover acpi_processor_get_limit_info()
 declaration

Commit 22e7551eb6fd ("ACPI / processor: Remove acpi_processor_get_limit_info()"),
left it behind, so drop it.

Signed-off-by: Riwen Lu <luriwen@kylinos.cn>
Reviewed-by: Punit Agrawal <punit.agrawal@bytedance.com>
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/processor.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index ba1e3ed98d3d..9fa49686957a 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -441,7 +441,6 @@ static inline int acpi_processor_hotplug(struct acpi_processor *pr)
 #endif /* CONFIG_ACPI_PROCESSOR_IDLE */
 
 /* in processor_thermal.c */
-int acpi_processor_get_limit_info(struct acpi_processor *pr);
 int acpi_processor_thermal_init(struct acpi_processor *pr,
 				struct acpi_device *device);
 void acpi_processor_thermal_exit(struct acpi_processor *pr,
-- 
cgit 


From 57b8b2113e2056d72dbe1d6123950f145c874168 Mon Sep 17 00:00:00 2001
From: Yassine Oudjana <y.oudjana@protonmail.com>
Date: Wed, 22 Jun 2022 20:13:20 +0400
Subject: ASoC: dt-bindings: Add bindings for WCD9335 DAIs

Add bindings for the DAIs available in WCD9335 to avoid
having to use unclear number indices in device trees.

Signed-off-by: Yassine Oudjana <y.oudjana@protonmail.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20220622161322.168017-2-y.oudjana@protonmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 MAINTAINERS                              |  1 +
 include/dt-bindings/sound/qcom,wcd9335.h | 15 +++++++++++++++
 2 files changed, 16 insertions(+)
 create mode 100644 include/dt-bindings/sound/qcom,wcd9335.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index c4648e86dc14..e9d5b052cd41 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16250,6 +16250,7 @@ M:	Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
 M:	Banajit Goswami <bgoswami@quicinc.com>
 L:	alsa-devel@alsa-project.org (moderated for non-subscribers)
 S:	Supported
+F:	include/dt-bindings/sound/qcom,wcd9335.h
 F:	sound/soc/codecs/lpass-rx-macro.*
 F:	sound/soc/codecs/lpass-tx-macro.*
 F:	sound/soc/codecs/lpass-va-macro.c
diff --git a/include/dt-bindings/sound/qcom,wcd9335.h b/include/dt-bindings/sound/qcom,wcd9335.h
new file mode 100644
index 000000000000..f5e9f1db091e
--- /dev/null
+++ b/include/dt-bindings/sound/qcom,wcd9335.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+
+#ifndef __DT_SOUND_QCOM_WCD9335_H
+#define __DT_SOUND_QCOM_WCD9335_H
+
+#define AIF1_PB                 0
+#define AIF1_CAP                1
+#define AIF2_PB                 2
+#define AIF2_CAP                3
+#define AIF3_PB                 4
+#define AIF3_CAP                5
+#define AIF4_PB                 6
+#define NUM_CODEC_DAIS          7
+
+#endif
-- 
cgit 


From 445cbd219ac3b8f451153c210aaf97adcbf4bd02 Mon Sep 17 00:00:00 2001
From: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Date: Thu, 23 Jun 2022 22:14:09 +0100
Subject: regmap-irq: Convert bool bitfields to unsigned int

Use 'unsigned int' for bitfields for consistency with most other
kernel code.

Signed-off-by: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Link: https://lore.kernel.org/r/20220623211420.918875-2-aidanmacdonald.0x0@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap-irq.c |  2 +-
 include/linux/regmap.h           | 26 +++++++++++++-------------
 2 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c
index a6db605707b0..a58b29e9c7c7 100644
--- a/drivers/base/regmap/regmap-irq.c
+++ b/drivers/base/regmap/regmap-irq.c
@@ -43,7 +43,7 @@ struct regmap_irq_chip_data {
 	unsigned int irq_reg_stride;
 	unsigned int type_reg_stride;
 
-	bool clear_status:1;
+	unsigned int clear_status:1;
 };
 
 static int sub_irq_reg(struct regmap_irq_chip_data *data,
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index d5b08f4f0dc0..bcbc5d4ffd30 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -1534,19 +1534,19 @@ struct regmap_irq_chip {
 	unsigned int type_base;
 	unsigned int *virt_reg_base;
 	unsigned int irq_reg_stride;
-	bool mask_writeonly:1;
-	bool init_ack_masked:1;
-	bool mask_invert:1;
-	bool use_ack:1;
-	bool ack_invert:1;
-	bool clear_ack:1;
-	bool wake_invert:1;
-	bool runtime_pm:1;
-	bool type_invert:1;
-	bool type_in_mask:1;
-	bool clear_on_unmask:1;
-	bool not_fixed_stride:1;
-	bool status_invert:1;
+	unsigned int mask_writeonly:1;
+	unsigned int init_ack_masked:1;
+	unsigned int mask_invert:1;
+	unsigned int use_ack:1;
+	unsigned int ack_invert:1;
+	unsigned int clear_ack:1;
+	unsigned int wake_invert:1;
+	unsigned int runtime_pm:1;
+	unsigned int type_invert:1;
+	unsigned int type_in_mask:1;
+	unsigned int clear_on_unmask:1;
+	unsigned int not_fixed_stride:1;
+	unsigned int status_invert:1;
 
 	int num_regs;
 
-- 
cgit 


From 53a1a16dcc972163bd5816192d5d63ae433c9e56 Mon Sep 17 00:00:00 2001
From: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Date: Thu, 23 Jun 2022 22:14:10 +0100
Subject: regmap-irq: Remove unused type_reg_stride field

It appears that no chip ever required a nonzero type_reg_stride
and commit 1066cfbdfa3f ("regmap-irq: Extend sub-irq to support
non-fixed reg strides") broke support. Just remove the field.

Signed-off-by: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Link: https://lore.kernel.org/r/20220623211420.918875-3-aidanmacdonald.0x0@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap-irq.c | 6 ------
 include/linux/regmap.h           | 3 ---
 2 files changed, 9 deletions(-)

(limited to 'include')

diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c
index a58b29e9c7c7..475a959e2b8b 100644
--- a/drivers/base/regmap/regmap-irq.c
+++ b/drivers/base/regmap/regmap-irq.c
@@ -41,7 +41,6 @@ struct regmap_irq_chip_data {
 	unsigned int **virt_buf;
 
 	unsigned int irq_reg_stride;
-	unsigned int type_reg_stride;
 
 	unsigned int clear_status:1;
 };
@@ -743,11 +742,6 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 	else
 		d->irq_reg_stride = 1;
 
-	if (chip->type_reg_stride)
-		d->type_reg_stride = chip->type_reg_stride;
-	else
-		d->type_reg_stride = 1;
-
 	if (!map->use_single_read && map->reg_stride == 1 &&
 	    d->irq_reg_stride == 1) {
 		d->status_reg_buf = kmalloc_array(chip->num_regs,
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index bcbc5d4ffd30..58db2206f193 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -1503,8 +1503,6 @@ struct regmap_irq_sub_irq_map {
  * @num_type_reg:    Number of type registers.
  * @num_virt_regs:   Number of non-standard irq configuration registers.
  *		     If zero unsupported.
- * @type_reg_stride: Stride to use for chips where type registers are not
- *			contiguous.
  * @handle_pre_irq:  Driver specific callback to handle interrupt from device
  *		     before regmap_irq_handler process the interrupts.
  * @handle_post_irq: Driver specific callback to handle interrupt from device
@@ -1555,7 +1553,6 @@ struct regmap_irq_chip {
 
 	int num_type_reg;
 	int num_virt_regs;
-	unsigned int type_reg_stride;
 
 	int (*handle_pre_irq)(void *irq_drv_data);
 	int (*handle_post_irq)(void *irq_drv_data);
-- 
cgit 


From 610fdd668e6af48fcae7908161d14eee3a95ec92 Mon Sep 17 00:00:00 2001
From: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Date: Thu, 23 Jun 2022 22:14:12 +0100
Subject: regmap-irq: Remove an unnecessary restriction on type_in_mask

Check types_supported instead of checking type_rising/falling_val
when using type_in_mask interrupts. This makes the intent clearer
and allows a type_in_mask irq to support level or edge triggers,
rather than only edge triggers.

Update the documentation and comments to reflect the new behavior.

This shouldn't affect existing drivers, because if they didn't
set types_supported properly the type buffer wouldn't be updated.

Signed-off-by: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Link: https://lore.kernel.org/r/20220623211420.918875-5-aidanmacdonald.0x0@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap-irq.c | 19 ++++++++-----------
 include/linux/regmap.h           |  8 +++++---
 2 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c
index dca27b4e29d3..fd7c4315d16b 100644
--- a/drivers/base/regmap/regmap-irq.c
+++ b/drivers/base/regmap/regmap-irq.c
@@ -252,22 +252,19 @@ static void regmap_irq_enable(struct irq_data *data)
 	struct regmap *map = d->map;
 	const struct regmap_irq *irq_data = irq_to_regmap_irq(d, data->hwirq);
 	unsigned int reg = irq_data->reg_offset / map->reg_stride;
-	unsigned int mask, type;
-
-	type = irq_data->type.type_falling_val | irq_data->type.type_rising_val;
+	unsigned int mask;
 
 	/*
 	 * The type_in_mask flag means that the underlying hardware uses
-	 * separate mask bits for rising and falling edge interrupts, but
-	 * we want to make them into a single virtual interrupt with
-	 * configurable edge.
+	 * separate mask bits for each interrupt trigger type, but we want
+	 * to have a single logical interrupt with a configurable type.
 	 *
-	 * If the interrupt we're enabling defines the falling or rising
-	 * masks then instead of using the regular mask bits for this
-	 * interrupt, use the value previously written to the type buffer
-	 * at the corresponding offset in regmap_irq_set_type().
+	 * If the interrupt we're enabling defines any supported types
+	 * then instead of using the regular mask bits for this interrupt,
+	 * use the value previously written to the type buffer at the
+	 * corresponding offset in regmap_irq_set_type().
 	 */
-	if (d->chip->type_in_mask && type)
+	if (d->chip->type_in_mask && irq_data->type.types_supported)
 		mask = d->type_buf[reg] & irq_data->mask;
 	else
 		mask = irq_data->mask;
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 58db2206f193..b404c59b3b20 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -1484,9 +1484,11 @@ struct regmap_irq_sub_irq_map {
  * @clear_ack:  Use this to set 1 and 0 or vice-versa to clear interrupts.
  * @wake_invert: Inverted wake register: cleared bits are wake enabled.
  * @type_invert: Invert the type flags.
- * @type_in_mask: Use the mask registers for controlling irq type. For
- *                interrupts defining type_rising/falling_mask use mask_base
- *                for edge configuration and never update bits in type_base.
+ * @type_in_mask: Use the mask registers for controlling irq type. Use this if
+ *		  the hardware provides separate bits for rising/falling edge
+ *		  or low/high level interrupts and they should be combined into
+ *		  a single logical interrupt. Use &struct regmap_irq_type data
+ *		  to define the mask bit for each irq type.
  * @clear_on_unmask: For chips with interrupts cleared on read: read the status
  *                   registers before unmasking interrupts to clear any bits
  *                   set when they were masked.
-- 
cgit 


From ad22b3e98f9430896bd4bd8f4fbff4667f02a0c8 Mon Sep 17 00:00:00 2001
From: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Date: Thu, 23 Jun 2022 22:14:14 +0100
Subject: regmap-irq: Remove mask_writeonly and regmap_irq_update_bits()

Commit a71411dbf6c8 ("regmap: irq: add chip option mask_writeonly")
introduced the mask_writeonly option, but it isn't used now and it
appears it's never been used by any in-tree drivers. The motivation
for the option is mentioned in the commit message,

    Some irq controllers have writeonly/multipurpose register
    layouts. In those cases we read invalid data back. [...]

The option causes mask register updates to use regmap_write_bits()
instead of regmap_update_bits().

However, regmap_write_bits() doesn't solve the reading invalid data
problem. It's still a read-modify-write op like regmap_update_bits().
The difference is that 'update bits' will only write the new value
if it is different from the current value, while 'write bits' will
write the new value unconditionally, even if it's the same as the
current value.

This seems like a bit of a specialized use case and probably isn't
that useful for regmap-irq, so let's just remove the option and go
back to using an 'update bits' op for the mask registers. We can
always add the option back if some driver ends up needing it in the
future.

Signed-off-by: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Link: https://lore.kernel.org/r/20220623211420.918875-7-aidanmacdonald.0x0@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap-irq.c | 24 +++++++-----------------
 include/linux/regmap.h           |  2 --
 2 files changed, 7 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c
index cb20ce6f91e7..7e93dd8af56b 100644
--- a/drivers/base/regmap/regmap-irq.c
+++ b/drivers/base/regmap/regmap-irq.c
@@ -80,16 +80,6 @@ static void regmap_irq_lock(struct irq_data *data)
 	mutex_lock(&d->lock);
 }
 
-static int regmap_irq_update_bits(struct regmap_irq_chip_data *d,
-				  unsigned int reg, unsigned int mask,
-				  unsigned int val)
-{
-	if (d->chip->mask_writeonly)
-		return regmap_write_bits(d->map, reg, mask, val);
-	else
-		return regmap_update_bits(d->map, reg, mask, val);
-}
-
 static void regmap_irq_sync_unlock(struct irq_data *data)
 {
 	struct regmap_irq_chip_data *d = irq_data_get_irq_chip_data(data);
@@ -130,11 +120,11 @@ static void regmap_irq_sync_unlock(struct irq_data *data)
 
 		reg = sub_irq_reg(d, d->chip->mask_base, i);
 		if (d->chip->mask_invert) {
-			ret = regmap_irq_update_bits(d, reg,
+			ret = regmap_update_bits(d->map, reg,
 					 d->mask_buf_def[i], ~d->mask_buf[i]);
 		} else if (d->chip->unmask_base) {
 			/* set mask with mask_base register */
-			ret = regmap_irq_update_bits(d, reg,
+			ret = regmap_update_bits(d->map, reg,
 					d->mask_buf_def[i], ~d->mask_buf[i]);
 			if (ret < 0)
 				dev_err(d->map->dev,
@@ -143,12 +133,12 @@ static void regmap_irq_sync_unlock(struct irq_data *data)
 			unmask_offset = d->chip->unmask_base -
 							d->chip->mask_base;
 			/* clear mask with unmask_base register */
-			ret = regmap_irq_update_bits(d,
+			ret = regmap_update_bits(d->map,
 					reg + unmask_offset,
 					d->mask_buf_def[i],
 					d->mask_buf[i]);
 		} else {
-			ret = regmap_irq_update_bits(d, reg,
+			ret = regmap_update_bits(d->map, reg,
 					 d->mask_buf_def[i], d->mask_buf[i]);
 		}
 		if (ret != 0)
@@ -763,17 +753,17 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 		reg = sub_irq_reg(d, d->chip->mask_base, i);
 
 		if (chip->mask_invert)
-			ret = regmap_irq_update_bits(d, reg,
+			ret = regmap_update_bits(d->map, reg,
 					 d->mask_buf[i], ~d->mask_buf[i]);
 		else if (d->chip->unmask_base) {
 			unmask_offset = d->chip->unmask_base -
 					d->chip->mask_base;
-			ret = regmap_irq_update_bits(d,
+			ret = regmap_update_bits(d->map,
 					reg + unmask_offset,
 					d->mask_buf[i],
 					d->mask_buf[i]);
 		} else
-			ret = regmap_irq_update_bits(d, reg,
+			ret = regmap_update_bits(d->map, reg,
 					 d->mask_buf[i], d->mask_buf[i]);
 		if (ret != 0) {
 			dev_err(map->dev, "Failed to set masks in 0x%x: %d\n",
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index b404c59b3b20..6489b3610362 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -1468,7 +1468,6 @@ struct regmap_irq_sub_irq_map {
  *
  * @status_base: Base status register address.
  * @mask_base:   Base mask register address.
- * @mask_writeonly: Base mask register is write only.
  * @unmask_base:  Base unmask register address. for chips who have
  *                separate mask and unmask registers
  * @ack_base:    Base ack address. If zero then the chip is clear on read.
@@ -1534,7 +1533,6 @@ struct regmap_irq_chip {
 	unsigned int type_base;
 	unsigned int *virt_reg_base;
 	unsigned int irq_reg_stride;
-	unsigned int mask_writeonly:1;
 	unsigned int init_ack_masked:1;
 	unsigned int mask_invert:1;
 	unsigned int use_ack:1;
-- 
cgit 


From faa87ce9196dbb074d75bd4aecb8bacf18f19b4e Mon Sep 17 00:00:00 2001
From: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Date: Thu, 23 Jun 2022 22:14:16 +0100
Subject: regmap-irq: Introduce config registers for irq types

Config registers provide a more uniform approach to handling irq type
registers. They are essentially an extension of the virtual registers
used by the qcom-pm8008 driver.

Config registers can be represented as a 2D array:

    config_base[0]      reg0,0      reg0,1      reg0,2      reg0,3
    config_base[1]      reg1,0      reg1,1      reg1,2      reg1,3
    config_base[2]      reg2,0      reg2,1      reg2,2      reg2,3

There are 'num_config_bases' base registers, each of which is used to
address 'num_config_regs' registers. The addresses are calculated in
the same way as for other bases. It is assumed that an irq's type is
controlled by one column of registers; that column is identified by
the irq's 'type_reg_offset'.

The set_type_config() callback is responsible for updating the config
register contents. It receives an array of buffers (each represents a
row of registers) and the index of the column to update, along with
the 'struct regmap_irq' description and requested irq type.

Buffered values are written to registers in regmap_irq_sync_unlock().
Note that the entire register contents are overwritten, which is a
minor change in behavior from type registers via 'type_base'.

Signed-off-by: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Link: https://lore.kernel.org/r/20220623211420.918875-9-aidanmacdonald.0x0@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap-irq.c | 115 +++++++++++++++++++++++++++++++++++++--
 include/linux/regmap.h           |  12 ++++
 2 files changed, 122 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c
index 5f9a5856c45e..e3dbf55a561f 100644
--- a/drivers/base/regmap/regmap-irq.c
+++ b/drivers/base/regmap/regmap-irq.c
@@ -39,6 +39,7 @@ struct regmap_irq_chip_data {
 	unsigned int *type_buf;
 	unsigned int *type_buf_def;
 	unsigned int **virt_buf;
+	unsigned int **config_buf;
 
 	unsigned int irq_reg_stride;
 
@@ -228,6 +229,17 @@ static void regmap_irq_sync_unlock(struct irq_data *data)
 		}
 	}
 
+	for (i = 0; i < d->chip->num_config_bases; i++) {
+		for (j = 0; j < d->chip->num_config_regs; j++) {
+			reg = sub_irq_reg(d, d->chip->config_base[i], j);
+			ret = regmap_write(map, reg, d->config_buf[i][j]);
+			if (ret)
+				dev_err(d->map->dev,
+					"Failed to write config %x: %d\n",
+					reg, ret);
+		}
+	}
+
 	if (d->chip->runtime_pm)
 		pm_runtime_put(map->dev);
 
@@ -287,7 +299,7 @@ static int regmap_irq_set_type(struct irq_data *data, unsigned int type)
 	struct regmap_irq_chip_data *d = irq_data_get_irq_chip_data(data);
 	struct regmap *map = d->map;
 	const struct regmap_irq *irq_data = irq_to_regmap_irq(d, data->hwirq);
-	int reg;
+	int reg, ret;
 	const struct regmap_irq_type *t = &irq_data->type;
 
 	if ((t->types_supported & type) != type)
@@ -327,9 +339,19 @@ static int regmap_irq_set_type(struct irq_data *data, unsigned int type)
 		return -EINVAL;
 	}
 
-	if (d->chip->set_type_virt)
-		return d->chip->set_type_virt(d->virt_buf, type, data->hwirq,
-					      reg);
+	if (d->chip->set_type_virt) {
+		ret = d->chip->set_type_virt(d->virt_buf, type, data->hwirq,
+					     reg);
+		if (ret)
+			return ret;
+	}
+
+	if (d->chip->set_type_config) {
+		ret = d->chip->set_type_config(d->config_buf, type,
+					       irq_data, reg);
+		if (ret)
+			return ret;
+	}
 
 	return 0;
 }
@@ -599,6 +621,61 @@ static const struct irq_domain_ops regmap_domain_ops = {
 	.xlate	= irq_domain_xlate_onetwocell,
 };
 
+/**
+ * regmap_irq_set_type_config_simple() - Simple IRQ type configuration callback.
+ * @buf: Buffer containing configuration register values, this is a 2D array of
+ *       `num_config_bases` rows, each of `num_config_regs` elements.
+ * @type: The requested IRQ type.
+ * @irq_data: The IRQ being configured.
+ * @idx: Index of the irq's config registers within each array `buf[i]`
+ *
+ * This is a &struct regmap_irq_chip->set_type_config callback suitable for
+ * chips with one config register. Register values are updated according to
+ * the &struct regmap_irq_type data associated with an IRQ.
+ */
+int regmap_irq_set_type_config_simple(unsigned int **buf, unsigned int type,
+				      const struct regmap_irq *irq_data, int idx)
+{
+	const struct regmap_irq_type *t = &irq_data->type;
+
+	if (t->type_reg_mask)
+		buf[0][idx] &= ~t->type_reg_mask;
+	else
+		buf[0][idx] &= ~(t->type_falling_val |
+				 t->type_rising_val |
+				 t->type_level_low_val |
+				 t->type_level_high_val);
+
+	switch (type) {
+	case IRQ_TYPE_EDGE_FALLING:
+		buf[0][idx] |= t->type_falling_val;
+		break;
+
+	case IRQ_TYPE_EDGE_RISING:
+		buf[0][idx] |= t->type_rising_val;
+		break;
+
+	case IRQ_TYPE_EDGE_BOTH:
+		buf[0][idx] |= (t->type_falling_val |
+				t->type_rising_val);
+		break;
+
+	case IRQ_TYPE_LEVEL_HIGH:
+		buf[0][idx] |= t->type_level_high_val;
+		break;
+
+	case IRQ_TYPE_LEVEL_LOW:
+		buf[0][idx] |= t->type_level_low_val;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(regmap_irq_set_type_config_simple);
+
 /**
  * regmap_add_irq_chip_fwnode() - Use standard regmap IRQ controller handling
  *
@@ -724,6 +801,24 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 		}
 	}
 
+	if (chip->num_config_bases && chip->num_config_regs) {
+		/*
+		 * Create config_buf[num_config_bases][num_config_regs]
+		 */
+		d->config_buf = kcalloc(chip->num_config_bases,
+					sizeof(*d->config_buf), GFP_KERNEL);
+		if (!d->config_buf)
+			goto err_alloc;
+
+		for (i = 0; i < chip->num_config_regs; i++) {
+			d->config_buf[i] = kcalloc(chip->num_config_regs,
+						   sizeof(**d->config_buf),
+						   GFP_KERNEL);
+			if (!d->config_buf[i])
+				goto err_alloc;
+		}
+	}
+
 	d->irq_chip = regmap_irq_chip;
 	d->irq_chip.name = chip->name;
 	d->irq = irq;
@@ -894,6 +989,11 @@ err_alloc:
 			kfree(d->virt_buf[i]);
 		kfree(d->virt_buf);
 	}
+	if (d->config_buf) {
+		for (i = 0; i < chip->num_config_bases; i++)
+			kfree(d->config_buf[i]);
+		kfree(d->config_buf);
+	}
 	kfree(d);
 	return ret;
 }
@@ -934,7 +1034,7 @@ EXPORT_SYMBOL_GPL(regmap_add_irq_chip);
 void regmap_del_irq_chip(int irq, struct regmap_irq_chip_data *d)
 {
 	unsigned int virq;
-	int hwirq;
+	int i, hwirq;
 
 	if (!d)
 		return;
@@ -964,6 +1064,11 @@ void regmap_del_irq_chip(int irq, struct regmap_irq_chip_data *d)
 	kfree(d->mask_buf);
 	kfree(d->status_reg_buf);
 	kfree(d->status_buf);
+	if (d->config_buf) {
+		for (i = 0; i < d->chip->num_config_bases; i++)
+			kfree(d->config_buf[i]);
+		kfree(d->config_buf);
+	}
 	kfree(d);
 }
 EXPORT_SYMBOL_GPL(regmap_del_irq_chip);
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 6489b3610362..791c15ad1f63 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -1475,6 +1475,7 @@ struct regmap_irq_sub_irq_map {
  * @wake_base:   Base address for wake enables.  If zero unsupported.
  * @type_base:   Base address for irq type.  If zero unsupported.
  * @virt_reg_base:   Base addresses for extra config regs.
+ * @config_base: Base address for IRQ type config regs. If null unsupported.
  * @irq_reg_stride:  Stride to use for chips where registers are not contiguous.
  * @init_ack_masked: Ack all masked interrupts once during initalization.
  * @mask_invert: Inverted mask register: cleared bits are masked out.
@@ -1504,12 +1505,15 @@ struct regmap_irq_sub_irq_map {
  * @num_type_reg:    Number of type registers.
  * @num_virt_regs:   Number of non-standard irq configuration registers.
  *		     If zero unsupported.
+ * @num_config_bases:	Number of config base registers.
+ * @num_config_regs:	Number of config registers for each config base register.
  * @handle_pre_irq:  Driver specific callback to handle interrupt from device
  *		     before regmap_irq_handler process the interrupts.
  * @handle_post_irq: Driver specific callback to handle interrupt from device
  *		     after handling the interrupts in regmap_irq_handler().
  * @set_type_virt:   Driver specific callback to extend regmap_irq_set_type()
  *		     and configure virt regs.
+ * @set_type_config: Callback used for configuring irq types.
  * @irq_drv_data:    Driver specific IRQ data which is passed as parameter when
  *		     driver specific pre/post interrupt handler is called.
  *
@@ -1532,6 +1536,7 @@ struct regmap_irq_chip {
 	unsigned int wake_base;
 	unsigned int type_base;
 	unsigned int *virt_reg_base;
+	const unsigned int *config_base;
 	unsigned int irq_reg_stride;
 	unsigned int init_ack_masked:1;
 	unsigned int mask_invert:1;
@@ -1553,16 +1558,23 @@ struct regmap_irq_chip {
 
 	int num_type_reg;
 	int num_virt_regs;
+	int num_config_bases;
+	int num_config_regs;
 
 	int (*handle_pre_irq)(void *irq_drv_data);
 	int (*handle_post_irq)(void *irq_drv_data);
 	int (*set_type_virt)(unsigned int **buf, unsigned int type,
 			     unsigned long hwirq, int reg);
+	int (*set_type_config)(unsigned int **buf, unsigned int type,
+			       const struct regmap_irq *irq_data, int idx);
 	void *irq_drv_data;
 };
 
 struct regmap_irq_chip_data;
 
+int regmap_irq_set_type_config_simple(unsigned int **buf, unsigned int type,
+				      const struct regmap_irq *irq_data, int idx);
+
 int regmap_add_irq_chip(struct regmap *map, int irq, int irq_flags,
 			int irq_base, const struct regmap_irq_chip *chip,
 			struct regmap_irq_chip_data **data);
-- 
cgit 


From 9edd4f5aee8470dcfd0db04005908f61fbfae8e0 Mon Sep 17 00:00:00 2001
From: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Date: Thu, 23 Jun 2022 22:14:17 +0100
Subject: regmap-irq: Deprecate type registers and virtual registers

Config registers can be used to replace both type and virtual
registers, so mark both features are deprecated and issue a
warning if they're used.

Signed-off-by: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Link: https://lore.kernel.org/r/20220623211420.918875-10-aidanmacdonald.0x0@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap-irq.c |  6 ++++++
 include/linux/regmap.h           | 18 ++++++++++++------
 2 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c
index e3dbf55a561f..8cbc62c3d638 100644
--- a/drivers/base/regmap/regmap-irq.c
+++ b/drivers/base/regmap/regmap-irq.c
@@ -726,6 +726,12 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 				return -EINVAL;
 	}
 
+	if (chip->num_type_reg)
+		dev_warn(map->dev, "type registers are deprecated; use config registers instead");
+
+	if (chip->num_virt_regs || chip->virt_reg_base || chip->set_type_virt)
+		dev_warn(map->dev, "virtual registers are deprecated; use config registers instead");
+
 	if (irq_base) {
 		irq_base = irq_alloc_descs(irq_base, 0, chip->num_irqs, 0);
 		if (irq_base < 0) {
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 791c15ad1f63..c9a9bbbb6261 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -1473,8 +1473,10 @@ struct regmap_irq_sub_irq_map {
  * @ack_base:    Base ack address. If zero then the chip is clear on read.
  *               Using zero value is possible with @use_ack bit.
  * @wake_base:   Base address for wake enables.  If zero unsupported.
- * @type_base:   Base address for irq type.  If zero unsupported.
- * @virt_reg_base:   Base addresses for extra config regs.
+ * @type_base:   Base address for irq type.  If zero unsupported.  Deprecated,
+ *		 use @config_base instead.
+ * @virt_reg_base:   Base addresses for extra config regs. Deprecated, use
+ *		     @config_base instead.
  * @config_base: Base address for IRQ type config regs. If null unsupported.
  * @irq_reg_stride:  Stride to use for chips where registers are not contiguous.
  * @init_ack_masked: Ack all masked interrupts once during initalization.
@@ -1483,7 +1485,8 @@ struct regmap_irq_sub_irq_map {
  * @ack_invert:  Inverted ack register: cleared bits for ack.
  * @clear_ack:  Use this to set 1 and 0 or vice-versa to clear interrupts.
  * @wake_invert: Inverted wake register: cleared bits are wake enabled.
- * @type_invert: Invert the type flags.
+ * @type_invert: Invert the type flags. Deprecated, use config registers
+ *		 instead.
  * @type_in_mask: Use the mask registers for controlling irq type. Use this if
  *		  the hardware provides separate bits for rising/falling edge
  *		  or low/high level interrupts and they should be combined into
@@ -1502,9 +1505,11 @@ struct regmap_irq_sub_irq_map {
  * @irqs:        Descriptors for individual IRQs.  Interrupt numbers are
  *               assigned based on the index in the array of the interrupt.
  * @num_irqs:    Number of descriptors.
- * @num_type_reg:    Number of type registers.
+ * @num_type_reg:    Number of type registers. Deprecated, use config registers
+ *		     instead.
  * @num_virt_regs:   Number of non-standard irq configuration registers.
- *		     If zero unsupported.
+ *		     If zero unsupported. Deprecated, use config registers
+ *		     instead.
  * @num_config_bases:	Number of config base registers.
  * @num_config_regs:	Number of config registers for each config base register.
  * @handle_pre_irq:  Driver specific callback to handle interrupt from device
@@ -1512,7 +1517,8 @@ struct regmap_irq_sub_irq_map {
  * @handle_post_irq: Driver specific callback to handle interrupt from device
  *		     after handling the interrupts in regmap_irq_handler().
  * @set_type_virt:   Driver specific callback to extend regmap_irq_set_type()
- *		     and configure virt regs.
+ *		     and configure virt regs. Deprecated, use @set_type_config
+ *		     callback and config registers instead.
  * @set_type_config: Callback used for configuring irq types.
  * @irq_drv_data:    Driver specific IRQ data which is passed as parameter when
  *		     driver specific pre/post interrupt handler is called.
-- 
cgit 


From e8ffb12e7f065db616a3eba79ff138bececf0825 Mon Sep 17 00:00:00 2001
From: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Date: Thu, 23 Jun 2022 22:14:18 +0100
Subject: regmap-irq: Fix inverted handling of unmask registers

To me "unmask" suggests that we write 1s to the register when
an interrupt is enabled. This also makes sense because it's the
opposite of what the "mask" register does (write 1s to disable
an interrupt).

But regmap-irq does the opposite: for a disabled interrupt, it
writes 1s to "unmask" and 0s to "mask". This is surprising and
deviates from the usual way mask registers are handled.

Additionally, mask_invert didn't interact with unmask registers
properly -- it caused them to be ignored entirely.

Fix this by making mask and unmask registers orthogonal, using
the following behavior:

* Mask registers are written with 1s for disabled interrupts.
* Unmask registers are written with 1s for enabled interrupts.

This behavior supports both normal or inverted mask registers
and separate set/clear registers via different combinations of
mask_base/unmask_base.

The old unmask register behavior is deprecated. Drivers need to
opt-in to the new behavior by setting mask_unmask_non_inverted.
Warnings are issued if the driver relies on deprecated behavior.
Chips that only set one of mask_base/unmask_base don't have to
use the mask_unmask_non_inverted flag because that use case was
previously not supported.

The mask_invert flag is also deprecated in favor of describing
inverted mask registers as unmask registers.

Signed-off-by: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Link: https://lore.kernel.org/r/20220623211420.918875-11-aidanmacdonald.0x0@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap-irq.c | 114 +++++++++++++++++++++++----------------
 include/linux/regmap.h           |  18 +++++--
 2 files changed, 84 insertions(+), 48 deletions(-)

(limited to 'include')

diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c
index 8cbc62c3d638..2c724ae185c4 100644
--- a/drivers/base/regmap/regmap-irq.c
+++ b/drivers/base/regmap/regmap-irq.c
@@ -30,6 +30,9 @@ struct regmap_irq_chip_data {
 	int irq;
 	int wake_count;
 
+	unsigned int mask_base;
+	unsigned int unmask_base;
+
 	void *status_reg_buf;
 	unsigned int *main_status_buf;
 	unsigned int *status_buf;
@@ -95,7 +98,6 @@ static void regmap_irq_sync_unlock(struct irq_data *data)
 	struct regmap *map = d->map;
 	int i, j, ret;
 	u32 reg;
-	u32 unmask_offset;
 	u32 val;
 
 	if (d->chip->runtime_pm) {
@@ -124,35 +126,23 @@ static void regmap_irq_sync_unlock(struct irq_data *data)
 	 * suppress pointless writes.
 	 */
 	for (i = 0; i < d->chip->num_regs; i++) {
-		if (!d->chip->mask_base)
-			continue;
-
-		reg = sub_irq_reg(d, d->chip->mask_base, i);
-		if (d->chip->mask_invert) {
+		if (d->mask_base) {
+			reg = sub_irq_reg(d, d->mask_base, i);
 			ret = regmap_update_bits(d->map, reg,
-					 d->mask_buf_def[i], ~d->mask_buf[i]);
-		} else if (d->chip->unmask_base) {
-			/* set mask with mask_base register */
+					d->mask_buf_def[i], d->mask_buf[i]);
+			if (ret)
+				dev_err(d->map->dev, "Failed to sync masks in %x\n",
+					reg);
+		}
+
+		if (d->unmask_base) {
+			reg = sub_irq_reg(d, d->unmask_base, i);
 			ret = regmap_update_bits(d->map, reg,
 					d->mask_buf_def[i], ~d->mask_buf[i]);
-			if (ret < 0)
-				dev_err(d->map->dev,
-					"Failed to sync unmasks in %x\n",
+			if (ret)
+				dev_err(d->map->dev, "Failed to sync masks in %x\n",
 					reg);
-			unmask_offset = d->chip->unmask_base -
-							d->chip->mask_base;
-			/* clear mask with unmask_base register */
-			ret = regmap_update_bits(d->map,
-					reg + unmask_offset,
-					d->mask_buf_def[i],
-					d->mask_buf[i]);
-		} else {
-			ret = regmap_update_bits(d->map, reg,
-					 d->mask_buf_def[i], d->mask_buf[i]);
 		}
-		if (ret != 0)
-			dev_err(d->map->dev, "Failed to sync masks in %x\n",
-				reg);
 
 		reg = sub_irq_reg(d, d->chip->wake_base, i);
 		if (d->wake_buf) {
@@ -704,7 +694,6 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 	int ret = -ENOMEM;
 	int num_type_reg;
 	u32 reg;
-	u32 unmask_offset;
 
 	if (chip->num_regs <= 0)
 		return -EINVAL;
@@ -832,6 +821,42 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 	d->chip = chip;
 	d->irq_base = irq_base;
 
+	if (chip->mask_base && chip->unmask_base &&
+	    !chip->mask_unmask_non_inverted) {
+		/*
+		 * Chips that specify both mask_base and unmask_base used to
+		 * get inverted mask behavior by default, with no way to ask
+		 * for the normal, non-inverted behavior. This "inverted by
+		 * default" behavior is deprecated, but we have to support it
+		 * until existing drivers have been fixed.
+		 *
+		 * Existing drivers should be updated by swapping mask_base
+		 * and unmask_base and setting mask_unmask_non_inverted=true.
+		 * New drivers should always set the flag.
+		 */
+		dev_warn(map->dev, "mask_base and unmask_base are inverted, please fix it");
+
+		/* Might as well warn about mask_invert while we're at it... */
+		if (chip->mask_invert)
+			dev_warn(map->dev, "mask_invert=true ignored");
+
+		d->mask_base = chip->unmask_base;
+		d->unmask_base = chip->mask_base;
+	} else if (chip->mask_invert) {
+		/*
+		 * Swap the roles of mask_base and unmask_base if the bits are
+		 * inverted. This is deprecated, drivers should use unmask_base
+		 * directly.
+		 */
+		dev_warn(map->dev, "mask_invert=true is deprecated; please switch to unmask_base");
+
+		d->mask_base = chip->unmask_base;
+		d->unmask_base = chip->mask_base;
+	} else {
+		d->mask_base = chip->mask_base;
+		d->unmask_base = chip->unmask_base;
+	}
+
 	if (chip->irq_reg_stride)
 		d->irq_reg_stride = chip->irq_reg_stride;
 	else
@@ -854,28 +879,27 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 	/* Mask all the interrupts by default */
 	for (i = 0; i < chip->num_regs; i++) {
 		d->mask_buf[i] = d->mask_buf_def[i];
-		if (!chip->mask_base)
-			continue;
-
-		reg = sub_irq_reg(d, d->chip->mask_base, i);
 
-		if (chip->mask_invert)
+		if (d->mask_base) {
+			reg = sub_irq_reg(d, d->mask_base, i);
 			ret = regmap_update_bits(d->map, reg,
-					 d->mask_buf[i], ~d->mask_buf[i]);
-		else if (d->chip->unmask_base) {
-			unmask_offset = d->chip->unmask_base -
-					d->chip->mask_base;
-			ret = regmap_update_bits(d->map,
-					reg + unmask_offset,
-					d->mask_buf[i],
-					d->mask_buf[i]);
-		} else
+					d->mask_buf_def[i], d->mask_buf[i]);
+			if (ret) {
+				dev_err(map->dev, "Failed to set masks in 0x%x: %d\n",
+					reg, ret);
+				goto err_alloc;
+			}
+		}
+
+		if (d->unmask_base) {
+			reg = sub_irq_reg(d, d->unmask_base, i);
 			ret = regmap_update_bits(d->map, reg,
-					 d->mask_buf[i], d->mask_buf[i]);
-		if (ret != 0) {
-			dev_err(map->dev, "Failed to set masks in 0x%x: %d\n",
-				reg, ret);
-			goto err_alloc;
+					d->mask_buf_def[i], ~d->mask_buf[i]);
+			if (ret) {
+				dev_err(map->dev, "Failed to set masks in 0x%x: %d\n",
+					reg, ret);
+				goto err_alloc;
+			}
 		}
 
 		if (!chip->init_ack_masked)
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index c9a9bbbb6261..c9c9e39750ed 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -1467,9 +1467,10 @@ struct regmap_irq_sub_irq_map {
  *		   main_status set.
  *
  * @status_base: Base status register address.
- * @mask_base:   Base mask register address.
- * @unmask_base:  Base unmask register address. for chips who have
- *                separate mask and unmask registers
+ * @mask_base:   Base mask register address. Mask bits are set to 1 when an
+ *               interrupt is masked, 0 when unmasked.
+ * @unmask_base:  Base unmask register address. Unmask bits are set to 1 when
+ *                an interrupt is unmasked and 0 when masked.
  * @ack_base:    Base ack address. If zero then the chip is clear on read.
  *               Using zero value is possible with @use_ack bit.
  * @wake_base:   Base address for wake enables.  If zero unsupported.
@@ -1481,6 +1482,16 @@ struct regmap_irq_sub_irq_map {
  * @irq_reg_stride:  Stride to use for chips where registers are not contiguous.
  * @init_ack_masked: Ack all masked interrupts once during initalization.
  * @mask_invert: Inverted mask register: cleared bits are masked out.
+ *		 Deprecated; prefer describing an inverted mask register as
+ *		 an unmask register.
+ * @mask_unmask_non_inverted: Controls mask bit inversion for chips that set
+ *	both @mask_base and @unmask_base. If false, mask and unmask bits are
+ *	inverted (which is deprecated behavior); if true, bits will not be
+ *	inverted and the registers keep their normal behavior. Note that if
+ *	you use only one of @mask_base or @unmask_base, this flag has no
+ *	effect and is unnecessary. Any new drivers that set both @mask_base
+ *	and @unmask_base should set this to true to avoid relying on the
+ *	deprecated behavior.
  * @use_ack:     Use @ack register even if it is zero.
  * @ack_invert:  Inverted ack register: cleared bits for ack.
  * @clear_ack:  Use this to set 1 and 0 or vice-versa to clear interrupts.
@@ -1546,6 +1557,7 @@ struct regmap_irq_chip {
 	unsigned int irq_reg_stride;
 	unsigned int init_ack_masked:1;
 	unsigned int mask_invert:1;
+	unsigned int mask_unmask_non_inverted:1;
 	unsigned int use_ack:1;
 	unsigned int ack_invert:1;
 	unsigned int clear_ack:1;
-- 
cgit 


From bdf9b86cd3adbbcf590ab82b74ab8554534c9b6e Mon Sep 17 00:00:00 2001
From: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Date: Thu, 23 Jun 2022 22:14:19 +0100
Subject: regmap-irq: Add get_irq_reg() callback

Replace the internal sub_irq_reg() function with a public callback
that drivers can use when they have more complex register layouts.
The default implementation is regmap_irq_get_irq_reg_linear(), used
if the chip doesn't provide its own callback.

Signed-off-by: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Link: https://lore.kernel.org/r/20220623211420.918875-12-aidanmacdonald.0x0@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap-irq.c | 126 +++++++++++++++++++++++++--------------
 include/linux/regmap.h           |  15 ++++-
 2 files changed, 93 insertions(+), 48 deletions(-)

(limited to 'include')

diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c
index 2c724ae185c4..71de097847a7 100644
--- a/drivers/base/regmap/regmap-irq.c
+++ b/drivers/base/regmap/regmap-irq.c
@@ -46,30 +46,12 @@ struct regmap_irq_chip_data {
 
 	unsigned int irq_reg_stride;
 
+	unsigned int (*get_irq_reg)(struct regmap_irq_chip_data *data,
+				    unsigned int base, int index);
+
 	unsigned int clear_status:1;
 };
 
-static int sub_irq_reg(struct regmap_irq_chip_data *data,
-		       unsigned int base_reg, int i)
-{
-	const struct regmap_irq_chip *chip = data->chip;
-	struct regmap *map = data->map;
-	struct regmap_irq_sub_irq_map *subreg;
-	unsigned int offset;
-	int reg = 0;
-
-	if (!chip->sub_reg_offsets || !chip->not_fixed_stride) {
-		/* Assume linear mapping */
-		reg = base_reg + (i * map->reg_stride * data->irq_reg_stride);
-	} else {
-		subreg = &chip->sub_reg_offsets[i];
-		offset = subreg->offset[0];
-		reg = base_reg + offset;
-	}
-
-	return reg;
-}
-
 static inline const
 struct regmap_irq *irq_to_regmap_irq(struct regmap_irq_chip_data *data,
 				     int irq)
@@ -81,7 +63,13 @@ static bool regmap_irq_can_bulk_read_status(struct regmap_irq_chip_data *data)
 {
 	struct regmap *map = data->map;
 
+	/*
+	 * While possible that a user-defined ->get_irq_reg() callback might
+	 * be linear enough to support bulk reads, most of the time it won't.
+	 * Therefore only allow them if the default callback is being used.
+	 */
 	return data->irq_reg_stride == 1 && map->reg_stride == 1 &&
+	       data->get_irq_reg == regmap_irq_get_irq_reg_linear &&
 	       !map->use_single_read;
 }
 
@@ -109,7 +97,7 @@ static void regmap_irq_sync_unlock(struct irq_data *data)
 
 	if (d->clear_status) {
 		for (i = 0; i < d->chip->num_regs; i++) {
-			reg = sub_irq_reg(d, d->chip->status_base, i);
+			reg = d->get_irq_reg(d, d->chip->status_base, i);
 
 			ret = regmap_read(map, reg, &val);
 			if (ret)
@@ -127,7 +115,7 @@ static void regmap_irq_sync_unlock(struct irq_data *data)
 	 */
 	for (i = 0; i < d->chip->num_regs; i++) {
 		if (d->mask_base) {
-			reg = sub_irq_reg(d, d->mask_base, i);
+			reg = d->get_irq_reg(d, d->mask_base, i);
 			ret = regmap_update_bits(d->map, reg,
 					d->mask_buf_def[i], d->mask_buf[i]);
 			if (ret)
@@ -136,7 +124,7 @@ static void regmap_irq_sync_unlock(struct irq_data *data)
 		}
 
 		if (d->unmask_base) {
-			reg = sub_irq_reg(d, d->unmask_base, i);
+			reg = d->get_irq_reg(d, d->unmask_base, i);
 			ret = regmap_update_bits(d->map, reg,
 					d->mask_buf_def[i], ~d->mask_buf[i]);
 			if (ret)
@@ -144,7 +132,7 @@ static void regmap_irq_sync_unlock(struct irq_data *data)
 					reg);
 		}
 
-		reg = sub_irq_reg(d, d->chip->wake_base, i);
+		reg = d->get_irq_reg(d, d->chip->wake_base, i);
 		if (d->wake_buf) {
 			if (d->chip->wake_invert)
 				ret = regmap_update_bits(d->map, reg,
@@ -168,7 +156,7 @@ static void regmap_irq_sync_unlock(struct irq_data *data)
 		 * it'll be ignored in irq handler, then may introduce irq storm
 		 */
 		if (d->mask_buf[i] && (d->chip->ack_base || d->chip->use_ack)) {
-			reg = sub_irq_reg(d, d->chip->ack_base, i);
+			reg = d->get_irq_reg(d, d->chip->ack_base, i);
 
 			/* some chips ack by write 0 */
 			if (d->chip->ack_invert)
@@ -192,7 +180,7 @@ static void regmap_irq_sync_unlock(struct irq_data *data)
 		for (i = 0; i < d->chip->num_type_reg; i++) {
 			if (!d->type_buf_def[i])
 				continue;
-			reg = sub_irq_reg(d, d->chip->type_base, i);
+			reg = d->get_irq_reg(d, d->chip->type_base, i);
 			if (d->chip->type_invert)
 				ret = regmap_update_bits(d->map, reg,
 					d->type_buf_def[i], ~d->type_buf[i]);
@@ -208,8 +196,8 @@ static void regmap_irq_sync_unlock(struct irq_data *data)
 	if (d->chip->num_virt_regs) {
 		for (i = 0; i < d->chip->num_virt_regs; i++) {
 			for (j = 0; j < d->chip->num_regs; j++) {
-				reg = sub_irq_reg(d, d->chip->virt_reg_base[i],
-						  j);
+				reg = d->get_irq_reg(d, d->chip->virt_reg_base[i],
+						     j);
 				ret = regmap_write(map, reg, d->virt_buf[i][j]);
 				if (ret != 0)
 					dev_err(d->map->dev,
@@ -221,7 +209,7 @@ static void regmap_irq_sync_unlock(struct irq_data *data)
 
 	for (i = 0; i < d->chip->num_config_bases; i++) {
 		for (j = 0; j < d->chip->num_config_regs; j++) {
-			reg = sub_irq_reg(d, d->chip->config_base[i], j);
+			reg = d->get_irq_reg(d, d->chip->config_base[i], j);
 			ret = regmap_write(map, reg, d->config_buf[i][j]);
 			if (ret)
 				dev_err(d->map->dev,
@@ -382,14 +370,17 @@ static inline int read_sub_irq_data(struct regmap_irq_chip_data *data,
 	const struct regmap_irq_chip *chip = data->chip;
 	struct regmap *map = data->map;
 	struct regmap_irq_sub_irq_map *subreg;
+	unsigned int reg;
 	int i, ret = 0;
 
 	if (!chip->sub_reg_offsets) {
-		/* Assume linear mapping */
-		ret = regmap_read(map, chip->status_base +
-				  (b * map->reg_stride * data->irq_reg_stride),
-				   &data->status_buf[b]);
+		reg = data->get_irq_reg(data, chip->status_base, b);
+		ret = regmap_read(map, reg, &data->status_buf[b]);
 	} else {
+		/*
+		 * Note we can't use ->get_irq_reg() here because the offsets
+		 * in 'subreg' are *not* interchangeable with indices.
+		 */
 		subreg = &chip->sub_reg_offsets[b];
 		for (i = 0; i < subreg->num_regs; i++) {
 			unsigned int offset = subreg->offset[i];
@@ -455,10 +446,18 @@ static irqreturn_t regmap_irq_thread(int irq, void *d)
 		 * sake of simplicity. and add bulk reads only if needed
 		 */
 		for (i = 0; i < chip->num_main_regs; i++) {
-			ret = regmap_read(map, chip->main_status +
-				  (i * map->reg_stride
-				   * data->irq_reg_stride),
-				  &data->main_status_buf[i]);
+			/*
+			 * For not_fixed_stride, don't use ->get_irq_reg().
+			 * It would produce an incorrect result.
+			 */
+			if (data->chip->not_fixed_stride)
+				reg = chip->main_status +
+					i * map->reg_stride * data->irq_reg_stride;
+			else
+				reg = data->get_irq_reg(data,
+							chip->main_status, i);
+
+			ret = regmap_read(map, reg, &data->main_status_buf[i]);
 			if (ret) {
 				dev_err(map->dev,
 					"Failed to read IRQ status %d\n",
@@ -523,7 +522,7 @@ static irqreturn_t regmap_irq_thread(int irq, void *d)
 
 	} else {
 		for (i = 0; i < data->chip->num_regs; i++) {
-			unsigned int reg = sub_irq_reg(data,
+			unsigned int reg = data->get_irq_reg(data,
 					data->chip->status_base, i);
 			ret = regmap_read(map, reg, &data->status_buf[i]);
 
@@ -551,7 +550,7 @@ static irqreturn_t regmap_irq_thread(int irq, void *d)
 		data->status_buf[i] &= ~data->mask_buf[i];
 
 		if (data->status_buf[i] && (chip->ack_base || chip->use_ack)) {
-			reg = sub_irq_reg(data, data->chip->ack_base, i);
+			reg = data->get_irq_reg(data, data->chip->ack_base, i);
 
 			if (chip->ack_invert)
 				ret = regmap_write(map, reg,
@@ -611,6 +610,36 @@ static const struct irq_domain_ops regmap_domain_ops = {
 	.xlate	= irq_domain_xlate_onetwocell,
 };
 
+/**
+ * regmap_irq_get_irq_reg_linear() - Linear IRQ register mapping callback.
+ * @data: Data for the &struct regmap_irq_chip
+ * @base: Base register
+ * @index: Register index
+ *
+ * Returns the register address corresponding to the given @base and @index
+ * by the formula ``base + index * regmap_stride * irq_reg_stride``.
+ */
+unsigned int regmap_irq_get_irq_reg_linear(struct regmap_irq_chip_data *data,
+					   unsigned int base, int index)
+{
+	const struct regmap_irq_chip *chip = data->chip;
+	struct regmap *map = data->map;
+
+	/*
+	 * FIXME: This is for backward compatibility and should be removed
+	 * when not_fixed_stride is dropped (it's only used by qcom-pm8008).
+	 */
+	if (chip->not_fixed_stride && chip->sub_reg_offsets) {
+		struct regmap_irq_sub_irq_map *subreg;
+
+		subreg = &chip->sub_reg_offsets[0];
+		return base + subreg->offset[0];
+	}
+
+	return base + index * map->reg_stride * chip->irq_reg_stride;
+}
+EXPORT_SYMBOL_GPL(regmap_irq_get_irq_reg_linear);
+
 /**
  * regmap_irq_set_type_config_simple() - Simple IRQ type configuration callback.
  * @buf: Buffer containing configuration register values, this is a 2D array of
@@ -862,6 +891,11 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 	else
 		d->irq_reg_stride = 1;
 
+	if (chip->get_irq_reg)
+		d->get_irq_reg = chip->get_irq_reg;
+	else
+		d->get_irq_reg = regmap_irq_get_irq_reg_linear;
+
 	if (regmap_irq_can_bulk_read_status(d)) {
 		d->status_reg_buf = kmalloc_array(chip->num_regs,
 						  map->format.val_bytes,
@@ -881,7 +915,7 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 		d->mask_buf[i] = d->mask_buf_def[i];
 
 		if (d->mask_base) {
-			reg = sub_irq_reg(d, d->mask_base, i);
+			reg = d->get_irq_reg(d, d->mask_base, i);
 			ret = regmap_update_bits(d->map, reg,
 					d->mask_buf_def[i], d->mask_buf[i]);
 			if (ret) {
@@ -892,7 +926,7 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 		}
 
 		if (d->unmask_base) {
-			reg = sub_irq_reg(d, d->unmask_base, i);
+			reg = d->get_irq_reg(d, d->unmask_base, i);
 			ret = regmap_update_bits(d->map, reg,
 					d->mask_buf_def[i], ~d->mask_buf[i]);
 			if (ret) {
@@ -906,7 +940,7 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 			continue;
 
 		/* Ack masked but set interrupts */
-		reg = sub_irq_reg(d, d->chip->status_base, i);
+		reg = d->get_irq_reg(d, d->chip->status_base, i);
 		ret = regmap_read(map, reg, &d->status_buf[i]);
 		if (ret != 0) {
 			dev_err(map->dev, "Failed to read IRQ status: %d\n",
@@ -918,7 +952,7 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 			d->status_buf[i] = ~d->status_buf[i];
 
 		if (d->status_buf[i] && (chip->ack_base || chip->use_ack)) {
-			reg = sub_irq_reg(d, d->chip->ack_base, i);
+			reg = d->get_irq_reg(d, d->chip->ack_base, i);
 			if (chip->ack_invert)
 				ret = regmap_write(map, reg,
 					~(d->status_buf[i] & d->mask_buf[i]));
@@ -943,7 +977,7 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 	if (d->wake_buf) {
 		for (i = 0; i < chip->num_regs; i++) {
 			d->wake_buf[i] = d->mask_buf_def[i];
-			reg = sub_irq_reg(d, d->chip->wake_base, i);
+			reg = d->get_irq_reg(d, d->chip->wake_base, i);
 
 			if (chip->wake_invert)
 				ret = regmap_update_bits(d->map, reg,
@@ -963,7 +997,7 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 
 	if (chip->num_type_reg && !chip->type_in_mask) {
 		for (i = 0; i < chip->num_type_reg; ++i) {
-			reg = sub_irq_reg(d, d->chip->type_base, i);
+			reg = d->get_irq_reg(d, d->chip->type_base, i);
 
 			ret = regmap_read(map, reg, &d->type_buf_def[i]);
 
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index c9c9e39750ed..0d240922d4da 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -1440,6 +1440,8 @@ struct regmap_irq_sub_irq_map {
 	unsigned int *offset;
 };
 
+struct regmap_irq_chip_data;
+
 /**
  * struct regmap_irq_chip - Description of a generic regmap irq_chip.
  *
@@ -1531,6 +1533,13 @@ struct regmap_irq_sub_irq_map {
  *		     and configure virt regs. Deprecated, use @set_type_config
  *		     callback and config registers instead.
  * @set_type_config: Callback used for configuring irq types.
+ * @get_irq_reg: Callback for mapping (base register, index) pairs to register
+ *		 addresses. The base register will be one of @status_base,
+ *		 @mask_base, etc., @main_status, or any of @config_base.
+ *		 The index will be in the range [0, num_main_regs[ for the
+ *		 main status base, [0, num_type_settings[ for any config
+ *		 register base, and [0, num_regs[ for any other base.
+ *		 If unspecified then regmap_irq_get_irq_reg_linear() is used.
  * @irq_drv_data:    Driver specific IRQ data which is passed as parameter when
  *		     driver specific pre/post interrupt handler is called.
  *
@@ -1585,11 +1594,13 @@ struct regmap_irq_chip {
 			     unsigned long hwirq, int reg);
 	int (*set_type_config)(unsigned int **buf, unsigned int type,
 			       const struct regmap_irq *irq_data, int idx);
+	unsigned int (*get_irq_reg)(struct regmap_irq_chip_data *data,
+				    unsigned int base, int index);
 	void *irq_drv_data;
 };
 
-struct regmap_irq_chip_data;
-
+unsigned int regmap_irq_get_irq_reg_linear(struct regmap_irq_chip_data *data,
+					   unsigned int base, int index);
 int regmap_irq_set_type_config_simple(unsigned int **buf, unsigned int type,
 				      const struct regmap_irq *irq_data, int idx);
 
-- 
cgit 


From 48e014ee9a61e8f4700987b82f7cb1dc3c89fa76 Mon Sep 17 00:00:00 2001
From: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Date: Thu, 23 Jun 2022 22:14:20 +0100
Subject: regmap-irq: Deprecate the not_fixed_stride flag

This flag is a bit of a hack and the same thing can be accomplished
using a custom ->get_irq_reg() callback. Add a warning to catch any
use of the flag.

Signed-off-by: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Link: https://lore.kernel.org/r/20220623211420.918875-13-aidanmacdonald.0x0@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap-irq.c | 2 ++
 include/linux/regmap.h           | 6 ++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c
index 71de097847a7..a691553a0d6f 100644
--- a/drivers/base/regmap/regmap-irq.c
+++ b/drivers/base/regmap/regmap-irq.c
@@ -739,6 +739,8 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 	}
 
 	if (chip->not_fixed_stride) {
+		dev_warn(map->dev, "not_fixed_stride is deprecated; use ->get_irq_reg() instead");
+
 		for (i = 0; i < chip->num_regs; i++)
 			if (chip->sub_reg_offsets[i].num_regs != 1)
 				return -EINVAL;
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 0d240922d4da..7cf2157134ac 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -1509,8 +1509,10 @@ struct regmap_irq_chip_data;
  *                   registers before unmasking interrupts to clear any bits
  *                   set when they were masked.
  * @not_fixed_stride: Used when chip peripherals are not laid out with fixed
- * 		      stride. Must be used with sub_reg_offsets containing the
- * 		      offsets to each peripheral.
+ *		      stride. Must be used with sub_reg_offsets containing the
+ *		      offsets to each peripheral. Deprecated; the same thing
+ *		      can be accomplished with a @get_irq_reg callback, without
+ *		      the need for a @sub_reg_offsets table.
  * @status_invert: Inverted status register: cleared bits are active interrupts.
  * @runtime_pm:  Hold a runtime PM lock on the device when accessing it.
  *
-- 
cgit 


From ae92b1c84306a68c361d90b46ad3acef99c10e29 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@kernel.org>
Date: Tue, 28 Jun 2022 10:46:24 +0100
Subject: usb: typec_altmode: add a missing "@" at a kernel-doc parameter

Without that, the parameter is not properly parsed:
	include/linux/usb/typec_altmode.h:132: warning: Function parameter or member 'altmode' not described in 'typec_altmode_get_orientation'

Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
Acked-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/70dc4c5d744cf1fe9a0efe6b85deaa0489628282.1656409369.git.mchehab@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/typec_altmode.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/usb/typec_altmode.h b/include/linux/usb/typec_altmode.h
index 65933cbe9129..350d49012659 100644
--- a/include/linux/usb/typec_altmode.h
+++ b/include/linux/usb/typec_altmode.h
@@ -124,7 +124,7 @@ struct typec_altmode *typec_match_altmode(struct typec_altmode **altmodes,
 
 /**
  * typec_altmode_get_orientation - Get cable plug orientation
- * altmode: Handle to the alternate mode
+ * @altmode: Handle to the alternate mode
  */
 static inline enum typec_orientation
 typec_altmode_get_orientation(struct typec_altmode *altmode)
-- 
cgit 


From af3f4134006bf3bf894179c08aaf98ed5938cf4e Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Tue, 28 Jun 2022 10:43:04 -0700
Subject: bpf: add bpf_func_t and trampoline helpers

I'll be adding lsm cgroup specific helpers that grab
trampoline mutex.

No functional changes.

Reviewed-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20220628174314.1216643-2-sdf@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h     | 11 ++++-----
 kernel/bpf/trampoline.c | 63 ++++++++++++++++++++++++++-----------------------
 2 files changed, 38 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d05e1495a06e..d547be9db75f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -56,6 +56,8 @@ typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64);
 typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
 					struct bpf_iter_aux_info *aux);
 typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data);
+typedef unsigned int (*bpf_func_t)(const void *,
+				   const struct bpf_insn *);
 struct bpf_iter_seq_info {
 	const struct seq_operations *seq_ops;
 	bpf_iter_init_seq_priv_t init_seq_private;
@@ -879,8 +881,7 @@ struct bpf_dispatcher {
 static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func(
 	const void *ctx,
 	const struct bpf_insn *insnsi,
-	unsigned int (*bpf_func)(const void *,
-				 const struct bpf_insn *))
+	bpf_func_t bpf_func)
 {
 	return bpf_func(ctx, insnsi);
 }
@@ -909,8 +910,7 @@ int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs);
 	noinline __nocfi unsigned int bpf_dispatcher_##name##_func(	\
 		const void *ctx,					\
 		const struct bpf_insn *insnsi,				\
-		unsigned int (*bpf_func)(const void *,			\
-					 const struct bpf_insn *))	\
+		bpf_func_t bpf_func)					\
 	{								\
 		return bpf_func(ctx, insnsi);				\
 	}								\
@@ -921,8 +921,7 @@ int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs);
 	unsigned int bpf_dispatcher_##name##_func(			\
 		const void *ctx,					\
 		const struct bpf_insn *insnsi,				\
-		unsigned int (*bpf_func)(const void *,			\
-					 const struct bpf_insn *));	\
+		bpf_func_t bpf_func);					\
 	extern struct bpf_dispatcher bpf_dispatcher_##name;
 #define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_##name##_func
 #define BPF_DISPATCHER_PTR(name) (&bpf_dispatcher_##name)
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 93c7675f0c9e..5466e15be61f 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -410,7 +410,7 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
 	}
 }
 
-int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
 {
 	enum bpf_tramp_prog_type kind;
 	struct bpf_tramp_link *link_exiting;
@@ -418,44 +418,33 @@ int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline
 	int cnt = 0, i;
 
 	kind = bpf_attach_type_to_tramp(link->link.prog);
-	mutex_lock(&tr->mutex);
-	if (tr->extension_prog) {
+	if (tr->extension_prog)
 		/* cannot attach fentry/fexit if extension prog is attached.
 		 * cannot overwrite extension prog either.
 		 */
-		err = -EBUSY;
-		goto out;
-	}
+		return -EBUSY;
 
 	for (i = 0; i < BPF_TRAMP_MAX; i++)
 		cnt += tr->progs_cnt[i];
 
 	if (kind == BPF_TRAMP_REPLACE) {
 		/* Cannot attach extension if fentry/fexit are in use. */
-		if (cnt) {
-			err = -EBUSY;
-			goto out;
-		}
+		if (cnt)
+			return -EBUSY;
 		tr->extension_prog = link->link.prog;
-		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
-					 link->link.prog->bpf_func);
-		goto out;
-	}
-	if (cnt >= BPF_MAX_TRAMP_LINKS) {
-		err = -E2BIG;
-		goto out;
+		return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
+					  link->link.prog->bpf_func);
 	}
-	if (!hlist_unhashed(&link->tramp_hlist)) {
+	if (cnt >= BPF_MAX_TRAMP_LINKS)
+		return -E2BIG;
+	if (!hlist_unhashed(&link->tramp_hlist))
 		/* prog already linked */
-		err = -EBUSY;
-		goto out;
-	}
+		return -EBUSY;
 	hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) {
 		if (link_exiting->link.prog != link->link.prog)
 			continue;
 		/* prog already linked */
-		err = -EBUSY;
-		goto out;
+		return -EBUSY;
 	}
 
 	hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]);
@@ -465,30 +454,44 @@ int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline
 		hlist_del_init(&link->tramp_hlist);
 		tr->progs_cnt[kind]--;
 	}
-out:
+	return err;
+}
+
+int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+{
+	int err;
+
+	mutex_lock(&tr->mutex);
+	err = __bpf_trampoline_link_prog(link, tr);
 	mutex_unlock(&tr->mutex);
 	return err;
 }
 
-/* bpf_trampoline_unlink_prog() should never fail. */
-int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
 {
 	enum bpf_tramp_prog_type kind;
 	int err;
 
 	kind = bpf_attach_type_to_tramp(link->link.prog);
-	mutex_lock(&tr->mutex);
 	if (kind == BPF_TRAMP_REPLACE) {
 		WARN_ON_ONCE(!tr->extension_prog);
 		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
 					 tr->extension_prog->bpf_func, NULL);
 		tr->extension_prog = NULL;
-		goto out;
+		return err;
 	}
 	hlist_del_init(&link->tramp_hlist);
 	tr->progs_cnt[kind]--;
-	err = bpf_trampoline_update(tr);
-out:
+	return bpf_trampoline_update(tr);
+}
+
+/* bpf_trampoline_unlink_prog() should never fail. */
+int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+{
+	int err;
+
+	mutex_lock(&tr->mutex);
+	err = __bpf_trampoline_unlink_prog(link, tr);
 	mutex_unlock(&tr->mutex);
 	return err;
 }
-- 
cgit 


From 00442143a2ab7f1da46fbf4d2a99c85df767d49a Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Tue, 28 Jun 2022 10:43:05 -0700
Subject: bpf: convert cgroup_bpf.progs to hlist

This lets us reclaim some space to be used by new cgroup lsm slots.

Before:
struct cgroup_bpf {
	struct bpf_prog_array *    effective[23];        /*     0   184 */
	/* --- cacheline 2 boundary (128 bytes) was 56 bytes ago --- */
	struct list_head           progs[23];            /*   184   368 */
	/* --- cacheline 8 boundary (512 bytes) was 40 bytes ago --- */
	u32                        flags[23];            /*   552    92 */

	/* XXX 4 bytes hole, try to pack */

	/* --- cacheline 10 boundary (640 bytes) was 8 bytes ago --- */
	struct list_head           storages;             /*   648    16 */
	struct bpf_prog_array *    inactive;             /*   664     8 */
	struct percpu_ref          refcnt;               /*   672    16 */
	struct work_struct         release_work;         /*   688    32 */

	/* size: 720, cachelines: 12, members: 7 */
	/* sum members: 716, holes: 1, sum holes: 4 */
	/* last cacheline: 16 bytes */
};

After:
struct cgroup_bpf {
	struct bpf_prog_array *    effective[23];        /*     0   184 */
	/* --- cacheline 2 boundary (128 bytes) was 56 bytes ago --- */
	struct hlist_head          progs[23];            /*   184   184 */
	/* --- cacheline 5 boundary (320 bytes) was 48 bytes ago --- */
	u8                         flags[23];            /*   368    23 */

	/* XXX 1 byte hole, try to pack */

	/* --- cacheline 6 boundary (384 bytes) was 8 bytes ago --- */
	struct list_head           storages;             /*   392    16 */
	struct bpf_prog_array *    inactive;             /*   408     8 */
	struct percpu_ref          refcnt;               /*   416    16 */
	struct work_struct         release_work;         /*   432    72 */

	/* size: 504, cachelines: 8, members: 7 */
	/* sum members: 503, holes: 1, sum holes: 1 */
	/* last cacheline: 56 bytes */
};

Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Reviewed-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20220628174314.1216643-3-sdf@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf-cgroup-defs.h |  4 +--
 include/linux/bpf-cgroup.h      |  2 +-
 kernel/bpf/cgroup.c             | 76 ++++++++++++++++++++++++-----------------
 3 files changed, 47 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h
index 695d1224a71b..5d268e76d8e6 100644
--- a/include/linux/bpf-cgroup-defs.h
+++ b/include/linux/bpf-cgroup-defs.h
@@ -47,8 +47,8 @@ struct cgroup_bpf {
 	 * have either zero or one element
 	 * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS
 	 */
-	struct list_head progs[MAX_CGROUP_BPF_ATTACH_TYPE];
-	u32 flags[MAX_CGROUP_BPF_ATTACH_TYPE];
+	struct hlist_head progs[MAX_CGROUP_BPF_ATTACH_TYPE];
+	u8 flags[MAX_CGROUP_BPF_ATTACH_TYPE];
 
 	/* list of cgroup shared storages */
 	struct list_head storages;
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 669d96d074ad..6673acfbf2ef 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -95,7 +95,7 @@ struct bpf_cgroup_link {
 };
 
 struct bpf_prog_list {
-	struct list_head node;
+	struct hlist_node node;
 	struct bpf_prog *prog;
 	struct bpf_cgroup_link *link;
 	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 7a394f7c205c..4adb4f3ecb7f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -157,11 +157,12 @@ static void cgroup_bpf_release(struct work_struct *work)
 	mutex_lock(&cgroup_mutex);
 
 	for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
-		struct list_head *progs = &cgrp->bpf.progs[atype];
-		struct bpf_prog_list *pl, *pltmp;
+		struct hlist_head *progs = &cgrp->bpf.progs[atype];
+		struct bpf_prog_list *pl;
+		struct hlist_node *pltmp;
 
-		list_for_each_entry_safe(pl, pltmp, progs, node) {
-			list_del(&pl->node);
+		hlist_for_each_entry_safe(pl, pltmp, progs, node) {
+			hlist_del(&pl->node);
 			if (pl->prog)
 				bpf_prog_put(pl->prog);
 			if (pl->link)
@@ -217,12 +218,12 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
 /* count number of elements in the list.
  * it's slow but the list cannot be long
  */
-static u32 prog_list_length(struct list_head *head)
+static u32 prog_list_length(struct hlist_head *head)
 {
 	struct bpf_prog_list *pl;
 	u32 cnt = 0;
 
-	list_for_each_entry(pl, head, node) {
+	hlist_for_each_entry(pl, head, node) {
 		if (!prog_list_prog(pl))
 			continue;
 		cnt++;
@@ -291,7 +292,7 @@ static int compute_effective_progs(struct cgroup *cgrp,
 		if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
 			continue;
 
-		list_for_each_entry(pl, &p->bpf.progs[atype], node) {
+		hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
 			if (!prog_list_prog(pl))
 				continue;
 
@@ -342,7 +343,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
 		cgroup_bpf_get(p);
 
 	for (i = 0; i < NR; i++)
-		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
+		INIT_HLIST_HEAD(&cgrp->bpf.progs[i]);
 
 	INIT_LIST_HEAD(&cgrp->bpf.storages);
 
@@ -418,7 +419,7 @@ cleanup:
 
 #define BPF_CGROUP_MAX_PROGS 64
 
-static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
+static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
 					       struct bpf_prog *prog,
 					       struct bpf_cgroup_link *link,
 					       struct bpf_prog *replace_prog,
@@ -428,12 +429,12 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
 
 	/* single-attach case */
 	if (!allow_multi) {
-		if (list_empty(progs))
+		if (hlist_empty(progs))
 			return NULL;
-		return list_first_entry(progs, typeof(*pl), node);
+		return hlist_entry(progs->first, typeof(*pl), node);
 	}
 
-	list_for_each_entry(pl, progs, node) {
+	hlist_for_each_entry(pl, progs, node) {
 		if (prog && pl->prog == prog && prog != replace_prog)
 			/* disallow attaching the same prog twice */
 			return ERR_PTR(-EINVAL);
@@ -444,7 +445,7 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
 
 	/* direct prog multi-attach w/ replacement case */
 	if (replace_prog) {
-		list_for_each_entry(pl, progs, node) {
+		hlist_for_each_entry(pl, progs, node) {
 			if (pl->prog == replace_prog)
 				/* a match found */
 				return pl;
@@ -480,7 +481,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
 	struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
 	enum cgroup_bpf_attach_type atype;
 	struct bpf_prog_list *pl;
-	struct list_head *progs;
+	struct hlist_head *progs;
 	int err;
 
 	if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
@@ -503,7 +504,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
 	if (!hierarchy_allows_attach(cgrp, atype))
 		return -EPERM;
 
-	if (!list_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
+	if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
 		/* Disallow attaching non-overridable on top
 		 * of existing overridable in this cgroup.
 		 * Disallow attaching multi-prog if overridable or none
@@ -525,12 +526,22 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
 	if (pl) {
 		old_prog = pl->prog;
 	} else {
+		struct hlist_node *last = NULL;
+
 		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
 		if (!pl) {
 			bpf_cgroup_storages_free(new_storage);
 			return -ENOMEM;
 		}
-		list_add_tail(&pl->node, progs);
+		if (hlist_empty(progs))
+			hlist_add_head(&pl->node, progs);
+		else
+			hlist_for_each(last, progs) {
+				if (last->next)
+					continue;
+				hlist_add_behind(&pl->node, last);
+				break;
+			}
 	}
 
 	pl->prog = prog;
@@ -556,7 +567,7 @@ cleanup:
 	}
 	bpf_cgroup_storages_free(new_storage);
 	if (!old_prog) {
-		list_del(&pl->node);
+		hlist_del(&pl->node);
 		kfree(pl);
 	}
 	return err;
@@ -587,7 +598,7 @@ static void replace_effective_prog(struct cgroup *cgrp,
 	struct cgroup_subsys_state *css;
 	struct bpf_prog_array *progs;
 	struct bpf_prog_list *pl;
-	struct list_head *head;
+	struct hlist_head *head;
 	struct cgroup *cg;
 	int pos;
 
@@ -603,7 +614,7 @@ static void replace_effective_prog(struct cgroup *cgrp,
 				continue;
 
 			head = &cg->bpf.progs[atype];
-			list_for_each_entry(pl, head, node) {
+			hlist_for_each_entry(pl, head, node) {
 				if (!prog_list_prog(pl))
 					continue;
 				if (pl->link == link)
@@ -637,7 +648,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
 	enum cgroup_bpf_attach_type atype;
 	struct bpf_prog *old_prog;
 	struct bpf_prog_list *pl;
-	struct list_head *progs;
+	struct hlist_head *progs;
 	bool found = false;
 
 	atype = to_cgroup_bpf_attach_type(link->type);
@@ -649,7 +660,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
 	if (link->link.prog->type != new_prog->type)
 		return -EINVAL;
 
-	list_for_each_entry(pl, progs, node) {
+	hlist_for_each_entry(pl, progs, node) {
 		if (pl->link == link) {
 			found = true;
 			break;
@@ -688,7 +699,7 @@ out_unlock:
 	return ret;
 }
 
-static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
+static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
 					       struct bpf_prog *prog,
 					       struct bpf_cgroup_link *link,
 					       bool allow_multi)
@@ -696,14 +707,14 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
 	struct bpf_prog_list *pl;
 
 	if (!allow_multi) {
-		if (list_empty(progs))
+		if (hlist_empty(progs))
 			/* report error when trying to detach and nothing is attached */
 			return ERR_PTR(-ENOENT);
 
 		/* to maintain backward compatibility NONE and OVERRIDE cgroups
 		 * allow detaching with invalid FD (prog==NULL) in legacy mode
 		 */
-		return list_first_entry(progs, typeof(*pl), node);
+		return hlist_entry(progs->first, typeof(*pl), node);
 	}
 
 	if (!prog && !link)
@@ -713,7 +724,7 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
 		return ERR_PTR(-EINVAL);
 
 	/* find the prog or link and detach it */
-	list_for_each_entry(pl, progs, node) {
+	hlist_for_each_entry(pl, progs, node) {
 		if (pl->prog == prog && pl->link == link)
 			return pl;
 	}
@@ -737,7 +748,7 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
 	struct cgroup_subsys_state *css;
 	struct bpf_prog_array *progs;
 	struct bpf_prog_list *pl;
-	struct list_head *head;
+	struct hlist_head *head;
 	struct cgroup *cg;
 	int pos;
 
@@ -754,7 +765,7 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
 				continue;
 
 			head = &cg->bpf.progs[atype];
-			list_for_each_entry(pl, head, node) {
+			hlist_for_each_entry(pl, head, node) {
 				if (!prog_list_prog(pl))
 					continue;
 				if (pl->prog == prog && pl->link == link)
@@ -791,7 +802,7 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 	enum cgroup_bpf_attach_type atype;
 	struct bpf_prog *old_prog;
 	struct bpf_prog_list *pl;
-	struct list_head *progs;
+	struct hlist_head *progs;
 	u32 flags;
 
 	atype = to_cgroup_bpf_attach_type(type);
@@ -822,9 +833,10 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 	}
 
 	/* now can actually delete it from this cgroup list */
-	list_del(&pl->node);
+	hlist_del(&pl->node);
+
 	kfree(pl);
-	if (list_empty(progs))
+	if (hlist_empty(progs))
 		/* last program was detached, reset flags to zero */
 		cgrp->bpf.flags[atype] = 0;
 	if (old_prog)
@@ -852,7 +864,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 	enum bpf_attach_type type = attr->query.attach_type;
 	enum cgroup_bpf_attach_type atype;
 	struct bpf_prog_array *effective;
-	struct list_head *progs;
+	struct hlist_head *progs;
 	struct bpf_prog *prog;
 	int cnt, ret = 0, i;
 	u32 flags;
@@ -891,7 +903,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 		u32 id;
 
 		i = 0;
-		list_for_each_entry(pl, progs, node) {
+		hlist_for_each_entry(pl, progs, node) {
 			prog = prog_list_prog(pl);
 			id = prog->aux->id;
 			if (copy_to_user(prog_ids + i, &id, sizeof(id)))
-- 
cgit 


From 69fd337a975c7e690dfe49d9cb4fe5ba1e6db44e Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Tue, 28 Jun 2022 10:43:06 -0700
Subject: bpf: per-cgroup lsm flavor

Allow attaching to lsm hooks in the cgroup context.

Attaching to per-cgroup LSM works exactly like attaching
to other per-cgroup hooks. New BPF_LSM_CGROUP is added
to trigger new mode; the actual lsm hook we attach to is
signaled via existing attach_btf_id.

For the hooks that have 'struct socket' or 'struct sock' as its first
argument, we use the cgroup associated with that socket. For the rest,
we use 'current' cgroup (this is all on default hierarchy == v2 only).
Note that for some hooks that work on 'struct sock' we still
take the cgroup from 'current' because some of them work on the socket
that hasn't been properly initialized yet.

Behind the scenes, we allocate a shim program that is attached
to the trampoline and runs cgroup effective BPF programs array.
This shim has some rudimentary ref counting and can be shared
between several programs attaching to the same lsm hook from
different cgroups.

Note that this patch bloats cgroup size because we add 211
cgroup_bpf_attach_type(s) for simplicity sake. This will be
addressed in the subsequent patch.

Also note that we only add non-sleepable flavor for now. To enable
sleepable use-cases, bpf_prog_run_array_cg has to grab trace rcu,
shim programs have to be freed via trace rcu, cgroup_bpf.effective
should be also trace-rcu-managed + maybe some other changes that
I'm not aware of.

Reviewed-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20220628174314.1216643-4-sdf@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c     |  24 +++--
 include/linux/bpf-cgroup-defs.h |   8 ++
 include/linux/bpf-cgroup.h      |   7 ++
 include/linux/bpf.h             |  24 +++++
 include/linux/bpf_lsm.h         |  13 +++
 include/linux/btf_ids.h         |   3 +-
 include/uapi/linux/bpf.h        |   1 +
 kernel/bpf/bpf_lsm.c            |  48 ++++++++++
 kernel/bpf/btf.c                |  11 +++
 kernel/bpf/cgroup.c             | 136 ++++++++++++++++++++++++---
 kernel/bpf/core.c               |   2 +
 kernel/bpf/syscall.c            |  10 ++
 kernel/bpf/trampoline.c         | 198 ++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c           |  32 +++++++
 tools/include/uapi/linux/bpf.h  |   1 +
 15 files changed, 498 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 2c51ca9f7cec..2f460c67f9c7 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1770,6 +1770,10 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
 			   struct bpf_tramp_link *l, int stack_size,
 			   int run_ctx_off, bool save_ret)
 {
+	void (*exit)(struct bpf_prog *prog, u64 start,
+		     struct bpf_tramp_run_ctx *run_ctx) = __bpf_prog_exit;
+	u64 (*enter)(struct bpf_prog *prog,
+		     struct bpf_tramp_run_ctx *run_ctx) = __bpf_prog_enter;
 	u8 *prog = *pprog;
 	u8 *jmp_insn;
 	int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie);
@@ -1788,15 +1792,21 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
 	 */
 	emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_1, -run_ctx_off + ctx_cookie_off);
 
+	if (p->aux->sleepable) {
+		enter = __bpf_prog_enter_sleepable;
+		exit = __bpf_prog_exit_sleepable;
+	} else if (p->expected_attach_type == BPF_LSM_CGROUP) {
+		enter = __bpf_prog_enter_lsm_cgroup;
+		exit = __bpf_prog_exit_lsm_cgroup;
+	}
+
 	/* arg1: mov rdi, progs[i] */
 	emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
 	/* arg2: lea rsi, [rbp - ctx_cookie_off] */
 	EMIT4(0x48, 0x8D, 0x75, -run_ctx_off);
 
-	if (emit_call(&prog,
-		      p->aux->sleepable ? __bpf_prog_enter_sleepable :
-		      __bpf_prog_enter, prog))
-			return -EINVAL;
+	if (emit_call(&prog, enter, prog))
+		return -EINVAL;
 	/* remember prog start time returned by __bpf_prog_enter */
 	emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
 
@@ -1840,10 +1850,8 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
 	emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
 	/* arg3: lea rdx, [rbp - run_ctx_off] */
 	EMIT4(0x48, 0x8D, 0x55, -run_ctx_off);
-	if (emit_call(&prog,
-		      p->aux->sleepable ? __bpf_prog_exit_sleepable :
-		      __bpf_prog_exit, prog))
-			return -EINVAL;
+	if (emit_call(&prog, exit, prog))
+		return -EINVAL;
 
 	*pprog = prog;
 	return 0;
diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h
index 5d268e76d8e6..b99f8c3e37ea 100644
--- a/include/linux/bpf-cgroup-defs.h
+++ b/include/linux/bpf-cgroup-defs.h
@@ -10,6 +10,12 @@
 
 struct bpf_prog_array;
 
+#ifdef CONFIG_BPF_LSM
+#define CGROUP_LSM_NUM 211 /* will be addressed in the next patch */
+#else
+#define CGROUP_LSM_NUM 0
+#endif
+
 enum cgroup_bpf_attach_type {
 	CGROUP_BPF_ATTACH_TYPE_INVALID = -1,
 	CGROUP_INET_INGRESS = 0,
@@ -35,6 +41,8 @@ enum cgroup_bpf_attach_type {
 	CGROUP_INET4_GETSOCKNAME,
 	CGROUP_INET6_GETSOCKNAME,
 	CGROUP_INET_SOCK_RELEASE,
+	CGROUP_LSM_START,
+	CGROUP_LSM_END = CGROUP_LSM_START + CGROUP_LSM_NUM - 1,
 	MAX_CGROUP_BPF_ATTACH_TYPE
 };
 
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 6673acfbf2ef..2bd1b5f8de9b 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -23,6 +23,13 @@ struct ctl_table;
 struct ctl_table_header;
 struct task_struct;
 
+unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
+				       const struct bpf_insn *insn);
+unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx,
+					 const struct bpf_insn *insn);
+unsigned int __cgroup_bpf_run_lsm_current(const void *ctx,
+					  const struct bpf_insn *insn);
+
 #ifdef CONFIG_CGROUP_BPF
 
 #define CGROUP_ATYPE(type) \
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d547be9db75f..77cd613a00bd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -794,6 +794,10 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_
 u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx);
 void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
 				       struct bpf_tramp_run_ctx *run_ctx);
+u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
+					struct bpf_tramp_run_ctx *run_ctx);
+void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
+					struct bpf_tramp_run_ctx *run_ctx);
 void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr);
 void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr);
 
@@ -1060,6 +1064,7 @@ struct bpf_prog_aux {
 	struct user_struct *user;
 	u64 load_time; /* ns since boottime */
 	u32 verified_insns;
+	int cgroup_atype; /* enum cgroup_bpf_attach_type */
 	struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
 	char name[BPF_OBJ_NAME_LEN];
 #ifdef CONFIG_SECURITY
@@ -1167,6 +1172,11 @@ struct bpf_tramp_link {
 	u64 cookie;
 };
 
+struct bpf_shim_tramp_link {
+	struct bpf_tramp_link link;
+	struct bpf_trampoline *trampoline;
+};
+
 struct bpf_tracing_link {
 	struct bpf_tramp_link link;
 	enum bpf_attach_type attach_type;
@@ -1245,6 +1255,9 @@ struct bpf_dummy_ops {
 int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 			    union bpf_attr __user *uattr);
 #endif
+int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
+				    int cgroup_atype);
+void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog);
 #else
 static inline const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
 {
@@ -1268,6 +1281,14 @@ static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map,
 {
 	return -EINVAL;
 }
+static inline int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
+						  int cgroup_atype)
+{
+	return -EOPNOTSUPP;
+}
+static inline void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog)
+{
+}
 #endif
 
 struct bpf_array {
@@ -2368,6 +2389,8 @@ extern const struct bpf_func_proto bpf_sk_getsockopt_proto;
 extern const struct bpf_func_proto bpf_find_vma_proto;
 extern const struct bpf_func_proto bpf_loop_proto;
 extern const struct bpf_func_proto bpf_copy_from_user_task_proto;
+extern const struct bpf_func_proto bpf_set_retval_proto;
+extern const struct bpf_func_proto bpf_get_retval_proto;
 
 const struct bpf_func_proto *tracing_prog_func_proto(
   enum bpf_func_id func_id, const struct bpf_prog *prog);
@@ -2485,6 +2508,7 @@ int bpf_arch_text_invalidate(void *dst, size_t len);
 
 struct btf_id_set;
 bool btf_id_set_contains(const struct btf_id_set *set, u32 id);
+int btf_id_set_index(const struct btf_id_set *set, u32 id);
 
 #define MAX_BPRINTF_VARARGS		12
 
diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
index 479c101546ad..61787a5f6af9 100644
--- a/include/linux/bpf_lsm.h
+++ b/include/linux/bpf_lsm.h
@@ -42,6 +42,9 @@ extern const struct bpf_func_proto bpf_inode_storage_get_proto;
 extern const struct bpf_func_proto bpf_inode_storage_delete_proto;
 void bpf_inode_storage_free(struct inode *inode);
 
+int bpf_lsm_hook_idx(u32 btf_id);
+void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func);
+
 #else /* !CONFIG_BPF_LSM */
 
 static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id)
@@ -65,6 +68,16 @@ static inline void bpf_inode_storage_free(struct inode *inode)
 {
 }
 
+static inline void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
+					   bpf_func_t *bpf_func)
+{
+}
+
+static inline int bpf_lsm_hook_idx(u32 btf_id)
+{
+	return -EINVAL;
+}
+
 #endif /* CONFIG_BPF_LSM */
 
 #endif /* _LINUX_BPF_LSM_H */
diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index 335a19092368..252a4befeab1 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -179,7 +179,8 @@ extern struct btf_id_set name;
 	BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, udp_sock)			\
 	BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock)			\
 	BTF_SOCK_TYPE(BTF_SOCK_TYPE_UNIX, unix_sock)			\
-	BTF_SOCK_TYPE(BTF_SOCK_TYPE_MPTCP, mptcp_sock)
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_MPTCP, mptcp_sock)			\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCKET, socket)
 
 enum {
 #define BTF_SOCK_TYPE(name, str) name,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e81362891596..b7479898c879 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -998,6 +998,7 @@ enum bpf_attach_type {
 	BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
 	BPF_PERF_EVENT,
 	BPF_TRACE_KPROBE_MULTI,
+	BPF_LSM_CGROUP,
 	__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index c1351df9f7ee..0f72020bfdcf 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -16,6 +16,7 @@
 #include <linux/bpf_local_storage.h>
 #include <linux/btf_ids.h>
 #include <linux/ima.h>
+#include <linux/bpf-cgroup.h>
 
 /* For every LSM hook that allows attachment of BPF programs, declare a nop
  * function where a BPF program can be attached.
@@ -35,6 +36,44 @@ BTF_SET_START(bpf_lsm_hooks)
 #undef LSM_HOOK
 BTF_SET_END(bpf_lsm_hooks)
 
+/* List of LSM hooks that should operate on 'current' cgroup regardless
+ * of function signature.
+ */
+BTF_SET_START(bpf_lsm_current_hooks)
+/* operate on freshly allocated sk without any cgroup association */
+BTF_ID(func, bpf_lsm_sk_alloc_security)
+BTF_ID(func, bpf_lsm_sk_free_security)
+BTF_SET_END(bpf_lsm_current_hooks)
+
+void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
+			     bpf_func_t *bpf_func)
+{
+	const struct btf_param *args;
+
+	if (btf_type_vlen(prog->aux->attach_func_proto) < 1 ||
+	    btf_id_set_contains(&bpf_lsm_current_hooks,
+				prog->aux->attach_btf_id)) {
+		*bpf_func = __cgroup_bpf_run_lsm_current;
+		return;
+	}
+
+	args = btf_params(prog->aux->attach_func_proto);
+
+#ifdef CONFIG_NET
+	if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCKET])
+		*bpf_func = __cgroup_bpf_run_lsm_socket;
+	else if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCK])
+		*bpf_func = __cgroup_bpf_run_lsm_sock;
+	else
+#endif
+		*bpf_func = __cgroup_bpf_run_lsm_current;
+}
+
+int bpf_lsm_hook_idx(u32 btf_id)
+{
+	return btf_id_set_index(&bpf_lsm_hooks, btf_id);
+}
+
 int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
 			const struct bpf_prog *prog)
 {
@@ -158,6 +197,15 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL;
 	case BPF_FUNC_get_attach_cookie:
 		return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL;
+	case BPF_FUNC_get_local_storage:
+		return prog->expected_attach_type == BPF_LSM_CGROUP ?
+			&bpf_get_local_storage_proto : NULL;
+	case BPF_FUNC_set_retval:
+		return prog->expected_attach_type == BPF_LSM_CGROUP ?
+			&bpf_set_retval_proto : NULL;
+	case BPF_FUNC_get_retval:
+		return prog->expected_attach_type == BPF_LSM_CGROUP ?
+			&bpf_get_retval_proto : NULL;
 	default:
 		return tracing_prog_func_proto(func_id, prog);
 	}
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2e2066d6af94..7c1fe422ed3f 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5363,6 +5363,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 
 	if (arg == nr_args) {
 		switch (prog->expected_attach_type) {
+		case BPF_LSM_CGROUP:
 		case BPF_LSM_MAC:
 		case BPF_TRACE_FEXIT:
 			/* When LSM programs are attached to void LSM hooks
@@ -6842,6 +6843,16 @@ static int btf_id_cmp_func(const void *a, const void *b)
 	return *pa - *pb;
 }
 
+int btf_id_set_index(const struct btf_id_set *set, u32 id)
+{
+	const u32 *p;
+
+	p = bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func);
+	if (!p)
+		return -1;
+	return p - set->ids;
+}
+
 bool btf_id_set_contains(const struct btf_id_set *set, u32 id)
 {
 	return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 4adb4f3ecb7f..9cf41dd4f96f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -14,6 +14,8 @@
 #include <linux/string.h>
 #include <linux/bpf.h>
 #include <linux/bpf-cgroup.h>
+#include <linux/bpf_lsm.h>
+#include <linux/bpf_verifier.h>
 #include <net/sock.h>
 #include <net/bpf_sk_storage.h>
 
@@ -61,6 +63,87 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
 	return run_ctx.retval;
 }
 
+unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
+				       const struct bpf_insn *insn)
+{
+	const struct bpf_prog *shim_prog;
+	struct sock *sk;
+	struct cgroup *cgrp;
+	int ret = 0;
+	u64 *args;
+
+	args = (u64 *)ctx;
+	sk = (void *)(unsigned long)args[0];
+	/*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
+	shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
+
+	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	if (likely(cgrp))
+		ret = bpf_prog_run_array_cg(&cgrp->bpf,
+					    shim_prog->aux->cgroup_atype,
+					    ctx, bpf_prog_run, 0, NULL);
+	return ret;
+}
+
+unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx,
+					 const struct bpf_insn *insn)
+{
+	const struct bpf_prog *shim_prog;
+	struct socket *sock;
+	struct cgroup *cgrp;
+	int ret = 0;
+	u64 *args;
+
+	args = (u64 *)ctx;
+	sock = (void *)(unsigned long)args[0];
+	/*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
+	shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
+
+	cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data);
+	if (likely(cgrp))
+		ret = bpf_prog_run_array_cg(&cgrp->bpf,
+					    shim_prog->aux->cgroup_atype,
+					    ctx, bpf_prog_run, 0, NULL);
+	return ret;
+}
+
+unsigned int __cgroup_bpf_run_lsm_current(const void *ctx,
+					  const struct bpf_insn *insn)
+{
+	const struct bpf_prog *shim_prog;
+	struct cgroup *cgrp;
+	int ret = 0;
+
+	/*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
+	shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
+
+	/* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. */
+	cgrp = task_dfl_cgroup(current);
+	if (likely(cgrp))
+		ret = bpf_prog_run_array_cg(&cgrp->bpf,
+					    shim_prog->aux->cgroup_atype,
+					    ctx, bpf_prog_run, 0, NULL);
+	return ret;
+}
+
+#ifdef CONFIG_BPF_LSM
+static enum cgroup_bpf_attach_type
+bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
+{
+	if (attach_type != BPF_LSM_CGROUP)
+		return to_cgroup_bpf_attach_type(attach_type);
+	return CGROUP_LSM_START + bpf_lsm_hook_idx(attach_btf_id);
+}
+#else
+static enum cgroup_bpf_attach_type
+bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
+{
+	if (attach_type != BPF_LSM_CGROUP)
+		return to_cgroup_bpf_attach_type(attach_type);
+	return -EOPNOTSUPP;
+}
+#endif /* CONFIG_BPF_LSM */
+
 void cgroup_bpf_offline(struct cgroup *cgrp)
 {
 	cgroup_get(cgrp);
@@ -163,10 +246,16 @@ static void cgroup_bpf_release(struct work_struct *work)
 
 		hlist_for_each_entry_safe(pl, pltmp, progs, node) {
 			hlist_del(&pl->node);
-			if (pl->prog)
+			if (pl->prog) {
+				if (pl->prog->expected_attach_type == BPF_LSM_CGROUP)
+					bpf_trampoline_unlink_cgroup_shim(pl->prog);
 				bpf_prog_put(pl->prog);
-			if (pl->link)
+			}
+			if (pl->link) {
+				if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP)
+					bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog);
 				bpf_cgroup_link_auto_detach(pl->link);
+			}
 			kfree(pl);
 			static_branch_dec(&cgroup_bpf_enabled_key[atype]);
 		}
@@ -479,6 +568,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
 	struct bpf_prog *old_prog = NULL;
 	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
 	struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
+	struct bpf_prog *new_prog = prog ? : link->link.prog;
 	enum cgroup_bpf_attach_type atype;
 	struct bpf_prog_list *pl;
 	struct hlist_head *progs;
@@ -495,7 +585,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
 		/* replace_prog implies BPF_F_REPLACE, and vice versa */
 		return -EINVAL;
 
-	atype = to_cgroup_bpf_attach_type(type);
+	atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id);
 	if (atype < 0)
 		return -EINVAL;
 
@@ -549,17 +639,30 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
 	bpf_cgroup_storages_assign(pl->storage, storage);
 	cgrp->bpf.flags[atype] = saved_flags;
 
+	if (type == BPF_LSM_CGROUP) {
+		err = bpf_trampoline_link_cgroup_shim(new_prog, atype);
+		if (err)
+			goto cleanup;
+	}
+
 	err = update_effective_progs(cgrp, atype);
 	if (err)
-		goto cleanup;
+		goto cleanup_trampoline;
 
-	if (old_prog)
+	if (old_prog) {
+		if (type == BPF_LSM_CGROUP)
+			bpf_trampoline_unlink_cgroup_shim(old_prog);
 		bpf_prog_put(old_prog);
-	else
+	} else {
 		static_branch_inc(&cgroup_bpf_enabled_key[atype]);
+	}
 	bpf_cgroup_storages_link(new_storage, cgrp, type);
 	return 0;
 
+cleanup_trampoline:
+	if (type == BPF_LSM_CGROUP)
+		bpf_trampoline_unlink_cgroup_shim(new_prog);
+
 cleanup:
 	if (old_prog) {
 		pl->prog = old_prog;
@@ -651,7 +754,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
 	struct hlist_head *progs;
 	bool found = false;
 
-	atype = to_cgroup_bpf_attach_type(link->type);
+	atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id);
 	if (atype < 0)
 		return -EINVAL;
 
@@ -803,9 +906,15 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 	struct bpf_prog *old_prog;
 	struct bpf_prog_list *pl;
 	struct hlist_head *progs;
+	u32 attach_btf_id = 0;
 	u32 flags;
 
-	atype = to_cgroup_bpf_attach_type(type);
+	if (prog)
+		attach_btf_id = prog->aux->attach_btf_id;
+	if (link)
+		attach_btf_id = link->link.prog->aux->attach_btf_id;
+
+	atype = bpf_cgroup_atype_find(type, attach_btf_id);
 	if (atype < 0)
 		return -EINVAL;
 
@@ -839,8 +948,11 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 	if (hlist_empty(progs))
 		/* last program was detached, reset flags to zero */
 		cgrp->bpf.flags[atype] = 0;
-	if (old_prog)
+	if (old_prog) {
+		if (type == BPF_LSM_CGROUP)
+			bpf_trampoline_unlink_cgroup_shim(old_prog);
 		bpf_prog_put(old_prog);
+	}
 	static_branch_dec(&cgroup_bpf_enabled_key[atype]);
 	return 0;
 }
@@ -999,6 +1111,8 @@ static void bpf_cgroup_link_release(struct bpf_link *link)
 
 	WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
 				    cg_link->type));
+	if (cg_link->type == BPF_LSM_CGROUP)
+		bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog);
 
 	cg = cg_link->cgroup;
 	cg_link->cgroup = NULL;
@@ -1343,7 +1457,7 @@ BPF_CALL_0(bpf_get_retval)
 	return ctx->retval;
 }
 
-static const struct bpf_func_proto bpf_get_retval_proto = {
+const struct bpf_func_proto bpf_get_retval_proto = {
 	.func		= bpf_get_retval,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
@@ -1358,7 +1472,7 @@ BPF_CALL_1(bpf_set_retval, int, retval)
 	return 0;
 }
 
-static const struct bpf_func_proto bpf_set_retval_proto = {
+const struct bpf_func_proto bpf_set_retval_proto = {
 	.func		= bpf_set_retval,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f023cb399e3f..4cc10b942a3c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2666,6 +2666,8 @@ const struct bpf_func_proto bpf_get_local_storage_proto __weak;
 const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak;
 const struct bpf_func_proto bpf_snprintf_btf_proto __weak;
 const struct bpf_func_proto bpf_seq_printf_btf_proto __weak;
+const struct bpf_func_proto bpf_set_retval_proto __weak;
+const struct bpf_func_proto bpf_get_retval_proto __weak;
 
 const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
 {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7d5af5b99f0d..626b8f7d237b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3416,6 +3416,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
 		return BPF_PROG_TYPE_SK_LOOKUP;
 	case BPF_XDP:
 		return BPF_PROG_TYPE_XDP;
+	case BPF_LSM_CGROUP:
+		return BPF_PROG_TYPE_LSM;
 	default:
 		return BPF_PROG_TYPE_UNSPEC;
 	}
@@ -3469,6 +3471,11 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
 	case BPF_PROG_TYPE_SOCK_OPS:
+	case BPF_PROG_TYPE_LSM:
+		if (ptype == BPF_PROG_TYPE_LSM &&
+		    prog->expected_attach_type != BPF_LSM_CGROUP)
+			return -EINVAL;
+
 		ret = cgroup_bpf_prog_attach(attr, ptype, prog);
 		break;
 	default:
@@ -3506,6 +3513,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
 	case BPF_PROG_TYPE_SOCK_OPS:
+	case BPF_PROG_TYPE_LSM:
 		return cgroup_bpf_prog_detach(attr, ptype);
 	default:
 		return -EINVAL;
@@ -4540,6 +4548,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 			ret = bpf_raw_tp_link_attach(prog, NULL);
 		else if (prog->expected_attach_type == BPF_TRACE_ITER)
 			ret = bpf_iter_link_attach(attr, uattr, prog);
+		else if (prog->expected_attach_type == BPF_LSM_CGROUP)
+			ret = cgroup_bpf_link_attach(attr, prog);
 		else
 			ret = bpf_tracing_prog_attach(prog,
 						      attr->link_create.target_fd,
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 5466e15be61f..d7c251d7fbcd 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -11,6 +11,8 @@
 #include <linux/rcupdate_wait.h>
 #include <linux/module.h>
 #include <linux/static_call.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf_lsm.h>
 
 /* dummy _ops. The verifier will operate on target program's ops. */
 const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -496,6 +498,177 @@ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampolin
 	return err;
 }
 
+#if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL)
+static void bpf_shim_tramp_link_release(struct bpf_link *link)
+{
+	struct bpf_shim_tramp_link *shim_link =
+		container_of(link, struct bpf_shim_tramp_link, link.link);
+
+	/* paired with 'shim_link->trampoline = tr' in bpf_trampoline_link_cgroup_shim */
+	if (!shim_link->trampoline)
+		return;
+
+	WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline));
+	bpf_trampoline_put(shim_link->trampoline);
+}
+
+static void bpf_shim_tramp_link_dealloc(struct bpf_link *link)
+{
+	struct bpf_shim_tramp_link *shim_link =
+		container_of(link, struct bpf_shim_tramp_link, link.link);
+
+	kfree(shim_link);
+}
+
+static const struct bpf_link_ops bpf_shim_tramp_link_lops = {
+	.release = bpf_shim_tramp_link_release,
+	.dealloc = bpf_shim_tramp_link_dealloc,
+};
+
+static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog,
+						     bpf_func_t bpf_func,
+						     int cgroup_atype)
+{
+	struct bpf_shim_tramp_link *shim_link = NULL;
+	struct bpf_prog *p;
+
+	shim_link = kzalloc(sizeof(*shim_link), GFP_USER);
+	if (!shim_link)
+		return NULL;
+
+	p = bpf_prog_alloc(1, 0);
+	if (!p) {
+		kfree(shim_link);
+		return NULL;
+	}
+
+	p->jited = false;
+	p->bpf_func = bpf_func;
+
+	p->aux->cgroup_atype = cgroup_atype;
+	p->aux->attach_func_proto = prog->aux->attach_func_proto;
+	p->aux->attach_btf_id = prog->aux->attach_btf_id;
+	p->aux->attach_btf = prog->aux->attach_btf;
+	btf_get(p->aux->attach_btf);
+	p->type = BPF_PROG_TYPE_LSM;
+	p->expected_attach_type = BPF_LSM_MAC;
+	bpf_prog_inc(p);
+	bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC,
+		      &bpf_shim_tramp_link_lops, p);
+
+	return shim_link;
+}
+
+static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr,
+						    bpf_func_t bpf_func)
+{
+	struct bpf_tramp_link *link;
+	int kind;
+
+	for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
+		hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) {
+			struct bpf_prog *p = link->link.prog;
+
+			if (p->bpf_func == bpf_func)
+				return container_of(link, struct bpf_shim_tramp_link, link);
+		}
+	}
+
+	return NULL;
+}
+
+int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
+				    int cgroup_atype)
+{
+	struct bpf_shim_tramp_link *shim_link = NULL;
+	struct bpf_attach_target_info tgt_info = {};
+	struct bpf_trampoline *tr;
+	bpf_func_t bpf_func;
+	u64 key;
+	int err;
+
+	err = bpf_check_attach_target(NULL, prog, NULL,
+				      prog->aux->attach_btf_id,
+				      &tgt_info);
+	if (err)
+		return err;
+
+	key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf,
+					 prog->aux->attach_btf_id);
+
+	bpf_lsm_find_cgroup_shim(prog, &bpf_func);
+	tr = bpf_trampoline_get(key, &tgt_info);
+	if (!tr)
+		return  -ENOMEM;
+
+	mutex_lock(&tr->mutex);
+
+	shim_link = cgroup_shim_find(tr, bpf_func);
+	if (shim_link) {
+		/* Reusing existing shim attached by the other program. */
+		bpf_link_inc(&shim_link->link.link);
+
+		mutex_unlock(&tr->mutex);
+		bpf_trampoline_put(tr); /* bpf_trampoline_get above */
+		return 0;
+	}
+
+	/* Allocate and install new shim. */
+
+	shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype);
+	if (!shim_link) {
+		err = -ENOMEM;
+		goto err;
+	}
+
+	err = __bpf_trampoline_link_prog(&shim_link->link, tr);
+	if (err)
+		goto err;
+
+	shim_link->trampoline = tr;
+	/* note, we're still holding tr refcnt from above */
+
+	mutex_unlock(&tr->mutex);
+
+	return 0;
+err:
+	mutex_unlock(&tr->mutex);
+
+	if (shim_link)
+		bpf_link_put(&shim_link->link.link);
+
+	/* have to release tr while _not_ holding its mutex */
+	bpf_trampoline_put(tr); /* bpf_trampoline_get above */
+
+	return err;
+}
+
+void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog)
+{
+	struct bpf_shim_tramp_link *shim_link = NULL;
+	struct bpf_trampoline *tr;
+	bpf_func_t bpf_func;
+	u64 key;
+
+	key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf,
+					 prog->aux->attach_btf_id);
+
+	bpf_lsm_find_cgroup_shim(prog, &bpf_func);
+	tr = bpf_trampoline_lookup(key);
+	if (WARN_ON_ONCE(!tr))
+		return;
+
+	mutex_lock(&tr->mutex);
+	shim_link = cgroup_shim_find(tr, bpf_func);
+	mutex_unlock(&tr->mutex);
+
+	if (shim_link)
+		bpf_link_put(&shim_link->link.link);
+
+	bpf_trampoline_put(tr); /* bpf_trampoline_lookup above */
+}
+#endif
+
 struct bpf_trampoline *bpf_trampoline_get(u64 key,
 					  struct bpf_attach_target_info *tgt_info)
 {
@@ -628,6 +801,31 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_
 	rcu_read_unlock();
 }
 
+u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
+					struct bpf_tramp_run_ctx *run_ctx)
+	__acquires(RCU)
+{
+	/* Runtime stats are exported via actual BPF_LSM_CGROUP
+	 * programs, not the shims.
+	 */
+	rcu_read_lock();
+	migrate_disable();
+
+	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+	return NO_START_TIME;
+}
+
+void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
+					struct bpf_tramp_run_ctx *run_ctx)
+	__releases(RCU)
+{
+	bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+	migrate_enable();
+	rcu_read_unlock();
+}
+
 u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
 {
 	rcu_read_lock_trace();
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4938477912cd..df3ec6b05f05 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7322,6 +7322,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 				reg_type_str(env, regs[BPF_REG_1].type));
 			return -EACCES;
 		}
+		break;
+	case BPF_FUNC_set_retval:
+		if (env->prog->expected_attach_type == BPF_LSM_CGROUP) {
+			if (!env->prog->aux->attach_func_proto->type) {
+				/* Make sure programs that attach to void
+				 * hooks don't try to modify return value.
+				 */
+				verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
+				return -EINVAL;
+			}
+		}
+		break;
 	}
 
 	if (err)
@@ -10527,6 +10539,22 @@ static int check_return_code(struct bpf_verifier_env *env)
 	case BPF_PROG_TYPE_SK_LOOKUP:
 		range = tnum_range(SK_DROP, SK_PASS);
 		break;
+
+	case BPF_PROG_TYPE_LSM:
+		if (env->prog->expected_attach_type != BPF_LSM_CGROUP) {
+			/* Regular BPF_PROG_TYPE_LSM programs can return
+			 * any value.
+			 */
+			return 0;
+		}
+		if (!env->prog->aux->attach_func_proto->type) {
+			/* Make sure programs that attach to void
+			 * hooks don't try to modify return value.
+			 */
+			range = tnum_range(1, 1);
+		}
+		break;
+
 	case BPF_PROG_TYPE_EXT:
 		/* freplace program can return anything as its return value
 		 * depends on the to-be-replaced kernel func or bpf program.
@@ -10543,6 +10571,9 @@ static int check_return_code(struct bpf_verifier_env *env)
 
 	if (!tnum_in(range, reg->var_off)) {
 		verbose_invalid_scalar(env, reg, &range, "program exit", "R0");
+		if (prog->expected_attach_type == BPF_LSM_CGROUP &&
+		    !prog->aux->attach_func_proto->type)
+			verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
 		return -EINVAL;
 	}
 
@@ -14902,6 +14933,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 		fallthrough;
 	case BPF_MODIFY_RETURN:
 	case BPF_LSM_MAC:
+	case BPF_LSM_CGROUP:
 	case BPF_TRACE_FENTRY:
 	case BPF_TRACE_FEXIT:
 		if (!btf_type_is_func(t)) {
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e81362891596..b7479898c879 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -998,6 +998,7 @@ enum bpf_attach_type {
 	BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
 	BPF_PERF_EVENT,
 	BPF_TRACE_KPROBE_MULTI,
+	BPF_LSM_CGROUP,
 	__MAX_BPF_ATTACH_TYPE
 };
 
-- 
cgit 


From c0e19f2c9a3edd38e4b1bdae98eb44555d02bc31 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Tue, 28 Jun 2022 10:43:07 -0700
Subject: bpf: minimize number of allocated lsm slots per program

Previous patch adds 1:1 mapping between all 211 LSM hooks
and bpf_cgroup program array. Instead of reserving a slot per
possible hook, reserve 10 slots per cgroup for lsm programs.
Those slots are dynamically allocated on demand and reclaimed.

struct cgroup_bpf {
	struct bpf_prog_array *    effective[33];        /*     0   264 */
	/* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */
	struct hlist_head          progs[33];            /*   264   264 */
	/* --- cacheline 8 boundary (512 bytes) was 16 bytes ago --- */
	u8                         flags[33];            /*   528    33 */

	/* XXX 7 bytes hole, try to pack */

	struct list_head           storages;             /*   568    16 */
	/* --- cacheline 9 boundary (576 bytes) was 8 bytes ago --- */
	struct bpf_prog_array *    inactive;             /*   584     8 */
	struct percpu_ref          refcnt;               /*   592    16 */
	struct work_struct         release_work;         /*   608    72 */

	/* size: 680, cachelines: 11, members: 7 */
	/* sum members: 673, holes: 1, sum holes: 7 */
	/* last cacheline: 40 bytes */
};

Reviewed-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20220628174314.1216643-5-sdf@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf-cgroup-defs.h |  3 ++-
 include/linux/bpf.h             |  9 +++++++-
 include/linux/bpf_lsm.h         |  6 ------
 kernel/bpf/bpf_lsm.c            |  5 -----
 kernel/bpf/btf.c                | 10 ---------
 kernel/bpf/cgroup.c             | 47 ++++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/core.c               |  7 ++++++
 kernel/bpf/trampoline.c         |  1 +
 8 files changed, 64 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h
index b99f8c3e37ea..7b121bd780eb 100644
--- a/include/linux/bpf-cgroup-defs.h
+++ b/include/linux/bpf-cgroup-defs.h
@@ -11,7 +11,8 @@
 struct bpf_prog_array;
 
 #ifdef CONFIG_BPF_LSM
-#define CGROUP_LSM_NUM 211 /* will be addressed in the next patch */
+/* Maximum number of concurrently attachable per-cgroup LSM hooks. */
+#define CGROUP_LSM_NUM 10
 #else
 #define CGROUP_LSM_NUM 0
 #endif
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 77cd613a00bd..5d2afa55c7c3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2508,7 +2508,6 @@ int bpf_arch_text_invalidate(void *dst, size_t len);
 
 struct btf_id_set;
 bool btf_id_set_contains(const struct btf_id_set *set, u32 id);
-int btf_id_set_index(const struct btf_id_set *set, u32 id);
 
 #define MAX_BPRINTF_VARARGS		12
 
@@ -2545,4 +2544,12 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
 void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
 int bpf_dynptr_check_size(u32 size);
 
+#ifdef CONFIG_BPF_LSM
+void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype);
+void bpf_cgroup_atype_put(int cgroup_atype);
+#else
+static inline void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) {}
+static inline void bpf_cgroup_atype_put(int cgroup_atype) {}
+#endif /* CONFIG_BPF_LSM */
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
index 61787a5f6af9..4bcf76a9bb06 100644
--- a/include/linux/bpf_lsm.h
+++ b/include/linux/bpf_lsm.h
@@ -42,7 +42,6 @@ extern const struct bpf_func_proto bpf_inode_storage_get_proto;
 extern const struct bpf_func_proto bpf_inode_storage_delete_proto;
 void bpf_inode_storage_free(struct inode *inode);
 
-int bpf_lsm_hook_idx(u32 btf_id);
 void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func);
 
 #else /* !CONFIG_BPF_LSM */
@@ -73,11 +72,6 @@ static inline void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
 {
 }
 
-static inline int bpf_lsm_hook_idx(u32 btf_id)
-{
-	return -EINVAL;
-}
-
 #endif /* CONFIG_BPF_LSM */
 
 #endif /* _LINUX_BPF_LSM_H */
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 0f72020bfdcf..83aa431dd52e 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -69,11 +69,6 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
 		*bpf_func = __cgroup_bpf_run_lsm_current;
 }
 
-int bpf_lsm_hook_idx(u32 btf_id)
-{
-	return btf_id_set_index(&bpf_lsm_hooks, btf_id);
-}
-
 int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
 			const struct bpf_prog *prog)
 {
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 7c1fe422ed3f..8d3c7ab8af46 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6843,16 +6843,6 @@ static int btf_id_cmp_func(const void *a, const void *b)
 	return *pa - *pb;
 }
 
-int btf_id_set_index(const struct btf_id_set *set, u32 id)
-{
-	const u32 *p;
-
-	p = bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func);
-	if (!p)
-		return -1;
-	return p - set->ids;
-}
-
 bool btf_id_set_contains(const struct btf_id_set *set, u32 id)
 {
 	return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 9cf41dd4f96f..169cbd0de797 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -127,12 +127,57 @@ unsigned int __cgroup_bpf_run_lsm_current(const void *ctx,
 }
 
 #ifdef CONFIG_BPF_LSM
+struct cgroup_lsm_atype {
+	u32 attach_btf_id;
+	int refcnt;
+};
+
+static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM];
+
 static enum cgroup_bpf_attach_type
 bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
 {
+	int i;
+
+	lockdep_assert_held(&cgroup_mutex);
+
 	if (attach_type != BPF_LSM_CGROUP)
 		return to_cgroup_bpf_attach_type(attach_type);
-	return CGROUP_LSM_START + bpf_lsm_hook_idx(attach_btf_id);
+
+	for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
+		if (cgroup_lsm_atype[i].attach_btf_id == attach_btf_id)
+			return CGROUP_LSM_START + i;
+
+	for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
+		if (cgroup_lsm_atype[i].attach_btf_id == 0)
+			return CGROUP_LSM_START + i;
+
+	return -E2BIG;
+
+}
+
+void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype)
+{
+	int i = cgroup_atype - CGROUP_LSM_START;
+
+	lockdep_assert_held(&cgroup_mutex);
+
+	WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id &&
+		     cgroup_lsm_atype[i].attach_btf_id != attach_btf_id);
+
+	cgroup_lsm_atype[i].attach_btf_id = attach_btf_id;
+	cgroup_lsm_atype[i].refcnt++;
+}
+
+void bpf_cgroup_atype_put(int cgroup_atype)
+{
+	int i = cgroup_atype - CGROUP_LSM_START;
+
+	mutex_lock(&cgroup_mutex);
+	if (--cgroup_lsm_atype[i].refcnt <= 0)
+		cgroup_lsm_atype[i].attach_btf_id = 0;
+	WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0);
+	mutex_unlock(&cgroup_mutex);
 }
 #else
 static enum cgroup_bpf_attach_type
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 4cc10b942a3c..805c2ad5c793 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -107,6 +107,9 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
 	fp->aux->prog = fp;
 	fp->jit_requested = ebpf_jit_enabled();
 	fp->blinding_requested = bpf_jit_blinding_enabled(fp);
+#ifdef CONFIG_CGROUP_BPF
+	aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID;
+#endif
 
 	INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
 	mutex_init(&fp->aux->used_maps_mutex);
@@ -2569,6 +2572,10 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 	aux = container_of(work, struct bpf_prog_aux, work);
 #ifdef CONFIG_BPF_SYSCALL
 	bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
+#endif
+#ifdef CONFIG_CGROUP_BPF
+	if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID)
+		bpf_cgroup_atype_put(aux->cgroup_atype);
 #endif
 	bpf_free_used_maps(aux);
 	bpf_free_used_btfs(aux);
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index d7c251d7fbcd..6cd226584c33 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -555,6 +555,7 @@ static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog
 	bpf_prog_inc(p);
 	bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC,
 		      &bpf_shim_tramp_link_lops, p);
+	bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype);
 
 	return shim_link;
 }
-- 
cgit 


From b79c9fc9551b45953a94abf550b7bd3b00e3a0f9 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Tue, 28 Jun 2022 10:43:08 -0700
Subject: bpf: implement BPF_PROG_QUERY for BPF_LSM_CGROUP

We have two options:
1. Treat all BPF_LSM_CGROUP the same, regardless of attach_btf_id
2. Treat BPF_LSM_CGROUP+attach_btf_id as a separate hook point

I was doing (2) in the original patch, but switching to (1) here:

* bpf_prog_query returns all attached BPF_LSM_CGROUP programs
regardless of attach_btf_id
* attach_btf_id is exported via bpf_prog_info

Reviewed-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20220628174314.1216643-6-sdf@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h |  3 ++
 kernel/bpf/cgroup.c      | 95 ++++++++++++++++++++++++++++++++----------------
 kernel/bpf/syscall.c     |  8 +++-
 3 files changed, 74 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b7479898c879..ad9e7311c4cf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1432,6 +1432,7 @@ union bpf_attr {
 		__u32		attach_flags;
 		__aligned_u64	prog_ids;
 		__u32		prog_cnt;
+		__aligned_u64	prog_attach_flags; /* output: per-program attach_flags */
 	} query;
 
 	struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */
@@ -6076,6 +6077,8 @@ struct bpf_prog_info {
 	__u64 run_cnt;
 	__u64 recursion_misses;
 	__u32 verified_insns;
+	__u32 attach_btf_obj_id;
+	__u32 attach_btf_id;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 169cbd0de797..59b7eb60d5b4 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1017,57 +1017,90 @@ static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 			      union bpf_attr __user *uattr)
 {
+	__u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
 	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
 	enum bpf_attach_type type = attr->query.attach_type;
+	enum cgroup_bpf_attach_type from_atype, to_atype;
 	enum cgroup_bpf_attach_type atype;
 	struct bpf_prog_array *effective;
-	struct hlist_head *progs;
-	struct bpf_prog *prog;
 	int cnt, ret = 0, i;
+	int total_cnt = 0;
 	u32 flags;
 
-	atype = to_cgroup_bpf_attach_type(type);
-	if (atype < 0)
-		return -EINVAL;
-
-	progs = &cgrp->bpf.progs[atype];
-	flags = cgrp->bpf.flags[atype];
+	if (type == BPF_LSM_CGROUP) {
+		if (attr->query.prog_cnt && prog_ids && !prog_attach_flags)
+			return -EINVAL;
 
-	effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
-					      lockdep_is_held(&cgroup_mutex));
+		from_atype = CGROUP_LSM_START;
+		to_atype = CGROUP_LSM_END;
+		flags = 0;
+	} else {
+		from_atype = to_cgroup_bpf_attach_type(type);
+		if (from_atype < 0)
+			return -EINVAL;
+		to_atype = from_atype;
+		flags = cgrp->bpf.flags[from_atype];
+	}
 
-	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
-		cnt = bpf_prog_array_length(effective);
-	else
-		cnt = prog_list_length(progs);
+	for (atype = from_atype; atype <= to_atype; atype++) {
+		if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
+			effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
+							      lockdep_is_held(&cgroup_mutex));
+			total_cnt += bpf_prog_array_length(effective);
+		} else {
+			total_cnt += prog_list_length(&cgrp->bpf.progs[atype]);
+		}
+	}
 
 	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
 		return -EFAULT;
-	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
+	if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt)))
 		return -EFAULT;
-	if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
+	if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt)
 		/* return early if user requested only program count + flags */
 		return 0;
-	if (attr->query.prog_cnt < cnt) {
-		cnt = attr->query.prog_cnt;
+
+	if (attr->query.prog_cnt < total_cnt) {
+		total_cnt = attr->query.prog_cnt;
 		ret = -ENOSPC;
 	}
 
-	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
-		return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
-	} else {
-		struct bpf_prog_list *pl;
-		u32 id;
+	for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
+		if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
+			effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
+							      lockdep_is_held(&cgroup_mutex));
+			cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
+			ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
+		} else {
+			struct hlist_head *progs;
+			struct bpf_prog_list *pl;
+			struct bpf_prog *prog;
+			u32 id;
+
+			progs = &cgrp->bpf.progs[atype];
+			cnt = min_t(int, prog_list_length(progs), total_cnt);
+			i = 0;
+			hlist_for_each_entry(pl, progs, node) {
+				prog = prog_list_prog(pl);
+				id = prog->aux->id;
+				if (copy_to_user(prog_ids + i, &id, sizeof(id)))
+					return -EFAULT;
+				if (++i == cnt)
+					break;
+			}
+		}
 
-		i = 0;
-		hlist_for_each_entry(pl, progs, node) {
-			prog = prog_list_prog(pl);
-			id = prog->aux->id;
-			if (copy_to_user(prog_ids + i, &id, sizeof(id)))
-				return -EFAULT;
-			if (++i == cnt)
-				break;
+		if (prog_attach_flags) {
+			flags = cgrp->bpf.flags[atype];
+
+			for (i = 0; i < cnt; i++)
+				if (copy_to_user(prog_attach_flags + i, &flags, sizeof(flags)))
+					return -EFAULT;
+			prog_attach_flags += cnt;
 		}
+
+		prog_ids += cnt;
+		total_cnt -= cnt;
 	}
 	return ret;
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 626b8f7d237b..ab688d85b2c6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3520,7 +3520,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	}
 }
 
-#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
+#define BPF_PROG_QUERY_LAST_FIELD query.prog_attach_flags
 
 static int bpf_prog_query(const union bpf_attr *attr,
 			  union bpf_attr __user *uattr)
@@ -3556,6 +3556,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_SYSCTL:
 	case BPF_CGROUP_GETSOCKOPT:
 	case BPF_CGROUP_SETSOCKOPT:
+	case BPF_LSM_CGROUP:
 		return cgroup_bpf_prog_query(attr, uattr);
 	case BPF_LIRC_MODE2:
 		return lirc_prog_query(attr, uattr);
@@ -4066,6 +4067,11 @@ static int bpf_prog_get_info_by_fd(struct file *file,
 
 	if (prog->aux->btf)
 		info.btf_id = btf_obj_id(prog->aux->btf);
+	info.attach_btf_id = prog->aux->attach_btf_id;
+	if (prog->aux->attach_btf)
+		info.attach_btf_obj_id = btf_obj_id(prog->aux->attach_btf);
+	else if (prog->aux->dst_prog)
+		info.attach_btf_obj_id = btf_obj_id(prog->aux->dst_prog->aux->attach_btf);
 
 	ulen = info.nr_func_info;
 	info.nr_func_info = prog->aux->func_info_cnt;
-- 
cgit 


From 9113d7e48e9128522b9f5a54dfd30dff10509a92 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Tue, 28 Jun 2022 10:43:09 -0700
Subject: bpf: expose bpf_{g,s}etsockopt to lsm cgroup

I don't see how to make it nice without introducing btf id lists
for the hooks where these helpers are allowed. Some LSM hooks
work on the locked sockets, some are triggering early and
don't grab any locks, so have two lists for now:

1. LSM hooks which trigger under socket lock - minority of the hooks,
   but ideal case for us, we can expose existing BTF-based helpers
2. LSM hooks which trigger without socket lock, but they trigger
   early in the socket creation path where it should be safe to
   do setsockopt without any locks
3. The rest are prohibited. I'm thinking that this use-case might
   be a good gateway to sleeping lsm cgroup hooks in the future.
   We can either expose lock/unlock operations (and add tracking
   to the verifier) or have another set of bpf_setsockopt
   wrapper that grab the locks and might sleep.

Reviewed-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20220628174314.1216643-7-sdf@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  |  2 ++
 kernel/bpf/bpf_lsm.c | 38 +++++++++++++++++++++++++++++++++
 net/core/filter.c    | 60 ++++++++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 93 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5d2afa55c7c3..2b21f2a3452f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2386,6 +2386,8 @@ extern const struct bpf_func_proto bpf_for_each_map_elem_proto;
 extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto;
 extern const struct bpf_func_proto bpf_sk_setsockopt_proto;
 extern const struct bpf_func_proto bpf_sk_getsockopt_proto;
+extern const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto;
+extern const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto;
 extern const struct bpf_func_proto bpf_find_vma_proto;
 extern const struct bpf_func_proto bpf_loop_proto;
 extern const struct bpf_func_proto bpf_copy_from_user_task_proto;
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 83aa431dd52e..d469b7f3deef 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -45,6 +45,24 @@ BTF_ID(func, bpf_lsm_sk_alloc_security)
 BTF_ID(func, bpf_lsm_sk_free_security)
 BTF_SET_END(bpf_lsm_current_hooks)
 
+/* List of LSM hooks that trigger while the socket is properly locked.
+ */
+BTF_SET_START(bpf_lsm_locked_sockopt_hooks)
+BTF_ID(func, bpf_lsm_socket_sock_rcv_skb)
+BTF_ID(func, bpf_lsm_sock_graft)
+BTF_ID(func, bpf_lsm_inet_csk_clone)
+BTF_ID(func, bpf_lsm_inet_conn_established)
+BTF_SET_END(bpf_lsm_locked_sockopt_hooks)
+
+/* List of LSM hooks that trigger while the socket is _not_ locked,
+ * but it's ok to call bpf_{g,s}etsockopt because the socket is still
+ * in the early init phase.
+ */
+BTF_SET_START(bpf_lsm_unlocked_sockopt_hooks)
+BTF_ID(func, bpf_lsm_socket_post_create)
+BTF_ID(func, bpf_lsm_socket_socketpair)
+BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks)
+
 void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
 			     bpf_func_t *bpf_func)
 {
@@ -201,6 +219,26 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_get_retval:
 		return prog->expected_attach_type == BPF_LSM_CGROUP ?
 			&bpf_get_retval_proto : NULL;
+	case BPF_FUNC_setsockopt:
+		if (prog->expected_attach_type != BPF_LSM_CGROUP)
+			return NULL;
+		if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks,
+					prog->aux->attach_btf_id))
+			return &bpf_sk_setsockopt_proto;
+		if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks,
+					prog->aux->attach_btf_id))
+			return &bpf_unlocked_sk_setsockopt_proto;
+		return NULL;
+	case BPF_FUNC_getsockopt:
+		if (prog->expected_attach_type != BPF_LSM_CGROUP)
+			return NULL;
+		if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks,
+					prog->aux->attach_btf_id))
+			return &bpf_sk_getsockopt_proto;
+		if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks,
+					prog->aux->attach_btf_id))
+			return &bpf_unlocked_sk_getsockopt_proto;
+		return NULL;
 	default:
 		return tracing_prog_func_proto(func_id, prog);
 	}
diff --git a/net/core/filter.c b/net/core/filter.c
index 151aa4756bd6..c6941ab0eb52 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5012,8 +5012,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
 	.arg1_type      = ARG_PTR_TO_CTX,
 };
 
-static int _bpf_setsockopt(struct sock *sk, int level, int optname,
-			   char *optval, int optlen)
+static int __bpf_setsockopt(struct sock *sk, int level, int optname,
+			    char *optval, int optlen)
 {
 	char devname[IFNAMSIZ];
 	int val, valbool;
@@ -5024,8 +5024,6 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
 	if (!sk_fullsock(sk))
 		return -EINVAL;
 
-	sock_owned_by_me(sk);
-
 	if (level == SOL_SOCKET) {
 		if (optlen != sizeof(int) && optname != SO_BINDTODEVICE)
 			return -EINVAL;
@@ -5258,14 +5256,20 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
 	return ret;
 }
 
-static int _bpf_getsockopt(struct sock *sk, int level, int optname,
+static int _bpf_setsockopt(struct sock *sk, int level, int optname,
 			   char *optval, int optlen)
+{
+	if (sk_fullsock(sk))
+		sock_owned_by_me(sk);
+	return __bpf_setsockopt(sk, level, optname, optval, optlen);
+}
+
+static int __bpf_getsockopt(struct sock *sk, int level, int optname,
+			    char *optval, int optlen)
 {
 	if (!sk_fullsock(sk))
 		goto err_clear;
 
-	sock_owned_by_me(sk);
-
 	if (level == SOL_SOCKET) {
 		if (optlen != sizeof(int))
 			goto err_clear;
@@ -5360,6 +5364,14 @@ err_clear:
 	return -EINVAL;
 }
 
+static int _bpf_getsockopt(struct sock *sk, int level, int optname,
+			   char *optval, int optlen)
+{
+	if (sk_fullsock(sk))
+		sock_owned_by_me(sk);
+	return __bpf_getsockopt(sk, level, optname, optval, optlen);
+}
+
 BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level,
 	   int, optname, char *, optval, int, optlen)
 {
@@ -5400,6 +5412,40 @@ const struct bpf_func_proto bpf_sk_getsockopt_proto = {
 	.arg5_type	= ARG_CONST_SIZE,
 };
 
+BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level,
+	   int, optname, char *, optval, int, optlen)
+{
+	return __bpf_setsockopt(sk, level, optname, optval, optlen);
+}
+
+const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto = {
+	.func		= bpf_unlocked_sk_setsockopt,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
+	.arg5_type	= ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_unlocked_sk_getsockopt, struct sock *, sk, int, level,
+	   int, optname, char *, optval, int, optlen)
+{
+	return __bpf_getsockopt(sk, level, optname, optval, optlen);
+}
+
+const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto = {
+	.func		= bpf_unlocked_sk_getsockopt,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg5_type	= ARG_CONST_SIZE,
+};
+
 BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx,
 	   int, level, int, optname, char *, optval, int, optlen)
 {
-- 
cgit 


From f163f0302ab69722c052519f4014814bf10026a9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 8 Jun 2022 16:40:21 +0200
Subject: context_tracking: Rename context_tracking_user_enter/exit() to
 user_enter/exit_callable()

context_tracking_user_enter() and context_tracking_user_exit() are
ASM callable versions of user_enter() and user_exit() for architectures
that didn't manage to check the context tracking static key from ASM.
Change those function names to better reflect their purpose.

[ frederic: Apply Max Filippov feedback. ]

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Nicolas Saenz Julienne <nsaenz@kernel.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Cc: Yu Liao <liaoyu15@huawei.com>
Cc: Phil Auld <pauld@redhat.com>
Cc: Paul Gortmaker<paul.gortmaker@windriver.com>
Cc: Alex Belits <abelits@marvell.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Tested-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
---
 arch/arm/kernel/entry-header.S   |  8 ++++----
 arch/csky/kernel/entry.S         |  4 ++--
 arch/riscv/kernel/entry.S        |  6 +++---
 arch/xtensa/kernel/entry.S       |  4 ++--
 include/linux/context_tracking.h |  4 ++--
 kernel/context_tracking.c        | 28 +++++++++++++++++-----------
 6 files changed, 30 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/arch/arm/kernel/entry-header.S b/arch/arm/kernel/entry-header.S
index 5865621bf691..95def2b38d1c 100644
--- a/arch/arm/kernel/entry-header.S
+++ b/arch/arm/kernel/entry-header.S
@@ -369,10 +369,10 @@ ALT_UP_B(.L1_\@)
 #ifdef CONFIG_CONTEXT_TRACKING
 	.if	\save
 	stmdb   sp!, {r0-r3, ip, lr}
-	bl	context_tracking_user_exit
+	bl	user_exit_callable
 	ldmia	sp!, {r0-r3, ip, lr}
 	.else
-	bl	context_tracking_user_exit
+	bl	user_exit_callable
 	.endif
 #endif
 	.endm
@@ -381,10 +381,10 @@ ALT_UP_B(.L1_\@)
 #ifdef CONFIG_CONTEXT_TRACKING
 	.if	\save
 	stmdb   sp!, {r0-r3, ip, lr}
-	bl	context_tracking_user_enter
+	bl	user_enter_callable
 	ldmia	sp!, {r0-r3, ip, lr}
 	.else
-	bl	context_tracking_user_enter
+	bl	user_enter_callable
 	.endif
 #endif
 	.endm
diff --git a/arch/csky/kernel/entry.S b/arch/csky/kernel/entry.S
index a4ababf25e24..bc734d17c16f 100644
--- a/arch/csky/kernel/entry.S
+++ b/arch/csky/kernel/entry.S
@@ -23,7 +23,7 @@
 	mfcr	a0, epsr
 	btsti	a0, 31
 	bt	1f
-	jbsr	context_tracking_user_exit
+	jbsr	user_exit_callable
 	ldw	a0, (sp, LSAVE_A0)
 	ldw	a1, (sp, LSAVE_A1)
 	ldw	a2, (sp, LSAVE_A2)
@@ -160,7 +160,7 @@ ret_from_exception:
 	cmpnei	r10, 0
 	bt	exit_work
 #ifdef CONFIG_CONTEXT_TRACKING
-	jbsr	context_tracking_user_enter
+	jbsr	user_enter_callable
 #endif
 1:
 #ifdef CONFIG_PREEMPTION
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 2e5b88ca11ce..12f6bba57e33 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -112,11 +112,11 @@ _save_context:
 #endif
 
 #ifdef CONFIG_CONTEXT_TRACKING
-	/* If previous state is in user mode, call context_tracking_user_exit. */
+	/* If previous state is in user mode, call user_exit_callable(). */
 	li   a0, SR_PP
 	and a0, s1, a0
 	bnez a0, skip_context_tracking
-	call context_tracking_user_exit
+	call user_exit_callable
 skip_context_tracking:
 #endif
 
@@ -270,7 +270,7 @@ resume_userspace:
 	bnez s1, work_pending
 
 #ifdef CONFIG_CONTEXT_TRACKING
-	call context_tracking_user_enter
+	call user_enter_callable
 #endif
 
 	/* Save unwound kernel stack pointer in thread_info */
diff --git a/arch/xtensa/kernel/entry.S b/arch/xtensa/kernel/entry.S
index e3eae648ba2e..d72bcafae90c 100644
--- a/arch/xtensa/kernel/entry.S
+++ b/arch/xtensa/kernel/entry.S
@@ -458,7 +458,7 @@ KABI_W	or	a3, a3, a2
 #ifdef CONFIG_CONTEXT_TRACKING
 	l32i		abi_tmp0, a1, PT_PS
 	bbci.l		abi_tmp0, PS_UM_BIT, 1f
-	abi_call	context_tracking_user_exit
+	abi_call	user_exit_callable
 1:
 #endif
 
@@ -545,7 +545,7 @@ common_exception_return:
 
 .Lexit_tif_loop_user:
 #ifdef CONFIG_CONTEXT_TRACKING
-	abi_call	context_tracking_user_enter
+	abi_call	user_enter_callable
 #endif
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 	_bbci.l		abi_saved0, TIF_DB_DISABLED, 1f
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 773035124bad..69532cd18f72 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -19,8 +19,8 @@ extern void __ct_user_exit(enum ctx_state state);
 
 extern void context_tracking_enter(enum ctx_state state);
 extern void context_tracking_exit(enum ctx_state state);
-extern void context_tracking_user_enter(void);
-extern void context_tracking_user_exit(void);
+extern void user_enter_callable(void);
+extern void user_exit_callable(void);
 
 static inline void user_enter(void)
 {
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 88c60ab39fbb..8f7dd5799bda 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -134,19 +134,22 @@ void context_tracking_enter(enum ctx_state state)
 NOKPROBE_SYMBOL(context_tracking_enter);
 EXPORT_SYMBOL_GPL(context_tracking_enter);
 
-/*
- * OBSOLETE:
- * This function should be noinstr but it unsafely calls local_irq_restore(),
- * involving illegal RCU uses through tracing and lockdep.
+/**
+ * user_enter_callable() - Unfortunate ASM callable version of user_enter() for
+ *			   archs that didn't manage to check the context tracking
+ *			   static key from low level code.
+ *
+ * This OBSOLETE function should be noinstr but it unsafely calls
+ * local_irq_restore(), involving illegal RCU uses through tracing and lockdep.
  * This is unlikely to be fixed as this function is obsolete. The preferred
  * way is to call user_enter_irqoff(). It should be the arch entry code
  * responsibility to call into context tracking with IRQs disabled.
  */
-void context_tracking_user_enter(void)
+void user_enter_callable(void)
 {
 	user_enter();
 }
-NOKPROBE_SYMBOL(context_tracking_user_enter);
+NOKPROBE_SYMBOL(user_enter_callable);
 
 /**
  * __ct_user_exit - Inform the context tracking that the CPU is
@@ -208,19 +211,22 @@ void context_tracking_exit(enum ctx_state state)
 NOKPROBE_SYMBOL(context_tracking_exit);
 EXPORT_SYMBOL_GPL(context_tracking_exit);
 
-/*
- * OBSOLETE:
- * This function should be noinstr but it unsafely calls local_irq_save(),
+/**
+ * user_exit_callable() - Unfortunate ASM callable version of user_exit() for
+ *			  archs that didn't manage to check the context tracking
+ *			  static key from low level code.
+ *
+ * This OBSOLETE function should be noinstr but it unsafely calls local_irq_save(),
  * involving illegal RCU uses through tracing and lockdep. This is unlikely
  * to be fixed as this function is obsolete. The preferred way is to call
  * user_exit_irqoff(). It should be the arch entry code responsibility to
  * call into context tracking with IRQs disabled.
  */
-void context_tracking_user_exit(void)
+void user_exit_callable(void)
 {
 	user_exit();
 }
-NOKPROBE_SYMBOL(context_tracking_user_exit);
+NOKPROBE_SYMBOL(user_exit_callable);
 
 void __init context_tracking_cpu_set(int cpu)
 {
-- 
cgit 


From fe98db1c6d1ad7349e61a5a2766ad64975bc9ae4 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 8 Jun 2022 16:40:22 +0200
Subject: context_tracking: Rename context_tracking_enter/exit() to
 ct_user_enter/exit()

context_tracking_enter() and context_tracking_exit() have confusing
names that don't explain the fact they are referring to user/guest state.

Use more self-explanatory names and shrink to the new context tracking
prefix instead.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Nicolas Saenz Julienne <nsaenz@kernel.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Cc: Yu Liao <liaoyu15@huawei.com>
Cc: Phil Auld <pauld@redhat.com>
Cc: Paul Gortmaker<paul.gortmaker@windriver.com>
Cc: Alex Belits <abelits@marvell.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Tested-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
---
 include/linux/context_tracking.h | 13 +++++++------
 kernel/context_tracking.c        | 12 ++++++------
 2 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 69532cd18f72..7a5f04ae1758 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -17,21 +17,22 @@ extern void context_tracking_cpu_set(int cpu);
 extern void __ct_user_enter(enum ctx_state state);
 extern void __ct_user_exit(enum ctx_state state);
 
-extern void context_tracking_enter(enum ctx_state state);
-extern void context_tracking_exit(enum ctx_state state);
+extern void ct_user_enter(enum ctx_state state);
+extern void ct_user_exit(enum ctx_state state);
+
 extern void user_enter_callable(void);
 extern void user_exit_callable(void);
 
 static inline void user_enter(void)
 {
 	if (context_tracking_enabled())
-		context_tracking_enter(CONTEXT_USER);
+		ct_user_enter(CONTEXT_USER);
 
 }
 static inline void user_exit(void)
 {
 	if (context_tracking_enabled())
-		context_tracking_exit(CONTEXT_USER);
+		ct_user_exit(CONTEXT_USER);
 }
 
 /* Called with interrupts disabled.  */
@@ -57,7 +58,7 @@ static inline enum ctx_state exception_enter(void)
 
 	prev_ctx = this_cpu_read(context_tracking.state);
 	if (prev_ctx != CONTEXT_KERNEL)
-		context_tracking_exit(prev_ctx);
+		ct_user_exit(prev_ctx);
 
 	return prev_ctx;
 }
@@ -67,7 +68,7 @@ static inline void exception_exit(enum ctx_state prev_ctx)
 	if (!IS_ENABLED(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK) &&
 	    context_tracking_enabled()) {
 		if (prev_ctx != CONTEXT_KERNEL)
-			context_tracking_enter(prev_ctx);
+			ct_user_enter(prev_ctx);
 	}
 }
 
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 8f7dd5799bda..590c920ad57f 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -112,7 +112,7 @@ EXPORT_SYMBOL_GPL(__ct_user_enter);
  * or context_tracking_guest_enter(). It should be the arch entry code
  * responsibility to call into context tracking with IRQs disabled.
  */
-void context_tracking_enter(enum ctx_state state)
+void ct_user_enter(enum ctx_state state)
 {
 	unsigned long flags;
 
@@ -131,8 +131,8 @@ void context_tracking_enter(enum ctx_state state)
 	__ct_user_enter(state);
 	local_irq_restore(flags);
 }
-NOKPROBE_SYMBOL(context_tracking_enter);
-EXPORT_SYMBOL_GPL(context_tracking_enter);
+NOKPROBE_SYMBOL(ct_user_enter);
+EXPORT_SYMBOL_GPL(ct_user_enter);
 
 /**
  * user_enter_callable() - Unfortunate ASM callable version of user_enter() for
@@ -197,7 +197,7 @@ EXPORT_SYMBOL_GPL(__ct_user_exit);
  * or context_tracking_guest_exit(). It should be the arch entry code
  * responsibility to call into context tracking with IRQs disabled.
  */
-void context_tracking_exit(enum ctx_state state)
+void ct_user_exit(enum ctx_state state)
 {
 	unsigned long flags;
 
@@ -208,8 +208,8 @@ void context_tracking_exit(enum ctx_state state)
 	__ct_user_exit(state);
 	local_irq_restore(flags);
 }
-NOKPROBE_SYMBOL(context_tracking_exit);
-EXPORT_SYMBOL_GPL(context_tracking_exit);
+NOKPROBE_SYMBOL(ct_user_exit);
+EXPORT_SYMBOL_GPL(ct_user_exit);
 
 /**
  * user_exit_callable() - Unfortunate ASM callable version of user_exit() for
-- 
cgit 


From 2a0aafce963dab996b843f6cdb49237b0ae4a118 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 8 Jun 2022 16:40:23 +0200
Subject: context_tracking: Rename context_tracking_cpu_set() to
 ct_cpu_track_user()

context_tracking_cpu_set() is called in order to tell a CPU to track
user/kernel transitions. Since context tracking is going to expand in
to also track transitions from/to idle/IRQ/NMIs, the scope
of this function name becomes too broad and needs to be made more
specific. Also shorten the prefix to align with the new namespace.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Nicolas Saenz Julienne <nsaenz@kernel.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Cc: Yu Liao <liaoyu15@huawei.com>
Cc: Phil Auld <pauld@redhat.com>
Cc: Paul Gortmaker<paul.gortmaker@windriver.com>
Cc: Alex Belits <abelits@marvell.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Tested-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
---
 include/linux/context_tracking.h | 2 +-
 kernel/context_tracking.c        | 4 ++--
 kernel/time/tick-sched.c         | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 7a5f04ae1758..63259fece7c7 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -11,7 +11,7 @@
 
 
 #ifdef CONFIG_CONTEXT_TRACKING
-extern void context_tracking_cpu_set(int cpu);
+extern void ct_cpu_track_user(int cpu);
 
 /* Called with interrupts disabled.  */
 extern void __ct_user_enter(enum ctx_state state);
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 590c920ad57f..d361bd52e4e1 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -228,7 +228,7 @@ void user_exit_callable(void)
 }
 NOKPROBE_SYMBOL(user_exit_callable);
 
-void __init context_tracking_cpu_set(int cpu)
+void __init ct_cpu_track_user(int cpu)
 {
 	static __initdata bool initialized = false;
 
@@ -258,6 +258,6 @@ void __init context_tracking_init(void)
 	int cpu;
 
 	for_each_possible_cpu(cpu)
-		context_tracking_cpu_set(cpu);
+		ct_cpu_track_user(cpu);
 }
 #endif
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 58a11f859ac7..de192dcff828 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -571,7 +571,7 @@ void __init tick_nohz_init(void)
 	}
 
 	for_each_cpu(cpu, tick_nohz_full_mask)
-		context_tracking_cpu_set(cpu);
+		ct_cpu_track_user(cpu);
 
 	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
 					"kernel/nohz:predown", NULL,
-- 
cgit 


From 24a9c54182b3758801b8ca6c8c237cc2ff654732 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 8 Jun 2022 16:40:24 +0200
Subject: context_tracking: Split user tracking Kconfig

Context tracking is going to be used not only to track user transitions
but also idle/IRQs/NMIs. The user tracking part will then become a
separate feature. Prepare Kconfig for that.

[ frederic: Apply Max Filippov feedback. ]

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Nicolas Saenz Julienne <nsaenz@kernel.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Cc: Yu Liao <liaoyu15@huawei.com>
Cc: Phil Auld <pauld@redhat.com>
Cc: Paul Gortmaker<paul.gortmaker@windriver.com>
Cc: Alex Belits <abelits@marvell.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Tested-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
---
 .../time/context-tracking/arch-support.txt         |  6 ++---
 arch/Kconfig                                       |  4 +--
 arch/arm/Kconfig                                   |  2 +-
 arch/arm/kernel/entry-common.S                     |  4 +--
 arch/arm/kernel/entry-header.S                     |  4 +--
 arch/arm64/Kconfig                                 |  2 +-
 arch/csky/Kconfig                                  |  2 +-
 arch/csky/kernel/entry.S                           |  4 +--
 arch/loongarch/Kconfig                             |  2 +-
 arch/mips/Kconfig                                  |  2 +-
 arch/powerpc/Kconfig                               |  2 +-
 arch/powerpc/include/asm/context_tracking.h        |  2 +-
 arch/riscv/Kconfig                                 |  2 +-
 arch/riscv/kernel/entry.S                          |  6 ++---
 arch/sparc/Kconfig                                 |  2 +-
 arch/sparc/kernel/rtrap_64.S                       |  2 +-
 arch/x86/Kconfig                                   |  4 +--
 arch/xtensa/Kconfig                                |  2 +-
 arch/xtensa/kernel/entry.S                         |  4 +--
 include/linux/context_tracking.h                   | 12 ++++-----
 include/linux/context_tracking_state.h             |  4 +--
 init/Kconfig                                       |  4 +--
 kernel/context_tracking.c                          |  6 ++++-
 kernel/sched/core.c                                |  2 +-
 kernel/time/Kconfig                                | 31 ++++++++++++++--------
 25 files changed, 65 insertions(+), 52 deletions(-)

(limited to 'include')

diff --git a/Documentation/features/time/context-tracking/arch-support.txt b/Documentation/features/time/context-tracking/arch-support.txt
index c9e0a16290e6..e59071a49090 100644
--- a/Documentation/features/time/context-tracking/arch-support.txt
+++ b/Documentation/features/time/context-tracking/arch-support.txt
@@ -1,7 +1,7 @@
 #
-# Feature name:          context-tracking
-#         Kconfig:       HAVE_CONTEXT_TRACKING
-#         description:   arch supports context tracking for NO_HZ_FULL
+# Feature name:          user-context-tracking
+#         Kconfig:       HAVE_CONTEXT_TRACKING_USER
+#         description:   arch supports user context tracking for NO_HZ_FULL
 #
     -----------------------
     |         arch |status|
diff --git a/arch/Kconfig b/arch/Kconfig
index fcf9a41a4ef5..154b7b78da09 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -774,7 +774,7 @@ config HAVE_ARCH_WITHIN_STACK_FRAMES
 	  and similar) by implementing an inline arch_within_stack_frames(),
 	  which is used by CONFIG_HARDENED_USERCOPY.
 
-config HAVE_CONTEXT_TRACKING
+config HAVE_CONTEXT_TRACKING_USER
 	bool
 	help
 	  Provide kernel/user boundaries probes necessary for subsystems
@@ -785,7 +785,7 @@ config HAVE_CONTEXT_TRACKING
 	  protected inside rcu_irq_enter/rcu_irq_exit() but preemption or signal
 	  handling on irq exit still need to be protected.
 
-config HAVE_CONTEXT_TRACKING_OFFSTACK
+config HAVE_CONTEXT_TRACKING_USER_OFFSTACK
 	bool
 	help
 	  Architecture neither relies on exception_enter()/exception_exit()
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 7630ba9cb6cc..9acc6aac5912 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -84,7 +84,7 @@ config ARM
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE if ARM_LPAE
 	select HAVE_ARM_SMCCC if CPU_V7
 	select HAVE_EBPF_JIT if !CPU_ENDIAN_BE32
-	select HAVE_CONTEXT_TRACKING
+	select HAVE_CONTEXT_TRACKING_USER
 	select HAVE_C_RECORDMCOUNT
 	select HAVE_BUILDTIME_MCOUNT_SORT
 	select HAVE_DEBUG_KMEMLEAK if !XIP_KERNEL
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index 7aa3ded4af92..37a0125fc926 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -28,7 +28,7 @@
 #include "entry-header.S"
 
 saved_psr	.req	r8
-#if defined(CONFIG_TRACE_IRQFLAGS) || defined(CONFIG_CONTEXT_TRACKING)
+#if defined(CONFIG_TRACE_IRQFLAGS) || defined(CONFIG_CONTEXT_TRACKING_USER)
 saved_pc	.req	r9
 #define TRACE(x...) x
 #else
@@ -38,7 +38,7 @@ saved_pc	.req	lr
 
 	.section .entry.text,"ax",%progbits
 	.align	5
-#if !(IS_ENABLED(CONFIG_TRACE_IRQFLAGS) || IS_ENABLED(CONFIG_CONTEXT_TRACKING) || \
+#if !(IS_ENABLED(CONFIG_TRACE_IRQFLAGS) || IS_ENABLED(CONFIG_CONTEXT_TRACKING_USER) || \
 	IS_ENABLED(CONFIG_DEBUG_RSEQ))
 /*
  * This is the fast syscall return path.  We do as little as possible here,
diff --git a/arch/arm/kernel/entry-header.S b/arch/arm/kernel/entry-header.S
index 95def2b38d1c..99411fa91350 100644
--- a/arch/arm/kernel/entry-header.S
+++ b/arch/arm/kernel/entry-header.S
@@ -366,7 +366,7 @@ ALT_UP_B(.L1_\@)
  * between user and kernel mode.
  */
 	.macro ct_user_exit, save = 1
-#ifdef CONFIG_CONTEXT_TRACKING
+#ifdef CONFIG_CONTEXT_TRACKING_USER
 	.if	\save
 	stmdb   sp!, {r0-r3, ip, lr}
 	bl	user_exit_callable
@@ -378,7 +378,7 @@ ALT_UP_B(.L1_\@)
 	.endm
 
 	.macro ct_user_enter, save = 1
-#ifdef CONFIG_CONTEXT_TRACKING
+#ifdef CONFIG_CONTEXT_TRACKING_USER
 	.if	\save
 	stmdb   sp!, {r0-r3, ip, lr}
 	bl	user_enter_callable
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1652a9800ebe..7c5dd2af9ca9 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -174,7 +174,7 @@ config ARM64
 	select HAVE_C_RECORDMCOUNT
 	select HAVE_CMPXCHG_DOUBLE
 	select HAVE_CMPXCHG_LOCAL
-	select HAVE_CONTEXT_TRACKING
+	select HAVE_CONTEXT_TRACKING_USER
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_DYNAMIC_FTRACE
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 21d72b078eef..f55ba1745f7b 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -42,7 +42,7 @@ config CSKY
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_MMAP_RND_BITS
 	select HAVE_ARCH_SECCOMP_FILTER
-	select HAVE_CONTEXT_TRACKING
+	select HAVE_CONTEXT_TRACKING_USER
 	select HAVE_VIRT_CPU_ACCOUNTING_GEN
 	select HAVE_DEBUG_BUGVERBOSE
 	select HAVE_DEBUG_KMEMLEAK
diff --git a/arch/csky/kernel/entry.S b/arch/csky/kernel/entry.S
index bc734d17c16f..547b4cd1b24b 100644
--- a/arch/csky/kernel/entry.S
+++ b/arch/csky/kernel/entry.S
@@ -19,7 +19,7 @@
 .endm
 
 .macro	context_tracking
-#ifdef CONFIG_CONTEXT_TRACKING
+#ifdef CONFIG_CONTEXT_TRACKING_USER
 	mfcr	a0, epsr
 	btsti	a0, 31
 	bt	1f
@@ -159,7 +159,7 @@ ret_from_exception:
 	and	r10, r9
 	cmpnei	r10, 0
 	bt	exit_work
-#ifdef CONFIG_CONTEXT_TRACKING
+#ifdef CONFIG_CONTEXT_TRACKING_USER
 	jbsr	user_enter_callable
 #endif
 1:
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 1920d52653b4..130dc65f3c85 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -76,7 +76,7 @@ config LOONGARCH
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select HAVE_ASM_MODVERSIONS
-	select HAVE_CONTEXT_TRACKING
+	select HAVE_CONTEXT_TRACKING_USER
 	select HAVE_COPY_THREAD_TLS
 	select HAVE_DEBUG_STACKOVERFLOW
 	select HAVE_DMA_CONTIGUOUS
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index db09d45d59ec..9457894db237 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -56,7 +56,7 @@ config MIPS
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES
 	select HAVE_ASM_MODVERSIONS
-	select HAVE_CONTEXT_TRACKING
+	select HAVE_CONTEXT_TRACKING_USER
 	select HAVE_TIF_NOHZ
 	select HAVE_C_RECORDMCOUNT
 	select HAVE_DEBUG_KMEMLEAK
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index c2ce2e60c8f0..874c8d81284a 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -202,7 +202,7 @@ config PPC
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ASM_MODVERSIONS
-	select HAVE_CONTEXT_TRACKING		if PPC64
+	select HAVE_CONTEXT_TRACKING_USER		if PPC64
 	select HAVE_C_RECORDMCOUNT
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DEBUG_STACKOVERFLOW
diff --git a/arch/powerpc/include/asm/context_tracking.h b/arch/powerpc/include/asm/context_tracking.h
index f2682b28b050..4b63931c49e0 100644
--- a/arch/powerpc/include/asm/context_tracking.h
+++ b/arch/powerpc/include/asm/context_tracking.h
@@ -2,7 +2,7 @@
 #ifndef _ASM_POWERPC_CONTEXT_TRACKING_H
 #define _ASM_POWERPC_CONTEXT_TRACKING_H
 
-#ifdef CONFIG_CONTEXT_TRACKING
+#ifdef CONFIG_CONTEXT_TRACKING_USER
 #define SCHEDULE_USER bl	schedule_user
 #else
 #define SCHEDULE_USER bl	schedule
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 32ffef9f6e5b..29b46f217345 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -86,7 +86,7 @@ config RISCV
 	select HAVE_ARCH_THREAD_STRUCT_WHITELIST
 	select HAVE_ARCH_VMAP_STACK if MMU && 64BIT
 	select HAVE_ASM_MODVERSIONS
-	select HAVE_CONTEXT_TRACKING
+	select HAVE_CONTEXT_TRACKING_USER
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DMA_CONTIGUOUS if MMU
 	select HAVE_EBPF_JIT if MMU
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 12f6bba57e33..b9eda3fcbd6d 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -111,7 +111,7 @@ _save_context:
 	call __trace_hardirqs_off
 #endif
 
-#ifdef CONFIG_CONTEXT_TRACKING
+#ifdef CONFIG_CONTEXT_TRACKING_USER
 	/* If previous state is in user mode, call user_exit_callable(). */
 	li   a0, SR_PP
 	and a0, s1, a0
@@ -176,7 +176,7 @@ handle_syscall:
 	 */
 	csrs CSR_STATUS, SR_IE
 #endif
-#if defined(CONFIG_TRACE_IRQFLAGS) || defined(CONFIG_CONTEXT_TRACKING)
+#if defined(CONFIG_TRACE_IRQFLAGS) || defined(CONFIG_CONTEXT_TRACKING_USER)
 	/* Recover a0 - a7 for system calls */
 	REG_L a0, PT_A0(sp)
 	REG_L a1, PT_A1(sp)
@@ -269,7 +269,7 @@ resume_userspace:
 	andi s1, s0, _TIF_WORK_MASK
 	bnez s1, work_pending
 
-#ifdef CONFIG_CONTEXT_TRACKING
+#ifdef CONFIG_CONTEXT_TRACKING_USER
 	call user_enter_callable
 #endif
 
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index ba449c47effd..9232411a8821 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -71,7 +71,7 @@ config SPARC64
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_SYSCALL_TRACEPOINTS
-	select HAVE_CONTEXT_TRACKING
+	select HAVE_CONTEXT_TRACKING_USER
 	select HAVE_TIF_NOHZ
 	select HAVE_DEBUG_KMEMLEAK
 	select IOMMU_HELPER
diff --git a/arch/sparc/kernel/rtrap_64.S b/arch/sparc/kernel/rtrap_64.S
index c5fd4b450d9b..eef102765a7e 100644
--- a/arch/sparc/kernel/rtrap_64.S
+++ b/arch/sparc/kernel/rtrap_64.S
@@ -15,7 +15,7 @@
 #include <asm/visasm.h>
 #include <asm/processor.h>
 
-#ifdef CONFIG_CONTEXT_TRACKING
+#ifdef CONFIG_CONTEXT_TRACKING_USER
 # define SCHEDULE_USER schedule_user
 #else
 # define SCHEDULE_USER schedule
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index be0b95e51df6..b0a6dbbb760b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -186,8 +186,8 @@ config X86
 	select HAVE_ASM_MODVERSIONS
 	select HAVE_CMPXCHG_DOUBLE
 	select HAVE_CMPXCHG_LOCAL
-	select HAVE_CONTEXT_TRACKING		if X86_64
-	select HAVE_CONTEXT_TRACKING_OFFSTACK	if HAVE_CONTEXT_TRACKING
+	select HAVE_CONTEXT_TRACKING_USER		if X86_64
+	select HAVE_CONTEXT_TRACKING_USER_OFFSTACK	if HAVE_CONTEXT_TRACKING_USER
 	select HAVE_C_RECORDMCOUNT
 	select HAVE_OBJTOOL_MCOUNT		if HAVE_OBJTOOL
 	select HAVE_BUILDTIME_MCOUNT_SORT
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 0b0f0172cced..7927fed7bc83 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -33,7 +33,7 @@ config XTENSA
 	select HAVE_ARCH_KCSAN
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_TRACEHOOK
-	select HAVE_CONTEXT_TRACKING
+	select HAVE_CONTEXT_TRACKING_USER
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_EXIT_THREAD
diff --git a/arch/xtensa/kernel/entry.S b/arch/xtensa/kernel/entry.S
index d72bcafae90c..fb67d85116e4 100644
--- a/arch/xtensa/kernel/entry.S
+++ b/arch/xtensa/kernel/entry.S
@@ -455,7 +455,7 @@ KABI_W	or	a3, a3, a2
 	abi_call	trace_hardirqs_off
 1:
 #endif
-#ifdef CONFIG_CONTEXT_TRACKING
+#ifdef CONFIG_CONTEXT_TRACKING_USER
 	l32i		abi_tmp0, a1, PT_PS
 	bbci.l		abi_tmp0, PS_UM_BIT, 1f
 	abi_call	user_exit_callable
@@ -544,7 +544,7 @@ common_exception_return:
 	j		.Lrestore_state
 
 .Lexit_tif_loop_user:
-#ifdef CONFIG_CONTEXT_TRACKING
+#ifdef CONFIG_CONTEXT_TRACKING_USER
 	abi_call	user_enter_callable
 #endif
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 63259fece7c7..e35ae66b4794 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -10,7 +10,7 @@
 #include <asm/ptrace.h>
 
 
-#ifdef CONFIG_CONTEXT_TRACKING
+#ifdef CONFIG_CONTEXT_TRACKING_USER
 extern void ct_cpu_track_user(int cpu);
 
 /* Called with interrupts disabled.  */
@@ -52,7 +52,7 @@ static inline enum ctx_state exception_enter(void)
 {
 	enum ctx_state prev_ctx;
 
-	if (IS_ENABLED(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK) ||
+	if (IS_ENABLED(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK) ||
 	    !context_tracking_enabled())
 		return 0;
 
@@ -65,7 +65,7 @@ static inline enum ctx_state exception_enter(void)
 
 static inline void exception_exit(enum ctx_state prev_ctx)
 {
-	if (!IS_ENABLED(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK) &&
+	if (!IS_ENABLED(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK) &&
 	    context_tracking_enabled()) {
 		if (prev_ctx != CONTEXT_KERNEL)
 			ct_user_enter(prev_ctx);
@@ -109,14 +109,14 @@ static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; }
 static __always_inline bool context_tracking_guest_enter(void) { return false; }
 static inline void context_tracking_guest_exit(void) { }
 
-#endif /* !CONFIG_CONTEXT_TRACKING */
+#endif /* !CONFIG_CONTEXT_TRACKING_USER */
 
 #define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond))
 
-#ifdef CONFIG_CONTEXT_TRACKING_FORCE
+#ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE
 extern void context_tracking_init(void);
 #else
 static inline void context_tracking_init(void) { }
-#endif /* CONFIG_CONTEXT_TRACKING_FORCE */
+#endif /* CONFIG_CONTEXT_TRACKING_USER_FORCE */
 
 #endif
diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index edc7b46376a6..2b46afe105a9 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -22,7 +22,7 @@ struct context_tracking {
 	} state;
 };
 
-#ifdef CONFIG_CONTEXT_TRACKING
+#ifdef CONFIG_CONTEXT_TRACKING_USER
 extern struct static_key_false context_tracking_key;
 DECLARE_PER_CPU(struct context_tracking, context_tracking);
 
@@ -45,6 +45,6 @@ static inline bool context_tracking_enabled_this_cpu(void)
 static __always_inline bool context_tracking_enabled(void) { return false; }
 static __always_inline bool context_tracking_enabled_cpu(int cpu) { return false; }
 static __always_inline bool context_tracking_enabled_this_cpu(void) { return false; }
-#endif /* CONFIG_CONTEXT_TRACKING */
+#endif /* CONFIG_CONTEXT_TRACKING_USER */
 
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index c7900e8975f1..06454d19e2f0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -494,11 +494,11 @@ config VIRT_CPU_ACCOUNTING_NATIVE
 
 config VIRT_CPU_ACCOUNTING_GEN
 	bool "Full dynticks CPU time accounting"
-	depends on HAVE_CONTEXT_TRACKING
+	depends on HAVE_CONTEXT_TRACKING_USER
 	depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
 	depends on GENERIC_CLOCKEVENTS
 	select VIRT_CPU_ACCOUNTING
-	select CONTEXT_TRACKING
+	select CONTEXT_TRACKING_USER
 	help
 	  Select this option to enable task and CPU time accounting on full
 	  dynticks systems. This accounting is implemented by watching every
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index d361bd52e4e1..f3dec1be2bf6 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -22,6 +22,8 @@
 #include <linux/export.h>
 #include <linux/kprobes.h>
 
+#ifdef CONFIG_CONTEXT_TRACKING_USER
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/context_tracking.h>
 
@@ -252,7 +254,7 @@ void __init ct_cpu_track_user(int cpu)
 	initialized = true;
 }
 
-#ifdef CONFIG_CONTEXT_TRACKING_FORCE
+#ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE
 void __init context_tracking_init(void)
 {
 	int cpu;
@@ -261,3 +263,5 @@ void __init context_tracking_init(void)
 		ct_cpu_track_user(cpu);
 }
 #endif
+
+#endif /* #ifdef CONFIG_CONTEXT_TRACKING_USER */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index da0bf6fe9ecd..883167a57bf9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6559,7 +6559,7 @@ void __sched schedule_idle(void)
 	} while (need_resched());
 }
 
-#if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK)
+#if defined(CONFIG_CONTEXT_TRACKING_USER) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK)
 asmlinkage __visible void __sched schedule_user(void)
 {
 	/*
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 27b7868b5c30..41f99bcfe9e6 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -73,6 +73,9 @@ config TIME_KUNIT_TEST
 
 	  If unsure, say N.
 
+config CONTEXT_TRACKING
+	bool
+
 if GENERIC_CLOCKEVENTS
 menu "Timers subsystem"
 
@@ -111,7 +114,7 @@ config NO_HZ_FULL
 	# NO_HZ_COMMON dependency
 	# We need at least one periodic CPU for timekeeping
 	depends on SMP
-	depends on HAVE_CONTEXT_TRACKING
+	depends on HAVE_CONTEXT_TRACKING_USER
 	# VIRT_CPU_ACCOUNTING_GEN dependency
 	depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
 	select NO_HZ_COMMON
@@ -137,31 +140,37 @@ config NO_HZ_FULL
 
 endchoice
 
-config CONTEXT_TRACKING
-       bool
+config CONTEXT_TRACKING_USER
+	bool
+	depends on HAVE_CONTEXT_TRACKING_USER
+	select CONTEXT_TRACKING
+	help
+	  Track transitions between kernel and user on behalf of RCU and
+	  tickless cputime accounting. The former case relies on context
+	  tracking to enter/exit RCU extended quiescent states.
 
-config CONTEXT_TRACKING_FORCE
-	bool "Force context tracking"
-	depends on CONTEXT_TRACKING
+config CONTEXT_TRACKING_USER_FORCE
+	bool "Force user context tracking"
+	depends on CONTEXT_TRACKING_USER
 	default y if !NO_HZ_FULL
 	help
 	  The major pre-requirement for full dynticks to work is to
-	  support the context tracking subsystem. But there are also
+	  support the user context tracking subsystem. But there are also
 	  other dependencies to provide in order to make the full
 	  dynticks working.
 
 	  This option stands for testing when an arch implements the
-	  context tracking backend but doesn't yet fulfill all the
+	  user context tracking backend but doesn't yet fulfill all the
 	  requirements to make the full dynticks feature working.
 	  Without the full dynticks, there is no way to test the support
-	  for context tracking and the subsystems that rely on it: RCU
+	  for user context tracking and the subsystems that rely on it: RCU
 	  userspace extended quiescent state and tickless cputime
 	  accounting. This option copes with the absence of the full
-	  dynticks subsystem by forcing the context tracking on all
+	  dynticks subsystem by forcing the user context tracking on all
 	  CPUs in the system.
 
 	  Say Y only if you're working on the development of an
-	  architecture backend for the context tracking.
+	  architecture backend for the user context tracking.
 
 	  Say N otherwise, this option brings an overhead that you
 	  don't want in production.
-- 
cgit 


From 3d410403a572766628947754e2f62c29bcbf9035 Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Tue, 28 Jun 2022 10:51:52 +0200
Subject: net: dsa: add get_pause_stats support

Add support for pause stats

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h |  2 ++
 net/dsa/slave.c   | 11 +++++++++++
 2 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 33283eeda697..ea7bf007f34f 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -895,6 +895,8 @@ struct dsa_switch_ops {
 				  const struct ethtool_rmon_hist_range **ranges);
 	void	(*get_stats64)(struct dsa_switch *ds, int port,
 				   struct rtnl_link_stats64 *s);
+	void	(*get_pause_stats)(struct dsa_switch *ds, int port,
+				   struct ethtool_pause_stats *pause_stats);
 	void	(*self_test)(struct dsa_switch *ds, int port,
 			     struct ethtool_test *etest, u64 *data);
 
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 760ca58307a3..ad6a6663feeb 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1109,6 +1109,16 @@ static int dsa_slave_set_link_ksettings(struct net_device *dev,
 	return phylink_ethtool_ksettings_set(dp->pl, cmd);
 }
 
+static void dsa_slave_get_pause_stats(struct net_device *dev,
+				  struct ethtool_pause_stats *pause_stats)
+{
+	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_pause_stats)
+		ds->ops->get_pause_stats(ds, dp->index, pause_stats);
+}
+
 static void dsa_slave_get_pauseparam(struct net_device *dev,
 				     struct ethtool_pauseparam *pause)
 {
@@ -2100,6 +2110,7 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
 	.get_eee		= dsa_slave_get_eee,
 	.get_link_ksettings	= dsa_slave_get_link_ksettings,
 	.set_link_ksettings	= dsa_slave_set_link_ksettings,
+	.get_pause_stats	= dsa_slave_get_pause_stats,
 	.get_pauseparam		= dsa_slave_get_pauseparam,
 	.set_pauseparam		= dsa_slave_set_pauseparam,
 	.get_rxnfc		= dsa_slave_get_rxnfc,
-- 
cgit 


From 3eb4a4c3442c0642feaf466ecf6fe3cfb4af2c43 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 28 Jun 2022 13:08:31 +0300
Subject: net: switchdev: add reminder near struct switchdev_notifier_fdb_info

br_switchdev_fdb_notify() creates an on-stack FDB info variable, and
initializes it member by member. As such, newly added fields which are
not initialized by br_switchdev_fdb_notify() will contain junk bytes
from the stack.

Other uses of struct switchdev_notifier_fdb_info have a struct
initializer which should put zeroes in the uninitialized fields.

Add a reminder above the structure for future developers. Recently
discussed during review.

Link: https://patchwork.kernel.org/project/netdevbpf/patch/20220524152144.40527-2-schultz.hans+netdev@gmail.com/#24877698
Link: https://patchwork.kernel.org/project/netdevbpf/patch/20220524152144.40527-3-schultz.hans+netdev@gmail.com/#24912269
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://lore.kernel.org/r/20220628100831.2899434-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/switchdev.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index aa0171d5786d..7dcdc97c0bc3 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -239,6 +239,9 @@ struct switchdev_notifier_info {
 	const void *ctx;
 };
 
+/* Remember to update br_switchdev_fdb_populate() when adding
+ * new members to this structure
+ */
 struct switchdev_notifier_fdb_info {
 	struct switchdev_notifier_info info; /* must be first */
 	const unsigned char *addr;
-- 
cgit 


From 07e7fcf1714c5f9930ad27613fea940aedba68da Mon Sep 17 00:00:00 2001
From: Bryan O'Donoghue <bryan.odonoghue@linaro.org>
Date: Wed, 4 May 2022 17:38:32 +0100
Subject: clk: qcom: gcc-msm8939: Add missing SYSTEM_MM_NOC_BFDCD_CLK_SRC

When adding in the indexes for this clock-controller we missed
SYSTEM_MM_NOC_BFDCD_CLK_SRC.

Add it in now.

Fixes: 4c71d6abc4fc ("clk: qcom: Add DT bindings for MSM8939 GCC")
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Krzysztof Kozlowski <krzysztof.kozlowski+dt@linaro.org>
Cc: devicetree@vger.kernel.org
Signed-off-by: Bryan O'Donoghue <bryan.odonoghue@linaro.org>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Link: https://lore.kernel.org/r/20220504163835.40130-2-bryan.odonoghue@linaro.org
---
 include/dt-bindings/clock/qcom,gcc-msm8939.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/qcom,gcc-msm8939.h b/include/dt-bindings/clock/qcom,gcc-msm8939.h
index 0634467c4ce5..2d545ed0d35a 100644
--- a/include/dt-bindings/clock/qcom,gcc-msm8939.h
+++ b/include/dt-bindings/clock/qcom,gcc-msm8939.h
@@ -192,6 +192,7 @@
 #define GCC_VENUS0_CORE0_VCODEC0_CLK		183
 #define GCC_VENUS0_CORE1_VCODEC0_CLK		184
 #define GCC_OXILI_TIMER_CLK			185
+#define SYSTEM_MM_NOC_BFDCD_CLK_SRC		186
 
 /* Indexes for GDSCs */
 #define BIMC_GDSC				0
-- 
cgit 


From c381d02b2fd5f82d2207db1b9b25ff60d0d9c27c Mon Sep 17 00:00:00 2001
From: Yuwei Wang <wangyuweihx@gmail.com>
Date: Wed, 29 Jun 2022 08:48:31 +0000
Subject: sysctl: add proc_dointvec_ms_jiffies_minmax

add proc_dointvec_ms_jiffies_minmax to fit read msecs value to jiffies
with a limited range of values

Signed-off-by: Yuwei Wang <wangyuweihx@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/sysctl.h |  2 ++
 kernel/sysctl.c        | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

(limited to 'include')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 80263f7cdb77..17b42ce89d3e 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -75,6 +75,8 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, void *buffer,
 int proc_dou8vec_minmax(struct ctl_table *table, int write, void *buffer,
 			size_t *lenp, loff_t *ppos);
 int proc_dointvec_jiffies(struct ctl_table *, int, void *, size_t *, loff_t *);
+int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+		void *buffer, size_t *lenp, loff_t *ppos);
 int proc_dointvec_userhz_jiffies(struct ctl_table *, int, void *, size_t *,
 		loff_t *);
 int proc_dointvec_ms_jiffies(struct ctl_table *, int, void *, size_t *,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e52b6e372c60..85c92e2c2570 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1237,6 +1237,30 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
 	return 0;
 }
 
+static int do_proc_dointvec_ms_jiffies_minmax_conv(bool *negp, unsigned long *lvalp,
+						int *valp, int write, void *data)
+{
+	int tmp, ret;
+	struct do_proc_dointvec_minmax_conv_param *param = data;
+	/*
+	 * If writing, first do so via a temporary local int so we can
+	 * bounds-check it before touching *valp.
+	 */
+	int *ip = write ? &tmp : valp;
+
+	ret = do_proc_dointvec_ms_jiffies_conv(negp, lvalp, ip, write, data);
+	if (ret)
+		return ret;
+
+	if (write) {
+		if ((param->min && *param->min > tmp) ||
+				(param->max && *param->max < tmp))
+			return -EINVAL;
+		*valp = tmp;
+	}
+	return 0;
+}
+
 /**
  * proc_dointvec_jiffies - read a vector of integers as seconds
  * @table: the sysctl table
@@ -1259,6 +1283,17 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write,
 		    	    do_proc_dointvec_jiffies_conv,NULL);
 }
 
+int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+			  void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct do_proc_dointvec_minmax_conv_param param = {
+		.min = (int *) table->extra1,
+		.max = (int *) table->extra2,
+	};
+	return do_proc_dointvec(table, write, buffer, lenp, ppos,
+			do_proc_dointvec_ms_jiffies_minmax_conv, &param);
+}
+
 /**
  * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
  * @table: the sysctl table
@@ -1523,6 +1558,12 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write,
 	return -ENOSYS;
 }
 
+int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+				    void *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
 int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
 		    void *buffer, size_t *lenp, loff_t *ppos)
 {
-- 
cgit 


From 211da42eaa45db7b0edfde187dd88a85fbd466b5 Mon Sep 17 00:00:00 2001
From: Yuwei Wang <wangyuweihx@gmail.com>
Date: Wed, 29 Jun 2022 08:48:32 +0000
Subject: net, neigh: introduce interval_probe_time_ms for periodic probe

commit ed6cd6a17896 ("net, neigh: Set lower cap for neigh_managed_work rearming")
fixed a case when DELAY_PROBE_TIME is configured to 0, the processing of the
system work queue hog CPU to 100%, and further more we should introduce
a new option used by periodic probe

Signed-off-by: Yuwei Wang <wangyuweihx@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 Documentation/networking/ip-sysctl.rst |  6 ++++++
 include/net/neighbour.h                |  1 +
 include/uapi/linux/neighbour.h         |  1 +
 include/uapi/linux/sysctl.h            | 37 +++++++++++++++++-----------------
 net/core/neighbour.c                   | 32 +++++++++++++++++++++++++++--
 net/decnet/dn_neigh.c                  |  1 +
 net/ipv4/arp.c                         |  1 +
 net/ipv6/ndisc.c                       |  1 +
 8 files changed, 60 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 9f41961d11d5..4c8bbf5acfd1 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -202,6 +202,12 @@ neigh/default/unres_qlen - INTEGER
 
 	Default: 101
 
+neigh/default/interval_probe_time_ms - INTEGER
+	The probe interval for neighbor entries with NTF_MANAGED flag,
+	the min value is 1.
+
+	Default: 5000
+
 mtu_expires - INTEGER
 	Time, in seconds, that cached PMTU information is kept.
 
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 87419f7f5421..9f0bab0589d9 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -48,6 +48,7 @@ enum {
 	NEIGH_VAR_RETRANS_TIME,
 	NEIGH_VAR_BASE_REACHABLE_TIME,
 	NEIGH_VAR_DELAY_PROBE_TIME,
+	NEIGH_VAR_INTERVAL_PROBE_TIME_MS,
 	NEIGH_VAR_GC_STALETIME,
 	NEIGH_VAR_QUEUE_LEN_BYTES,
 	NEIGH_VAR_PROXY_QLEN,
diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h
index 39c565e460c7..a998bf761635 100644
--- a/include/uapi/linux/neighbour.h
+++ b/include/uapi/linux/neighbour.h
@@ -154,6 +154,7 @@ enum {
 	NDTPA_QUEUE_LENBYTES,		/* u32 */
 	NDTPA_MCAST_REPROBES,		/* u32 */
 	NDTPA_PAD,
+	NDTPA_INTERVAL_PROBE_TIME_MS,	/* u64, msecs */
 	__NDTPA_MAX
 };
 #define NDTPA_MAX (__NDTPA_MAX - 1)
diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h
index 6a3b194c50fe..8981f00204db 100644
--- a/include/uapi/linux/sysctl.h
+++ b/include/uapi/linux/sysctl.h
@@ -584,24 +584,25 @@ enum {
 
 /* /proc/sys/net/<protocol>/neigh/<dev> */
 enum {
-	NET_NEIGH_MCAST_SOLICIT=1,
-	NET_NEIGH_UCAST_SOLICIT=2,
-	NET_NEIGH_APP_SOLICIT=3,
-	NET_NEIGH_RETRANS_TIME=4,
-	NET_NEIGH_REACHABLE_TIME=5,
-	NET_NEIGH_DELAY_PROBE_TIME=6,
-	NET_NEIGH_GC_STALE_TIME=7,
-	NET_NEIGH_UNRES_QLEN=8,
-	NET_NEIGH_PROXY_QLEN=9,
-	NET_NEIGH_ANYCAST_DELAY=10,
-	NET_NEIGH_PROXY_DELAY=11,
-	NET_NEIGH_LOCKTIME=12,
-	NET_NEIGH_GC_INTERVAL=13,
-	NET_NEIGH_GC_THRESH1=14,
-	NET_NEIGH_GC_THRESH2=15,
-	NET_NEIGH_GC_THRESH3=16,
-	NET_NEIGH_RETRANS_TIME_MS=17,
-	NET_NEIGH_REACHABLE_TIME_MS=18,
+	NET_NEIGH_MCAST_SOLICIT = 1,
+	NET_NEIGH_UCAST_SOLICIT = 2,
+	NET_NEIGH_APP_SOLICIT = 3,
+	NET_NEIGH_RETRANS_TIME = 4,
+	NET_NEIGH_REACHABLE_TIME = 5,
+	NET_NEIGH_DELAY_PROBE_TIME = 6,
+	NET_NEIGH_GC_STALE_TIME = 7,
+	NET_NEIGH_UNRES_QLEN = 8,
+	NET_NEIGH_PROXY_QLEN = 9,
+	NET_NEIGH_ANYCAST_DELAY = 10,
+	NET_NEIGH_PROXY_DELAY = 11,
+	NET_NEIGH_LOCKTIME = 12,
+	NET_NEIGH_GC_INTERVAL = 13,
+	NET_NEIGH_GC_THRESH1 = 14,
+	NET_NEIGH_GC_THRESH2 = 15,
+	NET_NEIGH_GC_THRESH3 = 16,
+	NET_NEIGH_RETRANS_TIME_MS = 17,
+	NET_NEIGH_REACHABLE_TIME_MS = 18,
+	NET_NEIGH_INTERVAL_PROBE_TIME_MS = 19,
 };
 
 /* /proc/sys/net/dccp */
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index d8ec70622ecb..6a8c2596ebab 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1579,7 +1579,7 @@ static void neigh_managed_work(struct work_struct *work)
 	list_for_each_entry(neigh, &tbl->managed_list, managed_list)
 		neigh_event_send_probe(neigh, NULL, false);
 	queue_delayed_work(system_power_efficient_wq, &tbl->managed_work,
-			   max(NEIGH_VAR(&tbl->parms, DELAY_PROBE_TIME), HZ));
+			   NEIGH_VAR(&tbl->parms, INTERVAL_PROBE_TIME_MS));
 	write_unlock_bh(&tbl->lock);
 }
 
@@ -2100,7 +2100,9 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
 	    nla_put_msecs(skb, NDTPA_PROXY_DELAY,
 			  NEIGH_VAR(parms, PROXY_DELAY), NDTPA_PAD) ||
 	    nla_put_msecs(skb, NDTPA_LOCKTIME,
-			  NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD))
+			  NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD) ||
+	    nla_put_msecs(skb, NDTPA_INTERVAL_PROBE_TIME_MS,
+			  NEIGH_VAR(parms, INTERVAL_PROBE_TIME_MS), NDTPA_PAD))
 		goto nla_put_failure;
 	return nla_nest_end(skb, nest);
 
@@ -2255,6 +2257,7 @@ static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = {
 	[NDTPA_ANYCAST_DELAY]		= { .type = NLA_U64 },
 	[NDTPA_PROXY_DELAY]		= { .type = NLA_U64 },
 	[NDTPA_LOCKTIME]		= { .type = NLA_U64 },
+	[NDTPA_INTERVAL_PROBE_TIME_MS]	= { .type = NLA_U64, .min = 1 },
 };
 
 static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -2373,6 +2376,10 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
 					      nla_get_msecs(tbp[i]));
 				call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
 				break;
+			case NDTPA_INTERVAL_PROBE_TIME_MS:
+				NEIGH_VAR_SET(p, INTERVAL_PROBE_TIME_MS,
+					      nla_get_msecs(tbp[i]));
+				break;
 			case NDTPA_RETRANS_TIME:
 				NEIGH_VAR_SET(p, RETRANS_TIME,
 					      nla_get_msecs(tbp[i]));
@@ -3562,6 +3569,22 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write,
 	return ret;
 }
 
+static int neigh_proc_dointvec_ms_jiffies_positive(struct ctl_table *ctl, int write,
+						   void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table tmp = *ctl;
+	int ret;
+
+	int min = msecs_to_jiffies(1);
+
+	tmp.extra1 = &min;
+	tmp.extra2 = NULL;
+
+	ret = proc_dointvec_ms_jiffies_minmax(&tmp, write, buffer, lenp, ppos);
+	neigh_proc_update(ctl, write);
+	return ret;
+}
+
 int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer,
 			size_t *lenp, loff_t *ppos)
 {
@@ -3658,6 +3681,9 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write,
 #define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \
 	NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies)
 
+#define NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(attr, name) \
+	NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies_positive)
+
 #define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \
 	NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies)
 
@@ -3676,6 +3702,8 @@ static struct neigh_sysctl_table {
 		NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"),
 		NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"),
 		NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"),
+		NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(INTERVAL_PROBE_TIME_MS,
+						       "interval_probe_time_ms"),
 		NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"),
 		NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"),
 		NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"),
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index fbd98ac853ea..7c569bcc0aca 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -94,6 +94,7 @@ struct neigh_table dn_neigh_table = {
 			[NEIGH_VAR_RETRANS_TIME] = 1 * HZ,
 			[NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ,
 			[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
+			[NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ,
 			[NEIGH_VAR_GC_STALETIME] = 60 * HZ,
 			[NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
 			[NEIGH_VAR_PROXY_QLEN] = 0,
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index ab4a5601c82a..af2f12ffc9ca 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -168,6 +168,7 @@ struct neigh_table arp_tbl = {
 			[NEIGH_VAR_RETRANS_TIME] = 1 * HZ,
 			[NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ,
 			[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
+			[NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ,
 			[NEIGH_VAR_GC_STALETIME] = 60 * HZ,
 			[NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
 			[NEIGH_VAR_PROXY_QLEN] = 64,
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index b0dfe97ea4ee..cd84cbdac0a2 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -128,6 +128,7 @@ struct neigh_table nd_tbl = {
 			[NEIGH_VAR_RETRANS_TIME] = ND_RETRANS_TIMER,
 			[NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME,
 			[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
+			[NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ,
 			[NEIGH_VAR_GC_STALETIME] = 60 * HZ,
 			[NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
 			[NEIGH_VAR_PROXY_QLEN] = 64,
-- 
cgit 


From d6fb6ee1820cb9c49717b8d28c5b2e940cb2e439 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Sat, 18 Jun 2022 13:23:11 +0200
Subject: ACPI: bus: Drop driver member of struct acpi_device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

struct acpi_device::driver tracks the same information as the driver
member of struct acpi_device::dev.

Fix all users of the former to use the latter and drop the redundant
data from struct acpi_device.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/bus.c          | 21 ++++++++++-----------
 drivers/acpi/device_sysfs.c |  2 +-
 include/acpi/acpi_bus.h     |  1 -
 3 files changed, 11 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 479eec8a1ec6..90cabe4fd4d0 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -465,7 +465,6 @@ out_free:
 static void acpi_bus_notify(acpi_handle handle, u32 type, void *data)
 {
 	struct acpi_device *adev;
-	struct acpi_driver *driver;
 	u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE;
 	bool hotplug_event = false;
 
@@ -517,10 +516,13 @@ static void acpi_bus_notify(acpi_handle handle, u32 type, void *data)
 	if (!adev)
 		goto err;
 
-	driver = adev->driver;
-	if (driver && driver->ops.notify &&
-	    (driver->flags & ACPI_DRIVER_ALL_NOTIFY_EVENTS))
-		driver->ops.notify(adev, type);
+	if (adev->dev.driver) {
+		struct acpi_driver *driver = to_acpi_driver(adev->dev.driver);
+
+		if (driver && driver->ops.notify &&
+		    (driver->flags & ACPI_DRIVER_ALL_NOTIFY_EVENTS))
+			driver->ops.notify(adev, type);
+	}
 
 	if (!hotplug_event) {
 		acpi_bus_put_acpi_device(adev);
@@ -539,8 +541,9 @@ static void acpi_bus_notify(acpi_handle handle, u32 type, void *data)
 static void acpi_notify_device(acpi_handle handle, u32 event, void *data)
 {
 	struct acpi_device *device = data;
+	struct acpi_driver *acpi_drv = to_acpi_driver(device->dev.driver);
 
-	device->driver->ops.notify(device, event);
+	acpi_drv->ops.notify(device, event);
 }
 
 static void acpi_notify_device_fixed(void *data)
@@ -1033,8 +1036,6 @@ static int acpi_device_probe(struct device *dev)
 	if (ret)
 		return ret;
 
-	acpi_dev->driver = acpi_drv;
-
 	pr_debug("Driver [%s] successfully bound to device [%s]\n",
 		 acpi_drv->name, acpi_dev->pnp.bus_id);
 
@@ -1044,7 +1045,6 @@ static int acpi_device_probe(struct device *dev)
 			if (acpi_drv->ops.remove)
 				acpi_drv->ops.remove(acpi_dev);
 
-			acpi_dev->driver = NULL;
 			acpi_dev->driver_data = NULL;
 			return ret;
 		}
@@ -1060,7 +1060,7 @@ static int acpi_device_probe(struct device *dev)
 static void acpi_device_remove(struct device *dev)
 {
 	struct acpi_device *acpi_dev = to_acpi_device(dev);
-	struct acpi_driver *acpi_drv = acpi_dev->driver;
+	struct acpi_driver *acpi_drv = to_acpi_driver(dev->driver);
 
 	if (acpi_drv->ops.notify)
 		acpi_device_remove_notify_handler(acpi_dev);
@@ -1068,7 +1068,6 @@ static void acpi_device_remove(struct device *dev)
 	if (acpi_drv->ops.remove)
 		acpi_drv->ops.remove(acpi_dev);
 
-	acpi_dev->driver = NULL;
 	acpi_dev->driver_data = NULL;
 
 	put_device(dev);
diff --git a/drivers/acpi/device_sysfs.c b/drivers/acpi/device_sysfs.c
index d5d6403ba07b..120873dad2cc 100644
--- a/drivers/acpi/device_sysfs.c
+++ b/drivers/acpi/device_sysfs.c
@@ -376,7 +376,7 @@ eject_store(struct device *d, struct device_attribute *attr,
 		return -EINVAL;
 
 	if ((!acpi_device->handler || !acpi_device->handler->hotplug.enabled)
-	    && !acpi_device->driver)
+	    && !d->driver)
 		return -ENODEV;
 
 	status = acpi_get_type(acpi_device->handle, &not_used);
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 54c5566df9fe..ab239a35cb2a 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -379,7 +379,6 @@ struct acpi_device {
 	struct acpi_device_data data;
 	struct acpi_scan_handler *handler;
 	struct acpi_hotplug_context *hp;
-	struct acpi_driver *driver;
 	const struct acpi_gpio_mapping *driver_gpios;
 	void *driver_data;
 	struct device dev;
-- 
cgit 


From 95c8222f0e52b09b7607616274e7cae84d519a9b Mon Sep 17 00:00:00 2001
From: David Jander <david@protonic.nl>
Date: Wed, 29 Jun 2022 16:25:18 +0200
Subject: spi: spi.c: Fix comment style

Capitalize first word in comment where appropriate and add
parentheses to function names.

Reported-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: David Jander <david@protonic.nl>
Link: https://lore.kernel.org/r/20220629142519.3985486-3-david@protonic.nl
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi.c       | 94 ++++++++++++++++++++++++-------------------------
 include/linux/spi/spi.h | 82 +++++++++++++++++++++---------------------
 2 files changed, 88 insertions(+), 88 deletions(-)

(limited to 'include')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 031606251b58..6d69f5dc62bc 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -1354,7 +1354,7 @@ int spi_delay_to_ns(struct spi_delay *_delay, struct spi_transfer *xfer)
 		/* Nothing to do here */
 		break;
 	case SPI_DELAY_UNIT_SCK:
-		/* clock cycles need to be obtained from spi_transfer */
+		/* Clock cycles need to be obtained from spi_transfer */
 		if (!xfer)
 			return -EINVAL;
 		/*
@@ -1403,7 +1403,7 @@ static void _spi_transfer_cs_change_delay(struct spi_message *msg,
 	u32 unit = xfer->cs_change_delay.unit;
 	int ret;
 
-	/* return early on "fast" mode - for everything but USECS */
+	/* Return early on "fast" mode - for everything but USECS */
 	if (!delay) {
 		if (unit == SPI_DELAY_UNIT_USECS)
 			_spi_transfer_delay_ns(default_delay_ns);
@@ -1629,7 +1629,7 @@ static int __spi_pump_transfer_message(struct spi_controller *ctlr,
 	WRITE_ONCE(ctlr->cur_msg_incomplete, true);
 	WRITE_ONCE(ctlr->cur_msg_need_completion, false);
 	reinit_completion(&ctlr->cur_msg_completion);
-	smp_wmb(); /* make these available to spi_finalize_current_message */
+	smp_wmb(); /* Make these available to spi_finalize_current_message() */
 
 	ret = ctlr->transfer_one_message(ctlr, msg);
 	if (ret) {
@@ -1905,7 +1905,7 @@ struct spi_message *spi_get_next_queued_message(struct spi_controller *ctlr)
 	struct spi_message *next;
 	unsigned long flags;
 
-	/* get a pointer to the next message, if any */
+	/* Get a pointer to the next message, if any */
 	spin_lock_irqsave(&ctlr->queue_lock, flags);
 	next = list_first_entry_or_null(&ctlr->queue, struct spi_message,
 					queue);
@@ -2558,7 +2558,7 @@ struct spi_device *acpi_spi_device_alloc(struct spi_controller *ctlr,
 	acpi_dev_free_resource_list(&resource_list);
 
 	if (ret < 0)
-		/* found SPI in _CRS but it points to another controller */
+		/* Found SPI in _CRS but it points to another controller */
 		return ERR_PTR(-ENODEV);
 
 	if (!lookup.max_speed_hz &&
@@ -3014,7 +3014,7 @@ int spi_register_controller(struct spi_controller *ctlr)
 		return status;
 
 	if (ctlr->bus_num >= 0) {
-		/* devices with a fixed bus num must check-in with the num */
+		/* Devices with a fixed bus num must check-in with the num */
 		mutex_lock(&board_lock);
 		id = idr_alloc(&spi_master_idr, ctlr, ctlr->bus_num,
 			ctlr->bus_num + 1, GFP_KERNEL);
@@ -3023,7 +3023,7 @@ int spi_register_controller(struct spi_controller *ctlr)
 			return id == -ENOSPC ? -EBUSY : id;
 		ctlr->bus_num = id;
 	} else if (ctlr->dev.of_node) {
-		/* allocate dynamic bus number using Linux idr */
+		/* Allocate dynamic bus number using Linux idr */
 		id = of_alias_get_id(ctlr->dev.of_node, "spi");
 		if (id >= 0) {
 			ctlr->bus_num = id;
@@ -3082,7 +3082,7 @@ int spi_register_controller(struct spi_controller *ctlr)
 		goto free_bus_id;
 	}
 
-	/* setting last_cs to -1 means no chip selected */
+	/* Setting last_cs to -1 means no chip selected */
 	ctlr->last_cs = -1;
 
 	status = device_add(&ctlr->dev);
@@ -3106,7 +3106,7 @@ int spi_register_controller(struct spi_controller *ctlr)
 			goto free_bus_id;
 		}
 	}
-	/* add statistics */
+	/* Add statistics */
 	ctlr->pcpu_statistics = spi_alloc_pcpu_stats(dev);
 	if (!ctlr->pcpu_statistics) {
 		dev_err(dev, "Error allocating per-cpu statistics\n");
@@ -3209,7 +3209,7 @@ void spi_unregister_controller(struct spi_controller *ctlr)
 
 	device_del(&ctlr->dev);
 
-	/* free bus id */
+	/* Free bus id */
 	mutex_lock(&board_lock);
 	if (found == ctlr)
 		idr_remove(&spi_master_idr, id);
@@ -3268,14 +3268,14 @@ static void __spi_replace_transfers_release(struct spi_controller *ctlr,
 	struct spi_replaced_transfers *rxfer = res;
 	size_t i;
 
-	/* call extra callback if requested */
+	/* Call extra callback if requested */
 	if (rxfer->release)
 		rxfer->release(ctlr, msg, res);
 
-	/* insert replaced transfers back into the message */
+	/* Insert replaced transfers back into the message */
 	list_splice(&rxfer->replaced_transfers, rxfer->replaced_after);
 
-	/* remove the formerly inserted entries */
+	/* Remove the formerly inserted entries */
 	for (i = 0; i < rxfer->inserted; i++)
 		list_del(&rxfer->inserted_transfers[i].transfer_list);
 }
@@ -3308,7 +3308,7 @@ static struct spi_replaced_transfers *spi_replace_transfers(
 	struct spi_transfer *xfer;
 	size_t i;
 
-	/* allocate the structure using spi_res */
+	/* Allocate the structure using spi_res */
 	rxfer = spi_res_alloc(msg->spi, __spi_replace_transfers_release,
 			      struct_size(rxfer, inserted_transfers, insert)
 			      + extradatasize,
@@ -3316,15 +3316,15 @@ static struct spi_replaced_transfers *spi_replace_transfers(
 	if (!rxfer)
 		return ERR_PTR(-ENOMEM);
 
-	/* the release code to invoke before running the generic release */
+	/* The release code to invoke before running the generic release */
 	rxfer->release = release;
 
-	/* assign extradata */
+	/* Assign extradata */
 	if (extradatasize)
 		rxfer->extradata =
 			&rxfer->inserted_transfers[insert];
 
-	/* init the replaced_transfers list */
+	/* Init the replaced_transfers list */
 	INIT_LIST_HEAD(&rxfer->replaced_transfers);
 
 	/*
@@ -3333,7 +3333,7 @@ static struct spi_replaced_transfers *spi_replace_transfers(
 	 */
 	rxfer->replaced_after = xfer_first->transfer_list.prev;
 
-	/* remove the requested number of transfers */
+	/* Remove the requested number of transfers */
 	for (i = 0; i < remove; i++) {
 		/*
 		 * If the entry after replaced_after it is msg->transfers
@@ -3343,14 +3343,14 @@ static struct spi_replaced_transfers *spi_replace_transfers(
 		if (rxfer->replaced_after->next == &msg->transfers) {
 			dev_err(&msg->spi->dev,
 				"requested to remove more spi_transfers than are available\n");
-			/* insert replaced transfers back into the message */
+			/* Insert replaced transfers back into the message */
 			list_splice(&rxfer->replaced_transfers,
 				    rxfer->replaced_after);
 
-			/* free the spi_replace_transfer structure */
+			/* Free the spi_replace_transfer structure... */
 			spi_res_free(rxfer);
 
-			/* and return with an error */
+			/* ...and return with an error */
 			return ERR_PTR(-EINVAL);
 		}
 
@@ -3367,26 +3367,26 @@ static struct spi_replaced_transfers *spi_replace_transfers(
 	 * based on the first transfer to get removed.
 	 */
 	for (i = 0; i < insert; i++) {
-		/* we need to run in reverse order */
+		/* We need to run in reverse order */
 		xfer = &rxfer->inserted_transfers[insert - 1 - i];
 
-		/* copy all spi_transfer data */
+		/* Copy all spi_transfer data */
 		memcpy(xfer, xfer_first, sizeof(*xfer));
 
-		/* add to list */
+		/* Add to list */
 		list_add(&xfer->transfer_list, rxfer->replaced_after);
 
-		/* clear cs_change and delay for all but the last */
+		/* Clear cs_change and delay for all but the last */
 		if (i) {
 			xfer->cs_change = false;
 			xfer->delay.value = 0;
 		}
 	}
 
-	/* set up inserted */
+	/* Set up inserted... */
 	rxfer->inserted = insert;
 
-	/* and register it with spi_res/spi_message */
+	/* ...and register it with spi_res/spi_message */
 	spi_res_add(msg, rxfer);
 
 	return rxfer;
@@ -3403,10 +3403,10 @@ static int __spi_split_transfer_maxsize(struct spi_controller *ctlr,
 	size_t offset;
 	size_t count, i;
 
-	/* calculate how many we have to replace */
+	/* Calculate how many we have to replace */
 	count = DIV_ROUND_UP(xfer->len, maxsize);
 
-	/* create replacement */
+	/* Create replacement */
 	srt = spi_replace_transfers(msg, xfer, 1, count, NULL, 0, gfp);
 	if (IS_ERR(srt))
 		return PTR_ERR(srt);
@@ -3429,9 +3429,9 @@ static int __spi_split_transfer_maxsize(struct spi_controller *ctlr,
 	 */
 	xfers[0].len = min_t(size_t, maxsize, xfer[0].len);
 
-	/* all the others need rx_buf/tx_buf also set */
+	/* All the others need rx_buf/tx_buf also set */
 	for (i = 1, offset = maxsize; i < count; offset += maxsize, i++) {
-		/* update rx_buf, tx_buf and dma */
+		/* Update rx_buf, tx_buf and dma */
 		if (xfers[i].rx_buf)
 			xfers[i].rx_buf += offset;
 		if (xfers[i].rx_dma)
@@ -3441,7 +3441,7 @@ static int __spi_split_transfer_maxsize(struct spi_controller *ctlr,
 		if (xfers[i].tx_dma)
 			xfers[i].tx_dma += offset;
 
-		/* update length */
+		/* Update length */
 		xfers[i].len = min(maxsize, xfers[i].len - offset);
 	}
 
@@ -3451,7 +3451,7 @@ static int __spi_split_transfer_maxsize(struct spi_controller *ctlr,
 	 */
 	*xferp = &xfers[count - 1];
 
-	/* increment statistics counters */
+	/* Increment statistics counters */
 	SPI_STATISTICS_INCREMENT_FIELD(ctlr->pcpu_statistics,
 				       transfers_split_maxsize);
 	SPI_STATISTICS_INCREMENT_FIELD(msg->spi->pcpu_statistics,
@@ -3713,7 +3713,7 @@ static int __spi_validate(struct spi_device *spi, struct spi_message *message)
 			return ret;
 
 		list_for_each_entry(xfer, &message->transfers, transfer_list) {
-			/* don't change cs_change on the last entry in the list */
+			/* Don't change cs_change on the last entry in the list */
 			if (list_is_last(&xfer->transfer_list, &message->transfers))
 				break;
 			xfer->cs_change = 1;
@@ -3806,7 +3806,7 @@ static int __spi_validate(struct spi_device *spi, struct spi_message *message)
 				!(spi->mode & SPI_TX_QUAD))
 				return -EINVAL;
 		}
-		/* check transfer rx_nbits */
+		/* Check transfer rx_nbits */
 		if (xfer->rx_buf) {
 			if (spi->mode & SPI_NO_RX)
 				return -EINVAL;
@@ -4144,7 +4144,7 @@ int spi_bus_lock(struct spi_controller *ctlr)
 	ctlr->bus_lock_flag = 1;
 	spin_unlock_irqrestore(&ctlr->bus_lock_spinlock, flags);
 
-	/* mutex remains locked until spi_bus_unlock is called */
+	/* Mutex remains locked until spi_bus_unlock() is called */
 
 	return 0;
 }
@@ -4173,7 +4173,7 @@ int spi_bus_unlock(struct spi_controller *ctlr)
 }
 EXPORT_SYMBOL_GPL(spi_bus_unlock);
 
-/* portable code must never pass more than 32 bytes */
+/* Portable code must never pass more than 32 bytes */
 #define	SPI_BUFSIZ	max(32, SMP_CACHE_BYTES)
 
 static u8	*buf;
@@ -4239,7 +4239,7 @@ int spi_write_then_read(struct spi_device *spi,
 	x[0].tx_buf = local_buf;
 	x[1].rx_buf = local_buf + n_tx;
 
-	/* do the i/o */
+	/* Do the i/o */
 	status = spi_sync(spi, &message);
 	if (status == 0)
 		memcpy(rxbuf, x[1].rx_buf, n_rx);
@@ -4256,7 +4256,7 @@ EXPORT_SYMBOL_GPL(spi_write_then_read);
 /*-------------------------------------------------------------------------*/
 
 #if IS_ENABLED(CONFIG_OF_DYNAMIC)
-/* must call put_device() when done with returned spi_device device */
+/* Must call put_device() when done with returned spi_device device */
 static struct spi_device *of_find_spi_device_by_node(struct device_node *node)
 {
 	struct device *dev = bus_find_device_by_of_node(&spi_bus_type, node);
@@ -4264,7 +4264,7 @@ static struct spi_device *of_find_spi_device_by_node(struct device_node *node)
 	return dev ? to_spi_device(dev) : NULL;
 }
 
-/* the spi controllers are not using spi_bus, so we find it with another way */
+/* The spi controllers are not using spi_bus, so we find it with another way */
 static struct spi_controller *of_find_spi_controller_by_node(struct device_node *node)
 {
 	struct device *dev;
@@ -4275,7 +4275,7 @@ static struct spi_controller *of_find_spi_controller_by_node(struct device_node
 	if (!dev)
 		return NULL;
 
-	/* reference got in class_find_device */
+	/* Reference got in class_find_device */
 	return container_of(dev, struct spi_controller, dev);
 }
 
@@ -4290,7 +4290,7 @@ static int of_spi_notify(struct notifier_block *nb, unsigned long action,
 	case OF_RECONFIG_CHANGE_ADD:
 		ctlr = of_find_spi_controller_by_node(rd->dn->parent);
 		if (ctlr == NULL)
-			return NOTIFY_OK;	/* not for us */
+			return NOTIFY_OK;	/* Not for us */
 
 		if (of_node_test_and_set_flag(rd->dn, OF_POPULATED)) {
 			put_device(&ctlr->dev);
@@ -4309,19 +4309,19 @@ static int of_spi_notify(struct notifier_block *nb, unsigned long action,
 		break;
 
 	case OF_RECONFIG_CHANGE_REMOVE:
-		/* already depopulated? */
+		/* Already depopulated? */
 		if (!of_node_check_flag(rd->dn, OF_POPULATED))
 			return NOTIFY_OK;
 
-		/* find our device by node */
+		/* Find our device by node */
 		spi = of_find_spi_device_by_node(rd->dn);
 		if (spi == NULL)
-			return NOTIFY_OK;	/* no? not meant for us */
+			return NOTIFY_OK;	/* No? not meant for us */
 
-		/* unregister takes one ref away */
+		/* Unregister takes one ref away */
 		spi_unregister_device(spi);
 
-		/* and put the reference of the find */
+		/* And put the reference of the find */
 		put_device(&spi->dev);
 		break;
 	}
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index eb0d316e3c36..e6c73d5ff1a8 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -176,13 +176,13 @@ extern int spi_delay_exec(struct spi_delay *_delay, struct spi_transfer *xfer);
 struct spi_device {
 	struct device		dev;
 	struct spi_controller	*controller;
-	struct spi_controller	*master;	/* compatibility layer */
+	struct spi_controller	*master;	/* Compatibility layer */
 	u32			max_speed_hz;
 	u8			chip_select;
 	u8			bits_per_word;
 	bool			rt;
-#define SPI_NO_TX	BIT(31)		/* no transmit wire */
-#define SPI_NO_RX	BIT(30)		/* no receive wire */
+#define SPI_NO_TX	BIT(31)		/* No transmit wire */
+#define SPI_NO_RX	BIT(30)		/* No receive wire */
 	/*
 	 * All bits defined above should be covered by SPI_MODE_KERNEL_MASK.
 	 * The SPI_MODE_KERNEL_MASK has the SPI_MODE_USER_MASK counterpart,
@@ -199,14 +199,14 @@ struct spi_device {
 	void			*controller_data;
 	char			modalias[SPI_NAME_SIZE];
 	const char		*driver_override;
-	struct gpio_desc	*cs_gpiod;	/* chip select gpio desc */
-	struct spi_delay	word_delay; /* inter-word delay */
+	struct gpio_desc	*cs_gpiod;	/* Chip select gpio desc */
+	struct spi_delay	word_delay; /* Inter-word delay */
 	/* CS delays */
 	struct spi_delay	cs_setup;
 	struct spi_delay	cs_hold;
 	struct spi_delay	cs_inactive;
 
-	/* the statistics */
+	/* The statistics */
 	struct spi_statistics __percpu	*pcpu_statistics;
 
 	/*
@@ -228,7 +228,7 @@ static inline struct spi_device *to_spi_device(struct device *dev)
 	return dev ? container_of(dev, struct spi_device, dev) : NULL;
 }
 
-/* most drivers won't need to care about device refcounting */
+/* Most drivers won't need to care about device refcounting */
 static inline struct spi_device *spi_dev_get(struct spi_device *spi)
 {
 	return (spi && get_device(&spi->dev)) ? spi : NULL;
@@ -251,7 +251,7 @@ static inline void spi_set_ctldata(struct spi_device *spi, void *state)
 	spi->controller_state = state;
 }
 
-/* device driver data */
+/* Device driver data */
 
 static inline void spi_set_drvdata(struct spi_device *spi, void *data)
 {
@@ -318,7 +318,7 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv)
 
 extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 chip_select);
 
-/* use a define to avoid include chaining to get THIS_MODULE */
+/* Use a define to avoid include chaining to get THIS_MODULE */
 #define spi_register_driver(driver) \
 	__spi_register_driver(THIS_MODULE, driver)
 
@@ -486,7 +486,7 @@ struct spi_controller {
 
 	struct list_head list;
 
-	/* other than negative (== assign one dynamically), bus_num is fully
+	/* Other than negative (== assign one dynamically), bus_num is fully
 	 * board-specific.  usually that simplifies to being SOC-specific.
 	 * example:  one SOC has three SPI controllers, numbered 0..2,
 	 * and one board's schematics might show it using SPI-2.  software
@@ -499,7 +499,7 @@ struct spi_controller {
 	 */
 	u16			num_chipselect;
 
-	/* some SPI controllers pose alignment requirements on DMAable
+	/* Some SPI controllers pose alignment requirements on DMAable
 	 * buffers; let protocol drivers know about these requirements.
 	 */
 	u16			dma_alignment;
@@ -510,29 +510,29 @@ struct spi_controller {
 	/* spi_device.mode flags override flags for this controller */
 	u32			buswidth_override_bits;
 
-	/* bitmask of supported bits_per_word for transfers */
+	/* Bitmask of supported bits_per_word for transfers */
 	u32			bits_per_word_mask;
 #define SPI_BPW_MASK(bits) BIT((bits) - 1)
 #define SPI_BPW_RANGE_MASK(min, max) GENMASK((max) - 1, (min) - 1)
 
-	/* limits on transfer speed */
+	/* Limits on transfer speed */
 	u32			min_speed_hz;
 	u32			max_speed_hz;
 
-	/* other constraints relevant to this driver */
+	/* Other constraints relevant to this driver */
 	u16			flags;
-#define SPI_CONTROLLER_HALF_DUPLEX	BIT(0)	/* can't do full duplex */
-#define SPI_CONTROLLER_NO_RX		BIT(1)	/* can't do buffer read */
-#define SPI_CONTROLLER_NO_TX		BIT(2)	/* can't do buffer write */
-#define SPI_CONTROLLER_MUST_RX		BIT(3)	/* requires rx */
-#define SPI_CONTROLLER_MUST_TX		BIT(4)	/* requires tx */
+#define SPI_CONTROLLER_HALF_DUPLEX	BIT(0)	/* Can't do full duplex */
+#define SPI_CONTROLLER_NO_RX		BIT(1)	/* Can't do buffer read */
+#define SPI_CONTROLLER_NO_TX		BIT(2)	/* Can't do buffer write */
+#define SPI_CONTROLLER_MUST_RX		BIT(3)	/* Requires rx */
+#define SPI_CONTROLLER_MUST_TX		BIT(4)	/* Requires tx */
 
 #define SPI_MASTER_GPIO_SS		BIT(5)	/* GPIO CS must select slave */
 
-	/* flag indicating if the allocation of this struct is devres-managed */
+	/* Flag indicating if the allocation of this struct is devres-managed */
 	bool			devm_allocated;
 
-	/* flag indicating this is an SPI slave controller */
+	/* Flag indicating this is an SPI slave controller */
 	bool			slave;
 
 	/*
@@ -548,11 +548,11 @@ struct spi_controller {
 	/* Used to avoid adding the same CS twice */
 	struct mutex		add_lock;
 
-	/* lock and mutex for SPI bus locking */
+	/* Lock and mutex for SPI bus locking */
 	spinlock_t		bus_lock_spinlock;
 	struct mutex		bus_lock_mutex;
 
-	/* flag indicating that the SPI bus is locked for exclusive use */
+	/* Flag indicating that the SPI bus is locked for exclusive use */
 	bool			bus_lock_flag;
 
 	/* Setup mode and clock, etc (spi driver may call many times).
@@ -573,7 +573,7 @@ struct spi_controller {
 	 */
 	int (*set_cs_timing)(struct spi_device *spi);
 
-	/* bidirectional bulk transfers
+	/* Bidirectional bulk transfers
 	 *
 	 * + The transfer() method may not sleep; its main role is
 	 *   just to add the message to the queue.
@@ -595,7 +595,7 @@ struct spi_controller {
 	int			(*transfer)(struct spi_device *spi,
 						struct spi_message *mesg);
 
-	/* called on release() to free memory provided by spi_controller */
+	/* Called on release() to free memory provided by spi_controller */
 	void			(*cleanup)(struct spi_device *spi);
 
 	/*
@@ -666,14 +666,14 @@ struct spi_controller {
 	s8			unused_native_cs;
 	s8			max_native_cs;
 
-	/* statistics */
+	/* Statistics */
 	struct spi_statistics __percpu	*pcpu_statistics;
 
 	/* DMA channels for use with core dmaengine helpers */
 	struct dma_chan		*dma_tx;
 	struct dma_chan		*dma_rx;
 
-	/* dummy data for full duplex devices */
+	/* Dummy data for full duplex devices */
 	void			*dummy_rx;
 	void			*dummy_tx;
 
@@ -738,7 +738,7 @@ void spi_take_timestamp_post(struct spi_controller *ctlr,
 			     struct spi_transfer *xfer,
 			     size_t progress, bool irqs_off);
 
-/* the spi driver core manages memory for the spi_controller classdev */
+/* The spi driver core manages memory for the spi_controller classdev */
 extern struct spi_controller *__spi_alloc_controller(struct device *host,
 						unsigned int size, bool slave);
 
@@ -808,7 +808,7 @@ typedef void (*spi_res_release_t)(struct spi_controller *ctlr,
 struct spi_res {
 	struct list_head        entry;
 	spi_res_release_t       release;
-	unsigned long long      data[]; /* guarantee ull alignment */
+	unsigned long long      data[]; /* Guarantee ull alignment */
 };
 
 /*---------------------------------------------------------------------------*/
@@ -941,7 +941,7 @@ struct spi_res {
  * and its transfers, ignore them until its completion callback.
  */
 struct spi_transfer {
-	/* it's ok if tx_buf == rx_buf (right?)
+	/* It's ok if tx_buf == rx_buf (right?)
 	 * for MicroWire, one buffer must be null
 	 * buffers must work with dma_*map_single() calls, unless
 	 *   spi_message.is_dma_mapped reports a pre-existing mapping
@@ -1032,24 +1032,24 @@ struct spi_message {
 	 * tell them about such special cases.
 	 */
 
-	/* completion is reported through a callback */
+	/* Completion is reported through a callback */
 	void			(*complete)(void *context);
 	void			*context;
 	unsigned		frame_length;
 	unsigned		actual_length;
 	int			status;
 
-	/* for optional use by whatever driver currently owns the
+	/* For optional use by whatever driver currently owns the
 	 * spi_message ...  between calls to spi_async and then later
 	 * complete(), that's the spi_controller controller driver.
 	 */
 	struct list_head	queue;
 	void			*state;
 
-	/* list of spi_res reources when the spi message is processed */
+	/* List of spi_res reources when the spi message is processed */
 	struct list_head        resources;
 
-	/* spi_prepare_message was called for this message */
+	/* spi_prepare_message() was called for this message */
 	bool			prepared;
 };
 
@@ -1154,7 +1154,7 @@ spi_max_transfer_size(struct spi_device *spi)
 	if (ctlr->max_transfer_size)
 		tr_max = ctlr->max_transfer_size(spi);
 
-	/* transfer size limit must not be greater than message size limit */
+	/* Transfer size limit must not be greater than message size limit */
 	return min(tr_max, msg_max);
 }
 
@@ -1305,7 +1305,7 @@ spi_read(struct spi_device *spi, void *buf, size_t len)
 	return spi_sync_transfer(spi, &t, 1);
 }
 
-/* this copies txbuf and rxbuf data; for small transfers only! */
+/* This copies txbuf and rxbuf data; for small transfers only! */
 extern int spi_write_then_read(struct spi_device *spi,
 		const void *txbuf, unsigned n_tx,
 		void *rxbuf, unsigned n_rx);
@@ -1328,7 +1328,7 @@ static inline ssize_t spi_w8r8(struct spi_device *spi, u8 cmd)
 
 	status = spi_write_then_read(spi, &cmd, 1, &result, 1);
 
-	/* return negative errno or unsigned value */
+	/* Return negative errno or unsigned value */
 	return (status < 0) ? status : result;
 }
 
@@ -1353,7 +1353,7 @@ static inline ssize_t spi_w8r16(struct spi_device *spi, u8 cmd)
 
 	status = spi_write_then_read(spi, &cmd, 1, &result, 2);
 
-	/* return negative errno or unsigned value */
+	/* Return negative errno or unsigned value */
 	return (status < 0) ? status : result;
 }
 
@@ -1433,7 +1433,7 @@ static inline ssize_t spi_w8r16be(struct spi_device *spi, u8 cmd)
  * are active in some dynamic board configuration models.
  */
 struct spi_board_info {
-	/* the device name and module name are coupled, like platform_bus;
+	/* The device name and module name are coupled, like platform_bus;
 	 * "modalias" is normally the driver name.
 	 *
 	 * platform_data goes to spi_device.dev.platform_data,
@@ -1446,7 +1446,7 @@ struct spi_board_info {
 	void		*controller_data;
 	int		irq;
 
-	/* slower signaling on noisy or low voltage boards */
+	/* Slower signaling on noisy or low voltage boards */
 	u32		max_speed_hz;
 
 
@@ -1475,7 +1475,7 @@ struct spi_board_info {
 extern int
 spi_register_board_info(struct spi_board_info const *info, unsigned n);
 #else
-/* board init code may ignore whether SPI is configured or not */
+/* Board init code may ignore whether SPI is configured or not */
 static inline int
 spi_register_board_info(struct spi_board_info const *info, unsigned n)
 	{ return 0; }
-- 
cgit 


From 36a7b63f069630e854beb305e99c151cddd3b8e5 Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Tue, 28 Jun 2022 21:14:35 -0700
Subject: dt-bindings: mailbox: qcom-ipcc: Add NSP1 client

Add a client for the NSP1 found in some recent Qualcomm platforms.

Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Link: https://lore.kernel.org/r/20220629041438.1352536-3-bjorn.andersson@linaro.org
---
 include/dt-bindings/mailbox/qcom-ipcc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/mailbox/qcom-ipcc.h b/include/dt-bindings/mailbox/qcom-ipcc.h
index 9296d0bb5f34..fbfa3febc66d 100644
--- a/include/dt-bindings/mailbox/qcom-ipcc.h
+++ b/include/dt-bindings/mailbox/qcom-ipcc.h
@@ -30,6 +30,7 @@
 #define IPCC_CLIENT_PCIE1		14
 #define IPCC_CLIENT_PCIE2		15
 #define IPCC_CLIENT_SPSS		16
+#define IPCC_CLIENT_NSP1		18
 #define IPCC_CLIENT_TME			23
 #define IPCC_CLIENT_WPSS		24
 
-- 
cgit 


From 72a43046b61a3fe7164a622224bcdfc3cf6b795d Mon Sep 17 00:00:00 2001
From: Chanho Park <chanho61.park@samsung.com>
Date: Wed, 29 Jun 2022 09:41:41 +0900
Subject: tty: serial: samsung_tty: loopback mode support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Internal loopback mode can be supported by setting
UCON register's Loopback Mode bit. The mode & bit can be supported
since s3c2410 and later SoCs. The prefix of LOOPBACK / BIT(5) naming
should be also changed to S3C2410_ in order to avoid confusion.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Chanho Park <chanho61.park@samsung.com>
Link: https://lore.kernel.org/r/20220629004141.51484-1-chanho61.park@samsung.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/samsung_tty.c | 8 ++++++++
 include/linux/serial_s3c.h       | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/tty/serial/samsung_tty.c b/drivers/tty/serial/samsung_tty.c
index 3e0aa2923605..8971fbb49fa3 100644
--- a/drivers/tty/serial/samsung_tty.c
+++ b/drivers/tty/serial/samsung_tty.c
@@ -1018,6 +1018,7 @@ static unsigned int s3c24xx_serial_get_mctrl(struct uart_port *port)
 static void s3c24xx_serial_set_mctrl(struct uart_port *port, unsigned int mctrl)
 {
 	unsigned int umcon = rd_regl(port, S3C2410_UMCON);
+	unsigned int ucon = rd_reg(port, S3C2410_UCON);
 
 	if (mctrl & TIOCM_RTS)
 		umcon |= S3C2410_UMCOM_RTS_LOW;
@@ -1025,6 +1026,13 @@ static void s3c24xx_serial_set_mctrl(struct uart_port *port, unsigned int mctrl)
 		umcon &= ~S3C2410_UMCOM_RTS_LOW;
 
 	wr_regl(port, S3C2410_UMCON, umcon);
+
+	if (mctrl & TIOCM_LOOP)
+		ucon |= S3C2410_UCON_LOOPBACK;
+	else
+		ucon &= ~S3C2410_UCON_LOOPBACK;
+
+	wr_regl(port, S3C2410_UCON, ucon);
 }
 
 static void s3c24xx_serial_break_ctl(struct uart_port *port, int break_state)
diff --git a/include/linux/serial_s3c.h b/include/linux/serial_s3c.h
index dec15f5b3dec..1672cf0810ef 100644
--- a/include/linux/serial_s3c.h
+++ b/include/linux/serial_s3c.h
@@ -83,7 +83,7 @@
 #define S3C2410_UCON_RXIRQMODE	  (1<<0)
 #define S3C2410_UCON_RXFIFO_TOI	  (1<<7)
 #define S3C2443_UCON_RXERR_IRQEN  (1<<6)
-#define S3C2443_UCON_LOOPBACK	  (1<<5)
+#define S3C2410_UCON_LOOPBACK	  (1<<5)
 
 #define S3C2410_UCON_DEFAULT	  (S3C2410_UCON_TXILEVEL  | \
 				   S3C2410_UCON_RXILEVEL  | \
-- 
cgit 


From f9b11229b79c0fb2100b5bb4628a101b1d37fbf6 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Wed, 29 Jun 2022 12:48:41 +0300
Subject: serial: 8250: Fix PM usage_count for console handover
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When console is enabled, univ8250_console_setup() calls
serial8250_console_setup() before .dev is set to uart_port. Therefore,
it will not call pm_runtime_get_sync(). Later, when the actual driver
is going to take over univ8250_console_exit() is called. As .dev is
already set, serial8250_console_exit() makes pm_runtime_put_sync() call
with usage count being zero triggering PM usage count warning
(extra debug for univ8250_console_setup(), univ8250_console_exit(), and
serial8250_register_ports()):

[    0.068987] univ8250_console_setup ttyS0 nodev
[    0.499670] printk: console [ttyS0] enabled
[    0.717955] printk: console [ttyS0] printing thread started
[    1.960163] serial8250_register_ports assigned dev for ttyS0
[    1.976830] printk: console [ttyS0] disabled
[    1.976888] printk: console [ttyS0] printing thread stopped
[    1.977073] univ8250_console_exit ttyS0 usage:0
[    1.977075] serial8250 serial8250: Runtime PM usage count underflow!
[    1.977429] dw-apb-uart.6: ttyS0 at MMIO 0x4010006000 (irq = 33, base_baud = 115200) is a 16550A
[    1.977812] univ8250_console_setup ttyS0 usage:2
[    1.978167] printk: console [ttyS0] printing thread started
[    1.978203] printk: console [ttyS0] enabled

To fix the issue, call pm_runtime_get_sync() in
serial8250_register_ports() as soon as .dev is set for an uart_port
if it has console enabled.

This problem became apparent only recently because 82586a721595 ("PM:
runtime: Avoid device usage count underflows") added the warning
printout. I confirmed this problem also occurs with v5.18 (w/o the
warning printout, obviously).

Fixes: bedb404e91bb ("serial: 8250_port: Don't use power management for kernel console")
Cc: stable <stable@kernel.org>
Tested-by: Tony Lindgren <tony@atomide.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/b4f428e9-491f-daf2-2232-819928dc276e@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250_core.c | 4 ++++
 drivers/tty/serial/serial_core.c    | 5 -----
 include/linux/serial_core.h         | 5 +++++
 3 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
index cfbd2de0ca6e..3f56dbc9432b 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -23,6 +23,7 @@
 #include <linux/sysrq.h>
 #include <linux/delay.h>
 #include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
 #include <linux/tty.h>
 #include <linux/ratelimit.h>
 #include <linux/tty_flip.h>
@@ -559,6 +560,9 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev)
 
 		up->port.dev = dev;
 
+		if (uart_console_enabled(&up->port))
+			pm_runtime_get_sync(up->port.dev);
+
 		serial8250_apply_quirks(up);
 		uart_add_one_port(drv, &up->port);
 	}
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 338ebadfd44b..3dc926d6c00a 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -1941,11 +1941,6 @@ static int uart_proc_show(struct seq_file *m, void *v)
 }
 #endif
 
-static inline bool uart_console_enabled(struct uart_port *port)
-{
-	return uart_console(port) && (port->cons->flags & CON_ENABLED);
-}
-
 static void uart_port_spin_lock_init(struct uart_port *port)
 {
 	spin_lock_init(&port->lock);
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 657a0fc68a3f..fde258b3decd 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -390,6 +390,11 @@ static const bool earlycon_acpi_spcr_enable EARLYCON_USED_OR_UNUSED;
 static inline int setup_earlycon(char *buf) { return 0; }
 #endif
 
+static inline bool uart_console_enabled(struct uart_port *port)
+{
+	return uart_console(port) && (port->cons->flags & CON_ENABLED);
+}
+
 struct uart_port *uart_get_console(struct uart_port *ports, int nr,
 				   struct console *c);
 int uart_parse_earlycon(char *p, unsigned char *iotype, resource_size_t *addr,
-- 
cgit 


From 6e97eba8ad8748fabb795cffc5d9e1a7dcfd7367 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Tue, 28 Jun 2022 18:59:10 +0300
Subject: vfio: Split migration ops from main device ops

vfio core checks whether the driver sets some migration op (e.g.
set_state/get_state) and accordingly calls its op.

However, currently mlx5 driver sets the above ops without regards to its
migration caps.

This might lead to unexpected usage/Oops if user space may call to the
above ops even if the driver doesn't support migration. As for example,
the migration state_mutex is not initialized in that case.

The cleanest way to manage that seems to split the migration ops from
the main device ops, this will let the driver setting them separately
from the main ops when it's applicable.

As part of that, validate ops construction on registration and include a
check for VFIO_MIGRATION_STOP_COPY since the uAPI claims it must be set
in migration_flags.

HISI driver was changed as well to match this scheme.

This scheme may enable down the road to come with some extra group of
ops (e.g. DMA log) that can be set without regards to the other options
based on driver caps.

Fixes: 6fadb021266d ("vfio/mlx5: Implement vfio_pci driver for mlx5 devices")
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://lore.kernel.org/r/20220628155910.171454-3-yishaih@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 11 +++++++---
 drivers/vfio/pci/mlx5/cmd.c                    |  4 +++-
 drivers/vfio/pci/mlx5/cmd.h                    |  3 ++-
 drivers/vfio/pci/mlx5/main.c                   |  9 +++++---
 drivers/vfio/pci/vfio_pci_core.c               |  7 ++++++
 drivers/vfio/vfio.c                            | 11 +++++-----
 include/linux/vfio.h                           | 30 +++++++++++++++++---------
 7 files changed, 51 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index 4def43f5f7b6..ea762e28c1cc 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -1185,7 +1185,7 @@ static int hisi_acc_vfio_pci_open_device(struct vfio_device *core_vdev)
 	if (ret)
 		return ret;
 
-	if (core_vdev->ops->migration_set_state) {
+	if (core_vdev->mig_ops) {
 		ret = hisi_acc_vf_qm_init(hisi_acc_vdev);
 		if (ret) {
 			vfio_pci_core_disable(vdev);
@@ -1208,6 +1208,11 @@ static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev)
 	vfio_pci_core_close_device(core_vdev);
 }
 
+static const struct vfio_migration_ops hisi_acc_vfio_pci_migrn_state_ops = {
+	.migration_set_state = hisi_acc_vfio_pci_set_device_state,
+	.migration_get_state = hisi_acc_vfio_pci_get_device_state,
+};
+
 static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = {
 	.name = "hisi-acc-vfio-pci-migration",
 	.open_device = hisi_acc_vfio_pci_open_device,
@@ -1219,8 +1224,6 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = {
 	.mmap = hisi_acc_vfio_pci_mmap,
 	.request = vfio_pci_core_request,
 	.match = vfio_pci_core_match,
-	.migration_set_state = hisi_acc_vfio_pci_set_device_state,
-	.migration_get_state = hisi_acc_vfio_pci_get_device_state,
 };
 
 static const struct vfio_device_ops hisi_acc_vfio_pci_ops = {
@@ -1272,6 +1275,8 @@ static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device
 		if (!ret) {
 			vfio_pci_core_init_device(&hisi_acc_vdev->core_device, pdev,
 						  &hisi_acc_vfio_pci_migrn_ops);
+			hisi_acc_vdev->core_device.vdev.mig_ops =
+					&hisi_acc_vfio_pci_migrn_state_ops;
 		} else {
 			pci_warn(pdev, "migration support failed, continue with generic interface\n");
 			vfio_pci_core_init_device(&hisi_acc_vdev->core_device, pdev,
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index cdd0c667dc77..dd5d7bfe0a49 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -108,7 +108,8 @@ void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
 	destroy_workqueue(mvdev->cb_wq);
 }
 
-void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev)
+void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
+			       const struct vfio_migration_ops *mig_ops)
 {
 	struct pci_dev *pdev = mvdev->core_device.pdev;
 	int ret;
@@ -149,6 +150,7 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev)
 	mvdev->core_device.vdev.migration_flags =
 		VFIO_MIGRATION_STOP_COPY |
 		VFIO_MIGRATION_P2P;
+	mvdev->core_device.vdev.mig_ops = mig_ops;
 
 end:
 	mlx5_vf_put_core_dev(mvdev->mdev);
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index aa692d9ce656..8208f4701a90 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -62,7 +62,8 @@ int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
 int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
 int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
 					  size_t *state_size);
-void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev);
+void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
+			       const struct vfio_migration_ops *mig_ops);
 void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev);
 void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev);
 int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index d754990f0662..a9b63d15c5d3 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -574,6 +574,11 @@ static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
 	vfio_pci_core_close_device(core_vdev);
 }
 
+static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
+	.migration_set_state = mlx5vf_pci_set_device_state,
+	.migration_get_state = mlx5vf_pci_get_device_state,
+};
+
 static const struct vfio_device_ops mlx5vf_pci_ops = {
 	.name = "mlx5-vfio-pci",
 	.open_device = mlx5vf_pci_open_device,
@@ -585,8 +590,6 @@ static const struct vfio_device_ops mlx5vf_pci_ops = {
 	.mmap = vfio_pci_core_mmap,
 	.request = vfio_pci_core_request,
 	.match = vfio_pci_core_match,
-	.migration_set_state = mlx5vf_pci_set_device_state,
-	.migration_get_state = mlx5vf_pci_get_device_state,
 };
 
 static int mlx5vf_pci_probe(struct pci_dev *pdev,
@@ -599,7 +602,7 @@ static int mlx5vf_pci_probe(struct pci_dev *pdev,
 	if (!mvdev)
 		return -ENOMEM;
 	vfio_pci_core_init_device(&mvdev->core_device, pdev, &mlx5vf_pci_ops);
-	mlx5vf_cmd_set_migratable(mvdev);
+	mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops);
 	dev_set_drvdata(&pdev->dev, &mvdev->core_device);
 	ret = vfio_pci_core_register_device(&mvdev->core_device);
 	if (ret)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index a0d69ddaf90d..2efa06b1fafa 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1855,6 +1855,13 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev)
 	if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
 		return -EINVAL;
 
+	if (vdev->vdev.mig_ops) {
+		if (!(vdev->vdev.mig_ops->migration_get_state &&
+		      vdev->vdev.mig_ops->migration_set_state) ||
+		    !(vdev->vdev.migration_flags & VFIO_MIGRATION_STOP_COPY))
+			return -EINVAL;
+	}
+
 	/*
 	 * Prevent binding to PFs with VFs enabled, the VFs might be in use
 	 * by the host or other users.  We cannot capture the VFs if they
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 61e71c1154be..aac9213a783d 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1541,8 +1541,7 @@ vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
 	struct file *filp = NULL;
 	int ret;
 
-	if (!device->ops->migration_set_state ||
-	    !device->ops->migration_get_state)
+	if (!device->mig_ops)
 		return -ENOTTY;
 
 	ret = vfio_check_feature(flags, argsz,
@@ -1558,7 +1557,8 @@ vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
 	if (flags & VFIO_DEVICE_FEATURE_GET) {
 		enum vfio_device_mig_state curr_state;
 
-		ret = device->ops->migration_get_state(device, &curr_state);
+		ret = device->mig_ops->migration_get_state(device,
+							   &curr_state);
 		if (ret)
 			return ret;
 		mig.device_state = curr_state;
@@ -1566,7 +1566,7 @@ vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
 	}
 
 	/* Handle the VFIO_DEVICE_FEATURE_SET */
-	filp = device->ops->migration_set_state(device, mig.device_state);
+	filp = device->mig_ops->migration_set_state(device, mig.device_state);
 	if (IS_ERR(filp) || !filp)
 		goto out_copy;
 
@@ -1589,8 +1589,7 @@ static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
 	};
 	int ret;
 
-	if (!device->ops->migration_set_state ||
-	    !device->ops->migration_get_state)
+	if (!device->mig_ops)
 		return -ENOTTY;
 
 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 49580fa2073a..4d26e149db81 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -32,6 +32,11 @@ struct vfio_device_set {
 struct vfio_device {
 	struct device *dev;
 	const struct vfio_device_ops *ops;
+	/*
+	 * mig_ops is a static property of the vfio_device which must be set
+	 * prior to registering the vfio_device.
+	 */
+	const struct vfio_migration_ops *mig_ops;
 	struct vfio_group *group;
 	struct vfio_device_set *dev_set;
 	struct list_head dev_set_list;
@@ -61,16 +66,6 @@ struct vfio_device {
  *         match, -errno for abort (ex. match with insufficient or incorrect
  *         additional args)
  * @device_feature: Optional, fill in the VFIO_DEVICE_FEATURE ioctl
- * @migration_set_state: Optional callback to change the migration state for
- *         devices that support migration. It's mandatory for
- *         VFIO_DEVICE_FEATURE_MIGRATION migration support.
- *         The returned FD is used for data transfer according to the FSM
- *         definition. The driver is responsible to ensure that FD reaches end
- *         of stream or error whenever the migration FSM leaves a data transfer
- *         state or before close_device() returns.
- * @migration_get_state: Optional callback to get the migration state for
- *         devices that support migration. It's mandatory for
- *         VFIO_DEVICE_FEATURE_MIGRATION migration support.
  */
 struct vfio_device_ops {
 	char	*name;
@@ -87,6 +82,21 @@ struct vfio_device_ops {
 	int	(*match)(struct vfio_device *vdev, char *buf);
 	int	(*device_feature)(struct vfio_device *device, u32 flags,
 				  void __user *arg, size_t argsz);
+};
+
+/**
+ * @migration_set_state: Optional callback to change the migration state for
+ *         devices that support migration. It's mandatory for
+ *         VFIO_DEVICE_FEATURE_MIGRATION migration support.
+ *         The returned FD is used for data transfer according to the FSM
+ *         definition. The driver is responsible to ensure that FD reaches end
+ *         of stream or error whenever the migration FSM leaves a data transfer
+ *         state or before close_device() returns.
+ * @migration_get_state: Optional callback to get the migration state for
+ *         devices that support migration. It's mandatory for
+ *         VFIO_DEVICE_FEATURE_MIGRATION migration support.
+ */
+struct vfio_migration_ops {
 	struct file *(*migration_set_state)(
 		struct vfio_device *device,
 		enum vfio_device_mig_state new_state);
-- 
cgit 


From 90e6d290603df6387c95c114cc8154862c3e7515 Mon Sep 17 00:00:00 2001
From: Robert Marko <robimarko@gmail.com>
Date: Sun, 15 May 2022 23:00:41 +0200
Subject: dt-bindings: clock: qcom: ipq8074: add PPE crypto clock

Add binding for the PPE crypto clock in IPQ8074.

Signed-off-by: Robert Marko <robimarko@gmail.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Link: https://lore.kernel.org/r/20220515210048.483898-4-robimarko@gmail.com
---
 include/dt-bindings/clock/qcom,gcc-ipq8074.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/qcom,gcc-ipq8074.h b/include/dt-bindings/clock/qcom,gcc-ipq8074.h
index 8e2bec1c91bf..5f0928785d7a 100644
--- a/include/dt-bindings/clock/qcom,gcc-ipq8074.h
+++ b/include/dt-bindings/clock/qcom,gcc-ipq8074.h
@@ -233,6 +233,7 @@
 #define GCC_PCIE0_AXI_S_BRIDGE_CLK		224
 #define GCC_PCIE0_RCHNG_CLK_SRC			225
 #define GCC_PCIE0_RCHNG_CLK			226
+#define GCC_CRYPTO_PPE_CLK			227
 
 #define GCC_BLSP1_BCR				0
 #define GCC_BLSP1_QUP1_BCR			1
-- 
cgit 


From 74622e401e2109a9aacce0b9698bbcd2307db17a Mon Sep 17 00:00:00 2001
From: Robert Marko <robimarko@gmail.com>
Date: Sun, 15 May 2022 23:00:45 +0200
Subject: dt-bindings: clock: qcom: ipq8074: add USB GDSCs

Add bindings for the USB GDSCs found in IPQ8074 GCC.

Signed-off-by: Robert Marko <robimarko@gmail.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Link: https://lore.kernel.org/r/20220515210048.483898-8-robimarko@gmail.com
---
 include/dt-bindings/clock/qcom,gcc-ipq8074.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/qcom,gcc-ipq8074.h b/include/dt-bindings/clock/qcom,gcc-ipq8074.h
index 8e2bec1c91bf..55f8322a1e50 100644
--- a/include/dt-bindings/clock/qcom,gcc-ipq8074.h
+++ b/include/dt-bindings/clock/qcom,gcc-ipq8074.h
@@ -367,4 +367,7 @@
 #define GCC_PCIE1_AXI_MASTER_STICKY_ARES	130
 #define GCC_PCIE0_AXI_SLAVE_STICKY_ARES		131
 
+#define USB0_GDSC				0
+#define USB1_GDSC				1
+
 #endif
-- 
cgit 


From 21bb8af513d35c005c401706030f4eb469538d1d Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alexandr.lobakin@intel.com>
Date: Fri, 24 Jun 2022 14:13:06 +0200
Subject: bitops: always define asm-generic non-atomic bitops

Move generic non-atomic bitops from the asm-generic header which
gets included only when there are no architecture-specific
alternatives, to a separate independent file to make them always
available.
Almost no actual code changes, only one comment added to
generic_test_bit() saying that it's an atomic operation itself
and thus `volatile` must always stay there with no cast-aways.

Suggested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com> # comment
Suggested-by: Marco Elver <elver@google.com> # reference to kernel-doc
Signed-off-by: Alexander Lobakin <alexandr.lobakin@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Marco Elver <elver@google.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/asm-generic/bitops/generic-non-atomic.h | 130 ++++++++++++++++++++++++
 include/asm-generic/bitops/non-atomic.h         | 110 ++------------------
 2 files changed, 138 insertions(+), 102 deletions(-)
 create mode 100644 include/asm-generic/bitops/generic-non-atomic.h

(limited to 'include')

diff --git a/include/asm-generic/bitops/generic-non-atomic.h b/include/asm-generic/bitops/generic-non-atomic.h
new file mode 100644
index 000000000000..7226488810e5
--- /dev/null
+++ b/include/asm-generic/bitops/generic-non-atomic.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __ASM_GENERIC_BITOPS_GENERIC_NON_ATOMIC_H
+#define __ASM_GENERIC_BITOPS_GENERIC_NON_ATOMIC_H
+
+#include <linux/bits.h>
+
+#ifndef _LINUX_BITOPS_H
+#error only <linux/bitops.h> can be included directly
+#endif
+
+/*
+ * Generic definitions for bit operations, should not be used in regular code
+ * directly.
+ */
+
+/**
+ * generic___set_bit - Set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * Unlike set_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static __always_inline void
+generic___set_bit(unsigned int nr, volatile unsigned long *addr)
+{
+	unsigned long mask = BIT_MASK(nr);
+	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+
+	*p  |= mask;
+}
+
+static __always_inline void
+generic___clear_bit(unsigned int nr, volatile unsigned long *addr)
+{
+	unsigned long mask = BIT_MASK(nr);
+	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+
+	*p &= ~mask;
+}
+
+/**
+ * generic___change_bit - Toggle a bit in memory
+ * @nr: the bit to change
+ * @addr: the address to start counting from
+ *
+ * Unlike change_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static __always_inline
+void generic___change_bit(unsigned int nr, volatile unsigned long *addr)
+{
+	unsigned long mask = BIT_MASK(nr);
+	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+
+	*p ^= mask;
+}
+
+/**
+ * generic___test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is non-atomic and can be reordered.
+ * If two examples of this operation race, one can appear to succeed
+ * but actually fail.  You must protect multiple accesses with a lock.
+ */
+static __always_inline int
+generic___test_and_set_bit(unsigned int nr, volatile unsigned long *addr)
+{
+	unsigned long mask = BIT_MASK(nr);
+	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+	unsigned long old = *p;
+
+	*p = old | mask;
+	return (old & mask) != 0;
+}
+
+/**
+ * generic___test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation is non-atomic and can be reordered.
+ * If two examples of this operation race, one can appear to succeed
+ * but actually fail.  You must protect multiple accesses with a lock.
+ */
+static __always_inline int
+generic___test_and_clear_bit(unsigned int nr, volatile unsigned long *addr)
+{
+	unsigned long mask = BIT_MASK(nr);
+	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+	unsigned long old = *p;
+
+	*p = old & ~mask;
+	return (old & mask) != 0;
+}
+
+/* WARNING: non atomic and it can be reordered! */
+static __always_inline int
+generic___test_and_change_bit(unsigned int nr, volatile unsigned long *addr)
+{
+	unsigned long mask = BIT_MASK(nr);
+	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+	unsigned long old = *p;
+
+	*p = old ^ mask;
+	return (old & mask) != 0;
+}
+
+/**
+ * generic_test_bit - Determine whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static __always_inline int
+generic_test_bit(unsigned int nr, const volatile unsigned long *addr)
+{
+	/*
+	 * Unlike the bitops with the '__' prefix above, this one *is* atomic,
+	 * so `volatile` must always stay here with no cast-aways. See
+	 * `Documentation/atomic_bitops.txt` for the details.
+	 */
+	return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
+}
+
+#endif /* __ASM_GENERIC_BITOPS_GENERIC_NON_ATOMIC_H */
diff --git a/include/asm-generic/bitops/non-atomic.h b/include/asm-generic/bitops/non-atomic.h
index 078cc68be2f1..23d3abc1e10d 100644
--- a/include/asm-generic/bitops/non-atomic.h
+++ b/include/asm-generic/bitops/non-atomic.h
@@ -2,121 +2,27 @@
 #ifndef _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
 #define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
 
-#include <asm/types.h>
+#include <asm-generic/bitops/generic-non-atomic.h>
 
-/**
- * arch___set_bit - Set a bit in memory
- * @nr: the bit to set
- * @addr: the address to start counting from
- *
- * Unlike set_bit(), this function is non-atomic and may be reordered.
- * If it's called on the same region of memory simultaneously, the effect
- * may be that only one operation succeeds.
- */
-static __always_inline void
-arch___set_bit(unsigned int nr, volatile unsigned long *addr)
-{
-	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-
-	*p  |= mask;
-}
+#define arch___set_bit generic___set_bit
 #define __set_bit arch___set_bit
 
-static __always_inline void
-arch___clear_bit(unsigned int nr, volatile unsigned long *addr)
-{
-	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-
-	*p &= ~mask;
-}
+#define arch___clear_bit generic___clear_bit
 #define __clear_bit arch___clear_bit
 
-/**
- * arch___change_bit - Toggle a bit in memory
- * @nr: the bit to change
- * @addr: the address to start counting from
- *
- * Unlike change_bit(), this function is non-atomic and may be reordered.
- * If it's called on the same region of memory simultaneously, the effect
- * may be that only one operation succeeds.
- */
-static __always_inline
-void arch___change_bit(unsigned int nr, volatile unsigned long *addr)
-{
-	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-
-	*p ^= mask;
-}
+#define arch___change_bit generic___change_bit
 #define __change_bit arch___change_bit
 
-/**
- * arch___test_and_set_bit - Set a bit and return its old value
- * @nr: Bit to set
- * @addr: Address to count from
- *
- * This operation is non-atomic and can be reordered.
- * If two examples of this operation race, one can appear to succeed
- * but actually fail.  You must protect multiple accesses with a lock.
- */
-static __always_inline int
-arch___test_and_set_bit(unsigned int nr, volatile unsigned long *addr)
-{
-	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-	unsigned long old = *p;
-
-	*p = old | mask;
-	return (old & mask) != 0;
-}
+#define arch___test_and_set_bit generic___test_and_set_bit
 #define __test_and_set_bit arch___test_and_set_bit
 
-/**
- * arch___test_and_clear_bit - Clear a bit and return its old value
- * @nr: Bit to clear
- * @addr: Address to count from
- *
- * This operation is non-atomic and can be reordered.
- * If two examples of this operation race, one can appear to succeed
- * but actually fail.  You must protect multiple accesses with a lock.
- */
-static __always_inline int
-arch___test_and_clear_bit(unsigned int nr, volatile unsigned long *addr)
-{
-	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-	unsigned long old = *p;
-
-	*p = old & ~mask;
-	return (old & mask) != 0;
-}
+#define arch___test_and_clear_bit generic___test_and_clear_bit
 #define __test_and_clear_bit arch___test_and_clear_bit
 
-/* WARNING: non atomic and it can be reordered! */
-static __always_inline int
-arch___test_and_change_bit(unsigned int nr, volatile unsigned long *addr)
-{
-	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-	unsigned long old = *p;
-
-	*p = old ^ mask;
-	return (old & mask) != 0;
-}
+#define arch___test_and_change_bit generic___test_and_change_bit
 #define __test_and_change_bit arch___test_and_change_bit
 
-/**
- * arch_test_bit - Determine whether a bit is set
- * @nr: bit number to test
- * @addr: Address to start counting from
- */
-static __always_inline int
-arch_test_bit(unsigned int nr, const volatile unsigned long *addr)
-{
-	return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
-}
+#define arch_test_bit generic_test_bit
 #define test_bit arch_test_bit
 
 #endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */
-- 
cgit 


From 0e862838f290147ea9c16db852d8d494b552d38d Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alexandr.lobakin@intel.com>
Date: Fri, 24 Jun 2022 14:13:07 +0200
Subject: bitops: unify non-atomic bitops prototypes across architectures

Currently, there is a mess with the prototypes of the non-atomic
bitops across the different architectures:

ret	bool, int, unsigned long
nr	int, long, unsigned int, unsigned long
addr	volatile unsigned long *, volatile void *

Thankfully, it doesn't provoke any bugs, but can sometimes make
the compiler angry when it's not handy at all.
Adjust all the prototypes to the following standard:

ret	bool				retval can be only 0 or 1
nr	unsigned long			native; signed makes no sense
addr	volatile unsigned long *	bitmaps are arrays of ulongs

Next, some architectures don't define 'arch_' versions as they don't
support instrumentation, others do. To make sure there is always the
same set of callables present and to ease any potential future
changes, make them all follow the rule:
 * architecture-specific files define only 'arch_' versions;
 * non-prefixed versions can be defined only in asm-generic files;
and place the non-prefixed definitions into a new file in
asm-generic to be included by non-instrumented architectures.

Finally, add some static assertions in order to prevent people from
making a mess in this room again.
I also used the %__always_inline attribute consistently, so that
they always get resolved to the actual operations.

Suggested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Alexander Lobakin <alexandr.lobakin@intel.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Yury Norov <yury.norov@gmail.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 arch/alpha/include/asm/bitops.h                    | 32 ++++++------
 arch/hexagon/include/asm/bitops.h                  | 24 +++++----
 arch/ia64/include/asm/bitops.h                     | 42 ++++++++-------
 arch/m68k/include/asm/bitops.h                     | 49 +++++++++++------
 arch/s390/include/asm/bitops.h                     | 61 +++++++++++-----------
 arch/sh/include/asm/bitops-op32.h                  | 34 +++++++-----
 arch/x86/include/asm/bitops.h                      | 22 ++++----
 include/asm-generic/bitops/generic-non-atomic.h    | 24 ++++-----
 .../asm-generic/bitops/instrumented-non-atomic.h   | 21 +++++---
 include/asm-generic/bitops/non-atomic.h            | 13 +----
 .../bitops/non-instrumented-non-atomic.h           | 16 ++++++
 include/linux/bitops.h                             | 17 ++++++
 tools/include/asm-generic/bitops/non-atomic.h      | 24 +++++----
 13 files changed, 229 insertions(+), 150 deletions(-)
 create mode 100644 include/asm-generic/bitops/non-instrumented-non-atomic.h

(limited to 'include')

diff --git a/arch/alpha/include/asm/bitops.h b/arch/alpha/include/asm/bitops.h
index e1d8483a45f2..492c7713ddae 100644
--- a/arch/alpha/include/asm/bitops.h
+++ b/arch/alpha/include/asm/bitops.h
@@ -46,8 +46,8 @@ set_bit(unsigned long nr, volatile void * addr)
 /*
  * WARNING: non atomic version.
  */
-static inline void
-__set_bit(unsigned long nr, volatile void * addr)
+static __always_inline void
+arch___set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	int *m = ((int *) addr) + (nr >> 5);
 
@@ -82,8 +82,8 @@ clear_bit_unlock(unsigned long nr, volatile void * addr)
 /*
  * WARNING: non atomic version.
  */
-static __inline__ void
-__clear_bit(unsigned long nr, volatile void * addr)
+static __always_inline void
+arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	int *m = ((int *) addr) + (nr >> 5);
 
@@ -94,7 +94,7 @@ static inline void
 __clear_bit_unlock(unsigned long nr, volatile void * addr)
 {
 	smp_mb();
-	__clear_bit(nr, addr);
+	arch___clear_bit(nr, addr);
 }
 
 static inline void
@@ -118,8 +118,8 @@ change_bit(unsigned long nr, volatile void * addr)
 /*
  * WARNING: non atomic version.
  */
-static __inline__ void
-__change_bit(unsigned long nr, volatile void * addr)
+static __always_inline void
+arch___change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	int *m = ((int *) addr) + (nr >> 5);
 
@@ -186,8 +186,8 @@ test_and_set_bit_lock(unsigned long nr, volatile void *addr)
 /*
  * WARNING: non atomic version.
  */
-static inline int
-__test_and_set_bit(unsigned long nr, volatile void * addr)
+static __always_inline bool
+arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = 1 << (nr & 0x1f);
 	int *m = ((int *) addr) + (nr >> 5);
@@ -230,8 +230,8 @@ test_and_clear_bit(unsigned long nr, volatile void * addr)
 /*
  * WARNING: non atomic version.
  */
-static inline int
-__test_and_clear_bit(unsigned long nr, volatile void * addr)
+static __always_inline bool
+arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = 1 << (nr & 0x1f);
 	int *m = ((int *) addr) + (nr >> 5);
@@ -272,8 +272,8 @@ test_and_change_bit(unsigned long nr, volatile void * addr)
 /*
  * WARNING: non atomic version.
  */
-static __inline__ int
-__test_and_change_bit(unsigned long nr, volatile void * addr)
+static __always_inline bool
+arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = 1 << (nr & 0x1f);
 	int *m = ((int *) addr) + (nr >> 5);
@@ -283,8 +283,8 @@ __test_and_change_bit(unsigned long nr, volatile void * addr)
 	return (old & mask) != 0;
 }
 
-static inline int
-test_bit(int nr, const volatile void * addr)
+static __always_inline bool
+arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
 {
 	return (1UL & (((const int *) addr)[nr >> 5] >> (nr & 31))) != 0UL;
 }
@@ -450,6 +450,8 @@ sched_find_first_bit(const unsigned long b[2])
 	return __ffs(tmp) + ofs;
 }
 
+#include <asm-generic/bitops/non-instrumented-non-atomic.h>
+
 #include <asm-generic/bitops/le.h>
 
 #include <asm-generic/bitops/ext2-atomic-setbit.h>
diff --git a/arch/hexagon/include/asm/bitops.h b/arch/hexagon/include/asm/bitops.h
index 75d6ba3643b8..da500471ac73 100644
--- a/arch/hexagon/include/asm/bitops.h
+++ b/arch/hexagon/include/asm/bitops.h
@@ -127,38 +127,45 @@ static inline void change_bit(int nr, volatile void *addr)
  * be atomic, particularly for things like slab_lock and slab_unlock.
  *
  */
-static inline void __clear_bit(int nr, volatile unsigned long *addr)
+static __always_inline void
+arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	test_and_clear_bit(nr, addr);
 }
 
-static inline void __set_bit(int nr, volatile unsigned long *addr)
+static __always_inline void
+arch___set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	test_and_set_bit(nr, addr);
 }
 
-static inline void __change_bit(int nr, volatile unsigned long *addr)
+static __always_inline void
+arch___change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	test_and_change_bit(nr, addr);
 }
 
 /*  Apparently, at least some of these are allowed to be non-atomic  */
-static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
+static __always_inline bool
+arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	return test_and_clear_bit(nr, addr);
 }
 
-static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
+static __always_inline bool
+arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	return test_and_set_bit(nr, addr);
 }
 
-static inline int __test_and_change_bit(int nr, volatile unsigned long *addr)
+static __always_inline bool
+arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	return test_and_change_bit(nr, addr);
 }
 
-static inline int __test_bit(int nr, const volatile unsigned long *addr)
+static __always_inline bool
+arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
 {
 	int retval;
 
@@ -172,8 +179,6 @@ static inline int __test_bit(int nr, const volatile unsigned long *addr)
 	return retval;
 }
 
-#define test_bit(nr, addr) __test_bit(nr, addr)
-
 /*
  * ffz - find first zero in word.
  * @word: The word to search
@@ -271,6 +276,7 @@ static inline unsigned long __fls(unsigned long word)
 }
 
 #include <asm-generic/bitops/lock.h>
+#include <asm-generic/bitops/non-instrumented-non-atomic.h>
 
 #include <asm-generic/bitops/fls64.h>
 #include <asm-generic/bitops/sched.h>
diff --git a/arch/ia64/include/asm/bitops.h b/arch/ia64/include/asm/bitops.h
index 577be93c0818..9f62af7fd7c4 100644
--- a/arch/ia64/include/asm/bitops.h
+++ b/arch/ia64/include/asm/bitops.h
@@ -53,7 +53,7 @@ set_bit (int nr, volatile void *addr)
 }
 
 /**
- * __set_bit - Set a bit in memory
+ * arch___set_bit - Set a bit in memory
  * @nr: the bit to set
  * @addr: the address to start counting from
  *
@@ -61,8 +61,8 @@ set_bit (int nr, volatile void *addr)
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static __inline__ void
-__set_bit (int nr, volatile void *addr)
+static __always_inline void
+arch___set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	*((__u32 *) addr + (nr >> 5)) |= (1 << (nr & 31));
 }
@@ -135,7 +135,7 @@ __clear_bit_unlock(int nr, void *addr)
 }
 
 /**
- * __clear_bit - Clears a bit in memory (non-atomic version)
+ * arch___clear_bit - Clears a bit in memory (non-atomic version)
  * @nr: the bit to clear
  * @addr: the address to start counting from
  *
@@ -143,8 +143,8 @@ __clear_bit_unlock(int nr, void *addr)
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static __inline__ void
-__clear_bit (int nr, volatile void *addr)
+static __always_inline void
+arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	*((__u32 *) addr + (nr >> 5)) &= ~(1 << (nr & 31));
 }
@@ -175,7 +175,7 @@ change_bit (int nr, volatile void *addr)
 }
 
 /**
- * __change_bit - Toggle a bit in memory
+ * arch___change_bit - Toggle a bit in memory
  * @nr: the bit to toggle
  * @addr: the address to start counting from
  *
@@ -183,8 +183,8 @@ change_bit (int nr, volatile void *addr)
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static __inline__ void
-__change_bit (int nr, volatile void *addr)
+static __always_inline void
+arch___change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	*((__u32 *) addr + (nr >> 5)) ^= (1 << (nr & 31));
 }
@@ -224,7 +224,7 @@ test_and_set_bit (int nr, volatile void *addr)
 #define test_and_set_bit_lock test_and_set_bit
 
 /**
- * __test_and_set_bit - Set a bit and return its old value
+ * arch___test_and_set_bit - Set a bit and return its old value
  * @nr: Bit to set
  * @addr: Address to count from
  *
@@ -232,8 +232,8 @@ test_and_set_bit (int nr, volatile void *addr)
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
  */
-static __inline__ int
-__test_and_set_bit (int nr, volatile void *addr)
+static __always_inline bool
+arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	__u32 *p = (__u32 *) addr + (nr >> 5);
 	__u32 m = 1 << (nr & 31);
@@ -269,7 +269,7 @@ test_and_clear_bit (int nr, volatile void *addr)
 }
 
 /**
- * __test_and_clear_bit - Clear a bit and return its old value
+ * arch___test_and_clear_bit - Clear a bit and return its old value
  * @nr: Bit to clear
  * @addr: Address to count from
  *
@@ -277,8 +277,8 @@ test_and_clear_bit (int nr, volatile void *addr)
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
  */
-static __inline__ int
-__test_and_clear_bit(int nr, volatile void * addr)
+static __always_inline bool
+arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	__u32 *p = (__u32 *) addr + (nr >> 5);
 	__u32 m = 1 << (nr & 31);
@@ -314,14 +314,14 @@ test_and_change_bit (int nr, volatile void *addr)
 }
 
 /**
- * __test_and_change_bit - Change a bit and return its old value
+ * arch___test_and_change_bit - Change a bit and return its old value
  * @nr: Bit to change
  * @addr: Address to count from
  *
  * This operation is non-atomic and can be reordered.
  */
-static __inline__ int
-__test_and_change_bit (int nr, void *addr)
+static __always_inline bool
+arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	__u32 old, bit = (1 << (nr & 31));
 	__u32 *m = (__u32 *) addr + (nr >> 5);
@@ -331,8 +331,8 @@ __test_and_change_bit (int nr, void *addr)
 	return (old & bit) != 0;
 }
 
-static __inline__ int
-test_bit (int nr, const volatile void *addr)
+static __always_inline bool
+arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
 {
 	return 1 & (((const volatile __u32 *) addr)[nr >> 5] >> (nr & 31));
 }
@@ -443,6 +443,8 @@ static __inline__ unsigned long __arch_hweight64(unsigned long x)
 
 #ifdef __KERNEL__
 
+#include <asm-generic/bitops/non-instrumented-non-atomic.h>
+
 #include <asm-generic/bitops/le.h>
 
 #include <asm-generic/bitops/ext2-atomic-setbit.h>
diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h
index 51283db53667..71495faf2a90 100644
--- a/arch/m68k/include/asm/bitops.h
+++ b/arch/m68k/include/asm/bitops.h
@@ -65,8 +65,11 @@ static inline void bfset_mem_set_bit(int nr, volatile unsigned long *vaddr)
 				bfset_mem_set_bit(nr, vaddr))
 #endif
 
-#define __set_bit(nr, vaddr)	set_bit(nr, vaddr)
-
+static __always_inline void
+arch___set_bit(unsigned long nr, volatile unsigned long *addr)
+{
+	set_bit(nr, addr);
+}
 
 static inline void bclr_reg_clear_bit(int nr, volatile unsigned long *vaddr)
 {
@@ -105,8 +108,11 @@ static inline void bfclr_mem_clear_bit(int nr, volatile unsigned long *vaddr)
 				bfclr_mem_clear_bit(nr, vaddr))
 #endif
 
-#define __clear_bit(nr, vaddr)	clear_bit(nr, vaddr)
-
+static __always_inline void
+arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
+{
+	clear_bit(nr, addr);
+}
 
 static inline void bchg_reg_change_bit(int nr, volatile unsigned long *vaddr)
 {
@@ -145,14 +151,17 @@ static inline void bfchg_mem_change_bit(int nr, volatile unsigned long *vaddr)
 				bfchg_mem_change_bit(nr, vaddr))
 #endif
 
-#define __change_bit(nr, vaddr)	change_bit(nr, vaddr)
-
-
-static inline int test_bit(int nr, const volatile unsigned long *vaddr)
+static __always_inline void
+arch___change_bit(unsigned long nr, volatile unsigned long *addr)
 {
-	return (vaddr[nr >> 5] & (1UL << (nr & 31))) != 0;
+	change_bit(nr, addr);
 }
 
+static __always_inline bool
+arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
+{
+	return (addr[nr >> 5] & (1UL << (nr & 31))) != 0;
+}
 
 static inline int bset_reg_test_and_set_bit(int nr,
 					    volatile unsigned long *vaddr)
@@ -201,8 +210,11 @@ static inline int bfset_mem_test_and_set_bit(int nr,
 					bfset_mem_test_and_set_bit(nr, vaddr))
 #endif
 
-#define __test_and_set_bit(nr, vaddr)	test_and_set_bit(nr, vaddr)
-
+static __always_inline bool
+arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
+{
+	return test_and_set_bit(nr, addr);
+}
 
 static inline int bclr_reg_test_and_clear_bit(int nr,
 					      volatile unsigned long *vaddr)
@@ -251,8 +263,11 @@ static inline int bfclr_mem_test_and_clear_bit(int nr,
 					bfclr_mem_test_and_clear_bit(nr, vaddr))
 #endif
 
-#define __test_and_clear_bit(nr, vaddr)	test_and_clear_bit(nr, vaddr)
-
+static __always_inline bool
+arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
+{
+	return test_and_clear_bit(nr, addr);
+}
 
 static inline int bchg_reg_test_and_change_bit(int nr,
 					       volatile unsigned long *vaddr)
@@ -301,8 +316,11 @@ static inline int bfchg_mem_test_and_change_bit(int nr,
 					bfchg_mem_test_and_change_bit(nr, vaddr))
 #endif
 
-#define __test_and_change_bit(nr, vaddr) test_and_change_bit(nr, vaddr)
-
+static __always_inline bool
+arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
+{
+	return test_and_change_bit(nr, addr);
+}
 
 /*
  *	The true 68020 and more advanced processors support the "bfffo"
@@ -522,6 +540,7 @@ static inline int __fls(int x)
 #define clear_bit_unlock	clear_bit
 #define __clear_bit_unlock	clear_bit_unlock
 
+#include <asm-generic/bitops/non-instrumented-non-atomic.h>
 #include <asm-generic/bitops/ext2-atomic.h>
 #include <asm-generic/bitops/fls64.h>
 #include <asm-generic/bitops/sched.h>
diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h
index 191dc7898b0f..9a7d15da966e 100644
--- a/arch/s390/include/asm/bitops.h
+++ b/arch/s390/include/asm/bitops.h
@@ -113,75 +113,76 @@ static inline bool arch_test_and_change_bit(unsigned long nr,
 	return old & mask;
 }
 
-static inline void arch___set_bit(unsigned long nr, volatile unsigned long *ptr)
+static __always_inline void
+arch___set_bit(unsigned long nr, volatile unsigned long *addr)
 {
-	unsigned long *addr = __bitops_word(nr, ptr);
+	unsigned long *p = __bitops_word(nr, addr);
 	unsigned long mask = __bitops_mask(nr);
 
-	*addr |= mask;
+	*p |= mask;
 }
 
-static inline void arch___clear_bit(unsigned long nr,
-				    volatile unsigned long *ptr)
+static __always_inline void
+arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
-	unsigned long *addr = __bitops_word(nr, ptr);
+	unsigned long *p = __bitops_word(nr, addr);
 	unsigned long mask = __bitops_mask(nr);
 
-	*addr &= ~mask;
+	*p &= ~mask;
 }
 
-static inline void arch___change_bit(unsigned long nr,
-				     volatile unsigned long *ptr)
+static __always_inline void
+arch___change_bit(unsigned long nr, volatile unsigned long *addr)
 {
-	unsigned long *addr = __bitops_word(nr, ptr);
+	unsigned long *p = __bitops_word(nr, addr);
 	unsigned long mask = __bitops_mask(nr);
 
-	*addr ^= mask;
+	*p ^= mask;
 }
 
-static inline bool arch___test_and_set_bit(unsigned long nr,
-					   volatile unsigned long *ptr)
+static __always_inline bool
+arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 {
-	unsigned long *addr = __bitops_word(nr, ptr);
+	unsigned long *p = __bitops_word(nr, addr);
 	unsigned long mask = __bitops_mask(nr);
 	unsigned long old;
 
-	old = *addr;
-	*addr |= mask;
+	old = *p;
+	*p |= mask;
 	return old & mask;
 }
 
-static inline bool arch___test_and_clear_bit(unsigned long nr,
-					     volatile unsigned long *ptr)
+static __always_inline bool
+arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
-	unsigned long *addr = __bitops_word(nr, ptr);
+	unsigned long *p = __bitops_word(nr, addr);
 	unsigned long mask = __bitops_mask(nr);
 	unsigned long old;
 
-	old = *addr;
-	*addr &= ~mask;
+	old = *p;
+	*p &= ~mask;
 	return old & mask;
 }
 
-static inline bool arch___test_and_change_bit(unsigned long nr,
-					      volatile unsigned long *ptr)
+static __always_inline bool
+arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 {
-	unsigned long *addr = __bitops_word(nr, ptr);
+	unsigned long *p = __bitops_word(nr, addr);
 	unsigned long mask = __bitops_mask(nr);
 	unsigned long old;
 
-	old = *addr;
-	*addr ^= mask;
+	old = *p;
+	*p ^= mask;
 	return old & mask;
 }
 
-static inline bool arch_test_bit(unsigned long nr,
-				 const volatile unsigned long *ptr)
+static __always_inline bool
+arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
 {
-	const volatile unsigned long *addr = __bitops_word(nr, ptr);
+	const volatile unsigned long *p = __bitops_word(nr, addr);
 	unsigned long mask = __bitops_mask(nr);
 
-	return *addr & mask;
+	return *p & mask;
 }
 
 static inline bool arch_test_and_set_bit_lock(unsigned long nr,
diff --git a/arch/sh/include/asm/bitops-op32.h b/arch/sh/include/asm/bitops-op32.h
index cfe5465acce7..565a85d8b7fb 100644
--- a/arch/sh/include/asm/bitops-op32.h
+++ b/arch/sh/include/asm/bitops-op32.h
@@ -2,6 +2,8 @@
 #ifndef __ASM_SH_BITOPS_OP32_H
 #define __ASM_SH_BITOPS_OP32_H
 
+#include <linux/bits.h>
+
 /*
  * The bit modifying instructions on SH-2A are only capable of working
  * with a 3-bit immediate, which signifies the shift position for the bit
@@ -16,7 +18,8 @@
 #define BYTE_OFFSET(nr)		((nr) % BITS_PER_BYTE)
 #endif
 
-static inline void __set_bit(int nr, volatile unsigned long *addr)
+static __always_inline void
+arch___set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	if (__builtin_constant_p(nr)) {
 		__asm__ __volatile__ (
@@ -33,7 +36,8 @@ static inline void __set_bit(int nr, volatile unsigned long *addr)
 	}
 }
 
-static inline void __clear_bit(int nr, volatile unsigned long *addr)
+static __always_inline void
+arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	if (__builtin_constant_p(nr)) {
 		__asm__ __volatile__ (
@@ -52,7 +56,7 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr)
 }
 
 /**
- * __change_bit - Toggle a bit in memory
+ * arch___change_bit - Toggle a bit in memory
  * @nr: the bit to change
  * @addr: the address to start counting from
  *
@@ -60,7 +64,8 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr)
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static inline void __change_bit(int nr, volatile unsigned long *addr)
+static __always_inline void
+arch___change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	if (__builtin_constant_p(nr)) {
 		__asm__ __volatile__ (
@@ -79,7 +84,7 @@ static inline void __change_bit(int nr, volatile unsigned long *addr)
 }
 
 /**
- * __test_and_set_bit - Set a bit and return its old value
+ * arch___test_and_set_bit - Set a bit and return its old value
  * @nr: Bit to set
  * @addr: Address to count from
  *
@@ -87,7 +92,8 @@ static inline void __change_bit(int nr, volatile unsigned long *addr)
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
  */
-static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
+static __always_inline bool
+arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -98,7 +104,7 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
 }
 
 /**
- * __test_and_clear_bit - Clear a bit and return its old value
+ * arch___test_and_clear_bit - Clear a bit and return its old value
  * @nr: Bit to clear
  * @addr: Address to count from
  *
@@ -106,7 +112,8 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
  */
-static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
+static __always_inline bool
+arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -117,8 +124,8 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
 }
 
 /* WARNING: non atomic and it can be reordered! */
-static inline int __test_and_change_bit(int nr,
-					    volatile unsigned long *addr)
+static __always_inline bool
+arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -129,13 +136,16 @@ static inline int __test_and_change_bit(int nr,
 }
 
 /**
- * test_bit - Determine whether a bit is set
+ * arch_test_bit - Determine whether a bit is set
  * @nr: bit number to test
  * @addr: Address to start counting from
  */
-static inline int test_bit(int nr, const volatile unsigned long *addr)
+static __always_inline bool
+arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
 {
 	return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
 }
 
+#include <asm-generic/bitops/non-instrumented-non-atomic.h>
+
 #endif /* __ASM_SH_BITOPS_OP32_H */
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index a288ecd230ab..973c6bd17f98 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -63,7 +63,7 @@ arch_set_bit(long nr, volatile unsigned long *addr)
 }
 
 static __always_inline void
-arch___set_bit(long nr, volatile unsigned long *addr)
+arch___set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	asm volatile(__ASM_SIZE(bts) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
 }
@@ -89,7 +89,7 @@ arch_clear_bit_unlock(long nr, volatile unsigned long *addr)
 }
 
 static __always_inline void
-arch___clear_bit(long nr, volatile unsigned long *addr)
+arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
 }
@@ -114,7 +114,7 @@ arch___clear_bit_unlock(long nr, volatile unsigned long *addr)
 }
 
 static __always_inline void
-arch___change_bit(long nr, volatile unsigned long *addr)
+arch___change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	asm volatile(__ASM_SIZE(btc) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
 }
@@ -145,7 +145,7 @@ arch_test_and_set_bit_lock(long nr, volatile unsigned long *addr)
 }
 
 static __always_inline bool
-arch___test_and_set_bit(long nr, volatile unsigned long *addr)
+arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	bool oldbit;
 
@@ -171,7 +171,7 @@ arch_test_and_clear_bit(long nr, volatile unsigned long *addr)
  * this without also updating arch/x86/kernel/kvm.c
  */
 static __always_inline bool
-arch___test_and_clear_bit(long nr, volatile unsigned long *addr)
+arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	bool oldbit;
 
@@ -183,7 +183,7 @@ arch___test_and_clear_bit(long nr, volatile unsigned long *addr)
 }
 
 static __always_inline bool
-arch___test_and_change_bit(long nr, volatile unsigned long *addr)
+arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	bool oldbit;
 
@@ -219,10 +219,12 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l
 	return oldbit;
 }
 
-#define arch_test_bit(nr, addr)			\
-	(__builtin_constant_p((nr))		\
-	 ? constant_test_bit((nr), (addr))	\
-	 : variable_test_bit((nr), (addr)))
+static __always_inline bool
+arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
+{
+	return __builtin_constant_p(nr) ? constant_test_bit(nr, addr) :
+					  variable_test_bit(nr, addr);
+}
 
 /**
  * __ffs - find first set bit in word
diff --git a/include/asm-generic/bitops/generic-non-atomic.h b/include/asm-generic/bitops/generic-non-atomic.h
index 7226488810e5..b85b8a2ac239 100644
--- a/include/asm-generic/bitops/generic-non-atomic.h
+++ b/include/asm-generic/bitops/generic-non-atomic.h
@@ -24,7 +24,7 @@
  * may be that only one operation succeeds.
  */
 static __always_inline void
-generic___set_bit(unsigned int nr, volatile unsigned long *addr)
+generic___set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -33,7 +33,7 @@ generic___set_bit(unsigned int nr, volatile unsigned long *addr)
 }
 
 static __always_inline void
-generic___clear_bit(unsigned int nr, volatile unsigned long *addr)
+generic___clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -50,8 +50,8 @@ generic___clear_bit(unsigned int nr, volatile unsigned long *addr)
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static __always_inline
-void generic___change_bit(unsigned int nr, volatile unsigned long *addr)
+static __always_inline void
+generic___change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -68,8 +68,8 @@ void generic___change_bit(unsigned int nr, volatile unsigned long *addr)
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
  */
-static __always_inline int
-generic___test_and_set_bit(unsigned int nr, volatile unsigned long *addr)
+static __always_inline bool
+generic___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -88,8 +88,8 @@ generic___test_and_set_bit(unsigned int nr, volatile unsigned long *addr)
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
  */
-static __always_inline int
-generic___test_and_clear_bit(unsigned int nr, volatile unsigned long *addr)
+static __always_inline bool
+generic___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -100,8 +100,8 @@ generic___test_and_clear_bit(unsigned int nr, volatile unsigned long *addr)
 }
 
 /* WARNING: non atomic and it can be reordered! */
-static __always_inline int
-generic___test_and_change_bit(unsigned int nr, volatile unsigned long *addr)
+static __always_inline bool
+generic___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -116,8 +116,8 @@ generic___test_and_change_bit(unsigned int nr, volatile unsigned long *addr)
  * @nr: bit number to test
  * @addr: Address to start counting from
  */
-static __always_inline int
-generic_test_bit(unsigned int nr, const volatile unsigned long *addr)
+static __always_inline bool
+generic_test_bit(unsigned long nr, const volatile unsigned long *addr)
 {
 	/*
 	 * Unlike the bitops with the '__' prefix above, this one *is* atomic,
diff --git a/include/asm-generic/bitops/instrumented-non-atomic.h b/include/asm-generic/bitops/instrumented-non-atomic.h
index 7ab1ecc37782..b019f77ef21c 100644
--- a/include/asm-generic/bitops/instrumented-non-atomic.h
+++ b/include/asm-generic/bitops/instrumented-non-atomic.h
@@ -22,7 +22,8 @@
  * region of memory concurrently, the effect may be that only one operation
  * succeeds.
  */
-static __always_inline void __set_bit(long nr, volatile unsigned long *addr)
+static __always_inline void
+__set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	instrument_write(addr + BIT_WORD(nr), sizeof(long));
 	arch___set_bit(nr, addr);
@@ -37,7 +38,8 @@ static __always_inline void __set_bit(long nr, volatile unsigned long *addr)
  * region of memory concurrently, the effect may be that only one operation
  * succeeds.
  */
-static __always_inline void __clear_bit(long nr, volatile unsigned long *addr)
+static __always_inline void
+__clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	instrument_write(addr + BIT_WORD(nr), sizeof(long));
 	arch___clear_bit(nr, addr);
@@ -52,7 +54,8 @@ static __always_inline void __clear_bit(long nr, volatile unsigned long *addr)
  * region of memory concurrently, the effect may be that only one operation
  * succeeds.
  */
-static __always_inline void __change_bit(long nr, volatile unsigned long *addr)
+static __always_inline void
+__change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	instrument_write(addr + BIT_WORD(nr), sizeof(long));
 	arch___change_bit(nr, addr);
@@ -90,7 +93,8 @@ static __always_inline void __instrument_read_write_bitop(long nr, volatile unsi
  * This operation is non-atomic. If two instances of this operation race, one
  * can appear to succeed but actually fail.
  */
-static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long *addr)
+static __always_inline bool
+__test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	__instrument_read_write_bitop(nr, addr);
 	return arch___test_and_set_bit(nr, addr);
@@ -104,7 +108,8 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long *
  * This operation is non-atomic. If two instances of this operation race, one
  * can appear to succeed but actually fail.
  */
-static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long *addr)
+static __always_inline bool
+__test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	__instrument_read_write_bitop(nr, addr);
 	return arch___test_and_clear_bit(nr, addr);
@@ -118,7 +123,8 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long
  * This operation is non-atomic. If two instances of this operation race, one
  * can appear to succeed but actually fail.
  */
-static __always_inline bool __test_and_change_bit(long nr, volatile unsigned long *addr)
+static __always_inline bool
+__test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	__instrument_read_write_bitop(nr, addr);
 	return arch___test_and_change_bit(nr, addr);
@@ -129,7 +135,8 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon
  * @nr: bit number to test
  * @addr: Address to start counting from
  */
-static __always_inline bool test_bit(long nr, const volatile unsigned long *addr)
+static __always_inline bool
+test_bit(unsigned long nr, const volatile unsigned long *addr)
 {
 	instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long));
 	return arch_test_bit(nr, addr);
diff --git a/include/asm-generic/bitops/non-atomic.h b/include/asm-generic/bitops/non-atomic.h
index 23d3abc1e10d..5c37ced343ae 100644
--- a/include/asm-generic/bitops/non-atomic.h
+++ b/include/asm-generic/bitops/non-atomic.h
@@ -5,24 +5,15 @@
 #include <asm-generic/bitops/generic-non-atomic.h>
 
 #define arch___set_bit generic___set_bit
-#define __set_bit arch___set_bit
-
 #define arch___clear_bit generic___clear_bit
-#define __clear_bit arch___clear_bit
-
 #define arch___change_bit generic___change_bit
-#define __change_bit arch___change_bit
 
 #define arch___test_and_set_bit generic___test_and_set_bit
-#define __test_and_set_bit arch___test_and_set_bit
-
 #define arch___test_and_clear_bit generic___test_and_clear_bit
-#define __test_and_clear_bit arch___test_and_clear_bit
-
 #define arch___test_and_change_bit generic___test_and_change_bit
-#define __test_and_change_bit arch___test_and_change_bit
 
 #define arch_test_bit generic_test_bit
-#define test_bit arch_test_bit
+
+#include <asm-generic/bitops/non-instrumented-non-atomic.h>
 
 #endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */
diff --git a/include/asm-generic/bitops/non-instrumented-non-atomic.h b/include/asm-generic/bitops/non-instrumented-non-atomic.h
new file mode 100644
index 000000000000..e0fd7bf72a56
--- /dev/null
+++ b/include/asm-generic/bitops/non-instrumented-non-atomic.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H
+#define __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H
+
+#define __set_bit		arch___set_bit
+#define __clear_bit		arch___clear_bit
+#define __change_bit		arch___change_bit
+
+#define __test_and_set_bit	arch___test_and_set_bit
+#define __test_and_clear_bit	arch___test_and_clear_bit
+#define __test_and_change_bit	arch___test_and_change_bit
+
+#define test_bit		arch_test_bit
+
+#endif /* __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H */
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 7aaed501f768..87087454a288 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -26,12 +26,29 @@ extern unsigned int __sw_hweight16(unsigned int w);
 extern unsigned int __sw_hweight32(unsigned int w);
 extern unsigned long __sw_hweight64(__u64 w);
 
+#include <asm-generic/bitops/generic-non-atomic.h>
+
 /*
  * Include this here because some architectures need generic_ffs/fls in
  * scope
  */
 #include <asm/bitops.h>
 
+/* Check that the bitops prototypes are sane */
+#define __check_bitop_pr(name)						\
+	static_assert(__same_type(arch_##name, generic_##name) &&	\
+		      __same_type(name, generic_##name))
+
+__check_bitop_pr(__set_bit);
+__check_bitop_pr(__clear_bit);
+__check_bitop_pr(__change_bit);
+__check_bitop_pr(__test_and_set_bit);
+__check_bitop_pr(__test_and_clear_bit);
+__check_bitop_pr(__test_and_change_bit);
+__check_bitop_pr(test_bit);
+
+#undef __check_bitop_pr
+
 static inline int get_bitmask_order(unsigned int count)
 {
 	int order;
diff --git a/tools/include/asm-generic/bitops/non-atomic.h b/tools/include/asm-generic/bitops/non-atomic.h
index 7e10c4b50c5d..e5e78e42e57b 100644
--- a/tools/include/asm-generic/bitops/non-atomic.h
+++ b/tools/include/asm-generic/bitops/non-atomic.h
@@ -2,7 +2,7 @@
 #ifndef _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
 #define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
 
-#include <asm/types.h>
+#include <linux/bits.h>
 
 /**
  * __set_bit - Set a bit in memory
@@ -13,7 +13,8 @@
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static inline void __set_bit(int nr, volatile unsigned long *addr)
+static __always_inline void
+__set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -21,7 +22,8 @@ static inline void __set_bit(int nr, volatile unsigned long *addr)
 	*p  |= mask;
 }
 
-static inline void __clear_bit(int nr, volatile unsigned long *addr)
+static __always_inline void
+__clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -38,7 +40,8 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr)
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static inline void __change_bit(int nr, volatile unsigned long *addr)
+static __always_inline void
+__change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -55,7 +58,8 @@ static inline void __change_bit(int nr, volatile unsigned long *addr)
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
  */
-static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
+static __always_inline bool
+__test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -74,7 +78,8 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
  */
-static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
+static __always_inline bool
+__test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -85,8 +90,8 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
 }
 
 /* WARNING: non atomic and it can be reordered! */
-static inline int __test_and_change_bit(int nr,
-					    volatile unsigned long *addr)
+static __always_inline bool
+__test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -101,7 +106,8 @@ static inline int __test_and_change_bit(int nr,
  * @nr: bit number to test
  * @addr: Address to start counting from
  */
-static inline int test_bit(int nr, const volatile unsigned long *addr)
+static __always_inline bool
+test_bit(unsigned long nr, const volatile unsigned long *addr)
 {
 	return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
 }
-- 
cgit 


From bb7379bfa680bd48b468e856475778db2ad866c1 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alexandr.lobakin@intel.com>
Date: Fri, 24 Jun 2022 14:13:08 +0200
Subject: bitops: define const_*() versions of the non-atomics

Define const_*() variants of the non-atomic bitops to be used when
the input arguments are compile-time constants, so that the compiler
will be always able to resolve those to compile-time constants as
well. Those are mostly direct aliases for generic_*() with one
exception for const_test_bit(): the original one is declared
atomic-safe and thus doesn't discard the `volatile` qualifier, so
in order to let optimize code, define it separately disregarding
the qualifier.
Add them to the compile-time type checks as well just in case.

Suggested-by: Marco Elver <elver@google.com>
Signed-off-by: Alexander Lobakin <alexandr.lobakin@intel.com>
Reviewed-by: Marco Elver <elver@google.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/asm-generic/bitops/generic-non-atomic.h | 31 +++++++++++++++++++++++++
 include/linux/bitops.h                          |  1 +
 2 files changed, 32 insertions(+)

(limited to 'include')

diff --git a/include/asm-generic/bitops/generic-non-atomic.h b/include/asm-generic/bitops/generic-non-atomic.h
index b85b8a2ac239..3d5ebd24652b 100644
--- a/include/asm-generic/bitops/generic-non-atomic.h
+++ b/include/asm-generic/bitops/generic-non-atomic.h
@@ -127,4 +127,35 @@ generic_test_bit(unsigned long nr, const volatile unsigned long *addr)
 	return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
 }
 
+/*
+ * const_*() definitions provide good compile-time optimizations when
+ * the passed arguments can be resolved at compile time.
+ */
+#define const___set_bit			generic___set_bit
+#define const___clear_bit		generic___clear_bit
+#define const___change_bit		generic___change_bit
+#define const___test_and_set_bit	generic___test_and_set_bit
+#define const___test_and_clear_bit	generic___test_and_clear_bit
+#define const___test_and_change_bit	generic___test_and_change_bit
+
+/**
+ * const_test_bit - Determine whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ *
+ * A version of generic_test_bit() which discards the `volatile` qualifier to
+ * allow a compiler to optimize code harder. Non-atomic and to be called only
+ * for testing compile-time constants, e.g. by the corresponding macros, not
+ * directly from "regular" code.
+ */
+static __always_inline bool
+const_test_bit(unsigned long nr, const volatile unsigned long *addr)
+{
+	const unsigned long *p = (const unsigned long *)addr + BIT_WORD(nr);
+	unsigned long mask = BIT_MASK(nr);
+	unsigned long val = *p;
+
+	return !!(val & mask);
+}
+
 #endif /* __ASM_GENERIC_BITOPS_GENERIC_NON_ATOMIC_H */
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 87087454a288..d393297287d5 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -37,6 +37,7 @@ extern unsigned long __sw_hweight64(__u64 w);
 /* Check that the bitops prototypes are sane */
 #define __check_bitop_pr(name)						\
 	static_assert(__same_type(arch_##name, generic_##name) &&	\
+		      __same_type(const_##name, generic_##name) &&	\
 		      __same_type(name, generic_##name))
 
 __check_bitop_pr(__set_bit);
-- 
cgit 


From e69eb9c460f128b71c6b995d75a05244e4b6cc3e Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alexandr.lobakin@intel.com>
Date: Fri, 24 Jun 2022 14:13:09 +0200
Subject: bitops: wrap non-atomic bitops with a transparent macro

In preparation for altering the non-atomic bitops with a macro, wrap
them in a transparent definition. This requires prepending one more
'_' to their names in order to be able to do that seamlessly. It is
a simple change, given that all the non-prefixed definitions are now
in asm-generic.
sparc32 already has several triple-underscored functions, so I had
to rename them ('___' -> 'sp32_').

Signed-off-by: Alexander Lobakin <alexandr.lobakin@intel.com>
Reviewed-by: Marco Elver <elver@google.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 arch/sparc/include/asm/bitops_32.h                 | 18 +++++++-------
 arch/sparc/lib/atomic32.c                          | 12 +++++-----
 .../asm-generic/bitops/instrumented-non-atomic.h   | 28 +++++++++++-----------
 .../bitops/non-instrumented-non-atomic.h           | 14 +++++------
 include/linux/bitops.h                             | 18 +++++++++++++-
 tools/include/asm-generic/bitops/non-atomic.h      | 24 +++++++++----------
 tools/include/linux/bitops.h                       | 16 +++++++++++++
 7 files changed, 81 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/arch/sparc/include/asm/bitops_32.h b/arch/sparc/include/asm/bitops_32.h
index 889afa9f990f..3448c191b484 100644
--- a/arch/sparc/include/asm/bitops_32.h
+++ b/arch/sparc/include/asm/bitops_32.h
@@ -19,9 +19,9 @@
 #error only <linux/bitops.h> can be included directly
 #endif
 
-unsigned long ___set_bit(unsigned long *addr, unsigned long mask);
-unsigned long ___clear_bit(unsigned long *addr, unsigned long mask);
-unsigned long ___change_bit(unsigned long *addr, unsigned long mask);
+unsigned long sp32___set_bit(unsigned long *addr, unsigned long mask);
+unsigned long sp32___clear_bit(unsigned long *addr, unsigned long mask);
+unsigned long sp32___change_bit(unsigned long *addr, unsigned long mask);
 
 /*
  * Set bit 'nr' in 32-bit quantity at address 'addr' where bit '0'
@@ -36,7 +36,7 @@ static inline int test_and_set_bit(unsigned long nr, volatile unsigned long *add
 	ADDR = ((unsigned long *) addr) + (nr >> 5);
 	mask = 1 << (nr & 31);
 
-	return ___set_bit(ADDR, mask) != 0;
+	return sp32___set_bit(ADDR, mask) != 0;
 }
 
 static inline void set_bit(unsigned long nr, volatile unsigned long *addr)
@@ -46,7 +46,7 @@ static inline void set_bit(unsigned long nr, volatile unsigned long *addr)
 	ADDR = ((unsigned long *) addr) + (nr >> 5);
 	mask = 1 << (nr & 31);
 
-	(void) ___set_bit(ADDR, mask);
+	(void) sp32___set_bit(ADDR, mask);
 }
 
 static inline int test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
@@ -56,7 +56,7 @@ static inline int test_and_clear_bit(unsigned long nr, volatile unsigned long *a
 	ADDR = ((unsigned long *) addr) + (nr >> 5);
 	mask = 1 << (nr & 31);
 
-	return ___clear_bit(ADDR, mask) != 0;
+	return sp32___clear_bit(ADDR, mask) != 0;
 }
 
 static inline void clear_bit(unsigned long nr, volatile unsigned long *addr)
@@ -66,7 +66,7 @@ static inline void clear_bit(unsigned long nr, volatile unsigned long *addr)
 	ADDR = ((unsigned long *) addr) + (nr >> 5);
 	mask = 1 << (nr & 31);
 
-	(void) ___clear_bit(ADDR, mask);
+	(void) sp32___clear_bit(ADDR, mask);
 }
 
 static inline int test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
@@ -76,7 +76,7 @@ static inline int test_and_change_bit(unsigned long nr, volatile unsigned long *
 	ADDR = ((unsigned long *) addr) + (nr >> 5);
 	mask = 1 << (nr & 31);
 
-	return ___change_bit(ADDR, mask) != 0;
+	return sp32___change_bit(ADDR, mask) != 0;
 }
 
 static inline void change_bit(unsigned long nr, volatile unsigned long *addr)
@@ -86,7 +86,7 @@ static inline void change_bit(unsigned long nr, volatile unsigned long *addr)
 	ADDR = ((unsigned long *) addr) + (nr >> 5);
 	mask = 1 << (nr & 31);
 
-	(void) ___change_bit(ADDR, mask);
+	(void) sp32___change_bit(ADDR, mask);
 }
 
 #include <asm-generic/bitops/non-atomic.h>
diff --git a/arch/sparc/lib/atomic32.c b/arch/sparc/lib/atomic32.c
index 8b81d0f00c97..cf80d1ae352b 100644
--- a/arch/sparc/lib/atomic32.c
+++ b/arch/sparc/lib/atomic32.c
@@ -120,7 +120,7 @@ void arch_atomic_set(atomic_t *v, int i)
 }
 EXPORT_SYMBOL(arch_atomic_set);
 
-unsigned long ___set_bit(unsigned long *addr, unsigned long mask)
+unsigned long sp32___set_bit(unsigned long *addr, unsigned long mask)
 {
 	unsigned long old, flags;
 
@@ -131,9 +131,9 @@ unsigned long ___set_bit(unsigned long *addr, unsigned long mask)
 
 	return old & mask;
 }
-EXPORT_SYMBOL(___set_bit);
+EXPORT_SYMBOL(sp32___set_bit);
 
-unsigned long ___clear_bit(unsigned long *addr, unsigned long mask)
+unsigned long sp32___clear_bit(unsigned long *addr, unsigned long mask)
 {
 	unsigned long old, flags;
 
@@ -144,9 +144,9 @@ unsigned long ___clear_bit(unsigned long *addr, unsigned long mask)
 
 	return old & mask;
 }
-EXPORT_SYMBOL(___clear_bit);
+EXPORT_SYMBOL(sp32___clear_bit);
 
-unsigned long ___change_bit(unsigned long *addr, unsigned long mask)
+unsigned long sp32___change_bit(unsigned long *addr, unsigned long mask)
 {
 	unsigned long old, flags;
 
@@ -157,7 +157,7 @@ unsigned long ___change_bit(unsigned long *addr, unsigned long mask)
 
 	return old & mask;
 }
-EXPORT_SYMBOL(___change_bit);
+EXPORT_SYMBOL(sp32___change_bit);
 
 unsigned long __cmpxchg_u32(volatile u32 *ptr, u32 old, u32 new)
 {
diff --git a/include/asm-generic/bitops/instrumented-non-atomic.h b/include/asm-generic/bitops/instrumented-non-atomic.h
index b019f77ef21c..988a3bbfba34 100644
--- a/include/asm-generic/bitops/instrumented-non-atomic.h
+++ b/include/asm-generic/bitops/instrumented-non-atomic.h
@@ -14,7 +14,7 @@
 #include <linux/instrumented.h>
 
 /**
- * __set_bit - Set a bit in memory
+ * ___set_bit - Set a bit in memory
  * @nr: the bit to set
  * @addr: the address to start counting from
  *
@@ -23,14 +23,14 @@
  * succeeds.
  */
 static __always_inline void
-__set_bit(unsigned long nr, volatile unsigned long *addr)
+___set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	instrument_write(addr + BIT_WORD(nr), sizeof(long));
 	arch___set_bit(nr, addr);
 }
 
 /**
- * __clear_bit - Clears a bit in memory
+ * ___clear_bit - Clears a bit in memory
  * @nr: the bit to clear
  * @addr: the address to start counting from
  *
@@ -39,14 +39,14 @@ __set_bit(unsigned long nr, volatile unsigned long *addr)
  * succeeds.
  */
 static __always_inline void
-__clear_bit(unsigned long nr, volatile unsigned long *addr)
+___clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	instrument_write(addr + BIT_WORD(nr), sizeof(long));
 	arch___clear_bit(nr, addr);
 }
 
 /**
- * __change_bit - Toggle a bit in memory
+ * ___change_bit - Toggle a bit in memory
  * @nr: the bit to change
  * @addr: the address to start counting from
  *
@@ -55,7 +55,7 @@ __clear_bit(unsigned long nr, volatile unsigned long *addr)
  * succeeds.
  */
 static __always_inline void
-__change_bit(unsigned long nr, volatile unsigned long *addr)
+___change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	instrument_write(addr + BIT_WORD(nr), sizeof(long));
 	arch___change_bit(nr, addr);
@@ -86,7 +86,7 @@ static __always_inline void __instrument_read_write_bitop(long nr, volatile unsi
 }
 
 /**
- * __test_and_set_bit - Set a bit and return its old value
+ * ___test_and_set_bit - Set a bit and return its old value
  * @nr: Bit to set
  * @addr: Address to count from
  *
@@ -94,14 +94,14 @@ static __always_inline void __instrument_read_write_bitop(long nr, volatile unsi
  * can appear to succeed but actually fail.
  */
 static __always_inline bool
-__test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
+___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	__instrument_read_write_bitop(nr, addr);
 	return arch___test_and_set_bit(nr, addr);
 }
 
 /**
- * __test_and_clear_bit - Clear a bit and return its old value
+ * ___test_and_clear_bit - Clear a bit and return its old value
  * @nr: Bit to clear
  * @addr: Address to count from
  *
@@ -109,14 +109,14 @@ __test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
  * can appear to succeed but actually fail.
  */
 static __always_inline bool
-__test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
+___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	__instrument_read_write_bitop(nr, addr);
 	return arch___test_and_clear_bit(nr, addr);
 }
 
 /**
- * __test_and_change_bit - Change a bit and return its old value
+ * ___test_and_change_bit - Change a bit and return its old value
  * @nr: Bit to change
  * @addr: Address to count from
  *
@@ -124,19 +124,19 @@ __test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
  * can appear to succeed but actually fail.
  */
 static __always_inline bool
-__test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
+___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	__instrument_read_write_bitop(nr, addr);
 	return arch___test_and_change_bit(nr, addr);
 }
 
 /**
- * test_bit - Determine whether a bit is set
+ * _test_bit - Determine whether a bit is set
  * @nr: bit number to test
  * @addr: Address to start counting from
  */
 static __always_inline bool
-test_bit(unsigned long nr, const volatile unsigned long *addr)
+_test_bit(unsigned long nr, const volatile unsigned long *addr)
 {
 	instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long));
 	return arch_test_bit(nr, addr);
diff --git a/include/asm-generic/bitops/non-instrumented-non-atomic.h b/include/asm-generic/bitops/non-instrumented-non-atomic.h
index e0fd7bf72a56..bdb9b1ffaee9 100644
--- a/include/asm-generic/bitops/non-instrumented-non-atomic.h
+++ b/include/asm-generic/bitops/non-instrumented-non-atomic.h
@@ -3,14 +3,14 @@
 #ifndef __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H
 #define __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H
 
-#define __set_bit		arch___set_bit
-#define __clear_bit		arch___clear_bit
-#define __change_bit		arch___change_bit
+#define ___set_bit		arch___set_bit
+#define ___clear_bit		arch___clear_bit
+#define ___change_bit		arch___change_bit
 
-#define __test_and_set_bit	arch___test_and_set_bit
-#define __test_and_clear_bit	arch___test_and_clear_bit
-#define __test_and_change_bit	arch___test_and_change_bit
+#define ___test_and_set_bit	arch___test_and_set_bit
+#define ___test_and_clear_bit	arch___test_and_clear_bit
+#define ___test_and_change_bit	arch___test_and_change_bit
 
-#define test_bit		arch_test_bit
+#define _test_bit		arch_test_bit
 
 #endif /* __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H */
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index d393297287d5..3c3afbae1533 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -26,8 +26,24 @@ extern unsigned int __sw_hweight16(unsigned int w);
 extern unsigned int __sw_hweight32(unsigned int w);
 extern unsigned long __sw_hweight64(__u64 w);
 
+/*
+ * Defined here because those may be needed by architecture-specific static
+ * inlines.
+ */
+
 #include <asm-generic/bitops/generic-non-atomic.h>
 
+#define bitop(op, nr, addr)						\
+	op(nr, addr)
+
+#define __set_bit(nr, addr)		bitop(___set_bit, nr, addr)
+#define __clear_bit(nr, addr)		bitop(___clear_bit, nr, addr)
+#define __change_bit(nr, addr)		bitop(___change_bit, nr, addr)
+#define __test_and_set_bit(nr, addr)	bitop(___test_and_set_bit, nr, addr)
+#define __test_and_clear_bit(nr, addr)	bitop(___test_and_clear_bit, nr, addr)
+#define __test_and_change_bit(nr, addr)	bitop(___test_and_change_bit, nr, addr)
+#define test_bit(nr, addr)		bitop(_test_bit, nr, addr)
+
 /*
  * Include this here because some architectures need generic_ffs/fls in
  * scope
@@ -38,7 +54,7 @@ extern unsigned long __sw_hweight64(__u64 w);
 #define __check_bitop_pr(name)						\
 	static_assert(__same_type(arch_##name, generic_##name) &&	\
 		      __same_type(const_##name, generic_##name) &&	\
-		      __same_type(name, generic_##name))
+		      __same_type(_##name, generic_##name))
 
 __check_bitop_pr(__set_bit);
 __check_bitop_pr(__clear_bit);
diff --git a/tools/include/asm-generic/bitops/non-atomic.h b/tools/include/asm-generic/bitops/non-atomic.h
index e5e78e42e57b..0c472a833408 100644
--- a/tools/include/asm-generic/bitops/non-atomic.h
+++ b/tools/include/asm-generic/bitops/non-atomic.h
@@ -5,7 +5,7 @@
 #include <linux/bits.h>
 
 /**
- * __set_bit - Set a bit in memory
+ * ___set_bit - Set a bit in memory
  * @nr: the bit to set
  * @addr: the address to start counting from
  *
@@ -14,7 +14,7 @@
  * may be that only one operation succeeds.
  */
 static __always_inline void
-__set_bit(unsigned long nr, volatile unsigned long *addr)
+___set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -23,7 +23,7 @@ __set_bit(unsigned long nr, volatile unsigned long *addr)
 }
 
 static __always_inline void
-__clear_bit(unsigned long nr, volatile unsigned long *addr)
+___clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -32,7 +32,7 @@ __clear_bit(unsigned long nr, volatile unsigned long *addr)
 }
 
 /**
- * __change_bit - Toggle a bit in memory
+ * ___change_bit - Toggle a bit in memory
  * @nr: the bit to change
  * @addr: the address to start counting from
  *
@@ -41,7 +41,7 @@ __clear_bit(unsigned long nr, volatile unsigned long *addr)
  * may be that only one operation succeeds.
  */
 static __always_inline void
-__change_bit(unsigned long nr, volatile unsigned long *addr)
+___change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -50,7 +50,7 @@ __change_bit(unsigned long nr, volatile unsigned long *addr)
 }
 
 /**
- * __test_and_set_bit - Set a bit and return its old value
+ * ___test_and_set_bit - Set a bit and return its old value
  * @nr: Bit to set
  * @addr: Address to count from
  *
@@ -59,7 +59,7 @@ __change_bit(unsigned long nr, volatile unsigned long *addr)
  * but actually fail.  You must protect multiple accesses with a lock.
  */
 static __always_inline bool
-__test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
+___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -70,7 +70,7 @@ __test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 }
 
 /**
- * __test_and_clear_bit - Clear a bit and return its old value
+ * ___test_and_clear_bit - Clear a bit and return its old value
  * @nr: Bit to clear
  * @addr: Address to count from
  *
@@ -79,7 +79,7 @@ __test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
  * but actually fail.  You must protect multiple accesses with a lock.
  */
 static __always_inline bool
-__test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
+___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -91,7 +91,7 @@ __test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 
 /* WARNING: non atomic and it can be reordered! */
 static __always_inline bool
-__test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
+___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -102,12 +102,12 @@ __test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 }
 
 /**
- * test_bit - Determine whether a bit is set
+ * _test_bit - Determine whether a bit is set
  * @nr: bit number to test
  * @addr: Address to start counting from
  */
 static __always_inline bool
-test_bit(unsigned long nr, const volatile unsigned long *addr)
+_test_bit(unsigned long nr, const volatile unsigned long *addr)
 {
 	return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
 }
diff --git a/tools/include/linux/bitops.h b/tools/include/linux/bitops.h
index 5fca38fe1ba8..f18683b95ea6 100644
--- a/tools/include/linux/bitops.h
+++ b/tools/include/linux/bitops.h
@@ -25,6 +25,22 @@ extern unsigned int __sw_hweight16(unsigned int w);
 extern unsigned int __sw_hweight32(unsigned int w);
 extern unsigned long __sw_hweight64(__u64 w);
 
+/*
+ * Defined here because those may be needed by architecture-specific static
+ * inlines.
+ */
+
+#define bitop(op, nr, addr)						\
+	op(nr, addr)
+
+#define __set_bit(nr, addr)		bitop(___set_bit, nr, addr)
+#define __clear_bit(nr, addr)		bitop(___clear_bit, nr, addr)
+#define __change_bit(nr, addr)		bitop(___change_bit, nr, addr)
+#define __test_and_set_bit(nr, addr)	bitop(___test_and_set_bit, nr, addr)
+#define __test_and_clear_bit(nr, addr)	bitop(___test_and_clear_bit, nr, addr)
+#define __test_and_change_bit(nr, addr)	bitop(___test_and_change_bit, nr, addr)
+#define test_bit(nr, addr)		bitop(_test_bit, nr, addr)
+
 /*
  * Include this here because some architectures need generic_ffs/fls in
  * scope
-- 
cgit 


From b03fc1173c0c2bb8fad61902a862985cecdc4b1b Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alexandr.lobakin@intel.com>
Date: Fri, 24 Jun 2022 14:13:10 +0200
Subject: bitops: let optimize out non-atomic bitops on compile-time constants

Currently, many architecture-specific non-atomic bitop
implementations use inline asm or other hacks which are faster or
more robust when working with "real" variables (i.e. fields from
the structures etc.), but the compilers have no clue how to optimize
them out when called on compile-time constants. That said, the
following code:

	DECLARE_BITMAP(foo, BITS_PER_LONG) = { }; // -> unsigned long foo[1];
	unsigned long bar = BIT(BAR_BIT);
	unsigned long baz = 0;

	__set_bit(FOO_BIT, foo);
	baz |= BIT(BAZ_BIT);

	BUILD_BUG_ON(!__builtin_constant_p(test_bit(FOO_BIT, foo));
	BUILD_BUG_ON(!__builtin_constant_p(bar & BAR_BIT));
	BUILD_BUG_ON(!__builtin_constant_p(baz & BAZ_BIT));

triggers the first assertion on x86_64, which means that the
compiler is unable to evaluate it to a compile-time initializer
when the architecture-specific bitop is used even if it's obvious.
In order to let the compiler optimize out such cases, expand the
bitop() macro to use the "constant" C non-atomic bitop
implementations when all of the arguments passed are compile-time
constants, which means that the result will be a compile-time
constant as well, so that it produces more efficient and simple
code in 100% cases, comparing to the architecture-specific
counterparts.

The savings are architecture, compiler and compiler flags dependent,
for example, on x86_64 -O2:

GCC 12: add/remove: 78/29 grow/shrink: 332/525 up/down: 31325/-61560 (-30235)
LLVM 13: add/remove: 79/76 grow/shrink: 184/537 up/down: 55076/-141892 (-86816)
LLVM 14: add/remove: 10/3 grow/shrink: 93/138 up/down: 3705/-6992 (-3287)

and ARM64 (courtesy of Mark):

GCC 11: add/remove: 92/29 grow/shrink: 933/2766 up/down: 39340/-82580 (-43240)
LLVM 14: add/remove: 21/11 grow/shrink: 620/651 up/down: 12060/-15824 (-3764)

Cc: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Alexander Lobakin <alexandr.lobakin@intel.com>
Reviewed-by: Marco Elver <elver@google.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/bitops.h | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 3c3afbae1533..cf9bf65039f2 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -33,8 +33,24 @@ extern unsigned long __sw_hweight64(__u64 w);
 
 #include <asm-generic/bitops/generic-non-atomic.h>
 
+/*
+ * Many architecture-specific non-atomic bitops contain inline asm code and due
+ * to that the compiler can't optimize them to compile-time expressions or
+ * constants. In contrary, generic_*() helpers are defined in pure C and
+ * compilers optimize them just well.
+ * Therefore, to make `unsigned long foo = 0; __set_bit(BAR, &foo)` effectively
+ * equal to `unsigned long foo = BIT(BAR)`, pick the generic C alternative when
+ * the arguments can be resolved at compile time. That expression itself is a
+ * constant and doesn't bring any functional changes to the rest of cases.
+ * The casts to `uintptr_t` are needed to mitigate `-Waddress` warnings when
+ * passing a bitmap from .bss or .data (-> `!!addr` is always true).
+ */
 #define bitop(op, nr, addr)						\
-	op(nr, addr)
+	((__builtin_constant_p(nr) &&					\
+	  __builtin_constant_p((uintptr_t)(addr) != (uintptr_t)NULL) &&	\
+	  (uintptr_t)(addr) != (uintptr_t)NULL &&			\
+	  __builtin_constant_p(*(const unsigned long *)(addr))) ?	\
+	 const##op(nr, addr) : op(nr, addr))
 
 #define __set_bit(nr, addr)		bitop(___set_bit, nr, addr)
 #define __clear_bit(nr, addr)		bitop(___clear_bit, nr, addr)
-- 
cgit 


From 3e7e5baaaba78075a7f3a57432609e363bf2a486 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alexandr.lobakin@intel.com>
Date: Fri, 24 Jun 2022 14:13:12 +0200
Subject: bitmap: don't assume compiler evaluates small mem*() builtins calls

Intel kernel bot triggered the build bug on ARC architecture that
in fact is as follows:

	DECLARE_BITMAP(bitmap, BITS_PER_LONG);

	bitmap_clear(bitmap, 0, BITS_PER_LONG);
	BUILD_BUG_ON(!__builtin_constant_p(*bitmap));

which can be expanded to:

	unsigned long bitmap[1];

	memset(bitmap, 0, sizeof(*bitmap));
	BUILD_BUG_ON(!__builtin_constant_p(*bitmap));

In most cases, a compiler is able to expand small/simple mem*()
calls to simple assignments or bitops, in this case that would mean:

	unsigned long bitmap[1] = { 0 };

	BUILD_BUG_ON(!__builtin_constant_p(*bitmap));

and on most architectures this works, but not on ARC, despite having
-O3 for every build.
So, to make this work, in case when the last bit to modify is still
within the first long (small_const_nbits()), just use plain
assignments for the rest of bitmap_*() functions which still use
mem*(), but didn't receive such compile-time optimizations yet.
This doesn't have the same coverage as compilers provide, but at
least something to start:

text: add/remove: 3/7 grow/shrink: 43/78 up/down: 1848/-3370 (-1546)
data: add/remove: 1/11 grow/shrink: 0/8 up/down: 4/-356 (-352)

notably cpumask_*() family when NR_CPUS <= BITS_PER_LONG:

netif_get_num_default_rss_queues              38       4     -34
cpumask_copy                                  90       -     -90
cpumask_clear                                146       -    -146

and the abovementioned assertion started passing.

Signed-off-by: Alexander Lobakin <alexandr.lobakin@intel.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/bitmap.h | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index f091a1664bf1..c91638e507f2 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -238,20 +238,32 @@ extern int bitmap_print_list_to_buf(char *buf, const unsigned long *maskp,
 static inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
 {
 	unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
-	memset(dst, 0, len);
+
+	if (small_const_nbits(nbits))
+		*dst = 0;
+	else
+		memset(dst, 0, len);
 }
 
 static inline void bitmap_fill(unsigned long *dst, unsigned int nbits)
 {
 	unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
-	memset(dst, 0xff, len);
+
+	if (small_const_nbits(nbits))
+		*dst = ~0UL;
+	else
+		memset(dst, 0xff, len);
 }
 
 static inline void bitmap_copy(unsigned long *dst, const unsigned long *src,
 			unsigned int nbits)
 {
 	unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
-	memcpy(dst, src, len);
+
+	if (small_const_nbits(nbits))
+		*dst = *src;
+	else
+		memcpy(dst, src, len);
 }
 
 /*
@@ -431,6 +443,8 @@ static __always_inline void bitmap_set(unsigned long *map, unsigned int start,
 {
 	if (__builtin_constant_p(nbits) && nbits == 1)
 		__set_bit(start, map);
+	else if (small_const_nbits(start + nbits))
+		*map |= GENMASK(start + nbits - 1, start);
 	else if (__builtin_constant_p(start & BITMAP_MEM_MASK) &&
 		 IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) &&
 		 __builtin_constant_p(nbits & BITMAP_MEM_MASK) &&
@@ -445,6 +459,8 @@ static __always_inline void bitmap_clear(unsigned long *map, unsigned int start,
 {
 	if (__builtin_constant_p(nbits) && nbits == 1)
 		__clear_bit(start, map);
+	else if (small_const_nbits(start + nbits))
+		*map &= ~GENMASK(start + nbits - 1, start);
 	else if (__builtin_constant_p(start & BITMAP_MEM_MASK) &&
 		 IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) &&
 		 __builtin_constant_p(nbits & BITMAP_MEM_MASK) &&
-- 
cgit 


From 1c9017e44af2eee94b1001af18c401ae440ad77c Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 28 Jun 2022 17:52:35 +0300
Subject: net: dsa: felix: keep reference on entire tc-taprio config

In a future change we will need to remember the entire tc-taprio config
on all ports rather than just the base time, so use the
taprio_offload_get() helper function to replace ocelot_port->base_time
with ocelot_port->taprio.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/ocelot/felix_vsc9959.c | 23 +++++++++++++----------
 include/soc/mscc/ocelot.h              |  5 ++---
 2 files changed, 15 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c
index 693cd6ffbace..12792362bbee 100644
--- a/drivers/net/dsa/ocelot/felix_vsc9959.c
+++ b/drivers/net/dsa/ocelot/felix_vsc9959.c
@@ -1210,6 +1210,9 @@ static int vsc9959_qos_port_tas_set(struct ocelot *ocelot, int port,
 			       QSYS_TAG_CONFIG_INIT_GATE_STATE_M,
 			       QSYS_TAG_CONFIG, port);
 
+		taprio_offload_free(ocelot_port->taprio);
+		ocelot_port->taprio = NULL;
+
 		mutex_unlock(&ocelot->tas_lock);
 		return 0;
 	}
@@ -1258,8 +1261,6 @@ static int vsc9959_qos_port_tas_set(struct ocelot *ocelot, int port,
 		       QSYS_TAG_CONFIG_SCH_TRAFFIC_QUEUES_M,
 		       QSYS_TAG_CONFIG, port);
 
-	ocelot_port->base_time = taprio->base_time;
-
 	vsc9959_new_base_time(ocelot, taprio->base_time,
 			      taprio->cycle_time, &base_ts);
 	ocelot_write(ocelot, base_ts.tv_nsec, QSYS_PARAM_CFG_REG_1);
@@ -1282,6 +1283,10 @@ static int vsc9959_qos_port_tas_set(struct ocelot *ocelot, int port,
 	ret = readx_poll_timeout(vsc9959_tas_read_cfg_status, ocelot, val,
 				 !(val & QSYS_TAS_PARAM_CFG_CTRL_CONFIG_CHANGE),
 				 10, 100000);
+	if (ret)
+		goto err;
+
+	ocelot_port->taprio = taprio_offload_get(taprio);
 
 err:
 	mutex_unlock(&ocelot->tas_lock);
@@ -1291,17 +1296,18 @@ err:
 
 static void vsc9959_tas_clock_adjust(struct ocelot *ocelot)
 {
+	struct tc_taprio_qopt_offload *taprio;
 	struct ocelot_port *ocelot_port;
 	struct timespec64 base_ts;
-	u64 cycletime;
 	int port;
 	u32 val;
 
 	mutex_lock(&ocelot->tas_lock);
 
 	for (port = 0; port < ocelot->num_phys_ports; port++) {
-		val = ocelot_read_rix(ocelot, QSYS_TAG_CONFIG, port);
-		if (!(val & QSYS_TAG_CONFIG_ENABLE))
+		ocelot_port = ocelot->ports[port];
+		taprio = ocelot_port->taprio;
+		if (!taprio)
 			continue;
 
 		ocelot_rmw(ocelot,
@@ -1315,11 +1321,8 @@ static void vsc9959_tas_clock_adjust(struct ocelot *ocelot)
 			       QSYS_TAG_CONFIG_INIT_GATE_STATE_M,
 			       QSYS_TAG_CONFIG, port);
 
-		cycletime = ocelot_read(ocelot, QSYS_PARAM_CFG_REG_4);
-		ocelot_port = ocelot->ports[port];
-
-		vsc9959_new_base_time(ocelot, ocelot_port->base_time,
-				      cycletime, &base_ts);
+		vsc9959_new_base_time(ocelot, taprio->base_time,
+				      taprio->cycle_time, &base_ts);
 
 		ocelot_write(ocelot, base_ts.tv_nsec, QSYS_PARAM_CFG_REG_1);
 		ocelot_write(ocelot, lower_32_bits(base_ts.tv_sec),
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 3737570116c3..ac151ecc7f19 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -670,6 +670,8 @@ struct ocelot_port {
 	/* VLAN that untagged frames are classified to, on ingress */
 	const struct ocelot_bridge_vlan	*pvid_vlan;
 
+	struct tc_taprio_qopt_offload	*taprio;
+
 	phy_interface_t			phy_mode;
 
 	unsigned int			ptp_skbs_in_flight;
@@ -692,9 +694,6 @@ struct ocelot_port {
 	int				bridge_num;
 
 	int				speed;
-
-	/* Store the AdminBaseTime of EST fetched from userspace. */
-	s64				base_time;
 };
 
 struct ocelot {
-- 
cgit 


From 837ced3a1a5d8bb1a637dd584711f31ae6b54d93 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 28 Jun 2022 17:52:38 +0300
Subject: time64.h: consolidate uses of PSEC_PER_NSEC

Time-sensitive networking code needs to work with PTP times expressed in
nanoseconds, and with packet transmission times expressed in
picoseconds, since those would be fractional at higher than gigabit
speed when expressed in nanoseconds.

Convert the existing uses in tc-taprio and the ocelot/felix DSA driver
to a PSEC_PER_NSEC macro. This macro is placed in include/linux/time64.h
as opposed to its relatives (PSEC_PER_SEC etc) from include/vdso/time64.h
because the vDSO library does not (yet) need/use it.

Cc: Andy Lutomirski <luto@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Vincenzo Frascino <vincenzo.frascino@arm.com> # for the vDSO parts
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/ocelot/felix_vsc9959.c | 5 +++--
 include/linux/time64.h                 | 3 +++
 net/sched/sch_taprio.c                 | 5 +++--
 3 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c
index f51d9d5f0d82..61ed317602e7 100644
--- a/drivers/net/dsa/ocelot/felix_vsc9959.c
+++ b/drivers/net/dsa/ocelot/felix_vsc9959.c
@@ -16,6 +16,7 @@
 #include <linux/iopoll.h>
 #include <linux/mdio.h>
 #include <linux/pci.h>
+#include <linux/time.h>
 #include "felix.h"
 
 #define VSC9959_NUM_PORTS		6
@@ -1235,7 +1236,7 @@ static void vsc9959_tas_guard_bands_update(struct ocelot *ocelot, int port)
 		u32 max_sdu;
 
 		if (min_gate_len[tc] == U64_MAX /* Gate always open */ ||
-		    min_gate_len[tc] * 1000 > needed_bit_time_ps) {
+		    min_gate_len[tc] * PSEC_PER_NSEC > needed_bit_time_ps) {
 			/* Setting QMAXSDU_CFG to 0 disables oversized frame
 			 * dropping.
 			 */
@@ -1249,7 +1250,7 @@ static void vsc9959_tas_guard_bands_update(struct ocelot *ocelot, int port)
 			 * frame, make sure to enable oversize frame dropping
 			 * for frames larger than the smallest that would fit.
 			 */
-			max_sdu = div_u64(min_gate_len[tc] * 1000,
+			max_sdu = div_u64(min_gate_len[tc] * PSEC_PER_NSEC,
 					  picos_per_byte);
 			/* A TC gate may be completely closed, which is a
 			 * special case where all packets are oversized.
diff --git a/include/linux/time64.h b/include/linux/time64.h
index 81b9686a2079..2fb8232cff1d 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -20,6 +20,9 @@ struct itimerspec64 {
 	struct timespec64 it_value;
 };
 
+/* Parameters used to convert the timespec values: */
+#define PSEC_PER_NSEC			1000L
+
 /* Located here for timespec[64]_valid_strict */
 #define TIME64_MAX			((s64)~((u64)1 << 63))
 #define TIME64_MIN			(-TIME64_MAX - 1)
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index b9c71a304d39..0b941dd63d26 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -18,6 +18,7 @@
 #include <linux/module.h>
 #include <linux/spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/time.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 #include <net/pkt_cls.h>
@@ -176,7 +177,7 @@ static ktime_t get_interval_end_time(struct sched_gate_list *sched,
 
 static int length_to_duration(struct taprio_sched *q, int len)
 {
-	return div_u64(len * atomic64_read(&q->picos_per_byte), 1000);
+	return div_u64(len * atomic64_read(&q->picos_per_byte), PSEC_PER_NSEC);
 }
 
 /* Returns the entry corresponding to next available interval. If
@@ -551,7 +552,7 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch)
 static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry)
 {
 	atomic_set(&entry->budget,
-		   div64_u64((u64)entry->interval * 1000,
+		   div64_u64((u64)entry->interval * PSEC_PER_NSEC,
 			     atomic64_read(&q->picos_per_byte)));
 }
 
-- 
cgit 


From 34e3b69b1edc966f0f4dcdd880afba3a2dad8c09 Mon Sep 17 00:00:00 2001
From: Phil Edworthy <phil.edworthy@renesas.com>
Date: Fri, 24 Jun 2022 09:48:32 +0100
Subject: dt-bindings: pinctrl: Add DT bindings for Renesas RZ/V2M pinctrl

Add device tree binding documentation and header file for Renesas
RZ/V2M pinctrl.

Signed-off-by: Phil Edworthy <phil.edworthy@renesas.com>
Reviewed-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20220624084833.22605-2-phil.edworthy@renesas.com
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 .../bindings/pinctrl/renesas,rzv2m-pinctrl.yaml    | 170 +++++++++++++++++++++
 include/dt-bindings/pinctrl/rzv2m-pinctrl.h        |  23 +++
 2 files changed, 193 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/pinctrl/renesas,rzv2m-pinctrl.yaml
 create mode 100644 include/dt-bindings/pinctrl/rzv2m-pinctrl.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/pinctrl/renesas,rzv2m-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/renesas,rzv2m-pinctrl.yaml
new file mode 100644
index 000000000000..eac6245db7dc
--- /dev/null
+++ b/Documentation/devicetree/bindings/pinctrl/renesas,rzv2m-pinctrl.yaml
@@ -0,0 +1,170 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pinctrl/renesas,rzv2m-pinctrl.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Renesas RZ/V2M combined Pin and GPIO controller
+
+maintainers:
+  - Geert Uytterhoeven <geert+renesas@glider.be>
+  - Phil Edworthy <phil.edworthy@renesas.com>
+
+description:
+  The Renesas RZ/V2M SoC features a combined Pin and GPIO controller.
+  Pin multiplexing and GPIO configuration is performed on a per-pin basis.
+  Each port features up to 16 pins, each of them configurable for GPIO function
+  (port mode) or in alternate function mode.
+  Up to 8 different alternate function modes exist for each single pin.
+
+properties:
+  compatible:
+    const: renesas,r9a09g011-pinctrl # RZ/V2M
+
+  reg:
+    maxItems: 1
+
+  gpio-controller: true
+
+  '#gpio-cells':
+    const: 2
+    description:
+      The first cell contains the global GPIO port index, constructed using the
+      RZV2M_GPIO() helper macro in <dt-bindings/pinctrl/rzv2m-pinctrl.h> and the
+      second cell represents consumer flag as mentioned in ../gpio/gpio.txt
+      E.g. "RZV2M_GPIO(8, 1)" for P8_1.
+
+  gpio-ranges:
+    maxItems: 1
+
+  interrupts:
+    description: INEXINT[0..38] corresponding to individual pin inputs.
+    maxItems: 39
+
+  clocks:
+    maxItems: 1
+
+  power-domains:
+    maxItems: 1
+
+  resets:
+    maxItems: 1
+
+additionalProperties:
+  anyOf:
+    - type: object
+      allOf:
+        - $ref: pincfg-node.yaml#
+        - $ref: pinmux-node.yaml#
+
+      description:
+        Pin controller client devices use pin configuration subnodes (children
+        and grandchildren) for desired pin configuration.
+        Client device subnodes use below standard properties.
+
+      properties:
+        phandle: true
+        pinmux:
+          description:
+            Values are constructed from GPIO port number, pin number, and
+            alternate function configuration number using the RZV2M_PORT_PINMUX()
+            helper macro in <dt-bindings/pinctrl/rzv2m-pinctrl.h>.
+        pins: true
+        bias-disable: true
+        bias-pull-down: true
+        bias-pull-up: true
+        drive-strength-microamp:
+          # Superset of supported values
+          enum: [ 1600, 1800, 2000, 3200, 3800, 4000, 6400, 7800, 8000,
+                  9000, 9600, 11000, 12000, 13000, 18000 ]
+        slew-rate:
+          description: 0 is slow slew rate, 1 is fast slew rate
+          enum: [ 0, 1 ]
+        gpio-hog: true
+        gpios: true
+        output-high: true
+        output-low: true
+        line-name: true
+
+    - type: object
+      properties:
+        phandle: true
+
+      additionalProperties:
+        $ref: "#/additionalProperties/anyOf/0"
+
+allOf:
+  - $ref: "pinctrl.yaml#"
+
+required:
+  - compatible
+  - reg
+  - gpio-controller
+  - '#gpio-cells'
+  - gpio-ranges
+  - interrupts
+  - clocks
+  - power-domains
+  - resets
+
+examples:
+  - |
+    #include <dt-bindings/pinctrl/rzv2m-pinctrl.h>
+    #include <dt-bindings/clock/r9a09g011-cpg.h>
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+    pinctrl: pinctrl@b6250000 {
+            compatible = "renesas,r9a09g011-pinctrl";
+            reg = <0xb6250000 0x800>;
+
+            gpio-controller;
+            #gpio-cells = <2>;
+            gpio-ranges = <&pinctrl 0 0 352>;
+            interrupts = <GIC_SPI 68 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 69 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 70 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 71 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 72 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 73 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 74 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 75 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 76 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 77 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 78 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 79 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 80 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 81 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 84 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 85 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 86 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 87 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 88 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 89 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 90 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 91 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 92 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 93 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 94 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 95 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 96 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 97 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 98 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 99 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 100 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 101 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 102 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 103 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 104 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 105 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 106 IRQ_TYPE_LEVEL_HIGH>;
+            clocks = <&cpg CPG_MOD R9A09G011_PFC_PCLK>;
+            resets = <&cpg R9A09G011_PFC_PRESETN>;
+            power-domains = <&cpg>;
+
+            i2c2_pins: i2c2 {
+                    pinmux = <RZV2M_PORT_PINMUX(3, 8, 2)>, /* SDA */
+                             <RZV2M_PORT_PINMUX(3, 9, 2)>; /* SCL */
+            };
+    };
diff --git a/include/dt-bindings/pinctrl/rzv2m-pinctrl.h b/include/dt-bindings/pinctrl/rzv2m-pinctrl.h
new file mode 100644
index 000000000000..525532cd15da
--- /dev/null
+++ b/include/dt-bindings/pinctrl/rzv2m-pinctrl.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * This header provides constants for Renesas RZ/V2M pinctrl bindings.
+ *
+ * Copyright (C) 2022 Renesas Electronics Corp.
+ *
+ */
+
+#ifndef __DT_BINDINGS_RZV2M_PINCTRL_H
+#define __DT_BINDINGS_RZV2M_PINCTRL_H
+
+#define RZV2M_PINS_PER_PORT	16
+
+/*
+ * Create the pin index from its bank and position numbers and store in
+ * the upper 16 bits the alternate function identifier
+ */
+#define RZV2M_PORT_PINMUX(b, p, f)	((b) * RZV2M_PINS_PER_PORT + (p) | ((f) << 16))
+
+/* Convert a port and pin label to its global pin index */
+#define RZV2M_GPIO(port, pin)	((port) * RZV2M_PINS_PER_PORT + (pin))
+
+#endif /* __DT_BINDINGS_RZV2M_PINCTRL_H */
-- 
cgit 


From 6708be40047789aa3587a3866b782d5cda7b2a31 Mon Sep 17 00:00:00 2001
From: Peter Chiu <chui-hao.chiu@mediatek.com>
Date: Wed, 22 Jun 2022 09:08:20 +0800
Subject: wifi: ieee80211: s1g action frames are not robust

S1g action frame with code 22 is not protected so update the robust
action frame list.

Signed-off-by: Peter Chiu <chui-hao.chiu@mediatek.com>
Link: https://lore.kernel.org/r/20220622010820.17522-1-chui-hao.chiu@mediatek.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 870f659087d6..f386f9ed41f3 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -4094,6 +4094,7 @@ static inline bool _ieee80211_is_robust_mgmt_frame(struct ieee80211_hdr *hdr)
 			*category != WLAN_CATEGORY_SELF_PROTECTED &&
 			*category != WLAN_CATEGORY_UNPROT_DMG &&
 			*category != WLAN_CATEGORY_VHT &&
+			*category != WLAN_CATEGORY_S1G &&
 			*category != WLAN_CATEGORY_VENDOR_SPECIFIC;
 	}
 
-- 
cgit 


From 2d8b08fef0af23aa2fe7f1a1719ac5b11478042f Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@kernel.org>
Date: Tue, 28 Jun 2022 10:46:05 +0100
Subject: wifi: cfg80211: fix kernel-doc warnings all over the file

There are currently 17 kernel-doc warnings on this file:
	include/net/cfg80211.h:391: warning: Function parameter or member 'bw' not described in 'ieee80211_eht_mcs_nss_supp'
	include/net/cfg80211.h:437: warning: Function parameter or member 'eht_cap' not described in 'ieee80211_sband_iftype_data'
	include/net/cfg80211.h:507: warning: Function parameter or member 's1g' not described in 'ieee80211_sta_s1g_cap'
	include/net/cfg80211.h:1390: warning: Function parameter or member 'counter_offset_beacon' not described in 'cfg80211_color_change_settings'
	include/net/cfg80211.h:1390: warning: Function parameter or member 'counter_offset_presp' not described in 'cfg80211_color_change_settings'
	include/net/cfg80211.h:1430: warning: Enum value 'STATION_PARAM_APPLY_STA_TXPOWER' not described in enum 'station_parameters_apply_mask'
	include/net/cfg80211.h:2195: warning: Function parameter or member 'dot11MeshConnectedToAuthServer' not described in 'mesh_config'
	include/net/cfg80211.h:2341: warning: Function parameter or member 'short_ssid' not described in 'cfg80211_scan_6ghz_params'
	include/net/cfg80211.h:3328: warning: Function parameter or member 'kck_len' not described in 'cfg80211_gtk_rekey_data'
	include/net/cfg80211.h:3698: warning: Function parameter or member 'ftm' not described in 'cfg80211_pmsr_result'
	include/net/cfg80211.h:3828: warning: Function parameter or member 'global_mcast_stypes' not described in 'mgmt_frame_regs'
	include/net/cfg80211.h:4977: warning: Function parameter or member 'ftm' not described in 'cfg80211_pmsr_capabilities'
	include/net/cfg80211.h:5742: warning: Function parameter or member 'u' not described in 'wireless_dev'
	include/net/cfg80211.h:5742: warning: Function parameter or member 'links' not described in 'wireless_dev'
	include/net/cfg80211.h:5742: warning: Function parameter or member 'valid_links' not described in 'wireless_dev'
	include/net/cfg80211.h:6076: warning: Function parameter or member 'is_amsdu' not described in 'ieee80211_data_to_8023_exthdr'
	include/net/cfg80211.h:6949: warning: Function parameter or member 'sig_dbm' not described in 'cfg80211_notify_new_peer_candidate'

Address them, in order to build a better documentation from this
header.

Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
Link: https://lore.kernel.org/r/f6f522cdc716a01744bb0eae2186f4592976222b.1656409369.git.mchehab@kernel.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 996782c44838..c7e641071eff 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -374,6 +374,7 @@ struct ieee80211_sta_he_cap {
  * and NSS Set field"
  *
  * @only_20mhz: MCS/NSS support for 20 MHz-only STA.
+ * @bw: MCS/NSS support for 80, 160 and 320 MHz
  * @bw._80: MCS/NSS support for BW <= 80 MHz
  * @bw._160: MCS/NSS support for BW = 160 MHz
  * @bw._320: MCS/NSS support for BW = 320 MHz
@@ -420,6 +421,7 @@ struct ieee80211_sta_eht_cap {
  * @he_cap: holds the HE capabilities
  * @he_6ghz_capa: HE 6 GHz capabilities, must be filled in for a
  *	6 GHz band channel (and 0 may be valid value).
+ * @eht_cap: STA's EHT capabilities
  * @vendor_elems: vendor element(s) to advertise
  * @vendor_elems.data: vendor element(s) data
  * @vendor_elems.len: vendor element(s) length
@@ -495,7 +497,7 @@ struct ieee80211_edmg {
  * This structure describes most essential parameters needed
  * to describe 802.11ah S1G capabilities for a STA.
  *
- * @s1g_supported: is STA an S1G STA
+ * @s1g: is STA an S1G STA
  * @cap: S1G capabilities information
  * @nss_mcs: Supported NSS MCS set
  */
@@ -1373,8 +1375,8 @@ struct cfg80211_csa_settings {
  * Used for bss color change
  *
  * @beacon_color_change: beacon data while performing the color countdown
- * @counter_offsets_beacon: offsets of the counters within the beacon (tail)
- * @counter_offsets_presp: offsets of the counters within the probe response
+ * @counter_offset_beacon: offsets of the counters within the beacon (tail)
+ * @counter_offset_presp: offsets of the counters within the probe response
  * @beacon_next: beacon data to be used after the color change
  * @count: number of beacons until the color change
  * @color: the color used after the change
@@ -1417,6 +1419,7 @@ struct iface_combination_params {
  * @STATION_PARAM_APPLY_UAPSD: apply new uAPSD parameters (uapsd_queues, max_sp)
  * @STATION_PARAM_APPLY_CAPABILITY: apply new capability
  * @STATION_PARAM_APPLY_PLINK_STATE: apply new plink state
+ * @STATION_PARAM_APPLY_STA_TXPOWER: apply tx power for STA
  *
  * Not all station parameters have in-band "no change" signalling,
  * for those that don't these flags will are used.
@@ -2149,6 +2152,9 @@ struct bss_parameters {
  * @plink_timeout: If no tx activity is seen from a STA we've established
  *	peering with for longer than this time (in seconds), then remove it
  *	from the STA's list of peers.  Default is 30 minutes.
+ * @dot11MeshConnectedToAuthServer: if set to true then this mesh STA
+ *	will advertise that it is connected to a authentication server
+ *	in the mesh formation field.
  * @dot11MeshConnectedToMeshGate: if set to true, advertise that this STA is
  *      connected to a mesh gate in mesh formation info.  If false, the
  *      value in mesh formation is determined by the presence of root paths
@@ -2321,12 +2327,12 @@ struct cfg80211_scan_info {
 /**
  * struct cfg80211_scan_6ghz_params - relevant for 6 GHz only
  *
- * @short_bssid: short ssid to scan for
+ * @short_ssid: short ssid to scan for
  * @bssid: bssid to scan for
  * @channel_idx: idx of the channel in the channel array in the scan request
  *	 which the above info relvant to
  * @unsolicited_probe: the AP transmits unsolicited probe response every 20 TU
- * @short_ssid_valid: short_ssid is valid and can be used
+ * @short_ssid_valid: @short_ssid is valid and can be used
  * @psc_no_listen: when set, and the channel is a PSC channel, no need to wait
  *       20 TUs before starting to send probe requests.
  */
@@ -3317,7 +3323,7 @@ struct cfg80211_wowlan_wakeup {
  * @kck: key confirmation key (@kck_len bytes)
  * @replay_ctr: replay counter (NL80211_REPLAY_CTR_LEN bytes)
  * @kek_len: length of kek
- * @kck_len length of kck
+ * @kck_len: length of kck
  * @akm: akm (oui, id)
  */
 struct cfg80211_gtk_rekey_data {
@@ -3679,6 +3685,7 @@ struct cfg80211_pmsr_ftm_result {
  * @type: type of the measurement reported, note that we only support reporting
  *	one type at a time, but you can report multiple results separately and
  *	they're all aggregated for userspace.
+ * @ftm: FTM result
  */
 struct cfg80211_pmsr_result {
 	u64 host_time, ap_tsf;
@@ -3817,7 +3824,7 @@ struct cfg80211_update_owe_info {
  *	for the entire device
  * @interface_stypes: bitmap of management frame subtypes registered
  *	for the given interface
- * @global_mcast_rx: mcast RX is needed globally for these subtypes
+ * @global_mcast_stypes: mcast RX is needed globally for these subtypes
  * @interface_mcast_stypes: mcast RX is needed on this interface
  *	for these subtypes
  */
@@ -4940,6 +4947,7 @@ struct wiphy_iftype_ext_capab {
  * @max_peers: maximum number of peers in a single measurement
  * @report_ap_tsf: can report assoc AP's TSF for radio resource measurement
  * @randomize_mac_addr: can randomize MAC address for measurement
+ * @ftm: FTM measurement data
  * @ftm.supported: FTM measurement is supported
  * @ftm.asap: ASAP-mode is supported
  * @ftm.non_asap: non-ASAP-mode is supported
@@ -5563,6 +5571,7 @@ static inline void wiphy_unlock(struct wiphy *wiphy)
  * @netdev: (private) Used to reference back to the netdev, may be %NULL
  * @identifier: (private) Identifier used in nl80211 to identify this
  *	wireless device if it has no netdev
+ * @u: union containing data specific to @iftype
  * @connected_addr: (private) BSSID or AP MLD address if connected
  * @connected: indicates if connected or not (STA mode)
  * @current_bss: (private) Used by the internal configuration code
@@ -5624,6 +5633,9 @@ static inline void wiphy_unlock(struct wiphy *wiphy)
  * @pmsr_free_wk: (private) peer measurements cleanup work
  * @unprot_beacon_reported: (private) timestamp of last
  *	unprotected beacon report
+ * @links: array of %IEEE80211_MLD_MAX_NUM_LINKS elements containing @addr
+ *	@ap and @client for each link
+ * @valid_links: bitmap describing what elements of @links are valid
  */
 struct wireless_dev {
 	struct wiphy *wiphy;
@@ -6068,6 +6080,7 @@ unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr);
  * @addr: the device MAC address
  * @iftype: the virtual interface type
  * @data_offset: offset of payload after the 802.11 header
+ * @is_amsdu: true if the 802.11 header is A-MSDU
  * Return: 0 on success. Non-zero on error.
  */
 int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr,
@@ -6937,6 +6950,7 @@ void cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid,
  * @macaddr: the MAC address of the new candidate
  * @ie: information elements advertised by the peer candidate
  * @ie_len: length of the information elements buffer
+ * @sig_dbm: signal level in dBm
  * @gfp: allocation flags
  *
  * This function notifies cfg80211 that the mesh peer candidate has been
-- 
cgit 


From 82757b792be7a549460137b2dbfb9d48003a072a Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@kernel.org>
Date: Tue, 28 Jun 2022 10:46:06 +0100
Subject: wifi: mac80211: add a missing comma at kernel-doc markup

The lack of the colon makes it not parse the function parameter:
	include/net/mac80211.h:6250: warning: Function parameter or member 'vif' not described in 'ieee80211_channel_switch_disconnect'

Fix it.

Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
Link: https://lore.kernel.org/r/11c1bdb861d89c93058fcfe312749b482851cbdb.1656409369.git.mchehab@kernel.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 27f24ac0426d..c0557142343f 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -6238,7 +6238,7 @@ void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success);
 
 /**
  * ieee80211_channel_switch_disconnect - disconnect due to channel switch error
- * @vif &struct ieee80211_vif pointer from the add_interface callback.
+ * @vif: &struct ieee80211_vif pointer from the add_interface callback.
  * @block_tx: if %true, do not send deauth frame.
  *
  * Instruct mac80211 to disconnect due to a channel switch error. The channel
-- 
cgit 


From 80fc671bcc0173836e9032b0c698ea74c13b9d7c Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Fri, 1 Jul 2022 11:48:43 +0800
Subject: uacce: Handle parent device removal or parent driver module rmmod

The uacce driver must deal with a possible removal of the parent device
or parent driver module rmmod at any time.

Although uacce_remove(), called on device removal and on driver unbind,
prevents future use of the uacce fops by removing the cdev, fops that
were called before that point may still be running.

Serialize uacce_fops_open() and uacce_remove() with uacce->mutex.
Serialize other fops against uacce_remove() with q->mutex.
Since we need to protect uacce_fops_poll() which gets called on the fast
path, replace uacce->queues_lock with q->mutex to improve scalability.
The other fops are only used during setup.

uacce_queue_is_valid(), checked under q->mutex or uacce->mutex, denotes
whether uacce_remove() has disabled all queues. If that is the case,
don't go any further since the parent device is being removed and
uacce->ops should not be called anymore.

Reported-by: Yang Shen <shenyang39@huawei.com>
Signed-off-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Link: https://lore.kernel.org/r/20220701034843.7502-1-zhangfei.gao@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/uacce/uacce.c | 133 +++++++++++++++++++++++++++++----------------
 include/linux/uacce.h      |   6 +-
 2 files changed, 91 insertions(+), 48 deletions(-)

(limited to 'include')

diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c
index 281c54003edc..b70a013139c7 100644
--- a/drivers/misc/uacce/uacce.c
+++ b/drivers/misc/uacce/uacce.c
@@ -9,43 +9,38 @@
 
 static struct class *uacce_class;
 static dev_t uacce_devt;
-static DEFINE_MUTEX(uacce_mutex);
 static DEFINE_XARRAY_ALLOC(uacce_xa);
 
-static int uacce_start_queue(struct uacce_queue *q)
+/*
+ * If the parent driver or the device disappears, the queue state is invalid and
+ * ops are not usable anymore.
+ */
+static bool uacce_queue_is_valid(struct uacce_queue *q)
 {
-	int ret = 0;
+	return q->state == UACCE_Q_INIT || q->state == UACCE_Q_STARTED;
+}
 
-	mutex_lock(&uacce_mutex);
+static int uacce_start_queue(struct uacce_queue *q)
+{
+	int ret;
 
-	if (q->state != UACCE_Q_INIT) {
-		ret = -EINVAL;
-		goto out_with_lock;
-	}
+	if (q->state != UACCE_Q_INIT)
+		return -EINVAL;
 
 	if (q->uacce->ops->start_queue) {
 		ret = q->uacce->ops->start_queue(q);
 		if (ret < 0)
-			goto out_with_lock;
+			return ret;
 	}
 
 	q->state = UACCE_Q_STARTED;
-
-out_with_lock:
-	mutex_unlock(&uacce_mutex);
-
-	return ret;
+	return 0;
 }
 
 static int uacce_put_queue(struct uacce_queue *q)
 {
 	struct uacce_device *uacce = q->uacce;
 
-	mutex_lock(&uacce_mutex);
-
-	if (q->state == UACCE_Q_ZOMBIE)
-		goto out;
-
 	if ((q->state == UACCE_Q_STARTED) && uacce->ops->stop_queue)
 		uacce->ops->stop_queue(q);
 
@@ -54,8 +49,6 @@ static int uacce_put_queue(struct uacce_queue *q)
 		uacce->ops->put_queue(q);
 
 	q->state = UACCE_Q_ZOMBIE;
-out:
-	mutex_unlock(&uacce_mutex);
 
 	return 0;
 }
@@ -65,20 +58,36 @@ static long uacce_fops_unl_ioctl(struct file *filep,
 {
 	struct uacce_queue *q = filep->private_data;
 	struct uacce_device *uacce = q->uacce;
+	long ret = -ENXIO;
+
+	/*
+	 * uacce->ops->ioctl() may take the mmap_lock when copying arg to/from
+	 * user. Avoid a circular lock dependency with uacce_fops_mmap(), which
+	 * gets called with mmap_lock held, by taking uacce->mutex instead of
+	 * q->mutex. Doing this in uacce_fops_mmap() is not possible because
+	 * uacce_fops_open() calls iommu_sva_bind_device(), which takes
+	 * mmap_lock, while holding uacce->mutex.
+	 */
+	mutex_lock(&uacce->mutex);
+	if (!uacce_queue_is_valid(q))
+		goto out_unlock;
 
 	switch (cmd) {
 	case UACCE_CMD_START_Q:
-		return uacce_start_queue(q);
-
+		ret = uacce_start_queue(q);
+		break;
 	case UACCE_CMD_PUT_Q:
-		return uacce_put_queue(q);
-
+		ret = uacce_put_queue(q);
+		break;
 	default:
-		if (!uacce->ops->ioctl)
-			return -EINVAL;
-
-		return uacce->ops->ioctl(q, cmd, arg);
+		if (uacce->ops->ioctl)
+			ret = uacce->ops->ioctl(q, cmd, arg);
+		else
+			ret = -EINVAL;
 	}
+out_unlock:
+	mutex_unlock(&uacce->mutex);
+	return ret;
 }
 
 #ifdef CONFIG_COMPAT
@@ -136,6 +145,13 @@ static int uacce_fops_open(struct inode *inode, struct file *filep)
 	if (!q)
 		return -ENOMEM;
 
+	mutex_lock(&uacce->mutex);
+
+	if (!uacce->parent) {
+		ret = -EINVAL;
+		goto out_with_mem;
+	}
+
 	ret = uacce_bind_queue(uacce, q);
 	if (ret)
 		goto out_with_mem;
@@ -152,10 +168,9 @@ static int uacce_fops_open(struct inode *inode, struct file *filep)
 	filep->private_data = q;
 	uacce->inode = inode;
 	q->state = UACCE_Q_INIT;
-
-	mutex_lock(&uacce->queues_lock);
+	mutex_init(&q->mutex);
 	list_add(&q->list, &uacce->queues);
-	mutex_unlock(&uacce->queues_lock);
+	mutex_unlock(&uacce->mutex);
 
 	return 0;
 
@@ -163,18 +178,20 @@ out_with_bond:
 	uacce_unbind_queue(q);
 out_with_mem:
 	kfree(q);
+	mutex_unlock(&uacce->mutex);
 	return ret;
 }
 
 static int uacce_fops_release(struct inode *inode, struct file *filep)
 {
 	struct uacce_queue *q = filep->private_data;
+	struct uacce_device *uacce = q->uacce;
 
-	mutex_lock(&q->uacce->queues_lock);
-	list_del(&q->list);
-	mutex_unlock(&q->uacce->queues_lock);
+	mutex_lock(&uacce->mutex);
 	uacce_put_queue(q);
 	uacce_unbind_queue(q);
+	list_del(&q->list);
+	mutex_unlock(&uacce->mutex);
 	kfree(q);
 
 	return 0;
@@ -217,10 +234,9 @@ static int uacce_fops_mmap(struct file *filep, struct vm_area_struct *vma)
 	vma->vm_private_data = q;
 	qfr->type = type;
 
-	mutex_lock(&uacce_mutex);
-
-	if (q->state != UACCE_Q_INIT && q->state != UACCE_Q_STARTED) {
-		ret = -EINVAL;
+	mutex_lock(&q->mutex);
+	if (!uacce_queue_is_valid(q)) {
+		ret = -ENXIO;
 		goto out_with_lock;
 	}
 
@@ -248,12 +264,12 @@ static int uacce_fops_mmap(struct file *filep, struct vm_area_struct *vma)
 	}
 
 	q->qfrs[type] = qfr;
-	mutex_unlock(&uacce_mutex);
+	mutex_unlock(&q->mutex);
 
 	return ret;
 
 out_with_lock:
-	mutex_unlock(&uacce_mutex);
+	mutex_unlock(&q->mutex);
 	kfree(qfr);
 	return ret;
 }
@@ -262,12 +278,20 @@ static __poll_t uacce_fops_poll(struct file *file, poll_table *wait)
 {
 	struct uacce_queue *q = file->private_data;
 	struct uacce_device *uacce = q->uacce;
+	__poll_t ret = 0;
+
+	mutex_lock(&q->mutex);
+	if (!uacce_queue_is_valid(q))
+		goto out_unlock;
 
 	poll_wait(file, &q->wait, wait);
+
 	if (uacce->ops->is_q_updated && uacce->ops->is_q_updated(q))
-		return EPOLLIN | EPOLLRDNORM;
+		ret = EPOLLIN | EPOLLRDNORM;
 
-	return 0;
+out_unlock:
+	mutex_unlock(&q->mutex);
+	return ret;
 }
 
 static const struct file_operations uacce_fops = {
@@ -450,7 +474,7 @@ struct uacce_device *uacce_alloc(struct device *parent,
 		goto err_with_uacce;
 
 	INIT_LIST_HEAD(&uacce->queues);
-	mutex_init(&uacce->queues_lock);
+	mutex_init(&uacce->mutex);
 	device_initialize(&uacce->dev);
 	uacce->dev.devt = MKDEV(MAJOR(uacce_devt), uacce->dev_id);
 	uacce->dev.class = uacce_class;
@@ -507,13 +531,23 @@ void uacce_remove(struct uacce_device *uacce)
 	if (uacce->inode)
 		unmap_mapping_range(uacce->inode->i_mapping, 0, 0, 1);
 
+	/*
+	 * uacce_fops_open() may be running concurrently, even after we remove
+	 * the cdev. Holding uacce->mutex ensures that open() does not obtain a
+	 * removed uacce device.
+	 */
+	mutex_lock(&uacce->mutex);
 	/* ensure no open queue remains */
-	mutex_lock(&uacce->queues_lock);
 	list_for_each_entry_safe(q, next_q, &uacce->queues, list) {
+		/*
+		 * Taking q->mutex ensures that fops do not use the defunct
+		 * uacce->ops after the queue is disabled.
+		 */
+		mutex_lock(&q->mutex);
 		uacce_put_queue(q);
+		mutex_unlock(&q->mutex);
 		uacce_unbind_queue(q);
 	}
-	mutex_unlock(&uacce->queues_lock);
 
 	/* disable sva now since no opened queues */
 	uacce_disable_sva(uacce);
@@ -521,6 +555,13 @@ void uacce_remove(struct uacce_device *uacce)
 	if (uacce->cdev)
 		cdev_device_del(uacce->cdev, &uacce->dev);
 	xa_erase(&uacce_xa, uacce->dev_id);
+	/*
+	 * uacce exists as long as there are open fds, but ops will be freed
+	 * now. Ensure that bugs cause NULL deref rather than use-after-free.
+	 */
+	uacce->ops = NULL;
+	uacce->parent = NULL;
+	mutex_unlock(&uacce->mutex);
 	put_device(&uacce->dev);
 }
 EXPORT_SYMBOL_GPL(uacce_remove);
diff --git a/include/linux/uacce.h b/include/linux/uacce.h
index 48e319f40275..9ce88c28b0a8 100644
--- a/include/linux/uacce.h
+++ b/include/linux/uacce.h
@@ -70,6 +70,7 @@ enum uacce_q_state {
  * @wait: wait queue head
  * @list: index into uacce queues list
  * @qfrs: pointer of qfr regions
+ * @mutex: protects queue state
  * @state: queue state machine
  * @pasid: pasid associated to the mm
  * @handle: iommu_sva handle returned by iommu_sva_bind_device()
@@ -80,6 +81,7 @@ struct uacce_queue {
 	wait_queue_head_t wait;
 	struct list_head list;
 	struct uacce_qfile_region *qfrs[UACCE_MAX_REGION];
+	struct mutex mutex;
 	enum uacce_q_state state;
 	u32 pasid;
 	struct iommu_sva *handle;
@@ -97,9 +99,9 @@ struct uacce_queue {
  * @dev_id: id of the uacce device
  * @cdev: cdev of the uacce
  * @dev: dev of the uacce
+ * @mutex: protects uacce operation
  * @priv: private pointer of the uacce
  * @queues: list of queues
- * @queues_lock: lock for queues list
  * @inode: core vfs
  */
 struct uacce_device {
@@ -113,9 +115,9 @@ struct uacce_device {
 	u32 dev_id;
 	struct cdev *cdev;
 	struct device dev;
+	struct mutex mutex;
 	void *priv;
 	struct list_head queues;
-	struct mutex queues_lock;
 	struct inode *inode;
 };
 
-- 
cgit 


From c882716b6d411495f221bdd73e9137357c16a3ea Mon Sep 17 00:00:00 2001
From: Liang He <windhl@126.com>
Date: Tue, 28 Jun 2022 10:16:40 +0800
Subject: firmware: Hold a reference for of_find_compatible_node()

In of_register_trusted_foundations(), we need to hold the reference
returned by of_find_compatible_node() and then use it to call
of_node_put() for refcount balance.

Signed-off-by: Liang He <windhl@126.com>
Link: https://lore.kernel.org/r/20220628021640.4015-1-windhl@126.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/firmware/trusted_foundations.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/firmware/trusted_foundations.h b/include/linux/firmware/trusted_foundations.h
index be5984bda592..931b6c5c72df 100644
--- a/include/linux/firmware/trusted_foundations.h
+++ b/include/linux/firmware/trusted_foundations.h
@@ -71,12 +71,16 @@ static inline void register_trusted_foundations(
 
 static inline void of_register_trusted_foundations(void)
 {
+	struct device_node *np = of_find_compatible_node(NULL, NULL, "tlm,trusted-foundations");
+
+	if (!np)
+		return;
+	of_node_put(np);
 	/*
 	 * If we find the target should enable TF but does not support it,
 	 * fail as the system won't be able to do much anyway
 	 */
-	if (of_find_compatible_node(NULL, NULL, "tlm,trusted-foundations"))
-		register_trusted_foundations(NULL);
+	register_trusted_foundations(NULL);
 }
 
 static inline bool trusted_foundations_registered(void)
-- 
cgit 


From c8a9415e6ddef98a948f8c30d9ec2e749c0ccd9d Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 1 Jul 2022 10:36:36 +0200
Subject: wifi: cfg80211: remove redundant documentation

These struct members no longer exist, remove them
from documentation.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index c7e641071eff..87ebed6a48bd 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5572,18 +5572,8 @@ static inline void wiphy_unlock(struct wiphy *wiphy)
  * @identifier: (private) Identifier used in nl80211 to identify this
  *	wireless device if it has no netdev
  * @u: union containing data specific to @iftype
- * @connected_addr: (private) BSSID or AP MLD address if connected
  * @connected: indicates if connected or not (STA mode)
- * @current_bss: (private) Used by the internal configuration code
- * @chandef: (private) Used by the internal configuration code to track
- *	the user-set channel definition.
- * @preset_chandef: (private) Used by the internal configuration code to
- *	track the channel to be used for AP later
  * @bssid: (private) Used by the internal configuration code
- * @ssid: (private) Used by the internal configuration code
- * @ssid_len: (private) Used by the internal configuration code
- * @mesh_id_len: (private) Used by the internal configuration code
- * @mesh_id_up_len: (private) Used by the internal configuration code
  * @wext: (private) Used by the internal wireless extensions compat code
  * @wext.ibss: (private) IBSS data part of wext handling
  * @wext.connect: (private) connection handling data
-- 
cgit 


From 7f884baae68adc85db55b97e3fc903a1f20bd1f9 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 1 Jul 2022 10:42:40 +0200
Subject: wifi: mac80211: fix a kernel-doc complaint

Somehow kernel-doc complains here about strong markup, but
we really don't need the [] so just remove that.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index c0557142343f..853d4d9ebf23 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4045,7 +4045,7 @@ struct ieee80211_prep_tx_info {
  *	removing the old link information is still valid (link_conf pointer),
  *	but may immediately disappear after the function returns. The old or
  *	new links bitmaps may be 0 if going from/to a non-MLO situation.
- *	The @old[] array contains pointers to the old bss_conf structures
+ *	The @old array contains pointers to the old bss_conf structures
  *	that were already removed, in case they're needed.
  *	This callback can sleep.
  * @change_sta_links: Change the valid links of a station, similar to
-- 
cgit 


From 942741dabcb43236006f557178801ce2051e69f9 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 25 Jun 2022 23:24:05 +0200
Subject: wifi: mac80211: switch airtime fairness back to deficit round-robin
 scheduling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commits 6a789ba679d652587532cec2a0e0274fda172f3b and
2433647bc8d983a543e7d31b41ca2de1c7e2c198.

The virtual time scheduler code has a number of issues:
- queues slowed down by hardware/firmware powersave handling were not properly
  handled.
- on ath10k in push-pull mode, tx queues that the driver tries to pull from
  were starved, causing excessive latency
- delay between tx enqueue and reported airtime use were causing excessively
  bursty tx behavior

The bursty behavior may also be present on the round-robin scheduler, but there
it is much easier to fix without introducing additional regressions

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Acked-by: Toke Høiland-Jørgensen <toke@toke.dk>
Link: https://lore.kernel.org/r/20220625212411.36675-1-nbd@nbd.name
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h        |  17 +-
 net/mac80211/cfg.c            |  35 +---
 net/mac80211/debugfs.c        |  70 ++------
 net/mac80211/debugfs_netdev.c |  32 +---
 net/mac80211/debugfs_sta.c    |  24 +--
 net/mac80211/ieee80211_i.h    | 182 ++-------------------
 net/mac80211/iface.c          |   4 -
 net/mac80211/main.c           |  11 +-
 net/mac80211/rx.c             |   6 +-
 net/mac80211/sta_info.c       |  67 +++-----
 net/mac80211/sta_info.h       |  11 +-
 net/mac80211/status.c         |  19 ---
 net/mac80211/tx.c             | 366 ++++++++++++------------------------------
 13 files changed, 190 insertions(+), 654 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 853d4d9ebf23..256b9215e17b 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -6832,6 +6832,9 @@ static inline void ieee80211_txq_schedule_end(struct ieee80211_hw *hw, u8 ac)
 {
 }
 
+void __ieee80211_schedule_txq(struct ieee80211_hw *hw,
+			      struct ieee80211_txq *txq, bool force);
+
 /**
  * ieee80211_schedule_txq - schedule a TXQ for transmission
  *
@@ -6844,7 +6847,11 @@ static inline void ieee80211_txq_schedule_end(struct ieee80211_hw *hw, u8 ac)
  * The driver may call this function if it has buffered packets for
  * this TXQ internally.
  */
-void ieee80211_schedule_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq);
+static inline void
+ieee80211_schedule_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq)
+{
+	__ieee80211_schedule_txq(hw, txq, true);
+}
 
 /**
  * ieee80211_return_txq - return a TXQ previously acquired by ieee80211_next_txq()
@@ -6856,8 +6863,12 @@ void ieee80211_schedule_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq);
  * The driver may set force=true if it has buffered packets for this TXQ
  * internally.
  */
-void ieee80211_return_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq,
-			  bool force);
+static inline void
+ieee80211_return_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq,
+		     bool force)
+{
+	__ieee80211_schedule_txq(hw, txq, force);
+}
 
 /**
  * ieee80211_txq_may_transmit - check whether TXQ is allowed to transmit
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index b387f5f4fef0..fd6c4291c971 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1581,38 +1581,6 @@ static void sta_apply_mesh_params(struct ieee80211_local *local,
 #endif
 }
 
-static void sta_apply_airtime_params(struct ieee80211_local *local,
-				     struct sta_info *sta,
-				     struct station_parameters *params)
-{
-	u8 ac;
-
-	for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
-		struct airtime_sched_info *air_sched = &local->airtime[ac];
-		struct airtime_info *air_info = &sta->airtime[ac];
-		struct txq_info *txqi;
-		u8 tid;
-
-		spin_lock_bh(&air_sched->lock);
-		for (tid = 0; tid < IEEE80211_NUM_TIDS + 1; tid++) {
-			if (air_info->weight == params->airtime_weight ||
-			    !sta->sta.txq[tid] ||
-			    ac != ieee80211_ac_from_tid(tid))
-				continue;
-
-			airtime_weight_set(air_info, params->airtime_weight);
-
-			txqi = to_txq_info(sta->sta.txq[tid]);
-			if (RB_EMPTY_NODE(&txqi->schedule_order))
-				continue;
-
-			ieee80211_update_airtime_weight(local, air_sched,
-							0, true);
-		}
-		spin_unlock_bh(&air_sched->lock);
-	}
-}
-
 static int sta_apply_parameters(struct ieee80211_local *local,
 				struct sta_info *sta,
 				struct station_parameters *params)
@@ -1811,8 +1779,7 @@ static int sta_apply_parameters(struct ieee80211_local *local,
 		sta_apply_mesh_params(local, sta, params);
 
 	if (params->airtime_weight)
-		sta_apply_airtime_params(local, sta, params);
-
+		sta->airtime_weight = params->airtime_weight;
 
 	/* set the STA state after all sta info from usermode has been set */
 	if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) ||
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 1fe43b264d75..0c748b1eb023 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -216,14 +216,14 @@ static ssize_t aql_txq_limit_read(struct file *file,
 			"VI	%u		%u\n"
 			"BE	%u		%u\n"
 			"BK	%u		%u\n",
-			local->airtime[IEEE80211_AC_VO].aql_txq_limit_low,
-			local->airtime[IEEE80211_AC_VO].aql_txq_limit_high,
-			local->airtime[IEEE80211_AC_VI].aql_txq_limit_low,
-			local->airtime[IEEE80211_AC_VI].aql_txq_limit_high,
-			local->airtime[IEEE80211_AC_BE].aql_txq_limit_low,
-			local->airtime[IEEE80211_AC_BE].aql_txq_limit_high,
-			local->airtime[IEEE80211_AC_BK].aql_txq_limit_low,
-			local->airtime[IEEE80211_AC_BK].aql_txq_limit_high);
+			local->aql_txq_limit_low[IEEE80211_AC_VO],
+			local->aql_txq_limit_high[IEEE80211_AC_VO],
+			local->aql_txq_limit_low[IEEE80211_AC_VI],
+			local->aql_txq_limit_high[IEEE80211_AC_VI],
+			local->aql_txq_limit_low[IEEE80211_AC_BE],
+			local->aql_txq_limit_high[IEEE80211_AC_BE],
+			local->aql_txq_limit_low[IEEE80211_AC_BK],
+			local->aql_txq_limit_high[IEEE80211_AC_BK]);
 	return simple_read_from_buffer(user_buf, count, ppos,
 				       buf, len);
 }
@@ -255,11 +255,11 @@ static ssize_t aql_txq_limit_write(struct file *file,
 	if (ac >= IEEE80211_NUM_ACS)
 		return -EINVAL;
 
-	q_limit_low_old = local->airtime[ac].aql_txq_limit_low;
-	q_limit_high_old = local->airtime[ac].aql_txq_limit_high;
+	q_limit_low_old = local->aql_txq_limit_low[ac];
+	q_limit_high_old = local->aql_txq_limit_high[ac];
 
-	local->airtime[ac].aql_txq_limit_low = q_limit_low;
-	local->airtime[ac].aql_txq_limit_high = q_limit_high;
+	local->aql_txq_limit_low[ac] = q_limit_low;
+	local->aql_txq_limit_high[ac] = q_limit_high;
 
 	mutex_lock(&local->sta_mtx);
 	list_for_each_entry(sta, &local->sta_list, list) {
@@ -382,46 +382,6 @@ static const struct file_operations force_tx_status_ops = {
 	.llseek = default_llseek,
 };
 
-static ssize_t airtime_read(struct file *file,
-			    char __user *user_buf,
-			    size_t count,
-			    loff_t *ppos)
-{
-	struct ieee80211_local *local = file->private_data;
-	char buf[200];
-	u64 v_t[IEEE80211_NUM_ACS];
-	u64 wt[IEEE80211_NUM_ACS];
-	int len = 0, ac;
-
-	for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
-		spin_lock_bh(&local->airtime[ac].lock);
-		v_t[ac] = local->airtime[ac].v_t;
-		wt[ac] = local->airtime[ac].weight_sum;
-		spin_unlock_bh(&local->airtime[ac].lock);
-	}
-	len = scnprintf(buf, sizeof(buf),
-			"\tVO         VI         BE         BK\n"
-			"Virt-t\t%-10llu %-10llu %-10llu %-10llu\n"
-			"Weight\t%-10llu %-10llu %-10llu %-10llu\n",
-			v_t[0],
-			v_t[1],
-			v_t[2],
-			v_t[3],
-			wt[0],
-			wt[1],
-			wt[2],
-			wt[3]);
-
-	return simple_read_from_buffer(user_buf, count, ppos,
-				       buf, len);
-}
-
-static const struct file_operations airtime_ops = {
-	.read = airtime_read,
-	.open = simple_open,
-	.llseek = default_llseek,
-};
-
 #ifdef CONFIG_PM
 static ssize_t reset_write(struct file *file, const char __user *user_buf,
 			   size_t count, loff_t *ppos)
@@ -675,11 +635,7 @@ void debugfs_hw_add(struct ieee80211_local *local)
 	if (local->ops->wake_tx_queue)
 		DEBUGFS_ADD_MODE(aqm, 0600);
 
-	if (wiphy_ext_feature_isset(local->hw.wiphy,
-				    NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) {
-		DEBUGFS_ADD_MODE(airtime, 0600);
-		DEBUGFS_ADD_MODE(airtime_flags, 0600);
-	}
+	DEBUGFS_ADD_MODE(airtime_flags, 0600);
 
 	DEBUGFS_ADD(aql_txq_limit);
 	debugfs_create_u32("aql_threshold", 0600,
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index dfb194e15018..ead917501d6c 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -510,34 +510,6 @@ static ssize_t ieee80211_if_fmt_aqm(
 }
 IEEE80211_IF_FILE_R(aqm);
 
-static ssize_t ieee80211_if_fmt_airtime(
-	const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
-{
-	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_txq *txq = sdata->vif.txq;
-	struct airtime_info *air_info;
-	int len;
-
-	if (!txq)
-		return 0;
-
-	spin_lock_bh(&local->airtime[txq->ac].lock);
-	air_info = to_airtime_info(txq);
-	len = scnprintf(buf,
-			buflen,
-			"RX: %llu us\nTX: %llu us\nWeight: %u\n"
-			"Virt-T: %lld us\n",
-			air_info->rx_airtime,
-			air_info->tx_airtime,
-			air_info->weight,
-			air_info->v_t);
-	spin_unlock_bh(&local->airtime[txq->ac].lock);
-
-	return len;
-}
-
-IEEE80211_IF_FILE_R(airtime);
-
 IEEE80211_IF_FILE(multicast_to_unicast, u.ap.multicast_to_unicast, HEX);
 
 /* IBSS attributes */
@@ -683,10 +655,8 @@ static void add_common_files(struct ieee80211_sub_if_data *sdata)
 
 	if (sdata->local->ops->wake_tx_queue &&
 	    sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE &&
-	    sdata->vif.type != NL80211_IFTYPE_NAN) {
+	    sdata->vif.type != NL80211_IFTYPE_NAN)
 		DEBUGFS_ADD(aqm);
-		DEBUGFS_ADD(airtime);
-	}
 }
 
 static void add_sta_files(struct ieee80211_sub_if_data *sdata)
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 182094be9001..1dc238fc24f0 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -202,7 +202,7 @@ static ssize_t sta_airtime_read(struct file *file, char __user *userbuf,
 	size_t bufsz = 400;
 	char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf;
 	u64 rx_airtime = 0, tx_airtime = 0;
-	u64 v_t[IEEE80211_NUM_ACS];
+	s64 deficit[IEEE80211_NUM_ACS];
 	ssize_t rv;
 	int ac;
 
@@ -210,18 +210,18 @@ static ssize_t sta_airtime_read(struct file *file, char __user *userbuf,
 		return -ENOMEM;
 
 	for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
-		spin_lock_bh(&local->airtime[ac].lock);
+		spin_lock_bh(&local->active_txq_lock[ac]);
 		rx_airtime += sta->airtime[ac].rx_airtime;
 		tx_airtime += sta->airtime[ac].tx_airtime;
-		v_t[ac] = sta->airtime[ac].v_t;
-		spin_unlock_bh(&local->airtime[ac].lock);
+		deficit[ac] = sta->airtime[ac].deficit;
+		spin_unlock_bh(&local->active_txq_lock[ac]);
 	}
 
 	p += scnprintf(p, bufsz + buf - p,
 		"RX: %llu us\nTX: %llu us\nWeight: %u\n"
-		"Virt-T: VO: %lld us VI: %lld us BE: %lld us BK: %lld us\n",
-		rx_airtime, tx_airtime, sta->airtime[0].weight,
-		v_t[0], v_t[1], v_t[2], v_t[3]);
+		"Deficit: VO: %lld us VI: %lld us BE: %lld us BK: %lld us\n",
+		rx_airtime, tx_airtime, sta->airtime_weight,
+		deficit[0], deficit[1], deficit[2], deficit[3]);
 
 	rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
 	kfree(buf);
@@ -236,11 +236,11 @@ static ssize_t sta_airtime_write(struct file *file, const char __user *userbuf,
 	int ac;
 
 	for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
-		spin_lock_bh(&local->airtime[ac].lock);
+		spin_lock_bh(&local->active_txq_lock[ac]);
 		sta->airtime[ac].rx_airtime = 0;
 		sta->airtime[ac].tx_airtime = 0;
-		sta->airtime[ac].v_t = 0;
-		spin_unlock_bh(&local->airtime[ac].lock);
+		sta->airtime[ac].deficit = sta->airtime_weight;
+		spin_unlock_bh(&local->active_txq_lock[ac]);
 	}
 
 	return count;
@@ -263,10 +263,10 @@ static ssize_t sta_aql_read(struct file *file, char __user *userbuf,
 		return -ENOMEM;
 
 	for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
-		spin_lock_bh(&local->airtime[ac].lock);
+		spin_lock_bh(&local->active_txq_lock[ac]);
 		q_limit_l[ac] = sta->airtime[ac].aql_limit_low;
 		q_limit_h[ac] = sta->airtime[ac].aql_limit_high;
-		spin_unlock_bh(&local->airtime[ac].lock);
+		spin_unlock_bh(&local->active_txq_lock[ac]);
 		q_depth[ac] = atomic_read(&sta->airtime[ac].aql_tx_pending);
 	}
 
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 2190d08f4e34..f21e456dbad7 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -818,16 +818,20 @@ enum txq_info_flags {
  * @def_flow: used as a fallback flow when a packet destined to @tin hashes to
  *	a fq_flow which is already owned by a different tin
  * @def_cvars: codel vars for @def_flow
- * @schedule_order: used with ieee80211_local->active_txqs
  * @frags: used to keep fragments created after dequeue
+ * @schedule_order: used with ieee80211_local->active_txqs
+ * @schedule_round: counter to prevent infinite loops on TXQ scheduling
  */
 struct txq_info {
 	struct fq_tin tin;
 	struct codel_vars def_cvars;
 	struct codel_stats cstats;
-	struct rb_node schedule_order;
+
+	u16 schedule_round;
+	struct list_head schedule_order;
 
 	struct sk_buff_head frags;
+
 	unsigned long flags;
 
 	/* keep last! */
@@ -923,8 +927,6 @@ struct ieee80211_link_data {
 	struct ieee80211_key __rcu *default_mgmt_key;
 	struct ieee80211_key __rcu *default_beacon_key;
 
-	struct airtime_info airtime[IEEE80211_NUM_ACS];
-
 	struct work_struct csa_finalize_work;
 	bool csa_block_tx; /* write-protected by sdata_lock and local->mtx */
 	struct cfg80211_chan_def csa_chandef;
@@ -1208,44 +1210,6 @@ enum mac80211_scan_state {
 	SCAN_ABORT,
 };
 
-/**
- * struct airtime_sched_info - state used for airtime scheduling and AQL
- *
- * @lock: spinlock that protects all the fields in this struct
- * @active_txqs: rbtree of currently backlogged queues, sorted by virtual time
- * @schedule_pos: the current position maintained while a driver walks the tree
- *                with ieee80211_next_txq()
- * @active_list: list of struct airtime_info structs that were active within
- *               the last AIRTIME_ACTIVE_DURATION (100 ms), used to compute
- *               weight_sum
- * @last_weight_update: used for rate limiting walking active_list
- * @last_schedule_time: tracks the last time a transmission was scheduled; used
- *                      for catching up v_t if no stations are eligible for
- *                      transmission.
- * @v_t: global virtual time; queues with v_t < this are eligible for
- *       transmission
- * @weight_sum: total sum of all active stations used for dividing airtime
- * @weight_sum_reciprocal: reciprocal of weight_sum (to avoid divisions in fast
- *                         path - see comment above
- *                         IEEE80211_RECIPROCAL_DIVISOR_64)
- * @aql_txq_limit_low: AQL limit when total outstanding airtime
- *                     is < IEEE80211_AQL_THRESHOLD
- * @aql_txq_limit_high: AQL limit when total outstanding airtime
- *                      is > IEEE80211_AQL_THRESHOLD
- */
-struct airtime_sched_info {
-	spinlock_t lock;
-	struct rb_root_cached active_txqs;
-	struct rb_node *schedule_pos;
-	struct list_head active_list;
-	u64 last_weight_update;
-	u64 last_schedule_activity;
-	u64 v_t;
-	u64 weight_sum;
-	u64 weight_sum_reciprocal;
-	u32 aql_txq_limit_low;
-	u32 aql_txq_limit_high;
-};
 DECLARE_STATIC_KEY_FALSE(aql_disable);
 
 struct ieee80211_local {
@@ -1259,8 +1223,13 @@ struct ieee80211_local {
 	struct codel_params cparams;
 
 	/* protects active_txqs and txqi->schedule_order */
-	struct airtime_sched_info airtime[IEEE80211_NUM_ACS];
+	spinlock_t active_txq_lock[IEEE80211_NUM_ACS];
+	struct list_head active_txqs[IEEE80211_NUM_ACS];
+	u16 schedule_round[IEEE80211_NUM_ACS];
+
 	u16 airtime_flags;
+	u32 aql_txq_limit_low[IEEE80211_NUM_ACS];
+	u32 aql_txq_limit_high[IEEE80211_NUM_ACS];
 	u32 aql_threshold;
 	atomic_t aql_total_pending_airtime;
 
@@ -1683,125 +1652,6 @@ static inline bool txq_has_queue(struct ieee80211_txq *txq)
 	return !(skb_queue_empty(&txqi->frags) && !txqi->tin.backlog_packets);
 }
 
-static inline struct airtime_info *to_airtime_info(struct ieee80211_txq *txq)
-{
-	struct ieee80211_sub_if_data *sdata;
-	struct sta_info *sta;
-
-	if (txq->sta) {
-		sta = container_of(txq->sta, struct sta_info, sta);
-		return &sta->airtime[txq->ac];
-	}
-
-	sdata = vif_to_sdata(txq->vif);
-	return &sdata->deflink.airtime[txq->ac];
-}
-
-/* To avoid divisions in the fast path, we keep pre-computed reciprocals for
- * airtime weight calculations. There are two different weights to keep track
- * of: The per-station weight and the sum of weights per phy.
- *
- * For the per-station weights (kept in airtime_info below), we use 32-bit
- * reciprocals with a devisor of 2^19. This lets us keep the multiplications and
- * divisions for the station weights as 32-bit operations at the cost of a bit
- * of rounding error for high weights; but the choice of divisor keeps rounding
- * errors <10% for weights <2^15, assuming no more than 8ms of airtime is
- * reported at a time.
- *
- * For the per-phy sum of weights the values can get higher, so we use 64-bit
- * operations for those with a 32-bit divisor, which should avoid any
- * significant rounding errors.
- */
-#define IEEE80211_RECIPROCAL_DIVISOR_64 0x100000000ULL
-#define IEEE80211_RECIPROCAL_SHIFT_64 32
-#define IEEE80211_RECIPROCAL_DIVISOR_32 0x80000U
-#define IEEE80211_RECIPROCAL_SHIFT_32 19
-
-static inline void airtime_weight_set(struct airtime_info *air_info, u16 weight)
-{
-	if (air_info->weight == weight)
-		return;
-
-	air_info->weight = weight;
-	if (weight) {
-		air_info->weight_reciprocal =
-			IEEE80211_RECIPROCAL_DIVISOR_32 / weight;
-	} else {
-		air_info->weight_reciprocal = 0;
-	}
-}
-
-static inline void airtime_weight_sum_set(struct airtime_sched_info *air_sched,
-					  int weight_sum)
-{
-	if (air_sched->weight_sum == weight_sum)
-		return;
-
-	air_sched->weight_sum = weight_sum;
-	if (air_sched->weight_sum) {
-		air_sched->weight_sum_reciprocal = IEEE80211_RECIPROCAL_DIVISOR_64;
-		do_div(air_sched->weight_sum_reciprocal, air_sched->weight_sum);
-	} else {
-		air_sched->weight_sum_reciprocal = 0;
-	}
-}
-
-/* A problem when trying to enforce airtime fairness is that we want to divide
- * the airtime between the currently *active* stations. However, basing this on
- * the instantaneous queue state of stations doesn't work, as queues tend to
- * oscillate very quickly between empty and occupied, leading to the scheduler
- * thinking only a single station is active when deciding whether to allow
- * transmission (and thus not throttling correctly).
- *
- * To fix this we use a timer-based notion of activity: a station is considered
- * active if it has been scheduled within the last 100 ms; we keep a separate
- * list of all the stations considered active in this manner, and lazily update
- * the total weight of active stations from this list (filtering the stations in
- * the list by their 'last active' time).
- *
- * We add one additional safeguard to guard against stations that manage to get
- * scheduled every 100 ms but don't transmit a lot of data, and thus don't use
- * up any airtime. Such stations would be able to get priority for an extended
- * period of time if they do start transmitting at full capacity again, and so
- * we add an explicit maximum for how far behind a station is allowed to fall in
- * the virtual airtime domain. This limit is set to a relatively high value of
- * 20 ms because the main mechanism for catching up idle stations is the active
- * state as described above; i.e., the hard limit should only be hit in
- * pathological cases.
- */
-#define AIRTIME_ACTIVE_DURATION (100 * NSEC_PER_MSEC)
-#define AIRTIME_MAX_BEHIND 20000 /* 20 ms */
-
-static inline bool airtime_is_active(struct airtime_info *air_info, u64 now)
-{
-	return air_info->last_scheduled >= now - AIRTIME_ACTIVE_DURATION;
-}
-
-static inline void airtime_set_active(struct airtime_sched_info *air_sched,
-				      struct airtime_info *air_info, u64 now)
-{
-	air_info->last_scheduled = now;
-	air_sched->last_schedule_activity = now;
-	list_move_tail(&air_info->list, &air_sched->active_list);
-}
-
-static inline bool airtime_catchup_v_t(struct airtime_sched_info *air_sched,
-				       u64 v_t, u64 now)
-{
-	air_sched->v_t = v_t;
-	return true;
-}
-
-static inline void init_airtime_info(struct airtime_info *air_info,
-				     struct airtime_sched_info *air_sched)
-{
-	atomic_set(&air_info->aql_tx_pending, 0);
-	air_info->aql_limit_low = air_sched->aql_txq_limit_low;
-	air_info->aql_limit_high = air_sched->aql_txq_limit_high;
-	airtime_weight_set(air_info, IEEE80211_DEFAULT_AIRTIME_WEIGHT);
-	INIT_LIST_HEAD(&air_info->list);
-}
-
 static inline bool
 ieee80211_have_rx_timestamp(struct ieee80211_rx_status *status)
 {
@@ -2047,14 +1897,6 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
 			      u64 *cookie);
 int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev,
 			      const u8 *buf, size_t len);
-void ieee80211_resort_txq(struct ieee80211_hw *hw,
-			  struct ieee80211_txq *txq);
-void ieee80211_unschedule_txq(struct ieee80211_hw *hw,
-			      struct ieee80211_txq *txq,
-			      bool purge);
-void ieee80211_update_airtime_weight(struct ieee80211_local *local,
-				     struct airtime_sched_info *air_sched,
-				     u64 now, bool force);
 
 /* HT */
 void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata,
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 0cf5a395d925..56dd831fe45f 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -2176,10 +2176,6 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
 		}
 	}
 
-	for (i = 0; i < IEEE80211_NUM_ACS; i++)
-		init_airtime_info(&sdata->deflink.airtime[i],
-				  &local->airtime[i]);
-
 	ieee80211_set_default_queues(sdata);
 
 	sdata->deflink.ap_power_level = IEEE80211_UNSET_POWER_LEVEL;
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 08f153b82a23..e6c1cafbe9e5 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -784,13 +784,10 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
 	spin_lock_init(&local->queue_stop_reason_lock);
 
 	for (i = 0; i < IEEE80211_NUM_ACS; i++) {
-		struct airtime_sched_info *air_sched = &local->airtime[i];
-
-		air_sched->active_txqs = RB_ROOT_CACHED;
-		INIT_LIST_HEAD(&air_sched->active_list);
-		spin_lock_init(&air_sched->lock);
-		air_sched->aql_txq_limit_low = IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L;
-		air_sched->aql_txq_limit_high =
+		INIT_LIST_HEAD(&local->active_txqs[i]);
+		spin_lock_init(&local->active_txq_lock[i]);
+		local->aql_txq_limit_low[i] = IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L;
+		local->aql_txq_limit_high[i] =
 			IEEE80211_DEFAULT_AQL_TXQ_LIMIT_H;
 	}
 
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index d017ad14d7db..834d2171f344 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1565,8 +1565,12 @@ static void sta_ps_start(struct sta_info *sta)
 
 	for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) {
 		struct ieee80211_txq *txq = sta->sta.txq[tid];
+		struct txq_info *txqi = to_txq_info(txq);
 
-		ieee80211_unschedule_txq(&local->hw, txq, false);
+		spin_lock(&local->active_txq_lock[txq->ac]);
+		if (!list_empty(&txqi->schedule_order))
+			list_del_init(&txqi->schedule_order);
+		spin_unlock(&local->active_txq_lock[txq->ac]);
 
 		if (txq_has_queue(txq))
 			set_bit(tid, &sta->txq_buffered_tids);
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 014032369994..a1a2118b4bf0 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -559,11 +559,15 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
 	if (sta_prepare_rate_control(local, sta, gfp))
 		goto free_txq;
 
+	sta->airtime_weight = IEEE80211_DEFAULT_AIRTIME_WEIGHT;
 
 	for (i = 0; i < IEEE80211_NUM_ACS; i++) {
 		skb_queue_head_init(&sta->ps_tx_buf[i]);
 		skb_queue_head_init(&sta->tx_filtered[i]);
-		init_airtime_info(&sta->airtime[i], &local->airtime[i]);
+		sta->airtime[i].deficit = sta->airtime_weight;
+		atomic_set(&sta->airtime[i].aql_tx_pending, 0);
+		sta->airtime[i].aql_limit_low = local->aql_txq_limit_low[i];
+		sta->airtime[i].aql_limit_high = local->aql_txq_limit_high[i];
 	}
 
 	for (i = 0; i < IEEE80211_NUM_TIDS; i++)
@@ -2035,59 +2039,24 @@ void ieee80211_sta_set_buffered(struct ieee80211_sta *pubsta,
 }
 EXPORT_SYMBOL(ieee80211_sta_set_buffered);
 
-void ieee80211_register_airtime(struct ieee80211_txq *txq,
-				u32 tx_airtime, u32 rx_airtime)
+void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid,
+				    u32 tx_airtime, u32 rx_airtime)
 {
-	struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->vif);
-	struct ieee80211_local *local = sdata->local;
-	u64 weight_sum, weight_sum_reciprocal;
-	struct airtime_sched_info *air_sched;
-	struct airtime_info *air_info;
+	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+	struct ieee80211_local *local = sta->sdata->local;
+	u8 ac = ieee80211_ac_from_tid(tid);
 	u32 airtime = 0;
 
-	air_sched = &local->airtime[txq->ac];
-	air_info = to_airtime_info(txq);
-
-	if (local->airtime_flags & AIRTIME_USE_TX)
+	if (sta->local->airtime_flags & AIRTIME_USE_TX)
 		airtime += tx_airtime;
-	if (local->airtime_flags & AIRTIME_USE_RX)
+	if (sta->local->airtime_flags & AIRTIME_USE_RX)
 		airtime += rx_airtime;
 
-	/* Weights scale so the unit weight is 256 */
-	airtime <<= 8;
-
-	spin_lock_bh(&air_sched->lock);
-
-	air_info->tx_airtime += tx_airtime;
-	air_info->rx_airtime += rx_airtime;
-
-	if (air_sched->weight_sum) {
-		weight_sum = air_sched->weight_sum;
-		weight_sum_reciprocal = air_sched->weight_sum_reciprocal;
-	} else {
-		weight_sum = air_info->weight;
-		weight_sum_reciprocal = air_info->weight_reciprocal;
-	}
-
-	/* Round the calculation of global vt */
-	air_sched->v_t += (u64)((airtime + (weight_sum >> 1)) *
-				weight_sum_reciprocal) >> IEEE80211_RECIPROCAL_SHIFT_64;
-	air_info->v_t += (u32)((airtime + (air_info->weight >> 1)) *
-			       air_info->weight_reciprocal) >> IEEE80211_RECIPROCAL_SHIFT_32;
-	ieee80211_resort_txq(&local->hw, txq);
-
-	spin_unlock_bh(&air_sched->lock);
-}
-
-void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid,
-				    u32 tx_airtime, u32 rx_airtime)
-{
-	struct ieee80211_txq *txq = pubsta->txq[tid];
-
-	if (!txq)
-		return;
-
-	ieee80211_register_airtime(txq, tx_airtime, rx_airtime);
+	spin_lock_bh(&local->active_txq_lock[ac]);
+	sta->airtime[ac].tx_airtime += tx_airtime;
+	sta->airtime[ac].rx_airtime += rx_airtime;
+	sta->airtime[ac].deficit -= airtime;
+	spin_unlock_bh(&local->active_txq_lock[ac]);
 }
 EXPORT_SYMBOL(ieee80211_sta_register_airtime);
 
@@ -2501,7 +2470,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
 	}
 
 	if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT))) {
-		sinfo->airtime_weight = sta->airtime[0].weight;
+		sinfo->airtime_weight = sta->airtime_weight;
 		sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT);
 	}
 
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 4e0b969891de..3e26ad3be800 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -135,25 +135,18 @@ enum ieee80211_agg_stop_reason {
 #define AIRTIME_USE_TX		BIT(0)
 #define AIRTIME_USE_RX		BIT(1)
 
-
 struct airtime_info {
 	u64 rx_airtime;
 	u64 tx_airtime;
-	u64 v_t;
-	u64 last_scheduled;
-	struct list_head list;
+	s64 deficit;
 	atomic_t aql_tx_pending; /* Estimated airtime for frames pending */
 	u32 aql_limit_low;
 	u32 aql_limit_high;
-	u32 weight_reciprocal;
-	u16 weight;
 };
 
 void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local,
 					  struct sta_info *sta, u8 ac,
 					  u16 tx_airtime, bool tx_completed);
-void ieee80211_register_airtime(struct ieee80211_txq *txq,
-				u32 tx_airtime, u32 rx_airtime);
 
 struct sta_info;
 
@@ -609,6 +602,7 @@ struct link_sta_info {
  * @tid_seq: per-TID sequence numbers for sending to this STA
  * @airtime: per-AC struct airtime_info describing airtime statistics for this
  *	station
+ * @airtime_weight: station weight for airtime fairness calculation purposes
  * @ampdu_mlme: A-MPDU state machine state
  * @mesh: mesh STA information
  * @debugfs_dir: debug filesystem directory dentry
@@ -693,6 +687,7 @@ struct sta_info {
 	u16 tid_seq[IEEE80211_QOS_CTL_TID_MASK + 1];
 
 	struct airtime_info airtime[IEEE80211_NUM_ACS];
+	u16 airtime_weight;
 
 	/*
 	 * Aggregation information, locked with lock.
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 5c2202a7ea1c..9bd4d336d444 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -998,25 +998,6 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
 		if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && acked)
 			ieee80211_frame_acked(sta, skb);
 
-	} else if (wiphy_ext_feature_isset(local->hw.wiphy,
-					   NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) {
-		struct ieee80211_sub_if_data *sdata;
-		struct ieee80211_txq *txq;
-		u32 airtime;
-
-		/* Account airtime to multicast queue */
-		sdata = ieee80211_sdata_from_skb(local, skb);
-
-		if (sdata && (txq = sdata->vif.txq)) {
-			airtime = info->status.tx_time ?:
-				ieee80211_calc_expected_tx_airtime(hw,
-								   &sdata->vif,
-								   NULL,
-								   skb->len,
-								   false);
-
-			ieee80211_register_airtime(txq, airtime, 0);
-		}
 	}
 
 	/* SNMP counters
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index c3e14ef20c05..fcee60ce2456 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -18,7 +18,6 @@
 #include <linux/bitmap.h>
 #include <linux/rcupdate.h>
 #include <linux/export.h>
-#include <linux/timekeeping.h>
 #include <net/net_namespace.h>
 #include <net/ieee80211_radiotap.h>
 #include <net/cfg80211.h>
@@ -1479,7 +1478,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
 	codel_vars_init(&txqi->def_cvars);
 	codel_stats_init(&txqi->cstats);
 	__skb_queue_head_init(&txqi->frags);
-	RB_CLEAR_NODE(&txqi->schedule_order);
+	INIT_LIST_HEAD(&txqi->schedule_order);
 
 	txqi->txq.vif = &sdata->vif;
 
@@ -1523,7 +1522,9 @@ void ieee80211_txq_purge(struct ieee80211_local *local,
 	ieee80211_purge_tx_queue(&local->hw, &txqi->frags);
 	spin_unlock_bh(&fq->lock);
 
-	ieee80211_unschedule_txq(&local->hw, &txqi->txq, true);
+	spin_lock_bh(&local->active_txq_lock[txqi->txq.ac]);
+	list_del_init(&txqi->schedule_order);
+	spin_unlock_bh(&local->active_txq_lock[txqi->txq.ac]);
 }
 
 void ieee80211_txq_set_params(struct ieee80211_local *local)
@@ -3802,259 +3803,102 @@ EXPORT_SYMBOL(ieee80211_tx_dequeue);
 struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
-	struct airtime_sched_info *air_sched;
-	u64 now = ktime_get_coarse_boottime_ns();
 	struct ieee80211_txq *ret = NULL;
-	struct airtime_info *air_info;
-	struct txq_info *txqi = NULL;
-	struct rb_node *node;
-	bool first = false;
+	struct txq_info *txqi = NULL, *head = NULL;
+	bool found_eligible_txq = false;
 
-	air_sched = &local->airtime[ac];
-	spin_lock_bh(&air_sched->lock);
+	spin_lock_bh(&local->active_txq_lock[ac]);
 
-	node = air_sched->schedule_pos;
-
-begin:
-	if (!node) {
-		node = rb_first_cached(&air_sched->active_txqs);
-		first = true;
-	} else {
-		node = rb_next(node);
-	}
-
-	if (!node)
-		goto out;
-
-	txqi = container_of(node, struct txq_info, schedule_order);
-	air_info = to_airtime_info(&txqi->txq);
-
-	if (air_info->v_t > air_sched->v_t &&
-	    (!first || !airtime_catchup_v_t(air_sched, air_info->v_t, now)))
+ begin:
+	txqi = list_first_entry_or_null(&local->active_txqs[ac],
+					struct txq_info,
+					schedule_order);
+	if (!txqi)
 		goto out;
 
-	if (!ieee80211_txq_airtime_check(hw, &txqi->txq)) {
-		first = false;
-		goto begin;
-	}
-
-	air_sched->schedule_pos = node;
-	air_sched->last_schedule_activity = now;
-	ret = &txqi->txq;
-out:
-	spin_unlock_bh(&air_sched->lock);
-	return ret;
-}
-EXPORT_SYMBOL(ieee80211_next_txq);
-
-static void __ieee80211_insert_txq(struct rb_root_cached *root,
-				   struct txq_info *txqi)
-{
-	struct rb_node **new = &root->rb_root.rb_node;
-	struct airtime_info *old_air, *new_air;
-	struct rb_node *parent = NULL;
-	struct txq_info *__txqi;
-	bool leftmost = true;
-
-	while (*new) {
-		parent = *new;
-		__txqi = rb_entry(parent, struct txq_info, schedule_order);
-		old_air = to_airtime_info(&__txqi->txq);
-		new_air = to_airtime_info(&txqi->txq);
-
-		if (new_air->v_t <= old_air->v_t) {
-			new = &parent->rb_left;
-		} else {
-			new = &parent->rb_right;
-			leftmost = false;
-		}
+	if (txqi == head) {
+		if (!found_eligible_txq)
+			goto out;
+		else
+			found_eligible_txq = false;
 	}
 
-	rb_link_node(&txqi->schedule_order, parent, new);
-	rb_insert_color_cached(&txqi->schedule_order, root, leftmost);
-}
+	if (!head)
+		head = txqi;
 
-void ieee80211_resort_txq(struct ieee80211_hw *hw,
-			  struct ieee80211_txq *txq)
-{
-	struct airtime_info *air_info = to_airtime_info(txq);
-	struct ieee80211_local *local = hw_to_local(hw);
-	struct txq_info *txqi = to_txq_info(txq);
-	struct airtime_sched_info *air_sched;
-
-	air_sched = &local->airtime[txq->ac];
-
-	lockdep_assert_held(&air_sched->lock);
+	if (txqi->txq.sta) {
+		struct sta_info *sta = container_of(txqi->txq.sta,
+						    struct sta_info, sta);
+		bool aql_check = ieee80211_txq_airtime_check(hw, &txqi->txq);
+		s64 deficit = sta->airtime[txqi->txq.ac].deficit;
 
-	if (!RB_EMPTY_NODE(&txqi->schedule_order)) {
-		struct airtime_info *a_prev = NULL, *a_next = NULL;
-		struct txq_info *t_prev, *t_next;
-		struct rb_node *n_prev, *n_next;
+		if (aql_check)
+			found_eligible_txq = true;
 
-		/* Erasing a node can cause an expensive rebalancing operation,
-		 * so we check the previous and next nodes first and only remove
-		 * and re-insert if the current node is not already in the
-		 * correct position.
-		 */
-		if ((n_prev = rb_prev(&txqi->schedule_order)) != NULL) {
-			t_prev = container_of(n_prev, struct txq_info,
-					      schedule_order);
-			a_prev = to_airtime_info(&t_prev->txq);
-		}
+		if (deficit < 0)
+			sta->airtime[txqi->txq.ac].deficit +=
+				sta->airtime_weight;
 
-		if ((n_next = rb_next(&txqi->schedule_order)) != NULL) {
-			t_next = container_of(n_next, struct txq_info,
-					      schedule_order);
-			a_next = to_airtime_info(&t_next->txq);
+		if (deficit < 0 || !aql_check) {
+			list_move_tail(&txqi->schedule_order,
+				       &local->active_txqs[txqi->txq.ac]);
+			goto begin;
 		}
-
-		if ((!a_prev || a_prev->v_t <= air_info->v_t) &&
-		    (!a_next || a_next->v_t > air_info->v_t))
-			return;
-
-		if (air_sched->schedule_pos == &txqi->schedule_order)
-			air_sched->schedule_pos = n_prev;
-
-		rb_erase_cached(&txqi->schedule_order,
-				&air_sched->active_txqs);
-		RB_CLEAR_NODE(&txqi->schedule_order);
-		__ieee80211_insert_txq(&air_sched->active_txqs, txqi);
 	}
-}
 
-void ieee80211_update_airtime_weight(struct ieee80211_local *local,
-				     struct airtime_sched_info *air_sched,
-				     u64 now, bool force)
-{
-	struct airtime_info *air_info, *tmp;
-	u64 weight_sum = 0;
-
-	if (unlikely(!now))
-		now = ktime_get_coarse_boottime_ns();
 
-	lockdep_assert_held(&air_sched->lock);
-
-	if (!force && (air_sched->last_weight_update <
-		       now - AIRTIME_ACTIVE_DURATION))
-		return;
-
-	list_for_each_entry_safe(air_info, tmp,
-				 &air_sched->active_list, list) {
-		if (airtime_is_active(air_info, now))
-			weight_sum += air_info->weight;
-		else
-			list_del_init(&air_info->list);
-	}
-	airtime_weight_sum_set(air_sched, weight_sum);
-	air_sched->last_weight_update = now;
-}
-
-void ieee80211_schedule_txq(struct ieee80211_hw *hw,
-			    struct ieee80211_txq *txq)
-	__acquires(txq_lock) __releases(txq_lock)
-{
-	struct ieee80211_local *local = hw_to_local(hw);
-	struct txq_info *txqi = to_txq_info(txq);
-	struct airtime_sched_info *air_sched;
-	u64 now = ktime_get_coarse_boottime_ns();
-	struct airtime_info *air_info;
-	u8 ac = txq->ac;
-	bool was_active;
-
-	air_sched = &local->airtime[ac];
-	air_info = to_airtime_info(txq);
-
-	spin_lock_bh(&air_sched->lock);
-	was_active = airtime_is_active(air_info, now);
-	airtime_set_active(air_sched, air_info, now);
-
-	if (!RB_EMPTY_NODE(&txqi->schedule_order))
+	if (txqi->schedule_round == local->schedule_round[ac])
 		goto out;
 
-	/* If the station has been inactive for a while, catch up its v_t so it
-	 * doesn't get indefinite priority; see comment above the definition of
-	 * AIRTIME_MAX_BEHIND.
-	 */
-	if ((!was_active && air_info->v_t < air_sched->v_t) ||
-	    air_info->v_t < air_sched->v_t - AIRTIME_MAX_BEHIND)
-		air_info->v_t = air_sched->v_t;
-
-	ieee80211_update_airtime_weight(local, air_sched, now, !was_active);
-	__ieee80211_insert_txq(&air_sched->active_txqs, txqi);
+	list_del_init(&txqi->schedule_order);
+	txqi->schedule_round = local->schedule_round[ac];
+	ret = &txqi->txq;
 
 out:
-	spin_unlock_bh(&air_sched->lock);
-}
-EXPORT_SYMBOL(ieee80211_schedule_txq);
-
-static void __ieee80211_unschedule_txq(struct ieee80211_hw *hw,
-				       struct ieee80211_txq *txq,
-				       bool purge)
-{
-	struct ieee80211_local *local = hw_to_local(hw);
-	struct txq_info *txqi = to_txq_info(txq);
-	struct airtime_sched_info *air_sched;
-	struct airtime_info *air_info;
-
-	air_sched = &local->airtime[txq->ac];
-	air_info = to_airtime_info(&txqi->txq);
-
-	lockdep_assert_held(&air_sched->lock);
-
-	if (purge) {
-		list_del_init(&air_info->list);
-		ieee80211_update_airtime_weight(local, air_sched, 0, true);
-	}
-
-	if (RB_EMPTY_NODE(&txqi->schedule_order))
-		return;
-
-	if (air_sched->schedule_pos == &txqi->schedule_order)
-		air_sched->schedule_pos = rb_prev(&txqi->schedule_order);
-
-	if (!purge)
-		airtime_set_active(air_sched, air_info,
-				   ktime_get_coarse_boottime_ns());
-
-	rb_erase_cached(&txqi->schedule_order,
-			&air_sched->active_txqs);
-	RB_CLEAR_NODE(&txqi->schedule_order);
+	spin_unlock_bh(&local->active_txq_lock[ac]);
+	return ret;
 }
+EXPORT_SYMBOL(ieee80211_next_txq);
 
-void ieee80211_unschedule_txq(struct ieee80211_hw *hw,
+void __ieee80211_schedule_txq(struct ieee80211_hw *hw,
 			      struct ieee80211_txq *txq,
-			      bool purge)
-	__acquires(txq_lock) __releases(txq_lock)
-{
-	struct ieee80211_local *local = hw_to_local(hw);
-
-	spin_lock_bh(&local->airtime[txq->ac].lock);
-	__ieee80211_unschedule_txq(hw, txq, purge);
-	spin_unlock_bh(&local->airtime[txq->ac].lock);
-}
-
-void ieee80211_return_txq(struct ieee80211_hw *hw,
-			  struct ieee80211_txq *txq, bool force)
+			      bool force)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct txq_info *txqi = to_txq_info(txq);
 
-	spin_lock_bh(&local->airtime[txq->ac].lock);
-
-	if (!RB_EMPTY_NODE(&txqi->schedule_order) && !force &&
-	    !txq_has_queue(txq))
-		__ieee80211_unschedule_txq(hw, txq, false);
+	spin_lock_bh(&local->active_txq_lock[txq->ac]);
+
+	if (list_empty(&txqi->schedule_order) &&
+	    (force || !skb_queue_empty(&txqi->frags) ||
+	     txqi->tin.backlog_packets)) {
+		/* If airtime accounting is active, always enqueue STAs at the
+		 * head of the list to ensure that they only get moved to the
+		 * back by the airtime DRR scheduler once they have a negative
+		 * deficit. A station that already has a negative deficit will
+		 * get immediately moved to the back of the list on the next
+		 * call to ieee80211_next_txq().
+		 */
+		if (txqi->txq.sta && local->airtime_flags &&
+		    wiphy_ext_feature_isset(local->hw.wiphy,
+					    NL80211_EXT_FEATURE_AIRTIME_FAIRNESS))
+			list_add(&txqi->schedule_order,
+				 &local->active_txqs[txq->ac]);
+		else
+			list_add_tail(&txqi->schedule_order,
+				      &local->active_txqs[txq->ac]);
+	}
 
-	spin_unlock_bh(&local->airtime[txq->ac].lock);
+	spin_unlock_bh(&local->active_txq_lock[txq->ac]);
 }
-EXPORT_SYMBOL(ieee80211_return_txq);
+EXPORT_SYMBOL(__ieee80211_schedule_txq);
 
 DEFINE_STATIC_KEY_FALSE(aql_disable);
 
 bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw,
 				 struct ieee80211_txq *txq)
 {
-	struct airtime_info *air_info = to_airtime_info(txq);
+	struct sta_info *sta;
 	struct ieee80211_local *local = hw_to_local(hw);
 
 	if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL))
@@ -4069,12 +3913,15 @@ bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw,
 	if (unlikely(txq->tid == IEEE80211_NUM_TIDS))
 		return true;
 
-	if (atomic_read(&air_info->aql_tx_pending) < air_info->aql_limit_low)
+	sta = container_of(txq->sta, struct sta_info, sta);
+	if (atomic_read(&sta->airtime[txq->ac].aql_tx_pending) <
+	    sta->airtime[txq->ac].aql_limit_low)
 		return true;
 
 	if (atomic_read(&local->aql_total_pending_airtime) <
 	    local->aql_threshold &&
-	    atomic_read(&air_info->aql_tx_pending) < air_info->aql_limit_high)
+	    atomic_read(&sta->airtime[txq->ac].aql_tx_pending) <
+	    sta->airtime[txq->ac].aql_limit_high)
 		return true;
 
 	return false;
@@ -4084,59 +3931,60 @@ EXPORT_SYMBOL(ieee80211_txq_airtime_check);
 bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw,
 				struct ieee80211_txq *txq)
 {
-	struct txq_info *first_txqi = NULL, *txqi = to_txq_info(txq);
 	struct ieee80211_local *local = hw_to_local(hw);
-	struct airtime_sched_info *air_sched;
-	struct airtime_info *air_info;
-	struct rb_node *node = NULL;
-	bool ret = false;
-	u64 now;
-
-
-	if (!ieee80211_txq_airtime_check(hw, txq))
-		return false;
+	struct txq_info *iter, *tmp, *txqi = to_txq_info(txq);
+	struct sta_info *sta;
+	u8 ac = txq->ac;
 
-	air_sched = &local->airtime[txq->ac];
-	spin_lock_bh(&air_sched->lock);
+	spin_lock_bh(&local->active_txq_lock[ac]);
 
-	if (RB_EMPTY_NODE(&txqi->schedule_order))
+	if (!txqi->txq.sta)
 		goto out;
 
-	now = ktime_get_coarse_boottime_ns();
+	if (list_empty(&txqi->schedule_order))
+		goto out;
 
-	/* Like in ieee80211_next_txq(), make sure the first station in the
-	 * scheduling order is eligible for transmission to avoid starvation.
-	 */
-	node = rb_first_cached(&air_sched->active_txqs);
-	if (node) {
-		first_txqi = container_of(node, struct txq_info,
-					  schedule_order);
-		air_info = to_airtime_info(&first_txqi->txq);
+	list_for_each_entry_safe(iter, tmp, &local->active_txqs[ac],
+				 schedule_order) {
+		if (iter == txqi)
+			break;
 
-		if (air_sched->v_t < air_info->v_t)
-			airtime_catchup_v_t(air_sched, air_info->v_t, now);
+		if (!iter->txq.sta) {
+			list_move_tail(&iter->schedule_order,
+				       &local->active_txqs[ac]);
+			continue;
+		}
+		sta = container_of(iter->txq.sta, struct sta_info, sta);
+		if (sta->airtime[ac].deficit < 0)
+			sta->airtime[ac].deficit += sta->airtime_weight;
+		list_move_tail(&iter->schedule_order, &local->active_txqs[ac]);
 	}
 
-	air_info = to_airtime_info(&txqi->txq);
-	if (air_info->v_t <= air_sched->v_t) {
-		air_sched->last_schedule_activity = now;
-		ret = true;
-	}
+	sta = container_of(txqi->txq.sta, struct sta_info, sta);
+	if (sta->airtime[ac].deficit >= 0)
+		goto out;
+
+	sta->airtime[ac].deficit += sta->airtime_weight;
+	list_move_tail(&txqi->schedule_order, &local->active_txqs[ac]);
+	spin_unlock_bh(&local->active_txq_lock[ac]);
 
+	return false;
 out:
-	spin_unlock_bh(&air_sched->lock);
-	return ret;
+	if (!list_empty(&txqi->schedule_order))
+		list_del_init(&txqi->schedule_order);
+	spin_unlock_bh(&local->active_txq_lock[ac]);
+
+	return true;
 }
 EXPORT_SYMBOL(ieee80211_txq_may_transmit);
 
 void ieee80211_txq_schedule_start(struct ieee80211_hw *hw, u8 ac)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
-	struct airtime_sched_info *air_sched = &local->airtime[ac];
 
-	spin_lock_bh(&air_sched->lock);
-	air_sched->schedule_pos = NULL;
-	spin_unlock_bh(&air_sched->lock);
+	spin_lock_bh(&local->active_txq_lock[ac]);
+	local->schedule_round[ac]++;
+	spin_unlock_bh(&local->active_txq_lock[ac]);
 }
 EXPORT_SYMBOL(ieee80211_txq_schedule_start);
 
-- 
cgit 


From ecad3b0b99bff7247a11f8c7cb19ac9b0cb28b09 Mon Sep 17 00:00:00 2001
From: Veerendranath Jakkam <quic_vjakkam@quicinc.com>
Date: Mon, 23 May 2022 18:55:58 +0530
Subject: wifi: cfg80211: Increase akm_suites array size in
 cfg80211_crypto_settings

Increase akm_suites array size in struct cfg80211_crypto_settings to 10
and advertise the capability to userspace. This allows userspace to send
more than two AKMs to driver in netlink commands such as
NL80211_CMD_CONNECT.

This capability is needed for implementing WPA3-Personal transition mode
correctly with any driver that handles roaming internally. Currently,
the possible AKMs for multi-AKM connect can include PSK, PSK-SHA-256,
SAE, FT-PSK and FT-SAE. Since the count is already 5, increasing
the akm_suites array size to 10 should be reasonable for future
usecases.

Signed-off-by: Veerendranath Jakkam <quic_vjakkam@quicinc.com>
Link: https://lore.kernel.org/r/1653312358-12321-1-git-send-email-quic_vjakkam@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/quantenna/qtnfmac/commands.c | 12 ++++++++----
 include/net/cfg80211.h                            | 11 ++++++++++-
 include/uapi/linux/nl80211.h                      | 14 ++++++++++++++
 net/wireless/core.c                               |  6 ++++++
 net/wireless/nl80211.c                            |  7 ++++++-
 5 files changed, 44 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/quantenna/qtnfmac/commands.c b/drivers/net/wireless/quantenna/qtnfmac/commands.c
index 3d734a7a5ba8..0fad53693292 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/commands.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/commands.c
@@ -241,6 +241,7 @@ int qtnf_cmd_send_start_ap(struct qtnf_vif *vif,
 	struct qlink_auth_encr *aen;
 	int ret;
 	int i;
+	int n;
 
 	if (!qtnf_cmd_start_ap_can_fit(vif, s))
 		return -E2BIG;
@@ -280,8 +281,9 @@ int qtnf_cmd_send_start_ap(struct qtnf_vif *vif,
 	for (i = 0; i < QLINK_MAX_NR_CIPHER_SUITES; i++)
 		aen->ciphers_pairwise[i] =
 				cpu_to_le32(s->crypto.ciphers_pairwise[i]);
-	aen->n_akm_suites = cpu_to_le32(s->crypto.n_akm_suites);
-	for (i = 0; i < QLINK_MAX_NR_AKM_SUITES; i++)
+	n = min(QLINK_MAX_NR_AKM_SUITES, s->crypto.n_akm_suites);
+	aen->n_akm_suites = cpu_to_le32(n);
+	for (i = 0; i < n; i++)
 		aen->akm_suites[i] = cpu_to_le32(s->crypto.akm_suites[i]);
 	aen->control_port = s->crypto.control_port;
 	aen->control_port_no_encrypt = s->crypto.control_port_no_encrypt;
@@ -2076,6 +2078,7 @@ int qtnf_cmd_send_connect(struct qtnf_vif *vif,
 	struct qlink_auth_encr *aen;
 	int ret;
 	int i;
+	int n;
 	u32 connect_flags = 0;
 
 	cmd_skb = qtnf_cmd_alloc_new_cmdskb(vif->mac->macid, vif->vifid,
@@ -2132,9 +2135,10 @@ int qtnf_cmd_send_connect(struct qtnf_vif *vif,
 		aen->ciphers_pairwise[i] =
 			cpu_to_le32(sme->crypto.ciphers_pairwise[i]);
 
-	aen->n_akm_suites = cpu_to_le32(sme->crypto.n_akm_suites);
+	n = min(QLINK_MAX_NR_AKM_SUITES, sme->crypto.n_akm_suites);
+	aen->n_akm_suites = cpu_to_le32(n);
 
-	for (i = 0; i < QLINK_MAX_NR_AKM_SUITES; i++)
+	for (i = 0; i < n; i++)
 		aen->akm_suites[i] = cpu_to_le32(sme->crypto.akm_suites[i]);
 
 	aen->control_port = sme->crypto.control_port;
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 87ebed6a48bd..6bc161d653f3 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1063,6 +1063,7 @@ struct survey_info {
 };
 
 #define CFG80211_MAX_WEP_KEYS	4
+#define CFG80211_MAX_NUM_AKM_SUITES	10
 
 /**
  * struct cfg80211_crypto_settings - Crypto settings
@@ -1114,7 +1115,7 @@ struct cfg80211_crypto_settings {
 	int n_ciphers_pairwise;
 	u32 ciphers_pairwise[NL80211_MAX_NR_CIPHER_SUITES];
 	int n_akm_suites;
-	u32 akm_suites[NL80211_MAX_NR_AKM_SUITES];
+	u32 akm_suites[CFG80211_MAX_NUM_AKM_SUITES];
 	bool control_port;
 	__be16 control_port_ethertype;
 	bool control_port_no_encrypt;
@@ -5200,6 +5201,13 @@ struct wiphy_iftype_akm_suites {
  * @ema_max_profile_periodicity: maximum profile periodicity supported by
  *	the driver. Setting this field to a non-zero value indicates that the
  *	driver supports enhanced multi-BSSID advertisements (EMA AP).
+ * @max_num_akm_suites: maximum number of AKM suites allowed for
+ *	configuration through %NL80211_CMD_CONNECT, %NL80211_CMD_ASSOCIATE and
+ *	%NL80211_CMD_START_AP. Set to NL80211_MAX_NR_AKM_SUITES if not set by
+ *	driver. If set by driver minimum allowed value is
+ *	NL80211_MAX_NR_AKM_SUITES in order to avoid compatibility issues with
+ *	legacy userspace and maximum allowed value is
+ *	CFG80211_MAX_NUM_AKM_SUITES.
  */
 struct wiphy {
 	struct mutex mtx;
@@ -5346,6 +5354,7 @@ struct wiphy {
 
 	u8 mbssid_max_interfaces;
 	u8 ema_max_profile_periodicity;
+	u16 max_num_akm_suites;
 
 	char priv[] __aligned(NETDEV_ALIGN);
 };
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 89f64f46b98d..279f9715919e 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2694,6 +2694,13 @@ enum nl80211_commands {
  *	connection. Used with %NL80211_CMD_CONNECT. If this attribute is not
  *	included in NL80211_CMD_CONNECT drivers must not perform MLO connection.
  *
+ * @NL80211_ATTR_MAX_NUM_AKM_SUITES: U16 attribute. Indicates maximum number of
+ *	AKM suites allowed for %NL80211_CMD_CONNECT, %NL80211_CMD_ASSOCIATE and
+ *	%NL80211_CMD_START_AP in %NL80211_CMD_GET_WIPHY response. If this
+ *	attribute is not present userspace shall consider maximum number of AKM
+ *	suites allowed as %NL80211_MAX_NR_AKM_SUITES which is the legacy maximum
+ *	number prior to the introduction of this attribute.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -3214,6 +3221,8 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_MLO_SUPPORT,
 
+	NL80211_ATTR_MAX_NUM_AKM_SUITES,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -3268,6 +3277,11 @@ enum nl80211_attrs {
 #define NL80211_HE_MIN_CAPABILITY_LEN           16
 #define NL80211_HE_MAX_CAPABILITY_LEN           54
 #define NL80211_MAX_NR_CIPHER_SUITES		5
+
+/*
+ * NL80211_MAX_NR_AKM_SUITES is obsolete when %NL80211_ATTR_MAX_NUM_AKM_SUITES
+ * present in %NL80211_CMD_GET_WIPHY response.
+ */
 #define NL80211_MAX_NR_AKM_SUITES		2
 #define NL80211_EHT_MIN_CAPABILITY_LEN          13
 #define NL80211_EHT_MAX_CAPABILITY_LEN          51
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 3e5d12040726..6b5321bb1176 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -913,6 +913,12 @@ int wiphy_register(struct wiphy *wiphy)
 		return -EINVAL;
 #endif
 
+	if (!wiphy->max_num_akm_suites)
+		wiphy->max_num_akm_suites = NL80211_MAX_NR_AKM_SUITES;
+	else if (wiphy->max_num_akm_suites < NL80211_MAX_NR_AKM_SUITES ||
+		 wiphy->max_num_akm_suites > CFG80211_MAX_NUM_AKM_SUITES)
+		return -EINVAL;
+
 	/* check and set up bitrates */
 	ieee80211_set_bitrate_flags(wiphy);
 
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index b583a76ef492..e2b6740268a6 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -798,6 +798,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 		NLA_POLICY_RANGE(NLA_U8, 0, IEEE80211_MLD_MAX_NUM_LINKS),
 	[NL80211_ATTR_MLD_ADDR] = NLA_POLICY_EXACT_LEN(ETH_ALEN),
 	[NL80211_ATTR_MLO_SUPPORT] = { .type = NLA_FLAG },
+	[NL80211_ATTR_MAX_NUM_AKM_SUITES] = { .type = NLA_REJECT },
 };
 
 /* policy for the key attributes */
@@ -2932,6 +2933,10 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 		if (nl80211_put_mbssid_support(&rdev->wiphy, msg))
 			goto nla_put_failure;
 
+		if (nla_put_u16(msg, NL80211_ATTR_MAX_NUM_AKM_SUITES,
+				rdev->wiphy.max_num_akm_suites))
+			goto nla_put_failure;
+
 		/* done */
 		state->split_start = 0;
 		break;
@@ -10431,7 +10436,7 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
 		if (len % sizeof(u32))
 			return -EINVAL;
 
-		if (settings->n_akm_suites > NL80211_MAX_NR_AKM_SUITES)
+		if (settings->n_akm_suites > rdev->wiphy.max_num_akm_suites)
 			return -EINVAL;
 
 		memcpy(settings->akm_suites, data, len);
-- 
cgit 


From 8bc65d38ee466897a264c9e336fe21058818b1b1 Mon Sep 17 00:00:00 2001
From: Aloka Dixit <quic_alokad@quicinc.com>
Date: Sun, 22 May 2022 23:49:04 -0700
Subject: wifi: nl80211: retrieve EHT related elements in AP mode

Add support to retrieve EHT capabilities and EHT operation elements
passed by the userspace in the beacon template and store the pointers
in struct cfg80211_ap_settings to be used by the drivers.

Co-developed-by: Vikram Kandukuri <quic_vikram@quicinc.com>
Signed-off-by: Vikram Kandukuri <quic_vikram@quicinc.com>
Co-developed-by: Veerendranath Jakkam <quic_vjakkam@quicinc.com>
Signed-off-by: Veerendranath Jakkam <quic_vjakkam@quicinc.com>
Signed-off-by: Aloka Dixit <quic_alokad@quicinc.com>
Link: https://lore.kernel.org/r/20220523064904.28523-1-quic_alokad@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h |  4 ++++
 net/wireless/nl80211.c | 26 ++++++++++++++++++++++++--
 2 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 6bc161d653f3..140354f5f15b 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1296,6 +1296,8 @@ struct cfg80211_unsol_bcast_probe_resp {
  * @ht_cap: HT capabilities (or %NULL if HT isn't enabled)
  * @vht_cap: VHT capabilities (or %NULL if VHT isn't enabled)
  * @he_cap: HE capabilities (or %NULL if HE isn't enabled)
+ * @eht_cap: EHT capabilities (or %NULL if EHT isn't enabled)
+ * @eht_oper: EHT operation IE (or %NULL if EHT isn't enabled)
  * @ht_required: stations must support HT
  * @vht_required: stations must support VHT
  * @twt_responder: Enable Target Wait Time
@@ -1332,6 +1334,8 @@ struct cfg80211_ap_settings {
 	const struct ieee80211_vht_cap *vht_cap;
 	const struct ieee80211_he_cap_elem *he_cap;
 	const struct ieee80211_he_operation *he_oper;
+	const struct ieee80211_eht_cap_elem *eht_cap;
+	const struct ieee80211_eht_operation *eht_oper;
 	bool ht_required, vht_required, he_required, sae_h2e_required;
 	bool twt_responder;
 	u32 flags;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index e2b6740268a6..eda2ad029c90 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -5522,7 +5522,7 @@ static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params,
  * HT/VHT requirements/capabilities, we parse them out of the IEs for the
  * benefit of drivers that rebuild IEs in the firmware.
  */
-static void nl80211_calculate_ap_params(struct cfg80211_ap_settings *params)
+static int nl80211_calculate_ap_params(struct cfg80211_ap_settings *params)
 {
 	const struct cfg80211_beacon_data *bcn = &params->beacon;
 	size_t ies_len = bcn->tail_len;
@@ -5548,6 +5548,26 @@ static void nl80211_calculate_ap_params(struct cfg80211_ap_settings *params)
 	cap = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_OPERATION, ies, ies_len);
 	if (cap && cap->datalen >= sizeof(*params->he_oper) + 1)
 		params->he_oper = (void *)(cap->data + 1);
+	cap = cfg80211_find_ext_elem(WLAN_EID_EXT_EHT_CAPABILITY, ies, ies_len);
+	if (cap) {
+		if (!cap->datalen)
+			return -EINVAL;
+		params->eht_cap = (void *)(cap->data + 1);
+		if (!ieee80211_eht_capa_size_ok((const u8 *)params->he_cap,
+						(const u8 *)params->eht_cap,
+						cap->datalen - 1))
+			return -EINVAL;
+	}
+	cap = cfg80211_find_ext_elem(WLAN_EID_EXT_EHT_OPERATION, ies, ies_len);
+	if (cap) {
+		if (!cap->datalen)
+			return -EINVAL;
+		params->eht_oper = (void *)(cap->data + 1);
+		if (!ieee80211_eht_oper_size_ok((const u8 *)params->eht_oper,
+						cap->datalen - 1))
+			return -EINVAL;
+	}
+	return 0;
 }
 
 static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev,
@@ -5873,7 +5893,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 			goto out_unlock;
 	}
 
-	nl80211_calculate_ap_params(params);
+	err = nl80211_calculate_ap_params(params);
+	if (err)
+		goto out_unlock;
 
 	if (info->attrs[NL80211_ATTR_AP_SETTINGS_FLAGS])
 		params->flags = nla_get_u32(
-- 
cgit 


From e918c137db4083e59866d2aaa603887cbd9969bf Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Thu, 30 Jun 2022 15:17:57 -0700
Subject: net: remove SK_RECLAIM_THRESHOLD and SK_RECLAIM_CHUNK

There are no more users for the mentioned macros, just
drop them.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 40bbd0e8925b..0dd43c3df49b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1619,11 +1619,6 @@ static inline void sk_mem_charge(struct sock *sk, int size)
 	sk->sk_forward_alloc -= size;
 }
 
-/* the following macros control memory reclaiming in mptcp_rmem_uncharge()
- */
-#define SK_RECLAIM_THRESHOLD	(1 << 21)
-#define SK_RECLAIM_CHUNK	(1 << 20)
-
 static inline void sk_mem_uncharge(struct sock *sk, int size)
 {
 	if (!sk_has_account(sk))
-- 
cgit 


From 31a371e419c885e0f137ce70395356ba8639dc52 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Wed, 29 Jun 2022 17:42:08 +0300
Subject: fanotify: prepare for setting event flags in ignore mask

Setting flags FAN_ONDIR FAN_EVENT_ON_CHILD in ignore mask has no effect.
The FAN_EVENT_ON_CHILD flag in mask implicitly applies to ignore mask and
ignore mask is always implicitly applied to events on directories.

Define a mark flag that replaces this legacy behavior with logic of
applying the ignore mask according to event flags in ignore mask.

Implement the new logic to prepare for supporting an ignore mask that
ignores events on children and ignore mask that does not ignore events
on directories.

To emphasize the change in terminology, also rename ignored_mask mark
member to ignore_mask and use accessors to get only the effective
ignored events or the ignored events and flags.

This change in terminology finally aligns with the "ignore mask"
language in man pages and in most of the comments.

Link: https://lore.kernel.org/r/20220629144210.2983229-2-amir73il@gmail.com
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fanotify/fanotify.c      | 19 ++++----
 fs/notify/fanotify/fanotify_user.c | 21 +++++----
 fs/notify/fdinfo.c                 |  6 +--
 fs/notify/fsnotify.c               | 21 +++++----
 include/linux/fsnotify_backend.h   | 89 +++++++++++++++++++++++++++++++++++---
 5 files changed, 121 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 4f897e109547..cd7d09a569ff 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -295,12 +295,13 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
 				     const void *data, int data_type,
 				     struct inode *dir)
 {
-	__u32 marks_mask = 0, marks_ignored_mask = 0;
+	__u32 marks_mask = 0, marks_ignore_mask = 0;
 	__u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS |
 				     FANOTIFY_EVENT_FLAGS;
 	const struct path *path = fsnotify_data_path(data, data_type);
 	unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
 	struct fsnotify_mark *mark;
+	bool ondir = event_mask & FAN_ONDIR;
 	int type;
 
 	pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n",
@@ -315,19 +316,21 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
 			return 0;
 	} else if (!(fid_mode & FAN_REPORT_FID)) {
 		/* Do we have a directory inode to report? */
-		if (!dir && !(event_mask & FS_ISDIR))
+		if (!dir && !ondir)
 			return 0;
 	}
 
 	fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
-		/* Apply ignore mask regardless of mark's ISDIR flag */
-		marks_ignored_mask |= mark->ignored_mask;
+		/*
+		 * Apply ignore mask depending on event flags in ignore mask.
+		 */
+		marks_ignore_mask |=
+			fsnotify_effective_ignore_mask(mark, ondir, type);
 
 		/*
-		 * If the event is on dir and this mark doesn't care about
-		 * events on dir, don't send it!
+		 * Send the event depending on event flags in mark mask.
 		 */
-		if (event_mask & FS_ISDIR && !(mark->mask & FS_ISDIR))
+		if (!fsnotify_mask_applicable(mark->mask, ondir, type))
 			continue;
 
 		marks_mask |= mark->mask;
@@ -336,7 +339,7 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
 		*match_mask |= 1U << type;
 	}
 
-	test_mask = event_mask & marks_mask & ~marks_ignored_mask;
+	test_mask = event_mask & marks_mask & ~marks_ignore_mask;
 
 	/*
 	 * For dirent modification events (create/delete/move) that do not carry
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index b08ce0d821a7..a9eea037fee9 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1012,7 +1012,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
 	if (!(flags & FAN_MARK_IGNORED_MASK)) {
 		fsn_mark->mask &= ~mask;
 	} else {
-		fsn_mark->ignored_mask &= ~mask;
+		fsn_mark->ignore_mask &= ~mask;
 	}
 	newmask = fsnotify_calc_mask(fsn_mark);
 	/*
@@ -1021,7 +1021,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
 	 * changes to the mask.
 	 * Destroy mark when only umask bits remain.
 	 */
-	*destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask);
+	*destroy = !((fsn_mark->mask | fsn_mark->ignore_mask) & ~umask);
 	spin_unlock(&fsn_mark->lock);
 
 	return oldmask & ~newmask;
@@ -1090,7 +1090,7 @@ static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark,
 	/*
 	 * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to
 	 * the removal of the FS_MODIFY bit in calculated mask if it was set
-	 * because of an ignored mask that is now going to survive FS_MODIFY.
+	 * because of an ignore mask that is now going to survive FS_MODIFY.
 	 */
 	if ((fan_flags & FAN_MARK_IGNORED_MASK) &&
 	    (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
@@ -1123,7 +1123,7 @@ static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
 	if (!(fan_flags & FAN_MARK_IGNORED_MASK))
 		fsn_mark->mask |= mask;
 	else
-		fsn_mark->ignored_mask |= mask;
+		fsn_mark->ignore_mask |= mask;
 
 	recalc = fsnotify_calc_mask(fsn_mark) &
 		~fsnotify_conn_mask(fsn_mark->connector);
@@ -1261,7 +1261,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 
 	/*
 	 * If some other task has this inode open for write we should not add
-	 * an ignored mark, unless that ignored mark is supposed to survive
+	 * an ignore mask, unless that ignore mask is supposed to survive
 	 * modification changes anyway.
 	 */
 	if ((flags & FAN_MARK_IGNORED_MASK) &&
@@ -1557,7 +1557,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 	__kernel_fsid_t __fsid, *fsid = NULL;
 	u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
 	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
-	bool ignored = flags & FAN_MARK_IGNORED_MASK;
+	bool ignore = flags & FAN_MARK_IGNORED_MASK;
 	unsigned int obj_type, fid_mode;
 	u32 umask = 0;
 	int ret;
@@ -1606,8 +1606,11 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 	if (mask & ~valid_mask)
 		return -EINVAL;
 
-	/* Event flags (ONDIR, ON_CHILD) are meaningless in ignored mask */
-	if (ignored)
+	/*
+	 * Event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) have no effect with
+	 * FAN_MARK_IGNORED_MASK.
+	 */
+	if (ignore)
 		mask &= ~FANOTIFY_EVENT_FLAGS;
 
 	f = fdget(fanotify_fd);
@@ -1721,7 +1724,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 		 * events with parent/name info for non-directory.
 		 */
 		if ((fid_mode & FAN_REPORT_DIR_FID) &&
-		    (flags & FAN_MARK_ADD) && !ignored)
+		    (flags & FAN_MARK_ADD) && !ignore)
 			mask |= FAN_EVENT_ON_CHILD;
 	}
 
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 59fb40abe33d..55081ae3a6ec 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -113,7 +113,7 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
 			return;
 		seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ",
 			   inode->i_ino, inode->i_sb->s_dev,
-			   mflags, mark->mask, mark->ignored_mask);
+			   mflags, mark->mask, mark->ignore_mask);
 		show_mark_fhandle(m, inode);
 		seq_putc(m, '\n');
 		iput(inode);
@@ -121,12 +121,12 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
 		struct mount *mnt = fsnotify_conn_mount(mark->connector);
 
 		seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n",
-			   mnt->mnt_id, mflags, mark->mask, mark->ignored_mask);
+			   mnt->mnt_id, mflags, mark->mask, mark->ignore_mask);
 	} else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_SB) {
 		struct super_block *sb = fsnotify_conn_sb(mark->connector);
 
 		seq_printf(m, "fanotify sdev:%x mflags:%x mask:%x ignored_mask:%x\n",
-			   sb->s_dev, mflags, mark->mask, mark->ignored_mask);
+			   sb->s_dev, mflags, mark->mask, mark->ignore_mask);
 	}
 }
 
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 0b3e74935cb4..8687562df2e3 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -324,7 +324,8 @@ static int send_to_group(__u32 mask, const void *data, int data_type,
 	struct fsnotify_group *group = NULL;
 	__u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS);
 	__u32 marks_mask = 0;
-	__u32 marks_ignored_mask = 0;
+	__u32 marks_ignore_mask = 0;
+	bool is_dir = mask & FS_ISDIR;
 	struct fsnotify_mark *mark;
 	int type;
 
@@ -336,7 +337,7 @@ static int send_to_group(__u32 mask, const void *data, int data_type,
 		fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
 			if (!(mark->flags &
 			      FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
-				mark->ignored_mask = 0;
+				mark->ignore_mask = 0;
 		}
 	}
 
@@ -344,14 +345,15 @@ static int send_to_group(__u32 mask, const void *data, int data_type,
 	fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
 		group = mark->group;
 		marks_mask |= mark->mask;
-		marks_ignored_mask |= mark->ignored_mask;
+		marks_ignore_mask |=
+			fsnotify_effective_ignore_mask(mark, is_dir, type);
 	}
 
-	pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignored_mask=%x data=%p data_type=%d dir=%p cookie=%d\n",
-		 __func__, group, mask, marks_mask, marks_ignored_mask,
+	pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignore_mask=%x data=%p data_type=%d dir=%p cookie=%d\n",
+		 __func__, group, mask, marks_mask, marks_ignore_mask,
 		 data, data_type, dir, cookie);
 
-	if (!(test_mask & marks_mask & ~marks_ignored_mask))
+	if (!(test_mask & marks_mask & ~marks_ignore_mask))
 		return 0;
 
 	if (group->ops->handle_event) {
@@ -423,7 +425,8 @@ static bool fsnotify_iter_select_report_types(
 			 * But is *this mark* watching children?
 			 */
 			if (type == FSNOTIFY_ITER_TYPE_PARENT &&
-			    !(mark->mask & FS_EVENT_ON_CHILD))
+			    !(mark->mask & FS_EVENT_ON_CHILD) &&
+			    !(fsnotify_ignore_mask(mark) & FS_EVENT_ON_CHILD))
 				continue;
 
 			fsnotify_iter_set_report_type(iter_info, type);
@@ -532,8 +535,8 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
 
 
 	/*
-	 * If this is a modify event we may need to clear some ignored masks.
-	 * In that case, the object with ignored masks will have the FS_MODIFY
+	 * If this is a modify event we may need to clear some ignore masks.
+	 * In that case, the object with ignore masks will have the FS_MODIFY
 	 * event in its mask.
 	 * Otherwise, return if none of the marks care about this type of event.
 	 */
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 9560734759fa..d7d96c806bff 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -518,8 +518,8 @@ struct fsnotify_mark {
 	struct hlist_node obj_list;
 	/* Head of list of marks for an object [mark ref] */
 	struct fsnotify_mark_connector *connector;
-	/* Events types to ignore [mark->lock, group->mark_mutex] */
-	__u32 ignored_mask;
+	/* Events types and flags to ignore [mark->lock, group->mark_mutex] */
+	__u32 ignore_mask;
 	/* General fsnotify mark flags */
 #define FSNOTIFY_MARK_FLAG_ALIVE		0x0001
 #define FSNOTIFY_MARK_FLAG_ATTACHED		0x0002
@@ -529,6 +529,7 @@ struct fsnotify_mark {
 	/* fanotify mark flags */
 #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY	0x0100
 #define FSNOTIFY_MARK_FLAG_NO_IREF		0x0200
+#define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS	0x0400
 	unsigned int flags;		/* flags [mark->lock] */
 };
 
@@ -655,15 +656,91 @@ extern void fsnotify_remove_queued_event(struct fsnotify_group *group,
 
 /* functions used to manipulate the marks attached to inodes */
 
-/* Get mask for calculating object interest taking ignored mask into account */
+/*
+ * Canonical "ignore mask" including event flags.
+ *
+ * Note the subtle semantic difference from the legacy ->ignored_mask.
+ * ->ignored_mask traditionally only meant which events should be ignored,
+ * while ->ignore_mask also includes flags regarding the type of objects on
+ * which events should be ignored.
+ */
+static inline __u32 fsnotify_ignore_mask(struct fsnotify_mark *mark)
+{
+	__u32 ignore_mask = mark->ignore_mask;
+
+	/* The event flags in ignore mask take effect */
+	if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS)
+		return ignore_mask;
+
+	/*
+	 * Legacy behavior:
+	 * - Always ignore events on dir
+	 * - Ignore events on child if parent is watching children
+	 */
+	ignore_mask |= FS_ISDIR;
+	ignore_mask &= ~FS_EVENT_ON_CHILD;
+	ignore_mask |= mark->mask & FS_EVENT_ON_CHILD;
+
+	return ignore_mask;
+}
+
+/* Legacy ignored_mask - only event types to ignore */
+static inline __u32 fsnotify_ignored_events(struct fsnotify_mark *mark)
+{
+	return mark->ignore_mask & ALL_FSNOTIFY_EVENTS;
+}
+
+/*
+ * Check if mask (or ignore mask) should be applied depending if victim is a
+ * directory and whether it is reported to a watching parent.
+ */
+static inline bool fsnotify_mask_applicable(__u32 mask, bool is_dir,
+					    int iter_type)
+{
+	/* Should mask be applied to a directory? */
+	if (is_dir && !(mask & FS_ISDIR))
+		return false;
+
+	/* Should mask be applied to a child? */
+	if (iter_type == FSNOTIFY_ITER_TYPE_PARENT &&
+	    !(mask & FS_EVENT_ON_CHILD))
+		return false;
+
+	return true;
+}
+
+/*
+ * Effective ignore mask taking into account if event victim is a
+ * directory and whether it is reported to a watching parent.
+ */
+static inline __u32 fsnotify_effective_ignore_mask(struct fsnotify_mark *mark,
+						   bool is_dir, int iter_type)
+{
+	__u32 ignore_mask = fsnotify_ignored_events(mark);
+
+	if (!ignore_mask)
+		return 0;
+
+	/* For non-dir and non-child, no need to consult the event flags */
+	if (!is_dir && iter_type != FSNOTIFY_ITER_TYPE_PARENT)
+		return ignore_mask;
+
+	ignore_mask = fsnotify_ignore_mask(mark);
+	if (!fsnotify_mask_applicable(ignore_mask, is_dir, iter_type))
+		return 0;
+
+	return ignore_mask & ALL_FSNOTIFY_EVENTS;
+}
+
+/* Get mask for calculating object interest taking ignore mask into account */
 static inline __u32 fsnotify_calc_mask(struct fsnotify_mark *mark)
 {
 	__u32 mask = mark->mask;
 
-	if (!mark->ignored_mask)
+	if (!fsnotify_ignored_events(mark))
 		return mask;
 
-	/* Interest in FS_MODIFY may be needed for clearing ignored mask */
+	/* Interest in FS_MODIFY may be needed for clearing ignore mask */
 	if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
 		mask |= FS_MODIFY;
 
@@ -671,7 +748,7 @@ static inline __u32 fsnotify_calc_mask(struct fsnotify_mark *mark)
 	 * If mark is interested in ignoring events on children, the object must
 	 * show interest in those events for fsnotify_parent() to notice it.
 	 */
-	return mask | (mark->ignored_mask & ALL_FSNOTIFY_EVENTS);
+	return mask | mark->ignore_mask;
 }
 
 /* Get mask of events for a list of marks */
-- 
cgit 


From 8afd7215aa97f8868d033f6e1d01a276ab2d29c0 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Wed, 29 Jun 2022 17:42:09 +0300
Subject: fanotify: cleanups for fanotify_mark() input validations

Create helper fanotify_may_update_existing_mark() for checking for
conflicts between existing mark flags and fanotify_mark() flags.

Use variable mark_cmd to make the checks for mark command bits
cleaner.

Link: https://lore.kernel.org/r/20220629144210.2983229-3-amir73il@gmail.com
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fanotify/fanotify_user.c | 30 +++++++++++++++++++++---------
 include/linux/fanotify.h           |  9 +++++----
 2 files changed, 26 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index a9eea037fee9..6781d46cd37c 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1187,6 +1187,19 @@ static int fanotify_group_init_error_pool(struct fsnotify_group *group)
 					 sizeof(struct fanotify_error_event));
 }
 
+static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
+					      unsigned int fan_flags)
+{
+	/*
+	 * Non evictable mark cannot be downgraded to evictable mark.
+	 */
+	if (fan_flags & FAN_MARK_EVICTABLE &&
+	    !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
+		return -EEXIST;
+
+	return 0;
+}
+
 static int fanotify_add_mark(struct fsnotify_group *group,
 			     fsnotify_connp_t *connp, unsigned int obj_type,
 			     __u32 mask, unsigned int fan_flags,
@@ -1208,13 +1221,11 @@ static int fanotify_add_mark(struct fsnotify_group *group,
 	}
 
 	/*
-	 * Non evictable mark cannot be downgraded to evictable mark.
+	 * Check if requested mark flags conflict with an existing mark flags.
 	 */
-	if (fan_flags & FAN_MARK_EVICTABLE &&
-	    !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) {
-		ret = -EEXIST;
+	ret = fanotify_may_update_existing_mark(fsn_mark, fan_flags);
+	if (ret)
 		goto out;
-	}
 
 	/*
 	 * Error events are pre-allocated per group, only if strictly
@@ -1557,6 +1568,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 	__kernel_fsid_t __fsid, *fsid = NULL;
 	u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
 	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
+	unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
 	bool ignore = flags & FAN_MARK_IGNORED_MASK;
 	unsigned int obj_type, fid_mode;
 	u32 umask = 0;
@@ -1586,7 +1598,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 		return -EINVAL;
 	}
 
-	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
+	switch (mark_cmd) {
 	case FAN_MARK_ADD:
 	case FAN_MARK_REMOVE:
 		if (!mask)
@@ -1675,7 +1687,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 	if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME))
 		goto fput_and_out;
 
-	if (flags & FAN_MARK_FLUSH) {
+	if (mark_cmd == FAN_MARK_FLUSH) {
 		ret = 0;
 		if (mark_type == FAN_MARK_MOUNT)
 			fsnotify_clear_vfsmount_marks_by_group(group);
@@ -1691,7 +1703,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 	if (ret)
 		goto fput_and_out;
 
-	if (flags & FAN_MARK_ADD) {
+	if (mark_cmd == FAN_MARK_ADD) {
 		ret = fanotify_events_supported(group, &path, mask, flags);
 		if (ret)
 			goto path_put_and_out;
@@ -1729,7 +1741,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 	}
 
 	/* create/update an inode mark */
-	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
+	switch (mark_cmd) {
 	case FAN_MARK_ADD:
 		if (mark_type == FAN_MARK_MOUNT)
 			ret = fanotify_add_vfsmount_mark(group, mnt, mask,
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index e517dbcf74ed..a7207f092fd1 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -59,15 +59,16 @@
 #define FANOTIFY_MARK_TYPE_BITS	(FAN_MARK_INODE | FAN_MARK_MOUNT | \
 				 FAN_MARK_FILESYSTEM)
 
+#define FANOTIFY_MARK_CMD_BITS	(FAN_MARK_ADD | FAN_MARK_REMOVE | \
+				 FAN_MARK_FLUSH)
+
 #define FANOTIFY_MARK_FLAGS	(FANOTIFY_MARK_TYPE_BITS | \
-				 FAN_MARK_ADD | \
-				 FAN_MARK_REMOVE | \
+				 FANOTIFY_MARK_CMD_BITS | \
 				 FAN_MARK_DONT_FOLLOW | \
 				 FAN_MARK_ONLYDIR | \
 				 FAN_MARK_IGNORED_MASK | \
 				 FAN_MARK_IGNORED_SURV_MODIFY | \
-				 FAN_MARK_EVICTABLE | \
-				 FAN_MARK_FLUSH)
+				 FAN_MARK_EVICTABLE)
 
 /*
  * Events that can be reported with data type FSNOTIFY_EVENT_PATH.
-- 
cgit 


From e252f2ed1c8c6c3884ab5dd34e003ed21f1fe6e0 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Wed, 29 Jun 2022 17:42:10 +0300
Subject: fanotify: introduce FAN_MARK_IGNORE

This flag is a new way to configure ignore mask which allows adding and
removing the event flags FAN_ONDIR and FAN_EVENT_ON_CHILD in ignore mask.

The legacy FAN_MARK_IGNORED_MASK flag would always ignore events on
directories and would ignore events on children depending on whether
the FAN_EVENT_ON_CHILD flag was set in the (non ignored) mask.

FAN_MARK_IGNORE can be used to ignore events on children without setting
FAN_EVENT_ON_CHILD in the mark's mask and will not ignore events on
directories unconditionally, only when FAN_ONDIR is set in ignore mask.

The new behavior is non-downgradable.  After calling fanotify_mark() with
FAN_MARK_IGNORE once, calling fanotify_mark() with FAN_MARK_IGNORED_MASK
on the same object will return EEXIST error.

Setting the event flags with FAN_MARK_IGNORE on a non-dir inode mark
has no meaning and will return ENOTDIR error.

The meaning of FAN_MARK_IGNORED_SURV_MODIFY is preserved with the new
FAN_MARK_IGNORE flag, but with a few semantic differences:

1. FAN_MARK_IGNORED_SURV_MODIFY is required for filesystem and mount
   marks and on an inode mark on a directory. Omitting this flag
   will return EINVAL or EISDIR error.

2. An ignore mask on a non-directory inode that survives modify could
   never be downgraded to an ignore mask that does not survive modify.
   With new FAN_MARK_IGNORE semantics we make that rule explicit -
   trying to update a surviving ignore mask without the flag
   FAN_MARK_IGNORED_SURV_MODIFY will return EEXIST error.

The conveniene macro FAN_MARK_IGNORE_SURV is added for
(FAN_MARK_IGNORE | FAN_MARK_IGNORED_SURV_MODIFY), because the
common case should use short constant names.

Link: https://lore.kernel.org/r/20220629144210.2983229-4-amir73il@gmail.com
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fanotify/fanotify.h      |  2 ++
 fs/notify/fanotify/fanotify_user.c | 63 ++++++++++++++++++++++++++++++++------
 include/linux/fanotify.h           |  5 ++-
 include/uapi/linux/fanotify.h      |  8 +++++
 4 files changed, 67 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 80e0ec95b113..1d9f11255c64 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -499,6 +499,8 @@ static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark)
 		mflags |= FAN_MARK_IGNORED_SURV_MODIFY;
 	if (mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)
 		mflags |= FAN_MARK_EVICTABLE;
+	if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS)
+		mflags |= FAN_MARK_IGNORE;
 
 	return mflags;
 }
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 6781d46cd37c..f0e49a406ffa 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1009,7 +1009,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
 	mask &= ~umask;
 	spin_lock(&fsn_mark->lock);
 	oldmask = fsnotify_calc_mask(fsn_mark);
-	if (!(flags & FAN_MARK_IGNORED_MASK)) {
+	if (!(flags & FANOTIFY_MARK_IGNORE_BITS)) {
 		fsn_mark->mask &= ~mask;
 	} else {
 		fsn_mark->ignore_mask &= ~mask;
@@ -1085,15 +1085,24 @@ static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark,
 				       unsigned int fan_flags)
 {
 	bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE);
+	unsigned int ignore = fan_flags & FANOTIFY_MARK_IGNORE_BITS;
 	bool recalc = false;
 
+	/*
+	 * When using FAN_MARK_IGNORE for the first time, mark starts using
+	 * independent event flags in ignore mask.  After that, trying to
+	 * update the ignore mask with the old FAN_MARK_IGNORED_MASK API
+	 * will result in EEXIST error.
+	 */
+	if (ignore == FAN_MARK_IGNORE)
+		fsn_mark->flags |= FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS;
+
 	/*
 	 * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to
 	 * the removal of the FS_MODIFY bit in calculated mask if it was set
 	 * because of an ignore mask that is now going to survive FS_MODIFY.
 	 */
-	if ((fan_flags & FAN_MARK_IGNORED_MASK) &&
-	    (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
+	if (ignore && (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
 	    !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) {
 		fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
 		if (!(fsn_mark->mask & FS_MODIFY))
@@ -1120,7 +1129,7 @@ static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
 	bool recalc;
 
 	spin_lock(&fsn_mark->lock);
-	if (!(fan_flags & FAN_MARK_IGNORED_MASK))
+	if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS))
 		fsn_mark->mask |= mask;
 	else
 		fsn_mark->ignore_mask |= mask;
@@ -1197,6 +1206,24 @@ static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
 	    !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
 		return -EEXIST;
 
+	/*
+	 * New ignore mask semantics cannot be downgraded to old semantics.
+	 */
+	if (fan_flags & FAN_MARK_IGNORED_MASK &&
+	    fsn_mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS)
+		return -EEXIST;
+
+	/*
+	 * An ignore mask that survives modify could never be downgraded to not
+	 * survive modify.  With new FAN_MARK_IGNORE semantics we make that rule
+	 * explicit and return an error when trying to update the ignore mask
+	 * without the original FAN_MARK_IGNORED_SURV_MODIFY value.
+	 */
+	if (fan_flags & FAN_MARK_IGNORE &&
+	    !(fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
+	    fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
+		return -EEXIST;
+
 	return 0;
 }
 
@@ -1231,7 +1258,8 @@ static int fanotify_add_mark(struct fsnotify_group *group,
 	 * Error events are pre-allocated per group, only if strictly
 	 * needed (i.e. FAN_FS_ERROR was requested).
 	 */
-	if (!(fan_flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) {
+	if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS) &&
+	    (mask & FAN_FS_ERROR)) {
 		ret = fanotify_group_init_error_pool(group);
 		if (ret)
 			goto out;
@@ -1275,7 +1303,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 	 * an ignore mask, unless that ignore mask is supposed to survive
 	 * modification changes anyway.
 	 */
-	if ((flags & FAN_MARK_IGNORED_MASK) &&
+	if ((flags & FANOTIFY_MARK_IGNORE_BITS) &&
 	    !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
 	    inode_is_open_for_write(inode))
 		return 0;
@@ -1531,7 +1559,8 @@ static int fanotify_events_supported(struct fsnotify_group *group,
 	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
 	/* Strict validation of events in non-dir inode mask with v5.17+ APIs */
 	bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) ||
-				 (mask & FAN_RENAME);
+				 (mask & FAN_RENAME) ||
+				 (flags & FAN_MARK_IGNORE);
 
 	/*
 	 * Some filesystems such as 'proc' acquire unusual locks when opening
@@ -1569,7 +1598,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 	u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
 	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
 	unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
-	bool ignore = flags & FAN_MARK_IGNORED_MASK;
+	unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS;
 	unsigned int obj_type, fid_mode;
 	u32 umask = 0;
 	int ret;
@@ -1618,12 +1647,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 	if (mask & ~valid_mask)
 		return -EINVAL;
 
+
+	/* We don't allow FAN_MARK_IGNORE & FAN_MARK_IGNORED_MASK together */
+	if (ignore == (FAN_MARK_IGNORE | FAN_MARK_IGNORED_MASK))
+		return -EINVAL;
+
 	/*
 	 * Event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) have no effect with
 	 * FAN_MARK_IGNORED_MASK.
 	 */
-	if (ignore)
+	if (ignore == FAN_MARK_IGNORED_MASK) {
 		mask &= ~FANOTIFY_EVENT_FLAGS;
+		umask = FANOTIFY_EVENT_FLAGS;
+	}
 
 	f = fdget(fanotify_fd);
 	if (unlikely(!f.file))
@@ -1727,6 +1763,13 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 	else
 		mnt = path.mnt;
 
+	ret = mnt ? -EINVAL : -EISDIR;
+	/* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */
+	if (mark_cmd == FAN_MARK_ADD && ignore == FAN_MARK_IGNORE &&
+	    (mnt || S_ISDIR(inode->i_mode)) &&
+	    !(flags & FAN_MARK_IGNORED_SURV_MODIFY))
+		goto path_put_and_out;
+
 	/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
 	if (mnt || !S_ISDIR(inode->i_mode)) {
 		mask &= ~FAN_EVENT_ON_CHILD;
@@ -1819,7 +1862,7 @@ static int __init fanotify_user_setup(void)
 
 	BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
 	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12);
-	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 10);
+	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11);
 
 	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
 					 SLAB_PANIC|SLAB_ACCOUNT);
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index a7207f092fd1..8ad743def6f3 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -62,11 +62,14 @@
 #define FANOTIFY_MARK_CMD_BITS	(FAN_MARK_ADD | FAN_MARK_REMOVE | \
 				 FAN_MARK_FLUSH)
 
+#define FANOTIFY_MARK_IGNORE_BITS (FAN_MARK_IGNORED_MASK | \
+				   FAN_MARK_IGNORE)
+
 #define FANOTIFY_MARK_FLAGS	(FANOTIFY_MARK_TYPE_BITS | \
 				 FANOTIFY_MARK_CMD_BITS | \
+				 FANOTIFY_MARK_IGNORE_BITS | \
 				 FAN_MARK_DONT_FOLLOW | \
 				 FAN_MARK_ONLYDIR | \
-				 FAN_MARK_IGNORED_MASK | \
 				 FAN_MARK_IGNORED_SURV_MODIFY | \
 				 FAN_MARK_EVICTABLE)
 
diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
index f1f89132d60e..d8536d77fea1 100644
--- a/include/uapi/linux/fanotify.h
+++ b/include/uapi/linux/fanotify.h
@@ -83,12 +83,20 @@
 #define FAN_MARK_FLUSH		0x00000080
 /* FAN_MARK_FILESYSTEM is	0x00000100 */
 #define FAN_MARK_EVICTABLE	0x00000200
+/* This bit is mutually exclusive with FAN_MARK_IGNORED_MASK bit */
+#define FAN_MARK_IGNORE		0x00000400
 
 /* These are NOT bitwise flags.  Both bits can be used togther.  */
 #define FAN_MARK_INODE		0x00000000
 #define FAN_MARK_MOUNT		0x00000010
 #define FAN_MARK_FILESYSTEM	0x00000100
 
+/*
+ * Convenience macro - FAN_MARK_IGNORE requires FAN_MARK_IGNORED_SURV_MODIFY
+ * for non-inode mark types.
+ */
+#define FAN_MARK_IGNORE_SURV	(FAN_MARK_IGNORE | FAN_MARK_IGNORED_SURV_MODIFY)
+
 /* Deprecated - do not use this in programs and do not add new flags here! */
 #define FAN_ALL_MARK_FLAGS	(FAN_MARK_ADD |\
 				 FAN_MARK_REMOVE |\
-- 
cgit 


From b69a2afd5afce9bf6d56e349d6ab592c916e20f2 Mon Sep 17 00:00:00 2001
From: Jonathan McDowell <noodles@fb.com>
Date: Thu, 30 Jun 2022 08:36:12 +0000
Subject: x86/kexec: Carry forward IMA measurement log on kexec

On kexec file load, the Integrity Measurement Architecture (IMA)
subsystem may verify the IMA signature of the kernel and initramfs, and
measure it. The command line parameters passed to the kernel in the
kexec call may also be measured by IMA.

A remote attestation service can verify a TPM quote based on the TPM
event log, the IMA measurement list and the TPM PCR data. This can
be achieved only if the IMA measurement log is carried over from the
current kernel to the next kernel across the kexec call.

PowerPC and ARM64 both achieve this using device tree with a
"linux,ima-kexec-buffer" node. x86 platforms generally don't make use of
device tree, so use the setup_data mechanism to pass the IMA buffer to
the new kernel.

Signed-off-by: Jonathan McDowell <noodles@fb.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Mimi Zohar <zohar@linux.ibm.com> # IMA function definitions
Link: https://lore.kernel.org/r/YmKyvlF3my1yWTvK@noodles-fedora-PC23Y6EG
---
 arch/x86/Kconfig                      |  1 +
 arch/x86/include/uapi/asm/bootparam.h |  9 +++++
 arch/x86/kernel/e820.c                |  6 ++--
 arch/x86/kernel/kexec-bzimage64.c     | 42 +++++++++++++++++++++--
 arch/x86/kernel/setup.c               | 63 +++++++++++++++++++++++++++++++++++
 drivers/of/kexec.c                    | 13 +++-----
 include/linux/ima.h                   |  5 +++
 include/linux/of.h                    |  2 --
 security/integrity/ima/ima_kexec.c    |  2 +-
 9 files changed, 127 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index be0b95e51df6..670e0edc074f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2033,6 +2033,7 @@ config KEXEC_FILE
 	bool "kexec file based system call"
 	select KEXEC_CORE
 	select BUILD_BIN2C
+	select HAVE_IMA_KEXEC if IMA
 	depends on X86_64
 	depends on CRYPTO=y
 	depends on CRYPTO_SHA256=y
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index bea5cdcdf532..ca0796ac4403 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -11,6 +11,7 @@
 #define SETUP_APPLE_PROPERTIES		5
 #define SETUP_JAILHOUSE			6
 #define SETUP_CC_BLOB			7
+#define SETUP_IMA			8
 
 #define SETUP_INDIRECT			(1<<31)
 
@@ -172,6 +173,14 @@ struct jailhouse_setup_data {
 	} __attribute__((packed)) v2;
 } __attribute__((packed));
 
+/*
+ * IMA buffer setup data information from the previous kernel during kexec
+ */
+struct ima_setup_data {
+	__u64 addr;
+	__u64 size;
+} __attribute__((packed));
+
 /* The so-called "zeropage" */
 struct boot_params {
 	struct screen_info screen_info;			/* 0x000 */
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index f267205f2d5a..9dac24680ff8 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1017,10 +1017,10 @@ void __init e820__reserve_setup_data(void)
 		e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
 
 		/*
-		 * SETUP_EFI is supplied by kexec and does not need to be
-		 * reserved.
+		 * SETUP_EFI and SETUP_IMA are supplied by kexec and do not need
+		 * to be reserved.
 		 */
-		if (data->type != SETUP_EFI)
+		if (data->type != SETUP_EFI && data->type != SETUP_IMA)
 			e820__range_update_kexec(pa_data,
 						 sizeof(*data) + data->len,
 						 E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 170d0fd68b1f..c63974e94272 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -186,11 +186,38 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr,
 }
 #endif /* CONFIG_EFI */
 
+static void
+setup_ima_state(const struct kimage *image, struct boot_params *params,
+		unsigned long params_load_addr,
+		unsigned int ima_setup_data_offset)
+{
+#ifdef CONFIG_IMA_KEXEC
+	struct setup_data *sd = (void *)params + ima_setup_data_offset;
+	unsigned long setup_data_phys;
+	struct ima_setup_data *ima;
+
+	if (!image->ima_buffer_size)
+		return;
+
+	sd->type = SETUP_IMA;
+	sd->len = sizeof(*ima);
+
+	ima = (void *)sd + sizeof(struct setup_data);
+	ima->addr = image->ima_buffer_addr;
+	ima->size = image->ima_buffer_size;
+
+	/* Add setup data */
+	setup_data_phys = params_load_addr + ima_setup_data_offset;
+	sd->next = params->hdr.setup_data;
+	params->hdr.setup_data = setup_data_phys;
+#endif /* CONFIG_IMA_KEXEC */
+}
+
 static int
 setup_boot_parameters(struct kimage *image, struct boot_params *params,
 		      unsigned long params_load_addr,
 		      unsigned int efi_map_offset, unsigned int efi_map_sz,
-		      unsigned int efi_setup_data_offset)
+		      unsigned int setup_data_offset)
 {
 	unsigned int nr_e820_entries;
 	unsigned long long mem_k, start, end;
@@ -245,8 +272,15 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
 #ifdef CONFIG_EFI
 	/* Setup EFI state */
 	setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz,
-			efi_setup_data_offset);
+			setup_data_offset);
+	setup_data_offset += sizeof(struct setup_data) +
+			sizeof(struct efi_setup_data);
 #endif
+
+	/* Setup IMA log buffer state */
+	setup_ima_state(image, params, params_load_addr,
+			setup_data_offset);
+
 	/* Setup EDD info */
 	memcpy(params->eddbuf, boot_params.eddbuf,
 				EDDMAXNR * sizeof(struct edd_info));
@@ -403,6 +437,10 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
 				sizeof(struct setup_data) +
 				sizeof(struct efi_setup_data);
 
+	if (IS_ENABLED(CONFIG_IMA_KEXEC))
+		kbuf.bufsz += sizeof(struct setup_data) +
+			      sizeof(struct ima_setup_data);
+
 	params = kzalloc(kbuf.bufsz, GFP_KERNEL);
 	if (!params)
 		return ERR_PTR(-ENOMEM);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index bd6c6fd373ae..53f863f28b4c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -11,6 +11,7 @@
 #include <linux/dma-map-ops.h>
 #include <linux/dmi.h>
 #include <linux/efi.h>
+#include <linux/ima.h>
 #include <linux/init_ohci1394_dma.h>
 #include <linux/initrd.h>
 #include <linux/iscsi_ibft.h>
@@ -140,6 +141,11 @@ __visible unsigned long mmu_cr4_features __ro_after_init;
 __visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE;
 #endif
 
+#ifdef CONFIG_IMA
+static phys_addr_t ima_kexec_buffer_phys;
+static size_t ima_kexec_buffer_size;
+#endif
+
 /* Boot loader ID and version as integers, for the benefit of proc_dointvec */
 int bootloader_type, bootloader_version;
 
@@ -330,6 +336,60 @@ static void __init reserve_initrd(void)
 }
 #endif /* CONFIG_BLK_DEV_INITRD */
 
+static void __init add_early_ima_buffer(u64 phys_addr)
+{
+#ifdef CONFIG_IMA
+	struct ima_setup_data *data;
+
+	data = early_memremap(phys_addr + sizeof(struct setup_data), sizeof(*data));
+	if (!data) {
+		pr_warn("setup: failed to memremap ima_setup_data entry\n");
+		return;
+	}
+
+	if (data->size) {
+		memblock_reserve(data->addr, data->size);
+		ima_kexec_buffer_phys = data->addr;
+		ima_kexec_buffer_size = data->size;
+	}
+
+	early_memunmap(data, sizeof(*data));
+#else
+	pr_warn("Passed IMA kexec data, but CONFIG_IMA not set. Ignoring.\n");
+#endif
+}
+
+#if defined(CONFIG_HAVE_IMA_KEXEC) && !defined(CONFIG_OF_FLATTREE)
+int __init ima_free_kexec_buffer(void)
+{
+	int rc;
+
+	if (!ima_kexec_buffer_size)
+		return -ENOENT;
+
+	rc = memblock_phys_free(ima_kexec_buffer_phys,
+				ima_kexec_buffer_size);
+	if (rc)
+		return rc;
+
+	ima_kexec_buffer_phys = 0;
+	ima_kexec_buffer_size = 0;
+
+	return 0;
+}
+
+int __init ima_get_kexec_buffer(void **addr, size_t *size)
+{
+	if (!ima_kexec_buffer_size)
+		return -ENOENT;
+
+	*addr = __va(ima_kexec_buffer_phys);
+	*size = ima_kexec_buffer_size;
+
+	return 0;
+}
+#endif
+
 static void __init parse_setup_data(void)
 {
 	struct setup_data *data;
@@ -355,6 +415,9 @@ static void __init parse_setup_data(void)
 		case SETUP_EFI:
 			parse_efi_setup(pa_data, data_len);
 			break;
+		case SETUP_IMA:
+			add_early_ima_buffer(pa_data);
+			break;
 		default:
 			break;
 		}
diff --git a/drivers/of/kexec.c b/drivers/of/kexec.c
index 8d374cc552be..f2e58ddfaed2 100644
--- a/drivers/of/kexec.c
+++ b/drivers/of/kexec.c
@@ -9,6 +9,7 @@
  *  Copyright (C) 2016  IBM Corporation
  */
 
+#include <linux/ima.h>
 #include <linux/kernel.h>
 #include <linux/kexec.h>
 #include <linux/memblock.h>
@@ -115,6 +116,7 @@ static int do_get_kexec_buffer(const void *prop, int len, unsigned long *addr,
 	return 0;
 }
 
+#ifdef CONFIG_HAVE_IMA_KEXEC
 /**
  * ima_get_kexec_buffer - get IMA buffer from the previous kernel
  * @addr:	On successful return, set to point to the buffer contents.
@@ -122,16 +124,13 @@ static int do_get_kexec_buffer(const void *prop, int len, unsigned long *addr,
  *
  * Return: 0 on success, negative errno on error.
  */
-int ima_get_kexec_buffer(void **addr, size_t *size)
+int __init ima_get_kexec_buffer(void **addr, size_t *size)
 {
 	int ret, len;
 	unsigned long tmp_addr;
 	size_t tmp_size;
 	const void *prop;
 
-	if (!IS_ENABLED(CONFIG_HAVE_IMA_KEXEC))
-		return -ENOTSUPP;
-
 	prop = of_get_property(of_chosen, "linux,ima-kexec-buffer", &len);
 	if (!prop)
 		return -ENOENT;
@@ -149,16 +148,13 @@ int ima_get_kexec_buffer(void **addr, size_t *size)
 /**
  * ima_free_kexec_buffer - free memory used by the IMA buffer
  */
-int ima_free_kexec_buffer(void)
+int __init ima_free_kexec_buffer(void)
 {
 	int ret;
 	unsigned long addr;
 	size_t size;
 	struct property *prop;
 
-	if (!IS_ENABLED(CONFIG_HAVE_IMA_KEXEC))
-		return -ENOTSUPP;
-
 	prop = of_find_property(of_chosen, "linux,ima-kexec-buffer", NULL);
 	if (!prop)
 		return -ENOENT;
@@ -173,6 +169,7 @@ int ima_free_kexec_buffer(void)
 
 	return memblock_phys_free(addr, size);
 }
+#endif
 
 /**
  * remove_ima_buffer - remove the IMA buffer property and reservation from @fdt
diff --git a/include/linux/ima.h b/include/linux/ima.h
index 426b1744215e..81708ca0ebc7 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -140,6 +140,11 @@ static inline int ima_measure_critical_data(const char *event_label,
 
 #endif /* CONFIG_IMA */
 
+#ifdef CONFIG_HAVE_IMA_KEXEC
+int __init ima_free_kexec_buffer(void);
+int __init ima_get_kexec_buffer(void **addr, size_t *size);
+#endif
+
 #ifdef CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT
 extern bool arch_ima_get_secureboot(void);
 extern const char * const *arch_get_ima_policy(void);
diff --git a/include/linux/of.h b/include/linux/of.h
index f0a5d6b10c5a..20a4e7cb7afe 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -441,8 +441,6 @@ void *of_kexec_alloc_and_setup_fdt(const struct kimage *image,
 				   unsigned long initrd_load_addr,
 				   unsigned long initrd_len,
 				   const char *cmdline, size_t extra_fdt_size);
-int ima_get_kexec_buffer(void **addr, size_t *size);
-int ima_free_kexec_buffer(void);
 #else /* CONFIG_OF */
 
 static inline void of_core_init(void)
diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c
index 13753136f03f..419dc405c831 100644
--- a/security/integrity/ima/ima_kexec.c
+++ b/security/integrity/ima/ima_kexec.c
@@ -137,7 +137,7 @@ void ima_add_kexec_buffer(struct kimage *image)
 /*
  * Restore the measurement list from the previous kernel.
  */
-void ima_load_kexec_buffer(void)
+void __init ima_load_kexec_buffer(void)
 {
 	void *kexec_buffer = NULL;
 	size_t kexec_buffer_size = 0;
-- 
cgit 


From e6bdbcc764af822ff7172a4a78d437dc433a0c74 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 13 Jun 2022 20:38:00 +0200
Subject: ACPI: bus: Drop unused list heads from struct acpi_device

Drop the children and node list heads that have no more users
from struct acpi_device and the code manipulating them from
__acpi_device_add() and acpi_device_del().

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/acpi/scan.c     | 11 +----------
 include/acpi/acpi_bus.h |  2 --
 2 files changed, 1 insertion(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 7b085826d881..b100e6ca9bb4 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -465,8 +465,6 @@ static void acpi_device_del(struct acpi_device *device)
 	struct acpi_device_bus_id *acpi_device_bus_id;
 
 	mutex_lock(&acpi_device_lock);
-	if (device->parent)
-		list_del(&device->node);
 
 	list_for_each_entry(acpi_device_bus_id, &acpi_bus_id_list, node)
 		if (!strcmp(acpi_device_bus_id->bus_id,
@@ -482,6 +480,7 @@ static void acpi_device_del(struct acpi_device *device)
 		}
 
 	list_del(&device->wakeup_list);
+
 	mutex_unlock(&acpi_device_lock);
 
 	acpi_power_add_remove_device(device, false);
@@ -674,8 +673,6 @@ static int __acpi_device_add(struct acpi_device *device,
 	 * -------
 	 * Link this device to its parent and siblings.
 	 */
-	INIT_LIST_HEAD(&device->children);
-	INIT_LIST_HEAD(&device->node);
 	INIT_LIST_HEAD(&device->wakeup_list);
 	INIT_LIST_HEAD(&device->physical_node_list);
 	INIT_LIST_HEAD(&device->del_list);
@@ -715,9 +712,6 @@ static int __acpi_device_add(struct acpi_device *device,
 		list_add_tail(&acpi_device_bus_id->node, &acpi_bus_id_list);
 	}
 
-	if (device->parent)
-		list_add_tail(&device->node, &device->parent->children);
-
 	if (device->wakeup.flags.valid)
 		list_add_tail(&device->wakeup_list, &acpi_wakeup_device_list);
 
@@ -746,9 +740,6 @@ static int __acpi_device_add(struct acpi_device *device,
 err:
 	mutex_lock(&acpi_device_lock);
 
-	if (device->parent)
-		list_del(&device->node);
-
 	list_del(&device->wakeup_list);
 
 err_unlock:
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index ab239a35cb2a..48f0fd499274 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -365,8 +365,6 @@ struct acpi_device {
 	acpi_handle handle;		/* no handle for fixed hardware */
 	struct fwnode_handle fwnode;
 	struct acpi_device *parent;
-	struct list_head children;
-	struct list_head node;
 	struct list_head wakeup_list;
 	struct list_head del_list;
 	struct acpi_device_status status;
-- 
cgit 


From 2852ca7fba9f77b204f0fe953b31fadd0057c936 Mon Sep 17 00:00:00 2001
From: David Gow <davidgow@google.com>
Date: Fri, 1 Jul 2022 16:47:41 +0800
Subject: panic: Taint kernel if tests are run

Most in-kernel tests (such as KUnit tests) are not supposed to run on
production systems: they may do deliberately illegal things to trigger
errors, and have security implications (for example, KUnit assertions
will often deliberately leak kernel addresses).

Add a new taint type, TAINT_TEST to signal that a test has been run.
This will be printed as 'N' (originally for kuNit, as every other
sensible letter was taken.)

This should discourage people from running these tests on production
systems, and to make it easier to tell if tests have been run
accidentally (by loading the wrong configuration, etc.)

Acked-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: David Gow <davidgow@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 Documentation/admin-guide/tainted-kernels.rst | 1 +
 include/linux/panic.h                         | 3 ++-
 kernel/panic.c                                | 1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/tainted-kernels.rst b/Documentation/admin-guide/tainted-kernels.rst
index ceeed7b0798d..7d80e8c307d1 100644
--- a/Documentation/admin-guide/tainted-kernels.rst
+++ b/Documentation/admin-guide/tainted-kernels.rst
@@ -100,6 +100,7 @@ Bit  Log  Number  Reason that got the kernel tainted
  15  _/K   32768  kernel has been live patched
  16  _/X   65536  auxiliary taint, defined for and used by distros
  17  _/T  131072  kernel was built with the struct randomization plugin
+ 18  _/N  262144  an in-kernel test has been run
 ===  ===  ======  ========================================================
 
 Note: The character ``_`` is representing a blank in this table to make reading
diff --git a/include/linux/panic.h b/include/linux/panic.h
index e71161da69c4..c7759b3f2045 100644
--- a/include/linux/panic.h
+++ b/include/linux/panic.h
@@ -68,7 +68,8 @@ static inline void set_arch_panic_timeout(int timeout, int arch_default_timeout)
 #define TAINT_LIVEPATCH			15
 #define TAINT_AUX			16
 #define TAINT_RANDSTRUCT		17
-#define TAINT_FLAGS_COUNT		18
+#define TAINT_TEST			18
+#define TAINT_FLAGS_COUNT		19
 #define TAINT_FLAGS_MAX			((1UL << TAINT_FLAGS_COUNT) - 1)
 
 struct taint_flag {
diff --git a/kernel/panic.c b/kernel/panic.c
index a3c758dba15a..6b3369e21026 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -428,6 +428,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
 	[ TAINT_LIVEPATCH ]		= { 'K', ' ', true },
 	[ TAINT_AUX ]			= { 'X', ' ', true },
 	[ TAINT_RANDSTRUCT ]		= { 'T', ' ', true },
+	[ TAINT_TEST ]			= { 'N', ' ', true },
 };
 
 /**
-- 
cgit 


From c272612cb4a2f7cde550d35f46cde159a2af0bab Mon Sep 17 00:00:00 2001
From: David Gow <davidgow@google.com>
Date: Fri, 1 Jul 2022 16:47:43 +0800
Subject: kunit: Taint the kernel when KUnit tests are run
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make KUnit trigger the new TAINT_TEST taint when any KUnit test is run.
Due to KUnit tests not being intended to run on production systems, and
potentially causing problems (or security issues like leaking kernel
addresses), the kernel's state should not be considered safe for
production use after KUnit tests are run.

This both marks KUnit modules as test modules using MODULE_INFO() and
manually taints the kernel when tests are run (which catches builtin
tests).

Acked-by: Luis Chamberlain <mcgrof@kernel.org>
Tested-by: Daniel Latypov <dlatypov@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: David Gow <davidgow@google.com>
Tested-by: Maíra Canal <mairacanal@riseup.net>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 include/kunit/test.h | 3 ++-
 lib/kunit/test.c     | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/kunit/test.h b/include/kunit/test.h
index 8ffcd7de9607..ccae848720dc 100644
--- a/include/kunit/test.h
+++ b/include/kunit/test.h
@@ -277,7 +277,8 @@ static inline int kunit_run_all_tests(void)
 	{								\
 		return __kunit_test_suites_exit(__suites);		\
 	}								\
-	module_exit(kunit_test_suites_exit)
+	module_exit(kunit_test_suites_exit)				\
+	MODULE_INFO(test, "Y");
 #else
 #define kunit_test_suites_for_module(__suites)
 #endif /* MODULE */
diff --git a/lib/kunit/test.c b/lib/kunit/test.c
index a5053a07409f..8b11552dc215 100644
--- a/lib/kunit/test.c
+++ b/lib/kunit/test.c
@@ -11,6 +11,7 @@
 #include <kunit/test-bug.h>
 #include <linux/kernel.h>
 #include <linux/moduleparam.h>
+#include <linux/panic.h>
 #include <linux/sched/debug.h>
 #include <linux/sched.h>
 
@@ -501,6 +502,9 @@ int kunit_run_tests(struct kunit_suite *suite)
 	struct kunit_result_stats suite_stats = { 0 };
 	struct kunit_result_stats total_stats = { 0 };
 
+	/* Taint the kernel so we know we've run tests. */
+	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
+
 	if (suite->suite_init) {
 		suite->suite_init_err = suite->suite_init(suite);
 		if (suite->suite_init_err) {
-- 
cgit 


From eb003bf3ba221bb3d21d1fdcddaa36c158fd2d8f Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Fri, 24 Jun 2022 20:36:39 +0200
Subject: platform/surface: aggregator: Add helper macros for requests with
 argument and return value

Add helper macros for synchronous stack-allocated Surface Aggregator
request with both argument and return value, similar to the current
argument-only and return-value-only ones.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Link: https://lore.kernel.org/r/20220624183642.910893-2-luzmaximilian@gmail.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/surface_aggregator/controller.h | 125 ++++++++++++++++++++++++++
 include/linux/surface_aggregator/device.h     |  36 ++++++++
 2 files changed, 161 insertions(+)

(limited to 'include')

diff --git a/include/linux/surface_aggregator/controller.h b/include/linux/surface_aggregator/controller.h
index 50a2b4926c06..d11a1c6e3186 100644
--- a/include/linux/surface_aggregator/controller.h
+++ b/include/linux/surface_aggregator/controller.h
@@ -469,6 +469,67 @@ struct ssam_request_spec_md {
 		return 0;							\
 	}
 
+/**
+ * SSAM_DEFINE_SYNC_REQUEST_WR() - Define synchronous SAM request function with
+ * both argument and return value.
+ * @name:  Name of the generated function.
+ * @atype: Type of the request's argument.
+ * @rtype: Type of the request's return value.
+ * @spec:  Specification (&struct ssam_request_spec) defining the request.
+ *
+ * Defines a function executing the synchronous SAM request specified by @spec,
+ * with the request taking an argument of type @atype and having a return value
+ * of type @rtype. The generated function takes care of setting up the request
+ * and response structs, buffer allocation, as well as execution of the request
+ * itself, returning once the request has been fully completed. The required
+ * transport buffer will be allocated on the stack.
+ *
+ * The generated function is defined as ``static int name(struct
+ * ssam_controller *ctrl, const atype *arg, rtype *ret)``, returning the status
+ * of the request, which is zero on success and negative on failure. The
+ * ``ctrl`` parameter is the controller via which the request is sent. The
+ * request argument is specified via the ``arg`` pointer. The request's return
+ * value is written to the memory pointed to by the ``ret`` parameter.
+ *
+ * Refer to ssam_request_sync_onstack() for more details on the behavior of
+ * the generated function.
+ */
+#define SSAM_DEFINE_SYNC_REQUEST_WR(name, atype, rtype, spec...)		\
+	static int name(struct ssam_controller *ctrl, const atype *arg, rtype *ret) \
+	{									\
+		struct ssam_request_spec s = (struct ssam_request_spec)spec;	\
+		struct ssam_request rqst;					\
+		struct ssam_response rsp;					\
+		int status;							\
+										\
+		rqst.target_category = s.target_category;			\
+		rqst.target_id = s.target_id;					\
+		rqst.command_id = s.command_id;					\
+		rqst.instance_id = s.instance_id;				\
+		rqst.flags = s.flags | SSAM_REQUEST_HAS_RESPONSE;		\
+		rqst.length = sizeof(atype);					\
+		rqst.payload = (u8 *)arg;					\
+										\
+		rsp.capacity = sizeof(rtype);					\
+		rsp.length = 0;							\
+		rsp.pointer = (u8 *)ret;					\
+										\
+		status = ssam_request_sync_onstack(ctrl, &rqst, &rsp, sizeof(atype)); \
+		if (status)							\
+			return status;						\
+										\
+		if (rsp.length != sizeof(rtype)) {				\
+			struct device *dev = ssam_controller_device(ctrl);	\
+			dev_err(dev,						\
+				"rqst: invalid response length, expected %zu, got %zu (tc: %#04x, cid: %#04x)", \
+				sizeof(rtype), rsp.length, rqst.target_category,\
+				rqst.command_id);				\
+			return -EIO;						\
+		}								\
+										\
+		return 0;							\
+	}
+
 /**
  * SSAM_DEFINE_SYNC_REQUEST_MD_N() - Define synchronous multi-device SAM
  * request function with neither argument nor return value.
@@ -613,6 +674,70 @@ struct ssam_request_spec_md {
 		return 0;							\
 	}
 
+/**
+ * SSAM_DEFINE_SYNC_REQUEST_MD_WR() - Define synchronous multi-device SAM
+ * request function with both argument and return value.
+ * @name:  Name of the generated function.
+ * @atype: Type of the request's argument.
+ * @rtype: Type of the request's return value.
+ * @spec:  Specification (&struct ssam_request_spec_md) defining the request.
+ *
+ * Defines a function executing the synchronous SAM request specified by @spec,
+ * with the request taking an argument of type @atype and having a return value
+ * of type @rtype. Device specifying parameters are not hard-coded, but instead
+ * must be provided to the function. The generated function takes care of
+ * setting up the request and response structs, buffer allocation, as well as
+ * execution of the request itself, returning once the request has been fully
+ * completed. The required transport buffer will be allocated on the stack.
+ *
+ * The generated function is defined as ``static int name(struct
+ * ssam_controller *ctrl, u8 tid, u8 iid, const atype *arg, rtype *ret)``,
+ * returning the status of the request, which is zero on success and negative
+ * on failure. The ``ctrl`` parameter is the controller via which the request
+ * is sent, ``tid`` the target ID for the request, and ``iid`` the instance ID.
+ * The request argument is specified via the ``arg`` pointer. The request's
+ * return value is written to the memory pointed to by the ``ret`` parameter.
+ *
+ * Refer to ssam_request_sync_onstack() for more details on the behavior of
+ * the generated function.
+ */
+#define SSAM_DEFINE_SYNC_REQUEST_MD_WR(name, atype, rtype, spec...)		\
+	static int name(struct ssam_controller *ctrl, u8 tid, u8 iid,		\
+			const atype *arg, rtype *ret)				\
+	{									\
+		struct ssam_request_spec_md s = (struct ssam_request_spec_md)spec; \
+		struct ssam_request rqst;					\
+		struct ssam_response rsp;					\
+		int status;							\
+										\
+		rqst.target_category = s.target_category;			\
+		rqst.target_id = tid;						\
+		rqst.command_id = s.command_id;					\
+		rqst.instance_id = iid;						\
+		rqst.flags = s.flags | SSAM_REQUEST_HAS_RESPONSE;		\
+		rqst.length = sizeof(atype);					\
+		rqst.payload = (u8 *)arg;					\
+										\
+		rsp.capacity = sizeof(rtype);					\
+		rsp.length = 0;							\
+		rsp.pointer = (u8 *)ret;					\
+										\
+		status = ssam_request_sync_onstack(ctrl, &rqst, &rsp, sizeof(atype)); \
+		if (status)							\
+			return status;						\
+										\
+		if (rsp.length != sizeof(rtype)) {				\
+			struct device *dev = ssam_controller_device(ctrl);	\
+			dev_err(dev,						\
+				"rqst: invalid response length, expected %zu, got %zu (tc: %#04x, cid: %#04x)", \
+				sizeof(rtype), rsp.length, rqst.target_category,\
+				rqst.command_id);				\
+			return -EIO;						\
+		}								\
+										\
+		return 0;							\
+	}
+
 
 /* -- Event notifier/callbacks. --------------------------------------------- */
 
diff --git a/include/linux/surface_aggregator/device.h b/include/linux/surface_aggregator/device.h
index c418f7f2732d..6cf7e80312d5 100644
--- a/include/linux/surface_aggregator/device.h
+++ b/include/linux/surface_aggregator/device.h
@@ -483,6 +483,42 @@ static inline void ssam_remove_clients(struct device *dev) {}
 				    sdev->uid.instance, ret);		\
 	}
 
+/**
+ * SSAM_DEFINE_SYNC_REQUEST_CL_WR() - Define synchronous client-device SAM
+ * request function with argument and return value.
+ * @name:  Name of the generated function.
+ * @atype: Type of the request's argument.
+ * @rtype: Type of the request's return value.
+ * @spec:  Specification (&struct ssam_request_spec_md) defining the request.
+ *
+ * Defines a function executing the synchronous SAM request specified by @spec,
+ * with the request taking an argument of type @atype and having a return value
+ * of type @rtype. Device specifying parameters are not hard-coded, but instead
+ * are provided via the client device, specifically its UID, supplied when
+ * calling this function. The generated function takes care of setting up the
+ * request struct, buffer allocation, as well as execution of the request
+ * itself, returning once the request has been fully completed. The required
+ * transport buffer will be allocated on the stack.
+ *
+ * The generated function is defined as ``static int name(struct ssam_device
+ * *sdev, const atype *arg, rtype *ret)``, returning the status of the request,
+ * which is zero on success and negative on failure. The ``sdev`` parameter
+ * specifies both the target device of the request and by association the
+ * controller via which the request is sent. The request's argument is
+ * specified via the ``arg`` pointer. The request's return value is written to
+ * the memory pointed to by the ``ret`` parameter.
+ *
+ * Refer to ssam_request_sync_onstack() for more details on the behavior of
+ * the generated function.
+ */
+#define SSAM_DEFINE_SYNC_REQUEST_CL_WR(name, atype, rtype, spec...)		\
+	SSAM_DEFINE_SYNC_REQUEST_MD_WR(__raw_##name, atype, rtype, spec)	\
+	static int name(struct ssam_device *sdev, const atype *arg, rtype *ret)	\
+	{									\
+		return __raw_##name(sdev->ctrl, sdev->uid.target,		\
+				    sdev->uid.instance, arg, ret);		\
+	}
+
 
 /* -- Helpers for client-device notifiers. ---------------------------------- */
 
-- 
cgit 


From 4a4ab610b8ae912c28a4fd28442ef24ed7a1a5bd Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Fri, 24 Jun 2022 22:57:58 +0200
Subject: platform/surface: aggregator: Move device registry helper functions
 to core module

Move helper functions for client device registration to the core module.
This simplifies addition of future DT/OF support and also allows us to
split out the device hub drivers into their own module.

At the same time, also improve device node validation a bit by not
silently skipping devices with invalid device UID specifiers. Further,
ensure proper lifetime management for the firmware/software nodes
associated with the added devices.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Link: https://lore.kernel.org/r/20220624205800.1355621-2-luzmaximilian@gmail.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/platform/surface/aggregator/bus.c          | 149 ++++++++++++++++++---
 .../platform/surface/surface_aggregator_registry.c |  75 +----------
 include/linux/surface_aggregator/device.h          |  52 +++++++
 3 files changed, 187 insertions(+), 89 deletions(-)

(limited to 'include')

diff --git a/drivers/platform/surface/aggregator/bus.c b/drivers/platform/surface/aggregator/bus.c
index abbbb5b08b07..e0b0381a2834 100644
--- a/drivers/platform/surface/aggregator/bus.c
+++ b/drivers/platform/surface/aggregator/bus.c
@@ -6,6 +6,7 @@
  */
 
 #include <linux/device.h>
+#include <linux/property.h>
 #include <linux/slab.h>
 
 #include <linux/surface_aggregator/controller.h>
@@ -14,6 +15,9 @@
 #include "bus.h"
 #include "controller.h"
 
+
+/* -- Device and bus functions. --------------------------------------------- */
+
 static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
 			     char *buf)
 {
@@ -46,6 +50,7 @@ static void ssam_device_release(struct device *dev)
 	struct ssam_device *sdev = to_ssam_device(dev);
 
 	ssam_controller_put(sdev->ctrl);
+	fwnode_handle_put(sdev->dev.fwnode);
 	kfree(sdev);
 }
 
@@ -363,6 +368,134 @@ void ssam_device_driver_unregister(struct ssam_device_driver *sdrv)
 }
 EXPORT_SYMBOL_GPL(ssam_device_driver_unregister);
 
+
+/* -- Bus registration. ----------------------------------------------------- */
+
+/**
+ * ssam_bus_register() - Register and set-up the SSAM client device bus.
+ */
+int ssam_bus_register(void)
+{
+	return bus_register(&ssam_bus_type);
+}
+
+/**
+ * ssam_bus_unregister() - Unregister the SSAM client device bus.
+ */
+void ssam_bus_unregister(void)
+{
+	return bus_unregister(&ssam_bus_type);
+}
+
+
+/* -- Helpers for controller and hub devices. ------------------------------- */
+
+static int ssam_device_uid_from_string(const char *str, struct ssam_device_uid *uid)
+{
+	u8 d, tc, tid, iid, fn;
+	int n;
+
+	n = sscanf(str, "%hhx:%hhx:%hhx:%hhx:%hhx", &d, &tc, &tid, &iid, &fn);
+	if (n != 5)
+		return -EINVAL;
+
+	uid->domain = d;
+	uid->category = tc;
+	uid->target = tid;
+	uid->instance = iid;
+	uid->function = fn;
+
+	return 0;
+}
+
+static int ssam_get_uid_for_node(struct fwnode_handle *node, struct ssam_device_uid *uid)
+{
+	const char *str = fwnode_get_name(node);
+
+	/*
+	 * To simplify definitions of firmware nodes, we set the device name
+	 * based on the UID of the device, prefixed with "ssam:".
+	 */
+	if (strncmp(str, "ssam:", strlen("ssam:")) != 0)
+		return -ENODEV;
+
+	str += strlen("ssam:");
+	return ssam_device_uid_from_string(str, uid);
+}
+
+static int ssam_add_client_device(struct device *parent, struct ssam_controller *ctrl,
+				  struct fwnode_handle *node)
+{
+	struct ssam_device_uid uid;
+	struct ssam_device *sdev;
+	int status;
+
+	status = ssam_get_uid_for_node(node, &uid);
+	if (status)
+		return status;
+
+	sdev = ssam_device_alloc(ctrl, uid);
+	if (!sdev)
+		return -ENOMEM;
+
+	sdev->dev.parent = parent;
+	sdev->dev.fwnode = fwnode_handle_get(node);
+
+	status = ssam_device_add(sdev);
+	if (status)
+		ssam_device_put(sdev);
+
+	return status;
+}
+
+/**
+ * __ssam_register_clients() - Register client devices defined under the
+ * given firmware node as children of the given device.
+ * @parent: The parent device under which clients should be registered.
+ * @ctrl: The controller with which client should be registered.
+ * @node: The firmware node holding definitions of the devices to be added.
+ *
+ * Register all clients that have been defined as children of the given root
+ * firmware node as children of the given parent device. The respective child
+ * firmware nodes will be associated with the correspondingly created child
+ * devices.
+ *
+ * The given controller will be used to instantiate the new devices. See
+ * ssam_device_add() for details.
+ *
+ * Note that, generally, the use of either ssam_device_register_clients() or
+ * ssam_register_clients() should be preferred as they directly use the
+ * firmware node and/or controller associated with the given device. This
+ * function is only intended for use when different device specifications (e.g.
+ * ACPI and firmware nodes) need to be combined (as is done in the platform hub
+ * of the device registry).
+ *
+ * Return: Returns zero on success, nonzero on failure.
+ */
+int __ssam_register_clients(struct device *parent, struct ssam_controller *ctrl,
+			    struct fwnode_handle *node)
+{
+	struct fwnode_handle *child;
+	int status;
+
+	fwnode_for_each_child_node(node, child) {
+		/*
+		 * Try to add the device specified in the firmware node. If
+		 * this fails with -ENODEV, the node does not specify any SSAM
+		 * device, so ignore it and continue with the next one.
+		 */
+		status = ssam_add_client_device(parent, ctrl, child);
+		if (status && status != -ENODEV)
+			goto err;
+	}
+
+	return 0;
+err:
+	ssam_remove_clients(parent);
+	return status;
+}
+EXPORT_SYMBOL_GPL(__ssam_register_clients);
+
 static int ssam_remove_device(struct device *dev, void *_data)
 {
 	struct ssam_device *sdev = to_ssam_device(dev);
@@ -387,19 +520,3 @@ void ssam_remove_clients(struct device *dev)
 	device_for_each_child_reverse(dev, NULL, ssam_remove_device);
 }
 EXPORT_SYMBOL_GPL(ssam_remove_clients);
-
-/**
- * ssam_bus_register() - Register and set-up the SSAM client device bus.
- */
-int ssam_bus_register(void)
-{
-	return bus_register(&ssam_bus_type);
-}
-
-/**
- * ssam_bus_unregister() - Unregister the SSAM client device bus.
- */
-void ssam_bus_unregister(void)
-{
-	return bus_unregister(&ssam_bus_type);
-}
diff --git a/drivers/platform/surface/surface_aggregator_registry.c b/drivers/platform/surface/surface_aggregator_registry.c
index f1c5905f1c16..c680792a037e 100644
--- a/drivers/platform/surface/surface_aggregator_registry.c
+++ b/drivers/platform/surface/surface_aggregator_registry.c
@@ -286,76 +286,6 @@ static const struct software_node *ssam_node_group_sp8[] = {
 };
 
 
-/* -- Device registry helper functions. ------------------------------------- */
-
-static int ssam_uid_from_string(const char *str, struct ssam_device_uid *uid)
-{
-	u8 d, tc, tid, iid, fn;
-	int n;
-
-	n = sscanf(str, "ssam:%hhx:%hhx:%hhx:%hhx:%hhx", &d, &tc, &tid, &iid, &fn);
-	if (n != 5)
-		return -EINVAL;
-
-	uid->domain = d;
-	uid->category = tc;
-	uid->target = tid;
-	uid->instance = iid;
-	uid->function = fn;
-
-	return 0;
-}
-
-static int ssam_hub_add_device(struct device *parent, struct ssam_controller *ctrl,
-			       struct fwnode_handle *node)
-{
-	struct ssam_device_uid uid;
-	struct ssam_device *sdev;
-	int status;
-
-	status = ssam_uid_from_string(fwnode_get_name(node), &uid);
-	if (status)
-		return status;
-
-	sdev = ssam_device_alloc(ctrl, uid);
-	if (!sdev)
-		return -ENOMEM;
-
-	sdev->dev.parent = parent;
-	sdev->dev.fwnode = node;
-
-	status = ssam_device_add(sdev);
-	if (status)
-		ssam_device_put(sdev);
-
-	return status;
-}
-
-static int ssam_hub_register_clients(struct device *parent, struct ssam_controller *ctrl,
-				     struct fwnode_handle *node)
-{
-	struct fwnode_handle *child;
-	int status;
-
-	fwnode_for_each_child_node(node, child) {
-		/*
-		 * Try to add the device specified in the firmware node. If
-		 * this fails with -EINVAL, the node does not specify any SSAM
-		 * device, so ignore it and continue with the next one.
-		 */
-
-		status = ssam_hub_add_device(parent, ctrl, child);
-		if (status && status != -EINVAL)
-			goto err;
-	}
-
-	return 0;
-err:
-	ssam_remove_clients(parent);
-	return status;
-}
-
-
 /* -- SSAM generic subsystem hub driver framework. -------------------------- */
 
 enum ssam_hub_state {
@@ -385,7 +315,6 @@ struct ssam_hub {
 static void ssam_hub_update_workfn(struct work_struct *work)
 {
 	struct ssam_hub *hub = container_of(work, struct ssam_hub, update_work.work);
-	struct fwnode_handle *node = dev_fwnode(&hub->sdev->dev);
 	enum ssam_hub_state state;
 	int status = 0;
 
@@ -425,7 +354,7 @@ static void ssam_hub_update_workfn(struct work_struct *work)
 	hub->state = state;
 
 	if (hub->state == SSAM_HUB_CONNECTED)
-		status = ssam_hub_register_clients(&hub->sdev->dev, hub->sdev->ctrl, node);
+		status = ssam_device_register_clients(hub->sdev);
 	else
 		ssam_remove_clients(&hub->sdev->dev);
 
@@ -769,7 +698,7 @@ static int ssam_platform_hub_probe(struct platform_device *pdev)
 
 	set_secondary_fwnode(&pdev->dev, root);
 
-	status = ssam_hub_register_clients(&pdev->dev, ctrl, root);
+	status = __ssam_register_clients(&pdev->dev, ctrl, root);
 	if (status) {
 		set_secondary_fwnode(&pdev->dev, NULL);
 		software_node_unregister_node_group(nodes);
diff --git a/include/linux/surface_aggregator/device.h b/include/linux/surface_aggregator/device.h
index 6cf7e80312d5..46c45d1b6368 100644
--- a/include/linux/surface_aggregator/device.h
+++ b/include/linux/surface_aggregator/device.h
@@ -15,6 +15,7 @@
 
 #include <linux/device.h>
 #include <linux/mod_devicetable.h>
+#include <linux/property.h>
 #include <linux/types.h>
 
 #include <linux/surface_aggregator/controller.h>
@@ -375,11 +376,62 @@ void ssam_device_driver_unregister(struct ssam_device_driver *d);
 /* -- Helpers for controller and hub devices. ------------------------------- */
 
 #ifdef CONFIG_SURFACE_AGGREGATOR_BUS
+
+int __ssam_register_clients(struct device *parent, struct ssam_controller *ctrl,
+			    struct fwnode_handle *node);
 void ssam_remove_clients(struct device *dev);
+
 #else /* CONFIG_SURFACE_AGGREGATOR_BUS */
+
+static inline int __ssam_register_clients(struct device *parent, struct ssam_controller *ctrl,
+					  struct fwnode_handle *node)
+{
+	return 0;
+}
+
 static inline void ssam_remove_clients(struct device *dev) {}
+
 #endif /* CONFIG_SURFACE_AGGREGATOR_BUS */
 
+/**
+ * ssam_register_clients() - Register all client devices defined under the
+ * given parent device.
+ * @dev: The parent device under which clients should be registered.
+ * @ctrl: The controller with which client should be registered.
+ *
+ * Register all clients that have via firmware nodes been defined as children
+ * of the given (parent) device. The respective child firmware nodes will be
+ * associated with the correspondingly created child devices.
+ *
+ * The given controller will be used to instantiate the new devices. See
+ * ssam_device_add() for details.
+ *
+ * Return: Returns zero on success, nonzero on failure.
+ */
+static inline int ssam_register_clients(struct device *dev, struct ssam_controller *ctrl)
+{
+	return __ssam_register_clients(dev, ctrl, dev_fwnode(dev));
+}
+
+/**
+ * ssam_device_register_clients() - Register all client devices defined under
+ * the given SSAM parent device.
+ * @sdev: The parent device under which clients should be registered.
+ *
+ * Register all clients that have via firmware nodes been defined as children
+ * of the given (parent) device. The respective child firmware nodes will be
+ * associated with the correspondingly created child devices.
+ *
+ * The controller used by the parent device will be used to instantiate the new
+ * devices. See ssam_device_add() for details.
+ *
+ * Return: Returns zero on success, nonzero on failure.
+ */
+static inline int ssam_device_register_clients(struct ssam_device *sdev)
+{
+	return ssam_register_clients(&sdev->dev, sdev->ctrl);
+}
+
 
 /* -- Helpers for client-device requests. ----------------------------------- */
 
-- 
cgit 


From b48dbb998d70b7f48c2ec0a15c3cf47136808e4e Mon Sep 17 00:00:00 2001
From: Dominique Martinet <asmadeus@codewreck.org>
Date: Sun, 12 Jun 2022 13:42:32 +0900
Subject: 9p fid refcount: add p9_fid_get/put wrappers

I was recently reminded that it is not clear that p9_client_clunk()
was actually just decrementing refcount and clunking only when that
reaches zero: make it clear through a set of helpers.

This will also allow instrumenting refcounting better for debugging
next patch

Link: https://lkml.kernel.org/r/20220612085330.1451496-5-asmadeus@codewreck.org
Reviewed-by: Tyler Hicks <tyhicks@linux.microsoft.com>
Reviewed-by: Christian Schoenebeck <linux_oss@crudebyte.com>
Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
---
 fs/9p/fid.c             | 18 +++++++++---------
 fs/9p/fid.h             |  2 +-
 fs/9p/vfs_addr.c        |  4 ++--
 fs/9p/vfs_dentry.c      |  4 ++--
 fs/9p/vfs_dir.c         |  2 +-
 fs/9p/vfs_file.c        |  4 ++--
 fs/9p/vfs_inode.c       | 48 ++++++++++++++++++++++++------------------------
 fs/9p/vfs_inode_dotl.c  | 42 +++++++++++++++++++++---------------------
 fs/9p/vfs_super.c       |  6 +++---
 fs/9p/xattr.c           |  8 ++++----
 include/net/9p/client.h | 28 ++++++++++++++++++++++++++++
 net/9p/client.c         | 15 +++------------
 12 files changed, 100 insertions(+), 81 deletions(-)

(limited to 'include')

diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index e8fad28fc5bd..d792499349c4 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -56,7 +56,7 @@ static struct p9_fid *v9fs_fid_find_inode(struct inode *inode, kuid_t uid)
 	h = (struct hlist_head *)&inode->i_private;
 	hlist_for_each_entry(fid, h, ilist) {
 		if (uid_eq(fid->uid, uid)) {
-			refcount_inc(&fid->count);
+			p9_fid_get(fid);
 			ret = fid;
 			break;
 		}
@@ -104,7 +104,7 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
 		hlist_for_each_entry(fid, h, dlist) {
 			if (any || uid_eq(fid->uid, uid)) {
 				ret = fid;
-				refcount_inc(&ret->count);
+				p9_fid_get(ret);
 				break;
 			}
 		}
@@ -172,7 +172,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
 		old_fid = fid;
 
 		fid = p9_client_walk(old_fid, 1, &dentry->d_name.name, 1);
-		p9_client_clunk(old_fid);
+		p9_fid_put(old_fid);
 		goto fid_out;
 	}
 	up_read(&v9ses->rename_sem);
@@ -194,7 +194,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
 		if (IS_ERR(root_fid))
 			return root_fid;
 
-		refcount_inc(&root_fid->count);
+		p9_fid_get(root_fid);
 		v9fs_fid_add(dentry->d_sb->s_root, root_fid);
 	}
 	/* If we are root ourself just return that */
@@ -225,7 +225,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
 				     old_fid == root_fid /* clone */);
 		/* non-cloning walk will return the same fid */
 		if (fid != old_fid) {
-			p9_client_clunk(old_fid);
+			p9_fid_put(old_fid);
 			old_fid = fid;
 		}
 		if (IS_ERR(fid)) {
@@ -240,11 +240,11 @@ fid_out:
 		spin_lock(&dentry->d_lock);
 		if (d_unhashed(dentry)) {
 			spin_unlock(&dentry->d_lock);
-			p9_client_clunk(fid);
+			p9_fid_put(fid);
 			fid = ERR_PTR(-ENOENT);
 		} else {
 			__add_fid(dentry, fid);
-			refcount_inc(&fid->count);
+			p9_fid_get(fid);
 			spin_unlock(&dentry->d_lock);
 		}
 	}
@@ -301,7 +301,7 @@ struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
 	fid = clone_fid(ofid);
 	if (IS_ERR(fid))
 		goto error_out;
-	p9_client_clunk(ofid);
+	p9_fid_put(ofid);
 	/*
 	 * writeback fid will only be used to write back the
 	 * dirty pages. We always request for the open fid in read-write
@@ -310,7 +310,7 @@ struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
 	 */
 	err = p9_client_open(fid, O_RDWR);
 	if (err < 0) {
-		p9_client_clunk(fid);
+		p9_fid_put(fid);
 		fid = ERR_PTR(err);
 		goto error_out;
 	}
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index f7f33509e169..3168dfad510e 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -29,7 +29,7 @@ static inline struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
 		return fid;
 
 	nfid = clone_fid(fid);
-	p9_client_clunk(fid);
+	p9_fid_put(fid);
 	return nfid;
 }
 #endif
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index d0833fa69faf..47b9a1122f34 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -73,7 +73,7 @@ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
 		BUG_ON(!fid);
 	}
 
-	refcount_inc(&fid->count);
+	p9_fid_get(fid);
 	rreq->netfs_priv = fid;
 	return 0;
 }
@@ -86,7 +86,7 @@ static void v9fs_free_request(struct netfs_io_request *rreq)
 {
 	struct p9_fid *fid = rreq->netfs_priv;
 
-	p9_client_clunk(fid);
+	p9_fid_put(fid);
 }
 
 /**
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 1c609e99d280..f89f01734587 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -54,7 +54,7 @@ static void v9fs_dentry_release(struct dentry *dentry)
 	p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n",
 		 dentry, dentry);
 	hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata)
-		p9_client_clunk(hlist_entry(p, struct p9_fid, dlist));
+		p9_fid_put(hlist_entry(p, struct p9_fid, dlist));
 	dentry->d_fsdata = NULL;
 }
 
@@ -85,7 +85,7 @@ static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 			retval = v9fs_refresh_inode_dotl(fid, inode);
 		else
 			retval = v9fs_refresh_inode(fid, inode);
-		p9_client_clunk(fid);
+		p9_fid_put(fid);
 
 		if (retval == -ENOENT)
 			return 0;
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 958680f7f23e..000fbaae9b18 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -218,7 +218,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
 		spin_lock(&inode->i_lock);
 		hlist_del(&fid->ilist);
 		spin_unlock(&inode->i_lock);
-		p9_client_clunk(fid);
+		p9_fid_put(fid);
 	}
 
 	if ((filp->f_mode & FMODE_WRITE)) {
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 2573c08f335c..8276f3af35d7 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -63,7 +63,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 
 		err = p9_client_open(fid, omode);
 		if (err < 0) {
-			p9_client_clunk(fid);
+			p9_fid_put(fid);
 			return err;
 		}
 		if ((file->f_flags & O_APPEND) &&
@@ -98,7 +98,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 	v9fs_open_fid_add(inode, fid);
 	return 0;
 out_error:
-	p9_client_clunk(file->private_data);
+	p9_fid_put(file->private_data);
 	file->private_data = NULL;
 	return err;
 }
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 3d8297714772..bde18776f0c7 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -400,7 +400,7 @@ void v9fs_evict_inode(struct inode *inode)
 	fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false);
 	/* clunk the fid stashed in writeback_fid */
 	if (v9inode->writeback_fid) {
-		p9_client_clunk(v9inode->writeback_fid);
+		p9_fid_put(v9inode->writeback_fid);
 		v9inode->writeback_fid = NULL;
 	}
 }
@@ -569,7 +569,7 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
 	if (v9fs_proto_dotl(v9ses))
 		retval = p9_client_unlinkat(dfid, dentry->d_name.name,
 					    v9fs_at_to_dotl_flags(flags));
-	p9_client_clunk(dfid);
+	p9_fid_put(dfid);
 	if (retval == -EOPNOTSUPP) {
 		/* Try the one based on path */
 		v9fid = v9fs_fid_clone(dentry);
@@ -633,14 +633,14 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	if (IS_ERR(ofid)) {
 		err = PTR_ERR(ofid);
 		p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
-		p9_client_clunk(dfid);
+		p9_fid_put(dfid);
 		return ERR_PTR(err);
 	}
 
 	err = p9_client_fcreate(ofid, name, perm, mode, extension);
 	if (err < 0) {
 		p9_debug(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err);
-		p9_client_clunk(dfid);
+		p9_fid_put(dfid);
 		goto error;
 	}
 
@@ -652,7 +652,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 			p9_debug(P9_DEBUG_VFS,
 				   "p9_client_walk failed %d\n", err);
 			fid = NULL;
-			p9_client_clunk(dfid);
+			p9_fid_put(dfid);
 			goto error;
 		}
 		/*
@@ -663,20 +663,20 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 			err = PTR_ERR(inode);
 			p9_debug(P9_DEBUG_VFS,
 				   "inode creation failed %d\n", err);
-			p9_client_clunk(dfid);
+			p9_fid_put(dfid);
 			goto error;
 		}
 		v9fs_fid_add(dentry, fid);
 		d_instantiate(dentry, inode);
 	}
-	p9_client_clunk(dfid);
+	p9_fid_put(dfid);
 	return ofid;
 error:
 	if (ofid)
-		p9_client_clunk(ofid);
+		p9_fid_put(ofid);
 
 	if (fid)
-		p9_client_clunk(fid);
+		p9_fid_put(fid);
 
 	return ERR_PTR(err);
 }
@@ -708,7 +708,7 @@ v9fs_vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
 		return PTR_ERR(fid);
 
 	v9fs_invalidate_inode_attr(dir);
-	p9_client_clunk(fid);
+	p9_fid_put(fid);
 
 	return 0;
 }
@@ -744,7 +744,7 @@ static int v9fs_vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
 	}
 
 	if (fid)
-		p9_client_clunk(fid);
+		p9_fid_put(fid);
 
 	return err;
 }
@@ -785,7 +785,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	 */
 	name = dentry->d_name.name;
 	fid = p9_client_walk(dfid, 1, &name, 1);
-	p9_client_clunk(dfid);
+	p9_fid_put(dfid);
 	if (fid == ERR_PTR(-ENOENT))
 		inode = NULL;
 	else if (IS_ERR(fid))
@@ -808,7 +808,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 		else if (!IS_ERR(res))
 			v9fs_fid_add(res, fid);
 		else
-			p9_client_clunk(fid);
+			p9_fid_put(fid);
 	}
 	return res;
 }
@@ -891,7 +891,7 @@ out:
 
 error:
 	if (fid)
-		p9_client_clunk(fid);
+		p9_fid_put(fid);
 	goto out;
 }
 
@@ -959,7 +959,7 @@ v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 	dfid = v9fs_parent_fid(old_dentry);
 	olddirfid = clone_fid(dfid);
 	if (dfid && !IS_ERR(dfid))
-		p9_client_clunk(dfid);
+		p9_fid_put(dfid);
 
 	if (IS_ERR(olddirfid)) {
 		retval = PTR_ERR(olddirfid);
@@ -968,7 +968,7 @@ v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 
 	dfid = v9fs_parent_fid(new_dentry);
 	newdirfid = clone_fid(dfid);
-	p9_client_clunk(dfid);
+	p9_fid_put(dfid);
 
 	if (IS_ERR(newdirfid)) {
 		retval = PTR_ERR(newdirfid);
@@ -1020,13 +1020,13 @@ clunk_newdir:
 		d_move(old_dentry, new_dentry);
 	}
 	up_write(&v9ses->rename_sem);
-	p9_client_clunk(newdirfid);
+	p9_fid_put(newdirfid);
 
 clunk_olddir:
-	p9_client_clunk(olddirfid);
+	p9_fid_put(olddirfid);
 
 done:
-	p9_client_clunk(oldfid);
+	p9_fid_put(oldfid);
 	return retval;
 }
 
@@ -1060,7 +1060,7 @@ v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
 		return PTR_ERR(fid);
 
 	st = p9_client_stat(fid);
-	p9_client_clunk(fid);
+	p9_fid_put(fid);
 	if (IS_ERR(st))
 		return PTR_ERR(st);
 
@@ -1136,7 +1136,7 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns,
 	retval = p9_client_wstat(fid, &wstat);
 
 	if (use_dentry)
-		p9_client_clunk(fid);
+		p9_fid_put(fid);
 
 	if (retval < 0)
 		return retval;
@@ -1261,7 +1261,7 @@ static const char *v9fs_vfs_get_link(struct dentry *dentry,
 		return ERR_CAST(fid);
 
 	st = p9_client_stat(fid);
-	p9_client_clunk(fid);
+	p9_fid_put(fid);
 	if (IS_ERR(st))
 		return ERR_CAST(st);
 
@@ -1308,7 +1308,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
 		return PTR_ERR(fid);
 
 	v9fs_invalidate_inode_attr(dir);
-	p9_client_clunk(fid);
+	p9_fid_put(fid);
 	return 0;
 }
 
@@ -1364,7 +1364,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
 		v9fs_refresh_inode(oldfid, d_inode(old_dentry));
 		v9fs_invalidate_inode_attr(dir);
 	}
-	p9_client_clunk(oldfid);
+	p9_fid_put(oldfid);
 	return retval;
 }
 
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index b6eb1160296c..09b124fe349c 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -274,7 +274,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(ofid)) {
 		err = PTR_ERR(ofid);
 		p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
-		p9_client_clunk(dfid);
+		p9_fid_put(dfid);
 		goto out;
 	}
 
@@ -286,7 +286,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 	if (err) {
 		p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n",
 			 err);
-		p9_client_clunk(dfid);
+		p9_fid_put(dfid);
 		goto error;
 	}
 	err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags),
@@ -294,14 +294,14 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 	if (err < 0) {
 		p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n",
 			 err);
-		p9_client_clunk(dfid);
+		p9_fid_put(dfid);
 		goto error;
 	}
 	v9fs_invalidate_inode_attr(dir);
 
 	/* instantiate inode and assign the unopened fid to the dentry */
 	fid = p9_client_walk(dfid, 1, &name, 1);
-	p9_client_clunk(dfid);
+	p9_fid_put(dfid);
 	if (IS_ERR(fid)) {
 		err = PTR_ERR(fid);
 		p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
@@ -358,10 +358,10 @@ out:
 
 error:
 	if (fid)
-		p9_client_clunk(fid);
+		p9_fid_put(fid);
 err_clunk_old_fid:
 	if (ofid)
-		p9_client_clunk(ofid);
+		p9_fid_put(ofid);
 	goto out;
 }
 
@@ -458,9 +458,9 @@ static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns,
 	v9fs_invalidate_inode_attr(dir);
 error:
 	if (fid)
-		p9_client_clunk(fid);
+		p9_fid_put(fid);
 	v9fs_put_acl(dacl, pacl);
-	p9_client_clunk(dfid);
+	p9_fid_put(dfid);
 	return err;
 }
 
@@ -489,7 +489,7 @@ v9fs_vfs_getattr_dotl(struct user_namespace *mnt_userns,
 	 */
 
 	st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
-	p9_client_clunk(fid);
+	p9_fid_put(fid);
 	if (IS_ERR(st))
 		return PTR_ERR(st);
 
@@ -603,7 +603,7 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns,
 	retval = p9_client_setattr(fid, &p9attr);
 	if (retval < 0) {
 		if (use_dentry)
-			p9_client_clunk(fid);
+			p9_fid_put(fid);
 		return retval;
 	}
 
@@ -619,12 +619,12 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns,
 		retval = v9fs_acl_chmod(inode, fid);
 		if (retval < 0) {
 			if (use_dentry)
-				p9_client_clunk(fid);
+				p9_fid_put(fid);
 			return retval;
 		}
 	}
 	if (use_dentry)
-		p9_client_clunk(fid);
+		p9_fid_put(fid);
 
 	return 0;
 }
@@ -771,9 +771,9 @@ v9fs_vfs_symlink_dotl(struct user_namespace *mnt_userns, struct inode *dir,
 
 error:
 	if (fid)
-		p9_client_clunk(fid);
+		p9_fid_put(fid);
 
-	p9_client_clunk(dfid);
+	p9_fid_put(dfid);
 	return err;
 }
 
@@ -803,14 +803,14 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
 
 	oldfid = v9fs_fid_lookup(old_dentry);
 	if (IS_ERR(oldfid)) {
-		p9_client_clunk(dfid);
+		p9_fid_put(dfid);
 		return PTR_ERR(oldfid);
 	}
 
 	err = p9_client_link(dfid, oldfid, dentry->d_name.name);
 
-	p9_client_clunk(dfid);
-	p9_client_clunk(oldfid);
+	p9_fid_put(dfid);
+	p9_fid_put(oldfid);
 	if (err < 0) {
 		p9_debug(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
 		return err;
@@ -826,7 +826,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
 			return PTR_ERR(fid);
 
 		v9fs_refresh_inode_dotl(fid, d_inode(old_dentry));
-		p9_client_clunk(fid);
+		p9_fid_put(fid);
 	}
 	ihold(d_inode(old_dentry));
 	d_instantiate(dentry, d_inode(old_dentry));
@@ -924,9 +924,9 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir,
 	}
 error:
 	if (fid)
-		p9_client_clunk(fid);
+		p9_fid_put(fid);
 	v9fs_put_acl(dacl, pacl);
-	p9_client_clunk(dfid);
+	p9_fid_put(dfid);
 
 	return err;
 }
@@ -956,7 +956,7 @@ v9fs_vfs_get_link_dotl(struct dentry *dentry,
 	if (IS_ERR(fid))
 		return ERR_CAST(fid);
 	retval = p9_client_readlink(fid, &target);
-	p9_client_clunk(fid);
+	p9_fid_put(fid);
 	if (retval)
 		return ERR_PTR(retval);
 	set_delayed_call(done, kfree_link, target);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 97e23b4e6982..bf350fad9500 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -190,7 +190,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 	return dget(sb->s_root);
 
 clunk_fid:
-	p9_client_clunk(fid);
+	p9_fid_put(fid);
 	v9fs_session_close(v9ses);
 free_session:
 	kfree(v9ses);
@@ -203,7 +203,7 @@ release_sb:
 	 * attached the fid to dentry so it won't get clunked
 	 * automatically.
 	 */
-	p9_client_clunk(fid);
+	p9_fid_put(fid);
 	deactivate_locked_super(sb);
 	return ERR_PTR(retval);
 }
@@ -270,7 +270,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	}
 	res = simple_statfs(dentry, buf);
 done:
-	p9_client_clunk(fid);
+	p9_fid_put(fid);
 	return res;
 }
 
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index a824441b95a2..1f9298a4bd42 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -44,7 +44,7 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
 		if (err)
 			retval = err;
 	}
-	p9_client_clunk(attr_fid);
+	p9_fid_put(attr_fid);
 	return retval;
 }
 
@@ -71,7 +71,7 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
 	if (IS_ERR(fid))
 		return PTR_ERR(fid);
 	ret = v9fs_fid_xattr_get(fid, name, buffer, buffer_size);
-	p9_client_clunk(fid);
+	p9_fid_put(fid);
 
 	return ret;
 }
@@ -98,7 +98,7 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
 	if (IS_ERR(fid))
 		return PTR_ERR(fid);
 	ret = v9fs_fid_xattr_set(fid, name, value, value_len, flags);
-	p9_client_clunk(fid);
+	p9_fid_put(fid);
 	return ret;
 }
 
@@ -128,7 +128,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
 			 retval);
 	else
 		p9_client_write(fid, 0, &from, &retval);
-	err = p9_client_clunk(fid);
+	err = p9_fid_put(fid);
 	if (!retval && err)
 		retval = err;
 	return retval;
diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index ec1d1706f43c..eabb53992350 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -237,6 +237,34 @@ static inline int p9_req_try_get(struct p9_req_t *r)
 
 int p9_req_put(struct p9_req_t *r);
 
+/* fid reference counting helpers:
+ *  - fids used for any length of time should always be referenced through
+ *    p9_fid_get(), and released with p9_fid_put()
+ *  - v9fs_fid_lookup() or similar will automatically call get for you
+ *    and also require a put
+ *  - the *_fid_add() helpers will stash the fid in the inode,
+ *    at which point it is the responsibility of evict_inode()
+ *    to call the put
+ *  - the last put will automatically send a clunk to the server
+ */
+static inline struct p9_fid *p9_fid_get(struct p9_fid *fid)
+{
+	refcount_inc(&fid->count);
+
+	return fid;
+}
+
+static inline int p9_fid_put(struct p9_fid *fid)
+{
+	if (!fid || IS_ERR(fid))
+		return 0;
+
+	if (!refcount_dec_and_test(&fid->count))
+		return 0;
+
+	return p9_client_clunk(fid);
+}
+
 void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status);
 
 int p9_parse_header(struct p9_fcall *pdu, int32_t *size, int8_t *type,
diff --git a/net/9p/client.c b/net/9p/client.c
index 8bba0d9cf975..f3eb280c7d9d 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -1228,7 +1228,7 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname,
 
 clunk_fid:
 	kfree(wqids);
-	p9_client_clunk(fid);
+	p9_fid_put(fid);
 	fid = NULL;
 
 error:
@@ -1459,15 +1459,6 @@ int p9_client_clunk(struct p9_fid *fid)
 	struct p9_req_t *req;
 	int retries = 0;
 
-	if (!fid || IS_ERR(fid)) {
-		pr_warn("%s (%d): Trying to clunk with invalid fid\n",
-			__func__, task_pid_nr(current));
-		dump_stack();
-		return 0;
-	}
-	if (!refcount_dec_and_test(&fid->count))
-		return 0;
-
 again:
 	p9_debug(P9_DEBUG_9P, ">>> TCLUNK fid %d (try %d)\n",
 		 fid->fid, retries);
@@ -1519,7 +1510,7 @@ int p9_client_remove(struct p9_fid *fid)
 	p9_tag_remove(clnt, req);
 error:
 	if (err == -ERESTARTSYS)
-		p9_client_clunk(fid);
+		p9_fid_put(fid);
 	else
 		p9_fid_destroy(fid);
 	return err;
@@ -2042,7 +2033,7 @@ struct p9_fid *p9_client_xattrwalk(struct p9_fid *file_fid,
 		 attr_fid->fid, *attr_size);
 	return attr_fid;
 clunk_fid:
-	p9_client_clunk(attr_fid);
+	p9_fid_put(attr_fid);
 	attr_fid = NULL;
 error:
 	if (attr_fid && attr_fid != file_fid)
-- 
cgit 


From 286c171b86ebc693e18b485dde3a3fc470af37bd Mon Sep 17 00:00:00 2001
From: Dominique Martinet <asmadeus@codewreck.org>
Date: Mon, 13 Jun 2022 08:32:06 +0900
Subject: 9p fid refcount: add a 9p_fid_ref tracepoint

This adds a tracepoint event for 9p fid lifecycle tracing: when a fid
is created, its reference count increased/decreased, and freed.
The new 9p_fid_ref tracepoint should help anyone wishing to debug any
fid problem such as missing clunk (destroy) or use-after-free.

Link: https://lkml.kernel.org/r/20220612085330.1451496-6-asmadeus@codewreck.org
Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
---
 include/net/9p/client.h   | 13 +++++++++++++
 include/trace/events/9p.h | 48 +++++++++++++++++++++++++++++++++++++++++++++++
 net/9p/client.c           | 20 +++++++++++++++++++-
 3 files changed, 80 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index eabb53992350..8f629f1df865 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -11,6 +11,7 @@
 
 #include <linux/utsname.h>
 #include <linux/idr.h>
+#include <linux/tracepoint-defs.h>
 
 /* Number of requests per row */
 #define P9_ROW_MAXTAG 255
@@ -237,6 +238,12 @@ static inline int p9_req_try_get(struct p9_req_t *r)
 
 int p9_req_put(struct p9_req_t *r);
 
+/* We cannot have the real tracepoints in header files,
+ * use a wrapper function */
+DECLARE_TRACEPOINT(9p_fid_ref);
+void do_trace_9p_fid_get(struct p9_fid *fid);
+void do_trace_9p_fid_put(struct p9_fid *fid);
+
 /* fid reference counting helpers:
  *  - fids used for any length of time should always be referenced through
  *    p9_fid_get(), and released with p9_fid_put()
@@ -249,6 +256,9 @@ int p9_req_put(struct p9_req_t *r);
  */
 static inline struct p9_fid *p9_fid_get(struct p9_fid *fid)
 {
+	if (tracepoint_enabled(9p_fid_ref))
+		do_trace_9p_fid_get(fid);
+
 	refcount_inc(&fid->count);
 
 	return fid;
@@ -259,6 +269,9 @@ static inline int p9_fid_put(struct p9_fid *fid)
 	if (!fid || IS_ERR(fid))
 		return 0;
 
+	if (tracepoint_enabled(9p_fid_ref))
+		do_trace_9p_fid_put(fid);
+
 	if (!refcount_dec_and_test(&fid->count))
 		return 0;
 
diff --git a/include/trace/events/9p.h b/include/trace/events/9p.h
index 78c5608a1648..4dfa6d7f83ba 100644
--- a/include/trace/events/9p.h
+++ b/include/trace/events/9p.h
@@ -77,6 +77,13 @@
 		EM( P9_TWSTAT,		"P9_TWSTAT" )			\
 		EMe(P9_RWSTAT,		"P9_RWSTAT" )
 
+
+#define P9_FID_REFTYPE							\
+		EM( P9_FID_REF_CREATE,	"create " )			\
+		EM( P9_FID_REF_GET,	"get    " )			\
+		EM( P9_FID_REF_PUT,	"put    " )			\
+		EMe(P9_FID_REF_DESTROY,	"destroy" )
+
 /* Define EM() to export the enums to userspace via TRACE_DEFINE_ENUM() */
 #undef EM
 #undef EMe
@@ -84,6 +91,21 @@
 #define EMe(a, b)	TRACE_DEFINE_ENUM(a);
 
 P9_MSG_T
+P9_FID_REFTYPE
+
+/* And also use EM/EMe to define helper enums -- once */
+#ifndef __9P_DECLARE_TRACE_ENUMS_ONLY_ONCE
+#define __9P_DECLARE_TRACE_ENUMS_ONLY_ONCE
+#undef EM
+#undef EMe
+#define EM(a, b)	a,
+#define EMe(a, b)	a
+
+enum p9_fid_reftype {
+	P9_FID_REFTYPE
+} __mode(byte);
+
+#endif
 
 /*
  * Now redefine the EM() and EMe() macros to map the enums to the strings
@@ -96,6 +118,8 @@ P9_MSG_T
 
 #define show_9p_op(type)						\
 	__print_symbolic(type, P9_MSG_T)
+#define show_9p_fid_reftype(type)					\
+	__print_symbolic(type, P9_FID_REFTYPE)
 
 TRACE_EVENT(9p_client_req,
 	    TP_PROTO(struct p9_client *clnt, int8_t type, int tag),
@@ -168,6 +192,30 @@ TRACE_EVENT(9p_protocol_dump,
 		      __entry->tag, 0, __entry->line, 16, __entry->line + 16)
  );
 
+
+TRACE_EVENT(9p_fid_ref,
+	    TP_PROTO(struct p9_fid *fid, __u8 type),
+
+	    TP_ARGS(fid, type),
+
+	    TP_STRUCT__entry(
+		    __field(	int,	fid		)
+		    __field(	int,	refcount	)
+		    __field(	__u8, type	)
+		    ),
+
+	    TP_fast_assign(
+		    __entry->fid = fid->fid;
+		    __entry->refcount = refcount_read(&fid->count);
+		    __entry->type = type;
+		    ),
+
+	    TP_printk("%s fid %d, refcount %d",
+		      show_9p_fid_reftype(__entry->type),
+		      __entry->fid, __entry->refcount)
+);
+
+
 #endif /* _TRACE_9P_H */
 
 /* This part must be outside protection */
diff --git a/net/9p/client.c b/net/9p/client.c
index f3eb280c7d9d..dfe8beb864fc 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -907,8 +907,10 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt)
 			    GFP_NOWAIT);
 	spin_unlock_irq(&clnt->lock);
 	idr_preload_end();
-	if (!ret)
+	if (!ret) {
+		trace_9p_fid_ref(fid, P9_FID_REF_CREATE);
 		return fid;
+	}
 
 	kfree(fid);
 	return NULL;
@@ -920,6 +922,7 @@ static void p9_fid_destroy(struct p9_fid *fid)
 	unsigned long flags;
 
 	p9_debug(P9_DEBUG_FID, "fid %d\n", fid->fid);
+	trace_9p_fid_ref(fid, P9_FID_REF_DESTROY);
 	clnt = fid->clnt;
 	spin_lock_irqsave(&clnt->lock, flags);
 	idr_remove(&clnt->fids, fid->fid);
@@ -928,6 +931,21 @@ static void p9_fid_destroy(struct p9_fid *fid)
 	kfree(fid);
 }
 
+/* We also need to export tracepoint symbols for tracepoint_enabled() */
+EXPORT_TRACEPOINT_SYMBOL(9p_fid_ref);
+
+void do_trace_9p_fid_get(struct p9_fid *fid)
+{
+	trace_9p_fid_ref(fid, P9_FID_REF_GET);
+}
+EXPORT_SYMBOL(do_trace_9p_fid_get);
+
+void do_trace_9p_fid_put(struct p9_fid *fid)
+{
+	trace_9p_fid_ref(fid, P9_FID_REF_PUT);
+}
+EXPORT_SYMBOL(do_trace_9p_fid_put);
+
 static int p9_client_version(struct p9_client *c)
 {
 	int err = 0;
-- 
cgit 


From 504148fedb854299972d164b001357b888a9193e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 30 Jun 2022 15:07:50 +0000
Subject: net: add skb_[inner_]tcp_all_headers helpers

Most drivers use "skb_transport_offset(skb) + tcp_hdrlen(skb)"
to compute headers length for a TCP packet, but others
use more convoluted (but equivalent) ways.

Add skb_tcp_all_headers() and skb_inner_tcp_all_headers()
helpers to harmonize this a bit.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/ulp/ipoib/ipoib_ib.c            |  2 +-
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c           |  6 ++---
 drivers/net/ethernet/atheros/atl1c/atl1c_main.c    |  9 ++++---
 drivers/net/ethernet/atheros/atl1e/atl1e_main.c    |  8 +++---
 drivers/net/ethernet/atheros/atlx/atl1.c           |  7 +++--
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c    | 17 +++++-------
 drivers/net/ethernet/broadcom/bnxt/bnxt.c          |  7 ++---
 drivers/net/ethernet/broadcom/tg3.c                |  2 +-
 drivers/net/ethernet/brocade/bna/bnad.c            |  6 ++---
 drivers/net/ethernet/cadence/macb_main.c           |  2 +-
 drivers/net/ethernet/cavium/thunder/nicvf_queues.c |  4 +--
 drivers/net/ethernet/chelsio/cxgb4/sge.c           |  2 +-
 .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c      |  6 ++---
 drivers/net/ethernet/cisco/enic/enic_main.c        |  5 ++--
 drivers/net/ethernet/emulex/benet/be_main.c        |  6 ++---
 drivers/net/ethernet/freescale/fec_main.c          |  2 +-
 drivers/net/ethernet/fungible/funeth/funeth_tx.c   |  2 +-
 drivers/net/ethernet/google/gve/gve_tx_dqo.c       |  4 +--
 drivers/net/ethernet/hisilicon/hns/hns_enet.c      |  6 ++---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c    |  4 +--
 drivers/net/ethernet/hisilicon/hns3/hns3_trace.h   |  3 +--
 drivers/net/ethernet/ibm/ehea/ehea_main.c          |  2 +-
 drivers/net/ethernet/intel/e1000/e1000_main.c      |  4 +--
 drivers/net/ethernet/intel/e1000e/netdev.c         |  4 +--
 drivers/net/ethernet/intel/ixgb/ixgb_main.c        |  2 +-
 drivers/net/ethernet/marvell/mv643xx_eth.c         |  2 +-
 drivers/net/ethernet/marvell/mvneta.c              |  4 +--
 .../net/ethernet/marvell/octeontx2/nic/otx2_txrx.c |  4 +--
 drivers/net/ethernet/marvell/sky2.c                |  2 +-
 drivers/net/ethernet/mellanox/mlx4/en_tx.c         |  4 +--
 .../ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c    |  4 +--
 drivers/net/ethernet/myricom/myri10ge/myri10ge.c   |  2 +-
 drivers/net/ethernet/netronome/nfp/nfd3/dp.c       |  5 ++--
 drivers/net/ethernet/netronome/nfp/nfdk/dp.c       |  5 ++--
 .../net/ethernet/netronome/nfp/nfp_net_common.c    |  7 +++--
 drivers/net/ethernet/pensando/ionic/ionic_txrx.c   |  5 ++--
 .../net/ethernet/qlogic/netxen/netxen_nic_main.c   |  2 +-
 drivers/net/ethernet/qlogic/qede/qede_fp.c         |  8 +++---
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c     |  2 +-
 drivers/net/ethernet/qualcomm/emac/emac-mac.c      |  4 +--
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  |  2 +-
 drivers/net/ethernet/synopsys/dwc-xlgmac-net.c     |  2 +-
 drivers/net/wireless/ath/wil6210/txrx.c            |  4 +--
 drivers/net/xen-netback/netback.c                  |  4 +--
 drivers/staging/qlge/qlge_main.c                   |  2 +-
 include/linux/tcp.h                                | 30 ++++++++++++++++++++++
 net/tls/tls_device_fallback.c                      |  6 ++---
 48 files changed, 119 insertions(+), 115 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 2c3dca41d3bd..f7995519bbc8 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -573,7 +573,7 @@ int ipoib_send(struct net_device *dev, struct sk_buff *skb,
 	unsigned int usable_sge = priv->max_send_sge - !!skb_headlen(skb);
 
 	if (skb_is_gso(skb)) {
-		hlen = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		hlen = skb_tcp_all_headers(skb);
 		phead = skb->data;
 		if (unlikely(!skb_pull(skb, hlen))) {
 			ipoib_warn(priv, "linear data too small\n");
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 4d46780fad13..f342bb853189 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1673,12 +1673,10 @@ static int xgbe_prep_tso(struct sk_buff *skb, struct xgbe_packet_data *packet)
 		return ret;
 
 	if (XGMAC_GET_BITS(packet->attributes, TX_PACKET_ATTRIBUTES, VXLAN)) {
-		packet->header_len = skb_inner_transport_offset(skb) +
-				     inner_tcp_hdrlen(skb);
+		packet->header_len = skb_inner_tcp_all_headers(skb);
 		packet->tcp_header_len = inner_tcp_hdrlen(skb);
 	} else {
-		packet->header_len = skb_transport_offset(skb) +
-				     tcp_hdrlen(skb);
+		packet->header_len = skb_tcp_all_headers(skb);
 		packet->tcp_header_len = tcp_hdrlen(skb);
 	}
 	packet->tcp_payload_len = skb->len - packet->header_len;
diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
index 24fe967c18cd..948584761e66 100644
--- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
+++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
@@ -2072,7 +2072,7 @@ static u16 atl1c_cal_tpd_req(const struct sk_buff *skb)
 	tpd_req = skb_shinfo(skb)->nr_frags + 1;
 
 	if (skb_is_gso(skb)) {
-		proto_hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		proto_hdr_len = skb_tcp_all_headers(skb);
 		if (proto_hdr_len < skb_headlen(skb))
 			tpd_req++;
 		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
@@ -2107,7 +2107,7 @@ static int atl1c_tso_csum(struct atl1c_adapter *adapter,
 			if (real_len < skb->len)
 				pskb_trim(skb, real_len);
 
-			hdr_len = (skb_transport_offset(skb) + tcp_hdrlen(skb));
+			hdr_len = skb_tcp_all_headers(skb);
 			if (unlikely(skb->len == hdr_len)) {
 				/* only xsum need */
 				if (netif_msg_tx_queued(adapter))
@@ -2132,7 +2132,7 @@ static int atl1c_tso_csum(struct atl1c_adapter *adapter,
 			*tpd = atl1c_get_tpd(adapter, queue);
 			ipv6_hdr(skb)->payload_len = 0;
 			/* check payload == 0 byte ? */
-			hdr_len = (skb_transport_offset(skb) + tcp_hdrlen(skb));
+			hdr_len = skb_tcp_all_headers(skb);
 			if (unlikely(skb->len == hdr_len)) {
 				/* only xsum need */
 				if (netif_msg_tx_queued(adapter))
@@ -2219,7 +2219,8 @@ static int atl1c_tx_map(struct atl1c_adapter *adapter,
 	tso = (tpd->word1 >> TPD_LSO_EN_SHIFT) & TPD_LSO_EN_MASK;
 	if (tso) {
 		/* TSO */
-		map_len = hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		hdr_len = skb_tcp_all_headers(skb);
+		map_len = hdr_len;
 		use_tpd = tpd;
 
 		buffer_info = atl1c_get_tx_buffer(adapter, use_tpd);
diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
index 4fc9e6350c49..57a51fb7746c 100644
--- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
+++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
@@ -1609,8 +1609,7 @@ static u16 atl1e_cal_tdp_req(const struct sk_buff *skb)
 	if (skb_is_gso(skb)) {
 		if (skb->protocol == htons(ETH_P_IP) ||
 		   (skb_shinfo(skb)->gso_type == SKB_GSO_TCPV6)) {
-			proto_hdr_len = skb_transport_offset(skb) +
-					tcp_hdrlen(skb);
+			proto_hdr_len = skb_tcp_all_headers(skb);
 			if (proto_hdr_len < skb_headlen(skb)) {
 				tpd_req += ((skb_headlen(skb) - proto_hdr_len +
 					   MAX_TX_BUF_LEN - 1) >>
@@ -1645,7 +1644,7 @@ static int atl1e_tso_csum(struct atl1e_adapter *adapter,
 			if (real_len < skb->len)
 				pskb_trim(skb, real_len);
 
-			hdr_len = (skb_transport_offset(skb) + tcp_hdrlen(skb));
+			hdr_len = skb_tcp_all_headers(skb);
 			if (unlikely(skb->len == hdr_len)) {
 				/* only xsum need */
 				netdev_warn(adapter->netdev,
@@ -1713,7 +1712,8 @@ static int atl1e_tx_map(struct atl1e_adapter *adapter,
 	segment = (tpd->word3 >> TPD_SEGMENT_EN_SHIFT) & TPD_SEGMENT_EN_MASK;
 	if (segment) {
 		/* TSO */
-		map_len = hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		hdr_len = skb_tcp_all_headers(skb);
+		map_len = hdr_len;
 		use_tpd = tpd;
 
 		tx_buffer = atl1e_get_tx_buffer(adapter, use_tpd);
diff --git a/drivers/net/ethernet/atheros/atlx/atl1.c b/drivers/net/ethernet/atheros/atlx/atl1.c
index 6a969969d221..ff1fe09abf9f 100644
--- a/drivers/net/ethernet/atheros/atlx/atl1.c
+++ b/drivers/net/ethernet/atheros/atlx/atl1.c
@@ -2115,7 +2115,7 @@ static int atl1_tso(struct atl1_adapter *adapter, struct sk_buff *skb,
 				ntohs(iph->tot_len));
 			if (real_len < skb->len)
 				pskb_trim(skb, real_len);
-			hdr_len = (skb_transport_offset(skb) + tcp_hdrlen(skb));
+			hdr_len = skb_tcp_all_headers(skb);
 			if (skb->len == hdr_len) {
 				iph->check = 0;
 				tcp_hdr(skb)->check =
@@ -2206,7 +2206,7 @@ static void atl1_tx_map(struct atl1_adapter *adapter, struct sk_buff *skb,
 	retval = (ptpd->word3 >> TPD_SEGMENT_EN_SHIFT) & TPD_SEGMENT_EN_MASK;
 	if (retval) {
 		/* TSO */
-		hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		hdr_len = skb_tcp_all_headers(skb);
 		buffer_info->length = hdr_len;
 		page = virt_to_page(skb->data);
 		offset = offset_in_page(skb->data);
@@ -2367,8 +2367,7 @@ static netdev_tx_t atl1_xmit_frame(struct sk_buff *skb,
 	mss = skb_shinfo(skb)->gso_size;
 	if (mss) {
 		if (skb->protocol == htons(ETH_P_IP)) {
-			proto_hdr_len = (skb_transport_offset(skb) +
-					 tcp_hdrlen(skb));
+			proto_hdr_len = skb_tcp_all_headers(skb);
 			if (unlikely(proto_hdr_len > len)) {
 				dev_kfree_skb_any(skb);
 				return NETDEV_TX_OK;
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 5729a5ab059d..712b5595bc39 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -3421,12 +3421,9 @@ static int bnx2x_pkt_req_lin(struct bnx2x *bp, struct sk_buff *skb,
 
 			/* Headers length */
 			if (xmit_type & XMIT_GSO_ENC)
-				hlen = (int)(skb_inner_transport_header(skb) -
-					     skb->data) +
-					     inner_tcp_hdrlen(skb);
+				hlen = skb_inner_tcp_all_headers(skb);
 			else
-				hlen = (int)(skb_transport_header(skb) -
-					     skb->data) + tcp_hdrlen(skb);
+				hlen = skb_tcp_all_headers(skb);
 
 			/* Amount of data (w/o headers) on linear part of SKB*/
 			first_bd_sz = skb_headlen(skb) - hlen;
@@ -3534,15 +3531,13 @@ static u8 bnx2x_set_pbd_csum_enc(struct bnx2x *bp, struct sk_buff *skb,
 			ETH_TX_PARSE_BD_E2_TCP_HDR_LENGTH_DW_SHIFT) &
 			ETH_TX_PARSE_BD_E2_TCP_HDR_LENGTH_DW;
 
-		return skb_inner_transport_header(skb) +
-			inner_tcp_hdrlen(skb) - skb->data;
+		return skb_inner_tcp_all_headers(skb);
 	}
 
 	/* We support checksum offload for TCP and UDP only.
 	 * No need to pass the UDP header length - it's a constant.
 	 */
-	return skb_inner_transport_header(skb) +
-		sizeof(struct udphdr) - skb->data;
+	return skb_inner_transport_offset(skb) + sizeof(struct udphdr);
 }
 
 /**
@@ -3568,12 +3563,12 @@ static u8 bnx2x_set_pbd_csum_e2(struct bnx2x *bp, struct sk_buff *skb,
 			ETH_TX_PARSE_BD_E2_TCP_HDR_LENGTH_DW_SHIFT) &
 			ETH_TX_PARSE_BD_E2_TCP_HDR_LENGTH_DW;
 
-		return skb_transport_header(skb) + tcp_hdrlen(skb) - skb->data;
+		return skb_tcp_all_headers(skb);
 	}
 	/* We support checksum offload for TCP and UDP only.
 	 * No need to pass the UDP header length - it's a constant.
 	 */
-	return skb_transport_header(skb) + sizeof(struct udphdr) - skb->data;
+	return skb_transport_offset(skb) + sizeof(struct udphdr);
 }
 
 /* set FW indication according to inner or outer protocols if tunneled */
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index b474a4fe4039..11d35da61921 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -535,12 +535,9 @@ normal_tx:
 		u32 hdr_len;
 
 		if (skb->encapsulation)
-			hdr_len = skb_inner_network_offset(skb) +
-				skb_inner_network_header_len(skb) +
-				inner_tcp_hdrlen(skb);
+			hdr_len = skb_inner_tcp_all_headers(skb);
 		else
-			hdr_len = skb_transport_offset(skb) +
-				tcp_hdrlen(skb);
+			hdr_len = skb_tcp_all_headers(skb);
 
 		txbd1->tx_bd_hsize_lflags |= cpu_to_le32(TX_BD_FLAGS_LSO |
 					TX_BD_FLAGS_T_IPID |
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index c28f8cc00d1c..db1e9d810b41 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -7944,7 +7944,7 @@ static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		iph = ip_hdr(skb);
 		tcp_opt_len = tcp_optlen(skb);
 
-		hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb) - ETH_HLEN;
+		hdr_len = skb_tcp_all_headers(skb) - ETH_HLEN;
 
 		/* HW/FW can not correctly segment packets that have been
 		 * vlan encapsulated.
diff --git a/drivers/net/ethernet/brocade/bna/bnad.c b/drivers/net/ethernet/brocade/bna/bnad.c
index f6fe08df568b..29dd0f93d6c0 100644
--- a/drivers/net/ethernet/brocade/bna/bnad.c
+++ b/drivers/net/ethernet/brocade/bna/bnad.c
@@ -2823,8 +2823,7 @@ bnad_txq_wi_prepare(struct bnad *bnad, struct bna_tcb *tcb,
 			BNAD_UPDATE_CTR(bnad, tx_skb_mss_too_long);
 			return -EINVAL;
 		}
-		if (unlikely((gso_size + skb_transport_offset(skb) +
-			      tcp_hdrlen(skb)) >= skb->len)) {
+		if (unlikely((gso_size + skb_tcp_all_headers(skb)) >= skb->len)) {
 			txqent->hdr.wi.opcode = htons(BNA_TXQ_WI_SEND);
 			txqent->hdr.wi.lso_mss = 0;
 			BNAD_UPDATE_CTR(bnad, tx_skb_tso_too_short);
@@ -2872,8 +2871,7 @@ bnad_txq_wi_prepare(struct bnad *bnad, struct bna_tcb *tcb,
 				BNAD_UPDATE_CTR(bnad, tcpcsum_offload);
 
 				if (unlikely(skb_headlen(skb) <
-					    skb_transport_offset(skb) +
-				    tcp_hdrlen(skb))) {
+					    skb_tcp_all_headers(skb))) {
 					BNAD_UPDATE_CTR(bnad, tx_skb_tcp_hdr);
 					return -EINVAL;
 				}
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index d0ea8dbfa213..90a9798424af 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -2267,7 +2267,7 @@ static netdev_tx_t macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
 			/* only queue eth + ip headers separately for UDP */
 			hdrlen = skb_transport_offset(skb);
 		else
-			hdrlen = skb_transport_offset(skb) + tcp_hdrlen(skb);
+			hdrlen = skb_tcp_all_headers(skb);
 		if (skb_headlen(skb) < hdrlen) {
 			netdev_err(bp->dev, "Error - LSO headers fragmented!!!\n");
 			/* if this is required, would need to copy to single buffer */
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
index 4367edbdd579..06397cc8bb36 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -1261,7 +1261,7 @@ int nicvf_xdp_sq_append_pkt(struct nicvf *nic, struct snd_queue *sq,
 static int nicvf_tso_count_subdescs(struct sk_buff *skb)
 {
 	struct skb_shared_info *sh = skb_shinfo(skb);
-	unsigned int sh_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	unsigned int sh_len = skb_tcp_all_headers(skb);
 	unsigned int data_len = skb->len - sh_len;
 	unsigned int p_len = sh->gso_size;
 	long f_id = -1;    /* id of the current fragment */
@@ -1382,7 +1382,7 @@ nicvf_sq_add_hdr_subdesc(struct nicvf *nic, struct snd_queue *sq, int qentry,
 
 	if (nic->hw_tso && skb_shinfo(skb)->gso_size) {
 		hdr->tso = 1;
-		hdr->tso_start = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		hdr->tso_start = skb_tcp_all_headers(skb);
 		hdr->tso_max_paysize = skb_shinfo(skb)->gso_size;
 		/* For non-tunneled pkts, point this to L2 ethertype */
 		hdr->inner_l3_offset = skb_network_offset(skb) - 2;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c
index f889f404305c..ee52e3b1d74f 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -1531,7 +1531,7 @@ static netdev_tx_t cxgb4_eth_xmit(struct sk_buff *skb, struct net_device *dev)
 
 #if IS_ENABLED(CONFIG_CHELSIO_TLS_DEVICE)
 	if (cxgb4_is_ktls_skb(skb) &&
-	    (skb->len - (skb_transport_offset(skb) + tcp_hdrlen(skb))))
+	    (skb->len - skb_tcp_all_headers(skb)))
 		return adap->uld[CXGB4_ULD_KTLS].tx_handler(skb, dev);
 #endif /* CHELSIO_TLS_DEVICE */
 
diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c
index 60b648b46f75..bfee0e4e54b1 100644
--- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c
+++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c
@@ -1012,7 +1012,7 @@ chcr_ktls_write_tcp_options(struct chcr_ktls_info *tx_info, struct sk_buff *skb,
 	/* packet length = eth hdr len + ip hdr len + tcp hdr len
 	 * (including options).
 	 */
-	pktlen = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	pktlen = skb_tcp_all_headers(skb);
 
 	ctrl = sizeof(*cpl) + pktlen;
 	len16 = DIV_ROUND_UP(sizeof(*wr) + ctrl, 16);
@@ -1907,7 +1907,7 @@ static int chcr_ktls_sw_fallback(struct sk_buff *skb,
 		return 0;
 
 	th = tcp_hdr(nskb);
-	skb_offset =  skb_transport_offset(nskb) + tcp_hdrlen(nskb);
+	skb_offset = skb_tcp_all_headers(nskb);
 	data_len = nskb->len - skb_offset;
 	skb_tx_timestamp(nskb);
 
@@ -1938,7 +1938,7 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev)
 	unsigned long flags;
 
 	tcp_seq = ntohl(th->seq);
-	skb_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	skb_offset = skb_tcp_all_headers(skb);
 	skb_data_len = skb->len - skb_offset;
 	data_len = skb_data_len;
 
diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
index 1c81b161de52..372fb7b3a282 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -680,11 +680,10 @@ static int enic_queue_wq_skb_tso(struct enic *enic, struct vnic_wq *wq,
 	skb_frag_t *frag;
 
 	if (skb->encapsulation) {
-		hdr_len = skb_inner_transport_header(skb) - skb->data;
-		hdr_len += inner_tcp_hdrlen(skb);
+		hdr_len = skb_inner_tcp_all_headers(skb);
 		enic_preload_tcp_csum_encap(skb);
 	} else {
-		hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		hdr_len = skb_tcp_all_headers(skb);
 		enic_preload_tcp_csum(skb);
 	}
 
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index 41acd18a3fd2..414362febbb9 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -737,9 +737,9 @@ void be_link_status_update(struct be_adapter *adapter, u8 link_status)
 static int be_gso_hdr_len(struct sk_buff *skb)
 {
 	if (skb->encapsulation)
-		return skb_inner_transport_offset(skb) +
-		       inner_tcp_hdrlen(skb);
-	return skb_transport_offset(skb) + tcp_hdrlen(skb);
+		return skb_inner_tcp_all_headers(skb);
+
+	return skb_tcp_all_headers(skb);
 }
 
 static void be_tx_stats_update(struct be_tx_obj *txo, struct sk_buff *skb)
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index a90275143d87..e8e2aa1e7f01 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -691,7 +691,7 @@ fec_enet_txq_put_hdr_tso(struct fec_enet_priv_tx_q *txq,
 			 struct bufdesc *bdp, int index)
 {
 	struct fec_enet_private *fep = netdev_priv(ndev);
-	int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	int hdr_len = skb_tcp_all_headers(skb);
 	struct bufdesc_ex *ebdp = container_of(bdp, struct bufdesc_ex, desc);
 	void *bufaddr;
 	unsigned long dmabuf;
diff --git a/drivers/net/ethernet/fungible/funeth/funeth_tx.c b/drivers/net/ethernet/fungible/funeth/funeth_tx.c
index 0a4a590218ba..a97e3af00cb9 100644
--- a/drivers/net/ethernet/fungible/funeth/funeth_tx.c
+++ b/drivers/net/ethernet/fungible/funeth/funeth_tx.c
@@ -83,7 +83,7 @@ static struct sk_buff *fun_tls_tx(struct sk_buff *skb, struct funeth_txq *q,
 	const struct fun_ktls_tx_ctx *tls_ctx;
 	u32 datalen, seq;
 
-	datalen = skb->len - (skb_transport_offset(skb) + tcp_hdrlen(skb));
+	datalen = skb->len - skb_tcp_all_headers(skb);
 	if (!datalen)
 		return skb;
 
diff --git a/drivers/net/ethernet/google/gve/gve_tx_dqo.c b/drivers/net/ethernet/google/gve/gve_tx_dqo.c
index f7ba616195f3..588d64819ed5 100644
--- a/drivers/net/ethernet/google/gve/gve_tx_dqo.c
+++ b/drivers/net/ethernet/google/gve/gve_tx_dqo.c
@@ -386,7 +386,7 @@ static int gve_prep_tso(struct sk_buff *skb)
 				     (__force __wsum)htonl(paylen));
 
 		/* Compute length of segmentation header. */
-		header_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		header_len = skb_tcp_all_headers(skb);
 		break;
 	default:
 		return -EINVAL;
@@ -598,9 +598,9 @@ static int gve_num_buffer_descs_needed(const struct sk_buff *skb)
  */
 static bool gve_can_send_tso(const struct sk_buff *skb)
 {
-	const int header_len = skb_checksum_start_offset(skb) + tcp_hdrlen(skb);
 	const int max_bufs_per_seg = GVE_TX_MAX_DATA_DESCS - 1;
 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
+	const int header_len = skb_tcp_all_headers(skb);
 	const int gso_size = shinfo->gso_size;
 	int cur_seg_num_bufs;
 	int cur_seg_size;
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
index 2f0bd21a9082..d94cc8c6681f 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
@@ -31,8 +31,6 @@
 #define HNS_BUFFER_SIZE_2048 2048
 
 #define BD_MAX_SEND_SIZE 8191
-#define SKB_TMP_LEN(SKB) \
-	(((SKB)->transport_header - (SKB)->mac_header) + tcp_hdrlen(SKB))
 
 static void fill_v2_desc_hw(struct hnae_ring *ring, void *priv, int size,
 			    int send_sz, dma_addr_t dma, int frag_end,
@@ -94,7 +92,7 @@ static void fill_v2_desc_hw(struct hnae_ring *ring, void *priv, int size,
 						     HNSV2_TXD_TSE_B, 1);
 					l4_len = tcp_hdrlen(skb);
 					mss = skb_shinfo(skb)->gso_size;
-					paylen = skb->len - SKB_TMP_LEN(skb);
+					paylen = skb->len - skb_tcp_all_headers(skb);
 				}
 			} else if (skb->protocol == htons(ETH_P_IPV6)) {
 				hnae_set_bit(tvsvsn, HNSV2_TXD_IPV6_B, 1);
@@ -108,7 +106,7 @@ static void fill_v2_desc_hw(struct hnae_ring *ring, void *priv, int size,
 						     HNSV2_TXD_TSE_B, 1);
 					l4_len = tcp_hdrlen(skb);
 					mss = skb_shinfo(skb)->gso_size;
-					paylen = skb->len - SKB_TMP_LEN(skb);
+					paylen = skb->len - skb_tcp_all_headers(skb);
 				}
 			}
 			desc->tx.ip_offset = ip_offset;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index ae56306400b8..35d70041b9e8 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1838,9 +1838,9 @@ static unsigned int hns3_tx_bd_num(struct sk_buff *skb, unsigned int *bd_size,
 static unsigned int hns3_gso_hdr_len(struct sk_buff *skb)
 {
 	if (!skb->encapsulation)
-		return skb_transport_offset(skb) + tcp_hdrlen(skb);
+		return skb_tcp_all_headers(skb);
 
-	return skb_inner_transport_offset(skb) + inner_tcp_hdrlen(skb);
+	return skb_inner_tcp_all_headers(skb);
 }
 
 /* HW need every continuous max_non_tso_bd_num buffer data to be larger
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_trace.h b/drivers/net/ethernet/hisilicon/hns3/hns3_trace.h
index 5153e5d41bbd..b8a1ecb4b8fb 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_trace.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_trace.h
@@ -37,8 +37,7 @@ DECLARE_EVENT_CLASS(hns3_skb_template,
 		__entry->gso_segs = skb_shinfo(skb)->gso_segs;
 		__entry->gso_type = skb_shinfo(skb)->gso_type;
 		__entry->hdr_len = skb->encapsulation ?
-		skb_inner_transport_offset(skb) + inner_tcp_hdrlen(skb) :
-		skb_transport_offset(skb) + tcp_hdrlen(skb);
+		skb_inner_tcp_all_headers(skb) : skb_tcp_all_headers(skb);
 		__entry->ip_summed = skb->ip_summed;
 		__entry->fraglist = skb_has_frag_list(skb);
 		hns3_shinfo_pack(skb_shinfo(skb), __entry->size);
diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c
index 8ce3348edf08..5dc302880f5f 100644
--- a/drivers/net/ethernet/ibm/ehea/ehea_main.c
+++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c
@@ -1617,7 +1617,7 @@ static void write_swqe2_immediate(struct sk_buff *skb, struct ehea_swqe *swqe,
 		 * For TSO packets we only copy the headers into the
 		 * immediate area.
 		 */
-		immediate_len = ETH_HLEN + ip_hdrlen(skb) + tcp_hdrlen(skb);
+		immediate_len = skb_tcp_all_headers(skb);
 	}
 
 	if (skb_is_gso(skb) || skb_data_size >= SWQE2_MAX_IMM) {
diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
index 3f5feb55cfba..23299fc56199 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_main.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
@@ -2708,7 +2708,7 @@ static int e1000_tso(struct e1000_adapter *adapter,
 		if (err < 0)
 			return err;
 
-		hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		hdr_len = skb_tcp_all_headers(skb);
 		mss = skb_shinfo(skb)->gso_size;
 		if (protocol == htons(ETH_P_IP)) {
 			struct iphdr *iph = ip_hdr(skb);
@@ -3139,7 +3139,7 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb,
 		max_per_txd = min(mss << 2, max_per_txd);
 		max_txd_pwr = fls(max_per_txd) - 1;
 
-		hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		hdr_len = skb_tcp_all_headers(skb);
 		if (skb->data_len && hdr_len == len) {
 			switch (hw->mac_type) {
 			case e1000_82544: {
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index fa06f68c8c80..38e60def8520 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -5474,7 +5474,7 @@ static int e1000_tso(struct e1000_ring *tx_ring, struct sk_buff *skb,
 	if (err < 0)
 		return err;
 
-	hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	hdr_len = skb_tcp_all_headers(skb);
 	mss = skb_shinfo(skb)->gso_size;
 	if (protocol == htons(ETH_P_IP)) {
 		struct iphdr *iph = ip_hdr(skb);
@@ -5846,7 +5846,7 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb,
 		 * points to just header, pull a few bytes of payload from
 		 * frags into skb->data
 		 */
-		hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		hdr_len = skb_tcp_all_headers(skb);
 		/* we do this workaround for ES2LAN, but it is un-necessary,
 		 * avoiding it could save a lot of cycles
 		 */
diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_main.c b/drivers/net/ethernet/intel/ixgb/ixgb_main.c
index bca53625da33..45be9a1ab6af 100644
--- a/drivers/net/ethernet/intel/ixgb/ixgb_main.c
+++ b/drivers/net/ethernet/intel/ixgb/ixgb_main.c
@@ -1187,7 +1187,7 @@ ixgb_tso(struct ixgb_adapter *adapter, struct sk_buff *skb)
 		if (err < 0)
 			return err;
 
-		hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		hdr_len = skb_tcp_all_headers(skb);
 		mss = skb_shinfo(skb)->gso_size;
 		iph = ip_hdr(skb);
 		iph->tot_len = 0;
diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c
index 57eff4e9e6de..b6be0552a6c1 100644
--- a/drivers/net/ethernet/marvell/mv643xx_eth.c
+++ b/drivers/net/ethernet/marvell/mv643xx_eth.c
@@ -775,7 +775,7 @@ txq_put_hdr_tso(struct sk_buff *skb, struct tx_queue *txq, int length,
 		u32 *first_cmd_sts, bool first_desc)
 {
 	struct mv643xx_eth_private *mp = txq_to_mp(txq);
-	int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	int hdr_len = skb_tcp_all_headers(skb);
 	int tx_index;
 	struct tx_desc *desc;
 	int ret;
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index 384f5a16753d..0caa2df87c04 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -2664,8 +2664,8 @@ err_drop_frame:
 static inline void
 mvneta_tso_put_hdr(struct sk_buff *skb, struct mvneta_tx_queue *txq)
 {
-	int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
 	struct mvneta_tx_buf *buf = &txq->buf[txq->txq_put_index];
+	int hdr_len = skb_tcp_all_headers(skb);
 	struct mvneta_tx_desc *tx_desc;
 
 	tx_desc = mvneta_txq_next_desc_get(txq);
@@ -2727,7 +2727,7 @@ static int mvneta_tx_tso(struct sk_buff *skb, struct net_device *dev,
 	if ((txq->count + tso_count_descs(skb)) >= txq->size)
 		return 0;
 
-	if (skb_headlen(skb) < (skb_transport_offset(skb) + tcp_hdrlen(skb))) {
+	if (skb_headlen(skb) < skb_tcp_all_headers(skb)) {
 		pr_info("*** Is this even possible?\n");
 		return 0;
 	}
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
index 3baeafc40807..a18e8efd0f1e 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
@@ -624,7 +624,7 @@ static void otx2_sqe_add_ext(struct otx2_nic *pfvf, struct otx2_snd_queue *sq,
 	ext->subdc = NIX_SUBDC_EXT;
 	if (skb_shinfo(skb)->gso_size) {
 		ext->lso = 1;
-		ext->lso_sb = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		ext->lso_sb = skb_tcp_all_headers(skb);
 		ext->lso_mps = skb_shinfo(skb)->gso_size;
 
 		/* Only TSOv4 and TSOv6 GSO offloads are supported */
@@ -931,7 +931,7 @@ static bool is_hw_tso_supported(struct otx2_nic *pfvf,
 	 * be correctly modified, hence don't offload such TSO segments.
 	 */
 
-	payload_len = skb->len - (skb_transport_offset(skb) + tcp_hdrlen(skb));
+	payload_len = skb->len - skb_tcp_all_headers(skb);
 	last_seg_size = payload_len % skb_shinfo(skb)->gso_size;
 	if (last_seg_size && last_seg_size < 16)
 		return false;
diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c
index 9f1a7ec0491a..bbea5458000b 100644
--- a/drivers/net/ethernet/marvell/sky2.c
+++ b/drivers/net/ethernet/marvell/sky2.c
@@ -1863,7 +1863,7 @@ static netdev_tx_t sky2_xmit_frame(struct sk_buff *skb,
 	if (mss != 0) {
 
 		if (!(hw->flags & SKY2_HW_NEW_LE))
-			mss += ETH_HLEN + ip_hdrlen(skb) + tcp_hdrlen(skb);
+			mss += skb_tcp_all_headers(skb);
 
 		if (mss != sky2->tx_last_mss) {
 			le = get_tx_le(sky2, &slot);
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index af3b2b59a2a6..43a4102e9c09 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -645,7 +645,7 @@ static int get_real_size(const struct sk_buff *skb,
 		*inline_ok = false;
 		*hopbyhop = 0;
 		if (skb->encapsulation) {
-			*lso_header_size = (skb_inner_transport_header(skb) - skb->data) + inner_tcp_hdrlen(skb);
+			*lso_header_size = skb_inner_tcp_all_headers(skb);
 		} else {
 			/* Detects large IPV6 TCP packets and prepares for removal of
 			 * HBH header that has been pushed by ip6_xmit(),
@@ -653,7 +653,7 @@ static int get_real_size(const struct sk_buff *skb,
 			 */
 			if (ipv6_has_hopopt_jumbo(skb))
 				*hopbyhop = sizeof(struct hop_jumbo_hdr);
-			*lso_header_size = skb_transport_offset(skb) + tcp_hdrlen(skb);
+			*lso_header_size = skb_tcp_all_headers(skb);
 		}
 		real_size = CTRL_SIZE + shinfo->nr_frags * DS_SIZE +
 			ALIGN(*lso_header_size - *hopbyhop + 4, DS_SIZE);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
index 4b6f0d1ea59a..cc5cb3010e64 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
@@ -458,7 +458,7 @@ bool mlx5e_ktls_handle_tx_skb(struct net_device *netdev, struct mlx5e_txqsq *sq,
 	int datalen;
 	u32 seq;
 
-	datalen = skb->len - (skb_transport_offset(skb) + tcp_hdrlen(skb));
+	datalen = skb->len - skb_tcp_all_headers(skb);
 	if (!datalen)
 		return true;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 50d14cec4894..64d78fd99c6e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -152,14 +152,14 @@ mlx5e_tx_get_gso_ihs(struct mlx5e_txqsq *sq, struct sk_buff *skb, int *hopbyhop)
 
 	*hopbyhop = 0;
 	if (skb->encapsulation) {
-		ihs = skb_inner_transport_offset(skb) + inner_tcp_hdrlen(skb);
+		ihs = skb_tcp_all_headers(skb);
 		stats->tso_inner_packets++;
 		stats->tso_inner_bytes += skb->len - ihs;
 	} else {
 		if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
 			ihs = skb_transport_offset(skb) + sizeof(struct udphdr);
 		} else {
-			ihs = skb_transport_offset(skb) + tcp_hdrlen(skb);
+			ihs = skb_tcp_all_headers(skb);
 			if (ipv6_has_hopopt_jumbo(skb)) {
 				*hopbyhop = sizeof(struct hop_jumbo_hdr);
 				ihs -= sizeof(struct hop_jumbo_hdr);
diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
index 61497c3e4cfb..971dde8c3286 100644
--- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
+++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
@@ -2692,7 +2692,7 @@ again:
 		 * send loop that we are still in the
 		 * header portion of the TSO packet.
 		 * TSO header can be at most 1KB long */
-		cum_len = -(skb_transport_offset(skb) + tcp_hdrlen(skb));
+		cum_len = -skb_tcp_all_headers(skb);
 
 		/* for IPv6 TSO, the checksum offset stores the
 		 * TCP header length, to save the firmware from
diff --git a/drivers/net/ethernet/netronome/nfp/nfd3/dp.c b/drivers/net/ethernet/netronome/nfp/nfd3/dp.c
index f9410d59146d..8a9d36619659 100644
--- a/drivers/net/ethernet/netronome/nfp/nfd3/dp.c
+++ b/drivers/net/ethernet/netronome/nfp/nfd3/dp.c
@@ -81,12 +81,11 @@ nfp_nfd3_tx_tso(struct nfp_net_r_vector *r_vec, struct nfp_nfd3_tx_buf *txbuf,
 	if (!skb->encapsulation) {
 		l3_offset = skb_network_offset(skb);
 		l4_offset = skb_transport_offset(skb);
-		hdrlen = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		hdrlen = skb_tcp_all_headers(skb);
 	} else {
 		l3_offset = skb_inner_network_offset(skb);
 		l4_offset = skb_inner_transport_offset(skb);
-		hdrlen = skb_inner_transport_header(skb) - skb->data +
-			inner_tcp_hdrlen(skb);
+		hdrlen = skb_inner_tcp_all_headers(skb);
 	}
 
 	txbuf->pkt_cnt = skb_shinfo(skb)->gso_segs;
diff --git a/drivers/net/ethernet/netronome/nfp/nfdk/dp.c b/drivers/net/ethernet/netronome/nfp/nfdk/dp.c
index 300637e576a8..85dd376a6adc 100644
--- a/drivers/net/ethernet/netronome/nfp/nfdk/dp.c
+++ b/drivers/net/ethernet/netronome/nfp/nfdk/dp.c
@@ -46,12 +46,11 @@ nfp_nfdk_tx_tso(struct nfp_net_r_vector *r_vec, struct nfp_nfdk_tx_buf *txbuf,
 	if (!skb->encapsulation) {
 		l3_offset = skb_network_offset(skb);
 		l4_offset = skb_transport_offset(skb);
-		hdrlen = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		hdrlen = skb_tcp_all_headers(skb);
 	} else {
 		l3_offset = skb_inner_network_offset(skb);
 		l4_offset = skb_inner_transport_offset(skb);
-		hdrlen = skb_inner_transport_header(skb) - skb->data +
-			inner_tcp_hdrlen(skb);
+		hdrlen = skb_inner_tcp_all_headers(skb);
 	}
 
 	segs = skb_shinfo(skb)->gso_segs;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index bcdd5ab0da5a..3e2ebe42e679 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -598,7 +598,7 @@ nfp_net_tls_tx(struct nfp_net_dp *dp, struct nfp_net_r_vector *r_vec,
 	if (!skb->sk || !tls_is_sk_tx_device_offloaded(skb->sk))
 		return skb;
 
-	datalen = skb->len - (skb_transport_offset(skb) + tcp_hdrlen(skb));
+	datalen = skb->len - skb_tcp_all_headers(skb);
 	seq = ntohl(tcp_hdr(skb)->seq);
 	ntls = tls_driver_ctx(skb->sk, TLS_OFFLOAD_CTX_DIR_TX);
 	resync_pending = tls_offload_tx_resync_pending(skb->sk);
@@ -666,7 +666,7 @@ void nfp_net_tls_tx_undo(struct sk_buff *skb, u64 tls_handle)
 	if (WARN_ON_ONCE(!skb->sk || !tls_is_sk_tx_device_offloaded(skb->sk)))
 		return;
 
-	datalen = skb->len - (skb_transport_offset(skb) + tcp_hdrlen(skb));
+	datalen = skb->len - skb_tcp_all_headers(skb);
 	seq = ntohl(tcp_hdr(skb)->seq);
 
 	ntls = tls_driver_ctx(skb->sk, TLS_OFFLOAD_CTX_DIR_TX);
@@ -1758,8 +1758,7 @@ nfp_net_features_check(struct sk_buff *skb, struct net_device *dev,
 	if (skb_is_gso(skb)) {
 		u32 hdrlen;
 
-		hdrlen = skb_inner_transport_header(skb) - skb->data +
-			inner_tcp_hdrlen(skb);
+		hdrlen = skb_inner_tcp_all_headers(skb);
 
 		/* Assume worst case scenario of having longest possible
 		 * metadata prepend - 8B
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c
index f54035455ad6..c03986bf2628 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c
@@ -947,10 +947,9 @@ static int ionic_tx_tso(struct ionic_queue *q, struct sk_buff *skb)
 	}
 
 	if (encap)
-		hdrlen = skb_inner_transport_header(skb) - skb->data +
-			 inner_tcp_hdrlen(skb);
+		hdrlen = skb_inner_tcp_all_headers(skb);
 	else
-		hdrlen = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		hdrlen = skb_tcp_all_headers(skb);
 
 	tso_rem = len;
 	seg_rem = min(tso_rem, hdrlen + mss);
diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
index 07dd3c3b1771..4e6f00af17d9 100644
--- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
+++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
@@ -1877,7 +1877,7 @@ netxen_tso_check(struct net_device *netdev,
 	if ((netdev->features & (NETIF_F_TSO | NETIF_F_TSO6)) &&
 			skb_shinfo(skb)->gso_size > 0) {
 
-		hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		hdr_len = skb_tcp_all_headers(skb);
 
 		first_desc->mss = cpu_to_le16(skb_shinfo(skb)->gso_size);
 		first_desc->total_hdr_length = hdr_len;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c
index b7cc36589f59..7c2af482192d 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_fp.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c
@@ -260,11 +260,9 @@ static int map_frag_to_bd(struct qede_tx_queue *txq,
 static u16 qede_get_skb_hlen(struct sk_buff *skb, bool is_encap_pkt)
 {
 	if (is_encap_pkt)
-		return (skb_inner_transport_header(skb) +
-			inner_tcp_hdrlen(skb) - skb->data);
-	else
-		return (skb_transport_header(skb) +
-			tcp_hdrlen(skb) - skb->data);
+		return skb_inner_tcp_all_headers(skb);
+
+	return skb_tcp_all_headers(skb);
 }
 
 /* +2 for 1st BD for headers and 2nd BD for headlen (if required) */
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c
index 8d43ca282956..9da5e97f8a0a 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c
@@ -497,7 +497,7 @@ set_flags:
 	}
 	opcode = QLCNIC_TX_ETHER_PKT;
 	if (skb_is_gso(skb)) {
-		hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		hdr_len = skb_tcp_all_headers(skb);
 		first_desc->mss = cpu_to_le16(skb_shinfo(skb)->gso_size);
 		first_desc->hdr_length = hdr_len;
 		opcode = (protocol == ETH_P_IPV6) ? QLCNIC_TX_TCP_LSO6 :
diff --git a/drivers/net/ethernet/qualcomm/emac/emac-mac.c b/drivers/net/ethernet/qualcomm/emac/emac-mac.c
index 80c95c331c82..0d80447d4d3b 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac-mac.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac-mac.c
@@ -1264,7 +1264,7 @@ static int emac_tso_csum(struct emac_adapter *adpt,
 				pskb_trim(skb, pkt_len);
 		}
 
-		hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		hdr_len = skb_tcp_all_headers(skb);
 		if (unlikely(skb->len == hdr_len)) {
 			/* we only need to do csum */
 			netif_warn(adpt, tx_err, adpt->netdev,
@@ -1339,7 +1339,7 @@ static void emac_tx_fill_tpd(struct emac_adapter *adpt,
 
 	/* if Large Segment Offload is (in TCP Segmentation Offload struct) */
 	if (TPD_LSO(tpd)) {
-		mapped_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		mapped_len = skb_tcp_all_headers(skb);
 
 		tpbuf = GET_TPD_BUFFER(tx_q, tx_q->tpd.produce_idx);
 		tpbuf->length = mapped_len;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index fe263cad8248..6f14b00c0b14 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3961,7 +3961,7 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
 		proto_hdr_len = skb_transport_offset(skb) + sizeof(struct udphdr);
 		hdr = sizeof(struct udphdr);
 	} else {
-		proto_hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		proto_hdr_len = skb_tcp_all_headers(skb);
 		hdr = tcp_hdrlen(skb);
 	}
 
diff --git a/drivers/net/ethernet/synopsys/dwc-xlgmac-net.c b/drivers/net/ethernet/synopsys/dwc-xlgmac-net.c
index d435519236e4..e54ce73396ee 100644
--- a/drivers/net/ethernet/synopsys/dwc-xlgmac-net.c
+++ b/drivers/net/ethernet/synopsys/dwc-xlgmac-net.c
@@ -81,7 +81,7 @@ static int xlgmac_prep_tso(struct sk_buff *skb,
 	if (ret)
 		return ret;
 
-	pkt_info->header_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	pkt_info->header_len = skb_tcp_all_headers(skb);
 	pkt_info->tcp_header_len = tcp_hdrlen(skb);
 	pkt_info->tcp_payload_len = skb->len - pkt_info->header_len;
 	pkt_info->mss = skb_shinfo(skb)->gso_size;
diff --git a/drivers/net/wireless/ath/wil6210/txrx.c b/drivers/net/wireless/ath/wil6210/txrx.c
index 5704defd7be1..237cbd5c5060 100644
--- a/drivers/net/wireless/ath/wil6210/txrx.c
+++ b/drivers/net/wireless/ath/wil6210/txrx.c
@@ -1782,9 +1782,7 @@ static int __wil_tx_vring_tso(struct wil6210_priv *wil, struct wil6210_vif *vif,
 	}
 
 	/* Header Length = MAC header len + IP header len + TCP header len*/
-	hdrlen = ETH_HLEN +
-		(int)skb_network_header_len(skb) +
-		tcp_hdrlen(skb);
+	hdrlen = skb_tcp_all_headers(skb);
 
 	gso_type = skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV6 | SKB_GSO_TCPV4);
 	switch (gso_type) {
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index fc61a4418737..a256695fc89e 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -1201,9 +1201,7 @@ static int xenvif_tx_submit(struct xenvif_queue *queue)
 			}
 
 			mss = skb_shinfo(skb)->gso_size;
-			hdrlen = skb_transport_header(skb) -
-				skb_mac_header(skb) +
-				tcp_hdrlen(skb);
+			hdrlen = skb_tcp_all_headers(skb);
 
 			skb_shinfo(skb)->gso_segs =
 				DIV_ROUND_UP(skb->len - hdrlen, mss);
diff --git a/drivers/staging/qlge/qlge_main.c b/drivers/staging/qlge/qlge_main.c
index 113a3efd12e9..6cd7fc9589c3 100644
--- a/drivers/staging/qlge/qlge_main.c
+++ b/drivers/staging/qlge/qlge_main.c
@@ -2461,7 +2461,7 @@ static int qlge_tso(struct sk_buff *skb, struct qlge_ob_mac_tso_iocb_req *mac_io
 		mac_iocb_ptr->flags3 |= OB_MAC_TSO_IOCB_IC;
 		mac_iocb_ptr->frame_len = cpu_to_le32((u32)skb->len);
 		mac_iocb_ptr->total_hdrs_len =
-			cpu_to_le16(skb_transport_offset(skb) + tcp_hdrlen(skb));
+			cpu_to_le16(skb_tcp_all_headers(skb));
 		mac_iocb_ptr->net_trans_offset =
 			cpu_to_le16(skb_network_offset(skb) |
 				    skb_transport_offset(skb)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 1168302b7927..a9fbe22732c3 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -46,6 +46,36 @@ static inline unsigned int inner_tcp_hdrlen(const struct sk_buff *skb)
 	return inner_tcp_hdr(skb)->doff * 4;
 }
 
+/**
+ * skb_tcp_all_headers - Returns size of all headers for a TCP packet
+ * @skb: buffer
+ *
+ * Used in TX path, for a packet known to be a TCP one.
+ *
+ * if (skb_is_gso(skb)) {
+ *         int hlen = skb_tcp_all_headers(skb);
+ *         ...
+ */
+static inline int skb_tcp_all_headers(const struct sk_buff *skb)
+{
+	return skb_transport_offset(skb) + tcp_hdrlen(skb);
+}
+
+/**
+ * skb_inner_tcp_all_headers - Returns size of all headers for an encap TCP packet
+ * @skb: buffer
+ *
+ * Used in TX path, for a packet known to be a TCP one.
+ *
+ * if (skb_is_gso(skb) && skb->encapsulation) {
+ *         int hlen = skb_inner_tcp_all_headers(skb);
+ *         ...
+ */
+static inline int skb_inner_tcp_all_headers(const struct sk_buff *skb)
+{
+	return skb_inner_transport_offset(skb) + inner_tcp_hdrlen(skb);
+}
+
 static inline unsigned int tcp_optlen(const struct sk_buff *skb)
 {
 	return (tcp_hdr(skb)->doff - 5) * 4;
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
index e40bedd112b6..3bae29ae57ca 100644
--- a/net/tls/tls_device_fallback.c
+++ b/net/tls/tls_device_fallback.c
@@ -232,7 +232,7 @@ static int fill_sg_in(struct scatterlist *sg_in,
 		      s32 *sync_size,
 		      int *resync_sgs)
 {
-	int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	int tcp_payload_offset = skb_tcp_all_headers(skb);
 	int payload_len = skb->len - tcp_payload_offset;
 	u32 tcp_seq = ntohl(tcp_hdr(skb)->seq);
 	struct tls_record_info *record;
@@ -310,8 +310,8 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx,
 				   struct sk_buff *skb,
 				   s32 sync_size, u64 rcd_sn)
 {
-	int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
 	struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
+	int tcp_payload_offset = skb_tcp_all_headers(skb);
 	int payload_len = skb->len - tcp_payload_offset;
 	void *buf, *iv, *aad, *dummy_buf;
 	struct aead_request *aead_req;
@@ -372,7 +372,7 @@ free_nskb:
 
 static struct sk_buff *tls_sw_fallback(struct sock *sk, struct sk_buff *skb)
 {
-	int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	int tcp_payload_offset = skb_tcp_all_headers(skb);
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
 	int payload_len = skb->len - tcp_payload_offset;
-- 
cgit 


From 092f875131dcdbd8f48b2ece9416225a141e656b Mon Sep 17 00:00:00 2001
From: Prasanna Vengateshan <prasanna.vengateshan@microchip.com>
Date: Fri, 1 Jul 2022 20:30:20 +0530
Subject: net: dsa: tag_ksz: add tag handling for Microchip LAN937x

The Microchip LAN937X switches have a tagging protocol which is
very similar to KSZ tagging. So that the implementation is added to
tag_ksz.c and reused common APIs

Signed-off-by: Prasanna Vengateshan <prasanna.vengateshan@microchip.com>
Signed-off-by: Arun Ramadoss <arun.ramadoss@microchip.com>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h |  2 ++
 net/dsa/Kconfig   |  4 ++--
 net/dsa/tag_ksz.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 63 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index ea7bf007f34f..b902b31bebce 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -54,6 +54,7 @@ struct phylink_link_state;
 #define DSA_TAG_PROTO_RTL8_4_VALUE		24
 #define DSA_TAG_PROTO_RTL8_4T_VALUE		25
 #define DSA_TAG_PROTO_RZN1_A5PSW_VALUE		26
+#define DSA_TAG_PROTO_LAN937X_VALUE		27
 
 enum dsa_tag_protocol {
 	DSA_TAG_PROTO_NONE		= DSA_TAG_PROTO_NONE_VALUE,
@@ -83,6 +84,7 @@ enum dsa_tag_protocol {
 	DSA_TAG_PROTO_RTL8_4		= DSA_TAG_PROTO_RTL8_4_VALUE,
 	DSA_TAG_PROTO_RTL8_4T		= DSA_TAG_PROTO_RTL8_4T_VALUE,
 	DSA_TAG_PROTO_RZN1_A5PSW	= DSA_TAG_PROTO_RZN1_A5PSW_VALUE,
+	DSA_TAG_PROTO_LAN937X		= DSA_TAG_PROTO_LAN937X_VALUE,
 };
 
 struct dsa_switch;
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 63853fff4e2f..3eef72ce99a4 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -87,10 +87,10 @@ config NET_DSA_TAG_MTK
 	  Mediatek switches.
 
 config NET_DSA_TAG_KSZ
-	tristate "Tag driver for Microchip 8795/9477/9893 families of switches"
+	tristate "Tag driver for Microchip 8795/937x/9477/9893 families of switches"
 	help
 	  Say Y if you want to enable support for tagging frames for the
-	  Microchip 8795/9477/9893 families of switches.
+	  Microchip 8795/937x/9477/9893 families of switches.
 
 config NET_DSA_TAG_OCELOT
 	tristate "Tag driver for Ocelot family of switches, using NPI port"
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index 3509fc967ca9..38fa19c1e2d5 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -193,10 +193,69 @@ static const struct dsa_device_ops ksz9893_netdev_ops = {
 DSA_TAG_DRIVER(ksz9893_netdev_ops);
 MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9893);
 
+/* For xmit, 2 bytes are added before FCS.
+ * ---------------------------------------------------------------------------
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes)
+ * ---------------------------------------------------------------------------
+ * tag0 : represents tag override, lookup and valid
+ * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x80=port8)
+ *
+ * For rcv, 1 byte is added before FCS.
+ * ---------------------------------------------------------------------------
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes)
+ * ---------------------------------------------------------------------------
+ * tag0 : zero-based value represents port
+ *	  (eg, 0x00=port1, 0x02=port3, 0x07=port8)
+ */
+#define LAN937X_EGRESS_TAG_LEN		2
+
+#define LAN937X_TAIL_TAG_BLOCKING_OVERRIDE	BIT(11)
+#define LAN937X_TAIL_TAG_LOOKUP			BIT(12)
+#define LAN937X_TAIL_TAG_VALID			BIT(13)
+#define LAN937X_TAIL_TAG_PORT_MASK		7
+
+static struct sk_buff *lan937x_xmit(struct sk_buff *skb,
+				    struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_slave_to_port(dev);
+	const struct ethhdr *hdr = eth_hdr(skb);
+	__be16 *tag;
+	u16 val;
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+		return NULL;
+
+	tag = skb_put(skb, LAN937X_EGRESS_TAG_LEN);
+
+	val = BIT(dp->index);
+
+	if (is_link_local_ether_addr(hdr->h_dest))
+		val |= LAN937X_TAIL_TAG_BLOCKING_OVERRIDE;
+
+	/* Tail tag valid bit - This bit should always be set by the CPU */
+	val |= LAN937X_TAIL_TAG_VALID;
+
+	put_unaligned_be16(val, tag);
+
+	return skb;
+}
+
+static const struct dsa_device_ops lan937x_netdev_ops = {
+	.name	= "lan937x",
+	.proto	= DSA_TAG_PROTO_LAN937X,
+	.xmit	= lan937x_xmit,
+	.rcv	= ksz9477_rcv,
+	.needed_tailroom = LAN937X_EGRESS_TAG_LEN,
+};
+
+DSA_TAG_DRIVER(lan937x_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_LAN937X);
+
 static struct dsa_tag_driver *dsa_tag_driver_array[] = {
 	&DSA_TAG_DRIVER_NAME(ksz8795_netdev_ops),
 	&DSA_TAG_DRIVER_NAME(ksz9477_netdev_ops),
 	&DSA_TAG_DRIVER_NAME(ksz9893_netdev_ops),
+	&DSA_TAG_DRIVER_NAME(lan937x_netdev_ops),
 };
 
 module_dsa_tag_drivers(dsa_tag_driver_array);
-- 
cgit 


From f019679ea5f2ab650c3348a79e7d9c3625f62899 Mon Sep 17 00:00:00 2001
From: Chris Mi <cmi@nvidia.com>
Date: Mon, 30 May 2022 06:07:57 +0300
Subject: net/mlx5: E-switch, Remove dependency between sriov and eswitch mode

Currently, there are three eswitch modes, none, legacy and switchdev.
None is the default mode. Remove redundant none mode as eswitch mode
should always be either legacy mode or switchdev mode.

With this patch, there are two behavior changes:

1. Legacy becomes the default mode. When querying eswitch mode using
   devlink, a valid mode is always returned.
2. When disabling sriov, the eswitch mode will not change, only vfs
   are unloaded.

Signed-off-by: Chris Mi <cmi@nvidia.com>
Reviewed-by: Maor Dickman <maord@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  | 87 ++++++++++++++--------
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |  9 ++-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 52 ++++---------
 drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c  | 10 +--
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |  1 +
 .../net/ethernet/mellanox/mlx5/core/sf/devlink.c   |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/sriov.c    |  3 +-
 include/linux/mlx5/eswitch.h                       |  8 +-
 8 files changed, 89 insertions(+), 83 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index b9a3473f5672..82b62ab1d1d9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1152,8 +1152,6 @@ mlx5_eswitch_update_num_of_vfs(struct mlx5_eswitch *esw, int num_vfs)
 {
 	const u32 *out;
 
-	WARN_ON_ONCE(esw->mode != MLX5_ESWITCH_NONE);
-
 	if (num_vfs < 0)
 		return;
 
@@ -1287,7 +1285,7 @@ int mlx5_eswitch_enable_locked(struct mlx5_eswitch *esw, int mode, int num_vfs)
 	return 0;
 
 abort:
-	esw->mode = MLX5_ESWITCH_NONE;
+	esw->mode = MLX5_ESWITCH_LEGACY;
 
 	if (mode == MLX5_ESWITCH_OFFLOADS)
 		mlx5_rescan_drivers(esw->dev);
@@ -1312,13 +1310,13 @@ int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs)
 	if (!mlx5_esw_allowed(esw))
 		return 0;
 
-	toggle_lag = esw->mode == MLX5_ESWITCH_NONE;
+	toggle_lag = !mlx5_esw_is_fdb_created(esw);
 
 	if (toggle_lag)
 		mlx5_lag_disable_change(esw->dev);
 
 	down_write(&esw->mode_lock);
-	if (esw->mode == MLX5_ESWITCH_NONE) {
+	if (!mlx5_esw_is_fdb_created(esw)) {
 		ret = mlx5_eswitch_enable_locked(esw, MLX5_ESWITCH_LEGACY, num_vfs);
 	} else {
 		enum mlx5_eswitch_vport_event vport_events;
@@ -1337,56 +1335,79 @@ int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs)
 	return ret;
 }
 
-void mlx5_eswitch_disable_locked(struct mlx5_eswitch *esw, bool clear_vf)
+/* When disabling sriov, free driver level resources. */
+void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw, bool clear_vf)
 {
-	struct devlink *devlink = priv_to_devlink(esw->dev);
-	int old_mode;
-
-	lockdep_assert_held_write(&esw->mode_lock);
-
-	if (esw->mode == MLX5_ESWITCH_NONE)
+	if (!mlx5_esw_allowed(esw))
 		return;
 
-	esw_info(esw->dev, "Disable: mode(%s), nvfs(%d), active vports(%d)\n",
+	down_write(&esw->mode_lock);
+	/* If driver is unloaded, this function is called twice by remove_one()
+	 * and mlx5_unload(). Prevent the second call.
+	 */
+	if (!esw->esw_funcs.num_vfs && !clear_vf)
+		goto unlock;
+
+	esw_info(esw->dev, "Unload vfs: mode(%s), nvfs(%d), active vports(%d)\n",
 		 esw->mode == MLX5_ESWITCH_LEGACY ? "LEGACY" : "OFFLOADS",
 		 esw->esw_funcs.num_vfs, esw->enabled_vports);
 
+	mlx5_eswitch_unload_vf_vports(esw, esw->esw_funcs.num_vfs);
+	if (clear_vf)
+		mlx5_eswitch_clear_vf_vports_info(esw);
+	/* If disabling sriov in switchdev mode, free meta rules here
+	 * because it depends on num_vfs.
+	 */
+	if (esw->mode == MLX5_ESWITCH_OFFLOADS) {
+		struct devlink *devlink = priv_to_devlink(esw->dev);
+
+		esw_offloads_del_send_to_vport_meta_rules(esw);
+		devlink_rate_nodes_destroy(devlink);
+	}
+
+	esw->esw_funcs.num_vfs = 0;
+
+unlock:
+	up_write(&esw->mode_lock);
+}
+
+/* Free resources for corresponding eswitch mode. It is called by devlink
+ * when changing eswitch mode or modprobe when unloading driver.
+ */
+void mlx5_eswitch_disable_locked(struct mlx5_eswitch *esw)
+{
+	struct devlink *devlink = priv_to_devlink(esw->dev);
+
 	/* Notify eswitch users that it is exiting from current mode.
 	 * So that it can do necessary cleanup before the eswitch is disabled.
 	 */
-	mlx5_esw_mode_change_notify(esw, MLX5_ESWITCH_NONE);
+	mlx5_esw_mode_change_notify(esw, MLX5_ESWITCH_LEGACY);
 
 	mlx5_eswitch_event_handlers_unregister(esw);
 
+	esw_info(esw->dev, "Disable: mode(%s), nvfs(%d), active vports(%d)\n",
+		 esw->mode == MLX5_ESWITCH_LEGACY ? "LEGACY" : "OFFLOADS",
+		 esw->esw_funcs.num_vfs, esw->enabled_vports);
+
 	esw->fdb_table.flags &= ~MLX5_ESW_FDB_CREATED;
-	if (esw->mode == MLX5_ESWITCH_LEGACY)
-		esw_legacy_disable(esw);
-	else if (esw->mode == MLX5_ESWITCH_OFFLOADS)
+	if (esw->mode == MLX5_ESWITCH_OFFLOADS)
 		esw_offloads_disable(esw);
-
-	old_mode = esw->mode;
-	esw->mode = MLX5_ESWITCH_NONE;
-
-	if (old_mode == MLX5_ESWITCH_OFFLOADS)
-		mlx5_rescan_drivers(esw->dev);
-
-	devlink_rate_nodes_destroy(devlink);
-
+	else if (esw->mode == MLX5_ESWITCH_LEGACY)
+		esw_legacy_disable(esw);
 	mlx5_esw_acls_ns_cleanup(esw);
 
-	if (clear_vf)
-		mlx5_eswitch_clear_vf_vports_info(esw);
+	if (esw->mode == MLX5_ESWITCH_OFFLOADS)
+		devlink_rate_nodes_destroy(devlink);
 }
 
-void mlx5_eswitch_disable(struct mlx5_eswitch *esw, bool clear_vf)
+void mlx5_eswitch_disable(struct mlx5_eswitch *esw)
 {
 	if (!mlx5_esw_allowed(esw))
 		return;
 
 	mlx5_lag_disable_change(esw->dev);
 	down_write(&esw->mode_lock);
-	mlx5_eswitch_disable_locked(esw, clear_vf);
-	esw->esw_funcs.num_vfs = 0;
+	mlx5_eswitch_disable_locked(esw);
 	up_write(&esw->mode_lock);
 	mlx5_lag_enable_change(esw->dev);
 }
@@ -1581,7 +1602,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
 	refcount_set(&esw->qos.refcnt, 0);
 
 	esw->enabled_vports = 0;
-	esw->mode = MLX5_ESWITCH_NONE;
+	esw->mode = MLX5_ESWITCH_LEGACY;
 	esw->offloads.inline_mode = MLX5_INLINE_MODE_NONE;
 	if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, reformat) &&
 	    MLX5_CAP_ESW_FLOWTABLE_FDB(dev, decap))
@@ -1883,7 +1904,7 @@ u8 mlx5_eswitch_mode(const struct mlx5_core_dev *dev)
 {
 	struct mlx5_eswitch *esw = dev->priv.eswitch;
 
-	return mlx5_esw_allowed(esw) ? esw->mode : MLX5_ESWITCH_NONE;
+	return mlx5_esw_allowed(esw) ? esw->mode : MLX5_ESWITCH_LEGACY;
 }
 EXPORT_SYMBOL_GPL(mlx5_eswitch_mode);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index a9ba0e324834..5452437e531f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -342,6 +342,7 @@ void esw_offloads_disable(struct mlx5_eswitch *esw);
 int esw_offloads_enable(struct mlx5_eswitch *esw);
 void esw_offloads_cleanup_reps(struct mlx5_eswitch *esw);
 int esw_offloads_init_reps(struct mlx5_eswitch *esw);
+void esw_offloads_del_send_to_vport_meta_rules(struct mlx5_eswitch *esw);
 
 bool mlx5_esw_vport_match_metadata_supported(const struct mlx5_eswitch *esw);
 int mlx5_esw_offloads_vport_metadata_set(struct mlx5_eswitch *esw, bool enable);
@@ -357,8 +358,9 @@ void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw);
 #define MLX5_ESWITCH_IGNORE_NUM_VFS (-1)
 int mlx5_eswitch_enable_locked(struct mlx5_eswitch *esw, int mode, int num_vfs);
 int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs);
-void mlx5_eswitch_disable_locked(struct mlx5_eswitch *esw, bool clear_vf);
-void mlx5_eswitch_disable(struct mlx5_eswitch *esw, bool clear_vf);
+void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw, bool clear_vf);
+void mlx5_eswitch_disable_locked(struct mlx5_eswitch *esw);
+void mlx5_eswitch_disable(struct mlx5_eswitch *esw);
 int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw,
 			       u16 vport, const u8 *mac);
 int mlx5_eswitch_set_vport_state(struct mlx5_eswitch *esw,
@@ -729,7 +731,8 @@ int mlx5_eswitch_reload_reps(struct mlx5_eswitch *esw);
 static inline int  mlx5_eswitch_init(struct mlx5_core_dev *dev) { return 0; }
 static inline void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) {}
 static inline int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs) { return 0; }
-static inline void mlx5_eswitch_disable(struct mlx5_eswitch *esw, bool clear_vf) {}
+static inline void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw, bool clear_vf) {}
+static inline void mlx5_eswitch_disable(struct mlx5_eswitch *esw) {}
 static inline bool mlx5_eswitch_is_funcs_handler(struct mlx5_core_dev *dev) { return false; }
 static inline
 int mlx5_eswitch_set_vport_state(struct mlx5_eswitch *esw, u16 vport, int link_state) { return 0; }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 2ce3728576d1..7c51011aed2b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -1040,6 +1040,15 @@ static void mlx5_eswitch_del_send_to_vport_meta_rules(struct mlx5_eswitch *esw)
 		mlx5_del_flow_rules(flows[i]);
 
 	kvfree(flows);
+	/* If changing eswitch mode from switchdev to legacy, but num_vfs is not 0,
+	 * meta rules could be freed again. So set it to NULL.
+	 */
+	esw->fdb_table.offloads.send_to_vport_meta_rules = NULL;
+}
+
+void esw_offloads_del_send_to_vport_meta_rules(struct mlx5_eswitch *esw)
+{
+	mlx5_eswitch_del_send_to_vport_meta_rules(esw);
 }
 
 static int
@@ -2034,7 +2043,7 @@ static int mlx5_eswitch_inline_mode_get(struct mlx5_eswitch *esw, u8 *mode)
 	if (!MLX5_CAP_GEN(dev, vport_group_manager))
 		return -EOPNOTSUPP;
 
-	if (esw->mode == MLX5_ESWITCH_NONE)
+	if (!mlx5_esw_is_fdb_created(esw))
 		return -EOPNOTSUPP;
 
 	switch (MLX5_CAP_ETH(dev, wqe_inline_mode)) {
@@ -2170,7 +2179,6 @@ static int esw_offloads_start(struct mlx5_eswitch *esw,
 {
 	int err, err1;
 
-	mlx5_eswitch_disable_locked(esw, false);
 	err = mlx5_eswitch_enable_locked(esw, MLX5_ESWITCH_OFFLOADS,
 					 esw->dev->priv.sriov.num_vfs);
 	if (err) {
@@ -2894,7 +2902,7 @@ int mlx5_esw_offloads_vport_metadata_set(struct mlx5_eswitch *esw, bool enable)
 	int err = 0;
 
 	down_write(&esw->mode_lock);
-	if (esw->mode != MLX5_ESWITCH_NONE) {
+	if (mlx5_esw_is_fdb_created(esw)) {
 		err = -EBUSY;
 		goto done;
 	}
@@ -3229,7 +3237,6 @@ static int esw_offloads_stop(struct mlx5_eswitch *esw,
 {
 	int err, err1;
 
-	mlx5_eswitch_disable_locked(esw, false);
 	err = mlx5_eswitch_enable_locked(esw, MLX5_ESWITCH_LEGACY,
 					 MLX5_ESWITCH_IGNORE_NUM_VFS);
 	if (err) {
@@ -3334,15 +3341,6 @@ static int esw_inline_mode_to_devlink(u8 mlx5_mode, u8 *mode)
 	return 0;
 }
 
-static int eswitch_devlink_esw_mode_check(const struct mlx5_eswitch *esw)
-{
-	/* devlink commands in NONE eswitch mode are currently supported only
-	 * on ECPF.
-	 */
-	return (esw->mode == MLX5_ESWITCH_NONE &&
-		!mlx5_core_is_ecpf_esw_manager(esw->dev)) ? -EOPNOTSUPP : 0;
-}
-
 /* FIXME: devl_unlock() followed by devl_lock() inside driver callback
  * is never correct and prone to races. It's a transitional workaround,
  * never repeat this pattern.
@@ -3399,6 +3397,7 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
 	if (cur_mlx5_mode == mlx5_mode)
 		goto unlock;
 
+	mlx5_eswitch_disable_locked(esw);
 	if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV) {
 		if (mlx5_devlink_trap_get_num_active(esw->dev)) {
 			NL_SET_ERR_MSG_MOD(extack,
@@ -3409,6 +3408,7 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
 		err = esw_offloads_start(esw, extack);
 	} else if (mode == DEVLINK_ESWITCH_MODE_LEGACY) {
 		err = esw_offloads_stop(esw, extack);
+		mlx5_rescan_drivers(esw->dev);
 	} else {
 		err = -EINVAL;
 	}
@@ -3431,12 +3431,7 @@ int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode)
 		return PTR_ERR(esw);
 
 	mlx5_eswtich_mode_callback_enter(devlink, esw);
-	err = eswitch_devlink_esw_mode_check(esw);
-	if (err)
-		goto unlock;
-
 	err = esw_mode_to_devlink(esw->mode, mode);
-unlock:
 	mlx5_eswtich_mode_callback_exit(devlink, esw);
 	return err;
 }
@@ -3485,9 +3480,6 @@ int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode,
 		return PTR_ERR(esw);
 
 	mlx5_eswtich_mode_callback_enter(devlink, esw);
-	err = eswitch_devlink_esw_mode_check(esw);
-	if (err)
-		goto out;
 
 	switch (MLX5_CAP_ETH(dev, wqe_inline_mode)) {
 	case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
@@ -3539,12 +3531,7 @@ int mlx5_devlink_eswitch_inline_mode_get(struct devlink *devlink, u8 *mode)
 		return PTR_ERR(esw);
 
 	mlx5_eswtich_mode_callback_enter(devlink, esw);
-	err = eswitch_devlink_esw_mode_check(esw);
-	if (err)
-		goto unlock;
-
 	err = esw_inline_mode_to_devlink(esw->offloads.inline_mode, mode);
-unlock:
 	mlx5_eswtich_mode_callback_exit(devlink, esw);
 	return err;
 }
@@ -3555,16 +3542,13 @@ int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink,
 {
 	struct mlx5_core_dev *dev = devlink_priv(devlink);
 	struct mlx5_eswitch *esw;
-	int err;
+	int err = 0;
 
 	esw = mlx5_devlink_eswitch_get(devlink);
 	if (IS_ERR(esw))
 		return PTR_ERR(esw);
 
 	mlx5_eswtich_mode_callback_enter(devlink, esw);
-	err = eswitch_devlink_esw_mode_check(esw);
-	if (err)
-		goto unlock;
 
 	if (encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE &&
 	    (!MLX5_CAP_ESW_FLOWTABLE_FDB(dev, reformat) ||
@@ -3615,21 +3599,15 @@ int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink,
 					enum devlink_eswitch_encap_mode *encap)
 {
 	struct mlx5_eswitch *esw;
-	int err;
 
 	esw = mlx5_devlink_eswitch_get(devlink);
 	if (IS_ERR(esw))
 		return PTR_ERR(esw);
 
 	mlx5_eswtich_mode_callback_enter(devlink, esw);
-	err = eswitch_devlink_esw_mode_check(esw);
-	if (err)
-		goto unlock;
-
 	*encap = esw->offloads.encap;
-unlock:
 	mlx5_eswtich_mode_callback_exit(devlink, esw);
-	return err;
+	return 0;
 }
 
 static bool
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
index 2a8fc547eb37..641505d2c0c2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
@@ -632,6 +632,7 @@ static int mlx5_deactivate_lag(struct mlx5_lag *ldev)
 static bool mlx5_lag_check_prereq(struct mlx5_lag *ldev)
 {
 #ifdef CONFIG_MLX5_ESWITCH
+	struct mlx5_core_dev *dev;
 	u8 mode;
 #endif
 	int i;
@@ -641,11 +642,11 @@ static bool mlx5_lag_check_prereq(struct mlx5_lag *ldev)
 			return false;
 
 #ifdef CONFIG_MLX5_ESWITCH
-	mode = mlx5_eswitch_mode(ldev->pf[MLX5_LAG_P1].dev);
-
-	if (mode != MLX5_ESWITCH_NONE && mode != MLX5_ESWITCH_OFFLOADS)
+	dev = ldev->pf[MLX5_LAG_P1].dev;
+	if ((mlx5_sriov_is_enabled(dev)) && !is_mdev_switchdev_mode(dev))
 		return false;
 
+	mode = mlx5_eswitch_mode(dev);
 	for (i = 0; i < ldev->ports; i++)
 		if (mlx5_eswitch_mode(ldev->pf[i].dev) != mode)
 			return false;
@@ -760,8 +761,7 @@ static bool mlx5_lag_is_roce_lag(struct mlx5_lag *ldev)
 
 #ifdef CONFIG_MLX5_ESWITCH
 	for (i = 0; i < ldev->ports; i++)
-		roce_lag = roce_lag &&
-			ldev->pf[i].dev->priv.eswitch->mode == MLX5_ESWITCH_NONE;
+		roce_lag = roce_lag && is_mdev_legacy_mode(ldev->pf[i].dev);
 #endif
 
 	return roce_lag;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 2078d9f03a5f..a9e51c1b7738 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1250,6 +1250,7 @@ static void mlx5_unload(struct mlx5_core_dev *dev)
 {
 	mlx5_sf_dev_table_destroy(dev);
 	mlx5_sriov_detach(dev);
+	mlx5_eswitch_disable(dev->priv.eswitch);
 	mlx5_lag_remove_mdev(dev);
 	mlx5_ec_cleanup(dev);
 	mlx5_sf_hw_table_destroy(dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
index 3be659cd91f1..7d955a4d9f14 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
@@ -501,7 +501,7 @@ static int mlx5_sf_esw_event(struct notifier_block *nb, unsigned long event, voi
 	case MLX5_ESWITCH_OFFLOADS:
 		mlx5_sf_table_enable(table);
 		break;
-	case MLX5_ESWITCH_NONE:
+	case MLX5_ESWITCH_LEGACY:
 		mlx5_sf_table_disable(table);
 		break;
 	default:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
index 2935614f6fa9..5757cd6e1819 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
@@ -145,8 +145,7 @@ mlx5_device_disable_sriov(struct mlx5_core_dev *dev, int num_vfs, bool clear_vf)
 		sriov->vfs_ctx[vf].enabled = 0;
 	}
 
-	if (MLX5_ESWITCH_MANAGER(dev))
-		mlx5_eswitch_disable(dev->priv.eswitch, clear_vf);
+	mlx5_eswitch_disable_sriov(dev->priv.eswitch, clear_vf);
 
 	if (mlx5_wait_for_pages(dev, &dev->priv.vfs_pages))
 		mlx5_core_warn(dev, "timeout reclaiming VFs pages\n");
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 8b18fe9771f9..e2701ed0200e 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -12,7 +12,6 @@
 #define MLX5_ESWITCH_MANAGER(mdev) MLX5_CAP_GEN(mdev, eswitch_manager)
 
 enum {
-	MLX5_ESWITCH_NONE,
 	MLX5_ESWITCH_LEGACY,
 	MLX5_ESWITCH_OFFLOADS
 };
@@ -153,7 +152,7 @@ struct mlx5_core_dev *mlx5_eswitch_get_core_dev(struct mlx5_eswitch *esw);
 
 static inline u8 mlx5_eswitch_mode(const struct mlx5_core_dev *dev)
 {
-	return MLX5_ESWITCH_NONE;
+	return MLX5_ESWITCH_LEGACY;
 }
 
 static inline enum devlink_eswitch_encap_mode
@@ -198,6 +197,11 @@ static inline struct mlx5_core_dev *mlx5_eswitch_get_core_dev(struct mlx5_eswitc
 
 #endif /* CONFIG_MLX5_ESWITCH */
 
+static inline bool is_mdev_legacy_mode(struct mlx5_core_dev *dev)
+{
+	return mlx5_eswitch_mode(dev) == MLX5_ESWITCH_LEGACY;
+}
+
 static inline bool is_mdev_switchdev_mode(struct mlx5_core_dev *dev)
 {
 	return mlx5_eswitch_mode(dev) == MLX5_ESWITCH_OFFLOADS;
-- 
cgit 


From 036bff2800cbcf8217dd0bc93d8421b5b8f72476 Mon Sep 17 00:00:00 2001
From: Dario Binacchi <dario.binacchi@amarulasolutions.com>
Date: Tue, 28 Jun 2022 18:31:28 +0200
Subject: can: netlink: dump bitrate 0 if can_priv::bittiming.bitrate is -1U

Upcoming changes on slcan driver will require you to specify a bitrate
of value -1 to prevent the open_candev() from failing but at the same
time highlighting that it is a fake value. In this case the command
`ip --details -s -s link show' would print 4294967295 as the bitrate
value. The patch change this value in 0.

Link: https://lore.kernel.org/all/20220628163137.413025-5-dario.binacchi@amarulasolutions.com
Suggested-by: Marc Kleine-Budde <mkl@pengutronix.de>
Signed-off-by: Dario Binacchi <dario.binacchi@amarulasolutions.com>
Tested-by: Jeroen Hofstee <jhofstee@victronenergy.com>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/netlink.c | 3 ++-
 include/linux/can/bittiming.h | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/can/dev/netlink.c b/drivers/net/can/dev/netlink.c
index 037824011266..8efa22d9f214 100644
--- a/drivers/net/can/dev/netlink.c
+++ b/drivers/net/can/dev/netlink.c
@@ -511,7 +511,8 @@ static int can_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	if (priv->do_get_state)
 		priv->do_get_state(dev, &state);
 
-	if ((priv->bittiming.bitrate &&
+	if ((priv->bittiming.bitrate != CAN_BITRATE_UNSET &&
+	     priv->bittiming.bitrate != CAN_BITRATE_UNKNOWN &&
 	     nla_put(skb, IFLA_CAN_BITTIMING,
 		     sizeof(priv->bittiming), &priv->bittiming)) ||
 
diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h
index 7ae21c0f7f23..ef0a77173e3c 100644
--- a/include/linux/can/bittiming.h
+++ b/include/linux/can/bittiming.h
@@ -11,6 +11,8 @@
 
 #define CAN_SYNC_SEG 1
 
+#define CAN_BITRATE_UNSET 0
+#define CAN_BITRATE_UNKNOWN (-1U)
 
 #define CAN_CTRLMODE_TDC_MASK					\
 	(CAN_CTRLMODE_TDC_AUTO | CAN_CTRLMODE_TDC_MANUAL)
-- 
cgit 


From cffe57bee62b155c08d71218fc0e9e84a0a90bbb Mon Sep 17 00:00:00 2001
From: Bagas Sanjaya <bagasdotme@gmail.com>
Date: Wed, 22 Jun 2022 15:45:46 +0700
Subject: Documentation: highmem: use literal block for code example in
 highmem.h comment

When building htmldocs on Linus's tree, there are inline emphasis warnings
on include/linux/highmem.h:

Documentation/vm/highmem:166: ./include/linux/highmem.h:154: WARNING: Inline emphasis start-string without end-string.
Documentation/vm/highmem:166: ./include/linux/highmem.h:157: WARNING: Inline emphasis start-string without end-string.

These warnings above are due to comments in code example at the mentioned
lines above are enclosed by double dash (--), which confuses Sphinx as
inline markup delimiters instead.

Fix these warnings by indenting the code example with literal block
indentation and making the comments C comments.

Link: https://lkml.kernel.org/r/20220622084546.17745-1-bagasdotme@gmail.com
Fixes: 85a85e7601263f ("Documentation/vm: move "Using kmap-atomic" to highmem.h")
Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Tested-by: Ira Weiny <ira.weiny@intel.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: "Fabio M. De Francesco" <fmdefrancesco@gmail.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/highmem.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 3af34de54330..56d6a0196534 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -149,19 +149,19 @@ static inline void *kmap_local_folio(struct folio *folio, size_t offset);
  * It is used in atomic context when code wants to access the contents of a
  * page that might be allocated from high memory (see __GFP_HIGHMEM), for
  * example a page in the pagecache.  The API has two functions, and they
- * can be used in a manner similar to the following:
+ * can be used in a manner similar to the following::
  *
- * -- Find the page of interest. --
- * struct page *page = find_get_page(mapping, offset);
+ *   // Find the page of interest.
+ *   struct page *page = find_get_page(mapping, offset);
  *
- * -- Gain access to the contents of that page. --
- * void *vaddr = kmap_atomic(page);
+ *   // Gain access to the contents of that page.
+ *   void *vaddr = kmap_atomic(page);
  *
- * -- Do something to the contents of that page. --
- * memset(vaddr, 0, PAGE_SIZE);
+ *   // Do something to the contents of that page.
+ *   memset(vaddr, 0, PAGE_SIZE);
  *
- * -- Unmap that page. --
- * kunmap_atomic(vaddr);
+ *   // Unmap that page.
+ *   kunmap_atomic(vaddr);
  *
  * Note that the kunmap_atomic() call takes the result of the kmap_atomic()
  * call, not the argument.
-- 
cgit 


From 507db7927cd181d409dd495c8384b8e14c21c600 Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301@gmail.com>
Date: Sun, 3 Jul 2022 18:08:36 -0700
Subject: mm: rmap: use the correct parameter name for DEFINE_PAGE_VMA_WALK

The parameter used by DEFINE_PAGE_VMA_WALK is _page not page, fix the
parameter name.  It didn't cause any build error, it is probably because
the only caller is write_protect_page() from ksm.c, which pass in page.

Link: https://lkml.kernel.org/r/20220512174551.81279-1-shy828301@gmail.com
Fixes: 2aff7a4755be ("mm: Convert page_vma_mapped_walk to work on PFNs")
Signed-off-by: Yang Shi <shy828301@gmail.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 9ec23138e410..bf80adca980b 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -325,8 +325,8 @@ struct page_vma_mapped_walk {
 #define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags)	\
 	struct page_vma_mapped_walk name = {				\
 		.pfn = page_to_pfn(_page),				\
-		.nr_pages = compound_nr(page),				\
-		.pgoff = page_to_pgoff(page),				\
+		.nr_pages = compound_nr(_page),				\
+		.pgoff = page_to_pgoff(_page),				\
 		.vma = _vma,						\
 		.address = _address,					\
 		.flags = _flags,					\
-- 
cgit 


From c453d8c7d1384d7e1d7f26d3ec0d527092edf801 Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301@gmail.com>
Date: Fri, 13 May 2022 12:17:05 -0700
Subject: mm/page_vma_mapped.c: check possible huge PMD map with
 transhuge_vma_suitable()

IIUC page_vma_mapped_walk() checks if the vma is possibly huge PMD mapped
with transparent_hugepage_active() and "pvmw->nr_pages >= HPAGE_PMD_NR".

Actually pvmw->nr_pages is returned by compound_nr() or folio_nr_pages(),
so the page should be THP as long as "pvmw->nr_pages >= HPAGE_PMD_NR".
And it is guaranteed THP is allocated for valid VMA in the first place.
But it may be not PMD mapped if the VMA is file VMA and it is not properly
aligned.  The transhuge_vma_suitable() is used to do such check, so
replace transparent_hugepage_active() to it, which is too heavy and
overkilling.

Link: https://lkml.kernel.org/r/20220513191705.457775-1-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301@gmail.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 8 ++++++--
 mm/page_vma_mapped.c    | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index de29821231c9..648cb3ce7099 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -117,8 +117,10 @@ extern struct kobj_attribute shmem_enabled_attr;
 extern unsigned long transparent_hugepage_flags;
 
 static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
-		unsigned long haddr)
+		unsigned long addr)
 {
+	unsigned long haddr;
+
 	/* Don't have to check pgoff for anonymous vma */
 	if (!vma_is_anonymous(vma)) {
 		if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
@@ -126,6 +128,8 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
 			return false;
 	}
 
+	haddr = addr & HPAGE_PMD_MASK;
+
 	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
 		return false;
 	return true;
@@ -342,7 +346,7 @@ static inline bool transparent_hugepage_active(struct vm_area_struct *vma)
 }
 
 static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
-		unsigned long haddr)
+		unsigned long addr)
 {
 	return false;
 }
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index c10f839fc410..e971a467fcdf 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -243,7 +243,7 @@ restart:
 			 * cleared *pmd but not decremented compound_mapcount().
 			 */
 			if ((pvmw->flags & PVMW_SYNC) &&
-			    transparent_hugepage_active(vma) &&
+			    transhuge_vma_suitable(vma, pvmw->address) &&
 			    (pvmw->nr_pages >= HPAGE_PMD_NR)) {
 				spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
 
-- 
cgit 


From 7ce82f4c3f3ead13a9d9498768e3b1a79975c4d8 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Mon, 30 May 2022 19:30:15 +0800
Subject: mm/migration: return errno when isolate_huge_page failed

We might fail to isolate huge page due to e.g.  the page is under
migration which cleared HPageMigratable.  We should return errno in this
case rather than always return 1 which could confuse the user, i.e.  the
caller might think all of the memory is migrated while the hugetlb page is
left behind.  We make the prototype of isolate_huge_page consistent with
isolate_lru_page as suggested by Huang Ying and rename isolate_huge_page
to isolate_hugetlb as suggested by Muchun to improve the readability.

Link: https://lkml.kernel.org/r/20220530113016.16663-4-linmiaohe@huawei.com
Fixes: e8db67eb0ded ("mm: migrate: move_pages() supports thp migration")
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Suggested-by: Huang Ying <ying.huang@intel.com>
Reported-by: kernel test robot <lkp@intel.com> (build error)
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h |  6 +++---
 mm/gup.c                |  2 +-
 mm/hugetlb.c            | 11 +++++------
 mm/memory-failure.c     |  2 +-
 mm/memory_hotplug.c     |  2 +-
 mm/mempolicy.c          |  2 +-
 mm/migrate.c            |  7 ++++---
 7 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e4cff27d1198..756b66ff025e 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -170,7 +170,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
 						vm_flags_t vm_flags);
 long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
 						long freed);
-bool isolate_huge_page(struct page *page, struct list_head *list);
+int isolate_hugetlb(struct page *page, struct list_head *list);
 int get_hwpoison_huge_page(struct page *page, bool *hugetlb);
 int get_huge_page_for_hwpoison(unsigned long pfn, int flags);
 void putback_active_hugepage(struct page *page);
@@ -376,9 +376,9 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
 	return NULL;
 }
 
-static inline bool isolate_huge_page(struct page *page, struct list_head *list)
+static inline int isolate_hugetlb(struct page *page, struct list_head *list)
 {
-	return false;
+	return -EBUSY;
 }
 
 static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
diff --git a/mm/gup.c b/mm/gup.c
index 407a81d5ca03..3129b754ade3 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1930,7 +1930,7 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
 		 * Try to move out any movable page before pinning the range.
 		 */
 		if (folio_test_hugetlb(folio)) {
-			if (!isolate_huge_page(&folio->page,
+			if (isolate_hugetlb(&folio->page,
 						&movable_page_list))
 				isolation_error_count++;
 			continue;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b36a4ef87a2e..dd9a46ccb79c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2766,8 +2766,7 @@ retry:
 		 * Fail with -EBUSY if not possible.
 		 */
 		spin_unlock_irq(&hugetlb_lock);
-		if (!isolate_huge_page(old_page, list))
-			ret = -EBUSY;
+		ret = isolate_hugetlb(old_page, list);
 		spin_lock_irq(&hugetlb_lock);
 		goto free_new;
 	} else if (!HPageFreed(old_page)) {
@@ -2843,7 +2842,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
 	if (hstate_is_gigantic(h))
 		return -ENOMEM;
 
-	if (page_count(head) && isolate_huge_page(head, list))
+	if (page_count(head) && !isolate_hugetlb(head, list))
 		ret = 0;
 	else if (!page_count(head))
 		ret = alloc_and_dissolve_huge_page(h, head, list);
@@ -6960,15 +6959,15 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla
 	return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
 }
 
-bool isolate_huge_page(struct page *page, struct list_head *list)
+int isolate_hugetlb(struct page *page, struct list_head *list)
 {
-	bool ret = true;
+	int ret = 0;
 
 	spin_lock_irq(&hugetlb_lock);
 	if (!PageHeadHuge(page) ||
 	    !HPageMigratable(page) ||
 	    !get_page_unless_zero(page)) {
-		ret = false;
+		ret = -EBUSY;
 		goto unlock;
 	}
 	ClearHPageMigratable(page);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index da39ec8afca8..845369f839e1 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -2178,7 +2178,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist)
 	bool lru = PageLRU(page);
 
 	if (PageHuge(page)) {
-		isolated = isolate_huge_page(page, pagelist);
+		isolated = !isolate_hugetlb(page, pagelist);
 	} else {
 		if (lru)
 			isolated = !isolate_lru_page(page);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1f1a730c4499..84990a14d51a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1641,7 +1641,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 
 		if (PageHuge(page)) {
 			pfn = page_to_pfn(head) + compound_nr(head) - 1;
-			isolate_huge_page(head, &source);
+			isolate_hugetlb(head, &source);
 			continue;
 		} else if (PageTransHuge(page))
 			pfn = page_to_pfn(head) + thp_nr_pages(page) - 1;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d39b01fd52fe..9689919a2829 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -602,7 +602,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 	if (flags & (MPOL_MF_MOVE_ALL) ||
 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
-		if (!isolate_huge_page(page, qp->pagelist) &&
+		if (isolate_hugetlb(page, qp->pagelist) &&
 			(flags & MPOL_MF_STRICT))
 			/*
 			 * Failed to isolate page but allow migrating pages
diff --git a/mm/migrate.c b/mm/migrate.c
index c83b3ae2e285..1d036dec1328 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -133,7 +133,7 @@ static void putback_movable_page(struct page *page)
  *
  * This function shall be used whenever the isolated pageset has been
  * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
- * and isolate_huge_page().
+ * and isolate_hugetlb().
  */
 void putback_movable_pages(struct list_head *l)
 {
@@ -1628,8 +1628,9 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
 
 	if (PageHuge(page)) {
 		if (PageHead(page)) {
-			isolate_huge_page(page, pagelist);
-			err = 1;
+			err = isolate_hugetlb(page, pagelist);
+			if (!err)
+				err = 1;
 		}
 	} else {
 		struct page *head;
-- 
cgit 


From ad1ac596e8a8c4b06715dfbd89853eb73c9886b2 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Mon, 30 May 2022 19:30:16 +0800
Subject: mm/migration: fix potential pte_unmap on an not mapped pte

__migration_entry_wait and migration_entry_wait_on_locked assume pte is
always mapped from caller.  But this is not the case when it's called from
migration_entry_wait_huge and follow_huge_pmd.  Add a hugetlbfs variant
that calls hugetlb_migration_entry_wait(ptep == NULL) to fix this issue.

Link: https://lkml.kernel.org/r/20220530113016.16663-5-linmiaohe@huawei.com
Fixes: 30dad30922cc ("mm: migration: add migrate_entry_wait_huge()")
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: kernel test robot <lkp@intel.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swapops.h | 12 ++++++++----
 mm/hugetlb.c            |  4 ++--
 mm/migrate.c            | 23 +++++++++++++++++++----
 3 files changed, 29 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index f24775b41880..bb7afd03a324 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -244,8 +244,10 @@ extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
 					spinlock_t *ptl);
 extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 					unsigned long address);
-extern void migration_entry_wait_huge(struct vm_area_struct *vma,
-		struct mm_struct *mm, pte_t *pte);
+#ifdef CONFIG_HUGETLB_PAGE
+extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl);
+extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte);
+#endif
 #else
 static inline swp_entry_t make_readable_migration_entry(pgoff_t offset)
 {
@@ -271,8 +273,10 @@ static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
 					spinlock_t *ptl) { }
 static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 					 unsigned long address) { }
-static inline void migration_entry_wait_huge(struct vm_area_struct *vma,
-		struct mm_struct *mm, pte_t *pte) { }
+#ifdef CONFIG_HUGETLB_PAGE
+static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { }
+static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { }
+#endif
 static inline int is_writable_migration_entry(swp_entry_t entry)
 {
 	return 0;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dd9a46ccb79c..ed202d29ca46 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5702,7 +5702,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		 */
 		entry = huge_ptep_get(ptep);
 		if (unlikely(is_hugetlb_entry_migration(entry))) {
-			migration_entry_wait_huge(vma, mm, ptep);
+			migration_entry_wait_huge(vma, ptep);
 			return 0;
 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 			return VM_FAULT_HWPOISON_LARGE |
@@ -6927,7 +6927,7 @@ retry:
 	} else {
 		if (is_hugetlb_entry_migration(pte)) {
 			spin_unlock(ptl);
-			__migration_entry_wait(mm, (pte_t *)pmd, ptl);
+			__migration_entry_wait_huge((pte_t *)pmd, ptl);
 			goto retry;
 		}
 		/*
diff --git a/mm/migrate.c b/mm/migrate.c
index 1d036dec1328..7934eebf1689 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -315,13 +315,28 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 	__migration_entry_wait(mm, ptep, ptl);
 }
 
-void migration_entry_wait_huge(struct vm_area_struct *vma,
-		struct mm_struct *mm, pte_t *pte)
+#ifdef CONFIG_HUGETLB_PAGE
+void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl)
 {
-	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
-	__migration_entry_wait(mm, pte, ptl);
+	pte_t pte;
+
+	spin_lock(ptl);
+	pte = huge_ptep_get(ptep);
+
+	if (unlikely(!is_hugetlb_entry_migration(pte)))
+		spin_unlock(ptl);
+	else
+		migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl);
 }
 
+void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte)
+{
+	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte);
+
+	__migration_entry_wait_huge(pte, ptl);
+}
+#endif
+
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
 void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
 {
-- 
cgit 


From c9e124e0382d83d458db204f929002ea98daa6a8 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Mon, 6 Jun 2022 18:23:06 +0000
Subject: mm/damon/{dbgfs,sysfs}: move target_has_pid() from dbgfs to damon.h

The function for knowing if given monitoring context's targets will have
pid or not is defined and used in dbgfs only.  However, the logic is also
needed for sysfs.  This commit moves the code to damon.h and makes both
dbgfs and sysfs to use it.

Link: https://lkml.kernel.org/r/20220606182310.48781-3-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h |  6 ++++++
 mm/damon/dbgfs.c      | 15 +++++----------
 mm/damon/sysfs.c      |  8 +++-----
 3 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 2765c7d99beb..b9aae19fab3e 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -525,6 +525,12 @@ bool damon_is_registered_ops(enum damon_ops_id id);
 int damon_register_ops(struct damon_operations *ops);
 int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id);
 
+static inline bool damon_target_has_pid(const struct damon_ctx *ctx)
+{
+	return ctx->ops.id == DAMON_OPS_VADDR || ctx->ops.id == DAMON_OPS_FVADDR;
+}
+
+
 int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive);
 int damon_stop(struct damon_ctx **ctxs, int nr_ctxs);
 
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index a0dab8b5e45f..5ae810927309 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -275,11 +275,6 @@ out:
 	return ret;
 }
 
-static inline bool target_has_pid(const struct damon_ctx *ctx)
-{
-	return ctx->ops.id == DAMON_OPS_VADDR;
-}
-
 static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len)
 {
 	struct damon_target *t;
@@ -288,7 +283,7 @@ static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len)
 	int rc;
 
 	damon_for_each_target(t, ctx) {
-		if (target_has_pid(ctx))
+		if (damon_target_has_pid(ctx))
 			/* Show pid numbers to debugfs users */
 			id = pid_vnr(t->pid);
 		else
@@ -415,7 +410,7 @@ static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets,
 	struct damon_target *t, *next;
 
 	damon_for_each_target_safe(t, next, ctx) {
-		if (target_has_pid(ctx))
+		if (damon_target_has_pid(ctx))
 			put_pid(t->pid);
 		damon_destroy_target(t);
 	}
@@ -425,11 +420,11 @@ static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets,
 		if (!t) {
 			damon_for_each_target_safe(t, next, ctx)
 				damon_destroy_target(t);
-			if (target_has_pid(ctx))
+			if (damon_target_has_pid(ctx))
 				dbgfs_put_pids(pids, nr_targets);
 			return -ENOMEM;
 		}
-		if (target_has_pid(ctx))
+		if (damon_target_has_pid(ctx))
 			t->pid = pids[i];
 		damon_add_target(ctx, t);
 	}
@@ -722,7 +717,7 @@ static void dbgfs_before_terminate(struct damon_ctx *ctx)
 {
 	struct damon_target *t, *next;
 
-	if (!target_has_pid(ctx))
+	if (!damon_target_has_pid(ctx))
 		return;
 
 	mutex_lock(&ctx->kdamond_lock);
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 09f9e8ca3d1f..8810e6abdb06 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -2136,8 +2136,7 @@ static void damon_sysfs_destroy_targets(struct damon_ctx *ctx)
 	struct damon_target *t, *next;
 
 	damon_for_each_target_safe(t, next, ctx) {
-		if (ctx->ops.id == DAMON_OPS_VADDR ||
-				ctx->ops.id == DAMON_OPS_FVADDR)
+		if (damon_target_has_pid(ctx))
 			put_pid(t->pid);
 		damon_destroy_target(t);
 	}
@@ -2181,8 +2180,7 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target,
 
 	if (!t)
 		return -ENOMEM;
-	if (ctx->ops.id == DAMON_OPS_VADDR ||
-			ctx->ops.id == DAMON_OPS_FVADDR) {
+	if (damon_target_has_pid(ctx)) {
 		t->pid = find_get_pid(sys_target->pid);
 		if (!t->pid)
 			goto destroy_targets_out;
@@ -2210,7 +2208,7 @@ static struct damon_target *damon_sysfs_existing_target(
 	struct pid *pid;
 	struct damon_target *t;
 
-	if (ctx->ops.id == DAMON_OPS_PADDR) {
+	if (!damon_target_has_pid(ctx)) {
 		/* Up to only one target for paddr could exist */
 		damon_for_each_target(t, ctx)
 			return t;
-- 
cgit 


From d9da8f6cf55eeca642c021912af1890002464c64 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Thu, 9 Jun 2022 20:18:46 +0200
Subject: mm: introduce clear_highpage_kasan_tagged

Add a clear_highpage_kasan_tagged() helper that does clear_highpage() on a
page potentially tagged by KASAN.

This helper is used by the following patch.

Link: https://lkml.kernel.org/r/4471979b46b2c487787ddcd08b9dc5fedd1b6ffd.1654798516.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/highmem.h | 10 ++++++++++
 mm/page_alloc.c         |  8 ++------
 2 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index fee9835e3793..22379a63e293 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -243,6 +243,16 @@ static inline void clear_highpage(struct page *page)
 	kunmap_local(kaddr);
 }
 
+static inline void clear_highpage_kasan_tagged(struct page *page)
+{
+	u8 tag;
+
+	tag = page_kasan_tag(page);
+	page_kasan_tag_reset(page);
+	clear_highpage(page);
+	page_kasan_tag_set(page, tag);
+}
+
 #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE
 
 static inline void tag_clear_highpage(struct page *page)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9234863f2488..248469134962 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1302,12 +1302,8 @@ static void kernel_init_pages(struct page *page, int numpages)
 
 	/* s390's use of memset() could override KASAN redzones. */
 	kasan_disable_current();
-	for (i = 0; i < numpages; i++) {
-		u8 tag = page_kasan_tag(page + i);
-		page_kasan_tag_reset(page + i);
-		clear_highpage(page + i);
-		page_kasan_tag_set(page + i, tag);
-	}
+	for (i = 0; i < numpages; i++)
+		clear_highpage_kasan_tagged(page + i);
 	kasan_enable_current();
 }
 
-- 
cgit 


From c15187a4a2d660bf490f7873afd0de5288f65c8f Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Tue, 31 May 2022 20:22:22 -0700
Subject: mm: memcontrol: introduce mem_cgroup_ino() and
 mem_cgroup_get_from_ino()

Patch series "mm: introduce shrinker debugfs interface", v5.

The only existing debugging mechanism is a couple of tracepoints in
do_shrink_slab(): mm_shrink_slab_start and mm_shrink_slab_end.  They
aren't covering everything though: shrinkers which report 0 objects will
never show up, there is no support for memcg-aware shrinkers.  Shrinkers
are identified by their scan function, which is not always enough (e.g.
hard to guess which super block's shrinker it is having only
"super_cache_scan").

To provide a better visibility and debug options for memory shrinkers this
patchset introduces a /sys/kernel/debug/shrinker interface, to some extent
similar to /sys/kernel/slab.

For each shrinker registered in the system a directory is created.  As
now, the directory will contain only a "scan" file, which allows to get
the number of managed objects for each memory cgroup (for memcg-aware
shrinkers) and each numa node (for numa-aware shrinkers on a numa
machine).  Other interfaces might be added in the future.

To make debugging more pleasant, the patchset also names all shrinkers, so
that debugfs entries can have meaningful names.


This patch (of 5):

Shrinker debugfs requires a way to represent memory cgroups without using
full paths, both for displaying information and getting input from a user.

Cgroup inode number is a perfect way, already used by bpf.

This commit adds a couple of helper functions which will be used to handle
memcg-aware shrinkers.

Link: https://lkml.kernel.org/r/20220601032227.4076670-1-roman.gushchin@linux.dev
Link: https://lkml.kernel.org/r/20220601032227.4076670-2-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Muchun Song <songmuchun@bytedance.com>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 21 +++++++++++++++++++++
 mm/memcontrol.c            | 23 +++++++++++++++++++++++
 2 files changed, 44 insertions(+)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 04f2f33607e9..4d31ce55b1c0 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -837,6 +837,15 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
 }
 struct mem_cgroup *mem_cgroup_from_id(unsigned short id);
 
+#ifdef CONFIG_SHRINKER_DEBUG
+static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg)
+{
+	return memcg ? cgroup_ino(memcg->css.cgroup) : 0;
+}
+
+struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino);
+#endif
+
 static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
 {
 	return mem_cgroup_from_css(seq_css(m));
@@ -1343,6 +1352,18 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
 	return NULL;
 }
 
+#ifdef CONFIG_SHRINKER_DEBUG
+static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg)
+{
+	return 0;
+}
+
+static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
+{
+	return NULL;
+}
+#endif
+
 static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
 {
 	return NULL;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 655c09393ad5..1497affe08c4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5088,6 +5088,29 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
 	return idr_find(&mem_cgroup_idr, id);
 }
 
+#ifdef CONFIG_SHRINKER_DEBUG
+struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
+{
+	struct cgroup *cgrp;
+	struct cgroup_subsys_state *css;
+	struct mem_cgroup *memcg;
+
+	cgrp = cgroup_get_from_id(ino);
+	if (!cgrp)
+		return ERR_PTR(-ENOENT);
+
+	css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
+	if (css)
+		memcg = container_of(css, struct mem_cgroup, css);
+	else
+		memcg = ERR_PTR(-ENOENT);
+
+	cgroup_put(cgrp);
+
+	return memcg;
+}
+#endif
+
 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 {
 	struct mem_cgroup_per_node *pn;
-- 
cgit 


From 5035ebc644aec92d55d1bbfe042f35341e4bffb5 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Tue, 31 May 2022 20:22:23 -0700
Subject: mm: shrinkers: introduce debugfs interface for memory shrinkers

This commit introduces the /sys/kernel/debug/shrinker debugfs interface
which provides an ability to observe the state of individual kernel memory
shrinkers.

Because the feature adds some memory overhead (which shouldn't be large
unless there is a huge amount of registered shrinkers), it's guarded by a
config option (enabled by default).

This commit introduces the "count" interface for each shrinker registered
in the system.

The output is in the following format:
<cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
<cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
...

To reduce the size of output on machines with many thousands cgroups, if
the total number of objects on all nodes is 0, the line is omitted.

If the shrinker is not memcg-aware or CONFIG_MEMCG is off, 0 is printed as
cgroup inode id.  If the shrinker is not numa-aware, 0's are printed for
all nodes except the first one.

This commit gives debugfs entries simple numeric names, which are not very
convenient.  The following commit in the series will provide shrinkers
with more meaningful names.

[akpm@linux-foundation.org: remove WARN_ON_ONCE(), per Roman]
  Reported-by: syzbot+300d27c79fe6d4cbcc39@syzkaller.appspotmail.com
Link: https://lkml.kernel.org/r/20220601032227.4076670-3-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
Acked-by: Muchun Song <songmuchun@bytedance.com>
Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Hillf Danton <hdanton@sina.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shrinker.h |  19 +++++-
 lib/Kconfig.debug        |   9 +++
 mm/Makefile              |   1 +
 mm/shrinker_debug.c      | 168 +++++++++++++++++++++++++++++++++++++++++++++++
 mm/vmscan.c              |   6 +-
 5 files changed, 200 insertions(+), 3 deletions(-)
 create mode 100644 mm/shrinker_debug.c

(limited to 'include')

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 76fbf92b04d9..2ced8149c513 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -72,6 +72,10 @@ struct shrinker {
 #ifdef CONFIG_MEMCG
 	/* ID in shrinker_idr */
 	int id;
+#endif
+#ifdef CONFIG_SHRINKER_DEBUG
+	int debugfs_id;
+	struct dentry *debugfs_entry;
 #endif
 	/* objs pending delete, per node */
 	atomic_long_t *nr_deferred;
@@ -94,4 +98,17 @@ extern int register_shrinker(struct shrinker *shrinker);
 extern void unregister_shrinker(struct shrinker *shrinker);
 extern void free_prealloced_shrinker(struct shrinker *shrinker);
 extern void synchronize_shrinkers(void);
-#endif
+
+#ifdef CONFIG_SHRINKER_DEBUG
+extern int shrinker_debugfs_add(struct shrinker *shrinker);
+extern void shrinker_debugfs_remove(struct shrinker *shrinker);
+#else /* CONFIG_SHRINKER_DEBUG */
+static inline int shrinker_debugfs_add(struct shrinker *shrinker)
+{
+	return 0;
+}
+static inline void shrinker_debugfs_remove(struct shrinker *shrinker)
+{
+}
+#endif /* CONFIG_SHRINKER_DEBUG */
+#endif /* _LINUX_SHRINKER_H */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 2e24db4bff19..0b483a8da409 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -699,6 +699,15 @@ config DEBUG_OBJECTS_ENABLE_DEFAULT
 	help
 	  Debug objects boot parameter default value
 
+config SHRINKER_DEBUG
+	default y
+	bool "Enable shrinker debugging support"
+	depends on DEBUG_FS
+	help
+	  Say Y to enable the shrinker debugfs interface which provides
+	  visibility into the kernel memory shrinkers subsystem.
+	  Disable it to avoid an extra memory footprint.
+
 config HAVE_DEBUG_KMEMLEAK
 	bool
 
diff --git a/mm/Makefile b/mm/Makefile
index 6f9ffa968a1a..9a564f836403 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -133,3 +133,4 @@ obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
 obj-$(CONFIG_IO_MAPPING) += io-mapping.o
 obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
 obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
+obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
new file mode 100644
index 000000000000..1a70556bd46c
--- /dev/null
+++ b/mm/shrinker_debug.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/shrinker.h>
+#include <linux/memcontrol.h>
+
+/* defined in vmscan.c */
+extern struct rw_semaphore shrinker_rwsem;
+extern struct list_head shrinker_list;
+
+static DEFINE_IDA(shrinker_debugfs_ida);
+static struct dentry *shrinker_debugfs_root;
+
+static unsigned long shrinker_count_objects(struct shrinker *shrinker,
+					    struct mem_cgroup *memcg,
+					    unsigned long *count_per_node)
+{
+	unsigned long nr, total = 0;
+	int nid;
+
+	for_each_node(nid) {
+		if (nid == 0 || (shrinker->flags & SHRINKER_NUMA_AWARE)) {
+			struct shrink_control sc = {
+				.gfp_mask = GFP_KERNEL,
+				.nid = nid,
+				.memcg = memcg,
+			};
+
+			nr = shrinker->count_objects(shrinker, &sc);
+			if (nr == SHRINK_EMPTY)
+				nr = 0;
+		} else {
+			nr = 0;
+		}
+
+		count_per_node[nid] = nr;
+		total += nr;
+	}
+
+	return total;
+}
+
+static int shrinker_debugfs_count_show(struct seq_file *m, void *v)
+{
+	struct shrinker *shrinker = m->private;
+	unsigned long *count_per_node;
+	struct mem_cgroup *memcg;
+	unsigned long total;
+	bool memcg_aware;
+	int ret, nid;
+
+	count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
+	if (!count_per_node)
+		return -ENOMEM;
+
+	ret = down_read_killable(&shrinker_rwsem);
+	if (ret) {
+		kfree(count_per_node);
+		return ret;
+	}
+	rcu_read_lock();
+
+	memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE;
+
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		if (memcg && !mem_cgroup_online(memcg))
+			continue;
+
+		total = shrinker_count_objects(shrinker,
+					       memcg_aware ? memcg : NULL,
+					       count_per_node);
+		if (total) {
+			seq_printf(m, "%lu", mem_cgroup_ino(memcg));
+			for_each_node(nid)
+				seq_printf(m, " %lu", count_per_node[nid]);
+			seq_putc(m, '\n');
+		}
+
+		if (!memcg_aware) {
+			mem_cgroup_iter_break(NULL, memcg);
+			break;
+		}
+
+		if (signal_pending(current)) {
+			mem_cgroup_iter_break(NULL, memcg);
+			ret = -EINTR;
+			break;
+		}
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+
+	rcu_read_unlock();
+	up_read(&shrinker_rwsem);
+
+	kfree(count_per_node);
+	return ret;
+}
+DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_count);
+
+int shrinker_debugfs_add(struct shrinker *shrinker)
+{
+	struct dentry *entry;
+	char buf[16];
+	int id;
+
+	lockdep_assert_held(&shrinker_rwsem);
+
+	/* debugfs isn't initialized yet, add debugfs entries later. */
+	if (!shrinker_debugfs_root)
+		return 0;
+
+	id = ida_alloc(&shrinker_debugfs_ida, GFP_KERNEL);
+	if (id < 0)
+		return id;
+	shrinker->debugfs_id = id;
+
+	snprintf(buf, sizeof(buf), "%d", id);
+
+	/* create debugfs entry */
+	entry = debugfs_create_dir(buf, shrinker_debugfs_root);
+	if (IS_ERR(entry)) {
+		ida_free(&shrinker_debugfs_ida, id);
+		return PTR_ERR(entry);
+	}
+	shrinker->debugfs_entry = entry;
+
+	debugfs_create_file("count", 0220, entry, shrinker,
+			    &shrinker_debugfs_count_fops);
+	return 0;
+}
+
+void shrinker_debugfs_remove(struct shrinker *shrinker)
+{
+	lockdep_assert_held(&shrinker_rwsem);
+
+	if (!shrinker->debugfs_entry)
+		return;
+
+	debugfs_remove_recursive(shrinker->debugfs_entry);
+	ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id);
+}
+
+static int __init shrinker_debugfs_init(void)
+{
+	struct shrinker *shrinker;
+	struct dentry *dentry;
+	int ret = 0;
+
+	dentry = debugfs_create_dir("shrinker", NULL);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+	shrinker_debugfs_root = dentry;
+
+	/* Create debugfs entries for shrinkers registered at boot */
+	down_write(&shrinker_rwsem);
+	list_for_each_entry(shrinker, &shrinker_list, list)
+		if (!shrinker->debugfs_entry) {
+			ret = shrinker_debugfs_add(shrinker);
+			if (ret)
+				break;
+		}
+	up_write(&shrinker_rwsem);
+
+	return ret;
+}
+late_initcall(shrinker_debugfs_init);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f7d9a683e3a7..35dedff79eb4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -190,8 +190,8 @@ static void set_task_reclaim_state(struct task_struct *task,
 	task->reclaim_state = rs;
 }
 
-static LIST_HEAD(shrinker_list);
-static DECLARE_RWSEM(shrinker_rwsem);
+LIST_HEAD(shrinker_list);
+DECLARE_RWSEM(shrinker_rwsem);
 
 #ifdef CONFIG_MEMCG
 static int shrinker_nr_max;
@@ -650,6 +650,7 @@ void register_shrinker_prepared(struct shrinker *shrinker)
 	down_write(&shrinker_rwsem);
 	list_add_tail(&shrinker->list, &shrinker_list);
 	shrinker->flags |= SHRINKER_REGISTERED;
+	shrinker_debugfs_add(shrinker);
 	up_write(&shrinker_rwsem);
 }
 
@@ -677,6 +678,7 @@ void unregister_shrinker(struct shrinker *shrinker)
 	shrinker->flags &= ~SHRINKER_REGISTERED;
 	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
 		unregister_memcg_shrinker(shrinker);
+	shrinker_debugfs_remove(shrinker);
 	up_write(&shrinker_rwsem);
 
 	kfree(shrinker->nr_deferred);
-- 
cgit 


From e33c267ab70de4249d22d7eab1cc7d68a889bac2 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Tue, 31 May 2022 20:22:24 -0700
Subject: mm: shrinkers: provide shrinkers with names

Currently shrinkers are anonymous objects.  For debugging purposes they
can be identified by count/scan function names, but it's not always
useful: e.g.  for superblock's shrinkers it's nice to have at least an
idea of to which superblock the shrinker belongs.

This commit adds names to shrinkers.  register_shrinker() and
prealloc_shrinker() functions are extended to take a format and arguments
to master a name.

In some cases it's not possible to determine a good name at the time when
a shrinker is allocated.  For such cases shrinker_debugfs_rename() is
provided.

The expected format is:
    <subsystem>-<shrinker_type>[:<instance>]-<id>
For some shrinkers an instance can be encoded as (MAJOR:MINOR) pair.

After this change the shrinker debugfs directory looks like:
  $ cd /sys/kernel/debug/shrinker/
  $ ls
    dquota-cache-16     sb-devpts-28     sb-proc-47       sb-tmpfs-42
    mm-shadow-18        sb-devtmpfs-5    sb-proc-48       sb-tmpfs-43
    mm-zspool:zram0-34  sb-hugetlbfs-17  sb-pstore-31     sb-tmpfs-44
    rcu-kfree-0         sb-hugetlbfs-33  sb-rootfs-2      sb-tmpfs-49
    sb-aio-20           sb-iomem-12      sb-securityfs-6  sb-tracefs-13
    sb-anon_inodefs-15  sb-mqueue-21     sb-selinuxfs-22  sb-xfs:vda1-36
    sb-bdev-3           sb-nsfs-4        sb-sockfs-8      sb-zsmalloc-19
    sb-bpf-32           sb-pipefs-14     sb-sysfs-26      thp-deferred_split-10
    sb-btrfs:vda2-24    sb-proc-25       sb-tmpfs-1       thp-zero-9
    sb-cgroup2-30       sb-proc-39       sb-tmpfs-27      xfs-buf:vda1-37
    sb-configfs-23      sb-proc-41       sb-tmpfs-29      xfs-inodegc:vda1-38
    sb-dax-11           sb-proc-45       sb-tmpfs-35
    sb-debugfs-7        sb-proc-46       sb-tmpfs-40

[roman.gushchin@linux.dev: fix build warnings]
  Link: https://lkml.kernel.org/r/Yr+ZTnLb9lJk6fJO@castle
  Reported-by: kernel test robot <lkp@intel.com>
Link: https://lkml.kernel.org/r/20220601032227.4076670-4-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/kvm/mmu/mmu.c                           |  2 +-
 drivers/android/binder_alloc.c                   |  2 +-
 drivers/gpu/drm/i915/gem/i915_gem_shrinker.c     |  3 +-
 drivers/gpu/drm/msm/msm_gem_shrinker.c           |  2 +-
 drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c |  2 +-
 drivers/gpu/drm/ttm/ttm_pool.c                   |  2 +-
 drivers/md/bcache/btree.c                        |  2 +-
 drivers/md/dm-bufio.c                            |  3 +-
 drivers/md/dm-zoned-metadata.c                   |  4 +-
 drivers/md/raid5.c                               |  2 +-
 drivers/misc/vmw_balloon.c                       |  2 +-
 drivers/virtio/virtio_balloon.c                  |  2 +-
 drivers/xen/xenbus/xenbus_probe_backend.c        |  2 +-
 fs/btrfs/super.c                                 |  2 +
 fs/erofs/utils.c                                 |  2 +-
 fs/ext4/extents_status.c                         |  3 +-
 fs/f2fs/super.c                                  |  2 +-
 fs/gfs2/glock.c                                  |  2 +-
 fs/gfs2/main.c                                   |  2 +-
 fs/jbd2/journal.c                                |  3 +-
 fs/mbcache.c                                     |  2 +-
 fs/nfs/nfs42xattr.c                              |  7 +--
 fs/nfs/super.c                                   |  2 +-
 fs/nfsd/filecache.c                              |  2 +-
 fs/nfsd/nfscache.c                               |  3 +-
 fs/quota/dquot.c                                 |  2 +-
 fs/super.c                                       |  6 ++-
 fs/ubifs/super.c                                 |  2 +-
 fs/xfs/xfs_buf.c                                 |  3 +-
 fs/xfs/xfs_icache.c                              |  2 +-
 fs/xfs/xfs_qm.c                                  |  3 +-
 include/linux/shrinker.h                         | 14 +++++-
 kernel/rcu/tree.c                                |  2 +-
 mm/huge_memory.c                                 |  4 +-
 mm/shrinker_debug.c                              | 47 ++++++++++++++++++-
 mm/vmscan.c                                      | 58 ++++++++++++++++++++++--
 mm/workingset.c                                  |  2 +-
 mm/zsmalloc.c                                    |  3 +-
 net/sunrpc/auth.c                                |  2 +-
 39 files changed, 167 insertions(+), 45 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 17252f39bd7c..797d3286ecc1 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6317,7 +6317,7 @@ int kvm_mmu_vendor_module_init(void)
 	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
 		goto out;
 
-	ret = register_shrinker(&mmu_shrinker);
+	ret = register_shrinker(&mmu_shrinker, "x86-mmu");
 	if (ret)
 		goto out;
 
diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
index 5649a0371a1f..51b502217d00 100644
--- a/drivers/android/binder_alloc.c
+++ b/drivers/android/binder_alloc.c
@@ -1084,7 +1084,7 @@ int binder_alloc_shrinker_init(void)
 	int ret = list_lru_init(&binder_alloc_lru);
 
 	if (ret == 0) {
-		ret = register_shrinker(&binder_shrinker);
+		ret = register_shrinker(&binder_shrinker, "android-binder");
 		if (ret)
 			list_lru_destroy(&binder_alloc_lru);
 	}
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
index 6a6ff98a8746..e43577e03067 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
@@ -426,7 +426,8 @@ void i915_gem_driver_register__shrinker(struct drm_i915_private *i915)
 	i915->mm.shrinker.count_objects = i915_gem_shrinker_count;
 	i915->mm.shrinker.seeks = DEFAULT_SEEKS;
 	i915->mm.shrinker.batch = 4096;
-	drm_WARN_ON(&i915->drm, register_shrinker(&i915->mm.shrinker));
+	drm_WARN_ON(&i915->drm, register_shrinker(&i915->mm.shrinker,
+						  "drm-i915_gem"));
 
 	i915->mm.oom_notifier.notifier_call = i915_gem_shrinker_oom;
 	drm_WARN_ON(&i915->drm, register_oom_notifier(&i915->mm.oom_notifier));
diff --git a/drivers/gpu/drm/msm/msm_gem_shrinker.c b/drivers/gpu/drm/msm/msm_gem_shrinker.c
index 086dacf2f26a..26e84d2ea6ae 100644
--- a/drivers/gpu/drm/msm/msm_gem_shrinker.c
+++ b/drivers/gpu/drm/msm/msm_gem_shrinker.c
@@ -221,7 +221,7 @@ void msm_gem_shrinker_init(struct drm_device *dev)
 	priv->shrinker.count_objects = msm_gem_shrinker_count;
 	priv->shrinker.scan_objects = msm_gem_shrinker_scan;
 	priv->shrinker.seeks = DEFAULT_SEEKS;
-	WARN_ON(register_shrinker(&priv->shrinker));
+	WARN_ON(register_shrinker(&priv->shrinker, "drm-msm_gem"));
 
 	priv->vmap_notifier.notifier_call = msm_gem_shrinker_vmap;
 	WARN_ON(register_vmap_purge_notifier(&priv->vmap_notifier));
diff --git a/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c b/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c
index 77e7cb6d1ae3..bf0170782f25 100644
--- a/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c
+++ b/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c
@@ -103,7 +103,7 @@ void panfrost_gem_shrinker_init(struct drm_device *dev)
 	pfdev->shrinker.count_objects = panfrost_gem_shrinker_count;
 	pfdev->shrinker.scan_objects = panfrost_gem_shrinker_scan;
 	pfdev->shrinker.seeks = DEFAULT_SEEKS;
-	WARN_ON(register_shrinker(&pfdev->shrinker));
+	WARN_ON(register_shrinker(&pfdev->shrinker, "drm-panfrost"));
 }
 
 /**
diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
index 1bba0a0ed3f9..21b61631f73a 100644
--- a/drivers/gpu/drm/ttm/ttm_pool.c
+++ b/drivers/gpu/drm/ttm/ttm_pool.c
@@ -722,7 +722,7 @@ int ttm_pool_mgr_init(unsigned long num_pages)
 	mm_shrinker.count_objects = ttm_pool_shrinker_count;
 	mm_shrinker.scan_objects = ttm_pool_shrinker_scan;
 	mm_shrinker.seeks = 1;
-	return register_shrinker(&mm_shrinker);
+	return register_shrinker(&mm_shrinker, "drm-ttm_pool");
 }
 
 /**
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index e136d6edc1ed..147c493a989a 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -812,7 +812,7 @@ int bch_btree_cache_alloc(struct cache_set *c)
 	c->shrink.seeks = 4;
 	c->shrink.batch = c->btree_pages * 2;
 
-	if (register_shrinker(&c->shrink))
+	if (register_shrinker(&c->shrink, "md-bcache:%pU", c->set_uuid))
 		pr_warn("bcache: %s: could not register shrinker\n",
 				__func__);
 
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 5ffa1dcf84cf..3ff571b20f14 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1806,7 +1806,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
 	c->shrinker.scan_objects = dm_bufio_shrink_scan;
 	c->shrinker.seeks = 1;
 	c->shrinker.batch = 0;
-	r = register_shrinker(&c->shrinker);
+	r = register_shrinker(&c->shrinker, "md-%s:(%u:%u)", slab_name,
+			      MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
 	if (r)
 		goto bad;
 
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index d1ea66114d14..46648f6100fb 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -2944,7 +2944,9 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev,
 	zmd->mblk_shrinker.seeks = DEFAULT_SEEKS;
 
 	/* Metadata cache shrinker */
-	ret = register_shrinker(&zmd->mblk_shrinker);
+	ret = register_shrinker(&zmd->mblk_shrinker, "md-meta:(%u:%u)",
+				MAJOR(dev->bdev->bd_dev),
+				MINOR(dev->bdev->bd_dev));
 	if (ret) {
 		dmz_zmd_err(zmd, "Register metadata cache shrinker failed");
 		goto err;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5d09256d7f81..780ae66840b7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7414,7 +7414,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	conf->shrinker.count_objects = raid5_cache_count;
 	conf->shrinker.batch = 128;
 	conf->shrinker.flags = 0;
-	ret = register_shrinker(&conf->shrinker);
+	ret = register_shrinker(&conf->shrinker, "md-raid5:%s", mdname(mddev));
 	if (ret) {
 		pr_warn("md/raid:%s: couldn't register shrinker.\n",
 			mdname(mddev));
diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c
index 086ce77d9074..c2d2fa114e65 100644
--- a/drivers/misc/vmw_balloon.c
+++ b/drivers/misc/vmw_balloon.c
@@ -1587,7 +1587,7 @@ static int vmballoon_register_shrinker(struct vmballoon *b)
 	b->shrinker.count_objects = vmballoon_shrinker_count;
 	b->shrinker.seeks = DEFAULT_SEEKS;
 
-	r = register_shrinker(&b->shrinker);
+	r = register_shrinker(&b->shrinker, "vmw-balloon");
 
 	if (r == 0)
 		b->shrinker_registered = true;
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index b9737da6c4dd..cba57b1f382f 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -875,7 +875,7 @@ static int virtio_balloon_register_shrinker(struct virtio_balloon *vb)
 	vb->shrinker.count_objects = virtio_balloon_shrinker_count;
 	vb->shrinker.seeks = DEFAULT_SEEKS;
 
-	return register_shrinker(&vb->shrinker);
+	return register_shrinker(&vb->shrinker, "virtio-balloon");
 }
 
 static int virtballoon_probe(struct virtio_device *vdev)
diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c
index 5abded97e1a7..9c09f89d8278 100644
--- a/drivers/xen/xenbus/xenbus_probe_backend.c
+++ b/drivers/xen/xenbus/xenbus_probe_backend.c
@@ -305,7 +305,7 @@ static int __init xenbus_probe_backend_init(void)
 
 	register_xenstore_notifier(&xenstore_notifier);
 
-	if (register_shrinker(&backend_memory_shrinker))
+	if (register_shrinker(&backend_memory_shrinker, "xen-backend"))
 		pr_warn("shrinker registration failed\n");
 
 	return 0;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6627dd7875ee..eee3e96d877f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1815,6 +1815,8 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
 			error = -EBUSY;
 	} else {
 		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
+		shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name,
+					s->s_id);
 		btrfs_sb(s)->bdev_holder = fs_type;
 		if (!strstr(crc32c_impl(), "generic"))
 			set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index ec9a1d780dc1..46627cb69abe 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -282,7 +282,7 @@ static struct shrinker erofs_shrinker_info = {
 
 int __init erofs_init_shrinker(void)
 {
-	return register_shrinker(&erofs_shrinker_info);
+	return register_shrinker(&erofs_shrinker_info, "erofs-shrinker");
 }
 
 void erofs_exit_shrinker(void)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 9a3a8996aacf..23167efda95e 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -1654,7 +1654,8 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
 	sbi->s_es_shrinker.scan_objects = ext4_es_scan;
 	sbi->s_es_shrinker.count_objects = ext4_es_count;
 	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
-	err = register_shrinker(&sbi->s_es_shrinker);
+	err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s",
+				sbi->s_sb->s_id);
 	if (err)
 		goto err4;
 
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 37221e94e5ef..bce02306f7a0 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -4579,7 +4579,7 @@ static int __init init_f2fs_fs(void)
 	err = f2fs_init_sysfs();
 	if (err)
 		goto free_garbage_collection_cache;
-	err = register_shrinker(&f2fs_shrinker_info);
+	err = register_shrinker(&f2fs_shrinker_info, "f2fs-shrinker");
 	if (err)
 		goto free_sysfs;
 	err = register_filesystem(&f2fs_fs_type);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c992d53013d3..dca842379cab 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -2533,7 +2533,7 @@ int __init gfs2_glock_init(void)
 		return -ENOMEM;
 	}
 
-	ret = register_shrinker(&glock_shrinker);
+	ret = register_shrinker(&glock_shrinker, "gfs2-glock");
 	if (ret) {
 		destroy_workqueue(gfs2_delete_workqueue);
 		destroy_workqueue(glock_workqueue);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 244187e3e70f..b66a3e1ec152 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -148,7 +148,7 @@ static int __init init_gfs2_fs(void)
 	if (!gfs2_trans_cachep)
 		goto fail_cachep8;
 
-	error = register_shrinker(&gfs2_qd_shrinker);
+	error = register_shrinker(&gfs2_qd_shrinker, "gfs2-qd");
 	if (error)
 		goto fail_shrinker;
 
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c0cbeeaec2d1..45e4655c8033 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1418,7 +1418,8 @@ static journal_t *journal_init_common(struct block_device *bdev,
 	if (percpu_counter_init(&journal->j_checkpoint_jh_count, 0, GFP_KERNEL))
 		goto err_cleanup;
 
-	if (register_shrinker(&journal->j_shrinker)) {
+	if (register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)",
+			      MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev))) {
 		percpu_counter_destroy(&journal->j_checkpoint_jh_count);
 		goto err_cleanup;
 	}
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 97c54d3a2227..0b833da0a9a5 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -367,7 +367,7 @@ struct mb_cache *mb_cache_create(int bucket_bits)
 	cache->c_shrink.count_objects = mb_cache_count;
 	cache->c_shrink.scan_objects = mb_cache_scan;
 	cache->c_shrink.seeks = DEFAULT_SEEKS;
-	if (register_shrinker(&cache->c_shrink)) {
+	if (register_shrinker(&cache->c_shrink, "mbcache-shrinker")) {
 		kfree(cache->c_hash);
 		kfree(cache);
 		goto err_out;
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index e7b34f7e0614..a9bf09fdf2c3 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -1017,15 +1017,16 @@ int __init nfs4_xattr_cache_init(void)
 	if (ret)
 		goto out2;
 
-	ret = register_shrinker(&nfs4_xattr_cache_shrinker);
+	ret = register_shrinker(&nfs4_xattr_cache_shrinker, "nfs-xattr_cache");
 	if (ret)
 		goto out1;
 
-	ret = register_shrinker(&nfs4_xattr_entry_shrinker);
+	ret = register_shrinker(&nfs4_xattr_entry_shrinker, "nfs-xattr_entry");
 	if (ret)
 		goto out;
 
-	ret = register_shrinker(&nfs4_xattr_large_entry_shrinker);
+	ret = register_shrinker(&nfs4_xattr_large_entry_shrinker,
+				"nfs-xattr_large_entry");
 	if (!ret)
 		return 0;
 
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 6ab5eeb000dc..82944e14fcea 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -149,7 +149,7 @@ int __init register_nfs_fs(void)
 	ret = nfs_register_sysctl();
 	if (ret < 0)
 		goto error_2;
-	ret = register_shrinker(&acl_shrinker);
+	ret = register_shrinker(&acl_shrinker, "nfs-acl");
 	if (ret < 0)
 		goto error_3;
 #ifdef CONFIG_NFS_V4_2
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 9cb2d590c036..a605c0e39b09 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -670,7 +670,7 @@ nfsd_file_cache_init(void)
 		goto out_err;
 	}
 
-	ret = register_shrinker(&nfsd_file_shrinker);
+	ret = register_shrinker(&nfsd_file_shrinker, "nfsd-filecache");
 	if (ret) {
 		pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret);
 		goto out_lru;
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 7da88bdc0d6c..9b31e1103e7b 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -176,7 +176,8 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
 	nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan;
 	nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count;
 	nn->nfsd_reply_cache_shrinker.seeks = 1;
-	status = register_shrinker(&nn->nfsd_reply_cache_shrinker);
+	status = register_shrinker(&nn->nfsd_reply_cache_shrinker,
+				   "nfsd-reply:%s", nn->nfsd_name);
 	if (status)
 		goto out_stats_destroy;
 
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 09d1307959d0..e0b659900e70 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2995,7 +2995,7 @@ static int __init dquot_init(void)
 	pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld,"
 		" %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order));
 
-	if (register_shrinker(&dqcache_shrinker))
+	if (register_shrinker(&dqcache_shrinker, "dquota-cache"))
 		panic("Cannot register dquot shrinker");
 
 	return 0;
diff --git a/fs/super.c b/fs/super.c
index 60f57c7bc0a6..4fca6657f442 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -265,7 +265,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	s->s_shrink.count_objects = super_cache_count;
 	s->s_shrink.batch = 1024;
 	s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
-	if (prealloc_shrinker(&s->s_shrink))
+	if (prealloc_shrinker(&s->s_shrink, "sb-%s", type->name))
 		goto fail;
 	if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink))
 		goto fail;
@@ -1288,6 +1288,8 @@ int get_tree_bdev(struct fs_context *fc,
 	} else {
 		s->s_mode = mode;
 		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
+		shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s",
+					fc->fs_type->name, s->s_id);
 		sb_set_blocksize(s, block_size(bdev));
 		error = fill_super(s, fc);
 		if (error) {
@@ -1363,6 +1365,8 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 	} else {
 		s->s_mode = mode;
 		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
+		shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s",
+					fs_type->name, s->s_id);
 		sb_set_blocksize(s, block_size(bdev));
 		error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
 		if (error) {
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 0978d01b0ea4..d0c9a09988bc 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2430,7 +2430,7 @@ static int __init ubifs_init(void)
 	if (!ubifs_inode_slab)
 		return -ENOMEM;
 
-	err = register_shrinker(&ubifs_shrinker_info);
+	err = register_shrinker(&ubifs_shrinker_info, "ubifs-slab");
 	if (err)
 		goto out_slab;
 
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index bf4e60871068..4aa9c9cf5b6e 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1986,7 +1986,8 @@ xfs_alloc_buftarg(
 	btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
 	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
 	btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
-	if (register_shrinker(&btp->bt_shrinker))
+	if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s",
+			      mp->m_super->s_id))
 		goto error_pcpu;
 	return btp;
 
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 5269354b1b69..a1941c8b8630 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -2201,5 +2201,5 @@ xfs_inodegc_register_shrinker(
 	shrink->flags = SHRINKER_NONSLAB;
 	shrink->batch = XFS_INODEGC_SHRINKER_BATCH;
 
-	return register_shrinker(shrink);
+	return register_shrinker(shrink, "xfs-inodegc:%s", mp->m_super->s_id);
 }
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index abf08bbf34a9..c31d57453ceb 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -677,7 +677,8 @@ xfs_qm_init_quotainfo(
 	qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
 	qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
 
-	error = register_shrinker(&qinf->qi_shrinker);
+	error = register_shrinker(&qinf->qi_shrinker, "xfs-qm:%s",
+				  mp->m_super->s_id);
 	if (error)
 		goto out_free_inos;
 
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 2ced8149c513..08e6054e061f 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -75,6 +75,7 @@ struct shrinker {
 #endif
 #ifdef CONFIG_SHRINKER_DEBUG
 	int debugfs_id;
+	const char *name;
 	struct dentry *debugfs_entry;
 #endif
 	/* objs pending delete, per node */
@@ -92,9 +93,11 @@ struct shrinker {
  */
 #define SHRINKER_NONSLAB	(1 << 3)
 
-extern int prealloc_shrinker(struct shrinker *shrinker);
+extern int __printf(2, 3) prealloc_shrinker(struct shrinker *shrinker,
+					    const char *fmt, ...);
 extern void register_shrinker_prepared(struct shrinker *shrinker);
-extern int register_shrinker(struct shrinker *shrinker);
+extern int __printf(2, 3) register_shrinker(struct shrinker *shrinker,
+					    const char *fmt, ...);
 extern void unregister_shrinker(struct shrinker *shrinker);
 extern void free_prealloced_shrinker(struct shrinker *shrinker);
 extern void synchronize_shrinkers(void);
@@ -102,6 +105,8 @@ extern void synchronize_shrinkers(void);
 #ifdef CONFIG_SHRINKER_DEBUG
 extern int shrinker_debugfs_add(struct shrinker *shrinker);
 extern void shrinker_debugfs_remove(struct shrinker *shrinker);
+extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker,
+						  const char *fmt, ...);
 #else /* CONFIG_SHRINKER_DEBUG */
 static inline int shrinker_debugfs_add(struct shrinker *shrinker)
 {
@@ -110,5 +115,10 @@ static inline int shrinker_debugfs_add(struct shrinker *shrinker)
 static inline void shrinker_debugfs_remove(struct shrinker *shrinker)
 {
 }
+static inline __printf(2, 3)
+int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
+{
+	return 0;
+}
 #endif /* CONFIG_SHRINKER_DEBUG */
 #endif /* _LINUX_SHRINKER_H */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c25ba442044a..4b3bf6ebb1eb 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4884,7 +4884,7 @@ static void __init kfree_rcu_batch_init(void)
 		INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
 		krcp->initialized = true;
 	}
-	if (register_shrinker(&kfree_rcu_shrinker))
+	if (register_shrinker(&kfree_rcu_shrinker, "rcu-kfree"))
 		pr_err("Failed to register kfree_rcu() shrinker!\n");
 }
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f9b90a8d7dfa..60d742c33de3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -423,10 +423,10 @@ static int __init hugepage_init(void)
 	if (err)
 		goto err_slab;
 
-	err = register_shrinker(&huge_zero_page_shrinker);
+	err = register_shrinker(&huge_zero_page_shrinker, "thp-zero");
 	if (err)
 		goto err_hzp_shrinker;
-	err = register_shrinker(&deferred_split_shrinker);
+	err = register_shrinker(&deferred_split_shrinker, "thp-deferred_split");
 	if (err)
 		goto err_split_shrinker;
 
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
index 1a70556bd46c..781ecbd3d608 100644
--- a/mm/shrinker_debug.c
+++ b/mm/shrinker_debug.c
@@ -102,7 +102,7 @@ DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_count);
 int shrinker_debugfs_add(struct shrinker *shrinker)
 {
 	struct dentry *entry;
-	char buf[16];
+	char buf[128];
 	int id;
 
 	lockdep_assert_held(&shrinker_rwsem);
@@ -116,7 +116,7 @@ int shrinker_debugfs_add(struct shrinker *shrinker)
 		return id;
 	shrinker->debugfs_id = id;
 
-	snprintf(buf, sizeof(buf), "%d", id);
+	snprintf(buf, sizeof(buf), "%s-%d", shrinker->name, id);
 
 	/* create debugfs entry */
 	entry = debugfs_create_dir(buf, shrinker_debugfs_root);
@@ -131,10 +131,53 @@ int shrinker_debugfs_add(struct shrinker *shrinker)
 	return 0;
 }
 
+int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
+{
+	struct dentry *entry;
+	char buf[128];
+	const char *new, *old;
+	va_list ap;
+	int ret = 0;
+
+	va_start(ap, fmt);
+	new = kvasprintf_const(GFP_KERNEL, fmt, ap);
+	va_end(ap);
+
+	if (!new)
+		return -ENOMEM;
+
+	down_write(&shrinker_rwsem);
+
+	old = shrinker->name;
+	shrinker->name = new;
+
+	if (shrinker->debugfs_entry) {
+		snprintf(buf, sizeof(buf), "%s-%d", shrinker->name,
+			 shrinker->debugfs_id);
+
+		entry = debugfs_rename(shrinker_debugfs_root,
+				       shrinker->debugfs_entry,
+				       shrinker_debugfs_root, buf);
+		if (IS_ERR(entry))
+			ret = PTR_ERR(entry);
+		else
+			shrinker->debugfs_entry = entry;
+	}
+
+	up_write(&shrinker_rwsem);
+
+	kfree_const(old);
+
+	return ret;
+}
+EXPORT_SYMBOL(shrinker_debugfs_rename);
+
 void shrinker_debugfs_remove(struct shrinker *shrinker)
 {
 	lockdep_assert_held(&shrinker_rwsem);
 
+	kfree_const(shrinker->name);
+
 	if (!shrinker->debugfs_entry)
 		return;
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 35dedff79eb4..97ac6c6c026d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -608,7 +608,7 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
 /*
  * Add a shrinker callback to be called from the vm.
  */
-int prealloc_shrinker(struct shrinker *shrinker)
+static int __prealloc_shrinker(struct shrinker *shrinker)
 {
 	unsigned int size;
 	int err;
@@ -632,8 +632,36 @@ int prealloc_shrinker(struct shrinker *shrinker)
 	return 0;
 }
 
+#ifdef CONFIG_SHRINKER_DEBUG
+int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+	va_list ap;
+	int err;
+
+	va_start(ap, fmt);
+	shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
+	va_end(ap);
+	if (!shrinker->name)
+		return -ENOMEM;
+
+	err = __prealloc_shrinker(shrinker);
+	if (err)
+		kfree_const(shrinker->name);
+
+	return err;
+}
+#else
+int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+	return __prealloc_shrinker(shrinker);
+}
+#endif
+
 void free_prealloced_shrinker(struct shrinker *shrinker)
 {
+#ifdef CONFIG_SHRINKER_DEBUG
+	kfree_const(shrinker->name);
+#endif
 	if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
 		down_write(&shrinker_rwsem);
 		unregister_memcg_shrinker(shrinker);
@@ -654,15 +682,39 @@ void register_shrinker_prepared(struct shrinker *shrinker)
 	up_write(&shrinker_rwsem);
 }
 
-int register_shrinker(struct shrinker *shrinker)
+static int __register_shrinker(struct shrinker *shrinker)
 {
-	int err = prealloc_shrinker(shrinker);
+	int err = __prealloc_shrinker(shrinker);
 
 	if (err)
 		return err;
 	register_shrinker_prepared(shrinker);
 	return 0;
 }
+
+#ifdef CONFIG_SHRINKER_DEBUG
+int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+	va_list ap;
+	int err;
+
+	va_start(ap, fmt);
+	shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
+	va_end(ap);
+	if (!shrinker->name)
+		return -ENOMEM;
+
+	err = __register_shrinker(shrinker);
+	if (err)
+		kfree_const(shrinker->name);
+	return err;
+}
+#else
+int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+	return __register_shrinker(shrinker);
+}
+#endif
 EXPORT_SYMBOL(register_shrinker);
 
 /*
diff --git a/mm/workingset.c b/mm/workingset.c
index 592569a8974c..a5e84862fc86 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -625,7 +625,7 @@ static int __init workingset_init(void)
 	pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
 	       timestamp_bits, max_order, bucket_order);
 
-	ret = prealloc_shrinker(&workingset_shadow_shrinker);
+	ret = prealloc_shrinker(&workingset_shadow_shrinker, "mm-shadow");
 	if (ret)
 		goto err;
 	ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 5d5fc04385b8..f24b71568e83 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -2217,7 +2217,8 @@ static int zs_register_shrinker(struct zs_pool *pool)
 	pool->shrinker.batch = 0;
 	pool->shrinker.seeks = DEFAULT_SEEKS;
 
-	return register_shrinker(&pool->shrinker);
+	return register_shrinker(&pool->shrinker, "mm-zspool:%s",
+				 pool->name);
 }
 
 /**
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 682fcd24bf43..04e7b55fe0d9 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -874,7 +874,7 @@ int __init rpcauth_init_module(void)
 	err = rpc_init_authunix();
 	if (err < 0)
 		goto out1;
-	err = register_shrinker(&rpc_cred_shrinker);
+	err = register_shrinker(&rpc_cred_shrinker, "sunrpc_cred");
 	if (err < 0)
 		goto out2;
 	return 0;
-- 
cgit 


From 8cdcc532268df0893d9756f537cbce479f4c4831 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Mon, 13 Jun 2022 19:22:56 +0000
Subject: mm/damon/schemes: add 'LRU_PRIO' DAMOS action

This commit adds a new DAMOS action called 'LRU_PRIO' for the physical
address space.  The action prioritizes pages in the memory regions of the
user-specified target access pattern on their LRU lists.  This is hence
supposed to be used for frequently accessed (hot) memory regions so that
hot pages could be more likely protected under memory pressure.
Internally, it simply calls 'mark_page_accessed()'.

Link: https://lkml.kernel.org/r/20220613192301.8817-5-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h |  2 ++
 mm/damon/ops-common.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 mm/damon/ops-common.h |  2 ++
 mm/damon/paddr.c      | 20 ++++++++++++++++++++
 mm/damon/sysfs.c      |  1 +
 5 files changed, 67 insertions(+)

(limited to 'include')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index b9aae19fab3e..4c64e03e94d8 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -86,6 +86,7 @@ struct damon_target {
  * @DAMOS_PAGEOUT:	Call ``madvise()`` for the region with MADV_PAGEOUT.
  * @DAMOS_HUGEPAGE:	Call ``madvise()`` for the region with MADV_HUGEPAGE.
  * @DAMOS_NOHUGEPAGE:	Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
+ * @DAMOS_LRU_PRIO:	Prioritize the region on its LRU lists.
  * @DAMOS_STAT:		Do nothing but count the stat.
  * @NR_DAMOS_ACTIONS:	Total number of DAMOS actions
  */
@@ -95,6 +96,7 @@ enum damos_action {
 	DAMOS_PAGEOUT,
 	DAMOS_HUGEPAGE,
 	DAMOS_NOHUGEPAGE,
+	DAMOS_LRU_PRIO,
 	DAMOS_STAT,		/* Do nothing but only record the stat */
 	NR_DAMOS_ACTIONS,
 };
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index 10ef20b2003f..b1335de200e7 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -130,3 +130,45 @@ int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
 	/* Return coldness of the region */
 	return DAMOS_MAX_SCORE - hotness;
 }
+
+int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
+			struct damos *s)
+{
+	unsigned int max_nr_accesses;
+	int freq_subscore;
+	unsigned int age_in_sec;
+	int age_in_log, age_subscore;
+	unsigned int freq_weight = s->quota.weight_nr_accesses;
+	unsigned int age_weight = s->quota.weight_age;
+	int hotness;
+
+	max_nr_accesses = c->aggr_interval / c->sample_interval;
+	freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses;
+
+	age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000;
+	for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec;
+			age_in_log++, age_in_sec >>= 1)
+		;
+
+	/* If frequency is 0, higher age means it's colder */
+	if (freq_subscore == 0)
+		age_in_log *= -1;
+
+	/*
+	 * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG].
+	 * Scale it to be in [0, 100] and set it as age subscore.
+	 */
+	age_in_log += DAMON_MAX_AGE_IN_LOG;
+	age_subscore = age_in_log * DAMON_MAX_SUBSCORE /
+		DAMON_MAX_AGE_IN_LOG / 2;
+
+	hotness = (freq_weight * freq_subscore + age_weight * age_subscore);
+	if (freq_weight + age_weight)
+		hotness /= freq_weight + age_weight;
+	/*
+	 * Transform it to fit in [0, DAMOS_MAX_SCORE]
+	 */
+	hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE;
+
+	return hotness;
+}
diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h
index e790cb5f8fe0..52329ff361cd 100644
--- a/mm/damon/ops-common.h
+++ b/mm/damon/ops-common.h
@@ -14,3 +14,5 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr);
 
 int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
 			struct damos *s);
+int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
+			struct damos *s);
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 7bcd48066b43..f145b1d51e13 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -233,6 +233,22 @@ static unsigned long damon_pa_pageout(struct damon_region *r)
 	return applied * PAGE_SIZE;
 }
 
+static unsigned long damon_pa_mark_accessed(struct damon_region *r)
+{
+	unsigned long addr, applied = 0;
+
+	for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
+		struct page *page = damon_get_page(PHYS_PFN(addr));
+
+		if (!page)
+			continue;
+		mark_page_accessed(page);
+		put_page(page);
+		applied++;
+	}
+	return applied * PAGE_SIZE;
+}
+
 static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
 		struct damon_target *t, struct damon_region *r,
 		struct damos *scheme)
@@ -240,6 +256,8 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
 	switch (scheme->action) {
 	case DAMOS_PAGEOUT:
 		return damon_pa_pageout(r);
+	case DAMOS_LRU_PRIO:
+		return damon_pa_mark_accessed(r);
 	default:
 		break;
 	}
@@ -253,6 +271,8 @@ static int damon_pa_scheme_score(struct damon_ctx *context,
 	switch (scheme->action) {
 	case DAMOS_PAGEOUT:
 		return damon_pageout_score(context, r, scheme);
+	case DAMOS_LRU_PRIO:
+		return damon_hot_score(context, r, scheme);
 	default:
 		break;
 	}
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index c35809c6087c..86c69f980927 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -762,6 +762,7 @@ static const char * const damon_sysfs_damos_action_strs[] = {
 	"pageout",
 	"hugepage",
 	"nohugepage",
+	"lru_prio",
 	"stat",
 };
 
-- 
cgit 


From 99cdc2cd180a7adc87badc9ca92f8af803d8bf3b Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Mon, 13 Jun 2022 19:22:58 +0000
Subject: mm/damon/schemes: add 'LRU_DEPRIO' action

This commit adds a new DAMON-based operation scheme action called
'LRU_DEPRIO' for physical address space.  The action deprioritizes pages
in the memory area of the target access pattern on their LRU lists.  This
is hence supposed to be used for rarely accessed (cold) memory regions so
that cold pages could be more likely reclaimed first under memory
pressure.  Internally, it simply calls 'lru_deactivate()'.

Using this with 'LRU_PRIO' action for hot pages, users can proactively
sort LRU lists based on the access pattern.  That is, it can make the LRU
lists somewhat more trustworthy source of access temperature.  As a
result, efficiency of LRU-lists based mechanisms including the reclamation
target selection could be improved.

Link: https://lkml.kernel.org/r/20220613192301.8817-7-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h |  2 ++
 mm/damon/paddr.c      | 20 ++++++++++++++++++++
 mm/damon/sysfs.c      |  1 +
 3 files changed, 23 insertions(+)

(limited to 'include')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 4c64e03e94d8..7b1f4a488230 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -87,6 +87,7 @@ struct damon_target {
  * @DAMOS_HUGEPAGE:	Call ``madvise()`` for the region with MADV_HUGEPAGE.
  * @DAMOS_NOHUGEPAGE:	Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
  * @DAMOS_LRU_PRIO:	Prioritize the region on its LRU lists.
+ * @DAMOS_LRU_DEPRIO:	Deprioritize the region on its LRU lists.
  * @DAMOS_STAT:		Do nothing but count the stat.
  * @NR_DAMOS_ACTIONS:	Total number of DAMOS actions
  */
@@ -97,6 +98,7 @@ enum damos_action {
 	DAMOS_HUGEPAGE,
 	DAMOS_NOHUGEPAGE,
 	DAMOS_LRU_PRIO,
+	DAMOS_LRU_DEPRIO,
 	DAMOS_STAT,		/* Do nothing but only record the stat */
 	NR_DAMOS_ACTIONS,
 };
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index f145b1d51e13..dc131c6a5403 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -249,6 +249,22 @@ static unsigned long damon_pa_mark_accessed(struct damon_region *r)
 	return applied * PAGE_SIZE;
 }
 
+static unsigned long damon_pa_deactivate_pages(struct damon_region *r)
+{
+	unsigned long addr, applied = 0;
+
+	for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
+		struct page *page = damon_get_page(PHYS_PFN(addr));
+
+		if (!page)
+			continue;
+		deactivate_page(page);
+		put_page(page);
+		applied++;
+	}
+	return applied * PAGE_SIZE;
+}
+
 static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
 		struct damon_target *t, struct damon_region *r,
 		struct damos *scheme)
@@ -258,6 +274,8 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
 		return damon_pa_pageout(r);
 	case DAMOS_LRU_PRIO:
 		return damon_pa_mark_accessed(r);
+	case DAMOS_LRU_DEPRIO:
+		return damon_pa_deactivate_pages(r);
 	default:
 		break;
 	}
@@ -273,6 +291,8 @@ static int damon_pa_scheme_score(struct damon_ctx *context,
 		return damon_pageout_score(context, r, scheme);
 	case DAMOS_LRU_PRIO:
 		return damon_hot_score(context, r, scheme);
+	case DAMOS_LRU_DEPRIO:
+		return damon_pageout_score(context, r, scheme);
 	default:
 		break;
 	}
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 86c69f980927..7488e27c87c3 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -763,6 +763,7 @@ static const char * const damon_sysfs_damos_action_strs[] = {
 	"hugepage",
 	"nohugepage",
 	"lru_prio",
+	"lru_deprio",
 	"stat",
 };
 
-- 
cgit 


From 64fe24a3e05e5f3ac56fcd45afd2fd1d9cc8fcb6 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 14 Jun 2022 11:36:29 +0200
Subject: mm/mprotect: try avoiding write faults for exclusive anonymous pages
 when changing protection

Similar to our MM_CP_DIRTY_ACCT handling for shared, writable mappings, we
can try mapping anonymous pages in a private writable mapping writable if
they are exclusive, the PTE is already dirty, and no special handling
applies.  Mapping the anonymous page writable is essentially the same
thing the write fault handler would do in this case.

Special handling is required for uffd-wp and softdirty tracking, so take
care of that properly.  Also, leave PROT_NONE handling alone for now; in
the future, we could similarly extend the logic in do_numa_page() or use
pte_mk_savedwrite() here.

While this improves mprotect(PROT_READ)+mprotect(PROT_READ|PROT_WRITE)
performance, it should also be a valuable optimization for uffd-wp, when
un-protecting.

This has been previously suggested by Peter Collingbourne in [1], relevant
in the context of the Scudo memory allocator, before we had
PageAnonExclusive.

This commit doesn't add the same handling for PMDs (i.e., anonymous THP,
anonymous hugetlb); benchmark results from Andrea indicate that there are
minor performance gains, so it's might still be valuable to streamline
that logic for all anonymous pages in the future.

As we now also set MM_CP_DIRTY_ACCT for private mappings, let's rename it
to MM_CP_TRY_CHANGE_WRITABLE, to make it clearer what's actually
happening.

Micro-benchmark courtesy of Andrea:

===
 #define _GNU_SOURCE
 #include <sys/mman.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
 #include <unistd.h>

 #define SIZE (1024*1024*1024)

int main(int argc, char *argv[])
{
	char *p;
	if (posix_memalign((void **)&p, sysconf(_SC_PAGESIZE)*512, SIZE))
		perror("posix_memalign"), exit(1);
	if (madvise(p, SIZE, argc > 1 ? MADV_HUGEPAGE : MADV_NOHUGEPAGE))
		perror("madvise");
	explicit_bzero(p, SIZE);
	for (int loops = 0; loops < 40; loops++) {
		if (mprotect(p, SIZE, PROT_READ))
			perror("mprotect"), exit(1);
		if (mprotect(p, SIZE, PROT_READ|PROT_WRITE))
			perror("mprotect"), exit(1);
		explicit_bzero(p, SIZE);
	}
}
===

Results on my Ryzen 9 3900X:

Stock 10 runs (lower is better):   AVG 6.398s, STDEV 0.043
Patched 10 runs (lower is better): AVG 3.780s, STDEV 0.026

===

[1] https://lkml.kernel.org/r/20210429214801.2583336-1-pcc@google.com

Link: https://lkml.kernel.org/r/20220614093629.76309-1-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Suggested-by: Peter Collingbourne <pcc@google.com>
Acked-by: Peter Xu <peterx@redhat.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h |  8 ++++--
 mm/mprotect.c      | 77 +++++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 68 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index cf3d0d673f6b..09ea26056e2f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1962,8 +1962,12 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
  * for now all the callers are only use one of the flags at the same
  * time.
  */
-/* Whether we should allow dirty bit accounting */
-#define  MM_CP_DIRTY_ACCT                  (1UL << 0)
+/*
+ * Whether we should manually check if we can map individual PTEs writable,
+ * because something (e.g., COW, uffd-wp) blocks that from happening for all
+ * PTEs automatically in a writable mapping.
+ */
+#define  MM_CP_TRY_CHANGE_WRITABLE	   (1UL << 0)
 /* Whether this protection change is for NUMA hints */
 #define  MM_CP_PROT_NUMA                   (1UL << 1)
 /* Whether this change is for write protecting */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ba5592655ee3..996a97e213ad 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -38,6 +38,39 @@
 
 #include "internal.h"
 
+static inline bool can_change_pte_writable(struct vm_area_struct *vma,
+					   unsigned long addr, pte_t pte)
+{
+	struct page *page;
+
+	VM_BUG_ON(!(vma->vm_flags & VM_WRITE) || pte_write(pte));
+
+	if (pte_protnone(pte) || !pte_dirty(pte))
+		return false;
+
+	/* Do we need write faults for softdirty tracking? */
+	if ((vma->vm_flags & VM_SOFTDIRTY) && !pte_soft_dirty(pte))
+		return false;
+
+	/* Do we need write faults for uffd-wp tracking? */
+	if (userfaultfd_pte_wp(vma, pte))
+		return false;
+
+	if (!(vma->vm_flags & VM_SHARED)) {
+		/*
+		 * We can only special-case on exclusive anonymous pages,
+		 * because we know that our write-fault handler similarly would
+		 * map them writable without any additional checks while holding
+		 * the PT lock.
+		 */
+		page = vm_normal_page(vma, addr, pte);
+		if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+			return false;
+	}
+
+	return true;
+}
+
 static unsigned long change_pte_range(struct mmu_gather *tlb,
 		struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
 		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
@@ -46,7 +79,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
 	spinlock_t *ptl;
 	unsigned long pages = 0;
 	int target_node = NUMA_NO_NODE;
-	bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT;
 	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
 	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
 	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
@@ -137,21 +169,27 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
 				ptent = pte_wrprotect(ptent);
 				ptent = pte_mkuffd_wp(ptent);
 			} else if (uffd_wp_resolve) {
-				/*
-				 * Leave the write bit to be handled
-				 * by PF interrupt handler, then
-				 * things like COW could be properly
-				 * handled.
-				 */
 				ptent = pte_clear_uffd_wp(ptent);
 			}
 
-			/* Avoid taking write faults for known dirty pages */
-			if (dirty_accountable && pte_dirty(ptent) &&
-					(pte_soft_dirty(ptent) ||
-					 !(vma->vm_flags & VM_SOFTDIRTY))) {
+			/*
+			 * In some writable, shared mappings, we might want
+			 * to catch actual write access -- see
+			 * vma_wants_writenotify().
+			 *
+			 * In all writable, private mappings, we have to
+			 * properly handle COW.
+			 *
+			 * In both cases, we can sometimes still change PTEs
+			 * writable and avoid the write-fault handler, for
+			 * example, if a PTE is already dirty and no other
+			 * COW or special handling is required.
+			 */
+			if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
+			    !pte_write(ptent) &&
+			    can_change_pte_writable(vma, addr, ptent))
 				ptent = pte_mkwrite(ptent);
-			}
+
 			ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
 			if (pte_needs_flush(oldpte, ptent))
 				tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
@@ -505,9 +543,9 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	unsigned long oldflags = vma->vm_flags;
 	long nrpages = (end - start) >> PAGE_SHIFT;
 	unsigned long charged = 0;
+	bool try_change_writable;
 	pgoff_t pgoff;
 	int error;
-	int dirty_accountable = 0;
 
 	if (newflags == oldflags) {
 		*pprev = vma;
@@ -583,11 +621,20 @@ success:
 	 * held in write mode.
 	 */
 	vma->vm_flags = newflags;
-	dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
+	/*
+	 * We want to check manually if we can change individual PTEs writable
+	 * if we can't do that automatically for all PTEs in a mapping. For
+	 * private mappings, that's always the case when we have write
+	 * permissions as we properly have to handle COW.
+	 */
+	if (vma->vm_flags & VM_SHARED)
+		try_change_writable = vma_wants_writenotify(vma, vma->vm_page_prot);
+	else
+		try_change_writable = !!(vma->vm_flags & VM_WRITE);
 	vma_set_page_prot(vma);
 
 	change_protection(tlb, vma, start, end, vma->vm_page_prot,
-			  dirty_accountable ? MM_CP_DIRTY_ACCT : 0);
+			  try_change_writable ? MM_CP_TRY_CHANGE_WRITABLE : 0);
 
 	/*
 	 * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
-- 
cgit 


From b8cecb9376b9d3031cf62b476a0db087b6b01072 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 17 Jun 2022 16:42:44 +0100
Subject: mm/vmscan: convert reclaim_clean_pages_from_list() to folios

Patch series "nvert much of vmscan to folios"

vmscan always operates on folios since it puts the pages on the LRU list.
Switching all of these functions from pages to folios saves 1483 bytes of
text from removing all the baggage around calling compound_page() and
similar functions.


This patch (of 5):

This is a straightforward conversion which removes several hidden calls
to compound_head, saving 330 bytes of kernel text.

Link: https://lkml.kernel.org/r/20220617154248.700416-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20220617154248.700416-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h |  6 ++++++
 mm/vmscan.c                | 22 +++++++++++-----------
 2 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index e66f7aa3191d..f32aade2a6e0 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -670,6 +670,12 @@ static __always_inline bool PageAnon(struct page *page)
 	return folio_test_anon(page_folio(page));
 }
 
+static __always_inline bool __folio_test_movable(const struct folio *folio)
+{
+	return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) ==
+			PAGE_MAPPING_MOVABLE;
+}
+
 static __always_inline int __PageMovable(struct page *page)
 {
 	return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 97ac6c6c026d..2ecca45672e2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2041,7 +2041,7 @@ keep:
 }
 
 unsigned int reclaim_clean_pages_from_list(struct zone *zone,
-					    struct list_head *page_list)
+					    struct list_head *folio_list)
 {
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
@@ -2049,16 +2049,16 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 	};
 	struct reclaim_stat stat;
 	unsigned int nr_reclaimed;
-	struct page *page, *next;
-	LIST_HEAD(clean_pages);
+	struct folio *folio, *next;
+	LIST_HEAD(clean_folios);
 	unsigned int noreclaim_flag;
 
-	list_for_each_entry_safe(page, next, page_list, lru) {
-		if (!PageHuge(page) && page_is_file_lru(page) &&
-		    !PageDirty(page) && !__PageMovable(page) &&
-		    !PageUnevictable(page)) {
-			ClearPageActive(page);
-			list_move(&page->lru, &clean_pages);
+	list_for_each_entry_safe(folio, next, folio_list, lru) {
+		if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) &&
+		    !folio_test_dirty(folio) && !__folio_test_movable(folio) &&
+		    !folio_test_unevictable(folio)) {
+			folio_clear_active(folio);
+			list_move(&folio->lru, &clean_folios);
 		}
 	}
 
@@ -2069,11 +2069,11 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 	 * change in the future.
 	 */
 	noreclaim_flag = memalloc_noreclaim_save();
-	nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
+	nr_reclaimed = shrink_page_list(&clean_folios, zone->zone_pgdat, &sc,
 					&stat, true);
 	memalloc_noreclaim_restore(noreclaim_flag);
 
-	list_splice(&clean_pages, page_list);
+	list_splice(&clean_folios, folio_list);
 	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
 			    -(long)nr_reclaimed);
 	/*
-- 
cgit 


From e3c4cebf3f9db8c9150eb1982da7e353d9938bed Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 17 Jun 2022 18:49:59 +0100
Subject: mm: add folios_put()

Patch series "Convert the swap code to be more folio-based".

There's still more to do with the swap code, but this reaps a lot of the
folio benefit.  More than 4kB of kernel text saved (with the UEK7 kernel
config).  I don't know how much that's going to translate into CPU
savings, but some of those compound_head() calls are on every page free,
so it should be noticable.  It might even be noticable just from an
I-cache consumption perspective.


This patch (of 22):

This is just a wrapper around release_pages() for now.  Place the
prototype in mm.h along with folio_put() and folio_put_refs().

Link: https://lkml.kernel.org/r/20220617175020.717127-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20220617175020.717127-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h      | 19 +++++++++++++++++++
 include/linux/pagemap.h |  2 --
 2 files changed, 19 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 09ea26056e2f..09670ccb94e7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1220,6 +1220,25 @@ static inline void folio_put_refs(struct folio *folio, int refs)
 		__put_page(&folio->page);
 }
 
+void release_pages(struct page **pages, int nr);
+
+/**
+ * folios_put - Decrement the reference count on an array of folios.
+ * @folios: The folios.
+ * @nr: How many folios there are.
+ *
+ * Like folio_put(), but for an array of folios.  This is more efficient
+ * than writing the loop yourself as it will optimise the locks which
+ * need to be taken if the folios are freed.
+ *
+ * Context: May be called in process or interrupt context, but not in NMI
+ * context.  May be called while holding a spinlock.
+ */
+static inline void folios_put(struct folio **folios, unsigned int nr)
+{
+	release_pages((struct page **)folios, nr);
+}
+
 static inline void put_page(struct page *page)
 {
 	struct folio *folio = page_folio(page);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index ce96866fbec4..c399a9c5da7d 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -345,8 +345,6 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping)
 #endif
 }
 
-void release_pages(struct page **pages, int nr);
-
 struct address_space *page_mapping(struct page *);
 struct address_space *folio_mapping(struct folio *);
 struct address_space *swapcache_mapping(struct folio *);
-- 
cgit 


From 7d80dd096f8f889128f67a2d452e4dadeed71e63 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 17 Jun 2022 18:50:01 +0100
Subject: mm/swap: make __pagevec_lru_add static

__pagevec_lru_add has no callers outside swap.c, so make it static,
and move it to a more logical position in the file.

Link: https://lkml.kernel.org/r/20220617175020.717127-4-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pagevec.h |   1 -
 mm/swap.c               | 126 ++++++++++++++++++++++++------------------------
 2 files changed, 63 insertions(+), 64 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 67b1246f136b..b0e3540f3a4c 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -26,7 +26,6 @@ struct pagevec {
 };
 
 void __pagevec_release(struct pagevec *pvec);
-void __pagevec_lru_add(struct pagevec *pvec);
 unsigned pagevec_lookup_range(struct pagevec *pvec,
 			      struct address_space *mapping,
 			      pgoff_t *start, pgoff_t end);
diff --git a/mm/swap.c b/mm/swap.c
index 4265bee41bbd..cab77a5c64c7 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -228,6 +228,69 @@ static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page)
 
 typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio);
 
+static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec)
+{
+	int was_unevictable = folio_test_clear_unevictable(folio);
+	long nr_pages = folio_nr_pages(folio);
+
+	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+
+	folio_set_lru(folio);
+	/*
+	 * Is an smp_mb__after_atomic() still required here, before
+	 * folio_evictable() tests PageMlocked, to rule out the possibility
+	 * of stranding an evictable folio on an unevictable LRU?  I think
+	 * not, because __munlock_page() only clears PageMlocked while the LRU
+	 * lock is held.
+	 *
+	 * (That is not true of __page_cache_release(), and not necessarily
+	 * true of release_pages(): but those only clear PageMlocked after
+	 * put_page_testzero() has excluded any other users of the page.)
+	 */
+	if (folio_evictable(folio)) {
+		if (was_unevictable)
+			__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
+	} else {
+		folio_clear_active(folio);
+		folio_set_unevictable(folio);
+		/*
+		 * folio->mlock_count = !!folio_test_mlocked(folio)?
+		 * But that leaves __mlock_page() in doubt whether another
+		 * actor has already counted the mlock or not.  Err on the
+		 * safe side, underestimate, let page reclaim fix it, rather
+		 * than leaving a page on the unevictable LRU indefinitely.
+		 */
+		folio->mlock_count = 0;
+		if (!was_unevictable)
+			__count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
+	}
+
+	lruvec_add_folio(lruvec, folio);
+	trace_mm_lru_insertion(folio);
+}
+
+/*
+ * Add the passed pages to the LRU, then drop the caller's refcount
+ * on them.  Reinitialises the caller's pagevec.
+ */
+static void __pagevec_lru_add(struct pagevec *pvec)
+{
+	int i;
+	struct lruvec *lruvec = NULL;
+	unsigned long flags = 0;
+
+	for (i = 0; i < pagevec_count(pvec); i++) {
+		struct folio *folio = page_folio(pvec->pages[i]);
+
+		lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
+		__pagevec_lru_add_fn(folio, lruvec);
+	}
+	if (lruvec)
+		unlock_page_lruvec_irqrestore(lruvec, flags);
+	release_pages(pvec->pages, pvec->nr);
+	pagevec_reinit(pvec);
+}
+
 static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
 {
 	int i;
@@ -1036,69 +1099,6 @@ void __pagevec_release(struct pagevec *pvec)
 }
 EXPORT_SYMBOL(__pagevec_release);
 
-static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec)
-{
-	int was_unevictable = folio_test_clear_unevictable(folio);
-	long nr_pages = folio_nr_pages(folio);
-
-	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
-
-	folio_set_lru(folio);
-	/*
-	 * Is an smp_mb__after_atomic() still required here, before
-	 * folio_evictable() tests PageMlocked, to rule out the possibility
-	 * of stranding an evictable folio on an unevictable LRU?  I think
-	 * not, because __munlock_page() only clears PageMlocked while the LRU
-	 * lock is held.
-	 *
-	 * (That is not true of __page_cache_release(), and not necessarily
-	 * true of release_pages(): but those only clear PageMlocked after
-	 * put_page_testzero() has excluded any other users of the page.)
-	 */
-	if (folio_evictable(folio)) {
-		if (was_unevictable)
-			__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
-	} else {
-		folio_clear_active(folio);
-		folio_set_unevictable(folio);
-		/*
-		 * folio->mlock_count = !!folio_test_mlocked(folio)?
-		 * But that leaves __mlock_page() in doubt whether another
-		 * actor has already counted the mlock or not.  Err on the
-		 * safe side, underestimate, let page reclaim fix it, rather
-		 * than leaving a page on the unevictable LRU indefinitely.
-		 */
-		folio->mlock_count = 0;
-		if (!was_unevictable)
-			__count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
-	}
-
-	lruvec_add_folio(lruvec, folio);
-	trace_mm_lru_insertion(folio);
-}
-
-/*
- * Add the passed pages to the LRU, then drop the caller's refcount
- * on them.  Reinitialises the caller's pagevec.
- */
-void __pagevec_lru_add(struct pagevec *pvec)
-{
-	int i;
-	struct lruvec *lruvec = NULL;
-	unsigned long flags = 0;
-
-	for (i = 0; i < pagevec_count(pvec); i++) {
-		struct folio *folio = page_folio(pvec->pages[i]);
-
-		lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
-		__pagevec_lru_add_fn(folio, lruvec);
-	}
-	if (lruvec)
-		unlock_page_lruvec_irqrestore(lruvec, flags);
-	release_pages(pvec->pages, pvec->nr);
-	pagevec_reinit(pvec);
-}
-
 /**
  * folio_batch_remove_exceptionals() - Prune non-folios from a batch.
  * @fbatch: The batch to prune
-- 
cgit 


From 8d29c7036f5ff360ea1f51b9fed5d909be7c8094 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 17 Jun 2022 18:50:13 +0100
Subject: mm/swap: convert __put_page() to __folio_put()

Saves 11 bytes of text by removing a check of PageTail.

Link: https://lkml.kernel.org/r/20220617175020.717127-16-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h   |  6 +++---
 mm/swap.c            | 14 +++++++-------
 net/core/page_pool.c |  2 +-
 3 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 09670ccb94e7..3fb49aec13fd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -855,7 +855,7 @@ static inline struct folio *virt_to_folio(const void *x)
 	return page_folio(page);
 }
 
-void __put_page(struct page *page);
+void __folio_put(struct folio *folio);
 
 void put_pages_list(struct list_head *pages);
 
@@ -1197,7 +1197,7 @@ static inline __must_check bool try_get_page(struct page *page)
 static inline void folio_put(struct folio *folio)
 {
 	if (folio_put_testzero(folio))
-		__put_page(&folio->page);
+		__folio_put(folio);
 }
 
 /**
@@ -1217,7 +1217,7 @@ static inline void folio_put(struct folio *folio)
 static inline void folio_put_refs(struct folio *folio, int refs)
 {
 	if (folio_ref_sub_and_test(folio, refs))
-		__put_page(&folio->page);
+		__folio_put(folio);
 }
 
 void release_pages(struct page **pages, int nr);
diff --git a/mm/swap.c b/mm/swap.c
index a5a91aec83da..d09e9ac53809 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -119,16 +119,16 @@ static void __put_compound_page(struct page *page)
 	destroy_compound_page(page);
 }
 
-void __put_page(struct page *page)
+void __folio_put(struct folio *folio)
 {
-	if (unlikely(is_zone_device_page(page)))
-		free_zone_device_page(page);
-	else if (unlikely(PageCompound(page)))
-		__put_compound_page(page);
+	if (unlikely(folio_is_zone_device(folio)))
+		free_zone_device_page(&folio->page);
+	else if (unlikely(folio_test_large(folio)))
+		__put_compound_page(&folio->page);
 	else
-		__put_single_page(page);
+		__put_single_page(&folio->page);
 }
-EXPORT_SYMBOL(__put_page);
+EXPORT_SYMBOL(__folio_put);
 
 /**
  * put_pages_list() - release a list of pages
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index f18e6e771993..db70e94c8df2 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -16,7 +16,7 @@
 #include <linux/dma-direction.h>
 #include <linux/dma-mapping.h>
 #include <linux/page-flags.h>
-#include <linux/mm.h> /* for __put_page() */
+#include <linux/mm.h> /* for put_page() */
 #include <linux/poison.h>
 #include <linux/ethtool.h>
 
-- 
cgit 


From 5375336c8c42a343c3b440b6f1e21c65e7b174b9 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 17 Jun 2022 18:50:17 +0100
Subject: mm: convert destroy_compound_page() to destroy_large_folio()

All callers now have a folio, so push the folio->page conversion
down to this function.

[akpm@linux-foundation.org: uninline destroy_large_folio() to fix build issue]
Link: https://lkml.kernel.org/r/20220617175020.717127-20-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 6 +-----
 mm/page_alloc.c    | 8 ++++++++
 mm/swap.c          | 2 +-
 mm/vmscan.c        | 4 ++--
 4 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3fb49aec13fd..9cc02a7e503b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -892,11 +892,7 @@ static inline void set_compound_page_dtor(struct page *page,
 	page[1].compound_dtor = compound_dtor;
 }
 
-static inline void destroy_compound_page(struct page *page)
-{
-	VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page);
-	compound_page_dtors[page[1].compound_dtor](page);
-}
+void destroy_large_folio(struct folio *folio);
 
 static inline int head_compound_pincount(struct page *head)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 248469134962..52fd92b2c1fe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -744,6 +744,14 @@ void prep_compound_page(struct page *page, unsigned int order)
 	prep_compound_head(page, order);
 }
 
+void destroy_large_folio(struct folio *folio)
+{
+	enum compound_dtor_id dtor = folio_page(folio, 1)->compound_dtor;
+
+	VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio);
+	compound_page_dtors[dtor](&folio->page);
+}
+
 #ifdef CONFIG_DEBUG_PAGEALLOC
 unsigned int _debug_guardpage_minorder;
 
diff --git a/mm/swap.c b/mm/swap.c
index 5f6caa651599..1f563d857768 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -115,7 +115,7 @@ static void __folio_put_large(struct folio *folio)
 	 */
 	if (!folio_test_hugetlb(folio))
 		__page_cache_release(folio);
-	destroy_compound_page(&folio->page);
+	destroy_large_folio(folio);
 }
 
 void __folio_put(struct folio *folio)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e7d3db64a4e0..e660d7205f47 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1979,7 +1979,7 @@ free_it:
 		 * appear not as the counts should be low
 		 */
 		if (unlikely(folio_test_large(folio)))
-			destroy_compound_page(&folio->page);
+			destroy_large_folio(folio);
 		else
 			list_add(&folio->lru, &free_pages);
 		continue;
@@ -2348,7 +2348,7 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec,
 
 			if (unlikely(folio_test_large(folio))) {
 				spin_unlock_irq(&lruvec->lru_lock);
-				destroy_compound_page(&folio->page);
+				destroy_large_folio(folio);
 				spin_lock_irq(&lruvec->lru_lock);
 			} else
 				list_add(&folio->lru, &folios_to_free);
-- 
cgit 


From ed7802dd48f7a507213cbb95bb4c6f1fe134eb5d Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Fri, 17 Jun 2022 21:56:49 +0800
Subject: mm: memory_hotplug: enumerate all supported section flags

Patch series "make hugetlb_optimize_vmemmap compatible with
memmap_on_memory", v3.

This series makes hugetlb_optimize_vmemmap compatible with
memmap_on_memory.


This patch (of 2):

We are almost running out of section flags, only one bit is available in
the worst case (powerpc with 256k pages).  However, there are still some
free bits (in ->section_mem_map) on other architectures (e.g.  x86_64 has
10 bits available, arm64 has 8 bits available with worst case of 64K
pages).  We have hard coded those numbers in code, it is inconvenient to
use those bits on other architectures except powerpc.  So transfer those
section flags to enumeration to make it easy to add new section flags in
the future.  Also, move SECTION_TAINT_ZONE_DEVICE into the scope of
CONFIG_ZONE_DEVICE to save a bit on non-zone-device case.

[songmuchun@bytedance.com: replace enum with defines per David]
  Link: https://lkml.kernel.org/r/20220620110616.12056-2-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20220617135650.74901-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20220617135650.74901-2-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 41 ++++++++++++++++++++++++++++++++---------
 mm/memory_hotplug.c    |  6 ++++++
 mm/sparse.c            |  2 +-
 3 files changed, 39 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index aab70355d64f..2b5757752333 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1418,16 +1418,32 @@ extern size_t mem_section_usage_size(void);
  *      (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the
  *      worst combination is powerpc with 256k pages,
  *      which results in PFN_SECTION_SHIFT equal 6.
- * To sum it up, at least 6 bits are available.
+ * To sum it up, at least 6 bits are available on all architectures.
+ * However, we can exceed 6 bits on some other architectures except
+ * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available
+ * with the worst case of 64K pages on arm64) if we make sure the
+ * exceeded bit is not applicable to powerpc.
  */
-#define SECTION_MARKED_PRESENT		(1UL<<0)
-#define SECTION_HAS_MEM_MAP		(1UL<<1)
-#define SECTION_IS_ONLINE		(1UL<<2)
-#define SECTION_IS_EARLY		(1UL<<3)
-#define SECTION_TAINT_ZONE_DEVICE	(1UL<<4)
-#define SECTION_MAP_LAST_BIT		(1UL<<5)
-#define SECTION_MAP_MASK		(~(SECTION_MAP_LAST_BIT-1))
-#define SECTION_NID_SHIFT		6
+enum {
+	SECTION_MARKED_PRESENT_BIT,
+	SECTION_HAS_MEM_MAP_BIT,
+	SECTION_IS_ONLINE_BIT,
+	SECTION_IS_EARLY_BIT,
+#ifdef CONFIG_ZONE_DEVICE
+	SECTION_TAINT_ZONE_DEVICE_BIT,
+#endif
+	SECTION_MAP_LAST_BIT,
+};
+
+#define SECTION_MARKED_PRESENT		BIT(SECTION_MARKED_PRESENT_BIT)
+#define SECTION_HAS_MEM_MAP		BIT(SECTION_HAS_MEM_MAP_BIT)
+#define SECTION_IS_ONLINE		BIT(SECTION_IS_ONLINE_BIT)
+#define SECTION_IS_EARLY		BIT(SECTION_IS_EARLY_BIT)
+#ifdef CONFIG_ZONE_DEVICE
+#define SECTION_TAINT_ZONE_DEVICE	BIT(SECTION_TAINT_ZONE_DEVICE_BIT)
+#endif
+#define SECTION_MAP_MASK		(~(BIT(SECTION_MAP_LAST_BIT) - 1))
+#define SECTION_NID_SHIFT		SECTION_MAP_LAST_BIT
 
 static inline struct page *__section_mem_map_addr(struct mem_section *section)
 {
@@ -1466,12 +1482,19 @@ static inline int online_section(struct mem_section *section)
 	return (section && (section->section_mem_map & SECTION_IS_ONLINE));
 }
 
+#ifdef CONFIG_ZONE_DEVICE
 static inline int online_device_section(struct mem_section *section)
 {
 	unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE;
 
 	return section && ((section->section_mem_map & flags) == flags);
 }
+#else
+static inline int online_device_section(struct mem_section *section)
+{
+	return 0;
+}
+#endif
 
 static inline int online_section_nr(unsigned long nr)
 {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 84990a14d51a..a2a6d280054f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -670,12 +670,18 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon
 
 }
 
+#ifdef CONFIG_ZONE_DEVICE
 static void section_taint_zone_device(unsigned long pfn)
 {
 	struct mem_section *ms = __pfn_to_section(pfn);
 
 	ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE;
 }
+#else
+static inline void section_taint_zone_device(unsigned long pfn)
+{
+}
+#endif
 
 /*
  * Associate the pfn range with the given zone, initializing the memmaps
diff --git a/mm/sparse.c b/mm/sparse.c
index cb3bfae64036..e5a8a3a0edd7 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -281,7 +281,7 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p
 {
 	unsigned long coded_mem_map =
 		(unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
-	BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
+	BUILD_BUG_ON(SECTION_MAP_LAST_BIT > PFN_SECTION_SHIFT);
 	BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
 	return coded_mem_map;
 }
-- 
cgit 


From 66361095129b3b5d065e6c09cf0c085ef4a8c40f Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Fri, 17 Jun 2022 21:56:50 +0800
Subject: mm: memory_hotplug: make hugetlb_optimize_vmemmap compatible with
 memmap_on_memory

For now, the feature of hugetlb_free_vmemmap is not compatible with the
feature of memory_hotplug.memmap_on_memory, and hugetlb_free_vmemmap takes
precedence over memory_hotplug.memmap_on_memory.  However, someone wants
to make memory_hotplug.memmap_on_memory takes precedence over
hugetlb_free_vmemmap since memmap_on_memory makes it more likely to
succeed memory hotplug in close-to-OOM situations.  So the decision of
making hugetlb_free_vmemmap take precedence is not wise and elegant.

The proper approach is to have hugetlb_vmemmap.c do the check whether the
section which the HugeTLB pages belong to can be optimized.  If the
section's vmemmap pages are allocated from the added memory block itself,
hugetlb_free_vmemmap should refuse to optimize the vmemmap, otherwise, do
the optimization.  Then both kernel parameters are compatible.  So this
patch introduces VmemmapSelfHosted to mask any non-optimizable vmemmap
pages.  The hugetlb_vmemmap can use this flag to detect if a vmemmap page
can be optimized.

[songmuchun@bytedance.com: walk vmemmap page tables to avoid false-positive]
  Link: https://lkml.kernel.org/r/20220620110616.12056-3-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20220617135650.74901-3-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Co-developed-by: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt | 22 ++++-----
 Documentation/admin-guide/sysctl/vm.rst         |  5 +-
 include/linux/memory_hotplug.h                  |  9 ----
 include/linux/page-flags.h                      | 11 +++++
 mm/hugetlb_vmemmap.c                            | 66 ++++++++++++++++++++++---
 mm/memory_hotplug.c                             | 27 +++++-----
 6 files changed, 93 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 8c0ea6b6c6a9..2cacd4f8deb7 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1722,9 +1722,11 @@
 			Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y,
 			the default is on.
 
-			This is not compatible with memory_hotplug.memmap_on_memory.
-			If both parameters are enabled, hugetlb_free_vmemmap takes
-			precedence over memory_hotplug.memmap_on_memory.
+			Note that the vmemmap pages may be allocated from the added
+			memory block itself when memory_hotplug.memmap_on_memory is
+			enabled, those vmemmap pages cannot be optimized even if this
+			feature is enabled.  Other vmemmap pages not allocated from
+			the added memory block itself do not be affected.
 
 	hung_task_panic=
 			[KNL] Should the hung task detector generate panics.
@@ -3068,10 +3070,12 @@
 			[KNL,X86,ARM] Boolean flag to enable this feature.
 			Format: {on | off (default)}
 			When enabled, runtime hotplugged memory will
-			allocate its internal metadata (struct pages)
-			from the hotadded memory which will allow to
-			hotadd a lot of memory without requiring
-			additional memory to do so.
+			allocate its internal metadata (struct pages,
+			those vmemmap pages cannot be optimized even
+			if hugetlb_free_vmemmap is enabled) from the
+			hotadded memory which will allow to hotadd a
+			lot of memory without requiring additional
+			memory to do so.
 			This feature is disabled by default because it
 			has some implication on large (e.g. GB)
 			allocations in some configurations (e.g. small
@@ -3081,10 +3085,6 @@
 			Note that even when enabled, there are a few cases where
 			the feature is not effective.
 
-			This is not compatible with hugetlb_free_vmemmap. If
-			both parameters are enabled, hugetlb_free_vmemmap takes
-			precedence over memory_hotplug.memmap_on_memory.
-
 	memtest=	[KNL,X86,ARM,M68K,PPC,RISCV] Enable memtest
 			Format: <integer>
 			default : 0 <disable>
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 4a440a7cfeb0..f74f722ad702 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -565,9 +565,8 @@ See Documentation/admin-guide/mm/hugetlbpage.rst
 hugetlb_optimize_vmemmap
 ========================
 
-This knob is not available when memory_hotplug.memmap_on_memory (kernel parameter)
-is configured or the size of 'struct page' (a structure defined in
-include/linux/mm_types.h) is not power of two (an unusual system config could
+This knob is not available when the size of 'struct page' (a structure defined
+in include/linux/mm_types.h) is not power of two (an unusual system config could
 result in this).
 
 Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 20d7edf62a6a..e0b2209ab71c 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -351,13 +351,4 @@ void arch_remove_linear_mapping(u64 start, u64 size);
 extern bool mhp_supports_memmap_on_memory(unsigned long size);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
-#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
-bool mhp_memmap_on_memory(void);
-#else
-static inline bool mhp_memmap_on_memory(void)
-{
-	return false;
-}
-#endif
-
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index f32aade2a6e0..82719d33c0f1 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -193,6 +193,11 @@ enum pageflags {
 
 	/* Only valid for buddy pages. Used to track pages that are reported */
 	PG_reported = PG_uptodate,
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+	/* For self-hosted memmap pages */
+	PG_vmemmap_self_hosted = PG_owner_priv_1,
+#endif
 };
 
 #define PAGEFLAGS_MASK		((1UL << NR_PAGEFLAGS) - 1)
@@ -628,6 +633,12 @@ PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison)
  */
 __PAGEFLAG(Reported, reported, PF_NO_COMPOUND)
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+PAGEFLAG(VmemmapSelfHosted, vmemmap_self_hosted, PF_ANY)
+#else
+PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted)
+#endif
+
 /*
  * On an anonymous page mapped into a user virtual memory area,
  * page->mapping points to its anon_vma, not to a struct address_space;
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index ba29c15c53d6..1362feb3c6c9 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -10,7 +10,7 @@
  */
 #define pr_fmt(fmt)	"HugeTLB: " fmt
 
-#include <linux/memory_hotplug.h>
+#include <linux/memory.h>
 #include "hugetlb_vmemmap.h"
 
 /*
@@ -97,18 +97,68 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
 	return ret;
 }
 
+static unsigned int vmemmap_optimizable_pages(struct hstate *h,
+					      struct page *head)
+{
+	if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
+		return 0;
+
+	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
+		pmd_t *pmdp, pmd;
+		struct page *vmemmap_page;
+		unsigned long vaddr = (unsigned long)head;
+
+		/*
+		 * Only the vmemmap page's vmemmap page can be self-hosted.
+		 * Walking the page tables to find the backing page of the
+		 * vmemmap page.
+		 */
+		pmdp = pmd_off_k(vaddr);
+		/*
+		 * The READ_ONCE() is used to stabilize *pmdp in a register or
+		 * on the stack so that it will stop changing under the code.
+		 * The only concurrent operation where it can be changed is
+		 * split_vmemmap_huge_pmd() (*pmdp will be stable after this
+		 * operation).
+		 */
+		pmd = READ_ONCE(*pmdp);
+		if (pmd_leaf(pmd))
+			vmemmap_page = pmd_page(pmd) + pte_index(vaddr);
+		else
+			vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr));
+		/*
+		 * Due to HugeTLB alignment requirements and the vmemmap pages
+		 * being at the start of the hotplugged memory region in
+		 * memory_hotplug.memmap_on_memory case. Checking any vmemmap
+		 * page's vmemmap page if it is marked as VmemmapSelfHosted is
+		 * sufficient.
+		 *
+		 * [                  hotplugged memory                  ]
+		 * [        section        ][...][        section        ]
+		 * [ vmemmap ][              usable memory               ]
+		 *   ^   |     |                                        |
+		 *   +---+     |                                        |
+		 *     ^       |                                        |
+		 *     +-------+                                        |
+		 *          ^                                           |
+		 *          +-------------------------------------------+
+		 */
+		if (PageVmemmapSelfHosted(vmemmap_page))
+			return 0;
+	}
+
+	return hugetlb_optimize_vmemmap_pages(h);
+}
+
 void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
 {
 	unsigned long vmemmap_addr = (unsigned long)head;
 	unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
 
-	vmemmap_pages = hugetlb_optimize_vmemmap_pages(h);
+	vmemmap_pages = vmemmap_optimizable_pages(h, head);
 	if (!vmemmap_pages)
 		return;
 
-	if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
-		return;
-
 	static_branch_inc(&hugetlb_optimize_vmemmap_key);
 
 	vmemmap_addr	+= RESERVE_VMEMMAP_SIZE;
@@ -199,10 +249,10 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = {
 static __init int hugetlb_vmemmap_sysctls_init(void)
 {
 	/*
-	 * If "memory_hotplug.memmap_on_memory" is enabled or "struct page"
-	 * crosses page boundaries, the vmemmap pages cannot be optimized.
+	 * If "struct page" crosses page boundaries, the vmemmap pages cannot
+	 * be optimized.
 	 */
-	if (!mhp_memmap_on_memory() && is_power_of_2(sizeof(struct page)))
+	if (is_power_of_2(sizeof(struct page)))
 		register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
 
 	return 0;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a2a6d280054f..99ecb2b3ff53 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -43,30 +43,22 @@
 #include "shuffle.h"
 
 #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
-static int memmap_on_memory_set(const char *val, const struct kernel_param *kp)
-{
-	if (hugetlb_optimize_vmemmap_enabled())
-		return 0;
-	return param_set_bool(val, kp);
-}
-
-static const struct kernel_param_ops memmap_on_memory_ops = {
-	.flags	= KERNEL_PARAM_OPS_FL_NOARG,
-	.set	= memmap_on_memory_set,
-	.get	= param_get_bool,
-};
-
 /*
  * memory_hotplug.memmap_on_memory parameter
  */
 static bool memmap_on_memory __ro_after_init;
-module_param_cb(memmap_on_memory, &memmap_on_memory_ops, &memmap_on_memory, 0444);
+module_param(memmap_on_memory, bool, 0444);
 MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
 
-bool mhp_memmap_on_memory(void)
+static inline bool mhp_memmap_on_memory(void)
 {
 	return memmap_on_memory;
 }
+#else
+static inline bool mhp_memmap_on_memory(void)
+{
+	return false;
+}
 #endif
 
 enum {
@@ -1035,7 +1027,7 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
 			      struct zone *zone)
 {
 	unsigned long end_pfn = pfn + nr_pages;
-	int ret;
+	int ret, i;
 
 	ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
 	if (ret)
@@ -1043,6 +1035,9 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
 
 	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
 
+	for (i = 0; i < nr_pages; i++)
+		SetPageVmemmapSelfHosted(pfn_to_page(pfn + i));
+
 	/*
 	 * It might be that the vmemmap_pages fully span sections. If that is
 	 * the case, mark those sections online here as otherwise they will be
-- 
cgit 


From e8da368a1e42a8056d1a6b419e1b91b6cf11d77e Mon Sep 17 00:00:00 2001
From: Yun-Ze Li <p76091292@gs.ncku.edu.tw>
Date: Mon, 20 Jun 2022 07:15:16 +0000
Subject: mm, docs: fix comments that mention mem_hotplug_end()

Comments that mention mem_hotplug_end() are confusing as there is no
function called mem_hotplug_end().  Fix them by replacing all the
occurences of mem_hotplug_end() in the comments with mem_hotplug_done().

[akpm@linux-foundation.org: grammatical fixes]
Link: https://lkml.kernel.org/r/20220620071516.1286101-1-p76091292@gs.ncku.edu.tw
Signed-off-by: Yun-Ze Li <p76091292@gs.ncku.edu.tw>
Cc: Souptick Joarder <jrdr.linux@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 6 +++---
 mm/compaction.c        | 2 +-
 mm/vmscan.c            | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2b5757752333..735bf5b37949 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -591,8 +591,8 @@ struct zone {
 	 * give them a chance of being in the same cacheline.
 	 *
 	 * Write access to present_pages at runtime should be protected by
-	 * mem_hotplug_begin/end(). Any reader who can't tolerant drift of
-	 * present_pages should get_online_mems() to get a stable value.
+	 * mem_hotplug_begin/done(). Any reader who can't tolerant drift of
+	 * present_pages should use get_online_mems() to get a stable value.
 	 */
 	atomic_long_t		managed_pages;
 	unsigned long		spanned_pages;
@@ -870,7 +870,7 @@ typedef struct pglist_data {
 	unsigned long nr_reclaim_start;	/* nr pages written while throttled
 					 * when throttling started. */
 	struct task_struct *kswapd;	/* Protected by
-					   mem_hotplug_begin/end() */
+					   mem_hotplug_begin/done() */
 	int kswapd_order;
 	enum zone_type kswapd_highest_zoneidx;
 
diff --git a/mm/compaction.c b/mm/compaction.c
index 1f89b969c12b..cd029ab03d0e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -3011,7 +3011,7 @@ void kcompactd_run(int nid)
 
 /*
  * Called by memory hotplug when all memory in a node is offlined. Caller must
- * hold mem_hotplug_begin/end().
+ * be holding mem_hotplug_begin/done().
  */
 void kcompactd_stop(int nid)
 {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 161096d9311a..f58761cea0a0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4645,7 +4645,7 @@ void kswapd_run(int nid)
 
 /*
  * Called by memory hotplug when all memory in a node is offlined.  Caller must
- * hold mem_hotplug_begin/end().
+ * be holding mem_hotplug_begin/done().
  */
 void kswapd_stop(int nid)
 {
-- 
cgit 


From 18f3962953e40401b7ed98e8524167282c3e626e Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Sun, 26 Jun 2022 22:57:17 +0800
Subject: mm: hugetlb: kill set_huge_swap_pte_at()

Commit e5251fd43007 ("mm/hugetlb: introduce set_huge_swap_pte_at()
helper") add set_huge_swap_pte_at() to handle swap entries on
architectures that support hugepages consisting of contiguous ptes.  And
currently the set_huge_swap_pte_at() is only overridden by arm64.

set_huge_swap_pte_at() provide a sz parameter to help determine the number
of entries to be updated.  But in fact, all hugetlb swap entries contain
pfn information, so we can find the corresponding folio through the pfn
recorded in the swap entry, then the folio_size() is the number of entries
that need to be updated.

And considering that users will easily cause bugs by ignoring the
difference between set_huge_swap_pte_at() and set_huge_pte_at().  Let's
handle swap entries in set_huge_pte_at() and remove the
set_huge_swap_pte_at(), then we can call set_huge_pte_at() anywhere, which
simplifies our coding.

Link: https://lkml.kernel.org/r/20220626145717.53572-1-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Muchun Song <songmuchun@bytedance.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/include/asm/hugetlb.h |  3 ---
 arch/arm64/mm/hugetlbpage.c      | 34 +++++++++++++++++-----------------
 include/linux/hugetlb.h          | 13 -------------
 mm/hugetlb.c                     |  8 +++-----
 mm/rmap.c                        | 11 +++--------
 5 files changed, 23 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 1fd2846dbefe..d20f5da2d76f 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -46,9 +46,6 @@ extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
 			   pte_t *ptep, unsigned long sz);
 #define __HAVE_ARCH_HUGE_PTEP_GET
 extern pte_t huge_ptep_get(pte_t *ptep);
-extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
-				 pte_t *ptep, pte_t pte, unsigned long sz);
-#define set_huge_swap_pte_at set_huge_swap_pte_at
 
 void __init arm64_hugetlb_cma_reserve(void);
 
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index e2a5ec9fdc0d..3be8f25aa5be 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -238,6 +238,13 @@ static void clear_flush(struct mm_struct *mm,
 	flush_tlb_range(&vma, saddr, addr);
 }
 
+static inline struct folio *hugetlb_swap_entry_to_folio(swp_entry_t entry)
+{
+	VM_BUG_ON(!is_migration_entry(entry) && !is_hwpoison_entry(entry));
+
+	return page_folio(pfn_to_page(swp_offset(entry)));
+}
+
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 			    pte_t *ptep, pte_t pte)
 {
@@ -247,11 +254,16 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 	unsigned long pfn, dpfn;
 	pgprot_t hugeprot;
 
-	/*
-	 * Code needs to be expanded to handle huge swap and migration
-	 * entries. Needed for HUGETLB and MEMORY_FAILURE.
-	 */
-	WARN_ON(!pte_present(pte));
+	if (!pte_present(pte)) {
+		struct folio *folio;
+
+		folio = hugetlb_swap_entry_to_folio(pte_to_swp_entry(pte));
+		ncontig = num_contig_ptes(folio_size(folio), &pgsize);
+
+		for (i = 0; i < ncontig; i++, ptep++)
+			set_pte_at(mm, addr, ptep, pte);
+		return;
+	}
 
 	if (!pte_cont(pte)) {
 		set_pte_at(mm, addr, ptep, pte);
@@ -269,18 +281,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 		set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot));
 }
 
-void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
-			  pte_t *ptep, pte_t pte, unsigned long sz)
-{
-	int i, ncontig;
-	size_t pgsize;
-
-	ncontig = num_contig_ptes(sz, &pgsize);
-
-	for (i = 0; i < ncontig; i++, ptep++)
-		set_pte(ptep, pte);
-}
-
 pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 		      unsigned long addr, unsigned long sz)
 {
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 756b66ff025e..c6cccfaf8708 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -903,14 +903,6 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
 	atomic_long_sub(l, &mm->hugetlb_usage);
 }
 
-#ifndef set_huge_swap_pte_at
-static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
-					pte_t *ptep, pte_t pte, unsigned long sz)
-{
-	set_huge_pte_at(mm, addr, ptep, pte);
-}
-#endif
-
 #ifndef huge_ptep_modify_prot_start
 #define huge_ptep_modify_prot_start huge_ptep_modify_prot_start
 static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
@@ -1094,11 +1086,6 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
 {
 }
 
-static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
-					pte_t *ptep, pte_t pte, unsigned long sz)
-{
-}
-
 static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
 					  unsigned long addr, pte_t *ptep)
 {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 65454896f174..064da8ffbac6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4798,12 +4798,11 @@ again:
 				entry = swp_entry_to_pte(swp_entry);
 				if (userfaultfd_wp(src_vma) && uffd_wp)
 					entry = huge_pte_mkuffd_wp(entry);
-				set_huge_swap_pte_at(src, addr, src_pte,
-						     entry, sz);
+				set_huge_pte_at(src, addr, src_pte, entry);
 			}
 			if (!userfaultfd_wp(dst_vma) && uffd_wp)
 				entry = huge_pte_clear_uffd_wp(entry);
-			set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
+			set_huge_pte_at(dst, addr, dst_pte, entry);
 		} else if (unlikely(is_pte_marker(entry))) {
 			/*
 			 * We copy the pte marker only if the dst vma has
@@ -6344,8 +6343,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 					newpte = pte_swp_mkuffd_wp(newpte);
 				else if (uffd_wp_resolve)
 					newpte = pte_swp_clear_uffd_wp(newpte);
-				set_huge_swap_pte_at(mm, address, ptep,
-						     newpte, psize);
+				set_huge_pte_at(mm, address, ptep, newpte);
 				pages++;
 			}
 			spin_unlock(ptl);
diff --git a/mm/rmap.c b/mm/rmap.c
index 56134cdc5ca3..83172ee0ea35 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1618,9 +1618,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
 			if (folio_test_hugetlb(folio)) {
 				hugetlb_count_sub(folio_nr_pages(folio), mm);
-				set_huge_swap_pte_at(mm, address,
-						     pvmw.pte, pteval,
-						     vma_mmu_pagesize(vma));
+				set_huge_pte_at(mm, address, pvmw.pte, pteval);
 			} else {
 				dec_mm_counter(mm, mm_counter(&folio->page));
 				set_pte_at(mm, address, pvmw.pte, pteval);
@@ -2004,9 +2002,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
 			if (folio_test_hugetlb(folio)) {
 				hugetlb_count_sub(folio_nr_pages(folio), mm);
-				set_huge_swap_pte_at(mm, address,
-						     pvmw.pte, pteval,
-						     vma_mmu_pagesize(vma));
+				set_huge_pte_at(mm, address, pvmw.pte, pteval);
 			} else {
 				dec_mm_counter(mm, mm_counter(&folio->page));
 				set_pte_at(mm, address, pvmw.pte, pteval);
@@ -2074,8 +2070,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 			if (pte_uffd_wp(pteval))
 				swp_pte = pte_swp_mkuffd_wp(swp_pte);
 			if (folio_test_hugetlb(folio))
-				set_huge_swap_pte_at(mm, address, pvmw.pte,
-						     swp_pte, vma_mmu_pagesize(vma));
+				set_huge_pte_at(mm, address, pvmw.pte, swp_pte);
 			else
 				set_pte_at(mm, address, pvmw.pte, swp_pte);
 			trace_set_migration_pte(address, pte_val(swp_pte),
-- 
cgit 


From 1baec203b77cafa24610b5c9ae7a2aa380d74ef6 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Sat, 25 Jun 2022 17:28:16 +0800
Subject: mm/khugepaged: try to free transhuge swapcache when possible

Transhuge swapcaches won't be freed in __collapse_huge_page_copy().  It's
because release_pte_page() is not called for these pages and thus
free_page_and_swap_cache can't grab the page lock.  These pages won't be
freed from swap cache even if we are the only user until next time
reclaim.  It shouldn't hurt indeed, but we could try to free these pages
to save more memory for system.

Link: https://lkml.kernel.org/r/20220625092816.4856-8-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: NeilBrown <neilb@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Zach O'Keefe <zokeefe@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h | 5 +++++
 mm/khugepaged.c      | 7 ++++++-
 mm/swap.h            | 5 -----
 3 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 95a5b7aa1ae9..6d11c51b2b62 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -455,6 +455,7 @@ static inline unsigned long total_swapcache_pages(void)
 	return global_node_page_state(NR_SWAPCACHE);
 }
 
+extern void free_swap_cache(struct page *page);
 extern void free_page_and_swap_cache(struct page *);
 extern void free_pages_and_swap_cache(struct page **, int);
 /* linux/mm/swapfile.c */
@@ -539,6 +540,10 @@ static inline void put_swap_device(struct swap_info_struct *si)
 /* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */
 #define free_swap_and_cache(e) is_pfn_swap_entry(e)
 
+static inline void free_swap_cache(struct page *page)
+{
+}
+
 static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
 {
 	return 0;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 08e885f28def..01e0d6336754 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -755,7 +755,12 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 
 	list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
 		list_del(&src_page->lru);
-		release_pte_page(src_page);
+		mod_node_page_state(page_pgdat(src_page),
+				    NR_ISOLATED_ANON + page_is_file_lru(src_page),
+				    -compound_nr(src_page));
+		unlock_page(src_page);
+		free_swap_cache(src_page);
+		putback_lru_page(src_page);
 	}
 }
 
diff --git a/mm/swap.h b/mm/swap.h
index fa0816af4712..17936e068c1c 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -41,7 +41,6 @@ void __delete_from_swap_cache(struct folio *folio,
 void delete_from_swap_cache(struct folio *folio);
 void clear_shadow_from_swap_cache(int type, unsigned long begin,
 				  unsigned long end);
-void free_swap_cache(struct page *page);
 struct page *lookup_swap_cache(swp_entry_t entry,
 			       struct vm_area_struct *vma,
 			       unsigned long addr);
@@ -81,10 +80,6 @@ static inline struct address_space *swap_address_space(swp_entry_t entry)
 	return NULL;
 }
 
-static inline void free_swap_cache(struct page *page)
-{
-}
-
 static inline void show_swap_cache_info(void)
 {
 }
-- 
cgit 


From 1fcf54deb767d474181ad7cf33c92bb2a33607fb Mon Sep 17 00:00:00 2001
From: Josh Don <joshdon@google.com>
Date: Wed, 29 Jun 2022 14:14:26 -0700
Subject: sched/core: add forced idle accounting for cgroups

4feee7d1260 previously added per-task forced idle accounting. This patch
extends this to also include cgroups.

rstat is used for cgroup accounting, except for the root, which uses
kcpustat in order to bypass the need for doing an rstat flush when
reading root stats.

Only cgroup v2 is supported. Similar to the task accounting, the cgroup
accounting requires that schedstats is enabled.

Signed-off-by: Josh Don <joshdon@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lkml.kernel.org/r/20220629211426.3329954-1-joshdon@google.com
---
 include/linux/cgroup-defs.h |  4 ++++
 include/linux/kernel_stat.h |  7 +++++++
 kernel/cgroup/rstat.c       | 44 ++++++++++++++++++++++++++++++++++++++------
 kernel/sched/core_sched.c   |  6 +++++-
 kernel/sched/cputime.c      | 15 +++++++++++++++
 5 files changed, 69 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1bfcfb1af352..025fd0e84a31 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -287,6 +287,10 @@ struct css_set {
 
 struct cgroup_base_stat {
 	struct task_cputime cputime;
+
+#ifdef CONFIG_SCHED_CORE
+	u64 forceidle_sum;
+#endif
 };
 
 /*
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 69ae6b278464..ddb5a358fd82 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -28,6 +28,9 @@ enum cpu_usage_stat {
 	CPUTIME_STEAL,
 	CPUTIME_GUEST,
 	CPUTIME_GUEST_NICE,
+#ifdef CONFIG_SCHED_CORE
+	CPUTIME_FORCEIDLE,
+#endif
 	NR_STATS,
 };
 
@@ -115,4 +118,8 @@ extern void account_process_tick(struct task_struct *, int user);
 
 extern void account_idle_ticks(unsigned long ticks);
 
+#ifdef CONFIG_SCHED_CORE
+extern void __account_forceidle_time(struct task_struct *tsk, u64 delta);
+#endif
+
 #endif /* _LINUX_KERNEL_STAT_H */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 24b5c2ab5598..feb59380c896 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -310,6 +310,9 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
 	dst_bstat->cputime.utime += src_bstat->cputime.utime;
 	dst_bstat->cputime.stime += src_bstat->cputime.stime;
 	dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+	dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
+#endif
 }
 
 static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
@@ -318,6 +321,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
 	dst_bstat->cputime.utime -= src_bstat->cputime.utime;
 	dst_bstat->cputime.stime -= src_bstat->cputime.stime;
 	dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+	dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
+#endif
 }
 
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
@@ -398,6 +404,11 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
 	case CPUTIME_SOFTIRQ:
 		rstatc->bstat.cputime.stime += delta_exec;
 		break;
+#ifdef CONFIG_SCHED_CORE
+	case CPUTIME_FORCEIDLE:
+		rstatc->bstat.forceidle_sum += delta_exec;
+		break;
+#endif
 	default:
 		break;
 	}
@@ -411,8 +422,9 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
  * with how it is done by __cgroup_account_cputime_field for each bit of
  * cpu time attributed to a cgroup.
  */
-static void root_cgroup_cputime(struct task_cputime *cputime)
+static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
 {
+	struct task_cputime *cputime = &bstat->cputime;
 	int i;
 
 	cputime->stime = 0;
@@ -438,6 +450,10 @@ static void root_cgroup_cputime(struct task_cputime *cputime)
 		cputime->sum_exec_runtime += user;
 		cputime->sum_exec_runtime += sys;
 		cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
+
+#ifdef CONFIG_SCHED_CORE
+		bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
+#endif
 	}
 }
 
@@ -445,27 +461,43 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
 {
 	struct cgroup *cgrp = seq_css(seq)->cgroup;
 	u64 usage, utime, stime;
-	struct task_cputime cputime;
+	struct cgroup_base_stat bstat;
+#ifdef CONFIG_SCHED_CORE
+	u64 forceidle_time;
+#endif
 
 	if (cgroup_parent(cgrp)) {
 		cgroup_rstat_flush_hold(cgrp);
 		usage = cgrp->bstat.cputime.sum_exec_runtime;
 		cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
 			       &utime, &stime);
+#ifdef CONFIG_SCHED_CORE
+		forceidle_time = cgrp->bstat.forceidle_sum;
+#endif
 		cgroup_rstat_flush_release();
 	} else {
-		root_cgroup_cputime(&cputime);
-		usage = cputime.sum_exec_runtime;
-		utime = cputime.utime;
-		stime = cputime.stime;
+		root_cgroup_cputime(&bstat);
+		usage = bstat.cputime.sum_exec_runtime;
+		utime = bstat.cputime.utime;
+		stime = bstat.cputime.stime;
+#ifdef CONFIG_SCHED_CORE
+		forceidle_time = bstat.forceidle_sum;
+#endif
 	}
 
 	do_div(usage, NSEC_PER_USEC);
 	do_div(utime, NSEC_PER_USEC);
 	do_div(stime, NSEC_PER_USEC);
+#ifdef CONFIG_SCHED_CORE
+	do_div(forceidle_time, NSEC_PER_USEC);
+#endif
 
 	seq_printf(seq, "usage_usec %llu\n"
 		   "user_usec %llu\n"
 		   "system_usec %llu\n",
 		   usage, utime, stime);
+
+#ifdef CONFIG_SCHED_CORE
+	seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
+#endif
 }
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 38a2cec21014..5103502da7ba 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -277,7 +277,11 @@ void __sched_core_account_forceidle(struct rq *rq)
 		if (p == rq_i->idle)
 			continue;
 
-		__schedstat_add(p->stats.core_forceidle_sum, delta);
+		/*
+		 * Note: this will account forceidle to the current cpu, even
+		 * if it comes from our SMT sibling.
+		 */
+		__account_forceidle_time(p, delta);
 	}
 }
 
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 78a233d43757..95fc77853743 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -226,6 +226,21 @@ void account_idle_time(u64 cputime)
 		cpustat[CPUTIME_IDLE] += cputime;
 }
 
+
+#ifdef CONFIG_SCHED_CORE
+/*
+ * Account for forceidle time due to core scheduling.
+ *
+ * REQUIRES: schedstat is enabled.
+ */
+void __account_forceidle_time(struct task_struct *p, u64 delta)
+{
+	__schedstat_add(p->stats.core_forceidle_sum, delta);
+
+	task_group_account_field(p, CPUTIME_FORCEIDLE, delta);
+}
+#endif
+
 /*
  * When a guest is interrupted for a longer amount of time, missed clock
  * ticks are not redelivered later. Due to that, this function may on
-- 
cgit 


From 5e25c25aa2c08fb9a79476e029c0b1e3dcd70566 Mon Sep 17 00:00:00 2001
From: Petr Vaněk <arkamar@atlas.cz>
Date: Thu, 30 Jun 2022 16:27:20 +0200
Subject: xfrm: improve wording of comment above XFRM_OFFLOAD flags
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I have noticed a few minor wording issues in a comment recently added
above XFRM_OFFLOAD flags in 7c76ecd9c99b ("xfrm: enforce validity of
offload input flags").

Signed-off-by: Petr Vaněk <arkamar@atlas.cz>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/uapi/linux/xfrm.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h
index 3ed61df9cc91..7929bf9cbee4 100644
--- a/include/uapi/linux/xfrm.h
+++ b/include/uapi/linux/xfrm.h
@@ -511,9 +511,9 @@ struct xfrm_user_offload {
 	int				ifindex;
 	__u8				flags;
 };
-/* This flag was exposed without any kernel code that supporting it.
- * Unfortunately, strongswan has the code that uses sets this flag,
- * which makes impossible to reuse this bit.
+/* This flag was exposed without any kernel code that supports it.
+ * Unfortunately, strongswan has the code that sets this flag,
+ * which makes it impossible to reuse this bit.
  *
  * So leave it here to make sure that it won't be reused by mistake.
  */
-- 
cgit 


From 39bfb3c12d792181de0cf3306be1ea03664a1b05 Mon Sep 17 00:00:00 2001
From: Kurt Kanzenbach <kurt@linutronix.de>
Date: Fri, 1 Jul 2022 19:56:06 +0200
Subject: net: phy: broadcom: Add support for BCM53128 internal PHYs

Add support for BCM53128 internal PHYs. These support interrupts as well as
statistics. Therefore, enable the Broadcom PHY driver for them.

Tested on BCM53128 switch using the mainline b53 DSA driver.

Signed-off-by: Kurt Kanzenbach <kurt@linutronix.de>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/broadcom.c | 15 +++++++++++++++
 include/linux/brcmphy.h    |  1 +
 2 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 876bc45ede60..31fbcdddc9ad 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -1066,6 +1066,20 @@ static struct phy_driver broadcom_drivers[] = {
 	.config_intr	= bcm_phy_config_intr,
 	.handle_interrupt = bcm_phy_handle_interrupt,
 	.link_change_notify	= bcm54xx_link_change_notify,
+}, {
+	.phy_id		= PHY_ID_BCM53128,
+	.phy_id_mask	= 0xfffffff0,
+	.name		= "Broadcom BCM53128",
+	.flags		= PHY_IS_INTERNAL,
+	/* PHY_GBIT_FEATURES */
+	.get_sset_count	= bcm_phy_get_sset_count,
+	.get_strings	= bcm_phy_get_strings,
+	.get_stats	= bcm54xx_get_stats,
+	.probe		= bcm54xx_phy_probe,
+	.config_init	= bcm54xx_config_init,
+	.config_intr	= bcm_phy_config_intr,
+	.handle_interrupt = bcm_phy_handle_interrupt,
+	.link_change_notify	= bcm54xx_link_change_notify,
 }, {
 	.phy_id         = PHY_ID_BCM89610,
 	.phy_id_mask    = 0xfffffff0,
@@ -1102,6 +1116,7 @@ static struct mdio_device_id __maybe_unused broadcom_tbl[] = {
 	{ PHY_ID_BCM5241, 0xfffffff0 },
 	{ PHY_ID_BCM5395, 0xfffffff0 },
 	{ PHY_ID_BCM53125, 0xfffffff0 },
+	{ PHY_ID_BCM53128, 0xfffffff0 },
 	{ PHY_ID_BCM89610, 0xfffffff0 },
 	{ }
 };
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 747fad264033..6ff567ece34a 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -16,6 +16,7 @@
 #define PHY_ID_BCM5481			0x0143bca0
 #define PHY_ID_BCM5395			0x0143bcf0
 #define PHY_ID_BCM53125			0x03625f20
+#define PHY_ID_BCM53128			0x03625e10
 #define PHY_ID_BCM54810			0x03625d00
 #define PHY_ID_BCM54811			0x03625cc0
 #define PHY_ID_BCM5482			0x0143bcb0
-- 
cgit 


From df76234276e22136b2468825c18407fdfbb2076a Mon Sep 17 00:00:00 2001
From: Stefan Wahren <stefan.wahren@i2se.com>
Date: Sat, 25 Jun 2022 13:36:15 +0200
Subject: mfd: bcm2835-pm: Add support for BCM2711

In BCM2711 the new RPiVid ASB took over V3D. The old ASB is still present
with the ISP and H264 bits, and V3D is in the same place in the new ASB
as the old one.

As per the devicetree bindings, BCM2711 will provide both the old and
new ASB resources, so get both of them and pass them into
'bcm2835-power,' which will take care of selecting which one to use
accordingly.

Since the RPiVid ASB's resources were being provided prior to formalizing
the bindings[1], also support the old DT files that didn't use
'reg-names.'

[1] See: 7dbe8c62ceeb ("ARM: dts: Add minimal Raspberry Pi 4 support")
Signed-off-by: Stefan Wahren <stefan.wahren@i2se.com>
Reviewed-by: Peter Robinson <pbrobinson@gmail.com>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220625113619.15944-8-stefan.wahren@i2se.com
---
 drivers/mfd/bcm2835-pm.c       | 13 +++++++++++++
 include/linux/mfd/bcm2835-pm.h |  1 +
 2 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/drivers/mfd/bcm2835-pm.c b/drivers/mfd/bcm2835-pm.c
index 418c8a16427d..49cd1f03884a 100644
--- a/drivers/mfd/bcm2835-pm.c
+++ b/drivers/mfd/bcm2835-pm.c
@@ -42,6 +42,14 @@ static int bcm2835_pm_get_pdata(struct platform_device *pdev,
 				pm->asb = NULL;
 		}
 
+		res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
+						    "rpivid_asb");
+		if (res) {
+			pm->rpivid_asb = devm_ioremap_resource(&pdev->dev, res);
+			if (IS_ERR(pm->rpivid_asb))
+				pm->rpivid_asb = NULL;
+		}
+
 		return 0;
 	}
 
@@ -54,6 +62,10 @@ static int bcm2835_pm_get_pdata(struct platform_device *pdev,
 	if (IS_ERR(pm->asb))
 		pm->asb = NULL;
 
+	pm->rpivid_asb = devm_platform_ioremap_resource(pdev, 2);
+	if (IS_ERR(pm->rpivid_asb))
+		pm->rpivid_asb = NULL;
+
 	return 0;
 }
 
@@ -95,6 +107,7 @@ static int bcm2835_pm_probe(struct platform_device *pdev)
 static const struct of_device_id bcm2835_pm_of_match[] = {
 	{ .compatible = "brcm,bcm2835-pm-wdt", },
 	{ .compatible = "brcm,bcm2835-pm", },
+	{ .compatible = "brcm,bcm2711-pm", },
 	{},
 };
 MODULE_DEVICE_TABLE(of, bcm2835_pm_of_match);
diff --git a/include/linux/mfd/bcm2835-pm.h b/include/linux/mfd/bcm2835-pm.h
index ed37dc40e82a..f70a810c55f7 100644
--- a/include/linux/mfd/bcm2835-pm.h
+++ b/include/linux/mfd/bcm2835-pm.h
@@ -9,6 +9,7 @@ struct bcm2835_pm {
 	struct device *dev;
 	void __iomem *base;
 	void __iomem *asb;
+	void __iomem *rpivid_asb;
 };
 
 #endif /* BCM2835_MFD_PM_H */
-- 
cgit 


From e2a4a0eeb0cd308bb96d168f90c225da2084d4dc Mon Sep 17 00:00:00 2001
From: Peng Fan <peng.fan@nxp.com>
Date: Sun, 3 Jul 2022 17:11:24 +0800
Subject: dt-bindings: interconnect: add fsl,imx8mp.h

Add fsl,imx8mp.h for i.MX8MP

Signed-off-by: Peng Fan <peng.fan@nxp.com>
Acked-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20220703091132.1412063-3-peng.fan@oss.nxp.com
Signed-off-by: Georgi Djakov <djakov@kernel.org>
---
 include/dt-bindings/interconnect/fsl,imx8mp.h | 59 +++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 include/dt-bindings/interconnect/fsl,imx8mp.h

(limited to 'include')

diff --git a/include/dt-bindings/interconnect/fsl,imx8mp.h b/include/dt-bindings/interconnect/fsl,imx8mp.h
new file mode 100644
index 000000000000..7357d417529a
--- /dev/null
+++ b/include/dt-bindings/interconnect/fsl,imx8mp.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Interconnect framework driver for i.MX SoC
+ *
+ * Copyright 2022 NXP
+ * Peng Fan <peng.fan@nxp.com>
+ */
+
+#ifndef __DT_BINDINGS_INTERCONNECT_IMX8MP_H
+#define __DT_BINDINGS_INTERCONNECT_IMX8MP_H
+
+#define IMX8MP_ICN_NOC		0
+#define IMX8MP_ICN_MAIN		1
+#define IMX8MP_ICS_DRAM		2
+#define IMX8MP_ICS_OCRAM	3
+#define IMX8MP_ICM_A53		4
+#define IMX8MP_ICM_SUPERMIX	5
+#define IMX8MP_ICM_GIC		6
+#define IMX8MP_ICM_MLMIX	7
+
+#define IMX8MP_ICN_AUDIO	8
+#define IMX8MP_ICM_DSP		9
+#define IMX8MP_ICM_SDMA2PER	10
+#define IMX8MP_ICM_SDMA2BURST	11
+#define IMX8MP_ICM_SDMA3PER	12
+#define IMX8MP_ICM_SDMA3BURST	13
+#define IMX8MP_ICM_EDMA		14
+
+#define IMX8MP_ICN_GPU		15
+#define IMX8MP_ICM_GPU2D	16
+#define IMX8MP_ICM_GPU3D	17
+
+#define IMX8MP_ICN_HDMI		18
+#define IMX8MP_ICM_HRV		19
+#define IMX8MP_ICM_LCDIF_HDMI	20
+#define IMX8MP_ICM_HDCP		21
+
+#define IMX8MP_ICN_HSIO		22
+#define IMX8MP_ICM_NOC_PCIE	23
+#define IMX8MP_ICM_USB1		24
+#define IMX8MP_ICM_USB2		25
+#define IMX8MP_ICM_PCIE		26
+
+#define IMX8MP_ICN_MEDIA	27
+#define IMX8MP_ICM_LCDIF_RD	28
+#define IMX8MP_ICM_LCDIF_WR	29
+#define IMX8MP_ICM_ISI0		30
+#define IMX8MP_ICM_ISI1		31
+#define IMX8MP_ICM_ISI2		32
+#define IMX8MP_ICM_ISP0		33
+#define IMX8MP_ICM_ISP1		34
+#define IMX8MP_ICM_DWE		35
+
+#define IMX8MP_ICN_VIDEO	36
+#define IMX8MP_ICM_VPU_G1	37
+#define IMX8MP_ICM_VPU_G2	38
+#define IMX8MP_ICM_VPU_H1	39
+
+#endif /* __DT_BINDINGS_INTERCONNECT_IMX8MP_H */
-- 
cgit 


From 2fcfa72fc13f0203bb676bd1ec30ec85e17855be Mon Sep 17 00:00:00 2001
From: Peng Fan <peng.fan@nxp.com>
Date: Sun, 3 Jul 2022 17:11:25 +0800
Subject: interconnect: add device managed bulk API

Add device managed bulk API to simplify driver.

Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20220703091132.1412063-4-peng.fan@oss.nxp.com
Signed-off-by: Georgi Djakov <djakov@kernel.org>
---
 drivers/interconnect/bulk.c  | 42 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/interconnect.h |  7 +++++++
 2 files changed, 49 insertions(+)

(limited to 'include')

diff --git a/drivers/interconnect/bulk.c b/drivers/interconnect/bulk.c
index 448cc536aa79..8b1d8a412464 100644
--- a/drivers/interconnect/bulk.c
+++ b/drivers/interconnect/bulk.c
@@ -115,3 +115,45 @@ void icc_bulk_disable(int num_paths, const struct icc_bulk_data *paths)
 		icc_disable(paths[num_paths].path);
 }
 EXPORT_SYMBOL_GPL(icc_bulk_disable);
+
+struct icc_bulk_devres {
+	struct icc_bulk_data *paths;
+	int num_paths;
+};
+
+static void devm_icc_bulk_release(struct device *dev, void *res)
+{
+	struct icc_bulk_devres *devres = res;
+
+	icc_bulk_put(devres->num_paths, devres->paths);
+}
+
+/**
+ * devm_of_icc_bulk_get() - resource managed of_icc_bulk_get
+ * @dev: the device requesting the path
+ * @num_paths: the number of icc_bulk_data
+ * @paths: the table with the paths we want to get
+ *
+ * Returns 0 on success or negative errno otherwise.
+ */
+int devm_of_icc_bulk_get(struct device *dev, int num_paths, struct icc_bulk_data *paths)
+{
+	struct icc_bulk_devres *devres;
+	int ret;
+
+	devres = devres_alloc(devm_icc_bulk_release, sizeof(*devres), GFP_KERNEL);
+	if (!devres)
+		return -ENOMEM;
+
+	ret = of_icc_bulk_get(dev, num_paths, paths);
+	if (!ret) {
+		devres->paths = paths;
+		devres->num_paths = num_paths;
+		devres_add(dev, devres);
+	} else {
+		devres_free(devres);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(devm_of_icc_bulk_get);
diff --git a/include/linux/interconnect.h b/include/linux/interconnect.h
index f685777b875e..2b0e784ba771 100644
--- a/include/linux/interconnect.h
+++ b/include/linux/interconnect.h
@@ -44,6 +44,7 @@ struct icc_path *icc_get(struct device *dev, const int src_id,
 			 const int dst_id);
 struct icc_path *of_icc_get(struct device *dev, const char *name);
 struct icc_path *devm_of_icc_get(struct device *dev, const char *name);
+int devm_of_icc_bulk_get(struct device *dev, int num_paths, struct icc_bulk_data *paths);
 struct icc_path *of_icc_get_by_index(struct device *dev, int idx);
 void icc_put(struct icc_path *path);
 int icc_enable(struct icc_path *path);
@@ -116,6 +117,12 @@ static inline int of_icc_bulk_get(struct device *dev, int num_paths, struct icc_
 	return 0;
 }
 
+static inline int devm_of_icc_bulk_get(struct device *dev, int num_paths,
+				       struct icc_bulk_data *paths)
+{
+	return 0;
+}
+
 static inline void icc_bulk_put(int num_paths, struct icc_bulk_data *paths)
 {
 }
-- 
cgit 


From 2bd0467074f078372ac2c979c56dc94228c90fbf Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Thu, 30 Jun 2022 18:31:34 +0100
Subject: include: trace: Add SCMI full message tracing

Add a distinct trace event to dump full SCMI message headers and payloads.

Link: https://lore.kernel.org/r/20220630173135.2086631-2-cristian.marussi@arm.com
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/trace/events/scmi.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/scmi.h b/include/trace/events/scmi.h
index cee4b2b64ae4..9108544beca9 100644
--- a/include/trace/events/scmi.h
+++ b/include/trace/events/scmi.h
@@ -112,6 +112,37 @@ TRACE_EVENT(scmi_rx_done,
 		__entry->transfer_id, __entry->msg_id, __entry->protocol_id,
 		__entry->seq, __entry->msg_type)
 );
+
+TRACE_EVENT(scmi_msg_dump,
+	TP_PROTO(u8 protocol_id, u8 msg_id, unsigned char *tag, u16 seq,
+		 int status, void *buf, size_t len),
+	TP_ARGS(protocol_id, msg_id, tag, seq, status, buf, len),
+
+	TP_STRUCT__entry(
+		__field(u8, protocol_id)
+		__field(u8, msg_id)
+		__array(char, tag, 5)
+		__field(u16, seq)
+		__field(int, status)
+		__field(size_t, len)
+		__dynamic_array(unsigned char, cmd, len)
+	),
+
+	TP_fast_assign(
+		__entry->protocol_id = protocol_id;
+		__entry->msg_id = msg_id;
+		strscpy(__entry->tag, tag, 5);
+		__entry->seq = seq;
+		__entry->status = status;
+		__entry->len = len;
+		memcpy(__get_dynamic_array(cmd), buf, __entry->len);
+	),
+
+	TP_printk("pt=%02X t=%s msg_id=%02X seq=%04X s=%d pyld=%s",
+		  __entry->protocol_id, __entry->tag, __entry->msg_id,
+		  __entry->seq, __entry->status,
+		__print_hex_str(__get_dynamic_array(cmd), __entry->len))
+);
 #endif /* _TRACE_SCMI_H */
 
 /* This part must be outside protection */
-- 
cgit 


From 7097f29819bb70374dfae9f705e548a720f16f94 Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Mon, 4 Jul 2022 11:19:31 +0100
Subject: firmware: arm_scmi: Add SCMI v3.1 System Power extensions

Add support for SCMIv3.1 System Power optional timeout field while
dispatching SYSTEM_POWER_STATE_NOTIFIER notification.

Link: https://lore.kernel.org/r/20220704101933.2981635-3-cristian.marussi@arm.com
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/system.c | 17 ++++++++++++++++-
 include/linux/scmi_protocol.h      |  2 ++
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/firmware/arm_scmi/system.c b/drivers/firmware/arm_scmi/system.c
index 220e399118ad..9383d7584539 100644
--- a/drivers/firmware/arm_scmi/system.c
+++ b/drivers/firmware/arm_scmi/system.c
@@ -27,10 +27,12 @@ struct scmi_system_power_state_notifier_payld {
 	__le32 agent_id;
 	__le32 flags;
 	__le32 system_state;
+	__le32 timeout;
 };
 
 struct scmi_system_info {
 	u32 version;
+	bool graceful_timeout_supported;
 };
 
 static int scmi_system_request_notify(const struct scmi_protocol_handle *ph,
@@ -72,17 +74,27 @@ scmi_system_fill_custom_report(const struct scmi_protocol_handle *ph,
 			       const void *payld, size_t payld_sz,
 			       void *report, u32 *src_id)
 {
+	size_t expected_sz;
 	const struct scmi_system_power_state_notifier_payld *p = payld;
 	struct scmi_system_power_state_notifier_report *r = report;
+	struct scmi_system_info *pinfo = ph->get_priv(ph);
 
+	expected_sz = pinfo->graceful_timeout_supported ?
+			sizeof(*p) : sizeof(*p) - sizeof(__le32);
 	if (evt_id != SCMI_EVENT_SYSTEM_POWER_STATE_NOTIFIER ||
-	    sizeof(*p) != payld_sz)
+	    payld_sz != expected_sz)
 		return NULL;
 
 	r->timestamp = timestamp;
 	r->agent_id = le32_to_cpu(p->agent_id);
 	r->flags = le32_to_cpu(p->flags);
 	r->system_state = le32_to_cpu(p->system_state);
+	if (pinfo->graceful_timeout_supported &&
+	    r->system_state == SCMI_SYSTEM_SHUTDOWN &&
+	    SCMI_SYSPOWER_IS_REQUEST_GRACEFUL(r->flags))
+		r->timeout = le32_to_cpu(p->timeout);
+	else
+		r->timeout = 0x00;
 	*src_id = 0;
 
 	return r;
@@ -129,6 +141,9 @@ static int scmi_system_protocol_init(const struct scmi_protocol_handle *ph)
 		return -ENOMEM;
 
 	pinfo->version = version;
+	if (PROTOCOL_REV_MAJOR(pinfo->version) >= 0x2)
+		pinfo->graceful_timeout_supported = true;
+
 	return ph->set_priv(ph, pinfo);
 }
 
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 704111f63993..311aa3c73aac 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -781,8 +781,10 @@ struct scmi_clock_rate_notif_report {
 struct scmi_system_power_state_notifier_report {
 	ktime_t		timestamp;
 	unsigned int	agent_id;
+#define SCMI_SYSPOWER_IS_REQUEST_GRACEFUL(flags)	((flags) & BIT(0))
 	unsigned int	flags;
 	unsigned int	system_state;
+	unsigned int	timeout;
 };
 
 struct scmi_perf_limits_report {
-- 
cgit 


From d91079995fa62720e11a39c08b932f2f9a8cbfae Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Mon, 4 Jul 2022 11:19:32 +0100
Subject: firmware: arm_scmi: Add devm_protocol_acquire helper

Add a method to get hold of a protocol, causing it to be initialized and
its resource accounting updated, without getting access to its operations
and handle.

Some protocols, like SCMI SystemPower, do not expose any protocol ops to
the kernel OSPM agent but still need to be at least initialized. This
helper avoids the need to invoke a full devm_get_protocol() only to get
the protocol initialized while throwing away unused the protocol ops and
handle.

Link: https://lore.kernel.org/r/20220704101933.2981635-4-cristian.marussi@arm.com
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/driver.c | 70 ++++++++++++++++++++++++++++++--------
 include/linux/scmi_protocol.h      |  5 +++
 2 files changed, 60 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c
index 0f73d743a7c2..1125a1f167a6 100644
--- a/drivers/firmware/arm_scmi/driver.c
+++ b/drivers/firmware/arm_scmi/driver.c
@@ -1523,6 +1523,30 @@ static void scmi_devm_release_protocol(struct device *dev, void *res)
 	scmi_protocol_release(dres->handle, dres->protocol_id);
 }
 
+static struct scmi_protocol_instance __must_check *
+scmi_devres_protocol_instance_get(struct scmi_device *sdev, u8 protocol_id)
+{
+	struct scmi_protocol_instance *pi;
+	struct scmi_protocol_devres *dres;
+
+	dres = devres_alloc(scmi_devm_release_protocol,
+			    sizeof(*dres), GFP_KERNEL);
+	if (!dres)
+		return ERR_PTR(-ENOMEM);
+
+	pi = scmi_get_protocol_instance(sdev->handle, protocol_id);
+	if (IS_ERR(pi)) {
+		devres_free(dres);
+		return pi;
+	}
+
+	dres->handle = sdev->handle;
+	dres->protocol_id = protocol_id;
+	devres_add(&sdev->dev, dres);
+
+	return pi;
+}
+
 /**
  * scmi_devm_protocol_get  - Devres managed get protocol operations and handle
  * @sdev: A reference to an scmi_device whose embedded struct device is to
@@ -1546,32 +1570,47 @@ scmi_devm_protocol_get(struct scmi_device *sdev, u8 protocol_id,
 		       struct scmi_protocol_handle **ph)
 {
 	struct scmi_protocol_instance *pi;
-	struct scmi_protocol_devres *dres;
-	struct scmi_handle *handle = sdev->handle;
 
 	if (!ph)
 		return ERR_PTR(-EINVAL);
 
-	dres = devres_alloc(scmi_devm_release_protocol,
-			    sizeof(*dres), GFP_KERNEL);
-	if (!dres)
-		return ERR_PTR(-ENOMEM);
-
-	pi = scmi_get_protocol_instance(handle, protocol_id);
-	if (IS_ERR(pi)) {
-		devres_free(dres);
+	pi = scmi_devres_protocol_instance_get(sdev, protocol_id);
+	if (IS_ERR(pi))
 		return pi;
-	}
-
-	dres->handle = handle;
-	dres->protocol_id = protocol_id;
-	devres_add(&sdev->dev, dres);
 
 	*ph = &pi->ph;
 
 	return pi->proto->ops;
 }
 
+/**
+ * scmi_devm_protocol_acquire  - Devres managed helper to get hold of a protocol
+ * @sdev: A reference to an scmi_device whose embedded struct device is to
+ *	  be used for devres accounting.
+ * @protocol_id: The protocol being requested.
+ *
+ * Get hold of a protocol accounting for its usage, possibly triggering its
+ * initialization but without getting access to its protocol specific operations
+ * and handle.
+ *
+ * Being a devres based managed method, protocol hold will be automatically
+ * released, and possibly de-initialized on last user, once the SCMI driver
+ * owning the scmi_device is unbound from it.
+ *
+ * Return: 0 on SUCCESS
+ */
+static int __must_check scmi_devm_protocol_acquire(struct scmi_device *sdev,
+						   u8 protocol_id)
+{
+	struct scmi_protocol_instance *pi;
+
+	pi = scmi_devres_protocol_instance_get(sdev, protocol_id);
+	if (IS_ERR(pi))
+		return PTR_ERR(pi);
+
+	return 0;
+}
+
 static int scmi_devm_protocol_match(struct device *dev, void *res, void *data)
 {
 	struct scmi_protocol_devres *dres = res;
@@ -2176,6 +2215,7 @@ static int scmi_probe(struct platform_device *pdev)
 	handle = &info->handle;
 	handle->dev = info->dev;
 	handle->version = &info->version;
+	handle->devm_protocol_acquire = scmi_devm_protocol_acquire;
 	handle->devm_protocol_get = scmi_devm_protocol_get;
 	handle->devm_protocol_put = scmi_devm_protocol_put;
 
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 311aa3c73aac..898488c7f185 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -624,6 +624,9 @@ struct scmi_notify_ops {
  *
  * @dev: pointer to the SCMI device
  * @version: pointer to the structure containing SCMI version information
+ * @devm_protocol_acquire: devres managed method to get hold of a protocol,
+ *			   causing its initialization and related resource
+ *			   accounting
  * @devm_protocol_get: devres managed method to acquire a protocol and get specific
  *		       operations and a dedicated protocol handler
  * @devm_protocol_put: devres managed method to release a protocol
@@ -642,6 +645,8 @@ struct scmi_handle {
 	struct device *dev;
 	struct scmi_revision_info *version;
 
+	int __must_check (*devm_protocol_acquire)(struct scmi_device *sdev,
+						  u8 proto);
 	const void __must_check *
 		(*devm_protocol_get)(struct scmi_device *sdev, u8 proto,
 				     struct scmi_protocol_handle **ph);
-- 
cgit 


From 0316f99c4780b0a5fd60b7f136c64cb1af8d5fc3 Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Mon, 4 Jul 2022 11:22:36 +0100
Subject: firmware: arm_scmi: Add SCMI v3.1 powercap protocol basic support

Add support for SCMI v3.1 powercap protocol, with the exception of powercap
fast channels, exposing all the new related powercap protocol operations as
usual in include/linux/scmi_protocol.h.

Link: https://lore.kernel.org/r/20220704102241.2988447-3-cristian.marussi@arm.com
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/Makefile    |   2 +-
 drivers/firmware/arm_scmi/driver.c    |   2 +
 drivers/firmware/arm_scmi/powercap.c  | 753 ++++++++++++++++++++++++++++++++++
 drivers/firmware/arm_scmi/protocols.h |   1 +
 include/linux/scmi_protocol.h         | 125 ++++++
 5 files changed, 882 insertions(+), 1 deletion(-)
 create mode 100644 drivers/firmware/arm_scmi/powercap.c

(limited to 'include')

diff --git a/drivers/firmware/arm_scmi/Makefile b/drivers/firmware/arm_scmi/Makefile
index faacd0ec7b81..9ea86f8cc8f7 100644
--- a/drivers/firmware/arm_scmi/Makefile
+++ b/drivers/firmware/arm_scmi/Makefile
@@ -7,7 +7,7 @@ scmi-transport-$(CONFIG_ARM_SCMI_TRANSPORT_SMC) += smc.o
 scmi-transport-$(CONFIG_ARM_SCMI_HAVE_MSG) += msg.o
 scmi-transport-$(CONFIG_ARM_SCMI_TRANSPORT_VIRTIO) += virtio.o
 scmi-transport-$(CONFIG_ARM_SCMI_TRANSPORT_OPTEE) += optee.o
-scmi-protocols-y = base.o clock.o perf.o power.o reset.o sensors.o system.o voltage.o
+scmi-protocols-y = base.o clock.o perf.o power.o reset.o sensors.o system.o voltage.o powercap.o
 scmi-module-objs := $(scmi-bus-y) $(scmi-driver-y) $(scmi-protocols-y) \
 		    $(scmi-transport-y)
 obj-$(CONFIG_ARM_SCMI_PROTOCOL) += scmi-module.o
diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c
index 1125a1f167a6..92a71e02b4cb 100644
--- a/drivers/firmware/arm_scmi/driver.c
+++ b/drivers/firmware/arm_scmi/driver.c
@@ -2485,6 +2485,7 @@ static int __init scmi_driver_init(void)
 	scmi_sensors_register();
 	scmi_voltage_register();
 	scmi_system_register();
+	scmi_powercap_register();
 
 	return platform_driver_register(&scmi_driver);
 }
@@ -2501,6 +2502,7 @@ static void __exit scmi_driver_exit(void)
 	scmi_sensors_unregister();
 	scmi_voltage_unregister();
 	scmi_system_unregister();
+	scmi_powercap_unregister();
 
 	scmi_bus_exit();
 
diff --git a/drivers/firmware/arm_scmi/powercap.c b/drivers/firmware/arm_scmi/powercap.c
new file mode 100644
index 000000000000..d71a1f6c2d8a
--- /dev/null
+++ b/drivers/firmware/arm_scmi/powercap.c
@@ -0,0 +1,753 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * System Control and Management Interface (SCMI) Powercap Protocol
+ *
+ * Copyright (C) 2022 ARM Ltd.
+ */
+
+#define pr_fmt(fmt) "SCMI Notifications POWERCAP - " fmt
+
+#include <linux/bitfield.h>
+#include <linux/module.h>
+#include <linux/scmi_protocol.h>
+
+#include "protocols.h"
+#include "notify.h"
+
+enum scmi_powercap_protocol_cmd {
+	POWERCAP_DOMAIN_ATTRIBUTES = 0x3,
+	POWERCAP_CAP_GET = 0x4,
+	POWERCAP_CAP_SET = 0x5,
+	POWERCAP_PAI_GET = 0x6,
+	POWERCAP_PAI_SET = 0x7,
+	POWERCAP_DOMAIN_NAME_GET = 0x8,
+	POWERCAP_MEASUREMENTS_GET = 0x9,
+	POWERCAP_CAP_NOTIFY = 0xa,
+	POWERCAP_MEASUREMENTS_NOTIFY = 0xb,
+	POWERCAP_DESCRIBE_FASTCHANNEL = 0xc,
+};
+
+struct scmi_msg_resp_powercap_domain_attributes {
+	__le32 attributes;
+#define SUPPORTS_POWERCAP_CAP_CHANGE_NOTIFY(x)		((x) & BIT(31))
+#define SUPPORTS_POWERCAP_MEASUREMENTS_CHANGE_NOTIFY(x)	((x) & BIT(30))
+#define SUPPORTS_ASYNC_POWERCAP_CAP_SET(x)		((x) & BIT(29))
+#define SUPPORTS_EXTENDED_NAMES(x)			((x) & BIT(28))
+#define SUPPORTS_POWERCAP_CAP_CONFIGURATION(x)		((x) & BIT(27))
+#define SUPPORTS_POWERCAP_MONITORING(x)			((x) & BIT(26))
+#define SUPPORTS_POWERCAP_PAI_CONFIGURATION(x)		((x) & BIT(25))
+#define POWERCAP_POWER_UNIT(x)				\
+		(FIELD_GET(GENMASK(24, 23), (x)))
+#define	SUPPORTS_POWER_UNITS_MW(x)			\
+		(POWERCAP_POWER_UNIT(x) == 0x2)
+#define	SUPPORTS_POWER_UNITS_UW(x)			\
+		(POWERCAP_POWER_UNIT(x) == 0x1)
+	u8 name[SCMI_SHORT_NAME_MAX_SIZE];
+	__le32 min_pai;
+	__le32 max_pai;
+	__le32 pai_step;
+	__le32 min_power_cap;
+	__le32 max_power_cap;
+	__le32 power_cap_step;
+	__le32 sustainable_power;
+	__le32 accuracy;
+	__le32 parent_id;
+};
+
+struct scmi_msg_powercap_set_cap_or_pai {
+	__le32 domain;
+	__le32 flags;
+#define CAP_SET_ASYNC		BIT(1)
+#define CAP_SET_IGNORE_DRESP	BIT(0)
+	__le32 value;
+};
+
+struct scmi_msg_resp_powercap_cap_set_complete {
+	__le32 domain;
+	__le32 power_cap;
+};
+
+struct scmi_msg_resp_powercap_meas_get {
+	__le32 power;
+	__le32 pai;
+};
+
+struct scmi_msg_powercap_notify_cap {
+	__le32 domain;
+	__le32 notify_enable;
+};
+
+struct scmi_msg_powercap_notify_thresh {
+	__le32 domain;
+	__le32 notify_enable;
+	__le32 power_thresh_low;
+	__le32 power_thresh_high;
+};
+
+struct scmi_powercap_cap_changed_notify_payld {
+	__le32 agent_id;
+	__le32 domain_id;
+	__le32 power_cap;
+	__le32 pai;
+};
+
+struct scmi_powercap_meas_changed_notify_payld {
+	__le32 agent_id;
+	__le32 domain_id;
+	__le32 power;
+};
+
+struct scmi_powercap_state {
+	bool meas_notif_enabled;
+	u64 thresholds;
+#define THRESH_LOW(p, id)				\
+	(lower_32_bits((p)->states[(id)].thresholds))
+#define THRESH_HIGH(p, id)				\
+	(upper_32_bits((p)->states[(id)].thresholds))
+};
+
+struct powercap_info {
+	u32 version;
+	int num_domains;
+	struct scmi_powercap_state *states;
+	struct scmi_powercap_info *powercaps;
+};
+
+static enum scmi_powercap_protocol_cmd evt_2_cmd[] = {
+	POWERCAP_CAP_NOTIFY,
+	POWERCAP_MEASUREMENTS_NOTIFY,
+};
+
+static int scmi_powercap_notify(const struct scmi_protocol_handle *ph,
+				u32 domain, int message_id, bool enable);
+
+static int
+scmi_powercap_attributes_get(const struct scmi_protocol_handle *ph,
+			     struct powercap_info *pi)
+{
+	int ret;
+	struct scmi_xfer *t;
+
+	ret = ph->xops->xfer_get_init(ph, PROTOCOL_ATTRIBUTES, 0,
+				      sizeof(u32), &t);
+	if (ret)
+		return ret;
+
+	ret = ph->xops->do_xfer(ph, t);
+	if (!ret) {
+		u32 attributes;
+
+		attributes = get_unaligned_le32(t->rx.buf);
+		pi->num_domains = FIELD_GET(GENMASK(15, 0), attributes);
+	}
+
+	ph->xops->xfer_put(ph, t);
+	return ret;
+}
+
+static inline int
+scmi_powercap_validate(unsigned int min_val, unsigned int max_val,
+		       unsigned int step_val, bool configurable)
+{
+	if (!min_val || !max_val)
+		return -EPROTO;
+
+	if ((configurable && min_val == max_val) ||
+	    (!configurable && min_val != max_val))
+		return -EPROTO;
+
+	if (min_val != max_val && !step_val)
+		return -EPROTO;
+
+	return 0;
+}
+
+static int
+scmi_powercap_domain_attributes_get(const struct scmi_protocol_handle *ph,
+				    struct powercap_info *pinfo, u32 domain)
+{
+	int ret;
+	u32 flags;
+	struct scmi_xfer *t;
+	struct scmi_powercap_info *dom_info = pinfo->powercaps + domain;
+	struct scmi_msg_resp_powercap_domain_attributes *resp;
+
+	ret = ph->xops->xfer_get_init(ph, POWERCAP_DOMAIN_ATTRIBUTES,
+				      sizeof(domain), sizeof(*resp), &t);
+	if (ret)
+		return ret;
+
+	put_unaligned_le32(domain, t->tx.buf);
+	resp = t->rx.buf;
+
+	ret = ph->xops->do_xfer(ph, t);
+	if (!ret) {
+		flags = le32_to_cpu(resp->attributes);
+
+		dom_info->id = domain;
+		dom_info->notify_powercap_cap_change =
+			SUPPORTS_POWERCAP_CAP_CHANGE_NOTIFY(flags);
+		dom_info->notify_powercap_measurement_change =
+			SUPPORTS_POWERCAP_MEASUREMENTS_CHANGE_NOTIFY(flags);
+		dom_info->async_powercap_cap_set =
+			SUPPORTS_ASYNC_POWERCAP_CAP_SET(flags);
+		dom_info->powercap_cap_config =
+			SUPPORTS_POWERCAP_CAP_CONFIGURATION(flags);
+		dom_info->powercap_monitoring =
+			SUPPORTS_POWERCAP_MONITORING(flags);
+		dom_info->powercap_pai_config =
+			SUPPORTS_POWERCAP_PAI_CONFIGURATION(flags);
+		dom_info->powercap_scale_mw =
+			SUPPORTS_POWER_UNITS_MW(flags);
+		dom_info->powercap_scale_uw =
+			SUPPORTS_POWER_UNITS_UW(flags);
+
+		strscpy(dom_info->name, resp->name, SCMI_SHORT_NAME_MAX_SIZE);
+
+		dom_info->min_pai = le32_to_cpu(resp->min_pai);
+		dom_info->max_pai = le32_to_cpu(resp->max_pai);
+		dom_info->pai_step = le32_to_cpu(resp->pai_step);
+		ret = scmi_powercap_validate(dom_info->min_pai,
+					     dom_info->max_pai,
+					     dom_info->pai_step,
+					     dom_info->powercap_pai_config);
+		if (ret) {
+			dev_err(ph->dev,
+				"Platform reported inconsistent PAI config for domain %d - %s\n",
+				dom_info->id, dom_info->name);
+			goto clean;
+		}
+
+		dom_info->min_power_cap = le32_to_cpu(resp->min_power_cap);
+		dom_info->max_power_cap = le32_to_cpu(resp->max_power_cap);
+		dom_info->power_cap_step = le32_to_cpu(resp->power_cap_step);
+		ret = scmi_powercap_validate(dom_info->min_power_cap,
+					     dom_info->max_power_cap,
+					     dom_info->power_cap_step,
+					     dom_info->powercap_cap_config);
+		if (ret) {
+			dev_err(ph->dev,
+				"Platform reported inconsistent CAP config for domain %d - %s\n",
+				dom_info->id, dom_info->name);
+			goto clean;
+		}
+
+		dom_info->sustainable_power =
+			le32_to_cpu(resp->sustainable_power);
+		dom_info->accuracy = le32_to_cpu(resp->accuracy);
+
+		dom_info->parent_id = le32_to_cpu(resp->parent_id);
+		if (dom_info->parent_id != SCMI_POWERCAP_ROOT_ZONE_ID &&
+		    (dom_info->parent_id >= pinfo->num_domains ||
+		     dom_info->parent_id == dom_info->id)) {
+			dev_err(ph->dev,
+				"Platform reported inconsistent parent ID for domain %d - %s\n",
+				dom_info->id, dom_info->name);
+			ret = -ENODEV;
+		}
+	}
+
+clean:
+	ph->xops->xfer_put(ph, t);
+
+	/*
+	 * If supported overwrite short name with the extended one;
+	 * on error just carry on and use already provided short name.
+	 */
+	if (!ret && SUPPORTS_EXTENDED_NAMES(flags))
+		ph->hops->extended_name_get(ph, POWERCAP_DOMAIN_NAME_GET,
+					    domain, dom_info->name,
+					    SCMI_MAX_STR_SIZE);
+
+	return ret;
+}
+
+static int scmi_powercap_num_domains_get(const struct scmi_protocol_handle *ph)
+{
+	struct powercap_info *pi = ph->get_priv(ph);
+
+	return pi->num_domains;
+}
+
+static const struct scmi_powercap_info *
+scmi_powercap_dom_info_get(const struct scmi_protocol_handle *ph, u32 domain_id)
+{
+	struct powercap_info *pi = ph->get_priv(ph);
+
+	if (domain_id >= pi->num_domains)
+		return NULL;
+
+	return pi->powercaps + domain_id;
+}
+
+static int scmi_powercap_cap_get(const struct scmi_protocol_handle *ph,
+				 u32 domain_id, u32 *power_cap)
+{
+	int ret;
+	struct scmi_xfer *t;
+	struct powercap_info *pi = ph->get_priv(ph);
+
+	if (!power_cap || domain_id >= pi->num_domains)
+		return -EINVAL;
+
+	ret = ph->xops->xfer_get_init(ph, POWERCAP_CAP_GET, sizeof(u32),
+				      sizeof(u32), &t);
+	if (ret)
+		return ret;
+
+	put_unaligned_le32(domain_id, t->tx.buf);
+	ret = ph->xops->do_xfer(ph, t);
+	if (!ret)
+		*power_cap = get_unaligned_le32(t->rx.buf);
+
+	ph->xops->xfer_put(ph, t);
+
+	return ret;
+}
+
+static int scmi_powercap_cap_set(const struct scmi_protocol_handle *ph,
+				 u32 domain_id, u32 power_cap,
+				 bool ignore_dresp)
+{
+	int ret;
+	struct scmi_xfer *t;
+	struct scmi_msg_powercap_set_cap_or_pai *msg;
+	const struct scmi_powercap_info *pc;
+
+	pc = scmi_powercap_dom_info_get(ph, domain_id);
+	if (!pc || !pc->powercap_cap_config || !power_cap ||
+	    power_cap < pc->min_power_cap ||
+	    power_cap > pc->max_power_cap)
+		return -EINVAL;
+
+	ret = ph->xops->xfer_get_init(ph, POWERCAP_CAP_SET,
+				      sizeof(*msg), 0, &t);
+	if (ret)
+		return ret;
+
+	msg = t->tx.buf;
+	msg->domain = cpu_to_le32(domain_id);
+	msg->flags =
+		cpu_to_le32(FIELD_PREP(CAP_SET_ASYNC, !!pc->async_powercap_cap_set) |
+			    FIELD_PREP(CAP_SET_IGNORE_DRESP, !!ignore_dresp));
+	msg->value = cpu_to_le32(power_cap);
+
+	if (!pc->async_powercap_cap_set || ignore_dresp) {
+		ret = ph->xops->do_xfer(ph, t);
+	} else {
+		ret = ph->xops->do_xfer_with_response(ph, t);
+		if (!ret) {
+			struct scmi_msg_resp_powercap_cap_set_complete *resp;
+
+			resp = t->rx.buf;
+			if (le32_to_cpu(resp->domain) == domain_id)
+				dev_dbg(ph->dev,
+					"Powercap ID %d CAP set async to %u\n",
+					domain_id,
+					get_unaligned_le32(&resp->power_cap));
+			else
+				ret = -EPROTO;
+		}
+	}
+
+	ph->xops->xfer_put(ph, t);
+	return ret;
+}
+
+static int scmi_powercap_pai_get(const struct scmi_protocol_handle *ph,
+				 u32 domain_id, u32 *pai)
+{
+	int ret;
+	struct scmi_xfer *t;
+	struct powercap_info *pi = ph->get_priv(ph);
+
+	if (!pai || domain_id >= pi->num_domains)
+		return -EINVAL;
+
+	ret = ph->xops->xfer_get_init(ph, POWERCAP_PAI_GET, sizeof(u32),
+				      sizeof(u32), &t);
+	if (ret)
+		return ret;
+
+	put_unaligned_le32(domain_id, t->tx.buf);
+	ret = ph->xops->do_xfer(ph, t);
+	if (!ret)
+		*pai = get_unaligned_le32(t->rx.buf);
+
+	ph->xops->xfer_put(ph, t);
+
+	return ret;
+}
+
+static int scmi_powercap_pai_set(const struct scmi_protocol_handle *ph,
+				 u32 domain_id, u32 pai)
+{
+	int ret;
+	struct scmi_xfer *t;
+	struct scmi_msg_powercap_set_cap_or_pai *msg;
+	const struct scmi_powercap_info *pc;
+
+	pc = scmi_powercap_dom_info_get(ph, domain_id);
+	if (!pc || !pc->powercap_pai_config || !pai ||
+	    pai < pc->min_pai || pai > pc->max_pai)
+		return -EINVAL;
+
+	ret = ph->xops->xfer_get_init(ph, POWERCAP_PAI_SET,
+				      sizeof(*msg), 0, &t);
+	if (ret)
+		return ret;
+
+	msg = t->tx.buf;
+	msg->domain = cpu_to_le32(domain_id);
+	msg->flags = cpu_to_le32(0);
+	msg->value = cpu_to_le32(pai);
+
+	ret = ph->xops->do_xfer(ph, t);
+
+	ph->xops->xfer_put(ph, t);
+	return ret;
+}
+
+static int scmi_powercap_measurements_get(const struct scmi_protocol_handle *ph,
+					  u32 domain_id, u32 *average_power,
+					  u32 *pai)
+{
+	int ret;
+	struct scmi_xfer *t;
+	struct scmi_msg_resp_powercap_meas_get *resp;
+	const struct scmi_powercap_info *pc;
+
+	pc = scmi_powercap_dom_info_get(ph, domain_id);
+	if (!pc || !pc->powercap_monitoring || !pai || !average_power)
+		return -EINVAL;
+
+	ret = ph->xops->xfer_get_init(ph, POWERCAP_MEASUREMENTS_GET,
+				      sizeof(u32), sizeof(*resp), &t);
+	if (ret)
+		return ret;
+
+	resp = t->rx.buf;
+	put_unaligned_le32(domain_id, t->tx.buf);
+	ret = ph->xops->do_xfer(ph, t);
+	if (!ret) {
+		*average_power = le32_to_cpu(resp->power);
+		*pai = le32_to_cpu(resp->pai);
+	}
+
+	ph->xops->xfer_put(ph, t);
+	return ret;
+}
+
+static int
+scmi_powercap_measurements_threshold_get(const struct scmi_protocol_handle *ph,
+					 u32 domain_id, u32 *power_thresh_low,
+					 u32 *power_thresh_high)
+{
+	struct powercap_info *pi = ph->get_priv(ph);
+
+	if (!power_thresh_low || !power_thresh_high ||
+	    domain_id >= pi->num_domains)
+		return -EINVAL;
+
+	*power_thresh_low =  THRESH_LOW(pi, domain_id);
+	*power_thresh_high = THRESH_HIGH(pi, domain_id);
+
+	return 0;
+}
+
+static int
+scmi_powercap_measurements_threshold_set(const struct scmi_protocol_handle *ph,
+					 u32 domain_id, u32 power_thresh_low,
+					 u32 power_thresh_high)
+{
+	int ret = 0;
+	struct powercap_info *pi = ph->get_priv(ph);
+
+	if (domain_id >= pi->num_domains ||
+	    power_thresh_low > power_thresh_high)
+		return -EINVAL;
+
+	/* Anything to do ? */
+	if (THRESH_LOW(pi, domain_id) == power_thresh_low &&
+	    THRESH_HIGH(pi, domain_id) == power_thresh_high)
+		return ret;
+
+	pi->states[domain_id].thresholds =
+		(FIELD_PREP(GENMASK_ULL(31, 0), power_thresh_low) |
+		 FIELD_PREP(GENMASK_ULL(63, 32), power_thresh_high));
+
+	/* Update thresholds if notification already enabled */
+	if (pi->states[domain_id].meas_notif_enabled)
+		ret = scmi_powercap_notify(ph, domain_id,
+					   POWERCAP_MEASUREMENTS_NOTIFY,
+					   true);
+
+	return ret;
+}
+
+static const struct scmi_powercap_proto_ops powercap_proto_ops = {
+	.num_domains_get = scmi_powercap_num_domains_get,
+	.info_get = scmi_powercap_dom_info_get,
+	.cap_get = scmi_powercap_cap_get,
+	.cap_set = scmi_powercap_cap_set,
+	.pai_get = scmi_powercap_pai_get,
+	.pai_set = scmi_powercap_pai_set,
+	.measurements_get = scmi_powercap_measurements_get,
+	.measurements_threshold_set = scmi_powercap_measurements_threshold_set,
+	.measurements_threshold_get = scmi_powercap_measurements_threshold_get,
+};
+
+static int scmi_powercap_notify(const struct scmi_protocol_handle *ph,
+				u32 domain, int message_id, bool enable)
+{
+	int ret;
+	struct scmi_xfer *t;
+
+	switch (message_id) {
+	case POWERCAP_CAP_NOTIFY:
+	{
+		struct scmi_msg_powercap_notify_cap *notify;
+
+		ret = ph->xops->xfer_get_init(ph, message_id,
+					      sizeof(*notify), 0, &t);
+		if (ret)
+			return ret;
+
+		notify = t->tx.buf;
+		notify->domain = cpu_to_le32(domain);
+		notify->notify_enable = cpu_to_le32(enable ? BIT(0) : 0);
+		break;
+	}
+	case POWERCAP_MEASUREMENTS_NOTIFY:
+	{
+		u32 low, high;
+		struct scmi_msg_powercap_notify_thresh *notify;
+
+		/*
+		 * Note that we have to pick the most recently configured
+		 * thresholds to build a proper POWERCAP_MEASUREMENTS_NOTIFY
+		 * enable request and we fail, complaining, if no thresholds
+		 * were ever set, since this is an indication the API has been
+		 * used wrongly.
+		 */
+		ret = scmi_powercap_measurements_threshold_get(ph, domain,
+							       &low, &high);
+		if (ret)
+			return ret;
+
+		if (enable && !low && !high) {
+			dev_err(ph->dev,
+				"Invalid Measurements Notify thresholds: %u/%u\n",
+				low, high);
+			return -EINVAL;
+		}
+
+		ret = ph->xops->xfer_get_init(ph, message_id,
+					      sizeof(*notify), 0, &t);
+		if (ret)
+			return ret;
+
+		notify = t->tx.buf;
+		notify->domain = cpu_to_le32(domain);
+		notify->notify_enable = cpu_to_le32(enable ? BIT(0) : 0);
+		notify->power_thresh_low = cpu_to_le32(low);
+		notify->power_thresh_high = cpu_to_le32(high);
+		break;
+	}
+	default:
+		return -EINVAL;
+	}
+
+	ret = ph->xops->do_xfer(ph, t);
+
+	ph->xops->xfer_put(ph, t);
+	return ret;
+}
+
+static int
+scmi_powercap_set_notify_enabled(const struct scmi_protocol_handle *ph,
+				 u8 evt_id, u32 src_id, bool enable)
+{
+	int ret, cmd_id;
+	struct powercap_info *pi = ph->get_priv(ph);
+
+	if (evt_id >= ARRAY_SIZE(evt_2_cmd) || src_id >= pi->num_domains)
+		return -EINVAL;
+
+	cmd_id = evt_2_cmd[evt_id];
+	ret = scmi_powercap_notify(ph, src_id, cmd_id, enable);
+	if (ret)
+		pr_debug("FAIL_ENABLED - evt[%X] dom[%d] - ret:%d\n",
+			 evt_id, src_id, ret);
+	else if (cmd_id == POWERCAP_MEASUREMENTS_NOTIFY)
+		/*
+		 * On success save the current notification enabled state, so
+		 * as to be able to properly update the notification thresholds
+		 * when they are modified on a domain for which measurement
+		 * notifications were currently enabled.
+		 *
+		 * This is needed because the SCMI Notification core machinery
+		 * and API does not support passing per-notification custom
+		 * arguments at callback registration time.
+		 *
+		 * Note that this can be done here with a simple flag since the
+		 * SCMI core Notifications code takes care of keeping proper
+		 * per-domain enables refcounting, so that this helper function
+		 * will be called only once (for enables) when the first user
+		 * registers a callback on this domain and once more (disable)
+		 * when the last user de-registers its callback.
+		 */
+		pi->states[src_id].meas_notif_enabled = enable;
+
+	return ret;
+}
+
+static void *
+scmi_powercap_fill_custom_report(const struct scmi_protocol_handle *ph,
+				 u8 evt_id, ktime_t timestamp,
+				 const void *payld, size_t payld_sz,
+				 void *report, u32 *src_id)
+{
+	void *rep = NULL;
+
+	switch (evt_id) {
+	case SCMI_EVENT_POWERCAP_CAP_CHANGED:
+	{
+		const struct scmi_powercap_cap_changed_notify_payld *p = payld;
+		struct scmi_powercap_cap_changed_report *r = report;
+
+		if (sizeof(*p) != payld_sz)
+			break;
+
+		r->timestamp = timestamp;
+		r->agent_id = le32_to_cpu(p->agent_id);
+		r->domain_id = le32_to_cpu(p->domain_id);
+		r->power_cap = le32_to_cpu(p->power_cap);
+		r->pai = le32_to_cpu(p->pai);
+		*src_id = r->domain_id;
+		rep = r;
+		break;
+	}
+	case SCMI_EVENT_POWERCAP_MEASUREMENTS_CHANGED:
+	{
+		const struct scmi_powercap_meas_changed_notify_payld *p = payld;
+		struct scmi_powercap_meas_changed_report *r = report;
+
+		if (sizeof(*p) != payld_sz)
+			break;
+
+		r->timestamp = timestamp;
+		r->agent_id = le32_to_cpu(p->agent_id);
+		r->domain_id = le32_to_cpu(p->domain_id);
+		r->power = le32_to_cpu(p->power);
+		*src_id = r->domain_id;
+		rep = r;
+		break;
+	}
+	default:
+		break;
+	}
+
+	return rep;
+}
+
+static int
+scmi_powercap_get_num_sources(const struct scmi_protocol_handle *ph)
+{
+	struct powercap_info *pi = ph->get_priv(ph);
+
+	if (!pi)
+		return -EINVAL;
+
+	return pi->num_domains;
+}
+
+static const struct scmi_event powercap_events[] = {
+	{
+		.id = SCMI_EVENT_POWERCAP_CAP_CHANGED,
+		.max_payld_sz =
+			sizeof(struct scmi_powercap_cap_changed_notify_payld),
+		.max_report_sz =
+			sizeof(struct scmi_powercap_cap_changed_report),
+	},
+	{
+		.id = SCMI_EVENT_POWERCAP_MEASUREMENTS_CHANGED,
+		.max_payld_sz =
+			sizeof(struct scmi_powercap_meas_changed_notify_payld),
+		.max_report_sz =
+			sizeof(struct scmi_powercap_meas_changed_report),
+	},
+};
+
+static const struct scmi_event_ops powercap_event_ops = {
+	.get_num_sources = scmi_powercap_get_num_sources,
+	.set_notify_enabled = scmi_powercap_set_notify_enabled,
+	.fill_custom_report = scmi_powercap_fill_custom_report,
+};
+
+static const struct scmi_protocol_events powercap_protocol_events = {
+	.queue_sz = SCMI_PROTO_QUEUE_SZ,
+	.ops = &powercap_event_ops,
+	.evts = powercap_events,
+	.num_events = ARRAY_SIZE(powercap_events),
+};
+
+static int
+scmi_powercap_protocol_init(const struct scmi_protocol_handle *ph)
+{
+	int domain, ret;
+	u32 version;
+	struct powercap_info *pinfo;
+
+	ret = ph->xops->version_get(ph, &version);
+	if (ret)
+		return ret;
+
+	dev_dbg(ph->dev, "Powercap Version %d.%d\n",
+		PROTOCOL_REV_MAJOR(version), PROTOCOL_REV_MINOR(version));
+
+	pinfo = devm_kzalloc(ph->dev, sizeof(*pinfo), GFP_KERNEL);
+	if (!pinfo)
+		return -ENOMEM;
+
+	ret = scmi_powercap_attributes_get(ph, pinfo);
+	if (ret)
+		return ret;
+
+	pinfo->powercaps = devm_kcalloc(ph->dev, pinfo->num_domains,
+					sizeof(*pinfo->powercaps),
+					GFP_KERNEL);
+	if (!pinfo->powercaps)
+		return -ENOMEM;
+
+	/*
+	 * Note that any failure in retrieving any domain attribute leads to
+	 * the whole Powercap protocol initialization failure: this way the
+	 * reported Powercap domains are all assured, when accessed, to be well
+	 * formed and correlated by sane parent-child relationship (if any).
+	 */
+	for (domain = 0; domain < pinfo->num_domains; domain++) {
+		ret = scmi_powercap_domain_attributes_get(ph, pinfo, domain);
+		if (ret)
+			return ret;
+	}
+
+	pinfo->states = devm_kcalloc(ph->dev, pinfo->num_domains,
+				     sizeof(*pinfo->states), GFP_KERNEL);
+	if (!pinfo->states)
+		return -ENOMEM;
+
+	pinfo->version = version;
+
+	return ph->set_priv(ph, pinfo);
+}
+
+static const struct scmi_protocol scmi_powercap = {
+	.id = SCMI_PROTOCOL_POWERCAP,
+	.owner = THIS_MODULE,
+	.instance_init = &scmi_powercap_protocol_init,
+	.ops = &powercap_proto_ops,
+	.events = &powercap_protocol_events,
+};
+
+DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(powercap, scmi_powercap)
diff --git a/drivers/firmware/arm_scmi/protocols.h b/drivers/firmware/arm_scmi/protocols.h
index 51c31379f9b3..99d36d503d1e 100644
--- a/drivers/firmware/arm_scmi/protocols.h
+++ b/drivers/firmware/arm_scmi/protocols.h
@@ -315,5 +315,6 @@ DECLARE_SCMI_REGISTER_UNREGISTER(reset);
 DECLARE_SCMI_REGISTER_UNREGISTER(sensors);
 DECLARE_SCMI_REGISTER_UNREGISTER(voltage);
 DECLARE_SCMI_REGISTER_UNREGISTER(system);
+DECLARE_SCMI_REGISTER_UNREGISTER(powercap);
 
 #endif /* _SCMI_PROTOCOLS_H */
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 898488c7f185..95023dc8b3a3 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -560,6 +560,114 @@ struct scmi_voltage_proto_ops {
 			 s32 *volt_uV);
 };
 
+/**
+ * struct scmi_powercap_info  - Describe one available Powercap domain
+ *
+ * @id: Domain ID as advertised by the platform.
+ * @notify_powercap_cap_change: CAP change notification support.
+ * @notify_powercap_measurement_change: MEASUREMENTS change notifications
+ *				       support.
+ * @async_powercap_cap_set: Asynchronous CAP set support.
+ * @powercap_cap_config: CAP configuration support.
+ * @powercap_monitoring: Monitoring (measurements) support.
+ * @powercap_pai_config: PAI configuration support.
+ * @powercap_scale_mw: Domain reports power data in milliwatt units.
+ * @powercap_scale_uw: Domain reports power data in microwatt units.
+ *		       Note that, when both @powercap_scale_mw and
+ *		       @powercap_scale_uw are set to false, the domain
+ *		       reports power data on an abstract linear scale.
+ * @name: name assigned to the Powercap Domain by platform.
+ * @min_pai: Minimum configurable PAI.
+ * @max_pai: Maximum configurable PAI.
+ * @pai_step: Step size between two consecutive PAI values.
+ * @min_power_cap: Minimum configurable CAP.
+ * @max_power_cap: Maximum configurable CAP.
+ * @power_cap_step: Step size between two consecutive CAP values.
+ * @sustainable_power: Maximum sustainable power consumption for this domain
+ *		       under normal conditions.
+ * @accuracy: The accuracy with which the power is measured and reported in
+ *	      integral multiples of 0.001 percent.
+ * @parent_id: Identifier of the containing parent power capping domain, or the
+ *	       value 0xFFFFFFFF if this powercap domain is a root domain not
+ *	       contained in any other domain.
+ */
+struct scmi_powercap_info {
+	unsigned int id;
+	bool notify_powercap_cap_change;
+	bool notify_powercap_measurement_change;
+	bool async_powercap_cap_set;
+	bool powercap_cap_config;
+	bool powercap_monitoring;
+	bool powercap_pai_config;
+	bool powercap_scale_mw;
+	bool powercap_scale_uw;
+	char name[SCMI_MAX_STR_SIZE];
+	unsigned int min_pai;
+	unsigned int max_pai;
+	unsigned int pai_step;
+	unsigned int min_power_cap;
+	unsigned int max_power_cap;
+	unsigned int power_cap_step;
+	unsigned int sustainable_power;
+	unsigned int accuracy;
+#define SCMI_POWERCAP_ROOT_ZONE_ID     0xFFFFFFFFUL
+	unsigned int parent_id;
+};
+
+/**
+ * struct scmi_powercap_proto_ops - represents the various operations provided
+ * by SCMI Powercap Protocol
+ *
+ * @num_domains_get: get the count of powercap domains provided by SCMI.
+ * @info_get: get the information for the specified domain.
+ * @cap_get: get the current CAP value for the specified domain.
+ * @cap_set: set the CAP value for the specified domain to the provided value;
+ *	     if the domain supports setting the CAP with an asynchronous command
+ *	     this request will finally trigger an asynchronous transfer, but, if
+ *	     @ignore_dresp here is set to true, this call will anyway return
+ *	     immediately without waiting for the related delayed response.
+ * @pai_get: get the current PAI value for the specified domain.
+ * @pai_set: set the PAI value for the specified domain to the provided value.
+ * @measurements_get: retrieve the current average power measurements for the
+ *		      specified domain and the related PAI upon which is
+ *		      calculated.
+ * @measurements_threshold_set: set the desired low and high power thresholds
+ *				to be used when registering for notification
+ *				of type POWERCAP_MEASUREMENTS_NOTIFY with this
+ *				powercap domain.
+ *				Note that this must be called at least once
+ *				before registering any callback with the usual
+ *				@scmi_notify_ops; moreover, in case this method
+ *				is called with measurement notifications already
+ *				enabled it will also trigger, transparently, a
+ *				proper update of the power thresholds configured
+ *				in the SCMI backend server.
+ * @measurements_threshold_get: get the currently configured low and high power
+ *				thresholds used when registering callbacks for
+ *				notification POWERCAP_MEASUREMENTS_NOTIFY.
+ */
+struct scmi_powercap_proto_ops {
+	int (*num_domains_get)(const struct scmi_protocol_handle *ph);
+	const struct scmi_powercap_info __must_check *(*info_get)
+		(const struct scmi_protocol_handle *ph, u32 domain_id);
+	int (*cap_get)(const struct scmi_protocol_handle *ph, u32 domain_id,
+		       u32 *power_cap);
+	int (*cap_set)(const struct scmi_protocol_handle *ph, u32 domain_id,
+		       u32 power_cap, bool ignore_dresp);
+	int (*pai_get)(const struct scmi_protocol_handle *ph, u32 domain_id,
+		       u32 *pai);
+	int (*pai_set)(const struct scmi_protocol_handle *ph, u32 domain_id,
+		       u32 pai);
+	int (*measurements_get)(const struct scmi_protocol_handle *ph,
+				u32 domain_id, u32 *average_power, u32 *pai);
+	int (*measurements_threshold_set)(const struct scmi_protocol_handle *ph,
+					  u32 domain_id, u32 power_thresh_low,
+					  u32 power_thresh_high);
+	int (*measurements_threshold_get)(const struct scmi_protocol_handle *ph,
+					  u32 domain_id, u32 *power_thresh_low,
+					  u32 *power_thresh_high);
+};
+
 /**
  * struct scmi_notify_ops  - represents notifications' operations provided by
  * SCMI core
@@ -666,6 +774,7 @@ enum scmi_std_protocol {
 	SCMI_PROTOCOL_SENSOR = 0x15,
 	SCMI_PROTOCOL_RESET = 0x16,
 	SCMI_PROTOCOL_VOLTAGE = 0x17,
+	SCMI_PROTOCOL_POWERCAP = 0x18,
 };
 
 enum scmi_system_events {
@@ -767,6 +876,8 @@ enum scmi_notification_events {
 	SCMI_EVENT_RESET_ISSUED = 0x0,
 	SCMI_EVENT_BASE_ERROR_EVENT = 0x0,
 	SCMI_EVENT_SYSTEM_POWER_STATE_NOTIFIER = 0x0,
+	SCMI_EVENT_POWERCAP_CAP_CHANGED = 0x0,
+	SCMI_EVENT_POWERCAP_MEASUREMENTS_CHANGED = 0x1,
 };
 
 struct scmi_power_state_changed_report {
@@ -837,4 +948,18 @@ struct scmi_base_error_report {
 	unsigned long long	reports[];
 };
 
+struct scmi_powercap_cap_changed_report {
+	ktime_t		timestamp;
+	unsigned int	agent_id;
+	unsigned int	domain_id;
+	unsigned int	power_cap;
+	unsigned int	pai;
+};
+
+struct scmi_powercap_meas_changed_report {
+	ktime_t		timestamp;
+	unsigned int	agent_id;
+	unsigned int	domain_id;
+	unsigned int	power;
+};
 #endif /* _LINUX_SCMI_PROTOCOL_H */
-- 
cgit 


From 855aa26e5f56d415b71d3f8d86ef0cc51b2166a3 Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Mon, 4 Jul 2022 11:22:38 +0100
Subject: firmware: arm_scmi: Add SCMI v3.1 powercap fast channels support

Add SCMIv3.1 powercap protocol fast channel support using common helpers
provided by the SCMI core with scmi_proto_helpers_ops operations.

Link: https://lore.kernel.org/r/20220704102241.2988447-5-cristian.marussi@arm.com
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/powercap.c | 169 ++++++++++++++++++++++++++++-------
 include/linux/scmi_protocol.h        |   2 +
 2 files changed, 138 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/arm_scmi/powercap.c b/drivers/firmware/arm_scmi/powercap.c
index d71a1f6c2d8a..10b77c3401b5 100644
--- a/drivers/firmware/arm_scmi/powercap.c
+++ b/drivers/firmware/arm_scmi/powercap.c
@@ -8,6 +8,7 @@
 #define pr_fmt(fmt) "SCMI Notifications POWERCAP - " fmt
 
 #include <linux/bitfield.h>
+#include <linux/io.h>
 #include <linux/module.h>
 #include <linux/scmi_protocol.h>
 
@@ -27,6 +28,12 @@ enum scmi_powercap_protocol_cmd {
 	POWERCAP_DESCRIBE_FASTCHANNEL = 0xc,
 };
 
+enum {
+	POWERCAP_FC_CAP,
+	POWERCAP_FC_PAI,
+	POWERCAP_FC_MAX,
+};
+
 struct scmi_msg_resp_powercap_domain_attributes {
 	__le32 attributes;
 #define SUPPORTS_POWERCAP_CAP_CHANGE_NOTIFY(x)		((x) & BIT(31))
@@ -36,6 +43,7 @@ struct scmi_msg_resp_powercap_domain_attributes {
 #define SUPPORTS_POWERCAP_CAP_CONFIGURATION(x)		((x) & BIT(27))
 #define SUPPORTS_POWERCAP_MONITORING(x)			((x) & BIT(26))
 #define SUPPORTS_POWERCAP_PAI_CONFIGURATION(x)		((x) & BIT(25))
+#define SUPPORTS_POWERCAP_FASTCHANNELS(x)		((x) & BIT(22))
 #define POWERCAP_POWER_UNIT(x)				\
 		(FIELD_GET(GENMASK(24, 23), (x)))
 #define	SUPPORTS_POWER_UNITS_MW(x)			\
@@ -201,6 +209,8 @@ scmi_powercap_domain_attributes_get(const struct scmi_protocol_handle *ph,
 			SUPPORTS_POWER_UNITS_MW(flags);
 		dom_info->powercap_scale_uw =
 			SUPPORTS_POWER_UNITS_UW(flags);
+		dom_info->fastchannels =
+			SUPPORTS_POWERCAP_FASTCHANNELS(flags);
 
 		strscpy(dom_info->name, resp->name, SCMI_SHORT_NAME_MAX_SIZE);
 
@@ -280,15 +290,11 @@ scmi_powercap_dom_info_get(const struct scmi_protocol_handle *ph, u32 domain_id)
 	return pi->powercaps + domain_id;
 }
 
-static int scmi_powercap_cap_get(const struct scmi_protocol_handle *ph,
-				 u32 domain_id, u32 *power_cap)
+static int scmi_powercap_xfer_cap_get(const struct scmi_protocol_handle *ph,
+				      u32 domain_id, u32 *power_cap)
 {
 	int ret;
 	struct scmi_xfer *t;
-	struct powercap_info *pi = ph->get_priv(ph);
-
-	if (!power_cap || domain_id >= pi->num_domains)
-		return -EINVAL;
 
 	ret = ph->xops->xfer_get_init(ph, POWERCAP_CAP_GET, sizeof(u32),
 				      sizeof(u32), &t);
@@ -305,20 +311,31 @@ static int scmi_powercap_cap_get(const struct scmi_protocol_handle *ph,
 	return ret;
 }
 
-static int scmi_powercap_cap_set(const struct scmi_protocol_handle *ph,
-				 u32 domain_id, u32 power_cap,
-				 bool ignore_dresp)
+static int scmi_powercap_cap_get(const struct scmi_protocol_handle *ph,
+				 u32 domain_id, u32 *power_cap)
+{
+	struct scmi_powercap_info *dom;
+	struct powercap_info *pi = ph->get_priv(ph);
+
+	if (!power_cap || domain_id >= pi->num_domains)
+		return -EINVAL;
+
+	dom = pi->powercaps + domain_id;
+	if (dom->fc_info && dom->fc_info[POWERCAP_FC_CAP].get_addr) {
+		*power_cap = ioread32(dom->fc_info[POWERCAP_FC_CAP].get_addr);
+		return 0;
+	}
+
+	return scmi_powercap_xfer_cap_get(ph, domain_id, power_cap);
+}
+
+static int scmi_powercap_xfer_cap_set(const struct scmi_protocol_handle *ph,
+				      const struct scmi_powercap_info *pc,
+				      u32 power_cap, bool ignore_dresp)
 {
 	int ret;
 	struct scmi_xfer *t;
 	struct scmi_msg_powercap_set_cap_or_pai *msg;
-	const struct scmi_powercap_info *pc;
-
-	pc = scmi_powercap_dom_info_get(ph, domain_id);
-	if (!pc || !pc->powercap_cap_config || !power_cap ||
-	    power_cap < pc->min_power_cap ||
-	    power_cap > pc->max_power_cap)
-		return -EINVAL;
 
 	ret = ph->xops->xfer_get_init(ph, POWERCAP_CAP_SET,
 				      sizeof(*msg), 0, &t);
@@ -326,7 +343,7 @@ static int scmi_powercap_cap_set(const struct scmi_protocol_handle *ph,
 		return ret;
 
 	msg = t->tx.buf;
-	msg->domain = cpu_to_le32(domain_id);
+	msg->domain = cpu_to_le32(pc->id);
 	msg->flags =
 		cpu_to_le32(FIELD_PREP(CAP_SET_ASYNC, !!pc->async_powercap_cap_set) |
 			    FIELD_PREP(CAP_SET_IGNORE_DRESP, !!ignore_dresp));
@@ -340,10 +357,10 @@ static int scmi_powercap_cap_set(const struct scmi_protocol_handle *ph,
 			struct scmi_msg_resp_powercap_cap_set_complete *resp;
 
 			resp = t->rx.buf;
-			if (le32_to_cpu(resp->domain) == domain_id)
+			if (le32_to_cpu(resp->domain) == pc->id)
 				dev_dbg(ph->dev,
 					"Powercap ID %d CAP set async to %u\n",
-					domain_id,
+					pc->id,
 					get_unaligned_le32(&resp->power_cap));
 			else
 				ret = -EPROTO;
@@ -354,16 +371,35 @@ static int scmi_powercap_cap_set(const struct scmi_protocol_handle *ph,
 	return ret;
 }
 
-static int scmi_powercap_pai_get(const struct scmi_protocol_handle *ph,
-				 u32 domain_id, u32 *pai)
+static int scmi_powercap_cap_set(const struct scmi_protocol_handle *ph,
+				 u32 domain_id, u32 power_cap,
+				 bool ignore_dresp)
 {
-	int ret;
-	struct scmi_xfer *t;
-	struct powercap_info *pi = ph->get_priv(ph);
+	const struct scmi_powercap_info *pc;
 
-	if (!pai || domain_id >= pi->num_domains)
+	pc = scmi_powercap_dom_info_get(ph, domain_id);
+	if (!pc || !pc->powercap_cap_config || !power_cap ||
+	    power_cap < pc->min_power_cap ||
+	    power_cap > pc->max_power_cap)
 		return -EINVAL;
 
+	if (pc->fc_info && pc->fc_info[POWERCAP_FC_CAP].set_addr) {
+		struct scmi_fc_info *fci = &pc->fc_info[POWERCAP_FC_CAP];
+
+		iowrite32(power_cap, fci->set_addr);
+		ph->hops->fastchannel_db_ring(fci->set_db);
+		return 0;
+	}
+
+	return scmi_powercap_xfer_cap_set(ph, pc, power_cap, ignore_dresp);
+}
+
+static int scmi_powercap_xfer_pai_get(const struct scmi_protocol_handle *ph,
+				      u32 domain_id, u32 *pai)
+{
+	int ret;
+	struct scmi_xfer *t;
+
 	ret = ph->xops->xfer_get_init(ph, POWERCAP_PAI_GET, sizeof(u32),
 				      sizeof(u32), &t);
 	if (ret)
@@ -379,18 +415,30 @@ static int scmi_powercap_pai_get(const struct scmi_protocol_handle *ph,
 	return ret;
 }
 
-static int scmi_powercap_pai_set(const struct scmi_protocol_handle *ph,
-				 u32 domain_id, u32 pai)
+static int scmi_powercap_pai_get(const struct scmi_protocol_handle *ph,
+				 u32 domain_id, u32 *pai)
+{
+	struct scmi_powercap_info *dom;
+	struct powercap_info *pi = ph->get_priv(ph);
+
+	if (!pai || domain_id >= pi->num_domains)
+		return -EINVAL;
+
+	dom = pi->powercaps + domain_id;
+	if (dom->fc_info && dom->fc_info[POWERCAP_FC_PAI].get_addr) {
+		*pai = ioread32(dom->fc_info[POWERCAP_FC_PAI].get_addr);
+		return 0;
+	}
+
+	return scmi_powercap_xfer_pai_get(ph, domain_id, pai);
+}
+
+static int scmi_powercap_xfer_pai_set(const struct scmi_protocol_handle *ph,
+				      u32 domain_id, u32 pai)
 {
 	int ret;
 	struct scmi_xfer *t;
 	struct scmi_msg_powercap_set_cap_or_pai *msg;
-	const struct scmi_powercap_info *pc;
-
-	pc = scmi_powercap_dom_info_get(ph, domain_id);
-	if (!pc || !pc->powercap_pai_config || !pai ||
-	    pai < pc->min_pai || pai > pc->max_pai)
-		return -EINVAL;
 
 	ret = ph->xops->xfer_get_init(ph, POWERCAP_PAI_SET,
 				      sizeof(*msg), 0, &t);
@@ -408,6 +456,27 @@ static int scmi_powercap_pai_set(const struct scmi_protocol_handle *ph,
 	return ret;
 }
 
+static int scmi_powercap_pai_set(const struct scmi_protocol_handle *ph,
+				 u32 domain_id, u32 pai)
+{
+	const struct scmi_powercap_info *pc;
+
+	pc = scmi_powercap_dom_info_get(ph, domain_id);
+	if (!pc || !pc->powercap_pai_config || !pai ||
+	    pai < pc->min_pai || pai > pc->max_pai)
+		return -EINVAL;
+
+	if (pc->fc_info && pc->fc_info[POWERCAP_FC_PAI].set_addr) {
+		struct scmi_fc_info *fci = &pc->fc_info[POWERCAP_FC_PAI];
+
+		iowrite32(pai, fci->set_addr);
+		ph->hops->fastchannel_db_ring(fci->set_db);
+		return 0;
+	}
+
+	return scmi_powercap_xfer_pai_set(ph, domain_id, pai);
+}
+
 static int scmi_powercap_measurements_get(const struct scmi_protocol_handle *ph,
 					  u32 domain_id, u32 *average_power,
 					  u32 *pai)
@@ -497,6 +566,36 @@ static const struct scmi_powercap_proto_ops powercap_proto_ops = {
 	.measurements_threshold_get = scmi_powercap_measurements_threshold_get,
 };
 
+static void scmi_powercap_domain_init_fc(const struct scmi_protocol_handle *ph,
+					 u32 domain, struct scmi_fc_info **p_fc)
+{
+	struct scmi_fc_info *fc;
+
+	fc = devm_kcalloc(ph->dev, POWERCAP_FC_MAX, sizeof(*fc), GFP_KERNEL);
+	if (!fc)
+		return;
+
+	ph->hops->fastchannel_init(ph, POWERCAP_DESCRIBE_FASTCHANNEL,
+				   POWERCAP_CAP_SET, 4, domain,
+				   &fc[POWERCAP_FC_CAP].set_addr,
+				   &fc[POWERCAP_FC_CAP].set_db);
+
+	ph->hops->fastchannel_init(ph, POWERCAP_DESCRIBE_FASTCHANNEL,
+				   POWERCAP_CAP_GET, 4, domain,
+				   &fc[POWERCAP_FC_CAP].get_addr, NULL);
+
+	ph->hops->fastchannel_init(ph, POWERCAP_DESCRIBE_FASTCHANNEL,
+				   POWERCAP_PAI_SET, 4, domain,
+				   &fc[POWERCAP_FC_PAI].set_addr,
+				   &fc[POWERCAP_FC_PAI].set_db);
+
+	ph->hops->fastchannel_init(ph, POWERCAP_DESCRIBE_FASTCHANNEL,
+				   POWERCAP_PAI_GET, 4, domain,
+				   &fc[POWERCAP_FC_PAI].get_addr, NULL);
+
+	*p_fc = fc;
+}
+
 static int scmi_powercap_notify(const struct scmi_protocol_handle *ph,
 				u32 domain, int message_id, bool enable)
 {
@@ -730,6 +829,10 @@ scmi_powercap_protocol_init(const struct scmi_protocol_handle *ph)
 		ret = scmi_powercap_domain_attributes_get(ph, pinfo, domain);
 		if (ret)
 			return ret;
+
+		if (pinfo->powercaps[domain].fastchannels)
+			scmi_powercap_domain_init_fc(ph, domain,
+						     &pinfo->powercaps[domain].fc_info);
 	}
 
 	pinfo->states = devm_kcalloc(ph->dev, pinfo->num_domains,
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 95023dc8b3a3..7f4f9df1b20f 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -601,6 +601,7 @@ struct scmi_powercap_info {
 	bool powercap_pai_config;
 	bool powercap_scale_mw;
 	bool powercap_scale_uw;
+	bool fastchannels;
 	char name[SCMI_MAX_STR_SIZE];
 	unsigned int min_pai;
 	unsigned int max_pai;
@@ -612,6 +613,7 @@ struct scmi_powercap_info {
 	unsigned int accuracy;
 #define SCMI_POWERCAP_ROOT_ZONE_ID     0xFFFFFFFFUL
 	unsigned int parent_id;
+	struct scmi_fc_info *fc_info;
 };
 
 /**
-- 
cgit 


From e699eb9b4f1b98be08197d699e5c34a8b576b26f Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Mon, 4 Jul 2022 11:22:39 +0100
Subject: include: trace: Add SCMI fast channel tracing

All the currently defined SCMI events are meant to trace only regular SCMI
transfers based on SCMI messages exchanges; SCMI transactions based on
fast channels, where used, are completely invisible from the tracing point
of view.

Add support to trace fast channel transactions; while doing that avoid
exposing full shared memory location addresses.

Link: https://lore.kernel.org/r/20220704102241.2988447-6-cristian.marussi@arm.com
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/trace/events/scmi.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/scmi.h b/include/trace/events/scmi.h
index 9108544beca9..65016a767b7a 100644
--- a/include/trace/events/scmi.h
+++ b/include/trace/events/scmi.h
@@ -7,6 +7,31 @@
 
 #include <linux/tracepoint.h>
 
+TRACE_EVENT(scmi_fc_call,
+	TP_PROTO(u8 protocol_id, u8 msg_id, u32 res_id, u32 val1, u32 val2),
+	TP_ARGS(protocol_id, msg_id, res_id, val1, val2),
+
+	TP_STRUCT__entry(
+		__field(u8, protocol_id)
+		__field(u8, msg_id)
+		__field(u32, res_id)
+		__field(u32, val1)
+		__field(u32, val2)
+	),
+
+	TP_fast_assign(
+		__entry->protocol_id = protocol_id;
+		__entry->msg_id = msg_id;
+		__entry->res_id = res_id;
+		__entry->val1 = val1;
+		__entry->val2 = val2;
+	),
+
+	TP_printk("[0x%02X]:[0x%02X]:[%08X]:%u:%u",
+		  __entry->protocol_id, __entry->msg_id,
+		  __entry->res_id, __entry->val1, __entry->val2)
+);
+
 TRACE_EVENT(scmi_xfer_begin,
 	TP_PROTO(int transfer_id, u8 msg_id, u8 protocol_id, u16 seq,
 		 bool poll),
-- 
cgit 


From b347aa7b57477f71c740e2bbc6d1078a7109ba23 Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@openvz.org>
Date: Fri, 3 Jun 2022 06:21:49 +0300
Subject: mm/tracing: add 'accounted' entry into output of allocation
 tracepoints

Slab caches marked with SLAB_ACCOUNT force accounting for every
allocation from this cache even if __GFP_ACCOUNT flag is not passed.
Unfortunately, at the moment this flag is not visible in ftrace output,
and this makes it difficult to analyze the accounted allocations.

This patch adds boolean "accounted" entry into trace output,
and set it to 'true' for calls used __GFP_ACCOUNT flag and
for allocations from caches marked with SLAB_ACCOUNT.
Set it to 'false' if accounting is disabled in configs.

Signed-off-by: Vasily Averin <vvs@openvz.org>
Acked-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Link: https://lore.kernel.org/r/c418ed25-65fe-f623-fbf8-1676528859ed@openvz.org
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/trace/events/kmem.h | 40 ++++++++++++++++++++++++++--------------
 mm/slab.c                   | 10 +++++-----
 mm/slab_common.c            |  9 ++++-----
 mm/slob.c                   |  8 ++++----
 mm/slub.c                   | 20 ++++++++++----------
 5 files changed, 49 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index f76668305ac5..4cb51ace600d 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -13,11 +13,12 @@ DECLARE_EVENT_CLASS(kmem_alloc,
 
 	TP_PROTO(unsigned long call_site,
 		 const void *ptr,
+		 struct kmem_cache *s,
 		 size_t bytes_req,
 		 size_t bytes_alloc,
 		 gfp_t gfp_flags),
 
-	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
+	TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags),
 
 	TP_STRUCT__entry(
 		__field(	unsigned long,	call_site	)
@@ -25,6 +26,7 @@ DECLARE_EVENT_CLASS(kmem_alloc,
 		__field(	size_t,		bytes_req	)
 		__field(	size_t,		bytes_alloc	)
 		__field(	unsigned long,	gfp_flags	)
+		__field(	bool,		accounted	)
 	),
 
 	TP_fast_assign(
@@ -33,42 +35,47 @@ DECLARE_EVENT_CLASS(kmem_alloc,
 		__entry->bytes_req	= bytes_req;
 		__entry->bytes_alloc	= bytes_alloc;
 		__entry->gfp_flags	= (__force unsigned long)gfp_flags;
+		__entry->accounted	= IS_ENABLED(CONFIG_MEMCG_KMEM) ?
+					  ((gfp_flags & __GFP_ACCOUNT) ||
+					  (s && s->flags & SLAB_ACCOUNT)) : false;
 	),
 
-	TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s",
+	TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s accounted=%s",
 		(void *)__entry->call_site,
 		__entry->ptr,
 		__entry->bytes_req,
 		__entry->bytes_alloc,
-		show_gfp_flags(__entry->gfp_flags))
+		show_gfp_flags(__entry->gfp_flags),
+		__entry->accounted ? "true" : "false")
 );
 
 DEFINE_EVENT(kmem_alloc, kmalloc,
 
-	TP_PROTO(unsigned long call_site, const void *ptr,
+	TP_PROTO(unsigned long call_site, const void *ptr, struct kmem_cache *s,
 		 size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags),
 
-	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)
+	TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags)
 );
 
 DEFINE_EVENT(kmem_alloc, kmem_cache_alloc,
 
-	TP_PROTO(unsigned long call_site, const void *ptr,
+	TP_PROTO(unsigned long call_site, const void *ptr, struct kmem_cache *s,
 		 size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags),
 
-	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)
+	TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags)
 );
 
 DECLARE_EVENT_CLASS(kmem_alloc_node,
 
 	TP_PROTO(unsigned long call_site,
 		 const void *ptr,
+		 struct kmem_cache *s,
 		 size_t bytes_req,
 		 size_t bytes_alloc,
 		 gfp_t gfp_flags,
 		 int node),
 
-	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
+	TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags, node),
 
 	TP_STRUCT__entry(
 		__field(	unsigned long,	call_site	)
@@ -77,6 +84,7 @@ DECLARE_EVENT_CLASS(kmem_alloc_node,
 		__field(	size_t,		bytes_alloc	)
 		__field(	unsigned long,	gfp_flags	)
 		__field(	int,		node		)
+		__field(	bool,		accounted	)
 	),
 
 	TP_fast_assign(
@@ -86,33 +94,37 @@ DECLARE_EVENT_CLASS(kmem_alloc_node,
 		__entry->bytes_alloc	= bytes_alloc;
 		__entry->gfp_flags	= (__force unsigned long)gfp_flags;
 		__entry->node		= node;
+		__entry->accounted	= IS_ENABLED(CONFIG_MEMCG_KMEM) ?
+					  ((gfp_flags & __GFP_ACCOUNT) ||
+					  (s && s->flags & SLAB_ACCOUNT)) : false;
 	),
 
-	TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d",
+	TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s",
 		(void *)__entry->call_site,
 		__entry->ptr,
 		__entry->bytes_req,
 		__entry->bytes_alloc,
 		show_gfp_flags(__entry->gfp_flags),
-		__entry->node)
+		__entry->node,
+		__entry->accounted ? "true" : "false")
 );
 
 DEFINE_EVENT(kmem_alloc_node, kmalloc_node,
 
 	TP_PROTO(unsigned long call_site, const void *ptr,
-		 size_t bytes_req, size_t bytes_alloc,
+		 struct kmem_cache *s, size_t bytes_req, size_t bytes_alloc,
 		 gfp_t gfp_flags, int node),
 
-	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)
+	TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags, node)
 );
 
 DEFINE_EVENT(kmem_alloc_node, kmem_cache_alloc_node,
 
 	TP_PROTO(unsigned long call_site, const void *ptr,
-		 size_t bytes_req, size_t bytes_alloc,
+		 struct kmem_cache *s, size_t bytes_req, size_t bytes_alloc,
 		 gfp_t gfp_flags, int node),
 
-	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)
+	TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags, node)
 );
 
 TRACE_EVENT(kfree,
diff --git a/mm/slab.c b/mm/slab.c
index 733d3af633cf..f7d6ca702018 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3478,7 +3478,7 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru,
 {
 	void *ret = slab_alloc(cachep, lru, flags, cachep->object_size, _RET_IP_);
 
-	trace_kmem_cache_alloc(_RET_IP_, ret,
+	trace_kmem_cache_alloc(_RET_IP_, ret, cachep,
 			       cachep->object_size, cachep->size, flags);
 
 	return ret;
@@ -3567,7 +3567,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
 	ret = slab_alloc(cachep, NULL, flags, size, _RET_IP_);
 
 	ret = kasan_kmalloc(cachep, ret, size, flags);
-	trace_kmalloc(_RET_IP_, ret,
+	trace_kmalloc(_RET_IP_, ret, cachep,
 		      size, cachep->size, flags);
 	return ret;
 }
@@ -3592,7 +3592,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 {
 	void *ret = slab_alloc_node(cachep, flags, nodeid, cachep->object_size, _RET_IP_);
 
-	trace_kmem_cache_alloc_node(_RET_IP_, ret,
+	trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep,
 				    cachep->object_size, cachep->size,
 				    flags, nodeid);
 
@@ -3611,7 +3611,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
 	ret = slab_alloc_node(cachep, flags, nodeid, size, _RET_IP_);
 
 	ret = kasan_kmalloc(cachep, ret, size, flags);
-	trace_kmalloc_node(_RET_IP_, ret,
+	trace_kmalloc_node(_RET_IP_, ret, cachep,
 			   size, cachep->size,
 			   flags, nodeid);
 	return ret;
@@ -3694,7 +3694,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
 	ret = slab_alloc(cachep, NULL, flags, size, caller);
 
 	ret = kasan_kmalloc(cachep, ret, size, flags);
-	trace_kmalloc(caller, ret,
+	trace_kmalloc(caller, ret, cachep,
 		      size, cachep->size, flags);
 
 	return ret;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 77c3adf40e50..6c9aac5d8f4a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -26,13 +26,12 @@
 #include <linux/memcontrol.h>
 #include <linux/stackdepot.h>
 
-#define CREATE_TRACE_POINTS
-#include <trace/events/kmem.h>
-
 #include "internal.h"
-
 #include "slab.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/kmem.h>
+
 enum slab_state slab_state;
 LIST_HEAD(slab_caches);
 DEFINE_MUTEX(slab_mutex);
@@ -959,7 +958,7 @@ EXPORT_SYMBOL(kmalloc_order);
 void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
 {
 	void *ret = kmalloc_order(size, flags, order);
-	trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
+	trace_kmalloc(_RET_IP_, ret, NULL, size, PAGE_SIZE << order, flags);
 	return ret;
 }
 EXPORT_SYMBOL(kmalloc_order_trace);
diff --git a/mm/slob.c b/mm/slob.c
index f47811f09aca..56421fe461c4 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -507,7 +507,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
 		*m = size;
 		ret = (void *)m + minalign;
 
-		trace_kmalloc_node(caller, ret,
+		trace_kmalloc_node(caller, ret, NULL,
 				   size, size + minalign, gfp, node);
 	} else {
 		unsigned int order = get_order(size);
@@ -516,7 +516,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
 			gfp |= __GFP_COMP;
 		ret = slob_new_pages(gfp, order, node);
 
-		trace_kmalloc_node(caller, ret,
+		trace_kmalloc_node(caller, ret, NULL,
 				   size, PAGE_SIZE << order, gfp, node);
 	}
 
@@ -616,12 +616,12 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
 
 	if (c->size < PAGE_SIZE) {
 		b = slob_alloc(c->size, flags, c->align, node, 0);
-		trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
+		trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size,
 					    SLOB_UNITS(c->size) * SLOB_UNIT,
 					    flags, node);
 	} else {
 		b = slob_new_pages(flags, get_order(c->size), node);
-		trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
+		trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size,
 					    PAGE_SIZE << get_order(c->size),
 					    flags, node);
 	}
diff --git a/mm/slub.c b/mm/slub.c
index c21edf51e99f..514741cd1c3c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3257,7 +3257,7 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
 {
 	void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size);
 
-	trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
+	trace_kmem_cache_alloc(_RET_IP_, ret, s, s->object_size,
 				s->size, gfpflags);
 
 	return ret;
@@ -3280,7 +3280,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_lru);
 void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
 {
 	void *ret = slab_alloc(s, NULL, gfpflags, _RET_IP_, size);
-	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
+	trace_kmalloc(_RET_IP_, ret, s, size, s->size, gfpflags);
 	ret = kasan_kmalloc(s, ret, size, gfpflags);
 	return ret;
 }
@@ -3292,7 +3292,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
 {
 	void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size);
 
-	trace_kmem_cache_alloc_node(_RET_IP_, ret,
+	trace_kmem_cache_alloc_node(_RET_IP_, ret, s,
 				    s->object_size, s->size, gfpflags, node);
 
 	return ret;
@@ -3306,7 +3306,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
 {
 	void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size);
 
-	trace_kmalloc_node(_RET_IP_, ret,
+	trace_kmalloc_node(_RET_IP_, ret, s,
 			   size, s->size, gfpflags, node);
 
 	ret = kasan_kmalloc(s, ret, size, gfpflags);
@@ -4441,7 +4441,7 @@ void *__kmalloc(size_t size, gfp_t flags)
 
 	ret = slab_alloc(s, NULL, flags, _RET_IP_, size);
 
-	trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
+	trace_kmalloc(_RET_IP_, ret, s, size, s->size, flags);
 
 	ret = kasan_kmalloc(s, ret, size, flags);
 
@@ -4475,7 +4475,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
 	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
 		ret = kmalloc_large_node(size, flags, node);
 
-		trace_kmalloc_node(_RET_IP_, ret,
+		trace_kmalloc_node(_RET_IP_, ret, NULL,
 				   size, PAGE_SIZE << get_order(size),
 				   flags, node);
 
@@ -4489,7 +4489,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
 
 	ret = slab_alloc_node(s, NULL, flags, node, _RET_IP_, size);
 
-	trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
+	trace_kmalloc_node(_RET_IP_, ret, s, size, s->size, flags, node);
 
 	ret = kasan_kmalloc(s, ret, size, flags);
 
@@ -4946,7 +4946,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
 	ret = slab_alloc(s, NULL, gfpflags, caller, size);
 
 	/* Honor the call site pointer we received. */
-	trace_kmalloc(caller, ret, size, s->size, gfpflags);
+	trace_kmalloc(caller, ret, s, size, s->size, gfpflags);
 
 	return ret;
 }
@@ -4962,7 +4962,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
 	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
 		ret = kmalloc_large_node(size, gfpflags, node);
 
-		trace_kmalloc_node(caller, ret,
+		trace_kmalloc_node(caller, ret, NULL,
 				   size, PAGE_SIZE << get_order(size),
 				   gfpflags, node);
 
@@ -4977,7 +4977,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
 	ret = slab_alloc_node(s, NULL, gfpflags, node, caller, size);
 
 	/* Honor the call site pointer we received. */
-	trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
+	trace_kmalloc_node(caller, ret, s, size, s->size, gfpflags, node);
 
 	return ret;
 }
-- 
cgit 


From cc1cfc47ea47187a21ec1f079b3c53264157fe15 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Mon, 4 Jul 2022 11:15:49 +0100
Subject: cacheinfo: Add support to check if last level cache(LLC) is valid or
 shared

It is useful to have helper to check if the given two CPUs share last
level cache. We can do that check by comparing fw_token or by comparing
the cache ID. Currently we check just for fw_token as the cache ID is
optional.

This helper can be used to build the llc_sibling during arch specific
topology parsing and feeding information to the sched_domains. This also
helps to get rid of llc_id in the CPU topology as it is sort of duplicate
information.

Also add helper to check if the llc information in cacheinfo is valid
or not.

Link: https://lore.kernel.org/r/20220704101605.1318280-6-sudeep.holla@arm.com
Tested-by: Ionela Voinescu <ionela.voinescu@arm.com>
Tested-by: Conor Dooley <conor.dooley@microchip.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/base/cacheinfo.c  | 26 ++++++++++++++++++++++++++
 include/linux/cacheinfo.h |  2 ++
 2 files changed, 28 insertions(+)

(limited to 'include')

diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
index 2cea9201f31c..fdc1baa342f1 100644
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -47,6 +47,32 @@ static inline bool cache_leaves_are_shared(struct cacheinfo *this_leaf,
 	return sib_leaf->fw_token == this_leaf->fw_token;
 }
 
+bool last_level_cache_is_valid(unsigned int cpu)
+{
+	struct cacheinfo *llc;
+
+	if (!cache_leaves(cpu))
+		return false;
+
+	llc = per_cpu_cacheinfo_idx(cpu, cache_leaves(cpu) - 1);
+
+	return !!llc->fw_token;
+}
+
+bool last_level_cache_is_shared(unsigned int cpu_x, unsigned int cpu_y)
+{
+	struct cacheinfo *llc_x, *llc_y;
+
+	if (!last_level_cache_is_valid(cpu_x) ||
+	    !last_level_cache_is_valid(cpu_y))
+		return false;
+
+	llc_x = per_cpu_cacheinfo_idx(cpu_x, cache_leaves(cpu_x) - 1);
+	llc_y = per_cpu_cacheinfo_idx(cpu_y, cache_leaves(cpu_y) - 1);
+
+	return cache_leaves_are_shared(llc_x, llc_y);
+}
+
 #ifdef CONFIG_OF
 /* OF properties to query for a given cache type */
 struct cache_type_info {
diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
index 4ff37cb763ae..7e429bc5c1a4 100644
--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -82,6 +82,8 @@ struct cpu_cacheinfo *get_cpu_cacheinfo(unsigned int cpu);
 int init_cache_level(unsigned int cpu);
 int populate_cache_leaves(unsigned int cpu);
 int cache_setup_acpi(unsigned int cpu);
+bool last_level_cache_is_valid(unsigned int cpu);
+bool last_level_cache_is_shared(unsigned int cpu_x, unsigned int cpu_y);
 #ifndef CONFIG_ACPI_PPTT
 /*
  * acpi_find_last_cache_level is only called on ACPI enabled
-- 
cgit 


From 36bbc5b4ffab33ccac0f4db27f619a6ba7a4fd32 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Mon, 4 Jul 2022 11:15:50 +0100
Subject: cacheinfo: Allow early detection and population of cache attributes

Some architecture/platforms may need to setup cache properties very
early in the boot along with other cpu topologies so that all these
information can be used to build sched_domains which is used by the
scheduler.

Allow detect_cache_attributes to be called quite early during the boot.

Link: https://lore.kernel.org/r/20220704101605.1318280-7-sudeep.holla@arm.com
Tested-by: Ionela Voinescu <ionela.voinescu@arm.com>
Tested-by: Conor Dooley <conor.dooley@microchip.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/base/cacheinfo.c  | 55 +++++++++++++++++++++++++++++++----------------
 include/linux/cacheinfo.h |  1 +
 2 files changed, 38 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
index fdc1baa342f1..4d21a1022fa9 100644
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -193,14 +193,8 @@ static int cache_setup_of_node(unsigned int cpu)
 {
 	struct device_node *np;
 	struct cacheinfo *this_leaf;
-	struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
 	unsigned int index = 0;
 
-	/* skip if fw_token is already populated */
-	if (this_cpu_ci->info_list->fw_token) {
-		return 0;
-	}
-
 	np = of_cpu_device_node_get(cpu);
 	if (!np) {
 		pr_err("Failed to find cpu%d device node\n", cpu);
@@ -236,6 +230,18 @@ int __weak cache_setup_acpi(unsigned int cpu)
 
 unsigned int coherency_max_size;
 
+static int cache_setup_properties(unsigned int cpu)
+{
+	int ret = 0;
+
+	if (of_have_populated_dt())
+		ret = cache_setup_of_node(cpu);
+	else if (!acpi_disabled)
+		ret = cache_setup_acpi(cpu);
+
+	return ret;
+}
+
 static int cache_shared_cpu_map_setup(unsigned int cpu)
 {
 	struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
@@ -246,21 +252,21 @@ static int cache_shared_cpu_map_setup(unsigned int cpu)
 	if (this_cpu_ci->cpu_map_populated)
 		return 0;
 
-	if (of_have_populated_dt())
-		ret = cache_setup_of_node(cpu);
-	else if (!acpi_disabled)
-		ret = cache_setup_acpi(cpu);
-
-	if (ret)
-		return ret;
+	/*
+	 * skip setting up cache properties if LLC is valid, just need
+	 * to update the shared cpu_map if the cache attributes were
+	 * populated early before all the cpus are brought online
+	 */
+	if (!last_level_cache_is_valid(cpu)) {
+		ret = cache_setup_properties(cpu);
+		if (ret)
+			return ret;
+	}
 
 	for (index = 0; index < cache_leaves(cpu); index++) {
 		unsigned int i;
 
 		this_leaf = per_cpu_cacheinfo_idx(cpu, index);
-		/* skip if shared_cpu_map is already populated */
-		if (!cpumask_empty(&this_leaf->shared_cpu_map))
-			continue;
 
 		cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map);
 		for_each_online_cpu(i) {
@@ -330,17 +336,28 @@ int __weak populate_cache_leaves(unsigned int cpu)
 	return -ENOENT;
 }
 
-static int detect_cache_attributes(unsigned int cpu)
+int detect_cache_attributes(unsigned int cpu)
 {
 	int ret;
 
+	/* Since early detection of the cacheinfo is allowed via this
+	 * function and this also gets called as CPU hotplug callbacks via
+	 * cacheinfo_cpu_online, the initialisation can be skipped and only
+	 * CPU maps can be updated as the CPU online status would be update
+	 * if called via cacheinfo_cpu_online path.
+	 */
+	if (per_cpu_cacheinfo(cpu))
+		goto update_cpu_map;
+
 	if (init_cache_level(cpu) || !cache_leaves(cpu))
 		return -ENOENT;
 
 	per_cpu_cacheinfo(cpu) = kcalloc(cache_leaves(cpu),
 					 sizeof(struct cacheinfo), GFP_KERNEL);
-	if (per_cpu_cacheinfo(cpu) == NULL)
+	if (per_cpu_cacheinfo(cpu) == NULL) {
+		cache_leaves(cpu) = 0;
 		return -ENOMEM;
+	}
 
 	/*
 	 * populate_cache_leaves() may completely setup the cache leaves and
@@ -349,6 +366,8 @@ static int detect_cache_attributes(unsigned int cpu)
 	ret = populate_cache_leaves(cpu);
 	if (ret)
 		goto free_ci;
+
+update_cpu_map:
 	/*
 	 * For systems using DT for cache hierarchy, fw_token
 	 * and shared_cpu_map will be set up here only if they are
diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
index 7e429bc5c1a4..00b7a6ae8617 100644
--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -84,6 +84,7 @@ int populate_cache_leaves(unsigned int cpu);
 int cache_setup_acpi(unsigned int cpu);
 bool last_level_cache_is_valid(unsigned int cpu);
 bool last_level_cache_is_shared(unsigned int cpu_x, unsigned int cpu_y);
+int detect_cache_attributes(unsigned int cpu);
 #ifndef CONFIG_ACPI_PPTT
 /*
  * acpi_find_last_cache_level is only called on ACPI enabled
-- 
cgit 


From 5b8dc787ce4a45c87254d1b0b22f161347ab7f81 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Mon, 4 Jul 2022 11:15:56 +0100
Subject: arch_topology: Drop LLC identifier stash from the CPU topology

Since the cacheinfo LLC information is used directly in arch_topology,
there is no need to parse and store the LLC ID information only for
ACPI systems in the CPU topology.

Remove the redundant LLC ID from the generic CPU arch_topology
information.

Link: https://lore.kernel.org/r/20220704101605.1318280-13-sudeep.holla@arm.com
Tested-by: Ionela Voinescu <ionela.voinescu@arm.com>
Tested-by: Conor Dooley <conor.dooley@microchip.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/base/arch_topology.c  | 1 -
 include/linux/arch_topology.h | 1 -
 2 files changed, 2 deletions(-)

(limited to 'include')

diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 4f936c984fb6..8206990c679f 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -752,7 +752,6 @@ void __init reset_cpu_topology(void)
 		cpu_topo->core_id = -1;
 		cpu_topo->cluster_id = -1;
 		cpu_topo->package_id = -1;
-		cpu_topo->llc_id = -1;
 
 		clear_cpu_topology(cpu);
 	}
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index 58cbe18d825c..a07b510e7dc5 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -68,7 +68,6 @@ struct cpu_topology {
 	int core_id;
 	int cluster_id;
 	int package_id;
-	int llc_id;
 	cpumask_t thread_sibling;
 	cpumask_t core_sibling;
 	cpumask_t cluster_sibling;
-- 
cgit 


From 7128af87c7f1c30cd6cebe0b012cc25872c689e2 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Mon, 4 Jul 2022 11:16:05 +0100
Subject: ACPI: Remove the unused find_acpi_cpu_cache_topology()

The sole user of this find_acpi_cpu_cache_topology() was arm64 topology
which is now consolidated into the generic arch_topology without the need
of this function.

Drop the unused function find_acpi_cpu_cache_topology().

Link: https://lore.kernel.org/r/20220704101605.1318280-22-sudeep.holla@arm.com
Cc: Rafael J. Wysocki <rafael@kernel.org>
Reported-by: Ionela Voinescu <ionela.voinescu@arm.com>
Tested-by: Conor Dooley <conor.dooley@microchip.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/acpi/pptt.c  | 37 -------------------------------------
 include/linux/acpi.h |  5 -----
 2 files changed, 42 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c
index 763f021d45e6..dd3222a15c9c 100644
--- a/drivers/acpi/pptt.c
+++ b/drivers/acpi/pptt.c
@@ -691,43 +691,6 @@ int find_acpi_cpu_topology(unsigned int cpu, int level)
 	return find_acpi_cpu_topology_tag(cpu, level, 0);
 }
 
-/**
- * find_acpi_cpu_cache_topology() - Determine a unique cache topology value
- * @cpu: Kernel logical CPU number
- * @level: The cache level for which we would like a unique ID
- *
- * Determine a unique ID for each unified cache in the system
- *
- * Return: -ENOENT if the PPTT doesn't exist, or the CPU cannot be found.
- * Otherwise returns a value which represents a unique topological feature.
- */
-int find_acpi_cpu_cache_topology(unsigned int cpu, int level)
-{
-	struct acpi_table_header *table;
-	struct acpi_pptt_cache *found_cache;
-	acpi_status status;
-	u32 acpi_cpu_id = get_acpi_id_for_cpu(cpu);
-	struct acpi_pptt_processor *cpu_node = NULL;
-	int ret = -1;
-
-	status = acpi_get_table(ACPI_SIG_PPTT, 0, &table);
-	if (ACPI_FAILURE(status)) {
-		acpi_pptt_warn_missing();
-		return -ENOENT;
-	}
-
-	found_cache = acpi_find_cache_node(table, acpi_cpu_id,
-					   CACHE_TYPE_UNIFIED,
-					   level,
-					   &cpu_node);
-	if (found_cache)
-		ret = ACPI_PTR_DIFF(cpu_node, table);
-
-	acpi_put_table(table);
-
-	return ret;
-}
-
 /**
  * find_acpi_cpu_topology_package() - Determine a unique CPU package value
  * @cpu: Kernel logical CPU number
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 4f82a5bc6d98..7b96a8bff6d2 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1429,7 +1429,6 @@ int find_acpi_cpu_topology(unsigned int cpu, int level);
 int find_acpi_cpu_topology_cluster(unsigned int cpu);
 int find_acpi_cpu_topology_package(unsigned int cpu);
 int find_acpi_cpu_topology_hetero_id(unsigned int cpu);
-int find_acpi_cpu_cache_topology(unsigned int cpu, int level);
 #else
 static inline int acpi_pptt_cpu_is_thread(unsigned int cpu)
 {
@@ -1451,10 +1450,6 @@ static inline int find_acpi_cpu_topology_hetero_id(unsigned int cpu)
 {
 	return -EINVAL;
 }
-static inline int find_acpi_cpu_cache_topology(unsigned int cpu, int level)
-{
-	return -EINVAL;
-}
 #endif
 
 #ifdef CONFIG_ACPI_PCC
-- 
cgit 


From 50d6281ce9b8412f7ef02d1bc9d23aa62ae0cf98 Mon Sep 17 00:00:00 2001
From: Ren Zhijie <renzhijie2@huawei.com>
Date: Thu, 30 Jun 2022 20:35:28 +0800
Subject: dma-mapping: Fix build error unused-value
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If CONFIG_DMA_DECLARE_COHERENT is not set,
make ARCH=x86_64 CROSS_COMPILE=x86_64-linux-gnu- will be failed, like this:

drivers/remoteproc/remoteproc_core.c: In function ‘rproc_rvdev_release’:
./include/linux/dma-map-ops.h:182:42: error: statement with no effect [-Werror=unused-value]
 #define dma_release_coherent_memory(dev) (0)
                                          ^
drivers/remoteproc/remoteproc_core.c:464:2: note: in expansion of macro ‘dma_release_coherent_memory’
  dma_release_coherent_memory(dev);
  ^~~~~~~~~~~~~~~~~~~~~~~~~~~
cc1: all warnings being treated as errors

The return type of function dma_release_coherent_memory in CONFIG_DMA_DECLARE_COHERENT area is void, so in !CONFIG_DMA_DECLARE_COHERENT area it should neither return any value nor be defined as zero.

Reported-by: Hulk Robot <hulkci@huawei.com>
Fixes: e61c451476e6 ("dma-mapping: Add dma_release_coherent_memory to DMA API")
Signed-off-by: Ren Zhijie <renzhijie2@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20220630123528.251181-1-renzhijie2@huawei.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
---
 include/linux/dma-map-ops.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 53db9655efe9..bfffe494356a 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -179,10 +179,10 @@ static inline int dma_declare_coherent_memory(struct device *dev,
 	return -ENOSYS;
 }
 
-#define dma_release_coherent_memory(dev) (0)
 #define dma_alloc_from_dev_coherent(dev, size, handle, ret) (0)
 #define dma_release_from_dev_coherent(dev, order, vaddr) (0)
 #define dma_mmap_from_dev_coherent(dev, vma, vaddr, order, ret) (0)
+static inline void dma_release_coherent_memory(struct device *dev) { }
 #endif /* CONFIG_DMA_DECLARE_COHERENT */
 
 #ifdef CONFIG_DMA_GLOBAL_POOL
-- 
cgit 


From 668d361c9d893be3cbd4f3650e1934a62b204def Mon Sep 17 00:00:00 2001
From: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Date: Wed, 22 Jun 2022 19:17:22 +0100
Subject: dt-bindings: clock: r9a07g043-cpg: Add Renesas RZ/Five CPG Clock and
 Reset Definitions

Renesas RZ/Five SoC has almost the same clock structure compared to the
Renesas RZ/G2UL SoC, re-use the r9a07g043-cpg.h header file and just
amend the RZ/Five CPG clock and reset definitions.

Signed-off-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20220622181723.13033-2-prabhakar.mahadev-lad.rj@bp.renesas.com
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 include/dt-bindings/clock/r9a07g043-cpg.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/r9a07g043-cpg.h b/include/dt-bindings/clock/r9a07g043-cpg.h
index 27e232733096..77cde8effdc7 100644
--- a/include/dt-bindings/clock/r9a07g043-cpg.h
+++ b/include/dt-bindings/clock/r9a07g043-cpg.h
@@ -108,6 +108,15 @@
 #define R9A07G043_ADC_ADCLK		76
 #define R9A07G043_ADC_PCLK		77
 #define R9A07G043_TSU_PCLK		78
+#define R9A07G043_NCEPLDM_DM_CLK	79	/* RZ/Five Only */
+#define R9A07G043_NCEPLDM_ACLK		80	/* RZ/Five Only */
+#define R9A07G043_NCEPLDM_TCK		81	/* RZ/Five Only */
+#define R9A07G043_NCEPLMT_ACLK		82	/* RZ/Five Only */
+#define R9A07G043_NCEPLIC_ACLK		83	/* RZ/Five Only */
+#define R9A07G043_AX45MP_CORE0_CLK	84	/* RZ/Five Only */
+#define R9A07G043_AX45MP_ACLK		85	/* RZ/Five Only */
+#define R9A07G043_IAX45_CLK		86	/* RZ/Five Only */
+#define R9A07G043_IAX45_PCLK		87	/* RZ/Five Only */
 
 /* R9A07G043 Resets */
 #define R9A07G043_CA55_RST_1_0		0	/* RZ/G2UL Only */
@@ -180,5 +189,16 @@
 #define R9A07G043_ADC_PRESETN		67
 #define R9A07G043_ADC_ADRST_N		68
 #define R9A07G043_TSU_PRESETN		69
+#define R9A07G043_NCEPLDM_DTM_PWR_RST_N	70	/* RZ/Five Only */
+#define R9A07G043_NCEPLDM_ARESETN	71	/* RZ/Five Only */
+#define R9A07G043_NCEPLMT_POR_RSTN	72	/* RZ/Five Only */
+#define R9A07G043_NCEPLMT_ARESETN	73	/* RZ/Five Only */
+#define R9A07G043_NCEPLIC_ARESETN	74	/* RZ/Five Only */
+#define R9A07G043_AX45MP_ARESETNM	75	/* RZ/Five Only */
+#define R9A07G043_AX45MP_ARESETNS	76	/* RZ/Five Only */
+#define R9A07G043_AX45MP_L2_RESETN	77	/* RZ/Five Only */
+#define R9A07G043_AX45MP_CORE0_RESETN	78	/* RZ/Five Only */
+#define R9A07G043_IAX45_RESETN		79	/* RZ/Five Only */
+
 
 #endif /* __DT_BINDINGS_CLOCK_R9A07G043_CPG_H__ */
-- 
cgit 


From dcc165d6179c3934b93b8c3bffde1ed9710fd7ef Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 3 Jun 2022 20:07:07 +0300
Subject: ASoC: madera: Replace kernel.h with the necessary inclusions

When kernel.h is used in the headers it adds a lot into dependency hell,
especially when there are circular dependencies are involved.

Replace kernel.h inclusion with the list of what is really being used.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20220603170707.48728-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/madera-pdata.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/sound/madera-pdata.h b/include/sound/madera-pdata.h
index e3060f48f108..58398d80c3de 100644
--- a/include/sound/madera-pdata.h
+++ b/include/sound/madera-pdata.h
@@ -9,7 +9,7 @@
 #ifndef MADERA_CODEC_PDATA_H
 #define MADERA_CODEC_PDATA_H
 
-#include <linux/kernel.h>
+#include <linux/types.h>
 
 #define MADERA_MAX_INPUT		6
 #define MADERA_MAX_MUXED_CHANNELS	4
-- 
cgit 


From 3dcb861dbc6ab101838a1548b1efddd00ca3c3ec Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Thu, 30 Jun 2022 11:40:59 +0200
Subject: ACPI: VIOT: Fix ACS setup

Currently acpi_viot_init() gets called after the pci
device has been scanned and pci_enable_acs() has been called.
So pci_request_acs() fails to be taken into account leading
to wrong single iommu group topologies when dealing with
multi-function root ports for instance.

We cannot simply move the acpi_viot_init() earlier, similarly
as the IORT init because the VIOT parsing relies on the pci
scan. However we can detect VIOT is present earlier and in
such a case, request ACS. Introduce a new acpi_viot_early_init()
routine that allows to call pci_request_acs() before the scan.

While at it, guard the call to pci_request_acs() with #ifdef
CONFIG_PCI.

Fixes: 3cf485540e7b ("ACPI: Add driver for the VIOT table")
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Reported-by: Jin Liu <jinl@redhat.com>
Reviewed-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Tested-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/bus.c        |  1 +
 drivers/acpi/viot.c       | 26 ++++++++++++++++++++------
 include/linux/acpi_viot.h |  2 ++
 3 files changed, 23 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 86fa61a21826..906ad8153fd9 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -1400,6 +1400,7 @@ static int __init acpi_init(void)
 
 	pci_mmcfg_late_init();
 	acpi_iort_init();
+	acpi_viot_early_init();
 	acpi_hest_init();
 	acpi_ghes_init();
 	acpi_scan_init();
diff --git a/drivers/acpi/viot.c b/drivers/acpi/viot.c
index d2256326c73a..647f11cf165d 100644
--- a/drivers/acpi/viot.c
+++ b/drivers/acpi/viot.c
@@ -248,6 +248,26 @@ err_free:
 	return ret;
 }
 
+/**
+ * acpi_viot_early_init - Test the presence of VIOT and enable ACS
+ *
+ * If the VIOT does exist, ACS must be enabled. This cannot be
+ * done in acpi_viot_init() which is called after the bus scan
+ */
+void __init acpi_viot_early_init(void)
+{
+#ifdef CONFIG_PCI
+	acpi_status status;
+	struct acpi_table_header *hdr;
+
+	status = acpi_get_table(ACPI_SIG_VIOT, 0, &hdr);
+	if (ACPI_FAILURE(status))
+		return;
+	pci_request_acs();
+	acpi_put_table(hdr);
+#endif
+}
+
 /**
  * acpi_viot_init - Parse the VIOT table
  *
@@ -319,12 +339,6 @@ static int viot_pci_dev_iommu_init(struct pci_dev *pdev, u16 dev_id, void *data)
 			epid = ((domain_nr - ep->segment_start) << 16) +
 				dev_id - ep->bdf_start + ep->endpoint_id;
 
-			/*
-			 * If we found a PCI range managed by the viommu, we're
-			 * the one that has to request ACS.
-			 */
-			pci_request_acs();
-
 			return viot_dev_iommu_init(&pdev->dev, ep->viommu,
 						   epid);
 		}
diff --git a/include/linux/acpi_viot.h b/include/linux/acpi_viot.h
index 1eb8ee5b0e5f..a5a122431563 100644
--- a/include/linux/acpi_viot.h
+++ b/include/linux/acpi_viot.h
@@ -6,9 +6,11 @@
 #include <linux/acpi.h>
 
 #ifdef CONFIG_ACPI_VIOT
+void __init acpi_viot_early_init(void);
 void __init acpi_viot_init(void);
 int viot_iommu_configure(struct device *dev);
 #else
+static inline void acpi_viot_early_init(void) {}
 static inline void acpi_viot_init(void) {}
 static inline int viot_iommu_configure(struct device *dev)
 {
-- 
cgit 


From 09d3154a6f0f0bb5b604832095804780f3684b96 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Mon, 6 Jun 2022 22:51:58 -0500
Subject: PM: wakeup: Unify device_init_wakeup() for PM_SLEEP and !PM_SLEEP

Previously the CONFIG_PM_SLEEP and !CONFIG_PM_SLEEP device_init_wakeup()
implementations differed in confusing ways:

  - The PM_SLEEP version checked for a NULL device pointer and returned
    -EINVAL, while the !PM_SLEEP version did not and would simply
    dereference a NULL pointer.

  - When called with "false", the !PM_SLEEP version cleared "capable" and
    "enable" in the opposite order of the PM_SLEEP version.  That was
    harmless because for !PM_SLEEP they're simple assignments, but it's
    unnecessary confusion.

Use a simplified version of the PM_SLEEP implementation for both cases.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/wakeup.c | 30 ------------------------------
 include/linux/pm_wakeup.h   | 31 +++++++++++++++++++++++--------
 2 files changed, 23 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 11a4ffe91367..e3befa2c1b66 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -500,36 +500,6 @@ void device_set_wakeup_capable(struct device *dev, bool capable)
 }
 EXPORT_SYMBOL_GPL(device_set_wakeup_capable);
 
-/**
- * device_init_wakeup - Device wakeup initialization.
- * @dev: Device to handle.
- * @enable: Whether or not to enable @dev as a wakeup device.
- *
- * By default, most devices should leave wakeup disabled.  The exceptions are
- * devices that everyone expects to be wakeup sources: keyboards, power buttons,
- * possibly network interfaces, etc.  Also, devices that don't generate their
- * own wakeup requests but merely forward requests from one bus to another
- * (like PCI bridges) should have wakeup enabled by default.
- */
-int device_init_wakeup(struct device *dev, bool enable)
-{
-	int ret = 0;
-
-	if (!dev)
-		return -EINVAL;
-
-	if (enable) {
-		device_set_wakeup_capable(dev, true);
-		ret = device_wakeup_enable(dev);
-	} else {
-		device_wakeup_disable(dev);
-		device_set_wakeup_capable(dev, false);
-	}
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(device_init_wakeup);
-
 /**
  * device_set_wakeup_enable - Enable or disable a device to wake up the system.
  * @dev: Device to handle.
diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h
index 196a157456aa..77f4849e3418 100644
--- a/include/linux/pm_wakeup.h
+++ b/include/linux/pm_wakeup.h
@@ -109,7 +109,6 @@ extern struct wakeup_source *wakeup_sources_walk_next(struct wakeup_source *ws);
 extern int device_wakeup_enable(struct device *dev);
 extern int device_wakeup_disable(struct device *dev);
 extern void device_set_wakeup_capable(struct device *dev, bool capable);
-extern int device_init_wakeup(struct device *dev, bool val);
 extern int device_set_wakeup_enable(struct device *dev, bool enable);
 extern void __pm_stay_awake(struct wakeup_source *ws);
 extern void pm_stay_awake(struct device *dev);
@@ -167,13 +166,6 @@ static inline int device_set_wakeup_enable(struct device *dev, bool enable)
 	return 0;
 }
 
-static inline int device_init_wakeup(struct device *dev, bool val)
-{
-	device_set_wakeup_capable(dev, val);
-	device_set_wakeup_enable(dev, val);
-	return 0;
-}
-
 static inline bool device_may_wakeup(struct device *dev)
 {
 	return dev->power.can_wakeup && dev->power.should_wakeup;
@@ -217,4 +209,27 @@ static inline void pm_wakeup_hard_event(struct device *dev)
 	return pm_wakeup_dev_event(dev, 0, true);
 }
 
+/**
+ * device_init_wakeup - Device wakeup initialization.
+ * @dev: Device to handle.
+ * @enable: Whether or not to enable @dev as a wakeup device.
+ *
+ * By default, most devices should leave wakeup disabled.  The exceptions are
+ * devices that everyone expects to be wakeup sources: keyboards, power buttons,
+ * possibly network interfaces, etc.  Also, devices that don't generate their
+ * own wakeup requests but merely forward requests from one bus to another
+ * (like PCI bridges) should have wakeup enabled by default.
+ */
+static inline int device_init_wakeup(struct device *dev, bool enable)
+{
+	if (enable) {
+		device_set_wakeup_capable(dev, true);
+		return device_wakeup_enable(dev);
+	} else {
+		device_wakeup_disable(dev);
+		device_set_wakeup_capable(dev, false);
+		return 0;
+	}
+}
+
 #endif /* _LINUX_PM_WAKEUP_H */
-- 
cgit 


From e67198cc05b8ecbb7b8e2d8ef9fb5c8d26821873 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 8 Jun 2022 16:40:25 +0200
Subject: context_tracking: Take idle eqs entrypoints over RCU

The RCU dynticks counter is going to be merged into the context tracking
subsystem. Start with moving the idle extended quiescent states
entrypoints to context tracking. For now those are dumb redirections to
existing RCU calls.

[ paulmck: Apply kernel test robot feedback. ]

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Nicolas Saenz Julienne <nsaenz@kernel.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Cc: Yu Liao <liaoyu15@huawei.com>
Cc: Phil Auld <pauld@redhat.com>
Cc: Paul Gortmaker<paul.gortmaker@windriver.com>
Cc: Alex Belits <abelits@marvell.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Tested-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
---
 Documentation/RCU/stallwarn.rst   |  4 ++--
 arch/arm/mach-imx/cpuidle-imx6q.c |  5 +++--
 drivers/acpi/processor_idle.c     |  5 +++--
 drivers/cpuidle/cpuidle.c         |  9 +++++----
 include/linux/context_tracking.h  |  8 ++++++++
 include/linux/rcupdate.h          |  2 +-
 kernel/context_tracking.c         | 15 +++++++++++++++
 kernel/locking/lockdep.c          |  2 +-
 kernel/rcu/Kconfig                |  2 ++
 kernel/rcu/tree.c                 |  2 --
 kernel/rcu/update.c               |  2 +-
 kernel/sched/idle.c               | 10 +++++-----
 kernel/sched/sched.h              |  1 +
 kernel/time/Kconfig               |  6 ++++++
 14 files changed, 53 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst
index 794837eb519b..b95bda7755fa 100644
--- a/Documentation/RCU/stallwarn.rst
+++ b/Documentation/RCU/stallwarn.rst
@@ -97,8 +97,8 @@ warnings:
 	which will include additional debugging information.
 
 -	A low-level kernel issue that either fails to invoke one of the
-	variants of rcu_user_enter(), rcu_user_exit(), rcu_idle_enter(),
-	rcu_idle_exit(), rcu_irq_enter(), or rcu_irq_exit() on the one
+	variants of rcu_user_enter(), rcu_user_exit(), ct_idle_enter(),
+	ct_idle_exit(), rcu_irq_enter(), or rcu_irq_exit() on the one
 	hand, or that invokes one of them too many times on the other.
 	Historically, the most frequent issue has been an omission
 	of either irq_enter() or irq_exit(), which in turn invoke
diff --git a/arch/arm/mach-imx/cpuidle-imx6q.c b/arch/arm/mach-imx/cpuidle-imx6q.c
index 094337dc1bc7..d086cbae09c3 100644
--- a/arch/arm/mach-imx/cpuidle-imx6q.c
+++ b/arch/arm/mach-imx/cpuidle-imx6q.c
@@ -3,6 +3,7 @@
  * Copyright (C) 2012 Freescale Semiconductor, Inc.
  */
 
+#include <linux/context_tracking.h>
 #include <linux/cpuidle.h>
 #include <linux/module.h>
 #include <asm/cpuidle.h>
@@ -24,9 +25,9 @@ static int imx6q_enter_wait(struct cpuidle_device *dev,
 		imx6_set_lpm(WAIT_UNCLOCKED);
 	raw_spin_unlock(&cpuidle_lock);
 
-	rcu_idle_enter();
+	ct_idle_enter();
 	cpu_do_idle();
-	rcu_idle_exit();
+	ct_idle_exit();
 
 	raw_spin_lock(&cpuidle_lock);
 	if (num_idle_cpus-- == num_online_cpus())
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 6a5572a1a80c..1401d193a2df 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -23,6 +23,7 @@
 #include <linux/minmax.h>
 #include <linux/perf_event.h>
 #include <acpi/processor.h>
+#include <linux/context_tracking.h>
 
 /*
  * Include the apic definitions for x86 to have the APIC timer related defines
@@ -647,11 +648,11 @@ static int acpi_idle_enter_bm(struct cpuidle_driver *drv,
 		raw_spin_unlock(&c3_lock);
 	}
 
-	rcu_idle_enter();
+	ct_idle_enter();
 
 	acpi_idle_do_entry(cx);
 
-	rcu_idle_exit();
+	ct_idle_exit();
 
 	/* Re-enable bus master arbitration */
 	if (dis_bm) {
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index ef2ea1b12cd8..62dd956025f3 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -23,6 +23,7 @@
 #include <linux/suspend.h>
 #include <linux/tick.h>
 #include <linux/mmu_context.h>
+#include <linux/context_tracking.h>
 #include <trace/events/power.h>
 
 #include "cpuidle.h"
@@ -150,12 +151,12 @@ static void enter_s2idle_proper(struct cpuidle_driver *drv,
 	 */
 	stop_critical_timings();
 	if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE))
-		rcu_idle_enter();
+		ct_idle_enter();
 	target_state->enter_s2idle(dev, drv, index);
 	if (WARN_ON_ONCE(!irqs_disabled()))
 		local_irq_disable();
 	if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE))
-		rcu_idle_exit();
+		ct_idle_exit();
 	tick_unfreeze();
 	start_critical_timings();
 
@@ -233,10 +234,10 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
 
 	stop_critical_timings();
 	if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE))
-		rcu_idle_enter();
+		ct_idle_enter();
 	entered_state = target_state->enter(dev, drv, index);
 	if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE))
-		rcu_idle_exit();
+		ct_idle_exit();
 	start_critical_timings();
 
 	sched_clock_idle_wakeup_event();
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index e35ae66b4794..01abadb2f993 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -119,4 +119,12 @@ extern void context_tracking_init(void);
 static inline void context_tracking_init(void) { }
 #endif /* CONFIG_CONTEXT_TRACKING_USER_FORCE */
 
+#ifdef CONFIG_CONTEXT_TRACKING_IDLE
+extern void ct_idle_enter(void);
+extern void ct_idle_exit(void);
+#else
+static inline void ct_idle_enter(void) { }
+static inline void ct_idle_exit(void) { }
+#endif /* !CONFIG_CONTEXT_TRACKING_IDLE */
+
 #endif
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 1a32036c918c..6ebe754501c3 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -128,7 +128,7 @@ static inline void rcu_nocb_flush_deferred_wakeup(void) { }
  * @a: Code that RCU needs to pay attention to.
  *
  * RCU read-side critical sections are forbidden in the inner idle loop,
- * that is, between the rcu_idle_enter() and the rcu_idle_exit() -- RCU
+ * that is, between the ct_idle_enter() and the ct_idle_exit() -- RCU
  * will happily ignore any such read-side critical sections.  However,
  * things like powertop need tracepoints in the inner idle loop.
  *
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index f3dec1be2bf6..c0b3798d4e94 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -22,6 +22,21 @@
 #include <linux/export.h>
 #include <linux/kprobes.h>
 
+
+#ifdef CONFIG_CONTEXT_TRACKING_IDLE
+noinstr void ct_idle_enter(void)
+{
+	rcu_idle_enter();
+}
+EXPORT_SYMBOL_GPL(ct_idle_enter);
+
+void ct_idle_exit(void)
+{
+	rcu_idle_exit();
+}
+EXPORT_SYMBOL_GPL(ct_idle_exit);
+#endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */
+
 #ifdef CONFIG_CONTEXT_TRACKING_USER
 
 #define CREATE_TRACE_POINTS
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index f06b91ca6482..5ea690cb4f7a 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -6570,7 +6570,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
 
 	/*
 	 * If a CPU is in the RCU-free window in idle (ie: in the section
-	 * between rcu_idle_enter() and rcu_idle_exit(), then RCU
+	 * between ct_idle_enter() and ct_idle_exit(), then RCU
 	 * considers that CPU to be in an "extended quiescent state",
 	 * which means that RCU will be completely ignoring that CPU.
 	 * Therefore, rcu_read_lock() and friends have absolutely no
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 1c630e573548..3fa24e63d6f9 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -8,6 +8,8 @@ menu "RCU Subsystem"
 config TREE_RCU
 	bool
 	default y if SMP
+	# Dynticks-idle tracking
+	select CONTEXT_TRACKING_IDLE
 	help
 	  This option selects the RCU implementation that is
 	  designed for very large SMP system with hundreds or
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9a5edab5558c..051fed0844b6 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -664,7 +664,6 @@ void noinstr rcu_idle_enter(void)
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
 	rcu_eqs_enter(false);
 }
-EXPORT_SYMBOL_GPL(rcu_idle_enter);
 
 #ifdef CONFIG_NO_HZ_FULL
 
@@ -904,7 +903,6 @@ void noinstr rcu_idle_exit(void)
 	rcu_eqs_exit(false);
 	raw_local_irq_restore(flags);
 }
-EXPORT_SYMBOL_GPL(rcu_idle_exit);
 
 #ifdef CONFIG_NO_HZ_FULL
 /**
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index fc7fef575606..147214b2cd68 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -85,7 +85,7 @@ module_param(rcu_normal_after_boot, int, 0444);
  * and while lockdep is disabled.
  *
  * Note that if the CPU is in the idle loop from an RCU point of view (ie:
- * that we are in the section between rcu_idle_enter() and rcu_idle_exit())
+ * that we are in the section between ct_idle_enter() and ct_idle_exit())
  * then rcu_read_lock_held() sets ``*ret`` to false even if the CPU did an
  * rcu_read_lock().  The reason for this is that RCU ignores CPUs that are
  * in such a section, considering these as in extended quiescent state,
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 328cccbee444..f26ab2675f7d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -53,14 +53,14 @@ static noinline int __cpuidle cpu_idle_poll(void)
 {
 	trace_cpu_idle(0, smp_processor_id());
 	stop_critical_timings();
-	rcu_idle_enter();
+	ct_idle_enter();
 	local_irq_enable();
 
 	while (!tif_need_resched() &&
 	       (cpu_idle_force_poll || tick_check_broadcast_expired()))
 		cpu_relax();
 
-	rcu_idle_exit();
+	ct_idle_exit();
 	start_critical_timings();
 	trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
 
@@ -98,12 +98,12 @@ void __cpuidle default_idle_call(void)
 		 *
 		 * Trace IRQs enable here, then switch off RCU, and have
 		 * arch_cpu_idle() use raw_local_irq_enable(). Note that
-		 * rcu_idle_enter() relies on lockdep IRQ state, so switch that
+		 * ct_idle_enter() relies on lockdep IRQ state, so switch that
 		 * last -- this is very similar to the entry code.
 		 */
 		trace_hardirqs_on_prepare();
 		lockdep_hardirqs_on_prepare();
-		rcu_idle_enter();
+		ct_idle_enter();
 		lockdep_hardirqs_on(_THIS_IP_);
 
 		arch_cpu_idle();
@@ -116,7 +116,7 @@ void __cpuidle default_idle_call(void)
 		 */
 		raw_local_irq_disable();
 		lockdep_hardirqs_off(_THIS_IP_);
-		rcu_idle_exit();
+		ct_idle_exit();
 		lockdep_hardirqs_on(_THIS_IP_);
 		raw_local_irq_enable();
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 47b89a0fc6e5..0cfe2d0af294 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -27,6 +27,7 @@
 #include <linux/capability.h>
 #include <linux/cgroup_api.h>
 #include <linux/cgroup.h>
+#include <linux/context_tracking.h>
 #include <linux/cpufreq.h>
 #include <linux/cpumask_api.h>
 #include <linux/ctype.h>
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 41f99bcfe9e6..a41753be1a2b 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -76,6 +76,12 @@ config TIME_KUNIT_TEST
 config CONTEXT_TRACKING
 	bool
 
+config CONTEXT_TRACKING_IDLE
+	bool
+	select CONTEXT_TRACKING
+	help
+	  Tracks idle state on behalf of RCU.
+
 if GENERIC_CLOCKEVENTS
 menu "Timers subsystem"
 
-- 
cgit 


From 6f0e6c1598b1a3d19fc30db86b6e26d6f881b43d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 8 Jun 2022 16:40:26 +0200
Subject: context_tracking: Take IRQ eqs entrypoints over RCU

The RCU dynticks counter is going to be merged into the context tracking
subsystem. Prepare with moving the IRQ extended quiescent states
entrypoints to context tracking. For now those are dumb redirection to
existing RCU calls.

[ paulmck: Apply Stephen Rothwell feedback from -next. ]
[ paulmck: Apply Nathan Chancellor feedback. ]

Acked-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Nicolas Saenz Julienne <nsaenz@kernel.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Cc: Yu Liao <liaoyu15@huawei.com>
Cc: Phil Auld <pauld@redhat.com>
Cc: Paul Gortmaker<paul.gortmaker@windriver.com>
Cc: Alex Belits <abelits@marvell.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Tested-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
---
 .../RCU/Design/Requirements/Requirements.rst       | 10 ++++-----
 Documentation/RCU/stallwarn.rst                    |  4 ++--
 arch/Kconfig                                       |  2 +-
 arch/arm64/kernel/entry-common.c                   |  6 +++---
 arch/x86/mm/fault.c                                |  2 +-
 drivers/cpuidle/cpuidle-psci.c                     |  8 ++++----
 drivers/cpuidle/cpuidle-riscv-sbi.c                |  8 ++++----
 include/linux/context_tracking_irq.h               | 17 +++++++++++++++
 include/linux/context_tracking_state.h             |  1 +
 include/linux/entry-common.h                       | 10 ++++-----
 include/linux/rcupdate.h                           |  5 +++--
 include/linux/tracepoint.h                         |  4 ++--
 kernel/cfi.c                                       |  4 ++--
 kernel/context_tracking.c                          | 24 ++++++++++++++++++++--
 kernel/cpu_pm.c                                    |  8 ++++----
 kernel/entry/common.c                              | 12 +++++------
 kernel/softirq.c                                   |  4 ++--
 kernel/trace/trace.c                               |  6 +++---
 18 files changed, 87 insertions(+), 48 deletions(-)
 create mode 100644 include/linux/context_tracking_irq.h

(limited to 'include')

diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst
index 04ed8bf27a0e..074810c73936 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -1844,10 +1844,10 @@ that meets this requirement.
 
 Furthermore, NMI handlers can be interrupted by what appear to RCU to be
 normal interrupts. One way that this can happen is for code that
-directly invokes rcu_irq_enter() and rcu_irq_exit() to be called
+directly invokes ct_irq_enter() and ct_irq_exit() to be called
 from an NMI handler. This astonishing fact of life prompted the current
-code structure, which has rcu_irq_enter() invoking
-rcu_nmi_enter() and rcu_irq_exit() invoking rcu_nmi_exit().
+code structure, which has ct_irq_enter() invoking
+rcu_nmi_enter() and ct_irq_exit() invoking rcu_nmi_exit().
 And yes, I also learned of this requirement the hard way.
 
 Loadable Modules
@@ -2195,7 +2195,7 @@ scheduling-clock interrupt be enabled when RCU needs it to be:
    sections, and RCU believes this CPU to be idle, no problem. This
    sort of thing is used by some architectures for light-weight
    exception handlers, which can then avoid the overhead of
-   rcu_irq_enter() and rcu_irq_exit() at exception entry and
+   ct_irq_enter() and ct_irq_exit() at exception entry and
    exit, respectively. Some go further and avoid the entireties of
    irq_enter() and irq_exit().
    Just make very sure you are running some of your tests with
@@ -2226,7 +2226,7 @@ scheduling-clock interrupt be enabled when RCU needs it to be:
 +-----------------------------------------------------------------------+
 | **Answer**:                                                           |
 +-----------------------------------------------------------------------+
-| One approach is to do ``rcu_irq_exit();rcu_irq_enter();`` every so    |
+| One approach is to do ``ct_irq_exit();ct_irq_enter();`` every so      |
 | often. But given that long-running interrupt handlers can cause other |
 | problems, not least for response time, shouldn't you work to keep     |
 | your interrupt handler's runtime within reasonable bounds?            |
diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst
index b95bda7755fa..ce1f58a9d954 100644
--- a/Documentation/RCU/stallwarn.rst
+++ b/Documentation/RCU/stallwarn.rst
@@ -98,11 +98,11 @@ warnings:
 
 -	A low-level kernel issue that either fails to invoke one of the
 	variants of rcu_user_enter(), rcu_user_exit(), ct_idle_enter(),
-	ct_idle_exit(), rcu_irq_enter(), or rcu_irq_exit() on the one
+	ct_idle_exit(), ct_irq_enter(), or ct_irq_exit() on the one
 	hand, or that invokes one of them too many times on the other.
 	Historically, the most frequent issue has been an omission
 	of either irq_enter() or irq_exit(), which in turn invoke
-	rcu_irq_enter() or rcu_irq_exit(), respectively.  Building your
+	ct_irq_enter() or ct_irq_exit(), respectively.  Building your
 	kernel with CONFIG_RCU_EQS_DEBUG=y can help track down these types
 	of issues, which sometimes arise in architecture-specific code.
 
diff --git a/arch/Kconfig b/arch/Kconfig
index 154b7b78da09..342642be105f 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -782,7 +782,7 @@ config HAVE_CONTEXT_TRACKING_USER
 	  Syscalls need to be wrapped inside user_exit()-user_enter(), either
 	  optimized behind static key or through the slow path using TIF_NOHZ
 	  flag. Exceptions handlers must be wrapped as well. Irqs are already
-	  protected inside rcu_irq_enter/rcu_irq_exit() but preemption or signal
+	  protected inside ct_irq_enter/ct_irq_exit() but preemption or signal
 	  handling on irq exit still need to be protected.
 
 config HAVE_CONTEXT_TRACKING_USER_OFFSTACK
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index 56cefd33eb8e..8dabe9ec10f1 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -41,7 +41,7 @@ static __always_inline void __enter_from_kernel_mode(struct pt_regs *regs)
 
 	if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
 		lockdep_hardirqs_off(CALLER_ADDR0);
-		rcu_irq_enter();
+		ct_irq_enter();
 		trace_hardirqs_off_finish();
 
 		regs->exit_rcu = true;
@@ -76,7 +76,7 @@ static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs)
 		if (regs->exit_rcu) {
 			trace_hardirqs_on_prepare();
 			lockdep_hardirqs_on_prepare();
-			rcu_irq_exit();
+			ct_irq_exit();
 			lockdep_hardirqs_on(CALLER_ADDR0);
 			return;
 		}
@@ -84,7 +84,7 @@ static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs)
 		trace_hardirqs_on();
 	} else {
 		if (regs->exit_rcu)
-			rcu_irq_exit();
+			ct_irq_exit();
 	}
 }
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index fad8faa29d04..971977c438fc 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1526,7 +1526,7 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
 
 	/*
 	 * Entry handling for valid #PF from kernel mode is slightly
-	 * different: RCU is already watching and rcu_irq_enter() must not
+	 * different: RCU is already watching and ct_irq_enter() must not
 	 * be invoked because a kernel fault on a user space address might
 	 * sleep.
 	 *
diff --git a/drivers/cpuidle/cpuidle-psci.c b/drivers/cpuidle/cpuidle-psci.c
index 540105ca0781..57bc3e3ae391 100644
--- a/drivers/cpuidle/cpuidle-psci.c
+++ b/drivers/cpuidle/cpuidle-psci.c
@@ -69,12 +69,12 @@ static int __psci_enter_domain_idle_state(struct cpuidle_device *dev,
 		return -1;
 
 	/* Do runtime PM to manage a hierarchical CPU toplogy. */
-	rcu_irq_enter_irqson();
+	ct_irq_enter_irqson();
 	if (s2idle)
 		dev_pm_genpd_suspend(pd_dev);
 	else
 		pm_runtime_put_sync_suspend(pd_dev);
-	rcu_irq_exit_irqson();
+	ct_irq_exit_irqson();
 
 	state = psci_get_domain_state();
 	if (!state)
@@ -82,12 +82,12 @@ static int __psci_enter_domain_idle_state(struct cpuidle_device *dev,
 
 	ret = psci_cpu_suspend_enter(state) ? -1 : idx;
 
-	rcu_irq_enter_irqson();
+	ct_irq_enter_irqson();
 	if (s2idle)
 		dev_pm_genpd_resume(pd_dev);
 	else
 		pm_runtime_get_sync(pd_dev);
-	rcu_irq_exit_irqson();
+	ct_irq_exit_irqson();
 
 	cpu_pm_exit();
 
diff --git a/drivers/cpuidle/cpuidle-riscv-sbi.c b/drivers/cpuidle/cpuidle-riscv-sbi.c
index 1151e5e2ba82..862a2876f1c9 100644
--- a/drivers/cpuidle/cpuidle-riscv-sbi.c
+++ b/drivers/cpuidle/cpuidle-riscv-sbi.c
@@ -116,12 +116,12 @@ static int __sbi_enter_domain_idle_state(struct cpuidle_device *dev,
 		return -1;
 
 	/* Do runtime PM to manage a hierarchical CPU toplogy. */
-	rcu_irq_enter_irqson();
+	ct_irq_enter_irqson();
 	if (s2idle)
 		dev_pm_genpd_suspend(pd_dev);
 	else
 		pm_runtime_put_sync_suspend(pd_dev);
-	rcu_irq_exit_irqson();
+	ct_irq_exit_irqson();
 
 	if (sbi_is_domain_state_available())
 		state = sbi_get_domain_state();
@@ -130,12 +130,12 @@ static int __sbi_enter_domain_idle_state(struct cpuidle_device *dev,
 
 	ret = sbi_suspend(state) ? -1 : idx;
 
-	rcu_irq_enter_irqson();
+	ct_irq_enter_irqson();
 	if (s2idle)
 		dev_pm_genpd_resume(pd_dev);
 	else
 		pm_runtime_get_sync(pd_dev);
-	rcu_irq_exit_irqson();
+	ct_irq_exit_irqson();
 
 	cpu_pm_exit();
 
diff --git a/include/linux/context_tracking_irq.h b/include/linux/context_tracking_irq.h
new file mode 100644
index 000000000000..62f62bbd1a50
--- /dev/null
+++ b/include/linux/context_tracking_irq.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CONTEXT_TRACKING_IRQ_H
+#define _LINUX_CONTEXT_TRACKING_IRQ_H
+
+#ifdef CONFIG_CONTEXT_TRACKING_IDLE
+void ct_irq_enter(void);
+void ct_irq_exit(void);
+void ct_irq_enter_irqson(void);
+void ct_irq_exit_irqson(void);
+#else
+static inline void ct_irq_enter(void) { }
+static inline void ct_irq_exit(void) { }
+static inline void ct_irq_enter_irqson(void) { }
+static inline void ct_irq_exit_irqson(void) { }
+#endif
+
+#endif
diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 2b46afe105a9..9c16a8b2c194 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -4,6 +4,7 @@
 
 #include <linux/percpu.h>
 #include <linux/static_key.h>
+#include <linux/context_tracking_irq.h>
 
 struct context_tracking {
 	/*
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index c92ac75d6556..84a466b176cf 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -357,7 +357,7 @@ void irqentry_exit_to_user_mode(struct pt_regs *regs);
 /**
  * struct irqentry_state - Opaque object for exception state storage
  * @exit_rcu: Used exclusively in the irqentry_*() calls; signals whether the
- *            exit path has to invoke rcu_irq_exit().
+ *            exit path has to invoke ct_irq_exit().
  * @lockdep: Used exclusively in the irqentry_nmi_*() calls; ensures that
  *           lockdep state is restored correctly on exit from nmi.
  *
@@ -395,12 +395,12 @@ typedef struct irqentry_state {
  *
  * For kernel mode entries RCU handling is done conditional. If RCU is
  * watching then the only RCU requirement is to check whether the tick has
- * to be restarted. If RCU is not watching then rcu_irq_enter() has to be
- * invoked on entry and rcu_irq_exit() on exit.
+ * to be restarted. If RCU is not watching then ct_irq_enter() has to be
+ * invoked on entry and ct_irq_exit() on exit.
  *
- * Avoiding the rcu_irq_enter/exit() calls is an optimization but also
+ * Avoiding the ct_irq_enter/exit() calls is an optimization but also
  * solves the problem of kernel mode pagefaults which can schedule, which
- * is not possible after invoking rcu_irq_enter() without undoing it.
+ * is not possible after invoking ct_irq_enter() without undoing it.
  *
  * For user mode entries irqentry_enter_from_user_mode() is invoked to
  * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 6ebe754501c3..f1562d91c67d 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -29,6 +29,7 @@
 #include <linux/lockdep.h>
 #include <asm/processor.h>
 #include <linux/cpumask.h>
+#include <linux/context_tracking_irq.h>
 
 #define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
 #define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))
@@ -143,9 +144,9 @@ static inline void rcu_nocb_flush_deferred_wakeup(void) { }
  */
 #define RCU_NONIDLE(a) \
 	do { \
-		rcu_irq_enter_irqson(); \
+		ct_irq_enter_irqson(); \
 		do { a; } while (0); \
-		rcu_irq_exit_irqson(); \
+		ct_irq_exit_irqson(); \
 	} while (0)
 
 /*
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 28031b15f878..55717a2eda08 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -200,13 +200,13 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 		 */							\
 		if (rcuidle) {						\
 			__idx = srcu_read_lock_notrace(&tracepoint_srcu);\
-			rcu_irq_enter_irqson();				\
+			ct_irq_enter_irqson();				\
 		}							\
 									\
 		__DO_TRACE_CALL(name, TP_ARGS(args));			\
 									\
 		if (rcuidle) {						\
-			rcu_irq_exit_irqson();				\
+			ct_irq_exit_irqson();				\
 			srcu_read_unlock_notrace(&tracepoint_srcu, __idx);\
 		}							\
 									\
diff --git a/kernel/cfi.c b/kernel/cfi.c
index 08102d19ec15..2046276ee234 100644
--- a/kernel/cfi.c
+++ b/kernel/cfi.c
@@ -295,7 +295,7 @@ static inline cfi_check_fn find_check_fn(unsigned long ptr)
 	rcu_idle = !rcu_is_watching();
 	if (rcu_idle) {
 		local_irq_save(flags);
-		rcu_irq_enter();
+		ct_irq_enter();
 	}
 
 	if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW))
@@ -304,7 +304,7 @@ static inline cfi_check_fn find_check_fn(unsigned long ptr)
 		fn = find_module_check_fn(ptr);
 
 	if (rcu_idle) {
-		rcu_irq_exit();
+		ct_irq_exit();
 		local_irq_restore(flags);
 	}
 
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index c0b3798d4e94..72bd71a02c44 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -35,6 +35,26 @@ void ct_idle_exit(void)
 	rcu_idle_exit();
 }
 EXPORT_SYMBOL_GPL(ct_idle_exit);
+
+noinstr void ct_irq_enter(void)
+{
+	rcu_irq_enter();
+}
+
+noinstr void ct_irq_exit(void)
+{
+	rcu_irq_exit();
+}
+
+void ct_irq_enter_irqson(void)
+{
+	rcu_irq_enter_irqson();
+}
+
+void ct_irq_exit_irqson(void)
+{
+	rcu_irq_exit_irqson();
+}
 #endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */
 
 #ifdef CONFIG_CONTEXT_TRACKING_USER
@@ -90,7 +110,7 @@ void noinstr __ct_user_enter(enum ctx_state state)
 			 * At this stage, only low level arch entry code remains and
 			 * then we'll run in userspace. We can assume there won't be
 			 * any RCU read-side critical section until the next call to
-			 * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
+			 * user_exit() or ct_irq_enter(). Let's remove RCU's dependency
 			 * on the tick.
 			 */
 			if (state == CONTEXT_USER) {
@@ -136,7 +156,7 @@ void ct_user_enter(enum ctx_state state)
 	/*
 	 * Some contexts may involve an exception occuring in an irq,
 	 * leading to that nesting:
-	 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
+	 * ct_irq_enter() rcu_user_exit() rcu_user_exit() ct_irq_exit()
 	 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
 	 * helpers are enough to protect RCU uses inside the exception. So
 	 * just return immediately if we detect we are in an IRQ.
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 246efc74e3f3..ba4ba71facf9 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -35,11 +35,11 @@ static int cpu_pm_notify(enum cpu_pm_event event)
 	 * disfunctional in cpu idle. Copy RCU_NONIDLE code to let RCU know
 	 * this.
 	 */
-	rcu_irq_enter_irqson();
+	ct_irq_enter_irqson();
 	rcu_read_lock();
 	ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL);
 	rcu_read_unlock();
-	rcu_irq_exit_irqson();
+	ct_irq_exit_irqson();
 
 	return notifier_to_errno(ret);
 }
@@ -49,11 +49,11 @@ static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev
 	unsigned long flags;
 	int ret;
 
-	rcu_irq_enter_irqson();
+	ct_irq_enter_irqson();
 	raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
 	ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL);
 	raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
-	rcu_irq_exit_irqson();
+	ct_irq_exit_irqson();
 
 	return notifier_to_errno(ret);
 }
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 032f164abe7c..667ba5d581ff 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -321,7 +321,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
 	}
 
 	/*
-	 * If this entry hit the idle task invoke rcu_irq_enter() whether
+	 * If this entry hit the idle task invoke ct_irq_enter() whether
 	 * RCU is watching or not.
 	 *
 	 * Interrupts can nest when the first interrupt invokes softirq
@@ -332,12 +332,12 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
 	 * not nested into another interrupt.
 	 *
 	 * Checking for rcu_is_watching() here would prevent the nesting
-	 * interrupt to invoke rcu_irq_enter(). If that nested interrupt is
+	 * interrupt to invoke ct_irq_enter(). If that nested interrupt is
 	 * the tick then rcu_flavor_sched_clock_irq() would wrongfully
 	 * assume that it is the first interrupt and eventually claim
 	 * quiescent state and end grace periods prematurely.
 	 *
-	 * Unconditionally invoke rcu_irq_enter() so RCU state stays
+	 * Unconditionally invoke ct_irq_enter() so RCU state stays
 	 * consistent.
 	 *
 	 * TINY_RCU does not support EQS, so let the compiler eliminate
@@ -350,7 +350,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
 		 * as in irqentry_enter_from_user_mode().
 		 */
 		lockdep_hardirqs_off(CALLER_ADDR0);
-		rcu_irq_enter();
+		ct_irq_enter();
 		instrumentation_begin();
 		trace_hardirqs_off_finish();
 		instrumentation_end();
@@ -418,7 +418,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
 			trace_hardirqs_on_prepare();
 			lockdep_hardirqs_on_prepare();
 			instrumentation_end();
-			rcu_irq_exit();
+			ct_irq_exit();
 			lockdep_hardirqs_on(CALLER_ADDR0);
 			return;
 		}
@@ -436,7 +436,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
 		 * was not watching on entry.
 		 */
 		if (state.exit_rcu)
-			rcu_irq_exit();
+			ct_irq_exit();
 	}
 }
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9f0aef8aa9ff..c8a6913c067d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -620,7 +620,7 @@ void irq_enter_rcu(void)
  */
 void irq_enter(void)
 {
-	rcu_irq_enter();
+	ct_irq_enter();
 	irq_enter_rcu();
 }
 
@@ -672,7 +672,7 @@ void irq_exit_rcu(void)
 void irq_exit(void)
 {
 	__irq_exit_rcu();
-	rcu_irq_exit();
+	ct_irq_exit();
 	 /* must be last! */
 	lockdep_hardirq_exit();
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2c95992e2c71..fe78a6818126 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3107,15 +3107,15 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
 	/*
 	 * When an NMI triggers, RCU is enabled via rcu_nmi_enter(),
 	 * but if the above rcu_is_watching() failed, then the NMI
-	 * triggered someplace critical, and rcu_irq_enter() should
+	 * triggered someplace critical, and ct_irq_enter() should
 	 * not be called from NMI.
 	 */
 	if (unlikely(in_nmi()))
 		return;
 
-	rcu_irq_enter_irqson();
+	ct_irq_enter_irqson();
 	__ftrace_trace_stack(buffer, trace_ctx, skip, NULL);
-	rcu_irq_exit_irqson();
+	ct_irq_exit_irqson();
 }
 
 /**
-- 
cgit 


From 493c1822825f00025d6754ec0632990a27edc6f8 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 8 Jun 2022 16:40:27 +0200
Subject: context_tracking: Take NMI eqs entrypoints over RCU

The RCU dynticks counter is going to be merged into the context tracking
subsystem. Prepare with moving the NMI extended quiescent states
entrypoints to context tracking. For now those are dumb redirection to
existing RCU calls.

Acked-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Nicolas Saenz Julienne <nsaenz@kernel.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Cc: Yu Liao <liaoyu15@huawei.com>
Cc: Phil Auld <pauld@redhat.com>
Cc: Paul Gortmaker<paul.gortmaker@windriver.com>
Cc: Alex Belits <abelits@marvell.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Tested-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
---
 Documentation/RCU/Design/Requirements/Requirements.rst |  2 +-
 arch/Kconfig                                           |  2 +-
 arch/arm64/kernel/entry-common.c                       |  8 ++++----
 include/linux/context_tracking_irq.h                   |  4 ++++
 include/linux/hardirq.h                                |  4 ++--
 kernel/context_tracking.c                              | 10 ++++++++++
 kernel/entry/common.c                                  |  4 ++--
 kernel/extable.c                                       |  4 ++--
 kernel/trace/trace.c                                   |  2 +-
 9 files changed, 27 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst
index 074810c73936..a0f8164c8513 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -1847,7 +1847,7 @@ normal interrupts. One way that this can happen is for code that
 directly invokes ct_irq_enter() and ct_irq_exit() to be called
 from an NMI handler. This astonishing fact of life prompted the current
 code structure, which has ct_irq_enter() invoking
-rcu_nmi_enter() and ct_irq_exit() invoking rcu_nmi_exit().
+ct_nmi_enter() and ct_irq_exit() invoking ct_nmi_exit().
 And yes, I also learned of this requirement the hard way.
 
 Loadable Modules
diff --git a/arch/Kconfig b/arch/Kconfig
index 342642be105f..f56f7c0e924d 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -797,7 +797,7 @@ config HAVE_CONTEXT_TRACKING_USER_OFFSTACK
 
 	  - Critical entry code isn't preemptible (or better yet:
 	    not interruptible).
-	  - No use of RCU read side critical sections, unless rcu_nmi_enter()
+	  - No use of RCU read side critical sections, unless ct_nmi_enter()
 	    got called.
 	  - No use of instrumentation, unless instrumentation_begin() got
 	    called.
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index 8dabe9ec10f1..c75ca36b4a49 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -161,7 +161,7 @@ static void noinstr arm64_enter_nmi(struct pt_regs *regs)
 	__nmi_enter();
 	lockdep_hardirqs_off(CALLER_ADDR0);
 	lockdep_hardirq_enter();
-	rcu_nmi_enter();
+	ct_nmi_enter();
 
 	trace_hardirqs_off_finish();
 	ftrace_nmi_enter();
@@ -182,7 +182,7 @@ static void noinstr arm64_exit_nmi(struct pt_regs *regs)
 		lockdep_hardirqs_on_prepare();
 	}
 
-	rcu_nmi_exit();
+	ct_nmi_exit();
 	lockdep_hardirq_exit();
 	if (restore)
 		lockdep_hardirqs_on(CALLER_ADDR0);
@@ -199,7 +199,7 @@ static void noinstr arm64_enter_el1_dbg(struct pt_regs *regs)
 	regs->lockdep_hardirqs = lockdep_hardirqs_enabled();
 
 	lockdep_hardirqs_off(CALLER_ADDR0);
-	rcu_nmi_enter();
+	ct_nmi_enter();
 
 	trace_hardirqs_off_finish();
 }
@@ -218,7 +218,7 @@ static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs)
 		lockdep_hardirqs_on_prepare();
 	}
 
-	rcu_nmi_exit();
+	ct_nmi_exit();
 	if (restore)
 		lockdep_hardirqs_on(CALLER_ADDR0);
 }
diff --git a/include/linux/context_tracking_irq.h b/include/linux/context_tracking_irq.h
index 62f62bbd1a50..c50b5670c4a5 100644
--- a/include/linux/context_tracking_irq.h
+++ b/include/linux/context_tracking_irq.h
@@ -7,11 +7,15 @@ void ct_irq_enter(void);
 void ct_irq_exit(void);
 void ct_irq_enter_irqson(void);
 void ct_irq_exit_irqson(void);
+void ct_nmi_enter(void);
+void ct_nmi_exit(void);
 #else
 static inline void ct_irq_enter(void) { }
 static inline void ct_irq_exit(void) { }
 static inline void ct_irq_enter_irqson(void) { }
 static inline void ct_irq_exit_irqson(void) { }
+static inline void ct_nmi_enter(void) { }
+static inline void ct_nmi_exit(void) { }
 #endif
 
 #endif
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 76878b357ffa..345cdbe9c1b7 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -124,7 +124,7 @@ extern void rcu_nmi_exit(void);
 	do {							\
 		__nmi_enter();					\
 		lockdep_hardirq_enter();			\
-		rcu_nmi_enter();				\
+		ct_nmi_enter();				\
 		instrumentation_begin();			\
 		ftrace_nmi_enter();				\
 		instrumentation_end();				\
@@ -143,7 +143,7 @@ extern void rcu_nmi_exit(void);
 		instrumentation_begin();			\
 		ftrace_nmi_exit();				\
 		instrumentation_end();				\
-		rcu_nmi_exit();					\
+		ct_nmi_exit();					\
 		lockdep_hardirq_exit();				\
 		__nmi_exit();					\
 	} while (0)
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 72bd71a02c44..b8a731f20778 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -55,6 +55,16 @@ void ct_irq_exit_irqson(void)
 {
 	rcu_irq_exit_irqson();
 }
+
+noinstr void ct_nmi_enter(void)
+{
+	rcu_nmi_enter();
+}
+
+noinstr void ct_nmi_exit(void)
+{
+	rcu_nmi_exit();
+}
 #endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */
 
 #ifdef CONFIG_CONTEXT_TRACKING_USER
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 667ba5d581ff..063068a9ea9b 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -449,7 +449,7 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
 	__nmi_enter();
 	lockdep_hardirqs_off(CALLER_ADDR0);
 	lockdep_hardirq_enter();
-	rcu_nmi_enter();
+	ct_nmi_enter();
 
 	instrumentation_begin();
 	trace_hardirqs_off_finish();
@@ -469,7 +469,7 @@ void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
 	}
 	instrumentation_end();
 
-	rcu_nmi_exit();
+	ct_nmi_exit();
 	lockdep_hardirq_exit();
 	if (irq_state.lockdep)
 		lockdep_hardirqs_on(CALLER_ADDR0);
diff --git a/kernel/extable.c b/kernel/extable.c
index bda5e9761541..71f482581cab 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -114,7 +114,7 @@ int kernel_text_address(unsigned long addr)
 
 	/* Treat this like an NMI as it can happen anywhere */
 	if (no_rcu)
-		rcu_nmi_enter();
+		ct_nmi_enter();
 
 	if (is_module_text_address(addr))
 		goto out;
@@ -127,7 +127,7 @@ int kernel_text_address(unsigned long addr)
 	ret = 0;
 out:
 	if (no_rcu)
-		rcu_nmi_exit();
+		ct_nmi_exit();
 
 	return ret;
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fe78a6818126..5fc7f17f5ec7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3105,7 +3105,7 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
 	}
 
 	/*
-	 * When an NMI triggers, RCU is enabled via rcu_nmi_enter(),
+	 * When an NMI triggers, RCU is enabled via ct_nmi_enter(),
 	 * but if the above rcu_is_watching() failed, then the NMI
 	 * triggered someplace critical, and ct_irq_enter() should
 	 * not be called from NMI.
-- 
cgit 


From 3864caafe7c66f01b188ffccb6a4215f3bf56292 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 8 Jun 2022 16:40:28 +0200
Subject: rcu/context-tracking: Remove rcu_irq_enter/exit()

Now rcu_irq_enter/exit() is an unnecessary middle call between
ct_irq_enter/exit() and nmi_irq_enter/exit(). Take this opportunity
to remove the former functions and move the comments above them to the
new entrypoints.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Nicolas Saenz Julienne <nsaenz@kernel.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Cc: Yu Liao <liaoyu15@huawei.com>
Cc: Phil Auld <pauld@redhat.com>
Cc: Paul Gortmaker<paul.gortmaker@windriver.com>
Cc: Alex Belits <abelits@marvell.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Tested-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
---
 include/linux/rcutiny.h   |  4 ---
 include/linux/rcutree.h   |  4 ---
 kernel/context_tracking.c | 71 +++++++++++++++++++++++++++++++++++++---
 kernel/rcu/tree.c         | 83 -----------------------------------------------
 4 files changed, 67 insertions(+), 95 deletions(-)

(limited to 'include')

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 5fed476f977f..591119413cf1 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -78,10 +78,6 @@ static inline void rcu_cpu_stall_reset(void) { }
 static inline int rcu_jiffies_till_stall_check(void) { return 21 * HZ; }
 static inline void rcu_idle_enter(void) { }
 static inline void rcu_idle_exit(void) { }
-static inline void rcu_irq_enter(void) { }
-static inline void rcu_irq_exit_irqson(void) { }
-static inline void rcu_irq_enter_irqson(void) { }
-static inline void rcu_irq_exit(void) { }
 static inline void rcu_irq_exit_check_preempt(void) { }
 #define rcu_is_idle_cpu(cpu) \
 	(is_idle_task(current) && !in_nmi() && !in_hardirq() && !in_serving_softirq())
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 9c6cfb742504..4522b6a7cc42 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -47,10 +47,6 @@ void cond_synchronize_rcu(unsigned long oldstate);
 
 void rcu_idle_enter(void);
 void rcu_idle_exit(void);
-void rcu_irq_enter(void);
-void rcu_irq_exit(void);
-void rcu_irq_enter_irqson(void);
-void rcu_irq_exit_irqson(void);
 bool rcu_is_idle_cpu(int cpu);
 
 #ifdef CONFIG_PROVE_RCU
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index b8a731f20778..c0d86dac98f1 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -36,24 +36,87 @@ void ct_idle_exit(void)
 }
 EXPORT_SYMBOL_GPL(ct_idle_exit);
 
+/**
+ * ct_irq_enter - inform RCU that current CPU is entering irq away from idle
+ *
+ * Enter an interrupt handler, which might possibly result in exiting
+ * idle mode, in other words, entering the mode in which read-side critical
+ * sections can occur.  The caller must have disabled interrupts.
+ *
+ * Note that the Linux kernel is fully capable of entering an interrupt
+ * handler that it never exits, for example when doing upcalls to user mode!
+ * This code assumes that the idle loop never does upcalls to user mode.
+ * If your architecture's idle loop does do upcalls to user mode (or does
+ * anything else that results in unbalanced calls to the irq_enter() and
+ * irq_exit() functions), RCU will give you what you deserve, good and hard.
+ * But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ *
+ * If you add or remove a call to ct_irq_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
 noinstr void ct_irq_enter(void)
 {
-	rcu_irq_enter();
+	lockdep_assert_irqs_disabled();
+	ct_nmi_enter();
 }
 
+/**
+ * ct_irq_exit - inform RCU that current CPU is exiting irq towards idle
+ *
+ * Exit from an interrupt handler, which might possibly result in entering
+ * idle mode, in other words, leaving the mode in which read-side critical
+ * sections can occur.  The caller must have disabled interrupts.
+ *
+ * This code assumes that the idle loop never does anything that might
+ * result in unbalanced calls to irq_enter() and irq_exit().  If your
+ * architecture's idle loop violates this assumption, RCU will give you what
+ * you deserve, good and hard.  But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ *
+ * If you add or remove a call to ct_irq_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
 noinstr void ct_irq_exit(void)
 {
-	rcu_irq_exit();
+	lockdep_assert_irqs_disabled();
+	ct_nmi_exit();
 }
 
+/*
+ * Wrapper for ct_irq_enter() where interrupts are enabled.
+ *
+ * If you add or remove a call to ct_irq_enter_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
 void ct_irq_enter_irqson(void)
 {
-	rcu_irq_enter_irqson();
+	unsigned long flags;
+
+	local_irq_save(flags);
+	ct_irq_enter();
+	local_irq_restore(flags);
 }
 
+/*
+ * Wrapper for ct_irq_exit() where interrupts are enabled.
+ *
+ * If you add or remove a call to ct_irq_exit_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
 void ct_irq_exit_irqson(void)
 {
-	rcu_irq_exit_irqson();
+	unsigned long flags;
+
+	local_irq_save(flags);
+	ct_irq_exit();
+	local_irq_restore(flags);
 }
 
 noinstr void ct_nmi_enter(void)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 051fed0844b6..75b433dba427 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -789,31 +789,6 @@ noinstr void rcu_nmi_exit(void)
 		rcu_dynticks_task_enter();
 }
 
-/**
- * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
- *
- * Exit from an interrupt handler, which might possibly result in entering
- * idle mode, in other words, leaving the mode in which read-side critical
- * sections can occur.  The caller must have disabled interrupts.
- *
- * This code assumes that the idle loop never does anything that might
- * result in unbalanced calls to irq_enter() and irq_exit().  If your
- * architecture's idle loop violates this assumption, RCU will give you what
- * you deserve, good and hard.  But very infrequently and irreproducibly.
- *
- * Use things like work queues to work around this limitation.
- *
- * You have been warned.
- *
- * If you add or remove a call to rcu_irq_exit(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-void noinstr rcu_irq_exit(void)
-{
-	lockdep_assert_irqs_disabled();
-	rcu_nmi_exit();
-}
-
 #ifdef CONFIG_PROVE_RCU
 /**
  * rcu_irq_exit_check_preempt - Validate that scheduling is possible
@@ -832,21 +807,6 @@ void rcu_irq_exit_check_preempt(void)
 }
 #endif /* #ifdef CONFIG_PROVE_RCU */
 
-/*
- * Wrapper for rcu_irq_exit() where interrupts are enabled.
- *
- * If you add or remove a call to rcu_irq_exit_irqson(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
- */
-void rcu_irq_exit_irqson(void)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	rcu_irq_exit();
-	local_irq_restore(flags);
-}
-
 /*
  * Exit an RCU extended quiescent state, which can be either the
  * idle loop or adaptive-tickless usermode execution.
@@ -1041,49 +1001,6 @@ noinstr void rcu_nmi_enter(void)
 	barrier();
 }
 
-/**
- * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
- *
- * Enter an interrupt handler, which might possibly result in exiting
- * idle mode, in other words, entering the mode in which read-side critical
- * sections can occur.  The caller must have disabled interrupts.
- *
- * Note that the Linux kernel is fully capable of entering an interrupt
- * handler that it never exits, for example when doing upcalls to user mode!
- * This code assumes that the idle loop never does upcalls to user mode.
- * If your architecture's idle loop does do upcalls to user mode (or does
- * anything else that results in unbalanced calls to the irq_enter() and
- * irq_exit() functions), RCU will give you what you deserve, good and hard.
- * But very infrequently and irreproducibly.
- *
- * Use things like work queues to work around this limitation.
- *
- * You have been warned.
- *
- * If you add or remove a call to rcu_irq_enter(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-noinstr void rcu_irq_enter(void)
-{
-	lockdep_assert_irqs_disabled();
-	rcu_nmi_enter();
-}
-
-/*
- * Wrapper for rcu_irq_enter() where interrupts are enabled.
- *
- * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
- */
-void rcu_irq_enter_irqson(void)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	rcu_irq_enter();
-	local_irq_restore(flags);
-}
-
 /*
  * Check to see if any future non-offloaded RCU-related work will need
  * to be done by the current CPU, even if none need be done immediately,
-- 
cgit 


From 62e2412df4b90ae6706ce1f1a9649b789b2e44ef Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 8 Jun 2022 16:40:29 +0200
Subject: rcu/context_tracking: Move dynticks counter to context tracking

In order to prepare for merging RCU dynticks counter into the context
tracking state, move the rcu_data's dynticks field to the context
tracking structure. It will later be mixed within the context tracking
state itself.

[ paulmck: Move enum ctx_state into global scope. ]

Acked-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Nicolas Saenz Julienne <nsaenz@kernel.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Cc: Yu Liao <liaoyu15@huawei.com>
Cc: Phil Auld <pauld@redhat.com>
Cc: Paul Gortmaker<paul.gortmaker@windriver.com>
Cc: Alex Belits <abelits@marvell.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Tested-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
---
 include/linux/context_tracking_state.h | 45 ++++++++++++++++++++++-----
 kernel/context_tracking.c              | 10 ++++--
 kernel/rcu/tree.c                      | 56 ++++++++++++++++------------------
 kernel/rcu/tree.h                      |  1 -
 kernel/rcu/tree_exp.h                  |  2 +-
 kernel/rcu/tree_stall.h                |  4 +--
 6 files changed, 75 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 9c16a8b2c194..5a8da2787287 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -6,7 +6,15 @@
 #include <linux/static_key.h>
 #include <linux/context_tracking_irq.h>
 
+enum ctx_state {
+	CONTEXT_DISABLED = -1,	/* returned by ct_state() if unknown */
+	CONTEXT_KERNEL = 0,
+	CONTEXT_USER,
+	CONTEXT_GUEST,
+};
+
 struct context_tracking {
+#ifdef CONFIG_CONTEXT_TRACKING_USER
 	/*
 	 * When active is false, probes are unset in order
 	 * to minimize overhead: TIF flags are cleared
@@ -15,17 +23,40 @@ struct context_tracking {
 	 */
 	bool active;
 	int recursion;
-	enum ctx_state {
-		CONTEXT_DISABLED = -1,	/* returned by ct_state() if unknown */
-		CONTEXT_KERNEL = 0,
-		CONTEXT_USER,
-		CONTEXT_GUEST,
-	} state;
+	enum ctx_state state;
+#endif
+#ifdef CONFIG_CONTEXT_TRACKING_IDLE
+	atomic_t dynticks;		/* Even value for idle, else odd. */
+#endif
 };
 
+#ifdef CONFIG_CONTEXT_TRACKING
+DECLARE_PER_CPU(struct context_tracking, context_tracking);
+#endif
+
+#ifdef CONFIG_CONTEXT_TRACKING_IDLE
+static __always_inline int ct_dynticks(void)
+{
+	return atomic_read(this_cpu_ptr(&context_tracking.dynticks));
+}
+
+static __always_inline int ct_dynticks_cpu(int cpu)
+{
+	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
+
+	return atomic_read(&ct->dynticks);
+}
+
+static __always_inline int ct_dynticks_cpu_acquire(int cpu)
+{
+	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
+
+	return atomic_read_acquire(&ct->dynticks);
+}
+#endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */
+
 #ifdef CONFIG_CONTEXT_TRACKING_USER
 extern struct static_key_false context_tracking_key;
-DECLARE_PER_CPU(struct context_tracking, context_tracking);
 
 static __always_inline bool context_tracking_enabled(void)
 {
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index c0d86dac98f1..01abbcec52f7 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -23,6 +23,13 @@
 #include <linux/kprobes.h>
 
 
+DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
+#ifdef CONFIG_CONTEXT_TRACKING_IDLE
+	.dynticks = ATOMIC_INIT(1),
+#endif
+};
+EXPORT_SYMBOL_GPL(context_tracking);
+
 #ifdef CONFIG_CONTEXT_TRACKING_IDLE
 noinstr void ct_idle_enter(void)
 {
@@ -138,9 +145,6 @@ noinstr void ct_nmi_exit(void)
 DEFINE_STATIC_KEY_FALSE(context_tracking_key);
 EXPORT_SYMBOL_GPL(context_tracking_key);
 
-DEFINE_PER_CPU(struct context_tracking, context_tracking);
-EXPORT_SYMBOL_GPL(context_tracking);
-
 static noinstr bool context_tracking_recursion_enter(void)
 {
 	int recursion;
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 75b433dba427..a471edc3d893 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -77,7 +77,6 @@
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
 	.dynticks_nesting = 1,
 	.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
-	.dynticks = ATOMIC_INIT(1),
 #ifdef CONFIG_RCU_NOCB_CPU
 	.cblist.flags = SEGCBLIST_RCU_CORE,
 #endif
@@ -268,7 +267,7 @@ void rcu_softirq_qs(void)
  */
 static noinline noinstr unsigned long rcu_dynticks_inc(int incby)
 {
-	return arch_atomic_add_return(incby, this_cpu_ptr(&rcu_data.dynticks));
+	return arch_atomic_add_return(incby, this_cpu_ptr(&context_tracking.dynticks));
 }
 
 /*
@@ -324,9 +323,7 @@ static noinstr void rcu_dynticks_eqs_exit(void)
  */
 static void rcu_dynticks_eqs_online(void)
 {
-	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
-	if (atomic_read(&rdp->dynticks) & 0x1)
+	if (ct_dynticks() & 0x1)
 		return;
 	rcu_dynticks_inc(1);
 }
@@ -338,17 +335,17 @@ static void rcu_dynticks_eqs_online(void)
  */
 static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
 {
-	return !(arch_atomic_read(this_cpu_ptr(&rcu_data.dynticks)) & 0x1);
+	return !(arch_atomic_read(this_cpu_ptr(&context_tracking.dynticks)) & 0x1);
 }
 
 /*
  * Snapshot the ->dynticks counter with full ordering so as to allow
  * stable comparison of this counter with past and future snapshots.
  */
-static int rcu_dynticks_snap(struct rcu_data *rdp)
+static int rcu_dynticks_snap(int cpu)
 {
 	smp_mb();  // Fundamental RCU ordering guarantee.
-	return atomic_read_acquire(&rdp->dynticks);
+	return ct_dynticks_cpu_acquire(cpu);
 }
 
 /*
@@ -363,9 +360,7 @@ static bool rcu_dynticks_in_eqs(int snap)
 /* Return true if the specified CPU is currently idle from an RCU viewpoint.  */
 bool rcu_is_idle_cpu(int cpu)
 {
-	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
-
-	return rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp));
+	return rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu));
 }
 
 /*
@@ -375,7 +370,7 @@ bool rcu_is_idle_cpu(int cpu)
  */
 static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap)
 {
-	return snap != rcu_dynticks_snap(rdp);
+	return snap != rcu_dynticks_snap(rdp->cpu);
 }
 
 /*
@@ -384,11 +379,10 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap)
  */
 bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
 {
-	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
 	int snap;
 
 	// If not quiescent, force back to earlier extended quiescent state.
-	snap = atomic_read(&rdp->dynticks) & ~0x1;
+	snap = ct_dynticks_cpu(cpu) & ~0x1;
 
 	smp_rmb(); // Order ->dynticks and *vp reads.
 	if (READ_ONCE(*vp))
@@ -396,7 +390,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
 	smp_rmb(); // Order *vp read and ->dynticks re-read.
 
 	// If still in the same extended quiescent state, we are good!
-	return snap == atomic_read(&rdp->dynticks);
+	return snap == ct_dynticks_cpu(cpu);
 }
 
 /*
@@ -620,6 +614,7 @@ EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
 static noinstr void rcu_eqs_enter(bool user)
 {
 	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
 
 	WARN_ON_ONCE(rdp->dynticks_nmi_nesting != DYNTICK_IRQ_NONIDLE);
 	WRITE_ONCE(rdp->dynticks_nmi_nesting, 0);
@@ -633,12 +628,12 @@ static noinstr void rcu_eqs_enter(bool user)
 
 	instrumentation_begin();
 	lockdep_assert_irqs_disabled();
-	trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
+	trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, ct_dynticks());
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
 	rcu_preempt_deferred_qs(current);
 
 	// instrumentation for the noinstr rcu_dynticks_eqs_enter()
-	instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
+	instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks));
 
 	instrumentation_end();
 	WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */
@@ -740,7 +735,7 @@ noinstr void rcu_user_enter(void)
  * rcu_nmi_exit - inform RCU of exit from NMI context
  *
  * If we are returning from the outermost NMI handler that interrupted an
- * RCU-idle period, update rdp->dynticks and rdp->dynticks_nmi_nesting
+ * RCU-idle period, update ct->dynticks and rdp->dynticks_nmi_nesting
  * to let the RCU grace-period handling know that the CPU is back to
  * being RCU-idle.
  *
@@ -749,6 +744,7 @@ noinstr void rcu_user_enter(void)
  */
 noinstr void rcu_nmi_exit(void)
 {
+	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
 	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 
 	instrumentation_begin();
@@ -766,7 +762,7 @@ noinstr void rcu_nmi_exit(void)
 	 */
 	if (rdp->dynticks_nmi_nesting != 1) {
 		trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2,
-				  atomic_read(&rdp->dynticks));
+				  ct_dynticks());
 		WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
 			   rdp->dynticks_nmi_nesting - 2);
 		instrumentation_end();
@@ -774,11 +770,11 @@ noinstr void rcu_nmi_exit(void)
 	}
 
 	/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
-	trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
+	trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, ct_dynticks());
 	WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
 
 	// instrumentation for the noinstr rcu_dynticks_eqs_enter()
-	instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
+	instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks));
 	instrumentation_end();
 
 	// RCU is watching here ...
@@ -817,6 +813,7 @@ void rcu_irq_exit_check_preempt(void)
  */
 static void noinstr rcu_eqs_exit(bool user)
 {
+	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
 	struct rcu_data *rdp;
 	long oldval;
 
@@ -836,9 +833,9 @@ static void noinstr rcu_eqs_exit(bool user)
 	instrumentation_begin();
 
 	// instrumentation for the noinstr rcu_dynticks_eqs_exit()
-	instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
+	instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks));
 
-	trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
+	trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, ct_dynticks());
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
 	WRITE_ONCE(rdp->dynticks_nesting, 1);
 	WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
@@ -944,7 +941,7 @@ void __rcu_irq_enter_check_tick(void)
 /**
  * rcu_nmi_enter - inform RCU of entry to NMI context
  *
- * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and
+ * If the CPU was idle from RCU's viewpoint, update ct->dynticks and
  * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know
  * that the CPU is active.  This implementation permits nested NMIs, as
  * long as the nesting level does not overflow an int.  (You will probably
@@ -957,6 +954,7 @@ noinstr void rcu_nmi_enter(void)
 {
 	long incby = 2;
 	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
 
 	/* Complain about underflow. */
 	WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0);
@@ -980,9 +978,9 @@ noinstr void rcu_nmi_enter(void)
 
 		instrumentation_begin();
 		// instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
-		instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks));
+		instrument_atomic_read(&ct->dynticks, sizeof(ct->dynticks));
 		// instrumentation for the noinstr rcu_dynticks_eqs_exit()
-		instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
+		instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks));
 
 		incby = 1;
 	} else if (!in_nmi()) {
@@ -994,7 +992,7 @@ noinstr void rcu_nmi_enter(void)
 
 	trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
 			  rdp->dynticks_nmi_nesting,
-			  rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks));
+			  rdp->dynticks_nmi_nesting + incby, ct_dynticks());
 	instrumentation_end();
 	WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */
 		   rdp->dynticks_nmi_nesting + incby);
@@ -1138,7 +1136,7 @@ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
  */
 static int dyntick_save_progress_counter(struct rcu_data *rdp)
 {
-	rdp->dynticks_snap = rcu_dynticks_snap(rdp);
+	rdp->dynticks_snap = rcu_dynticks_snap(rdp->cpu);
 	if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
 		trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
 		rcu_gpnum_ovf(rdp->mynode, rdp);
@@ -4142,7 +4140,7 @@ rcu_boot_init_percpu_data(int cpu)
 	rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
 	INIT_WORK(&rdp->strict_work, strict_work_handler);
 	WARN_ON_ONCE(rdp->dynticks_nesting != 1);
-	WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)));
+	WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)));
 	rdp->barrier_seq_snap = rcu_state.barrier_sequence;
 	rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
 	rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 2ccf5845957d..ebb973f5b190 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -189,7 +189,6 @@ struct rcu_data {
 	int dynticks_snap;		/* Per-GP tracking for dynticks. */
 	long dynticks_nesting;		/* Track process nesting level. */
 	long dynticks_nmi_nesting;	/* Track irq/NMI nesting level. */
-	atomic_t dynticks;		/* Even value for idle, else odd. */
 	bool rcu_need_heavy_qs;		/* GP old, so heavy quiescent state! */
 	bool rcu_urgent_qs;		/* GP old need light quiescent state. */
 	bool rcu_forced_tick;		/* Forced tick to provide QS. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 0f70f62039a9..75c22d1034c1 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -356,7 +356,7 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
 		    !(rnp->qsmaskinitnext & mask)) {
 			mask_ofl_test |= mask;
 		} else {
-			snap = rcu_dynticks_snap(rdp);
+			snap = rcu_dynticks_snap(cpu);
 			if (rcu_dynticks_in_eqs(snap))
 				mask_ofl_test |= mask;
 			else
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 3556637768fd..250fbf2e8522 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -465,7 +465,7 @@ static void print_cpu_stall_info(int cpu)
 	}
 	delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
 	falsepositive = rcu_is_gp_kthread_starving(NULL) &&
-			rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp));
+			rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu));
 	rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j);
 	if (rcuc_starved)
 		sprintf(buf, " rcuc=%ld jiffies(starved)", j);
@@ -478,7 +478,7 @@ static void print_cpu_stall_info(int cpu)
 			rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
 				"!."[!delta],
 	       ticks_value, ticks_title,
-	       rcu_dynticks_snap(rdp) & 0xfff,
+	       rcu_dynticks_snap(cpu) & 0xfff,
 	       rdp->dynticks_nesting, rdp->dynticks_nmi_nesting,
 	       rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
 	       data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
-- 
cgit 


From 904e600e60f46f92eb4bcfb95788b1fedf7e8237 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 8 Jun 2022 16:40:30 +0200
Subject: rcu/context_tracking: Move dynticks_nesting to context tracking

The RCU eqs tracking is going to be performed by the context tracking
subsystem. The related nesting counters thus need to be moved to the
context tracking structure.

Acked-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Nicolas Saenz Julienne <nsaenz@kernel.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Cc: Yu Liao <liaoyu15@huawei.com>
Cc: Phil Auld <pauld@redhat.com>
Cc: Paul Gortmaker<paul.gortmaker@windriver.com>
Cc: Alex Belits <abelits@marvell.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Tested-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
---
 include/linux/context_tracking_state.h | 13 +++++++++++++
 kernel/context_tracking.c              |  1 +
 kernel/rcu/tree.c                      | 31 ++++++++++++++++---------------
 kernel/rcu/tree.h                      |  1 -
 kernel/rcu/tree_stall.h                |  2 +-
 5 files changed, 31 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 5a8da2787287..13a4a9d1ec7e 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -27,6 +27,7 @@ struct context_tracking {
 #endif
 #ifdef CONFIG_CONTEXT_TRACKING_IDLE
 	atomic_t dynticks;		/* Even value for idle, else odd. */
+	long dynticks_nesting;		/* Track process nesting level. */
 #endif
 };
 
@@ -53,6 +54,18 @@ static __always_inline int ct_dynticks_cpu_acquire(int cpu)
 
 	return atomic_read_acquire(&ct->dynticks);
 }
+
+static __always_inline long ct_dynticks_nesting(void)
+{
+	return __this_cpu_read(context_tracking.dynticks_nesting);
+}
+
+static __always_inline long ct_dynticks_nesting_cpu(int cpu)
+{
+	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
+
+	return ct->dynticks_nesting;
+}
 #endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */
 
 #ifdef CONFIG_CONTEXT_TRACKING_USER
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 01abbcec52f7..dfefe04400f8 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -25,6 +25,7 @@
 
 DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
 #ifdef CONFIG_CONTEXT_TRACKING_IDLE
+	.dynticks_nesting = 1,
 	.dynticks = ATOMIC_INIT(1),
 #endif
 };
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a471edc3d893..f6bf328bb9cf 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -75,7 +75,6 @@
 /* Data structures. */
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
-	.dynticks_nesting = 1,
 	.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
 #ifdef CONFIG_RCU_NOCB_CPU
 	.cblist.flags = SEGCBLIST_RCU_CORE,
@@ -436,7 +435,7 @@ static int rcu_is_cpu_rrupt_from_idle(void)
 	lockdep_assert_irqs_disabled();
 
 	/* Check for counter underflows */
-	RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0,
+	RCU_LOCKDEP_WARN(ct_dynticks_nesting() < 0,
 			 "RCU dynticks_nesting counter underflow!");
 	RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0,
 			 "RCU dynticks_nmi_nesting counter underflow/zero!");
@@ -452,7 +451,7 @@ static int rcu_is_cpu_rrupt_from_idle(void)
 	WARN_ON_ONCE(!nesting && !is_idle_task(current));
 
 	/* Does CPU appear to be idle from an RCU standpoint? */
-	return __this_cpu_read(rcu_data.dynticks_nesting) == 0;
+	return ct_dynticks_nesting() == 0;
 }
 
 #define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10)
@@ -619,16 +618,16 @@ static noinstr void rcu_eqs_enter(bool user)
 	WARN_ON_ONCE(rdp->dynticks_nmi_nesting != DYNTICK_IRQ_NONIDLE);
 	WRITE_ONCE(rdp->dynticks_nmi_nesting, 0);
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
-		     rdp->dynticks_nesting == 0);
-	if (rdp->dynticks_nesting != 1) {
+		     ct_dynticks_nesting() == 0);
+	if (ct_dynticks_nesting() != 1) {
 		// RCU will still be watching, so just do accounting and leave.
-		rdp->dynticks_nesting--;
+		ct->dynticks_nesting--;
 		return;
 	}
 
 	instrumentation_begin();
 	lockdep_assert_irqs_disabled();
-	trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, ct_dynticks());
+	trace_rcu_dyntick(TPS("Start"), ct_dynticks_nesting(), 0, ct_dynticks());
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
 	rcu_preempt_deferred_qs(current);
 
@@ -636,7 +635,7 @@ static noinstr void rcu_eqs_enter(bool user)
 	instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks));
 
 	instrumentation_end();
-	WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */
+	WRITE_ONCE(ct->dynticks_nesting, 0); /* Avoid irq-access tearing. */
 	// RCU is watching here ...
 	rcu_dynticks_eqs_enter();
 	// ... but is no longer watching here.
@@ -793,7 +792,7 @@ void rcu_irq_exit_check_preempt(void)
 {
 	lockdep_assert_irqs_disabled();
 
-	RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0,
+	RCU_LOCKDEP_WARN(ct_dynticks_nesting() <= 0,
 			 "RCU dynticks_nesting counter underflow/zero!");
 	RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) !=
 			 DYNTICK_IRQ_NONIDLE,
@@ -819,11 +818,11 @@ static void noinstr rcu_eqs_exit(bool user)
 
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
 	rdp = this_cpu_ptr(&rcu_data);
-	oldval = rdp->dynticks_nesting;
+	oldval = ct_dynticks_nesting();
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
 	if (oldval) {
 		// RCU was already watching, so just do accounting and leave.
-		rdp->dynticks_nesting++;
+		ct->dynticks_nesting++;
 		return;
 	}
 	rcu_dynticks_task_exit();
@@ -835,9 +834,9 @@ static void noinstr rcu_eqs_exit(bool user)
 	// instrumentation for the noinstr rcu_dynticks_eqs_exit()
 	instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks));
 
-	trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, ct_dynticks());
+	trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_dynticks());
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
-	WRITE_ONCE(rdp->dynticks_nesting, 1);
+	WRITE_ONCE(ct->dynticks_nesting, 1);
 	WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
 	WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
 	instrumentation_end();
@@ -4134,12 +4133,13 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
 static void __init
 rcu_boot_init_percpu_data(int cpu)
 {
+	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
 	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
 
 	/* Set up local state, ensuring consistent view of global state. */
 	rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
 	INIT_WORK(&rdp->strict_work, strict_work_handler);
-	WARN_ON_ONCE(rdp->dynticks_nesting != 1);
+	WARN_ON_ONCE(ct->dynticks_nesting != 1);
 	WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)));
 	rdp->barrier_seq_snap = rcu_state.barrier_sequence;
 	rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
@@ -4164,6 +4164,7 @@ rcu_boot_init_percpu_data(int cpu)
 int rcutree_prepare_cpu(unsigned int cpu)
 {
 	unsigned long flags;
+	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
 	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
 	struct rcu_node *rnp = rcu_get_root();
 
@@ -4172,7 +4173,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
 	rdp->qlen_last_fqs_check = 0;
 	rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
 	rdp->blimit = blimit;
-	rdp->dynticks_nesting = 1;	/* CPU not up, no tearing. */
+	ct->dynticks_nesting = 1;	/* CPU not up, no tearing. */
 	raw_spin_unlock_rcu_node(rnp);		/* irqs remain disabled. */
 
 	/*
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index ebb973f5b190..650ff3cf0121 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -187,7 +187,6 @@ struct rcu_data {
 
 	/* 3) dynticks interface. */
 	int dynticks_snap;		/* Per-GP tracking for dynticks. */
-	long dynticks_nesting;		/* Track process nesting level. */
 	long dynticks_nmi_nesting;	/* Track irq/NMI nesting level. */
 	bool rcu_need_heavy_qs;		/* GP old, so heavy quiescent state! */
 	bool rcu_urgent_qs;		/* GP old need light quiescent state. */
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 250fbf2e8522..a9c82254b6c6 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -479,7 +479,7 @@ static void print_cpu_stall_info(int cpu)
 				"!."[!delta],
 	       ticks_value, ticks_title,
 	       rcu_dynticks_snap(cpu) & 0xfff,
-	       rdp->dynticks_nesting, rdp->dynticks_nmi_nesting,
+	       ct_dynticks_nesting_cpu(cpu), rdp->dynticks_nmi_nesting,
 	       rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
 	       data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
 	       rcuc_starved ? buf : "",
-- 
cgit 


From 95e04f48ec0a634e2f221081f5fa1a904755f326 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 8 Jun 2022 16:40:31 +0200
Subject: rcu/context_tracking: Move dynticks_nmi_nesting to context tracking

The RCU eqs tracking is going to be performed by the context tracking
subsystem. The related nesting counters thus need to be moved to the
context tracking structure.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Nicolas Saenz Julienne <nsaenz@kernel.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Cc: Yu Liao <liaoyu15@huawei.com>
Cc: Phil Auld <pauld@redhat.com>
Cc: Paul Gortmaker<paul.gortmaker@windriver.com>
Cc: Alex Belits <abelits@marvell.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Tested-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
---
 include/linux/context_tracking_state.h | 16 ++++++++++++
 kernel/context_tracking.c              |  1 +
 kernel/rcu/rcu.h                       |  4 ---
 kernel/rcu/tree.c                      | 48 +++++++++++++++-------------------
 kernel/rcu/tree.h                      |  1 -
 kernel/rcu/tree_stall.h                |  2 +-
 6 files changed, 39 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 13a4a9d1ec7e..5f11e3d2d85a 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -13,6 +13,9 @@ enum ctx_state {
 	CONTEXT_GUEST,
 };
 
+/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */
+#define DYNTICK_IRQ_NONIDLE	((LONG_MAX / 2) + 1)
+
 struct context_tracking {
 #ifdef CONFIG_CONTEXT_TRACKING_USER
 	/*
@@ -28,6 +31,7 @@ struct context_tracking {
 #ifdef CONFIG_CONTEXT_TRACKING_IDLE
 	atomic_t dynticks;		/* Even value for idle, else odd. */
 	long dynticks_nesting;		/* Track process nesting level. */
+	long dynticks_nmi_nesting;	/* Track irq/NMI nesting level. */
 #endif
 };
 
@@ -66,6 +70,18 @@ static __always_inline long ct_dynticks_nesting_cpu(int cpu)
 
 	return ct->dynticks_nesting;
 }
+
+static __always_inline long ct_dynticks_nmi_nesting(void)
+{
+	return __this_cpu_read(context_tracking.dynticks_nmi_nesting);
+}
+
+static __always_inline long ct_dynticks_nmi_nesting_cpu(int cpu)
+{
+	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
+
+	return ct->dynticks_nmi_nesting;
+}
 #endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */
 
 #ifdef CONFIG_CONTEXT_TRACKING_USER
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index dfefe04400f8..7c3033e9a518 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -26,6 +26,7 @@
 DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
 #ifdef CONFIG_CONTEXT_TRACKING_IDLE
 	.dynticks_nesting = 1,
+	.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
 	.dynticks = ATOMIC_INIT(1),
 #endif
 };
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 4916077119f3..7b4a88deff9a 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -12,10 +12,6 @@
 
 #include <trace/events/rcu.h>
 
-/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */
-#define DYNTICK_IRQ_NONIDLE	((LONG_MAX / 2) + 1)
-
-
 /*
  * Grace-period counter management.
  */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f6bf328bb9cf..006939b29e82 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -75,7 +75,6 @@
 /* Data structures. */
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
-	.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
 #ifdef CONFIG_RCU_NOCB_CPU
 	.cblist.flags = SEGCBLIST_RCU_CORE,
 #endif
@@ -437,11 +436,11 @@ static int rcu_is_cpu_rrupt_from_idle(void)
 	/* Check for counter underflows */
 	RCU_LOCKDEP_WARN(ct_dynticks_nesting() < 0,
 			 "RCU dynticks_nesting counter underflow!");
-	RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0,
+	RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() <= 0,
 			 "RCU dynticks_nmi_nesting counter underflow/zero!");
 
 	/* Are we at first interrupt nesting level? */
-	nesting = __this_cpu_read(rcu_data.dynticks_nmi_nesting);
+	nesting = ct_dynticks_nmi_nesting();
 	if (nesting > 1)
 		return false;
 
@@ -612,11 +611,10 @@ EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
  */
 static noinstr void rcu_eqs_enter(bool user)
 {
-	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
 
-	WARN_ON_ONCE(rdp->dynticks_nmi_nesting != DYNTICK_IRQ_NONIDLE);
-	WRITE_ONCE(rdp->dynticks_nmi_nesting, 0);
+	WARN_ON_ONCE(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE);
+	WRITE_ONCE(ct->dynticks_nmi_nesting, 0);
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
 		     ct_dynticks_nesting() == 0);
 	if (ct_dynticks_nesting() != 1) {
@@ -734,7 +732,7 @@ noinstr void rcu_user_enter(void)
  * rcu_nmi_exit - inform RCU of exit from NMI context
  *
  * If we are returning from the outermost NMI handler that interrupted an
- * RCU-idle period, update ct->dynticks and rdp->dynticks_nmi_nesting
+ * RCU-idle period, update ct->dynticks and ct->dynticks_nmi_nesting
  * to let the RCU grace-period handling know that the CPU is back to
  * being RCU-idle.
  *
@@ -744,7 +742,6 @@ noinstr void rcu_user_enter(void)
 noinstr void rcu_nmi_exit(void)
 {
 	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
-	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 
 	instrumentation_begin();
 	/*
@@ -752,25 +749,25 @@ noinstr void rcu_nmi_exit(void)
 	 * (We are exiting an NMI handler, so RCU better be paying attention
 	 * to us!)
 	 */
-	WARN_ON_ONCE(rdp->dynticks_nmi_nesting <= 0);
+	WARN_ON_ONCE(ct_dynticks_nmi_nesting() <= 0);
 	WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
 
 	/*
 	 * If the nesting level is not 1, the CPU wasn't RCU-idle, so
 	 * leave it in non-RCU-idle state.
 	 */
-	if (rdp->dynticks_nmi_nesting != 1) {
-		trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2,
+	if (ct_dynticks_nmi_nesting() != 1) {
+		trace_rcu_dyntick(TPS("--="), ct_dynticks_nmi_nesting(), ct_dynticks_nmi_nesting() - 2,
 				  ct_dynticks());
-		WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
-			   rdp->dynticks_nmi_nesting - 2);
+		WRITE_ONCE(ct->dynticks_nmi_nesting, /* No store tearing. */
+			   ct_dynticks_nmi_nesting() - 2);
 		instrumentation_end();
 		return;
 	}
 
 	/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
-	trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, ct_dynticks());
-	WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
+	trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_dynticks());
+	WRITE_ONCE(ct->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
 
 	// instrumentation for the noinstr rcu_dynticks_eqs_enter()
 	instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks));
@@ -794,7 +791,7 @@ void rcu_irq_exit_check_preempt(void)
 
 	RCU_LOCKDEP_WARN(ct_dynticks_nesting() <= 0,
 			 "RCU dynticks_nesting counter underflow/zero!");
-	RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) !=
+	RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() !=
 			 DYNTICK_IRQ_NONIDLE,
 			 "Bad RCU  dynticks_nmi_nesting counter\n");
 	RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
@@ -813,11 +810,9 @@ void rcu_irq_exit_check_preempt(void)
 static void noinstr rcu_eqs_exit(bool user)
 {
 	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
-	struct rcu_data *rdp;
 	long oldval;
 
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
-	rdp = this_cpu_ptr(&rcu_data);
 	oldval = ct_dynticks_nesting();
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
 	if (oldval) {
@@ -837,8 +832,8 @@ static void noinstr rcu_eqs_exit(bool user)
 	trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_dynticks());
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
 	WRITE_ONCE(ct->dynticks_nesting, 1);
-	WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
-	WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
+	WARN_ON_ONCE(ct_dynticks_nmi_nesting());
+	WRITE_ONCE(ct->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
 	instrumentation_end();
 }
 
@@ -941,7 +936,7 @@ void __rcu_irq_enter_check_tick(void)
  * rcu_nmi_enter - inform RCU of entry to NMI context
  *
  * If the CPU was idle from RCU's viewpoint, update ct->dynticks and
- * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know
+ * ct->dynticks_nmi_nesting to let the RCU grace-period handling know
  * that the CPU is active.  This implementation permits nested NMIs, as
  * long as the nesting level does not overflow an int.  (You will probably
  * run out of stack space first.)
@@ -952,11 +947,10 @@ void __rcu_irq_enter_check_tick(void)
 noinstr void rcu_nmi_enter(void)
 {
 	long incby = 2;
-	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
 
 	/* Complain about underflow. */
-	WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0);
+	WARN_ON_ONCE(ct_dynticks_nmi_nesting() < 0);
 
 	/*
 	 * If idle from RCU viewpoint, atomically increment ->dynticks
@@ -990,11 +984,11 @@ noinstr void rcu_nmi_enter(void)
 	}
 
 	trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
-			  rdp->dynticks_nmi_nesting,
-			  rdp->dynticks_nmi_nesting + incby, ct_dynticks());
+			  ct_dynticks_nmi_nesting(),
+			  ct_dynticks_nmi_nesting() + incby, ct_dynticks());
 	instrumentation_end();
-	WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */
-		   rdp->dynticks_nmi_nesting + incby);
+	WRITE_ONCE(ct->dynticks_nmi_nesting, /* Prevent store tearing. */
+		   ct_dynticks_nmi_nesting() + incby);
 	barrier();
 }
 
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 650ff3cf0121..72dbf8512ce7 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -187,7 +187,6 @@ struct rcu_data {
 
 	/* 3) dynticks interface. */
 	int dynticks_snap;		/* Per-GP tracking for dynticks. */
-	long dynticks_nmi_nesting;	/* Track irq/NMI nesting level. */
 	bool rcu_need_heavy_qs;		/* GP old, so heavy quiescent state! */
 	bool rcu_urgent_qs;		/* GP old need light quiescent state. */
 	bool rcu_forced_tick;		/* Forced tick to provide QS. */
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index a9c82254b6c6..2683ce0a7c72 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -479,7 +479,7 @@ static void print_cpu_stall_info(int cpu)
 				"!."[!delta],
 	       ticks_value, ticks_title,
 	       rcu_dynticks_snap(cpu) & 0xfff,
-	       ct_dynticks_nesting_cpu(cpu), rdp->dynticks_nmi_nesting,
+	       ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu),
 	       rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
 	       data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
 	       rcuc_starved ? buf : "",
-- 
cgit 


From 564506495ca96a6e66d077d3d5b9f02d4b9b0f45 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 8 Jun 2022 16:40:32 +0200
Subject: rcu/context-tracking: Move deferred nocb resched to context tracking

To prepare for migrating the RCU eqs accounting code to context tracking,
split the last-resort deferred nocb resched from rcu_user_enter() and
move it into a separate call from context tracking.

Acked-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Nicolas Saenz Julienne <nsaenz@kernel.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Cc: Yu Liao <liaoyu15@huawei.com>
Cc: Phil Auld <pauld@redhat.com>
Cc: Paul Gortmaker<paul.gortmaker@windriver.com>
Cc: Alex Belits <abelits@marvell.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Tested-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
---
 include/linux/rcupdate.h  |  6 ++++++
 kernel/context_tracking.c |  8 ++++++++
 kernel/rcu/tree.c         | 15 ++-------------
 3 files changed, 16 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index f1562d91c67d..3717cad983a6 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -112,6 +112,12 @@ static inline void rcu_user_enter(void) { }
 static inline void rcu_user_exit(void) { }
 #endif /* CONFIG_NO_HZ_FULL */
 
+#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK))
+void rcu_irq_work_resched(void);
+#else
+static inline void rcu_irq_work_resched(void) { }
+#endif
+
 #ifdef CONFIG_RCU_NOCB_CPU
 void rcu_init_nohz(void);
 int rcu_nocb_cpu_offload(int cpu);
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 7c3033e9a518..8cf59d8a6af6 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -177,6 +177,8 @@ static __always_inline void context_tracking_recursion_exit(void)
  */
 void noinstr __ct_user_enter(enum ctx_state state)
 {
+	lockdep_assert_irqs_disabled();
+
 	/* Kernel threads aren't supposed to go to userspace */
 	WARN_ON_ONCE(!current->mm);
 
@@ -198,6 +200,12 @@ void noinstr __ct_user_enter(enum ctx_state state)
 				vtime_user_enter(current);
 				instrumentation_end();
 			}
+			/*
+			 * Other than generic entry implementation, we may be past the last
+			 * rescheduling opportunity in the entry code. Trigger a self IPI
+			 * that will fire and reschedule once we resume in user/guest mode.
+			 */
+			rcu_irq_work_resched();
 			rcu_user_enter();
 		}
 		/*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 006939b29e82..8c0c3490532e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -681,7 +681,7 @@ static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
  * last resort is to fire a local irq_work that will trigger a reschedule once IRQs
  * get re-enabled again.
  */
-noinstr static void rcu_irq_work_resched(void)
+noinstr void rcu_irq_work_resched(void)
 {
 	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 
@@ -697,10 +697,7 @@ noinstr static void rcu_irq_work_resched(void)
 	}
 	instrumentation_end();
 }
-
-#else
-static inline void rcu_irq_work_resched(void) { }
-#endif
+#endif /* #if !defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK) */
 
 /**
  * rcu_user_enter - inform RCU that we are resuming userspace.
@@ -715,14 +712,6 @@ static inline void rcu_irq_work_resched(void) { }
  */
 noinstr void rcu_user_enter(void)
 {
-	lockdep_assert_irqs_disabled();
-
-	/*
-	 * Other than generic entry implementation, we may be past the last
-	 * rescheduling opportunity in the entry code. Trigger a self IPI
-	 * that will fire and reschedule once we resume in user/guest mode.
-	 */
-	rcu_irq_work_resched();
 	rcu_eqs_enter(true);
 }
 
-- 
cgit 


From 172114552701b85d5c3b1a089a73ee85d0d7786b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 8 Jun 2022 16:40:33 +0200
Subject: rcu/context-tracking: Move RCU-dynticks internal functions to
 context_tracking

Move the core RCU eqs/dynticks functions to context tracking so that
we can later merge all that code within context tracking.

Acked-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Nicolas Saenz Julienne <nsaenz@kernel.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Cc: Yu Liao <liaoyu15@huawei.com>
Cc: Phil Auld <pauld@redhat.com>
Cc: Paul Gortmaker<paul.gortmaker@windriver.com>
Cc: Alex Belits <abelits@marvell.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Tested-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
---
 include/linux/context_tracking.h |  20 +++
 include/linux/rcutree.h          |   3 +
 kernel/context_tracking.c        | 336 +++++++++++++++++++++++++++++++++++++++
 kernel/rcu/tree.c                | 324 +------------------------------------
 kernel/rcu/tree.h                |   5 -
 kernel/rcu/tree_plugin.h         |  38 +----
 6 files changed, 364 insertions(+), 362 deletions(-)

(limited to 'include')

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 01abadb2f993..1f568676bc1d 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -122,6 +122,26 @@ static inline void context_tracking_init(void) { }
 #ifdef CONFIG_CONTEXT_TRACKING_IDLE
 extern void ct_idle_enter(void);
 extern void ct_idle_exit(void);
+
+/*
+ * Is the current CPU in an extended quiescent state?
+ *
+ * No ordering, as we are sampling CPU-local information.
+ */
+static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
+{
+	return !(arch_atomic_read(this_cpu_ptr(&context_tracking.dynticks)) & 0x1);
+}
+
+/*
+ * Increment the current CPU's context_tracking structure's ->dynticks field
+ * with ordering.  Return the new value.
+ */
+static __always_inline unsigned long rcu_dynticks_inc(int incby)
+{
+	return arch_atomic_add_return(incby, this_cpu_ptr(&context_tracking.dynticks));
+}
+
 #else
 static inline void ct_idle_enter(void) { }
 static inline void ct_idle_exit(void) { }
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 4522b6a7cc42..24db1e41695c 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -55,6 +55,9 @@ void rcu_irq_exit_check_preempt(void);
 static inline void rcu_irq_exit_check_preempt(void) { }
 #endif
 
+struct task_struct;
+void rcu_preempt_deferred_qs(struct task_struct *t);
+
 void exit_rcu(void);
 
 void rcu_scheduler_starting(void);
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 8cf59d8a6af6..072c4b6044b3 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -21,6 +21,7 @@
 #include <linux/hardirq.h>
 #include <linux/export.h>
 #include <linux/kprobes.h>
+#include <trace/events/rcu.h>
 
 
 DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
@@ -33,6 +34,309 @@ DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
 EXPORT_SYMBOL_GPL(context_tracking);
 
 #ifdef CONFIG_CONTEXT_TRACKING_IDLE
+#define TPS(x)  tracepoint_string(x)
+
+/* Record the current task on dyntick-idle entry. */
+static __always_inline void rcu_dynticks_task_enter(void)
+{
+#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
+	WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
+#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
+}
+
+/* Record no current task on dyntick-idle exit. */
+static __always_inline void rcu_dynticks_task_exit(void)
+{
+#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
+	WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
+#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
+}
+
+/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */
+static __always_inline void rcu_dynticks_task_trace_enter(void)
+{
+#ifdef CONFIG_TASKS_TRACE_RCU
+	if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
+		current->trc_reader_special.b.need_mb = true;
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
+}
+
+/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */
+static __always_inline void rcu_dynticks_task_trace_exit(void)
+{
+#ifdef CONFIG_TASKS_TRACE_RCU
+	if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
+		current->trc_reader_special.b.need_mb = false;
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
+}
+
+/*
+ * Record entry into an extended quiescent state.  This is only to be
+ * called when not already in an extended quiescent state, that is,
+ * RCU is watching prior to the call to this function and is no longer
+ * watching upon return.
+ */
+static noinstr void rcu_dynticks_eqs_enter(void)
+{
+	int seq;
+
+	/*
+	 * CPUs seeing atomic_add_return() must see prior RCU read-side
+	 * critical sections, and we also must force ordering with the
+	 * next idle sojourn.
+	 */
+	rcu_dynticks_task_trace_enter();  // Before ->dynticks update!
+	seq = rcu_dynticks_inc(1);
+	// RCU is no longer watching.  Better be in extended quiescent state!
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & 0x1));
+}
+
+/*
+ * Record exit from an extended quiescent state.  This is only to be
+ * called from an extended quiescent state, that is, RCU is not watching
+ * prior to the call to this function and is watching upon return.
+ */
+static noinstr void rcu_dynticks_eqs_exit(void)
+{
+	int seq;
+
+	/*
+	 * CPUs seeing atomic_add_return() must see prior idle sojourns,
+	 * and we also must force ordering with the next RCU read-side
+	 * critical section.
+	 */
+	seq = rcu_dynticks_inc(1);
+	// RCU is now watching.  Better not be in an extended quiescent state!
+	rcu_dynticks_task_trace_exit();  // After ->dynticks update!
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & 0x1));
+}
+
+/*
+ * Enter an RCU extended quiescent state, which can be either the
+ * idle loop or adaptive-tickless usermode execution.
+ *
+ * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
+ * the possibility of usermode upcalls having messed up our count
+ * of interrupt nesting level during the prior busy period.
+ */
+static void noinstr rcu_eqs_enter(bool user)
+{
+	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
+	WARN_ON_ONCE(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE);
+	WRITE_ONCE(ct->dynticks_nmi_nesting, 0);
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+		     ct_dynticks_nesting() == 0);
+	if (ct_dynticks_nesting() != 1) {
+		// RCU will still be watching, so just do accounting and leave.
+		ct->dynticks_nesting--;
+		return;
+	}
+
+	instrumentation_begin();
+	lockdep_assert_irqs_disabled();
+	trace_rcu_dyntick(TPS("Start"), ct_dynticks_nesting(), 0, ct_dynticks());
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
+	rcu_preempt_deferred_qs(current);
+
+	// instrumentation for the noinstr rcu_dynticks_eqs_enter()
+	instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks));
+
+	instrumentation_end();
+	WRITE_ONCE(ct->dynticks_nesting, 0); /* Avoid irq-access tearing. */
+	// RCU is watching here ...
+	rcu_dynticks_eqs_enter();
+	// ... but is no longer watching here.
+	rcu_dynticks_task_enter();
+}
+
+/*
+ * Exit an RCU extended quiescent state, which can be either the
+ * idle loop or adaptive-tickless usermode execution.
+ *
+ * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to
+ * allow for the possibility of usermode upcalls messing up our count of
+ * interrupt nesting level during the busy period that is just now starting.
+ */
+static void noinstr rcu_eqs_exit(bool user)
+{
+	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+	long oldval;
+
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
+	oldval = ct_dynticks_nesting();
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
+	if (oldval) {
+		// RCU was already watching, so just do accounting and leave.
+		ct->dynticks_nesting++;
+		return;
+	}
+	rcu_dynticks_task_exit();
+	// RCU is not watching here ...
+	rcu_dynticks_eqs_exit();
+	// ... but is watching here.
+	instrumentation_begin();
+
+	// instrumentation for the noinstr rcu_dynticks_eqs_exit()
+	instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks));
+
+	trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_dynticks());
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
+	WRITE_ONCE(ct->dynticks_nesting, 1);
+	WARN_ON_ONCE(ct_dynticks_nmi_nesting());
+	WRITE_ONCE(ct->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
+	instrumentation_end();
+}
+
+/**
+ * rcu_nmi_exit - inform RCU of exit from NMI context
+ *
+ * If we are returning from the outermost NMI handler that interrupted an
+ * RCU-idle period, update ct->dynticks and ct->dynticks_nmi_nesting
+ * to let the RCU grace-period handling know that the CPU is back to
+ * being RCU-idle.
+ *
+ * If you add or remove a call to rcu_nmi_exit(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr rcu_nmi_exit(void)
+{
+	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
+	instrumentation_begin();
+	/*
+	 * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
+	 * (We are exiting an NMI handler, so RCU better be paying attention
+	 * to us!)
+	 */
+	WARN_ON_ONCE(ct_dynticks_nmi_nesting() <= 0);
+	WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
+
+	/*
+	 * If the nesting level is not 1, the CPU wasn't RCU-idle, so
+	 * leave it in non-RCU-idle state.
+	 */
+	if (ct_dynticks_nmi_nesting() != 1) {
+		trace_rcu_dyntick(TPS("--="), ct_dynticks_nmi_nesting(), ct_dynticks_nmi_nesting() - 2,
+				  ct_dynticks());
+		WRITE_ONCE(ct->dynticks_nmi_nesting, /* No store tearing. */
+			   ct_dynticks_nmi_nesting() - 2);
+		instrumentation_end();
+		return;
+	}
+
+	/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
+	trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_dynticks());
+	WRITE_ONCE(ct->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
+
+	// instrumentation for the noinstr rcu_dynticks_eqs_enter()
+	instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks));
+	instrumentation_end();
+
+	// RCU is watching here ...
+	rcu_dynticks_eqs_enter();
+	// ... but is no longer watching here.
+
+	if (!in_nmi())
+		rcu_dynticks_task_enter();
+}
+
+/**
+ * rcu_nmi_enter - inform RCU of entry to NMI context
+ *
+ * If the CPU was idle from RCU's viewpoint, update ct->dynticks and
+ * ct->dynticks_nmi_nesting to let the RCU grace-period handling know
+ * that the CPU is active.  This implementation permits nested NMIs, as
+ * long as the nesting level does not overflow an int.  (You will probably
+ * run out of stack space first.)
+ *
+ * If you add or remove a call to rcu_nmi_enter(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr rcu_nmi_enter(void)
+{
+	long incby = 2;
+	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
+	/* Complain about underflow. */
+	WARN_ON_ONCE(ct_dynticks_nmi_nesting() < 0);
+
+	/*
+	 * If idle from RCU viewpoint, atomically increment ->dynticks
+	 * to mark non-idle and increment ->dynticks_nmi_nesting by one.
+	 * Otherwise, increment ->dynticks_nmi_nesting by two.  This means
+	 * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
+	 * to be in the outermost NMI handler that interrupted an RCU-idle
+	 * period (observation due to Andy Lutomirski).
+	 */
+	if (rcu_dynticks_curr_cpu_in_eqs()) {
+
+		if (!in_nmi())
+			rcu_dynticks_task_exit();
+
+		// RCU is not watching here ...
+		rcu_dynticks_eqs_exit();
+		// ... but is watching here.
+
+		instrumentation_begin();
+		// instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
+		instrument_atomic_read(&ct->dynticks, sizeof(ct->dynticks));
+		// instrumentation for the noinstr rcu_dynticks_eqs_exit()
+		instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks));
+
+		incby = 1;
+	} else if (!in_nmi()) {
+		instrumentation_begin();
+		rcu_irq_enter_check_tick();
+	} else  {
+		instrumentation_begin();
+	}
+
+	trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
+			  ct_dynticks_nmi_nesting(),
+			  ct_dynticks_nmi_nesting() + incby, ct_dynticks());
+	instrumentation_end();
+	WRITE_ONCE(ct->dynticks_nmi_nesting, /* Prevent store tearing. */
+		   ct_dynticks_nmi_nesting() + incby);
+	barrier();
+}
+
+/**
+ * rcu_idle_enter - inform RCU that current CPU is entering idle
+ *
+ * Enter idle mode, in other words, -leave- the mode in which RCU
+ * read-side critical sections can occur.  (Though RCU read-side
+ * critical sections can occur in irq handlers in idle, a possibility
+ * handled by irq_enter() and irq_exit().)
+ *
+ * If you add or remove a call to rcu_idle_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr rcu_idle_enter(void)
+{
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
+	rcu_eqs_enter(false);
+}
+
+/**
+ * rcu_idle_exit - inform RCU that current CPU is leaving idle
+ *
+ * Exit idle mode, in other words, -enter- the mode in which RCU
+ * read-side critical sections can occur.
+ *
+ * If you add or remove a call to rcu_idle_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr rcu_idle_exit(void)
+{
+	unsigned long flags;
+
+	raw_local_irq_save(flags);
+	rcu_eqs_exit(false);
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(rcu_idle_exit);
+
 noinstr void ct_idle_enter(void)
 {
 	rcu_idle_enter();
@@ -139,6 +443,38 @@ noinstr void ct_nmi_exit(void)
 }
 #endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */
 
+#ifdef CONFIG_NO_HZ_FULL
+/**
+ * rcu_user_enter - inform RCU that we are resuming userspace.
+ *
+ * Enter RCU idle mode right before resuming userspace.  No use of RCU
+ * is permitted between this call and rcu_user_exit(). This way the
+ * CPU doesn't need to maintain the tick for RCU maintenance purposes
+ * when the CPU runs in userspace.
+ *
+ * If you add or remove a call to rcu_user_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+noinstr void rcu_user_enter(void)
+{
+	rcu_eqs_enter(true);
+}
+
+/**
+ * rcu_user_exit - inform RCU that we are exiting userspace.
+ *
+ * Exit RCU idle mode while entering the kernel because it can
+ * run a RCU read side critical section anytime.
+ *
+ * If you add or remove a call to rcu_user_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr rcu_user_exit(void)
+{
+	rcu_eqs_exit(true);
+}
+#endif /* #ifdef CONFIG_NO_HZ_FULL */
+
 #ifdef CONFIG_CONTEXT_TRACKING_USER
 
 #define CREATE_TRACE_POINTS
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8c0c3490532e..e2a2083079a2 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -62,6 +62,7 @@
 #include <linux/vmalloc.h>
 #include <linux/mm.h>
 #include <linux/kasan.h>
+#include <linux/context_tracking.h>
 #include "../time/tick-internal.h"
 
 #include "tree.h"
@@ -259,56 +260,6 @@ void rcu_softirq_qs(void)
 	rcu_tasks_qs(current, false);
 }
 
-/*
- * Increment the current CPU's rcu_data structure's ->dynticks field
- * with ordering.  Return the new value.
- */
-static noinline noinstr unsigned long rcu_dynticks_inc(int incby)
-{
-	return arch_atomic_add_return(incby, this_cpu_ptr(&context_tracking.dynticks));
-}
-
-/*
- * Record entry into an extended quiescent state.  This is only to be
- * called when not already in an extended quiescent state, that is,
- * RCU is watching prior to the call to this function and is no longer
- * watching upon return.
- */
-static noinstr void rcu_dynticks_eqs_enter(void)
-{
-	int seq;
-
-	/*
-	 * CPUs seeing atomic_add_return() must see prior RCU read-side
-	 * critical sections, and we also must force ordering with the
-	 * next idle sojourn.
-	 */
-	rcu_dynticks_task_trace_enter();  // Before ->dynticks update!
-	seq = rcu_dynticks_inc(1);
-	// RCU is no longer watching.  Better be in extended quiescent state!
-	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & 0x1));
-}
-
-/*
- * Record exit from an extended quiescent state.  This is only to be
- * called from an extended quiescent state, that is, RCU is not watching
- * prior to the call to this function and is watching upon return.
- */
-static noinstr void rcu_dynticks_eqs_exit(void)
-{
-	int seq;
-
-	/*
-	 * CPUs seeing atomic_add_return() must see prior idle sojourns,
-	 * and we also must force ordering with the next RCU read-side
-	 * critical section.
-	 */
-	seq = rcu_dynticks_inc(1);
-	// RCU is now watching.  Better not be in an extended quiescent state!
-	rcu_dynticks_task_trace_exit();  // After ->dynticks update!
-	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & 0x1));
-}
-
 /*
  * Reset the current CPU's ->dynticks counter to indicate that the
  * newly onlined CPU is no longer in an extended quiescent state.
@@ -326,16 +277,6 @@ static void rcu_dynticks_eqs_online(void)
 	rcu_dynticks_inc(1);
 }
 
-/*
- * Is the current CPU in an extended quiescent state?
- *
- * No ordering, as we are sampling CPU-local information.
- */
-static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
-{
-	return !(arch_atomic_read(this_cpu_ptr(&context_tracking.dynticks)) & 0x1);
-}
-
 /*
  * Snapshot the ->dynticks counter with full ordering so as to allow
  * stable comparison of this counter with past and future snapshots.
@@ -601,65 +542,7 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
 }
 EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
 
-/*
- * Enter an RCU extended quiescent state, which can be either the
- * idle loop or adaptive-tickless usermode execution.
- *
- * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
- * the possibility of usermode upcalls having messed up our count
- * of interrupt nesting level during the prior busy period.
- */
-static noinstr void rcu_eqs_enter(bool user)
-{
-	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
-
-	WARN_ON_ONCE(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE);
-	WRITE_ONCE(ct->dynticks_nmi_nesting, 0);
-	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
-		     ct_dynticks_nesting() == 0);
-	if (ct_dynticks_nesting() != 1) {
-		// RCU will still be watching, so just do accounting and leave.
-		ct->dynticks_nesting--;
-		return;
-	}
-
-	instrumentation_begin();
-	lockdep_assert_irqs_disabled();
-	trace_rcu_dyntick(TPS("Start"), ct_dynticks_nesting(), 0, ct_dynticks());
-	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
-	rcu_preempt_deferred_qs(current);
-
-	// instrumentation for the noinstr rcu_dynticks_eqs_enter()
-	instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks));
-
-	instrumentation_end();
-	WRITE_ONCE(ct->dynticks_nesting, 0); /* Avoid irq-access tearing. */
-	// RCU is watching here ...
-	rcu_dynticks_eqs_enter();
-	// ... but is no longer watching here.
-	rcu_dynticks_task_enter();
-}
-
-/**
- * rcu_idle_enter - inform RCU that current CPU is entering idle
- *
- * Enter idle mode, in other words, -leave- the mode in which RCU
- * read-side critical sections can occur.  (Though RCU read-side
- * critical sections can occur in irq handlers in idle, a possibility
- * handled by irq_enter() and irq_exit().)
- *
- * If you add or remove a call to rcu_idle_enter(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-void noinstr rcu_idle_enter(void)
-{
-	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
-	rcu_eqs_enter(false);
-}
-
-#ifdef CONFIG_NO_HZ_FULL
-
-#if !defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)
+#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK))
 /*
  * An empty function that will trigger a reschedule on
  * IRQ tail once IRQs get re-enabled on userspace/guest resume.
@@ -697,78 +580,7 @@ noinstr void rcu_irq_work_resched(void)
 	}
 	instrumentation_end();
 }
-#endif /* #if !defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK) */
-
-/**
- * rcu_user_enter - inform RCU that we are resuming userspace.
- *
- * Enter RCU idle mode right before resuming userspace.  No use of RCU
- * is permitted between this call and rcu_user_exit(). This way the
- * CPU doesn't need to maintain the tick for RCU maintenance purposes
- * when the CPU runs in userspace.
- *
- * If you add or remove a call to rcu_user_enter(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-noinstr void rcu_user_enter(void)
-{
-	rcu_eqs_enter(true);
-}
-
-#endif /* CONFIG_NO_HZ_FULL */
-
-/**
- * rcu_nmi_exit - inform RCU of exit from NMI context
- *
- * If we are returning from the outermost NMI handler that interrupted an
- * RCU-idle period, update ct->dynticks and ct->dynticks_nmi_nesting
- * to let the RCU grace-period handling know that the CPU is back to
- * being RCU-idle.
- *
- * If you add or remove a call to rcu_nmi_exit(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
- */
-noinstr void rcu_nmi_exit(void)
-{
-	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
-
-	instrumentation_begin();
-	/*
-	 * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
-	 * (We are exiting an NMI handler, so RCU better be paying attention
-	 * to us!)
-	 */
-	WARN_ON_ONCE(ct_dynticks_nmi_nesting() <= 0);
-	WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
-
-	/*
-	 * If the nesting level is not 1, the CPU wasn't RCU-idle, so
-	 * leave it in non-RCU-idle state.
-	 */
-	if (ct_dynticks_nmi_nesting() != 1) {
-		trace_rcu_dyntick(TPS("--="), ct_dynticks_nmi_nesting(), ct_dynticks_nmi_nesting() - 2,
-				  ct_dynticks());
-		WRITE_ONCE(ct->dynticks_nmi_nesting, /* No store tearing. */
-			   ct_dynticks_nmi_nesting() - 2);
-		instrumentation_end();
-		return;
-	}
-
-	/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
-	trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_dynticks());
-	WRITE_ONCE(ct->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
-
-	// instrumentation for the noinstr rcu_dynticks_eqs_enter()
-	instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks));
-	instrumentation_end();
-
-	// RCU is watching here ...
-	rcu_dynticks_eqs_enter();
-	// ... but is no longer watching here.
-
-	if (!in_nmi())
-		rcu_dynticks_task_enter();
-}
+#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) */
 
 #ifdef CONFIG_PROVE_RCU
 /**
@@ -788,77 +600,7 @@ void rcu_irq_exit_check_preempt(void)
 }
 #endif /* #ifdef CONFIG_PROVE_RCU */
 
-/*
- * Exit an RCU extended quiescent state, which can be either the
- * idle loop or adaptive-tickless usermode execution.
- *
- * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to
- * allow for the possibility of usermode upcalls messing up our count of
- * interrupt nesting level during the busy period that is just now starting.
- */
-static void noinstr rcu_eqs_exit(bool user)
-{
-	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
-	long oldval;
-
-	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
-	oldval = ct_dynticks_nesting();
-	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
-	if (oldval) {
-		// RCU was already watching, so just do accounting and leave.
-		ct->dynticks_nesting++;
-		return;
-	}
-	rcu_dynticks_task_exit();
-	// RCU is not watching here ...
-	rcu_dynticks_eqs_exit();
-	// ... but is watching here.
-	instrumentation_begin();
-
-	// instrumentation for the noinstr rcu_dynticks_eqs_exit()
-	instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks));
-
-	trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_dynticks());
-	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
-	WRITE_ONCE(ct->dynticks_nesting, 1);
-	WARN_ON_ONCE(ct_dynticks_nmi_nesting());
-	WRITE_ONCE(ct->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
-	instrumentation_end();
-}
-
-/**
- * rcu_idle_exit - inform RCU that current CPU is leaving idle
- *
- * Exit idle mode, in other words, -enter- the mode in which RCU
- * read-side critical sections can occur.
- *
- * If you add or remove a call to rcu_idle_exit(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-void noinstr rcu_idle_exit(void)
-{
-	unsigned long flags;
-
-	raw_local_irq_save(flags);
-	rcu_eqs_exit(false);
-	raw_local_irq_restore(flags);
-}
-
 #ifdef CONFIG_NO_HZ_FULL
-/**
- * rcu_user_exit - inform RCU that we are exiting userspace.
- *
- * Exit RCU idle mode while entering the kernel because it can
- * run a RCU read side critical section anytime.
- *
- * If you add or remove a call to rcu_user_exit(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-void noinstr rcu_user_exit(void)
-{
-	rcu_eqs_exit(true);
-}
-
 /**
  * __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it.
  *
@@ -921,66 +663,6 @@ void __rcu_irq_enter_check_tick(void)
 }
 #endif /* CONFIG_NO_HZ_FULL */
 
-/**
- * rcu_nmi_enter - inform RCU of entry to NMI context
- *
- * If the CPU was idle from RCU's viewpoint, update ct->dynticks and
- * ct->dynticks_nmi_nesting to let the RCU grace-period handling know
- * that the CPU is active.  This implementation permits nested NMIs, as
- * long as the nesting level does not overflow an int.  (You will probably
- * run out of stack space first.)
- *
- * If you add or remove a call to rcu_nmi_enter(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
- */
-noinstr void rcu_nmi_enter(void)
-{
-	long incby = 2;
-	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
-
-	/* Complain about underflow. */
-	WARN_ON_ONCE(ct_dynticks_nmi_nesting() < 0);
-
-	/*
-	 * If idle from RCU viewpoint, atomically increment ->dynticks
-	 * to mark non-idle and increment ->dynticks_nmi_nesting by one.
-	 * Otherwise, increment ->dynticks_nmi_nesting by two.  This means
-	 * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
-	 * to be in the outermost NMI handler that interrupted an RCU-idle
-	 * period (observation due to Andy Lutomirski).
-	 */
-	if (rcu_dynticks_curr_cpu_in_eqs()) {
-
-		if (!in_nmi())
-			rcu_dynticks_task_exit();
-
-		// RCU is not watching here ...
-		rcu_dynticks_eqs_exit();
-		// ... but is watching here.
-
-		instrumentation_begin();
-		// instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
-		instrument_atomic_read(&ct->dynticks, sizeof(ct->dynticks));
-		// instrumentation for the noinstr rcu_dynticks_eqs_exit()
-		instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks));
-
-		incby = 1;
-	} else if (!in_nmi()) {
-		instrumentation_begin();
-		rcu_irq_enter_check_tick();
-	} else  {
-		instrumentation_begin();
-	}
-
-	trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
-			  ct_dynticks_nmi_nesting(),
-			  ct_dynticks_nmi_nesting() + incby, ct_dynticks());
-	instrumentation_end();
-	WRITE_ONCE(ct->dynticks_nmi_nesting, /* Prevent store tearing. */
-		   ct_dynticks_nmi_nesting() + incby);
-	barrier();
-}
-
 /*
  * Check to see if any future non-offloaded RCU-related work will need
  * to be done by the current CPU, even if none need be done immediately,
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 72dbf8512ce7..0d5d1de327e4 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -427,7 +427,6 @@ static void rcu_cpu_kthread_setup(unsigned int cpu);
 static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp);
 static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
 static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
-static void rcu_preempt_deferred_qs(struct task_struct *t);
 static void zero_cpu_stall_ticks(struct rcu_data *rdp);
 static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
 static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
@@ -467,10 +466,6 @@ do {								\
 
 static void rcu_bind_gp_kthread(void);
 static bool rcu_nohz_full_cpu(void);
-static void rcu_dynticks_task_enter(void);
-static void rcu_dynticks_task_exit(void);
-static void rcu_dynticks_task_trace_enter(void);
-static void rcu_dynticks_task_trace_exit(void);
 
 /* Forward declarations for tree_stall.h */
 static void record_gp_stall_check_time(void);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c8ba0fe17267..4a53aa013f82 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -595,7 +595,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
  * evaluate safety in terms of interrupt, softirq, and preemption
  * disabling.
  */
-static void rcu_preempt_deferred_qs(struct task_struct *t)
+void rcu_preempt_deferred_qs(struct task_struct *t)
 {
 	unsigned long flags;
 
@@ -935,7 +935,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
 // period for a quiescent state from this CPU.  Note that requests from
 // tasks are handled when removing the task from the blocked-tasks list
 // below.
-static void rcu_preempt_deferred_qs(struct task_struct *t)
+void rcu_preempt_deferred_qs(struct task_struct *t)
 {
 	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 
@@ -1290,37 +1290,3 @@ static void rcu_bind_gp_kthread(void)
 		return;
 	housekeeping_affine(current, HK_TYPE_RCU);
 }
-
-/* Record the current task on dyntick-idle entry. */
-static __always_inline void rcu_dynticks_task_enter(void)
-{
-#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
-	WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
-#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
-}
-
-/* Record no current task on dyntick-idle exit. */
-static __always_inline void rcu_dynticks_task_exit(void)
-{
-#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
-	WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
-#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
-}
-
-/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */
-static __always_inline void rcu_dynticks_task_trace_enter(void)
-{
-#ifdef CONFIG_TASKS_TRACE_RCU
-	if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
-		current->trc_reader_special.b.need_mb = true;
-#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
-}
-
-/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */
-static __always_inline void rcu_dynticks_task_trace_exit(void)
-{
-#ifdef CONFIG_TASKS_TRACE_RCU
-	if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
-		current->trc_reader_special.b.need_mb = false;
-#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
-}
-- 
cgit 


From c33ef43a359001415032665dfcd433979c462b71 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 8 Jun 2022 16:40:34 +0200
Subject: rcu/context-tracking: Remove unused and/or unecessary middle
 functions

Some eqs functions are now only used internally by context tracking, so
their public declarations can be removed.

Also middle functions such as rcu_user_*() and rcu_idle_*()
which now directly call to rcu_eqs_enter() and rcu_eqs_exit() can be
wiped out as well.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Nicolas Saenz Julienne <nsaenz@kernel.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Cc: Yu Liao <liaoyu15@huawei.com>
Cc: Phil Auld <pauld@redhat.com>
Cc: Paul Gortmaker<paul.gortmaker@windriver.com>
Cc: Alex Belits <abelits@marvell.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Tested-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
---
 Documentation/RCU/stallwarn.rst |  2 +-
 include/linux/hardirq.h         |  8 ----
 include/linux/rcupdate.h        |  8 ----
 include/linux/rcutiny.h         |  2 -
 include/linux/rcutree.h         |  2 -
 kernel/context_tracking.c       | 98 ++++++++++++-----------------------------
 6 files changed, 28 insertions(+), 92 deletions(-)

(limited to 'include')

diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst
index ce1f58a9d954..e38c587067fc 100644
--- a/Documentation/RCU/stallwarn.rst
+++ b/Documentation/RCU/stallwarn.rst
@@ -97,7 +97,7 @@ warnings:
 	which will include additional debugging information.
 
 -	A low-level kernel issue that either fails to invoke one of the
-	variants of rcu_user_enter(), rcu_user_exit(), ct_idle_enter(),
+	variants of rcu_eqs_enter(true), rcu_eqs_exit(true), ct_idle_enter(),
 	ct_idle_exit(), ct_irq_enter(), or ct_irq_exit() on the one
 	hand, or that invokes one of them too many times on the other.
 	Historically, the most frequent issue has been an omission
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 345cdbe9c1b7..d57cab4d4c06 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -92,14 +92,6 @@ void irq_exit_rcu(void);
 #define arch_nmi_exit()		do { } while (0)
 #endif
 
-#ifdef CONFIG_TINY_RCU
-static inline void rcu_nmi_enter(void) { }
-static inline void rcu_nmi_exit(void) { }
-#else
-extern void rcu_nmi_enter(void);
-extern void rcu_nmi_exit(void);
-#endif
-
 /*
  * NMI vs Tracing
  * --------------
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 3717cad983a6..434da1eb88cd 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -104,14 +104,6 @@ static inline void rcu_sysrq_start(void) { }
 static inline void rcu_sysrq_end(void) { }
 #endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */
 
-#ifdef CONFIG_NO_HZ_FULL
-void rcu_user_enter(void);
-void rcu_user_exit(void);
-#else
-static inline void rcu_user_enter(void) { }
-static inline void rcu_user_exit(void) { }
-#endif /* CONFIG_NO_HZ_FULL */
-
 #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK))
 void rcu_irq_work_resched(void);
 #else
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 591119413cf1..900ba35c3582 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -76,8 +76,6 @@ static inline int rcu_needs_cpu(void)
 static inline void rcu_virt_note_context_switch(int cpu) { }
 static inline void rcu_cpu_stall_reset(void) { }
 static inline int rcu_jiffies_till_stall_check(void) { return 21 * HZ; }
-static inline void rcu_idle_enter(void) { }
-static inline void rcu_idle_exit(void) { }
 static inline void rcu_irq_exit_check_preempt(void) { }
 #define rcu_is_idle_cpu(cpu) \
 	(is_idle_task(current) && !in_nmi() && !in_hardirq() && !in_serving_softirq())
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 24db1e41695c..9cca00ed9bc9 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -45,8 +45,6 @@ unsigned long start_poll_synchronize_rcu(void);
 bool poll_state_synchronize_rcu(unsigned long oldstate);
 void cond_synchronize_rcu(unsigned long oldstate);
 
-void rcu_idle_enter(void);
-void rcu_idle_exit(void);
 bool rcu_is_idle_cpu(int cpu);
 
 #ifdef CONFIG_PROVE_RCU
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 072c4b6044b3..e485b6b01537 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -189,17 +189,17 @@ static void noinstr rcu_eqs_exit(bool user)
 }
 
 /**
- * rcu_nmi_exit - inform RCU of exit from NMI context
+ * ct_nmi_exit - inform RCU of exit from NMI context
  *
  * If we are returning from the outermost NMI handler that interrupted an
  * RCU-idle period, update ct->dynticks and ct->dynticks_nmi_nesting
  * to let the RCU grace-period handling know that the CPU is back to
  * being RCU-idle.
  *
- * If you add or remove a call to rcu_nmi_exit(), be sure to test
+ * If you add or remove a call to ct_nmi_exit(), be sure to test
  * with CONFIG_RCU_EQS_DEBUG=y.
  */
-void noinstr rcu_nmi_exit(void)
+void noinstr ct_nmi_exit(void)
 {
 	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
 
@@ -242,7 +242,7 @@ void noinstr rcu_nmi_exit(void)
 }
 
 /**
- * rcu_nmi_enter - inform RCU of entry to NMI context
+ * ct_nmi_enter - inform RCU of entry to NMI context
  *
  * If the CPU was idle from RCU's viewpoint, update ct->dynticks and
  * ct->dynticks_nmi_nesting to let the RCU grace-period handling know
@@ -250,10 +250,10 @@ void noinstr rcu_nmi_exit(void)
  * long as the nesting level does not overflow an int.  (You will probably
  * run out of stack space first.)
  *
- * If you add or remove a call to rcu_nmi_enter(), be sure to test
+ * If you add or remove a call to ct_nmi_enter(), be sure to test
  * with CONFIG_RCU_EQS_DEBUG=y.
  */
-void noinstr rcu_nmi_enter(void)
+void noinstr ct_nmi_enter(void)
 {
 	long incby = 2;
 	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
@@ -302,32 +302,33 @@ void noinstr rcu_nmi_enter(void)
 }
 
 /**
- * rcu_idle_enter - inform RCU that current CPU is entering idle
+ * ct_idle_enter - inform RCU that current CPU is entering idle
  *
  * Enter idle mode, in other words, -leave- the mode in which RCU
  * read-side critical sections can occur.  (Though RCU read-side
  * critical sections can occur in irq handlers in idle, a possibility
  * handled by irq_enter() and irq_exit().)
  *
- * If you add or remove a call to rcu_idle_enter(), be sure to test with
+ * If you add or remove a call to ct_idle_enter(), be sure to test with
  * CONFIG_RCU_EQS_DEBUG=y.
  */
-void noinstr rcu_idle_enter(void)
+void noinstr ct_idle_enter(void)
 {
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
 	rcu_eqs_enter(false);
 }
+EXPORT_SYMBOL_GPL(ct_idle_enter);
 
 /**
- * rcu_idle_exit - inform RCU that current CPU is leaving idle
+ * ct_idle_exit - inform RCU that current CPU is leaving idle
  *
  * Exit idle mode, in other words, -enter- the mode in which RCU
  * read-side critical sections can occur.
  *
- * If you add or remove a call to rcu_idle_exit(), be sure to test with
+ * If you add or remove a call to ct_idle_exit(), be sure to test with
  * CONFIG_RCU_EQS_DEBUG=y.
  */
-void noinstr rcu_idle_exit(void)
+void noinstr ct_idle_exit(void)
 {
 	unsigned long flags;
 
@@ -335,18 +336,6 @@ void noinstr rcu_idle_exit(void)
 	rcu_eqs_exit(false);
 	raw_local_irq_restore(flags);
 }
-EXPORT_SYMBOL_GPL(rcu_idle_exit);
-
-noinstr void ct_idle_enter(void)
-{
-	rcu_idle_enter();
-}
-EXPORT_SYMBOL_GPL(ct_idle_enter);
-
-void ct_idle_exit(void)
-{
-	rcu_idle_exit();
-}
 EXPORT_SYMBOL_GPL(ct_idle_exit);
 
 /**
@@ -431,50 +420,11 @@ void ct_irq_exit_irqson(void)
 	ct_irq_exit();
 	local_irq_restore(flags);
 }
-
-noinstr void ct_nmi_enter(void)
-{
-	rcu_nmi_enter();
-}
-
-noinstr void ct_nmi_exit(void)
-{
-	rcu_nmi_exit();
-}
+#else
+static __always_inline void rcu_eqs_enter(bool user) { }
+static __always_inline void rcu_eqs_exit(bool user) { }
 #endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */
 
-#ifdef CONFIG_NO_HZ_FULL
-/**
- * rcu_user_enter - inform RCU that we are resuming userspace.
- *
- * Enter RCU idle mode right before resuming userspace.  No use of RCU
- * is permitted between this call and rcu_user_exit(). This way the
- * CPU doesn't need to maintain the tick for RCU maintenance purposes
- * when the CPU runs in userspace.
- *
- * If you add or remove a call to rcu_user_enter(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-noinstr void rcu_user_enter(void)
-{
-	rcu_eqs_enter(true);
-}
-
-/**
- * rcu_user_exit - inform RCU that we are exiting userspace.
- *
- * Exit RCU idle mode while entering the kernel because it can
- * run a RCU read side critical section anytime.
- *
- * If you add or remove a call to rcu_user_exit(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-void noinstr rcu_user_exit(void)
-{
-	rcu_eqs_exit(true);
-}
-#endif /* #ifdef CONFIG_NO_HZ_FULL */
-
 #ifdef CONFIG_CONTEXT_TRACKING_USER
 
 #define CREATE_TRACE_POINTS
@@ -542,7 +492,13 @@ void noinstr __ct_user_enter(enum ctx_state state)
 			 * that will fire and reschedule once we resume in user/guest mode.
 			 */
 			rcu_irq_work_resched();
-			rcu_user_enter();
+			/*
+			 * Enter RCU idle mode right before resuming userspace.  No use of RCU
+			 * is permitted between this call and rcu_eqs_exit(). This way the
+			 * CPU doesn't need to maintain the tick for RCU maintenance purposes
+			 * when the CPU runs in userspace.
+			 */
+			rcu_eqs_enter(true);
 		}
 		/*
 		 * Even if context tracking is disabled on this CPU, because it's outside
@@ -579,7 +535,7 @@ void ct_user_enter(enum ctx_state state)
 	/*
 	 * Some contexts may involve an exception occuring in an irq,
 	 * leading to that nesting:
-	 * ct_irq_enter() rcu_user_exit() rcu_user_exit() ct_irq_exit()
+	 * ct_irq_enter() rcu_eqs_exit(true) rcu_eqs_enter(true) ct_irq_exit()
 	 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
 	 * helpers are enough to protect RCU uses inside the exception. So
 	 * just return immediately if we detect we are in an IRQ.
@@ -631,10 +587,10 @@ void noinstr __ct_user_exit(enum ctx_state state)
 	if (__this_cpu_read(context_tracking.state) == state) {
 		if (__this_cpu_read(context_tracking.active)) {
 			/*
-			 * We are going to run code that may use RCU. Inform
-			 * RCU core about that (ie: we may need the tick again).
+			 * Exit RCU idle mode while entering the kernel because it can
+			 * run a RCU read side critical section anytime.
 			 */
-			rcu_user_exit();
+			rcu_eqs_exit(true);
 			if (state == CONTEXT_USER) {
 				instrumentation_begin();
 				vtime_user_exit(current);
-- 
cgit 


From 171476775d32a40bfebf83250136c19b2e842672 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 8 Jun 2022 16:40:35 +0200
Subject: context_tracking: Convert state to atomic_t

Context tracking's state and dynticks counter are going to be merged
in a single field so that both updates can happen atomically and at the
same time. Prepare for that with converting the state into an atomic_t.

[ paulmck: Apply kernel test robot feedback. ]

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Nicolas Saenz Julienne <nsaenz@kernel.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Cc: Yu Liao <liaoyu15@huawei.com>
Cc: Phil Auld <pauld@redhat.com>
Cc: Paul Gortmaker<paul.gortmaker@windriver.com>
Cc: Alex Belits <abelits@marvell.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Tested-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
---
 include/linux/context_tracking.h       |  32 +++-----
 include/linux/context_tracking_state.h |  57 ++++++++++---
 kernel/context_tracking.c              | 143 ++++++++++++++++++++++-----------
 kernel/rcu/tree.c                      |  13 ++-
 kernel/rcu/tree_stall.h                |   4 +-
 5 files changed, 158 insertions(+), 91 deletions(-)

(limited to 'include')

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 1f568676bc1d..dcef4a9e4d63 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -56,7 +56,7 @@ static inline enum ctx_state exception_enter(void)
 	    !context_tracking_enabled())
 		return 0;
 
-	prev_ctx = this_cpu_read(context_tracking.state);
+	prev_ctx = __ct_state();
 	if (prev_ctx != CONTEXT_KERNEL)
 		ct_user_exit(prev_ctx);
 
@@ -86,33 +86,21 @@ static __always_inline void context_tracking_guest_exit(void)
 		__ct_user_exit(CONTEXT_GUEST);
 }
 
-/**
- * ct_state() - return the current context tracking state if known
- *
- * Returns the current cpu's context tracking state if context tracking
- * is enabled.  If context tracking is disabled, returns
- * CONTEXT_DISABLED.  This should be used primarily for debugging.
- */
-static __always_inline enum ctx_state ct_state(void)
-{
-	return context_tracking_enabled() ?
-		this_cpu_read(context_tracking.state) : CONTEXT_DISABLED;
-}
+#define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond))
+
 #else
 static inline void user_enter(void) { }
 static inline void user_exit(void) { }
 static inline void user_enter_irqoff(void) { }
 static inline void user_exit_irqoff(void) { }
-static inline enum ctx_state exception_enter(void) { return 0; }
+static inline int exception_enter(void) { return 0; }
 static inline void exception_exit(enum ctx_state prev_ctx) { }
-static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; }
+static inline int ct_state(void) { return -1; }
 static __always_inline bool context_tracking_guest_enter(void) { return false; }
 static inline void context_tracking_guest_exit(void) { }
-
+#define CT_WARN_ON(cond) do { } while (0)
 #endif /* !CONFIG_CONTEXT_TRACKING_USER */
 
-#define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond))
-
 #ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE
 extern void context_tracking_init(void);
 #else
@@ -130,16 +118,16 @@ extern void ct_idle_exit(void);
  */
 static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
 {
-	return !(arch_atomic_read(this_cpu_ptr(&context_tracking.dynticks)) & 0x1);
+	return !(arch_atomic_read(this_cpu_ptr(&context_tracking.state)) & RCU_DYNTICKS_IDX);
 }
 
 /*
- * Increment the current CPU's context_tracking structure's ->dynticks field
+ * Increment the current CPU's context_tracking structure's ->state field
  * with ordering.  Return the new value.
  */
-static __always_inline unsigned long rcu_dynticks_inc(int incby)
+static __always_inline unsigned long ct_state_inc(int incby)
 {
-	return arch_atomic_add_return(incby, this_cpu_ptr(&context_tracking.dynticks));
+	return arch_atomic_add_return(incby, this_cpu_ptr(&context_tracking.state));
 }
 
 #else
diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 5f11e3d2d85a..e20a74bc0597 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -6,15 +6,23 @@
 #include <linux/static_key.h>
 #include <linux/context_tracking_irq.h>
 
+/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */
+#define DYNTICK_IRQ_NONIDLE	((LONG_MAX / 2) + 1)
+
 enum ctx_state {
-	CONTEXT_DISABLED = -1,	/* returned by ct_state() if unknown */
-	CONTEXT_KERNEL = 0,
-	CONTEXT_USER,
-	CONTEXT_GUEST,
+	CONTEXT_DISABLED	= -1,	/* returned by ct_state() if unknown */
+	CONTEXT_KERNEL		= 0,
+	CONTEXT_IDLE		= 1,
+	CONTEXT_USER		= 2,
+	CONTEXT_GUEST		= 3,
+	CONTEXT_MAX		= 4,
 };
 
-/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */
-#define DYNTICK_IRQ_NONIDLE	((LONG_MAX / 2) + 1)
+/* Even value for idle, else odd. */
+#define RCU_DYNTICKS_IDX CONTEXT_MAX
+
+#define CT_STATE_MASK (CONTEXT_MAX - 1)
+#define CT_DYNTICKS_MASK (~CT_STATE_MASK)
 
 struct context_tracking {
 #ifdef CONFIG_CONTEXT_TRACKING_USER
@@ -26,10 +34,11 @@ struct context_tracking {
 	 */
 	bool active;
 	int recursion;
-	enum ctx_state state;
+#endif
+#ifdef CONFIG_CONTEXT_TRACKING
+	atomic_t state;
 #endif
 #ifdef CONFIG_CONTEXT_TRACKING_IDLE
-	atomic_t dynticks;		/* Even value for idle, else odd. */
 	long dynticks_nesting;		/* Track process nesting level. */
 	long dynticks_nmi_nesting;	/* Track irq/NMI nesting level. */
 #endif
@@ -37,26 +46,31 @@ struct context_tracking {
 
 #ifdef CONFIG_CONTEXT_TRACKING
 DECLARE_PER_CPU(struct context_tracking, context_tracking);
+
+static __always_inline int __ct_state(void)
+{
+	return atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_STATE_MASK;
+}
 #endif
 
 #ifdef CONFIG_CONTEXT_TRACKING_IDLE
 static __always_inline int ct_dynticks(void)
 {
-	return atomic_read(this_cpu_ptr(&context_tracking.dynticks));
+	return atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_DYNTICKS_MASK;
 }
 
 static __always_inline int ct_dynticks_cpu(int cpu)
 {
 	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
 
-	return atomic_read(&ct->dynticks);
+	return atomic_read(&ct->state) & CT_DYNTICKS_MASK;
 }
 
 static __always_inline int ct_dynticks_cpu_acquire(int cpu)
 {
 	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
 
-	return atomic_read_acquire(&ct->dynticks);
+	return atomic_read_acquire(&ct->state) & CT_DYNTICKS_MASK;
 }
 
 static __always_inline long ct_dynticks_nesting(void)
@@ -102,6 +116,27 @@ static inline bool context_tracking_enabled_this_cpu(void)
 	return context_tracking_enabled() && __this_cpu_read(context_tracking.active);
 }
 
+/**
+ * ct_state() - return the current context tracking state if known
+ *
+ * Returns the current cpu's context tracking state if context tracking
+ * is enabled.  If context tracking is disabled, returns
+ * CONTEXT_DISABLED.  This should be used primarily for debugging.
+ */
+static __always_inline int ct_state(void)
+{
+	int ret;
+
+	if (!context_tracking_enabled())
+		return CONTEXT_DISABLED;
+
+	preempt_disable();
+	ret = __ct_state();
+	preempt_enable();
+
+	return ret;
+}
+
 #else
 static __always_inline bool context_tracking_enabled(void) { return false; }
 static __always_inline bool context_tracking_enabled_cpu(int cpu) { return false; }
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index e485b6b01537..ca78ff27dc53 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -28,8 +28,8 @@ DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
 #ifdef CONFIG_CONTEXT_TRACKING_IDLE
 	.dynticks_nesting = 1,
 	.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
-	.dynticks = ATOMIC_INIT(1),
 #endif
+	.state = ATOMIC_INIT(RCU_DYNTICKS_IDX),
 };
 EXPORT_SYMBOL_GPL(context_tracking);
 
@@ -76,7 +76,7 @@ static __always_inline void rcu_dynticks_task_trace_exit(void)
  * RCU is watching prior to the call to this function and is no longer
  * watching upon return.
  */
-static noinstr void rcu_dynticks_eqs_enter(void)
+static noinstr void ct_kernel_exit_state(int offset)
 {
 	int seq;
 
@@ -86,9 +86,9 @@ static noinstr void rcu_dynticks_eqs_enter(void)
 	 * next idle sojourn.
 	 */
 	rcu_dynticks_task_trace_enter();  // Before ->dynticks update!
-	seq = rcu_dynticks_inc(1);
+	seq = ct_state_inc(offset);
 	// RCU is no longer watching.  Better be in extended quiescent state!
-	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & 0x1));
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICKS_IDX));
 }
 
 /*
@@ -96,7 +96,7 @@ static noinstr void rcu_dynticks_eqs_enter(void)
  * called from an extended quiescent state, that is, RCU is not watching
  * prior to the call to this function and is watching upon return.
  */
-static noinstr void rcu_dynticks_eqs_exit(void)
+static noinstr void ct_kernel_enter_state(int offset)
 {
 	int seq;
 
@@ -105,10 +105,10 @@ static noinstr void rcu_dynticks_eqs_exit(void)
 	 * and we also must force ordering with the next RCU read-side
 	 * critical section.
 	 */
-	seq = rcu_dynticks_inc(1);
+	seq = ct_state_inc(offset);
 	// RCU is now watching.  Better not be in an extended quiescent state!
 	rcu_dynticks_task_trace_exit();  // After ->dynticks update!
-	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & 0x1));
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICKS_IDX));
 }
 
 /*
@@ -119,7 +119,7 @@ static noinstr void rcu_dynticks_eqs_exit(void)
  * the possibility of usermode upcalls having messed up our count
  * of interrupt nesting level during the prior busy period.
  */
-static void noinstr rcu_eqs_enter(bool user)
+static void noinstr ct_kernel_exit(bool user, int offset)
 {
 	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
 
@@ -139,13 +139,13 @@ static void noinstr rcu_eqs_enter(bool user)
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
 	rcu_preempt_deferred_qs(current);
 
-	// instrumentation for the noinstr rcu_dynticks_eqs_enter()
-	instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks));
+	// instrumentation for the noinstr ct_kernel_exit_state()
+	instrument_atomic_write(&ct->state, sizeof(ct->state));
 
 	instrumentation_end();
 	WRITE_ONCE(ct->dynticks_nesting, 0); /* Avoid irq-access tearing. */
 	// RCU is watching here ...
-	rcu_dynticks_eqs_enter();
+	ct_kernel_exit_state(offset);
 	// ... but is no longer watching here.
 	rcu_dynticks_task_enter();
 }
@@ -158,7 +158,7 @@ static void noinstr rcu_eqs_enter(bool user)
  * allow for the possibility of usermode upcalls messing up our count of
  * interrupt nesting level during the busy period that is just now starting.
  */
-static void noinstr rcu_eqs_exit(bool user)
+static void noinstr ct_kernel_enter(bool user, int offset)
 {
 	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
 	long oldval;
@@ -173,12 +173,12 @@ static void noinstr rcu_eqs_exit(bool user)
 	}
 	rcu_dynticks_task_exit();
 	// RCU is not watching here ...
-	rcu_dynticks_eqs_exit();
+	ct_kernel_enter_state(offset);
 	// ... but is watching here.
 	instrumentation_begin();
 
-	// instrumentation for the noinstr rcu_dynticks_eqs_exit()
-	instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks));
+	// instrumentation for the noinstr ct_kernel_enter_state()
+	instrument_atomic_write(&ct->state, sizeof(ct->state));
 
 	trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_dynticks());
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
@@ -192,7 +192,7 @@ static void noinstr rcu_eqs_exit(bool user)
  * ct_nmi_exit - inform RCU of exit from NMI context
  *
  * If we are returning from the outermost NMI handler that interrupted an
- * RCU-idle period, update ct->dynticks and ct->dynticks_nmi_nesting
+ * RCU-idle period, update ct->state and ct->dynticks_nmi_nesting
  * to let the RCU grace-period handling know that the CPU is back to
  * being RCU-idle.
  *
@@ -229,12 +229,12 @@ void noinstr ct_nmi_exit(void)
 	trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_dynticks());
 	WRITE_ONCE(ct->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
 
-	// instrumentation for the noinstr rcu_dynticks_eqs_enter()
-	instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks));
+	// instrumentation for the noinstr ct_kernel_exit_state()
+	instrument_atomic_write(&ct->state, sizeof(ct->state));
 	instrumentation_end();
 
 	// RCU is watching here ...
-	rcu_dynticks_eqs_enter();
+	ct_kernel_exit_state(RCU_DYNTICKS_IDX);
 	// ... but is no longer watching here.
 
 	if (!in_nmi())
@@ -244,7 +244,7 @@ void noinstr ct_nmi_exit(void)
 /**
  * ct_nmi_enter - inform RCU of entry to NMI context
  *
- * If the CPU was idle from RCU's viewpoint, update ct->dynticks and
+ * If the CPU was idle from RCU's viewpoint, update ct->state and
  * ct->dynticks_nmi_nesting to let the RCU grace-period handling know
  * that the CPU is active.  This implementation permits nested NMIs, as
  * long as the nesting level does not overflow an int.  (You will probably
@@ -275,14 +275,14 @@ void noinstr ct_nmi_enter(void)
 			rcu_dynticks_task_exit();
 
 		// RCU is not watching here ...
-		rcu_dynticks_eqs_exit();
+		ct_kernel_enter_state(RCU_DYNTICKS_IDX);
 		// ... but is watching here.
 
 		instrumentation_begin();
 		// instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
-		instrument_atomic_read(&ct->dynticks, sizeof(ct->dynticks));
-		// instrumentation for the noinstr rcu_dynticks_eqs_exit()
-		instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks));
+		instrument_atomic_read(&ct->state, sizeof(ct->state));
+		// instrumentation for the noinstr ct_kernel_enter_state()
+		instrument_atomic_write(&ct->state, sizeof(ct->state));
 
 		incby = 1;
 	} else if (!in_nmi()) {
@@ -315,7 +315,7 @@ void noinstr ct_nmi_enter(void)
 void noinstr ct_idle_enter(void)
 {
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
-	rcu_eqs_enter(false);
+	ct_kernel_exit(false, RCU_DYNTICKS_IDX + CONTEXT_IDLE);
 }
 EXPORT_SYMBOL_GPL(ct_idle_enter);
 
@@ -333,7 +333,7 @@ void noinstr ct_idle_exit(void)
 	unsigned long flags;
 
 	raw_local_irq_save(flags);
-	rcu_eqs_exit(false);
+	ct_kernel_enter(false, RCU_DYNTICKS_IDX - CONTEXT_IDLE);
 	raw_local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(ct_idle_exit);
@@ -421,8 +421,8 @@ void ct_irq_exit_irqson(void)
 	local_irq_restore(flags);
 }
 #else
-static __always_inline void rcu_eqs_enter(bool user) { }
-static __always_inline void rcu_eqs_exit(bool user) { }
+static __always_inline void ct_kernel_exit(bool user, int offset) { }
+static __always_inline void ct_kernel_enter(bool user, int offset) { }
 #endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */
 
 #ifdef CONFIG_CONTEXT_TRACKING_USER
@@ -463,6 +463,7 @@ static __always_inline void context_tracking_recursion_exit(void)
  */
 void noinstr __ct_user_enter(enum ctx_state state)
 {
+	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
 	lockdep_assert_irqs_disabled();
 
 	/* Kernel threads aren't supposed to go to userspace */
@@ -471,8 +472,8 @@ void noinstr __ct_user_enter(enum ctx_state state)
 	if (!context_tracking_recursion_enter())
 		return;
 
-	if ( __this_cpu_read(context_tracking.state) != state) {
-		if (__this_cpu_read(context_tracking.active)) {
+	if (__ct_state() != state) {
+		if (ct->active) {
 			/*
 			 * At this stage, only low level arch entry code remains and
 			 * then we'll run in userspace. We can assume there won't be
@@ -492,28 +493,49 @@ void noinstr __ct_user_enter(enum ctx_state state)
 			 * that will fire and reschedule once we resume in user/guest mode.
 			 */
 			rcu_irq_work_resched();
+
 			/*
 			 * Enter RCU idle mode right before resuming userspace.  No use of RCU
 			 * is permitted between this call and rcu_eqs_exit(). This way the
 			 * CPU doesn't need to maintain the tick for RCU maintenance purposes
 			 * when the CPU runs in userspace.
 			 */
-			rcu_eqs_enter(true);
+			ct_kernel_exit(true, RCU_DYNTICKS_IDX + state);
+
+			/*
+			 * Special case if we only track user <-> kernel transitions for tickless
+			 * cputime accounting but we don't support RCU extended quiescent state.
+			 * In this we case we don't care about any concurrency/ordering.
+			 */
+			if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
+				atomic_set(&ct->state, state);
+		} else {
+			/*
+			 * Even if context tracking is disabled on this CPU, because it's outside
+			 * the full dynticks mask for example, we still have to keep track of the
+			 * context transitions and states to prevent inconsistency on those of
+			 * other CPUs.
+			 * If a task triggers an exception in userspace, sleep on the exception
+			 * handler and then migrate to another CPU, that new CPU must know where
+			 * the exception returns by the time we call exception_exit().
+			 * This information can only be provided by the previous CPU when it called
+			 * exception_enter().
+			 * OTOH we can spare the calls to vtime and RCU when context_tracking.active
+			 * is false because we know that CPU is not tickless.
+			 */
+			if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
+				/* Tracking for vtime only, no concurrent RCU EQS accounting */
+				atomic_set(&ct->state, state);
+			} else {
+				/*
+				 * Tracking for vtime and RCU EQS. Make sure we don't race
+				 * with NMIs. OTOH we don't care about ordering here since
+				 * RCU only requires RCU_DYNTICKS_IDX increments to be fully
+				 * ordered.
+				 */
+				atomic_add(state, &ct->state);
+			}
 		}
-		/*
-		 * Even if context tracking is disabled on this CPU, because it's outside
-		 * the full dynticks mask for example, we still have to keep track of the
-		 * context transitions and states to prevent inconsistency on those of
-		 * other CPUs.
-		 * If a task triggers an exception in userspace, sleep on the exception
-		 * handler and then migrate to another CPU, that new CPU must know where
-		 * the exception returns by the time we call exception_exit().
-		 * This information can only be provided by the previous CPU when it called
-		 * exception_enter().
-		 * OTOH we can spare the calls to vtime and RCU when context_tracking.active
-		 * is false because we know that CPU is not tickless.
-		 */
-		__this_cpu_write(context_tracking.state, state);
 	}
 	context_tracking_recursion_exit();
 }
@@ -581,24 +603,47 @@ NOKPROBE_SYMBOL(user_enter_callable);
  */
 void noinstr __ct_user_exit(enum ctx_state state)
 {
+	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
 	if (!context_tracking_recursion_enter())
 		return;
 
-	if (__this_cpu_read(context_tracking.state) == state) {
-		if (__this_cpu_read(context_tracking.active)) {
+	if (__ct_state() == state) {
+		if (ct->active) {
 			/*
 			 * Exit RCU idle mode while entering the kernel because it can
 			 * run a RCU read side critical section anytime.
 			 */
-			rcu_eqs_exit(true);
+			ct_kernel_enter(true, RCU_DYNTICKS_IDX - state);
 			if (state == CONTEXT_USER) {
 				instrumentation_begin();
 				vtime_user_exit(current);
 				trace_user_exit(0);
 				instrumentation_end();
 			}
+
+			/*
+			 * Special case if we only track user <-> kernel transitions for tickless
+			 * cputime accounting but we don't support RCU extended quiescent state.
+			 * In this we case we don't care about any concurrency/ordering.
+			 */
+			if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
+				atomic_set(&ct->state, CONTEXT_KERNEL);
+
+		} else {
+			if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
+				/* Tracking for vtime only, no concurrent RCU EQS accounting */
+				atomic_set(&ct->state, CONTEXT_KERNEL);
+			} else {
+				/*
+				 * Tracking for vtime and RCU EQS. Make sure we don't race
+				 * with NMIs. OTOH we don't care about ordering here since
+				 * RCU only requires RCU_DYNTICKS_IDX increments to be fully
+				 * ordered.
+				 */
+				atomic_sub(state, &ct->state);
+			}
 		}
-		__this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
 	}
 	context_tracking_recursion_exit();
 }
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e2a2083079a2..f9d20b40071f 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -272,9 +272,9 @@ void rcu_softirq_qs(void)
  */
 static void rcu_dynticks_eqs_online(void)
 {
-	if (ct_dynticks() & 0x1)
+	if (ct_dynticks() & RCU_DYNTICKS_IDX)
 		return;
-	rcu_dynticks_inc(1);
+	ct_state_inc(RCU_DYNTICKS_IDX);
 }
 
 /*
@@ -293,7 +293,7 @@ static int rcu_dynticks_snap(int cpu)
  */
 static bool rcu_dynticks_in_eqs(int snap)
 {
-	return !(snap & 0x1);
+	return !(snap & RCU_DYNTICKS_IDX);
 }
 
 /* Return true if the specified CPU is currently idle from an RCU viewpoint.  */
@@ -321,8 +321,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
 	int snap;
 
 	// If not quiescent, force back to earlier extended quiescent state.
-	snap = ct_dynticks_cpu(cpu) & ~0x1;
-
+	snap = ct_dynticks_cpu(cpu) & ~RCU_DYNTICKS_IDX;
 	smp_rmb(); // Order ->dynticks and *vp reads.
 	if (READ_ONCE(*vp))
 		return false;  // Non-zero, so report failure;
@@ -348,9 +347,9 @@ notrace void rcu_momentary_dyntick_idle(void)
 	int seq;
 
 	raw_cpu_write(rcu_data.rcu_need_heavy_qs, false);
-	seq = rcu_dynticks_inc(2);
+	seq = ct_state_inc(2 * RCU_DYNTICKS_IDX);
 	/* It is illegal to call this from idle state. */
-	WARN_ON_ONCE(!(seq & 0x1));
+	WARN_ON_ONCE(!(seq & RCU_DYNTICKS_IDX));
 	rcu_preempt_deferred_qs(current);
 }
 EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle);
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 2683ce0a7c72..195cad14742d 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -469,7 +469,7 @@ static void print_cpu_stall_info(int cpu)
 	rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j);
 	if (rcuc_starved)
 		sprintf(buf, " rcuc=%ld jiffies(starved)", j);
-	pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld%s%s\n",
+	pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%04x/%ld/%#lx softirq=%u/%u fqs=%ld%s%s\n",
 	       cpu,
 	       "O."[!!cpu_online(cpu)],
 	       "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
@@ -478,7 +478,7 @@ static void print_cpu_stall_info(int cpu)
 			rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
 				"!."[!delta],
 	       ticks_value, ticks_title,
-	       rcu_dynticks_snap(cpu) & 0xfff,
+	       rcu_dynticks_snap(cpu) & 0xffff,
 	       ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu),
 	       rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
 	       data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
-- 
cgit 


From 1dcaa3b462265f688613163a1562a65ee53a3311 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 16 Jun 2022 09:30:37 -0700
Subject: context_tracking: Use arch_atomic_read() in __ct_state for KASAN

Context tracking's __ct_state() function can be invoked from noinstr state
where RCU is not watching.  This means that its use of atomic_read()
causes KASAN to invoke the non-noinstr __kasan_check_read() function
from the noinstr function __ct_state().  This is problematic because
someone tracing the __kasan_check_read() function could get a nasty
surprise because of RCU not watching.

This commit therefore replaces the __ct_state() function's use of
atomic_read() with arch_atomic_read(), which KASAN does not attempt to
add instrumention to.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Marco Elver <elver@google.com>
Reviewed-by: Marco Elver <elver@google.com>
---
 include/linux/context_tracking_state.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index e20a74bc0597..4a4d56f77180 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -49,7 +49,7 @@ DECLARE_PER_CPU(struct context_tracking, context_tracking);
 
 static __always_inline int __ct_state(void)
 {
-	return atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_STATE_MASK;
+	return arch_atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_STATE_MASK;
 }
 #endif
 
-- 
cgit 


From d7be266adbfd3aca6965ea6a0c36b2c3d8fc9fc8 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Mon, 4 Jul 2022 22:02:40 +0300
Subject: net: sched: provide shim definitions for taprio_offload_{get,free}

All callers of taprio_offload_get() and taprio_offload_free() prior to
the blamed commit are conditionally compiled based on CONFIG_NET_SCH_TAPRIO.

felix_vsc9959.c is different; it provides vsc9959_qos_port_tas_set()
even when taprio is compiled out.

Provide shim definitions for the functions exported by taprio so that
felix_vsc9959.c is able to compile. vsc9959_qos_port_tas_set() in that
case is dead code anyway, and ocelot_port->taprio remains NULL, which is
fine for the rest of the logic.

Fixes: 1c9017e44af2 ("net: dsa: felix: keep reference on entire tc-taprio config")
Reported-by: Colin Foster <colin.foster@in-advantage.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Tested-by: Colin Foster <colin.foster@in-advantage.com>
Acked-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Link: https://lore.kernel.org/r/20220704190241.1288847-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/pkt_sched.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 44a35531952e..3372a1f67cf4 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -173,11 +173,28 @@ struct tc_taprio_qopt_offload {
 	struct tc_taprio_sched_entry entries[];
 };
 
+#if IS_ENABLED(CONFIG_NET_SCH_TAPRIO)
+
 /* Reference counting */
 struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload
 						  *offload);
 void taprio_offload_free(struct tc_taprio_qopt_offload *offload);
 
+#else
+
+/* Reference counting */
+static inline struct tc_taprio_qopt_offload *
+taprio_offload_get(struct tc_taprio_qopt_offload *offload)
+{
+	return NULL;
+}
+
+static inline void taprio_offload_free(struct tc_taprio_qopt_offload *offload)
+{
+}
+
+#endif
+
 /* Ensure skb_mstamp_ns, which might have been populated with the txtime, is
  * not mistaken for a software timestamp, because this will otherwise prevent
  * the dispatch of hardware timestamps to the socket.
-- 
cgit 


From 3c660a5d86f4c01cf641bfea004a49f5860a5bed Mon Sep 17 00:00:00 2001
From: Daniel Müller <deso@posteo.net>
Date: Tue, 28 Jun 2022 16:01:18 +0000
Subject: bpf: Introduce TYPE_MATCH related constants/macros
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In order to provide type match support we require a new type of
relocation which, in turn, requires toolchain support. Recent LLVM/Clang
versions support a new value for the last argument to the
__builtin_preserve_type_info builtin, for example.
With this change we introduce the necessary constants into relevant
header files, mirroring what the compiler may support.

Signed-off-by: Daniel Müller <deso@posteo.net>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220628160127.607834-2-deso@posteo.net
---
 include/uapi/linux/bpf.h       | 1 +
 tools/include/uapi/linux/bpf.h | 1 +
 tools/lib/bpf/bpf_core_read.h  | 1 +
 3 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ad9e7311c4cf..379e68fb866f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6786,6 +6786,7 @@ enum bpf_core_relo_kind {
 	BPF_CORE_TYPE_SIZE = 9,              /* type size in bytes */
 	BPF_CORE_ENUMVAL_EXISTS = 10,        /* enum value existence in target kernel */
 	BPF_CORE_ENUMVAL_VALUE = 11,         /* enum value integer value */
+	BPF_CORE_TYPE_MATCHES = 12,          /* type match in target kernel */
 };
 
 /*
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index ad9e7311c4cf..379e68fb866f 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6786,6 +6786,7 @@ enum bpf_core_relo_kind {
 	BPF_CORE_TYPE_SIZE = 9,              /* type size in bytes */
 	BPF_CORE_ENUMVAL_EXISTS = 10,        /* enum value existence in target kernel */
 	BPF_CORE_ENUMVAL_VALUE = 11,         /* enum value integer value */
+	BPF_CORE_TYPE_MATCHES = 12,          /* type match in target kernel */
 };
 
 /*
diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h
index fd48b1ff59ca..2308f4990e96 100644
--- a/tools/lib/bpf/bpf_core_read.h
+++ b/tools/lib/bpf/bpf_core_read.h
@@ -29,6 +29,7 @@ enum bpf_type_id_kind {
 enum bpf_type_info_kind {
 	BPF_TYPE_EXISTS = 0,		/* type existence in target kernel */
 	BPF_TYPE_SIZE = 1,		/* type size in target kernel */
+	BPF_TYPE_MATCHES = 2,		/* type match in target kernel */
 };
 
 /* second argument to __builtin_preserve_enum_value() built-in */
-- 
cgit 


From 6a4e9307cd3782f8e805fac970b9a240ab3078d6 Mon Sep 17 00:00:00 2001
From: Julia Lawall <Julia.Lawall@inria.fr>
Date: Sat, 21 May 2022 13:11:12 +0200
Subject: dmaengine: qcom: fix typo in comment

Spelling mistake (triple letters) in comment.
Detected with the help of Coccinelle.

Signed-off-by: Julia Lawall <Julia.Lawall@inria.fr>

Link: https://lore.kernel.org/r/20220521111145.81697-62-Julia.Lawall@inria.fr
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dma/qcom-gpi-dma.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/dma/qcom-gpi-dma.h b/include/linux/dma/qcom-gpi-dma.h
index f46dc3372f11..6680dd1a43c6 100644
--- a/include/linux/dma/qcom-gpi-dma.h
+++ b/include/linux/dma/qcom-gpi-dma.h
@@ -26,7 +26,7 @@ enum spi_transfer_cmd {
  * @clk_div: source clock divider
  * @clk_src: serial clock
  * @cmd: spi cmd
- * @fragmentation: keep CS assserted at end of sequence
+ * @fragmentation: keep CS asserted at end of sequence
  * @cs: chip select toggle
  * @set_config: set peripheral config
  * @rx_len: receive length for buffer
-- 
cgit 


From bd29c00edd0a5dac8b6e7332bb470cd50f92e893 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 21 Jun 2022 17:56:38 -0500
Subject: soundwire: revisit driver bind/unbind and callbacks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the SoundWire probe, we store a pointer from the driver ops into
the 'slave' structure. This can lead to kernel oopses when unbinding
codec drivers, e.g. with the following sequence to remove machine
driver and codec driver.

/sbin/modprobe -r snd_soc_sof_sdw
/sbin/modprobe -r snd_soc_rt711

The full details can be found in the BugLink below, for reference the
two following examples show different cases of driver ops/callbacks
being invoked after the driver .remove().

kernel: BUG: kernel NULL pointer dereference, address: 0000000000000150
kernel: Workqueue: events cdns_update_slave_status_work [soundwire_cadence]
kernel: RIP: 0010:mutex_lock+0x19/0x30
kernel: Call Trace:
kernel:  ? sdw_handle_slave_status+0x426/0xe00 [soundwire_bus 94ff184bf398570c3f8ff7efe9e32529f532e4ae]
kernel:  ? newidle_balance+0x26a/0x400
kernel:  ? cdns_update_slave_status_work+0x1e9/0x200 [soundwire_cadence 1bcf98eebe5ba9833cd433323769ac923c9c6f82]

kernel: BUG: unable to handle page fault for address: ffffffffc07654c8
kernel: Workqueue: pm pm_runtime_work
kernel: RIP: 0010:sdw_bus_prep_clk_stop+0x6f/0x160 [soundwire_bus]
kernel: Call Trace:
kernel:  <TASK>
kernel:  sdw_cdns_clock_stop+0xb5/0x1b0 [soundwire_cadence 1bcf98eebe5ba9833cd433323769ac923c9c6f82]
kernel:  intel_suspend_runtime+0x5f/0x120 [soundwire_intel aca858f7c87048d3152a4a41bb68abb9b663a1dd]
kernel:  ? dpm_sysfs_remove+0x60/0x60

This was not detected earlier in Intel tests since the tests first
remove the parent PCI device and shut down the bus. The sequence
above is a corner case which keeps the bus operational but without a
driver bound.

While trying to solve this kernel oopses, it became clear that the
existing SoundWire bus does not deal well with the unbind case.

Commit 528be501b7d4a ("soundwire: sdw_slave: add probe_complete structure and new fields")
added a 'probed' status variable and a 'probe_complete'
struct completion. This status is however not reset on remove and
likewise the 'probe complete' is not re-initialized, so the
bind/unbind/bind test cases would fail. The timeout used before the
'update_status' callback was also a bad idea in hindsight, there
should really be no timing assumption as to if and when a driver is
bound to a device.

An initial draft was based on device_lock() and device_unlock() was
tested. This proved too complicated, with deadlocks created during the
suspend-resume sequences, which also use the same device_lock/unlock()
as the bind/unbind sequences. On a CometLake device, a bad DSDT/BIOS
caused spurious resumes and the use of device_lock() caused hangs
during suspend. After multiple weeks or testing and painful
reverse-engineering of deadlocks on different devices, we looked for
alternatives that did not interfere with the device core.

A bus notifier was used successfully to keep track of DRIVER_BOUND and
DRIVER_UNBIND events. This solved the bind-unbind-bind case in tests,
but it can still be defeated with a theoretical corner case where the
memory is freed by a .remove while the callback is in use. The
notifier only helps make sure the driver callbacks are valid, but not
that the memory allocated in probe remains valid while the callbacks
are invoked.

This patch suggests the introduction of a new 'sdw_dev_lock' mutex
protecting probe/remove and all driver callbacks. Since this mutex is
'local' to SoundWire only, it does not interfere with existing locks
and does not create deadlocks. In addition, this patch removes the
'probe_complete' completion, instead we directly invoke the
'update_status' from the probe routine. That removes any sort of
timing dependency and a much better support for the device/driver
model, the driver could be bound before the bus started, or eons after
the bus started and the hardware would be properly initialized in all
cases.

BugLink: https://github.com/thesofproject/linux/issues/3531
Fixes: 56d4fe31af77 ("soundwire: Add MIPI DisCo property helpers")
Fixes: 528be501b7d4a ("soundwire: sdw_slave: add probe_complete structure and new fields")
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Link: https://lore.kernel.org/r/20220621225641.221170-2-pierre-louis.bossart@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/bus.c       | 75 ++++++++++++++++++++++++-------------------
 drivers/soundwire/bus_type.c  | 30 ++++++++++++++---
 drivers/soundwire/slave.c     |  3 +-
 drivers/soundwire/stream.c    | 53 +++++++++++++++++++-----------
 include/linux/soundwire/sdw.h |  6 ++--
 5 files changed, 106 insertions(+), 61 deletions(-)

(limited to 'include')

diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c
index a2bfb0434a67..8d4000664fa3 100644
--- a/drivers/soundwire/bus.c
+++ b/drivers/soundwire/bus.c
@@ -7,6 +7,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/soundwire/sdw_registers.h>
 #include <linux/soundwire/sdw.h>
+#include <linux/soundwire/sdw_type.h>
 #include "bus.h"
 #include "sysfs_local.h"
 
@@ -842,15 +843,21 @@ static int sdw_slave_clk_stop_callback(struct sdw_slave *slave,
 				       enum sdw_clk_stop_mode mode,
 				       enum sdw_clk_stop_type type)
 {
-	int ret;
+	int ret = 0;
 
-	if (slave->ops && slave->ops->clk_stop) {
-		ret = slave->ops->clk_stop(slave, mode, type);
-		if (ret < 0)
-			return ret;
+	mutex_lock(&slave->sdw_dev_lock);
+
+	if (slave->probed)  {
+		struct device *dev = &slave->dev;
+		struct sdw_driver *drv = drv_to_sdw_driver(dev->driver);
+
+		if (drv->ops && drv->ops->clk_stop)
+			ret = drv->ops->clk_stop(slave, mode, type);
 	}
 
-	return 0;
+	mutex_unlock(&slave->sdw_dev_lock);
+
+	return ret;
 }
 
 static int sdw_slave_clk_stop_prepare(struct sdw_slave *slave,
@@ -1611,14 +1618,24 @@ static int sdw_handle_slave_alerts(struct sdw_slave *slave)
 		}
 
 		/* Update the Slave driver */
-		if (slave_notify && slave->ops &&
-		    slave->ops->interrupt_callback) {
-			slave_intr.sdca_cascade = sdca_cascade;
-			slave_intr.control_port = clear;
-			memcpy(slave_intr.port, &port_status,
-			       sizeof(slave_intr.port));
-
-			slave->ops->interrupt_callback(slave, &slave_intr);
+		if (slave_notify) {
+			mutex_lock(&slave->sdw_dev_lock);
+
+			if (slave->probed) {
+				struct device *dev = &slave->dev;
+				struct sdw_driver *drv = drv_to_sdw_driver(dev->driver);
+
+				if (drv->ops && drv->ops->interrupt_callback) {
+					slave_intr.sdca_cascade = sdca_cascade;
+					slave_intr.control_port = clear;
+					memcpy(slave_intr.port, &port_status,
+					       sizeof(slave_intr.port));
+
+					drv->ops->interrupt_callback(slave, &slave_intr);
+				}
+			}
+
+			mutex_unlock(&slave->sdw_dev_lock);
 		}
 
 		/* Ack interrupt */
@@ -1692,29 +1709,21 @@ io_err:
 static int sdw_update_slave_status(struct sdw_slave *slave,
 				   enum sdw_slave_status status)
 {
-	unsigned long time;
+	int ret = 0;
 
-	if (!slave->probed) {
-		/*
-		 * the slave status update is typically handled in an
-		 * interrupt thread, which can race with the driver
-		 * probe, e.g. when a module needs to be loaded.
-		 *
-		 * make sure the probe is complete before updating
-		 * status.
-		 */
-		time = wait_for_completion_timeout(&slave->probe_complete,
-				msecs_to_jiffies(DEFAULT_PROBE_TIMEOUT));
-		if (!time) {
-			dev_err(&slave->dev, "Probe not complete, timed out\n");
-			return -ETIMEDOUT;
-		}
+	mutex_lock(&slave->sdw_dev_lock);
+
+	if (slave->probed) {
+		struct device *dev = &slave->dev;
+		struct sdw_driver *drv = drv_to_sdw_driver(dev->driver);
+
+		if (drv->ops && drv->ops->update_status)
+			ret = drv->ops->update_status(slave, status);
 	}
 
-	if (!slave->ops || !slave->ops->update_status)
-		return 0;
+	mutex_unlock(&slave->sdw_dev_lock);
 
-	return slave->ops->update_status(slave, status);
+	return ret;
 }
 
 /**
diff --git a/drivers/soundwire/bus_type.c b/drivers/soundwire/bus_type.c
index b81e04dd3a9f..04b3529f8929 100644
--- a/drivers/soundwire/bus_type.c
+++ b/drivers/soundwire/bus_type.c
@@ -98,8 +98,6 @@ static int sdw_drv_probe(struct device *dev)
 	if (!id)
 		return -ENODEV;
 
-	slave->ops = drv->ops;
-
 	/*
 	 * attach to power domain but don't turn on (last arg)
 	 */
@@ -107,19 +105,23 @@ static int sdw_drv_probe(struct device *dev)
 	if (ret)
 		return ret;
 
+	mutex_lock(&slave->sdw_dev_lock);
+
 	ret = drv->probe(slave, id);
 	if (ret) {
 		name = drv->name;
 		if (!name)
 			name = drv->driver.name;
+		mutex_unlock(&slave->sdw_dev_lock);
+
 		dev_err(dev, "Probe of %s failed: %d\n", name, ret);
 		dev_pm_domain_detach(dev, false);
 		return ret;
 	}
 
 	/* device is probed so let's read the properties now */
-	if (slave->ops && slave->ops->read_prop)
-		slave->ops->read_prop(slave);
+	if (drv->ops && drv->ops->read_prop)
+		drv->ops->read_prop(slave);
 
 	/* init the sysfs as we have properties now */
 	ret = sdw_slave_sysfs_init(slave);
@@ -139,7 +141,19 @@ static int sdw_drv_probe(struct device *dev)
 					     slave->prop.clk_stop_timeout);
 
 	slave->probed = true;
-	complete(&slave->probe_complete);
+
+	/*
+	 * if the probe happened after the bus was started, notify the codec driver
+	 * of the current hardware status to e.g. start the initialization.
+	 * Errors are only logged as warnings to avoid failing the probe.
+	 */
+	if (drv->ops && drv->ops->update_status) {
+		ret = drv->ops->update_status(slave, slave->status);
+		if (ret < 0)
+			dev_warn(dev, "%s: update_status failed with status %d\n", __func__, ret);
+	}
+
+	mutex_unlock(&slave->sdw_dev_lock);
 
 	dev_dbg(dev, "probe complete\n");
 
@@ -152,9 +166,15 @@ static int sdw_drv_remove(struct device *dev)
 	struct sdw_driver *drv = drv_to_sdw_driver(dev->driver);
 	int ret = 0;
 
+	mutex_lock(&slave->sdw_dev_lock);
+
+	slave->probed = false;
+
 	if (drv->remove)
 		ret = drv->remove(slave);
 
+	mutex_unlock(&slave->sdw_dev_lock);
+
 	dev_pm_domain_detach(dev, false);
 
 	return ret;
diff --git a/drivers/soundwire/slave.c b/drivers/soundwire/slave.c
index 669d7573320b..25e76b5d4a1a 100644
--- a/drivers/soundwire/slave.c
+++ b/drivers/soundwire/slave.c
@@ -12,6 +12,7 @@ static void sdw_slave_release(struct device *dev)
 {
 	struct sdw_slave *slave = dev_to_sdw_dev(dev);
 
+	mutex_destroy(&slave->sdw_dev_lock);
 	kfree(slave);
 }
 
@@ -58,9 +59,9 @@ int sdw_slave_add(struct sdw_bus *bus,
 	init_completion(&slave->enumeration_complete);
 	init_completion(&slave->initialization_complete);
 	slave->dev_num = 0;
-	init_completion(&slave->probe_complete);
 	slave->probed = false;
 	slave->first_interrupt_done = false;
+	mutex_init(&slave->sdw_dev_lock);
 
 	for (i = 0; i < SDW_MAX_PORTS; i++)
 		init_completion(&slave->port_ready[i]);
diff --git a/drivers/soundwire/stream.c b/drivers/soundwire/stream.c
index d34150559142..bd502368339e 100644
--- a/drivers/soundwire/stream.c
+++ b/drivers/soundwire/stream.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/soundwire/sdw_registers.h>
 #include <linux/soundwire/sdw.h>
+#include <linux/soundwire/sdw_type.h>
 #include <sound/soc.h>
 #include "bus.h"
 
@@ -401,20 +402,26 @@ static int sdw_do_port_prep(struct sdw_slave_runtime *s_rt,
 			    struct sdw_prepare_ch prep_ch,
 			    enum sdw_port_prep_ops cmd)
 {
-	const struct sdw_slave_ops *ops = s_rt->slave->ops;
-	int ret;
+	int ret = 0;
+	struct sdw_slave *slave = s_rt->slave;
 
-	if (ops->port_prep) {
-		ret = ops->port_prep(s_rt->slave, &prep_ch, cmd);
-		if (ret < 0) {
-			dev_err(&s_rt->slave->dev,
-				"Slave Port Prep cmd %d failed: %d\n",
-				cmd, ret);
-			return ret;
+	mutex_lock(&slave->sdw_dev_lock);
+
+	if (slave->probed) {
+		struct device *dev = &slave->dev;
+		struct sdw_driver *drv = drv_to_sdw_driver(dev->driver);
+
+		if (drv->ops && drv->ops->port_prep) {
+			ret = drv->ops->port_prep(slave, &prep_ch, cmd);
+			if (ret < 0)
+				dev_err(dev, "Slave Port Prep cmd %d failed: %d\n",
+					cmd, ret);
 		}
 	}
 
-	return 0;
+	mutex_unlock(&slave->sdw_dev_lock);
+
+	return ret;
 }
 
 static int sdw_prep_deprep_slave_ports(struct sdw_bus *bus,
@@ -578,7 +585,7 @@ static int sdw_notify_config(struct sdw_master_runtime *m_rt)
 	struct sdw_slave_runtime *s_rt;
 	struct sdw_bus *bus = m_rt->bus;
 	struct sdw_slave *slave;
-	int ret = 0;
+	int ret;
 
 	if (bus->ops->set_bus_conf) {
 		ret = bus->ops->set_bus_conf(bus, &bus->params);
@@ -589,17 +596,27 @@ static int sdw_notify_config(struct sdw_master_runtime *m_rt)
 	list_for_each_entry(s_rt, &m_rt->slave_rt_list, m_rt_node) {
 		slave = s_rt->slave;
 
-		if (slave->ops->bus_config) {
-			ret = slave->ops->bus_config(slave, &bus->params);
-			if (ret < 0) {
-				dev_err(bus->dev, "Notify Slave: %d failed\n",
-					slave->dev_num);
-				return ret;
+		mutex_lock(&slave->sdw_dev_lock);
+
+		if (slave->probed) {
+			struct device *dev = &slave->dev;
+			struct sdw_driver *drv = drv_to_sdw_driver(dev->driver);
+
+			if (drv->ops && drv->ops->bus_config) {
+				ret = drv->ops->bus_config(slave, &bus->params);
+				if (ret < 0) {
+					dev_err(dev, "Notify Slave: %d failed\n",
+						slave->dev_num);
+					mutex_unlock(&slave->sdw_dev_lock);
+					return ret;
+				}
 			}
 		}
+
+		mutex_unlock(&slave->sdw_dev_lock);
 	}
 
-	return ret;
+	return 0;
 }
 
 /**
diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 76ce3f3ac0f2..bf6f0decb3f6 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -646,9 +646,6 @@ struct sdw_slave_ops {
  * @dev_num: Current Device Number, values can be 0 or dev_num_sticky
  * @dev_num_sticky: one-time static Device Number assigned by Bus
  * @probed: boolean tracking driver state
- * @probe_complete: completion utility to control potential races
- * on startup between driver probe/initialization and SoundWire
- * Slave state changes/implementation-defined interrupts
  * @enumeration_complete: completion utility to control potential races
  * on startup between device enumeration and read/write access to the
  * Slave device
@@ -663,6 +660,7 @@ struct sdw_slave_ops {
  * for a Slave happens for the first time after enumeration
  * @is_mockup_device: status flag used to squelch errors in the command/control
  * protocol for SoundWire mockup devices
+ * @sdw_dev_lock: mutex used to protect callbacks/remove races
  */
 struct sdw_slave {
 	struct sdw_slave_id id;
@@ -680,12 +678,12 @@ struct sdw_slave {
 	u16 dev_num;
 	u16 dev_num_sticky;
 	bool probed;
-	struct completion probe_complete;
 	struct completion enumeration_complete;
 	struct completion initialization_complete;
 	u32 unattach_request;
 	bool first_interrupt_done;
 	bool is_mockup_device;
+	struct mutex sdw_dev_lock; /* protect callbacks/remove races */
 };
 
 #define dev_to_sdw_dev(_dev) container_of(_dev, struct sdw_slave, dev)
-- 
cgit 


From 9a24bb35b0d81edb826f174d7752c2a54bc00abd Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 21 Jun 2022 17:56:39 -0500
Subject: soundwire: peripheral: remove useless ops pointer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that we are using the ops structure directly from the driver,
there are no users left of this ops pointer.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Link: https://lore.kernel.org/r/20220621225641.221170-3-pierre-louis.bossart@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index bf6f0decb3f6..39058c841469 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -637,7 +637,6 @@ struct sdw_slave_ops {
  * @dev: Linux device
  * @status: Status reported by the Slave
  * @bus: Bus handle
- * @ops: Slave callback ops
  * @prop: Slave properties
  * @debugfs: Slave debugfs
  * @node: node for bus list
@@ -667,7 +666,6 @@ struct sdw_slave {
 	struct device dev;
 	enum sdw_slave_status status;
 	struct sdw_bus *bus;
-	const struct sdw_slave_ops *ops;
 	struct sdw_slave_prop prop;
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *debugfs;
-- 
cgit 


From e9a023f2b73ac35ff5cfbefe8524c64d8173d65f Mon Sep 17 00:00:00 2001
From: Eric Lin <eric.lin@sifive.com>
Date: Tue, 5 Jul 2022 17:19:20 +0800
Subject: drivers/perf: riscv_pmu: Add riscv pmu pm notifier

Currently, when the CPU is doing suspend to ram, we don't
save pmu counter register and its content will be lost.

To ensure perf profiling is not affected by suspend to ram,
this patch is based on arm_pmu CPU_PM notifier and implements riscv
pmu pm notifier. In the pm notifier, we stop the counter and update
the counter value before suspend and start the counter after resume.

Signed-off-by: Eric Lin <eric.lin@sifive.com>
Link: https://lore.kernel.org/r/20220705091920.27432-1-eric.lin@sifive.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/riscv_pmu.c       |  4 +--
 drivers/perf/riscv_pmu_sbi.c   | 81 +++++++++++++++++++++++++++++++++++++++---
 include/linux/perf/riscv_pmu.h |  4 +++
 3 files changed, 83 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/perf/riscv_pmu.c b/drivers/perf/riscv_pmu.c
index b2b8d2074ed0..2c961839903d 100644
--- a/drivers/perf/riscv_pmu.c
+++ b/drivers/perf/riscv_pmu.c
@@ -121,7 +121,7 @@ u64 riscv_pmu_event_update(struct perf_event *event)
 	return delta;
 }
 
-static void riscv_pmu_stop(struct perf_event *event, int flags)
+void riscv_pmu_stop(struct perf_event *event, int flags)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
@@ -175,7 +175,7 @@ int riscv_pmu_event_set_period(struct perf_event *event)
 	return overflow;
 }
 
-static void riscv_pmu_start(struct perf_event *event, int flags)
+void riscv_pmu_start(struct perf_event *event, int flags)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
index dca3537a8dcc..adc910e0f1e2 100644
--- a/drivers/perf/riscv_pmu_sbi.c
+++ b/drivers/perf/riscv_pmu_sbi.c
@@ -17,6 +17,7 @@
 #include <linux/irqdomain.h>
 #include <linux/of_irq.h>
 #include <linux/of.h>
+#include <linux/cpu_pm.h>
 
 #include <asm/sbi.h>
 #include <asm/hwcap.h>
@@ -693,6 +694,73 @@ static int pmu_sbi_setup_irqs(struct riscv_pmu *pmu, struct platform_device *pde
 	return 0;
 }
 
+#ifdef CONFIG_CPU_PM
+static int riscv_pm_pmu_notify(struct notifier_block *b, unsigned long cmd,
+				void *v)
+{
+	struct riscv_pmu *rvpmu = container_of(b, struct riscv_pmu, riscv_pm_nb);
+	struct cpu_hw_events *cpuc = this_cpu_ptr(rvpmu->hw_events);
+	int enabled = bitmap_weight(cpuc->used_hw_ctrs, RISCV_MAX_COUNTERS);
+	struct perf_event *event;
+	int idx;
+
+	if (!enabled)
+		return NOTIFY_OK;
+
+	for (idx = 0; idx < RISCV_MAX_COUNTERS; idx++) {
+		event = cpuc->events[idx];
+		if (!event)
+			continue;
+
+		switch (cmd) {
+		case CPU_PM_ENTER:
+			/*
+			 * Stop and update the counter
+			 */
+			riscv_pmu_stop(event, PERF_EF_UPDATE);
+			break;
+		case CPU_PM_EXIT:
+		case CPU_PM_ENTER_FAILED:
+			/*
+			 * Restore and enable the counter.
+			 *
+			 * Requires RCU read locking to be functional,
+			 * wrap the call within RCU_NONIDLE to make the
+			 * RCU subsystem aware this cpu is not idle from
+			 * an RCU perspective for the riscv_pmu_start() call
+			 * duration.
+			 */
+			RCU_NONIDLE(riscv_pmu_start(event, PERF_EF_RELOAD));
+			break;
+		default:
+			break;
+		}
+	}
+
+	return NOTIFY_OK;
+}
+
+static int riscv_pm_pmu_register(struct riscv_pmu *pmu)
+{
+	pmu->riscv_pm_nb.notifier_call = riscv_pm_pmu_notify;
+	return cpu_pm_register_notifier(&pmu->riscv_pm_nb);
+}
+
+static void riscv_pm_pmu_unregister(struct riscv_pmu *pmu)
+{
+	cpu_pm_unregister_notifier(&pmu->riscv_pm_nb);
+}
+#else
+static inline int riscv_pm_pmu_register(struct riscv_pmu *pmu) { return 0; }
+static inline void riscv_pm_pmu_unregister(struct riscv_pmu *pmu) { }
+#endif
+
+static void riscv_pmu_destroy(struct riscv_pmu *pmu)
+{
+	riscv_pm_pmu_unregister(pmu);
+	cpuhp_state_remove_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
+}
+
 static int pmu_sbi_device_probe(struct platform_device *pdev)
 {
 	struct riscv_pmu *pmu = NULL;
@@ -733,14 +801,19 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
+	ret = riscv_pm_pmu_register(pmu);
+	if (ret)
+		goto out_unregister;
+
 	ret = perf_pmu_register(&pmu->pmu, "cpu", PERF_TYPE_RAW);
-	if (ret) {
-		cpuhp_state_remove_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
-		return ret;
-	}
+	if (ret)
+		goto out_unregister;
 
 	return 0;
 
+out_unregister:
+	riscv_pmu_destroy(pmu);
+
 out_free:
 	kfree(pmu);
 	return ret;
diff --git a/include/linux/perf/riscv_pmu.h b/include/linux/perf/riscv_pmu.h
index 46f9b6fe306e..bf66fe011fa8 100644
--- a/include/linux/perf/riscv_pmu.h
+++ b/include/linux/perf/riscv_pmu.h
@@ -56,9 +56,13 @@ struct riscv_pmu {
 
 	struct cpu_hw_events	__percpu *hw_events;
 	struct hlist_node	node;
+	struct notifier_block   riscv_pm_nb;
 };
 
 #define to_riscv_pmu(p) (container_of(p, struct riscv_pmu, pmu))
+
+void riscv_pmu_start(struct perf_event *event, int flags);
+void riscv_pmu_stop(struct perf_event *event, int flags);
 unsigned long riscv_pmu_ctr_read_csr(unsigned long csr);
 int riscv_pmu_event_set_period(struct perf_event *event);
 uint64_t riscv_pmu_ctr_get_width_mask(struct perf_event *event);
-- 
cgit 


From 66637ab137b44914356a9dc7a9b3f8ebcf0b0695 Mon Sep 17 00:00:00 2001
From: Guangbin Huang <huangguangbin2@huawei.com>
Date: Tue, 28 Jun 2022 14:34:19 +0800
Subject: drivers/perf: hisi: add driver for HNS3 PMU

HNS3(HiSilicon Network System 3) PMU is RCiEP device in HiSilicon SoC NIC,
supports collection of performance statistics such as bandwidth, latency,
packet rate and interrupt rate.

NIC of each SICL has one PMU device for it. Driver registers each PMU
device to perf, and exports information of supported events, filter mode of
each event, bdf range, hardware clock frequency, identifier and so on via
sysfs.

Each PMU device has its own registers of control, counters and interrupt,
and it supports 8 hardware events, each hardward event has its own
registers for configuration, counters and interrupt.

Filter options contains:
config       - select event
port         - select physical port of nic
tc           - select tc(must be used with port)
func         - select PF/VF
queue        - select queue of PF/VF(must be used with func)
intr         - select interrupt number(must be used with func)
global       - select all functions of IO DIE

Signed-off-by: Guangbin Huang <huangguangbin2@huawei.com>
Reviewed-by: John Garry <john.garry@huawei.com>
Reviewed-by: Shaokun Zhang <zhangshaokun@hisilicon.com>
Link: https://lore.kernel.org/r/20220628063419.38514-3-huangguangbin2@huawei.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 MAINTAINERS                       |    6 +
 drivers/perf/hisilicon/Kconfig    |   10 +
 drivers/perf/hisilicon/Makefile   |    1 +
 drivers/perf/hisilicon/hns3_pmu.c | 1671 +++++++++++++++++++++++++++++++++++++
 include/linux/cpuhotplug.h        |    1 +
 5 files changed, 1689 insertions(+)
 create mode 100644 drivers/perf/hisilicon/hns3_pmu.c

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 3cf9842d9233..6fc770ed6292 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8944,6 +8944,12 @@ F:	Documentation/admin-guide/perf/hisi-pcie-pmu.rst
 F:	Documentation/admin-guide/perf/hisi-pmu.rst
 F:	drivers/perf/hisilicon
 
+HISILICON HNS3 PMU DRIVER
+M:	Guangbin Huang <huangguangbin2@huawei.com>
+S:	Supported
+F:	Documentation/admin-guide/perf/hns3-pmu.rst
+F:	drivers/perf/hisilicon/hns3_pmu.c
+
 HISILICON QM AND ZIP Controller DRIVER
 M:	Zhou Wang <wangzhou1@hisilicon.com>
 L:	linux-crypto@vger.kernel.org
diff --git a/drivers/perf/hisilicon/Kconfig b/drivers/perf/hisilicon/Kconfig
index 5546218b5598..171bfc1b6bc2 100644
--- a/drivers/perf/hisilicon/Kconfig
+++ b/drivers/perf/hisilicon/Kconfig
@@ -14,3 +14,13 @@ config HISI_PCIE_PMU
 	  RCiEP devices.
 	  Adds the PCIe PMU into perf events system for monitoring latency,
 	  bandwidth etc.
+
+config HNS3_PMU
+	tristate "HNS3 PERF PMU"
+	depends on ARM64 || COMPILE_TEST
+	depends on PCI
+	help
+	  Provide support for HNS3 performance monitoring unit (PMU) RCiEP
+	  devices.
+	  Adds the HNS3 PMU into perf events system for monitoring latency,
+	  bandwidth etc.
diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile
index 6be83517acaa..4d2c9abe3372 100644
--- a/drivers/perf/hisilicon/Makefile
+++ b/drivers/perf/hisilicon/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o \
 			  hisi_uncore_pa_pmu.o hisi_uncore_cpa_pmu.o
 
 obj-$(CONFIG_HISI_PCIE_PMU) += hisi_pcie_pmu.o
+obj-$(CONFIG_HNS3_PMU) += hns3_pmu.o
diff --git a/drivers/perf/hisilicon/hns3_pmu.c b/drivers/perf/hisilicon/hns3_pmu.c
new file mode 100644
index 000000000000..e0457d84af6b
--- /dev/null
+++ b/drivers/perf/hisilicon/hns3_pmu.c
@@ -0,0 +1,1671 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This driver adds support for HNS3 PMU iEP device. Related perf events are
+ * bandwidth, latency, packet rate, interrupt rate etc.
+ *
+ * Copyright (C) 2022 HiSilicon Limited
+ */
+#include <linux/bitfield.h>
+#include <linux/bitmap.h>
+#include <linux/bug.h>
+#include <linux/cpuhotplug.h>
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/iopoll.h>
+#include <linux/io-64-nonatomic-hi-lo.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/pci-epf.h>
+#include <linux/perf_event.h>
+#include <linux/smp.h>
+
+/* registers offset address */
+#define HNS3_PMU_REG_GLOBAL_CTRL		0x0000
+#define HNS3_PMU_REG_CLOCK_FREQ			0x0020
+#define HNS3_PMU_REG_BDF			0x0fe0
+#define HNS3_PMU_REG_VERSION			0x0fe4
+#define HNS3_PMU_REG_DEVICE_ID			0x0fe8
+
+#define HNS3_PMU_REG_EVENT_OFFSET		0x1000
+#define HNS3_PMU_REG_EVENT_SIZE			0x1000
+#define HNS3_PMU_REG_EVENT_CTRL_LOW		0x00
+#define HNS3_PMU_REG_EVENT_CTRL_HIGH		0x04
+#define HNS3_PMU_REG_EVENT_INTR_STATUS		0x08
+#define HNS3_PMU_REG_EVENT_INTR_MASK		0x0c
+#define HNS3_PMU_REG_EVENT_COUNTER		0x10
+#define HNS3_PMU_REG_EVENT_EXT_COUNTER		0x18
+#define HNS3_PMU_REG_EVENT_QID_CTRL		0x28
+#define HNS3_PMU_REG_EVENT_QID_PARA		0x2c
+
+#define HNS3_PMU_FILTER_SUPPORT_GLOBAL		BIT(0)
+#define HNS3_PMU_FILTER_SUPPORT_PORT		BIT(1)
+#define HNS3_PMU_FILTER_SUPPORT_PORT_TC		BIT(2)
+#define HNS3_PMU_FILTER_SUPPORT_FUNC		BIT(3)
+#define HNS3_PMU_FILTER_SUPPORT_FUNC_QUEUE	BIT(4)
+#define HNS3_PMU_FILTER_SUPPORT_FUNC_INTR	BIT(5)
+
+#define HNS3_PMU_FILTER_ALL_TC			0xf
+#define HNS3_PMU_FILTER_ALL_QUEUE		0xffff
+
+#define HNS3_PMU_CTRL_SUBEVENT_S		4
+#define HNS3_PMU_CTRL_FILTER_MODE_S		24
+
+#define HNS3_PMU_GLOBAL_START			BIT(0)
+
+#define HNS3_PMU_EVENT_STATUS_RESET		BIT(11)
+#define HNS3_PMU_EVENT_EN			BIT(12)
+#define HNS3_PMU_EVENT_OVERFLOW_RESTART		BIT(15)
+
+#define HNS3_PMU_QID_PARA_FUNC_S		0
+#define HNS3_PMU_QID_PARA_QUEUE_S		16
+
+#define HNS3_PMU_QID_CTRL_REQ_ENABLE		BIT(0)
+#define HNS3_PMU_QID_CTRL_DONE			BIT(1)
+#define HNS3_PMU_QID_CTRL_MISS			BIT(2)
+
+#define HNS3_PMU_INTR_MASK_OVERFLOW		BIT(1)
+
+#define HNS3_PMU_MAX_HW_EVENTS			8
+
+/*
+ * Each hardware event contains two registers (counter and ext_counter) for
+ * bandwidth, packet rate, latency and interrupt rate. These two registers will
+ * be triggered to run at the same when a hardware event is enabled. The meaning
+ * of counter and ext_counter of different event type are different, their
+ * meaning show as follow:
+ *
+ * +----------------+------------------+---------------+
+ * |   event type   |     counter      |  ext_counter  |
+ * +----------------+------------------+---------------+
+ * | bandwidth      | byte number      | cycle number  |
+ * +----------------+------------------+---------------+
+ * | packet rate    | packet number    | cycle number  |
+ * +----------------+------------------+---------------+
+ * | latency        | cycle number     | packet number |
+ * +----------------+------------------+---------------+
+ * | interrupt rate | interrupt number | cycle number  |
+ * +----------------+------------------+---------------+
+ *
+ * The cycle number indicates increment of counter of hardware timer, the
+ * frequency of hardware timer can be read from hw_clk_freq file.
+ *
+ * Performance of each hardware event is calculated by: counter / ext_counter.
+ *
+ * Since processing of data is preferred to be done in userspace, we expose
+ * ext_counter as a separate event for userspace and use bit 16 to indicate it.
+ * For example, event 0x00001 and 0x10001 are actually one event for hardware
+ * because bit 0-15 are same. If the bit 16 of one event is 0 means to read
+ * counter register, otherwise means to read ext_counter register.
+ */
+/* bandwidth events */
+#define HNS3_PMU_EVT_BW_SSU_EGU_BYTE_NUM		0x00001
+#define HNS3_PMU_EVT_BW_SSU_EGU_TIME			0x10001
+#define HNS3_PMU_EVT_BW_SSU_RPU_BYTE_NUM		0x00002
+#define HNS3_PMU_EVT_BW_SSU_RPU_TIME			0x10002
+#define HNS3_PMU_EVT_BW_SSU_ROCE_BYTE_NUM		0x00003
+#define HNS3_PMU_EVT_BW_SSU_ROCE_TIME			0x10003
+#define HNS3_PMU_EVT_BW_ROCE_SSU_BYTE_NUM		0x00004
+#define HNS3_PMU_EVT_BW_ROCE_SSU_TIME			0x10004
+#define HNS3_PMU_EVT_BW_TPU_SSU_BYTE_NUM		0x00005
+#define HNS3_PMU_EVT_BW_TPU_SSU_TIME			0x10005
+#define HNS3_PMU_EVT_BW_RPU_RCBRX_BYTE_NUM		0x00006
+#define HNS3_PMU_EVT_BW_RPU_RCBRX_TIME			0x10006
+#define HNS3_PMU_EVT_BW_RCBTX_TXSCH_BYTE_NUM		0x00008
+#define HNS3_PMU_EVT_BW_RCBTX_TXSCH_TIME		0x10008
+#define HNS3_PMU_EVT_BW_WR_FBD_BYTE_NUM			0x00009
+#define HNS3_PMU_EVT_BW_WR_FBD_TIME			0x10009
+#define HNS3_PMU_EVT_BW_WR_EBD_BYTE_NUM			0x0000a
+#define HNS3_PMU_EVT_BW_WR_EBD_TIME			0x1000a
+#define HNS3_PMU_EVT_BW_RD_FBD_BYTE_NUM			0x0000b
+#define HNS3_PMU_EVT_BW_RD_FBD_TIME			0x1000b
+#define HNS3_PMU_EVT_BW_RD_EBD_BYTE_NUM			0x0000c
+#define HNS3_PMU_EVT_BW_RD_EBD_TIME			0x1000c
+#define HNS3_PMU_EVT_BW_RD_PAY_M0_BYTE_NUM		0x0000d
+#define HNS3_PMU_EVT_BW_RD_PAY_M0_TIME			0x1000d
+#define HNS3_PMU_EVT_BW_RD_PAY_M1_BYTE_NUM		0x0000e
+#define HNS3_PMU_EVT_BW_RD_PAY_M1_TIME			0x1000e
+#define HNS3_PMU_EVT_BW_WR_PAY_M0_BYTE_NUM		0x0000f
+#define HNS3_PMU_EVT_BW_WR_PAY_M0_TIME			0x1000f
+#define HNS3_PMU_EVT_BW_WR_PAY_M1_BYTE_NUM		0x00010
+#define HNS3_PMU_EVT_BW_WR_PAY_M1_TIME			0x10010
+
+/* packet rate events */
+#define HNS3_PMU_EVT_PPS_IGU_SSU_PACKET_NUM		0x00100
+#define HNS3_PMU_EVT_PPS_IGU_SSU_TIME			0x10100
+#define HNS3_PMU_EVT_PPS_SSU_EGU_PACKET_NUM		0x00101
+#define HNS3_PMU_EVT_PPS_SSU_EGU_TIME			0x10101
+#define HNS3_PMU_EVT_PPS_SSU_RPU_PACKET_NUM		0x00102
+#define HNS3_PMU_EVT_PPS_SSU_RPU_TIME			0x10102
+#define HNS3_PMU_EVT_PPS_SSU_ROCE_PACKET_NUM		0x00103
+#define HNS3_PMU_EVT_PPS_SSU_ROCE_TIME			0x10103
+#define HNS3_PMU_EVT_PPS_ROCE_SSU_PACKET_NUM		0x00104
+#define HNS3_PMU_EVT_PPS_ROCE_SSU_TIME			0x10104
+#define HNS3_PMU_EVT_PPS_TPU_SSU_PACKET_NUM		0x00105
+#define HNS3_PMU_EVT_PPS_TPU_SSU_TIME			0x10105
+#define HNS3_PMU_EVT_PPS_RPU_RCBRX_PACKET_NUM		0x00106
+#define HNS3_PMU_EVT_PPS_RPU_RCBRX_TIME			0x10106
+#define HNS3_PMU_EVT_PPS_RCBTX_TPU_PACKET_NUM		0x00107
+#define HNS3_PMU_EVT_PPS_RCBTX_TPU_TIME			0x10107
+#define HNS3_PMU_EVT_PPS_RCBTX_TXSCH_PACKET_NUM		0x00108
+#define HNS3_PMU_EVT_PPS_RCBTX_TXSCH_TIME		0x10108
+#define HNS3_PMU_EVT_PPS_WR_FBD_PACKET_NUM		0x00109
+#define HNS3_PMU_EVT_PPS_WR_FBD_TIME			0x10109
+#define HNS3_PMU_EVT_PPS_WR_EBD_PACKET_NUM		0x0010a
+#define HNS3_PMU_EVT_PPS_WR_EBD_TIME			0x1010a
+#define HNS3_PMU_EVT_PPS_RD_FBD_PACKET_NUM		0x0010b
+#define HNS3_PMU_EVT_PPS_RD_FBD_TIME			0x1010b
+#define HNS3_PMU_EVT_PPS_RD_EBD_PACKET_NUM		0x0010c
+#define HNS3_PMU_EVT_PPS_RD_EBD_TIME			0x1010c
+#define HNS3_PMU_EVT_PPS_RD_PAY_M0_PACKET_NUM		0x0010d
+#define HNS3_PMU_EVT_PPS_RD_PAY_M0_TIME			0x1010d
+#define HNS3_PMU_EVT_PPS_RD_PAY_M1_PACKET_NUM		0x0010e
+#define HNS3_PMU_EVT_PPS_RD_PAY_M1_TIME			0x1010e
+#define HNS3_PMU_EVT_PPS_WR_PAY_M0_PACKET_NUM		0x0010f
+#define HNS3_PMU_EVT_PPS_WR_PAY_M0_TIME			0x1010f
+#define HNS3_PMU_EVT_PPS_WR_PAY_M1_PACKET_NUM		0x00110
+#define HNS3_PMU_EVT_PPS_WR_PAY_M1_TIME			0x10110
+#define HNS3_PMU_EVT_PPS_NICROH_TX_PRE_PACKET_NUM	0x00111
+#define HNS3_PMU_EVT_PPS_NICROH_TX_PRE_TIME		0x10111
+#define HNS3_PMU_EVT_PPS_NICROH_RX_PRE_PACKET_NUM	0x00112
+#define HNS3_PMU_EVT_PPS_NICROH_RX_PRE_TIME		0x10112
+
+/* latency events */
+#define HNS3_PMU_EVT_DLY_TX_PUSH_TIME			0x00202
+#define HNS3_PMU_EVT_DLY_TX_PUSH_PACKET_NUM		0x10202
+#define HNS3_PMU_EVT_DLY_TX_TIME			0x00204
+#define HNS3_PMU_EVT_DLY_TX_PACKET_NUM			0x10204
+#define HNS3_PMU_EVT_DLY_SSU_TX_NIC_TIME		0x00206
+#define HNS3_PMU_EVT_DLY_SSU_TX_NIC_PACKET_NUM		0x10206
+#define HNS3_PMU_EVT_DLY_SSU_TX_ROCE_TIME		0x00207
+#define HNS3_PMU_EVT_DLY_SSU_TX_ROCE_PACKET_NUM		0x10207
+#define HNS3_PMU_EVT_DLY_SSU_RX_NIC_TIME		0x00208
+#define HNS3_PMU_EVT_DLY_SSU_RX_NIC_PACKET_NUM		0x10208
+#define HNS3_PMU_EVT_DLY_SSU_RX_ROCE_TIME		0x00209
+#define HNS3_PMU_EVT_DLY_SSU_RX_ROCE_PACKET_NUM		0x10209
+#define HNS3_PMU_EVT_DLY_RPU_TIME			0x0020e
+#define HNS3_PMU_EVT_DLY_RPU_PACKET_NUM			0x1020e
+#define HNS3_PMU_EVT_DLY_TPU_TIME			0x0020f
+#define HNS3_PMU_EVT_DLY_TPU_PACKET_NUM			0x1020f
+#define HNS3_PMU_EVT_DLY_RPE_TIME			0x00210
+#define HNS3_PMU_EVT_DLY_RPE_PACKET_NUM			0x10210
+#define HNS3_PMU_EVT_DLY_TPE_TIME			0x00211
+#define HNS3_PMU_EVT_DLY_TPE_PACKET_NUM			0x10211
+#define HNS3_PMU_EVT_DLY_TPE_PUSH_TIME			0x00212
+#define HNS3_PMU_EVT_DLY_TPE_PUSH_PACKET_NUM		0x10212
+#define HNS3_PMU_EVT_DLY_WR_FBD_TIME			0x00213
+#define HNS3_PMU_EVT_DLY_WR_FBD_PACKET_NUM		0x10213
+#define HNS3_PMU_EVT_DLY_WR_EBD_TIME			0x00214
+#define HNS3_PMU_EVT_DLY_WR_EBD_PACKET_NUM		0x10214
+#define HNS3_PMU_EVT_DLY_RD_FBD_TIME			0x00215
+#define HNS3_PMU_EVT_DLY_RD_FBD_PACKET_NUM		0x10215
+#define HNS3_PMU_EVT_DLY_RD_EBD_TIME			0x00216
+#define HNS3_PMU_EVT_DLY_RD_EBD_PACKET_NUM		0x10216
+#define HNS3_PMU_EVT_DLY_RD_PAY_M0_TIME			0x00217
+#define HNS3_PMU_EVT_DLY_RD_PAY_M0_PACKET_NUM		0x10217
+#define HNS3_PMU_EVT_DLY_RD_PAY_M1_TIME			0x00218
+#define HNS3_PMU_EVT_DLY_RD_PAY_M1_PACKET_NUM		0x10218
+#define HNS3_PMU_EVT_DLY_WR_PAY_M0_TIME			0x00219
+#define HNS3_PMU_EVT_DLY_WR_PAY_M0_PACKET_NUM		0x10219
+#define HNS3_PMU_EVT_DLY_WR_PAY_M1_TIME			0x0021a
+#define HNS3_PMU_EVT_DLY_WR_PAY_M1_PACKET_NUM		0x1021a
+#define HNS3_PMU_EVT_DLY_MSIX_WRITE_TIME		0x0021c
+#define HNS3_PMU_EVT_DLY_MSIX_WRITE_PACKET_NUM		0x1021c
+
+/* interrupt rate events */
+#define HNS3_PMU_EVT_PPS_MSIX_NIC_INTR_NUM		0x00300
+#define HNS3_PMU_EVT_PPS_MSIX_NIC_TIME			0x10300
+
+/* filter mode supported by each bandwidth event */
+#define HNS3_PMU_FILTER_BW_SSU_EGU		0x07
+#define HNS3_PMU_FILTER_BW_SSU_RPU		0x1f
+#define HNS3_PMU_FILTER_BW_SSU_ROCE		0x0f
+#define HNS3_PMU_FILTER_BW_ROCE_SSU		0x0f
+#define HNS3_PMU_FILTER_BW_TPU_SSU		0x1f
+#define HNS3_PMU_FILTER_BW_RPU_RCBRX		0x11
+#define HNS3_PMU_FILTER_BW_RCBTX_TXSCH		0x11
+#define HNS3_PMU_FILTER_BW_WR_FBD		0x1b
+#define HNS3_PMU_FILTER_BW_WR_EBD		0x11
+#define HNS3_PMU_FILTER_BW_RD_FBD		0x01
+#define HNS3_PMU_FILTER_BW_RD_EBD		0x1b
+#define HNS3_PMU_FILTER_BW_RD_PAY_M0		0x01
+#define HNS3_PMU_FILTER_BW_RD_PAY_M1		0x01
+#define HNS3_PMU_FILTER_BW_WR_PAY_M0		0x01
+#define HNS3_PMU_FILTER_BW_WR_PAY_M1		0x01
+
+/* filter mode supported by each packet rate event */
+#define HNS3_PMU_FILTER_PPS_IGU_SSU		0x07
+#define HNS3_PMU_FILTER_PPS_SSU_EGU		0x07
+#define HNS3_PMU_FILTER_PPS_SSU_RPU		0x1f
+#define HNS3_PMU_FILTER_PPS_SSU_ROCE		0x0f
+#define HNS3_PMU_FILTER_PPS_ROCE_SSU		0x0f
+#define HNS3_PMU_FILTER_PPS_TPU_SSU		0x1f
+#define HNS3_PMU_FILTER_PPS_RPU_RCBRX		0x11
+#define HNS3_PMU_FILTER_PPS_RCBTX_TPU		0x1f
+#define HNS3_PMU_FILTER_PPS_RCBTX_TXSCH		0x11
+#define HNS3_PMU_FILTER_PPS_WR_FBD		0x1b
+#define HNS3_PMU_FILTER_PPS_WR_EBD		0x11
+#define HNS3_PMU_FILTER_PPS_RD_FBD		0x01
+#define HNS3_PMU_FILTER_PPS_RD_EBD		0x1b
+#define HNS3_PMU_FILTER_PPS_RD_PAY_M0		0x01
+#define HNS3_PMU_FILTER_PPS_RD_PAY_M1		0x01
+#define HNS3_PMU_FILTER_PPS_WR_PAY_M0		0x01
+#define HNS3_PMU_FILTER_PPS_WR_PAY_M1		0x01
+#define HNS3_PMU_FILTER_PPS_NICROH_TX_PRE	0x01
+#define HNS3_PMU_FILTER_PPS_NICROH_RX_PRE	0x01
+
+/* filter mode supported by each latency event */
+#define HNS3_PMU_FILTER_DLY_TX_PUSH		0x01
+#define HNS3_PMU_FILTER_DLY_TX			0x01
+#define HNS3_PMU_FILTER_DLY_SSU_TX_NIC		0x07
+#define HNS3_PMU_FILTER_DLY_SSU_TX_ROCE		0x07
+#define HNS3_PMU_FILTER_DLY_SSU_RX_NIC		0x07
+#define HNS3_PMU_FILTER_DLY_SSU_RX_ROCE		0x07
+#define HNS3_PMU_FILTER_DLY_RPU			0x11
+#define HNS3_PMU_FILTER_DLY_TPU			0x1f
+#define HNS3_PMU_FILTER_DLY_RPE			0x01
+#define HNS3_PMU_FILTER_DLY_TPE			0x0b
+#define HNS3_PMU_FILTER_DLY_TPE_PUSH		0x1b
+#define HNS3_PMU_FILTER_DLY_WR_FBD		0x1b
+#define HNS3_PMU_FILTER_DLY_WR_EBD		0x11
+#define HNS3_PMU_FILTER_DLY_RD_FBD		0x01
+#define HNS3_PMU_FILTER_DLY_RD_EBD		0x1b
+#define HNS3_PMU_FILTER_DLY_RD_PAY_M0		0x01
+#define HNS3_PMU_FILTER_DLY_RD_PAY_M1		0x01
+#define HNS3_PMU_FILTER_DLY_WR_PAY_M0		0x01
+#define HNS3_PMU_FILTER_DLY_WR_PAY_M1		0x01
+#define HNS3_PMU_FILTER_DLY_MSIX_WRITE		0x01
+
+/* filter mode supported by each interrupt rate event */
+#define HNS3_PMU_FILTER_INTR_MSIX_NIC		0x01
+
+enum hns3_pmu_hw_filter_mode {
+	HNS3_PMU_HW_FILTER_GLOBAL,
+	HNS3_PMU_HW_FILTER_PORT,
+	HNS3_PMU_HW_FILTER_PORT_TC,
+	HNS3_PMU_HW_FILTER_FUNC,
+	HNS3_PMU_HW_FILTER_FUNC_QUEUE,
+	HNS3_PMU_HW_FILTER_FUNC_INTR,
+};
+
+struct hns3_pmu_event_attr {
+	u32 event;
+	u16 filter_support;
+};
+
+struct hns3_pmu {
+	struct perf_event *hw_events[HNS3_PMU_MAX_HW_EVENTS];
+	struct hlist_node node;
+	struct pci_dev *pdev;
+	struct pmu pmu;
+	void __iomem *base;
+	int irq;
+	int on_cpu;
+	u32 identifier;
+	u32 hw_clk_freq; /* hardware clock frequency of PMU */
+	/* maximum and minimum bdf allowed by PMU */
+	u16 bdf_min;
+	u16 bdf_max;
+};
+
+#define to_hns3_pmu(p)  (container_of((p), struct hns3_pmu, pmu))
+
+#define GET_PCI_DEVFN(bdf)  ((bdf) & 0xff)
+
+#define FILTER_CONDITION_PORT(port) ((1 << (port)) & 0xff)
+#define FILTER_CONDITION_PORT_TC(port, tc) (((port) << 3) | ((tc) & 0x07))
+#define FILTER_CONDITION_FUNC_INTR(func, intr) (((intr) << 8) | (func))
+
+#define HNS3_PMU_FILTER_ATTR(_name, _config, _start, _end)               \
+	static inline u64 hns3_pmu_get_##_name(struct perf_event *event) \
+	{                                                                \
+		return FIELD_GET(GENMASK_ULL(_end, _start),              \
+				 event->attr._config);                   \
+	}
+
+HNS3_PMU_FILTER_ATTR(subevent, config, 0, 7);
+HNS3_PMU_FILTER_ATTR(event_type, config, 8, 15);
+HNS3_PMU_FILTER_ATTR(ext_counter_used, config, 16, 16);
+HNS3_PMU_FILTER_ATTR(port, config1, 0, 3);
+HNS3_PMU_FILTER_ATTR(tc, config1, 4, 7);
+HNS3_PMU_FILTER_ATTR(bdf, config1, 8, 23);
+HNS3_PMU_FILTER_ATTR(queue, config1, 24, 39);
+HNS3_PMU_FILTER_ATTR(intr, config1, 40, 51);
+HNS3_PMU_FILTER_ATTR(global, config1, 52, 52);
+
+#define HNS3_BW_EVT_BYTE_NUM(_name)	(&(struct hns3_pmu_event_attr) {\
+	HNS3_PMU_EVT_BW_##_name##_BYTE_NUM,				\
+	HNS3_PMU_FILTER_BW_##_name})
+#define HNS3_BW_EVT_TIME(_name)		(&(struct hns3_pmu_event_attr) {\
+	HNS3_PMU_EVT_BW_##_name##_TIME,					\
+	HNS3_PMU_FILTER_BW_##_name})
+#define HNS3_PPS_EVT_PACKET_NUM(_name)	(&(struct hns3_pmu_event_attr) {\
+	HNS3_PMU_EVT_PPS_##_name##_PACKET_NUM,				\
+	HNS3_PMU_FILTER_PPS_##_name})
+#define HNS3_PPS_EVT_TIME(_name)	(&(struct hns3_pmu_event_attr) {\
+	HNS3_PMU_EVT_PPS_##_name##_TIME,				\
+	HNS3_PMU_FILTER_PPS_##_name})
+#define HNS3_DLY_EVT_TIME(_name)	(&(struct hns3_pmu_event_attr) {\
+	HNS3_PMU_EVT_DLY_##_name##_TIME,				\
+	HNS3_PMU_FILTER_DLY_##_name})
+#define HNS3_DLY_EVT_PACKET_NUM(_name)	(&(struct hns3_pmu_event_attr) {\
+	HNS3_PMU_EVT_DLY_##_name##_PACKET_NUM,				\
+	HNS3_PMU_FILTER_DLY_##_name})
+#define HNS3_INTR_EVT_INTR_NUM(_name)	(&(struct hns3_pmu_event_attr) {\
+	HNS3_PMU_EVT_PPS_##_name##_INTR_NUM,				\
+	HNS3_PMU_FILTER_INTR_##_name})
+#define HNS3_INTR_EVT_TIME(_name)	(&(struct hns3_pmu_event_attr) {\
+	HNS3_PMU_EVT_PPS_##_name##_TIME,				\
+	HNS3_PMU_FILTER_INTR_##_name})
+
+static ssize_t hns3_pmu_format_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct dev_ext_attribute *eattr;
+
+	eattr = container_of(attr, struct dev_ext_attribute, attr);
+
+	return sysfs_emit(buf, "%s\n", (char *)eattr->var);
+}
+
+static ssize_t hns3_pmu_event_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct hns3_pmu_event_attr *event;
+	struct dev_ext_attribute *eattr;
+
+	eattr = container_of(attr, struct dev_ext_attribute, attr);
+	event = eattr->var;
+
+	return sysfs_emit(buf, "config=0x%x\n", event->event);
+}
+
+static ssize_t hns3_pmu_filter_mode_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	struct hns3_pmu_event_attr *event;
+	struct dev_ext_attribute *eattr;
+	int len;
+
+	eattr = container_of(attr, struct dev_ext_attribute, attr);
+	event = eattr->var;
+
+	len = sysfs_emit_at(buf, 0, "filter mode supported: ");
+	if (event->filter_support & HNS3_PMU_FILTER_SUPPORT_GLOBAL)
+		len += sysfs_emit_at(buf, len, "global ");
+	if (event->filter_support & HNS3_PMU_FILTER_SUPPORT_PORT)
+		len += sysfs_emit_at(buf, len, "port ");
+	if (event->filter_support & HNS3_PMU_FILTER_SUPPORT_PORT_TC)
+		len += sysfs_emit_at(buf, len, "port-tc ");
+	if (event->filter_support & HNS3_PMU_FILTER_SUPPORT_FUNC)
+		len += sysfs_emit_at(buf, len, "func ");
+	if (event->filter_support & HNS3_PMU_FILTER_SUPPORT_FUNC_QUEUE)
+		len += sysfs_emit_at(buf, len, "func-queue ");
+	if (event->filter_support & HNS3_PMU_FILTER_SUPPORT_FUNC_INTR)
+		len += sysfs_emit_at(buf, len, "func-intr ");
+
+	len += sysfs_emit_at(buf, len, "\n");
+
+	return len;
+}
+
+#define HNS3_PMU_ATTR(_name, _func, _config)				\
+	(&((struct dev_ext_attribute[]) {				\
+		{ __ATTR(_name, 0444, _func, NULL), (void *)_config }	\
+	})[0].attr.attr)
+
+#define HNS3_PMU_FORMAT_ATTR(_name, _format) \
+	HNS3_PMU_ATTR(_name, hns3_pmu_format_show, (void *)_format)
+#define HNS3_PMU_EVENT_ATTR(_name, _event) \
+	HNS3_PMU_ATTR(_name, hns3_pmu_event_show, (void *)_event)
+#define HNS3_PMU_FLT_MODE_ATTR(_name, _event) \
+	HNS3_PMU_ATTR(_name, hns3_pmu_filter_mode_show, (void *)_event)
+
+#define HNS3_PMU_BW_EVT_PAIR(_name, _macro) \
+	HNS3_PMU_EVENT_ATTR(_name##_byte_num, HNS3_BW_EVT_BYTE_NUM(_macro)), \
+	HNS3_PMU_EVENT_ATTR(_name##_time, HNS3_BW_EVT_TIME(_macro))
+#define HNS3_PMU_PPS_EVT_PAIR(_name, _macro) \
+	HNS3_PMU_EVENT_ATTR(_name##_packet_num, HNS3_PPS_EVT_PACKET_NUM(_macro)), \
+	HNS3_PMU_EVENT_ATTR(_name##_time, HNS3_PPS_EVT_TIME(_macro))
+#define HNS3_PMU_DLY_EVT_PAIR(_name, _macro) \
+	HNS3_PMU_EVENT_ATTR(_name##_time, HNS3_DLY_EVT_TIME(_macro)), \
+	HNS3_PMU_EVENT_ATTR(_name##_packet_num, HNS3_DLY_EVT_PACKET_NUM(_macro))
+#define HNS3_PMU_INTR_EVT_PAIR(_name, _macro) \
+	HNS3_PMU_EVENT_ATTR(_name##_intr_num, HNS3_INTR_EVT_INTR_NUM(_macro)), \
+	HNS3_PMU_EVENT_ATTR(_name##_time, HNS3_INTR_EVT_TIME(_macro))
+
+#define HNS3_PMU_BW_FLT_MODE_PAIR(_name, _macro) \
+	HNS3_PMU_FLT_MODE_ATTR(_name##_byte_num, HNS3_BW_EVT_BYTE_NUM(_macro)), \
+	HNS3_PMU_FLT_MODE_ATTR(_name##_time, HNS3_BW_EVT_TIME(_macro))
+#define HNS3_PMU_PPS_FLT_MODE_PAIR(_name, _macro) \
+	HNS3_PMU_FLT_MODE_ATTR(_name##_packet_num, HNS3_PPS_EVT_PACKET_NUM(_macro)), \
+	HNS3_PMU_FLT_MODE_ATTR(_name##_time, HNS3_PPS_EVT_TIME(_macro))
+#define HNS3_PMU_DLY_FLT_MODE_PAIR(_name, _macro) \
+	HNS3_PMU_FLT_MODE_ATTR(_name##_time, HNS3_DLY_EVT_TIME(_macro)), \
+	HNS3_PMU_FLT_MODE_ATTR(_name##_packet_num, HNS3_DLY_EVT_PACKET_NUM(_macro))
+#define HNS3_PMU_INTR_FLT_MODE_PAIR(_name, _macro) \
+	HNS3_PMU_FLT_MODE_ATTR(_name##_intr_num, HNS3_INTR_EVT_INTR_NUM(_macro)), \
+	HNS3_PMU_FLT_MODE_ATTR(_name##_time, HNS3_INTR_EVT_TIME(_macro))
+
+static u8 hns3_pmu_hw_filter_modes[] = {
+	HNS3_PMU_HW_FILTER_GLOBAL,
+	HNS3_PMU_HW_FILTER_PORT,
+	HNS3_PMU_HW_FILTER_PORT_TC,
+	HNS3_PMU_HW_FILTER_FUNC,
+	HNS3_PMU_HW_FILTER_FUNC_QUEUE,
+	HNS3_PMU_HW_FILTER_FUNC_INTR,
+};
+
+#define HNS3_PMU_SET_HW_FILTER(_hwc, _mode) \
+	((_hwc)->addr_filters = (void *)&hns3_pmu_hw_filter_modes[(_mode)])
+
+static ssize_t identifier_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	struct hns3_pmu *hns3_pmu = to_hns3_pmu(dev_get_drvdata(dev));
+
+	return sysfs_emit(buf, "0x%x\n", hns3_pmu->identifier);
+}
+static DEVICE_ATTR_RO(identifier);
+
+static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct hns3_pmu *hns3_pmu = to_hns3_pmu(dev_get_drvdata(dev));
+
+	return sysfs_emit(buf, "%d\n", hns3_pmu->on_cpu);
+}
+static DEVICE_ATTR_RO(cpumask);
+
+static ssize_t bdf_min_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct hns3_pmu *hns3_pmu = to_hns3_pmu(dev_get_drvdata(dev));
+	u16 bdf = hns3_pmu->bdf_min;
+
+	return sysfs_emit(buf, "%02x:%02x.%x\n", PCI_BUS_NUM(bdf),
+			  PCI_SLOT(bdf), PCI_FUNC(bdf));
+}
+static DEVICE_ATTR_RO(bdf_min);
+
+static ssize_t bdf_max_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct hns3_pmu *hns3_pmu = to_hns3_pmu(dev_get_drvdata(dev));
+	u16 bdf = hns3_pmu->bdf_max;
+
+	return sysfs_emit(buf, "%02x:%02x.%x\n", PCI_BUS_NUM(bdf),
+			  PCI_SLOT(bdf), PCI_FUNC(bdf));
+}
+static DEVICE_ATTR_RO(bdf_max);
+
+static ssize_t hw_clk_freq_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct hns3_pmu *hns3_pmu = to_hns3_pmu(dev_get_drvdata(dev));
+
+	return sysfs_emit(buf, "%u\n", hns3_pmu->hw_clk_freq);
+}
+static DEVICE_ATTR_RO(hw_clk_freq);
+
+static struct attribute *hns3_pmu_events_attr[] = {
+	/* bandwidth events */
+	HNS3_PMU_BW_EVT_PAIR(bw_ssu_egu, SSU_EGU),
+	HNS3_PMU_BW_EVT_PAIR(bw_ssu_rpu, SSU_RPU),
+	HNS3_PMU_BW_EVT_PAIR(bw_ssu_roce, SSU_ROCE),
+	HNS3_PMU_BW_EVT_PAIR(bw_roce_ssu, ROCE_SSU),
+	HNS3_PMU_BW_EVT_PAIR(bw_tpu_ssu, TPU_SSU),
+	HNS3_PMU_BW_EVT_PAIR(bw_rpu_rcbrx, RPU_RCBRX),
+	HNS3_PMU_BW_EVT_PAIR(bw_rcbtx_txsch, RCBTX_TXSCH),
+	HNS3_PMU_BW_EVT_PAIR(bw_wr_fbd, WR_FBD),
+	HNS3_PMU_BW_EVT_PAIR(bw_wr_ebd, WR_EBD),
+	HNS3_PMU_BW_EVT_PAIR(bw_rd_fbd, RD_FBD),
+	HNS3_PMU_BW_EVT_PAIR(bw_rd_ebd, RD_EBD),
+	HNS3_PMU_BW_EVT_PAIR(bw_rd_pay_m0, RD_PAY_M0),
+	HNS3_PMU_BW_EVT_PAIR(bw_rd_pay_m1, RD_PAY_M1),
+	HNS3_PMU_BW_EVT_PAIR(bw_wr_pay_m0, WR_PAY_M0),
+	HNS3_PMU_BW_EVT_PAIR(bw_wr_pay_m1, WR_PAY_M1),
+
+	/* packet rate events */
+	HNS3_PMU_PPS_EVT_PAIR(pps_igu_ssu, IGU_SSU),
+	HNS3_PMU_PPS_EVT_PAIR(pps_ssu_egu, SSU_EGU),
+	HNS3_PMU_PPS_EVT_PAIR(pps_ssu_rpu, SSU_RPU),
+	HNS3_PMU_PPS_EVT_PAIR(pps_ssu_roce, SSU_ROCE),
+	HNS3_PMU_PPS_EVT_PAIR(pps_roce_ssu, ROCE_SSU),
+	HNS3_PMU_PPS_EVT_PAIR(pps_tpu_ssu, TPU_SSU),
+	HNS3_PMU_PPS_EVT_PAIR(pps_rpu_rcbrx, RPU_RCBRX),
+	HNS3_PMU_PPS_EVT_PAIR(pps_rcbtx_tpu, RCBTX_TPU),
+	HNS3_PMU_PPS_EVT_PAIR(pps_rcbtx_txsch, RCBTX_TXSCH),
+	HNS3_PMU_PPS_EVT_PAIR(pps_wr_fbd, WR_FBD),
+	HNS3_PMU_PPS_EVT_PAIR(pps_wr_ebd, WR_EBD),
+	HNS3_PMU_PPS_EVT_PAIR(pps_rd_fbd, RD_FBD),
+	HNS3_PMU_PPS_EVT_PAIR(pps_rd_ebd, RD_EBD),
+	HNS3_PMU_PPS_EVT_PAIR(pps_rd_pay_m0, RD_PAY_M0),
+	HNS3_PMU_PPS_EVT_PAIR(pps_rd_pay_m1, RD_PAY_M1),
+	HNS3_PMU_PPS_EVT_PAIR(pps_wr_pay_m0, WR_PAY_M0),
+	HNS3_PMU_PPS_EVT_PAIR(pps_wr_pay_m1, WR_PAY_M1),
+	HNS3_PMU_PPS_EVT_PAIR(pps_intr_nicroh_tx_pre, NICROH_TX_PRE),
+	HNS3_PMU_PPS_EVT_PAIR(pps_intr_nicroh_rx_pre, NICROH_RX_PRE),
+
+	/* latency events */
+	HNS3_PMU_DLY_EVT_PAIR(dly_tx_push_to_mac, TX_PUSH),
+	HNS3_PMU_DLY_EVT_PAIR(dly_tx_normal_to_mac, TX),
+	HNS3_PMU_DLY_EVT_PAIR(dly_ssu_tx_th_nic, SSU_TX_NIC),
+	HNS3_PMU_DLY_EVT_PAIR(dly_ssu_tx_th_roce, SSU_TX_ROCE),
+	HNS3_PMU_DLY_EVT_PAIR(dly_ssu_rx_th_nic, SSU_RX_NIC),
+	HNS3_PMU_DLY_EVT_PAIR(dly_ssu_rx_th_roce, SSU_RX_ROCE),
+	HNS3_PMU_DLY_EVT_PAIR(dly_rpu, RPU),
+	HNS3_PMU_DLY_EVT_PAIR(dly_tpu, TPU),
+	HNS3_PMU_DLY_EVT_PAIR(dly_rpe, RPE),
+	HNS3_PMU_DLY_EVT_PAIR(dly_tpe_normal, TPE),
+	HNS3_PMU_DLY_EVT_PAIR(dly_tpe_push, TPE_PUSH),
+	HNS3_PMU_DLY_EVT_PAIR(dly_wr_fbd, WR_FBD),
+	HNS3_PMU_DLY_EVT_PAIR(dly_wr_ebd, WR_EBD),
+	HNS3_PMU_DLY_EVT_PAIR(dly_rd_fbd, RD_FBD),
+	HNS3_PMU_DLY_EVT_PAIR(dly_rd_ebd, RD_EBD),
+	HNS3_PMU_DLY_EVT_PAIR(dly_rd_pay_m0, RD_PAY_M0),
+	HNS3_PMU_DLY_EVT_PAIR(dly_rd_pay_m1, RD_PAY_M1),
+	HNS3_PMU_DLY_EVT_PAIR(dly_wr_pay_m0, WR_PAY_M0),
+	HNS3_PMU_DLY_EVT_PAIR(dly_wr_pay_m1, WR_PAY_M1),
+	HNS3_PMU_DLY_EVT_PAIR(dly_msix_write, MSIX_WRITE),
+
+	/* interrupt rate events */
+	HNS3_PMU_INTR_EVT_PAIR(pps_intr_msix_nic, MSIX_NIC),
+
+	NULL
+};
+
+static struct attribute *hns3_pmu_filter_mode_attr[] = {
+	/* bandwidth events */
+	HNS3_PMU_BW_FLT_MODE_PAIR(bw_ssu_egu, SSU_EGU),
+	HNS3_PMU_BW_FLT_MODE_PAIR(bw_ssu_rpu, SSU_RPU),
+	HNS3_PMU_BW_FLT_MODE_PAIR(bw_ssu_roce, SSU_ROCE),
+	HNS3_PMU_BW_FLT_MODE_PAIR(bw_roce_ssu, ROCE_SSU),
+	HNS3_PMU_BW_FLT_MODE_PAIR(bw_tpu_ssu, TPU_SSU),
+	HNS3_PMU_BW_FLT_MODE_PAIR(bw_rpu_rcbrx, RPU_RCBRX),
+	HNS3_PMU_BW_FLT_MODE_PAIR(bw_rcbtx_txsch, RCBTX_TXSCH),
+	HNS3_PMU_BW_FLT_MODE_PAIR(bw_wr_fbd, WR_FBD),
+	HNS3_PMU_BW_FLT_MODE_PAIR(bw_wr_ebd, WR_EBD),
+	HNS3_PMU_BW_FLT_MODE_PAIR(bw_rd_fbd, RD_FBD),
+	HNS3_PMU_BW_FLT_MODE_PAIR(bw_rd_ebd, RD_EBD),
+	HNS3_PMU_BW_FLT_MODE_PAIR(bw_rd_pay_m0, RD_PAY_M0),
+	HNS3_PMU_BW_FLT_MODE_PAIR(bw_rd_pay_m1, RD_PAY_M1),
+	HNS3_PMU_BW_FLT_MODE_PAIR(bw_wr_pay_m0, WR_PAY_M0),
+	HNS3_PMU_BW_FLT_MODE_PAIR(bw_wr_pay_m1, WR_PAY_M1),
+
+	/* packet rate events */
+	HNS3_PMU_PPS_FLT_MODE_PAIR(pps_igu_ssu, IGU_SSU),
+	HNS3_PMU_PPS_FLT_MODE_PAIR(pps_ssu_egu, SSU_EGU),
+	HNS3_PMU_PPS_FLT_MODE_PAIR(pps_ssu_rpu, SSU_RPU),
+	HNS3_PMU_PPS_FLT_MODE_PAIR(pps_ssu_roce, SSU_ROCE),
+	HNS3_PMU_PPS_FLT_MODE_PAIR(pps_roce_ssu, ROCE_SSU),
+	HNS3_PMU_PPS_FLT_MODE_PAIR(pps_tpu_ssu, TPU_SSU),
+	HNS3_PMU_PPS_FLT_MODE_PAIR(pps_rpu_rcbrx, RPU_RCBRX),
+	HNS3_PMU_PPS_FLT_MODE_PAIR(pps_rcbtx_tpu, RCBTX_TPU),
+	HNS3_PMU_PPS_FLT_MODE_PAIR(pps_rcbtx_txsch, RCBTX_TXSCH),
+	HNS3_PMU_PPS_FLT_MODE_PAIR(pps_wr_fbd, WR_FBD),
+	HNS3_PMU_PPS_FLT_MODE_PAIR(pps_wr_ebd, WR_EBD),
+	HNS3_PMU_PPS_FLT_MODE_PAIR(pps_rd_fbd, RD_FBD),
+	HNS3_PMU_PPS_FLT_MODE_PAIR(pps_rd_ebd, RD_EBD),
+	HNS3_PMU_PPS_FLT_MODE_PAIR(pps_rd_pay_m0, RD_PAY_M0),
+	HNS3_PMU_PPS_FLT_MODE_PAIR(pps_rd_pay_m1, RD_PAY_M1),
+	HNS3_PMU_PPS_FLT_MODE_PAIR(pps_wr_pay_m0, WR_PAY_M0),
+	HNS3_PMU_PPS_FLT_MODE_PAIR(pps_wr_pay_m1, WR_PAY_M1),
+	HNS3_PMU_PPS_FLT_MODE_PAIR(pps_intr_nicroh_tx_pre, NICROH_TX_PRE),
+	HNS3_PMU_PPS_FLT_MODE_PAIR(pps_intr_nicroh_rx_pre, NICROH_RX_PRE),
+
+	/* latency events */
+	HNS3_PMU_DLY_FLT_MODE_PAIR(dly_tx_push_to_mac, TX_PUSH),
+	HNS3_PMU_DLY_FLT_MODE_PAIR(dly_tx_normal_to_mac, TX),
+	HNS3_PMU_DLY_FLT_MODE_PAIR(dly_ssu_tx_th_nic, SSU_TX_NIC),
+	HNS3_PMU_DLY_FLT_MODE_PAIR(dly_ssu_tx_th_roce, SSU_TX_ROCE),
+	HNS3_PMU_DLY_FLT_MODE_PAIR(dly_ssu_rx_th_nic, SSU_RX_NIC),
+	HNS3_PMU_DLY_FLT_MODE_PAIR(dly_ssu_rx_th_roce, SSU_RX_ROCE),
+	HNS3_PMU_DLY_FLT_MODE_PAIR(dly_rpu, RPU),
+	HNS3_PMU_DLY_FLT_MODE_PAIR(dly_tpu, TPU),
+	HNS3_PMU_DLY_FLT_MODE_PAIR(dly_rpe, RPE),
+	HNS3_PMU_DLY_FLT_MODE_PAIR(dly_tpe_normal, TPE),
+	HNS3_PMU_DLY_FLT_MODE_PAIR(dly_tpe_push, TPE_PUSH),
+	HNS3_PMU_DLY_FLT_MODE_PAIR(dly_wr_fbd, WR_FBD),
+	HNS3_PMU_DLY_FLT_MODE_PAIR(dly_wr_ebd, WR_EBD),
+	HNS3_PMU_DLY_FLT_MODE_PAIR(dly_rd_fbd, RD_FBD),
+	HNS3_PMU_DLY_FLT_MODE_PAIR(dly_rd_ebd, RD_EBD),
+	HNS3_PMU_DLY_FLT_MODE_PAIR(dly_rd_pay_m0, RD_PAY_M0),
+	HNS3_PMU_DLY_FLT_MODE_PAIR(dly_rd_pay_m1, RD_PAY_M1),
+	HNS3_PMU_DLY_FLT_MODE_PAIR(dly_wr_pay_m0, WR_PAY_M0),
+	HNS3_PMU_DLY_FLT_MODE_PAIR(dly_wr_pay_m1, WR_PAY_M1),
+	HNS3_PMU_DLY_FLT_MODE_PAIR(dly_msix_write, MSIX_WRITE),
+
+	/* interrupt rate events */
+	HNS3_PMU_INTR_FLT_MODE_PAIR(pps_intr_msix_nic, MSIX_NIC),
+
+	NULL
+};
+
+static struct attribute_group hns3_pmu_events_group = {
+	.name = "events",
+	.attrs = hns3_pmu_events_attr,
+};
+
+static struct attribute_group hns3_pmu_filter_mode_group = {
+	.name = "filtermode",
+	.attrs = hns3_pmu_filter_mode_attr,
+};
+
+static struct attribute *hns3_pmu_format_attr[] = {
+	HNS3_PMU_FORMAT_ATTR(subevent, "config:0-7"),
+	HNS3_PMU_FORMAT_ATTR(event_type, "config:8-15"),
+	HNS3_PMU_FORMAT_ATTR(ext_counter_used, "config:16"),
+	HNS3_PMU_FORMAT_ATTR(port, "config1:0-3"),
+	HNS3_PMU_FORMAT_ATTR(tc, "config1:4-7"),
+	HNS3_PMU_FORMAT_ATTR(bdf, "config1:8-23"),
+	HNS3_PMU_FORMAT_ATTR(queue, "config1:24-39"),
+	HNS3_PMU_FORMAT_ATTR(intr, "config1:40-51"),
+	HNS3_PMU_FORMAT_ATTR(global, "config1:52"),
+	NULL
+};
+
+static struct attribute_group hns3_pmu_format_group = {
+	.name = "format",
+	.attrs = hns3_pmu_format_attr,
+};
+
+static struct attribute *hns3_pmu_cpumask_attrs[] = {
+	&dev_attr_cpumask.attr,
+	NULL
+};
+
+static struct attribute_group hns3_pmu_cpumask_attr_group = {
+	.attrs = hns3_pmu_cpumask_attrs,
+};
+
+static struct attribute *hns3_pmu_identifier_attrs[] = {
+	&dev_attr_identifier.attr,
+	NULL
+};
+
+static struct attribute_group hns3_pmu_identifier_attr_group = {
+	.attrs = hns3_pmu_identifier_attrs,
+};
+
+static struct attribute *hns3_pmu_bdf_range_attrs[] = {
+	&dev_attr_bdf_min.attr,
+	&dev_attr_bdf_max.attr,
+	NULL
+};
+
+static struct attribute_group hns3_pmu_bdf_range_attr_group = {
+	.attrs = hns3_pmu_bdf_range_attrs,
+};
+
+static struct attribute *hns3_pmu_hw_clk_freq_attrs[] = {
+	&dev_attr_hw_clk_freq.attr,
+	NULL
+};
+
+static struct attribute_group hns3_pmu_hw_clk_freq_attr_group = {
+	.attrs = hns3_pmu_hw_clk_freq_attrs,
+};
+
+static const struct attribute_group *hns3_pmu_attr_groups[] = {
+	&hns3_pmu_events_group,
+	&hns3_pmu_filter_mode_group,
+	&hns3_pmu_format_group,
+	&hns3_pmu_cpumask_attr_group,
+	&hns3_pmu_identifier_attr_group,
+	&hns3_pmu_bdf_range_attr_group,
+	&hns3_pmu_hw_clk_freq_attr_group,
+	NULL
+};
+
+static u32 hns3_pmu_get_event(struct perf_event *event)
+{
+	return hns3_pmu_get_ext_counter_used(event) << 16 |
+	       hns3_pmu_get_event_type(event) << 8 |
+	       hns3_pmu_get_subevent(event);
+}
+
+static u32 hns3_pmu_get_real_event(struct perf_event *event)
+{
+	return hns3_pmu_get_event_type(event) << 8 |
+	       hns3_pmu_get_subevent(event);
+}
+
+static u32 hns3_pmu_get_offset(u32 offset, u32 idx)
+{
+	return offset + HNS3_PMU_REG_EVENT_OFFSET +
+	       HNS3_PMU_REG_EVENT_SIZE * idx;
+}
+
+static u32 hns3_pmu_readl(struct hns3_pmu *hns3_pmu, u32 reg_offset, u32 idx)
+{
+	u32 offset = hns3_pmu_get_offset(reg_offset, idx);
+
+	return readl(hns3_pmu->base + offset);
+}
+
+static void hns3_pmu_writel(struct hns3_pmu *hns3_pmu, u32 reg_offset, u32 idx,
+			    u32 val)
+{
+	u32 offset = hns3_pmu_get_offset(reg_offset, idx);
+
+	writel(val, hns3_pmu->base + offset);
+}
+
+static u64 hns3_pmu_readq(struct hns3_pmu *hns3_pmu, u32 reg_offset, u32 idx)
+{
+	u32 offset = hns3_pmu_get_offset(reg_offset, idx);
+
+	return readq(hns3_pmu->base + offset);
+}
+
+static void hns3_pmu_writeq(struct hns3_pmu *hns3_pmu, u32 reg_offset, u32 idx,
+			    u64 val)
+{
+	u32 offset = hns3_pmu_get_offset(reg_offset, idx);
+
+	writeq(val, hns3_pmu->base + offset);
+}
+
+static bool hns3_pmu_cmp_event(struct perf_event *target,
+			       struct perf_event *event)
+{
+	return hns3_pmu_get_real_event(target) == hns3_pmu_get_real_event(event);
+}
+
+static int hns3_pmu_find_related_event_idx(struct hns3_pmu *hns3_pmu,
+					   struct perf_event *event)
+{
+	struct perf_event *sibling;
+	int hw_event_used = 0;
+	int idx;
+
+	for (idx = 0; idx < HNS3_PMU_MAX_HW_EVENTS; idx++) {
+		sibling = hns3_pmu->hw_events[idx];
+		if (!sibling)
+			continue;
+
+		hw_event_used++;
+
+		if (!hns3_pmu_cmp_event(sibling, event))
+			continue;
+
+		/* Related events is used in group */
+		if (sibling->group_leader == event->group_leader)
+			return idx;
+	}
+
+	/* No related event and all hardware events are used up */
+	if (hw_event_used >= HNS3_PMU_MAX_HW_EVENTS)
+		return -EBUSY;
+
+	/* No related event and there is extra hardware events can be use */
+	return -ENOENT;
+}
+
+static int hns3_pmu_get_event_idx(struct hns3_pmu *hns3_pmu)
+{
+	int idx;
+
+	for (idx = 0; idx < HNS3_PMU_MAX_HW_EVENTS; idx++) {
+		if (!hns3_pmu->hw_events[idx])
+			return idx;
+	}
+
+	return -EBUSY;
+}
+
+static bool hns3_pmu_valid_bdf(struct hns3_pmu *hns3_pmu, u16 bdf)
+{
+	struct pci_dev *pdev;
+
+	if (bdf < hns3_pmu->bdf_min || bdf > hns3_pmu->bdf_max) {
+		pci_err(hns3_pmu->pdev, "Invalid EP device: %#x!\n", bdf);
+		return false;
+	}
+
+	pdev = pci_get_domain_bus_and_slot(pci_domain_nr(hns3_pmu->pdev->bus),
+					   PCI_BUS_NUM(bdf),
+					   GET_PCI_DEVFN(bdf));
+	if (!pdev) {
+		pci_err(hns3_pmu->pdev, "Nonexistent EP device: %#x!\n", bdf);
+		return false;
+	}
+
+	pci_dev_put(pdev);
+	return true;
+}
+
+static void hns3_pmu_set_qid_para(struct hns3_pmu *hns3_pmu, u32 idx, u16 bdf,
+				  u16 queue)
+{
+	u32 val;
+
+	val = GET_PCI_DEVFN(bdf);
+	val |= (u32)queue << HNS3_PMU_QID_PARA_QUEUE_S;
+	hns3_pmu_writel(hns3_pmu, HNS3_PMU_REG_EVENT_QID_PARA, idx, val);
+}
+
+static bool hns3_pmu_qid_req_start(struct hns3_pmu *hns3_pmu, u32 idx)
+{
+	bool queue_id_valid = false;
+	u32 reg_qid_ctrl, val;
+	int err;
+
+	/* enable queue id request */
+	hns3_pmu_writel(hns3_pmu, HNS3_PMU_REG_EVENT_QID_CTRL, idx,
+			HNS3_PMU_QID_CTRL_REQ_ENABLE);
+
+	reg_qid_ctrl = hns3_pmu_get_offset(HNS3_PMU_REG_EVENT_QID_CTRL, idx);
+	err = readl_poll_timeout(hns3_pmu->base + reg_qid_ctrl, val,
+				 val & HNS3_PMU_QID_CTRL_DONE, 1, 1000);
+	if (err == -ETIMEDOUT) {
+		pci_err(hns3_pmu->pdev, "QID request timeout!\n");
+		goto out;
+	}
+
+	queue_id_valid = !(val & HNS3_PMU_QID_CTRL_MISS);
+
+out:
+	/* disable qid request and clear status */
+	hns3_pmu_writel(hns3_pmu, HNS3_PMU_REG_EVENT_QID_CTRL, idx, 0);
+
+	return queue_id_valid;
+}
+
+static bool hns3_pmu_valid_queue(struct hns3_pmu *hns3_pmu, u32 idx, u16 bdf,
+				 u16 queue)
+{
+	hns3_pmu_set_qid_para(hns3_pmu, idx, bdf, queue);
+
+	return hns3_pmu_qid_req_start(hns3_pmu, idx);
+}
+
+static struct hns3_pmu_event_attr *hns3_pmu_get_pmu_event(u32 event)
+{
+	struct hns3_pmu_event_attr *pmu_event;
+	struct dev_ext_attribute *eattr;
+	struct device_attribute *dattr;
+	struct attribute *attr;
+	u32 i;
+
+	for (i = 0; i < ARRAY_SIZE(hns3_pmu_events_attr) - 1; i++) {
+		attr = hns3_pmu_events_attr[i];
+		dattr = container_of(attr, struct device_attribute, attr);
+		eattr = container_of(dattr, struct dev_ext_attribute, attr);
+		pmu_event = eattr->var;
+
+		if (event == pmu_event->event)
+			return pmu_event;
+	}
+
+	return NULL;
+}
+
+static int hns3_pmu_set_func_mode(struct perf_event *event,
+				  struct hns3_pmu *hns3_pmu)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	u16 bdf = hns3_pmu_get_bdf(event);
+
+	if (!hns3_pmu_valid_bdf(hns3_pmu, bdf))
+		return -ENOENT;
+
+	HNS3_PMU_SET_HW_FILTER(hwc, HNS3_PMU_HW_FILTER_FUNC);
+
+	return 0;
+}
+
+static int hns3_pmu_set_func_queue_mode(struct perf_event *event,
+					struct hns3_pmu *hns3_pmu)
+{
+	u16 queue_id = hns3_pmu_get_queue(event);
+	struct hw_perf_event *hwc = &event->hw;
+	u16 bdf = hns3_pmu_get_bdf(event);
+
+	if (!hns3_pmu_valid_bdf(hns3_pmu, bdf))
+		return -ENOENT;
+
+	if (!hns3_pmu_valid_queue(hns3_pmu, hwc->idx, bdf, queue_id)) {
+		pci_err(hns3_pmu->pdev, "Invalid queue: %u\n", queue_id);
+		return -ENOENT;
+	}
+
+	HNS3_PMU_SET_HW_FILTER(hwc, HNS3_PMU_HW_FILTER_FUNC_QUEUE);
+
+	return 0;
+}
+
+static bool
+hns3_pmu_is_enabled_global_mode(struct perf_event *event,
+				struct hns3_pmu_event_attr *pmu_event)
+{
+	u8 global = hns3_pmu_get_global(event);
+
+	if (!(pmu_event->filter_support & HNS3_PMU_FILTER_SUPPORT_GLOBAL))
+		return false;
+
+	return global;
+}
+
+static bool hns3_pmu_is_enabled_func_mode(struct perf_event *event,
+					  struct hns3_pmu_event_attr *pmu_event)
+{
+	u16 queue_id = hns3_pmu_get_queue(event);
+	u16 bdf = hns3_pmu_get_bdf(event);
+
+	if (!(pmu_event->filter_support & HNS3_PMU_FILTER_SUPPORT_FUNC))
+		return false;
+	else if (queue_id != HNS3_PMU_FILTER_ALL_QUEUE)
+		return false;
+
+	return bdf;
+}
+
+static bool
+hns3_pmu_is_enabled_func_queue_mode(struct perf_event *event,
+				    struct hns3_pmu_event_attr *pmu_event)
+{
+	u16 queue_id = hns3_pmu_get_queue(event);
+	u16 bdf = hns3_pmu_get_bdf(event);
+
+	if (!(pmu_event->filter_support & HNS3_PMU_FILTER_SUPPORT_FUNC_QUEUE))
+		return false;
+	else if (queue_id == HNS3_PMU_FILTER_ALL_QUEUE)
+		return false;
+
+	return bdf;
+}
+
+static bool hns3_pmu_is_enabled_port_mode(struct perf_event *event,
+					  struct hns3_pmu_event_attr *pmu_event)
+{
+	u8 tc_id = hns3_pmu_get_tc(event);
+
+	if (!(pmu_event->filter_support & HNS3_PMU_FILTER_SUPPORT_PORT))
+		return false;
+
+	return tc_id == HNS3_PMU_FILTER_ALL_TC;
+}
+
+static bool
+hns3_pmu_is_enabled_port_tc_mode(struct perf_event *event,
+				 struct hns3_pmu_event_attr *pmu_event)
+{
+	u8 tc_id = hns3_pmu_get_tc(event);
+
+	if (!(pmu_event->filter_support & HNS3_PMU_FILTER_SUPPORT_PORT_TC))
+		return false;
+
+	return tc_id != HNS3_PMU_FILTER_ALL_TC;
+}
+
+static bool
+hns3_pmu_is_enabled_func_intr_mode(struct perf_event *event,
+				   struct hns3_pmu *hns3_pmu,
+				   struct hns3_pmu_event_attr *pmu_event)
+{
+	u16 bdf = hns3_pmu_get_bdf(event);
+
+	if (!(pmu_event->filter_support & HNS3_PMU_FILTER_SUPPORT_FUNC_INTR))
+		return false;
+
+	return hns3_pmu_valid_bdf(hns3_pmu, bdf);
+}
+
+static int hns3_pmu_select_filter_mode(struct perf_event *event,
+				       struct hns3_pmu *hns3_pmu)
+{
+	u32 event_id = hns3_pmu_get_event(event);
+	struct hw_perf_event *hwc = &event->hw;
+	struct hns3_pmu_event_attr *pmu_event;
+
+	pmu_event = hns3_pmu_get_pmu_event(event_id);
+	if (!pmu_event) {
+		pci_err(hns3_pmu->pdev, "Invalid pmu event\n");
+		return -ENOENT;
+	}
+
+	if (hns3_pmu_is_enabled_global_mode(event, pmu_event)) {
+		HNS3_PMU_SET_HW_FILTER(hwc, HNS3_PMU_HW_FILTER_GLOBAL);
+		return 0;
+	}
+
+	if (hns3_pmu_is_enabled_func_mode(event, pmu_event))
+		return hns3_pmu_set_func_mode(event, hns3_pmu);
+
+	if (hns3_pmu_is_enabled_func_queue_mode(event, pmu_event))
+		return hns3_pmu_set_func_queue_mode(event, hns3_pmu);
+
+	if (hns3_pmu_is_enabled_port_mode(event, pmu_event)) {
+		HNS3_PMU_SET_HW_FILTER(hwc, HNS3_PMU_HW_FILTER_PORT);
+		return 0;
+	}
+
+	if (hns3_pmu_is_enabled_port_tc_mode(event, pmu_event)) {
+		HNS3_PMU_SET_HW_FILTER(hwc, HNS3_PMU_HW_FILTER_PORT_TC);
+		return 0;
+	}
+
+	if (hns3_pmu_is_enabled_func_intr_mode(event, hns3_pmu, pmu_event)) {
+		HNS3_PMU_SET_HW_FILTER(hwc, HNS3_PMU_HW_FILTER_FUNC_INTR);
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+static bool hns3_pmu_validate_event_group(struct perf_event *event)
+{
+	struct perf_event *sibling, *leader = event->group_leader;
+	struct perf_event *event_group[HNS3_PMU_MAX_HW_EVENTS];
+	int counters = 1;
+	int num;
+
+	event_group[0] = leader;
+	if (!is_software_event(leader)) {
+		if (leader->pmu != event->pmu)
+			return false;
+
+		if (leader != event && !hns3_pmu_cmp_event(leader, event))
+			event_group[counters++] = event;
+	}
+
+	for_each_sibling_event(sibling, event->group_leader) {
+		if (is_software_event(sibling))
+			continue;
+
+		if (sibling->pmu != event->pmu)
+			return false;
+
+		for (num = 0; num < counters; num++) {
+			if (hns3_pmu_cmp_event(event_group[num], sibling))
+				break;
+		}
+
+		if (num == counters)
+			event_group[counters++] = sibling;
+	}
+
+	return counters <= HNS3_PMU_MAX_HW_EVENTS;
+}
+
+static u32 hns3_pmu_get_filter_condition(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	u16 intr_id = hns3_pmu_get_intr(event);
+	u8 port_id = hns3_pmu_get_port(event);
+	u16 bdf = hns3_pmu_get_bdf(event);
+	u8 tc_id = hns3_pmu_get_tc(event);
+	u8 filter_mode;
+
+	filter_mode = *(u8 *)hwc->addr_filters;
+	switch (filter_mode) {
+	case HNS3_PMU_HW_FILTER_PORT:
+		return FILTER_CONDITION_PORT(port_id);
+	case HNS3_PMU_HW_FILTER_PORT_TC:
+		return FILTER_CONDITION_PORT_TC(port_id, tc_id);
+	case HNS3_PMU_HW_FILTER_FUNC:
+	case HNS3_PMU_HW_FILTER_FUNC_QUEUE:
+		return GET_PCI_DEVFN(bdf);
+	case HNS3_PMU_HW_FILTER_FUNC_INTR:
+		return FILTER_CONDITION_FUNC_INTR(GET_PCI_DEVFN(bdf), intr_id);
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static void hns3_pmu_config_filter(struct perf_event *event)
+{
+	struct hns3_pmu *hns3_pmu = to_hns3_pmu(event->pmu);
+	u8 event_type = hns3_pmu_get_event_type(event);
+	u8 subevent_id = hns3_pmu_get_subevent(event);
+	u16 queue_id = hns3_pmu_get_queue(event);
+	struct hw_perf_event *hwc = &event->hw;
+	u8 filter_mode = *(u8 *)hwc->addr_filters;
+	u16 bdf = hns3_pmu_get_bdf(event);
+	u32 idx = hwc->idx;
+	u32 val;
+
+	val = event_type;
+	val |= subevent_id << HNS3_PMU_CTRL_SUBEVENT_S;
+	val |= filter_mode << HNS3_PMU_CTRL_FILTER_MODE_S;
+	val |= HNS3_PMU_EVENT_OVERFLOW_RESTART;
+	hns3_pmu_writel(hns3_pmu, HNS3_PMU_REG_EVENT_CTRL_LOW, idx, val);
+
+	val = hns3_pmu_get_filter_condition(event);
+	hns3_pmu_writel(hns3_pmu, HNS3_PMU_REG_EVENT_CTRL_HIGH, idx, val);
+
+	if (filter_mode == HNS3_PMU_HW_FILTER_FUNC_QUEUE)
+		hns3_pmu_set_qid_para(hns3_pmu, idx, bdf, queue_id);
+}
+
+static void hns3_pmu_enable_counter(struct hns3_pmu *hns3_pmu,
+				    struct hw_perf_event *hwc)
+{
+	u32 idx = hwc->idx;
+	u32 val;
+
+	val = hns3_pmu_readl(hns3_pmu, HNS3_PMU_REG_EVENT_CTRL_LOW, idx);
+	val |= HNS3_PMU_EVENT_EN;
+	hns3_pmu_writel(hns3_pmu, HNS3_PMU_REG_EVENT_CTRL_LOW, idx, val);
+}
+
+static void hns3_pmu_disable_counter(struct hns3_pmu *hns3_pmu,
+				     struct hw_perf_event *hwc)
+{
+	u32 idx = hwc->idx;
+	u32 val;
+
+	val = hns3_pmu_readl(hns3_pmu, HNS3_PMU_REG_EVENT_CTRL_LOW, idx);
+	val &= ~HNS3_PMU_EVENT_EN;
+	hns3_pmu_writel(hns3_pmu, HNS3_PMU_REG_EVENT_CTRL_LOW, idx, val);
+}
+
+static void hns3_pmu_enable_intr(struct hns3_pmu *hns3_pmu,
+				 struct hw_perf_event *hwc)
+{
+	u32 idx = hwc->idx;
+	u32 val;
+
+	val = hns3_pmu_readl(hns3_pmu, HNS3_PMU_REG_EVENT_INTR_MASK, idx);
+	val &= ~HNS3_PMU_INTR_MASK_OVERFLOW;
+	hns3_pmu_writel(hns3_pmu, HNS3_PMU_REG_EVENT_INTR_MASK, idx, val);
+}
+
+static void hns3_pmu_disable_intr(struct hns3_pmu *hns3_pmu,
+				  struct hw_perf_event *hwc)
+{
+	u32 idx = hwc->idx;
+	u32 val;
+
+	val = hns3_pmu_readl(hns3_pmu, HNS3_PMU_REG_EVENT_INTR_MASK, idx);
+	val |= HNS3_PMU_INTR_MASK_OVERFLOW;
+	hns3_pmu_writel(hns3_pmu, HNS3_PMU_REG_EVENT_INTR_MASK, idx, val);
+}
+
+static void hns3_pmu_clear_intr_status(struct hns3_pmu *hns3_pmu, u32 idx)
+{
+	u32 val;
+
+	val = hns3_pmu_readl(hns3_pmu, HNS3_PMU_REG_EVENT_CTRL_LOW, idx);
+	val |= HNS3_PMU_EVENT_STATUS_RESET;
+	hns3_pmu_writel(hns3_pmu, HNS3_PMU_REG_EVENT_CTRL_LOW, idx, val);
+
+	val = hns3_pmu_readl(hns3_pmu, HNS3_PMU_REG_EVENT_CTRL_LOW, idx);
+	val &= ~HNS3_PMU_EVENT_STATUS_RESET;
+	hns3_pmu_writel(hns3_pmu, HNS3_PMU_REG_EVENT_CTRL_LOW, idx, val);
+}
+
+static u64 hns3_pmu_read_counter(struct perf_event *event)
+{
+	struct hns3_pmu *hns3_pmu = to_hns3_pmu(event->pmu);
+
+	return hns3_pmu_readq(hns3_pmu, event->hw.event_base, event->hw.idx);
+}
+
+static void hns3_pmu_write_counter(struct perf_event *event, u64 value)
+{
+	struct hns3_pmu *hns3_pmu = to_hns3_pmu(event->pmu);
+	u32 idx = event->hw.idx;
+
+	hns3_pmu_writeq(hns3_pmu, HNS3_PMU_REG_EVENT_COUNTER, idx, value);
+	hns3_pmu_writeq(hns3_pmu, HNS3_PMU_REG_EVENT_EXT_COUNTER, idx, value);
+}
+
+static void hns3_pmu_init_counter(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	local64_set(&hwc->prev_count, 0);
+	hns3_pmu_write_counter(event, 0);
+}
+
+static int hns3_pmu_event_init(struct perf_event *event)
+{
+	struct hns3_pmu *hns3_pmu = to_hns3_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx;
+	int ret;
+
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	/* Sampling is not supported */
+	if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
+		return -EOPNOTSUPP;
+
+	event->cpu = hns3_pmu->on_cpu;
+
+	idx = hns3_pmu_get_event_idx(hns3_pmu);
+	if (idx < 0) {
+		pci_err(hns3_pmu->pdev, "Up to %u events are supported!\n",
+			HNS3_PMU_MAX_HW_EVENTS);
+		return -EBUSY;
+	}
+
+	hwc->idx = idx;
+
+	ret = hns3_pmu_select_filter_mode(event, hns3_pmu);
+	if (ret) {
+		pci_err(hns3_pmu->pdev, "Invalid filter, ret = %d.\n", ret);
+		return ret;
+	}
+
+	if (!hns3_pmu_validate_event_group(event)) {
+		pci_err(hns3_pmu->pdev, "Invalid event group.\n");
+		return -EINVAL;
+	}
+
+	if (hns3_pmu_get_ext_counter_used(event))
+		hwc->event_base = HNS3_PMU_REG_EVENT_EXT_COUNTER;
+	else
+		hwc->event_base = HNS3_PMU_REG_EVENT_COUNTER;
+
+	return 0;
+}
+
+static void hns3_pmu_read(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	u64 new_cnt, prev_cnt, delta;
+
+	do {
+		prev_cnt = local64_read(&hwc->prev_count);
+		new_cnt = hns3_pmu_read_counter(event);
+	} while (local64_cmpxchg(&hwc->prev_count, prev_cnt, new_cnt) !=
+		 prev_cnt);
+
+	delta = new_cnt - prev_cnt;
+	local64_add(delta, &event->count);
+}
+
+static void hns3_pmu_start(struct perf_event *event, int flags)
+{
+	struct hns3_pmu *hns3_pmu = to_hns3_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+		return;
+
+	WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+	hwc->state = 0;
+
+	hns3_pmu_config_filter(event);
+	hns3_pmu_init_counter(event);
+	hns3_pmu_enable_intr(hns3_pmu, hwc);
+	hns3_pmu_enable_counter(hns3_pmu, hwc);
+
+	perf_event_update_userpage(event);
+}
+
+static void hns3_pmu_stop(struct perf_event *event, int flags)
+{
+	struct hns3_pmu *hns3_pmu = to_hns3_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+
+	hns3_pmu_disable_counter(hns3_pmu, hwc);
+	hns3_pmu_disable_intr(hns3_pmu, hwc);
+
+	WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+	hwc->state |= PERF_HES_STOPPED;
+
+	if (hwc->state & PERF_HES_UPTODATE)
+		return;
+
+	/* Read hardware counter and update the perf counter statistics */
+	hns3_pmu_read(event);
+	hwc->state |= PERF_HES_UPTODATE;
+}
+
+static int hns3_pmu_add(struct perf_event *event, int flags)
+{
+	struct hns3_pmu *hns3_pmu = to_hns3_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx;
+
+	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
+	/* Check all working events to find a related event. */
+	idx = hns3_pmu_find_related_event_idx(hns3_pmu, event);
+	if (idx < 0 && idx != -ENOENT)
+		return idx;
+
+	/* Current event shares an enabled hardware event with related event */
+	if (idx >= 0 && idx < HNS3_PMU_MAX_HW_EVENTS) {
+		hwc->idx = idx;
+		goto start_count;
+	}
+
+	idx = hns3_pmu_get_event_idx(hns3_pmu);
+	if (idx < 0)
+		return idx;
+
+	hwc->idx = idx;
+	hns3_pmu->hw_events[idx] = event;
+
+start_count:
+	if (flags & PERF_EF_START)
+		hns3_pmu_start(event, PERF_EF_RELOAD);
+
+	return 0;
+}
+
+static void hns3_pmu_del(struct perf_event *event, int flags)
+{
+	struct hns3_pmu *hns3_pmu = to_hns3_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+
+	hns3_pmu_stop(event, PERF_EF_UPDATE);
+	hns3_pmu->hw_events[hwc->idx] = NULL;
+	perf_event_update_userpage(event);
+}
+
+static void hns3_pmu_enable(struct pmu *pmu)
+{
+	struct hns3_pmu *hns3_pmu = to_hns3_pmu(pmu);
+	u32 val;
+
+	val = readl(hns3_pmu->base + HNS3_PMU_REG_GLOBAL_CTRL);
+	val |= HNS3_PMU_GLOBAL_START;
+	writel(val, hns3_pmu->base + HNS3_PMU_REG_GLOBAL_CTRL);
+}
+
+static void hns3_pmu_disable(struct pmu *pmu)
+{
+	struct hns3_pmu *hns3_pmu = to_hns3_pmu(pmu);
+	u32 val;
+
+	val = readl(hns3_pmu->base + HNS3_PMU_REG_GLOBAL_CTRL);
+	val &= ~HNS3_PMU_GLOBAL_START;
+	writel(val, hns3_pmu->base + HNS3_PMU_REG_GLOBAL_CTRL);
+}
+
+static int hns3_pmu_alloc_pmu(struct pci_dev *pdev, struct hns3_pmu *hns3_pmu)
+{
+	u16 device_id;
+	char *name;
+	u32 val;
+
+	hns3_pmu->base = pcim_iomap_table(pdev)[BAR_2];
+	if (!hns3_pmu->base) {
+		pci_err(pdev, "ioremap failed\n");
+		return -ENOMEM;
+	}
+
+	hns3_pmu->hw_clk_freq = readl(hns3_pmu->base + HNS3_PMU_REG_CLOCK_FREQ);
+
+	val = readl(hns3_pmu->base + HNS3_PMU_REG_BDF);
+	hns3_pmu->bdf_min = val & 0xffff;
+	hns3_pmu->bdf_max = val >> 16;
+
+	val = readl(hns3_pmu->base + HNS3_PMU_REG_DEVICE_ID);
+	device_id = val & 0xffff;
+	name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hns3_pmu_sicl_%u", device_id);
+	if (!name)
+		return -ENOMEM;
+
+	hns3_pmu->pdev = pdev;
+	hns3_pmu->on_cpu = -1;
+	hns3_pmu->identifier = readl(hns3_pmu->base + HNS3_PMU_REG_VERSION);
+	hns3_pmu->pmu = (struct pmu) {
+		.name		= name,
+		.module		= THIS_MODULE,
+		.event_init	= hns3_pmu_event_init,
+		.pmu_enable	= hns3_pmu_enable,
+		.pmu_disable	= hns3_pmu_disable,
+		.add		= hns3_pmu_add,
+		.del		= hns3_pmu_del,
+		.start		= hns3_pmu_start,
+		.stop		= hns3_pmu_stop,
+		.read		= hns3_pmu_read,
+		.task_ctx_nr	= perf_invalid_context,
+		.attr_groups	= hns3_pmu_attr_groups,
+		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
+	};
+
+	return 0;
+}
+
+static irqreturn_t hns3_pmu_irq(int irq, void *data)
+{
+	struct hns3_pmu *hns3_pmu = data;
+	u32 intr_status, idx;
+
+	for (idx = 0; idx < HNS3_PMU_MAX_HW_EVENTS; idx++) {
+		intr_status = hns3_pmu_readl(hns3_pmu,
+					     HNS3_PMU_REG_EVENT_INTR_STATUS,
+					     idx);
+
+		/*
+		 * As each counter will restart from 0 when it is overflowed,
+		 * extra processing is no need, just clear interrupt status.
+		 */
+		if (intr_status)
+			hns3_pmu_clear_intr_status(hns3_pmu, idx);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static int hns3_pmu_online_cpu(unsigned int cpu, struct hlist_node *node)
+{
+	struct hns3_pmu *hns3_pmu;
+
+	hns3_pmu = hlist_entry_safe(node, struct hns3_pmu, node);
+	if (!hns3_pmu)
+		return -ENODEV;
+
+	if (hns3_pmu->on_cpu == -1) {
+		hns3_pmu->on_cpu = cpu;
+		irq_set_affinity(hns3_pmu->irq, cpumask_of(cpu));
+	}
+
+	return 0;
+}
+
+static int hns3_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
+{
+	struct hns3_pmu *hns3_pmu;
+	unsigned int target;
+
+	hns3_pmu = hlist_entry_safe(node, struct hns3_pmu, node);
+	if (!hns3_pmu)
+		return -ENODEV;
+
+	/* Nothing to do if this CPU doesn't own the PMU */
+	if (hns3_pmu->on_cpu != cpu)
+		return 0;
+
+	/* Choose a new CPU from all online cpus */
+	target = cpumask_any_but(cpu_online_mask, cpu);
+	if (target >= nr_cpu_ids)
+		return 0;
+
+	perf_pmu_migrate_context(&hns3_pmu->pmu, cpu, target);
+	hns3_pmu->on_cpu = target;
+	irq_set_affinity(hns3_pmu->irq, cpumask_of(target));
+
+	return 0;
+}
+
+static void hns3_pmu_free_irq(void *data)
+{
+	struct pci_dev *pdev = data;
+
+	pci_free_irq_vectors(pdev);
+}
+
+static int hns3_pmu_irq_register(struct pci_dev *pdev,
+				 struct hns3_pmu *hns3_pmu)
+{
+	int irq, ret;
+
+	ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSI);
+	if (ret < 0) {
+		pci_err(pdev, "failed to enable MSI vectors, ret = %d.\n", ret);
+		return ret;
+	}
+
+	ret = devm_add_action(&pdev->dev, hns3_pmu_free_irq, pdev);
+	if (ret) {
+		pci_err(pdev, "failed to add free irq action, ret = %d.\n", ret);
+		return ret;
+	}
+
+	irq = pci_irq_vector(pdev, 0);
+	ret = devm_request_irq(&pdev->dev, irq, hns3_pmu_irq, 0,
+			       hns3_pmu->pmu.name, hns3_pmu);
+	if (ret) {
+		pci_err(pdev, "failed to register irq, ret = %d.\n", ret);
+		return ret;
+	}
+
+	hns3_pmu->irq = irq;
+
+	return 0;
+}
+
+static int hns3_pmu_init_pmu(struct pci_dev *pdev, struct hns3_pmu *hns3_pmu)
+{
+	int ret;
+
+	ret = hns3_pmu_alloc_pmu(pdev, hns3_pmu);
+	if (ret)
+		return ret;
+
+	ret = hns3_pmu_irq_register(pdev, hns3_pmu);
+	if (ret)
+		return ret;
+
+	ret = cpuhp_state_add_instance(CPUHP_AP_PERF_ARM_HNS3_PMU_ONLINE,
+				       &hns3_pmu->node);
+	if (ret) {
+		pci_err(pdev, "failed to register hotplug, ret = %d.\n", ret);
+		return ret;
+	}
+
+	ret = perf_pmu_register(&hns3_pmu->pmu, hns3_pmu->pmu.name, -1);
+	if (ret) {
+		pci_err(pdev, "failed to register perf PMU, ret = %d.\n", ret);
+		cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_HNS3_PMU_ONLINE,
+					    &hns3_pmu->node);
+	}
+
+	return ret;
+}
+
+static void hns3_pmu_uninit_pmu(struct pci_dev *pdev)
+{
+	struct hns3_pmu *hns3_pmu = pci_get_drvdata(pdev);
+
+	perf_pmu_unregister(&hns3_pmu->pmu);
+	cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_HNS3_PMU_ONLINE,
+				    &hns3_pmu->node);
+}
+
+static int hns3_pmu_init_dev(struct pci_dev *pdev)
+{
+	int ret;
+
+	ret = pcim_enable_device(pdev);
+	if (ret) {
+		pci_err(pdev, "failed to enable pci device, ret = %d.\n", ret);
+		return ret;
+	}
+
+	ret = pcim_iomap_regions(pdev, BIT(BAR_2), "hns3_pmu");
+	if (ret < 0) {
+		pci_err(pdev, "failed to request pci region, ret = %d.\n", ret);
+		return ret;
+	}
+
+	pci_set_master(pdev);
+
+	return 0;
+}
+
+static int hns3_pmu_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct hns3_pmu *hns3_pmu;
+	int ret;
+
+	hns3_pmu = devm_kzalloc(&pdev->dev, sizeof(*hns3_pmu), GFP_KERNEL);
+	if (!hns3_pmu)
+		return -ENOMEM;
+
+	ret = hns3_pmu_init_dev(pdev);
+	if (ret)
+		return ret;
+
+	ret = hns3_pmu_init_pmu(pdev, hns3_pmu);
+	if (ret) {
+		pci_clear_master(pdev);
+		return ret;
+	}
+
+	pci_set_drvdata(pdev, hns3_pmu);
+
+	return ret;
+}
+
+static void hns3_pmu_remove(struct pci_dev *pdev)
+{
+	hns3_pmu_uninit_pmu(pdev);
+	pci_clear_master(pdev);
+	pci_set_drvdata(pdev, NULL);
+}
+
+static const struct pci_device_id hns3_pmu_ids[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, 0xa22b) },
+	{ 0, }
+};
+MODULE_DEVICE_TABLE(pci, hns3_pmu_ids);
+
+static struct pci_driver hns3_pmu_driver = {
+	.name = "hns3_pmu",
+	.id_table = hns3_pmu_ids,
+	.probe = hns3_pmu_probe,
+	.remove = hns3_pmu_remove,
+};
+
+static int __init hns3_pmu_module_init(void)
+{
+	int ret;
+
+	ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_HNS3_PMU_ONLINE,
+				      "AP_PERF_ARM_HNS3_PMU_ONLINE",
+				      hns3_pmu_online_cpu,
+				      hns3_pmu_offline_cpu);
+	if (ret) {
+		pr_err("failed to setup HNS3 PMU hotplug, ret = %d.\n", ret);
+		return ret;
+	}
+
+	ret = pci_register_driver(&hns3_pmu_driver);
+	if (ret) {
+		pr_err("failed to register pci driver, ret = %d.\n", ret);
+		cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HNS3_PMU_ONLINE);
+	}
+
+	return ret;
+}
+module_init(hns3_pmu_module_init);
+
+static void __exit hns3_pmu_module_exit(void)
+{
+	pci_unregister_driver(&hns3_pmu_driver);
+	cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HNS3_PMU_ONLINE);
+}
+module_exit(hns3_pmu_module_exit);
+
+MODULE_DESCRIPTION("HNS3 PMU driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 19f0dbfdd7fe..3e99fb4d3134 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -230,6 +230,7 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_ARM_HISI_PA_ONLINE,
 	CPUHP_AP_PERF_ARM_HISI_SLLC_ONLINE,
 	CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE,
+	CPUHP_AP_PERF_ARM_HNS3_PMU_ONLINE,
 	CPUHP_AP_PERF_ARM_L2X0_ONLINE,
 	CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE,
 	CPUHP_AP_PERF_ARM_QCOM_L3_ONLINE,
-- 
cgit 


From 3b7e2482f9a3f2e99628048b945c9c6cc53bd38e Mon Sep 17 00:00:00 2001
From: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Date: Wed, 15 Jun 2022 11:10:36 +0100
Subject: iommu: Introduce a callback to struct iommu_resv_region
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A callback is introduced to struct iommu_resv_region to free memory
allocations associated with the reserved region. This will be useful
when we introduce support for IORT RMR based reserved regions.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Steven Price <steven.price@arm.com>
Tested-by: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Tested-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Acked-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/20220615101044.1972-2-shameerali.kolothum.thodi@huawei.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 16 +++++++++++-----
 include/linux/iommu.h |  2 ++
 2 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index cdc86c39954e..f46431ac49e1 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2590,16 +2590,22 @@ void iommu_put_resv_regions(struct device *dev, struct list_head *list)
  * @list: reserved region list for device
  *
  * IOMMU drivers can use this to implement their .put_resv_regions() callback
- * for simple reservations. Memory allocated for each reserved region will be
- * freed. If an IOMMU driver allocates additional resources per region, it is
- * going to have to implement a custom callback.
+ * for simple reservations. If a per region callback is provided that will be
+ * used to free all memory allocations associated with the reserved region or
+ * else just free up the memory for the regions. If an IOMMU driver allocates
+ * additional resources per region, it is going to have to implement a custom
+ * callback.
  */
 void generic_iommu_put_resv_regions(struct device *dev, struct list_head *list)
 {
 	struct iommu_resv_region *entry, *next;
 
-	list_for_each_entry_safe(entry, next, list, list)
-		kfree(entry);
+	list_for_each_entry_safe(entry, next, list, list) {
+		if (entry->free)
+			entry->free(dev, entry);
+		else
+			kfree(entry);
+	}
 }
 EXPORT_SYMBOL(generic_iommu_put_resv_regions);
 
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 5e1afe169549..b22ffa6bc4a9 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -135,6 +135,7 @@ enum iommu_resv_type {
  * @length: Length of the region in bytes
  * @prot: IOMMU Protection flags (READ/WRITE/...)
  * @type: Type of the reserved region
+ * @free: Callback to free associated memory allocations
  */
 struct iommu_resv_region {
 	struct list_head	list;
@@ -142,6 +143,7 @@ struct iommu_resv_region {
 	size_t			length;
 	int			prot;
 	enum iommu_resv_type	type;
+	void (*free)(struct device *dev, struct iommu_resv_region *region);
 };
 
 /**
-- 
cgit 


From 8778b1d48117055c710a01498f65fa730160fdfa Mon Sep 17 00:00:00 2001
From: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Date: Wed, 15 Jun 2022 11:10:37 +0100
Subject: ACPI/IORT: Make iort_iommu_msi_get_resv_regions() return void

At present iort_iommu_msi_get_resv_regions() returns the number of
MSI reserved regions on success and there are no users for this.
The reserved region list will get populated anyway for platforms
that require the HW MSI region reservation. Hence, change the
function to return void instead.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Steven Price <steven.price@arm.com>
Tested-by: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Acked-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/20220615101044.1972-3-shameerali.kolothum.thodi@huawei.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/acpi/arm64/iort.c | 25 +++++++++----------------
 include/linux/acpi_iort.h |  6 +++---
 2 files changed, 12 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index f2f8f05662de..213f61cae176 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -811,22 +811,19 @@ static struct acpi_iort_node *iort_get_msi_resv_iommu(struct device *dev)
  * @dev: Device from iommu_get_resv_regions()
  * @head: Reserved region list from iommu_get_resv_regions()
  *
- * Returns: Number of msi reserved regions on success (0 if platform
- *          doesn't require the reservation or no associated msi regions),
- *          appropriate error value otherwise. The ITS interrupt translation
- *          spaces (ITS_base + SZ_64K, SZ_64K) associated with the device
- *          are the msi reserved regions.
+ * The ITS interrupt translation spaces (ITS_base + SZ_64K, SZ_64K)
+ * associated with the device are the HW MSI reserved regions.
  */
-int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
+void iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
 {
 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
 	struct acpi_iort_its_group *its;
 	struct acpi_iort_node *iommu_node, *its_node = NULL;
-	int i, resv = 0;
+	int i;
 
 	iommu_node = iort_get_msi_resv_iommu(dev);
 	if (!iommu_node)
-		return 0;
+		return;
 
 	/*
 	 * Current logic to reserve ITS regions relies on HW topologies
@@ -846,7 +843,7 @@ int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
 	}
 
 	if (!its_node)
-		return 0;
+		return;
 
 	/* Move to ITS specific data */
 	its = (struct acpi_iort_its_group *)its_node->node_data;
@@ -860,14 +857,10 @@ int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
 
 			region = iommu_alloc_resv_region(base + SZ_64K, SZ_64K,
 							 prot, IOMMU_RESV_MSI);
-			if (region) {
+			if (region)
 				list_add_tail(&region->list, head);
-				resv++;
-			}
 		}
 	}
-
-	return (resv == its->its_count) ? resv : -ENODEV;
 }
 
 static inline bool iort_iommu_driver_enabled(u8 type)
@@ -1034,8 +1027,8 @@ int iort_iommu_configure_id(struct device *dev, const u32 *id_in)
 }
 
 #else
-int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
-{ return 0; }
+void iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
+{ }
 int iort_iommu_configure_id(struct device *dev, const u32 *input_id)
 { return -ENODEV; }
 #endif
diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
index f1f0842a2cb2..a8198b83753d 100644
--- a/include/linux/acpi_iort.h
+++ b/include/linux/acpi_iort.h
@@ -36,7 +36,7 @@ int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id);
 /* IOMMU interface */
 int iort_dma_get_ranges(struct device *dev, u64 *size);
 int iort_iommu_configure_id(struct device *dev, const u32 *id_in);
-int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head);
+void iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head);
 phys_addr_t acpi_iort_dma_get_max_cpu_address(void);
 #else
 static inline void acpi_iort_init(void) { }
@@ -52,8 +52,8 @@ static inline int iort_dma_get_ranges(struct device *dev, u64 *size)
 static inline int iort_iommu_configure_id(struct device *dev, const u32 *id_in)
 { return -ENODEV; }
 static inline
-int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
-{ return 0; }
+void iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
+{ }
 
 static inline phys_addr_t acpi_iort_dma_get_max_cpu_address(void)
 { return PHYS_ADDR_MAX; }
-- 
cgit 


From 55be25b8b5e4e2fd680cfb073b84a74a79c002fa Mon Sep 17 00:00:00 2001
From: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Date: Wed, 15 Jun 2022 11:10:38 +0100
Subject: ACPI/IORT: Provide a generic helper to retrieve reserve regions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently IORT provides a helper to retrieve HW MSI reserve regions.
Change this to a generic helper to retrieve any IORT related reserve
regions. This will be useful when we add support for RMR nodes in
subsequent patches.

[Lorenzo: For ACPI IORT]

Reviewed-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Steven Price <steven.price@arm.com>
Tested-by: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Tested-by: Hanjun Guo <guohanjun@huawei.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Acked-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/20220615101044.1972-4-shameerali.kolothum.thodi@huawei.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/acpi/arm64/iort.c | 22 +++++++++++++++-------
 drivers/iommu/dma-iommu.c |  2 +-
 include/linux/acpi_iort.h |  4 ++--
 3 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 213f61cae176..cd5d1d7823cb 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -806,15 +806,13 @@ static struct acpi_iort_node *iort_get_msi_resv_iommu(struct device *dev)
 	return NULL;
 }
 
-/**
- * iort_iommu_msi_get_resv_regions - Reserved region driver helper
- * @dev: Device from iommu_get_resv_regions()
- * @head: Reserved region list from iommu_get_resv_regions()
- *
+/*
+ * Retrieve platform specific HW MSI reserve regions.
  * The ITS interrupt translation spaces (ITS_base + SZ_64K, SZ_64K)
  * associated with the device are the HW MSI reserved regions.
  */
-void iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
+static void iort_iommu_msi_get_resv_regions(struct device *dev,
+					    struct list_head *head)
 {
 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
 	struct acpi_iort_its_group *its;
@@ -863,6 +861,16 @@ void iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
 	}
 }
 
+/**
+ * iort_iommu_get_resv_regions - Generic helper to retrieve reserved regions.
+ * @dev: Device from iommu_get_resv_regions()
+ * @head: Reserved region list from iommu_get_resv_regions()
+ */
+void iort_iommu_get_resv_regions(struct device *dev, struct list_head *head)
+{
+	iort_iommu_msi_get_resv_regions(dev, head);
+}
+
 static inline bool iort_iommu_driver_enabled(u8 type)
 {
 	switch (type) {
@@ -1027,7 +1035,7 @@ int iort_iommu_configure_id(struct device *dev, const u32 *id_in)
 }
 
 #else
-void iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
+void iort_iommu_get_resv_regions(struct device *dev, struct list_head *head)
 { }
 int iort_iommu_configure_id(struct device *dev, const u32 *input_id)
 { return -ENODEV; }
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 1910f4f1612b..458fb6738223 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -387,7 +387,7 @@ void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list)
 {
 
 	if (!is_of_node(dev_iommu_fwspec_get(dev)->iommu_fwnode))
-		iort_iommu_msi_get_resv_regions(dev, list);
+		iort_iommu_get_resv_regions(dev, list);
 
 }
 EXPORT_SYMBOL(iommu_dma_get_resv_regions);
diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
index a8198b83753d..e5d2de9caf7f 100644
--- a/include/linux/acpi_iort.h
+++ b/include/linux/acpi_iort.h
@@ -36,7 +36,7 @@ int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id);
 /* IOMMU interface */
 int iort_dma_get_ranges(struct device *dev, u64 *size);
 int iort_iommu_configure_id(struct device *dev, const u32 *id_in);
-void iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head);
+void iort_iommu_get_resv_regions(struct device *dev, struct list_head *head);
 phys_addr_t acpi_iort_dma_get_max_cpu_address(void);
 #else
 static inline void acpi_iort_init(void) { }
@@ -52,7 +52,7 @@ static inline int iort_dma_get_ranges(struct device *dev, u64 *size)
 static inline int iort_iommu_configure_id(struct device *dev, const u32 *id_in)
 { return -ENODEV; }
 static inline
-void iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
+void iort_iommu_get_resv_regions(struct device *dev, struct list_head *head)
 { }
 
 static inline phys_addr_t acpi_iort_dma_get_max_cpu_address(void)
-- 
cgit 


From 491cf4a6735a957cd0733365f8d17e0f4308f5a4 Mon Sep 17 00:00:00 2001
From: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Date: Wed, 15 Jun 2022 11:10:39 +0100
Subject: ACPI/IORT: Add support to retrieve IORT RMR reserved regions

Parse through the IORT RMR nodes and populate the reserve region list
corresponding to a given IOMMU and device(optional). Also, go through
the ID mappings of the RMR node and retrieve all the SIDs associated
with it.

Reviewed-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Tested-by: Steven Price <steven.price@arm.com>
Tested-by: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Tested-by: Hanjun Guo <guohanjun@huawei.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Acked-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/20220615101044.1972-5-shameerali.kolothum.thodi@huawei.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/acpi/arm64/iort.c | 291 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/iommu.h     |   8 ++
 2 files changed, 299 insertions(+)

(limited to 'include')

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index cd5d1d7823cb..b6273af316c6 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -788,6 +788,294 @@ void acpi_configure_pmsi_domain(struct device *dev)
 }
 
 #ifdef CONFIG_IOMMU_API
+static void iort_rmr_free(struct device *dev,
+			  struct iommu_resv_region *region)
+{
+	struct iommu_iort_rmr_data *rmr_data;
+
+	rmr_data = container_of(region, struct iommu_iort_rmr_data, rr);
+	kfree(rmr_data->sids);
+	kfree(rmr_data);
+}
+
+static struct iommu_iort_rmr_data *iort_rmr_alloc(
+					struct acpi_iort_rmr_desc *rmr_desc,
+					int prot, enum iommu_resv_type type,
+					u32 *sids, u32 num_sids)
+{
+	struct iommu_iort_rmr_data *rmr_data;
+	struct iommu_resv_region *region;
+	u32 *sids_copy;
+	u64 addr = rmr_desc->base_address, size = rmr_desc->length;
+
+	rmr_data = kmalloc(sizeof(*rmr_data), GFP_KERNEL);
+	if (!rmr_data)
+		return NULL;
+
+	/* Create a copy of SIDs array to associate with this rmr_data */
+	sids_copy = kmemdup(sids, num_sids * sizeof(*sids), GFP_KERNEL);
+	if (!sids_copy) {
+		kfree(rmr_data);
+		return NULL;
+	}
+	rmr_data->sids = sids_copy;
+	rmr_data->num_sids = num_sids;
+
+	if (!IS_ALIGNED(addr, SZ_64K) || !IS_ALIGNED(size, SZ_64K)) {
+		/* PAGE align base addr and size */
+		addr &= PAGE_MASK;
+		size = PAGE_ALIGN(size + offset_in_page(rmr_desc->base_address));
+
+		pr_err(FW_BUG "RMR descriptor[0x%llx - 0x%llx] not aligned to 64K, continue with [0x%llx - 0x%llx]\n",
+		       rmr_desc->base_address,
+		       rmr_desc->base_address + rmr_desc->length - 1,
+		       addr, addr + size - 1);
+	}
+
+	region = &rmr_data->rr;
+	INIT_LIST_HEAD(&region->list);
+	region->start = addr;
+	region->length = size;
+	region->prot = prot;
+	region->type = type;
+	region->free = iort_rmr_free;
+
+	return rmr_data;
+}
+
+static void iort_rmr_desc_check_overlap(struct acpi_iort_rmr_desc *desc,
+					u32 count)
+{
+	int i, j;
+
+	for (i = 0; i < count; i++) {
+		u64 end, start = desc[i].base_address, length = desc[i].length;
+
+		if (!length) {
+			pr_err(FW_BUG "RMR descriptor[0x%llx] with zero length, continue anyway\n",
+			       start);
+			continue;
+		}
+
+		end = start + length - 1;
+
+		/* Check for address overlap */
+		for (j = i + 1; j < count; j++) {
+			u64 e_start = desc[j].base_address;
+			u64 e_end = e_start + desc[j].length - 1;
+
+			if (start <= e_end && end >= e_start)
+				pr_err(FW_BUG "RMR descriptor[0x%llx - 0x%llx] overlaps, continue anyway\n",
+				       start, end);
+		}
+	}
+}
+
+/*
+ * Please note, we will keep the already allocated RMR reserve
+ * regions in case of a memory allocation failure.
+ */
+static void iort_get_rmrs(struct acpi_iort_node *node,
+			  struct acpi_iort_node *smmu,
+			  u32 *sids, u32 num_sids,
+			  struct list_head *head)
+{
+	struct acpi_iort_rmr *rmr = (struct acpi_iort_rmr *)node->node_data;
+	struct acpi_iort_rmr_desc *rmr_desc;
+	int i;
+
+	rmr_desc = ACPI_ADD_PTR(struct acpi_iort_rmr_desc, node,
+				rmr->rmr_offset);
+
+	iort_rmr_desc_check_overlap(rmr_desc, rmr->rmr_count);
+
+	for (i = 0; i < rmr->rmr_count; i++, rmr_desc++) {
+		struct iommu_iort_rmr_data *rmr_data;
+		enum iommu_resv_type type;
+		int prot = IOMMU_READ | IOMMU_WRITE;
+
+		if (rmr->flags & ACPI_IORT_RMR_REMAP_PERMITTED)
+			type = IOMMU_RESV_DIRECT_RELAXABLE;
+		else
+			type = IOMMU_RESV_DIRECT;
+
+		if (rmr->flags & ACPI_IORT_RMR_ACCESS_PRIVILEGE)
+			prot |= IOMMU_PRIV;
+
+		/* Attributes 0x00 - 0x03 represents device memory */
+		if (ACPI_IORT_RMR_ACCESS_ATTRIBUTES(rmr->flags) <=
+				ACPI_IORT_RMR_ATTR_DEVICE_GRE)
+			prot |= IOMMU_MMIO;
+		else if (ACPI_IORT_RMR_ACCESS_ATTRIBUTES(rmr->flags) ==
+				ACPI_IORT_RMR_ATTR_NORMAL_IWB_OWB)
+			prot |= IOMMU_CACHE;
+
+		rmr_data = iort_rmr_alloc(rmr_desc, prot, type,
+					  sids, num_sids);
+		if (!rmr_data)
+			return;
+
+		list_add_tail(&rmr_data->rr.list, head);
+	}
+}
+
+static u32 *iort_rmr_alloc_sids(u32 *sids, u32 count, u32 id_start,
+				u32 new_count)
+{
+	u32 *new_sids;
+	u32 total_count = count + new_count;
+	int i;
+
+	new_sids = krealloc_array(sids, count + new_count,
+				  sizeof(*new_sids), GFP_KERNEL);
+	if (!new_sids)
+		return NULL;
+
+	for (i = count; i < total_count; i++)
+		new_sids[i] = id_start++;
+
+	return new_sids;
+}
+
+static bool iort_rmr_has_dev(struct device *dev, u32 id_start,
+			     u32 id_count)
+{
+	int i;
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
+	/*
+	 * Make sure the kernel has preserved the boot firmware PCIe
+	 * configuration. This is required to ensure that the RMR PCIe
+	 * StreamIDs are still valid (Refer: ARM DEN 0049E.d Section 3.1.1.5).
+	 */
+	if (dev_is_pci(dev)) {
+		struct pci_dev *pdev = to_pci_dev(dev);
+		struct pci_host_bridge *host = pci_find_host_bridge(pdev->bus);
+
+		if (!host->preserve_config)
+			return false;
+	}
+
+	for (i = 0; i < fwspec->num_ids; i++) {
+		if (fwspec->ids[i] >= id_start &&
+		    fwspec->ids[i] <= id_start + id_count)
+			return true;
+	}
+
+	return false;
+}
+
+static void iort_node_get_rmr_info(struct acpi_iort_node *node,
+				   struct acpi_iort_node *iommu,
+				   struct device *dev, struct list_head *head)
+{
+	struct acpi_iort_node *smmu = NULL;
+	struct acpi_iort_rmr *rmr;
+	struct acpi_iort_id_mapping *map;
+	u32 *sids = NULL;
+	u32 num_sids = 0;
+	int i;
+
+	if (!node->mapping_offset || !node->mapping_count) {
+		pr_err(FW_BUG "Invalid ID mapping, skipping RMR node %p\n",
+		       node);
+		return;
+	}
+
+	rmr = (struct acpi_iort_rmr *)node->node_data;
+	if (!rmr->rmr_offset || !rmr->rmr_count)
+		return;
+
+	map = ACPI_ADD_PTR(struct acpi_iort_id_mapping, node,
+			   node->mapping_offset);
+
+	/*
+	 * Go through the ID mappings and see if we have a match for SMMU
+	 * and dev(if !NULL). If found, get the sids for the Node.
+	 * Please note, id_count is equal to the number of IDs  in the
+	 * range minus one.
+	 */
+	for (i = 0; i < node->mapping_count; i++, map++) {
+		struct acpi_iort_node *parent;
+
+		if (!map->id_count)
+			continue;
+
+		parent = ACPI_ADD_PTR(struct acpi_iort_node, iort_table,
+				      map->output_reference);
+		if (parent != iommu)
+			continue;
+
+		/* If dev is valid, check RMR node corresponds to the dev SID */
+		if (dev && !iort_rmr_has_dev(dev, map->output_base,
+					     map->id_count))
+			continue;
+
+		/* Retrieve SIDs associated with the Node. */
+		sids = iort_rmr_alloc_sids(sids, num_sids, map->output_base,
+					   map->id_count + 1);
+		if (!sids)
+			return;
+
+		num_sids += map->id_count + 1;
+	}
+
+	if (!sids)
+		return;
+
+	iort_get_rmrs(node, smmu, sids, num_sids, head);
+	kfree(sids);
+}
+
+static void iort_find_rmrs(struct acpi_iort_node *iommu, struct device *dev,
+			   struct list_head *head)
+{
+	struct acpi_table_iort *iort;
+	struct acpi_iort_node *iort_node, *iort_end;
+	int i;
+
+	/* Only supports ARM DEN 0049E.d onwards */
+	if (iort_table->revision < 5)
+		return;
+
+	iort = (struct acpi_table_iort *)iort_table;
+
+	iort_node = ACPI_ADD_PTR(struct acpi_iort_node, iort,
+				 iort->node_offset);
+	iort_end = ACPI_ADD_PTR(struct acpi_iort_node, iort,
+				iort_table->length);
+
+	for (i = 0; i < iort->node_count; i++) {
+		if (WARN_TAINT(iort_node >= iort_end, TAINT_FIRMWARE_WORKAROUND,
+			       "IORT node pointer overflows, bad table!\n"))
+			return;
+
+		if (iort_node->type == ACPI_IORT_NODE_RMR)
+			iort_node_get_rmr_info(iort_node, iommu, dev, head);
+
+		iort_node = ACPI_ADD_PTR(struct acpi_iort_node, iort_node,
+					 iort_node->length);
+	}
+}
+
+/*
+ * Populate the RMR list associated with a given IOMMU and dev(if provided).
+ * If dev is NULL, the function populates all the RMRs associated with the
+ * given IOMMU.
+ */
+static void iort_iommu_rmr_get_resv_regions(struct fwnode_handle *iommu_fwnode,
+					    struct device *dev,
+					    struct list_head *head)
+{
+	struct acpi_iort_node *iommu;
+
+	iommu = iort_get_iort_node(iommu_fwnode);
+	if (!iommu)
+		return;
+
+	iort_find_rmrs(iommu, dev, head);
+}
+
 static struct acpi_iort_node *iort_get_msi_resv_iommu(struct device *dev)
 {
 	struct acpi_iort_node *iommu;
@@ -868,7 +1156,10 @@ static void iort_iommu_msi_get_resv_regions(struct device *dev,
  */
 void iort_iommu_get_resv_regions(struct device *dev, struct list_head *head)
 {
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
 	iort_iommu_msi_get_resv_regions(dev, head);
+	iort_iommu_rmr_get_resv_regions(fwspec->iommu_fwnode, dev, head);
 }
 
 static inline bool iort_iommu_driver_enabled(u8 type)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index b22ffa6bc4a9..e6abd998dbe7 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -146,6 +146,14 @@ struct iommu_resv_region {
 	void (*free)(struct device *dev, struct iommu_resv_region *region);
 };
 
+struct iommu_iort_rmr_data {
+	struct iommu_resv_region rr;
+
+	/* Stream IDs associated with IORT RMR entry */
+	const u32 *sids;
+	u32 num_sids;
+};
+
 /**
  * enum iommu_dev_features - Per device IOMMU features
  * @IOMMU_DEV_FEAT_SVA: Shared Virtual Addresses
-- 
cgit 


From e302eea8f49717253ac64fd45b7cc719e87fa010 Mon Sep 17 00:00:00 2001
From: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Date: Wed, 15 Jun 2022 11:10:40 +0100
Subject: ACPI/IORT: Add a helper to retrieve RMR info directly

This will provide a way for SMMU drivers to retrieve StreamIDs
associated with IORT RMR nodes and use that to set bypass settings
for those IDs.

Tested-by: Steven Price <steven.price@arm.com>
Tested-by: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Tested-by: Hanjun Guo <guohanjun@huawei.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Acked-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/20220615101044.1972-6-shameerali.kolothum.thodi@huawei.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/acpi/arm64/iort.c | 28 ++++++++++++++++++++++++++++
 include/linux/acpi_iort.h |  8 ++++++++
 2 files changed, 36 insertions(+)

(limited to 'include')

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index b6273af316c6..cd1349d3544e 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -1394,6 +1394,34 @@ int iort_dma_get_ranges(struct device *dev, u64 *size)
 		return nc_dma_get_range(dev, size);
 }
 
+/**
+ * iort_get_rmr_sids - Retrieve IORT RMR node reserved regions with
+ *                     associated StreamIDs information.
+ * @iommu_fwnode: fwnode associated with IOMMU
+ * @head: Resereved region list
+ */
+void iort_get_rmr_sids(struct fwnode_handle *iommu_fwnode,
+		       struct list_head *head)
+{
+	iort_iommu_rmr_get_resv_regions(iommu_fwnode, NULL, head);
+}
+EXPORT_SYMBOL_GPL(iort_get_rmr_sids);
+
+/**
+ * iort_put_rmr_sids - Free memory allocated for RMR reserved regions.
+ * @iommu_fwnode: fwnode associated with IOMMU
+ * @head: Resereved region list
+ */
+void iort_put_rmr_sids(struct fwnode_handle *iommu_fwnode,
+		       struct list_head *head)
+{
+	struct iommu_resv_region *entry, *next;
+
+	list_for_each_entry_safe(entry, next, head, list)
+		entry->free(NULL, entry);
+}
+EXPORT_SYMBOL_GPL(iort_put_rmr_sids);
+
 static void __init acpi_iort_register_irq(int hwirq, const char *name,
 					  int trigger,
 					  struct resource *res)
diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
index e5d2de9caf7f..b43be0987b19 100644
--- a/include/linux/acpi_iort.h
+++ b/include/linux/acpi_iort.h
@@ -33,6 +33,10 @@ struct irq_domain *iort_get_device_domain(struct device *dev, u32 id,
 					  enum irq_domain_bus_token bus_token);
 void acpi_configure_pmsi_domain(struct device *dev);
 int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id);
+void iort_get_rmr_sids(struct fwnode_handle *iommu_fwnode,
+		       struct list_head *head);
+void iort_put_rmr_sids(struct fwnode_handle *iommu_fwnode,
+		       struct list_head *head);
 /* IOMMU interface */
 int iort_dma_get_ranges(struct device *dev, u64 *size);
 int iort_iommu_configure_id(struct device *dev, const u32 *id_in);
@@ -46,6 +50,10 @@ static inline struct irq_domain *iort_get_device_domain(
 	struct device *dev, u32 id, enum irq_domain_bus_token bus_token)
 { return NULL; }
 static inline void acpi_configure_pmsi_domain(struct device *dev) { }
+static inline
+void iort_get_rmr_sids(struct fwnode_handle *iommu_fwnode, struct list_head *head) { }
+static inline
+void iort_put_rmr_sids(struct fwnode_handle *iommu_fwnode, struct list_head *head) { }
 /* IOMMU interface */
 static inline int iort_dma_get_ranges(struct device *dev, u64 *size)
 { return -ENODEV; }
-- 
cgit 


From 88527790c079fb1ea41cbcfa4450ee37906a2fb0 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 5 Jul 2022 16:59:24 -0700
Subject: tls: rx: add sockopt for enabling optimistic decrypt with TLS 1.3

Since optimisitic decrypt may add extra load in case of retries
require socket owner to explicitly opt-in.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/tls.rst | 18 ++++++++++
 include/linux/sockptr.h          |  8 +++++
 include/net/tls.h                |  3 ++
 include/uapi/linux/snmp.h        |  1 +
 include/uapi/linux/tls.h         |  2 ++
 net/tls/tls_main.c               | 75 ++++++++++++++++++++++++++++++++++++++++
 net/tls/tls_proc.c               |  1 +
 net/tls/tls_sw.c                 | 21 +++++++----
 8 files changed, 122 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst
index be8e10c14b05..7a6643836e42 100644
--- a/Documentation/networking/tls.rst
+++ b/Documentation/networking/tls.rst
@@ -239,6 +239,19 @@ for the original TCP transmission and TCP retransmissions. To the receiver
 this will look like TLS records had been tampered with and will result
 in record authentication failures.
 
+TLS_RX_EXPECT_NO_PAD
+~~~~~~~~~~~~~~~~~~~~
+
+TLS 1.3 only. Expect the sender to not pad records. This allows the data
+to be decrypted directly into user space buffers with TLS 1.3.
+
+This optimization is safe to enable only if the remote end is trusted,
+otherwise it is an attack vector to doubling the TLS processing cost.
+
+If the record decrypted turns out to had been padded or is not a data
+record it will be decrypted again into a kernel buffer without zero copy.
+Such events are counted in the ``TlsDecryptRetry`` statistic.
+
 Statistics
 ==========
 
@@ -264,3 +277,8 @@ TLS implementation exposes the following per-namespace statistics
 
 - ``TlsDeviceRxResync`` -
   number of RX resyncs sent to NICs handling cryptography
+
+- ``TlsDecryptRetry`` -
+  number of RX records which had to be re-decrypted due to
+  ``TLS_RX_EXPECT_NO_PAD`` mis-prediction. Note that this counter will
+  also increment for non-data records.
diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h
index ea193414298b..d45902fb4cad 100644
--- a/include/linux/sockptr.h
+++ b/include/linux/sockptr.h
@@ -102,4 +102,12 @@ static inline long strncpy_from_sockptr(char *dst, sockptr_t src, size_t count)
 	return strncpy_from_user(dst, src.user, count);
 }
 
+static inline int check_zeroed_sockptr(sockptr_t src, size_t offset,
+				       size_t size)
+{
+	if (!sockptr_is_kernel(src))
+		return check_zeroed_user(src.user + offset, size);
+	return memchr_inv(src.kernel + offset, 0, size) == NULL;
+}
+
 #endif /* _LINUX_SOCKPTR_H */
diff --git a/include/net/tls.h b/include/net/tls.h
index 8017f1703447..4fc16ca5f469 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -149,6 +149,7 @@ struct tls_sw_context_rx {
 
 	struct sk_buff *recv_pkt;
 	u8 async_capable:1;
+	u8 zc_capable:1;
 	atomic_t decrypt_pending;
 	/* protect crypto_wait with decrypt_pending*/
 	spinlock_t decrypt_compl_lock;
@@ -239,6 +240,7 @@ struct tls_context {
 	u8 tx_conf:3;
 	u8 rx_conf:3;
 	u8 zerocopy_sendfile:1;
+	u8 rx_no_pad:1;
 
 	int (*push_pending_record)(struct sock *sk, int flags);
 	void (*sk_write_space)(struct sock *sk);
@@ -358,6 +360,7 @@ int tls_sk_attach(struct sock *sk, int optname, char __user *optval,
 void tls_err_abort(struct sock *sk, int err);
 
 int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx);
+void tls_update_rx_zc_capable(struct tls_context *tls_ctx);
 void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx);
 void tls_sw_strparser_done(struct tls_context *tls_ctx);
 int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 904909d020e2..1c9152add663 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -344,6 +344,7 @@ enum
 	LINUX_MIB_TLSRXDEVICE,			/* TlsRxDevice */
 	LINUX_MIB_TLSDECRYPTERROR,		/* TlsDecryptError */
 	LINUX_MIB_TLSRXDEVICERESYNC,		/* TlsRxDeviceResync */
+	LINUX_MIN_TLSDECRYPTRETRY,		/* TlsDecryptRetry */
 	__LINUX_MIB_TLSMAX
 };
 
diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h
index bb8f80812b0b..f1157d8f4acd 100644
--- a/include/uapi/linux/tls.h
+++ b/include/uapi/linux/tls.h
@@ -40,6 +40,7 @@
 #define TLS_TX			1	/* Set transmit parameters */
 #define TLS_RX			2	/* Set receive parameters */
 #define TLS_TX_ZEROCOPY_RO	3	/* TX zerocopy (only sendfile now) */
+#define TLS_RX_EXPECT_NO_PAD	4	/* Attempt opportunistic zero-copy */
 
 /* Supported versions */
 #define TLS_VERSION_MINOR(ver)	((ver) & 0xFF)
@@ -162,6 +163,7 @@ enum {
 	TLS_INFO_TXCONF,
 	TLS_INFO_RXCONF,
 	TLS_INFO_ZC_RO_TX,
+	TLS_INFO_RX_NO_PAD,
 	__TLS_INFO_MAX,
 };
 #define TLS_INFO_MAX (__TLS_INFO_MAX - 1)
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 2ffede463e4a..1b3efc96db0b 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -533,6 +533,37 @@ static int do_tls_getsockopt_tx_zc(struct sock *sk, char __user *optval,
 	return 0;
 }
 
+static int do_tls_getsockopt_no_pad(struct sock *sk, char __user *optval,
+				    int __user *optlen)
+{
+	struct tls_context *ctx = tls_get_ctx(sk);
+	unsigned int value;
+	int err, len;
+
+	if (ctx->prot_info.version != TLS_1_3_VERSION)
+		return -EINVAL;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+	if (len < sizeof(value))
+		return -EINVAL;
+
+	lock_sock(sk);
+	err = -EINVAL;
+	if (ctx->rx_conf == TLS_SW || ctx->rx_conf == TLS_HW)
+		value = ctx->rx_no_pad;
+	release_sock(sk);
+	if (err)
+		return err;
+
+	if (put_user(sizeof(value), optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &value, sizeof(value)))
+		return -EFAULT;
+
+	return 0;
+}
+
 static int do_tls_getsockopt(struct sock *sk, int optname,
 			     char __user *optval, int __user *optlen)
 {
@@ -547,6 +578,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname,
 	case TLS_TX_ZEROCOPY_RO:
 		rc = do_tls_getsockopt_tx_zc(sk, optval, optlen);
 		break;
+	case TLS_RX_EXPECT_NO_PAD:
+		rc = do_tls_getsockopt_no_pad(sk, optval, optlen);
+		break;
 	default:
 		rc = -ENOPROTOOPT;
 		break;
@@ -718,6 +752,38 @@ static int do_tls_setsockopt_tx_zc(struct sock *sk, sockptr_t optval,
 	return 0;
 }
 
+static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval,
+				    unsigned int optlen)
+{
+	struct tls_context *ctx = tls_get_ctx(sk);
+	u32 val;
+	int rc;
+
+	if (ctx->prot_info.version != TLS_1_3_VERSION ||
+	    sockptr_is_null(optval) || optlen < sizeof(val))
+		return -EINVAL;
+
+	rc = copy_from_sockptr(&val, optval, sizeof(val));
+	if (rc)
+		return -EFAULT;
+	if (val > 1)
+		return -EINVAL;
+	rc = check_zeroed_sockptr(optval, sizeof(val), optlen - sizeof(val));
+	if (rc < 1)
+		return rc == 0 ? -EINVAL : rc;
+
+	lock_sock(sk);
+	rc = -EINVAL;
+	if (ctx->rx_conf == TLS_SW || ctx->rx_conf == TLS_HW) {
+		ctx->rx_no_pad = val;
+		tls_update_rx_zc_capable(ctx);
+		rc = 0;
+	}
+	release_sock(sk);
+
+	return rc;
+}
+
 static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
 			     unsigned int optlen)
 {
@@ -736,6 +802,9 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
 		rc = do_tls_setsockopt_tx_zc(sk, optval, optlen);
 		release_sock(sk);
 		break;
+	case TLS_RX_EXPECT_NO_PAD:
+		rc = do_tls_setsockopt_no_pad(sk, optval, optlen);
+		break;
 	default:
 		rc = -ENOPROTOOPT;
 		break;
@@ -976,6 +1045,11 @@ static int tls_get_info(const struct sock *sk, struct sk_buff *skb)
 		if (err)
 			goto nla_failure;
 	}
+	if (ctx->rx_no_pad) {
+		err = nla_put_flag(skb, TLS_INFO_RX_NO_PAD);
+		if (err)
+			goto nla_failure;
+	}
 
 	rcu_read_unlock();
 	nla_nest_end(skb, start);
@@ -997,6 +1071,7 @@ static size_t tls_get_info_size(const struct sock *sk)
 		nla_total_size(sizeof(u16)) +	/* TLS_INFO_RXCONF */
 		nla_total_size(sizeof(u16)) +	/* TLS_INFO_TXCONF */
 		nla_total_size(0) +		/* TLS_INFO_ZC_RO_TX */
+		nla_total_size(0) +		/* TLS_INFO_RX_NO_PAD */
 		0;
 
 	return size;
diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c
index feeceb0e4cb4..0c200000cc45 100644
--- a/net/tls/tls_proc.c
+++ b/net/tls/tls_proc.c
@@ -18,6 +18,7 @@ static const struct snmp_mib tls_mib_list[] = {
 	SNMP_MIB_ITEM("TlsRxDevice", LINUX_MIB_TLSRXDEVICE),
 	SNMP_MIB_ITEM("TlsDecryptError", LINUX_MIB_TLSDECRYPTERROR),
 	SNMP_MIB_ITEM("TlsRxDeviceResync", LINUX_MIB_TLSRXDEVICERESYNC),
+	SNMP_MIB_ITEM("TlsDecryptRetry", LINUX_MIN_TLSDECRYPTRETRY),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 2bac57684429..7592b6519953 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1601,6 +1601,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
 	if (unlikely(darg->zc && prot->version == TLS_1_3_VERSION &&
 		     darg->tail != TLS_RECORD_TYPE_DATA)) {
 		darg->zc = false;
+		TLS_INC_STATS(sock_net(sk), LINUX_MIN_TLSDECRYPTRETRY);
 		return decrypt_skb_update(sk, skb, dest, darg);
 	}
 
@@ -1787,7 +1788,7 @@ int tls_sw_recvmsg(struct sock *sk,
 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 
 	zc_capable = !bpf_strp_enabled && !is_kvec && !is_peek &&
-		     prot->version != TLS_1_3_VERSION;
+		ctx->zc_capable;
 	decrypted = 0;
 	while (len && (decrypted + copied < target || ctx->recv_pkt)) {
 		struct tls_decrypt_arg darg = {};
@@ -2269,6 +2270,14 @@ void tls_sw_strparser_arm(struct sock *sk, struct tls_context *tls_ctx)
 	strp_check_rcv(&rx_ctx->strp);
 }
 
+void tls_update_rx_zc_capable(struct tls_context *tls_ctx)
+{
+	struct tls_sw_context_rx *rx_ctx = tls_sw_ctx_rx(tls_ctx);
+
+	rx_ctx->zc_capable = tls_ctx->rx_no_pad ||
+		tls_ctx->prot_info.version != TLS_1_3_VERSION;
+}
+
 int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
@@ -2504,12 +2513,10 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 	if (sw_ctx_rx) {
 		tfm = crypto_aead_tfm(sw_ctx_rx->aead_recv);
 
-		if (crypto_info->version == TLS_1_3_VERSION)
-			sw_ctx_rx->async_capable = 0;
-		else
-			sw_ctx_rx->async_capable =
-				!!(tfm->__crt_alg->cra_flags &
-				   CRYPTO_ALG_ASYNC);
+		tls_update_rx_zc_capable(ctx);
+		sw_ctx_rx->async_capable =
+			crypto_info->version != TLS_1_3_VERSION &&
+			!!(tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC);
 
 		/* Set up strparser */
 		memset(&cb, 0, sizeof(cb));
-- 
cgit 


From 2fd26970cf66bd52dc42843c46968040caa8c9a1 Mon Sep 17 00:00:00 2001
From: Imran Khan <imran.f.khan@oracle.com>
Date: Wed, 6 Jul 2022 06:10:26 +1000
Subject: Revert "kernfs: Change kernfs_notify_list to llist."

This reverts commit b8f35fa1188b84035c59d4842826c4e93a1b1c9f.

This is causing regression due to same kernfs_node getting
added multiple times in kernfs_notify_list so revert it until
safe way of using llist in this context is found.

Reported-by: Nathan Chancellor <nathan@kernel.org>
Reported-by: Michael Walle <michael@walle.cc>
Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Imran Khan <imran.f.khan@oracle.com>
Cc: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/r/20220705201026.2487665-1-imran.f.khan@oracle.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/kernfs/file.c       | 47 +++++++++++++++++++++++++++--------------------
 include/linux/kernfs.h |  2 +-
 2 files changed, 28 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index bb933221b4ba..baff4b1d40c7 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -25,16 +25,18 @@ struct kernfs_open_node {
 	struct list_head	files; /* goes through kernfs_open_file.list */
 };
 
-/**
- * attribute_to_node - get kernfs_node object corresponding to a kernfs attribute
- * @ptr:	&struct kernfs_elem_attr
- * @type:	struct kernfs_node
- * @member:	name of member (i.e attr)
+/*
+ * kernfs_notify() may be called from any context and bounces notifications
+ * through a work item.  To minimize space overhead in kernfs_node, the
+ * pending queue is implemented as a singly linked list of kernfs_nodes.
+ * The list is terminated with the self pointer so that whether a
+ * kernfs_node is on the list or not can be determined by testing the next
+ * pointer for NULL.
  */
-#define attribute_to_node(ptr, type, member)	\
-	container_of(ptr, type, member)
+#define KERNFS_NOTIFY_EOL			((void *)&kernfs_notify_list)
 
-static LLIST_HEAD(kernfs_notify_list);
+static DEFINE_SPINLOCK(kernfs_notify_lock);
+static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
 
 static inline struct mutex *kernfs_open_file_mutex_ptr(struct kernfs_node *kn)
 {
@@ -909,16 +911,18 @@ static void kernfs_notify_workfn(struct work_struct *work)
 	struct kernfs_node *kn;
 	struct kernfs_super_info *info;
 	struct kernfs_root *root;
-	struct llist_node *free;
-	struct kernfs_elem_attr *attr;
 repeat:
 	/* pop one off the notify_list */
-	free = llist_del_first(&kernfs_notify_list);
-	if (free == NULL)
+	spin_lock_irq(&kernfs_notify_lock);
+	kn = kernfs_notify_list;
+	if (kn == KERNFS_NOTIFY_EOL) {
+		spin_unlock_irq(&kernfs_notify_lock);
 		return;
+	}
+	kernfs_notify_list = kn->attr.notify_next;
+	kn->attr.notify_next = NULL;
+	spin_unlock_irq(&kernfs_notify_lock);
 
-	attr = llist_entry(free, struct kernfs_elem_attr, notify_next);
-	kn = attribute_to_node(attr, struct kernfs_node, attr);
 	root = kernfs_root(kn);
 	/* kick fsnotify */
 	down_write(&root->kernfs_rwsem);
@@ -974,14 +978,12 @@ repeat:
 void kernfs_notify(struct kernfs_node *kn)
 {
 	static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn);
+	unsigned long flags;
 	struct kernfs_open_node *on;
 
 	if (WARN_ON(kernfs_type(kn) != KERNFS_FILE))
 		return;
 
-	/* Because we are using llist for kernfs_notify_list */
-	WARN_ON_ONCE(in_nmi());
-
 	/* kick poll immediately */
 	rcu_read_lock();
 	on = rcu_dereference(kn->attr.open);
@@ -992,9 +994,14 @@ void kernfs_notify(struct kernfs_node *kn)
 	rcu_read_unlock();
 
 	/* schedule work to kick fsnotify */
-	kernfs_get(kn);
-	llist_add(&kn->attr.notify_next, &kernfs_notify_list);
-	schedule_work(&kernfs_notify_work);
+	spin_lock_irqsave(&kernfs_notify_lock, flags);
+	if (!kn->attr.notify_next) {
+		kernfs_get(kn);
+		kn->attr.notify_next = kernfs_notify_list;
+		kernfs_notify_list = kn;
+		schedule_work(&kernfs_notify_work);
+	}
+	spin_unlock_irqrestore(&kernfs_notify_lock, flags);
 }
 EXPORT_SYMBOL_GPL(kernfs_notify);
 
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 13e703f615f7..367044d7708c 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -173,7 +173,7 @@ struct kernfs_elem_attr {
 	const struct kernfs_ops	*ops;
 	struct kernfs_open_node __rcu	*open;
 	loff_t			size;
-	struct llist_node	notify_next;	/* for kernfs_notify() */
+	struct kernfs_node	*notify_next;	/* for kernfs_notify() */
 };
 
 /*
-- 
cgit 


From 99e48cd6855e9535488e3c90d65edd46c6e6fc1b Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Wed, 6 Jul 2022 20:03:50 +0800
Subject: blk-mq: Add a flag for reserved requests

Add a flag for reserved requests so that drivers may know this for any
special handling.

Signed-off-by: John Garry <john.garry@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/1657109034-206040-3-git-send-email-john.garry@huawei.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 6 ++++++
 include/linux/blk-mq.h | 6 ++++++
 2 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 15c7c5c4ad22..a00e43cc67e5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -475,6 +475,9 @@ retry:
 	if (!(data->rq_flags & RQF_ELV))
 		blk_mq_tag_busy(data->hctx);
 
+	if (data->flags & BLK_MQ_REQ_RESERVED)
+		data->rq_flags |= RQF_RESV;
+
 	/*
 	 * Try batched alloc if we want more than 1 tag.
 	 */
@@ -589,6 +592,9 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 	else
 		data.rq_flags |= RQF_ELV;
 
+	if (flags & BLK_MQ_REQ_RESERVED)
+		data.rq_flags |= RQF_RESV;
+
 	ret = -EWOULDBLOCK;
 	tag = blk_mq_get_tag(&data);
 	if (tag == BLK_MQ_NO_TAG)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 43aad0da3305..7c62b7fabec7 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -57,6 +57,7 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_TIMED_OUT		((__force req_flags_t)(1 << 21))
 /* queue has elevator attached */
 #define RQF_ELV			((__force req_flags_t)(1 << 22))
+#define RQF_RESV			((__force req_flags_t)(1 << 23))
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \
@@ -825,6 +826,11 @@ static inline bool blk_mq_need_time_stamp(struct request *rq)
 	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_ELV));
 }
 
+static inline bool blk_mq_is_reserved_rq(struct request *rq)
+{
+	return rq->rq_flags & RQF_RESV;
+}
+
 /*
  * Batched completions only work when there is no I/O error and no special
  * ->end_io handler.
-- 
cgit 


From 9bdb4833dd399cbff82cc20893f52bdec66a9eca Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Wed, 6 Jul 2022 20:03:51 +0800
Subject: blk-mq: Drop blk_mq_ops.timeout 'reserved' arg

With new API blk_mq_is_reserved_rq() we can tell if a request is from
the reserved pool, so stop passing 'reserved' arg. There is actually
only a single user of that arg for all the callback implementations, which
can use blk_mq_is_reserved_rq() instead.

This will also allow us to stop passing the same 'reserved' around the
blk-mq iter functions next.

Signed-off-by: John Garry <john.garry@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org> # For MMC
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/1657109034-206040-4-git-send-email-john.garry@huawei.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c                    | 6 +++---
 block/bsg-lib.c                   | 2 +-
 drivers/block/mtip32xx/mtip32xx.c | 5 ++---
 drivers/block/nbd.c               | 3 +--
 drivers/block/null_blk/main.c     | 2 +-
 drivers/mmc/core/queue.c          | 3 +--
 drivers/nvme/host/apple.c         | 3 +--
 drivers/nvme/host/fc.c            | 3 +--
 drivers/nvme/host/pci.c           | 2 +-
 drivers/nvme/host/rdma.c          | 3 +--
 drivers/nvme/host/tcp.c           | 3 +--
 drivers/s390/block/dasd.c         | 2 +-
 drivers/s390/block/dasd_int.h     | 2 +-
 drivers/scsi/scsi_error.c         | 3 +--
 drivers/scsi/scsi_priv.h          | 3 +--
 include/linux/blk-mq.h            | 2 +-
 16 files changed, 19 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a00e43cc67e5..cedbec36e907 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1427,13 +1427,13 @@ bool blk_mq_queue_inflight(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
 
-static void blk_mq_rq_timed_out(struct request *req, bool reserved)
+static void blk_mq_rq_timed_out(struct request *req)
 {
 	req->rq_flags |= RQF_TIMED_OUT;
 	if (req->q->mq_ops->timeout) {
 		enum blk_eh_timer_return ret;
 
-		ret = req->q->mq_ops->timeout(req, reserved);
+		ret = req->q->mq_ops->timeout(req);
 		if (ret == BLK_EH_DONE)
 			return;
 		WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
@@ -1482,7 +1482,7 @@ static bool blk_mq_check_expired(struct request *rq, void *priv, bool reserved)
 	 * from blk_mq_check_expired().
 	 */
 	if (blk_mq_req_expired(rq, next))
-		blk_mq_rq_timed_out(rq, reserved);
+		blk_mq_rq_timed_out(rq);
 	return true;
 }
 
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index fd4cd5e68282..d6f5dcdce748 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -331,7 +331,7 @@ void bsg_remove_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(bsg_remove_queue);
 
-static enum blk_eh_timer_return bsg_timeout(struct request *rq, bool reserved)
+static enum blk_eh_timer_return bsg_timeout(struct request *rq)
 {
 	struct bsg_set *bset =
 		container_of(rq->q->tag_set, struct bsg_set, tag_set);
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index e116c6cf56f5..5073cb407500 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3357,12 +3357,11 @@ static int mtip_init_cmd(struct blk_mq_tag_set *set, struct request *rq,
 	return 0;
 }
 
-static enum blk_eh_timer_return mtip_cmd_timeout(struct request *req,
-								bool reserved)
+static enum blk_eh_timer_return mtip_cmd_timeout(struct request *req)
 {
 	struct driver_data *dd = req->q->queuedata;
 
-	if (reserved) {
+	if (blk_mq_is_reserved_rq(req)) {
 		struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req);
 
 		cmd->status = BLK_STS_TIMEOUT;
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 5c4c9c45c6ac..028f23c965df 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -393,8 +393,7 @@ static u32 req_to_nbd_cmd_type(struct request *req)
 	}
 }
 
-static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
-						 bool reserved)
+static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req)
 {
 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 	struct nbd_device *nbd = cmd->nbd;
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index d695ea29efa6..4e03a020ee3c 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -1578,7 +1578,7 @@ static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 	return nr;
 }
 
-static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
+static enum blk_eh_timer_return null_timeout_rq(struct request *rq)
 {
 	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 	struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index f824cfdab75a..fefaa901b50f 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -116,8 +116,7 @@ static enum blk_eh_timer_return mmc_cqe_timed_out(struct request *req)
 	}
 }
 
-static enum blk_eh_timer_return mmc_mq_timed_out(struct request *req,
-						 bool reserved)
+static enum blk_eh_timer_return mmc_mq_timed_out(struct request *req)
 {
 	struct request_queue *q = req->q;
 	struct mmc_queue *mq = q->queuedata;
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
index 2d23b7d41f7e..5c352d5d8ee6 100644
--- a/drivers/nvme/host/apple.c
+++ b/drivers/nvme/host/apple.c
@@ -862,8 +862,7 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown)
 	}
 }
 
-static enum blk_eh_timer_return apple_nvme_timeout(struct request *req,
-						   bool reserved)
+static enum blk_eh_timer_return apple_nvme_timeout(struct request *req)
 {
 	struct apple_nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct apple_nvme_queue *q = iod->q;
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index a96aa831684c..07fd6db5869c 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2565,8 +2565,7 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
 	nvme_reset_ctrl(&ctrl->ctrl);
 }
 
-static enum blk_eh_timer_return
-nvme_fc_timeout(struct request *rq, bool reserved)
+static enum blk_eh_timer_return nvme_fc_timeout(struct request *rq)
 {
 	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
 	struct nvme_fc_ctrl *ctrl = op->ctrl;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 247a74aba336..4232192e10dd 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1344,7 +1344,7 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
 		 "Try \"nvme_core.default_ps_max_latency_us=0 pcie_aspm=off\" and report a bug\n");
 }
 
-static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
+static enum blk_eh_timer_return nvme_timeout(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_queue *nvmeq = iod->nvmeq;
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 0fb7c8e7ab0b..a6eaf38b9646 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -2013,8 +2013,7 @@ static void nvme_rdma_complete_timed_out(struct request *rq)
 	nvmf_complete_timed_out_request(rq);
 }
 
-static enum blk_eh_timer_return
-nvme_rdma_timeout(struct request *rq, bool reserved)
+static enum blk_eh_timer_return nvme_rdma_timeout(struct request *rq)
 {
 	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
 	struct nvme_rdma_queue *queue = req->queue;
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index b81942fa5f95..ff502172accd 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2321,8 +2321,7 @@ static void nvme_tcp_complete_timed_out(struct request *rq)
 	nvmf_complete_timed_out_request(rq);
 }
 
-static enum blk_eh_timer_return
-nvme_tcp_timeout(struct request *rq, bool reserved)
+static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq)
 {
 	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 	struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index e8489331f12b..4df8bf6505fc 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -3145,7 +3145,7 @@ out:
  * BLK_EH_DONE if the request is handled or terminated
  *		      by the driver.
  */
-enum blk_eh_timer_return dasd_times_out(struct request *req, bool reserved)
+enum blk_eh_timer_return dasd_times_out(struct request *req)
 {
 	struct dasd_block *block = req->q->queuedata;
 	struct dasd_device *device;
diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h
index 83b918b84b4a..333a399f754e 100644
--- a/drivers/s390/block/dasd_int.h
+++ b/drivers/s390/block/dasd_int.h
@@ -795,7 +795,7 @@ void dasd_free_device(struct dasd_device *);
 struct dasd_block *dasd_alloc_block(void);
 void dasd_free_block(struct dasd_block *);
 
-enum blk_eh_timer_return dasd_times_out(struct request *req, bool reserved);
+enum blk_eh_timer_return dasd_times_out(struct request *req);
 
 void dasd_enable_device(struct dasd_device *);
 void dasd_set_target_state(struct dasd_device *, int);
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index a8b71b73a5a5..266ce414589c 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -318,7 +318,6 @@ void scsi_eh_scmd_add(struct scsi_cmnd *scmd)
 /**
  * scsi_timeout - Timeout function for normal scsi commands.
  * @req:	request that is timing out.
- * @reserved:	whether the request is a reserved request.
  *
  * Notes:
  *     We do not need to lock this.  There is the potential for a race
@@ -326,7 +325,7 @@ void scsi_eh_scmd_add(struct scsi_cmnd *scmd)
  *     normal completion function determines that the timer has already
  *     fired, then it mustn't do anything.
  */
-enum blk_eh_timer_return scsi_timeout(struct request *req, bool reserved)
+enum blk_eh_timer_return scsi_timeout(struct request *req)
 {
 	struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req);
 	enum blk_eh_timer_return rtn = BLK_EH_DONE;
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index 695d0c83ffe0..6eeaa0a7f86d 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -72,8 +72,7 @@ extern void scsi_exit_devinfo(void);
 
 /* scsi_error.c */
 extern void scmd_eh_abort_handler(struct work_struct *work);
-extern enum blk_eh_timer_return scsi_timeout(struct request *req,
-					     bool reserved);
+extern enum blk_eh_timer_return scsi_timeout(struct request *req);
 extern int scsi_error_handler(void *host);
 extern enum scsi_disposition scsi_decide_disposition(struct scsi_cmnd *cmd);
 extern void scsi_eh_wakeup(struct Scsi_Host *shost);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 7c62b7fabec7..c84c56d296fe 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -575,7 +575,7 @@ struct blk_mq_ops {
 	/**
 	 * @timeout: Called on request timeout.
 	 */
-	enum blk_eh_timer_return (*timeout)(struct request *, bool);
+	enum blk_eh_timer_return (*timeout)(struct request *);
 
 	/**
 	 * @poll: Called to poll for completion of a specific tag.
-- 
cgit 


From 2dd6532e9591f201e7571b30915db807603ab924 Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Wed, 6 Jul 2022 20:03:53 +0800
Subject: blk-mq: Drop 'reserved' arg of busy_tag_iter_fn

We no longer use the 'reserved' arg in busy_tag_iter_fn for any iter
function so it may be dropped.

Signed-off-by: John Garry <john.garry@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me> #nvme
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/1657109034-206040-6-git-send-email-john.garry@huawei.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-debugfs.c              |  2 +-
 block/blk-mq-tag.c                  |  7 +++----
 block/blk-mq.c                      | 10 ++++------
 drivers/block/mtip32xx/mtip32xx.c   |  4 ++--
 drivers/block/nbd.c                 |  2 +-
 drivers/infiniband/ulp/srp/ib_srp.c |  3 +--
 drivers/nvme/host/core.c            |  2 +-
 drivers/nvme/host/fc.c              |  3 +--
 drivers/nvme/host/nvme.h            |  2 +-
 drivers/scsi/aacraid/comminit.c     |  2 +-
 drivers/scsi/aacraid/linit.c        |  2 +-
 drivers/scsi/fnic/fnic_scsi.c       | 12 ++++--------
 drivers/scsi/hosts.c                | 14 ++++++--------
 drivers/scsi/mpi3mr/mpi3mr_os.c     | 16 ++++------------
 include/linux/blk-mq.h              |  2 +-
 include/scsi/scsi_host.h            |  2 +-
 16 files changed, 33 insertions(+), 52 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index b80fae7ab1d9..b11add9a95e2 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -375,7 +375,7 @@ struct show_busy_params {
  * e.g. due to a concurrent blk_mq_finish_request() call. Returns true to
  * keep iterating requests.
  */
-static bool hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
+static bool hctx_show_busy_rq(struct request *rq, void *data)
 {
 	const struct show_busy_params *params = data;
 
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 3cfffef1feb3..4e9b8ec55bda 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -283,7 +283,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 		return true;
 
 	if (rq->q == q && (!hctx || rq->mq_hctx == hctx))
-		ret = iter_data->fn(rq, iter_data->data, reserved);
+		ret = iter_data->fn(rq, iter_data->data);
 	blk_mq_put_rq_ref(rq);
 	return ret;
 }
@@ -354,7 +354,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 
 	if (!(iter_data->flags & BT_TAG_ITER_STARTED) ||
 	    blk_mq_request_started(rq))
-		ret = iter_data->fn(rq, iter_data->data, reserved);
+		ret = iter_data->fn(rq, iter_data->data);
 	if (!iter_static_rqs)
 		blk_mq_put_rq_ref(rq);
 	return ret;
@@ -444,8 +444,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
 }
 EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
 
-static bool blk_mq_tagset_count_completed_rqs(struct request *rq,
-		void *data, bool reserved)
+static bool blk_mq_tagset_count_completed_rqs(struct request *rq, void *data)
 {
 	unsigned *count = data;
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index cedbec36e907..63385742b8a8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -129,8 +129,7 @@ struct mq_inflight {
 	unsigned int inflight[2];
 };
 
-static bool blk_mq_check_inflight(struct request *rq, void *priv,
-				  bool reserved)
+static bool blk_mq_check_inflight(struct request *rq, void *priv)
 {
 	struct mq_inflight *mi = priv;
 
@@ -1400,8 +1399,7 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q,
 }
 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 
-static bool blk_mq_rq_inflight(struct request *rq, void *priv,
-			       bool reserved)
+static bool blk_mq_rq_inflight(struct request *rq, void *priv)
 {
 	/*
 	 * If we find a request that isn't idle we know the queue is busy
@@ -1470,7 +1468,7 @@ void blk_mq_put_rq_ref(struct request *rq)
 		__blk_mq_free_request(rq);
 }
 
-static bool blk_mq_check_expired(struct request *rq, void *priv, bool reserved)
+static bool blk_mq_check_expired(struct request *rq, void *priv)
 {
 	unsigned long *next = priv;
 
@@ -3289,7 +3287,7 @@ struct rq_iter_data {
 	bool has_rq;
 };
 
-static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
+static bool blk_mq_has_request(struct request *rq, void *data)
 {
 	struct rq_iter_data *iter_data = data;
 
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 5073cb407500..562725d222a7 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -2441,7 +2441,7 @@ static void mtip_softirq_done_fn(struct request *rq)
 	blk_mq_end_request(rq, cmd->status);
 }
 
-static bool mtip_abort_cmd(struct request *req, void *data, bool reserved)
+static bool mtip_abort_cmd(struct request *req, void *data)
 {
 	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req);
 	struct driver_data *dd = data;
@@ -2454,7 +2454,7 @@ static bool mtip_abort_cmd(struct request *req, void *data, bool reserved)
 	return true;
 }
 
-static bool mtip_queue_cmd(struct request *req, void *data, bool reserved)
+static bool mtip_queue_cmd(struct request *req, void *data)
 {
 	struct driver_data *dd = data;
 
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 028f23c965df..f5d098a148cb 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -879,7 +879,7 @@ static void recv_work(struct work_struct *work)
 	kfree(args);
 }
 
-static bool nbd_clear_req(struct request *req, void *data, bool reserved)
+static bool nbd_clear_req(struct request *req, void *data)
 {
 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 6058abf42ba7..7720ea270ed8 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -1282,8 +1282,7 @@ struct srp_terminate_context {
 	int scsi_result;
 };
 
-static bool srp_terminate_cmd(struct scsi_cmnd *scmnd, void *context_ptr,
-			      bool reserved)
+static bool srp_terminate_cmd(struct scsi_cmnd *scmnd, void *context_ptr)
 {
 	struct srp_terminate_context *context = context_ptr;
 	struct srp_target_port *target = context->srp_target;
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index b5b24998a5ab..9031d10c97dc 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -418,7 +418,7 @@ blk_status_t nvme_host_path_error(struct request *req)
 }
 EXPORT_SYMBOL_GPL(nvme_host_path_error);
 
-bool nvme_cancel_request(struct request *req, void *data, bool reserved)
+bool nvme_cancel_request(struct request *req, void *data)
 {
 	dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
 				"Cancelling I/O %d", req->tag);
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 07fd6db5869c..9987797620b6 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2456,8 +2456,7 @@ nvme_fc_nvme_ctrl_freed(struct nvme_ctrl *nctrl)
  * status. The done path will return the io request back to the block
  * layer with an error status.
  */
-static bool
-nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved)
+static bool nvme_fc_terminate_exchange(struct request *req, void *data)
 {
 	struct nvme_ctrl *nctrl = data;
 	struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 0da94b233fed..e4daa57f8bd5 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -697,7 +697,7 @@ static __always_inline void nvme_complete_batch(struct io_comp_batch *iob,
 }
 
 blk_status_t nvme_host_path_error(struct request *req);
-bool nvme_cancel_request(struct request *req, void *data, bool reserved);
+bool nvme_cancel_request(struct request *req, void *data);
 void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
 void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl);
 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
diff --git a/drivers/scsi/aacraid/comminit.c b/drivers/scsi/aacraid/comminit.c
index 940a6deab38f..bd99c5492b7d 100644
--- a/drivers/scsi/aacraid/comminit.c
+++ b/drivers/scsi/aacraid/comminit.c
@@ -272,7 +272,7 @@ static void aac_queue_init(struct aac_dev * dev, struct aac_queue * q, u32 *mem,
 	q->entries = qsize;
 }
 
-static bool wait_for_io_iter(struct scsi_cmnd *cmd, void *data, bool rsvd)
+static bool wait_for_io_iter(struct scsi_cmnd *cmd, void *data)
 {
 	int *active = data;
 
diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
index 9c27bc37e5de..5ba5c18b77b4 100644
--- a/drivers/scsi/aacraid/linit.c
+++ b/drivers/scsi/aacraid/linit.c
@@ -633,7 +633,7 @@ struct fib_count_data {
 	int krlcnt;
 };
 
-static bool fib_count_iter(struct scsi_cmnd *scmnd, void *data, bool reserved)
+static bool fib_count_iter(struct scsi_cmnd *scmnd, void *data)
 {
 	struct fib_count_data *fib_count = data;
 
diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c
index e7b7f6d73429..77a4d9f8aa83 100644
--- a/drivers/scsi/fnic/fnic_scsi.c
+++ b/drivers/scsi/fnic/fnic_scsi.c
@@ -1350,8 +1350,7 @@ int fnic_wq_copy_cmpl_handler(struct fnic *fnic, int copy_work_to_do)
 	return wq_work_done;
 }
 
-static bool fnic_cleanup_io_iter(struct scsi_cmnd *sc, void *data,
-				 bool reserved)
+static bool fnic_cleanup_io_iter(struct scsi_cmnd *sc, void *data)
 {
 	const int tag = scsi_cmd_to_rq(sc)->tag;
 	struct fnic *fnic = data;
@@ -1548,8 +1547,7 @@ struct fnic_rport_abort_io_iter_data {
 	int term_cnt;
 };
 
-static bool fnic_rport_abort_io_iter(struct scsi_cmnd *sc, void *data,
-				     bool reserved)
+static bool fnic_rport_abort_io_iter(struct scsi_cmnd *sc, void *data)
 {
 	struct fnic_rport_abort_io_iter_data *iter_data = data;
 	struct fnic *fnic = iter_data->fnic;
@@ -2003,8 +2001,7 @@ struct fnic_pending_aborts_iter_data {
 	int ret;
 };
 
-static bool fnic_pending_aborts_iter(struct scsi_cmnd *sc,
-				     void *data, bool reserved)
+static bool fnic_pending_aborts_iter(struct scsi_cmnd *sc, void *data)
 {
 	struct fnic_pending_aborts_iter_data *iter_data = data;
 	struct fnic *fnic = iter_data->fnic;
@@ -2668,8 +2665,7 @@ call_fc_exch_mgr_reset:
 
 }
 
-static bool fnic_abts_pending_iter(struct scsi_cmnd *sc, void *data,
-				   bool reserved)
+static bool fnic_abts_pending_iter(struct scsi_cmnd *sc, void *data)
 {
 	struct fnic_pending_aborts_iter_data *iter_data = data;
 	struct fnic *fnic = iter_data->fnic;
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 8352f90d997d..315c7ac730e9 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -566,8 +566,7 @@ struct Scsi_Host *scsi_host_get(struct Scsi_Host *shost)
 }
 EXPORT_SYMBOL(scsi_host_get);
 
-static bool scsi_host_check_in_flight(struct request *rq, void *data,
-				      bool reserved)
+static bool scsi_host_check_in_flight(struct request *rq, void *data)
 {
 	int *count = data;
 	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);
@@ -662,7 +661,7 @@ void scsi_flush_work(struct Scsi_Host *shost)
 }
 EXPORT_SYMBOL_GPL(scsi_flush_work);
 
-static bool complete_all_cmds_iter(struct request *rq, void *data, bool rsvd)
+static bool complete_all_cmds_iter(struct request *rq, void *data)
 {
 	struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq);
 	enum scsi_host_status status = *(enum scsi_host_status *)data;
@@ -693,17 +692,16 @@ void scsi_host_complete_all_commands(struct Scsi_Host *shost,
 EXPORT_SYMBOL_GPL(scsi_host_complete_all_commands);
 
 struct scsi_host_busy_iter_data {
-	bool (*fn)(struct scsi_cmnd *, void *, bool);
+	bool (*fn)(struct scsi_cmnd *, void *);
 	void *priv;
 };
 
-static bool __scsi_host_busy_iter_fn(struct request *req, void *priv,
-				   bool reserved)
+static bool __scsi_host_busy_iter_fn(struct request *req, void *priv)
 {
 	struct scsi_host_busy_iter_data *iter_data = priv;
 	struct scsi_cmnd *sc = blk_mq_rq_to_pdu(req);
 
-	return iter_data->fn(sc, iter_data->priv, reserved);
+	return iter_data->fn(sc, iter_data->priv);
 }
 
 /**
@@ -716,7 +714,7 @@ static bool __scsi_host_busy_iter_fn(struct request *req, void *priv,
  * ithas to be provided by the caller
  **/
 void scsi_host_busy_iter(struct Scsi_Host *shost,
-			 bool (*fn)(struct scsi_cmnd *, void *, bool),
+			 bool (*fn)(struct scsi_cmnd *, void *),
 			 void *priv)
 {
 	struct scsi_host_busy_iter_data iter_data = {
diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c
index d8c195b7ca57..59a18769a4fe 100644
--- a/drivers/scsi/mpi3mr/mpi3mr_os.c
+++ b/drivers/scsi/mpi3mr/mpi3mr_os.c
@@ -381,14 +381,12 @@ void mpi3mr_invalidate_devhandles(struct mpi3mr_ioc *mrioc)
  * mpi3mr_print_scmd - print individual SCSI command
  * @rq: Block request
  * @data: Adapter instance reference
- * @reserved: N/A. Currently not used
  *
  * Print the SCSI command details if it is in LLD scope.
  *
  * Return: true always.
  */
-static bool mpi3mr_print_scmd(struct request *rq,
-	void *data, bool reserved)
+static bool mpi3mr_print_scmd(struct request *rq, void *data)
 {
 	struct mpi3mr_ioc *mrioc = (struct mpi3mr_ioc *)data;
 	struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq);
@@ -412,7 +410,6 @@ out:
  * mpi3mr_flush_scmd - Flush individual SCSI command
  * @rq: Block request
  * @data: Adapter instance reference
- * @reserved: N/A. Currently not used
  *
  * Return the SCSI command to the upper layers if it is in LLD
  * scope.
@@ -420,8 +417,7 @@ out:
  * Return: true always.
  */
 
-static bool mpi3mr_flush_scmd(struct request *rq,
-	void *data, bool reserved)
+static bool mpi3mr_flush_scmd(struct request *rq, void *data)
 {
 	struct mpi3mr_ioc *mrioc = (struct mpi3mr_ioc *)data;
 	struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq);
@@ -451,7 +447,6 @@ out:
  * mpi3mr_count_dev_pending - Count commands pending for a lun
  * @rq: Block request
  * @data: SCSI device reference
- * @reserved: Unused
  *
  * This is an iterator function called for each SCSI command in
  * a host and if the command is pending in the LLD for the
@@ -461,8 +456,7 @@ out:
  * Return: true always.
  */
 
-static bool mpi3mr_count_dev_pending(struct request *rq,
-	void *data, bool reserved)
+static bool mpi3mr_count_dev_pending(struct request *rq, void *data)
 {
 	struct scsi_device *sdev = (struct scsi_device *)data;
 	struct mpi3mr_sdev_priv_data *sdev_priv_data = sdev->hostdata;
@@ -485,7 +479,6 @@ out:
  * mpi3mr_count_tgt_pending - Count commands pending for target
  * @rq: Block request
  * @data: SCSI target reference
- * @reserved: Unused
  *
  * This is an iterator function called for each SCSI command in
  * a host and if the command is pending in the LLD for the
@@ -495,8 +488,7 @@ out:
  * Return: true always.
  */
 
-static bool mpi3mr_count_tgt_pending(struct request *rq,
-	void *data, bool reserved)
+static bool mpi3mr_count_tgt_pending(struct request *rq, void *data)
 {
 	struct scsi_target *starget = (struct scsi_target *)data;
 	struct mpi3mr_stgt_priv_data *stgt_priv_data = starget->hostdata;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index c84c56d296fe..810a24884f7e 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -520,7 +520,7 @@ struct blk_mq_queue_data {
 	bool last;
 };
 
-typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);
+typedef bool (busy_tag_iter_fn)(struct request *, void *);
 
 /**
  * struct blk_mq_ops - Callback functions that implements block driver
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 667d889b92b5..65082ecdd557 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -786,7 +786,7 @@ extern int scsi_host_block(struct Scsi_Host *shost);
 extern int scsi_host_unblock(struct Scsi_Host *shost, int new_state);
 
 void scsi_host_busy_iter(struct Scsi_Host *,
-			 bool (*fn)(struct scsi_cmnd *, void *, bool), void *priv);
+			 bool (*fn)(struct scsi_cmnd *, void *), void *priv);
 
 struct class_container;
 
-- 
cgit 


From f1a8bbd1100d9cd117bc8b7fc0903982bbaf474f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Jul 2022 09:03:35 +0200
Subject: block: remove a superflous ifdef in blkdev.h

It doesn't hurt to always have the blk_zone_cond_str prototype, and the
two inlines can also be defined unconditionally.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20220706070350.1703384-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b9a94c53c6cd..270cd0c55292 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -899,8 +899,6 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 	return bdev->bd_queue;	/* this is never NULL */
 }
 
-#ifdef CONFIG_BLK_DEV_ZONED
-
 /* Helper to convert BLK_ZONE_ZONE_XXX to its string format XXX */
 const char *blk_zone_cond_str(enum blk_zone_cond zone_cond);
 
@@ -915,7 +913,6 @@ static inline unsigned int bio_zone_is_seq(struct bio *bio)
 	return blk_queue_zone_is_seq(bdev_get_queue(bio->bi_bdev),
 				     bio->bi_iter.bi_sector);
 }
-#endif /* CONFIG_BLK_DEV_ZONED */
 
 /*
  * Return how much of the chunk is left to be used for I/O at a given offset.
-- 
cgit 


From 6b2bd274744e6454ba7bbbe6a09b44866f2f414a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Jul 2022 09:03:40 +0200
Subject: block: pass a gendisk to blk_queue_set_zoned

Prepare for storing the zone related field in struct gendisk instead
of struct request_queue.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20220706070350.1703384-7-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-settings.c           | 9 +++++----
 block/partitions/core.c        | 2 +-
 drivers/block/null_blk/zoned.c | 2 +-
 drivers/nvme/host/zns.c        | 2 +-
 drivers/scsi/sd.c              | 6 +++---
 drivers/scsi/sd_zbc.c          | 2 +-
 include/linux/blkdev.h         | 2 +-
 7 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 6ccceb421ed2..35b7bba306a8 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -893,18 +893,19 @@ static bool disk_has_partitions(struct gendisk *disk)
 }
 
 /**
- * blk_queue_set_zoned - configure a disk queue zoned model.
+ * disk_set_zoned - configure the zoned model for a disk
  * @disk:	the gendisk of the queue to configure
  * @model:	the zoned model to set
  *
- * Set the zoned model of the request queue of @disk according to @model.
+ * Set the zoned model of @disk to @model.
+ *
  * When @model is BLK_ZONED_HM (host managed), this should be called only
  * if zoned block device support is enabled (CONFIG_BLK_DEV_ZONED option).
  * If @model specifies BLK_ZONED_HA (host aware), the effective model used
  * depends on CONFIG_BLK_DEV_ZONED settings and on the existence of partitions
  * on the disk.
  */
-void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
+void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
 {
 	struct request_queue *q = disk->queue;
 
@@ -948,7 +949,7 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
 		blk_queue_clear_zone_settings(q);
 	}
 }
-EXPORT_SYMBOL_GPL(blk_queue_set_zoned);
+EXPORT_SYMBOL_GPL(disk_set_zoned);
 
 int bdev_alignment_offset(struct block_device *bdev)
 {
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 7dc487f5b03c..1a45b1dd6491 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -330,7 +330,7 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
 	case BLK_ZONED_HA:
 		pr_info("%s: disabling host aware zoned block device support due to partitions\n",
 			disk->disk_name);
-		blk_queue_set_zoned(disk, BLK_ZONED_NONE);
+		disk_set_zoned(disk, BLK_ZONED_NONE);
 		break;
 	case BLK_ZONED_NONE:
 		break;
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index 2fdd7b20c224..b47bbd114058 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -159,7 +159,7 @@ int null_register_zoned_dev(struct nullb *nullb)
 	struct nullb_device *dev = nullb->dev;
 	struct request_queue *q = nullb->q;
 
-	blk_queue_set_zoned(nullb->disk, BLK_ZONED_HM);
+	disk_set_zoned(nullb->disk, BLK_ZONED_HM);
 	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
 	blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
 
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index 9f81beb4df4e..0ed15c2fd56d 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -109,7 +109,7 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
 		goto free_data;
 	}
 
-	blk_queue_set_zoned(ns->disk, BLK_ZONED_HM);
+	disk_set_zoned(ns->disk, BLK_ZONED_HM);
 	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
 	blk_queue_max_open_zones(q, le32_to_cpu(id->mor) + 1);
 	blk_queue_max_active_zones(q, le32_to_cpu(id->mar) + 1);
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index cb587e488601..eb02d939dd44 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -2934,15 +2934,15 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp)
 
 	if (sdkp->device->type == TYPE_ZBC) {
 		/* Host-managed */
-		blk_queue_set_zoned(sdkp->disk, BLK_ZONED_HM);
+		disk_set_zoned(sdkp->disk, BLK_ZONED_HM);
 	} else {
 		sdkp->zoned = zoned;
 		if (sdkp->zoned == 1) {
 			/* Host-aware */
-			blk_queue_set_zoned(sdkp->disk, BLK_ZONED_HA);
+			disk_set_zoned(sdkp->disk, BLK_ZONED_HA);
 		} else {
 			/* Regular disk or drive managed disk */
-			blk_queue_set_zoned(sdkp->disk, BLK_ZONED_NONE);
+			disk_set_zoned(sdkp->disk, BLK_ZONED_NONE);
 		}
 	}
 
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 6acc4f406eb8..0f5823b67468 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -929,7 +929,7 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE])
 		/*
 		 * This can happen for a host aware disk with partitions.
 		 * The block device zone model was already cleared by
-		 * blk_queue_set_zoned(). Only free the scsi disk zone
+		 * disk_set_zoned(). Only free the scsi disk zone
 		 * information and exit early.
 		 */
 		sd_zbc_free_zone_info(sdkp);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 270cd0c55292..416faa013782 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -291,7 +291,7 @@ struct queue_limits {
 typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,
 			       void *data);
 
-void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model);
+void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model);
 
 #ifdef CONFIG_BLK_DEV_ZONED
 
-- 
cgit 


From 1dc0172027b0aa09823b430e395b1116d2745f36 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Jul 2022 09:03:43 +0200
Subject: block: remove queue_max_open_zones and queue_max_active_zones

Always use the bdev based helpers instead.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20220706070350.1703384-10-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-sysfs.c      |  4 ++--
 include/linux/blkdev.h | 37 ++++++++++---------------------------
 2 files changed, 12 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 7590810cf13f..5ce72345ac66 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -330,12 +330,12 @@ static ssize_t queue_nr_zones_show(struct request_queue *q, char *page)
 
 static ssize_t queue_max_open_zones_show(struct request_queue *q, char *page)
 {
-	return queue_var_show(queue_max_open_zones(q), page);
+	return queue_var_show(bdev_max_open_zones(q->disk->part0), page);
 }
 
 static ssize_t queue_max_active_zones_show(struct request_queue *q, char *page)
 {
-	return queue_var_show(queue_max_active_zones(q), page);
+	return queue_var_show(bdev_max_active_zones(q->disk->part0), page);
 }
 
 static ssize_t queue_nomerges_show(struct request_queue *q, char *page)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 416faa013782..7d4105d23b0a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -702,21 +702,22 @@ static inline void blk_queue_max_open_zones(struct request_queue *q,
 	q->max_open_zones = max_open_zones;
 }
 
-static inline unsigned int queue_max_open_zones(const struct request_queue *q)
-{
-	return q->max_open_zones;
-}
-
 static inline void blk_queue_max_active_zones(struct request_queue *q,
 		unsigned int max_active_zones)
 {
 	q->max_active_zones = max_active_zones;
 }
 
-static inline unsigned int queue_max_active_zones(const struct request_queue *q)
+static inline unsigned int bdev_max_open_zones(struct block_device *bdev)
+{
+	return bdev->bd_disk->queue->max_open_zones;
+}
+
+static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
 {
-	return q->max_active_zones;
+	return bdev->bd_disk->queue->max_active_zones;
 }
+
 #else /* CONFIG_BLK_DEV_ZONED */
 static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
 {
@@ -732,11 +733,11 @@ static inline unsigned int blk_queue_zone_no(struct request_queue *q,
 {
 	return 0;
 }
-static inline unsigned int queue_max_open_zones(const struct request_queue *q)
+static inline unsigned int bdev_max_open_zones(struct block_device *bdev)
 {
 	return 0;
 }
-static inline unsigned int queue_max_active_zones(const struct request_queue *q)
+static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
 {
 	return 0;
 }
@@ -1314,24 +1315,6 @@ static inline sector_t bdev_zone_sectors(struct block_device *bdev)
 	return 0;
 }
 
-static inline unsigned int bdev_max_open_zones(struct block_device *bdev)
-{
-	struct request_queue *q = bdev_get_queue(bdev);
-
-	if (q)
-		return queue_max_open_zones(q);
-	return 0;
-}
-
-static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
-{
-	struct request_queue *q = bdev_get_queue(bdev);
-
-	if (q)
-		return queue_max_active_zones(q);
-	return 0;
-}
-
 static inline int queue_dma_alignment(const struct request_queue *q)
 {
 	return q ? q->dma_alignment : 511;
-- 
cgit 


From 982977df48179c8c690868f398051074e68eef0f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Jul 2022 09:03:44 +0200
Subject: block: pass a gendisk to blk_queue_max_open_zones and
 blk_queue_max_active_zones

Switch to a gendisk based API in preparation for moving all zone related
fields from the request_queue to the gendisk.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20220706070350.1703384-11-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/null_blk/zoned.c | 4 ++--
 drivers/nvme/host/zns.c        | 4 ++--
 drivers/scsi/sd_zbc.c          | 6 +++---
 include/linux/blkdev.h         | 8 ++++----
 4 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index b47bbd114058..576ab3ed082a 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -174,8 +174,8 @@ int null_register_zoned_dev(struct nullb *nullb)
 	}
 
 	blk_queue_max_zone_append_sectors(q, dev->zone_size_sects);
-	blk_queue_max_open_zones(q, dev->zone_max_open);
-	blk_queue_max_active_zones(q, dev->zone_max_active);
+	disk_set_max_open_zones(nullb->disk, dev->zone_max_open);
+	disk_set_max_active_zones(nullb->disk, dev->zone_max_active);
 
 	return 0;
 }
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index 0ed15c2fd56d..12316ab51bda 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -111,8 +111,8 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
 
 	disk_set_zoned(ns->disk, BLK_ZONED_HM);
 	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
-	blk_queue_max_open_zones(q, le32_to_cpu(id->mor) + 1);
-	blk_queue_max_active_zones(q, le32_to_cpu(id->mar) + 1);
+	disk_set_max_open_zones(ns->disk, le32_to_cpu(id->mor) + 1);
+	disk_set_max_active_zones(ns->disk, le32_to_cpu(id->mar) + 1);
 free_data:
 	kfree(id);
 	return status;
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 0f5823b67468..b4106f899734 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -950,10 +950,10 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE])
 	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
 	blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
 	if (sdkp->zones_max_open == U32_MAX)
-		blk_queue_max_open_zones(q, 0);
+		disk_set_max_open_zones(disk, 0);
 	else
-		blk_queue_max_open_zones(q, sdkp->zones_max_open);
-	blk_queue_max_active_zones(q, 0);
+		disk_set_max_open_zones(disk, sdkp->zones_max_open);
+	disk_set_max_active_zones(disk, 0);
 	nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks);
 
 	/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7d4105d23b0a..c05e1cc05c26 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -696,16 +696,16 @@ static inline bool blk_queue_zone_is_seq(struct request_queue *q,
 	return !test_bit(blk_queue_zone_no(q, sector), q->conv_zones_bitmap);
 }
 
-static inline void blk_queue_max_open_zones(struct request_queue *q,
+static inline void disk_set_max_open_zones(struct gendisk *disk,
 		unsigned int max_open_zones)
 {
-	q->max_open_zones = max_open_zones;
+	disk->queue->max_open_zones = max_open_zones;
 }
 
-static inline void blk_queue_max_active_zones(struct request_queue *q,
+static inline void disk_set_max_active_zones(struct gendisk *disk,
 		unsigned int max_active_zones)
 {
-	q->max_active_zones = max_active_zones;
+	disk->queue->max_active_zones = max_active_zones;
 }
 
 static inline unsigned int bdev_max_open_zones(struct block_device *bdev)
-- 
cgit 


From b623e347323f6464b20fb0d899a0a73522ed8f6c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Jul 2022 09:03:45 +0200
Subject: block: replace blkdev_nr_zones with bdev_nr_zones

Pass a block_device instead of a request_queue as that is what most
callers have at hand.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Acked-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Link: https://lore.kernel.org/r/20220706070350.1703384-12-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c              | 15 ++++++++-------
 block/ioctl.c                  |  2 +-
 drivers/block/null_blk/zoned.c |  2 +-
 drivers/md/dm-zone.c           |  2 +-
 drivers/md/dm-zoned-target.c   |  5 ++---
 drivers/nvme/target/zns.c      |  6 +++---
 fs/zonefs/super.c              | 17 ++++++++---------
 include/linux/blkdev.h         |  4 ++--
 8 files changed, 26 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 0d431394cf90..2dec25d8aa3b 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -108,21 +108,22 @@ void __blk_req_zone_write_unlock(struct request *rq)
 EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
 
 /**
- * blkdev_nr_zones - Get number of zones
- * @disk:	Target gendisk
+ * bdev_nr_zones - Get number of zones
+ * @bdev:	Target device
  *
  * Return the total number of zones of a zoned block device.  For a block
  * device without zone capabilities, the number of zones is always 0.
  */
-unsigned int blkdev_nr_zones(struct gendisk *disk)
+unsigned int bdev_nr_zones(struct block_device *bdev)
 {
-	sector_t zone_sectors = blk_queue_zone_sectors(disk->queue);
+	sector_t zone_sectors = bdev_zone_sectors(bdev);
 
-	if (!blk_queue_is_zoned(disk->queue))
+	if (!bdev_is_zoned(bdev))
 		return 0;
-	return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors);
+	return (bdev_nr_sectors(bdev) + zone_sectors - 1) >>
+		ilog2(zone_sectors);
 }
-EXPORT_SYMBOL_GPL(blkdev_nr_zones);
+EXPORT_SYMBOL_GPL(bdev_nr_zones);
 
 /**
  * blkdev_report_zones - Get zones information
diff --git a/block/ioctl.c b/block/ioctl.c
index 46949f1b0dba..60121e89052b 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -495,7 +495,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
 	case BLKGETZONESZ:
 		return put_uint(argp, bdev_zone_sectors(bdev));
 	case BLKGETNRZONES:
-		return put_uint(argp, blkdev_nr_zones(bdev->bd_disk));
+		return put_uint(argp, bdev_nr_zones(bdev));
 	case BLKROGET:
 		return put_int(argp, bdev_read_only(bdev) != 0);
 	case BLKSSZGET: /* get block device logical block size */
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index 576ab3ed082a..e62c52e96425 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -170,7 +170,7 @@ int null_register_zoned_dev(struct nullb *nullb)
 			return ret;
 	} else {
 		blk_queue_chunk_sectors(q, dev->zone_size_sects);
-		q->nr_zones = blkdev_nr_zones(nullb->disk);
+		q->nr_zones = bdev_nr_zones(nullb->disk->part0);
 	}
 
 	blk_queue_max_zone_append_sectors(q, dev->zone_size_sects);
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
index ae616b87c91a..6d105abe1241 100644
--- a/drivers/md/dm-zone.c
+++ b/drivers/md/dm-zone.c
@@ -301,7 +301,7 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q)
 	 * correct value to be exposed in sysfs queue/nr_zones.
 	 */
 	WARN_ON_ONCE(queue_is_mq(q));
-	q->nr_zones = blkdev_nr_zones(md->disk);
+	q->nr_zones = bdev_nr_zones(md->disk->part0);
 
 	/* Check if zone append is natively supported */
 	if (dm_table_supports_zone_append(t)) {
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 0ec5d8b9b1a4..6ba6ef44b00e 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -793,8 +793,7 @@ static int dmz_fixup_devices(struct dm_target *ti)
 			}
 			zone_nr_sectors = blk_queue_zone_sectors(q);
 			zoned_dev->zone_nr_sectors = zone_nr_sectors;
-			zoned_dev->nr_zones =
-				blkdev_nr_zones(zoned_dev->bdev->bd_disk);
+			zoned_dev->nr_zones = bdev_nr_zones(zoned_dev->bdev);
 		}
 	} else {
 		reg_dev = NULL;
@@ -805,7 +804,7 @@ static int dmz_fixup_devices(struct dm_target *ti)
 		}
 		q = bdev_get_queue(zoned_dev->bdev);
 		zoned_dev->zone_nr_sectors = blk_queue_zone_sectors(q);
-		zoned_dev->nr_zones = blkdev_nr_zones(zoned_dev->bdev->bd_disk);
+		zoned_dev->nr_zones = bdev_nr_zones(zoned_dev->bdev);
 	}
 
 	if (reg_dev) {
diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c
index 82b61acf7a72..c4c99b832daf 100644
--- a/drivers/nvme/target/zns.c
+++ b/drivers/nvme/target/zns.c
@@ -60,7 +60,7 @@ bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
 	if (ns->bdev->bd_disk->queue->conv_zones_bitmap)
 		return false;
 
-	ret = blkdev_report_zones(ns->bdev, 0, blkdev_nr_zones(bd_disk),
+	ret = blkdev_report_zones(ns->bdev, 0, bdev_nr_zones(ns->bdev),
 				  validate_conv_zones_cb, NULL);
 	if (ret < 0)
 		return false;
@@ -241,7 +241,7 @@ static unsigned long nvmet_req_nr_zones_from_slba(struct nvmet_req *req)
 {
 	unsigned int sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba);
 
-	return blkdev_nr_zones(req->ns->bdev->bd_disk) -
+	return bdev_nr_zones(req->ns->bdev) -
 		(sect >> ilog2(bdev_zone_sectors(req->ns->bdev)));
 }
 
@@ -386,7 +386,7 @@ static int zmgmt_send_scan_cb(struct blk_zone *z, unsigned i, void *d)
 static u16 nvmet_bdev_zone_mgmt_emulate_all(struct nvmet_req *req)
 {
 	struct block_device *bdev = req->ns->bdev;
-	unsigned int nr_zones = blkdev_nr_zones(bdev->bd_disk);
+	unsigned int nr_zones = bdev_nr_zones(bdev);
 	struct request_queue *q = bdev_get_queue(bdev);
 	struct bio *bio = NULL;
 	sector_t sector = 0;
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 053299758deb..9c0eef1ff32a 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -1394,7 +1394,7 @@ static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode,
 {
 	struct super_block *sb = parent->i_sb;
 
-	inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk) + type + 1;
+	inode->i_ino = bdev_nr_zones(sb->s_bdev) + type + 1;
 	inode_init_owner(&init_user_ns, inode, parent, S_IFDIR | 0555);
 	inode->i_op = &zonefs_dir_inode_operations;
 	inode->i_fop = &simple_dir_operations;
@@ -1540,7 +1540,7 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
 	/*
 	 * The first zone contains the super block: skip it.
 	 */
-	end = zd->zones + blkdev_nr_zones(sb->s_bdev->bd_disk);
+	end = zd->zones + bdev_nr_zones(sb->s_bdev);
 	for (zone = &zd->zones[1]; zone < end; zone = next) {
 
 		next = zone + 1;
@@ -1635,8 +1635,8 @@ static int zonefs_get_zone_info(struct zonefs_zone_data *zd)
 	struct block_device *bdev = zd->sb->s_bdev;
 	int ret;
 
-	zd->zones = kvcalloc(blkdev_nr_zones(bdev->bd_disk),
-			     sizeof(struct blk_zone), GFP_KERNEL);
+	zd->zones = kvcalloc(bdev_nr_zones(bdev), sizeof(struct blk_zone),
+			     GFP_KERNEL);
 	if (!zd->zones)
 		return -ENOMEM;
 
@@ -1648,9 +1648,9 @@ static int zonefs_get_zone_info(struct zonefs_zone_data *zd)
 		return ret;
 	}
 
-	if (ret != blkdev_nr_zones(bdev->bd_disk)) {
+	if (ret != bdev_nr_zones(bdev)) {
 		zonefs_err(zd->sb, "Invalid zone report (%d/%u zones)\n",
-			   ret, blkdev_nr_zones(bdev->bd_disk));
+			   ret, bdev_nr_zones(bdev));
 		return -EIO;
 	}
 
@@ -1816,8 +1816,7 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
 	if (ret)
 		goto cleanup;
 
-	zonefs_info(sb, "Mounting %u zones",
-		    blkdev_nr_zones(sb->s_bdev->bd_disk));
+	zonefs_info(sb, "Mounting %u zones", bdev_nr_zones(sb->s_bdev));
 
 	if (!sbi->s_max_wro_seq_files &&
 	    !sbi->s_max_active_seq_files &&
@@ -1833,7 +1832,7 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
 	if (!inode)
 		goto cleanup;
 
-	inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk);
+	inode->i_ino = bdev_nr_zones(sb->s_bdev);
 	inode->i_mode = S_IFDIR | 0555;
 	inode->i_ctime = inode->i_mtime = inode->i_atime = current_time(inode);
 	inode->i_op = &zonefs_dir_inode_operations;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c05e1cc05c26..fa2757ef4a84 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -298,7 +298,7 @@ void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model);
 #define BLK_ALL_ZONES  ((unsigned int)-1)
 int blkdev_report_zones(struct block_device *bdev, sector_t sector,
 			unsigned int nr_zones, report_zones_cb cb, void *data);
-unsigned int blkdev_nr_zones(struct gendisk *disk);
+unsigned int bdev_nr_zones(struct block_device *bdev);
 extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
 			    sector_t sectors, sector_t nr_sectors,
 			    gfp_t gfp_mask);
@@ -312,7 +312,7 @@ extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
 
 #else /* CONFIG_BLK_DEV_ZONED */
 
-static inline unsigned int blkdev_nr_zones(struct gendisk *disk)
+static inline unsigned int bdev_nr_zones(struct block_device *bdev)
 {
 	return 0;
 }
-- 
cgit 


From de71973c2951cb2ce4b46560f021f03b15906408 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Jul 2022 09:03:49 +0200
Subject: block: remove blk_queue_zone_sectors

Always use bdev_zone_sectors instead.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20220706070350.1703384-16-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/dm-table.c  |  4 +---
 drivers/md/dm-zone.c   | 10 ++++++----
 include/linux/blkdev.h | 11 +++--------
 3 files changed, 10 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index b36b528e56cf..df904b7e95ce 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1620,13 +1620,11 @@ static bool dm_table_supports_zoned_model(struct dm_table *t,
 static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev *dev,
 					   sector_t start, sector_t len, void *data)
 {
-	struct request_queue *q = bdev_get_queue(dev->bdev);
 	unsigned int *zone_sectors = data;
 
 	if (!bdev_is_zoned(dev->bdev))
 		return 0;
-
-	return blk_queue_zone_sectors(q) != *zone_sectors;
+	return bdev_zone_sectors(dev->bdev) != *zone_sectors;
 }
 
 /*
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
index 6d105abe1241..842c31019b51 100644
--- a/drivers/md/dm-zone.c
+++ b/drivers/md/dm-zone.c
@@ -334,7 +334,7 @@ static int dm_update_zone_wp_offset_cb(struct blk_zone *zone, unsigned int idx,
 static int dm_update_zone_wp_offset(struct mapped_device *md, unsigned int zno,
 				    unsigned int *wp_ofst)
 {
-	sector_t sector = zno * blk_queue_zone_sectors(md->queue);
+	sector_t sector = zno * bdev_zone_sectors(md->disk->part0);
 	unsigned int noio_flag;
 	struct dm_table *t;
 	int srcu_idx, ret;
@@ -373,7 +373,7 @@ struct orig_bio_details {
 static bool dm_zone_map_bio_begin(struct mapped_device *md,
 				  unsigned int zno, struct bio *clone)
 {
-	sector_t zsectors = blk_queue_zone_sectors(md->queue);
+	sector_t zsectors = bdev_zone_sectors(md->disk->part0);
 	unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);
 
 	/*
@@ -443,7 +443,7 @@ static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, unsigned int z
 		return BLK_STS_OK;
 	case REQ_OP_ZONE_FINISH:
 		WRITE_ONCE(md->zwp_offset[zno],
-			   blk_queue_zone_sectors(md->queue));
+			   bdev_zone_sectors(md->disk->part0));
 		return BLK_STS_OK;
 	case REQ_OP_WRITE_ZEROES:
 	case REQ_OP_WRITE:
@@ -593,6 +593,7 @@ void dm_zone_endio(struct dm_io *io, struct bio *clone)
 {
 	struct mapped_device *md = io->md;
 	struct request_queue *q = md->queue;
+	struct gendisk *disk = md->disk;
 	struct bio *orig_bio = io->orig_bio;
 	unsigned int zwp_offset;
 	unsigned int zno;
@@ -608,7 +609,8 @@ void dm_zone_endio(struct dm_io *io, struct bio *clone)
 		 */
 		if (clone->bi_status == BLK_STS_OK &&
 		    bio_op(clone) == REQ_OP_ZONE_APPEND) {
-			sector_t mask = (sector_t)blk_queue_zone_sectors(q) - 1;
+			sector_t mask =
+				(sector_t)bdev_zone_sectors(disk->part0) - 1;
 
 			orig_bio->bi_iter.bi_sector +=
 				clone->bi_iter.bi_sector & mask;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fa2757ef4a84..21b97f7115dc 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -667,11 +667,6 @@ static inline bool blk_queue_is_zoned(struct request_queue *q)
 	}
 }
 
-static inline sector_t blk_queue_zone_sectors(struct request_queue *q)
-{
-	return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
-}
-
 #ifdef CONFIG_BLK_DEV_ZONED
 static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
 {
@@ -1310,9 +1305,9 @@ static inline sector_t bdev_zone_sectors(struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
 
-	if (q)
-		return blk_queue_zone_sectors(q);
-	return 0;
+	if (!blk_queue_is_zoned(q))
+		return 0;
+	return q->limits.chunk_sectors;
 }
 
 static inline int queue_dma_alignment(const struct request_queue *q)
-- 
cgit 


From d86e716aa40643e3eb8c69fab3a198146bf76dd6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Jul 2022 09:03:50 +0200
Subject: block: move zone related fields to struct gendisk

Move the zone related fields that are currently stored in
struct request_queue to struct gendisk as these are part of the highlevel
block layer API and are only used for non-passthrough I/O that requires
the gendisk.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20220706070350.1703384-17-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-debugfs-zoned.c   |  6 +--
 block/blk-sysfs.c              |  2 +-
 block/blk-zoned.c              | 45 ++++++++++-----------
 drivers/block/null_blk/zoned.c |  2 +-
 drivers/md/dm-zone.c           | 74 ++++++++++++++++------------------
 drivers/nvme/host/multipath.c  |  2 +-
 drivers/nvme/target/zns.c      |  4 +-
 drivers/scsi/sd_zbc.c          |  2 +-
 include/linux/blk-mq.h         |  8 ++--
 include/linux/blkdev.h         | 91 +++++++++++++++++++-----------------------
 10 files changed, 111 insertions(+), 125 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c
index 038cb627c868..a77b099c34b7 100644
--- a/block/blk-mq-debugfs-zoned.c
+++ b/block/blk-mq-debugfs-zoned.c
@@ -11,11 +11,11 @@ int queue_zone_wlock_show(void *data, struct seq_file *m)
 	struct request_queue *q = data;
 	unsigned int i;
 
-	if (!q->seq_zones_wlock)
+	if (!q->disk->seq_zones_wlock)
 		return 0;
 
-	for (i = 0; i < q->nr_zones; i++)
-		if (test_bit(i, q->seq_zones_wlock))
+	for (i = 0; i < q->disk->nr_zones; i++)
+		if (test_bit(i, q->disk->seq_zones_wlock))
 			seq_printf(m, "%u\n", i);
 
 	return 0;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 5ce72345ac66..c0303026752d 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -325,7 +325,7 @@ static ssize_t queue_zoned_show(struct request_queue *q, char *page)
 
 static ssize_t queue_nr_zones_show(struct request_queue *q, char *page)
 {
-	return queue_var_show(blk_queue_nr_zones(q), page);
+	return queue_var_show(disk_nr_zones(q->disk), page);
 }
 
 static ssize_t queue_max_open_zones_show(struct request_queue *q, char *page)
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index c2d8a38f449a..7c017458d5ce 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -57,10 +57,10 @@ EXPORT_SYMBOL_GPL(blk_zone_cond_str);
  */
 bool blk_req_needs_zone_write_lock(struct request *rq)
 {
-	if (!rq->q->seq_zones_wlock)
+	if (blk_rq_is_passthrough(rq))
 		return false;
 
-	if (blk_rq_is_passthrough(rq))
+	if (!rq->q->disk->seq_zones_wlock)
 		return false;
 
 	switch (req_op(rq)) {
@@ -77,7 +77,7 @@ bool blk_req_zone_write_trylock(struct request *rq)
 {
 	unsigned int zno = blk_rq_zone_no(rq);
 
-	if (test_and_set_bit(zno, rq->q->seq_zones_wlock))
+	if (test_and_set_bit(zno, rq->q->disk->seq_zones_wlock))
 		return false;
 
 	WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
@@ -90,7 +90,7 @@ EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock);
 void __blk_req_zone_write_lock(struct request *rq)
 {
 	if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
-					  rq->q->seq_zones_wlock)))
+					  rq->q->disk->seq_zones_wlock)))
 		return;
 
 	WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
@@ -101,9 +101,9 @@ EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);
 void __blk_req_zone_write_unlock(struct request *rq)
 {
 	rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
-	if (rq->q->seq_zones_wlock)
+	if (rq->q->disk->seq_zones_wlock)
 		WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
-						 rq->q->seq_zones_wlock));
+						 rq->q->disk->seq_zones_wlock));
 }
 EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
 
@@ -189,7 +189,7 @@ static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx,
 static int blkdev_zone_reset_all_emulated(struct block_device *bdev,
 					  gfp_t gfp_mask)
 {
-	struct request_queue *q = bdev_get_queue(bdev);
+	struct gendisk *disk = bdev->bd_disk;
 	sector_t capacity = bdev_nr_sectors(bdev);
 	sector_t zone_sectors = bdev_zone_sectors(bdev);
 	unsigned long *need_reset;
@@ -197,19 +197,18 @@ static int blkdev_zone_reset_all_emulated(struct block_device *bdev,
 	sector_t sector = 0;
 	int ret;
 
-	need_reset = blk_alloc_zone_bitmap(q->node, q->nr_zones);
+	need_reset = blk_alloc_zone_bitmap(disk->queue->node, disk->nr_zones);
 	if (!need_reset)
 		return -ENOMEM;
 
-	ret = bdev->bd_disk->fops->report_zones(bdev->bd_disk, 0,
-				q->nr_zones, blk_zone_need_reset_cb,
-				need_reset);
+	ret = disk->fops->report_zones(disk, 0, disk->nr_zones,
+				       blk_zone_need_reset_cb, need_reset);
 	if (ret < 0)
 		goto out_free_need_reset;
 
 	ret = 0;
 	while (sector < capacity) {
-		if (!test_bit(blk_queue_zone_no(q, sector), need_reset)) {
+		if (!test_bit(disk_zone_no(disk, sector), need_reset)) {
 			sector += zone_sectors;
 			continue;
 		}
@@ -452,12 +451,10 @@ fail:
 
 void disk_free_zone_bitmaps(struct gendisk *disk)
 {
-	struct request_queue *q = disk->queue;
-
-	kfree(q->conv_zones_bitmap);
-	q->conv_zones_bitmap = NULL;
-	kfree(q->seq_zones_wlock);
-	q->seq_zones_wlock = NULL;
+	kfree(disk->conv_zones_bitmap);
+	disk->conv_zones_bitmap = NULL;
+	kfree(disk->seq_zones_wlock);
+	disk->seq_zones_wlock = NULL;
 }
 
 struct blk_revalidate_zone_args {
@@ -607,9 +604,9 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
 	blk_mq_freeze_queue(q);
 	if (ret > 0) {
 		blk_queue_chunk_sectors(q, args.zone_sectors);
-		q->nr_zones = args.nr_zones;
-		swap(q->seq_zones_wlock, args.seq_zones_wlock);
-		swap(q->conv_zones_bitmap, args.conv_zones_bitmap);
+		disk->nr_zones = args.nr_zones;
+		swap(disk->seq_zones_wlock, args.seq_zones_wlock);
+		swap(disk->conv_zones_bitmap, args.conv_zones_bitmap);
 		if (update_driver_data)
 			update_driver_data(disk);
 		ret = 0;
@@ -634,9 +631,9 @@ void disk_clear_zone_settings(struct gendisk *disk)
 	disk_free_zone_bitmaps(disk);
 	blk_queue_flag_clear(QUEUE_FLAG_ZONE_RESETALL, q);
 	q->required_elevator_features &= ~ELEVATOR_F_ZBD_SEQ_WRITE;
-	q->nr_zones = 0;
-	q->max_open_zones = 0;
-	q->max_active_zones = 0;
+	disk->nr_zones = 0;
+	disk->max_open_zones = 0;
+	disk->max_active_zones = 0;
 	q->limits.chunk_sectors = 0;
 	q->limits.zone_write_granularity = 0;
 	q->limits.max_zone_append_sectors = 0;
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index e62c52e96425..64b06caab984 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -170,7 +170,7 @@ int null_register_zoned_dev(struct nullb *nullb)
 			return ret;
 	} else {
 		blk_queue_chunk_sectors(q, dev->zone_size_sects);
-		q->nr_zones = bdev_nr_zones(nullb->disk->part0);
+		nullb->disk->nr_zones = bdev_nr_zones(nullb->disk->part0);
 	}
 
 	blk_queue_max_zone_append_sectors(q, dev->zone_size_sects);
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
index 842c31019b51..2b89cde30c9e 100644
--- a/drivers/md/dm-zone.c
+++ b/drivers/md/dm-zone.c
@@ -139,13 +139,11 @@ bool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
 
 void dm_cleanup_zoned_dev(struct mapped_device *md)
 {
-	struct request_queue *q = md->queue;
-
-	if (q) {
-		kfree(q->conv_zones_bitmap);
-		q->conv_zones_bitmap = NULL;
-		kfree(q->seq_zones_wlock);
-		q->seq_zones_wlock = NULL;
+	if (md->disk) {
+		kfree(md->disk->conv_zones_bitmap);
+		md->disk->conv_zones_bitmap = NULL;
+		kfree(md->disk->seq_zones_wlock);
+		md->disk->seq_zones_wlock = NULL;
 	}
 
 	kvfree(md->zwp_offset);
@@ -179,31 +177,31 @@ static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx,
 				 void *data)
 {
 	struct mapped_device *md = data;
-	struct request_queue *q = md->queue;
+	struct gendisk *disk = md->disk;
 
 	switch (zone->type) {
 	case BLK_ZONE_TYPE_CONVENTIONAL:
-		if (!q->conv_zones_bitmap) {
-			q->conv_zones_bitmap =
-				kcalloc(BITS_TO_LONGS(q->nr_zones),
+		if (!disk->conv_zones_bitmap) {
+			disk->conv_zones_bitmap =
+				kcalloc(BITS_TO_LONGS(disk->nr_zones),
 					sizeof(unsigned long), GFP_NOIO);
-			if (!q->conv_zones_bitmap)
+			if (!disk->conv_zones_bitmap)
 				return -ENOMEM;
 		}
-		set_bit(idx, q->conv_zones_bitmap);
+		set_bit(idx, disk->conv_zones_bitmap);
 		break;
 	case BLK_ZONE_TYPE_SEQWRITE_REQ:
 	case BLK_ZONE_TYPE_SEQWRITE_PREF:
-		if (!q->seq_zones_wlock) {
-			q->seq_zones_wlock =
-				kcalloc(BITS_TO_LONGS(q->nr_zones),
+		if (!disk->seq_zones_wlock) {
+			disk->seq_zones_wlock =
+				kcalloc(BITS_TO_LONGS(disk->nr_zones),
 					sizeof(unsigned long), GFP_NOIO);
-			if (!q->seq_zones_wlock)
+			if (!disk->seq_zones_wlock)
 				return -ENOMEM;
 		}
 		if (!md->zwp_offset) {
 			md->zwp_offset =
-				kvcalloc(q->nr_zones, sizeof(unsigned int),
+				kvcalloc(disk->nr_zones, sizeof(unsigned int),
 					 GFP_KERNEL);
 			if (!md->zwp_offset)
 				return -ENOMEM;
@@ -228,7 +226,7 @@ static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx,
  */
 static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t)
 {
-	struct request_queue *q = md->queue;
+	struct gendisk *disk = md->disk;
 	unsigned int noio_flag;
 	int ret;
 
@@ -236,7 +234,7 @@ static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t)
 	 * Check if something changed. If yes, cleanup the current resources
 	 * and reallocate everything.
 	 */
-	if (!q->nr_zones || q->nr_zones != md->nr_zones)
+	if (!disk->nr_zones || disk->nr_zones != md->nr_zones)
 		dm_cleanup_zoned_dev(md);
 	if (md->nr_zones)
 		return 0;
@@ -246,17 +244,17 @@ static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t)
 	 * operations in this context are done as if GFP_NOIO was specified.
 	 */
 	noio_flag = memalloc_noio_save();
-	ret = dm_blk_do_report_zones(md, t, 0, q->nr_zones,
+	ret = dm_blk_do_report_zones(md, t, 0, disk->nr_zones,
 				     dm_zone_revalidate_cb, md);
 	memalloc_noio_restore(noio_flag);
 	if (ret < 0)
 		goto err;
-	if (ret != q->nr_zones) {
+	if (ret != disk->nr_zones) {
 		ret = -EIO;
 		goto err;
 	}
 
-	md->nr_zones = q->nr_zones;
+	md->nr_zones = disk->nr_zones;
 
 	return 0;
 
@@ -301,7 +299,7 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q)
 	 * correct value to be exposed in sysfs queue/nr_zones.
 	 */
 	WARN_ON_ONCE(queue_is_mq(q));
-	q->nr_zones = bdev_nr_zones(md->disk->part0);
+	md->disk->nr_zones = bdev_nr_zones(md->disk->part0);
 
 	/* Check if zone append is natively supported */
 	if (dm_table_supports_zone_append(t)) {
@@ -466,26 +464,26 @@ static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, unsigned int z
 	}
 }
 
-static inline void dm_zone_lock(struct request_queue *q,
-				unsigned int zno, struct bio *clone)
+static inline void dm_zone_lock(struct gendisk *disk, unsigned int zno,
+				struct bio *clone)
 {
 	if (WARN_ON_ONCE(bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)))
 		return;
 
-	wait_on_bit_lock_io(q->seq_zones_wlock, zno, TASK_UNINTERRUPTIBLE);
+	wait_on_bit_lock_io(disk->seq_zones_wlock, zno, TASK_UNINTERRUPTIBLE);
 	bio_set_flag(clone, BIO_ZONE_WRITE_LOCKED);
 }
 
-static inline void dm_zone_unlock(struct request_queue *q,
-				  unsigned int zno, struct bio *clone)
+static inline void dm_zone_unlock(struct gendisk *disk, unsigned int zno,
+				  struct bio *clone)
 {
 	if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
 		return;
 
-	WARN_ON_ONCE(!test_bit(zno, q->seq_zones_wlock));
-	clear_bit_unlock(zno, q->seq_zones_wlock);
+	WARN_ON_ONCE(!test_bit(zno, disk->seq_zones_wlock));
+	clear_bit_unlock(zno, disk->seq_zones_wlock);
 	smp_mb__after_atomic();
-	wake_up_bit(q->seq_zones_wlock, zno);
+	wake_up_bit(disk->seq_zones_wlock, zno);
 
 	bio_clear_flag(clone, BIO_ZONE_WRITE_LOCKED);
 }
@@ -520,7 +518,6 @@ int dm_zone_map_bio(struct dm_target_io *tio)
 	struct dm_io *io = tio->io;
 	struct dm_target *ti = tio->ti;
 	struct mapped_device *md = io->md;
-	struct request_queue *q = md->queue;
 	struct bio *clone = &tio->clone;
 	struct orig_bio_details orig_bio_details;
 	unsigned int zno;
@@ -536,7 +533,7 @@ int dm_zone_map_bio(struct dm_target_io *tio)
 
 	/* Lock the target zone */
 	zno = bio_zone_no(clone);
-	dm_zone_lock(q, zno, clone);
+	dm_zone_lock(md->disk, zno, clone);
 
 	orig_bio_details.nr_sectors = bio_sectors(clone);
 	orig_bio_details.op = bio_op(clone);
@@ -546,7 +543,7 @@ int dm_zone_map_bio(struct dm_target_io *tio)
 	 * both valid, and if the bio is a zone append, remap it to a write.
 	 */
 	if (!dm_zone_map_bio_begin(md, zno, clone)) {
-		dm_zone_unlock(q, zno, clone);
+		dm_zone_unlock(md->disk, zno, clone);
 		return DM_MAPIO_KILL;
 	}
 
@@ -570,12 +567,12 @@ int dm_zone_map_bio(struct dm_target_io *tio)
 		sts = dm_zone_map_bio_end(md, zno, &orig_bio_details,
 					  *tio->len_ptr);
 		if (sts != BLK_STS_OK)
-			dm_zone_unlock(q, zno, clone);
+			dm_zone_unlock(md->disk, zno, clone);
 		break;
 	case DM_MAPIO_REQUEUE:
 	case DM_MAPIO_KILL:
 	default:
-		dm_zone_unlock(q, zno, clone);
+		dm_zone_unlock(md->disk, zno, clone);
 		sts = BLK_STS_IOERR;
 		break;
 	}
@@ -592,7 +589,6 @@ int dm_zone_map_bio(struct dm_target_io *tio)
 void dm_zone_endio(struct dm_io *io, struct bio *clone)
 {
 	struct mapped_device *md = io->md;
-	struct request_queue *q = md->queue;
 	struct gendisk *disk = md->disk;
 	struct bio *orig_bio = io->orig_bio;
 	unsigned int zwp_offset;
@@ -651,5 +647,5 @@ void dm_zone_endio(struct dm_io *io, struct bio *clone)
 				zwp_offset - bio_sectors(orig_bio);
 	}
 
-	dm_zone_unlock(q, zno, clone);
+	dm_zone_unlock(disk, zno, clone);
 }
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index ccf9a6da8f6e..f26640ccb955 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -830,7 +830,7 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
 				   ns->head->disk->queue);
 #ifdef CONFIG_BLK_DEV_ZONED
 	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
-		ns->head->disk->queue->nr_zones = ns->queue->nr_zones;
+		ns->head->disk->nr_zones = ns->disk->nr_zones;
 #endif
 }
 
diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c
index 9d8717126ab3..c0ee21fcab81 100644
--- a/drivers/nvme/target/zns.c
+++ b/drivers/nvme/target/zns.c
@@ -57,7 +57,7 @@ bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
 	 * zones, reject the device. Otherwise, use report zones to detect if
 	 * the device has conventional zones.
 	 */
-	if (ns->bdev->bd_disk->queue->conv_zones_bitmap)
+	if (ns->bdev->bd_disk->conv_zones_bitmap)
 		return false;
 
 	ret = blkdev_report_zones(ns->bdev, 0, bdev_nr_zones(ns->bdev),
@@ -414,7 +414,7 @@ static u16 nvmet_bdev_zone_mgmt_emulate_all(struct nvmet_req *req)
 	}
 
 	while (sector < bdev_nr_sectors(bdev)) {
-		if (test_bit(blk_queue_zone_no(q, sector), d.zbitmap)) {
+		if (test_bit(disk_zone_no(bdev->bd_disk, sector), d.zbitmap)) {
 			bio = blk_next_bio(bio, bdev, 0,
 				zsa_req_op(req->cmd->zms.zsa) | REQ_SYNC,
 				GFP_KERNEL);
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index b4106f899734..b8c97456506a 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -855,7 +855,7 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp)
 
 	if (sdkp->zone_info.zone_blocks == zone_blocks &&
 	    sdkp->zone_info.nr_zones == nr_zones &&
-	    disk->queue->nr_zones == nr_zones)
+	    disk->nr_zones == nr_zones)
 		goto unlock;
 
 	flags = memalloc_noio_save();
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 810a24884f7e..d74f6a6b7e69 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1129,12 +1129,12 @@ void blk_dump_rq_flags(struct request *, char *);
 #ifdef CONFIG_BLK_DEV_ZONED
 static inline unsigned int blk_rq_zone_no(struct request *rq)
 {
-	return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
+	return disk_zone_no(rq->q->disk, blk_rq_pos(rq));
 }
 
 static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
 {
-	return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
+	return disk_zone_is_seq(rq->q->disk, blk_rq_pos(rq));
 }
 
 bool blk_req_needs_zone_write_lock(struct request *rq);
@@ -1156,8 +1156,8 @@ static inline void blk_req_zone_write_unlock(struct request *rq)
 
 static inline bool blk_req_zone_is_write_locked(struct request *rq)
 {
-	return rq->q->seq_zones_wlock &&
-		test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock);
+	return rq->q->disk->seq_zones_wlock &&
+		test_bit(blk_rq_zone_no(rq), rq->q->disk->seq_zones_wlock);
 }
 
 static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 21b97f7115dc..22c477fadc0f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -164,6 +164,29 @@ struct gendisk {
 #ifdef  CONFIG_BLK_DEV_INTEGRITY
 	struct kobject integrity_kobj;
 #endif	/* CONFIG_BLK_DEV_INTEGRITY */
+
+#ifdef CONFIG_BLK_DEV_ZONED
+	/*
+	 * Zoned block device information for request dispatch control.
+	 * nr_zones is the total number of zones of the device. This is always
+	 * 0 for regular block devices. conv_zones_bitmap is a bitmap of nr_zones
+	 * bits which indicates if a zone is conventional (bit set) or
+	 * sequential (bit clear). seq_zones_wlock is a bitmap of nr_zones
+	 * bits which indicates if a zone is write locked, that is, if a write
+	 * request targeting the zone was dispatched.
+	 *
+	 * Reads of this information must be protected with blk_queue_enter() /
+	 * blk_queue_exit(). Modifying this information is only allowed while
+	 * no requests are being processed. See also blk_mq_freeze_queue() and
+	 * blk_mq_unfreeze_queue().
+	 */
+	unsigned int		nr_zones;
+	unsigned int		max_open_zones;
+	unsigned int		max_active_zones;
+	unsigned long		*conv_zones_bitmap;
+	unsigned long		*seq_zones_wlock;
+#endif /* CONFIG_BLK_DEV_ZONED */
+
 #if IS_ENABLED(CONFIG_CDROM)
 	struct cdrom_device_info *cdi;
 #endif
@@ -467,31 +490,6 @@ struct request_queue {
 
 	unsigned int		required_elevator_features;
 
-#ifdef CONFIG_BLK_DEV_ZONED
-	/*
-	 * Zoned block device information for request dispatch control.
-	 * nr_zones is the total number of zones of the device. This is always
-	 * 0 for regular block devices. conv_zones_bitmap is a bitmap of nr_zones
-	 * bits which indicates if a zone is conventional (bit set) or
-	 * sequential (bit clear). seq_zones_wlock is a bitmap of nr_zones
-	 * bits which indicates if a zone is write locked, that is, if a write
-	 * request targeting the zone was dispatched. All three fields are
-	 * initialized by the low level device driver (e.g. scsi/sd.c).
-	 * Stacking drivers (device mappers) may or may not initialize
-	 * these fields.
-	 *
-	 * Reads of this information must be protected with blk_queue_enter() /
-	 * blk_queue_exit(). Modifying this information is only allowed while
-	 * no requests are being processed. See also blk_mq_freeze_queue() and
-	 * blk_mq_unfreeze_queue().
-	 */
-	unsigned int		nr_zones;
-	unsigned long		*conv_zones_bitmap;
-	unsigned long		*seq_zones_wlock;
-	unsigned int		max_open_zones;
-	unsigned int		max_active_zones;
-#endif /* CONFIG_BLK_DEV_ZONED */
-
 	int			node;
 #ifdef CONFIG_BLK_DEV_IO_TRACE
 	struct blk_trace __rcu	*blk_trace;
@@ -668,63 +666,59 @@ static inline bool blk_queue_is_zoned(struct request_queue *q)
 }
 
 #ifdef CONFIG_BLK_DEV_ZONED
-static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
+static inline unsigned int disk_nr_zones(struct gendisk *disk)
 {
-	return blk_queue_is_zoned(q) ? q->nr_zones : 0;
+	return blk_queue_is_zoned(disk->queue) ? disk->nr_zones : 0;
 }
 
-static inline unsigned int blk_queue_zone_no(struct request_queue *q,
-					     sector_t sector)
+static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector)
 {
-	if (!blk_queue_is_zoned(q))
+	if (!blk_queue_is_zoned(disk->queue))
 		return 0;
-	return sector >> ilog2(q->limits.chunk_sectors);
+	return sector >> ilog2(disk->queue->limits.chunk_sectors);
 }
 
-static inline bool blk_queue_zone_is_seq(struct request_queue *q,
-					 sector_t sector)
+static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector)
 {
-	if (!blk_queue_is_zoned(q))
+	if (!blk_queue_is_zoned(disk->queue))
 		return false;
-	if (!q->conv_zones_bitmap)
+	if (!disk->conv_zones_bitmap)
 		return true;
-	return !test_bit(blk_queue_zone_no(q, sector), q->conv_zones_bitmap);
+	return !test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap);
 }
 
 static inline void disk_set_max_open_zones(struct gendisk *disk,
 		unsigned int max_open_zones)
 {
-	disk->queue->max_open_zones = max_open_zones;
+	disk->max_open_zones = max_open_zones;
 }
 
 static inline void disk_set_max_active_zones(struct gendisk *disk,
 		unsigned int max_active_zones)
 {
-	disk->queue->max_active_zones = max_active_zones;
+	disk->max_active_zones = max_active_zones;
 }
 
 static inline unsigned int bdev_max_open_zones(struct block_device *bdev)
 {
-	return bdev->bd_disk->queue->max_open_zones;
+	return bdev->bd_disk->max_open_zones;
 }
 
 static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
 {
-	return bdev->bd_disk->queue->max_active_zones;
+	return bdev->bd_disk->max_active_zones;
 }
 
 #else /* CONFIG_BLK_DEV_ZONED */
-static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
+static inline unsigned int disk_nr_zones(struct gendisk *disk)
 {
 	return 0;
 }
-static inline bool blk_queue_zone_is_seq(struct request_queue *q,
-					 sector_t sector)
+static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector)
 {
 	return false;
 }
-static inline unsigned int blk_queue_zone_no(struct request_queue *q,
-					     sector_t sector)
+static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector)
 {
 	return 0;
 }
@@ -732,6 +726,7 @@ static inline unsigned int bdev_max_open_zones(struct block_device *bdev)
 {
 	return 0;
 }
+
 static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
 {
 	return 0;
@@ -900,14 +895,12 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond);
 
 static inline unsigned int bio_zone_no(struct bio *bio)
 {
-	return blk_queue_zone_no(bdev_get_queue(bio->bi_bdev),
-				 bio->bi_iter.bi_sector);
+	return disk_zone_no(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector);
 }
 
 static inline unsigned int bio_zone_is_seq(struct bio *bio)
 {
-	return blk_queue_zone_is_seq(bdev_get_queue(bio->bi_bdev),
-				     bio->bi_iter.bi_sector);
+	return disk_zone_is_seq(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector);
 }
 
 /*
-- 
cgit 


From 2d693ed436a67db06d1473c22d4caee899a4e9d2 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@arm.com>
Date: Wed, 11 May 2022 15:45:58 +0100
Subject: coresight: Add config flag to enable branch broadcast

When enabled, all taken branch addresses are output, even if the branch
was because of a direct branch instruction. This enables reconstruction
of the program flow without having access to the memory image of the
code being executed.

Use bit 8 for the config option which would be the correct bit for
programming ETMv3. Although branch broadcast can't be enabled on ETMv3
because it's not in the define ETM3X_SUPPORTED_OPTIONS, using the
correct bit might help prevent future collisions or allow it to be
enabled if needed.

Signed-off-by: James Clark <james.clark@arm.com>
Reviewed-by: Mike Leach <mike.leach@linaro.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/20220511144601.2257870-2-james.clark@arm.com
---
 drivers/hwtracing/coresight/coresight-etm-perf.c   |  2 ++
 drivers/hwtracing/coresight/coresight-etm4x-core.c | 14 ++++++++++++++
 include/linux/coresight-pmu.h                      |  2 ++
 3 files changed, 18 insertions(+)

(limited to 'include')

diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c
index c039b6ae206f..43bbd5dc3d3b 100644
--- a/drivers/hwtracing/coresight/coresight-etm-perf.c
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@@ -52,6 +52,7 @@ static DEFINE_PER_CPU(struct coresight_device *, csdev_src);
  * The PMU formats were orignally for ETMv3.5/PTM's ETMCR 'config';
  * now take them as general formats and apply on all ETMs.
  */
+PMU_FORMAT_ATTR(branch_broadcast, "config:"__stringify(ETM_OPT_BRANCH_BROADCAST));
 PMU_FORMAT_ATTR(cycacc,		"config:" __stringify(ETM_OPT_CYCACC));
 /* contextid1 enables tracing CONTEXTIDR_EL1 for ETMv4 */
 PMU_FORMAT_ATTR(contextid1,	"config:" __stringify(ETM_OPT_CTXTID));
@@ -97,6 +98,7 @@ static struct attribute *etm_config_formats_attr[] = {
 	&format_attr_sinkid.attr,
 	&format_attr_preset.attr,
 	&format_attr_configid.attr,
+	&format_attr_branch_broadcast.attr,
 	NULL,
 };
 
diff --git a/drivers/hwtracing/coresight/coresight-etm4x-core.c b/drivers/hwtracing/coresight/coresight-etm4x-core.c
index 87299e99dabb..cf249ecad5a5 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x-core.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x-core.c
@@ -696,6 +696,20 @@ static int etm4_parse_event_config(struct coresight_device *csdev,
 		ret = cscfg_csdev_enable_active_config(csdev, cfg_hash, preset);
 	}
 
+	/* branch broadcast - enable if selected and supported */
+	if (attr->config & BIT(ETM_OPT_BRANCH_BROADCAST)) {
+		if (!drvdata->trcbb) {
+			/*
+			 * Missing BB support could cause silent decode errors
+			 * so fail to open if it's not supported.
+			 */
+			ret = -EINVAL;
+			goto out;
+		} else {
+			config->cfg |= BIT(ETM4_CFG_BIT_BB);
+		}
+	}
+
 out:
 	return ret;
 }
diff --git a/include/linux/coresight-pmu.h b/include/linux/coresight-pmu.h
index 4ac5c081af93..6c2fd6cc5a98 100644
--- a/include/linux/coresight-pmu.h
+++ b/include/linux/coresight-pmu.h
@@ -18,6 +18,7 @@
  * ETMv3.5/PTM doesn't define ETMCR config bits with prefix "ETM3_" and
  * directly use below macros as config bits.
  */
+#define ETM_OPT_BRANCH_BROADCAST 8
 #define ETM_OPT_CYCACC		12
 #define ETM_OPT_CTXTID		14
 #define ETM_OPT_CTXTID2		15
@@ -25,6 +26,7 @@
 #define ETM_OPT_RETSTK		29
 
 /* ETMv4 CONFIGR programming bits for the ETM OPTs */
+#define ETM4_CFG_BIT_BB         3
 #define ETM4_CFG_BIT_CYCACC	4
 #define ETM4_CFG_BIT_CTXTID	6
 #define ETM4_CFG_BIT_VMID	7
-- 
cgit 


From e67a004482c092e326b32f24d4542c85fb8f79bf Mon Sep 17 00:00:00 2001
From: Robert Foss <robert.foss@linaro.org>
Date: Wed, 6 Jul 2022 17:43:33 +0200
Subject: dt-bindings: clock: Add Qcom SM8350 GPUCC bindings

Add device tree bindings for graphics clock controller for
Qualcomm Technology Inc's SM8350 SoCs.

Signed-off-by: Robert Foss <robert.foss@linaro.org>
Reviewed-by: Dmitry Baryshkov <dmityr.baryshkov@linaro.org>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Link: https://lore.kernel.org/r/20220706154337.2026269-2-robert.foss@linaro.org
---
 .../bindings/clock/qcom,gpucc-sm8350.yaml          | 72 ++++++++++++++++++++++
 include/dt-bindings/clock/qcom,gpucc-sm8350.h      | 52 ++++++++++++++++
 2 files changed, 124 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/qcom,gpucc-sm8350.yaml
 create mode 100644 include/dt-bindings/clock/qcom,gpucc-sm8350.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,gpucc-sm8350.yaml b/Documentation/devicetree/bindings/clock/qcom,gpucc-sm8350.yaml
new file mode 100644
index 000000000000..0a0546c079a9
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,gpucc-sm8350.yaml
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,gpucc-sm8350.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Graphics Clock & Reset Controller Binding
+
+maintainers:
+  - Robert Foss <robert.foss@linaro.org>
+
+description: |
+  Qualcomm graphics clock control module which supports the clocks, resets and
+  power domains on Qualcomm SoCs.
+
+  See also:
+    dt-bindings/clock/qcom,gpucc-sm8350.h
+
+properties:
+  compatible:
+    enum:
+      - qcom,sm8350-gpucc
+
+  clocks:
+    items:
+      - description: Board XO source
+      - description: GPLL0 main branch source
+      - description: GPLL0 div branch source
+
+  '#clock-cells':
+    const: 1
+
+  '#reset-cells':
+    const: 1
+
+  '#power-domain-cells':
+    const: 1
+
+  reg:
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+  - clocks
+  - '#clock-cells'
+  - '#reset-cells'
+  - '#power-domain-cells'
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/qcom,gcc-sm8350.h>
+    #include <dt-bindings/clock/qcom,rpmh.h>
+
+    soc {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        clock-controller@3d90000 {
+            compatible = "qcom,sm8350-gpucc";
+            reg = <0 0x03d90000 0 0x9000>;
+            clocks = <&rpmhcc RPMH_CXO_CLK>,
+                     <&gcc GCC_GPU_GPLL0_CLK_SRC>,
+                     <&gcc GCC_GPU_GPLL0_DIV_CLK_SRC>;
+            #clock-cells = <1>;
+            #reset-cells = <1>;
+            #power-domain-cells = <1>;
+        };
+    };
+...
diff --git a/include/dt-bindings/clock/qcom,gpucc-sm8350.h b/include/dt-bindings/clock/qcom,gpucc-sm8350.h
new file mode 100644
index 000000000000..2ca857f5bfd2
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,gpucc-sm8350.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2022, The Linux Foundation. All rights reserved.
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_GPU_CC_SM8350_H
+#define _DT_BINDINGS_CLK_QCOM_GPU_CC_SM8350_H
+
+/* GPU_CC clocks */
+#define GPU_CC_AHB_CLK			0
+#define GPU_CC_CB_CLK			1
+#define GPU_CC_CRC_AHB_CLK		2
+#define GPU_CC_CX_APB_CLK		3
+#define GPU_CC_CX_GMU_CLK		4
+#define GPU_CC_CX_QDSS_AT_CLK		5
+#define GPU_CC_CX_QDSS_TRIG_CLK		6
+#define GPU_CC_CX_QDSS_TSCTR_CLK	7
+#define GPU_CC_CX_SNOC_DVM_CLK		8
+#define GPU_CC_CXO_AON_CLK		9
+#define GPU_CC_CXO_CLK			10
+#define GPU_CC_FREQ_MEASURE_CLK		11
+#define GPU_CC_GMU_CLK_SRC		12
+#define GPU_CC_GX_GMU_CLK		13
+#define GPU_CC_GX_QDSS_TSCTR_CLK	14
+#define GPU_CC_GX_VSENSE_CLK		15
+#define GPU_CC_HLOS1_VOTE_GPU_SMMU_CLK	16
+#define GPU_CC_HUB_AHB_DIV_CLK_SRC	17
+#define GPU_CC_HUB_AON_CLK		18
+#define GPU_CC_HUB_CLK_SRC		19
+#define GPU_CC_HUB_CX_INT_CLK		20
+#define GPU_CC_HUB_CX_INT_DIV_CLK_SRC	21
+#define GPU_CC_MND1X_0_GFX3D_CLK	22
+#define GPU_CC_MND1X_1_GFX3D_CLK	23
+#define GPU_CC_PLL0			24
+#define GPU_CC_PLL1			25
+#define GPU_CC_SLEEP_CLK		26
+
+/* GPU_CC resets */
+#define GPUCC_GPU_CC_ACD_BCR		0
+#define GPUCC_GPU_CC_CB_BCR		1
+#define GPUCC_GPU_CC_CX_BCR		2
+#define GPUCC_GPU_CC_FAST_HUB_BCR	3
+#define GPUCC_GPU_CC_GFX3D_AON_BCR	4
+#define GPUCC_GPU_CC_GMU_BCR		5
+#define GPUCC_GPU_CC_GX_BCR		6
+#define GPUCC_GPU_CC_XO_BCR		7
+
+/* GPU_CC GDSCRs */
+#define GPU_CX_GDSC			0
+#define GPU_GX_GDSC			1
+
+#endif
-- 
cgit 


From 909e5be2ca882a0384018370f065ef1c611e6ed9 Mon Sep 17 00:00:00 2001
From: Jonathan Marek <jonathan@marek.ca>
Date: Wed, 6 Jul 2022 17:43:35 +0200
Subject: dt-bindings: clock: Add Qcom SM8350 DISPCC bindings

Add sm8350 DISPCC bindings, which are simply a symlink to the sm8250
bindings. Update the documentation with the new compatible.

Signed-off-by: Jonathan Marek <jonathan@marek.ca>
Signed-off-by: Robert Foss <robert.foss@linaro.org>
Reviewed-by: Rob Herring <robh@kernel.org>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Reviewed-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Link: https://lore.kernel.org/r/20220706154337.2026269-4-robert.foss@linaro.org
---
 Documentation/devicetree/bindings/clock/qcom,dispcc-sm8x50.yaml | 6 ++++--
 include/dt-bindings/clock/qcom,dispcc-sm8350.h                  | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)
 create mode 120000 include/dt-bindings/clock/qcom,dispcc-sm8350.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,dispcc-sm8x50.yaml b/Documentation/devicetree/bindings/clock/qcom,dispcc-sm8x50.yaml
index 31497677e8de..7a8d375e055e 100644
--- a/Documentation/devicetree/bindings/clock/qcom,dispcc-sm8x50.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,dispcc-sm8x50.yaml
@@ -4,18 +4,19 @@
 $id: http://devicetree.org/schemas/clock/qcom,dispcc-sm8x50.yaml#
 $schema: http://devicetree.org/meta-schemas/core.yaml#
 
-title: Qualcomm Display Clock & Reset Controller Binding for SM8150/SM8250
+title: Qualcomm Display Clock & Reset Controller Binding for SM8150/SM8250/SM8350
 
 maintainers:
   - Jonathan Marek <jonathan@marek.ca>
 
 description: |
   Qualcomm display clock control module which supports the clocks, resets and
-  power domains on SM8150 and SM8250.
+  power domains on SM8150/SM8250/SM8350.
 
   See also:
     dt-bindings/clock/qcom,dispcc-sm8150.h
     dt-bindings/clock/qcom,dispcc-sm8250.h
+    dt-bindings/clock/qcom,dispcc-sm8350.h
 
 properties:
   compatible:
@@ -23,6 +24,7 @@ properties:
       - qcom,sc8180x-dispcc
       - qcom,sm8150-dispcc
       - qcom,sm8250-dispcc
+      - qcom,sm8350-dispcc
 
   clocks:
     items:
diff --git a/include/dt-bindings/clock/qcom,dispcc-sm8350.h b/include/dt-bindings/clock/qcom,dispcc-sm8350.h
new file mode 120000
index 000000000000..0312b4544acb
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,dispcc-sm8350.h
@@ -0,0 +1 @@
+qcom,dispcc-sm8250.h
\ No newline at end of file
-- 
cgit 


From 494e984af5b26dd286baaaad8a621fd7a7258c21 Mon Sep 17 00:00:00 2001
From: Vladimir Zapolskiy <vladimir.zapolskiy@linaro.org>
Date: Fri, 1 Jul 2022 09:26:21 +0300
Subject: dt-bindings: clock: add QCOM SM8450 camera clock bindings

The change adds device tree bindings for camera clock controller
found on SM8450 SoC.

Reviewed-by: Rob Herring <robh@kernel.org>
Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Vladimir Zapolskiy <vladimir.zapolskiy@linaro.org>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Link: https://lore.kernel.org/r/20220701062622.2757831-2-vladimir.zapolskiy@linaro.org
---
 .../bindings/clock/qcom,sm8450-camcc.yaml          |  80 +++++++++++
 include/dt-bindings/clock/qcom,sm8450-camcc.h      | 159 +++++++++++++++++++++
 2 files changed, 239 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/qcom,sm8450-camcc.yaml
 create mode 100644 include/dt-bindings/clock/qcom,sm8450-camcc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,sm8450-camcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm8450-camcc.yaml
new file mode 100644
index 000000000000..268f4c6ae0ee
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sm8450-camcc.yaml
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sm8450-camcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Camera Clock & Reset Controller Binding for SM8450
+
+maintainers:
+  - Vladimir Zapolskiy <vladimir.zapolskiy@linaro.org>
+
+description: |
+  Qualcomm camera clock control module which supports the clocks, resets and
+  power domains on SM8450.
+
+  See also include/dt-bindings/clock/qcom,sm8450-camcc.h
+
+properties:
+  compatible:
+    const: qcom,sm8450-camcc
+
+  clocks:
+    items:
+      - description: Camera AHB clock from GCC
+      - description: Board XO source
+      - description: Board active XO source
+      - description: Sleep clock source
+
+  power-domains:
+    maxItems: 1
+    description:
+      A phandle and PM domain specifier for the MMCX power domain.
+
+  required-opps:
+    description:
+      A phandle to an OPP node describing required MMCX performance point.
+
+  '#clock-cells':
+    const: 1
+
+  '#reset-cells':
+    const: 1
+
+  '#power-domain-cells':
+    const: 1
+
+  reg:
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+  - clocks
+  - power-domains
+  - required-opps
+  - '#clock-cells'
+  - '#reset-cells'
+  - '#power-domain-cells'
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/qcom,gcc-sm8450.h>
+    #include <dt-bindings/clock/qcom,rpmh.h>
+    #include <dt-bindings/power/qcom-rpmpd.h>
+    clock-controller@ade0000 {
+      compatible = "qcom,sm8450-camcc";
+      reg = <0xade0000 0x20000>;
+      clocks = <&gcc GCC_CAMERA_AHB_CLK>,
+               <&rpmhcc RPMH_CXO_CLK>,
+               <&rpmhcc RPMH_CXO_CLK_A>,
+               <&sleep_clk>;
+      power-domains = <&rpmhpd SM8450_MMCX>;
+      required-opps = <&rpmhpd_opp_low_svs>;
+      #clock-cells = <1>;
+      #reset-cells = <1>;
+      #power-domain-cells = <1>;
+    };
+...
diff --git a/include/dt-bindings/clock/qcom,sm8450-camcc.h b/include/dt-bindings/clock/qcom,sm8450-camcc.h
new file mode 100644
index 000000000000..7ff67acf301a
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,sm8450-camcc.h
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2020-2021, The Linux Foundation. All rights reserved.
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_CAM_CC_SM8450_H
+#define _DT_BINDINGS_CLK_QCOM_CAM_CC_SM8450_H
+
+/* CAM_CC clocks */
+#define CAM_CC_BPS_AHB_CLK					0
+#define CAM_CC_BPS_CLK						1
+#define CAM_CC_BPS_CLK_SRC					2
+#define CAM_CC_BPS_FAST_AHB_CLK					3
+#define CAM_CC_CAMNOC_AXI_CLK					4
+#define CAM_CC_CAMNOC_AXI_CLK_SRC				5
+#define CAM_CC_CAMNOC_DCD_XO_CLK				6
+#define CAM_CC_CCI_0_CLK					7
+#define CAM_CC_CCI_0_CLK_SRC					8
+#define CAM_CC_CCI_1_CLK					9
+#define CAM_CC_CCI_1_CLK_SRC					10
+#define CAM_CC_CORE_AHB_CLK					11
+#define CAM_CC_CPAS_AHB_CLK					12
+#define CAM_CC_CPAS_BPS_CLK					13
+#define CAM_CC_CPAS_FAST_AHB_CLK				14
+#define CAM_CC_CPAS_IFE_0_CLK					15
+#define CAM_CC_CPAS_IFE_1_CLK					16
+#define CAM_CC_CPAS_IFE_2_CLK					17
+#define CAM_CC_CPAS_IFE_LITE_CLK				18
+#define CAM_CC_CPAS_IPE_NPS_CLK					19
+#define CAM_CC_CPAS_SBI_CLK					20
+#define CAM_CC_CPAS_SFE_0_CLK					21
+#define CAM_CC_CPAS_SFE_1_CLK					22
+#define CAM_CC_CPHY_RX_CLK_SRC					23
+#define CAM_CC_CSI0PHYTIMER_CLK					24
+#define CAM_CC_CSI0PHYTIMER_CLK_SRC				25
+#define CAM_CC_CSI1PHYTIMER_CLK					26
+#define CAM_CC_CSI1PHYTIMER_CLK_SRC				27
+#define CAM_CC_CSI2PHYTIMER_CLK					28
+#define CAM_CC_CSI2PHYTIMER_CLK_SRC				29
+#define CAM_CC_CSI3PHYTIMER_CLK					30
+#define CAM_CC_CSI3PHYTIMER_CLK_SRC				31
+#define CAM_CC_CSI4PHYTIMER_CLK					32
+#define CAM_CC_CSI4PHYTIMER_CLK_SRC				33
+#define CAM_CC_CSI5PHYTIMER_CLK					34
+#define CAM_CC_CSI5PHYTIMER_CLK_SRC				35
+#define CAM_CC_CSID_CLK						36
+#define CAM_CC_CSID_CLK_SRC					37
+#define CAM_CC_CSID_CSIPHY_RX_CLK				38
+#define CAM_CC_CSIPHY0_CLK					39
+#define CAM_CC_CSIPHY1_CLK					40
+#define CAM_CC_CSIPHY2_CLK					41
+#define CAM_CC_CSIPHY3_CLK					42
+#define CAM_CC_CSIPHY4_CLK					43
+#define CAM_CC_CSIPHY5_CLK					44
+#define CAM_CC_FAST_AHB_CLK_SRC					45
+#define CAM_CC_GDSC_CLK						46
+#define CAM_CC_ICP_AHB_CLK					47
+#define CAM_CC_ICP_CLK						48
+#define CAM_CC_ICP_CLK_SRC					49
+#define CAM_CC_IFE_0_CLK					50
+#define CAM_CC_IFE_0_CLK_SRC					51
+#define CAM_CC_IFE_0_DSP_CLK					52
+#define CAM_CC_IFE_0_FAST_AHB_CLK				53
+#define CAM_CC_IFE_1_CLK					54
+#define CAM_CC_IFE_1_CLK_SRC					55
+#define CAM_CC_IFE_1_DSP_CLK					56
+#define CAM_CC_IFE_1_FAST_AHB_CLK				57
+#define CAM_CC_IFE_2_CLK					58
+#define CAM_CC_IFE_2_CLK_SRC					59
+#define CAM_CC_IFE_2_DSP_CLK					60
+#define CAM_CC_IFE_2_FAST_AHB_CLK				61
+#define CAM_CC_IFE_LITE_AHB_CLK					62
+#define CAM_CC_IFE_LITE_CLK					63
+#define CAM_CC_IFE_LITE_CLK_SRC					64
+#define CAM_CC_IFE_LITE_CPHY_RX_CLK				65
+#define CAM_CC_IFE_LITE_CSID_CLK				66
+#define CAM_CC_IFE_LITE_CSID_CLK_SRC				67
+#define CAM_CC_IPE_NPS_AHB_CLK					68
+#define CAM_CC_IPE_NPS_CLK					69
+#define CAM_CC_IPE_NPS_CLK_SRC					70
+#define CAM_CC_IPE_NPS_FAST_AHB_CLK				71
+#define CAM_CC_IPE_PPS_CLK					72
+#define CAM_CC_IPE_PPS_FAST_AHB_CLK				73
+#define CAM_CC_JPEG_CLK						74
+#define CAM_CC_JPEG_CLK_SRC					75
+#define CAM_CC_MCLK0_CLK					76
+#define CAM_CC_MCLK0_CLK_SRC					77
+#define CAM_CC_MCLK1_CLK					78
+#define CAM_CC_MCLK1_CLK_SRC					79
+#define CAM_CC_MCLK2_CLK					80
+#define CAM_CC_MCLK2_CLK_SRC					81
+#define CAM_CC_MCLK3_CLK					82
+#define CAM_CC_MCLK3_CLK_SRC					83
+#define CAM_CC_MCLK4_CLK					84
+#define CAM_CC_MCLK4_CLK_SRC					85
+#define CAM_CC_MCLK5_CLK					86
+#define CAM_CC_MCLK5_CLK_SRC					87
+#define CAM_CC_MCLK6_CLK					88
+#define CAM_CC_MCLK6_CLK_SRC					89
+#define CAM_CC_MCLK7_CLK					90
+#define CAM_CC_MCLK7_CLK_SRC					91
+#define CAM_CC_PLL0						92
+#define CAM_CC_PLL0_OUT_EVEN					93
+#define CAM_CC_PLL0_OUT_ODD					94
+#define CAM_CC_PLL1						95
+#define CAM_CC_PLL1_OUT_EVEN					96
+#define CAM_CC_PLL2						97
+#define CAM_CC_PLL3						98
+#define CAM_CC_PLL3_OUT_EVEN					99
+#define CAM_CC_PLL4						100
+#define CAM_CC_PLL4_OUT_EVEN					101
+#define CAM_CC_PLL5						102
+#define CAM_CC_PLL5_OUT_EVEN					103
+#define CAM_CC_PLL6						104
+#define CAM_CC_PLL6_OUT_EVEN					105
+#define CAM_CC_PLL7						106
+#define CAM_CC_PLL7_OUT_EVEN					107
+#define CAM_CC_PLL8						108
+#define CAM_CC_PLL8_OUT_EVEN					109
+#define CAM_CC_QDSS_DEBUG_CLK					110
+#define CAM_CC_QDSS_DEBUG_CLK_SRC				111
+#define CAM_CC_QDSS_DEBUG_XO_CLK				112
+#define CAM_CC_SBI_AHB_CLK					113
+#define CAM_CC_SBI_CLK						114
+#define CAM_CC_SFE_0_CLK					115
+#define CAM_CC_SFE_0_CLK_SRC					116
+#define CAM_CC_SFE_0_FAST_AHB_CLK				117
+#define CAM_CC_SFE_1_CLK					118
+#define CAM_CC_SFE_1_CLK_SRC					119
+#define CAM_CC_SFE_1_FAST_AHB_CLK				120
+#define CAM_CC_SLEEP_CLK					121
+#define CAM_CC_SLEEP_CLK_SRC					122
+#define CAM_CC_SLOW_AHB_CLK_SRC					123
+#define CAM_CC_XO_CLK_SRC					124
+
+/* CAM_CC resets */
+#define CAM_CC_BPS_BCR						0
+#define CAM_CC_ICP_BCR						1
+#define CAM_CC_IFE_0_BCR					2
+#define CAM_CC_IFE_1_BCR					3
+#define CAM_CC_IFE_2_BCR					4
+#define CAM_CC_IPE_0_BCR					5
+#define CAM_CC_QDSS_DEBUG_BCR					6
+#define CAM_CC_SBI_BCR						7
+#define CAM_CC_SFE_0_BCR					8
+#define CAM_CC_SFE_1_BCR					9
+
+/* CAM_CC GDSCRs */
+#define BPS_GDSC		0
+#define IPE_0_GDSC		1
+#define SBI_GDSC		2
+#define IFE_0_GDSC		3
+#define IFE_1_GDSC		4
+#define IFE_2_GDSC		5
+#define SFE_0_GDSC		6
+#define SFE_1_GDSC		7
+#define TITAN_TOP_GDSC		8
+
+#endif
-- 
cgit 


From bfdd231374181254742c5e2faef0bef2d30c0ee4 Mon Sep 17 00:00:00 2001
From: Yunfei Wang <yf.wang@mediatek.com>
Date: Thu, 30 Jun 2022 17:29:25 +0800
Subject: iommu/io-pgtable-arm-v7s: Add a quirk to allow pgtable PA up to 35bit

Single memory zone feature will remove ZONE_DMA32 and ZONE_DMA and
cause pgtable PA size larger than 32bit.

Since Mediatek IOMMU hardware support at most 35bit PA in pgtable,
so add a quirk to allow the PA of pgtables support up to bit35.

Signed-off-by: Ning Li <ning.li@mediatek.com>
Signed-off-by: Yunfei Wang <yf.wang@mediatek.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20220630092927.24925-2-yf.wang@mediatek.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/io-pgtable-arm-v7s.c | 75 ++++++++++++++++++++++++++++----------
 include/linux/io-pgtable.h         | 15 +++++---
 2 files changed, 66 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c
index be066c1503d3..ba3115fd0f86 100644
--- a/drivers/iommu/io-pgtable-arm-v7s.c
+++ b/drivers/iommu/io-pgtable-arm-v7s.c
@@ -182,14 +182,8 @@ static bool arm_v7s_is_mtk_enabled(struct io_pgtable_cfg *cfg)
 		(cfg->quirks & IO_PGTABLE_QUIRK_ARM_MTK_EXT);
 }
 
-static arm_v7s_iopte paddr_to_iopte(phys_addr_t paddr, int lvl,
-				    struct io_pgtable_cfg *cfg)
+static arm_v7s_iopte to_mtk_iopte(phys_addr_t paddr, arm_v7s_iopte pte)
 {
-	arm_v7s_iopte pte = paddr & ARM_V7S_LVL_MASK(lvl);
-
-	if (!arm_v7s_is_mtk_enabled(cfg))
-		return pte;
-
 	if (paddr & BIT_ULL(32))
 		pte |= ARM_V7S_ATTR_MTK_PA_BIT32;
 	if (paddr & BIT_ULL(33))
@@ -199,6 +193,17 @@ static arm_v7s_iopte paddr_to_iopte(phys_addr_t paddr, int lvl,
 	return pte;
 }
 
+static arm_v7s_iopte paddr_to_iopte(phys_addr_t paddr, int lvl,
+				    struct io_pgtable_cfg *cfg)
+{
+	arm_v7s_iopte pte = paddr & ARM_V7S_LVL_MASK(lvl);
+
+	if (arm_v7s_is_mtk_enabled(cfg))
+		return to_mtk_iopte(paddr, pte);
+
+	return pte;
+}
+
 static phys_addr_t iopte_to_paddr(arm_v7s_iopte pte, int lvl,
 				  struct io_pgtable_cfg *cfg)
 {
@@ -240,10 +245,17 @@ static void *__arm_v7s_alloc_table(int lvl, gfp_t gfp,
 	dma_addr_t dma;
 	size_t size = ARM_V7S_TABLE_SIZE(lvl, cfg);
 	void *table = NULL;
+	gfp_t gfp_l1;
+
+	/*
+	 * ARM_MTK_TTBR_EXT extend the translation table base support larger
+	 * memory address.
+	 */
+	gfp_l1 = cfg->quirks & IO_PGTABLE_QUIRK_ARM_MTK_TTBR_EXT ?
+		 GFP_KERNEL : ARM_V7S_TABLE_GFP_DMA;
 
 	if (lvl == 1)
-		table = (void *)__get_free_pages(
-			__GFP_ZERO | ARM_V7S_TABLE_GFP_DMA, get_order(size));
+		table = (void *)__get_free_pages(gfp_l1 | __GFP_ZERO, get_order(size));
 	else if (lvl == 2)
 		table = kmem_cache_zalloc(data->l2_tables, gfp);
 
@@ -251,7 +263,8 @@ static void *__arm_v7s_alloc_table(int lvl, gfp_t gfp,
 		return NULL;
 
 	phys = virt_to_phys(table);
-	if (phys != (arm_v7s_iopte)phys) {
+	if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_MTK_TTBR_EXT ?
+	    phys >= (1ULL << cfg->oas) : phys != (arm_v7s_iopte)phys) {
 		/* Doesn't fit in PTE */
 		dev_err(dev, "Page table does not fit in PTE: %pa", &phys);
 		goto out_free;
@@ -457,9 +470,14 @@ static arm_v7s_iopte arm_v7s_install_table(arm_v7s_iopte *table,
 					   arm_v7s_iopte curr,
 					   struct io_pgtable_cfg *cfg)
 {
+	phys_addr_t phys = virt_to_phys(table);
 	arm_v7s_iopte old, new;
 
-	new = virt_to_phys(table) | ARM_V7S_PTE_TYPE_TABLE;
+	new = phys | ARM_V7S_PTE_TYPE_TABLE;
+
+	if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_MTK_TTBR_EXT)
+		new = to_mtk_iopte(phys, new);
+
 	if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS)
 		new |= ARM_V7S_ATTR_NS_TABLE;
 
@@ -779,6 +797,8 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg,
 						void *cookie)
 {
 	struct arm_v7s_io_pgtable *data;
+	slab_flags_t slab_flag;
+	phys_addr_t paddr;
 
 	if (cfg->ias > (arm_v7s_is_mtk_enabled(cfg) ? 34 : ARM_V7S_ADDR_BITS))
 		return NULL;
@@ -788,7 +808,8 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg,
 
 	if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
 			    IO_PGTABLE_QUIRK_NO_PERMS |
-			    IO_PGTABLE_QUIRK_ARM_MTK_EXT))
+			    IO_PGTABLE_QUIRK_ARM_MTK_EXT |
+			    IO_PGTABLE_QUIRK_ARM_MTK_TTBR_EXT))
 		return NULL;
 
 	/* If ARM_MTK_4GB is enabled, the NO_PERMS is also expected. */
@@ -796,15 +817,27 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg,
 	    !(cfg->quirks & IO_PGTABLE_QUIRK_NO_PERMS))
 			return NULL;
 
+	if ((cfg->quirks & IO_PGTABLE_QUIRK_ARM_MTK_TTBR_EXT) &&
+	    !arm_v7s_is_mtk_enabled(cfg))
+		return NULL;
+
 	data = kmalloc(sizeof(*data), GFP_KERNEL);
 	if (!data)
 		return NULL;
 
 	spin_lock_init(&data->split_lock);
+
+	/*
+	 * ARM_MTK_TTBR_EXT extend the translation table base support larger
+	 * memory address.
+	 */
+	slab_flag = cfg->quirks & IO_PGTABLE_QUIRK_ARM_MTK_TTBR_EXT ?
+		    0 : ARM_V7S_TABLE_SLAB_FLAGS;
+
 	data->l2_tables = kmem_cache_create("io-pgtable_armv7s_l2",
 					    ARM_V7S_TABLE_SIZE(2, cfg),
 					    ARM_V7S_TABLE_SIZE(2, cfg),
-					    ARM_V7S_TABLE_SLAB_FLAGS, NULL);
+					    slab_flag, NULL);
 	if (!data->l2_tables)
 		goto out_free_data;
 
@@ -850,12 +883,16 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg,
 	wmb();
 
 	/* TTBR */
-	cfg->arm_v7s_cfg.ttbr = virt_to_phys(data->pgd) | ARM_V7S_TTBR_S |
-				(cfg->coherent_walk ? (ARM_V7S_TTBR_NOS |
-				 ARM_V7S_TTBR_IRGN_ATTR(ARM_V7S_RGN_WBWA) |
-				 ARM_V7S_TTBR_ORGN_ATTR(ARM_V7S_RGN_WBWA)) :
-				(ARM_V7S_TTBR_IRGN_ATTR(ARM_V7S_RGN_NC) |
-				 ARM_V7S_TTBR_ORGN_ATTR(ARM_V7S_RGN_NC)));
+	paddr = virt_to_phys(data->pgd);
+	if (arm_v7s_is_mtk_enabled(cfg))
+		cfg->arm_v7s_cfg.ttbr = paddr | upper_32_bits(paddr);
+	else
+		cfg->arm_v7s_cfg.ttbr = paddr | ARM_V7S_TTBR_S |
+					(cfg->coherent_walk ? (ARM_V7S_TTBR_NOS |
+					 ARM_V7S_TTBR_IRGN_ATTR(ARM_V7S_RGN_WBWA) |
+					 ARM_V7S_TTBR_ORGN_ATTR(ARM_V7S_RGN_WBWA)) :
+					(ARM_V7S_TTBR_IRGN_ATTR(ARM_V7S_RGN_NC) |
+					 ARM_V7S_TTBR_ORGN_ATTR(ARM_V7S_RGN_NC)));
 	return &data->iop;
 
 out_free_data:
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 86af6f0a00a2..ca98aeadcc80 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -74,17 +74,22 @@ struct io_pgtable_cfg {
 	 *	to support up to 35 bits PA where the bit32, bit33 and bit34 are
 	 *	encoded in the bit9, bit4 and bit5 of the PTE respectively.
 	 *
+	 * IO_PGTABLE_QUIRK_ARM_MTK_TTBR_EXT: (ARM v7s format) MediaTek IOMMUs
+	 *	extend the translation table base support up to 35 bits PA, the
+	 *	encoding format is same with IO_PGTABLE_QUIRK_ARM_MTK_EXT.
+	 *
 	 * IO_PGTABLE_QUIRK_ARM_TTBR1: (ARM LPAE format) Configure the table
 	 *	for use in the upper half of a split address space.
 	 *
 	 * IO_PGTABLE_QUIRK_ARM_OUTER_WBWA: Override the outer-cacheability
 	 *	attributes set in the TCR for a non-coherent page-table walker.
 	 */
-	#define IO_PGTABLE_QUIRK_ARM_NS		BIT(0)
-	#define IO_PGTABLE_QUIRK_NO_PERMS	BIT(1)
-	#define IO_PGTABLE_QUIRK_ARM_MTK_EXT	BIT(3)
-	#define IO_PGTABLE_QUIRK_ARM_TTBR1	BIT(5)
-	#define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA	BIT(6)
+	#define IO_PGTABLE_QUIRK_ARM_NS			BIT(0)
+	#define IO_PGTABLE_QUIRK_NO_PERMS		BIT(1)
+	#define IO_PGTABLE_QUIRK_ARM_MTK_EXT		BIT(3)
+	#define IO_PGTABLE_QUIRK_ARM_MTK_TTBR_EXT	BIT(4)
+	#define IO_PGTABLE_QUIRK_ARM_TTBR1		BIT(5)
+	#define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA		BIT(6)
 	unsigned long			quirks;
 	unsigned long			pgsize_bitmap;
 	unsigned int			ias;
-- 
cgit 


From 961343d7822624d0e329ab4167c7e1d02bb53112 Mon Sep 17 00:00:00 2001
From: Samuel Holland <samuel@sholland.org>
Date: Fri, 1 Jul 2022 15:00:53 -0500
Subject: genirq: Refactor accessors to use irq_data_get_affinity_mask

A couple of functions directly reference the affinity mask. Route them
through irq_data_get_affinity_mask so they will pick up any refactoring
done there.

Signed-off-by: Samuel Holland <samuel@sholland.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20220701200056.46555-6-samuel@sholland.org
---
 include/linux/irq.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 505308253d23..69ee4e2f36ce 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -879,16 +879,16 @@ static inline int irq_data_get_node(struct irq_data *d)
 	return irq_common_data_get_node(d->common);
 }
 
-static inline struct cpumask *irq_get_affinity_mask(int irq)
+static inline struct cpumask *irq_data_get_affinity_mask(struct irq_data *d)
 {
-	struct irq_data *d = irq_get_irq_data(irq);
-
-	return d ? d->common->affinity : NULL;
+	return d->common->affinity;
 }
 
-static inline struct cpumask *irq_data_get_affinity_mask(struct irq_data *d)
+static inline struct cpumask *irq_get_affinity_mask(int irq)
 {
-	return d->common->affinity;
+	struct irq_data *d = irq_get_irq_data(irq);
+
+	return d ? irq_data_get_affinity_mask(d) : NULL;
 }
 
 #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
@@ -910,7 +910,7 @@ static inline void irq_data_update_effective_affinity(struct irq_data *d,
 static inline
 struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d)
 {
-	return d->common->affinity;
+	return irq_data_get_affinity_mask(d);
 }
 #endif
 
-- 
cgit 


From 073352e951f60946452da358d64841066c3142ff Mon Sep 17 00:00:00 2001
From: Samuel Holland <samuel@sholland.org>
Date: Fri, 1 Jul 2022 15:00:54 -0500
Subject: genirq: Add and use an irq_data_update_affinity helper

Some architectures and irqchip drivers modify the cpumask returned by
irq_data_get_affinity_mask, usually by copying in to it. This is
problematic for uniprocessor configurations, where the affinity mask
should be constant, as it is known at compile time.

Add and use a setter for the affinity mask, following the pattern of
irq_data_update_effective_affinity. This allows the getter function to
return a const cpumask pointer.

Signed-off-by: Samuel Holland <samuel@sholland.org>
Reviewed-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> # Xen bits
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20220701200056.46555-7-samuel@sholland.org
---
 arch/alpha/kernel/irq.c          | 2 +-
 arch/ia64/kernel/iosapic.c       | 2 +-
 arch/ia64/kernel/irq.c           | 4 ++--
 arch/ia64/kernel/msi_ia64.c      | 4 ++--
 arch/parisc/kernel/irq.c         | 2 +-
 drivers/irqchip/irq-bcm6345-l1.c | 4 ++--
 drivers/parisc/iosapic.c         | 2 +-
 drivers/sh/intc/chip.c           | 2 +-
 drivers/xen/events/events_base.c | 7 ++++---
 include/linux/irq.h              | 6 ++++++
 10 files changed, 21 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c
index f6d2946edbd2..15f2effd6baf 100644
--- a/arch/alpha/kernel/irq.c
+++ b/arch/alpha/kernel/irq.c
@@ -60,7 +60,7 @@ int irq_select_affinity(unsigned int irq)
 		cpu = (cpu < (NR_CPUS-1) ? cpu + 1 : 0);
 	last_cpu = cpu;
 
-	cpumask_copy(irq_data_get_affinity_mask(data), cpumask_of(cpu));
+	irq_data_update_affinity(data, cpumask_of(cpu));
 	chip->irq_set_affinity(data, cpumask_of(cpu), false);
 	return 0;
 }
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
index 35adcf89035a..99300850abc1 100644
--- a/arch/ia64/kernel/iosapic.c
+++ b/arch/ia64/kernel/iosapic.c
@@ -834,7 +834,7 @@ iosapic_unregister_intr (unsigned int gsi)
 	if (iosapic_intr_info[irq].count == 0) {
 #ifdef CONFIG_SMP
 		/* Clear affinity */
-		cpumask_setall(irq_get_affinity_mask(irq));
+		irq_data_update_affinity(irq_get_irq_data(irq), cpu_all_mask);
 #endif
 		/* Clear the interrupt information */
 		iosapic_intr_info[irq].dest = 0;
diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c
index ecef17c7c35b..275b9ea58c64 100644
--- a/arch/ia64/kernel/irq.c
+++ b/arch/ia64/kernel/irq.c
@@ -57,8 +57,8 @@ static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 };
 void set_irq_affinity_info (unsigned int irq, int hwid, int redir)
 {
 	if (irq < NR_IRQS) {
-		cpumask_copy(irq_get_affinity_mask(irq),
-			     cpumask_of(cpu_logical_id(hwid)));
+		irq_data_update_affinity(irq_get_irq_data(irq),
+					 cpumask_of(cpu_logical_id(hwid)));
 		irq_redir[irq] = (char) (redir & 0xff);
 	}
 }
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
index df5c28f252e3..025e5133c860 100644
--- a/arch/ia64/kernel/msi_ia64.c
+++ b/arch/ia64/kernel/msi_ia64.c
@@ -37,7 +37,7 @@ static int ia64_set_msi_irq_affinity(struct irq_data *idata,
 	msg.data = data;
 
 	pci_write_msi_msg(irq, &msg);
-	cpumask_copy(irq_data_get_affinity_mask(idata), cpumask_of(cpu));
+	irq_data_update_affinity(idata, cpumask_of(cpu));
 
 	return 0;
 }
@@ -132,7 +132,7 @@ static int dmar_msi_set_affinity(struct irq_data *data,
 	msg.address_lo |= MSI_ADDR_DEST_ID_CPU(cpu_physical_id(cpu));
 
 	dmar_msi_write(irq, &msg);
-	cpumask_copy(irq_data_get_affinity_mask(data), mask);
+	irq_data_update_affinity(data, mask);
 
 	return 0;
 }
diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index 0fe2d79fb123..5ebb1771b4ab 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -315,7 +315,7 @@ unsigned long txn_affinity_addr(unsigned int irq, int cpu)
 {
 #ifdef CONFIG_SMP
 	struct irq_data *d = irq_get_irq_data(irq);
-	cpumask_copy(irq_data_get_affinity_mask(d), cpumask_of(cpu));
+	irq_data_update_affinity(d, cpumask_of(cpu));
 #endif
 
 	return per_cpu(cpu_data, cpu).txn_addr;
diff --git a/drivers/irqchip/irq-bcm6345-l1.c b/drivers/irqchip/irq-bcm6345-l1.c
index 142a7431745f..6899e37810a8 100644
--- a/drivers/irqchip/irq-bcm6345-l1.c
+++ b/drivers/irqchip/irq-bcm6345-l1.c
@@ -216,11 +216,11 @@ static int bcm6345_l1_set_affinity(struct irq_data *d,
 		enabled = intc->cpus[old_cpu]->enable_cache[word] & mask;
 		if (enabled)
 			__bcm6345_l1_mask(d);
-		cpumask_copy(irq_data_get_affinity_mask(d), dest);
+		irq_data_update_affinity(d, dest);
 		if (enabled)
 			__bcm6345_l1_unmask(d);
 	} else {
-		cpumask_copy(irq_data_get_affinity_mask(d), dest);
+		irq_data_update_affinity(d, dest);
 	}
 	raw_spin_unlock_irqrestore(&intc->lock, flags);
 
diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c
index 8a3b0c3a1e92..3a8c98615634 100644
--- a/drivers/parisc/iosapic.c
+++ b/drivers/parisc/iosapic.c
@@ -677,7 +677,7 @@ static int iosapic_set_affinity_irq(struct irq_data *d,
 	if (dest_cpu < 0)
 		return -1;
 
-	cpumask_copy(irq_data_get_affinity_mask(d), cpumask_of(dest_cpu));
+	irq_data_update_affinity(d, cpumask_of(dest_cpu));
 	vi->txn_addr = txn_affinity_addr(d->irq, dest_cpu);
 
 	spin_lock_irqsave(&iosapic_lock, flags);
diff --git a/drivers/sh/intc/chip.c b/drivers/sh/intc/chip.c
index 358df7510186..828d81e02b37 100644
--- a/drivers/sh/intc/chip.c
+++ b/drivers/sh/intc/chip.c
@@ -72,7 +72,7 @@ static int intc_set_affinity(struct irq_data *data,
 	if (!cpumask_intersects(cpumask, cpu_online_mask))
 		return -1;
 
-	cpumask_copy(irq_data_get_affinity_mask(data), cpumask);
+	irq_data_update_affinity(data, cpumask);
 
 	return IRQ_SET_MASK_OK_NOCOPY;
 }
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index 46d9295d9a6e..5e8321f43cbd 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -528,9 +528,10 @@ static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu,
 	BUG_ON(irq == -1);
 
 	if (IS_ENABLED(CONFIG_SMP) && force_affinity) {
-		cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(cpu));
-		cpumask_copy(irq_get_effective_affinity_mask(irq),
-			     cpumask_of(cpu));
+		struct irq_data *data = irq_get_irq_data(irq);
+
+		irq_data_update_affinity(data, cpumask_of(cpu));
+		irq_data_update_effective_affinity(data, cpumask_of(cpu));
 	}
 
 	xen_evtchn_port_bind_to_cpu(evtchn, cpu, info->cpu);
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 69ee4e2f36ce..adcfebceb777 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -884,6 +884,12 @@ static inline struct cpumask *irq_data_get_affinity_mask(struct irq_data *d)
 	return d->common->affinity;
 }
 
+static inline void irq_data_update_affinity(struct irq_data *d,
+					    const struct cpumask *m)
+{
+	cpumask_copy(d->common->affinity, m);
+}
+
 static inline struct cpumask *irq_get_affinity_mask(int irq)
 {
 	struct irq_data *d = irq_get_irq_data(irq);
-- 
cgit 


From 4d0b8298818b623f5fa51d5c49e1a142d3618ac9 Mon Sep 17 00:00:00 2001
From: Samuel Holland <samuel@sholland.org>
Date: Fri, 1 Jul 2022 15:00:55 -0500
Subject: genirq: Return a const cpumask from irq_data_get_affinity_mask

Now that the irq_data_update_affinity helper exists, enforce its use
by returning a a const cpumask from irq_data_get_affinity_mask.

Since the previous commit already updated places that needed to call
irq_data_update_affinity, this commit updates the remaining code that
either did not modify the cpumask or immediately passed the modified
mask to irq_set_affinity.

Signed-off-by: Samuel Holland <samuel@sholland.org>
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20220701200056.46555-8-samuel@sholland.org
---
 arch/mips/cavium-octeon/octeon-irq.c |  4 ++--
 arch/sh/kernel/irq.c                 |  7 ++++---
 arch/x86/hyperv/irqdomain.c          |  2 +-
 arch/xtensa/kernel/irq.c             |  7 ++++---
 drivers/iommu/hyperv-iommu.c         |  2 +-
 drivers/pci/controller/pci-hyperv.c  | 10 +++++-----
 include/linux/irq.h                  | 12 +++++++-----
 kernel/irq/chip.c                    |  8 +++++---
 kernel/irq/debugfs.c                 |  2 +-
 kernel/irq/ipi.c                     | 16 +++++++++-------
 10 files changed, 39 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/arch/mips/cavium-octeon/octeon-irq.c b/arch/mips/cavium-octeon/octeon-irq.c
index 6cdcbf4de763..9cb9ed44bcaf 100644
--- a/arch/mips/cavium-octeon/octeon-irq.c
+++ b/arch/mips/cavium-octeon/octeon-irq.c
@@ -263,7 +263,7 @@ static int next_cpu_for_irq(struct irq_data *data)
 
 #ifdef CONFIG_SMP
 	int cpu;
-	struct cpumask *mask = irq_data_get_affinity_mask(data);
+	const struct cpumask *mask = irq_data_get_affinity_mask(data);
 	int weight = cpumask_weight(mask);
 	struct octeon_ciu_chip_data *cd = irq_data_get_irq_chip_data(data);
 
@@ -758,7 +758,7 @@ static void octeon_irq_cpu_offline_ciu(struct irq_data *data)
 {
 	int cpu = smp_processor_id();
 	cpumask_t new_affinity;
-	struct cpumask *mask = irq_data_get_affinity_mask(data);
+	const struct cpumask *mask = irq_data_get_affinity_mask(data);
 
 	if (!cpumask_test_cpu(cpu, mask))
 		return;
diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
index ef0f0827cf57..56269c2c3414 100644
--- a/arch/sh/kernel/irq.c
+++ b/arch/sh/kernel/irq.c
@@ -230,16 +230,17 @@ void migrate_irqs(void)
 		struct irq_data *data = irq_get_irq_data(irq);
 
 		if (irq_data_get_node(data) == cpu) {
-			struct cpumask *mask = irq_data_get_affinity_mask(data);
+			const struct cpumask *mask = irq_data_get_affinity_mask(data);
 			unsigned int newcpu = cpumask_any_and(mask,
 							      cpu_online_mask);
 			if (newcpu >= nr_cpu_ids) {
 				pr_info_ratelimited("IRQ%u no longer affine to CPU%u\n",
 						    irq, cpu);
 
-				cpumask_setall(mask);
+				irq_set_affinity(irq, cpu_all_mask);
+			} else {
+				irq_set_affinity(irq, mask);
 			}
-			irq_set_affinity(irq, mask);
 		}
 	}
 }
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
index 7e0f6bedc248..42c70d28ef27 100644
--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -192,7 +192,7 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 	struct pci_dev *dev;
 	struct hv_interrupt_entry out_entry, *stored_entry;
 	struct irq_cfg *cfg = irqd_cfg(data);
-	cpumask_t *affinity;
+	const cpumask_t *affinity;
 	int cpu;
 	u64 status;
 
diff --git a/arch/xtensa/kernel/irq.c b/arch/xtensa/kernel/irq.c
index 529fe9245821..42f106004400 100644
--- a/arch/xtensa/kernel/irq.c
+++ b/arch/xtensa/kernel/irq.c
@@ -169,7 +169,7 @@ void migrate_irqs(void)
 
 	for_each_active_irq(i) {
 		struct irq_data *data = irq_get_irq_data(i);
-		struct cpumask *mask;
+		const struct cpumask *mask;
 		unsigned int newcpu;
 
 		if (irqd_is_per_cpu(data))
@@ -185,9 +185,10 @@ void migrate_irqs(void)
 			pr_info_ratelimited("IRQ%u no longer affine to CPU%u\n",
 					    i, cpu);
 
-			cpumask_setall(mask);
+			irq_set_affinity(i, cpu_all_mask);
+		} else {
+			irq_set_affinity(i, mask);
 		}
-		irq_set_affinity(i, mask);
 	}
 }
 #endif /* CONFIG_HOTPLUG_CPU */
diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c
index e285a220c913..51bd66a45a11 100644
--- a/drivers/iommu/hyperv-iommu.c
+++ b/drivers/iommu/hyperv-iommu.c
@@ -194,7 +194,7 @@ hyperv_root_ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
 	u32 vector;
 	struct irq_cfg *cfg;
 	int ioapic_id;
-	struct cpumask *affinity;
+	const struct cpumask *affinity;
 	int cpu;
 	struct hv_interrupt_entry entry;
 	struct hyperv_root_ir_data *data = irq_data->chip_data;
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index db814f7b93ba..aebada45569b 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -642,7 +642,7 @@ static void hv_arch_irq_unmask(struct irq_data *data)
 	struct hv_retarget_device_interrupt *params;
 	struct tran_int_desc *int_desc;
 	struct hv_pcibus_device *hbus;
-	struct cpumask *dest;
+	const struct cpumask *dest;
 	cpumask_var_t tmp;
 	struct pci_bus *pbus;
 	struct pci_dev *pdev;
@@ -1613,7 +1613,7 @@ out:
 }
 
 static u32 hv_compose_msi_req_v1(
-	struct pci_create_interrupt *int_pkt, struct cpumask *affinity,
+	struct pci_create_interrupt *int_pkt, const struct cpumask *affinity,
 	u32 slot, u8 vector, u8 vector_count)
 {
 	int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
@@ -1641,7 +1641,7 @@ static int hv_compose_msi_req_get_cpu(struct cpumask *affinity)
 }
 
 static u32 hv_compose_msi_req_v2(
-	struct pci_create_interrupt2 *int_pkt, struct cpumask *affinity,
+	struct pci_create_interrupt2 *int_pkt, const struct cpumask *affinity,
 	u32 slot, u8 vector, u8 vector_count)
 {
 	int cpu;
@@ -1660,7 +1660,7 @@ static u32 hv_compose_msi_req_v2(
 }
 
 static u32 hv_compose_msi_req_v3(
-	struct pci_create_interrupt3 *int_pkt, struct cpumask *affinity,
+	struct pci_create_interrupt3 *int_pkt, const struct cpumask *affinity,
 	u32 slot, u32 vector, u8 vector_count)
 {
 	int cpu;
@@ -1697,7 +1697,7 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 	struct hv_pci_dev *hpdev;
 	struct pci_bus *pbus;
 	struct pci_dev *pdev;
-	struct cpumask *dest;
+	const struct cpumask *dest;
 	struct compose_comp_ctxt comp;
 	struct tran_int_desc *int_desc;
 	struct msi_desc *msi_desc;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index adcfebceb777..02073f7a156e 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -879,7 +879,8 @@ static inline int irq_data_get_node(struct irq_data *d)
 	return irq_common_data_get_node(d->common);
 }
 
-static inline struct cpumask *irq_data_get_affinity_mask(struct irq_data *d)
+static inline
+const struct cpumask *irq_data_get_affinity_mask(struct irq_data *d)
 {
 	return d->common->affinity;
 }
@@ -890,7 +891,7 @@ static inline void irq_data_update_affinity(struct irq_data *d,
 	cpumask_copy(d->common->affinity, m);
 }
 
-static inline struct cpumask *irq_get_affinity_mask(int irq)
+static inline const struct cpumask *irq_get_affinity_mask(int irq)
 {
 	struct irq_data *d = irq_get_irq_data(irq);
 
@@ -899,7 +900,7 @@ static inline struct cpumask *irq_get_affinity_mask(int irq)
 
 #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
 static inline
-struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d)
+const struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d)
 {
 	return d->common->effective_affinity;
 }
@@ -914,13 +915,14 @@ static inline void irq_data_update_effective_affinity(struct irq_data *d,
 {
 }
 static inline
-struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d)
+const struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d)
 {
 	return irq_data_get_affinity_mask(d);
 }
 #endif
 
-static inline struct cpumask *irq_get_effective_affinity_mask(unsigned int irq)
+static inline
+const struct cpumask *irq_get_effective_affinity_mask(unsigned int irq)
 {
 	struct irq_data *d = irq_get_irq_data(irq);
 
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 886789dcee43..9c7ad2266317 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -188,7 +188,8 @@ enum {
 
 #ifdef CONFIG_SMP
 static int
-__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
+__irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff,
+		      bool force)
 {
 	struct irq_data *d = irq_desc_get_irq_data(desc);
 
@@ -224,7 +225,8 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
 }
 #else
 static __always_inline int
-__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
+__irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff,
+		      bool force)
 {
 	return IRQ_STARTUP_NORMAL;
 }
@@ -252,7 +254,7 @@ static int __irq_startup(struct irq_desc *desc)
 int irq_startup(struct irq_desc *desc, bool resend, bool force)
 {
 	struct irq_data *d = irq_desc_get_irq_data(desc);
-	struct cpumask *aff = irq_data_get_affinity_mask(d);
+	const struct cpumask *aff = irq_data_get_affinity_mask(d);
 	int ret = 0;
 
 	desc->depth = 0;
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index bc8e40cf2b65..bbcaac64038e 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -30,7 +30,7 @@ static void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state,
 static void irq_debug_show_masks(struct seq_file *m, struct irq_desc *desc)
 {
 	struct irq_data *data = irq_desc_get_irq_data(desc);
-	struct cpumask *msk;
+	const struct cpumask *msk;
 
 	msk = irq_data_get_affinity_mask(data);
 	seq_printf(m, "affinity: %*pbl\n", cpumask_pr_args(msk));
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 08ce7da3b57c..bbd945bacef0 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -115,11 +115,11 @@ free_descs:
 int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
 {
 	struct irq_data *data = irq_get_irq_data(irq);
-	struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
+	const struct cpumask *ipimask;
 	struct irq_domain *domain;
 	unsigned int nr_irqs;
 
-	if (!irq || !data || !ipimask)
+	if (!irq || !data)
 		return -EINVAL;
 
 	domain = data->domain;
@@ -131,7 +131,8 @@ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
 		return -EINVAL;
 	}
 
-	if (WARN_ON(!cpumask_subset(dest, ipimask)))
+	ipimask = irq_data_get_affinity_mask(data);
+	if (!ipimask || WARN_ON(!cpumask_subset(dest, ipimask)))
 		/*
 		 * Must be destroying a subset of CPUs to which this IPI
 		 * was set up to target
@@ -162,12 +163,13 @@ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
 irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
 {
 	struct irq_data *data = irq_get_irq_data(irq);
-	struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
+	const struct cpumask *ipimask;
 
-	if (!data || !ipimask || cpu >= nr_cpu_ids)
+	if (!data || cpu >= nr_cpu_ids)
 		return INVALID_HWIRQ;
 
-	if (!cpumask_test_cpu(cpu, ipimask))
+	ipimask = irq_data_get_affinity_mask(data);
+	if (!ipimask || !cpumask_test_cpu(cpu, ipimask))
 		return INVALID_HWIRQ;
 
 	/*
@@ -186,7 +188,7 @@ EXPORT_SYMBOL_GPL(ipi_get_hwirq);
 static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
 			   const struct cpumask *dest, unsigned int cpu)
 {
-	struct cpumask *ipimask = irq_data_get_affinity_mask(data);
+	const struct cpumask *ipimask = irq_data_get_affinity_mask(data);
 
 	if (!chip || !ipimask)
 		return -EINVAL;
-- 
cgit 


From aa0813581b8d37bdd91cd40b67ef79ffa45104b2 Mon Sep 17 00:00:00 2001
From: Samuel Holland <samuel@sholland.org>
Date: Fri, 1 Jul 2022 15:00:56 -0500
Subject: genirq: Provide an IRQ affinity mask in non-SMP configs

IRQ affinity masks are not allocated in uniprocessor configurations.
This requires special case non-SMP code in drivers for irqchips which
have per-CPU enable or mask registers.

Since IRQ affinity is always the same in a uniprocessor configuration,
we can provide a correct affinity mask without allocating one per IRQ.

By returning a real cpumask from irq_data_get_affinity_mask even when
SMP is disabled, irqchip drivers which iterate over that mask will
automatically do the right thing.

Signed-off-by: Samuel Holland <samuel@sholland.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20220701200056.46555-9-samuel@sholland.org
---
 include/linux/irq.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 02073f7a156e..996e22744edd 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -151,7 +151,9 @@ struct irq_common_data {
 #endif
 	void			*handler_data;
 	struct msi_desc		*msi_desc;
+#ifdef CONFIG_SMP
 	cpumask_var_t		affinity;
+#endif
 #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
 	cpumask_var_t		effective_affinity;
 #endif
@@ -882,13 +884,19 @@ static inline int irq_data_get_node(struct irq_data *d)
 static inline
 const struct cpumask *irq_data_get_affinity_mask(struct irq_data *d)
 {
+#ifdef CONFIG_SMP
 	return d->common->affinity;
+#else
+	return cpumask_of(0);
+#endif
 }
 
 static inline void irq_data_update_affinity(struct irq_data *d,
 					    const struct cpumask *m)
 {
+#ifdef CONFIG_SMP
 	cpumask_copy(d->common->affinity, m);
+#endif
 }
 
 static inline const struct cpumask *irq_get_affinity_mask(int irq)
-- 
cgit 


From 70c248aca9e7efa85a6664d5ab56c17c326c958f Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Fri, 10 Jun 2022 16:21:39 +0100
Subject: mm: kasan: Skip unpoisoning of user pages

Commit c275c5c6d50a ("kasan: disable freed user page poisoning with HW
tags") added __GFP_SKIP_KASAN_POISON to GFP_HIGHUSER_MOVABLE. A similar
argument can be made about unpoisoning, so also add
__GFP_SKIP_KASAN_UNPOISON to user pages. To ensure the user page is
still accessible via page_address() without a kasan fault, reset the
page->flags tag.

With the above changes, there is no need for the arm64
tag_clear_highpage() to reset the page->flags tag.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Reviewed-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Link: https://lore.kernel.org/r/20220610152141.2148929-3-catalin.marinas@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/mm/fault.c | 1 -
 include/linux/gfp.h   | 2 +-
 mm/page_alloc.c       | 7 +++++--
 3 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index c5e11768e5c1..cdf3ffa0c223 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -927,6 +927,5 @@ struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
 void tag_clear_highpage(struct page *page)
 {
 	mte_zero_clear_page_tags(page_address(page));
-	page_kasan_tag_reset(page);
 	set_bit(PG_mte_tagged, &page->flags);
 }
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 2d2ccae933c2..0ace7759acd2 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -348,7 +348,7 @@ struct vm_area_struct;
 #define GFP_DMA32	__GFP_DMA32
 #define GFP_HIGHUSER	(GFP_USER | __GFP_HIGHMEM)
 #define GFP_HIGHUSER_MOVABLE	(GFP_HIGHUSER | __GFP_MOVABLE | \
-			 __GFP_SKIP_KASAN_POISON)
+			 __GFP_SKIP_KASAN_POISON | __GFP_SKIP_KASAN_UNPOISON)
 #define GFP_TRANSHUGE_LIGHT	((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
 			 __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
 #define GFP_TRANSHUGE	(GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e008a3df0485..f6ed240870bc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2397,6 +2397,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
 	bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
 			!should_skip_init(gfp_flags);
 	bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS);
+	int i;
 
 	set_page_private(page, 0);
 	set_page_refcounted(page);
@@ -2422,8 +2423,6 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
 	 * should be initialized as well).
 	 */
 	if (init_tags) {
-		int i;
-
 		/* Initialize both memory and tags. */
 		for (i = 0; i != 1 << order; ++i)
 			tag_clear_highpage(page + i);
@@ -2438,6 +2437,10 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
 		/* Note that memory is already initialized by KASAN. */
 		if (kasan_has_integrated_init())
 			init = false;
+	} else {
+		/* Ensure page_address() dereferencing does not fault. */
+		for (i = 0; i != 1 << order; ++i)
+			page_kasan_tag_reset(page + i);
 	}
 	/* If memory is still not initialized, do it now. */
 	if (init)
-- 
cgit 


From 2aec377a29250b942f14d3c16d49783da3e9df11 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@kernel.org>
Date: Tue, 5 Jul 2022 14:00:36 -0400
Subject: dm table: remove dm_table_get_num_targets() wrapper

More efficient and readable to just access table->num_targets directly.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 drivers/md/dm-ima.c           |  2 +-
 drivers/md/dm-ioctl.c         |  6 +++---
 drivers/md/dm-table.c         | 37 ++++++++++++++++---------------------
 drivers/md/dm-zone.c          |  2 +-
 drivers/md/dm.c               |  4 ++--
 include/linux/device-mapper.h |  1 -
 6 files changed, 23 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/drivers/md/dm-ima.c b/drivers/md/dm-ima.c
index 1842d3a958ef..3c15c069947f 100644
--- a/drivers/md/dm-ima.c
+++ b/drivers/md/dm-ima.c
@@ -208,7 +208,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl
 	if (!target_data_buf)
 		goto error;
 
-	num_targets = dm_table_get_num_targets(table);
+	num_targets = table->num_targets;
 
 	if (dm_ima_alloc_and_copy_device_data(table->md, &device_data_buf, num_targets, noio))
 		goto error;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 87310fceb0d8..98976aaa9db9 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -832,7 +832,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
 		if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) {
 			if (get_disk_ro(disk))
 				param->flags |= DM_READONLY_FLAG;
-			param->target_count = dm_table_get_num_targets(table);
+			param->target_count = table->num_targets;
 		}
 
 		param->flags |= DM_ACTIVE_PRESENT_FLAG;
@@ -845,7 +845,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
 		if (table) {
 			if (!(dm_table_get_mode(table) & FMODE_WRITE))
 				param->flags |= DM_READONLY_FLAG;
-			param->target_count = dm_table_get_num_targets(table);
+			param->target_count = table->num_targets;
 		}
 		dm_put_live_table(md, srcu_idx);
 	}
@@ -1248,7 +1248,7 @@ static void retrieve_status(struct dm_table *table,
 		type = STATUSTYPE_INFO;
 
 	/* Get all the target info */
-	num_targets = dm_table_get_num_targets(table);
+	num_targets = table->num_targets;
 	for (i = 0; i < num_targets; i++) {
 		struct dm_target *ti = dm_table_get_target(table, i);
 		size_t l;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 3f29b1113294..b4af34041a6f 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -593,7 +593,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table,
 	/*
 	 * Check each entry in the table in turn.
 	 */
-	for (i = 0; i < dm_table_get_num_targets(table); i++) {
+	for (i = 0; i < table->num_targets; i++) {
 		ti = dm_table_get_target(table, i);
 
 		blk_set_stacking_limits(&ti_limits);
@@ -832,7 +832,7 @@ static bool dm_table_supports_dax(struct dm_table *t,
 	unsigned i;
 
 	/* Ensure that all targets support DAX. */
-	for (i = 0; i < dm_table_get_num_targets(t); i++) {
+	for (i = 0; i < t->num_targets; i++) {
 		ti = dm_table_get_target(t, i);
 
 		if (!ti->type->direct_access)
@@ -987,7 +987,7 @@ struct dm_target *dm_table_get_wildcard_target(struct dm_table *t)
 	struct dm_target *ti;
 	unsigned i;
 
-	for (i = 0; i < dm_table_get_num_targets(t); i++) {
+	for (i = 0; i < t->num_targets; i++) {
 		ti = dm_table_get_target(t, i);
 		if (dm_target_is_wildcard(ti->type))
 			return ti;
@@ -1127,7 +1127,7 @@ static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t)
 	struct gendisk *prev_disk = NULL, *template_disk = NULL;
 	unsigned i;
 
-	for (i = 0; i < dm_table_get_num_targets(t); i++) {
+	for (i = 0; i < t->num_targets; i++) {
 		struct dm_target *ti = dm_table_get_target(t, i);
 		if (!dm_target_passes_integrity(ti->type))
 			goto no_integrity;
@@ -1248,7 +1248,7 @@ static int dm_keyslot_evict(struct blk_crypto_profile *profile,
 	t = dm_get_live_table(md, &srcu_idx);
 	if (!t)
 		return 0;
-	for (i = 0; i < dm_table_get_num_targets(t); i++) {
+	for (i = 0; i < t->num_targets; i++) {
 		ti = dm_table_get_target(t, i);
 		if (!ti->type->iterate_devices)
 			continue;
@@ -1318,7 +1318,7 @@ static int dm_table_construct_crypto_profile(struct dm_table *t)
 	memset(profile->modes_supported, 0xFF,
 	       sizeof(profile->modes_supported));
 
-	for (i = 0; i < dm_table_get_num_targets(t); i++) {
+	for (i = 0; i < t->num_targets; i++) {
 		ti = dm_table_get_target(t, i);
 
 		if (!dm_target_passes_crypto(ti->type)) {
@@ -1540,7 +1540,7 @@ static bool dm_table_any_dev_attr(struct dm_table *t,
 	struct dm_target *ti;
 	unsigned int i;
 
-	for (i = 0; i < dm_table_get_num_targets(t); i++) {
+	for (i = 0; i < t->num_targets; i++) {
 		ti = dm_table_get_target(t, i);
 
 		if (ti->type->iterate_devices &&
@@ -1566,7 +1566,7 @@ static bool dm_table_supports_poll(struct dm_table *t)
 	struct dm_target *ti;
 	unsigned i = 0;
 
-	while (i < dm_table_get_num_targets(t)) {
+	while (i < t->num_targets) {
 		ti = dm_table_get_target(t, i++);
 
 		if (!ti->type->iterate_devices ||
@@ -1588,7 +1588,7 @@ bool dm_table_has_no_data_devices(struct dm_table *table)
 	struct dm_target *ti;
 	unsigned i, num_devices;
 
-	for (i = 0; i < dm_table_get_num_targets(table); i++) {
+	for (i = 0; i < table->num_targets; i++) {
 		ti = dm_table_get_target(table, i);
 
 		if (!ti->type->iterate_devices)
@@ -1625,7 +1625,7 @@ static bool dm_table_supports_zoned_model(struct dm_table *t,
 	struct dm_target *ti;
 	unsigned i;
 
-	for (i = 0; i < dm_table_get_num_targets(t); i++) {
+	for (i = 0; i < t->num_targets; i++) {
 		ti = dm_table_get_target(t, i);
 
 		if (dm_target_supports_zoned_hm(ti->type)) {
@@ -1699,7 +1699,7 @@ int dm_calculate_queue_limits(struct dm_table *table,
 
 	blk_set_stacking_limits(limits);
 
-	for (i = 0; i < dm_table_get_num_targets(table); i++) {
+	for (i = 0; i < table->num_targets; i++) {
 		blk_set_stacking_limits(&ti_limits);
 
 		ti = dm_table_get_target(table, i);
@@ -1819,7 +1819,7 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
 	 * so we need to use iterate_devices here, which targets
 	 * supporting flushes must provide.
 	 */
-	for (i = 0; i < dm_table_get_num_targets(t); i++) {
+	for (i = 0; i < t->num_targets; i++) {
 		ti = dm_table_get_target(t, i);
 
 		if (!ti->num_flush_bios)
@@ -1877,7 +1877,7 @@ static bool dm_table_supports_write_zeroes(struct dm_table *t)
 	struct dm_target *ti;
 	unsigned i = 0;
 
-	while (i < dm_table_get_num_targets(t)) {
+	while (i < t->num_targets) {
 		ti = dm_table_get_target(t, i++);
 
 		if (!ti->num_write_zeroes_bios)
@@ -1904,7 +1904,7 @@ static bool dm_table_supports_nowait(struct dm_table *t)
 	struct dm_target *ti;
 	unsigned i = 0;
 
-	while (i < dm_table_get_num_targets(t)) {
+	while (i < t->num_targets) {
 		ti = dm_table_get_target(t, i++);
 
 		if (!dm_target_supports_nowait(ti->type))
@@ -1929,7 +1929,7 @@ static bool dm_table_supports_discards(struct dm_table *t)
 	struct dm_target *ti;
 	unsigned i;
 
-	for (i = 0; i < dm_table_get_num_targets(t); i++) {
+	for (i = 0; i < t->num_targets; i++) {
 		ti = dm_table_get_target(t, i);
 
 		if (!ti->num_discard_bios)
@@ -1961,7 +1961,7 @@ static bool dm_table_supports_secure_erase(struct dm_table *t)
 	struct dm_target *ti;
 	unsigned int i;
 
-	for (i = 0; i < dm_table_get_num_targets(t); i++) {
+	for (i = 0; i < t->num_targets; i++) {
 		ti = dm_table_get_target(t, i);
 
 		if (!ti->num_secure_erase_bios)
@@ -2092,11 +2092,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	return 0;
 }
 
-unsigned int dm_table_get_num_targets(struct dm_table *t)
-{
-	return t->num_targets;
-}
-
 struct list_head *dm_table_get_devices(struct dm_table *t)
 {
 	return &t->devices;
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
index 3e7b1fe1580b..16dc9ec4bb11 100644
--- a/drivers/md/dm-zone.c
+++ b/drivers/md/dm-zone.c
@@ -278,7 +278,7 @@ static bool dm_table_supports_zone_append(struct dm_table *t)
 	struct dm_target *ti;
 	unsigned int i;
 
-	for (i = 0; i < dm_table_get_num_targets(t); i++) {
+	for (i = 0; i < t->num_targets; i++) {
 		ti = dm_table_get_target(t, i);
 
 		if (ti->emulate_zone_append)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index fa6839141118..44dae3869f29 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -422,7 +422,7 @@ retry:
 		return r;
 
 	/* We only support devices that have a single target */
-	if (dm_table_get_num_targets(map) != 1)
+	if (map->num_targets != 1)
 		return r;
 
 	tgt = dm_table_get_target(map, 0);
@@ -3092,7 +3092,7 @@ static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
 		goto out;
 
 	/* We only support devices that have a single target */
-	if (dm_table_get_num_targets(table) != 1)
+	if (table->num_targets != 1)
 		goto out;
 	ti = dm_table_get_target(table, 0);
 
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 47a01c7cffdf..920085dd7f3b 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -561,7 +561,6 @@ void dm_sync_table(struct mapped_device *md);
  * Queries
  */
 sector_t dm_table_get_size(struct dm_table *t);
-unsigned int dm_table_get_num_targets(struct dm_table *t);
 fmode_t dm_table_get_mode(struct dm_table *t);
 struct mapped_device *dm_table_get_md(struct dm_table *t);
 const char *dm_table_device_name(struct dm_table *t);
-- 
cgit 


From 752f596371289cb3b45e99453b5c09d96ed36614 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@kernel.org>
Date: Sun, 26 Jun 2022 10:10:48 +0100
Subject: docs: filesystems: update netfs-api.rst reference

Changeset efc930fa1d84 ("docs: filesystems: caching/netfs-api.txt: convert it to ReST")
renamed: Documentation/filesystems/caching/netfs-api.txt
to: Documentation/filesystems/caching/netfs-api.rst.

Update its cross-reference accordingly.

Fixes: efc930fa1d84 ("docs: filesystems: caching/netfs-api.txt: convert it to ReST")
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
Link: https://lore.kernel.org/r/5f867f01d42c3e65e111167739ed1a41a26623f9.1656234456.git.mchehab@kernel.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 include/linux/fscache.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 72585c9729a2..2f82ea31d4c1 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -378,7 +378,7 @@ void fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data,
  *
  * Request that the size of an object be changed.
  *
- * See Documentation/filesystems/caching/netfs-api.txt for a complete
+ * See Documentation/filesystems/caching/netfs-api.rst for a complete
  * description.
  */
 static inline
-- 
cgit 


From c02b872a7ca7842e4cdbbf621f77607d0a655f83 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@kernel.org>
Date: Sun, 26 Jun 2022 10:10:56 +0100
Subject: Documentation: update watch_queue.rst references

Changeset f5461124d59b ("Documentation: move watch_queue to core-api")
renamed: Documentation/watch_queue.rst
to: Documentation/core-api/watch_queue.rst.

Update the cross-references accordingly.

Fixes: f5461124d59b ("Documentation: move watch_queue to core-api")
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
Link: https://lore.kernel.org/r/1c220de9c58f35e815a3df9458ac2bea323c8bfb.1656234456.git.mchehab@kernel.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/security/keys/core.rst | 2 +-
 include/linux/watch_queue.h          | 2 +-
 init/Kconfig                         | 2 +-
 kernel/watch_queue.c                 | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/Documentation/security/keys/core.rst b/Documentation/security/keys/core.rst
index b3ed5c581034..811b905b56bf 100644
--- a/Documentation/security/keys/core.rst
+++ b/Documentation/security/keys/core.rst
@@ -1046,7 +1046,7 @@ The keyctl syscall functions are:
      "filter" is either NULL to remove a watch or a filter specification to
      indicate what events are required from the key.
 
-     See Documentation/watch_queue.rst for more information.
+     See Documentation/core-api/watch_queue.rst for more information.
 
      Note that only one watch may be emplaced for any particular { key,
      queue_fd } combination.
diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h
index 3b9a40ae8bdb..fc6bba20273b 100644
--- a/include/linux/watch_queue.h
+++ b/include/linux/watch_queue.h
@@ -4,7 +4,7 @@
  * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
- * See Documentation/watch_queue.rst
+ * See Documentation/core-api/watch_queue.rst
  */
 
 #ifndef _LINUX_WATCH_QUEUE_H
diff --git a/init/Kconfig b/init/Kconfig
index c7900e8975f1..6f1a5355dc61 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -414,7 +414,7 @@ config WATCH_QUEUE
 	  with watches for key/keyring change notifications and device
 	  notifications.
 
-	  See Documentation/watch_queue.rst
+	  See Documentation/core-api/watch_queue.rst
 
 config CROSS_MEMORY_ATTACH
 	bool "Enable process_vm_readv/writev syscalls"
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 230038d4f908..869fea4fe26b 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -4,7 +4,7 @@
  * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
- * See Documentation/watch_queue.rst
+ * See Documentation/core-api/watch_queue.rst
  */
 
 #define pr_fmt(fmt) "watchq: " fmt
-- 
cgit 


From d6a21f2d73258f2a4cd2e7806f5755ee73fddced Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@kernel.org>
Date: Sun, 26 Jun 2022 10:11:01 +0100
Subject: objtool: update objtool.txt references

Changeset a8e35fece49b ("objtool: Update documentation")
renamed: tools/objtool/Documentation/stack-validation.txt
to: tools/objtool/Documentation/objtool.txt.

Update the cross-references accordingly.

Fixes: a8e35fece49b ("objtool: Update documentation")
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
Link: https://lore.kernel.org/r/ec285ece6348a5be191aebe45f78d06b3319056b.1656234456.git.mchehab@kernel.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/x86/orc-unwinder.rst | 2 +-
 include/linux/objtool.h            | 2 +-
 lib/Kconfig.debug                  | 2 +-
 tools/include/linux/objtool.h      | 2 +-
 tools/objtool/check.c              | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/Documentation/x86/orc-unwinder.rst b/Documentation/x86/orc-unwinder.rst
index 9a66a88be765..cdb257015bd9 100644
--- a/Documentation/x86/orc-unwinder.rst
+++ b/Documentation/x86/orc-unwinder.rst
@@ -140,7 +140,7 @@ Unwinder implementation details
 
 Objtool generates the ORC data by integrating with the compile-time
 stack metadata validation feature, which is described in detail in
-tools/objtool/Documentation/stack-validation.txt.  After analyzing all
+tools/objtool/Documentation/objtool.txt.  After analyzing all
 the code paths of a .o file, it creates an array of orc_entry structs,
 and a parallel array of instruction addresses associated with those
 structs, and writes them to the .orc_unwind and .orc_unwind_ip sections
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 6491fa8fba6d..6f89471005ee 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -62,7 +62,7 @@ struct unwind_hint {
  * It should only be used in special cases where you're 100% sure it won't
  * affect the reliability of frame pointers and kernel stack traces.
  *
- * For more information, see tools/objtool/Documentation/stack-validation.txt.
+ * For more information, see tools/objtool/Documentation/objtool.txt.
  */
 #define STACK_FRAME_NON_STANDARD(func) \
 	static void __used __section(".discard.func_stack_frame_non_standard") \
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 2e24db4bff19..79a71eb96111 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -498,7 +498,7 @@ config STACK_VALIDATION
 	  runtime stack traces are more reliable.
 
 	  For more information, see
-	  tools/objtool/Documentation/stack-validation.txt.
+	  tools/objtool/Documentation/objtool.txt.
 
 config NOINSTR_VALIDATION
 	bool
diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h
index 6491fa8fba6d..6f89471005ee 100644
--- a/tools/include/linux/objtool.h
+++ b/tools/include/linux/objtool.h
@@ -62,7 +62,7 @@ struct unwind_hint {
  * It should only be used in special cases where you're 100% sure it won't
  * affect the reliability of frame pointers and kernel stack traces.
  *
- * For more information, see tools/objtool/Documentation/stack-validation.txt.
+ * For more information, see tools/objtool/Documentation/objtool.txt.
  */
 #define STACK_FRAME_NON_STANDARD(func) \
 	static void __used __section(".discard.func_stack_frame_non_standard") \
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 864bb9dd3584..970844ceecdc 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -3190,7 +3190,7 @@ static struct instruction *next_insn_to_validate(struct objtool_file *file,
  * Follow the branch starting at the given instruction, and recursively follow
  * any other branches (jumps).  Meanwhile, track the frame pointer state at
  * each instruction and validate all the rules described in
- * tools/objtool/Documentation/stack-validation.txt.
+ * tools/objtool/Documentation/objtool.txt.
  */
 static int validate_branch(struct objtool_file *file, struct symbol *func,
 			   struct instruction *insn, struct insn_state state)
-- 
cgit 


From 3566ee1d776c1393393564b2514f9cd52a49c16e Mon Sep 17 00:00:00 2001
From: Michael Kawano <mkawano@linux.ibm.com>
Date: Thu, 7 Jul 2022 15:57:27 +0200
Subject: vfio/ccw: Remove UUID from s390 debug log

As vfio-ccw devices are created/destroyed, the uuid of the associated
mdevs that are recorded in $S390DBF/vfio_ccw_msg/sprintf get lost.
This is because a pointer to the UUID is stored instead of the UUID
itself, and that memory may have been repurposed if/when the logs are
examined. The result is usually garbage UUID data in the logs, though
there is an outside chance of an oops happening here.

Simply remove the UUID from the traces, as the subchannel number will
provide useful configuration information for problem determination,
and is stored directly into the log instead of a pointer.

As we were the only consumer of mdev_uuid(), remove that too.

Cc: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Michael Kawano <mkawano@linux.ibm.com>
Fixes: 60e05d1cf0875 ("vfio-ccw: add some logging")
Fixes: b7701dfbf9832 ("vfio-ccw: Register a chp_event callback for vfio-ccw")
[farman: reworded commit message, added Fixes: tags]
Signed-off-by: Eric Farman <farman@linux.ibm.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com>
Link: https://lore.kernel.org/r/20220707135737.720765-2-farman@linux.ibm.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/s390/cio/vfio_ccw_drv.c |  5 ++---
 drivers/s390/cio/vfio_ccw_fsm.c | 26 ++++++++++++--------------
 drivers/s390/cio/vfio_ccw_ops.c |  8 ++++----
 include/linux/mdev.h            |  5 -----
 4 files changed, 18 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c
index ee182cfb467d..35055eb94115 100644
--- a/drivers/s390/cio/vfio_ccw_drv.c
+++ b/drivers/s390/cio/vfio_ccw_drv.c
@@ -14,7 +14,6 @@
 #include <linux/init.h>
 #include <linux/device.h>
 #include <linux/slab.h>
-#include <linux/uuid.h>
 #include <linux/mdev.h>
 
 #include <asm/isc.h>
@@ -358,8 +357,8 @@ static int vfio_ccw_chp_event(struct subchannel *sch,
 		return 0;
 
 	trace_vfio_ccw_chp_event(private->sch->schid, mask, event);
-	VFIO_CCW_MSG_EVENT(2, "%pUl (%x.%x.%04x): mask=0x%x event=%d\n",
-			   mdev_uuid(private->mdev), sch->schid.cssid,
+	VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: mask=0x%x event=%d\n",
+			   sch->schid.cssid,
 			   sch->schid.ssid, sch->schid.sch_no,
 			   mask, event);
 
diff --git a/drivers/s390/cio/vfio_ccw_fsm.c b/drivers/s390/cio/vfio_ccw_fsm.c
index 8483a266051c..bbcc5b486749 100644
--- a/drivers/s390/cio/vfio_ccw_fsm.c
+++ b/drivers/s390/cio/vfio_ccw_fsm.c
@@ -10,7 +10,6 @@
  */
 
 #include <linux/vfio.h>
-#include <linux/mdev.h>
 
 #include "ioasm.h"
 #include "vfio_ccw_private.h"
@@ -242,7 +241,6 @@ static void fsm_io_request(struct vfio_ccw_private *private,
 	union orb *orb;
 	union scsw *scsw = &private->scsw;
 	struct ccw_io_region *io_region = private->io_region;
-	struct mdev_device *mdev = private->mdev;
 	char *errstr = "request";
 	struct subchannel_id schid = get_schid(private);
 
@@ -256,8 +254,8 @@ static void fsm_io_request(struct vfio_ccw_private *private,
 		if (orb->tm.b) {
 			io_region->ret_code = -EOPNOTSUPP;
 			VFIO_CCW_MSG_EVENT(2,
-					   "%pUl (%x.%x.%04x): transport mode\n",
-					   mdev_uuid(mdev), schid.cssid,
+					   "sch %x.%x.%04x: transport mode\n",
+					   schid.cssid,
 					   schid.ssid, schid.sch_no);
 			errstr = "transport mode";
 			goto err_out;
@@ -265,8 +263,8 @@ static void fsm_io_request(struct vfio_ccw_private *private,
 		io_region->ret_code = cp_init(&private->cp, orb);
 		if (io_region->ret_code) {
 			VFIO_CCW_MSG_EVENT(2,
-					   "%pUl (%x.%x.%04x): cp_init=%d\n",
-					   mdev_uuid(mdev), schid.cssid,
+					   "sch %x.%x.%04x: cp_init=%d\n",
+					   schid.cssid,
 					   schid.ssid, schid.sch_no,
 					   io_region->ret_code);
 			errstr = "cp init";
@@ -276,8 +274,8 @@ static void fsm_io_request(struct vfio_ccw_private *private,
 		io_region->ret_code = cp_prefetch(&private->cp);
 		if (io_region->ret_code) {
 			VFIO_CCW_MSG_EVENT(2,
-					   "%pUl (%x.%x.%04x): cp_prefetch=%d\n",
-					   mdev_uuid(mdev), schid.cssid,
+					   "sch %x.%x.%04x: cp_prefetch=%d\n",
+					   schid.cssid,
 					   schid.ssid, schid.sch_no,
 					   io_region->ret_code);
 			errstr = "cp prefetch";
@@ -289,8 +287,8 @@ static void fsm_io_request(struct vfio_ccw_private *private,
 		io_region->ret_code = fsm_io_helper(private);
 		if (io_region->ret_code) {
 			VFIO_CCW_MSG_EVENT(2,
-					   "%pUl (%x.%x.%04x): fsm_io_helper=%d\n",
-					   mdev_uuid(mdev), schid.cssid,
+					   "sch %x.%x.%04x: fsm_io_helper=%d\n",
+					   schid.cssid,
 					   schid.ssid, schid.sch_no,
 					   io_region->ret_code);
 			errstr = "cp fsm_io_helper";
@@ -300,16 +298,16 @@ static void fsm_io_request(struct vfio_ccw_private *private,
 		return;
 	} else if (scsw->cmd.fctl & SCSW_FCTL_HALT_FUNC) {
 		VFIO_CCW_MSG_EVENT(2,
-				   "%pUl (%x.%x.%04x): halt on io_region\n",
-				   mdev_uuid(mdev), schid.cssid,
+				   "sch %x.%x.%04x: halt on io_region\n",
+				   schid.cssid,
 				   schid.ssid, schid.sch_no);
 		/* halt is handled via the async cmd region */
 		io_region->ret_code = -EOPNOTSUPP;
 		goto err_out;
 	} else if (scsw->cmd.fctl & SCSW_FCTL_CLEAR_FUNC) {
 		VFIO_CCW_MSG_EVENT(2,
-				   "%pUl (%x.%x.%04x): clear on io_region\n",
-				   mdev_uuid(mdev), schid.cssid,
+				   "sch %x.%x.%04x: clear on io_region\n",
+				   schid.cssid,
 				   schid.ssid, schid.sch_no);
 		/* clear is handled via the async cmd region */
 		io_region->ret_code = -EOPNOTSUPP;
diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
index b49e2e9db2dc..0e05bff78b8e 100644
--- a/drivers/s390/cio/vfio_ccw_ops.c
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -131,8 +131,8 @@ static int vfio_ccw_mdev_probe(struct mdev_device *mdev)
 	private->mdev = mdev;
 	private->state = VFIO_CCW_STATE_IDLE;
 
-	VFIO_CCW_MSG_EVENT(2, "mdev %pUl, sch %x.%x.%04x: create\n",
-			   mdev_uuid(mdev), private->sch->schid.cssid,
+	VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: create\n",
+			   private->sch->schid.cssid,
 			   private->sch->schid.ssid,
 			   private->sch->schid.sch_no);
 
@@ -154,8 +154,8 @@ static void vfio_ccw_mdev_remove(struct mdev_device *mdev)
 {
 	struct vfio_ccw_private *private = dev_get_drvdata(mdev->dev.parent);
 
-	VFIO_CCW_MSG_EVENT(2, "mdev %pUl, sch %x.%x.%04x: remove\n",
-			   mdev_uuid(mdev), private->sch->schid.cssid,
+	VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: remove\n",
+			   private->sch->schid.cssid,
 			   private->sch->schid.ssid,
 			   private->sch->schid.sch_no);
 
diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index bb539794f54a..47ad3b104d9e 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -65,11 +65,6 @@ struct mdev_driver {
 	struct device_driver driver;
 };
 
-static inline const guid_t *mdev_uuid(struct mdev_device *mdev)
-{
-	return &mdev->uuid;
-}
-
 extern struct bus_type mdev_bus_type;
 
 int mdev_register_device(struct device *dev, struct mdev_driver *mdev_driver);
-- 
cgit 


From 6b206a5a8c2912c3c2174c5afc2f6e798d6ad212 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Tue, 28 Jun 2022 15:02:27 -0500
Subject: scsi: target: Add callout to configure UNMAP settings

Add a callout to configure a backend's UNMAP settings. This will be used to
allow userspace to configure UNMAP after the initial device setup, similar
to how we can set up the other attributes post device configuration.

Link: https://lore.kernel.org/r/20220628200230.15052-3-michael.christie@oracle.com
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/target_core_device.c  | 6 ++++++
 include/target/target_core_backend.h | 1 +
 2 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c
index 25f33eb25337..086ac9c9343c 100644
--- a/drivers/target/target_core_device.c
+++ b/drivers/target/target_core_device.c
@@ -960,6 +960,12 @@ int target_configure_device(struct se_device *dev)
 	ret = dev->transport->configure_device(dev);
 	if (ret)
 		goto out_free_index;
+
+	if (dev->transport->configure_unmap &&
+	    dev->transport->configure_unmap(dev)) {
+		pr_debug("Discard support available, but disabled by default.\n");
+	}
+
 	/*
 	 * XXX: there is not much point to have two different values here..
 	 */
diff --git a/include/target/target_core_backend.h b/include/target/target_core_backend.h
index 773963a1e0b5..a3c193df25b3 100644
--- a/include/target/target_core_backend.h
+++ b/include/target/target_core_backend.h
@@ -37,6 +37,7 @@ struct target_backend_ops {
 	struct se_dev_plug *(*plug_device)(struct se_device *se_dev);
 	void (*unplug_device)(struct se_dev_plug *se_plug);
 
+	bool (*configure_unmap)(struct se_device *se_dev);
 	ssize_t (*set_configfs_dev_params)(struct se_device *,
 					   const char *, ssize_t);
 	ssize_t (*show_configfs_dev_params)(struct se_device *, char *);
-- 
cgit 


From 87686cc845c3be7dea777f1dbf2de0767007cda8 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 4 Jul 2022 16:10:39 +0530
Subject: OPP: Make dev_pm_opp_set_regulators() accept NULL terminated list

Make dev_pm_opp_set_regulators() accept a NULL terminated list of names
instead of making the callers keep the two parameters in sync, which
creates an opportunity for bugs to get in.

Suggested-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Steven Price <steven.price@arm.com> # panfrost
Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq-dt.c                |  9 ++++-----
 drivers/cpufreq/ti-cpufreq.c                |  7 +++----
 drivers/devfreq/exynos-bus.c                |  4 ++--
 drivers/gpu/drm/lima/lima_devfreq.c         |  3 ++-
 drivers/gpu/drm/panfrost/panfrost_devfreq.c |  3 +--
 drivers/gpu/drm/panfrost/panfrost_drv.c     | 15 ++++++++++-----
 drivers/opp/core.c                          | 18 ++++++++++++------
 drivers/soc/tegra/pmc.c                     |  4 ++--
 include/linux/pm_opp.h                      |  9 ++++-----
 9 files changed, 40 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index 8fcaba541539..be0c19b3ffa5 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -193,7 +193,7 @@ static int dt_cpufreq_early_init(struct device *dev, int cpu)
 	struct private_data *priv;
 	struct device *cpu_dev;
 	bool fallback = false;
-	const char *reg_name;
+	const char *reg_name[] = { NULL, NULL };
 	int ret;
 
 	/* Check if this CPU is already covered by some other policy */
@@ -218,10 +218,9 @@ static int dt_cpufreq_early_init(struct device *dev, int cpu)
 	 * OPP layer will be taking care of regulators now, but it needs to know
 	 * the name of the regulator first.
 	 */
-	reg_name = find_supply_name(cpu_dev);
-	if (reg_name) {
-		priv->opp_table = dev_pm_opp_set_regulators(cpu_dev, &reg_name,
-							    1);
+	reg_name[0] = find_supply_name(cpu_dev);
+	if (reg_name[0]) {
+		priv->opp_table = dev_pm_opp_set_regulators(cpu_dev, reg_name);
 		if (IS_ERR(priv->opp_table)) {
 			ret = PTR_ERR(priv->opp_table);
 			if (ret != -EPROBE_DEFER)
diff --git a/drivers/cpufreq/ti-cpufreq.c b/drivers/cpufreq/ti-cpufreq.c
index 8f9fdd864391..560d67a6bef1 100644
--- a/drivers/cpufreq/ti-cpufreq.c
+++ b/drivers/cpufreq/ti-cpufreq.c
@@ -173,7 +173,7 @@ static struct ti_cpufreq_soc_data omap34xx_soc_data = {
  *    seems to always read as 0).
  */
 
-static const char * const omap3_reg_names[] = {"cpu0", "vbb"};
+static const char * const omap3_reg_names[] = {"cpu0", "vbb", NULL};
 
 static struct ti_cpufreq_soc_data omap36xx_soc_data = {
 	.reg_names = omap3_reg_names,
@@ -326,7 +326,7 @@ static int ti_cpufreq_probe(struct platform_device *pdev)
 	const struct of_device_id *match;
 	struct opp_table *ti_opp_table;
 	struct ti_cpufreq_data *opp_data;
-	const char * const default_reg_names[] = {"vdd", "vbb"};
+	const char * const default_reg_names[] = {"vdd", "vbb", NULL};
 	int ret;
 
 	match = dev_get_platdata(&pdev->dev);
@@ -387,8 +387,7 @@ static int ti_cpufreq_probe(struct platform_device *pdev)
 		if (opp_data->soc_data->reg_names)
 			reg_names = opp_data->soc_data->reg_names;
 		ti_opp_table = dev_pm_opp_set_regulators(opp_data->cpu_dev,
-							 reg_names,
-							 ARRAY_SIZE(default_reg_names));
+							 reg_names);
 		if (IS_ERR(ti_opp_table)) {
 			dev_pm_opp_put_supported_hw(opp_data->opp_table);
 			ret =  PTR_ERR(ti_opp_table);
diff --git a/drivers/devfreq/exynos-bus.c b/drivers/devfreq/exynos-bus.c
index e689101abc93..541baff93ee8 100644
--- a/drivers/devfreq/exynos-bus.c
+++ b/drivers/devfreq/exynos-bus.c
@@ -180,10 +180,10 @@ static int exynos_bus_parent_parse_of(struct device_node *np,
 {
 	struct device *dev = bus->dev;
 	struct opp_table *opp_table;
-	const char *vdd = "vdd";
+	const char *supplies[] = { "vdd", NULL };
 	int i, ret, count, size;
 
-	opp_table = dev_pm_opp_set_regulators(dev, &vdd, 1);
+	opp_table = dev_pm_opp_set_regulators(dev, supplies);
 	if (IS_ERR(opp_table)) {
 		ret = PTR_ERR(opp_table);
 		dev_err(dev, "failed to set regulators %d\n", ret);
diff --git a/drivers/gpu/drm/lima/lima_devfreq.c b/drivers/gpu/drm/lima/lima_devfreq.c
index 8989e215dfc9..dc83c5421125 100644
--- a/drivers/gpu/drm/lima/lima_devfreq.c
+++ b/drivers/gpu/drm/lima/lima_devfreq.c
@@ -111,6 +111,7 @@ int lima_devfreq_init(struct lima_device *ldev)
 	struct dev_pm_opp *opp;
 	unsigned long cur_freq;
 	int ret;
+	const char *regulator_names[] = { "mali", NULL };
 
 	if (!device_property_present(dev, "operating-points-v2"))
 		/* Optional, continue without devfreq */
@@ -122,7 +123,7 @@ int lima_devfreq_init(struct lima_device *ldev)
 	if (ret)
 		return ret;
 
-	ret = devm_pm_opp_set_regulators(dev, (const char *[]){ "mali" }, 1);
+	ret = devm_pm_opp_set_regulators(dev, regulator_names);
 	if (ret) {
 		/* Continue if the optional regulator is missing */
 		if (ret != -ENODEV)
diff --git a/drivers/gpu/drm/panfrost/panfrost_devfreq.c b/drivers/gpu/drm/panfrost/panfrost_devfreq.c
index 194af7f607a6..5110cd9b2425 100644
--- a/drivers/gpu/drm/panfrost/panfrost_devfreq.c
+++ b/drivers/gpu/drm/panfrost/panfrost_devfreq.c
@@ -101,8 +101,7 @@ int panfrost_devfreq_init(struct panfrost_device *pfdev)
 		return 0;
 	}
 
-	ret = devm_pm_opp_set_regulators(dev, pfdev->comp->supply_names,
-					 pfdev->comp->num_supplies);
+	ret = devm_pm_opp_set_regulators(dev, pfdev->comp->supply_names);
 	if (ret) {
 		/* Continue if the optional regulator is missing */
 		if (ret != -ENODEV) {
diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c
index 7fcbc2a5b6cd..8a4bef65d38c 100644
--- a/drivers/gpu/drm/panfrost/panfrost_drv.c
+++ b/drivers/gpu/drm/panfrost/panfrost_drv.c
@@ -625,24 +625,29 @@ static int panfrost_remove(struct platform_device *pdev)
 	return 0;
 }
 
-static const char * const default_supplies[] = { "mali" };
+/*
+ * The OPP core wants the supply names to be NULL terminated, but we need the
+ * correct num_supplies value for regulator core. Hence, we NULL terminate here
+ * and then initialize num_supplies with ARRAY_SIZE - 1.
+ */
+static const char * const default_supplies[] = { "mali", NULL };
 static const struct panfrost_compatible default_data = {
-	.num_supplies = ARRAY_SIZE(default_supplies),
+	.num_supplies = ARRAY_SIZE(default_supplies) - 1,
 	.supply_names = default_supplies,
 	.num_pm_domains = 1, /* optional */
 	.pm_domain_names = NULL,
 };
 
 static const struct panfrost_compatible amlogic_data = {
-	.num_supplies = ARRAY_SIZE(default_supplies),
+	.num_supplies = ARRAY_SIZE(default_supplies) - 1,
 	.supply_names = default_supplies,
 	.vendor_quirk = panfrost_gpu_amlogic_quirk,
 };
 
-static const char * const mediatek_mt8183_supplies[] = { "mali", "sram" };
+static const char * const mediatek_mt8183_supplies[] = { "mali", "sram", NULL };
 static const char * const mediatek_mt8183_pm_domains[] = { "core0", "core1", "core2" };
 static const struct panfrost_compatible mediatek_mt8183_data = {
-	.num_supplies = ARRAY_SIZE(mediatek_mt8183_supplies),
+	.num_supplies = ARRAY_SIZE(mediatek_mt8183_supplies) - 1,
 	.supply_names = mediatek_mt8183_supplies,
 	.num_pm_domains = ARRAY_SIZE(mediatek_mt8183_pm_domains),
 	.pm_domain_names = mediatek_mt8183_pm_domains,
diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index e166bfe5fc90..4e4593957ec5 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -2105,13 +2105,20 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
  * This must be called before any OPPs are initialized for the device.
  */
 struct opp_table *dev_pm_opp_set_regulators(struct device *dev,
-					    const char * const names[],
-					    unsigned int count)
+					    const char * const names[])
 {
 	struct dev_pm_opp_supply *supplies;
+	const char * const *temp = names;
 	struct opp_table *opp_table;
 	struct regulator *reg;
-	int ret, i;
+	int count = 0, ret, i;
+
+	/* Count number of regulators */
+	while (*temp++)
+		count++;
+
+	if (!count)
+		return ERR_PTR(-EINVAL);
 
 	opp_table = _add_opp_table(dev, false);
 	if (IS_ERR(opp_table))
@@ -2236,12 +2243,11 @@ static void devm_pm_opp_regulators_release(void *data)
  * Return: 0 on success and errorno otherwise.
  */
 int devm_pm_opp_set_regulators(struct device *dev,
-			       const char * const names[],
-			       unsigned int count)
+			       const char * const names[])
 {
 	struct opp_table *opp_table;
 
-	opp_table = dev_pm_opp_set_regulators(dev, names, count);
+	opp_table = dev_pm_opp_set_regulators(dev, names);
 	if (IS_ERR(opp_table))
 		return PTR_ERR(opp_table);
 
diff --git a/drivers/soc/tegra/pmc.c b/drivers/soc/tegra/pmc.c
index 5611d14d3ba2..6a4b8f7e7948 100644
--- a/drivers/soc/tegra/pmc.c
+++ b/drivers/soc/tegra/pmc.c
@@ -1384,7 +1384,7 @@ tegra_pmc_core_pd_opp_to_performance_state(struct generic_pm_domain *genpd,
 static int tegra_pmc_core_pd_add(struct tegra_pmc *pmc, struct device_node *np)
 {
 	struct generic_pm_domain *genpd;
-	const char *rname = "core";
+	const char *rname[] = { "core", NULL};
 	int err;
 
 	genpd = devm_kzalloc(pmc->dev, sizeof(*genpd), GFP_KERNEL);
@@ -1395,7 +1395,7 @@ static int tegra_pmc_core_pd_add(struct tegra_pmc *pmc, struct device_node *np)
 	genpd->set_performance_state = tegra_pmc_core_pd_set_performance_state;
 	genpd->opp_to_performance_state = tegra_pmc_core_pd_opp_to_performance_state;
 
-	err = devm_pm_opp_set_regulators(pmc->dev, &rname, 1);
+	err = devm_pm_opp_set_regulators(pmc->dev, rname);
 	if (err)
 		return dev_err_probe(pmc->dev, err,
 				     "failed to set core OPP regulator\n");
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 6708b4ec244d..4c490865d574 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -159,9 +159,9 @@ void dev_pm_opp_put_supported_hw(struct opp_table *opp_table);
 int devm_pm_opp_set_supported_hw(struct device *dev, const u32 *versions, unsigned int count);
 struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name);
 void dev_pm_opp_put_prop_name(struct opp_table *opp_table);
-struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count);
+struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[]);
 void dev_pm_opp_put_regulators(struct opp_table *opp_table);
-int devm_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count);
+int devm_pm_opp_set_regulators(struct device *dev, const char * const names[]);
 struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name);
 void dev_pm_opp_put_clkname(struct opp_table *opp_table);
 int devm_pm_opp_set_clkname(struct device *dev, const char *name);
@@ -379,7 +379,7 @@ static inline struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, con
 
 static inline void dev_pm_opp_put_prop_name(struct opp_table *opp_table) {}
 
-static inline struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count)
+static inline struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[])
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
@@ -387,8 +387,7 @@ static inline struct opp_table *dev_pm_opp_set_regulators(struct device *dev, co
 static inline void dev_pm_opp_put_regulators(struct opp_table *opp_table) {}
 
 static inline int devm_pm_opp_set_regulators(struct device *dev,
-					     const char * const names[],
-					     unsigned int count)
+					     const char * const names[])
 {
 	return -EOPNOTSUPP;
 }
-- 
cgit 


From 11b9b663585c4f8b00846089ebbca4d1e3283e86 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 25 May 2022 15:23:16 +0530
Subject: OPP: Add dev_pm_opp_set_config() and friends

The OPP core already have few configuration specific APIs and it is
getting complex or messy for both the OPP core and its users.

Lets introduce a new set of API which will be used for all kind of
different configurations, and shall eventually be used by all the
existing ones.

The new API, returns a unique token instead of a pointer to the OPP
table, which allows the OPP core to drop the resources selectively later
on.

Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 229 ++++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/opp/opp.h      |  21 +++++
 include/linux/pm_opp.h |  42 +++++++++
 3 files changed, 291 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 4e4593957ec5..7ab20c3b91ed 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -13,11 +13,12 @@
 #include <linux/clk.h>
 #include <linux/errno.h>
 #include <linux/err.h>
-#include <linux/slab.h>
 #include <linux/device.h>
 #include <linux/export.h>
 #include <linux/pm_domain.h>
 #include <linux/regulator/consumer.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
 
 #include "opp.h"
 
@@ -36,6 +37,9 @@ DEFINE_MUTEX(opp_table_lock);
 /* Flag indicating that opp_tables list is being updated at the moment */
 static bool opp_tables_busy;
 
+/* OPP ID allocator */
+static DEFINE_XARRAY_ALLOC1(opp_configs);
+
 static bool _find_opp_dev(const struct device *dev, struct opp_table *opp_table)
 {
 	struct opp_device *opp_dev;
@@ -2624,6 +2628,229 @@ int devm_pm_opp_attach_genpd(struct device *dev, const char * const *names,
 }
 EXPORT_SYMBOL_GPL(devm_pm_opp_attach_genpd);
 
+static void _opp_clear_config(struct opp_config_data *data)
+{
+	if (data->flags & OPP_CONFIG_GENPD)
+		dev_pm_opp_detach_genpd(data->opp_table);
+	if (data->flags & OPP_CONFIG_REGULATOR)
+		dev_pm_opp_put_regulators(data->opp_table);
+	if (data->flags & OPP_CONFIG_SUPPORTED_HW)
+		dev_pm_opp_put_supported_hw(data->opp_table);
+	if (data->flags & OPP_CONFIG_REGULATOR_HELPER)
+		dev_pm_opp_unregister_set_opp_helper(data->opp_table);
+	if (data->flags & OPP_CONFIG_PROP_NAME)
+		dev_pm_opp_put_prop_name(data->opp_table);
+	if (data->flags & OPP_CONFIG_CLK)
+		dev_pm_opp_put_clkname(data->opp_table);
+
+	dev_pm_opp_put_opp_table(data->opp_table);
+	kfree(data);
+}
+
+/**
+ * dev_pm_opp_set_config() - Set OPP configuration for the device.
+ * @dev: Device for which configuration is being set.
+ * @config: OPP configuration.
+ *
+ * This allows all device OPP configurations to be performed at once.
+ *
+ * This must be called before any OPPs are initialized for the device. This may
+ * be called multiple times for the same OPP table, for example once for each
+ * CPU that share the same table. This must be balanced by the same number of
+ * calls to dev_pm_opp_clear_config() in order to free the OPP table properly.
+ *
+ * This returns a token to the caller, which must be passed to
+ * dev_pm_opp_clear_config() to free the resources later. The value of the
+ * returned token will be >= 1 for success and negative for errors. The minimum
+ * value of 1 is chosen here to make it easy for callers to manage the resource.
+ */
+int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
+{
+	struct opp_table *opp_table, *err;
+	struct opp_config_data *data;
+	unsigned int id;
+	int ret;
+
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	opp_table = _add_opp_table(dev, false);
+	if (IS_ERR(opp_table)) {
+		kfree(data);
+		return PTR_ERR(opp_table);
+	}
+
+	data->opp_table = opp_table;
+	data->flags = 0;
+
+	/* This should be called before OPPs are initialized */
+	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
+		ret = -EBUSY;
+		goto err;
+	}
+
+	/* Configure clocks */
+	if (config->clk_names) {
+		const char * const *temp = config->clk_names;
+		int count = 0;
+
+		/* Count number of clks */
+		while (*temp++)
+			count++;
+
+		/*
+		 * This is a special case where we have a single clock, whose
+		 * connection id name is NULL, i.e. first two entries are NULL
+		 * in the array.
+		 */
+		if (!count && !config->clk_names[1])
+			count = 1;
+
+		/* We support only one clock name for now */
+		if (count != 1) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		err = dev_pm_opp_set_clkname(dev, config->clk_names[0]);
+		if (IS_ERR(err)) {
+			ret = PTR_ERR(err);
+			goto err;
+		}
+
+		data->flags |= OPP_CONFIG_CLK;
+	}
+
+	/* Configure property names */
+	if (config->prop_name) {
+		err = dev_pm_opp_set_prop_name(dev, config->prop_name);
+		if (IS_ERR(err)) {
+			ret = PTR_ERR(err);
+			goto err;
+		}
+
+		data->flags |= OPP_CONFIG_PROP_NAME;
+	}
+
+	/* Configure opp helper */
+	if (config->set_opp) {
+		err = dev_pm_opp_register_set_opp_helper(dev, config->set_opp);
+		if (IS_ERR(err)) {
+			ret = PTR_ERR(err);
+			goto err;
+		}
+
+		data->flags |= OPP_CONFIG_REGULATOR_HELPER;
+	}
+
+	/* Configure supported hardware */
+	if (config->supported_hw) {
+		err = dev_pm_opp_set_supported_hw(dev, config->supported_hw,
+						  config->supported_hw_count);
+		if (IS_ERR(err)) {
+			ret = PTR_ERR(err);
+			goto err;
+		}
+
+		data->flags |= OPP_CONFIG_SUPPORTED_HW;
+	}
+
+	/* Configure supplies */
+	if (config->regulator_names) {
+		err = dev_pm_opp_set_regulators(dev, config->regulator_names);
+		if (IS_ERR(err)) {
+			ret = PTR_ERR(err);
+			goto err;
+		}
+
+		data->flags |= OPP_CONFIG_REGULATOR;
+	}
+
+	/* Attach genpds */
+	if (config->genpd_names) {
+		err = dev_pm_opp_attach_genpd(dev, config->genpd_names,
+					      config->virt_devs);
+		if (IS_ERR(err)) {
+			ret = PTR_ERR(err);
+			goto err;
+		}
+
+		data->flags |= OPP_CONFIG_GENPD;
+	}
+
+	ret = xa_alloc(&opp_configs, &id, data, XA_LIMIT(1, INT_MAX),
+		       GFP_KERNEL);
+	if (ret)
+		goto err;
+
+	return id;
+
+err:
+	_opp_clear_config(data);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_config);
+
+/**
+ * dev_pm_opp_clear_config() - Releases resources blocked for OPP configuration.
+ * @opp_table: OPP table returned from dev_pm_opp_set_config().
+ *
+ * This allows all device OPP configurations to be cleared at once. This must be
+ * called once for each call made to dev_pm_opp_set_config(), in order to free
+ * the OPPs properly.
+ *
+ * Currently the first call itself ends up freeing all the OPP configurations,
+ * while the later ones only drop the OPP table reference. This works well for
+ * now as we would never want to use an half initialized OPP table and want to
+ * remove the configurations together.
+ */
+void dev_pm_opp_clear_config(int token)
+{
+	struct opp_config_data *data;
+
+	/*
+	 * This lets the callers call this unconditionally and keep their code
+	 * simple.
+	 */
+	if (unlikely(token <= 0))
+		return;
+
+	data = xa_erase(&opp_configs, token);
+	if (WARN_ON(!data))
+		return;
+
+	_opp_clear_config(data);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_clear_config);
+
+static void devm_pm_opp_config_release(void *token)
+{
+	dev_pm_opp_clear_config((unsigned long)token);
+}
+
+/**
+ * devm_pm_opp_set_config() - Set OPP configuration for the device.
+ * @dev: Device for which configuration is being set.
+ * @config: OPP configuration.
+ *
+ * This allows all device OPP configurations to be performed at once.
+ * This is a resource-managed variant of dev_pm_opp_set_config().
+ *
+ * Return: 0 on success and errorno otherwise.
+ */
+int devm_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
+{
+	int token = dev_pm_opp_set_config(dev, config);
+
+	if (token < 0)
+		return token;
+
+	return devm_add_action_or_reset(dev, devm_pm_opp_config_release,
+					(void *) ((unsigned long) token));
+}
+EXPORT_SYMBOL_GPL(devm_pm_opp_set_config);
+
 /**
  * dev_pm_opp_xlate_required_opp() - Find required OPP for @src_table OPP.
  * @src_table: OPP table which has @dst_table as one of its required OPP table.
diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
index 9e1cfcb0ea98..d652f0cc84f1 100644
--- a/drivers/opp/opp.h
+++ b/drivers/opp/opp.h
@@ -28,6 +28,27 @@ extern struct mutex opp_table_lock;
 
 extern struct list_head opp_tables, lazy_opp_tables;
 
+/* OPP Config flags */
+#define OPP_CONFIG_CLK			BIT(0)
+#define OPP_CONFIG_REGULATOR		BIT(1)
+#define OPP_CONFIG_REGULATOR_HELPER	BIT(2)
+#define OPP_CONFIG_PROP_NAME		BIT(3)
+#define OPP_CONFIG_SUPPORTED_HW		BIT(4)
+#define OPP_CONFIG_GENPD		BIT(5)
+
+/**
+ * struct opp_config_data - data for set config operations
+ * @opp_table: OPP table
+ * @flags: OPP config flags
+ *
+ * This structure stores the OPP config information for each OPP table
+ * configuration by the callers.
+ */
+struct opp_config_data {
+	struct opp_table *opp_table;
+	unsigned int flags;
+};
+
 /*
  * Internal data structure organization with the OPP layer library is as
  * follows:
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 4c490865d574..a08f9481efb3 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -90,6 +90,32 @@ struct dev_pm_set_opp_data {
 	struct device *dev;
 };
 
+/**
+ * struct dev_pm_opp_config - Device OPP configuration values
+ * @clk_names: Clk names, NULL terminated array, max 1 clock for now.
+ * @prop_name: Name to postfix to properties.
+ * @set_opp: Custom set OPP helper.
+ * @supported_hw: Array of hierarchy of versions to match.
+ * @supported_hw_count: Number of elements in the array.
+ * @regulator_names: Array of pointers to the names of the regulator, NULL terminated.
+ * @genpd_names: Null terminated array of pointers containing names of genpd to
+ *		 attach.
+ * @virt_devs: Pointer to return the array of virtual devices.
+ *
+ * This structure contains platform specific OPP configurations for the device.
+ */
+struct dev_pm_opp_config {
+	/* NULL terminated */
+	const char * const *clk_names;
+	const char *prop_name;
+	int (*set_opp)(struct dev_pm_set_opp_data *data);
+	const unsigned int *supported_hw;
+	unsigned int supported_hw_count;
+	const char * const *regulator_names;
+	const char * const *genpd_names;
+	struct device ***virt_devs;
+};
+
 #if defined(CONFIG_PM_OPP)
 
 struct opp_table *dev_pm_opp_get_opp_table(struct device *dev);
@@ -154,6 +180,10 @@ int dev_pm_opp_disable(struct device *dev, unsigned long freq);
 int dev_pm_opp_register_notifier(struct device *dev, struct notifier_block *nb);
 int dev_pm_opp_unregister_notifier(struct device *dev, struct notifier_block *nb);
 
+int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config);
+int devm_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config);
+void dev_pm_opp_clear_config(int token);
+
 struct opp_table *dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions, unsigned int count);
 void dev_pm_opp_put_supported_hw(struct opp_table *opp_table);
 int devm_pm_opp_set_supported_hw(struct device *dev, const u32 *versions, unsigned int count);
@@ -418,6 +448,18 @@ static inline int devm_pm_opp_attach_genpd(struct device *dev,
 	return -EOPNOTSUPP;
 }
 
+static inline int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int devm_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void dev_pm_opp_clear_config(int token) {}
+
 static inline struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table,
 				struct opp_table *dst_table, struct dev_pm_opp *src_opp)
 {
-- 
cgit 


From b0ec09428621daee5101130c307634a390b0213b Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 4 Jul 2022 16:36:26 +0530
Subject: OPP: Migrate set-regulators API to use set-config helpers

Now that we have a central API to handle all OPP table configurations,
migrate the set-regulators family of helpers to use the new
infrastructure.

The return type and parameter to the APIs change a bit due to this,
update the current users as well in the same commit in order to avoid
breaking builds.

Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq-dt.c | 12 +++---
 drivers/devfreq/exynos-bus.c | 19 ++++-----
 drivers/opp/core.c           | 91 +++++++++-----------------------------------
 include/linux/pm_opp.h       | 44 +++++++++++++--------
 4 files changed, 60 insertions(+), 106 deletions(-)

(limited to 'include')

diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index be0c19b3ffa5..d69d13a26414 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -29,9 +29,9 @@ struct private_data {
 
 	cpumask_var_t cpus;
 	struct device *cpu_dev;
-	struct opp_table *opp_table;
 	struct cpufreq_frequency_table *freq_table;
 	bool have_static_opps;
+	int opp_token;
 };
 
 static LIST_HEAD(priv_list);
@@ -220,9 +220,9 @@ static int dt_cpufreq_early_init(struct device *dev, int cpu)
 	 */
 	reg_name[0] = find_supply_name(cpu_dev);
 	if (reg_name[0]) {
-		priv->opp_table = dev_pm_opp_set_regulators(cpu_dev, reg_name);
-		if (IS_ERR(priv->opp_table)) {
-			ret = PTR_ERR(priv->opp_table);
+		priv->opp_token = dev_pm_opp_set_regulators(cpu_dev, reg_name);
+		if (priv->opp_token < 0) {
+			ret = priv->opp_token;
 			if (ret != -EPROBE_DEFER)
 				dev_err(cpu_dev, "failed to set regulators: %d\n",
 					ret);
@@ -294,7 +294,7 @@ static int dt_cpufreq_early_init(struct device *dev, int cpu)
 out:
 	if (priv->have_static_opps)
 		dev_pm_opp_of_cpumask_remove_table(priv->cpus);
-	dev_pm_opp_put_regulators(priv->opp_table);
+	dev_pm_opp_put_regulators(priv->opp_token);
 free_cpumask:
 	free_cpumask_var(priv->cpus);
 	return ret;
@@ -308,7 +308,7 @@ static void dt_cpufreq_release(void)
 		dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &priv->freq_table);
 		if (priv->have_static_opps)
 			dev_pm_opp_of_cpumask_remove_table(priv->cpus);
-		dev_pm_opp_put_regulators(priv->opp_table);
+		dev_pm_opp_put_regulators(priv->opp_token);
 		free_cpumask_var(priv->cpus);
 		list_del(&priv->node);
 	}
diff --git a/drivers/devfreq/exynos-bus.c b/drivers/devfreq/exynos-bus.c
index 541baff93ee8..d1235242367f 100644
--- a/drivers/devfreq/exynos-bus.c
+++ b/drivers/devfreq/exynos-bus.c
@@ -33,7 +33,7 @@ struct exynos_bus {
 
 	unsigned long curr_freq;
 
-	struct opp_table *opp_table;
+	int opp_token;
 	struct clk *clk;
 	unsigned int ratio;
 };
@@ -161,8 +161,7 @@ static void exynos_bus_exit(struct device *dev)
 
 	dev_pm_opp_of_remove_table(dev);
 	clk_disable_unprepare(bus->clk);
-	dev_pm_opp_put_regulators(bus->opp_table);
-	bus->opp_table = NULL;
+	dev_pm_opp_put_regulators(bus->opp_token);
 }
 
 static void exynos_bus_passive_exit(struct device *dev)
@@ -179,18 +178,16 @@ static int exynos_bus_parent_parse_of(struct device_node *np,
 					struct exynos_bus *bus)
 {
 	struct device *dev = bus->dev;
-	struct opp_table *opp_table;
 	const char *supplies[] = { "vdd", NULL };
 	int i, ret, count, size;
 
-	opp_table = dev_pm_opp_set_regulators(dev, supplies);
-	if (IS_ERR(opp_table)) {
-		ret = PTR_ERR(opp_table);
+	ret = dev_pm_opp_set_regulators(dev, supplies);
+	if (ret < 0) {
 		dev_err(dev, "failed to set regulators %d\n", ret);
 		return ret;
 	}
 
-	bus->opp_table = opp_table;
+	bus->opp_token = ret;
 
 	/*
 	 * Get the devfreq-event devices to get the current utilization of
@@ -236,8 +233,7 @@ static int exynos_bus_parent_parse_of(struct device_node *np,
 	return 0;
 
 err_regulator:
-	dev_pm_opp_put_regulators(bus->opp_table);
-	bus->opp_table = NULL;
+	dev_pm_opp_put_regulators(bus->opp_token);
 
 	return ret;
 }
@@ -459,8 +455,7 @@ err:
 	dev_pm_opp_of_remove_table(dev);
 	clk_disable_unprepare(bus->clk);
 err_reg:
-	dev_pm_opp_put_regulators(bus->opp_table);
-	bus->opp_table = NULL;
+	dev_pm_opp_put_regulators(bus->opp_token);
 
 	return ret;
 }
diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 7ab20c3b91ed..6ff9b5b69d07 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -991,8 +991,8 @@ static int _set_opp_custom(const struct opp_table *opp_table,
 	int size;
 
 	/*
-	 * We support this only if dev_pm_opp_set_regulators() was called
-	 * earlier.
+	 * We support this only if dev_pm_opp_set_config() was called
+	 * earlier to set regulators.
 	 */
 	if (opp_table->sod_supplies) {
 		size = sizeof(*old_opp->supplies) * opp_table->regulator_count;
@@ -2097,7 +2097,7 @@ void dev_pm_opp_put_prop_name(struct opp_table *opp_table)
 EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
 
 /**
- * dev_pm_opp_set_regulators() - Set regulator names for the device
+ * _opp_set_regulators() - Set regulator names for the device
  * @dev: Device for which regulator name is being set.
  * @names: Array of pointers to the names of the regulator.
  * @count: Number of regulators.
@@ -2108,12 +2108,11 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
  *
  * This must be called before any OPPs are initialized for the device.
  */
-struct opp_table *dev_pm_opp_set_regulators(struct device *dev,
-					    const char * const names[])
+static int _opp_set_regulators(struct opp_table *opp_table, struct device *dev,
+			       const char * const names[])
 {
 	struct dev_pm_opp_supply *supplies;
 	const char * const *temp = names;
-	struct opp_table *opp_table;
 	struct regulator *reg;
 	int count = 0, ret, i;
 
@@ -2122,29 +2121,17 @@ struct opp_table *dev_pm_opp_set_regulators(struct device *dev,
 		count++;
 
 	if (!count)
-		return ERR_PTR(-EINVAL);
-
-	opp_table = _add_opp_table(dev, false);
-	if (IS_ERR(opp_table))
-		return opp_table;
-
-	/* This should be called before OPPs are initialized */
-	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
-		ret = -EBUSY;
-		goto err;
-	}
+		return -EINVAL;
 
 	/* Another CPU that shares the OPP table has set the regulators ? */
 	if (opp_table->regulators)
-		return opp_table;
+		return 0;
 
 	opp_table->regulators = kmalloc_array(count,
 					      sizeof(*opp_table->regulators),
 					      GFP_KERNEL);
-	if (!opp_table->regulators) {
-		ret = -ENOMEM;
-		goto err;
-	}
+	if (!opp_table->regulators)
+		return -ENOMEM;
 
 	for (i = 0; i < count; i++) {
 		reg = regulator_get_optional(dev, names[i]);
@@ -2174,7 +2161,7 @@ struct opp_table *dev_pm_opp_set_regulators(struct device *dev,
 	}
 	mutex_unlock(&opp_table->lock);
 
-	return opp_table;
+	return 0;
 
 free_regulators:
 	while (i != 0)
@@ -2183,26 +2170,20 @@ free_regulators:
 	kfree(opp_table->regulators);
 	opp_table->regulators = NULL;
 	opp_table->regulator_count = -1;
-err:
-	dev_pm_opp_put_opp_table(opp_table);
 
-	return ERR_PTR(ret);
+	return ret;
 }
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulators);
 
 /**
- * dev_pm_opp_put_regulators() - Releases resources blocked for regulator
- * @opp_table: OPP table returned from dev_pm_opp_set_regulators().
+ * _opp_put_regulators() - Releases resources blocked for regulator
+ * @opp_table: OPP table returned from _opp_set_regulators().
  */
-void dev_pm_opp_put_regulators(struct opp_table *opp_table)
+static void _opp_put_regulators(struct opp_table *opp_table)
 {
 	int i;
 
-	if (unlikely(!opp_table))
-		return;
-
 	if (!opp_table->regulators)
-		goto put_opp_table;
+		return;
 
 	if (opp_table->enabled) {
 		for (i = opp_table->regulator_count - 1; i >= 0; i--)
@@ -2225,40 +2206,7 @@ void dev_pm_opp_put_regulators(struct opp_table *opp_table)
 	kfree(opp_table->regulators);
 	opp_table->regulators = NULL;
 	opp_table->regulator_count = -1;
-
-put_opp_table:
-	dev_pm_opp_put_opp_table(opp_table);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulators);
-
-static void devm_pm_opp_regulators_release(void *data)
-{
-	dev_pm_opp_put_regulators(data);
-}
-
-/**
- * devm_pm_opp_set_regulators() - Set regulator names for the device
- * @dev: Device for which regulator name is being set.
- * @names: Array of pointers to the names of the regulator.
- * @count: Number of regulators.
- *
- * This is a resource-managed variant of dev_pm_opp_set_regulators().
- *
- * Return: 0 on success and errorno otherwise.
- */
-int devm_pm_opp_set_regulators(struct device *dev,
-			       const char * const names[])
-{
-	struct opp_table *opp_table;
-
-	opp_table = dev_pm_opp_set_regulators(dev, names);
-	if (IS_ERR(opp_table))
-		return PTR_ERR(opp_table);
-
-	return devm_add_action_or_reset(dev, devm_pm_opp_regulators_release,
-					opp_table);
 }
-EXPORT_SYMBOL_GPL(devm_pm_opp_set_regulators);
 
 /**
  * dev_pm_opp_set_clkname() - Set clk name for the device
@@ -2633,7 +2581,7 @@ static void _opp_clear_config(struct opp_config_data *data)
 	if (data->flags & OPP_CONFIG_GENPD)
 		dev_pm_opp_detach_genpd(data->opp_table);
 	if (data->flags & OPP_CONFIG_REGULATOR)
-		dev_pm_opp_put_regulators(data->opp_table);
+		_opp_put_regulators(data->opp_table);
 	if (data->flags & OPP_CONFIG_SUPPORTED_HW)
 		dev_pm_opp_put_supported_hw(data->opp_table);
 	if (data->flags & OPP_CONFIG_REGULATOR_HELPER)
@@ -2758,11 +2706,10 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 
 	/* Configure supplies */
 	if (config->regulator_names) {
-		err = dev_pm_opp_set_regulators(dev, config->regulator_names);
-		if (IS_ERR(err)) {
-			ret = PTR_ERR(err);
+		ret = _opp_set_regulators(opp_table, dev,
+					  config->regulator_names);
+		if (ret)
 			goto err;
-		}
 
 		data->flags |= OPP_CONFIG_REGULATOR;
 	}
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index a08f9481efb3..f014bd172c99 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -189,9 +189,6 @@ void dev_pm_opp_put_supported_hw(struct opp_table *opp_table);
 int devm_pm_opp_set_supported_hw(struct device *dev, const u32 *versions, unsigned int count);
 struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name);
 void dev_pm_opp_put_prop_name(struct opp_table *opp_table);
-struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[]);
-void dev_pm_opp_put_regulators(struct opp_table *opp_table);
-int devm_pm_opp_set_regulators(struct device *dev, const char * const names[]);
 struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name);
 void dev_pm_opp_put_clkname(struct opp_table *opp_table);
 int devm_pm_opp_set_clkname(struct device *dev, const char *name);
@@ -409,19 +406,6 @@ static inline struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, con
 
 static inline void dev_pm_opp_put_prop_name(struct opp_table *opp_table) {}
 
-static inline struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[])
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
-static inline void dev_pm_opp_put_regulators(struct opp_table *opp_table) {}
-
-static inline int devm_pm_opp_set_regulators(struct device *dev,
-					     const char * const names[])
-{
-	return -EOPNOTSUPP;
-}
-
 static inline struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name)
 {
 	return ERR_PTR(-EOPNOTSUPP);
@@ -606,4 +590,32 @@ static inline int dev_pm_opp_of_find_icc_paths(struct device *dev, struct opp_ta
 }
 #endif
 
+/* OPP Configuration helpers */
+
+/* Regulators helpers */
+static inline int dev_pm_opp_set_regulators(struct device *dev,
+					    const char * const names[])
+{
+	struct dev_pm_opp_config config = {
+		.regulator_names = names,
+	};
+
+	return dev_pm_opp_set_config(dev, &config);
+}
+
+static inline void dev_pm_opp_put_regulators(int token)
+{
+	dev_pm_opp_clear_config(token);
+}
+
+static inline int devm_pm_opp_set_regulators(struct device *dev,
+					     const char * const names[])
+{
+	struct dev_pm_opp_config config = {
+		.regulator_names = names,
+	};
+
+	return devm_pm_opp_set_config(dev, &config);
+}
+
 #endif		/* __LINUX_OPP_H__ */
-- 
cgit 


From 89f03984fa2abface1ffb1fe050b7c175651ffc7 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 26 May 2022 09:36:27 +0530
Subject: OPP: Migrate set-supported-hw API to use set-config helpers

Now that we have a central API to handle all OPP table configurations,
migrate the set-supported-hw family of helpers to use the new
infrastructure.

The return type and parameter to the APIs change a bit due to this,
update the current users as well in the same commit in order to avoid
breaking builds.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/imx-cpufreq-dt.c    | 12 ++---
 drivers/cpufreq/tegra20-cpufreq.c   | 12 +++--
 drivers/memory/tegra/tegra124-emc.c | 11 +++--
 drivers/opp/core.c                  | 87 +++++++++----------------------------
 include/linux/pm_opp.h              | 49 +++++++++++++--------
 5 files changed, 66 insertions(+), 105 deletions(-)

(limited to 'include')

diff --git a/drivers/cpufreq/imx-cpufreq-dt.c b/drivers/cpufreq/imx-cpufreq-dt.c
index 3fe9125156b4..76e553af2071 100644
--- a/drivers/cpufreq/imx-cpufreq-dt.c
+++ b/drivers/cpufreq/imx-cpufreq-dt.c
@@ -31,8 +31,8 @@
 
 /* cpufreq-dt device registered by imx-cpufreq-dt */
 static struct platform_device *cpufreq_dt_pdev;
-static struct opp_table *cpufreq_opp_table;
 static struct device *cpu_dev;
+static int cpufreq_opp_token;
 
 enum IMX7ULP_CPUFREQ_CLKS {
 	ARM,
@@ -153,9 +153,9 @@ static int imx_cpufreq_dt_probe(struct platform_device *pdev)
 	dev_info(&pdev->dev, "cpu speed grade %d mkt segment %d supported-hw %#x %#x\n",
 			speed_grade, mkt_segment, supported_hw[0], supported_hw[1]);
 
-	cpufreq_opp_table = dev_pm_opp_set_supported_hw(cpu_dev, supported_hw, 2);
-	if (IS_ERR(cpufreq_opp_table)) {
-		ret = PTR_ERR(cpufreq_opp_table);
+	cpufreq_opp_token = dev_pm_opp_set_supported_hw(cpu_dev, supported_hw, 2);
+	if (cpufreq_opp_token < 0) {
+		ret = cpufreq_opp_token;
 		dev_err(&pdev->dev, "Failed to set supported opp: %d\n", ret);
 		return ret;
 	}
@@ -163,7 +163,7 @@ static int imx_cpufreq_dt_probe(struct platform_device *pdev)
 	cpufreq_dt_pdev = platform_device_register_data(
 			&pdev->dev, "cpufreq-dt", -1, NULL, 0);
 	if (IS_ERR(cpufreq_dt_pdev)) {
-		dev_pm_opp_put_supported_hw(cpufreq_opp_table);
+		dev_pm_opp_put_supported_hw(cpufreq_opp_token);
 		ret = PTR_ERR(cpufreq_dt_pdev);
 		dev_err(&pdev->dev, "Failed to register cpufreq-dt: %d\n", ret);
 		return ret;
@@ -176,7 +176,7 @@ static int imx_cpufreq_dt_remove(struct platform_device *pdev)
 {
 	platform_device_unregister(cpufreq_dt_pdev);
 	if (!of_machine_is_compatible("fsl,imx7ulp"))
-		dev_pm_opp_put_supported_hw(cpufreq_opp_table);
+		dev_pm_opp_put_supported_hw(cpufreq_opp_token);
 	else
 		clk_bulk_put(ARRAY_SIZE(imx7ulp_clks), imx7ulp_clks);
 
diff --git a/drivers/cpufreq/tegra20-cpufreq.c b/drivers/cpufreq/tegra20-cpufreq.c
index e8db3d75be25..ab7ac7df9e62 100644
--- a/drivers/cpufreq/tegra20-cpufreq.c
+++ b/drivers/cpufreq/tegra20-cpufreq.c
@@ -32,9 +32,9 @@ static bool cpu0_node_has_opp_v2_prop(void)
 	return ret;
 }
 
-static void tegra20_cpufreq_put_supported_hw(void *opp_table)
+static void tegra20_cpufreq_put_supported_hw(void *opp_token)
 {
-	dev_pm_opp_put_supported_hw(opp_table);
+	dev_pm_opp_put_supported_hw((unsigned long) opp_token);
 }
 
 static void tegra20_cpufreq_dt_unregister(void *cpufreq_dt)
@@ -45,7 +45,6 @@ static void tegra20_cpufreq_dt_unregister(void *cpufreq_dt)
 static int tegra20_cpufreq_probe(struct platform_device *pdev)
 {
 	struct platform_device *cpufreq_dt;
-	struct opp_table *opp_table;
 	struct device *cpu_dev;
 	u32 versions[2];
 	int err;
@@ -71,16 +70,15 @@ static int tegra20_cpufreq_probe(struct platform_device *pdev)
 	if (WARN_ON(!cpu_dev))
 		return -ENODEV;
 
-	opp_table = dev_pm_opp_set_supported_hw(cpu_dev, versions, 2);
-	err = PTR_ERR_OR_ZERO(opp_table);
-	if (err) {
+	err = dev_pm_opp_set_supported_hw(cpu_dev, versions, 2);
+	if (err < 0) {
 		dev_err(&pdev->dev, "failed to set supported hw: %d\n", err);
 		return err;
 	}
 
 	err = devm_add_action_or_reset(&pdev->dev,
 				       tegra20_cpufreq_put_supported_hw,
-				       opp_table);
+				       (void *)((unsigned long) err));
 	if (err)
 		return err;
 
diff --git a/drivers/memory/tegra/tegra124-emc.c b/drivers/memory/tegra/tegra124-emc.c
index 908f8d5392b2..85bc936c02f9 100644
--- a/drivers/memory/tegra/tegra124-emc.c
+++ b/drivers/memory/tegra/tegra124-emc.c
@@ -1395,15 +1395,14 @@ err_msg:
 static int tegra_emc_opp_table_init(struct tegra_emc *emc)
 {
 	u32 hw_version = BIT(tegra_sku_info.soc_speedo_id);
-	struct opp_table *hw_opp_table;
-	int err;
+	int opp_token, err;
 
-	hw_opp_table = dev_pm_opp_set_supported_hw(emc->dev, &hw_version, 1);
-	err = PTR_ERR_OR_ZERO(hw_opp_table);
-	if (err) {
+	err = dev_pm_opp_set_supported_hw(emc->dev, &hw_version, 1);
+	if (err < 0) {
 		dev_err(emc->dev, "failed to set OPP supported HW: %d\n", err);
 		return err;
 	}
+	opp_token = err;
 
 	err = dev_pm_opp_of_add_table(emc->dev);
 	if (err) {
@@ -1430,7 +1429,7 @@ static int tegra_emc_opp_table_init(struct tegra_emc *emc)
 remove_table:
 	dev_pm_opp_of_remove_table(emc->dev);
 put_hw_table:
-	dev_pm_opp_put_supported_hw(hw_opp_table);
+	dev_pm_opp_put_supported_hw(opp_token);
 
 	return err;
 }
diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 6ff9b5b69d07..8dbdfff38973 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -1952,7 +1952,7 @@ free_opp:
 }
 
 /**
- * dev_pm_opp_set_supported_hw() - Set supported platforms
+ * _opp_set_supported_hw() - Set supported platforms
  * @dev: Device for which supported-hw has to be set.
  * @versions: Array of hierarchy of versions to match.
  * @count: Number of elements in the array.
@@ -1962,84 +1962,39 @@ free_opp:
  * OPPs, which are available for those versions, based on its 'opp-supported-hw'
  * property.
  */
-struct opp_table *dev_pm_opp_set_supported_hw(struct device *dev,
-			const u32 *versions, unsigned int count)
+static int _opp_set_supported_hw(struct opp_table *opp_table,
+				 const u32 *versions, unsigned int count)
 {
-	struct opp_table *opp_table;
-
-	opp_table = _add_opp_table(dev, false);
-	if (IS_ERR(opp_table))
-		return opp_table;
-
-	/* Make sure there are no concurrent readers while updating opp_table */
-	WARN_ON(!list_empty(&opp_table->opp_list));
-
 	/* Another CPU that shares the OPP table has set the property ? */
 	if (opp_table->supported_hw)
-		return opp_table;
+		return 0;
 
 	opp_table->supported_hw = kmemdup(versions, count * sizeof(*versions),
 					GFP_KERNEL);
-	if (!opp_table->supported_hw) {
-		dev_pm_opp_put_opp_table(opp_table);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (!opp_table->supported_hw)
+		return -ENOMEM;
 
 	opp_table->supported_hw_count = count;
 
-	return opp_table;
+	return 0;
 }
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_supported_hw);
 
 /**
- * dev_pm_opp_put_supported_hw() - Releases resources blocked for supported hw
- * @opp_table: OPP table returned by dev_pm_opp_set_supported_hw().
+ * _opp_put_supported_hw() - Releases resources blocked for supported hw
+ * @opp_table: OPP table returned by _opp_set_supported_hw().
  *
  * This is required only for the V2 bindings, and is called for a matching
- * dev_pm_opp_set_supported_hw(). Until this is called, the opp_table structure
+ * _opp_set_supported_hw(). Until this is called, the opp_table structure
  * will not be freed.
  */
-void dev_pm_opp_put_supported_hw(struct opp_table *opp_table)
+static void _opp_put_supported_hw(struct opp_table *opp_table)
 {
-	if (unlikely(!opp_table))
-		return;
-
-	kfree(opp_table->supported_hw);
-	opp_table->supported_hw = NULL;
-	opp_table->supported_hw_count = 0;
-
-	dev_pm_opp_put_opp_table(opp_table);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_put_supported_hw);
-
-static void devm_pm_opp_supported_hw_release(void *data)
-{
-	dev_pm_opp_put_supported_hw(data);
-}
-
-/**
- * devm_pm_opp_set_supported_hw() - Set supported platforms
- * @dev: Device for which supported-hw has to be set.
- * @versions: Array of hierarchy of versions to match.
- * @count: Number of elements in the array.
- *
- * This is a resource-managed variant of dev_pm_opp_set_supported_hw().
- *
- * Return: 0 on success and errorno otherwise.
- */
-int devm_pm_opp_set_supported_hw(struct device *dev, const u32 *versions,
-				 unsigned int count)
-{
-	struct opp_table *opp_table;
-
-	opp_table = dev_pm_opp_set_supported_hw(dev, versions, count);
-	if (IS_ERR(opp_table))
-		return PTR_ERR(opp_table);
-
-	return devm_add_action_or_reset(dev, devm_pm_opp_supported_hw_release,
-					opp_table);
+	if (opp_table->supported_hw) {
+		kfree(opp_table->supported_hw);
+		opp_table->supported_hw = NULL;
+		opp_table->supported_hw_count = 0;
+	}
 }
-EXPORT_SYMBOL_GPL(devm_pm_opp_set_supported_hw);
 
 /**
  * dev_pm_opp_set_prop_name() - Set prop-extn name
@@ -2583,7 +2538,7 @@ static void _opp_clear_config(struct opp_config_data *data)
 	if (data->flags & OPP_CONFIG_REGULATOR)
 		_opp_put_regulators(data->opp_table);
 	if (data->flags & OPP_CONFIG_SUPPORTED_HW)
-		dev_pm_opp_put_supported_hw(data->opp_table);
+		_opp_put_supported_hw(data->opp_table);
 	if (data->flags & OPP_CONFIG_REGULATOR_HELPER)
 		dev_pm_opp_unregister_set_opp_helper(data->opp_table);
 	if (data->flags & OPP_CONFIG_PROP_NAME)
@@ -2694,12 +2649,10 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 
 	/* Configure supported hardware */
 	if (config->supported_hw) {
-		err = dev_pm_opp_set_supported_hw(dev, config->supported_hw,
-						  config->supported_hw_count);
-		if (IS_ERR(err)) {
-			ret = PTR_ERR(err);
+		ret = _opp_set_supported_hw(opp_table, config->supported_hw,
+					    config->supported_hw_count);
+		if (ret)
 			goto err;
-		}
 
 		data->flags |= OPP_CONFIG_SUPPORTED_HW;
 	}
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index f014bd172c99..94d0101c254c 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -184,9 +184,6 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config);
 int devm_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config);
 void dev_pm_opp_clear_config(int token);
 
-struct opp_table *dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions, unsigned int count);
-void dev_pm_opp_put_supported_hw(struct opp_table *opp_table);
-int devm_pm_opp_set_supported_hw(struct device *dev, const u32 *versions, unsigned int count);
 struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name);
 void dev_pm_opp_put_prop_name(struct opp_table *opp_table);
 struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name);
@@ -369,22 +366,6 @@ static inline int dev_pm_opp_unregister_notifier(struct device *dev, struct noti
 	return -EOPNOTSUPP;
 }
 
-static inline struct opp_table *dev_pm_opp_set_supported_hw(struct device *dev,
-							    const u32 *versions,
-							    unsigned int count)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
-static inline void dev_pm_opp_put_supported_hw(struct opp_table *opp_table) {}
-
-static inline int devm_pm_opp_set_supported_hw(struct device *dev,
-					       const u32 *versions,
-					       unsigned int count)
-{
-	return -EOPNOTSUPP;
-}
-
 static inline struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev,
 			int (*set_opp)(struct dev_pm_set_opp_data *data))
 {
@@ -618,4 +599,34 @@ static inline int devm_pm_opp_set_regulators(struct device *dev,
 	return devm_pm_opp_set_config(dev, &config);
 }
 
+/* Supported-hw helpers */
+static inline int dev_pm_opp_set_supported_hw(struct device *dev,
+					      const u32 *versions,
+					      unsigned int count)
+{
+	struct dev_pm_opp_config config = {
+		.supported_hw = versions,
+		.supported_hw_count = count,
+	};
+
+	return dev_pm_opp_set_config(dev, &config);
+}
+
+static inline void dev_pm_opp_put_supported_hw(int token)
+{
+	dev_pm_opp_clear_config(token);
+}
+
+static inline int devm_pm_opp_set_supported_hw(struct device *dev,
+					       const u32 *versions,
+					       unsigned int count)
+{
+	struct dev_pm_opp_config config = {
+		.supported_hw = versions,
+		.supported_hw_count = count,
+	};
+
+	return devm_pm_opp_set_config(dev, &config);
+}
+
 #endif		/* __LINUX_OPP_H__ */
-- 
cgit 


From 2368f57685768f9f9cd666eaa4194a359d89afb8 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 26 May 2022 09:36:27 +0530
Subject: OPP: Migrate set-clk-name API to use set-config helpers

Now that we have a central API to handle all OPP table configurations,
migrate the set-clk-name family of helpers to use the new
infrastructure.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 142 +++++++++++++++----------------------------------
 include/linux/pm_opp.h |  41 ++++++++------
 2 files changed, 69 insertions(+), 114 deletions(-)

(limited to 'include')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 8dbdfff38973..0a82ca7ae453 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -2164,104 +2164,71 @@ static void _opp_put_regulators(struct opp_table *opp_table)
 }
 
 /**
- * dev_pm_opp_set_clkname() - Set clk name for the device
- * @dev: Device for which clk name is being set.
- * @name: Clk name.
- *
- * In order to support OPP switching, OPP layer needs to get pointer to the
- * clock for the device. Simple cases work fine without using this routine (i.e.
- * by passing connection-id as NULL), but for a device with multiple clocks
- * available, the OPP core needs to know the exact name of the clk to use.
+ * _opp_set_clknames() - Set clk names for the device
+ * @dev: Device for which clk names is being set.
+ * @names: Clk names.
+ *
+ * In order to support OPP switching, OPP layer needs to get pointers to the
+ * clocks for the device. Simple cases work fine without using this routine
+ * (i.e. by passing connection-id as NULL), but for a device with multiple
+ * clocks available, the OPP core needs to know the exact names of the clks to
+ * use.
  *
  * This must be called before any OPPs are initialized for the device.
  */
-struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name)
+static int _opp_set_clknames(struct opp_table *opp_table, struct device *dev,
+			     const char * const names[])
 {
-	struct opp_table *opp_table;
-	int ret;
+	const char * const *temp = names;
+	int count = 0;
 
-	opp_table = _add_opp_table(dev, false);
-	if (IS_ERR(opp_table))
-		return opp_table;
+	/* Count number of clks */
+	while (*temp++)
+		count++;
 
-	/* This should be called before OPPs are initialized */
-	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
-		ret = -EBUSY;
-		goto err;
-	}
+	/*
+	 * This is a special case where we have a single clock, whose connection
+	 * id name is NULL, i.e. first two entries are NULL in the array.
+	 */
+	if (!count && !names[1])
+		count = 1;
+
+	/* We support only one clock name for now */
+	if (count != 1)
+		return -EINVAL;
 
 	/* Another CPU that shares the OPP table has set the clkname ? */
 	if (opp_table->clk_configured)
-		return opp_table;
+		return 0;
 
 	/* clk shouldn't be initialized at this point */
-	if (WARN_ON(opp_table->clk)) {
-		ret = -EBUSY;
-		goto err;
-	}
+	if (WARN_ON(opp_table->clk))
+		return -EBUSY;
 
 	/* Find clk for the device */
-	opp_table->clk = clk_get(dev, name);
+	opp_table->clk = clk_get(dev, names[0]);
 	if (IS_ERR(opp_table->clk)) {
-		ret = dev_err_probe(dev, PTR_ERR(opp_table->clk),
+		return dev_err_probe(dev, PTR_ERR(opp_table->clk),
 				    "%s: Couldn't find clock\n", __func__);
-		goto err;
 	}
 
 	opp_table->clk_configured = true;
 
-	return opp_table;
-
-err:
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ERR_PTR(ret);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_clkname);
-
-/**
- * dev_pm_opp_put_clkname() - Releases resources blocked for clk.
- * @opp_table: OPP table returned from dev_pm_opp_set_clkname().
- */
-void dev_pm_opp_put_clkname(struct opp_table *opp_table)
-{
-	if (unlikely(!opp_table))
-		return;
-
-	clk_put(opp_table->clk);
-	opp_table->clk = ERR_PTR(-EINVAL);
-	opp_table->clk_configured = false;
-
-	dev_pm_opp_put_opp_table(opp_table);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_put_clkname);
-
-static void devm_pm_opp_clkname_release(void *data)
-{
-	dev_pm_opp_put_clkname(data);
+	return 0;
 }
 
 /**
- * devm_pm_opp_set_clkname() - Set clk name for the device
- * @dev: Device for which clk name is being set.
- * @name: Clk name.
- *
- * This is a resource-managed variant of dev_pm_opp_set_clkname().
- *
- * Return: 0 on success and errorno otherwise.
+ * _opp_put_clknames() - Releases resources blocked for clks.
+ * @opp_table: OPP table returned from _opp_set_clknames().
  */
-int devm_pm_opp_set_clkname(struct device *dev, const char *name)
+static void _opp_put_clknames(struct opp_table *opp_table)
 {
-	struct opp_table *opp_table;
-
-	opp_table = dev_pm_opp_set_clkname(dev, name);
-	if (IS_ERR(opp_table))
-		return PTR_ERR(opp_table);
-
-	return devm_add_action_or_reset(dev, devm_pm_opp_clkname_release,
-					opp_table);
+	if (opp_table->clk_configured) {
+		clk_put(opp_table->clk);
+		opp_table->clk = ERR_PTR(-EINVAL);
+		opp_table->clk_configured = false;
+	}
 }
-EXPORT_SYMBOL_GPL(devm_pm_opp_set_clkname);
 
 /**
  * dev_pm_opp_register_set_opp_helper() - Register custom set OPP helper
@@ -2544,7 +2511,7 @@ static void _opp_clear_config(struct opp_config_data *data)
 	if (data->flags & OPP_CONFIG_PROP_NAME)
 		dev_pm_opp_put_prop_name(data->opp_table);
 	if (data->flags & OPP_CONFIG_CLK)
-		dev_pm_opp_put_clkname(data->opp_table);
+		_opp_put_clknames(data->opp_table);
 
 	dev_pm_opp_put_opp_table(data->opp_table);
 	kfree(data);
@@ -2595,32 +2562,9 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 
 	/* Configure clocks */
 	if (config->clk_names) {
-		const char * const *temp = config->clk_names;
-		int count = 0;
-
-		/* Count number of clks */
-		while (*temp++)
-			count++;
-
-		/*
-		 * This is a special case where we have a single clock, whose
-		 * connection id name is NULL, i.e. first two entries are NULL
-		 * in the array.
-		 */
-		if (!count && !config->clk_names[1])
-			count = 1;
-
-		/* We support only one clock name for now */
-		if (count != 1) {
-			ret = -EINVAL;
-			goto err;
-		}
-
-		err = dev_pm_opp_set_clkname(dev, config->clk_names[0]);
-		if (IS_ERR(err)) {
-			ret = PTR_ERR(err);
+		ret = _opp_set_clknames(opp_table, dev, config->clk_names);
+		if (ret)
 			goto err;
-		}
 
 		data->flags |= OPP_CONFIG_CLK;
 	}
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 94d0101c254c..ed1906bbe8bb 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -186,9 +186,6 @@ void dev_pm_opp_clear_config(int token);
 
 struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name);
 void dev_pm_opp_put_prop_name(struct opp_table *opp_table);
-struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name);
-void dev_pm_opp_put_clkname(struct opp_table *opp_table);
-int devm_pm_opp_set_clkname(struct device *dev, const char *name);
 struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
 void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table);
 int devm_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
@@ -387,18 +384,6 @@ static inline struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, con
 
 static inline void dev_pm_opp_put_prop_name(struct opp_table *opp_table) {}
 
-static inline struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
-static inline void dev_pm_opp_put_clkname(struct opp_table *opp_table) {}
-
-static inline int devm_pm_opp_set_clkname(struct device *dev, const char *name)
-{
-	return -EOPNOTSUPP;
-}
-
 static inline struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char * const *names, struct device ***virt_devs)
 {
 	return ERR_PTR(-EOPNOTSUPP);
@@ -629,4 +614,30 @@ static inline int devm_pm_opp_set_supported_hw(struct device *dev,
 	return devm_pm_opp_set_config(dev, &config);
 }
 
+/* clkname helpers */
+static inline int dev_pm_opp_set_clkname(struct device *dev, const char *name)
+{
+	const char *names[] = { name, NULL };
+	struct dev_pm_opp_config config = {
+		.clk_names = names,
+	};
+
+	return dev_pm_opp_set_config(dev, &config);
+}
+
+static inline void dev_pm_opp_put_clkname(int token)
+{
+	dev_pm_opp_clear_config(token);
+}
+
+static inline int devm_pm_opp_set_clkname(struct device *dev, const char *name)
+{
+	const char *names[] = { name, NULL };
+	struct dev_pm_opp_config config = {
+		.clk_names = names,
+	};
+
+	return devm_pm_opp_set_config(dev, &config);
+}
+
 #endif		/* __LINUX_OPP_H__ */
-- 
cgit 


From 3c543b42a6df35feb3db6689e512fa167739451e Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 26 May 2022 09:36:27 +0530
Subject: OPP: Migrate set-opp-helper API to use set-config helpers

Now that we have a central API to handle all OPP table configurations,
migrate the set-opp-helper family of helpers to use the new
infrastructure.

The return type and parameter to the APIs change a bit due to this,
update the current users as well in the same commit in order to avoid
breaking builds.

Remove devm_pm_opp_register_set_opp_helper() as it has no users
currently.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c          | 89 ++++++++++-----------------------------------
 drivers/opp/ti-opp-supply.c |  6 +--
 include/linux/pm_opp.h      | 33 ++++++++---------
 3 files changed, 39 insertions(+), 89 deletions(-)

(limited to 'include')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 0a82ca7ae453..9da7dcf62cab 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -2231,7 +2231,7 @@ static void _opp_put_clknames(struct opp_table *opp_table)
 }
 
 /**
- * dev_pm_opp_register_set_opp_helper() - Register custom set OPP helper
+ * _opp_register_set_opp_helper() - Register custom set OPP helper
  * @dev: Device for which the helper is getting registered.
  * @set_opp: Custom set OPP helper.
  *
@@ -2240,32 +2240,18 @@ static void _opp_put_clknames(struct opp_table *opp_table)
  *
  * This must be called before any OPPs are initialized for the device.
  */
-struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev,
-			int (*set_opp)(struct dev_pm_set_opp_data *data))
+static int _opp_register_set_opp_helper(struct opp_table *opp_table,
+	struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data))
 {
 	struct dev_pm_set_opp_data *data;
-	struct opp_table *opp_table;
-
-	if (!set_opp)
-		return ERR_PTR(-EINVAL);
-
-	opp_table = _add_opp_table(dev, false);
-	if (IS_ERR(opp_table))
-		return opp_table;
-
-	/* This should be called before OPPs are initialized */
-	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
-		dev_pm_opp_put_opp_table(opp_table);
-		return ERR_PTR(-EBUSY);
-	}
 
 	/* Another CPU that shares the OPP table has set the helper ? */
 	if (opp_table->set_opp)
-		return opp_table;
+		return 0;
 
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
 	if (!data)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
 	mutex_lock(&opp_table->lock);
 	opp_table->set_opp_data = data;
@@ -2278,60 +2264,26 @@ struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev,
 
 	opp_table->set_opp = set_opp;
 
-	return opp_table;
+	return 0;
 }
-EXPORT_SYMBOL_GPL(dev_pm_opp_register_set_opp_helper);
 
 /**
- * dev_pm_opp_unregister_set_opp_helper() - Releases resources blocked for
- *					   set_opp helper
- * @opp_table: OPP table returned from dev_pm_opp_register_set_opp_helper().
+ * _opp_unregister_set_opp_helper() - Releases resources blocked for set_opp helper
+ * @opp_table: OPP table returned from _opp_register_set_opp_helper().
  *
  * Release resources blocked for platform specific set_opp helper.
  */
-void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table)
+static void _opp_unregister_set_opp_helper(struct opp_table *opp_table)
 {
-	if (unlikely(!opp_table))
-		return;
-
-	opp_table->set_opp = NULL;
-
-	mutex_lock(&opp_table->lock);
-	kfree(opp_table->set_opp_data);
-	opp_table->set_opp_data = NULL;
-	mutex_unlock(&opp_table->lock);
-
-	dev_pm_opp_put_opp_table(opp_table);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_unregister_set_opp_helper);
-
-static void devm_pm_opp_unregister_set_opp_helper(void *data)
-{
-	dev_pm_opp_unregister_set_opp_helper(data);
-}
-
-/**
- * devm_pm_opp_register_set_opp_helper() - Register custom set OPP helper
- * @dev: Device for which the helper is getting registered.
- * @set_opp: Custom set OPP helper.
- *
- * This is a resource-managed version of dev_pm_opp_register_set_opp_helper().
- *
- * Return: 0 on success and errorno otherwise.
- */
-int devm_pm_opp_register_set_opp_helper(struct device *dev,
-					int (*set_opp)(struct dev_pm_set_opp_data *data))
-{
-	struct opp_table *opp_table;
-
-	opp_table = dev_pm_opp_register_set_opp_helper(dev, set_opp);
-	if (IS_ERR(opp_table))
-		return PTR_ERR(opp_table);
+	if (opp_table->set_opp) {
+		opp_table->set_opp = NULL;
 
-	return devm_add_action_or_reset(dev, devm_pm_opp_unregister_set_opp_helper,
-					opp_table);
+		mutex_lock(&opp_table->lock);
+		kfree(opp_table->set_opp_data);
+		opp_table->set_opp_data = NULL;
+		mutex_unlock(&opp_table->lock);
+	}
 }
-EXPORT_SYMBOL_GPL(devm_pm_opp_register_set_opp_helper);
 
 static void _opp_detach_genpd(struct opp_table *opp_table)
 {
@@ -2507,7 +2459,7 @@ static void _opp_clear_config(struct opp_config_data *data)
 	if (data->flags & OPP_CONFIG_SUPPORTED_HW)
 		_opp_put_supported_hw(data->opp_table);
 	if (data->flags & OPP_CONFIG_REGULATOR_HELPER)
-		dev_pm_opp_unregister_set_opp_helper(data->opp_table);
+		_opp_unregister_set_opp_helper(data->opp_table);
 	if (data->flags & OPP_CONFIG_PROP_NAME)
 		dev_pm_opp_put_prop_name(data->opp_table);
 	if (data->flags & OPP_CONFIG_CLK)
@@ -2582,11 +2534,10 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 
 	/* Configure opp helper */
 	if (config->set_opp) {
-		err = dev_pm_opp_register_set_opp_helper(dev, config->set_opp);
-		if (IS_ERR(err)) {
-			ret = PTR_ERR(err);
+		ret = _opp_register_set_opp_helper(opp_table, dev,
+						   config->set_opp);
+		if (ret)
 			goto err;
-		}
 
 		data->flags |= OPP_CONFIG_REGULATOR_HELPER;
 	}
diff --git a/drivers/opp/ti-opp-supply.c b/drivers/opp/ti-opp-supply.c
index bd4771f388ab..40ebc9ac82dd 100644
--- a/drivers/opp/ti-opp-supply.c
+++ b/drivers/opp/ti-opp-supply.c
@@ -405,9 +405,9 @@ static int ti_opp_supply_probe(struct platform_device *pdev)
 			return ret;
 	}
 
-	ret = PTR_ERR_OR_ZERO(dev_pm_opp_register_set_opp_helper(cpu_dev,
-								 ti_opp_supply_set_opp));
-	if (ret)
+	ret = dev_pm_opp_register_set_opp_helper(cpu_dev,
+						 ti_opp_supply_set_opp);
+	if (ret < 0)
 		_free_optimized_voltages(dev, &opp_data);
 
 	return ret;
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index ed1906bbe8bb..85a4b7353979 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -186,9 +186,6 @@ void dev_pm_opp_clear_config(int token);
 
 struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name);
 void dev_pm_opp_put_prop_name(struct opp_table *opp_table);
-struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
-void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table);
-int devm_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
 struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char * const *names, struct device ***virt_devs);
 void dev_pm_opp_detach_genpd(struct opp_table *opp_table);
 int devm_pm_opp_attach_genpd(struct device *dev, const char * const *names, struct device ***virt_devs);
@@ -363,20 +360,6 @@ static inline int dev_pm_opp_unregister_notifier(struct device *dev, struct noti
 	return -EOPNOTSUPP;
 }
 
-static inline struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev,
-			int (*set_opp)(struct dev_pm_set_opp_data *data))
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
-static inline void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table) {}
-
-static inline int devm_pm_opp_register_set_opp_helper(struct device *dev,
-				    int (*set_opp)(struct dev_pm_set_opp_data *data))
-{
-	return -EOPNOTSUPP;
-}
-
 static inline struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name)
 {
 	return ERR_PTR(-EOPNOTSUPP);
@@ -640,4 +623,20 @@ static inline int devm_pm_opp_set_clkname(struct device *dev, const char *name)
 	return devm_pm_opp_set_config(dev, &config);
 }
 
+/* set-opp helpers */
+static inline int dev_pm_opp_register_set_opp_helper(struct device *dev,
+			int (*set_opp)(struct dev_pm_set_opp_data *data))
+{
+	struct dev_pm_opp_config config = {
+		.set_opp = set_opp,
+	};
+
+	return dev_pm_opp_set_config(dev, &config);
+}
+
+static inline void dev_pm_opp_unregister_set_opp_helper(int token)
+{
+	dev_pm_opp_clear_config(token);
+}
+
 #endif		/* __LINUX_OPP_H__ */
-- 
cgit 


From 442e7a1786e628b38175314ec1805966b8d0c02c Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 26 May 2022 09:36:27 +0530
Subject: OPP: Migrate attach-genpd API to use set-config helpers

Now that we have a central API to handle all OPP table configurations,
migrate the attach-genpd family of helpers to use the new
infrastructure.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 85 +++++++++++---------------------------------------
 include/linux/pm_opp.h | 48 ++++++++++++++++++----------
 2 files changed, 49 insertions(+), 84 deletions(-)

(limited to 'include')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 9da7dcf62cab..458584994c2b 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -2285,7 +2285,7 @@ static void _opp_unregister_set_opp_helper(struct opp_table *opp_table)
 	}
 }
 
-static void _opp_detach_genpd(struct opp_table *opp_table)
+static void _detach_genpd(struct opp_table *opp_table)
 {
 	int index;
 
@@ -2305,7 +2305,7 @@ static void _opp_detach_genpd(struct opp_table *opp_table)
 }
 
 /**
- * dev_pm_opp_attach_genpd - Attach genpd(s) for the device and save virtual device pointer
+ * _opp_attach_genpd - Attach genpd(s) for the device and save virtual device pointer
  * @dev: Consumer device for which the genpd is getting attached.
  * @names: Null terminated array of pointers containing names of genpd to attach.
  * @virt_devs: Pointer to return the array of virtual devices.
@@ -2326,30 +2326,23 @@ static void _opp_detach_genpd(struct opp_table *opp_table)
  * The order of entries in the names array must match the order in which
  * "required-opps" are added in DT.
  */
-struct opp_table *dev_pm_opp_attach_genpd(struct device *dev,
-		const char * const *names, struct device ***virt_devs)
+static int _opp_attach_genpd(struct opp_table *opp_table, struct device *dev,
+			const char * const *names, struct device ***virt_devs)
 {
-	struct opp_table *opp_table;
 	struct device *virt_dev;
 	int index = 0, ret = -EINVAL;
 	const char * const *name = names;
 
-	opp_table = _add_opp_table(dev, false);
-	if (IS_ERR(opp_table))
-		return opp_table;
-
 	if (opp_table->genpd_virt_devs)
-		return opp_table;
+		return 0;
 
 	/*
 	 * If the genpd's OPP table isn't already initialized, parsing of the
 	 * required-opps fail for dev. We should retry this after genpd's OPP
 	 * table is added.
 	 */
-	if (!opp_table->required_opp_count) {
-		ret = -EPROBE_DEFER;
-		goto put_table;
-	}
+	if (!opp_table->required_opp_count)
+		return -EPROBE_DEFER;
 
 	mutex_lock(&opp_table->genpd_virt_dev_lock);
 
@@ -2382,78 +2375,38 @@ struct opp_table *dev_pm_opp_attach_genpd(struct device *dev,
 		*virt_devs = opp_table->genpd_virt_devs;
 	mutex_unlock(&opp_table->genpd_virt_dev_lock);
 
-	return opp_table;
+	return 0;
 
 err:
-	_opp_detach_genpd(opp_table);
+	_detach_genpd(opp_table);
 unlock:
 	mutex_unlock(&opp_table->genpd_virt_dev_lock);
+	return ret;
 
-put_table:
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ERR_PTR(ret);
 }
-EXPORT_SYMBOL_GPL(dev_pm_opp_attach_genpd);
 
 /**
- * dev_pm_opp_detach_genpd() - Detach genpd(s) from the device.
- * @opp_table: OPP table returned by dev_pm_opp_attach_genpd().
+ * _opp_detach_genpd() - Detach genpd(s) from the device.
+ * @opp_table: OPP table returned by _opp_attach_genpd().
  *
  * This detaches the genpd(s), resets the virtual device pointers, and puts the
  * OPP table.
  */
-void dev_pm_opp_detach_genpd(struct opp_table *opp_table)
+static void _opp_detach_genpd(struct opp_table *opp_table)
 {
-	if (unlikely(!opp_table))
-		return;
-
 	/*
 	 * Acquire genpd_virt_dev_lock to make sure virt_dev isn't getting
 	 * used in parallel.
 	 */
 	mutex_lock(&opp_table->genpd_virt_dev_lock);
-	_opp_detach_genpd(opp_table);
+	_detach_genpd(opp_table);
 	mutex_unlock(&opp_table->genpd_virt_dev_lock);
-
-	dev_pm_opp_put_opp_table(opp_table);
 }
-EXPORT_SYMBOL_GPL(dev_pm_opp_detach_genpd);
-
-static void devm_pm_opp_detach_genpd(void *data)
-{
-	dev_pm_opp_detach_genpd(data);
-}
-
-/**
- * devm_pm_opp_attach_genpd - Attach genpd(s) for the device and save virtual
- *			      device pointer
- * @dev: Consumer device for which the genpd is getting attached.
- * @names: Null terminated array of pointers containing names of genpd to attach.
- * @virt_devs: Pointer to return the array of virtual devices.
- *
- * This is a resource-managed version of dev_pm_opp_attach_genpd().
- *
- * Return: 0 on success and errorno otherwise.
- */
-int devm_pm_opp_attach_genpd(struct device *dev, const char * const *names,
-			     struct device ***virt_devs)
-{
-	struct opp_table *opp_table;
-
-	opp_table = dev_pm_opp_attach_genpd(dev, names, virt_devs);
-	if (IS_ERR(opp_table))
-		return PTR_ERR(opp_table);
-
-	return devm_add_action_or_reset(dev, devm_pm_opp_detach_genpd,
-					opp_table);
-}
-EXPORT_SYMBOL_GPL(devm_pm_opp_attach_genpd);
 
 static void _opp_clear_config(struct opp_config_data *data)
 {
 	if (data->flags & OPP_CONFIG_GENPD)
-		dev_pm_opp_detach_genpd(data->opp_table);
+		_opp_detach_genpd(data->opp_table);
 	if (data->flags & OPP_CONFIG_REGULATOR)
 		_opp_put_regulators(data->opp_table);
 	if (data->flags & OPP_CONFIG_SUPPORTED_HW)
@@ -2564,12 +2517,10 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 
 	/* Attach genpds */
 	if (config->genpd_names) {
-		err = dev_pm_opp_attach_genpd(dev, config->genpd_names,
-					      config->virt_devs);
-		if (IS_ERR(err)) {
-			ret = PTR_ERR(err);
+		ret = _opp_attach_genpd(opp_table, dev, config->genpd_names,
+					config->virt_devs);
+		if (ret)
 			goto err;
-		}
 
 		data->flags |= OPP_CONFIG_GENPD;
 	}
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 85a4b7353979..20e1e5060a8a 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -186,9 +186,6 @@ void dev_pm_opp_clear_config(int token);
 
 struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name);
 void dev_pm_opp_put_prop_name(struct opp_table *opp_table);
-struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char * const *names, struct device ***virt_devs);
-void dev_pm_opp_detach_genpd(struct opp_table *opp_table);
-int devm_pm_opp_attach_genpd(struct device *dev, const char * const *names, struct device ***virt_devs);
 struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table, struct opp_table *dst_table, struct dev_pm_opp *src_opp);
 int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate);
 int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq);
@@ -367,20 +364,6 @@ static inline struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, con
 
 static inline void dev_pm_opp_put_prop_name(struct opp_table *opp_table) {}
 
-static inline struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char * const *names, struct device ***virt_devs)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
-static inline void dev_pm_opp_detach_genpd(struct opp_table *opp_table) {}
-
-static inline int devm_pm_opp_attach_genpd(struct device *dev,
-					   const char * const *names,
-					   struct device ***virt_devs)
-{
-	return -EOPNOTSUPP;
-}
-
 static inline int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 {
 	return -EOPNOTSUPP;
@@ -639,4 +622,35 @@ static inline void dev_pm_opp_unregister_set_opp_helper(int token)
 	dev_pm_opp_clear_config(token);
 }
 
+/* genpd helpers */
+static inline int dev_pm_opp_attach_genpd(struct device *dev,
+					  const char * const *names,
+					  struct device ***virt_devs)
+{
+	struct dev_pm_opp_config config = {
+		.genpd_names = names,
+		.virt_devs = virt_devs,
+	};
+
+	return dev_pm_opp_set_config(dev, &config);
+}
+
+static inline void dev_pm_opp_detach_genpd(int token)
+{
+	dev_pm_opp_clear_config(token);
+}
+
+static inline int devm_pm_opp_attach_genpd(struct device *dev,
+					   const char * const *names,
+					   struct device ***virt_devs)
+{
+	struct dev_pm_opp_config config = {
+		.genpd_names = names,
+		.virt_devs = virt_devs,
+	};
+
+	return devm_pm_opp_set_config(dev, &config);
+}
+
+
 #endif		/* __LINUX_OPP_H__ */
-- 
cgit 


From 298098e55a6fcc176a5af52cd689f33577ead5ca Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 26 May 2022 09:36:27 +0530
Subject: OPP: Migrate set-prop-name helper API to use set-config helpers

Now that we have a central API to handle all OPP table configurations,
migrate the set-prop-name family of helpers to use the new
infrastructure.

The return type and parameter to the APIs change a bit due to this,
update the current users as well in the same commit in order to avoid
breaking builds.

Acked-by: Samuel Holland <samuel@sholland.org> # sun50i
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/sun50i-cpufreq-nvmem.c | 31 +++++++++----------
 drivers/opp/core.c                     | 55 +++++++++++-----------------------
 include/linux/pm_opp.h                 | 23 ++++++++------
 3 files changed, 46 insertions(+), 63 deletions(-)

(limited to 'include')

diff --git a/drivers/cpufreq/sun50i-cpufreq-nvmem.c b/drivers/cpufreq/sun50i-cpufreq-nvmem.c
index 75e1bf3a08f7..a4922580ce06 100644
--- a/drivers/cpufreq/sun50i-cpufreq-nvmem.c
+++ b/drivers/cpufreq/sun50i-cpufreq-nvmem.c
@@ -86,20 +86,20 @@ static int sun50i_cpufreq_get_efuse(u32 *versions)
 
 static int sun50i_cpufreq_nvmem_probe(struct platform_device *pdev)
 {
-	struct opp_table **opp_tables;
+	int *opp_tokens;
 	char name[MAX_NAME_LEN];
 	unsigned int cpu;
 	u32 speed = 0;
 	int ret;
 
-	opp_tables = kcalloc(num_possible_cpus(), sizeof(*opp_tables),
+	opp_tokens = kcalloc(num_possible_cpus(), sizeof(*opp_tokens),
 			     GFP_KERNEL);
-	if (!opp_tables)
+	if (!opp_tokens)
 		return -ENOMEM;
 
 	ret = sun50i_cpufreq_get_efuse(&speed);
 	if (ret) {
-		kfree(opp_tables);
+		kfree(opp_tokens);
 		return ret;
 	}
 
@@ -113,9 +113,9 @@ static int sun50i_cpufreq_nvmem_probe(struct platform_device *pdev)
 			goto free_opp;
 		}
 
-		opp_tables[cpu] = dev_pm_opp_set_prop_name(cpu_dev, name);
-		if (IS_ERR(opp_tables[cpu])) {
-			ret = PTR_ERR(opp_tables[cpu]);
+		opp_tokens[cpu] = dev_pm_opp_set_prop_name(cpu_dev, name);
+		if (opp_tokens[cpu] < 0) {
+			ret = opp_tokens[cpu];
 			pr_err("Failed to set prop name\n");
 			goto free_opp;
 		}
@@ -124,7 +124,7 @@ static int sun50i_cpufreq_nvmem_probe(struct platform_device *pdev)
 	cpufreq_dt_pdev = platform_device_register_simple("cpufreq-dt", -1,
 							  NULL, 0);
 	if (!IS_ERR(cpufreq_dt_pdev)) {
-		platform_set_drvdata(pdev, opp_tables);
+		platform_set_drvdata(pdev, opp_tokens);
 		return 0;
 	}
 
@@ -132,27 +132,24 @@ static int sun50i_cpufreq_nvmem_probe(struct platform_device *pdev)
 	pr_err("Failed to register platform device\n");
 
 free_opp:
-	for_each_possible_cpu(cpu) {
-		if (IS_ERR_OR_NULL(opp_tables[cpu]))
-			break;
-		dev_pm_opp_put_prop_name(opp_tables[cpu]);
-	}
-	kfree(opp_tables);
+	for_each_possible_cpu(cpu)
+		dev_pm_opp_put_prop_name(opp_tokens[cpu]);
+	kfree(opp_tokens);
 
 	return ret;
 }
 
 static int sun50i_cpufreq_nvmem_remove(struct platform_device *pdev)
 {
-	struct opp_table **opp_tables = platform_get_drvdata(pdev);
+	int *opp_tokens = platform_get_drvdata(pdev);
 	unsigned int cpu;
 
 	platform_device_unregister(cpufreq_dt_pdev);
 
 	for_each_possible_cpu(cpu)
-		dev_pm_opp_put_prop_name(opp_tables[cpu]);
+		dev_pm_opp_put_prop_name(opp_tokens[cpu]);
 
-	kfree(opp_tables);
+	kfree(opp_tokens);
 
 	return 0;
 }
diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 458584994c2b..1745e25c1eaf 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -1997,7 +1997,7 @@ static void _opp_put_supported_hw(struct opp_table *opp_table)
 }
 
 /**
- * dev_pm_opp_set_prop_name() - Set prop-extn name
+ * _opp_set_prop_name() - Set prop-extn name
  * @dev: Device for which the prop-name has to be set.
  * @name: name to postfix to properties.
  *
@@ -2006,50 +2006,33 @@ static void _opp_put_supported_hw(struct opp_table *opp_table)
  * which the extension will apply are opp-microvolt and opp-microamp. OPP core
  * should postfix the property name with -<name> while looking for them.
  */
-struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name)
+static int _opp_set_prop_name(struct opp_table *opp_table, const char *name)
 {
-	struct opp_table *opp_table;
-
-	opp_table = _add_opp_table(dev, false);
-	if (IS_ERR(opp_table))
-		return opp_table;
-
-	/* Make sure there are no concurrent readers while updating opp_table */
-	WARN_ON(!list_empty(&opp_table->opp_list));
-
 	/* Another CPU that shares the OPP table has set the property ? */
-	if (opp_table->prop_name)
-		return opp_table;
-
-	opp_table->prop_name = kstrdup(name, GFP_KERNEL);
 	if (!opp_table->prop_name) {
-		dev_pm_opp_put_opp_table(opp_table);
-		return ERR_PTR(-ENOMEM);
+		opp_table->prop_name = kstrdup(name, GFP_KERNEL);
+		if (!opp_table->prop_name)
+			return -ENOMEM;
 	}
 
-	return opp_table;
+	return 0;
 }
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_prop_name);
 
 /**
- * dev_pm_opp_put_prop_name() - Releases resources blocked for prop-name
- * @opp_table: OPP table returned by dev_pm_opp_set_prop_name().
+ * _opp_put_prop_name() - Releases resources blocked for prop-name
+ * @opp_table: OPP table returned by _opp_set_prop_name().
  *
  * This is required only for the V2 bindings, and is called for a matching
- * dev_pm_opp_set_prop_name(). Until this is called, the opp_table structure
+ * _opp_set_prop_name(). Until this is called, the opp_table structure
  * will not be freed.
  */
-void dev_pm_opp_put_prop_name(struct opp_table *opp_table)
+static void _opp_put_prop_name(struct opp_table *opp_table)
 {
-	if (unlikely(!opp_table))
-		return;
-
-	kfree(opp_table->prop_name);
-	opp_table->prop_name = NULL;
-
-	dev_pm_opp_put_opp_table(opp_table);
+	if (opp_table->prop_name) {
+		kfree(opp_table->prop_name);
+		opp_table->prop_name = NULL;
+	}
 }
-EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
 
 /**
  * _opp_set_regulators() - Set regulator names for the device
@@ -2414,7 +2397,7 @@ static void _opp_clear_config(struct opp_config_data *data)
 	if (data->flags & OPP_CONFIG_REGULATOR_HELPER)
 		_opp_unregister_set_opp_helper(data->opp_table);
 	if (data->flags & OPP_CONFIG_PROP_NAME)
-		dev_pm_opp_put_prop_name(data->opp_table);
+		_opp_put_prop_name(data->opp_table);
 	if (data->flags & OPP_CONFIG_CLK)
 		_opp_put_clknames(data->opp_table);
 
@@ -2441,7 +2424,7 @@ static void _opp_clear_config(struct opp_config_data *data)
  */
 int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 {
-	struct opp_table *opp_table, *err;
+	struct opp_table *opp_table;
 	struct opp_config_data *data;
 	unsigned int id;
 	int ret;
@@ -2476,11 +2459,9 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 
 	/* Configure property names */
 	if (config->prop_name) {
-		err = dev_pm_opp_set_prop_name(dev, config->prop_name);
-		if (IS_ERR(err)) {
-			ret = PTR_ERR(err);
+		ret = _opp_set_prop_name(opp_table, config->prop_name);
+		if (ret)
 			goto err;
-		}
 
 		data->flags |= OPP_CONFIG_PROP_NAME;
 	}
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 20e1e5060a8a..f995ca1406e8 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -184,8 +184,6 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config);
 int devm_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config);
 void dev_pm_opp_clear_config(int token);
 
-struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name);
-void dev_pm_opp_put_prop_name(struct opp_table *opp_table);
 struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table, struct opp_table *dst_table, struct dev_pm_opp *src_opp);
 int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate);
 int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq);
@@ -357,13 +355,6 @@ static inline int dev_pm_opp_unregister_notifier(struct device *dev, struct noti
 	return -EOPNOTSUPP;
 }
 
-static inline struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
-static inline void dev_pm_opp_put_prop_name(struct opp_table *opp_table) {}
-
 static inline int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 {
 	return -EOPNOTSUPP;
@@ -652,5 +643,19 @@ static inline int devm_pm_opp_attach_genpd(struct device *dev,
 	return devm_pm_opp_set_config(dev, &config);
 }
 
+/* prop-name helpers */
+static inline int dev_pm_opp_set_prop_name(struct device *dev, const char *name)
+{
+	struct dev_pm_opp_config config = {
+		.prop_name = name,
+	};
+
+	return dev_pm_opp_set_config(dev, &config);
+}
+
+static inline void dev_pm_opp_put_prop_name(int token)
+{
+	dev_pm_opp_clear_config(token);
+}
 
 #endif		/* __LINUX_OPP_H__ */
-- 
cgit 


From aee3352f6ecf8cfad1f1ee5838cfc4d37c6b8f75 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 4 Jul 2022 13:45:08 +0530
Subject: OPP: Add support for config_regulators() helper

Extend the dev_pm_opp_set_config() interface to allow adding
config_regulators() helpers. This helper will be called to set the
voltages of the regulators from the regular path in _set_opp(), while we
are trying to change the OPP.

This will eventually replace the custom set_opp() helper.

Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 68 +++++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/opp/opp.h      |  2 ++
 include/linux/pm_opp.h | 22 ++++++++++++++++
 3 files changed, 91 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 1745e25c1eaf..12bae79564f1 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -1185,6 +1185,17 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table,
 			dev_err(dev, "Failed to set bw: %d\n", ret);
 			return ret;
 		}
+
+		if (opp_table->config_regulators) {
+			ret = opp_table->config_regulators(dev, old_opp, opp,
+							   opp_table->regulators,
+							   opp_table->regulator_count);
+			if (ret) {
+				dev_err(dev, "Failed to set regulator voltages: %d\n",
+					ret);
+				return ret;
+			}
+		}
 	}
 
 	if (opp_table->set_opp) {
@@ -1202,6 +1213,17 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table,
 
 	/* Scaling down? Configure required OPPs after frequency */
 	if (scaling_down) {
+		if (opp_table->config_regulators) {
+			ret = opp_table->config_regulators(dev, old_opp, opp,
+							   opp_table->regulators,
+							   opp_table->regulator_count);
+			if (ret) {
+				dev_err(dev, "Failed to set regulator voltages: %d\n",
+					ret);
+				return ret;
+			}
+		}
+
 		ret = _set_opp_bw(opp_table, opp, dev);
 		if (ret) {
 			dev_err(dev, "Failed to set bw: %d\n", ret);
@@ -2268,6 +2290,38 @@ static void _opp_unregister_set_opp_helper(struct opp_table *opp_table)
 	}
 }
 
+/**
+ * _opp_set_config_regulators_helper() - Register custom set regulator helper.
+ * @dev: Device for which the helper is getting registered.
+ * @config_regulators: Custom set regulator helper.
+ *
+ * This is useful to support platforms with multiple regulators per device.
+ *
+ * This must be called before any OPPs are initialized for the device.
+ */
+static int _opp_set_config_regulators_helper(struct opp_table *opp_table,
+		struct device *dev, config_regulators_t config_regulators)
+{
+	/* Another CPU that shares the OPP table has set the helper ? */
+	if (!opp_table->config_regulators)
+		opp_table->config_regulators = config_regulators;
+
+	return 0;
+}
+
+/**
+ * _opp_put_config_regulators_helper() - Releases resources blocked for
+ *					 config_regulators helper.
+ * @opp_table: OPP table returned from _opp_set_config_regulators_helper().
+ *
+ * Release resources blocked for platform specific config_regulators helper.
+ */
+static void _opp_put_config_regulators_helper(struct opp_table *opp_table)
+{
+	if (opp_table->config_regulators)
+		opp_table->config_regulators = NULL;
+}
+
 static void _detach_genpd(struct opp_table *opp_table)
 {
 	int index;
@@ -2394,8 +2448,10 @@ static void _opp_clear_config(struct opp_config_data *data)
 		_opp_put_regulators(data->opp_table);
 	if (data->flags & OPP_CONFIG_SUPPORTED_HW)
 		_opp_put_supported_hw(data->opp_table);
-	if (data->flags & OPP_CONFIG_REGULATOR_HELPER)
+	if (data->flags & OPP_CONFIG_REGULATOR_HELPER) {
+		_opp_put_config_regulators_helper(data->opp_table);
 		_opp_unregister_set_opp_helper(data->opp_table);
+	}
 	if (data->flags & OPP_CONFIG_PROP_NAME)
 		_opp_put_prop_name(data->opp_table);
 	if (data->flags & OPP_CONFIG_CLK)
@@ -2476,6 +2532,16 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 		data->flags |= OPP_CONFIG_REGULATOR_HELPER;
 	}
 
+	/* Configure config_regulators helper */
+	if (config->config_regulators) {
+		ret = _opp_set_config_regulators_helper(opp_table, dev,
+						config->config_regulators);
+		if (ret)
+			goto err;
+
+		data->flags |= OPP_CONFIG_REGULATOR_HELPER;
+	}
+
 	/* Configure supported hardware */
 	if (config->supported_hw) {
 		ret = _opp_set_supported_hw(opp_table, config->supported_hw,
diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
index d652f0cc84f1..45fd40737159 100644
--- a/drivers/opp/opp.h
+++ b/drivers/opp/opp.h
@@ -172,6 +172,7 @@ enum opp_table_access {
  * @prop_name: A name to postfix to many DT properties, while parsing them.
  * @clk_configured: Clock name is configured by the platform.
  * @clk: Device's clock handle
+ * @config_regulators: Platform specific config_regulators() callback.
  * @regulators: Supply regulators
  * @regulator_count: Number of power supply regulators. Its value can be -1
  * (uninitialized), 0 (no opp-microvolt property) or > 0 (has opp-microvolt
@@ -224,6 +225,7 @@ struct opp_table {
 	const char *prop_name;
 	bool clk_configured;
 	struct clk *clk;
+	config_regulators_t config_regulators;
 	struct regulator **regulators;
 	int regulator_count;
 	struct icc_path **paths;
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index f995ca1406e8..9f2f9a792a19 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -90,11 +90,16 @@ struct dev_pm_set_opp_data {
 	struct device *dev;
 };
 
+typedef int (*config_regulators_t)(struct device *dev,
+			struct dev_pm_opp *old_opp, struct dev_pm_opp *new_opp,
+			struct regulator **regulators, unsigned int count);
+
 /**
  * struct dev_pm_opp_config - Device OPP configuration values
  * @clk_names: Clk names, NULL terminated array, max 1 clock for now.
  * @prop_name: Name to postfix to properties.
  * @set_opp: Custom set OPP helper.
+ * @config_regulators: Custom set regulator helper.
  * @supported_hw: Array of hierarchy of versions to match.
  * @supported_hw_count: Number of elements in the array.
  * @regulator_names: Array of pointers to the names of the regulator, NULL terminated.
@@ -109,6 +114,7 @@ struct dev_pm_opp_config {
 	const char * const *clk_names;
 	const char *prop_name;
 	int (*set_opp)(struct dev_pm_set_opp_data *data);
+	config_regulators_t config_regulators;
 	const unsigned int *supported_hw;
 	unsigned int supported_hw_count;
 	const char * const *regulator_names;
@@ -613,6 +619,22 @@ static inline void dev_pm_opp_unregister_set_opp_helper(int token)
 	dev_pm_opp_clear_config(token);
 }
 
+/* config-regulators helpers */
+static inline int dev_pm_opp_set_config_regulators(struct device *dev,
+						   config_regulators_t helper)
+{
+	struct dev_pm_opp_config config = {
+		.config_regulators = helper,
+	};
+
+	return dev_pm_opp_set_config(dev, &config);
+}
+
+static inline void dev_pm_opp_put_config_regulators(int token)
+{
+	dev_pm_opp_clear_config(token);
+}
+
 /* genpd helpers */
 static inline int dev_pm_opp_attach_genpd(struct device *dev,
 					  const char * const *names,
-- 
cgit 


From 69b1af178a3a534da2f20fa0c7356fcbbe8cce1f Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 31 May 2022 13:24:21 +0530
Subject: OPP: Add dev_pm_opp_get_supplies()

We already have an API for getting voltage information for a single
regulator, dev_pm_opp_get_voltage(), but there is nothing available for
multiple regulator case.

This patch adds a new API, dev_pm_opp_get_supplies(), to get all
information related to the supplies for an OPP.

Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 25 +++++++++++++++++++++++++
 include/linux/pm_opp.h |  7 +++++++
 2 files changed, 32 insertions(+)

(limited to 'include')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 5ecf077b6b17..3a794bff2ba1 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -117,6 +117,31 @@ unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_get_voltage);
 
+/**
+ * dev_pm_opp_get_supplies() - Gets the supply information corresponding to an opp
+ * @opp:	opp for which voltage has to be returned for
+ * @supplies:	Placeholder for copying the supply information.
+ *
+ * Return: negative error number on failure, 0 otherwise on success after
+ * setting @supplies.
+ *
+ * This can be used for devices with any number of power supplies. The caller
+ * must ensure the @supplies array must contain space for each regulator.
+ */
+int dev_pm_opp_get_supplies(struct dev_pm_opp *opp,
+			    struct dev_pm_opp_supply *supplies)
+{
+	if (IS_ERR_OR_NULL(opp) || !supplies) {
+		pr_err("%s: Invalid parameters\n", __func__);
+		return -EINVAL;
+	}
+
+	memcpy(supplies, opp->supplies,
+	       sizeof(*supplies) * opp->opp_table->regulator_count);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_supplies);
+
 /**
  * dev_pm_opp_get_power() - Gets the power corresponding to an opp
  * @opp:	opp for which power has to be returned for
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 9f2f9a792a19..1e2b33d79ba6 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -129,6 +129,8 @@ void dev_pm_opp_put_opp_table(struct opp_table *opp_table);
 
 unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp);
 
+int dev_pm_opp_get_supplies(struct dev_pm_opp *opp, struct dev_pm_opp_supply *supplies);
+
 unsigned long dev_pm_opp_get_power(struct dev_pm_opp *opp);
 
 unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp);
@@ -217,6 +219,11 @@ static inline unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
 	return 0;
 }
 
+static inline int dev_pm_opp_get_supplies(struct dev_pm_opp *opp, struct dev_pm_opp_supply *supplies)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline unsigned long dev_pm_opp_get_power(struct dev_pm_opp *opp)
 {
 	return 0;
-- 
cgit 


From 1f378c6ead5c69decf36e8e61de2e50377c76ab8 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 4 Jul 2022 14:57:06 +0530
Subject: OPP: Remove custom OPP helper support

The only user of the custom helper is migrated to use
dev_pm_opp_set_config_regulators() interface. Remove the now unused
custom OPP helper support.

This cleans up _set_opp() and leaves a single code path to be used by
all users.

Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 132 +------------------------------------------------
 drivers/opp/opp.h      |   7 ---
 include/linux/pm_opp.h |  51 -------------------
 3 files changed, 2 insertions(+), 188 deletions(-)

(limited to 'include')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 3a794bff2ba1..e74bdc134c5a 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -979,36 +979,6 @@ static int _set_opp_bw(const struct opp_table *opp_table,
 	return 0;
 }
 
-static int _set_opp_custom(const struct opp_table *opp_table,
-			   struct device *dev, struct dev_pm_opp *opp,
-			   unsigned long freq)
-{
-	struct dev_pm_set_opp_data *data = opp_table->set_opp_data;
-	struct dev_pm_opp *old_opp = opp_table->current_opp;
-	int size;
-
-	/*
-	 * We support this only if dev_pm_opp_set_config() was called
-	 * earlier to set regulators.
-	 */
-	if (opp_table->sod_supplies) {
-		size = sizeof(*old_opp->supplies) * opp_table->regulator_count;
-		memcpy(data->old_opp.supplies, old_opp->supplies, size);
-		memcpy(data->new_opp.supplies, opp->supplies, size);
-		data->regulator_count = opp_table->regulator_count;
-	} else {
-		data->regulator_count = 0;
-	}
-
-	data->regulators = opp_table->regulators;
-	data->clk = opp_table->clk;
-	data->dev = dev;
-	data->old_opp.rate = old_opp->rate;
-	data->new_opp.rate = freq;
-
-	return opp_table->set_opp(data);
-}
-
 static int _set_required_opp(struct device *dev, struct device *pd_dev,
 			     struct dev_pm_opp *opp, int i)
 {
@@ -1195,13 +1165,7 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table,
 		}
 	}
 
-	if (opp_table->set_opp) {
-		ret = _set_opp_custom(opp_table, dev, opp, freq);
-	} else {
-		/* Only frequency scaling */
-		ret = _generic_set_opp_clk_only(dev, opp_table->clk, freq);
-	}
-
+	ret = _generic_set_opp_clk_only(dev, opp_table->clk, freq);
 	if (ret)
 		return ret;
 
@@ -2065,7 +2029,6 @@ static void _opp_put_prop_name(struct opp_table *opp_table)
 static int _opp_set_regulators(struct opp_table *opp_table, struct device *dev,
 			       const char * const names[])
 {
-	struct dev_pm_opp_supply *supplies;
 	const char * const *temp = names;
 	struct regulator *reg;
 	int count = 0, ret, i;
@@ -2101,20 +2064,6 @@ static int _opp_set_regulators(struct opp_table *opp_table, struct device *dev,
 
 	opp_table->regulator_count = count;
 
-	supplies = kmalloc_array(count * 2, sizeof(*supplies), GFP_KERNEL);
-	if (!supplies) {
-		ret = -ENOMEM;
-		goto free_regulators;
-	}
-
-	mutex_lock(&opp_table->lock);
-	opp_table->sod_supplies = supplies;
-	if (opp_table->set_opp_data) {
-		opp_table->set_opp_data->old_opp.supplies = supplies;
-		opp_table->set_opp_data->new_opp.supplies = supplies + count;
-	}
-	mutex_unlock(&opp_table->lock);
-
 	/* Set generic config_regulators() for single regulators here */
 	if (count == 1)
 		opp_table->config_regulators = _opp_config_regulator_single;
@@ -2151,16 +2100,6 @@ static void _opp_put_regulators(struct opp_table *opp_table)
 	for (i = opp_table->regulator_count - 1; i >= 0; i--)
 		regulator_put(opp_table->regulators[i]);
 
-	mutex_lock(&opp_table->lock);
-	if (opp_table->set_opp_data) {
-		opp_table->set_opp_data->old_opp.supplies = NULL;
-		opp_table->set_opp_data->new_opp.supplies = NULL;
-	}
-
-	kfree(opp_table->sod_supplies);
-	opp_table->sod_supplies = NULL;
-	mutex_unlock(&opp_table->lock);
-
 	kfree(opp_table->regulators);
 	opp_table->regulators = NULL;
 	opp_table->regulator_count = -1;
@@ -2233,61 +2172,6 @@ static void _opp_put_clknames(struct opp_table *opp_table)
 	}
 }
 
-/**
- * _opp_register_set_opp_helper() - Register custom set OPP helper
- * @dev: Device for which the helper is getting registered.
- * @set_opp: Custom set OPP helper.
- *
- * This is useful to support complex platforms (like platforms with multiple
- * regulators per device), instead of the generic OPP set rate helper.
- *
- * This must be called before any OPPs are initialized for the device.
- */
-static int _opp_register_set_opp_helper(struct opp_table *opp_table,
-	struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data))
-{
-	struct dev_pm_set_opp_data *data;
-
-	/* Another CPU that shares the OPP table has set the helper ? */
-	if (opp_table->set_opp)
-		return 0;
-
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
-	if (!data)
-		return -ENOMEM;
-
-	mutex_lock(&opp_table->lock);
-	opp_table->set_opp_data = data;
-	if (opp_table->sod_supplies) {
-		data->old_opp.supplies = opp_table->sod_supplies;
-		data->new_opp.supplies = opp_table->sod_supplies +
-					 opp_table->regulator_count;
-	}
-	mutex_unlock(&opp_table->lock);
-
-	opp_table->set_opp = set_opp;
-
-	return 0;
-}
-
-/**
- * _opp_unregister_set_opp_helper() - Releases resources blocked for set_opp helper
- * @opp_table: OPP table returned from _opp_register_set_opp_helper().
- *
- * Release resources blocked for platform specific set_opp helper.
- */
-static void _opp_unregister_set_opp_helper(struct opp_table *opp_table)
-{
-	if (opp_table->set_opp) {
-		opp_table->set_opp = NULL;
-
-		mutex_lock(&opp_table->lock);
-		kfree(opp_table->set_opp_data);
-		opp_table->set_opp_data = NULL;
-		mutex_unlock(&opp_table->lock);
-	}
-}
-
 /**
  * _opp_set_config_regulators_helper() - Register custom set regulator helper.
  * @dev: Device for which the helper is getting registered.
@@ -2446,10 +2330,8 @@ static void _opp_clear_config(struct opp_config_data *data)
 		_opp_put_regulators(data->opp_table);
 	if (data->flags & OPP_CONFIG_SUPPORTED_HW)
 		_opp_put_supported_hw(data->opp_table);
-	if (data->flags & OPP_CONFIG_REGULATOR_HELPER) {
+	if (data->flags & OPP_CONFIG_REGULATOR_HELPER)
 		_opp_put_config_regulators_helper(data->opp_table);
-		_opp_unregister_set_opp_helper(data->opp_table);
-	}
 	if (data->flags & OPP_CONFIG_PROP_NAME)
 		_opp_put_prop_name(data->opp_table);
 	if (data->flags & OPP_CONFIG_CLK)
@@ -2520,16 +2402,6 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 		data->flags |= OPP_CONFIG_PROP_NAME;
 	}
 
-	/* Configure opp helper */
-	if (config->set_opp) {
-		ret = _opp_register_set_opp_helper(opp_table, dev,
-						   config->set_opp);
-		if (ret)
-			goto err;
-
-		data->flags |= OPP_CONFIG_REGULATOR_HELPER;
-	}
-
 	/* Configure config_regulators helper */
 	if (config->config_regulators) {
 		ret = _opp_set_config_regulators_helper(opp_table, dev,
diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
index 45fd40737159..13abe991e811 100644
--- a/drivers/opp/opp.h
+++ b/drivers/opp/opp.h
@@ -182,9 +182,6 @@ enum opp_table_access {
  * @enabled: Set to true if the device's resources are enabled/configured.
  * @genpd_performance_state: Device's power domain support performance state.
  * @is_genpd: Marks if the OPP table belongs to a genpd.
- * @set_opp: Platform specific set_opp callback
- * @sod_supplies: Set opp data supplies
- * @set_opp_data: Data to be passed to set_opp callback
  * @dentry:	debugfs dentry pointer of the real device directory (not links).
  * @dentry_name: Name of the real dentry.
  *
@@ -234,10 +231,6 @@ struct opp_table {
 	bool genpd_performance_state;
 	bool is_genpd;
 
-	int (*set_opp)(struct dev_pm_set_opp_data *data);
-	struct dev_pm_opp_supply *sod_supplies;
-	struct dev_pm_set_opp_data *set_opp_data;
-
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *dentry;
 	char dentry_name[NAME_MAX];
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 1e2b33d79ba6..9d59aedc2be3 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -57,39 +57,6 @@ struct dev_pm_opp_icc_bw {
 	u32 peak;
 };
 
-/**
- * struct dev_pm_opp_info - OPP freq/voltage/current values
- * @rate:	Target clk rate in hz
- * @supplies:	Array of voltage/current values for all power supplies
- *
- * This structure stores the freq/voltage/current values for a single OPP.
- */
-struct dev_pm_opp_info {
-	unsigned long rate;
-	struct dev_pm_opp_supply *supplies;
-};
-
-/**
- * struct dev_pm_set_opp_data - Set OPP data
- * @old_opp:	Old OPP info
- * @new_opp:	New OPP info
- * @regulators:	Array of regulator pointers
- * @regulator_count: Number of regulators
- * @clk:	Pointer to clk
- * @dev:	Pointer to the struct device
- *
- * This structure contains all information required for setting an OPP.
- */
-struct dev_pm_set_opp_data {
-	struct dev_pm_opp_info old_opp;
-	struct dev_pm_opp_info new_opp;
-
-	struct regulator **regulators;
-	unsigned int regulator_count;
-	struct clk *clk;
-	struct device *dev;
-};
-
 typedef int (*config_regulators_t)(struct device *dev,
 			struct dev_pm_opp *old_opp, struct dev_pm_opp *new_opp,
 			struct regulator **regulators, unsigned int count);
@@ -98,7 +65,6 @@ typedef int (*config_regulators_t)(struct device *dev,
  * struct dev_pm_opp_config - Device OPP configuration values
  * @clk_names: Clk names, NULL terminated array, max 1 clock for now.
  * @prop_name: Name to postfix to properties.
- * @set_opp: Custom set OPP helper.
  * @config_regulators: Custom set regulator helper.
  * @supported_hw: Array of hierarchy of versions to match.
  * @supported_hw_count: Number of elements in the array.
@@ -113,7 +79,6 @@ struct dev_pm_opp_config {
 	/* NULL terminated */
 	const char * const *clk_names;
 	const char *prop_name;
-	int (*set_opp)(struct dev_pm_set_opp_data *data);
 	config_regulators_t config_regulators;
 	const unsigned int *supported_hw;
 	unsigned int supported_hw_count;
@@ -610,22 +575,6 @@ static inline int devm_pm_opp_set_clkname(struct device *dev, const char *name)
 	return devm_pm_opp_set_config(dev, &config);
 }
 
-/* set-opp helpers */
-static inline int dev_pm_opp_register_set_opp_helper(struct device *dev,
-			int (*set_opp)(struct dev_pm_set_opp_data *data))
-{
-	struct dev_pm_opp_config config = {
-		.set_opp = set_opp,
-	};
-
-	return dev_pm_opp_set_config(dev, &config);
-}
-
-static inline void dev_pm_opp_unregister_set_opp_helper(int token)
-{
-	dev_pm_opp_clear_config(token);
-}
-
 /* config-regulators helpers */
 static inline int dev_pm_opp_set_config_regulators(struct device *dev,
 						   config_regulators_t helper)
-- 
cgit 


From 9fbb62605607d8f00c32a4765cd1ae67f022b640 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 2 Jun 2022 14:05:59 +0530
Subject: OPP: Remove dev_pm_opp_find_freq_ceil_by_volt()

This was added few years back, but the code that was supposed to use it
never got merged. Remove the unused helper.

Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 54 --------------------------------------------------
 include/linux/pm_opp.h |  8 --------
 2 files changed, 62 deletions(-)

(limited to 'include')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index e74bdc134c5a..fc0232f695c6 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -607,60 +607,6 @@ struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_floor);
 
-/**
- * dev_pm_opp_find_freq_ceil_by_volt() - Find OPP with highest frequency for
- *					 target voltage.
- * @dev:	Device for which we do this operation.
- * @u_volt:	Target voltage.
- *
- * Search for OPP with highest (ceil) frequency and has voltage <= u_volt.
- *
- * Return: matching *opp, else returns ERR_PTR in case of error which should be
- * handled using IS_ERR.
- *
- * Error return values can be:
- * EINVAL:	bad parameters
- *
- * The callers are required to call dev_pm_opp_put() for the returned OPP after
- * use.
- */
-struct dev_pm_opp *dev_pm_opp_find_freq_ceil_by_volt(struct device *dev,
-						     unsigned long u_volt)
-{
-	struct opp_table *opp_table;
-	struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
-
-	if (!dev || !u_volt) {
-		dev_err(dev, "%s: Invalid argument volt=%lu\n", __func__,
-			u_volt);
-		return ERR_PTR(-EINVAL);
-	}
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table))
-		return ERR_CAST(opp_table);
-
-	mutex_lock(&opp_table->lock);
-
-	list_for_each_entry(temp_opp, &opp_table->opp_list, node) {
-		if (temp_opp->available) {
-			if (temp_opp->supplies[0].u_volt > u_volt)
-				break;
-			opp = temp_opp;
-		}
-	}
-
-	/* Increment the reference count of OPP */
-	if (!IS_ERR(opp))
-		dev_pm_opp_get(opp);
-
-	mutex_unlock(&opp_table->lock);
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return opp;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_ceil_by_volt);
-
 /**
  * dev_pm_opp_find_level_exact() - search for an exact level
  * @dev:		device for which we do this operation
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 9d59aedc2be3..50cbc75bef71 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -118,8 +118,6 @@ struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
 					      bool available);
 struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev,
 					      unsigned long *freq);
-struct dev_pm_opp *dev_pm_opp_find_freq_ceil_by_volt(struct device *dev,
-						     unsigned long u_volt);
 
 struct dev_pm_opp *dev_pm_opp_find_level_exact(struct device *dev,
 					       unsigned int level);
@@ -265,12 +263,6 @@ static inline struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev,
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
-static inline struct dev_pm_opp *dev_pm_opp_find_freq_ceil_by_volt(struct device *dev,
-					unsigned long u_volt)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
 static inline struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev,
 					unsigned long *freq)
 {
-- 
cgit 


From 5de7d31b50c77d97fc568e7279c1831a72c33ee9 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Wed, 6 Jul 2022 08:42:51 +0530
Subject: dt-bindings: power: Add Tegra234 MGBE power domains

Add power domain IDs for the four MGBE power partitions found on
Tegra234.

Signed-off-by: Thierry Reding <treding@nvidia.com>
Signed-off-by: Bhadram Varka <vbhadram@nvidia.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 include/dt-bindings/power/tegra234-powergate.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/power/tegra234-powergate.h b/include/dt-bindings/power/tegra234-powergate.h
index f610eee9bce8..df1d4dd8dcf3 100644
--- a/include/dt-bindings/power/tegra234-powergate.h
+++ b/include/dt-bindings/power/tegra234-powergate.h
@@ -18,5 +18,6 @@
 #define TEGRA234_POWER_DOMAIN_MGBEA	17U
 #define TEGRA234_POWER_DOMAIN_MGBEB	18U
 #define TEGRA234_POWER_DOMAIN_MGBEC	19U
+#define TEGRA234_POWER_DOMAIN_MGBED	20U
 
 #endif
-- 
cgit 


From b0aedf342bc31d7b8e9a782cd8b439db07fbdc78 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Wed, 6 Jul 2022 08:42:52 +0530
Subject: dt-bindings: Add Tegra234 MGBE clocks and resets

Add the clocks and resets used by the MGBE Ethernet hardware found on
Tegra234 SoCs.

Signed-off-by: Thierry Reding <treding@nvidia.com>
Signed-off-by: Bhadram Varka <vbhadram@nvidia.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 include/dt-bindings/clock/tegra234-clock.h | 101 +++++++++++++++++++++++++++++
 include/dt-bindings/reset/tegra234-reset.h |   8 +++
 2 files changed, 109 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/tegra234-clock.h b/include/dt-bindings/clock/tegra234-clock.h
index bd4c3086a2da..bab85d9ba8cd 100644
--- a/include/dt-bindings/clock/tegra234-clock.h
+++ b/include/dt-bindings/clock/tegra234-clock.h
@@ -164,10 +164,111 @@
 #define TEGRA234_CLK_PEX1_C5_CORE		225U
 /** @brief PLL controlled by CLK_RST_CONTROLLER_PLLC4_BASE */
 #define TEGRA234_CLK_PLLC4			237U
+/** @brief RX clock recovered from MGBE0 lane input */
+#define TEGRA234_CLK_MGBE0_RX_INPUT		248U
+/** @brief RX clock recovered from MGBE1 lane input */
+#define TEGRA234_CLK_MGBE1_RX_INPUT		249U
+/** @brief RX clock recovered from MGBE2 lane input */
+#define TEGRA234_CLK_MGBE2_RX_INPUT		250U
+/** @brief RX clock recovered from MGBE3 lane input */
+#define TEGRA234_CLK_MGBE3_RX_INPUT		251U
 /** @brief 32K input clock provided by PMIC */
 #define TEGRA234_CLK_CLK_32K			289U
+/** @brief Monitored branch of MBGE0 RX input clock */
+#define TEGRA234_CLK_MGBE0_RX_INPUT_M		357U
+/** @brief Monitored branch of MBGE1 RX input clock */
+#define TEGRA234_CLK_MGBE1_RX_INPUT_M		358U
+/** @brief Monitored branch of MBGE2 RX input clock */
+#define TEGRA234_CLK_MGBE2_RX_INPUT_M		359U
+/** @brief Monitored branch of MBGE3 RX input clock */
+#define TEGRA234_CLK_MGBE3_RX_INPUT_M		360U
+/** @brief Monitored branch of MGBE0 RX PCS mux output */
+#define TEGRA234_CLK_MGBE0_RX_PCS_M		361U
+/** @brief Monitored branch of MGBE1 RX PCS mux output */
+#define TEGRA234_CLK_MGBE1_RX_PCS_M		362U
+/** @brief Monitored branch of MGBE2 RX PCS mux output */
+#define TEGRA234_CLK_MGBE2_RX_PCS_M		363U
+/** @brief Monitored branch of MGBE3 RX PCS mux output */
+#define TEGRA234_CLK_MGBE3_RX_PCS_M		364U
+/** @brief RX PCS clock recovered from MGBE0 lane input */
+#define TEGRA234_CLK_MGBE0_RX_PCS_INPUT		369U
+/** @brief RX PCS clock recovered from MGBE1 lane input */
+#define TEGRA234_CLK_MGBE1_RX_PCS_INPUT		370U
+/** @brief RX PCS clock recovered from MGBE2 lane input */
+#define TEGRA234_CLK_MGBE2_RX_PCS_INPUT		371U
+/** @brief RX PCS clock recovered from MGBE3 lane input */
+#define TEGRA234_CLK_MGBE3_RX_PCS_INPUT		372U
+/** @brief output of mux controlled by GBE_UPHY_MGBE0_RX_PCS_CLK_SRC_SEL */
+#define TEGRA234_CLK_MGBE0_RX_PCS		373U
+/** @brief GBE_UPHY_MGBE0_TX_CLK divider gated output */
+#define TEGRA234_CLK_MGBE0_TX			374U
+/** @brief GBE_UPHY_MGBE0_TX_PCS_CLK divider gated output */
+#define TEGRA234_CLK_MGBE0_TX_PCS		375U
+/** @brief GBE_UPHY_MGBE0_MAC_CLK divider output */
+#define TEGRA234_CLK_MGBE0_MAC_DIVIDER		376U
+/** @brief GBE_UPHY_MGBE0_MAC_CLK gate output */
+#define TEGRA234_CLK_MGBE0_MAC			377U
+/** @brief GBE_UPHY_MGBE0_MACSEC_CLK gate output */
+#define TEGRA234_CLK_MGBE0_MACSEC		378U
+/** @brief GBE_UPHY_MGBE0_EEE_PCS_CLK gate output */
+#define TEGRA234_CLK_MGBE0_EEE_PCS		379U
+/** @brief GBE_UPHY_MGBE0_APP_CLK gate output */
+#define TEGRA234_CLK_MGBE0_APP			380U
+/** @brief GBE_UPHY_MGBE0_PTP_REF_CLK divider gated output */
+#define TEGRA234_CLK_MGBE0_PTP_REF		381U
+/** @brief output of mux controlled by GBE_UPHY_MGBE1_RX_PCS_CLK_SRC_SEL */
+#define TEGRA234_CLK_MGBE1_RX_PCS		382U
+/** @brief GBE_UPHY_MGBE1_TX_CLK divider gated output */
+#define TEGRA234_CLK_MGBE1_TX			383U
+/** @brief GBE_UPHY_MGBE1_TX_PCS_CLK divider gated output */
+#define TEGRA234_CLK_MGBE1_TX_PCS		384U
+/** @brief GBE_UPHY_MGBE1_MAC_CLK divider output */
+#define TEGRA234_CLK_MGBE1_MAC_DIVIDER		385U
+/** @brief GBE_UPHY_MGBE1_MAC_CLK gate output */
+#define TEGRA234_CLK_MGBE1_MAC			386U
+/** @brief GBE_UPHY_MGBE1_EEE_PCS_CLK gate output */
+#define TEGRA234_CLK_MGBE1_EEE_PCS		388U
+/** @brief GBE_UPHY_MGBE1_APP_CLK gate output */
+#define TEGRA234_CLK_MGBE1_APP			389U
+/** @brief GBE_UPHY_MGBE1_PTP_REF_CLK divider gated output */
+#define TEGRA234_CLK_MGBE1_PTP_REF		390U
+/** @brief output of mux controlled by GBE_UPHY_MGBE2_RX_PCS_CLK_SRC_SEL */
+#define TEGRA234_CLK_MGBE2_RX_PCS		391U
+/** @brief GBE_UPHY_MGBE2_TX_CLK divider gated output */
+#define TEGRA234_CLK_MGBE2_TX			392U
+/** @brief GBE_UPHY_MGBE2_TX_PCS_CLK divider gated output */
+#define TEGRA234_CLK_MGBE2_TX_PCS		393U
+/** @brief GBE_UPHY_MGBE2_MAC_CLK divider output */
+#define TEGRA234_CLK_MGBE2_MAC_DIVIDER		394U
+/** @brief GBE_UPHY_MGBE2_MAC_CLK gate output */
+#define TEGRA234_CLK_MGBE2_MAC			395U
+/** @brief GBE_UPHY_MGBE2_EEE_PCS_CLK gate output */
+#define TEGRA234_CLK_MGBE2_EEE_PCS		397U
+/** @brief GBE_UPHY_MGBE2_APP_CLK gate output */
+#define TEGRA234_CLK_MGBE2_APP			398U
+/** @brief GBE_UPHY_MGBE2_PTP_REF_CLK divider gated output */
+#define TEGRA234_CLK_MGBE2_PTP_REF		399U
+/** @brief output of mux controlled by GBE_UPHY_MGBE3_RX_PCS_CLK_SRC_SEL */
+#define TEGRA234_CLK_MGBE3_RX_PCS		400U
+/** @brief GBE_UPHY_MGBE3_TX_CLK divider gated output */
+#define TEGRA234_CLK_MGBE3_TX			401U
+/** @brief GBE_UPHY_MGBE3_TX_PCS_CLK divider gated output */
+#define TEGRA234_CLK_MGBE3_TX_PCS		402U
+/** @brief GBE_UPHY_MGBE3_MAC_CLK divider output */
+#define TEGRA234_CLK_MGBE3_MAC_DIVIDER		403U
+/** @brief GBE_UPHY_MGBE3_MAC_CLK gate output */
+#define TEGRA234_CLK_MGBE3_MAC			404U
+/** @brief GBE_UPHY_MGBE3_MACSEC_CLK gate output */
+#define TEGRA234_CLK_MGBE3_MACSEC		405U
+/** @brief GBE_UPHY_MGBE3_EEE_PCS_CLK gate output */
+#define TEGRA234_CLK_MGBE3_EEE_PCS		406U
+/** @brief GBE_UPHY_MGBE3_APP_CLK gate output */
+#define TEGRA234_CLK_MGBE3_APP			407U
+/** @brief GBE_UPHY_MGBE3_PTP_REF_CLK divider gated output */
+#define TEGRA234_CLK_MGBE3_PTP_REF		408U
 /** @brief CLK_RST_CONTROLLER_AZA2XBITCLK_OUT_SWITCH_DIVIDER switch divider output (aza_2xbitclk) */
 #define TEGRA234_CLK_AZA_2XBIT			457U
 /** @brief aza_2xbitclk / 2 (aza_bitclk) */
 #define TEGRA234_CLK_AZA_BIT			458U
+
 #endif
diff --git a/include/dt-bindings/reset/tegra234-reset.h b/include/dt-bindings/reset/tegra234-reset.h
index 5809245e4e21..bd58a05f1d94 100644
--- a/include/dt-bindings/reset/tegra234-reset.h
+++ b/include/dt-bindings/reset/tegra234-reset.h
@@ -30,6 +30,12 @@
 #define TEGRA234_RESET_I2C7			33U
 #define TEGRA234_RESET_I2C8			34U
 #define TEGRA234_RESET_I2C9			35U
+#define TEGRA234_RESET_MGBE0_PCS		45U
+#define TEGRA234_RESET_MGBE0_MAC		46U
+#define TEGRA234_RESET_MGBE1_PCS		49U
+#define TEGRA234_RESET_MGBE1_MAC		50U
+#define TEGRA234_RESET_MGBE2_PCS		53U
+#define TEGRA234_RESET_MGBE2_MAC		54U
 #define TEGRA234_RESET_PEX2_CORE_10		56U
 #define TEGRA234_RESET_PEX2_CORE_10_APB		57U
 #define TEGRA234_RESET_PEX2_COMMON_APB		58U
@@ -44,6 +50,8 @@
 #define TEGRA234_RESET_QSPI0			76U
 #define TEGRA234_RESET_QSPI1			77U
 #define TEGRA234_RESET_SDMMC4			85U
+#define TEGRA234_RESET_MGBE3_PCS		87U
+#define TEGRA234_RESET_MGBE3_MAC		88U
 #define TEGRA234_RESET_UARTA			100U
 #define TEGRA234_RESET_PEX0_CORE_0		116U
 #define TEGRA234_RESET_PEX0_CORE_1		117U
-- 
cgit 


From 833f5a7eb2889067c45e5367aa7d5516df17520d Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Wed, 6 Jul 2022 08:42:53 +0530
Subject: dt-bindings: memory: Add Tegra234 MGBE memory clients

Add the memory client and stream ID definitions for the MGBE hardware
found on Tegra234 SoCs.

Signed-off-by: Thierry Reding <treding@nvidia.com>
Signed-off-by: Bhadram Varka <vbhadram@nvidia.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 include/dt-bindings/memory/tegra234-mc.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/memory/tegra234-mc.h b/include/dt-bindings/memory/tegra234-mc.h
index 4bc84a312ad6..8b0ddcb715ff 100644
--- a/include/dt-bindings/memory/tegra234-mc.h
+++ b/include/dt-bindings/memory/tegra234-mc.h
@@ -12,11 +12,15 @@
 #define TEGRA234_SID_APE	0x02
 #define TEGRA234_SID_HDA	0x03
 #define TEGRA234_SID_GPCDMA	0x04
+#define TEGRA234_SID_MGBE	0x06
 #define TEGRA234_SID_PCIE0	0x12
 #define TEGRA234_SID_PCIE4	0x13
 #define TEGRA234_SID_PCIE5	0x14
 #define TEGRA234_SID_PCIE6	0x15
 #define TEGRA234_SID_PCIE9	0x1f
+#define TEGRA234_SID_MGBE_VF1	0x49
+#define TEGRA234_SID_MGBE_VF2	0x4a
+#define TEGRA234_SID_MGBE_VF3	0x4b
 
 /* NISO1 stream IDs */
 #define TEGRA234_SID_SDMMC4	0x02
@@ -62,8 +66,24 @@
 #define TEGRA234_MEMORY_CLIENT_PCIE10AR1 0x48
 /* PCIE7r1 read clients */
 #define TEGRA234_MEMORY_CLIENT_PCIE7AR1 0x49
+/* MGBE0 read client */
+#define TEGRA234_MEMORY_CLIENT_MGBEARD 0x58
+/* MGBEB read client */
+#define TEGRA234_MEMORY_CLIENT_MGBEBRD 0x59
+/* MGBEC read client */
+#define TEGRA234_MEMORY_CLIENT_MGBECRD 0x5a
+/* MGBED read client */
+#define TEGRA234_MEMORY_CLIENT_MGBEDRD 0x5b
+/* MGBE0 write client */
+#define TEGRA234_MEMORY_CLIENT_MGBEAWR 0x5c
+/* MGBEB write client */
+#define TEGRA234_MEMORY_CLIENT_MGBEBWR 0x5f
+/* MGBEC write client */
+#define TEGRA234_MEMORY_CLIENT_MGBECWR 0x61
 /* sdmmcd memory read client */
 #define TEGRA234_MEMORY_CLIENT_SDMMCRAB 0x63
+/* MGBED write client */
+#define TEGRA234_MEMORY_CLIENT_MGBEDWR 0x65
 /* sdmmcd memory write client */
 #define TEGRA234_MEMORY_CLIENT_SDMMCWAB 0x67
 /* BPMP read client */
-- 
cgit 


From 2b48db01a066f738cf467876237587f1c4b6b7a2 Mon Sep 17 00:00:00 2001
From: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Date: Tue, 3 May 2022 16:14:40 +0200
Subject: dt-bindings: power: Add MediaTek Helio X10 MT6795 power domains

Add power domains dt-bindings for MediaTek Helio X10 (MT6795).

Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20220503141441.125852-2-angelogioacchino.delregno@collabora.com
Signed-off-by: Matthias Brugger <matthias.bgg@gmail.com>
---
 .../bindings/power/mediatek,power-controller.yaml        |  2 ++
 include/dt-bindings/power/mt6795-power.h                 | 16 ++++++++++++++++
 2 files changed, 18 insertions(+)
 create mode 100644 include/dt-bindings/power/mt6795-power.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml b/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml
index 135c6f722091..b448101fac43 100644
--- a/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml
+++ b/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml
@@ -23,6 +23,7 @@ properties:
 
   compatible:
     enum:
+      - mediatek,mt6795-power-controller
       - mediatek,mt8167-power-controller
       - mediatek,mt8173-power-controller
       - mediatek,mt8183-power-controller
@@ -62,6 +63,7 @@ patternProperties:
       reg:
         description: |
           Power domain index. Valid values are defined in:
+              "include/dt-bindings/power/mt6795-power.h" - for MT8167 type power domain.
               "include/dt-bindings/power/mt8167-power.h" - for MT8167 type power domain.
               "include/dt-bindings/power/mt8173-power.h" - for MT8173 type power domain.
               "include/dt-bindings/power/mt8183-power.h" - for MT8183 type power domain.
diff --git a/include/dt-bindings/power/mt6795-power.h b/include/dt-bindings/power/mt6795-power.h
new file mode 100644
index 000000000000..b0fc26cb1da4
--- /dev/null
+++ b/include/dt-bindings/power/mt6795-power.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+#ifndef _DT_BINDINGS_POWER_MT6795_POWER_H
+#define _DT_BINDINGS_POWER_MT6795_POWER_H
+
+#define MT6795_POWER_DOMAIN_MM		0
+#define MT6795_POWER_DOMAIN_VDEC	1
+#define MT6795_POWER_DOMAIN_VENC	2
+#define MT6795_POWER_DOMAIN_ISP		3
+#define MT6795_POWER_DOMAIN_MJC		4
+#define MT6795_POWER_DOMAIN_AUDIO	5
+#define MT6795_POWER_DOMAIN_MFG_ASYNC	6
+#define MT6795_POWER_DOMAIN_MFG_2D	7
+#define MT6795_POWER_DOMAIN_MFG		8
+#define MT6795_POWER_DOMAIN_MODEM	9
+
+#endif /* _DT_BINDINGS_POWER_MT6795_POWER_H */
-- 
cgit 


From f8d3da4ef8faf027261e06b7864583930dd7c7b9 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Wed, 6 Jul 2022 16:25:47 -0700
Subject: bpf: Add flags arg to bpf_dynptr_read and bpf_dynptr_write APIs

Commit 13bbbfbea759 ("bpf: Add bpf_dynptr_read and bpf_dynptr_write")
added the bpf_dynptr_write() and bpf_dynptr_read() APIs.

However, it will be needed for some dynptr types to pass in flags as
well (e.g. when writing to a skb, the user may like to invalidate the
hash or recompute the checksum).

This patch adds a "u64 flags" arg to the bpf_dynptr_read() and
bpf_dynptr_write() APIs before their UAPI signature freezes where
we then cannot change them anymore with a 5.19.x released kernel.

Fixes: 13bbbfbea759 ("bpf: Add bpf_dynptr_read and bpf_dynptr_write")
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/r/20220706232547.4016651-1-joannelkoong@gmail.com
---
 include/uapi/linux/bpf.h                           | 11 +++++++----
 kernel/bpf/helpers.c                               | 12 ++++++++----
 tools/include/uapi/linux/bpf.h                     | 11 +++++++----
 tools/testing/selftests/bpf/progs/dynptr_fail.c    | 10 +++++-----
 tools/testing/selftests/bpf/progs/dynptr_success.c |  4 ++--
 5 files changed, 29 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f4009dbdf62d..ef78e0e1a754 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5222,22 +5222,25 @@ union bpf_attr {
  *	Return
  *		Nothing. Always succeeds.
  *
- * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset)
+ * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags)
  *	Description
  *		Read *len* bytes from *src* into *dst*, starting from *offset*
  *		into *src*.
+ *		*flags* is currently unused.
  *	Return
  *		0 on success, -E2BIG if *offset* + *len* exceeds the length
- *		of *src*'s data, -EINVAL if *src* is an invalid dynptr.
+ *		of *src*'s data, -EINVAL if *src* is an invalid dynptr or if
+ *		*flags* is not 0.
  *
- * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len)
+ * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags)
  *	Description
  *		Write *len* bytes from *src* into *dst*, starting from *offset*
  *		into *dst*.
+ *		*flags* is currently unused.
  *	Return
  *		0 on success, -E2BIG if *offset* + *len* exceeds the length
  *		of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst*
- *		is a read-only dynptr.
+ *		is a read-only dynptr or if *flags* is not 0.
  *
  * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len)
  *	Description
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 225806a02efb..bb1254f07667 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1497,11 +1497,12 @@ const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
 	.arg4_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT,
 };
 
-BPF_CALL_4(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, u32, offset)
+BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src,
+	   u32, offset, u64, flags)
 {
 	int err;
 
-	if (!src->data)
+	if (!src->data || flags)
 		return -EINVAL;
 
 	err = bpf_dynptr_check_off_len(src, offset, len);
@@ -1521,13 +1522,15 @@ const struct bpf_func_proto bpf_dynptr_read_proto = {
 	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
 	.arg3_type	= ARG_PTR_TO_DYNPTR,
 	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
 };
 
-BPF_CALL_4(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len)
+BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
+	   u32, len, u64, flags)
 {
 	int err;
 
-	if (!dst->data || bpf_dynptr_is_rdonly(dst))
+	if (!dst->data || flags || bpf_dynptr_is_rdonly(dst))
 		return -EINVAL;
 
 	err = bpf_dynptr_check_off_len(dst, offset, len);
@@ -1547,6 +1550,7 @@ const struct bpf_func_proto bpf_dynptr_write_proto = {
 	.arg2_type	= ARG_ANYTHING,
 	.arg3_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
 	.arg4_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg5_type	= ARG_ANYTHING,
 };
 
 BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f4009dbdf62d..ef78e0e1a754 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5222,22 +5222,25 @@ union bpf_attr {
  *	Return
  *		Nothing. Always succeeds.
  *
- * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset)
+ * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags)
  *	Description
  *		Read *len* bytes from *src* into *dst*, starting from *offset*
  *		into *src*.
+ *		*flags* is currently unused.
  *	Return
  *		0 on success, -E2BIG if *offset* + *len* exceeds the length
- *		of *src*'s data, -EINVAL if *src* is an invalid dynptr.
+ *		of *src*'s data, -EINVAL if *src* is an invalid dynptr or if
+ *		*flags* is not 0.
  *
- * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len)
+ * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags)
  *	Description
  *		Write *len* bytes from *src* into *dst*, starting from *offset*
  *		into *dst*.
+ *		*flags* is currently unused.
  *	Return
  *		0 on success, -E2BIG if *offset* + *len* exceeds the length
  *		of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst*
- *		is a read-only dynptr.
+ *		is a read-only dynptr or if *flags* is not 0.
  *
  * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len)
  *	Description
diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c
index d811cff73597..0a26c243e6e9 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c
@@ -140,12 +140,12 @@ int use_after_invalid(void *ctx)
 
 	bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(read_data), 0, &ptr);
 
-	bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0);
+	bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0);
 
 	bpf_ringbuf_submit_dynptr(&ptr, 0);
 
 	/* this should fail */
-	bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0);
+	bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0);
 
 	return 0;
 }
@@ -338,7 +338,7 @@ int invalid_helper2(void *ctx)
 	get_map_val_dynptr(&ptr);
 
 	/* this should fail */
-	bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 8, 0);
+	bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 8, 0, 0);
 
 	return 0;
 }
@@ -377,7 +377,7 @@ int invalid_write2(void *ctx)
 	memcpy((void *)&ptr + 8, &x, sizeof(x));
 
 	/* this should fail */
-	bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0);
+	bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0);
 
 	bpf_ringbuf_submit_dynptr(&ptr, 0);
 
@@ -473,7 +473,7 @@ int invalid_read2(void *ctx)
 	get_map_val_dynptr(&ptr);
 
 	/* this should fail */
-	bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 1, 0);
+	bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 1, 0, 0);
 
 	return 0;
 }
diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c
index d67be48df4b2..a3a6103c8569 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_success.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_success.c
@@ -43,10 +43,10 @@ int test_read_write(void *ctx)
 	bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(write_data), 0, &ptr);
 
 	/* Write data into the dynptr */
-	err = err ?: bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data));
+	err = bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data), 0);
 
 	/* Read the data that was written into the dynptr */
-	err = err ?: bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0);
+	err = err ?: bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0);
 
 	/* Ensure the data we read matches the data we wrote */
 	for (i = 0; i < sizeof(read_data); i++) {
-- 
cgit 


From 6976ed0137d98c2ec0f11af8a01716e9f3af873d Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Fri, 1 Jul 2022 05:18:27 +0000
Subject: ASoC: audio-graph-card2.c: remove pre-alloced Codec2Codec space

Because Codec2Codec settings becomes optional, we don't need to keep
its parameter space when init time. This patch removes its default
memory allocation from simple-card-utils.c, and allocate it at
audio-graph-card2 ondemand.

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Link: https://lore.kernel.org/r/87edz5s95o.wl-kuninori.morimoto.gx@renesas.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/simple_card_utils.h     |  3 ---
 sound/soc/generic/audio-graph-card2.c | 10 +++++++---
 sound/soc/generic/simple-card-utils.c | 18 +-----------------
 3 files changed, 8 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/sound/simple_card_utils.h b/include/sound/simple_card_utils.h
index fe2337fde1f4..ab55f40896e0 100644
--- a/include/sound/simple_card_utils.h
+++ b/include/sound/simple_card_utils.h
@@ -51,7 +51,6 @@ struct prop_nums {
 	int cpus;
 	int codecs;
 	int platforms;
-	int c2c;
 };
 
 struct asoc_simple_priv {
@@ -64,7 +63,6 @@ struct asoc_simple_priv {
 		struct snd_soc_dai_link_component *platforms;
 		struct asoc_simple_data adata;
 		struct snd_soc_codec_conf *codec_conf;
-		struct snd_soc_pcm_stream *c2c_conf;
 		struct prop_nums num;
 		unsigned int mclk_fs;
 	} *dai_props;
@@ -75,7 +73,6 @@ struct asoc_simple_priv {
 	struct snd_soc_dai_link_component *dlcs;
 	struct snd_soc_dai_link_component dummy;
 	struct snd_soc_codec_conf *codec_conf;
-	struct snd_soc_pcm_stream *c2c_conf;
 	struct gpio_desc *pa_gpio;
 	const struct snd_soc_ops *ops;
 	unsigned int dpcm_selectable:1;
diff --git a/sound/soc/generic/audio-graph-card2.c b/sound/soc/generic/audio-graph-card2.c
index 510058c47a92..19e31d53422a 100644
--- a/sound/soc/generic/audio-graph-card2.c
+++ b/sound/soc/generic/audio-graph-card2.c
@@ -888,8 +888,12 @@ int audio_graph2_link_c2c(struct asoc_simple_priv *priv,
 	 */
 	of_property_read_u32(ports, "rate", &val);
 	if (val) {
-		struct simple_dai_props *dai_props = simple_priv_to_props(priv, li->link);
-		struct snd_soc_pcm_stream *c2c_conf = dai_props->c2c_conf;
+		struct device *dev = simple_priv_to_dev(priv);
+		struct snd_soc_pcm_stream *c2c_conf;
+
+		c2c_conf = devm_kzalloc(dev, sizeof(*c2c_conf), GFP_KERNEL);
+		if (!c2c_conf)
+			goto err1;
 
 		c2c_conf->formats	= SNDRV_PCM_FMTBIT_S32_LE; /* update ME */
 		c2c_conf->rates		= SNDRV_PCM_RATE_8000_384000;
@@ -930,6 +934,7 @@ err2:
 	of_node_put(ep1);
 	of_node_put(codec0_port);
 	of_node_put(codec1_port);
+err1:
 	of_node_put(ports);
 	of_node_put(port0);
 	of_node_put(port1);
@@ -1093,7 +1098,6 @@ static int graph_count_c2c(struct asoc_simple_priv *priv,
 	li->num[li->link].cpus		=
 	li->num[li->link].platforms	= graph_counter(codec0);
 	li->num[li->link].codecs	= graph_counter(codec1);
-	li->num[li->link].c2c		= 1;
 
 	of_node_put(ports);
 	of_node_put(port1);
diff --git a/sound/soc/generic/simple-card-utils.c b/sound/soc/generic/simple-card-utils.c
index 7be84c7840cb..a761af6b13b6 100644
--- a/sound/soc/generic/simple-card-utils.c
+++ b/sound/soc/generic/simple-card-utils.c
@@ -746,8 +746,7 @@ int asoc_simple_init_priv(struct asoc_simple_priv *priv,
 	struct asoc_simple_dai *dais;
 	struct snd_soc_dai_link_component *dlcs;
 	struct snd_soc_codec_conf *cconf = NULL;
-	struct snd_soc_pcm_stream *c2c_conf = NULL;
-	int i, dai_num = 0, dlc_num = 0, cnf_num = 0, c2c_num = 0;
+	int i, dai_num = 0, dlc_num = 0, cnf_num = 0;
 
 	dai_props = devm_kcalloc(dev, li->link, sizeof(*dai_props), GFP_KERNEL);
 	dai_link  = devm_kcalloc(dev, li->link, sizeof(*dai_link),  GFP_KERNEL);
@@ -766,8 +765,6 @@ int asoc_simple_init_priv(struct asoc_simple_priv *priv,
 
 		if (!li->num[i].cpus)
 			cnf_num += li->num[i].codecs;
-
-		c2c_num += li->num[i].c2c;
 	}
 
 	dais = devm_kcalloc(dev, dai_num, sizeof(*dais), GFP_KERNEL);
@@ -781,12 +778,6 @@ int asoc_simple_init_priv(struct asoc_simple_priv *priv,
 			return -ENOMEM;
 	}
 
-	if (c2c_num) {
-		c2c_conf = devm_kcalloc(dev, c2c_num, sizeof(*c2c_conf), GFP_KERNEL);
-		if (!c2c_conf)
-			return -ENOMEM;
-	}
-
 	dev_dbg(dev, "link %d, dais %d, ccnf %d\n",
 		li->link, dai_num, cnf_num);
 
@@ -800,7 +791,6 @@ int asoc_simple_init_priv(struct asoc_simple_priv *priv,
 	priv->dais		= dais;
 	priv->dlcs		= dlcs;
 	priv->codec_conf	= cconf;
-	priv->c2c_conf		= c2c_conf;
 
 	card->dai_link		= priv->dai_link;
 	card->num_links		= li->link;
@@ -818,12 +808,6 @@ int asoc_simple_init_priv(struct asoc_simple_priv *priv,
 
 			dlcs += li->num[i].cpus;
 			dais += li->num[i].cpus;
-
-			if (li->num[i].c2c) {
-				/* Codec2Codec */
-				dai_props[i].c2c_conf = c2c_conf;
-				c2c_conf += li->num[i].c2c;
-			}
 		} else {
 			/* DPCM Be's CPU = dummy */
 			dai_props[i].cpus	=
-- 
cgit 


From 820b8963adaea34a87abbecb906d1f54c0aabfb7 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 6 Jul 2022 10:50:40 -0400
Subject: net: sock: tracing: Fix sock_exceed_buf_limit not to dereference
 stale pointer

The trace event sock_exceed_buf_limit saves the prot->sysctl_mem pointer
and then dereferences it in the TP_printk() portion. This is unsafe as the
TP_printk() portion is executed at the time the buffer is read. That is,
it can be seconds, minutes, days, months, even years later. If the proto
is freed, then this dereference will can also lead to a kernel crash.

Instead, save the sysctl_mem array into the ring buffer and have the
TP_printk() reference that instead. This is the proper and safe way to
read pointers in trace events.

Link: https://lore.kernel.org/all/20220706052130.16368-12-kuniyu@amazon.com/

Cc: stable@vger.kernel.org
Fixes: 3847ce32aea9f ("core: add tracepoints for queueing skb to rcvbuf")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Acked-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/sock.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
index 12c315782766..777ee6cbe933 100644
--- a/include/trace/events/sock.h
+++ b/include/trace/events/sock.h
@@ -98,7 +98,7 @@ TRACE_EVENT(sock_exceed_buf_limit,
 
 	TP_STRUCT__entry(
 		__array(char, name, 32)
-		__field(long *, sysctl_mem)
+		__array(long, sysctl_mem, 3)
 		__field(long, allocated)
 		__field(int, sysctl_rmem)
 		__field(int, rmem_alloc)
@@ -110,7 +110,9 @@ TRACE_EVENT(sock_exceed_buf_limit,
 
 	TP_fast_assign(
 		strncpy(__entry->name, prot->name, 32);
-		__entry->sysctl_mem = prot->sysctl_mem;
+		__entry->sysctl_mem[0] = READ_ONCE(prot->sysctl_mem[0]);
+		__entry->sysctl_mem[1] = READ_ONCE(prot->sysctl_mem[1]);
+		__entry->sysctl_mem[2] = READ_ONCE(prot->sysctl_mem[2]);
 		__entry->allocated = allocated;
 		__entry->sysctl_rmem = sk_get_rmem0(sk, prot);
 		__entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
-- 
cgit 


From 310731e2f1611d1d13aae237abcf8e66d33345d5 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Wed, 6 Jul 2022 16:40:00 -0700
Subject: net: Fix data-races around sysctl_mem.

While reading .sysctl_mem, it can be changed concurrently.
So, we need to add READ_ONCE() to avoid data-races.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 72ca97ccb460..9fa54762e077 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1529,7 +1529,7 @@ void __sk_mem_reclaim(struct sock *sk, int amount);
 /* sysctl_mem values are in pages, we convert them in SK_MEM_QUANTUM units */
 static inline long sk_prot_mem_limits(const struct sock *sk, int index)
 {
-	long val = sk->sk_prot->sysctl_mem[index];
+	long val = READ_ONCE(sk->sk_prot->sysctl_mem[index]);
 
 #if PAGE_SIZE > SK_MEM_QUANTUM
 	val <<= PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT;
-- 
cgit 


From 55bfc376b8fb421a193fb422ca052235f023161b Mon Sep 17 00:00:00 2001
From: Qin Jian <qinjian@cqplus1.com>
Date: Tue, 28 Jun 2022 14:26:42 +0800
Subject: dt-bindings: reset: Add bindings for SP7021 reset driver

Add documentation to describe Sunplus SP7021 reset driver bindings.

Signed-off-by: Qin Jian <qinjian@cqplus1.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 .../devicetree/bindings/reset/sunplus,reset.yaml   | 38 ++++++++++
 MAINTAINERS                                        |  2 +
 include/dt-bindings/reset/sunplus,sp7021-reset.h   | 87 ++++++++++++++++++++++
 3 files changed, 127 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/reset/sunplus,reset.yaml
 create mode 100644 include/dt-bindings/reset/sunplus,sp7021-reset.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/reset/sunplus,reset.yaml b/Documentation/devicetree/bindings/reset/sunplus,reset.yaml
new file mode 100644
index 000000000000..f24646ba9761
--- /dev/null
+++ b/Documentation/devicetree/bindings/reset/sunplus,reset.yaml
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+# Copyright (C) Sunplus Co., Ltd. 2021
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/reset/sunplus,reset.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: Sunplus SoC Reset Controller
+
+maintainers:
+  - Qin Jian <qinjian@cqplus1.com>
+
+properties:
+  compatible:
+    const: sunplus,sp7021-reset
+
+  reg:
+    maxItems: 1
+
+  "#reset-cells":
+    const: 1
+
+required:
+  - compatible
+  - reg
+  - "#reset-cells"
+
+additionalProperties: false
+
+examples:
+  - |
+    rstc: reset@9c000054 {
+      compatible = "sunplus,sp7021-reset";
+      reg = <0x9c000054 0x28>;
+      #reset-cells = <1>;
+    };
+
+...
diff --git a/MAINTAINERS b/MAINTAINERS
index 3258a4d1aaf7..d24ef0b09ac6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2831,6 +2831,8 @@ L:	linux-arm-kernel@lists.infradead.org (moderated for mon-subscribers)
 S:	Maintained
 W:	https://sunplus-tibbo.atlassian.net/wiki/spaces/doc/overview
 F:	Documentation/devicetree/bindings/arm/sunplus,sp7021.yaml
+F:	Documentation/devicetree/bindings/reset/sunplus,reset.yaml
+F:	include/dt-bindings/reset/sunplus,sp7021-reset.h
 
 ARM/Synaptics SoC support
 M:	Jisheng Zhang <jszhang@kernel.org>
diff --git a/include/dt-bindings/reset/sunplus,sp7021-reset.h b/include/dt-bindings/reset/sunplus,sp7021-reset.h
new file mode 100644
index 000000000000..ab486707387f
--- /dev/null
+++ b/include/dt-bindings/reset/sunplus,sp7021-reset.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (C) Sunplus Technology Co., Ltd.
+ *       All rights reserved.
+ */
+#ifndef _DT_BINDINGS_RST_SUNPLUS_SP7021_H
+#define _DT_BINDINGS_RST_SUNPLUS_SP7021_H
+
+#define RST_SYSTEM              0
+#define RST_RTC                 1
+#define RST_IOCTL               2
+#define RST_IOP                 3
+#define RST_OTPRX               4
+#define RST_NOC                 5
+#define RST_BR                  6
+#define RST_RBUS_L00            7
+#define RST_SPIFL               8
+#define RST_SDCTRL0             9
+#define RST_PERI0               10
+#define RST_A926                11
+#define RST_UMCTL2              12
+#define RST_PERI1               13
+#define RST_DDR_PHY0            14
+#define RST_ACHIP               15
+#define RST_STC0                16
+#define RST_STC_AV0             17
+#define RST_STC_AV1             18
+#define RST_STC_AV2             19
+#define RST_UA0                 20
+#define RST_UA1                 21
+#define RST_UA2                 22
+#define RST_UA3                 23
+#define RST_UA4                 24
+#define RST_HWUA                25
+#define RST_DDC0                26
+#define RST_UADMA               27
+#define RST_CBDMA0              28
+#define RST_CBDMA1              29
+#define RST_SPI_COMBO_0         30
+#define RST_SPI_COMBO_1         31
+#define RST_SPI_COMBO_2         32
+#define RST_SPI_COMBO_3         33
+#define RST_AUD                 34
+#define RST_USBC0               35
+#define RST_USBC1               36
+#define RST_UPHY0               37
+#define RST_UPHY1               38
+#define RST_I2CM0               39
+#define RST_I2CM1               40
+#define RST_I2CM2               41
+#define RST_I2CM3               42
+#define RST_PMC                 43
+#define RST_CARD_CTL0           44
+#define RST_CARD_CTL1           45
+#define RST_CARD_CTL4           46
+#define RST_BCH                 47
+#define RST_DDFCH               48
+#define RST_CSIIW0              49
+#define RST_CSIIW1              50
+#define RST_MIPICSI0            51
+#define RST_MIPICSI1            52
+#define RST_HDMI_TX             53
+#define RST_VPOST               54
+#define RST_TGEN                55
+#define RST_DMIX                56
+#define RST_TCON                57
+#define RST_INTERRUPT           58
+#define RST_RGST                59
+#define RST_GPIO                60
+#define RST_RBUS_TOP            61
+#define RST_MAILBOX             62
+#define RST_SPIND               63
+#define RST_I2C2CBUS            64
+#define RST_SEC                 65
+#define RST_DVE                 66
+#define RST_GPOST0              67
+#define RST_OSD0                68
+#define RST_DISP_PWM            69
+#define RST_UADBG               70
+#define RST_DUMMY_MASTER        71
+#define RST_FIO_CTL             72
+#define RST_FPGA                73
+#define RST_L2SW                74
+#define RST_ICM                 75
+#define RST_AXI_GLOBAL          76
+
+#endif
-- 
cgit 


From 5543604a05a9dcc8972489c6051347aee17ac135 Mon Sep 17 00:00:00 2001
From: Qin Jian <qinjian@cqplus1.com>
Date: Tue, 28 Jun 2022 14:26:44 +0800
Subject: dt-bindings: clock: Add bindings for SP7021 clock driver

Add documentation to describe Sunplus SP7021 clock driver bindings.

Signed-off-by: Qin Jian <qinjian@cqplus1.com>
Reviewed-by: Stephen Boyd <sboyd@kernel.org>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 .../bindings/clock/sunplus,sp7021-clkc.yaml        | 52 +++++++++++++
 MAINTAINERS                                        |  2 +
 include/dt-bindings/clock/sunplus,sp7021-clkc.h    | 88 ++++++++++++++++++++++
 3 files changed, 142 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/sunplus,sp7021-clkc.yaml
 create mode 100644 include/dt-bindings/clock/sunplus,sp7021-clkc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/sunplus,sp7021-clkc.yaml b/Documentation/devicetree/bindings/clock/sunplus,sp7021-clkc.yaml
new file mode 100644
index 000000000000..bcc14088220a
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/sunplus,sp7021-clkc.yaml
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+# Copyright (C) Sunplus Co., Ltd. 2021
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/sunplus,sp7021-clkc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Sunplus SP7021 SoC Clock Controller
+
+maintainers:
+  - Qin Jian <qinjian@cqplus1.com>
+
+properties:
+  compatible:
+    const: sunplus,sp7021-clkc
+
+  reg:
+    maxItems: 3
+
+  clocks:
+    maxItems: 1
+
+  "#clock-cells":
+    const: 1
+
+required:
+  - compatible
+  - reg
+  - clocks
+  - "#clock-cells"
+
+additionalProperties: false
+
+examples:
+  - |
+    extclk: osc0 {
+      compatible = "fixed-clock";
+      #clock-cells = <0>;
+      clock-frequency = <27000000>;
+      clock-output-names = "extclk";
+    };
+
+    clkc: clock-controller@9c000004 {
+      compatible = "sunplus,sp7021-clkc";
+      reg = <0x9c000004 0x28>,
+            <0x9c000200 0x44>,
+            <0x9c000268 0x08>;
+      clocks = <&extclk>;
+      #clock-cells = <1>;
+    };
+
+...
diff --git a/MAINTAINERS b/MAINTAINERS
index db00f5e50ba4..28948357e4aa 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2831,8 +2831,10 @@ L:	linux-arm-kernel@lists.infradead.org (moderated for mon-subscribers)
 S:	Maintained
 W:	https://sunplus-tibbo.atlassian.net/wiki/spaces/doc/overview
 F:	Documentation/devicetree/bindings/arm/sunplus,sp7021.yaml
+F:	Documentation/devicetree/bindings/clock/sunplus,sp7021-clkc.yaml
 F:	Documentation/devicetree/bindings/reset/sunplus,reset.yaml
 F:	drivers/reset/reset-sunplus.c
+F:	include/dt-bindings/clock/sunplus,sp7021-clkc.h
 F:	include/dt-bindings/reset/sunplus,sp7021-reset.h
 
 ARM/Synaptics SoC support
diff --git a/include/dt-bindings/clock/sunplus,sp7021-clkc.h b/include/dt-bindings/clock/sunplus,sp7021-clkc.h
new file mode 100644
index 000000000000..cd84321eb2b5
--- /dev/null
+++ b/include/dt-bindings/clock/sunplus,sp7021-clkc.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (C) Sunplus Technology Co., Ltd.
+ *       All rights reserved.
+ */
+#ifndef _DT_BINDINGS_CLOCK_SUNPLUS_SP7021_H
+#define _DT_BINDINGS_CLOCK_SUNPLUS_SP7021_H
+
+/* gates */
+#define CLK_RTC         0
+#define CLK_OTPRX       1
+#define CLK_NOC         2
+#define CLK_BR          3
+#define CLK_SPIFL       4
+#define CLK_PERI0       5
+#define CLK_PERI1       6
+#define CLK_STC0        7
+#define CLK_STC_AV0     8
+#define CLK_STC_AV1     9
+#define CLK_STC_AV2     10
+#define CLK_UA0         11
+#define CLK_UA1         12
+#define CLK_UA2         13
+#define CLK_UA3         14
+#define CLK_UA4         15
+#define CLK_HWUA        16
+#define CLK_DDC0        17
+#define CLK_UADMA       18
+#define CLK_CBDMA0      19
+#define CLK_CBDMA1      20
+#define CLK_SPI_COMBO_0 21
+#define CLK_SPI_COMBO_1 22
+#define CLK_SPI_COMBO_2 23
+#define CLK_SPI_COMBO_3 24
+#define CLK_AUD         25
+#define CLK_USBC0       26
+#define CLK_USBC1       27
+#define CLK_UPHY0       28
+#define CLK_UPHY1       29
+#define CLK_I2CM0       30
+#define CLK_I2CM1       31
+#define CLK_I2CM2       32
+#define CLK_I2CM3       33
+#define CLK_PMC         34
+#define CLK_CARD_CTL0   35
+#define CLK_CARD_CTL1   36
+#define CLK_CARD_CTL4   37
+#define CLK_BCH         38
+#define CLK_DDFCH       39
+#define CLK_CSIIW0      40
+#define CLK_CSIIW1      41
+#define CLK_MIPICSI0    42
+#define CLK_MIPICSI1    43
+#define CLK_HDMI_TX     44
+#define CLK_VPOST       45
+#define CLK_TGEN        46
+#define CLK_DMIX        47
+#define CLK_TCON        48
+#define CLK_GPIO        49
+#define CLK_MAILBOX     50
+#define CLK_SPIND       51
+#define CLK_I2C2CBUS    52
+#define CLK_SEC         53
+#define CLK_DVE         54
+#define CLK_GPOST0      55
+#define CLK_OSD0        56
+#define CLK_DISP_PWM    57
+#define CLK_UADBG       58
+#define CLK_FIO_CTL     59
+#define CLK_FPGA        60
+#define CLK_L2SW        61
+#define CLK_ICM         62
+#define CLK_AXI_GLOBAL  63
+
+/* plls */
+#define PLL_A           64
+#define PLL_E           65
+#define PLL_E_2P5       66
+#define PLL_E_25        67
+#define PLL_E_112P5     68
+#define PLL_F           69
+#define PLL_TV          70
+#define PLL_TV_A        71
+#define PLL_SYS         72
+
+#define CLK_MAX         73
+
+#endif
-- 
cgit 


From 7963d4d710112bc457f99bdb56608211e561190e Mon Sep 17 00:00:00 2001
From: Xin Ji <xji@analogixsemi.com>
Date: Wed, 6 Jul 2022 16:34:31 +0800
Subject: usb: typec: tcpci: move tcpci.h to include/linux/usb/

USB PD controllers which consisting of a microcontroller (acting as the TCPM)
and a port controller (TCPC) - may require that the driver for the PD
controller accesses directly also the on-chip port controller in some cases.

Move tcpci.h to include/linux/usb/ is convenience access TCPC registers.

Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Xin Ji <xji@analogixsemi.com>

Link: https://lore.kernel.org/r/20220706083433.2415524-1-xji@analogixsemi.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/tcpm/tcpci.c         |   3 +-
 drivers/usb/typec/tcpm/tcpci.h         | 209 --------------------------------
 drivers/usb/typec/tcpm/tcpci_maxim.c   |   3 +-
 drivers/usb/typec/tcpm/tcpci_mt6360.c  |   3 +-
 drivers/usb/typec/tcpm/tcpci_rt1711h.c |   2 +-
 include/linux/usb/tcpci.h              | 210 +++++++++++++++++++++++++++++++++
 6 files changed, 214 insertions(+), 216 deletions(-)
 delete mode 100644 drivers/usb/typec/tcpm/tcpci.h
 create mode 100644 include/linux/usb/tcpci.h

(limited to 'include')

diff --git a/drivers/usb/typec/tcpm/tcpci.c b/drivers/usb/typec/tcpm/tcpci.c
index f33e08eb7670..812784702d53 100644
--- a/drivers/usb/typec/tcpm/tcpci.c
+++ b/drivers/usb/typec/tcpm/tcpci.c
@@ -13,11 +13,10 @@
 #include <linux/property.h>
 #include <linux/regmap.h>
 #include <linux/usb/pd.h>
+#include <linux/usb/tcpci.h>
 #include <linux/usb/tcpm.h>
 #include <linux/usb/typec.h>
 
-#include "tcpci.h"
-
 #define	PD_RETRY_COUNT_DEFAULT			3
 #define	PD_RETRY_COUNT_3_0_OR_HIGHER		2
 #define	AUTO_DISCHARGE_DEFAULT_THRESHOLD_MV	3500
diff --git a/drivers/usb/typec/tcpm/tcpci.h b/drivers/usb/typec/tcpm/tcpci.h
deleted file mode 100644
index b2edd45f13c6..000000000000
--- a/drivers/usb/typec/tcpm/tcpci.h
+++ /dev/null
@@ -1,209 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * Copyright 2015-2017 Google, Inc
- *
- * USB Type-C Port Controller Interface.
- */
-
-#ifndef __LINUX_USB_TCPCI_H
-#define __LINUX_USB_TCPCI_H
-
-#include <linux/usb/typec.h>
-
-#define TCPC_VENDOR_ID			0x0
-#define TCPC_PRODUCT_ID			0x2
-#define TCPC_BCD_DEV			0x4
-#define TCPC_TC_REV			0x6
-#define TCPC_PD_REV			0x8
-#define TCPC_PD_INT_REV			0xa
-
-#define TCPC_ALERT			0x10
-#define TCPC_ALERT_EXTND		BIT(14)
-#define TCPC_ALERT_EXTENDED_STATUS	BIT(13)
-#define TCPC_ALERT_VBUS_DISCNCT		BIT(11)
-#define TCPC_ALERT_RX_BUF_OVF		BIT(10)
-#define TCPC_ALERT_FAULT		BIT(9)
-#define TCPC_ALERT_V_ALARM_LO		BIT(8)
-#define TCPC_ALERT_V_ALARM_HI		BIT(7)
-#define TCPC_ALERT_TX_SUCCESS		BIT(6)
-#define TCPC_ALERT_TX_DISCARDED		BIT(5)
-#define TCPC_ALERT_TX_FAILED		BIT(4)
-#define TCPC_ALERT_RX_HARD_RST		BIT(3)
-#define TCPC_ALERT_RX_STATUS		BIT(2)
-#define TCPC_ALERT_POWER_STATUS		BIT(1)
-#define TCPC_ALERT_CC_STATUS		BIT(0)
-
-#define TCPC_ALERT_MASK			0x12
-#define TCPC_POWER_STATUS_MASK		0x14
-#define TCPC_FAULT_STATUS_MASK		0x15
-
-#define TCPC_EXTENDED_STATUS_MASK		0x16
-#define TCPC_EXTENDED_STATUS_MASK_VSAFE0V	BIT(0)
-
-#define TCPC_ALERT_EXTENDED_MASK	0x17
-#define TCPC_SINK_FAST_ROLE_SWAP	BIT(0)
-
-#define TCPC_CONFIG_STD_OUTPUT		0x18
-
-#define TCPC_TCPC_CTRL			0x19
-#define TCPC_TCPC_CTRL_ORIENTATION	BIT(0)
-#define PLUG_ORNT_CC1			0
-#define PLUG_ORNT_CC2			1
-#define TCPC_TCPC_CTRL_BIST_TM		BIT(1)
-#define TCPC_TCPC_CTRL_EN_LK4CONN_ALRT	BIT(6)
-
-#define TCPC_EXTENDED_STATUS		0x20
-#define TCPC_EXTENDED_STATUS_VSAFE0V	BIT(0)
-
-#define TCPC_ROLE_CTRL			0x1a
-#define TCPC_ROLE_CTRL_DRP		BIT(6)
-#define TCPC_ROLE_CTRL_RP_VAL_SHIFT	4
-#define TCPC_ROLE_CTRL_RP_VAL_MASK	0x3
-#define TCPC_ROLE_CTRL_RP_VAL_DEF	0x0
-#define TCPC_ROLE_CTRL_RP_VAL_1_5	0x1
-#define TCPC_ROLE_CTRL_RP_VAL_3_0	0x2
-#define TCPC_ROLE_CTRL_CC2_SHIFT	2
-#define TCPC_ROLE_CTRL_CC2_MASK		0x3
-#define TCPC_ROLE_CTRL_CC1_SHIFT	0
-#define TCPC_ROLE_CTRL_CC1_MASK		0x3
-#define TCPC_ROLE_CTRL_CC_RA		0x0
-#define TCPC_ROLE_CTRL_CC_RP		0x1
-#define TCPC_ROLE_CTRL_CC_RD		0x2
-#define TCPC_ROLE_CTRL_CC_OPEN		0x3
-
-#define TCPC_FAULT_CTRL			0x1b
-
-#define TCPC_POWER_CTRL			0x1c
-#define TCPC_POWER_CTRL_VCONN_ENABLE	BIT(0)
-#define TCPC_POWER_CTRL_BLEED_DISCHARGE	BIT(3)
-#define TCPC_POWER_CTRL_AUTO_DISCHARGE	BIT(4)
-#define TCPC_DIS_VOLT_ALRM		BIT(5)
-#define TCPC_POWER_CTRL_VBUS_VOLT_MON	BIT(6)
-#define TCPC_FAST_ROLE_SWAP_EN		BIT(7)
-
-#define TCPC_CC_STATUS			0x1d
-#define TCPC_CC_STATUS_TOGGLING		BIT(5)
-#define TCPC_CC_STATUS_TERM		BIT(4)
-#define TCPC_CC_STATUS_TERM_RP		0
-#define TCPC_CC_STATUS_TERM_RD		1
-#define TCPC_CC_STATE_SRC_OPEN		0
-#define TCPC_CC_STATUS_CC2_SHIFT	2
-#define TCPC_CC_STATUS_CC2_MASK		0x3
-#define TCPC_CC_STATUS_CC1_SHIFT	0
-#define TCPC_CC_STATUS_CC1_MASK		0x3
-
-#define TCPC_POWER_STATUS		0x1e
-#define TCPC_POWER_STATUS_DBG_ACC_CON	BIT(7)
-#define TCPC_POWER_STATUS_UNINIT	BIT(6)
-#define TCPC_POWER_STATUS_SOURCING_VBUS	BIT(4)
-#define TCPC_POWER_STATUS_VBUS_DET	BIT(3)
-#define TCPC_POWER_STATUS_VBUS_PRES	BIT(2)
-#define TCPC_POWER_STATUS_VCONN_PRES	BIT(1)
-#define TCPC_POWER_STATUS_SINKING_VBUS	BIT(0)
-
-#define TCPC_FAULT_STATUS		0x1f
-
-#define TCPC_ALERT_EXTENDED		0x21
-
-#define TCPC_COMMAND			0x23
-#define TCPC_CMD_WAKE_I2C		0x11
-#define TCPC_CMD_DISABLE_VBUS_DETECT	0x22
-#define TCPC_CMD_ENABLE_VBUS_DETECT	0x33
-#define TCPC_CMD_DISABLE_SINK_VBUS	0x44
-#define TCPC_CMD_SINK_VBUS		0x55
-#define TCPC_CMD_DISABLE_SRC_VBUS	0x66
-#define TCPC_CMD_SRC_VBUS_DEFAULT	0x77
-#define TCPC_CMD_SRC_VBUS_HIGH		0x88
-#define TCPC_CMD_LOOK4CONNECTION	0x99
-#define TCPC_CMD_RXONEMORE		0xAA
-#define TCPC_CMD_I2C_IDLE		0xFF
-
-#define TCPC_DEV_CAP_1			0x24
-#define TCPC_DEV_CAP_2			0x26
-#define TCPC_STD_INPUT_CAP		0x28
-#define TCPC_STD_OUTPUT_CAP		0x29
-
-#define TCPC_MSG_HDR_INFO		0x2e
-#define TCPC_MSG_HDR_INFO_DATA_ROLE	BIT(3)
-#define TCPC_MSG_HDR_INFO_PWR_ROLE	BIT(0)
-#define TCPC_MSG_HDR_INFO_REV_SHIFT	1
-#define TCPC_MSG_HDR_INFO_REV_MASK	0x3
-
-#define TCPC_RX_DETECT			0x2f
-#define TCPC_RX_DETECT_HARD_RESET	BIT(5)
-#define TCPC_RX_DETECT_SOP		BIT(0)
-#define TCPC_RX_DETECT_SOP1		BIT(1)
-#define TCPC_RX_DETECT_SOP2		BIT(2)
-#define TCPC_RX_DETECT_DBG1		BIT(3)
-#define TCPC_RX_DETECT_DBG2		BIT(4)
-
-#define TCPC_RX_BYTE_CNT		0x30
-#define TCPC_RX_BUF_FRAME_TYPE		0x31
-#define TCPC_RX_BUF_FRAME_TYPE_SOP	0
-#define TCPC_RX_HDR			0x32
-#define TCPC_RX_DATA			0x34 /* through 0x4f */
-
-#define TCPC_TRANSMIT			0x50
-#define TCPC_TRANSMIT_RETRY_SHIFT	4
-#define TCPC_TRANSMIT_RETRY_MASK	0x3
-#define TCPC_TRANSMIT_TYPE_SHIFT	0
-#define TCPC_TRANSMIT_TYPE_MASK		0x7
-
-#define TCPC_TX_BYTE_CNT		0x51
-#define TCPC_TX_HDR			0x52
-#define TCPC_TX_DATA			0x54 /* through 0x6f */
-
-#define TCPC_VBUS_VOLTAGE			0x70
-#define TCPC_VBUS_VOLTAGE_MASK			0x3ff
-#define TCPC_VBUS_VOLTAGE_LSB_MV		25
-#define TCPC_VBUS_SINK_DISCONNECT_THRESH	0x72
-#define TCPC_VBUS_SINK_DISCONNECT_THRESH_LSB_MV	25
-#define TCPC_VBUS_SINK_DISCONNECT_THRESH_MAX	0x3ff
-#define TCPC_VBUS_STOP_DISCHARGE_THRESH		0x74
-#define TCPC_VBUS_VOLTAGE_ALARM_HI_CFG		0x76
-#define TCPC_VBUS_VOLTAGE_ALARM_LO_CFG		0x78
-
-/* I2C_WRITE_BYTE_COUNT + 1 when TX_BUF_BYTE_x is only accessible I2C_WRITE_BYTE_COUNT */
-#define TCPC_TRANSMIT_BUFFER_MAX_LEN		31
-
-struct tcpci;
-
-/*
- * @TX_BUF_BYTE_x_hidden:
- *		optional; Set when TX_BUF_BYTE_x can only be accessed through I2C_WRITE_BYTE_COUNT.
- * @frs_sourcing_vbus:
- *		Optional; Callback to perform chip specific operations when FRS
- *		is sourcing vbus.
- * @auto_discharge_disconnect:
- *		Optional; Enables TCPC to autonously discharge vbus on disconnect.
- * @vbus_vsafe0v:
- *		optional; Set when TCPC can detect whether vbus is at VSAFE0V.
- * @set_partner_usb_comm_capable:
- *		Optional; The USB Communications Capable bit indicates if port
- *		partner is capable of communication over the USB data lines
- *		(e.g. D+/- or SS Tx/Rx). Called to notify the status of the bit.
- */
-struct tcpci_data {
-	struct regmap *regmap;
-	unsigned char TX_BUF_BYTE_x_hidden:1;
-	unsigned char auto_discharge_disconnect:1;
-	unsigned char vbus_vsafe0v:1;
-
-	int (*init)(struct tcpci *tcpci, struct tcpci_data *data);
-	int (*set_vconn)(struct tcpci *tcpci, struct tcpci_data *data,
-			 bool enable);
-	int (*start_drp_toggling)(struct tcpci *tcpci, struct tcpci_data *data,
-				  enum typec_cc_status cc);
-	int (*set_vbus)(struct tcpci *tcpci, struct tcpci_data *data, bool source, bool sink);
-	void (*frs_sourcing_vbus)(struct tcpci *tcpci, struct tcpci_data *data);
-	void (*set_partner_usb_comm_capable)(struct tcpci *tcpci, struct tcpci_data *data,
-					     bool capable);
-};
-
-struct tcpci *tcpci_register_port(struct device *dev, struct tcpci_data *data);
-void tcpci_unregister_port(struct tcpci *tcpci);
-irqreturn_t tcpci_irq(struct tcpci *tcpci);
-
-struct tcpm_port;
-struct tcpm_port *tcpci_get_tcpm_port(struct tcpci *tcpci);
-#endif /* __LINUX_USB_TCPCI_H */
diff --git a/drivers/usb/typec/tcpm/tcpci_maxim.c b/drivers/usb/typec/tcpm/tcpci_maxim.c
index df2505570f07..4b6705f3d7b7 100644
--- a/drivers/usb/typec/tcpm/tcpci_maxim.c
+++ b/drivers/usb/typec/tcpm/tcpci_maxim.c
@@ -11,11 +11,10 @@
 #include <linux/module.h>
 #include <linux/regmap.h>
 #include <linux/usb/pd.h>
+#include <linux/usb/tcpci.h>
 #include <linux/usb/tcpm.h>
 #include <linux/usb/typec.h>
 
-#include "tcpci.h"
-
 #define PD_ACTIVITY_TIMEOUT_MS				10000
 
 #define TCPC_VENDOR_ALERT				0x80
diff --git a/drivers/usb/typec/tcpm/tcpci_mt6360.c b/drivers/usb/typec/tcpm/tcpci_mt6360.c
index 8a952eaf9016..1b7c31278ebb 100644
--- a/drivers/usb/typec/tcpm/tcpci_mt6360.c
+++ b/drivers/usb/typec/tcpm/tcpci_mt6360.c
@@ -11,10 +11,9 @@
 #include <linux/of.h>
 #include <linux/platform_device.h>
 #include <linux/regmap.h>
+#include <linux/usb/tcpci.h>
 #include <linux/usb/tcpm.h>
 
-#include "tcpci.h"
-
 #define MT6360_REG_PHYCTRL1	0x80
 #define MT6360_REG_PHYCTRL3	0x82
 #define MT6360_REG_PHYCTRL7	0x86
diff --git a/drivers/usb/typec/tcpm/tcpci_rt1711h.c b/drivers/usb/typec/tcpm/tcpci_rt1711h.c
index b56a0880a044..3291ca4948da 100644
--- a/drivers/usb/typec/tcpm/tcpci_rt1711h.c
+++ b/drivers/usb/typec/tcpm/tcpci_rt1711h.c
@@ -10,9 +10,9 @@
 #include <linux/i2c.h>
 #include <linux/interrupt.h>
 #include <linux/gpio/consumer.h>
+#include <linux/usb/tcpci.h>
 #include <linux/usb/tcpm.h>
 #include <linux/regmap.h>
-#include "tcpci.h"
 
 #define RT1711H_VID		0x29CF
 #define RT1711H_PID		0x1711
diff --git a/include/linux/usb/tcpci.h b/include/linux/usb/tcpci.h
new file mode 100644
index 000000000000..20c0bedb8ec8
--- /dev/null
+++ b/include/linux/usb/tcpci.h
@@ -0,0 +1,210 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright 2015-2017 Google, Inc
+ *
+ * USB Type-C Port Controller Interface.
+ */
+
+#ifndef __LINUX_USB_TCPCI_H
+#define __LINUX_USB_TCPCI_H
+
+#include <linux/usb/typec.h>
+#include <linux/usb/tcpm.h>
+
+#define TCPC_VENDOR_ID			0x0
+#define TCPC_PRODUCT_ID			0x2
+#define TCPC_BCD_DEV			0x4
+#define TCPC_TC_REV			0x6
+#define TCPC_PD_REV			0x8
+#define TCPC_PD_INT_REV			0xa
+
+#define TCPC_ALERT			0x10
+#define TCPC_ALERT_EXTND		BIT(14)
+#define TCPC_ALERT_EXTENDED_STATUS	BIT(13)
+#define TCPC_ALERT_VBUS_DISCNCT		BIT(11)
+#define TCPC_ALERT_RX_BUF_OVF		BIT(10)
+#define TCPC_ALERT_FAULT		BIT(9)
+#define TCPC_ALERT_V_ALARM_LO		BIT(8)
+#define TCPC_ALERT_V_ALARM_HI		BIT(7)
+#define TCPC_ALERT_TX_SUCCESS		BIT(6)
+#define TCPC_ALERT_TX_DISCARDED		BIT(5)
+#define TCPC_ALERT_TX_FAILED		BIT(4)
+#define TCPC_ALERT_RX_HARD_RST		BIT(3)
+#define TCPC_ALERT_RX_STATUS		BIT(2)
+#define TCPC_ALERT_POWER_STATUS		BIT(1)
+#define TCPC_ALERT_CC_STATUS		BIT(0)
+
+#define TCPC_ALERT_MASK			0x12
+#define TCPC_POWER_STATUS_MASK		0x14
+#define TCPC_FAULT_STATUS_MASK		0x15
+
+#define TCPC_EXTENDED_STATUS_MASK		0x16
+#define TCPC_EXTENDED_STATUS_MASK_VSAFE0V	BIT(0)
+
+#define TCPC_ALERT_EXTENDED_MASK	0x17
+#define TCPC_SINK_FAST_ROLE_SWAP	BIT(0)
+
+#define TCPC_CONFIG_STD_OUTPUT		0x18
+
+#define TCPC_TCPC_CTRL			0x19
+#define TCPC_TCPC_CTRL_ORIENTATION	BIT(0)
+#define PLUG_ORNT_CC1			0
+#define PLUG_ORNT_CC2			1
+#define TCPC_TCPC_CTRL_BIST_TM		BIT(1)
+#define TCPC_TCPC_CTRL_EN_LK4CONN_ALRT	BIT(6)
+
+#define TCPC_EXTENDED_STATUS		0x20
+#define TCPC_EXTENDED_STATUS_VSAFE0V	BIT(0)
+
+#define TCPC_ROLE_CTRL			0x1a
+#define TCPC_ROLE_CTRL_DRP		BIT(6)
+#define TCPC_ROLE_CTRL_RP_VAL_SHIFT	4
+#define TCPC_ROLE_CTRL_RP_VAL_MASK	0x3
+#define TCPC_ROLE_CTRL_RP_VAL_DEF	0x0
+#define TCPC_ROLE_CTRL_RP_VAL_1_5	0x1
+#define TCPC_ROLE_CTRL_RP_VAL_3_0	0x2
+#define TCPC_ROLE_CTRL_CC2_SHIFT	2
+#define TCPC_ROLE_CTRL_CC2_MASK		0x3
+#define TCPC_ROLE_CTRL_CC1_SHIFT	0
+#define TCPC_ROLE_CTRL_CC1_MASK		0x3
+#define TCPC_ROLE_CTRL_CC_RA		0x0
+#define TCPC_ROLE_CTRL_CC_RP		0x1
+#define TCPC_ROLE_CTRL_CC_RD		0x2
+#define TCPC_ROLE_CTRL_CC_OPEN		0x3
+
+#define TCPC_FAULT_CTRL			0x1b
+
+#define TCPC_POWER_CTRL			0x1c
+#define TCPC_POWER_CTRL_VCONN_ENABLE	BIT(0)
+#define TCPC_POWER_CTRL_BLEED_DISCHARGE	BIT(3)
+#define TCPC_POWER_CTRL_AUTO_DISCHARGE	BIT(4)
+#define TCPC_DIS_VOLT_ALRM		BIT(5)
+#define TCPC_POWER_CTRL_VBUS_VOLT_MON	BIT(6)
+#define TCPC_FAST_ROLE_SWAP_EN		BIT(7)
+
+#define TCPC_CC_STATUS			0x1d
+#define TCPC_CC_STATUS_TOGGLING		BIT(5)
+#define TCPC_CC_STATUS_TERM		BIT(4)
+#define TCPC_CC_STATUS_TERM_RP		0
+#define TCPC_CC_STATUS_TERM_RD		1
+#define TCPC_CC_STATE_SRC_OPEN		0
+#define TCPC_CC_STATUS_CC2_SHIFT	2
+#define TCPC_CC_STATUS_CC2_MASK		0x3
+#define TCPC_CC_STATUS_CC1_SHIFT	0
+#define TCPC_CC_STATUS_CC1_MASK		0x3
+
+#define TCPC_POWER_STATUS		0x1e
+#define TCPC_POWER_STATUS_DBG_ACC_CON	BIT(7)
+#define TCPC_POWER_STATUS_UNINIT	BIT(6)
+#define TCPC_POWER_STATUS_SOURCING_VBUS	BIT(4)
+#define TCPC_POWER_STATUS_VBUS_DET	BIT(3)
+#define TCPC_POWER_STATUS_VBUS_PRES	BIT(2)
+#define TCPC_POWER_STATUS_VCONN_PRES	BIT(1)
+#define TCPC_POWER_STATUS_SINKING_VBUS	BIT(0)
+
+#define TCPC_FAULT_STATUS		0x1f
+
+#define TCPC_ALERT_EXTENDED		0x21
+
+#define TCPC_COMMAND			0x23
+#define TCPC_CMD_WAKE_I2C		0x11
+#define TCPC_CMD_DISABLE_VBUS_DETECT	0x22
+#define TCPC_CMD_ENABLE_VBUS_DETECT	0x33
+#define TCPC_CMD_DISABLE_SINK_VBUS	0x44
+#define TCPC_CMD_SINK_VBUS		0x55
+#define TCPC_CMD_DISABLE_SRC_VBUS	0x66
+#define TCPC_CMD_SRC_VBUS_DEFAULT	0x77
+#define TCPC_CMD_SRC_VBUS_HIGH		0x88
+#define TCPC_CMD_LOOK4CONNECTION	0x99
+#define TCPC_CMD_RXONEMORE		0xAA
+#define TCPC_CMD_I2C_IDLE		0xFF
+
+#define TCPC_DEV_CAP_1			0x24
+#define TCPC_DEV_CAP_2			0x26
+#define TCPC_STD_INPUT_CAP		0x28
+#define TCPC_STD_OUTPUT_CAP		0x29
+
+#define TCPC_MSG_HDR_INFO		0x2e
+#define TCPC_MSG_HDR_INFO_DATA_ROLE	BIT(3)
+#define TCPC_MSG_HDR_INFO_PWR_ROLE	BIT(0)
+#define TCPC_MSG_HDR_INFO_REV_SHIFT	1
+#define TCPC_MSG_HDR_INFO_REV_MASK	0x3
+
+#define TCPC_RX_DETECT			0x2f
+#define TCPC_RX_DETECT_HARD_RESET	BIT(5)
+#define TCPC_RX_DETECT_SOP		BIT(0)
+#define TCPC_RX_DETECT_SOP1		BIT(1)
+#define TCPC_RX_DETECT_SOP2		BIT(2)
+#define TCPC_RX_DETECT_DBG1		BIT(3)
+#define TCPC_RX_DETECT_DBG2		BIT(4)
+
+#define TCPC_RX_BYTE_CNT		0x30
+#define TCPC_RX_BUF_FRAME_TYPE		0x31
+#define TCPC_RX_BUF_FRAME_TYPE_SOP	0
+#define TCPC_RX_HDR			0x32
+#define TCPC_RX_DATA			0x34 /* through 0x4f */
+
+#define TCPC_TRANSMIT			0x50
+#define TCPC_TRANSMIT_RETRY_SHIFT	4
+#define TCPC_TRANSMIT_RETRY_MASK	0x3
+#define TCPC_TRANSMIT_TYPE_SHIFT	0
+#define TCPC_TRANSMIT_TYPE_MASK		0x7
+
+#define TCPC_TX_BYTE_CNT		0x51
+#define TCPC_TX_HDR			0x52
+#define TCPC_TX_DATA			0x54 /* through 0x6f */
+
+#define TCPC_VBUS_VOLTAGE			0x70
+#define TCPC_VBUS_VOLTAGE_MASK			0x3ff
+#define TCPC_VBUS_VOLTAGE_LSB_MV		25
+#define TCPC_VBUS_SINK_DISCONNECT_THRESH	0x72
+#define TCPC_VBUS_SINK_DISCONNECT_THRESH_LSB_MV	25
+#define TCPC_VBUS_SINK_DISCONNECT_THRESH_MAX	0x3ff
+#define TCPC_VBUS_STOP_DISCHARGE_THRESH		0x74
+#define TCPC_VBUS_VOLTAGE_ALARM_HI_CFG		0x76
+#define TCPC_VBUS_VOLTAGE_ALARM_LO_CFG		0x78
+
+/* I2C_WRITE_BYTE_COUNT + 1 when TX_BUF_BYTE_x is only accessible I2C_WRITE_BYTE_COUNT */
+#define TCPC_TRANSMIT_BUFFER_MAX_LEN		31
+
+struct tcpci;
+
+/*
+ * @TX_BUF_BYTE_x_hidden:
+ *		optional; Set when TX_BUF_BYTE_x can only be accessed through I2C_WRITE_BYTE_COUNT.
+ * @frs_sourcing_vbus:
+ *		Optional; Callback to perform chip specific operations when FRS
+ *		is sourcing vbus.
+ * @auto_discharge_disconnect:
+ *		Optional; Enables TCPC to autonously discharge vbus on disconnect.
+ * @vbus_vsafe0v:
+ *		optional; Set when TCPC can detect whether vbus is at VSAFE0V.
+ * @set_partner_usb_comm_capable:
+ *		Optional; The USB Communications Capable bit indicates if port
+ *		partner is capable of communication over the USB data lines
+ *		(e.g. D+/- or SS Tx/Rx). Called to notify the status of the bit.
+ */
+struct tcpci_data {
+	struct regmap *regmap;
+	unsigned char TX_BUF_BYTE_x_hidden:1;
+	unsigned char auto_discharge_disconnect:1;
+	unsigned char vbus_vsafe0v:1;
+
+	int (*init)(struct tcpci *tcpci, struct tcpci_data *data);
+	int (*set_vconn)(struct tcpci *tcpci, struct tcpci_data *data,
+			 bool enable);
+	int (*start_drp_toggling)(struct tcpci *tcpci, struct tcpci_data *data,
+				  enum typec_cc_status cc);
+	int (*set_vbus)(struct tcpci *tcpci, struct tcpci_data *data, bool source, bool sink);
+	void (*frs_sourcing_vbus)(struct tcpci *tcpci, struct tcpci_data *data);
+	void (*set_partner_usb_comm_capable)(struct tcpci *tcpci, struct tcpci_data *data,
+					     bool capable);
+};
+
+struct tcpci *tcpci_register_port(struct device *dev, struct tcpci_data *data);
+void tcpci_unregister_port(struct tcpci *tcpci);
+irqreturn_t tcpci_irq(struct tcpci *tcpci);
+
+struct tcpm_port;
+struct tcpm_port *tcpci_get_tcpm_port(struct tcpci *tcpci);
+#endif /* __LINUX_USB_TCPCI_H */
-- 
cgit 


From 620e8e8ba6210003fa45081b63d90ef4f3a0637f Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <mka@chromium.org>
Date: Thu, 30 Jun 2022 12:35:27 -0700
Subject: of/platform: Add stubs for of_platform_device_create/destroy()

Code for platform_device_create() and of_platform_device_destroy() is
only generated if CONFIG_OF_ADDRESS=y. Add stubs to avoid unresolved
symbols when CONFIG_OF_ADDRESS is not set.

Reviewed-by: Stephen Boyd <swboyd@chromium.org>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Link: https://lore.kernel.org/r/20220630123445.v24.1.I08fd2e1c775af04f663730e9fb4d00e6bbb38541@changeid
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/of_platform.h | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
index 84a966623e78..d15b6cd5e1c3 100644
--- a/include/linux/of_platform.h
+++ b/include/linux/of_platform.h
@@ -61,16 +61,18 @@ static inline struct platform_device *of_find_device_by_node(struct device_node
 }
 #endif
 
+extern int of_platform_bus_probe(struct device_node *root,
+				 const struct of_device_id *matches,
+				 struct device *parent);
+
+#ifdef CONFIG_OF_ADDRESS
 /* Platform devices and busses creation */
 extern struct platform_device *of_platform_device_create(struct device_node *np,
 						   const char *bus_id,
 						   struct device *parent);
 
 extern int of_platform_device_destroy(struct device *dev, void *data);
-extern int of_platform_bus_probe(struct device_node *root,
-				 const struct of_device_id *matches,
-				 struct device *parent);
-#ifdef CONFIG_OF_ADDRESS
+
 extern int of_platform_populate(struct device_node *root,
 				const struct of_device_id *matches,
 				const struct of_dev_auxdata *lookup,
@@ -84,6 +86,18 @@ extern int devm_of_platform_populate(struct device *dev);
 
 extern void devm_of_platform_depopulate(struct device *dev);
 #else
+/* Platform devices and busses creation */
+static inline struct platform_device *of_platform_device_create(struct device_node *np,
+								const char *bus_id,
+								struct device *parent)
+{
+	return NULL;
+}
+static inline int of_platform_device_destroy(struct device *dev, void *data)
+{
+	return -ENODEV;
+}
+
 static inline int of_platform_populate(struct device_node *root,
 					const struct of_device_id *matches,
 					const struct of_dev_auxdata *lookup,
-- 
cgit 


From 8bc063641cebf9d555e41d135db2b5035b521768 Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <mka@chromium.org>
Date: Thu, 30 Jun 2022 12:35:29 -0700
Subject: usb: misc: Add onboard_usb_hub driver

The main issue this driver addresses is that a USB hub needs to be
powered before it can be discovered. For discrete onboard hubs (an
example for such a hub is the Realtek RTS5411) this is often solved
by supplying the hub with an 'always-on' regulator, which is kind
of a hack. Some onboard hubs may require further initialization
steps, like changing the state of a GPIO or enabling a clock, which
requires even more hacks. This driver creates a platform device
representing the hub which performs the necessary initialization.
Currently it only supports switching on a single regulator, support
for multiple regulators or other actions can be added as needed.
Different initialization sequences can be supported based on the
compatible string.

Besides performing the initialization the driver can be configured
to power the hub off during system suspend. This can help to extend
battery life on battery powered devices which have no requirements
to keep the hub powered during suspend. The driver can also be
configured to leave the hub powered when a wakeup capable USB device
is connected when suspending, and power it off otherwise.

Technically the driver consists of two drivers, the platform driver
described above and a very thin USB driver that subclasses the
generic driver. The purpose of this driver is to provide the platform
driver with the USB devices corresponding to the hub(s) (a hub
controller may provide multiple 'logical' hubs, e.g. one to support
USB 2.0 and another for USB 3.x).

Co-developed-by: Ravi Chandra Sadineni <ravisadineni@chromium.org>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Signed-off-by: Ravi Chandra Sadineni <ravisadineni@chromium.org>
Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Link: https://lore.kernel.org/r/20220630123445.v24.3.I7c9a1f1d6ced41dd8310e8a03da666a32364e790@changeid
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 .../ABI/testing/sysfs-bus-platform-onboard-usb-hub |   8 +
 MAINTAINERS                                        |   7 +
 drivers/usb/core/Makefile                          |   4 +
 drivers/usb/misc/Kconfig                           |  16 +
 drivers/usb/misc/Makefile                          |   1 +
 drivers/usb/misc/onboard_usb_hub.c                 | 428 +++++++++++++++++++++
 drivers/usb/misc/onboard_usb_hub.h                 |  17 +
 drivers/usb/misc/onboard_usb_hub_pdevs.c           | 142 +++++++
 include/linux/usb/onboard_hub.h                    |  18 +
 9 files changed, 641 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-hub
 create mode 100644 drivers/usb/misc/onboard_usb_hub.c
 create mode 100644 drivers/usb/misc/onboard_usb_hub.h
 create mode 100644 drivers/usb/misc/onboard_usb_hub_pdevs.c
 create mode 100644 include/linux/usb/onboard_hub.h

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-hub b/Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-hub
new file mode 100644
index 000000000000..42deb0552065
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-hub
@@ -0,0 +1,8 @@
+What:		/sys/bus/platform/devices/<dev>/always_powered_in_suspend
+Date:		June 2022
+KernelVersion:	5.20
+Contact:	Matthias Kaehlcke <matthias@kaehlcke.net>
+		linux-usb@vger.kernel.org
+Description:
+		(RW) Controls whether the USB hub remains always powered
+		during system suspend or not.
\ No newline at end of file
diff --git a/MAINTAINERS b/MAINTAINERS
index 7533cb27adc0..f43b126eaf59 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14848,6 +14848,13 @@ S:	Maintained
 T:	git git://linuxtv.org/media_tree.git
 F:	drivers/media/i2c/ov9734.c
 
+ONBOARD USB HUB DRIVER
+M:	Matthias Kaehlcke <mka@chromium.org>
+L:	linux-usb@vger.kernel.org
+S:	Maintained
+F:	Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-hub
+F:	drivers/usb/misc/onboard_usb_hub.c
+
 ONENAND FLASH DRIVER
 M:	Kyungmin Park <kyungmin.park@samsung.com>
 L:	linux-mtd@lists.infradead.org
diff --git a/drivers/usb/core/Makefile b/drivers/usb/core/Makefile
index 18e874b0441e..7d338e9c0657 100644
--- a/drivers/usb/core/Makefile
+++ b/drivers/usb/core/Makefile
@@ -12,6 +12,10 @@ usbcore-$(CONFIG_OF)		+= of.o
 usbcore-$(CONFIG_USB_PCI)		+= hcd-pci.o
 usbcore-$(CONFIG_ACPI)		+= usb-acpi.o
 
+ifdef CONFIG_USB_ONBOARD_HUB
+usbcore-y			+= ../misc/onboard_usb_hub_pdevs.o
+endif
+
 obj-$(CONFIG_USB)		+= usbcore.o
 
 obj-$(CONFIG_USB_LEDS_TRIGGER_USBPORT)	+= ledtrig-usbport.o
diff --git a/drivers/usb/misc/Kconfig b/drivers/usb/misc/Kconfig
index 4c5ddbd75b7e..9367c12c7e6f 100644
--- a/drivers/usb/misc/Kconfig
+++ b/drivers/usb/misc/Kconfig
@@ -295,3 +295,19 @@ config BRCM_USB_PINMAP
 	  This option enables support for remapping some USB external
 	  signals, which are typically on dedicated pins on the chip,
 	  to any gpio.
+
+config USB_ONBOARD_HUB
+	tristate "Onboard USB hub support"
+	depends on OF || COMPILE_TEST
+	help
+	  Say Y here if you want to support discrete onboard USB hubs that
+	  don't require an additional control bus for initialization, but
+	  need some non-trivial form of initialization, such as enabling a
+	  power regulator. An example for such a hub is the Realtek
+	  RTS5411.
+
+	  This driver can be used as a module but its state (module vs
+	  builtin) must match the state of the USB subsystem. Enabling
+	  this config will enable the driver and it will automatically
+	  match the state of the USB subsystem. If this driver is a
+	  module it will be called onboard_usb_hub.
diff --git a/drivers/usb/misc/Makefile b/drivers/usb/misc/Makefile
index 35bdb4b6c3b6..93581baec3a8 100644
--- a/drivers/usb/misc/Makefile
+++ b/drivers/usb/misc/Makefile
@@ -33,3 +33,4 @@ obj-$(CONFIG_USB_CHAOSKEY)		+= chaoskey.o
 obj-$(CONFIG_USB_SISUSBVGA)		+= sisusbvga/
 obj-$(CONFIG_USB_LINK_LAYER_TEST)	+= lvstest.o
 obj-$(CONFIG_BRCM_USB_PINMAP)		+= brcmstb-usb-pinmap.o
+obj-$(CONFIG_USB_ONBOARD_HUB)		+= onboard_usb_hub.o
diff --git a/drivers/usb/misc/onboard_usb_hub.c b/drivers/usb/misc/onboard_usb_hub.c
new file mode 100644
index 000000000000..6b9b949d17d3
--- /dev/null
+++ b/drivers/usb/misc/onboard_usb_hub.c
@@ -0,0 +1,428 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Driver for onboard USB hubs
+ *
+ * Copyright (c) 2022, Google LLC
+ */
+
+#include <linux/device.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/regulator/consumer.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include <linux/sysfs.h>
+#include <linux/usb.h>
+#include <linux/usb/hcd.h>
+#include <linux/usb/onboard_hub.h>
+#include <linux/workqueue.h>
+
+#include "onboard_usb_hub.h"
+
+static struct usb_device_driver onboard_hub_usbdev_driver;
+
+/************************** Platform driver **************************/
+
+struct usbdev_node {
+	struct usb_device *udev;
+	struct list_head list;
+};
+
+struct onboard_hub {
+	struct regulator *vdd;
+	struct device *dev;
+	bool always_powered_in_suspend;
+	bool is_powered_on;
+	bool going_away;
+	struct list_head udev_list;
+	struct work_struct attach_usb_driver_work;
+	struct mutex lock;
+};
+
+static int onboard_hub_power_on(struct onboard_hub *hub)
+{
+	int err;
+
+	err = regulator_enable(hub->vdd);
+	if (err) {
+		dev_err(hub->dev, "failed to enable regulator: %d\n", err);
+		return err;
+	}
+
+	hub->is_powered_on = true;
+
+	return 0;
+}
+
+static int onboard_hub_power_off(struct onboard_hub *hub)
+{
+	int err;
+
+	err = regulator_disable(hub->vdd);
+	if (err) {
+		dev_err(hub->dev, "failed to disable regulator: %d\n", err);
+		return err;
+	}
+
+	hub->is_powered_on = false;
+
+	return 0;
+}
+
+static int __maybe_unused onboard_hub_suspend(struct device *dev)
+{
+	struct onboard_hub *hub = dev_get_drvdata(dev);
+	struct usbdev_node *node;
+	bool power_off = true;
+
+	if (hub->always_powered_in_suspend)
+		return 0;
+
+	mutex_lock(&hub->lock);
+
+	list_for_each_entry(node, &hub->udev_list, list) {
+		if (!device_may_wakeup(node->udev->bus->controller))
+			continue;
+
+		if (usb_wakeup_enabled_descendants(node->udev)) {
+			power_off = false;
+			break;
+		}
+	}
+
+	mutex_unlock(&hub->lock);
+
+	if (!power_off)
+		return 0;
+
+	return onboard_hub_power_off(hub);
+}
+
+static int __maybe_unused onboard_hub_resume(struct device *dev)
+{
+	struct onboard_hub *hub = dev_get_drvdata(dev);
+
+	if (hub->is_powered_on)
+		return 0;
+
+	return onboard_hub_power_on(hub);
+}
+
+static inline void get_udev_link_name(const struct usb_device *udev, char *buf, size_t size)
+{
+	snprintf(buf, size, "usb_dev.%s", dev_name(&udev->dev));
+}
+
+static int onboard_hub_add_usbdev(struct onboard_hub *hub, struct usb_device *udev)
+{
+	struct usbdev_node *node;
+	char link_name[64];
+	int err;
+
+	mutex_lock(&hub->lock);
+
+	if (hub->going_away) {
+		err = -EINVAL;
+		goto error;
+	}
+
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (!node) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	node->udev = udev;
+
+	list_add(&node->list, &hub->udev_list);
+
+	mutex_unlock(&hub->lock);
+
+	get_udev_link_name(udev, link_name, sizeof(link_name));
+	WARN_ON(sysfs_create_link(&hub->dev->kobj, &udev->dev.kobj, link_name));
+
+	return 0;
+
+error:
+	mutex_unlock(&hub->lock);
+
+	return err;
+}
+
+static void onboard_hub_remove_usbdev(struct onboard_hub *hub, const struct usb_device *udev)
+{
+	struct usbdev_node *node;
+	char link_name[64];
+
+	get_udev_link_name(udev, link_name, sizeof(link_name));
+	sysfs_remove_link(&hub->dev->kobj, link_name);
+
+	mutex_lock(&hub->lock);
+
+	list_for_each_entry(node, &hub->udev_list, list) {
+		if (node->udev == udev) {
+			list_del(&node->list);
+			kfree(node);
+			break;
+		}
+	}
+
+	mutex_unlock(&hub->lock);
+}
+
+static ssize_t always_powered_in_suspend_show(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	const struct onboard_hub *hub = dev_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%d\n", hub->always_powered_in_suspend);
+}
+
+static ssize_t always_powered_in_suspend_store(struct device *dev, struct device_attribute *attr,
+			    const char *buf, size_t count)
+{
+	struct onboard_hub *hub = dev_get_drvdata(dev);
+	bool val;
+	int ret;
+
+	ret = kstrtobool(buf, &val);
+	if (ret < 0)
+		return ret;
+
+	hub->always_powered_in_suspend = val;
+
+	return count;
+}
+static DEVICE_ATTR_RW(always_powered_in_suspend);
+
+static struct attribute *onboard_hub_attrs[] = {
+	&dev_attr_always_powered_in_suspend.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(onboard_hub);
+
+static void onboard_hub_attach_usb_driver(struct work_struct *work)
+{
+	int err;
+
+	err = driver_attach(&onboard_hub_usbdev_driver.drvwrap.driver);
+	if (err)
+		pr_err("Failed to attach USB driver: %d\n", err);
+}
+
+static int onboard_hub_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct onboard_hub *hub;
+	int err;
+
+	hub = devm_kzalloc(dev, sizeof(*hub), GFP_KERNEL);
+	if (!hub)
+		return -ENOMEM;
+
+	hub->vdd = devm_regulator_get(dev, "vdd");
+	if (IS_ERR(hub->vdd))
+		return PTR_ERR(hub->vdd);
+
+	hub->dev = dev;
+	mutex_init(&hub->lock);
+	INIT_LIST_HEAD(&hub->udev_list);
+
+	dev_set_drvdata(dev, hub);
+
+	err = onboard_hub_power_on(hub);
+	if (err)
+		return err;
+
+	/*
+	 * The USB driver might have been detached from the USB devices by
+	 * onboard_hub_remove() (e.g. through an 'unbind' by userspace),
+	 * make sure to re-attach it if needed.
+	 *
+	 * This needs to be done deferred to avoid self-deadlocks on systems
+	 * with nested onboard hubs.
+	 */
+	INIT_WORK(&hub->attach_usb_driver_work, onboard_hub_attach_usb_driver);
+	schedule_work(&hub->attach_usb_driver_work);
+
+	return 0;
+}
+
+static int onboard_hub_remove(struct platform_device *pdev)
+{
+	struct onboard_hub *hub = dev_get_drvdata(&pdev->dev);
+	struct usbdev_node *node;
+	struct usb_device *udev;
+
+	hub->going_away = true;
+
+	if (&hub->attach_usb_driver_work != current_work())
+		cancel_work_sync(&hub->attach_usb_driver_work);
+
+	mutex_lock(&hub->lock);
+
+	/* unbind the USB devices to avoid dangling references to this device */
+	while (!list_empty(&hub->udev_list)) {
+		node = list_first_entry(&hub->udev_list, struct usbdev_node, list);
+		udev = node->udev;
+
+		/*
+		 * Unbinding the driver will call onboard_hub_remove_usbdev(),
+		 * which acquires hub->lock.  We must release the lock first.
+		 */
+		get_device(&udev->dev);
+		mutex_unlock(&hub->lock);
+		device_release_driver(&udev->dev);
+		put_device(&udev->dev);
+		mutex_lock(&hub->lock);
+	}
+
+	mutex_unlock(&hub->lock);
+
+	return onboard_hub_power_off(hub);
+}
+
+MODULE_DEVICE_TABLE(of, onboard_hub_match);
+
+static const struct dev_pm_ops __maybe_unused onboard_hub_pm_ops = {
+	SET_LATE_SYSTEM_SLEEP_PM_OPS(onboard_hub_suspend, onboard_hub_resume)
+};
+
+static struct platform_driver onboard_hub_driver = {
+	.probe = onboard_hub_probe,
+	.remove = onboard_hub_remove,
+
+	.driver = {
+		.name = "onboard-usb-hub",
+		.of_match_table = onboard_hub_match,
+		.pm = pm_ptr(&onboard_hub_pm_ops),
+		.dev_groups = onboard_hub_groups,
+	},
+};
+
+/************************** USB driver **************************/
+
+#define VENDOR_ID_REALTEK	0x0bda
+
+/*
+ * Returns the onboard_hub platform device that is associated with the USB
+ * device passed as parameter.
+ */
+static struct onboard_hub *_find_onboard_hub(struct device *dev)
+{
+	struct platform_device *pdev;
+	struct device_node *np;
+	struct onboard_hub *hub;
+
+	pdev = of_find_device_by_node(dev->of_node);
+	if (!pdev) {
+		np = of_parse_phandle(dev->of_node, "peer-hub", 0);
+		if (!np) {
+			dev_err(dev, "failed to find device node for peer hub\n");
+			return ERR_PTR(-EINVAL);
+		}
+
+		pdev = of_find_device_by_node(np);
+		of_node_put(np);
+
+		if (!pdev)
+			return ERR_PTR(-ENODEV);
+	}
+
+	hub = dev_get_drvdata(&pdev->dev);
+	put_device(&pdev->dev);
+
+	/*
+	 * The presence of drvdata ('hub') indicates that the platform driver
+	 * finished probing. This handles the case where (conceivably) we could
+	 * be running at the exact same time as the platform driver's probe. If
+	 * we detect the race we request probe deferral and we'll come back and
+	 * try again.
+	 */
+	if (!hub)
+		return ERR_PTR(-EPROBE_DEFER);
+
+	return hub;
+}
+
+static int onboard_hub_usbdev_probe(struct usb_device *udev)
+{
+	struct device *dev = &udev->dev;
+	struct onboard_hub *hub;
+	int err;
+
+	/* ignore supported hubs without device tree node */
+	if (!dev->of_node)
+		return -ENODEV;
+
+	hub = _find_onboard_hub(dev);
+	if (IS_ERR(hub))
+		return PTR_ERR(hub);
+
+	dev_set_drvdata(dev, hub);
+
+	err = onboard_hub_add_usbdev(hub, udev);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static void onboard_hub_usbdev_disconnect(struct usb_device *udev)
+{
+	struct onboard_hub *hub = dev_get_drvdata(&udev->dev);
+
+	onboard_hub_remove_usbdev(hub, udev);
+}
+
+static const struct usb_device_id onboard_hub_id_table[] = {
+	{ USB_DEVICE(VENDOR_ID_REALTEK, 0x0411) }, /* RTS5411 USB 3.1 */
+	{ USB_DEVICE(VENDOR_ID_REALTEK, 0x5411) }, /* RTS5411 USB 2.1 */
+	{ USB_DEVICE(VENDOR_ID_REALTEK, 0x0414) }, /* RTS5414 USB 3.2 */
+	{ USB_DEVICE(VENDOR_ID_REALTEK, 0x5414) }, /* RTS5414 USB 2.1 */
+	{}
+};
+MODULE_DEVICE_TABLE(usb, onboard_hub_id_table);
+
+static struct usb_device_driver onboard_hub_usbdev_driver = {
+	.name = "onboard-usb-hub",
+	.probe = onboard_hub_usbdev_probe,
+	.disconnect = onboard_hub_usbdev_disconnect,
+	.generic_subclass = 1,
+	.supports_autosuspend =	1,
+	.id_table = onboard_hub_id_table,
+};
+
+static int __init onboard_hub_init(void)
+{
+	int ret;
+
+	ret = platform_driver_register(&onboard_hub_driver);
+	if (ret)
+		return ret;
+
+	ret = usb_register_device_driver(&onboard_hub_usbdev_driver, THIS_MODULE);
+	if (ret)
+		platform_driver_unregister(&onboard_hub_driver);
+
+	return ret;
+}
+module_init(onboard_hub_init);
+
+static void __exit onboard_hub_exit(void)
+{
+	usb_deregister_device_driver(&onboard_hub_usbdev_driver);
+	platform_driver_unregister(&onboard_hub_driver);
+}
+module_exit(onboard_hub_exit);
+
+MODULE_AUTHOR("Matthias Kaehlcke <mka@chromium.org>");
+MODULE_DESCRIPTION("Driver for discrete onboard USB hubs");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/usb/misc/onboard_usb_hub.h b/drivers/usb/misc/onboard_usb_hub.h
new file mode 100644
index 000000000000..d3a5b6938582
--- /dev/null
+++ b/drivers/usb/misc/onboard_usb_hub.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (c) 2022, Google LLC
+ */
+
+#ifndef _USB_MISC_ONBOARD_USB_HUB_H
+#define _USB_MISC_ONBOARD_USB_HUB_H
+
+static const struct of_device_id onboard_hub_match[] = {
+	{ .compatible = "usbbda,411" },
+	{ .compatible = "usbbda,5411" },
+	{ .compatible = "usbbda,414" },
+	{ .compatible = "usbbda,5414" },
+	{}
+};
+
+#endif /* _USB_MISC_ONBOARD_USB_HUB_H */
diff --git a/drivers/usb/misc/onboard_usb_hub_pdevs.c b/drivers/usb/misc/onboard_usb_hub_pdevs.c
new file mode 100644
index 000000000000..a0a5f719129f
--- /dev/null
+++ b/drivers/usb/misc/onboard_usb_hub_pdevs.c
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * API for creating and destroying USB onboard hub platform devices
+ *
+ * Copyright (c) 2022, Google LLC
+ */
+
+#include <linux/device.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/usb.h>
+#include <linux/usb/hcd.h>
+#include <linux/usb/of.h>
+
+#include "onboard_usb_hub.h"
+
+struct pdev_list_entry {
+	struct platform_device *pdev;
+	struct list_head node;
+};
+
+static bool of_is_onboard_usb_hub(const struct device_node *np)
+{
+	return !!of_match_node(onboard_hub_match, np);
+}
+
+/**
+ * onboard_hub_create_pdevs -- create platform devices for onboard USB hubs
+ * @parent_hub	: parent hub to scan for connected onboard hubs
+ * @pdev_list	: list of onboard hub platform devices owned by the parent hub
+ *
+ * Creates a platform device for each supported onboard hub that is connected to
+ * the given parent hub. The platform device is in charge of initializing the
+ * hub (enable regulators, take the hub out of reset, ...) and can optionally
+ * control whether the hub remains powered during system suspend or not.
+ *
+ * To keep track of the platform devices they are added to a list that is owned
+ * by the parent hub.
+ *
+ * Some background about the logic in this function, which can be a bit hard
+ * to follow:
+ *
+ * Root hubs don't have dedicated device tree nodes, but use the node of their
+ * HCD. The primary and secondary HCD are usually represented by a single DT
+ * node. That means the root hubs of the primary and secondary HCD share the
+ * same device tree node (the HCD node). As a result this function can be called
+ * twice with the same DT node for root hubs. We only want to create a single
+ * platform device for each physical onboard hub, hence for root hubs the loop
+ * is only executed for the root hub of the primary HCD. Since the function
+ * scans through all child nodes it still creates pdevs for onboard hubs
+ * connected to the root hub of the secondary HCD if needed.
+ *
+ * Further there must be only one platform device for onboard hubs with a peer
+ * hub (the hub is a single physical device). To achieve this two measures are
+ * taken: pdevs for onboard hubs with a peer are only created when the function
+ * is called on behalf of the parent hub that is connected to the primary HCD
+ * (directly or through other hubs). For onboard hubs connected to root hubs
+ * the function processes the nodes of both peers. A platform device is only
+ * created if the peer hub doesn't have one already.
+ */
+void onboard_hub_create_pdevs(struct usb_device *parent_hub, struct list_head *pdev_list)
+{
+	int i;
+	struct usb_hcd *hcd = bus_to_hcd(parent_hub->bus);
+	struct device_node *np, *npc;
+	struct platform_device *pdev;
+	struct pdev_list_entry *pdle;
+
+	if (!parent_hub->dev.of_node)
+		return;
+
+	if (!parent_hub->parent && !usb_hcd_is_primary_hcd(hcd))
+		return;
+
+	for (i = 1; i <= parent_hub->maxchild; i++) {
+		np = usb_of_get_device_node(parent_hub, i);
+		if (!np)
+			continue;
+
+		if (!of_is_onboard_usb_hub(np))
+			goto node_put;
+
+		npc = of_parse_phandle(np, "peer-hub", 0);
+		if (npc) {
+			if (!usb_hcd_is_primary_hcd(hcd)) {
+				of_node_put(npc);
+				goto node_put;
+			}
+
+			pdev = of_find_device_by_node(npc);
+			of_node_put(npc);
+
+			if (pdev) {
+				put_device(&pdev->dev);
+				goto node_put;
+			}
+		}
+
+		pdev = of_platform_device_create(np, NULL, &parent_hub->dev);
+		if (!pdev) {
+			dev_err(&parent_hub->dev,
+				"failed to create platform device for onboard hub '%pOF'\n", np);
+			goto node_put;
+		}
+
+		pdle = kzalloc(sizeof(*pdle), GFP_KERNEL);
+		if (!pdle) {
+			of_platform_device_destroy(&pdev->dev, NULL);
+			goto node_put;
+		}
+
+		pdle->pdev = pdev;
+		list_add(&pdle->node, pdev_list);
+
+node_put:
+		of_node_put(np);
+	}
+}
+EXPORT_SYMBOL_GPL(onboard_hub_create_pdevs);
+
+/**
+ * onboard_hub_destroy_pdevs -- free resources of onboard hub platform devices
+ * @pdev_list	: list of onboard hub platform devices
+ *
+ * Destroys the platform devices in the given list and frees the memory associated
+ * with the list entry.
+ */
+void onboard_hub_destroy_pdevs(struct list_head *pdev_list)
+{
+	struct pdev_list_entry *pdle, *tmp;
+
+	list_for_each_entry_safe(pdle, tmp, pdev_list, node) {
+		list_del(&pdle->node);
+		of_platform_device_destroy(&pdle->pdev->dev, NULL);
+		kfree(pdle);
+	}
+}
+EXPORT_SYMBOL_GPL(onboard_hub_destroy_pdevs);
diff --git a/include/linux/usb/onboard_hub.h b/include/linux/usb/onboard_hub.h
new file mode 100644
index 000000000000..d9373230556e
--- /dev/null
+++ b/include/linux/usb/onboard_hub.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __LINUX_USB_ONBOARD_HUB_H
+#define __LINUX_USB_ONBOARD_HUB_H
+
+struct usb_device;
+struct list_head;
+
+#if IS_ENABLED(CONFIG_USB_ONBOARD_HUB)
+void onboard_hub_create_pdevs(struct usb_device *parent_hub, struct list_head *pdev_list);
+void onboard_hub_destroy_pdevs(struct list_head *pdev_list);
+#else
+static inline void onboard_hub_create_pdevs(struct usb_device *parent_hub,
+					    struct list_head *pdev_list) {}
+static inline void onboard_hub_destroy_pdevs(struct list_head *pdev_list) {}
+#endif
+
+#endif /* __LINUX_USB_ONBOARD_HUB_H */
-- 
cgit 


From 0139da50dc53f0ce2804e83566d290c7e626fd17 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Mon, 4 Jul 2022 12:45:14 +0300
Subject: serial: Embed rs485_supported to uart_port
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Embed rs485_supported to uart_port to allow serial core to tweak it as
needed.

Reviewed-by: Lino Sanfilippo <l.sanfilippo@kunbus.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20220704094515.6831-2-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250_bcm2835aux.c | 2 +-
 drivers/tty/serial/8250/8250_dwlib.c      | 4 ++--
 drivers/tty/serial/8250/8250_exar.c       | 2 +-
 drivers/tty/serial/8250/8250_fintek.c     | 4 ++--
 drivers/tty/serial/8250/8250_lpc18xx.c    | 2 +-
 drivers/tty/serial/8250/8250_of.c         | 2 +-
 drivers/tty/serial/8250/8250_pci.c        | 2 +-
 drivers/tty/serial/amba-pl011.c           | 2 +-
 drivers/tty/serial/ar933x_uart.c          | 4 ++--
 drivers/tty/serial/atmel_serial.c         | 2 +-
 drivers/tty/serial/fsl_lpuart.c           | 2 +-
 drivers/tty/serial/imx.c                  | 4 ++--
 drivers/tty/serial/max310x.c              | 2 +-
 drivers/tty/serial/mcf.c                  | 4 ++--
 drivers/tty/serial/omap-serial.c          | 2 +-
 drivers/tty/serial/sc16is7xx.c            | 2 +-
 drivers/tty/serial/serial_core.c          | 8 ++++----
 drivers/tty/serial/stm32-usart.c          | 2 +-
 include/linux/serial_core.h               | 2 +-
 19 files changed, 27 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/drivers/tty/serial/8250/8250_bcm2835aux.c b/drivers/tty/serial/8250/8250_bcm2835aux.c
index d9f1e618cfbd..047e14ccb165 100644
--- a/drivers/tty/serial/8250/8250_bcm2835aux.c
+++ b/drivers/tty/serial/8250/8250_bcm2835aux.c
@@ -108,7 +108,7 @@ static int bcm2835aux_serial_probe(struct platform_device *pdev)
 	up.port.flags = UPF_SHARE_IRQ | UPF_FIXED_PORT | UPF_FIXED_TYPE |
 			UPF_SKIP_TEST | UPF_IOREMAP;
 	up.port.rs485_config = serial8250_em485_config;
-	up.port.rs485_supported = &serial8250_em485_supported;
+	up.port.rs485_supported = serial8250_em485_supported;
 	up.rs485_start_tx = bcm2835aux_rs485_start_tx;
 	up.rs485_stop_tx = bcm2835aux_rs485_stop_tx;
 
diff --git a/drivers/tty/serial/8250/8250_dwlib.c b/drivers/tty/serial/8250/8250_dwlib.c
index a8bbed74ea70..2c3b1468bd88 100644
--- a/drivers/tty/serial/8250/8250_dwlib.c
+++ b/drivers/tty/serial/8250/8250_dwlib.c
@@ -255,10 +255,10 @@ void dw8250_setup_port(struct uart_port *p)
 	if (pd->hw_rs485_support) {
 		p->rs485_config = dw8250_rs485_config;
 		up->lsr_save_mask = LSR_SAVE_FLAGS | DW_UART_LSR_ADDR_RCVD;
-		p->rs485_supported = &dw8250_rs485_supported;
+		p->rs485_supported = dw8250_rs485_supported;
 	} else {
 		p->rs485_config = serial8250_em485_config;
-		p->rs485_supported = &serial8250_em485_supported;
+		p->rs485_supported = serial8250_em485_supported;
 		up->rs485_start_tx = serial8250_em485_start_tx;
 		up->rs485_stop_tx = serial8250_em485_stop_tx;
 	}
diff --git a/drivers/tty/serial/8250/8250_exar.c b/drivers/tty/serial/8250/8250_exar.c
index f5344cfe303c..314a05e009df 100644
--- a/drivers/tty/serial/8250/8250_exar.c
+++ b/drivers/tty/serial/8250/8250_exar.c
@@ -550,7 +550,7 @@ pci_xr17v35x_setup(struct exar8250 *priv, struct pci_dev *pcidev,
 
 	port->port.uartclk = baud * 16;
 	port->port.rs485_config = platform->rs485_config;
-	port->port.rs485_supported = platform->rs485_supported;
+	port->port.rs485_supported = *(platform->rs485_supported);
 
 	/*
 	 * Setup the UART clock for the devices on expansion slot to
diff --git a/drivers/tty/serial/8250/8250_fintek.c b/drivers/tty/serial/8250/8250_fintek.c
index eea693f5b577..65b6b3cbaff6 100644
--- a/drivers/tty/serial/8250/8250_fintek.c
+++ b/drivers/tty/serial/8250/8250_fintek.c
@@ -433,9 +433,9 @@ static void fintek_8250_set_rs485_handler(struct uart_8250_port *uart)
 	case CHIP_ID_F81865:
 		uart->port.rs485_config = fintek_8250_rs485_config;
 		if (!pdata->index)
-			uart->port.rs485_supported = &fintek_8250_rs485_supported_port0;
+			uart->port.rs485_supported = fintek_8250_rs485_supported_port0;
 		else
-			uart->port.rs485_supported = &fintek_8250_rs485_supported;
+			uart->port.rs485_supported = fintek_8250_rs485_supported;
 		break;
 
 	default: /* No RS485 Auto direction functional */
diff --git a/drivers/tty/serial/8250/8250_lpc18xx.c b/drivers/tty/serial/8250/8250_lpc18xx.c
index d7cb3bb52069..d6ca0d47e9d5 100644
--- a/drivers/tty/serial/8250/8250_lpc18xx.c
+++ b/drivers/tty/serial/8250/8250_lpc18xx.c
@@ -161,7 +161,7 @@ static int lpc18xx_serial_probe(struct platform_device *pdev)
 	uart.port.uartclk = clk_get_rate(data->clk_uart);
 	uart.port.private_data = data;
 	uart.port.rs485_config = lpc18xx_rs485_config;
-	uart.port.rs485_supported = &lpc18xx_rs485_supported;
+	uart.port.rs485_supported = lpc18xx_rs485_supported;
 	uart.port.serial_out = lpc18xx_uart_serial_out;
 
 	uart.dma = &data->dma;
diff --git a/drivers/tty/serial/8250/8250_of.c b/drivers/tty/serial/8250/8250_of.c
index 65cccd559db2..1b461fba15a3 100644
--- a/drivers/tty/serial/8250/8250_of.c
+++ b/drivers/tty/serial/8250/8250_of.c
@@ -165,7 +165,7 @@ static int of_platform_serial_setup(struct platform_device *ofdev,
 
 	port->dev = &ofdev->dev;
 	port->rs485_config = serial8250_em485_config;
-	port->rs485_supported = &serial8250_em485_supported;
+	port->rs485_supported = serial8250_em485_supported;
 	up->rs485_start_tx = serial8250_em485_start_tx;
 	up->rs485_stop_tx = serial8250_em485_stop_tx;
 
diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c
index d31d2350a9db..8a39ae072c65 100644
--- a/drivers/tty/serial/8250/8250_pci.c
+++ b/drivers/tty/serial/8250/8250_pci.c
@@ -1607,7 +1607,7 @@ static int pci_fintek_setup(struct serial_private *priv,
 	port->port.iotype = UPIO_PORT;
 	port->port.iobase = iobase;
 	port->port.rs485_config = pci_fintek_rs485_config;
-	port->port.rs485_supported = &pci_fintek_rs485_supported;
+	port->port.rs485_supported = pci_fintek_rs485_supported;
 
 	data = devm_kzalloc(&pdev->dev, sizeof(u8), GFP_KERNEL);
 	if (!data)
diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
index c8f52945a4aa..abeceeefdece 100644
--- a/drivers/tty/serial/amba-pl011.c
+++ b/drivers/tty/serial/amba-pl011.c
@@ -2779,7 +2779,7 @@ static int pl011_probe(struct amba_device *dev, const struct amba_id *id)
 	uap->port.irq = dev->irq[0];
 	uap->port.ops = &amba_pl011_pops;
 	uap->port.rs485_config = pl011_rs485_config;
-	uap->port.rs485_supported = &pl011_rs485_supported;
+	uap->port.rs485_supported = pl011_rs485_supported;
 	snprintf(uap->type, sizeof(uap->type), "PL011 rev%u", amba_rev(dev));
 
 	ret = pl011_setup_port(&dev->dev, uap, &dev->res, portnr);
diff --git a/drivers/tty/serial/ar933x_uart.c b/drivers/tty/serial/ar933x_uart.c
index b73ce13683db..f931ecbc0bc0 100644
--- a/drivers/tty/serial/ar933x_uart.c
+++ b/drivers/tty/serial/ar933x_uart.c
@@ -778,7 +778,7 @@ static int ar933x_uart_probe(struct platform_device *pdev)
 	port->fifosize = AR933X_UART_FIFO_SIZE;
 	port->ops = &ar933x_uart_ops;
 	port->rs485_config = ar933x_config_rs485;
-	port->rs485_supported = &ar933x_rs485_supported;
+	port->rs485_supported = ar933x_rs485_supported;
 
 	baud = ar933x_uart_get_baud(port->uartclk, AR933X_UART_MAX_SCALE, 1);
 	up->min_baud = max_t(unsigned int, baud, AR933X_UART_MIN_BAUD);
@@ -802,7 +802,7 @@ static int ar933x_uart_probe(struct platform_device *pdev)
 	    !up->rts_gpiod) {
 		dev_err(&pdev->dev, "lacking rts-gpio, disabling RS485\n");
 		port->rs485.flags &= ~SER_RS485_ENABLED;
-		port->rs485_supported = &ar933x_no_rs485;
+		port->rs485_supported = ar933x_no_rs485;
 	}
 
 #ifdef CONFIG_SERIAL_AR933X_CONSOLE
diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c
index bc6004679585..30ba9eef7b39 100644
--- a/drivers/tty/serial/atmel_serial.c
+++ b/drivers/tty/serial/atmel_serial.c
@@ -2498,7 +2498,7 @@ static int atmel_init_port(struct atmel_uart_port *atmel_port,
 	port->mapbase		= mpdev->resource[0].start;
 	port->irq		= platform_get_irq(mpdev, 0);
 	port->rs485_config	= atmel_config_rs485;
-	port->rs485_supported	= &atmel_rs485_supported;
+	port->rs485_supported	= atmel_rs485_supported;
 	port->iso7816_config	= atmel_config_iso7816;
 	port->membase		= NULL;
 
diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c
index 8fe0494d4057..fc7d235a1e27 100644
--- a/drivers/tty/serial/fsl_lpuart.c
+++ b/drivers/tty/serial/fsl_lpuart.c
@@ -2655,7 +2655,7 @@ static int lpuart_probe(struct platform_device *pdev)
 		sport->port.rs485_config = lpuart32_config_rs485;
 	else
 		sport->port.rs485_config = lpuart_config_rs485;
-	sport->port.rs485_supported = &lpuart_rs485_supported;
+	sport->port.rs485_supported = lpuart_rs485_supported;
 
 	sport->ipg_clk = devm_clk_get(&pdev->dev, "ipg");
 	if (IS_ERR(sport->ipg_clk)) {
diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c
index 3457006cea3f..522445a8f666 100644
--- a/drivers/tty/serial/imx.c
+++ b/drivers/tty/serial/imx.c
@@ -2285,9 +2285,9 @@ static int imx_uart_probe(struct platform_device *pdev)
 	sport->port.rs485_config = imx_uart_rs485_config;
 	/* RTS is required to control the RS485 transmitter */
 	if (sport->have_rtscts || sport->have_rtsgpio)
-		sport->port.rs485_supported = &imx_rs485_supported;
+		sport->port.rs485_supported = imx_rs485_supported;
 	else
-		sport->port.rs485_supported = &imx_no_rs485;
+		sport->port.rs485_supported = imx_no_rs485;
 	sport->port.flags = UPF_BOOT_AUTOCONF;
 	timer_setup(&sport->timer, imx_uart_timeout, 0);
 
diff --git a/drivers/tty/serial/max310x.c b/drivers/tty/serial/max310x.c
index e162bfb44080..ab10ca4a45b5 100644
--- a/drivers/tty/serial/max310x.c
+++ b/drivers/tty/serial/max310x.c
@@ -1370,7 +1370,7 @@ static int max310x_probe(struct device *dev, const struct max310x_devtype *devty
 		s->p[i].port.membase	= (void __iomem *)~0;
 		s->p[i].port.uartclk	= uartclk;
 		s->p[i].port.rs485_config = max310x_rs485_config;
-		s->p[i].port.rs485_supported = &max310x_rs485_supported;
+		s->p[i].port.rs485_supported = max310x_rs485_supported;
 		s->p[i].port.ops	= &max310x_ops;
 		s->p[i].regmap		= regmaps[i];
 
diff --git a/drivers/tty/serial/mcf.c b/drivers/tty/serial/mcf.c
index 73c5287b8e5e..f4aaaadd0742 100644
--- a/drivers/tty/serial/mcf.c
+++ b/drivers/tty/serial/mcf.c
@@ -506,7 +506,7 @@ int __init early_mcf_setup(struct mcf_platform_uart *platp)
 		port->uartclk = MCF_BUSCLK;
 		port->flags = UPF_BOOT_AUTOCONF;
 		port->rs485_config = mcf_config_rs485;
-		port->rs485_supported = &mcf_rs485_supported;
+		port->rs485_supported = mcf_rs485_supported;
 		port->ops = &mcf_uart_ops;
 	}
 
@@ -634,7 +634,7 @@ static int mcf_probe(struct platform_device *pdev)
 		port->ops = &mcf_uart_ops;
 		port->flags = UPF_BOOT_AUTOCONF;
 		port->rs485_config = mcf_config_rs485;
-		port->rs485_supported = &mcf_rs485_supported;
+		port->rs485_supported = mcf_rs485_supported;
 		port->has_sysrq = IS_ENABLED(CONFIG_SERIAL_MCF_CONSOLE);
 
 		uart_add_one_port(&mcf_driver, port);
diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
index 196bae704f85..0aa666e247d5 100644
--- a/drivers/tty/serial/omap-serial.c
+++ b/drivers/tty/serial/omap-serial.c
@@ -1643,7 +1643,7 @@ static int serial_omap_probe(struct platform_device *pdev)
 	up->port.flags = omap_up_info->flags;
 	up->port.uartclk = omap_up_info->uartclk;
 	up->port.rs485_config = serial_omap_config_rs485;
-	up->port.rs485_supported = &serial_omap_rs485_supported;
+	up->port.rs485_supported = serial_omap_rs485_supported;
 	if (!up->port.uartclk) {
 		up->port.uartclk = DEFAULT_CLK_SPEED;
 		dev_warn(&pdev->dev,
diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c
index 8cb92a3b3fb8..259e08cc347c 100644
--- a/drivers/tty/serial/sc16is7xx.c
+++ b/drivers/tty/serial/sc16is7xx.c
@@ -1461,7 +1461,7 @@ static int sc16is7xx_probe(struct device *dev,
 		s->p[i].port.iotype	= UPIO_PORT;
 		s->p[i].port.uartclk	= freq;
 		s->p[i].port.rs485_config = sc16is7xx_config_rs485;
-		s->p[i].port.rs485_supported = &sc16is7xx_rs485_supported;
+		s->p[i].port.rs485_supported = sc16is7xx_rs485_supported;
 		s->p[i].port.ops	= &sc16is7xx_ops;
 		s->p[i].old_mctrl	= 0;
 		s->p[i].port.line	= sc16is7xx_alloc_line();
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 85ef7ef00b82..a9cf1044a9fa 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -1285,7 +1285,7 @@ static int uart_check_rs485_flags(struct uart_port *port, struct serial_rs485 *r
 	 * For any bit outside of the legacy ones that is not supported by
 	 * the driver, return -EINVAL.
 	 */
-	if (flags & ~port->rs485_supported->flags)
+	if (flags & ~port->rs485_supported.flags)
 		return -EINVAL;
 
 	/* Asking for address w/o addressing mode? */
@@ -1304,7 +1304,7 @@ static int uart_check_rs485_flags(struct uart_port *port, struct serial_rs485 *r
 
 static void uart_sanitize_serial_rs485(struct uart_port *port, struct serial_rs485 *rs485)
 {
-	u32 supported_flags = port->rs485_supported->flags;
+	u32 supported_flags = port->rs485_supported.flags;
 
 	if (!(rs485->flags & SER_RS485_ENABLED)) {
 		memset(rs485, 0, sizeof(*rs485));
@@ -1323,7 +1323,7 @@ static void uart_sanitize_serial_rs485(struct uart_port *port, struct serial_rs4
 		supported_flags |= SER_RS485_RTS_ON_SEND|SER_RS485_RTS_AFTER_SEND;
 	}
 
-	if (!port->rs485_supported->delay_rts_before_send) {
+	if (!port->rs485_supported.delay_rts_before_send) {
 		if (rs485->delay_rts_before_send) {
 			dev_warn_ratelimited(port->dev,
 				"%s (%d): RTS delay before sending not supported\n",
@@ -1337,7 +1337,7 @@ static void uart_sanitize_serial_rs485(struct uart_port *port, struct serial_rs4
 			port->name, port->line, rs485->delay_rts_before_send);
 	}
 
-	if (!port->rs485_supported->delay_rts_after_send) {
+	if (!port->rs485_supported.delay_rts_after_send) {
 		if (rs485->delay_rts_after_send) {
 			dev_warn_ratelimited(port->dev,
 				"%s (%d): RTS delay after sending not supported\n",
diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c
index 13992e64a7df..ff5c7e0ebc4c 100644
--- a/drivers/tty/serial/stm32-usart.c
+++ b/drivers/tty/serial/stm32-usart.c
@@ -1401,7 +1401,7 @@ static int stm32_usart_init_port(struct stm32_port *stm32port,
 	port->has_sysrq = IS_ENABLED(CONFIG_SERIAL_STM32_CONSOLE);
 	port->irq = irq;
 	port->rs485_config = stm32_usart_config_rs485;
-	port->rs485_supported = &stm32_rs485_supported;
+	port->rs485_supported = stm32_rs485_supported;
 
 	ret = stm32_usart_init_rs485(port, pdev);
 	if (ret)
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index b7b86ee3cb12..a6fa7c40c330 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -255,7 +255,7 @@ struct uart_port {
 	struct attribute_group	*attr_group;		/* port specific attributes */
 	const struct attribute_group **tty_groups;	/* all attributes (serial core use only) */
 	struct serial_rs485     rs485;
-	const struct serial_rs485	*rs485_supported;	/* Supported mask for serial_rs485 */
+	struct serial_rs485	rs485_supported;	/* Supported mask for serial_rs485 */
 	struct gpio_desc	*rs485_term_gpio;	/* enable RS485 bus termination */
 	struct serial_iso7816   iso7816;
 	void			*private_data;		/* generic platform data pointer */
-- 
cgit 


From 4d5e3b06e1fc1428be14cd4ebe3b37c1bb34f95d Mon Sep 17 00:00:00 2001
From: Claudiu Beznea <claudiu.beznea@microchip.com>
Date: Wed, 6 Jul 2022 11:06:21 +0100
Subject: dt-bindings: microchip-otpc: document Microchip OTPC

Document Microchip OTP controller.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Claudiu Beznea <claudiu.beznea@microchip.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20220706100627.6534-2-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 .../bindings/nvmem/microchip,sama7g5-otpc.yaml     | 50 ++++++++++++++++++++++
 include/dt-bindings/nvmem/microchip,sama7g5-otpc.h | 12 ++++++
 2 files changed, 62 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/nvmem/microchip,sama7g5-otpc.yaml
 create mode 100644 include/dt-bindings/nvmem/microchip,sama7g5-otpc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/nvmem/microchip,sama7g5-otpc.yaml b/Documentation/devicetree/bindings/nvmem/microchip,sama7g5-otpc.yaml
new file mode 100644
index 000000000000..c3c96fd0baac
--- /dev/null
+++ b/Documentation/devicetree/bindings/nvmem/microchip,sama7g5-otpc.yaml
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/nvmem/microchip,sama7g5-otpc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Microchip SAMA7G5 OTP Controller (OTPC)
+
+maintainers:
+  - Claudiu Beznea <claudiu.beznea@microchip.com>
+
+description: |
+  OTP controller drives a NVMEM memory where system specific data
+  (e.g. calibration data for analog cells, hardware configuration
+  settings, chip identifiers) or user specific data could be stored.
+
+allOf:
+  - $ref: "nvmem.yaml#"
+
+properties:
+  compatible:
+    items:
+      - const: microchip,sama7g5-otpc
+      - const: syscon
+
+  reg:
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/nvmem/microchip,sama7g5-otpc.h>
+
+    otpc: efuse@e8c00000 {
+        compatible = "microchip,sama7g5-otpc", "syscon";
+        reg = <0xe8c00000 0xec>;
+        #address-cells = <1>;
+        #size-cells = <1>;
+
+        temperature_calib: calib@1 {
+            reg = <OTP_PKT(1) 76>;
+        };
+    };
+
+...
diff --git a/include/dt-bindings/nvmem/microchip,sama7g5-otpc.h b/include/dt-bindings/nvmem/microchip,sama7g5-otpc.h
new file mode 100644
index 000000000000..f570b23165a2
--- /dev/null
+++ b/include/dt-bindings/nvmem/microchip,sama7g5-otpc.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+
+#ifndef _DT_BINDINGS_NVMEM_MICROCHIP_OTPC_H
+#define _DT_BINDINGS_NVMEM_MICROCHIP_OTPC_H
+
+/*
+ * Need to have it as a multiple of 4 as NVMEM memory is registered with
+ * stride = 4.
+ */
+#define OTP_PKT(id)			((id) * 4)
+
+#endif
-- 
cgit 


From 63a6ef2360bdeffcdd41bcdd86937b6db17b573d Mon Sep 17 00:00:00 2001
From: Mikko Perttunen <mperttunen@nvidia.com>
Date: Mon, 27 Jun 2022 17:19:57 +0300
Subject: dt-bindings: Add headers for Host1x and VIC on Tegra234

Add clock, memory controller, powergate and reset dt-binding headers
for Host1x and VIC on Tegra234.

Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 include/dt-bindings/clock/tegra234-clock.h     | 4 ++++
 include/dt-bindings/memory/tegra234-mc.h       | 5 +++++
 include/dt-bindings/power/tegra234-powergate.h | 1 +
 include/dt-bindings/reset/tegra234-reset.h     | 1 +
 4 files changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/tegra234-clock.h b/include/dt-bindings/clock/tegra234-clock.h
index bab85d9ba8cd..173364a93381 100644
--- a/include/dt-bindings/clock/tegra234-clock.h
+++ b/include/dt-bindings/clock/tegra234-clock.h
@@ -38,6 +38,8 @@
  * throughput and memory controller power.
  */
 #define TEGRA234_CLK_EMC			31U
+/** @brief output of mux controlled by CLK_RST_CONTROLLER_CLK_SOURCE_HOST1X */
+#define TEGRA234_CLK_HOST1X                     46U
 /** @brief output of gate CLK_ENB_FUSE */
 #define TEGRA234_CLK_FUSE			40U
 /** @brief output of mux controlled by CLK_RST_CONTROLLER_CLK_SOURCE_I2C1 */
@@ -132,6 +134,8 @@
 #define TEGRA234_CLK_UARTA			155U
 /** @brief output of gate CLK_ENB_PEX1_CORE_6 */
 #define TEGRA234_CLK_PEX1_C6_CORE		161U
+/** @brief output of mux controlled by CLK_RST_CONTROLLER_CLK_SOURCE_VIC */
+#define TEGRA234_CLK_VIC                        167U
 /** @brief output of gate CLK_ENB_PEX2_CORE_7 */
 #define TEGRA234_CLK_PEX2_C7_CORE		171U
 /** @brief output of gate CLK_ENB_PEX2_CORE_8 */
diff --git a/include/dt-bindings/memory/tegra234-mc.h b/include/dt-bindings/memory/tegra234-mc.h
index 8b0ddcb715ff..62987b47ce81 100644
--- a/include/dt-bindings/memory/tegra234-mc.h
+++ b/include/dt-bindings/memory/tegra234-mc.h
@@ -31,6 +31,8 @@
 #define TEGRA234_SID_PCIE8	0x09
 #define TEGRA234_SID_PCIE10	0x0b
 #define TEGRA234_SID_BPMP	0x10
+#define TEGRA234_SID_HOST1X	0x27
+#define TEGRA234_SID_VIC	0x34
 
 /*
  * memory client IDs
@@ -38,6 +40,7 @@
 
 /* High-definition audio (HDA) read clients */
 #define TEGRA234_MEMORY_CLIENT_HDAR 0x15
+#define TEGRA234_MEMORY_CLIENT_HOST1XDMAR 0x16
 /* PCIE6 read clients */
 #define TEGRA234_MEMORY_CLIENT_PCIE6AR 0x28
 /* PCIE6 write clients */
@@ -86,6 +89,8 @@
 #define TEGRA234_MEMORY_CLIENT_MGBEDWR 0x65
 /* sdmmcd memory write client */
 #define TEGRA234_MEMORY_CLIENT_SDMMCWAB 0x67
+#define TEGRA234_MEMORY_CLIENT_VICSRD 0x6c
+#define TEGRA234_MEMORY_CLIENT_VICSWR 0x6d
 /* BPMP read client */
 #define TEGRA234_MEMORY_CLIENT_BPMPR 0x93
 /* BPMP write client */
diff --git a/include/dt-bindings/power/tegra234-powergate.h b/include/dt-bindings/power/tegra234-powergate.h
index df1d4dd8dcf3..ae9286cef85c 100644
--- a/include/dt-bindings/power/tegra234-powergate.h
+++ b/include/dt-bindings/power/tegra234-powergate.h
@@ -19,5 +19,6 @@
 #define TEGRA234_POWER_DOMAIN_MGBEB	18U
 #define TEGRA234_POWER_DOMAIN_MGBEC	19U
 #define TEGRA234_POWER_DOMAIN_MGBED	20U
+#define TEGRA234_POWER_DOMAIN_VIC	29U
 
 #endif
diff --git a/include/dt-bindings/reset/tegra234-reset.h b/include/dt-bindings/reset/tegra234-reset.h
index bd58a05f1d94..d48d22b2bc7f 100644
--- a/include/dt-bindings/reset/tegra234-reset.h
+++ b/include/dt-bindings/reset/tegra234-reset.h
@@ -53,6 +53,7 @@
 #define TEGRA234_RESET_MGBE3_PCS		87U
 #define TEGRA234_RESET_MGBE3_MAC		88U
 #define TEGRA234_RESET_UARTA			100U
+#define TEGRA234_RESET_VIC                      113U
 #define TEGRA234_RESET_PEX0_CORE_0		116U
 #define TEGRA234_RESET_PEX0_CORE_1		117U
 #define TEGRA234_RESET_PEX0_CORE_2		118U
-- 
cgit 


From 7b2379454b9a72fece618319acea8d33497d2299 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@kernel.org>
Date: Sat, 2 Jul 2022 12:07:40 +0100
Subject: kunit: test.h: fix a kernel-doc markup

Fix this kernel-doc warning:

	Documentation/dev-tools/kunit/api/test:9: ./include/kunit/test.h:323: WARNING: Inline interpreted text or phrase reference start-string without end-string.

Functions should use func_name() on kernel-doc markups, as
documented at:
	Documentation/doc-guide/kernel-doc.rst

Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
Reviewed-by: David Gow <davidgow@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 include/kunit/test.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/kunit/test.h b/include/kunit/test.h
index ccae848720dc..7646d1bcf685 100644
--- a/include/kunit/test.h
+++ b/include/kunit/test.h
@@ -321,7 +321,7 @@ static inline int kunit_run_all_tests(void)
  *
  * @__suites: a statically allocated list of &struct kunit_suite.
  *
- * This functions identically as &kunit_test_suites() except that it suppresses
+ * This functions identically as kunit_test_suites() except that it suppresses
  * modpost warnings for referencing functions marked __init or data marked
  * __initdata; this is OK because currently KUnit only runs tests upon boot
  * during the init phase or upon loading a module during the init phase.
-- 
cgit 


From b6c1c5745ccc68ac5d57c7ffb51ea25a86d0e97b Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <mka@chromium.org>
Date: Mon, 27 Jun 2022 08:35:24 -0700
Subject: dm: Add verity helpers for LoadPin

LoadPin limits loading of kernel modules, firmware and certain
other files to a 'pinned' file system (typically a read-only
rootfs). To provide more flexibility LoadPin is being extended
to also allow loading these files from trusted dm-verity
devices. For that purpose LoadPin can be provided with a list
of verity root digests that it should consider as trusted.

Add a bunch of helpers to allow LoadPin to check whether a DM
device is a trusted verity device. The new functions broadly
fall in two categories: those that need access to verity
internals (like the root digest), and the 'glue' between
LoadPin and verity. The new file dm-verity-loadpin.c contains
the glue functions.

Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Acked-by: Mike Snitzer <snitzer@kernel.org>
Link: https://lore.kernel.org/lkml/20220627083512.v7.1.I3e928575a23481121e73286874c4c2bdb403355d@changeid
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 drivers/md/Makefile               |  6 ++++
 drivers/md/dm-verity-loadpin.c    | 74 +++++++++++++++++++++++++++++++++++++++
 drivers/md/dm-verity-target.c     | 33 ++++++++++++++++-
 drivers/md/dm-verity.h            |  4 +++
 include/linux/dm-verity-loadpin.h | 27 ++++++++++++++
 5 files changed, 143 insertions(+), 1 deletion(-)
 create mode 100644 drivers/md/dm-verity-loadpin.c
 create mode 100644 include/linux/dm-verity-loadpin.h

(limited to 'include')

diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 0454b0885b01..71771901c823 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -108,6 +108,12 @@ ifeq ($(CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG),y)
 dm-verity-objs			+= dm-verity-verify-sig.o
 endif
 
+ifeq ($(CONFIG_DM_VERITY),y)
+ifeq ($(CONFIG_SECURITY_LOADPIN),y)
+dm-verity-objs			+= dm-verity-loadpin.o
+endif
+endif
+
 ifeq ($(CONFIG_DM_AUDIT),y)
 dm-mod-objs			+= dm-audit.o
 endif
diff --git a/drivers/md/dm-verity-loadpin.c b/drivers/md/dm-verity-loadpin.c
new file mode 100644
index 000000000000..10c18bc1652c
--- /dev/null
+++ b/drivers/md/dm-verity-loadpin.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/dm-verity-loadpin.h>
+
+#include "dm.h"
+#include "dm-verity.h"
+
+#define DM_MSG_PREFIX	"verity-loadpin"
+
+LIST_HEAD(dm_verity_loadpin_trusted_root_digests);
+
+static bool is_trusted_verity_target(struct dm_target *ti)
+{
+	u8 *root_digest;
+	unsigned int digest_size;
+	struct dm_verity_loadpin_trusted_root_digest *trd;
+	bool trusted = false;
+
+	if (!dm_is_verity_target(ti))
+		return false;
+
+	if (dm_verity_get_root_digest(ti, &root_digest, &digest_size))
+		return false;
+
+	list_for_each_entry(trd, &dm_verity_loadpin_trusted_root_digests, node) {
+		if ((trd->len == digest_size) &&
+		    !memcmp(trd->data, root_digest, digest_size)) {
+			trusted = true;
+			break;
+		}
+	}
+
+	kfree(root_digest);
+
+	return trusted;
+}
+
+/*
+ * Determines whether the file system of a superblock is located on
+ * a verity device that is trusted by LoadPin.
+ */
+bool dm_verity_loadpin_is_bdev_trusted(struct block_device *bdev)
+{
+	struct mapped_device *md;
+	struct dm_table *table;
+	struct dm_target *ti;
+	int srcu_idx;
+	bool trusted = false;
+
+	if (list_empty(&dm_verity_loadpin_trusted_root_digests))
+		return false;
+
+	md = dm_get_md(bdev->bd_dev);
+	if (!md)
+		return false;
+
+	table = dm_get_live_table(md, &srcu_idx);
+
+	if (dm_table_get_num_targets(table) != 1)
+		goto out;
+
+	ti = dm_table_get_target(table, 0);
+
+	if (is_trusted_verity_target(ti))
+		trusted = true;
+
+out:
+	dm_put_live_table(md, srcu_idx);
+	dm_put(md);
+
+	return trusted;
+}
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index d6dbd47492a8..e5a01e2f96e9 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -19,6 +19,7 @@
 #include <linux/module.h>
 #include <linux/reboot.h>
 #include <linux/scatterlist.h>
+#include <linux/string.h>
 
 #define DM_MSG_PREFIX			"verity"
 
@@ -1310,10 +1311,40 @@ bad:
 	return r;
 }
 
+/*
+ * Check whether a DM target is a verity target.
+ */
+bool dm_is_verity_target(struct dm_target *ti)
+{
+	return ti->type->module == THIS_MODULE;
+}
+
+/*
+ * Get the root digest of a verity target.
+ *
+ * Returns a copy of the root digest, the caller is responsible for
+ * freeing the memory of the digest.
+ */
+int dm_verity_get_root_digest(struct dm_target *ti, u8 **root_digest, unsigned int *digest_size)
+{
+	struct dm_verity *v = ti->private;
+
+	if (!dm_is_verity_target(ti))
+		return -EINVAL;
+
+	*root_digest = kmemdup(v->root_digest, v->digest_size, GFP_KERNEL);
+	if (*root_digest == NULL)
+		return -ENOMEM;
+
+	*digest_size = v->digest_size;
+
+	return 0;
+}
+
 static struct target_type verity_target = {
 	.name		= "verity",
 	.features	= DM_TARGET_IMMUTABLE,
-	.version	= {1, 8, 0},
+	.version	= {1, 8, 1},
 	.module		= THIS_MODULE,
 	.ctr		= verity_ctr,
 	.dtr		= verity_dtr,
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
index 4e769d13473a..c832cc3e3d24 100644
--- a/drivers/md/dm-verity.h
+++ b/drivers/md/dm-verity.h
@@ -129,4 +129,8 @@ extern int verity_hash(struct dm_verity *v, struct ahash_request *req,
 extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
 				 sector_t block, u8 *digest, bool *is_zero);
 
+extern bool dm_is_verity_target(struct dm_target *ti);
+extern int dm_verity_get_root_digest(struct dm_target *ti, u8 **root_digest,
+				     unsigned int *digest_size);
+
 #endif /* DM_VERITY_H */
diff --git a/include/linux/dm-verity-loadpin.h b/include/linux/dm-verity-loadpin.h
new file mode 100644
index 000000000000..fb695ecaa5d5
--- /dev/null
+++ b/include/linux/dm-verity-loadpin.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __LINUX_DM_VERITY_LOADPIN_H
+#define __LINUX_DM_VERITY_LOADPIN_H
+
+#include <linux/list.h>
+
+struct block_device;
+
+extern struct list_head dm_verity_loadpin_trusted_root_digests;
+
+struct dm_verity_loadpin_trusted_root_digest {
+	struct list_head node;
+	unsigned int len;
+	u8 data[];
+};
+
+#if IS_ENABLED(CONFIG_SECURITY_LOADPIN) && IS_BUILTIN(CONFIG_DM_VERITY)
+bool dm_verity_loadpin_is_bdev_trusted(struct block_device *bdev);
+#else
+static inline bool dm_verity_loadpin_is_bdev_trusted(struct block_device *bdev)
+{
+	return false;
+}
+#endif
+
+#endif /* __LINUX_DM_VERITY_LOADPIN_H */
-- 
cgit 


From 3f805f8cc23ba35679dd01446929292911c2b469 Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <mka@chromium.org>
Date: Mon, 27 Jun 2022 08:35:25 -0700
Subject: LoadPin: Enable loading from trusted dm-verity devices

Extend LoadPin to allow loading of kernel files from trusted dm-verity [1]
devices.

This change adds the concept of trusted verity devices to LoadPin. LoadPin
maintains a list of root digests of verity devices it considers trusted.
Userspace can populate this list through an ioctl on the new LoadPin
securityfs entry 'dm-verity'. The ioctl receives a file descriptor of
a file with verity digests as parameter. Verity reads the digests from
this file after confirming that the file is located on the pinned root.
The digest file must contain one digest per line. The list of trusted
digests can only be set up once, which is typically done at boot time.

When a kernel file is read LoadPin first checks (as usual) whether the file
is located on the pinned root, if so the file can be loaded. Otherwise, if
the verity extension is enabled, LoadPin determines whether the file is
located on a verity backed device and whether the root digest of that
device is in the list of trusted digests. The file can be loaded if the
verity device has a trusted root digest.

Background:

As of now LoadPin restricts loading of kernel files to a single pinned
filesystem, typically the rootfs. This works for many systems, however it
can result in a bloated rootfs (and OTA updates) on platforms where
multiple boards with different hardware configurations use the same rootfs
image. Especially when 'optional' files are large it may be preferable to
download/install them only when they are actually needed by a given board.
Chrome OS uses Downloadable Content (DLC) [2] to deploy certain 'packages'
at runtime. As an example a DLC package could contain firmware for a
peripheral that is not present on all boards. DLCs use dm-verity to verify
the integrity of the DLC content.

[1] https://www.kernel.org/doc/html/latest/admin-guide/device-mapper/verity.html
[2] https://chromium.googlesource.com/chromiumos/platform2/+/HEAD/dlcservice/docs/developer.md

Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Acked-by: Mike Snitzer <snitzer@kernel.org>
Link: https://lore.kernel.org/lkml/20220627083512.v7.2.I01c67af41d2f6525c6d023101671d7339a9bc8b5@changeid
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/uapi/linux/loadpin.h |  22 ++++++
 security/loadpin/Kconfig     |  16 +++++
 security/loadpin/loadpin.c   | 167 ++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 204 insertions(+), 1 deletion(-)
 create mode 100644 include/uapi/linux/loadpin.h

(limited to 'include')

diff --git a/include/uapi/linux/loadpin.h b/include/uapi/linux/loadpin.h
new file mode 100644
index 000000000000..daa6dbb8bb02
--- /dev/null
+++ b/include/uapi/linux/loadpin.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright (c) 2022, Google LLC
+ */
+
+#ifndef _UAPI_LINUX_LOOP_LOADPIN_H
+#define _UAPI_LINUX_LOOP_LOADPIN_H
+
+#define LOADPIN_IOC_MAGIC	'L'
+
+/**
+ * LOADPIN_IOC_SET_TRUSTED_VERITY_DIGESTS - Set up the root digests of verity devices
+ *                                          that loadpin should trust.
+ *
+ * Takes a file descriptor from which to read the root digests of trusted verity devices. The file
+ * is expected to contain a list of digests in ASCII format, with one line per digest. The ioctl
+ * must be issued on the securityfs attribute 'loadpin/dm-verity' (which can be typically found
+ * under /sys/kernel/security/loadpin/dm-verity).
+ */
+#define LOADPIN_IOC_SET_TRUSTED_VERITY_DIGESTS _IOW(LOADPIN_IOC_MAGIC, 0x00, unsigned int)
+
+#endif /* _UAPI_LINUX_LOOP_LOADPIN_H */
diff --git a/security/loadpin/Kconfig b/security/loadpin/Kconfig
index 91be65dec2ab..70e7985b2561 100644
--- a/security/loadpin/Kconfig
+++ b/security/loadpin/Kconfig
@@ -18,3 +18,19 @@ config SECURITY_LOADPIN_ENFORCE
 	  If selected, LoadPin will enforce pinning at boot. If not
 	  selected, it can be enabled at boot with the kernel parameter
 	  "loadpin.enforce=1".
+
+config SECURITY_LOADPIN_VERITY
+	bool "Allow reading files from certain other filesystems that use dm-verity"
+	depends on SECURITY_LOADPIN && DM_VERITY=y && SECURITYFS
+	help
+	  If selected LoadPin can allow reading files from filesystems
+	  that use dm-verity. LoadPin maintains a list of verity root
+	  digests it considers trusted. A verity backed filesystem is
+	  considered trusted if its root digest is found in the list
+	  of trusted digests.
+
+	  The list of trusted verity can be populated through an ioctl
+	  on the LoadPin securityfs entry 'dm-verity'. The ioctl
+	  expects a file descriptor of a file with verity digests as
+	  parameter. The file must be located on the pinned root and
+	  contain a comma separated list of digests.
diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c
index ad4e6756c038..6ab5f2bbf41f 100644
--- a/security/loadpin/loadpin.c
+++ b/security/loadpin/loadpin.c
@@ -18,6 +18,8 @@
 #include <linux/path.h>
 #include <linux/sched.h>	/* current */
 #include <linux/string_helpers.h>
+#include <linux/dm-verity-loadpin.h>
+#include <uapi/linux/loadpin.h>
 
 static void report_load(const char *origin, struct file *file, char *operation)
 {
@@ -43,6 +45,9 @@ static char *exclude_read_files[READING_MAX_ID];
 static int ignore_read_file_id[READING_MAX_ID] __ro_after_init;
 static struct super_block *pinned_root;
 static DEFINE_SPINLOCK(pinned_root_spinlock);
+#ifdef CONFIG_SECURITY_LOADPIN_VERITY
+static bool deny_reading_verity_digests;
+#endif
 
 #ifdef CONFIG_SYSCTL
 
@@ -171,7 +176,8 @@ static int loadpin_read_file(struct file *file, enum kernel_read_file_id id,
 		spin_unlock(&pinned_root_spinlock);
 	}
 
-	if (IS_ERR_OR_NULL(pinned_root) || load_root != pinned_root) {
+	if (IS_ERR_OR_NULL(pinned_root) ||
+	    ((load_root != pinned_root) && !dm_verity_loadpin_is_bdev_trusted(load_root->s_bdev))) {
 		if (unlikely(!enforce)) {
 			report_load(origin, file, "pinning-ignored");
 			return 0;
@@ -237,6 +243,7 @@ static int __init loadpin_init(void)
 		enforce ? "" : "not ");
 	parse_exclude();
 	security_add_hooks(loadpin_hooks, ARRAY_SIZE(loadpin_hooks), "loadpin");
+
 	return 0;
 }
 
@@ -245,6 +252,164 @@ DEFINE_LSM(loadpin) = {
 	.init = loadpin_init,
 };
 
+#ifdef CONFIG_SECURITY_LOADPIN_VERITY
+
+enum loadpin_securityfs_interface_index {
+	LOADPIN_DM_VERITY,
+};
+
+static int read_trusted_verity_root_digests(unsigned int fd)
+{
+	struct fd f;
+	void *data;
+	int rc;
+	char *p, *d;
+
+	if (deny_reading_verity_digests)
+		return -EPERM;
+
+	/* The list of trusted root digests can only be set up once */
+	if (!list_empty(&dm_verity_loadpin_trusted_root_digests))
+		return -EPERM;
+
+	f = fdget(fd);
+	if (!f.file)
+		return -EINVAL;
+
+	data = kzalloc(SZ_4K, GFP_KERNEL);
+	if (!data) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	rc = kernel_read_file(f.file, 0, (void **)&data, SZ_4K - 1, NULL, READING_POLICY);
+	if (rc < 0)
+		goto err;
+
+	p = data;
+	p[rc] = '\0';
+	p = strim(p);
+
+	p = strim(data);
+	while ((d = strsep(&p, "\n")) != NULL) {
+		int len = strlen(d);
+		struct dm_verity_loadpin_trusted_root_digest *trd;
+
+		if (len % 2) {
+			rc = -EPROTO;
+			goto err;
+		}
+
+		len /= 2;
+
+		trd = kzalloc(struct_size(trd, data, len), GFP_KERNEL);
+		if (!trd) {
+			rc = -ENOMEM;
+			goto err;
+		}
+
+		if (hex2bin(trd->data, d, len)) {
+			kfree(trd);
+			rc = -EPROTO;
+			goto err;
+		}
+
+		trd->len = len;
+
+		list_add_tail(&trd->node, &dm_verity_loadpin_trusted_root_digests);
+	}
+
+	if (list_empty(&dm_verity_loadpin_trusted_root_digests)) {
+		rc = -EPROTO;
+		goto err;
+	}
+
+	kfree(data);
+	fdput(f);
+
+	return 0;
+
+err:
+	kfree(data);
+
+	/* any failure in loading/parsing invalidates the entire list */
+	{
+		struct dm_verity_loadpin_trusted_root_digest *trd, *tmp;
+
+		list_for_each_entry_safe(trd, tmp, &dm_verity_loadpin_trusted_root_digests, node) {
+			list_del(&trd->node);
+			kfree(trd);
+		}
+	}
+
+	/* disallow further attempts after reading a corrupt/invalid file */
+	deny_reading_verity_digests = true;
+
+	fdput(f);
+
+	return rc;
+}
+
+/******************************** securityfs ********************************/
+
+static long dm_verity_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	void __user *uarg = (void __user *)arg;
+	unsigned int fd;
+	int rc;
+
+	switch (cmd) {
+	case LOADPIN_IOC_SET_TRUSTED_VERITY_DIGESTS:
+		rc = copy_from_user(&fd, uarg, sizeof(fd));
+		if (rc)
+			return rc;
+
+		return read_trusted_verity_root_digests(fd);
+
+	default:
+		return -EINVAL;
+	}
+}
+
+static const struct file_operations loadpin_dm_verity_ops = {
+	.unlocked_ioctl = dm_verity_ioctl,
+	.compat_ioctl = compat_ptr_ioctl,
+};
+
+/**
+ * init_loadpin_securityfs - create the securityfs directory for LoadPin
+ *
+ * We can not put this method normally under the loadpin_init() code path since
+ * the security subsystem gets initialized before the vfs caches.
+ *
+ * Returns 0 if the securityfs directory creation was successful.
+ */
+static int __init init_loadpin_securityfs(void)
+{
+	struct dentry *loadpin_dir, *dentry;
+
+	loadpin_dir = securityfs_create_dir("loadpin", NULL);
+	if (IS_ERR(loadpin_dir)) {
+		pr_err("LoadPin: could not create securityfs dir: %ld\n",
+		       PTR_ERR(loadpin_dir));
+		return PTR_ERR(loadpin_dir);
+	}
+
+	dentry = securityfs_create_file("dm-verity", 0600, loadpin_dir,
+					(void *)LOADPIN_DM_VERITY, &loadpin_dm_verity_ops);
+	if (IS_ERR(dentry)) {
+		pr_err("LoadPin: could not create securityfs entry 'dm-verity': %ld\n",
+		       PTR_ERR(dentry));
+		return PTR_ERR(dentry);
+	}
+
+	return 0;
+}
+
+fs_initcall(init_loadpin_securityfs);
+
+#endif /* CONFIG_SECURITY_LOADPIN_VERITY */
+
 /* Should not be mutable after boot, so not listed in sysfs (perm == 0). */
 module_param(enforce, int, 0);
 MODULE_PARM_DESC(enforce, "Enforce module/firmware pinning");
-- 
cgit 


From 231af4709018a8e4f20e511da4b6506346d662d3 Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <mka@chromium.org>
Date: Mon, 27 Jun 2022 08:35:26 -0700
Subject: dm: verity-loadpin: Use CONFIG_SECURITY_LOADPIN_VERITY for
 conditional compilation

The verity glue for LoadPin is only needed when CONFIG_SECURITY_LOADPIN_VERITY
is set, use this option for conditional compilation instead of the combo of
CONFIG_DM_VERITY and CONFIG_SECURITY_LOADPIN.

Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Acked-by: Mike Snitzer <snitzer@kernel.org>
Link: https://lore.kernel.org/lkml/20220627083512.v7.3.I5aca2dcc3b06de4bf53696cd21329dce8272b8aa@changeid
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 drivers/md/Makefile               | 7 +------
 include/linux/dm-verity-loadpin.h | 2 +-
 2 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 71771901c823..a96441752ec7 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -83,6 +83,7 @@ obj-$(CONFIG_DM_LOG_WRITES)	+= dm-log-writes.o
 obj-$(CONFIG_DM_INTEGRITY)	+= dm-integrity.o
 obj-$(CONFIG_DM_ZONED)		+= dm-zoned.o
 obj-$(CONFIG_DM_WRITECACHE)	+= dm-writecache.o
+obj-$(CONFIG_SECURITY_LOADPIN_VERITY)	+= dm-verity-loadpin.o
 
 ifeq ($(CONFIG_DM_INIT),y)
 dm-mod-objs			+= dm-init.o
@@ -108,12 +109,6 @@ ifeq ($(CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG),y)
 dm-verity-objs			+= dm-verity-verify-sig.o
 endif
 
-ifeq ($(CONFIG_DM_VERITY),y)
-ifeq ($(CONFIG_SECURITY_LOADPIN),y)
-dm-verity-objs			+= dm-verity-loadpin.o
-endif
-endif
-
 ifeq ($(CONFIG_DM_AUDIT),y)
 dm-mod-objs			+= dm-audit.o
 endif
diff --git a/include/linux/dm-verity-loadpin.h b/include/linux/dm-verity-loadpin.h
index fb695ecaa5d5..552b817ab102 100644
--- a/include/linux/dm-verity-loadpin.h
+++ b/include/linux/dm-verity-loadpin.h
@@ -15,7 +15,7 @@ struct dm_verity_loadpin_trusted_root_digest {
 	u8 data[];
 };
 
-#if IS_ENABLED(CONFIG_SECURITY_LOADPIN) && IS_BUILTIN(CONFIG_DM_VERITY)
+#if IS_ENABLED(CONFIG_SECURITY_LOADPIN_VERITY)
 bool dm_verity_loadpin_is_bdev_trusted(struct block_device *bdev);
 #else
 static inline bool dm_verity_loadpin_is_bdev_trusted(struct block_device *bdev)
-- 
cgit 


From 2d91ecace6614cf6254001566292b808d7f70a91 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 7 Jul 2022 18:03:09 -0700
Subject: strparser: pad sk_skb_cb to avoid straddling cachelines

sk_skb_cb lives within skb->cb[]. skb->cb[] straddles
2 cache lines, each containing 24B of data.
The first cache line does not contain much interesting
information for users of strparser, so pad things a little.
Previously strp_msg->full_len would live in the first cache
line and strp_msg->offset in the second.

We need to reorder the 8 byte temp_reg with struct tls_msg
to prevent a 4B hole which would push the struct over 48B.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/strparser.h   | 12 ++++++++----
 net/strparser/strparser.c |  3 +++
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/strparser.h b/include/net/strparser.h
index a191486eb1e4..88900b05443e 100644
--- a/include/net/strparser.h
+++ b/include/net/strparser.h
@@ -65,15 +65,19 @@ struct _strp_msg {
 struct sk_skb_cb {
 #define SK_SKB_CB_PRIV_LEN 20
 	unsigned char data[SK_SKB_CB_PRIV_LEN];
+	/* align strp on cache line boundary within skb->cb[] */
+	unsigned char pad[4];
 	struct _strp_msg strp;
-	/* temp_reg is a temporary register used for bpf_convert_data_end_access
-	 * when dst_reg == src_reg.
-	 */
-	u64 temp_reg;
+
+	/* strp users' data follows */
 	struct tls_msg {
 		u8 control;
 		u8 decrypted;
 	} tls;
+	/* temp_reg is a temporary register used for bpf_convert_data_end_access
+	 * when dst_reg == src_reg.
+	 */
+	u64 temp_reg;
 };
 
 static inline struct strp_msg *strp_msg(struct sk_buff *skb)
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index 1a72c67afed5..8299ceb3e373 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -533,6 +533,9 @@ EXPORT_SYMBOL_GPL(strp_check_rcv);
 
 static int __init strp_dev_init(void)
 {
+	BUILD_BUG_ON(sizeof(struct sk_skb_cb) >
+		     sizeof_field(struct sk_buff, cb));
+
 	strp_wq = create_singlethread_workqueue("kstrp");
 	if (unlikely(!strp_wq))
 		return -ENOMEM;
-- 
cgit 


From 50a07aa5316181e08fb80914fcf70229a827a2e0 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 7 Jul 2022 18:03:10 -0700
Subject: tls: rx: always allocate max possible aad size for decrypt

AAD size is either 5 or 13. Really no point complicating
the code for the 8B of difference. This will also let us
turn the chunked up buffer into a sane struct.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tls.h |  1 +
 net/tls/tls_sw.c  | 19 ++++++++++---------
 2 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 4fc16ca5f469..9394c0459fe8 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -66,6 +66,7 @@
 #define MAX_IV_SIZE			16
 #define TLS_TAG_SIZE			16
 #define TLS_MAX_REC_SEQ_SIZE		8
+#define TLS_MAX_AAD_SIZE		TLS_AAD_SPACE_SIZE
 
 /* For CCM mode, the full 16-bytes of IV is made of '4' fields of given sizes.
  *
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index f1777d67527f..377c0f608167 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1450,7 +1450,7 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
 
 	aead_size = sizeof(*aead_req) + crypto_aead_reqsize(ctx->aead_recv);
 	mem_size = aead_size + (nsg * sizeof(struct scatterlist));
-	mem_size = mem_size + prot->aad_size;
+	mem_size = mem_size + TLS_MAX_AAD_SIZE;
 	mem_size = mem_size + MAX_IV_SIZE;
 	mem_size = mem_size + prot->tail_size;
 
@@ -1467,7 +1467,7 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
 	sgin = (struct scatterlist *)(mem + aead_size);
 	sgout = sgin + n_sgin;
 	aad = (u8 *)(sgout + n_sgout);
-	iv = aad + prot->aad_size;
+	iv = aad + TLS_MAX_AAD_SIZE;
 	tail = iv + MAX_IV_SIZE;
 
 	/* For CCM based ciphers, first byte of nonce+iv is a constant */
@@ -2474,13 +2474,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		goto free_priv;
 	}
 
-	/* Sanity-check the sizes for stack allocations. */
-	if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE ||
-	    rec_seq_size > TLS_MAX_REC_SEQ_SIZE || tag_size != TLS_TAG_SIZE) {
-		rc = -EINVAL;
-		goto free_priv;
-	}
-
 	if (crypto_info->version == TLS_1_3_VERSION) {
 		nonce_size = 0;
 		prot->aad_size = TLS_HEADER_SIZE;
@@ -2490,6 +2483,14 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		prot->tail_size = 0;
 	}
 
+	/* Sanity-check the sizes for stack allocations. */
+	if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE ||
+	    rec_seq_size > TLS_MAX_REC_SEQ_SIZE || tag_size != TLS_TAG_SIZE ||
+	    prot->aad_size > TLS_MAX_AAD_SIZE) {
+		rc = -EINVAL;
+		goto free_priv;
+	}
+
 	prot->version = crypto_info->version;
 	prot->cipher_type = crypto_info->cipher_type;
 	prot->prepend_size = TLS_HEADER_SIZE + nonce_size;
-- 
cgit 


From 5879031423089b2e19b769f30fc618af742264c3 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 7 Jul 2022 18:03:13 -0700
Subject: tls: create an internal header

include/net/tls.h is getting a little long, and is probably hard
for driver authors to navigate. Split out the internals into a
header which will live under net/tls/. While at it move some
static inlines with a single user into the source files, add
a few tls_ prefixes and fix spelling of 'proccess'.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tls.h             | 277 +---------------------------------------
 net/tls/tls.h                 | 290 ++++++++++++++++++++++++++++++++++++++++++
 net/tls/tls_device.c          |   3 +-
 net/tls/tls_device_fallback.c |   2 +
 net/tls/tls_main.c            |  23 +++-
 net/tls/tls_proc.c            |   2 +
 net/tls/tls_sw.c              |  22 +++-
 net/tls/tls_toe.c             |   2 +
 8 files changed, 338 insertions(+), 283 deletions(-)
 create mode 100644 net/tls/tls.h

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 9394c0459fe8..8742e13bc362 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -39,7 +39,6 @@
 #include <linux/crypto.h>
 #include <linux/socket.h>
 #include <linux/tcp.h>
-#include <linux/skmsg.h>
 #include <linux/mutex.h>
 #include <linux/netdevice.h>
 #include <linux/rcupdate.h>
@@ -50,6 +49,7 @@
 #include <crypto/aead.h>
 #include <uapi/linux/tls.h>
 
+struct tls_rec;
 
 /* Maximum data size carried in a TLS record */
 #define TLS_MAX_PAYLOAD_SIZE		((size_t)1 << 14)
@@ -78,13 +78,6 @@
 #define TLS_AES_CCM_IV_B0_BYTE		2
 #define TLS_SM4_CCM_IV_B0_BYTE		2
 
-#define __TLS_INC_STATS(net, field)				\
-	__SNMP_INC_STATS((net)->mib.tls_statistics, field)
-#define TLS_INC_STATS(net, field)				\
-	SNMP_INC_STATS((net)->mib.tls_statistics, field)
-#define TLS_DEC_STATS(net, field)				\
-	SNMP_DEC_STATS((net)->mib.tls_statistics, field)
-
 enum {
 	TLS_BASE,
 	TLS_SW,
@@ -93,32 +86,6 @@ enum {
 	TLS_NUM_CONFIG,
 };
 
-/* TLS records are maintained in 'struct tls_rec'. It stores the memory pages
- * allocated or mapped for each TLS record. After encryption, the records are
- * stores in a linked list.
- */
-struct tls_rec {
-	struct list_head list;
-	int tx_ready;
-	int tx_flags;
-
-	struct sk_msg msg_plaintext;
-	struct sk_msg msg_encrypted;
-
-	/* AAD | msg_plaintext.sg.data | sg_tag */
-	struct scatterlist sg_aead_in[2];
-	/* AAD | msg_encrypted.sg.data (data contains overhead for hdr & iv & tag) */
-	struct scatterlist sg_aead_out[2];
-
-	char content_type;
-	struct scatterlist sg_content_type;
-
-	char aad_space[TLS_AAD_SPACE_SIZE];
-	u8 iv_data[MAX_IV_SIZE];
-	struct aead_request aead_req;
-	u8 aead_req_ctx[];
-};
-
 struct tx_work {
 	struct delayed_work work;
 	struct sock *sk;
@@ -349,44 +316,6 @@ struct tls_offload_context_rx {
 #define TLS_OFFLOAD_CONTEXT_SIZE_RX					\
 	(sizeof(struct tls_offload_context_rx) + TLS_DRIVER_STATE_SIZE_RX)
 
-struct tls_context *tls_ctx_create(struct sock *sk);
-void tls_ctx_free(struct sock *sk, struct tls_context *ctx);
-void update_sk_prot(struct sock *sk, struct tls_context *ctx);
-
-int wait_on_pending_writer(struct sock *sk, long *timeo);
-int tls_sk_query(struct sock *sk, int optname, char __user *optval,
-		int __user *optlen);
-int tls_sk_attach(struct sock *sk, int optname, char __user *optval,
-		  unsigned int optlen);
-void tls_err_abort(struct sock *sk, int err);
-
-int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx);
-void tls_update_rx_zc_capable(struct tls_context *tls_ctx);
-void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx);
-void tls_sw_strparser_done(struct tls_context *tls_ctx);
-int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
-int tls_sw_sendpage_locked(struct sock *sk, struct page *page,
-			   int offset, size_t size, int flags);
-int tls_sw_sendpage(struct sock *sk, struct page *page,
-		    int offset, size_t size, int flags);
-void tls_sw_cancel_work_tx(struct tls_context *tls_ctx);
-void tls_sw_release_resources_tx(struct sock *sk);
-void tls_sw_free_ctx_tx(struct tls_context *tls_ctx);
-void tls_sw_free_resources_rx(struct sock *sk);
-void tls_sw_release_resources_rx(struct sock *sk);
-void tls_sw_free_ctx_rx(struct tls_context *tls_ctx);
-int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
-		   int flags, int *addr_len);
-bool tls_sw_sock_is_readable(struct sock *sk);
-ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
-			   struct pipe_inode_info *pipe,
-			   size_t len, unsigned int flags);
-
-int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
-int tls_device_sendpage(struct sock *sk, struct page *page,
-			int offset, size_t size, int flags);
-int tls_tx_records(struct sock *sk, int flags);
-
 struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context,
 				       u32 seq, u64 *p_record_sn);
 
@@ -400,58 +329,6 @@ static inline u32 tls_record_start_seq(struct tls_record_info *rec)
 	return rec->end_seq - rec->len;
 }
 
-int tls_push_sg(struct sock *sk, struct tls_context *ctx,
-		struct scatterlist *sg, u16 first_offset,
-		int flags);
-int tls_push_partial_record(struct sock *sk, struct tls_context *ctx,
-			    int flags);
-void tls_free_partial_record(struct sock *sk, struct tls_context *ctx);
-
-static inline struct tls_msg *tls_msg(struct sk_buff *skb)
-{
-	struct sk_skb_cb *scb = (struct sk_skb_cb *)skb->cb;
-
-	return &scb->tls;
-}
-
-static inline bool tls_is_partially_sent_record(struct tls_context *ctx)
-{
-	return !!ctx->partially_sent_record;
-}
-
-static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx)
-{
-	return tls_ctx->pending_open_record_frags;
-}
-
-static inline bool is_tx_ready(struct tls_sw_context_tx *ctx)
-{
-	struct tls_rec *rec;
-
-	rec = list_first_entry(&ctx->tx_list, struct tls_rec, list);
-	if (!rec)
-		return false;
-
-	return READ_ONCE(rec->tx_ready);
-}
-
-static inline u16 tls_user_config(struct tls_context *ctx, bool tx)
-{
-	u16 config = tx ? ctx->tx_conf : ctx->rx_conf;
-
-	switch (config) {
-	case TLS_BASE:
-		return TLS_CONF_BASE;
-	case TLS_SW:
-		return TLS_CONF_SW;
-	case TLS_HW:
-		return TLS_CONF_HW;
-	case TLS_HW_RECORD:
-		return TLS_CONF_HW_RECORD;
-	}
-	return 0;
-}
-
 struct sk_buff *
 tls_validate_xmit_skb(struct sock *sk, struct net_device *dev,
 		      struct sk_buff *skb);
@@ -470,31 +347,6 @@ static inline bool tls_is_sk_tx_device_offloaded(struct sock *sk)
 #endif
 }
 
-static inline bool tls_bigint_increment(unsigned char *seq, int len)
-{
-	int i;
-
-	for (i = len - 1; i >= 0; i--) {
-		++seq[i];
-		if (seq[i] != 0)
-			break;
-	}
-
-	return (i == -1);
-}
-
-static inline void tls_bigint_subtract(unsigned char *seq, int  n)
-{
-	u64 rcd_sn;
-	__be64 *p;
-
-	BUILD_BUG_ON(TLS_MAX_REC_SEQ_SIZE != 8);
-
-	p = (__be64 *)seq;
-	rcd_sn = be64_to_cpu(*p);
-	*p = cpu_to_be64(rcd_sn - n);
-}
-
 static inline struct tls_context *tls_get_ctx(const struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
@@ -505,82 +357,6 @@ static inline struct tls_context *tls_get_ctx(const struct sock *sk)
 	return (__force void *)icsk->icsk_ulp_data;
 }
 
-static inline void tls_advance_record_sn(struct sock *sk,
-					 struct tls_prot_info *prot,
-					 struct cipher_context *ctx)
-{
-	if (tls_bigint_increment(ctx->rec_seq, prot->rec_seq_size))
-		tls_err_abort(sk, -EBADMSG);
-
-	if (prot->version != TLS_1_3_VERSION &&
-	    prot->cipher_type != TLS_CIPHER_CHACHA20_POLY1305)
-		tls_bigint_increment(ctx->iv + prot->salt_size,
-				     prot->iv_size);
-}
-
-static inline void tls_fill_prepend(struct tls_context *ctx,
-			     char *buf,
-			     size_t plaintext_len,
-			     unsigned char record_type)
-{
-	struct tls_prot_info *prot = &ctx->prot_info;
-	size_t pkt_len, iv_size = prot->iv_size;
-
-	pkt_len = plaintext_len + prot->tag_size;
-	if (prot->version != TLS_1_3_VERSION &&
-	    prot->cipher_type != TLS_CIPHER_CHACHA20_POLY1305) {
-		pkt_len += iv_size;
-
-		memcpy(buf + TLS_NONCE_OFFSET,
-		       ctx->tx.iv + prot->salt_size, iv_size);
-	}
-
-	/* we cover nonce explicit here as well, so buf should be of
-	 * size KTLS_DTLS_HEADER_SIZE + KTLS_DTLS_NONCE_EXPLICIT_SIZE
-	 */
-	buf[0] = prot->version == TLS_1_3_VERSION ?
-		   TLS_RECORD_TYPE_DATA : record_type;
-	/* Note that VERSION must be TLS_1_2 for both TLS1.2 and TLS1.3 */
-	buf[1] = TLS_1_2_VERSION_MINOR;
-	buf[2] = TLS_1_2_VERSION_MAJOR;
-	/* we can use IV for nonce explicit according to spec */
-	buf[3] = pkt_len >> 8;
-	buf[4] = pkt_len & 0xFF;
-}
-
-static inline void tls_make_aad(char *buf,
-				size_t size,
-				char *record_sequence,
-				unsigned char record_type,
-				struct tls_prot_info *prot)
-{
-	if (prot->version != TLS_1_3_VERSION) {
-		memcpy(buf, record_sequence, prot->rec_seq_size);
-		buf += 8;
-	} else {
-		size += prot->tag_size;
-	}
-
-	buf[0] = prot->version == TLS_1_3_VERSION ?
-		  TLS_RECORD_TYPE_DATA : record_type;
-	buf[1] = TLS_1_2_VERSION_MAJOR;
-	buf[2] = TLS_1_2_VERSION_MINOR;
-	buf[3] = size >> 8;
-	buf[4] = size & 0xFF;
-}
-
-static inline void xor_iv_with_seq(struct tls_prot_info *prot, char *iv, char *seq)
-{
-	int i;
-
-	if (prot->version == TLS_1_3_VERSION ||
-	    prot->cipher_type == TLS_CIPHER_CHACHA20_POLY1305) {
-		for (i = 0; i < 8; i++)
-			iv[i + 4] ^= seq[i];
-	}
-}
-
-
 static inline struct tls_sw_context_rx *tls_sw_ctx_rx(
 		const struct tls_context *tls_ctx)
 {
@@ -617,9 +393,6 @@ static inline bool tls_sw_has_ctx_rx(const struct sock *sk)
 	return !!tls_sw_ctx_rx(ctx);
 }
 
-void tls_sw_write_space(struct sock *sk, struct tls_context *ctx);
-void tls_device_write_space(struct sock *sk, struct tls_context *ctx);
-
 static inline struct tls_offload_context_rx *
 tls_offload_ctx_rx(const struct tls_context *tls_ctx)
 {
@@ -694,31 +467,11 @@ static inline bool tls_offload_tx_resync_pending(struct sock *sk)
 	return ret;
 }
 
-int __net_init tls_proc_init(struct net *net);
-void __net_exit tls_proc_fini(struct net *net);
-
-int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg,
-		      unsigned char *record_type);
-int decrypt_skb(struct sock *sk, struct sk_buff *skb,
-		struct scatterlist *sgout);
 struct sk_buff *tls_encrypt_skb(struct sk_buff *skb);
 
-int tls_sw_fallback_init(struct sock *sk,
-			 struct tls_offload_context_tx *offload_ctx,
-			 struct tls_crypto_info *crypto_info);
-
 #ifdef CONFIG_TLS_DEVICE
-void tls_device_init(void);
-void tls_device_cleanup(void);
 void tls_device_sk_destruct(struct sock *sk);
-int tls_set_device_offload(struct sock *sk, struct tls_context *ctx);
-void tls_device_free_resources_tx(struct sock *sk);
-int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx);
-void tls_device_offload_cleanup_rx(struct sock *sk);
-void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq);
 void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq);
-int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx,
-			 struct sk_buff *skb, struct strp_msg *rxm);
 
 static inline bool tls_is_sk_rx_device_offloaded(struct sock *sk)
 {
@@ -727,33 +480,5 @@ static inline bool tls_is_sk_rx_device_offloaded(struct sock *sk)
 		return false;
 	return tls_get_ctx(sk)->rx_conf == TLS_HW;
 }
-#else
-static inline void tls_device_init(void) {}
-static inline void tls_device_cleanup(void) {}
-
-static inline int
-tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline void tls_device_free_resources_tx(struct sock *sk) {}
-
-static inline int
-tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline void tls_device_offload_cleanup_rx(struct sock *sk) {}
-static inline void
-tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) {}
-
-static inline int
-tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx,
-		     struct sk_buff *skb, struct strp_msg *rxm)
-{
-	return 0;
-}
 #endif
 #endif /* _TLS_OFFLOAD_H */
diff --git a/net/tls/tls.h b/net/tls/tls.h
new file mode 100644
index 000000000000..8005ee25157d
--- /dev/null
+++ b/net/tls/tls.h
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _TLS_INT_H
+#define _TLS_INT_H
+
+#include <asm/byteorder.h>
+#include <linux/types.h>
+#include <linux/skmsg.h>
+#include <net/tls.h>
+
+#define __TLS_INC_STATS(net, field)				\
+	__SNMP_INC_STATS((net)->mib.tls_statistics, field)
+#define TLS_INC_STATS(net, field)				\
+	SNMP_INC_STATS((net)->mib.tls_statistics, field)
+#define TLS_DEC_STATS(net, field)				\
+	SNMP_DEC_STATS((net)->mib.tls_statistics, field)
+
+/* TLS records are maintained in 'struct tls_rec'. It stores the memory pages
+ * allocated or mapped for each TLS record. After encryption, the records are
+ * stores in a linked list.
+ */
+struct tls_rec {
+	struct list_head list;
+	int tx_ready;
+	int tx_flags;
+
+	struct sk_msg msg_plaintext;
+	struct sk_msg msg_encrypted;
+
+	/* AAD | msg_plaintext.sg.data | sg_tag */
+	struct scatterlist sg_aead_in[2];
+	/* AAD | msg_encrypted.sg.data (data contains overhead for hdr & iv & tag) */
+	struct scatterlist sg_aead_out[2];
+
+	char content_type;
+	struct scatterlist sg_content_type;
+
+	char aad_space[TLS_AAD_SPACE_SIZE];
+	u8 iv_data[MAX_IV_SIZE];
+	struct aead_request aead_req;
+	u8 aead_req_ctx[];
+};
+
+int __net_init tls_proc_init(struct net *net);
+void __net_exit tls_proc_fini(struct net *net);
+
+struct tls_context *tls_ctx_create(struct sock *sk);
+void tls_ctx_free(struct sock *sk, struct tls_context *ctx);
+void update_sk_prot(struct sock *sk, struct tls_context *ctx);
+
+int wait_on_pending_writer(struct sock *sk, long *timeo);
+int tls_sk_query(struct sock *sk, int optname, char __user *optval,
+		 int __user *optlen);
+int tls_sk_attach(struct sock *sk, int optname, char __user *optval,
+		  unsigned int optlen);
+void tls_err_abort(struct sock *sk, int err);
+
+int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx);
+void tls_update_rx_zc_capable(struct tls_context *tls_ctx);
+void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx);
+void tls_sw_strparser_done(struct tls_context *tls_ctx);
+int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
+int tls_sw_sendpage_locked(struct sock *sk, struct page *page,
+			   int offset, size_t size, int flags);
+int tls_sw_sendpage(struct sock *sk, struct page *page,
+		    int offset, size_t size, int flags);
+void tls_sw_cancel_work_tx(struct tls_context *tls_ctx);
+void tls_sw_release_resources_tx(struct sock *sk);
+void tls_sw_free_ctx_tx(struct tls_context *tls_ctx);
+void tls_sw_free_resources_rx(struct sock *sk);
+void tls_sw_release_resources_rx(struct sock *sk);
+void tls_sw_free_ctx_rx(struct tls_context *tls_ctx);
+int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+		   int flags, int *addr_len);
+bool tls_sw_sock_is_readable(struct sock *sk);
+ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
+			   struct pipe_inode_info *pipe,
+			   size_t len, unsigned int flags);
+
+int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
+int tls_device_sendpage(struct sock *sk, struct page *page,
+			int offset, size_t size, int flags);
+int tls_tx_records(struct sock *sk, int flags);
+
+void tls_sw_write_space(struct sock *sk, struct tls_context *ctx);
+void tls_device_write_space(struct sock *sk, struct tls_context *ctx);
+
+int tls_process_cmsg(struct sock *sk, struct msghdr *msg,
+		     unsigned char *record_type);
+int decrypt_skb(struct sock *sk, struct sk_buff *skb,
+		struct scatterlist *sgout);
+
+int tls_sw_fallback_init(struct sock *sk,
+			 struct tls_offload_context_tx *offload_ctx,
+			 struct tls_crypto_info *crypto_info);
+
+static inline struct tls_msg *tls_msg(struct sk_buff *skb)
+{
+	struct sk_skb_cb *scb = (struct sk_skb_cb *)skb->cb;
+
+	return &scb->tls;
+}
+
+#ifdef CONFIG_TLS_DEVICE
+void tls_device_init(void);
+void tls_device_cleanup(void);
+int tls_set_device_offload(struct sock *sk, struct tls_context *ctx);
+void tls_device_free_resources_tx(struct sock *sk);
+int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx);
+void tls_device_offload_cleanup_rx(struct sock *sk);
+void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq);
+int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx,
+			 struct sk_buff *skb, struct strp_msg *rxm);
+#else
+static inline void tls_device_init(void) {}
+static inline void tls_device_cleanup(void) {}
+
+static inline int
+tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void tls_device_free_resources_tx(struct sock *sk) {}
+
+static inline int
+tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void tls_device_offload_cleanup_rx(struct sock *sk) {}
+static inline void
+tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) {}
+
+static inline int
+tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx,
+		     struct sk_buff *skb, struct strp_msg *rxm)
+{
+	return 0;
+}
+#endif
+
+int tls_push_sg(struct sock *sk, struct tls_context *ctx,
+		struct scatterlist *sg, u16 first_offset,
+		int flags);
+int tls_push_partial_record(struct sock *sk, struct tls_context *ctx,
+			    int flags);
+void tls_free_partial_record(struct sock *sk, struct tls_context *ctx);
+
+static inline bool tls_is_partially_sent_record(struct tls_context *ctx)
+{
+	return !!ctx->partially_sent_record;
+}
+
+static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx)
+{
+	return tls_ctx->pending_open_record_frags;
+}
+
+static inline bool tls_bigint_increment(unsigned char *seq, int len)
+{
+	int i;
+
+	for (i = len - 1; i >= 0; i--) {
+		++seq[i];
+		if (seq[i] != 0)
+			break;
+	}
+
+	return (i == -1);
+}
+
+static inline void tls_bigint_subtract(unsigned char *seq, int  n)
+{
+	u64 rcd_sn;
+	__be64 *p;
+
+	BUILD_BUG_ON(TLS_MAX_REC_SEQ_SIZE != 8);
+
+	p = (__be64 *)seq;
+	rcd_sn = be64_to_cpu(*p);
+	*p = cpu_to_be64(rcd_sn - n);
+}
+
+static inline void
+tls_advance_record_sn(struct sock *sk, struct tls_prot_info *prot,
+		      struct cipher_context *ctx)
+{
+	if (tls_bigint_increment(ctx->rec_seq, prot->rec_seq_size))
+		tls_err_abort(sk, -EBADMSG);
+
+	if (prot->version != TLS_1_3_VERSION &&
+	    prot->cipher_type != TLS_CIPHER_CHACHA20_POLY1305)
+		tls_bigint_increment(ctx->iv + prot->salt_size,
+				     prot->iv_size);
+}
+
+static inline void
+tls_xor_iv_with_seq(struct tls_prot_info *prot, char *iv, char *seq)
+{
+	int i;
+
+	if (prot->version == TLS_1_3_VERSION ||
+	    prot->cipher_type == TLS_CIPHER_CHACHA20_POLY1305) {
+		for (i = 0; i < 8; i++)
+			iv[i + 4] ^= seq[i];
+	}
+}
+
+static inline void
+tls_fill_prepend(struct tls_context *ctx, char *buf, size_t plaintext_len,
+		 unsigned char record_type)
+{
+	struct tls_prot_info *prot = &ctx->prot_info;
+	size_t pkt_len, iv_size = prot->iv_size;
+
+	pkt_len = plaintext_len + prot->tag_size;
+	if (prot->version != TLS_1_3_VERSION &&
+	    prot->cipher_type != TLS_CIPHER_CHACHA20_POLY1305) {
+		pkt_len += iv_size;
+
+		memcpy(buf + TLS_NONCE_OFFSET,
+		       ctx->tx.iv + prot->salt_size, iv_size);
+	}
+
+	/* we cover nonce explicit here as well, so buf should be of
+	 * size KTLS_DTLS_HEADER_SIZE + KTLS_DTLS_NONCE_EXPLICIT_SIZE
+	 */
+	buf[0] = prot->version == TLS_1_3_VERSION ?
+		   TLS_RECORD_TYPE_DATA : record_type;
+	/* Note that VERSION must be TLS_1_2 for both TLS1.2 and TLS1.3 */
+	buf[1] = TLS_1_2_VERSION_MINOR;
+	buf[2] = TLS_1_2_VERSION_MAJOR;
+	/* we can use IV for nonce explicit according to spec */
+	buf[3] = pkt_len >> 8;
+	buf[4] = pkt_len & 0xFF;
+}
+
+static inline
+void tls_make_aad(char *buf, size_t size, char *record_sequence,
+		  unsigned char record_type, struct tls_prot_info *prot)
+{
+	if (prot->version != TLS_1_3_VERSION) {
+		memcpy(buf, record_sequence, prot->rec_seq_size);
+		buf += 8;
+	} else {
+		size += prot->tag_size;
+	}
+
+	buf[0] = prot->version == TLS_1_3_VERSION ?
+		  TLS_RECORD_TYPE_DATA : record_type;
+	buf[1] = TLS_1_2_VERSION_MAJOR;
+	buf[2] = TLS_1_2_VERSION_MINOR;
+	buf[3] = size >> 8;
+	buf[4] = size & 0xFF;
+}
+
+#endif
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index ec6f4b699a2b..227b92a3064a 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -38,6 +38,7 @@
 #include <net/tcp.h>
 #include <net/tls.h>
 
+#include "tls.h"
 #include "trace.h"
 
 /* device_offload_lock is used to synchronize tls_dev_add
@@ -562,7 +563,7 @@ int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 	lock_sock(sk);
 
 	if (unlikely(msg->msg_controllen)) {
-		rc = tls_proccess_cmsg(sk, msg, &record_type);
+		rc = tls_process_cmsg(sk, msg, &record_type);
 		if (rc)
 			goto out;
 	}
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
index 3bae29ae57ca..618cee704217 100644
--- a/net/tls/tls_device_fallback.c
+++ b/net/tls/tls_device_fallback.c
@@ -34,6 +34,8 @@
 #include <crypto/scatterwalk.h>
 #include <net/ip6_checksum.h>
 
+#include "tls.h"
+
 static void chain_to_walk(struct scatterlist *sg, struct scatter_walk *walk)
 {
 	struct scatterlist *src = walk->sg;
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 1b3efc96db0b..f3d9dbfa507e 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -45,6 +45,8 @@
 #include <net/tls.h>
 #include <net/tls_toe.h>
 
+#include "tls.h"
+
 MODULE_AUTHOR("Mellanox Technologies");
 MODULE_DESCRIPTION("Transport Layer Security Support");
 MODULE_LICENSE("Dual BSD/GPL");
@@ -164,8 +166,8 @@ static int tls_handle_open_record(struct sock *sk, int flags)
 	return 0;
 }
 
-int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg,
-		      unsigned char *record_type)
+int tls_process_cmsg(struct sock *sk, struct msghdr *msg,
+		     unsigned char *record_type)
 {
 	struct cmsghdr *cmsg;
 	int rc = -EINVAL;
@@ -1003,6 +1005,23 @@ static void tls_update(struct sock *sk, struct proto *p,
 	}
 }
 
+static u16 tls_user_config(struct tls_context *ctx, bool tx)
+{
+	u16 config = tx ? ctx->tx_conf : ctx->rx_conf;
+
+	switch (config) {
+	case TLS_BASE:
+		return TLS_CONF_BASE;
+	case TLS_SW:
+		return TLS_CONF_SW;
+	case TLS_HW:
+		return TLS_CONF_HW;
+	case TLS_HW_RECORD:
+		return TLS_CONF_HW_RECORD;
+	}
+	return 0;
+}
+
 static int tls_get_info(const struct sock *sk, struct sk_buff *skb)
 {
 	u16 version, cipher_type;
diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c
index 0c200000cc45..1246e52b48f6 100644
--- a/net/tls/tls_proc.c
+++ b/net/tls/tls_proc.c
@@ -6,6 +6,8 @@
 #include <net/snmp.h>
 #include <net/tls.h>
 
+#include "tls.h"
+
 #ifdef CONFIG_PROC_FS
 static const struct snmp_mib tls_mib_list[] = {
 	SNMP_MIB_ITEM("TlsCurrTxSw", LINUX_MIB_TLSCURRTXSW),
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 21c76db8f9b3..1376f866734d 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -44,6 +44,8 @@
 #include <net/strparser.h>
 #include <net/tls.h>
 
+#include "tls.h"
+
 struct tls_decrypt_arg {
 	bool zc;
 	bool async;
@@ -524,7 +526,8 @@ static int tls_do_encryption(struct sock *sk,
 	memcpy(&rec->iv_data[iv_offset], tls_ctx->tx.iv,
 	       prot->iv_size + prot->salt_size);
 
-	xor_iv_with_seq(prot, rec->iv_data + iv_offset, tls_ctx->tx.rec_seq);
+	tls_xor_iv_with_seq(prot, rec->iv_data + iv_offset,
+			    tls_ctx->tx.rec_seq);
 
 	sge->offset += prot->prepend_size;
 	sge->length -= prot->prepend_size;
@@ -961,7 +964,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 	lock_sock(sk);
 
 	if (unlikely(msg->msg_controllen)) {
-		ret = tls_proccess_cmsg(sk, msg, &record_type);
+		ret = tls_process_cmsg(sk, msg, &record_type);
 		if (ret) {
 			if (ret == -EINPROGRESS)
 				num_async++;
@@ -1495,7 +1498,7 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
 			goto exit_free;
 		memcpy(&dctx->iv[iv_offset], tls_ctx->rx.iv, prot->salt_size);
 	}
-	xor_iv_with_seq(prot, &dctx->iv[iv_offset], tls_ctx->rx.rec_seq);
+	tls_xor_iv_with_seq(prot, &dctx->iv[iv_offset], tls_ctx->rx.rec_seq);
 
 	/* Prepare AAD */
 	tls_make_aad(dctx->aad, rxm->full_len - prot->overhead_size +
@@ -2267,12 +2270,23 @@ static void tx_work_handler(struct work_struct *work)
 	mutex_unlock(&tls_ctx->tx_lock);
 }
 
+static bool tls_is_tx_ready(struct tls_sw_context_tx *ctx)
+{
+	struct tls_rec *rec;
+
+	rec = list_first_entry(&ctx->tx_list, struct tls_rec, list);
+	if (!rec)
+		return false;
+
+	return READ_ONCE(rec->tx_ready);
+}
+
 void tls_sw_write_space(struct sock *sk, struct tls_context *ctx)
 {
 	struct tls_sw_context_tx *tx_ctx = tls_sw_ctx_tx(ctx);
 
 	/* Schedule the transmission if tx list is ready */
-	if (is_tx_ready(tx_ctx) &&
+	if (tls_is_tx_ready(tx_ctx) &&
 	    !test_and_set_bit(BIT_TX_SCHEDULED, &tx_ctx->tx_bitmask))
 		schedule_delayed_work(&tx_ctx->tx_work.work, 0);
 }
diff --git a/net/tls/tls_toe.c b/net/tls/tls_toe.c
index 7e1330f19165..825669e1ab47 100644
--- a/net/tls/tls_toe.c
+++ b/net/tls/tls_toe.c
@@ -38,6 +38,8 @@
 #include <net/tls.h>
 #include <net/tls_toe.h>
 
+#include "tls.h"
+
 static LIST_HEAD(device_list);
 static DEFINE_SPINLOCK(device_spinlock);
 
-- 
cgit 


From 2a96271fb66c499e4a89d76a89d3d01170c10bef Mon Sep 17 00:00:00 2001
From: Siarhei Vishniakou <svv@google.com>
Date: Fri, 8 Jul 2022 21:59:23 -0700
Subject: Input: document the units for resolution of size axes

Today, the resolution of size axes is not documented. As a result, it's
not clear what the canonical interpretation of this value should be. On
Android, there is a need to calculate the size of the touch ellipse in
physical units (millimeters).

After reviewing linux source, it turned out that most of the existing
usages are already interpreting this value as "units/mm". This
documentation will make it explicit. This will help device
implementations with correctly following the linux specs, and will
ensure that the devices will work on Android without needing further
customized parameters for scaling of major/minor values.

Signed-off-by: Siarhei Vishniakou <svv@google.com>
Reviewed-by: Jeff LaBundy <jeff@labundy.com>
Link: https://lore.kernel.org/r/20220520084514.3451193-1-svv@google.com
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/uapi/linux/input.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h
index ee3127461ee0..328cf545c029 100644
--- a/include/uapi/linux/input.h
+++ b/include/uapi/linux/input.h
@@ -78,10 +78,13 @@ struct input_id {
  * Note that input core does not clamp reported values to the
  * [minimum, maximum] limits, such task is left to userspace.
  *
- * The default resolution for main axes (ABS_X, ABS_Y, ABS_Z)
- * is reported in units per millimeter (units/mm), resolution
- * for rotational axes (ABS_RX, ABS_RY, ABS_RZ) is reported
- * in units per radian.
+ * The default resolution for main axes (ABS_X, ABS_Y, ABS_Z,
+ * ABS_MT_POSITION_X, ABS_MT_POSITION_Y) is reported in units
+ * per millimeter (units/mm), resolution for rotational axes
+ * (ABS_RX, ABS_RY, ABS_RZ) is reported in units per radian.
+ * The resolution for the size axes (ABS_MT_TOUCH_MAJOR,
+ * ABS_MT_TOUCH_MINOR, ABS_MT_WIDTH_MAJOR, ABS_MT_WIDTH_MINOR)
+ * is reported in units per millimeter (units/mm).
  * When INPUT_PROP_ACCELEROMETER is set the resolution changes.
  * The main axes (ABS_X, ABS_Y, ABS_Z) are then reported in
  * units per g (units/g) and in units per degree per second
-- 
cgit 


From 6cda12864cb0f99810a5809e11e3ee5b102c9a47 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Sun, 3 Jul 2022 21:02:49 -0400
Subject: 9p: Drop kref usage

An upcoming patch is going to require passing the client through
p9_req_put() -> p9_req_free(), but that's awkward with the kref
indirection - so this patch switches to using refcount_t directly.

Link: https://lkml.kernel.org/r/20220704014243.153050-1-kent.overstreet@gmail.com
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Eric Van Hensbergen <ericvh@gmail.com>
Cc: Latchesar Ionkov <lucho@ionkov.net>
Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
---
 include/net/9p/client.h |  6 +++---
 net/9p/client.c         | 19 ++++++++-----------
 2 files changed, 11 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index 8f629f1df865..ad38325aaef0 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -77,7 +77,7 @@ enum p9_req_status_t {
 struct p9_req_t {
 	int status;
 	int t_err;
-	struct kref refcount;
+	refcount_t refcount;
 	wait_queue_head_t wq;
 	struct p9_fcall tc;
 	struct p9_fcall rc;
@@ -228,12 +228,12 @@ struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag);
 
 static inline void p9_req_get(struct p9_req_t *r)
 {
-	kref_get(&r->refcount);
+	refcount_inc(&r->refcount);
 }
 
 static inline int p9_req_try_get(struct p9_req_t *r)
 {
-	return kref_get_unless_zero(&r->refcount);
+	return refcount_inc_not_zero(&r->refcount);
 }
 
 int p9_req_put(struct p9_req_t *r);
diff --git a/net/9p/client.c b/net/9p/client.c
index dfe8beb864fc..3717d3f21a9f 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -305,7 +305,7 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size)
 	 * callback), so p9_client_cb eats the second ref there
 	 * as the pointer is duplicated directly by virtqueue_add_sgs()
 	 */
-	refcount_set(&req->refcount.refcount, 2);
+	refcount_set(&req->refcount, 2);
 
 	return req;
 
@@ -370,18 +370,15 @@ static int p9_tag_remove(struct p9_client *c, struct p9_req_t *r)
 	return p9_req_put(r);
 }
 
-static void p9_req_free(struct kref *ref)
-{
-	struct p9_req_t *r = container_of(ref, struct p9_req_t, refcount);
-
-	p9_fcall_fini(&r->tc);
-	p9_fcall_fini(&r->rc);
-	kmem_cache_free(p9_req_cache, r);
-}
-
 int p9_req_put(struct p9_req_t *r)
 {
-	return kref_put(&r->refcount, p9_req_free);
+	if (refcount_dec_and_test(&r->refcount)) {
+		p9_fcall_fini(&r->tc);
+		p9_fcall_fini(&r->rc);
+		kmem_cache_free(p9_req_cache, r);
+		return 1;
+	}
+	return 0;
 }
 EXPORT_SYMBOL(p9_req_put);
 
-- 
cgit 


From 8b11ff098af42b1fa57fc817daadd53c8b244a0c Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Sun, 3 Jul 2022 21:08:18 -0400
Subject: 9p: Add client parameter to p9_req_put()

This is to aid in adding mempools, in the next patch.

Link: https://lkml.kernel.org/r/20220704014243.153050-2-kent.overstreet@gmail.com
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Eric Van Hensbergen <ericvh@gmail.com>
Cc: Latchesar Ionkov <lucho@ionkov.net>
Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
---
 include/net/9p/client.h |  2 +-
 net/9p/client.c         | 12 ++++++------
 net/9p/trans_fd.c       | 12 ++++++------
 net/9p/trans_rdma.c     |  2 +-
 net/9p/trans_virtio.c   |  4 ++--
 net/9p/trans_xen.c      |  2 +-
 6 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index ad38325aaef0..78ebcf782ce5 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -236,7 +236,7 @@ static inline int p9_req_try_get(struct p9_req_t *r)
 	return refcount_inc_not_zero(&r->refcount);
 }
 
-int p9_req_put(struct p9_req_t *r);
+int p9_req_put(struct p9_client *c, struct p9_req_t *r);
 
 /* We cannot have the real tracepoints in header files,
  * use a wrapper function */
diff --git a/net/9p/client.c b/net/9p/client.c
index 3717d3f21a9f..0be19ab1f693 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -341,7 +341,7 @@ again:
 		if (!p9_req_try_get(req))
 			goto again;
 		if (req->tc.tag != tag) {
-			p9_req_put(req);
+			p9_req_put(c, req);
 			goto again;
 		}
 	}
@@ -367,10 +367,10 @@ static int p9_tag_remove(struct p9_client *c, struct p9_req_t *r)
 	spin_lock_irqsave(&c->lock, flags);
 	idr_remove(&c->reqs, tag);
 	spin_unlock_irqrestore(&c->lock, flags);
-	return p9_req_put(r);
+	return p9_req_put(c, r);
 }
 
-int p9_req_put(struct p9_req_t *r)
+int p9_req_put(struct p9_client *c, struct p9_req_t *r)
 {
 	if (refcount_dec_and_test(&r->refcount)) {
 		p9_fcall_fini(&r->tc);
@@ -423,7 +423,7 @@ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status)
 
 	wake_up(&req->wq);
 	p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc.tag);
-	p9_req_put(req);
+	p9_req_put(c, req);
 }
 EXPORT_SYMBOL(p9_client_cb);
 
@@ -706,7 +706,7 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c,
 reterr:
 	p9_tag_remove(c, req);
 	/* We have to put also the 2nd reference as it won't be used */
-	p9_req_put(req);
+	p9_req_put(c, req);
 	return ERR_PTR(err);
 }
 
@@ -743,7 +743,7 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
 	err = c->trans_mod->request(c, req);
 	if (err < 0) {
 		/* write won't happen */
-		p9_req_put(req);
+		p9_req_put(c, req);
 		if (err != -ERESTARTSYS && err != -EFAULT)
 			c->status = Disconnected;
 		goto recalc_sigpending;
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 8f8f95e39b03..007c3f45fe05 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -378,7 +378,7 @@ static void p9_read_work(struct work_struct *work)
 		m->rc.sdata = NULL;
 		m->rc.offset = 0;
 		m->rc.capacity = 0;
-		p9_req_put(m->rreq);
+		p9_req_put(m->client, m->rreq);
 		m->rreq = NULL;
 	}
 
@@ -492,7 +492,7 @@ static void p9_write_work(struct work_struct *work)
 	m->wpos += err;
 	if (m->wpos == m->wsize) {
 		m->wpos = m->wsize = 0;
-		p9_req_put(m->wreq);
+		p9_req_put(m->client, m->wreq);
 		m->wreq = NULL;
 	}
 
@@ -695,7 +695,7 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req)
 	if (req->status == REQ_STATUS_UNSENT) {
 		list_del(&req->req_list);
 		req->status = REQ_STATUS_FLSHD;
-		p9_req_put(req);
+		p9_req_put(client, req);
 		ret = 0;
 	}
 	spin_unlock(&client->lock);
@@ -722,7 +722,7 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req)
 	list_del(&req->req_list);
 	req->status = REQ_STATUS_FLSHD;
 	spin_unlock(&client->lock);
-	p9_req_put(req);
+	p9_req_put(client, req);
 
 	return 0;
 }
@@ -883,12 +883,12 @@ static void p9_conn_destroy(struct p9_conn *m)
 	p9_mux_poll_stop(m);
 	cancel_work_sync(&m->rq);
 	if (m->rreq) {
-		p9_req_put(m->rreq);
+		p9_req_put(m->client, m->rreq);
 		m->rreq = NULL;
 	}
 	cancel_work_sync(&m->wq);
 	if (m->wreq) {
-		p9_req_put(m->wreq);
+		p9_req_put(m->client, m->wreq);
 		m->wreq = NULL;
 	}
 
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 88e563826674..d817d3745238 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -350,7 +350,7 @@ send_done(struct ib_cq *cq, struct ib_wc *wc)
 			    c->busa, c->req->tc.size,
 			    DMA_TO_DEVICE);
 	up(&rdma->sq_sem);
-	p9_req_put(c->req);
+	p9_req_put(client, c->req);
 	kfree(c);
 }
 
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index b24a4fb0f0a2..147972bf2e79 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -199,7 +199,7 @@ static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req)
 /* Reply won't come, so drop req ref */
 static int p9_virtio_cancelled(struct p9_client *client, struct p9_req_t *req)
 {
-	p9_req_put(req);
+	p9_req_put(client, req);
 	return 0;
 }
 
@@ -523,7 +523,7 @@ err_out:
 	kvfree(out_pages);
 	if (!kicked) {
 		/* reply won't come */
-		p9_req_put(req);
+		p9_req_put(client, req);
 	}
 	return err;
 }
diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
index 833cd3792c51..227f89cc7237 100644
--- a/net/9p/trans_xen.c
+++ b/net/9p/trans_xen.c
@@ -163,7 +163,7 @@ again:
 	ring->intf->out_prod = prod;
 	spin_unlock_irqrestore(&ring->lock, flags);
 	notify_remote_via_irq(ring->irq);
-	p9_req_put(p9_req);
+	p9_req_put(client, p9_req);
 
 	return 0;
 }
-- 
cgit 


From f7657ff4a7097eaf5220776456b7d75eb09062cb Mon Sep 17 00:00:00 2001
From: Geliang Tang <geliang.tang@suse.com>
Date: Fri, 8 Jul 2022 10:14:08 -0700
Subject: mptcp: move MPTCPOPT_HMAC_LEN to net/mptcp.h

Move macro MPTCPOPT_HMAC_LEN definition from net/mptcp/protocol.h to
include/net/mptcp.h.

Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Geliang Tang <geliang.tang@suse.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/mptcp.h  | 3 ++-
 net/mptcp/protocol.h | 1 -
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index 4d761ad530c9..ac9cf7271d46 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -39,6 +39,7 @@ struct mptcp_ext {
 			infinite_map:1;
 };
 
+#define MPTCPOPT_HMAC_LEN	20
 #define MPTCP_RM_IDS_MAX	8
 
 struct mptcp_rm_list {
@@ -89,7 +90,7 @@ struct mptcp_out_options {
 			u32 nonce;
 			u32 token;
 			u64 thmac;
-			u8 hmac[20];
+			u8 hmac[MPTCPOPT_HMAC_LEN];
 		};
 	};
 #endif
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 480c5320b86e..07871e10e510 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -83,7 +83,6 @@
 
 /* MPTCP MP_JOIN flags */
 #define MPTCPOPT_BACKUP		BIT(0)
-#define MPTCPOPT_HMAC_LEN	20
 #define MPTCPOPT_THMAC_LEN	8
 
 /* MPTCP MP_CAPABLE flags */
-- 
cgit 


From c39ba4de6b0a843bec5d46c2b6f2064428dada5e Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 5 Jul 2022 11:41:59 +0200
Subject: netfilter: nf_tables: replace BUG_ON by element length check

BUG_ON can be triggered from userspace with an element with a large
userdata area. Replace it by length check and return EINVAL instead.
Over time extensions have been growing in size.

Pick a sufficiently old Fixes: tag to propagate this fix.

Fixes: 7d7402642eaf ("netfilter: nf_tables: variable sized set element keys / data")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 14 +++++---
 net/netfilter/nf_tables_api.c     | 72 +++++++++++++++++++++++++++------------
 2 files changed, 60 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 5c4e5a96a984..64cf655c818c 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -657,18 +657,22 @@ static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl)
 	tmpl->len = sizeof(struct nft_set_ext);
 }
 
-static inline void nft_set_ext_add_length(struct nft_set_ext_tmpl *tmpl, u8 id,
-					  unsigned int len)
+static inline int nft_set_ext_add_length(struct nft_set_ext_tmpl *tmpl, u8 id,
+					 unsigned int len)
 {
 	tmpl->len	 = ALIGN(tmpl->len, nft_set_ext_types[id].align);
-	BUG_ON(tmpl->len > U8_MAX);
+	if (tmpl->len > U8_MAX)
+		return -EINVAL;
+
 	tmpl->offset[id] = tmpl->len;
 	tmpl->len	+= nft_set_ext_types[id].len + len;
+
+	return 0;
 }
 
-static inline void nft_set_ext_add(struct nft_set_ext_tmpl *tmpl, u8 id)
+static inline int nft_set_ext_add(struct nft_set_ext_tmpl *tmpl, u8 id)
 {
-	nft_set_ext_add_length(tmpl, id, 0);
+	return nft_set_ext_add_length(tmpl, id, 0);
 }
 
 static inline void nft_set_ext_init(struct nft_set_ext *ext,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index d6b59beab3a9..646d5fd53604 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5833,8 +5833,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL))
 		return -EINVAL;
 
-	if (flags != 0)
-		nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
+	if (flags != 0) {
+		err = nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
+		if (err < 0)
+			return err;
+	}
 
 	if (set->flags & NFT_SET_MAP) {
 		if (nla[NFTA_SET_ELEM_DATA] == NULL &&
@@ -5943,7 +5946,9 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 		if (err < 0)
 			goto err_set_elem_expr;
 
-		nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+		err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+		if (err < 0)
+			goto err_parse_key;
 	}
 
 	if (nla[NFTA_SET_ELEM_KEY_END]) {
@@ -5952,22 +5957,31 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 		if (err < 0)
 			goto err_parse_key;
 
-		nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
+		err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
+		if (err < 0)
+			goto err_parse_key_end;
 	}
 
 	if (timeout > 0) {
-		nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION);
-		if (timeout != set->timeout)
-			nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
+		err = nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION);
+		if (err < 0)
+			goto err_parse_key_end;
+
+		if (timeout != set->timeout) {
+			err = nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
+			if (err < 0)
+				goto err_parse_key_end;
+		}
 	}
 
 	if (num_exprs) {
 		for (i = 0; i < num_exprs; i++)
 			size += expr_array[i]->ops->size;
 
-		nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPRESSIONS,
-				       sizeof(struct nft_set_elem_expr) +
-				       size);
+		err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPRESSIONS,
+					     sizeof(struct nft_set_elem_expr) + size);
+		if (err < 0)
+			goto err_parse_key_end;
 	}
 
 	if (nla[NFTA_SET_ELEM_OBJREF] != NULL) {
@@ -5982,7 +5996,9 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 			err = PTR_ERR(obj);
 			goto err_parse_key_end;
 		}
-		nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF);
+		err = nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF);
+		if (err < 0)
+			goto err_parse_key_end;
 	}
 
 	if (nla[NFTA_SET_ELEM_DATA] != NULL) {
@@ -6016,7 +6032,9 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 							  NFT_VALIDATE_NEED);
 		}
 
-		nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len);
+		err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len);
+		if (err < 0)
+			goto err_parse_data;
 	}
 
 	/* The full maximum length of userdata can exceed the maximum
@@ -6026,9 +6044,12 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	ulen = 0;
 	if (nla[NFTA_SET_ELEM_USERDATA] != NULL) {
 		ulen = nla_len(nla[NFTA_SET_ELEM_USERDATA]);
-		if (ulen > 0)
-			nft_set_ext_add_length(&tmpl, NFT_SET_EXT_USERDATA,
-					       ulen);
+		if (ulen > 0) {
+			err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_USERDATA,
+						     ulen);
+			if (err < 0)
+				goto err_parse_data;
+		}
 	}
 
 	err = -ENOMEM;
@@ -6256,8 +6277,11 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
 
 	nft_set_ext_prepare(&tmpl);
 
-	if (flags != 0)
-		nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
+	if (flags != 0) {
+		err = nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
+		if (err < 0)
+			return err;
+	}
 
 	if (nla[NFTA_SET_ELEM_KEY]) {
 		err = nft_setelem_parse_key(ctx, set, &elem.key.val,
@@ -6265,16 +6289,20 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
 		if (err < 0)
 			return err;
 
-		nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+		err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+		if (err < 0)
+			goto fail_elem;
 	}
 
 	if (nla[NFTA_SET_ELEM_KEY_END]) {
 		err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
 					    nla[NFTA_SET_ELEM_KEY_END]);
 		if (err < 0)
-			return err;
+			goto fail_elem;
 
-		nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
+		err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
+		if (err < 0)
+			goto fail_elem_key_end;
 	}
 
 	err = -ENOMEM;
@@ -6282,7 +6310,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
 				      elem.key_end.val.data, NULL, 0, 0,
 				      GFP_KERNEL_ACCOUNT);
 	if (elem.priv == NULL)
-		goto fail_elem;
+		goto fail_elem_key_end;
 
 	ext = nft_set_elem_ext(set, elem.priv);
 	if (flags)
@@ -6306,6 +6334,8 @@ fail_ops:
 	kfree(trans);
 fail_trans:
 	kfree(elem.priv);
+fail_elem_key_end:
+	nft_data_release(&elem.key_end.val, NFT_DATA_VALUE);
 fail_elem:
 	nft_data_release(&elem.key.val, NFT_DATA_VALUE);
 	return err;
-- 
cgit 


From 91a29af413def677495e447fb9a06957ebc8bed5 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 7 Jul 2022 19:23:09 +0100
Subject: gpio: Remove dynamic allocation from populate_parent_alloc_arg()

The gpiolib is unique in the way it uses intermediate fwspecs
when feeding an interrupt specifier to the parent domain, as it
relies on the populate_parent_alloc_arg() callback to perform
a dynamic allocation.

This is pretty inefficient (we free the structure almost immediately),
and the only reason this isn't a stack allocation is that our
ThunderX friend uses MSIs rather than a FW-constructed structure.

Let's solve it by providing a new type composed of the union
of a struct irq_fwspec and a msi_info_t, which satisfies both
requirements. This allows us to use a stack allocation, and we
can move the handful of users to this new scheme.

Also perform some additional cleanup, such as getting rid of the
stub versions of the irq_domain_translate_*cell helpers, which
are never used when CONFIG_IRQ_DOMAIN_HIERARCHY isn't selected.

Tested on a Tegra186.

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Cc: Daniel Palmer <daniel@thingy.jp>
Cc: Romain Perier <romain.perier@gmail.com>
Cc: Bartosz Golaszewski <brgl@bgdev.pl>
Cc: Thierry Reding <thierry.reding@gmail.com>
Cc: Jonathan Hunter <jonathanh@nvidia.com>
Cc: Robert Richter <rric@kernel.org>
Cc: Nobuhiro Iwamatsu <nobuhiro1.iwamatsu@toshiba.co.jp>
Cc: Andy Gross <agross@kernel.org>
Cc: Bjorn Andersson <bjorn.andersson@linaro.org>
Acked-by: Bartosz Golaszewski <brgl@bgdev.pl>
Link: https://lore.kernel.org/r/20220707182314.66610-2-prabhakar.mahadev-lad.rj@bp.renesas.com
---
 drivers/gpio/gpio-msc313.c               | 15 +++++-------
 drivers/gpio/gpio-tegra.c                | 15 +++++-------
 drivers/gpio/gpio-tegra186.c             | 15 +++++-------
 drivers/gpio/gpio-thunderx.c             | 15 +++++-------
 drivers/gpio/gpio-visconti.c             | 15 +++++-------
 drivers/gpio/gpiolib.c                   | 42 ++++++++++++++------------------
 drivers/pinctrl/qcom/pinctrl-spmi-gpio.c | 15 +++++-------
 include/linux/gpio/driver.h              | 42 +++++++++++++++-----------------
 8 files changed, 73 insertions(+), 101 deletions(-)

(limited to 'include')

diff --git a/drivers/gpio/gpio-msc313.c b/drivers/gpio/gpio-msc313.c
index b2c90bdd39d0..52d7b8d99170 100644
--- a/drivers/gpio/gpio-msc313.c
+++ b/drivers/gpio/gpio-msc313.c
@@ -550,15 +550,12 @@ static struct irq_chip msc313_gpio_irqchip = {
  * so we need to provide the fwspec. Essentially gpiochip_populate_parent_fwspec_twocell
  * that puts GIC_SPI into the first cell.
  */
-static void *msc313_gpio_populate_parent_fwspec(struct gpio_chip *gc,
-					     unsigned int parent_hwirq,
-					     unsigned int parent_type)
+static int msc313_gpio_populate_parent_fwspec(struct gpio_chip *gc,
+					      union gpio_irq_fwspec *gfwspec,
+					      unsigned int parent_hwirq,
+					      unsigned int parent_type)
 {
-	struct irq_fwspec *fwspec;
-
-	fwspec = kmalloc(sizeof(*fwspec), GFP_KERNEL);
-	if (!fwspec)
-		return NULL;
+	struct irq_fwspec *fwspec = &gfwspec->fwspec;
 
 	fwspec->fwnode = gc->irq.parent_domain->fwnode;
 	fwspec->param_count = 3;
@@ -566,7 +563,7 @@ static void *msc313_gpio_populate_parent_fwspec(struct gpio_chip *gc,
 	fwspec->param[1] = parent_hwirq;
 	fwspec->param[2] = parent_type;
 
-	return fwspec;
+	return 0;
 }
 
 static int msc313e_gpio_child_to_parent_hwirq(struct gpio_chip *chip,
diff --git a/drivers/gpio/gpio-tegra.c b/drivers/gpio/gpio-tegra.c
index ff2d2a1f9c73..e4fb4cb38a0f 100644
--- a/drivers/gpio/gpio-tegra.c
+++ b/drivers/gpio/gpio-tegra.c
@@ -443,15 +443,12 @@ static int tegra_gpio_child_to_parent_hwirq(struct gpio_chip *chip,
 	return 0;
 }
 
-static void *tegra_gpio_populate_parent_fwspec(struct gpio_chip *chip,
-					       unsigned int parent_hwirq,
-					       unsigned int parent_type)
+static int tegra_gpio_populate_parent_fwspec(struct gpio_chip *chip,
+					     union gpio_irq_fwspec *gfwspec,
+					     unsigned int parent_hwirq,
+					     unsigned int parent_type)
 {
-	struct irq_fwspec *fwspec;
-
-	fwspec = kmalloc(sizeof(*fwspec), GFP_KERNEL);
-	if (!fwspec)
-		return NULL;
+	struct irq_fwspec *fwspec = &gfwspec->fwspec;
 
 	fwspec->fwnode = chip->irq.parent_domain->fwnode;
 	fwspec->param_count = 3;
@@ -459,7 +456,7 @@ static void *tegra_gpio_populate_parent_fwspec(struct gpio_chip *chip,
 	fwspec->param[1] = parent_hwirq;
 	fwspec->param[2] = parent_type;
 
-	return fwspec;
+	return 0;
 }
 
 #ifdef CONFIG_PM_SLEEP
diff --git a/drivers/gpio/gpio-tegra186.c b/drivers/gpio/gpio-tegra186.c
index de28a68daea0..54d9fa7da9c1 100644
--- a/drivers/gpio/gpio-tegra186.c
+++ b/drivers/gpio/gpio-tegra186.c
@@ -621,16 +621,13 @@ static int tegra186_gpio_irq_domain_translate(struct irq_domain *domain,
 	return 0;
 }
 
-static void *tegra186_gpio_populate_parent_fwspec(struct gpio_chip *chip,
-						 unsigned int parent_hwirq,
-						 unsigned int parent_type)
+static int tegra186_gpio_populate_parent_fwspec(struct gpio_chip *chip,
+						union gpio_irq_fwspec *gfwspec,
+						unsigned int parent_hwirq,
+						unsigned int parent_type)
 {
 	struct tegra_gpio *gpio = gpiochip_get_data(chip);
-	struct irq_fwspec *fwspec;
-
-	fwspec = kmalloc(sizeof(*fwspec), GFP_KERNEL);
-	if (!fwspec)
-		return NULL;
+	struct irq_fwspec *fwspec = &gfwspec->fwspec;
 
 	fwspec->fwnode = chip->irq.parent_domain->fwnode;
 	fwspec->param_count = 3;
@@ -638,7 +635,7 @@ static void *tegra186_gpio_populate_parent_fwspec(struct gpio_chip *chip,
 	fwspec->param[1] = parent_hwirq;
 	fwspec->param[2] = parent_type;
 
-	return fwspec;
+	return 0;
 }
 
 static int tegra186_gpio_child_to_parent_hwirq(struct gpio_chip *chip,
diff --git a/drivers/gpio/gpio-thunderx.c b/drivers/gpio/gpio-thunderx.c
index 9f66deab46ea..e1dedbca0c85 100644
--- a/drivers/gpio/gpio-thunderx.c
+++ b/drivers/gpio/gpio-thunderx.c
@@ -408,18 +408,15 @@ static int thunderx_gpio_child_to_parent_hwirq(struct gpio_chip *gc,
 	return 0;
 }
 
-static void *thunderx_gpio_populate_parent_alloc_info(struct gpio_chip *chip,
-						      unsigned int parent_hwirq,
-						      unsigned int parent_type)
+static int thunderx_gpio_populate_parent_alloc_info(struct gpio_chip *chip,
+						    union gpio_irq_fwspec *gfwspec,
+						    unsigned int parent_hwirq,
+						    unsigned int parent_type)
 {
-	msi_alloc_info_t *info;
-
-	info = kmalloc(sizeof(*info), GFP_KERNEL);
-	if (!info)
-		return NULL;
+	msi_alloc_info_t *info = &gfwspec->msiinfo;
 
 	info->hwirq = parent_hwirq;
-	return info;
+	return 0;
 }
 
 static int thunderx_gpio_probe(struct pci_dev *pdev,
diff --git a/drivers/gpio/gpio-visconti.c b/drivers/gpio/gpio-visconti.c
index e6534ea1eaa7..5e108ba9956a 100644
--- a/drivers/gpio/gpio-visconti.c
+++ b/drivers/gpio/gpio-visconti.c
@@ -103,15 +103,12 @@ static int visconti_gpio_child_to_parent_hwirq(struct gpio_chip *gc,
 	return -EINVAL;
 }
 
-static void *visconti_gpio_populate_parent_fwspec(struct gpio_chip *chip,
-						  unsigned int parent_hwirq,
-						  unsigned int parent_type)
+static int visconti_gpio_populate_parent_fwspec(struct gpio_chip *chip,
+						union gpio_irq_fwspec *gfwspec,
+						unsigned int parent_hwirq,
+						unsigned int parent_type)
 {
-	struct irq_fwspec *fwspec;
-
-	fwspec = kmalloc(sizeof(*fwspec), GFP_KERNEL);
-	if (!fwspec)
-		return NULL;
+	struct irq_fwspec *fwspec = &gfwspec->fwspec;
 
 	fwspec->fwnode = chip->irq.parent_domain->fwnode;
 	fwspec->param_count = 3;
@@ -119,7 +116,7 @@ static void *visconti_gpio_populate_parent_fwspec(struct gpio_chip *chip,
 	fwspec->param[1] = parent_hwirq;
 	fwspec->param[2] = parent_type;
 
-	return fwspec;
+	return 0;
 }
 
 static int visconti_gpio_probe(struct platform_device *pdev)
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 9535f48e18d1..bfde94243752 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -1107,7 +1107,7 @@ static int gpiochip_hierarchy_irq_domain_alloc(struct irq_domain *d,
 	irq_hw_number_t hwirq;
 	unsigned int type = IRQ_TYPE_NONE;
 	struct irq_fwspec *fwspec = data;
-	void *parent_arg;
+	union gpio_irq_fwspec gpio_parent_fwspec = {};
 	unsigned int parent_hwirq;
 	unsigned int parent_type;
 	struct gpio_irq_chip *girq = &gc->irq;
@@ -1147,14 +1147,15 @@ static int gpiochip_hierarchy_irq_domain_alloc(struct irq_domain *d,
 	irq_set_probe(irq);
 
 	/* This parent only handles asserted level IRQs */
-	parent_arg = girq->populate_parent_alloc_arg(gc, parent_hwirq, parent_type);
-	if (!parent_arg)
-		return -ENOMEM;
+	ret = girq->populate_parent_alloc_arg(gc, &gpio_parent_fwspec,
+					      parent_hwirq, parent_type);
+	if (ret)
+		return ret;
 
 	chip_dbg(gc, "alloc_irqs_parent for %d parent hwirq %d\n",
 		  irq, parent_hwirq);
 	irq_set_lockdep_class(irq, gc->irq.lock_key, gc->irq.request_key);
-	ret = irq_domain_alloc_irqs_parent(d, irq, 1, parent_arg);
+	ret = irq_domain_alloc_irqs_parent(d, irq, 1, &gpio_parent_fwspec);
 	/*
 	 * If the parent irqdomain is msi, the interrupts have already
 	 * been allocated, so the EEXIST is good.
@@ -1166,7 +1167,6 @@ static int gpiochip_hierarchy_irq_domain_alloc(struct irq_domain *d,
 			 "failed to allocate parent hwirq %d for hwirq %lu\n",
 			 parent_hwirq, hwirq);
 
-	kfree(parent_arg);
 	return ret;
 }
 
@@ -1230,34 +1230,28 @@ static bool gpiochip_hierarchy_is_hierarchical(struct gpio_chip *gc)
 	return !!gc->irq.parent_domain;
 }
 
-void *gpiochip_populate_parent_fwspec_twocell(struct gpio_chip *gc,
-					     unsigned int parent_hwirq,
-					     unsigned int parent_type)
+int gpiochip_populate_parent_fwspec_twocell(struct gpio_chip *gc,
+					    union gpio_irq_fwspec *gfwspec,
+					    unsigned int parent_hwirq,
+					    unsigned int parent_type)
 {
-	struct irq_fwspec *fwspec;
-
-	fwspec = kmalloc(sizeof(*fwspec), GFP_KERNEL);
-	if (!fwspec)
-		return NULL;
+	struct irq_fwspec *fwspec = &gfwspec->fwspec;
 
 	fwspec->fwnode = gc->irq.parent_domain->fwnode;
 	fwspec->param_count = 2;
 	fwspec->param[0] = parent_hwirq;
 	fwspec->param[1] = parent_type;
 
-	return fwspec;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(gpiochip_populate_parent_fwspec_twocell);
 
-void *gpiochip_populate_parent_fwspec_fourcell(struct gpio_chip *gc,
-					      unsigned int parent_hwirq,
-					      unsigned int parent_type)
+int gpiochip_populate_parent_fwspec_fourcell(struct gpio_chip *gc,
+					     union gpio_irq_fwspec *gfwspec,
+					     unsigned int parent_hwirq,
+					     unsigned int parent_type)
 {
-	struct irq_fwspec *fwspec;
-
-	fwspec = kmalloc(sizeof(*fwspec), GFP_KERNEL);
-	if (!fwspec)
-		return NULL;
+	struct irq_fwspec *fwspec = &gfwspec->fwspec;
 
 	fwspec->fwnode = gc->irq.parent_domain->fwnode;
 	fwspec->param_count = 4;
@@ -1266,7 +1260,7 @@ void *gpiochip_populate_parent_fwspec_fourcell(struct gpio_chip *gc,
 	fwspec->param[2] = 0;
 	fwspec->param[3] = parent_type;
 
-	return fwspec;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(gpiochip_populate_parent_fwspec_fourcell);
 
diff --git a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c
index fd5fff9adff0..3be2a08ae3a6 100644
--- a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c
+++ b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c
@@ -966,16 +966,13 @@ static int pmic_gpio_child_to_parent_hwirq(struct gpio_chip *chip,
 	return 0;
 }
 
-static void *pmic_gpio_populate_parent_fwspec(struct gpio_chip *chip,
-					     unsigned int parent_hwirq,
-					     unsigned int parent_type)
+static int pmic_gpio_populate_parent_fwspec(struct gpio_chip *chip,
+					    union gpio_irq_fwspec *gfwspec,
+					    unsigned int parent_hwirq,
+					    unsigned int parent_type)
 {
 	struct pmic_gpio_state *state = gpiochip_get_data(chip);
-	struct irq_fwspec *fwspec;
-
-	fwspec = kzalloc(sizeof(*fwspec), GFP_KERNEL);
-	if (!fwspec)
-		return NULL;
+	struct irq_fwspec *fwspec = &gfwspec->fwspec;
 
 	fwspec->fwnode = chip->irq.parent_domain->fwnode;
 
@@ -985,7 +982,7 @@ static void *pmic_gpio_populate_parent_fwspec(struct gpio_chip *chip,
 	/* param[2] must be left as 0 */
 	fwspec->param[3] = parent_type;
 
-	return fwspec;
+	return 0;
 }
 
 static int pmic_gpio_probe(struct platform_device *pdev)
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index b1e0f1f8ee2e..ad5b92b74ccf 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -12,6 +12,8 @@
 #include <linux/property.h>
 #include <linux/types.h>
 
+#include <asm/msi.h>
+
 struct gpio_desc;
 struct of_phandle_args;
 struct device_node;
@@ -23,6 +25,13 @@ enum gpio_lookup_flags;
 
 struct gpio_chip;
 
+union gpio_irq_fwspec {
+	struct irq_fwspec	fwspec;
+#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
+	msi_alloc_info_t	msiinfo;
+#endif
+};
+
 #define GPIO_LINE_DIRECTION_IN	1
 #define GPIO_LINE_DIRECTION_OUT	0
 
@@ -103,9 +112,10 @@ struct gpio_irq_chip {
 	 * variant named &gpiochip_populate_parent_fwspec_fourcell is also
 	 * available.
 	 */
-	void *(*populate_parent_alloc_arg)(struct gpio_chip *gc,
-				       unsigned int parent_hwirq,
-				       unsigned int parent_type);
+	int (*populate_parent_alloc_arg)(struct gpio_chip *gc,
+					 union gpio_irq_fwspec *fwspec,
+					 unsigned int parent_hwirq,
+					 unsigned int parent_type);
 
 	/**
 	 * @child_offset_to_irq:
@@ -646,28 +656,14 @@ struct bgpio_pdata {
 
 #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
 
-void *gpiochip_populate_parent_fwspec_twocell(struct gpio_chip *gc,
+int gpiochip_populate_parent_fwspec_twocell(struct gpio_chip *gc,
+					    union gpio_irq_fwspec *gfwspec,
+					    unsigned int parent_hwirq,
+					    unsigned int parent_type);
+int gpiochip_populate_parent_fwspec_fourcell(struct gpio_chip *gc,
+					     union gpio_irq_fwspec *gfwspec,
 					     unsigned int parent_hwirq,
 					     unsigned int parent_type);
-void *gpiochip_populate_parent_fwspec_fourcell(struct gpio_chip *gc,
-					      unsigned int parent_hwirq,
-					      unsigned int parent_type);
-
-#else
-
-static inline void *gpiochip_populate_parent_fwspec_twocell(struct gpio_chip *gc,
-						    unsigned int parent_hwirq,
-						    unsigned int parent_type)
-{
-	return NULL;
-}
-
-static inline void *gpiochip_populate_parent_fwspec_fourcell(struct gpio_chip *gc,
-						     unsigned int parent_hwirq,
-						     unsigned int parent_type)
-{
-	return NULL;
-}
 
 #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
 
-- 
cgit 


From 1dd685c414a7b9fdb3d23aca3aedae84f0b998ae Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 6 Jul 2022 14:51:00 -0400
Subject: XArray: Add calls to might_alloc()

Catch bogus GFP flags deterministically, instead of occasionally
when we actually have to allocate memory.

Reported-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/xarray.h         | 15 +++++++++++++++
 tools/include/linux/sched/mm.h |  2 ++
 2 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index c29e11b2c073..44dd6d6e01bc 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -16,6 +16,7 @@
 #include <linux/kconfig.h>
 #include <linux/kernel.h>
 #include <linux/rcupdate.h>
+#include <linux/sched/mm.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
 
@@ -586,6 +587,7 @@ static inline void *xa_store_bh(struct xarray *xa, unsigned long index,
 {
 	void *curr;
 
+	might_alloc(gfp);
 	xa_lock_bh(xa);
 	curr = __xa_store(xa, index, entry, gfp);
 	xa_unlock_bh(xa);
@@ -612,6 +614,7 @@ static inline void *xa_store_irq(struct xarray *xa, unsigned long index,
 {
 	void *curr;
 
+	might_alloc(gfp);
 	xa_lock_irq(xa);
 	curr = __xa_store(xa, index, entry, gfp);
 	xa_unlock_irq(xa);
@@ -687,6 +690,7 @@ static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index,
 {
 	void *curr;
 
+	might_alloc(gfp);
 	xa_lock(xa);
 	curr = __xa_cmpxchg(xa, index, old, entry, gfp);
 	xa_unlock(xa);
@@ -714,6 +718,7 @@ static inline void *xa_cmpxchg_bh(struct xarray *xa, unsigned long index,
 {
 	void *curr;
 
+	might_alloc(gfp);
 	xa_lock_bh(xa);
 	curr = __xa_cmpxchg(xa, index, old, entry, gfp);
 	xa_unlock_bh(xa);
@@ -741,6 +746,7 @@ static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index,
 {
 	void *curr;
 
+	might_alloc(gfp);
 	xa_lock_irq(xa);
 	curr = __xa_cmpxchg(xa, index, old, entry, gfp);
 	xa_unlock_irq(xa);
@@ -770,6 +776,7 @@ static inline int __must_check xa_insert(struct xarray *xa,
 {
 	int err;
 
+	might_alloc(gfp);
 	xa_lock(xa);
 	err = __xa_insert(xa, index, entry, gfp);
 	xa_unlock(xa);
@@ -799,6 +806,7 @@ static inline int __must_check xa_insert_bh(struct xarray *xa,
 {
 	int err;
 
+	might_alloc(gfp);
 	xa_lock_bh(xa);
 	err = __xa_insert(xa, index, entry, gfp);
 	xa_unlock_bh(xa);
@@ -828,6 +836,7 @@ static inline int __must_check xa_insert_irq(struct xarray *xa,
 {
 	int err;
 
+	might_alloc(gfp);
 	xa_lock_irq(xa);
 	err = __xa_insert(xa, index, entry, gfp);
 	xa_unlock_irq(xa);
@@ -857,6 +866,7 @@ static inline __must_check int xa_alloc(struct xarray *xa, u32 *id,
 {
 	int err;
 
+	might_alloc(gfp);
 	xa_lock(xa);
 	err = __xa_alloc(xa, id, entry, limit, gfp);
 	xa_unlock(xa);
@@ -886,6 +896,7 @@ static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id,
 {
 	int err;
 
+	might_alloc(gfp);
 	xa_lock_bh(xa);
 	err = __xa_alloc(xa, id, entry, limit, gfp);
 	xa_unlock_bh(xa);
@@ -915,6 +926,7 @@ static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id,
 {
 	int err;
 
+	might_alloc(gfp);
 	xa_lock_irq(xa);
 	err = __xa_alloc(xa, id, entry, limit, gfp);
 	xa_unlock_irq(xa);
@@ -948,6 +960,7 @@ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
 {
 	int err;
 
+	might_alloc(gfp);
 	xa_lock(xa);
 	err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
 	xa_unlock(xa);
@@ -981,6 +994,7 @@ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry,
 {
 	int err;
 
+	might_alloc(gfp);
 	xa_lock_bh(xa);
 	err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
 	xa_unlock_bh(xa);
@@ -1014,6 +1028,7 @@ static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry,
 {
 	int err;
 
+	might_alloc(gfp);
 	xa_lock_irq(xa);
 	err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
 	xa_unlock_irq(xa);
diff --git a/tools/include/linux/sched/mm.h b/tools/include/linux/sched/mm.h
index c8d9f19c1f35..967294b8edcf 100644
--- a/tools/include/linux/sched/mm.h
+++ b/tools/include/linux/sched/mm.h
@@ -1,4 +1,6 @@
 #ifndef _TOOLS_PERF_LINUX_SCHED_MM_H
 #define _TOOLS_PERF_LINUX_SCHED_MM_H
 
+#define might_alloc(gfp)	do { } while (0)
+
 #endif  /* _TOOLS_PERF_LINUX_SCHED_MM_H */
-- 
cgit 


From c435c54639aa5513ab877c8b014dd83d4ce6b40e Mon Sep 17 00:00:00 2001
From: Matthew Rosato <mjrosato@linux.ibm.com>
Date: Mon, 6 Jun 2022 16:33:14 -0400
Subject: vfio/pci: introduce CONFIG_VFIO_PCI_ZDEV_KVM

The current contents of vfio-pci-zdev are today only useful in a KVM
environment; let's tie everything currently under vfio-pci-zdev to
this Kconfig statement and require KVM in this case, reducing complexity
(e.g. symbol lookups).

Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Pierre Morel <pmorel@linux.ibm.com>
Link: https://lore.kernel.org/r/20220606203325.110625-11-mjrosato@linux.ibm.com
Signed-off-by: Christian Borntraeger <borntraeger@linux.ibm.com>
---
 drivers/vfio/pci/Kconfig      | 11 +++++++++++
 drivers/vfio/pci/Makefile     |  2 +-
 include/linux/vfio_pci_core.h |  2 +-
 3 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 4da1914425e1..f9d0c908e738 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -44,6 +44,17 @@ config VFIO_PCI_IGD
 	  To enable Intel IGD assignment through vfio-pci, say Y.
 endif
 
+config VFIO_PCI_ZDEV_KVM
+	bool "VFIO PCI extensions for s390x KVM passthrough"
+	depends on S390 && KVM
+	default y
+	help
+	  Support s390x-specific extensions to enable support for enhancements
+	  to KVM passthrough capabilities, such as interpretive execution of
+	  zPCI instructions.
+
+	  To enable s390x KVM vfio-pci extensions, say Y.
+
 source "drivers/vfio/pci/mlx5/Kconfig"
 
 source "drivers/vfio/pci/hisilicon/Kconfig"
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index 7052ebd893e0..24c524224da5 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
 vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
-vfio-pci-core-$(CONFIG_S390) += vfio_pci_zdev.o
+vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o
 obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
 
 vfio-pci-y := vfio_pci.o
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 23c176d4b073..63af2897939c 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -206,7 +206,7 @@ static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev)
 }
 #endif
 
-#ifdef CONFIG_S390
+#ifdef CONFIG_VFIO_PCI_ZDEV_KVM
 extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
 				       struct vfio_info_cap *caps);
 #else
-- 
cgit 


From 3c5a1b6f0a18520a0edd0600fef6f1a8553b8fdc Mon Sep 17 00:00:00 2001
From: Matthew Rosato <mjrosato@linux.ibm.com>
Date: Mon, 6 Jun 2022 16:33:19 -0400
Subject: KVM: s390: pci: provide routines for enabling/disabling interrupt
 forwarding

These routines will be wired into a kvm ioctl in order to respond to
requests to enable / disable a device for Adapter Event Notifications /
Adapter Interuption Forwarding.

Reviewed-by: Christian Borntraeger <borntraeger@linux.ibm.com>
Acked-by: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
Link: https://lore.kernel.org/r/20220606203325.110625-16-mjrosato@linux.ibm.com
Signed-off-by: Christian Borntraeger <borntraeger@linux.ibm.com>
---
 arch/s390/kvm/pci.c        | 247 +++++++++++++++++++++++++++++++++++++++++++++
 arch/s390/kvm/pci.h        |   1 +
 arch/s390/pci/pci_insn.c   |   1 +
 include/linux/sched/user.h |   3 +-
 4 files changed, 251 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c
index d566551d3018..b232c8cbaa81 100644
--- a/arch/s390/kvm/pci.c
+++ b/arch/s390/kvm/pci.c
@@ -11,6 +11,7 @@
 #include <linux/pci.h>
 #include <asm/pci.h>
 #include <asm/pci_insn.h>
+#include <asm/pci_io.h>
 #include "pci.h"
 
 struct zpci_aift *aift;
@@ -152,6 +153,252 @@ unlock:
 	return rc;
 }
 
+/* Modify PCI: Register floating adapter interruption forwarding */
+static int kvm_zpci_set_airq(struct zpci_dev *zdev)
+{
+	u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT);
+	struct zpci_fib fib = {};
+	u8 status;
+
+	fib.fmt0.isc = zdev->kzdev->fib.fmt0.isc;
+	fib.fmt0.sum = 1;       /* enable summary notifications */
+	fib.fmt0.noi = airq_iv_end(zdev->aibv);
+	fib.fmt0.aibv = virt_to_phys(zdev->aibv->vector);
+	fib.fmt0.aibvo = 0;
+	fib.fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8);
+	fib.fmt0.aisbo = zdev->aisb & 63;
+	fib.gd = zdev->gisa;
+
+	return zpci_mod_fc(req, &fib, &status) ? -EIO : 0;
+}
+
+/* Modify PCI: Unregister floating adapter interruption forwarding */
+static int kvm_zpci_clear_airq(struct zpci_dev *zdev)
+{
+	u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT);
+	struct zpci_fib fib = {};
+	u8 cc, status;
+
+	fib.gd = zdev->gisa;
+
+	cc = zpci_mod_fc(req, &fib, &status);
+	if (cc == 3 || (cc == 1 && status == 24))
+		/* Function already gone or IRQs already deregistered. */
+		cc = 0;
+
+	return cc ? -EIO : 0;
+}
+
+static inline void unaccount_mem(unsigned long nr_pages)
+{
+	struct user_struct *user = get_uid(current_user());
+
+	if (user)
+		atomic_long_sub(nr_pages, &user->locked_vm);
+	if (current->mm)
+		atomic64_sub(nr_pages, &current->mm->pinned_vm);
+}
+
+static inline int account_mem(unsigned long nr_pages)
+{
+	struct user_struct *user = get_uid(current_user());
+	unsigned long page_limit, cur_pages, new_pages;
+
+	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	do {
+		cur_pages = atomic_long_read(&user->locked_vm);
+		new_pages = cur_pages + nr_pages;
+		if (new_pages > page_limit)
+			return -ENOMEM;
+	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
+					new_pages) != cur_pages);
+
+	atomic64_add(nr_pages, &current->mm->pinned_vm);
+
+	return 0;
+}
+
+static int kvm_s390_pci_aif_enable(struct zpci_dev *zdev, struct zpci_fib *fib,
+				   bool assist)
+{
+	struct page *pages[1], *aibv_page, *aisb_page = NULL;
+	unsigned int msi_vecs, idx;
+	struct zpci_gaite *gaite;
+	unsigned long hva, bit;
+	struct kvm *kvm;
+	phys_addr_t gaddr;
+	int rc = 0, gisc, npages, pcount = 0;
+
+	/*
+	 * Interrupt forwarding is only applicable if the device is already
+	 * enabled for interpretation
+	 */
+	if (zdev->gisa == 0)
+		return -EINVAL;
+
+	kvm = zdev->kzdev->kvm;
+	msi_vecs = min_t(unsigned int, fib->fmt0.noi, zdev->max_msi);
+
+	/* Get the associated forwarding ISC - if invalid, return the error */
+	gisc = kvm_s390_gisc_register(kvm, fib->fmt0.isc);
+	if (gisc < 0)
+		return gisc;
+
+	/* Replace AIBV address */
+	idx = srcu_read_lock(&kvm->srcu);
+	hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aibv));
+	npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM, pages);
+	srcu_read_unlock(&kvm->srcu, idx);
+	if (npages < 1) {
+		rc = -EIO;
+		goto out;
+	}
+	aibv_page = pages[0];
+	pcount++;
+	gaddr = page_to_phys(aibv_page) + (fib->fmt0.aibv & ~PAGE_MASK);
+	fib->fmt0.aibv = gaddr;
+
+	/* Pin the guest AISB if one was specified */
+	if (fib->fmt0.sum == 1) {
+		idx = srcu_read_lock(&kvm->srcu);
+		hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aisb));
+		npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM,
+					     pages);
+		srcu_read_unlock(&kvm->srcu, idx);
+		if (npages < 1) {
+			rc = -EIO;
+			goto unpin1;
+		}
+		aisb_page = pages[0];
+		pcount++;
+	}
+
+	/* Account for pinned pages, roll back on failure */
+	if (account_mem(pcount))
+		goto unpin2;
+
+	/* AISB must be allocated before we can fill in GAITE */
+	mutex_lock(&aift->aift_lock);
+	bit = airq_iv_alloc_bit(aift->sbv);
+	if (bit == -1UL)
+		goto unlock;
+	zdev->aisb = bit; /* store the summary bit number */
+	zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA |
+				    AIRQ_IV_BITLOCK |
+				    AIRQ_IV_GUESTVEC,
+				    phys_to_virt(fib->fmt0.aibv));
+
+	spin_lock_irq(&aift->gait_lock);
+	gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb *
+						   sizeof(struct zpci_gaite));
+
+	/* If assist not requested, host will get all alerts */
+	if (assist)
+		gaite->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa);
+	else
+		gaite->gisa = 0;
+
+	gaite->gisc = fib->fmt0.isc;
+	gaite->count++;
+	gaite->aisbo = fib->fmt0.aisbo;
+	gaite->aisb = virt_to_phys(page_address(aisb_page) + (fib->fmt0.aisb &
+							      ~PAGE_MASK));
+	aift->kzdev[zdev->aisb] = zdev->kzdev;
+	spin_unlock_irq(&aift->gait_lock);
+
+	/* Update guest FIB for re-issue */
+	fib->fmt0.aisbo = zdev->aisb & 63;
+	fib->fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8);
+	fib->fmt0.isc = gisc;
+
+	/* Save some guest fib values in the host for later use */
+	zdev->kzdev->fib.fmt0.isc = fib->fmt0.isc;
+	zdev->kzdev->fib.fmt0.aibv = fib->fmt0.aibv;
+	mutex_unlock(&aift->aift_lock);
+
+	/* Issue the clp to setup the irq now */
+	rc = kvm_zpci_set_airq(zdev);
+	return rc;
+
+unlock:
+	mutex_unlock(&aift->aift_lock);
+unpin2:
+	if (fib->fmt0.sum == 1)
+		unpin_user_page(aisb_page);
+unpin1:
+	unpin_user_page(aibv_page);
+out:
+	return rc;
+}
+
+static int kvm_s390_pci_aif_disable(struct zpci_dev *zdev, bool force)
+{
+	struct kvm_zdev *kzdev = zdev->kzdev;
+	struct zpci_gaite *gaite;
+	struct page *vpage = NULL, *spage = NULL;
+	int rc, pcount = 0;
+	u8 isc;
+
+	if (zdev->gisa == 0)
+		return -EINVAL;
+
+	mutex_lock(&aift->aift_lock);
+
+	/*
+	 * If the clear fails due to an error, leave now unless we know this
+	 * device is about to go away (force) -- In that case clear the GAITE
+	 * regardless.
+	 */
+	rc = kvm_zpci_clear_airq(zdev);
+	if (rc && !force)
+		goto out;
+
+	if (zdev->kzdev->fib.fmt0.aibv == 0)
+		goto out;
+	spin_lock_irq(&aift->gait_lock);
+	gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb *
+						   sizeof(struct zpci_gaite));
+	isc = gaite->gisc;
+	gaite->count--;
+	if (gaite->count == 0) {
+		/* Release guest AIBV and AISB */
+		vpage = phys_to_page(kzdev->fib.fmt0.aibv);
+		if (gaite->aisb != 0)
+			spage = phys_to_page(gaite->aisb);
+		/* Clear the GAIT entry */
+		gaite->aisb = 0;
+		gaite->gisc = 0;
+		gaite->aisbo = 0;
+		gaite->gisa = 0;
+		aift->kzdev[zdev->aisb] = 0;
+		/* Clear zdev info */
+		airq_iv_free_bit(aift->sbv, zdev->aisb);
+		airq_iv_release(zdev->aibv);
+		zdev->aisb = 0;
+		zdev->aibv = NULL;
+	}
+	spin_unlock_irq(&aift->gait_lock);
+	kvm_s390_gisc_unregister(kzdev->kvm, isc);
+	kzdev->fib.fmt0.isc = 0;
+	kzdev->fib.fmt0.aibv = 0;
+
+	if (vpage) {
+		unpin_user_page(vpage);
+		pcount++;
+	}
+	if (spage) {
+		unpin_user_page(spage);
+		pcount++;
+	}
+	if (pcount > 0)
+		unaccount_mem(pcount);
+out:
+	mutex_unlock(&aift->aift_lock);
+
+	return rc;
+}
+
 static int kvm_s390_pci_dev_open(struct zpci_dev *zdev)
 {
 	struct kvm_zdev *kzdev;
diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h
index 9f7828d97605..9d091033fc02 100644
--- a/arch/s390/kvm/pci.h
+++ b/arch/s390/kvm/pci.h
@@ -20,6 +20,7 @@
 struct kvm_zdev {
 	struct zpci_dev *zdev;
 	struct kvm *kvm;
+	struct zpci_fib fib;
 };
 
 struct zpci_gaite {
diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c
index 5798178aec9d..56480be48244 100644
--- a/arch/s390/pci/pci_insn.c
+++ b/arch/s390/pci/pci_insn.c
@@ -92,6 +92,7 @@ u8 zpci_mod_fc(u64 req, struct zpci_fib *fib, u8 *status)
 
 	return cc;
 }
+EXPORT_SYMBOL_GPL(zpci_mod_fc);
 
 /* Refresh PCI Translations */
 static inline u8 __rpcit(u64 fn, u64 addr, u64 range, u8 *status)
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index 00ed419dd464..f054d0360a75 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -24,7 +24,8 @@ struct user_struct {
 	kuid_t uid;
 
 #if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \
-    defined(CONFIG_NET) || defined(CONFIG_IO_URING)
+	defined(CONFIG_NET) || defined(CONFIG_IO_URING) || \
+	defined(CONFIG_VFIO_PCI_ZDEV_KVM)
 	atomic_long_t locked_vm;
 #endif
 #ifdef CONFIG_WATCH_QUEUE
-- 
cgit 


From 8061d1c31f1a018281bc9877ecce44bfc779e21d Mon Sep 17 00:00:00 2001
From: Matthew Rosato <mjrosato@linux.ibm.com>
Date: Mon, 6 Jun 2022 16:33:21 -0400
Subject: vfio-pci/zdev: add open/close device hooks

During vfio-pci open_device, pass the KVM associated with the vfio group
(if one exists).  This is needed in order to pass a special indicator
(GISA) to firmware to allow zPCI interpretation facilities to be used
for only the specific KVM associated with the vfio-pci device.  During
vfio-pci close_device, unregister the notifier.

Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Pierre Morel <pmorel@linux.ibm.com>
Link: https://lore.kernel.org/r/20220606203325.110625-18-mjrosato@linux.ibm.com
Signed-off-by: Christian Borntraeger <borntraeger@linux.ibm.com>
---
 drivers/vfio/pci/vfio_pci_core.c | 10 +++++++++-
 drivers/vfio/pci/vfio_pci_zdev.c | 24 ++++++++++++++++++++++++
 include/linux/vfio_pci_core.h    | 10 ++++++++++
 3 files changed, 43 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index a0d69ddaf90d..b1e5cfbadf38 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -316,10 +316,14 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
 		pci_write_config_word(pdev, PCI_COMMAND, cmd);
 	}
 
-	ret = vfio_config_init(vdev);
+	ret = vfio_pci_zdev_open_device(vdev);
 	if (ret)
 		goto out_free_state;
 
+	ret = vfio_config_init(vdev);
+	if (ret)
+		goto out_free_zdev;
+
 	msix_pos = pdev->msix_cap;
 	if (msix_pos) {
 		u16 flags;
@@ -340,6 +344,8 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
 
 	return 0;
 
+out_free_zdev:
+	vfio_pci_zdev_close_device(vdev);
 out_free_state:
 	kfree(vdev->pci_saved_state);
 	vdev->pci_saved_state = NULL;
@@ -418,6 +424,8 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev)
 
 	vdev->needs_reset = true;
 
+	vfio_pci_zdev_close_device(vdev);
+
 	/*
 	 * If we have saved state, restore it.  If we can reset the device,
 	 * even better.  Resetting with current state seems better than
diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c
index ea4c0d2b0663..686f2e75e392 100644
--- a/drivers/vfio/pci/vfio_pci_zdev.c
+++ b/drivers/vfio/pci/vfio_pci_zdev.c
@@ -11,6 +11,7 @@
 #include <linux/uaccess.h>
 #include <linux/vfio.h>
 #include <linux/vfio_zdev.h>
+#include <linux/kvm_host.h>
 #include <asm/pci_clp.h>
 #include <asm/pci_io.h>
 
@@ -136,3 +137,26 @@ int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
 
 	return ret;
 }
+
+int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev)
+{
+	struct zpci_dev *zdev = to_zpci(vdev->pdev);
+
+	if (!zdev)
+		return -ENODEV;
+
+	if (!vdev->vdev.kvm)
+		return 0;
+
+	return kvm_s390_pci_register_kvm(zdev, vdev->vdev.kvm);
+}
+
+void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev)
+{
+	struct zpci_dev *zdev = to_zpci(vdev->pdev);
+
+	if (!zdev || !vdev->vdev.kvm)
+		return;
+
+	kvm_s390_pci_unregister_kvm(zdev);
+}
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 63af2897939c..d5d9e17f0156 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -209,12 +209,22 @@ static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev)
 #ifdef CONFIG_VFIO_PCI_ZDEV_KVM
 extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
 				       struct vfio_info_cap *caps);
+int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev);
+void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev);
 #else
 static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
 					      struct vfio_info_cap *caps)
 {
 	return -ENODEV;
 }
+
+static inline int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev)
+{
+	return 0;
+}
+
+static inline void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev)
+{}
 #endif
 
 /* Will be exported for vfio pci drivers usage */
-- 
cgit 


From faf3bfcb895037ae2a8b89d1048090c9e1291cae Mon Sep 17 00:00:00 2001
From: Matthew Rosato <mjrosato@linux.ibm.com>
Date: Mon, 6 Jun 2022 16:33:22 -0400
Subject: vfio-pci/zdev: add function handle to clp base capability

The function handle is a system-wide unique identifier for a zPCI
device.  With zPCI instruction interpretation, the host will no
longer be executing the zPCI instructions on behalf of the guest.
As a result, the guest needs to use the real function handle in
order for firmware to associate the instruction with the proper
PCI function.  Let's provide that handle to the guest.

Reviewed-by: Christian Borntraeger <borntraeger@linux.ibm.com>
Reviewed-by: Pierre Morel <pmorel@linux.ibm.com>
Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Link: https://lore.kernel.org/r/20220606203325.110625-19-mjrosato@linux.ibm.com
Signed-off-by: Christian Borntraeger <borntraeger@linux.ibm.com>
---
 drivers/vfio/pci/vfio_pci_zdev.c | 5 +++--
 include/uapi/linux/vfio_zdev.h   | 3 +++
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c
index 686f2e75e392..4f28cdd7ecd1 100644
--- a/drivers/vfio/pci/vfio_pci_zdev.c
+++ b/drivers/vfio/pci/vfio_pci_zdev.c
@@ -24,14 +24,15 @@ static int zpci_base_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps)
 {
 	struct vfio_device_info_cap_zpci_base cap = {
 		.header.id = VFIO_DEVICE_INFO_CAP_ZPCI_BASE,
-		.header.version = 1,
+		.header.version = 2,
 		.start_dma = zdev->start_dma,
 		.end_dma = zdev->end_dma,
 		.pchid = zdev->pchid,
 		.vfn = zdev->vfn,
 		.fmb_length = zdev->fmb_length,
 		.pft = zdev->pft,
-		.gid = zdev->pfgid
+		.gid = zdev->pfgid,
+		.fh = zdev->fh
 	};
 
 	return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
diff --git a/include/uapi/linux/vfio_zdev.h b/include/uapi/linux/vfio_zdev.h
index b4309397b6b2..78c022af3d29 100644
--- a/include/uapi/linux/vfio_zdev.h
+++ b/include/uapi/linux/vfio_zdev.h
@@ -29,6 +29,9 @@ struct vfio_device_info_cap_zpci_base {
 	__u16 fmb_length;	/* Measurement Block Length (in bytes) */
 	__u8 pft;		/* PCI Function Type */
 	__u8 gid;		/* PCI function group ID */
+	/* End of version 1 */
+	__u32 fh;		/* PCI function handle */
+	/* End of version 2 */
 };
 
 /**
-- 
cgit 


From ba6090ff8ae01b41288be87ed9f6bed3d8cf5961 Mon Sep 17 00:00:00 2001
From: Matthew Rosato <mjrosato@linux.ibm.com>
Date: Mon, 6 Jun 2022 16:33:23 -0400
Subject: vfio-pci/zdev: different maxstbl for interpreted devices

When doing load/store interpretation, the maximum store block length is
determined by the underlying firmware, not the host kernel API.  Reflect
that in the associated Query PCI Function Group clp capability and let
userspace decide which is appropriate to present to the guest.

Reviewed-by: Pierre Morel <pmorel@linux.ibm.com>
Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Link: https://lore.kernel.org/r/20220606203325.110625-20-mjrosato@linux.ibm.com
Signed-off-by: Christian Borntraeger <borntraeger@linux.ibm.com>
---
 drivers/vfio/pci/vfio_pci_zdev.c | 6 ++++--
 include/uapi/linux/vfio_zdev.h   | 4 ++++
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c
index 4f28cdd7ecd1..e163aa9f6144 100644
--- a/drivers/vfio/pci/vfio_pci_zdev.c
+++ b/drivers/vfio/pci/vfio_pci_zdev.c
@@ -45,14 +45,16 @@ static int zpci_group_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps)
 {
 	struct vfio_device_info_cap_zpci_group cap = {
 		.header.id = VFIO_DEVICE_INFO_CAP_ZPCI_GROUP,
-		.header.version = 1,
+		.header.version = 2,
 		.dasm = zdev->dma_mask,
 		.msi_addr = zdev->msi_addr,
 		.flags = VFIO_DEVICE_INFO_ZPCI_FLAG_REFRESH,
 		.mui = zdev->fmb_update,
 		.noi = zdev->max_msi,
 		.maxstbl = ZPCI_MAX_WRITE_SIZE,
-		.version = zdev->version
+		.version = zdev->version,
+		.reserved = 0,
+		.imaxstbl = zdev->maxstbl
 	};
 
 	return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
diff --git a/include/uapi/linux/vfio_zdev.h b/include/uapi/linux/vfio_zdev.h
index 78c022af3d29..77f2aff1f27e 100644
--- a/include/uapi/linux/vfio_zdev.h
+++ b/include/uapi/linux/vfio_zdev.h
@@ -50,6 +50,10 @@ struct vfio_device_info_cap_zpci_group {
 	__u16 noi;		/* Maximum number of MSIs */
 	__u16 maxstbl;		/* Maximum Store Block Length */
 	__u8 version;		/* Supported PCI Version */
+	/* End of version 1 */
+	__u8 reserved;
+	__u16 imaxstbl;		/* Maximum Interpreted Store Block Length */
+	/* End of version 2 */
 };
 
 /**
-- 
cgit 


From db1c875e0539518e3d5fe9876ef50975cf4476bb Mon Sep 17 00:00:00 2001
From: Matthew Rosato <mjrosato@linux.ibm.com>
Date: Mon, 6 Jun 2022 16:33:24 -0400
Subject: KVM: s390: add KVM_S390_ZPCI_OP to manage guest zPCI devices

The KVM_S390_ZPCI_OP ioctl provides a mechanism for managing
hardware-assisted virtualization features for s390x zPCI passthrough.
Add the first 2 operations, which can be used to enable/disable
the specified device for Adapter Event Notification interpretation.

Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
Acked-by: Pierre Morel <pmorel@linux.ibm.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Link: https://lore.kernel.org/r/20220606203325.110625-21-mjrosato@linux.ibm.com
Signed-off-by: Christian Borntraeger <borntraeger@linux.ibm.com>
---
 Documentation/virt/kvm/api.rst | 47 +++++++++++++++++++++++
 arch/s390/kvm/kvm-s390.c       | 16 ++++++++
 arch/s390/kvm/pci.c            | 85 ++++++++++++++++++++++++++++++++++++++++++
 arch/s390/kvm/pci.h            |  2 +
 include/uapi/linux/kvm.h       | 31 +++++++++++++++
 5 files changed, 181 insertions(+)

(limited to 'include')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 11e00a46c610..d58354e9af8f 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -5802,6 +5802,53 @@ of CPUID leaf 0xD on the host.
 
 This ioctl injects an event channel interrupt directly to the guest vCPU.
 
+4.137 KVM_S390_ZPCI_OP
+--------------------
+
+:Capability: KVM_CAP_S390_ZPCI_OP
+:Architectures: s390
+:Type: vm ioctl
+:Parameters: struct kvm_s390_zpci_op (in)
+:Returns: 0 on success, <0 on error
+
+Used to manage hardware-assisted virtualization features for zPCI devices.
+
+Parameters are specified via the following structure::
+
+  struct kvm_s390_zpci_op {
+	/* in */
+	__u32 fh;		/* target device */
+	__u8  op;		/* operation to perform */
+	__u8  pad[3];
+	union {
+		/* for KVM_S390_ZPCIOP_REG_AEN */
+		struct {
+			__u64 ibv;	/* Guest addr of interrupt bit vector */
+			__u64 sb;	/* Guest addr of summary bit */
+			__u32 flags;
+			__u32 noi;	/* Number of interrupts */
+			__u8 isc;	/* Guest interrupt subclass */
+			__u8 sbo;	/* Offset of guest summary bit vector */
+			__u16 pad;
+		} reg_aen;
+		__u64 reserved[8];
+	} u;
+  };
+
+The type of operation is specified in the "op" field.
+KVM_S390_ZPCIOP_REG_AEN is used to register the VM for adapter event
+notification interpretation, which will allow firmware delivery of adapter
+events directly to the vm, with KVM providing a backup delivery mechanism;
+KVM_S390_ZPCIOP_DEREG_AEN is used to subsequently disable interpretation of
+adapter event notifications.
+
+The target zPCI function must also be specified via the "fh" field.  For the
+KVM_S390_ZPCIOP_REG_AEN operation, additional information to establish firmware
+delivery must be provided via the "reg_aen" struct.
+
+The "pad" and "reserved" fields may be used for future extensions and should be
+set to 0s by userspace.
+
 5. The kvm_run structure
 ========================
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 4758bb731199..f214e0fc62ed 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -618,6 +618,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_S390_PROTECTED:
 		r = is_prot_virt_host();
 		break;
+	case KVM_CAP_S390_ZPCI_OP:
+		r = kvm_s390_pci_interp_allowed();
+		break;
 	default:
 		r = 0;
 	}
@@ -2629,6 +2632,19 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			r = -EFAULT;
 		break;
 	}
+	case KVM_S390_ZPCI_OP: {
+		struct kvm_s390_zpci_op args;
+
+		r = -EINVAL;
+		if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM))
+			break;
+		if (copy_from_user(&args, argp, sizeof(args))) {
+			r = -EFAULT;
+			break;
+		}
+		r = kvm_s390_pci_zpci_op(kvm, &args);
+		break;
+	}
 	default:
 		r = -ENOTTY;
 	}
diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c
index 24211741deb0..4946fb7757d6 100644
--- a/arch/s390/kvm/pci.c
+++ b/arch/s390/kvm/pci.c
@@ -585,6 +585,91 @@ void kvm_s390_pci_clear_list(struct kvm *kvm)
 	spin_unlock(&kvm->arch.kzdev_list_lock);
 }
 
+static struct zpci_dev *get_zdev_from_kvm_by_fh(struct kvm *kvm, u32 fh)
+{
+	struct zpci_dev *zdev = NULL;
+	struct kvm_zdev *kzdev;
+
+	spin_lock(&kvm->arch.kzdev_list_lock);
+	list_for_each_entry(kzdev, &kvm->arch.kzdev_list, entry) {
+		if (kzdev->zdev->fh == fh) {
+			zdev = kzdev->zdev;
+			break;
+		}
+	}
+	spin_unlock(&kvm->arch.kzdev_list_lock);
+
+	return zdev;
+}
+
+static int kvm_s390_pci_zpci_reg_aen(struct zpci_dev *zdev,
+				     struct kvm_s390_zpci_op *args)
+{
+	struct zpci_fib fib = {};
+	bool hostflag;
+
+	fib.fmt0.aibv = args->u.reg_aen.ibv;
+	fib.fmt0.isc = args->u.reg_aen.isc;
+	fib.fmt0.noi = args->u.reg_aen.noi;
+	if (args->u.reg_aen.sb != 0) {
+		fib.fmt0.aisb = args->u.reg_aen.sb;
+		fib.fmt0.aisbo = args->u.reg_aen.sbo;
+		fib.fmt0.sum = 1;
+	} else {
+		fib.fmt0.aisb = 0;
+		fib.fmt0.aisbo = 0;
+		fib.fmt0.sum = 0;
+	}
+
+	hostflag = !(args->u.reg_aen.flags & KVM_S390_ZPCIOP_REGAEN_HOST);
+	return kvm_s390_pci_aif_enable(zdev, &fib, hostflag);
+}
+
+int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args)
+{
+	struct kvm_zdev *kzdev;
+	struct zpci_dev *zdev;
+	int r;
+
+	zdev = get_zdev_from_kvm_by_fh(kvm, args->fh);
+	if (!zdev)
+		return -ENODEV;
+
+	mutex_lock(&zdev->kzdev_lock);
+	mutex_lock(&kvm->lock);
+
+	kzdev = zdev->kzdev;
+	if (!kzdev) {
+		r = -ENODEV;
+		goto out;
+	}
+	if (kzdev->kvm != kvm) {
+		r = -EPERM;
+		goto out;
+	}
+
+	switch (args->op) {
+	case KVM_S390_ZPCIOP_REG_AEN:
+		/* Fail on unknown flags */
+		if (args->u.reg_aen.flags & ~KVM_S390_ZPCIOP_REGAEN_HOST) {
+			r = -EINVAL;
+			break;
+		}
+		r = kvm_s390_pci_zpci_reg_aen(zdev, args);
+		break;
+	case KVM_S390_ZPCIOP_DEREG_AEN:
+		r = kvm_s390_pci_aif_disable(zdev, false);
+		break;
+	default:
+		r = -EINVAL;
+	}
+
+out:
+	mutex_unlock(&kvm->lock);
+	mutex_unlock(&zdev->kzdev_lock);
+	return r;
+}
+
 int kvm_s390_pci_init(void)
 {
 	aift = kzalloc(sizeof(struct zpci_aift), GFP_KERNEL);
diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h
index fb2b91b76e0c..0351382e990f 100644
--- a/arch/s390/kvm/pci.h
+++ b/arch/s390/kvm/pci.h
@@ -59,6 +59,8 @@ void kvm_s390_pci_aen_exit(void);
 void kvm_s390_pci_init_list(struct kvm *kvm);
 void kvm_s390_pci_clear_list(struct kvm *kvm);
 
+int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args);
+
 int kvm_s390_pci_init(void);
 void kvm_s390_pci_exit(void);
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 5088bd9f1922..2f302e2287d1 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1157,6 +1157,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_VM_TSC_CONTROL 214
 #define KVM_CAP_SYSTEM_EVENT_DATA 215
 #define KVM_CAP_ARM_SYSTEM_SUSPEND 216
+#define KVM_CAP_S390_ZPCI_OP 221
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -2118,4 +2119,34 @@ struct kvm_stats_desc {
 /* Available with KVM_CAP_XSAVE2 */
 #define KVM_GET_XSAVE2		  _IOR(KVMIO,  0xcf, struct kvm_xsave)
 
+/* Available with KVM_CAP_S390_ZPCI_OP */
+#define KVM_S390_ZPCI_OP         _IOW(KVMIO,  0xd1, struct kvm_s390_zpci_op)
+
+struct kvm_s390_zpci_op {
+	/* in */
+	__u32 fh;               /* target device */
+	__u8  op;               /* operation to perform */
+	__u8  pad[3];
+	union {
+		/* for KVM_S390_ZPCIOP_REG_AEN */
+		struct {
+			__u64 ibv;      /* Guest addr of interrupt bit vector */
+			__u64 sb;       /* Guest addr of summary bit */
+			__u32 flags;
+			__u32 noi;      /* Number of interrupts */
+			__u8 isc;       /* Guest interrupt subclass */
+			__u8 sbo;       /* Offset of guest summary bit vector */
+			__u16 pad;
+		} reg_aen;
+		__u64 reserved[8];
+	} u;
+};
+
+/* types for kvm_s390_zpci_op->op */
+#define KVM_S390_ZPCIOP_REG_AEN                0
+#define KVM_S390_ZPCIOP_DEREG_AEN      1
+
+/* flags for kvm_s390_zpci_op->u.reg_aen.flags */
+#define KVM_S390_ZPCIOP_REGAEN_HOST    (1 << 0)
+
 #endif /* __LINUX_KVM_H */
-- 
cgit 


From 3c512307de4097aaaab3f4741c7a98fe88afa469 Mon Sep 17 00:00:00 2001
From: Veerendranath Jakkam <quic_vjakkam@quicinc.com>
Date: Fri, 8 Jul 2022 17:56:07 +0530
Subject: wifi: nl80211: fix sending link ID info of associated BSS

commit dd374f84baec ("wifi: nl80211: expose link ID for associated
BSSes") used a top-level attribute to send link ID of the associated
BSS in the nested attribute NL80211_ATTR_BSS. But since NL80211_ATTR_BSS
is a nested attribute of the attributes defined in enum nl80211_bss,
define a new attribute in enum nl80211_bss and use it for sending the
link ID of the BSS.

Fixes: dd374f84baec ("wifi: nl80211: expose link ID for associated BSSes")
Signed-off-by: Veerendranath Jakkam <quic_vjakkam@quicinc.com>
Reviewed-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Link: https://lore.kernel.org/r/20220708122607.1836958-1-quic_vjakkam@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 2 ++
 net/wireless/nl80211.c       | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 279f9715919e..7bb1ae59f3a5 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -4904,6 +4904,7 @@ enum nl80211_bss_scan_width {
  *	Contains a nested array of signal strength attributes (u8, dBm),
  *	using the nesting index as the antenna number.
  * @NL80211_BSS_FREQUENCY_OFFSET: frequency offset in KHz
+ * @NL80211_BSS_MLO_LINK_ID: MLO link ID of the BSS (u8).
  * @__NL80211_BSS_AFTER_LAST: internal
  * @NL80211_BSS_MAX: highest BSS attribute
  */
@@ -4929,6 +4930,7 @@ enum nl80211_bss {
 	NL80211_BSS_PARENT_BSSID,
 	NL80211_BSS_CHAIN_SIGNAL,
 	NL80211_BSS_FREQUENCY_OFFSET,
+	NL80211_BSS_MLO_LINK_ID,
 
 	/* keep last */
 	__NL80211_BSS_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index e20d0fc9678a..22c4cf6fbb57 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -9991,7 +9991,8 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
 			    (nla_put_u32(msg, NL80211_BSS_STATUS,
 					 NL80211_BSS_STATUS_ASSOCIATED) ||
 			     (wdev->valid_links &&
-			      nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link_id))))
+			      nla_put_u8(msg, NL80211_BSS_MLO_LINK_ID,
+					 link_id))))
 				goto nla_put_failure;
 		}
 		break;
-- 
cgit 


From ef6e5d61eb7a0a30f776a829274573094185d03d Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Wed, 6 Jul 2022 17:15:52 +0200
Subject: genirq: Allow irq_set_chip_handler_name_locked() to take a const
 irq_chip

Similar to commit 393e1280f765 ("genirq: Allow irq_chip registration
functions to take a const irq_chip"), allow the
irq_set_chip_handler_name_locked() function to take a const irq_chip
argument.

Signed-off-by: Michael Walle <michael@walle.cc>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20220706151553.1580790-1-michael@walle.cc
---
 include/linux/irqdesc.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index a77584593f7d..1cd4e36890fb 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -209,14 +209,15 @@ static inline void irq_set_handler_locked(struct irq_data *data,
  * Must be called with irq_desc locked and valid parameters.
  */
 static inline void
-irq_set_chip_handler_name_locked(struct irq_data *data, struct irq_chip *chip,
+irq_set_chip_handler_name_locked(struct irq_data *data,
+				 const struct irq_chip *chip,
 				 irq_flow_handler_t handler, const char *name)
 {
 	struct irq_desc *desc = irq_data_to_desc(data);
 
 	desc->handle_irq = handler;
 	desc->name = name;
-	data->chip = chip;
+	data->chip = (struct irq_chip *)chip;
 }
 
 bool irq_check_status_bit(unsigned int irq, unsigned int bitmask);
-- 
cgit 


From e22aa14866684f77b4f6b6cae98539e520ddb731 Mon Sep 17 00:00:00 2001
From: sewookseo <sewookseo@google.com>
Date: Thu, 7 Jul 2022 10:01:39 +0000
Subject: net: Find dst with sk's xfrm policy not ctl_sk
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If we set XFRM security policy by calling setsockopt with option
IPV6_XFRM_POLICY, the policy will be stored in 'sock_policy' in 'sock'
struct. However tcp_v6_send_response doesn't look up dst_entry with the
actual socket but looks up with tcp control socket. This may cause a
problem that a RST packet is sent without ESP encryption & peer's TCP
socket can't receive it.
This patch will make the function look up dest_entry with actual socket,
if the socket has XFRM policy(sock_policy), so that the TCP response
packet via this function can be encrypted, & aligned on the encrypted
TCP socket.

Tested: We encountered this problem when a TCP socket which is encrypted
in ESP transport mode encryption, receives challenge ACK at SYN_SENT
state. After receiving challenge ACK, TCP needs to send RST to
establish the socket at next SYN try. But the RST was not encrypted &
peer TCP socket still remains on ESTABLISHED state.
So we verified this with test step as below.
[Test step]
1. Making a TCP state mismatch between client(IDLE) & server(ESTABLISHED).
2. Client tries a new connection on the same TCP ports(src & dst).
3. Server will return challenge ACK instead of SYN,ACK.
4. Client will send RST to server to clear the SOCKET.
5. Client will retransmit SYN to server on the same TCP ports.
[Expected result]
The TCP connection should be established.

Cc: Maciej Żenczykowski <maze@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Sehee Lee <seheele@google.com>
Signed-off-by: Sewook Seo <sewookseo@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h   | 2 ++
 net/ipv4/ip_output.c | 2 +-
 net/ipv4/tcp_ipv4.c  | 2 ++
 net/ipv6/tcp_ipv6.c  | 5 ++++-
 4 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 9287712ad977..67b799ef5f67 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1195,6 +1195,8 @@ int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk);
 
 static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk)
 {
+	if (!sk_fullsock(osk))
+		return 0;
 	sk->sk_policy[0] = NULL;
 	sk->sk_policy[1] = NULL;
 	if (unlikely(osk->sk_policy[0] || osk->sk_policy[1]))
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 5e32a2f86fbd..f7156845ddf7 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1700,7 +1700,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
 			   arg->uid);
 	security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
-	rt = ip_route_output_key(net, &fl4);
+	rt = ip_route_output_flow(net, &fl4, sk);
 	if (IS_ERR(rt))
 		return;
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 68d0d8a008e2..228d36692d08 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -819,6 +819,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
 		transmit_time = tcp_transmit_time(sk);
+		xfrm_sk_clone_policy(ctl_sk, sk);
 	}
 	ip_send_unicast_reply(ctl_sk,
 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
@@ -827,6 +828,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 			      transmit_time);
 
 	ctl_sk->sk_mark = 0;
+	xfrm_sk_free_policy(ctl_sk);
 	sock_net_set(ctl_sk, &init_net);
 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index c72448ba6dc9..70d4890d8d2f 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -952,7 +952,10 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 	 * Underlying function will use this to retrieve the network
 	 * namespace
 	 */
-	dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL);
+	if (sk && sk->sk_state != TCP_TIME_WAIT)
+		dst = ip6_dst_lookup_flow(net, sk, &fl6, NULL); /*sk's xfrm_policy can be referred*/
+	else
+		dst = ip6_dst_lookup_flow(net, ctl_sk, &fl6, NULL);
 	if (!IS_ERR(dst)) {
 		skb_dst_set(buff, dst);
 		ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL,
-- 
cgit 


From b038177636f83bbf87c2b238706474145dd2cd04 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@nvidia.com>
Date: Wed, 15 Jun 2022 12:43:55 +0200
Subject: netfilter: nf_flow_table: count pending offload workqueue tasks

To improve hardware offload debuggability count pending 'add', 'del' and
'stats' flow_table offload workqueue tasks. Counters are incremented before
scheduling new task and decremented when workqueue handler finishes
executing. These counters allow user to diagnose congestion on hardware
offload workqueues that can happen when either CPU is starved and workqueue
jobs are executed at lower rate than new ones are added or when
hardware/driver can't keep up with the rate.

Implement the described counters as percpu counters inside new struct
netns_ft which is stored inside struct net. Expose them via new procfs file
'/proc/net/stats/nf_flowtable' that is similar to existing 'nf_conntrack'
file.

Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
Signed-off-by: Oz Shlomo <ozsh@nvidia.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/net_namespace.h           |  6 +++
 include/net/netfilter/nf_flow_table.h | 21 +++++++++
 include/net/netns/flow_table.h        | 14 ++++++
 net/netfilter/Kconfig                 |  9 ++++
 net/netfilter/Makefile                |  1 +
 net/netfilter/nf_flow_table_core.c    | 62 ++++++++++++++++++++++++++-
 net/netfilter/nf_flow_table_offload.c | 17 ++++++--
 net/netfilter/nf_flow_table_procfs.c  | 80 +++++++++++++++++++++++++++++++++++
 8 files changed, 206 insertions(+), 4 deletions(-)
 create mode 100644 include/net/netns/flow_table.h
 create mode 100644 net/netfilter/nf_flow_table_procfs.c

(limited to 'include')

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 20a2992901c2..8c3587d5c308 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -26,6 +26,9 @@
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 #include <net/netns/conntrack.h>
 #endif
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+#include <net/netns/flow_table.h>
+#endif
 #include <net/netns/nftables.h>
 #include <net/netns/xfrm.h>
 #include <net/netns/mpls.h>
@@ -142,6 +145,9 @@ struct net {
 #if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE)
 	struct netns_nftables	nft;
 #endif
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+	struct netns_ft ft;
+#endif
 #endif
 #ifdef CONFIG_WEXT_CORE
 	struct sk_buff_head	wext_nlevents;
diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 64daafd1fc41..d5326c44b453 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -335,4 +335,25 @@ static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb)
 	return 0;
 }
 
+#define NF_FLOW_TABLE_STAT_INC(net, count) __this_cpu_inc((net)->ft.stat->count)
+#define NF_FLOW_TABLE_STAT_DEC(net, count) __this_cpu_dec((net)->ft.stat->count)
+#define NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count)	\
+	this_cpu_inc((net)->ft.stat->count)
+#define NF_FLOW_TABLE_STAT_DEC_ATOMIC(net, count)	\
+	this_cpu_dec((net)->ft.stat->count)
+
+#ifdef CONFIG_NF_FLOW_TABLE_PROCFS
+int nf_flow_table_init_proc(struct net *net);
+void nf_flow_table_fini_proc(struct net *net);
+#else
+static inline int nf_flow_table_init_proc(struct net *net)
+{
+	return 0;
+}
+
+static inline void nf_flow_table_fini_proc(struct net *net)
+{
+}
+#endif /* CONFIG_NF_FLOW_TABLE_PROCFS */
+
 #endif /* _NF_FLOW_TABLE_H */
diff --git a/include/net/netns/flow_table.h b/include/net/netns/flow_table.h
new file mode 100644
index 000000000000..1c5fc657e267
--- /dev/null
+++ b/include/net/netns/flow_table.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NETNS_FLOW_TABLE_H
+#define __NETNS_FLOW_TABLE_H
+
+struct nf_flow_table_stat {
+	unsigned int count_wq_add;
+	unsigned int count_wq_del;
+	unsigned int count_wq_stats;
+};
+
+struct netns_ft {
+	struct nf_flow_table_stat __percpu *stat;
+};
+#endif
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index ddc54b6d18ee..df6abbfe0079 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -734,6 +734,15 @@ config NF_FLOW_TABLE
 
 	  To compile it as a module, choose M here.
 
+config NF_FLOW_TABLE_PROCFS
+	bool "Supply flow table statistics in procfs"
+	default y
+	depends on PROC_FS
+	depends on SYSCTL
+	help
+	  This option enables for the flow table offload statistics
+	  to be shown in procfs under net/netfilter/nf_flowtable.
+
 config NETFILTER_XTABLES
 	tristate "Netfilter Xtables support (required for ip_tables)"
 	default m if NETFILTER_ADVANCED=n
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 238b6a620e88..06df49ea6329 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -128,6 +128,7 @@ obj-$(CONFIG_NFT_FWD_NETDEV)	+= nft_fwd_netdev.o
 obj-$(CONFIG_NF_FLOW_TABLE)	+= nf_flow_table.o
 nf_flow_table-objs		:= nf_flow_table_core.o nf_flow_table_ip.o \
 				   nf_flow_table_offload.o
+nf_flow_table-$(CONFIG_NF_FLOW_TABLE_PROCFS) += nf_flow_table_procfs.o
 
 obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
 
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index f2def06d1070..51c2b1570838 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -614,14 +614,74 @@ void nf_flow_table_free(struct nf_flowtable *flow_table)
 }
 EXPORT_SYMBOL_GPL(nf_flow_table_free);
 
+static int nf_flow_table_init_net(struct net *net)
+{
+	net->ft.stat = alloc_percpu(struct nf_flow_table_stat);
+	return net->ft.stat ? 0 : -ENOMEM;
+}
+
+static void nf_flow_table_fini_net(struct net *net)
+{
+	free_percpu(net->ft.stat);
+}
+
+static int nf_flow_table_pernet_init(struct net *net)
+{
+	int ret;
+
+	ret = nf_flow_table_init_net(net);
+	if (ret < 0)
+		return ret;
+
+	ret = nf_flow_table_init_proc(net);
+	if (ret < 0)
+		goto out_proc;
+
+	return 0;
+
+out_proc:
+	nf_flow_table_fini_net(net);
+	return ret;
+}
+
+static void nf_flow_table_pernet_exit(struct list_head *net_exit_list)
+{
+	struct net *net;
+
+	list_for_each_entry(net, net_exit_list, exit_list) {
+		nf_flow_table_fini_proc(net);
+		nf_flow_table_fini_net(net);
+	}
+}
+
+static struct pernet_operations nf_flow_table_net_ops = {
+	.init = nf_flow_table_pernet_init,
+	.exit_batch = nf_flow_table_pernet_exit,
+};
+
 static int __init nf_flow_table_module_init(void)
 {
-	return nf_flow_table_offload_init();
+	int ret;
+
+	ret = register_pernet_subsys(&nf_flow_table_net_ops);
+	if (ret < 0)
+		return ret;
+
+	ret = nf_flow_table_offload_init();
+	if (ret)
+		goto out_offload;
+
+	return 0;
+
+out_offload:
+	unregister_pernet_subsys(&nf_flow_table_net_ops);
+	return ret;
 }
 
 static void __exit nf_flow_table_module_exit(void)
 {
 	nf_flow_table_offload_exit();
+	unregister_pernet_subsys(&nf_flow_table_net_ops);
 }
 
 module_init(nf_flow_table_module_init);
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index 11b6e1942092..103b6cbf257f 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -967,17 +967,22 @@ static void flow_offload_work_stats(struct flow_offload_work *offload)
 static void flow_offload_work_handler(struct work_struct *work)
 {
 	struct flow_offload_work *offload;
+	struct net *net;
 
 	offload = container_of(work, struct flow_offload_work, work);
+	net = read_pnet(&offload->flowtable->net);
 	switch (offload->cmd) {
 		case FLOW_CLS_REPLACE:
 			flow_offload_work_add(offload);
+			NF_FLOW_TABLE_STAT_DEC_ATOMIC(net, count_wq_add);
 			break;
 		case FLOW_CLS_DESTROY:
 			flow_offload_work_del(offload);
+			NF_FLOW_TABLE_STAT_DEC_ATOMIC(net, count_wq_del);
 			break;
 		case FLOW_CLS_STATS:
 			flow_offload_work_stats(offload);
+			NF_FLOW_TABLE_STAT_DEC_ATOMIC(net, count_wq_stats);
 			break;
 		default:
 			WARN_ON_ONCE(1);
@@ -989,12 +994,18 @@ static void flow_offload_work_handler(struct work_struct *work)
 
 static void flow_offload_queue_work(struct flow_offload_work *offload)
 {
-	if (offload->cmd == FLOW_CLS_REPLACE)
+	struct net *net = read_pnet(&offload->flowtable->net);
+
+	if (offload->cmd == FLOW_CLS_REPLACE) {
+		NF_FLOW_TABLE_STAT_INC(net, count_wq_add);
 		queue_work(nf_flow_offload_add_wq, &offload->work);
-	else if (offload->cmd == FLOW_CLS_DESTROY)
+	} else if (offload->cmd == FLOW_CLS_DESTROY) {
+		NF_FLOW_TABLE_STAT_INC(net, count_wq_del);
 		queue_work(nf_flow_offload_del_wq, &offload->work);
-	else
+	} else {
+		NF_FLOW_TABLE_STAT_INC(net, count_wq_stats);
 		queue_work(nf_flow_offload_stats_wq, &offload->work);
+	}
 }
 
 static struct flow_offload_work *
diff --git a/net/netfilter/nf_flow_table_procfs.c b/net/netfilter/nf_flow_table_procfs.c
new file mode 100644
index 000000000000..159b033a43e6
--- /dev/null
+++ b/net/netfilter/nf_flow_table_procfs.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <net/netfilter/nf_flow_table.h>
+
+static void *nf_flow_table_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct net *net = seq_file_net(seq);
+	int cpu;
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	for (cpu = *pos - 1; cpu < nr_cpu_ids; ++cpu) {
+		if (!cpu_possible(cpu))
+			continue;
+		*pos = cpu + 1;
+		return per_cpu_ptr(net->ft.stat, cpu);
+	}
+
+	return NULL;
+}
+
+static void *nf_flow_table_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct net *net = seq_file_net(seq);
+	int cpu;
+
+	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
+		if (!cpu_possible(cpu))
+			continue;
+		*pos = cpu + 1;
+		return per_cpu_ptr(net->ft.stat, cpu);
+	}
+	(*pos)++;
+	return NULL;
+}
+
+static void nf_flow_table_cpu_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int nf_flow_table_cpu_seq_show(struct seq_file *seq, void *v)
+{
+	const struct nf_flow_table_stat *st = v;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, "wq_add   wq_del   wq_stats\n");
+		return 0;
+	}
+
+	seq_printf(seq, "%8d %8d %8d\n",
+		   st->count_wq_add,
+		   st->count_wq_del,
+		   st->count_wq_stats
+		);
+	return 0;
+}
+
+static const struct seq_operations nf_flow_table_cpu_seq_ops = {
+	.start	= nf_flow_table_cpu_seq_start,
+	.next	= nf_flow_table_cpu_seq_next,
+	.stop	= nf_flow_table_cpu_seq_stop,
+	.show	= nf_flow_table_cpu_seq_show,
+};
+
+int nf_flow_table_init_proc(struct net *net)
+{
+	struct proc_dir_entry *pde;
+
+	pde = proc_create_net("nf_flowtable", 0444, net->proc_net_stat,
+			      &nf_flow_table_cpu_seq_ops,
+			      sizeof(struct seq_net_private));
+	return pde ? 0 : -ENOMEM;
+}
+
+void nf_flow_table_fini_proc(struct net *net)
+{
+	remove_proc_entry("nf_flowtable", net->proc_net_stat);
+}
-- 
cgit 


From 6976890e8998afd8abbbd9fe27ed71387b24f57f Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 22 Jun 2022 11:00:45 +0200
Subject: netfilter: nf_conntrack: add missing __rcu annotations

Access to the hook pointers use correct helpers but the pointers lack
the needed __rcu annotation.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nf_conntrack_sip.h   | 2 +-
 include/net/netfilter/nf_conntrack_timeout.h | 2 +-
 net/netfilter/nf_conntrack_pptp.c            | 2 +-
 net/netfilter/nf_conntrack_sip.c             | 2 +-
 net/netfilter/nf_conntrack_timeout.c         | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/nf_conntrack_sip.h b/include/linux/netfilter/nf_conntrack_sip.h
index c620521c42bc..dbc614dfe0d5 100644
--- a/include/linux/netfilter/nf_conntrack_sip.h
+++ b/include/linux/netfilter/nf_conntrack_sip.h
@@ -164,7 +164,7 @@ struct nf_nat_sip_hooks {
 				  unsigned int medialen,
 				  union nf_inet_addr *rtp_addr);
 };
-extern const struct nf_nat_sip_hooks *nf_nat_sip_hooks;
+extern const struct nf_nat_sip_hooks __rcu *nf_nat_sip_hooks;
 
 int ct_sip_parse_request(const struct nf_conn *ct, const char *dptr,
 			 unsigned int datalen, unsigned int *matchoff,
diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h
index fea258983d23..9fdaba911de6 100644
--- a/include/net/netfilter/nf_conntrack_timeout.h
+++ b/include/net/netfilter/nf_conntrack_timeout.h
@@ -105,7 +105,7 @@ struct nf_ct_timeout_hooks {
 	void (*timeout_put)(struct nf_ct_timeout *timeout);
 };
 
-extern const struct nf_ct_timeout_hooks *nf_ct_timeout_hook;
+extern const struct nf_ct_timeout_hooks __rcu *nf_ct_timeout_hook;
 #endif
 
 #endif /* _NF_CONNTRACK_TIMEOUT_H */
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index f3fa367b455f..4c679638df06 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -45,7 +45,7 @@ MODULE_ALIAS_NFCT_HELPER("pptp");
 
 static DEFINE_SPINLOCK(nf_pptp_lock);
 
-const struct nf_nat_pptp_hook *nf_nat_pptp_hook;
+const struct nf_nat_pptp_hook __rcu *nf_nat_pptp_hook;
 EXPORT_SYMBOL_GPL(nf_nat_pptp_hook);
 
 #if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index b83dc9bf0a5d..a88b43624b27 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -60,7 +60,7 @@ module_param(sip_external_media, int, 0600);
 MODULE_PARM_DESC(sip_external_media, "Expect Media streams between external "
 				     "endpoints (default 0)");
 
-const struct nf_nat_sip_hooks *nf_nat_sip_hooks;
+const struct nf_nat_sip_hooks __rcu *nf_nat_sip_hooks;
 EXPORT_SYMBOL_GPL(nf_nat_sip_hooks);
 
 static int string_len(const struct nf_conn *ct, const char *dptr,
diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c
index 0f828d05ea60..821365ed5b2c 100644
--- a/net/netfilter/nf_conntrack_timeout.c
+++ b/net/netfilter/nf_conntrack_timeout.c
@@ -22,7 +22,7 @@
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_timeout.h>
 
-const struct nf_ct_timeout_hooks *nf_ct_timeout_hook __read_mostly;
+const struct nf_ct_timeout_hooks __rcu *nf_ct_timeout_hook __read_mostly;
 EXPORT_SYMBOL_GPL(nf_ct_timeout_hook);
 
 static int untimeout(struct nf_conn *ct, void *timeout)
-- 
cgit 


From d3f2d0a292c24fc624afb2b4f47f838e83775721 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 22 Jun 2022 11:00:47 +0200
Subject: netfilter: h323: merge nat hook pointers into one

sparse complains about incorrect rcu usage.

Code uses the correct rcu access primitives, but the function pointers
lack rcu annotations.

Collapse all of them into a single structure, then annotate the pointer.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nf_conntrack_h323.h | 109 ++++++------
 net/ipv4/netfilter/nf_nat_h323.c            |  42 ++---
 net/netfilter/nf_conntrack_h323_main.c      | 260 +++++++++++-----------------
 3 files changed, 169 insertions(+), 242 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/nf_conntrack_h323.h b/include/linux/netfilter/nf_conntrack_h323.h
index 4561ec0fcea4..9e937f64a1ad 100644
--- a/include/linux/netfilter/nf_conntrack_h323.h
+++ b/include/linux/netfilter/nf_conntrack_h323.h
@@ -38,60 +38,63 @@ void nf_conntrack_h245_expect(struct nf_conn *new,
 			      struct nf_conntrack_expect *this);
 void nf_conntrack_q931_expect(struct nf_conn *new,
 			      struct nf_conntrack_expect *this);
-extern int (*set_h245_addr_hook) (struct sk_buff *skb, unsigned int protoff,
-				  unsigned char **data, int dataoff,
-				  H245_TransportAddress *taddr,
-				  union nf_inet_addr *addr,
-				  __be16 port);
-extern int (*set_h225_addr_hook) (struct sk_buff *skb, unsigned int protoff,
-				  unsigned char **data, int dataoff,
-				  TransportAddress *taddr,
-				  union nf_inet_addr *addr,
-				  __be16 port);
-extern int (*set_sig_addr_hook) (struct sk_buff *skb,
-				 struct nf_conn *ct,
-				 enum ip_conntrack_info ctinfo,
-				 unsigned int protoff, unsigned char **data,
-				 TransportAddress *taddr, int count);
-extern int (*set_ras_addr_hook) (struct sk_buff *skb,
-				 struct nf_conn *ct,
-				 enum ip_conntrack_info ctinfo,
-				 unsigned int protoff, unsigned char **data,
-				 TransportAddress *taddr, int count);
-extern int (*nat_rtp_rtcp_hook) (struct sk_buff *skb,
-				 struct nf_conn *ct,
-				 enum ip_conntrack_info ctinfo,
-				 unsigned int protoff, unsigned char **data,
-				 int dataoff,
-				 H245_TransportAddress *taddr,
-				 __be16 port, __be16 rtp_port,
-				 struct nf_conntrack_expect *rtp_exp,
-				 struct nf_conntrack_expect *rtcp_exp);
-extern int (*nat_t120_hook) (struct sk_buff *skb, struct nf_conn *ct,
-			     enum ip_conntrack_info ctinfo,
-			     unsigned int protoff,
+
+struct nfct_h323_nat_hooks {
+	int (*set_h245_addr)(struct sk_buff *skb, unsigned int protoff,
 			     unsigned char **data, int dataoff,
-			     H245_TransportAddress *taddr, __be16 port,
-			     struct nf_conntrack_expect *exp);
-extern int (*nat_h245_hook) (struct sk_buff *skb, struct nf_conn *ct,
-			     enum ip_conntrack_info ctinfo,
-			     unsigned int protoff,
+			     H245_TransportAddress *taddr,
+			     union nf_inet_addr *addr, __be16 port);
+	int (*set_h225_addr)(struct sk_buff *skb, unsigned int protoff,
 			     unsigned char **data, int dataoff,
-			     TransportAddress *taddr, __be16 port,
-			     struct nf_conntrack_expect *exp);
-extern int (*nat_callforwarding_hook) (struct sk_buff *skb,
-				       struct nf_conn *ct,
-				       enum ip_conntrack_info ctinfo,
-				       unsigned int protoff,
-				       unsigned char **data, int dataoff,
-				       TransportAddress *taddr,
-				       __be16 port,
-				       struct nf_conntrack_expect *exp);
-extern int (*nat_q931_hook) (struct sk_buff *skb, struct nf_conn *ct,
-			     enum ip_conntrack_info ctinfo,
-			     unsigned int protoff,
-			     unsigned char **data, TransportAddress *taddr,
-			     int idx, __be16 port,
-			     struct nf_conntrack_expect *exp);
+			     TransportAddress *taddr,
+			     union nf_inet_addr *addr, __be16 port);
+	int (*set_sig_addr)(struct sk_buff *skb,
+			    struct nf_conn *ct,
+			    enum ip_conntrack_info ctinfo,
+			    unsigned int protoff, unsigned char **data,
+			    TransportAddress *taddr, int count);
+	int (*set_ras_addr)(struct sk_buff *skb,
+			    struct nf_conn *ct,
+			    enum ip_conntrack_info ctinfo,
+			    unsigned int protoff, unsigned char **data,
+			    TransportAddress *taddr, int count);
+	int (*nat_rtp_rtcp)(struct sk_buff *skb,
+			    struct nf_conn *ct,
+			    enum ip_conntrack_info ctinfo,
+			    unsigned int protoff,
+			    unsigned char **data, int dataoff,
+			    H245_TransportAddress *taddr,
+			    __be16 port, __be16 rtp_port,
+			    struct nf_conntrack_expect *rtp_exp,
+			    struct nf_conntrack_expect *rtcp_exp);
+	int (*nat_t120)(struct sk_buff *skb,
+			struct nf_conn *ct,
+			enum ip_conntrack_info ctinfo,
+			unsigned int protoff,
+			unsigned char **data, int dataoff,
+			H245_TransportAddress *taddr, __be16 port,
+			struct nf_conntrack_expect *exp);
+	int (*nat_h245)(struct sk_buff *skb,
+			struct nf_conn *ct,
+			enum ip_conntrack_info ctinfo,
+			unsigned int protoff,
+			unsigned char **data, int dataoff,
+			TransportAddress *taddr, __be16 port,
+			struct nf_conntrack_expect *exp);
+	int (*nat_callforwarding)(struct sk_buff *skb,
+				  struct nf_conn *ct,
+				  enum ip_conntrack_info ctinfo,
+				  unsigned int protoff,
+				  unsigned char **data, int dataoff,
+				  TransportAddress *taddr, __be16 port,
+				  struct nf_conntrack_expect *exp);
+	int (*nat_q931)(struct sk_buff *skb,
+			struct nf_conn *ct,
+			enum ip_conntrack_info ctinfo,
+			unsigned int protoff,
+			unsigned char **data, TransportAddress *taddr, int idx,
+			__be16 port, struct nf_conntrack_expect *exp);
+};
+extern const struct nfct_h323_nat_hooks __rcu *nfct_h323_nat_hook;
 
 #endif
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 76a411ae9fe6..a334f0dcc2d0 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -579,28 +579,22 @@ static struct nf_ct_helper_expectfn callforwarding_nat = {
 	.expectfn	= ip_nat_callforwarding_expect,
 };
 
+static const struct nfct_h323_nat_hooks nathooks = {
+	.set_h245_addr = set_h245_addr,
+	.set_h225_addr = set_h225_addr,
+	.set_sig_addr = set_sig_addr,
+	.set_ras_addr = set_ras_addr,
+	.nat_rtp_rtcp = nat_rtp_rtcp,
+	.nat_t120 = nat_t120,
+	.nat_h245 = nat_h245,
+	.nat_callforwarding = nat_callforwarding,
+	.nat_q931 = nat_q931,
+};
+
 /****************************************************************************/
 static int __init nf_nat_h323_init(void)
 {
-	BUG_ON(set_h245_addr_hook != NULL);
-	BUG_ON(set_h225_addr_hook != NULL);
-	BUG_ON(set_sig_addr_hook != NULL);
-	BUG_ON(set_ras_addr_hook != NULL);
-	BUG_ON(nat_rtp_rtcp_hook != NULL);
-	BUG_ON(nat_t120_hook != NULL);
-	BUG_ON(nat_h245_hook != NULL);
-	BUG_ON(nat_callforwarding_hook != NULL);
-	BUG_ON(nat_q931_hook != NULL);
-
-	RCU_INIT_POINTER(set_h245_addr_hook, set_h245_addr);
-	RCU_INIT_POINTER(set_h225_addr_hook, set_h225_addr);
-	RCU_INIT_POINTER(set_sig_addr_hook, set_sig_addr);
-	RCU_INIT_POINTER(set_ras_addr_hook, set_ras_addr);
-	RCU_INIT_POINTER(nat_rtp_rtcp_hook, nat_rtp_rtcp);
-	RCU_INIT_POINTER(nat_t120_hook, nat_t120);
-	RCU_INIT_POINTER(nat_h245_hook, nat_h245);
-	RCU_INIT_POINTER(nat_callforwarding_hook, nat_callforwarding);
-	RCU_INIT_POINTER(nat_q931_hook, nat_q931);
+	RCU_INIT_POINTER(nfct_h323_nat_hook, &nathooks);
 	nf_ct_helper_expectfn_register(&q931_nat);
 	nf_ct_helper_expectfn_register(&callforwarding_nat);
 	return 0;
@@ -609,15 +603,7 @@ static int __init nf_nat_h323_init(void)
 /****************************************************************************/
 static void __exit nf_nat_h323_fini(void)
 {
-	RCU_INIT_POINTER(set_h245_addr_hook, NULL);
-	RCU_INIT_POINTER(set_h225_addr_hook, NULL);
-	RCU_INIT_POINTER(set_sig_addr_hook, NULL);
-	RCU_INIT_POINTER(set_ras_addr_hook, NULL);
-	RCU_INIT_POINTER(nat_rtp_rtcp_hook, NULL);
-	RCU_INIT_POINTER(nat_t120_hook, NULL);
-	RCU_INIT_POINTER(nat_h245_hook, NULL);
-	RCU_INIT_POINTER(nat_callforwarding_hook, NULL);
-	RCU_INIT_POINTER(nat_q931_hook, NULL);
+	RCU_INIT_POINTER(nfct_h323_nat_hook, NULL);
 	nf_ct_helper_expectfn_unregister(&q931_nat);
 	nf_ct_helper_expectfn_unregister(&callforwarding_nat);
 	synchronize_rcu();
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 2eb31ffb3d14..bb76305bb7ff 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -49,64 +49,8 @@ MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations "
 				     "if both endpoints are on different sides "
 				     "(determined by routing information)");
 
-/* Hooks for NAT */
-int (*set_h245_addr_hook) (struct sk_buff *skb, unsigned int protoff,
-			   unsigned char **data, int dataoff,
-			   H245_TransportAddress *taddr,
-			   union nf_inet_addr *addr, __be16 port)
-			   __read_mostly;
-int (*set_h225_addr_hook) (struct sk_buff *skb, unsigned int protoff,
-			   unsigned char **data, int dataoff,
-			   TransportAddress *taddr,
-			   union nf_inet_addr *addr, __be16 port)
-			   __read_mostly;
-int (*set_sig_addr_hook) (struct sk_buff *skb,
-			  struct nf_conn *ct,
-			  enum ip_conntrack_info ctinfo,
-			  unsigned int protoff, unsigned char **data,
-			  TransportAddress *taddr, int count) __read_mostly;
-int (*set_ras_addr_hook) (struct sk_buff *skb,
-			  struct nf_conn *ct,
-			  enum ip_conntrack_info ctinfo,
-			  unsigned int protoff, unsigned char **data,
-			  TransportAddress *taddr, int count) __read_mostly;
-int (*nat_rtp_rtcp_hook) (struct sk_buff *skb,
-			  struct nf_conn *ct,
-			  enum ip_conntrack_info ctinfo,
-			  unsigned int protoff,
-			  unsigned char **data, int dataoff,
-			  H245_TransportAddress *taddr,
-			  __be16 port, __be16 rtp_port,
-			  struct nf_conntrack_expect *rtp_exp,
-			  struct nf_conntrack_expect *rtcp_exp) __read_mostly;
-int (*nat_t120_hook) (struct sk_buff *skb,
-		      struct nf_conn *ct,
-		      enum ip_conntrack_info ctinfo,
-		      unsigned int protoff,
-		      unsigned char **data, int dataoff,
-		      H245_TransportAddress *taddr, __be16 port,
-		      struct nf_conntrack_expect *exp) __read_mostly;
-int (*nat_h245_hook) (struct sk_buff *skb,
-		      struct nf_conn *ct,
-		      enum ip_conntrack_info ctinfo,
-		      unsigned int protoff,
-		      unsigned char **data, int dataoff,
-		      TransportAddress *taddr, __be16 port,
-		      struct nf_conntrack_expect *exp) __read_mostly;
-int (*nat_callforwarding_hook) (struct sk_buff *skb,
-				struct nf_conn *ct,
-				enum ip_conntrack_info ctinfo,
-				unsigned int protoff,
-				unsigned char **data, int dataoff,
-				TransportAddress *taddr, __be16 port,
-				struct nf_conntrack_expect *exp) __read_mostly;
-int (*nat_q931_hook) (struct sk_buff *skb,
-		      struct nf_conn *ct,
-		      enum ip_conntrack_info ctinfo,
-		      unsigned int protoff,
-		      unsigned char **data, TransportAddress *taddr, int idx,
-		      __be16 port, struct nf_conntrack_expect *exp)
-		      __read_mostly;
+const struct nfct_h323_nat_hooks __rcu *nfct_h323_nat_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nfct_h323_nat_hook);
 
 static DEFINE_SPINLOCK(nf_h323_lock);
 static char *h323_buffer;
@@ -259,6 +203,7 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
 			   unsigned char **data, int dataoff,
 			   H245_TransportAddress *taddr)
 {
+	const struct nfct_h323_nat_hooks *nathook;
 	int dir = CTINFO2DIR(ctinfo);
 	int ret = 0;
 	__be16 port;
@@ -266,7 +211,6 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
 	union nf_inet_addr addr;
 	struct nf_conntrack_expect *rtp_exp;
 	struct nf_conntrack_expect *rtcp_exp;
-	typeof(nat_rtp_rtcp_hook) nat_rtp_rtcp;
 
 	/* Read RTP or RTCP address */
 	if (!get_h245_addr(ct, *data, taddr, &addr, &port) ||
@@ -296,15 +240,16 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
 			  &ct->tuplehash[!dir].tuple.dst.u3,
 			  IPPROTO_UDP, NULL, &rtcp_port);
 
+	nathook = rcu_dereference(nfct_h323_nat_hook);
 	if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
 		   &ct->tuplehash[!dir].tuple.dst.u3,
 		   sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
-		   (nat_rtp_rtcp = rcu_dereference(nat_rtp_rtcp_hook)) &&
+		   nathook &&
 		   nf_ct_l3num(ct) == NFPROTO_IPV4 &&
 		   ct->status & IPS_NAT_MASK) {
 		/* NAT needed */
-		ret = nat_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,
-				   taddr, port, rtp_port, rtp_exp, rtcp_exp);
+		ret = nathook->nat_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,
+					    taddr, port, rtp_port, rtp_exp, rtcp_exp);
 	} else {		/* Conntrack only */
 		if (nf_ct_expect_related(rtp_exp, 0) == 0) {
 			if (nf_ct_expect_related(rtcp_exp, 0) == 0) {
@@ -333,12 +278,12 @@ static int expect_t120(struct sk_buff *skb,
 		       unsigned char **data, int dataoff,
 		       H245_TransportAddress *taddr)
 {
+	const struct nfct_h323_nat_hooks *nathook;
 	int dir = CTINFO2DIR(ctinfo);
 	int ret = 0;
 	__be16 port;
 	union nf_inet_addr addr;
 	struct nf_conntrack_expect *exp;
-	typeof(nat_t120_hook) nat_t120;
 
 	/* Read T.120 address */
 	if (!get_h245_addr(ct, *data, taddr, &addr, &port) ||
@@ -355,15 +300,16 @@ static int expect_t120(struct sk_buff *skb,
 			  IPPROTO_TCP, NULL, &port);
 	exp->flags = NF_CT_EXPECT_PERMANENT;	/* Accept multiple channels */
 
+	nathook = rcu_dereference(nfct_h323_nat_hook);
 	if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
 		   &ct->tuplehash[!dir].tuple.dst.u3,
 		   sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
-	    (nat_t120 = rcu_dereference(nat_t120_hook)) &&
+	    nathook &&
 	    nf_ct_l3num(ct) == NFPROTO_IPV4 &&
 	    ct->status & IPS_NAT_MASK) {
 		/* NAT needed */
-		ret = nat_t120(skb, ct, ctinfo, protoff, data, dataoff, taddr,
-			       port, exp);
+		ret = nathook->nat_t120(skb, ct, ctinfo, protoff, data,
+					dataoff, taddr, port, exp);
 	} else {		/* Conntrack only */
 		if (nf_ct_expect_related(exp, 0) == 0) {
 			pr_debug("nf_ct_h323: expect T.120 ");
@@ -664,18 +610,19 @@ int get_h225_addr(struct nf_conn *ct, unsigned char *data,
 
 	return 1;
 }
+EXPORT_SYMBOL_GPL(get_h225_addr);
 
 static int expect_h245(struct sk_buff *skb, struct nf_conn *ct,
 		       enum ip_conntrack_info ctinfo,
 		       unsigned int protoff, unsigned char **data, int dataoff,
 		       TransportAddress *taddr)
 {
+	const struct nfct_h323_nat_hooks *nathook;
 	int dir = CTINFO2DIR(ctinfo);
 	int ret = 0;
 	__be16 port;
 	union nf_inet_addr addr;
 	struct nf_conntrack_expect *exp;
-	typeof(nat_h245_hook) nat_h245;
 
 	/* Read h245Address */
 	if (!get_h225_addr(ct, *data, taddr, &addr, &port) ||
@@ -692,15 +639,16 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct,
 			  IPPROTO_TCP, NULL, &port);
 	exp->helper = &nf_conntrack_helper_h245;
 
+	nathook = rcu_dereference(nfct_h323_nat_hook);
 	if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
 		   &ct->tuplehash[!dir].tuple.dst.u3,
 		   sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
-	    (nat_h245 = rcu_dereference(nat_h245_hook)) &&
+	    nathook &&
 	    nf_ct_l3num(ct) == NFPROTO_IPV4 &&
 	    ct->status & IPS_NAT_MASK) {
 		/* NAT needed */
-		ret = nat_h245(skb, ct, ctinfo, protoff, data, dataoff, taddr,
-			       port, exp);
+		ret = nathook->nat_h245(skb, ct, ctinfo, protoff, data,
+					dataoff, taddr, port, exp);
 	} else {		/* Conntrack only */
 		if (nf_ct_expect_related(exp, 0) == 0) {
 			pr_debug("nf_ct_q931: expect H.245 ");
@@ -785,13 +733,13 @@ static int expect_callforwarding(struct sk_buff *skb,
 				 unsigned char **data, int dataoff,
 				 TransportAddress *taddr)
 {
+	const struct nfct_h323_nat_hooks *nathook;
 	int dir = CTINFO2DIR(ctinfo);
 	int ret = 0;
 	__be16 port;
 	union nf_inet_addr addr;
 	struct nf_conntrack_expect *exp;
 	struct net *net = nf_ct_net(ct);
-	typeof(nat_callforwarding_hook) nat_callforwarding;
 
 	/* Read alternativeAddress */
 	if (!get_h225_addr(ct, *data, taddr, &addr, &port) || port == 0)
@@ -815,16 +763,17 @@ static int expect_callforwarding(struct sk_buff *skb,
 			  IPPROTO_TCP, NULL, &port);
 	exp->helper = nf_conntrack_helper_q931;
 
+	nathook = rcu_dereference(nfct_h323_nat_hook);
 	if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
 		   &ct->tuplehash[!dir].tuple.dst.u3,
 		   sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
-	    (nat_callforwarding = rcu_dereference(nat_callforwarding_hook)) &&
+	    nathook &&
 	    nf_ct_l3num(ct) == NFPROTO_IPV4 &&
 	    ct->status & IPS_NAT_MASK) {
 		/* Need NAT */
-		ret = nat_callforwarding(skb, ct, ctinfo,
-					 protoff, data, dataoff,
-					 taddr, port, exp);
+		ret = nathook->nat_callforwarding(skb, ct, ctinfo,
+						  protoff, data, dataoff,
+						  taddr, port, exp);
 	} else {		/* Conntrack only */
 		if (nf_ct_expect_related(exp, 0) == 0) {
 			pr_debug("nf_ct_q931: expect Call Forwarding ");
@@ -844,12 +793,12 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
 			 unsigned char **data, int dataoff,
 			 Setup_UUIE *setup)
 {
+	const struct nfct_h323_nat_hooks *nathook;
 	int dir = CTINFO2DIR(ctinfo);
 	int ret;
 	int i;
 	__be16 port;
 	union nf_inet_addr addr;
-	typeof(set_h225_addr_hook) set_h225_addr;
 
 	pr_debug("nf_ct_q931: Setup\n");
 
@@ -860,9 +809,9 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
 			return -1;
 	}
 
-	set_h225_addr = rcu_dereference(set_h225_addr_hook);
+	nathook = rcu_dereference(nfct_h323_nat_hook);
 	if ((setup->options & eSetup_UUIE_destCallSignalAddress) &&
-	    (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+	    nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
 	    ct->status & IPS_NAT_MASK &&
 	    get_h225_addr(ct, *data, &setup->destCallSignalAddress,
 			  &addr, &port) &&
@@ -870,16 +819,16 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
 		pr_debug("nf_ct_q931: set destCallSignalAddress %pI6:%hu->%pI6:%hu\n",
 			 &addr, ntohs(port), &ct->tuplehash[!dir].tuple.src.u3,
 			 ntohs(ct->tuplehash[!dir].tuple.src.u.tcp.port));
-		ret = set_h225_addr(skb, protoff, data, dataoff,
-				    &setup->destCallSignalAddress,
-				    &ct->tuplehash[!dir].tuple.src.u3,
-				    ct->tuplehash[!dir].tuple.src.u.tcp.port);
+		ret = nathook->set_h225_addr(skb, protoff, data, dataoff,
+					     &setup->destCallSignalAddress,
+					     &ct->tuplehash[!dir].tuple.src.u3,
+					     ct->tuplehash[!dir].tuple.src.u.tcp.port);
 		if (ret < 0)
 			return -1;
 	}
 
 	if ((setup->options & eSetup_UUIE_sourceCallSignalAddress) &&
-	    (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+	    nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
 	    ct->status & IPS_NAT_MASK &&
 	    get_h225_addr(ct, *data, &setup->sourceCallSignalAddress,
 			  &addr, &port) &&
@@ -887,10 +836,10 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
 		pr_debug("nf_ct_q931: set sourceCallSignalAddress %pI6:%hu->%pI6:%hu\n",
 			 &addr, ntohs(port), &ct->tuplehash[!dir].tuple.dst.u3,
 			 ntohs(ct->tuplehash[!dir].tuple.dst.u.tcp.port));
-		ret = set_h225_addr(skb, protoff, data, dataoff,
-				    &setup->sourceCallSignalAddress,
-				    &ct->tuplehash[!dir].tuple.dst.u3,
-				    ct->tuplehash[!dir].tuple.dst.u.tcp.port);
+		ret = nathook->set_h225_addr(skb, protoff, data, dataoff,
+					     &setup->sourceCallSignalAddress,
+					     &ct->tuplehash[!dir].tuple.dst.u3,
+					     ct->tuplehash[!dir].tuple.dst.u.tcp.port);
 		if (ret < 0)
 			return -1;
 	}
@@ -1249,13 +1198,13 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
 		       TransportAddress *taddr, int count)
 {
 	struct nf_ct_h323_master *info = nfct_help_data(ct);
+	const struct nfct_h323_nat_hooks *nathook;
 	int dir = CTINFO2DIR(ctinfo);
 	int ret = 0;
 	int i;
 	__be16 port;
 	union nf_inet_addr addr;
 	struct nf_conntrack_expect *exp;
-	typeof(nat_q931_hook) nat_q931;
 
 	/* Look for the first related address */
 	for (i = 0; i < count; i++) {
@@ -1279,11 +1228,11 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
 	exp->helper = nf_conntrack_helper_q931;
 	exp->flags = NF_CT_EXPECT_PERMANENT;	/* Accept multiple calls */
 
-	nat_q931 = rcu_dereference(nat_q931_hook);
-	if (nat_q931 && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+	nathook = rcu_dereference(nfct_h323_nat_hook);
+	if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
 	    ct->status & IPS_NAT_MASK) {	/* Need NAT */
-		ret = nat_q931(skb, ct, ctinfo, protoff, data,
-			       taddr, i, port, exp);
+		ret = nathook->nat_q931(skb, ct, ctinfo, protoff, data,
+					taddr, i, port, exp);
 	} else {		/* Conntrack only */
 		if (nf_ct_expect_related(exp, 0) == 0) {
 			pr_debug("nf_ct_ras: expect Q.931 ");
@@ -1305,15 +1254,15 @@ static int process_grq(struct sk_buff *skb, struct nf_conn *ct,
 		       unsigned int protoff,
 		       unsigned char **data, GatekeeperRequest *grq)
 {
-	typeof(set_ras_addr_hook) set_ras_addr;
+	const struct nfct_h323_nat_hooks *nathook;
 
 	pr_debug("nf_ct_ras: GRQ\n");
 
-	set_ras_addr = rcu_dereference(set_ras_addr_hook);
-	if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+	nathook = rcu_dereference(nfct_h323_nat_hook);
+	if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
 	    ct->status & IPS_NAT_MASK)	/* NATed */
-		return set_ras_addr(skb, ct, ctinfo, protoff, data,
-				    &grq->rasAddress, 1);
+		return nathook->set_ras_addr(skb, ct, ctinfo, protoff, data,
+					     &grq->rasAddress, 1);
 	return 0;
 }
 
@@ -1367,8 +1316,8 @@ static int process_rrq(struct sk_buff *skb, struct nf_conn *ct,
 		       unsigned char **data, RegistrationRequest *rrq)
 {
 	struct nf_ct_h323_master *info = nfct_help_data(ct);
+	const struct nfct_h323_nat_hooks *nathook;
 	int ret;
-	typeof(set_ras_addr_hook) set_ras_addr;
 
 	pr_debug("nf_ct_ras: RRQ\n");
 
@@ -1378,12 +1327,12 @@ static int process_rrq(struct sk_buff *skb, struct nf_conn *ct,
 	if (ret < 0)
 		return -1;
 
-	set_ras_addr = rcu_dereference(set_ras_addr_hook);
-	if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+	nathook = rcu_dereference(nfct_h323_nat_hook);
+	if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
 	    ct->status & IPS_NAT_MASK) {
-		ret = set_ras_addr(skb, ct, ctinfo, protoff, data,
-				   rrq->rasAddress.item,
-				   rrq->rasAddress.count);
+		ret = nathook->set_ras_addr(skb, ct, ctinfo, protoff, data,
+					    rrq->rasAddress.item,
+					    rrq->rasAddress.count);
 		if (ret < 0)
 			return -1;
 	}
@@ -1403,19 +1352,19 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
 		       unsigned char **data, RegistrationConfirm *rcf)
 {
 	struct nf_ct_h323_master *info = nfct_help_data(ct);
+	const struct nfct_h323_nat_hooks *nathook;
 	int dir = CTINFO2DIR(ctinfo);
 	int ret;
 	struct nf_conntrack_expect *exp;
-	typeof(set_sig_addr_hook) set_sig_addr;
 
 	pr_debug("nf_ct_ras: RCF\n");
 
-	set_sig_addr = rcu_dereference(set_sig_addr_hook);
-	if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+	nathook = rcu_dereference(nfct_h323_nat_hook);
+	if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
 	    ct->status & IPS_NAT_MASK) {
-		ret = set_sig_addr(skb, ct, ctinfo, protoff, data,
-					rcf->callSignalAddress.item,
-					rcf->callSignalAddress.count);
+		ret = nathook->set_sig_addr(skb, ct, ctinfo, protoff, data,
+					    rcf->callSignalAddress.item,
+					    rcf->callSignalAddress.count);
 		if (ret < 0)
 			return -1;
 	}
@@ -1454,18 +1403,18 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct,
 		       unsigned char **data, UnregistrationRequest *urq)
 {
 	struct nf_ct_h323_master *info = nfct_help_data(ct);
+	const struct nfct_h323_nat_hooks *nathook;
 	int dir = CTINFO2DIR(ctinfo);
 	int ret;
-	typeof(set_sig_addr_hook) set_sig_addr;
 
 	pr_debug("nf_ct_ras: URQ\n");
 
-	set_sig_addr = rcu_dereference(set_sig_addr_hook);
-	if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+	nathook = rcu_dereference(nfct_h323_nat_hook);
+	if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
 	    ct->status & IPS_NAT_MASK) {
-		ret = set_sig_addr(skb, ct, ctinfo, protoff, data,
-				   urq->callSignalAddress.item,
-				   urq->callSignalAddress.count);
+		ret = nathook->set_sig_addr(skb, ct, ctinfo, protoff, data,
+					    urq->callSignalAddress.item,
+					    urq->callSignalAddress.count);
 		if (ret < 0)
 			return -1;
 	}
@@ -1487,39 +1436,42 @@ static int process_arq(struct sk_buff *skb, struct nf_conn *ct,
 		       unsigned char **data, AdmissionRequest *arq)
 {
 	const struct nf_ct_h323_master *info = nfct_help_data(ct);
+	const struct nfct_h323_nat_hooks *nathook;
 	int dir = CTINFO2DIR(ctinfo);
 	__be16 port;
 	union nf_inet_addr addr;
-	typeof(set_h225_addr_hook) set_h225_addr;
 
 	pr_debug("nf_ct_ras: ARQ\n");
 
-	set_h225_addr = rcu_dereference(set_h225_addr_hook);
+	nathook = rcu_dereference(nfct_h323_nat_hook);
+	if (!nathook)
+		return 0;
+
 	if ((arq->options & eAdmissionRequest_destCallSignalAddress) &&
 	    get_h225_addr(ct, *data, &arq->destCallSignalAddress,
 			  &addr, &port) &&
 	    !memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) &&
 	    port == info->sig_port[dir] &&
 	    nf_ct_l3num(ct) == NFPROTO_IPV4 &&
-	    set_h225_addr && ct->status & IPS_NAT_MASK) {
+	    ct->status & IPS_NAT_MASK) {
 		/* Answering ARQ */
-		return set_h225_addr(skb, protoff, data, 0,
-				     &arq->destCallSignalAddress,
-				     &ct->tuplehash[!dir].tuple.dst.u3,
-				     info->sig_port[!dir]);
+		return nathook->set_h225_addr(skb, protoff, data, 0,
+					      &arq->destCallSignalAddress,
+					      &ct->tuplehash[!dir].tuple.dst.u3,
+					      info->sig_port[!dir]);
 	}
 
 	if ((arq->options & eAdmissionRequest_srcCallSignalAddress) &&
 	    get_h225_addr(ct, *data, &arq->srcCallSignalAddress,
 			  &addr, &port) &&
 	    !memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) &&
-	    set_h225_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+	    nf_ct_l3num(ct) == NFPROTO_IPV4 &&
 	    ct->status & IPS_NAT_MASK) {
 		/* Calling ARQ */
-		return set_h225_addr(skb, protoff, data, 0,
-				     &arq->srcCallSignalAddress,
-				     &ct->tuplehash[!dir].tuple.dst.u3,
-				     port);
+		return nathook->set_h225_addr(skb, protoff, data, 0,
+					      &arq->srcCallSignalAddress,
+					      &ct->tuplehash[!dir].tuple.dst.u3,
+					      port);
 	}
 
 	return 0;
@@ -1535,7 +1487,6 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct,
 	__be16 port;
 	union nf_inet_addr addr;
 	struct nf_conntrack_expect *exp;
-	typeof(set_sig_addr_hook) set_sig_addr;
 
 	pr_debug("nf_ct_ras: ACF\n");
 
@@ -1544,12 +1495,15 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct,
 		return 0;
 
 	if (!memcmp(&addr, &ct->tuplehash[dir].tuple.dst.u3, sizeof(addr))) {
+		const struct nfct_h323_nat_hooks *nathook;
+
 		/* Answering ACF */
-		set_sig_addr = rcu_dereference(set_sig_addr_hook);
-		if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+		nathook = rcu_dereference(nfct_h323_nat_hook);
+		if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
 		    ct->status & IPS_NAT_MASK)
-			return set_sig_addr(skb, ct, ctinfo, protoff, data,
-					    &acf->destCallSignalAddress, 1);
+			return nathook->set_sig_addr(skb, ct, ctinfo, protoff,
+						     data,
+						     &acf->destCallSignalAddress, 1);
 		return 0;
 	}
 
@@ -1578,15 +1532,15 @@ static int process_lrq(struct sk_buff *skb, struct nf_conn *ct,
 		       unsigned int protoff,
 		       unsigned char **data, LocationRequest *lrq)
 {
-	typeof(set_ras_addr_hook) set_ras_addr;
+	const struct nfct_h323_nat_hooks *nathook;
 
 	pr_debug("nf_ct_ras: LRQ\n");
 
-	set_ras_addr = rcu_dereference(set_ras_addr_hook);
-	if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+	nathook = rcu_dereference(nfct_h323_nat_hook);
+	if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
 	    ct->status & IPS_NAT_MASK)
-		return set_ras_addr(skb, ct, ctinfo, protoff, data,
-				    &lrq->replyAddress, 1);
+		return nathook->set_ras_addr(skb, ct, ctinfo, protoff, data,
+					     &lrq->replyAddress, 1);
 	return 0;
 }
 
@@ -1634,27 +1588,22 @@ static int process_irr(struct sk_buff *skb, struct nf_conn *ct,
 		       unsigned int protoff,
 		       unsigned char **data, InfoRequestResponse *irr)
 {
+	const struct nfct_h323_nat_hooks *nathook;
 	int ret;
-	typeof(set_ras_addr_hook) set_ras_addr;
-	typeof(set_sig_addr_hook) set_sig_addr;
 
 	pr_debug("nf_ct_ras: IRR\n");
 
-	set_ras_addr = rcu_dereference(set_ras_addr_hook);
-	if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+	nathook = rcu_dereference(nfct_h323_nat_hook);
+	if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
 	    ct->status & IPS_NAT_MASK) {
-		ret = set_ras_addr(skb, ct, ctinfo, protoff, data,
-				   &irr->rasAddress, 1);
+		ret = nathook->set_ras_addr(skb, ct, ctinfo, protoff, data,
+					    &irr->rasAddress, 1);
 		if (ret < 0)
 			return -1;
-	}
 
-	set_sig_addr = rcu_dereference(set_sig_addr_hook);
-	if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
-	    ct->status & IPS_NAT_MASK) {
-		ret = set_sig_addr(skb, ct, ctinfo, protoff, data,
-					irr->callSignalAddress.item,
-					irr->callSignalAddress.count);
+		ret = nathook->set_sig_addr(skb, ct, ctinfo, protoff, data,
+					    irr->callSignalAddress.item,
+					    irr->callSignalAddress.count);
 		if (ret < 0)
 			return -1;
 	}
@@ -1837,17 +1786,6 @@ err1:
 module_init(nf_conntrack_h323_init);
 module_exit(nf_conntrack_h323_fini);
 
-EXPORT_SYMBOL_GPL(get_h225_addr);
-EXPORT_SYMBOL_GPL(set_h245_addr_hook);
-EXPORT_SYMBOL_GPL(set_h225_addr_hook);
-EXPORT_SYMBOL_GPL(set_sig_addr_hook);
-EXPORT_SYMBOL_GPL(set_ras_addr_hook);
-EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook);
-EXPORT_SYMBOL_GPL(nat_t120_hook);
-EXPORT_SYMBOL_GPL(nat_h245_hook);
-EXPORT_SYMBOL_GPL(nat_callforwarding_hook);
-EXPORT_SYMBOL_GPL(nat_q931_hook);
-
 MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
 MODULE_DESCRIPTION("H.323 connection tracking helper");
 MODULE_LICENSE("GPL");
-- 
cgit 


From 7278b3c1e4ebf6f9c4cda07600f19824857c81fe Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 23 Jun 2022 15:05:12 +0200
Subject: netfilter: nf_tables: add and use BE register load-store helpers

Same as the existing ones, no conversions. This is just for sparse sake
only so that we no longer mix be16/u16 and be32/u32 types.

Alternative is to add __force __beX in various places, but this
seems nicer.

objdiff shows no changes.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h      | 15 +++++++++++++++
 net/bridge/netfilter/nft_meta_bridge.c |  2 +-
 net/netfilter/nft_tproxy.c             |  6 +++---
 3 files changed, 19 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 5c4e5a96a984..dd6ad0b67d7d 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -157,11 +157,26 @@ static inline void nft_reg_store16(u32 *dreg, u16 val)
 	*(u16 *)dreg = val;
 }
 
+static inline void nft_reg_store_be16(u32 *dreg, __be16 val)
+{
+	nft_reg_store16(dreg, (__force __u16)val);
+}
+
 static inline u16 nft_reg_load16(const u32 *sreg)
 {
 	return *(u16 *)sreg;
 }
 
+static inline __be16 nft_reg_load_be16(const u32 *sreg)
+{
+	return (__force __be16)nft_reg_load16(sreg);
+}
+
+static inline __be32 nft_reg_load_be32(const u32 *sreg)
+{
+	return *(__force __be32 *)sreg;
+}
+
 static inline void nft_reg_store64(u32 *dreg, u64 val)
 {
 	put_unaligned(val, (u64 *)dreg);
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c
index 8c3eaba87ad2..c3ecd77e25cb 100644
--- a/net/bridge/netfilter/nft_meta_bridge.c
+++ b/net/bridge/netfilter/nft_meta_bridge.c
@@ -53,7 +53,7 @@ static void nft_meta_bridge_get_eval(const struct nft_expr *expr,
 			goto err;
 
 		br_vlan_get_proto(br_dev, &p_proto);
-		nft_reg_store16(dest, htons(p_proto));
+		nft_reg_store_be16(dest, htons(p_proto));
 		return;
 	}
 	default:
diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c
index 801f013971df..68b2eed742df 100644
--- a/net/netfilter/nft_tproxy.c
+++ b/net/netfilter/nft_tproxy.c
@@ -52,11 +52,11 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr,
 				   skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);
 
 	if (priv->sreg_addr)
-		taddr = regs->data[priv->sreg_addr];
+		taddr = nft_reg_load_be32(&regs->data[priv->sreg_addr]);
 	taddr = nf_tproxy_laddr4(skb, taddr, iph->daddr);
 
 	if (priv->sreg_port)
-		tport = nft_reg_load16(&regs->data[priv->sreg_port]);
+		tport = nft_reg_load_be16(&regs->data[priv->sreg_port]);
 	if (!tport)
 		tport = hp->dest;
 
@@ -124,7 +124,7 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr,
 	taddr = *nf_tproxy_laddr6(skb, &taddr, &iph->daddr);
 
 	if (priv->sreg_port)
-		tport = nft_reg_load16(&regs->data[priv->sreg_port]);
+		tport = nft_reg_load_be16(&regs->data[priv->sreg_port]);
 	if (!tport)
 		tport = hp->dest;
 
-- 
cgit 


From 6b77205374fdeae4c9a585a7ca754673ef52f75b Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 23 Jun 2022 15:05:14 +0200
Subject: netfilter: nf_tables: move nft_cmp_fast_mask to where its used

... and cast result to u32 so sparse won't complain anymore.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables_core.h | 10 ----------
 net/netfilter/nft_cmp.c                | 12 ++++++++++++
 2 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index 0ea7c55cea4d..1223af68cd9a 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -56,16 +56,6 @@ struct nft_immediate_expr {
 	u8			dlen;
 };
 
-/* Calculate the mask for the nft_cmp_fast expression. On big endian the
- * mask needs to include the *upper* bytes when interpreting that data as
- * something smaller than the full u32, therefore a cpu_to_le32 is done.
- */
-static inline u32 nft_cmp_fast_mask(unsigned int len)
-{
-	return cpu_to_le32(~0U >> (sizeof_field(struct nft_cmp_fast_expr,
-						data) * BITS_PER_BYTE - len));
-}
-
 extern const struct nft_expr_ops nft_cmp_fast_ops;
 extern const struct nft_expr_ops nft_cmp16_fast_ops;
 
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index bec22584395f..777f09e4dc60 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -197,6 +197,18 @@ static const struct nft_expr_ops nft_cmp_ops = {
 	.offload	= nft_cmp_offload,
 };
 
+/* Calculate the mask for the nft_cmp_fast expression. On big endian the
+ * mask needs to include the *upper* bytes when interpreting that data as
+ * something smaller than the full u32, therefore a cpu_to_le32 is done.
+ */
+static u32 nft_cmp_fast_mask(unsigned int len)
+{
+	__le32 mask = cpu_to_le32(~0U >> (sizeof_field(struct nft_cmp_fast_expr,
+					  data) * BITS_PER_BYTE - len));
+
+	return (__force u32)mask;
+}
+
 static int nft_cmp_fast_init(const struct nft_ctx *ctx,
 			     const struct nft_expr *expr,
 			     const struct nlattr * const tb[])
-- 
cgit 


From d5b36a4dbd06c5e8e36ca8ccc552f679069e2946 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 11 Jul 2022 18:16:25 +0200
Subject: fix race between exit_itimers() and /proc/pid/timers

As Chris explains, the comment above exit_itimers() is not correct,
we can race with proc_timers_seq_ops. Change exit_itimers() to clear
signal->posix_timers with ->siglock held.

Cc: <stable@vger.kernel.org>
Reported-by: chris@accessvector.net
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c                  |  2 +-
 include/linux/sched/task.h |  2 +-
 kernel/exit.c              |  2 +-
 kernel/time/posix-timers.c | 19 ++++++++++++++-----
 4 files changed, 17 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/fs/exec.c b/fs/exec.c
index 0989fb8472a1..778123259e42 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1301,7 +1301,7 @@ int begin_new_exec(struct linux_binprm * bprm)
 	bprm->mm = NULL;
 
 #ifdef CONFIG_POSIX_TIMERS
-	exit_itimers(me->signal);
+	exit_itimers(me);
 	flush_itimer_signals();
 #endif
 
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 505aaf9fe477..81cab4b01edc 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -85,7 +85,7 @@ static inline void exit_thread(struct task_struct *tsk)
 extern __noreturn void do_group_exit(int);
 
 extern void exit_files(struct task_struct *);
-extern void exit_itimers(struct signal_struct *);
+extern void exit_itimers(struct task_struct *);
 
 extern pid_t kernel_clone(struct kernel_clone_args *kargs);
 struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
diff --git a/kernel/exit.c b/kernel/exit.c
index f072959fcab7..64c938ce36fe 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -766,7 +766,7 @@ void __noreturn do_exit(long code)
 
 #ifdef CONFIG_POSIX_TIMERS
 		hrtimer_cancel(&tsk->signal->real_timer);
-		exit_itimers(tsk->signal);
+		exit_itimers(tsk);
 #endif
 		if (tsk->mm)
 			setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 1cd10b102c51..5dead89308b7 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1051,15 +1051,24 @@ retry_delete:
 }
 
 /*
- * This is called by do_exit or de_thread, only when there are no more
- * references to the shared signal_struct.
+ * This is called by do_exit or de_thread, only when nobody else can
+ * modify the signal->posix_timers list. Yet we need sighand->siglock
+ * to prevent the race with /proc/pid/timers.
  */
-void exit_itimers(struct signal_struct *sig)
+void exit_itimers(struct task_struct *tsk)
 {
+	struct list_head timers;
 	struct k_itimer *tmr;
 
-	while (!list_empty(&sig->posix_timers)) {
-		tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
+	if (list_empty(&tsk->signal->posix_timers))
+		return;
+
+	spin_lock_irq(&tsk->sighand->siglock);
+	list_replace_init(&tsk->signal->posix_timers, &timers);
+	spin_unlock_irq(&tsk->sighand->siglock);
+
+	while (!list_empty(&timers)) {
+		tmr = list_first_entry(&timers, struct k_itimer, list);
 		itimer_delete(tmr);
 	}
 }
-- 
cgit 


From 8c18fece15f64df1d8101fd32efe304bf10032d5 Mon Sep 17 00:00:00 2001
From: Cixi Geng <cixi.geng1@unisoc.com>
Date: Thu, 5 May 2022 18:14:32 +0800
Subject: clk: sprd: Add dt-bindings include file for UMS512

This file defines all UMS512 clock indexes, it should be included in the
device tree in which there's device using the clocks.

Signed-off-by: Cixi Geng <cixi.geng1@unisoc.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20220505101433.1575096-4-gengcixi@gmail.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/dt-bindings/clock/sprd,ums512-clk.h | 397 ++++++++++++++++++++++++++++
 1 file changed, 397 insertions(+)
 create mode 100644 include/dt-bindings/clock/sprd,ums512-clk.h

(limited to 'include')

diff --git a/include/dt-bindings/clock/sprd,ums512-clk.h b/include/dt-bindings/clock/sprd,ums512-clk.h
new file mode 100644
index 000000000000..4f1d90849944
--- /dev/null
+++ b/include/dt-bindings/clock/sprd,ums512-clk.h
@@ -0,0 +1,397 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Unisoc UMS512 SoC DTS file
+ *
+ * Copyright (C) 2022, Unisoc Inc.
+ */
+
+#ifndef _DT_BINDINGS_CLK_UMS512_H_
+#define _DT_BINDINGS_CLK_UMS512_H_
+
+#define CLK_26M_AUD			0
+#define CLK_13M				1
+#define CLK_6M5				2
+#define CLK_4M3				3
+#define CLK_2M				4
+#define CLK_1M				5
+#define CLK_250K			6
+#define CLK_RCO_25M			7
+#define CLK_RCO_4M			8
+#define CLK_RCO_2M			9
+#define CLK_ISPPLL_GATE			10
+#define CLK_DPLL0_GATE			11
+#define CLK_DPLL1_GATE			12
+#define CLK_LPLL_GATE			13
+#define CLK_TWPLL_GATE			14
+#define CLK_GPLL_GATE			15
+#define CLK_RPLL_GATE			16
+#define CLK_CPPLL_GATE			17
+#define CLK_MPLL0_GATE			18
+#define CLK_MPLL1_GATE			19
+#define CLK_MPLL2_GATE			20
+#define CLK_PMU_GATE_NUM		(CLK_MPLL2_GATE + 1)
+
+#define CLK_DPLL0			0
+#define CLK_DPLL0_58M31			1
+#define CLK_ANLG_PHY_G0_NUM		(CLK_DPLL0_58M31 + 1)
+
+#define CLK_MPLL1			0
+#define CLK_MPLL1_63M38			1
+#define CLK_ANLG_PHY_G2_NUM		(CLK_MPLL1_63M38 + 1)
+
+#define CLK_RPLL			0
+#define CLK_AUDIO_GATE			1
+#define CLK_MPLL0			2
+#define CLK_MPLL0_56M88			3
+#define CLK_MPLL2			4
+#define CLK_MPLL2_47M13			5
+#define CLK_ANLG_PHY_G3_NUM		(CLK_MPLL2_47M13 + 1)
+
+#define CLK_TWPLL			0
+#define CLK_TWPLL_768M			1
+#define CLK_TWPLL_384M			2
+#define CLK_TWPLL_192M			3
+#define CLK_TWPLL_96M			4
+#define CLK_TWPLL_48M			5
+#define CLK_TWPLL_24M			6
+#define CLK_TWPLL_12M			7
+#define CLK_TWPLL_512M			8
+#define CLK_TWPLL_256M			9
+#define CLK_TWPLL_128M			10
+#define CLK_TWPLL_64M			11
+#define CLK_TWPLL_307M2			12
+#define CLK_TWPLL_219M4			13
+#define CLK_TWPLL_170M6			14
+#define CLK_TWPLL_153M6			15
+#define CLK_TWPLL_76M8			16
+#define CLK_TWPLL_51M2			17
+#define CLK_TWPLL_38M4			18
+#define CLK_TWPLL_19M2			19
+#define CLK_TWPLL_12M29			20
+#define CLK_LPLL			21
+#define CLK_LPLL_614M4			22
+#define CLK_LPLL_409M6			23
+#define CLK_LPLL_245M76			24
+#define CLK_LPLL_30M72			25
+#define CLK_ISPPLL			26
+#define CLK_ISPPLL_468M			27
+#define CLK_ISPPLL_78M			28
+#define CLK_GPLL			29
+#define CLK_GPLL_40M			30
+#define CLK_CPPLL			31
+#define CLK_CPPLL_39M32			32
+#define CLK_ANLG_PHY_GC_NUM		(CLK_CPPLL_39M32 + 1)
+
+#define CLK_AP_APB			0
+#define CLK_IPI			        1
+#define CLK_AP_UART0			2
+#define CLK_AP_UART1			3
+#define CLK_AP_UART2			4
+#define CLK_AP_I2C0			5
+#define CLK_AP_I2C1			6
+#define CLK_AP_I2C2			7
+#define CLK_AP_I2C3			8
+#define CLK_AP_I2C4			9
+#define CLK_AP_SPI0			10
+#define CLK_AP_SPI1			11
+#define CLK_AP_SPI2			12
+#define CLK_AP_SPI3			13
+#define CLK_AP_IIS0			14
+#define CLK_AP_IIS1			15
+#define CLK_AP_IIS2			16
+#define CLK_AP_SIM			17
+#define CLK_AP_CE			18
+#define CLK_SDIO0_2X			19
+#define CLK_SDIO1_2X			20
+#define CLK_EMMC_2X			21
+#define CLK_VSP				22
+#define CLK_DISPC0			23
+#define CLK_DISPC0_DPI			24
+#define CLK_DSI_APB			25
+#define CLK_DSI_RXESC			26
+#define CLK_DSI_LANEBYTE		27
+#define CLK_VDSP		        28
+#define CLK_VDSP_M		        29
+#define CLK_AP_CLK_NUM			(CLK_VDSP_M + 1)
+
+#define CLK_DSI_EB			0
+#define CLK_DISPC_EB			1
+#define CLK_VSP_EB			2
+#define CLK_VDMA_EB			3
+#define CLK_DMA_PUB_EB			4
+#define CLK_DMA_SEC_EB			5
+#define CLK_IPI_EB			6
+#define CLK_AHB_CKG_EB			7
+#define CLK_BM_CLK_EB			8
+#define CLK_AP_AHB_GATE_NUM		(CLK_BM_CLK_EB + 1)
+
+#define CLK_AON_APB			0
+#define CLK_ADI				1
+#define CLK_AUX0			2
+#define CLK_AUX1			3
+#define CLK_AUX2			4
+#define CLK_PROBE			5
+#define CLK_PWM0			6
+#define CLK_PWM1			7
+#define CLK_PWM2			8
+#define CLK_PWM3			9
+#define CLK_EFUSE			10
+#define CLK_UART0			11
+#define CLK_UART1			12
+#define CLK_THM0			13
+#define CLK_THM1			14
+#define CLK_THM2			15
+#define CLK_THM3			16
+#define CLK_AON_I2C			17
+#define CLK_AON_IIS			18
+#define CLK_SCC				19
+#define CLK_APCPU_DAP			20
+#define CLK_APCPU_DAP_MTCK		21
+#define CLK_APCPU_TS			22
+#define CLK_DEBUG_TS			23
+#define CLK_DSI_TEST_S			24
+#define CLK_DJTAG_TCK			25
+#define CLK_DJTAG_TCK_HW		26
+#define CLK_AON_TMR			27
+#define CLK_AON_PMU			28
+#define CLK_DEBOUNCE			29
+#define CLK_APCPU_PMU			30
+#define CLK_TOP_DVFS			31
+#define CLK_OTG_UTMI			32
+#define CLK_OTG_REF			33
+#define CLK_CSSYS			34
+#define CLK_CSSYS_PUB			35
+#define CLK_CSSYS_APB			36
+#define CLK_AP_AXI			37
+#define CLK_AP_MM			38
+#define CLK_SDIO2_2X			39
+#define CLK_ANALOG_IO_APB		40
+#define CLK_DMC_REF_CLK			41
+#define CLK_EMC				42
+#define CLK_USB				43
+#define CLK_26M_PMU			44
+#define CLK_AON_APB_NUM			(CLK_26M_PMU + 1)
+
+#define CLK_MM_AHB			0
+#define CLK_MM_MTX			1
+#define CLK_SENSOR0			2
+#define CLK_SENSOR1			3
+#define CLK_SENSOR2			4
+#define CLK_CPP				5
+#define CLK_JPG				6
+#define CLK_FD				7
+#define CLK_DCAM_IF			8
+#define CLK_DCAM_AXI			9
+#define CLK_ISP				10
+#define CLK_MIPI_CSI0			11
+#define CLK_MIPI_CSI1			12
+#define CLK_MIPI_CSI2			13
+#define CLK_MM_CLK_NUM			(CLK_MIPI_CSI2 + 1)
+
+#define CLK_RC100M_CAL_EB		0
+#define CLK_DJTAG_TCK_EB		1
+#define CLK_DJTAG_EB			2
+#define CLK_AUX0_EB			3
+#define CLK_AUX1_EB			4
+#define CLK_AUX2_EB			5
+#define CLK_PROBE_EB			6
+#define CLK_MM_EB			7
+#define CLK_GPU_EB			8
+#define CLK_MSPI_EB			9
+#define CLK_APCPU_DAP_EB		10
+#define CLK_AON_CSSYS_EB		11
+#define CLK_CSSYS_APB_EB		12
+#define CLK_CSSYS_PUB_EB		13
+#define CLK_SDPHY_CFG_EB		14
+#define CLK_SDPHY_REF_EB		15
+#define CLK_EFUSE_EB			16
+#define CLK_GPIO_EB			17
+#define CLK_MBOX_EB			18
+#define CLK_KPD_EB			19
+#define CLK_AON_SYST_EB			20
+#define CLK_AP_SYST_EB			21
+#define CLK_AON_TMR_EB			22
+#define CLK_OTG_UTMI_EB			23
+#define CLK_OTG_PHY_EB			24
+#define CLK_SPLK_EB			25
+#define CLK_PIN_EB			26
+#define CLK_ANA_EB			27
+#define CLK_APCPU_TS0_EB		28
+#define CLK_APB_BUSMON_EB		29
+#define CLK_AON_IIS_EB			30
+#define CLK_SCC_EB			31
+#define CLK_THM0_EB			32
+#define CLK_THM1_EB			33
+#define CLK_THM2_EB			34
+#define CLK_ASIM_TOP_EB			35
+#define CLK_I2C_EB			36
+#define CLK_PMU_EB			37
+#define CLK_ADI_EB			38
+#define CLK_EIC_EB			39
+#define CLK_AP_INTC0_EB			40
+#define CLK_AP_INTC1_EB			41
+#define CLK_AP_INTC2_EB			42
+#define CLK_AP_INTC3_EB			43
+#define CLK_AP_INTC4_EB			44
+#define CLK_AP_INTC5_EB			45
+#define CLK_AUDCP_INTC_EB		46
+#define CLK_AP_TMR0_EB			47
+#define CLK_AP_TMR1_EB			48
+#define CLK_AP_TMR2_EB			49
+#define CLK_PWM0_EB			50
+#define CLK_PWM1_EB			51
+#define CLK_PWM2_EB			52
+#define CLK_PWM3_EB			53
+#define CLK_AP_WDG_EB			54
+#define CLK_APCPU_WDG_EB		55
+#define CLK_SERDES_EB			56
+#define CLK_ARCH_RTC_EB			57
+#define CLK_KPD_RTC_EB			58
+#define CLK_AON_SYST_RTC_EB		59
+#define CLK_AP_SYST_RTC_EB		60
+#define CLK_AON_TMR_RTC_EB		61
+#define CLK_EIC_RTC_EB			62
+#define CLK_EIC_RTCDV5_EB		63
+#define CLK_AP_WDG_RTC_EB		64
+#define CLK_AC_WDG_RTC_EB		65
+#define CLK_AP_TMR0_RTC_EB		66
+#define CLK_AP_TMR1_RTC_EB		67
+#define CLK_AP_TMR2_RTC_EB		68
+#define CLK_DCXO_LC_RTC_EB		69
+#define CLK_BB_CAL_RTC_EB		70
+#define CLK_AP_EMMC_RTC_EB		71
+#define CLK_AP_SDIO0_RTC_EB		72
+#define CLK_AP_SDIO1_RTC_EB		73
+#define CLK_AP_SDIO2_RTC_EB		74
+#define CLK_DSI_CSI_TEST_EB		75
+#define CLK_DJTAG_TCK_EN		76
+#define CLK_DPHY_REF_EB			77
+#define CLK_DMC_REF_EB			78
+#define CLK_OTG_REF_EB			79
+#define CLK_TSEN_EB			80
+#define CLK_TMR_EB			81
+#define CLK_RC100M_REF_EB		82
+#define CLK_RC100M_FDK_EB		83
+#define CLK_DEBOUNCE_EB			84
+#define CLK_DET_32K_EB			85
+#define CLK_TOP_CSSYS_EB		86
+#define CLK_AP_AXI_EN			87
+#define CLK_SDIO0_2X_EN			88
+#define CLK_SDIO0_1X_EN			89
+#define CLK_SDIO1_2X_EN			90
+#define CLK_SDIO1_1X_EN			91
+#define CLK_SDIO2_2X_EN			92
+#define CLK_SDIO2_1X_EN			93
+#define CLK_EMMC_2X_EN			94
+#define CLK_EMMC_1X_EN			95
+#define CLK_PLL_TEST_EN			96
+#define CLK_CPHY_CFG_EN			97
+#define CLK_DEBUG_TS_EN			98
+#define CLK_ACCESS_AUD_EN		99
+#define CLK_AON_APB_GATE_NUM		(CLK_ACCESS_AUD_EN + 1)
+
+#define CLK_MM_CPP_EB			0
+#define CLK_MM_JPG_EB			1
+#define CLK_MM_DCAM_EB			2
+#define CLK_MM_ISP_EB			3
+#define CLK_MM_CSI2_EB			4
+#define CLK_MM_CSI1_EB			5
+#define CLK_MM_CSI0_EB			6
+#define CLK_MM_CKG_EB			7
+#define CLK_ISP_AHB_EB			8
+#define CLK_MM_DVFS_EB			9
+#define CLK_MM_FD_EB			10
+#define CLK_MM_SENSOR2_EB		11
+#define CLK_MM_SENSOR1_EB		12
+#define CLK_MM_SENSOR0_EB		13
+#define CLK_MM_MIPI_CSI2_EB		14
+#define CLK_MM_MIPI_CSI1_EB		15
+#define CLK_MM_MIPI_CSI0_EB		16
+#define CLK_DCAM_AXI_EB			17
+#define CLK_ISP_AXI_EB			18
+#define CLK_MM_CPHY_EB			19
+#define CLK_MM_GATE_CLK_NUM		(CLK_MM_CPHY_EB + 1)
+
+#define CLK_SIM0_EB			0
+#define CLK_IIS0_EB			1
+#define CLK_IIS1_EB			2
+#define CLK_IIS2_EB			3
+#define CLK_APB_REG_EB			4
+#define CLK_SPI0_EB			5
+#define CLK_SPI1_EB			6
+#define CLK_SPI2_EB			7
+#define CLK_SPI3_EB			8
+#define CLK_I2C0_EB			9
+#define CLK_I2C1_EB			10
+#define CLK_I2C2_EB			11
+#define CLK_I2C3_EB			12
+#define CLK_I2C4_EB			13
+#define CLK_UART0_EB			14
+#define CLK_UART1_EB			15
+#define CLK_UART2_EB			16
+#define CLK_SIM0_32K_EB			17
+#define CLK_SPI0_LFIN_EB		18
+#define CLK_SPI1_LFIN_EB		19
+#define CLK_SPI2_LFIN_EB		20
+#define CLK_SPI3_LFIN_EB		21
+#define CLK_SDIO0_EB			22
+#define CLK_SDIO1_EB			23
+#define CLK_SDIO2_EB			24
+#define CLK_EMMC_EB			25
+#define CLK_SDIO0_32K_EB		26
+#define CLK_SDIO1_32K_EB		27
+#define CLK_SDIO2_32K_EB		28
+#define CLK_EMMC_32K_EB			29
+#define CLK_AP_APB_GATE_NUM		(CLK_EMMC_32K_EB + 1)
+
+#define CLK_GPU_CORE_EB			0
+#define CLK_GPU_CORE			1
+#define CLK_GPU_MEM_EB			2
+#define CLK_GPU_MEM			3
+#define CLK_GPU_SYS_EB			4
+#define CLK_GPU_SYS			5
+#define CLK_GPU_CLK_NUM			(CLK_GPU_SYS + 1)
+
+#define CLK_AUDCP_IIS0_EB		0
+#define CLK_AUDCP_IIS1_EB		1
+#define CLK_AUDCP_IIS2_EB		2
+#define CLK_AUDCP_UART_EB		3
+#define CLK_AUDCP_DMA_CP_EB		4
+#define CLK_AUDCP_DMA_AP_EB		5
+#define CLK_AUDCP_SRC48K_EB		6
+#define CLK_AUDCP_MCDT_EB		7
+#define CLK_AUDCP_VBCIFD_EB		8
+#define CLK_AUDCP_VBC_EB		9
+#define CLK_AUDCP_SPLK_EB		10
+#define CLK_AUDCP_ICU_EB		11
+#define CLK_AUDCP_DMA_AP_ASHB_EB	12
+#define CLK_AUDCP_DMA_CP_ASHB_EB	13
+#define CLK_AUDCP_AUD_EB		14
+#define CLK_AUDCP_VBC_24M_EB		15
+#define CLK_AUDCP_TMR_26M_EB		16
+#define CLK_AUDCP_DVFS_ASHB_EB		17
+#define CLK_AUDCP_AHB_GATE_NUM		(CLK_AUDCP_DVFS_ASHB_EB + 1)
+
+#define CLK_AUDCP_WDG_EB		0
+#define CLK_AUDCP_RTC_WDG_EB		1
+#define CLK_AUDCP_TMR0_EB		2
+#define CLK_AUDCP_TMR1_EB		3
+#define CLK_AUDCP_APB_GATE_NUM		(CLK_AUDCP_TMR1_EB + 1)
+
+#define CLK_ACORE0			0
+#define CLK_ACORE1			1
+#define CLK_ACORE2			2
+#define CLK_ACORE3			3
+#define CLK_ACORE4			4
+#define CLK_ACORE5			5
+#define CLK_PCORE0			6
+#define CLK_PCORE1			7
+#define CLK_SCU				8
+#define CLK_ACE				9
+#define CLK_PERIPH			10
+#define CLK_GIC				11
+#define CLK_ATB				12
+#define CLK_DEBUG_APB			13
+#define CLK_APCPU_SEC_NUM		(CLK_DEBUG_APB + 1)
+
+#endif /* _DT_BINDINGS_CLK_UMS512_H_ */
-- 
cgit 


From 3d6e44623841c8b82c2157f2f749019803fb238a Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Sat, 9 Jul 2022 11:19:57 +0800
Subject: kunit: unify module and builtin suite definitions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, KUnit runs built-in tests and tests loaded from modules
differently. For built-in tests, the kunit_test_suite{,s}() macro adds a
list of suites in the .kunit_test_suites linker section. However, for
kernel modules, a module_init() function is used to run the test suites.

This causes problems if tests are included in a module which already
defines module_init/exit_module functions, as they'll conflict with the
kunit-provided ones.

This change removes the kunit-defined module inits, and instead parses
the kunit tests from their own section in the module. After module init,
we call __kunit_test_suites_init() on the contents of that section,
which prepares and runs the suite.

This essentially unifies the module- and non-module kunit init formats.

Tested-by: Maíra Canal <maira.canal@usp.br>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Signed-off-by: Daniel Latypov <dlatypov@google.com>
Signed-off-by: David Gow <davidgow@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 include/kunit/test.h   | 49 ++++++-----------------------------------------
 include/linux/module.h |  5 +++++
 kernel/module/main.c   |  6 ++++++
 lib/kunit/test.c       | 52 +++++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 68 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/include/kunit/test.h b/include/kunit/test.h
index 7646d1bcf685..cb155d3da284 100644
--- a/include/kunit/test.h
+++ b/include/kunit/test.h
@@ -250,42 +250,9 @@ static inline int kunit_run_all_tests(void)
 }
 #endif /* IS_BUILTIN(CONFIG_KUNIT) */
 
-#ifdef MODULE
-/**
- * kunit_test_suites_for_module() - used to register one or more
- *			 &struct kunit_suite with KUnit.
- *
- * @__suites: a statically allocated list of &struct kunit_suite.
- *
- * Registers @__suites with the test framework. See &struct kunit_suite for
- * more information.
- *
- * If a test suite is built-in, module_init() gets translated into
- * an initcall which we don't want as the idea is that for builtins
- * the executor will manage execution.  So ensure we do not define
- * module_{init|exit} functions for the builtin case when registering
- * suites via kunit_test_suites() below.
- */
-#define kunit_test_suites_for_module(__suites)				\
-	static int __init kunit_test_suites_init(void)			\
-	{								\
-		return __kunit_test_suites_init(__suites);		\
-	}								\
-	module_init(kunit_test_suites_init);				\
-									\
-	static void __exit kunit_test_suites_exit(void)			\
-	{								\
-		return __kunit_test_suites_exit(__suites);		\
-	}								\
-	module_exit(kunit_test_suites_exit)				\
-	MODULE_INFO(test, "Y");
-#else
-#define kunit_test_suites_for_module(__suites)
-#endif /* MODULE */
-
 #define __kunit_test_suites(unique_array, unique_suites, ...)		       \
+	MODULE_INFO(test, "Y");						       \
 	static struct kunit_suite *unique_array[] = { __VA_ARGS__, NULL };     \
-	kunit_test_suites_for_module(unique_array);			       \
 	static struct kunit_suite **unique_suites			       \
 	__used __section(".kunit_test_suites") = unique_array
 
@@ -295,16 +262,12 @@ static inline int kunit_run_all_tests(void)
  *
  * @__suites: a statically allocated list of &struct kunit_suite.
  *
- * Registers @suites with the test framework. See &struct kunit_suite for
- * more information.
- *
- * When builtin,  KUnit tests are all run via executor; this is done
- * by placing the array of struct kunit_suite * in the .kunit_test_suites
- * ELF section.
+ * Registers @suites with the test framework.
+ * This is done by placing the array of struct kunit_suite * in the
+ * .kunit_test_suites ELF section.
  *
- * An alternative is to build the tests as a module.  Because modules do not
- * support multiple initcall()s, we need to initialize an array of suites for a
- * module.
+ * When builtin, KUnit tests are all run via the executor at boot, and when
+ * built as a module, they run on module load.
  *
  */
 #define kunit_test_suites(__suites...)						\
diff --git a/include/linux/module.h b/include/linux/module.h
index abd9fa916b7d..2490223c975d 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -505,6 +505,11 @@ struct module {
 	int num_static_call_sites;
 	struct static_call_site *static_call_sites;
 #endif
+#if IS_ENABLED(CONFIG_KUNIT)
+	int num_kunit_suites;
+	struct kunit_suite ***kunit_suites;
+#endif
+
 
 #ifdef CONFIG_LIVEPATCH
 	bool klp; /* Is this a livepatch module? */
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 4723f1316709..324a770f789c 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2094,6 +2094,12 @@ static int find_module_sections(struct module *mod, struct load_info *info)
 					      sizeof(*mod->static_call_sites),
 					      &mod->num_static_call_sites);
 #endif
+#ifdef CONFIG_KUNIT
+	mod->kunit_suites = section_objs(info, ".kunit_test_suites",
+					      sizeof(*mod->kunit_suites),
+					      &mod->num_kunit_suites);
+#endif
+
 	mod->extable = section_objs(info, "__ex_table",
 				    sizeof(*mod->extable), &mod->num_exentries);
 
diff --git a/lib/kunit/test.c b/lib/kunit/test.c
index 8b11552dc215..246645eb3cef 100644
--- a/lib/kunit/test.c
+++ b/lib/kunit/test.c
@@ -10,6 +10,7 @@
 #include <kunit/test.h>
 #include <kunit/test-bug.h>
 #include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/panic.h>
 #include <linux/sched/debug.h>
@@ -613,6 +614,49 @@ void __kunit_test_suites_exit(struct kunit_suite **suites)
 }
 EXPORT_SYMBOL_GPL(__kunit_test_suites_exit);
 
+#ifdef CONFIG_MODULES
+static void kunit_module_init(struct module *mod)
+{
+	unsigned int i;
+
+	for (i = 0; i < mod->num_kunit_suites; i++)
+		__kunit_test_suites_init(mod->kunit_suites[i]);
+}
+
+static void kunit_module_exit(struct module *mod)
+{
+	unsigned int i;
+
+	for (i = 0; i < mod->num_kunit_suites; i++)
+		__kunit_test_suites_exit(mod->kunit_suites[i]);
+}
+
+static int kunit_module_notify(struct notifier_block *nb, unsigned long val,
+			       void *data)
+{
+	struct module *mod = data;
+
+	switch (val) {
+	case MODULE_STATE_LIVE:
+		kunit_module_init(mod);
+		break;
+	case MODULE_STATE_GOING:
+		kunit_module_exit(mod);
+		break;
+	case MODULE_STATE_COMING:
+	case MODULE_STATE_UNFORMED:
+		break;
+	}
+
+	return 0;
+}
+
+static struct notifier_block kunit_mod_nb = {
+	.notifier_call = kunit_module_notify,
+	.priority = 0,
+};
+#endif
+
 struct kunit_kmalloc_array_params {
 	size_t n;
 	size_t size;
@@ -707,13 +751,19 @@ EXPORT_SYMBOL_GPL(kunit_cleanup);
 static int __init kunit_init(void)
 {
 	kunit_debugfs_init();
-
+#ifdef CONFIG_MODULES
+	return register_module_notifier(&kunit_mod_nb);
+#else
 	return 0;
+#endif
 }
 late_initcall(kunit_init);
 
 static void __exit kunit_exit(void)
 {
+#ifdef CONFIG_MODULES
+	unregister_module_notifier(&kunit_mod_nb);
+#endif
 	kunit_debugfs_cleanup();
 }
 module_exit(kunit_exit);
-- 
cgit 


From e5857d396f35e59e6fe96cf1178b0357cc3a1ea4 Mon Sep 17 00:00:00 2001
From: Daniel Latypov <dlatypov@google.com>
Date: Sat, 9 Jul 2022 11:19:58 +0800
Subject: kunit: flatten kunit_suite*** to kunit_suite** in .kunit_test_suites
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We currently store kunit suites in the .kunit_test_suites ELF section as
a `struct kunit_suite***` (modulo some `const`s).
For every test file, we store a struct kunit_suite** NULL-terminated array.

This adds quite a bit of complexity to the test filtering code in the
executor.

Instead, let's just make the .kunit_test_suites section contain a single
giant array of struct kunit_suite pointers, which can then be directly
manipulated. This array is not NULL-terminated, and so none of the test
filtering code needs to NULL-terminate anything.

Tested-by: Maíra Canal <maira.canal@usp.br>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Daniel Latypov <dlatypov@google.com>
Co-developed-by: David Gow <davidgow@google.com>
Signed-off-by: David Gow <davidgow@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 include/kunit/test.h      |  13 ++---
 include/linux/module.h    |   2 +-
 lib/kunit/executor.c      | 115 +++++++++---------------------------
 lib/kunit/executor_test.c | 144 +++++++++++++---------------------------------
 lib/kunit/test.c          |  18 ++----
 5 files changed, 82 insertions(+), 210 deletions(-)

(limited to 'include')

diff --git a/include/kunit/test.h b/include/kunit/test.h
index cb155d3da284..c958855681cc 100644
--- a/include/kunit/test.h
+++ b/include/kunit/test.h
@@ -237,9 +237,9 @@ size_t kunit_suite_num_test_cases(struct kunit_suite *suite);
 unsigned int kunit_test_case_num(struct kunit_suite *suite,
 				 struct kunit_case *test_case);
 
-int __kunit_test_suites_init(struct kunit_suite * const * const suites);
+int __kunit_test_suites_init(struct kunit_suite * const * const suites, int num_suites);
 
-void __kunit_test_suites_exit(struct kunit_suite **suites);
+void __kunit_test_suites_exit(struct kunit_suite **suites, int num_suites);
 
 #if IS_BUILTIN(CONFIG_KUNIT)
 int kunit_run_all_tests(void);
@@ -250,11 +250,11 @@ static inline int kunit_run_all_tests(void)
 }
 #endif /* IS_BUILTIN(CONFIG_KUNIT) */
 
-#define __kunit_test_suites(unique_array, unique_suites, ...)		       \
+#define __kunit_test_suites(unique_array, ...)				       \
 	MODULE_INFO(test, "Y");						       \
-	static struct kunit_suite *unique_array[] = { __VA_ARGS__, NULL };     \
-	static struct kunit_suite **unique_suites			       \
-	__used __section(".kunit_test_suites") = unique_array
+	static struct kunit_suite *unique_array[]			       \
+	__aligned(sizeof(struct kunit_suite *))				       \
+	__used __section(".kunit_test_suites") = { __VA_ARGS__ }
 
 /**
  * kunit_test_suites() - used to register one or more &struct kunit_suite
@@ -272,7 +272,6 @@ static inline int kunit_run_all_tests(void)
  */
 #define kunit_test_suites(__suites...)						\
 	__kunit_test_suites(__UNIQUE_ID(array),				\
-			    __UNIQUE_ID(suites),			\
 			    ##__suites)
 
 #define kunit_test_suite(suite)	kunit_test_suites(&suite)
diff --git a/include/linux/module.h b/include/linux/module.h
index 2490223c975d..518296ea7f73 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -507,7 +507,7 @@ struct module {
 #endif
 #if IS_ENABLED(CONFIG_KUNIT)
 	int num_kunit_suites;
-	struct kunit_suite ***kunit_suites;
+	struct kunit_suite **kunit_suites;
 #endif
 
 
diff --git a/lib/kunit/executor.c b/lib/kunit/executor.c
index 572f64e0a41a..6c489d6c5e5d 100644
--- a/lib/kunit/executor.c
+++ b/lib/kunit/executor.c
@@ -9,8 +9,8 @@
  * These symbols point to the .kunit_test_suites section and are defined in
  * include/asm-generic/vmlinux.lds.h, and consequently must be extern.
  */
-extern struct kunit_suite * const * const __kunit_suites_start[];
-extern struct kunit_suite * const * const __kunit_suites_end[];
+extern struct kunit_suite * const __kunit_suites_start[];
+extern struct kunit_suite * const __kunit_suites_end[];
 
 #if IS_BUILTIN(CONFIG_KUNIT)
 
@@ -90,62 +90,18 @@ kunit_filter_tests(const struct kunit_suite *const suite, const char *test_glob)
 static char *kunit_shutdown;
 core_param(kunit_shutdown, kunit_shutdown, charp, 0644);
 
-static struct kunit_suite * const *
-kunit_filter_subsuite(struct kunit_suite * const * const subsuite,
-		      struct kunit_test_filter *filter)
-{
-	int i, n = 0;
-	struct kunit_suite **filtered, *filtered_suite;
-
-	n = 0;
-	for (i = 0; subsuite[i]; ++i) {
-		if (glob_match(filter->suite_glob, subsuite[i]->name))
-			++n;
-	}
-
-	if (n == 0)
-		return NULL;
-
-	filtered = kmalloc_array(n + 1, sizeof(*filtered), GFP_KERNEL);
-	if (!filtered)
-		return ERR_PTR(-ENOMEM);
-
-	n = 0;
-	for (i = 0; subsuite[i] != NULL; ++i) {
-		if (!glob_match(filter->suite_glob, subsuite[i]->name))
-			continue;
-		filtered_suite = kunit_filter_tests(subsuite[i], filter->test_glob);
-		if (IS_ERR(filtered_suite))
-			return ERR_CAST(filtered_suite);
-		else if (filtered_suite)
-			filtered[n++] = filtered_suite;
-	}
-	filtered[n] = NULL;
-
-	return filtered;
-}
-
+/* Stores an array of suites, end points one past the end */
 struct suite_set {
-	struct kunit_suite * const * const *start;
-	struct kunit_suite * const * const *end;
+	struct kunit_suite * const *start;
+	struct kunit_suite * const *end;
 };
 
-static void kunit_free_subsuite(struct kunit_suite * const *subsuite)
-{
-	unsigned int i;
-
-	for (i = 0; subsuite[i]; i++)
-		kfree(subsuite[i]);
-
-	kfree(subsuite);
-}
-
 static void kunit_free_suite_set(struct suite_set suite_set)
 {
-	struct kunit_suite * const * const *suites;
+	struct kunit_suite * const *suites;
 
 	for (suites = suite_set.start; suites < suite_set.end; suites++)
-		kunit_free_subsuite(*suites);
+		kfree(*suites);
 	kfree(suite_set.start);
 }
 
@@ -154,7 +110,7 @@ static struct suite_set kunit_filter_suites(const struct suite_set *suite_set,
 					    int *err)
 {
 	int i;
-	struct kunit_suite * const **copy, * const *filtered_subsuite;
+	struct kunit_suite **copy, *filtered_suite;
 	struct suite_set filtered;
 	struct kunit_test_filter filter;
 
@@ -169,14 +125,19 @@ static struct suite_set kunit_filter_suites(const struct suite_set *suite_set,
 
 	kunit_parse_filter_glob(&filter, filter_glob);
 
-	for (i = 0; i < max; ++i) {
-		filtered_subsuite = kunit_filter_subsuite(suite_set->start[i], &filter);
-		if (IS_ERR(filtered_subsuite)) {
-			*err = PTR_ERR(filtered_subsuite);
+	for (i = 0; &suite_set->start[i] != suite_set->end; i++) {
+		if (!glob_match(filter.suite_glob, suite_set->start[i]->name))
+			continue;
+
+		filtered_suite = kunit_filter_tests(suite_set->start[i], filter.test_glob);
+		if (IS_ERR(filtered_suite)) {
+			*err = PTR_ERR(filtered_suite);
 			return filtered;
 		}
-		if (filtered_subsuite)
-			*copy++ = filtered_subsuite;
+		if (!filtered_suite)
+			continue;
+
+		*copy++ = filtered_suite;
 	}
 	filtered.end = copy;
 
@@ -199,52 +160,33 @@ static void kunit_handle_shutdown(void)
 
 }
 
-static void kunit_print_tap_header(struct suite_set *suite_set)
-{
-	struct kunit_suite * const * const *suites, * const *subsuite;
-	int num_of_suites = 0;
-
-	for (suites = suite_set->start; suites < suite_set->end; suites++)
-		for (subsuite = *suites; *subsuite != NULL; subsuite++)
-			num_of_suites++;
-
-	pr_info("TAP version 14\n");
-	pr_info("1..%d\n", num_of_suites);
-}
-
 static void kunit_exec_run_tests(struct suite_set *suite_set)
 {
-	struct kunit_suite * const * const *suites;
+	size_t num_suites = suite_set->end - suite_set->start;
 
-	kunit_print_tap_header(suite_set);
+	pr_info("TAP version 14\n");
+	pr_info("1..%zu\n", num_suites);
 
-	for (suites = suite_set->start; suites < suite_set->end; suites++)
-		__kunit_test_suites_init(*suites);
+	__kunit_test_suites_init(suite_set->start, num_suites);
 }
 
 static void kunit_exec_list_tests(struct suite_set *suite_set)
 {
-	unsigned int i;
-	struct kunit_suite * const * const *suites;
+	struct kunit_suite * const *suites;
 	struct kunit_case *test_case;
 
 	/* Hack: print a tap header so kunit.py can find the start of KUnit output. */
 	pr_info("TAP version 14\n");
 
 	for (suites = suite_set->start; suites < suite_set->end; suites++)
-		for (i = 0; (*suites)[i] != NULL; i++) {
-			kunit_suite_for_each_test_case((*suites)[i], test_case) {
-				pr_info("%s.%s\n", (*suites)[i]->name, test_case->name);
-			}
+		kunit_suite_for_each_test_case((*suites), test_case) {
+			pr_info("%s.%s\n", (*suites)->name, test_case->name);
 		}
 }
 
 int kunit_run_all_tests(void)
 {
-	struct suite_set suite_set = {
-		.start = __kunit_suites_start,
-		.end = __kunit_suites_end,
-	};
+	struct suite_set suite_set = {__kunit_suites_start, __kunit_suites_end};
 	int err = 0;
 
 	if (filter_glob_param) {
@@ -262,11 +204,10 @@ int kunit_run_all_tests(void)
 	else
 		pr_err("kunit executor: unknown action '%s'\n", action_param);
 
-	if (filter_glob_param) { /* a copy was made of each array */
+	if (filter_glob_param) { /* a copy was made of each suite */
 		kunit_free_suite_set(suite_set);
 	}
 
-
 out:
 	kunit_handle_shutdown();
 	return err;
diff --git a/lib/kunit/executor_test.c b/lib/kunit/executor_test.c
index eac6ff480273..0cea31c27b23 100644
--- a/lib/kunit/executor_test.c
+++ b/lib/kunit/executor_test.c
@@ -9,8 +9,6 @@
 #include <kunit/test.h>
 
 static void kfree_at_end(struct kunit *test, const void *to_free);
-static void free_subsuite_at_end(struct kunit *test,
-				 struct kunit_suite *const *to_free);
 static struct kunit_suite *alloc_fake_suite(struct kunit *test,
 					    const char *suite_name,
 					    struct kunit_case *test_cases);
@@ -41,126 +39,80 @@ static void parse_filter_test(struct kunit *test)
 	kfree(filter.test_glob);
 }
 
-static void filter_subsuite_test(struct kunit *test)
+static void filter_suites_test(struct kunit *test)
 {
-	struct kunit_suite *subsuite[3] = {NULL, NULL, NULL};
-	struct kunit_suite * const *filtered;
-	struct kunit_test_filter filter = {
-		.suite_glob = "suite2",
-		.test_glob = NULL,
-	};
+	struct kunit_suite *subsuite[3] = {NULL, NULL};
+	struct suite_set suite_set = {.start = subsuite, .end = &subsuite[2]};
+	struct suite_set got;
+	int err = 0;
 
 	subsuite[0] = alloc_fake_suite(test, "suite1", dummy_test_cases);
 	subsuite[1] = alloc_fake_suite(test, "suite2", dummy_test_cases);
 
 	/* Want: suite1, suite2, NULL -> suite2, NULL */
-	filtered = kunit_filter_subsuite(subsuite, &filter);
-	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filtered);
-	free_subsuite_at_end(test, filtered);
+	got = kunit_filter_suites(&suite_set, "suite2", &err);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, got.start);
+	KUNIT_ASSERT_EQ(test, err, 0);
+	kfree_at_end(test, got.start);
 
 	/* Validate we just have suite2 */
-	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filtered[0]);
-	KUNIT_EXPECT_STREQ(test, (const char *)filtered[0]->name, "suite2");
-	KUNIT_EXPECT_FALSE(test, filtered[1]);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, got.start[0]);
+	KUNIT_EXPECT_STREQ(test, (const char *)got.start[0]->name, "suite2");
+
+	/* Contains one element (end is 1 past end) */
+	KUNIT_ASSERT_EQ(test, got.end - got.start, 1);
 }
 
-static void filter_subsuite_test_glob_test(struct kunit *test)
+static void filter_suites_test_glob_test(struct kunit *test)
 {
-	struct kunit_suite *subsuite[3] = {NULL, NULL, NULL};
-	struct kunit_suite * const *filtered;
-	struct kunit_test_filter filter = {
-		.suite_glob = "suite2",
-		.test_glob = "test2",
-	};
+	struct kunit_suite *subsuite[3] = {NULL, NULL};
+	struct suite_set suite_set = {.start = subsuite, .end = &subsuite[2]};
+	struct suite_set got;
+	int err = 0;
 
 	subsuite[0] = alloc_fake_suite(test, "suite1", dummy_test_cases);
 	subsuite[1] = alloc_fake_suite(test, "suite2", dummy_test_cases);
 
 	/* Want: suite1, suite2, NULL -> suite2 (just test1), NULL */
-	filtered = kunit_filter_subsuite(subsuite, &filter);
-	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filtered);
-	free_subsuite_at_end(test, filtered);
+	got = kunit_filter_suites(&suite_set, "suite2.test2", &err);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, got.start);
+	KUNIT_ASSERT_EQ(test, err, 0);
+	kfree_at_end(test, got.start);
 
 	/* Validate we just have suite2 */
-	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filtered[0]);
-	KUNIT_EXPECT_STREQ(test, (const char *)filtered[0]->name, "suite2");
-	KUNIT_EXPECT_FALSE(test, filtered[1]);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, got.start[0]);
+	KUNIT_EXPECT_STREQ(test, (const char *)got.start[0]->name, "suite2");
+	KUNIT_ASSERT_EQ(test, got.end - got.start, 1);
 
 	/* Now validate we just have test2 */
-	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filtered[0]->test_cases);
-	KUNIT_EXPECT_STREQ(test, (const char *)filtered[0]->test_cases[0].name, "test2");
-	KUNIT_EXPECT_FALSE(test, filtered[0]->test_cases[1].name);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, got.start[0]->test_cases);
+	KUNIT_EXPECT_STREQ(test, (const char *)got.start[0]->test_cases[0].name, "test2");
+	KUNIT_EXPECT_FALSE(test, got.start[0]->test_cases[1].name);
 }
 
-static void filter_subsuite_to_empty_test(struct kunit *test)
+static void filter_suites_to_empty_test(struct kunit *test)
 {
-	struct kunit_suite *subsuite[3] = {NULL, NULL, NULL};
-	struct kunit_suite * const *filtered;
-	struct kunit_test_filter filter = {
-		.suite_glob = "not_found",
-		.test_glob = NULL,
-	};
+	struct kunit_suite *subsuite[3] = {NULL, NULL};
+	struct suite_set suite_set = {.start = subsuite, .end = &subsuite[2]};
+	struct suite_set got;
+	int err = 0;
 
 	subsuite[0] = alloc_fake_suite(test, "suite1", dummy_test_cases);
 	subsuite[1] = alloc_fake_suite(test, "suite2", dummy_test_cases);
 
-	filtered = kunit_filter_subsuite(subsuite, &filter);
-	free_subsuite_at_end(test, filtered); /* just in case */
+	got = kunit_filter_suites(&suite_set, "not_found", &err);
+	KUNIT_ASSERT_EQ(test, err, 0);
+	kfree_at_end(test, got.start); /* just in case */
 
-	KUNIT_EXPECT_FALSE_MSG(test, filtered,
-			       "should be NULL to indicate no match");
-}
-
-static void kfree_subsuites_at_end(struct kunit *test, struct suite_set *suite_set)
-{
-	struct kunit_suite * const * const *suites;
-
-	kfree_at_end(test, suite_set->start);
-	for (suites = suite_set->start; suites < suite_set->end; suites++)
-		free_subsuite_at_end(test, *suites);
-}
-
-static void filter_suites_test(struct kunit *test)
-{
-	/* Suites per-file are stored as a NULL terminated array */
-	struct kunit_suite *subsuites[2][2] = {
-		{NULL, NULL},
-		{NULL, NULL},
-	};
-	/* Match the memory layout of suite_set */
-	struct kunit_suite * const * const suites[2] = {
-		subsuites[0], subsuites[1],
-	};
-
-	const struct suite_set suite_set = {
-		.start = suites,
-		.end = suites + 2,
-	};
-	struct suite_set filtered = {.start = NULL, .end = NULL};
-	int err = 0;
-
-	/* Emulate two files, each having one suite */
-	subsuites[0][0] = alloc_fake_suite(test, "suite0", dummy_test_cases);
-	subsuites[1][0] = alloc_fake_suite(test, "suite1", dummy_test_cases);
-
-	/* Filter out suite1 */
-	filtered = kunit_filter_suites(&suite_set, "suite0", &err);
-	kfree_subsuites_at_end(test, &filtered); /* let us use ASSERTs without leaking */
-	KUNIT_EXPECT_EQ(test, err, 0);
-	KUNIT_ASSERT_EQ(test, filtered.end - filtered.start, (ptrdiff_t)1);
-
-	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filtered.start);
-	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filtered.start[0]);
-	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filtered.start[0][0]);
-	KUNIT_EXPECT_STREQ(test, (const char *)filtered.start[0][0]->name, "suite0");
+	KUNIT_EXPECT_PTR_EQ_MSG(test, got.start, got.end,
+				"should be empty to indicate no match");
 }
 
 static struct kunit_case executor_test_cases[] = {
 	KUNIT_CASE(parse_filter_test),
-	KUNIT_CASE(filter_subsuite_test),
-	KUNIT_CASE(filter_subsuite_test_glob_test),
-	KUNIT_CASE(filter_subsuite_to_empty_test),
 	KUNIT_CASE(filter_suites_test),
+	KUNIT_CASE(filter_suites_test_glob_test),
+	KUNIT_CASE(filter_suites_to_empty_test),
 	{}
 };
 
@@ -190,20 +142,6 @@ static void kfree_at_end(struct kunit *test, const void *to_free)
 			     (void *)to_free);
 }
 
-static void free_subsuite_res_free(struct kunit_resource *res)
-{
-	kunit_free_subsuite(res->data);
-}
-
-static void free_subsuite_at_end(struct kunit *test,
-				 struct kunit_suite *const *to_free)
-{
-	if (IS_ERR_OR_NULL(to_free))
-		return;
-	kunit_alloc_resource(test, NULL, free_subsuite_res_free,
-			     GFP_KERNEL, (void *)to_free);
-}
-
 static struct kunit_suite *alloc_fake_suite(struct kunit *test,
 					    const char *suite_name,
 					    struct kunit_case *test_cases)
diff --git a/lib/kunit/test.c b/lib/kunit/test.c
index 246645eb3cef..b73d5bb5c473 100644
--- a/lib/kunit/test.c
+++ b/lib/kunit/test.c
@@ -586,11 +586,11 @@ static void kunit_init_suite(struct kunit_suite *suite)
 	suite->suite_init_err = 0;
 }
 
-int __kunit_test_suites_init(struct kunit_suite * const * const suites)
+int __kunit_test_suites_init(struct kunit_suite * const * const suites, int num_suites)
 {
 	unsigned int i;
 
-	for (i = 0; suites[i] != NULL; i++) {
+	for (i = 0; i < num_suites; i++) {
 		kunit_init_suite(suites[i]);
 		kunit_run_tests(suites[i]);
 	}
@@ -603,11 +603,11 @@ static void kunit_exit_suite(struct kunit_suite *suite)
 	kunit_debugfs_destroy_suite(suite);
 }
 
-void __kunit_test_suites_exit(struct kunit_suite **suites)
+void __kunit_test_suites_exit(struct kunit_suite **suites, int num_suites)
 {
 	unsigned int i;
 
-	for (i = 0; suites[i] != NULL; i++)
+	for (i = 0; i < num_suites; i++)
 		kunit_exit_suite(suites[i]);
 
 	kunit_suite_counter = 1;
@@ -617,18 +617,12 @@ EXPORT_SYMBOL_GPL(__kunit_test_suites_exit);
 #ifdef CONFIG_MODULES
 static void kunit_module_init(struct module *mod)
 {
-	unsigned int i;
-
-	for (i = 0; i < mod->num_kunit_suites; i++)
-		__kunit_test_suites_init(mod->kunit_suites[i]);
+	__kunit_test_suites_init(mod->kunit_suites, mod->num_kunit_suites);
 }
 
 static void kunit_module_exit(struct module *mod)
 {
-	unsigned int i;
-
-	for (i = 0; i < mod->num_kunit_suites; i++)
-		__kunit_test_suites_exit(mod->kunit_suites[i]);
+	__kunit_test_suites_exit(mod->kunit_suites, mod->num_kunit_suites);
 }
 
 static int kunit_module_notify(struct notifier_block *nb, unsigned long val,
-- 
cgit 


From 1090c1ea2208702a2fe0e3f71d262e3097d939f6 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 8 Jul 2022 19:52:52 -0700
Subject: tls: fix spelling of MIB

MIN -> MIB

Fixes: 88527790c079 ("tls: rx: add sockopt for enabling optimistic decrypt with TLS 1.3")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/snmp.h | 2 +-
 net/tls/tls_proc.c        | 2 +-
 net/tls/tls_sw.c          | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 1c9152add663..fd83fb9e525a 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -344,7 +344,7 @@ enum
 	LINUX_MIB_TLSRXDEVICE,			/* TlsRxDevice */
 	LINUX_MIB_TLSDECRYPTERROR,		/* TlsDecryptError */
 	LINUX_MIB_TLSRXDEVICERESYNC,		/* TlsRxDeviceResync */
-	LINUX_MIN_TLSDECRYPTRETRY,		/* TlsDecryptRetry */
+	LINUX_MIB_TLSDECRYPTRETRY,		/* TlsDecryptRetry */
 	__LINUX_MIB_TLSMAX
 };
 
diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c
index 1246e52b48f6..ede9df13c398 100644
--- a/net/tls/tls_proc.c
+++ b/net/tls/tls_proc.c
@@ -20,7 +20,7 @@ static const struct snmp_mib tls_mib_list[] = {
 	SNMP_MIB_ITEM("TlsRxDevice", LINUX_MIB_TLSRXDEVICE),
 	SNMP_MIB_ITEM("TlsDecryptError", LINUX_MIB_TLSDECRYPTERROR),
 	SNMP_MIB_ITEM("TlsRxDeviceResync", LINUX_MIB_TLSRXDEVICERESYNC),
-	SNMP_MIB_ITEM("TlsDecryptRetry", LINUX_MIN_TLSDECRYPTRETRY),
+	SNMP_MIB_ITEM("TlsDecryptRetry", LINUX_MIB_TLSDECRYPTRETRY),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 09370f853031..e12846d1871a 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1596,7 +1596,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
 	if (unlikely(darg->zc && prot->version == TLS_1_3_VERSION &&
 		     darg->tail != TLS_RECORD_TYPE_DATA)) {
 		darg->zc = false;
-		TLS_INC_STATS(sock_net(sk), LINUX_MIN_TLSDECRYPTRETRY);
+		TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTRETRY);
 		return decrypt_skb_update(sk, skb, dest, darg);
 	}
 
-- 
cgit 


From bb56cea9abd85c22175b31d8f7c44d6c615fe526 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 8 Jul 2022 19:52:53 -0700
Subject: tls: rx: add counter for NoPad violations

As discussed with Maxim add a counter for true NoPad violations.
This should help deployments catch unexpected padded records vs
just control records which always need re-encryption.

https: //lore.kernel.org/all/b111828e6ac34baad9f4e783127eba8344ac252d.camel@nvidia.com/
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/tls.rst | 4 ++++
 include/uapi/linux/snmp.h        | 1 +
 net/tls/tls_proc.c               | 1 +
 net/tls/tls_sw.c                 | 2 ++
 4 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst
index 7a6643836e42..658ed3a71e1b 100644
--- a/Documentation/networking/tls.rst
+++ b/Documentation/networking/tls.rst
@@ -282,3 +282,7 @@ TLS implementation exposes the following per-namespace statistics
   number of RX records which had to be re-decrypted due to
   ``TLS_RX_EXPECT_NO_PAD`` mis-prediction. Note that this counter will
   also increment for non-data records.
+
+- ``TlsRxNoPadViolation`` -
+  number of data RX records which had to be re-decrypted due to
+  ``TLS_RX_EXPECT_NO_PAD`` mis-prediction.
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index fd83fb9e525a..4d7470036a8b 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -345,6 +345,7 @@ enum
 	LINUX_MIB_TLSDECRYPTERROR,		/* TlsDecryptError */
 	LINUX_MIB_TLSRXDEVICERESYNC,		/* TlsRxDeviceResync */
 	LINUX_MIB_TLSDECRYPTRETRY,		/* TlsDecryptRetry */
+	LINUX_MIB_TLSRXNOPADVIOL,		/* TlsRxNoPadViolation */
 	__LINUX_MIB_TLSMAX
 };
 
diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c
index ede9df13c398..68982728f620 100644
--- a/net/tls/tls_proc.c
+++ b/net/tls/tls_proc.c
@@ -21,6 +21,7 @@ static const struct snmp_mib tls_mib_list[] = {
 	SNMP_MIB_ITEM("TlsDecryptError", LINUX_MIB_TLSDECRYPTERROR),
 	SNMP_MIB_ITEM("TlsRxDeviceResync", LINUX_MIB_TLSRXDEVICERESYNC),
 	SNMP_MIB_ITEM("TlsDecryptRetry", LINUX_MIB_TLSDECRYPTRETRY),
+	SNMP_MIB_ITEM("TlsRxNoPadViolation", LINUX_MIB_TLSRXNOPADVIOL),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index e12846d1871a..68d79ee48a56 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1596,6 +1596,8 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
 	if (unlikely(darg->zc && prot->version == TLS_1_3_VERSION &&
 		     darg->tail != TLS_RECORD_TYPE_DATA)) {
 		darg->zc = false;
+		if (!darg->tail)
+			TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXNOPADVIOL);
 		TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTRETRY);
 		return decrypt_skb_update(sk, skb, dest, darg);
 	}
-- 
cgit 


From 2b8bf3d6c99318eae669e3098c490ba6b508fd37 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 9 Jul 2022 16:37:53 +0200
Subject: net/fq_impl: Use the bitmap API to allocate bitmaps

Use bitmap_zalloc()/bitmap_free() instead of hand-writing them.

It is less verbose and it improves the semantic.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Link: https://lore.kernel.org/r/c7bf099af07eb497b02d195906ee8c11fea3b3bd.1657377335.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/fq_impl.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h
index a5f67a2c0c73..524b510f1c68 100644
--- a/include/net/fq_impl.h
+++ b/include/net/fq_impl.h
@@ -358,8 +358,7 @@ static int fq_init(struct fq *fq, int flows_cnt)
 	if (!fq->flows)
 		return -ENOMEM;
 
-	fq->flows_bitmap = kcalloc(BITS_TO_LONGS(fq->flows_cnt), sizeof(long),
-				   GFP_KERNEL);
+	fq->flows_bitmap = bitmap_zalloc(fq->flows_cnt, GFP_KERNEL);
 	if (!fq->flows_bitmap) {
 		kvfree(fq->flows);
 		fq->flows = NULL;
@@ -383,7 +382,7 @@ static void fq_reset(struct fq *fq,
 	kvfree(fq->flows);
 	fq->flows = NULL;
 
-	kfree(fq->flows_bitmap);
+	bitmap_free(fq->flows_bitmap);
 	fq->flows_bitmap = NULL;
 }
 
-- 
cgit 


From f16214c102f0f64b2f3546e989498525bd7b7708 Mon Sep 17 00:00:00 2001
From: Matthieu Baerts <matthieu.baerts@tessares.net>
Date: Mon, 11 Jul 2022 10:12:00 +0200
Subject: bpf: Fix 'dubious one-bit signed bitfield' warnings

Our CI[1] reported these warnings when using Sparse:

  $ touch net/mptcp/bpf.c
  $ make C=1 net/mptcp/bpf.o
  net/mptcp/bpf.c: note: in included file:
  include/linux/bpf_verifier.h:348:26: error: dubious one-bit signed bitfield
  include/linux/bpf_verifier.h:349:29: error: dubious one-bit signed bitfield

Set them as 'unsigned' to avoid warnings.

[1] https://github.com/multipath-tcp/mptcp_net-next/actions/runs/2643588487

Fixes: 1ade23711971 ("bpf: Inline calls to bpf_loop when callback is known")
Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20220711081200.2081262-1-matthieu.baerts@tessares.net
---
 include/linux/bpf_verifier.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 81b19669efba..2e3bad8640dc 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -345,10 +345,10 @@ struct bpf_verifier_state_list {
 };
 
 struct bpf_loop_inline_state {
-	int initialized:1; /* set to true upon first entry */
-	int fit_for_inline:1; /* true if callback function is the same
-			       * at each call and flags are always zero
-			       */
+	unsigned int initialized:1; /* set to true upon first entry */
+	unsigned int fit_for_inline:1; /* true if callback function is the same
+					* at each call and flags are always zero
+					*/
 	u32 callback_subprogno; /* valid when fit_for_inline is true */
 };
 
-- 
cgit 


From 2acd21cd00ce635adfec5a8725a0c342812bffb4 Mon Sep 17 00:00:00 2001
From: Dan Rapaport <drapaport@habana.ai>
Date: Mon, 30 May 2022 14:11:45 +0300
Subject: habanalabs: align ioctl uapi structures to 64-bit

The compiler is padding the members of the struct to be aligned to
64-bit. The content of the padded bytes is and not zeroed explicitly,
hence might copy undefined data. We add a padding member to the struct
to get a zeroed 64-bit align struct.

Signed-off-by: Dan Rapaport <drapaport@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 include/uapi/misc/habanalabs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 52540d5b4fc9..6d2ccc09dcf2 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -949,6 +949,7 @@ struct hl_cs_in {
 
 	/* Context ID - Currently not in use */
 	__u32 ctx_id;
+	__u8 pad[4];
 };
 
 struct hl_cs_out {
-- 
cgit 


From a7d6c35bcd6b5389eb680daff5839339bd8206e0 Mon Sep 17 00:00:00 2001
From: Tal Cohen <talcohen@habana.ai>
Date: Wed, 11 May 2022 18:02:39 +0300
Subject: habanalabs/gaudi: collect undefined opcode error info

when an undefined opcode error occurres, the driver collects
the relevant information from the Qman and stores it inside
the hdev data structure. An event fd indication is sent towards the
user space.

Note: another commit shall be followed which will add support to
read the error info by an ioctl.

Signed-off-by: Tal Cohen <talcohen@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c         |  13 +--
 drivers/misc/habanalabs/common/habanalabs.h     |  40 ++++++++-
 drivers/misc/habanalabs/common/habanalabs_drv.c |   1 +
 drivers/misc/habanalabs/gaudi/gaudi.c           | 108 +++++++++++++++++++-----
 include/uapi/misc/habanalabs.h                  |   8 +-
 5 files changed, 138 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 38e1ad432e51..0f804ecb6caa 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -1531,10 +1531,11 @@ out_err:
 	return rc;
 }
 
-static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event)
+static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event_mask)
 {
 	mutex_lock(&notifier_event->lock);
-	notifier_event->events_mask |= event;
+	notifier_event->events_mask |= event_mask;
+
 	if (notifier_event->eventfd)
 		eventfd_signal(notifier_event->eventfd, 1);
 
@@ -1545,17 +1546,17 @@ static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64
  * hl_notifier_event_send_all - notify all user processes via eventfd
  *
  * @hdev: pointer to habanalabs device structure
- * @event: the occurred event
+ * @event_mask: the occurred event/s
  * Returns 0 for success or an error on failure.
  */
-void hl_notifier_event_send_all(struct hl_device *hdev, u64 event)
+void hl_notifier_event_send_all(struct hl_device *hdev, u64 event_mask)
 {
 	struct hl_fpriv	*hpriv;
 
 	mutex_lock(&hdev->fpriv_list_lock);
 
 	list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node)
-		hl_notifier_event_send(&hpriv->notifier_event, event);
+		hl_notifier_event_send(&hpriv->notifier_event, event_mask);
 
 	mutex_unlock(&hdev->fpriv_list_lock);
 
@@ -1563,7 +1564,7 @@ void hl_notifier_event_send_all(struct hl_device *hdev, u64 event)
 	mutex_lock(&hdev->fpriv_ctrl_list_lock);
 
 	list_for_each_entry(hpriv, &hdev->fpriv_ctrl_list, dev_node)
-		hl_notifier_event_send(&hpriv->notifier_event, event);
+		hl_notifier_event_send(&hpriv->notifier_event, event_mask);
 
 	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
 }
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 1ab64e8a05c6..3a0f6dca8361 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2644,14 +2644,48 @@ struct razwi_info {
 	u8		type;
 };
 
+#define MAX_QMAN_STREAMS_INFO		4
+#define OPCODE_INFO_MAX_ADDR_SIZE	8
+/**
+ * struct undefined_opcode_info - info about last undefined opcode error
+ * @timestamp: timestamp of the undefined opcode error
+ * @cb_addr_streams: CB addresses (per stream) that are currently exists in the PQ
+ *                   entiers. In case all streams array entries are
+ *                   filled with values, it means the execution was in Lower-CP.
+ * @cq_addr: the address of the current handled command buffer
+ * @cq_size: the size of the current handled command buffer
+ * @cb_addr_streams_len: num of streams - actual len of cb_addr_streams array.
+ *                       should be equal to 1 incase of undefined opcode
+ *                       in Upper-CP (specific stream) and equal to 4 incase
+ *                       of undefined opcode in Lower-CP.
+ * @engine_id: engine-id that the error occurred on
+ * @stream_id: the stream id the error occurred on. In case the stream equals to
+ *             MAX_QMAN_STREAMS_INFO it means the error occurred on a Lower-CP.
+ * @write_enable: if set, writing to undefined opcode parameters in the structure
+ *                 is enable so the first (root cause) undefined opcode will not be
+ *                 overwritten.
+ */
+struct undefined_opcode_info {
+	ktime_t timestamp;
+	u64 cb_addr_streams[MAX_QMAN_STREAMS_INFO][OPCODE_INFO_MAX_ADDR_SIZE];
+	u64 cq_addr;
+	u32 cq_size;
+	u32 cb_addr_streams_len;
+	u32 engine_id;
+	u32 stream_id;
+	bool write_enable;
+};
+
 /**
  * struct last_error_session_info - info about last session errors occurred.
  * @cs_timeout: CS timeout error last information.
  * @razwi: razwi last information.
+ * @undef_opcode: undefined opcode information
  */
 struct last_error_session_info {
-	struct	cs_timeout_info	cs_timeout;
-	struct	razwi_info	razwi;
+	struct cs_timeout_info		cs_timeout;
+	struct razwi_info		razwi;
+	struct undefined_opcode_info	undef_opcode;
 };
 
 /**
@@ -3159,7 +3193,7 @@ int hl_device_utilization(struct hl_device *hdev, u32 *utilization);
 int hl_build_hwmon_channel_info(struct hl_device *hdev,
 		struct cpucp_sensor *sensors_arr);
 
-void hl_notifier_event_send_all(struct hl_device *hdev, u64 event);
+void hl_notifier_event_send_all(struct hl_device *hdev, u64 event_mask);
 
 int hl_sysfs_init(struct hl_device *hdev);
 void hl_sysfs_fini(struct hl_device *hdev);
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index e617cc394ff7..d02533666746 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -198,6 +198,7 @@ int hl_device_open(struct inode *inode, struct file *filp)
 
 	atomic_set(&hdev->last_error.cs_timeout.write_enable, 1);
 	atomic_set(&hdev->last_error.razwi.write_enable, 1);
+	hdev->last_error.undef_opcode.write_enable = true;
 
 	hdev->open_counter++;
 	hdev->last_successful_open_jif = jiffies;
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 72b0d145e853..ec9f0a93cbe2 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -443,6 +443,38 @@ static s64 gaudi_state_dump_specs_props[] = {
 	[SP_NUM_CORES] = 1,
 };
 
+static const int gaudi_queue_id_to_engine_id[] = {
+	[GAUDI_QUEUE_ID_DMA_0_0...GAUDI_QUEUE_ID_DMA_0_3] = GAUDI_ENGINE_ID_DMA_0,
+	[GAUDI_QUEUE_ID_DMA_1_0...GAUDI_QUEUE_ID_DMA_1_3] = GAUDI_ENGINE_ID_DMA_1,
+	[GAUDI_QUEUE_ID_CPU_PQ] = GAUDI_ENGINE_ID_SIZE,
+	[GAUDI_QUEUE_ID_DMA_2_0...GAUDI_QUEUE_ID_DMA_2_3] = GAUDI_ENGINE_ID_DMA_2,
+	[GAUDI_QUEUE_ID_DMA_3_0...GAUDI_QUEUE_ID_DMA_3_3] = GAUDI_ENGINE_ID_DMA_3,
+	[GAUDI_QUEUE_ID_DMA_4_0...GAUDI_QUEUE_ID_DMA_4_3] = GAUDI_ENGINE_ID_DMA_4,
+	[GAUDI_QUEUE_ID_DMA_5_0...GAUDI_QUEUE_ID_DMA_5_3] = GAUDI_ENGINE_ID_DMA_5,
+	[GAUDI_QUEUE_ID_DMA_6_0...GAUDI_QUEUE_ID_DMA_6_3] = GAUDI_ENGINE_ID_DMA_6,
+	[GAUDI_QUEUE_ID_DMA_7_0...GAUDI_QUEUE_ID_DMA_7_3] = GAUDI_ENGINE_ID_DMA_7,
+	[GAUDI_QUEUE_ID_MME_0_0...GAUDI_QUEUE_ID_MME_0_3] = GAUDI_ENGINE_ID_MME_0,
+	[GAUDI_QUEUE_ID_MME_1_0...GAUDI_QUEUE_ID_MME_1_3] = GAUDI_ENGINE_ID_MME_1,
+	[GAUDI_QUEUE_ID_TPC_0_0...GAUDI_QUEUE_ID_TPC_0_3] = GAUDI_ENGINE_ID_TPC_0,
+	[GAUDI_QUEUE_ID_TPC_1_0...GAUDI_QUEUE_ID_TPC_1_3] = GAUDI_ENGINE_ID_TPC_1,
+	[GAUDI_QUEUE_ID_TPC_2_0...GAUDI_QUEUE_ID_TPC_2_3] = GAUDI_ENGINE_ID_TPC_2,
+	[GAUDI_QUEUE_ID_TPC_3_0...GAUDI_QUEUE_ID_TPC_3_3] = GAUDI_ENGINE_ID_TPC_3,
+	[GAUDI_QUEUE_ID_TPC_4_0...GAUDI_QUEUE_ID_TPC_4_3] = GAUDI_ENGINE_ID_TPC_4,
+	[GAUDI_QUEUE_ID_TPC_5_0...GAUDI_QUEUE_ID_TPC_5_3] = GAUDI_ENGINE_ID_TPC_5,
+	[GAUDI_QUEUE_ID_TPC_6_0...GAUDI_QUEUE_ID_TPC_6_3] = GAUDI_ENGINE_ID_TPC_6,
+	[GAUDI_QUEUE_ID_TPC_7_0...GAUDI_QUEUE_ID_TPC_7_3] = GAUDI_ENGINE_ID_TPC_7,
+	[GAUDI_QUEUE_ID_NIC_0_0...GAUDI_QUEUE_ID_NIC_0_3] = GAUDI_ENGINE_ID_NIC_0,
+	[GAUDI_QUEUE_ID_NIC_1_0...GAUDI_QUEUE_ID_NIC_1_3] = GAUDI_ENGINE_ID_NIC_1,
+	[GAUDI_QUEUE_ID_NIC_2_0...GAUDI_QUEUE_ID_NIC_2_3] = GAUDI_ENGINE_ID_NIC_2,
+	[GAUDI_QUEUE_ID_NIC_3_0...GAUDI_QUEUE_ID_NIC_3_3] = GAUDI_ENGINE_ID_NIC_3,
+	[GAUDI_QUEUE_ID_NIC_4_0...GAUDI_QUEUE_ID_NIC_4_3] = GAUDI_ENGINE_ID_NIC_4,
+	[GAUDI_QUEUE_ID_NIC_5_0...GAUDI_QUEUE_ID_NIC_5_3] = GAUDI_ENGINE_ID_NIC_5,
+	[GAUDI_QUEUE_ID_NIC_6_0...GAUDI_QUEUE_ID_NIC_6_3] = GAUDI_ENGINE_ID_NIC_6,
+	[GAUDI_QUEUE_ID_NIC_7_0...GAUDI_QUEUE_ID_NIC_7_3] = GAUDI_ENGINE_ID_NIC_7,
+	[GAUDI_QUEUE_ID_NIC_8_0...GAUDI_QUEUE_ID_NIC_8_3] = GAUDI_ENGINE_ID_NIC_8,
+	[GAUDI_QUEUE_ID_NIC_9_0...GAUDI_QUEUE_ID_NIC_9_3] = GAUDI_ENGINE_ID_NIC_9,
+};
+
 /* The order here is opposite to the order of the indexing in the h/w.
  * i.e. SYNC_MGR_W_S is actually 0, SYNC_MGR_E_S is 1, etc.
  */
@@ -6989,14 +7021,15 @@ static inline u32 gaudi_queue_idx_dec(u32 idx, u32 q_len)
 }
 
 /**
- * gaudi_print_sw_config_stream_data - print SW config stream data
+ * gaudi_handle_sw_config_stream_data - print SW config stream data
  *
  * @hdev: pointer to the habanalabs device structure
  * @stream: the QMAN's stream
  * @qman_base: base address of QMAN registers block
+ * @event_mask: mask of the last events occurred
  */
-static void gaudi_print_sw_config_stream_data(struct hl_device *hdev, u32 stream,
-						u64 qman_base)
+static void gaudi_handle_sw_config_stream_data(struct hl_device *hdev, u32 stream,
+						u64 qman_base, u64 event_mask)
 {
 	u64 cq_ptr_lo, cq_ptr_hi, cq_tsize, cq_ptr;
 	u32 cq_ptr_lo_off, size;
@@ -7014,24 +7047,32 @@ static void gaudi_print_sw_config_stream_data(struct hl_device *hdev, u32 stream
 	size = RREG32(cq_tsize);
 	dev_info(hdev->dev, "stop on err: stream: %u, addr: %#llx, size: %u\n",
 							stream, cq_ptr, size);
+
+	if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) {
+		hdev->last_error.undef_opcode.cq_addr = cq_ptr;
+		hdev->last_error.undef_opcode.cq_size = size;
+		hdev->last_error.undef_opcode.stream_id = stream;
+	}
 }
 
 /**
- * gaudi_print_last_pqes_on_err - print last PQEs on error
+ * gaudi_handle_last_pqes_on_err - print last PQEs on error
  *
  * @hdev: pointer to the habanalabs device structure
  * @qid_base: first QID of the QMAN (out of 4 streams)
  * @stream: the QMAN's stream
  * @qman_base: base address of QMAN registers block
+ * @event_mask: mask of the last events occurred
  * @pr_sw_conf: if true print the SW config stream data (CQ PTR and SIZE)
  */
-static void gaudi_print_last_pqes_on_err(struct hl_device *hdev, u32 qid_base,
+static void gaudi_handle_last_pqes_on_err(struct hl_device *hdev, u32 qid_base,
 						u32 stream, u64 qman_base,
+						u64 event_mask,
 						bool pr_sw_conf)
 {
 	u32 ci, qm_ci_stream_off, queue_len;
 	struct hl_hw_queue *q;
-	u64 pq_ci;
+	u64 pq_ci, addr[PQ_FETCHER_CACHE_SIZE];
 	int i;
 
 	q = &hdev->kernel_queues[qid_base + stream];
@@ -7046,16 +7087,16 @@ static void gaudi_print_last_pqes_on_err(struct hl_device *hdev, u32 qid_base,
 	hdev->asic_funcs->hw_queues_lock(hdev);
 
 	if (pr_sw_conf)
-		gaudi_print_sw_config_stream_data(hdev, stream, qman_base);
+		gaudi_handle_sw_config_stream_data(hdev, stream, qman_base, event_mask);
 
 	ci = RREG32(pq_ci);
 
 	/* we should start printing form ci -1 */
 	ci = gaudi_queue_idx_dec(ci, queue_len);
+	memset(addr, 0, sizeof(addr));
 
 	for (i = 0; i < PQ_FETCHER_CACHE_SIZE; i++) {
 		struct hl_bd *bd;
-		u64 addr;
 		u32 len;
 
 		bd = q->kernel_address;
@@ -7066,52 +7107,68 @@ static void gaudi_print_last_pqes_on_err(struct hl_device *hdev, u32 qid_base,
 		if (!len)
 			break;
 
-		addr = le64_to_cpu(bd->ptr);
+		addr[i] = le64_to_cpu(bd->ptr);
 
 		dev_info(hdev->dev, "stop on err PQE(stream %u): ci: %u, addr: %#llx, size: %u\n",
-							stream, ci, addr, len);
+							stream, ci, addr[i], len);
 
 		/* get previous ci, wrap if needed */
 		ci = gaudi_queue_idx_dec(ci, queue_len);
 	}
 
+	if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) {
+		struct undefined_opcode_info *undef_opcode = &hdev->last_error.undef_opcode;
+		u32 arr_idx = undef_opcode->cb_addr_streams_len;
+
+		if (arr_idx == 0) {
+			undef_opcode->timestamp = ktime_get();
+			undef_opcode->engine_id = gaudi_queue_id_to_engine_id[qid_base];
+		}
+
+		memcpy(undef_opcode->cb_addr_streams[arr_idx], addr, sizeof(addr));
+		undef_opcode->cb_addr_streams_len++;
+	}
+
 	hdev->asic_funcs->hw_queues_unlock(hdev);
 }
 
 /**
- * print_qman_data_on_err - extract QMAN data on error
+ * handle_qman_data_on_err - extract QMAN data on error
  *
  * @hdev: pointer to the habanalabs device structure
  * @qid_base: first QID of the QMAN (out of 4 streams)
  * @stream: the QMAN's stream
  * @qman_base: base address of QMAN registers block
+ * @event_mask: mask of the last events occurred
  *
  * This function attempt to exatract as much data as possible on QMAN error.
  * On upper CP print the SW config stream data and last 8 PQEs.
  * On lower CP print SW config data and last PQEs of ALL 4 upper CPs
  */
-static void print_qman_data_on_err(struct hl_device *hdev, u32 qid_base,
-						u32 stream, u64 qman_base)
+static void handle_qman_data_on_err(struct hl_device *hdev, u32 qid_base,
+				   u32 stream, u64 qman_base, u64 event_mask)
 {
 	u32 i;
 
 	if (stream != QMAN_STREAMS) {
-		gaudi_print_last_pqes_on_err(hdev, qid_base, stream, qman_base,
-									true);
+		gaudi_handle_last_pqes_on_err(hdev, qid_base, stream,
+			qman_base, event_mask, true);
 		return;
 	}
 
-	gaudi_print_sw_config_stream_data(hdev, stream, qman_base);
+	/* handle Lower-CP */
+	gaudi_handle_sw_config_stream_data(hdev, stream, qman_base, event_mask);
 
 	for (i = 0; i < QMAN_STREAMS; i++)
-		gaudi_print_last_pqes_on_err(hdev, qid_base, i, qman_base,
-									false);
+		gaudi_handle_last_pqes_on_err(hdev, qid_base, i,
+			qman_base, event_mask, false);
 }
 
 static void gaudi_handle_qman_err_generic(struct hl_device *hdev,
 					  const char *qm_name,
 					  u64 qman_base,
-					  u32 qid_base)
+					  u32 qid_base,
+					  u64 *event_mask)
 {
 	u32 i, j, glbl_sts_val, arb_err_val, glbl_sts_clr_val;
 	u64 glbl_sts_addr, arb_err_addr;
@@ -7142,12 +7199,21 @@ static void gaudi_handle_qman_err_generic(struct hl_device *hdev,
 				glbl_sts_clr_val |= BIT(j);
 			}
 		}
+		/* check for undefined opcode */
+		if (glbl_sts_val & TPC0_QM_GLBL_STS1_CP_UNDEF_CMD_ERR_MASK &&
+				hdev->last_error.undef_opcode.write_enable) {
+			memset(&hdev->last_error.undef_opcode, 0,
+						sizeof(hdev->last_error.undef_opcode));
+
+			hdev->last_error.undef_opcode.write_enable = false;
+			*event_mask |= HL_NOTIFIER_EVENT_UNDEFINED_OPCODE;
+		}
 
 		/* Write 1 clear errors */
 		if (!hdev->stop_on_err)
 			WREG32(glbl_sts_addr + 4 * i, glbl_sts_clr_val);
 		else
-			print_qman_data_on_err(hdev, qid_base, i, qman_base);
+			handle_qman_data_on_err(hdev, qid_base, i, qman_base, *event_mask);
 	}
 
 	arb_err_val = RREG32(arb_err_addr);
@@ -7385,7 +7451,7 @@ static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *e
 		return;
 	}
 
-	gaudi_handle_qman_err_generic(hdev, desc, qman_base, qid_base);
+	gaudi_handle_qman_err_generic(hdev, desc, qman_base, qid_base, event_mask);
 }
 
 static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 6d2ccc09dcf2..c94b89cf1ec1 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -1402,9 +1402,13 @@ struct hl_debug_args {
 /*
  * Notifier event values - for the notification mechanism and the HL_INFO_GET_EVENTS command
  *
- * HL_NOTIFIER_EVENT_TPC_ASSERT - Indicates TPC assert event
+ * HL_NOTIFIER_EVENT_TPC_ASSERT       - Indicates TPC assert event
+ * HL_NOTIFIER_EVENT_UNDEFINED_OPCODE - Indicates undefined operation code
+ * HL_NOTIFIER_EVENT_DEVICE_RESET     - Indicates device requires a reset
  */
-#define HL_NOTIFIER_EVENT_TPC_ASSERT  (1 << 0)
+#define HL_NOTIFIER_EVENT_TPC_ASSERT		(1ULL << 0)
+#define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE	(1ULL << 1)
+#define HL_NOTIFIER_EVENT_DEVICE_RESET		(1ULL << 2)
 
 /*
  * Various information operations such as:
-- 
cgit 


From 647469148360dc873405acc6fcf63772e9e401f4 Mon Sep 17 00:00:00 2001
From: Tal Cohen <talcohen@habana.ai>
Date: Thu, 12 May 2022 11:59:41 +0300
Subject: habanalabs: expose undefined opcode status via info ioctl

The info ioctl retrieves information on the last undefined opcode
occurred.

Signed-off-by: Tal Cohen <talcohen@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/habanalabs_ioctl.c | 25 +++++++++++++++++++
 include/uapi/misc/habanalabs.h                    | 30 +++++++++++++++++++++++
 2 files changed, 55 insertions(+)

(limited to 'include')

diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index c7864d6bb0a1..fe7ed46cd1c5 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -610,6 +610,28 @@ static int razwi_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 	return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
 }
 
+static int undefined_opcode_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	u32 max_size = args->return_size;
+	struct hl_info_undefined_opcode_event info = {0};
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	info.timestamp = ktime_to_ns(hdev->last_error.undef_opcode.timestamp);
+	info.engine_id = hdev->last_error.undef_opcode.engine_id;
+	info.cq_addr = hdev->last_error.undef_opcode.cq_addr;
+	info.cq_size = hdev->last_error.undef_opcode.cq_size;
+	info.stream_id = hdev->last_error.undef_opcode.stream_id;
+	info.cb_addr_streams_len = hdev->last_error.undef_opcode.cb_addr_streams_len;
+	memcpy(info.cb_addr_streams, hdev->last_error.undef_opcode.cb_addr_streams,
+			sizeof(info.cb_addr_streams));
+
+	return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
+}
+
 static int dev_mem_alloc_page_sizes_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 {
 	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
@@ -718,6 +740,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_RAZWI_EVENT:
 		return razwi_info(hpriv, args);
 
+	case HL_INFO_UNDEFINED_OPCODE_EVENT:
+		return undefined_opcode_info(hpriv, args);
+
 	case HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES:
 		return dev_mem_alloc_page_sizes_info(hpriv, args);
 
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index c94b89cf1ec1..5f9a6097f5f3 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -352,6 +352,7 @@ enum hl_server_type {
  * HL_INFO_REGISTER_EVENTFD   - Register eventfd for event notifications.
  * HL_INFO_UNREGISTER_EVENTFD - Unregister eventfd
  * HL_INFO_GET_EVENTS         - Retrieve the last occurred events
+ * HL_INFO_UNDEFINED_OPCODE_EVENT - Retrieve last undefined opcode error information.
  */
 #define HL_INFO_HW_IP_INFO			0
 #define HL_INFO_HW_EVENTS			1
@@ -380,6 +381,7 @@ enum hl_server_type {
 #define HL_INFO_REGISTER_EVENTFD		28
 #define HL_INFO_UNREGISTER_EVENTFD		29
 #define HL_INFO_GET_EVENTS			30
+#define HL_INFO_UNDEFINED_OPCODE_EVENT		31
 
 #define HL_INFO_VERSION_MAX_LEN			128
 #define HL_INFO_CARD_NAME_MAX_LEN		16
@@ -656,6 +658,34 @@ struct hl_info_razwi_event {
 	__u8 pad[2];
 };
 
+#define MAX_QMAN_STREAMS_INFO		4
+#define OPCODE_INFO_MAX_ADDR_SIZE	8
+/**
+ * struct hl_info_undefined_opcode_event - info about last undefined opcode error
+ * @timestamp: timestamp of the undefined opcode error
+ * @cb_addr_streams: CB addresses (per stream) that are currently exists in the PQ
+ *                   entiers. In case all streams array entries are
+ *                   filled with values, it means the execution was in Lower-CP.
+ * @cq_addr: the address of the current handled command buffer
+ * @cq_size: the size of the current handled command buffer
+ * @cb_addr_streams_len: num of streams - actual len of cb_addr_streams array.
+ *                       should be equal to 1 incase of undefined opcode
+ *                       in Upper-CP (specific stream) and equal to 4 incase
+ *                       of undefined opcode in Lower-CP.
+ * @engine_id: engine-id that the error occurred on
+ * @stream_id: the stream id the error occurred on. In case the stream equals to
+ *             MAX_QMAN_STREAMS_INFO it means the error occurred on a Lower-CP.
+ */
+struct hl_info_undefined_opcode_event {
+	__s64 timestamp;
+	__u64 cb_addr_streams[MAX_QMAN_STREAMS_INFO][OPCODE_INFO_MAX_ADDR_SIZE];
+	__u64 cq_addr;
+	__u32 cq_size;
+	__u32 cb_addr_streams_len;
+	__u32 engine_id;
+	__u32 stream_id;
+};
+
 /**
  * struct hl_info_dev_memalloc_page_sizes - valid page sizes in device mem alloc information.
  * @page_order_bitmask: bitmap in which a set bit represents the order of the supported page size
-- 
cgit 


From fa9deaca2f9123b1749ea81bbeb12778645af580 Mon Sep 17 00:00:00 2001
From: Tal Cohen <talcohen@habana.ai>
Date: Thu, 19 May 2022 18:00:55 +0300
Subject: habanalabs: send an event notification when CS timeout occurs

The Driver needs to inform the User process whenever one of its
CS is timed out. The Driver shall recognize the CS timeout and shall
send an eventfd notification, towards user space, whenever a timeout
is expired on a CS.

Signed-off-by: Tal Cohen <talcohen@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../misc/habanalabs/common/command_submission.c    | 26 ++++++++++++++--------
 include/uapi/misc/habanalabs.h                     |  2 ++
 2 files changed, 19 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 47b49cbf67ab..cbb7c29966ff 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -797,10 +797,11 @@ out:
 static void cs_timedout(struct work_struct *work)
 {
 	struct hl_device *hdev;
+	u64 event_mask;
 	int rc;
 	struct hl_cs *cs = container_of(work, struct hl_cs,
 						 work_tdr.work);
-	bool skip_reset_on_timeout = cs->skip_reset_on_timeout;
+	bool skip_reset_on_timeout = cs->skip_reset_on_timeout, device_reset = false;
 
 	rc = cs_get_unless_zero(cs);
 	if (!rc)
@@ -811,9 +812,15 @@ static void cs_timedout(struct work_struct *work)
 		return;
 	}
 
-	/* Mark the CS is timed out so we won't try to cancel its TDR */
-	if (likely(!skip_reset_on_timeout))
+	if (likely(!skip_reset_on_timeout)) {
+		if (hdev->reset_on_lockup)
+			device_reset = true;
+		else
+			hdev->reset_info.needs_reset = true;
+
+		/* Mark the CS is timed out so we won't try to cancel its TDR */
 		cs->timedout = true;
+	}
 
 	hdev = cs->ctx->hdev;
 
@@ -822,6 +829,11 @@ static void cs_timedout(struct work_struct *work)
 	if (rc) {
 		hdev->last_error.cs_timeout.timestamp = ktime_get();
 		hdev->last_error.cs_timeout.seq = cs->sequence;
+
+		event_mask = device_reset ? (HL_NOTIFIER_EVENT_CS_TIMEOUT |
+				HL_NOTIFIER_EVENT_DEVICE_RESET) : HL_NOTIFIER_EVENT_CS_TIMEOUT;
+
+		hl_notifier_event_send_all(hdev, event_mask);
 	}
 
 	switch (cs->type) {
@@ -856,12 +868,8 @@ static void cs_timedout(struct work_struct *work)
 
 	cs_put(cs);
 
-	if (likely(!skip_reset_on_timeout)) {
-		if (hdev->reset_on_lockup)
-			hl_device_reset(hdev, HL_DRV_RESET_TDR);
-		else
-			hdev->reset_info.needs_reset = true;
-	}
+	if (device_reset)
+		hl_device_reset(hdev, HL_DRV_RESET_TDR);
 }
 
 static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 5f9a6097f5f3..18f86d259421 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -1435,10 +1435,12 @@ struct hl_debug_args {
  * HL_NOTIFIER_EVENT_TPC_ASSERT       - Indicates TPC assert event
  * HL_NOTIFIER_EVENT_UNDEFINED_OPCODE - Indicates undefined operation code
  * HL_NOTIFIER_EVENT_DEVICE_RESET     - Indicates device requires a reset
+ * HL_NOTIFIER_EVENT_CS_TIMEOUT       - Indicates CS timeout error
  */
 #define HL_NOTIFIER_EVENT_TPC_ASSERT		(1ULL << 0)
 #define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE	(1ULL << 1)
 #define HL_NOTIFIER_EVENT_DEVICE_RESET		(1ULL << 2)
+#define HL_NOTIFIER_EVENT_CS_TIMEOUT            (1ULL << 3)
 
 /*
  * Various information operations such as:
-- 
cgit 


From 67a54d5de2c348562b745c5029daa9e5207514c9 Mon Sep 17 00:00:00 2001
From: Tal Cohen <talcohen@habana.ai>
Date: Thu, 9 Jun 2022 18:08:31 +0300
Subject: habanalabs/gaudi: notify user process on device unavailable

When a device error occurs, user process would like to get some
indication on the error by reading some device HW info. If the
device is unavailable, user process can't perform any HW device
reading.

Signed-off-by: Tal Cohen <talcohen@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c |  5 ++++-
 include/uapi/misc/habanalabs.h        | 12 +++++++-----
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 1156ec7dacc1..939d2636b9ed 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -8063,7 +8063,10 @@ reset_device:
 
 	if (hdev->asic_prop.fw_security_enabled && !reset_direct) {
 		flags = HL_DRV_RESET_HARD | HL_DRV_RESET_BYPASS_REQ_TO_FW | fw_fatal_err_flag;
-		event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
+
+		/* notify on device unavailable while the reset triggered by fw */
+		event_mask |= (HL_NOTIFIER_EVENT_DEVICE_RESET |
+					HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE);
 	} else if (hdev->hard_reset_on_fw_events) {
 		flags = HL_DRV_RESET_HARD | HL_DRV_RESET_DELAY | fw_fatal_err_flag;
 		event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 18f86d259421..78aecea4684d 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -1432,15 +1432,17 @@ struct hl_debug_args {
 /*
  * Notifier event values - for the notification mechanism and the HL_INFO_GET_EVENTS command
  *
- * HL_NOTIFIER_EVENT_TPC_ASSERT       - Indicates TPC assert event
- * HL_NOTIFIER_EVENT_UNDEFINED_OPCODE - Indicates undefined operation code
- * HL_NOTIFIER_EVENT_DEVICE_RESET     - Indicates device requires a reset
- * HL_NOTIFIER_EVENT_CS_TIMEOUT       - Indicates CS timeout error
+ * HL_NOTIFIER_EVENT_TPC_ASSERT		- Indicates TPC assert event
+ * HL_NOTIFIER_EVENT_UNDEFINED_OPCODE	- Indicates undefined operation code
+ * HL_NOTIFIER_EVENT_DEVICE_RESET	- Indicates device requires a reset
+ * HL_NOTIFIER_EVENT_CS_TIMEOUT		- Indicates CS timeout error
+ * HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE	- Indicates device is unavailable
  */
 #define HL_NOTIFIER_EVENT_TPC_ASSERT		(1ULL << 0)
 #define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE	(1ULL << 1)
 #define HL_NOTIFIER_EVENT_DEVICE_RESET		(1ULL << 2)
-#define HL_NOTIFIER_EVENT_CS_TIMEOUT            (1ULL << 3)
+#define HL_NOTIFIER_EVENT_CS_TIMEOUT		(1ULL << 3)
+#define HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE	(1ULL << 4)
 
 /*
  * Various information operations such as:
-- 
cgit 


From 5125aa3368892dd9dd8bca3d64e88b11bc3490a4 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Fri, 24 Jun 2022 13:36:10 +0300
Subject: habanalabs/goya: move dma direction enum to uapi file

The values in this enum are not used by h/w but are a contract
between userspace and the kernel driver so they must be defined
in the uapi file.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/goya/goya.c                | 28 +++++++++++-----------
 .../misc/habanalabs/include/goya/goya_packets.h    | 12 ----------
 include/uapi/misc/habanalabs.h                     | 27 +++++++++++++++++++++
 3 files changed, 41 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 25b1e3e139e8..411a4be09aa6 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -3403,7 +3403,7 @@ static int goya_validate_dma_pkt_host(struct hl_device *hdev,
 {
 	u64 device_memory_addr, addr;
 	enum dma_data_direction dir;
-	enum goya_dma_direction user_dir;
+	enum hl_goya_dma_direction user_dir;
 	bool sram_addr = true;
 	bool skip_host_mem_pin = false;
 	bool user_memset;
@@ -3419,7 +3419,7 @@ static int goya_validate_dma_pkt_host(struct hl_device *hdev,
 			GOYA_PKT_LIN_DMA_CTL_MEMSET_SHIFT;
 
 	switch (user_dir) {
-	case DMA_HOST_TO_DRAM:
+	case HL_DMA_HOST_TO_DRAM:
 		dev_dbg(hdev->dev, "DMA direction is HOST --> DRAM\n");
 		dir = DMA_TO_DEVICE;
 		sram_addr = false;
@@ -3429,7 +3429,7 @@ static int goya_validate_dma_pkt_host(struct hl_device *hdev,
 			skip_host_mem_pin = true;
 		break;
 
-	case DMA_DRAM_TO_HOST:
+	case HL_DMA_DRAM_TO_HOST:
 		dev_dbg(hdev->dev, "DMA direction is DRAM --> HOST\n");
 		dir = DMA_FROM_DEVICE;
 		sram_addr = false;
@@ -3437,7 +3437,7 @@ static int goya_validate_dma_pkt_host(struct hl_device *hdev,
 		device_memory_addr = le64_to_cpu(user_dma_pkt->src_addr);
 		break;
 
-	case DMA_HOST_TO_SRAM:
+	case HL_DMA_HOST_TO_SRAM:
 		dev_dbg(hdev->dev, "DMA direction is HOST --> SRAM\n");
 		dir = DMA_TO_DEVICE;
 		addr = le64_to_cpu(user_dma_pkt->src_addr);
@@ -3446,14 +3446,14 @@ static int goya_validate_dma_pkt_host(struct hl_device *hdev,
 			skip_host_mem_pin = true;
 		break;
 
-	case DMA_SRAM_TO_HOST:
+	case HL_DMA_SRAM_TO_HOST:
 		dev_dbg(hdev->dev, "DMA direction is SRAM --> HOST\n");
 		dir = DMA_FROM_DEVICE;
 		addr = le64_to_cpu(user_dma_pkt->dst_addr);
 		device_memory_addr = le64_to_cpu(user_dma_pkt->src_addr);
 		break;
 	default:
-		dev_err(hdev->dev, "DMA direction is undefined\n");
+		dev_err(hdev->dev, "DMA direction %d is unsupported/undefined\n", user_dir);
 		return -EFAULT;
 	}
 
@@ -3505,14 +3505,14 @@ static int goya_validate_dma_pkt_no_host(struct hl_device *hdev,
 				struct packet_lin_dma *user_dma_pkt)
 {
 	u64 sram_memory_addr, dram_memory_addr;
-	enum goya_dma_direction user_dir;
+	enum hl_goya_dma_direction user_dir;
 	u32 ctl;
 
 	ctl = le32_to_cpu(user_dma_pkt->ctl);
 	user_dir = (ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >>
 			GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT;
 
-	if (user_dir == DMA_DRAM_TO_SRAM) {
+	if (user_dir == HL_DMA_DRAM_TO_SRAM) {
 		dev_dbg(hdev->dev, "DMA direction is DRAM --> SRAM\n");
 		dram_memory_addr = le64_to_cpu(user_dma_pkt->src_addr);
 		sram_memory_addr = le64_to_cpu(user_dma_pkt->dst_addr);
@@ -3549,7 +3549,7 @@ static int goya_validate_dma_pkt_no_mmu(struct hl_device *hdev,
 				struct hl_cs_parser *parser,
 				struct packet_lin_dma *user_dma_pkt)
 {
-	enum goya_dma_direction user_dir;
+	enum hl_goya_dma_direction user_dir;
 	u32 ctl;
 	int rc;
 
@@ -3574,7 +3574,7 @@ static int goya_validate_dma_pkt_no_mmu(struct hl_device *hdev,
 		return -EINVAL;
 	}
 
-	if ((user_dir == DMA_DRAM_TO_SRAM) || (user_dir == DMA_SRAM_TO_DRAM))
+	if ((user_dir == HL_DMA_DRAM_TO_SRAM) || (user_dir == HL_DMA_SRAM_TO_DRAM))
 		rc = goya_validate_dma_pkt_no_host(hdev, parser, user_dma_pkt);
 	else
 		rc = goya_validate_dma_pkt_host(hdev, parser, user_dma_pkt);
@@ -3781,7 +3781,7 @@ static int goya_patch_dma_packet(struct hl_device *hdev,
 	u32 count, dma_desc_cnt;
 	u64 len, len_next;
 	dma_addr_t dma_addr, dma_addr_next;
-	enum goya_dma_direction user_dir;
+	enum hl_goya_dma_direction user_dir;
 	u64 device_memory_addr, addr;
 	enum dma_data_direction dir;
 	struct sg_table *sgt;
@@ -3797,14 +3797,14 @@ static int goya_patch_dma_packet(struct hl_device *hdev,
 	user_memset = (ctl & GOYA_PKT_LIN_DMA_CTL_MEMSET_MASK) >>
 			GOYA_PKT_LIN_DMA_CTL_MEMSET_SHIFT;
 
-	if ((user_dir == DMA_DRAM_TO_SRAM) || (user_dir == DMA_SRAM_TO_DRAM) ||
+	if ((user_dir == HL_DMA_DRAM_TO_SRAM) || (user_dir == HL_DMA_SRAM_TO_DRAM) ||
 			(user_dma_pkt->tsize == 0)) {
 		memcpy(new_dma_pkt, user_dma_pkt, sizeof(*new_dma_pkt));
 		*new_dma_pkt_size = sizeof(*new_dma_pkt);
 		return 0;
 	}
 
-	if ((user_dir == DMA_HOST_TO_DRAM) || (user_dir == DMA_HOST_TO_SRAM)) {
+	if ((user_dir == HL_DMA_HOST_TO_DRAM) || (user_dir == HL_DMA_HOST_TO_SRAM)) {
 		addr = le64_to_cpu(user_dma_pkt->src_addr);
 		device_memory_addr = le64_to_cpu(user_dma_pkt->dst_addr);
 		dir = DMA_TO_DEVICE;
@@ -4804,7 +4804,7 @@ static int goya_memset_device_memory(struct hl_device *hdev, u64 addr, u64 size,
 				(1 << GOYA_PKT_LIN_DMA_CTL_WO_SHIFT) |
 				(1 << GOYA_PKT_CTL_RB_SHIFT) |
 				(1 << GOYA_PKT_CTL_MB_SHIFT));
-		ctl |= (is_dram ? DMA_HOST_TO_DRAM : DMA_HOST_TO_SRAM) <<
+		ctl |= (is_dram ? HL_DMA_HOST_TO_DRAM : HL_DMA_HOST_TO_SRAM) <<
 				GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT;
 		lin_dma_pkt->ctl = cpu_to_le32(ctl);
 
diff --git a/drivers/misc/habanalabs/include/goya/goya_packets.h b/drivers/misc/habanalabs/include/goya/goya_packets.h
index 50ce5175b63a..896799204fb0 100644
--- a/drivers/misc/habanalabs/include/goya/goya_packets.h
+++ b/drivers/misc/habanalabs/include/goya/goya_packets.h
@@ -28,18 +28,6 @@ enum packet_id {
 				PACKET_HEADER_PACKET_ID_SHIFT) + 1
 };
 
-enum goya_dma_direction {
-	DMA_HOST_TO_DRAM,
-	DMA_HOST_TO_SRAM,
-	DMA_DRAM_TO_SRAM,
-	DMA_SRAM_TO_DRAM,
-	DMA_SRAM_TO_HOST,
-	DMA_DRAM_TO_HOST,
-	DMA_DRAM_TO_DRAM,
-	DMA_SRAM_TO_SRAM,
-	DMA_ENUM_MAX
-};
-
 #define GOYA_PKT_CTL_OPCODE_SHIFT	24
 #define GOYA_PKT_CTL_OPCODE_MASK	0x1F000000
 
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 78aecea4684d..41a0fa345e4e 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -275,6 +275,33 @@ enum hl_gaudi_pll_index {
 	HL_GAUDI_PLL_MAX
 };
 
+/**
+ * enum hl_goya_dma_direction - Direction of DMA operation inside a LIN_DMA packet that is
+ *                              submitted to the GOYA's DMA QMAN. This attribute is not relevant
+ *                              to the H/W but the kernel driver use it to parse the packet's
+ *                              addresses and patch/validate them.
+ * @HL_DMA_HOST_TO_DRAM: DMA operation from Host memory to GOYA's DDR.
+ * @HL_DMA_HOST_TO_SRAM: DMA operation from Host memory to GOYA's SRAM.
+ * @HL_DMA_DRAM_TO_SRAM: DMA operation from GOYA's DDR to GOYA's SRAM.
+ * @HL_DMA_SRAM_TO_DRAM: DMA operation from GOYA's SRAM to GOYA's DDR.
+ * @HL_DMA_SRAM_TO_HOST: DMA operation from GOYA's SRAM to Host memory.
+ * @HL_DMA_DRAM_TO_HOST: DMA operation from GOYA's DDR to Host memory.
+ * @HL_DMA_DRAM_TO_DRAM: DMA operation from GOYA's DDR to GOYA's DDR.
+ * @HL_DMA_SRAM_TO_SRAM: DMA operation from GOYA's SRAM to GOYA's SRAM.
+ * @HL_DMA_ENUM_MAX: number of values in enum
+ */
+enum hl_goya_dma_direction {
+	HL_DMA_HOST_TO_DRAM,
+	HL_DMA_HOST_TO_SRAM,
+	HL_DMA_DRAM_TO_SRAM,
+	HL_DMA_SRAM_TO_DRAM,
+	HL_DMA_SRAM_TO_HOST,
+	HL_DMA_DRAM_TO_HOST,
+	HL_DMA_DRAM_TO_DRAM,
+	HL_DMA_SRAM_TO_SRAM,
+	HL_DMA_ENUM_MAX
+};
+
 /**
  * enum hl_device_status - Device status information.
  * @HL_DEVICE_STATUS_OPERATIONAL: Device is operational.
-- 
cgit 


From 97c6d22fa4bd54cccff39e862893327eef1bbfa8 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Fri, 24 Jun 2022 13:38:57 +0300
Subject: uapi: habanalabs: add gaudi2 defines

Add the new defines for GAUDI2 uapi interface.

It includes the following:
1. Enums of engines and PLLs.
2. New information in the info IOCTL that is retrieved by the driver.
3. Update comments regarding the CB/CS/wait for CS ioctls.
4. New fields in the debug IOCTL for configuring the profiler for
   Gaudi2.

There is no new IOCTL.

Some of the changes are also relevant for Greco (which will be
upstreamed later this year). When ever it says "Greco and onwards",
it means it is also for Gaudi2.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 include/uapi/misc/habanalabs.h | 455 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 435 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 41a0fa345e4e..77b89c537ee8 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -184,6 +184,285 @@ enum gaudi_queue_id {
 	GAUDI_QUEUE_ID_SIZE
 };
 
+/*
+ * In GAUDI2 we have two modes of operation in regard to queues:
+ * 1. Legacy mode, where each QMAN exposes 4 streams to the user
+ * 2. F/W mode, where we use F/W to schedule the JOBS to the different queues.
+ *
+ * When in legacy mode, the user sends the queue id per JOB according to
+ * enum gaudi2_queue_id below.
+ *
+ * When in F/W mode, the user sends a stream id per Command Submission. The
+ * stream id is a running number from 0 up to (N-1), where N is the number
+ * of streams the F/W exposes and is passed to the user in
+ * struct hl_info_hw_ip_info
+ */
+
+enum gaudi2_queue_id {
+	GAUDI2_QUEUE_ID_PDMA_0_0 = 0,
+	GAUDI2_QUEUE_ID_PDMA_0_1 = 1,
+	GAUDI2_QUEUE_ID_PDMA_0_2 = 2,
+	GAUDI2_QUEUE_ID_PDMA_0_3 = 3,
+	GAUDI2_QUEUE_ID_PDMA_1_0 = 4,
+	GAUDI2_QUEUE_ID_PDMA_1_1 = 5,
+	GAUDI2_QUEUE_ID_PDMA_1_2 = 6,
+	GAUDI2_QUEUE_ID_PDMA_1_3 = 7,
+	GAUDI2_QUEUE_ID_DCORE0_EDMA_0_0 = 8,
+	GAUDI2_QUEUE_ID_DCORE0_EDMA_0_1 = 9,
+	GAUDI2_QUEUE_ID_DCORE0_EDMA_0_2 = 10,
+	GAUDI2_QUEUE_ID_DCORE0_EDMA_0_3 = 11,
+	GAUDI2_QUEUE_ID_DCORE0_EDMA_1_0 = 12,
+	GAUDI2_QUEUE_ID_DCORE0_EDMA_1_1 = 13,
+	GAUDI2_QUEUE_ID_DCORE0_EDMA_1_2 = 14,
+	GAUDI2_QUEUE_ID_DCORE0_EDMA_1_3 = 15,
+	GAUDI2_QUEUE_ID_DCORE0_MME_0_0 = 16,
+	GAUDI2_QUEUE_ID_DCORE0_MME_0_1 = 17,
+	GAUDI2_QUEUE_ID_DCORE0_MME_0_2 = 18,
+	GAUDI2_QUEUE_ID_DCORE0_MME_0_3 = 19,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_0_0 = 20,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_0_1 = 21,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_0_2 = 22,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_0_3 = 23,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_1_0 = 24,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_1_1 = 25,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_1_2 = 26,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_1_3 = 27,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_2_0 = 28,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_2_1 = 29,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_2_2 = 30,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_2_3 = 31,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_3_0 = 32,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_3_1 = 33,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_3_2 = 34,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_3_3 = 35,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_4_0 = 36,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_4_1 = 37,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_4_2 = 38,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_4_3 = 39,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_5_0 = 40,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_5_1 = 41,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_5_2 = 42,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_5_3 = 43,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_6_0 = 44,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_6_1 = 45,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_6_2 = 46,
+	GAUDI2_QUEUE_ID_DCORE0_TPC_6_3 = 47,
+	GAUDI2_QUEUE_ID_DCORE1_EDMA_0_0 = 48,
+	GAUDI2_QUEUE_ID_DCORE1_EDMA_0_1 = 49,
+	GAUDI2_QUEUE_ID_DCORE1_EDMA_0_2 = 50,
+	GAUDI2_QUEUE_ID_DCORE1_EDMA_0_3 = 51,
+	GAUDI2_QUEUE_ID_DCORE1_EDMA_1_0 = 52,
+	GAUDI2_QUEUE_ID_DCORE1_EDMA_1_1 = 53,
+	GAUDI2_QUEUE_ID_DCORE1_EDMA_1_2 = 54,
+	GAUDI2_QUEUE_ID_DCORE1_EDMA_1_3 = 55,
+	GAUDI2_QUEUE_ID_DCORE1_MME_0_0 = 56,
+	GAUDI2_QUEUE_ID_DCORE1_MME_0_1 = 57,
+	GAUDI2_QUEUE_ID_DCORE1_MME_0_2 = 58,
+	GAUDI2_QUEUE_ID_DCORE1_MME_0_3 = 59,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_0_0 = 60,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_0_1 = 61,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_0_2 = 62,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_0_3 = 63,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_1_0 = 64,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_1_1 = 65,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_1_2 = 66,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_1_3 = 67,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_2_0 = 68,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_2_1 = 69,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_2_2 = 70,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_2_3 = 71,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_3_0 = 72,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_3_1 = 73,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_3_2 = 74,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_3_3 = 75,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_4_0 = 76,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_4_1 = 77,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_4_2 = 78,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_4_3 = 79,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_5_0 = 80,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_5_1 = 81,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_5_2 = 82,
+	GAUDI2_QUEUE_ID_DCORE1_TPC_5_3 = 83,
+	GAUDI2_QUEUE_ID_DCORE2_EDMA_0_0 = 84,
+	GAUDI2_QUEUE_ID_DCORE2_EDMA_0_1 = 85,
+	GAUDI2_QUEUE_ID_DCORE2_EDMA_0_2 = 86,
+	GAUDI2_QUEUE_ID_DCORE2_EDMA_0_3 = 87,
+	GAUDI2_QUEUE_ID_DCORE2_EDMA_1_0 = 88,
+	GAUDI2_QUEUE_ID_DCORE2_EDMA_1_1 = 89,
+	GAUDI2_QUEUE_ID_DCORE2_EDMA_1_2 = 90,
+	GAUDI2_QUEUE_ID_DCORE2_EDMA_1_3 = 91,
+	GAUDI2_QUEUE_ID_DCORE2_MME_0_0 = 92,
+	GAUDI2_QUEUE_ID_DCORE2_MME_0_1 = 93,
+	GAUDI2_QUEUE_ID_DCORE2_MME_0_2 = 94,
+	GAUDI2_QUEUE_ID_DCORE2_MME_0_3 = 95,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_0_0 = 96,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_0_1 = 97,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_0_2 = 98,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_0_3 = 99,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_1_0 = 100,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_1_1 = 101,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_1_2 = 102,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_1_3 = 103,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_2_0 = 104,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_2_1 = 105,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_2_2 = 106,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_2_3 = 107,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_3_0 = 108,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_3_1 = 109,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_3_2 = 110,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_3_3 = 111,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_4_0 = 112,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_4_1 = 113,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_4_2 = 114,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_4_3 = 115,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_5_0 = 116,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_5_1 = 117,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_5_2 = 118,
+	GAUDI2_QUEUE_ID_DCORE2_TPC_5_3 = 119,
+	GAUDI2_QUEUE_ID_DCORE3_EDMA_0_0 = 120,
+	GAUDI2_QUEUE_ID_DCORE3_EDMA_0_1 = 121,
+	GAUDI2_QUEUE_ID_DCORE3_EDMA_0_2 = 122,
+	GAUDI2_QUEUE_ID_DCORE3_EDMA_0_3 = 123,
+	GAUDI2_QUEUE_ID_DCORE3_EDMA_1_0 = 124,
+	GAUDI2_QUEUE_ID_DCORE3_EDMA_1_1 = 125,
+	GAUDI2_QUEUE_ID_DCORE3_EDMA_1_2 = 126,
+	GAUDI2_QUEUE_ID_DCORE3_EDMA_1_3 = 127,
+	GAUDI2_QUEUE_ID_DCORE3_MME_0_0 = 128,
+	GAUDI2_QUEUE_ID_DCORE3_MME_0_1 = 129,
+	GAUDI2_QUEUE_ID_DCORE3_MME_0_2 = 130,
+	GAUDI2_QUEUE_ID_DCORE3_MME_0_3 = 131,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_0_0 = 132,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_0_1 = 133,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_0_2 = 134,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_0_3 = 135,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_1_0 = 136,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_1_1 = 137,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_1_2 = 138,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_1_3 = 139,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_2_0 = 140,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_2_1 = 141,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_2_2 = 142,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_2_3 = 143,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_3_0 = 144,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_3_1 = 145,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_3_2 = 146,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_3_3 = 147,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_4_0 = 148,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_4_1 = 149,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_4_2 = 150,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_4_3 = 151,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_5_0 = 152,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_5_1 = 153,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_5_2 = 154,
+	GAUDI2_QUEUE_ID_DCORE3_TPC_5_3 = 155,
+	GAUDI2_QUEUE_ID_NIC_0_0 = 156,
+	GAUDI2_QUEUE_ID_NIC_0_1 = 157,
+	GAUDI2_QUEUE_ID_NIC_0_2 = 158,
+	GAUDI2_QUEUE_ID_NIC_0_3 = 159,
+	GAUDI2_QUEUE_ID_NIC_1_0 = 160,
+	GAUDI2_QUEUE_ID_NIC_1_1 = 161,
+	GAUDI2_QUEUE_ID_NIC_1_2 = 162,
+	GAUDI2_QUEUE_ID_NIC_1_3 = 163,
+	GAUDI2_QUEUE_ID_NIC_2_0 = 164,
+	GAUDI2_QUEUE_ID_NIC_2_1 = 165,
+	GAUDI2_QUEUE_ID_NIC_2_2 = 166,
+	GAUDI2_QUEUE_ID_NIC_2_3 = 167,
+	GAUDI2_QUEUE_ID_NIC_3_0 = 168,
+	GAUDI2_QUEUE_ID_NIC_3_1 = 169,
+	GAUDI2_QUEUE_ID_NIC_3_2 = 170,
+	GAUDI2_QUEUE_ID_NIC_3_3 = 171,
+	GAUDI2_QUEUE_ID_NIC_4_0 = 172,
+	GAUDI2_QUEUE_ID_NIC_4_1 = 173,
+	GAUDI2_QUEUE_ID_NIC_4_2 = 174,
+	GAUDI2_QUEUE_ID_NIC_4_3 = 175,
+	GAUDI2_QUEUE_ID_NIC_5_0 = 176,
+	GAUDI2_QUEUE_ID_NIC_5_1 = 177,
+	GAUDI2_QUEUE_ID_NIC_5_2 = 178,
+	GAUDI2_QUEUE_ID_NIC_5_3 = 179,
+	GAUDI2_QUEUE_ID_NIC_6_0 = 180,
+	GAUDI2_QUEUE_ID_NIC_6_1 = 181,
+	GAUDI2_QUEUE_ID_NIC_6_2 = 182,
+	GAUDI2_QUEUE_ID_NIC_6_3 = 183,
+	GAUDI2_QUEUE_ID_NIC_7_0 = 184,
+	GAUDI2_QUEUE_ID_NIC_7_1 = 185,
+	GAUDI2_QUEUE_ID_NIC_7_2 = 186,
+	GAUDI2_QUEUE_ID_NIC_7_3 = 187,
+	GAUDI2_QUEUE_ID_NIC_8_0 = 188,
+	GAUDI2_QUEUE_ID_NIC_8_1 = 189,
+	GAUDI2_QUEUE_ID_NIC_8_2 = 190,
+	GAUDI2_QUEUE_ID_NIC_8_3 = 191,
+	GAUDI2_QUEUE_ID_NIC_9_0 = 192,
+	GAUDI2_QUEUE_ID_NIC_9_1 = 193,
+	GAUDI2_QUEUE_ID_NIC_9_2 = 194,
+	GAUDI2_QUEUE_ID_NIC_9_3 = 195,
+	GAUDI2_QUEUE_ID_NIC_10_0 = 196,
+	GAUDI2_QUEUE_ID_NIC_10_1 = 197,
+	GAUDI2_QUEUE_ID_NIC_10_2 = 198,
+	GAUDI2_QUEUE_ID_NIC_10_3 = 199,
+	GAUDI2_QUEUE_ID_NIC_11_0 = 200,
+	GAUDI2_QUEUE_ID_NIC_11_1 = 201,
+	GAUDI2_QUEUE_ID_NIC_11_2 = 202,
+	GAUDI2_QUEUE_ID_NIC_11_3 = 203,
+	GAUDI2_QUEUE_ID_NIC_12_0 = 204,
+	GAUDI2_QUEUE_ID_NIC_12_1 = 205,
+	GAUDI2_QUEUE_ID_NIC_12_2 = 206,
+	GAUDI2_QUEUE_ID_NIC_12_3 = 207,
+	GAUDI2_QUEUE_ID_NIC_13_0 = 208,
+	GAUDI2_QUEUE_ID_NIC_13_1 = 209,
+	GAUDI2_QUEUE_ID_NIC_13_2 = 210,
+	GAUDI2_QUEUE_ID_NIC_13_3 = 211,
+	GAUDI2_QUEUE_ID_NIC_14_0 = 212,
+	GAUDI2_QUEUE_ID_NIC_14_1 = 213,
+	GAUDI2_QUEUE_ID_NIC_14_2 = 214,
+	GAUDI2_QUEUE_ID_NIC_14_3 = 215,
+	GAUDI2_QUEUE_ID_NIC_15_0 = 216,
+	GAUDI2_QUEUE_ID_NIC_15_1 = 217,
+	GAUDI2_QUEUE_ID_NIC_15_2 = 218,
+	GAUDI2_QUEUE_ID_NIC_15_3 = 219,
+	GAUDI2_QUEUE_ID_NIC_16_0 = 220,
+	GAUDI2_QUEUE_ID_NIC_16_1 = 221,
+	GAUDI2_QUEUE_ID_NIC_16_2 = 222,
+	GAUDI2_QUEUE_ID_NIC_16_3 = 223,
+	GAUDI2_QUEUE_ID_NIC_17_0 = 224,
+	GAUDI2_QUEUE_ID_NIC_17_1 = 225,
+	GAUDI2_QUEUE_ID_NIC_17_2 = 226,
+	GAUDI2_QUEUE_ID_NIC_17_3 = 227,
+	GAUDI2_QUEUE_ID_NIC_18_0 = 228,
+	GAUDI2_QUEUE_ID_NIC_18_1 = 229,
+	GAUDI2_QUEUE_ID_NIC_18_2 = 230,
+	GAUDI2_QUEUE_ID_NIC_18_3 = 231,
+	GAUDI2_QUEUE_ID_NIC_19_0 = 232,
+	GAUDI2_QUEUE_ID_NIC_19_1 = 233,
+	GAUDI2_QUEUE_ID_NIC_19_2 = 234,
+	GAUDI2_QUEUE_ID_NIC_19_3 = 235,
+	GAUDI2_QUEUE_ID_NIC_20_0 = 236,
+	GAUDI2_QUEUE_ID_NIC_20_1 = 237,
+	GAUDI2_QUEUE_ID_NIC_20_2 = 238,
+	GAUDI2_QUEUE_ID_NIC_20_3 = 239,
+	GAUDI2_QUEUE_ID_NIC_21_0 = 240,
+	GAUDI2_QUEUE_ID_NIC_21_1 = 241,
+	GAUDI2_QUEUE_ID_NIC_21_2 = 242,
+	GAUDI2_QUEUE_ID_NIC_21_3 = 243,
+	GAUDI2_QUEUE_ID_NIC_22_0 = 244,
+	GAUDI2_QUEUE_ID_NIC_22_1 = 245,
+	GAUDI2_QUEUE_ID_NIC_22_2 = 246,
+	GAUDI2_QUEUE_ID_NIC_22_3 = 247,
+	GAUDI2_QUEUE_ID_NIC_23_0 = 248,
+	GAUDI2_QUEUE_ID_NIC_23_1 = 249,
+	GAUDI2_QUEUE_ID_NIC_23_2 = 250,
+	GAUDI2_QUEUE_ID_NIC_23_3 = 251,
+	GAUDI2_QUEUE_ID_ROT_0_0 = 252,
+	GAUDI2_QUEUE_ID_ROT_0_1 = 253,
+	GAUDI2_QUEUE_ID_ROT_0_2 = 254,
+	GAUDI2_QUEUE_ID_ROT_0_3 = 255,
+	GAUDI2_QUEUE_ID_ROT_1_0 = 256,
+	GAUDI2_QUEUE_ID_ROT_1_1 = 257,
+	GAUDI2_QUEUE_ID_ROT_1_2 = 258,
+	GAUDI2_QUEUE_ID_ROT_1_3 = 259,
+	GAUDI2_QUEUE_ID_CPU_PQ = 260,
+	GAUDI2_QUEUE_ID_SIZE
+};
+
 /*
  * Engine Numbering
  *
@@ -242,6 +521,85 @@ enum gaudi_engine_id {
 	GAUDI_ENGINE_ID_SIZE
 };
 
+enum gaudi2_engine_id {
+	GAUDI2_DCORE0_ENGINE_ID_EDMA_0 = 0,
+	GAUDI2_DCORE0_ENGINE_ID_EDMA_1,
+	GAUDI2_DCORE0_ENGINE_ID_MME,
+	GAUDI2_DCORE0_ENGINE_ID_TPC_0,
+	GAUDI2_DCORE0_ENGINE_ID_TPC_1,
+	GAUDI2_DCORE0_ENGINE_ID_TPC_2,
+	GAUDI2_DCORE0_ENGINE_ID_TPC_3,
+	GAUDI2_DCORE0_ENGINE_ID_TPC_4,
+	GAUDI2_DCORE0_ENGINE_ID_TPC_5,
+	GAUDI2_DCORE0_ENGINE_ID_DEC_0,
+	GAUDI2_DCORE0_ENGINE_ID_DEC_1,
+	GAUDI2_DCORE1_ENGINE_ID_EDMA_0,
+	GAUDI2_DCORE1_ENGINE_ID_EDMA_1,
+	GAUDI2_DCORE1_ENGINE_ID_MME,
+	GAUDI2_DCORE1_ENGINE_ID_TPC_0,
+	GAUDI2_DCORE1_ENGINE_ID_TPC_1,
+	GAUDI2_DCORE1_ENGINE_ID_TPC_2,
+	GAUDI2_DCORE1_ENGINE_ID_TPC_3,
+	GAUDI2_DCORE1_ENGINE_ID_TPC_4,
+	GAUDI2_DCORE1_ENGINE_ID_TPC_5,
+	GAUDI2_DCORE1_ENGINE_ID_DEC_0,
+	GAUDI2_DCORE1_ENGINE_ID_DEC_1,
+	GAUDI2_DCORE2_ENGINE_ID_EDMA_0,
+	GAUDI2_DCORE2_ENGINE_ID_EDMA_1,
+	GAUDI2_DCORE2_ENGINE_ID_MME,
+	GAUDI2_DCORE2_ENGINE_ID_TPC_0,
+	GAUDI2_DCORE2_ENGINE_ID_TPC_1,
+	GAUDI2_DCORE2_ENGINE_ID_TPC_2,
+	GAUDI2_DCORE2_ENGINE_ID_TPC_3,
+	GAUDI2_DCORE2_ENGINE_ID_TPC_4,
+	GAUDI2_DCORE2_ENGINE_ID_TPC_5,
+	GAUDI2_DCORE2_ENGINE_ID_DEC_0,
+	GAUDI2_DCORE2_ENGINE_ID_DEC_1,
+	GAUDI2_DCORE3_ENGINE_ID_EDMA_0,
+	GAUDI2_DCORE3_ENGINE_ID_EDMA_1,
+	GAUDI2_DCORE3_ENGINE_ID_MME,
+	GAUDI2_DCORE3_ENGINE_ID_TPC_0,
+	GAUDI2_DCORE3_ENGINE_ID_TPC_1,
+	GAUDI2_DCORE3_ENGINE_ID_TPC_2,
+	GAUDI2_DCORE3_ENGINE_ID_TPC_3,
+	GAUDI2_DCORE3_ENGINE_ID_TPC_4,
+	GAUDI2_DCORE3_ENGINE_ID_TPC_5,
+	GAUDI2_DCORE3_ENGINE_ID_DEC_0,
+	GAUDI2_DCORE3_ENGINE_ID_DEC_1,
+	GAUDI2_DCORE0_ENGINE_ID_TPC_6,
+	GAUDI2_ENGINE_ID_PDMA_0,
+	GAUDI2_ENGINE_ID_PDMA_1,
+	GAUDI2_ENGINE_ID_ROT_0,
+	GAUDI2_ENGINE_ID_ROT_1,
+	GAUDI2_PCIE_ENGINE_ID_DEC_0,
+	GAUDI2_PCIE_ENGINE_ID_DEC_1,
+	GAUDI2_ENGINE_ID_NIC0_0,
+	GAUDI2_ENGINE_ID_NIC0_1,
+	GAUDI2_ENGINE_ID_NIC1_0,
+	GAUDI2_ENGINE_ID_NIC1_1,
+	GAUDI2_ENGINE_ID_NIC2_0,
+	GAUDI2_ENGINE_ID_NIC2_1,
+	GAUDI2_ENGINE_ID_NIC3_0,
+	GAUDI2_ENGINE_ID_NIC3_1,
+	GAUDI2_ENGINE_ID_NIC4_0,
+	GAUDI2_ENGINE_ID_NIC4_1,
+	GAUDI2_ENGINE_ID_NIC5_0,
+	GAUDI2_ENGINE_ID_NIC5_1,
+	GAUDI2_ENGINE_ID_NIC6_0,
+	GAUDI2_ENGINE_ID_NIC6_1,
+	GAUDI2_ENGINE_ID_NIC7_0,
+	GAUDI2_ENGINE_ID_NIC7_1,
+	GAUDI2_ENGINE_ID_NIC8_0,
+	GAUDI2_ENGINE_ID_NIC8_1,
+	GAUDI2_ENGINE_ID_NIC9_0,
+	GAUDI2_ENGINE_ID_NIC9_1,
+	GAUDI2_ENGINE_ID_NIC10_0,
+	GAUDI2_ENGINE_ID_NIC10_1,
+	GAUDI2_ENGINE_ID_NIC11_0,
+	GAUDI2_ENGINE_ID_NIC11_1,
+	GAUDI2_ENGINE_ID_SIZE
+};
+
 /*
  * ASIC specific PLL index
  *
@@ -275,6 +633,22 @@ enum hl_gaudi_pll_index {
 	HL_GAUDI_PLL_MAX
 };
 
+enum hl_gaudi2_pll_index {
+	HL_GAUDI2_CPU_PLL = 0,
+	HL_GAUDI2_PCI_PLL,
+	HL_GAUDI2_SRAM_PLL,
+	HL_GAUDI2_HBM_PLL,
+	HL_GAUDI2_NIC_PLL,
+	HL_GAUDI2_DMA_PLL,
+	HL_GAUDI2_MESH_PLL,
+	HL_GAUDI2_MME_PLL,
+	HL_GAUDI2_TPC_PLL,
+	HL_GAUDI2_IF_PLL,
+	HL_GAUDI2_VID_PLL,
+	HL_GAUDI2_MSS_PLL,
+	HL_GAUDI2_PLL_MAX
+};
+
 /**
  * enum hl_goya_dma_direction - Direction of DMA operation inside a LIN_DMA packet that is
  *                              submitted to the GOYA's DMA QMAN. This attribute is not relevant
@@ -326,7 +700,8 @@ enum hl_server_type {
 	HL_SERVER_GAUDI_HLS1 = 1,
 	HL_SERVER_GAUDI_HLS1H = 2,
 	HL_SERVER_GAUDI_TYPE1 = 3,
-	HL_SERVER_GAUDI_TYPE2 = 4
+	HL_SERVER_GAUDI_TYPE2 = 4,
+	HL_SERVER_GAUDI2_HLS2 = 5
 };
 
 /* Opcode for management ioctl
@@ -428,8 +803,10 @@ enum hl_server_type {
  * @device_id: PCI device ID of the ASIC.
  * @module_id: Module ID of the ASIC for mezzanine cards in servers
  *             (From OCP spec).
+ * @decoder_enabled_mask: Bit-mask that represents which decoders are enabled.
  * @first_available_interrupt_id: The first available interrupt ID for the user
  *                                to be used when it works with user interrupts.
+ *                                Relevant for Gaudi2 and later.
  * @server_type: Server type that the Gaudi ASIC is currently installed in.
  *               The value is according to enum hl_server_type
  * @cpld_version: CPLD version on the board.
@@ -441,9 +818,15 @@ enum hl_server_type {
  * @tpc_enabled_mask: Bit-mask that represents which TPCs are enabled. Relevant
  *                    for Goya/Gaudi only.
  * @dram_enabled: Whether the DRAM is enabled.
+ * @mme_master_slave_mode: Indicate whether the MME is working in master/slave
+ *                         configuration. Relevant for Greco and later.
  * @cpucp_version: The CPUCP f/w version.
  * @card_name: The card name as passed by the f/w.
+ * @tpc_enabled_mask_ext: Bit-mask that represents which TPCs are enabled.
+ *                        Relevant for Greco and later.
  * @dram_page_size: The DRAM physical page size.
+ * @edma_enabled_mask: Bit-mask that represents which EDMAs are enabled.
+ *                     Relevant for Gaudi2 and later.
  * @number_of_user_interrupts: The number of interrupts that are available to the userspace
  *                             application to use. Relevant for Gaudi2 and later.
  * @device_mem_alloc_default_page_size: default page size used in device memory allocation.
@@ -456,7 +839,7 @@ struct hl_info_hw_ip_info {
 	__u32 num_of_events;
 	__u32 device_id;
 	__u32 module_id;
-	__u32 reserved;
+	__u32 decoder_enabled_mask;
 	__u16 first_available_interrupt_id;
 	__u16 server_type;
 	__u32 cpld_version;
@@ -466,12 +849,13 @@ struct hl_info_hw_ip_info {
 	__u32 psoc_pci_pll_div_factor;
 	__u8 tpc_enabled_mask;
 	__u8 dram_enabled;
-	__u8 pad[2];
+	__u8 reserved;
+	__u8 mme_master_slave_mode;
 	__u8 cpucp_version[HL_INFO_VERSION_MAX_LEN];
 	__u8 card_name[HL_INFO_CARD_NAME_MAX_LEN];
-	__u64 reserved2;
+	__u64 tpc_enabled_mask_ext;
 	__u64 dram_page_size;
-	__u32 reserved3;
+	__u32 edma_enabled_mask;
 	__u16 number_of_user_interrupts;
 	__u16 pad2;
 	__u64 reserved4;
@@ -821,16 +1205,16 @@ union hl_cb_args {
 /* HL_CS_CHUNK_FLAGS_ values
  *
  * HL_CS_CHUNK_FLAGS_USER_ALLOC_CB:
- *      Indicates if the CB was allocated and mapped by userspace.
- *      User allocated CB is a command buffer allocated by the user, via malloc
- *      (or similar). After allocating the CB, the user invokes “memory ioctl”
- *      to map the user memory into a device virtual address. The user provides
- *      this address via the cb_handle field. The interface provides the
- *      ability to create a large CBs, Which aren’t limited to
- *      “HL_MAX_CB_SIZE”. Therefore, it increases the PCI-DMA queues
- *      throughput. This CB allocation method also reduces the use of Linux
- *      DMA-able memory pool. Which are limited and used by other Linux
- *      sub-systems.
+ *      Indicates if the CB was allocated and mapped by userspace
+ *      (relevant to greco and above). User allocated CB is a command buffer,
+ *      allocated by the user, via malloc (or similar). After allocating the
+ *      CB, the user invokes - “memory ioctl” to map the user memory into a
+ *      device virtual address. The user provides this address via the
+ *      cb_handle field. The interface provides the ability to create a
+ *      large CBs, Which aren’t limited to “HL_MAX_CB_SIZE”. Therefore, it
+ *      increases the PCI-DMA queues throughput. This CB allocation method
+ *      also reduces the use of Linux DMA-able memory pool. Which are limited
+ *      and used by other Linux sub-systems.
  */
 #define HL_CS_CHUNK_FLAGS_USER_ALLOC_CB 0x1
 
@@ -840,12 +1224,17 @@ union hl_cb_args {
  */
 struct hl_cs_chunk {
 	union {
-		/* For external queue, this represents a Handle of CB on the
+		/* Goya/Gaudi:
+		 * For external queue, this represents a Handle of CB on the
 		 * Host.
 		 * For internal queue in Goya, this represents an SRAM or
 		 * a DRAM address of the internal CB. In Gaudi, this might also
 		 * represent a mapped host address of the CB.
 		 *
+		 * Greco onwards:
+		 * For H/W queue, this represents either a Handle of CB on the
+		 * Host, or an SRAM, a DRAM, or a mapped host address of the CB.
+		 *
 		 * A mapped host address is in the device address space, after
 		 * a host address was mapped by the device MMU.
 		 */
@@ -910,11 +1299,12 @@ struct hl_cs_chunk {
 	__u32 pad[10];
 };
 
-/* SIGNAL and WAIT/COLLECTIVE_WAIT flags are mutually exclusive */
+/* SIGNAL/WAIT/COLLECTIVE_WAIT flags are mutually exclusive */
 #define HL_CS_FLAGS_FORCE_RESTORE		0x1
 #define HL_CS_FLAGS_SIGNAL			0x2
 #define HL_CS_FLAGS_WAIT			0x4
 #define HL_CS_FLAGS_COLLECTIVE_WAIT		0x8
+
 #define HL_CS_FLAGS_TIMESTAMP			0x20
 #define HL_CS_FLAGS_STAGED_SUBMISSION		0x40
 #define HL_CS_FLAGS_STAGED_SUBMISSION_FIRST	0x80
@@ -1174,14 +1564,19 @@ union hl_wait_cs_args {
 
 /* Opcode to allocate device memory */
 #define HL_MEM_OP_ALLOC			0
+
 /* Opcode to free previously allocated device memory */
 #define HL_MEM_OP_FREE			1
+
 /* Opcode to map host and device memory */
 #define HL_MEM_OP_MAP			2
+
 /* Opcode to unmap previously mapped host and device memory */
 #define HL_MEM_OP_UNMAP			3
+
 /* Opcode to map a hw block */
 #define HL_MEM_OP_MAP_BLOCK		4
+
 /* Opcode to create DMA-BUF object for an existing device memory allocation
  * and to export an FD of that DMA-BUF back to the caller
  */
@@ -1400,7 +1795,16 @@ struct hl_debug_params_bmon {
 
 	/* Trace source ID */
 	__u32 id;
-	__u32 pad;
+
+	/* Control register */
+	__u32 control;
+
+	/* Two more address ranges that the user can request to filter */
+	__u64 start_addr2;
+	__u64 end_addr2;
+
+	__u64 start_addr3;
+	__u64 end_addr3;
 };
 
 struct hl_debug_params_spmu {
@@ -1409,7 +1813,11 @@ struct hl_debug_params_spmu {
 
 	/* Number of event types selection */
 	__u32 event_types_num;
-	__u32 pad;
+
+	/* TRC configuration register values */
+	__u32 pmtrc_val;
+	__u32 trc_ctrl_host_val;
+	__u32 trc_en_host_val;
 };
 
 /* Opcode for ETR component */
@@ -1524,16 +1932,23 @@ struct hl_debug_args {
  * (or if its the first CS for this context). The user can also order the
  * driver to run the "restore" phase explicitly
  *
+ * Goya/Gaudi:
  * There are two types of queues - external and internal. External queues
  * are DMA queues which transfer data from/to the Host. All other queues are
  * internal. The driver will get completion notifications from the device only
  * on JOBS which are enqueued in the external queues.
  *
+ * Greco onwards:
+ * There is a single type of queue for all types of engines, either DMA engines
+ * for transfers from/to the host or inside the device, or compute engines.
+ * The driver will get completion notifications from the device for all queues.
+ *
  * For jobs on external queues, the user needs to create command buffers
  * through the CB ioctl and give the CB's handle to the CS ioctl. For jobs on
  * internal queues, the user needs to prepare a "command buffer" with packets
  * on either the device SRAM/DRAM or the host, and give the device address of
  * that buffer to the CS ioctl.
+ * For jobs on H/W queues both options of command buffers are valid.
  *
  * This IOCTL is asynchronous in regard to the actual execution of the CS. This
  * means it returns immediately after ALL the JOBS were enqueued on their
@@ -1542,7 +1957,7 @@ struct hl_debug_args {
  *
  * Upon successful enqueue, the IOCTL returns a sequence number which the user
  * can use with the "Wait for CS" IOCTL to check whether the handle's CS
- * external JOBS have been completed. Note that if the CS has internal JOBS
+ * non-internal JOBS have been completed. Note that if the CS has internal JOBS
  * which can execute AFTER the external JOBS have finished, the driver might
  * report that the CS has finished executing BEFORE the internal JOBS have
  * actually finished executing.
-- 
cgit 


From 1a6609cdd496b1d2183189674962035903025976 Mon Sep 17 00:00:00 2001
From: Ofir Bitton <obitton@habana.ai>
Date: Tue, 28 Jun 2022 21:05:28 +0300
Subject: habanalabs: naming refactor of user interrupt flow

Current naming convention can be misleading. Hence renaming some
variables and defines in order to be more explicit.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/command_submission.c |  6 +++---
 drivers/misc/habanalabs/common/habanalabs.h         |  8 ++++----
 drivers/misc/habanalabs/common/irq.c                | 12 ++++++------
 drivers/misc/habanalabs/gaudi2/gaudi2.c             |  6 +++---
 include/uapi/misc/habanalabs.h                      |  7 +++++--
 5 files changed, 21 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index e91ca31d4930..275dcb69a40e 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -1080,7 +1080,7 @@ void hl_release_pending_user_interrupts(struct hl_device *hdev)
 		wake_pending_user_interrupt_threads(interrupt);
 	}
 
-	interrupt = &hdev->common_user_interrupt;
+	interrupt = &hdev->common_user_cq_interrupt;
 	wake_pending_user_interrupt_threads(interrupt);
 }
 
@@ -3373,8 +3373,8 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 		int_idx = interrupt_id - first_interrupt + prop->user_dec_intr_count;
 		interrupt = &hdev->user_interrupt[int_idx];
 
-	} else if (interrupt_id == HL_COMMON_USER_INTERRUPT_ID) {
-		interrupt = &hdev->common_user_interrupt;
+	} else if (interrupt_id == HL_COMMON_USER_CQ_INTERRUPT_ID) {
+		interrupt = &hdev->common_user_cq_interrupt;
 	} else {
 		dev_err(hdev->dev, "invalid user interrupt %u", interrupt_id);
 		return -EINVAL;
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 8c38c2c1b1dc..9b2451f3619a 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -76,7 +76,7 @@ struct hl_fpriv;
 
 #define HL_INVALID_QUEUE		UINT_MAX
 
-#define HL_COMMON_USER_INTERRUPT_ID	0xFFF
+#define HL_COMMON_USER_CQ_INTERRUPT_ID	0xFFF
 
 #define HL_STATE_DUMP_HIST_LEN		5
 
@@ -2952,8 +2952,8 @@ struct hl_reset_info {
  * @user_interrupt: array of hl_user_interrupt. upon the corresponding user
  *                  interrupt, driver will monitor the list of fences
  *                  registered to this interrupt.
- * @common_user_interrupt: common user interrupt for all user interrupts.
- *                         upon any user interrupt, driver will monitor the
+ * @common_user_cq_interrupt: common user CQ interrupt for all user CQ interrupts.
+ *                         upon any user CQ interrupt, driver will monitor the
  *                         list of fences registered to this common structure.
  * @shadow_cs_queue: pointer to a shadow queue that holds pointers to
  *                   outstanding command submissions.
@@ -3118,7 +3118,7 @@ struct hl_device {
 	enum hl_asic_type		asic_type;
 	struct hl_cq			*completion_queue;
 	struct hl_user_interrupt	*user_interrupt;
-	struct hl_user_interrupt	common_user_interrupt;
+	struct hl_user_interrupt	common_user_cq_interrupt;
 	struct hl_cs			**shadow_cs_queue;
 	struct workqueue_struct		**cq_wq;
 	struct workqueue_struct		*eq_wq;
diff --git a/drivers/misc/habanalabs/common/irq.c b/drivers/misc/habanalabs/common/irq.c
index c1088377d1de..fd8f2bd9020e 100644
--- a/drivers/misc/habanalabs/common/irq.c
+++ b/drivers/misc/habanalabs/common/irq.c
@@ -269,7 +269,7 @@ static int handle_registration_node(struct hl_device *hdev, struct hl_user_pendi
 	return 0;
 }
 
-static void handle_user_cq(struct hl_device *hdev, struct hl_user_interrupt *user_cq)
+static void handle_user_interrupt(struct hl_device *hdev, struct hl_user_interrupt *intr)
 {
 	struct hl_user_pending_interrupt *pend, *temp_pend;
 	struct list_head *ts_reg_free_list_head = NULL;
@@ -291,8 +291,8 @@ static void handle_user_cq(struct hl_device *hdev, struct hl_user_interrupt *use
 	if (!job)
 		return;
 
-	spin_lock(&user_cq->wait_list_lock);
-	list_for_each_entry_safe(pend, temp_pend, &user_cq->wait_list_head, wait_list_node) {
+	spin_lock(&intr->wait_list_lock);
+	list_for_each_entry_safe(pend, temp_pend, &intr->wait_list_head, wait_list_node) {
 		if ((pend->cq_kernel_addr && *(pend->cq_kernel_addr) >= pend->cq_target_value) ||
 				!pend->cq_kernel_addr) {
 			if (pend->ts_reg_info.buf) {
@@ -309,7 +309,7 @@ static void handle_user_cq(struct hl_device *hdev, struct hl_user_interrupt *use
 			}
 		}
 	}
-	spin_unlock(&user_cq->wait_list_lock);
+	spin_unlock(&intr->wait_list_lock);
 
 	if (ts_reg_free_list_head) {
 		INIT_WORK(&job->free_obj, hl_ts_free_objects);
@@ -339,10 +339,10 @@ irqreturn_t hl_irq_handler_user_interrupt(int irq, void *arg)
 	 */
 	if (!user_int->is_decoder)
 		/* Handle user cq interrupts registered on all interrupts */
-		handle_user_cq(hdev, &hdev->common_user_interrupt);
+		handle_user_interrupt(hdev, &hdev->common_user_cq_interrupt);
 
 	/* Handle user cq or decoder interrupts registered on this specific irq */
-	handle_user_cq(hdev, user_int);
+	handle_user_interrupt(hdev, user_int);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 93373290e894..8f012076363d 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -2891,9 +2891,9 @@ static void gaudi2_user_interrupt_setup(struct hl_device *hdev)
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	int i, j, k;
 
-	/* Initialize common user interrupt */
-	HL_USR_INTR_STRUCT_INIT(hdev->common_user_interrupt, hdev, HL_COMMON_USER_INTERRUPT_ID,
-				false);
+	/* Initialize common user CQ interrupt */
+	HL_USR_INTR_STRUCT_INIT(hdev->common_user_cq_interrupt, hdev,
+				HL_COMMON_USER_CQ_INTERRUPT_ID, false);
 
 	/* User interrupts structure holds both decoder and user interrupts from various engines.
 	 * We first initialize the decoder interrupts and then we add the user interrupts.
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 77b89c537ee8..4ee24a3a13e9 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -1442,6 +1442,7 @@ union hl_cs_args {
 
 #define HL_WAIT_CS_FLAGS_INTERRUPT		0x2
 #define HL_WAIT_CS_FLAGS_INTERRUPT_MASK		0xFFF00000
+#define HL_WAIT_CS_FLAGS_ANY_CQ_INTERRUPT	0xFFF00000
 #define HL_WAIT_CS_FLAGS_MULTI_CS		0x4
 #define HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ	0x10
 #define HL_WAIT_CS_FLAGS_REGISTER_INTERRUPT	0x20
@@ -1491,8 +1492,10 @@ struct hl_wait_cs_in {
 
 	/* HL_WAIT_CS_FLAGS_*
 	 * If HL_WAIT_CS_FLAGS_INTERRUPT is set, this field should include
-	 * interrupt id according to HL_WAIT_CS_FLAGS_INTERRUPT_MASK, in order
-	 * not to specify an interrupt id ,set mask to all 1s.
+	 * interrupt id according to HL_WAIT_CS_FLAGS_INTERRUPT_MASK
+	 *
+	 * in order to wait for any CQ interrupt, set interrupt value to
+	 * HL_WAIT_CS_FLAGS_ANY_CQ_INTERRUPT.
 	 */
 	__u32 flags;
 
-- 
cgit 


From d6a66d59609fc45afc91149e13dddafb8faff0d6 Mon Sep 17 00:00:00 2001
From: Ofir Bitton <obitton@habana.ai>
Date: Tue, 28 Jun 2022 18:34:58 +0300
Subject: habanalabs: add support for common decoder interrupts

User application should be able to get notification for any decoder
completion. Hence, we introduce a new interface in which a user
can wait for all current decoder pending interrupts.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/command_submission.c | 5 +++++
 drivers/misc/habanalabs/common/habanalabs.h         | 3 +++
 drivers/misc/habanalabs/common/irq.c                | 9 +++------
 drivers/misc/habanalabs/gaudi2/gaudi2.c             | 4 ++++
 include/uapi/misc/habanalabs.h                      | 4 ++++
 5 files changed, 19 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 275dcb69a40e..eb5f1aee15fc 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -1082,6 +1082,9 @@ void hl_release_pending_user_interrupts(struct hl_device *hdev)
 
 	interrupt = &hdev->common_user_cq_interrupt;
 	wake_pending_user_interrupt_threads(interrupt);
+
+	interrupt = &hdev->common_decoder_interrupt;
+	wake_pending_user_interrupt_threads(interrupt);
 }
 
 static void job_wq_completion(struct work_struct *work)
@@ -3375,6 +3378,8 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 
 	} else if (interrupt_id == HL_COMMON_USER_CQ_INTERRUPT_ID) {
 		interrupt = &hdev->common_user_cq_interrupt;
+	} else if (interrupt_id == HL_COMMON_DEC_INTERRUPT_ID) {
+		interrupt = &hdev->common_decoder_interrupt;
 	} else {
 		dev_err(hdev->dev, "invalid user interrupt %u", interrupt_id);
 		return -EINVAL;
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 9b2451f3619a..7e84f2ce49ae 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -77,6 +77,7 @@ struct hl_fpriv;
 #define HL_INVALID_QUEUE		UINT_MAX
 
 #define HL_COMMON_USER_CQ_INTERRUPT_ID	0xFFF
+#define HL_COMMON_DEC_INTERRUPT_ID	0xFFE
 
 #define HL_STATE_DUMP_HIST_LEN		5
 
@@ -2955,6 +2956,7 @@ struct hl_reset_info {
  * @common_user_cq_interrupt: common user CQ interrupt for all user CQ interrupts.
  *                         upon any user CQ interrupt, driver will monitor the
  *                         list of fences registered to this common structure.
+ * @common_decoder_interrupt: common decoder interrupt for all user decoder interrupts.
  * @shadow_cs_queue: pointer to a shadow queue that holds pointers to
  *                   outstanding command submissions.
  * @cq_wq: work queues of completion queues for executing work in process
@@ -3119,6 +3121,7 @@ struct hl_device {
 	struct hl_cq			*completion_queue;
 	struct hl_user_interrupt	*user_interrupt;
 	struct hl_user_interrupt	common_user_cq_interrupt;
+	struct hl_user_interrupt	common_decoder_interrupt;
 	struct hl_cs			**shadow_cs_queue;
 	struct workqueue_struct		**cq_wq;
 	struct workqueue_struct		*eq_wq;
diff --git a/drivers/misc/habanalabs/common/irq.c b/drivers/misc/habanalabs/common/irq.c
index fd8f2bd9020e..d60dafb03a8e 100644
--- a/drivers/misc/habanalabs/common/irq.c
+++ b/drivers/misc/habanalabs/common/irq.c
@@ -333,12 +333,9 @@ irqreturn_t hl_irq_handler_user_interrupt(int irq, void *arg)
 	struct hl_user_interrupt *user_int = arg;
 	struct hl_device *hdev = user_int->hdev;
 
-	/* If the interrupt is not a decoder interrupt, it means the interrupt
-	 * belongs to a user cq. In that case, before handling it, we need to handle the common
-	 * user cq
-	 */
-	if (!user_int->is_decoder)
-		/* Handle user cq interrupts registered on all interrupts */
+	if (user_int->is_decoder)
+		handle_user_interrupt(hdev, &hdev->common_decoder_interrupt);
+	else
 		handle_user_interrupt(hdev, &hdev->common_user_cq_interrupt);
 
 	/* Handle user cq or decoder interrupts registered on this specific irq */
diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 8f012076363d..83da4247e71b 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -2895,6 +2895,10 @@ static void gaudi2_user_interrupt_setup(struct hl_device *hdev)
 	HL_USR_INTR_STRUCT_INIT(hdev->common_user_cq_interrupt, hdev,
 				HL_COMMON_USER_CQ_INTERRUPT_ID, false);
 
+	/* Initialize common decoder interrupt */
+	HL_USR_INTR_STRUCT_INIT(hdev->common_decoder_interrupt, hdev,
+				HL_COMMON_DEC_INTERRUPT_ID, true);
+
 	/* User interrupts structure holds both decoder and user interrupts from various engines.
 	 * We first initialize the decoder interrupts and then we add the user interrupts.
 	 * The only limitation is that the last decoder interrupt id must be smaller
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 4ee24a3a13e9..8c6ab71e7831 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -1443,6 +1443,7 @@ union hl_cs_args {
 #define HL_WAIT_CS_FLAGS_INTERRUPT		0x2
 #define HL_WAIT_CS_FLAGS_INTERRUPT_MASK		0xFFF00000
 #define HL_WAIT_CS_FLAGS_ANY_CQ_INTERRUPT	0xFFF00000
+#define HL_WAIT_CS_FLAGS_ANY_DEC_INTERRUPT	0xFFE00000
 #define HL_WAIT_CS_FLAGS_MULTI_CS		0x4
 #define HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ	0x10
 #define HL_WAIT_CS_FLAGS_REGISTER_INTERRUPT	0x20
@@ -1496,6 +1497,9 @@ struct hl_wait_cs_in {
 	 *
 	 * in order to wait for any CQ interrupt, set interrupt value to
 	 * HL_WAIT_CS_FLAGS_ANY_CQ_INTERRUPT.
+	 *
+	 * in order to wait for any decoder interrupt, set interrupt value to
+	 * HL_WAIT_CS_FLAGS_ANY_DEC_INTERRUPT.
 	 */
 	__u32 flags;
 
-- 
cgit 


From e3b20f3ee452e3ce1077822b2558846eb4fe571f Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Thu, 7 Jul 2022 11:42:15 +0300
Subject: habanalabs: add status of reset after device release

The user might want to know the device is in reset after device
release, which is not an erroneous event as a regular reset.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c         | 17 +++++++++++------
 drivers/misc/habanalabs/common/habanalabs_drv.c |  6 +++++-
 include/uapi/misc/habanalabs.h                  |  5 ++++-
 3 files changed, 20 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 5bc291c11e9b..19c049046383 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -271,16 +271,20 @@ enum hl_device_status hl_device_status(struct hl_device *hdev)
 {
 	enum hl_device_status status;
 
-	if (hdev->reset_info.in_reset)
-		status = HL_DEVICE_STATUS_IN_RESET;
-	else if (hdev->reset_info.needs_reset)
+	if (hdev->reset_info.in_reset) {
+		if (hdev->reset_info.is_in_soft_reset)
+			status = HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE;
+		else
+			status = HL_DEVICE_STATUS_IN_RESET;
+	} else if (hdev->reset_info.needs_reset) {
 		status = HL_DEVICE_STATUS_NEEDS_RESET;
-	else if (hdev->disabled)
+	} else if (hdev->disabled) {
 		status = HL_DEVICE_STATUS_MALFUNCTION;
-	else if (!hdev->init_done)
+	} else if (!hdev->init_done) {
 		status = HL_DEVICE_STATUS_IN_DEVICE_CREATION;
-	else
+	} else {
 		status = HL_DEVICE_STATUS_OPERATIONAL;
+	}
 
 	return status;
 }
@@ -296,6 +300,7 @@ bool hl_device_operational(struct hl_device *hdev,
 
 	switch (current_status) {
 	case HL_DEVICE_STATUS_IN_RESET:
+	case HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE:
 	case HL_DEVICE_STATUS_MALFUNCTION:
 	case HL_DEVICE_STATUS_NEEDS_RESET:
 		return false;
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index d900bae86168..f733ead605e7 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -165,7 +165,8 @@ int hl_device_open(struct inode *inode, struct file *filp)
 			"Can't open %s because it is %s\n",
 			dev_name(hdev->dev), hdev->status[status]);
 
-		if (status == HL_DEVICE_STATUS_IN_RESET)
+		if (status == HL_DEVICE_STATUS_IN_RESET ||
+					status == HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE)
 			rc = -EAGAIN;
 		else
 			rc = -EPERM;
@@ -395,6 +396,9 @@ static int create_hdev(struct hl_device **dev, struct pci_dev *pdev)
 	strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX);
 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION],
 					"in device creation", HL_STR_MAX);
+	strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE],
+					"in reset after device release", HL_STR_MAX);
+
 
 	/* First, we must find out which ASIC are we handling. This is needed
 	 * to configure the behavior of the driver (kernel parameters)
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 8c6ab71e7831..5d06d5c74dd1 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -684,6 +684,8 @@ enum hl_goya_dma_direction {
  * @HL_DEVICE_STATUS_NEEDS_RESET: Device needs reset because auto reset was disabled.
  * @HL_DEVICE_STATUS_IN_DEVICE_CREATION: Device is operational but its creation is still in
  *                                       progress.
+ * @HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE: Device is currently during reset that was
+ *                                                  triggered because the user released the device
  * @HL_DEVICE_STATUS_LAST: Last status.
  */
 enum hl_device_status {
@@ -692,7 +694,8 @@ enum hl_device_status {
 	HL_DEVICE_STATUS_MALFUNCTION,
 	HL_DEVICE_STATUS_NEEDS_RESET,
 	HL_DEVICE_STATUS_IN_DEVICE_CREATION,
-	HL_DEVICE_STATUS_LAST = HL_DEVICE_STATUS_IN_DEVICE_CREATION
+	HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE,
+	HL_DEVICE_STATUS_LAST = HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE
 };
 
 enum hl_server_type {
-- 
cgit 


From 868232f5cd382c37a5005deb7be0620c6f7bc08d Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@nvidia.com>
Date: Mon, 11 Jul 2022 01:14:02 -0700
Subject: devlink: Remove unused function devlink_rate_nodes_destroy

The previous patch removed the last usage of the function
devlink_rate_nodes_destroy(). Thus, remove this function from devlink
API.

Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/devlink.h |  1 -
 net/core/devlink.c    | 18 ------------------
 2 files changed, 19 deletions(-)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 2a2a2a0c93f7..0e163cc87d45 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1571,7 +1571,6 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port,
 				   bool external);
 int devlink_rate_leaf_create(struct devlink_port *port, void *priv);
 void devlink_rate_leaf_destroy(struct devlink_port *devlink_port);
-void devlink_rate_nodes_destroy(struct devlink *devlink);
 void devlink_port_linecard_set(struct devlink_port *devlink_port,
 			       struct devlink_linecard *linecard);
 struct devlink_linecard *
diff --git a/net/core/devlink.c b/net/core/devlink.c
index db61f3a341cb..1588e2246234 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -10095,24 +10095,6 @@ void devl_rate_nodes_destroy(struct devlink *devlink)
 }
 EXPORT_SYMBOL_GPL(devl_rate_nodes_destroy);
 
-/**
- * devlink_rate_nodes_destroy - destroy all devlink rate nodes on device
- *
- * @devlink: devlink instance
- *
- * Unset parent for all rate objects and destroy all rate nodes
- * on specified device.
- *
- * Context: Takes and release devlink->lock <mutex>.
- */
-void devlink_rate_nodes_destroy(struct devlink *devlink)
-{
-	mutex_lock(&devlink->lock);
-	devl_rate_nodes_destroy(devlink);
-	mutex_unlock(&devlink->lock);
-}
-EXPORT_SYMBOL_GPL(devlink_rate_nodes_destroy);
-
 /**
  *	devlink_port_linecard_set - Link port with a linecard
  *
-- 
cgit 


From df539fc62b069ed2b2395d8d1c77f7758c5001a6 Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@nvidia.com>
Date: Mon, 11 Jul 2022 01:14:05 -0700
Subject: devlink: Remove unused functions devlink_rate_leaf_create/destroy

The previous patch removed the last usage of the functions
devlink_rate_leaf_create() and devlink_rate_nodes_destroy(). Thus,
remove these function from devlink API.

Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/devlink.h |  2 --
 net/core/devlink.c    | 42 +++++++-----------------------------------
 2 files changed, 7 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 0e163cc87d45..5150deb67fab 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1569,8 +1569,6 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro
 void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port,
 				   u32 controller, u16 pf, u32 sf,
 				   bool external);
-int devlink_rate_leaf_create(struct devlink_port *port, void *priv);
-void devlink_rate_leaf_destroy(struct devlink_port *devlink_port);
 void devlink_port_linecard_set(struct devlink_port *devlink_port,
 			       struct devlink_linecard *linecard);
 struct devlink_linecard *
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 1588e2246234..970e5c2a52bd 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -10006,20 +10006,13 @@ int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv)
 }
 EXPORT_SYMBOL_GPL(devl_rate_leaf_create);
 
-int
-devlink_rate_leaf_create(struct devlink_port *devlink_port, void *priv)
-{
-	struct devlink *devlink = devlink_port->devlink;
-	int ret;
-
-	mutex_lock(&devlink->lock);
-	ret = devl_rate_leaf_create(devlink_port, priv);
-	mutex_unlock(&devlink->lock);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(devlink_rate_leaf_create);
-
+/**
+ * devl_rate_leaf_destroy - destroy devlink rate leaf
+ *
+ * @devlink_port: devlink port linked to the rate object
+ *
+ * Destroy the devlink rate object of type leaf on provided @devlink_port.
+ */
 void devl_rate_leaf_destroy(struct devlink_port *devlink_port)
 {
 	struct devlink_rate *devlink_rate = devlink_port->devlink_rate;
@@ -10037,27 +10030,6 @@ void devl_rate_leaf_destroy(struct devlink_port *devlink_port)
 }
 EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy);
 
-/**
- * devlink_rate_leaf_destroy - destroy devlink rate leaf
- *
- * @devlink_port: devlink port linked to the rate object
- *
- * Context: Takes and release devlink->lock <mutex>.
- */
-void devlink_rate_leaf_destroy(struct devlink_port *devlink_port)
-{
-	struct devlink_rate *devlink_rate = devlink_port->devlink_rate;
-	struct devlink *devlink = devlink_port->devlink;
-
-	if (!devlink_rate)
-		return;
-
-	mutex_lock(&devlink->lock);
-	devl_rate_leaf_destroy(devlink_port);
-	mutex_unlock(&devlink->lock);
-}
-EXPORT_SYMBOL_GPL(devlink_rate_leaf_destroy);
-
 /**
  * devl_rate_nodes_destroy - destroy all devlink rate nodes on device
  * @devlink: devlink instance
-- 
cgit 


From 7b19119f4c7dc9412b556ea12fae6b32574e2810 Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@nvidia.com>
Date: Mon, 11 Jul 2022 01:14:06 -0700
Subject: net/mlx5: Use devl_ API in mlx5e_devlink_port_register

As part of the flows invoked by mlx5_devlink_eswitch_mode_set() get to
mlx5_rescan_drivers_locked() which can call mlx5e_probe()/mlx5e_remove
and register/unregister mlx5e driver ports accordingly. This can lead to
deadlock once mlx5_devlink_eswitch_mode_set() will use devlink lock.
Use devl_port_register/unregister() instead of
devlink_port_register/unregister() and add devlink instance locks in the
driver paths to this function to have it locked while calling devl_ API
function.

If remove or probe were called by module init or module cleanup flows,
need to lock devlink just before calling devl_port_register(), otherwise
it is called by attach/detach or register/unregister flow and we can
have the flow locked. Added flag to distinguish between these cases.

This will be used by the downstream patch to invoke
mlx5_devlink_eswitch_mode_set() with devlink locked.

Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/dev.c      | 29 ++++++++++++++++++++--
 .../net/ethernet/mellanox/mlx5/core/en/devlink.c   | 16 ++++++++++--
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c |  2 ++
 include/linux/mlx5/driver.h                        |  4 +++
 4 files changed, 47 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
index 50422b56a64d..ccf2068d2e79 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
@@ -335,13 +335,16 @@ static void del_adev(struct auxiliary_device *adev)
 
 int mlx5_attach_device(struct mlx5_core_dev *dev)
 {
+	struct devlink *devlink = priv_to_devlink(dev);
 	struct mlx5_priv *priv = &dev->priv;
 	struct auxiliary_device *adev;
 	struct auxiliary_driver *adrv;
 	int ret = 0, i;
 
+	devl_lock(devlink);
 	mutex_lock(&mlx5_intf_mutex);
 	priv->flags &= ~MLX5_PRIV_FLAGS_DETACH;
+	priv->flags |= MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW;
 	for (i = 0; i < ARRAY_SIZE(mlx5_adev_devices); i++) {
 		if (!priv->adev[i]) {
 			bool is_supported = false;
@@ -389,19 +392,24 @@ int mlx5_attach_device(struct mlx5_core_dev *dev)
 			break;
 		}
 	}
+	priv->flags &= ~MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW;
 	mutex_unlock(&mlx5_intf_mutex);
+	devl_unlock(devlink);
 	return ret;
 }
 
 void mlx5_detach_device(struct mlx5_core_dev *dev)
 {
+	struct devlink *devlink = priv_to_devlink(dev);
 	struct mlx5_priv *priv = &dev->priv;
 	struct auxiliary_device *adev;
 	struct auxiliary_driver *adrv;
 	pm_message_t pm = {};
 	int i;
 
+	devl_lock(devlink);
 	mutex_lock(&mlx5_intf_mutex);
+	priv->flags |= MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW;
 	for (i = ARRAY_SIZE(mlx5_adev_devices) - 1; i >= 0; i--) {
 		if (!priv->adev[i])
 			continue;
@@ -430,18 +438,24 @@ skip_suspend:
 		del_adev(&priv->adev[i]->adev);
 		priv->adev[i] = NULL;
 	}
+	priv->flags &= ~MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW;
 	priv->flags |= MLX5_PRIV_FLAGS_DETACH;
 	mutex_unlock(&mlx5_intf_mutex);
+	devl_unlock(devlink);
 }
 
 int mlx5_register_device(struct mlx5_core_dev *dev)
 {
+	struct devlink *devlink;
 	int ret;
 
+	devlink = priv_to_devlink(dev);
+	devl_lock(devlink);
 	mutex_lock(&mlx5_intf_mutex);
 	dev->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV;
 	ret = mlx5_rescan_drivers_locked(dev);
 	mutex_unlock(&mlx5_intf_mutex);
+	devl_unlock(devlink);
 	if (ret)
 		mlx5_unregister_device(dev);
 
@@ -450,10 +464,15 @@ int mlx5_register_device(struct mlx5_core_dev *dev)
 
 void mlx5_unregister_device(struct mlx5_core_dev *dev)
 {
+	struct devlink *devlink;
+
+	devlink = priv_to_devlink(dev);
+	devl_lock(devlink);
 	mutex_lock(&mlx5_intf_mutex);
 	dev->priv.flags = MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV;
 	mlx5_rescan_drivers_locked(dev);
 	mutex_unlock(&mlx5_intf_mutex);
+	devl_unlock(devlink);
 }
 
 static int add_drivers(struct mlx5_core_dev *dev)
@@ -526,16 +545,22 @@ del_adev:
 int mlx5_rescan_drivers_locked(struct mlx5_core_dev *dev)
 {
 	struct mlx5_priv *priv = &dev->priv;
+	int err = 0;
 
 	lockdep_assert_held(&mlx5_intf_mutex);
 	if (priv->flags & MLX5_PRIV_FLAGS_DETACH)
 		return 0;
 
+	priv->flags |= MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW;
 	delete_drivers(dev);
 	if (priv->flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)
-		return 0;
+		goto out;
+
+	err = add_drivers(dev);
 
-	return add_drivers(dev);
+out:
+	priv->flags &= ~MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW;
+	return err;
 }
 
 bool mlx5_same_hw_devs(struct mlx5_core_dev *dev, struct mlx5_core_dev *peer_dev)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c
index ae52e7f38306..b69f9d10ccbd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c
@@ -21,6 +21,7 @@ int mlx5e_devlink_port_register(struct mlx5e_priv *priv)
 	struct netdev_phys_item_id ppid = {};
 	struct devlink_port *dl_port;
 	unsigned int dl_port_index;
+	int ret;
 
 	if (mlx5_core_is_pf(priv->mdev)) {
 		attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL;
@@ -41,7 +42,13 @@ int mlx5e_devlink_port_register(struct mlx5e_priv *priv)
 	memset(dl_port, 0, sizeof(*dl_port));
 	devlink_port_attrs_set(dl_port, &attrs);
 
-	return devlink_port_register(devlink, dl_port, dl_port_index);
+	if (!(priv->mdev->priv.flags & MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW))
+		devl_lock(devlink);
+	ret = devl_port_register(devlink, dl_port, dl_port_index);
+	if (!(priv->mdev->priv.flags & MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW))
+		devl_unlock(devlink);
+
+	return ret;
 }
 
 void mlx5e_devlink_port_type_eth_set(struct mlx5e_priv *priv)
@@ -54,8 +61,13 @@ void mlx5e_devlink_port_type_eth_set(struct mlx5e_priv *priv)
 void mlx5e_devlink_port_unregister(struct mlx5e_priv *priv)
 {
 	struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv);
+	struct devlink *devlink = priv_to_devlink(priv->mdev);
 
-	devlink_port_unregister(dl_port);
+	if (!(priv->mdev->priv.flags & MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW))
+		devl_lock(devlink);
+	devl_port_unregister(dl_port);
+	if (!(priv->mdev->priv.flags & MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW))
+		devl_unlock(devlink);
 }
 
 struct devlink_port *mlx5e_get_devlink_port(struct net_device *dev)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 1bfbc88f513f..ccda3a0a2594 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -3400,7 +3400,9 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
 		err = esw_offloads_start(esw, extack);
 	} else if (mode == DEVLINK_ESWITCH_MODE_LEGACY) {
 		err = esw_offloads_stop(esw, extack);
+		devl_lock(devlink);
 		mlx5_rescan_drivers(esw->dev);
+		devl_unlock(devlink);
 	} else {
 		err = -EINVAL;
 	}
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 76d7661e3e63..bd882884b23c 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -551,6 +551,10 @@ enum {
 	 * creation/deletion on drivers rescan. Unset during device attach.
 	 */
 	MLX5_PRIV_FLAGS_DETACH = 1 << 2,
+	/* Distinguish between mlx5e_probe/remove called by module init/cleanup
+	 * and called by other flows which can already hold devlink lock
+	 */
+	MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW = 1 << 3,
 };
 
 struct mlx5_adev {
-- 
cgit 


From 91f059c95c6a5dbc0907a5f871e7915a5e93c1f9 Mon Sep 17 00:00:00 2001
From: Shaik Sajida Bhanu <quic_c_sbhanu@quicinc.com>
Date: Fri, 27 May 2022 23:23:52 +0530
Subject: mmc: core: Capture eMMC and SD card errors

Add changes to capture eMMC and SD card errors.
This is useful for debug and testing.

Signed-off-by: Liangliang Lu <quic_luliang@quicinc.com>
Signed-off-by: Sayali Lokhande <quic_sayalil@quicinc.com>
Signed-off-by: Bao D. Nguyen <quic_nguyenb@quicinc.com>
Signed-off-by: Ram Prakash Gupta <quic_rampraka@quicinc.com>
Signed-off-by: Shaik Sajida Bhanu <quic_c_sbhanu@quicinc.com>
Acked-by: Adrian Hunter <adrian.hunter@intel.com>
Link: https://lore.kernel.org/r/1653674036-21829-2-git-send-email-quic_c_sbhanu@quicinc.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/core.c  | 10 +++++++++-
 include/linux/mmc/host.h | 26 ++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index 4b70cbfc6d5d..ef53a2578824 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -943,9 +943,11 @@ int mmc_execute_tuning(struct mmc_card *card)
 	}
 
 	/* Only print error when we don't check for card removal */
-	if (!host->detect_change)
+	if (!host->detect_change) {
 		pr_err("%s: tuning execution failed: %d\n",
 			mmc_hostname(host), err);
+		mmc_debugfs_err_stats_inc(host, MMC_ERR_TUNING);
+	}
 
 	return err;
 }
@@ -2244,6 +2246,12 @@ void mmc_rescan(struct work_struct *work)
 		if (freqs[i] <= host->f_min)
 			break;
 	}
+
+	/*
+	 * Ignore the command timeout errors observed during
+	 * the card init as those are excepted.
+	 */
+	host->err_stats[MMC_ERR_CMD_TIMEOUT] = 0;
 	mmc_release_host(host);
 
  out:
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index c193c50ccd78..eb8bc5b9b0b7 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -93,6 +93,25 @@ struct mmc_clk_phase_map {
 
 struct mmc_host;
 
+enum mmc_err_stat {
+	MMC_ERR_CMD_TIMEOUT,
+	MMC_ERR_CMD_CRC,
+	MMC_ERR_DAT_TIMEOUT,
+	MMC_ERR_DAT_CRC,
+	MMC_ERR_AUTO_CMD,
+	MMC_ERR_ADMA,
+	MMC_ERR_TUNING,
+	MMC_ERR_CMDQ_RED,
+	MMC_ERR_CMDQ_GCE,
+	MMC_ERR_CMDQ_ICCE,
+	MMC_ERR_REQ_TIMEOUT,
+	MMC_ERR_CMDQ_REQ_TIMEOUT,
+	MMC_ERR_ICE_CFG,
+	MMC_ERR_CTRL_TIMEOUT,
+	MMC_ERR_UNEXPECTED_IRQ,
+	MMC_ERR_MAX,
+};
+
 struct mmc_host_ops {
 	/*
 	 * It is optional for the host to implement pre_req and post_req in
@@ -501,6 +520,7 @@ struct mmc_host {
 	/* Host Software Queue support */
 	bool			hsq_enabled;
 
+	u32			err_stats[MMC_ERR_MAX];
 	unsigned long		private[] ____cacheline_aligned;
 };
 
@@ -635,6 +655,12 @@ static inline enum dma_data_direction mmc_get_dma_dir(struct mmc_data *data)
 	return data->flags & MMC_DATA_WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
 }
 
+static inline void mmc_debugfs_err_stats_inc(struct mmc_host *host,
+					     enum mmc_err_stat stat)
+{
+	host->err_stats[stat] += 1;
+}
+
 int mmc_send_tuning(struct mmc_host *host, u32 opcode, int *cmd_error);
 int mmc_send_abort_tuning(struct mmc_host *host, u32 opcode);
 int mmc_get_ext_csd(struct mmc_card *card, u8 **new_ext_csd);
-- 
cgit 


From efe8f5c9b5e118070f424205078ababc46fd130a Mon Sep 17 00:00:00 2001
From: Shaik Sajida Bhanu <quic_c_sbhanu@quicinc.com>
Date: Fri, 27 May 2022 23:23:53 +0530
Subject: mmc: sdhci: Capture eMMC and SD card errors

Add changes to capture eMMC and SD card errors.
This is useful for debug and testing.

Signed-off-by: Liangliang Lu <quic_luliang@quicinc.com>
Signed-off-by: Sayali Lokhande <quic_sayalil@quicinc.com>
Signed-off-by: Bao D. Nguyen <quic_nguyenb@quicinc.com>
Signed-off-by: Shaik Sajida Bhanu <quic_c_sbhanu@quicinc.com>
Acked-by: Adrian Hunter <adrian.hunter@intel.com>
Link: https://lore.kernel.org/r/1653674036-21829-3-git-send-email-quic_c_sbhanu@quicinc.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/host/sdhci.c | 59 ++++++++++++++++++++++++++++++++++++------------
 drivers/mmc/host/sdhci.h |  3 +++
 include/linux/mmc/mmc.h  |  6 +++++
 3 files changed, 53 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c
index 22152029e14c..7689ffec5ad1 100644
--- a/drivers/mmc/host/sdhci.c
+++ b/drivers/mmc/host/sdhci.c
@@ -224,6 +224,7 @@ void sdhci_reset(struct sdhci_host *host, u8 mask)
 		if (timedout) {
 			pr_err("%s: Reset 0x%x never completed.\n",
 				mmc_hostname(host->mmc), (int)mask);
+			sdhci_err_stats_inc(host, CTRL_TIMEOUT);
 			sdhci_dumpregs(host);
 			return;
 		}
@@ -1716,6 +1717,7 @@ static bool sdhci_send_command_retry(struct sdhci_host *host,
 		if (!timeout--) {
 			pr_err("%s: Controller never released inhibit bit(s).\n",
 			       mmc_hostname(host->mmc));
+			sdhci_err_stats_inc(host, CTRL_TIMEOUT);
 			sdhci_dumpregs(host);
 			cmd->error = -EIO;
 			return false;
@@ -1965,6 +1967,7 @@ void sdhci_enable_clk(struct sdhci_host *host, u16 clk)
 		if (timedout) {
 			pr_err("%s: Internal clock never stabilised.\n",
 			       mmc_hostname(host->mmc));
+			sdhci_err_stats_inc(host, CTRL_TIMEOUT);
 			sdhci_dumpregs(host);
 			return;
 		}
@@ -1987,6 +1990,7 @@ void sdhci_enable_clk(struct sdhci_host *host, u16 clk)
 			if (timedout) {
 				pr_err("%s: PLL clock never stabilised.\n",
 				       mmc_hostname(host->mmc));
+				sdhci_err_stats_inc(host, CTRL_TIMEOUT);
 				sdhci_dumpregs(host);
 				return;
 			}
@@ -3161,6 +3165,7 @@ static void sdhci_timeout_timer(struct timer_list *t)
 	if (host->cmd && !sdhci_data_line_cmd(host->cmd)) {
 		pr_err("%s: Timeout waiting for hardware cmd interrupt.\n",
 		       mmc_hostname(host->mmc));
+		sdhci_err_stats_inc(host, REQ_TIMEOUT);
 		sdhci_dumpregs(host);
 
 		host->cmd->error = -ETIMEDOUT;
@@ -3183,6 +3188,7 @@ static void sdhci_timeout_data_timer(struct timer_list *t)
 	    (host->cmd && sdhci_data_line_cmd(host->cmd))) {
 		pr_err("%s: Timeout waiting for hardware interrupt.\n",
 		       mmc_hostname(host->mmc));
+		sdhci_err_stats_inc(host, REQ_TIMEOUT);
 		sdhci_dumpregs(host);
 
 		if (host->data) {
@@ -3234,17 +3240,21 @@ static void sdhci_cmd_irq(struct sdhci_host *host, u32 intmask, u32 *intmask_p)
 			return;
 		pr_err("%s: Got command interrupt 0x%08x even though no command operation was in progress.\n",
 		       mmc_hostname(host->mmc), (unsigned)intmask);
+		sdhci_err_stats_inc(host, UNEXPECTED_IRQ);
 		sdhci_dumpregs(host);
 		return;
 	}
 
 	if (intmask & (SDHCI_INT_TIMEOUT | SDHCI_INT_CRC |
 		       SDHCI_INT_END_BIT | SDHCI_INT_INDEX)) {
-		if (intmask & SDHCI_INT_TIMEOUT)
+		if (intmask & SDHCI_INT_TIMEOUT) {
 			host->cmd->error = -ETIMEDOUT;
-		else
+			sdhci_err_stats_inc(host, CMD_TIMEOUT);
+		} else {
 			host->cmd->error = -EILSEQ;
-
+			if (!mmc_op_tuning(host->cmd->opcode))
+				sdhci_err_stats_inc(host, CMD_CRC);
+		}
 		/* Treat data command CRC error the same as data CRC error */
 		if (host->cmd->data &&
 		    (intmask & (SDHCI_INT_CRC | SDHCI_INT_TIMEOUT)) ==
@@ -3266,6 +3276,8 @@ static void sdhci_cmd_irq(struct sdhci_host *host, u32 intmask, u32 *intmask_p)
 			  -ETIMEDOUT :
 			  -EILSEQ;
 
+		sdhci_err_stats_inc(host, AUTO_CMD);
+
 		if (sdhci_auto_cmd23(host, mrq)) {
 			mrq->sbc->error = err;
 			__sdhci_finish_mrq(host, mrq);
@@ -3342,6 +3354,7 @@ static void sdhci_data_irq(struct sdhci_host *host, u32 intmask)
 			if (intmask & SDHCI_INT_DATA_TIMEOUT) {
 				host->data_cmd = NULL;
 				data_cmd->error = -ETIMEDOUT;
+				sdhci_err_stats_inc(host, CMD_TIMEOUT);
 				__sdhci_finish_mrq(host, data_cmd->mrq);
 				return;
 			}
@@ -3370,23 +3383,30 @@ static void sdhci_data_irq(struct sdhci_host *host, u32 intmask)
 
 		pr_err("%s: Got data interrupt 0x%08x even though no data operation was in progress.\n",
 		       mmc_hostname(host->mmc), (unsigned)intmask);
+		sdhci_err_stats_inc(host, UNEXPECTED_IRQ);
 		sdhci_dumpregs(host);
 
 		return;
 	}
 
-	if (intmask & SDHCI_INT_DATA_TIMEOUT)
+	if (intmask & SDHCI_INT_DATA_TIMEOUT) {
 		host->data->error = -ETIMEDOUT;
-	else if (intmask & SDHCI_INT_DATA_END_BIT)
+		sdhci_err_stats_inc(host, DAT_TIMEOUT);
+	} else if (intmask & SDHCI_INT_DATA_END_BIT) {
 		host->data->error = -EILSEQ;
-	else if ((intmask & SDHCI_INT_DATA_CRC) &&
+		if (!mmc_op_tuning(SDHCI_GET_CMD(sdhci_readw(host, SDHCI_COMMAND))))
+			sdhci_err_stats_inc(host, DAT_CRC);
+	} else if ((intmask & SDHCI_INT_DATA_CRC) &&
 		SDHCI_GET_CMD(sdhci_readw(host, SDHCI_COMMAND))
-			!= MMC_BUS_TEST_R)
+			!= MMC_BUS_TEST_R) {
 		host->data->error = -EILSEQ;
-	else if (intmask & SDHCI_INT_ADMA_ERROR) {
+		if (!mmc_op_tuning(SDHCI_GET_CMD(sdhci_readw(host, SDHCI_COMMAND))))
+			sdhci_err_stats_inc(host, DAT_CRC);
+	} else if (intmask & SDHCI_INT_ADMA_ERROR) {
 		pr_err("%s: ADMA error: 0x%08x\n", mmc_hostname(host->mmc),
 		       intmask);
 		sdhci_adma_show_error(host);
+		sdhci_err_stats_inc(host, ADMA);
 		host->data->error = -EIO;
 		if (host->ops->adma_workaround)
 			host->ops->adma_workaround(host, intmask);
@@ -3584,6 +3604,7 @@ out:
 	if (unexpected) {
 		pr_err("%s: Unexpected interrupt 0x%08x.\n",
 			   mmc_hostname(host->mmc), unexpected);
+		sdhci_err_stats_inc(host, UNEXPECTED_IRQ);
 		sdhci_dumpregs(host);
 	}
 
@@ -3905,20 +3926,27 @@ bool sdhci_cqe_irq(struct sdhci_host *host, u32 intmask, int *cmd_error,
 	if (!host->cqe_on)
 		return false;
 
-	if (intmask & (SDHCI_INT_INDEX | SDHCI_INT_END_BIT | SDHCI_INT_CRC))
+	if (intmask & (SDHCI_INT_INDEX | SDHCI_INT_END_BIT | SDHCI_INT_CRC)) {
 		*cmd_error = -EILSEQ;
-	else if (intmask & SDHCI_INT_TIMEOUT)
+		if (!mmc_op_tuning(host->cmd->opcode))
+			sdhci_err_stats_inc(host, CMD_CRC);
+	} else if (intmask & SDHCI_INT_TIMEOUT) {
 		*cmd_error = -ETIMEDOUT;
-	else
+		sdhci_err_stats_inc(host, CMD_TIMEOUT);
+	} else
 		*cmd_error = 0;
 
-	if (intmask & (SDHCI_INT_DATA_END_BIT | SDHCI_INT_DATA_CRC))
+	if (intmask & (SDHCI_INT_DATA_END_BIT | SDHCI_INT_DATA_CRC)) {
 		*data_error = -EILSEQ;
-	else if (intmask & SDHCI_INT_DATA_TIMEOUT)
+		if (!mmc_op_tuning(host->cmd->opcode))
+			sdhci_err_stats_inc(host, DAT_CRC);
+	} else if (intmask & SDHCI_INT_DATA_TIMEOUT) {
 		*data_error = -ETIMEDOUT;
-	else if (intmask & SDHCI_INT_ADMA_ERROR)
+		sdhci_err_stats_inc(host, DAT_TIMEOUT);
+	} else if (intmask & SDHCI_INT_ADMA_ERROR) {
 		*data_error = -EIO;
-	else
+		sdhci_err_stats_inc(host, ADMA);
+	} else
 		*data_error = 0;
 
 	/* Clear selected interrupts. */
@@ -3934,6 +3962,7 @@ bool sdhci_cqe_irq(struct sdhci_host *host, u32 intmask, int *cmd_error,
 		sdhci_writel(host, intmask, SDHCI_INT_STATUS);
 		pr_err("%s: CQE: Unexpected interrupt 0x%08x.\n",
 		       mmc_hostname(host->mmc), intmask);
+		sdhci_err_stats_inc(host, UNEXPECTED_IRQ);
 		sdhci_dumpregs(host);
 	}
 
diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h
index d7929d725730..95a08f09df30 100644
--- a/drivers/mmc/host/sdhci.h
+++ b/drivers/mmc/host/sdhci.h
@@ -356,6 +356,9 @@ struct sdhci_adma2_64_desc {
  */
 #define MMC_CMD_TRANSFER_TIME	(10 * NSEC_PER_MSEC) /* max 10 ms */
 
+#define sdhci_err_stats_inc(host, err_name) \
+	mmc_debugfs_err_stats_inc((host)->mmc, MMC_ERR_##err_name)
+
 enum sdhci_cookie {
 	COOKIE_UNMAPPED,
 	COOKIE_PRE_MAPPED,	/* mapped by sdhci_pre_req() */
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index d9a65c6a8816..9c50bc40f8ff 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -99,6 +99,12 @@ static inline bool mmc_op_multi(u32 opcode)
 	       opcode == MMC_READ_MULTIPLE_BLOCK;
 }
 
+static inline bool mmc_op_tuning(u32 opcode)
+{
+	return opcode == MMC_SEND_TUNING_BLOCK ||
+			opcode == MMC_SEND_TUNING_BLOCK_HS200;
+}
+
 /*
  * MMC_SWITCH argument format:
  *
-- 
cgit 


From 5022e221c98a609e0e5b0a73852c7e3d32f1c545 Mon Sep 17 00:00:00 2001
From: Zhengchao Shao <shaozhengchao@huawei.com>
Date: Mon, 11 Jul 2022 15:35:49 +0800
Subject: net: change the type of ip_route_input_rcu to static

The type of ip_route_input_rcu should be static.

Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
Link: https://lore.kernel.org/r/20220711073549.8947-1-shaozhengchao@huawei.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/route.h |  4 ----
 net/ipv4/route.c    | 34 +++++++++++++++++-----------------
 2 files changed, 17 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/net/route.h b/include/net/route.h
index 991a3985712d..651424ef09c9 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -201,10 +201,6 @@ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 			  struct in_device *in_dev, u32 *itag);
 int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src,
 			 u8 tos, struct net_device *devin);
-int ip_route_input_rcu(struct sk_buff *skb, __be32 dst, __be32 src,
-		       u8 tos, struct net_device *devin,
-		       struct fib_result *res);
-
 int ip_route_use_hint(struct sk_buff *skb, __be32 dst, __be32 src,
 		      u8 tos, struct net_device *devin,
 		      const struct sk_buff *hint);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2d16bcc7d346..8f4a8f66d604 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2439,24 +2439,9 @@ martian_source:
 	goto out;
 }
 
-int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-			 u8 tos, struct net_device *dev)
-{
-	struct fib_result res;
-	int err;
-
-	tos &= IPTOS_RT_MASK;
-	rcu_read_lock();
-	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
-	rcu_read_unlock();
-
-	return err;
-}
-EXPORT_SYMBOL(ip_route_input_noref);
-
 /* called with rcu_read_lock held */
-int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-		       u8 tos, struct net_device *dev, struct fib_result *res)
+static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+			      u8 tos, struct net_device *dev, struct fib_result *res)
 {
 	/* Multicast recognition logic is moved from route cache to here.
 	 * The problem was that too many Ethernet cards have broken/missing
@@ -2505,6 +2490,21 @@ int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
 }
 
+int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+			 u8 tos, struct net_device *dev)
+{
+	struct fib_result res;
+	int err;
+
+	tos &= IPTOS_RT_MASK;
+	rcu_read_lock();
+	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
+	rcu_read_unlock();
+
+	return err;
+}
+EXPORT_SYMBOL(ip_route_input_noref);
+
 /* called with rcu_read_lock() */
 static struct rtable *__mkroute_output(const struct fib_result *res,
 				       const struct flowi4 *fl4, int orig_oif,
-- 
cgit 


From 3f70c360d484466da7420f395d4675ca02436e32 Mon Sep 17 00:00:00 2001
From: Daniel Baluta <daniel.baluta@nxp.com>
Date: Tue, 12 Jul 2022 17:15:29 +0300
Subject: ASoC: SOF: Copy compress parameters into extended data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Allocate memory at the end of sof_ipc_stream_params to store
snd_compr_params in order to be sent them to SOF firmware.

This will help firmware correctly configure codecs parameters.

Notice, that we use 2 bytes from the reserved pool in order to store
the extended data length. This is compatible with older FWs where
there was no extended data.

Signed-off-by: Daniel Baluta <daniel.baluta@nxp.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Link: https://lore.kernel.org/r/20220712141531.14599-3-daniel.baluta@oss.nxp.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sof/stream.h |  6 ++++--
 sound/soc/sof/compress.c   | 17 +++++++++++++----
 2 files changed, 17 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/sound/sof/stream.h b/include/sound/sof/stream.h
index 1db3bbc3e65d..9377113f13e4 100644
--- a/include/sound/sof/stream.h
+++ b/include/sound/sof/stream.h
@@ -86,9 +86,11 @@ struct sof_ipc_stream_params {
 	uint32_t host_period_bytes;
 	uint16_t no_stream_position; /**< 1 means don't send stream position */
 	uint8_t cont_update_posn; /**< 1 means continuous update stream position */
-
-	uint8_t reserved[5];
+	uint8_t reserved0;
+	int16_t ext_data_length; /**< 0, means no extended data */
+	uint8_t reserved[2];
 	uint16_t chmap[SOF_IPC_MAX_CHANNELS];	/**< channel map - SOF_CHMAP_ */
+	uint8_t ext_data[]; /**< extended data */
 } __packed;
 
 /* PCM params info - SOF_IPC_STREAM_PCM_PARAMS */
diff --git a/sound/soc/sof/compress.c b/sound/soc/sof/compress.c
index 45c2ff61ee4d..1204dce29ef9 100644
--- a/sound/soc/sof/compress.c
+++ b/sound/soc/sof/compress.c
@@ -170,6 +170,7 @@ static int sof_compr_set_params(struct snd_soc_component *component,
 	struct snd_compr_tstamp *tstamp;
 	struct sof_ipc_pcm_params *pcm;
 	struct snd_sof_pcm *spcm;
+	size_t ext_data_size;
 	int ret;
 
 	tstamp = crtd->private_data;
@@ -179,7 +180,12 @@ static int sof_compr_set_params(struct snd_soc_component *component,
 	if (!spcm)
 		return -EINVAL;
 
-	pcm = kzalloc(sizeof(*pcm), GFP_KERNEL);
+	ext_data_size = sizeof(params->codec);
+
+	if (sizeof(*pcm) + ext_data_size > sdev->ipc->max_payload_size)
+		return -EINVAL;
+
+	pcm = kzalloc(sizeof(*pcm) + ext_data_size, GFP_KERNEL);
 	if (!pcm)
 		return -ENOMEM;
 
@@ -194,11 +200,11 @@ static int sof_compr_set_params(struct snd_soc_component *component,
 		goto out;
 
 	pcm->params.buffer.pages = PFN_UP(crtd->dma_bytes);
-	pcm->hdr.size = sizeof(*pcm);
+	pcm->hdr.size = sizeof(*pcm) + ext_data_size;
 	pcm->hdr.cmd = SOF_IPC_GLB_STREAM_MSG | SOF_IPC_STREAM_PCM_PARAMS;
 
 	pcm->comp_id = spcm->stream[cstream->direction].comp_id;
-	pcm->params.hdr.size = sizeof(pcm->params);
+	pcm->params.hdr.size = sizeof(pcm->params) + ext_data_size;
 	pcm->params.buffer.phy_addr = spcm->stream[cstream->direction].page_table.addr;
 	pcm->params.buffer.size = crtd->dma_bytes;
 	pcm->params.direction = cstream->direction;
@@ -209,8 +215,11 @@ static int sof_compr_set_params(struct snd_soc_component *component,
 	pcm->params.sample_container_bytes =
 		snd_pcm_format_physical_width(SNDRV_PCM_FORMAT_S32) >> 3;
 	pcm->params.host_period_bytes = params->buffer.fragment_size;
+	pcm->params.ext_data_length = ext_data_size;
+
+	memcpy((u8 *)pcm->params.ext_data, &params->codec, ext_data_size);
 
-	ret = sof_ipc_tx_message(sdev->ipc, pcm, sizeof(*pcm),
+	ret = sof_ipc_tx_message(sdev->ipc, pcm, sizeof(*pcm) + ext_data_size,
 				 &ipc_params_reply, sizeof(ipc_params_reply));
 	if (ret < 0) {
 		dev_err(component->dev, "error ipc failed\n");
-- 
cgit 


From 75b5b7a1ccf606281c4afe365a57ccca486641a2 Mon Sep 17 00:00:00 2001
From: Daniel Baluta <daniel.baluta@nxp.com>
Date: Tue, 12 Jul 2022 17:15:31 +0300
Subject: uapi: sof: abi: Bump SOF ABI for ext_data_length
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add new field to sof_ipc_stream_params in order to extend
stream params struct with extended data to store compress parameters.

Older kernel will still work this as they ext_data_length will always be
zero.

Signed-off-by: Daniel Baluta <daniel.baluta@nxp.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Link: https://lore.kernel.org/r/20220712141531.14599-5-daniel.baluta@oss.nxp.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/uapi/sound/sof/abi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/sound/sof/abi.h b/include/uapi/sound/sof/abi.h
index c88f467374ae..b7dce4df7ecd 100644
--- a/include/uapi/sound/sof/abi.h
+++ b/include/uapi/sound/sof/abi.h
@@ -28,7 +28,7 @@
 
 /* SOF ABI version major, minor and patch numbers */
 #define SOF_ABI_MAJOR 3
-#define SOF_ABI_MINOR 21
+#define SOF_ABI_MINOR 22
 #define SOF_ABI_PATCH 0
 
 /* SOF ABI version number. Format within 32bit word is MMmmmppp */
-- 
cgit 


From 2083da24eb56ce622332946800a67a7449d85fe5 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 10 Jun 2022 12:02:04 +0530
Subject: OPP: Allow multiple clocks for a device

This patch adds support to allow multiple clocks for a device.

The design is pretty much similar to how this is done for regulators,
and platforms can supply their own version of the config_clks() callback
if they have multiple clocks for their device. The core manages the
calls via opp_table->config_clks() eventually.

We have kept both "clk" and "clks" fields in the OPP table structure and
the reason is provided as a comment in _opp_set_clknames(). The same
isn't done for "rates" though and we use rates[0] at most of the places
now.

Co-developed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Tested-by: Jon Hunter <jonathanh@nvidia.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Tested-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 213 +++++++++++++++++++++++++++++++++++--------------
 drivers/opp/debugfs.c  |  27 ++++++-
 drivers/opp/of.c       |  66 +++++++++++----
 drivers/opp/opp.h      |  16 ++--
 include/linux/pm_opp.h |   7 +-
 5 files changed, 246 insertions(+), 83 deletions(-)

(limited to 'include')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index daabc810a1f9..e3322d54942a 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -181,7 +181,7 @@ unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp)
 		return 0;
 	}
 
-	return opp->rate;
+	return opp->rates[0];
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_get_freq);
 
@@ -430,7 +430,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_opp_count);
 /* Helpers to read keys */
 static unsigned long _read_freq(struct dev_pm_opp *opp, int index)
 {
-	return opp->rate;
+	return opp->rates[0];
 }
 
 static unsigned long _read_level(struct dev_pm_opp *opp, int index)
@@ -784,22 +784,19 @@ static int _set_opp_voltage(struct device *dev, struct regulator *reg,
 	return ret;
 }
 
-static inline int _generic_set_opp_clk_only(struct device *dev,
-		struct opp_table *opp_table, struct dev_pm_opp *opp, void *data)
+static int
+_opp_config_clk_single(struct device *dev, struct opp_table *opp_table,
+		       struct dev_pm_opp *opp, void *data, bool scaling_down)
 {
 	unsigned long *target = data;
 	unsigned long freq;
 	int ret;
 
-	/* We may reach here for devices which don't change frequency */
-	if (IS_ERR(opp_table->clk))
-		return 0;
-
 	/* One of target and opp must be available */
 	if (target) {
 		freq = *target;
 	} else if (opp) {
-		freq = opp->rate;
+		freq = opp->rates[0];
 	} else {
 		WARN_ON(1);
 		return -EINVAL;
@@ -1025,11 +1022,11 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table,
 	}
 
 	dev_dbg(dev, "%s: switching OPP: Freq %lu -> %lu Hz, Level %u -> %u, Bw %u -> %u\n",
-		__func__, old_opp->rate, opp->rate, old_opp->level, opp->level,
-		old_opp->bandwidth ? old_opp->bandwidth[0].peak : 0,
+		__func__, old_opp->rates[0], opp->rates[0], old_opp->level,
+		opp->level, old_opp->bandwidth ? old_opp->bandwidth[0].peak : 0,
 		opp->bandwidth ? opp->bandwidth[0].peak : 0);
 
-	scaling_down = _opp_compare_key(old_opp, opp);
+	scaling_down = _opp_compare_key(opp_table, old_opp, opp);
 	if (scaling_down == -1)
 		scaling_down = 0;
 
@@ -1059,9 +1056,11 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table,
 		}
 	}
 
-	ret = _generic_set_opp_clk_only(dev, opp_table, opp, clk_data);
-	if (ret)
-		return ret;
+	if (opp_table->config_clks) {
+		ret = opp_table->config_clks(dev, opp_table, opp, clk_data, scaling_down);
+		if (ret)
+			return ret;
+	}
 
 	/* Scaling down? Configure required OPPs after frequency */
 	if (scaling_down) {
@@ -1133,8 +1132,8 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
 		 * equivalent to a clk_set_rate()
 		 */
 		if (!_get_opp_count(opp_table)) {
-			ret = _generic_set_opp_clk_only(dev, opp_table, NULL,
-							&target_freq);
+			ret = opp_table->config_clks(dev, opp_table, NULL,
+						     &target_freq, false);
 			goto put_opp_table;
 		}
 
@@ -1255,6 +1254,8 @@ static struct opp_table *_allocate_opp_table(struct device *dev, int index)
 	INIT_LIST_HEAD(&opp_table->dev_list);
 	INIT_LIST_HEAD(&opp_table->lazy);
 
+	opp_table->clk = ERR_PTR(-ENODEV);
+
 	/* Mark regulator count uninitialized */
 	opp_table->regulator_count = -1;
 
@@ -1301,20 +1302,38 @@ static struct opp_table *_update_opp_table_clk(struct device *dev,
 	int ret;
 
 	/*
-	 * Return early if we don't need to get clk or we have already tried it
+	 * Return early if we don't need to get clk or we have already done it
 	 * earlier.
 	 */
-	if (!getclk || IS_ERR(opp_table) || opp_table->clk)
+	if (!getclk || IS_ERR(opp_table) || !IS_ERR(opp_table->clk) ||
+	    opp_table->clks)
 		return opp_table;
 
 	/* Find clk for the device */
 	opp_table->clk = clk_get(dev, NULL);
 
 	ret = PTR_ERR_OR_ZERO(opp_table->clk);
-	if (!ret)
+	if (!ret) {
+		opp_table->config_clks = _opp_config_clk_single;
+		opp_table->clk_count = 1;
 		return opp_table;
+	}
 
 	if (ret == -ENOENT) {
+		/*
+		 * There are few platforms which don't want the OPP core to
+		 * manage device's clock settings. In such cases neither the
+		 * platform provides the clks explicitly to us, nor the DT
+		 * contains a valid clk entry. The OPP nodes in DT may still
+		 * contain "opp-hz" property though, which we need to parse and
+		 * allow the platform to find an OPP based on freq later on.
+		 *
+		 * This is a simple solution to take care of such corner cases,
+		 * i.e. make the clk_count 1, which lets us allocate space for
+		 * frequency in opp->rates and also parse the entries in DT.
+		 */
+		opp_table->clk_count = 1;
+
 		dev_dbg(dev, "%s: Couldn't find clock: %d\n", __func__, ret);
 		return opp_table;
 	}
@@ -1417,7 +1436,7 @@ static void _opp_table_kref_release(struct kref *kref)
 
 	_of_clear_opp_table(opp_table);
 
-	/* Release clk */
+	/* Release automatically acquired single clk */
 	if (!IS_ERR(opp_table->clk))
 		clk_put(opp_table->clk);
 
@@ -1505,7 +1524,7 @@ void dev_pm_opp_remove(struct device *dev, unsigned long freq)
 	mutex_lock(&opp_table->lock);
 
 	list_for_each_entry(iter, &opp_table->opp_list, node) {
-		if (iter->rate == freq) {
+		if (iter->rates[0] == freq) {
 			opp = iter;
 			break;
 		}
@@ -1612,24 +1631,28 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_remove_all_dynamic);
 struct dev_pm_opp *_opp_allocate(struct opp_table *opp_table)
 {
 	struct dev_pm_opp *opp;
-	int supply_count, supply_size, icc_size;
+	int supply_count, supply_size, icc_size, clk_size;
 
 	/* Allocate space for at least one supply */
 	supply_count = opp_table->regulator_count > 0 ?
 			opp_table->regulator_count : 1;
 	supply_size = sizeof(*opp->supplies) * supply_count;
+	clk_size = sizeof(*opp->rates) * opp_table->clk_count;
 	icc_size = sizeof(*opp->bandwidth) * opp_table->path_count;
 
 	/* allocate new OPP node and supplies structures */
-	opp = kzalloc(sizeof(*opp) + supply_size + icc_size, GFP_KERNEL);
-
+	opp = kzalloc(sizeof(*opp) + supply_size + clk_size + icc_size, GFP_KERNEL);
 	if (!opp)
 		return NULL;
 
-	/* Put the supplies at the end of the OPP structure as an empty array */
+	/* Put the supplies, bw and clock at the end of the OPP structure */
 	opp->supplies = (struct dev_pm_opp_supply *)(opp + 1);
+
+	opp->rates = (unsigned long *)(opp->supplies + supply_count);
+
 	if (icc_size)
-		opp->bandwidth = (struct dev_pm_opp_icc_bw *)(opp->supplies + supply_count);
+		opp->bandwidth = (struct dev_pm_opp_icc_bw *)(opp->rates + opp_table->clk_count);
+
 	INIT_LIST_HEAD(&opp->node);
 
 	return opp;
@@ -1660,21 +1683,43 @@ static bool _opp_supported_by_regulators(struct dev_pm_opp *opp,
 	return true;
 }
 
+static int _opp_compare_rate(struct opp_table *opp_table,
+			     struct dev_pm_opp *opp1, struct dev_pm_opp *opp2)
+{
+	int i;
+
+	for (i = 0; i < opp_table->clk_count; i++) {
+		if (opp1->rates[i] != opp2->rates[i])
+			return opp1->rates[i] < opp2->rates[i] ? -1 : 1;
+	}
+
+	/* Same rates for both OPPs */
+	return 0;
+}
+
 /*
  * Returns
  * 0: opp1 == opp2
  * 1: opp1 > opp2
  * -1: opp1 < opp2
  */
-int _opp_compare_key(struct dev_pm_opp *opp1, struct dev_pm_opp *opp2)
+int _opp_compare_key(struct opp_table *opp_table, struct dev_pm_opp *opp1,
+		     struct dev_pm_opp *opp2)
 {
-	if (opp1->rate != opp2->rate)
-		return opp1->rate < opp2->rate ? -1 : 1;
+	int ret;
+
+	ret = _opp_compare_rate(opp_table, opp1, opp2);
+	if (ret)
+		return ret;
+
 	if (opp1->bandwidth && opp2->bandwidth &&
 	    opp1->bandwidth[0].peak != opp2->bandwidth[0].peak)
 		return opp1->bandwidth[0].peak < opp2->bandwidth[0].peak ? -1 : 1;
+
 	if (opp1->level != opp2->level)
 		return opp1->level < opp2->level ? -1 : 1;
+
+	/* Duplicate OPPs */
 	return 0;
 }
 
@@ -1694,7 +1739,7 @@ static int _opp_is_duplicate(struct device *dev, struct dev_pm_opp *new_opp,
 	 * loop.
 	 */
 	list_for_each_entry(opp, &opp_table->opp_list, node) {
-		opp_cmp = _opp_compare_key(new_opp, opp);
+		opp_cmp = _opp_compare_key(opp_table, new_opp, opp);
 		if (opp_cmp > 0) {
 			*head = &opp->node;
 			continue;
@@ -1705,8 +1750,8 @@ static int _opp_is_duplicate(struct device *dev, struct dev_pm_opp *new_opp,
 
 		/* Duplicate OPPs */
 		dev_warn(dev, "%s: duplicate OPPs detected. Existing: freq: %lu, volt: %lu, enabled: %d. New: freq: %lu, volt: %lu, enabled: %d\n",
-			 __func__, opp->rate, opp->supplies[0].u_volt,
-			 opp->available, new_opp->rate,
+			 __func__, opp->rates[0], opp->supplies[0].u_volt,
+			 opp->available, new_opp->rates[0],
 			 new_opp->supplies[0].u_volt, new_opp->available);
 
 		/* Should we compare voltages for all regulators here ? */
@@ -1727,7 +1772,7 @@ void _required_opps_available(struct dev_pm_opp *opp, int count)
 
 		opp->available = false;
 		pr_warn("%s: OPP not supported by required OPP %pOF (%lu)\n",
-			 __func__, opp->required_opps[i]->np, opp->rate);
+			 __func__, opp->required_opps[i]->np, opp->rates[0]);
 		return;
 	}
 }
@@ -1768,7 +1813,7 @@ int _opp_add(struct device *dev, struct dev_pm_opp *new_opp,
 	if (!_opp_supported_by_regulators(new_opp, opp_table)) {
 		new_opp->available = false;
 		dev_warn(dev, "%s: OPP not supported by regulators (%lu)\n",
-			 __func__, new_opp->rate);
+			 __func__, new_opp->rates[0]);
 	}
 
 	/* required-opps not fully initialized yet */
@@ -1814,7 +1859,7 @@ int _opp_add_v1(struct opp_table *opp_table, struct device *dev,
 		return -ENOMEM;
 
 	/* populate the opp table */
-	new_opp->rate = freq;
+	new_opp->rates[0] = freq;
 	tol = u_volt * opp_table->voltage_tolerance_v1 / 100;
 	new_opp->supplies[0].u_volt = u_volt;
 	new_opp->supplies[0].u_volt_min = u_volt - tol;
@@ -2017,6 +2062,17 @@ static void _opp_put_regulators(struct opp_table *opp_table)
 	opp_table->regulator_count = -1;
 }
 
+static void _put_clks(struct opp_table *opp_table, int count)
+{
+	int i;
+
+	for (i = count - 1; i >= 0; i--)
+		clk_put(opp_table->clks[i]);
+
+	kfree(opp_table->clks);
+	opp_table->clks = NULL;
+}
+
 /**
  * _opp_set_clknames() - Set clk names for the device
  * @dev: Device for which clk names is being set.
@@ -2031,10 +2087,12 @@ static void _opp_put_regulators(struct opp_table *opp_table)
  * This must be called before any OPPs are initialized for the device.
  */
 static int _opp_set_clknames(struct opp_table *opp_table, struct device *dev,
-			     const char * const names[])
+			     const char * const names[],
+			     config_clks_t config_clks)
 {
 	const char * const *temp = names;
-	int count = 0;
+	int count = 0, ret, i;
+	struct clk *clk;
 
 	/* Count number of clks */
 	while (*temp++)
@@ -2047,28 +2105,60 @@ static int _opp_set_clknames(struct opp_table *opp_table, struct device *dev,
 	if (!count && !names[1])
 		count = 1;
 
-	/* We support only one clock name for now */
-	if (count != 1)
+	/* Fail early for invalid configurations */
+	if (!count || (config_clks && count == 1) || (!config_clks && count > 1))
 		return -EINVAL;
 
 	/* Another CPU that shares the OPP table has set the clkname ? */
-	if (opp_table->clk_configured)
+	if (opp_table->clks)
 		return 0;
 
-	/* clk shouldn't be initialized at this point */
-	if (WARN_ON(opp_table->clk))
-		return -EBUSY;
+	opp_table->clks = kmalloc_array(count, sizeof(*opp_table->clks),
+					GFP_KERNEL);
+	if (!opp_table->clks)
+		return -ENOMEM;
 
-	/* Find clk for the device */
-	opp_table->clk = clk_get(dev, names[0]);
-	if (IS_ERR(opp_table->clk)) {
-		return dev_err_probe(dev, PTR_ERR(opp_table->clk),
-				    "%s: Couldn't find clock\n", __func__);
+	/* Find clks for the device */
+	for (i = 0; i < count; i++) {
+		clk = clk_get(dev, names[i]);
+		if (IS_ERR(clk)) {
+			ret = dev_err_probe(dev, PTR_ERR(clk),
+					    "%s: Couldn't find clock with name: %s\n",
+					    __func__, names[i]);
+			goto free_clks;
+		}
+
+		opp_table->clks[i] = clk;
 	}
 
-	opp_table->clk_configured = true;
+	opp_table->clk_count = count;
+
+	/* Set generic single clk set here */
+	if (count == 1) {
+		opp_table->config_clks = _opp_config_clk_single;
+
+		/*
+		 * We could have just dropped the "clk" field and used "clks"
+		 * everywhere. Instead we kept the "clk" field around for
+		 * following reasons:
+		 *
+		 * - avoiding clks[0] everywhere else.
+		 * - not running single clk helpers for multiple clk usecase by
+		 *   mistake.
+		 *
+		 * Since this is single-clk case, just update the clk pointer
+		 * too.
+		 */
+		opp_table->clk = opp_table->clks[0];
+	} else {
+		opp_table->config_clks = config_clks;
+	}
 
 	return 0;
+
+free_clks:
+	_put_clks(opp_table, i);
+	return ret;
 }
 
 /**
@@ -2077,11 +2167,13 @@ static int _opp_set_clknames(struct opp_table *opp_table, struct device *dev,
  */
 static void _opp_put_clknames(struct opp_table *opp_table)
 {
-	if (opp_table->clk_configured) {
-		clk_put(opp_table->clk);
-		opp_table->clk = ERR_PTR(-EINVAL);
-		opp_table->clk_configured = false;
-	}
+	if (!opp_table->clks)
+		return;
+
+	opp_table->config_clks = NULL;
+	opp_table->clk = ERR_PTR(-ENODEV);
+
+	_put_clks(opp_table, opp_table->clk_count);
 }
 
 /**
@@ -2298,11 +2390,16 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 
 	/* Configure clocks */
 	if (config->clk_names) {
-		ret = _opp_set_clknames(opp_table, dev, config->clk_names);
+		ret = _opp_set_clknames(opp_table, dev, config->clk_names,
+					config->config_clks);
 		if (ret)
 			goto err;
 
 		data->flags |= OPP_CONFIG_CLK;
+	} else if (config->config_clks) {
+		/* Don't allow config callback without clocks */
+		ret = -EINVAL;
+		goto err;
 	}
 
 	/* Configure property names */
@@ -2614,7 +2711,7 @@ static int _opp_set_availability(struct device *dev, unsigned long freq,
 
 	/* Do we have the frequency? */
 	list_for_each_entry(tmp_opp, &opp_table->opp_list, node) {
-		if (tmp_opp->rate == freq) {
+		if (tmp_opp->rates[0] == freq) {
 			opp = tmp_opp;
 			break;
 		}
@@ -2685,7 +2782,7 @@ int dev_pm_opp_adjust_voltage(struct device *dev, unsigned long freq,
 
 	/* Do we have the frequency? */
 	list_for_each_entry(tmp_opp, &opp_table->opp_list, node) {
-		if (tmp_opp->rate == freq) {
+		if (tmp_opp->rates[0] == freq) {
 			opp = tmp_opp;
 			break;
 		}
diff --git a/drivers/opp/debugfs.c b/drivers/opp/debugfs.c
index 1b6e5c55c3ed..96a30a032c5f 100644
--- a/drivers/opp/debugfs.c
+++ b/drivers/opp/debugfs.c
@@ -74,6 +74,24 @@ static void opp_debug_create_bw(struct dev_pm_opp *opp,
 	}
 }
 
+static void opp_debug_create_clks(struct dev_pm_opp *opp,
+				  struct opp_table *opp_table,
+				  struct dentry *pdentry)
+{
+	char name[12];
+	int i;
+
+	if (opp_table->clk_count == 1) {
+		debugfs_create_ulong("rate_hz", S_IRUGO, pdentry, &opp->rates[0]);
+		return;
+	}
+
+	for (i = 0; i < opp_table->clk_count; i++) {
+		snprintf(name, sizeof(name), "rate_hz_%d", i);
+		debugfs_create_ulong(name, S_IRUGO, pdentry, &opp->rates[i]);
+	}
+}
+
 static void opp_debug_create_supplies(struct dev_pm_opp *opp,
 				      struct opp_table *opp_table,
 				      struct dentry *pdentry)
@@ -117,10 +135,11 @@ void opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
 	 * Get directory name for OPP.
 	 *
 	 * - Normally rate is unique to each OPP, use it to get unique opp-name.
-	 * - For some devices rate isn't available, use index instead.
+	 * - For some devices rate isn't available or there are multiple, use
+	 *   index instead for them.
 	 */
-	if (likely(opp->rate))
-		id = opp->rate;
+	if (likely(opp_table->clk_count == 1 && opp->rates[0]))
+		id = opp->rates[0];
 	else
 		id = _get_opp_count(opp_table);
 
@@ -134,7 +153,6 @@ void opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
 	debugfs_create_bool("turbo", S_IRUGO, d, &opp->turbo);
 	debugfs_create_bool("suspend", S_IRUGO, d, &opp->suspend);
 	debugfs_create_u32("performance_state", S_IRUGO, d, &opp->pstate);
-	debugfs_create_ulong("rate_hz", S_IRUGO, d, &opp->rate);
 	debugfs_create_u32("level", S_IRUGO, d, &opp->level);
 	debugfs_create_ulong("clock_latency_ns", S_IRUGO, d,
 			     &opp->clock_latency_ns);
@@ -142,6 +160,7 @@ void opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
 	opp->of_name = of_node_full_name(opp->np);
 	debugfs_create_str("of_name", S_IRUGO, d, (char **)&opp->of_name);
 
+	opp_debug_create_clks(opp, opp_table, d);
 	opp_debug_create_supplies(opp, opp_table, d);
 	opp_debug_create_bw(opp, opp_table, d);
 
diff --git a/drivers/opp/of.c b/drivers/opp/of.c
index bb49057cb1fc..7607e112df3f 100644
--- a/drivers/opp/of.c
+++ b/drivers/opp/of.c
@@ -767,6 +767,50 @@ void dev_pm_opp_of_remove_table(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_of_remove_table);
 
+static int _read_rate(struct dev_pm_opp *new_opp, struct opp_table *opp_table,
+		      struct device_node *np)
+{
+	struct property *prop;
+	int i, count, ret;
+	u64 *rates;
+
+	prop = of_find_property(np, "opp-hz", NULL);
+	if (!prop)
+		return -ENODEV;
+
+	count = prop->length / sizeof(u64);
+	if (opp_table->clk_count != count) {
+		pr_err("%s: Count mismatch between opp-hz and clk_count (%d %d)\n",
+		       __func__, count, opp_table->clk_count);
+		return -EINVAL;
+	}
+
+	rates = kmalloc_array(count, sizeof(*rates), GFP_KERNEL);
+	if (!rates)
+		return -ENOMEM;
+
+	ret = of_property_read_u64_array(np, "opp-hz", rates, count);
+	if (ret) {
+		pr_err("%s: Error parsing opp-hz: %d\n", __func__, ret);
+	} else {
+		/*
+		 * Rate is defined as an unsigned long in clk API, and so
+		 * casting explicitly to its type. Must be fixed once rate is 64
+		 * bit guaranteed in clk API.
+		 */
+		for (i = 0; i < count; i++) {
+			new_opp->rates[i] = (unsigned long)rates[i];
+
+			/* This will happen for frequencies > 4.29 GHz */
+			WARN_ON(new_opp->rates[i] != rates[i]);
+		}
+	}
+
+	kfree(rates);
+
+	return ret;
+}
+
 static int _read_bw(struct dev_pm_opp *new_opp, struct opp_table *opp_table,
 		    struct device_node *np, bool peak)
 {
@@ -812,19 +856,13 @@ static int _read_opp_key(struct dev_pm_opp *new_opp,
 			 struct opp_table *opp_table, struct device_node *np)
 {
 	bool found = false;
-	u64 rate;
 	int ret;
 
-	ret = of_property_read_u64(np, "opp-hz", &rate);
-	if (!ret) {
-		/*
-		 * Rate is defined as an unsigned long in clk API, and so
-		 * casting explicitly to its type. Must be fixed once rate is 64
-		 * bit guaranteed in clk API.
-		 */
-		new_opp->rate = (unsigned long)rate;
+	ret = _read_rate(new_opp, opp_table, np);
+	if (!ret)
 		found = true;
-	}
+	else if (ret != -ENODEV)
+		return ret;
 
 	/*
 	 * Bandwidth consists of peak and average (optional) values:
@@ -893,8 +931,8 @@ static struct dev_pm_opp *_opp_add_static_v2(struct opp_table *opp_table,
 
 	/* Check if the OPP supports hardware's hierarchy of versions or not */
 	if (!_opp_is_supported(dev, opp_table, np)) {
-		dev_dbg(dev, "OPP not supported by hardware: %lu\n",
-			new_opp->rate);
+		dev_dbg(dev, "OPP not supported by hardware: %s\n",
+			of_node_full_name(np));
 		goto free_opp;
 	}
 
@@ -930,7 +968,7 @@ static struct dev_pm_opp *_opp_add_static_v2(struct opp_table *opp_table,
 	if (of_property_read_bool(np, "opp-suspend")) {
 		if (opp_table->suspend_opp) {
 			/* Pick the OPP with higher rate/bw/level as suspend OPP */
-			if (_opp_compare_key(new_opp, opp_table->suspend_opp) == 1) {
+			if (_opp_compare_key(opp_table, new_opp, opp_table->suspend_opp) == 1) {
 				opp_table->suspend_opp->suspend = false;
 				new_opp->suspend = true;
 				opp_table->suspend_opp = new_opp;
@@ -945,7 +983,7 @@ static struct dev_pm_opp *_opp_add_static_v2(struct opp_table *opp_table,
 		opp_table->clock_latency_ns_max = new_opp->clock_latency_ns;
 
 	pr_debug("%s: turbo:%d rate:%lu uv:%lu uvmin:%lu uvmax:%lu latency:%lu level:%u\n",
-		 __func__, new_opp->turbo, new_opp->rate,
+		 __func__, new_opp->turbo, new_opp->rates[0],
 		 new_opp->supplies[0].u_volt, new_opp->supplies[0].u_volt_min,
 		 new_opp->supplies[0].u_volt_max, new_opp->clock_latency_ns,
 		 new_opp->level);
diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
index e481bdb59499..816009eaafee 100644
--- a/drivers/opp/opp.h
+++ b/drivers/opp/opp.h
@@ -79,7 +79,7 @@ struct opp_config_data {
  * @suspend:	true if suspend OPP
  * @removed:	flag indicating that OPP's reference is dropped by OPP core.
  * @pstate: Device's power domain's performance state.
- * @rate:	Frequency in hertz
+ * @rates:	Frequencies in hertz
  * @level:	Performance level
  * @supplies:	Power supplies voltage/current values
  * @bandwidth:	Interconnect bandwidth values
@@ -102,7 +102,7 @@ struct dev_pm_opp {
 	bool suspend;
 	bool removed;
 	unsigned int pstate;
-	unsigned long rate;
+	unsigned long *rates;
 	unsigned int level;
 
 	struct dev_pm_opp_supply *supplies;
@@ -170,8 +170,10 @@ enum opp_table_access {
  * @supported_hw: Array of version number to support.
  * @supported_hw_count: Number of elements in supported_hw array.
  * @prop_name: A name to postfix to many DT properties, while parsing them.
- * @clk_configured: Clock name is configured by the platform.
- * @clk: Device's clock handle
+ * @config_clks: Platform specific config_clks() callback.
+ * @clks: Device's clock handles, for multiple clocks.
+ * @clk: Device's clock handle, for single clock.
+ * @clk_count: Number of clocks.
  * @config_regulators: Platform specific config_regulators() callback.
  * @regulators: Supply regulators
  * @regulator_count: Number of power supply regulators. Its value can be -1
@@ -220,8 +222,10 @@ struct opp_table {
 	unsigned int *supported_hw;
 	unsigned int supported_hw_count;
 	const char *prop_name;
-	bool clk_configured;
+	config_clks_t config_clks;
+	struct clk **clks;
 	struct clk *clk;
+	int clk_count;
 	config_regulators_t config_regulators;
 	struct regulator **regulators;
 	int regulator_count;
@@ -246,7 +250,7 @@ struct opp_table *_find_opp_table(struct device *dev);
 struct opp_device *_add_opp_dev(const struct device *dev, struct opp_table *opp_table);
 struct dev_pm_opp *_opp_allocate(struct opp_table *opp_table);
 void _opp_free(struct dev_pm_opp *opp);
-int _opp_compare_key(struct dev_pm_opp *opp1, struct dev_pm_opp *opp2);
+int _opp_compare_key(struct opp_table *opp_table, struct dev_pm_opp *opp1, struct dev_pm_opp *opp2);
 int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, struct opp_table *opp_table);
 int _opp_add_v1(struct opp_table *opp_table, struct device *dev, unsigned long freq, long u_volt, bool dynamic);
 void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, int last_cpu);
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 50cbc75bef71..104151dfe46c 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -61,9 +61,13 @@ typedef int (*config_regulators_t)(struct device *dev,
 			struct dev_pm_opp *old_opp, struct dev_pm_opp *new_opp,
 			struct regulator **regulators, unsigned int count);
 
+typedef int (*config_clks_t)(struct device *dev, struct opp_table *opp_table,
+			struct dev_pm_opp *opp, void *data, bool scaling_down);
+
 /**
  * struct dev_pm_opp_config - Device OPP configuration values
- * @clk_names: Clk names, NULL terminated array, max 1 clock for now.
+ * @clk_names: Clk names, NULL terminated array.
+ * @config_clks: Custom set clk helper.
  * @prop_name: Name to postfix to properties.
  * @config_regulators: Custom set regulator helper.
  * @supported_hw: Array of hierarchy of versions to match.
@@ -78,6 +82,7 @@ typedef int (*config_regulators_t)(struct device *dev,
 struct dev_pm_opp_config {
 	/* NULL terminated */
 	const char * const *clk_names;
+	config_clks_t config_clks;
 	const char *prop_name;
 	config_regulators_t config_regulators;
 	const unsigned int *supported_hw;
-- 
cgit 


From 8174a3a613af1a911ab19da812824f7180b261f9 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 10 Jun 2022 12:37:25 +0530
Subject: OPP: Provide a simple implementation to configure multiple clocks

This provides a simple implementation to configure multiple clocks for a
device.

Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 34 ++++++++++++++++++++++++++++++++++
 include/linux/pm_opp.h | 10 ++++++++++
 2 files changed, 44 insertions(+)

(limited to 'include')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 4675a00b9816..0c5b12e99d39 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -843,6 +843,40 @@ _opp_config_clk_single(struct device *dev, struct opp_table *opp_table,
 	return ret;
 }
 
+/*
+ * Simple implementation for configuring multiple clocks. Configure clocks in
+ * the order in which they are present in the array while scaling up.
+ */
+int dev_pm_opp_config_clks_simple(struct device *dev,
+		struct opp_table *opp_table, struct dev_pm_opp *opp, void *data,
+		bool scaling_down)
+{
+	int ret, i;
+
+	if (scaling_down) {
+		for (i = opp_table->clk_count - 1; i >= 0; i--) {
+			ret = clk_set_rate(opp_table->clks[i], opp->rates[i]);
+			if (ret) {
+				dev_err(dev, "%s: failed to set clock rate: %d\n", __func__,
+					ret);
+				return ret;
+			}
+		}
+	} else {
+		for (i = 0; i < opp_table->clk_count; i++) {
+			ret = clk_set_rate(opp_table->clks[i], opp->rates[i]);
+			if (ret) {
+				dev_err(dev, "%s: failed to set clock rate: %d\n", __func__,
+					ret);
+				return ret;
+			}
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_config_clks_simple);
+
 static int _opp_config_regulator_single(struct device *dev,
 			struct dev_pm_opp *old_opp, struct dev_pm_opp *new_opp,
 			struct regulator **regulators, unsigned int count)
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 104151dfe46c..683e6baf9618 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -159,6 +159,9 @@ int dev_pm_opp_unregister_notifier(struct device *dev, struct notifier_block *nb
 int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config);
 int devm_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config);
 void dev_pm_opp_clear_config(int token);
+int dev_pm_opp_config_clks_simple(struct device *dev,
+		struct opp_table *opp_table, struct dev_pm_opp *opp, void *data,
+		bool scaling_down);
 
 struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table, struct opp_table *dst_table, struct dev_pm_opp *src_opp);
 int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate);
@@ -342,6 +345,13 @@ static inline int devm_pm_opp_set_config(struct device *dev, struct dev_pm_opp_c
 
 static inline void dev_pm_opp_clear_config(int token) {}
 
+static inline int dev_pm_opp_config_clks_simple(struct device *dev,
+		struct opp_table *opp_table, struct dev_pm_opp *opp, void *data,
+		bool scaling_down)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table,
 				struct opp_table *dst_table, struct dev_pm_opp *src_opp)
 {
-- 
cgit 


From 1e5fb38442ebaabb65057c2e58355ee36c2a178f Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 5 Jul 2022 09:41:56 +0530
Subject: OPP: Remove dev{m}_pm_opp_of_add_table_noclk()

Remove the now unused variants and the now unnecessary "getclk"
parameter from few routines.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/of.c       | 48 ++++++++----------------------------------------
 include/linux/pm_opp.h | 12 ------------
 2 files changed, 8 insertions(+), 52 deletions(-)

(limited to 'include')

diff --git a/drivers/opp/of.c b/drivers/opp/of.c
index 7607e112df3f..8367823a2001 100644
--- a/drivers/opp/of.c
+++ b/drivers/opp/of.c
@@ -1120,7 +1120,7 @@ remove_static_opp:
 	return ret;
 }
 
-static int _of_add_table_indexed(struct device *dev, int index, bool getclk)
+static int _of_add_table_indexed(struct device *dev, int index)
 {
 	struct opp_table *opp_table;
 	int ret, count;
@@ -1136,7 +1136,7 @@ static int _of_add_table_indexed(struct device *dev, int index, bool getclk)
 			index = 0;
 	}
 
-	opp_table = _add_opp_table_indexed(dev, index, getclk);
+	opp_table = _add_opp_table_indexed(dev, index, true);
 	if (IS_ERR(opp_table))
 		return PTR_ERR(opp_table);
 
@@ -1160,11 +1160,11 @@ static void devm_pm_opp_of_table_release(void *data)
 	dev_pm_opp_of_remove_table(data);
 }
 
-static int _devm_of_add_table_indexed(struct device *dev, int index, bool getclk)
+static int _devm_of_add_table_indexed(struct device *dev, int index)
 {
 	int ret;
 
-	ret = _of_add_table_indexed(dev, index, getclk);
+	ret = _of_add_table_indexed(dev, index);
 	if (ret)
 		return ret;
 
@@ -1192,7 +1192,7 @@ static int _devm_of_add_table_indexed(struct device *dev, int index, bool getclk
  */
 int devm_pm_opp_of_add_table(struct device *dev)
 {
-	return _devm_of_add_table_indexed(dev, 0, true);
+	return _devm_of_add_table_indexed(dev, 0);
 }
 EXPORT_SYMBOL_GPL(devm_pm_opp_of_add_table);
 
@@ -1215,7 +1215,7 @@ EXPORT_SYMBOL_GPL(devm_pm_opp_of_add_table);
  */
 int dev_pm_opp_of_add_table(struct device *dev)
 {
-	return _of_add_table_indexed(dev, 0, true);
+	return _of_add_table_indexed(dev, 0);
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table);
 
@@ -1231,7 +1231,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table);
  */
 int dev_pm_opp_of_add_table_indexed(struct device *dev, int index)
 {
-	return _of_add_table_indexed(dev, index, true);
+	return _of_add_table_indexed(dev, index);
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table_indexed);
 
@@ -1244,42 +1244,10 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table_indexed);
  */
 int devm_pm_opp_of_add_table_indexed(struct device *dev, int index)
 {
-	return _devm_of_add_table_indexed(dev, index, true);
+	return _devm_of_add_table_indexed(dev, index);
 }
 EXPORT_SYMBOL_GPL(devm_pm_opp_of_add_table_indexed);
 
-/**
- * dev_pm_opp_of_add_table_noclk() - Initialize indexed opp table from device
- *		tree without getting clk for device.
- * @dev:	device pointer used to lookup OPP table.
- * @index:	Index number.
- *
- * Register the initial OPP table with the OPP library for given device only
- * using the "operating-points-v2" property. Do not try to get the clk for the
- * device.
- *
- * Return: Refer to dev_pm_opp_of_add_table() for return values.
- */
-int dev_pm_opp_of_add_table_noclk(struct device *dev, int index)
-{
-	return _of_add_table_indexed(dev, index, false);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table_noclk);
-
-/**
- * devm_pm_opp_of_add_table_noclk() - Initialize indexed opp table from device
- *		tree without getting clk for device.
- * @dev:	device pointer used to lookup OPP table.
- * @index:	Index number.
- *
- * This is a resource-managed variant of dev_pm_opp_of_add_table_noclk().
- */
-int devm_pm_opp_of_add_table_noclk(struct device *dev, int index)
-{
-	return _devm_of_add_table_indexed(dev, index, false);
-}
-EXPORT_SYMBOL_GPL(devm_pm_opp_of_add_table_noclk);
-
 /* CPU device specific helpers */
 
 /**
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 683e6baf9618..dc1fb5890792 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -402,8 +402,6 @@ static inline int dev_pm_opp_sync_regulators(struct device *dev)
 int dev_pm_opp_of_add_table(struct device *dev);
 int dev_pm_opp_of_add_table_indexed(struct device *dev, int index);
 int devm_pm_opp_of_add_table_indexed(struct device *dev, int index);
-int dev_pm_opp_of_add_table_noclk(struct device *dev, int index);
-int devm_pm_opp_of_add_table_noclk(struct device *dev, int index);
 void dev_pm_opp_of_remove_table(struct device *dev);
 int devm_pm_opp_of_add_table(struct device *dev);
 int dev_pm_opp_of_cpumask_add_table(const struct cpumask *cpumask);
@@ -434,16 +432,6 @@ static inline int devm_pm_opp_of_add_table_indexed(struct device *dev, int index
 	return -EOPNOTSUPP;
 }
 
-static inline int dev_pm_opp_of_add_table_noclk(struct device *dev, int index)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int devm_pm_opp_of_add_table_noclk(struct device *dev, int index)
-{
-	return -EOPNOTSUPP;
-}
-
 static inline void dev_pm_opp_of_remove_table(struct device *dev)
 {
 }
-- 
cgit 


From 0bb7e14c8e15ad78b7300e7d89a615ea8b8c89a9 Mon Sep 17 00:00:00 2001
From: Li kunyu <kunyu@nfschina.com>
Date: Wed, 29 Jun 2022 11:00:13 +0800
Subject: blk-iocost: tracing: atomic64_read(&ioc->vtime_rate) is assigned an
 extra semicolon

Remove extra semicolon.

Link: https://lkml.kernel.org/r/20220629030013.10362-1-kunyu@nfschina.com

Cc: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Li kunyu <kunyu@nfschina.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/trace/events/iocost.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h
index e282ce02fa2d..6d1626e7a4ce 100644
--- a/include/trace/events/iocost.h
+++ b/include/trace/events/iocost.h
@@ -160,7 +160,7 @@ TRACE_EVENT(iocost_ioc_vrate_adj,
 
 	TP_fast_assign(
 		__assign_str(devname, ioc_name(ioc));
-		__entry->old_vrate = atomic64_read(&ioc->vtime_rate);;
+		__entry->old_vrate = atomic64_read(&ioc->vtime_rate);
 		__entry->new_vrate = new_vrate;
 		__entry->busy_level = ioc->busy_level;
 		__entry->read_missed_ppm = missed_ppm[READ];
-- 
cgit 


From 79f772b9e8004c542cc88aaf680189c3f6e9a0f2 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 14 Jun 2022 22:56:15 +0000
Subject: KVM: x86: Query vcpu->vcpu_idx directly and drop its accessor, again

Read vcpu->vcpu_idx directly instead of bouncing through the one-line
wrapper, kvm_vcpu_get_idx(), and drop the wrapper.  The wrapper is a
remnant of the original implementation and serves no purpose; remove it
(again) before it gains more users.

kvm_vcpu_get_idx() was removed in the not-too-distant past by commit
4eeef2424153 ("KVM: x86: Query vcpu->vcpu_idx directly and drop its
accessor"), but was unintentionally re-introduced by commit a54d806688fe
("KVM: Keep memslots in tree-based structures instead of array-based ones"),
likely due to a rebase goof.  The wrapper then managed to gain users in
KVM's Xen code.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Jim Mattson <jmattson@google.com>
Link: https://lore.kernel.org/r/20220614225615.3843835-1-seanjc@google.com
---
 arch/x86/kvm/xen.c       | 10 +++++-----
 include/linux/kvm_host.h |  5 -----
 2 files changed, 5 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 610beba35907..a0c05ccbf4b1 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -1049,7 +1049,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
 	else
 		vcpu->arch.xen.poll_evtchn = -1;
 
-	set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask);
+	set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);
 
 	if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) {
 		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
@@ -1071,7 +1071,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
 	*r = 0;
 out:
 	/* Really, this is only needed in case of timeout */
-	clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask);
+	clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);
 
 	if (unlikely(sched_poll.nr_ports > 1))
 		kfree(ports);
@@ -1311,7 +1311,7 @@ static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
 	int poll_evtchn = vcpu->arch.xen.poll_evtchn;
 
 	if ((poll_evtchn == port || poll_evtchn == -1) &&
-	    test_and_clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask)) {
+	    test_and_clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask)) {
 		kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
 		kvm_vcpu_kick(vcpu);
 	}
@@ -1344,7 +1344,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
 		vcpu = kvm_get_vcpu_by_id(kvm, xe->vcpu_id);
 		if (!vcpu)
 			return -EINVAL;
-		WRITE_ONCE(xe->vcpu_idx, kvm_vcpu_get_idx(vcpu));
+		WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx);
 	}
 
 	if (!vcpu->arch.xen.vcpu_info_cache.active)
@@ -1540,7 +1540,7 @@ int kvm_xen_setup_evtchn(struct kvm *kvm,
 	 */
 	vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu);
 	if (vcpu)
-		e->xen_evtchn.vcpu_idx = kvm_vcpu_get_idx(vcpu);
+		e->xen_evtchn.vcpu_idx = vcpu->vcpu_idx;
 	else
 		e->xen_evtchn.vcpu_idx = -1;
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3b40f8d68fbb..cb8168cc9755 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -907,11 +907,6 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id)
 	return NULL;
 }
 
-static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu)
-{
-	return vcpu->vcpu_idx;
-}
-
 void kvm_destroy_vcpus(struct kvm *kvm);
 
 void vcpu_load(struct kvm_vcpu *vcpu);
-- 
cgit 


From 4201d9ab3e42d9e2a20320b751a931e6239c0df2 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Mon, 11 Jul 2022 09:28:27 -0700
Subject: bpf: reparent bpf maps on memcg offlining

The memory consumed by a bpf map is always accounted to the memory
cgroup of the process which created the map. The map can outlive
the memory cgroup if it's used by processes in other cgroups or
is pinned on bpffs. In this case the map pins the original cgroup
in the dying state.

For other types of objects (slab objects, non-slab kernel allocations,
percpu objects and recently LRU pages) there is a reparenting process
implemented: on cgroup offlining charged objects are getting
reassigned to the parent cgroup. Because all charges and statistics
are fully recursive it's a fairly cheap operation.

For efficiency and consistency with other types of objects, let's do
the same for bpf maps. Fortunately thanks to the objcg API, the
required changes are minimal.

Please, note that individual allocations (slabs, percpu and large
kmallocs) already have the reparenting mechanism. This commit adds
it to the saved map->memcg pointer by replacing it to map->objcg.
Because dying cgroups are not visible for a user and all charges are
recursive, this commit doesn't bring any behavior changes for a user.

v2:
  added a missing const qualifier

Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Link: https://lore.kernel.org/r/20220711162827.184743-1-roman.gushchin@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  |  2 +-
 kernel/bpf/syscall.c | 36 ++++++++++++++++++++++++++++--------
 2 files changed, 29 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2b21f2a3452f..85a4db3e0536 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -221,7 +221,7 @@ struct bpf_map {
 	u32 btf_vmlinux_value_type_id;
 	struct btf *btf;
 #ifdef CONFIG_MEMCG_KMEM
-	struct mem_cgroup *memcg;
+	struct obj_cgroup *objcg;
 #endif
 	char name[BPF_OBJ_NAME_LEN];
 	struct bpf_map_off_arr *off_arr;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ab688d85b2c6..83c7136c5788 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -419,35 +419,53 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
 #ifdef CONFIG_MEMCG_KMEM
 static void bpf_map_save_memcg(struct bpf_map *map)
 {
-	map->memcg = get_mem_cgroup_from_mm(current->mm);
+	/* Currently if a map is created by a process belonging to the root
+	 * memory cgroup, get_obj_cgroup_from_current() will return NULL.
+	 * So we have to check map->objcg for being NULL each time it's
+	 * being used.
+	 */
+	map->objcg = get_obj_cgroup_from_current();
 }
 
 static void bpf_map_release_memcg(struct bpf_map *map)
 {
-	mem_cgroup_put(map->memcg);
+	if (map->objcg)
+		obj_cgroup_put(map->objcg);
+}
+
+static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
+{
+	if (map->objcg)
+		return get_mem_cgroup_from_objcg(map->objcg);
+
+	return root_mem_cgroup;
 }
 
 void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
 			   int node)
 {
-	struct mem_cgroup *old_memcg;
+	struct mem_cgroup *memcg, *old_memcg;
 	void *ptr;
 
-	old_memcg = set_active_memcg(map->memcg);
+	memcg = bpf_map_get_memcg(map);
+	old_memcg = set_active_memcg(memcg);
 	ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
 	set_active_memcg(old_memcg);
+	mem_cgroup_put(memcg);
 
 	return ptr;
 }
 
 void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
 {
-	struct mem_cgroup *old_memcg;
+	struct mem_cgroup *memcg, *old_memcg;
 	void *ptr;
 
-	old_memcg = set_active_memcg(map->memcg);
+	memcg = bpf_map_get_memcg(map);
+	old_memcg = set_active_memcg(memcg);
 	ptr = kzalloc(size, flags | __GFP_ACCOUNT);
 	set_active_memcg(old_memcg);
+	mem_cgroup_put(memcg);
 
 	return ptr;
 }
@@ -455,12 +473,14 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
 void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
 				    size_t align, gfp_t flags)
 {
-	struct mem_cgroup *old_memcg;
+	struct mem_cgroup *memcg, *old_memcg;
 	void __percpu *ptr;
 
-	old_memcg = set_active_memcg(map->memcg);
+	memcg = bpf_map_get_memcg(map);
+	old_memcg = set_active_memcg(memcg);
 	ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
 	set_active_memcg(old_memcg);
+	mem_cgroup_put(memcg);
 
 	return ptr;
 }
-- 
cgit 


From 1d5f82d9dd477d5c66e0214a68c3e4f308eadd6d Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Tue, 5 Jul 2022 17:26:12 -0700
Subject: bpf, x86: fix freeing of not-finalized bpf_prog_pack

syzbot reported a few issues with bpf_prog_pack [1], [2]. This only happens
with multiple subprogs. In jit_subprogs(), we first call bpf_int_jit_compile()
on each sub program. And then, we call it on each sub program again. jit_data
is not freed in the first call of bpf_int_jit_compile(). Similarly we don't
call bpf_jit_binary_pack_finalize() in the first call of bpf_int_jit_compile().

If bpf_int_jit_compile() failed for one sub program, we will call
bpf_jit_binary_pack_finalize() for this sub program. However, we don't have a
chance to call it for other sub programs. Then we will hit "goto out_free" in
jit_subprogs(), and call bpf_jit_free on some subprograms that haven't got
bpf_jit_binary_pack_finalize() yet.

At this point, bpf_jit_binary_pack_free() is called and the whole 2MB page is
freed erroneously.

Fix this with a custom bpf_jit_free() for x86_64, which calls
bpf_jit_binary_pack_finalize() if necessary. Also, with custom
bpf_jit_free(), bpf_prog_aux->use_bpf_prog_pack is not needed any more,
remove it.

Fixes: 1022a5498f6f ("bpf, x86_64: Use bpf_jit_binary_pack_alloc")
[1] https://syzkaller.appspot.com/bug?extid=2f649ec6d2eea1495a8f
[2] https://syzkaller.appspot.com/bug?extid=87f65c75f4a72db05445
Reported-by: syzbot+2f649ec6d2eea1495a8f@syzkaller.appspotmail.com
Reported-by: syzbot+87f65c75f4a72db05445@syzkaller.appspotmail.com
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20220706002612.4013790-1-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c | 25 +++++++++++++++++++++++++
 include/linux/bpf.h         |  1 -
 include/linux/filter.h      |  8 ++++++++
 kernel/bpf/core.c           | 29 ++++++++++++-----------------
 4 files changed, 45 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index d2614f1bf838..54c7f46c453f 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -2486,3 +2486,28 @@ bool bpf_jit_supports_subprog_tailcalls(void)
 {
 	return true;
 }
+
+void bpf_jit_free(struct bpf_prog *prog)
+{
+	if (prog->jited) {
+		struct x64_jit_data *jit_data = prog->aux->jit_data;
+		struct bpf_binary_header *hdr;
+
+		/*
+		 * If we fail the final pass of JIT (from jit_subprogs),
+		 * the program may not be finalized yet. Call finalize here
+		 * before freeing it.
+		 */
+		if (jit_data) {
+			bpf_jit_binary_pack_finalize(prog, jit_data->header,
+						     jit_data->rw_header);
+			kvfree(jit_data->addrs);
+			kfree(jit_data);
+		}
+		hdr = bpf_jit_binary_pack_hdr(prog);
+		bpf_jit_binary_pack_free(hdr, NULL);
+		WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
+	}
+
+	bpf_prog_unlock_free(prog);
+}
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 85a4db3e0536..a5bf00649995 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1044,7 +1044,6 @@ struct bpf_prog_aux {
 	bool sleepable;
 	bool tail_call_reachable;
 	bool xdp_has_frags;
-	bool use_bpf_prog_pack;
 	/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
 	const struct btf_type *attach_func_proto;
 	/* function name for valid attach_btf_id */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4c1a8b247545..a5f21dc3c432 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1027,6 +1027,14 @@ u64 bpf_jit_alloc_exec_limit(void);
 void *bpf_jit_alloc_exec(unsigned long size);
 void bpf_jit_free_exec(void *addr);
 void bpf_jit_free(struct bpf_prog *fp);
+struct bpf_binary_header *
+bpf_jit_binary_pack_hdr(const struct bpf_prog *fp);
+
+static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
+{
+	return list_empty(&fp->aux->ksym.lnode) ||
+	       fp->aux->ksym.lnode.prev == LIST_POISON2;
+}
 
 struct bpf_binary_header *
 bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **ro_image,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 805c2ad5c793..cfb8a50a9f12 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -650,12 +650,6 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
 	return fp->jited && !bpf_prog_was_classic(fp);
 }
 
-static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
-{
-	return list_empty(&fp->aux->ksym.lnode) ||
-	       fp->aux->ksym.lnode.prev == LIST_POISON2;
-}
-
 void bpf_prog_kallsyms_add(struct bpf_prog *fp)
 {
 	if (!bpf_prog_kallsyms_candidate(fp) ||
@@ -1153,7 +1147,6 @@ int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
 		bpf_prog_pack_free(ro_header);
 		return PTR_ERR(ptr);
 	}
-	prog->aux->use_bpf_prog_pack = true;
 	return 0;
 }
 
@@ -1177,17 +1170,23 @@ void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
 	bpf_jit_uncharge_modmem(size);
 }
 
+struct bpf_binary_header *
+bpf_jit_binary_pack_hdr(const struct bpf_prog *fp)
+{
+	unsigned long real_start = (unsigned long)fp->bpf_func;
+	unsigned long addr;
+
+	addr = real_start & BPF_PROG_CHUNK_MASK;
+	return (void *)addr;
+}
+
 static inline struct bpf_binary_header *
 bpf_jit_binary_hdr(const struct bpf_prog *fp)
 {
 	unsigned long real_start = (unsigned long)fp->bpf_func;
 	unsigned long addr;
 
-	if (fp->aux->use_bpf_prog_pack)
-		addr = real_start & BPF_PROG_CHUNK_MASK;
-	else
-		addr = real_start & PAGE_MASK;
-
+	addr = real_start & PAGE_MASK;
 	return (void *)addr;
 }
 
@@ -1200,11 +1199,7 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
 	if (fp->jited) {
 		struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
 
-		if (fp->aux->use_bpf_prog_pack)
-			bpf_jit_binary_pack_free(hdr, NULL /* rw_buffer */);
-		else
-			bpf_jit_binary_free(hdr);
-
+		bpf_jit_binary_free(hdr);
 		WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
 	}
 
-- 
cgit 


From 401e4963bf45c800e3e9ea0d3a0289d738005fd4 Mon Sep 17 00:00:00 2001
From: John Keeping <john@metanate.com>
Date: Fri, 8 Jul 2022 17:27:02 +0100
Subject: sched/core: Always flush pending blk_plug

With CONFIG_PREEMPT_RT, it is possible to hit a deadlock between two
normal priority tasks (SCHED_OTHER, nice level zero):

	INFO: task kworker/u8:0:8 blocked for more than 491 seconds.
	      Not tainted 5.15.49-rt46 #1
	"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
	task:kworker/u8:0    state:D stack:    0 pid:    8 ppid:     2 flags:0x00000000
	Workqueue: writeback wb_workfn (flush-7:0)
	[<c08a3a10>] (__schedule) from [<c08a3d84>] (schedule+0xdc/0x134)
	[<c08a3d84>] (schedule) from [<c08a65a0>] (rt_mutex_slowlock_block.constprop.0+0xb8/0x174)
	[<c08a65a0>] (rt_mutex_slowlock_block.constprop.0) from [<c08a6708>]
	+(rt_mutex_slowlock.constprop.0+0xac/0x174)
	[<c08a6708>] (rt_mutex_slowlock.constprop.0) from [<c0374d60>] (fat_write_inode+0x34/0x54)
	[<c0374d60>] (fat_write_inode) from [<c0297304>] (__writeback_single_inode+0x354/0x3ec)
	[<c0297304>] (__writeback_single_inode) from [<c0297998>] (writeback_sb_inodes+0x250/0x45c)
	[<c0297998>] (writeback_sb_inodes) from [<c0297c20>] (__writeback_inodes_wb+0x7c/0xb8)
	[<c0297c20>] (__writeback_inodes_wb) from [<c0297f24>] (wb_writeback+0x2c8/0x2e4)
	[<c0297f24>] (wb_writeback) from [<c0298c40>] (wb_workfn+0x1a4/0x3e4)
	[<c0298c40>] (wb_workfn) from [<c0138ab8>] (process_one_work+0x1fc/0x32c)
	[<c0138ab8>] (process_one_work) from [<c0139120>] (worker_thread+0x22c/0x2d8)
	[<c0139120>] (worker_thread) from [<c013e6e0>] (kthread+0x16c/0x178)
	[<c013e6e0>] (kthread) from [<c01000fc>] (ret_from_fork+0x14/0x38)
	Exception stack(0xc10e3fb0 to 0xc10e3ff8)
	3fa0:                                     00000000 00000000 00000000 00000000
	3fc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
	3fe0: 00000000 00000000 00000000 00000000 00000013 00000000

	INFO: task tar:2083 blocked for more than 491 seconds.
	      Not tainted 5.15.49-rt46 #1
	"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
	task:tar             state:D stack:    0 pid: 2083 ppid:  2082 flags:0x00000000
	[<c08a3a10>] (__schedule) from [<c08a3d84>] (schedule+0xdc/0x134)
	[<c08a3d84>] (schedule) from [<c08a41b0>] (io_schedule+0x14/0x24)
	[<c08a41b0>] (io_schedule) from [<c08a455c>] (bit_wait_io+0xc/0x30)
	[<c08a455c>] (bit_wait_io) from [<c08a441c>] (__wait_on_bit_lock+0x54/0xa8)
	[<c08a441c>] (__wait_on_bit_lock) from [<c08a44f4>] (out_of_line_wait_on_bit_lock+0x84/0xb0)
	[<c08a44f4>] (out_of_line_wait_on_bit_lock) from [<c0371fb0>] (fat_mirror_bhs+0xa0/0x144)
	[<c0371fb0>] (fat_mirror_bhs) from [<c0372a68>] (fat_alloc_clusters+0x138/0x2a4)
	[<c0372a68>] (fat_alloc_clusters) from [<c0370b14>] (fat_alloc_new_dir+0x34/0x250)
	[<c0370b14>] (fat_alloc_new_dir) from [<c03787c0>] (vfat_mkdir+0x58/0x148)
	[<c03787c0>] (vfat_mkdir) from [<c0277b60>] (vfs_mkdir+0x68/0x98)
	[<c0277b60>] (vfs_mkdir) from [<c027b484>] (do_mkdirat+0xb0/0xec)
	[<c027b484>] (do_mkdirat) from [<c0100060>] (ret_fast_syscall+0x0/0x1c)
	Exception stack(0xc2e1bfa8 to 0xc2e1bff0)
	bfa0:                   01ee42f0 01ee4208 01ee42f0 000041ed 00000000 00004000
	bfc0: 01ee42f0 01ee4208 00000000 00000027 01ee4302 00000004 000dcb00 01ee4190
	bfe0: 000dc368 bed11924 0006d4b0 b6ebddfc

Here the kworker is waiting on msdos_sb_info::s_lock which is held by
tar which is in turn waiting for a buffer which is locked waiting to be
flushed, but this operation is plugged in the kworker.

The lock is a normal struct mutex, so tsk_is_pi_blocked() will always
return false on !RT and thus the behaviour changes for RT.

It seems that the intent here is to skip blk_flush_plug() in the case
where a non-preemptible lock (such as a spinlock) has been converted to
a rtmutex on RT, which is the case covered by the SM_RTLOCK_WAIT
schedule flag.  But sched_submit_work() is only called from schedule()
which is never called in this scenario, so the check can simply be
deleted.

Looking at the history of the -rt patchset, in fact this change was
present from v5.9.1-rt20 until being dropped in v5.13-rt1 as it was part
of a larger patch [1] most of which was replaced by commit b4bfa3fcfe3b
("sched/core: Rework the __schedule() preempt argument").

As described in [1]:

   The schedule process must distinguish between blocking on a regular
   sleeping lock (rwsem and mutex) and a RT-only sleeping lock (spinlock
   and rwlock):
   - rwsem and mutex must flush block requests (blk_schedule_flush_plug())
     even if blocked on a lock. This can not deadlock because this also
     happens for non-RT.
     There should be a warning if the scheduling point is within a RCU read
     section.

   - spinlock and rwlock must not flush block requests. This will deadlock
     if the callback attempts to acquire a lock which is already acquired.
     Similarly to being preempted, there should be no warning if the
     scheduling point is within a RCU read section.

and with the tsk_is_pi_blocked() in the scheduler path, we hit the first
issue.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git/tree/patches/0022-locking-rtmutex-Use-custom-scheduling-function-for-s.patch?h=linux-5.10.y-rt-patches

Signed-off-by: John Keeping <john@metanate.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lkml.kernel.org/r/20220708162702.1758865-1-john@metanate.com
---
 include/linux/sched/rt.h | 8 --------
 kernel/sched/core.c      | 8 ++++++--
 2 files changed, 6 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index e5af028c08b4..994c25640e15 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -39,20 +39,12 @@ static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p)
 }
 extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
 extern void rt_mutex_adjust_pi(struct task_struct *p);
-static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
-{
-	return tsk->pi_blocked_on != NULL;
-}
 #else
 static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
 {
 	return NULL;
 }
 # define rt_mutex_adjust_pi(p)		do { } while (0)
-static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
-{
-	return false;
-}
 #endif
 
 extern void normalize_rt_tasks(void);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c703d177f62d..a463dbc92fcd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6470,8 +6470,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
 			io_wq_worker_sleeping(tsk);
 	}
 
-	if (tsk_is_pi_blocked(tsk))
-		return;
+	/*
+	 * spinlock and rwlock must not flush block requests.  This will
+	 * deadlock if the callback attempts to acquire a lock which is
+	 * already acquired.
+	 */
+	SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT);
 
 	/*
 	 * If we are going to sleep and we have plugged IO queued,
-- 
cgit 


From 3beb0ab5bffba625007ea5c9e5e0ee5eef05c1ea Mon Sep 17 00:00:00 2001
From: Seunghui Lee <sh043.lee@samsung.com>
Date: Wed, 13 Jul 2022 12:36:34 +0900
Subject: mmc: core: Use mmc_card_* macro and add a new for the sd_combo type

Add mmc_card_sd_combo() macro for sd combo type card and use the mmc_card_*
macro to simplify code instead of comparing card->type.

Signed-off-by: Seunghui Lee <sh043.lee@samsung.com>
Link: https://lore.kernel.org/r/20220713033635.28432-2-sh043.lee@samsung.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/block.c |  4 ++--
 drivers/mmc/core/bus.c   |  4 ++--
 drivers/mmc/core/sd.c    |  2 +-
 drivers/mmc/core/sdio.c  | 16 ++++++++--------
 include/linux/mmc/card.h |  1 +
 5 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index f4a1281658db..9c642b3b7c20 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -2988,7 +2988,7 @@ static int mmc_blk_probe(struct mmc_card *card)
 	 * Don't enable runtime PM for SD-combo cards here. Leave that
 	 * decision to be taken during the SDIO init sequence instead.
 	 */
-	if (card->type != MMC_TYPE_SD_COMBO) {
+	if (!mmc_card_sd_combo(card)) {
 		pm_runtime_set_active(&card->dev);
 		pm_runtime_enable(&card->dev);
 	}
@@ -3015,7 +3015,7 @@ static void mmc_blk_remove(struct mmc_card *card)
 		mmc_blk_part_switch(card, md->part_type);
 		mmc_release_host(card->host);
 	}
-	if (card->type != MMC_TYPE_SD_COMBO)
+	if (!mmc_card_sd_combo(card))
 		pm_runtime_disable(&card->dev);
 	pm_runtime_put_noidle(&card->dev);
 	mmc_blk_remove_req(md);
diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c
index 58a60afa650b..d8762fa3d5cd 100644
--- a/drivers/mmc/core/bus.c
+++ b/drivers/mmc/core/bus.c
@@ -85,7 +85,7 @@ mmc_bus_uevent(struct device *dev, struct kobj_uevent_env *env)
 			return retval;
 	}
 
-	if (card->type == MMC_TYPE_SDIO || card->type == MMC_TYPE_SD_COMBO) {
+	if (mmc_card_sdio(card) || mmc_card_sd_combo(card)) {
 		retval = add_uevent_var(env, "SDIO_ID=%04X:%04X",
 					card->cis.vendor, card->cis.device);
 		if (retval)
@@ -107,7 +107,7 @@ mmc_bus_uevent(struct device *dev, struct kobj_uevent_env *env)
 	 * SDIO (non-combo) cards are not handled by mmc_block driver and do not
 	 * have accessible CID register which used by mmc_card_name() function.
 	 */
-	if (card->type == MMC_TYPE_SDIO)
+	if (mmc_card_sdio(card))
 		return 0;
 
 	retval = add_uevent_var(env, "MMC_NAME=%s", mmc_card_name(card));
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index d2023837dd72..cee4c0b59f43 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -793,7 +793,7 @@ static umode_t sd_std_is_visible(struct kobject *kobj, struct attribute *attr,
 	     attr == &dev_attr_info2.attr ||
 	     attr == &dev_attr_info3.attr ||
 	     attr == &dev_attr_info4.attr
-	    ) && card->type != MMC_TYPE_SD_COMBO)
+	    ) &&!mmc_card_sd_combo(card))
 		return 0;
 
 	return attr->mode;
diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c
index 25799accf8a0..b589df1c35e0 100644
--- a/drivers/mmc/core/sdio.c
+++ b/drivers/mmc/core/sdio.c
@@ -335,7 +335,7 @@ static int sdio_disable_4bit_bus(struct mmc_card *card)
 {
 	int err;
 
-	if (card->type == MMC_TYPE_SDIO)
+	if (mmc_card_sdio(card))
 		goto out;
 
 	if (!(card->host->caps & MMC_CAP_4_BIT_DATA))
@@ -360,7 +360,7 @@ static int sdio_enable_4bit_bus(struct mmc_card *card)
 	err = sdio_enable_wide(card);
 	if (err <= 0)
 		return err;
-	if (card->type == MMC_TYPE_SDIO)
+	if (mmc_card_sdio(card))
 		goto out;
 
 	if (card->scr.bus_widths & SD_SCR_BUS_WIDTH_4) {
@@ -415,7 +415,7 @@ static int sdio_enable_hs(struct mmc_card *card)
 	int ret;
 
 	ret = mmc_sdio_switch_hs(card, true);
-	if (ret <= 0 || card->type == MMC_TYPE_SDIO)
+	if (ret <= 0 || mmc_card_sdio(card))
 		return ret;
 
 	ret = mmc_sd_switch_hs(card);
@@ -441,7 +441,7 @@ static unsigned mmc_sdio_get_max_clock(struct mmc_card *card)
 		max_dtr = card->cis.max_dtr;
 	}
 
-	if (card->type == MMC_TYPE_SD_COMBO)
+	if (mmc_card_sd_combo(card))
 		max_dtr = min(max_dtr, mmc_sd_get_max_clock(card));
 
 	return max_dtr;
@@ -689,7 +689,7 @@ try_again:
 	    mmc_sd_get_cid(host, ocr & rocr, card->raw_cid, NULL) == 0) {
 		card->type = MMC_TYPE_SD_COMBO;
 
-		if (oldcard && (oldcard->type != MMC_TYPE_SD_COMBO ||
+		if (oldcard && (!mmc_card_sd_combo(oldcard) ||
 		    memcmp(card->raw_cid, oldcard->raw_cid, sizeof(card->raw_cid)) != 0)) {
 			err = -ENOENT;
 			goto mismatch;
@@ -697,7 +697,7 @@ try_again:
 	} else {
 		card->type = MMC_TYPE_SDIO;
 
-		if (oldcard && oldcard->type != MMC_TYPE_SDIO) {
+		if (oldcard && !mmc_card_sdio(oldcard)) {
 			err = -ENOENT;
 			goto mismatch;
 		}
@@ -754,7 +754,7 @@ try_again:
 	/*
 	 * Read CSD, before selecting the card
 	 */
-	if (!oldcard && card->type == MMC_TYPE_SD_COMBO) {
+	if (!oldcard && mmc_card_sd_combo(card)) {
 		err = mmc_sd_get_csd(card);
 		if (err)
 			goto remove;
@@ -827,7 +827,7 @@ try_again:
 
 	mmc_fixup_device(card, sdio_fixup_methods);
 
-	if (card->type == MMC_TYPE_SD_COMBO) {
+	if (mmc_card_sd_combo(card)) {
 		err = mmc_sd_setup_card(host, card, oldcard != NULL);
 		/* handle as SDIO-only card if memory init failed */
 		if (err) {
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 37f975875102..156a7b673a28 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -348,5 +348,6 @@ bool mmc_card_is_blockaddr(struct mmc_card *card);
 #define mmc_card_mmc(c)		((c)->type == MMC_TYPE_MMC)
 #define mmc_card_sd(c)		((c)->type == MMC_TYPE_SD)
 #define mmc_card_sdio(c)	((c)->type == MMC_TYPE_SDIO)
+#define mmc_card_sd_combo(c)	((c)->type == MMC_TYPE_SD_COMBO)
 
 #endif /* LINUX_MMC_CARD_H */
-- 
cgit 


From 83d85bb069152b790caad905fa53e6d50cd3734d Mon Sep 17 00:00:00 2001
From: Maksym Glubokiy <maksym.glubokiy@plvision.eu>
Date: Mon, 11 Jul 2022 18:09:07 +0300
Subject: net: extract port range fields from fl_flow_key

So it can be used for port range filter offloading.

Co-developed-by: Volodymyr Mytnyk <volodymyr.mytnyk@plvision.eu>
Signed-off-by: Volodymyr Mytnyk <volodymyr.mytnyk@plvision.eu>
Signed-off-by: Maksym Glubokiy <maksym.glubokiy@plvision.eu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 16 ++++++++++++++++
 include/net/flow_offload.h   |  6 ++++++
 net/core/flow_offload.c      |  7 +++++++
 net/sched/cls_flower.c       |  8 +-------
 4 files changed, 30 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index a4c6057c7097..0f9544a9bb9e 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -178,6 +178,22 @@ struct flow_dissector_key_ports {
 	};
 };
 
+/**
+ * struct flow_dissector_key_ports_range
+ * @tp: port number from packet
+ * @tp_min: min port number in range
+ * @tp_max: max port number in range
+ */
+struct flow_dissector_key_ports_range {
+	union {
+		struct flow_dissector_key_ports tp;
+		struct {
+			struct flow_dissector_key_ports tp_min;
+			struct flow_dissector_key_ports tp_max;
+		};
+	};
+};
+
 /**
  * flow_dissector_key_icmp:
  *		type: ICMP type
diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 7ac313858037..a8d8512b7059 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -48,6 +48,10 @@ struct flow_match_ports {
 	struct flow_dissector_key_ports *key, *mask;
 };
 
+struct flow_match_ports_range {
+	struct flow_dissector_key_ports_range *key, *mask;
+};
+
 struct flow_match_icmp {
 	struct flow_dissector_key_icmp *key, *mask;
 };
@@ -94,6 +98,8 @@ void flow_rule_match_ip(const struct flow_rule *rule,
 			struct flow_match_ip *out);
 void flow_rule_match_ports(const struct flow_rule *rule,
 			   struct flow_match_ports *out);
+void flow_rule_match_ports_range(const struct flow_rule *rule,
+				 struct flow_match_ports_range *out);
 void flow_rule_match_tcp(const struct flow_rule *rule,
 			 struct flow_match_tcp *out);
 void flow_rule_match_icmp(const struct flow_rule *rule,
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index 929f6379a279..0d3075d3c8fb 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -125,6 +125,13 @@ void flow_rule_match_ports(const struct flow_rule *rule,
 }
 EXPORT_SYMBOL(flow_rule_match_ports);
 
+void flow_rule_match_ports_range(const struct flow_rule *rule,
+				 struct flow_match_ports_range *out)
+{
+	FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PORTS_RANGE, out);
+}
+EXPORT_SYMBOL(flow_rule_match_ports_range);
+
 void flow_rule_match_tcp(const struct flow_rule *rule,
 			 struct flow_match_tcp *out)
 {
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index dcca70144dff..1a1e34480b7e 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -63,13 +63,7 @@ struct fl_flow_key {
 	struct flow_dissector_key_ip ip;
 	struct flow_dissector_key_ip enc_ip;
 	struct flow_dissector_key_enc_opts enc_opts;
-	union {
-		struct flow_dissector_key_ports tp;
-		struct {
-			struct flow_dissector_key_ports tp_min;
-			struct flow_dissector_key_ports tp_max;
-		};
-	} tp_range;
+	struct flow_dissector_key_ports_range tp_range;
 	struct flow_dissector_key_ct ct;
 	struct flow_dissector_key_hash hash;
 	struct flow_dissector_key_num_of_vlans num_of_vlans;
-- 
cgit 


From 20347fca71a387a3751f7bb270062616ddc5317a Mon Sep 17 00:00:00 2001
From: Tianyu Lan <Tianyu.Lan@microsoft.com>
Date: Fri, 8 Jul 2022 12:15:44 -0400
Subject: swiotlb: split up the global swiotlb lock

Traditionally swiotlb was not performance critical because it was only
used for slow devices. But in some setups, like TDX/SEV confidential
guests, all IO has to go through swiotlb. Currently swiotlb only has a
single lock. Under high IO load with multiple CPUs this can lead to
significat lock contention on the swiotlb lock.

This patch splits the swiotlb bounce buffer pool into individual areas
which have their own lock. Each CPU tries to allocate in its own area
first. Only if that fails does it search other areas. On freeing the
allocation is freed into the area where the memory was originally
allocated from.

Area number can be set via swiotlb kernel parameter and is default
to be possible cpu number. If possible cpu number is not power of
2, area number will be round up to the next power of 2.

This idea from Andi Kleen patch(https://github.com/intel/tdx/commit/
4529b5784c141782c72ec9bd9a92df2b68cb7d45).

Based-on-idea-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 Documentation/admin-guide/kernel-parameters.txt |   4 +-
 include/linux/swiotlb.h                         |   5 +
 kernel/dma/swiotlb.c                            | 229 +++++++++++++++++++-----
 3 files changed, 197 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 2522b11e593f..4a6ad177d4b8 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5904,8 +5904,10 @@
 			it if 0 is given (See Documentation/admin-guide/cgroup-v1/memory.rst)
 
 	swiotlb=	[ARM,IA-64,PPC,MIPS,X86]
-			Format: { <int> | force | noforce }
+			Format: { <int> [,<int>] | force | noforce }
 			<int> -- Number of I/O TLB slabs
+			<int> -- Second integer after comma. Number of swiotlb
+				 areas with their own lock. Must be power of 2.
 			force -- force using of bounce buffers even if they
 			         wouldn't be automatically used by the kernel
 			noforce -- Never use bounce buffers (for debugging)
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index bdc58a0e20b1..f65ff1930120 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -88,6 +88,8 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys,
  * @late_alloc:	%true if allocated using the page allocator
  * @force_bounce: %true if swiotlb bouncing is forced
  * @for_alloc:  %true if the pool is used for memory allocation
+ * @nareas:  The area number in the pool.
+ * @area_nslabs: The slot number in the area.
  */
 struct io_tlb_mem {
 	phys_addr_t start;
@@ -101,6 +103,9 @@ struct io_tlb_mem {
 	bool late_alloc;
 	bool force_bounce;
 	bool for_alloc;
+	unsigned int nareas;
+	unsigned int area_nslabs;
+	struct io_tlb_area *areas;
 	struct io_tlb_slot {
 		phys_addr_t orig_addr;
 		size_t alloc_size;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 909b43445574..dcf1459ce723 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -70,6 +70,43 @@ struct io_tlb_mem io_tlb_default_mem;
 phys_addr_t swiotlb_unencrypted_base;
 
 static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
+static unsigned long default_nareas;
+
+/**
+ * struct io_tlb_area - IO TLB memory area descriptor
+ *
+ * This is a single area with a single lock.
+ *
+ * @used:	The number of used IO TLB block.
+ * @index:	The slot index to start searching in this area for next round.
+ * @lock:	The lock to protect the above data structures in the map and
+ *		unmap calls.
+ */
+struct io_tlb_area {
+	unsigned long used;
+	unsigned int index;
+	spinlock_t lock;
+};
+
+static void swiotlb_adjust_nareas(unsigned int nareas)
+{
+	if (!is_power_of_2(nareas))
+		nareas = roundup_pow_of_two(nareas);
+
+	default_nareas = nareas;
+
+	pr_info("area num %d.\n", nareas);
+	/*
+	 * Round up number of slabs to the next power of 2.
+	 * The last area is going be smaller than the rest if
+	 * default_nslabs is not power of two.
+	 */
+	if (nareas && !is_power_of_2(default_nslabs)) {
+		default_nslabs = roundup_pow_of_two(default_nslabs);
+		pr_info("SWIOTLB bounce buffer size roundup to %luMB",
+			(default_nslabs << IO_TLB_SHIFT) >> 20);
+	}
+}
 
 static int __init
 setup_io_tlb_npages(char *str)
@@ -79,6 +116,10 @@ setup_io_tlb_npages(char *str)
 		default_nslabs =
 			ALIGN(simple_strtoul(str, &str, 0), IO_TLB_SEGSIZE);
 	}
+	if (*str == ',')
+		++str;
+	if (isdigit(*str))
+		swiotlb_adjust_nareas(simple_strtoul(str, &str, 0));
 	if (*str == ',')
 		++str;
 	if (!strcmp(str, "force"))
@@ -112,8 +153,19 @@ void __init swiotlb_adjust_size(unsigned long size)
 	 */
 	if (default_nslabs != IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT)
 		return;
+
+	/*
+	 * Round up number of slabs to the next power of 2.
+	 * The last area is going be smaller than the rest if
+	 * default_nslabs is not power of two.
+	 */
 	size = ALIGN(size, IO_TLB_SIZE);
 	default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+	if (default_nareas) {
+		default_nslabs = roundup_pow_of_two(default_nslabs);
+		size = default_nslabs << IO_TLB_SHIFT;
+	}
+
 	pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20);
 }
 
@@ -192,7 +244,8 @@ void __init swiotlb_update_mem_attributes(void)
 }
 
 static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
-		unsigned long nslabs, unsigned int flags, bool late_alloc)
+		unsigned long nslabs, unsigned int flags,
+		bool late_alloc, unsigned int nareas)
 {
 	void *vaddr = phys_to_virt(start);
 	unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
@@ -202,10 +255,17 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
 	mem->end = mem->start + bytes;
 	mem->index = 0;
 	mem->late_alloc = late_alloc;
+	mem->nareas = nareas;
+	mem->area_nslabs = nslabs / mem->nareas;
 
 	mem->force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE);
 
 	spin_lock_init(&mem->lock);
+	for (i = 0; i < mem->nareas; i++) {
+		spin_lock_init(&mem->areas[i].lock);
+		mem->areas[i].index = 0;
+	}
+
 	for (i = 0; i < mem->nslabs; i++) {
 		mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
 		mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
@@ -232,7 +292,7 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
 		int (*remap)(void *tlb, unsigned long nslabs))
 {
 	struct io_tlb_mem *mem = &io_tlb_default_mem;
-	unsigned long nslabs = default_nslabs;
+	unsigned long nslabs;
 	size_t alloc_size;
 	size_t bytes;
 	void *tlb;
@@ -242,6 +302,14 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
 	if (swiotlb_force_disable)
 		return;
 
+	/*
+	 * default_nslabs maybe changed when adjust area number.
+	 * So allocate bounce buffer after adjusting area number.
+	 */
+	if (!default_nareas)
+		swiotlb_adjust_nareas(num_possible_cpus());
+
+	nslabs = default_nslabs;
 	if (nslabs < IO_TLB_MIN_SLABS)
 		panic("%s: nslabs = %lu too small\n", __func__, nslabs);
 
@@ -278,7 +346,13 @@ retry:
 		panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
 		      __func__, alloc_size, PAGE_SIZE);
 
-	swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false);
+	mem->areas = memblock_alloc(sizeof(struct io_tlb_area) *
+		default_nareas, SMP_CACHE_BYTES);
+	if (!mem->areas)
+		panic("%s: Failed to allocate mem->areas.\n", __func__);
+
+	swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false,
+				default_nareas);
 
 	if (flags & SWIOTLB_VERBOSE)
 		swiotlb_print_info();
@@ -300,7 +374,7 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
 	struct io_tlb_mem *mem = &io_tlb_default_mem;
 	unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
 	unsigned char *vstart = NULL;
-	unsigned int order;
+	unsigned int order, area_order;
 	bool retried = false;
 	int rc = 0;
 
@@ -341,19 +415,34 @@ retry:
 			(PAGE_SIZE << order) >> 20);
 	}
 
+	if (!default_nareas)
+		swiotlb_adjust_nareas(num_possible_cpus());
+
+	area_order = get_order(array_size(sizeof(*mem->areas),
+		default_nareas));
+	mem->areas = (struct io_tlb_area *)
+		__get_free_pages(GFP_KERNEL | __GFP_ZERO, area_order);
+	if (!mem->areas)
+		goto error_area;
+
 	mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
 		get_order(array_size(sizeof(*mem->slots), nslabs)));
-	if (!mem->slots) {
-		free_pages((unsigned long)vstart, order);
-		return -ENOMEM;
-	}
+	if (!mem->slots)
+		goto error_slots;
 
 	set_memory_decrypted((unsigned long)vstart,
 			     (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT);
-	swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, 0, true);
+	swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, 0, true,
+				default_nareas);
 
 	swiotlb_print_info();
 	return 0;
+
+error_slots:
+	free_pages((unsigned long)mem->areas, area_order);
+error_area:
+	free_pages((unsigned long)vstart, order);
+	return -ENOMEM;
 }
 
 void __init swiotlb_exit(void)
@@ -361,6 +450,7 @@ void __init swiotlb_exit(void)
 	struct io_tlb_mem *mem = &io_tlb_default_mem;
 	unsigned long tbl_vaddr;
 	size_t tbl_size, slots_size;
+	unsigned int area_order;
 
 	if (swiotlb_force_bounce)
 		return;
@@ -375,9 +465,14 @@ void __init swiotlb_exit(void)
 
 	set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT);
 	if (mem->late_alloc) {
+		area_order = get_order(array_size(sizeof(*mem->areas),
+			mem->nareas));
+		free_pages((unsigned long)mem->areas, area_order);
 		free_pages(tbl_vaddr, get_order(tbl_size));
 		free_pages((unsigned long)mem->slots, get_order(slots_size));
 	} else {
+		memblock_free_late(__pa(mem->areas),
+				   mem->nareas * sizeof(struct io_tlb_area));
 		memblock_free_late(mem->start, tbl_size);
 		memblock_free_late(__pa(mem->slots), slots_size);
 	}
@@ -480,9 +575,9 @@ static inline unsigned long get_max_slots(unsigned long boundary_mask)
 	return nr_slots(boundary_mask + 1);
 }
 
-static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
+static unsigned int wrap_area_index(struct io_tlb_mem *mem, unsigned int index)
 {
-	if (index >= mem->nslabs)
+	if (index >= mem->area_nslabs)
 		return 0;
 	return index;
 }
@@ -491,10 +586,11 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
  * Find a suitable number of IO TLB entries size that will fit this request and
  * allocate a buffer from that IO TLB pool.
  */
-static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
-			      size_t alloc_size, unsigned int alloc_align_mask)
+static int swiotlb_do_find_slots(struct io_tlb_mem *mem,
+		struct io_tlb_area *area, int area_index,
+		struct device *dev, phys_addr_t orig_addr,
+		size_t alloc_size, unsigned int alloc_align_mask)
 {
-	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
 	unsigned long boundary_mask = dma_get_seg_boundary(dev);
 	dma_addr_t tbl_dma_addr =
 		phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
@@ -505,8 +601,11 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
 	unsigned int index, wrap, count = 0, i;
 	unsigned int offset = swiotlb_align_offset(dev, orig_addr);
 	unsigned long flags;
+	unsigned int slot_base;
+	unsigned int slot_index;
 
 	BUG_ON(!nslots);
+	BUG_ON(area_index >= mem->nareas);
 
 	/*
 	 * For mappings with an alignment requirement don't bother looping to
@@ -518,16 +617,20 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
 		stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT));
 	stride = max(stride, (alloc_align_mask >> IO_TLB_SHIFT) + 1);
 
-	spin_lock_irqsave(&mem->lock, flags);
-	if (unlikely(nslots > mem->nslabs - mem->used))
+	spin_lock_irqsave(&area->lock, flags);
+	if (unlikely(nslots > mem->area_nslabs - area->used))
 		goto not_found;
 
-	index = wrap = wrap_index(mem, ALIGN(mem->index, stride));
+	slot_base = area_index * mem->area_nslabs;
+	index = wrap = wrap_area_index(mem, ALIGN(area->index, stride));
+
 	do {
+		slot_index = slot_base + index;
+
 		if (orig_addr &&
-		    (slot_addr(tbl_dma_addr, index) & iotlb_align_mask) !=
-			    (orig_addr & iotlb_align_mask)) {
-			index = wrap_index(mem, index + 1);
+		    (slot_addr(tbl_dma_addr, slot_index) &
+		     iotlb_align_mask) != (orig_addr & iotlb_align_mask)) {
+			index = wrap_area_index(mem, index + 1);
 			continue;
 		}
 
@@ -536,26 +639,26 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
 		 * contiguous buffers, we allocate the buffers from that slot
 		 * and mark the entries as '0' indicating unavailable.
 		 */
-		if (!iommu_is_span_boundary(index, nslots,
+		if (!iommu_is_span_boundary(slot_index, nslots,
 					    nr_slots(tbl_dma_addr),
 					    max_slots)) {
-			if (mem->slots[index].list >= nslots)
+			if (mem->slots[slot_index].list >= nslots)
 				goto found;
 		}
-		index = wrap_index(mem, index + stride);
+		index = wrap_area_index(mem, index + stride);
 	} while (index != wrap);
 
 not_found:
-	spin_unlock_irqrestore(&mem->lock, flags);
+	spin_unlock_irqrestore(&area->lock, flags);
 	return -1;
 
 found:
-	for (i = index; i < index + nslots; i++) {
+	for (i = slot_index; i < slot_index + nslots; i++) {
 		mem->slots[i].list = 0;
-		mem->slots[i].alloc_size =
-			alloc_size - (offset + ((i - index) << IO_TLB_SHIFT));
+		mem->slots[i].alloc_size = alloc_size - (offset +
+				((i - slot_index) << IO_TLB_SHIFT));
 	}
-	for (i = index - 1;
+	for (i = slot_index - 1;
 	     io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 &&
 	     mem->slots[i].list; i--)
 		mem->slots[i].list = ++count;
@@ -563,14 +666,43 @@ found:
 	/*
 	 * Update the indices to avoid searching in the next round.
 	 */
-	if (index + nslots < mem->nslabs)
-		mem->index = index + nslots;
+	if (index + nslots < mem->area_nslabs)
+		area->index = index + nslots;
 	else
-		mem->index = 0;
-	mem->used += nslots;
+		area->index = 0;
+	area->used += nslots;
+	spin_unlock_irqrestore(&area->lock, flags);
+	return slot_index;
+}
 
-	spin_unlock_irqrestore(&mem->lock, flags);
-	return index;
+static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
+		size_t alloc_size, unsigned int alloc_align_mask)
+{
+	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+	int start = raw_smp_processor_id() & ((1U << __fls(mem->nareas)) - 1);
+	int i = start, index;
+
+	do {
+		index = swiotlb_do_find_slots(mem, mem->areas + i, i,
+					      dev, orig_addr, alloc_size,
+					      alloc_align_mask);
+		if (index >= 0)
+			return index;
+		if (++i >= mem->nareas)
+			i = 0;
+	} while (i != start);
+
+	return -1;
+}
+
+static unsigned long mem_used(struct io_tlb_mem *mem)
+{
+	int i;
+	unsigned long used = 0;
+
+	for (i = 0; i < mem->nareas; i++)
+		used += mem->areas[i].used;
+	return used;
 }
 
 phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
@@ -602,7 +734,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 		if (!(attrs & DMA_ATTR_NO_WARN))
 			dev_warn_ratelimited(dev,
 	"swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
-				 alloc_size, mem->nslabs, mem->used);
+				 alloc_size, mem->nslabs, mem_used(mem));
 		return (phys_addr_t)DMA_MAPPING_ERROR;
 	}
 
@@ -632,6 +764,8 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
 	unsigned int offset = swiotlb_align_offset(dev, tlb_addr);
 	int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
 	int nslots = nr_slots(mem->slots[index].alloc_size + offset);
+	int aindex = index / mem->area_nslabs;
+	struct io_tlb_area *area = &mem->areas[aindex];
 	int count, i;
 
 	/*
@@ -640,7 +774,9 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
 	 * While returning the entries to the free list, we merge the entries
 	 * with slots below and above the pool being returned.
 	 */
-	spin_lock_irqsave(&mem->lock, flags);
+	BUG_ON(aindex >= mem->nareas);
+
+	spin_lock_irqsave(&area->lock, flags);
 	if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE))
 		count = mem->slots[index + nslots].list;
 	else
@@ -664,8 +800,8 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
 	     io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list;
 	     i--)
 		mem->slots[i].list = ++count;
-	mem->used -= nslots;
-	spin_unlock_irqrestore(&mem->lock, flags);
+	area->used -= nslots;
+	spin_unlock_irqrestore(&area->lock, flags);
 }
 
 /*
@@ -763,12 +899,14 @@ EXPORT_SYMBOL_GPL(is_swiotlb_active);
 static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
 					 const char *dirname)
 {
+	unsigned long used = mem_used(mem);
+
 	mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs);
 	if (!mem->nslabs)
 		return;
 
 	debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
-	debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used);
+	debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &used);
 }
 
 static int __init __maybe_unused swiotlb_create_default_debugfs(void)
@@ -819,6 +957,9 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
 	struct io_tlb_mem *mem = rmem->priv;
 	unsigned long nslabs = rmem->size >> IO_TLB_SHIFT;
 
+	/* Set Per-device io tlb area to one */
+	unsigned int nareas = 1;
+
 	/*
 	 * Since multiple devices can share the same pool, the private data,
 	 * io_tlb_mem struct, will be initialized by the first device attached
@@ -835,10 +976,18 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
 			return -ENOMEM;
 		}
 
+		mem->areas = kcalloc(nareas, sizeof(*mem->areas),
+				GFP_KERNEL);
+		if (!mem->areas) {
+			kfree(mem);
+			kfree(mem->slots);
+			return -ENOMEM;
+		}
+
 		set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
 				     rmem->size >> PAGE_SHIFT);
 		swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, SWIOTLB_FORCE,
-				false);
+					false, nareas);
 		mem->for_alloc = true;
 
 		rmem->priv = mem;
-- 
cgit 


From 4e2b70673f2b93ab8e037a2b89c15f146c1ae9b0 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Wed, 13 Jul 2022 12:47:54 +0200
Subject: ALSA: pcm: Fix missing return value comments for kernel docs

Each kernel doc comment expects the definition of the return value in
a proper format.  This patch adds or fixes the missing entries for PCM
API.

Link: https://lore.kernel.org/r/20220713104759.4365-3-tiwai@suse.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/pcm.h     | 50 +++++++++++++++++++++++++++++++++++++++++++++++--
 sound/core/pcm.c        |  4 ++++
 sound/core/pcm_memory.c |  4 ++++
 sound/core/pcm_native.c |  6 ++++++
 4 files changed, 62 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/sound/pcm.h b/include/sound/pcm.h
index 6b99310b5b88..08cf4a5801f3 100644
--- a/include/sound/pcm.h
+++ b/include/sound/pcm.h
@@ -607,7 +607,7 @@ snd_pcm_debug_name(struct snd_pcm_substream *substream, char *buf, size_t size)
  * snd_pcm_stream_linked - Check whether the substream is linked with others
  * @substream: substream to check
  *
- * Returns true if the given substream is being linked with others.
+ * Return: true if the given substream is being linked with others
  */
 static inline int snd_pcm_stream_linked(struct snd_pcm_substream *substream)
 {
@@ -673,7 +673,7 @@ void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream,
  * snd_pcm_running - Check whether the substream is in a running state
  * @substream: substream to check
  *
- * Returns true if the given substream is in the state RUNNING, or in the
+ * Return: true if the given substream is in the state RUNNING, or in the
  * state DRAINING for playback.
  */
 static inline int snd_pcm_running(struct snd_pcm_substream *substream)
@@ -687,6 +687,8 @@ static inline int snd_pcm_running(struct snd_pcm_substream *substream)
  * bytes_to_samples - Unit conversion of the size from bytes to samples
  * @runtime: PCM runtime instance
  * @size: size in bytes
+ *
+ * Return: the size in samples
  */
 static inline ssize_t bytes_to_samples(struct snd_pcm_runtime *runtime, ssize_t size)
 {
@@ -697,6 +699,8 @@ static inline ssize_t bytes_to_samples(struct snd_pcm_runtime *runtime, ssize_t
  * bytes_to_frames - Unit conversion of the size from bytes to frames
  * @runtime: PCM runtime instance
  * @size: size in bytes
+ *
+ * Return: the size in frames
  */
 static inline snd_pcm_sframes_t bytes_to_frames(struct snd_pcm_runtime *runtime, ssize_t size)
 {
@@ -707,6 +711,8 @@ static inline snd_pcm_sframes_t bytes_to_frames(struct snd_pcm_runtime *runtime,
  * samples_to_bytes - Unit conversion of the size from samples to bytes
  * @runtime: PCM runtime instance
  * @size: size in samples
+ *
+ * Return: the byte size
  */
 static inline ssize_t samples_to_bytes(struct snd_pcm_runtime *runtime, ssize_t size)
 {
@@ -717,6 +723,8 @@ static inline ssize_t samples_to_bytes(struct snd_pcm_runtime *runtime, ssize_t
  * frames_to_bytes - Unit conversion of the size from frames to bytes
  * @runtime: PCM runtime instance
  * @size: size in frames
+ *
+ * Return: the byte size
  */
 static inline ssize_t frames_to_bytes(struct snd_pcm_runtime *runtime, snd_pcm_sframes_t size)
 {
@@ -727,6 +735,8 @@ static inline ssize_t frames_to_bytes(struct snd_pcm_runtime *runtime, snd_pcm_s
  * frame_aligned - Check whether the byte size is aligned to frames
  * @runtime: PCM runtime instance
  * @bytes: size in bytes
+ *
+ * Return: true if aligned, or false if not
  */
 static inline int frame_aligned(struct snd_pcm_runtime *runtime, ssize_t bytes)
 {
@@ -736,6 +746,8 @@ static inline int frame_aligned(struct snd_pcm_runtime *runtime, ssize_t bytes)
 /**
  * snd_pcm_lib_buffer_bytes - Get the buffer size of the current PCM in bytes
  * @substream: PCM substream
+ *
+ * Return: buffer byte size
  */
 static inline size_t snd_pcm_lib_buffer_bytes(struct snd_pcm_substream *substream)
 {
@@ -746,6 +758,8 @@ static inline size_t snd_pcm_lib_buffer_bytes(struct snd_pcm_substream *substrea
 /**
  * snd_pcm_lib_period_bytes - Get the period size of the current PCM in bytes
  * @substream: PCM substream
+ *
+ * Return: period byte size
  */
 static inline size_t snd_pcm_lib_period_bytes(struct snd_pcm_substream *substream)
 {
@@ -758,6 +772,8 @@ static inline size_t snd_pcm_lib_period_bytes(struct snd_pcm_substream *substrea
  * @runtime: PCM runtime instance
  *
  * Result is between 0 ... (boundary - 1)
+ *
+ * Return: available frame size
  */
 static inline snd_pcm_uframes_t snd_pcm_playback_avail(struct snd_pcm_runtime *runtime)
 {
@@ -774,6 +790,8 @@ static inline snd_pcm_uframes_t snd_pcm_playback_avail(struct snd_pcm_runtime *r
  * @runtime: PCM runtime instance
  *
  * Result is between 0 ... (boundary - 1)
+ *
+ * Return: available frame size
  */
 static inline snd_pcm_uframes_t snd_pcm_capture_avail(struct snd_pcm_runtime *runtime)
 {
@@ -786,6 +804,8 @@ static inline snd_pcm_uframes_t snd_pcm_capture_avail(struct snd_pcm_runtime *ru
 /**
  * snd_pcm_playback_hw_avail - Get the queued space for playback
  * @runtime: PCM runtime instance
+ *
+ * Return: available frame size
  */
 static inline snd_pcm_sframes_t snd_pcm_playback_hw_avail(struct snd_pcm_runtime *runtime)
 {
@@ -795,6 +815,8 @@ static inline snd_pcm_sframes_t snd_pcm_playback_hw_avail(struct snd_pcm_runtime
 /**
  * snd_pcm_capture_hw_avail - Get the free space for capture
  * @runtime: PCM runtime instance
+ *
+ * Return: available frame size
  */
 static inline snd_pcm_sframes_t snd_pcm_capture_hw_avail(struct snd_pcm_runtime *runtime)
 {
@@ -934,6 +956,8 @@ static inline const struct snd_interval *hw_param_interval_c(const struct snd_pc
 /**
  * params_channels - Get the number of channels from the hw params
  * @p: hw params
+ *
+ * Return: the number of channels
  */
 static inline unsigned int params_channels(const struct snd_pcm_hw_params *p)
 {
@@ -943,6 +967,8 @@ static inline unsigned int params_channels(const struct snd_pcm_hw_params *p)
 /**
  * params_rate - Get the sample rate from the hw params
  * @p: hw params
+ *
+ * Return: the sample rate
  */
 static inline unsigned int params_rate(const struct snd_pcm_hw_params *p)
 {
@@ -952,6 +978,8 @@ static inline unsigned int params_rate(const struct snd_pcm_hw_params *p)
 /**
  * params_period_size - Get the period size (in frames) from the hw params
  * @p: hw params
+ *
+ * Return: the period size in frames
  */
 static inline unsigned int params_period_size(const struct snd_pcm_hw_params *p)
 {
@@ -961,6 +989,8 @@ static inline unsigned int params_period_size(const struct snd_pcm_hw_params *p)
 /**
  * params_periods - Get the number of periods from the hw params
  * @p: hw params
+ *
+ * Return: the number of periods
  */
 static inline unsigned int params_periods(const struct snd_pcm_hw_params *p)
 {
@@ -970,6 +1000,8 @@ static inline unsigned int params_periods(const struct snd_pcm_hw_params *p)
 /**
  * params_buffer_size - Get the buffer size (in frames) from the hw params
  * @p: hw params
+ *
+ * Return: the buffer size in frames
  */
 static inline unsigned int params_buffer_size(const struct snd_pcm_hw_params *p)
 {
@@ -979,6 +1011,8 @@ static inline unsigned int params_buffer_size(const struct snd_pcm_hw_params *p)
 /**
  * params_buffer_bytes - Get the buffer size (in bytes) from the hw params
  * @p: hw params
+ *
+ * Return: the buffer size in bytes
  */
 static inline unsigned int params_buffer_bytes(const struct snd_pcm_hw_params *p)
 {
@@ -1241,6 +1275,8 @@ int snd_pcm_set_managed_buffer_all(struct snd_pcm *pcm, int type,
  * only the given sized buffer and doesn't allow re-allocation nor dynamic
  * allocation of a larger buffer unlike the standard one.
  * The function may return -ENOMEM error, hence the caller must check it.
+ *
+ * Return: zero if successful, or a negative error code
  */
 static inline int __must_check
 snd_pcm_set_fixed_buffer(struct snd_pcm_substream *substream, int type,
@@ -1259,6 +1295,8 @@ snd_pcm_set_fixed_buffer(struct snd_pcm_substream *substream, int type,
  * Apply the set up of the fixed buffer via snd_pcm_set_fixed_buffer() for
  * all substream.  If any of allocation fails, it returns -ENOMEM, hence the
  * caller must check the return value.
+ *
+ * Return: zero if successful, or a negative error code
  */
 static inline int __must_check
 snd_pcm_set_fixed_buffer_all(struct snd_pcm *pcm, int type,
@@ -1315,6 +1353,8 @@ static inline int snd_pcm_lib_alloc_vmalloc_32_buffer
  * snd_pcm_sgbuf_get_addr - Get the DMA address at the corresponding offset
  * @substream: PCM substream
  * @ofs: byte offset
+ *
+ * Return: DMA address
  */
 static inline dma_addr_t
 snd_pcm_sgbuf_get_addr(struct snd_pcm_substream *substream, unsigned int ofs)
@@ -1328,6 +1368,8 @@ snd_pcm_sgbuf_get_addr(struct snd_pcm_substream *substream, unsigned int ofs)
  * @substream: PCM substream
  * @ofs: byte offset
  * @size: byte size to examine
+ *
+ * Return: chunk size
  */
 static inline unsigned int
 snd_pcm_sgbuf_get_chunk_size(struct snd_pcm_substream *substream,
@@ -1430,6 +1472,8 @@ struct snd_pcm_chmap {
  * snd_pcm_chmap_substream - get the PCM substream assigned to the given chmap info
  * @info: chmap information
  * @idx: the substream number index
+ *
+ * Return: the matched PCM substream, or NULL if not found
  */
 static inline struct snd_pcm_substream *
 snd_pcm_chmap_substream(struct snd_pcm_chmap *info, unsigned int idx)
@@ -1460,6 +1504,8 @@ int snd_pcm_add_chmap_ctls(struct snd_pcm *pcm, int stream,
 /**
  * pcm_format_to_bits - Strong-typed conversion of pcm_format to bitwise
  * @pcm_format: PCM format
+ *
+ * Return: 64bit mask corresponding to the given PCM format
  */
 static inline u64 pcm_format_to_bits(snd_pcm_format_t pcm_format)
 {
diff --git a/sound/core/pcm.c b/sound/core/pcm.c
index 977d54320a5c..03fc5fa5813e 100644
--- a/sound/core/pcm.c
+++ b/sound/core/pcm.c
@@ -216,6 +216,8 @@ static const char * const snd_pcm_format_names[] = {
 /**
  * snd_pcm_format_name - Return a name string for the given PCM format
  * @format: PCM format
+ *
+ * Return: the format name string
  */
 const char *snd_pcm_format_name(snd_pcm_format_t format)
 {
@@ -1138,6 +1140,8 @@ static int snd_pcm_dev_disconnect(struct snd_device *device)
  * This adds the given notifier to the global list so that the callback is
  * called for each registered PCM devices.  This exists only for PCM OSS
  * emulation, so far.
+ *
+ * Return: zero if successful, or a negative error code
  */
 int snd_pcm_notify(struct snd_pcm_notify *notify, int nfree)
 {
diff --git a/sound/core/pcm_memory.c b/sound/core/pcm_memory.c
index b8296b6eb2c1..7bde7fb64011 100644
--- a/sound/core/pcm_memory.c
+++ b/sound/core/pcm_memory.c
@@ -350,6 +350,8 @@ EXPORT_SYMBOL(snd_pcm_lib_preallocate_pages_for_all);
  * SNDRV_DMA_TYPE_VMALLOC type.
  *
  * Upon successful buffer allocation and setup, the function returns 0.
+ *
+ * Return: zero if successful, or a negative error code
  */
 int snd_pcm_set_managed_buffer(struct snd_pcm_substream *substream, int type,
 				struct device *data, size_t size, size_t max)
@@ -369,6 +371,8 @@ EXPORT_SYMBOL(snd_pcm_set_managed_buffer);
  *
  * Do pre-allocation to all substreams of the given pcm for the specified DMA
  * type and size, and set the managed_buffer_alloc flag to each substream.
+ *
+ * Return: zero if successful, or a negative error code
  */
 int snd_pcm_set_managed_buffer_all(struct snd_pcm *pcm, int type,
 				   struct device *data,
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 4adaee62ef33..aa0453e51595 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -3412,6 +3412,8 @@ static long snd_pcm_ioctl(struct file *file, unsigned int cmd,
  * The function is provided primarily for OSS layer and USB gadget drivers,
  * and it allows only the limited set of ioctls (hw_params, sw_params,
  * prepare, start, drain, drop, forward).
+ *
+ * Return: zero if successful, or a negative error code
  */
 int snd_pcm_kernel_ioctl(struct snd_pcm_substream *substream,
 			 unsigned int cmd, void *arg)
@@ -3810,6 +3812,8 @@ static const struct vm_operations_struct snd_pcm_vm_ops_data_fault = {
  *
  * This is the default mmap handler for PCM data.  When mmap pcm_ops is NULL,
  * this function is invoked implicitly.
+ *
+ * Return: zero if successful, or a negative error code
  */
 int snd_pcm_lib_default_mmap(struct snd_pcm_substream *substream,
 			     struct vm_area_struct *area)
@@ -3836,6 +3840,8 @@ EXPORT_SYMBOL_GPL(snd_pcm_lib_default_mmap);
  * When your hardware uses the iomapped pages as the hardware buffer and
  * wants to mmap it, pass this function as mmap pcm_ops.  Note that this
  * is supposed to work only on limited architectures.
+ *
+ * Return: zero if successful, or a negative error code
  */
 int snd_pcm_lib_mmap_iomem(struct snd_pcm_substream *substream,
 			   struct vm_area_struct *area)
-- 
cgit 


From 5c121d6362d60198d9e37429f87e87d5477e3555 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Wed, 13 Jul 2022 12:47:55 +0200
Subject: ALSA: dmaengine: Fix missing return value comments for kernel docs

Each kernel doc comment expects the definition of the return value in
a proper format.  This patch adds or fixes the missing entries for PCM
dmaengine API.

Link: https://lore.kernel.org/r/20220713104759.4365-4-tiwai@suse.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/dmaengine_pcm.h |  2 ++
 sound/core/pcm_dmaengine.c    | 30 ++++++++++++++++++++----------
 2 files changed, 22 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/sound/dmaengine_pcm.h b/include/sound/dmaengine_pcm.h
index 38ea046e653c..2df54cf02cb3 100644
--- a/include/sound/dmaengine_pcm.h
+++ b/include/sound/dmaengine_pcm.h
@@ -15,6 +15,8 @@
  * snd_pcm_substream_to_dma_direction - Get dma_transfer_direction for a PCM
  *   substream
  * @substream: PCM substream
+ *
+ * Return: DMA transfer direction
  */
 static inline enum dma_transfer_direction
 snd_pcm_substream_to_dma_direction(const struct snd_pcm_substream *substream)
diff --git a/sound/core/pcm_dmaengine.c b/sound/core/pcm_dmaengine.c
index af6f717e1e7e..5b2ca028f5aa 100644
--- a/sound/core/pcm_dmaengine.c
+++ b/sound/core/pcm_dmaengine.c
@@ -48,6 +48,8 @@ EXPORT_SYMBOL_GPL(snd_dmaengine_pcm_get_chan);
  *
  * This function can be used to initialize a dma_slave_config from a substream
  * and hw_params in a dmaengine based PCM driver implementation.
+ *
+ * Return: zero if successful, or a negative error code
  */
 int snd_hwparams_to_dma_slave_config(const struct snd_pcm_substream *substream,
 	const struct snd_pcm_hw_params *params,
@@ -175,10 +177,10 @@ static int dmaengine_pcm_prepare_and_submit(struct snd_pcm_substream *substream)
  * @substream: PCM substream
  * @cmd: Trigger command
  *
- * Returns 0 on success, a negative error code otherwise.
- *
  * This function can be used as the PCM trigger callback for dmaengine based PCM
  * driver implementations.
+ *
+ * Return: 0 on success, a negative error code otherwise
  */
 int snd_dmaengine_pcm_trigger(struct snd_pcm_substream *substream, int cmd)
 {
@@ -223,6 +225,8 @@ EXPORT_SYMBOL_GPL(snd_dmaengine_pcm_trigger);
  *
  * This function is deprecated and should not be used by new drivers, as its
  * results may be unreliable.
+ *
+ * Return: PCM position in frames
  */
 snd_pcm_uframes_t snd_dmaengine_pcm_pointer_no_residue(struct snd_pcm_substream *substream)
 {
@@ -237,6 +241,8 @@ EXPORT_SYMBOL_GPL(snd_dmaengine_pcm_pointer_no_residue);
  *
  * This function can be used as the PCM pointer callback for dmaengine based PCM
  * driver implementations.
+ *
+ * Return: PCM position in frames
  */
 snd_pcm_uframes_t snd_dmaengine_pcm_pointer(struct snd_pcm_substream *substream)
 {
@@ -266,9 +272,9 @@ EXPORT_SYMBOL_GPL(snd_dmaengine_pcm_pointer);
  * @filter_fn: Filter function used to request the DMA channel
  * @filter_data: Data passed to the DMA filter function
  *
- * Returns NULL or the requested DMA channel.
- *
  * This function request a DMA channel for usage with dmaengine PCM.
+ *
+ * Return: NULL or the requested DMA channel
  */
 struct dma_chan *snd_dmaengine_pcm_request_channel(dma_filter_fn filter_fn,
 	void *filter_data)
@@ -288,11 +294,11 @@ EXPORT_SYMBOL_GPL(snd_dmaengine_pcm_request_channel);
  * @substream: PCM substream
  * @chan: DMA channel to use for data transfers
  *
- * Returns 0 on success, a negative error code otherwise.
- *
  * The function should usually be called from the pcm open callback. Note that
  * this function will use private_data field of the substream's runtime. So it
  * is not available to your pcm driver implementation.
+ *
+ * Return: 0 on success, a negative error code otherwise
  */
 int snd_dmaengine_pcm_open(struct snd_pcm_substream *substream,
 	struct dma_chan *chan)
@@ -326,12 +332,12 @@ EXPORT_SYMBOL_GPL(snd_dmaengine_pcm_open);
  * @filter_fn: Filter function used to request the DMA channel
  * @filter_data: Data passed to the DMA filter function
  *
- * Returns 0 on success, a negative error code otherwise.
- *
  * This function will request a DMA channel using the passed filter function and
  * data. The function should usually be called from the pcm open callback. Note
  * that this function will use private_data field of the substream's runtime. So
  * it is not available to your pcm driver implementation.
+ *
+ * Return: 0 on success, a negative error code otherwise
  */
 int snd_dmaengine_pcm_open_request_chan(struct snd_pcm_substream *substream,
 	dma_filter_fn filter_fn, void *filter_data)
@@ -344,6 +350,8 @@ EXPORT_SYMBOL_GPL(snd_dmaengine_pcm_open_request_chan);
 /**
  * snd_dmaengine_pcm_close - Close a dmaengine based PCM substream
  * @substream: PCM substream
+ *
+ * Return: 0 on success, a negative error code otherwise
  */
 int snd_dmaengine_pcm_close(struct snd_pcm_substream *substream)
 {
@@ -362,6 +370,8 @@ EXPORT_SYMBOL_GPL(snd_dmaengine_pcm_close);
  * @substream: PCM substream
  *
  * Releases the DMA channel associated with the PCM substream.
+ *
+ * Return: zero if successful, or a negative error code
  */
 int snd_dmaengine_pcm_close_release_chan(struct snd_pcm_substream *substream)
 {
@@ -382,10 +392,10 @@ EXPORT_SYMBOL_GPL(snd_dmaengine_pcm_close_release_chan);
  * @hw: PCM hw params
  * @chan: DMA channel to use for data transfers
  *
- * Returns 0 on success, a negative error code otherwise.
- *
  * This function will query DMA capability, then refine the pcm hardware
  * parameters.
+ *
+ * Return: 0 on success, a negative error code otherwise
  */
 int snd_dmaengine_pcm_refine_runtime_hwparams(
 	struct snd_pcm_substream *substream,
-- 
cgit 


From b05d834ef8f8fbd90b1bacca909c1eeec02e3625 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Wed, 13 Jul 2022 12:47:56 +0200
Subject: ALSA: compress: Fix kernel doc warnings

Each kernel doc comment expects the definition of the return value and
the summary for each struct / enum in a proper format.  This patch
adds or fixes the missing entries for compress-offload API.

Link: https://lore.kernel.org/r/20220713104759.4365-5-tiwai@suse.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/uapi/sound/compress_offload.h | 2 +-
 include/uapi/sound/compress_params.h  | 6 +++---
 sound/core/compress_offload.c         | 4 ++++
 3 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/sound/compress_offload.h b/include/uapi/sound/compress_offload.h
index 9555f31c8425..3aef123dbd7f 100644
--- a/include/uapi/sound/compress_offload.h
+++ b/include/uapi/sound/compress_offload.h
@@ -123,7 +123,7 @@ struct snd_compr_codec_caps {
 } __attribute__((packed, aligned(4)));
 
 /**
- * enum sndrv_compress_encoder
+ * enum sndrv_compress_encoder - encoder metadata key
  * @SNDRV_COMPRESS_ENCODER_PADDING: no of samples appended by the encoder at the
  * end of the track
  * @SNDRV_COMPRESS_ENCODER_DELAY: no of samples inserted by the encoder at the
diff --git a/include/uapi/sound/compress_params.h b/include/uapi/sound/compress_params.h
index 79b14389ae41..726361716919 100644
--- a/include/uapi/sound/compress_params.h
+++ b/include/uapi/sound/compress_params.h
@@ -250,7 +250,7 @@ struct snd_enc_wma {
 
 
 /**
- * struct snd_enc_vorbis
+ * struct snd_enc_vorbis - Vorbis encoder parameters
  * @quality: Sets encoding quality to n, between -1 (low) and 10 (high).
  * In the default mode of operation, the quality level is 3.
  * Normal quality range is 0 - 10.
@@ -279,7 +279,7 @@ struct snd_enc_vorbis {
 
 
 /**
- * struct snd_enc_real
+ * struct snd_enc_real - RealAudio encoder parameters
  * @quant_bits: number of coupling quantization bits in the stream
  * @start_region: coupling start region in the stream
  * @num_regions: number of regions value
@@ -294,7 +294,7 @@ struct snd_enc_real {
 } __attribute__((packed, aligned(4)));
 
 /**
- * struct snd_enc_flac
+ * struct snd_enc_flac - FLAC encoder parameters
  * @num: serial number, valid only for OGG formats
  *	needs to be set by application
  * @gain: Add replay gain tags
diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c
index cf415fe231ed..243acad89fd3 100644
--- a/sound/core/compress_offload.c
+++ b/sound/core/compress_offload.c
@@ -818,6 +818,8 @@ static void error_delayed_work(struct work_struct *work)
  * Stop the stream and set its state.
  *
  * Should be called with compressed device lock held.
+ *
+ * Return: zero if successful, or a negative error code
  */
 int snd_compr_stop_error(struct snd_compr_stream *stream,
 			 snd_pcm_state_t state)
@@ -1164,6 +1166,8 @@ static int snd_compress_dev_free(struct snd_device *device)
  * @dirn: device direction, should be of type enum snd_compr_direction
  * @id: ID string
  * @compr: compress device pointer
+ *
+ * Return: zero if successful, or a negative error code
  */
 int snd_compress_new(struct snd_card *card, int device,
 			int dirn, const char *id, struct snd_compr *compr)
-- 
cgit 


From 1dace014928e6e385363032d359a04dee9158af0 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 11 Jul 2022 17:15:29 -0700
Subject: raw: Fix a data-race around sysctl_raw_l3mdev_accept.

While reading sysctl_raw_l3mdev_accept, it can be changed concurrently.
Thus, we need to add READ_ONCE() to its reader.

Fixes: 6897445fb194 ("net: provide a sysctl raw_l3mdev_accept for raw socket lookup with VRFs")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/raw.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/raw.h b/include/net/raw.h
index 8ad8df594853..c51a635671a7 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -75,7 +75,7 @@ static inline bool raw_sk_bound_dev_eq(struct net *net, int bound_dev_if,
 				       int dif, int sdif)
 {
 #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
-	return inet_bound_dev_eq(!!net->ipv4.sysctl_raw_l3mdev_accept,
+	return inet_bound_dev_eq(READ_ONCE(net->ipv4.sysctl_raw_l3mdev_accept),
 				 bound_dev_if, dif, sdif);
 #else
 	return inet_bound_dev_eq(true, bound_dev_if, dif, sdif);
-- 
cgit 


From 277cbb6bc4bd398133eb5327c5256482a2289fe1 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Tue, 12 Jul 2022 12:24:24 +0200
Subject: net: devlink: move unlocked function prototypes alongside the locked
 ones

Maintain the same order as it is in devlink.c for function prototypes.
The most of the locked variants would very likely soon be removed
and the unlocked version would be the only one.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 5150deb67fab..b1b5c19a8316 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1521,15 +1521,6 @@ void devl_unlock(struct devlink *devlink);
 void devl_assert_locked(struct devlink *devlink);
 bool devl_lock_is_held(struct devlink *devlink);
 
-int devl_port_register(struct devlink *devlink,
-		       struct devlink_port *devlink_port,
-		       unsigned int port_index);
-void devl_port_unregister(struct devlink_port *devlink_port);
-
-int devl_rate_leaf_create(struct devlink_port *port, void *priv);
-void devl_rate_leaf_destroy(struct devlink_port *devlink_port);
-void devl_rate_nodes_destroy(struct devlink *devlink);
-
 struct ib_device;
 
 struct net *devlink_net(const struct devlink *devlink);
@@ -1551,9 +1542,13 @@ void devlink_set_features(struct devlink *devlink, u64 features);
 void devlink_register(struct devlink *devlink);
 void devlink_unregister(struct devlink *devlink);
 void devlink_free(struct devlink *devlink);
+int devl_port_register(struct devlink *devlink,
+		       struct devlink_port *devlink_port,
+		       unsigned int port_index);
 int devlink_port_register(struct devlink *devlink,
 			  struct devlink_port *devlink_port,
 			  unsigned int port_index);
+void devl_port_unregister(struct devlink_port *devlink_port);
 void devlink_port_unregister(struct devlink_port *devlink_port);
 void devlink_port_type_eth_set(struct devlink_port *devlink_port,
 			       struct net_device *netdev);
@@ -1569,6 +1564,9 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro
 void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port,
 				   u32 controller, u16 pf, u32 sf,
 				   bool external);
+int devl_rate_leaf_create(struct devlink_port *port, void *priv);
+void devl_rate_leaf_destroy(struct devlink_port *devlink_port);
+void devl_rate_nodes_destroy(struct devlink *devlink);
 void devlink_port_linecard_set(struct devlink_port *devlink_port,
 			       struct devlink_linecard *linecard);
 struct devlink_linecard *
-- 
cgit 


From 6d1c1a73e1126572de0a8b063fe62fe43786ed59 Mon Sep 17 00:00:00 2001
From: Bard Liao <yung-chuan.liao@linux.intel.com>
Date: Fri, 8 Jul 2022 14:13:11 +0800
Subject: soundwire: Intel: add trigger callback

When a pipeline is split into FE and BE parts, the BE pipeline may need to
be triggered separately in the BE trigger op. So add the trigger callback
in the link_res ops that will be invoked during BE DAI trigger.

Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Acked-by: Vinod Koul <vkoul@kernel.org>
Link: https://lore.kernel.org/r/20220708061312.25878-2-yung-chuan.liao@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/soundwire/intel.c           | 9 +++++++++
 include/linux/soundwire/sdw_intel.h | 1 +
 2 files changed, 10 insertions(+)

(limited to 'include')

diff --git a/drivers/soundwire/intel.c b/drivers/soundwire/intel.c
index 0268fa527c0c..fed6418d6375 100644
--- a/drivers/soundwire/intel.c
+++ b/drivers/soundwire/intel.c
@@ -1004,9 +1004,18 @@ static int intel_trigger(struct snd_pcm_substream *substream, int cmd, struct sn
 {
 	struct sdw_cdns *cdns = snd_soc_dai_get_drvdata(dai);
 	struct sdw_intel *sdw = cdns_to_intel(cdns);
+	struct sdw_intel_link_res *res = sdw->link_res;
 	struct sdw_cdns_dma_data *dma;
 	int ret = 0;
 
+	/*
+	 * The .trigger callback is used to send required IPC to audio
+	 * firmware. The .free_stream callback will still be called
+	 * by intel_free_stream() in the TRIGGER_SUSPEND case.
+	 */
+	if (res->ops && res->ops->trigger)
+		res->ops->trigger(dai, cmd, substream->stream);
+
 	dma = snd_soc_dai_get_dma_data(dai, substream);
 	if (!dma) {
 		dev_err(dai->dev, "failed to get dma data in %s\n",
diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h
index b5b489ea1aef..ec16ae49e6a4 100644
--- a/include/linux/soundwire/sdw_intel.h
+++ b/include/linux/soundwire/sdw_intel.h
@@ -121,6 +121,7 @@ struct sdw_intel_ops {
 			     struct sdw_intel_stream_params_data *params_data);
 	int (*free_stream)(struct device *dev,
 			   struct sdw_intel_stream_free_data *free_data);
+	int (*trigger)(struct snd_soc_dai *dai, int cmd, int stream);
 };
 
 /**
-- 
cgit 


From bc5c8260f4114951de3b4ec629650a722ca58a2b Mon Sep 17 00:00:00 2001
From: Zhengchao Shao <shaozhengchao@huawei.com>
Date: Wed, 13 Jul 2022 09:54:38 +0800
Subject: net/sched: remove return value of unregister_tcf_proto_ops

Return value of unregister_tcf_proto_ops is unused, remove it.

Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 2 +-
 net/sched/cls_api.c   | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 8cf001aed858..d9d90e6925e1 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -23,7 +23,7 @@ struct tcf_walker {
 };
 
 int register_tcf_proto_ops(struct tcf_proto_ops *ops);
-int unregister_tcf_proto_ops(struct tcf_proto_ops *ops);
+void unregister_tcf_proto_ops(struct tcf_proto_ops *ops);
 
 struct tcf_block_ext_info {
 	enum flow_block_binder_type binder_type;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 9bb4d3dcc994..c7a240232b8d 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -194,7 +194,7 @@ EXPORT_SYMBOL(register_tcf_proto_ops);
 
 static struct workqueue_struct *tc_filter_wq;
 
-int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
+void unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
 {
 	struct tcf_proto_ops *t;
 	int rc = -ENOENT;
@@ -214,7 +214,8 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
 		}
 	}
 	write_unlock(&cls_mod_lock);
-	return rc;
+
+	WARN(rc, "unregister tc filter kind(%s) failed %d\n", ops->kind, rc);
 }
 EXPORT_SYMBOL(unregister_tcf_proto_ops);
 
-- 
cgit 


From af16df54b89dee72df253abc5e7b5e8a6d16c11c Mon Sep 17 00:00:00 2001
From: Coiby Xu <coxu@redhat.com>
Date: Wed, 13 Jul 2022 15:21:11 +0800
Subject: ima: force signature verification when CONFIG_KEXEC_SIG is configured

Currently, an unsigned kernel could be kexec'ed when IMA arch specific
policy is configured unless lockdown is enabled. Enforce kernel
signature verification check in the kexec_file_load syscall when IMA
arch specific policy is configured.

Fixes: 99d5cadfde2b ("kexec_file: split KEXEC_VERIFY_SIG into KEXEC_SIG and KEXEC_SIG_FORCE")
Reported-and-suggested-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Coiby Xu <coxu@redhat.com>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 include/linux/kexec.h            |  6 ++++++
 kernel/kexec_file.c              | 11 ++++++++++-
 security/integrity/ima/ima_efi.c |  2 ++
 3 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index ce6536f1d269..475683cd67f1 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -452,6 +452,12 @@ static inline int kexec_crash_loaded(void) { return 0; }
 #define kexec_in_progress false
 #endif /* CONFIG_KEXEC_CORE */
 
+#ifdef CONFIG_KEXEC_SIG
+void set_kexec_sig_enforced(void);
+#else
+static inline void set_kexec_sig_enforced(void) {}
+#endif
+
 #endif /* !defined(__ASSEBMLY__) */
 
 #endif /* LINUX_KEXEC_H */
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 145321a5e798..f9261c07b048 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -29,6 +29,15 @@
 #include <linux/vmalloc.h>
 #include "kexec_internal.h"
 
+#ifdef CONFIG_KEXEC_SIG
+static bool sig_enforce = IS_ENABLED(CONFIG_KEXEC_SIG_FORCE);
+
+void set_kexec_sig_enforced(void)
+{
+	sig_enforce = true;
+}
+#endif
+
 static int kexec_calculate_store_digests(struct kimage *image);
 
 /*
@@ -159,7 +168,7 @@ kimage_validate_signature(struct kimage *image)
 					   image->kernel_buf_len);
 	if (ret) {
 
-		if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) {
+		if (sig_enforce) {
 			pr_notice("Enforced kernel signature verification failed (%d).\n", ret);
 			return ret;
 		}
diff --git a/security/integrity/ima/ima_efi.c b/security/integrity/ima/ima_efi.c
index 71786d01946f..9db66fe310d4 100644
--- a/security/integrity/ima/ima_efi.c
+++ b/security/integrity/ima/ima_efi.c
@@ -67,6 +67,8 @@ const char * const *arch_get_ima_policy(void)
 	if (IS_ENABLED(CONFIG_IMA_ARCH_POLICY) && arch_ima_get_secureboot()) {
 		if (IS_ENABLED(CONFIG_MODULE_SIG))
 			set_module_sig_enforced();
+		if (IS_ENABLED(CONFIG_KEXEC_SIG))
+			set_kexec_sig_enforced();
 		return sb_arch_rules;
 	}
 	return NULL;
-- 
cgit 


From 9d2f00fb0a0ce0d1127e54cac5217b6517ab0d54 Mon Sep 17 00:00:00 2001
From: James Yonan <james@openvpn.net>
Date: Wed, 29 Jun 2022 13:22:10 -0600
Subject: netfilter: nf_nat: in nf_nat_initialized(), use const struct nf_conn
 *

nf_nat_initialized() doesn't modify passed struct nf_conn,
so declare as const.

This is helpful for code readability and makes it possible
to call nf_nat_initialized() with a const struct nf_conn *.

Signed-off-by: James Yonan <james@openvpn.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_nat.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index 987111ae5240..e9eb01e99d2f 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -104,7 +104,7 @@ unsigned int
 nf_nat_inet_fn(void *priv, struct sk_buff *skb,
 	       const struct nf_hook_state *state);
 
-static inline int nf_nat_initialized(struct nf_conn *ct,
+static inline int nf_nat_initialized(const struct nf_conn *ct,
 				     enum nf_nat_manip_type manip)
 {
 	if (manip == NF_NAT_MANIP_SRC)
-- 
cgit 


From 0372c546eca575445331c0ad8902210b70be6d61 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Thu, 2 Jun 2022 12:41:00 +0300
Subject: net/mlx5: Introduce ifc bits for using software vhca id

Introduce ifc related stuff to enable using software vhca id
functionality.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 8e87eb47f9dc..254cc22f5eec 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1826,7 +1826,14 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   max_reformat_remove_size[0x8];
 	u8	   max_reformat_remove_offset[0x8];
 
-	u8	   reserved_at_c0[0x740];
+	u8	   reserved_at_c0[0x160];
+
+	u8	   reserved_at_220[0x1];
+	u8	   sw_vhca_id_valid[0x1];
+	u8	   sw_vhca_id[0xe];
+	u8	   reserved_at_230[0x10];
+
+	u8	   reserved_at_240[0x5c0];
 };
 
 enum mlx5_ifc_flow_destination_type {
@@ -3782,6 +3789,11 @@ struct mlx5_ifc_rmpc_bits {
 	struct mlx5_ifc_wq_bits wq;
 };
 
+enum {
+	VHCA_ID_TYPE_HW = 0,
+	VHCA_ID_TYPE_SW = 1,
+};
+
 struct mlx5_ifc_nic_vport_context_bits {
 	u8         reserved_at_0[0x5];
 	u8         min_wqe_inline_mode[0x3];
@@ -3798,8 +3810,8 @@ struct mlx5_ifc_nic_vport_context_bits {
 	u8         event_on_mc_address_change[0x1];
 	u8         event_on_uc_address_change[0x1];
 
-	u8         reserved_at_40[0xc];
-
+	u8         vhca_id_type[0x1];
+	u8         reserved_at_41[0xb];
 	u8	   affiliation_criteria[0x4];
 	u8	   affiliated_vhca_id[0x10];
 
@@ -7259,7 +7271,12 @@ struct mlx5_ifc_init_hca_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x40];
+	u8         reserved_at_40[0x20];
+
+	u8         reserved_at_60[0x2];
+	u8         sw_vhca_id[0xe];
+	u8         reserved_at_70[0x10];
+
 	u8	   sw_owner_id[4][0x20];
 };
 
-- 
cgit 


From dc402ccc0d7b55922a79505df3000da7deb77a2b Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Thu, 2 Jun 2022 12:47:34 +0300
Subject: net/mlx5: Use software VHCA id when it's supported

Use software VHCA id when it's supported by the firmware.

A unique id is allocated upon mlx5_mdev_init() and freed upon
mlx5_mdev_uninit(), as such it stays the same during the full life cycle
of the device including upon health recovery if occurred.

The conjunction of sw_vhca_id with sw_owner_id will be a global unique
id per function which uses mlx5_core.

The sw_vhca_id is set upon init_hca command and is used to specify the
VHCA that the NIC vport is affiliated with.

This functionality is needed upon migration of VM which is MPV based.
(i.e. multi port device).

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fw.c    |  4 ++
 drivers/net/ethernet/mellanox/mlx5/core/main.c  | 49 +++++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/vport.c | 14 +++++--
 include/linux/mlx5/driver.h                     |  1 +
 4 files changed, 65 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
index cfb8bedba512..079fa44ada71 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
@@ -289,6 +289,10 @@ int mlx5_cmd_init_hca(struct mlx5_core_dev *dev, uint32_t *sw_owner_id)
 				       sw_owner_id[i]);
 	}
 
+	if (MLX5_CAP_GEN_2_MAX(dev, sw_vhca_id_valid) &&
+	    dev->priv.sw_vhca_id > 0)
+		MLX5_SET(init_hca_in, in, sw_vhca_id, dev->priv.sw_vhca_id);
+
 	return mlx5_cmd_exec_in(dev, init_hca, in);
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index a9e51c1b7738..8b621c1ddd14 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -90,6 +90,8 @@ module_param_named(prof_sel, prof_sel, uint, 0444);
 MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 2");
 
 static u32 sw_owner_id[4];
+#define MAX_SW_VHCA_ID (BIT(__mlx5_bit_sz(cmd_hca_cap_2, sw_vhca_id)) - 1)
+static DEFINE_IDA(sw_vhca_ida);
 
 enum {
 	MLX5_ATOMIC_REQ_MODE_BE = 0x0,
@@ -492,6 +494,31 @@ static int max_uc_list_get_devlink_param(struct mlx5_core_dev *dev)
 	return err;
 }
 
+static int handle_hca_cap_2(struct mlx5_core_dev *dev, void *set_ctx)
+{
+	void *set_hca_cap;
+	int err;
+
+	if (!MLX5_CAP_GEN_MAX(dev, hca_cap_2))
+		return 0;
+
+	err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL_2);
+	if (err)
+		return err;
+
+	if (!MLX5_CAP_GEN_2_MAX(dev, sw_vhca_id_valid) ||
+	    !(dev->priv.sw_vhca_id > 0))
+		return 0;
+
+	set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx,
+				   capability);
+	memcpy(set_hca_cap, dev->caps.hca[MLX5_CAP_GENERAL_2]->cur,
+	       MLX5_ST_SZ_BYTES(cmd_hca_cap_2));
+	MLX5_SET(cmd_hca_cap_2, set_hca_cap, sw_vhca_id_valid, 1);
+
+	return set_caps(dev, set_ctx, MLX5_CAP_GENERAL_2);
+}
+
 static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx)
 {
 	struct mlx5_profile *prof = &dev->profile;
@@ -662,6 +689,13 @@ static int set_hca_cap(struct mlx5_core_dev *dev)
 		goto out;
 	}
 
+	memset(set_ctx, 0, set_sz);
+	err = handle_hca_cap_2(dev, set_ctx);
+	if (err) {
+		mlx5_core_err(dev, "handle_hca_cap_2 failed\n");
+		goto out;
+	}
+
 out:
 	kfree(set_ctx);
 	return err;
@@ -1506,6 +1540,18 @@ int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx)
 	if (err)
 		goto err_hca_caps;
 
+	/* The conjunction of sw_vhca_id with sw_owner_id will be a global
+	 * unique id per function which uses mlx5_core.
+	 * Those values are supplied to FW as part of the init HCA command to
+	 * be used by both driver and FW when it's applicable.
+	 */
+	dev->priv.sw_vhca_id = ida_alloc_range(&sw_vhca_ida, 1,
+					       MAX_SW_VHCA_ID,
+					       GFP_KERNEL);
+	if (dev->priv.sw_vhca_id < 0)
+		mlx5_core_err(dev, "failed to allocate sw_vhca_id, err=%d\n",
+			      dev->priv.sw_vhca_id);
+
 	return 0;
 
 err_hca_caps:
@@ -1530,6 +1576,9 @@ void mlx5_mdev_uninit(struct mlx5_core_dev *dev)
 {
 	struct mlx5_priv *priv = &dev->priv;
 
+	if (priv->sw_vhca_id > 0)
+		ida_free(&sw_vhca_ida, dev->priv.sw_vhca_id);
+
 	mlx5_hca_caps_free(dev);
 	mlx5_adev_cleanup(dev);
 	mlx5_pagealloc_cleanup(dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index ac020cb78072..d5c317325030 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -1086,9 +1086,17 @@ int mlx5_nic_vport_affiliate_multiport(struct mlx5_core_dev *master_mdev,
 		goto free;
 
 	MLX5_SET(modify_nic_vport_context_in, in, field_select.affiliation, 1);
-	MLX5_SET(modify_nic_vport_context_in, in,
-		 nic_vport_context.affiliated_vhca_id,
-		 MLX5_CAP_GEN(master_mdev, vhca_id));
+	if (MLX5_CAP_GEN_2(master_mdev, sw_vhca_id_valid)) {
+		MLX5_SET(modify_nic_vport_context_in, in,
+			 nic_vport_context.vhca_id_type, VHCA_ID_TYPE_SW);
+		MLX5_SET(modify_nic_vport_context_in, in,
+			 nic_vport_context.affiliated_vhca_id,
+			 MLX5_CAP_GEN_2(master_mdev, sw_vhca_id));
+	} else {
+		MLX5_SET(modify_nic_vport_context_in, in,
+			 nic_vport_context.affiliated_vhca_id,
+			 MLX5_CAP_GEN(master_mdev, vhca_id));
+	}
 	MLX5_SET(modify_nic_vport_context_in, in,
 		 nic_vport_context.affiliation_criteria,
 		 MLX5_CAP_GEN(port_mdev, affiliate_nic_vport_criteria));
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index bd882884b23c..ecda6e63d5f2 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -610,6 +610,7 @@ struct mlx5_priv {
 	spinlock_t              ctx_lock;
 	struct mlx5_adev       **adev;
 	int			adev_idx;
+	int			sw_vhca_id;
 	struct mlx5_events      *events;
 
 	struct mlx5_flow_steering *steering;
-- 
cgit 


From fac47b43c760ea90e64b895dba60df0327be7775 Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Mon, 11 Jul 2022 12:11:21 +0800
Subject: netfs: do not unlock and put the folio twice

check_write_begin() will unlock and put the folio when return
non-zero.  So we should avoid unlocking and putting it twice in
netfs layer.

Change the way ->check_write_begin() works in the following two ways:

 (1) Pass it a pointer to the folio pointer, allowing it to unlock and put
     the folio prior to doing the stuff it wants to do, provided it clears
     the folio pointer.

 (2) Change the return values such that 0 with folio pointer set means
     continue, 0 with folio pointer cleared means re-get and all error
     codes indicating an error (no special treatment for -EAGAIN).

[ bagasdotme: use Sphinx code text syntax for *foliop pointer ]

Cc: stable@vger.kernel.org
Link: https://tracker.ceph.com/issues/56423
Link: https://lore.kernel.org/r/cf169f43-8ee7-8697-25da-0204d1b4343e@redhat.com
Co-developed-by: David Howells <dhowells@redhat.com>
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 Documentation/filesystems/netfs_library.rst |  8 +++++---
 fs/afs/file.c                               |  2 +-
 fs/ceph/addr.c                              | 11 ++++++-----
 fs/netfs/buffered_read.c                    | 17 ++++++++++-------
 include/linux/netfs.h                       |  2 +-
 5 files changed, 23 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/netfs_library.rst b/Documentation/filesystems/netfs_library.rst
index 4d19b19bcc08..73a4176144b3 100644
--- a/Documentation/filesystems/netfs_library.rst
+++ b/Documentation/filesystems/netfs_library.rst
@@ -301,7 +301,7 @@ through which it can issue requests and negotiate::
 		void (*issue_read)(struct netfs_io_subrequest *subreq);
 		bool (*is_still_valid)(struct netfs_io_request *rreq);
 		int (*check_write_begin)(struct file *file, loff_t pos, unsigned len,
-					 struct folio *folio, void **_fsdata);
+					 struct folio **foliop, void **_fsdata);
 		void (*done)(struct netfs_io_request *rreq);
 	};
 
@@ -381,8 +381,10 @@ The operations are as follows:
    allocated/grabbed the folio to be modified to allow the filesystem to flush
    conflicting state before allowing it to be modified.
 
-   It should return 0 if everything is now fine, -EAGAIN if the folio should be
-   regrabbed and any other error code to abort the operation.
+   It may unlock and discard the folio it was given and set the caller's folio
+   pointer to NULL.  It should return 0 if everything is now fine (``*foliop``
+   left set) or the op should be retried (``*foliop`` cleared) and any other
+   error code to abort the operation.
 
  * ``done``
 
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 42118a4f3383..d1cfb235c4b9 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -375,7 +375,7 @@ static int afs_begin_cache_operation(struct netfs_io_request *rreq)
 }
 
 static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len,
-				 struct folio *folio, void **_fsdata)
+				 struct folio **foliop, void **_fsdata)
 {
 	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
 
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6dee88815491..d6e5916138e4 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -63,7 +63,7 @@
 	 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
 
 static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
-					struct folio *folio, void **_fsdata);
+					struct folio **foliop, void **_fsdata);
 
 static inline struct ceph_snap_context *page_snap_context(struct page *page)
 {
@@ -1288,18 +1288,19 @@ ceph_find_incompatible(struct page *page)
 }
 
 static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
-					struct folio *folio, void **_fsdata)
+					struct folio **foliop, void **_fsdata)
 {
 	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_snap_context *snapc;
 
-	snapc = ceph_find_incompatible(folio_page(folio, 0));
+	snapc = ceph_find_incompatible(folio_page(*foliop, 0));
 	if (snapc) {
 		int r;
 
-		folio_unlock(folio);
-		folio_put(folio);
+		folio_unlock(*foliop);
+		folio_put(*foliop);
+		*foliop = NULL;
 		if (IS_ERR(snapc))
 			return PTR_ERR(snapc);
 
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 42f892c5712e..0ce535852151 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -319,8 +319,9 @@ zero_out:
  * conflicting writes once the folio is grabbed and locked.  It is passed a
  * pointer to the fsdata cookie that gets returned to the VM to be passed to
  * write_end.  It is permitted to sleep.  It should return 0 if the request
- * should go ahead; unlock the folio and return -EAGAIN to cause the folio to
- * be regot; or return an error.
+ * should go ahead or it may return an error.  It may also unlock and put the
+ * folio, provided it sets ``*foliop`` to NULL, in which case a return of 0
+ * will cause the folio to be re-got and the process to be retried.
  *
  * The calling netfs must initialise a netfs context contiguous to the vfs
  * inode before calling this.
@@ -348,13 +349,13 @@ retry:
 
 	if (ctx->ops->check_write_begin) {
 		/* Allow the netfs (eg. ceph) to flush conflicts. */
-		ret = ctx->ops->check_write_begin(file, pos, len, folio, _fsdata);
+		ret = ctx->ops->check_write_begin(file, pos, len, &folio, _fsdata);
 		if (ret < 0) {
 			trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
-			if (ret == -EAGAIN)
-				goto retry;
 			goto error;
 		}
+		if (!folio)
+			goto retry;
 	}
 
 	if (folio_test_uptodate(folio))
@@ -416,8 +417,10 @@ have_folio_no_wait:
 error_put:
 	netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
 error:
-	folio_unlock(folio);
-	folio_put(folio);
+	if (folio) {
+		folio_unlock(folio);
+		folio_put(folio);
+	}
 	_leave(" = %d", ret);
 	return ret;
 }
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 1773e5df8e65..1b18dfa52e48 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -214,7 +214,7 @@ struct netfs_request_ops {
 	void (*issue_read)(struct netfs_io_subrequest *subreq);
 	bool (*is_still_valid)(struct netfs_io_request *rreq);
 	int (*check_write_begin)(struct file *file, loff_t pos, unsigned len,
-				 struct folio *folio, void **_fsdata);
+				 struct folio **foliop, void **_fsdata);
 	void (*done)(struct netfs_io_request *rreq);
 };
 
-- 
cgit 


From 2e81e1fffd53ba108481f2f14388b628884efe61 Mon Sep 17 00:00:00 2001
From: Vitaly Rodionov <vitaly.rodionov@cirrus.com>
Date: Thu, 30 Jun 2022 01:23:25 +0100
Subject: ALSA: hda: cs35l41: Add initial DSP support and firmware loading

This patch adds support for the CS35L41 DSP.
The DSP allows for extra features, such as running
speaker protection algorithms and hibernations.

To utilize these features, the driver must load
firmware into the DSP, as well as various tuning
files which allow for customization for specific
models.

[ Slightly simplified Kconfig changes by tiwai ]

Signed-off-by: Vitaly Rodionov <vitaly.rodionov@cirrus.com>
Signed-off-by: Vitaly Rodionov <vitalyr@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20220630002335.366545-5-vitalyr@opensource.cirrus.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/cs35l41.h     |   4 +
 sound/pci/hda/Kconfig       |   2 +
 sound/pci/hda/cs35l41_hda.c | 251 +++++++++++++++++++++++++++++++++++++++++++-
 sound/pci/hda/cs35l41_hda.h |  13 +++
 4 files changed, 269 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/sound/cs35l41.h b/include/sound/cs35l41.h
index 8972fa697622..8887087815a7 100644
--- a/include/sound/cs35l41.h
+++ b/include/sound/cs35l41.h
@@ -665,6 +665,10 @@
 #define CS35L41_BST_EN_DEFAULT		0x2
 #define CS35L41_AMP_EN_SHIFT		0
 #define CS35L41_AMP_EN_MASK		1
+#define CS35L41_VMON_EN_MASK		0x1000
+#define CS35L41_VMON_EN_SHIFT		12
+#define CS35L41_IMON_EN_MASK		0x2000
+#define CS35L41_IMON_EN_SHIFT		13
 
 #define CS35L41_PDN_DONE_MASK		0x00800000
 #define CS35L41_PDN_DONE_SHIFT		23
diff --git a/sound/pci/hda/Kconfig b/sound/pci/hda/Kconfig
index a17803953222..44c33bc0740e 100644
--- a/sound/pci/hda/Kconfig
+++ b/sound/pci/hda/Kconfig
@@ -107,6 +107,7 @@ config SND_HDA_SCODEC_CS35L41_I2C
 	depends on SND_SOC
 	select SND_SOC_CS35L41_LIB
 	select SND_HDA_SCODEC_CS35L41
+	select SND_HDA_CS_DSP_CONTROLS
 	help
 	  Say Y or M here to include CS35L41 I2C HD-audio side codec support
 	  in snd-hda-intel driver, such as ALC287.
@@ -121,6 +122,7 @@ config SND_HDA_SCODEC_CS35L41_SPI
 	depends on SND_SOC
 	select SND_SOC_CS35L41_LIB
 	select SND_HDA_SCODEC_CS35L41
+	select SND_HDA_CS_DSP_CONTROLS
 	help
 	  Say Y or M here to include CS35L41 SPI HD-audio side codec support
 	  in snd-hda-intel driver, such as ALC287.
diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c
index 20d3ce8773dd..d68d951c434e 100644
--- a/sound/pci/hda/cs35l41_hda.c
+++ b/sound/pci/hda/cs35l41_hda.c
@@ -9,12 +9,22 @@
 #include <linux/acpi.h>
 #include <linux/module.h>
 #include <sound/hda_codec.h>
+#include <sound/soc.h>
 #include "hda_local.h"
 #include "hda_auto_parser.h"
 #include "hda_jack.h"
 #include "hda_generic.h"
 #include "hda_component.h"
 #include "cs35l41_hda.h"
+#include "hda_cs_dsp_ctl.h"
+
+#define CS35L41_FIRMWARE_ROOT "cirrus/"
+#define CS35L41_PART "cs35l41"
+#define FW_NAME "CSPL"
+
+#define HALO_STATE_DSP_CTL_NAME		"HALO_STATE"
+#define HALO_STATE_DSP_CTL_TYPE		5
+#define HALO_STATE_DSP_CTL_ALG		262308
 
 static const struct reg_sequence cs35l41_hda_config[] = {
 	{ CS35L41_PLL_CLK_CTRL,		0x00000430 }, // 3072000Hz, BCLK Input, PLL_REFCLK_EN = 1
@@ -27,11 +37,172 @@ static const struct reg_sequence cs35l41_hda_config[] = {
 	{ CS35L41_AMP_GAIN_CTRL,	0x00000084 }, // AMP_GAIN_PCM 4.5 dB
 };
 
+static const struct reg_sequence cs35l41_hda_config_dsp[] = {
+	{ CS35L41_PLL_CLK_CTRL,		0x00000430 }, // 3072000Hz, BCLK Input, PLL_REFCLK_EN = 1
+	{ CS35L41_DSP_CLK_CTRL,		0x00000003 }, // DSP CLK EN
+	{ CS35L41_GLOBAL_CLK_CTRL,	0x00000003 }, // GLOBAL_FS = 48 kHz
+	{ CS35L41_SP_ENABLES,		0x00010001 }, // ASP_RX1_EN = 1, ASP_TX1_EN = 1
+	{ CS35L41_SP_RATE_CTRL,		0x00000021 }, // ASP_BCLK_FREQ = 3.072 MHz
+	{ CS35L41_SP_FORMAT,		0x20200200 }, // 32 bits RX/TX slots, I2S, clk consumer
+	{ CS35L41_SP_HIZ_CTRL,		0x00000003 }, // Hi-Z unused/disabled
+	{ CS35L41_SP_TX_WL,		0x00000018 }, // 24 cycles/slot
+	{ CS35L41_SP_RX_WL,		0x00000018 }, // 24 cycles/slot
+	{ CS35L41_DAC_PCM1_SRC,		0x00000032 }, // DACPCM1_SRC = ERR_VOL
+	{ CS35L41_ASP_TX1_SRC,		0x00000018 }, // ASPTX1 SRC = VMON
+	{ CS35L41_ASP_TX2_SRC,		0x00000019 }, // ASPTX2 SRC = IMON
+	{ CS35L41_ASP_TX3_SRC,		0x00000028 }, // ASPTX3 SRC = VPMON
+	{ CS35L41_ASP_TX4_SRC,		0x00000029 }, // ASPTX4 SRC = VBSTMON
+	{ CS35L41_DSP1_RX1_SRC,		0x00000008 }, // DSP1RX1 SRC = ASPRX1
+	{ CS35L41_DSP1_RX2_SRC,		0x00000008 }, // DSP1RX2 SRC = ASPRX1
+	{ CS35L41_DSP1_RX3_SRC,         0x00000018 }, // DSP1RX3 SRC = VMON
+	{ CS35L41_DSP1_RX4_SRC,         0x00000019 }, // DSP1RX4 SRC = IMON
+	{ CS35L41_DSP1_RX5_SRC,         0x00000029 }, // DSP1RX5 SRC = VBSTMON
+	{ CS35L41_AMP_DIG_VOL_CTRL,	0x00000000 }, // AMP_VOL_PCM  0.0 dB
+	{ CS35L41_AMP_GAIN_CTRL,	0x00000233 }, // AMP_GAIN_PCM = 17.5dB AMP_GAIN_PDM = 19.5dB
+};
+
 static const struct reg_sequence cs35l41_hda_mute[] = {
 	{ CS35L41_AMP_GAIN_CTRL,	0x00000000 }, // AMP_GAIN_PCM 0.5 dB
 	{ CS35L41_AMP_DIG_VOL_CTRL,	0x0000A678 }, // AMP_VOL_PCM Mute
 };
 
+static int cs35l41_control_add(struct cs_dsp_coeff_ctl *cs_ctl)
+{
+	struct cs35l41_hda *cs35l41 = container_of(cs_ctl->dsp, struct cs35l41_hda, cs_dsp);
+	struct hda_cs_dsp_ctl_info info;
+
+	info.device_name = cs35l41->amp_name;
+	info.fw_type = HDA_CS_DSP_FW_SPK_PROT;
+	info.card = cs35l41->codec->card;
+
+	return hda_cs_dsp_control_add(cs_ctl, &info);
+}
+
+static const struct cs_dsp_client_ops client_ops = {
+	.control_add = cs35l41_control_add,
+	.control_remove = hda_cs_dsp_control_remove,
+};
+
+static int cs35l41_request_firmware_file(struct cs35l41_hda *cs35l41,
+					 const struct firmware **firmware, char **filename,
+					 const char *dir, const char *filetype)
+{
+	const char * const dsp_name = cs35l41->cs_dsp.name;
+	char *s, c;
+	int ret = 0;
+
+	*filename = kasprintf(GFP_KERNEL, "%s%s-%s-%s.%s", dir, CS35L41_PART, dsp_name, "spk-prot",
+			      filetype);
+
+	if (*filename == NULL)
+		return -ENOMEM;
+
+	/*
+	 * Make sure that filename is lower-case and any non alpha-numeric
+	 * characters except full stop and '/' are replaced with hyphens.
+	 */
+	s = *filename;
+	while (*s) {
+		c = *s;
+		if (isalnum(c))
+			*s = tolower(c);
+		else if (c != '.' && c != '/')
+			*s = '-';
+		s++;
+	}
+
+	ret = firmware_request_nowarn(firmware, *filename, cs35l41->dev);
+	if (ret != 0) {
+		dev_dbg(cs35l41->dev, "Failed to request '%s'\n", *filename);
+		kfree(*filename);
+		*filename = NULL;
+	}
+
+	return ret;
+}
+
+static int cs35l41_request_firmware_files(struct cs35l41_hda *cs35l41,
+					  const struct firmware **wmfw_firmware,
+					  char **wmfw_filename,
+					  const struct firmware **coeff_firmware,
+					  char **coeff_filename)
+{
+	int ret;
+
+	/* cirrus/part-dspN-fwtype.wmfw */
+	ret = cs35l41_request_firmware_file(cs35l41, wmfw_firmware, wmfw_filename,
+					    CS35L41_FIRMWARE_ROOT, "wmfw");
+	if (!ret) {
+		cs35l41_request_firmware_file(cs35l41, coeff_firmware, coeff_filename,
+					      CS35L41_FIRMWARE_ROOT, "bin");
+		return 0;
+	}
+
+	dev_warn(cs35l41->dev, "Failed to request firmware\n");
+
+	return ret;
+}
+
+static int cs35l41_init_dsp(struct cs35l41_hda *cs35l41)
+{
+	const struct firmware *coeff_firmware = NULL;
+	const struct firmware *wmfw_firmware = NULL;
+	struct cs_dsp *dsp = &cs35l41->cs_dsp;
+	char *coeff_filename = NULL;
+	char *wmfw_filename = NULL;
+	int ret;
+
+	if (!cs35l41->halo_initialized) {
+		cs35l41_configure_cs_dsp(cs35l41->dev, cs35l41->regmap, dsp);
+		dsp->client_ops = &client_ops;
+
+		ret = cs_dsp_halo_init(&cs35l41->cs_dsp);
+		if (ret)
+			return ret;
+		cs35l41->halo_initialized = true;
+	}
+
+	ret = cs35l41_request_firmware_files(cs35l41, &wmfw_firmware, &wmfw_filename,
+					     &coeff_firmware, &coeff_filename);
+	if (ret < 0)
+		return ret;
+
+	dev_dbg(cs35l41->dev, "Loading WMFW Firmware: %s\n", wmfw_filename);
+	if (coeff_filename)
+		dev_dbg(cs35l41->dev, "Loading Coefficient File: %s\n", coeff_filename);
+	else
+		dev_warn(cs35l41->dev, "No Coefficient File available.\n");
+
+	ret = cs_dsp_power_up(dsp, wmfw_firmware, wmfw_filename, coeff_firmware, coeff_filename,
+			      FW_NAME);
+
+	release_firmware(wmfw_firmware);
+	release_firmware(coeff_firmware);
+	kfree(wmfw_filename);
+	kfree(coeff_filename);
+
+	return ret;
+}
+
+static void cs35l41_shutdown_dsp(struct cs35l41_hda *cs35l41)
+{
+	struct cs_dsp *dsp = &cs35l41->cs_dsp;
+
+	cs_dsp_stop(dsp);
+	cs_dsp_power_down(dsp);
+	cs35l41->firmware_running = false;
+	dev_dbg(cs35l41->dev, "Unloaded Firmware\n");
+}
+
+static void cs35l41_remove_dsp(struct cs35l41_hda *cs35l41)
+{
+	struct cs_dsp *dsp = &cs35l41->cs_dsp;
+
+	cs35l41_shutdown_dsp(cs35l41);
+	cs_dsp_remove(dsp);
+	cs35l41->halo_initialized = false;
+}
+
 /* Protection release cycle to get the speaker out of Safe-Mode */
 static void cs35l41_error_release(struct device *dev, struct regmap *regmap, unsigned int mask)
 {
@@ -53,9 +224,22 @@ static void cs35l41_hda_playback_hook(struct device *dev, int action)
 	struct regmap *reg = cs35l41->regmap;
 	int ret = 0;
 
+	mutex_lock(&cs35l41->fw_mutex);
+
 	switch (action) {
 	case HDA_GEN_PCM_ACT_OPEN:
-		regmap_multi_reg_write(reg, cs35l41_hda_config, ARRAY_SIZE(cs35l41_hda_config));
+		if (cs35l41->firmware_running) {
+			regmap_multi_reg_write(reg, cs35l41_hda_config_dsp,
+					       ARRAY_SIZE(cs35l41_hda_config_dsp));
+			regmap_update_bits(cs35l41->regmap, CS35L41_PWR_CTRL2,
+					   CS35L41_VMON_EN_MASK | CS35L41_IMON_EN_MASK,
+					   1 << CS35L41_VMON_EN_SHIFT | 1 << CS35L41_IMON_EN_SHIFT);
+			cs35l41_set_cspl_mbox_cmd(cs35l41->dev, cs35l41->regmap,
+						  CSPL_MBOX_CMD_RESUME);
+		} else {
+			regmap_multi_reg_write(reg, cs35l41_hda_config,
+					       ARRAY_SIZE(cs35l41_hda_config));
+		}
 		ret = regmap_update_bits(reg, CS35L41_PWR_CTRL2,
 					 CS35L41_AMP_EN_MASK, 1 << CS35L41_AMP_EN_SHIFT);
 		if (cs35l41->hw_cfg.bst_type == CS35L41_EXT_BOOST)
@@ -73,6 +257,13 @@ static void cs35l41_hda_playback_hook(struct device *dev, int action)
 					 CS35L41_AMP_EN_MASK, 0 << CS35L41_AMP_EN_SHIFT);
 		if (cs35l41->hw_cfg.bst_type == CS35L41_EXT_BOOST)
 			regmap_write(reg, CS35L41_GPIO1_CTRL1, 0x00000001);
+		if (cs35l41->firmware_running) {
+			cs35l41_set_cspl_mbox_cmd(cs35l41->dev, cs35l41->regmap,
+						  CSPL_MBOX_CMD_PAUSE);
+			regmap_update_bits(cs35l41->regmap, CS35L41_PWR_CTRL2,
+					   CS35L41_VMON_EN_MASK | CS35L41_IMON_EN_MASK,
+					   0 << CS35L41_VMON_EN_SHIFT | 0 << CS35L41_IMON_EN_SHIFT);
+		}
 		cs35l41_irq_release(cs35l41);
 		break;
 	default:
@@ -80,6 +271,8 @@ static void cs35l41_hda_playback_hook(struct device *dev, int action)
 		break;
 	}
 
+	mutex_unlock(&cs35l41->fw_mutex);
+
 	if (ret)
 		dev_err(cs35l41->dev, "Regmap access fail: %d\n", ret);
 }
@@ -104,6 +297,51 @@ static int cs35l41_hda_channel_map(struct device *dev, unsigned int tx_num, unsi
 				    rx_slot);
 }
 
+static int cs35l41_smart_amp(struct cs35l41_hda *cs35l41)
+{
+	int halo_sts;
+	int ret;
+
+	ret = cs35l41_init_dsp(cs35l41);
+	if (ret) {
+		dev_warn(cs35l41->dev, "Cannot Initialize Firmware. Error: %d\n", ret);
+		goto clean_dsp;
+	}
+
+	ret = cs35l41_write_fs_errata(cs35l41->dev, cs35l41->regmap);
+	if (ret) {
+		dev_err(cs35l41->dev, "Cannot Write FS Errata: %d\n", ret);
+		goto clean_dsp;
+	}
+
+	ret = cs_dsp_run(&cs35l41->cs_dsp);
+	if (ret) {
+		dev_err(cs35l41->dev, "Fail to start dsp: %d\n", ret);
+		goto clean_dsp;
+	}
+
+	ret = read_poll_timeout(hda_cs_dsp_read_ctl, ret,
+				be32_to_cpu(halo_sts) == HALO_STATE_CODE_RUN,
+				1000, 15000, false, &cs35l41->cs_dsp, HALO_STATE_DSP_CTL_NAME,
+				HALO_STATE_DSP_CTL_TYPE, HALO_STATE_DSP_CTL_ALG,
+				&halo_sts, sizeof(halo_sts));
+
+	if (ret) {
+		dev_err(cs35l41->dev, "Timeout waiting for HALO Core to start. State: %d\n",
+			 halo_sts);
+		goto clean_dsp;
+	}
+
+	cs35l41_set_cspl_mbox_cmd(cs35l41->dev, cs35l41->regmap, CSPL_MBOX_CMD_PAUSE);
+	cs35l41->firmware_running = true;
+
+	return 0;
+
+clean_dsp:
+	cs35l41_shutdown_dsp(cs35l41);
+	return ret;
+}
+
 static int cs35l41_hda_bind(struct device *dev, struct device *master, void *master_data)
 {
 	struct cs35l41_hda *cs35l41 = dev_get_drvdata(dev);
@@ -121,6 +359,11 @@ static int cs35l41_hda_bind(struct device *dev, struct device *master, void *mas
 	strscpy(comps->name, dev_name(dev), sizeof(comps->name));
 	comps->playback_hook = cs35l41_hda_playback_hook;
 
+	mutex_lock(&cs35l41->fw_mutex);
+	if (cs35l41_smart_amp(cs35l41) < 0)
+		dev_warn(cs35l41->dev, "Cannot Run Firmware, reverting to dsp bypass...\n");
+	mutex_unlock(&cs35l41->fw_mutex);
+
 	return 0;
 }
 
@@ -535,6 +778,8 @@ int cs35l41_hda_probe(struct device *dev, const char *device_name, int id, int i
 	if (ret)
 		goto err;
 
+	mutex_init(&cs35l41->fw_mutex);
+
 	ret = cs35l41_hda_apply_properties(cs35l41);
 	if (ret)
 		goto err;
@@ -562,6 +807,9 @@ void cs35l41_hda_remove(struct device *dev)
 {
 	struct cs35l41_hda *cs35l41 = dev_get_drvdata(dev);
 
+	if (cs35l41->halo_initialized)
+		cs35l41_remove_dsp(cs35l41);
+
 	component_del(cs35l41->dev, &cs35l41_hda_comp_ops);
 
 	if (cs35l41_safe_reset(cs35l41->regmap, cs35l41->hw_cfg.bst_type))
@@ -571,5 +819,6 @@ void cs35l41_hda_remove(struct device *dev)
 EXPORT_SYMBOL_NS_GPL(cs35l41_hda_remove, SND_HDA_SCODEC_CS35L41);
 
 MODULE_DESCRIPTION("CS35L41 HDA Driver");
+MODULE_IMPORT_NS(SND_HDA_CS_DSP_CONTROLS);
 MODULE_AUTHOR("Lucas Tanure, Cirrus Logic Inc, <tanureal@opensource.cirrus.com>");
 MODULE_LICENSE("GPL");
diff --git a/sound/pci/hda/cs35l41_hda.h b/sound/pci/hda/cs35l41_hda.h
index aaf9e16684c2..5814af050944 100644
--- a/sound/pci/hda/cs35l41_hda.h
+++ b/sound/pci/hda/cs35l41_hda.h
@@ -15,6 +15,9 @@
 #include <linux/device.h>
 #include <sound/cs35l41.h>
 
+#include <linux/firmware/cirrus/cs_dsp.h>
+#include <linux/firmware/cirrus/wmfw.h>
+
 enum cs35l41_hda_spk_pos {
 	CS35l41_LEFT,
 	CS35l41_RIGHT,
@@ -39,7 +42,17 @@ struct cs35l41_hda {
 	int channel_index;
 	unsigned volatile long irq_errors;
 	const char *amp_name;
+	struct mutex fw_mutex;
 	struct regmap_irq_chip_data *irq_data;
+	bool firmware_running;
+	bool halo_initialized;
+	struct cs_dsp cs_dsp;
+};
+
+enum halo_state {
+	HALO_STATE_CODE_INIT_DOWNLOAD = 0,
+	HALO_STATE_CODE_START,
+	HALO_STATE_CODE_RUN
 };
 
 int cs35l41_hda_probe(struct device *dev, const char *device_name, int id, int irq,
-- 
cgit 


From 9745fb07474f4501eff62130a78a42a8b8c18b05 Mon Sep 17 00:00:00 2001
From: Jonathan Yong <jonathan.yong@intel.com>
Date: Mon, 6 Jun 2022 19:41:27 +0300
Subject: platform/x86/intel: Add Primary to Sideband (P2SB) bridge support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SoC features such as GPIO are accessed via a reserved MMIO area,
we don't know its address but can obtain it from the BAR of
the P2SB device, that device is normally hidden so we have to
temporarily unhide it, read address and hide it back.

There are already a few users and at least one more is coming which
require an access to Primary to Sideband (P2SB) bridge in order
to get IO or MMIO BAR hidden by BIOS.

Create a library to access P2SB for x86 devices in a unified way.

Background information
======================
Note, the term "bridge" is used in the documentation and it has nothing
to do with a PCI (host) bridge as per the PCI specifications.

The P2SB is an interesting device by its nature and hardware design.
First of all, it has several devices in the hardware behind it. These
devices may or may not be represented as ACPI devices by a firmware.

It also has a hardwired (to 0s) the least significant bits of the
base address register which is represented by the only 64-bit BAR0.
It means that OS mustn't reallocate the BAR.

On top of that in some cases P2SB is represented by function 0 on PCI
slot (in terms of B:D.F) and according to the PCI specification any
other function can't be seen until function 0 is present and visible.

In the PCI configuration space of P2SB device the full 32-bit register
is allocated for the only purpose of hiding the entire P2SB device. As
per [3]:

  3.1.39 P2SB Control (P2SBC)—Offset E0h

  Hide Device (HIDE): When this bit is set, the P2SB will return 1s on
  any PCI Configuration Read on IOSF-P. All other transactions including
  PCI Configuration Writes on IOSF-P are unaffected by this. This does
  not affect reads performed on the IOSF-SB interface.

This doesn't prevent MMIO accesses, although preventing the OS from
assigning these addresses. The firmware on the affected platforms marks
the region as unusable (by cutting it off from the PCI host bridge
resources) as depicted in the Apollo Lake example below:

  PCI host bridge to bus 0000:00
  pci_bus 0000:00: root bus resource [io  0x0070-0x0077]
  pci_bus 0000:00: root bus resource [io  0x0000-0x006f window]
  pci_bus 0000:00: root bus resource [io  0x0078-0x0cf7 window]
  pci_bus 0000:00: root bus resource [io  0x0d00-0xffff window]
  pci_bus 0000:00: root bus resource [mem 0x7c000001-0x7fffffff window]
  pci_bus 0000:00: root bus resource [mem 0x7b800001-0x7bffffff window]
  pci_bus 0000:00: root bus resource [mem 0x80000000-0xcfffffff window]
  pci_bus 0000:00: root bus resource [mem 0xe0000000-0xefffffff window]
  pci_bus 0000:00: root bus resource [bus 00-ff]

The P2SB 16MB BAR is located at 0xd0000000-0xd0ffffff memory window.

The generic solution
====================
The generic solution for all cases when we need to access to the information
behind P2SB device is a library code where users ask for necessary resources
by demand and hence those users take care of not being run on the systems
where this access is not required.

The library provides the p2sb_bar() API to retrieve the MMIO of the BAR0 of
the device from P2SB device slot.

P2SB unconditional unhiding awareness
=====================================
Technically it's possible to unhide the P2SB device and devices on
the same PCI slot and access them at any time as needed. But there are
several potential issues with that:

 - the systems were never tested against such configuration and hence
   nobody knows what kind of bugs it may bring, especially when we talk
   about SPI NOR case which contains Intel FirmWare Image (IFWI) code
   (including BIOS) and already known to be problematic in the past for
   end users

 - the PCI by its nature is a hotpluggable bus and in case somebody
   attaches a driver to the functions of a P2SB slot device(s) the
   end user experience and system behaviour can be unpredictable

 - the kernel code would need some ugly hacks (or code looking as an
   ugly hack) under arch/x86/pci in order to enable these devices on
   only selected platforms (which may include CPU ID table followed by
   a potentially growing number of DMI strings

The future improvements
=======================
The future improvements with this code may go in order to gain some kind
of cache, if it's possible at all, to prevent unhiding and hiding many
times to take static information that may be saved once per boot.

Links
=====
[1]: https://lab.whitequark.org/notes/2017-11-08/accessing-intel-ich-pch-gpios/
[2]: https://cdrdv2.intel.com/v1/dl/getContent/332690?wapkw=332690
[3]: https://cdrdv2.intel.com/v1/dl/getContent/332691?wapkw=332691
[4]: https://medium.com/@jacksonchen_43335/bios-gpio-p2sb-70e9b829b403

Signed-off-by: Jonathan Yong <jonathan.yong@intel.com>
Co-developed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Tested-by: Henning Schild <henning.schild@siemens.com>
Acked-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/platform/x86/intel/Kconfig     |  12 ++++
 drivers/platform/x86/intel/Makefile    |   2 +
 drivers/platform/x86/intel/p2sb.c      | 127 +++++++++++++++++++++++++++++++++
 include/linux/platform_data/x86/p2sb.h |  28 ++++++++
 4 files changed, 169 insertions(+)
 create mode 100644 drivers/platform/x86/intel/p2sb.c
 create mode 100644 include/linux/platform_data/x86/p2sb.h

(limited to 'include')

diff --git a/drivers/platform/x86/intel/Kconfig b/drivers/platform/x86/intel/Kconfig
index 794968bda115..c9cfbaae436b 100644
--- a/drivers/platform/x86/intel/Kconfig
+++ b/drivers/platform/x86/intel/Kconfig
@@ -70,6 +70,18 @@ config INTEL_OAKTRAIL
 	  enable/disable the Camera, WiFi, BT etc. devices. If in doubt, say Y
 	  here; it will only load on supported platforms.
 
+config P2SB
+	bool "Primary to Sideband (P2SB) bridge access support"
+	depends on PCI
+	help
+	  The Primary to Sideband (P2SB) bridge is an interface to some
+	  PCI devices connected through it. In particular, SPI NOR controller
+	  in Intel Apollo Lake SoC is one of such devices.
+
+	  The main purpose of this library is to unhide P2SB device in case
+	  firmware kept it hidden on some platforms in order to access devices
+	  behind it.
+
 config INTEL_BXTWC_PMIC_TMU
 	tristate "Intel Broxton Whiskey Cove TMU Driver"
 	depends on INTEL_SOC_PMIC_BXTWC
diff --git a/drivers/platform/x86/intel/Makefile b/drivers/platform/x86/intel/Makefile
index 717933dd0cfd..741a9404db98 100644
--- a/drivers/platform/x86/intel/Makefile
+++ b/drivers/platform/x86/intel/Makefile
@@ -28,6 +28,8 @@ intel_int0002_vgpio-y			:= int0002_vgpio.o
 obj-$(CONFIG_INTEL_INT0002_VGPIO)	+= intel_int0002_vgpio.o
 intel_oaktrail-y			:= oaktrail.o
 obj-$(CONFIG_INTEL_OAKTRAIL)		+= intel_oaktrail.o
+intel_p2sb-y				:= p2sb.o
+obj-$(CONFIG_P2SB)			+= intel_p2sb.o
 intel_sdsi-y				:= sdsi.o
 obj-$(CONFIG_INTEL_SDSI)		+= intel_sdsi.o
 intel_vsec-y				:= vsec.o
diff --git a/drivers/platform/x86/intel/p2sb.c b/drivers/platform/x86/intel/p2sb.c
new file mode 100644
index 000000000000..b598ef14dbc6
--- /dev/null
+++ b/drivers/platform/x86/intel/p2sb.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Primary to Sideband (P2SB) bridge access support
+ *
+ * Copyright (c) 2017, 2021-2022 Intel Corporation.
+ *
+ * Authors: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ *	    Jonathan Yong <jonathan.yong@intel.com>
+ */
+
+#include <linux/bits.h>
+#include <linux/export.h>
+#include <linux/pci.h>
+#include <linux/platform_data/x86/p2sb.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+
+#define P2SBC			0xe0
+#define P2SBC_HIDE		BIT(8)
+
+static const struct x86_cpu_id p2sb_cpu_ids[] = {
+	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,	PCI_DEVFN(13, 0)),
+	{}
+};
+
+static int p2sb_get_devfn(unsigned int *devfn)
+{
+	const struct x86_cpu_id *id;
+
+	id = x86_match_cpu(p2sb_cpu_ids);
+	if (!id)
+		return -ENODEV;
+
+	*devfn = (unsigned int)id->driver_data;
+	return 0;
+}
+
+static int p2sb_read_bar0(struct pci_dev *pdev, struct resource *mem)
+{
+	/* Copy resource from the first BAR of the device in question */
+	*mem = pdev->resource[0];
+	return 0;
+}
+
+static int p2sb_scan_and_read(struct pci_bus *bus, unsigned int devfn, struct resource *mem)
+{
+	struct pci_dev *pdev;
+	int ret;
+
+	pdev = pci_scan_single_device(bus, devfn);
+	if (!pdev)
+		return -ENODEV;
+
+	ret = p2sb_read_bar0(pdev, mem);
+
+	pci_stop_and_remove_bus_device(pdev);
+	return ret;
+}
+
+/**
+ * p2sb_bar - Get Primary to Sideband (P2SB) bridge device BAR
+ * @bus: PCI bus to communicate with
+ * @devfn: PCI slot and function to communicate with
+ * @mem: memory resource to be filled in
+ *
+ * The BIOS prevents the P2SB device from being enumerated by the PCI
+ * subsystem, so we need to unhide and hide it back to lookup the BAR.
+ *
+ * if @bus is NULL, the bus 0 in domain 0 will be used.
+ * If @devfn is 0, it will be replaced by devfn of the P2SB device.
+ *
+ * Caller must provide a valid pointer to @mem.
+ *
+ * Locking is handled by pci_rescan_remove_lock mutex.
+ *
+ * Return:
+ * 0 on success or appropriate errno value on error.
+ */
+int p2sb_bar(struct pci_bus *bus, unsigned int devfn, struct resource *mem)
+{
+	struct pci_dev *pdev_p2sb;
+	unsigned int devfn_p2sb;
+	u32 value = P2SBC_HIDE;
+	int ret;
+
+	/* Get devfn for P2SB device itself */
+	ret = p2sb_get_devfn(&devfn_p2sb);
+	if (ret)
+		return ret;
+
+	/* if @bus is NULL, use bus 0 in domain 0 */
+	bus = bus ?: pci_find_bus(0, 0);
+
+	/*
+	 * Prevent concurrent PCI bus scan from seeing the P2SB device and
+	 * removing via sysfs while it is temporarily exposed.
+	 */
+	pci_lock_rescan_remove();
+
+	/* Unhide the P2SB device, if needed */
+	pci_bus_read_config_dword(bus, devfn_p2sb, P2SBC, &value);
+	if (value & P2SBC_HIDE)
+		pci_bus_write_config_dword(bus, devfn_p2sb, P2SBC, 0);
+
+	pdev_p2sb = pci_scan_single_device(bus, devfn_p2sb);
+	if (devfn)
+		ret = p2sb_scan_and_read(bus, devfn, mem);
+	else
+		ret = p2sb_read_bar0(pdev_p2sb, mem);
+	pci_stop_and_remove_bus_device(pdev_p2sb);
+
+	/* Hide the P2SB device, if it was hidden */
+	if (value & P2SBC_HIDE)
+		pci_bus_write_config_dword(bus, devfn_p2sb, P2SBC, P2SBC_HIDE);
+
+	pci_unlock_rescan_remove();
+
+	if (ret)
+		return ret;
+
+	if (mem->flags == 0)
+		return -ENODEV;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(p2sb_bar);
diff --git a/include/linux/platform_data/x86/p2sb.h b/include/linux/platform_data/x86/p2sb.h
new file mode 100644
index 000000000000..a1d5fddc8f13
--- /dev/null
+++ b/include/linux/platform_data/x86/p2sb.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Primary to Sideband (P2SB) bridge access support
+ */
+
+#ifndef _PLATFORM_DATA_X86_P2SB_H
+#define _PLATFORM_DATA_X86_P2SB_H
+
+#include <linux/errno.h>
+#include <linux/kconfig.h>
+
+struct pci_bus;
+struct resource;
+
+#if IS_BUILTIN(CONFIG_P2SB)
+
+int p2sb_bar(struct pci_bus *bus, unsigned int devfn, struct resource *mem);
+
+#else /* CONFIG_P2SB */
+
+static inline int p2sb_bar(struct pci_bus *bus, unsigned int devfn, struct resource *mem)
+{
+	return -ENODEV;
+}
+
+#endif /* CONFIG_P2SB is not set */
+
+#endif /* _PLATFORM_DATA_X86_P2SB_H */
-- 
cgit 


From 446f0cf9e08b483d7dc6f61eee0ee846b22f6386 Mon Sep 17 00:00:00 2001
From: Henning Schild <henning.schild@siemens.com>
Date: Mon, 6 Jun 2022 19:41:37 +0300
Subject: platform/x86: simatic-ipc: drop custom P2SB bar code

The two drivers that used to use this have been switched over to the
common P2SB accessor, so this code is not needed any longer.

Signed-off-by: Henning Schild <henning.schild@siemens.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/platform/x86/simatic-ipc.c                 | 38 ----------------------
 include/linux/platform_data/x86/simatic-ipc-base.h |  2 --
 2 files changed, 40 deletions(-)

(limited to 'include')

diff --git a/drivers/platform/x86/simatic-ipc.c b/drivers/platform/x86/simatic-ipc.c
index b599cda5ba3c..26c35e1660cb 100644
--- a/drivers/platform/x86/simatic-ipc.c
+++ b/drivers/platform/x86/simatic-ipc.c
@@ -101,44 +101,6 @@ static int register_platform_devices(u32 station_id)
 	return 0;
 }
 
-/* FIXME: this should eventually be done with generic P2SB discovery code
- * the individual drivers for watchdogs and LEDs access memory that implements
- * GPIO, but pinctrl will not come up because of missing ACPI entries
- *
- * While there is no conflict a cleaner solution would be to somehow bring up
- * pinctrl even with these ACPI entries missing, and base the drivers on pinctrl.
- * After which the following function could be dropped, together with the code
- * poking the memory.
- */
-/*
- * Get membase address from PCI, used in leds and wdt module. Here we read
- * the bar0. The final address calculation is done in the appropriate modules
- */
-u32 simatic_ipc_get_membase0(unsigned int p2sb)
-{
-	struct pci_bus *bus;
-	u32 bar0 = 0;
-	/*
-	 * The GPIO memory is in bar0 of the hidden P2SB device.
-	 * Unhide the device to have a quick look at it, before we hide it
-	 * again.
-	 * Also grab the pci rescan lock so that device does not get discovered
-	 * and remapped while it is visible.
-	 * This code is inspired by drivers/mfd/lpc_ich.c
-	 */
-	bus = pci_find_bus(0, 0);
-	pci_lock_rescan_remove();
-	pci_bus_write_config_byte(bus, p2sb, 0xE1, 0x0);
-	pci_bus_read_config_dword(bus, p2sb, PCI_BASE_ADDRESS_0, &bar0);
-
-	bar0 &= ~0xf;
-	pci_bus_write_config_byte(bus, p2sb, 0xE1, 0x1);
-	pci_unlock_rescan_remove();
-
-	return bar0;
-}
-EXPORT_SYMBOL(simatic_ipc_get_membase0);
-
 static int __init simatic_ipc_init_module(void)
 {
 	const struct dmi_system_id *match;
diff --git a/include/linux/platform_data/x86/simatic-ipc-base.h b/include/linux/platform_data/x86/simatic-ipc-base.h
index 62d2bc774067..39fefd48cf4d 100644
--- a/include/linux/platform_data/x86/simatic-ipc-base.h
+++ b/include/linux/platform_data/x86/simatic-ipc-base.h
@@ -24,6 +24,4 @@ struct simatic_ipc_platform {
 	u8	devmode;
 };
 
-u32 simatic_ipc_get_membase0(unsigned int p2sb);
-
 #endif /* __PLATFORM_DATA_X86_SIMATIC_IPC_BASE_H */
-- 
cgit 


From 1b870fa5573e260bc74d19f381ab0dd971a8d8e7 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 14 Jul 2022 07:27:31 -0400
Subject: kvm: stats: tell userspace which values are boolean

Some of the statistics values exported by KVM are always only 0 or 1.
It can be useful to export this fact to userspace so that it can track
them specially (for example by polling the value every now and then to
compute a % of time spent in a specific state).

Therefore, add "boolean value" as a new "unit".  While it is not exactly
a unit, it walks and quacks like one.  In particular, using the type
would be wrong because boolean values could be instantaneous or peak
values (e.g. "is the rmap allocated?") or even two-bucket histograms
(e.g. "number of posted vs. non-posted interrupt injections").

Suggested-by: Amneesh Singh <natto@weirdnatto.in>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst |  6 ++++++
 arch/x86/kvm/x86.c             |  2 +-
 include/linux/kvm_host.h       | 11 ++++++++++-
 include/uapi/linux/kvm.h       |  1 +
 4 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 11e00a46c610..48bf6e49a7de 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -5657,6 +5657,7 @@ by a string of size ``name_size``.
 	#define KVM_STATS_UNIT_BYTES		(0x1 << KVM_STATS_UNIT_SHIFT)
 	#define KVM_STATS_UNIT_SECONDS		(0x2 << KVM_STATS_UNIT_SHIFT)
 	#define KVM_STATS_UNIT_CYCLES		(0x3 << KVM_STATS_UNIT_SHIFT)
+	#define KVM_STATS_UNIT_BOOLEAN		(0x4 << KVM_STATS_UNIT_SHIFT)
 	#define KVM_STATS_UNIT_MAX		KVM_STATS_UNIT_CYCLES
 
 	#define KVM_STATS_BASE_SHIFT		8
@@ -5724,6 +5725,11 @@ Bits 4-7 of ``flags`` encode the unit:
     It indicates that the statistics data is used to measure time or latency.
   * ``KVM_STATS_UNIT_CYCLES``
     It indicates that the statistics data is used to measure CPU clock cycles.
+  * ``KVM_STATS_UNIT_BOOLEAN``
+    It indicates that the statistic will always be either 0 or 1.  Boolean
+    statistics of "peak" type will never go back from 1 to 0.  Boolean
+    statistics can be linear histograms (with two buckets) but not logarithmic
+    histograms.
 
 Bits 8-11 of ``flags``, together with ``exponent``, encode the scale of the
 unit:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 26d0cac32f73..0c3e85e8fce9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -298,7 +298,7 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
 	STATS_DESC_COUNTER(VCPU, directed_yield_successful),
 	STATS_DESC_COUNTER(VCPU, preemption_reported),
 	STATS_DESC_COUNTER(VCPU, preemption_other),
-	STATS_DESC_ICOUNTER(VCPU, guest_mode)
+	STATS_DESC_IBOOLEAN(VCPU, guest_mode)
 };
 
 const struct kvm_stats_header kvm_vcpu_stats_header = {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 83cf7fd842e0..90a45ef7203b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1822,6 +1822,15 @@ struct _kvm_stats_desc {
 	STATS_DESC_PEAK(SCOPE, name, KVM_STATS_UNIT_NONE,		       \
 		KVM_STATS_BASE_POW10, 0)
 
+/* Instantaneous boolean value, read only */
+#define STATS_DESC_IBOOLEAN(SCOPE, name)				       \
+	STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BOOLEAN,		       \
+		KVM_STATS_BASE_POW10, 0)
+/* Peak (sticky) boolean value, read/write */
+#define STATS_DESC_PBOOLEAN(SCOPE, name)				       \
+	STATS_DESC_PEAK(SCOPE, name, KVM_STATS_UNIT_BOOLEAN,		       \
+		KVM_STATS_BASE_POW10, 0)
+
 /* Cumulative time in nanosecond */
 #define STATS_DESC_TIME_NSEC(SCOPE, name)				       \
 	STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS,	       \
@@ -1853,7 +1862,7 @@ struct _kvm_stats_desc {
 			HALT_POLL_HIST_COUNT),				       \
 	STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_wait_hist,	       \
 			HALT_POLL_HIST_COUNT),				       \
-	STATS_DESC_ICOUNTER(VCPU_GENERIC, blocking)
+	STATS_DESC_IBOOLEAN(VCPU_GENERIC, blocking)
 
 extern struct dentry *kvm_debugfs_dir;
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 5088bd9f1922..811897dadcae 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -2083,6 +2083,7 @@ struct kvm_stats_header {
 #define KVM_STATS_UNIT_BYTES		(0x1 << KVM_STATS_UNIT_SHIFT)
 #define KVM_STATS_UNIT_SECONDS		(0x2 << KVM_STATS_UNIT_SHIFT)
 #define KVM_STATS_UNIT_CYCLES		(0x3 << KVM_STATS_UNIT_SHIFT)
+#define KVM_STATS_UNIT_BOOLEAN		(0x4 << KVM_STATS_UNIT_SHIFT)
 #define KVM_STATS_UNIT_MAX		KVM_STATS_UNIT_CYCLES
 
 #define KVM_STATS_BASE_SHIFT		8
-- 
cgit 


From 71f28f3136aff5890cd56de78abc673f8393cad9 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Wed, 13 Jul 2022 22:07:10 +0800
Subject: ublk_drv: add io_uring based userspace block driver

This is the driver part of userspace block driver(ublk driver), the other
part is userspace daemon part(ublksrv)[1].

The two parts communicate by io_uring's IORING_OP_URING_CMD with one
shared cmd buffer for storing io command, and the buffer is read only for
ublksrv, each io command is indexed by io request tag directly, and is
written by ublk driver.

For example, when one READ io request is submitted to ublk block driver,
ublk driver stores the io command into cmd buffer first, then completes
one IORING_OP_URING_CMD for notifying ublksrv, and the URING_CMD is issued
to ublk driver beforehand by ublksrv for getting notification of any new
io request, and each URING_CMD is associated with one io request by tag.

After ublksrv gets the io command, it translates and handles the ublk io
request, such as, for the ublk-loop target, ublksrv translates the request
into same request on another file or disk, like the kernel loop block
driver. In ublksrv's implementation, the io is still handled by io_uring,
and share same ring with IORING_OP_URING_CMD command. When the target io
request is done, the same IORING_OP_URING_CMD is issued to ublk driver for
both committing io request result and getting future notification of new
io request.

Another thing done by ublk driver is to copy data between kernel io
request and ublksrv's io buffer:

1) before ubsrv handles WRITE request, copy the request's data into
   ublksrv's userspace io buffer, so that ublksrv can handle the write
   request

2) after ubsrv handles READ request, copy ublksrv's userspace io buffer
   into this READ request, then ublk driver can complete the READ request

Zero copy may be switched if mm is ready to support it.

ublk driver doesn't handle any logic of the specific user space driver,
so it is small/simple enough.

[1] ublksrv

https://github.com/ming1/ubdsrv

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20220713140711.97356-2-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/Kconfig         |    9 +
 drivers/block/Makefile        |    2 +
 drivers/block/ublk_drv.c      | 1530 +++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/ublk_cmd.h |  156 +++++
 4 files changed, 1697 insertions(+)
 create mode 100644 drivers/block/ublk_drv.c
 create mode 100644 include/uapi/linux/ublk_cmd.h

(limited to 'include')

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index fdb81f2794cd..e19fcab016ba 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -408,6 +408,15 @@ config BLK_DEV_RBD
 
 	  If unsure, say N.
 
+config BLK_DEV_UBLK
+	tristate "Userspace block driver (Experimental)"
+	select IO_URING
+	help
+	  io_uring based userspace block driver. Together with ublk server, ublk
+	  has been working well, but interface with userspace or command data
+	  definition isn't finalized yet, and might change according to future
+	  requirement, so mark is as experimental now.
+
 source "drivers/block/rnbd/Kconfig"
 
 endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 934a9c7c3a7c..be631352567e 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -39,4 +39,6 @@ obj-$(CONFIG_BLK_DEV_RNBD)	+= rnbd/
 
 obj-$(CONFIG_BLK_DEV_NULL_BLK)	+= null_blk/
 
+obj-$(CONFIG_BLK_DEV_UBLK)			+= ublk_drv.o
+
 swim_mod-y	:= swim.o swim_asm.o
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
new file mode 100644
index 000000000000..922a84c86fc6
--- /dev/null
+++ b/drivers/block/ublk_drv.c
@@ -0,0 +1,1530 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Userspace block device - block device which IO is handled from userspace
+ *
+ * Take full use of io_uring passthrough command for communicating with
+ * ublk userspace daemon(ublksrvd) for handling basic IO request.
+ *
+ * Copyright 2022 Ming Lei <ming.lei@redhat.com>
+ *
+ * (part of code stolen from loop.c)
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/wait.h>
+#include <linux/blkdev.h>
+#include <linux/init.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <linux/compat.h>
+#include <linux/mutex.h>
+#include <linux/writeback.h>
+#include <linux/completion.h>
+#include <linux/highmem.h>
+#include <linux/sysfs.h>
+#include <linux/miscdevice.h>
+#include <linux/falloc.h>
+#include <linux/uio.h>
+#include <linux/ioprio.h>
+#include <linux/sched/mm.h>
+#include <linux/uaccess.h>
+#include <linux/cdev.h>
+#include <linux/io_uring.h>
+#include <linux/blk-mq.h>
+#include <linux/delay.h>
+#include <linux/mm.h>
+#include <asm/page.h>
+#include <uapi/linux/ublk_cmd.h>
+
+#define UBLK_MINORS		(1U << MINORBITS)
+
+struct ublk_uring_cmd_pdu {
+	struct request *req;
+};
+
+/*
+ * io command is active: sqe cmd is received, and its cqe isn't done
+ *
+ * If the flag is set, the io command is owned by ublk driver, and waited
+ * for incoming blk-mq request from the ublk block device.
+ *
+ * If the flag is cleared, the io command will be completed, and owned by
+ * ublk server.
+ */
+#define UBLK_IO_FLAG_ACTIVE	0x01
+
+/*
+ * IO command is completed via cqe, and it is being handled by ublksrv, and
+ * not committed yet
+ *
+ * Basically exclusively with UBLK_IO_FLAG_ACTIVE, so can be served for
+ * cross verification
+ */
+#define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
+
+/*
+ * IO command is aborted, so this flag is set in case of
+ * !UBLK_IO_FLAG_ACTIVE.
+ *
+ * After this flag is observed, any pending or new incoming request
+ * associated with this io command will be failed immediately
+ */
+#define UBLK_IO_FLAG_ABORTED 0x04
+
+struct ublk_io {
+	/* userspace buffer address from io cmd */
+	__u64	addr;
+	unsigned int flags;
+	int res;
+
+	struct io_uring_cmd *cmd;
+};
+
+struct ublk_queue {
+	int q_id;
+	int q_depth;
+
+	struct task_struct	*ubq_daemon;
+	char *io_cmd_buf;
+
+	unsigned long io_addr;	/* mapped vm address */
+	unsigned int max_io_sz;
+	bool abort_work_pending;
+	unsigned short nr_io_ready;	/* how many ios setup */
+	struct ublk_device *dev;
+	struct ublk_io ios[0];
+};
+
+#define UBLK_DAEMON_MONITOR_PERIOD	(5 * HZ)
+
+struct ublk_device {
+	struct gendisk		*ub_disk;
+	struct request_queue	*ub_queue;
+
+	char	*__queues;
+
+	unsigned short  queue_size;
+	unsigned short  bs_shift;
+	struct ublksrv_ctrl_dev_info	dev_info;
+
+	struct blk_mq_tag_set	tag_set;
+
+	struct cdev		cdev;
+	struct device		cdev_dev;
+
+	atomic_t		ch_open_cnt;
+	int			ub_number;
+
+	struct mutex		mutex;
+
+	struct mm_struct	*mm;
+
+	struct completion	completion;
+	unsigned int		nr_queues_ready;
+	atomic_t		nr_aborted_queues;
+
+	/*
+	 * Our ubq->daemon may be killed without any notification, so
+	 * monitor each queue's daemon periodically
+	 */
+	struct delayed_work	monitor_work;
+	struct work_struct	stop_work;
+};
+
+static dev_t ublk_chr_devt;
+static struct class *ublk_chr_class;
+
+static DEFINE_IDR(ublk_index_idr);
+static DEFINE_SPINLOCK(ublk_idr_lock);
+static wait_queue_head_t ublk_idr_wq;	/* wait until one idr is freed */
+
+static DEFINE_MUTEX(ublk_ctl_mutex);
+
+static struct miscdevice ublk_misc;
+
+static struct ublk_device *ublk_get_device(struct ublk_device *ub)
+{
+	if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
+		return ub;
+	return NULL;
+}
+
+static void ublk_put_device(struct ublk_device *ub)
+{
+	put_device(&ub->cdev_dev);
+}
+
+static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
+		int qid)
+{
+       return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
+}
+
+static inline bool ublk_rq_has_data(const struct request *rq)
+{
+	return rq->bio && bio_has_data(rq->bio);
+}
+
+static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
+		int tag)
+{
+	return (struct ublksrv_io_desc *)
+		&(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]);
+}
+
+static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
+{
+	return ublk_get_queue(ub, q_id)->io_cmd_buf;
+}
+
+static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
+{
+	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
+
+	return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc),
+			PAGE_SIZE);
+}
+
+static int ublk_open(struct block_device *bdev, fmode_t mode)
+{
+	return 0;
+}
+
+static void ublk_release(struct gendisk *disk, fmode_t mode)
+{
+}
+
+static const struct block_device_operations ub_fops = {
+	.owner =	THIS_MODULE,
+	.open =		ublk_open,
+	.release =	ublk_release,
+};
+
+#define UBLK_MAX_PIN_PAGES	32
+
+struct ublk_map_data {
+	const struct ublk_queue *ubq;
+	const struct request *rq;
+	const struct ublk_io *io;
+	unsigned max_bytes;
+};
+
+struct ublk_io_iter {
+	struct page *pages[UBLK_MAX_PIN_PAGES];
+	unsigned pg_off;	/* offset in the 1st page in pages */
+	int nr_pages;		/* how many page pointers in pages */
+	struct bio *bio;
+	struct bvec_iter iter;
+};
+
+static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data,
+		unsigned max_bytes, bool to_vm)
+{
+	const unsigned total = min_t(unsigned, max_bytes,
+			PAGE_SIZE - data->pg_off +
+			((data->nr_pages - 1) << PAGE_SHIFT));
+	unsigned done = 0;
+	unsigned pg_idx = 0;
+
+	while (done < total) {
+		struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
+		const unsigned int bytes = min3(bv.bv_len, total - done,
+				(unsigned)(PAGE_SIZE - data->pg_off));
+		void *bv_buf = bvec_kmap_local(&bv);
+		void *pg_buf = kmap_local_page(data->pages[pg_idx]);
+
+		if (to_vm)
+			memcpy(pg_buf + data->pg_off, bv_buf, bytes);
+		else
+			memcpy(bv_buf, pg_buf + data->pg_off, bytes);
+
+		kunmap_local(pg_buf);
+		kunmap_local(bv_buf);
+
+		/* advance page array */
+		data->pg_off += bytes;
+		if (data->pg_off == PAGE_SIZE) {
+			pg_idx += 1;
+			data->pg_off = 0;
+		}
+
+		done += bytes;
+
+		/* advance bio */
+		bio_advance_iter_single(data->bio, &data->iter, bytes);
+		if (!data->iter.bi_size) {
+			data->bio = data->bio->bi_next;
+			if (data->bio == NULL)
+				break;
+			data->iter = data->bio->bi_iter;
+		}
+	}
+
+	return done;
+}
+
+static inline int ublk_copy_user_pages(struct ublk_map_data *data,
+		bool to_vm)
+{
+	const unsigned int gup_flags = to_vm ? FOLL_WRITE : 0;
+	const unsigned long start_vm = data->io->addr;
+	unsigned int done = 0;
+	struct ublk_io_iter iter = {
+		.pg_off	= start_vm & (PAGE_SIZE - 1),
+		.bio	= data->rq->bio,
+		.iter	= data->rq->bio->bi_iter,
+	};
+	const unsigned int nr_pages = round_up(data->max_bytes +
+			(start_vm & (PAGE_SIZE - 1)), PAGE_SIZE) >> PAGE_SHIFT;
+
+	while (done < nr_pages) {
+		const unsigned to_pin = min_t(unsigned, UBLK_MAX_PIN_PAGES,
+				nr_pages - done);
+		unsigned i, len;
+
+		iter.nr_pages = get_user_pages_fast(start_vm +
+				(done << PAGE_SHIFT), to_pin, gup_flags,
+				iter.pages);
+		if (iter.nr_pages <= 0)
+			return done == 0 ? iter.nr_pages : done;
+		len = ublk_copy_io_pages(&iter, data->max_bytes, to_vm);
+		for (i = 0; i < iter.nr_pages; i++) {
+			if (to_vm)
+				set_page_dirty(iter.pages[i]);
+			put_page(iter.pages[i]);
+		}
+		data->max_bytes -= len;
+		done += iter.nr_pages;
+	}
+
+	return done;
+}
+
+static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
+		struct ublk_io *io)
+{
+	const unsigned int rq_bytes = blk_rq_bytes(req);
+	/*
+	 * no zero copy, we delay copy WRITE request data into ublksrv
+	 * context and the big benefit is that pinning pages in current
+	 * context is pretty fast, see ublk_pin_user_pages
+	 */
+	if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH)
+		return rq_bytes;
+
+	if (ublk_rq_has_data(req)) {
+		struct ublk_map_data data = {
+			.ubq	=	ubq,
+			.rq	=	req,
+			.io	=	io,
+			.max_bytes =	rq_bytes,
+		};
+
+		ublk_copy_user_pages(&data, true);
+
+		return rq_bytes - data.max_bytes;
+	}
+	return rq_bytes;
+}
+
+static int ublk_unmap_io(const struct ublk_queue *ubq,
+		const struct request *req,
+		struct ublk_io *io)
+{
+	const unsigned int rq_bytes = blk_rq_bytes(req);
+
+	if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) {
+		struct ublk_map_data data = {
+			.ubq	=	ubq,
+			.rq	=	req,
+			.io	=	io,
+			.max_bytes =	io->res,
+		};
+
+		WARN_ON_ONCE(io->res > rq_bytes);
+
+		ublk_copy_user_pages(&data, false);
+
+		return io->res - data.max_bytes;
+	}
+	return rq_bytes;
+}
+
+static inline unsigned int ublk_req_build_flags(struct request *req)
+{
+	unsigned flags = 0;
+
+	if (req->cmd_flags & REQ_FAILFAST_DEV)
+		flags |= UBLK_IO_F_FAILFAST_DEV;
+
+	if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
+		flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
+
+	if (req->cmd_flags & REQ_FAILFAST_DRIVER)
+		flags |= UBLK_IO_F_FAILFAST_DRIVER;
+
+	if (req->cmd_flags & REQ_META)
+		flags |= UBLK_IO_F_META;
+
+	if (req->cmd_flags & REQ_INTEGRITY)
+		flags |= UBLK_IO_F_INTEGRITY;
+
+	if (req->cmd_flags & REQ_FUA)
+		flags |= UBLK_IO_F_FUA;
+
+	if (req->cmd_flags & REQ_PREFLUSH)
+		flags |= UBLK_IO_F_PREFLUSH;
+
+	if (req->cmd_flags & REQ_NOUNMAP)
+		flags |= UBLK_IO_F_NOUNMAP;
+
+	if (req->cmd_flags & REQ_SWAP)
+		flags |= UBLK_IO_F_SWAP;
+
+	return flags;
+}
+
+static int ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
+{
+	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
+	struct ublk_io *io = &ubq->ios[req->tag];
+	u32 ublk_op;
+
+	switch (req_op(req)) {
+	case REQ_OP_READ:
+		ublk_op = UBLK_IO_OP_READ;
+		break;
+	case REQ_OP_WRITE:
+		ublk_op = UBLK_IO_OP_WRITE;
+		break;
+	case REQ_OP_FLUSH:
+		ublk_op = UBLK_IO_OP_FLUSH;
+		break;
+	case REQ_OP_DISCARD:
+		ublk_op = UBLK_IO_OP_DISCARD;
+		break;
+	case REQ_OP_WRITE_ZEROES:
+		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
+		break;
+	default:
+		return BLK_STS_IOERR;
+	}
+
+	/* need to translate since kernel may change */
+	iod->op_flags = ublk_op | ublk_req_build_flags(req);
+	iod->nr_sectors = blk_rq_sectors(req);
+	iod->start_sector = blk_rq_pos(req);
+	iod->addr = io->addr;
+
+	return BLK_STS_OK;
+}
+
+static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
+		struct io_uring_cmd *ioucmd)
+{
+	return (struct ublk_uring_cmd_pdu *)&ioucmd->pdu;
+}
+
+static bool ubq_daemon_is_dying(struct ublk_queue *ubq)
+{
+	return ubq->ubq_daemon->flags & PF_EXITING;
+}
+
+/* todo: handle partial completion */
+static void ublk_complete_rq(struct request *req)
+{
+	struct ublk_queue *ubq = req->mq_hctx->driver_data;
+	struct ublk_io *io = &ubq->ios[req->tag];
+	unsigned int unmapped_bytes;
+
+	/* failed read IO if nothing is read */
+	if (!io->res && req_op(req) == REQ_OP_READ)
+		io->res = -EIO;
+
+	if (io->res < 0) {
+		blk_mq_end_request(req, errno_to_blk_status(io->res));
+		return;
+	}
+
+	/*
+	 * FLUSH or DISCARD usually won't return bytes returned, so end them
+	 * directly.
+	 *
+	 * Both the two needn't unmap.
+	 */
+	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) {
+		blk_mq_end_request(req, BLK_STS_OK);
+		return;
+	}
+
+	/* for READ request, writing data in iod->addr to rq buffers */
+	unmapped_bytes = ublk_unmap_io(ubq, req, io);
+
+	/*
+	 * Extremely impossible since we got data filled in just before
+	 *
+	 * Re-read simply for this unlikely case.
+	 */
+	if (unlikely(unmapped_bytes < io->res))
+		io->res = unmapped_bytes;
+
+	if (blk_update_request(req, BLK_STS_OK, io->res))
+		blk_mq_requeue_request(req, true);
+	else
+		__blk_mq_end_request(req, BLK_STS_OK);
+}
+
+/*
+ * __ublk_fail_req() may be called from abort context or ->ubq_daemon
+ * context during exiting, so lock is required.
+ *
+ * Also aborting may not be started yet, keep in mind that one failed
+ * request may be issued by block layer again.
+ */
+static void __ublk_fail_req(struct ublk_io *io, struct request *req)
+{
+	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
+
+	if (!(io->flags & UBLK_IO_FLAG_ABORTED)) {
+		io->flags |= UBLK_IO_FLAG_ABORTED;
+		blk_mq_end_request(req, BLK_STS_IOERR);
+	}
+}
+
+#define UBLK_REQUEUE_DELAY_MS	3
+
+static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd)
+{
+	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+	struct ublk_device *ub = cmd->file->private_data;
+	struct request *req = pdu->req;
+	struct ublk_queue *ubq = req->mq_hctx->driver_data;
+	int tag = req->tag;
+	struct ublk_io *io = &ubq->ios[tag];
+	bool task_exiting = current != ubq->ubq_daemon ||
+		(current->flags & PF_EXITING);
+	unsigned int mapped_bytes;
+
+	pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
+			__func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
+			ublk_get_iod(ubq, req->tag)->addr);
+
+	if (unlikely(task_exiting)) {
+		blk_mq_end_request(req, BLK_STS_IOERR);
+		mod_delayed_work(system_wq, &ub->monitor_work, 0);
+		return;
+	}
+
+	mapped_bytes = ublk_map_io(ubq, req, io);
+
+	/* partially mapped, update io descriptor */
+	if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
+		/*
+		 * Nothing mapped, retry until we succeed.
+		 *
+		 * We may never succeed in mapping any bytes here because
+		 * of OOM. TODO: reserve one buffer with single page pinned
+		 * for providing forward progress guarantee.
+		 */
+		if (unlikely(!mapped_bytes)) {
+			blk_mq_requeue_request(req, false);
+			blk_mq_delay_kick_requeue_list(req->q,
+					UBLK_REQUEUE_DELAY_MS);
+			return;
+		}
+
+		ublk_get_iod(ubq, req->tag)->nr_sectors =
+			mapped_bytes >> 9;
+	}
+
+	/* mark this cmd owned by ublksrv */
+	io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
+
+	/*
+	 * clear ACTIVE since we are done with this sqe/cmd slot
+	 * We can only accept io cmd in case of being not active.
+	 */
+	io->flags &= ~UBLK_IO_FLAG_ACTIVE;
+
+	/* tell ublksrv one io request is coming */
+	io_uring_cmd_done(io->cmd, UBLK_IO_RES_OK, 0);
+}
+
+static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
+		const struct blk_mq_queue_data *bd)
+{
+	struct ublk_queue *ubq = hctx->driver_data;
+	struct request *rq = bd->rq;
+	struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
+	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+	blk_status_t res;
+
+	/* fill iod to slot in io cmd buffer */
+	res = ublk_setup_iod(ubq, rq);
+	if (unlikely(res != BLK_STS_OK))
+		return BLK_STS_IOERR;
+
+	blk_mq_start_request(bd->rq);
+
+	if (unlikely(ubq_daemon_is_dying(ubq))) {
+		mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0);
+		return BLK_STS_IOERR;
+	}
+
+	pdu->req = rq;
+	io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
+
+	return BLK_STS_OK;
+}
+
+
+static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
+		unsigned int hctx_idx)
+{
+	struct ublk_device *ub = hctx->queue->queuedata;
+	struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
+
+	hctx->driver_data = ubq;
+	return 0;
+}
+
+static const struct blk_mq_ops ublk_mq_ops = {
+	.queue_rq       = ublk_queue_rq,
+	.init_hctx	= ublk_init_hctx,
+};
+
+static int ublk_ch_open(struct inode *inode, struct file *filp)
+{
+	struct ublk_device *ub = container_of(inode->i_cdev,
+			struct ublk_device, cdev);
+
+	if (atomic_cmpxchg(&ub->ch_open_cnt, 0, 1) == 0) {
+		filp->private_data = ub;
+		return 0;
+	}
+	return -EBUSY;
+}
+
+static int ublk_ch_release(struct inode *inode, struct file *filp)
+{
+	struct ublk_device *ub = filp->private_data;
+
+	while (atomic_cmpxchg(&ub->ch_open_cnt, 1, 0) != 1)
+		cpu_relax();
+
+	filp->private_data = NULL;
+	return 0;
+}
+
+/* map pre-allocated per-queue cmd buffer to ublksrv daemon */
+static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct ublk_device *ub = filp->private_data;
+	size_t sz = vma->vm_end - vma->vm_start;
+	unsigned max_sz = UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc);
+	unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
+	int q_id, ret = 0;
+
+	mutex_lock(&ub->mutex);
+	if (!ub->mm)
+		ub->mm = current->mm;
+	if (current->mm != ub->mm)
+		ret = -EINVAL;
+	mutex_unlock(&ub->mutex);
+
+	if (ret)
+		return ret;
+
+	if (vma->vm_flags & VM_WRITE)
+		return -EPERM;
+
+	end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
+	if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
+		return -EINVAL;
+
+	q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
+	pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
+			__func__, q_id, current->pid, vma->vm_start,
+			phys_off, (unsigned long)sz);
+
+	if (sz != ublk_queue_cmd_buf_size(ub, q_id))
+		return -EINVAL;
+
+	pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
+	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
+}
+
+static void ublk_commit_completion(struct ublk_device *ub,
+		struct ublksrv_io_cmd *ub_cmd)
+{
+	u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
+	struct ublk_queue *ubq = ublk_get_queue(ub, qid);
+	struct ublk_io *io = &ubq->ios[tag];
+	struct request *req;
+
+	/* now this cmd slot is owned by nbd driver */
+	io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
+	io->res = ub_cmd->result;
+
+	/* find the io request and complete */
+	req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
+
+	if (req && likely(!blk_should_fake_timeout(req->q)))
+		ublk_complete_rq(req);
+}
+
+/*
+ * When ->ubq_daemon is exiting, either new request is ended immediately,
+ * or any queued io command is drained, so it is safe to abort queue
+ * lockless
+ */
+static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
+{
+	int i;
+
+	if (!ublk_get_device(ub))
+		return;
+
+	for (i = 0; i < ubq->q_depth; i++) {
+		struct ublk_io *io = &ubq->ios[i];
+
+		if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
+			struct request *rq;
+
+			/*
+			 * Either we fail the request or ublk_rq_task_work_fn
+			 * will do it
+			 */
+			rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
+			if (rq)
+				__ublk_fail_req(io, rq);
+		}
+	}
+	ublk_put_device(ub);
+}
+
+static void ublk_daemon_monitor_work(struct work_struct *work)
+{
+	struct ublk_device *ub =
+		container_of(work, struct ublk_device, monitor_work.work);
+	int i;
+
+	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+		struct ublk_queue *ubq = ublk_get_queue(ub, i);
+
+		if (ubq_daemon_is_dying(ubq)) {
+			schedule_work(&ub->stop_work);
+
+			/* abort queue is for making forward progress */
+			ublk_abort_queue(ub, ubq);
+		}
+	}
+
+	/*
+	 * We can't schedule monitor work after ublk_remove() is started.
+	 *
+	 * No need ub->mutex, monitor work are canceled after state is marked
+	 * as DEAD, so DEAD state is observed reliably.
+	 */
+	if (ub->dev_info.state != UBLK_S_DEV_DEAD)
+		schedule_delayed_work(&ub->monitor_work,
+				UBLK_DAEMON_MONITOR_PERIOD);
+}
+
+static void ublk_cancel_queue(struct ublk_queue *ubq)
+{
+	int i;
+
+	for (i = 0; i < ubq->q_depth; i++) {
+		struct ublk_io *io = &ubq->ios[i];
+
+		if (io->flags & UBLK_IO_FLAG_ACTIVE)
+			io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0);
+	}
+}
+
+/* Cancel all pending commands, must be called after del_gendisk() returns */
+static void ublk_cancel_dev(struct ublk_device *ub)
+{
+	int i;
+
+	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
+		ublk_cancel_queue(ublk_get_queue(ub, i));
+}
+
+static void ublk_stop_dev(struct ublk_device *ub)
+{
+	mutex_lock(&ub->mutex);
+	if (!disk_live(ub->ub_disk))
+		goto unlock;
+
+	del_gendisk(ub->ub_disk);
+	ub->dev_info.state = UBLK_S_DEV_DEAD;
+	ub->dev_info.ublksrv_pid = -1;
+	ublk_cancel_dev(ub);
+ unlock:
+	mutex_unlock(&ub->mutex);
+	cancel_delayed_work_sync(&ub->monitor_work);
+}
+
+static int ublk_ctrl_stop_dev(struct ublk_device *ub)
+{
+	ublk_stop_dev(ub);
+	cancel_work_sync(&ub->stop_work);
+	return 0;
+}
+
+static inline bool ublk_queue_ready(struct ublk_queue *ubq)
+{
+	return ubq->nr_io_ready == ubq->q_depth;
+}
+
+/* device can only be started after all IOs are ready */
+static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
+{
+	mutex_lock(&ub->mutex);
+	ubq->nr_io_ready++;
+	if (ublk_queue_ready(ubq)) {
+		ubq->ubq_daemon = current;
+		get_task_struct(ubq->ubq_daemon);
+		ub->nr_queues_ready++;
+	}
+	if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues)
+		complete_all(&ub->completion);
+	mutex_unlock(&ub->mutex);
+}
+
+static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+	struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd;
+	struct ublk_device *ub = cmd->file->private_data;
+	struct ublk_queue *ubq;
+	struct ublk_io *io;
+	u32 cmd_op = cmd->cmd_op;
+	unsigned tag = ub_cmd->tag;
+	int ret = -EINVAL;
+
+	pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
+			__func__, cmd->cmd_op, ub_cmd->q_id, tag,
+			ub_cmd->result);
+
+	if (!(issue_flags & IO_URING_F_SQE128))
+		goto out;
+
+	if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
+		goto out;
+
+	ubq = ublk_get_queue(ub, ub_cmd->q_id);
+	if (!ubq || ub_cmd->q_id != ubq->q_id)
+		goto out;
+
+	if (ubq->ubq_daemon && ubq->ubq_daemon != current)
+		goto out;
+
+	if (tag >= ubq->q_depth)
+		goto out;
+
+	io = &ubq->ios[tag];
+
+	/* there is pending io cmd, something must be wrong */
+	if (io->flags & UBLK_IO_FLAG_ACTIVE) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	switch (cmd_op) {
+	case UBLK_IO_FETCH_REQ:
+		/* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
+		if (ublk_queue_ready(ubq)) {
+			ret = -EBUSY;
+			goto out;
+		}
+		/*
+		 * The io is being handled by server, so COMMIT_RQ is expected
+		 * instead of FETCH_REQ
+		 */
+		if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
+			goto out;
+		/* FETCH_RQ has to provide IO buffer */
+		if (!ub_cmd->addr)
+			goto out;
+		io->cmd = cmd;
+		io->flags |= UBLK_IO_FLAG_ACTIVE;
+		io->addr = ub_cmd->addr;
+
+		ublk_mark_io_ready(ub, ubq);
+		break;
+	case UBLK_IO_COMMIT_AND_FETCH_REQ:
+		/* FETCH_RQ has to provide IO buffer */
+		if (!ub_cmd->addr)
+			goto out;
+		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
+			goto out;
+		io->addr = ub_cmd->addr;
+		io->flags |= UBLK_IO_FLAG_ACTIVE;
+		io->cmd = cmd;
+		ublk_commit_completion(ub, ub_cmd);
+		break;
+	default:
+		goto out;
+	}
+	return -EIOCBQUEUED;
+
+ out:
+	io->flags &= ~UBLK_IO_FLAG_ACTIVE;
+	io_uring_cmd_done(cmd, ret, 0);
+	pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
+			__func__, cmd_op, tag, ret, io->flags);
+	return -EIOCBQUEUED;
+}
+
+static const struct file_operations ublk_ch_fops = {
+	.owner = THIS_MODULE,
+	.open = ublk_ch_open,
+	.release = ublk_ch_release,
+	.llseek = no_llseek,
+	.uring_cmd = ublk_ch_uring_cmd,
+	.mmap = ublk_ch_mmap,
+};
+
+static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
+{
+	int size = ublk_queue_cmd_buf_size(ub, q_id);
+	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
+
+	if (ubq->ubq_daemon)
+		put_task_struct(ubq->ubq_daemon);
+	if (ubq->io_cmd_buf)
+		free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
+}
+
+static int ublk_init_queue(struct ublk_device *ub, int q_id)
+{
+	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
+	void *ptr;
+	int size;
+
+	ubq->q_id = q_id;
+	ubq->q_depth = ub->dev_info.queue_depth;
+	size = ublk_queue_cmd_buf_size(ub, q_id);
+
+	ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
+	if (!ptr)
+		return -ENOMEM;
+
+	ubq->io_cmd_buf = ptr;
+	ubq->dev = ub;
+	return 0;
+}
+
+static void ublk_deinit_queues(struct ublk_device *ub)
+{
+	int nr_queues = ub->dev_info.nr_hw_queues;
+	int i;
+
+	if (!ub->__queues)
+		return;
+
+	for (i = 0; i < nr_queues; i++)
+		ublk_deinit_queue(ub, i);
+	kfree(ub->__queues);
+}
+
+static int ublk_init_queues(struct ublk_device *ub)
+{
+	int nr_queues = ub->dev_info.nr_hw_queues;
+	int depth = ub->dev_info.queue_depth;
+	int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io);
+	int i, ret = -ENOMEM;
+
+	ub->queue_size = ubq_size;
+	ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL);
+	if (!ub->__queues)
+		return ret;
+
+	for (i = 0; i < nr_queues; i++) {
+		if (ublk_init_queue(ub, i))
+			goto fail;
+	}
+
+	init_completion(&ub->completion);
+	return 0;
+
+ fail:
+	ublk_deinit_queues(ub);
+	return ret;
+}
+
+static int __ublk_alloc_dev_number(struct ublk_device *ub, int idx)
+{
+	int i = idx;
+	int err;
+
+	spin_lock(&ublk_idr_lock);
+	/* allocate id, if @id >= 0, we're requesting that specific id */
+	if (i >= 0) {
+		err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
+		if (err == -ENOSPC)
+			err = -EEXIST;
+	} else {
+		err = idr_alloc(&ublk_index_idr, ub, 0, 0, GFP_NOWAIT);
+	}
+	spin_unlock(&ublk_idr_lock);
+
+	if (err >= 0)
+		ub->ub_number = err;
+
+	return err;
+}
+
+static struct ublk_device *__ublk_create_dev(int idx)
+{
+	struct ublk_device *ub = NULL;
+	int ret;
+
+	ub = kzalloc(sizeof(*ub), GFP_KERNEL);
+	if (!ub)
+		return ERR_PTR(-ENOMEM);
+
+	ret = __ublk_alloc_dev_number(ub, idx);
+	if (ret < 0) {
+		kfree(ub);
+		return ERR_PTR(ret);
+	}
+	return ub;
+}
+
+static void __ublk_destroy_dev(struct ublk_device *ub)
+{
+	spin_lock(&ublk_idr_lock);
+	idr_remove(&ublk_index_idr, ub->ub_number);
+	wake_up_all(&ublk_idr_wq);
+	spin_unlock(&ublk_idr_lock);
+
+	mutex_destroy(&ub->mutex);
+
+	kfree(ub);
+}
+
+static void ublk_cdev_rel(struct device *dev)
+{
+	struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
+
+	put_disk(ub->ub_disk);
+
+	blk_mq_free_tag_set(&ub->tag_set);
+
+	ublk_deinit_queues(ub);
+
+	__ublk_destroy_dev(ub);
+}
+
+static int ublk_add_chdev(struct ublk_device *ub)
+{
+	struct device *dev = &ub->cdev_dev;
+	int minor = ub->ub_number;
+	int ret;
+
+	dev->parent = ublk_misc.this_device;
+	dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
+	dev->class = ublk_chr_class;
+	dev->release = ublk_cdev_rel;
+	device_initialize(dev);
+
+	ret = dev_set_name(dev, "ublkc%d", minor);
+	if (ret)
+		goto fail;
+
+	cdev_init(&ub->cdev, &ublk_ch_fops);
+	ret = cdev_device_add(&ub->cdev, dev);
+	if (ret)
+		goto fail;
+	return 0;
+ fail:
+	put_device(dev);
+	return ret;
+}
+
+static void ublk_stop_work_fn(struct work_struct *work)
+{
+	struct ublk_device *ub =
+		container_of(work, struct ublk_device, stop_work);
+
+	ublk_stop_dev(ub);
+}
+
+static void ublk_update_capacity(struct ublk_device *ub)
+{
+	unsigned int max_rq_bytes;
+
+	/* make max request buffer size aligned with PAGE_SIZE */
+	max_rq_bytes = round_down(ub->dev_info.rq_max_blocks <<
+			ub->bs_shift, PAGE_SIZE);
+	ub->dev_info.rq_max_blocks = max_rq_bytes >> ub->bs_shift;
+
+	set_capacity(ub->ub_disk, ub->dev_info.dev_blocks << (ub->bs_shift - 9));
+}
+
+/* add disk & cdev, cleanup everything in case of failure */
+static int ublk_add_dev(struct ublk_device *ub)
+{
+	struct gendisk *disk;
+	int err = -ENOMEM;
+	int bsize;
+
+	/* We are not ready to support zero copy */
+	ub->dev_info.flags[0] &= ~UBLK_F_SUPPORT_ZERO_COPY;
+
+	bsize = ub->dev_info.block_size;
+	ub->bs_shift = ilog2(bsize);
+
+	ub->dev_info.nr_hw_queues = min_t(unsigned int,
+			ub->dev_info.nr_hw_queues, nr_cpu_ids);
+
+	INIT_WORK(&ub->stop_work, ublk_stop_work_fn);
+	INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work);
+
+	if (ublk_init_queues(ub))
+		goto out_destroy_dev;
+
+	ub->tag_set.ops = &ublk_mq_ops;
+	ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
+	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
+	ub->tag_set.numa_node = NUMA_NO_NODE;
+	ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+	ub->tag_set.driver_data = ub;
+
+	err = blk_mq_alloc_tag_set(&ub->tag_set);
+	if (err)
+		goto out_deinit_queues;
+
+	disk = ub->ub_disk = blk_mq_alloc_disk(&ub->tag_set, ub);
+	if (IS_ERR(disk)) {
+		err = PTR_ERR(disk);
+		goto out_cleanup_tags;
+	}
+	ub->ub_queue = ub->ub_disk->queue;
+
+	ub->ub_queue->queuedata = ub;
+
+	blk_queue_logical_block_size(ub->ub_queue, bsize);
+	blk_queue_physical_block_size(ub->ub_queue, bsize);
+	blk_queue_io_min(ub->ub_queue, bsize);
+
+	blk_queue_max_hw_sectors(ub->ub_queue, ub->dev_info.rq_max_blocks <<
+			(ub->bs_shift - 9));
+
+	ub->ub_queue->limits.discard_granularity = PAGE_SIZE;
+
+	blk_queue_max_discard_sectors(ub->ub_queue, UINT_MAX >> 9);
+	blk_queue_max_write_zeroes_sectors(ub->ub_queue, UINT_MAX >> 9);
+
+	ublk_update_capacity(ub);
+
+	disk->fops		= &ub_fops;
+	disk->private_data	= ub;
+	disk->queue		= ub->ub_queue;
+	sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
+
+	mutex_init(&ub->mutex);
+
+	/* add char dev so that ublksrv daemon can be setup */
+	err = ublk_add_chdev(ub);
+	if (err)
+		return err;
+
+	/* don't expose disk now until we got start command from cdev */
+
+	return 0;
+
+out_cleanup_tags:
+	blk_mq_free_tag_set(&ub->tag_set);
+out_deinit_queues:
+	ublk_deinit_queues(ub);
+out_destroy_dev:
+	__ublk_destroy_dev(ub);
+	return err;
+}
+
+static void ublk_remove(struct ublk_device *ub)
+{
+	ublk_ctrl_stop_dev(ub);
+
+	cdev_device_del(&ub->cdev, &ub->cdev_dev);
+	put_device(&ub->cdev_dev);
+}
+
+static struct ublk_device *ublk_get_device_from_id(int idx)
+{
+	struct ublk_device *ub = NULL;
+
+	if (idx < 0)
+		return NULL;
+
+	spin_lock(&ublk_idr_lock);
+	ub = idr_find(&ublk_index_idr, idx);
+	if (ub)
+		ub = ublk_get_device(ub);
+	spin_unlock(&ublk_idr_lock);
+
+	return ub;
+}
+
+static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	int ret = -EINVAL;
+	int ublksrv_pid = (int)header->data[0];
+	unsigned long dev_blocks = header->data[1];
+
+	if (ublksrv_pid <= 0)
+		return ret;
+
+	wait_for_completion_interruptible(&ub->completion);
+
+	schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD);
+
+	mutex_lock(&ub->mutex);
+	if (!disk_live(ub->ub_disk)) {
+		/* We may get disk size updated */
+		if (dev_blocks) {
+			ub->dev_info.dev_blocks = dev_blocks;
+			ublk_update_capacity(ub);
+		}
+		ub->dev_info.ublksrv_pid = ublksrv_pid;
+		ret = add_disk(ub->ub_disk);
+		if (!ret)
+			ub->dev_info.state = UBLK_S_DEV_LIVE;
+	} else {
+		ret = -EEXIST;
+	}
+	mutex_unlock(&ub->mutex);
+
+	return ret;
+}
+
+static struct blk_mq_hw_ctx *ublk_get_hw_queue(struct ublk_device *ub,
+		unsigned int index)
+{
+	struct blk_mq_hw_ctx *hctx;
+	unsigned long i;
+
+	queue_for_each_hw_ctx(ub->ub_queue, hctx, i)
+		if (hctx->queue_num == index)
+			return hctx;
+	return NULL;
+}
+
+static int ublk_ctrl_get_queue_affinity(struct io_uring_cmd *cmd)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	void __user *argp = (void __user *)(unsigned long)header->addr;
+	struct blk_mq_hw_ctx *hctx;
+	struct ublk_device *ub;
+	unsigned long queue;
+	unsigned int retlen;
+	int ret;
+
+	ub = ublk_get_device_from_id(header->dev_id);
+	if (!ub)
+		goto out;
+
+	ret = -EINVAL;
+	queue = header->data[0];
+	if (queue >= ub->dev_info.nr_hw_queues)
+		goto out;
+	hctx = ublk_get_hw_queue(ub, queue);
+	if (!hctx)
+		goto out;
+
+	retlen = min_t(unsigned short, header->len, cpumask_size());
+	if (copy_to_user(argp, hctx->cpumask, retlen)) {
+		ret = -EFAULT;
+		goto out;
+	}
+	if (retlen != header->len) {
+		if (clear_user(argp + retlen, header->len - retlen)) {
+			ret = -EFAULT;
+			goto out;
+		}
+	}
+	ret = 0;
+ out:
+	if (ub)
+		ublk_put_device(ub);
+	return ret;
+}
+
+static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_dev_info *info,
+		void __user *argp, int idx)
+{
+	struct ublk_device *ub;
+	int ret;
+
+	ret = mutex_lock_killable(&ublk_ctl_mutex);
+	if (ret)
+		return ret;
+
+	ub = __ublk_create_dev(idx);
+	if (!IS_ERR_OR_NULL(ub)) {
+		memcpy(&ub->dev_info, info, sizeof(*info));
+
+		/* update device id */
+		ub->dev_info.dev_id = ub->ub_number;
+
+		ret = ublk_add_dev(ub);
+		if (!ret) {
+			if (copy_to_user(argp, &ub->dev_info, sizeof(*info))) {
+				ublk_remove(ub);
+				ret = -EFAULT;
+			}
+		}
+	} else {
+		if (IS_ERR(ub))
+			ret = PTR_ERR(ub);
+		else
+			ret = -ENOMEM;
+	}
+	mutex_unlock(&ublk_ctl_mutex);
+
+	return ret;
+}
+
+static inline bool ublk_idr_freed(int id)
+{
+	void *ptr;
+
+	spin_lock(&ublk_idr_lock);
+	ptr = idr_find(&ublk_index_idr, id);
+	spin_unlock(&ublk_idr_lock);
+
+	return ptr == NULL;
+}
+
+static int ublk_ctrl_del_dev(int idx)
+{
+	struct ublk_device *ub;
+	int ret;
+
+	ret = mutex_lock_killable(&ublk_ctl_mutex);
+	if (ret)
+		return ret;
+
+	ub = ublk_get_device_from_id(idx);
+	if (ub) {
+		ublk_remove(ub);
+		ublk_put_device(ub);
+		ret = 0;
+	} else {
+		ret = -ENODEV;
+	}
+
+	/*
+	 * Wait until the idr is removed, then it can be reused after
+	 * DEL_DEV command is returned.
+	 */
+	if (!ret)
+		wait_event(ublk_idr_wq, ublk_idr_freed(idx));
+	mutex_unlock(&ublk_ctl_mutex);
+
+	return ret;
+}
+
+
+static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
+{
+	pr_devel("%s: dev id %d flags %llx\n", __func__,
+			info->dev_id, info->flags[0]);
+	pr_devel("\t nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n",
+			info->nr_hw_queues, info->queue_depth,
+			info->block_size, info->dev_blocks);
+}
+
+static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+
+	pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
+			__func__, cmd->cmd_op, header->dev_id, header->queue_id,
+			header->data[0], header->addr, header->len);
+}
+
+static int ublk_ctrl_cmd_validate(struct io_uring_cmd *cmd,
+		struct ublksrv_ctrl_dev_info *info)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	u32 cmd_op = cmd->cmd_op;
+	void __user *argp = (void __user *)(unsigned long)header->addr;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	switch (cmd_op) {
+	case UBLK_CMD_GET_DEV_INFO:
+		if (header->len < sizeof(*info) || !header->addr)
+			return -EINVAL;
+		break;
+	case UBLK_CMD_ADD_DEV:
+		if (header->len < sizeof(*info) || !header->addr)
+			return -EINVAL;
+		if (copy_from_user(info, argp, sizeof(*info)) != 0)
+			return -EFAULT;
+		ublk_dump_dev_info(info);
+		if (header->dev_id != info->dev_id) {
+			printk(KERN_WARNING "%s: cmd %x, dev id not match %u %u\n",
+					__func__, cmd_op, header->dev_id,
+					info->dev_id);
+			return -EINVAL;
+		}
+		if (header->queue_id != (u16)-1) {
+			printk(KERN_WARNING "%s: cmd %x queue_id is wrong %x\n",
+					__func__, cmd_op, header->queue_id);
+			return -EINVAL;
+		}
+		break;
+	case UBLK_CMD_GET_QUEUE_AFFINITY:
+		if ((header->len * BITS_PER_BYTE) < nr_cpu_ids)
+			return -EINVAL;
+		if (header->len & (sizeof(unsigned long)-1))
+			return -EINVAL;
+		if (!header->addr)
+			return -EINVAL;
+	};
+
+	return 0;
+}
+
+static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
+		unsigned int issue_flags)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	void __user *argp = (void __user *)(unsigned long)header->addr;
+	struct ublksrv_ctrl_dev_info info;
+	u32 cmd_op = cmd->cmd_op;
+	struct ublk_device *ub;
+	int ret = -EINVAL;
+
+	ublk_ctrl_cmd_dump(cmd);
+
+	if (!(issue_flags & IO_URING_F_SQE128))
+		goto out;
+
+	ret = ublk_ctrl_cmd_validate(cmd, &info);
+	if (ret)
+		goto out;
+
+	ret = -ENODEV;
+	switch (cmd_op) {
+	case UBLK_CMD_START_DEV:
+		ub = ublk_get_device_from_id(header->dev_id);
+		if (ub) {
+			ret = ublk_ctrl_start_dev(ub, cmd);
+			ublk_put_device(ub);
+		}
+		break;
+	case UBLK_CMD_STOP_DEV:
+		ub = ublk_get_device_from_id(header->dev_id);
+		if (ub) {
+			ret = ublk_ctrl_stop_dev(ub);
+			ublk_put_device(ub);
+		}
+		break;
+	case UBLK_CMD_GET_DEV_INFO:
+		ub = ublk_get_device_from_id(header->dev_id);
+		if (ub) {
+			if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
+				ret = -EFAULT;
+			else
+				ret = 0;
+			ublk_put_device(ub);
+		}
+		break;
+	case UBLK_CMD_ADD_DEV:
+		ret = ublk_ctrl_add_dev(&info, argp, header->dev_id);
+		break;
+	case UBLK_CMD_DEL_DEV:
+		ret = ublk_ctrl_del_dev(header->dev_id);
+		break;
+	case UBLK_CMD_GET_QUEUE_AFFINITY:
+		ret = ublk_ctrl_get_queue_affinity(cmd);
+		break;
+	default:
+		break;
+	};
+ out:
+	io_uring_cmd_done(cmd, ret, 0);
+	pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
+			__func__, ret, cmd->cmd_op, header->dev_id, header->queue_id);
+	return -EIOCBQUEUED;
+}
+
+static const struct file_operations ublk_ctl_fops = {
+	.open		= nonseekable_open,
+	.uring_cmd      = ublk_ctrl_uring_cmd,
+	.owner		= THIS_MODULE,
+	.llseek		= noop_llseek,
+};
+
+static struct miscdevice ublk_misc = {
+	.minor		= MISC_DYNAMIC_MINOR,
+	.name		= "ublk-control",
+	.fops		= &ublk_ctl_fops,
+};
+
+static int __init ublk_init(void)
+{
+	int ret;
+
+	init_waitqueue_head(&ublk_idr_wq);
+
+	ret = misc_register(&ublk_misc);
+	if (ret)
+		return ret;
+
+	ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
+	if (ret)
+		goto unregister_mis;
+
+	ublk_chr_class = class_create(THIS_MODULE, "ublk-char");
+	if (IS_ERR(ublk_chr_class)) {
+		ret = PTR_ERR(ublk_chr_class);
+		goto free_chrdev_region;
+	}
+	return 0;
+
+free_chrdev_region:
+	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
+unregister_mis:
+	misc_deregister(&ublk_misc);
+	return ret;
+}
+
+static void __exit ublk_exit(void)
+{
+	struct ublk_device *ub;
+	int id;
+
+	class_destroy(ublk_chr_class);
+
+	misc_deregister(&ublk_misc);
+
+	idr_for_each_entry(&ublk_index_idr, ub, id)
+		ublk_remove(ub);
+
+	idr_destroy(&ublk_index_idr);
+	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
+}
+
+module_init(ublk_init);
+module_exit(ublk_exit);
+
+MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
new file mode 100644
index 000000000000..4f0c16ec875e
--- /dev/null
+++ b/include/uapi/linux/ublk_cmd.h
@@ -0,0 +1,156 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef USER_BLK_DRV_CMD_INC_H
+#define USER_BLK_DRV_CMD_INC_H
+
+#include <linux/types.h>
+
+/* ublk server command definition */
+
+/*
+ * Admin commands, issued by ublk server, and handled by ublk driver.
+ */
+#define	UBLK_CMD_GET_QUEUE_AFFINITY	0x01
+#define	UBLK_CMD_GET_DEV_INFO	0x02
+#define	UBLK_CMD_ADD_DEV		0x04
+#define	UBLK_CMD_DEL_DEV		0x05
+#define	UBLK_CMD_START_DEV	0x06
+#define	UBLK_CMD_STOP_DEV	0x07
+
+/*
+ * IO commands, issued by ublk server, and handled by ublk driver.
+ *
+ * FETCH_REQ: issued via sqe(URING_CMD) beforehand for fetching IO request
+ *      from ublk driver, should be issued only when starting device. After
+ *      the associated cqe is returned, request's tag can be retrieved via
+ *      cqe->userdata.
+ *
+ * COMMIT_AND_FETCH_REQ: issued via sqe(URING_CMD) after ublkserver handled
+ *      this IO request, request's handling result is committed to ublk
+ *      driver, meantime FETCH_REQ is piggyback, and FETCH_REQ has to be
+ *      handled before completing io request.
+ */
+#define	UBLK_IO_FETCH_REQ		0x20
+#define	UBLK_IO_COMMIT_AND_FETCH_REQ	0x21
+
+/* only ABORT means that no re-fetch */
+#define UBLK_IO_RES_OK			0
+#define UBLK_IO_RES_ABORT		(-ENODEV)
+
+#define UBLKSRV_CMD_BUF_OFFSET	0
+#define UBLKSRV_IO_BUF_OFFSET	0x80000000
+
+/* tag bit is 12bit, so at most 4096 IOs for each queue */
+#define UBLK_MAX_QUEUE_DEPTH	4096
+
+/*
+ * zero copy requires 4k block size, and can remap ublk driver's io
+ * request into ublksrv's vm space
+ */
+#define UBLK_F_SUPPORT_ZERO_COPY	(1UL << 0)
+
+/* device state */
+#define UBLK_S_DEV_DEAD	0
+#define UBLK_S_DEV_LIVE	1
+
+/* shipped via sqe->cmd of io_uring command */
+struct ublksrv_ctrl_cmd {
+	/* sent to which device, must be valid */
+	__u32	dev_id;
+
+	/* sent to which queue, must be -1 if the cmd isn't for queue */
+	__u16	queue_id;
+	/*
+	 * cmd specific buffer, can be IN or OUT.
+	 */
+	__u16	len;
+	__u64	addr;
+
+	/* inline data */
+	__u64	data[2];
+};
+
+struct ublksrv_ctrl_dev_info {
+	__u16	nr_hw_queues;
+	__u16	queue_depth;
+	__u16	block_size;
+	__u16	state;
+
+	__u32	rq_max_blocks;
+	__u32	dev_id;
+
+	__u64   dev_blocks;
+
+	__s32	ublksrv_pid;
+	__s32	reserved0;
+	__u64	flags[2];
+
+	/* For ublksrv internal use, invisible to ublk driver */
+	__u64	ublksrv_flags;
+	__u64	reserved1[9];
+};
+
+#define		UBLK_IO_OP_READ		0
+#define		UBLK_IO_OP_WRITE		1
+#define		UBLK_IO_OP_FLUSH		2
+#define		UBLK_IO_OP_DISCARD	3
+#define		UBLK_IO_OP_WRITE_SAME	4
+#define		UBLK_IO_OP_WRITE_ZEROES	5
+
+#define		UBLK_IO_F_FAILFAST_DEV		(1U << 8)
+#define		UBLK_IO_F_FAILFAST_TRANSPORT	(1U << 9)
+#define		UBLK_IO_F_FAILFAST_DRIVER	(1U << 10)
+#define		UBLK_IO_F_META			(1U << 11)
+#define		UBLK_IO_F_INTEGRITY		(1U << 12)
+#define		UBLK_IO_F_FUA			(1U << 13)
+#define		UBLK_IO_F_PREFLUSH		(1U << 14)
+#define		UBLK_IO_F_NOUNMAP		(1U << 15)
+#define		UBLK_IO_F_SWAP			(1U << 16)
+
+/*
+ * io cmd is described by this structure, and stored in share memory, indexed
+ * by request tag.
+ *
+ * The data is stored by ublk driver, and read by ublksrv after one fetch command
+ * returns.
+ */
+struct ublksrv_io_desc {
+	/* op: bit 0-7, flags: bit 8-31 */
+	__u32		op_flags;
+
+	__u32		nr_sectors;
+
+	/* start sector for this io */
+	__u64		start_sector;
+
+	/* buffer address in ublksrv daemon vm space, from ublk driver */
+	__u64		addr;
+};
+
+static inline __u8 ublksrv_get_op(const struct ublksrv_io_desc *iod)
+{
+	return iod->op_flags & 0xff;
+}
+
+static inline __u32 ublksrv_get_flags(const struct ublksrv_io_desc *iod)
+{
+	return iod->op_flags >> 8;
+}
+
+/* issued to ublk driver via /dev/ublkcN */
+struct ublksrv_io_cmd {
+	__u16	q_id;
+
+	/* for fetch/commit which result */
+	__u16	tag;
+
+	/* io result, it is valid for COMMIT* command only */
+	__s32	result;
+
+	/*
+	 * userspace buffer address in ublksrv daemon process, valid for
+	 * FETCH* command only
+	 */
+	__u64	addr;
+};
+
+#endif
-- 
cgit 


From 0edb3696c1713c42f52acbd8355b545e58f782b1 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Wed, 13 Jul 2022 22:07:11 +0800
Subject: ublk_drv: support to complete io command via task_work_add

Use task_work_add if it is available, since task_work_add can bring
up better performance, especially batching signaling ->ubq_daemon can
be done.

It is observed that task_work_add() can boost iops by +4% on random
4k io test. Also except for completing io command, all other code
paths are same with completing io command via
io_uring_cmd_complete_in_task.

Meantime add one flag of UBLK_F_URING_CMD_COMP_IN_TASK for comparing
the mode easily.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20220713140711.97356-3-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c      | 75 ++++++++++++++++++++++++++++++++++++++-----
 include/uapi/linux/ublk_cmd.h |  6 ++++
 2 files changed, 73 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 922a84c86fc6..35fa06ee70ff 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -41,10 +41,15 @@
 #include <linux/delay.h>
 #include <linux/mm.h>
 #include <asm/page.h>
+#include <linux/task_work.h>
 #include <uapi/linux/ublk_cmd.h>
 
 #define UBLK_MINORS		(1U << MINORBITS)
 
+struct ublk_rq_data {
+	struct callback_head work;
+};
+
 struct ublk_uring_cmd_pdu {
 	struct request *req;
 };
@@ -91,6 +96,7 @@ struct ublk_queue {
 	int q_id;
 	int q_depth;
 
+	unsigned long flags;
 	struct task_struct	*ubq_daemon;
 	char *io_cmd_buf;
 
@@ -149,6 +155,14 @@ static DEFINE_MUTEX(ublk_ctl_mutex);
 
 static struct miscdevice ublk_misc;
 
+static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
+{
+	if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) &&
+			!(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK))
+		return true;
+	return false;
+}
+
 static struct ublk_device *ublk_get_device(struct ublk_device *ub)
 {
 	if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
@@ -500,12 +514,10 @@ static void __ublk_fail_req(struct ublk_io *io, struct request *req)
 
 #define UBLK_REQUEUE_DELAY_MS	3
 
-static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd)
+static inline void __ublk_rq_task_work(struct request *req)
 {
-	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
-	struct ublk_device *ub = cmd->file->private_data;
-	struct request *req = pdu->req;
 	struct ublk_queue *ubq = req->mq_hctx->driver_data;
+	struct ublk_device *ub = ubq->dev;
 	int tag = req->tag;
 	struct ublk_io *io = &ubq->ios[tag];
 	bool task_exiting = current != ubq->ubq_daemon ||
@@ -557,13 +569,27 @@ static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd)
 	io_uring_cmd_done(io->cmd, UBLK_IO_RES_OK, 0);
 }
 
+static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd)
+{
+	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+
+	__ublk_rq_task_work(pdu->req);
+}
+
+static void ublk_rq_task_work_fn(struct callback_head *work)
+{
+	struct ublk_rq_data *data = container_of(work,
+			struct ublk_rq_data, work);
+	struct request *req = blk_mq_rq_from_pdu(data);
+
+	__ublk_rq_task_work(req);
+}
+
 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 		const struct blk_mq_queue_data *bd)
 {
 	struct ublk_queue *ubq = hctx->driver_data;
 	struct request *rq = bd->rq;
-	struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
-	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
 	blk_status_t res;
 
 	/* fill iod to slot in io cmd buffer */
@@ -574,16 +600,36 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 	blk_mq_start_request(bd->rq);
 
 	if (unlikely(ubq_daemon_is_dying(ubq))) {
+ fail:
 		mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0);
 		return BLK_STS_IOERR;
 	}
 
-	pdu->req = rq;
-	io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
+	if (ublk_can_use_task_work(ubq)) {
+		struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
+		enum task_work_notify_mode notify_mode = bd->last ?
+			TWA_SIGNAL_NO_IPI : TWA_NONE;
+
+		if (task_work_add(ubq->ubq_daemon, &data->work, notify_mode))
+			goto fail;
+	} else {
+		struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
+		struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+
+		pdu->req = rq;
+		io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
+	}
 
 	return BLK_STS_OK;
 }
 
+static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx)
+{
+	struct ublk_queue *ubq = hctx->driver_data;
+
+	if (ublk_can_use_task_work(ubq))
+		__set_notify_signal(ubq->ubq_daemon);
+}
 
 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
 		unsigned int hctx_idx)
@@ -595,9 +641,20 @@ static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
 	return 0;
 }
 
+static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req,
+		unsigned int hctx_idx, unsigned int numa_node)
+{
+	struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+
+	init_task_work(&data->work, ublk_rq_task_work_fn);
+	return 0;
+}
+
 static const struct blk_mq_ops ublk_mq_ops = {
 	.queue_rq       = ublk_queue_rq,
+	.commit_rqs     = ublk_commit_rqs,
 	.init_hctx	= ublk_init_hctx,
+	.init_request   = ublk_init_rq,
 };
 
 static int ublk_ch_open(struct inode *inode, struct file *filp)
@@ -912,6 +969,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id)
 	void *ptr;
 	int size;
 
+	ubq->flags = ub->dev_info.flags[0];
 	ubq->q_id = q_id;
 	ubq->q_depth = ub->dev_info.queue_depth;
 	size = ublk_queue_cmd_buf_size(ub, q_id);
@@ -1099,6 +1157,7 @@ static int ublk_add_dev(struct ublk_device *ub)
 	ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
 	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
 	ub->tag_set.numa_node = NUMA_NO_NODE;
+	ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
 	ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
 	ub->tag_set.driver_data = ub;
 
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 4f0c16ec875e..a3f5e7c21807 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -48,6 +48,12 @@
  */
 #define UBLK_F_SUPPORT_ZERO_COPY	(1UL << 0)
 
+/*
+ * Force to complete io cmd via io_uring_cmd_complete_in_task so that
+ * performance comparison is done easily with using task_work_add
+ */
+#define UBLK_F_URING_CMD_COMP_IN_TASK	(1UL << 1)
+
 /* device state */
 #define UBLK_S_DEV_DEAD	0
 #define UBLK_S_DEV_LIVE	1
-- 
cgit 


From ddaf8d96f93bccb3f2b1f4f156c098b272440004 Mon Sep 17 00:00:00 2001
From: Prashant Malani <pmalani@chromium.org>
Date: Mon, 11 Jul 2022 07:22:55 +0000
Subject: usb: typec: Add support for retimers

Introduce a retimer device class and associated functions that register
and use retimer "switch" devices. These operate in a manner similar to
the "mode-switch" and help configure retimers that exist between the
Type-C connector and host controller(s).

Type C ports can be linked to retimers using firmware node device
references (again, in a manner similar to "mode-switch").

There are no new sysfs files being created; there is the new retimer
class directory, but there are no class-specific files being created
there.

Signed-off-by: Prashant Malani <pmalani@chromium.org>
Acked-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20220711072333.2064341-2-pmalani@chromium.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/Makefile        |   2 +-
 drivers/usb/typec/class.c         |   9 +-
 drivers/usb/typec/class.h         |   1 +
 drivers/usb/typec/retimer.c       | 168 ++++++++++++++++++++++++++++++++++++++
 drivers/usb/typec/retimer.h       |  15 ++++
 include/linux/usb/typec_retimer.h |  45 ++++++++++
 6 files changed, 238 insertions(+), 2 deletions(-)
 create mode 100644 drivers/usb/typec/retimer.c
 create mode 100644 drivers/usb/typec/retimer.h
 create mode 100644 include/linux/usb/typec_retimer.h

(limited to 'include')

diff --git a/drivers/usb/typec/Makefile b/drivers/usb/typec/Makefile
index dac5c11a3234..4a83dad51a6c 100644
--- a/drivers/usb/typec/Makefile
+++ b/drivers/usb/typec/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_TYPEC)		+= typec.o
-typec-y				:= class.o mux.o bus.o pd.o
+typec-y				:= class.o mux.o bus.o pd.o retimer.o
 typec-$(CONFIG_ACPI)		+= port-mapper.o
 obj-$(CONFIG_TYPEC)		+= altmodes/
 obj-$(CONFIG_TYPEC_TCPM)	+= tcpm/
diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c
index bbc46b14f99a..9062836bb638 100644
--- a/drivers/usb/typec/class.c
+++ b/drivers/usb/typec/class.c
@@ -2299,10 +2299,14 @@ static int __init typec_init(void)
 	if (ret)
 		goto err_unregister_bus;
 
-	ret = class_register(&typec_class);
+	ret = class_register(&retimer_class);
 	if (ret)
 		goto err_unregister_mux_class;
 
+	ret = class_register(&typec_class);
+	if (ret)
+		goto err_unregister_retimer_class;
+
 	ret = usb_power_delivery_init();
 	if (ret)
 		goto err_unregister_class;
@@ -2312,6 +2316,9 @@ static int __init typec_init(void)
 err_unregister_class:
 	class_unregister(&typec_class);
 
+err_unregister_retimer_class:
+	class_unregister(&retimer_class);
+
 err_unregister_mux_class:
 	class_unregister(&typec_mux_class);
 
diff --git a/drivers/usb/typec/class.h b/drivers/usb/typec/class.h
index b531f9853bc0..43fcf9e37a8c 100644
--- a/drivers/usb/typec/class.h
+++ b/drivers/usb/typec/class.h
@@ -76,6 +76,7 @@ extern const struct device_type typec_port_dev_type;
 #define is_typec_port(dev) ((dev)->type == &typec_port_dev_type)
 
 extern struct class typec_mux_class;
+extern struct class retimer_class;
 extern struct class typec_class;
 
 #if defined(CONFIG_ACPI)
diff --git a/drivers/usb/typec/retimer.c b/drivers/usb/typec/retimer.c
new file mode 100644
index 000000000000..051eaa7d2899
--- /dev/null
+++ b/drivers/usb/typec/retimer.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2022 Google LLC
+ *
+ * USB Type-C Retimer support.
+ * Author: Prashant Malani <pmalani@chromium.org>
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/property.h>
+#include <linux/slab.h>
+
+#include "class.h"
+#include "retimer.h"
+
+static bool dev_name_ends_with(struct device *dev, const char *suffix)
+{
+	const char *name = dev_name(dev);
+	const int name_len = strlen(name);
+	const int suffix_len = strlen(suffix);
+
+	if (suffix_len > name_len)
+		return false;
+
+	return strcmp(name + (name_len - suffix_len), suffix) == 0;
+}
+
+static int retimer_fwnode_match(struct device *dev, const void *fwnode)
+{
+	return dev_fwnode(dev) == fwnode && dev_name_ends_with(dev, "-retimer");
+}
+
+static void *typec_retimer_match(struct fwnode_handle *fwnode, const char *id, void *data)
+{
+	struct device *dev  = class_find_device(&retimer_class, NULL, fwnode,
+						retimer_fwnode_match);
+
+	return dev ? to_typec_retimer(dev) : ERR_PTR(-EPROBE_DEFER);
+}
+
+/**
+ * fwnode_typec_retimer_get - Find USB Type-C retimer.
+ * @fwnode: The caller device node.
+ *
+ * Finds a retimer linked to the caller. This function is primarily meant for the
+ * Type-C drivers. Returns a reference to the retimer on success, NULL if no
+ * matching connection was found, or ERR_PTR(-EPROBE_DEFER) when a connection
+ * was found but the retimer has not been enumerated yet.
+ */
+struct typec_retimer *fwnode_typec_retimer_get(struct fwnode_handle *fwnode)
+{
+	struct typec_retimer *retimer;
+
+	retimer = fwnode_connection_find_match(fwnode, "retimer-switch", NULL, typec_retimer_match);
+	if (!IS_ERR_OR_NULL(retimer))
+		WARN_ON(!try_module_get(retimer->dev.parent->driver->owner));
+
+	return retimer;
+}
+EXPORT_SYMBOL_GPL(fwnode_typec_retimer_get);
+
+/**
+ * typec_retimer_put - Release handle to a retimer.
+ * @retimer: USB Type-C Connector Retimer.
+ *
+ * Decrements reference count for @retimer.
+ */
+void typec_retimer_put(struct typec_retimer *retimer)
+{
+	if (!IS_ERR_OR_NULL(retimer)) {
+		module_put(retimer->dev.parent->driver->owner);
+		put_device(&retimer->dev);
+	}
+}
+EXPORT_SYMBOL_GPL(typec_retimer_put);
+
+int typec_retimer_set(struct typec_retimer *retimer, struct typec_retimer_state *state)
+{
+	if (IS_ERR_OR_NULL(retimer))
+		return 0;
+
+	return retimer->set(retimer, state);
+}
+EXPORT_SYMBOL_GPL(typec_retimer_set);
+
+static void typec_retimer_release(struct device *dev)
+{
+	kfree(to_typec_retimer(dev));
+}
+
+static const struct device_type typec_retimer_dev_type = {
+	.name = "typec_retimer",
+	.release = typec_retimer_release,
+};
+
+/**
+ * typec_retimer_register - Register a retimer device.
+ * @parent: Parent device.
+ * @desc: Retimer description.
+ *
+ * Some USB Type-C connectors have their physical lines routed through retimers before they
+ * reach muxes or host controllers. In some cases (for example: using alternate modes)
+ * these retimers need to be reconfigured appropriately. This function registers retimer
+ * switches which route and potentially modify the signals on the Type C physical lines
+ * enroute to the host controllers.
+ */
+struct typec_retimer *
+typec_retimer_register(struct device *parent, const struct typec_retimer_desc *desc)
+{
+	struct typec_retimer *retimer;
+	int ret;
+
+	if (!desc || !desc->set)
+		return ERR_PTR(-EINVAL);
+
+	retimer = kzalloc(sizeof(*retimer), GFP_KERNEL);
+	if (!retimer)
+		return ERR_PTR(-ENOMEM);
+
+	retimer->set = desc->set;
+
+	device_initialize(&retimer->dev);
+	retimer->dev.parent = parent;
+	retimer->dev.fwnode = desc->fwnode;
+	retimer->dev.class = &retimer_class;
+	retimer->dev.type = &typec_retimer_dev_type;
+	retimer->dev.driver_data = desc->drvdata;
+	dev_set_name(&retimer->dev, "%s-retimer",
+		     desc->name ? desc->name : dev_name(parent));
+
+	ret = device_add(&retimer->dev);
+	if (ret) {
+		dev_err(parent, "failed to register retimer (%d)\n", ret);
+		put_device(&retimer->dev);
+		return ERR_PTR(ret);
+	}
+
+	return retimer;
+}
+EXPORT_SYMBOL_GPL(typec_retimer_register);
+
+/**
+ * typec_retimer_unregister - Unregister retimer device.
+ * @retimer: USB Type-C Connector retimer.
+ *
+ * Unregister retimer that was registered with typec_retimer_register().
+ */
+void typec_retimer_unregister(struct typec_retimer *retimer)
+{
+	if (!IS_ERR_OR_NULL(retimer))
+		device_unregister(&retimer->dev);
+}
+EXPORT_SYMBOL_GPL(typec_retimer_unregister);
+
+void *typec_retimer_get_drvdata(struct typec_retimer *retimer)
+{
+	return dev_get_drvdata(&retimer->dev);
+}
+EXPORT_SYMBOL_GPL(typec_retimer_get_drvdata);
+
+struct class retimer_class = {
+	.name = "retimer",
+	.owner = THIS_MODULE,
+};
diff --git a/drivers/usb/typec/retimer.h b/drivers/usb/typec/retimer.h
new file mode 100644
index 000000000000..fa15951d4846
--- /dev/null
+++ b/drivers/usb/typec/retimer.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __USB_TYPEC_RETIMER__
+#define __USB_TYPEC_RETIMER__
+
+#include <linux/usb/typec_retimer.h>
+
+struct typec_retimer {
+	struct device dev;
+	typec_retimer_set_fn_t set;
+};
+
+#define to_typec_retimer(_dev_) container_of(_dev_, struct typec_retimer, dev)
+
+#endif /* __USB_TYPEC_RETIMER__ */
diff --git a/include/linux/usb/typec_retimer.h b/include/linux/usb/typec_retimer.h
new file mode 100644
index 000000000000..5e036b3360e2
--- /dev/null
+++ b/include/linux/usb/typec_retimer.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __USB_TYPEC_RETIMER
+#define __USB_TYPEC_RETIMER
+
+#include <linux/property.h>
+#include <linux/usb/typec.h>
+
+struct device;
+struct typec_retimer;
+struct typec_altmode;
+struct fwnode_handle;
+
+struct typec_retimer_state {
+	struct typec_altmode *alt;
+	unsigned long mode;
+	void *data;
+};
+
+typedef int (*typec_retimer_set_fn_t)(struct typec_retimer *retimer,
+				      struct typec_retimer_state *state);
+
+struct typec_retimer_desc {
+	struct fwnode_handle *fwnode;
+	typec_retimer_set_fn_t set;
+	const char *name;
+	void *drvdata;
+};
+
+struct typec_retimer *fwnode_typec_retimer_get(struct fwnode_handle *fwnode);
+void typec_retimer_put(struct typec_retimer *retimer);
+int typec_retimer_set(struct typec_retimer *retimer, struct typec_retimer_state *state);
+
+static inline struct typec_retimer *typec_retimer_get(struct device *dev)
+{
+	return fwnode_typec_retimer_get(dev_fwnode(dev));
+}
+
+struct typec_retimer *
+typec_retimer_register(struct device *parent, const struct typec_retimer_desc *desc);
+void typec_retimer_unregister(struct typec_retimer *retimer);
+
+void *typec_retimer_get_drvdata(struct typec_retimer *retimer);
+
+#endif /* __USB_TYPEC_RETIMER */
-- 
cgit 


From e6281c26674e037798bf674f26a7593a324cdf39 Mon Sep 17 00:00:00 2001
From: Ang Tien Sung <tien.sung.ang@intel.com>
Date: Mon, 11 Jul 2022 17:31:35 -0500
Subject: firmware: stratix10-svc: Add support for FCS

Extend Intel service layer driver to support FPGA Crypto service(FCS)
features on Intel Soc platforms. Adding an additional channel and FCS
platform driver ("intel_fcs") as part of the probe method.
FCS driver uses the driver to send crypto operations' commands to
the secure device manager(SDM) on Intel Soc platforms Stratix10 and
Agilex.

Signed-off-by: Ang Tien Sung <tien.sung.ang@intel.com>
Signed-off-by: Dinh Nguyen <dinguyen@kernel.org>
Link: https://lore.kernel.org/r/20220711223140.2307945-1-dinguyen@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/firmware/stratix10-svc.c                   | 30 ++++++++++++++++++----
 .../linux/firmware/intel/stratix10-svc-client.h    |  1 +
 2 files changed, 26 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/stratix10-svc.c b/drivers/firmware/stratix10-svc.c
index 14663f671323..1fe748c047df 100644
--- a/drivers/firmware/stratix10-svc.c
+++ b/drivers/firmware/stratix10-svc.c
@@ -34,12 +34,13 @@
  * timeout is set to 30 seconds (30 * 1000) at Intel Stratix10 SoC.
  */
 #define SVC_NUM_DATA_IN_FIFO			32
-#define SVC_NUM_CHANNEL				2
+#define SVC_NUM_CHANNEL				3
 #define FPGA_CONFIG_DATA_CLAIM_TIMEOUT_MS	200
 #define FPGA_CONFIG_STATUS_TIMEOUT_SEC		30
 
 /* stratix10 service layer clients */
 #define STRATIX10_RSU				"stratix10-rsu"
+#define INTEL_FCS				"intel-fcs"
 
 typedef void (svc_invoke_fn)(unsigned long, unsigned long, unsigned long,
 			     unsigned long, unsigned long, unsigned long,
@@ -53,6 +54,7 @@ struct stratix10_svc_chan;
  */
 struct stratix10_svc {
 	struct platform_device *stratix10_svc_rsu;
+	struct platform_device *intel_svc_fcs;
 };
 
 /**
@@ -1036,6 +1038,11 @@ static int stratix10_svc_drv_probe(struct platform_device *pdev)
 	chans[1].name = SVC_CLIENT_RSU;
 	spin_lock_init(&chans[1].lock);
 
+	chans[2].scl = NULL;
+	chans[2].ctrl = controller;
+	chans[2].name = SVC_CLIENT_FCS;
+	spin_lock_init(&chans[2].lock);
+
 	list_add_tail(&controller->node, &svc_ctrl);
 	platform_set_drvdata(pdev, controller);
 
@@ -1054,8 +1061,22 @@ static int stratix10_svc_drv_probe(struct platform_device *pdev)
 	}
 
 	ret = platform_device_add(svc->stratix10_svc_rsu);
-	if (ret)
-		goto err_put_device;
+	if (ret) {
+		platform_device_put(svc->stratix10_svc_rsu);
+		return ret;
+	}
+
+	svc->intel_svc_fcs = platform_device_alloc(INTEL_FCS, 1);
+	if (!svc->intel_svc_fcs) {
+		dev_err(dev, "failed to allocate %s device\n", INTEL_FCS);
+		return -ENOMEM;
+	}
+
+	ret = platform_device_add(svc->intel_svc_fcs);
+	if (ret) {
+		platform_device_put(svc->intel_svc_fcs);
+		return ret;
+	}
 
 	dev_set_drvdata(dev, svc);
 
@@ -1063,8 +1084,6 @@ static int stratix10_svc_drv_probe(struct platform_device *pdev)
 
 	return 0;
 
-err_put_device:
-	platform_device_put(svc->stratix10_svc_rsu);
 err_free_kfifo:
 	kfifo_free(&controller->svc_fifo);
 	return ret;
@@ -1075,6 +1094,7 @@ static int stratix10_svc_drv_remove(struct platform_device *pdev)
 	struct stratix10_svc *svc = dev_get_drvdata(&pdev->dev);
 	struct stratix10_svc_controller *ctrl = platform_get_drvdata(pdev);
 
+	platform_device_unregister(svc->intel_svc_fcs);
 	platform_device_unregister(svc->stratix10_svc_rsu);
 
 	kfifo_free(&ctrl->svc_fifo);
diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h
index 18c1841fdb1f..24121f8e0d7c 100644
--- a/include/linux/firmware/intel/stratix10-svc-client.h
+++ b/include/linux/firmware/intel/stratix10-svc-client.h
@@ -14,6 +14,7 @@
  */
 #define SVC_CLIENT_FPGA			"fpga"
 #define SVC_CLIENT_RSU			"rsu"
+#define SVC_CLIENT_FCS			"fcs"
 
 /*
  * Status of the sent command, in bit number
-- 
cgit 


From 79b936254aa0eb4e3bc73fecaacf049145613c0a Mon Sep 17 00:00:00 2001
From: Ang Tien Sung <tien.sung.ang@intel.com>
Date: Mon, 11 Jul 2022 17:31:36 -0500
Subject: firmware: stratix10-svc: add FCS polling command

Introduce a new SMC command INTEL_SIP_SMC_FUNCID_SERVICE_COMPLETED
that polls if a previous asynchronous command was completed. This
SMC command is used by the new FPGA Crypto Service (FCS).
A basic example is that the FCS sends an AES data encryption
call to the secure device manager(SDM) and waits for the completion
of the operation by continuously polling the results with the new
command.

Signed-off-by: Ang Tien Sung <tien.sung.ang@intel.com>
Signed-off-by: Dinh Nguyen <dinguyen@kernel.org>
Link: https://lore.kernel.org/r/20220711223140.2307945-2-dinguyen@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/firmware/stratix10-svc.c                   | 42 +++++++++++++++++++---
 include/linux/firmware/intel/stratix10-smc.h       | 25 +++++++++++++
 .../linux/firmware/intel/stratix10-svc-client.h    |  5 +++
 3 files changed, 67 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/stratix10-svc.c b/drivers/firmware/stratix10-svc.c
index 1fe748c047df..439a4bdf8207 100644
--- a/drivers/firmware/stratix10-svc.c
+++ b/drivers/firmware/stratix10-svc.c
@@ -248,6 +248,7 @@ static void svc_thread_cmd_config_status(struct stratix10_svc_controller *ctrl,
 {
 	struct arm_smccc_res res;
 	int count_in_sec;
+	unsigned long a0, a1, a2;
 
 	cb_data->kaddr1 = NULL;
 	cb_data->kaddr2 = NULL;
@@ -256,24 +257,45 @@ static void svc_thread_cmd_config_status(struct stratix10_svc_controller *ctrl,
 
 	pr_debug("%s: polling config status\n", __func__);
 
+	a0 = INTEL_SIP_SMC_FPGA_CONFIG_ISDONE;
+	a1 = (unsigned long)p_data->paddr;
+	a2 = (unsigned long)p_data->size;
+
+	if (p_data->command == COMMAND_POLL_SERVICE_STATUS)
+		a0 = INTEL_SIP_SMC_SERVICE_COMPLETED;
+
 	count_in_sec = FPGA_CONFIG_STATUS_TIMEOUT_SEC;
 	while (count_in_sec) {
-		ctrl->invoke_fn(INTEL_SIP_SMC_FPGA_CONFIG_ISDONE,
-				0, 0, 0, 0, 0, 0, 0, &res);
+		ctrl->invoke_fn(a0, a1, a2, 0, 0, 0, 0, 0, &res);
 		if ((res.a0 == INTEL_SIP_SMC_STATUS_OK) ||
-		    (res.a0 == INTEL_SIP_SMC_STATUS_ERROR))
+		    (res.a0 == INTEL_SIP_SMC_STATUS_ERROR) ||
+		    (res.a0 == INTEL_SIP_SMC_STATUS_REJECTED))
 			break;
 
 		/*
-		 * configuration is still in progress, wait one second then
+		 * request is still in progress, wait one second then
 		 * poll again
 		 */
 		msleep(1000);
 		count_in_sec--;
 	}
 
-	if (res.a0 == INTEL_SIP_SMC_STATUS_OK && count_in_sec)
+	if (!count_in_sec) {
+		pr_err("%s: poll status timeout\n", __func__);
+		cb_data->status = BIT(SVC_STATUS_BUSY);
+	} else if (res.a0 == INTEL_SIP_SMC_STATUS_OK) {
 		cb_data->status = BIT(SVC_STATUS_COMPLETED);
+		cb_data->kaddr2 = (res.a2) ?
+				  svc_pa_to_va(res.a2) : NULL;
+		cb_data->kaddr3 = (res.a3) ? &res.a3 : NULL;
+	} else {
+		pr_err("%s: poll status error\n", __func__);
+		cb_data->kaddr1 = &res.a1;
+		cb_data->kaddr2 = (res.a2) ?
+				  svc_pa_to_va(res.a2) : NULL;
+		cb_data->kaddr3 = (res.a3) ? &res.a3 : NULL;
+		cb_data->status = BIT(SVC_STATUS_ERROR);
+	}
 
 	p_data->chan->scl->receive_cb(p_data->chan->scl, cb_data);
 }
@@ -298,6 +320,7 @@ static void svc_thread_recv_status_ok(struct stratix10_svc_data *p_data,
 	case COMMAND_RECONFIG:
 	case COMMAND_RSU_UPDATE:
 	case COMMAND_RSU_NOTIFY:
+	case COMMAND_POLL_SERVICE_STATUS:
 		cb_data->status = BIT(SVC_STATUS_OK);
 		break;
 	case COMMAND_RECONFIG_DATA_SUBMIT:
@@ -430,6 +453,14 @@ static int svc_normal_to_secure_thread(void *data)
 			a1 = 0;
 			a2 = 0;
 			break;
+		/* for polling */
+		case COMMAND_POLL_SERVICE_STATUS:
+			a0 = INTEL_SIP_SMC_SERVICE_COMPLETED;
+			a1 = (unsigned long)pdata->paddr;
+			a2 = (unsigned long)pdata->size;
+
+			break;
+
 		default:
 			pr_warn("it shouldn't happen\n");
 			break;
@@ -470,6 +501,7 @@ static int svc_normal_to_secure_thread(void *data)
 							  pdata, cbdata);
 				break;
 			case COMMAND_RECONFIG_STATUS:
+			case COMMAND_POLL_SERVICE_STATUS:
 				svc_thread_cmd_config_status(ctrl,
 							     pdata, cbdata);
 				break;
diff --git a/include/linux/firmware/intel/stratix10-smc.h b/include/linux/firmware/intel/stratix10-smc.h
index aad497a9ad8b..0de104009566 100644
--- a/include/linux/firmware/intel/stratix10-smc.h
+++ b/include/linux/firmware/intel/stratix10-smc.h
@@ -403,6 +403,31 @@ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FPGA_CONFIG_COMPLETED_WRITE)
 #define INTEL_SIP_SMC_RSU_MAX_RETRY \
 	INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_RSU_MAX_RETRY)
 
+/**
+ * Request INTEL_SIP_SMC_SERVICE_COMPLETED
+ * Sync call to check if the secure world have completed service request
+ * or not.
+ *
+ * Call register usage:
+ * a0: INTEL_SIP_SMC_SERVICE_COMPLETED
+ * a1: this register is optional. If used, it is the physical address for
+ *     secure firmware to put output data
+ * a2: this register is optional. If used, it is the size of output data
+ * a3-a7: not used
+ *
+ * Return status:
+ * a0: INTEL_SIP_SMC_STATUS_OK, INTEL_SIP_SMC_STATUS_ERROR,
+ *     INTEL_SIP_SMC_REJECTED or INTEL_SIP_SMC_STATUS_BUSY
+ * a1: mailbox error if a0 is INTEL_SIP_SMC_STATUS_ERROR
+ * a2: physical address containing the process info
+ *     for FCS certificate -- the data contains the certificate status
+ *     for FCS cryption -- the data contains the actual data size FW processes
+ * a3: output data size
+ */
+#define INTEL_SIP_SMC_FUNCID_SERVICE_COMPLETED 30
+#define INTEL_SIP_SMC_SERVICE_COMPLETED \
+	INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_SERVICE_COMPLETED)
+
 /**
  * Request INTEL_SIP_SMC_FIRMWARE_VERSION
  *
diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h
index 24121f8e0d7c..5d0e814e0c41 100644
--- a/include/linux/firmware/intel/stratix10-svc-client.h
+++ b/include/linux/firmware/intel/stratix10-svc-client.h
@@ -106,6 +106,9 @@ struct stratix10_svc_chan;
  * @COMMAND_RSU_DCMF_VERSION: query firmware for the DCMF version, return status
  * is SVC_STATUS_OK or SVC_STATUS_ERROR
  *
+ * @COMMAND_POLL_SERVICE_STATUS: poll if the service request is complete,
+ * return statis is SVC_STATUS_OK, SVC_STATUS_ERROR or SVC_STATUS_BUSY
+ *
  * @COMMAND_FIRMWARE_VERSION: query running firmware version, return status
  * is SVC_STATUS_OK or SVC_STATUS_ERROR
  */
@@ -122,6 +125,8 @@ enum stratix10_svc_command_code {
 	COMMAND_RSU_MAX_RETRY,
 	COMMAND_RSU_DCMF_VERSION,
 	COMMAND_FIRMWARE_VERSION,
+	/* for general status poll */
+	COMMAND_POLL_SERVICE_STATUS = 40,
 };
 
 /**
-- 
cgit 


From 4a4709d470e624e65a879b5c430dee5e27e9ac83 Mon Sep 17 00:00:00 2001
From: Ang Tien Sung <tien.sung.ang@intel.com>
Date: Mon, 11 Jul 2022 17:31:37 -0500
Subject: firmware: stratix10-svc: add new FCS commands

Extending the fpga svc driver to support 6 new FPGA Crypto
Service(FCS) commands.
We are adding FCS SDOS data encryption and decryption,
random number generator, image validation request,
reading the data provision and certificate validation.

Signed-off-by: Ang Tien Sung <tien.sung.ang@intel.com>
Signed-off-by: Dinh Nguyen <dinguyen@kernel.org>
Link: https://lore.kernel.org/r/20220711223140.2307945-3-dinguyen@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/firmware/stratix10-svc.c                   | 105 +++++++++++++++++--
 include/linux/firmware/intel/stratix10-smc.h       | 111 +++++++++++++++++++++
 .../linux/firmware/intel/stratix10-svc-client.h    |  41 +++++++-
 3 files changed, 245 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/stratix10-svc.c b/drivers/firmware/stratix10-svc.c
index 439a4bdf8207..1cf9f4f47b94 100644
--- a/drivers/firmware/stratix10-svc.c
+++ b/drivers/firmware/stratix10-svc.c
@@ -99,8 +99,10 @@ struct stratix10_svc_data_mem {
 /**
  * struct stratix10_svc_data - service data structure
  * @chan: service channel
- * @paddr: playload physical address
- * @size: playload size
+ * @paddr: physical address of to be processed payload
+ * @size: to be processed playload size
+ * @paddr_output: physical address of processed payload
+ * @size_output: processed payload size
  * @command: service command requested by client
  * @flag: configuration type (full or partial)
  * @arg: args to be passed via registers and not physically mapped buffers
@@ -111,6 +113,8 @@ struct stratix10_svc_data {
 	struct stratix10_svc_chan *chan;
 	phys_addr_t paddr;
 	size_t size;
+	phys_addr_t paddr_output;
+	size_t size_output;
 	u32 command;
 	u32 flag;
 	u64 arg[3];
@@ -320,7 +324,10 @@ static void svc_thread_recv_status_ok(struct stratix10_svc_data *p_data,
 	case COMMAND_RECONFIG:
 	case COMMAND_RSU_UPDATE:
 	case COMMAND_RSU_NOTIFY:
-	case COMMAND_POLL_SERVICE_STATUS:
+	case COMMAND_FCS_REQUEST_SERVICE:
+	case COMMAND_FCS_SEND_CERTIFICATE:
+	case COMMAND_FCS_DATA_ENCRYPTION:
+	case COMMAND_FCS_DATA_DECRYPTION:
 		cb_data->status = BIT(SVC_STATUS_OK);
 		break;
 	case COMMAND_RECONFIG_DATA_SUBMIT:
@@ -340,6 +347,14 @@ static void svc_thread_recv_status_ok(struct stratix10_svc_data *p_data,
 		cb_data->kaddr1 = &res.a1;
 		cb_data->kaddr2 = &res.a2;
 		break;
+	case COMMAND_FCS_RANDOM_NUMBER_GEN:
+	case COMMAND_FCS_GET_PROVISION_DATA:
+	case COMMAND_POLL_SERVICE_STATUS:
+		cb_data->status = BIT(SVC_STATUS_OK);
+		cb_data->kaddr1 = &res.a1;
+		cb_data->kaddr2 = svc_pa_to_va(res.a2);
+		cb_data->kaddr3 = &res.a3;
+		break;
 	default:
 		pr_warn("it shouldn't happen\n");
 		break;
@@ -366,7 +381,7 @@ static int svc_normal_to_secure_thread(void *data)
 	struct stratix10_svc_data *pdata;
 	struct stratix10_svc_cb_data *cbdata;
 	struct arm_smccc_res res;
-	unsigned long a0, a1, a2;
+	unsigned long a0, a1, a2, a3, a4, a5, a6, a7;
 	int ret_fifo = 0;
 
 	pdata =  kmalloc(sizeof(*pdata), GFP_KERNEL);
@@ -383,6 +398,11 @@ static int svc_normal_to_secure_thread(void *data)
 	a0 = INTEL_SIP_SMC_FPGA_CONFIG_LOOPBACK;
 	a1 = 0;
 	a2 = 0;
+	a3 = 0;
+	a4 = 0;
+	a5 = 0;
+	a6 = 0;
+	a7 = 0;
 
 	pr_debug("smc_hvc_shm_thread is running\n");
 
@@ -453,12 +473,50 @@ static int svc_normal_to_secure_thread(void *data)
 			a1 = 0;
 			a2 = 0;
 			break;
+
+		/* for FCS */
+		case COMMAND_FCS_DATA_ENCRYPTION:
+			a0 = INTEL_SIP_SMC_FCS_CRYPTION;
+			a1 = 1;
+			a2 = (unsigned long)pdata->paddr;
+			a3 = (unsigned long)pdata->size;
+			a4 = (unsigned long)pdata->paddr_output;
+			a5 = (unsigned long)pdata->size_output;
+			break;
+		case COMMAND_FCS_DATA_DECRYPTION:
+			a0 = INTEL_SIP_SMC_FCS_CRYPTION;
+			a1 = 0;
+			a2 = (unsigned long)pdata->paddr;
+			a3 = (unsigned long)pdata->size;
+			a4 = (unsigned long)pdata->paddr_output;
+			a5 = (unsigned long)pdata->size_output;
+			break;
+		case COMMAND_FCS_RANDOM_NUMBER_GEN:
+			a0 = INTEL_SIP_SMC_FCS_RANDOM_NUMBER;
+			a1 = (unsigned long)pdata->paddr;
+			a2 = 0;
+			break;
+		case COMMAND_FCS_REQUEST_SERVICE:
+			a0 = INTEL_SIP_SMC_FCS_SERVICE_REQUEST;
+			a1 = (unsigned long)pdata->paddr;
+			a2 = (unsigned long)pdata->size;
+			break;
+		case COMMAND_FCS_SEND_CERTIFICATE:
+			a0 = INTEL_SIP_SMC_FCS_SEND_CERTIFICATE;
+			a1 = (unsigned long)pdata->paddr;
+			a2 = (unsigned long)pdata->size;
+			break;
+		case COMMAND_FCS_GET_PROVISION_DATA:
+			a0 = INTEL_SIP_SMC_FCS_GET_PROVISION_DATA;
+			a1 = (unsigned long)pdata->paddr;
+			a2 = 0;
+			break;
+
 		/* for polling */
 		case COMMAND_POLL_SERVICE_STATUS:
 			a0 = INTEL_SIP_SMC_SERVICE_COMPLETED;
 			a1 = (unsigned long)pdata->paddr;
 			a2 = (unsigned long)pdata->size;
-
 			break;
 
 		default:
@@ -466,10 +524,14 @@ static int svc_normal_to_secure_thread(void *data)
 			break;
 		}
 		pr_debug("%s: before SMC call -- a0=0x%016x a1=0x%016x",
-			 __func__, (unsigned int)a0, (unsigned int)a1);
+			 __func__,
+			 (unsigned int)a0,
+			 (unsigned int)a1);
 		pr_debug(" a2=0x%016x\n", (unsigned int)a2);
-
-		ctrl->invoke_fn(a0, a1, a2, 0, 0, 0, 0, 0, &res);
+		pr_debug(" a3=0x%016x\n", (unsigned int)a3);
+		pr_debug(" a4=0x%016x\n", (unsigned int)a4);
+		pr_debug(" a5=0x%016x\n", (unsigned int)a5);
+		ctrl->invoke_fn(a0, a1, a2, a3, a4, a5, a6, a7, &res);
 
 		pr_debug("%s: after SMC call -- res.a0=0x%016x",
 			 __func__, (unsigned int)res.a0);
@@ -512,6 +574,22 @@ static int svc_normal_to_secure_thread(void *data)
 			break;
 		case INTEL_SIP_SMC_STATUS_REJECTED:
 			pr_debug("%s: STATUS_REJECTED\n", __func__);
+			/* for FCS */
+			switch (pdata->command) {
+			case COMMAND_FCS_REQUEST_SERVICE:
+			case COMMAND_FCS_SEND_CERTIFICATE:
+			case COMMAND_FCS_GET_PROVISION_DATA:
+			case COMMAND_FCS_DATA_ENCRYPTION:
+			case COMMAND_FCS_DATA_DECRYPTION:
+			case COMMAND_FCS_RANDOM_NUMBER_GEN:
+				cbdata->status = BIT(SVC_STATUS_INVALID_PARAM);
+				cbdata->kaddr1 = NULL;
+				cbdata->kaddr2 = NULL;
+				cbdata->kaddr3 = NULL;
+				pdata->chan->scl->receive_cb(pdata->chan->scl,
+							     cbdata);
+				break;
+			}
 			break;
 		case INTEL_SIP_SMC_STATUS_ERROR:
 		case INTEL_SIP_SMC_RSU_ERROR:
@@ -886,8 +964,19 @@ int stratix10_svc_send(struct stratix10_svc_chan *chan, void *msg)
 		list_for_each_entry(p_mem, &svc_data_mem, node)
 			if (p_mem->vaddr == p_msg->payload) {
 				p_data->paddr = p_mem->paddr;
+				p_data->size = p_msg->payload_length;
 				break;
 			}
+		if (p_msg->payload_output) {
+			list_for_each_entry(p_mem, &svc_data_mem, node)
+				if (p_mem->vaddr == p_msg->payload_output) {
+					p_data->paddr_output =
+						p_mem->paddr;
+					p_data->size_output =
+						p_msg->payload_length_output;
+					break;
+				}
+		}
 	}
 
 	p_data->command = p_msg->command;
diff --git a/include/linux/firmware/intel/stratix10-smc.h b/include/linux/firmware/intel/stratix10-smc.h
index 0de104009566..8c198984a47c 100644
--- a/include/linux/firmware/intel/stratix10-smc.h
+++ b/include/linux/firmware/intel/stratix10-smc.h
@@ -445,4 +445,115 @@ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FPGA_CONFIG_COMPLETED_WRITE)
 #define INTEL_SIP_SMC_FIRMWARE_VERSION \
         INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FIRMWARE_VERSION)
 
+/**
+ * SMC call protocol for FPGA Crypto Service (FCS)
+ * FUNCID starts from 90
+ */
+
+/**
+ * Request INTEL_SIP_SMC_FCS_RANDOM_NUMBER
+ *
+ * Sync call used to query the random number generated by the firmware
+ *
+ * Call register usage:
+ * a0 INTEL_SIP_SMC_FCS_RANDOM_NUMBER
+ * a1 the physical address for firmware to write generated random data
+ * a2-a7 not used
+ *
+ * Return status:
+ * a0 INTEL_SIP_SMC_STATUS_OK, INTEL_SIP_SMC_FCS_ERROR or
+ *      INTEL_SIP_SMC_FCS_REJECTED
+ * a1 mailbox error
+ * a2 the physical address of generated random number
+ * a3 size
+ */
+#define INTEL_SIP_SMC_FUNCID_FCS_RANDOM_NUMBER 90
+#define INTEL_SIP_SMC_FCS_RANDOM_NUMBER \
+	INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FCS_RANDOM_NUMBER)
+
+/**
+ * Request INTEL_SIP_SMC_FCS_CRYPTION
+ * Async call for data encryption and HMAC signature generation, or for
+ * data decryption and HMAC verification.
+ *
+ * Call INTEL_SIP_SMC_SERVICE_COMPLETED to get the output encrypted or
+ * decrypted data
+ *
+ * Call register usage:
+ * a0 INTEL_SIP_SMC_FCS_CRYPTION
+ * a1 cryption mode (1 for encryption and 0 for decryption)
+ * a2 physical address which stores to be encrypted or decrypted data
+ * a3 input data size
+ * a4 physical address which will hold the encrypted or decrypted output data
+ * a5 output data size
+ * a6-a7 not used
+ *
+ * Return status:
+ * a0 INTEL_SIP_SMC_STATUS_OK, INTEL_SIP_SMC_STATUS_ERROR or
+ *      INTEL_SIP_SMC_STATUS_REJECTED
+ * a1-3 not used
+ */
+#define INTEL_SIP_SMC_FUNCID_FCS_CRYPTION 91
+#define INTEL_SIP_SMC_FCS_CRYPTION \
+	INTEL_SIP_SMC_STD_CALL_VAL(INTEL_SIP_SMC_FUNCID_FCS_CRYPTION)
+
+/**
+ * Request INTEL_SIP_SMC_FCS_SERVICE_REQUEST
+ * Async call for authentication service of HPS software
+ *
+ * Call register usage:
+ * a0 INTEL_SIP_SMC_FCS_SERVICE_REQUEST
+ * a1 the physical address of data block
+ * a2 size of data block
+ * a3-a7 not used
+ *
+ * Return status:
+ * a0 INTEL_SIP_SMC_STATUS_OK, INTEL_SIP_SMC_ERROR or
+ *      INTEL_SIP_SMC_REJECTED
+ * a1-a3 not used
+ */
+#define INTEL_SIP_SMC_FUNCID_FCS_SERVICE_REQUEST 92
+#define INTEL_SIP_SMC_FCS_SERVICE_REQUEST \
+	INTEL_SIP_SMC_STD_CALL_VAL(INTEL_SIP_SMC_FUNCID_FCS_SERVICE_REQUEST)
+
+/**
+ * Request INTEL_SIP_SMC_FUNCID_FCS_SEND_CERTIFICATE
+ * Sync call to send a signed certificate
+ *
+ * Call register usage:
+ * a0 INTEL_SIP_SMC_FCS_SEND_CERTIFICATE
+ * a1 the physical address of CERTIFICATE block
+ * a2 size of data block
+ * a3-a7 not used
+ *
+ * Return status:
+ * a0 INTEL_SIP_SMC_STATUS_OK or INTEL_SIP_SMC_FCS_REJECTED
+ * a1-a3 not used
+ */
+#define INTEL_SIP_SMC_FUNCID_FCS_SEND_CERTIFICATE 93
+#define INTEL_SIP_SMC_FCS_SEND_CERTIFICATE \
+	INTEL_SIP_SMC_STD_CALL_VAL(INTEL_SIP_SMC_FUNCID_FCS_SEND_CERTIFICATE)
+
+/**
+ * Request INTEL_SIP_SMC_FCS_GET_PROVISION_DATA
+ * Sync call to dump all the fuses and key hashes
+ *
+ * Call register usage:
+ * a0 INTEL_SIP_SMC_FCS_GET_PROVISION_DATA
+ * a1 the physical address for firmware to write structure of fuse and
+ *    key hashes
+ * a2-a7 not used
+ *
+ * Return status:
+ * a0 INTEL_SIP_SMC_STATUS_OK, INTEL_SIP_SMC_FCS_ERROR or
+ *      INTEL_SIP_SMC_FCS_REJECTED
+ * a1 mailbox error
+ * a2 physical address for the structure of fuse and key hashes
+ * a3 the size of structure
+ *
+ */
+#define INTEL_SIP_SMC_FUNCID_FCS_GET_PROVISION_DATA 94
+#define INTEL_SIP_SMC_FCS_GET_PROVISION_DATA \
+	INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FCS_GET_PROVISION_DATA)
+
 #endif
diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h
index 5d0e814e0c41..677720792259 100644
--- a/include/linux/firmware/intel/stratix10-svc-client.h
+++ b/include/linux/firmware/intel/stratix10-svc-client.h
@@ -50,8 +50,8 @@
 #define SVC_STATUS_BUSY			4
 #define SVC_STATUS_ERROR		5
 #define SVC_STATUS_NO_SUPPORT		6
-
-/*
+#define SVC_STATUS_INVALID_PARAM	7
+/**
  * Flag bit for COMMAND_RECONFIG
  *
  * COMMAND_RECONFIG_FLAG_PARTIAL:
@@ -67,6 +67,8 @@
 #define SVC_RECONFIG_REQUEST_TIMEOUT_MS         300
 #define SVC_RECONFIG_BUFFER_TIMEOUT_MS          720
 #define SVC_RSU_REQUEST_TIMEOUT_MS              300
+#define SVC_FCS_REQUEST_TIMEOUT_MS		2000
+#define SVC_COMPLETED_TIMEOUT_MS		30000
 
 struct stratix10_svc_chan;
 
@@ -111,20 +113,47 @@ struct stratix10_svc_chan;
  *
  * @COMMAND_FIRMWARE_VERSION: query running firmware version, return status
  * is SVC_STATUS_OK or SVC_STATUS_ERROR
+ *
+ * @COMMAND_FCS_REQUEST_SERVICE: request validation of image from firmware,
+ * return status is SVC_STATUS_OK, SVC_STATUS_INVALID_PARAM
+ *
+ * @COMMAND_FCS_SEND_CERTIFICATE: send a certificate, return status is
+ * SVC_STATUS_OK, SVC_STATUS_INVALID_PARAM, SVC_STATUS_ERROR
+ *
+ * @COMMAND_FCS_GET_PROVISION_DATA: read the provisioning data, return status is
+ * SVC_STATUS_OK, SVC_STATUS_INVALID_PARAM, SVC_STATUS_ERROR
+ *
+ * @COMMAND_FCS_DATA_ENCRYPTION: encrypt the data, return status is
+ * SVC_STATUS_OK, SVC_STATUS_INVALID_PARAM, SVC_STATUS_ERROR
+ *
+ * @COMMAND_FCS_DATA_DECRYPTION: decrypt the data, return status is
+ * SVC_STATUS_OK, SVC_STATUS_INVALID_PARAM, SVC_STATUS_ERROR
+ *
+ * @COMMAND_FCS_RANDOM_NUMBER_GEN: generate a random number, return status
+ * is SVC_STATUS_OK, SVC_STATUS_ERROR
  */
 enum stratix10_svc_command_code {
+	/* for FPGA */
 	COMMAND_NOOP = 0,
 	COMMAND_RECONFIG,
 	COMMAND_RECONFIG_DATA_SUBMIT,
 	COMMAND_RECONFIG_DATA_CLAIM,
 	COMMAND_RECONFIG_STATUS,
-	COMMAND_RSU_STATUS,
+	/* for RSU */
+	COMMAND_RSU_STATUS = 10,
 	COMMAND_RSU_UPDATE,
 	COMMAND_RSU_NOTIFY,
 	COMMAND_RSU_RETRY,
 	COMMAND_RSU_MAX_RETRY,
 	COMMAND_RSU_DCMF_VERSION,
 	COMMAND_FIRMWARE_VERSION,
+	/* for FCS */
+	COMMAND_FCS_REQUEST_SERVICE = 20,
+	COMMAND_FCS_SEND_CERTIFICATE,
+	COMMAND_FCS_GET_PROVISION_DATA,
+	COMMAND_FCS_DATA_ENCRYPTION,
+	COMMAND_FCS_DATA_DECRYPTION,
+	COMMAND_FCS_RANDOM_NUMBER_GEN,
 	/* for general status poll */
 	COMMAND_POLL_SERVICE_STATUS = 40,
 };
@@ -132,13 +161,17 @@ enum stratix10_svc_command_code {
 /**
  * struct stratix10_svc_client_msg - message sent by client to service
  * @payload: starting address of data need be processed
- * @payload_length: data size in bytes
+ * @payload_length: to be processed data size in bytes
+ * @payload_output: starting address of processed data
+ * @payload_length_output: processed data size in bytes
  * @command: service command
  * @arg: args to be passed via registers and not physically mapped buffers
  */
 struct stratix10_svc_client_msg {
 	void *payload;
 	size_t payload_length;
+	void *payload_output;
+	size_t payload_length_output;
 	enum stratix10_svc_command_code command;
 	u64 arg[3];
 };
-- 
cgit 


From 1b4394c5d731593063f53df9d72467335c3b0367 Mon Sep 17 00:00:00 2001
From: Kah Jing Lee <kah.jing.lee@intel.com>
Date: Mon, 11 Jul 2022 17:31:39 -0500
Subject: firmware: stratix10-svc: extend svc to support RSU feature

Extend Intel Stratix10 service layer driver to support new RSU
DCMF status reporting.

The status of each DCMF is reported. The currently used DCMF is used as
reference, while the other three are compared against it to determine if
they are corrupted.

DCMF = Decision Configuration Management Firmware
RSU = Remote System Update

Signed-off-by: Radu Bacrau <radu.bacrau@intel.com>
Signed-off-by: Ang Tien Sung <tien.sung.ang@intel.com>
Signed-off-by: Kah Jing Lee <kah.jing.lee@intel.com>
Signed-off-by: Dinh Nguyen <dinguyen@kernel.org>
Link: https://lore.kernel.org/r/20220711223140.2307945-5-dinguyen@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/firmware/stratix10-svc.c                    | 20 ++++++++++++--------
 include/linux/firmware/intel/stratix10-smc.h        | 21 +++++++++++++++++++++
 include/linux/firmware/intel/stratix10-svc-client.h |  4 ++++
 3 files changed, 37 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/stratix10-svc.c b/drivers/firmware/stratix10-svc.c
index 1cf9f4f47b94..7fea94fe2ca0 100644
--- a/drivers/firmware/stratix10-svc.c
+++ b/drivers/firmware/stratix10-svc.c
@@ -338,6 +338,7 @@ static void svc_thread_recv_status_ok(struct stratix10_svc_data *p_data,
 		break;
 	case COMMAND_RSU_RETRY:
 	case COMMAND_RSU_MAX_RETRY:
+	case COMMAND_RSU_DCMF_STATUS:
 	case COMMAND_FIRMWARE_VERSION:
 		cb_data->status = BIT(SVC_STATUS_OK);
 		cb_data->kaddr1 = &res.a1;
@@ -518,7 +519,11 @@ static int svc_normal_to_secure_thread(void *data)
 			a1 = (unsigned long)pdata->paddr;
 			a2 = (unsigned long)pdata->size;
 			break;
-
+		case COMMAND_RSU_DCMF_STATUS:
+			a0 = INTEL_SIP_SMC_RSU_DCMF_STATUS;
+			a1 = 0;
+			a2 = 0;
+			break;
 		default:
 			pr_warn("it shouldn't happen\n");
 			break;
@@ -596,8 +601,9 @@ static int svc_normal_to_secure_thread(void *data)
 			pr_err("%s: STATUS_ERROR\n", __func__);
 			cbdata->status = BIT(SVC_STATUS_ERROR);
 			cbdata->kaddr1 = &res.a1;
-			cbdata->kaddr2 = NULL;
-			cbdata->kaddr3 = NULL;
+			cbdata->kaddr2 = (res.a2) ?
+				svc_pa_to_va(res.a2) : NULL;
+			cbdata->kaddr3 = (res.a3) ? &res.a3 : NULL;
 			pdata->chan->scl->receive_cb(pdata->chan->scl, cbdata);
 			break;
 		default:
@@ -605,12 +611,10 @@ static int svc_normal_to_secure_thread(void *data)
 
 			/*
 			 * be compatible with older version firmware which
-			 * doesn't support RSU notify or retry
+			 * doesn't support newer RSU commands
 			 */
-			if ((pdata->command == COMMAND_RSU_RETRY) ||
-			    (pdata->command == COMMAND_RSU_MAX_RETRY) ||
-			    (pdata->command == COMMAND_RSU_NOTIFY) ||
-			    (pdata->command == COMMAND_FIRMWARE_VERSION)) {
+			if ((pdata->command != COMMAND_RSU_UPDATE) &&
+				(pdata->command != COMMAND_RSU_STATUS)) {
 				cbdata->status =
 					BIT(SVC_STATUS_NO_SUPPORT);
 				cbdata->kaddr1 = NULL;
diff --git a/include/linux/firmware/intel/stratix10-smc.h b/include/linux/firmware/intel/stratix10-smc.h
index 8c198984a47c..8fc4aa450517 100644
--- a/include/linux/firmware/intel/stratix10-smc.h
+++ b/include/linux/firmware/intel/stratix10-smc.h
@@ -403,6 +403,27 @@ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FPGA_CONFIG_COMPLETED_WRITE)
 #define INTEL_SIP_SMC_RSU_MAX_RETRY \
 	INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_RSU_MAX_RETRY)
 
+/**
+ * Request INTEL_SIP_SMC_RSU_DCMF_STATUS
+ *
+ * Sync call used by service driver at EL1 to query DCMF status from FW
+ *
+ * Call register usage:
+ * a0 INTEL_SIP_SMC_RSU_DCMF_STATUS
+ * a1-7 not used
+ *
+ * Return status
+ * a0 INTEL_SIP_SMC_STATUS_OK
+ * a1 dcmf3 | dcmf2 | dcmf1 | dcmf0
+ *
+ * Or
+ *
+ * a0 INTEL_SIP_SMC_RSU_ERROR
+ */
+#define INTEL_SIP_SMC_FUNCID_RSU_DCMF_STATUS 20
+#define INTEL_SIP_SMC_RSU_DCMF_STATUS \
+	INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_RSU_DCMF_STATUS)
+
 /**
  * Request INTEL_SIP_SMC_SERVICE_COMPLETED
  * Sync call to check if the secure world have completed service request
diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h
index 677720792259..82a20e62f15f 100644
--- a/include/linux/firmware/intel/stratix10-svc-client.h
+++ b/include/linux/firmware/intel/stratix10-svc-client.h
@@ -114,6 +114,9 @@ struct stratix10_svc_chan;
  * @COMMAND_FIRMWARE_VERSION: query running firmware version, return status
  * is SVC_STATUS_OK or SVC_STATUS_ERROR
  *
+ * @COMMAND_RSU_DCMF_STATUS: query firmware for the DCMF status
+ * return status is SVC_STATUS_OK or SVC_STATUS_ERROR
+ *
  * @COMMAND_FCS_REQUEST_SERVICE: request validation of image from firmware,
  * return status is SVC_STATUS_OK, SVC_STATUS_INVALID_PARAM
  *
@@ -146,6 +149,7 @@ enum stratix10_svc_command_code {
 	COMMAND_RSU_RETRY,
 	COMMAND_RSU_MAX_RETRY,
 	COMMAND_RSU_DCMF_VERSION,
+	COMMAND_RSU_DCMF_STATUS,
 	COMMAND_FIRMWARE_VERSION,
 	/* for FCS */
 	COMMAND_FCS_REQUEST_SERVICE = 20,
-- 
cgit 


From 7935e899b35c93faa26e1d272a51b3d9cb39f23f Mon Sep 17 00:00:00 2001
From: Ang Tien Sung <tien.sung.ang@intel.com>
Date: Mon, 11 Jul 2022 17:31:40 -0500
Subject: firmware: stratix10-svc: To support a command ATF Get Version

We are to support a new SMC Command of hexadecimal 0x200 that returns
the ATF Firmware major and minor version.

Signed-off-by: Ang Tien Sung <tien.sung.ang@intel.com>
Signed-off-by: Dinh Nguyen <dinguyen@kernel.org>
Link: https://lore.kernel.org/r/20220711223140.2307945-6-dinguyen@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/firmware/stratix10-svc.c                    | 10 ++++++++++
 include/linux/firmware/intel/stratix10-smc.h        | 18 ++++++++++++++++++
 include/linux/firmware/intel/stratix10-svc-client.h |  5 +++++
 3 files changed, 33 insertions(+)

(limited to 'include')

diff --git a/drivers/firmware/stratix10-svc.c b/drivers/firmware/stratix10-svc.c
index 7fea94fe2ca0..b4081f4d88a3 100644
--- a/drivers/firmware/stratix10-svc.c
+++ b/drivers/firmware/stratix10-svc.c
@@ -343,6 +343,11 @@ static void svc_thread_recv_status_ok(struct stratix10_svc_data *p_data,
 		cb_data->status = BIT(SVC_STATUS_OK);
 		cb_data->kaddr1 = &res.a1;
 		break;
+	case COMMAND_SMC_SVC_VERSION:
+		cb_data->status = BIT(SVC_STATUS_OK);
+		cb_data->kaddr1 = &res.a1;
+		cb_data->kaddr2 = &res.a2;
+		break;
 	case COMMAND_RSU_DCMF_VERSION:
 		cb_data->status = BIT(SVC_STATUS_OK);
 		cb_data->kaddr1 = &res.a1;
@@ -524,6 +529,11 @@ static int svc_normal_to_secure_thread(void *data)
 			a1 = 0;
 			a2 = 0;
 			break;
+		case COMMAND_SMC_SVC_VERSION:
+			a0 = INTEL_SIP_SMC_SVC_VERSION;
+			a1 = 0;
+			a2 = 0;
+			break;
 		default:
 			pr_warn("it shouldn't happen\n");
 			break;
diff --git a/include/linux/firmware/intel/stratix10-smc.h b/include/linux/firmware/intel/stratix10-smc.h
index 8fc4aa450517..a718f853d457 100644
--- a/include/linux/firmware/intel/stratix10-smc.h
+++ b/include/linux/firmware/intel/stratix10-smc.h
@@ -466,6 +466,24 @@ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FPGA_CONFIG_COMPLETED_WRITE)
 #define INTEL_SIP_SMC_FIRMWARE_VERSION \
         INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FIRMWARE_VERSION)
 
+/**
+ * Request INTEL_SIP_SMC_SVC_VERSION
+ *
+ * Sync call used to query the SIP SMC API Version
+ *
+ * Call register usage:
+ * a0 INTEL_SIP_SMC_SVC_VERSION
+ * a1-a7 not used
+ *
+ * Return status:
+ * a0 INTEL_SIP_SMC_STATUS_OK
+ * a1 Major
+ * a2 Minor
+ */
+#define INTEL_SIP_SMC_SVC_FUNCID_VERSION 512
+#define INTEL_SIP_SMC_SVC_VERSION \
+	INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_SVC_FUNCID_VERSION)
+
 /**
  * SMC call protocol for FPGA Crypto Service (FCS)
  * FUNCID starts from 90
diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h
index 82a20e62f15f..a776a787ac75 100644
--- a/include/linux/firmware/intel/stratix10-svc-client.h
+++ b/include/linux/firmware/intel/stratix10-svc-client.h
@@ -114,6 +114,9 @@ struct stratix10_svc_chan;
  * @COMMAND_FIRMWARE_VERSION: query running firmware version, return status
  * is SVC_STATUS_OK or SVC_STATUS_ERROR
  *
+ * @COMMAND_SMC_SVC_VERSION: Non-mailbox SMC SVC API Version,
+ * return status is SVC_STATUS_OK
+ *
  * @COMMAND_RSU_DCMF_STATUS: query firmware for the DCMF status
  * return status is SVC_STATUS_OK or SVC_STATUS_ERROR
  *
@@ -160,6 +163,8 @@ enum stratix10_svc_command_code {
 	COMMAND_FCS_RANDOM_NUMBER_GEN,
 	/* for general status poll */
 	COMMAND_POLL_SERVICE_STATUS = 40,
+	/* Non-mailbox SMC Call */
+	COMMAND_SMC_SVC_VERSION = 200,
 };
 
 /**
-- 
cgit 


From 900d156bac2bc474cf7c7bee4efbc6c83ec5ae58 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 13 Jul 2022 07:53:17 +0200
Subject: block: remove bdevname

Replace the remaining calls of bdevname with snprintf using the %pg
format specifier.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20220713055317.1888500-10-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/genhd.c           | 23 -----------------------
 drivers/md/md.c         |  2 +-
 drivers/md/raid1.c      |  2 +-
 drivers/md/raid10.c     |  2 +-
 fs/ext4/mmp.c           |  3 ++-
 fs/jbd2/journal.c       |  6 ++++--
 include/linux/blkdev.h  |  1 -
 kernel/trace/blktrace.c |  4 ++--
 8 files changed, 11 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/block/genhd.c b/block/genhd.c
index 9d30f159c59a..44dfcf67ed96 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -101,29 +101,6 @@ bool set_capacity_and_notify(struct gendisk *disk, sector_t size)
 }
 EXPORT_SYMBOL_GPL(set_capacity_and_notify);
 
-/*
- * Format the device name of the indicated block device into the supplied buffer
- * and return a pointer to that same buffer for convenience.
- *
- * Note: do not use this in new code, use the %pg specifier to sprintf and
- * printk insted.
- */
-const char *bdevname(struct block_device *bdev, char *buf)
-{
-	struct gendisk *hd = bdev->bd_disk;
-	int partno = bdev->bd_partno;
-
-	if (!partno)
-		snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
-	else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
-		snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
-	else
-		snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);
-
-	return buf;
-}
-EXPORT_SYMBOL(bdevname);
-
 static void part_stat_read_all(struct block_device *part,
 		struct disk_stats *stat)
 {
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 076255ec9ba1..4be9d8173071 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2438,7 +2438,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 			mdname(mddev), mddev->max_disks);
 		return -EBUSY;
 	}
-	bdevname(rdev->bdev,b);
+	snprintf(b, sizeof(b), "%pg", rdev->bdev);
 	strreplace(b, '/', '!');
 
 	rdev->mddev = mddev;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 258d4eb2d63c..65cd90f0b2a8 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1240,7 +1240,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 		rcu_read_lock();
 		rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev);
 		if (rdev)
-			bdevname(rdev->bdev, b);
+			snprintf(b, sizeof(b), "%pg", rdev->bdev);
 		else
 			strcpy(b, "???");
 		rcu_read_unlock();
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d589f823feb1..a7dcb1bf6b0a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1164,7 +1164,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 		disk = r10_bio->devs[slot].devnum;
 		err_rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (err_rdev)
-			bdevname(err_rdev->bdev, b);
+			snprintf(b, sizeof(b), "%pg", err_rdev->bdev);
 		else {
 			strcpy(b, "???");
 			/* This never gets dereferenced */
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index b7a850b0070b..b221f313ded6 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -371,7 +371,8 @@ skip:
 	EXT4_SB(sb)->s_mmp_bh = bh;
 
 	BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE);
-	bdevname(bh->b_bdev, mmp->mmp_bdevname);
+	snprintf(mmp->mmp_bdevname, sizeof(mmp->mmp_bdevname),
+		 "%pg", bh->b_bdev);
 
 	/*
 	 * Start a kernel thread to update the MMP block periodically.
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c0cbeeaec2d1..9015f5fa2862 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1465,7 +1465,8 @@ journal_t *jbd2_journal_init_dev(struct block_device *bdev,
 	if (!journal)
 		return NULL;
 
-	bdevname(journal->j_dev, journal->j_devname);
+	snprintf(journal->j_devname, sizeof(journal->j_devname),
+		 "%pg", journal->j_dev);
 	strreplace(journal->j_devname, '/', '!');
 	jbd2_stats_proc_init(journal);
 
@@ -1507,7 +1508,8 @@ journal_t *jbd2_journal_init_inode(struct inode *inode)
 		return NULL;
 
 	journal->j_inode = inode;
-	bdevname(journal->j_dev, journal->j_devname);
+	snprintf(journal->j_devname, sizeof(journal->j_devname),
+		 "%pg", journal->j_dev);
 	p = strreplace(journal->j_devname, '/', '!');
 	sprintf(p, "-%lu", journal->j_inode->i_ino);
 	jbd2_stats_proc_init(journal);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 22c477fadc0f..2775763c51b9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1457,7 +1457,6 @@ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time)
 int bdev_read_only(struct block_device *bdev);
 int set_blocksize(struct block_device *bdev, int size);
 
-const char *bdevname(struct block_device *bdev, char *buffer);
 int lookup_bdev(const char *pathname, dev_t *dev);
 
 void blkdev_show(struct seq_file *seqf, off_t offset);
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c584effe5fe9..4752bda1b1a0 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -736,12 +736,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 
 	switch (cmd) {
 	case BLKTRACESETUP:
-		bdevname(bdev, b);
+		snprintf(b, sizeof(b), "%pg", bdev);
 		ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
 		break;
 #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
 	case BLKTRACESETUP32:
-		bdevname(bdev, b);
+		snprintf(b, sizeof(b), "%pg", bdev);
 		ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
 		break;
 #endif
-- 
cgit 


From 3d8c51b25a235e283e37750943bbf356ef187230 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Thu, 14 Jul 2022 10:07:54 +0300
Subject: net/tls: Check for errors in tls_device_init

Add missing error checks in tls_device_init.

Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure")
Reported-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://lore.kernel.org/r/20220714070754.1428-1-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tls.h    | 4 ++--
 net/tls/tls_device.c | 4 ++--
 net/tls/tls_main.c   | 7 ++++++-
 3 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 8017f1703447..8bd938f98bdd 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -704,7 +704,7 @@ int tls_sw_fallback_init(struct sock *sk,
 			 struct tls_crypto_info *crypto_info);
 
 #ifdef CONFIG_TLS_DEVICE
-void tls_device_init(void);
+int tls_device_init(void);
 void tls_device_cleanup(void);
 void tls_device_sk_destruct(struct sock *sk);
 int tls_set_device_offload(struct sock *sk, struct tls_context *ctx);
@@ -724,7 +724,7 @@ static inline bool tls_is_sk_rx_device_offloaded(struct sock *sk)
 	return tls_get_ctx(sk)->rx_conf == TLS_HW;
 }
 #else
-static inline void tls_device_init(void) {}
+static inline int tls_device_init(void) { return 0; }
 static inline void tls_device_cleanup(void) {}
 
 static inline int
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index ec6f4b699a2b..ce827e79c66a 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -1419,9 +1419,9 @@ static struct notifier_block tls_dev_notifier = {
 	.notifier_call	= tls_dev_event,
 };
 
-void __init tls_device_init(void)
+int __init tls_device_init(void)
 {
-	register_netdevice_notifier(&tls_dev_notifier);
+	return register_netdevice_notifier(&tls_dev_notifier);
 }
 
 void __exit tls_device_cleanup(void)
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 2ffede463e4a..d80ab3d1764e 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -1048,7 +1048,12 @@ static int __init tls_register(void)
 	if (err)
 		return err;
 
-	tls_device_init();
+	err = tls_device_init();
+	if (err) {
+		unregister_pernet_subsys(&tls_proc_ops);
+		return err;
+	}
+
 	tcp_register_ulp(&tcp_tls_ulp_ops);
 
 	return 0;
-- 
cgit 


From ff07a02e9e8e6489db841e0c48a5c78e7e78d572 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 14 Jul 2022 11:06:27 -0700
Subject: treewide: Rename enum req_opf into enum req_op

The type name enum req_opf is misleading since it suggests that values of
this type include both an operation type and flags. Since values of this
type represent an operation only, change the type name into enum req_op.

Convert the enum req_op documentation into kernel-doc format. Move a few
definitions such that the enum req_op documentation occurs just above
the enum req_op definition.

The name "req_opf" was introduced by commit ef295ecf090d ("block: better op
and flags encoding").

Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Damien Le Moal <damien.lemoal@wdc.com>
Cc: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20220714180729.1065367-2-bvanassche@acm.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c                 |  7 +++----
 drivers/block/null_blk/main.c     |  9 ++++-----
 drivers/block/null_blk/null_blk.h | 12 +++++-------
 drivers/block/null_blk/trace.h    |  2 +-
 drivers/block/null_blk/zoned.c    |  4 ++--
 drivers/md/dm-integrity.c         |  2 +-
 drivers/nvme/target/zns.c         |  4 ++--
 drivers/scsi/sd_zbc.c             |  2 +-
 drivers/ufs/core/ufshpb.c         |  5 ++---
 fs/zonefs/super.c                 |  5 ++---
 fs/zonefs/trace.h                 |  2 +-
 include/linux/blk_types.h         | 16 ++++++++--------
 include/linux/blkdev.h            |  2 +-
 13 files changed, 33 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 7c017458d5ce..a264621d4905 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -256,9 +256,8 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
  *    The operation to execute on each zone can be a zone reset, open, close
  *    or finish request.
  */
-int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
-		     sector_t sector, sector_t nr_sectors,
-		     gfp_t gfp_mask)
+int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
+		     sector_t sector, sector_t nr_sectors, gfp_t gfp_mask)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
 	sector_t zone_sectors = bdev_zone_sectors(bdev);
@@ -397,7 +396,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
 	void __user *argp = (void __user *)arg;
 	struct request_queue *q;
 	struct blk_zone_range zrange;
-	enum req_opf op;
+	enum req_op op;
 	int ret;
 
 	if (!argp)
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 4e03a020ee3c..8b224ede2e33 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -1310,7 +1310,7 @@ static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd,
 }
 
 static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd,
-						     enum req_opf op,
+						     enum req_op op,
 						     sector_t sector,
 						     sector_t nr_sectors)
 {
@@ -1381,9 +1381,8 @@ static inline void nullb_complete_cmd(struct nullb_cmd *cmd)
 	}
 }
 
-blk_status_t null_process_cmd(struct nullb_cmd *cmd,
-			      enum req_opf op, sector_t sector,
-			      unsigned int nr_sectors)
+blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op,
+			      sector_t sector, unsigned int nr_sectors)
 {
 	struct nullb_device *dev = cmd->nq->dev;
 	blk_status_t ret;
@@ -1401,7 +1400,7 @@ blk_status_t null_process_cmd(struct nullb_cmd *cmd,
 }
 
 static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector,
-				    sector_t nr_sectors, enum req_opf op)
+				    sector_t nr_sectors, enum req_op op)
 {
 	struct nullb_device *dev = cmd->nq->dev;
 	struct nullb *nullb = dev->nullb;
diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h
index 8359b43842f2..6fbf0a1b2622 100644
--- a/drivers/block/null_blk/null_blk.h
+++ b/drivers/block/null_blk/null_blk.h
@@ -136,9 +136,8 @@ struct nullb {
 
 blk_status_t null_handle_discard(struct nullb_device *dev, sector_t sector,
 				 sector_t nr_sectors);
-blk_status_t null_process_cmd(struct nullb_cmd *cmd,
-			      enum req_opf op, sector_t sector,
-			      unsigned int nr_sectors);
+blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op,
+			      sector_t sector, unsigned int nr_sectors);
 
 #ifdef CONFIG_BLK_DEV_ZONED
 int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q);
@@ -146,9 +145,8 @@ int null_register_zoned_dev(struct nullb *nullb);
 void null_free_zoned_dev(struct nullb_device *dev);
 int null_report_zones(struct gendisk *disk, sector_t sector,
 		      unsigned int nr_zones, report_zones_cb cb, void *data);
-blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd,
-				    enum req_opf op, sector_t sector,
-				    sector_t nr_sectors);
+blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op,
+				    sector_t sector, sector_t nr_sectors);
 size_t null_zone_valid_read_len(struct nullb *nullb,
 				sector_t sector, unsigned int len);
 #else
@@ -164,7 +162,7 @@ static inline int null_register_zoned_dev(struct nullb *nullb)
 }
 static inline void null_free_zoned_dev(struct nullb_device *dev) {}
 static inline blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd,
-			enum req_opf op, sector_t sector, sector_t nr_sectors)
+			enum req_op op, sector_t sector, sector_t nr_sectors)
 {
 	return BLK_STS_NOTSUPP;
 }
diff --git a/drivers/block/null_blk/trace.h b/drivers/block/null_blk/trace.h
index 86d6c12c603c..6b2b370e786f 100644
--- a/drivers/block/null_blk/trace.h
+++ b/drivers/block/null_blk/trace.h
@@ -36,7 +36,7 @@ TRACE_EVENT(nullb_zone_op,
 	    TP_ARGS(cmd, zone_no, zone_cond),
 	    TP_STRUCT__entry(
 		__array(char, disk, DISK_NAME_LEN)
-		__field(enum req_opf, op)
+		__field(enum req_op, op)
 		__field(unsigned int, zone_no)
 		__field(unsigned int, zone_cond)
 	    ),
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index 64b06caab984..55a69e48ef8b 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -600,7 +600,7 @@ static blk_status_t null_reset_zone(struct nullb_device *dev,
 	return BLK_STS_OK;
 }
 
-static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
+static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_op op,
 				   sector_t sector)
 {
 	struct nullb_device *dev = cmd->nq->dev;
@@ -653,7 +653,7 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
 	return ret;
 }
 
-blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op,
+blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op,
 				    sector_t sector, sector_t nr_sectors)
 {
 	struct nullb_device *dev;
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 3d5a0ce123c9..148978ad03a8 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -298,7 +298,7 @@ struct dm_integrity_io {
 	struct work_struct work;
 
 	struct dm_integrity_c *ic;
-	enum req_opf op;
+	enum req_op op;
 	bool fua;
 
 	struct dm_integrity_range range;
diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c
index c0ee21fcab81..b233c0943fec 100644
--- a/drivers/nvme/target/zns.c
+++ b/drivers/nvme/target/zns.c
@@ -308,7 +308,7 @@ void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
 	queue_work(zbd_wq, &req->z.zmgmt_work);
 }
 
-static inline enum req_opf zsa_req_op(u8 zsa)
+static inline enum req_op zsa_req_op(u8 zsa)
 {
 	switch (zsa) {
 	case NVME_ZONE_OPEN:
@@ -465,7 +465,7 @@ static void nvmet_bdev_zmgmt_send_work(struct work_struct *w)
 {
 	struct nvmet_req *req = container_of(w, struct nvmet_req, z.zmgmt_work);
 	sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->zms.slba);
-	enum req_opf op = zsa_req_op(req->cmd->zms.zsa);
+	enum req_op op = zsa_req_op(req->cmd->zms.zsa);
 	struct block_device *bdev = req->ns->bdev;
 	sector_t zone_sectors = bdev_zone_sectors(bdev);
 	u16 status = NVME_SC_SUCCESS;
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index b8c97456506a..bd15624c6322 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -529,7 +529,7 @@ static unsigned int sd_zbc_zone_wp_update(struct scsi_cmnd *cmd,
 	struct request *rq = scsi_cmd_to_rq(cmd);
 	struct scsi_disk *sdkp = scsi_disk(rq->q->disk);
 	unsigned int zno = blk_rq_zone_no(rq);
-	enum req_opf op = req_op(rq);
+	enum req_op op = req_op(rq);
 	unsigned long flags;
 
 	/*
diff --git a/drivers/ufs/core/ufshpb.c b/drivers/ufs/core/ufshpb.c
index de2bb8401bc4..24f1ee82c215 100644
--- a/drivers/ufs/core/ufshpb.c
+++ b/drivers/ufs/core/ufshpb.c
@@ -433,9 +433,8 @@ int ufshpb_prep(struct ufs_hba *hba, struct ufshcd_lrb *lrbp)
 	return 0;
 }
 
-static struct ufshpb_req *ufshpb_get_req(struct ufshpb_lu *hpb,
-					 int rgn_idx, enum req_opf dir,
-					 bool atomic)
+static struct ufshpb_req *ufshpb_get_req(struct ufshpb_lu *hpb, int rgn_idx,
+					 enum req_op dir, bool atomic)
 {
 	struct ufshpb_req *rq;
 	struct request *req;
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 9c0eef1ff32a..a221ddb12da6 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -60,8 +60,7 @@ static void zonefs_account_active(struct inode *inode)
 	}
 }
 
-static inline int zonefs_zone_mgmt(struct inode *inode,
-				   enum req_opf op)
+static inline int zonefs_zone_mgmt(struct inode *inode, enum req_op op)
 {
 	struct zonefs_inode_info *zi = ZONEFS_I(inode);
 	int ret;
@@ -525,7 +524,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize)
 {
 	struct zonefs_inode_info *zi = ZONEFS_I(inode);
 	loff_t old_isize;
-	enum req_opf op;
+	enum req_op op;
 	int ret = 0;
 
 	/*
diff --git a/fs/zonefs/trace.h b/fs/zonefs/trace.h
index f369d7d50303..21501da764bd 100644
--- a/fs/zonefs/trace.h
+++ b/fs/zonefs/trace.h
@@ -20,7 +20,7 @@
 #define show_dev(dev) MAJOR(dev), MINOR(dev)
 
 TRACE_EVENT(zonefs_zone_mgmt,
-	    TP_PROTO(struct inode *inode, enum req_opf op),
+	    TP_PROTO(struct inode *inode, enum req_op op),
 	    TP_ARGS(inode, op),
 	    TP_STRUCT__entry(
 			     __field(dev_t, dev)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index a24d4078fb21..0e6a2af7ed3d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -337,8 +337,12 @@ enum {
 
 typedef __u32 __bitwise blk_mq_req_flags_t;
 
-/*
- * Operations and flags common to the bio and request structures.
+#define REQ_OP_BITS	8
+#define REQ_OP_MASK	((1 << REQ_OP_BITS) - 1)
+#define REQ_FLAG_BITS	24
+
+/**
+ * enum req_op - Operations common to the bio and request structures.
  * We use 8 bits for encoding the operation, and the remaining 24 for flags.
  *
  * The least significant bit of the operation number indicates the data
@@ -350,11 +354,7 @@ typedef __u32 __bitwise blk_mq_req_flags_t;
  * If a operation does not transfer data the least significant bit has no
  * meaning.
  */
-#define REQ_OP_BITS	8
-#define REQ_OP_MASK	((1 << REQ_OP_BITS) - 1)
-#define REQ_FLAG_BITS	24
-
-enum req_opf {
+enum req_op {
 	/* read sectors from the device */
 	REQ_OP_READ		= 0,
 	/* write sectors to the device */
@@ -509,7 +509,7 @@ static inline bool op_is_discard(unsigned int op)
  * due to its different handling in the block layer and device response in
  * case of command failure.
  */
-static inline bool op_is_zone_mgmt(enum req_opf op)
+static inline bool op_is_zone_mgmt(enum req_op op)
 {
 	switch (op & REQ_OP_MASK) {
 	case REQ_OP_ZONE_RESET:
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2775763c51b9..ec072a5129bf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -322,7 +322,7 @@ void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model);
 int blkdev_report_zones(struct block_device *bdev, sector_t sector,
 			unsigned int nr_zones, report_zones_cb cb, void *data);
 unsigned int bdev_nr_zones(struct block_device *bdev);
-extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
+extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
 			    sector_t sectors, sector_t nr_sectors,
 			    gfp_t gfp_mask);
 int blk_revalidate_disk_zones(struct gendisk *disk,
-- 
cgit 


From 77e7ffd7ad3952909be6a9c599b7d164c8866fec Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 14 Jul 2022 11:06:28 -0700
Subject: block: Use enum req_op where appropriate

Change the type of the arguments that are used to pass a REQ_OP_* value
from int or unsigned int into enum req_op to improve static type
checking.

Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Damien Le Moal <damien.lemoal@wdc.com>
Cc: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20220714180729.1065367-3-bvanassche@acm.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c          | 6 +++---
 block/blk-mq-debugfs.c    | 2 +-
 block/blk-throttle.c      | 7 ++++---
 block/blk-wbt.c           | 2 +-
 block/blk.h               | 2 +-
 include/linux/blk_types.h | 2 +-
 include/linux/blkdev.h    | 6 +++---
 7 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/block/blk-core.c b/block/blk-core.c
index 8365996a8ef8..67b8bcfa27f0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -136,7 +136,7 @@ static const char *const blk_op_name[] = {
  * string format. Useful in the debugging and tracing bio or request. For
  * invalid REQ_OP_XXX it returns string "UNKNOWN".
  */
-inline const char *blk_op_str(unsigned int op)
+inline const char *blk_op_str(enum req_op op)
 {
 	const char *op_str = "UNKNOWN";
 
@@ -953,7 +953,7 @@ again:
 }
 
 unsigned long bdev_start_io_acct(struct block_device *bdev,
-				 unsigned int sectors, unsigned int op,
+				 unsigned int sectors, enum req_op op,
 				 unsigned long start_time)
 {
 	const int sgrp = op_stat_group(op);
@@ -994,7 +994,7 @@ unsigned long bio_start_io_acct(struct bio *bio)
 }
 EXPORT_SYMBOL_GPL(bio_start_io_acct);
 
-void bdev_end_io_acct(struct block_device *bdev, unsigned int op,
+void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
 		      unsigned long start_time)
 {
 	const int sgrp = op_stat_group(op);
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 7ee1b13380d0..6cc2411e2d26 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -304,7 +304,7 @@ static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state)
 int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
 {
 	const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
-	const unsigned int op = req_op(rq);
+	const enum req_op op = req_op(rq);
 	const char *op_str = blk_op_str(op);
 
 	seq_printf(m, "%p {.op=", rq);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 139b2d7a99e2..9f5fe62afff9 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2203,8 +2203,9 @@ out_unlock:
 
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 static void throtl_track_latency(struct throtl_data *td, sector_t size,
-	int op, unsigned long time)
+				 enum req_op op, unsigned long time)
 {
+	const bool rw = op_is_write(op);
 	struct latency_bucket *latency;
 	int index;
 
@@ -2215,10 +2216,10 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size,
 
 	index = request_bucket_index(size);
 
-	latency = get_cpu_ptr(td->latency_buckets[op]);
+	latency = get_cpu_ptr(td->latency_buckets[rw]);
 	latency[index].total_latency += time;
 	latency[index].samples++;
-	put_cpu_ptr(td->latency_buckets[op]);
+	put_cpu_ptr(td->latency_buckets[rw]);
 }
 
 void blk_throtl_stat_add(struct request *rq, u64 time_ns)
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 0c119be0e813..7bf09ae06577 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -670,7 +670,7 @@ u64 wbt_default_latency_nsec(struct request_queue *q)
 
 static int wbt_data_dir(const struct request *rq)
 {
-	const int op = req_op(rq);
+	const enum req_op op = req_op(rq);
 
 	if (op == REQ_OP_READ)
 		return READ;
diff --git a/block/blk.h b/block/blk.h
index b71e22c97d77..c4b084bfe87c 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -160,7 +160,7 @@ static inline bool blk_discard_mergable(struct request *req)
 }
 
 static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
-						     int op)
+						     enum req_op op)
 {
 	if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE))
 		return min(q->limits.max_discard_sectors,
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 0e6a2af7ed3d..cce8768bc00b 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -522,7 +522,7 @@ static inline bool op_is_zone_mgmt(enum req_op op)
 	}
 }
 
-static inline int op_stat_group(unsigned int op)
+static inline int op_stat_group(enum req_op op)
 {
 	if (op_is_discard(op))
 		return STAT_DISCARD;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ec072a5129bf..2f13f0062192 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -872,7 +872,7 @@ extern void blk_queue_exit(struct request_queue *q);
 extern void blk_sync_queue(struct request_queue *q);
 
 /* Helper to convert REQ_OP_XXX to its string format XXX */
-extern const char *blk_op_str(unsigned int op);
+extern const char *blk_op_str(enum req_op op);
 
 int blk_status_to_errno(blk_status_t status);
 blk_status_t errno_to_blk_status(int errno);
@@ -1434,9 +1434,9 @@ static inline void blk_wake_io_task(struct task_struct *waiter)
 }
 
 unsigned long bdev_start_io_acct(struct block_device *bdev,
-				 unsigned int sectors, unsigned int op,
+				 unsigned int sectors, enum req_op op,
 				 unsigned long start_time);
-void bdev_end_io_acct(struct block_device *bdev, unsigned int op,
+void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
 		unsigned long start_time);
 
 void bio_start_io_acct_time(struct bio *bio, unsigned long start_time);
-- 
cgit 


From 86947df3a9236481276e8baadde50a403b02b4d4 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 14 Jul 2022 11:06:29 -0700
Subject: block: Change the type of the last .rw_page() argument

All .rw_page() callers pass an enum req_op value as last argument. Make
this explicit by changing the type of the last argument into enum req_op.
See also commit 3f289dcb4b26 ("block: make bdev_ops->rw_page() take a
REQ_OP instead of bool").

Cc: Tejun Heo <tj@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Damien Le Moal <damien.lemoal@wdc.com>
Cc: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20220714180729.1065367-4-bvanassche@acm.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/brd.c           | 2 +-
 drivers/block/zram/zram_drv.c | 2 +-
 drivers/nvdimm/btt.c          | 2 +-
 drivers/nvdimm/pmem.c         | 2 +-
 include/linux/blkdev.h        | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 9e26d5e769f3..7b82876af36e 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -310,7 +310,7 @@ static void brd_submit_bio(struct bio *bio)
 }
 
 static int brd_rw_page(struct block_device *bdev, sector_t sector,
-		       struct page *page, unsigned int op)
+		       struct page *page, enum req_op op)
 {
 	struct brd_device *brd = bdev->bd_disk->private_data;
 	int err;
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index e5233c911e43..a35b86c58aa2 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1631,7 +1631,7 @@ static void zram_slot_free_notify(struct block_device *bdev,
 }
 
 static int zram_rw_page(struct block_device *bdev, sector_t sector,
-		       struct page *page, unsigned int op)
+		       struct page *page, enum req_op op)
 {
 	int offset, ret;
 	u32 index;
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 5e622c0d4b66..dfbf73145d16 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1483,7 +1483,7 @@ static void btt_submit_bio(struct bio *bio)
 }
 
 static int btt_rw_page(struct block_device *bdev, sector_t sector,
-		struct page *page, unsigned int op)
+		struct page *page, enum req_op op)
 {
 	struct btt *btt = bdev->bd_disk->private_data;
 	int rc;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index a72b81fa3242..f36efcc11f67 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -239,7 +239,7 @@ static void pmem_submit_bio(struct bio *bio)
 }
 
 static int pmem_rw_page(struct block_device *bdev, sector_t sector,
-		       struct page *page, unsigned int op)
+		       struct page *page, enum req_op op)
 {
 	struct pmem_device *pmem = bdev->bd_disk->private_data;
 	blk_status_t rc;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2f13f0062192..ca2ff113ea00 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1381,7 +1381,7 @@ struct block_device_operations {
 			unsigned int flags);
 	int (*open) (struct block_device *, fmode_t);
 	void (*release) (struct gendisk *, fmode_t);
-	int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int);
+	int (*rw_page)(struct block_device *, sector_t, struct page *, enum req_op);
 	int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	unsigned int (*check_events) (struct gendisk *disk,
-- 
cgit 


From 2d9b02be73ba8efba406b399a722b4e33614dd0e Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 14 Jul 2022 11:06:30 -0700
Subject: block: Change the type of req_op() and bio_op() into enum req_op

Improve static type checking by changing the type of the value returned by
req_op() and bio_op() from unsigned int into enum req_op. Insert
'default: break;' in switch statements on the enum req_op type to prevent
that the compiler warns about these switch statements.

Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Damien Le Moal <damien.lemoal@wdc.com>
Cc: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Cc: Tim Waugh <tim@cyberelk.net>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20220714180729.1065367-5-bvanassche@acm.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-merge.c         | 2 ++
 drivers/block/paride/pd.c | 2 ++
 drivers/md/dm.c           | 2 ++
 include/linux/blk-mq.h    | 6 ++++--
 include/linux/blk_types.h | 6 ++++--
 5 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 5abf5aa5a5f0..de178a8b4c82 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -405,6 +405,8 @@ unsigned int blk_recalc_rq_segments(struct request *rq)
 		return 1;
 	case REQ_OP_WRITE_ZEROES:
 		return 0;
+	default:
+		break;
 	}
 
 	rq_for_each_bvec(bv, rq, iter)
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index c8c14c6f5c3a..f8a75bc90f70 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -501,6 +501,8 @@ static enum action do_pd_io_start(void)
 			return do_pd_read_start();
 		else
 			return do_pd_write_start();
+	default:
+		break;
 	}
 	return Fail;
 }
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 33d3799bb66e..6c21922b87d0 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1542,6 +1542,8 @@ static blk_status_t __process_abnormal_io(struct clone_info *ci,
 	case REQ_OP_WRITE_ZEROES:
 		num_bios = ti->num_write_zeroes_bios;
 		break;
+	default:
+		break;
 	}
 
 	/*
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d74f6a6b7e69..677195de0663 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -198,8 +198,10 @@ struct request {
 	void *end_io_data;
 };
 
-#define req_op(req) \
-	((req)->cmd_flags & REQ_OP_MASK)
+static inline enum req_op req_op(const struct request *req)
+{
+	return req->cmd_flags & REQ_OP_MASK;
+}
 
 static inline bool blk_rq_is_passthrough(struct request *rq)
 {
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index cce8768bc00b..e66cbe377ae8 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -463,8 +463,10 @@ enum stat_group {
 	NR_STAT_GROUPS
 };
 
-#define bio_op(bio) \
-	((bio)->bi_opf & REQ_OP_MASK)
+static inline enum req_op bio_op(const struct bio *bio)
+{
+	return bio->bi_opf & REQ_OP_MASK;
+}
 
 /* obsolete, don't use in new code */
 static inline void bio_set_op_attrs(struct bio *bio, unsigned op,
-- 
cgit 


From 342a72a334073f163da924b69c3d3fb4685eb33a Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 14 Jul 2022 11:06:31 -0700
Subject: block: Introduce the type blk_opf_t

Introduce the type blk_opf_t for the request operation and flags (REQ_OP_*
and REQ_*). This type will be used to improve documentation of the block
layer code and also to allow sparse to verify whether request flags are used
correctly.

Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Damien Le Moal <damien.lemoal@wdc.com>
Cc: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20220714180729.1065367-6-bvanassche@acm.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk_types.h | 97 +++++++++++++++++++++++++----------------------
 1 file changed, 51 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index e66cbe377ae8..1ef99790f6ed 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -240,6 +240,8 @@ static inline void bio_issue_init(struct bio_issue *issue,
 			((u64)size << BIO_ISSUE_SIZE_SHIFT));
 }
 
+typedef __u32 __bitwise blk_opf_t;
+
 typedef unsigned int blk_qc_t;
 #define BLK_QC_T_NONE		-1U
 
@@ -250,7 +252,7 @@ typedef unsigned int blk_qc_t;
 struct bio {
 	struct bio		*bi_next;	/* request queue link */
 	struct block_device	*bi_bdev;
-	unsigned int		bi_opf;		/* bottom bits REQ_OP, top bits
+	blk_opf_t		bi_opf;		/* bottom bits REQ_OP, top bits
 						 * req_flags.
 						 */
 	unsigned short		bi_flags;	/* BIO_* below */
@@ -338,7 +340,7 @@ enum {
 typedef __u32 __bitwise blk_mq_req_flags_t;
 
 #define REQ_OP_BITS	8
-#define REQ_OP_MASK	((1 << REQ_OP_BITS) - 1)
+#define REQ_OP_MASK	(__force blk_opf_t)((1 << REQ_OP_BITS) - 1)
 #define REQ_FLAG_BITS	24
 
 /**
@@ -356,35 +358,35 @@ typedef __u32 __bitwise blk_mq_req_flags_t;
  */
 enum req_op {
 	/* read sectors from the device */
-	REQ_OP_READ		= 0,
+	REQ_OP_READ		= (__force blk_opf_t)0,
 	/* write sectors to the device */
-	REQ_OP_WRITE		= 1,
+	REQ_OP_WRITE		= (__force blk_opf_t)1,
 	/* flush the volatile write cache */
-	REQ_OP_FLUSH		= 2,
+	REQ_OP_FLUSH		= (__force blk_opf_t)2,
 	/* discard sectors */
-	REQ_OP_DISCARD		= 3,
+	REQ_OP_DISCARD		= (__force blk_opf_t)3,
 	/* securely erase sectors */
-	REQ_OP_SECURE_ERASE	= 5,
+	REQ_OP_SECURE_ERASE	= (__force blk_opf_t)5,
 	/* write the zero filled sector many times */
-	REQ_OP_WRITE_ZEROES	= 9,
+	REQ_OP_WRITE_ZEROES	= (__force blk_opf_t)9,
 	/* Open a zone */
-	REQ_OP_ZONE_OPEN	= 10,
+	REQ_OP_ZONE_OPEN	= (__force blk_opf_t)10,
 	/* Close a zone */
-	REQ_OP_ZONE_CLOSE	= 11,
+	REQ_OP_ZONE_CLOSE	= (__force blk_opf_t)11,
 	/* Transition a zone to full */
-	REQ_OP_ZONE_FINISH	= 12,
+	REQ_OP_ZONE_FINISH	= (__force blk_opf_t)12,
 	/* write data at the current zone write pointer */
-	REQ_OP_ZONE_APPEND	= 13,
+	REQ_OP_ZONE_APPEND	= (__force blk_opf_t)13,
 	/* reset a zone write pointer */
-	REQ_OP_ZONE_RESET	= 15,
+	REQ_OP_ZONE_RESET	= (__force blk_opf_t)15,
 	/* reset all the zone present on the device */
-	REQ_OP_ZONE_RESET_ALL	= 17,
+	REQ_OP_ZONE_RESET_ALL	= (__force blk_opf_t)17,
 
 	/* Driver private requests */
-	REQ_OP_DRV_IN		= 34,
-	REQ_OP_DRV_OUT		= 35,
+	REQ_OP_DRV_IN		= (__force blk_opf_t)34,
+	REQ_OP_DRV_OUT		= (__force blk_opf_t)35,
 
-	REQ_OP_LAST,
+	REQ_OP_LAST		= (__force blk_opf_t)36,
 };
 
 enum req_flag_bits {
@@ -425,28 +427,31 @@ enum req_flag_bits {
 	__REQ_NR_BITS,		/* stops here */
 };
 
-#define REQ_FAILFAST_DEV	(1ULL << __REQ_FAILFAST_DEV)
-#define REQ_FAILFAST_TRANSPORT	(1ULL << __REQ_FAILFAST_TRANSPORT)
-#define REQ_FAILFAST_DRIVER	(1ULL << __REQ_FAILFAST_DRIVER)
-#define REQ_SYNC		(1ULL << __REQ_SYNC)
-#define REQ_META		(1ULL << __REQ_META)
-#define REQ_PRIO		(1ULL << __REQ_PRIO)
-#define REQ_NOMERGE		(1ULL << __REQ_NOMERGE)
-#define REQ_IDLE		(1ULL << __REQ_IDLE)
-#define REQ_INTEGRITY		(1ULL << __REQ_INTEGRITY)
-#define REQ_FUA			(1ULL << __REQ_FUA)
-#define REQ_PREFLUSH		(1ULL << __REQ_PREFLUSH)
-#define REQ_RAHEAD		(1ULL << __REQ_RAHEAD)
-#define REQ_BACKGROUND		(1ULL << __REQ_BACKGROUND)
-#define REQ_NOWAIT		(1ULL << __REQ_NOWAIT)
-#define REQ_CGROUP_PUNT		(1ULL << __REQ_CGROUP_PUNT)
-
-#define REQ_NOUNMAP		(1ULL << __REQ_NOUNMAP)
-#define REQ_POLLED		(1ULL << __REQ_POLLED)
-#define REQ_ALLOC_CACHE		(1ULL << __REQ_ALLOC_CACHE)
-
-#define REQ_DRV			(1ULL << __REQ_DRV)
-#define REQ_SWAP		(1ULL << __REQ_SWAP)
+#define REQ_FAILFAST_DEV	\
+			(__force blk_opf_t)(1ULL << __REQ_FAILFAST_DEV)
+#define REQ_FAILFAST_TRANSPORT	\
+			(__force blk_opf_t)(1ULL << __REQ_FAILFAST_TRANSPORT)
+#define REQ_FAILFAST_DRIVER	\
+			(__force blk_opf_t)(1ULL << __REQ_FAILFAST_DRIVER)
+#define REQ_SYNC	(__force blk_opf_t)(1ULL << __REQ_SYNC)
+#define REQ_META	(__force blk_opf_t)(1ULL << __REQ_META)
+#define REQ_PRIO	(__force blk_opf_t)(1ULL << __REQ_PRIO)
+#define REQ_NOMERGE	(__force blk_opf_t)(1ULL << __REQ_NOMERGE)
+#define REQ_IDLE	(__force blk_opf_t)(1ULL << __REQ_IDLE)
+#define REQ_INTEGRITY	(__force blk_opf_t)(1ULL << __REQ_INTEGRITY)
+#define REQ_FUA		(__force blk_opf_t)(1ULL << __REQ_FUA)
+#define REQ_PREFLUSH	(__force blk_opf_t)(1ULL << __REQ_PREFLUSH)
+#define REQ_RAHEAD	(__force blk_opf_t)(1ULL << __REQ_RAHEAD)
+#define REQ_BACKGROUND	(__force blk_opf_t)(1ULL << __REQ_BACKGROUND)
+#define REQ_NOWAIT	(__force blk_opf_t)(1ULL << __REQ_NOWAIT)
+#define REQ_CGROUP_PUNT	(__force blk_opf_t)(1ULL << __REQ_CGROUP_PUNT)
+
+#define REQ_NOUNMAP	(__force blk_opf_t)(1ULL << __REQ_NOUNMAP)
+#define REQ_POLLED	(__force blk_opf_t)(1ULL << __REQ_POLLED)
+#define REQ_ALLOC_CACHE	(__force blk_opf_t)(1ULL << __REQ_ALLOC_CACHE)
+
+#define REQ_DRV		(__force blk_opf_t)(1ULL << __REQ_DRV)
+#define REQ_SWAP	(__force blk_opf_t)(1ULL << __REQ_SWAP)
 
 #define REQ_FAILFAST_MASK \
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
@@ -469,22 +474,22 @@ static inline enum req_op bio_op(const struct bio *bio)
 }
 
 /* obsolete, don't use in new code */
-static inline void bio_set_op_attrs(struct bio *bio, unsigned op,
-		unsigned op_flags)
+static inline void bio_set_op_attrs(struct bio *bio, enum req_op op,
+				    blk_opf_t op_flags)
 {
 	bio->bi_opf = op | op_flags;
 }
 
-static inline bool op_is_write(unsigned int op)
+static inline bool op_is_write(blk_opf_t op)
 {
-	return (op & 1);
+	return !!(op & (__force blk_opf_t)1);
 }
 
 /*
  * Check if the bio or request is one that needs special treatment in the
  * flush state machine.
  */
-static inline bool op_is_flush(unsigned int op)
+static inline bool op_is_flush(blk_opf_t op)
 {
 	return op & (REQ_FUA | REQ_PREFLUSH);
 }
@@ -494,13 +499,13 @@ static inline bool op_is_flush(unsigned int op)
  * PREFLUSH flag.  Other operations may be marked as synchronous using the
  * REQ_SYNC flag.
  */
-static inline bool op_is_sync(unsigned int op)
+static inline bool op_is_sync(blk_opf_t op)
 {
 	return (op & REQ_OP_MASK) == REQ_OP_READ ||
 		(op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH));
 }
 
-static inline bool op_is_discard(unsigned int op)
+static inline bool op_is_discard(blk_opf_t op)
 {
 	return (op & REQ_OP_MASK) == REQ_OP_DISCARD;
 }
-- 
cgit 


From 16458cf3bd15e5624205df6e8a76b9a5363555f3 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 14 Jul 2022 11:06:32 -0700
Subject: block: Use the new blk_opf_t type

Use the new blk_opf_t type for arguments and variables that represent
request flags or a bitwise combination of a request operation and
request flags. Rename the function arguments and also a structure member
that hold a request operation and flags from 'rw' into 'opf'.

This patch does not change any functionality.

Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Damien Le Moal <damien.lemoal@wdc.com>
Cc: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20220714180729.1065367-7-bvanassche@acm.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c               | 10 +++++-----
 block/blk-cgroup-rwstat.h |  8 ++++----
 block/blk-core.c          |  2 +-
 block/blk-flush.c         |  6 +++---
 block/blk-merge.c         |  6 +++---
 block/blk-mq-debugfs.c    |  4 ++--
 block/blk-mq.c            | 15 ++++++++-------
 block/blk-mq.h            |  6 +++---
 block/blk-wbt.c           | 16 ++++++++--------
 block/elevator.h          |  2 +-
 block/fops.c              | 12 ++++++------
 include/linux/bio.h       | 10 +++++-----
 include/linux/blk-mq.h    |  6 +++---
 include/linux/blkdev.h    |  2 +-
 14 files changed, 53 insertions(+), 52 deletions(-)

(limited to 'include')

diff --git a/block/bio.c b/block/bio.c
index 888ee81ea303..6f9f883f9a65 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -239,7 +239,7 @@ static void bio_free(struct bio *bio)
  * when IO has completed, or when the bio is released.
  */
 void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
-	      unsigned short max_vecs, unsigned int opf)
+	      unsigned short max_vecs, blk_opf_t opf)
 {
 	bio->bi_next = NULL;
 	bio->bi_bdev = bdev;
@@ -292,7 +292,7 @@ EXPORT_SYMBOL(bio_init);
  *   preserved are the ones that are initialized by bio_alloc_bioset(). See
  *   comment in struct bio.
  */
-void bio_reset(struct bio *bio, struct block_device *bdev, unsigned int opf)
+void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf)
 {
 	bio_uninit(bio);
 	memset(bio, 0, BIO_RESET_BYTES);
@@ -341,7 +341,7 @@ void bio_chain(struct bio *bio, struct bio *parent)
 EXPORT_SYMBOL(bio_chain);
 
 struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
-		unsigned int nr_pages, unsigned int opf, gfp_t gfp)
+		unsigned int nr_pages, blk_opf_t opf, gfp_t gfp)
 {
 	struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp);
 
@@ -409,7 +409,7 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
 }
 
 static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
-		unsigned short nr_vecs, unsigned int opf, gfp_t gfp,
+		unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp,
 		struct bio_set *bs)
 {
 	struct bio_alloc_cache *cache;
@@ -468,7 +468,7 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
  * Returns: Pointer to new bio on success, NULL on failure.
  */
 struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
-			     unsigned int opf, gfp_t gfp_mask,
+			     blk_opf_t opf, gfp_t gfp_mask,
 			     struct bio_set *bs)
 {
 	gfp_t saved_gfp = gfp_mask;
diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h
index 9f2723b34b75..022527b0b043 100644
--- a/block/blk-cgroup-rwstat.h
+++ b/block/blk-cgroup-rwstat.h
@@ -59,20 +59,20 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
  * caller is responsible for synchronizing calls to this function.
  */
 static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
-				   unsigned int op, uint64_t val)
+				   blk_opf_t opf, uint64_t val)
 {
 	struct percpu_counter *cnt;
 
-	if (op_is_discard(op))
+	if (op_is_discard(opf))
 		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD];
-	else if (op_is_write(op))
+	else if (op_is_write(opf))
 		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
 	else
 		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
 
 	percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
 
-	if (op_is_sync(op))
+	if (op_is_sync(opf))
 		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
 	else
 		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
diff --git a/block/blk-core.c b/block/blk-core.c
index 67b8bcfa27f0..123468b9d2e4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1203,7 +1203,7 @@ EXPORT_SYMBOL_GPL(blk_io_schedule);
 
 int __init blk_dev_init(void)
 {
-	BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS));
+	BUILD_BUG_ON((__force u32)REQ_OP_LAST >= (1 << REQ_OP_BITS));
 	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
 			sizeof_field(struct request, cmd_flags));
 	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
diff --git a/block/blk-flush.c b/block/blk-flush.c
index c68968724870..d20a0c6b2c66 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -94,7 +94,7 @@ enum {
 };
 
 static void blk_kick_flush(struct request_queue *q,
-			   struct blk_flush_queue *fq, unsigned int flags);
+			   struct blk_flush_queue *fq, blk_opf_t flags);
 
 static inline struct blk_flush_queue *
 blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
@@ -173,7 +173,7 @@ static void blk_flush_complete_seq(struct request *rq,
 {
 	struct request_queue *q = rq->q;
 	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
-	unsigned int cmd_flags;
+	blk_opf_t cmd_flags;
 
 	BUG_ON(rq->flush.seq & seq);
 	rq->flush.seq |= seq;
@@ -290,7 +290,7 @@ bool is_flush_rq(struct request *rq)
  *
  */
 static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
-			   unsigned int flags)
+			   blk_opf_t flags)
 {
 	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
 	struct request *first_rq =
diff --git a/block/blk-merge.c b/block/blk-merge.c
index de178a8b4c82..3c3f785f558a 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -712,7 +712,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
  */
 void blk_rq_set_mixed_merge(struct request *rq)
 {
-	unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
+	blk_opf_t ff = rq->cmd_flags & REQ_FAILFAST_MASK;
 	struct bio *bio;
 
 	if (rq->rq_flags & RQF_MIXED_MERGE)
@@ -928,7 +928,7 @@ enum bio_merge_status {
 static enum bio_merge_status bio_attempt_back_merge(struct request *req,
 		struct bio *bio, unsigned int nr_segs)
 {
-	const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
+	const blk_opf_t ff = bio->bi_opf & REQ_FAILFAST_MASK;
 
 	if (!ll_back_merge_fn(req, bio, nr_segs))
 		return BIO_MERGE_FAILED;
@@ -952,7 +952,7 @@ static enum bio_merge_status bio_attempt_back_merge(struct request *req,
 static enum bio_merge_status bio_attempt_front_merge(struct request *req,
 		struct bio *bio, unsigned int nr_segs)
 {
-	const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
+	const blk_opf_t ff = bio->bi_opf & REQ_FAILFAST_MASK;
 
 	if (!ll_front_merge_fn(req, bio, nr_segs))
 		return BIO_MERGE_FAILED;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 6cc2411e2d26..8559cea7f300 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -313,8 +313,8 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
 	else
 		seq_printf(m, "%s", op_str);
 	seq_puts(m, ", .cmd_flags=");
-	blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name,
-		       ARRAY_SIZE(cmd_flag_name));
+	blk_flags_show(m, (__force unsigned int)(rq->cmd_flags & ~REQ_OP_MASK),
+		       cmd_flag_name, ARRAY_SIZE(cmd_flag_name));
 	seq_puts(m, ", .rq_flags=");
 	blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
 		       ARRAY_SIZE(rqf_name));
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f1b84e20b1a9..d716b7f3763f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -510,13 +510,13 @@ retry:
 					alloc_time_ns);
 }
 
-struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
+struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
 		blk_mq_req_flags_t flags)
 {
 	struct blk_mq_alloc_data data = {
 		.q		= q,
 		.flags		= flags,
-		.cmd_flags	= op,
+		.cmd_flags	= opf,
 		.nr_tags	= 1,
 	};
 	struct request *rq;
@@ -540,12 +540,12 @@ out_queue_exit:
 EXPORT_SYMBOL(blk_mq_alloc_request);
 
 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
-	unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
+	blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx)
 {
 	struct blk_mq_alloc_data data = {
 		.q		= q,
 		.flags		= flags,
-		.cmd_flags	= op,
+		.cmd_flags	= opf,
 		.nr_tags	= 1,
 	};
 	u64 alloc_time_ns = 0;
@@ -660,7 +660,7 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
 {
 	printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
 		rq->q->disk ? rq->q->disk->disk_name : "?",
-		(unsigned long long) rq->cmd_flags);
+		(__force unsigned long long) rq->cmd_flags);
 
 	printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",
 	       (unsigned long long)blk_rq_pos(rq),
@@ -713,8 +713,9 @@ static void blk_print_req_error(struct request *req, blk_status_t status)
 		"phys_seg %u prio class %u\n",
 		blk_status_to_str(status),
 		req->q->disk ? req->q->disk->disk_name : "?",
-		blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
-		req->cmd_flags & ~REQ_OP_MASK,
+		blk_rq_pos(req), (__force u32)req_op(req),
+		blk_op_str(req_op(req)),
+		(__force u32)(req->cmd_flags & ~REQ_OP_MASK),
 		req->nr_phys_segments,
 		IOPRIO_PRIO_CLASS(req->ioprio));
 }
diff --git a/block/blk-mq.h b/block/blk-mq.h
index e694ec67d646..8ca453ac243d 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -86,7 +86,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *
 	return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]);
 }
 
-static inline enum hctx_type blk_mq_get_hctx_type(unsigned int opf)
+static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf)
 {
 	enum hctx_type type = HCTX_TYPE_DEFAULT;
 
@@ -107,7 +107,7 @@ static inline enum hctx_type blk_mq_get_hctx_type(unsigned int opf)
  * @ctx: software queue cpu ctx
  */
 static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
-						     unsigned int opf,
+						     blk_opf_t opf,
 						     struct blk_mq_ctx *ctx)
 {
 	return ctx->hctxs[blk_mq_get_hctx_type(opf)];
@@ -152,7 +152,7 @@ struct blk_mq_alloc_data {
 	struct request_queue *q;
 	blk_mq_req_flags_t flags;
 	unsigned int shallow_depth;
-	unsigned int cmd_flags;
+	blk_opf_t cmd_flags;
 	req_flags_t rq_flags;
 
 	/* allocate multiple requests/tags in one go */
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 7bf09ae06577..f2e4bf1dca47 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -451,7 +451,7 @@ static bool close_io(struct rq_wb *rwb)
 
 #define REQ_HIPRIO	(REQ_SYNC | REQ_META | REQ_PRIO)
 
-static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
+static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf)
 {
 	unsigned int limit;
 
@@ -462,7 +462,7 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
 	if (!rwb_enabled(rwb))
 		return UINT_MAX;
 
-	if ((rw & REQ_OP_MASK) == REQ_OP_DISCARD)
+	if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD)
 		return rwb->wb_background;
 
 	/*
@@ -473,9 +473,9 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
 	 * the idle limit, or go to normal if we haven't had competing
 	 * IO for a bit.
 	 */
-	if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
+	if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
 		limit = rwb->rq_depth.max_depth;
-	else if ((rw & REQ_BACKGROUND) || close_io(rwb)) {
+	else if ((opf & REQ_BACKGROUND) || close_io(rwb)) {
 		/*
 		 * If less than 100ms since we completed unrelated IO,
 		 * limit us to half the depth for background writeback.
@@ -490,13 +490,13 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
 struct wbt_wait_data {
 	struct rq_wb *rwb;
 	enum wbt_flags wb_acct;
-	unsigned long rw;
+	blk_opf_t opf;
 };
 
 static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data)
 {
 	struct wbt_wait_data *data = private_data;
-	return rq_wait_inc_below(rqw, get_limit(data->rwb, data->rw));
+	return rq_wait_inc_below(rqw, get_limit(data->rwb, data->opf));
 }
 
 static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data)
@@ -510,13 +510,13 @@ static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data)
  * the timer to kick off queuing again.
  */
 static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
-		       unsigned long rw)
+		       blk_opf_t opf)
 {
 	struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
 	struct wbt_wait_data data = {
 		.rwb = rwb,
 		.wb_acct = wb_acct,
-		.rw = rw,
+		.opf = opf,
 	};
 
 	rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb);
diff --git a/block/elevator.h b/block/elevator.h
index 16cd8bdedb7e..3f0593b3bf9d 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -34,7 +34,7 @@ struct elevator_mq_ops {
 	int (*request_merge)(struct request_queue *q, struct request **, struct bio *);
 	void (*request_merged)(struct request_queue *, struct request *, enum elv_merge);
 	void (*requests_merged)(struct request_queue *, struct request *, struct request *);
-	void (*limit_depth)(unsigned int, struct blk_mq_alloc_data *);
+	void (*limit_depth)(blk_opf_t, struct blk_mq_alloc_data *);
 	void (*prepare_request)(struct request *);
 	void (*finish_request)(struct request *);
 	void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool);
diff --git a/block/fops.c b/block/fops.c
index 86d3cab9bf93..29066ac5a2fa 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -32,14 +32,14 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock,
 	return 0;
 }
 
-static unsigned int dio_bio_write_op(struct kiocb *iocb)
+static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
 {
-	unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
+	blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
 
 	/* avoid the need for a I/O completion work item */
 	if (iocb->ki_flags & IOCB_DSYNC)
-		op |= REQ_FUA;
-	return op;
+		opf |= REQ_FUA;
+	return opf;
 }
 
 static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos,
@@ -175,7 +175,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	struct blkdev_dio *dio;
 	struct bio *bio;
 	bool is_read = (iov_iter_rw(iter) == READ), is_sync;
-	unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
+	blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
 	loff_t pos = iocb->ki_pos;
 	int ret = 0;
 
@@ -297,7 +297,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 {
 	struct block_device *bdev = iocb->ki_filp->private_data;
 	bool is_read = iov_iter_rw(iter) == READ;
-	unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
+	blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
 	struct blkdev_dio *dio;
 	struct bio *bio;
 	loff_t pos = iocb->ki_pos;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 992ee987f273..ca22b06700a9 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -405,7 +405,7 @@ extern void bioset_exit(struct bio_set *);
 extern int biovec_init_pool(mempool_t *pool, int pool_entries);
 
 struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
-			     unsigned int opf, gfp_t gfp_mask,
+			     blk_opf_t opf, gfp_t gfp_mask,
 			     struct bio_set *bs);
 struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask);
 extern void bio_put(struct bio *);
@@ -418,7 +418,7 @@ int bio_init_clone(struct block_device *bdev, struct bio *bio,
 extern struct bio_set fs_bio_set;
 
 static inline struct bio *bio_alloc(struct block_device *bdev,
-		unsigned short nr_vecs, unsigned int opf, gfp_t gfp_mask)
+		unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp_mask)
 {
 	return bio_alloc_bioset(bdev, nr_vecs, opf, gfp_mask, &fs_bio_set);
 }
@@ -456,9 +456,9 @@ struct request_queue;
 
 extern int submit_bio_wait(struct bio *bio);
 void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
-	      unsigned short max_vecs, unsigned int opf);
+	      unsigned short max_vecs, blk_opf_t opf);
 extern void bio_uninit(struct bio *);
-void bio_reset(struct bio *bio, struct block_device *bdev, unsigned int opf);
+void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf);
 void bio_chain(struct bio *, struct bio *);
 
 int bio_add_page(struct bio *, struct page *, unsigned len, unsigned off);
@@ -789,6 +789,6 @@ static inline void bio_clear_polled(struct bio *bio)
 }
 
 struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
-		unsigned int nr_pages, unsigned int opf, gfp_t gfp);
+		unsigned int nr_pages, blk_opf_t opf, gfp_t gfp);
 
 #endif /* __LINUX_BIO_H */
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 677195de0663..effee1dc715a 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -80,7 +80,7 @@ struct request {
 	struct blk_mq_ctx *mq_ctx;
 	struct blk_mq_hw_ctx *mq_hctx;
 
-	unsigned int cmd_flags;		/* op and common flags */
+	blk_opf_t cmd_flags;		/* op and common flags */
 	req_flags_t rq_flags;
 
 	int tag;
@@ -715,10 +715,10 @@ enum {
 	BLK_MQ_REQ_PM		= (__force blk_mq_req_flags_t)(1 << 2),
 };
 
-struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
+struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
 		blk_mq_req_flags_t flags);
 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
-		unsigned int op, blk_mq_req_flags_t flags,
+		blk_opf_t opf, blk_mq_req_flags_t flags,
 		unsigned int hctx_idx);
 
 /*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ca2ff113ea00..d04bdf549efa 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -250,7 +250,7 @@ static inline int blk_validate_block_size(unsigned long bsize)
 	return 0;
 }
 
-static inline bool blk_op_is_passthrough(unsigned int op)
+static inline bool blk_op_is_passthrough(blk_opf_t op)
 {
 	op &= REQ_OP_MASK;
 	return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT;
-- 
cgit 


From 919dbca8670d0f7828dfbb2f9b434ac22dca8d2e Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 14 Jul 2022 11:06:37 -0700
Subject: blktrace: Use the new blk_opf_t type

Improve static type checking by using the new blk_opf_t type for a function
argument that represents a combination of a request operation and request
flags. Rename that argument from 'op' into 'opf' to make its role more
clear.

Cc: Christoph Hellwig <hch@lst.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20220714180729.1065367-12-bvanassche@acm.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blktrace_api.h |  3 ++-
 kernel/trace/blktrace.c      | 51 ++++++++++++++++++++++----------------------
 2 files changed, 27 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index f6f9b544365a..cfbda114348c 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -7,6 +7,7 @@
 #include <linux/compat.h>
 #include <uapi/linux/blktrace_api.h>
 #include <linux/list.h>
+#include <linux/blk_types.h>
 
 #if defined(CONFIG_BLK_DEV_IO_TRACE)
 
@@ -105,7 +106,7 @@ struct compat_blk_user_trace_setup {
 
 #endif
 
-void blk_fill_rwbs(char *rwbs, unsigned int op);
+void blk_fill_rwbs(char *rwbs, blk_opf_t opf);
 
 static inline sector_t blk_rq_trace_sector(struct request *rq)
 {
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 4327b51da403..150058f5daa9 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -205,7 +205,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
 #define BLK_TC_PREFLUSH		BLK_TC_FLUSH
 
 /* The ilog2() calls fall out because they're constant */
-#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
+#define MASK_TC_BIT(rw, __name) ((__force u32)(rw & REQ_ ## __name) <<	\
 	  (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name))
 
 /*
@@ -213,8 +213,8 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
  * blk_io_trace structure and places it in a per-cpu subbuffer.
  */
 static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
-		     int op, int op_flags, u32 what, int error, int pdu_len,
-		     void *pdu_data, u64 cgid)
+			    const blk_opf_t opf, u32 what, int error,
+			    int pdu_len, void *pdu_data, u64 cgid)
 {
 	struct task_struct *tsk = current;
 	struct ring_buffer_event *event = NULL;
@@ -227,16 +227,17 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	int cpu;
 	bool blk_tracer = blk_tracer_enabled;
 	ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
+	const enum req_op op = opf & REQ_OP_MASK;
 
 	if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
 		return;
 
 	what |= ddir_act[op_is_write(op) ? WRITE : READ];
-	what |= MASK_TC_BIT(op_flags, SYNC);
-	what |= MASK_TC_BIT(op_flags, RAHEAD);
-	what |= MASK_TC_BIT(op_flags, META);
-	what |= MASK_TC_BIT(op_flags, PREFLUSH);
-	what |= MASK_TC_BIT(op_flags, FUA);
+	what |= MASK_TC_BIT(opf, SYNC);
+	what |= MASK_TC_BIT(opf, RAHEAD);
+	what |= MASK_TC_BIT(opf, META);
+	what |= MASK_TC_BIT(opf, PREFLUSH);
+	what |= MASK_TC_BIT(opf, FUA);
 	if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)
 		what |= BLK_TC_ACT(BLK_TC_DISCARD);
 	if (op == REQ_OP_FLUSH)
@@ -842,9 +843,8 @@ static void blk_add_trace_rq(struct request *rq, blk_status_t error,
 	else
 		what |= BLK_TC_ACT(BLK_TC_FS);
 
-	__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
-			rq->cmd_flags, what, blk_status_to_errno(error), 0,
-			NULL, cgid);
+	__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, rq->cmd_flags,
+			what, blk_status_to_errno(error), 0, NULL, cgid);
 	rcu_read_unlock();
 }
 
@@ -903,7 +903,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
 	}
 
 	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
-			bio_op(bio), bio->bi_opf, what, error, 0, NULL,
+			bio->bi_opf, what, error, 0, NULL,
 			blk_trace_bio_get_cgid(q, bio));
 	rcu_read_unlock();
 }
@@ -949,7 +949,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
 	rcu_read_lock();
 	bt = rcu_dereference(q->blk_trace);
 	if (bt)
-		__blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0);
+		__blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0);
 	rcu_read_unlock();
 }
 
@@ -969,7 +969,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
 		else
 			what = BLK_TA_UNPLUG_TIMER;
 
-		__blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0);
+		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0);
 	}
 	rcu_read_unlock();
 }
@@ -985,8 +985,7 @@ static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu)
 		__be64 rpdu = cpu_to_be64(pdu);
 
 		__blk_add_trace(bt, bio->bi_iter.bi_sector,
-				bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
-				BLK_TA_SPLIT,
+				bio->bi_iter.bi_size, bio->bi_opf, BLK_TA_SPLIT,
 				blk_status_to_errno(bio->bi_status),
 				sizeof(rpdu), &rpdu,
 				blk_trace_bio_get_cgid(q, bio));
@@ -1022,7 +1021,7 @@ static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev,
 	r.sector_from = cpu_to_be64(from);
 
 	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
-			bio_op(bio), bio->bi_opf, BLK_TA_REMAP,
+			bio->bi_opf, BLK_TA_REMAP,
 			blk_status_to_errno(bio->bi_status),
 			sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
 	rcu_read_unlock();
@@ -1058,7 +1057,7 @@ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev,
 	r.sector_from = cpu_to_be64(from);
 
 	__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
-			req_op(rq), rq->cmd_flags, BLK_TA_REMAP, 0,
+			rq->cmd_flags, BLK_TA_REMAP, 0,
 			sizeof(r), &r, blk_trace_request_get_cgid(rq));
 	rcu_read_unlock();
 }
@@ -1084,7 +1083,7 @@ void blk_add_driver_data(struct request *rq, void *data, size_t len)
 		return;
 	}
 
-	__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
+	__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0,
 				BLK_TA_DRV_DATA, 0, len, data,
 				blk_trace_request_get_cgid(rq));
 	rcu_read_unlock();
@@ -1881,14 +1880,14 @@ out:
  *     caller with resulting string.
  *
  **/
-void blk_fill_rwbs(char *rwbs, unsigned int op)
+void blk_fill_rwbs(char *rwbs, blk_opf_t opf)
 {
 	int i = 0;
 
-	if (op & REQ_PREFLUSH)
+	if (opf & REQ_PREFLUSH)
 		rwbs[i++] = 'F';
 
-	switch (op & REQ_OP_MASK) {
+	switch (opf & REQ_OP_MASK) {
 	case REQ_OP_WRITE:
 		rwbs[i++] = 'W';
 		break;
@@ -1909,13 +1908,13 @@ void blk_fill_rwbs(char *rwbs, unsigned int op)
 		rwbs[i++] = 'N';
 	}
 
-	if (op & REQ_FUA)
+	if (opf & REQ_FUA)
 		rwbs[i++] = 'F';
-	if (op & REQ_RAHEAD)
+	if (opf & REQ_RAHEAD)
 		rwbs[i++] = 'A';
-	if (op & REQ_SYNC)
+	if (opf & REQ_SYNC)
 		rwbs[i++] = 'S';
-	if (op & REQ_META)
+	if (opf & REQ_META)
 		rwbs[i++] = 'M';
 
 	rwbs[i] = '\0';
-- 
cgit 


From 581075e4f6475bb97c73ecccf68636a9453a31fd Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 14 Jul 2022 11:06:47 -0700
Subject: dm/core: Reduce the size of struct dm_io_request

Combine the bi_op and bi_op_flags into the bi_opf member. Use the new
blk_opf_t type to improve static type checking. This patch does not
change any functionality.

Cc: Alasdair Kergon <agk@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20220714180729.1065367-22-bvanassche@acm.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/dm-bufio.c           |  9 +++------
 drivers/md/dm-integrity.c       | 15 +++++----------
 drivers/md/dm-io.c              | 10 ++++++----
 drivers/md/dm-kcopyd.c          |  3 +--
 drivers/md/dm-log.c             |  6 ++----
 drivers/md/dm-raid1.c           | 12 +++++-------
 drivers/md/dm-snap-persistent.c |  3 +--
 drivers/md/dm-writecache.c      | 12 ++++--------
 include/linux/dm-io.h           |  4 ++--
 9 files changed, 29 insertions(+), 45 deletions(-)

(limited to 'include')

diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 5ffa1dcf84cf..1b7acda45c78 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -582,8 +582,7 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t sector,
 {
 	int r;
 	struct dm_io_request io_req = {
-		.bi_op = rw,
-		.bi_op_flags = 0,
+		.bi_opf = rw,
 		.notify.fn = dmio_complete,
 		.notify.context = b,
 		.client = b->c->dm_io,
@@ -1341,8 +1340,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
 int dm_bufio_issue_flush(struct dm_bufio_client *c)
 {
 	struct dm_io_request io_req = {
-		.bi_op = REQ_OP_WRITE,
-		.bi_op_flags = REQ_PREFLUSH | REQ_SYNC,
+		.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC,
 		.mem.type = DM_IO_KMEM,
 		.mem.ptr.addr = NULL,
 		.client = c->dm_io,
@@ -1365,8 +1363,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
 int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count)
 {
 	struct dm_io_request io_req = {
-		.bi_op = REQ_OP_DISCARD,
-		.bi_op_flags = REQ_SYNC,
+		.bi_opf = REQ_OP_DISCARD | REQ_SYNC,
 		.mem.type = DM_IO_KMEM,
 		.mem.ptr.addr = NULL,
 		.client = c->dm_io,
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 148978ad03a8..2ccc103dea1e 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -557,8 +557,7 @@ static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags)
 	struct dm_io_region io_loc;
 	int r;
 
-	io_req.bi_op = op;
-	io_req.bi_op_flags = op_flags;
+	io_req.bi_opf = op | op_flags;
 	io_req.mem.type = DM_IO_KMEM;
 	io_req.mem.ptr.addr = ic->sb;
 	io_req.notify.fn = NULL;
@@ -1067,8 +1066,7 @@ static void rw_journal_sectors(struct dm_integrity_c *ic, int op, int op_flags,
 	pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
 	pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
 
-	io_req.bi_op = op;
-	io_req.bi_op_flags = op_flags;
+	io_req.bi_opf = op | op_flags;
 	io_req.mem.type = DM_IO_PAGE_LIST;
 	if (ic->journal_io)
 		io_req.mem.ptr.pl = &ic->journal_io[pl_index];
@@ -1188,8 +1186,7 @@ static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsig
 	pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
 	pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
 
-	io_req.bi_op = REQ_OP_WRITE;
-	io_req.bi_op_flags = 0;
+	io_req.bi_opf = REQ_OP_WRITE;
 	io_req.mem.type = DM_IO_PAGE_LIST;
 	io_req.mem.ptr.pl = &ic->journal[pl_index];
 	io_req.mem.offset = pl_offset;
@@ -1516,8 +1513,7 @@ static void dm_integrity_flush_buffers(struct dm_integrity_c *ic, bool flush_dat
 	if (!ic->meta_dev)
 		flush_data = false;
 	if (flush_data) {
-		fr.io_req.bi_op = REQ_OP_WRITE,
-		fr.io_req.bi_op_flags = REQ_PREFLUSH | REQ_SYNC,
+		fr.io_req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC,
 		fr.io_req.mem.type = DM_IO_KMEM,
 		fr.io_req.mem.ptr.addr = NULL,
 		fr.io_req.notify.fn = flush_notify,
@@ -2706,8 +2702,7 @@ next_chunk:
 	if (unlikely(dm_integrity_failed(ic)))
 		goto err;
 
-	io_req.bi_op = REQ_OP_READ;
-	io_req.bi_op_flags = 0;
+	io_req.bi_opf = REQ_OP_READ;
 	io_req.mem.type = DM_IO_VMA;
 	io_req.mem.ptr.addr = ic->recalc_buffer;
 	io_req.notify.fn = NULL;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index e4b95eaeec8c..0606e00d1817 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -489,7 +489,7 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
 
 	case DM_IO_VMA:
 		flush_kernel_vmap_range(io_req->mem.ptr.vma, size);
-		if (io_req->bi_op == REQ_OP_READ) {
+		if ((io_req->bi_opf & REQ_OP_MASK) == REQ_OP_READ) {
 			dp->vma_invalidate_address = io_req->mem.ptr.vma;
 			dp->vma_invalidate_size = size;
 		}
@@ -519,11 +519,13 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions,
 
 	if (!io_req->notify.fn)
 		return sync_io(io_req->client, num_regions, where,
-			       io_req->bi_op, io_req->bi_op_flags, &dp,
+			       io_req->bi_opf & REQ_OP_MASK,
+			       io_req->bi_opf & ~REQ_OP_MASK, &dp,
 			       sync_error_bits);
 
-	return async_io(io_req->client, num_regions, where, io_req->bi_op,
-			io_req->bi_op_flags, &dp, io_req->notify.fn,
+	return async_io(io_req->client, num_regions, where,
+			io_req->bi_opf & REQ_OP_MASK,
+			io_req->bi_opf & ~REQ_OP_MASK, &dp, io_req->notify.fn,
 			io_req->notify.context);
 }
 EXPORT_SYMBOL(dm_io);
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 37b03ab7e5c9..a99b994e2b62 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -549,8 +549,7 @@ static int run_io_job(struct kcopyd_job *job)
 {
 	int r;
 	struct dm_io_request io_req = {
-		.bi_op = job->rw,
-		.bi_op_flags = 0,
+		.bi_opf = job->rw,
 		.mem.type = DM_IO_PAGE_LIST,
 		.mem.ptr.pl = job->pages,
 		.mem.offset = 0,
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 0c6620e7b7bf..56ad13f9347b 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -293,8 +293,7 @@ static void header_from_disk(struct log_header_core *core, struct log_header_dis
 
 static int rw_header(struct log_c *lc, int op)
 {
-	lc->io_req.bi_op = op;
-	lc->io_req.bi_op_flags = 0;
+	lc->io_req.bi_opf = op;
 
 	return dm_io(&lc->io_req, 1, &lc->header_location, NULL);
 }
@@ -307,8 +306,7 @@ static int flush_header(struct log_c *lc)
 		.count = 0,
 	};
 
-	lc->io_req.bi_op = REQ_OP_WRITE;
-	lc->io_req.bi_op_flags = REQ_PREFLUSH;
+	lc->io_req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
 
 	return dm_io(&lc->io_req, 1, &null_location, NULL);
 }
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 8811d484fdd1..06a38dc32025 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -260,8 +260,7 @@ static int mirror_flush(struct dm_target *ti)
 	struct dm_io_region io[MAX_NR_MIRRORS];
 	struct mirror *m;
 	struct dm_io_request io_req = {
-		.bi_op = REQ_OP_WRITE,
-		.bi_op_flags = REQ_PREFLUSH | REQ_SYNC,
+		.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC,
 		.mem.type = DM_IO_KMEM,
 		.mem.ptr.addr = NULL,
 		.client = ms->io_client,
@@ -535,8 +534,7 @@ static void read_async_bio(struct mirror *m, struct bio *bio)
 {
 	struct dm_io_region io;
 	struct dm_io_request io_req = {
-		.bi_op = REQ_OP_READ,
-		.bi_op_flags = 0,
+		.bi_opf = REQ_OP_READ,
 		.mem.type = DM_IO_BIO,
 		.mem.ptr.bio = bio,
 		.notify.fn = read_callback,
@@ -648,9 +646,9 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
 	unsigned int i;
 	struct dm_io_region io[MAX_NR_MIRRORS], *dest = io;
 	struct mirror *m;
+	blk_opf_t op_flags = bio->bi_opf & (REQ_FUA | REQ_PREFLUSH);
 	struct dm_io_request io_req = {
-		.bi_op = REQ_OP_WRITE,
-		.bi_op_flags = bio->bi_opf & (REQ_FUA | REQ_PREFLUSH),
+		.bi_opf = REQ_OP_WRITE | op_flags,
 		.mem.type = DM_IO_BIO,
 		.mem.ptr.bio = bio,
 		.notify.fn = write_callback,
@@ -659,7 +657,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
 	};
 
 	if (bio_op(bio) == REQ_OP_DISCARD) {
-		io_req.bi_op = REQ_OP_DISCARD;
+		io_req.bi_opf = REQ_OP_DISCARD | op_flags;
 		io_req.mem.type = DM_IO_KMEM;
 		io_req.mem.ptr.addr = NULL;
 	}
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 3bb5cff5d6fc..eaf969de3d3a 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -235,8 +235,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int op,
 		.count = ps->store->chunk_size,
 	};
 	struct dm_io_request io_req = {
-		.bi_op = op,
-		.bi_op_flags = op_flags,
+		.bi_opf = op | op_flags,
 		.mem.type = DM_IO_VMA,
 		.mem.ptr.vma = area,
 		.client = ps->io_client,
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index d74c5a7a0ab4..2b994b3e22a7 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -523,8 +523,7 @@ static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
 
 		region.sector += wc->start_sector;
 		atomic_inc(&endio.count);
-		req.bi_op = REQ_OP_WRITE;
-		req.bi_op_flags = REQ_SYNC;
+		req.bi_opf = REQ_OP_WRITE | REQ_SYNC;
 		req.mem.type = DM_IO_VMA;
 		req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
 		req.client = wc->dm_io;
@@ -562,8 +561,7 @@ static void ssd_commit_superblock(struct dm_writecache *wc)
 
 	region.sector += wc->start_sector;
 
-	req.bi_op = REQ_OP_WRITE;
-	req.bi_op_flags = REQ_SYNC | REQ_FUA;
+	req.bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_FUA;
 	req.mem.type = DM_IO_VMA;
 	req.mem.ptr.vma = (char *)wc->memory_map;
 	req.client = wc->dm_io;
@@ -592,8 +590,7 @@ static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
 	region.bdev = dev->bdev;
 	region.sector = 0;
 	region.count = 0;
-	req.bi_op = REQ_OP_WRITE;
-	req.bi_op_flags = REQ_PREFLUSH;
+	req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
 	req.mem.type = DM_IO_KMEM;
 	req.mem.ptr.addr = NULL;
 	req.client = wc->dm_io;
@@ -981,8 +978,7 @@ static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors
 	region.bdev = wc->ssd_dev->bdev;
 	region.sector = wc->start_sector;
 	region.count = n_sectors;
-	req.bi_op = REQ_OP_READ;
-	req.bi_op_flags = REQ_SYNC;
+	req.bi_opf = REQ_OP_READ | REQ_SYNC;
 	req.mem.type = DM_IO_VMA;
 	req.mem.ptr.vma = (char *)wc->memory_map;
 	req.client = wc->dm_io;
diff --git a/include/linux/dm-io.h b/include/linux/dm-io.h
index a52c6580cc9a..8e1c4ab5df04 100644
--- a/include/linux/dm-io.h
+++ b/include/linux/dm-io.h
@@ -13,6 +13,7 @@
 #ifdef __KERNEL__
 
 #include <linux/types.h>
+#include <linux/blk_types.h>
 
 struct dm_io_region {
 	struct block_device *bdev;
@@ -57,8 +58,7 @@ struct dm_io_notify {
  */
 struct dm_io_client;
 struct dm_io_request {
-	int bi_op;			/* REQ_OP */
-	int bi_op_flags;		/* req_flag_bits */
+	blk_opf_t	    bi_opf;	/* Request type and flags */
 	struct dm_io_memory mem;	/* Memory to use for io */
 	struct dm_io_notify notify;	/* Synchronous if notify.fn is NULL */
 	struct dm_io_client *client;	/* Client memory handler */
-- 
cgit 


From ea957547e819a21bd49895c6162f78d542867d39 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 14 Jul 2022 11:07:05 -0700
Subject: scsi/core: Improve static type checking

Improve static type checking by using the new blk_opf_t type for the
combination of a request operation and its flags.

Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: John Garry <john.garry@huawei.com>
Cc: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20220714180729.1065367-40-bvanassche@acm.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/scsi/scsi_lib.c  | 6 +++---
 include/scsi/scsi_cmnd.h | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 1b3ca5c16c3d..06ec4705caf9 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1125,12 +1125,12 @@ static void scsi_initialize_rq(struct request *rq)
 	cmd->retries = 0;
 }
 
-struct request *scsi_alloc_request(struct request_queue *q,
-		unsigned int op, blk_mq_req_flags_t flags)
+struct request *scsi_alloc_request(struct request_queue *q, blk_opf_t opf,
+				   blk_mq_req_flags_t flags)
 {
 	struct request *rq;
 
-	rq = blk_mq_alloc_request(q, op, flags);
+	rq = blk_mq_alloc_request(q, opf, flags);
 	if (!IS_ERR(rq))
 		scsi_initialize_rq(rq);
 	return rq;
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index 1e80e70dfa92..bac55decf900 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -386,7 +386,7 @@ static inline unsigned scsi_transfer_length(struct scsi_cmnd *scmd)
 extern void scsi_build_sense(struct scsi_cmnd *scmd, int desc,
 			     u8 key, u8 asc, u8 ascq);
 
-struct request *scsi_alloc_request(struct request_queue *q,
-		unsigned int op, blk_mq_req_flags_t flags);
+struct request *scsi_alloc_request(struct request_queue *q, blk_opf_t opf,
+				   blk_mq_req_flags_t flags);
 
 #endif /* _SCSI_SCSI_CMND_H */
-- 
cgit 


From 2599cac57a9af4e7ce628e2ef41e92797cba4ae2 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 14 Jul 2022 11:07:07 -0700
Subject: scsi/core: Use the new blk_opf_t type

Use the new blk_opf_t type for arguments and variables that represent
request flags. Use the !! operator in scsi_noretry_cmd() to convert the
blk_opf_t type into a boolean. This patch does not change any functionality.

Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: John Garry <john.garry@huawei.com>
Cc: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20220714180729.1065367-42-bvanassche@acm.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/scsi/scsi_lib.c    | 6 +++---
 include/scsi/scsi_device.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 06ec4705caf9..17a617db9ae0 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -209,8 +209,8 @@ void scsi_queue_insert(struct scsi_cmnd *cmd, int reason)
 int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 		 int data_direction, void *buffer, unsigned bufflen,
 		 unsigned char *sense, struct scsi_sense_hdr *sshdr,
-		 int timeout, int retries, u64 flags, req_flags_t rq_flags,
-		 int *resid)
+		 int timeout, int retries, blk_opf_t flags,
+		 req_flags_t rq_flags, int *resid)
 {
 	struct request *req;
 	struct scsi_cmnd *scmd;
@@ -633,7 +633,7 @@ static blk_status_t scsi_result_to_blk_status(struct scsi_cmnd *cmd, int result)
  */
 static unsigned int scsi_rq_err_bytes(const struct request *rq)
 {
-	unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
+	blk_opf_t ff = rq->cmd_flags & REQ_FAILFAST_MASK;
 	unsigned int bytes = 0;
 	struct bio *bio;
 
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index 7cf5f3b7589f..2493bd65351a 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -457,7 +457,7 @@ extern void scsi_sanitize_inquiry_string(unsigned char *s, int len);
 extern int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 			int data_direction, void *buffer, unsigned bufflen,
 			unsigned char *sense, struct scsi_sense_hdr *sshdr,
-			int timeout, int retries, u64 flags,
+			int timeout, int retries, blk_opf_t flags,
 			req_flags_t rq_flags, int *resid);
 /* Make sure any sense buffer is the correct size. */
 #define scsi_execute(sdev, cmd, data_direction, buffer, bufflen, sense,	\
-- 
cgit 


From f8e6e4bd9fd8c452565f3eaeb358e3cc08d880f4 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 14 Jul 2022 11:07:11 -0700
Subject: mm: Use the new blk_opf_t type

Improve static type checking by using the new blk_opf_t type for block
layer request flags.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Jan Kara <jack@suse.cz>
Cc: Stefan Roesch <shr@fb.com>
Cc: NeilBrown <neilb@suse.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20220714180729.1065367-46-bvanassche@acm.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/writeback.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index da21d63f70e2..e91bea371b18 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -101,9 +101,9 @@ struct writeback_control {
 #endif
 };
 
-static inline int wbc_to_write_flags(struct writeback_control *wbc)
+static inline blk_opf_t wbc_to_write_flags(struct writeback_control *wbc)
 {
-	int flags = 0;
+	blk_opf_t flags = 0;
 
 	if (wbc->punt_to_cgroup)
 		flags = REQ_CGROUP_PUNT;
-- 
cgit 


From 3ae7286943ae6f6bfecfe0a3da9d1a4c64f5531f Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 14 Jul 2022 11:07:12 -0700
Subject: fs/buffer: Use the new blk_opf_t type

Improve static type checking by using the new blk_opf_t type for block layer
request flags. Change WRITE into REQ_OP_WRITE. This patch does not change
any functionality since REQ_OP_WRITE == WRITE == 1.

Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20220714180729.1065367-47-bvanassche@acm.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/buffer.c                 | 21 +++++++++++----------
 include/linux/buffer_head.h |  9 +++++----
 2 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/fs/buffer.c b/fs/buffer.c
index 898c7f301b1b..4a00b61f35ec 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -52,8 +52,8 @@
 #include "internal.h"
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
-static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
-			 struct writeback_control *wbc);
+static int submit_bh_wbc(enum req_op op, blk_opf_t op_flags,
+			 struct buffer_head *bh, struct writeback_control *wbc);
 
 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
 
@@ -1716,7 +1716,7 @@ int __block_write_full_page(struct inode *inode, struct page *page,
 	struct buffer_head *bh, *head;
 	unsigned int blocksize, bbits;
 	int nr_underway = 0;
-	int write_flags = wbc_to_write_flags(wbc);
+	blk_opf_t write_flags = wbc_to_write_flags(wbc);
 
 	head = create_page_buffers(page, inode,
 					(1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -2994,8 +2994,8 @@ static void end_bio_bh_io_sync(struct bio *bio)
 	bio_put(bio);
 }
 
-static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
-			 struct writeback_control *wbc)
+static int submit_bh_wbc(enum req_op op, blk_opf_t op_flags,
+			 struct buffer_head *bh, struct writeback_control *wbc)
 {
 	struct bio *bio;
 
@@ -3040,7 +3040,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
 	return 0;
 }
 
-int submit_bh(int op, int op_flags, struct buffer_head *bh)
+int submit_bh(enum req_op op, blk_opf_t op_flags, struct buffer_head *bh)
 {
 	return submit_bh_wbc(op, op_flags, bh, NULL);
 }
@@ -3072,7 +3072,8 @@ EXPORT_SYMBOL(submit_bh);
  * All of the buffers must be for the same device, and must also be a
  * multiple of the current approved size for the device.
  */
-void ll_rw_block(int op, int op_flags,  int nr, struct buffer_head *bhs[])
+void ll_rw_block(enum req_op op, blk_opf_t op_flags, int nr,
+		 struct buffer_head *bhs[])
 {
 	int i;
 
@@ -3081,7 +3082,7 @@ void ll_rw_block(int op, int op_flags,  int nr, struct buffer_head *bhs[])
 
 		if (!trylock_buffer(bh))
 			continue;
-		if (op == WRITE) {
+		if (op == REQ_OP_WRITE) {
 			if (test_clear_buffer_dirty(bh)) {
 				bh->b_end_io = end_buffer_write_sync;
 				get_bh(bh);
@@ -3101,7 +3102,7 @@ void ll_rw_block(int op, int op_flags,  int nr, struct buffer_head *bhs[])
 }
 EXPORT_SYMBOL(ll_rw_block);
 
-void write_dirty_buffer(struct buffer_head *bh, int op_flags)
+void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
 {
 	lock_buffer(bh);
 	if (!test_clear_buffer_dirty(bh)) {
@@ -3119,7 +3120,7 @@ EXPORT_SYMBOL(write_dirty_buffer);
  * and then start new I/O and then wait upon it.  The caller must have a ref on
  * the buffer_head.
  */
-int __sync_dirty_buffer(struct buffer_head *bh, int op_flags)
+int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
 {
 	int ret = 0;
 
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index c9d1463bb20f..9795df9400bd 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -9,6 +9,7 @@
 #define _LINUX_BUFFER_HEAD_H
 
 #include <linux/types.h>
+#include <linux/blk_types.h>
 #include <linux/fs.h>
 #include <linux/linkage.h>
 #include <linux/pagemap.h>
@@ -201,11 +202,11 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
 void free_buffer_head(struct buffer_head * bh);
 void unlock_buffer(struct buffer_head *bh);
 void __lock_buffer(struct buffer_head *bh);
-void ll_rw_block(int, int, int, struct buffer_head * bh[]);
+void ll_rw_block(enum req_op, blk_opf_t, int, struct buffer_head * bh[]);
 int sync_dirty_buffer(struct buffer_head *bh);
-int __sync_dirty_buffer(struct buffer_head *bh, int op_flags);
-void write_dirty_buffer(struct buffer_head *bh, int op_flags);
-int submit_bh(int, int, struct buffer_head *);
+int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags);
+void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags);
+int submit_bh(enum req_op, blk_opf_t, struct buffer_head *);
 void write_boundary_block(struct block_device *bdev,
 			sector_t bblock, unsigned blocksize);
 int bh_uptodate_or_lock(struct buffer_head *bh);
-- 
cgit 


From 1420c4a549bf28ffddbed827d61fb3d4d2132ddb Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 14 Jul 2022 11:07:13 -0700
Subject: fs/buffer: Combine two submit_bh() and ll_rw_block() arguments

Both submit_bh() and ll_rw_block() accept a request operation type and
request flags as their first two arguments. Micro-optimize these two
functions by combining these first two arguments into a single argument.
This patch does not change the behavior of any of the modified code.

Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jan Kara <jack@suse.cz>
Acked-by: Song Liu <song@kernel.org> (for the md changes)
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20220714180729.1065367-48-bvanassche@acm.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/md-bitmap.c      |  4 ++--
 fs/buffer.c                 | 53 +++++++++++++++++++++++----------------------
 fs/ext4/fast_commit.c       |  2 +-
 fs/ext4/mmp.c               |  2 +-
 fs/ext4/super.c             |  6 ++---
 fs/gfs2/bmap.c              |  5 ++---
 fs/gfs2/dir.c               |  5 ++---
 fs/gfs2/meta_io.c           |  9 ++++----
 fs/gfs2/quota.c             |  2 +-
 fs/isofs/compress.c         |  2 +-
 fs/jbd2/commit.c            |  8 +++----
 fs/jbd2/journal.c           |  4 ++--
 fs/jbd2/recovery.c          |  4 ++--
 fs/nilfs2/btnode.c          |  2 +-
 fs/nilfs2/gcinode.c         |  2 +-
 fs/nilfs2/mdt.c             |  2 +-
 fs/ntfs/aops.c              |  6 ++---
 fs/ntfs/compress.c          |  2 +-
 fs/ntfs/file.c              |  2 +-
 fs/ntfs/logfile.c           |  2 +-
 fs/ntfs/mft.c               |  4 ++--
 fs/ntfs3/file.c             |  2 +-
 fs/ntfs3/inode.c            |  2 +-
 fs/ocfs2/aops.c             |  2 +-
 fs/ocfs2/buffer_head_io.c   |  8 +++----
 fs/ocfs2/super.c            |  2 +-
 fs/reiserfs/inode.c         |  4 ++--
 fs/reiserfs/journal.c       | 12 +++++-----
 fs/reiserfs/stree.c         |  4 ++--
 fs/reiserfs/super.c         |  2 +-
 fs/udf/dir.c                |  2 +-
 fs/udf/directory.c          |  2 +-
 fs/udf/inode.c              |  2 +-
 fs/ufs/balloc.c             |  2 +-
 include/linux/buffer_head.h |  4 ++--
 35 files changed, 88 insertions(+), 90 deletions(-)

(limited to 'include')

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 0a21b8317103..bf6dffadbe6f 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -302,7 +302,7 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait)
 			atomic_inc(&bitmap->pending_writes);
 			set_buffer_locked(bh);
 			set_buffer_mapped(bh);
-			submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
+			submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
 			bh = bh->b_this_page;
 		}
 
@@ -394,7 +394,7 @@ static int read_page(struct file *file, unsigned long index,
 			atomic_inc(&bitmap->pending_writes);
 			set_buffer_locked(bh);
 			set_buffer_mapped(bh);
-			submit_bh(REQ_OP_READ, 0, bh);
+			submit_bh(REQ_OP_READ, bh);
 		}
 		blk_cur++;
 		bh = bh->b_this_page;
diff --git a/fs/buffer.c b/fs/buffer.c
index 4a00b61f35ec..af53569930bb 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -52,8 +52,8 @@
 #include "internal.h"
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
-static int submit_bh_wbc(enum req_op op, blk_opf_t op_flags,
-			 struct buffer_head *bh, struct writeback_control *wbc);
+static int submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
+			 struct writeback_control *wbc);
 
 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
 
@@ -562,7 +562,7 @@ void write_boundary_block(struct block_device *bdev,
 	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
 	if (bh) {
 		if (buffer_dirty(bh))
-			ll_rw_block(REQ_OP_WRITE, 0, 1, &bh);
+			ll_rw_block(REQ_OP_WRITE, 1, &bh);
 		put_bh(bh);
 	}
 }
@@ -1174,7 +1174,7 @@ static struct buffer_head *__bread_slow(struct buffer_head *bh)
 	} else {
 		get_bh(bh);
 		bh->b_end_io = end_buffer_read_sync;
-		submit_bh(REQ_OP_READ, 0, bh);
+		submit_bh(REQ_OP_READ, bh);
 		wait_on_buffer(bh);
 		if (buffer_uptodate(bh))
 			return bh;
@@ -1342,7 +1342,7 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
 {
 	struct buffer_head *bh = __getblk(bdev, block, size);
 	if (likely(bh)) {
-		ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
+		ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, &bh);
 		brelse(bh);
 	}
 }
@@ -1353,7 +1353,7 @@ void __breadahead_gfp(struct block_device *bdev, sector_t block, unsigned size,
 {
 	struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
 	if (likely(bh)) {
-		ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
+		ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, &bh);
 		brelse(bh);
 	}
 }
@@ -1804,7 +1804,7 @@ int __block_write_full_page(struct inode *inode, struct page *page,
 	do {
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
-			submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
+			submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
 			nr_underway++;
 		}
 		bh = next;
@@ -1858,7 +1858,7 @@ recover:
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
 			clear_buffer_dirty(bh);
-			submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
+			submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
 			nr_underway++;
 		}
 		bh = next;
@@ -2033,7 +2033,7 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
 		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
 		    !buffer_unwritten(bh) &&
 		     (block_start < from || block_end > to)) {
-			ll_rw_block(REQ_OP_READ, 0, 1, &bh);
+			ll_rw_block(REQ_OP_READ, 1, &bh);
 			*wait_bh++=bh;
 		}
 	}
@@ -2334,7 +2334,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
 		if (buffer_uptodate(bh))
 			end_buffer_async_read(bh, 1);
 		else
-			submit_bh(REQ_OP_READ, 0, bh);
+			submit_bh(REQ_OP_READ, bh);
 	}
 	return 0;
 }
@@ -2665,7 +2665,7 @@ int nobh_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
 		if (block_start < from || block_end > to) {
 			lock_buffer(bh);
 			bh->b_end_io = end_buffer_read_nobh;
-			submit_bh(REQ_OP_READ, 0, bh);
+			submit_bh(REQ_OP_READ, bh);
 			nr_reads++;
 		}
 	}
@@ -2915,7 +2915,7 @@ int block_truncate_page(struct address_space *mapping,
 
 	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
 		err = -EIO;
-		ll_rw_block(REQ_OP_READ, 0, 1, &bh);
+		ll_rw_block(REQ_OP_READ, 1, &bh);
 		wait_on_buffer(bh);
 		/* Uhhuh. Read error. Complain and punt. */
 		if (!buffer_uptodate(bh))
@@ -2994,9 +2994,10 @@ static void end_bio_bh_io_sync(struct bio *bio)
 	bio_put(bio);
 }
 
-static int submit_bh_wbc(enum req_op op, blk_opf_t op_flags,
-			 struct buffer_head *bh, struct writeback_control *wbc)
+static int submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
+			 struct writeback_control *wbc)
 {
+	const enum req_op op = opf & REQ_OP_MASK;
 	struct bio *bio;
 
 	BUG_ON(!buffer_locked(bh));
@@ -3012,11 +3013,11 @@ static int submit_bh_wbc(enum req_op op, blk_opf_t op_flags,
 		clear_buffer_write_io_error(bh);
 
 	if (buffer_meta(bh))
-		op_flags |= REQ_META;
+		opf |= REQ_META;
 	if (buffer_prio(bh))
-		op_flags |= REQ_PRIO;
+		opf |= REQ_PRIO;
 
-	bio = bio_alloc(bh->b_bdev, 1, op | op_flags, GFP_NOIO);
+	bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO);
 
 	fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
 
@@ -3040,9 +3041,9 @@ static int submit_bh_wbc(enum req_op op, blk_opf_t op_flags,
 	return 0;
 }
 
-int submit_bh(enum req_op op, blk_opf_t op_flags, struct buffer_head *bh)
+int submit_bh(blk_opf_t opf, struct buffer_head *bh)
 {
-	return submit_bh_wbc(op, op_flags, bh, NULL);
+	return submit_bh_wbc(opf, bh, NULL);
 }
 EXPORT_SYMBOL(submit_bh);
 
@@ -3072,9 +3073,9 @@ EXPORT_SYMBOL(submit_bh);
  * All of the buffers must be for the same device, and must also be a
  * multiple of the current approved size for the device.
  */
-void ll_rw_block(enum req_op op, blk_opf_t op_flags, int nr,
-		 struct buffer_head *bhs[])
+void ll_rw_block(const blk_opf_t opf, int nr, struct buffer_head *bhs[])
 {
+	const enum req_op op = opf & REQ_OP_MASK;
 	int i;
 
 	for (i = 0; i < nr; i++) {
@@ -3086,14 +3087,14 @@ void ll_rw_block(enum req_op op, blk_opf_t op_flags, int nr,
 			if (test_clear_buffer_dirty(bh)) {
 				bh->b_end_io = end_buffer_write_sync;
 				get_bh(bh);
-				submit_bh(op, op_flags, bh);
+				submit_bh(opf, bh);
 				continue;
 			}
 		} else {
 			if (!buffer_uptodate(bh)) {
 				bh->b_end_io = end_buffer_read_sync;
 				get_bh(bh);
-				submit_bh(op, op_flags, bh);
+				submit_bh(opf, bh);
 				continue;
 			}
 		}
@@ -3111,7 +3112,7 @@ void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
 	}
 	bh->b_end_io = end_buffer_write_sync;
 	get_bh(bh);
-	submit_bh(REQ_OP_WRITE, op_flags, bh);
+	submit_bh(REQ_OP_WRITE | op_flags, bh);
 }
 EXPORT_SYMBOL(write_dirty_buffer);
 
@@ -3138,7 +3139,7 @@ int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
 
 		get_bh(bh);
 		bh->b_end_io = end_buffer_write_sync;
-		ret = submit_bh(REQ_OP_WRITE, op_flags, bh);
+		ret = submit_bh(REQ_OP_WRITE | op_flags, bh);
 		wait_on_buffer(bh);
 		if (!ret && !buffer_uptodate(bh))
 			ret = -EIO;
@@ -3366,7 +3367,7 @@ int bh_submit_read(struct buffer_head *bh)
 
 	get_bh(bh);
 	bh->b_end_io = end_buffer_read_sync;
-	submit_bh(REQ_OP_READ, 0, bh);
+	submit_bh(REQ_OP_READ, bh);
 	wait_on_buffer(bh);
 	if (buffer_uptodate(bh))
 		return 0;
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 795a60ad1897..0df5482c6c1c 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -668,7 +668,7 @@ static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
 	set_buffer_dirty(bh);
 	set_buffer_uptodate(bh);
 	bh->b_end_io = ext4_end_buffer_io_sync;
-	submit_bh(REQ_OP_WRITE, write_flags, bh);
+	submit_bh(REQ_OP_WRITE | write_flags, bh);
 	EXT4_SB(sb)->s_fc_bh = NULL;
 }
 
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index b221f313ded6..9af68a7ecdcf 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -52,7 +52,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
 	lock_buffer(bh);
 	bh->b_end_io = end_buffer_write_sync;
 	get_bh(bh);
-	submit_bh(REQ_OP_WRITE, REQ_SYNC | REQ_META | REQ_PRIO, bh);
+	submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, bh);
 	wait_on_buffer(bh);
 	sb_end_write(sb);
 	if (unlikely(!buffer_uptodate(bh)))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 845f2f8aee5f..24922184b622 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -171,7 +171,7 @@ static inline void __ext4_read_bh(struct buffer_head *bh, int op_flags,
 
 	bh->b_end_io = end_io ? end_io : end_buffer_read_sync;
 	get_bh(bh);
-	submit_bh(REQ_OP_READ, op_flags, bh);
+	submit_bh(REQ_OP_READ | op_flags, bh);
 }
 
 void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags,
@@ -5939,8 +5939,8 @@ static int ext4_commit_super(struct super_block *sb)
 	/* Clear potential dirty bit if it was journalled update */
 	clear_buffer_dirty(sbh);
 	sbh->b_end_io = end_buffer_write_sync;
-	submit_bh(REQ_OP_WRITE,
-		  REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh);
+	submit_bh(REQ_OP_WRITE | REQ_SYNC |
+		  (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh);
 	wait_on_buffer(sbh);
 	if (buffer_write_io_error(sbh)) {
 		ext4_msg(sb, KERN_ERR, "I/O error while writing "
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index b6697333bb2b..3bdb2c668a71 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -310,9 +310,8 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
 		if (trylock_buffer(rabh)) {
 			if (!buffer_uptodate(rabh)) {
 				rabh->b_end_io = end_buffer_read_sync;
-				submit_bh(REQ_OP_READ,
-					  REQ_RAHEAD | REQ_META | REQ_PRIO,
-					  rabh);
+				submit_bh(REQ_OP_READ | REQ_RAHEAD | REQ_META |
+					  REQ_PRIO, rabh);
 				continue;
 			}
 			unlock_buffer(rabh);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 42b7dfffb5e7..a0562dd1bada 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1508,9 +1508,8 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index,
 				continue;
 			}
 			bh->b_end_io = end_buffer_read_sync;
-			submit_bh(REQ_OP_READ,
-				  REQ_RAHEAD | REQ_META | REQ_PRIO,
-				  bh);
+			submit_bh(REQ_OP_READ | REQ_RAHEAD | REQ_META |
+				  REQ_PRIO, bh);
 			continue;
 		}
 		brelse(bh);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 868dcc71b581..3570739f005d 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -75,7 +75,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
 	do {
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
-			submit_bh(REQ_OP_WRITE, write_flags, bh);
+			submit_bh(REQ_OP_WRITE | write_flags, bh);
 			nr_underway++;
 		}
 		bh = next;
@@ -527,7 +527,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
 	if (buffer_uptodate(first_bh))
 		goto out;
 	if (!buffer_locked(first_bh))
-		ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &first_bh);
+		ll_rw_block(REQ_OP_READ | REQ_META | REQ_PRIO, 1, &first_bh);
 
 	dblock++;
 	extlen--;
@@ -536,9 +536,8 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
 		bh = gfs2_getbuf(gl, dblock, CREATE);
 
 		if (!buffer_uptodate(bh) && !buffer_locked(bh))
-			ll_rw_block(REQ_OP_READ,
-				    REQ_RAHEAD | REQ_META | REQ_PRIO,
-				    1, &bh);
+			ll_rw_block(REQ_OP_READ | REQ_RAHEAD | REQ_META |
+				    REQ_PRIO, 1, &bh);
 		brelse(bh);
 		dblock++;
 		extlen--;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 59d727a4ae2c..c98a7faa67d3 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -746,7 +746,7 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index,
 		if (PageUptodate(page))
 			set_buffer_uptodate(bh);
 		if (!buffer_uptodate(bh)) {
-			ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh);
+			ll_rw_block(REQ_OP_READ | REQ_META | REQ_PRIO, 1, &bh);
 			wait_on_buffer(bh);
 			if (!buffer_uptodate(bh))
 				goto unlock_out;
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index 95a19f25d61c..b466172eec25 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -82,7 +82,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
 		return 0;
 	}
 	haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks);
-	ll_rw_block(REQ_OP_READ, 0, haveblocks, bhs);
+	ll_rw_block(REQ_OP_READ, haveblocks, bhs);
 
 	curbh = 0;
 	curpage = 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index eb315e81f1a6..890b5543a1c5 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -155,10 +155,10 @@ static int journal_submit_commit_record(journal_t *journal,
 
 	if (journal->j_flags & JBD2_BARRIER &&
 	    !jbd2_has_feature_async_commit(journal))
-		ret = submit_bh(REQ_OP_WRITE,
-			REQ_SYNC | REQ_PREFLUSH | REQ_FUA, bh);
+		ret = submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH |
+				REQ_FUA, bh);
 	else
-		ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
+		ret = submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
 
 	*cbh = bh;
 	return ret;
@@ -763,7 +763,7 @@ start_journal_io:
 				clear_buffer_dirty(bh);
 				set_buffer_uptodate(bh);
 				bh->b_end_io = journal_end_buffer_io_sync;
-				submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
+				submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
 			}
 			cond_resched();
 
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 9015f5fa2862..07e6aaf7e213 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1638,7 +1638,7 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
 		sb->s_checksum = jbd2_superblock_csum(journal, sb);
 	get_bh(bh);
 	bh->b_end_io = end_buffer_write_sync;
-	ret = submit_bh(REQ_OP_WRITE, write_flags, bh);
+	ret = submit_bh(REQ_OP_WRITE | write_flags, bh);
 	wait_on_buffer(bh);
 	if (buffer_write_io_error(bh)) {
 		clear_buffer_write_io_error(bh);
@@ -1900,7 +1900,7 @@ static int journal_get_superblock(journal_t *journal)
 
 	J_ASSERT(bh != NULL);
 	if (!buffer_uptodate(bh)) {
-		ll_rw_block(REQ_OP_READ, 0, 1, &bh);
+		ll_rw_block(REQ_OP_READ, 1, &bh);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
 			printk(KERN_ERR
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 8ca3527189f8..e699d6ab2c0e 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -100,7 +100,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
 		if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
 			bufs[nbufs++] = bh;
 			if (nbufs == MAXBUF) {
-				ll_rw_block(REQ_OP_READ, 0, nbufs, bufs);
+				ll_rw_block(REQ_OP_READ, nbufs, bufs);
 				journal_brelse_array(bufs, nbufs);
 				nbufs = 0;
 			}
@@ -109,7 +109,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
 	}
 
 	if (nbufs)
-		ll_rw_block(REQ_OP_READ, 0, nbufs, bufs);
+		ll_rw_block(REQ_OP_READ, nbufs, bufs);
 	err = 0;
 
 failed:
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index ca611ac09f7c..5c39efbf733f 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -122,7 +122,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
 	bh->b_blocknr = pblocknr; /* set block address for read */
 	bh->b_end_io = end_buffer_read_sync;
 	get_bh(bh);
-	submit_bh(mode, mode_flags, bh);
+	submit_bh(mode | mode_flags, bh);
 	bh->b_blocknr = blocknr; /* set back to the given block address */
 	*submit_ptr = pblocknr;
 	err = 0;
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 04fdd420eae7..847def8af315 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -92,7 +92,7 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
 	bh->b_blocknr = pbn;
 	bh->b_end_io = end_buffer_read_sync;
 	get_bh(bh);
-	submit_bh(REQ_OP_READ, 0, bh);
+	submit_bh(REQ_OP_READ, bh);
 	if (vbn)
 		bh->b_blocknr = vbn;
  out:
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index d29a0f2b9c16..66e8811c2528 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -148,7 +148,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
 
 	bh->b_end_io = end_buffer_read_sync;
 	get_bh(bh);
-	submit_bh(mode, mode_flags, bh);
+	submit_bh(mode | mode_flags, bh);
 	ret = 0;
 
 	trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, mode);
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 9e3964ea2ea0..b5765fdb3a47 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -342,7 +342,7 @@ handle_zblock:
 		for (i = 0; i < nr; i++) {
 			tbh = arr[i];
 			if (likely(!buffer_uptodate(tbh)))
-				submit_bh(REQ_OP_READ, 0, tbh);
+				submit_bh(REQ_OP_READ, tbh);
 			else
 				ntfs_end_buffer_async_read(tbh, 1);
 		}
@@ -859,7 +859,7 @@ lock_retry_remap:
 	do {
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
-			submit_bh(REQ_OP_WRITE, 0, bh);
+			submit_bh(REQ_OP_WRITE, bh);
 			need_end_writeback = false;
 		}
 		bh = next;
@@ -1187,7 +1187,7 @@ lock_retry_remap:
 		BUG_ON(!buffer_mapped(tbh));
 		get_bh(tbh);
 		tbh->b_end_io = end_buffer_write_sync;
-		submit_bh(REQ_OP_WRITE, 0, tbh);
+		submit_bh(REQ_OP_WRITE, tbh);
 	}
 	/* Synchronize the mft mirror now if not @sync. */
 	if (is_mft && !sync)
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index a60f543e7557..587e9b187873 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -658,7 +658,7 @@ lock_retry_remap:
 		}
 		get_bh(tbh);
 		tbh->b_end_io = end_buffer_read_sync;
-		submit_bh(REQ_OP_READ, 0, tbh);
+		submit_bh(REQ_OP_READ, tbh);
 	}
 
 	/* Wait for io completion on all buffer heads. */
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index a8abe2296514..46ed69b86c33 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -537,7 +537,7 @@ static inline int ntfs_submit_bh_for_read(struct buffer_head *bh)
 	lock_buffer(bh);
 	get_bh(bh);
 	bh->b_end_io = end_buffer_read_sync;
-	return submit_bh(REQ_OP_READ, 0, bh);
+	return submit_bh(REQ_OP_READ, bh);
 }
 
 /**
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index bc1bf217b38e..6ce60ffc6ac0 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -807,7 +807,7 @@ map_vcn:
 			 * completed ignore errors afterwards as we can assume
 			 * that if one buffer worked all of them will work.
 			 */
-			submit_bh(REQ_OP_WRITE, 0, bh);
+			submit_bh(REQ_OP_WRITE, bh);
 			if (should_wait) {
 				should_wait = false;
 				wait_on_buffer(bh);
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 0d62cd5bb7f8..f7bf5ce960cc 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -583,7 +583,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
 			clear_buffer_dirty(tbh);
 			get_bh(tbh);
 			tbh->b_end_io = end_buffer_write_sync;
-			submit_bh(REQ_OP_WRITE, 0, tbh);
+			submit_bh(REQ_OP_WRITE, tbh);
 		}
 		/* Wait on i/o completion of buffers. */
 		for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
@@ -780,7 +780,7 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
 		clear_buffer_dirty(tbh);
 		get_bh(tbh);
 		tbh->b_end_io = end_buffer_write_sync;
-		submit_bh(REQ_OP_WRITE, 0, tbh);
+		submit_bh(REQ_OP_WRITE, tbh);
 	}
 	/* Synchronize the mft mirror now if not @sync. */
 	if (!sync && ni->mft_no < vol->mftmirr_size)
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 8e9d2b35175f..4a21745711fe 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -242,7 +242,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
 				lock_buffer(bh);
 				bh->b_end_io = end_buffer_read_sync;
 				get_bh(bh);
-				submit_bh(REQ_OP_READ, 0, bh);
+				submit_bh(REQ_OP_READ, bh);
 
 				wait_on_buffer(bh);
 				if (!buffer_uptodate(bh)) {
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index be4ebdd8048b..d100a063def2 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -629,7 +629,7 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
 			bh->b_size = block_size;
 			off = vbo & (PAGE_SIZE - 1);
 			set_bh_page(bh, page, off);
-			ll_rw_block(REQ_OP_READ, 0, 1, &bh);
+			ll_rw_block(REQ_OP_READ, 1, &bh);
 			wait_on_buffer(bh);
 			if (!buffer_uptodate(bh)) {
 				err = -EIO;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 35d40a67204c..304ed2be1b83 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -638,7 +638,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
 			   !buffer_new(bh) &&
 			   ocfs2_should_read_blk(inode, page, block_start) &&
 			   (block_start < from || block_end > to)) {
-			ll_rw_block(REQ_OP_READ, 0, 1, &bh);
+			ll_rw_block(REQ_OP_READ, 1, &bh);
 			*wait_bh++=bh;
 		}
 
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index e7758778abef..196638a22b48 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -64,7 +64,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
 
 	get_bh(bh); /* for end_buffer_write_sync() */
 	bh->b_end_io = end_buffer_write_sync;
-	submit_bh(REQ_OP_WRITE, 0, bh);
+	submit_bh(REQ_OP_WRITE, bh);
 
 	wait_on_buffer(bh);
 
@@ -147,7 +147,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
 
 		get_bh(bh); /* for end_buffer_read_sync() */
 		bh->b_end_io = end_buffer_read_sync;
-		submit_bh(REQ_OP_READ, 0, bh);
+		submit_bh(REQ_OP_READ, bh);
 	}
 
 read_failure:
@@ -328,7 +328,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
 			if (validate)
 				set_buffer_needs_validate(bh);
 			bh->b_end_io = end_buffer_read_sync;
-			submit_bh(REQ_OP_READ, 0, bh);
+			submit_bh(REQ_OP_READ, bh);
 			continue;
 		}
 	}
@@ -449,7 +449,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 	get_bh(bh); /* for end_buffer_write_sync() */
 	bh->b_end_io = end_buffer_write_sync;
 	ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check);
-	submit_bh(REQ_OP_WRITE, 0, bh);
+	submit_bh(REQ_OP_WRITE, bh);
 
 	wait_on_buffer(bh);
 
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index f7298816d8d9..e68807196076 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1785,7 +1785,7 @@ static int ocfs2_get_sector(struct super_block *sb,
 	if (!buffer_dirty(*bh))
 		clear_buffer_uptodate(*bh);
 	unlock_buffer(*bh);
-	ll_rw_block(REQ_OP_READ, 0, 1, bh);
+	ll_rw_block(REQ_OP_READ, 1, bh);
 	wait_on_buffer(*bh);
 	if (!buffer_uptodate(*bh)) {
 		mlog_errno(-EIO);
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0cffe054b78e..23f542d1748b 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2664,7 +2664,7 @@ static int reiserfs_write_full_page(struct page *page,
 	do {
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
-			submit_bh(REQ_OP_WRITE, 0, bh);
+			submit_bh(REQ_OP_WRITE, bh);
 			nr++;
 		}
 		put_bh(bh);
@@ -2724,7 +2724,7 @@ fail:
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
 			clear_buffer_dirty(bh);
-			submit_bh(REQ_OP_WRITE, 0, bh);
+			submit_bh(REQ_OP_WRITE, bh);
 			nr++;
 		}
 		put_bh(bh);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index d8cc9a366124..94addfcefede 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -650,7 +650,7 @@ static void submit_logged_buffer(struct buffer_head *bh)
 		BUG();
 	if (!buffer_uptodate(bh))
 		BUG();
-	submit_bh(REQ_OP_WRITE, 0, bh);
+	submit_bh(REQ_OP_WRITE, bh);
 }
 
 static void submit_ordered_buffer(struct buffer_head *bh)
@@ -660,7 +660,7 @@ static void submit_ordered_buffer(struct buffer_head *bh)
 	clear_buffer_dirty(bh);
 	if (!buffer_uptodate(bh))
 		BUG();
-	submit_bh(REQ_OP_WRITE, 0, bh);
+	submit_bh(REQ_OP_WRITE, bh);
 }
 
 #define CHUNK_SIZE 32
@@ -868,7 +868,7 @@ loop_next:
 		 */
 		if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) {
 			spin_unlock(lock);
-			ll_rw_block(REQ_OP_WRITE, 0, 1, &bh);
+			ll_rw_block(REQ_OP_WRITE, 1, &bh);
 			spin_lock(lock);
 		}
 		put_bh(bh);
@@ -1054,7 +1054,7 @@ static int flush_commit_list(struct super_block *s,
 		if (tbh) {
 			if (buffer_dirty(tbh)) {
 		            depth = reiserfs_write_unlock_nested(s);
-			    ll_rw_block(REQ_OP_WRITE, 0, 1, &tbh);
+			    ll_rw_block(REQ_OP_WRITE, 1, &tbh);
 			    reiserfs_write_lock_nested(s, depth);
 			}
 			put_bh(tbh) ;
@@ -2240,7 +2240,7 @@ abort_replay:
 		}
 	}
 	/* read in the log blocks, memcpy to the corresponding real block */
-	ll_rw_block(REQ_OP_READ, 0, get_desc_trans_len(desc), log_blocks);
+	ll_rw_block(REQ_OP_READ, get_desc_trans_len(desc), log_blocks);
 	for (i = 0; i < get_desc_trans_len(desc); i++) {
 
 		wait_on_buffer(log_blocks[i]);
@@ -2342,7 +2342,7 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev,
 		} else
 			bhlist[j++] = bh;
 	}
-	ll_rw_block(REQ_OP_READ, 0, j, bhlist);
+	ll_rw_block(REQ_OP_READ, j, bhlist);
 	for (i = 1; i < j; i++)
 		brelse(bhlist[i]);
 	bh = bhlist[0];
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index ef42729216d1..9a293609a022 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -579,7 +579,7 @@ static int search_by_key_reada(struct super_block *s,
 		if (!buffer_uptodate(bh[j])) {
 			if (depth == -1)
 				depth = reiserfs_write_unlock_nested(s);
-			ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, bh + j);
+			ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, bh + j);
 		}
 		brelse(bh[j]);
 	}
@@ -685,7 +685,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key,
 			if (!buffer_uptodate(bh) && depth == -1)
 				depth = reiserfs_write_unlock_nested(sb);
 
-			ll_rw_block(REQ_OP_READ, 0, 1, &bh);
+			ll_rw_block(REQ_OP_READ, 1, &bh);
 			wait_on_buffer(bh);
 
 			if (depth != -1)
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index cfb7c44c7366..c88cd2ce0665 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1702,7 +1702,7 @@ static int read_super_block(struct super_block *s, int offset)
 /* after journal replay, reread all bitmap and super blocks */
 static int reread_meta_blocks(struct super_block *s)
 {
-	ll_rw_block(REQ_OP_READ, 0, 1, &SB_BUFFER_WITH_SB(s));
+	ll_rw_block(REQ_OP_READ, 1, &SB_BUFFER_WITH_SB(s));
 	wait_on_buffer(SB_BUFFER_WITH_SB(s));
 	if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
 		reiserfs_warning(s, "reiserfs-2504", "error reading the super");
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 42e3e551fa4c..cad3772f9dbe 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -130,7 +130,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
 					brelse(tmp);
 			}
 			if (num) {
-				ll_rw_block(REQ_OP_READ, REQ_RAHEAD, num, bha);
+				ll_rw_block(REQ_OP_READ | REQ_RAHEAD, num, bha);
 				for (i = 0; i < num; i++)
 					brelse(bha[i]);
 			}
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 73720320f0ab..a2adf6293093 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -89,7 +89,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
 					brelse(tmp);
 			}
 			if (num) {
-				ll_rw_block(REQ_OP_READ, REQ_RAHEAD, num, bha);
+				ll_rw_block(REQ_OP_READ | REQ_RAHEAD, num, bha);
 				for (i = 0; i < num; i++)
 					brelse(bha[i]);
 			}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index edc88716751a..8d06daed549f 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1214,7 +1214,7 @@ struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block,
 	if (buffer_uptodate(bh))
 		return bh;
 
-	ll_rw_block(REQ_OP_READ, 0, 1, &bh);
+	ll_rw_block(REQ_OP_READ, 1, &bh);
 
 	wait_on_buffer(bh);
 	if (buffer_uptodate(bh))
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 075d3d9114c8..bd810d8239f2 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -296,7 +296,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
 			if (!buffer_mapped(bh))
 					map_bh(bh, inode->i_sb, oldb + pos);
 			if (!buffer_uptodate(bh)) {
-				ll_rw_block(REQ_OP_READ, 0, 1, &bh);
+				ll_rw_block(REQ_OP_READ, 1, &bh);
 				wait_on_buffer(bh);
 				if (!buffer_uptodate(bh)) {
 					ufs_error(inode->i_sb, __func__,
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 9795df9400bd..bb68eb6407da 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -202,11 +202,11 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
 void free_buffer_head(struct buffer_head * bh);
 void unlock_buffer(struct buffer_head *bh);
 void __lock_buffer(struct buffer_head *bh);
-void ll_rw_block(enum req_op, blk_opf_t, int, struct buffer_head * bh[]);
+void ll_rw_block(blk_opf_t, int, struct buffer_head * bh[]);
 int sync_dirty_buffer(struct buffer_head *bh);
 int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags);
 void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags);
-int submit_bh(enum req_op, blk_opf_t, struct buffer_head *);
+int submit_bh(blk_opf_t, struct buffer_head *);
 void write_boundary_block(struct block_device *bdev,
 			sector_t bblock, unsigned blocksize);
 int bh_uptodate_or_lock(struct buffer_head *bh);
-- 
cgit 


From 7649c873c16a384d447f7dbf9b153e333159f914 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 14 Jul 2022 11:07:18 -0700
Subject: fs/f2fs: Use the enum req_op and blk_opf_t types

Improve static type checking by using the enum req_op type for variables
that represent a request operation and the new blk_opf_t type for
variables that represent request flags.

Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20220714180729.1065367-53-bvanassche@acm.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/f2fs/data.c              | 11 ++++++-----
 fs/f2fs/f2fs.h              |  6 +++---
 fs/f2fs/node.c              |  2 +-
 fs/f2fs/segment.c           |  2 +-
 include/trace/events/f2fs.h | 22 +++++++++++-----------
 5 files changed, 22 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7fcbcf979737..5c13ee321940 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -387,11 +387,11 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
 	return 0;
 }
 
-static unsigned int f2fs_io_flags(struct f2fs_io_info *fio)
+static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio)
 {
 	unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1;
 	unsigned int fua_flag, meta_flag, io_flag;
-	unsigned int op_flags = 0;
+	blk_opf_t op_flags = 0;
 
 	if (fio->op != REQ_OP_WRITE)
 		return 0;
@@ -999,7 +999,7 @@ out:
 }
 
 static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
-				      unsigned nr_pages, unsigned op_flag,
+				      unsigned nr_pages, blk_opf_t op_flag,
 				      pgoff_t first_idx, bool for_write)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -1047,7 +1047,8 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
 
 /* This can handle encryption stuffs */
 static int f2fs_submit_page_read(struct inode *inode, struct page *page,
-				 block_t blkaddr, int op_flags, bool for_write)
+				 block_t blkaddr, blk_opf_t op_flags,
+				 bool for_write)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct bio *bio;
@@ -1181,7 +1182,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index)
 }
 
 struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
-						int op_flags, bool for_write)
+				     blk_opf_t op_flags, bool for_write)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct dnode_of_data dn;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index d9bbecd008d2..868170b72de9 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1183,8 +1183,8 @@ struct f2fs_io_info {
 	nid_t ino;		/* inode number */
 	enum page_type type;	/* contains DATA/NODE/META/META_FLUSH */
 	enum temp_type temp;	/* contains HOT/WARM/COLD */
-	int op;			/* contains REQ_OP_ */
-	int op_flags;		/* req_flag_bits */
+	enum req_op op;		/* contains REQ_OP_ */
+	blk_opf_t op_flags;	/* req_flag_bits */
 	block_t new_blkaddr;	/* new block address to be written */
 	block_t old_blkaddr;	/* old block address before Cow */
 	struct page *page;	/* page to be written */
@@ -3741,7 +3741,7 @@ int f2fs_reserve_new_block(struct dnode_of_data *dn);
 int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index);
 int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index);
 struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
-			int op_flags, bool for_write);
+			blk_opf_t op_flags, bool for_write);
 struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index);
 struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index,
 			bool for_write);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index cf6f7fc83c08..04a145f1dcfc 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1327,7 +1327,7 @@ fail:
  * 0: f2fs_put_page(page, 0)
  * LOCKED_PAGE or error: f2fs_put_page(page, 1)
  */
-static int read_node_page(struct page *page, int op_flags)
+static int read_node_page(struct page *page, blk_opf_t op_flags)
 {
 	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
 	struct node_info ni;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 874c1b9c41a2..c7afc588cf26 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1082,7 +1082,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
 	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
 	struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ?
 					&(dcc->fstrim_list) : &(dcc->wait_list);
-	int flag = dpolicy->sync ? REQ_SYNC : 0;
+	blk_opf_t flag = dpolicy->sync ? REQ_SYNC : 0;
 	block_t lstart, start, len, total_len;
 	int err = 0;
 
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index 513e889ef8aa..f1e922237736 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -66,7 +66,7 @@ TRACE_DEFINE_ENUM(CP_RESIZE);
 
 #define F2FS_OP_FLAGS (REQ_RAHEAD | REQ_SYNC | REQ_META | REQ_PRIO |	\
 			REQ_PREFLUSH | REQ_FUA)
-#define F2FS_BIO_FLAG_MASK(t)	(t & F2FS_OP_FLAGS)
+#define F2FS_BIO_FLAG_MASK(t) (__force u32)((t) & F2FS_OP_FLAGS)
 
 #define show_bio_type(op,op_flags)	show_bio_op(op),		\
 						show_bio_op_flags(op_flags)
@@ -75,12 +75,12 @@ TRACE_DEFINE_ENUM(CP_RESIZE);
 
 #define show_bio_op_flags(flags)					\
 	__print_flags(F2FS_BIO_FLAG_MASK(flags), "|",			\
-		{ REQ_RAHEAD,		"R" },				\
-		{ REQ_SYNC,		"S" },				\
-		{ REQ_META,		"M" },				\
-		{ REQ_PRIO,		"P" },				\
-		{ REQ_PREFLUSH,		"PF" },				\
-		{ REQ_FUA,		"FUA" })
+		{ (__force u32)REQ_RAHEAD,	"R" },			\
+		{ (__force u32)REQ_SYNC,	"S" },			\
+		{ (__force u32)REQ_META,	"M" },			\
+		{ (__force u32)REQ_PRIO,	"P" },			\
+		{ (__force u32)REQ_PREFLUSH,	"PF" },			\
+		{ (__force u32)REQ_FUA,		"FUA" })
 
 #define show_data_type(type)						\
 	__print_symbolic(type,						\
@@ -1036,8 +1036,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio,
 		__field(pgoff_t, index)
 		__field(block_t, old_blkaddr)
 		__field(block_t, new_blkaddr)
-		__field(int, op)
-		__field(int, op_flags)
+		__field(enum req_op, op)
+		__field(blk_opf_t, op_flags)
 		__field(int, temp)
 		__field(int, type)
 	),
@@ -1092,8 +1092,8 @@ DECLARE_EVENT_CLASS(f2fs__bio,
 	TP_STRUCT__entry(
 		__field(dev_t,	dev)
 		__field(dev_t,	target)
-		__field(int,	op)
-		__field(int,	op_flags)
+		__field(enum req_op,	op)
+		__field(blk_opf_t,	op_flags)
 		__field(int,	type)
 		__field(sector_t,	sector)
 		__field(unsigned int,	size)
-- 
cgit 


From 6669797b0dd41ced457760b6e1014fdda8ce19ce Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 14 Jul 2022 11:07:22 -0700
Subject: fs/jbd2: Fix the documentation of the jbd2_write_superblock() callers

Commit 2a222ca992c3 ("fs: have submit_bh users pass in op and flags
separately") renamed the jbd2_write_superblock() 'write_op' argument into
'write_flags'. Propagate this change to the jbd2_write_superblock()
callers. Additionally, change the type of 'write_flags' into blk_opf_t.

Cc: Mike Christie <michael.christie@oracle.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20220714180729.1065367-57-bvanassche@acm.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/jbd2/journal.c           | 15 ++++++++-------
 include/linux/jbd2.h        |  2 +-
 include/trace/events/jbd2.h | 12 ++++++------
 3 files changed, 15 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 07e6aaf7e213..2a1b9da7c3e3 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1604,7 +1604,7 @@ static int journal_reset(journal_t *journal)
  * This function expects that the caller will have locked the journal
  * buffer head, and will return with it unlocked
  */
-static int jbd2_write_superblock(journal_t *journal, int write_flags)
+static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags)
 {
 	struct buffer_head *bh = journal->j_sb_buffer;
 	journal_superblock_t *sb = journal->j_superblock;
@@ -1661,13 +1661,14 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
  * @journal: The journal to update.
  * @tail_tid: TID of the new transaction at the tail of the log
  * @tail_block: The first block of the transaction at the tail of the log
- * @write_op: With which operation should we write the journal sb
+ * @write_flags: Flags for the journal sb write operation
  *
  * Update a journal's superblock information about log tail and write it to
  * disk, waiting for the IO to complete.
  */
 int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
-				     unsigned long tail_block, int write_op)
+				    unsigned long tail_block,
+				    blk_opf_t write_flags)
 {
 	journal_superblock_t *sb = journal->j_superblock;
 	int ret;
@@ -1687,7 +1688,7 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
 	sb->s_sequence = cpu_to_be32(tail_tid);
 	sb->s_start    = cpu_to_be32(tail_block);
 
-	ret = jbd2_write_superblock(journal, write_op);
+	ret = jbd2_write_superblock(journal, write_flags);
 	if (ret)
 		goto out;
 
@@ -1704,12 +1705,12 @@ out:
 /**
  * jbd2_mark_journal_empty() - Mark on disk journal as empty.
  * @journal: The journal to update.
- * @write_op: With which operation should we write the journal sb
+ * @write_flags: Flags for the journal sb write operation
  *
  * Update a journal's dynamic superblock fields to show that journal is empty.
  * Write updated superblock to disk waiting for IO to complete.
  */
-static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
+static void jbd2_mark_journal_empty(journal_t *journal, blk_opf_t write_flags)
 {
 	journal_superblock_t *sb = journal->j_superblock;
 	bool had_fast_commit = false;
@@ -1735,7 +1736,7 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
 		had_fast_commit = true;
 	}
 
-	jbd2_write_superblock(journal, write_op);
+	jbd2_write_superblock(journal, write_flags);
 
 	if (had_fast_commit)
 		jbd2_set_feature_fast_commit(journal);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index e79d6e0b14e8..dc1724131300 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1557,7 +1557,7 @@ extern int	   jbd2_journal_wipe       (journal_t *, int);
 extern int	   jbd2_journal_skip_recovery	(journal_t *);
 extern void	   jbd2_journal_update_sb_errno(journal_t *);
 extern int	   jbd2_journal_update_sb_log_tail	(journal_t *, tid_t,
-				unsigned long, int);
+				unsigned long, blk_opf_t);
 extern void	   jbd2_journal_abort      (journal_t *, int);
 extern int	   jbd2_journal_errno      (journal_t *);
 extern void	   jbd2_journal_ack_err    (journal_t *);
diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h
index a4dfe005983d..99f783c384bb 100644
--- a/include/trace/events/jbd2.h
+++ b/include/trace/events/jbd2.h
@@ -355,22 +355,22 @@ TRACE_EVENT(jbd2_update_log_tail,
 
 TRACE_EVENT(jbd2_write_superblock,
 
-	TP_PROTO(journal_t *journal, int write_op),
+	TP_PROTO(journal_t *journal, blk_opf_t write_flags),
 
-	TP_ARGS(journal, write_op),
+	TP_ARGS(journal, write_flags),
 
 	TP_STRUCT__entry(
 		__field(	dev_t,  dev			)
-		__field(	  int,  write_op		)
+		__field(    blk_opf_t,  write_flags		)
 	),
 
 	TP_fast_assign(
 		__entry->dev		= journal->j_fs_dev->bd_dev;
-		__entry->write_op	= write_op;
+		__entry->write_flags	= write_flags;
 	),
 
-	TP_printk("dev %d,%d write_op %x", MAJOR(__entry->dev),
-		  MINOR(__entry->dev), __entry->write_op)
+	TP_printk("dev %d,%d write_flags %x", MAJOR(__entry->dev),
+		  MINOR(__entry->dev), (__force u32)__entry->write_flags)
 );
 
 TRACE_EVENT(jbd2_lock_buffer_stall,
-- 
cgit 


From ed4512590bd5839f8ea9eef1626b0f4db626b1d1 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 14 Jul 2022 11:07:24 -0700
Subject: fs/nilfs2: Use the enum req_op and blk_opf_t types

Improve static type checking by using the enum req_op type for variables
that represent a request operation and the new blk_opf_t type for
variables that represent request flags. Combine the 'mode' and
'mode_flags' arguments of nilfs_btnode_submit_block into a single
argument 'opf'.

Reviewed-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20220714180729.1065367-59-bvanassche@acm.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/nilfs2/btnode.c            |  8 ++++----
 fs/nilfs2/btnode.h            |  4 ++--
 fs/nilfs2/btree.c             |  6 +++---
 fs/nilfs2/gcinode.c           |  5 ++---
 fs/nilfs2/mdt.c               | 19 ++++++++++---------
 include/trace/events/nilfs2.h |  4 ++--
 6 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 5c39efbf733f..e74fda212620 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -70,7 +70,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
 }
 
 int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
-			      sector_t pblocknr, int mode, int mode_flags,
+			      sector_t pblocknr, blk_opf_t opf,
 			      struct buffer_head **pbh, sector_t *submit_ptr)
 {
 	struct buffer_head *bh;
@@ -103,13 +103,13 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
 		}
 	}
 
-	if (mode_flags & REQ_RAHEAD) {
+	if (opf & REQ_RAHEAD) {
 		if (pblocknr != *submit_ptr + 1 || !trylock_buffer(bh)) {
 			err = -EBUSY; /* internal code */
 			brelse(bh);
 			goto out_locked;
 		}
-	} else { /* mode == READ */
+	} else { /* opf == REQ_OP_READ */
 		lock_buffer(bh);
 	}
 	if (buffer_uptodate(bh)) {
@@ -122,7 +122,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
 	bh->b_blocknr = pblocknr; /* set block address for read */
 	bh->b_end_io = end_buffer_read_sync;
 	get_bh(bh);
-	submit_bh(mode | mode_flags, bh);
+	submit_bh(opf, bh);
 	bh->b_blocknr = blocknr; /* set back to the given block address */
 	*submit_ptr = pblocknr;
 	err = 0;
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index bd5544e63a01..4bc5612dff94 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -34,8 +34,8 @@ void nilfs_init_btnc_inode(struct inode *btnc_inode);
 void nilfs_btnode_cache_clear(struct address_space *);
 struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
 					      __u64 blocknr);
-int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, int,
-			      int, struct buffer_head **, sector_t *);
+int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t,
+			      blk_opf_t, struct buffer_head **, sector_t *);
 void nilfs_btnode_delete(struct buffer_head *);
 int nilfs_btnode_prepare_change_key(struct address_space *,
 				    struct nilfs_btnode_chkey_ctxt *);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index f544c22fff78..9f4d9432d38a 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -477,7 +477,7 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
 	sector_t submit_ptr = 0;
 	int ret;
 
-	ret = nilfs_btnode_submit_block(btnc, ptr, 0, REQ_OP_READ, 0, &bh,
+	ret = nilfs_btnode_submit_block(btnc, ptr, 0, REQ_OP_READ, &bh,
 					&submit_ptr);
 	if (ret) {
 		if (ret != -EEXIST)
@@ -495,8 +495,8 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
 			ptr2 = nilfs_btree_node_get_ptr(ra->node, i, ra->ncmax);
 
 			ret = nilfs_btnode_submit_block(btnc, ptr2, 0,
-							REQ_OP_READ, REQ_RAHEAD,
-							&ra_bh, &submit_ptr);
+						REQ_OP_READ | REQ_RAHEAD,
+						&ra_bh, &submit_ptr);
 			if (likely(!ret || ret == -EEXIST))
 				brelse(ra_bh);
 			else if (ret != -EBUSY)
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 847def8af315..b0d22ff24b67 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -129,9 +129,8 @@ int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
 	struct inode *btnc_inode = NILFS_I(inode)->i_assoc_inode;
 	int ret;
 
-	ret = nilfs_btnode_submit_block(btnc_inode->i_mapping,
-					vbn ? : pbn, pbn, REQ_OP_READ, 0,
-					out_bh, &pbn);
+	ret = nilfs_btnode_submit_block(btnc_inode->i_mapping, vbn ? : pbn, pbn,
+					REQ_OP_READ, out_bh, &pbn);
 	if (ret == -EEXIST) /* internal code (cache hit) */
 		ret = 0;
 	return ret;
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 66e8811c2528..cbf4fa60eea2 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -111,8 +111,8 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
 }
 
 static int
-nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
-		       int mode, int mode_flags, struct buffer_head **out_bh)
+nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, blk_opf_t opf,
+		       struct buffer_head **out_bh)
 {
 	struct buffer_head *bh;
 	__u64 blknum = 0;
@@ -126,12 +126,12 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
 	if (buffer_uptodate(bh))
 		goto out;
 
-	if (mode_flags & REQ_RAHEAD) {
+	if (opf & REQ_RAHEAD) {
 		if (!trylock_buffer(bh)) {
 			ret = -EBUSY;
 			goto failed_bh;
 		}
-	} else /* mode == READ */
+	} else /* opf == REQ_OP_READ */
 		lock_buffer(bh);
 
 	if (buffer_uptodate(bh)) {
@@ -148,10 +148,11 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
 
 	bh->b_end_io = end_buffer_read_sync;
 	get_bh(bh);
-	submit_bh(mode | mode_flags, bh);
+	submit_bh(opf, bh);
 	ret = 0;
 
-	trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, mode);
+	trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff,
+				      opf & REQ_OP_MASK);
  out:
 	get_bh(bh);
 	*out_bh = bh;
@@ -172,7 +173,7 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
 	int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS;
 	int err;
 
-	err = nilfs_mdt_submit_block(inode, block, REQ_OP_READ, 0, &first_bh);
+	err = nilfs_mdt_submit_block(inode, block, REQ_OP_READ, &first_bh);
 	if (err == -EEXIST) /* internal code */
 		goto out;
 
@@ -182,8 +183,8 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
 	if (readahead) {
 		blkoff = block + 1;
 		for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
-			err = nilfs_mdt_submit_block(inode, blkoff, REQ_OP_READ,
-						     REQ_RAHEAD, &bh);
+			err = nilfs_mdt_submit_block(inode, blkoff,
+						REQ_OP_READ | REQ_RAHEAD, &bh);
 			if (likely(!err || err == -EEXIST))
 				brelse(bh);
 			else if (err != -EBUSY)
diff --git a/include/trace/events/nilfs2.h b/include/trace/events/nilfs2.h
index 84ee31fc04cc..8efc6236f57c 100644
--- a/include/trace/events/nilfs2.h
+++ b/include/trace/events/nilfs2.h
@@ -192,7 +192,7 @@ TRACE_EVENT(nilfs2_mdt_submit_block,
 	    TP_PROTO(struct inode *inode,
 		     unsigned long ino,
 		     unsigned long blkoff,
-		     int mode),
+		     enum req_op mode),
 
 	    TP_ARGS(inode, ino, blkoff, mode),
 
@@ -200,7 +200,7 @@ TRACE_EVENT(nilfs2_mdt_submit_block,
 		    __field(struct inode *, inode)
 		    __field(unsigned long, ino)
 		    __field(unsigned long, blkoff)
-		    __field(int, mode)
+		    __field(enum req_op, mode)
 	    ),
 
 	    TP_fast_assign(
-- 
cgit 


From 2a04b8d846dca196342862caecadd2a78460d8dc Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 12 Jul 2022 18:58:20 -0400
Subject: tracing: devlink: Use static array for string in devlink_trap_report
 event

The trace event devlink_trap_report uses the __dynamic_array() macro to
determine the size of the input_dev_name field. This is because it needs
to test the dev field for NULL, and will use "NULL" if it is. But it also
has the size of the dynamic array as a fixed IFNAMSIZ bytes. This defeats
the purpose of the dynamic array, as this will reserve that amount of
bytes on the ring buffer, and to make matters worse, it will even save
that size in the event as the event expects it to be dynamic (for which it
is not).

Since IFNAMSIZ is just 16 bytes, just make it a static array and this will
remove the meta data from the event that records the size.

Link: https://lkml.kernel.org/r/20220712185820.002d9fb5@gandalf.local.home

Cc: Leon Romanovsky <leon@kernel.org>
Cc: Jiri Pirko <jiri@nvidia.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: netdev@vger.kernel.org
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Tested-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/trace/events/devlink.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/devlink.h b/include/trace/events/devlink.h
index 2814f188d98c..24969184c534 100644
--- a/include/trace/events/devlink.h
+++ b/include/trace/events/devlink.h
@@ -186,7 +186,7 @@ TRACE_EVENT(devlink_trap_report,
 		__string(driver_name, devlink_to_dev(devlink)->driver->name)
 		__string(trap_name, metadata->trap_name)
 		__string(trap_group_name, metadata->trap_group_name)
-		__dynamic_array(char, input_dev_name, IFNAMSIZ)
+		__array(char, input_dev_name, IFNAMSIZ)
 	),
 
 	TP_fast_assign(
@@ -197,15 +197,14 @@ TRACE_EVENT(devlink_trap_report,
 		__assign_str(driver_name, devlink_to_dev(devlink)->driver->name);
 		__assign_str(trap_name, metadata->trap_name);
 		__assign_str(trap_group_name, metadata->trap_group_name);
-		__assign_str(input_dev_name,
-			     (input_dev ? input_dev->name : "NULL"));
+		strscpy(__entry->input_dev_name, input_dev ? input_dev->name : "NULL", IFNAMSIZ);
 	),
 
 	TP_printk("bus_name=%s dev_name=%s driver_name=%s trap_name=%s "
 		  "trap_group_name=%s input_dev_name=%s", __get_str(bus_name),
 		  __get_str(dev_name), __get_str(driver_name),
 		  __get_str(trap_name), __get_str(trap_group_name),
-		  __get_str(input_dev_name))
+		  __entry->input_dev_name)
 );
 
 #endif /* _TRACE_DEVLINK_H */
-- 
cgit 


From b644c95598adfe2b968e97daee3a890dd2fda84d Mon Sep 17 00:00:00 2001
From: PaddyKP_Yao <PaddyKP_Yao@asus.com>
Date: Mon, 11 Jul 2022 19:51:25 +0800
Subject: platform/x86: asus-wmi: Add mic-mute LED classdev support

In some new ASUS devices, hotkey Fn+F13 is used for mic mute. If mic-mute
LED is present by checking WMI ASUS_WMI_DEVID_MICMUTE_LED, we will add a
mic-mute LED classdev, asus::micmute, in the asus-wmi driver to control
it. The binding of mic-mute LED controls will be swithched with LED
trigger.

Signed-off-by: PaddyKP_Yao <PaddyKP_Yao@asus.com>
Link: https://lore.kernel.org/r/20220711115125.2072508-1-PaddyKP_Yao@asus.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/platform/x86/Kconfig               |  2 ++
 drivers/platform/x86/asus-wmi.c            | 25 +++++++++++++++++++++++++
 include/linux/platform_data/x86/asus-wmi.h |  1 +
 3 files changed, 28 insertions(+)

(limited to 'include')

diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 7fa88efeef4d..6a33c862452b 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -273,6 +273,8 @@ config ASUS_WMI
 	select INPUT_SPARSEKMAP
 	select LEDS_CLASS
 	select NEW_LEDS
+	select LEDS_TRIGGERS
+	select LEDS_TRIGGER_AUDIO
 	select ACPI_PLATFORM_PROFILE
 	help
 	  Say Y here if you have a WMI aware Asus laptop (like Eee PCs or new
diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index 62ce198a3463..89b604e04d7f 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -208,6 +208,7 @@ struct asus_wmi {
 	int kbd_led_wk;
 	struct led_classdev lightbar_led;
 	int lightbar_led_wk;
+	struct led_classdev micmute_led;
 	struct workqueue_struct *led_workqueue;
 	struct work_struct tpd_led_work;
 	struct work_struct wlan_led_work;
@@ -1028,12 +1029,23 @@ static enum led_brightness lightbar_led_get(struct led_classdev *led_cdev)
 	return result & ASUS_WMI_DSTS_LIGHTBAR_MASK;
 }
 
+static int micmute_led_set(struct led_classdev *led_cdev,
+			   enum led_brightness brightness)
+{
+	int state = brightness != LED_OFF;
+	int err;
+
+	err = asus_wmi_set_devstate(ASUS_WMI_DEVID_MICMUTE_LED, state, NULL);
+	return err < 0 ? err : 0;
+}
+
 static void asus_wmi_led_exit(struct asus_wmi *asus)
 {
 	led_classdev_unregister(&asus->kbd_led);
 	led_classdev_unregister(&asus->tpd_led);
 	led_classdev_unregister(&asus->wlan_led);
 	led_classdev_unregister(&asus->lightbar_led);
+	led_classdev_unregister(&asus->micmute_led);
 
 	if (asus->led_workqueue)
 		destroy_workqueue(asus->led_workqueue);
@@ -1105,6 +1117,19 @@ static int asus_wmi_led_init(struct asus_wmi *asus)
 					   &asus->lightbar_led);
 	}
 
+	if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_MICMUTE_LED)) {
+		asus->micmute_led.name = "asus::micmute";
+		asus->micmute_led.max_brightness = 1;
+		asus->micmute_led.brightness = ledtrig_audio_get(LED_AUDIO_MICMUTE);
+		asus->micmute_led.brightness_set_blocking = micmute_led_set;
+		asus->micmute_led.default_trigger = "audio-micmute";
+
+		rv = led_classdev_register(&asus->platform_device->dev,
+						&asus->micmute_led);
+		if (rv)
+			goto error;
+	}
+
 error:
 	if (rv)
 		asus_wmi_led_exit(asus);
diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index a571b47ff362..98f2b2f20f3e 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -49,6 +49,7 @@
 #define ASUS_WMI_DEVID_LED4		0x00020014
 #define ASUS_WMI_DEVID_LED5		0x00020015
 #define ASUS_WMI_DEVID_LED6		0x00020016
+#define ASUS_WMI_DEVID_MICMUTE_LED		0x00040017
 
 /* Backlight and Brightness */
 #define ASUS_WMI_DEVID_ALS_ENABLE	0x00050001 /* Ambient Light Sensor */
-- 
cgit 


From cd89edda4002b7fb3c0a6765c3a60a60d5b1dc16 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhuacai@loongson.cn>
Date: Thu, 14 Jul 2022 20:42:12 +0800
Subject: PCI: loongson: Add ACPI init support

Loongson PCH (LS7A chipset) will be used by both MIPS-based and LoongArch-
based Loongson processors. MIPS-based Loongson uses FDT, while LoongArch-
based Loongson uses ACPI.

Add ACPI init support for the driver in pci-loongson.c because it is
currently FDT-only.

LoongArch is a new RISC ISA, mainline support will come soon, and
documentations are here (in translation):

  https://github.com/loongson/LoongArch-Documentation

Link: https://lore.kernel.org/r/20220714124216.1489304-4-chenhuacai@loongson.cn
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/acpi/pci_mcfg.c               | 10 ++++
 drivers/pci/controller/Kconfig        |  2 +-
 drivers/pci/controller/pci-loongson.c | 94 ++++++++++++++++++++++++++++-------
 include/linux/pci-ecam.h              |  1 +
 4 files changed, 87 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c
index 63b98eae5e75..860014b89b8e 100644
--- a/drivers/acpi/pci_mcfg.c
+++ b/drivers/acpi/pci_mcfg.c
@@ -172,6 +172,16 @@ static struct mcfg_fixup mcfg_quirks[] = {
 	ALTRA_ECAM_QUIRK(1, 14),
 	ALTRA_ECAM_QUIRK(1, 15),
 #endif /* ARM64 */
+
+#ifdef CONFIG_LOONGARCH
+#define LOONGSON_ECAM_MCFG(table_id, seg) \
+	{ "LOONGS", table_id, 1, seg, MCFG_BUS_ANY, &loongson_pci_ecam_ops }
+
+	LOONGSON_ECAM_MCFG("\0", 0),
+	LOONGSON_ECAM_MCFG("LOONGSON", 0),
+	LOONGSON_ECAM_MCFG("\0", 1),
+	LOONGSON_ECAM_MCFG("LOONGSON", 1),
+#endif /* LOONGARCH */
 };
 
 static char mcfg_oem_id[ACPI_OEM_ID_SIZE];
diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig
index b8d96d38064d..9dbd73898b47 100644
--- a/drivers/pci/controller/Kconfig
+++ b/drivers/pci/controller/Kconfig
@@ -293,7 +293,7 @@ config PCI_HYPERV_INTERFACE
 config PCI_LOONGSON
 	bool "LOONGSON PCI Controller"
 	depends on MACH_LOONGSON64 || COMPILE_TEST
-	depends on OF
+	depends on OF || ACPI
 	depends on PCI_QUIRKS
 	default MACH_LOONGSON64
 	help
diff --git a/drivers/pci/controller/pci-loongson.c b/drivers/pci/controller/pci-loongson.c
index 565453882ffe..cd29800974e7 100644
--- a/drivers/pci/controller/pci-loongson.c
+++ b/drivers/pci/controller/pci-loongson.c
@@ -9,6 +9,8 @@
 #include <linux/of_pci.h>
 #include <linux/pci.h>
 #include <linux/pci_ids.h>
+#include <linux/pci-acpi.h>
+#include <linux/pci-ecam.h>
 
 #include "../pci.h"
 
@@ -97,39 +99,53 @@ static void loongson_mrrs_quirk(struct pci_dev *dev)
 }
 DECLARE_PCI_FIXUP_ENABLE(PCI_ANY_ID, PCI_ANY_ID, loongson_mrrs_quirk);
 
-static void __iomem *cfg1_map(struct loongson_pci *priv, int bus,
-				unsigned int devfn, int where)
+static struct loongson_pci *pci_bus_to_loongson_pci(struct pci_bus *bus)
 {
-	unsigned long addroff = 0x0;
+	struct pci_config_window *cfg;
 
-	if (bus != 0)
-		addroff |= BIT(28); /* Type 1 Access */
-	addroff |= (where & 0xff) | ((where & 0xf00) << 16);
-	addroff |= (bus << 16) | (devfn << 8);
-	return priv->cfg1_base + addroff;
+	if (acpi_disabled)
+		return (struct loongson_pci *)(bus->sysdata);
+
+	cfg = bus->sysdata;
+	return (struct loongson_pci *)(cfg->priv);
 }
 
-static void __iomem *cfg0_map(struct loongson_pci *priv, int bus,
-				unsigned int devfn, int where)
+static void __iomem *cfg0_map(struct loongson_pci *priv, struct pci_bus *bus,
+			      unsigned int devfn, int where)
 {
 	unsigned long addroff = 0x0;
+	unsigned char busnum = bus->number;
 
-	if (bus != 0)
+	if (!pci_is_root_bus(bus)) {
 		addroff |= BIT(24); /* Type 1 Access */
-	addroff |= (bus << 16) | (devfn << 8) | where;
+		addroff |= (busnum << 16);
+	}
+	addroff |= (devfn << 8) | where;
 	return priv->cfg0_base + addroff;
 }
 
-static void __iomem *pci_loongson_map_bus(struct pci_bus *bus, unsigned int devfn,
-			       int where)
+static void __iomem *cfg1_map(struct loongson_pci *priv, struct pci_bus *bus,
+			      unsigned int devfn, int where)
 {
+	unsigned long addroff = 0x0;
 	unsigned char busnum = bus->number;
-	struct pci_host_bridge *bridge = pci_find_host_bridge(bus);
-	struct loongson_pci *priv =  pci_host_bridge_priv(bridge);
+
+	if (!pci_is_root_bus(bus)) {
+		addroff |= BIT(28); /* Type 1 Access */
+		addroff |= (busnum << 16);
+	}
+	addroff |= (devfn << 8) | (where & 0xff) | ((where & 0xf00) << 16);
+	return priv->cfg1_base + addroff;
+}
+
+static void __iomem *pci_loongson_map_bus(struct pci_bus *bus,
+					  unsigned int devfn, int where)
+{
+	struct loongson_pci *priv = pci_bus_to_loongson_pci(bus);
 
 	/*
 	 * Do not read more than one device on the bus other than
-	 * the host bus. For our hardware the root bus is always bus 0.
+	 * the host bus.
 	 */
 	if (priv->data->flags & FLAG_DEV_FIX &&
 			!pci_is_root_bus(bus) && PCI_SLOT(devfn) > 0)
@@ -137,15 +153,17 @@ static void __iomem *pci_loongson_map_bus(struct pci_bus *bus, unsigned int devf
 
 	/* CFG0 can only access standard space */
 	if (where < PCI_CFG_SPACE_SIZE && priv->cfg0_base)
-		return cfg0_map(priv, busnum, devfn, where);
+		return cfg0_map(priv, bus, devfn, where);
 
 	/* CFG1 can access extended space */
 	if (where < PCI_CFG_SPACE_EXP_SIZE && priv->cfg1_base)
-		return cfg1_map(priv, busnum, devfn, where);
+		return cfg1_map(priv, bus, devfn, where);
 
 	return NULL;
 }
 
+#ifdef CONFIG_OF
+
 static int loongson_map_irq(const struct pci_dev *dev, u8 slot, u8 pin)
 {
 	int irq;
@@ -259,3 +277,41 @@ static struct platform_driver loongson_pci_driver = {
 	.probe = loongson_pci_probe,
 };
 builtin_platform_driver(loongson_pci_driver);
+
+#endif
+
+#ifdef CONFIG_ACPI
+
+static int loongson_pci_ecam_init(struct pci_config_window *cfg)
+{
+	struct device *dev = cfg->parent;
+	struct loongson_pci *priv;
+	struct loongson_pci_data *data;
+
+	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	cfg->priv = priv;
+	data->flags = FLAG_CFG1;
+	priv->data = data;
+	priv->cfg1_base = cfg->win - (cfg->busr.start << 16);
+
+	return 0;
+}
+
+const struct pci_ecam_ops loongson_pci_ecam_ops = {
+	.bus_shift = 16,
+	.init	   = loongson_pci_ecam_init,
+	.pci_ops   = {
+		.map_bus = pci_loongson_map_bus,
+		.read	 = pci_generic_config_read,
+		.write	 = pci_generic_config_write,
+	}
+};
+
+#endif
diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h
index adea5a4771cf..6b1301e2498e 100644
--- a/include/linux/pci-ecam.h
+++ b/include/linux/pci-ecam.h
@@ -87,6 +87,7 @@ extern const struct pci_ecam_ops xgene_v1_pcie_ecam_ops; /* APM X-Gene PCIe v1 *
 extern const struct pci_ecam_ops xgene_v2_pcie_ecam_ops; /* APM X-Gene PCIe v2.x */
 extern const struct pci_ecam_ops al_pcie_ops;	/* Amazon Annapurna Labs PCIe */
 extern const struct pci_ecam_ops tegra194_pcie_ops; /* Tegra194 PCIe */
+extern const struct pci_ecam_ops loongson_pci_ecam_ops; /* Loongson PCIe */
 #endif
 
 #if IS_ENABLED(CONFIG_PCI_HOST_COMMON)
-- 
cgit 


From ca2e1a627035002cd33d9667431e80bad90c25fa Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Date: Thu, 7 Jul 2022 15:08:42 +0200
Subject: xsk: Mark napi_id on sendmsg()

When application runs in busy poll mode and does not receive a single
packet but only sends them, it is currently impossible to get into
napi_busy_loop() as napi_id is only marked on Rx side in xsk_rcv_check().
In there, napi_id is being taken from xdp_rxq_info carried by xdp_buff.
From Tx perspective, we do not have access to it. What we have handy is
the xsk pool.

Xsk pool works on a pool of internal xdp_buff wrappers called xdp_buff_xsk.
AF_XDP ZC enabled drivers call xp_set_rxq_info() so each of xdp_buff_xsk
has a valid pointer to xdp_rxq_info of underlying queue. Therefore, on Tx
side, napi_id can be pulled from xs->pool->heads[0].xdp.rxq->napi_id. Hide
this pointer chase under helper function, xsk_pool_get_napi_id().

Do this only for sockets working in ZC mode as otherwise rxq pointers would
not be initialized.

Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
Link: https://lore.kernel.org/bpf/20220707130842.49408-1-maciej.fijalkowski@intel.com
---
 include/net/xdp_sock_drv.h | 14 ++++++++++++++
 net/xdp/xsk.c              |  5 ++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index 4aa031849668..4277b0dcee05 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -44,6 +44,15 @@ static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool,
 	xp_set_rxq_info(pool, rxq);
 }
 
+static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	return pool->heads[0].xdp.rxq->napi_id;
+#else
+	return 0;
+#endif
+}
+
 static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool,
 				      unsigned long attrs)
 {
@@ -198,6 +207,11 @@ static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool,
 {
 }
 
+static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool)
+{
+	return 0;
+}
+
 static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool,
 				      unsigned long attrs)
 {
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 09002387987e..5b4ce6ba1bc7 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -639,8 +639,11 @@ static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len
 	if (unlikely(need_wait))
 		return -EOPNOTSUPP;
 
-	if (sk_can_busy_loop(sk))
+	if (sk_can_busy_loop(sk)) {
+		if (xs->zc)
+			__sk_mark_napi_id_once(sk, xsk_pool_get_napi_id(xs->pool));
 		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
+	}
 
 	if (xs->zc && xsk_no_wakeup(sk))
 		return 0;
-- 
cgit 


From e2863a78593d638d3924a6f67900c4820034f349 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Fri, 1 Jul 2022 05:54:24 -0700
Subject: lib/bitmap: change return types to bool where appropriate

Some bitmap functions return boolean results in int variables. Fix it
by changing return types to bool.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/bitmap.h       | 8 ++++----
 lib/bitmap.c                 | 4 ++--
 tools/include/linux/bitmap.h | 8 ++++----
 tools/lib/bitmap.c           | 2 +-
 4 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index c91638e507f2..e1a438bdda52 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -148,13 +148,13 @@ void __bitmap_shift_left(unsigned long *dst, const unsigned long *src,
 			 unsigned int shift, unsigned int nbits);
 void bitmap_cut(unsigned long *dst, const unsigned long *src,
 		unsigned int first, unsigned int cut, unsigned int nbits);
-int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
+bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
 		 const unsigned long *bitmap2, unsigned int nbits);
 void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
 		 const unsigned long *bitmap2, unsigned int nbits);
 void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
 		  const unsigned long *bitmap2, unsigned int nbits);
-int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
+bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
 		    const unsigned long *bitmap2, unsigned int nbits);
 void __bitmap_replace(unsigned long *dst,
 		      const unsigned long *old, const unsigned long *new,
@@ -315,7 +315,7 @@ void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits);
 	bitmap_copy_clear_tail((unsigned long *)(buf), (const unsigned long *)(bitmap), (nbits))
 #endif
 
-static inline int bitmap_and(unsigned long *dst, const unsigned long *src1,
+static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1,
 			const unsigned long *src2, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
@@ -341,7 +341,7 @@ static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1,
 		__bitmap_xor(dst, src1, src2, nbits);
 }
 
-static inline int bitmap_andnot(unsigned long *dst, const unsigned long *src1,
+static inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1,
 			const unsigned long *src2, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
diff --git a/lib/bitmap.c b/lib/bitmap.c
index e903e13c62e1..9bc80f5bf149 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -237,7 +237,7 @@ void bitmap_cut(unsigned long *dst, const unsigned long *src,
 }
 EXPORT_SYMBOL(bitmap_cut);
 
-int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
+bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
 				const unsigned long *bitmap2, unsigned int bits)
 {
 	unsigned int k;
@@ -275,7 +275,7 @@ void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
 }
 EXPORT_SYMBOL(__bitmap_xor);
 
-int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
+bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
 				const unsigned long *bitmap2, unsigned int bits)
 {
 	unsigned int k;
diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h
index afdf93bebaaf..2ae7ab8ed7d1 100644
--- a/tools/include/linux/bitmap.h
+++ b/tools/include/linux/bitmap.h
@@ -14,7 +14,7 @@
 int __bitmap_weight(const unsigned long *bitmap, int bits);
 void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
 		 const unsigned long *bitmap2, int bits);
-int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
+bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
 		 const unsigned long *bitmap2, unsigned int bits);
 bool __bitmap_equal(const unsigned long *bitmap1,
 		    const unsigned long *bitmap2, unsigned int bits);
@@ -45,7 +45,7 @@ static inline void bitmap_fill(unsigned long *dst, unsigned int nbits)
 	dst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits);
 }
 
-static inline int bitmap_empty(const unsigned long *src, unsigned nbits)
+static inline bool bitmap_empty(const unsigned long *src, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		return ! (*src & BITMAP_LAST_WORD_MASK(nbits));
@@ -53,7 +53,7 @@ static inline int bitmap_empty(const unsigned long *src, unsigned nbits)
 	return find_first_bit(src, nbits) == nbits;
 }
 
-static inline int bitmap_full(const unsigned long *src, unsigned int nbits)
+static inline bool bitmap_full(const unsigned long *src, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits));
@@ -146,7 +146,7 @@ size_t bitmap_scnprintf(unsigned long *bitmap, unsigned int nbits,
  * @src2: operand 2
  * @nbits: size of bitmap
  */
-static inline int bitmap_and(unsigned long *dst, const unsigned long *src1,
+static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1,
 			     const unsigned long *src2, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
diff --git a/tools/lib/bitmap.c b/tools/lib/bitmap.c
index 354f8cdc0880..2e351d63fdba 100644
--- a/tools/lib/bitmap.c
+++ b/tools/lib/bitmap.c
@@ -57,7 +57,7 @@ size_t bitmap_scnprintf(unsigned long *bitmap, unsigned int nbits,
 	return ret;
 }
 
-int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
+bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
 		 const unsigned long *bitmap2, unsigned int bits)
 {
 	unsigned int k;
-- 
cgit 


From 3a06ed80265fa62eecaf519d92f1633e4f9510c7 Mon Sep 17 00:00:00 2001
From: Michael Wu <michael@allwinnertech.com>
Date: Fri, 8 Jul 2022 17:57:14 +0800
Subject: extcon: Add EXTCON_DISP_CVBS and EXTCON_DISP_EDP

Add EXTCON_DISP_CVBS for Composite Video Broadcast Signal.
Add EXTCON_DISP_EDP for Embedded Display Port

[1] https://en.wikipedia.org/wiki/Composite_video
[2] https://en.wikipedia.org/wiki/DisplayPort#eDP

Signed-off-by: Michael Wu <michael@allwinnertech.com>
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
---
 drivers/extcon/extcon.c | 10 ++++++++++
 include/linux/extcon.h  |  2 ++
 2 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/drivers/extcon/extcon.c b/drivers/extcon/extcon.c
index 0e40418ad287..e1c71359b605 100644
--- a/drivers/extcon/extcon.c
+++ b/drivers/extcon/extcon.c
@@ -167,6 +167,16 @@ static const struct __extcon_info {
 		.id = EXTCON_DISP_HMD,
 		.name = "HMD",
 	},
+	[EXTCON_DISP_CVBS] = {
+		.type = EXTCON_TYPE_DISP,
+		.id = EXTCON_DISP_CVBS,
+		.name = "CVBS",
+	},
+	[EXTCON_DISP_EDP] = {
+		.type = EXTCON_TYPE_DISP,
+		.id = EXTCON_DISP_EDP,
+		.name = "EDP",
+	},
 
 	/* Miscellaneous external connector */
 	[EXTCON_DOCK] = {
diff --git a/include/linux/extcon.h b/include/linux/extcon.h
index 685401d94d39..3c45c3846fe9 100644
--- a/include/linux/extcon.h
+++ b/include/linux/extcon.h
@@ -76,6 +76,8 @@
 #define EXTCON_DISP_VGA		43	/* Video Graphics Array */
 #define EXTCON_DISP_DP		44	/* Display Port */
 #define EXTCON_DISP_HMD		45	/* Head-Mounted Display */
+#define EXTCON_DISP_CVBS	46	/* Composite Video Broadcast Signal */
+#define EXTCON_DISP_EDP		47	/* Embedded Display Port */
 
 /* Miscellaneous external connector */
 #define EXTCON_DOCK		60
-- 
cgit 


From 9a7923668bc779dd601d94704b4dd895a2ef6a77 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Wed, 13 Jul 2022 16:18:51 +0200
Subject: net: devlink: make devlink_dpipe_headers_register() return void

The return value is not used, so change the return value type to void.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c | 6 ++----
 include/net/devlink.h                                | 2 +-
 net/core/devlink.c                                   | 5 ++---
 3 files changed, 5 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c
index 5d494fabf93d..c2540292702d 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c
@@ -1266,10 +1266,8 @@ int mlxsw_sp_dpipe_init(struct mlxsw_sp *mlxsw_sp)
 	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
 	int err;
 
-	err = devlink_dpipe_headers_register(devlink,
-					     &mlxsw_sp_dpipe_headers);
-	if (err)
-		return err;
+	devlink_dpipe_headers_register(devlink, &mlxsw_sp_dpipe_headers);
+
 	err = mlxsw_sp_dpipe_erif_table_init(mlxsw_sp);
 	if (err)
 		goto err_erif_table_init;
diff --git a/include/net/devlink.h b/include/net/devlink.h
index b1b5c19a8316..88c701b375a2 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1590,7 +1590,7 @@ int devlink_dpipe_table_register(struct devlink *devlink,
 				 void *priv, bool counter_control_extern);
 void devlink_dpipe_table_unregister(struct devlink *devlink,
 				    const char *table_name);
-int devlink_dpipe_headers_register(struct devlink *devlink,
+void devlink_dpipe_headers_register(struct devlink *devlink,
 				   struct devlink_dpipe_headers *dpipe_headers);
 void devlink_dpipe_headers_unregister(struct devlink *devlink);
 bool devlink_dpipe_table_counter_enabled(struct devlink *devlink,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 2b2e454ebd78..c261bba9ab76 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -10425,13 +10425,12 @@ EXPORT_SYMBOL_GPL(devlink_sb_unregister);
  *
  *	Register the headers supported by hardware.
  */
-int devlink_dpipe_headers_register(struct devlink *devlink,
-				   struct devlink_dpipe_headers *dpipe_headers)
+void devlink_dpipe_headers_register(struct devlink *devlink,
+				    struct devlink_dpipe_headers *dpipe_headers)
 {
 	devl_lock(devlink);
 	devlink->dpipe_headers = dpipe_headers;
 	devl_unlock(devlink);
-	return 0;
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register);
 
-- 
cgit 


From 309c56e84602d894e7ca424b0b13837117568fca Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 8 Jul 2022 10:06:13 +0200
Subject: iommu: remove the unused dev_has_feat method

This method is never actually called.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20220708080616.238833-2-hch@lst.de
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 1 -
 include/linux/iommu.h                       | 4 +---
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index d9c1623ec1a9..1b6c17dd81ee 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2853,7 +2853,6 @@ static struct iommu_ops arm_smmu_ops = {
 	.of_xlate		= arm_smmu_of_xlate,
 	.get_resv_regions	= arm_smmu_get_resv_regions,
 	.put_resv_regions	= generic_iommu_put_resv_regions,
-	.dev_has_feat		= arm_smmu_dev_has_feature,
 	.dev_feat_enabled	= arm_smmu_dev_feature_enabled,
 	.dev_enable_feat	= arm_smmu_dev_enable_feature,
 	.dev_disable_feat	= arm_smmu_dev_disable_feature,
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index e6abd998dbe7..a3acdb46b939 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -164,8 +164,7 @@ struct iommu_iort_rmr_data {
  *			 supported, this feature must be enabled before and
  *			 disabled after %IOMMU_DEV_FEAT_SVA.
  *
- * Device drivers query whether a feature is supported using
- * iommu_dev_has_feature(), and enable it using iommu_dev_enable_feature().
+ * Device drivers enable a feature using iommu_dev_enable_feature().
  */
 enum iommu_dev_features {
 	IOMMU_DEV_FEAT_SVA,
@@ -248,7 +247,6 @@ struct iommu_ops {
 	bool (*is_attach_deferred)(struct device *dev);
 
 	/* Per device IOMMU features */
-	bool (*dev_has_feat)(struct device *dev, enum iommu_dev_features f);
 	bool (*dev_feat_enabled)(struct device *dev, enum iommu_dev_features f);
 	int (*dev_enable_feat)(struct device *dev, enum iommu_dev_features f);
 	int (*dev_disable_feat)(struct device *dev, enum iommu_dev_features f);
-- 
cgit 


From a871765d5588531e1972902d1ae077b8be306c94 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 8 Jul 2022 10:06:14 +0200
Subject: iommu: remove iommu_dev_feature_enabled

Remove the unused iommu_dev_feature_enabled function.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20220708080616.238833-3-hch@lst.de
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c |  1 -
 drivers/iommu/iommu.c                       | 13 -------------
 include/linux/iommu.h                       |  9 ---------
 3 files changed, 23 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 1b6c17dd81ee..4d30a8d2bc23 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2853,7 +2853,6 @@ static struct iommu_ops arm_smmu_ops = {
 	.of_xlate		= arm_smmu_of_xlate,
 	.get_resv_regions	= arm_smmu_get_resv_regions,
 	.put_resv_regions	= generic_iommu_put_resv_regions,
-	.dev_feat_enabled	= arm_smmu_dev_feature_enabled,
 	.dev_enable_feat	= arm_smmu_dev_enable_feature,
 	.dev_disable_feat	= arm_smmu_dev_disable_feature,
 	.sva_bind		= arm_smmu_sva_bind,
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 0aa141646bdf..1bb016a6a2aa 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2760,19 +2760,6 @@ int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat)
 }
 EXPORT_SYMBOL_GPL(iommu_dev_disable_feature);
 
-bool iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features feat)
-{
-	if (dev->iommu && dev->iommu->iommu_dev) {
-		const struct iommu_ops *ops = dev->iommu->iommu_dev->ops;
-
-		if (ops->dev_feat_enabled)
-			return ops->dev_feat_enabled(dev, feat);
-	}
-
-	return false;
-}
-EXPORT_SYMBOL_GPL(iommu_dev_feature_enabled);
-
 /**
  * iommu_sva_bind_device() - Bind a process address space to a device
  * @dev: the device
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index a3acdb46b939..0bc2eb14b026 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -215,7 +215,6 @@ struct iommu_iotlb_gather {
  *                      driver init to device driver init (default no)
  * @dev_has/enable/disable_feat: per device entries to check/enable/disable
  *                               iommu specific features.
- * @dev_feat_enabled: check enabled feature
  * @sva_bind: Bind process address space to device
  * @sva_unbind: Unbind process address space from device
  * @sva_get_pasid: Get PASID associated to a SVA handle
@@ -247,7 +246,6 @@ struct iommu_ops {
 	bool (*is_attach_deferred)(struct device *dev);
 
 	/* Per device IOMMU features */
-	bool (*dev_feat_enabled)(struct device *dev, enum iommu_dev_features f);
 	int (*dev_enable_feat)(struct device *dev, enum iommu_dev_features f);
 	int (*dev_disable_feat)(struct device *dev, enum iommu_dev_features f);
 
@@ -670,7 +668,6 @@ void iommu_release_device(struct device *dev);
 
 int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features f);
 int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features f);
-bool iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features f);
 
 struct iommu_sva *iommu_sva_bind_device(struct device *dev,
 					struct mm_struct *mm,
@@ -997,12 +994,6 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode)
 	return NULL;
 }
 
-static inline bool
-iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features feat)
-{
-	return false;
-}
-
 static inline int
 iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat)
 {
-- 
cgit 


From ae3ff39a51a0f5843960487962e110339f321b0f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 8 Jul 2022 10:06:15 +0200
Subject: iommu: remove the put_resv_regions method

All drivers that implement get_resv_regions just use
generic_put_resv_regions to implement the put side.  Remove the
indirections and document the allocations constraints.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20220708080616.238833-4-hch@lst.de
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/amd/iommu.c                   |  1 -
 drivers/iommu/apple-dart.c                  |  1 -
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c |  1 -
 drivers/iommu/arm/arm-smmu/arm-smmu.c       |  1 -
 drivers/iommu/intel/iommu.c                 |  1 -
 drivers/iommu/iommu.c                       | 21 ++++-----------------
 drivers/iommu/mtk_iommu.c                   |  1 -
 drivers/iommu/virtio-iommu.c                |  5 ++---
 include/linux/iommu.h                       |  4 ----
 9 files changed, 6 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 840831d5d2ad..e66e071e8c3b 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -2280,7 +2280,6 @@ const struct iommu_ops amd_iommu_ops = {
 	.probe_finalize = amd_iommu_probe_finalize,
 	.device_group = amd_iommu_device_group,
 	.get_resv_regions = amd_iommu_get_resv_regions,
-	.put_resv_regions = generic_iommu_put_resv_regions,
 	.is_attach_deferred = amd_iommu_is_attach_deferred,
 	.pgsize_bitmap	= AMD_IOMMU_PGSIZES,
 	.def_domain_type = amd_iommu_def_domain_type,
diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c
index e87d3cf54ed6..1b1725759262 100644
--- a/drivers/iommu/apple-dart.c
+++ b/drivers/iommu/apple-dart.c
@@ -768,7 +768,6 @@ static const struct iommu_ops apple_dart_iommu_ops = {
 	.of_xlate = apple_dart_of_xlate,
 	.def_domain_type = apple_dart_def_domain_type,
 	.get_resv_regions = apple_dart_get_resv_regions,
-	.put_resv_regions = generic_iommu_put_resv_regions,
 	.pgsize_bitmap = -1UL, /* Restricted during dart probe */
 	.owner = THIS_MODULE,
 	.default_domain_ops = &(const struct iommu_domain_ops) {
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 4d30a8d2bc23..4a5e435567f1 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2852,7 +2852,6 @@ static struct iommu_ops arm_smmu_ops = {
 	.device_group		= arm_smmu_device_group,
 	.of_xlate		= arm_smmu_of_xlate,
 	.get_resv_regions	= arm_smmu_get_resv_regions,
-	.put_resv_regions	= generic_iommu_put_resv_regions,
 	.dev_enable_feat	= arm_smmu_dev_enable_feature,
 	.dev_disable_feat	= arm_smmu_dev_disable_feature,
 	.sva_bind		= arm_smmu_sva_bind,
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 588929bed1bc..2d4129a4ccfc 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -1584,7 +1584,6 @@ static struct iommu_ops arm_smmu_ops = {
 	.device_group		= arm_smmu_device_group,
 	.of_xlate		= arm_smmu_of_xlate,
 	.get_resv_regions	= arm_smmu_get_resv_regions,
-	.put_resv_regions	= generic_iommu_put_resv_regions,
 	.def_domain_type	= arm_smmu_def_domain_type,
 	.pgsize_bitmap		= -1UL, /* Restricted during device attach */
 	.owner			= THIS_MODULE,
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 44016594831d..49d616aa2148 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4911,7 +4911,6 @@ const struct iommu_ops intel_iommu_ops = {
 	.probe_finalize		= intel_iommu_probe_finalize,
 	.release_device		= intel_iommu_release_device,
 	.get_resv_regions	= intel_iommu_get_resv_regions,
-	.put_resv_regions	= generic_iommu_put_resv_regions,
 	.device_group		= intel_iommu_device_group,
 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 1bb016a6a2aa..f53f8b2d27a5 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2579,27 +2579,14 @@ void iommu_get_resv_regions(struct device *dev, struct list_head *list)
 		ops->get_resv_regions(dev, list);
 }
 
-void iommu_put_resv_regions(struct device *dev, struct list_head *list)
-{
-	const struct iommu_ops *ops = dev_iommu_ops(dev);
-
-	if (ops->put_resv_regions)
-		ops->put_resv_regions(dev, list);
-}
-
 /**
- * generic_iommu_put_resv_regions - Reserved region driver helper
+ * iommu_put_resv_regions - release resered regions
  * @dev: device for which to free reserved regions
  * @list: reserved region list for device
  *
- * IOMMU drivers can use this to implement their .put_resv_regions() callback
- * for simple reservations. If a per region callback is provided that will be
- * used to free all memory allocations associated with the reserved region or
- * else just free up the memory for the regions. If an IOMMU driver allocates
- * additional resources per region, it is going to have to implement a custom
- * callback.
+ * This releases a reserved region list acquired by iommu_get_resv_regions().
  */
-void generic_iommu_put_resv_regions(struct device *dev, struct list_head *list)
+void iommu_put_resv_regions(struct device *dev, struct list_head *list)
 {
 	struct iommu_resv_region *entry, *next;
 
@@ -2610,7 +2597,7 @@ void generic_iommu_put_resv_regions(struct device *dev, struct list_head *list)
 			kfree(entry);
 	}
 }
-EXPORT_SYMBOL(generic_iommu_put_resv_regions);
+EXPORT_SYMBOL(iommu_put_resv_regions);
 
 struct iommu_resv_region *iommu_alloc_resv_region(phys_addr_t start,
 						  size_t length, int prot,
diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
index 5c3d9366c25c..95fd21c7207a 100644
--- a/drivers/iommu/mtk_iommu.c
+++ b/drivers/iommu/mtk_iommu.c
@@ -928,7 +928,6 @@ static const struct iommu_ops mtk_iommu_ops = {
 	.device_group	= mtk_iommu_device_group,
 	.of_xlate	= mtk_iommu_of_xlate,
 	.get_resv_regions = mtk_iommu_get_resv_regions,
-	.put_resv_regions = generic_iommu_put_resv_regions,
 	.pgsize_bitmap	= SZ_4K | SZ_64K | SZ_1M | SZ_16M,
 	.owner		= THIS_MODULE,
 	.default_domain_ops = &(const struct iommu_domain_ops) {
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index 55337796a5f8..feeb5fde72a3 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -964,7 +964,7 @@ static struct iommu_device *viommu_probe_device(struct device *dev)
 	return &viommu->iommu;
 
 err_free_dev:
-	generic_iommu_put_resv_regions(dev, &vdev->resv_regions);
+	iommu_put_resv_regions(dev, &vdev->resv_regions);
 	kfree(vdev);
 
 	return ERR_PTR(ret);
@@ -983,7 +983,7 @@ static void viommu_release_device(struct device *dev)
 {
 	struct viommu_endpoint *vdev = dev_iommu_priv_get(dev);
 
-	generic_iommu_put_resv_regions(dev, &vdev->resv_regions);
+	iommu_put_resv_regions(dev, &vdev->resv_regions);
 	kfree(vdev);
 }
 
@@ -1007,7 +1007,6 @@ static struct iommu_ops viommu_ops = {
 	.release_device		= viommu_release_device,
 	.device_group		= viommu_device_group,
 	.get_resv_regions	= viommu_get_resv_regions,
-	.put_resv_regions	= generic_iommu_put_resv_regions,
 	.of_xlate		= viommu_of_xlate,
 	.owner			= THIS_MODULE,
 	.default_domain_ops = &(const struct iommu_domain_ops) {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 0bc2eb14b026..ea30f00dc145 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -209,7 +209,6 @@ struct iommu_iotlb_gather {
  *                  group and attached to the groups domain
  * @device_group: find iommu group for a particular device
  * @get_resv_regions: Request list of reserved regions for a device
- * @put_resv_regions: Free list of reserved regions for a device
  * @of_xlate: add OF master IDs to iommu grouping
  * @is_attach_deferred: Check if domain attach should be deferred from iommu
  *                      driver init to device driver init (default no)
@@ -240,7 +239,6 @@ struct iommu_ops {
 
 	/* Request/Free a list of reserved regions for a device */
 	void (*get_resv_regions)(struct device *dev, struct list_head *list);
-	void (*put_resv_regions)(struct device *dev, struct list_head *list);
 
 	int (*of_xlate)(struct device *dev, struct of_phandle_args *args);
 	bool (*is_attach_deferred)(struct device *dev);
@@ -454,8 +452,6 @@ extern void iommu_set_fault_handler(struct iommu_domain *domain,
 
 extern void iommu_get_resv_regions(struct device *dev, struct list_head *list);
 extern void iommu_put_resv_regions(struct device *dev, struct list_head *list);
-extern void generic_iommu_put_resv_regions(struct device *dev,
-					   struct list_head *list);
 extern void iommu_set_default_passthrough(bool cmd_line);
 extern void iommu_set_default_translated(bool cmd_line);
 extern bool iommu_default_passthrough(void);
-- 
cgit 


From 933ab6d30153f937ffa9471ce64207e6d9acf56e Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Tue, 12 Jul 2022 08:08:44 +0800
Subject: iommu/vt-d: Move trace/events/intel_iommu.h under iommu

This header file is private to the Intel IOMMU driver. Move it to the
driver folder.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Steve Wahl <steve.wahl@hpe.com>
Link: https://lore.kernel.org/r/20220514014322.2927339-2-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/dmar.c         |  2 +-
 drivers/iommu/intel/svm.c          |  2 +-
 drivers/iommu/intel/trace.c        |  2 +-
 drivers/iommu/intel/trace.h        | 98 ++++++++++++++++++++++++++++++++++++++
 include/trace/events/intel_iommu.h | 94 ------------------------------------
 5 files changed, 101 insertions(+), 97 deletions(-)
 create mode 100644 drivers/iommu/intel/trace.h
 delete mode 100644 include/trace/events/intel_iommu.h

(limited to 'include')

diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index 9699ca101c62..f91b45be1d92 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -30,10 +30,10 @@
 #include <linux/numa.h>
 #include <linux/limits.h>
 #include <asm/irq_remapping.h>
-#include <trace/events/intel_iommu.h>
 
 #include "../irq_remapping.h"
 #include "perf.h"
+#include "trace.h"
 
 typedef int (*dmar_res_handler_t)(struct acpi_dmar_header *, void *);
 struct dmar_res_callback {
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 7ee37d996e15..70b40d007a52 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -21,11 +21,11 @@
 #include <linux/ioasid.h>
 #include <asm/page.h>
 #include <asm/fpu/api.h>
-#include <trace/events/intel_iommu.h>
 
 #include "pasid.h"
 #include "perf.h"
 #include "../iommu-sva-lib.h"
+#include "trace.h"
 
 static irqreturn_t prq_event_thread(int irq, void *d);
 static void intel_svm_drain_prq(struct device *dev, u32 pasid);
diff --git a/drivers/iommu/intel/trace.c b/drivers/iommu/intel/trace.c
index bfb6a6e37a88..117e626e3ea9 100644
--- a/drivers/iommu/intel/trace.c
+++ b/drivers/iommu/intel/trace.c
@@ -11,4 +11,4 @@
 #include <linux/types.h>
 
 #define CREATE_TRACE_POINTS
-#include <trace/events/intel_iommu.h>
+#include "trace.h"
diff --git a/drivers/iommu/intel/trace.h b/drivers/iommu/intel/trace.h
new file mode 100644
index 000000000000..25cb7f88e1a2
--- /dev/null
+++ b/drivers/iommu/intel/trace.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Intel IOMMU trace support
+ *
+ * Copyright (C) 2019 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM intel_iommu
+
+#if !defined(_TRACE_INTEL_IOMMU_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_INTEL_IOMMU_H
+
+#include <linux/tracepoint.h>
+#include <linux/intel-iommu.h>
+
+#define MSG_MAX		256
+
+TRACE_EVENT(qi_submit,
+	TP_PROTO(struct intel_iommu *iommu, u64 qw0, u64 qw1, u64 qw2, u64 qw3),
+
+	TP_ARGS(iommu, qw0, qw1, qw2, qw3),
+
+	TP_STRUCT__entry(
+		__field(u64, qw0)
+		__field(u64, qw1)
+		__field(u64, qw2)
+		__field(u64, qw3)
+		__string(iommu, iommu->name)
+	),
+
+	TP_fast_assign(
+		__assign_str(iommu, iommu->name);
+		__entry->qw0 = qw0;
+		__entry->qw1 = qw1;
+		__entry->qw2 = qw2;
+		__entry->qw3 = qw3;
+	),
+
+	TP_printk("%s %s: 0x%llx 0x%llx 0x%llx 0x%llx",
+		  __print_symbolic(__entry->qw0 & 0xf,
+				   { QI_CC_TYPE,	"cc_inv" },
+				   { QI_IOTLB_TYPE,	"iotlb_inv" },
+				   { QI_DIOTLB_TYPE,	"dev_tlb_inv" },
+				   { QI_IEC_TYPE,	"iec_inv" },
+				   { QI_IWD_TYPE,	"inv_wait" },
+				   { QI_EIOTLB_TYPE,	"p_iotlb_inv" },
+				   { QI_PC_TYPE,	"pc_inv" },
+				   { QI_DEIOTLB_TYPE,	"p_dev_tlb_inv" },
+				   { QI_PGRP_RESP_TYPE,	"page_grp_resp" }),
+		__get_str(iommu),
+		__entry->qw0, __entry->qw1, __entry->qw2, __entry->qw3
+	)
+);
+
+TRACE_EVENT(prq_report,
+	TP_PROTO(struct intel_iommu *iommu, struct device *dev,
+		 u64 dw0, u64 dw1, u64 dw2, u64 dw3,
+		 unsigned long seq),
+
+	TP_ARGS(iommu, dev, dw0, dw1, dw2, dw3, seq),
+
+	TP_STRUCT__entry(
+		__field(u64, dw0)
+		__field(u64, dw1)
+		__field(u64, dw2)
+		__field(u64, dw3)
+		__field(unsigned long, seq)
+		__string(iommu, iommu->name)
+		__string(dev, dev_name(dev))
+		__dynamic_array(char, buff, MSG_MAX)
+	),
+
+	TP_fast_assign(
+		__entry->dw0 = dw0;
+		__entry->dw1 = dw1;
+		__entry->dw2 = dw2;
+		__entry->dw3 = dw3;
+		__entry->seq = seq;
+		__assign_str(iommu, iommu->name);
+		__assign_str(dev, dev_name(dev));
+	),
+
+	TP_printk("%s/%s seq# %ld: %s",
+		__get_str(iommu), __get_str(dev), __entry->seq,
+		decode_prq_descriptor(__get_str(buff), MSG_MAX, __entry->dw0,
+				      __entry->dw1, __entry->dw2, __entry->dw3)
+	)
+);
+#endif /* _TRACE_INTEL_IOMMU_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH ../../drivers/iommu/intel/
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/include/trace/events/intel_iommu.h b/include/trace/events/intel_iommu.h
deleted file mode 100644
index e5c1ca6d16ee..000000000000
--- a/include/trace/events/intel_iommu.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Intel IOMMU trace support
- *
- * Copyright (C) 2019 Intel Corporation
- *
- * Author: Lu Baolu <baolu.lu@linux.intel.com>
- */
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM intel_iommu
-
-#if !defined(_TRACE_INTEL_IOMMU_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_INTEL_IOMMU_H
-
-#include <linux/tracepoint.h>
-#include <linux/intel-iommu.h>
-
-#define MSG_MAX		256
-
-TRACE_EVENT(qi_submit,
-	TP_PROTO(struct intel_iommu *iommu, u64 qw0, u64 qw1, u64 qw2, u64 qw3),
-
-	TP_ARGS(iommu, qw0, qw1, qw2, qw3),
-
-	TP_STRUCT__entry(
-		__field(u64, qw0)
-		__field(u64, qw1)
-		__field(u64, qw2)
-		__field(u64, qw3)
-		__string(iommu, iommu->name)
-	),
-
-	TP_fast_assign(
-		__assign_str(iommu, iommu->name);
-		__entry->qw0 = qw0;
-		__entry->qw1 = qw1;
-		__entry->qw2 = qw2;
-		__entry->qw3 = qw3;
-	),
-
-	TP_printk("%s %s: 0x%llx 0x%llx 0x%llx 0x%llx",
-		  __print_symbolic(__entry->qw0 & 0xf,
-				   { QI_CC_TYPE,	"cc_inv" },
-				   { QI_IOTLB_TYPE,	"iotlb_inv" },
-				   { QI_DIOTLB_TYPE,	"dev_tlb_inv" },
-				   { QI_IEC_TYPE,	"iec_inv" },
-				   { QI_IWD_TYPE,	"inv_wait" },
-				   { QI_EIOTLB_TYPE,	"p_iotlb_inv" },
-				   { QI_PC_TYPE,	"pc_inv" },
-				   { QI_DEIOTLB_TYPE,	"p_dev_tlb_inv" },
-				   { QI_PGRP_RESP_TYPE,	"page_grp_resp" }),
-		__get_str(iommu),
-		__entry->qw0, __entry->qw1, __entry->qw2, __entry->qw3
-	)
-);
-
-TRACE_EVENT(prq_report,
-	TP_PROTO(struct intel_iommu *iommu, struct device *dev,
-		 u64 dw0, u64 dw1, u64 dw2, u64 dw3,
-		 unsigned long seq),
-
-	TP_ARGS(iommu, dev, dw0, dw1, dw2, dw3, seq),
-
-	TP_STRUCT__entry(
-		__field(u64, dw0)
-		__field(u64, dw1)
-		__field(u64, dw2)
-		__field(u64, dw3)
-		__field(unsigned long, seq)
-		__string(iommu, iommu->name)
-		__string(dev, dev_name(dev))
-		__dynamic_array(char, buff, MSG_MAX)
-	),
-
-	TP_fast_assign(
-		__entry->dw0 = dw0;
-		__entry->dw1 = dw1;
-		__entry->dw2 = dw2;
-		__entry->dw3 = dw3;
-		__entry->seq = seq;
-		__assign_str(iommu, iommu->name);
-		__assign_str(dev, dev_name(dev));
-	),
-
-	TP_printk("%s/%s seq# %ld: %s",
-		__get_str(iommu), __get_str(dev), __entry->seq,
-		decode_prq_descriptor(__get_str(buff), MSG_MAX, __entry->dw0,
-				      __entry->dw1, __entry->dw2, __entry->dw3)
-	)
-);
-#endif /* _TRACE_INTEL_IOMMU_H */
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
-- 
cgit 


From f9903555dd05a4096d8fef4eb823c0b94f710982 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Tue, 12 Jul 2022 08:08:46 +0800
Subject: iommu/vt-d: Remove unnecessary exported symbol

The exported symbol intel_iommu_gfx_mapped is not used anywhere in the
tree. Remove it to avoid dead code.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Steve Wahl <steve.wahl@hpe.com>
Link: https://lore.kernel.org/r/20220514014322.2927339-4-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/iommu.c | 6 ------
 include/linux/intel-iommu.h | 1 -
 2 files changed, 7 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 5c0dce78586a..6564af7ec0dd 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -314,9 +314,6 @@ static int iommu_skip_te_disable;
 #define IDENTMAP_GFX		2
 #define IDENTMAP_AZALIA		4
 
-int intel_iommu_gfx_mapped;
-EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
-
 DEFINE_SPINLOCK(device_domain_lock);
 static LIST_HEAD(device_domain_list);
 
@@ -4093,9 +4090,6 @@ int __init intel_iommu_init(void)
 	if (list_empty(&dmar_satc_units))
 		pr_info("No SATC found\n");
 
-	if (dmar_map_gfx)
-		intel_iommu_gfx_mapped = 1;
-
 	init_no_remapping_devices();
 
 	ret = init_dmars();
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 5fcf89faa31a..4ebf3c4da45e 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -787,7 +787,6 @@ extern int iommu_calculate_agaw(struct intel_iommu *iommu);
 extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu);
 extern int dmar_disabled;
 extern int intel_iommu_enabled;
-extern int intel_iommu_gfx_mapped;
 #else
 static inline int iommu_calculate_agaw(struct intel_iommu *iommu)
 {
-- 
cgit 


From 853788b9a66ff089053df3b4ed7d166e61def449 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Tue, 12 Jul 2022 08:08:49 +0800
Subject: x86/boot/tboot: Move tboot_force_iommu() to Intel IOMMU

tboot_force_iommu() is only called by the Intel IOMMU driver. Move the
helper into that driver. No functional change intended.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Steve Wahl <steve.wahl@hpe.com>
Link: https://lore.kernel.org/r/20220514014322.2927339-7-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 arch/x86/kernel/tboot.c     | 15 ---------------
 drivers/iommu/intel/iommu.c | 14 ++++++++++++++
 include/linux/tboot.h       |  2 --
 3 files changed, 14 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 0c1154a1c403..3bacd935f840 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -6,7 +6,6 @@
  * Copyright (c) 2006-2009, Intel Corporation
  */
 
-#include <linux/intel-iommu.h>
 #include <linux/init_task.h>
 #include <linux/spinlock.h>
 #include <linux/export.h>
@@ -516,17 +515,3 @@ struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tb
 
 	return dmar_tbl;
 }
-
-int tboot_force_iommu(void)
-{
-	if (!tboot_enabled())
-		return 0;
-
-	if (no_iommu || dmar_disabled)
-		pr_warn("Forcing Intel-IOMMU to enabled\n");
-
-	dmar_disabled = 0;
-	no_iommu = 0;
-
-	return 1;
-}
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 6564af7ec0dd..c70948021e8e 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4019,6 +4019,20 @@ static int __init probe_acpi_namespace_devices(void)
 	return 0;
 }
 
+static __init int tboot_force_iommu(void)
+{
+	if (!tboot_enabled())
+		return 0;
+
+	if (no_iommu || dmar_disabled)
+		pr_warn("Forcing Intel-IOMMU to enabled\n");
+
+	dmar_disabled = 0;
+	no_iommu = 0;
+
+	return 1;
+}
+
 int __init intel_iommu_init(void)
 {
 	int ret = -ENODEV;
diff --git a/include/linux/tboot.h b/include/linux/tboot.h
index 5146d2574e85..d2279160ef39 100644
--- a/include/linux/tboot.h
+++ b/include/linux/tboot.h
@@ -126,7 +126,6 @@ extern void tboot_probe(void);
 extern void tboot_shutdown(u32 shutdown_type);
 extern struct acpi_table_header *tboot_get_dmar_table(
 				      struct acpi_table_header *dmar_tbl);
-extern int tboot_force_iommu(void);
 
 #else
 
@@ -136,7 +135,6 @@ extern int tboot_force_iommu(void);
 #define tboot_sleep(sleep_state, pm1a_control, pm1b_control)	\
 					do { } while (0)
 #define tboot_get_dmar_table(dmar_tbl)	(dmar_tbl)
-#define tboot_force_iommu()		0
 
 #endif /* !CONFIG_INTEL_TXT */
 
-- 
cgit 


From 2585a2790e7fdb3dadfe309c9220bbc03704f84f Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Tue, 12 Jul 2022 08:08:50 +0800
Subject: iommu/vt-d: Move include/linux/intel-iommu.h under iommu

This header file is private to the Intel IOMMU driver. Move it to the
driver folder.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Steve Wahl <steve.wahl@hpe.com>
Link: https://lore.kernel.org/r/20220514014322.2927339-8-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 MAINTAINERS                         |   1 -
 drivers/iommu/intel/cap_audit.c     |   2 +-
 drivers/iommu/intel/debugfs.c       |   2 +-
 drivers/iommu/intel/dmar.c          |   2 +-
 drivers/iommu/intel/iommu.c         |   2 +-
 drivers/iommu/intel/iommu.h         | 831 ++++++++++++++++++++++++++++++++++++
 drivers/iommu/intel/irq_remapping.c |   2 +-
 drivers/iommu/intel/pasid.c         |   2 +-
 drivers/iommu/intel/perf.c          |   2 +-
 drivers/iommu/intel/svm.c           |   2 +-
 drivers/iommu/intel/trace.h         |   3 +-
 include/linux/intel-iommu.h         | 831 ------------------------------------
 12 files changed, 841 insertions(+), 841 deletions(-)
 create mode 100644 drivers/iommu/intel/iommu.h
 delete mode 100644 include/linux/intel-iommu.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index f679152bdbad..8f9ed151ad4c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10077,7 +10077,6 @@ L:	iommu@lists.linux.dev
 S:	Supported
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git
 F:	drivers/iommu/intel/
-F:	include/linux/intel-iommu.h
 F:	include/linux/intel-svm.h
 
 INTEL IOP-ADMA DMA DRIVER
diff --git a/drivers/iommu/intel/cap_audit.c b/drivers/iommu/intel/cap_audit.c
index 71596fc62822..3ee68393122f 100644
--- a/drivers/iommu/intel/cap_audit.c
+++ b/drivers/iommu/intel/cap_audit.c
@@ -10,7 +10,7 @@
 
 #define pr_fmt(fmt)	"DMAR: " fmt
 
-#include <linux/intel-iommu.h>
+#include "iommu.h"
 #include "cap_audit.h"
 
 static u64 intel_iommu_cap_sanity;
diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c
index ed796eea4581..d927ef10641b 100644
--- a/drivers/iommu/intel/debugfs.c
+++ b/drivers/iommu/intel/debugfs.c
@@ -10,11 +10,11 @@
 
 #include <linux/debugfs.h>
 #include <linux/dmar.h>
-#include <linux/intel-iommu.h>
 #include <linux/pci.h>
 
 #include <asm/irq_remapping.h>
 
+#include "iommu.h"
 #include "pasid.h"
 #include "perf.h"
 
diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index f91b45be1d92..2a5e0f91e647 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -19,7 +19,6 @@
 #include <linux/pci.h>
 #include <linux/dmar.h>
 #include <linux/iova.h>
-#include <linux/intel-iommu.h>
 #include <linux/timer.h>
 #include <linux/irq.h>
 #include <linux/interrupt.h>
@@ -31,6 +30,7 @@
 #include <linux/limits.h>
 #include <asm/irq_remapping.h>
 
+#include "iommu.h"
 #include "../irq_remapping.h"
 #include "perf.h"
 #include "trace.h"
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index c70948021e8e..10bda4bec8fe 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -17,7 +17,6 @@
 #include <linux/dma-direct.h>
 #include <linux/dma-iommu.h>
 #include <linux/dmi.h>
-#include <linux/intel-iommu.h>
 #include <linux/intel-svm.h>
 #include <linux/memory.h>
 #include <linux/pci.h>
@@ -26,6 +25,7 @@
 #include <linux/syscore_ops.h>
 #include <linux/tboot.h>
 
+#include "iommu.h"
 #include "../irq_remapping.h"
 #include "../iommu-sva-lib.h"
 #include "pasid.h"
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
new file mode 100644
index 000000000000..4ebf3c4da45e
--- /dev/null
+++ b/drivers/iommu/intel/iommu.h
@@ -0,0 +1,831 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright © 2006-2015, Intel Corporation.
+ *
+ * Authors: Ashok Raj <ashok.raj@intel.com>
+ *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ *          David Woodhouse <David.Woodhouse@intel.com>
+ */
+
+#ifndef _INTEL_IOMMU_H_
+#define _INTEL_IOMMU_H_
+
+#include <linux/types.h>
+#include <linux/iova.h>
+#include <linux/io.h>
+#include <linux/idr.h>
+#include <linux/mmu_notifier.h>
+#include <linux/list.h>
+#include <linux/iommu.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/dmar.h>
+#include <linux/ioasid.h>
+#include <linux/bitfield.h>
+
+#include <asm/cacheflush.h>
+#include <asm/iommu.h>
+
+/*
+ * VT-d hardware uses 4KiB page size regardless of host page size.
+ */
+#define VTD_PAGE_SHIFT		(12)
+#define VTD_PAGE_SIZE		(1UL << VTD_PAGE_SHIFT)
+#define VTD_PAGE_MASK		(((u64)-1) << VTD_PAGE_SHIFT)
+#define VTD_PAGE_ALIGN(addr)	(((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK)
+
+#define VTD_STRIDE_SHIFT        (9)
+#define VTD_STRIDE_MASK         (((u64)-1) << VTD_STRIDE_SHIFT)
+
+#define DMA_PTE_READ		BIT_ULL(0)
+#define DMA_PTE_WRITE		BIT_ULL(1)
+#define DMA_PTE_LARGE_PAGE	BIT_ULL(7)
+#define DMA_PTE_SNP		BIT_ULL(11)
+
+#define DMA_FL_PTE_PRESENT	BIT_ULL(0)
+#define DMA_FL_PTE_US		BIT_ULL(2)
+#define DMA_FL_PTE_ACCESS	BIT_ULL(5)
+#define DMA_FL_PTE_DIRTY	BIT_ULL(6)
+#define DMA_FL_PTE_XD		BIT_ULL(63)
+
+#define ADDR_WIDTH_5LEVEL	(57)
+#define ADDR_WIDTH_4LEVEL	(48)
+
+#define CONTEXT_TT_MULTI_LEVEL	0
+#define CONTEXT_TT_DEV_IOTLB	1
+#define CONTEXT_TT_PASS_THROUGH 2
+#define CONTEXT_PASIDE		BIT_ULL(3)
+
+/*
+ * Intel IOMMU register specification per version 1.0 public spec.
+ */
+#define	DMAR_VER_REG	0x0	/* Arch version supported by this IOMMU */
+#define	DMAR_CAP_REG	0x8	/* Hardware supported capabilities */
+#define	DMAR_ECAP_REG	0x10	/* Extended capabilities supported */
+#define	DMAR_GCMD_REG	0x18	/* Global command register */
+#define	DMAR_GSTS_REG	0x1c	/* Global status register */
+#define	DMAR_RTADDR_REG	0x20	/* Root entry table */
+#define	DMAR_CCMD_REG	0x28	/* Context command reg */
+#define	DMAR_FSTS_REG	0x34	/* Fault Status register */
+#define	DMAR_FECTL_REG	0x38	/* Fault control register */
+#define	DMAR_FEDATA_REG	0x3c	/* Fault event interrupt data register */
+#define	DMAR_FEADDR_REG	0x40	/* Fault event interrupt addr register */
+#define	DMAR_FEUADDR_REG 0x44	/* Upper address register */
+#define	DMAR_AFLOG_REG	0x58	/* Advanced Fault control */
+#define	DMAR_PMEN_REG	0x64	/* Enable Protected Memory Region */
+#define	DMAR_PLMBASE_REG 0x68	/* PMRR Low addr */
+#define	DMAR_PLMLIMIT_REG 0x6c	/* PMRR low limit */
+#define	DMAR_PHMBASE_REG 0x70	/* pmrr high base addr */
+#define	DMAR_PHMLIMIT_REG 0x78	/* pmrr high limit */
+#define DMAR_IQH_REG	0x80	/* Invalidation queue head register */
+#define DMAR_IQT_REG	0x88	/* Invalidation queue tail register */
+#define DMAR_IQ_SHIFT	4	/* Invalidation queue head/tail shift */
+#define DMAR_IQA_REG	0x90	/* Invalidation queue addr register */
+#define DMAR_ICS_REG	0x9c	/* Invalidation complete status register */
+#define DMAR_IQER_REG	0xb0	/* Invalidation queue error record register */
+#define DMAR_IRTA_REG	0xb8    /* Interrupt remapping table addr register */
+#define DMAR_PQH_REG	0xc0	/* Page request queue head register */
+#define DMAR_PQT_REG	0xc8	/* Page request queue tail register */
+#define DMAR_PQA_REG	0xd0	/* Page request queue address register */
+#define DMAR_PRS_REG	0xdc	/* Page request status register */
+#define DMAR_PECTL_REG	0xe0	/* Page request event control register */
+#define	DMAR_PEDATA_REG	0xe4	/* Page request event interrupt data register */
+#define	DMAR_PEADDR_REG	0xe8	/* Page request event interrupt addr register */
+#define	DMAR_PEUADDR_REG 0xec	/* Page request event Upper address register */
+#define DMAR_MTRRCAP_REG 0x100	/* MTRR capability register */
+#define DMAR_MTRRDEF_REG 0x108	/* MTRR default type register */
+#define DMAR_MTRR_FIX64K_00000_REG 0x120 /* MTRR Fixed range registers */
+#define DMAR_MTRR_FIX16K_80000_REG 0x128
+#define DMAR_MTRR_FIX16K_A0000_REG 0x130
+#define DMAR_MTRR_FIX4K_C0000_REG 0x138
+#define DMAR_MTRR_FIX4K_C8000_REG 0x140
+#define DMAR_MTRR_FIX4K_D0000_REG 0x148
+#define DMAR_MTRR_FIX4K_D8000_REG 0x150
+#define DMAR_MTRR_FIX4K_E0000_REG 0x158
+#define DMAR_MTRR_FIX4K_E8000_REG 0x160
+#define DMAR_MTRR_FIX4K_F0000_REG 0x168
+#define DMAR_MTRR_FIX4K_F8000_REG 0x170
+#define DMAR_MTRR_PHYSBASE0_REG 0x180 /* MTRR Variable range registers */
+#define DMAR_MTRR_PHYSMASK0_REG 0x188
+#define DMAR_MTRR_PHYSBASE1_REG 0x190
+#define DMAR_MTRR_PHYSMASK1_REG 0x198
+#define DMAR_MTRR_PHYSBASE2_REG 0x1a0
+#define DMAR_MTRR_PHYSMASK2_REG 0x1a8
+#define DMAR_MTRR_PHYSBASE3_REG 0x1b0
+#define DMAR_MTRR_PHYSMASK3_REG 0x1b8
+#define DMAR_MTRR_PHYSBASE4_REG 0x1c0
+#define DMAR_MTRR_PHYSMASK4_REG 0x1c8
+#define DMAR_MTRR_PHYSBASE5_REG 0x1d0
+#define DMAR_MTRR_PHYSMASK5_REG 0x1d8
+#define DMAR_MTRR_PHYSBASE6_REG 0x1e0
+#define DMAR_MTRR_PHYSMASK6_REG 0x1e8
+#define DMAR_MTRR_PHYSBASE7_REG 0x1f0
+#define DMAR_MTRR_PHYSMASK7_REG 0x1f8
+#define DMAR_MTRR_PHYSBASE8_REG 0x200
+#define DMAR_MTRR_PHYSMASK8_REG 0x208
+#define DMAR_MTRR_PHYSBASE9_REG 0x210
+#define DMAR_MTRR_PHYSMASK9_REG 0x218
+#define DMAR_VCCAP_REG		0xe30 /* Virtual command capability register */
+#define DMAR_VCMD_REG		0xe00 /* Virtual command register */
+#define DMAR_VCRSP_REG		0xe10 /* Virtual command response register */
+
+#define DMAR_IQER_REG_IQEI(reg)		FIELD_GET(GENMASK_ULL(3, 0), reg)
+#define DMAR_IQER_REG_ITESID(reg)	FIELD_GET(GENMASK_ULL(47, 32), reg)
+#define DMAR_IQER_REG_ICESID(reg)	FIELD_GET(GENMASK_ULL(63, 48), reg)
+
+#define OFFSET_STRIDE		(9)
+
+#define dmar_readq(a) readq(a)
+#define dmar_writeq(a,v) writeq(v,a)
+#define dmar_readl(a) readl(a)
+#define dmar_writel(a, v) writel(v, a)
+
+#define DMAR_VER_MAJOR(v)		(((v) & 0xf0) >> 4)
+#define DMAR_VER_MINOR(v)		((v) & 0x0f)
+
+/*
+ * Decoding Capability Register
+ */
+#define cap_5lp_support(c)	(((c) >> 60) & 1)
+#define cap_pi_support(c)	(((c) >> 59) & 1)
+#define cap_fl1gp_support(c)	(((c) >> 56) & 1)
+#define cap_read_drain(c)	(((c) >> 55) & 1)
+#define cap_write_drain(c)	(((c) >> 54) & 1)
+#define cap_max_amask_val(c)	(((c) >> 48) & 0x3f)
+#define cap_num_fault_regs(c)	((((c) >> 40) & 0xff) + 1)
+#define cap_pgsel_inv(c)	(((c) >> 39) & 1)
+
+#define cap_super_page_val(c)	(((c) >> 34) & 0xf)
+#define cap_super_offset(c)	(((find_first_bit(&cap_super_page_val(c), 4)) \
+					* OFFSET_STRIDE) + 21)
+
+#define cap_fault_reg_offset(c)	((((c) >> 24) & 0x3ff) * 16)
+#define cap_max_fault_reg_offset(c) \
+	(cap_fault_reg_offset(c) + cap_num_fault_regs(c) * 16)
+
+#define cap_zlr(c)		(((c) >> 22) & 1)
+#define cap_isoch(c)		(((c) >> 23) & 1)
+#define cap_mgaw(c)		((((c) >> 16) & 0x3f) + 1)
+#define cap_sagaw(c)		(((c) >> 8) & 0x1f)
+#define cap_caching_mode(c)	(((c) >> 7) & 1)
+#define cap_phmr(c)		(((c) >> 6) & 1)
+#define cap_plmr(c)		(((c) >> 5) & 1)
+#define cap_rwbf(c)		(((c) >> 4) & 1)
+#define cap_afl(c)		(((c) >> 3) & 1)
+#define cap_ndoms(c)		(((unsigned long)1) << (4 + 2 * ((c) & 0x7)))
+/*
+ * Extended Capability Register
+ */
+
+#define	ecap_rps(e)		(((e) >> 49) & 0x1)
+#define ecap_smpwc(e)		(((e) >> 48) & 0x1)
+#define ecap_flts(e)		(((e) >> 47) & 0x1)
+#define ecap_slts(e)		(((e) >> 46) & 0x1)
+#define ecap_slads(e)		(((e) >> 45) & 0x1)
+#define ecap_vcs(e)		(((e) >> 44) & 0x1)
+#define ecap_smts(e)		(((e) >> 43) & 0x1)
+#define ecap_dit(e)		(((e) >> 41) & 0x1)
+#define ecap_pds(e)		(((e) >> 42) & 0x1)
+#define ecap_pasid(e)		(((e) >> 40) & 0x1)
+#define ecap_pss(e)		(((e) >> 35) & 0x1f)
+#define ecap_eafs(e)		(((e) >> 34) & 0x1)
+#define ecap_nwfs(e)		(((e) >> 33) & 0x1)
+#define ecap_srs(e)		(((e) >> 31) & 0x1)
+#define ecap_ers(e)		(((e) >> 30) & 0x1)
+#define ecap_prs(e)		(((e) >> 29) & 0x1)
+#define ecap_broken_pasid(e)	(((e) >> 28) & 0x1)
+#define ecap_dis(e)		(((e) >> 27) & 0x1)
+#define ecap_nest(e)		(((e) >> 26) & 0x1)
+#define ecap_mts(e)		(((e) >> 25) & 0x1)
+#define ecap_ecs(e)		(((e) >> 24) & 0x1)
+#define ecap_iotlb_offset(e) 	((((e) >> 8) & 0x3ff) * 16)
+#define ecap_max_iotlb_offset(e) (ecap_iotlb_offset(e) + 16)
+#define ecap_coherent(e)	((e) & 0x1)
+#define ecap_qis(e)		((e) & 0x2)
+#define ecap_pass_through(e)	(((e) >> 6) & 0x1)
+#define ecap_eim_support(e)	(((e) >> 4) & 0x1)
+#define ecap_ir_support(e)	(((e) >> 3) & 0x1)
+#define ecap_dev_iotlb_support(e)	(((e) >> 2) & 0x1)
+#define ecap_max_handle_mask(e) (((e) >> 20) & 0xf)
+#define ecap_sc_support(e)	(((e) >> 7) & 0x1) /* Snooping Control */
+
+/* Virtual command interface capability */
+#define vccap_pasid(v)		(((v) & DMA_VCS_PAS)) /* PASID allocation */
+
+/* IOTLB_REG */
+#define DMA_TLB_FLUSH_GRANU_OFFSET  60
+#define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60)
+#define DMA_TLB_DSI_FLUSH (((u64)2) << 60)
+#define DMA_TLB_PSI_FLUSH (((u64)3) << 60)
+#define DMA_TLB_IIRG(type) ((type >> 60) & 3)
+#define DMA_TLB_IAIG(val) (((val) >> 57) & 3)
+#define DMA_TLB_READ_DRAIN (((u64)1) << 49)
+#define DMA_TLB_WRITE_DRAIN (((u64)1) << 48)
+#define DMA_TLB_DID(id)	(((u64)((id) & 0xffff)) << 32)
+#define DMA_TLB_IVT (((u64)1) << 63)
+#define DMA_TLB_IH_NONLEAF (((u64)1) << 6)
+#define DMA_TLB_MAX_SIZE (0x3f)
+
+/* INVALID_DESC */
+#define DMA_CCMD_INVL_GRANU_OFFSET  61
+#define DMA_ID_TLB_GLOBAL_FLUSH	(((u64)1) << 4)
+#define DMA_ID_TLB_DSI_FLUSH	(((u64)2) << 4)
+#define DMA_ID_TLB_PSI_FLUSH	(((u64)3) << 4)
+#define DMA_ID_TLB_READ_DRAIN	(((u64)1) << 7)
+#define DMA_ID_TLB_WRITE_DRAIN	(((u64)1) << 6)
+#define DMA_ID_TLB_DID(id)	(((u64)((id & 0xffff) << 16)))
+#define DMA_ID_TLB_IH_NONLEAF	(((u64)1) << 6)
+#define DMA_ID_TLB_ADDR(addr)	(addr)
+#define DMA_ID_TLB_ADDR_MASK(mask)	(mask)
+
+/* PMEN_REG */
+#define DMA_PMEN_EPM (((u32)1)<<31)
+#define DMA_PMEN_PRS (((u32)1)<<0)
+
+/* GCMD_REG */
+#define DMA_GCMD_TE (((u32)1) << 31)
+#define DMA_GCMD_SRTP (((u32)1) << 30)
+#define DMA_GCMD_SFL (((u32)1) << 29)
+#define DMA_GCMD_EAFL (((u32)1) << 28)
+#define DMA_GCMD_WBF (((u32)1) << 27)
+#define DMA_GCMD_QIE (((u32)1) << 26)
+#define DMA_GCMD_SIRTP (((u32)1) << 24)
+#define DMA_GCMD_IRE (((u32) 1) << 25)
+#define DMA_GCMD_CFI (((u32) 1) << 23)
+
+/* GSTS_REG */
+#define DMA_GSTS_TES (((u32)1) << 31)
+#define DMA_GSTS_RTPS (((u32)1) << 30)
+#define DMA_GSTS_FLS (((u32)1) << 29)
+#define DMA_GSTS_AFLS (((u32)1) << 28)
+#define DMA_GSTS_WBFS (((u32)1) << 27)
+#define DMA_GSTS_QIES (((u32)1) << 26)
+#define DMA_GSTS_IRTPS (((u32)1) << 24)
+#define DMA_GSTS_IRES (((u32)1) << 25)
+#define DMA_GSTS_CFIS (((u32)1) << 23)
+
+/* DMA_RTADDR_REG */
+#define DMA_RTADDR_RTT (((u64)1) << 11)
+#define DMA_RTADDR_SMT (((u64)1) << 10)
+
+/* CCMD_REG */
+#define DMA_CCMD_ICC (((u64)1) << 63)
+#define DMA_CCMD_GLOBAL_INVL (((u64)1) << 61)
+#define DMA_CCMD_DOMAIN_INVL (((u64)2) << 61)
+#define DMA_CCMD_DEVICE_INVL (((u64)3) << 61)
+#define DMA_CCMD_FM(m) (((u64)((m) & 0x3)) << 32)
+#define DMA_CCMD_MASK_NOBIT 0
+#define DMA_CCMD_MASK_1BIT 1
+#define DMA_CCMD_MASK_2BIT 2
+#define DMA_CCMD_MASK_3BIT 3
+#define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16)
+#define DMA_CCMD_DID(d) ((u64)((d) & 0xffff))
+
+/* FECTL_REG */
+#define DMA_FECTL_IM (((u32)1) << 31)
+
+/* FSTS_REG */
+#define DMA_FSTS_PFO (1 << 0) /* Primary Fault Overflow */
+#define DMA_FSTS_PPF (1 << 1) /* Primary Pending Fault */
+#define DMA_FSTS_IQE (1 << 4) /* Invalidation Queue Error */
+#define DMA_FSTS_ICE (1 << 5) /* Invalidation Completion Error */
+#define DMA_FSTS_ITE (1 << 6) /* Invalidation Time-out Error */
+#define DMA_FSTS_PRO (1 << 7) /* Page Request Overflow */
+#define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff)
+
+/* FRCD_REG, 32 bits access */
+#define DMA_FRCD_F (((u32)1) << 31)
+#define dma_frcd_type(d) ((d >> 30) & 1)
+#define dma_frcd_fault_reason(c) (c & 0xff)
+#define dma_frcd_source_id(c) (c & 0xffff)
+#define dma_frcd_pasid_value(c) (((c) >> 8) & 0xfffff)
+#define dma_frcd_pasid_present(c) (((c) >> 31) & 1)
+/* low 64 bit */
+#define dma_frcd_page_addr(d) (d & (((u64)-1) << PAGE_SHIFT))
+
+/* PRS_REG */
+#define DMA_PRS_PPR	((u32)1)
+#define DMA_PRS_PRO	((u32)2)
+
+#define DMA_VCS_PAS	((u64)1)
+
+#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts)			\
+do {									\
+	cycles_t start_time = get_cycles();				\
+	while (1) {							\
+		sts = op(iommu->reg + offset);				\
+		if (cond)						\
+			break;						\
+		if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\
+			panic("DMAR hardware is malfunctioning\n");	\
+		cpu_relax();						\
+	}								\
+} while (0)
+
+#define QI_LENGTH	256	/* queue length */
+
+enum {
+	QI_FREE,
+	QI_IN_USE,
+	QI_DONE,
+	QI_ABORT
+};
+
+#define QI_CC_TYPE		0x1
+#define QI_IOTLB_TYPE		0x2
+#define QI_DIOTLB_TYPE		0x3
+#define QI_IEC_TYPE		0x4
+#define QI_IWD_TYPE		0x5
+#define QI_EIOTLB_TYPE		0x6
+#define QI_PC_TYPE		0x7
+#define QI_DEIOTLB_TYPE		0x8
+#define QI_PGRP_RESP_TYPE	0x9
+#define QI_PSTRM_RESP_TYPE	0xa
+
+#define QI_IEC_SELECTIVE	(((u64)1) << 4)
+#define QI_IEC_IIDEX(idx)	(((u64)(idx & 0xffff) << 32))
+#define QI_IEC_IM(m)		(((u64)(m & 0x1f) << 27))
+
+#define QI_IWD_STATUS_DATA(d)	(((u64)d) << 32)
+#define QI_IWD_STATUS_WRITE	(((u64)1) << 5)
+#define QI_IWD_FENCE		(((u64)1) << 6)
+#define QI_IWD_PRQ_DRAIN	(((u64)1) << 7)
+
+#define QI_IOTLB_DID(did) 	(((u64)did) << 16)
+#define QI_IOTLB_DR(dr) 	(((u64)dr) << 7)
+#define QI_IOTLB_DW(dw) 	(((u64)dw) << 6)
+#define QI_IOTLB_GRAN(gran) 	(((u64)gran) >> (DMA_TLB_FLUSH_GRANU_OFFSET-4))
+#define QI_IOTLB_ADDR(addr)	(((u64)addr) & VTD_PAGE_MASK)
+#define QI_IOTLB_IH(ih)		(((u64)ih) << 6)
+#define QI_IOTLB_AM(am)		(((u8)am) & 0x3f)
+
+#define QI_CC_FM(fm)		(((u64)fm) << 48)
+#define QI_CC_SID(sid)		(((u64)sid) << 32)
+#define QI_CC_DID(did)		(((u64)did) << 16)
+#define QI_CC_GRAN(gran)	(((u64)gran) >> (DMA_CCMD_INVL_GRANU_OFFSET-4))
+
+#define QI_DEV_IOTLB_SID(sid)	((u64)((sid) & 0xffff) << 32)
+#define QI_DEV_IOTLB_QDEP(qdep)	(((qdep) & 0x1f) << 16)
+#define QI_DEV_IOTLB_ADDR(addr)	((u64)(addr) & VTD_PAGE_MASK)
+#define QI_DEV_IOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | \
+				   ((u64)((pfsid >> 4) & 0xfff) << 52))
+#define QI_DEV_IOTLB_SIZE	1
+#define QI_DEV_IOTLB_MAX_INVS	32
+
+#define QI_PC_PASID(pasid)	(((u64)pasid) << 32)
+#define QI_PC_DID(did)		(((u64)did) << 16)
+#define QI_PC_GRAN(gran)	(((u64)gran) << 4)
+
+/* PASID cache invalidation granu */
+#define QI_PC_ALL_PASIDS	0
+#define QI_PC_PASID_SEL		1
+#define QI_PC_GLOBAL		3
+
+#define QI_EIOTLB_ADDR(addr)	((u64)(addr) & VTD_PAGE_MASK)
+#define QI_EIOTLB_IH(ih)	(((u64)ih) << 6)
+#define QI_EIOTLB_AM(am)	(((u64)am) & 0x3f)
+#define QI_EIOTLB_PASID(pasid) 	(((u64)pasid) << 32)
+#define QI_EIOTLB_DID(did)	(((u64)did) << 16)
+#define QI_EIOTLB_GRAN(gran) 	(((u64)gran) << 4)
+
+/* QI Dev-IOTLB inv granu */
+#define QI_DEV_IOTLB_GRAN_ALL		1
+#define QI_DEV_IOTLB_GRAN_PASID_SEL	0
+
+#define QI_DEV_EIOTLB_ADDR(a)	((u64)(a) & VTD_PAGE_MASK)
+#define QI_DEV_EIOTLB_SIZE	(((u64)1) << 11)
+#define QI_DEV_EIOTLB_PASID(p)	((u64)((p) & 0xfffff) << 32)
+#define QI_DEV_EIOTLB_SID(sid)	((u64)((sid) & 0xffff) << 16)
+#define QI_DEV_EIOTLB_QDEP(qd)	((u64)((qd) & 0x1f) << 4)
+#define QI_DEV_EIOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | \
+				    ((u64)((pfsid >> 4) & 0xfff) << 52))
+#define QI_DEV_EIOTLB_MAX_INVS	32
+
+/* Page group response descriptor QW0 */
+#define QI_PGRP_PASID_P(p)	(((u64)(p)) << 4)
+#define QI_PGRP_PDP(p)		(((u64)(p)) << 5)
+#define QI_PGRP_RESP_CODE(res)	(((u64)(res)) << 12)
+#define QI_PGRP_DID(rid)	(((u64)(rid)) << 16)
+#define QI_PGRP_PASID(pasid)	(((u64)(pasid)) << 32)
+
+/* Page group response descriptor QW1 */
+#define QI_PGRP_LPIG(x)		(((u64)(x)) << 2)
+#define QI_PGRP_IDX(idx)	(((u64)(idx)) << 3)
+
+
+#define QI_RESP_SUCCESS		0x0
+#define QI_RESP_INVALID		0x1
+#define QI_RESP_FAILURE		0xf
+
+#define QI_GRAN_NONG_PASID		2
+#define QI_GRAN_PSI_PASID		3
+
+#define qi_shift(iommu)		(DMAR_IQ_SHIFT + !!ecap_smts((iommu)->ecap))
+
+struct qi_desc {
+	u64 qw0;
+	u64 qw1;
+	u64 qw2;
+	u64 qw3;
+};
+
+struct q_inval {
+	raw_spinlock_t  q_lock;
+	void		*desc;          /* invalidation queue */
+	int             *desc_status;   /* desc status */
+	int             free_head;      /* first free entry */
+	int             free_tail;      /* last free entry */
+	int             free_cnt;
+};
+
+struct dmar_pci_notify_info;
+
+#ifdef CONFIG_IRQ_REMAP
+/* 1MB - maximum possible interrupt remapping table size */
+#define INTR_REMAP_PAGE_ORDER	8
+#define INTR_REMAP_TABLE_REG_SIZE	0xf
+#define INTR_REMAP_TABLE_REG_SIZE_MASK  0xf
+
+#define INTR_REMAP_TABLE_ENTRIES	65536
+
+struct irq_domain;
+
+struct ir_table {
+	struct irte *base;
+	unsigned long *bitmap;
+};
+
+void intel_irq_remap_add_device(struct dmar_pci_notify_info *info);
+#else
+static inline void
+intel_irq_remap_add_device(struct dmar_pci_notify_info *info) { }
+#endif
+
+struct iommu_flush {
+	void (*flush_context)(struct intel_iommu *iommu, u16 did, u16 sid,
+			      u8 fm, u64 type);
+	void (*flush_iotlb)(struct intel_iommu *iommu, u16 did, u64 addr,
+			    unsigned int size_order, u64 type);
+};
+
+enum {
+	SR_DMAR_FECTL_REG,
+	SR_DMAR_FEDATA_REG,
+	SR_DMAR_FEADDR_REG,
+	SR_DMAR_FEUADDR_REG,
+	MAX_SR_DMAR_REGS
+};
+
+#define VTD_FLAG_TRANS_PRE_ENABLED	(1 << 0)
+#define VTD_FLAG_IRQ_REMAP_PRE_ENABLED	(1 << 1)
+#define VTD_FLAG_SVM_CAPABLE		(1 << 2)
+
+extern int intel_iommu_sm;
+extern spinlock_t device_domain_lock;
+
+#define sm_supported(iommu)	(intel_iommu_sm && ecap_smts((iommu)->ecap))
+#define pasid_supported(iommu)	(sm_supported(iommu) &&			\
+				 ecap_pasid((iommu)->ecap))
+
+struct pasid_entry;
+struct pasid_state_entry;
+struct page_req_dsc;
+
+/*
+ * 0: Present
+ * 1-11: Reserved
+ * 12-63: Context Ptr (12 - (haw-1))
+ * 64-127: Reserved
+ */
+struct root_entry {
+	u64     lo;
+	u64     hi;
+};
+
+/*
+ * low 64 bits:
+ * 0: present
+ * 1: fault processing disable
+ * 2-3: translation type
+ * 12-63: address space root
+ * high 64 bits:
+ * 0-2: address width
+ * 3-6: aval
+ * 8-23: domain id
+ */
+struct context_entry {
+	u64 lo;
+	u64 hi;
+};
+
+/*
+ * When VT-d works in the scalable mode, it allows DMA translation to
+ * happen through either first level or second level page table. This
+ * bit marks that the DMA translation for the domain goes through the
+ * first level page table, otherwise, it goes through the second level.
+ */
+#define DOMAIN_FLAG_USE_FIRST_LEVEL		BIT(1)
+
+struct dmar_domain {
+	int	nid;			/* node id */
+
+	unsigned int iommu_refcnt[DMAR_UNITS_SUPPORTED];
+					/* Refcount of devices per iommu */
+
+
+	u16		iommu_did[DMAR_UNITS_SUPPORTED];
+					/* Domain ids per IOMMU. Use u16 since
+					 * domain ids are 16 bit wide according
+					 * to VT-d spec, section 9.3 */
+
+	u8 has_iotlb_device: 1;
+	u8 iommu_coherency: 1;		/* indicate coherency of iommu access */
+	u8 force_snooping : 1;		/* Create IOPTEs with snoop control */
+	u8 set_pte_snp:1;
+
+	struct list_head devices;	/* all devices' list */
+	struct iova_domain iovad;	/* iova's that belong to this domain */
+
+	struct dma_pte	*pgd;		/* virtual address */
+	int		gaw;		/* max guest address width */
+
+	/* adjusted guest address width, 0 is level 2 30-bit */
+	int		agaw;
+
+	int		flags;		/* flags to find out type of domain */
+	int		iommu_superpage;/* Level of superpages supported:
+					   0 == 4KiB (no superpages), 1 == 2MiB,
+					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
+	u64		max_addr;	/* maximum mapped address */
+
+	struct iommu_domain domain;	/* generic domain data structure for
+					   iommu core */
+};
+
+struct intel_iommu {
+	void __iomem	*reg; /* Pointer to hardware regs, virtual addr */
+	u64 		reg_phys; /* physical address of hw register set */
+	u64		reg_size; /* size of hw register set */
+	u64		cap;
+	u64		ecap;
+	u64		vccap;
+	u32		gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */
+	raw_spinlock_t	register_lock; /* protect register handling */
+	int		seq_id;	/* sequence id of the iommu */
+	int		agaw; /* agaw of this iommu */
+	int		msagaw; /* max sagaw of this iommu */
+	unsigned int 	irq, pr_irq;
+	u16		segment;     /* PCI segment# */
+	unsigned char 	name[13];    /* Device Name */
+
+#ifdef CONFIG_INTEL_IOMMU
+	unsigned long 	*domain_ids; /* bitmap of domains */
+	spinlock_t	lock; /* protect context, domain ids */
+	struct root_entry *root_entry; /* virtual address */
+
+	struct iommu_flush flush;
+#endif
+#ifdef CONFIG_INTEL_IOMMU_SVM
+	struct page_req_dsc *prq;
+	unsigned char prq_name[16];    /* Name for PRQ interrupt */
+	struct completion prq_complete;
+	struct ioasid_allocator_ops pasid_allocator; /* Custom allocator for PASIDs */
+#endif
+	struct iopf_queue *iopf_queue;
+	unsigned char iopfq_name[16];
+	struct q_inval  *qi;            /* Queued invalidation info */
+	u32 *iommu_state; /* Store iommu states between suspend and resume.*/
+
+#ifdef CONFIG_IRQ_REMAP
+	struct ir_table *ir_table;	/* Interrupt remapping info */
+	struct irq_domain *ir_domain;
+	struct irq_domain *ir_msi_domain;
+#endif
+	struct iommu_device iommu;  /* IOMMU core code handle */
+	int		node;
+	u32		flags;      /* Software defined flags */
+
+	struct dmar_drhd_unit *drhd;
+	void *perf_statistic;
+};
+
+/* PCI domain-device relationship */
+struct device_domain_info {
+	struct list_head link;	/* link to domain siblings */
+	struct list_head global; /* link to global list */
+	u32 segment;		/* PCI segment number */
+	u8 bus;			/* PCI bus number */
+	u8 devfn;		/* PCI devfn number */
+	u16 pfsid;		/* SRIOV physical function source ID */
+	u8 pasid_supported:3;
+	u8 pasid_enabled:1;
+	u8 pri_supported:1;
+	u8 pri_enabled:1;
+	u8 ats_supported:1;
+	u8 ats_enabled:1;
+	u8 ats_qdep;
+	struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
+	struct intel_iommu *iommu; /* IOMMU used by this device */
+	struct dmar_domain *domain; /* pointer to domain */
+	struct pasid_table *pasid_table; /* pasid table */
+};
+
+static inline void __iommu_flush_cache(
+	struct intel_iommu *iommu, void *addr, int size)
+{
+	if (!ecap_coherent(iommu->ecap))
+		clflush_cache_range(addr, size);
+}
+
+/* Convert generic struct iommu_domain to private struct dmar_domain */
+static inline struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
+{
+	return container_of(dom, struct dmar_domain, domain);
+}
+
+/*
+ * 0: readable
+ * 1: writable
+ * 2-6: reserved
+ * 7: super page
+ * 8-10: available
+ * 11: snoop behavior
+ * 12-63: Host physical address
+ */
+struct dma_pte {
+	u64 val;
+};
+
+static inline void dma_clear_pte(struct dma_pte *pte)
+{
+	pte->val = 0;
+}
+
+static inline u64 dma_pte_addr(struct dma_pte *pte)
+{
+#ifdef CONFIG_64BIT
+	return pte->val & VTD_PAGE_MASK & (~DMA_FL_PTE_XD);
+#else
+	/* Must have a full atomic 64-bit read */
+	return  __cmpxchg64(&pte->val, 0ULL, 0ULL) &
+			VTD_PAGE_MASK & (~DMA_FL_PTE_XD);
+#endif
+}
+
+static inline bool dma_pte_present(struct dma_pte *pte)
+{
+	return (pte->val & 3) != 0;
+}
+
+static inline bool dma_pte_superpage(struct dma_pte *pte)
+{
+	return (pte->val & DMA_PTE_LARGE_PAGE);
+}
+
+static inline bool first_pte_in_page(struct dma_pte *pte)
+{
+	return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE);
+}
+
+static inline int nr_pte_to_next_page(struct dma_pte *pte)
+{
+	return first_pte_in_page(pte) ? BIT_ULL(VTD_STRIDE_SHIFT) :
+		(struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte;
+}
+
+extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev);
+
+extern int dmar_enable_qi(struct intel_iommu *iommu);
+extern void dmar_disable_qi(struct intel_iommu *iommu);
+extern int dmar_reenable_qi(struct intel_iommu *iommu);
+extern void qi_global_iec(struct intel_iommu *iommu);
+
+extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid,
+			     u8 fm, u64 type);
+extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
+			  unsigned int size_order, u64 type);
+extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+			u16 qdep, u64 addr, unsigned mask);
+
+void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr,
+		     unsigned long npages, bool ih);
+
+void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+			      u32 pasid, u16 qdep, u64 addr,
+			      unsigned int size_order);
+void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu,
+			  u32 pasid);
+
+int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc,
+		   unsigned int count, unsigned long options);
+/*
+ * Options used in qi_submit_sync:
+ * QI_OPT_WAIT_DRAIN - Wait for PRQ drain completion, spec 6.5.2.8.
+ */
+#define QI_OPT_WAIT_DRAIN		BIT(0)
+
+extern int dmar_ir_support(void);
+
+void *alloc_pgtable_page(int node);
+void free_pgtable_page(void *vaddr);
+struct intel_iommu *domain_get_iommu(struct dmar_domain *domain);
+void iommu_flush_write_buffer(struct intel_iommu *iommu);
+int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev);
+struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn);
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+extern void intel_svm_check(struct intel_iommu *iommu);
+extern int intel_svm_enable_prq(struct intel_iommu *iommu);
+extern int intel_svm_finish_prq(struct intel_iommu *iommu);
+struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm,
+				 void *drvdata);
+void intel_svm_unbind(struct iommu_sva *handle);
+u32 intel_svm_get_pasid(struct iommu_sva *handle);
+int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt,
+			    struct iommu_page_response *msg);
+
+struct intel_svm_dev {
+	struct list_head list;
+	struct rcu_head rcu;
+	struct device *dev;
+	struct intel_iommu *iommu;
+	struct iommu_sva sva;
+	unsigned long prq_seq_number;
+	u32 pasid;
+	int users;
+	u16 did;
+	u16 dev_iotlb:1;
+	u16 sid, qdep;
+};
+
+struct intel_svm {
+	struct mmu_notifier notifier;
+	struct mm_struct *mm;
+
+	unsigned int flags;
+	u32 pasid;
+	struct list_head devs;
+};
+#else
+static inline void intel_svm_check(struct intel_iommu *iommu) {}
+#endif
+
+#ifdef CONFIG_INTEL_IOMMU_DEBUGFS
+void intel_iommu_debugfs_init(void);
+#else
+static inline void intel_iommu_debugfs_init(void) {}
+#endif /* CONFIG_INTEL_IOMMU_DEBUGFS */
+
+extern const struct attribute_group *intel_iommu_groups[];
+bool context_present(struct context_entry *context);
+struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
+					 u8 devfn, int alloc);
+
+extern const struct iommu_ops intel_iommu_ops;
+
+#ifdef CONFIG_INTEL_IOMMU
+extern int iommu_calculate_agaw(struct intel_iommu *iommu);
+extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu);
+extern int dmar_disabled;
+extern int intel_iommu_enabled;
+#else
+static inline int iommu_calculate_agaw(struct intel_iommu *iommu)
+{
+	return 0;
+}
+static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
+{
+	return 0;
+}
+#define dmar_disabled	(1)
+#define intel_iommu_enabled (0)
+#endif
+
+static inline const char *decode_prq_descriptor(char *str, size_t size,
+		u64 dw0, u64 dw1, u64 dw2, u64 dw3)
+{
+	char *buf = str;
+	int bytes;
+
+	bytes = snprintf(buf, size,
+			 "rid=0x%llx addr=0x%llx %c%c%c%c%c pasid=0x%llx index=0x%llx",
+			 FIELD_GET(GENMASK_ULL(31, 16), dw0),
+			 FIELD_GET(GENMASK_ULL(63, 12), dw1),
+			 dw1 & BIT_ULL(0) ? 'r' : '-',
+			 dw1 & BIT_ULL(1) ? 'w' : '-',
+			 dw0 & BIT_ULL(52) ? 'x' : '-',
+			 dw0 & BIT_ULL(53) ? 'p' : '-',
+			 dw1 & BIT_ULL(2) ? 'l' : '-',
+			 FIELD_GET(GENMASK_ULL(51, 32), dw0),
+			 FIELD_GET(GENMASK_ULL(11, 3), dw1));
+
+	/* Private Data */
+	if (dw0 & BIT_ULL(9)) {
+		size -= bytes;
+		buf += bytes;
+		snprintf(buf, size, " private=0x%llx/0x%llx\n", dw2, dw3);
+	}
+
+	return str;
+}
+
+#endif
diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
index a67319597884..2e9683e970f8 100644
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -10,7 +10,6 @@
 #include <linux/hpet.h>
 #include <linux/pci.h>
 #include <linux/irq.h>
-#include <linux/intel-iommu.h>
 #include <linux/acpi.h>
 #include <linux/irqdomain.h>
 #include <linux/crash_dump.h>
@@ -21,6 +20,7 @@
 #include <asm/irq_remapping.h>
 #include <asm/pci-direct.h>
 
+#include "iommu.h"
 #include "../irq_remapping.h"
 #include "cap_audit.h"
 
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 17cad7c1f62d..43f090381ec7 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -12,13 +12,13 @@
 #include <linux/bitops.h>
 #include <linux/cpufeature.h>
 #include <linux/dmar.h>
-#include <linux/intel-iommu.h>
 #include <linux/iommu.h>
 #include <linux/memory.h>
 #include <linux/pci.h>
 #include <linux/pci-ats.h>
 #include <linux/spinlock.h>
 
+#include "iommu.h"
 #include "pasid.h"
 
 /*
diff --git a/drivers/iommu/intel/perf.c b/drivers/iommu/intel/perf.c
index 0e8e03252d92..94ee70ac38e3 100644
--- a/drivers/iommu/intel/perf.c
+++ b/drivers/iommu/intel/perf.c
@@ -9,8 +9,8 @@
  */
 
 #include <linux/spinlock.h>
-#include <linux/intel-iommu.h>
 
+#include "iommu.h"
 #include "perf.h"
 
 static DEFINE_SPINLOCK(latency_lock);
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 70b40d007a52..580713aa9e07 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -5,7 +5,6 @@
  * Authors: David Woodhouse <dwmw2@infradead.org>
  */
 
-#include <linux/intel-iommu.h>
 #include <linux/mmu_notifier.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
@@ -22,6 +21,7 @@
 #include <asm/page.h>
 #include <asm/fpu/api.h>
 
+#include "iommu.h"
 #include "pasid.h"
 #include "perf.h"
 #include "../iommu-sva-lib.h"
diff --git a/drivers/iommu/intel/trace.h b/drivers/iommu/intel/trace.h
index 25cb7f88e1a2..93d96f93a89b 100644
--- a/drivers/iommu/intel/trace.h
+++ b/drivers/iommu/intel/trace.h
@@ -13,7 +13,8 @@
 #define _TRACE_INTEL_IOMMU_H
 
 #include <linux/tracepoint.h>
-#include <linux/intel-iommu.h>
+
+#include "iommu.h"
 
 #define MSG_MAX		256
 
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
deleted file mode 100644
index 4ebf3c4da45e..000000000000
--- a/include/linux/intel-iommu.h
+++ /dev/null
@@ -1,831 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright © 2006-2015, Intel Corporation.
- *
- * Authors: Ashok Raj <ashok.raj@intel.com>
- *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
- *          David Woodhouse <David.Woodhouse@intel.com>
- */
-
-#ifndef _INTEL_IOMMU_H_
-#define _INTEL_IOMMU_H_
-
-#include <linux/types.h>
-#include <linux/iova.h>
-#include <linux/io.h>
-#include <linux/idr.h>
-#include <linux/mmu_notifier.h>
-#include <linux/list.h>
-#include <linux/iommu.h>
-#include <linux/io-64-nonatomic-lo-hi.h>
-#include <linux/dmar.h>
-#include <linux/ioasid.h>
-#include <linux/bitfield.h>
-
-#include <asm/cacheflush.h>
-#include <asm/iommu.h>
-
-/*
- * VT-d hardware uses 4KiB page size regardless of host page size.
- */
-#define VTD_PAGE_SHIFT		(12)
-#define VTD_PAGE_SIZE		(1UL << VTD_PAGE_SHIFT)
-#define VTD_PAGE_MASK		(((u64)-1) << VTD_PAGE_SHIFT)
-#define VTD_PAGE_ALIGN(addr)	(((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK)
-
-#define VTD_STRIDE_SHIFT        (9)
-#define VTD_STRIDE_MASK         (((u64)-1) << VTD_STRIDE_SHIFT)
-
-#define DMA_PTE_READ		BIT_ULL(0)
-#define DMA_PTE_WRITE		BIT_ULL(1)
-#define DMA_PTE_LARGE_PAGE	BIT_ULL(7)
-#define DMA_PTE_SNP		BIT_ULL(11)
-
-#define DMA_FL_PTE_PRESENT	BIT_ULL(0)
-#define DMA_FL_PTE_US		BIT_ULL(2)
-#define DMA_FL_PTE_ACCESS	BIT_ULL(5)
-#define DMA_FL_PTE_DIRTY	BIT_ULL(6)
-#define DMA_FL_PTE_XD		BIT_ULL(63)
-
-#define ADDR_WIDTH_5LEVEL	(57)
-#define ADDR_WIDTH_4LEVEL	(48)
-
-#define CONTEXT_TT_MULTI_LEVEL	0
-#define CONTEXT_TT_DEV_IOTLB	1
-#define CONTEXT_TT_PASS_THROUGH 2
-#define CONTEXT_PASIDE		BIT_ULL(3)
-
-/*
- * Intel IOMMU register specification per version 1.0 public spec.
- */
-#define	DMAR_VER_REG	0x0	/* Arch version supported by this IOMMU */
-#define	DMAR_CAP_REG	0x8	/* Hardware supported capabilities */
-#define	DMAR_ECAP_REG	0x10	/* Extended capabilities supported */
-#define	DMAR_GCMD_REG	0x18	/* Global command register */
-#define	DMAR_GSTS_REG	0x1c	/* Global status register */
-#define	DMAR_RTADDR_REG	0x20	/* Root entry table */
-#define	DMAR_CCMD_REG	0x28	/* Context command reg */
-#define	DMAR_FSTS_REG	0x34	/* Fault Status register */
-#define	DMAR_FECTL_REG	0x38	/* Fault control register */
-#define	DMAR_FEDATA_REG	0x3c	/* Fault event interrupt data register */
-#define	DMAR_FEADDR_REG	0x40	/* Fault event interrupt addr register */
-#define	DMAR_FEUADDR_REG 0x44	/* Upper address register */
-#define	DMAR_AFLOG_REG	0x58	/* Advanced Fault control */
-#define	DMAR_PMEN_REG	0x64	/* Enable Protected Memory Region */
-#define	DMAR_PLMBASE_REG 0x68	/* PMRR Low addr */
-#define	DMAR_PLMLIMIT_REG 0x6c	/* PMRR low limit */
-#define	DMAR_PHMBASE_REG 0x70	/* pmrr high base addr */
-#define	DMAR_PHMLIMIT_REG 0x78	/* pmrr high limit */
-#define DMAR_IQH_REG	0x80	/* Invalidation queue head register */
-#define DMAR_IQT_REG	0x88	/* Invalidation queue tail register */
-#define DMAR_IQ_SHIFT	4	/* Invalidation queue head/tail shift */
-#define DMAR_IQA_REG	0x90	/* Invalidation queue addr register */
-#define DMAR_ICS_REG	0x9c	/* Invalidation complete status register */
-#define DMAR_IQER_REG	0xb0	/* Invalidation queue error record register */
-#define DMAR_IRTA_REG	0xb8    /* Interrupt remapping table addr register */
-#define DMAR_PQH_REG	0xc0	/* Page request queue head register */
-#define DMAR_PQT_REG	0xc8	/* Page request queue tail register */
-#define DMAR_PQA_REG	0xd0	/* Page request queue address register */
-#define DMAR_PRS_REG	0xdc	/* Page request status register */
-#define DMAR_PECTL_REG	0xe0	/* Page request event control register */
-#define	DMAR_PEDATA_REG	0xe4	/* Page request event interrupt data register */
-#define	DMAR_PEADDR_REG	0xe8	/* Page request event interrupt addr register */
-#define	DMAR_PEUADDR_REG 0xec	/* Page request event Upper address register */
-#define DMAR_MTRRCAP_REG 0x100	/* MTRR capability register */
-#define DMAR_MTRRDEF_REG 0x108	/* MTRR default type register */
-#define DMAR_MTRR_FIX64K_00000_REG 0x120 /* MTRR Fixed range registers */
-#define DMAR_MTRR_FIX16K_80000_REG 0x128
-#define DMAR_MTRR_FIX16K_A0000_REG 0x130
-#define DMAR_MTRR_FIX4K_C0000_REG 0x138
-#define DMAR_MTRR_FIX4K_C8000_REG 0x140
-#define DMAR_MTRR_FIX4K_D0000_REG 0x148
-#define DMAR_MTRR_FIX4K_D8000_REG 0x150
-#define DMAR_MTRR_FIX4K_E0000_REG 0x158
-#define DMAR_MTRR_FIX4K_E8000_REG 0x160
-#define DMAR_MTRR_FIX4K_F0000_REG 0x168
-#define DMAR_MTRR_FIX4K_F8000_REG 0x170
-#define DMAR_MTRR_PHYSBASE0_REG 0x180 /* MTRR Variable range registers */
-#define DMAR_MTRR_PHYSMASK0_REG 0x188
-#define DMAR_MTRR_PHYSBASE1_REG 0x190
-#define DMAR_MTRR_PHYSMASK1_REG 0x198
-#define DMAR_MTRR_PHYSBASE2_REG 0x1a0
-#define DMAR_MTRR_PHYSMASK2_REG 0x1a8
-#define DMAR_MTRR_PHYSBASE3_REG 0x1b0
-#define DMAR_MTRR_PHYSMASK3_REG 0x1b8
-#define DMAR_MTRR_PHYSBASE4_REG 0x1c0
-#define DMAR_MTRR_PHYSMASK4_REG 0x1c8
-#define DMAR_MTRR_PHYSBASE5_REG 0x1d0
-#define DMAR_MTRR_PHYSMASK5_REG 0x1d8
-#define DMAR_MTRR_PHYSBASE6_REG 0x1e0
-#define DMAR_MTRR_PHYSMASK6_REG 0x1e8
-#define DMAR_MTRR_PHYSBASE7_REG 0x1f0
-#define DMAR_MTRR_PHYSMASK7_REG 0x1f8
-#define DMAR_MTRR_PHYSBASE8_REG 0x200
-#define DMAR_MTRR_PHYSMASK8_REG 0x208
-#define DMAR_MTRR_PHYSBASE9_REG 0x210
-#define DMAR_MTRR_PHYSMASK9_REG 0x218
-#define DMAR_VCCAP_REG		0xe30 /* Virtual command capability register */
-#define DMAR_VCMD_REG		0xe00 /* Virtual command register */
-#define DMAR_VCRSP_REG		0xe10 /* Virtual command response register */
-
-#define DMAR_IQER_REG_IQEI(reg)		FIELD_GET(GENMASK_ULL(3, 0), reg)
-#define DMAR_IQER_REG_ITESID(reg)	FIELD_GET(GENMASK_ULL(47, 32), reg)
-#define DMAR_IQER_REG_ICESID(reg)	FIELD_GET(GENMASK_ULL(63, 48), reg)
-
-#define OFFSET_STRIDE		(9)
-
-#define dmar_readq(a) readq(a)
-#define dmar_writeq(a,v) writeq(v,a)
-#define dmar_readl(a) readl(a)
-#define dmar_writel(a, v) writel(v, a)
-
-#define DMAR_VER_MAJOR(v)		(((v) & 0xf0) >> 4)
-#define DMAR_VER_MINOR(v)		((v) & 0x0f)
-
-/*
- * Decoding Capability Register
- */
-#define cap_5lp_support(c)	(((c) >> 60) & 1)
-#define cap_pi_support(c)	(((c) >> 59) & 1)
-#define cap_fl1gp_support(c)	(((c) >> 56) & 1)
-#define cap_read_drain(c)	(((c) >> 55) & 1)
-#define cap_write_drain(c)	(((c) >> 54) & 1)
-#define cap_max_amask_val(c)	(((c) >> 48) & 0x3f)
-#define cap_num_fault_regs(c)	((((c) >> 40) & 0xff) + 1)
-#define cap_pgsel_inv(c)	(((c) >> 39) & 1)
-
-#define cap_super_page_val(c)	(((c) >> 34) & 0xf)
-#define cap_super_offset(c)	(((find_first_bit(&cap_super_page_val(c), 4)) \
-					* OFFSET_STRIDE) + 21)
-
-#define cap_fault_reg_offset(c)	((((c) >> 24) & 0x3ff) * 16)
-#define cap_max_fault_reg_offset(c) \
-	(cap_fault_reg_offset(c) + cap_num_fault_regs(c) * 16)
-
-#define cap_zlr(c)		(((c) >> 22) & 1)
-#define cap_isoch(c)		(((c) >> 23) & 1)
-#define cap_mgaw(c)		((((c) >> 16) & 0x3f) + 1)
-#define cap_sagaw(c)		(((c) >> 8) & 0x1f)
-#define cap_caching_mode(c)	(((c) >> 7) & 1)
-#define cap_phmr(c)		(((c) >> 6) & 1)
-#define cap_plmr(c)		(((c) >> 5) & 1)
-#define cap_rwbf(c)		(((c) >> 4) & 1)
-#define cap_afl(c)		(((c) >> 3) & 1)
-#define cap_ndoms(c)		(((unsigned long)1) << (4 + 2 * ((c) & 0x7)))
-/*
- * Extended Capability Register
- */
-
-#define	ecap_rps(e)		(((e) >> 49) & 0x1)
-#define ecap_smpwc(e)		(((e) >> 48) & 0x1)
-#define ecap_flts(e)		(((e) >> 47) & 0x1)
-#define ecap_slts(e)		(((e) >> 46) & 0x1)
-#define ecap_slads(e)		(((e) >> 45) & 0x1)
-#define ecap_vcs(e)		(((e) >> 44) & 0x1)
-#define ecap_smts(e)		(((e) >> 43) & 0x1)
-#define ecap_dit(e)		(((e) >> 41) & 0x1)
-#define ecap_pds(e)		(((e) >> 42) & 0x1)
-#define ecap_pasid(e)		(((e) >> 40) & 0x1)
-#define ecap_pss(e)		(((e) >> 35) & 0x1f)
-#define ecap_eafs(e)		(((e) >> 34) & 0x1)
-#define ecap_nwfs(e)		(((e) >> 33) & 0x1)
-#define ecap_srs(e)		(((e) >> 31) & 0x1)
-#define ecap_ers(e)		(((e) >> 30) & 0x1)
-#define ecap_prs(e)		(((e) >> 29) & 0x1)
-#define ecap_broken_pasid(e)	(((e) >> 28) & 0x1)
-#define ecap_dis(e)		(((e) >> 27) & 0x1)
-#define ecap_nest(e)		(((e) >> 26) & 0x1)
-#define ecap_mts(e)		(((e) >> 25) & 0x1)
-#define ecap_ecs(e)		(((e) >> 24) & 0x1)
-#define ecap_iotlb_offset(e) 	((((e) >> 8) & 0x3ff) * 16)
-#define ecap_max_iotlb_offset(e) (ecap_iotlb_offset(e) + 16)
-#define ecap_coherent(e)	((e) & 0x1)
-#define ecap_qis(e)		((e) & 0x2)
-#define ecap_pass_through(e)	(((e) >> 6) & 0x1)
-#define ecap_eim_support(e)	(((e) >> 4) & 0x1)
-#define ecap_ir_support(e)	(((e) >> 3) & 0x1)
-#define ecap_dev_iotlb_support(e)	(((e) >> 2) & 0x1)
-#define ecap_max_handle_mask(e) (((e) >> 20) & 0xf)
-#define ecap_sc_support(e)	(((e) >> 7) & 0x1) /* Snooping Control */
-
-/* Virtual command interface capability */
-#define vccap_pasid(v)		(((v) & DMA_VCS_PAS)) /* PASID allocation */
-
-/* IOTLB_REG */
-#define DMA_TLB_FLUSH_GRANU_OFFSET  60
-#define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60)
-#define DMA_TLB_DSI_FLUSH (((u64)2) << 60)
-#define DMA_TLB_PSI_FLUSH (((u64)3) << 60)
-#define DMA_TLB_IIRG(type) ((type >> 60) & 3)
-#define DMA_TLB_IAIG(val) (((val) >> 57) & 3)
-#define DMA_TLB_READ_DRAIN (((u64)1) << 49)
-#define DMA_TLB_WRITE_DRAIN (((u64)1) << 48)
-#define DMA_TLB_DID(id)	(((u64)((id) & 0xffff)) << 32)
-#define DMA_TLB_IVT (((u64)1) << 63)
-#define DMA_TLB_IH_NONLEAF (((u64)1) << 6)
-#define DMA_TLB_MAX_SIZE (0x3f)
-
-/* INVALID_DESC */
-#define DMA_CCMD_INVL_GRANU_OFFSET  61
-#define DMA_ID_TLB_GLOBAL_FLUSH	(((u64)1) << 4)
-#define DMA_ID_TLB_DSI_FLUSH	(((u64)2) << 4)
-#define DMA_ID_TLB_PSI_FLUSH	(((u64)3) << 4)
-#define DMA_ID_TLB_READ_DRAIN	(((u64)1) << 7)
-#define DMA_ID_TLB_WRITE_DRAIN	(((u64)1) << 6)
-#define DMA_ID_TLB_DID(id)	(((u64)((id & 0xffff) << 16)))
-#define DMA_ID_TLB_IH_NONLEAF	(((u64)1) << 6)
-#define DMA_ID_TLB_ADDR(addr)	(addr)
-#define DMA_ID_TLB_ADDR_MASK(mask)	(mask)
-
-/* PMEN_REG */
-#define DMA_PMEN_EPM (((u32)1)<<31)
-#define DMA_PMEN_PRS (((u32)1)<<0)
-
-/* GCMD_REG */
-#define DMA_GCMD_TE (((u32)1) << 31)
-#define DMA_GCMD_SRTP (((u32)1) << 30)
-#define DMA_GCMD_SFL (((u32)1) << 29)
-#define DMA_GCMD_EAFL (((u32)1) << 28)
-#define DMA_GCMD_WBF (((u32)1) << 27)
-#define DMA_GCMD_QIE (((u32)1) << 26)
-#define DMA_GCMD_SIRTP (((u32)1) << 24)
-#define DMA_GCMD_IRE (((u32) 1) << 25)
-#define DMA_GCMD_CFI (((u32) 1) << 23)
-
-/* GSTS_REG */
-#define DMA_GSTS_TES (((u32)1) << 31)
-#define DMA_GSTS_RTPS (((u32)1) << 30)
-#define DMA_GSTS_FLS (((u32)1) << 29)
-#define DMA_GSTS_AFLS (((u32)1) << 28)
-#define DMA_GSTS_WBFS (((u32)1) << 27)
-#define DMA_GSTS_QIES (((u32)1) << 26)
-#define DMA_GSTS_IRTPS (((u32)1) << 24)
-#define DMA_GSTS_IRES (((u32)1) << 25)
-#define DMA_GSTS_CFIS (((u32)1) << 23)
-
-/* DMA_RTADDR_REG */
-#define DMA_RTADDR_RTT (((u64)1) << 11)
-#define DMA_RTADDR_SMT (((u64)1) << 10)
-
-/* CCMD_REG */
-#define DMA_CCMD_ICC (((u64)1) << 63)
-#define DMA_CCMD_GLOBAL_INVL (((u64)1) << 61)
-#define DMA_CCMD_DOMAIN_INVL (((u64)2) << 61)
-#define DMA_CCMD_DEVICE_INVL (((u64)3) << 61)
-#define DMA_CCMD_FM(m) (((u64)((m) & 0x3)) << 32)
-#define DMA_CCMD_MASK_NOBIT 0
-#define DMA_CCMD_MASK_1BIT 1
-#define DMA_CCMD_MASK_2BIT 2
-#define DMA_CCMD_MASK_3BIT 3
-#define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16)
-#define DMA_CCMD_DID(d) ((u64)((d) & 0xffff))
-
-/* FECTL_REG */
-#define DMA_FECTL_IM (((u32)1) << 31)
-
-/* FSTS_REG */
-#define DMA_FSTS_PFO (1 << 0) /* Primary Fault Overflow */
-#define DMA_FSTS_PPF (1 << 1) /* Primary Pending Fault */
-#define DMA_FSTS_IQE (1 << 4) /* Invalidation Queue Error */
-#define DMA_FSTS_ICE (1 << 5) /* Invalidation Completion Error */
-#define DMA_FSTS_ITE (1 << 6) /* Invalidation Time-out Error */
-#define DMA_FSTS_PRO (1 << 7) /* Page Request Overflow */
-#define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff)
-
-/* FRCD_REG, 32 bits access */
-#define DMA_FRCD_F (((u32)1) << 31)
-#define dma_frcd_type(d) ((d >> 30) & 1)
-#define dma_frcd_fault_reason(c) (c & 0xff)
-#define dma_frcd_source_id(c) (c & 0xffff)
-#define dma_frcd_pasid_value(c) (((c) >> 8) & 0xfffff)
-#define dma_frcd_pasid_present(c) (((c) >> 31) & 1)
-/* low 64 bit */
-#define dma_frcd_page_addr(d) (d & (((u64)-1) << PAGE_SHIFT))
-
-/* PRS_REG */
-#define DMA_PRS_PPR	((u32)1)
-#define DMA_PRS_PRO	((u32)2)
-
-#define DMA_VCS_PAS	((u64)1)
-
-#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts)			\
-do {									\
-	cycles_t start_time = get_cycles();				\
-	while (1) {							\
-		sts = op(iommu->reg + offset);				\
-		if (cond)						\
-			break;						\
-		if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\
-			panic("DMAR hardware is malfunctioning\n");	\
-		cpu_relax();						\
-	}								\
-} while (0)
-
-#define QI_LENGTH	256	/* queue length */
-
-enum {
-	QI_FREE,
-	QI_IN_USE,
-	QI_DONE,
-	QI_ABORT
-};
-
-#define QI_CC_TYPE		0x1
-#define QI_IOTLB_TYPE		0x2
-#define QI_DIOTLB_TYPE		0x3
-#define QI_IEC_TYPE		0x4
-#define QI_IWD_TYPE		0x5
-#define QI_EIOTLB_TYPE		0x6
-#define QI_PC_TYPE		0x7
-#define QI_DEIOTLB_TYPE		0x8
-#define QI_PGRP_RESP_TYPE	0x9
-#define QI_PSTRM_RESP_TYPE	0xa
-
-#define QI_IEC_SELECTIVE	(((u64)1) << 4)
-#define QI_IEC_IIDEX(idx)	(((u64)(idx & 0xffff) << 32))
-#define QI_IEC_IM(m)		(((u64)(m & 0x1f) << 27))
-
-#define QI_IWD_STATUS_DATA(d)	(((u64)d) << 32)
-#define QI_IWD_STATUS_WRITE	(((u64)1) << 5)
-#define QI_IWD_FENCE		(((u64)1) << 6)
-#define QI_IWD_PRQ_DRAIN	(((u64)1) << 7)
-
-#define QI_IOTLB_DID(did) 	(((u64)did) << 16)
-#define QI_IOTLB_DR(dr) 	(((u64)dr) << 7)
-#define QI_IOTLB_DW(dw) 	(((u64)dw) << 6)
-#define QI_IOTLB_GRAN(gran) 	(((u64)gran) >> (DMA_TLB_FLUSH_GRANU_OFFSET-4))
-#define QI_IOTLB_ADDR(addr)	(((u64)addr) & VTD_PAGE_MASK)
-#define QI_IOTLB_IH(ih)		(((u64)ih) << 6)
-#define QI_IOTLB_AM(am)		(((u8)am) & 0x3f)
-
-#define QI_CC_FM(fm)		(((u64)fm) << 48)
-#define QI_CC_SID(sid)		(((u64)sid) << 32)
-#define QI_CC_DID(did)		(((u64)did) << 16)
-#define QI_CC_GRAN(gran)	(((u64)gran) >> (DMA_CCMD_INVL_GRANU_OFFSET-4))
-
-#define QI_DEV_IOTLB_SID(sid)	((u64)((sid) & 0xffff) << 32)
-#define QI_DEV_IOTLB_QDEP(qdep)	(((qdep) & 0x1f) << 16)
-#define QI_DEV_IOTLB_ADDR(addr)	((u64)(addr) & VTD_PAGE_MASK)
-#define QI_DEV_IOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | \
-				   ((u64)((pfsid >> 4) & 0xfff) << 52))
-#define QI_DEV_IOTLB_SIZE	1
-#define QI_DEV_IOTLB_MAX_INVS	32
-
-#define QI_PC_PASID(pasid)	(((u64)pasid) << 32)
-#define QI_PC_DID(did)		(((u64)did) << 16)
-#define QI_PC_GRAN(gran)	(((u64)gran) << 4)
-
-/* PASID cache invalidation granu */
-#define QI_PC_ALL_PASIDS	0
-#define QI_PC_PASID_SEL		1
-#define QI_PC_GLOBAL		3
-
-#define QI_EIOTLB_ADDR(addr)	((u64)(addr) & VTD_PAGE_MASK)
-#define QI_EIOTLB_IH(ih)	(((u64)ih) << 6)
-#define QI_EIOTLB_AM(am)	(((u64)am) & 0x3f)
-#define QI_EIOTLB_PASID(pasid) 	(((u64)pasid) << 32)
-#define QI_EIOTLB_DID(did)	(((u64)did) << 16)
-#define QI_EIOTLB_GRAN(gran) 	(((u64)gran) << 4)
-
-/* QI Dev-IOTLB inv granu */
-#define QI_DEV_IOTLB_GRAN_ALL		1
-#define QI_DEV_IOTLB_GRAN_PASID_SEL	0
-
-#define QI_DEV_EIOTLB_ADDR(a)	((u64)(a) & VTD_PAGE_MASK)
-#define QI_DEV_EIOTLB_SIZE	(((u64)1) << 11)
-#define QI_DEV_EIOTLB_PASID(p)	((u64)((p) & 0xfffff) << 32)
-#define QI_DEV_EIOTLB_SID(sid)	((u64)((sid) & 0xffff) << 16)
-#define QI_DEV_EIOTLB_QDEP(qd)	((u64)((qd) & 0x1f) << 4)
-#define QI_DEV_EIOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | \
-				    ((u64)((pfsid >> 4) & 0xfff) << 52))
-#define QI_DEV_EIOTLB_MAX_INVS	32
-
-/* Page group response descriptor QW0 */
-#define QI_PGRP_PASID_P(p)	(((u64)(p)) << 4)
-#define QI_PGRP_PDP(p)		(((u64)(p)) << 5)
-#define QI_PGRP_RESP_CODE(res)	(((u64)(res)) << 12)
-#define QI_PGRP_DID(rid)	(((u64)(rid)) << 16)
-#define QI_PGRP_PASID(pasid)	(((u64)(pasid)) << 32)
-
-/* Page group response descriptor QW1 */
-#define QI_PGRP_LPIG(x)		(((u64)(x)) << 2)
-#define QI_PGRP_IDX(idx)	(((u64)(idx)) << 3)
-
-
-#define QI_RESP_SUCCESS		0x0
-#define QI_RESP_INVALID		0x1
-#define QI_RESP_FAILURE		0xf
-
-#define QI_GRAN_NONG_PASID		2
-#define QI_GRAN_PSI_PASID		3
-
-#define qi_shift(iommu)		(DMAR_IQ_SHIFT + !!ecap_smts((iommu)->ecap))
-
-struct qi_desc {
-	u64 qw0;
-	u64 qw1;
-	u64 qw2;
-	u64 qw3;
-};
-
-struct q_inval {
-	raw_spinlock_t  q_lock;
-	void		*desc;          /* invalidation queue */
-	int             *desc_status;   /* desc status */
-	int             free_head;      /* first free entry */
-	int             free_tail;      /* last free entry */
-	int             free_cnt;
-};
-
-struct dmar_pci_notify_info;
-
-#ifdef CONFIG_IRQ_REMAP
-/* 1MB - maximum possible interrupt remapping table size */
-#define INTR_REMAP_PAGE_ORDER	8
-#define INTR_REMAP_TABLE_REG_SIZE	0xf
-#define INTR_REMAP_TABLE_REG_SIZE_MASK  0xf
-
-#define INTR_REMAP_TABLE_ENTRIES	65536
-
-struct irq_domain;
-
-struct ir_table {
-	struct irte *base;
-	unsigned long *bitmap;
-};
-
-void intel_irq_remap_add_device(struct dmar_pci_notify_info *info);
-#else
-static inline void
-intel_irq_remap_add_device(struct dmar_pci_notify_info *info) { }
-#endif
-
-struct iommu_flush {
-	void (*flush_context)(struct intel_iommu *iommu, u16 did, u16 sid,
-			      u8 fm, u64 type);
-	void (*flush_iotlb)(struct intel_iommu *iommu, u16 did, u64 addr,
-			    unsigned int size_order, u64 type);
-};
-
-enum {
-	SR_DMAR_FECTL_REG,
-	SR_DMAR_FEDATA_REG,
-	SR_DMAR_FEADDR_REG,
-	SR_DMAR_FEUADDR_REG,
-	MAX_SR_DMAR_REGS
-};
-
-#define VTD_FLAG_TRANS_PRE_ENABLED	(1 << 0)
-#define VTD_FLAG_IRQ_REMAP_PRE_ENABLED	(1 << 1)
-#define VTD_FLAG_SVM_CAPABLE		(1 << 2)
-
-extern int intel_iommu_sm;
-extern spinlock_t device_domain_lock;
-
-#define sm_supported(iommu)	(intel_iommu_sm && ecap_smts((iommu)->ecap))
-#define pasid_supported(iommu)	(sm_supported(iommu) &&			\
-				 ecap_pasid((iommu)->ecap))
-
-struct pasid_entry;
-struct pasid_state_entry;
-struct page_req_dsc;
-
-/*
- * 0: Present
- * 1-11: Reserved
- * 12-63: Context Ptr (12 - (haw-1))
- * 64-127: Reserved
- */
-struct root_entry {
-	u64     lo;
-	u64     hi;
-};
-
-/*
- * low 64 bits:
- * 0: present
- * 1: fault processing disable
- * 2-3: translation type
- * 12-63: address space root
- * high 64 bits:
- * 0-2: address width
- * 3-6: aval
- * 8-23: domain id
- */
-struct context_entry {
-	u64 lo;
-	u64 hi;
-};
-
-/*
- * When VT-d works in the scalable mode, it allows DMA translation to
- * happen through either first level or second level page table. This
- * bit marks that the DMA translation for the domain goes through the
- * first level page table, otherwise, it goes through the second level.
- */
-#define DOMAIN_FLAG_USE_FIRST_LEVEL		BIT(1)
-
-struct dmar_domain {
-	int	nid;			/* node id */
-
-	unsigned int iommu_refcnt[DMAR_UNITS_SUPPORTED];
-					/* Refcount of devices per iommu */
-
-
-	u16		iommu_did[DMAR_UNITS_SUPPORTED];
-					/* Domain ids per IOMMU. Use u16 since
-					 * domain ids are 16 bit wide according
-					 * to VT-d spec, section 9.3 */
-
-	u8 has_iotlb_device: 1;
-	u8 iommu_coherency: 1;		/* indicate coherency of iommu access */
-	u8 force_snooping : 1;		/* Create IOPTEs with snoop control */
-	u8 set_pte_snp:1;
-
-	struct list_head devices;	/* all devices' list */
-	struct iova_domain iovad;	/* iova's that belong to this domain */
-
-	struct dma_pte	*pgd;		/* virtual address */
-	int		gaw;		/* max guest address width */
-
-	/* adjusted guest address width, 0 is level 2 30-bit */
-	int		agaw;
-
-	int		flags;		/* flags to find out type of domain */
-	int		iommu_superpage;/* Level of superpages supported:
-					   0 == 4KiB (no superpages), 1 == 2MiB,
-					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
-	u64		max_addr;	/* maximum mapped address */
-
-	struct iommu_domain domain;	/* generic domain data structure for
-					   iommu core */
-};
-
-struct intel_iommu {
-	void __iomem	*reg; /* Pointer to hardware regs, virtual addr */
-	u64 		reg_phys; /* physical address of hw register set */
-	u64		reg_size; /* size of hw register set */
-	u64		cap;
-	u64		ecap;
-	u64		vccap;
-	u32		gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */
-	raw_spinlock_t	register_lock; /* protect register handling */
-	int		seq_id;	/* sequence id of the iommu */
-	int		agaw; /* agaw of this iommu */
-	int		msagaw; /* max sagaw of this iommu */
-	unsigned int 	irq, pr_irq;
-	u16		segment;     /* PCI segment# */
-	unsigned char 	name[13];    /* Device Name */
-
-#ifdef CONFIG_INTEL_IOMMU
-	unsigned long 	*domain_ids; /* bitmap of domains */
-	spinlock_t	lock; /* protect context, domain ids */
-	struct root_entry *root_entry; /* virtual address */
-
-	struct iommu_flush flush;
-#endif
-#ifdef CONFIG_INTEL_IOMMU_SVM
-	struct page_req_dsc *prq;
-	unsigned char prq_name[16];    /* Name for PRQ interrupt */
-	struct completion prq_complete;
-	struct ioasid_allocator_ops pasid_allocator; /* Custom allocator for PASIDs */
-#endif
-	struct iopf_queue *iopf_queue;
-	unsigned char iopfq_name[16];
-	struct q_inval  *qi;            /* Queued invalidation info */
-	u32 *iommu_state; /* Store iommu states between suspend and resume.*/
-
-#ifdef CONFIG_IRQ_REMAP
-	struct ir_table *ir_table;	/* Interrupt remapping info */
-	struct irq_domain *ir_domain;
-	struct irq_domain *ir_msi_domain;
-#endif
-	struct iommu_device iommu;  /* IOMMU core code handle */
-	int		node;
-	u32		flags;      /* Software defined flags */
-
-	struct dmar_drhd_unit *drhd;
-	void *perf_statistic;
-};
-
-/* PCI domain-device relationship */
-struct device_domain_info {
-	struct list_head link;	/* link to domain siblings */
-	struct list_head global; /* link to global list */
-	u32 segment;		/* PCI segment number */
-	u8 bus;			/* PCI bus number */
-	u8 devfn;		/* PCI devfn number */
-	u16 pfsid;		/* SRIOV physical function source ID */
-	u8 pasid_supported:3;
-	u8 pasid_enabled:1;
-	u8 pri_supported:1;
-	u8 pri_enabled:1;
-	u8 ats_supported:1;
-	u8 ats_enabled:1;
-	u8 ats_qdep;
-	struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
-	struct intel_iommu *iommu; /* IOMMU used by this device */
-	struct dmar_domain *domain; /* pointer to domain */
-	struct pasid_table *pasid_table; /* pasid table */
-};
-
-static inline void __iommu_flush_cache(
-	struct intel_iommu *iommu, void *addr, int size)
-{
-	if (!ecap_coherent(iommu->ecap))
-		clflush_cache_range(addr, size);
-}
-
-/* Convert generic struct iommu_domain to private struct dmar_domain */
-static inline struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
-{
-	return container_of(dom, struct dmar_domain, domain);
-}
-
-/*
- * 0: readable
- * 1: writable
- * 2-6: reserved
- * 7: super page
- * 8-10: available
- * 11: snoop behavior
- * 12-63: Host physical address
- */
-struct dma_pte {
-	u64 val;
-};
-
-static inline void dma_clear_pte(struct dma_pte *pte)
-{
-	pte->val = 0;
-}
-
-static inline u64 dma_pte_addr(struct dma_pte *pte)
-{
-#ifdef CONFIG_64BIT
-	return pte->val & VTD_PAGE_MASK & (~DMA_FL_PTE_XD);
-#else
-	/* Must have a full atomic 64-bit read */
-	return  __cmpxchg64(&pte->val, 0ULL, 0ULL) &
-			VTD_PAGE_MASK & (~DMA_FL_PTE_XD);
-#endif
-}
-
-static inline bool dma_pte_present(struct dma_pte *pte)
-{
-	return (pte->val & 3) != 0;
-}
-
-static inline bool dma_pte_superpage(struct dma_pte *pte)
-{
-	return (pte->val & DMA_PTE_LARGE_PAGE);
-}
-
-static inline bool first_pte_in_page(struct dma_pte *pte)
-{
-	return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE);
-}
-
-static inline int nr_pte_to_next_page(struct dma_pte *pte)
-{
-	return first_pte_in_page(pte) ? BIT_ULL(VTD_STRIDE_SHIFT) :
-		(struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte;
-}
-
-extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev);
-
-extern int dmar_enable_qi(struct intel_iommu *iommu);
-extern void dmar_disable_qi(struct intel_iommu *iommu);
-extern int dmar_reenable_qi(struct intel_iommu *iommu);
-extern void qi_global_iec(struct intel_iommu *iommu);
-
-extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid,
-			     u8 fm, u64 type);
-extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
-			  unsigned int size_order, u64 type);
-extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
-			u16 qdep, u64 addr, unsigned mask);
-
-void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr,
-		     unsigned long npages, bool ih);
-
-void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid,
-			      u32 pasid, u16 qdep, u64 addr,
-			      unsigned int size_order);
-void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu,
-			  u32 pasid);
-
-int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc,
-		   unsigned int count, unsigned long options);
-/*
- * Options used in qi_submit_sync:
- * QI_OPT_WAIT_DRAIN - Wait for PRQ drain completion, spec 6.5.2.8.
- */
-#define QI_OPT_WAIT_DRAIN		BIT(0)
-
-extern int dmar_ir_support(void);
-
-void *alloc_pgtable_page(int node);
-void free_pgtable_page(void *vaddr);
-struct intel_iommu *domain_get_iommu(struct dmar_domain *domain);
-void iommu_flush_write_buffer(struct intel_iommu *iommu);
-int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev);
-struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn);
-
-#ifdef CONFIG_INTEL_IOMMU_SVM
-extern void intel_svm_check(struct intel_iommu *iommu);
-extern int intel_svm_enable_prq(struct intel_iommu *iommu);
-extern int intel_svm_finish_prq(struct intel_iommu *iommu);
-struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm,
-				 void *drvdata);
-void intel_svm_unbind(struct iommu_sva *handle);
-u32 intel_svm_get_pasid(struct iommu_sva *handle);
-int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt,
-			    struct iommu_page_response *msg);
-
-struct intel_svm_dev {
-	struct list_head list;
-	struct rcu_head rcu;
-	struct device *dev;
-	struct intel_iommu *iommu;
-	struct iommu_sva sva;
-	unsigned long prq_seq_number;
-	u32 pasid;
-	int users;
-	u16 did;
-	u16 dev_iotlb:1;
-	u16 sid, qdep;
-};
-
-struct intel_svm {
-	struct mmu_notifier notifier;
-	struct mm_struct *mm;
-
-	unsigned int flags;
-	u32 pasid;
-	struct list_head devs;
-};
-#else
-static inline void intel_svm_check(struct intel_iommu *iommu) {}
-#endif
-
-#ifdef CONFIG_INTEL_IOMMU_DEBUGFS
-void intel_iommu_debugfs_init(void);
-#else
-static inline void intel_iommu_debugfs_init(void) {}
-#endif /* CONFIG_INTEL_IOMMU_DEBUGFS */
-
-extern const struct attribute_group *intel_iommu_groups[];
-bool context_present(struct context_entry *context);
-struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
-					 u8 devfn, int alloc);
-
-extern const struct iommu_ops intel_iommu_ops;
-
-#ifdef CONFIG_INTEL_IOMMU
-extern int iommu_calculate_agaw(struct intel_iommu *iommu);
-extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu);
-extern int dmar_disabled;
-extern int intel_iommu_enabled;
-#else
-static inline int iommu_calculate_agaw(struct intel_iommu *iommu)
-{
-	return 0;
-}
-static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
-{
-	return 0;
-}
-#define dmar_disabled	(1)
-#define intel_iommu_enabled (0)
-#endif
-
-static inline const char *decode_prq_descriptor(char *str, size_t size,
-		u64 dw0, u64 dw1, u64 dw2, u64 dw3)
-{
-	char *buf = str;
-	int bytes;
-
-	bytes = snprintf(buf, size,
-			 "rid=0x%llx addr=0x%llx %c%c%c%c%c pasid=0x%llx index=0x%llx",
-			 FIELD_GET(GENMASK_ULL(31, 16), dw0),
-			 FIELD_GET(GENMASK_ULL(63, 12), dw1),
-			 dw1 & BIT_ULL(0) ? 'r' : '-',
-			 dw1 & BIT_ULL(1) ? 'w' : '-',
-			 dw0 & BIT_ULL(52) ? 'x' : '-',
-			 dw0 & BIT_ULL(53) ? 'p' : '-',
-			 dw1 & BIT_ULL(2) ? 'l' : '-',
-			 FIELD_GET(GENMASK_ULL(51, 32), dw0),
-			 FIELD_GET(GENMASK_ULL(11, 3), dw1));
-
-	/* Private Data */
-	if (dw0 & BIT_ULL(9)) {
-		size -= bytes;
-		buf += bytes;
-		snprintf(buf, size, " private=0x%llx/0x%llx\n", dw2, dw3);
-	}
-
-	return str;
-}
-
-#endif
-- 
cgit 


From 25357900f4e67094dd09b7dcb8a5f47c3f5a159d Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Tue, 12 Jul 2022 08:09:08 +0800
Subject: iommu/vt-d: Make DMAR_UNITS_SUPPORTED default 1024

If the available hardware exceeds DMAR_UNITS_SUPPORTED (previously set
to MAX_IO_APICS, or 128), it causes these messages: "DMAR: Failed to
allocate seq_id", "DMAR: Parse DMAR table failure.", and "x2apic: IRQ
remapping doesn't support X2APIC mode x2apic disabled"; and the system
fails to boot properly.

To support up to 64 sockets with 10 DMAR units each (640), make the
value of DMAR_UNITS_SUPPORTED default 1024.

Signed-off-by: Steve Wahl<steve.wahl@hpe.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Steve Wahl <steve.wahl@hpe.com>
Link: https://lore.kernel.org/linux-iommu/20220615183650.32075-1-steve.wahl@hpe.com/
Link: https://lore.kernel.org/r/20220702015610.2849494-7-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/dmar.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index cbd714a198a0..d81a51978d01 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -18,11 +18,7 @@
 
 struct acpi_dmar_header;
 
-#ifdef	CONFIG_X86
-# define	DMAR_UNITS_SUPPORTED	MAX_IO_APICS
-#else
-# define	DMAR_UNITS_SUPPORTED	64
-#endif
+#define DMAR_UNITS_SUPPORTED	1024
 
 /* DMAR Flags */
 #define DMAR_INTR_REMAP		0x1
-- 
cgit 


From e4e712bbbd6d73263d964d6cb390b373738b62ab Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Mon, 4 Jul 2022 09:42:48 +0000
Subject: crypto: aria - Implement ARIA symmetric cipher algorithm

ARIA(RFC 5794) is a symmetric block cipher algorithm.
This algorithm is being used widely in South Korea as a standard cipher
algorithm.
This code is written based on the ARIA implementation of OpenSSL.
The OpenSSL code is based on the distributed source code[1] by KISA.

ARIA has three key sizes and corresponding rounds.
ARIA128: 12 rounds.
ARIA192: 14 rounds.
ARIA245: 16 rounds.

[1] https://seed.kisa.or.kr/kisa/Board/19/detailView.do (Korean)

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Kconfig        |  15 ++
 crypto/Makefile       |   1 +
 crypto/aria.c         | 288 +++++++++++++++++++++++++++++++
 include/crypto/aria.h | 461 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 765 insertions(+)
 create mode 100644 crypto/aria.c
 create mode 100644 include/crypto/aria.h

(limited to 'include')

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 59489a300cd1..7d98a2b4ac9c 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1508,6 +1508,21 @@ config CRYPTO_SEED
 	  See also:
 	  <http://www.kisa.or.kr/kisa/seed/jsp/seed_eng.jsp>
 
+config CRYPTO_ARIA
+	tristate "ARIA cipher algorithm"
+	select CRYPTO_ALGAPI
+	help
+	  ARIA cipher algorithm (RFC5794).
+
+	  ARIA is a standard encryption algorithm of the Republic of Korea.
+	  The ARIA specifies three key sizes and rounds.
+	  128-bit: 12 rounds.
+	  192-bit: 14 rounds.
+	  256-bit: 16 rounds.
+
+	  See also:
+	  <https://seed.kisa.or.kr/kisa/algorithm/EgovAriaInfo.do>
+
 config CRYPTO_SERPENT
 	tristate "Serpent cipher algorithm"
 	select CRYPTO_ALGAPI
diff --git a/crypto/Makefile b/crypto/Makefile
index a4a84860fe43..167c004dbf4f 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -148,6 +148,7 @@ obj-$(CONFIG_CRYPTO_TEA) += tea.o
 obj-$(CONFIG_CRYPTO_KHAZAD) += khazad.o
 obj-$(CONFIG_CRYPTO_ANUBIS) += anubis.o
 obj-$(CONFIG_CRYPTO_SEED) += seed.o
+obj-$(CONFIG_CRYPTO_ARIA) += aria.o
 obj-$(CONFIG_CRYPTO_CHACHA20) += chacha_generic.o
 obj-$(CONFIG_CRYPTO_POLY1305) += poly1305_generic.o
 obj-$(CONFIG_CRYPTO_DEFLATE) += deflate.o
diff --git a/crypto/aria.c b/crypto/aria.c
new file mode 100644
index 000000000000..ac3dffac34bb
--- /dev/null
+++ b/crypto/aria.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Cryptographic API.
+ *
+ * ARIA Cipher Algorithm.
+ *
+ * Documentation of ARIA can be found in RFC 5794.
+ * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
+ *
+ * Information for ARIA
+ *     http://210.104.33.10/ARIA/index-e.html (English)
+ *     http://seed.kisa.or.kr/ (Korean)
+ *
+ * Public domain version is distributed above.
+ */
+
+#include <crypto/aria.h>
+
+static void aria_set_encrypt_key(struct aria_ctx *ctx, const u8 *in_key,
+				 unsigned int key_len)
+{
+	const __be32 *key = (const __be32 *)in_key;
+	u32 w0[4], w1[4], w2[4], w3[4];
+	u32 reg0, reg1, reg2, reg3;
+	const u32 *ck;
+	int rkidx = 0;
+
+	ck = &key_rc[(key_len - 16) / 8][0];
+
+	w0[0] = be32_to_cpu(key[0]);
+	w0[1] = be32_to_cpu(key[1]);
+	w0[2] = be32_to_cpu(key[2]);
+	w0[3] = be32_to_cpu(key[3]);
+
+	reg0 = w0[0] ^ ck[0];
+	reg1 = w0[1] ^ ck[1];
+	reg2 = w0[2] ^ ck[2];
+	reg3 = w0[3] ^ ck[3];
+
+	aria_subst_diff_odd(&reg0, &reg1, &reg2, &reg3);
+
+	if (key_len > 16) {
+		w1[0] = be32_to_cpu(key[4]);
+		w1[1] = be32_to_cpu(key[5]);
+		if (key_len > 24) {
+			w1[2] = be32_to_cpu(key[6]);
+			w1[3] = be32_to_cpu(key[7]);
+		} else {
+			w1[2] = 0;
+			w1[3] = 0;
+		}
+	} else {
+		w1[0] = 0;
+		w1[1] = 0;
+		w1[2] = 0;
+		w1[3] = 0;
+	}
+
+	w1[0] ^= reg0;
+	w1[1] ^= reg1;
+	w1[2] ^= reg2;
+	w1[3] ^= reg3;
+
+	reg0 = w1[0];
+	reg1 = w1[1];
+	reg2 = w1[2];
+	reg3 = w1[3];
+
+	reg0 ^= ck[4];
+	reg1 ^= ck[5];
+	reg2 ^= ck[6];
+	reg3 ^= ck[7];
+
+	aria_subst_diff_even(&reg0, &reg1, &reg2, &reg3);
+
+	reg0 ^= w0[0];
+	reg1 ^= w0[1];
+	reg2 ^= w0[2];
+	reg3 ^= w0[3];
+
+	w2[0] = reg0;
+	w2[1] = reg1;
+	w2[2] = reg2;
+	w2[3] = reg3;
+
+	reg0 ^= ck[8];
+	reg1 ^= ck[9];
+	reg2 ^= ck[10];
+	reg3 ^= ck[11];
+
+	aria_subst_diff_odd(&reg0, &reg1, &reg2, &reg3);
+
+	w3[0] = reg0 ^ w1[0];
+	w3[1] = reg1 ^ w1[1];
+	w3[2] = reg2 ^ w1[2];
+	w3[3] = reg3 ^ w1[3];
+
+	aria_gsrk(ctx->enc_key[rkidx], w0, w1, 19);
+	rkidx++;
+	aria_gsrk(ctx->enc_key[rkidx], w1, w2, 19);
+	rkidx++;
+	aria_gsrk(ctx->enc_key[rkidx], w2, w3, 19);
+	rkidx++;
+	aria_gsrk(ctx->enc_key[rkidx], w3, w0, 19);
+
+	rkidx++;
+	aria_gsrk(ctx->enc_key[rkidx], w0, w1, 31);
+	rkidx++;
+	aria_gsrk(ctx->enc_key[rkidx], w1, w2, 31);
+	rkidx++;
+	aria_gsrk(ctx->enc_key[rkidx], w2, w3, 31);
+	rkidx++;
+	aria_gsrk(ctx->enc_key[rkidx], w3, w0, 31);
+
+	rkidx++;
+	aria_gsrk(ctx->enc_key[rkidx], w0, w1, 67);
+	rkidx++;
+	aria_gsrk(ctx->enc_key[rkidx], w1, w2, 67);
+	rkidx++;
+	aria_gsrk(ctx->enc_key[rkidx], w2, w3, 67);
+	rkidx++;
+	aria_gsrk(ctx->enc_key[rkidx], w3, w0, 67);
+
+	rkidx++;
+	aria_gsrk(ctx->enc_key[rkidx], w0, w1, 97);
+	if (key_len > 16) {
+		rkidx++;
+		aria_gsrk(ctx->enc_key[rkidx], w1, w2, 97);
+		rkidx++;
+		aria_gsrk(ctx->enc_key[rkidx], w2, w3, 97);
+
+		if (key_len > 24) {
+			rkidx++;
+			aria_gsrk(ctx->enc_key[rkidx], w3, w0, 97);
+
+			rkidx++;
+			aria_gsrk(ctx->enc_key[rkidx], w0, w1, 109);
+		}
+	}
+}
+
+static void aria_set_decrypt_key(struct aria_ctx *ctx)
+{
+	int i;
+
+	for (i = 0; i < 4; i++) {
+		ctx->dec_key[0][i] = ctx->enc_key[ctx->rounds][i];
+		ctx->dec_key[ctx->rounds][i] = ctx->enc_key[0][i];
+	}
+
+	for (i = 1; i < ctx->rounds; i++) {
+		ctx->dec_key[i][0] = aria_m(ctx->enc_key[ctx->rounds - i][0]);
+		ctx->dec_key[i][1] = aria_m(ctx->enc_key[ctx->rounds - i][1]);
+		ctx->dec_key[i][2] = aria_m(ctx->enc_key[ctx->rounds - i][2]);
+		ctx->dec_key[i][3] = aria_m(ctx->enc_key[ctx->rounds - i][3]);
+
+		aria_diff_word(&ctx->dec_key[i][0], &ctx->dec_key[i][1],
+			       &ctx->dec_key[i][2], &ctx->dec_key[i][3]);
+		aria_diff_byte(&ctx->dec_key[i][1],
+			       &ctx->dec_key[i][2], &ctx->dec_key[i][3]);
+		aria_diff_word(&ctx->dec_key[i][0], &ctx->dec_key[i][1],
+			       &ctx->dec_key[i][2], &ctx->dec_key[i][3]);
+	}
+}
+
+static int aria_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+			unsigned int key_len)
+{
+	struct aria_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	if (key_len != 16 && key_len != 24 && key_len != 32)
+		return -EINVAL;
+
+	ctx->key_length = key_len;
+	ctx->rounds = (key_len + 32) / 4;
+
+	aria_set_encrypt_key(ctx, in_key, key_len);
+	aria_set_decrypt_key(ctx);
+
+	return 0;
+}
+
+static void __aria_crypt(struct aria_ctx *ctx, u8 *out, const u8 *in,
+			 u32 key[][ARIA_RD_KEY_WORDS])
+{
+	const __be32 *src = (const __be32 *)in;
+	__be32 *dst = (__be32 *)out;
+	u32 reg0, reg1, reg2, reg3;
+	int rounds, rkidx = 0;
+
+	rounds = ctx->rounds;
+
+	reg0 = be32_to_cpu(src[0]);
+	reg1 = be32_to_cpu(src[1]);
+	reg2 = be32_to_cpu(src[2]);
+	reg3 = be32_to_cpu(src[3]);
+
+	aria_add_round_key(key[rkidx], &reg0, &reg1, &reg2, &reg3);
+	rkidx++;
+
+	aria_subst_diff_odd(&reg0, &reg1, &reg2, &reg3);
+	aria_add_round_key(key[rkidx], &reg0, &reg1, &reg2, &reg3);
+	rkidx++;
+
+	while ((rounds -= 2) > 0) {
+		aria_subst_diff_even(&reg0, &reg1, &reg2, &reg3);
+		aria_add_round_key(key[rkidx], &reg0, &reg1, &reg2, &reg3);
+		rkidx++;
+
+		aria_subst_diff_odd(&reg0, &reg1, &reg2, &reg3);
+		aria_add_round_key(key[rkidx], &reg0, &reg1, &reg2, &reg3);
+		rkidx++;
+	}
+
+	reg0 = key[rkidx][0] ^ make_u32((u8)(x1[get_u8(reg0, 0)]),
+					(u8)(x2[get_u8(reg0, 1)] >> 8),
+					(u8)(s1[get_u8(reg0, 2)]),
+					(u8)(s2[get_u8(reg0, 3)]));
+	reg1 = key[rkidx][1] ^ make_u32((u8)(x1[get_u8(reg1, 0)]),
+					(u8)(x2[get_u8(reg1, 1)] >> 8),
+					(u8)(s1[get_u8(reg1, 2)]),
+					(u8)(s2[get_u8(reg1, 3)]));
+	reg2 = key[rkidx][2] ^ make_u32((u8)(x1[get_u8(reg2, 0)]),
+					(u8)(x2[get_u8(reg2, 1)] >> 8),
+					(u8)(s1[get_u8(reg2, 2)]),
+					(u8)(s2[get_u8(reg2, 3)]));
+	reg3 = key[rkidx][3] ^ make_u32((u8)(x1[get_u8(reg3, 0)]),
+					(u8)(x2[get_u8(reg3, 1)] >> 8),
+					(u8)(s1[get_u8(reg3, 2)]),
+					(u8)(s2[get_u8(reg3, 3)]));
+
+	dst[0] = cpu_to_be32(reg0);
+	dst[1] = cpu_to_be32(reg1);
+	dst[2] = cpu_to_be32(reg2);
+	dst[3] = cpu_to_be32(reg3);
+}
+
+static void aria_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+	struct aria_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	__aria_crypt(ctx, out, in, ctx->enc_key);
+}
+
+static void aria_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+	struct aria_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	__aria_crypt(ctx, out, in, ctx->dec_key);
+}
+
+static struct crypto_alg aria_alg = {
+	.cra_name		=	"aria",
+	.cra_driver_name	=	"aria-generic",
+	.cra_priority		=	100,
+	.cra_flags		=	CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		=	ARIA_BLOCK_SIZE,
+	.cra_ctxsize		=	sizeof(struct aria_ctx),
+	.cra_alignmask		=	3,
+	.cra_module		=	THIS_MODULE,
+	.cra_u			=	{
+		.cipher = {
+			.cia_min_keysize	=	ARIA_MIN_KEY_SIZE,
+			.cia_max_keysize	=	ARIA_MAX_KEY_SIZE,
+			.cia_setkey		=	aria_set_key,
+			.cia_encrypt		=	aria_encrypt,
+			.cia_decrypt		=	aria_decrypt
+		}
+	}
+};
+
+static int __init aria_init(void)
+{
+	return crypto_register_alg(&aria_alg);
+}
+
+static void __exit aria_fini(void)
+{
+	crypto_unregister_alg(&aria_alg);
+}
+
+subsys_initcall(aria_init);
+module_exit(aria_fini);
+
+MODULE_DESCRIPTION("ARIA Cipher Algorithm");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Taehee Yoo <ap420073@gmail.com>");
+MODULE_ALIAS_CRYPTO("aria");
diff --git a/include/crypto/aria.h b/include/crypto/aria.h
new file mode 100644
index 000000000000..4a86661788e8
--- /dev/null
+++ b/include/crypto/aria.h
@@ -0,0 +1,461 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Cryptographic API.
+ *
+ * ARIA Cipher Algorithm.
+ *
+ * Documentation of ARIA can be found in RFC 5794.
+ * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
+ * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
+ *
+ * Information for ARIA
+ *     http://210.104.33.10/ARIA/index-e.html (English)
+ *     http://seed.kisa.or.kr/ (Korean)
+ *
+ * Public domain version is distributed above.
+ */
+
+#ifndef _CRYPTO_ARIA_H
+#define _CRYPTO_ARIA_H
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/crypto.h>
+#include <asm/byteorder.h>
+
+#define ARIA_MIN_KEY_SIZE	16
+#define ARIA_MAX_KEY_SIZE	32
+#define ARIA_BLOCK_SIZE		16
+#define ARIA_MAX_RD_KEYS	17
+#define ARIA_RD_KEY_WORDS	(ARIA_BLOCK_SIZE / sizeof(u32))
+
+struct aria_ctx {
+	int key_length;
+	int rounds;
+	u32 enc_key[ARIA_MAX_RD_KEYS][ARIA_RD_KEY_WORDS];
+	u32 dec_key[ARIA_MAX_RD_KEYS][ARIA_RD_KEY_WORDS];
+};
+
+static const u32 key_rc[5][4] = {
+	{ 0x517cc1b7, 0x27220a94, 0xfe13abe8, 0xfa9a6ee0 },
+	{ 0x6db14acc, 0x9e21c820, 0xff28b1d5, 0xef5de2b0 },
+	{ 0xdb92371d, 0x2126e970, 0x03249775, 0x04e8c90e },
+	{ 0x517cc1b7, 0x27220a94, 0xfe13abe8, 0xfa9a6ee0 },
+	{ 0x6db14acc, 0x9e21c820, 0xff28b1d5, 0xef5de2b0 }
+};
+
+static const u32 s1[256] = {
+	0x00636363, 0x007c7c7c, 0x00777777, 0x007b7b7b,
+	0x00f2f2f2, 0x006b6b6b, 0x006f6f6f, 0x00c5c5c5,
+	0x00303030, 0x00010101, 0x00676767, 0x002b2b2b,
+	0x00fefefe, 0x00d7d7d7, 0x00ababab, 0x00767676,
+	0x00cacaca, 0x00828282, 0x00c9c9c9, 0x007d7d7d,
+	0x00fafafa, 0x00595959, 0x00474747, 0x00f0f0f0,
+	0x00adadad, 0x00d4d4d4, 0x00a2a2a2, 0x00afafaf,
+	0x009c9c9c, 0x00a4a4a4, 0x00727272, 0x00c0c0c0,
+	0x00b7b7b7, 0x00fdfdfd, 0x00939393, 0x00262626,
+	0x00363636, 0x003f3f3f, 0x00f7f7f7, 0x00cccccc,
+	0x00343434, 0x00a5a5a5, 0x00e5e5e5, 0x00f1f1f1,
+	0x00717171, 0x00d8d8d8, 0x00313131, 0x00151515,
+	0x00040404, 0x00c7c7c7, 0x00232323, 0x00c3c3c3,
+	0x00181818, 0x00969696, 0x00050505, 0x009a9a9a,
+	0x00070707, 0x00121212, 0x00808080, 0x00e2e2e2,
+	0x00ebebeb, 0x00272727, 0x00b2b2b2, 0x00757575,
+	0x00090909, 0x00838383, 0x002c2c2c, 0x001a1a1a,
+	0x001b1b1b, 0x006e6e6e, 0x005a5a5a, 0x00a0a0a0,
+	0x00525252, 0x003b3b3b, 0x00d6d6d6, 0x00b3b3b3,
+	0x00292929, 0x00e3e3e3, 0x002f2f2f, 0x00848484,
+	0x00535353, 0x00d1d1d1, 0x00000000, 0x00ededed,
+	0x00202020, 0x00fcfcfc, 0x00b1b1b1, 0x005b5b5b,
+	0x006a6a6a, 0x00cbcbcb, 0x00bebebe, 0x00393939,
+	0x004a4a4a, 0x004c4c4c, 0x00585858, 0x00cfcfcf,
+	0x00d0d0d0, 0x00efefef, 0x00aaaaaa, 0x00fbfbfb,
+	0x00434343, 0x004d4d4d, 0x00333333, 0x00858585,
+	0x00454545, 0x00f9f9f9, 0x00020202, 0x007f7f7f,
+	0x00505050, 0x003c3c3c, 0x009f9f9f, 0x00a8a8a8,
+	0x00515151, 0x00a3a3a3, 0x00404040, 0x008f8f8f,
+	0x00929292, 0x009d9d9d, 0x00383838, 0x00f5f5f5,
+	0x00bcbcbc, 0x00b6b6b6, 0x00dadada, 0x00212121,
+	0x00101010, 0x00ffffff, 0x00f3f3f3, 0x00d2d2d2,
+	0x00cdcdcd, 0x000c0c0c, 0x00131313, 0x00ececec,
+	0x005f5f5f, 0x00979797, 0x00444444, 0x00171717,
+	0x00c4c4c4, 0x00a7a7a7, 0x007e7e7e, 0x003d3d3d,
+	0x00646464, 0x005d5d5d, 0x00191919, 0x00737373,
+	0x00606060, 0x00818181, 0x004f4f4f, 0x00dcdcdc,
+	0x00222222, 0x002a2a2a, 0x00909090, 0x00888888,
+	0x00464646, 0x00eeeeee, 0x00b8b8b8, 0x00141414,
+	0x00dedede, 0x005e5e5e, 0x000b0b0b, 0x00dbdbdb,
+	0x00e0e0e0, 0x00323232, 0x003a3a3a, 0x000a0a0a,
+	0x00494949, 0x00060606, 0x00242424, 0x005c5c5c,
+	0x00c2c2c2, 0x00d3d3d3, 0x00acacac, 0x00626262,
+	0x00919191, 0x00959595, 0x00e4e4e4, 0x00797979,
+	0x00e7e7e7, 0x00c8c8c8, 0x00373737, 0x006d6d6d,
+	0x008d8d8d, 0x00d5d5d5, 0x004e4e4e, 0x00a9a9a9,
+	0x006c6c6c, 0x00565656, 0x00f4f4f4, 0x00eaeaea,
+	0x00656565, 0x007a7a7a, 0x00aeaeae, 0x00080808,
+	0x00bababa, 0x00787878, 0x00252525, 0x002e2e2e,
+	0x001c1c1c, 0x00a6a6a6, 0x00b4b4b4, 0x00c6c6c6,
+	0x00e8e8e8, 0x00dddddd, 0x00747474, 0x001f1f1f,
+	0x004b4b4b, 0x00bdbdbd, 0x008b8b8b, 0x008a8a8a,
+	0x00707070, 0x003e3e3e, 0x00b5b5b5, 0x00666666,
+	0x00484848, 0x00030303, 0x00f6f6f6, 0x000e0e0e,
+	0x00616161, 0x00353535, 0x00575757, 0x00b9b9b9,
+	0x00868686, 0x00c1c1c1, 0x001d1d1d, 0x009e9e9e,
+	0x00e1e1e1, 0x00f8f8f8, 0x00989898, 0x00111111,
+	0x00696969, 0x00d9d9d9, 0x008e8e8e, 0x00949494,
+	0x009b9b9b, 0x001e1e1e, 0x00878787, 0x00e9e9e9,
+	0x00cecece, 0x00555555, 0x00282828, 0x00dfdfdf,
+	0x008c8c8c, 0x00a1a1a1, 0x00898989, 0x000d0d0d,
+	0x00bfbfbf, 0x00e6e6e6, 0x00424242, 0x00686868,
+	0x00414141, 0x00999999, 0x002d2d2d, 0x000f0f0f,
+	0x00b0b0b0, 0x00545454, 0x00bbbbbb, 0x00161616
+};
+
+static const u32 s2[256] = {
+	0xe200e2e2, 0x4e004e4e, 0x54005454, 0xfc00fcfc,
+	0x94009494, 0xc200c2c2, 0x4a004a4a, 0xcc00cccc,
+	0x62006262, 0x0d000d0d, 0x6a006a6a, 0x46004646,
+	0x3c003c3c, 0x4d004d4d, 0x8b008b8b, 0xd100d1d1,
+	0x5e005e5e, 0xfa00fafa, 0x64006464, 0xcb00cbcb,
+	0xb400b4b4, 0x97009797, 0xbe00bebe, 0x2b002b2b,
+	0xbc00bcbc, 0x77007777, 0x2e002e2e, 0x03000303,
+	0xd300d3d3, 0x19001919, 0x59005959, 0xc100c1c1,
+	0x1d001d1d, 0x06000606, 0x41004141, 0x6b006b6b,
+	0x55005555, 0xf000f0f0, 0x99009999, 0x69006969,
+	0xea00eaea, 0x9c009c9c, 0x18001818, 0xae00aeae,
+	0x63006363, 0xdf00dfdf, 0xe700e7e7, 0xbb00bbbb,
+	0x00000000, 0x73007373, 0x66006666, 0xfb00fbfb,
+	0x96009696, 0x4c004c4c, 0x85008585, 0xe400e4e4,
+	0x3a003a3a, 0x09000909, 0x45004545, 0xaa00aaaa,
+	0x0f000f0f, 0xee00eeee, 0x10001010, 0xeb00ebeb,
+	0x2d002d2d, 0x7f007f7f, 0xf400f4f4, 0x29002929,
+	0xac00acac, 0xcf00cfcf, 0xad00adad, 0x91009191,
+	0x8d008d8d, 0x78007878, 0xc800c8c8, 0x95009595,
+	0xf900f9f9, 0x2f002f2f, 0xce00cece, 0xcd00cdcd,
+	0x08000808, 0x7a007a7a, 0x88008888, 0x38003838,
+	0x5c005c5c, 0x83008383, 0x2a002a2a, 0x28002828,
+	0x47004747, 0xdb00dbdb, 0xb800b8b8, 0xc700c7c7,
+	0x93009393, 0xa400a4a4, 0x12001212, 0x53005353,
+	0xff00ffff, 0x87008787, 0x0e000e0e, 0x31003131,
+	0x36003636, 0x21002121, 0x58005858, 0x48004848,
+	0x01000101, 0x8e008e8e, 0x37003737, 0x74007474,
+	0x32003232, 0xca00caca, 0xe900e9e9, 0xb100b1b1,
+	0xb700b7b7, 0xab00abab, 0x0c000c0c, 0xd700d7d7,
+	0xc400c4c4, 0x56005656, 0x42004242, 0x26002626,
+	0x07000707, 0x98009898, 0x60006060, 0xd900d9d9,
+	0xb600b6b6, 0xb900b9b9, 0x11001111, 0x40004040,
+	0xec00ecec, 0x20002020, 0x8c008c8c, 0xbd00bdbd,
+	0xa000a0a0, 0xc900c9c9, 0x84008484, 0x04000404,
+	0x49004949, 0x23002323, 0xf100f1f1, 0x4f004f4f,
+	0x50005050, 0x1f001f1f, 0x13001313, 0xdc00dcdc,
+	0xd800d8d8, 0xc000c0c0, 0x9e009e9e, 0x57005757,
+	0xe300e3e3, 0xc300c3c3, 0x7b007b7b, 0x65006565,
+	0x3b003b3b, 0x02000202, 0x8f008f8f, 0x3e003e3e,
+	0xe800e8e8, 0x25002525, 0x92009292, 0xe500e5e5,
+	0x15001515, 0xdd00dddd, 0xfd00fdfd, 0x17001717,
+	0xa900a9a9, 0xbf00bfbf, 0xd400d4d4, 0x9a009a9a,
+	0x7e007e7e, 0xc500c5c5, 0x39003939, 0x67006767,
+	0xfe00fefe, 0x76007676, 0x9d009d9d, 0x43004343,
+	0xa700a7a7, 0xe100e1e1, 0xd000d0d0, 0xf500f5f5,
+	0x68006868, 0xf200f2f2, 0x1b001b1b, 0x34003434,
+	0x70007070, 0x05000505, 0xa300a3a3, 0x8a008a8a,
+	0xd500d5d5, 0x79007979, 0x86008686, 0xa800a8a8,
+	0x30003030, 0xc600c6c6, 0x51005151, 0x4b004b4b,
+	0x1e001e1e, 0xa600a6a6, 0x27002727, 0xf600f6f6,
+	0x35003535, 0xd200d2d2, 0x6e006e6e, 0x24002424,
+	0x16001616, 0x82008282, 0x5f005f5f, 0xda00dada,
+	0xe600e6e6, 0x75007575, 0xa200a2a2, 0xef00efef,
+	0x2c002c2c, 0xb200b2b2, 0x1c001c1c, 0x9f009f9f,
+	0x5d005d5d, 0x6f006f6f, 0x80008080, 0x0a000a0a,
+	0x72007272, 0x44004444, 0x9b009b9b, 0x6c006c6c,
+	0x90009090, 0x0b000b0b, 0x5b005b5b, 0x33003333,
+	0x7d007d7d, 0x5a005a5a, 0x52005252, 0xf300f3f3,
+	0x61006161, 0xa100a1a1, 0xf700f7f7, 0xb000b0b0,
+	0xd600d6d6, 0x3f003f3f, 0x7c007c7c, 0x6d006d6d,
+	0xed00eded, 0x14001414, 0xe000e0e0, 0xa500a5a5,
+	0x3d003d3d, 0x22002222, 0xb300b3b3, 0xf800f8f8,
+	0x89008989, 0xde00dede, 0x71007171, 0x1a001a1a,
+	0xaf00afaf, 0xba00baba, 0xb500b5b5, 0x81008181
+};
+
+static const u32 x1[256] = {
+	0x52520052, 0x09090009, 0x6a6a006a, 0xd5d500d5,
+	0x30300030, 0x36360036, 0xa5a500a5, 0x38380038,
+	0xbfbf00bf, 0x40400040, 0xa3a300a3, 0x9e9e009e,
+	0x81810081, 0xf3f300f3, 0xd7d700d7, 0xfbfb00fb,
+	0x7c7c007c, 0xe3e300e3, 0x39390039, 0x82820082,
+	0x9b9b009b, 0x2f2f002f, 0xffff00ff, 0x87870087,
+	0x34340034, 0x8e8e008e, 0x43430043, 0x44440044,
+	0xc4c400c4, 0xdede00de, 0xe9e900e9, 0xcbcb00cb,
+	0x54540054, 0x7b7b007b, 0x94940094, 0x32320032,
+	0xa6a600a6, 0xc2c200c2, 0x23230023, 0x3d3d003d,
+	0xeeee00ee, 0x4c4c004c, 0x95950095, 0x0b0b000b,
+	0x42420042, 0xfafa00fa, 0xc3c300c3, 0x4e4e004e,
+	0x08080008, 0x2e2e002e, 0xa1a100a1, 0x66660066,
+	0x28280028, 0xd9d900d9, 0x24240024, 0xb2b200b2,
+	0x76760076, 0x5b5b005b, 0xa2a200a2, 0x49490049,
+	0x6d6d006d, 0x8b8b008b, 0xd1d100d1, 0x25250025,
+	0x72720072, 0xf8f800f8, 0xf6f600f6, 0x64640064,
+	0x86860086, 0x68680068, 0x98980098, 0x16160016,
+	0xd4d400d4, 0xa4a400a4, 0x5c5c005c, 0xcccc00cc,
+	0x5d5d005d, 0x65650065, 0xb6b600b6, 0x92920092,
+	0x6c6c006c, 0x70700070, 0x48480048, 0x50500050,
+	0xfdfd00fd, 0xeded00ed, 0xb9b900b9, 0xdada00da,
+	0x5e5e005e, 0x15150015, 0x46460046, 0x57570057,
+	0xa7a700a7, 0x8d8d008d, 0x9d9d009d, 0x84840084,
+	0x90900090, 0xd8d800d8, 0xabab00ab, 0x00000000,
+	0x8c8c008c, 0xbcbc00bc, 0xd3d300d3, 0x0a0a000a,
+	0xf7f700f7, 0xe4e400e4, 0x58580058, 0x05050005,
+	0xb8b800b8, 0xb3b300b3, 0x45450045, 0x06060006,
+	0xd0d000d0, 0x2c2c002c, 0x1e1e001e, 0x8f8f008f,
+	0xcaca00ca, 0x3f3f003f, 0x0f0f000f, 0x02020002,
+	0xc1c100c1, 0xafaf00af, 0xbdbd00bd, 0x03030003,
+	0x01010001, 0x13130013, 0x8a8a008a, 0x6b6b006b,
+	0x3a3a003a, 0x91910091, 0x11110011, 0x41410041,
+	0x4f4f004f, 0x67670067, 0xdcdc00dc, 0xeaea00ea,
+	0x97970097, 0xf2f200f2, 0xcfcf00cf, 0xcece00ce,
+	0xf0f000f0, 0xb4b400b4, 0xe6e600e6, 0x73730073,
+	0x96960096, 0xacac00ac, 0x74740074, 0x22220022,
+	0xe7e700e7, 0xadad00ad, 0x35350035, 0x85850085,
+	0xe2e200e2, 0xf9f900f9, 0x37370037, 0xe8e800e8,
+	0x1c1c001c, 0x75750075, 0xdfdf00df, 0x6e6e006e,
+	0x47470047, 0xf1f100f1, 0x1a1a001a, 0x71710071,
+	0x1d1d001d, 0x29290029, 0xc5c500c5, 0x89890089,
+	0x6f6f006f, 0xb7b700b7, 0x62620062, 0x0e0e000e,
+	0xaaaa00aa, 0x18180018, 0xbebe00be, 0x1b1b001b,
+	0xfcfc00fc, 0x56560056, 0x3e3e003e, 0x4b4b004b,
+	0xc6c600c6, 0xd2d200d2, 0x79790079, 0x20200020,
+	0x9a9a009a, 0xdbdb00db, 0xc0c000c0, 0xfefe00fe,
+	0x78780078, 0xcdcd00cd, 0x5a5a005a, 0xf4f400f4,
+	0x1f1f001f, 0xdddd00dd, 0xa8a800a8, 0x33330033,
+	0x88880088, 0x07070007, 0xc7c700c7, 0x31310031,
+	0xb1b100b1, 0x12120012, 0x10100010, 0x59590059,
+	0x27270027, 0x80800080, 0xecec00ec, 0x5f5f005f,
+	0x60600060, 0x51510051, 0x7f7f007f, 0xa9a900a9,
+	0x19190019, 0xb5b500b5, 0x4a4a004a, 0x0d0d000d,
+	0x2d2d002d, 0xe5e500e5, 0x7a7a007a, 0x9f9f009f,
+	0x93930093, 0xc9c900c9, 0x9c9c009c, 0xefef00ef,
+	0xa0a000a0, 0xe0e000e0, 0x3b3b003b, 0x4d4d004d,
+	0xaeae00ae, 0x2a2a002a, 0xf5f500f5, 0xb0b000b0,
+	0xc8c800c8, 0xebeb00eb, 0xbbbb00bb, 0x3c3c003c,
+	0x83830083, 0x53530053, 0x99990099, 0x61610061,
+	0x17170017, 0x2b2b002b, 0x04040004, 0x7e7e007e,
+	0xbaba00ba, 0x77770077, 0xd6d600d6, 0x26260026,
+	0xe1e100e1, 0x69690069, 0x14140014, 0x63630063,
+	0x55550055, 0x21210021, 0x0c0c000c, 0x7d7d007d
+};
+
+static const u32 x2[256] = {
+	0x30303000, 0x68686800, 0x99999900, 0x1b1b1b00,
+	0x87878700, 0xb9b9b900, 0x21212100, 0x78787800,
+	0x50505000, 0x39393900, 0xdbdbdb00, 0xe1e1e100,
+	0x72727200, 0x09090900, 0x62626200, 0x3c3c3c00,
+	0x3e3e3e00, 0x7e7e7e00, 0x5e5e5e00, 0x8e8e8e00,
+	0xf1f1f100, 0xa0a0a000, 0xcccccc00, 0xa3a3a300,
+	0x2a2a2a00, 0x1d1d1d00, 0xfbfbfb00, 0xb6b6b600,
+	0xd6d6d600, 0x20202000, 0xc4c4c400, 0x8d8d8d00,
+	0x81818100, 0x65656500, 0xf5f5f500, 0x89898900,
+	0xcbcbcb00, 0x9d9d9d00, 0x77777700, 0xc6c6c600,
+	0x57575700, 0x43434300, 0x56565600, 0x17171700,
+	0xd4d4d400, 0x40404000, 0x1a1a1a00, 0x4d4d4d00,
+	0xc0c0c000, 0x63636300, 0x6c6c6c00, 0xe3e3e300,
+	0xb7b7b700, 0xc8c8c800, 0x64646400, 0x6a6a6a00,
+	0x53535300, 0xaaaaaa00, 0x38383800, 0x98989800,
+	0x0c0c0c00, 0xf4f4f400, 0x9b9b9b00, 0xededed00,
+	0x7f7f7f00, 0x22222200, 0x76767600, 0xafafaf00,
+	0xdddddd00, 0x3a3a3a00, 0x0b0b0b00, 0x58585800,
+	0x67676700, 0x88888800, 0x06060600, 0xc3c3c300,
+	0x35353500, 0x0d0d0d00, 0x01010100, 0x8b8b8b00,
+	0x8c8c8c00, 0xc2c2c200, 0xe6e6e600, 0x5f5f5f00,
+	0x02020200, 0x24242400, 0x75757500, 0x93939300,
+	0x66666600, 0x1e1e1e00, 0xe5e5e500, 0xe2e2e200,
+	0x54545400, 0xd8d8d800, 0x10101000, 0xcecece00,
+	0x7a7a7a00, 0xe8e8e800, 0x08080800, 0x2c2c2c00,
+	0x12121200, 0x97979700, 0x32323200, 0xababab00,
+	0xb4b4b400, 0x27272700, 0x0a0a0a00, 0x23232300,
+	0xdfdfdf00, 0xefefef00, 0xcacaca00, 0xd9d9d900,
+	0xb8b8b800, 0xfafafa00, 0xdcdcdc00, 0x31313100,
+	0x6b6b6b00, 0xd1d1d100, 0xadadad00, 0x19191900,
+	0x49494900, 0xbdbdbd00, 0x51515100, 0x96969600,
+	0xeeeeee00, 0xe4e4e400, 0xa8a8a800, 0x41414100,
+	0xdadada00, 0xffffff00, 0xcdcdcd00, 0x55555500,
+	0x86868600, 0x36363600, 0xbebebe00, 0x61616100,
+	0x52525200, 0xf8f8f800, 0xbbbbbb00, 0x0e0e0e00,
+	0x82828200, 0x48484800, 0x69696900, 0x9a9a9a00,
+	0xe0e0e000, 0x47474700, 0x9e9e9e00, 0x5c5c5c00,
+	0x04040400, 0x4b4b4b00, 0x34343400, 0x15151500,
+	0x79797900, 0x26262600, 0xa7a7a700, 0xdedede00,
+	0x29292900, 0xaeaeae00, 0x92929200, 0xd7d7d700,
+	0x84848400, 0xe9e9e900, 0xd2d2d200, 0xbababa00,
+	0x5d5d5d00, 0xf3f3f300, 0xc5c5c500, 0xb0b0b000,
+	0xbfbfbf00, 0xa4a4a400, 0x3b3b3b00, 0x71717100,
+	0x44444400, 0x46464600, 0x2b2b2b00, 0xfcfcfc00,
+	0xebebeb00, 0x6f6f6f00, 0xd5d5d500, 0xf6f6f600,
+	0x14141400, 0xfefefe00, 0x7c7c7c00, 0x70707000,
+	0x5a5a5a00, 0x7d7d7d00, 0xfdfdfd00, 0x2f2f2f00,
+	0x18181800, 0x83838300, 0x16161600, 0xa5a5a500,
+	0x91919100, 0x1f1f1f00, 0x05050500, 0x95959500,
+	0x74747400, 0xa9a9a900, 0xc1c1c100, 0x5b5b5b00,
+	0x4a4a4a00, 0x85858500, 0x6d6d6d00, 0x13131300,
+	0x07070700, 0x4f4f4f00, 0x4e4e4e00, 0x45454500,
+	0xb2b2b200, 0x0f0f0f00, 0xc9c9c900, 0x1c1c1c00,
+	0xa6a6a600, 0xbcbcbc00, 0xececec00, 0x73737300,
+	0x90909000, 0x7b7b7b00, 0xcfcfcf00, 0x59595900,
+	0x8f8f8f00, 0xa1a1a100, 0xf9f9f900, 0x2d2d2d00,
+	0xf2f2f200, 0xb1b1b100, 0x00000000, 0x94949400,
+	0x37373700, 0x9f9f9f00, 0xd0d0d000, 0x2e2e2e00,
+	0x9c9c9c00, 0x6e6e6e00, 0x28282800, 0x3f3f3f00,
+	0x80808000, 0xf0f0f000, 0x3d3d3d00, 0xd3d3d300,
+	0x25252500, 0x8a8a8a00, 0xb5b5b500, 0xe7e7e700,
+	0x42424200, 0xb3b3b300, 0xc7c7c700, 0xeaeaea00,
+	0xf7f7f700, 0x4c4c4c00, 0x11111100, 0x33333300,
+	0x03030300, 0xa2a2a200, 0xacacac00, 0x60606000
+};
+
+static inline u32 rotl32(u32 v, u32 r)
+{
+	return ((v << r) | (v >> (32 - r)));
+}
+
+static inline u32 rotr32(u32 v, u32 r)
+{
+	return ((v >> r) | (v << (32 - r)));
+}
+
+static inline u32 bswap32(u32 v)
+{
+	return ((v << 24) ^
+		(v >> 24) ^
+		((v & 0x0000ff00) << 8) ^
+		((v & 0x00ff0000) >> 8));
+}
+
+static inline u8 get_u8(u32 x, u32 y)
+{
+	return (x >> ((3 - y) * 8));
+}
+
+static inline u32 make_u32(u8 v0, u8 v1, u8 v2, u8 v3)
+{
+	return ((u32)v0 << 24) | ((u32)v1 << 16) | ((u32)v2 <<  8) | ((u32)v3);
+}
+
+static inline u32 aria_m(u32 t0)
+{
+	return rotr32(t0, 8) ^ rotr32(t0 ^ rotr32(t0, 8), 16);
+}
+
+/* S-Box Layer 1 + M */
+static inline void aria_sbox_layer1_with_pre_diff(u32 *t0, u32 *t1, u32 *t2,
+						  u32 *t3)
+{
+	*t0 = s1[get_u8(*t0, 0)] ^
+	      s2[get_u8(*t0, 1)] ^
+	      x1[get_u8(*t0, 2)] ^
+	      x2[get_u8(*t0, 3)];
+	*t1 = s1[get_u8(*t1, 0)] ^
+	      s2[get_u8(*t1, 1)] ^
+	      x1[get_u8(*t1, 2)] ^
+	      x2[get_u8(*t1, 3)];
+	*t2 = s1[get_u8(*t2, 0)] ^
+	      s2[get_u8(*t2, 1)] ^
+	      x1[get_u8(*t2, 2)] ^
+	      x2[get_u8(*t2, 3)];
+	*t3 = s1[get_u8(*t3, 0)] ^
+	      s2[get_u8(*t3, 1)] ^
+	      x1[get_u8(*t3, 2)] ^
+	      x2[get_u8(*t3, 3)];
+}
+
+/* S-Box Layer 2 + M */
+static inline void aria_sbox_layer2_with_pre_diff(u32 *t0, u32 *t1, u32 *t2,
+						  u32 *t3)
+{
+	*t0 = x1[get_u8(*t0, 0)] ^
+	      x2[get_u8(*t0, 1)] ^
+	      s1[get_u8(*t0, 2)] ^
+	      s2[get_u8(*t0, 3)];
+	*t1 = x1[get_u8(*t1, 0)] ^
+	      x2[get_u8(*t1, 1)] ^
+	      s1[get_u8(*t1, 2)] ^
+	      s2[get_u8(*t1, 3)];
+	*t2 = x1[get_u8(*t2, 0)] ^
+	      x2[get_u8(*t2, 1)] ^
+	      s1[get_u8(*t2, 2)] ^
+	      s2[get_u8(*t2, 3)];
+	*t3 = x1[get_u8(*t3, 0)] ^
+	      x2[get_u8(*t3, 1)] ^
+	      s1[get_u8(*t3, 2)] ^
+	      s2[get_u8(*t3, 3)];
+}
+
+/* Word-level diffusion */
+static inline void aria_diff_word(u32 *t0, u32 *t1, u32 *t2, u32 *t3)
+{
+	*t1 ^= *t2;
+	*t2 ^= *t3;
+	*t0 ^= *t1;
+
+	*t3 ^= *t1;
+	*t2 ^= *t0;
+	*t1 ^= *t2;
+}
+
+/* Byte-level diffusion */
+static inline void aria_diff_byte(u32 *t1, u32 *t2, u32 *t3)
+{
+	*t1 = ((*t1 << 8) & 0xff00ff00) ^ ((*t1 >> 8) & 0x00ff00ff);
+	*t2 = rotr32(*t2, 16);
+	*t3 = bswap32(*t3);
+}
+
+/* Key XOR Layer */
+static inline void aria_add_round_key(u32 *rk, u32 *t0, u32 *t1, u32 *t2,
+				      u32 *t3)
+{
+	*t0 ^= rk[0];
+	*t1 ^= rk[1];
+	*t2 ^= rk[2];
+	*t3 ^= rk[3];
+}
+/* Odd round Substitution & Diffusion */
+static inline void aria_subst_diff_odd(u32 *t0, u32 *t1, u32 *t2, u32 *t3)
+{
+	aria_sbox_layer1_with_pre_diff(t0, t1, t2, t3);
+	aria_diff_word(t0, t1, t2, t3);
+	aria_diff_byte(t1, t2, t3);
+	aria_diff_word(t0, t1, t2, t3);
+}
+
+/* Even round Substitution & Diffusion */
+static inline void aria_subst_diff_even(u32 *t0, u32 *t1, u32 *t2, u32 *t3)
+{
+	aria_sbox_layer2_with_pre_diff(t0, t1, t2, t3);
+	aria_diff_word(t0, t1, t2, t3);
+	aria_diff_byte(t3, t0, t1);
+	aria_diff_word(t0, t1, t2, t3);
+}
+
+/* Q, R Macro expanded ARIA GSRK */
+static inline void aria_gsrk(u32 *rk, u32 *x, u32 *y, u32 n)
+{
+	int q = 4 - (n / 32);
+	int r = n % 32;
+
+	rk[0] = (x[0]) ^
+		((y[q % 4]) >> r) ^
+		((y[(q + 3) % 4]) << (32 - r));
+	rk[1] = (x[1]) ^
+		((y[(q + 1) % 4]) >> r) ^
+		((y[q % 4]) << (32 - r));
+	rk[2] = (x[2]) ^
+		((y[(q + 2) % 4]) >> r) ^
+		((y[(q + 1) % 4]) << (32 - r));
+	rk[3] =	(x[3]) ^
+		((y[(q + 3) % 4]) >> r) ^
+		((y[(q + 2) % 4]) << (32 - r));
+}
+
+#endif
-- 
cgit 


From fb2accadaa9427a374fa9f482ea24ca731f675ba Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Wed, 13 Jul 2022 17:56:48 -0500
Subject: iommu/amd: Introduce function to check and enable SNP

To support SNP, IOMMU needs to be enabled, and prohibits IOMMU
configurations where DTE[Mode]=0, which means it cannot be supported with
IOMMU passthrough domain (a.k.a IOMMU_DOMAIN_IDENTITY),
and when AMD IOMMU driver is configured to not use the IOMMU host (v1) page
table. Otherwise, RMP table initialization could cause the system to crash.

The request to enable SNP support in IOMMU must be done before PCI
initialization state of the IOMMU driver because enabling SNP affects
how IOMMU driver sets up IOMMU data structures (i.e. DTE).

Unlike other IOMMU features, SNP feature does not have an enable bit in
the IOMMU control register. Instead, the IOMMU driver introduces
an amd_iommu_snp_en variable to track enabling state of SNP.

Introduce amd_iommu_snp_enable() for other drivers to request enabling
the SNP support in IOMMU, which checks all prerequisites and determines
if the feature can be safely enabled.

Please see the IOMMU spec section 2.12 for further details.

Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Co-developed-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Link: https://lore.kernel.org/r/20220713225651.20758-7-suravee.suthikulpanit@amd.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/amd/amd_iommu.h |  2 ++
 drivers/iommu/amd/init.c      | 42 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/amd-iommu.h     |  4 ++++
 3 files changed, 48 insertions(+)

(limited to 'include')

diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index 1b945d47741c..84e5bb1bf01b 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -140,4 +140,6 @@ extern struct dev_table_entry *get_dev_table(struct amd_iommu *iommu);
 
 extern u64 amd_iommu_efr;
 extern u64 amd_iommu_efr2;
+
+extern bool amd_iommu_snp_en;
 #endif
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index 7c6e61451ff7..cddcb6f7e3c9 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -168,6 +168,10 @@ static int amd_iommu_target_ivhd_type;
 u64 amd_iommu_efr;
 u64 amd_iommu_efr2;
 
+/* SNP is enabled on the system? */
+bool amd_iommu_snp_en;
+EXPORT_SYMBOL(amd_iommu_snp_en);
+
 LIST_HEAD(amd_iommu_pci_seg_list);	/* list of all PCI segments */
 LIST_HEAD(amd_iommu_list);		/* list of all AMD IOMMUs in the
 					   system */
@@ -3556,3 +3560,41 @@ int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, u64
 
 	return iommu_pc_get_set_reg(iommu, bank, cntr, fxn, value, true);
 }
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+int amd_iommu_snp_enable(void)
+{
+	/*
+	 * The SNP support requires that IOMMU must be enabled, and is
+	 * not configured in the passthrough mode.
+	 */
+	if (no_iommu || iommu_default_passthrough()) {
+		pr_err("SNP: IOMMU is disabled or configured in passthrough mode, SNP cannot be supported");
+		return -EINVAL;
+	}
+
+	/*
+	 * Prevent enabling SNP after IOMMU_ENABLED state because this process
+	 * affect how IOMMU driver sets up data structures and configures
+	 * IOMMU hardware.
+	 */
+	if (init_state > IOMMU_ENABLED) {
+		pr_err("SNP: Too late to enable SNP for IOMMU.\n");
+		return -EINVAL;
+	}
+
+	amd_iommu_snp_en = check_feature_on_all_iommus(FEATURE_SNP);
+	if (!amd_iommu_snp_en)
+		return -EINVAL;
+
+	pr_info("SNP enabled\n");
+
+	/* Enforce IOMMU v1 pagetable when SNP is enabled. */
+	if (amd_iommu_pgtable != AMD_IOMMU_V1) {
+		pr_warn("Force to using AMD IOMMU v1 page table due to SNP\n");
+		amd_iommu_pgtable = AMD_IOMMU_V1;
+	}
+
+	return 0;
+}
+#endif
diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
index 58e6c3806c09..953e6f12fa1c 100644
--- a/include/linux/amd-iommu.h
+++ b/include/linux/amd-iommu.h
@@ -206,4 +206,8 @@ int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn,
 		u64 *value);
 struct amd_iommu *get_amd_iommu(unsigned int idx);
 
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+int amd_iommu_snp_enable(void);
+#endif
+
 #endif /* _ASM_X86_AMD_IOMMU_H */
-- 
cgit 


From 577e5b8c3924539c7a09e3e00477534f39e61829 Mon Sep 17 00:00:00 2001
From: Shaul Triebitz <shaul.triebitz@intel.com>
Date: Wed, 8 Jun 2022 12:01:12 +0300
Subject: wifi: cfg80211: add API to add/modify/remove a link station

Add an API for adding/modifying/removing a link of a station.

Signed-off-by: Shaul Triebitz <shaul.triebitz@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  64 +++++++++++++++++
 include/uapi/linux/nl80211.h |   8 +++
 net/wireless/nl80211.c       | 168 ++++++++++++++++++++++++++++++++++++++++---
 net/wireless/rdev-ops.h      |  48 +++++++++++++
 net/wireless/trace.h         |  97 +++++++++++++++++++++++++
 5 files changed, 377 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 3bcdd20ace66..422764881269 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1456,6 +1456,61 @@ struct sta_txpwr {
 	enum nl80211_tx_power_setting type;
 };
 
+/**
+ * struct link_station_parameters - link station parameters
+ *
+ * Used to change and create a new link station.
+ *
+ * @mld_mac: MAC address of the station
+ * @link_id: the link id (-1 for non-MLD station)
+ * @link_mac: MAC address of the link
+ * @supported_rates: supported rates in IEEE 802.11 format
+ *	(or NULL for no change)
+ * @supported_rates_len: number of supported rates
+ * @ht_capa: HT capabilities of station
+ * @vht_capa: VHT capabilities of station
+ * @opmode_notif: operating mode field from Operating Mode Notification
+ * @opmode_notif_used: information if operating mode field is used
+ * @he_capa: HE capabilities of station
+ * @he_capa_len: the length of the HE capabilities
+ * @txpwr: transmit power for an associated station
+ * @txpwr_set: txpwr field is set
+ * @he_6ghz_capa: HE 6 GHz Band capabilities of station
+ * @eht_capa: EHT capabilities of station
+ * @eht_capa_len: the length of the EHT capabilities
+ */
+struct link_station_parameters {
+	const u8 *mld_mac;
+	int link_id;
+	const u8 *link_mac;
+	const u8 *supported_rates;
+	u8 supported_rates_len;
+	const struct ieee80211_ht_cap *ht_capa;
+	const struct ieee80211_vht_cap *vht_capa;
+	u8 opmode_notif;
+	bool opmode_notif_used;
+	const struct ieee80211_he_cap_elem *he_capa;
+	u8 he_capa_len;
+	struct sta_txpwr txpwr;
+	bool txpwr_set;
+	const struct ieee80211_he_6ghz_capa *he_6ghz_capa;
+	const struct ieee80211_eht_cap_elem *eht_capa;
+	u8 eht_capa_len;
+};
+
+/**
+ * struct link_station_del_parameters - link station deletion parameters
+ *
+ * Used to delete a link station entry (or all stations).
+ *
+ * @mld_mac: MAC address of the station
+ * @link_id: the link id
+ */
+struct link_station_del_parameters {
+	const u8 *mld_mac;
+	u32 link_id;
+};
+
 /**
  * struct station_parameters - station parameters
  *
@@ -4215,6 +4270,9 @@ struct mgmt_frame_regs {
  *	radar channel.
  *	The caller is expected to set chandef pointer to NULL in order to
  *	disable background CAC/radar detection.
+ * @add_link_station: Add a link to a station.
+ * @mod_link_station: Modify a link of a station.
+ * @del_link_station: Remove a link of a station.
  */
 struct cfg80211_ops {
 	int	(*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow);
@@ -4560,6 +4618,12 @@ struct cfg80211_ops {
 				struct cfg80211_fils_aad *fils_aad);
 	int	(*set_radar_background)(struct wiphy *wiphy,
 					struct cfg80211_chan_def *chandef);
+	int	(*add_link_station)(struct wiphy *wiphy, struct net_device *dev,
+				    struct link_station_parameters *params);
+	int	(*mod_link_station)(struct wiphy *wiphy, struct net_device *dev,
+				    struct link_station_parameters *params);
+	int	(*del_link_station)(struct wiphy *wiphy, struct net_device *dev,
+				    struct link_station_del_parameters *params);
 };
 
 /*
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 7bb1ae59f3a5..37bfc934325a 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1254,6 +1254,10 @@
  *	without %NL80211_ATTR_MLO_LINK_ID as an easy way to remove all links
  *	in preparation for e.g. roaming to a regular (non-MLO) AP.
  *
+ * @NL80211_CMD_ADD_LINK_STA: Add a link to an MLD station
+ * @NL80211_CMD_MODIFY_LINK_STA: Modify a link of an MLD station
+ * @NL80211_CMD_REMOVE_LINK_STA: Remove a link of an MLD station
+ *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
  */
@@ -1501,6 +1505,10 @@ enum nl80211_commands {
 	NL80211_CMD_ADD_LINK,
 	NL80211_CMD_REMOVE_LINK,
 
+	NL80211_CMD_ADD_LINK_STA,
+	NL80211_CMD_MODIFY_LINK_STA,
+	NL80211_CMD_REMOVE_LINK_STA,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 22c4cf6fbb57..3cf8e01e3f7e 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -6801,7 +6801,8 @@ static int nl80211_set_station_tdls(struct genl_info *info,
 }
 
 static int nl80211_parse_sta_txpower_setting(struct genl_info *info,
-					     struct station_parameters *params)
+					     struct sta_txpwr *txpwr,
+					     bool *txpwr_set)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	int idx;
@@ -6813,18 +6814,20 @@ static int nl80211_parse_sta_txpower_setting(struct genl_info *info,
 			return -EOPNOTSUPP;
 
 		idx = NL80211_ATTR_STA_TX_POWER_SETTING;
-		params->txpwr.type = nla_get_u8(info->attrs[idx]);
+		txpwr->type = nla_get_u8(info->attrs[idx]);
 
-		if (params->txpwr.type == NL80211_TX_POWER_LIMITED) {
+		if (txpwr->type == NL80211_TX_POWER_LIMITED) {
 			idx = NL80211_ATTR_STA_TX_POWER;
 
 			if (info->attrs[idx])
-				params->txpwr.power =
-					nla_get_s16(info->attrs[idx]);
+				txpwr->power = nla_get_s16(info->attrs[idx]);
 			else
 				return -EINVAL;
 		}
-		params->sta_modify_mask |= STATION_PARAM_APPLY_STA_TXPOWER;
+
+		*txpwr_set = true;
+	} else {
+		*txpwr_set = false;
 	}
 
 	return 0;
@@ -6837,6 +6840,7 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
 	struct station_parameters params;
 	u8 *mac_addr;
 	int err;
+	bool txpwr_set;
 
 	memset(&params, 0, sizeof(params));
 
@@ -6930,9 +6934,11 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
 				     NL80211_EXT_FEATURE_AIRTIME_FAIRNESS))
 		return -EOPNOTSUPP;
 
-	err = nl80211_parse_sta_txpower_setting(info, &params);
+	err = nl80211_parse_sta_txpower_setting(info, &params.txpwr, &txpwr_set);
 	if (err)
 		return err;
+	if (txpwr_set)
+		params.sta_modify_mask |= STATION_PARAM_APPLY_STA_TXPOWER;
 
 	/* Include parameters for TDLS peer (will check later) */
 	err = nl80211_set_station_tdls(info, &params);
@@ -6975,6 +6981,7 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 	u8 *mac_addr = NULL;
 	u32 auth_assoc = BIT(NL80211_STA_FLAG_AUTHENTICATED) |
 			 BIT(NL80211_STA_FLAG_ASSOCIATED);
+	bool txpwr_set;
 
 	memset(&params, 0, sizeof(params));
 
@@ -7085,9 +7092,11 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 				     NL80211_EXT_FEATURE_AIRTIME_FAIRNESS))
 		return -EOPNOTSUPP;
 
-	err = nl80211_parse_sta_txpower_setting(info, &params);
+	err = nl80211_parse_sta_txpower_setting(info, &params.txpwr, &txpwr_set);
 	if (err)
 		return err;
+	if (txpwr_set)
+		params.sta_modify_mask |= STATION_PARAM_APPLY_STA_TXPOWER;
 
 	err = nl80211_parse_sta_channel_info(info, &params);
 	if (err)
@@ -15682,6 +15691,128 @@ static int nl80211_remove_link(struct sk_buff *skb, struct genl_info *info)
 	return 0;
 }
 
+static int
+nl80211_add_mod_link_station(struct sk_buff *skb, struct genl_info *info,
+			     bool add)
+{
+	struct link_station_parameters params = {};
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	int err;
+
+	if ((add && !rdev->ops->add_link_station) ||
+	    (!add && !rdev->ops->mod_link_station))
+		return -EOPNOTSUPP;
+
+	if (add && !info->attrs[NL80211_ATTR_MAC])
+		return -EINVAL;
+
+	if (add && !info->attrs[NL80211_ATTR_MLD_ADDR])
+		return -EINVAL;
+
+	if (add && !info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES])
+		return -EINVAL;
+
+	if (info->attrs[NL80211_ATTR_MLD_ADDR])
+		params.mld_mac = nla_data(info->attrs[NL80211_ATTR_MLD_ADDR]);
+
+	if (info->attrs[NL80211_ATTR_MAC]) {
+		params.link_mac = nla_data(info->attrs[NL80211_ATTR_MAC]);
+		if (!is_valid_ether_addr(params.link_mac))
+			return -EINVAL;
+	}
+
+	if (!info->attrs[NL80211_ATTR_MLO_LINK_ID])
+		return -EINVAL;
+
+	params.link_id = nla_get_u8(info->attrs[NL80211_ATTR_MLO_LINK_ID]);
+
+	if (info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]) {
+		params.supported_rates =
+			nla_data(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]);
+		params.supported_rates_len =
+			nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]);
+	}
+
+	if (info->attrs[NL80211_ATTR_HT_CAPABILITY])
+		params.ht_capa =
+			nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]);
+
+	if (info->attrs[NL80211_ATTR_VHT_CAPABILITY])
+		params.vht_capa =
+			nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]);
+
+	if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) {
+		params.he_capa =
+			nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
+		params.he_capa_len =
+			nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
+
+		if (info->attrs[NL80211_ATTR_EHT_CAPABILITY]) {
+			params.eht_capa =
+				nla_data(info->attrs[NL80211_ATTR_EHT_CAPABILITY]);
+			params.eht_capa_len =
+				nla_len(info->attrs[NL80211_ATTR_EHT_CAPABILITY]);
+
+			if (!ieee80211_eht_capa_size_ok((const u8 *)params.he_capa,
+							(const u8 *)params.eht_capa,
+							params.eht_capa_len))
+				return -EINVAL;
+		}
+	}
+
+	if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY])
+		params.he_6ghz_capa =
+			nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]);
+
+	if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) {
+		params.opmode_notif_used = true;
+		params.opmode_notif =
+			nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]);
+	}
+
+	err = nl80211_parse_sta_txpower_setting(info, &params.txpwr,
+						&params.txpwr_set);
+	if (err)
+		return err;
+
+	if (add)
+		return rdev_add_link_station(rdev, dev, &params);
+	return rdev_mod_link_station(rdev, dev, &params);
+}
+
+static int
+nl80211_add_link_station(struct sk_buff *skb, struct genl_info *info)
+{
+	return nl80211_add_mod_link_station(skb, info, true);
+}
+
+static int
+nl80211_modify_link_station(struct sk_buff *skb, struct genl_info *info)
+{
+	return nl80211_add_mod_link_station(skb, info, false);
+}
+
+static int
+nl80211_remove_link_station(struct sk_buff *skb, struct genl_info *info)
+{
+	struct link_station_del_parameters params = {};
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+
+	if (!rdev->ops->del_link_station)
+		return -EOPNOTSUPP;
+
+	if (!info->attrs[NL80211_ATTR_MLD_ADDR] ||
+	    !info->attrs[NL80211_ATTR_MLO_LINK_ID])
+		return -EINVAL;
+
+	params.mld_mac = nla_data(info->attrs[NL80211_ATTR_MLD_ADDR]);
+	params.link_id = nla_get_u8(info->attrs[NL80211_ATTR_MLO_LINK_ID]);
+
+	return rdev_del_link_station(rdev, dev, &params);
+}
+
 #define NL80211_FLAG_NEED_WIPHY		0x01
 #define NL80211_FLAG_NEED_NETDEV	0x02
 #define NL80211_FLAG_NEED_RTNL		0x04
@@ -16832,6 +16963,27 @@ static const struct genl_small_ops nl80211_small_ops[] = {
 		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
 					 NL80211_FLAG_MLO_VALID_LINK_ID),
 	},
+	{
+		.cmd = NL80211_CMD_ADD_LINK_STA,
+		.doit = nl80211_add_link_station,
+		.flags = GENL_UNS_ADMIN_PERM,
+		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
+					 NL80211_FLAG_MLO_VALID_LINK_ID),
+	},
+	{
+		.cmd = NL80211_CMD_MODIFY_LINK_STA,
+		.doit = nl80211_modify_link_station,
+		.flags = GENL_UNS_ADMIN_PERM,
+		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
+					 NL80211_FLAG_MLO_VALID_LINK_ID),
+	},
+	{
+		.cmd = NL80211_CMD_REMOVE_LINK_STA,
+		.doit = nl80211_remove_link_station,
+		.flags = GENL_UNS_ADMIN_PERM,
+		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
+					 NL80211_FLAG_MLO_VALID_LINK_ID),
+	},
 };
 
 static struct genl_family nl80211_fam __ro_after_init = {
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index a329ba036989..6221a996c19f 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -1448,4 +1448,52 @@ rdev_del_intf_link(struct cfg80211_registered_device *rdev,
 	trace_rdev_return_void(&rdev->wiphy);
 }
 
+static inline int
+rdev_add_link_station(struct cfg80211_registered_device *rdev,
+		      struct net_device *dev,
+		      struct link_station_parameters *params)
+{
+	int ret;
+
+	if (!rdev->ops->add_link_station)
+		return -EOPNOTSUPP;
+
+	trace_rdev_add_link_station(&rdev->wiphy, dev, params);
+	ret = rdev->ops->add_link_station(&rdev->wiphy, dev, params);
+	trace_rdev_return_int(&rdev->wiphy, ret);
+	return ret;
+}
+
+static inline int
+rdev_mod_link_station(struct cfg80211_registered_device *rdev,
+		      struct net_device *dev,
+		      struct link_station_parameters *params)
+{
+	int ret;
+
+	if (!rdev->ops->mod_link_station)
+		return -EOPNOTSUPP;
+
+	trace_rdev_mod_link_station(&rdev->wiphy, dev, params);
+	ret = rdev->ops->mod_link_station(&rdev->wiphy, dev, params);
+	trace_rdev_return_int(&rdev->wiphy, ret);
+	return ret;
+}
+
+static inline int
+rdev_del_link_station(struct cfg80211_registered_device *rdev,
+		      struct net_device *dev,
+		      struct link_station_del_parameters *params)
+{
+	int ret;
+
+	if (!rdev->ops->del_link_station)
+		return -EOPNOTSUPP;
+
+	trace_rdev_del_link_station(&rdev->wiphy, dev, params);
+	ret = rdev->ops->del_link_station(&rdev->wiphy, dev, params);
+	trace_rdev_return_int(&rdev->wiphy, ret);
+	return ret;
+}
+
 #endif /* __CFG80211_RDEV_OPS */
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 65f8b814ecd0..16d0fe53fcf2 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -3775,6 +3775,103 @@ TRACE_EVENT(cfg80211_assoc_comeback,
 		  WDEV_PR_ARG, MAC_PR_ARG(bssid), __entry->timeout)
 );
 
+DECLARE_EVENT_CLASS(link_station_add_mod,
+	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+		 struct link_station_parameters *params),
+	TP_ARGS(wiphy, netdev, params),
+	TP_STRUCT__entry(
+		WIPHY_ENTRY
+		NETDEV_ENTRY
+		__array(u8, mld_mac, 6)
+		__array(u8, link_mac, 6)
+		__field(u32, link_id)
+		__dynamic_array(u8, supported_rates,
+				params->supported_rates_len)
+		__array(u8, ht_capa, (int)sizeof(struct ieee80211_ht_cap))
+		__array(u8, vht_capa, (int)sizeof(struct ieee80211_vht_cap))
+		__field(u8, opmode_notif)
+		__field(bool, opmode_notif_used)
+		__dynamic_array(u8, he_capa, params->he_capa_len)
+		__array(u8, he_6ghz_capa, (int)sizeof(struct ieee80211_he_6ghz_capa))
+		__dynamic_array(u8, eht_capa, params->eht_capa_len)
+	),
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		NETDEV_ASSIGN;
+		memset(__entry->mld_mac, 0, 6);
+		memset(__entry->link_mac, 0, 6);
+		if (params->mld_mac)
+			memcpy(__entry->mld_mac, params->mld_mac, 6);
+		if (params->link_mac)
+			memcpy(__entry->link_mac, params->link_mac, 6);
+		__entry->link_id = params->link_id;
+		if (params->supported_rates && params->supported_rates_len)
+			memcpy(__get_dynamic_array(supported_rates),
+			       params->supported_rates,
+			       params->supported_rates_len);
+		memset(__entry->ht_capa, 0, sizeof(struct ieee80211_ht_cap));
+		if (params->ht_capa)
+			memcpy(__entry->ht_capa, params->ht_capa,
+			       sizeof(struct ieee80211_ht_cap));
+		memset(__entry->vht_capa, 0, sizeof(struct ieee80211_vht_cap));
+		if (params->vht_capa)
+			memcpy(__entry->vht_capa, params->vht_capa,
+			       sizeof(struct ieee80211_vht_cap));
+		__entry->opmode_notif = params->opmode_notif;
+		__entry->opmode_notif_used = params->opmode_notif_used;
+		if (params->he_capa && params->he_capa_len)
+			memcpy(__get_dynamic_array(he_capa), params->he_capa,
+			       params->he_capa_len);
+		memset(__entry->he_6ghz_capa, 0, sizeof(struct ieee80211_he_6ghz_capa));
+		if (params->he_6ghz_capa)
+			memcpy(__entry->he_6ghz_capa, params->he_6ghz_capa,
+			       sizeof(struct ieee80211_he_6ghz_capa));
+		if (params->eht_capa && params->eht_capa_len)
+			memcpy(__get_dynamic_array(eht_capa), params->eht_capa,
+			       params->eht_capa_len);
+	),
+	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT
+		  ", link mac: " MAC_PR_FMT ", link id: %u",
+		  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(mld_mac),
+		  MAC_PR_ARG(link_mac), __entry->link_id)
+);
+
+DEFINE_EVENT(link_station_add_mod, rdev_add_link_station,
+	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+		 struct link_station_parameters *params),
+	TP_ARGS(wiphy, netdev, params)
+);
+
+DEFINE_EVENT(link_station_add_mod, rdev_mod_link_station,
+	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+		 struct link_station_parameters *params),
+	TP_ARGS(wiphy, netdev, params)
+);
+
+TRACE_EVENT(rdev_del_link_station,
+	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+		 struct link_station_del_parameters *params),
+	TP_ARGS(wiphy, netdev, params),
+	TP_STRUCT__entry(
+		WIPHY_ENTRY
+		NETDEV_ENTRY
+		__array(u8, mld_mac, 6)
+		__field(u32, link_id)
+	),
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		NETDEV_ASSIGN;
+		memset(__entry->mld_mac, 0, 6);
+		if (params->mld_mac)
+			memcpy(__entry->mld_mac, params->mld_mac, 6);
+		__entry->link_id = params->link_id;
+	),
+	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT
+		  ", link id: %u",
+		  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(mld_mac),
+		  __entry->link_id)
+);
+
 #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */
 
 #undef TRACE_INCLUDE_PATH
-- 
cgit 


From b95eb7f0eee479478eb1a7c0a42a80167708c1df Mon Sep 17 00:00:00 2001
From: Shaul Triebitz <shaul.triebitz@intel.com>
Date: Tue, 14 Jun 2022 13:49:16 +0300
Subject: wifi: cfg80211/mac80211: separate link params from station params

Put the link_station_parameters structure in the station_parameters
structure (and remove the station_parameters fields already existing
in link_station_parameters).
Now, for an MLD station, the default link is added together with
the station.

Signed-off-by: Shaul Triebitz <shaul.triebitz@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/marvell/mwifiex/sta_cmd.c |  20 ++--
 drivers/net/wireless/microchip/wilc1000/hif.c  |  20 ++--
 include/net/cfg80211.h                         |  28 +----
 net/mac80211/cfg.c                             | 143 ++++++++++++++++---------
 net/wireless/nl80211.c                         |  97 +++++++++--------
 net/wireless/trace.h                           |  24 +++--
 6 files changed, 180 insertions(+), 152 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/marvell/mwifiex/sta_cmd.c b/drivers/net/wireless/marvell/mwifiex/sta_cmd.c
index 1e2798dce18f..851ea58fb38e 100644
--- a/drivers/net/wireless/marvell/mwifiex/sta_cmd.c
+++ b/drivers/net/wireless/marvell/mwifiex/sta_cmd.c
@@ -1790,29 +1790,31 @@ mwifiex_cmd_tdls_oper(struct mwifiex_private *priv,
 		wmm_qos_info->qos_info = 0;
 		config_len += sizeof(struct mwifiex_ie_types_qos_info);
 
-		if (params->ht_capa) {
+		if (params->link_sta_params.ht_capa) {
 			ht_capab = (struct mwifiex_ie_types_htcap *)(pos +
 								    config_len);
 			ht_capab->header.type =
 					    cpu_to_le16(WLAN_EID_HT_CAPABILITY);
 			ht_capab->header.len =
 				   cpu_to_le16(sizeof(struct ieee80211_ht_cap));
-			memcpy(&ht_capab->ht_cap, params->ht_capa,
+			memcpy(&ht_capab->ht_cap, params->link_sta_params.ht_capa,
 			       sizeof(struct ieee80211_ht_cap));
 			config_len += sizeof(struct mwifiex_ie_types_htcap);
 		}
 
-		if (params->supported_rates && params->supported_rates_len) {
+		if (params->link_sta_params.supported_rates &&
+		    params->link_sta_params.supported_rates_len) {
 			tlv_rates = (struct host_cmd_tlv_rates *)(pos +
 								  config_len);
 			tlv_rates->header.type =
 					       cpu_to_le16(WLAN_EID_SUPP_RATES);
 			tlv_rates->header.len =
-				       cpu_to_le16(params->supported_rates_len);
-			memcpy(tlv_rates->rates, params->supported_rates,
-			       params->supported_rates_len);
+				       cpu_to_le16(params->link_sta_params.supported_rates_len);
+			memcpy(tlv_rates->rates,
+			       params->link_sta_params.supported_rates,
+			       params->link_sta_params.supported_rates_len);
 			config_len += sizeof(struct host_cmd_tlv_rates) +
-				      params->supported_rates_len;
+				      params->link_sta_params.supported_rates_len;
 		}
 
 		if (params->ext_capab && params->ext_capab_len) {
@@ -1826,14 +1828,14 @@ mwifiex_cmd_tdls_oper(struct mwifiex_private *priv,
 			config_len += sizeof(struct mwifiex_ie_types_extcap) +
 				      params->ext_capab_len;
 		}
-		if (params->vht_capa) {
+		if (params->link_sta_params.vht_capa) {
 			vht_capab = (struct mwifiex_ie_types_vhtcap *)(pos +
 								    config_len);
 			vht_capab->header.type =
 					   cpu_to_le16(WLAN_EID_VHT_CAPABILITY);
 			vht_capab->header.len =
 				  cpu_to_le16(sizeof(struct ieee80211_vht_cap));
-			memcpy(&vht_capab->vht_cap, params->vht_capa,
+			memcpy(&vht_capab->vht_cap, params->link_sta_params.vht_capa,
 			       sizeof(struct ieee80211_vht_cap));
 			config_len += sizeof(struct mwifiex_ie_types_vhtcap);
 		}
diff --git a/drivers/net/wireless/microchip/wilc1000/hif.c b/drivers/net/wireless/microchip/wilc1000/hif.c
index 4038a254465f..021e0db80bd2 100644
--- a/drivers/net/wireless/microchip/wilc1000/hif.c
+++ b/drivers/net/wireless/microchip/wilc1000/hif.c
@@ -816,15 +816,15 @@ static void wilc_hif_pack_sta_param(u8 *cur_byte, const u8 *mac,
 	put_unaligned_le16(params->aid, cur_byte);
 	cur_byte += 2;
 
-	*cur_byte++ = params->supported_rates_len;
-	if (params->supported_rates_len > 0)
-		memcpy(cur_byte, params->supported_rates,
-		       params->supported_rates_len);
-	cur_byte += params->supported_rates_len;
+	*cur_byte++ = params->link_sta_params.supported_rates_len;
+	if (params->link_sta_params.supported_rates_len > 0)
+		memcpy(cur_byte, params->link_sta_params.supported_rates,
+		       params->link_sta_params.supported_rates_len);
+	cur_byte += params->link_sta_params.supported_rates_len;
 
-	if (params->ht_capa) {
+	if (params->link_sta_params.ht_capa) {
 		*cur_byte++ = true;
-		memcpy(cur_byte, params->ht_capa,
+		memcpy(cur_byte, params->link_sta_params.ht_capa,
 		       sizeof(struct ieee80211_ht_cap));
 	} else {
 		*cur_byte++ = false;
@@ -1799,7 +1799,8 @@ int wilc_add_station(struct wilc_vif *vif, const u8 *mac,
 
 	wid.id = WID_ADD_STA;
 	wid.type = WID_BIN;
-	wid.size = WILC_ADD_STA_LENGTH + params->supported_rates_len;
+	wid.size = WILC_ADD_STA_LENGTH +
+		   params->link_sta_params.supported_rates_len;
 	wid.val = kmalloc(wid.size, GFP_KERNEL);
 	if (!wid.val)
 		return -ENOMEM;
@@ -1884,7 +1885,8 @@ int wilc_edit_station(struct wilc_vif *vif, const u8 *mac,
 
 	wid.id = WID_EDIT_STA;
 	wid.type = WID_BIN;
-	wid.size = WILC_ADD_STA_LENGTH + params->supported_rates_len;
+	wid.size = WILC_ADD_STA_LENGTH +
+		   params->link_sta_params.supported_rates_len;
 	wid.val = kmalloc(wid.size, GFP_KERNEL);
 	if (!wid.val)
 		return -ENOMEM;
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 422764881269..c904cbd1c4d6 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1433,7 +1433,6 @@ enum station_parameters_apply_mask {
 	STATION_PARAM_APPLY_UAPSD = BIT(0),
 	STATION_PARAM_APPLY_CAPABILITY = BIT(1),
 	STATION_PARAM_APPLY_PLINK_STATE = BIT(2),
-	STATION_PARAM_APPLY_STA_TXPOWER = BIT(3),
 };
 
 /**
@@ -1517,9 +1516,6 @@ struct link_station_del_parameters {
  * Used to change and create a new station.
  *
  * @vlan: vlan interface station should belong to
- * @supported_rates: supported rates in IEEE 802.11 format
- *	(or NULL for no change)
- * @supported_rates_len: number of supported rates
  * @sta_flags_mask: station flags that changed
  *	(bitmask of BIT(%NL80211_STA_FLAG_...))
  * @sta_flags_set: station flags values
@@ -1530,8 +1526,6 @@ struct link_station_del_parameters {
  * @peer_aid: mesh peer AID or zero for no change
  * @plink_action: plink action to take
  * @plink_state: set the peer link state for a station
- * @ht_capa: HT capabilities of station
- * @vht_capa: VHT capabilities of station
  * @uapsd_queues: bitmap of queues configured for uapsd. same format
  *	as the AC bitmap in the QoS info field
  * @max_sp: max Service Period. same format as the MAX_SP in the
@@ -1548,19 +1542,11 @@ struct link_station_del_parameters {
  * @supported_channels_len: number of supported channels
  * @supported_oper_classes: supported oper classes in IEEE 802.11 format
  * @supported_oper_classes_len: number of supported operating classes
- * @opmode_notif: operating mode field from Operating Mode Notification
- * @opmode_notif_used: information if operating mode field is used
  * @support_p2p_ps: information if station supports P2P PS mechanism
- * @he_capa: HE capabilities of station
- * @he_capa_len: the length of the HE capabilities
  * @airtime_weight: airtime scheduler weight for this station
- * @txpwr: transmit power for an associated station
- * @he_6ghz_capa: HE 6 GHz Band capabilities of station
- * @eht_capa: EHT capabilities of station
- * @eht_capa_len: the length of the EHT capabilities
+ * @link_sta_params: link related params.
  */
 struct station_parameters {
-	const u8 *supported_rates;
 	struct net_device *vlan;
 	u32 sta_flags_mask, sta_flags_set;
 	u32 sta_modify_mask;
@@ -1568,11 +1554,8 @@ struct station_parameters {
 	u16 aid;
 	u16 vlan_id;
 	u16 peer_aid;
-	u8 supported_rates_len;
 	u8 plink_action;
 	u8 plink_state;
-	const struct ieee80211_ht_cap *ht_capa;
-	const struct ieee80211_vht_cap *vht_capa;
 	u8 uapsd_queues;
 	u8 max_sp;
 	enum nl80211_mesh_power_mode local_pm;
@@ -1583,16 +1566,9 @@ struct station_parameters {
 	u8 supported_channels_len;
 	const u8 *supported_oper_classes;
 	u8 supported_oper_classes_len;
-	u8 opmode_notif;
-	bool opmode_notif_used;
 	int support_p2p_ps;
-	const struct ieee80211_he_cap_elem *he_capa;
-	u8 he_capa_len;
 	u16 airtime_weight;
-	struct sta_txpwr txpwr;
-	const struct ieee80211_he_6ghz_capa *he_6ghz_capa;
-	const struct ieee80211_eht_cap_elem *eht_capa;
-	u8 eht_capa_len;
+	struct link_station_parameters link_sta_params;
 };
 
 /**
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 64801ab545c1..3eacf72279c2 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1581,6 +1581,83 @@ static void sta_apply_mesh_params(struct ieee80211_local *local,
 #endif
 }
 
+static int sta_link_apply_parameters(struct ieee80211_local *local,
+				     struct sta_info *sta,
+				     struct link_station_parameters *params)
+{
+	int ret = 0;
+	struct ieee80211_supported_band *sband;
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	u32 link_id = params->link_id < 0 ? 0 : params->link_id;
+	struct link_sta_info *link_sta =
+		rcu_dereference_protected(sta->link[link_id],
+					  lockdep_is_held(&local->sta_mtx));
+
+	if (!link_sta)
+		return -EINVAL;
+
+	sband = ieee80211_get_link_sband(sdata, link_id);
+	if (!sband)
+		return -EINVAL;
+
+	if (params->link_mac) {
+		memcpy(link_sta->addr, params->link_mac, ETH_ALEN);
+		memcpy(link_sta->pub->addr, params->link_mac, ETH_ALEN);
+	}
+
+	if (params->txpwr_set) {
+		link_sta->pub->txpwr.type = params->txpwr.type;
+		if (params->txpwr.type == NL80211_TX_POWER_LIMITED)
+			link_sta->pub->txpwr.power = params->txpwr.power;
+		ret = drv_sta_set_txpwr(local, sdata, sta);
+		if (ret)
+			return ret;
+	}
+
+	if (params->supported_rates &&
+	    params->supported_rates_len) {
+		ieee80211_parse_bitrates(&sdata->vif.link_conf[link_id]->chandef,
+					 sband, params->supported_rates,
+					 params->supported_rates_len,
+					 &link_sta->pub->supp_rates[sband->band]);
+	}
+
+	if (params->ht_capa)
+		ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
+						  params->ht_capa, link_sta);
+
+	/* VHT can override some HT caps such as the A-MSDU max length */
+	if (params->vht_capa)
+		ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
+						    params->vht_capa, link_sta);
+
+	if (params->he_capa)
+		ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband,
+						  (void *)params->he_capa,
+						  params->he_capa_len,
+						  (void *)params->he_6ghz_capa,
+						  link_sta);
+
+	if (params->eht_capa)
+		ieee80211_eht_cap_ie_to_sta_eht_cap(sdata, sband,
+						    (u8 *)params->he_capa,
+						    params->he_capa_len,
+						    params->eht_capa,
+						    params->eht_capa_len,
+						    link_sta);
+
+	if (params->opmode_notif_used) {
+		/* returned value is only needed for rc update, but the
+		 * rc isn't initialized here yet, so ignore it
+		 */
+		__ieee80211_vht_handle_opmode(sdata, link_sta,
+					      params->opmode_notif,
+					      sband->band);
+	}
+
+	return ret;
+}
+
 static int sta_apply_parameters(struct ieee80211_local *local,
 				struct sta_info *sta,
 				struct station_parameters *params)
@@ -1588,9 +1665,11 @@ static int sta_apply_parameters(struct ieee80211_local *local,
 	int ret = 0;
 	struct ieee80211_supported_band *sband;
 	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	u32 link_id = params->link_sta_params.link_id < 0 ?
+		      0 : params->link_sta_params.link_id;
 	u32 mask, set;
 
-	sband = ieee80211_get_sband(sdata);
+	sband = ieee80211_get_link_sband(sdata, link_id);
 	if (!sband)
 		return -EINVAL;
 
@@ -1721,56 +1800,9 @@ static int sta_apply_parameters(struct ieee80211_local *local,
 	if (params->listen_interval >= 0)
 		sta->listen_interval = params->listen_interval;
 
-	if (params->sta_modify_mask & STATION_PARAM_APPLY_STA_TXPOWER) {
-		sta->sta.deflink.txpwr.type = params->txpwr.type;
-		if (params->txpwr.type == NL80211_TX_POWER_LIMITED)
-			sta->sta.deflink.txpwr.power = params->txpwr.power;
-		ret = drv_sta_set_txpwr(local, sdata, sta);
-		if (ret)
-			return ret;
-	}
-
-	if (params->supported_rates && params->supported_rates_len) {
-		ieee80211_parse_bitrates(&sdata->vif.bss_conf.chandef,
-					 sband, params->supported_rates,
-					 params->supported_rates_len,
-					 &sta->sta.deflink.supp_rates[sband->band]);
-	}
-
-	if (params->ht_capa)
-		ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
-						  params->ht_capa,
-						  &sta->deflink);
-
-	/* VHT can override some HT caps such as the A-MSDU max length */
-	if (params->vht_capa)
-		ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
-						    params->vht_capa,
-						    &sta->deflink);
-
-	if (params->he_capa)
-		ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband,
-						  (void *)params->he_capa,
-						  params->he_capa_len,
-						  (void *)params->he_6ghz_capa,
-						  &sta->deflink);
-
-	if (params->eht_capa)
-		ieee80211_eht_cap_ie_to_sta_eht_cap(sdata, sband,
-						    (u8 *)params->he_capa,
-						    params->he_capa_len,
-						    params->eht_capa,
-						    params->eht_capa_len,
-						    &sta->deflink);
-
-	if (params->opmode_notif_used) {
-		/* returned value is only needed for rc update, but the
-		 * rc isn't initialized here yet, so ignore it
-		 */
-		__ieee80211_vht_handle_opmode(sdata, &sta->deflink,
-					      params->opmode_notif,
-					      sband->band);
-	}
+	ret = sta_link_apply_parameters(local, sta, &params->link_sta_params);
+	if (ret)
+		return ret;
 
 	if (params->support_p2p_ps >= 0)
 		sta->sta.support_p2p_ps = params->support_p2p_ps;
@@ -1821,14 +1853,21 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev,
 	    !sdata->u.mgd.associated)
 		return -EINVAL;
 
-	sta = sta_info_alloc(sdata, mac, -1, GFP_KERNEL);
+	sta = sta_info_alloc(sdata, mac, params->link_sta_params.link_id,
+			     GFP_KERNEL);
 	if (!sta)
 		return -ENOMEM;
 
 	if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER))
 		sta->sta.tdls = true;
 
+	/* Though the mutex is not needed here (since the station is not
+	 * visible yet), sta_apply_parameters (and inner functions) require
+	 * the mutex due to other paths.
+	 */
+	mutex_lock(&local->sta_mtx);
 	err = sta_apply_parameters(local, sta, params);
+	mutex_unlock(&local->sta_mtx);
 	if (err) {
 		sta_info_free(local, sta);
 		return err;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 3cf8e01e3f7e..5f6cbc2d73b4 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -6573,10 +6573,12 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
 			return -EINVAL;
 		if (params->sta_modify_mask & STATION_PARAM_APPLY_CAPABILITY)
 			return -EINVAL;
-		if (params->supported_rates)
+		if (params->link_sta_params.supported_rates)
 			return -EINVAL;
-		if (params->ext_capab || params->ht_capa || params->vht_capa ||
-		    params->he_capa || params->eht_capa)
+		if (params->ext_capab || params->link_sta_params.ht_capa ||
+		    params->link_sta_params.vht_capa ||
+		    params->link_sta_params.he_capa ||
+		    params->link_sta_params.eht_capa)
 			return -EINVAL;
 	}
 
@@ -6624,7 +6626,7 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
 			return -EINVAL;
 		/* force (at least) rates when authorizing */
 		if (params->sta_flags_set & BIT(NL80211_STA_FLAG_AUTHORIZED) &&
-		    !params->supported_rates)
+		    !params->link_sta_params.supported_rates)
 			return -EINVAL;
 		break;
 	case CFG80211_STA_TDLS_PEER_ACTIVE:
@@ -6648,7 +6650,7 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
 	 */
 	if (statype != CFG80211_STA_AP_CLIENT_UNASSOC &&
 	    statype != CFG80211_STA_TDLS_PEER_SETUP)
-		params->opmode_notif_used = false;
+		params->link_sta_params.opmode_notif_used = false;
 
 	return 0;
 }
@@ -6769,26 +6771,26 @@ static int nl80211_set_station_tdls(struct genl_info *info,
 	if (info->attrs[NL80211_ATTR_PEER_AID])
 		params->aid = nla_get_u16(info->attrs[NL80211_ATTR_PEER_AID]);
 	if (info->attrs[NL80211_ATTR_HT_CAPABILITY])
-		params->ht_capa =
+		params->link_sta_params.ht_capa =
 			nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]);
 	if (info->attrs[NL80211_ATTR_VHT_CAPABILITY])
-		params->vht_capa =
+		params->link_sta_params.vht_capa =
 			nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]);
 	if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) {
-		params->he_capa =
+		params->link_sta_params.he_capa =
 			nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
-		params->he_capa_len =
+		params->link_sta_params.he_capa_len =
 			nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
 
 		if (info->attrs[NL80211_ATTR_EHT_CAPABILITY]) {
-			params->eht_capa =
+			params->link_sta_params.eht_capa =
 				nla_data(info->attrs[NL80211_ATTR_EHT_CAPABILITY]);
-			params->eht_capa_len =
+			params->link_sta_params.eht_capa_len =
 				nla_len(info->attrs[NL80211_ATTR_EHT_CAPABILITY]);
 
-			if (!ieee80211_eht_capa_size_ok((const u8 *)params->he_capa,
-							(const u8 *)params->eht_capa,
-							params->eht_capa_len))
+			if (!ieee80211_eht_capa_size_ok((const u8 *)params->link_sta_params.he_capa,
+							(const u8 *)params->link_sta_params.eht_capa,
+							params->link_sta_params.eht_capa_len))
 				return -EINVAL;
 		}
 	}
@@ -6840,7 +6842,6 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
 	struct station_parameters params;
 	u8 *mac_addr;
 	int err;
-	bool txpwr_set;
 
 	memset(&params, 0, sizeof(params));
 
@@ -6876,9 +6877,9 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
 	mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
 
 	if (info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]) {
-		params.supported_rates =
+		params.link_sta_params.supported_rates =
 			nla_data(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]);
-		params.supported_rates_len =
+		params.link_sta_params.supported_rates_len =
 			nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]);
 	}
 
@@ -6916,13 +6917,13 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
 			info->attrs[NL80211_ATTR_LOCAL_MESH_POWER_MODE]);
 
 	if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) {
-		params.opmode_notif_used = true;
-		params.opmode_notif =
+		params.link_sta_params.opmode_notif_used = true;
+		params.link_sta_params.opmode_notif =
 			nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]);
 	}
 
 	if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY])
-		params.he_6ghz_capa =
+		params.link_sta_params.he_6ghz_capa =
 			nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]);
 
 	if (info->attrs[NL80211_ATTR_AIRTIME_WEIGHT])
@@ -6934,11 +6935,11 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
 				     NL80211_EXT_FEATURE_AIRTIME_FAIRNESS))
 		return -EOPNOTSUPP;
 
-	err = nl80211_parse_sta_txpower_setting(info, &params.txpwr, &txpwr_set);
+	err = nl80211_parse_sta_txpower_setting(info,
+						&params.link_sta_params.txpwr,
+						&params.link_sta_params.txpwr_set);
 	if (err)
 		return err;
-	if (txpwr_set)
-		params.sta_modify_mask |= STATION_PARAM_APPLY_STA_TXPOWER;
 
 	/* Include parameters for TDLS peer (will check later) */
 	err = nl80211_set_station_tdls(info, &params);
@@ -6981,7 +6982,6 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 	u8 *mac_addr = NULL;
 	u32 auth_assoc = BIT(NL80211_STA_FLAG_AUTHENTICATED) |
 			 BIT(NL80211_STA_FLAG_ASSOCIATED);
-	bool txpwr_set;
 
 	memset(&params, 0, sizeof(params));
 
@@ -7001,10 +7001,13 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 	    !info->attrs[NL80211_ATTR_PEER_AID])
 		return -EINVAL;
 
+	params.link_sta_params.link_id =
+		nl80211_link_id_or_invalid(info->attrs);
+
 	mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
-	params.supported_rates =
+	params.link_sta_params.supported_rates =
 		nla_data(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]);
-	params.supported_rates_len =
+	params.link_sta_params.supported_rates_len =
 		nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]);
 	params.listen_interval =
 		nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]);
@@ -7043,39 +7046,39 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	if (info->attrs[NL80211_ATTR_HT_CAPABILITY])
-		params.ht_capa =
+		params.link_sta_params.ht_capa =
 			nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]);
 
 	if (info->attrs[NL80211_ATTR_VHT_CAPABILITY])
-		params.vht_capa =
+		params.link_sta_params.vht_capa =
 			nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]);
 
 	if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) {
-		params.he_capa =
+		params.link_sta_params.he_capa =
 			nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
-		params.he_capa_len =
+		params.link_sta_params.he_capa_len =
 			nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
 
 		if (info->attrs[NL80211_ATTR_EHT_CAPABILITY]) {
-			params.eht_capa =
+			params.link_sta_params.eht_capa =
 				nla_data(info->attrs[NL80211_ATTR_EHT_CAPABILITY]);
-			params.eht_capa_len =
+			params.link_sta_params.eht_capa_len =
 				nla_len(info->attrs[NL80211_ATTR_EHT_CAPABILITY]);
 
-			if (!ieee80211_eht_capa_size_ok((const u8 *)params.he_capa,
-							(const u8 *)params.eht_capa,
-							params.eht_capa_len))
+			if (!ieee80211_eht_capa_size_ok((const u8 *)params.link_sta_params.he_capa,
+							(const u8 *)params.link_sta_params.eht_capa,
+							params.link_sta_params.eht_capa_len))
 				return -EINVAL;
 		}
 	}
 
 	if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY])
-		params.he_6ghz_capa =
+		params.link_sta_params.he_6ghz_capa =
 			nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]);
 
 	if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) {
-		params.opmode_notif_used = true;
-		params.opmode_notif =
+		params.link_sta_params.opmode_notif_used = true;
+		params.link_sta_params.opmode_notif =
 			nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]);
 	}
 
@@ -7092,11 +7095,11 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 				     NL80211_EXT_FEATURE_AIRTIME_FAIRNESS))
 		return -EOPNOTSUPP;
 
-	err = nl80211_parse_sta_txpower_setting(info, &params.txpwr, &txpwr_set);
+	err = nl80211_parse_sta_txpower_setting(info,
+						&params.link_sta_params.txpwr,
+						&params.link_sta_params.txpwr_set);
 	if (err)
 		return err;
-	if (txpwr_set)
-		params.sta_modify_mask |= STATION_PARAM_APPLY_STA_TXPOWER;
 
 	err = nl80211_parse_sta_channel_info(info, &params);
 	if (err)
@@ -7115,17 +7118,19 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 	 * error in this case.
 	 */
 	if (!(params.sta_flags_set & BIT(NL80211_STA_FLAG_WME))) {
-		params.ht_capa = NULL;
-		params.vht_capa = NULL;
+		params.link_sta_params.ht_capa = NULL;
+		params.link_sta_params.vht_capa = NULL;
 
 		/* HE and EHT require WME */
-		if (params.he_capa_len || params.he_6ghz_capa ||
-		    params.eht_capa_len)
+		if (params.link_sta_params.he_capa_len ||
+		    params.link_sta_params.he_6ghz_capa ||
+		    params.link_sta_params.eht_capa_len)
 			return -EINVAL;
 	}
 
 	/* Ensure that HT/VHT capabilities are not set for 6 GHz HE STA */
-	if (params.he_6ghz_capa && (params.ht_capa || params.vht_capa))
+	if (params.link_sta_params.he_6ghz_capa &&
+	    (params.link_sta_params.ht_capa || params.link_sta_params.vht_capa))
 		return -EINVAL;
 
 	/* When you run into this, adjust the code below for the new flag */
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 16d0fe53fcf2..e78bffbc6f95 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -738,7 +738,7 @@ DECLARE_EVENT_CLASS(station_add_change,
 		__array(u8, vht_capa, (int)sizeof(struct ieee80211_vht_cap))
 		__array(char, vlan, IFNAMSIZ)
 		__dynamic_array(u8, supported_rates,
-				params->supported_rates_len)
+				params->link_sta_params.supported_rates_len)
 		__dynamic_array(u8, ext_capab, params->ext_capab_len)
 		__dynamic_array(u8, supported_channels,
 				params->supported_channels_len)
@@ -758,20 +758,23 @@ DECLARE_EVENT_CLASS(station_add_change,
 		__entry->plink_state = params->plink_state;
 		__entry->uapsd_queues = params->uapsd_queues;
 		memset(__entry->ht_capa, 0, sizeof(struct ieee80211_ht_cap));
-		if (params->ht_capa)
-			memcpy(__entry->ht_capa, params->ht_capa,
+		if (params->link_sta_params.ht_capa)
+			memcpy(__entry->ht_capa,
+			       params->link_sta_params.ht_capa,
 			       sizeof(struct ieee80211_ht_cap));
 		memset(__entry->vht_capa, 0, sizeof(struct ieee80211_vht_cap));
-		if (params->vht_capa)
-			memcpy(__entry->vht_capa, params->vht_capa,
+		if (params->link_sta_params.vht_capa)
+			memcpy(__entry->vht_capa,
+			       params->link_sta_params.vht_capa,
 			       sizeof(struct ieee80211_vht_cap));
 		memset(__entry->vlan, 0, sizeof(__entry->vlan));
 		if (params->vlan)
 			memcpy(__entry->vlan, params->vlan->name, IFNAMSIZ);
-		if (params->supported_rates && params->supported_rates_len)
+		if (params->link_sta_params.supported_rates &&
+		    params->link_sta_params.supported_rates_len)
 			memcpy(__get_dynamic_array(supported_rates),
-			       params->supported_rates,
-			       params->supported_rates_len);
+			       params->link_sta_params.supported_rates,
+			       params->link_sta_params.supported_rates_len);
 		if (params->ext_capab && params->ext_capab_len)
 			memcpy(__get_dynamic_array(ext_capab),
 			       params->ext_capab,
@@ -788,8 +791,9 @@ DECLARE_EVENT_CLASS(station_add_change,
 			       params->supported_oper_classes_len);
 		__entry->max_sp = params->max_sp;
 		__entry->capability = params->capability;
-		__entry->opmode_notif = params->opmode_notif;
-		__entry->opmode_notif_used = params->opmode_notif_used;
+		__entry->opmode_notif = params->link_sta_params.opmode_notif;
+		__entry->opmode_notif_used =
+			params->link_sta_params.opmode_notif_used;
 	),
 	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT
 		  ", station flags mask: %u, station flags set: %u, "
-- 
cgit 


From d8675a63518c6148827838058feb7f18403faed1 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 17 Jun 2022 22:36:37 +0200
Subject: wifi: mac80211: RCU-ify link/link_conf pointers

Since links can be added and removed dynamically, we need to
somehow protect the sdata->link[] and vif->link_conf[] array
pointers from disappearing when accessing them without locks.
RCU-ify the pointers to achieve this, which requires quite a
bit of rework.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/mac80211_hwsim.c |  14 ++-
 include/net/mac80211.h                |   6 +-
 net/mac80211/cfg.c                    | 169 ++++++++++++++++++++++------------
 net/mac80211/chan.c                   | 123 +++++++++++++------------
 net/mac80211/debugfs_netdev.c         |   2 +-
 net/mac80211/driver-ops.h             |  10 +-
 net/mac80211/ht.c                     |  24 ++++-
 net/mac80211/ibss.c                   |   8 +-
 net/mac80211/ieee80211_i.h            |  21 +++--
 net/mac80211/iface.c                  |  36 ++++----
 net/mac80211/main.c                   |   9 +-
 net/mac80211/mesh.c                   |   7 +-
 net/mac80211/mlme.c                   |  30 +++---
 net/mac80211/ocb.c                    |   4 +-
 net/mac80211/offchannel.c             |  15 ++-
 net/mac80211/rx.c                     |   8 +-
 net/mac80211/sta_info.c               |   3 +-
 net/mac80211/tdls.c                   |   3 +-
 net/mac80211/trace.h                  |  19 ++--
 net/mac80211/tx.c                     |  88 +++++++++---------
 net/mac80211/util.c                   |  34 ++++---
 net/mac80211/vht.c                    |  62 +++++++++----
 22 files changed, 420 insertions(+), 275 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 4f22f3df161c..6bad95eeb709 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -1477,9 +1477,10 @@ static void mac80211_hwsim_tx_iter(void *_data, u8 *addr,
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(vif->link_conf); i++) {
-		struct ieee80211_bss_conf *conf = vif->link_conf[i];
+		struct ieee80211_bss_conf *conf;
 		struct ieee80211_chanctx_conf *chanctx;
 
+		conf = rcu_dereference(vif->link_conf[i]);
 		if (!conf)
 			continue;
 
@@ -1917,7 +1918,7 @@ static void mac80211_hwsim_beacon_tx(void *arg, u8 *mac,
 {
 	struct mac80211_hwsim_link_data *link_data = arg;
 	u32 link_id = link_data->link_id;
-	struct ieee80211_bss_conf *link_conf = vif->link_conf[link_id];
+	struct ieee80211_bss_conf *link_conf;
 	struct mac80211_hwsim_data *data =
 		container_of(link_data, struct mac80211_hwsim_data,
 			     link_data[link_id]);
@@ -1931,6 +1932,10 @@ static void mac80211_hwsim_beacon_tx(void *arg, u8 *mac,
 
 	hwsim_check_magic(vif);
 
+	link_conf = rcu_dereference(vif->link_conf[link_id]);
+	if (!link_conf)
+		return;
+
 	if (vif->type != NL80211_IFTYPE_AP &&
 	    vif->type != NL80211_IFTYPE_MESH_POINT &&
 	    vif->type != NL80211_IFTYPE_ADHOC &&
@@ -2155,12 +2160,11 @@ static void mac80211_hwsim_vif_info_changed(struct ieee80211_hw *hw,
 
 static void mac80211_hwsim_link_info_changed(struct ieee80211_hw *hw,
 					     struct ieee80211_vif *vif,
-					     u32 link_id,
-					     u64 changed)
+					     struct ieee80211_bss_conf *info,
+					     u32 link_id, u64 changed)
 {
 	struct hwsim_vif_priv *vp = (void *)vif->drv_priv;
 	struct mac80211_hwsim_data *data = hw->priv;
-	struct ieee80211_bss_conf *info = vif->link_conf[link_id];
 	struct mac80211_hwsim_link_data *link_data = &data->link_data[link_id];
 
 	hwsim_check_magic(vif);
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 1c005a30313f..044ed417b06f 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1789,7 +1789,7 @@ struct ieee80211_vif {
 	enum nl80211_iftype type;
 	struct ieee80211_vif_cfg cfg;
 	struct ieee80211_bss_conf bss_conf;
-	struct ieee80211_bss_conf *link_conf[IEEE80211_MLD_MAX_NUM_LINKS];
+	struct ieee80211_bss_conf __rcu *link_conf[IEEE80211_MLD_MAX_NUM_LINKS];
 	u16 valid_links;
 	u8 addr[ETH_ALEN] __aligned(2);
 	bool p2p;
@@ -4082,7 +4082,9 @@ struct ieee80211_ops {
 				u64 changed);
 	void (*link_info_changed)(struct ieee80211_hw *hw,
 				  struct ieee80211_vif *vif,
-				  unsigned int link_id, u64 changed);
+				  struct ieee80211_bss_conf *info,
+				  unsigned int link_id,
+				  u64 changed);
 
 	int (*start_ap)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 			unsigned int link_id);
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 85032dcaa595..3d66e40af7a9 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -39,7 +39,7 @@ static void ieee80211_set_mu_mimo_follow(struct ieee80211_sub_if_data *sdata,
 		memcpy(sdata->vif.bss_conf.mu_group.position,
 		       params->vht_mumimo_groups + WLAN_MEMBERSHIP_LEN,
 		       WLAN_USER_POSITION_LEN);
-		ieee80211_link_info_change_notify(sdata, 0,
+		ieee80211_link_info_change_notify(sdata, &sdata->deflink,
 						  BSS_CHANGED_MU_GROUPS);
 		/* don't care about endianness - just check for 0 */
 		memcpy(&membership, params->vht_mumimo_groups,
@@ -841,8 +841,8 @@ static int ieee80211_set_monitor_channel(struct wiphy *wiphy,
 		sdata = wiphy_dereference(local->hw.wiphy,
 					  local->monitor_sdata);
 		if (sdata) {
-			ieee80211_link_release_channel(sdata->link[0]);
-			ret = ieee80211_link_use_channel(sdata->link[0],
+			ieee80211_link_release_channel(&sdata->deflink);
+			ret = ieee80211_link_use_channel(&sdata->deflink,
 							 chandef,
 							 IEEE80211_CHANCTX_EXCLUSIVE);
 		}
@@ -1008,6 +1008,7 @@ ieee80211_copy_mbssid_beacon(u8 *pos, struct cfg80211_mbssid_elems *dst,
 }
 
 static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
+				   struct ieee80211_link_data *link,
 				   struct cfg80211_beacon_data *params,
 				   const struct ieee80211_csa_settings *csa,
 				   const struct ieee80211_color_change_settings *cca)
@@ -1017,9 +1018,7 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
 	int new_head_len, new_tail_len;
 	int size, err;
 	u32 changed = BSS_CHANGED_BEACON;
-	struct ieee80211_link_data *link = sdata->link[params->link_id];
-	struct ieee80211_bss_conf *link_conf =
-		sdata->vif.link_conf[params->link_id];
+	struct ieee80211_bss_conf *link_conf = link->conf;
 
 	old = sdata_dereference(link->u.ap.beacon, sdata);
 
@@ -1153,8 +1152,14 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 	int i, err;
 	int prev_beacon_int;
 	unsigned int link_id = params->beacon.link_id;
-	struct ieee80211_link_data *link = sdata->link[link_id];
-	struct ieee80211_bss_conf *link_conf = sdata->vif.link_conf[link_id];
+	struct ieee80211_link_data *link;
+	struct ieee80211_bss_conf *link_conf;
+
+	link = sdata_dereference(sdata->link[link_id], sdata);
+	if (!link)
+		return -ENOLINK;
+
+	link_conf = link->conf;
 
 	old = sdata_dereference(link->u.ap.beacon, sdata);
 	if (old)
@@ -1264,7 +1269,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 	if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL))
 		link_conf->beacon_tx_rate = params->beacon_rate;
 
-	err = ieee80211_assign_beacon(sdata, &params->beacon, NULL, NULL);
+	err = ieee80211_assign_beacon(sdata, link, &params->beacon, NULL, NULL);
 	if (err < 0)
 		goto error;
 	changed |= err;
@@ -1300,7 +1305,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 
 	ieee80211_recalc_dtim(local, sdata);
 	ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_SSID);
-	ieee80211_link_info_change_notify(sdata, link_id, changed);
+	ieee80211_link_info_change_notify(sdata, link, changed);
 
 	netif_carrier_on(dev);
 	list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
@@ -1320,25 +1325,30 @@ static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev,
 				   struct cfg80211_beacon_data *params)
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_link_data *link;
 	struct beacon_data *old;
 	int err;
-	struct ieee80211_bss_conf *link_conf =
-		sdata->vif.link_conf[params->link_id];
+	struct ieee80211_bss_conf *link_conf;
 
 	sdata_assert_lock(sdata);
 
+	link = sdata_dereference(sdata->link[params->link_id], sdata);
+	if (!link)
+		return -ENOLINK;
+
+	link_conf = link->conf;
+
 	/* don't allow changing the beacon while a countdown is in place - offset
 	 * of channel switch counter may change
 	 */
 	if (link_conf->csa_active || link_conf->color_change_active)
 		return -EBUSY;
 
-	old = sdata_dereference(sdata->link[params->link_id]->u.ap.beacon,
-				sdata);
+	old = sdata_dereference(link->u.ap.beacon, sdata);
 	if (!old)
 		return -ENOENT;
 
-	err = ieee80211_assign_beacon(sdata, params, NULL, NULL);
+	err = ieee80211_assign_beacon(sdata, link, params, NULL, NULL);
 	if (err < 0)
 		return err;
 
@@ -1348,7 +1358,7 @@ static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev,
 		err |= BSS_CHANGED_HE_BSS_COLOR;
 	}
 
-	ieee80211_link_info_change_notify(sdata, params->link_id, err);
+	ieee80211_link_info_change_notify(sdata, link, err);
 	return 0;
 }
 
@@ -1373,8 +1383,9 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
 	struct fils_discovery_data *old_fils_discovery;
 	struct unsol_bcast_probe_resp_data *old_unsol_bcast_probe_resp;
 	struct cfg80211_chan_def chandef;
-	struct ieee80211_link_data *link = sdata->link[link_id];
-	struct ieee80211_bss_conf *link_conf = sdata->vif.link_conf[link_id];
+	struct ieee80211_link_data *link =
+		sdata_dereference(sdata->link[link_id], sdata);
+	struct ieee80211_bss_conf *link_conf = link->conf;
 
 	sdata_assert_lock(sdata);
 
@@ -1431,7 +1442,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
 	sdata->beacon_rate_set = false;
 	sdata->vif.cfg.ssid_len = 0;
 	clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state);
-	ieee80211_link_info_change_notify(sdata, link_id,
+	ieee80211_link_info_change_notify(sdata, link,
 					  BSS_CHANGED_BEACON_ENABLED);
 
 	if (sdata->wdev.cac_started) {
@@ -1589,14 +1600,16 @@ static int sta_link_apply_parameters(struct ieee80211_local *local,
 	struct ieee80211_supported_band *sband;
 	struct ieee80211_sub_if_data *sdata = sta->sdata;
 	u32 link_id = params->link_id < 0 ? 0 : params->link_id;
+	struct ieee80211_link_data *link =
+		sdata_dereference(sdata->link[link_id], sdata);
 	struct link_sta_info *link_sta =
 		rcu_dereference_protected(sta->link[link_id],
 					  lockdep_is_held(&local->sta_mtx));
 
-	if (!link_sta)
+	if (!link || !link_sta)
 		return -EINVAL;
 
-	sband = ieee80211_get_link_sband(sdata, link_id);
+	sband = ieee80211_get_link_sband(link);
 	if (!sband)
 		return -EINVAL;
 
@@ -1616,7 +1629,7 @@ static int sta_link_apply_parameters(struct ieee80211_local *local,
 
 	if (params->supported_rates &&
 	    params->supported_rates_len) {
-		ieee80211_parse_bitrates(&sdata->vif.link_conf[link_id]->chandef,
+		ieee80211_parse_bitrates(&link->conf->chandef,
 					 sband, params->supported_rates,
 					 params->supported_rates_len,
 					 &link_sta->pub->supp_rates[sband->band]);
@@ -1667,9 +1680,14 @@ static int sta_apply_parameters(struct ieee80211_local *local,
 	struct ieee80211_sub_if_data *sdata = sta->sdata;
 	u32 link_id = params->link_sta_params.link_id < 0 ?
 		      0 : params->link_sta_params.link_id;
+	struct ieee80211_link_data *link;
 	u32 mask, set;
 
-	sband = ieee80211_get_link_sband(sdata, link_id);
+	link = sdata_dereference(sdata->link[link_id], sdata);
+	if (!link)
+		return -ENOLINK;
+
+	sband = ieee80211_get_link_sband(link);
 	if (!sband)
 		return -EINVAL;
 
@@ -1987,7 +2005,14 @@ static int ieee80211_change_station(struct wiphy *wiphy,
 		}
 	}
 
-	err = sta_apply_parameters(local, sta, params);
+	/* we use sta_info_get_bss() so this might be different */
+	if (sdata != sta->sdata) {
+		mutex_lock_nested(&sta->sdata->wdev.mtx, 1);
+		err = sta_apply_parameters(local, sta, params);
+		mutex_unlock(&sta->sdata->wdev.mtx);
+	} else {
+		err = sta_apply_parameters(local, sta, params);
+	}
 	if (err)
 		goto out_err;
 
@@ -2374,7 +2399,8 @@ static int ieee80211_update_mesh_config(struct wiphy *wiphy,
 	if (_chg_mesh_attr(NL80211_MESHCONF_HT_OPMODE, mask)) {
 		conf->ht_opmode = nconf->ht_opmode;
 		sdata->vif.bss_conf.ht_operation_mode = nconf->ht_opmode;
-		ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_HT);
+		ieee80211_link_info_change_notify(sdata, &sdata->deflink,
+						  BSS_CHANGED_HT);
 	}
 	if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT, mask))
 		conf->dot11MeshHWMPactivePathToRootTimeout =
@@ -2426,7 +2452,7 @@ static int ieee80211_join_mesh(struct wiphy *wiphy, struct net_device *dev,
 	sdata->deflink.needed_rx_chains = sdata->local->rx_chains;
 
 	mutex_lock(&sdata->local->mtx);
-	err = ieee80211_link_use_channel(sdata->link[0], &setup->chandef,
+	err = ieee80211_link_use_channel(&sdata->deflink, &setup->chandef,
 					 IEEE80211_CHANCTX_SHARED);
 	mutex_unlock(&sdata->local->mtx);
 	if (err)
@@ -2441,7 +2467,7 @@ static int ieee80211_leave_mesh(struct wiphy *wiphy, struct net_device *dev)
 
 	ieee80211_stop_mesh(sdata);
 	mutex_lock(&sdata->local->mtx);
-	ieee80211_link_release_channel(sdata->link[0]);
+	ieee80211_link_release_channel(&sdata->deflink);
 	kfree(sdata->u.mesh.ie);
 	mutex_unlock(&sdata->local->mtx);
 
@@ -2529,7 +2555,7 @@ static int ieee80211_change_bss(struct wiphy *wiphy,
 		changed |= BSS_CHANGED_P2P_PS;
 	}
 
-	ieee80211_link_info_change_notify(sdata, 0, changed);
+	ieee80211_link_info_change_notify(sdata, &sdata->deflink, changed);
 
 	return 0;
 }
@@ -2570,7 +2596,8 @@ static int ieee80211_set_txq_params(struct wiphy *wiphy,
 		return -EINVAL;
 	}
 
-	ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_QOS);
+	ieee80211_link_info_change_notify(sdata, &sdata->deflink,
+					  BSS_CHANGED_QOS);
 
 	return 0;
 }
@@ -2719,7 +2746,8 @@ static int ieee80211_set_mcast_rate(struct wiphy *wiphy, struct net_device *dev,
 	memcpy(sdata->vif.bss_conf.mcast_rate, rate,
 	       sizeof(int) * NUM_NL80211_BANDS);
 
-	ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_MCAST_RATE);
+	ieee80211_link_info_change_notify(sdata, &sdata->deflink,
+					  BSS_CHANGED_MCAST_RATE);
 
 	return 0;
 }
@@ -2934,7 +2962,7 @@ static int ieee80211_testmode_dump(struct wiphy *wiphy,
 #endif
 
 int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
-				 unsigned int link_id,
+				 struct ieee80211_link_data *link,
 				 enum ieee80211_smps_mode smps_mode)
 {
 	const u8 *ap;
@@ -2948,8 +2976,8 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
 	if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION))
 		return -EINVAL;
 
-	old_req = sdata->link[link_id]->u.mgd.req_smps;
-	sdata->link[link_id]->u.mgd.req_smps = smps_mode;
+	old_req = link->u.mgd.req_smps;
+	link->u.mgd.req_smps = smps_mode;
 
 	if (old_req == smps_mode &&
 	    smps_mode != IEEE80211_SMPS_AUTOMATIC)
@@ -2961,10 +2989,10 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
 	 * the new value until we associate.
 	 */
 	if (!sdata->u.mgd.associated ||
-	    sdata->vif.link_conf[link_id]->chandef.width == NL80211_CHAN_WIDTH_20_NOHT)
+	    link->conf->chandef.width == NL80211_CHAN_WIDTH_20_NOHT)
 		return 0;
 
-	ap = sdata->link[link_id]->u.mgd.bssid;
+	ap = link->u.mgd.bssid;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) {
@@ -2988,7 +3016,7 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
 	err = ieee80211_send_smps_action(sdata, smps_mode,
 					 ap, ap);
 	if (err)
-		sdata->link[link_id]->u.mgd.req_smps = old_req;
+		link->u.mgd.req_smps = old_req;
 	else if (smps_mode != IEEE80211_SMPS_OFF && tdls_peer_found)
 		ieee80211_teardown_tdls_peers(sdata);
 
@@ -3018,10 +3046,14 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev,
 	/* no change, but if automatic follow powersave */
 	sdata_lock(sdata);
 	for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) {
-		if (!sdata->link[link_id])
+		struct ieee80211_link_data *link;
+
+		link = sdata_dereference(sdata->link[link_id], sdata);
+
+		if (!link)
 			continue;
-		__ieee80211_request_smps_mgd(sdata, link_id,
-					     sdata->link[link_id]->u.mgd.req_smps);
+		__ieee80211_request_smps_mgd(sdata, link,
+					     link->u.mgd.req_smps);
 	}
 	sdata_unlock(sdata);
 
@@ -3060,7 +3092,8 @@ static int ieee80211_set_cqm_rssi_config(struct wiphy *wiphy,
 	/* tell the driver upon association, unless already associated */
 	if (sdata->u.mgd.associated &&
 	    sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)
-		ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_CQM);
+		ieee80211_link_info_change_notify(sdata, &sdata->deflink,
+						  BSS_CHANGED_CQM);
 
 	return 0;
 }
@@ -3085,7 +3118,8 @@ static int ieee80211_set_cqm_rssi_range_config(struct wiphy *wiphy,
 	/* tell the driver upon association, unless already associated */
 	if (sdata->u.mgd.associated &&
 	    sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)
-		ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_CQM);
+		ieee80211_link_info_change_notify(sdata, &sdata->deflink,
+						  BSS_CHANGED_CQM);
 
 	return 0;
 }
@@ -3177,7 +3211,7 @@ static int ieee80211_start_radar_detection(struct wiphy *wiphy,
 	sdata->deflink.smps_mode = IEEE80211_SMPS_OFF;
 	sdata->deflink.needed_rx_chains = local->rx_chains;
 
-	err = ieee80211_link_use_channel(sdata->link[0], chandef,
+	err = ieee80211_link_use_channel(&sdata->deflink, chandef,
 					 IEEE80211_CHANCTX_SHARED);
 	if (err)
 		goto out_unlock;
@@ -3206,7 +3240,7 @@ static void ieee80211_end_cac(struct wiphy *wiphy,
 		cancel_delayed_work(&sdata->deflink.dfs_cac_timer_work);
 
 		if (sdata->wdev.cac_started) {
-			ieee80211_link_release_channel(sdata->link[0]);
+			ieee80211_link_release_channel(&sdata->deflink);
 			sdata->wdev.cac_started = false;
 		}
 	}
@@ -3353,7 +3387,7 @@ static int ieee80211_set_after_csa_beacon(struct ieee80211_sub_if_data *sdata,
 		if (!sdata->deflink.u.ap.next_beacon)
 			return -EINVAL;
 
-		err = ieee80211_assign_beacon(sdata,
+		err = ieee80211_assign_beacon(sdata, &sdata->deflink,
 					      sdata->deflink.u.ap.next_beacon,
 					      NULL, NULL);
 		ieee80211_free_next_beacon(&sdata->deflink);
@@ -3410,7 +3444,7 @@ static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata)
 		if (sdata->deflink.reserved_ready)
 			return 0;
 
-		return ieee80211_link_use_reserved_context(sdata->link[0]);
+		return ieee80211_link_use_reserved_context(&sdata->deflink);
 	}
 
 	if (!cfg80211_chandef_identical(&sdata->vif.bss_conf.chandef,
@@ -3423,7 +3457,7 @@ static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata)
 	if (err)
 		return err;
 
-	ieee80211_link_info_change_notify(sdata, 0, changed);
+	ieee80211_link_info_change_notify(sdata, &sdata->deflink, changed);
 
 	if (sdata->deflink.csa_block_tx) {
 		ieee80211_wake_vif_queues(local, sdata,
@@ -3522,7 +3556,9 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata,
 		csa.n_counter_offsets_presp = params->n_counter_offsets_presp;
 		csa.count = params->count;
 
-		err = ieee80211_assign_beacon(sdata, &params->beacon_csa, &csa, NULL);
+		err = ieee80211_assign_beacon(sdata, &sdata->deflink,
+					      &params->beacon_csa, &csa,
+					      NULL);
 		if (err < 0) {
 			ieee80211_free_next_beacon(&sdata->deflink);
 			return err;
@@ -3675,7 +3711,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 	if (err)
 		goto out;
 
-	err = ieee80211_link_reserve_chanctx(sdata->link[0], &params->chandef,
+	err = ieee80211_link_reserve_chanctx(&sdata->deflink, &params->chandef,
 					     chanctx->mode,
 					     params->radar_required);
 	if (err)
@@ -3684,7 +3720,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 	/* if reservation is invalid then this will fail */
 	err = ieee80211_check_combinations(sdata, NULL, chanctx->mode, 0);
 	if (err) {
-		ieee80211_link_unreserve_chanctx(sdata->link[0]);
+		ieee80211_link_unreserve_chanctx(&sdata->deflink);
 		goto out;
 	}
 
@@ -3694,7 +3730,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 
 	err = ieee80211_set_csa_beacon(sdata, params, &changed);
 	if (err) {
-		ieee80211_link_unreserve_chanctx(sdata->link[0]);
+		ieee80211_link_unreserve_chanctx(&sdata->deflink);
 		goto out;
 	}
 
@@ -3711,7 +3747,8 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 					  params->count, params->block_tx);
 
 	if (changed) {
-		ieee80211_link_info_change_notify(sdata, 0, changed);
+		ieee80211_link_info_change_notify(sdata, &sdata->deflink,
+						  changed);
 		drv_channel_switch_beacon(sdata, &params->chandef);
 	} else {
 		/* if the beacon didn't change, we can finalize immediately */
@@ -3950,12 +3987,19 @@ static int ieee80211_cfg_get_channel(struct wiphy *wiphy,
 	struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
 	struct ieee80211_local *local = wiphy_priv(wiphy);
 	struct ieee80211_chanctx_conf *chanctx_conf;
+	struct ieee80211_link_data *link;
 	int ret = -ENODATA;
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.link_conf[link_id]->chanctx_conf);
+	link = rcu_dereference(sdata->link[link_id]);
+	if (!link) {
+		ret = -ENOLINK;
+		goto out;
+	}
+
+	chanctx_conf = rcu_dereference(link->conf->chanctx_conf);
 	if (chanctx_conf) {
-		*chandef = sdata->vif.link_conf[link_id]->chandef;
+		*chandef = link->conf->chandef;
 		ret = 0;
 	} else if (local->open_count > 0 &&
 		   local->open_count == local->monitors &&
@@ -3966,6 +4010,7 @@ static int ieee80211_cfg_get_channel(struct wiphy *wiphy,
 			*chandef = local->_oper_chandef;
 		ret = 0;
 	}
+out:
 	rcu_read_unlock();
 
 	return ret;
@@ -4009,13 +4054,15 @@ static int ieee80211_set_ap_chanwidth(struct wiphy *wiphy,
 				      struct cfg80211_chan_def *chandef)
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_link_data *link;
 	int ret;
 	u32 changed = 0;
 
-	ret = ieee80211_link_change_bandwidth(sdata->link[link_id], chandef,
-					      &changed);
+	link = sdata_dereference(sdata->link[link_id], sdata);
+
+	ret = ieee80211_link_change_bandwidth(link, chandef, &changed);
 	if (ret == 0)
-		ieee80211_link_info_change_notify(sdata, link_id, changed);
+		ieee80211_link_info_change_notify(sdata, link, changed);
 
 	return ret;
 }
@@ -4358,7 +4405,7 @@ ieee80211_set_after_color_change_beacon(struct ieee80211_sub_if_data *sdata,
 		if (!sdata->deflink.u.ap.next_beacon)
 			return -EINVAL;
 
-		ret = ieee80211_assign_beacon(sdata,
+		ret = ieee80211_assign_beacon(sdata, &sdata->deflink,
 					      sdata->deflink.u.ap.next_beacon,
 					      NULL, NULL);
 		ieee80211_free_next_beacon(&sdata->deflink);
@@ -4401,7 +4448,8 @@ ieee80211_set_color_change_beacon(struct ieee80211_sub_if_data *sdata,
 			params->counter_offset_presp;
 		color_change.count = params->count;
 
-		err = ieee80211_assign_beacon(sdata, &params->beacon_color_change,
+		err = ieee80211_assign_beacon(sdata, &sdata->deflink,
+					      &params->beacon_color_change,
 					      NULL, &color_change);
 		if (err < 0) {
 			ieee80211_free_next_beacon(&sdata->deflink);
@@ -4424,7 +4472,7 @@ ieee80211_color_change_bss_config_notify(struct ieee80211_sub_if_data *sdata,
 	sdata->vif.bss_conf.he_bss_color.enabled = enable;
 	changed |= BSS_CHANGED_HE_BSS_COLOR;
 
-	ieee80211_link_info_change_notify(sdata, 0, changed);
+	ieee80211_link_info_change_notify(sdata, &sdata->deflink, changed);
 
 	if (!sdata->vif.bss_conf.nontransmitted && sdata->vif.mbssid_tx_vif) {
 		struct ieee80211_sub_if_data *child;
@@ -4434,7 +4482,8 @@ ieee80211_color_change_bss_config_notify(struct ieee80211_sub_if_data *sdata,
 			if (child != sdata && child->vif.mbssid_tx_vif == &sdata->vif) {
 				child->vif.bss_conf.he_bss_color.color = color;
 				child->vif.bss_conf.he_bss_color.enabled = enable;
-				ieee80211_link_info_change_notify(child, 0,
+				ieee80211_link_info_change_notify(child,
+								  &child->deflink,
 								  BSS_CHANGED_HE_BSS_COLOR);
 			}
 		}
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index 6853b563fb6c..8d384956fde5 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -67,14 +67,12 @@ static bool ieee80211_can_create_new_chanctx(struct ieee80211_local *local)
 }
 
 static struct ieee80211_chanctx *
-ieee80211_vif_get_chanctx(struct ieee80211_sub_if_data *sdata,
-			  unsigned int link_id)
+ieee80211_link_get_chanctx(struct ieee80211_link_data *link)
 {
-	struct ieee80211_bss_conf *link_conf = sdata->vif.link_conf[link_id];
-	struct ieee80211_local *local __maybe_unused = sdata->local;
+	struct ieee80211_local *local __maybe_unused = link->sdata->local;
 	struct ieee80211_chanctx_conf *conf;
 
-	conf = rcu_dereference_protected(link_conf->chanctx_conf,
+	conf = rcu_dereference_protected(link->conf->chanctx_conf,
 					 lockdep_is_held(&local->chanctx_mtx));
 	if (!conf)
 		return NULL;
@@ -82,12 +80,6 @@ ieee80211_vif_get_chanctx(struct ieee80211_sub_if_data *sdata,
 	return container_of(conf, struct ieee80211_chanctx, conf);
 }
 
-static struct ieee80211_chanctx *
-ieee80211_link_get_chanctx(struct ieee80211_link_data *link)
-{
-	return ieee80211_vif_get_chanctx(link->sdata, link->link_id);
-}
-
 static const struct cfg80211_chan_def *
 ieee80211_chanctx_reserved_chandef(struct ieee80211_local *local,
 				   struct ieee80211_chanctx *ctx,
@@ -122,8 +114,7 @@ ieee80211_chanctx_non_reserved_chandef(struct ieee80211_local *local,
 
 	list_for_each_entry(link, &ctx->assigned_links,
 			    assigned_chanctx_list) {
-		struct ieee80211_bss_conf *link_conf =
-			link->sdata->vif.link_conf[link->link_id];
+		struct ieee80211_bss_conf *link_conf = link->conf;
 
 		if (link->reserved_chanctx)
 			continue;
@@ -254,7 +245,6 @@ ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata,
 	enum nl80211_chan_width max_bw = NL80211_CHAN_WIDTH_20_NOHT;
 	struct sta_info *sta;
 
-	rcu_read_lock();
 	list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) {
 		if (sdata != sta->sdata &&
 		    !(sta->sdata->bss && sta->sdata->bss == sdata->bss))
@@ -262,7 +252,6 @@ ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata,
 
 		max_bw = max(max_bw, ieee80211_get_sta_bw(sta, link_id));
 	}
-	rcu_read_unlock();
 
 	return max_bw;
 }
@@ -275,10 +264,11 @@ ieee80211_get_chanctx_vif_max_required_bw(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_vif *vif = &sdata->vif;
 	int link_id;
 
+	rcu_read_lock();
 	for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) {
 		enum nl80211_chan_width width = NL80211_CHAN_WIDTH_20_NOHT;
 		struct ieee80211_bss_conf *link_conf =
-			sdata->vif.link_conf[link_id];
+			rcu_dereference(sdata->vif.link_conf[link_id]);
 
 		if (!link_conf)
 			continue;
@@ -319,6 +309,7 @@ ieee80211_get_chanctx_vif_max_required_bw(struct ieee80211_sub_if_data *sdata,
 
 		max_bw = max(max_bw, width);
 	}
+	rcu_read_unlock();
 
 	return max_bw;
 }
@@ -345,7 +336,7 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local,
 	/* use the configured bandwidth in case of monitor interface */
 	sdata = rcu_dereference(local->monitor_sdata);
 	if (sdata &&
-	    rcu_access_pointer(sdata->vif.link_conf[0]->chanctx_conf) == conf)
+	    rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) == conf)
 		max_bw = max(max_bw, conf->def.width);
 
 	rcu_read_unlock();
@@ -419,7 +410,7 @@ static void ieee80211_chan_bw_change(struct ieee80211_local *local,
 
 		for (link_id = 0; link_id < ARRAY_SIZE(sta->sdata->link); link_id++) {
 			struct ieee80211_bss_conf *link_conf =
-				sdata->vif.link_conf[link_id];
+				rcu_dereference(sdata->vif.link_conf[link_id]);
 			struct link_sta_info *link_sta;
 
 			if (!link_conf)
@@ -572,8 +563,11 @@ bool ieee80211_is_radar_required(struct ieee80211_local *local)
 		unsigned int link_id;
 
 		for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) {
-			if (sdata->link[link_id] &&
-			    sdata->link[link_id]->radar_required) {
+			struct ieee80211_link_data *link;
+
+			link = rcu_dereference(sdata->link[link_id]);
+
+			if (link && link->radar_required) {
 				rcu_read_unlock();
 				return true;
 			}
@@ -602,15 +596,15 @@ ieee80211_chanctx_radar_required(struct ieee80211_local *local,
 		if (!ieee80211_sdata_running(sdata))
 			continue;
 		for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) {
-			struct ieee80211_bss_conf *link_conf =
-				sdata->vif.link_conf[link_id];
+			struct ieee80211_link_data *link;
 
-			if (!link_conf)
+			link = rcu_dereference(sdata->link[link_id]);
+			if (!link)
 				continue;
 
-			if (rcu_access_pointer(link_conf->chanctx_conf) != conf)
+			if (rcu_access_pointer(link->conf->chanctx_conf) != conf)
 				continue;
-			if (!sdata->link[link_id]->radar_required)
+			if (!link->radar_required)
 				continue;
 			required = true;
 			break;
@@ -774,7 +768,7 @@ void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local,
 
 		for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) {
 			struct ieee80211_bss_conf *link_conf =
-				sdata->vif.link_conf[link_id];
+				rcu_dereference(sdata->vif.link_conf[link_id]);
 
 			if (!link_conf)
 				continue;
@@ -850,7 +844,7 @@ static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link,
 	if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_NAN))
 		return -ENOTSUPP;
 
-	conf = rcu_dereference_protected(sdata->vif.link_conf[link_id]->chanctx_conf,
+	conf = rcu_dereference_protected(link->conf->chanctx_conf,
 					 lockdep_is_held(&local->chanctx_mtx));
 
 	if (conf) {
@@ -872,7 +866,7 @@ static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link,
 	}
 
 out:
-	rcu_assign_pointer(sdata->vif.link_conf[link_id]->chanctx_conf, conf);
+	rcu_assign_pointer(link->conf->chanctx_conf, conf);
 
 	sdata->vif.cfg.idle = !conf;
 
@@ -931,14 +925,14 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local,
 		}
 
 		for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) {
-			struct ieee80211_link_data *link = sdata->link[link_id];
-			struct ieee80211_bss_conf *link_conf =
-				sdata->vif.link_conf[link_id];
+			struct ieee80211_link_data *link;
 
-			if (!link_conf)
+			link = rcu_dereference(sdata->link[link_id]);
+
+			if (!link)
 				continue;
 
-			if (rcu_access_pointer(link_conf->chanctx_conf) != &chanctx->conf)
+			if (rcu_access_pointer(link->conf->chanctx_conf) != &chanctx->conf)
 				continue;
 
 			switch (link->smps_mode) {
@@ -968,7 +962,7 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local,
 	/* Disable SMPS for the monitor interface */
 	sdata = rcu_dereference(local->monitor_sdata);
 	if (sdata &&
-	    rcu_access_pointer(sdata->vif.link_conf[0]->chanctx_conf) == &chanctx->conf)
+	    rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) == &chanctx->conf)
 		rx_chains_dynamic = rx_chains_static = local->rx_chains;
 
 	rcu_read_unlock();
@@ -998,7 +992,7 @@ __ieee80211_link_copy_chanctx_to_vlans(struct ieee80211_link_data *link,
 {
 	struct ieee80211_sub_if_data *sdata = link->sdata;
 	unsigned int link_id = link->link_id;
-	struct ieee80211_bss_conf *link_conf = sdata->vif.link_conf[link_id];
+	struct ieee80211_bss_conf *link_conf = link->conf;
 	struct ieee80211_local *local __maybe_unused = sdata->local;
 	struct ieee80211_sub_if_data *vlan;
 	struct ieee80211_chanctx_conf *conf;
@@ -1021,9 +1015,17 @@ __ieee80211_link_copy_chanctx_to_vlans(struct ieee80211_link_data *link,
 	if (clear)
 		conf = NULL;
 
-	list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
-		rcu_assign_pointer(vlan->vif.link_conf[link_id]->chanctx_conf,
-				   conf);
+	rcu_read_lock();
+	list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) {
+		struct ieee80211_bss_conf *vlan_conf;
+
+		vlan_conf = rcu_dereference(vlan->vif.link_conf[link_id]);
+		if (WARN_ON(!vlan_conf))
+			continue;
+
+		rcu_assign_pointer(vlan_conf->chanctx_conf, conf);
+	}
+	rcu_read_unlock();
 }
 
 void ieee80211_link_copy_chanctx_to_vlans(struct ieee80211_link_data *link,
@@ -1210,21 +1212,29 @@ ieee80211_link_update_chandef(struct ieee80211_link_data *link,
 	unsigned int link_id = link->link_id;
 	struct ieee80211_sub_if_data *vlan;
 
-	sdata->vif.link_conf[link_id]->chandef = *chandef;
+	link->conf->chandef = *chandef;
 
 	if (sdata->vif.type != NL80211_IFTYPE_AP)
 		return;
 
-	list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
-		vlan->vif.link_conf[link_id]->chandef = *chandef;
+	rcu_read_lock();
+	list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) {
+		struct ieee80211_bss_conf *vlan_conf;
+
+		vlan_conf = rcu_dereference(vlan->vif.link_conf[link_id]);
+		if (WARN_ON(!vlan_conf))
+			continue;
+
+		vlan_conf->chandef = *chandef;
+	}
+	rcu_read_unlock();
 }
 
 static int
 ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link)
 {
 	struct ieee80211_sub_if_data *sdata = link->sdata;
-	unsigned int link_id = link->link_id;
-	struct ieee80211_bss_conf *link_conf = sdata->vif.link_conf[link_id];
+	struct ieee80211_bss_conf *link_conf = link->conf;
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_vif_chanctx_switch vif_chsw[1] = {};
 	struct ieee80211_chanctx *old_ctx, *new_ctx;
@@ -1296,7 +1306,7 @@ ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link)
 	ieee80211_recalc_radar_chanctx(local, new_ctx);
 
 	if (changed)
-		ieee80211_link_info_change_notify(sdata, link_id, changed);
+		ieee80211_link_info_change_notify(sdata, link, changed);
 
 out:
 	ieee80211_link_chanctx_reservation_complete(link);
@@ -1307,13 +1317,12 @@ static int
 ieee80211_link_use_reserved_assign(struct ieee80211_link_data *link)
 {
 	struct ieee80211_sub_if_data *sdata = link->sdata;
-	unsigned int link_id = link->link_id;
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_chanctx *old_ctx, *new_ctx;
 	const struct cfg80211_chan_def *chandef;
 	int err;
 
-	old_ctx = ieee80211_vif_get_chanctx(sdata, link_id);
+	old_ctx = ieee80211_link_get_chanctx(link);
 	new_ctx = link->reserved_chanctx;
 
 	if (WARN_ON(!link->reserved_ready))
@@ -1625,8 +1634,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
 		list_for_each_entry(link, &ctx->reserved_links,
 				    reserved_chanctx_list) {
 			struct ieee80211_sub_if_data *sdata = link->sdata;
-			struct ieee80211_bss_conf *link_conf =
-				sdata->vif.link_conf[link->link_id];
+			struct ieee80211_bss_conf *link_conf = link->conf;
 			u32 changed = 0;
 
 			if (!ieee80211_link_has_in_place_reservation(link))
@@ -1649,7 +1657,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
 			ieee80211_link_update_chandef(link, &link->reserved_chandef);
 			if (changed)
 				ieee80211_link_info_change_notify(sdata,
-								  link->link_id,
+								  link,
 								  changed);
 
 			ieee80211_recalc_txpower(sdata, false);
@@ -1746,8 +1754,7 @@ err:
 static void __ieee80211_link_release_channel(struct ieee80211_link_data *link)
 {
 	struct ieee80211_sub_if_data *sdata = link->sdata;
-	unsigned int link_id = link->link_id;
-	struct ieee80211_bss_conf *link_conf = sdata->vif.link_conf[link_id];
+	struct ieee80211_bss_conf *link_conf = link->conf;
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_chanctx_conf *conf;
 	struct ieee80211_chanctx *ctx;
@@ -1786,7 +1793,6 @@ int ieee80211_link_use_channel(struct ieee80211_link_data *link,
 			       enum ieee80211_chanctx_mode mode)
 {
 	struct ieee80211_sub_if_data *sdata = link->sdata;
-	unsigned int link_id = link->link_id;
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_chanctx *ctx;
 	u8 radar_detect_width = 0;
@@ -1806,7 +1812,7 @@ int ieee80211_link_use_channel(struct ieee80211_link_data *link,
 	if (ret > 0)
 		radar_detect_width = BIT(chandef->width);
 
-	sdata->link[link_id]->radar_required = ret;
+	link->radar_required = ret;
 
 	ret = ieee80211_check_combinations(sdata, chandef, mode,
 					   radar_detect_width);
@@ -1910,8 +1916,7 @@ int ieee80211_link_change_bandwidth(struct ieee80211_link_data *link,
 				    u32 *changed)
 {
 	struct ieee80211_sub_if_data *sdata = link->sdata;
-	unsigned int link_id = link->link_id;
-	struct ieee80211_bss_conf *link_conf = sdata->vif.link_conf[link_id];
+	struct ieee80211_bss_conf *link_conf = link->conf;
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_chanctx_conf *conf;
 	struct ieee80211_chanctx *ctx;
@@ -1997,7 +2002,8 @@ void ieee80211_link_vlan_copy_chanctx(struct ieee80211_link_data *link)
 {
 	struct ieee80211_sub_if_data *sdata = link->sdata;
 	unsigned int link_id = link->link_id;
-	struct ieee80211_bss_conf *link_conf = sdata->vif.link_conf[link_id];
+	struct ieee80211_bss_conf *link_conf = link->conf;
+	struct ieee80211_bss_conf *ap_conf;
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_sub_if_data *ap;
 	struct ieee80211_chanctx_conf *conf;
@@ -2009,9 +2015,12 @@ void ieee80211_link_vlan_copy_chanctx(struct ieee80211_link_data *link)
 
 	mutex_lock(&local->chanctx_mtx);
 
-	conf = rcu_dereference_protected(ap->vif.link_conf[link_id]->chanctx_conf,
+	rcu_read_lock();
+	ap_conf = rcu_dereference(ap->vif.link_conf[link_id]);
+	conf = rcu_dereference_protected(ap_conf->chanctx_conf,
 					 lockdep_is_held(&local->chanctx_mtx));
 	rcu_assign_pointer(link_conf->chanctx_conf, conf);
+	rcu_read_unlock();
 	mutex_unlock(&local->chanctx_mtx);
 }
 
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index ead917501d6c..1e5b041a5cea 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -256,7 +256,7 @@ static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata,
 		return -EOPNOTSUPP;
 
 	sdata_lock(sdata);
-	err = __ieee80211_request_smps_mgd(sdata, 0, smps_mode);
+	err = __ieee80211_request_smps_mgd(sdata, &sdata->deflink, smps_mode);
 	sdata_unlock(sdata);
 
 	return err;
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index db38c8cc9d8f..ee3ac1a01bb0 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -167,6 +167,7 @@ static inline void drv_vif_cfg_changed(struct ieee80211_local *local,
 
 static inline void drv_link_info_changed(struct ieee80211_local *local,
 					 struct ieee80211_sub_if_data *sdata,
+					 struct ieee80211_bss_conf *info,
 					 int link_id, u64 changed)
 {
 	might_sleep();
@@ -189,13 +190,13 @@ static inline void drv_link_info_changed(struct ieee80211_local *local,
 	if (!check_sdata_in_driver(sdata))
 		return;
 
-	trace_drv_link_info_changed(local, sdata, link_id, changed);
+	trace_drv_link_info_changed(local, sdata, info, link_id, changed);
 	if (local->ops->link_info_changed)
 		local->ops->link_info_changed(&local->hw, &sdata->vif,
-					      link_id, changed);
+					      info, link_id, changed);
 	else if (local->ops->bss_info_changed)
 		local->ops->bss_info_changed(&local->hw, &sdata->vif,
-					     &sdata->vif.bss_conf, changed);
+					     info, changed);
 	trace_drv_return_void(local);
 }
 
@@ -995,8 +996,7 @@ static inline int drv_start_ap(struct ieee80211_local *local,
 	if (!check_sdata_in_driver(sdata))
 		return -EIO;
 
-	trace_drv_start_ap(local, sdata, sdata->vif.link_conf[link_id],
-			   link_id);
+	trace_drv_start_ap(local, sdata, link_id);
 	if (local->ops->start_ap)
 		ret = local->ops->start_ap(&local->hw, &sdata->vif, link_id);
 	trace_drv_return_int(local, ret);
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 2eb3a409b70f..ea7ce87b7ec4 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -140,12 +140,14 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata,
 				       const struct ieee80211_ht_cap *ht_cap_ie,
 				       struct link_sta_info *link_sta)
 {
+	struct ieee80211_bss_conf *link_conf;
 	struct sta_info *sta = link_sta->sta;
 	struct ieee80211_sta_ht_cap ht_cap, own_cap;
 	u8 ampdu_info, tx_mcs_set_cap;
 	int i, max_tx_streams;
 	bool changed;
 	enum ieee80211_sta_rx_bandwidth bw;
+	enum nl80211_chan_width width;
 
 	memset(&ht_cap, 0, sizeof(ht_cap));
 
@@ -248,7 +250,14 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata,
 
 	memcpy(&link_sta->pub->ht_cap, &ht_cap, sizeof(ht_cap));
 
-	switch (sdata->vif.link_conf[link_sta->link_id]->chandef.width) {
+	rcu_read_lock();
+	link_conf = rcu_dereference(sdata->vif.link_conf[link_sta->link_id]);
+	if (WARN_ON(!link_conf))
+		width = NL80211_CHAN_WIDTH_20_NOHT;
+	else
+		width = link_conf->chandef.width;
+
+	switch (width) {
 	default:
 		WARN_ON_ONCE(1);
 		fallthrough;
@@ -264,6 +273,7 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata,
 				IEEE80211_STA_RX_BW_40 : IEEE80211_STA_RX_BW_20;
 		break;
 	}
+	rcu_read_unlock();
 
 	link_sta->pub->bandwidth = bw;
 
@@ -547,7 +557,7 @@ void ieee80211_request_smps_mgd_work(struct work_struct *work)
 			     u.mgd.request_smps_work);
 
 	sdata_lock(link->sdata);
-	__ieee80211_request_smps_mgd(link->sdata, link->link_id,
+	__ieee80211_request_smps_mgd(link->sdata, link,
 				     link->u.mgd.driver_smps_mode);
 	sdata_unlock(link->sdata);
 }
@@ -556,19 +566,23 @@ void ieee80211_request_smps(struct ieee80211_vif *vif, unsigned int link_id,
 			    enum ieee80211_smps_mode smps_mode)
 {
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
-	struct ieee80211_link_data *link = sdata->link[link_id];
+	struct ieee80211_link_data *link;
 
 	if (WARN_ON_ONCE(vif->type != NL80211_IFTYPE_STATION))
 		return;
 
+	rcu_read_lock();
+	link = rcu_dereference(sdata->link[link_id]);
 	if (WARN_ON(!link))
-		return;
+		goto out;
 
 	if (link->u.mgd.driver_smps_mode == smps_mode)
-		return;
+		goto out;
 
 	link->u.mgd.driver_smps_mode = smps_mode;
 	ieee80211_queue_work(&sdata->local->hw, &link->u.mgd.request_smps_work);
+out:
+	rcu_read_unlock();
 }
 /* this might change ... don't want non-open drivers using it */
 EXPORT_SYMBOL_GPL(ieee80211_request_smps);
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index d30a82f1620b..393c7595bfa4 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -300,7 +300,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	radar_required = err;
 
 	mutex_lock(&local->mtx);
-	if (ieee80211_link_use_channel(sdata->link[0], &chandef,
+	if (ieee80211_link_use_channel(&sdata->deflink, &chandef,
 				       ifibss->fixed_channel ?
 					IEEE80211_CHANCTX_SHARED :
 					IEEE80211_CHANCTX_EXCLUSIVE)) {
@@ -370,7 +370,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 		RCU_INIT_POINTER(ifibss->presp, NULL);
 		kfree_rcu(presp, rcu_head);
 		mutex_lock(&local->mtx);
-		ieee80211_link_release_channel(sdata->link[0]);
+		ieee80211_link_release_channel(&sdata->deflink);
 		mutex_unlock(&local->mtx);
 		sdata_info(sdata, "Failed to join IBSS, driver failure: %d\n",
 			   err);
@@ -722,7 +722,7 @@ static void ieee80211_ibss_disconnect(struct ieee80211_sub_if_data *sdata)
 						BSS_CHANGED_IBSS);
 	drv_leave_ibss(local, sdata);
 	mutex_lock(&local->mtx);
-	ieee80211_link_release_channel(sdata->link[0]);
+	ieee80211_link_release_channel(&sdata->deflink);
 	mutex_unlock(&local->mtx);
 }
 
@@ -1848,7 +1848,7 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata,
 		| IEEE80211_HT_PARAM_RIFS_MODE;
 
 	changed |= BSS_CHANGED_HT | BSS_CHANGED_MCAST_RATE;
-	ieee80211_link_info_change_notify(sdata, 0, changed);
+	ieee80211_link_info_change_notify(sdata, &sdata->deflink, changed);
 
 	sdata->deflink.smps_mode = IEEE80211_SMPS_OFF;
 	sdata->deflink.needed_rx_chains = local->rx_chains;
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 604825dde4fd..f6791b47f78f 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -961,6 +961,8 @@ struct ieee80211_link_data {
 		struct ieee80211_link_data_managed mgd;
 		struct ieee80211_link_data_ap ap;
 	} u;
+
+	struct ieee80211_bss_conf *conf;
 };
 
 struct ieee80211_sub_if_data {
@@ -1045,7 +1047,7 @@ struct ieee80211_sub_if_data {
 	} u;
 
 	struct ieee80211_link_data deflink;
-	struct ieee80211_link_data *link[IEEE80211_MLD_MAX_NUM_LINKS];
+	struct ieee80211_link_data __rcu *link[IEEE80211_MLD_MAX_NUM_LINKS];
 
 #ifdef CONFIG_MAC80211_DEBUGFS
 	struct {
@@ -1544,16 +1546,14 @@ ieee80211_get_sband(struct ieee80211_sub_if_data *sdata)
 }
 
 static inline struct ieee80211_supported_band *
-ieee80211_get_link_sband(struct ieee80211_sub_if_data *sdata, u32 link_id)
+ieee80211_get_link_sband(struct ieee80211_link_data *link)
 {
-	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_local *local = link->sdata->local;
 	struct ieee80211_chanctx_conf *chanctx_conf;
 	enum nl80211_band band;
 
 	rcu_read_lock();
-	chanctx_conf =
-		rcu_dereference(sdata->vif.link_conf[link_id]->chanctx_conf);
-
+	chanctx_conf = rcu_dereference(link->conf->chanctx_conf);
 	if (!chanctx_conf) {
 		rcu_read_unlock();
 		return NULL;
@@ -1721,7 +1721,8 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
 void ieee80211_vif_cfg_change_notify(struct ieee80211_sub_if_data *sdata,
 				     u64 changed);
 void ieee80211_link_info_change_notify(struct ieee80211_sub_if_data *sdata,
-				       int link_id, u64 changed);
+				       struct ieee80211_link_data *link,
+				       u64 changed);
 void ieee80211_configure_filter(struct ieee80211_local *local);
 u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata);
 
@@ -2002,7 +2003,7 @@ ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width);
 enum nl80211_chan_width
 ieee80211_sta_cap_chan_bw(struct link_sta_info *link_sta);
 void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata,
-				 unsigned int link_id,
+				 struct ieee80211_link_data *link,
 				 struct ieee80211_mgmt *mgmt);
 u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
 				  struct link_sta_info *sta,
@@ -2277,10 +2278,10 @@ u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata,
 			    struct ieee802_11_elems *elems,
 			    enum nl80211_band band, u32 *basic_rates);
 int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
-				 unsigned int link_id,
+				 struct ieee80211_link_data *link,
 				 enum ieee80211_smps_mode smps_mode);
 void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata,
-			   unsigned int link_id);
+			   struct ieee80211_link_data *link);
 void ieee80211_recalc_min_chandef(struct ieee80211_sub_if_data *sdata);
 
 size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset);
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index d5e904bff624..55b0a1fa92ab 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -80,7 +80,7 @@ void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata,
 {
 	if (__ieee80211_recalc_txpower(sdata) ||
 	    (update_bss && ieee80211_sdata_running(sdata)))
-		ieee80211_link_info_change_notify(sdata, 0,
+		ieee80211_link_info_change_notify(sdata, &sdata->deflink,
 						  BSS_CHANGED_TXPOWER);
 }
 
@@ -480,7 +480,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do
 		chandef = sdata->vif.bss_conf.chandef;
 		WARN_ON(local->suspended);
 		mutex_lock(&local->mtx);
-		ieee80211_link_release_channel(sdata->link[0]);
+		ieee80211_link_release_channel(&sdata->deflink);
 		mutex_unlock(&local->mtx);
 		cfg80211_cac_event(sdata->dev, &chandef,
 				   NL80211_RADAR_CAC_ABORTED,
@@ -1031,11 +1031,12 @@ static void ieee80211_link_init(struct ieee80211_sub_if_data *sdata,
 	if (link_id < 0)
 		link_id = 0;
 
-	sdata->vif.link_conf[link_id] = link_conf;
-	sdata->link[link_id] = link;
+	rcu_assign_pointer(sdata->vif.link_conf[link_id], link_conf);
+	rcu_assign_pointer(sdata->link[link_id], link);
 
 	link->sdata = sdata;
 	link->link_id = link_id;
+	link->conf = link_conf;
 
 	INIT_WORK(&link->csa_finalize_work,
 		  ieee80211_csa_finalize_work);
@@ -1128,7 +1129,7 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local)
 	mutex_unlock(&local->iflist_mtx);
 
 	mutex_lock(&local->mtx);
-	ret = ieee80211_link_use_channel(sdata->link[0], &local->monitor_chandef,
+	ret = ieee80211_link_use_channel(&sdata->deflink, &local->monitor_chandef,
 					 IEEE80211_CHANCTX_EXCLUSIVE);
 	mutex_unlock(&local->mtx);
 	if (ret) {
@@ -1173,7 +1174,7 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local)
 	synchronize_net();
 
 	mutex_lock(&local->mtx);
-	ieee80211_link_release_channel(sdata->link[0]);
+	ieee80211_link_release_channel(&sdata->deflink);
 	mutex_unlock(&local->mtx);
 
 	drv_remove_interface(local, sdata);
@@ -1279,7 +1280,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
 	case NL80211_IFTYPE_AP_VLAN:
 		/* no need to tell driver, but set carrier and chanctx */
 		if (sdata->bss->active) {
-			ieee80211_link_vlan_copy_chanctx(sdata->link[0]);
+			ieee80211_link_vlan_copy_chanctx(&sdata->deflink);
 			netif_carrier_on(dev);
 			ieee80211_set_vif_encap_ops(sdata);
 		} else {
@@ -1351,7 +1352,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
 		if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE &&
 		    sdata->vif.type != NL80211_IFTYPE_NAN)
 			changed |= ieee80211_reset_erp_info(sdata);
-		ieee80211_link_info_change_notify(sdata, 0, changed);
+		ieee80211_link_info_change_notify(sdata, &sdata->deflink,
+						  changed);
 
 		switch (sdata->vif.type) {
 		case NL80211_IFTYPE_STATION:
@@ -1535,7 +1537,8 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local,
 			break;
 		}
 		case WLAN_VHT_ACTION_GROUPID_MGMT:
-			ieee80211_process_mu_groups(sdata, 0, mgmt);
+			ieee80211_process_mu_groups(sdata, &sdata->deflink,
+						    mgmt);
 			break;
 		default:
 			WARN_ON(1);
@@ -1689,7 +1692,7 @@ static void ieee80211_recalc_smps_work(struct work_struct *work)
 	struct ieee80211_sub_if_data *sdata =
 		container_of(work, struct ieee80211_sub_if_data, recalc_smps);
 
-	ieee80211_recalc_smps(sdata, 0);
+	ieee80211_recalc_smps(sdata, &sdata->deflink);
 }
 
 /*
@@ -2364,6 +2367,7 @@ int ieee80211_vif_set_links(struct ieee80211_sub_if_data *sdata,
 	struct {
 		struct ieee80211_link_data data;
 		struct ieee80211_bss_conf conf;
+		struct rcu_head rcu_head;
 	} *links[IEEE80211_MLD_MAX_NUM_LINKS] = {}, *link;
 	struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS];
 	struct ieee80211_link_data *old_data[IEEE80211_MLD_MAX_NUM_LINKS];
@@ -2394,15 +2398,15 @@ int ieee80211_vif_set_links(struct ieee80211_sub_if_data *sdata,
 	/* link them into data structures */
 	for_each_set_bit(link_id, &add, IEEE80211_MLD_MAX_NUM_LINKS) {
 		WARN_ON(!use_deflink &&
-			sdata->link[link_id] == &sdata->deflink);
+			rcu_access_pointer(sdata->link[link_id]) == &sdata->deflink);
 
 		link = links[link_id];
 		ieee80211_link_init(sdata, link_id, &link->data, &link->conf);
 	}
 
 	for_each_set_bit(link_id, &rem, IEEE80211_MLD_MAX_NUM_LINKS) {
-		sdata->link[link_id] = NULL;
-		sdata->vif.link_conf[link_id] = NULL;
+		RCU_INIT_POINTER(sdata->link[link_id], NULL);
+		RCU_INIT_POINTER(sdata->vif.link_conf[link_id], NULL);
 	}
 
 	sdata->vif.valid_links = new_links;
@@ -2426,20 +2430,20 @@ int ieee80211_vif_set_links(struct ieee80211_sub_if_data *sdata,
 	/* now use this to free the old links */
 	memset(links, 0, sizeof(links));
 	for_each_set_bit(link_id, &rem, IEEE80211_MLD_MAX_NUM_LINKS) {
-		if (sdata->link[link_id] == &sdata->deflink)
+		if (rcu_access_pointer(sdata->link[link_id]) == &sdata->deflink)
 			continue;
 		/*
 		 * we must have allocated the data through this path so
 		 * we know we can free both at the same time
 		 */
-		links[link_id] = container_of(sdata->link[link_id],
+		links[link_id] = container_of(rcu_access_pointer(sdata->link[link_id]),
 					      typeof(*links[link_id]),
 					      data);
 	}
 
 free:
 	for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++)
-		kfree(links[link_id]);
+		kfree_rcu(links[link_id], rcu_head);
 	if (use_deflink)
 		ieee80211_link_init(sdata, -1, &sdata->deflink,
 				    &sdata->vif.bss_conf);
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index c34f06039dda..191f4d35ef60 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -246,9 +246,11 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
 		u64 ch = changed & ~BSS_CHANGED_VIF_CFG_FLAGS;
 
 		/* FIXME: should be for each link */
-		trace_drv_link_info_changed(local, sdata, 0, changed);
+		trace_drv_link_info_changed(local, sdata, &sdata->vif.bss_conf,
+					    0, changed);
 		if (local->ops->link_info_changed)
 			local->ops->link_info_changed(&local->hw, &sdata->vif,
+						      &sdata->vif.bss_conf,
 						      0, ch);
 	}
 
@@ -272,7 +274,8 @@ void ieee80211_vif_cfg_change_notify(struct ieee80211_sub_if_data *sdata,
 }
 
 void ieee80211_link_info_change_notify(struct ieee80211_sub_if_data *sdata,
-				       int link_id, u64 changed)
+				       struct ieee80211_link_data *link,
+				       u64 changed)
 {
 	struct ieee80211_local *local = sdata->local;
 
@@ -284,7 +287,7 @@ void ieee80211_link_info_change_notify(struct ieee80211_sub_if_data *sdata,
 	if (!check_sdata_in_driver(sdata))
 		return;
 
-	drv_link_info_changed(local, sdata, link_id, changed);
+	drv_link_info_changed(local, sdata, link->conf, link->link_id, changed);
 }
 
 u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata)
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 6160211a7eee..ba4e0921fa5d 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -1056,7 +1056,7 @@ int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata)
 	}
 
 	ieee80211_recalc_dtim(local, sdata);
-	ieee80211_link_info_change_notify(sdata, 0, changed);
+	ieee80211_link_info_change_notify(sdata, &sdata->deflink, changed);
 
 	netif_carrier_on(sdata->dev);
 	return 0;
@@ -1080,7 +1080,8 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata)
 	sdata->vif.bss_conf.enable_beacon = false;
 	sdata->beacon_rate_set = false;
 	clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state);
-	ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_BEACON_ENABLED);
+	ieee80211_link_info_change_notify(sdata, &sdata->deflink,
+					  BSS_CHANGED_BEACON_ENABLED);
 
 	/* remove beacon */
 	bcn = sdata_dereference(ifmsh->beacon, sdata);
@@ -1578,7 +1579,7 @@ static void mesh_bss_info_changed(struct ieee80211_sub_if_data *sdata)
 		if (ieee80211_mesh_rebuild_beacon(sdata))
 			return;
 
-	ieee80211_link_info_change_notify(sdata, 0, changed);
+	ieee80211_link_info_change_notify(sdata, &sdata->deflink, changed);
 }
 
 void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata)
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 4175247c325e..52a41416b8bb 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1836,7 +1836,8 @@ void ieee80211_recalc_ps_vif(struct ieee80211_sub_if_data *sdata)
 
 	if (sdata->vif.bss_conf.ps != ps_allowed) {
 		sdata->vif.bss_conf.ps = ps_allowed;
-		ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_PS);
+		ieee80211_link_info_change_notify(sdata, &sdata->deflink,
+						  BSS_CHANGED_PS);
 	}
 }
 
@@ -2032,7 +2033,8 @@ __ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata)
 void ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata)
 {
 	if (__ieee80211_sta_handle_tspec_ac_params(sdata))
-		ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_QOS);
+		ieee80211_link_info_change_notify(sdata, &sdata->deflink,
+						  BSS_CHANGED_QOS);
 }
 
 static void ieee80211_sta_handle_tspec_ac_params_wk(struct work_struct *work)
@@ -2338,7 +2340,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
 	ieee80211_recalc_ps(local);
 	mutex_unlock(&local->iflist_mtx);
 
-	ieee80211_recalc_smps(sdata, 0);
+	ieee80211_recalc_smps(sdata, &sdata->deflink);
 	ieee80211_recalc_ps_vif(sdata);
 
 	netif_carrier_on(sdata->dev);
@@ -2921,7 +2923,8 @@ static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata,
 		sta_info_destroy_addr(sdata, auth_data->bss->bssid);
 
 		eth_zero_addr(sdata->deflink.u.mgd.bssid);
-		ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_BSSID);
+		ieee80211_link_info_change_notify(sdata, &sdata->deflink,
+						  BSS_CHANGED_BSSID);
 		sdata->u.mgd.flags = 0;
 		mutex_lock(&sdata->local->mtx);
 		ieee80211_link_release_channel(&sdata->deflink);
@@ -2950,7 +2953,8 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata,
 		sta_info_destroy_addr(sdata, assoc_data->bss->bssid);
 
 		eth_zero_addr(sdata->deflink.u.mgd.bssid);
-		ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_BSSID);
+		ieee80211_link_info_change_notify(sdata, &sdata->deflink,
+						  BSS_CHANGED_BSSID);
 		sdata->u.mgd.flags = 0;
 		sdata->vif.bss_conf.mu_mimo_owner = false;
 
@@ -4392,7 +4396,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 					       elems->pwr_constr_elem,
 					       elems->cisco_dtpc_elem);
 
-	ieee80211_link_info_change_notify(sdata, 0, changed);
+	ieee80211_link_info_change_notify(sdata, &sdata->deflink,
+					  changed);
 free:
 	kfree(elems);
 }
@@ -5702,9 +5707,10 @@ skip_rates:
 		 * tell driver about BSSID, basic rates and timing
 		 * this was set up above, before setting the channel
 		 */
-		ieee80211_link_info_change_notify(sdata, 0,
-			BSS_CHANGED_BSSID | BSS_CHANGED_BASIC_RATES |
-			BSS_CHANGED_BEACON_INT);
+		ieee80211_link_info_change_notify(sdata, &sdata->deflink,
+						  BSS_CHANGED_BSSID |
+						  BSS_CHANGED_BASIC_RATES |
+						  BSS_CHANGED_BEACON_INT);
 
 		if (assoc)
 			sta_info_pre_move_state(new_sta, IEEE80211_STA_AUTH);
@@ -5870,7 +5876,8 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
 
  err_clear:
 	eth_zero_addr(sdata->deflink.u.mgd.bssid);
-	ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_BSSID);
+	ieee80211_link_info_change_notify(sdata, &sdata->deflink,
+					  BSS_CHANGED_BSSID);
 	ifmgd->auth_data = NULL;
 	mutex_lock(&sdata->local->mtx);
 	ieee80211_link_release_channel(&sdata->deflink);
@@ -6217,7 +6224,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
 	return 0;
  err_clear:
 	eth_zero_addr(sdata->deflink.u.mgd.bssid);
-	ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_BSSID);
+	ieee80211_link_info_change_notify(sdata, &sdata->deflink,
+					  BSS_CHANGED_BSSID);
 	ifmgd->assoc_data = NULL;
  err_free:
 	kfree(assoc_data);
diff --git a/net/mac80211/ocb.c b/net/mac80211/ocb.c
index 0fd29d9c496c..2ca2164a3098 100644
--- a/net/mac80211/ocb.c
+++ b/net/mac80211/ocb.c
@@ -186,7 +186,7 @@ int ieee80211_ocb_join(struct ieee80211_sub_if_data *sdata,
 	sdata->deflink.needed_rx_chains = sdata->local->rx_chains;
 
 	mutex_lock(&sdata->local->mtx);
-	err = ieee80211_link_use_channel(sdata->link[0], &setup->chandef,
+	err = ieee80211_link_use_channel(&sdata->deflink, &setup->chandef,
 					 IEEE80211_CHANCTX_SHARED);
 	mutex_unlock(&sdata->local->mtx);
 	if (err)
@@ -229,7 +229,7 @@ int ieee80211_ocb_leave(struct ieee80211_sub_if_data *sdata)
 	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_OCB);
 
 	mutex_lock(&sdata->local->mtx);
-	ieee80211_link_release_channel(sdata->link[0]);
+	ieee80211_link_release_channel(&sdata->deflink);
 	mutex_unlock(&sdata->local->mtx);
 
 	skb_queue_purge(&sdata->skb_queue);
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index aff5d3c39902..be79ae68754e 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -119,7 +119,8 @@ void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local)
 				&sdata->state);
 			sdata->vif.bss_conf.enable_beacon = false;
 			ieee80211_link_info_change_notify(
-				sdata, 0, BSS_CHANGED_BEACON_ENABLED);
+				sdata, &sdata->deflink,
+				BSS_CHANGED_BEACON_ENABLED);
 		}
 
 		if (sdata->vif.type == NL80211_IFTYPE_STATION &&
@@ -156,7 +157,8 @@ void ieee80211_offchannel_return(struct ieee80211_local *local)
 				       &sdata->state)) {
 			sdata->vif.bss_conf.enable_beacon = true;
 			ieee80211_link_info_change_notify(
-				sdata, 0, BSS_CHANGED_BEACON_ENABLED);
+				sdata, &sdata->deflink,
+				BSS_CHANGED_BEACON_ENABLED);
 		}
 	}
 	mutex_unlock(&local->iflist_mtx);
@@ -848,14 +850,17 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
 		rcu_read_lock();
 		/* Check all the links first */
 		for (i = 0; i < ARRAY_SIZE(sdata->vif.link_conf); i++) {
-			if (!sdata->vif.link_conf[i])
+			struct ieee80211_bss_conf *conf;
+
+			conf = rcu_dereference(sdata->vif.link_conf[i]);
+			if (!conf)
 				continue;
 
-			chanctx_conf = rcu_dereference(sdata->vif.link_conf[i]->chanctx_conf);
+			chanctx_conf = rcu_dereference(conf->chanctx_conf);
 			if (!chanctx_conf)
 				continue;
 
-			if (ether_addr_equal(sdata->vif.link_conf[i]->addr, mgmt->sa))
+			if (ether_addr_equal(conf->addr, mgmt->sa))
 				break;
 
 			chanctx_conf = NULL;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index b7dff3ef9748..c70156e49d0d 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -4156,9 +4156,13 @@ static bool ieee80211_is_our_addr(struct ieee80211_sub_if_data *sdata,
 		return false;
 
 	for (link_id = 0; link_id < ARRAY_SIZE(sdata->vif.link_conf); link_id++) {
-		if (!sdata->vif.link_conf[link_id])
+		struct ieee80211_bss_conf *conf;
+
+		conf = rcu_dereference(sdata->vif.link_conf[link_id]);
+
+		if (!conf)
 			continue;
-		if (ether_addr_equal(sdata->vif.link_conf[link_id]->addr, addr)) {
+		if (ether_addr_equal(conf->addr, addr)) {
 			if (out_link_id)
 				*out_link_id = link_id;
 			return true;
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index a4fa0ce7bd92..20aad688c9c9 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -731,7 +731,8 @@ ieee80211_recalc_p2p_go_ps_allowed(struct ieee80211_sub_if_data *sdata)
 
 	if (allow_p2p_go_ps != sdata->vif.bss_conf.allow_p2p_go_ps) {
 		sdata->vif.bss_conf.allow_p2p_go_ps = allow_p2p_go_ps;
-		ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_P2P_PS);
+		ieee80211_link_info_change_notify(sdata, &sdata->deflink,
+						  BSS_CHANGED_P2P_PS);
 	}
 }
 
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index c531fa17f426..71883ffd7061 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -1336,7 +1336,8 @@ iee80211_tdls_recalc_ht_protection(struct ieee80211_sub_if_data *sdata,
 		return;
 
 	sdata->vif.bss_conf.ht_operation_mode = opmode;
-	ieee80211_link_info_change_notify(sdata, 0, BSS_CHANGED_HT);
+	ieee80211_link_info_change_notify(sdata, &sdata->deflink,
+					  BSS_CHANGED_HT);
 }
 
 int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index f96e7cdca4c2..6aa06fba5b50 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -446,9 +446,10 @@ TRACE_EVENT(drv_vif_cfg_changed,
 TRACE_EVENT(drv_link_info_changed,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
+		 struct ieee80211_bss_conf *link_conf,
 		 int link_id, u64 changed),
 
-	TP_ARGS(local, sdata, link_id, changed),
+	TP_ARGS(local, sdata, link_conf, link_id, changed),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
@@ -481,8 +482,6 @@ TRACE_EVENT(drv_link_info_changed,
 	),
 
 	TP_fast_assign(
-		struct ieee80211_bss_conf *link_conf = sdata->vif.link_conf[link_id];
-
 		LOCAL_ASSIGN;
 		VIF_ASSIGN;
 		__entry->changed = changed;
@@ -1752,10 +1751,9 @@ DEFINE_EVENT(local_sdata_chanctx, drv_unassign_vif_chanctx,
 TRACE_EVENT(drv_start_ap,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
-		 struct ieee80211_bss_conf *info,
 		 unsigned int link_id),
 
-	TP_ARGS(local, sdata, info, link_id),
+	TP_ARGS(local, sdata, link_id),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
@@ -1768,15 +1766,20 @@ TRACE_EVENT(drv_start_ap,
 	),
 
 	TP_fast_assign(
+		struct ieee80211_bss_conf *info =
+			sdata_dereference(sdata->vif.link_conf[link_id], sdata);
+
 		LOCAL_ASSIGN;
 		VIF_ASSIGN;
 		__entry->link_id = link_id;
-		__entry->dtimper = info->dtim_period;
-		__entry->bcnint = info->beacon_int;
+		if (info) {
+			__entry->dtimper = info->dtim_period;
+			__entry->bcnint = info->beacon_int;
+			__entry->hidden_ssid = info->hidden_ssid;
+		}
 		memcpy(__get_dynamic_array(ssid),
 		       sdata->vif.cfg.ssid,
 		       sdata->vif.cfg.ssid_len);
-		__entry->hidden_ssid = info->hidden_ssid;
 	),
 
 	TP_printk(
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 6cd3aeba870f..4a7a714de79e 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -4598,13 +4598,14 @@ void ieee80211_tx_pending(struct tasklet_struct *t)
 /* functions for drivers to get certain frames */
 
 static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata,
+				       struct ieee80211_link_data *link,
 				       struct ps_data *ps, struct sk_buff *skb,
-				       bool is_template, unsigned int link_id)
+				       bool is_template)
 {
 	u8 *pos, *tim;
 	int aid0 = 0;
 	int i, have_bits = 0, n1, n2;
-	struct ieee80211_bss_conf *link_conf = sdata->vif.link_conf[link_id];
+	struct ieee80211_bss_conf *link_conf = link->conf;
 
 	/* Generate bitmap for TIM only if there are any STAs in power save
 	 * mode. */
@@ -4664,8 +4665,9 @@ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata,
 }
 
 static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata,
+				    struct ieee80211_link_data *link,
 				    struct ps_data *ps, struct sk_buff *skb,
-				    bool is_template, unsigned int link_id)
+				    bool is_template)
 {
 	struct ieee80211_local *local = sdata->local;
 
@@ -4677,12 +4679,10 @@ static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata,
 	 * of the tim bitmap in mac80211 and the driver.
 	 */
 	if (local->tim_in_locked_section) {
-		__ieee80211_beacon_add_tim(sdata, ps, skb, is_template,
-					   link_id);
+		__ieee80211_beacon_add_tim(sdata, link, ps, skb, is_template);
 	} else {
 		spin_lock_bh(&local->tim_lock);
-		__ieee80211_beacon_add_tim(sdata, ps, skb, is_template,
-					   link_id);
+		__ieee80211_beacon_add_tim(sdata, link, ps, skb, is_template);
 		spin_unlock_bh(&local->tim_lock);
 	}
 
@@ -4691,7 +4691,7 @@ static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata,
 
 static void ieee80211_set_beacon_cntdwn(struct ieee80211_sub_if_data *sdata,
 					struct beacon_data *beacon,
-					unsigned int link_id)
+					struct ieee80211_link_data *link)
 {
 	u8 *beacon_data, count, max_count = 1;
 	struct probe_resp *resp;
@@ -4716,20 +4716,17 @@ static void ieee80211_set_beacon_cntdwn(struct ieee80211_sub_if_data *sdata,
 		return;
 	}
 
-	rcu_read_lock();
-	resp = rcu_dereference(sdata->link[link_id]->u.ap.probe_resp);
+	resp = rcu_dereference(link->u.ap.probe_resp);
 
 	bcn_offsets = beacon->cntdwn_counter_offsets;
 	count = beacon->cntdwn_current_counter;
-	if (sdata->vif.link_conf[link_id]->csa_active)
+	if (link->conf->csa_active)
 		max_count = IEEE80211_MAX_CNTDWN_COUNTERS_NUM;
 
 	for (i = 0; i < max_count; ++i) {
 		if (bcn_offsets[i]) {
-			if (WARN_ON_ONCE(bcn_offsets[i] >= beacon_data_len)) {
-				rcu_read_unlock();
+			if (WARN_ON_ONCE(bcn_offsets[i] >= beacon_data_len))
 				return;
-			}
 			beacon_data[bcn_offsets[i]] = count;
 		}
 
@@ -4739,7 +4736,6 @@ static void ieee80211_set_beacon_cntdwn(struct ieee80211_sub_if_data *sdata,
 			resp->data[resp_offsets[i]] = count;
 		}
 	}
-	rcu_read_unlock();
 }
 
 static u8 __ieee80211_beacon_update_cntdwn(struct beacon_data *beacon)
@@ -4863,14 +4859,14 @@ EXPORT_SYMBOL(ieee80211_beacon_cntdwn_is_complete);
 static int ieee80211_beacon_protect(struct sk_buff *skb,
 				    struct ieee80211_local *local,
 				    struct ieee80211_sub_if_data *sdata,
-				    unsigned int link_id)
+				    struct ieee80211_link_data *link)
 {
 	ieee80211_tx_result res;
 	struct ieee80211_tx_data tx;
 	struct sk_buff *check_skb;
 
 	memset(&tx, 0, sizeof(tx));
-	tx.key = rcu_dereference(sdata->link[link_id]->default_beacon_key);
+	tx.key = rcu_dereference(link->default_beacon_key);
 	if (!tx.key)
 		return 0;
 	tx.local = local;
@@ -4890,12 +4886,12 @@ static int ieee80211_beacon_protect(struct sk_buff *skb,
 static void
 ieee80211_beacon_get_finish(struct ieee80211_hw *hw,
 			    struct ieee80211_vif *vif,
+			    struct ieee80211_link_data *link,
 			    struct ieee80211_mutable_offsets *offs,
 			    struct beacon_data *beacon,
 			    struct sk_buff *skb,
 			    struct ieee80211_chanctx_conf *chanctx_conf,
-			    u16 csa_off_base,
-			    unsigned int link_id)
+			    u16 csa_off_base)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
@@ -4926,7 +4922,7 @@ ieee80211_beacon_get_finish(struct ieee80211_hw *hw,
 	memset(&txrc, 0, sizeof(txrc));
 	txrc.hw = hw;
 	txrc.sband = local->hw.wiphy->bands[band];
-	txrc.bss_conf = sdata->vif.link_conf[link_id];
+	txrc.bss_conf = link->conf;
 	txrc.skb = skb;
 	txrc.reported_rate.idx = -1;
 	if (sdata->beacon_rate_set && sdata->beacon_rateidx_mask[band])
@@ -4958,11 +4954,11 @@ ieee80211_beacon_add_mbssid(struct sk_buff *skb, struct beacon_data *beacon)
 static struct sk_buff *
 ieee80211_beacon_get_ap(struct ieee80211_hw *hw,
 			struct ieee80211_vif *vif,
+			struct ieee80211_link_data *link,
 			struct ieee80211_mutable_offsets *offs,
 			bool is_template,
 			struct beacon_data *beacon,
-			struct ieee80211_chanctx_conf *chanctx_conf,
-			unsigned int link_id)
+			struct ieee80211_chanctx_conf *chanctx_conf)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
@@ -4975,7 +4971,7 @@ ieee80211_beacon_get_ap(struct ieee80211_hw *hw,
 		if (!is_template)
 			ieee80211_beacon_update_cntdwn(vif);
 
-		ieee80211_set_beacon_cntdwn(sdata, beacon, link_id);
+		ieee80211_set_beacon_cntdwn(sdata, beacon, link);
 	}
 
 	/* headroom, head length,
@@ -4991,7 +4987,7 @@ ieee80211_beacon_get_ap(struct ieee80211_hw *hw,
 	skb_reserve(skb, local->tx_headroom);
 	skb_put_data(skb, beacon->head, beacon->head_len);
 
-	ieee80211_beacon_add_tim(sdata, &ap->ps, skb, is_template, link_id);
+	ieee80211_beacon_add_tim(sdata, link, &ap->ps, skb, is_template);
 
 	if (offs) {
 		offs->tim_offset = beacon->head_len;
@@ -5010,11 +5006,11 @@ ieee80211_beacon_get_ap(struct ieee80211_hw *hw,
 	if (beacon->tail)
 		skb_put_data(skb, beacon->tail, beacon->tail_len);
 
-	if (ieee80211_beacon_protect(skb, local, sdata, link_id) < 0)
+	if (ieee80211_beacon_protect(skb, local, sdata, link) < 0)
 		return NULL;
 
-	ieee80211_beacon_get_finish(hw, vif, offs, beacon, skb, chanctx_conf,
-				    csa_off_base, link_id);
+	ieee80211_beacon_get_finish(hw, vif, link, offs, beacon, skb,
+				    chanctx_conf, csa_off_base);
 	return skb;
 }
 
@@ -5030,12 +5026,16 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
 	struct sk_buff *skb = NULL;
 	struct ieee80211_sub_if_data *sdata = NULL;
 	struct ieee80211_chanctx_conf *chanctx_conf;
+	struct ieee80211_link_data *link;
 
 	rcu_read_lock();
 
 	sdata = vif_to_sdata(vif);
+	link = rcu_dereference(sdata->link[link_id]);
+	if (!link)
+		goto out;
 	chanctx_conf =
-		rcu_dereference(sdata->vif.link_conf[link_id]->chanctx_conf);
+		rcu_dereference(link->conf->chanctx_conf);
 
 	if (!ieee80211_sdata_running(sdata) || !chanctx_conf)
 		goto out;
@@ -5044,12 +5044,12 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
 		memset(offs, 0, sizeof(*offs));
 
 	if (sdata->vif.type == NL80211_IFTYPE_AP) {
-		beacon = rcu_dereference(sdata->link[link_id]->u.ap.beacon);
+		beacon = rcu_dereference(link->u.ap.beacon);
 		if (!beacon)
 			goto out;
 
-		skb = ieee80211_beacon_get_ap(hw, vif, offs, is_template,
-					      beacon, chanctx_conf, link_id);
+		skb = ieee80211_beacon_get_ap(hw, vif, link, offs, is_template,
+					      beacon, chanctx_conf);
 	} else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
 		struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
 		struct ieee80211_hdr *hdr;
@@ -5062,7 +5062,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
 			if (!is_template)
 				__ieee80211_beacon_update_cntdwn(beacon);
 
-			ieee80211_set_beacon_cntdwn(sdata, beacon, link_id);
+			ieee80211_set_beacon_cntdwn(sdata, beacon, link);
 		}
 
 		skb = dev_alloc_skb(local->tx_headroom + beacon->head_len +
@@ -5076,8 +5076,8 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
 		hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
 						 IEEE80211_STYPE_BEACON);
 
-		ieee80211_beacon_get_finish(hw, vif, offs, beacon, skb,
-					    chanctx_conf, 0, link_id);
+		ieee80211_beacon_get_finish(hw, vif, link, offs, beacon, skb,
+					    chanctx_conf, 0);
 	} else if (ieee80211_vif_is_mesh(&sdata->vif)) {
 		struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
 
@@ -5094,7 +5094,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
 				 */
 				__ieee80211_beacon_update_cntdwn(beacon);
 
-			ieee80211_set_beacon_cntdwn(sdata, beacon, link_id);
+			ieee80211_set_beacon_cntdwn(sdata, beacon, link);
 		}
 
 		if (ifmsh->sync_ops)
@@ -5109,8 +5109,8 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
 			goto out;
 		skb_reserve(skb, local->tx_headroom);
 		skb_put_data(skb, beacon->head, beacon->head_len);
-		ieee80211_beacon_add_tim(sdata, &ifmsh->ps, skb, is_template,
-					 link_id);
+		ieee80211_beacon_add_tim(sdata, link, &ifmsh->ps, skb,
+					 is_template);
 
 		if (offs) {
 			offs->tim_offset = beacon->head_len;
@@ -5118,8 +5118,8 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
 		}
 
 		skb_put_data(skb, beacon->tail, beacon->tail_len);
-		ieee80211_beacon_get_finish(hw, vif, offs, beacon, skb,
-					    chanctx_conf, 0, link_id);
+		ieee80211_beacon_get_finish(hw, vif, link, offs, beacon, skb,
+					    chanctx_conf, 0);
 	} else {
 		WARN_ON(1);
 		goto out;
@@ -5616,11 +5616,17 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
 		link = IEEE80211_LINK_UNSPECIFIED;
 	} else {
 		/* otherwise must be addressed from a link */
+		rcu_read_lock();
 		for (link = 0; link < ARRAY_SIZE(sdata->vif.link_conf); link++) {
-			if (memcmp(sdata->vif.link_conf[link]->addr,
-				   hdr->addr2, ETH_ALEN) == 0)
+			struct ieee80211_bss_conf *link_conf;
+
+			link_conf = rcu_dereference(sdata->vif.link_conf[link]);
+			if (!link_conf)
+				continue;
+			if (memcmp(link_conf->addr, hdr->addr2, ETH_ALEN) == 0)
 				break;
 		}
+		rcu_read_unlock();
 
 		if (WARN_ON_ONCE(link == ARRAY_SIZE(sdata->vif.link_conf)))
 			link = ffs(sdata->vif.valid_links) - 1;
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 645f75b0f89f..3e29ef1f81ad 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -1702,7 +1702,8 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
 	    sdata->vif.type != NL80211_IFTYPE_NAN) {
 		sdata->vif.bss_conf.qos = enable_qos;
 		if (bss_notify)
-			ieee80211_link_info_change_notify(sdata, 0,
+			ieee80211_link_info_change_notify(sdata,
+							  &sdata->deflink,
 							  BSS_CHANGED_QOS);
 	}
 }
@@ -2259,7 +2260,7 @@ static void ieee80211_handle_reconfig_failure(struct ieee80211_local *local)
 
 static void ieee80211_assign_chanctx(struct ieee80211_local *local,
 				     struct ieee80211_sub_if_data *sdata,
-				     unsigned int link_id)
+				     struct ieee80211_link_data *link)
 {
 	struct ieee80211_chanctx_conf *conf;
 	struct ieee80211_chanctx *ctx;
@@ -2268,11 +2269,11 @@ static void ieee80211_assign_chanctx(struct ieee80211_local *local,
 		return;
 
 	mutex_lock(&local->chanctx_mtx);
-	conf = rcu_dereference_protected(sdata->vif.link_conf[link_id]->chanctx_conf,
+	conf = rcu_dereference_protected(link->conf->chanctx_conf,
 					 lockdep_is_held(&local->chanctx_mtx));
 	if (conf) {
 		ctx = container_of(conf, struct ieee80211_chanctx, conf);
-		drv_assign_vif_chanctx(local, sdata, link_id, ctx);
+		drv_assign_vif_chanctx(local, sdata, link->link_id, ctx);
 	}
 	mutex_unlock(&local->chanctx_mtx);
 }
@@ -2478,7 +2479,7 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 		sdata = wiphy_dereference(local->hw.wiphy,
 					  local->monitor_sdata);
 		if (sdata && ieee80211_sdata_running(sdata))
-			ieee80211_assign_chanctx(local, sdata, 0);
+			ieee80211_assign_chanctx(local, sdata, &sdata->deflink);
 	}
 
 	/* reconfigure hardware */
@@ -2488,16 +2489,23 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 
 	/* Finally also reconfigure all the BSS information */
 	list_for_each_entry(sdata, &local->interfaces, list) {
-		unsigned int link;
+		unsigned int link_id;
 		u32 changed;
 
 		if (!ieee80211_sdata_running(sdata))
 			continue;
 
-		for (link = 0; link < ARRAY_SIZE(sdata->vif.link_conf); link++) {
-			if (sdata->vif.link_conf[link])
+		sdata_lock(sdata);
+		for (link_id = 0;
+		     link_id < ARRAY_SIZE(sdata->vif.link_conf);
+		     link_id++) {
+			struct ieee80211_link_data *link;
+
+			link = sdata_dereference(sdata->link[link_id], sdata);
+			if (link)
 				ieee80211_assign_chanctx(local, sdata, link);
 		}
+		sdata_unlock(sdata);
 
 		switch (sdata->vif.type) {
 		case NL80211_IFTYPE_AP_VLAN:
@@ -2807,7 +2815,7 @@ void ieee80211_resume_disconnect(struct ieee80211_vif *vif)
 EXPORT_SYMBOL_GPL(ieee80211_resume_disconnect);
 
 void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata,
-			   unsigned int link_id)
+			   struct ieee80211_link_data *link)
 {
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_chanctx_conf *chanctx_conf;
@@ -2815,7 +2823,7 @@ void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata,
 
 	mutex_lock(&local->chanctx_mtx);
 
-	chanctx_conf = rcu_dereference_protected(sdata->vif.link_conf[link_id]->chanctx_conf,
+	chanctx_conf = rcu_dereference_protected(link->conf->chanctx_conf,
 						 lockdep_is_held(&local->chanctx_mtx));
 
 	/*
@@ -3984,7 +3992,7 @@ void ieee80211_dfs_cac_cancel(struct ieee80211_local *local)
 
 		if (sdata->wdev.cac_started) {
 			chandef = sdata->vif.bss_conf.chandef;
-			ieee80211_link_release_channel(sdata->link[0]);
+			ieee80211_link_release_channel(&sdata->deflink);
 			cfg80211_cac_event(sdata->dev,
 					   &chandef,
 					   NL80211_RADAR_CAC_ABORTED,
@@ -4430,13 +4438,11 @@ static u8 ieee80211_chanctx_radar_detect(struct ieee80211_local *local,
 		!list_empty(&ctx->assigned_links));
 
 	list_for_each_entry(link, &ctx->assigned_links, assigned_chanctx_list) {
-		struct ieee80211_sub_if_data *sdata = link->sdata;
-
 		if (!link->radar_required)
 			continue;
 
 		radar_detect |=
-			BIT(sdata->vif.link_conf[link->link_id]->chandef.width);
+			BIT(link->conf->chandef.width);
 	}
 
 	return radar_detect;
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index fa14627b499a..c804890dc623 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -341,39 +341,50 @@ ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta)
 {
 	unsigned int link_id = link_sta->link_id;
 	struct ieee80211_sub_if_data *sdata = link_sta->sta->sdata;
-	struct ieee80211_bss_conf *link_conf = sdata->vif.link_conf[link_id];
 	struct ieee80211_sta_vht_cap *vht_cap = &link_sta->pub->vht_cap;
 	struct ieee80211_sta_he_cap *he_cap = &link_sta->pub->he_cap;
 	struct ieee80211_sta_eht_cap *eht_cap = &link_sta->pub->eht_cap;
 	u32 cap_width;
 
 	if (he_cap->has_he) {
+		struct ieee80211_bss_conf *link_conf;
+		enum ieee80211_sta_rx_bandwidth ret;
 		u8 info;
 
+		rcu_read_lock();
+		link_conf = rcu_dereference(sdata->vif.link_conf[link_id]);
+
 		if (eht_cap->has_eht &&
 		    link_conf->chandef.chan->band == NL80211_BAND_6GHZ) {
 			info = eht_cap->eht_cap_elem.phy_cap_info[0];
 
-			if (info & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ)
-				return IEEE80211_STA_RX_BW_320;
+			if (info & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) {
+				ret = IEEE80211_STA_RX_BW_320;
+				goto out;
+			}
 		}
 
 		info = he_cap->he_cap_elem.phy_cap_info[0];
 
 		if (link_conf->chandef.chan->band == NL80211_BAND_2GHZ) {
 			if (info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G)
-				return IEEE80211_STA_RX_BW_40;
+				ret = IEEE80211_STA_RX_BW_40;
 			else
-				return IEEE80211_STA_RX_BW_20;
+				ret = IEEE80211_STA_RX_BW_20;
+			goto out;
 		}
 
 		if (info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G ||
 		    info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G)
-			return IEEE80211_STA_RX_BW_160;
+			ret = IEEE80211_STA_RX_BW_160;
 		else if (info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G)
-			return IEEE80211_STA_RX_BW_80;
+			ret = IEEE80211_STA_RX_BW_80;
+		else
+			ret = IEEE80211_STA_RX_BW_20;
+out:
+		rcu_read_unlock();
 
-		return IEEE80211_STA_RX_BW_20;
+		return ret;
 	}
 
 	if (!vht_cap->vht_supported)
@@ -481,11 +492,18 @@ enum ieee80211_sta_rx_bandwidth
 ieee80211_sta_cur_vht_bw(struct link_sta_info *link_sta)
 {
 	struct sta_info *sta = link_sta->sta;
-	struct ieee80211_bss_conf *link_conf =
-		sta->sdata->vif.link_conf[link_sta->link_id];
-	enum nl80211_chan_width bss_width = link_conf->chandef.width;
+	struct ieee80211_bss_conf *link_conf;
+	enum nl80211_chan_width bss_width;
 	enum ieee80211_sta_rx_bandwidth bw;
 
+	rcu_read_lock();
+	link_conf = rcu_dereference(sta->sdata->vif.link_conf[link_sta->link_id]);
+	if (WARN_ON(!link_conf))
+		bss_width = NL80211_CHAN_WIDTH_20_NOHT;
+	else
+		bss_width = link_conf->chandef.width;
+	rcu_read_unlock();
+
 	bw = ieee80211_sta_cap_rx_bw(link_sta);
 	bw = min(bw, link_sta->cur_max_bandwidth);
 
@@ -659,10 +677,10 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
 }
 
 void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata,
-				 unsigned int link_id,
+				 struct ieee80211_link_data *link,
 				 struct ieee80211_mgmt *mgmt)
 {
-	struct ieee80211_bss_conf *link_conf = sdata->vif.link_conf[link_id];
+	struct ieee80211_bss_conf *link_conf = link->conf;
 
 	if (!link_conf->mu_mimo_owner)
 		return;
@@ -680,19 +698,25 @@ void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata,
 	       mgmt->u.action.u.vht_group_notif.position,
 	       WLAN_USER_POSITION_LEN);
 
-	ieee80211_link_info_change_notify(sdata, link_id, BSS_CHANGED_MU_GROUPS);
+	ieee80211_link_info_change_notify(sdata, link,
+					  BSS_CHANGED_MU_GROUPS);
 }
 
 void ieee80211_update_mu_groups(struct ieee80211_vif *vif, unsigned int link_id,
 				const u8 *membership, const u8 *position)
 {
-	struct ieee80211_bss_conf *link_conf = vif->link_conf[link_id];
+	struct ieee80211_bss_conf *link_conf;
 
-	if (WARN_ON_ONCE(!link_conf->mu_mimo_owner))
-		return;
+	rcu_read_lock();
+	link_conf = rcu_dereference(vif->link_conf[link_id]);
 
-	memcpy(link_conf->mu_group.membership, membership, WLAN_MEMBERSHIP_LEN);
-	memcpy(link_conf->mu_group.position, position, WLAN_USER_POSITION_LEN);
+	if (!WARN_ON_ONCE(!link_conf || !link_conf->mu_mimo_owner)) {
+		memcpy(link_conf->mu_group.membership, membership,
+		       WLAN_MEMBERSHIP_LEN);
+		memcpy(link_conf->mu_group.position, position,
+		       WLAN_USER_POSITION_LEN);
+	}
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(ieee80211_update_mu_groups);
 
-- 
cgit 


From 23cc6d8c37cdbe3b2b74a922f3311ecb2a5aea6c Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 22 Jun 2022 10:17:23 +0200
Subject: wifi: cfg80211: make cfg80211_auth_request::key_idx signed

We might assign -1 to it in some cases when key is NULL,
which means the key_idx isn't used but can lead to a
warning from static checkers such as smatch.

Make the struct member signed simply to avoid that, we
only need a range of -1..3 anyway.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index c904cbd1c4d6..a4e2cb2378b8 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2773,7 +2773,8 @@ struct cfg80211_auth_request {
 	size_t ie_len;
 	enum nl80211_auth_type auth_type;
 	const u8 *key;
-	u8 key_len, key_idx;
+	u8 key_len;
+	s8 key_idx;
 	const u8 *auth_data;
 	size_t auth_data_len;
 	s8 link_id;
-- 
cgit 


From 1e0b3b0b6cb5e04bcb25fbe4164180d64e3ace07 Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Wed, 27 Apr 2022 18:02:10 +0300
Subject: wifi: mac80211: Align with Draft P802.11be_D1.5

Align the mac80211 implementation with P802.11be_D1.5.

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h  | 52 ++++++++++++++++++---------
 net/mac80211/ieee80211_i.h |  4 +++
 net/mac80211/mlme.c        | 32 +++++++++++++++++
 net/mac80211/util.c        | 87 +++++++++++++++++++++++++++++-----------------
 4 files changed, 128 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index f386f9ed41f3..e75c73ca11ec 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2046,25 +2046,38 @@ struct ieee80211_eht_cap_elem {
 	u8 optional[];
 } __packed;
 
+#define IEEE80211_EHT_OPER_INFO_PRESENT	                        0x1
+#define IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT	0x2
+
 /**
  * struct ieee80211_eht_operation - eht operation element
  *
  * This structure is the "EHT Operation Element" fields as
- * described in P802.11be_D1.4 section 9.4.2.311
+ * described in P802.11be_D1.5 section 9.4.2.311
  *
- * FIXME: The spec is unclear how big the fields are, and doesn't
- *	  indicate the "Disabled Subchannel Bitmap Present" in the
- *	  structure (Figure 9-1002a) at all ...
+ * @params: EHT operation element parameters. See &IEEE80211_EHT_OPER_*
+ * @optional: optional parts
  */
 struct ieee80211_eht_operation {
-	u8 chan_width;
-	u8 ccfs;
-	u8 present_bm;
-
-	u8 disable_subchannel_bitmap[];
+	u8 params;
+	u8 optional[];
 } __packed;
 
-#define IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT	0x1
+/**
+ * struct ieee80211_eht_operation_info - eht operation information
+ *
+ * @control: EHT operation information control.
+ * @ccfs0: defines a channel center frequency for a 20, 40, 80, 160, or 320 MHz
+ *     EHT BSS.
+ * @ccfs1: defines a channel center frequency for a 160 or 320 MHz EHT BSS.
+ * @optional: optional parts
+ */
+struct ieee80211_eht_operation_info {
+	u8 control;
+	u8 ccfs0;
+	u8 ccfs1;
+	u8 optional[];
+} __packed;
 
 /* 802.11ac VHT Capabilities */
 #define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895			0x00000000
@@ -2773,10 +2786,12 @@ ieee80211_he_spr_size(const u8 *he_spr_ie)
 #define IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE2		0x08
 #define IEEE80211_EHT_MAC_CAP0_RESTRICTED_TWT			0x10
 #define IEEE80211_EHT_MAC_CAP0_SCS_TRAFFIC_DESC			0x20
-#define IEEE80211_EHT_MAC_CAP0_MAX_AMPDU_LEN_MASK		0xc0
-#define		IEEE80211_EHT_MAC_CAP0_MAX_AMPDU_LEN_3895	0
-#define		IEEE80211_EHT_MAC_CAP0_MAX_AMPDU_LEN_7991	1
-#define		IEEE80211_EHT_MAC_CAP0_MAX_AMPDU_LEN_11454	2
+#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_MASK		0xc0
+#define	IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_3895	        0
+#define	IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_7991	        1
+#define	IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_11454	        2
+
+#define IEEE80211_EHT_MAC_CAP1_MAX_AMPDU_LEN_MASK		0x01
 
 /* EHT PHY capabilities as defined in P802.11be_D1.4 section 9.4.2.313.3 */
 #define IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ			0x02
@@ -2949,8 +2964,13 @@ ieee80211_eht_oper_size_ok(const u8 *data, u8 len)
 	if (len < needed)
 		return false;
 
-	if (elem->present_bm & IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT)
-		needed += 2;
+	if (elem->params & IEEE80211_EHT_OPER_INFO_PRESENT) {
+		needed += 3;
+
+		if (elem->params &
+		    IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT)
+			needed += 2;
+	}
 
 	return len >= needed;
 }
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index f6791b47f78f..aa1e438ee61e 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -2326,6 +2326,10 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info,
 				const struct ieee80211_vht_operation *oper,
 				const struct ieee80211_ht_operation *htop,
 				struct cfg80211_chan_def *chandef);
+void ieee80211_chandef_eht_oper(struct ieee80211_sub_if_data *sdata,
+				const struct ieee80211_eht_operation *eht_oper,
+				bool support_160, bool support_320,
+				struct cfg80211_chan_def *chandef);
 bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata,
 				    const struct ieee80211_he_operation *he_oper,
 				    const struct ieee80211_eht_operation *eht_oper,
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index e5c058db451d..39f13f3c29bf 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -303,6 +303,38 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
 
 	*chandef = vht_chandef;
 
+	/*
+	 * handle the case that the EHT operation indicates that it holds EHT
+	 * operation information (in case that the channel width differs from
+	 * the channel width reported in HT/VHT/HE).
+	 */
+	if (eht_oper && (eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT)) {
+		struct cfg80211_chan_def eht_chandef = *chandef;
+
+		ieee80211_chandef_eht_oper(sdata, eht_oper,
+					   eht_chandef.width ==
+					   NL80211_CHAN_WIDTH_160,
+					   false, &eht_chandef);
+
+		if (!cfg80211_chandef_valid(&eht_chandef)) {
+			if (!(ifmgd->flags & IEEE80211_STA_DISABLE_EHT))
+				sdata_info(sdata,
+					   "AP EHT information is invalid, disabling EHT\n");
+			ret = IEEE80211_STA_DISABLE_EHT;
+			goto out;
+		}
+
+		if (!cfg80211_chandef_compatible(chandef, &eht_chandef)) {
+			if (!(ifmgd->flags & IEEE80211_STA_DISABLE_EHT))
+				sdata_info(sdata,
+					   "AP EHT information is incompatible, disabling EHT\n");
+			ret = IEEE80211_STA_DISABLE_EHT;
+			goto out;
+		}
+
+		*chandef = eht_chandef;
+	}
+
 	ret = 0;
 
 out:
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 3e29ef1f81ad..924192238042 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -3459,6 +3459,58 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info,
 	return true;
 }
 
+void ieee80211_chandef_eht_oper(struct ieee80211_sub_if_data *sdata,
+				const struct ieee80211_eht_operation *eht_oper,
+				bool support_160, bool support_320,
+				struct cfg80211_chan_def *chandef)
+{
+	struct ieee80211_eht_operation_info *info = (void *)eht_oper->optional;
+
+	chandef->center_freq1 =
+		ieee80211_channel_to_frequency(info->ccfs0,
+					       chandef->chan->band);
+
+	switch (u8_get_bits(info->control,
+			    IEEE80211_EHT_OPER_CHAN_WIDTH)) {
+	case IEEE80211_EHT_OPER_CHAN_WIDTH_20MHZ:
+		chandef->width = NL80211_CHAN_WIDTH_20;
+		break;
+	case IEEE80211_EHT_OPER_CHAN_WIDTH_40MHZ:
+		chandef->width = NL80211_CHAN_WIDTH_40;
+		break;
+	case IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ:
+		chandef->width = NL80211_CHAN_WIDTH_80;
+		break;
+	case IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ:
+		if (support_160) {
+			chandef->width = NL80211_CHAN_WIDTH_160;
+			chandef->center_freq1 =
+				ieee80211_channel_to_frequency(info->ccfs1,
+							       chandef->chan->band);
+		} else {
+			chandef->width = NL80211_CHAN_WIDTH_80;
+		}
+		break;
+	case IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ:
+		if (support_320) {
+			chandef->width = NL80211_CHAN_WIDTH_320;
+			chandef->center_freq1 =
+				ieee80211_channel_to_frequency(info->ccfs1,
+							       chandef->chan->band);
+		} else if (support_160) {
+			chandef->width = NL80211_CHAN_WIDTH_160;
+		} else {
+			chandef->width = NL80211_CHAN_WIDTH_80;
+
+			if (chandef->center_freq1 > chandef->chan->center_freq)
+				chandef->center_freq1 -= 40;
+			else
+				chandef->center_freq1 += 40;
+		}
+		break;
+	}
+}
+
 bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata,
 				    const struct ieee80211_he_operation *he_oper,
 				    const struct ieee80211_eht_operation *eht_oper,
@@ -3539,7 +3591,8 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata,
 		break;
 	}
 
-	if (!eht_oper) {
+	if (!eht_oper ||
+	    !(eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT)) {
 		switch (u8_get_bits(he_6ghz_oper->control,
 				    IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH)) {
 		case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ:
@@ -3583,36 +3636,8 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata,
 		support_320 =
 			eht_phy_cap & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ;
 
-		switch (u8_get_bits(eht_oper->chan_width,
-				    IEEE80211_EHT_OPER_CHAN_WIDTH)) {
-		case IEEE80211_EHT_OPER_CHAN_WIDTH_20MHZ:
-			he_chandef.width = NL80211_CHAN_WIDTH_20;
-			break;
-		case IEEE80211_EHT_OPER_CHAN_WIDTH_40MHZ:
-			he_chandef.width = NL80211_CHAN_WIDTH_40;
-			break;
-		case IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ:
-			he_chandef.width = NL80211_CHAN_WIDTH_80;
-			break;
-		case IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ:
-			if (support_160)
-				he_chandef.width = NL80211_CHAN_WIDTH_160;
-			else
-				he_chandef.width = NL80211_CHAN_WIDTH_80;
-			break;
-		case IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ:
-			if (support_320)
-				he_chandef.width = NL80211_CHAN_WIDTH_320;
-			else if (support_160)
-				he_chandef.width = NL80211_CHAN_WIDTH_160;
-			else
-				he_chandef.width = NL80211_CHAN_WIDTH_80;
-			break;
-		}
-
-		he_chandef.center_freq1 =
-			ieee80211_channel_to_frequency(eht_oper->ccfs,
-						       NL80211_BAND_6GHZ);
+		ieee80211_chandef_eht_oper(sdata, eht_oper, support_160,
+					   support_320, &he_chandef);
 	}
 
 	if (!cfg80211_chandef_valid(&he_chandef)) {
-- 
cgit 


From 062e8e02dfd43c94b0d601c071e8a5c50bed830e Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Wed, 1 Jun 2022 17:43:34 +0300
Subject: wifi: mac80211: Align with Draft P802.11be_D2.0

Align the mac80211 implementation with P802.11be_D2.0.

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/mac80211_hwsim.c |  6 +++---
 include/linux/ieee80211.h             | 23 +++++++++++++++--------
 2 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 737f8ed56d94..f437d8a65268 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -3118,7 +3118,7 @@ static const struct ieee80211_sband_iftype_data sband_capa_2ghz[] = {
 			.has_eht = true,
 			.eht_cap_elem = {
 				.mac_cap_info[0] =
-					IEEE80211_EHT_MAC_CAP0_NSEP_PRIO_ACCESS |
+					IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS |
 					IEEE80211_EHT_MAC_CAP0_OM_CONTROL |
 					IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1,
 				.phy_cap_info[0] =
@@ -3271,7 +3271,7 @@ static const struct ieee80211_sband_iftype_data sband_capa_5ghz[] = {
 			.has_eht = true,
 			.eht_cap_elem = {
 				.mac_cap_info[0] =
-					IEEE80211_EHT_MAC_CAP0_NSEP_PRIO_ACCESS |
+					IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS |
 					IEEE80211_EHT_MAC_CAP0_OM_CONTROL |
 					IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1,
 				.phy_cap_info[0] =
@@ -3453,7 +3453,7 @@ static const struct ieee80211_sband_iftype_data sband_capa_6ghz[] = {
 			.has_eht = true,
 			.eht_cap_elem = {
 				.mac_cap_info[0] =
-					IEEE80211_EHT_MAC_CAP0_NSEP_PRIO_ACCESS |
+					IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS |
 					IEEE80211_EHT_MAC_CAP0_OM_CONTROL |
 					IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1,
 				.phy_cap_info[0] =
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index e75c73ca11ec..cca564372d16 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2020,7 +2020,7 @@ struct ieee80211_eht_mcs_nss_supp_bw {
  * struct ieee80211_eht_cap_elem_fixed - EHT capabilities fixed data
  *
  * This structure is the "EHT Capabilities element" fixed fields as
- * described in P802.11be_D1.4 section 9.4.2.313.
+ * described in P802.11be_D2.0 section 9.4.2.313.
  *
  * @mac_cap_info: MAC capabilities, see IEEE80211_EHT_MAC_CAP*
  * @phy_cap_info: PHY capabilities, see IEEE80211_EHT_PHY_CAP*
@@ -2046,20 +2046,27 @@ struct ieee80211_eht_cap_elem {
 	u8 optional[];
 } __packed;
 
-#define IEEE80211_EHT_OPER_INFO_PRESENT	                        0x1
-#define IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT	0x2
+#define IEEE80211_EHT_OPER_INFO_PRESENT	                        0x01
+#define IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT	0x02
+#define IEEE80211_EHT_OPER_EHT_DEF_PE_DURATION	                0x04
+#define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_LIMIT         0x08
+#define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_EXP_MASK      0x30
 
 /**
  * struct ieee80211_eht_operation - eht operation element
  *
  * This structure is the "EHT Operation Element" fields as
- * described in P802.11be_D1.5 section 9.4.2.311
+ * described in P802.11be_D2.0 section 9.4.2.311
  *
  * @params: EHT operation element parameters. See &IEEE80211_EHT_OPER_*
+ * @basic_mcs_nss: indicates the EHT-MCSs for each number of spatial streams in
+ *     EHT PPDUs that are supported by all EHT STAs in the BSS in transmit and
+ *     receive.
  * @optional: optional parts
  */
 struct ieee80211_eht_operation {
 	u8 params;
+	__le32 basic_mcs_nss;
 	u8 optional[];
 } __packed;
 
@@ -2779,8 +2786,8 @@ ieee80211_he_spr_size(const u8 *he_spr_ie)
 #define S1G_OPER_CH_WIDTH_PRIMARY_1MHZ	BIT(0)
 #define S1G_OPER_CH_WIDTH_OPER		GENMASK(4, 1)
 
-/* EHT MAC capabilities as defined in P802.11be_D1.4 section 9.4.2.313.2 */
-#define IEEE80211_EHT_MAC_CAP0_NSEP_PRIO_ACCESS			0x01
+/* EHT MAC capabilities as defined in P802.11be_D2.0 section 9.4.2.313.2 */
+#define IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS			0x01
 #define IEEE80211_EHT_MAC_CAP0_OM_CONTROL			0x02
 #define IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1		0x04
 #define IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE2		0x08
@@ -2793,7 +2800,7 @@ ieee80211_he_spr_size(const u8 *he_spr_ie)
 
 #define IEEE80211_EHT_MAC_CAP1_MAX_AMPDU_LEN_MASK		0x01
 
-/* EHT PHY capabilities as defined in P802.11be_D1.4 section 9.4.2.313.3 */
+/* EHT PHY capabilities as defined in P802.11be_D2.0 section 9.4.2.313.3 */
 #define IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ			0x02
 #define IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ		0x04
 #define IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI		0x08
@@ -2858,7 +2865,7 @@ ieee80211_he_spr_size(const u8 *he_spr_ie)
 #define IEEE80211_EHT_PHY_CAP8_RX_4096QAM_WIDER_BW_DL_OFDMA	0x02
 
 /*
- * EHT operation channel width as defined in P802.11be_D1.4 section 9.4.2.311
+ * EHT operation channel width as defined in P802.11be_D2.0 section 9.4.2.311
  */
 #define IEEE80211_EHT_OPER_CHAN_WIDTH		0x7
 #define IEEE80211_EHT_OPER_CHAN_WIDTH_20MHZ	0
-- 
cgit 


From 3fbddae46e5fc3f1630c7cf561d88e4ad21c7105 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 24 Jun 2022 14:42:12 +0200
Subject: wifi: mac80211: provide link ID in link_conf

It might be useful to drivers to be able to pass only the
link_conf pointer, rather than both the pointer and the
link_id; add the link_id to the link_conf to facility that.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 ++
 net/mac80211/iface.c   | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 044ed417b06f..877b3eca6db4 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -516,6 +516,7 @@ struct ieee80211_fils_discovery {
  * to that BSS) that can change during the lifetime of the BSS.
  *
  * @addr: (link) address used locally
+ * @link_id: link ID, or 0 for non-MLO
  * @htc_trig_based_pkt_ext: default PE in 4us units, if BSS supports HE
  * @uora_exists: is the UORA element advertised by AP
  * @ack_enabled: indicates support to receive a multi-TID that solicits either
@@ -639,6 +640,7 @@ struct ieee80211_fils_discovery {
  */
 struct ieee80211_bss_conf {
 	const u8 *bssid;
+	unsigned int link_id;
 	u8 addr[ETH_ALEN] __aligned(2);
 	u8 htc_trig_based_pkt_ext;
 	bool uora_exists;
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index fbb9c9c291b9..312a812bec84 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -390,6 +390,7 @@ static void ieee80211_link_init(struct ieee80211_sub_if_data *sdata,
 	link->sdata = sdata;
 	link->link_id = link_id;
 	link->conf = link_conf;
+	link_conf->link_id = link_id;
 
 	INIT_WORK(&link->csa_finalize_work,
 		  ieee80211_csa_finalize_work);
-- 
cgit 


From a3b8008dc1421a6f1d0d92cfc1352d729f6756c1 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 24 Jun 2022 15:02:16 +0200
Subject: wifi: mac80211: move ps setting to vif config

This really shouldn't be in a per-link config, we don't want
to let anyone control it that way (if anything, link powersave
could be forced through APIs to activate/deactivate a link),
and we don't support powersave in software with devices that
can do MLO.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath10k/mac.c                | 2 +-
 drivers/net/wireless/ath/ath11k/mac.c                | 2 +-
 drivers/net/wireless/ath/wcn36xx/main.c              | 2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/power.c       | 4 ++--
 drivers/net/wireless/intel/iwlwifi/mvm/rs.c          | 2 +-
 drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c | 2 +-
 drivers/net/wireless/mediatek/mt76/mt7921/mcu.c      | 2 +-
 drivers/net/wireless/silabs/wfx/sta.c                | 6 +++---
 drivers/net/wireless/ti/wlcore/main.c                | 4 ++--
 include/net/mac80211.h                               | 6 +++---
 net/mac80211/main.c                                  | 1 +
 net/mac80211/mlme.c                                  | 7 +++----
 net/mac80211/trace.h                                 | 4 ++--
 13 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index 3d111d6447f0..b18f32261d15 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -6252,7 +6252,7 @@ static void ath10k_bss_info_changed(struct ieee80211_hw *hw,
 	}
 
 	if (changed & BSS_CHANGED_PS) {
-		arvif->ps = vif->bss_conf.ps;
+		arvif->ps = vif->cfg.ps;
 
 		ret = ath10k_config_ps(ar);
 		if (ret)
diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c
index 17dbc7d9cf29..1cf1e1f18dba 100644
--- a/drivers/net/wireless/ath/ath11k/mac.c
+++ b/drivers/net/wireless/ath/ath11k/mac.c
@@ -3292,7 +3292,7 @@ static void ath11k_mac_op_bss_info_changed(struct ieee80211_hw *hw,
 
 	if (changed & BSS_CHANGED_PS &&
 	    ar->ab->hw_params.supports_sta_ps) {
-		arvif->ps = vif->bss_conf.ps;
+		arvif->ps = vif->cfg.ps;
 
 		ret = ath11k_mac_config_ps(ar);
 		if (ret)
diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c
index ace8641909bd..dc59cafd29e3 100644
--- a/drivers/net/wireless/ath/wcn36xx/main.c
+++ b/drivers/net/wireless/ath/wcn36xx/main.c
@@ -385,7 +385,7 @@ static void wcn36xx_change_ps(struct wcn36xx *wcn, bool enable)
 	list_for_each_entry(tmp, &wcn->vif_list, list) {
 		vif = wcn36xx_priv_to_vif(tmp);
 		if (enable && !wcn->sw_scan) {
-			if (vif->bss_conf.ps) /* ps allowed ? */
+			if (vif->cfg.ps) /* ps allowed ? */
 				wcn36xx_pmc_enter_bmps_state(wcn, vif);
 		} else {
 			wcn36xx_pmc_exit_bmps_state(wcn, vif);
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/power.c b/drivers/net/wireless/intel/iwlwifi/mvm/power.c
index b49f265a421f..f5744162d0d8 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/power.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/power.c
@@ -359,7 +359,7 @@ static void iwl_mvm_power_build_cmd(struct iwl_mvm *mvm,
 
 	cmd->flags |= cpu_to_le16(POWER_FLAGS_POWER_SAVE_ENA_MSK);
 
-	if (!vif->bss_conf.ps || !mvmvif->pm_enabled)
+	if (!vif->cfg.ps || !mvmvif->pm_enabled)
 		return;
 
 	if (iwl_mvm_vif_low_latency(mvmvif) && vif->p2p &&
@@ -890,7 +890,7 @@ static int iwl_mvm_power_set_ba(struct iwl_mvm *mvm,
 
 	mvmvif->bf_data.ba_enabled = !(!mvmvif->pm_enabled ||
 				       mvm->ps_disabled ||
-				       !vif->bss_conf.ps ||
+				       !vif->cfg.ps ||
 				       iwl_mvm_vif_low_latency(mvmvif));
 
 	return _iwl_mvm_enable_beacon_filter(mvm, vif, &cmd, 0);
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
index 303975f9e2b5..a79043f30775 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
@@ -1861,7 +1861,7 @@ static bool rs_tpc_allowed(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 	int index = rate->index;
 	bool cam = (iwlmvm_mod_params.power_scheme == IWL_POWER_SCHEME_CAM);
 	bool sta_ps_disabled = (vif->type == NL80211_IFTYPE_STATION &&
-				!vif->bss_conf.ps);
+				!vif->cfg.ps);
 
 	IWL_DEBUG_RATE(mvm, "cam: %d sta_ps_disabled %d\n",
 		       cam, sta_ps_disabled);
diff --git a/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c b/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c
index d3da54e6670b..389f8c61939d 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c
@@ -193,7 +193,7 @@ int mt76_connac_mcu_set_vif_ps(struct mt76_dev *dev, struct ieee80211_vif *vif)
 			      */
 	} req = {
 		.bss_idx = mvif->idx,
-		.ps_state = vif->bss_conf.ps ? 2 : 0,
+		.ps_state = vif->cfg.ps ? 2 : 0,
 	};
 
 	if (vif->type != NL80211_IFTYPE_STATION)
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c
index 3b5b475b0875..976686075dd7 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c
@@ -972,7 +972,7 @@ int mt7921_mcu_uni_bss_ps(struct mt7921_dev *dev, struct ieee80211_vif *vif)
 		.ps = {
 			.tag = cpu_to_le16(UNI_BSS_INFO_PS),
 			.len = cpu_to_le16(sizeof(struct ps_tlv)),
-			.ps_state = vif->bss_conf.ps ? 2 : 0,
+			.ps_state = vif->cfg.ps ? 2 : 0,
 		},
 	};
 
diff --git a/drivers/net/wireless/silabs/wfx/sta.c b/drivers/net/wireless/silabs/wfx/sta.c
index 0378144795ca..47fd887ce6fe 100644
--- a/drivers/net/wireless/silabs/wfx/sta.c
+++ b/drivers/net/wireless/silabs/wfx/sta.c
@@ -175,7 +175,7 @@ static int wfx_get_ps_timeout(struct wfx_vif *wvif, bool *enable_ps)
 			/* It is useless to enable PS if channels are the same. */
 			if (enable_ps)
 				*enable_ps = false;
-			if (vif->cfg.assoc && vif->bss_conf.ps)
+			if (vif->cfg.assoc && vif->cfg.ps)
 				dev_info(wvif->wdev->dev, "ignoring requested PS mode");
 			return -1;
 		}
@@ -188,8 +188,8 @@ static int wfx_get_ps_timeout(struct wfx_vif *wvif, bool *enable_ps)
 			return 30;
 	}
 	if (enable_ps)
-		*enable_ps = vif->bss_conf.ps;
-	if (vif->cfg.assoc && vif->bss_conf.ps)
+		*enable_ps = vif->cfg.ps;
+	if (vif->cfg.assoc && vif->cfg.ps)
 		return conf->dynamic_ps_timeout;
 	else
 		return -1;
diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c
index a1923ef52d55..d1200080f165 100644
--- a/drivers/net/wireless/ti/wlcore/main.c
+++ b/drivers/net/wireless/ti/wlcore/main.c
@@ -4481,7 +4481,7 @@ static void wl1271_bss_info_changed_sta(struct wl1271 *wl,
 	}
 
 	if (changed & BSS_CHANGED_PS) {
-		if ((bss_conf->ps) &&
+		if (vif->cfg.ps &&
 		    test_bit(WLVIF_FLAG_STA_ASSOCIATED, &wlvif->flags) &&
 		    !test_bit(WLVIF_FLAG_IN_PS, &wlvif->flags)) {
 			int ps_mode;
@@ -4501,7 +4501,7 @@ static void wl1271_bss_info_changed_sta(struct wl1271 *wl,
 			if (ret < 0)
 				wl1271_warning("enter %s ps failed %d",
 					       ps_mode_str, ret);
-		} else if (!bss_conf->ps &&
+		} else if (!vif->cfg.ps &&
 			   test_bit(WLVIF_FLAG_IN_PS, &wlvif->flags)) {
 			wl1271_debug(DEBUG_PSM, "auto ps disabled");
 
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 877b3eca6db4..fba06d7bc7eb 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -578,8 +578,6 @@ struct ieee80211_fils_discovery {
  * @cqm_rssi_high: Connection quality monitor RSSI upper threshold.
  * @cqm_rssi_hyst: Connection quality monitor RSSI hysteresis
  * @qos: This is a QoS-enabled BSS.
- * @ps: power-save mode (STA only). This flag is NOT affected by
- *	offchannel/dynamic_ps operations.
  * @hidden_ssid: The SSID of the current vif is hidden. Only valid in AP-mode.
  * @txpower: TX power in dBm.  INT_MIN means not configured.
  * @txpower_type: TX power adjustment used to control per packet Transmit
@@ -673,7 +671,6 @@ struct ieee80211_bss_conf {
 	struct cfg80211_chan_def chandef;
 	struct ieee80211_mu_group_data mu_group;
 	bool qos;
-	bool ps;
 	bool hidden_ssid;
 	int txpower;
 	enum nl80211_tx_power_setting txpower_type;
@@ -1715,6 +1712,8 @@ enum ieee80211_offload_flags {
  * @assoc: association status
  * @ibss_joined: indicates whether this station is part of an IBSS or not
  * @ibss_creator: indicates if a new IBSS network is being created
+ * @ps: power-save mode (STA only). This flag is NOT affected by
+ *	offchannel/dynamic_ps operations.
  * @aid: association ID number, valid only when @assoc is true
  * @arp_addr_list: List of IPv4 addresses for hardware ARP filtering. The
  *	may filter ARP queries targeted for other addresses than listed here.
@@ -1734,6 +1733,7 @@ struct ieee80211_vif_cfg {
 	/* association related data */
 	bool assoc, ibss_joined;
 	bool ibss_creator;
+	bool ps;
 	u16 aid;
 
 	__be32 arp_addr_list[IEEE80211_BSS_ARP_ADDR_LIST_LEN];
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 191f4d35ef60..1258e506377e 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -202,6 +202,7 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed)
 
 #define BSS_CHANGED_VIF_CFG_FLAGS (BSS_CHANGED_ASSOC |\
 				   BSS_CHANGED_IDLE |\
+				   BSS_CHANGED_PS |\
 				   BSS_CHANGED_IBSS |\
 				   BSS_CHANGED_ARP_FILTER |\
 				   BSS_CHANGED_SSID)
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 299381d19760..f8be05804e59 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1869,10 +1869,9 @@ void ieee80211_recalc_ps_vif(struct ieee80211_sub_if_data *sdata)
 {
 	bool ps_allowed = ieee80211_powersave_allowed(sdata);
 
-	if (sdata->vif.bss_conf.ps != ps_allowed) {
-		sdata->vif.bss_conf.ps = ps_allowed;
-		ieee80211_link_info_change_notify(sdata, &sdata->deflink,
-						  BSS_CHANGED_PS);
+	if (sdata->vif.cfg.ps != ps_allowed) {
+		sdata->vif.cfg.ps = ps_allowed;
+		ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_PS);
 	}
 }
 
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 6aa06fba5b50..751f08b47d0c 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -413,6 +413,7 @@ TRACE_EVENT(drv_vif_cfg_changed,
 		__dynamic_array(u8, ssid, sdata->vif.cfg.ssid_len)
 		__field(int, s1g)
 		__field(bool, idle)
+		__field(bool, ps)
 	),
 
 	TP_fast_assign(
@@ -423,6 +424,7 @@ TRACE_EVENT(drv_vif_cfg_changed,
 		__entry->assoc = sdata->vif.cfg.assoc;
 		__entry->ibss_joined = sdata->vif.cfg.ibss_joined;
 		__entry->ibss_creator = sdata->vif.cfg.ibss_creator;
+		__entry->ps = sdata->vif.cfg.ps;
 
 		__entry->arp_addr_cnt = sdata->vif.cfg.arp_addr_cnt;
 		memcpy(__get_dynamic_array(arp_addr_list),
@@ -475,7 +477,6 @@ TRACE_EVENT(drv_link_info_changed,
 		__field(u32, channel_cfreq1)
 		__field(u32, channel_cfreq1_offset)
 		__field(bool, qos)
-		__field(bool, ps)
 		__field(bool, hidden_ssid)
 		__field(int, txpower)
 		__field(u8, p2p_oppps_ctwindow)
@@ -506,7 +507,6 @@ TRACE_EVENT(drv_link_info_changed,
 		__entry->channel_cfreq1 = link_conf->chandef.center_freq1;
 		__entry->channel_cfreq1_offset = link_conf->chandef.freq1_offset;
 		__entry->qos = link_conf->qos;
-		__entry->ps = link_conf->ps;
 		__entry->hidden_ssid = link_conf->hidden_ssid;
 		__entry->txpower = link_conf->txpower;
 		__entry->p2p_oppps_ctwindow = link_conf->p2p_noa_attr.oppps_ctwindow;
-- 
cgit 


From b3e2130bf5f6c66831707cd51a8b13007137ac37 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 24 Jun 2022 15:40:11 +0200
Subject: wifi: mac80211: change QoS settings API to take link into account

Take the link into account in the QoS settings (EDCA parameters)
APIs.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath10k/mac.c              |  3 +-
 drivers/net/wireless/ath/ath11k/mac.c              |  3 +-
 drivers/net/wireless/ath/ath5k/mac80211-ops.c      |  3 +-
 drivers/net/wireless/ath/ath9k/htc_drv_main.c      |  3 +-
 drivers/net/wireless/ath/ath9k/main.c              |  3 +-
 drivers/net/wireless/ath/carl9170/main.c           |  3 +-
 drivers/net/wireless/broadcom/b43/main.c           |  3 +-
 drivers/net/wireless/broadcom/b43legacy/main.c     |  3 +-
 .../broadcom/brcm80211/brcmsmac/mac80211_if.c      |  3 +-
 drivers/net/wireless/intel/iwlegacy/common.c       |  3 +-
 drivers/net/wireless/intel/iwlegacy/common.h       |  3 +-
 drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c  |  5 +-
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c  |  3 +-
 drivers/net/wireless/intersil/p54/main.c           |  3 +-
 drivers/net/wireless/mac80211_hwsim.c              |  8 ++--
 drivers/net/wireless/marvell/mwl8k.c               |  5 +-
 drivers/net/wireless/mediatek/mt76/mt7603/main.c   |  3 +-
 drivers/net/wireless/mediatek/mt76/mt7615/main.c   |  3 +-
 drivers/net/wireless/mediatek/mt76/mt76x02.h       |  3 +-
 drivers/net/wireless/mediatek/mt76/mt76x02_util.c  |  3 +-
 drivers/net/wireless/mediatek/mt76/mt7915/main.c   |  3 +-
 drivers/net/wireless/mediatek/mt76/mt7921/main.c   |  3 +-
 drivers/net/wireless/mediatek/mt7601u/mt7601u.h    |  3 +-
 drivers/net/wireless/mediatek/mt7601u/tx.c         |  3 +-
 drivers/net/wireless/ralink/rt2x00/rt2400pci.c     |  5 +-
 drivers/net/wireless/ralink/rt2x00/rt2800lib.c     |  5 +-
 drivers/net/wireless/ralink/rt2x00/rt2800lib.h     |  3 +-
 drivers/net/wireless/ralink/rt2x00/rt2x00.h        |  3 +-
 drivers/net/wireless/ralink/rt2x00/rt2x00mac.c     |  3 +-
 drivers/net/wireless/ralink/rt2x00/rt61pci.c       |  5 +-
 drivers/net/wireless/ralink/rt2x00/rt73usb.c       |  5 +-
 drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c |  3 +-
 drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c |  3 +-
 .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c  |  3 +-
 drivers/net/wireless/realtek/rtlwifi/core.c        |  3 +-
 drivers/net/wireless/realtek/rtw88/mac80211.c      |  3 +-
 drivers/net/wireless/realtek/rtw89/mac80211.c      |  3 +-
 drivers/net/wireless/rsi/rsi_91x_mac80211.c        |  3 +-
 drivers/net/wireless/silabs/wfx/sta.c              |  3 +-
 drivers/net/wireless/silabs/wfx/sta.h              |  3 +-
 drivers/net/wireless/st/cw1200/sta.c               |  3 +-
 drivers/net/wireless/st/cw1200/sta.h               |  3 +-
 drivers/net/wireless/ti/wl1251/main.c              |  3 +-
 drivers/net/wireless/ti/wlcore/main.c              |  3 +-
 include/net/mac80211.h                             |  3 +-
 net/mac80211/cfg.c                                 |  7 +--
 net/mac80211/driver-ops.c                          |  8 ++--
 net/mac80211/driver-ops.h                          |  2 +-
 net/mac80211/ibss.c                                |  2 +-
 net/mac80211/ieee80211_i.h                         |  5 +-
 net/mac80211/iface.c                               |  2 +-
 net/mac80211/mlme.c                                | 55 ++++++++++++----------
 net/mac80211/tdls.c                                |  2 +-
 net/mac80211/trace.h                               |  9 ++--
 net/mac80211/util.c                                | 18 +++----
 55 files changed, 158 insertions(+), 104 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index b18f32261d15..ac54396418c9 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -7814,7 +7814,8 @@ exit:
 }
 
 static int ath10k_conf_tx(struct ieee80211_hw *hw,
-			  struct ieee80211_vif *vif, u16 ac,
+			  struct ieee80211_vif *vif,
+			  unsigned int link_id, u16 ac,
 			  const struct ieee80211_tx_queue_params *params)
 {
 	struct ath10k *ar = hw->priv;
diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c
index 1cf1e1f18dba..ec7b3a3629f3 100644
--- a/drivers/net/wireless/ath/ath11k/mac.c
+++ b/drivers/net/wireless/ath/ath11k/mac.c
@@ -4822,7 +4822,8 @@ exit:
 }
 
 static int ath11k_mac_op_conf_tx(struct ieee80211_hw *hw,
-				 struct ieee80211_vif *vif, u16 ac,
+				 struct ieee80211_vif *vif,
+				 unsigned int link_id, u16 ac,
 				 const struct ieee80211_tx_queue_params *params)
 {
 	struct ath11k *ar = hw->priv;
diff --git a/drivers/net/wireless/ath/ath5k/mac80211-ops.c b/drivers/net/wireless/ath/ath5k/mac80211-ops.c
index 8da232e81518..acd0e1dafb7f 100644
--- a/drivers/net/wireless/ath/ath5k/mac80211-ops.c
+++ b/drivers/net/wireless/ath/ath5k/mac80211-ops.c
@@ -572,7 +572,8 @@ ath5k_get_stats(struct ieee80211_hw *hw,
 
 
 static int
-ath5k_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 queue,
+ath5k_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+	      unsigned int link_id, u16 queue,
 	      const struct ieee80211_tx_queue_params *params)
 {
 	struct ath5k_hw *ah = hw->priv;
diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_main.c b/drivers/net/wireless/ath/ath9k/htc_drv_main.c
index 14d713e03872..61875c45366b 100644
--- a/drivers/net/wireless/ath/ath9k/htc_drv_main.c
+++ b/drivers/net/wireless/ath/ath9k/htc_drv_main.c
@@ -1369,7 +1369,8 @@ static void ath9k_htc_sta_rc_update(struct ieee80211_hw *hw,
 }
 
 static int ath9k_htc_conf_tx(struct ieee80211_hw *hw,
-			     struct ieee80211_vif *vif, u16 queue,
+			     struct ieee80211_vif *vif,
+			     unsigned int link_id, u16 queue,
 			     const struct ieee80211_tx_queue_params *params)
 {
 	struct ath9k_htc_priv *priv = hw->priv;
diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index c3d5d9795424..d2c20c332c7a 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -1712,7 +1712,8 @@ static void ath9k_sta_notify(struct ieee80211_hw *hw,
 }
 
 static int ath9k_conf_tx(struct ieee80211_hw *hw,
-			 struct ieee80211_vif *vif, u16 queue,
+			 struct ieee80211_vif *vif,
+			 unsigned int link_id, u16 queue,
 			 const struct ieee80211_tx_queue_params *params)
 {
 	struct ath_softc *sc = hw->priv;
diff --git a/drivers/net/wireless/ath/carl9170/main.c b/drivers/net/wireless/ath/carl9170/main.c
index 3d881028bd00..1540e9827f48 100644
--- a/drivers/net/wireless/ath/carl9170/main.c
+++ b/drivers/net/wireless/ath/carl9170/main.c
@@ -1365,7 +1365,8 @@ static int carl9170_op_sta_remove(struct ieee80211_hw *hw,
 }
 
 static int carl9170_op_conf_tx(struct ieee80211_hw *hw,
-			       struct ieee80211_vif *vif, u16 queue,
+			       struct ieee80211_vif *vif,
+			       unsigned int link_id, u16 queue,
 			       const struct ieee80211_tx_queue_params *param)
 {
 	struct ar9170 *ar = hw->priv;
diff --git a/drivers/net/wireless/broadcom/b43/main.c b/drivers/net/wireless/broadcom/b43/main.c
index 6b4188066f0b..008ee1f98af4 100644
--- a/drivers/net/wireless/broadcom/b43/main.c
+++ b/drivers/net/wireless/broadcom/b43/main.c
@@ -3783,7 +3783,8 @@ static void b43_qos_init(struct b43_wldev *dev)
 }
 
 static int b43_op_conf_tx(struct ieee80211_hw *hw,
-			  struct ieee80211_vif *vif, u16 _queue,
+			  struct ieee80211_vif *vif,
+			  unsigned int link_id, u16 _queue,
 			  const struct ieee80211_tx_queue_params *params)
 {
 	struct b43_wl *wl = hw_to_b43_wl(hw);
diff --git a/drivers/net/wireless/broadcom/b43legacy/main.c b/drivers/net/wireless/broadcom/b43legacy/main.c
index 532013184389..cbc21c7c3138 100644
--- a/drivers/net/wireless/broadcom/b43legacy/main.c
+++ b/drivers/net/wireless/broadcom/b43legacy/main.c
@@ -2505,7 +2505,8 @@ static void b43legacy_op_tx(struct ieee80211_hw *hw,
 }
 
 static int b43legacy_op_conf_tx(struct ieee80211_hw *hw,
-				struct ieee80211_vif *vif, u16 queue,
+				struct ieee80211_vif *vif,
+				unsigned int link_id, u16 queue,
 				const struct ieee80211_tx_queue_params *params)
 {
 	return 0;
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
index 61d7404aded4..a4034d44609b 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
@@ -787,7 +787,8 @@ static void brcms_ops_sw_scan_complete(struct ieee80211_hw *hw,
 }
 
 static int
-brcms_ops_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 queue,
+brcms_ops_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+		  unsigned int link_id, u16 queue,
 		  const struct ieee80211_tx_queue_params *params)
 {
 	struct brcms_info *wl = hw->priv;
diff --git a/drivers/net/wireless/intel/iwlegacy/common.c b/drivers/net/wireless/intel/iwlegacy/common.c
index 6f007e63f0c2..9aa3359a9adc 100644
--- a/drivers/net/wireless/intel/iwlegacy/common.c
+++ b/drivers/net/wireless/intel/iwlegacy/common.c
@@ -4480,7 +4480,8 @@ il_clear_isr_stats(struct il_priv *il)
 }
 
 int
-il_mac_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 queue,
+il_mac_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+	       unsigned int link_id, u16 queue,
 	       const struct ieee80211_tx_queue_params *params)
 {
 	struct il_priv *il = hw->priv;
diff --git a/drivers/net/wireless/intel/iwlegacy/common.h b/drivers/net/wireless/intel/iwlegacy/common.h
index d1383b4f0f05..69687fcf963f 100644
--- a/drivers/net/wireless/intel/iwlegacy/common.h
+++ b/drivers/net/wireless/intel/iwlegacy/common.h
@@ -1683,7 +1683,8 @@ struct il_cfg {
  ***************************/
 
 int il_mac_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-		   u16 queue, const struct ieee80211_tx_queue_params *params);
+		   unsigned int link_id, u16 queue,
+		   const struct ieee80211_tx_queue_params *params);
 int il_mac_tx_last_beacon(struct ieee80211_hw *hw);
 
 void il_set_rxon_hwcrypto(struct il_priv *il, int hw_decrypt);
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c
index e8bd4f0e3d2d..f4070fddc8c7 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c
@@ -2,7 +2,7 @@
 /******************************************************************************
  *
  * Copyright(c) 2003 - 2014 Intel Corporation. All rights reserved.
- * Copyright (C) 2018 - 2019 Intel Corporation
+ * Copyright (C) 2018 - 2019, 2022 Intel Corporation
  *
  * Portions of this file are derived from the ipw3945 project, as well
  * as portions of the ieee80211 subsystem header files.
@@ -1153,7 +1153,8 @@ static int iwlagn_mac_set_tim(struct ieee80211_hw *hw,
 }
 
 static int iwlagn_mac_conf_tx(struct ieee80211_hw *hw,
-			      struct ieee80211_vif *vif, u16 queue,
+			      struct ieee80211_vif *vif,
+			      unsigned int link_id, u16 queue,
 			      const struct ieee80211_tx_queue_params *params)
 {
 	struct iwl_priv *priv = IWL_MAC80211_GET_DVM(hw);
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
index eaffc3163bd1..3de558f286a1 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
@@ -3349,7 +3349,8 @@ static void iwl_mvm_sta_rc_update(struct ieee80211_hw *hw,
 }
 
 static int iwl_mvm_mac_conf_tx(struct ieee80211_hw *hw,
-			       struct ieee80211_vif *vif, u16 ac,
+			       struct ieee80211_vif *vif,
+			       unsigned int link_id, u16 ac,
 			       const struct ieee80211_tx_queue_params *params)
 {
 	struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw);
diff --git a/drivers/net/wireless/intersil/p54/main.c b/drivers/net/wireless/intersil/p54/main.c
index 115be1f3f33d..0f76d43fda33 100644
--- a/drivers/net/wireless/intersil/p54/main.c
+++ b/drivers/net/wireless/intersil/p54/main.c
@@ -404,7 +404,8 @@ static void p54_configure_filter(struct ieee80211_hw *dev,
 }
 
 static int p54_conf_tx(struct ieee80211_hw *dev,
-		       struct ieee80211_vif *vif, u16 queue,
+		       struct ieee80211_vif *vif,
+		       unsigned int link_id, u16 queue,
 		       const struct ieee80211_tx_queue_params *params)
 {
 	struct p54_common *priv = dev->priv;
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index f437d8a65268..5da7a097d518 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -2340,10 +2340,10 @@ static int mac80211_hwsim_set_tim(struct ieee80211_hw *hw,
 	return 0;
 }
 
-static int mac80211_hwsim_conf_tx(
-	struct ieee80211_hw *hw,
-	struct ieee80211_vif *vif, u16 queue,
-	const struct ieee80211_tx_queue_params *params)
+static int mac80211_hwsim_conf_tx(struct ieee80211_hw *hw,
+				  struct ieee80211_vif *vif,
+				  unsigned int link_id, u16 queue,
+				  const struct ieee80211_tx_queue_params *params)
 {
 	wiphy_dbg(hw->wiphy,
 		  "%s (queue=%d txop=%d cw_min=%d cw_max=%d aifs=%d)\n",
diff --git a/drivers/net/wireless/marvell/mwl8k.c b/drivers/net/wireless/marvell/mwl8k.c
index 293bec9bb8dd..c1bf8998109c 100644
--- a/drivers/net/wireless/marvell/mwl8k.c
+++ b/drivers/net/wireless/marvell/mwl8k.c
@@ -5365,7 +5365,8 @@ static int mwl8k_sta_add(struct ieee80211_hw *hw,
 }
 
 static int mwl8k_conf_tx(struct ieee80211_hw *hw,
-			 struct ieee80211_vif *vif, u16 queue,
+			 struct ieee80211_vif *vif,
+			 unsigned int link_id, u16 queue,
 			 const struct ieee80211_tx_queue_params *params)
 {
 	struct mwl8k_priv *priv = hw->priv;
@@ -6050,7 +6051,7 @@ static int mwl8k_reload_firmware(struct ieee80211_hw *hw, char *fw_image)
 		goto fail;
 
 	for (i = 0; i < MWL8K_TX_WMM_QUEUES; i++) {
-		rc = mwl8k_conf_tx(hw, NULL, i, &priv->wmm_params[i]);
+		rc = mwl8k_conf_tx(hw, NULL, 0, i, &priv->wmm_params[i]);
 		if (rc)
 			goto fail;
 	}
diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/main.c b/drivers/net/wireless/mediatek/mt76/mt7603/main.c
index 088c0a4cf774..051715ed90dd 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7603/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7603/main.c
@@ -527,7 +527,8 @@ mt7603_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
 }
 
 static int
-mt7603_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 queue,
+mt7603_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+	       unsigned int link_id, u16 queue,
 	       const struct ieee80211_tx_queue_params *params)
 {
 	struct mt7603_dev *dev = hw->priv;
diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/main.c b/drivers/net/wireless/mediatek/mt76/mt7615/main.c
index 277c22a4d049..bf1696eb568a 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7615/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7615/main.c
@@ -494,7 +494,8 @@ static int mt7615_config(struct ieee80211_hw *hw, u32 changed)
 }
 
 static int
-mt7615_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 queue,
+mt7615_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+	       unsigned int link_id, u16 queue,
 	       const struct ieee80211_tx_queue_params *params)
 {
 	struct mt76_vif *mvif = (struct mt76_vif *)vif->drv_priv;
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02.h b/drivers/net/wireless/mediatek/mt76/mt76x02.h
index 74ad418f5a70..50eaeff11af3 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x02.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76x02.h
@@ -156,7 +156,8 @@ int mt76x02_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
 		    struct ieee80211_vif *vif, struct ieee80211_sta *sta,
 		    struct ieee80211_key_conf *key);
 int mt76x02_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-		    u16 queue, const struct ieee80211_tx_queue_params *params);
+		    unsigned int link_id, u16 queue,
+		    const struct ieee80211_tx_queue_params *params);
 void mt76x02_sta_rate_tbl_update(struct ieee80211_hw *hw,
 				 struct ieee80211_vif *vif,
 				 struct ieee80211_sta *sta);
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_util.c b/drivers/net/wireless/mediatek/mt76/mt76x02_util.c
index a0e2d042751b..604ddcc21123 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x02_util.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x02_util.c
@@ -487,7 +487,8 @@ int mt76x02_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
 EXPORT_SYMBOL_GPL(mt76x02_set_key);
 
 int mt76x02_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-		    u16 queue, const struct ieee80211_tx_queue_params *params)
+		    unsigned int link_id, u16 queue,
+		    const struct ieee80211_tx_queue_params *params)
 {
 	struct mt76x02_dev *dev = hw->priv;
 	u8 cw_min = 5, cw_max = 10, qid;
diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/main.c b/drivers/net/wireless/mediatek/mt76/mt7915/main.c
index fbeac9aa2af6..25323a155a88 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7915/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7915/main.c
@@ -478,7 +478,8 @@ static int mt7915_config(struct ieee80211_hw *hw, u32 changed)
 }
 
 static int
-mt7915_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 queue,
+mt7915_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+	       unsigned int link_id, u16 queue,
 	       const struct ieee80211_tx_queue_params *params)
 {
 	struct mt7915_vif *mvif = (struct mt7915_vif *)vif->drv_priv;
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
index 63583605d1cd..1d0daa0c90be 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
@@ -567,7 +567,8 @@ out:
 }
 
 static int
-mt7921_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 queue,
+mt7921_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+	       unsigned int link_id, u16 queue,
 	       const struct ieee80211_tx_queue_params *params)
 {
 	struct mt7921_vif *mvif = (struct mt7921_vif *)vif->drv_priv;
diff --git a/drivers/net/wireless/mediatek/mt7601u/mt7601u.h b/drivers/net/wireless/mediatek/mt7601u/mt7601u.h
index a122f1dd38f6..118d43707853 100644
--- a/drivers/net/wireless/mediatek/mt7601u/mt7601u.h
+++ b/drivers/net/wireless/mediatek/mt7601u/mt7601u.h
@@ -368,7 +368,8 @@ void mt7601u_mac_set_ampdu_factor(struct mt7601u_dev *dev);
 void mt7601u_tx(struct ieee80211_hw *hw, struct ieee80211_tx_control *control,
 		struct sk_buff *skb);
 int mt7601u_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-		    u16 queue, const struct ieee80211_tx_queue_params *params);
+		    unsigned int link_id, u16 queue,
+		    const struct ieee80211_tx_queue_params *params);
 void mt7601u_tx_status(struct mt7601u_dev *dev, struct sk_buff *skb);
 void mt7601u_tx_stat(struct work_struct *work);
 
diff --git a/drivers/net/wireless/mediatek/mt7601u/tx.c b/drivers/net/wireless/mediatek/mt7601u/tx.c
index f1fa0442a57f..51d977ffc52f 100644
--- a/drivers/net/wireless/mediatek/mt7601u/tx.c
+++ b/drivers/net/wireless/mediatek/mt7601u/tx.c
@@ -258,7 +258,8 @@ void mt7601u_tx_stat(struct work_struct *work)
 }
 
 int mt7601u_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-		    u16 queue, const struct ieee80211_tx_queue_params *params)
+		    unsigned int link_id, u16 queue,
+		    const struct ieee80211_tx_queue_params *params)
 {
 	struct mt7601u_dev *dev = hw->priv;
 	u8 cw_min = 5, cw_max = 10, hw_q = q2hwq(queue);
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2400pci.c b/drivers/net/wireless/ralink/rt2x00/rt2400pci.c
index dec6ffdf07c4..273c5eac3362 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2400pci.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2400pci.c
@@ -1654,7 +1654,8 @@ static int rt2400pci_probe_hw(struct rt2x00_dev *rt2x00dev)
  * IEEE80211 stack callback functions.
  */
 static int rt2400pci_conf_tx(struct ieee80211_hw *hw,
-			     struct ieee80211_vif *vif, u16 queue,
+			     struct ieee80211_vif *vif,
+			     unsigned int link_id, u16 queue,
 			     const struct ieee80211_tx_queue_params *params)
 {
 	struct rt2x00_dev *rt2x00dev = hw->priv;
@@ -1667,7 +1668,7 @@ static int rt2400pci_conf_tx(struct ieee80211_hw *hw,
 	if (queue != 0)
 		return -EINVAL;
 
-	if (rt2x00mac_conf_tx(hw, vif, queue, params))
+	if (rt2x00mac_conf_tx(hw, vif, link_id, queue, params))
 		return -EINVAL;
 
 	/*
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c
index cbdaf7992f98..18102fbe36d6 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c
@@ -10395,7 +10395,8 @@ int rt2800_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
 EXPORT_SYMBOL_GPL(rt2800_set_rts_threshold);
 
 int rt2800_conf_tx(struct ieee80211_hw *hw,
-		   struct ieee80211_vif *vif, u16 queue_idx,
+		   struct ieee80211_vif *vif,
+		   unsigned int link_id, u16 queue_idx,
 		   const struct ieee80211_tx_queue_params *params)
 {
 	struct rt2x00_dev *rt2x00dev = hw->priv;
@@ -10411,7 +10412,7 @@ int rt2800_conf_tx(struct ieee80211_hw *hw,
 	 * we are free to update the registers based on the value
 	 * in the queue parameter.
 	 */
-	retval = rt2x00mac_conf_tx(hw, vif, queue_idx, params);
+	retval = rt2x00mac_conf_tx(hw, vif, link_id, queue_idx, params);
 	if (retval)
 		return retval;
 
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800lib.h b/drivers/net/wireless/ralink/rt2x00/rt2800lib.h
index 1139405c0ebb..e1761f467b94 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800lib.h
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800lib.h
@@ -245,7 +245,8 @@ void rt2800_get_key_seq(struct ieee80211_hw *hw,
 			struct ieee80211_key_seq *seq);
 int rt2800_set_rts_threshold(struct ieee80211_hw *hw, u32 value);
 int rt2800_conf_tx(struct ieee80211_hw *hw,
-		   struct ieee80211_vif *vif, u16 queue_idx,
+		   struct ieee80211_vif *vif,
+		   unsigned int link_id, u16 queue_idx,
 		   const struct ieee80211_tx_queue_params *params);
 u64 rt2800_get_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
 int rt2800_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00.h b/drivers/net/wireless/ralink/rt2x00/rt2x00.h
index 918e0477bb7d..f7ef2ca38794 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2x00.h
+++ b/drivers/net/wireless/ralink/rt2x00/rt2x00.h
@@ -1481,7 +1481,8 @@ void rt2x00mac_bss_info_changed(struct ieee80211_hw *hw,
 				struct ieee80211_bss_conf *bss_conf,
 				u64 changes);
 int rt2x00mac_conf_tx(struct ieee80211_hw *hw,
-		      struct ieee80211_vif *vif, u16 queue,
+		      struct ieee80211_vif *vif,
+		      unsigned int link_id, u16 queue,
 		      const struct ieee80211_tx_queue_params *params);
 void rt2x00mac_rfkill_poll(struct ieee80211_hw *hw);
 void rt2x00mac_flush(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c b/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c
index 6205d22765c7..c0ab2e6a29ad 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c
@@ -665,7 +665,8 @@ void rt2x00mac_bss_info_changed(struct ieee80211_hw *hw,
 EXPORT_SYMBOL_GPL(rt2x00mac_bss_info_changed);
 
 int rt2x00mac_conf_tx(struct ieee80211_hw *hw,
-		      struct ieee80211_vif *vif, u16 queue_idx,
+		      struct ieee80211_vif *vif,
+		      unsigned int link_id, u16 queue_idx,
 		      const struct ieee80211_tx_queue_params *params)
 {
 	struct rt2x00_dev *rt2x00dev = hw->priv;
diff --git a/drivers/net/wireless/ralink/rt2x00/rt61pci.c b/drivers/net/wireless/ralink/rt2x00/rt61pci.c
index 82cfc2aadc2b..d92f9eb07dc9 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt61pci.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt61pci.c
@@ -2799,7 +2799,8 @@ static int rt61pci_probe_hw(struct rt2x00_dev *rt2x00dev)
  * IEEE80211 stack callback functions.
  */
 static int rt61pci_conf_tx(struct ieee80211_hw *hw,
-			   struct ieee80211_vif *vif, u16 queue_idx,
+			   struct ieee80211_vif *vif,
+			   unsigned int link_id, u16 queue_idx,
 			   const struct ieee80211_tx_queue_params *params)
 {
 	struct rt2x00_dev *rt2x00dev = hw->priv;
@@ -2815,7 +2816,7 @@ static int rt61pci_conf_tx(struct ieee80211_hw *hw,
 	 * we are free to update the registers based on the value
 	 * in the queue parameter.
 	 */
-	retval = rt2x00mac_conf_tx(hw, vif, queue_idx, params);
+	retval = rt2x00mac_conf_tx(hw, vif, link_id, queue_idx, params);
 	if (retval)
 		return retval;
 
diff --git a/drivers/net/wireless/ralink/rt2x00/rt73usb.c b/drivers/net/wireless/ralink/rt2x00/rt73usb.c
index 5ff2c740c3ea..e3269fd7c59e 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt73usb.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt73usb.c
@@ -2218,7 +2218,8 @@ static int rt73usb_probe_hw(struct rt2x00_dev *rt2x00dev)
  * IEEE80211 stack callback functions.
  */
 static int rt73usb_conf_tx(struct ieee80211_hw *hw,
-			   struct ieee80211_vif *vif, u16 queue_idx,
+			   struct ieee80211_vif *vif,
+			   unsigned int link_id, u16 queue_idx,
 			   const struct ieee80211_tx_queue_params *params)
 {
 	struct rt2x00_dev *rt2x00dev = hw->priv;
@@ -2234,7 +2235,7 @@ static int rt73usb_conf_tx(struct ieee80211_hw *hw,
 	 * we are free to update the registers based on the value
 	 * in the queue parameter.
 	 */
-	retval = rt2x00mac_conf_tx(hw, vif, queue_idx, params);
+	retval = rt2x00mac_conf_tx(hw, vif, link_id, queue_idx, params);
 	if (retval)
 		return retval;
 
diff --git a/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c b/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c
index f66cc9083972..cdfe08078c57 100644
--- a/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c
+++ b/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c
@@ -1424,7 +1424,8 @@ static void rtl8187se_conf_ac_parm(struct ieee80211_hw *dev, u8 queue)
 }
 
 static int rtl8180_conf_tx(struct ieee80211_hw *dev,
-			    struct ieee80211_vif *vif, u16 queue,
+			    struct ieee80211_vif *vif,
+			    unsigned int link_id, u16 queue,
 			    const struct ieee80211_tx_queue_params *params)
 {
 	struct rtl8180_priv *priv = dev->priv;
diff --git a/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c b/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c
index edc84f9dc984..c0f6e9c6d03e 100644
--- a/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c
+++ b/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c
@@ -1338,7 +1338,8 @@ static void rtl8187_configure_filter(struct ieee80211_hw *dev,
 }
 
 static int rtl8187_conf_tx(struct ieee80211_hw *dev,
-			   struct ieee80211_vif *vif, u16 queue,
+			   struct ieee80211_vif *vif,
+			   unsigned int link_id, u16 queue,
 			   const struct ieee80211_tx_queue_params *params)
 {
 	struct rtl8187_priv *priv = dev->priv;
diff --git a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c
index 65c4cb1e030c..33a1d9143038 100644
--- a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c
+++ b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c
@@ -5957,7 +5957,8 @@ exit:
 }
 
 static int rtl8xxxu_conf_tx(struct ieee80211_hw *hw,
-			    struct ieee80211_vif *vif, u16 queue,
+			    struct ieee80211_vif *vif,
+			    unsigned int link_id, u16 queue,
 			    const struct ieee80211_tx_queue_params *param)
 {
 	struct rtl8xxxu_priv *priv = hw->priv;
diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c b/drivers/net/wireless/realtek/rtlwifi/core.c
index 8537cc6d7a89..519b2643f9f4 100644
--- a/drivers/net/wireless/realtek/rtlwifi/core.c
+++ b/drivers/net/wireless/realtek/rtlwifi/core.c
@@ -982,7 +982,8 @@ static int _rtl_get_hal_qnum(u16 queue)
  *for rtl819x  BE = 0, BK = 1, VI = 2, VO = 3
  */
 static int rtl_op_conf_tx(struct ieee80211_hw *hw,
-			  struct ieee80211_vif *vif, u16 queue,
+			  struct ieee80211_vif *vif,
+			  unsigned int link_id, u16 queue,
 			  const struct ieee80211_tx_queue_params *param)
 {
 	struct rtl_priv *rtlpriv = rtl_priv(hw);
diff --git a/drivers/net/wireless/realtek/rtw88/mac80211.c b/drivers/net/wireless/realtek/rtw88/mac80211.c
index ba60ca7ecdbf..bb7291665d37 100644
--- a/drivers/net/wireless/realtek/rtw88/mac80211.c
+++ b/drivers/net/wireless/realtek/rtw88/mac80211.c
@@ -443,7 +443,8 @@ static int rtw_ops_start_ap(struct ieee80211_hw *hw,
 }
 
 static int rtw_ops_conf_tx(struct ieee80211_hw *hw,
-			   struct ieee80211_vif *vif, u16 ac,
+			   struct ieee80211_vif *vif,
+			   unsigned int link_id, u16 ac,
 			   const struct ieee80211_tx_queue_params *params)
 {
 	struct rtw_dev *rtwdev = hw->priv;
diff --git a/drivers/net/wireless/realtek/rtw89/mac80211.c b/drivers/net/wireless/realtek/rtw89/mac80211.c
index 20fb4c550010..f40569c8575c 100644
--- a/drivers/net/wireless/realtek/rtw89/mac80211.c
+++ b/drivers/net/wireless/realtek/rtw89/mac80211.c
@@ -427,7 +427,8 @@ static int rtw89_ops_set_tim(struct ieee80211_hw *hw, struct ieee80211_sta *sta,
 }
 
 static int rtw89_ops_conf_tx(struct ieee80211_hw *hw,
-			     struct ieee80211_vif *vif, u16 ac,
+			     struct ieee80211_vif *vif,
+			     unsigned int link_id, u16 ac,
 			     const struct ieee80211_tx_queue_params *params)
 {
 	struct rtw89_dev *rtwdev = hw->priv;
diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
index 1dff3d263382..bf39c4bda26f 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
@@ -895,7 +895,8 @@ static void rsi_mac80211_conf_filter(struct ieee80211_hw *hw,
  * Return: 0 on success, negative error code on failure.
  */
 static int rsi_mac80211_conf_tx(struct ieee80211_hw *hw,
-				struct ieee80211_vif *vif, u16 queue,
+				struct ieee80211_vif *vif,
+				unsigned int link_id, u16 queue,
 				const struct ieee80211_tx_queue_params *params)
 {
 	struct rsi_hw *adapter = hw->priv;
diff --git a/drivers/net/wireless/silabs/wfx/sta.c b/drivers/net/wireless/silabs/wfx/sta.c
index 47fd887ce6fe..89402afe4fe6 100644
--- a/drivers/net/wireless/silabs/wfx/sta.c
+++ b/drivers/net/wireless/silabs/wfx/sta.c
@@ -216,7 +216,8 @@ int wfx_update_pm(struct wfx_vif *wvif)
 }
 
 int wfx_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-		u16 queue, const struct ieee80211_tx_queue_params *params)
+		unsigned int link_id, u16 queue,
+		const struct ieee80211_tx_queue_params *params)
 {
 	struct wfx_dev *wdev = hw->priv;
 	struct wfx_vif *wvif = (struct wfx_vif *)vif->drv_priv;
diff --git a/drivers/net/wireless/silabs/wfx/sta.h b/drivers/net/wireless/silabs/wfx/sta.h
index 93ea992de1d7..6558c5698cc8 100644
--- a/drivers/net/wireless/silabs/wfx/sta.h
+++ b/drivers/net/wireless/silabs/wfx/sta.h
@@ -36,7 +36,8 @@ void wfx_stop_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 int wfx_join_ibss(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
 void wfx_leave_ibss(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
 int wfx_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-		u16 queue, const struct ieee80211_tx_queue_params *params);
+		unsigned int link_id, u16 queue,
+		const struct ieee80211_tx_queue_params *params);
 void wfx_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 			  struct ieee80211_bss_conf *info, u64 changed);
 int wfx_sta_add(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta);
diff --git a/drivers/net/wireless/st/cw1200/sta.c b/drivers/net/wireless/st/cw1200/sta.c
index a3f6942df0c1..26d3614519b1 100644
--- a/drivers/net/wireless/st/cw1200/sta.c
+++ b/drivers/net/wireless/st/cw1200/sta.c
@@ -606,7 +606,8 @@ void cw1200_configure_filter(struct ieee80211_hw *dev,
 }
 
 int cw1200_conf_tx(struct ieee80211_hw *dev, struct ieee80211_vif *vif,
-		   u16 queue, const struct ieee80211_tx_queue_params *params)
+		   unsigned int link_id, u16 queue,
+		   const struct ieee80211_tx_queue_params *params)
 {
 	struct cw1200_common *priv = dev->priv;
 	int ret = 0;
diff --git a/drivers/net/wireless/st/cw1200/sta.h b/drivers/net/wireless/st/cw1200/sta.h
index 05e3ab7ccef1..a49f187c7049 100644
--- a/drivers/net/wireless/st/cw1200/sta.h
+++ b/drivers/net/wireless/st/cw1200/sta.h
@@ -28,7 +28,8 @@ void cw1200_configure_filter(struct ieee80211_hw *dev,
 			     unsigned int *total_flags,
 			     u64 multicast);
 int cw1200_conf_tx(struct ieee80211_hw *dev, struct ieee80211_vif *vif,
-		   u16 queue, const struct ieee80211_tx_queue_params *params);
+		   unsigned int link_id, u16 queue,
+		   const struct ieee80211_tx_queue_params *params);
 int cw1200_get_stats(struct ieee80211_hw *dev,
 		     struct ieee80211_low_level_stats *stats);
 int cw1200_set_key(struct ieee80211_hw *dev, enum set_key_cmd cmd,
diff --git a/drivers/net/wireless/ti/wl1251/main.c b/drivers/net/wireless/ti/wl1251/main.c
index d76558964420..9144ef5538a8 100644
--- a/drivers/net/wireless/ti/wl1251/main.c
+++ b/drivers/net/wireless/ti/wl1251/main.c
@@ -1282,7 +1282,8 @@ static struct ieee80211_channel wl1251_channels[] = {
 };
 
 static int wl1251_op_conf_tx(struct ieee80211_hw *hw,
-			     struct ieee80211_vif *vif, u16 queue,
+			     struct ieee80211_vif *vif,
+			     unsigned int link_id, u16 queue,
 			     const struct ieee80211_tx_queue_params *params)
 {
 	enum wl1251_acx_ps_scheme ps_scheme;
diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c
index d1200080f165..1edec9f3c0d8 100644
--- a/drivers/net/wireless/ti/wlcore/main.c
+++ b/drivers/net/wireless/ti/wlcore/main.c
@@ -4864,7 +4864,8 @@ out:
 }
 
 static int wl1271_op_conf_tx(struct ieee80211_hw *hw,
-			     struct ieee80211_vif *vif, u16 queue,
+			     struct ieee80211_vif *vif,
+			     unsigned int link_id, u16 queue,
 			     const struct ieee80211_tx_queue_params *params)
 {
 	struct wl1271 *wl = hw->priv;
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index fba06d7bc7eb..7cf8d58eb5f0 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4174,7 +4174,8 @@ struct ieee80211_ops {
 			       struct ieee80211_sta *sta,
 			       struct station_info *sinfo);
 	int (*conf_tx)(struct ieee80211_hw *hw,
-		       struct ieee80211_vif *vif, u16 ac,
+		       struct ieee80211_vif *vif,
+		       unsigned int link_id, u16 ac,
 		       const struct ieee80211_tx_queue_params *params);
 	u64 (*get_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
 	void (*set_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index aa3a1568c7fc..fa3b6cc2cafa 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -2570,6 +2570,7 @@ static int ieee80211_set_txq_params(struct wiphy *wiphy,
 {
 	struct ieee80211_local *local = wiphy_priv(wiphy);
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_link_data *link = &sdata->deflink;
 	struct ieee80211_tx_queue_params p;
 
 	if (!local->ops->conf_tx)
@@ -2592,15 +2593,15 @@ static int ieee80211_set_txq_params(struct wiphy *wiphy,
 
 	ieee80211_regulatory_limit_wmm_params(sdata, &p, params->ac);
 
-	sdata->tx_conf[params->ac] = p;
-	if (drv_conf_tx(local, sdata, params->ac, &p)) {
+	link->tx_conf[params->ac] = p;
+	if (drv_conf_tx(local, link, params->ac, &p)) {
 		wiphy_debug(local->hw.wiphy,
 			    "failed to set TX queue parameters for AC %d\n",
 			    params->ac);
 		return -EINVAL;
 	}
 
-	ieee80211_link_info_change_notify(sdata, &sdata->deflink,
+	ieee80211_link_info_change_notify(sdata, link,
 					  BSS_CHANGED_QOS);
 
 	return 0;
diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c
index 48322e45e7dd..9b61dc7889c2 100644
--- a/net/mac80211/driver-ops.c
+++ b/net/mac80211/driver-ops.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright 2015 Intel Deutschland GmbH
+ * Copyright (C) 2022 Intel Corporation
  */
 #include <net/mac80211.h>
 #include "ieee80211_i.h"
@@ -180,9 +181,10 @@ void drv_sta_rc_update(struct ieee80211_local *local,
 }
 
 int drv_conf_tx(struct ieee80211_local *local,
-		struct ieee80211_sub_if_data *sdata, u16 ac,
+		struct ieee80211_link_data *link, u16 ac,
 		const struct ieee80211_tx_queue_params *params)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
 	int ret = -EOPNOTSUPP;
 
 	might_sleep();
@@ -201,10 +203,10 @@ int drv_conf_tx(struct ieee80211_local *local,
 		return -EINVAL;
 	}
 
-	trace_drv_conf_tx(local, sdata, ac, params);
+	trace_drv_conf_tx(local, sdata, link->link_id, ac, params);
 	if (local->ops->conf_tx)
 		ret = local->ops->conf_tx(&local->hw, &sdata->vif,
-					  ac, params);
+					  link->link_id, ac, params);
 	trace_drv_return_int(local, ret);
 	return ret;
 }
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index ee3ac1a01bb0..eb16a354338c 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -590,7 +590,7 @@ static inline void drv_sta_statistics(struct ieee80211_local *local,
 }
 
 int drv_conf_tx(struct ieee80211_local *local,
-		struct ieee80211_sub_if_data *sdata, u16 ac,
+		struct ieee80211_link_data *link, u16 ac,
 		const struct ieee80211_tx_queue_params *params);
 
 u64 drv_get_tsf(struct ieee80211_local *local,
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 561abcf4def9..0a1d51c60530 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -356,7 +356,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	else
 		sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE;
 
-	ieee80211_set_wmm_default(sdata, true, false);
+	ieee80211_set_wmm_default(&sdata->deflink, true, false);
 
 	sdata->vif.cfg.ibss_joined = true;
 	sdata->vif.cfg.ibss_creator = creator;
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 05996df627ea..699dabaa9f0d 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -969,6 +969,8 @@ struct ieee80211_link_data {
 		struct ieee80211_link_data_ap ap;
 	} u;
 
+	struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS];
+
 	struct ieee80211_bss_conf *conf;
 };
 
@@ -1012,7 +1014,6 @@ struct ieee80211_sub_if_data {
 	bool control_port_over_nl80211;
 
 	atomic_t num_tx_queued;
-	struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS];
 	struct mac80211_qos_map __rcu *qos_map;
 
 	/* used to reconfigure hardware SM PS */
@@ -2101,7 +2102,7 @@ int ieee80211_frame_duration(enum nl80211_band band, size_t len,
 void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata,
 					   struct ieee80211_tx_queue_params *qparam,
 					   int ac);
-void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
+void ieee80211_set_wmm_default(struct ieee80211_link_data *link,
 			       bool bss_notify, bool enable_qos);
 void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
 		    struct sta_info *sta, struct sk_buff *skb);
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 312a812bec84..f29764936c99 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1539,7 +1539,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
 		 * doesn't start up with sane defaults.
 		 * Enable QoS for anything but station interfaces.
 		 */
-		ieee80211_set_wmm_default(sdata, true,
+		ieee80211_set_wmm_default(&sdata->deflink, true,
 			sdata->vif.type != NL80211_IFTYPE_STATION);
 	}
 
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 5400b3e567fa..44f76a8fa80b 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2018,10 +2018,11 @@ __ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata)
 		switch (tx_tspec->action) {
 		case TX_TSPEC_ACTION_STOP_DOWNGRADE:
 			/* take the original parameters */
-			if (drv_conf_tx(local, sdata, ac, &sdata->tx_conf[ac]))
-				sdata_err(sdata,
-					  "failed to set TX queue parameters for queue %d\n",
-					  ac);
+			if (drv_conf_tx(local, &sdata->deflink, ac,
+					&sdata->deflink.tx_conf[ac]))
+				link_err(&sdata->deflink,
+					 "failed to set TX queue parameters for queue %d\n",
+					 ac);
 			tx_tspec->action = TX_TSPEC_ACTION_NONE;
 			tx_tspec->downgraded = false;
 			ret = true;
@@ -2047,11 +2048,11 @@ __ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata)
 			 */
 			if (non_acm_ac >= IEEE80211_NUM_ACS)
 				non_acm_ac = IEEE80211_AC_BK;
-			if (drv_conf_tx(local, sdata, ac,
-					&sdata->tx_conf[non_acm_ac]))
-				sdata_err(sdata,
-					  "failed to set TX queue parameters for queue %d\n",
-					  ac);
+			if (drv_conf_tx(local, &sdata->deflink, ac,
+					&sdata->deflink.tx_conf[non_acm_ac]))
+				link_err(&sdata->deflink,
+					 "failed to set TX queue parameters for queue %d\n",
+					 ac);
 			tx_tspec->action = TX_TSPEC_ACTION_NONE;
 			ret = true;
 			schedule_delayed_work(&ifmgd->tx_tspec_wk,
@@ -2085,10 +2086,11 @@ static void ieee80211_sta_handle_tspec_ac_params_wk(struct work_struct *work)
 /* MLME */
 static bool
 ieee80211_sta_wmm_params(struct ieee80211_local *local,
-			 struct ieee80211_sub_if_data *sdata,
+			 struct ieee80211_link_data *link,
 			 const u8 *wmm_param, size_t wmm_param_len,
 			 const struct ieee80211_mu_edca_param_set *mu_edca)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
 	struct ieee80211_tx_queue_params params[IEEE80211_NUM_ACS];
 	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
 	size_t left;
@@ -2117,11 +2119,11 @@ ieee80211_sta_wmm_params(struct ieee80211_local *local,
 	 * the driver about it.
 	 */
 	mu_edca_count = mu_edca ? mu_edca->mu_qos_info & 0x0f : -1;
-	if (count == sdata->deflink.u.mgd.wmm_last_param_set &&
-	    mu_edca_count == sdata->deflink.u.mgd.mu_edca_last_param_set)
+	if (count == link->u.mgd.wmm_last_param_set &&
+	    mu_edca_count == link->u.mgd.mu_edca_last_param_set)
 		return false;
-	sdata->deflink.u.mgd.wmm_last_param_set = count;
-	sdata->deflink.u.mgd.mu_edca_last_param_set = mu_edca_count;
+	link->u.mgd.wmm_last_param_set = count;
+	link->u.mgd.mu_edca_last_param_set = mu_edca_count;
 
 	pos = wmm_param + 8;
 	left = wmm_param_len - 8;
@@ -2219,16 +2221,16 @@ ieee80211_sta_wmm_params(struct ieee80211_local *local,
 			 params[ac].aifs, params[ac].cw_min, params[ac].cw_max,
 			 params[ac].txop, params[ac].uapsd,
 			 ifmgd->tx_tspec[ac].downgraded);
-		sdata->tx_conf[ac] = params[ac];
+		link->tx_conf[ac] = params[ac];
 		if (!ifmgd->tx_tspec[ac].downgraded &&
-		    drv_conf_tx(local, sdata, ac, &params[ac]))
-			sdata_err(sdata,
-				  "failed to set TX queue parameters for AC %d\n",
-				  ac);
+		    drv_conf_tx(local, link, ac, &params[ac]))
+			link_err(link,
+				 "failed to set TX queue parameters for AC %d\n",
+				 ac);
 	}
 
 	/* enable WMM or activate new settings */
-	sdata->vif.bss_conf.qos = true;
+	link->conf->qos = true;
 	return true;
 }
 
@@ -2508,7 +2510,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 	ieee80211_bss_info_change_notify(sdata, changed);
 
 	/* disassociated - set to defaults now */
-	ieee80211_set_wmm_default(sdata, false, false);
+	ieee80211_set_wmm_default(&sdata->deflink, false, false);
 
 	del_timer_sync(&sdata->u.mgd.conn_mon_timer);
 	del_timer_sync(&sdata->u.mgd.bcn_mon_timer);
@@ -3766,12 +3768,13 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
 	sdata->deflink.u.mgd.mu_edca_last_param_set = -1;
 
 	if (ifmgd->flags & IEEE80211_STA_DISABLE_WMM) {
-		ieee80211_set_wmm_default(sdata, false, false);
-	} else if (!ieee80211_sta_wmm_params(local, sdata, elems->wmm_param,
+		ieee80211_set_wmm_default(&sdata->deflink, false, false);
+	} else if (!ieee80211_sta_wmm_params(local, &sdata->deflink,
+					     elems->wmm_param,
 					     elems->wmm_param_len,
 					     elems->mu_edca_param_set)) {
 		/* still enable QoS since we might have HT/VHT */
-		ieee80211_set_wmm_default(sdata, false, true);
+		ieee80211_set_wmm_default(&sdata->deflink, false, true);
 		/* set the disable-WMM flag in this case to disable
 		 * tracking WMM parameter changes in the beacon if
 		 * the parameters weren't actually valid. Doing so
@@ -3938,7 +3941,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 		/* get uapsd queues configuration */
 		uapsd_queues = 0;
 		for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
-			if (sdata->tx_conf[ac].uapsd)
+			if (sdata->deflink.tx_conf[ac].uapsd)
 				uapsd_queues |= ieee80211_ac_to_qos_mask[ac];
 
 		info.success = 1;
@@ -4363,7 +4366,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 					 elems, true);
 
 	if (!(ifmgd->flags & IEEE80211_STA_DISABLE_WMM) &&
-	    ieee80211_sta_wmm_params(local, sdata, elems->wmm_param,
+	    ieee80211_sta_wmm_params(local, &sdata->deflink, elems->wmm_param,
 				     elems->wmm_param_len,
 				     elems->mu_edca_param_set))
 		changed |= BSS_CHANGED_QOS;
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index 30f2b340017c..e7bdcdb488e2 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -293,7 +293,7 @@ static void ieee80211_tdls_add_wmm_param_ie(struct ieee80211_sub_if_data *sdata,
 	 * doesn't support it, as mandated by 802.11-2012 section 10.22.4
 	 */
 	for (i = 0; i < IEEE80211_NUM_ACS; i++) {
-		txq = &sdata->tx_conf[ieee80211_ac_from_wmm(i)];
+		txq = &sdata->deflink.tx_conf[ieee80211_ac_from_wmm(i)];
 		wmm->ac[i].aci_aifsn = ieee80211_wmm_aci_aifsn(txq->aifs,
 							       txq->acm, i);
 		wmm->ac[i].cw = ieee80211_wmm_ecw(txq->cw_min, txq->cw_max);
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 751f08b47d0c..b6f12ac91849 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -1003,13 +1003,15 @@ DEFINE_EVENT(sta_event, drv_sta_rate_tbl_update,
 TRACE_EVENT(drv_conf_tx,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
+		 unsigned int link_id,
 		 u16 ac, const struct ieee80211_tx_queue_params *params),
 
-	TP_ARGS(local, sdata, ac, params),
+	TP_ARGS(local, sdata, link_id, ac, params),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
 		VIF_ENTRY
+		__field(unsigned int, link_id)
 		__field(u16, ac)
 		__field(u16, txop)
 		__field(u16, cw_min)
@@ -1021,6 +1023,7 @@ TRACE_EVENT(drv_conf_tx,
 	TP_fast_assign(
 		LOCAL_ASSIGN;
 		VIF_ASSIGN;
+		__entry->link_id = link_id;
 		__entry->ac = ac;
 		__entry->txop = params->txop;
 		__entry->cw_max = params->cw_max;
@@ -1030,8 +1033,8 @@ TRACE_EVENT(drv_conf_tx,
 	),
 
 	TP_printk(
-		LOCAL_PR_FMT  VIF_PR_FMT  " AC:%d",
-		LOCAL_PR_ARG, VIF_PR_ARG, __entry->ac
+		LOCAL_PR_FMT  VIF_PR_FMT  " link_id: %d, AC:%d",
+		LOCAL_PR_ARG, VIF_PR_ARG, __entry->link_id, __entry->ac
 	)
 );
 
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index f06514087511..2b2c8e1472f3 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -1596,9 +1596,10 @@ void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata,
 	rcu_read_unlock();
 }
 
-void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
+void ieee80211_set_wmm_default(struct ieee80211_link_data *link,
 			       bool bss_notify, bool enable_qos)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_tx_queue_params qparam;
 	struct ieee80211_chanctx_conf *chanctx_conf;
@@ -1616,7 +1617,7 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
 	memset(&qparam, 0, sizeof(qparam));
 
 	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
+	chanctx_conf = rcu_dereference(link->conf->chanctx_conf);
 	use_11b = (chanctx_conf &&
 		   chanctx_conf->def.chan->band == NL80211_BAND_2GHZ) &&
 		 !(sdata->flags & IEEE80211_SDATA_OPERATING_GMODE);
@@ -1693,17 +1694,16 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
 
 		qparam.uapsd = false;
 
-		sdata->tx_conf[ac] = qparam;
-		drv_conf_tx(local, sdata, ac, &qparam);
+		link->tx_conf[ac] = qparam;
+		drv_conf_tx(local, link, ac, &qparam);
 	}
 
 	if (sdata->vif.type != NL80211_IFTYPE_MONITOR &&
 	    sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE &&
 	    sdata->vif.type != NL80211_IFTYPE_NAN) {
-		sdata->vif.bss_conf.qos = enable_qos;
+		link->conf->qos = enable_qos;
 		if (bss_notify)
-			ieee80211_link_info_change_notify(sdata,
-							  &sdata->deflink,
+			ieee80211_link_info_change_notify(sdata, link,
 							  BSS_CHANGED_QOS);
 	}
 }
@@ -2520,8 +2520,8 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 			fallthrough;
 		case NL80211_IFTYPE_AP: /* AP stations are handled later */
 			for (i = 0; i < IEEE80211_NUM_ACS; i++)
-				drv_conf_tx(local, sdata, i,
-					    &sdata->tx_conf[i]);
+				drv_conf_tx(local, &sdata->deflink, i,
+					    &sdata->deflink.tx_conf[i]);
 			break;
 		}
 
-- 
cgit 


From b65567b03c9502e67bed6707cb53a4c730c2bee2 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 24 Jun 2022 16:08:39 +0200
Subject: wifi: mac80211: mlme: track AP (MLD) address separately

To prepare a bit more for MLO in the client code,
track the AP's address (for now only the BSSID, but
will track the AP MLD's address later) separately
from the per-link BSSID.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h |  3 +++
 net/mac80211/mlme.c    | 30 ++++++++++++++++--------------
 2 files changed, 19 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 7cf8d58eb5f0..36eba96a1012 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1728,6 +1728,8 @@ enum ieee80211_offload_flags {
  * @idle: This interface is idle. There's also a global idle flag in the
  *	hardware config which may be more appropriate depending on what
  *	your driver/device needs to do.
+ * @ap_addr: AP MLD address, or BSSID for non-MLO connections
+ *	(station mode only)
  */
 struct ieee80211_vif_cfg {
 	/* association related data */
@@ -1742,6 +1744,7 @@ struct ieee80211_vif_cfg {
 	size_t ssid_len;
 	bool s1g;
 	bool idle;
+	u8 ap_addr[ETH_ALEN] __aligned(2);
 };
 
 /**
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 7eab15920a70..a2e8fe9b43ab 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1808,7 +1808,7 @@ static bool ieee80211_powersave_allowed(struct ieee80211_sub_if_data *sdata)
 		return false;
 
 	rcu_read_lock();
-	sta = sta_info_get(sdata, sdata->deflink.u.mgd.bssid);
+	sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr);
 	if (sta)
 		authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED);
 	rcu_read_unlock();
@@ -2313,6 +2313,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
 	sdata->u.mgd.associated = true;
 	sdata->deflink.u.mgd.bss = cbss;
 	memcpy(sdata->deflink.u.mgd.bssid, cbss->bssid, ETH_ALEN);
+	memcpy(sdata->vif.cfg.ap_addr, cbss->bssid, ETH_ALEN);
 
 	ieee80211_check_rate_mask(sdata);
 
@@ -2463,6 +2464,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 	/* clear bssid only after building the needed mgmt frames */
 	eth_zero_addr(sdata->deflink.u.mgd.bssid);
 
+	eth_zero_addr(sdata->vif.cfg.ap_addr);
 	sdata->vif.cfg.ssid_len = 0;
 
 	/* remove AP and TDLS peers */
@@ -2653,7 +2655,7 @@ static void ieee80211_mlme_send_probe_req(struct ieee80211_sub_if_data *sdata,
 static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata)
 {
 	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
-	u8 *dst = sdata->deflink.u.mgd.bssid;
+	u8 *dst = sdata->vif.cfg.ap_addr;
 	u8 unicast_limit = max(1, max_probe_tries - 3);
 	struct sta_info *sta;
 
@@ -2882,13 +2884,13 @@ static void ieee80211_beacon_connection_loss_work(struct work_struct *work)
 
 	if (ifmgd->connection_loss) {
 		sdata_info(sdata, "Connection to AP %pM lost\n",
-			   sdata->deflink.u.mgd.bssid);
+			   sdata->vif.cfg.ap_addr);
 		__ieee80211_disconnect(sdata);
 		ifmgd->connection_loss = false;
 	} else if (ifmgd->driver_disconnect) {
 		sdata_info(sdata,
 			   "Driver requested disconnection from AP %pM\n",
-			   sdata->deflink.u.mgd.bssid);
+			   sdata->vif.cfg.ap_addr);
 		__ieee80211_disconnect(sdata);
 		ifmgd->driver_disconnect = false;
 	} else {
@@ -3042,7 +3044,7 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata,
 }
 
 static bool ieee80211_mark_sta_auth(struct ieee80211_sub_if_data *sdata,
-				    const u8 *bssid)
+				    const u8 *ap_addr)
 {
 	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
 	struct sta_info *sta;
@@ -3056,14 +3058,14 @@ static bool ieee80211_mark_sta_auth(struct ieee80211_sub_if_data *sdata,
 
 	/* move station state to auth */
 	mutex_lock(&sdata->local->sta_mtx);
-	sta = sta_info_get(sdata, bssid);
+	sta = sta_info_get(sdata, ap_addr);
 	if (!sta) {
-		WARN_ONCE(1, "%s: STA %pM not found", sdata->name, bssid);
+		WARN_ONCE(1, "%s: STA %pM not found", sdata->name, ap_addr);
 		result = false;
 		goto out;
 	}
 	if (sta_info_move_state(sta, IEEE80211_STA_AUTH)) {
-		sdata_info(sdata, "failed moving %pM to auth\n", bssid);
+		sdata_info(sdata, "failed moving %pM to auth\n", ap_addr);
 		result = false;
 		goto out;
 	}
@@ -4402,7 +4404,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 				erp_valid, erp_value);
 
 	mutex_lock(&local->sta_mtx);
-	sta = sta_info_get(sdata, bssid);
+	sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr);
 
 	changed |= ieee80211_recalc_twt_req(sdata, sta, elems);
 
@@ -4885,7 +4887,7 @@ static void ieee80211_sta_conn_mon_timer(struct timer_list *t)
 	    !sdata->deflink.u.mgd.csa_waiting_bcn)
 		return;
 
-	sta = sta_info_get(sdata, sdata->deflink.u.mgd.bssid);
+	sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr);
 	if (!sta)
 		return;
 
@@ -4981,7 +4983,7 @@ void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata)
 			.bssid = bssid,
 		};
 
-		memcpy(bssid, sdata->deflink.u.mgd.bssid, ETH_ALEN);
+		memcpy(bssid, sdata->vif.cfg.ap_addr, ETH_ALEN);
 		ieee80211_mgd_deauth(sdata, &req);
 	}
 
@@ -5909,7 +5911,7 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
 
 		sdata_info(sdata,
 			   "disconnect from AP %pM for new auth to %pM\n",
-			   sdata->deflink.u.mgd.bssid, req->bss->bssid);
+			   sdata->vif.cfg.ap_addr, req->bss->bssid);
 		ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH,
 				       WLAN_REASON_UNSPECIFIED,
 				       false, frame_buf);
@@ -5986,7 +5988,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
 
 		sdata_info(sdata,
 			   "disconnect from AP %pM for new assoc to %pM\n",
-			   sdata->deflink.u.mgd.bssid, req->bss->bssid);
+			   sdata->vif.cfg.ap_addr, req->bss->bssid);
 		ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH,
 				       WLAN_REASON_UNSPECIFIED,
 				       false, frame_buf);
@@ -6344,7 +6346,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
 	}
 
 	if (ifmgd->associated &&
-	    ether_addr_equal(sdata->deflink.u.mgd.bssid, req->bssid)) {
+	    ether_addr_equal(sdata->vif.cfg.ap_addr, req->bssid)) {
 		sdata_info(sdata,
 			   "deauthenticating from %pM by local choice (Reason: %u=%s)\n",
 			   req->bssid, req->reason_code,
-- 
cgit 


From 8f6e0dfc2245d8ca1a3335a06a1219c56df04bb8 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 27 Jun 2022 16:19:18 +0200
Subject: wifi: cfg80211: remove BSS pointer from cfg80211_disassoc_request

The race described by the comment in mac80211 hasn't existed
since the locking rework to use the same lock and for MLO we
need to pass the AP MLD address, so just pass the BSSID or
AP MLD address instead of the BSS struct pointer, and adjust
all the code accordingly.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h |  6 +++---
 net/mac80211/mlme.c    | 14 +++++---------
 net/wireless/core.h    |  2 +-
 net/wireless/mlme.c    |  8 +++-----
 net/wireless/trace.h   |  5 +----
 5 files changed, 13 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index a4e2cb2378b8..d0be08483fa6 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2886,7 +2886,7 @@ struct cfg80211_assoc_request {
  * This structure provides information needed to complete IEEE 802.11
  * deauthentication.
  *
- * @bssid: the BSSID of the BSS to deauthenticate from
+ * @bssid: the BSSID or AP MLD address to deauthenticate from
  * @ie: Extra IEs to add to Deauthentication frame or %NULL
  * @ie_len: Length of ie buffer in octets
  * @reason_code: The reason code for the deauthentication
@@ -2907,7 +2907,7 @@ struct cfg80211_deauth_request {
  * This structure provides information needed to complete IEEE 802.11
  * disassociation.
  *
- * @bss: the BSS to disassociate from
+ * @ap_addr: the BSSID or AP MLD address to disassociate from
  * @ie: Extra IEs to add to Disassociation frame or %NULL
  * @ie_len: Length of ie buffer in octets
  * @reason_code: The reason code for the disassociation
@@ -2915,7 +2915,7 @@ struct cfg80211_deauth_request {
  *	Disassociation frame is to be transmitted.
  */
 struct cfg80211_disassoc_request {
-	struct cfg80211_bss *bss;
+	const u8 *ap_addr;
 	const u8 *ie;
 	size_t ie_len;
 	u16 reason_code;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index b71de89d9734..a2b4536c3a24 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -6426,18 +6426,14 @@ int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata,
 {
 	u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN];
 
-	/*
-	 * cfg80211 should catch this ... but it's racy since
-	 * we can receive a disassoc frame, process it, hand it
-	 * to cfg80211 while that's in a locked section already
-	 * trying to tell us that the user wants to disconnect.
-	 */
-	if (sdata->deflink.u.mgd.bss != req->bss)
-		return -ENOLINK;
+	if (!sdata->u.mgd.associated ||
+	    memcmp(sdata->vif.cfg.ap_addr, req->ap_addr, ETH_ALEN))
+		return -ENOTCONN;
 
 	sdata_info(sdata,
 		   "disassociating from %pM by local choice (Reason: %u=%s)\n",
-		   req->bss->bssid, req->reason_code, ieee80211_get_reason_code_string(req->reason_code));
+		   req->ap_addr, req->reason_code,
+		   ieee80211_get_reason_code_string(req->reason_code));
 
 	ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DISASSOC,
 			       req->reason_code, !req->local_state_change,
diff --git a/net/wireless/core.h b/net/wireless/core.h
index fd723fa5e2d7..e72ca6eefafb 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -372,7 +372,7 @@ int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev,
 			 const u8 *ie, int ie_len, u16 reason,
 			 bool local_state_change);
 int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev,
-			   struct net_device *dev, const u8 *bssid,
+			   struct net_device *dev, const u8 *ap_addr,
 			   const u8 *ie, int ie_len, u16 reason,
 			   bool local_state_change);
 void cfg80211_mlme_down(struct cfg80211_registered_device *rdev,
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 935537c64ed8..4a35b3559daa 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -370,7 +370,7 @@ int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev,
 }
 
 int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev,
-			   struct net_device *dev, const u8 *bssid,
+			   struct net_device *dev, const u8 *ap_addr,
 			   const u8 *ie, int ie_len, u16 reason,
 			   bool local_state_change)
 {
@@ -380,6 +380,7 @@ int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev,
 		.local_state_change = local_state_change,
 		.ie = ie,
 		.ie_len = ie_len,
+		.ap_addr = ap_addr,
 	};
 	int err;
 
@@ -388,10 +389,7 @@ int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev,
 	if (!wdev->connected)
 		return -ENOTCONN;
 
-	if (ether_addr_equal(wdev->links[0].client.current_bss->pub.bssid,
-			     bssid))
-		req.bss = &wdev->links[0].client.current_bss->pub;
-	else
+	if (memcmp(wdev->u.client.connected_addr, ap_addr, ETH_ALEN))
 		return -ENOTCONN;
 
 	err = rdev_disassoc(rdev, dev, &req);
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index c50e8a04199e..4316d3dc31ea 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -1318,10 +1318,7 @@ TRACE_EVENT(rdev_disassoc,
 	TP_fast_assign(
 		WIPHY_ASSIGN;
 		NETDEV_ASSIGN;
-		if (req->bss)
-			MAC_ASSIGN(bssid, req->bss->bssid);
-		else
-			eth_zero_addr(__entry->bssid);
+		MAC_ASSIGN(bssid, req->ap_addr);
 		__entry->reason_code = req->reason_code;
 		__entry->local_state_change = req->local_state_change;
 	),
-- 
cgit 


From f662d2f4e22e5d5a9215e9c881875a4769494ef6 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 27 Jun 2022 22:09:50 +0200
Subject: wifi: cfg80211: prepare association failure APIs for MLO

For MLO, we need the ability to report back multiple BSS
structures to release, as well as the AP MLD address (if
attempting to make an MLO connection).

Unify cfg80211_assoc_timeout() and cfg80211_abandon_assoc()
into a new cfg80211_assoc_failure() that gets a structure
parameter with the necessary data.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 29 +++++++++++++++++------------
 net/mac80211/mlme.c    | 28 +++++++++++++++++++++++-----
 net/wireless/mlme.c    | 36 +++++++++++++++++++-----------------
 net/wireless/trace.h   | 19 ++++++++++++++++---
 4 files changed, 75 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index d0be08483fa6..fbbe1796b85f 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -6900,24 +6900,29 @@ void cfg80211_rx_assoc_resp(struct net_device *dev,
 			    const u8 *req_ies, size_t req_ies_len);
 
 /**
- * cfg80211_assoc_timeout - notification of timed out association
- * @dev: network device
- * @bss: The BSS entry with which association timed out.
- *
- * This function may sleep. The caller must hold the corresponding wdev's mutex.
- */
-void cfg80211_assoc_timeout(struct net_device *dev, struct cfg80211_bss *bss);
+ * struct cfg80211_assoc_failure - association failure data
+ * @ap_mld_addr: AP MLD address, or %NULL
+ * @bss: list of BSSes, must use entry 0 for non-MLO connections
+ *	(@ap_mld_addr is %NULL)
+ * @timeout: indicates the association failed due to timeout, otherwise
+ *	the association was abandoned for a reason reported through some
+ *	other API (e.g. deauth RX)
+ */
+struct cfg80211_assoc_failure {
+	const u8 *ap_mld_addr;
+	struct cfg80211_bss *bss[IEEE80211_MLD_MAX_NUM_LINKS];
+	bool timeout;
+};
 
 /**
- * cfg80211_abandon_assoc - notify cfg80211 of abandoned association attempt
+ * cfg80211_assoc_failure - notification of association failure
  * @dev: network device
- * @bss: The BSS entry with which association was abandoned.
+ * @data: data describing the association failure
  *
- * Call this whenever - for reasons reported through other API, like deauth RX,
- * an association attempt was abandoned.
  * This function may sleep. The caller must hold the corresponding wdev's mutex.
  */
-void cfg80211_abandon_assoc(struct net_device *dev, struct cfg80211_bss *bss);
+void cfg80211_assoc_failure(struct net_device *dev,
+			    struct cfg80211_assoc_failure *data);
 
 /**
  * cfg80211_tx_mlme_mgmt - notification of transmitted deauth/disassoc frame
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index a2b4536c3a24..e1c4a4dcfc70 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -3027,8 +3027,13 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata,
 		ieee80211_link_release_channel(&sdata->deflink);
 		mutex_unlock(&sdata->local->mtx);
 
-		if (abandon)
-			cfg80211_abandon_assoc(sdata->dev, assoc_data->bss);
+		if (abandon) {
+			struct cfg80211_assoc_failure data = {
+				.bss[0] = assoc_data->bss,
+			};
+
+			cfg80211_assoc_failure(sdata->dev, &data);
+		}
 	}
 
 	kfree(assoc_data);
@@ -3956,8 +3961,12 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 	} else {
 		if (!ieee80211_assoc_success(sdata, cbss, mgmt, len, elems)) {
 			/* oops -- internal error -- send timeout for now */
+			struct cfg80211_assoc_failure data = {
+				.timeout = true,
+				.bss[0] = cbss,
+			};
 			ieee80211_destroy_assoc_data(sdata, false, false);
-			cfg80211_assoc_timeout(sdata->dev, cbss);
+			cfg80211_assoc_failure(sdata->dev, &data);
 			goto notify_driver;
 		}
 		event.u.mlme.status = MLME_SUCCESS;
@@ -4833,9 +4842,13 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata)
 				.u.mlme.data = ASSOC_EVENT,
 				.u.mlme.status = MLME_TIMEOUT,
 			};
+			struct cfg80211_assoc_failure data = {
+				.bss[0] = bss,
+				.timeout = true,
+			};
 
 			ieee80211_destroy_assoc_data(sdata, false, false);
-			cfg80211_assoc_timeout(sdata->dev, bss);
+			cfg80211_assoc_failure(sdata->dev, &data);
 			drv_event_callback(sdata->local, sdata, &event);
 		}
 	} else if (ifmgd->assoc_data && ifmgd->assoc_data->timeout_started)
@@ -6468,8 +6481,13 @@ void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata)
 	sdata_lock(sdata);
 	if (ifmgd->assoc_data) {
 		struct cfg80211_bss *bss = ifmgd->assoc_data->bss;
+		struct cfg80211_assoc_failure data = {
+			.bss[0] = bss,
+			.timeout = true,
+		};
+
 		ieee80211_destroy_assoc_data(sdata, false, false);
-		cfg80211_assoc_timeout(sdata->dev, bss);
+		cfg80211_assoc_failure(sdata->dev, &data);
 	}
 	if (ifmgd->auth_data)
 		ieee80211_destroy_auth_data(sdata, false);
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 4a35b3559daa..aefe8b26f0d7 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -154,33 +154,35 @@ void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr)
 }
 EXPORT_SYMBOL(cfg80211_auth_timeout);
 
-void cfg80211_assoc_timeout(struct net_device *dev, struct cfg80211_bss *bss)
+void cfg80211_assoc_failure(struct net_device *dev,
+			    struct cfg80211_assoc_failure *data)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct wiphy *wiphy = wdev->wiphy;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+	const u8 *addr = data->ap_mld_addr ?: data->bss[0]->bssid;
+	int i;
 
-	trace_cfg80211_send_assoc_timeout(dev, bss->bssid);
-
-	nl80211_send_assoc_timeout(rdev, dev, bss->bssid, GFP_KERNEL);
-	cfg80211_sme_assoc_timeout(wdev);
+	trace_cfg80211_send_assoc_failure(dev, data);
 
-	cfg80211_unhold_bss(bss_from_pub(bss));
-	cfg80211_put_bss(wiphy, bss);
-}
-EXPORT_SYMBOL(cfg80211_assoc_timeout);
+	if (data->timeout) {
+		nl80211_send_assoc_timeout(rdev, dev, addr, GFP_KERNEL);
+		cfg80211_sme_assoc_timeout(wdev);
+	} else {
+		cfg80211_sme_abandon_assoc(wdev);
+	}
 
-void cfg80211_abandon_assoc(struct net_device *dev, struct cfg80211_bss *bss)
-{
-	struct wireless_dev *wdev = dev->ieee80211_ptr;
-	struct wiphy *wiphy = wdev->wiphy;
+	for (i = 0; i < ARRAY_SIZE(data->bss); i++) {
+		struct cfg80211_bss *bss = data->bss[i];
 
-	cfg80211_sme_abandon_assoc(wdev);
+		if (!bss)
+			continue;
 
-	cfg80211_unhold_bss(bss_from_pub(bss));
-	cfg80211_put_bss(wiphy, bss);
+		cfg80211_unhold_bss(bss_from_pub(bss));
+		cfg80211_put_bss(wiphy, bss);
+	}
 }
-EXPORT_SYMBOL(cfg80211_abandon_assoc);
+EXPORT_SYMBOL(cfg80211_assoc_failure);
 
 void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len,
 			   bool reconnect)
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 4316d3dc31ea..19efb9539533 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2969,9 +2969,22 @@ DEFINE_EVENT(netdev_mac_evt, cfg80211_send_auth_timeout,
 	TP_ARGS(netdev, mac)
 );
 
-DEFINE_EVENT(netdev_mac_evt, cfg80211_send_assoc_timeout,
-	TP_PROTO(struct net_device *netdev, const u8 *mac),
-	TP_ARGS(netdev, mac)
+TRACE_EVENT(cfg80211_send_assoc_failure,
+	TP_PROTO(struct net_device *netdev,
+		 struct cfg80211_assoc_failure *data),
+	TP_ARGS(netdev, data),
+	TP_STRUCT__entry(
+		NETDEV_ENTRY
+		MAC_ENTRY(ap_addr)
+		__field(bool, timeout)
+	),
+	TP_fast_assign(
+		NETDEV_ASSIGN;
+		MAC_ASSIGN(ap_addr, data->ap_mld_addr ?: data->bss[0]->bssid);
+		__entry->timeout = data->timeout;
+	),
+	TP_printk(NETDEV_PR_FMT ", mac: " MAC_PR_FMT ", timeout: %d",
+		  NETDEV_PR_ARG, MAC_PR_ARG(ap_addr), __entry->timeout)
 );
 
 TRACE_EVENT(cfg80211_michael_mic_failure,
-- 
cgit 


From e69dac88a155bd626170bb0bb43f83fb564392f7 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 28 Jun 2022 11:25:38 +0200
Subject: wifi: cfg80211: adjust assoc comeback for MLO

We only report the BSSID to userspace, so change the
argument from BSS struct pointer to AP address, which
we'll use to carry either the BSSID or AP MLD address.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h |  4 ++--
 net/mac80211/mlme.c    |  2 +-
 net/wireless/nl80211.c |  6 +++---
 net/wireless/trace.h   | 10 +++++-----
 4 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index fbbe1796b85f..1d8cecf035d1 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -8592,13 +8592,13 @@ bool cfg80211_iftype_allowed(struct wiphy *wiphy, enum nl80211_iftype iftype,
  * cfg80211_assoc_comeback - notification of association that was
  * temporarly rejected with a comeback
  * @netdev: network device
- * @bss: the bss entry with which association is in progress.
+ * @ap_addr: AP (MLD) address that rejected the assocation
  * @timeout: timeout interval value TUs.
  *
  * this function may sleep. the caller must hold the corresponding wdev's mutex.
  */
 void cfg80211_assoc_comeback(struct net_device *netdev,
-			     struct cfg80211_bss *bss, u32 timeout);
+			     const u8 *ap_addr, u32 timeout);
 
 /* Logging, debugging and troubleshooting/diagnostic helpers. */
 
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index ac37cf74ab74..91f07b67f2f0 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -3944,7 +3944,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 	    elems->timeout_int->type == WLAN_TIMEOUT_ASSOC_COMEBACK) {
 		u32 tu, ms;
 
-		cfg80211_assoc_comeback(sdata->dev, assoc_data->bss,
+		cfg80211_assoc_comeback(sdata->dev, assoc_data->bss->bssid,
 					le32_to_cpu(elems->timeout_int->value));
 
 		tu = le32_to_cpu(elems->timeout_int->value);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 886d964242ae..6c3b47a7960f 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -18036,7 +18036,7 @@ static void nl80211_send_remain_on_chan_event(
 }
 
 void cfg80211_assoc_comeback(struct net_device *netdev,
-			     struct cfg80211_bss *bss, u32 timeout)
+			     const u8 *ap_addr, u32 timeout)
 {
 	struct wireless_dev *wdev = netdev->ieee80211_ptr;
 	struct wiphy *wiphy = wdev->wiphy;
@@ -18044,7 +18044,7 @@ void cfg80211_assoc_comeback(struct net_device *netdev,
 	struct sk_buff *msg;
 	void *hdr;
 
-	trace_cfg80211_assoc_comeback(wdev, bss->bssid, timeout);
+	trace_cfg80211_assoc_comeback(wdev, ap_addr, timeout);
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
 	if (!msg)
@@ -18058,7 +18058,7 @@ void cfg80211_assoc_comeback(struct net_device *netdev,
 
 	if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
 	    nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
-	    nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bss->bssid) ||
+	    nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, ap_addr) ||
 	    nla_put_u32(msg, NL80211_ATTR_TIMEOUT, timeout))
 		goto nla_put_failure;
 
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 19efb9539533..94d107cab72c 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -3764,20 +3764,20 @@ TRACE_EVENT(cfg80211_bss_color_notify,
 );
 
 TRACE_EVENT(cfg80211_assoc_comeback,
-	TP_PROTO(struct wireless_dev *wdev, const u8 *bssid, u32 timeout),
-	TP_ARGS(wdev, bssid, timeout),
+	TP_PROTO(struct wireless_dev *wdev, const u8 *ap_addr, u32 timeout),
+	TP_ARGS(wdev, ap_addr, timeout),
 	TP_STRUCT__entry(
 		WDEV_ENTRY
-		MAC_ENTRY(bssid)
+		MAC_ENTRY(ap_addr)
 		__field(u32, timeout)
 	),
 	TP_fast_assign(
 		WDEV_ASSIGN;
-		MAC_ASSIGN(bssid, bssid);
+		MAC_ASSIGN(ap_addr, ap_addr);
 		__entry->timeout = timeout;
 	),
 	TP_printk(WDEV_PR_FMT ", " MAC_PR_FMT ", timeout: %u TUs",
-		  WDEV_PR_ARG, MAC_PR_ARG(bssid), __entry->timeout)
+		  WDEV_PR_ARG, MAC_PR_ARG(ap_addr), __entry->timeout)
 );
 
 DECLARE_EVENT_CLASS(link_station_add_mod,
-- 
cgit 


From cd47c0f57ae6607cfa3d161e341cbdd283bc444f Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 28 Jun 2022 16:25:37 +0200
Subject: wifi: cfg80211: put cfg80211_rx_assoc_resp() arguments into a struct

For MLO we'll need a lot more arguments, including all the
BSS pointers and link addresses, so move the data to a struct
to be able to extend it more easily later.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 24 +++++++++++++++++-------
 net/mac80211/mlme.c    | 17 ++++++++++++-----
 net/wireless/mlme.c    | 32 +++++++++++++++-----------------
 net/wireless/nl80211.c | 12 ++++++------
 net/wireless/nl80211.h |  4 +---
 5 files changed, 51 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 1d8cecf035d1..2628e8f98a13 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -6877,16 +6877,29 @@ void cfg80211_rx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len);
 void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr);
 
 /**
- * cfg80211_rx_assoc_resp - notification of processed association response
- * @dev: network device
+ * struct cfg80211_rx_assoc_resp - association response data
  * @bss: the BSS that association was requested with, ownership of the pointer
- *	moves to cfg80211 in this call
+ *	moves to cfg80211 in the call to cfg80211_rx_assoc_resp()
  * @buf: (Re)Association Response frame (header + body)
  * @len: length of the frame data
  * @uapsd_queues: bitmap of queues configured for uapsd. Same format
  *	as the AC bitmap in the QoS info field
  * @req_ies: information elements from the (Re)Association Request frame
  * @req_ies_len: length of req_ies data
+ */
+struct cfg80211_rx_assoc_resp {
+	struct cfg80211_bss *bss;
+	const u8 *buf;
+	size_t len;
+	const u8 *req_ies;
+	size_t req_ies_len;
+	int uapsd_queues;
+};
+
+/**
+ * cfg80211_rx_assoc_resp - notification of processed association response
+ * @dev: network device
+ * @data: association response data, &struct cfg80211_rx_assoc_resp
  *
  * After being asked to associate via cfg80211_ops::assoc() the driver must
  * call either this function or cfg80211_auth_timeout().
@@ -6894,10 +6907,7 @@ void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr);
  * This function may sleep. The caller must hold the corresponding wdev's mutex.
  */
 void cfg80211_rx_assoc_resp(struct net_device *dev,
-			    struct cfg80211_bss *bss,
-			    const u8 *buf, size_t len,
-			    int uapsd_queues,
-			    const u8 *req_ies, size_t req_ies_len);
+			    struct cfg80211_rx_assoc_resp *data);
 
 /**
  * struct cfg80211_assoc_failure - association failure data
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 91f07b67f2f0..ecbd70a1fbb7 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -3878,7 +3878,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data;
 	u16 capab_info, status_code, aid;
 	struct ieee802_11_elems *elems;
-	int ac, uapsd_queues = -1;
+	int ac;
 	u8 *pos;
 	bool reassoc;
 	struct cfg80211_bss *cbss;
@@ -3887,6 +3887,9 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 		.u.mlme.data = ASSOC_EVENT,
 	};
 	struct ieee80211_prep_tx_info info = {};
+	struct cfg80211_rx_assoc_resp resp = {
+		.uapsd_queues = -1,
+	};
 
 	sdata_assert_lock(sdata);
 
@@ -3984,16 +3987,20 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 		ieee80211_destroy_assoc_data(sdata, ASSOC_SUCCESS);
 
 		/* get uapsd queues configuration */
-		uapsd_queues = 0;
+		resp.uapsd_queues = 0;
 		for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
 			if (sdata->deflink.tx_conf[ac].uapsd)
-				uapsd_queues |= ieee80211_ac_to_qos_mask[ac];
+				resp.uapsd_queues |= ieee80211_ac_to_qos_mask[ac];
 
 		info.success = 1;
 	}
 
-	cfg80211_rx_assoc_resp(sdata->dev, cbss, (u8 *)mgmt, len, uapsd_queues,
-			       ifmgd->assoc_req_ies, ifmgd->assoc_req_ies_len);
+	resp.bss = cbss;
+	resp.buf = (u8 *)mgmt;
+	resp.len = len;
+	resp.req_ies = ifmgd->assoc_req_ies;
+	resp.req_ies_len = ifmgd->assoc_req_ies_len;
+	cfg80211_rx_assoc_resp(sdata->dev, &resp);
 notify_driver:
 	drv_mgd_complete_tx(sdata->local, sdata, &info);
 	kfree(elems);
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index aefe8b26f0d7..a6ad696f131b 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -21,36 +21,35 @@
 #include "rdev-ops.h"
 
 
-void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss,
-			    const u8 *buf, size_t len, int uapsd_queues,
-			    const u8 *req_ies, size_t req_ies_len)
+void cfg80211_rx_assoc_resp(struct net_device *dev,
+			    struct cfg80211_rx_assoc_resp *data)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct wiphy *wiphy = wdev->wiphy;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
-	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf;
+	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)data->buf;
 	struct cfg80211_connect_resp_params cr;
 	const u8 *resp_ie = mgmt->u.assoc_resp.variable;
-	size_t resp_ie_len = len - offsetof(struct ieee80211_mgmt,
-					    u.assoc_resp.variable);
+	size_t resp_ie_len = data->len - offsetof(struct ieee80211_mgmt,
+						  u.assoc_resp.variable);
 
-	if (bss->channel->band == NL80211_BAND_S1GHZ) {
+	if (data->bss->channel->band == NL80211_BAND_S1GHZ) {
 		resp_ie = (u8 *)&mgmt->u.s1g_assoc_resp.variable;
-		resp_ie_len = len - offsetof(struct ieee80211_mgmt,
-					     u.s1g_assoc_resp.variable);
+		resp_ie_len = data->len - offsetof(struct ieee80211_mgmt,
+						   u.s1g_assoc_resp.variable);
 	}
 
 	memset(&cr, 0, sizeof(cr));
 	cr.status = (int)le16_to_cpu(mgmt->u.assoc_resp.status_code);
 	cr.links[0].bssid = mgmt->bssid;
-	cr.links[0].bss = bss;
-	cr.req_ie = req_ies;
-	cr.req_ie_len = req_ies_len;
+	cr.links[0].bss = data->bss;
+	cr.req_ie = data->req_ies;
+	cr.req_ie_len = data->req_ies_len;
 	cr.resp_ie = resp_ie;
 	cr.resp_ie_len = resp_ie_len;
 	cr.timeout_reason = NL80211_TIMEOUT_UNSPECIFIED;
 
-	trace_cfg80211_send_rx_assoc(dev, bss);
+	trace_cfg80211_send_rx_assoc(dev, data->bss);
 
 	/*
 	 * This is a bit of a hack, we don't notify userspace of
@@ -59,13 +58,12 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss,
 	 * frame instead of reassoc.
 	 */
 	if (cfg80211_sme_rx_assoc_resp(wdev, cr.status)) {
-		cfg80211_unhold_bss(bss_from_pub(bss));
-		cfg80211_put_bss(wiphy, bss);
+		cfg80211_unhold_bss(bss_from_pub(data->bss));
+		cfg80211_put_bss(wiphy, data->bss);
 		return;
 	}
 
-	nl80211_send_rx_assoc(rdev, dev, buf, len, GFP_KERNEL, uapsd_queues,
-			      req_ies, req_ies_len);
+	nl80211_send_rx_assoc(rdev, dev, data);
 	/* update current_bss etc., consumes the bss reference */
 	__cfg80211_connect_result(dev, &cr, cr.status == WLAN_STATUS_SUCCESS);
 }
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 6c3b47a7960f..11cad2d46d0e 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -17432,13 +17432,13 @@ void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev,
 }
 
 void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev,
-			   struct net_device *netdev, const u8 *buf,
-			   size_t len, gfp_t gfp, int uapsd_queues,
-			   const u8 *req_ies, size_t req_ies_len)
+			   struct net_device *netdev,
+			   struct cfg80211_rx_assoc_resp *data)
 {
-	nl80211_send_mlme_event(rdev, netdev, buf, len,
-				NL80211_CMD_ASSOCIATE, gfp, uapsd_queues,
-				req_ies, req_ies_len, false);
+	nl80211_send_mlme_event(rdev, netdev, data->buf, data->len,
+				NL80211_CMD_ASSOCIATE, GFP_KERNEL,
+				data->uapsd_queues,
+				data->req_ies, data->req_ies_len, false);
 }
 
 void nl80211_send_deauth(struct cfg80211_registered_device *rdev,
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index d642e3be4ee7..a7e8e0917c1c 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -60,9 +60,7 @@ void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev,
 			  const u8 *buf, size_t len, gfp_t gfp);
 void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev,
 			   struct net_device *netdev,
-			   const u8 *buf, size_t len, gfp_t gfp,
-			   int uapsd_queues,
-			   const u8 *req_ies, size_t req_ies_len);
+			   struct cfg80211_rx_assoc_resp *data);
 void nl80211_send_deauth(struct cfg80211_registered_device *rdev,
 			 struct net_device *netdev,
 			 const u8 *buf, size_t len,
-- 
cgit 


From 5cd212cb6415aa604ada17d5150847fd65d27337 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 28 Jun 2022 17:17:05 +0200
Subject: wifi: cfg80211: extend cfg80211_rx_assoc_resp() for MLO

Extend the cfg80211_rx_assoc_resp() to cover multiple
BSSes, the AP MLD address and local link addresses
for MLO.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h |  9 ++++++-
 net/mac80211/mlme.c    |  2 +-
 net/wireless/mlme.c    | 64 +++++++++++++++++++++++++++++++++-----------------
 net/wireless/trace.h   | 16 ++++++-------
 4 files changed, 59 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 2628e8f98a13..2fe3eff11a8b 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -6886,14 +6886,21 @@ void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr);
  *	as the AC bitmap in the QoS info field
  * @req_ies: information elements from the (Re)Association Request frame
  * @req_ies_len: length of req_ies data
+ * @ap_mld_addr: AP MLD address (in case of MLO)
+ * @links: per-link information indexed by link ID, use links[0] for
+ *	non-MLO connections
  */
 struct cfg80211_rx_assoc_resp {
-	struct cfg80211_bss *bss;
 	const u8 *buf;
 	size_t len;
 	const u8 *req_ies;
 	size_t req_ies_len;
 	int uapsd_queues;
+	const u8 *ap_mld_addr;
+	struct {
+		const u8 *addr;
+		struct cfg80211_bss *bss;
+	} links[IEEE80211_MLD_MAX_NUM_LINKS];
 };
 
 /**
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index ecbd70a1fbb7..29c1fe8b9f4a 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -3995,7 +3995,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 		info.success = 1;
 	}
 
-	resp.bss = cbss;
+	resp.links[0].bss = cbss;
 	resp.buf = (u8 *)mgmt;
 	resp.len = len;
 	resp.req_ies = ifmgd->assoc_req_ies;
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index a6ad696f131b..80f11a29cb86 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -28,28 +28,41 @@ void cfg80211_rx_assoc_resp(struct net_device *dev,
 	struct wiphy *wiphy = wdev->wiphy;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
 	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)data->buf;
-	struct cfg80211_connect_resp_params cr;
-	const u8 *resp_ie = mgmt->u.assoc_resp.variable;
-	size_t resp_ie_len = data->len - offsetof(struct ieee80211_mgmt,
-						  u.assoc_resp.variable);
-
-	if (data->bss->channel->band == NL80211_BAND_S1GHZ) {
-		resp_ie = (u8 *)&mgmt->u.s1g_assoc_resp.variable;
-		resp_ie_len = data->len - offsetof(struct ieee80211_mgmt,
-						   u.s1g_assoc_resp.variable);
-	}
+	struct cfg80211_connect_resp_params cr = {
+		.timeout_reason = NL80211_TIMEOUT_UNSPECIFIED,
+		.req_ie = data->req_ies,
+		.req_ie_len = data->req_ies_len,
+		.resp_ie = mgmt->u.assoc_resp.variable,
+		.resp_ie_len = data->len -
+			       offsetof(struct ieee80211_mgmt,
+					u.assoc_resp.variable),
+		.status = le16_to_cpu(mgmt->u.assoc_resp.status_code),
+		.ap_mld_addr = data->ap_mld_addr,
+	};
+	unsigned int link_id;
+
+	for (link_id = 0; link_id < ARRAY_SIZE(data->links); link_id++) {
+		cr.links[link_id].bss = data->links[link_id].bss;
+		if (!cr.links[link_id].bss)
+			continue;
+		cr.links[link_id].bssid = data->links[link_id].bss->bssid;
+		cr.links[link_id].addr = data->links[link_id].addr;
+		/* need to have local link addresses for MLO connections */
+		WARN_ON(cr.ap_mld_addr && !cr.links[link_id].addr);
+
+		if (cr.links[link_id].bss->channel->band == NL80211_BAND_S1GHZ) {
+			WARN_ON(link_id);
+			cr.resp_ie = (u8 *)&mgmt->u.s1g_assoc_resp.variable;
+			cr.resp_ie_len = data->len -
+					 offsetof(struct ieee80211_mgmt,
+						  u.s1g_assoc_resp.variable);
+		}
 
-	memset(&cr, 0, sizeof(cr));
-	cr.status = (int)le16_to_cpu(mgmt->u.assoc_resp.status_code);
-	cr.links[0].bssid = mgmt->bssid;
-	cr.links[0].bss = data->bss;
-	cr.req_ie = data->req_ies;
-	cr.req_ie_len = data->req_ies_len;
-	cr.resp_ie = resp_ie;
-	cr.resp_ie_len = resp_ie_len;
-	cr.timeout_reason = NL80211_TIMEOUT_UNSPECIFIED;
+		if (cr.ap_mld_addr)
+			cr.valid_links |= BIT(link_id);
+	}
 
-	trace_cfg80211_send_rx_assoc(dev, data->bss);
+	trace_cfg80211_send_rx_assoc(dev, data);
 
 	/*
 	 * This is a bit of a hack, we don't notify userspace of
@@ -58,8 +71,15 @@ void cfg80211_rx_assoc_resp(struct net_device *dev,
 	 * frame instead of reassoc.
 	 */
 	if (cfg80211_sme_rx_assoc_resp(wdev, cr.status)) {
-		cfg80211_unhold_bss(bss_from_pub(data->bss));
-		cfg80211_put_bss(wiphy, data->bss);
+		for (link_id = 0; link_id < ARRAY_SIZE(data->links); link_id++) {
+			struct cfg80211_bss *bss = data->links[link_id].bss;
+
+			if (!bss)
+				continue;
+
+			cfg80211_unhold_bss(bss_from_pub(bss));
+			cfg80211_put_bss(wiphy, bss);
+		}
 		return;
 	}
 
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 94d107cab72c..dac66ad5937d 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2887,20 +2887,20 @@ DEFINE_EVENT(netdev_evt_only, cfg80211_send_rx_auth,
 );
 
 TRACE_EVENT(cfg80211_send_rx_assoc,
-	TP_PROTO(struct net_device *netdev, struct cfg80211_bss *bss),
-	TP_ARGS(netdev, bss),
+	TP_PROTO(struct net_device *netdev,
+		 struct cfg80211_rx_assoc_resp *data),
+	TP_ARGS(netdev, data),
 	TP_STRUCT__entry(
 		NETDEV_ENTRY
-		MAC_ENTRY(bssid)
-		CHAN_ENTRY
+		MAC_ENTRY(ap_addr)
 	),
 	TP_fast_assign(
 		NETDEV_ASSIGN;
-		MAC_ASSIGN(bssid, bss->bssid);
-		CHAN_ASSIGN(bss->channel);
+		MAC_ASSIGN(ap_addr,
+			   data->ap_mld_addr ?: data->links[0].bss->bssid);
 	),
-	TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT ", " CHAN_PR_FMT,
-		  NETDEV_PR_ARG, MAC_PR_ARG(bssid), CHAN_PR_ARG)
+	TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT,
+		  NETDEV_PR_ARG, MAC_PR_ARG(ap_addr))
 );
 
 DECLARE_EVENT_CLASS(netdev_frame_event,
-- 
cgit 


From b327c84c328ed2be4dbad4f5ed7c17476fe1b3bf Mon Sep 17 00:00:00 2001
From: Gregory Greenman <gregory.greenman@intel.com>
Date: Wed, 29 Jun 2022 12:22:24 +0300
Subject: wifi: mac80211: replace link_id with link_conf in start/stop_ap()

When calling start/stop_ap(), mac80211 already has a protected
link_conf pointer. Pass it to the driver, so it shouldn't
handle RCU protection.

Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 16 ++++++++--------
 drivers/net/wireless/realtek/rtw88/mac80211.c     |  3 ++-
 drivers/net/wireless/realtek/rtw89/mac80211.c     |  5 +++--
 drivers/net/wireless/silabs/wfx/sta.c             |  4 ++--
 drivers/net/wireless/silabs/wfx/sta.h             |  4 ++--
 include/net/mac80211.h                            |  4 ++--
 net/mac80211/cfg.c                                |  4 ++--
 net/mac80211/driver-ops.h                         | 18 ++++++++++++------
 net/mac80211/trace.h                              | 23 +++++++++--------------
 net/mac80211/util.c                               |  3 ++-
 10 files changed, 44 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
index 3de558f286a1..126106ea62d3 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
@@ -2397,7 +2397,7 @@ static void iwl_mvm_bss_info_changed_station(struct iwl_mvm *mvm,
 
 static int iwl_mvm_start_ap_ibss(struct ieee80211_hw *hw,
 				 struct ieee80211_vif *vif,
-				 unsigned int link_id)
+				 struct ieee80211_bss_conf *link_conf)
 {
 	struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw);
 	struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif);
@@ -2525,20 +2525,20 @@ out_unlock:
 
 static int iwl_mvm_start_ap(struct ieee80211_hw *hw,
 			    struct ieee80211_vif *vif,
-			    unsigned int link_id)
+			    struct ieee80211_bss_conf *link_conf)
 {
-	return iwl_mvm_start_ap_ibss(hw, vif, link_id);
+	return iwl_mvm_start_ap_ibss(hw, vif, link_conf);
 }
 
 static int iwl_mvm_start_ibss(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif)
 {
-	return iwl_mvm_start_ap_ibss(hw, vif, 0);
+	return iwl_mvm_start_ap_ibss(hw, vif, &vif->bss_conf);
 }
 
 static void iwl_mvm_stop_ap_ibss(struct ieee80211_hw *hw,
 				 struct ieee80211_vif *vif,
-				 unsigned int link_id)
+				 struct ieee80211_bss_conf *link_conf)
 {
 	struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw);
 	struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif);
@@ -2603,15 +2603,15 @@ static void iwl_mvm_stop_ap_ibss(struct ieee80211_hw *hw,
 
 static void iwl_mvm_stop_ap(struct ieee80211_hw *hw,
 			    struct ieee80211_vif *vif,
-			    unsigned int link_id)
+			    struct ieee80211_bss_conf *link_conf)
 {
-	iwl_mvm_stop_ap_ibss(hw, vif, link_id);
+	iwl_mvm_stop_ap_ibss(hw, vif, link_conf);
 }
 
 static void iwl_mvm_stop_ibss(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif)
 {
-	iwl_mvm_stop_ap_ibss(hw, vif, 0);
+	iwl_mvm_stop_ap_ibss(hw, vif, &vif->bss_conf);
 }
 
 static void
diff --git a/drivers/net/wireless/realtek/rtw88/mac80211.c b/drivers/net/wireless/realtek/rtw88/mac80211.c
index bb7291665d37..c7b98a0599d5 100644
--- a/drivers/net/wireless/realtek/rtw88/mac80211.c
+++ b/drivers/net/wireless/realtek/rtw88/mac80211.c
@@ -430,7 +430,8 @@ static void rtw_ops_bss_info_changed(struct ieee80211_hw *hw,
 }
 
 static int rtw_ops_start_ap(struct ieee80211_hw *hw,
-			    struct ieee80211_vif *vif, unsigned int link_id)
+			    struct ieee80211_vif *vif,
+			    struct ieee80211_bss_conf *link_conf)
 {
 	struct rtw_dev *rtwdev = hw->priv;
 	struct rtw_chip_info *chip = rtwdev->chip;
diff --git a/drivers/net/wireless/realtek/rtw89/mac80211.c b/drivers/net/wireless/realtek/rtw89/mac80211.c
index f40569c8575c..cef27e781ae2 100644
--- a/drivers/net/wireless/realtek/rtw89/mac80211.c
+++ b/drivers/net/wireless/realtek/rtw89/mac80211.c
@@ -382,7 +382,8 @@ static void rtw89_ops_bss_info_changed(struct ieee80211_hw *hw,
 }
 
 static int rtw89_ops_start_ap(struct ieee80211_hw *hw,
-			      struct ieee80211_vif *vif, unsigned int link_id)
+			      struct ieee80211_vif *vif,
+			      struct ieee80211_bss_conf *link_conf)
 {
 	struct rtw89_dev *rtwdev = hw->priv;
 	struct rtw89_vif *rtwvif = (struct rtw89_vif *)vif->drv_priv;
@@ -403,7 +404,7 @@ static int rtw89_ops_start_ap(struct ieee80211_hw *hw,
 
 static
 void rtw89_ops_stop_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-		       unsigned int link_id)
+		       struct ieee80211_bss_conf *link_conf)
 {
 	struct rtw89_dev *rtwdev = hw->priv;
 	struct rtw89_vif *rtwvif = (struct rtw89_vif *)vif->drv_priv;
diff --git a/drivers/net/wireless/silabs/wfx/sta.c b/drivers/net/wireless/silabs/wfx/sta.c
index 89402afe4fe6..920bd1a4a1b1 100644
--- a/drivers/net/wireless/silabs/wfx/sta.c
+++ b/drivers/net/wireless/silabs/wfx/sta.c
@@ -380,7 +380,7 @@ static void wfx_set_mfp_ap(struct wfx_vif *wvif)
 }
 
 int wfx_start_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-		 unsigned int link_id)
+		 struct ieee80211_bss_conf *link_conf)
 {
 	struct wfx_vif *wvif = (struct wfx_vif *)vif->drv_priv;
 	struct wfx_dev *wdev = wvif->wdev;
@@ -399,7 +399,7 @@ int wfx_start_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 }
 
 void wfx_stop_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-		 unsigned int link_id)
+		 struct ieee80211_bss_conf *link_conf)
 {
 	struct wfx_vif *wvif = (struct wfx_vif *)vif->drv_priv;
 
diff --git a/drivers/net/wireless/silabs/wfx/sta.h b/drivers/net/wireless/silabs/wfx/sta.h
index 6558c5698cc8..bf2e76167a6f 100644
--- a/drivers/net/wireless/silabs/wfx/sta.h
+++ b/drivers/net/wireless/silabs/wfx/sta.h
@@ -30,9 +30,9 @@ void wfx_configure_filter(struct ieee80211_hw *hw, unsigned int changed_flags,
 int wfx_add_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
 void wfx_remove_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
 int wfx_start_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-		 unsigned int link_id);
+		 struct ieee80211_bss_conf *link_conf);
 void wfx_stop_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-		 unsigned int link_id);
+		 struct ieee80211_bss_conf *link_conf);
 int wfx_join_ibss(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
 void wfx_leave_ibss(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
 int wfx_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 36eba96a1012..dcb8b6dac2e1 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4092,9 +4092,9 @@ struct ieee80211_ops {
 				  u64 changed);
 
 	int (*start_ap)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-			unsigned int link_id);
+			struct ieee80211_bss_conf *link_conf);
 	void (*stop_ap)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-			unsigned int link_id);
+			struct ieee80211_bss_conf *link_conf);
 
 	u64 (*prepare_multicast)(struct ieee80211_hw *hw,
 				 struct netdev_hw_addr_list *mc_list);
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index fa3b6cc2cafa..9214883eb0a8 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1296,7 +1296,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 		changed |= BSS_CHANGED_UNSOL_BCAST_PROBE_RESP;
 	}
 
-	err = drv_start_ap(sdata->local, sdata, link_id);
+	err = drv_start_ap(sdata->local, sdata, link_conf);
 	if (err) {
 		old = sdata_dereference(link->u.ap.beacon, sdata);
 
@@ -1457,7 +1457,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
 				   GFP_KERNEL);
 	}
 
-	drv_stop_ap(sdata->local, sdata, link_id);
+	drv_stop_ap(sdata->local, sdata, link_conf);
 
 	/* free all potentially still buffered bcast frames */
 	local->total_ps_buffered -= skb_queue_len(&sdata->u.ap.ps.bc_buf);
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index eb16a354338c..a04a88d122b7 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -987,32 +987,38 @@ int drv_switch_vif_chanctx(struct ieee80211_local *local,
 
 static inline int drv_start_ap(struct ieee80211_local *local,
 			       struct ieee80211_sub_if_data *sdata,
-			       unsigned int link_id)
+			       struct ieee80211_bss_conf *link_conf)
 {
 	int ret = 0;
 
+	/* make sure link_conf is protected */
+	sdata_assert_lock(sdata);
+
 	might_sleep();
 
 	if (!check_sdata_in_driver(sdata))
 		return -EIO;
 
-	trace_drv_start_ap(local, sdata, link_id);
+	trace_drv_start_ap(local, sdata, link_conf);
 	if (local->ops->start_ap)
-		ret = local->ops->start_ap(&local->hw, &sdata->vif, link_id);
+		ret = local->ops->start_ap(&local->hw, &sdata->vif, link_conf);
 	trace_drv_return_int(local, ret);
 	return ret;
 }
 
 static inline void drv_stop_ap(struct ieee80211_local *local,
 			       struct ieee80211_sub_if_data *sdata,
-			       unsigned int link_id)
+			       struct ieee80211_bss_conf *link_conf)
 {
+	/* make sure link_conf is protected */
+	sdata_assert_lock(sdata);
+
 	if (!check_sdata_in_driver(sdata))
 		return;
 
-	trace_drv_stop_ap(local, sdata, link_id);
+	trace_drv_stop_ap(local, sdata, link_conf);
 	if (local->ops->stop_ap)
-		local->ops->stop_ap(&local->hw, &sdata->vif, link_id);
+		local->ops->stop_ap(&local->hw, &sdata->vif, link_conf);
 	trace_drv_return_void(local);
 }
 
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index b6f12ac91849..75e5c1376351 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -1754,9 +1754,9 @@ DEFINE_EVENT(local_sdata_chanctx, drv_unassign_vif_chanctx,
 TRACE_EVENT(drv_start_ap,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
-		 unsigned int link_id),
+		 struct ieee80211_bss_conf *link_conf),
 
-	TP_ARGS(local, sdata, link_id),
+	TP_ARGS(local, sdata, link_conf),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
@@ -1769,17 +1769,12 @@ TRACE_EVENT(drv_start_ap,
 	),
 
 	TP_fast_assign(
-		struct ieee80211_bss_conf *info =
-			sdata_dereference(sdata->vif.link_conf[link_id], sdata);
-
 		LOCAL_ASSIGN;
 		VIF_ASSIGN;
-		__entry->link_id = link_id;
-		if (info) {
-			__entry->dtimper = info->dtim_period;
-			__entry->bcnint = info->beacon_int;
-			__entry->hidden_ssid = info->hidden_ssid;
-		}
+		__entry->link_id = link_conf->link_id;
+		__entry->dtimper = link_conf->dtim_period;
+		__entry->bcnint = link_conf->beacon_int;
+		__entry->hidden_ssid = link_conf->hidden_ssid;
 		memcpy(__get_dynamic_array(ssid),
 		       sdata->vif.cfg.ssid,
 		       sdata->vif.cfg.ssid_len);
@@ -1794,9 +1789,9 @@ TRACE_EVENT(drv_start_ap,
 TRACE_EVENT(drv_stop_ap,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
-		 unsigned int link_id),
+		 struct ieee80211_bss_conf *link_conf),
 
-	TP_ARGS(local, sdata, link_id),
+	TP_ARGS(local, sdata, link_conf),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
@@ -1807,7 +1802,7 @@ TRACE_EVENT(drv_stop_ap,
 	TP_fast_assign(
 		LOCAL_ASSIGN;
 		VIF_ASSIGN;
-		__entry->link_id = link_id;
+		__entry->link_id = link_conf->link_id;
 	),
 
 	TP_printk(
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 0ff09c639c50..cb0dd874c5df 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -2579,7 +2579,8 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 				changed |= BSS_CHANGED_AP_PROBE_RESP;
 
 				if (rcu_access_pointer(sdata->deflink.u.ap.beacon))
-					drv_start_ap(local, sdata, 0);
+					drv_start_ap(local, sdata,
+						     sdata->deflink.conf);
 			}
 			fallthrough;
 		case NL80211_IFTYPE_MESH_POINT:
-- 
cgit 


From 19654a61bfd66539087119fbceed46e48f084d89 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 1 Jul 2022 14:01:29 +0200
Subject: wifi: cfg80211: add ieee80211_chanwidth_rate_flags()

To simplify things when we don't have a full chandef,
add ieee80211_chanwidth_rate_flags().

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 2fe3eff11a8b..f9ea49e67164 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -939,19 +939,18 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy,
 				  enum nl80211_iftype iftype);
 
 /**
- * ieee80211_chandef_rate_flags - returns rate flags for a channel
+ * ieee80211_chanwidth_rate_flags - return rate flags for channel width
+ * @width: the channel width of the channel
  *
  * In some channel types, not all rates may be used - for example CCK
  * rates may not be used in 5/10 MHz channels.
  *
- * @chandef: channel definition for the channel
- *
- * Returns: rate flags which apply for this channel
+ * Returns: rate flags which apply for this channel width
  */
 static inline enum ieee80211_rate_flags
-ieee80211_chandef_rate_flags(struct cfg80211_chan_def *chandef)
+ieee80211_chanwidth_rate_flags(enum nl80211_chan_width width)
 {
-	switch (chandef->width) {
+	switch (width) {
 	case NL80211_CHAN_WIDTH_5:
 		return IEEE80211_RATE_SUPPORTS_5MHZ;
 	case NL80211_CHAN_WIDTH_10:
@@ -962,6 +961,20 @@ ieee80211_chandef_rate_flags(struct cfg80211_chan_def *chandef)
 	return 0;
 }
 
+/**
+ * ieee80211_chandef_rate_flags - returns rate flags for a channel
+ * @chandef: channel definition for the channel
+ *
+ * See ieee80211_chanwidth_rate_flags().
+ *
+ * Returns: rate flags which apply for this channel
+ */
+static inline enum ieee80211_rate_flags
+ieee80211_chandef_rate_flags(struct cfg80211_chan_def *chandef)
+{
+	return ieee80211_chanwidth_rate_flags(chandef->width);
+}
+
 /**
  * ieee80211_chandef_max_power - maximum transmission power for the chandef
  *
-- 
cgit 


From 4e9c3af398207d95957ae6c25290891574f2d7e8 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 4 Jul 2022 15:02:33 +0200
Subject: wifi: nl80211: add EML/MLD capabilities to per-iftype capabilities

We have the per-interface type capabilities, currently for
extended capabilities, add the EML/MLD capabilities there
to have this advertised by the driver.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  4 ++++
 include/uapi/linux/nl80211.h | 12 ++++++++++--
 net/wireless/nl80211.c       |  9 +++++++++
 3 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index f9ea49e67164..bc960646973b 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -4993,12 +4993,16 @@ struct wiphy_vendor_command {
  *	802.11-2012 8.4.2.29 for the defined fields.
  * @extended_capabilities_mask: mask of the valid values
  * @extended_capabilities_len: length of the extended capabilities
+ * @eml_capabilities: EML capabilities (for MLO)
+ * @mld_capa_and_ops: MLD capabilities and operations (for MLO)
  */
 struct wiphy_iftype_ext_capab {
 	enum nl80211_iftype iftype;
 	const u8 *extended_capabilities;
 	const u8 *extended_capabilities_mask;
 	u8 extended_capabilities_len;
+	u16 eml_capabilities;
+	u16 mld_capa_and_ops;
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 37bfc934325a..3fa586e38f88 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2368,8 +2368,10 @@ enum nl80211_commands {
  *
  * @NL80211_ATTR_IFTYPE_EXT_CAPA: Nested attribute of the following attributes:
  *	%NL80211_ATTR_IFTYPE, %NL80211_ATTR_EXT_CAPA,
- *	%NL80211_ATTR_EXT_CAPA_MASK, to specify the extended capabilities per
- *	interface type.
+ *	%NL80211_ATTR_EXT_CAPA_MASK, to specify the extended capabilities and
+ *	other interface-type specific capabilities per interface type. For MLO,
+ *	%NL80211_ATTR_EML_CAPABILITY and %NL80211_ATTR_MLD_CAPA_AND_OPS are
+ *	present.
  *
  * @NL80211_ATTR_MU_MIMO_GROUP_DATA: array of 24 bytes that defines a MU-MIMO
  *	groupID for monitor mode.
@@ -2709,6 +2711,9 @@ enum nl80211_commands {
  *	suites allowed as %NL80211_MAX_NR_AKM_SUITES which is the legacy maximum
  *	number prior to the introduction of this attribute.
  *
+ * @NL80211_ATTR_EML_CAPABILITY: EML Capability information (u16)
+ * @NL80211_ATTR_MLD_CAPA_AND_OPS: MLD Capabilities and Operations (u16)
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -3231,6 +3236,9 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_MAX_NUM_AKM_SUITES,
 
+	NL80211_ATTR_EML_CAPABILITY,
+	NL80211_ATTR_MLD_CAPA_AND_OPS,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 37ec8b3897b4..35fb2b0517d9 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -2867,6 +2867,15 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 					    capab->extended_capabilities_mask))
 					goto nla_put_failure;
 
+				if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_MLO &&
+				    (nla_put_u16(msg,
+						 NL80211_ATTR_EML_CAPABILITY,
+						 capab->eml_capabilities) ||
+				     nla_put_u16(msg,
+						 NL80211_ATTR_MLD_CAPA_AND_OPS,
+						 capab->mld_capa_and_ops)))
+					goto nla_put_failure;
+
 				nla_nest_end(msg, nested_ext_capab);
 				if (state->split)
 					break;
-- 
cgit 


From 67207bab9341418698ae1029b28a44b58c39257e Mon Sep 17 00:00:00 2001
From: Andrei Otcheretianski <andrei.otcheretianski@intel.com>
Date: Thu, 30 Jun 2022 15:27:59 +0300
Subject: wifi: cfg80211/mac80211: Support control port TX from specific link

In case of authentication with a legacy station, link addressed EAPOL
frames should be sent. Support it.

Signed-off-by: Andrei Otcheretianski <andrei.otcheretianski@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h     |  2 +-
 net/mac80211/ieee80211_i.h |  2 +-
 net/mac80211/tx.c          | 20 ++++++++++++++++++--
 net/wireless/nl80211.c     |  5 ++++-
 net/wireless/rdev-ops.h    |  7 ++++---
 net/wireless/trace.h       | 11 +++++++----
 6 files changed, 35 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index bc960646973b..8dbc64286d91 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -4580,7 +4580,7 @@ struct cfg80211_ops {
 				   struct net_device *dev,
 				   const u8 *buf, size_t len,
 				   const u8 *dest, const __be16 proto,
-				   const bool noencrypt,
+				   const bool noencrypt, int link_id,
 				   u64 *cookie);
 
 	int	(*get_ftm_responder_stats)(struct wiphy *wiphy,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index d17d73e8d19f..58b08315fa26 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1946,7 +1946,7 @@ void ieee80211_clear_fast_xmit(struct sta_info *sta);
 int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
 			      const u8 *buf, size_t len,
 			      const u8 *dest, __be16 proto, bool unencrypted,
-			      u64 *cookie);
+			      int link_id, u64 *cookie);
 int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev,
 			      const u8 *buf, size_t len);
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 4a7a714de79e..8f25cfe0d543 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -5674,7 +5674,7 @@ void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata,
 int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
 			      const u8 *buf, size_t len,
 			      const u8 *dest, __be16 proto, bool unencrypted,
-			      u64 *cookie)
+			      int link_id, u64 *cookie)
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_local *local = sdata->local;
@@ -5714,7 +5714,23 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
 
 	ehdr = skb_push(skb, sizeof(struct ethhdr));
 	memcpy(ehdr->h_dest, dest, ETH_ALEN);
-	memcpy(ehdr->h_source, sdata->vif.addr, ETH_ALEN);
+
+	if (link_id < 0) {
+		memcpy(ehdr->h_source, sdata->vif.addr, ETH_ALEN);
+	} else {
+		struct ieee80211_bss_conf *link_conf;
+
+		rcu_read_lock();
+		link_conf = rcu_dereference(sdata->vif.link_conf[link_id]);
+		if (!link_conf) {
+			dev_kfree_skb(skb);
+			rcu_read_unlock();
+			return -ENOLINK;
+		}
+		memcpy(ehdr->h_source, link_conf->addr, ETH_ALEN);
+		rcu_read_unlock();
+	}
+
 	ehdr->h_proto = proto;
 
 	skb->dev = dev;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 35fcf36bbaad..53d63effbca9 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -15223,6 +15223,7 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info)
 	u16 proto;
 	bool noencrypt;
 	u64 cookie = 0;
+	int link_id;
 	int err;
 
 	if (!wiphy_ext_feature_isset(&rdev->wiphy,
@@ -15271,8 +15272,10 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info)
 	noencrypt =
 		nla_get_flag(info->attrs[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT]);
 
+	link_id = nl80211_link_id_or_invalid(info->attrs);
+
 	err = rdev_tx_control_port(rdev, dev, buf, len,
-				   dest, cpu_to_be16(proto), noencrypt,
+				   dest, cpu_to_be16(proto), noencrypt, link_id,
 				   dont_wait_for_ack ? NULL : &cookie);
 	if (!err && !dont_wait_for_ack)
 		nl_set_extack_cookie_u64(info->extack, cookie);
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 53f5a0126dfd..40915a82da73 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -746,13 +746,14 @@ static inline int rdev_tx_control_port(struct cfg80211_registered_device *rdev,
 				       struct net_device *dev,
 				       const void *buf, size_t len,
 				       const u8 *dest, __be16 proto,
-				       const bool noencrypt, u64 *cookie)
+				       const bool noencrypt, int link,
+				       u64 *cookie)
 {
 	int ret;
 	trace_rdev_tx_control_port(&rdev->wiphy, dev, buf, len,
-				   dest, proto, noencrypt);
+				   dest, proto, noencrypt, link);
 	ret = rdev->ops->tx_control_port(&rdev->wiphy, dev, buf, len,
-					 dest, proto, noencrypt, cookie);
+					 dest, proto, noencrypt, link, cookie);
 	if (cookie)
 		trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie);
 	else
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index dac66ad5937d..592b9e9e821a 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2015,14 +2015,15 @@ TRACE_EVENT(rdev_mgmt_tx,
 TRACE_EVENT(rdev_tx_control_port,
 	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
 		 const u8 *buf, size_t len, const u8 *dest, __be16 proto,
-		 bool unencrypted),
-	TP_ARGS(wiphy, netdev, buf, len, dest, proto, unencrypted),
+		 bool unencrypted, int link_id),
+	TP_ARGS(wiphy, netdev, buf, len, dest, proto, unencrypted, link_id),
 	TP_STRUCT__entry(
 		WIPHY_ENTRY
 		NETDEV_ENTRY
 		MAC_ENTRY(dest)
 		__field(__be16, proto)
 		__field(bool, unencrypted)
+		__field(int, link_id)
 	),
 	TP_fast_assign(
 		WIPHY_ASSIGN;
@@ -2030,12 +2031,14 @@ TRACE_EVENT(rdev_tx_control_port,
 		MAC_ASSIGN(dest, dest);
 		__entry->proto = proto;
 		__entry->unencrypted = unencrypted;
+		__entry->link_id = link_id;
 	),
 	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ","
-		  " proto: 0x%x, unencrypted: %s",
+		  " proto: 0x%x, unencrypted: %s, link: %d",
 		  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(dest),
 		  be16_to_cpu(__entry->proto),
-		  BOOL_TO_STR(__entry->unencrypted))
+		  BOOL_TO_STR(__entry->unencrypted),
+		  __entry->link_id)
 );
 
 TRACE_EVENT(rdev_set_noack_map,
-- 
cgit 


From 727eff4dd198d79f9e81d3aafbab741a8374b5d0 Mon Sep 17 00:00:00 2001
From: Gregory Greenman <gregory.greenman@intel.com>
Date: Sun, 3 Jul 2022 18:04:15 +0300
Subject: wifi: mac80211: replace link_id with link_conf in
 switch/(un)assign_vif_chanctx()

Since mac80211 already has a protected pointer to link_conf,
pass it to the driver to avoid additional RCU locking.

Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath10k/mac.c             |  4 ++--
 drivers/net/wireless/ath/ath11k/mac.c             |  4 ++--
 drivers/net/wireless/ath/ath9k/main.c             |  4 ++--
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c |  4 ++--
 drivers/net/wireless/mac80211_hwsim.c             |  4 ++--
 drivers/net/wireless/silabs/wfx/sta.c             |  4 ++--
 drivers/net/wireless/silabs/wfx/sta.h             |  4 ++--
 drivers/net/wireless/ti/wlcore/main.c             |  4 ++--
 include/net/mac80211.h                            |  8 +++----
 net/mac80211/chan.c                               |  9 ++++----
 net/mac80211/driver-ops.h                         | 26 ++++++++++++++++-------
 net/mac80211/trace.h                              | 16 +++++++-------
 net/mac80211/util.c                               |  2 +-
 13 files changed, 51 insertions(+), 42 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index ac54396418c9..9dd3b8fba4b0 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -8920,7 +8920,7 @@ unlock:
 static int
 ath10k_mac_op_assign_vif_chanctx(struct ieee80211_hw *hw,
 				 struct ieee80211_vif *vif,
-				 unsigned int link_id,
+				 struct ieee80211_bss_conf *link_conf,
 				 struct ieee80211_chanctx_conf *ctx)
 {
 	struct ath10k *ar = hw->priv;
@@ -9000,7 +9000,7 @@ err:
 static void
 ath10k_mac_op_unassign_vif_chanctx(struct ieee80211_hw *hw,
 				   struct ieee80211_vif *vif,
-				   unsigned int link_id,
+				   struct ieee80211_bss_conf *link_conf,
 				   struct ieee80211_chanctx_conf *ctx)
 {
 	struct ath10k *ar = hw->priv;
diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c
index ec7b3a3629f3..92e17d3c634e 100644
--- a/drivers/net/wireless/ath/ath11k/mac.c
+++ b/drivers/net/wireless/ath/ath11k/mac.c
@@ -7073,7 +7073,7 @@ static int ath11k_start_vdev_delay(struct ieee80211_hw *hw,
 static int
 ath11k_mac_op_assign_vif_chanctx(struct ieee80211_hw *hw,
 				 struct ieee80211_vif *vif,
-				 unsigned int link_id,
+				 struct ieee80211_bss_conf *link_conf,
 				 struct ieee80211_chanctx_conf *ctx)
 {
 	struct ath11k *ar = hw->priv;
@@ -7163,7 +7163,7 @@ out:
 static void
 ath11k_mac_op_unassign_vif_chanctx(struct ieee80211_hw *hw,
 				   struct ieee80211_vif *vif,
-				   unsigned int link_id,
+				   struct ieee80211_bss_conf *link_conf,
 				   struct ieee80211_chanctx_conf *ctx)
 {
 	struct ath11k *ar = hw->priv;
diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index d2c20c332c7a..a4197c14f0a9 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -2597,7 +2597,7 @@ static void ath9k_change_chanctx(struct ieee80211_hw *hw,
 
 static int ath9k_assign_vif_chanctx(struct ieee80211_hw *hw,
 				    struct ieee80211_vif *vif,
-				    unsigned int link_id,
+				    struct ieee80211_bss_conf *link_conf,
 				    struct ieee80211_chanctx_conf *conf)
 {
 	struct ath_softc *sc = hw->priv;
@@ -2629,7 +2629,7 @@ static int ath9k_assign_vif_chanctx(struct ieee80211_hw *hw,
 
 static void ath9k_unassign_vif_chanctx(struct ieee80211_hw *hw,
 				       struct ieee80211_vif *vif,
-				       unsigned int link_id,
+				       struct ieee80211_bss_conf *link_conf,
 				       struct ieee80211_chanctx_conf *conf)
 {
 	struct ath_softc *sc = hw->priv;
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
index 126106ea62d3..5eb28f8ee87e 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
@@ -4264,7 +4264,7 @@ out:
 }
 static int iwl_mvm_assign_vif_chanctx(struct ieee80211_hw *hw,
 				      struct ieee80211_vif *vif,
-				      unsigned int link_id,
+				      struct ieee80211_bss_conf *link_conf,
 				      struct ieee80211_chanctx_conf *ctx)
 {
 	struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw);
@@ -4338,7 +4338,7 @@ out:
 
 static void iwl_mvm_unassign_vif_chanctx(struct ieee80211_hw *hw,
 					 struct ieee80211_vif *vif,
-					 unsigned int link_id,
+					 struct ieee80211_bss_conf *link_conf,
 					 struct ieee80211_chanctx_conf *ctx)
 {
 	struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw);
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 5da7a097d518..7d573e90ad81 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -2794,7 +2794,7 @@ static void mac80211_hwsim_change_chanctx(struct ieee80211_hw *hw,
 
 static int mac80211_hwsim_assign_vif_chanctx(struct ieee80211_hw *hw,
 					     struct ieee80211_vif *vif,
-					     unsigned int link_id,
+					     struct ieee80211_bss_conf *link_conf,
 					     struct ieee80211_chanctx_conf *ctx)
 {
 	hwsim_check_magic(vif);
@@ -2805,7 +2805,7 @@ static int mac80211_hwsim_assign_vif_chanctx(struct ieee80211_hw *hw,
 
 static void mac80211_hwsim_unassign_vif_chanctx(struct ieee80211_hw *hw,
 						struct ieee80211_vif *vif,
-						unsigned int link_id,
+						struct ieee80211_bss_conf *link_conf,
 						struct ieee80211_chanctx_conf *ctx)
 {
 	hwsim_check_magic(vif);
diff --git a/drivers/net/wireless/silabs/wfx/sta.c b/drivers/net/wireless/silabs/wfx/sta.c
index 920bd1a4a1b1..626dfb4b7a55 100644
--- a/drivers/net/wireless/silabs/wfx/sta.c
+++ b/drivers/net/wireless/silabs/wfx/sta.c
@@ -683,7 +683,7 @@ void wfx_change_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *
 }
 
 int wfx_assign_vif_chanctx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-			   unsigned int link_id,
+			   struct ieee80211_bss_conf *link_conf,
 			   struct ieee80211_chanctx_conf *conf)
 {
 	struct wfx_vif *wvif = (struct wfx_vif *)vif->drv_priv;
@@ -696,7 +696,7 @@ int wfx_assign_vif_chanctx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 }
 
 void wfx_unassign_vif_chanctx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-			      unsigned int link_id,
+			      struct ieee80211_bss_conf *link_conf,
 			      struct ieee80211_chanctx_conf *conf)
 {
 	struct wfx_vif *wvif = (struct wfx_vif *)vif->drv_priv;
diff --git a/drivers/net/wireless/silabs/wfx/sta.h b/drivers/net/wireless/silabs/wfx/sta.h
index bf2e76167a6f..888db5cd3206 100644
--- a/drivers/net/wireless/silabs/wfx/sta.h
+++ b/drivers/net/wireless/silabs/wfx/sta.h
@@ -51,10 +51,10 @@ int wfx_add_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *conf
 void wfx_remove_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *conf);
 void wfx_change_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *conf, u32 changed);
 int wfx_assign_vif_chanctx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-			   unsigned int link_id,
+			   struct ieee80211_bss_conf *link_conf,
 			   struct ieee80211_chanctx_conf *conf);
 void wfx_unassign_vif_chanctx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-			      unsigned int link_id,
+			      struct ieee80211_bss_conf *link_conf,
 			      struct ieee80211_chanctx_conf *conf);
 
 /* Hardware API Callbacks */
diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c
index 1edec9f3c0d8..3e3922d4c788 100644
--- a/drivers/net/wireless/ti/wlcore/main.c
+++ b/drivers/net/wireless/ti/wlcore/main.c
@@ -4701,7 +4701,7 @@ out:
 
 static int wlcore_op_assign_vif_chanctx(struct ieee80211_hw *hw,
 					struct ieee80211_vif *vif,
-					unsigned int link_id,
+					struct ieee80211_bss_conf *link_conf,
 					struct ieee80211_chanctx_conf *ctx)
 {
 	struct wl1271 *wl = hw->priv;
@@ -4752,7 +4752,7 @@ out:
 
 static void wlcore_op_unassign_vif_chanctx(struct ieee80211_hw *hw,
 					   struct ieee80211_vif *vif,
-					   unsigned int link_id,
+					   struct ieee80211_bss_conf *link_conf,
 					   struct ieee80211_chanctx_conf *ctx)
 {
 	struct wl1271 *wl = hw->priv;
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index dcb8b6dac2e1..e4ead73c5c43 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -261,13 +261,13 @@ enum ieee80211_chanctx_switch_mode {
  * done.
  *
  * @vif: the vif that should be switched from old_ctx to new_ctx
- * @link_id: the link ID that's switching
+ * @link_conf: the link conf that's switching
  * @old_ctx: the old context to which the vif was assigned
  * @new_ctx: the new context to which the vif must be assigned
  */
 struct ieee80211_vif_chanctx_switch {
 	struct ieee80211_vif *vif;
-	unsigned int link_id;
+	struct ieee80211_bss_conf *link_conf;
 	struct ieee80211_chanctx_conf *old_ctx;
 	struct ieee80211_chanctx_conf *new_ctx;
 };
@@ -4297,11 +4297,11 @@ struct ieee80211_ops {
 			       u32 changed);
 	int (*assign_vif_chanctx)(struct ieee80211_hw *hw,
 				  struct ieee80211_vif *vif,
-				  unsigned int link_id,
+				  struct ieee80211_bss_conf *link_conf,
 				  struct ieee80211_chanctx_conf *ctx);
 	void (*unassign_vif_chanctx)(struct ieee80211_hw *hw,
 				     struct ieee80211_vif *vif,
-				     unsigned int link_id,
+				     struct ieee80211_bss_conf *link_conf,
 				     struct ieee80211_chanctx_conf *ctx);
 	int (*switch_vif_chanctx)(struct ieee80211_hw *hw,
 				  struct ieee80211_vif_chanctx_switch *vifs,
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index 2e9bc285f0a5..f247daa41563 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -835,7 +835,6 @@ static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link,
 					 struct ieee80211_chanctx *new_ctx)
 {
 	struct ieee80211_sub_if_data *sdata = link->sdata;
-	unsigned int link_id = link->link_id;
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_chanctx_conf *conf;
 	struct ieee80211_chanctx *curr_ctx = NULL;
@@ -850,13 +849,13 @@ static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link,
 	if (conf) {
 		curr_ctx = container_of(conf, struct ieee80211_chanctx, conf);
 
-		drv_unassign_vif_chanctx(local, sdata, link_id, curr_ctx);
+		drv_unassign_vif_chanctx(local, sdata, link->conf, curr_ctx);
 		conf = NULL;
 		list_del(&link->assigned_chanctx_list);
 	}
 
 	if (new_ctx) {
-		ret = drv_assign_vif_chanctx(local, sdata, link_id, new_ctx);
+		ret = drv_assign_vif_chanctx(local, sdata, link->conf, new_ctx);
 		if (ret)
 			goto out;
 
@@ -1276,7 +1275,7 @@ ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link)
 	vif_chsw[0].vif = &sdata->vif;
 	vif_chsw[0].old_ctx = &old_ctx->conf;
 	vif_chsw[0].new_ctx = &new_ctx->conf;
-	vif_chsw[0].link_id = link->link_id;
+	vif_chsw[0].link_conf = link->conf;
 
 	list_del(&link->reserved_chanctx_list);
 	link->reserved_chanctx = NULL;
@@ -1440,7 +1439,7 @@ static int ieee80211_chsw_switch_vifs(struct ieee80211_local *local,
 			vif_chsw[i].vif = &link->sdata->vif;
 			vif_chsw[i].old_ctx = &old_ctx->conf;
 			vif_chsw[i].new_ctx = &ctx->conf;
-			vif_chsw[i].link_id = link->link_id;
+			vif_chsw[i].link_conf = link->conf;
 
 			i++;
 		}
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index a04a88d122b7..0f06081c68ca 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -937,22 +937,31 @@ static inline void drv_change_chanctx(struct ieee80211_local *local,
 	trace_drv_return_void(local);
 }
 
+static inline void drv_verify_link_exists(struct ieee80211_sub_if_data *sdata,
+					  struct ieee80211_bss_conf *link_conf)
+{
+	/* deflink always exists, so need to check only for other links */
+	if (sdata->deflink.conf != link_conf)
+		sdata_assert_lock(sdata);
+}
+
 static inline int drv_assign_vif_chanctx(struct ieee80211_local *local,
 					 struct ieee80211_sub_if_data *sdata,
-					 unsigned int link_id,
+					 struct ieee80211_bss_conf *link_conf,
 					 struct ieee80211_chanctx *ctx)
 {
 	int ret = 0;
 
+	drv_verify_link_exists(sdata, link_conf);
 	if (!check_sdata_in_driver(sdata))
 		return -EIO;
 
-	trace_drv_assign_vif_chanctx(local, sdata, link_id, ctx);
+	trace_drv_assign_vif_chanctx(local, sdata, link_conf, ctx);
 	if (local->ops->assign_vif_chanctx) {
 		WARN_ON_ONCE(!ctx->driver_present);
 		ret = local->ops->assign_vif_chanctx(&local->hw,
 						     &sdata->vif,
-						     link_id,
+						     link_conf,
 						     &ctx->conf);
 	}
 	trace_drv_return_int(local, ret);
@@ -962,20 +971,21 @@ static inline int drv_assign_vif_chanctx(struct ieee80211_local *local,
 
 static inline void drv_unassign_vif_chanctx(struct ieee80211_local *local,
 					    struct ieee80211_sub_if_data *sdata,
-					    unsigned int link_id,
+					    struct ieee80211_bss_conf *link_conf,
 					    struct ieee80211_chanctx *ctx)
 {
 	might_sleep();
 
+	drv_verify_link_exists(sdata, link_conf);
 	if (!check_sdata_in_driver(sdata))
 		return;
 
-	trace_drv_unassign_vif_chanctx(local, sdata, link_id, ctx);
+	trace_drv_unassign_vif_chanctx(local, sdata, link_conf, ctx);
 	if (local->ops->unassign_vif_chanctx) {
 		WARN_ON_ONCE(!ctx->driver_present);
 		local->ops->unassign_vif_chanctx(&local->hw,
 						 &sdata->vif,
-						 link_id,
+						 link_conf,
 						 &ctx->conf);
 	}
 	trace_drv_return_void(local);
@@ -992,7 +1002,7 @@ static inline int drv_start_ap(struct ieee80211_local *local,
 	int ret = 0;
 
 	/* make sure link_conf is protected */
-	sdata_assert_lock(sdata);
+	drv_verify_link_exists(sdata, link_conf);
 
 	might_sleep();
 
@@ -1011,7 +1021,7 @@ static inline void drv_stop_ap(struct ieee80211_local *local,
 			       struct ieee80211_bss_conf *link_conf)
 {
 	/* make sure link_conf is protected */
-	sdata_assert_lock(sdata);
+	drv_verify_link_exists(sdata, link_conf);
 
 	if (!check_sdata_in_driver(sdata))
 		return;
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 75e5c1376351..402110f439f8 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -1669,7 +1669,7 @@ TRACE_EVENT(drv_switch_vif_chanctx,
 
 				SWITCH_ENTRY_ASSIGN(vif.vif_type, vif->type);
 				SWITCH_ENTRY_ASSIGN(vif.p2p, vif->p2p);
-				SWITCH_ENTRY_ASSIGN(link_id, link_id);
+				SWITCH_ENTRY_ASSIGN(link_id, link_conf->link_id);
 				strncpy(local_vifs[i].vif.vif_name,
 					sdata->name,
 					sizeof(local_vifs[i].vif.vif_name));
@@ -1710,10 +1710,10 @@ TRACE_EVENT(drv_switch_vif_chanctx,
 DECLARE_EVENT_CLASS(local_sdata_chanctx,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
-		 unsigned int link_id,
+		 struct ieee80211_bss_conf *link_conf,
 		 struct ieee80211_chanctx *ctx),
 
-	TP_ARGS(local, sdata, link_id, ctx),
+	TP_ARGS(local, sdata, link_conf, ctx),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
@@ -1726,7 +1726,7 @@ DECLARE_EVENT_CLASS(local_sdata_chanctx,
 		LOCAL_ASSIGN;
 		VIF_ASSIGN;
 		CHANCTX_ASSIGN;
-		__entry->link_id = link_id;
+		__entry->link_id = link_conf->link_id;
 	),
 
 	TP_printk(
@@ -1738,17 +1738,17 @@ DECLARE_EVENT_CLASS(local_sdata_chanctx,
 DEFINE_EVENT(local_sdata_chanctx, drv_assign_vif_chanctx,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
-		 unsigned int link_id,
+		 struct ieee80211_bss_conf *link_conf,
 		 struct ieee80211_chanctx *ctx),
-	TP_ARGS(local, sdata, link_id, ctx)
+	TP_ARGS(local, sdata, link_conf, ctx)
 );
 
 DEFINE_EVENT(local_sdata_chanctx, drv_unassign_vif_chanctx,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
-		 unsigned int link_id,
+		 struct ieee80211_bss_conf *link_conf,
 		 struct ieee80211_chanctx *ctx),
-	TP_ARGS(local, sdata, link_id, ctx)
+	TP_ARGS(local, sdata, link_conf, ctx)
 );
 
 TRACE_EVENT(drv_start_ap,
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 739c05fdb9bb..8cb93d65b80e 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -2270,7 +2270,7 @@ static void ieee80211_assign_chanctx(struct ieee80211_local *local,
 					 lockdep_is_held(&local->chanctx_mtx));
 	if (conf) {
 		ctx = container_of(conf, struct ieee80211_chanctx, conf);
-		drv_assign_vif_chanctx(local, sdata, link->link_id, ctx);
+		drv_assign_vif_chanctx(local, sdata, link->conf, ctx);
 	}
 	mutex_unlock(&local->chanctx_mtx);
 }
-- 
cgit 


From 7840bd468a99edac26492afa828b8fcbbbb2384e Mon Sep 17 00:00:00 2001
From: Gregory Greenman <gregory.greenman@intel.com>
Date: Mon, 4 Jul 2022 00:38:22 +0300
Subject: wifi: mac80211: remove link_id parameter from link_info_changed()

Since struct ieee80211_bss_conf already contains link_id,
passing link_id is not necessary.

Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/mac80211_hwsim.c | 3 ++-
 include/net/mac80211.h                | 1 -
 net/mac80211/driver-ops.h             | 4 ++--
 net/mac80211/main.c                   | 5 ++---
 net/mac80211/trace.h                  | 6 +++---
 5 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 7d573e90ad81..5418c4973501 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -2178,10 +2178,11 @@ static void mac80211_hwsim_vif_info_changed(struct ieee80211_hw *hw,
 static void mac80211_hwsim_link_info_changed(struct ieee80211_hw *hw,
 					     struct ieee80211_vif *vif,
 					     struct ieee80211_bss_conf *info,
-					     u32 link_id, u64 changed)
+					     u64 changed)
 {
 	struct hwsim_vif_priv *vp = (void *)vif->drv_priv;
 	struct mac80211_hwsim_data *data = hw->priv;
+	unsigned int link_id = info->link_id;
 	struct mac80211_hwsim_link_data *link_data = &data->link_data[link_id];
 
 	hwsim_check_magic(vif);
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index e4ead73c5c43..6fc4253b5644 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4088,7 +4088,6 @@ struct ieee80211_ops {
 	void (*link_info_changed)(struct ieee80211_hw *hw,
 				  struct ieee80211_vif *vif,
 				  struct ieee80211_bss_conf *info,
-				  unsigned int link_id,
 				  u64 changed);
 
 	int (*start_ap)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 0f06081c68ca..482f5c97a72b 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -190,10 +190,10 @@ static inline void drv_link_info_changed(struct ieee80211_local *local,
 	if (!check_sdata_in_driver(sdata))
 		return;
 
-	trace_drv_link_info_changed(local, sdata, info, link_id, changed);
+	trace_drv_link_info_changed(local, sdata, info, changed);
 	if (local->ops->link_info_changed)
 		local->ops->link_info_changed(&local->hw, &sdata->vif,
-					      info, link_id, changed);
+					      info, changed);
 	else if (local->ops->bss_info_changed)
 		local->ops->bss_info_changed(&local->hw, &sdata->vif,
 					     info, changed);
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 8d5b18318b20..26bb30606282 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -248,11 +248,10 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
 
 		/* FIXME: should be for each link */
 		trace_drv_link_info_changed(local, sdata, &sdata->vif.bss_conf,
-					    0, changed);
+					    changed);
 		if (local->ops->link_info_changed)
 			local->ops->link_info_changed(&local->hw, &sdata->vif,
-						      &sdata->vif.bss_conf,
-						      0, ch);
+						      &sdata->vif.bss_conf, ch);
 	}
 
 	if (local->ops->bss_info_changed)
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 402110f439f8..9f4377566c42 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -449,9 +449,9 @@ TRACE_EVENT(drv_link_info_changed,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
 		 struct ieee80211_bss_conf *link_conf,
-		 int link_id, u64 changed),
+		 u64 changed),
 
-	TP_ARGS(local, sdata, link_conf, link_id, changed),
+	TP_ARGS(local, sdata, link_conf, changed),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
@@ -486,7 +486,7 @@ TRACE_EVENT(drv_link_info_changed,
 		LOCAL_ASSIGN;
 		VIF_ASSIGN;
 		__entry->changed = changed;
-		__entry->link_id = link_id;
+		__entry->link_id = link_conf->link_id;
 		__entry->shortpre = link_conf->use_short_preamble;
 		__entry->cts = link_conf->use_cts_prot;
 		__entry->shortslot = link_conf->use_short_slot;
-- 
cgit 


From 7464f665158e09f3f29116d8d0676824c1f1eeda Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 12 Jul 2022 18:32:49 +0200
Subject: wifi: cfg80211: add cfg80211_get_iftype_ext_capa()

Add a helper function cfg80211_get_iftype_ext_capa() to
look up interface type-specific (extended) capabilities.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h |  8 ++++++++
 net/wireless/util.c    | 14 ++++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 8dbc64286d91..d5af3a7fc2b4 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5005,6 +5005,14 @@ struct wiphy_iftype_ext_capab {
 	u16 mld_capa_and_ops;
 };
 
+/**
+ * cfg80211_get_iftype_ext_capa - lookup interface type extended capability
+ * @wiphy: the wiphy to look up from
+ * @type: the interface type to look up
+ */
+const struct wiphy_iftype_ext_capab *
+cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type);
+
 /**
  * struct cfg80211_pmsr_capabilities - cfg80211 peer measurement capabilities
  * @max_peers: maximum number of peers in a single measurement
diff --git a/net/wireless/util.c b/net/wireless/util.c
index fe7956c8c6da..2c127951764a 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -2490,3 +2490,17 @@ int cfg80211_remove_virtual_intf(struct cfg80211_registered_device *rdev,
 
 	return rdev_del_virtual_intf(rdev, wdev);
 }
+
+const struct wiphy_iftype_ext_capab *
+cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type)
+{
+	int i;
+
+	for (i = 0; i < wiphy->num_iftype_ext_capab; i++) {
+		if (wiphy->iftype_ext_capab[i].iftype == type)
+			return &wiphy->iftype_ext_capab[i];
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(cfg80211_get_iftype_ext_capa);
-- 
cgit 


From 3e0278b717b077f9ccf0280580ce6c5eb9c4dbac Mon Sep 17 00:00:00 2001
From: Andrei Otcheretianski <andrei.otcheretianski@intel.com>
Date: Wed, 13 Jul 2022 12:05:27 +0300
Subject: wifi: mac80211: select link when transmitting to non-MLO stations

When an MLO AP is transmitting to a non-MLO station, addr2 should be set
to a link address. This should be done before the frame is encrypted as
otherwise aad verification would fail. In case of software encryption
this can't be left for the device to handle, and should be done by
mac80211 when building the frame hdr.

Signed-off-by: Andrei Otcheretianski <andrei.otcheretianski@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h |  2 ++
 net/mac80211/cfg.c     |  4 ++++
 net/mac80211/tx.c      | 19 +++++++++++++++++--
 3 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 6fc4253b5644..6f856da28d71 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2147,6 +2147,7 @@ struct ieee80211_link_sta {
  * @tdls_initiator: indicates the STA is an initiator of the TDLS link. Only
  *	valid if the STA is a TDLS peer in the first place.
  * @mfp: indicates whether the STA uses management frame protection or not.
+ * @mlo: indicates whether the STA is MLO station.
  * @max_amsdu_subframes: indicates the maximal number of MSDUs in a single
  *	A-MSDU. Taken from the Extended Capabilities element. 0 means
  *	unlimited.
@@ -2180,6 +2181,7 @@ struct ieee80211_sta {
 	bool tdls;
 	bool tdls_initiator;
 	bool mfp;
+	bool mlo;
 	u8 max_amsdu_subframes;
 
 	/**
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 1cacd1e0fc85..fe6500b36953 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1818,6 +1818,10 @@ static int sta_apply_parameters(struct ieee80211_local *local,
 			return ret;
 	}
 
+	/* Mark the STA as MLO if MLD MAC address is available */
+	if (params->link_sta_params.mld_mac)
+		sta->sta.mlo = true;
+
 	return 0;
 }
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 5dd29288c009..1d60eab8308a 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -2570,6 +2570,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_chanctx_conf *chanctx_conf = NULL;
 	enum nl80211_band band;
 	int ret;
+	u8 link_id = IEEE80211_LINK_UNSPECIFIED;
 
 	if (IS_ERR(sta))
 		sta = NULL;
@@ -2618,7 +2619,21 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
 		fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS);
 		/* DA BSSID SA */
 		memcpy(hdr.addr1, skb->data, ETH_ALEN);
-		memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN);
+
+		if (sdata->vif.valid_links && sta && !sta->sta.mlo) {
+			struct ieee80211_link_data *link;
+
+			link_id = sta->deflink.link_id;
+			link = rcu_dereference(sdata->link[link_id]);
+			if (WARN_ON(!link)) {
+				ret = -ENOLINK;
+				goto free;
+			}
+			memcpy(hdr.addr2, link->conf->addr, ETH_ALEN);
+		} else {
+			memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN);
+		}
+
 		memcpy(hdr.addr3, skb->data + ETH_ALEN, ETH_ALEN);
 		hdrlen = 24;
 		break;
@@ -2882,7 +2897,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
 	info->ack_frame_id = info_id;
 	info->band = band;
 	info->control.flags = ctrl_flags |
-			      u32_encode_bits(IEEE80211_LINK_UNSPECIFIED,
+			      u32_encode_bits(link_id,
 					      IEEE80211_TX_CTRL_MLO_LINK);
 
 	return skb;
-- 
cgit 


From 8281b7ec5c56b71cb2cc5a1728b41607be66959c Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Wed, 13 Jul 2022 13:51:51 -0700
Subject: ip: Fix data-races around sysctl_ip_default_ttl.

While reading sysctl_ip_default_ttl, it can be changed concurrently.
Thus, we need to add READ_ONCE() to its readers.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/flower/action.c | 2 +-
 include/net/route.h                                | 2 +-
 net/ipv4/ip_sockglue.c                             | 2 +-
 net/ipv4/netfilter/nf_reject_ipv4.c                | 4 ++--
 net/ipv4/proc.c                                    | 2 +-
 net/netfilter/nf_synproxy_core.c                   | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c
index 0147de405365..ffb6f6d05a07 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -474,7 +474,7 @@ nfp_fl_set_tun(struct nfp_app *app, struct nfp_fl_set_tun *set_tun,
 			set_tun->ttl = ip4_dst_hoplimit(&rt->dst);
 			ip_rt_put(rt);
 		} else {
-			set_tun->ttl = net->ipv4.sysctl_ip_default_ttl;
+			set_tun->ttl = READ_ONCE(net->ipv4.sysctl_ip_default_ttl);
 		}
 	}
 
diff --git a/include/net/route.h b/include/net/route.h
index 991a3985712d..bbcf2aba149f 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -373,7 +373,7 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
 	struct net *net = dev_net(dst->dev);
 
 	if (hoplimit == 0)
-		hoplimit = net->ipv4.sysctl_ip_default_ttl;
+		hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl);
 	return hoplimit;
 }
 
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 445a9ecaefa1..d497d525dea3 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1606,7 +1606,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	{
 		struct net *net = sock_net(sk);
 		val = (inet->uc_ttl == -1 ?
-		       net->ipv4.sysctl_ip_default_ttl :
+		       READ_ONCE(net->ipv4.sysctl_ip_default_ttl) :
 		       inet->uc_ttl);
 		break;
 	}
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 918c61fda0f3..d640adcaf1b1 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -62,7 +62,7 @@ struct sk_buff *nf_reject_skb_v4_tcp_reset(struct net *net,
 
 	skb_reserve(nskb, LL_MAX_HEADER);
 	niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
-				   net->ipv4.sysctl_ip_default_ttl);
+				   READ_ONCE(net->ipv4.sysctl_ip_default_ttl));
 	nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
 	niph->tot_len = htons(nskb->len);
 	ip_send_check(niph);
@@ -117,7 +117,7 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct net *net,
 
 	skb_reserve(nskb, LL_MAX_HEADER);
 	niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_ICMP,
-				   net->ipv4.sysctl_ip_default_ttl);
+				   READ_ONCE(net->ipv4.sysctl_ip_default_ttl));
 
 	skb_reset_transport_header(nskb);
 	icmph = skb_put_zero(nskb, sizeof(struct icmphdr));
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 28836071f0a6..0088a4c64d77 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -387,7 +387,7 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
 
 	seq_printf(seq, "\nIp: %d %d",
 		   IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2,
-		   net->ipv4.sysctl_ip_default_ttl);
+		   READ_ONCE(net->ipv4.sysctl_ip_default_ttl));
 
 	BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
 	snmp_get_cpu_field64_batch(buff64, snmp4_ipstats_list,
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index e479dd0561c5..16915f8eef2b 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -405,7 +405,7 @@ synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr,
 	iph->tos	= 0;
 	iph->id		= 0;
 	iph->frag_off	= htons(IP_DF);
-	iph->ttl	= net->ipv4.sysctl_ip_default_ttl;
+	iph->ttl	= READ_ONCE(net->ipv4.sysctl_ip_default_ttl);
 	iph->protocol	= IPPROTO_TCP;
 	iph->check	= 0;
 	iph->saddr	= saddr;
-- 
cgit 


From 60c158dc7b1f0558f6cadd5b50d0386da0000d50 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Wed, 13 Jul 2022 13:51:53 -0700
Subject: ip: Fix data-races around sysctl_ip_fwd_use_pmtu.

While reading sysctl_ip_fwd_use_pmtu, it can be changed concurrently.
Thus, we need to add READ_ONCE() to its readers.

Fixes: f87c10a8aa1e ("ipv4: introduce ip_dst_mtu_maybe_forward and protect forwarding path against pmtu spoofing")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h | 2 +-
 net/ipv4/route.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index 26fffda78cca..05fe313f72fa 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -446,7 +446,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
 	struct net *net = dev_net(dst->dev);
 	unsigned int mtu;
 
-	if (net->ipv4.sysctl_ip_fwd_use_pmtu ||
+	if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) ||
 	    ip_mtu_locked(dst) ||
 	    !forwarding) {
 		mtu = rt->rt_pmtu;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 356f535f3443..91c4f60de75a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1398,7 +1398,7 @@ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
 	struct fib_info *fi = res->fi;
 	u32 mtu = 0;
 
-	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
+	if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
 	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
 		mtu = fi->fib_mtu;
 
-- 
cgit 


From 289d3b21fb0bfc94c4e98f10635bba1824e5f83c Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Wed, 13 Jul 2022 13:51:55 -0700
Subject: ip: Fix data-races around sysctl_ip_nonlocal_bind.

While reading sysctl_ip_nonlocal_bind, it can be changed concurrently.
Thus, we need to add READ_ONCE() to its readers.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_sock.h | 2 +-
 net/sctp/protocol.c     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index daead5fb389a..68d337775564 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -374,7 +374,7 @@ static inline bool inet_get_convert_csum(struct sock *sk)
 static inline bool inet_can_nonlocal_bind(struct net *net,
 					  struct inet_sock *inet)
 {
-	return net->ipv4.sysctl_ip_nonlocal_bind ||
+	return READ_ONCE(net->ipv4.sysctl_ip_nonlocal_bind) ||
 		inet->freebind || inet->transparent;
 }
 
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 35928fefae33..1a094b087d88 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -358,7 +358,7 @@ static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp)
 	if (addr->v4.sin_addr.s_addr != htonl(INADDR_ANY) &&
 	   ret != RTN_LOCAL &&
 	   !sp->inet.freebind &&
-	   !net->ipv4.sysctl_ip_nonlocal_bind)
+	    !READ_ONCE(net->ipv4.sysctl_ip_nonlocal_bind))
 		return 0;
 
 	if (ipv6_only_sock(sctp_opt2sk(sp)))
-- 
cgit 


From 85d0b4dbd74b95cc492b1f4e34497d3f894f5d9a Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Wed, 13 Jul 2022 13:51:57 -0700
Subject: ip: Fix a data-race around sysctl_fwmark_reflect.

While reading sysctl_fwmark_reflect, it can be changed concurrently.
Thus, we need to add READ_ONCE() to its reader.

Fixes: e110861f8609 ("net: add a sysctl to reflect the fwmark on replies")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index 05fe313f72fa..4a15b6bcb4b8 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -384,7 +384,7 @@ void ipfrag_init(void);
 void ip_static_sysctl_init(void);
 
 #define IP4_REPLY_MARK(net, mark) \
-	((net)->ipv4.sysctl_fwmark_reflect ? (mark) : 0)
+	(READ_ONCE((net)->ipv4.sysctl_fwmark_reflect) ? (mark) : 0)
 
 static inline bool ip_is_fragment(const struct iphdr *iph)
 {
-- 
cgit 


From 1a0008f9df59451d0a17806c1ee1a19857032fa8 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Wed, 13 Jul 2022 13:51:58 -0700
Subject: tcp/dccp: Fix a data-race around sysctl_tcp_fwmark_accept.

While reading sysctl_tcp_fwmark_accept, it can be changed concurrently.
Thus, we need to add READ_ONCE() to its reader.

Fixes: 84f39b08d786 ("net: support marking accepting TCP sockets")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_sock.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 68d337775564..b29108f0973a 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -107,7 +107,8 @@ static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
 
 static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb)
 {
-	if (!sk->sk_mark && sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept)
+	if (!sk->sk_mark &&
+	    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept))
 		return skb->mark;
 
 	return sk->sk_mark;
-- 
cgit 


From 08a75f10679470552a3a443f9aefd1399604d31d Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Wed, 13 Jul 2022 13:51:59 -0700
Subject: tcp: Fix data-races around sysctl_tcp_l3mdev_accept.

While reading sysctl_tcp_l3mdev_accept, it can be changed concurrently.
Thus, we need to add READ_ONCE() to its readers.

Fixes: 6dd9a14e92e5 ("net: Allow accepted sockets to be bound to l3mdev domain")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_hashtables.h | 2 +-
 include/net/inet_sock.h       | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index ebfa3df6f8dc..fd6b510d114b 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -179,7 +179,7 @@ static inline bool inet_sk_bound_dev_eq(struct net *net, int bound_dev_if,
 					int dif, int sdif)
 {
 #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
-	return inet_bound_dev_eq(!!net->ipv4.sysctl_tcp_l3mdev_accept,
+	return inet_bound_dev_eq(!!READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept),
 				 bound_dev_if, dif, sdif);
 #else
 	return inet_bound_dev_eq(true, bound_dev_if, dif, sdif);
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index b29108f0973a..6395f6b9a5d2 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -121,7 +121,7 @@ static inline int inet_request_bound_dev_if(const struct sock *sk,
 #ifdef CONFIG_NET_L3_MASTER_DEV
 	struct net *net = sock_net(sk);
 
-	if (!bound_dev_if && net->ipv4.sysctl_tcp_l3mdev_accept)
+	if (!bound_dev_if && READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept))
 		return l3mdev_master_ifindex_by_index(net, skb->skb_iif);
 #endif
 
@@ -133,7 +133,7 @@ static inline int inet_sk_bound_l3mdev(const struct sock *sk)
 #ifdef CONFIG_NET_L3_MASTER_DEV
 	struct net *net = sock_net(sk);
 
-	if (!net->ipv4.sysctl_tcp_l3mdev_accept)
+	if (!READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept))
 		return l3mdev_master_ifindex_by_index(net,
 						      sk->sk_bound_dev_if);
 #endif
-- 
cgit 


From 93064e15c8a3a8394319a11b8037666e4b7d653d Mon Sep 17 00:00:00 2001
From: Stefan Binding <sbinding@opensource.cirrus.com>
Date: Thu, 7 Jul 2022 16:10:36 +0100
Subject: ACPI: utils: Add api to read _SUB from ACPI

Add a wrapper function to read the _SUB string from ACPI.

Signed-off-by: Stefan Binding <sbinding@opensource.cirrus.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/20220707151037.3901050-2-sbinding@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/acpi/utils.c | 38 ++++++++++++++++++++++++++++++++++++++
 include/linux/acpi.h |  6 ++++++
 2 files changed, 44 insertions(+)

(limited to 'include')

diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c
index 3a9773a09e19..5a7b8065e77f 100644
--- a/drivers/acpi/utils.c
+++ b/drivers/acpi/utils.c
@@ -291,6 +291,44 @@ int acpi_get_local_address(acpi_handle handle, u32 *addr)
 }
 EXPORT_SYMBOL(acpi_get_local_address);
 
+#define ACPI_MAX_SUB_BUF_SIZE	9
+
+const char *acpi_get_subsystem_id(acpi_handle handle)
+{
+	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+	union acpi_object *obj;
+	acpi_status status;
+	const char *sub;
+	size_t len;
+
+	status = acpi_evaluate_object(handle, METHOD_NAME__SUB, NULL, &buffer);
+	if (ACPI_FAILURE(status)) {
+		acpi_handle_debug(handle, "Reading ACPI _SUB failed: %#x\n", status);
+		return ERR_PTR(-ENODATA);
+	}
+
+	obj = buffer.pointer;
+	if (obj->type == ACPI_TYPE_STRING) {
+		len = strlen(obj->string.pointer);
+		if (len < ACPI_MAX_SUB_BUF_SIZE && len > 0) {
+			sub = kstrdup(obj->string.pointer, GFP_KERNEL);
+			if (!sub)
+				sub = ERR_PTR(-ENOMEM);
+		} else {
+			acpi_handle_err(handle, "ACPI _SUB Length %zu is Invalid\n", len);
+			sub = ERR_PTR(-ENODATA);
+		}
+	} else {
+		acpi_handle_warn(handle, "Warning ACPI _SUB did not return a string\n");
+		sub = ERR_PTR(-ENODATA);
+	}
+
+	acpi_os_free(buffer.pointer);
+
+	return sub;
+}
+EXPORT_SYMBOL_GPL(acpi_get_subsystem_id);
+
 acpi_status
 acpi_evaluate_reference(acpi_handle handle,
 			acpi_string pathname,
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 4f82a5bc6d98..968a187f14f0 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -762,6 +762,7 @@ static inline u64 acpi_arch_get_root_pointer(void)
 #endif
 
 int acpi_get_local_address(acpi_handle handle, u32 *addr);
+const char *acpi_get_subsystem_id(acpi_handle handle);
 
 #else	/* !CONFIG_ACPI */
 
@@ -1023,6 +1024,11 @@ static inline int acpi_get_local_address(acpi_handle handle, u32 *addr)
 	return -ENODEV;
 }
 
+static inline const char *acpi_get_subsystem_id(acpi_handle handle)
+{
+	return ERR_PTR(-ENODEV);
+}
+
 static inline int acpi_register_wakeup_handler(int wake_irq,
 	bool (*wakeup)(void *context), void *context)
 {
-- 
cgit 


From 28a6ed8e39f77f6ac613ec9b7461aa75e85fa79a Mon Sep 17 00:00:00 2001
From: Prashant Malani <pmalani@chromium.org>
Date: Mon, 11 Jul 2022 07:22:57 +0000
Subject: platform/chrome: Add Type-C mux set command definitions

Copy EC header definitions for the USB Type-C Mux control command from
the EC code base. Also pull in "TBT_UFP_REPLY" definitions, since that
is the prior entry in the enum.

These headers are already present in the EC code base. [1]

[1] https://chromium.googlesource.com/chromiumos/platform/ec/+/b80f85a94a423273c1638ef7b662c56931a138dd/include/ec_commands.h

Signed-off-by: Prashant Malani <pmalani@chromium.org>
Link: https://lore.kernel.org/r/20220711072333.2064341-4-pmalani@chromium.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/platform_data/cros_ec_commands.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include')

diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index 8cfa8cfca77e..a3945c5e7f50 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -5722,8 +5722,21 @@ enum typec_control_command {
 	TYPEC_CONTROL_COMMAND_EXIT_MODES,
 	TYPEC_CONTROL_COMMAND_CLEAR_EVENTS,
 	TYPEC_CONTROL_COMMAND_ENTER_MODE,
+	TYPEC_CONTROL_COMMAND_TBT_UFP_REPLY,
+	TYPEC_CONTROL_COMMAND_USB_MUX_SET,
 };
 
+/* Replies the AP may specify to the TBT EnterMode command as a UFP */
+enum typec_tbt_ufp_reply {
+	TYPEC_TBT_UFP_REPLY_NAK,
+	TYPEC_TBT_UFP_REPLY_ACK,
+};
+
+struct typec_usb_mux_set {
+	uint8_t mux_index;	/* Index of the mux to set in the chain */
+	uint8_t mux_flags;	/* USB_PD_MUX_*-encoded USB mux state to set */
+} __ec_align1;
+
 struct ec_params_typec_control {
 	uint8_t port;
 	uint8_t command;	/* enum typec_control_command */
@@ -5737,6 +5750,8 @@ struct ec_params_typec_control {
 	union {
 		uint32_t clear_events_mask;
 		uint8_t mode_to_enter;      /* enum typec_mode */
+		uint8_t tbt_ufp_reply;      /* enum typec_tbt_ufp_reply */
+		struct typec_usb_mux_set mux_params;
 		uint8_t placeholder[128];
 	};
 } __ec_align1;
@@ -5815,6 +5830,9 @@ enum tcpc_cc_polarity {
 #define PD_STATUS_EVENT_SOP_DISC_DONE		BIT(0)
 #define PD_STATUS_EVENT_SOP_PRIME_DISC_DONE	BIT(1)
 #define PD_STATUS_EVENT_HARD_RESET		BIT(2)
+#define PD_STATUS_EVENT_DISCONNECTED		BIT(3)
+#define PD_STATUS_EVENT_MUX_0_SET_DONE		BIT(4)
+#define PD_STATUS_EVENT_MUX_1_SET_DONE		BIT(5)
 
 struct ec_params_typec_status {
 	uint8_t port;
-- 
cgit 


From 4dea97f8636d0514befc9fc5cf342b351b7d0e20 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Fri, 1 Jul 2022 05:54:25 -0700
Subject: lib/bitmap: change type of bitmap_weight to unsigned long

bitmap_weight() doesn't return negative values, so change it's type
to unsigned long. It may help compiler to generate better code and
catch bugs.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 drivers/dma/ti/k3-udma.c                           | 6 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c            | 2 +-
 drivers/gpu/drm/i915/display/intel_display_power.c | 2 +-
 drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c           | 2 +-
 drivers/iommu/intel/iommu.c                        | 2 +-
 drivers/net/ethernet/mellanox/mlx4/fw.c            | 2 +-
 drivers/net/wireless/ath/ath9k/htc_drv_debug.c     | 2 +-
 drivers/net/wireless/ath/carl9170/debug.c          | 2 +-
 include/linux/bitmap.h                             | 5 +++--
 lib/bitmap.c                                       | 5 ++---
 tools/include/linux/bitmap.h                       | 4 ++--
 tools/lib/bitmap.c                                 | 4 ++--
 12 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c
index 2f0d2c68c93c..07cb48db76ba 100644
--- a/drivers/dma/ti/k3-udma.c
+++ b/drivers/dma/ti/k3-udma.c
@@ -4997,7 +4997,7 @@ static int setup_resources(struct udma_dev *ud)
 	switch (ud->match_data->type) {
 	case DMA_TYPE_UDMA:
 		dev_info(dev,
-			 "Channels: %d (tchan: %u, rchan: %u, gp-rflow: %u)\n",
+			 "Channels: %d (tchan: %lu, rchan: %lu, gp-rflow: %lu)\n",
 			 ch_count,
 			 ud->tchan_cnt - bitmap_weight(ud->tchan_map,
 						       ud->tchan_cnt),
@@ -5008,7 +5008,7 @@ static int setup_resources(struct udma_dev *ud)
 		break;
 	case DMA_TYPE_BCDMA:
 		dev_info(dev,
-			 "Channels: %d (bchan: %u, tchan: %u, rchan: %u)\n",
+			 "Channels: %d (bchan: %lu, tchan: %lu, rchan: %lu)\n",
 			 ch_count,
 			 ud->bchan_cnt - bitmap_weight(ud->bchan_map,
 						       ud->bchan_cnt),
@@ -5019,7 +5019,7 @@ static int setup_resources(struct udma_dev *ud)
 		break;
 	case DMA_TYPE_PKTDMA:
 		dev_info(dev,
-			 "Channels: %d (tchan: %u, rchan: %u)\n",
+			 "Channels: %d (tchan: %lu, rchan: %lu)\n",
 			 ch_count,
 			 ud->tchan_cnt - bitmap_weight(ud->tchan_map,
 						       ud->tchan_cnt),
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 16699158e00d..1098506ba1aa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -195,7 +195,7 @@ void amdgpu_gfx_compute_queue_acquire(struct amdgpu_device *adev)
 			set_bit(i, adev->gfx.mec.queue_bitmap);
 	}
 
-	dev_dbg(adev->dev, "mec queue bitmap weight=%d\n", bitmap_weight(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_COMPUTE_QUEUES));
+	dev_dbg(adev->dev, "mec queue bitmap weight=%ld\n", bitmap_weight(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_COMPUTE_QUEUES));
 }
 
 void amdgpu_gfx_graphics_queue_acquire(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/i915/display/intel_display_power.c b/drivers/gpu/drm/i915/display/intel_display_power.c
index 949edc983a16..4204e8a0e1a6 100644
--- a/drivers/gpu/drm/i915/display/intel_display_power.c
+++ b/drivers/gpu/drm/i915/display/intel_display_power.c
@@ -378,7 +378,7 @@ static void print_power_domains(struct i915_power_domains *power_domains,
 						     power_domains);
 	enum intel_display_power_domain domain;
 
-	drm_dbg(&i915->drm, "%s (%d):\n", prefix, bitmap_weight(mask->bits, POWER_DOMAIN_NUM));
+	drm_dbg(&i915->drm, "%s (%ld):\n", prefix, bitmap_weight(mask->bits, POWER_DOMAIN_NUM));
 	for_each_power_domain(domain, mask)
 		drm_dbg(&i915->drm, "%s use_count %d\n",
 			intel_display_power_domain_str(domain),
diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c
index 56a3063545ec..3d35b3982a60 100644
--- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c
+++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c
@@ -363,7 +363,7 @@ void mdp5_smp_dump(struct mdp5_smp *smp, struct drm_printer *p)
 	}
 
 	drm_printf(p, "TOTAL:\t%d\t(of %d)\n", total, smp->blk_cnt);
-	drm_printf(p, "AVAIL:\t%d\n", smp->blk_cnt -
+	drm_printf(p, "AVAIL:\t%ld\n", smp->blk_cnt -
 			bitmap_weight(state->state, smp->blk_cnt));
 
 	if (drm_can_sleep())
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 44016594831d..2f1b09f34706 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -3946,7 +3946,7 @@ static ssize_t domains_used_show(struct device *dev,
 				 struct device_attribute *attr, char *buf)
 {
 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
-	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
+	return sprintf(buf, "%ld\n", bitmap_weight(iommu->domain_ids,
 						  cap_ndoms(iommu->cap)));
 }
 static DEVICE_ATTR_RO(domains_used);
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index 42c96c9d7fb1..af054a3808ca 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -463,7 +463,7 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
 
 		field = min(
 			bitmap_weight(actv_ports.ports, dev->caps.num_ports),
-			dev->caps.num_ports);
+			(unsigned long)dev->caps.num_ports);
 		MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_NUM_PORTS_OFFSET);
 
 		size = dev->caps.function_caps; /* set PF behaviours */
diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_debug.c b/drivers/net/wireless/ath/ath9k/htc_drv_debug.c
index b3ed65e5c4da..d32056c4ee1d 100644
--- a/drivers/net/wireless/ath/ath9k/htc_drv_debug.c
+++ b/drivers/net/wireless/ath/ath9k/htc_drv_debug.c
@@ -296,7 +296,7 @@ static ssize_t read_file_slot(struct file *file, char __user *user_buf,
 	spin_lock_bh(&priv->tx.tx_lock);
 	len = scnprintf(buf, sizeof(buf),
 			"TX slot bitmap : %*pb\n"
-			"Used slots     : %d\n",
+			"Used slots     : %ld\n",
 			MAX_TX_BUF_NUM, priv->tx.tx_slot,
 			bitmap_weight(priv->tx.tx_slot, MAX_TX_BUF_NUM));
 	spin_unlock_bh(&priv->tx.tx_lock);
diff --git a/drivers/net/wireless/ath/carl9170/debug.c b/drivers/net/wireless/ath/carl9170/debug.c
index bb40889d7c72..7e88882b87d9 100644
--- a/drivers/net/wireless/ath/carl9170/debug.c
+++ b/drivers/net/wireless/ath/carl9170/debug.c
@@ -221,7 +221,7 @@ static char *carl9170_debugfs_mem_usage_read(struct ar9170 *ar, char *buf,
 	ADD(buf, *len, bufsize, "jar: [%*pb]\n",
 	    ar->fw.mem_blocks, ar->mem_bitmap);
 
-	ADD(buf, *len, bufsize, "cookies: used:%3d / total:%3d, allocs:%d\n",
+	ADD(buf, *len, bufsize, "cookies: used:%3ld / total:%3d, allocs:%d\n",
 	    bitmap_weight(ar->mem_bitmap, ar->fw.mem_blocks),
 	    ar->fw.mem_blocks, atomic_read(&ar->mem_allocs));
 
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index e1a438bdda52..035d4ac66641 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -163,7 +163,7 @@ bool __bitmap_intersects(const unsigned long *bitmap1,
 			 const unsigned long *bitmap2, unsigned int nbits);
 bool __bitmap_subset(const unsigned long *bitmap1,
 		     const unsigned long *bitmap2, unsigned int nbits);
-int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits);
+unsigned long __bitmap_weight(const unsigned long *bitmap, unsigned int nbits);
 void __bitmap_set(unsigned long *map, unsigned int start, int len);
 void __bitmap_clear(unsigned long *map, unsigned int start, int len);
 
@@ -431,7 +431,8 @@ static inline bool bitmap_full(const unsigned long *src, unsigned int nbits)
 	return find_first_zero_bit(src, nbits) == nbits;
 }
 
-static __always_inline int bitmap_weight(const unsigned long *src, unsigned int nbits)
+static __always_inline
+unsigned long bitmap_weight(const unsigned long *src, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 9bc80f5bf149..2b67cd657692 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -333,10 +333,9 @@ bool __bitmap_subset(const unsigned long *bitmap1,
 }
 EXPORT_SYMBOL(__bitmap_subset);
 
-int __bitmap_weight(const unsigned long *bitmap, unsigned int bits)
+unsigned long __bitmap_weight(const unsigned long *bitmap, unsigned int bits)
 {
-	unsigned int k, lim = bits/BITS_PER_LONG;
-	int w = 0;
+	unsigned long k, w = 0, lim = bits/BITS_PER_LONG;
 
 	for (k = 0; k < lim; k++)
 		w += hweight_long(bitmap[k]);
diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h
index 2ae7ab8ed7d1..ae1852e39142 100644
--- a/tools/include/linux/bitmap.h
+++ b/tools/include/linux/bitmap.h
@@ -11,7 +11,7 @@
 #define DECLARE_BITMAP(name,bits) \
 	unsigned long name[BITS_TO_LONGS(bits)]
 
-int __bitmap_weight(const unsigned long *bitmap, int bits);
+unsigned long __bitmap_weight(const unsigned long *bitmap, unsigned int bits);
 void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
 		 const unsigned long *bitmap2, int bits);
 bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
@@ -61,7 +61,7 @@ static inline bool bitmap_full(const unsigned long *src, unsigned int nbits)
 	return find_first_zero_bit(src, nbits) == nbits;
 }
 
-static inline int bitmap_weight(const unsigned long *src, unsigned int nbits)
+static inline unsigned long bitmap_weight(const unsigned long *src, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
diff --git a/tools/lib/bitmap.c b/tools/lib/bitmap.c
index 2e351d63fdba..e1fafc131a49 100644
--- a/tools/lib/bitmap.c
+++ b/tools/lib/bitmap.c
@@ -5,9 +5,9 @@
  */
 #include <linux/bitmap.h>
 
-int __bitmap_weight(const unsigned long *bitmap, int bits)
+unsigned long __bitmap_weight(const unsigned long *bitmap, unsigned int bits)
 {
-	int k, w = 0, lim = bits/BITS_PER_LONG;
+	unsigned long k, w = 0, lim = bits/BITS_PER_LONG;
 
 	for (k = 0; k < lim; k++)
 		w += hweight_long(bitmap[k]);
-- 
cgit 


From cb32c285cc10e428589194e30233d673e7c23c72 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Fri, 1 Jul 2022 05:54:26 -0700
Subject: cpumask: change return types to bool where appropriate

Some cpumask functions have integer return types where return values
are naturally booleans.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/cpumask.h | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index fe29ac7cc469..b54e27d9da6b 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -372,9 +372,9 @@ static __always_inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp)
  * @cpu: cpu number (< nr_cpu_ids)
  * @cpumask: the cpumask pointer
  *
- * Returns 1 if @cpu is set in @cpumask, else returns 0
+ * Returns true if @cpu is set in @cpumask, else returns false
  */
-static __always_inline int cpumask_test_cpu(int cpu, const struct cpumask *cpumask)
+static __always_inline bool cpumask_test_cpu(int cpu, const struct cpumask *cpumask)
 {
 	return test_bit(cpumask_check(cpu), cpumask_bits((cpumask)));
 }
@@ -384,11 +384,11 @@ static __always_inline int cpumask_test_cpu(int cpu, const struct cpumask *cpuma
  * @cpu: cpu number (< nr_cpu_ids)
  * @cpumask: the cpumask pointer
  *
- * Returns 1 if @cpu is set in old bitmap of @cpumask, else returns 0
+ * Returns true if @cpu is set in old bitmap of @cpumask, else returns false
  *
  * test_and_set_bit wrapper for cpumasks.
  */
-static __always_inline int cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask)
+static __always_inline bool cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask)
 {
 	return test_and_set_bit(cpumask_check(cpu), cpumask_bits(cpumask));
 }
@@ -398,11 +398,11 @@ static __always_inline int cpumask_test_and_set_cpu(int cpu, struct cpumask *cpu
  * @cpu: cpu number (< nr_cpu_ids)
  * @cpumask: the cpumask pointer
  *
- * Returns 1 if @cpu is set in old bitmap of @cpumask, else returns 0
+ * Returns true if @cpu is set in old bitmap of @cpumask, else returns false
  *
  * test_and_clear_bit wrapper for cpumasks.
  */
-static __always_inline int cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask)
+static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask)
 {
 	return test_and_clear_bit(cpumask_check(cpu), cpumask_bits(cpumask));
 }
@@ -431,9 +431,9 @@ static inline void cpumask_clear(struct cpumask *dstp)
  * @src1p: the first input
  * @src2p: the second input
  *
- * If *@dstp is empty, returns 0, else returns 1
+ * If *@dstp is empty, returns false, else returns true
  */
-static inline int cpumask_and(struct cpumask *dstp,
+static inline bool cpumask_and(struct cpumask *dstp,
 			       const struct cpumask *src1p,
 			       const struct cpumask *src2p)
 {
@@ -474,9 +474,9 @@ static inline void cpumask_xor(struct cpumask *dstp,
  * @src1p: the first input
  * @src2p: the second input
  *
- * If *@dstp is empty, returns 0, else returns 1
+ * If *@dstp is empty, returns false, else returns true
  */
-static inline int cpumask_andnot(struct cpumask *dstp,
+static inline bool cpumask_andnot(struct cpumask *dstp,
 				  const struct cpumask *src1p,
 				  const struct cpumask *src2p)
 {
@@ -539,9 +539,9 @@ static inline bool cpumask_intersects(const struct cpumask *src1p,
  * @src1p: the first input
  * @src2p: the second input
  *
- * Returns 1 if *@src1p is a subset of *@src2p, else returns 0
+ * Returns true if *@src1p is a subset of *@src2p, else returns false
  */
-static inline int cpumask_subset(const struct cpumask *src1p,
+static inline bool cpumask_subset(const struct cpumask *src1p,
 				 const struct cpumask *src2p)
 {
 	return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p),
-- 
cgit 


From 8b6b795d9bfc031a8953c40fac8d3cf67e1a4d3d Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Fri, 1 Jul 2022 05:54:27 -0700
Subject: lib/cpumask: change return types to unsigned where appropriate

Switch return types to unsigned int where return values cannot be negative.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/cpumask.h | 14 +++++++-------
 lib/cpumask.c           | 18 +++++++++---------
 2 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index b54e27d9da6b..760022bcb925 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -176,12 +176,12 @@ static inline unsigned int cpumask_local_spread(unsigned int i, int node)
 	return 0;
 }
 
-static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
+static inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
 					     const struct cpumask *src2p) {
 	return cpumask_first_and(src1p, src2p);
 }
 
-static inline int cpumask_any_distribute(const struct cpumask *srcp)
+static inline unsigned int cpumask_any_distribute(const struct cpumask *srcp)
 {
 	return cpumask_first(srcp);
 }
@@ -258,12 +258,12 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
 	return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1);
 }
 
-int __pure cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
-int __pure cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
+unsigned int __pure cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
+unsigned int __pure cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
 unsigned int cpumask_local_spread(unsigned int i, int node);
-int cpumask_any_and_distribute(const struct cpumask *src1p,
+unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
 			       const struct cpumask *src2p);
-int cpumask_any_distribute(const struct cpumask *srcp);
+unsigned int cpumask_any_distribute(const struct cpumask *srcp);
 
 /**
  * for_each_cpu - iterate over every cpu in a mask
@@ -289,7 +289,7 @@ int cpumask_any_distribute(const struct cpumask *srcp);
 		(cpu) = cpumask_next_zero((cpu), (mask)),	\
 		(cpu) < nr_cpu_ids;)
 
-extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
+unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
 
 /**
  * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
diff --git a/lib/cpumask.c b/lib/cpumask.c
index a971a82d2f43..da68f6bbde44 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -31,7 +31,7 @@ EXPORT_SYMBOL(cpumask_next);
  *
  * Returns >= nr_cpu_ids if no further cpus set in both.
  */
-int cpumask_next_and(int n, const struct cpumask *src1p,
+unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
 		     const struct cpumask *src2p)
 {
 	/* -1 is a legal arg here. */
@@ -50,7 +50,7 @@ EXPORT_SYMBOL(cpumask_next_and);
  * Often used to find any cpu but smp_processor_id() in a mask.
  * Returns >= nr_cpu_ids if no cpus set.
  */
-int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
+unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
 {
 	unsigned int i;
 
@@ -74,9 +74,9 @@ EXPORT_SYMBOL(cpumask_any_but);
  * Note: the @wrap argument is required for the start condition when
  * we cannot assume @start is set in @mask.
  */
-int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
+unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
 {
-	int next;
+	unsigned int next;
 
 again:
 	next = cpumask_next(n, mask);
@@ -205,7 +205,7 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
  */
 unsigned int cpumask_local_spread(unsigned int i, int node)
 {
-	int cpu;
+	unsigned int cpu;
 
 	/* Wrap: we always want a cpu. */
 	i %= num_online_cpus();
@@ -243,10 +243,10 @@ static DEFINE_PER_CPU(int, distribute_cpu_mask_prev);
  *
  * Returns >= nr_cpu_ids if the intersection is empty.
  */
-int cpumask_any_and_distribute(const struct cpumask *src1p,
+unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
 			       const struct cpumask *src2p)
 {
-	int next, prev;
+	unsigned int next, prev;
 
 	/* NOTE: our first selection will skip 0. */
 	prev = __this_cpu_read(distribute_cpu_mask_prev);
@@ -262,9 +262,9 @@ int cpumask_any_and_distribute(const struct cpumask *src1p,
 }
 EXPORT_SYMBOL(cpumask_any_and_distribute);
 
-int cpumask_any_distribute(const struct cpumask *srcp)
+unsigned int cpumask_any_distribute(const struct cpumask *srcp)
 {
-	int next, prev;
+	unsigned int next, prev;
 
 	/* NOTE: our first selection will skip 0. */
 	prev = __this_cpu_read(distribute_cpu_mask_prev);
-- 
cgit 


From 9b2e70860ef2f0d74b6d9e57929d57b14481b9c9 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Fri, 1 Jul 2022 05:54:28 -0700
Subject: lib/cpumask: move trivial wrappers around find_bit to the header

To avoid circular dependencies, cpumask keeps simple (almost) one-line
wrappers around find_bit() in a c-file.

Commit 47d8c15615c0a2 ("include: move find.h from asm_generic to linux")
moved find.h header out of asm_generic include path, and it helped to fix
many circular dependencies, including some in cpumask.h.

This patch moves those one-liners to header files.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/cpumask.h | 57 ++++++++++++++++++++++++++++++++++++++++++++++---
 lib/cpumask.c           | 55 -----------------------------------------------
 2 files changed, 54 insertions(+), 58 deletions(-)

(limited to 'include')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 760022bcb925..ea3de2c2c180 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -241,7 +241,21 @@ static inline unsigned int cpumask_last(const struct cpumask *srcp)
 	return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits);
 }
 
-unsigned int __pure cpumask_next(int n, const struct cpumask *srcp);
+/**
+ * cpumask_next - get the next cpu in a cpumask
+ * @n: the cpu prior to the place to search (ie. return will be > @n)
+ * @srcp: the cpumask pointer
+ *
+ * Returns >= nr_cpu_ids if no further cpus set.
+ */
+static inline
+unsigned int cpumask_next(int n, const struct cpumask *srcp)
+{
+	/* -1 is a legal arg here. */
+	if (n != -1)
+		cpumask_check(n);
+	return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n + 1);
+}
 
 /**
  * cpumask_next_zero - get the next unset cpu in a cpumask
@@ -258,8 +272,25 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
 	return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1);
 }
 
-unsigned int __pure cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
-unsigned int __pure cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
+/**
+ * cpumask_next_and - get the next cpu in *src1p & *src2p
+ * @n: the cpu prior to the place to search (ie. return will be > @n)
+ * @src1p: the first cpumask pointer
+ * @src2p: the second cpumask pointer
+ *
+ * Returns >= nr_cpu_ids if no further cpus set in both.
+ */
+static inline
+unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
+		     const struct cpumask *src2p)
+{
+	/* -1 is a legal arg here. */
+	if (n != -1)
+		cpumask_check(n);
+	return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p),
+		nr_cpumask_bits, n + 1);
+}
+
 unsigned int cpumask_local_spread(unsigned int i, int node);
 unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
 			       const struct cpumask *src2p);
@@ -324,6 +355,26 @@ unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, boo
 	for ((cpu) = -1;						\
 		(cpu) = cpumask_next_and((cpu), (mask1), (mask2)),	\
 		(cpu) < nr_cpu_ids;)
+
+/**
+ * cpumask_any_but - return a "random" in a cpumask, but not this one.
+ * @mask: the cpumask to search
+ * @cpu: the cpu to ignore.
+ *
+ * Often used to find any cpu but smp_processor_id() in a mask.
+ * Returns >= nr_cpu_ids if no cpus set.
+ */
+static inline
+unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
+{
+	unsigned int i;
+
+	cpumask_check(cpu);
+	for_each_cpu(i, mask)
+		if (i != cpu)
+			break;
+	return i;
+}
 #endif /* SMP */
 
 #define CPU_BITS_NONE						\
diff --git a/lib/cpumask.c b/lib/cpumask.c
index da68f6bbde44..cb7262ff8633 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -7,61 +7,6 @@
 #include <linux/memblock.h>
 #include <linux/numa.h>
 
-/**
- * cpumask_next - get the next cpu in a cpumask
- * @n: the cpu prior to the place to search (ie. return will be > @n)
- * @srcp: the cpumask pointer
- *
- * Returns >= nr_cpu_ids if no further cpus set.
- */
-unsigned int cpumask_next(int n, const struct cpumask *srcp)
-{
-	/* -1 is a legal arg here. */
-	if (n != -1)
-		cpumask_check(n);
-	return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n + 1);
-}
-EXPORT_SYMBOL(cpumask_next);
-
-/**
- * cpumask_next_and - get the next cpu in *src1p & *src2p
- * @n: the cpu prior to the place to search (ie. return will be > @n)
- * @src1p: the first cpumask pointer
- * @src2p: the second cpumask pointer
- *
- * Returns >= nr_cpu_ids if no further cpus set in both.
- */
-unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
-		     const struct cpumask *src2p)
-{
-	/* -1 is a legal arg here. */
-	if (n != -1)
-		cpumask_check(n);
-	return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p),
-		nr_cpumask_bits, n + 1);
-}
-EXPORT_SYMBOL(cpumask_next_and);
-
-/**
- * cpumask_any_but - return a "random" in a cpumask, but not this one.
- * @mask: the cpumask to search
- * @cpu: the cpu to ignore.
- *
- * Often used to find any cpu but smp_processor_id() in a mask.
- * Returns >= nr_cpu_ids if no cpus set.
- */
-unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
-{
-	unsigned int i;
-
-	cpumask_check(cpu);
-	for_each_cpu(i, mask)
-		if (i != cpu)
-			break;
-	return i;
-}
-EXPORT_SYMBOL(cpumask_any_but);
-
 /**
  * cpumask_next_wrap - helper to implement for_each_cpu_wrap
  * @n: the cpu prior to the place to search
-- 
cgit 


From db96b0c5f9db22d908ab5f7cd75904adba4b28ca Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 10 Jun 2021 10:56:49 +0200
Subject: headers/deps: mm: Optimize <linux/gfp.h> header dependencies

There's a couple of superfluous inclusions here - remove them before
doing bigger changes.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/gfp.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 2d2ccae933c2..52f2c873a7d4 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -2,10 +2,7 @@
 #ifndef __LINUX_GFP_H
 #define __LINUX_GFP_H
 
-#include <linux/mmdebug.h>
 #include <linux/mmzone.h>
-#include <linux/stddef.h>
-#include <linux/linkage.h>
 #include <linux/topology.h>
 
 /* The typedef is in types.h but we want the documentation here */
-- 
cgit 


From cb5a065b4ea9c062a18143c8a14e831179687f54 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 14 Apr 2022 18:42:28 +0200
Subject: headers/deps: mm: Split <linux/gfp_types.h> out of <linux/gfp.h>

This is a much smaller header.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/gfp.h       | 345 +--------------------------------------------
 include/linux/gfp_types.h | 348 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 350 insertions(+), 343 deletions(-)
 create mode 100644 include/linux/gfp_types.h

(limited to 'include')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 52f2c873a7d4..f314be58fa77 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -2,354 +2,13 @@
 #ifndef __LINUX_GFP_H
 #define __LINUX_GFP_H
 
+#include <linux/gfp_types.h>
+
 #include <linux/mmzone.h>
 #include <linux/topology.h>
 
-/* The typedef is in types.h but we want the documentation here */
-#if 0
-/**
- * typedef gfp_t - Memory allocation flags.
- *
- * GFP flags are commonly used throughout Linux to indicate how memory
- * should be allocated.  The GFP acronym stands for get_free_pages(),
- * the underlying memory allocation function.  Not every GFP flag is
- * supported by every function which may allocate memory.  Most users
- * will want to use a plain ``GFP_KERNEL``.
- */
-typedef unsigned int __bitwise gfp_t;
-#endif
-
 struct vm_area_struct;
 
-/*
- * In case of changes, please don't forget to update
- * include/trace/events/mmflags.h and tools/perf/builtin-kmem.c
- */
-
-/* Plain integer GFP bitmasks. Do not use this directly. */
-#define ___GFP_DMA		0x01u
-#define ___GFP_HIGHMEM		0x02u
-#define ___GFP_DMA32		0x04u
-#define ___GFP_MOVABLE		0x08u
-#define ___GFP_RECLAIMABLE	0x10u
-#define ___GFP_HIGH		0x20u
-#define ___GFP_IO		0x40u
-#define ___GFP_FS		0x80u
-#define ___GFP_ZERO		0x100u
-#define ___GFP_ATOMIC		0x200u
-#define ___GFP_DIRECT_RECLAIM	0x400u
-#define ___GFP_KSWAPD_RECLAIM	0x800u
-#define ___GFP_WRITE		0x1000u
-#define ___GFP_NOWARN		0x2000u
-#define ___GFP_RETRY_MAYFAIL	0x4000u
-#define ___GFP_NOFAIL		0x8000u
-#define ___GFP_NORETRY		0x10000u
-#define ___GFP_MEMALLOC		0x20000u
-#define ___GFP_COMP		0x40000u
-#define ___GFP_NOMEMALLOC	0x80000u
-#define ___GFP_HARDWALL		0x100000u
-#define ___GFP_THISNODE		0x200000u
-#define ___GFP_ACCOUNT		0x400000u
-#define ___GFP_ZEROTAGS		0x800000u
-#ifdef CONFIG_KASAN_HW_TAGS
-#define ___GFP_SKIP_ZERO		0x1000000u
-#define ___GFP_SKIP_KASAN_UNPOISON	0x2000000u
-#define ___GFP_SKIP_KASAN_POISON	0x4000000u
-#else
-#define ___GFP_SKIP_ZERO		0
-#define ___GFP_SKIP_KASAN_UNPOISON	0
-#define ___GFP_SKIP_KASAN_POISON	0
-#endif
-#ifdef CONFIG_LOCKDEP
-#define ___GFP_NOLOCKDEP	0x8000000u
-#else
-#define ___GFP_NOLOCKDEP	0
-#endif
-/* If the above are modified, __GFP_BITS_SHIFT may need updating */
-
-/*
- * Physical address zone modifiers (see linux/mmzone.h - low four bits)
- *
- * Do not put any conditional on these. If necessary modify the definitions
- * without the underscores and use them consistently. The definitions here may
- * be used in bit comparisons.
- */
-#define __GFP_DMA	((__force gfp_t)___GFP_DMA)
-#define __GFP_HIGHMEM	((__force gfp_t)___GFP_HIGHMEM)
-#define __GFP_DMA32	((__force gfp_t)___GFP_DMA32)
-#define __GFP_MOVABLE	((__force gfp_t)___GFP_MOVABLE)  /* ZONE_MOVABLE allowed */
-#define GFP_ZONEMASK	(__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
-
-/**
- * DOC: Page mobility and placement hints
- *
- * Page mobility and placement hints
- * ---------------------------------
- *
- * These flags provide hints about how mobile the page is. Pages with similar
- * mobility are placed within the same pageblocks to minimise problems due
- * to external fragmentation.
- *
- * %__GFP_MOVABLE (also a zone modifier) indicates that the page can be
- * moved by page migration during memory compaction or can be reclaimed.
- *
- * %__GFP_RECLAIMABLE is used for slab allocations that specify
- * SLAB_RECLAIM_ACCOUNT and whose pages can be freed via shrinkers.
- *
- * %__GFP_WRITE indicates the caller intends to dirty the page. Where possible,
- * these pages will be spread between local zones to avoid all the dirty
- * pages being in one zone (fair zone allocation policy).
- *
- * %__GFP_HARDWALL enforces the cpuset memory allocation policy.
- *
- * %__GFP_THISNODE forces the allocation to be satisfied from the requested
- * node with no fallbacks or placement policy enforcements.
- *
- * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg.
- */
-#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
-#define __GFP_WRITE	((__force gfp_t)___GFP_WRITE)
-#define __GFP_HARDWALL   ((__force gfp_t)___GFP_HARDWALL)
-#define __GFP_THISNODE	((__force gfp_t)___GFP_THISNODE)
-#define __GFP_ACCOUNT	((__force gfp_t)___GFP_ACCOUNT)
-
-/**
- * DOC: Watermark modifiers
- *
- * Watermark modifiers -- controls access to emergency reserves
- * ------------------------------------------------------------
- *
- * %__GFP_HIGH indicates that the caller is high-priority and that granting
- * the request is necessary before the system can make forward progress.
- * For example, creating an IO context to clean pages.
- *
- * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is
- * high priority. Users are typically interrupt handlers. This may be
- * used in conjunction with %__GFP_HIGH
- *
- * %__GFP_MEMALLOC allows access to all memory. This should only be used when
- * the caller guarantees the allocation will allow more memory to be freed
- * very shortly e.g. process exiting or swapping. Users either should
- * be the MM or co-ordinating closely with the VM (e.g. swap over NFS).
- * Users of this flag have to be extremely careful to not deplete the reserve
- * completely and implement a throttling mechanism which controls the
- * consumption of the reserve based on the amount of freed memory.
- * Usage of a pre-allocated pool (e.g. mempool) should be always considered
- * before using this flag.
- *
- * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves.
- * This takes precedence over the %__GFP_MEMALLOC flag if both are set.
- */
-#define __GFP_ATOMIC	((__force gfp_t)___GFP_ATOMIC)
-#define __GFP_HIGH	((__force gfp_t)___GFP_HIGH)
-#define __GFP_MEMALLOC	((__force gfp_t)___GFP_MEMALLOC)
-#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC)
-
-/**
- * DOC: Reclaim modifiers
- *
- * Reclaim modifiers
- * -----------------
- * Please note that all the following flags are only applicable to sleepable
- * allocations (e.g. %GFP_NOWAIT and %GFP_ATOMIC will ignore them).
- *
- * %__GFP_IO can start physical IO.
- *
- * %__GFP_FS can call down to the low-level FS. Clearing the flag avoids the
- * allocator recursing into the filesystem which might already be holding
- * locks.
- *
- * %__GFP_DIRECT_RECLAIM indicates that the caller may enter direct reclaim.
- * This flag can be cleared to avoid unnecessary delays when a fallback
- * option is available.
- *
- * %__GFP_KSWAPD_RECLAIM indicates that the caller wants to wake kswapd when
- * the low watermark is reached and have it reclaim pages until the high
- * watermark is reached. A caller may wish to clear this flag when fallback
- * options are available and the reclaim is likely to disrupt the system. The
- * canonical example is THP allocation where a fallback is cheap but
- * reclaim/compaction may cause indirect stalls.
- *
- * %__GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim.
- *
- * The default allocator behavior depends on the request size. We have a concept
- * of so called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER).
- * !costly allocations are too essential to fail so they are implicitly
- * non-failing by default (with some exceptions like OOM victims might fail so
- * the caller still has to check for failures) while costly requests try to be
- * not disruptive and back off even without invoking the OOM killer.
- * The following three modifiers might be used to override some of these
- * implicit rules
- *
- * %__GFP_NORETRY: The VM implementation will try only very lightweight
- * memory direct reclaim to get some memory under memory pressure (thus
- * it can sleep). It will avoid disruptive actions like OOM killer. The
- * caller must handle the failure which is quite likely to happen under
- * heavy memory pressure. The flag is suitable when failure can easily be
- * handled at small cost, such as reduced throughput
- *
- * %__GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim
- * procedures that have previously failed if there is some indication
- * that progress has been made else where.  It can wait for other
- * tasks to attempt high level approaches to freeing memory such as
- * compaction (which removes fragmentation) and page-out.
- * There is still a definite limit to the number of retries, but it is
- * a larger limit than with %__GFP_NORETRY.
- * Allocations with this flag may fail, but only when there is
- * genuinely little unused memory. While these allocations do not
- * directly trigger the OOM killer, their failure indicates that
- * the system is likely to need to use the OOM killer soon.  The
- * caller must handle failure, but can reasonably do so by failing
- * a higher-level request, or completing it only in a much less
- * efficient manner.
- * If the allocation does fail, and the caller is in a position to
- * free some non-essential memory, doing so could benefit the system
- * as a whole.
- *
- * %__GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller
- * cannot handle allocation failures. The allocation could block
- * indefinitely but will never return with failure. Testing for
- * failure is pointless.
- * New users should be evaluated carefully (and the flag should be
- * used only when there is no reasonable failure policy) but it is
- * definitely preferable to use the flag rather than opencode endless
- * loop around allocator.
- * Using this flag for costly allocations is _highly_ discouraged.
- */
-#define __GFP_IO	((__force gfp_t)___GFP_IO)
-#define __GFP_FS	((__force gfp_t)___GFP_FS)
-#define __GFP_DIRECT_RECLAIM	((__force gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can reclaim */
-#define __GFP_KSWAPD_RECLAIM	((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */
-#define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM))
-#define __GFP_RETRY_MAYFAIL	((__force gfp_t)___GFP_RETRY_MAYFAIL)
-#define __GFP_NOFAIL	((__force gfp_t)___GFP_NOFAIL)
-#define __GFP_NORETRY	((__force gfp_t)___GFP_NORETRY)
-
-/**
- * DOC: Action modifiers
- *
- * Action modifiers
- * ----------------
- *
- * %__GFP_NOWARN suppresses allocation failure reports.
- *
- * %__GFP_COMP address compound page metadata.
- *
- * %__GFP_ZERO returns a zeroed page on success.
- *
- * %__GFP_ZEROTAGS zeroes memory tags at allocation time if the memory itself
- * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that
- * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting
- * memory tags at the same time as zeroing memory has minimal additional
- * performace impact.
- *
- * %__GFP_SKIP_KASAN_UNPOISON makes KASAN skip unpoisoning on page allocation.
- * Only effective in HW_TAGS mode.
- *
- * %__GFP_SKIP_KASAN_POISON makes KASAN skip poisoning on page deallocation.
- * Typically, used for userspace pages. Only effective in HW_TAGS mode.
- */
-#define __GFP_NOWARN	((__force gfp_t)___GFP_NOWARN)
-#define __GFP_COMP	((__force gfp_t)___GFP_COMP)
-#define __GFP_ZERO	((__force gfp_t)___GFP_ZERO)
-#define __GFP_ZEROTAGS	((__force gfp_t)___GFP_ZEROTAGS)
-#define __GFP_SKIP_ZERO ((__force gfp_t)___GFP_SKIP_ZERO)
-#define __GFP_SKIP_KASAN_UNPOISON ((__force gfp_t)___GFP_SKIP_KASAN_UNPOISON)
-#define __GFP_SKIP_KASAN_POISON   ((__force gfp_t)___GFP_SKIP_KASAN_POISON)
-
-/* Disable lockdep for GFP context tracking */
-#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
-
-/* Room for N __GFP_FOO bits */
-#define __GFP_BITS_SHIFT (27 + IS_ENABLED(CONFIG_LOCKDEP))
-#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
-
-/**
- * DOC: Useful GFP flag combinations
- *
- * Useful GFP flag combinations
- * ----------------------------
- *
- * Useful GFP flag combinations that are commonly used. It is recommended
- * that subsystems start with one of these combinations and then set/clear
- * %__GFP_FOO flags as necessary.
- *
- * %GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower
- * watermark is applied to allow access to "atomic reserves".
- * The current implementation doesn't support NMI and few other strict
- * non-preemptive contexts (e.g. raw_spin_lock). The same applies to %GFP_NOWAIT.
- *
- * %GFP_KERNEL is typical for kernel-internal allocations. The caller requires
- * %ZONE_NORMAL or a lower zone for direct access but can direct reclaim.
- *
- * %GFP_KERNEL_ACCOUNT is the same as GFP_KERNEL, except the allocation is
- * accounted to kmemcg.
- *
- * %GFP_NOWAIT is for kernel allocations that should not stall for direct
- * reclaim, start physical IO or use any filesystem callback.
- *
- * %GFP_NOIO will use direct reclaim to discard clean pages or slab pages
- * that do not require the starting of any physical IO.
- * Please try to avoid using this flag directly and instead use
- * memalloc_noio_{save,restore} to mark the whole scope which cannot
- * perform any IO with a short explanation why. All allocation requests
- * will inherit GFP_NOIO implicitly.
- *
- * %GFP_NOFS will use direct reclaim but will not use any filesystem interfaces.
- * Please try to avoid using this flag directly and instead use
- * memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't
- * recurse into the FS layer with a short explanation why. All allocation
- * requests will inherit GFP_NOFS implicitly.
- *
- * %GFP_USER is for userspace allocations that also need to be directly
- * accessibly by the kernel or hardware. It is typically used by hardware
- * for buffers that are mapped to userspace (e.g. graphics) that hardware
- * still must DMA to. cpuset limits are enforced for these allocations.
- *
- * %GFP_DMA exists for historical reasons and should be avoided where possible.
- * The flags indicates that the caller requires that the lowest zone be
- * used (%ZONE_DMA or 16M on x86-64). Ideally, this would be removed but
- * it would require careful auditing as some users really require it and
- * others use the flag to avoid lowmem reserves in %ZONE_DMA and treat the
- * lowest zone as a type of emergency reserve.
- *
- * %GFP_DMA32 is similar to %GFP_DMA except that the caller requires a 32-bit
- * address. Note that kmalloc(..., GFP_DMA32) does not return DMA32 memory
- * because the DMA32 kmalloc cache array is not implemented.
- * (Reason: there is no such user in kernel).
- *
- * %GFP_HIGHUSER is for userspace allocations that may be mapped to userspace,
- * do not need to be directly accessible by the kernel but that cannot
- * move once in use. An example may be a hardware allocation that maps
- * data directly into userspace but has no addressing limitations.
- *
- * %GFP_HIGHUSER_MOVABLE is for userspace allocations that the kernel does not
- * need direct access to but can use kmap() when access is required. They
- * are expected to be movable via page reclaim or page migration. Typically,
- * pages on the LRU would also be allocated with %GFP_HIGHUSER_MOVABLE.
- *
- * %GFP_TRANSHUGE and %GFP_TRANSHUGE_LIGHT are used for THP allocations. They
- * are compound allocations that will generally fail quickly if memory is not
- * available and will not wake kswapd/kcompactd on failure. The _LIGHT
- * version does not attempt reclaim/compaction at all and is by default used
- * in page fault path, while the non-light is used by khugepaged.
- */
-#define GFP_ATOMIC	(__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
-#define GFP_KERNEL	(__GFP_RECLAIM | __GFP_IO | __GFP_FS)
-#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)
-#define GFP_NOWAIT	(__GFP_KSWAPD_RECLAIM)
-#define GFP_NOIO	(__GFP_RECLAIM)
-#define GFP_NOFS	(__GFP_RECLAIM | __GFP_IO)
-#define GFP_USER	(__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
-#define GFP_DMA		__GFP_DMA
-#define GFP_DMA32	__GFP_DMA32
-#define GFP_HIGHUSER	(GFP_USER | __GFP_HIGHMEM)
-#define GFP_HIGHUSER_MOVABLE	(GFP_HIGHUSER | __GFP_MOVABLE | \
-			 __GFP_SKIP_KASAN_POISON)
-#define GFP_TRANSHUGE_LIGHT	((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
-			 __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
-#define GFP_TRANSHUGE	(GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
-
 /* Convert GFP flags to their corresponding migrate type */
 #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
 #define GFP_MOVABLE_SHIFT 3
diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
new file mode 100644
index 000000000000..06fc85cee23f
--- /dev/null
+++ b/include/linux/gfp_types.h
@@ -0,0 +1,348 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_GFP_TYPES_H
+#define __LINUX_GFP_TYPES_H
+
+/* The typedef is in types.h but we want the documentation here */
+#if 0
+/**
+ * typedef gfp_t - Memory allocation flags.
+ *
+ * GFP flags are commonly used throughout Linux to indicate how memory
+ * should be allocated.  The GFP acronym stands for get_free_pages(),
+ * the underlying memory allocation function.  Not every GFP flag is
+ * supported by every function which may allocate memory.  Most users
+ * will want to use a plain ``GFP_KERNEL``.
+ */
+typedef unsigned int __bitwise gfp_t;
+#endif
+
+/*
+ * In case of changes, please don't forget to update
+ * include/trace/events/mmflags.h and tools/perf/builtin-kmem.c
+ */
+
+/* Plain integer GFP bitmasks. Do not use this directly. */
+#define ___GFP_DMA		0x01u
+#define ___GFP_HIGHMEM		0x02u
+#define ___GFP_DMA32		0x04u
+#define ___GFP_MOVABLE		0x08u
+#define ___GFP_RECLAIMABLE	0x10u
+#define ___GFP_HIGH		0x20u
+#define ___GFP_IO		0x40u
+#define ___GFP_FS		0x80u
+#define ___GFP_ZERO		0x100u
+#define ___GFP_ATOMIC		0x200u
+#define ___GFP_DIRECT_RECLAIM	0x400u
+#define ___GFP_KSWAPD_RECLAIM	0x800u
+#define ___GFP_WRITE		0x1000u
+#define ___GFP_NOWARN		0x2000u
+#define ___GFP_RETRY_MAYFAIL	0x4000u
+#define ___GFP_NOFAIL		0x8000u
+#define ___GFP_NORETRY		0x10000u
+#define ___GFP_MEMALLOC		0x20000u
+#define ___GFP_COMP		0x40000u
+#define ___GFP_NOMEMALLOC	0x80000u
+#define ___GFP_HARDWALL		0x100000u
+#define ___GFP_THISNODE		0x200000u
+#define ___GFP_ACCOUNT		0x400000u
+#define ___GFP_ZEROTAGS		0x800000u
+#ifdef CONFIG_KASAN_HW_TAGS
+#define ___GFP_SKIP_ZERO		0x1000000u
+#define ___GFP_SKIP_KASAN_UNPOISON	0x2000000u
+#define ___GFP_SKIP_KASAN_POISON	0x4000000u
+#else
+#define ___GFP_SKIP_ZERO		0
+#define ___GFP_SKIP_KASAN_UNPOISON	0
+#define ___GFP_SKIP_KASAN_POISON	0
+#endif
+#ifdef CONFIG_LOCKDEP
+#define ___GFP_NOLOCKDEP	0x8000000u
+#else
+#define ___GFP_NOLOCKDEP	0
+#endif
+/* If the above are modified, __GFP_BITS_SHIFT may need updating */
+
+/*
+ * Physical address zone modifiers (see linux/mmzone.h - low four bits)
+ *
+ * Do not put any conditional on these. If necessary modify the definitions
+ * without the underscores and use them consistently. The definitions here may
+ * be used in bit comparisons.
+ */
+#define __GFP_DMA	((__force gfp_t)___GFP_DMA)
+#define __GFP_HIGHMEM	((__force gfp_t)___GFP_HIGHMEM)
+#define __GFP_DMA32	((__force gfp_t)___GFP_DMA32)
+#define __GFP_MOVABLE	((__force gfp_t)___GFP_MOVABLE)  /* ZONE_MOVABLE allowed */
+#define GFP_ZONEMASK	(__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
+
+/**
+ * DOC: Page mobility and placement hints
+ *
+ * Page mobility and placement hints
+ * ---------------------------------
+ *
+ * These flags provide hints about how mobile the page is. Pages with similar
+ * mobility are placed within the same pageblocks to minimise problems due
+ * to external fragmentation.
+ *
+ * %__GFP_MOVABLE (also a zone modifier) indicates that the page can be
+ * moved by page migration during memory compaction or can be reclaimed.
+ *
+ * %__GFP_RECLAIMABLE is used for slab allocations that specify
+ * SLAB_RECLAIM_ACCOUNT and whose pages can be freed via shrinkers.
+ *
+ * %__GFP_WRITE indicates the caller intends to dirty the page. Where possible,
+ * these pages will be spread between local zones to avoid all the dirty
+ * pages being in one zone (fair zone allocation policy).
+ *
+ * %__GFP_HARDWALL enforces the cpuset memory allocation policy.
+ *
+ * %__GFP_THISNODE forces the allocation to be satisfied from the requested
+ * node with no fallbacks or placement policy enforcements.
+ *
+ * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg.
+ */
+#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
+#define __GFP_WRITE	((__force gfp_t)___GFP_WRITE)
+#define __GFP_HARDWALL   ((__force gfp_t)___GFP_HARDWALL)
+#define __GFP_THISNODE	((__force gfp_t)___GFP_THISNODE)
+#define __GFP_ACCOUNT	((__force gfp_t)___GFP_ACCOUNT)
+
+/**
+ * DOC: Watermark modifiers
+ *
+ * Watermark modifiers -- controls access to emergency reserves
+ * ------------------------------------------------------------
+ *
+ * %__GFP_HIGH indicates that the caller is high-priority and that granting
+ * the request is necessary before the system can make forward progress.
+ * For example, creating an IO context to clean pages.
+ *
+ * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is
+ * high priority. Users are typically interrupt handlers. This may be
+ * used in conjunction with %__GFP_HIGH
+ *
+ * %__GFP_MEMALLOC allows access to all memory. This should only be used when
+ * the caller guarantees the allocation will allow more memory to be freed
+ * very shortly e.g. process exiting or swapping. Users either should
+ * be the MM or co-ordinating closely with the VM (e.g. swap over NFS).
+ * Users of this flag have to be extremely careful to not deplete the reserve
+ * completely and implement a throttling mechanism which controls the
+ * consumption of the reserve based on the amount of freed memory.
+ * Usage of a pre-allocated pool (e.g. mempool) should be always considered
+ * before using this flag.
+ *
+ * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves.
+ * This takes precedence over the %__GFP_MEMALLOC flag if both are set.
+ */
+#define __GFP_ATOMIC	((__force gfp_t)___GFP_ATOMIC)
+#define __GFP_HIGH	((__force gfp_t)___GFP_HIGH)
+#define __GFP_MEMALLOC	((__force gfp_t)___GFP_MEMALLOC)
+#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC)
+
+/**
+ * DOC: Reclaim modifiers
+ *
+ * Reclaim modifiers
+ * -----------------
+ * Please note that all the following flags are only applicable to sleepable
+ * allocations (e.g. %GFP_NOWAIT and %GFP_ATOMIC will ignore them).
+ *
+ * %__GFP_IO can start physical IO.
+ *
+ * %__GFP_FS can call down to the low-level FS. Clearing the flag avoids the
+ * allocator recursing into the filesystem which might already be holding
+ * locks.
+ *
+ * %__GFP_DIRECT_RECLAIM indicates that the caller may enter direct reclaim.
+ * This flag can be cleared to avoid unnecessary delays when a fallback
+ * option is available.
+ *
+ * %__GFP_KSWAPD_RECLAIM indicates that the caller wants to wake kswapd when
+ * the low watermark is reached and have it reclaim pages until the high
+ * watermark is reached. A caller may wish to clear this flag when fallback
+ * options are available and the reclaim is likely to disrupt the system. The
+ * canonical example is THP allocation where a fallback is cheap but
+ * reclaim/compaction may cause indirect stalls.
+ *
+ * %__GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim.
+ *
+ * The default allocator behavior depends on the request size. We have a concept
+ * of so called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER).
+ * !costly allocations are too essential to fail so they are implicitly
+ * non-failing by default (with some exceptions like OOM victims might fail so
+ * the caller still has to check for failures) while costly requests try to be
+ * not disruptive and back off even without invoking the OOM killer.
+ * The following three modifiers might be used to override some of these
+ * implicit rules
+ *
+ * %__GFP_NORETRY: The VM implementation will try only very lightweight
+ * memory direct reclaim to get some memory under memory pressure (thus
+ * it can sleep). It will avoid disruptive actions like OOM killer. The
+ * caller must handle the failure which is quite likely to happen under
+ * heavy memory pressure. The flag is suitable when failure can easily be
+ * handled at small cost, such as reduced throughput
+ *
+ * %__GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim
+ * procedures that have previously failed if there is some indication
+ * that progress has been made else where.  It can wait for other
+ * tasks to attempt high level approaches to freeing memory such as
+ * compaction (which removes fragmentation) and page-out.
+ * There is still a definite limit to the number of retries, but it is
+ * a larger limit than with %__GFP_NORETRY.
+ * Allocations with this flag may fail, but only when there is
+ * genuinely little unused memory. While these allocations do not
+ * directly trigger the OOM killer, their failure indicates that
+ * the system is likely to need to use the OOM killer soon.  The
+ * caller must handle failure, but can reasonably do so by failing
+ * a higher-level request, or completing it only in a much less
+ * efficient manner.
+ * If the allocation does fail, and the caller is in a position to
+ * free some non-essential memory, doing so could benefit the system
+ * as a whole.
+ *
+ * %__GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller
+ * cannot handle allocation failures. The allocation could block
+ * indefinitely but will never return with failure. Testing for
+ * failure is pointless.
+ * New users should be evaluated carefully (and the flag should be
+ * used only when there is no reasonable failure policy) but it is
+ * definitely preferable to use the flag rather than opencode endless
+ * loop around allocator.
+ * Using this flag for costly allocations is _highly_ discouraged.
+ */
+#define __GFP_IO	((__force gfp_t)___GFP_IO)
+#define __GFP_FS	((__force gfp_t)___GFP_FS)
+#define __GFP_DIRECT_RECLAIM	((__force gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can reclaim */
+#define __GFP_KSWAPD_RECLAIM	((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */
+#define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM))
+#define __GFP_RETRY_MAYFAIL	((__force gfp_t)___GFP_RETRY_MAYFAIL)
+#define __GFP_NOFAIL	((__force gfp_t)___GFP_NOFAIL)
+#define __GFP_NORETRY	((__force gfp_t)___GFP_NORETRY)
+
+/**
+ * DOC: Action modifiers
+ *
+ * Action modifiers
+ * ----------------
+ *
+ * %__GFP_NOWARN suppresses allocation failure reports.
+ *
+ * %__GFP_COMP address compound page metadata.
+ *
+ * %__GFP_ZERO returns a zeroed page on success.
+ *
+ * %__GFP_ZEROTAGS zeroes memory tags at allocation time if the memory itself
+ * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that
+ * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting
+ * memory tags at the same time as zeroing memory has minimal additional
+ * performace impact.
+ *
+ * %__GFP_SKIP_KASAN_UNPOISON makes KASAN skip unpoisoning on page allocation.
+ * Only effective in HW_TAGS mode.
+ *
+ * %__GFP_SKIP_KASAN_POISON makes KASAN skip poisoning on page deallocation.
+ * Typically, used for userspace pages. Only effective in HW_TAGS mode.
+ */
+#define __GFP_NOWARN	((__force gfp_t)___GFP_NOWARN)
+#define __GFP_COMP	((__force gfp_t)___GFP_COMP)
+#define __GFP_ZERO	((__force gfp_t)___GFP_ZERO)
+#define __GFP_ZEROTAGS	((__force gfp_t)___GFP_ZEROTAGS)
+#define __GFP_SKIP_ZERO ((__force gfp_t)___GFP_SKIP_ZERO)
+#define __GFP_SKIP_KASAN_UNPOISON ((__force gfp_t)___GFP_SKIP_KASAN_UNPOISON)
+#define __GFP_SKIP_KASAN_POISON   ((__force gfp_t)___GFP_SKIP_KASAN_POISON)
+
+/* Disable lockdep for GFP context tracking */
+#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
+
+/* Room for N __GFP_FOO bits */
+#define __GFP_BITS_SHIFT (27 + IS_ENABLED(CONFIG_LOCKDEP))
+#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
+
+/**
+ * DOC: Useful GFP flag combinations
+ *
+ * Useful GFP flag combinations
+ * ----------------------------
+ *
+ * Useful GFP flag combinations that are commonly used. It is recommended
+ * that subsystems start with one of these combinations and then set/clear
+ * %__GFP_FOO flags as necessary.
+ *
+ * %GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower
+ * watermark is applied to allow access to "atomic reserves".
+ * The current implementation doesn't support NMI and few other strict
+ * non-preemptive contexts (e.g. raw_spin_lock). The same applies to %GFP_NOWAIT.
+ *
+ * %GFP_KERNEL is typical for kernel-internal allocations. The caller requires
+ * %ZONE_NORMAL or a lower zone for direct access but can direct reclaim.
+ *
+ * %GFP_KERNEL_ACCOUNT is the same as GFP_KERNEL, except the allocation is
+ * accounted to kmemcg.
+ *
+ * %GFP_NOWAIT is for kernel allocations that should not stall for direct
+ * reclaim, start physical IO or use any filesystem callback.
+ *
+ * %GFP_NOIO will use direct reclaim to discard clean pages or slab pages
+ * that do not require the starting of any physical IO.
+ * Please try to avoid using this flag directly and instead use
+ * memalloc_noio_{save,restore} to mark the whole scope which cannot
+ * perform any IO with a short explanation why. All allocation requests
+ * will inherit GFP_NOIO implicitly.
+ *
+ * %GFP_NOFS will use direct reclaim but will not use any filesystem interfaces.
+ * Please try to avoid using this flag directly and instead use
+ * memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't
+ * recurse into the FS layer with a short explanation why. All allocation
+ * requests will inherit GFP_NOFS implicitly.
+ *
+ * %GFP_USER is for userspace allocations that also need to be directly
+ * accessibly by the kernel or hardware. It is typically used by hardware
+ * for buffers that are mapped to userspace (e.g. graphics) that hardware
+ * still must DMA to. cpuset limits are enforced for these allocations.
+ *
+ * %GFP_DMA exists for historical reasons and should be avoided where possible.
+ * The flags indicates that the caller requires that the lowest zone be
+ * used (%ZONE_DMA or 16M on x86-64). Ideally, this would be removed but
+ * it would require careful auditing as some users really require it and
+ * others use the flag to avoid lowmem reserves in %ZONE_DMA and treat the
+ * lowest zone as a type of emergency reserve.
+ *
+ * %GFP_DMA32 is similar to %GFP_DMA except that the caller requires a 32-bit
+ * address. Note that kmalloc(..., GFP_DMA32) does not return DMA32 memory
+ * because the DMA32 kmalloc cache array is not implemented.
+ * (Reason: there is no such user in kernel).
+ *
+ * %GFP_HIGHUSER is for userspace allocations that may be mapped to userspace,
+ * do not need to be directly accessible by the kernel but that cannot
+ * move once in use. An example may be a hardware allocation that maps
+ * data directly into userspace but has no addressing limitations.
+ *
+ * %GFP_HIGHUSER_MOVABLE is for userspace allocations that the kernel does not
+ * need direct access to but can use kmap() when access is required. They
+ * are expected to be movable via page reclaim or page migration. Typically,
+ * pages on the LRU would also be allocated with %GFP_HIGHUSER_MOVABLE.
+ *
+ * %GFP_TRANSHUGE and %GFP_TRANSHUGE_LIGHT are used for THP allocations. They
+ * are compound allocations that will generally fail quickly if memory is not
+ * available and will not wake kswapd/kcompactd on failure. The _LIGHT
+ * version does not attempt reclaim/compaction at all and is by default used
+ * in page fault path, while the non-light is used by khugepaged.
+ */
+#define GFP_ATOMIC	(__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
+#define GFP_KERNEL	(__GFP_RECLAIM | __GFP_IO | __GFP_FS)
+#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)
+#define GFP_NOWAIT	(__GFP_KSWAPD_RECLAIM)
+#define GFP_NOIO	(__GFP_RECLAIM)
+#define GFP_NOFS	(__GFP_RECLAIM | __GFP_IO)
+#define GFP_USER	(__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
+#define GFP_DMA		__GFP_DMA
+#define GFP_DMA32	__GFP_DMA32
+#define GFP_HIGHUSER	(GFP_USER | __GFP_HIGHMEM)
+#define GFP_HIGHUSER_MOVABLE	(GFP_HIGHUSER | __GFP_MOVABLE | \
+			 __GFP_SKIP_KASAN_POISON)
+#define GFP_TRANSHUGE_LIGHT	((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
+			 __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
+#define GFP_TRANSHUGE	(GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
+
+#endif /* __LINUX_GFP_TYPES_H */
-- 
cgit 


From f0dd891dd5a1d6dc6c9d486333aac4f433f17d17 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Fri, 1 Jul 2022 05:54:30 -0700
Subject: lib/cpumask: move some one-line wrappers to header file

After moving gfp flags to a separate header, it's possible to move some
cpumask allocators into headers, and avoid creating real functions.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/cpumask.h | 34 +++++++++++++++++++++++++++++++---
 lib/cpumask.c           | 28 ----------------------------
 2 files changed, 31 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index ea3de2c2c180..80627362c774 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -12,6 +12,8 @@
 #include <linux/bitmap.h>
 #include <linux/atomic.h>
 #include <linux/bug.h>
+#include <linux/gfp_types.h>
+#include <linux/numa.h>
 
 /* Don't assign or return these: may not be this big! */
 typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
@@ -794,9 +796,35 @@ typedef struct cpumask *cpumask_var_t;
 #define __cpumask_var_read_mostly	__read_mostly
 
 bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);
-bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags);
-bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);
-bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags);
+
+static inline
+bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
+{
+	return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node);
+}
+
+/**
+ * alloc_cpumask_var - allocate a struct cpumask
+ * @mask: pointer to cpumask_var_t where the cpumask is returned
+ * @flags: GFP_ flags
+ *
+ * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is
+ * a nop returning a constant 1 (in <linux/cpumask.h>).
+ *
+ * See alloc_cpumask_var_node.
+ */
+static inline
+bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
+{
+	return alloc_cpumask_var_node(mask, flags, NUMA_NO_NODE);
+}
+
+static inline
+bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
+{
+	return alloc_cpumask_var(mask, flags | __GFP_ZERO);
+}
+
 void alloc_bootmem_cpumask_var(cpumask_var_t *mask);
 void free_cpumask_var(cpumask_var_t mask);
 void free_bootmem_cpumask_var(cpumask_var_t mask);
diff --git a/lib/cpumask.c b/lib/cpumask.c
index cb7262ff8633..f0ae119be8c4 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -70,34 +70,6 @@ bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
 }
 EXPORT_SYMBOL(alloc_cpumask_var_node);
 
-bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
-{
-	return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node);
-}
-EXPORT_SYMBOL(zalloc_cpumask_var_node);
-
-/**
- * alloc_cpumask_var - allocate a struct cpumask
- * @mask: pointer to cpumask_var_t where the cpumask is returned
- * @flags: GFP_ flags
- *
- * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is
- * a nop returning a constant 1 (in <linux/cpumask.h>).
- *
- * See alloc_cpumask_var_node.
- */
-bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
-{
-	return alloc_cpumask_var_node(mask, flags, NUMA_NO_NODE);
-}
-EXPORT_SYMBOL(alloc_cpumask_var);
-
-bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
-{
-	return alloc_cpumask_var(mask, flags | __GFP_ZERO);
-}
-EXPORT_SYMBOL(zalloc_cpumask_var);
-
 /**
  * alloc_bootmem_cpumask_var - allocate a struct cpumask from the bootmem arena.
  * @mask: pointer to cpumask_var_t where the cpumask is returned
-- 
cgit 


From 00f6842ef41d90cc335ae4dbb00d71f4f642c712 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Mon, 7 Mar 2022 16:32:00 +0000
Subject: media: v4l: Add packed YUV 4:4:4 YUVA and YUVX pixel formats

The new YUVA and YUVX are permutations of the existing AYUV and XYUV
formats. They are use by the NXP i.MX8 ISI hardware.

Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Reviewed-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Reviewed-by: Jacopo Mondi <jacopo@jmondi.org>
Reviewed-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 .../userspace-api/media/v4l/pixfmt-packed-yuv.rst    | 20 ++++++++++++++++++++
 drivers/media/v4l2-core/v4l2-ioctl.c                 |  2 ++
 include/uapi/linux/videodev2.h                       |  2 ++
 3 files changed, 24 insertions(+)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/pixfmt-packed-yuv.rst b/Documentation/userspace-api/media/v4l/pixfmt-packed-yuv.rst
index 65520c3af7cf..bf283a1b5581 100644
--- a/Documentation/userspace-api/media/v4l/pixfmt-packed-yuv.rst
+++ b/Documentation/userspace-api/media/v4l/pixfmt-packed-yuv.rst
@@ -220,6 +220,26 @@ the second byte and Y'\ :sub:`7-0` in the third byte.
       - Y'\ :sub:`7-0`
       - X\ :sub:`7-0`
 
+    * .. _V4L2-PIX-FMT-YUVA32:
+
+      - ``V4L2_PIX_FMT_YUVA32``
+      - 'YUVA'
+
+      - Y'\ :sub:`7-0`
+      - Cb\ :sub:`7-0`
+      - Cr\ :sub:`7-0`
+      - A\ :sub:`7-0`
+
+    * .. _V4L2-PIX-FMT-YUVX32:
+
+      - ``V4L2_PIX_FMT_YUVX32``
+      - 'YUVX'
+
+      - Y'\ :sub:`7-0`
+      - Cb\ :sub:`7-0`
+      - Cr\ :sub:`7-0`
+      - X\ :sub:`7-0`
+
     * .. _V4L2-PIX-FMT-YUV24:
 
       - ``V4L2_PIX_FMT_YUV24``
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index e03362c8aef9..11f19099f33b 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1296,6 +1296,8 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_XYUV32:	descr = "32-bit XYUV 8-8-8-8"; break;
 	case V4L2_PIX_FMT_VUYA32:	descr = "32-bit VUYA 8-8-8-8"; break;
 	case V4L2_PIX_FMT_VUYX32:	descr = "32-bit VUYX 8-8-8-8"; break;
+	case V4L2_PIX_FMT_YUVA32:	descr = "32-bit YUVA 8-8-8-8"; break;
+	case V4L2_PIX_FMT_YUVX32:	descr = "32-bit YUVX 8-8-8-8"; break;
 	case V4L2_PIX_FMT_YUV410:	descr = "Planar YUV 4:1:0"; break;
 	case V4L2_PIX_FMT_YUV420:	descr = "Planar YUV 4:2:0"; break;
 	case V4L2_PIX_FMT_HI240:	descr = "8-bit Dithered RGB (BTTV)"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 5311ac4fde35..0028ab74ca7c 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -593,6 +593,8 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_XYUV32  v4l2_fourcc('X', 'Y', 'U', 'V') /* 32  XYUV-8-8-8-8  */
 #define V4L2_PIX_FMT_VUYA32  v4l2_fourcc('V', 'U', 'Y', 'A') /* 32  VUYA-8-8-8-8  */
 #define V4L2_PIX_FMT_VUYX32  v4l2_fourcc('V', 'U', 'Y', 'X') /* 32  VUYX-8-8-8-8  */
+#define V4L2_PIX_FMT_YUVA32  v4l2_fourcc('Y', 'U', 'V', 'A') /* 32  YUVA-8-8-8-8  */
+#define V4L2_PIX_FMT_YUVX32  v4l2_fourcc('Y', 'U', 'V', 'X') /* 32  YUVX-8-8-8-8  */
 #define V4L2_PIX_FMT_M420    v4l2_fourcc('M', '4', '2', '0') /* 12  YUV 4:2:0 2 lines y, 1 line uv interleaved */
 
 /* two planes -- one Y, one Cr + Cb interleaved  */
-- 
cgit 


From 718d2153ad0de0c7c0b6891eaa7f9918d68b6d5e Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Thu, 17 Mar 2022 12:37:12 +0000
Subject: media: v4l2: Make colorspace validity checks more future-proof

The helper functions that test validity of colorspace-related fields
use the last value of the corresponding enums. This isn't very
future-proof, as there's a high chance someone adding a new value may
forget to update the helpers. Add new "LAST" entries to the enumerations
to improve this, and keep them private to the kernel.

Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 include/media/v4l2-common.h    |  6 +++---
 include/uapi/linux/videodev2.h | 22 ++++++++++++++++++++++
 2 files changed, 25 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h
index 3eb202259e8c..b708d63995f4 100644
--- a/include/media/v4l2-common.h
+++ b/include/media/v4l2-common.h
@@ -563,19 +563,19 @@ static inline void v4l2_buffer_set_timestamp(struct v4l2_buffer *buf,
 static inline bool v4l2_is_colorspace_valid(__u32 colorspace)
 {
 	return colorspace > V4L2_COLORSPACE_DEFAULT &&
-	       colorspace <= V4L2_COLORSPACE_DCI_P3;
+	       colorspace < V4L2_COLORSPACE_LAST;
 }
 
 static inline bool v4l2_is_xfer_func_valid(__u32 xfer_func)
 {
 	return xfer_func > V4L2_XFER_FUNC_DEFAULT &&
-	       xfer_func <= V4L2_XFER_FUNC_SMPTE2084;
+	       xfer_func < V4L2_XFER_FUNC_LAST;
 }
 
 static inline bool v4l2_is_ycbcr_enc_valid(__u8 ycbcr_enc)
 {
 	return ycbcr_enc > V4L2_YCBCR_ENC_DEFAULT &&
-	       ycbcr_enc <= V4L2_YCBCR_ENC_SMPTE240M;
+	       ycbcr_enc < V4L2_YCBCR_ENC_LAST;
 }
 
 static inline bool v4l2_is_hsv_enc_valid(__u8 hsv_enc)
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 0028ab74ca7c..e32b9e25258d 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -245,6 +245,14 @@ enum v4l2_colorspace {
 
 	/* DCI-P3 colorspace, used by cinema projectors */
 	V4L2_COLORSPACE_DCI_P3        = 12,
+
+#ifdef __KERNEL__
+	/*
+	 * Largest supported colorspace value, assigned by the compiler, used
+	 * by the framework to check for invalid values.
+	 */
+	V4L2_COLORSPACE_LAST,
+#endif
 };
 
 /*
@@ -283,6 +291,13 @@ enum v4l2_xfer_func {
 	V4L2_XFER_FUNC_NONE        = 5,
 	V4L2_XFER_FUNC_DCI_P3      = 6,
 	V4L2_XFER_FUNC_SMPTE2084   = 7,
+#ifdef __KERNEL__
+	/*
+	 * Largest supported transfer function value, assigned by the compiler,
+	 * used by the framework to check for invalid values.
+	 */
+	V4L2_XFER_FUNC_LAST,
+#endif
 };
 
 /*
@@ -343,6 +358,13 @@ enum v4l2_ycbcr_encoding {
 
 	/* SMPTE 240M -- Obsolete HDTV */
 	V4L2_YCBCR_ENC_SMPTE240M      = 8,
+#ifdef __KERNEL__
+	/*
+	 * Largest supported encoding value, assigned by the compiler, used by
+	 * the framework to check for invalid values.
+	 */
+	V4L2_YCBCR_ENC_LAST,
+#endif
 };
 
 /*
-- 
cgit 


From 9b04369b060fd4885f728b7a4ab4851ffb1abb64 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Date: Tue, 12 Apr 2022 01:15:36 +0300
Subject: drm/scheduler: Don't kill jobs in interrupt context

Interrupt context can't sleep. Drivers like Panfrost and MSM are taking
mutex when job is released, and thus, that code can sleep. This results
into "BUG: scheduling while atomic" if locks are contented while job is
freed. There is no good reason for releasing scheduler's jobs in IRQ
context, hence use normal context to fix the trouble.

Cc: stable@vger.kernel.org
Fixes: 542cff7893a3 ("drm/sched: Avoid lockdep spalt on killing a processes")
Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220411221536.283312-1-dmitry.osipenko@collabora.com
---
 drivers/gpu/drm/scheduler/sched_entity.c | 6 +++---
 include/drm/gpu_scheduler.h              | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index 191c56064f19..6b25b2f4f5a3 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -190,7 +190,7 @@ long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
 }
 EXPORT_SYMBOL(drm_sched_entity_flush);
 
-static void drm_sched_entity_kill_jobs_irq_work(struct irq_work *wrk)
+static void drm_sched_entity_kill_jobs_work(struct work_struct *wrk)
 {
 	struct drm_sched_job *job = container_of(wrk, typeof(*job), work);
 
@@ -207,8 +207,8 @@ static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
 	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
 						 finish_cb);
 
-	init_irq_work(&job->work, drm_sched_entity_kill_jobs_irq_work);
-	irq_work_queue(&job->work);
+	INIT_WORK(&job->work, drm_sched_entity_kill_jobs_work);
+	schedule_work(&job->work);
 }
 
 static struct dma_fence *
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 0fca8f38bee4..addb135eeea6 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -28,7 +28,7 @@
 #include <linux/dma-fence.h>
 #include <linux/completion.h>
 #include <linux/xarray.h>
-#include <linux/irq_work.h>
+#include <linux/workqueue.h>
 
 #define MAX_WAIT_SCHED_ENTITY_Q_EMPTY msecs_to_jiffies(1000)
 
@@ -295,7 +295,7 @@ struct drm_sched_job {
 	 */
 	union {
 		struct dma_fence_cb		finish_cb;
-		struct irq_work 		work;
+		struct work_struct 		work;
 	};
 
 	uint64_t			id;
-- 
cgit 


From 982c0487185bd466059ff618f398a8d074ddb654 Mon Sep 17 00:00:00 2001
From: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com>
Date: Fri, 1 Jul 2022 14:15:58 +0100
Subject: media: subdev: Add v4l2_subdev_call_state_try() macro

Add a helper macro for the situations where a non-MC driver needs to
call a state-operation (operation which takes a subdev state as a
parameter) in try-context in another subdev.

The macro allocates a new subdev state for the called subdev and frees
the state afterwards.

An example use case is a media platform driver testing if a
v4l2_subdev_format would be accepted by a source subdev.

This should not be used in MC drivers.

Signed-off-by: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Reviewed-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Tested-by: Marek Vasut <marex@denx.de>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 include/media/v4l2-subdev.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'include')

diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h
index b661e1817470..9689f38a0af1 100644
--- a/include/media/v4l2-subdev.h
+++ b/include/media/v4l2-subdev.h
@@ -1433,6 +1433,40 @@ extern const struct v4l2_subdev_ops v4l2_subdev_call_wrappers;
 		__result;						\
 	})
 
+/**
+ * v4l2_subdev_call_state_try - call an operation of a v4l2_subdev which
+ *				takes state as a parameter, passing the
+ *				subdev a newly allocated try state.
+ *
+ * @sd: pointer to the &struct v4l2_subdev
+ * @o: name of the element at &struct v4l2_subdev_ops that contains @f.
+ *     Each element there groups a set of callbacks functions.
+ * @f: callback function to be called.
+ *     The callback functions are defined in groups, according to
+ *     each element at &struct v4l2_subdev_ops.
+ * @args: arguments for @f.
+ *
+ * This is similar to v4l2_subdev_call_state_active(), except that as this
+ * version allocates a new state, this is only usable for
+ * V4L2_SUBDEV_FORMAT_TRY use cases.
+ *
+ * Note: only legacy non-MC drivers may need this macro.
+ */
+#define v4l2_subdev_call_state_try(sd, o, f, args...)                 \
+	({                                                            \
+		int __result;                                         \
+		static struct lock_class_key __key;                   \
+		const char *name = KBUILD_BASENAME                    \
+			":" __stringify(__LINE__) ":state->lock";     \
+		struct v4l2_subdev_state *state =                     \
+			__v4l2_subdev_state_alloc(sd, name, &__key);  \
+		v4l2_subdev_lock_state(state);                        \
+		__result = v4l2_subdev_call(sd, o, f, state, ##args); \
+		v4l2_subdev_unlock_state(state);                      \
+		__v4l2_subdev_state_free(state);                      \
+		__result;                                             \
+	})
+
 /**
  * v4l2_subdev_has_op - Checks if a subdev defines a certain operation.
  *
-- 
cgit 


From 1ed3d6446b966143e67a106132eea169b3fc5e1c Mon Sep 17 00:00:00 2001
From: Daniel Scally <djrscally@gmail.com>
Date: Thu, 7 Jul 2022 23:47:32 +0100
Subject: media: entity: Add iterator for entity data links

Iterating over the links for an entity is a somewhat common need
through the media subsystem, but generally the assumption is that
they will all be data links. To meet that assumption add a new macro
that iterates through an entity's links and skips non-data links.

Signed-off-by: Daniel Scally <djrscally@gmail.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/media/mc/mc-entity.c | 16 ++++++++++++++++
 include/media/media-entity.h | 30 ++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+)

(limited to 'include')

diff --git a/drivers/media/mc/mc-entity.c b/drivers/media/mc/mc-entity.c
index 11f5207f73aa..a247d5679930 100644
--- a/drivers/media/mc/mc-entity.c
+++ b/drivers/media/mc/mc-entity.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/bitmap.h>
+#include <linux/list.h>
 #include <linux/property.h>
 #include <linux/slab.h>
 #include <media/media-entity.h>
@@ -1051,3 +1052,18 @@ struct media_link *media_create_ancillary_link(struct media_entity *primary,
 	return link;
 }
 EXPORT_SYMBOL_GPL(media_create_ancillary_link);
+
+struct media_link *__media_entity_next_link(struct media_entity *entity,
+					    struct media_link *link,
+					    unsigned long link_type)
+{
+	link = link ? list_next_entry(link, list)
+		    : list_first_entry(&entity->links, typeof(*link), list);
+
+	list_for_each_entry_from(link, &entity->links, list)
+		if ((link->flags & MEDIA_LNK_FL_LINK_TYPE) == link_type)
+			return link;
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(__media_entity_next_link);
diff --git a/include/media/media-entity.h b/include/media/media-entity.h
index a9a1c0ec5d1c..4efe2f88e6dc 100644
--- a/include/media/media-entity.h
+++ b/include/media/media-entity.h
@@ -1140,4 +1140,34 @@ struct media_link *
 media_create_ancillary_link(struct media_entity *primary,
 			    struct media_entity *ancillary);
 
+/**
+ * __media_entity_next_link() - Iterate through a &media_entity's links
+ *
+ * @entity:	pointer to the &media_entity
+ * @link:	pointer to a &media_link to hold the iterated values
+ * @link_type:	one of the MEDIA_LNK_FL_LINK_TYPE flags
+ *
+ * Return the next link against an entity matching a specific link type. This
+ * allows iteration through an entity's links whilst guaranteeing all of the
+ * returned links are of the given type.
+ */
+struct media_link *__media_entity_next_link(struct media_entity *entity,
+					    struct media_link *link,
+					    unsigned long link_type);
+
+/**
+ * for_each_media_entity_data_link() - Iterate through an entity's data links
+ *
+ * @entity:	pointer to the &media_entity
+ * @link:	pointer to a &media_link to hold the iterated values
+ *
+ * Iterate over a &media_entity's data links
+ */
+#define for_each_media_entity_data_link(entity, link)			\
+	for (link = __media_entity_next_link(entity, NULL,		\
+					     MEDIA_LNK_FL_DATA_LINK);	\
+	     link;							\
+	     link = __media_entity_next_link(entity, link,		\
+					     MEDIA_LNK_FL_DATA_LINK))
+
 #endif
-- 
cgit 


From e7255c00b10e5e570dd8eb24f59e964eeec38d3b Mon Sep 17 00:00:00 2001
From: Cezary Rojewski <cezary.rojewski@intel.com>
Date: Wed, 6 Jul 2022 14:02:26 +0200
Subject: ALSA: hda: Skip event processing for unregistered codecs

When codec is unbound but not yet removed, in the eyes of
snd_hdac_bus_process_unsol_events() it is still a valid target to
delegate work to. Such behaviour may lead to use-after-free errors.
Address by verifying if codec is actually registered.

Signed-off-by: Cezary Rojewski <cezary.rojewski@intel.com>
Link: https://lore.kernel.org/r/20220706120230.427296-6-cezary.rojewski@intel.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/hda_codec.h |  1 -
 include/sound/hdaudio.h   |  1 +
 sound/hda/hdac_bus.c      |  2 +-
 sound/pci/hda/hda_codec.c | 10 +++++-----
 sound/soc/codecs/hda.c    |  4 ++--
 5 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/sound/hda_codec.h b/include/sound/hda_codec.h
index b7be300b6b18..6d3c82c4b6ac 100644
--- a/include/sound/hda_codec.h
+++ b/include/sound/hda_codec.h
@@ -231,7 +231,6 @@ struct hda_codec {
 	/* misc flags */
 	unsigned int configured:1; /* codec was configured */
 	unsigned int in_freeing:1; /* being released */
-	unsigned int registered:1; /* codec was registered */
 	unsigned int display_power_control:1; /* needs display power */
 	unsigned int spdif_status_reset :1; /* needs to toggle SPDIF for each
 					     * status change
diff --git a/include/sound/hdaudio.h b/include/sound/hdaudio.h
index 15f15075238d..797bf67a164d 100644
--- a/include/sound/hdaudio.h
+++ b/include/sound/hdaudio.h
@@ -93,6 +93,7 @@ struct hdac_device {
 	bool lazy_cache:1;	/* don't wake up for writes */
 	bool caps_overwriting:1; /* caps overwrite being in process */
 	bool cache_coef:1;	/* cache COEF read/write too */
+	unsigned int registered:1; /* codec was registered */
 };
 
 /* device/driver type used for matching */
diff --git a/sound/hda/hdac_bus.c b/sound/hda/hdac_bus.c
index 71db8592b33d..d497414a5538 100644
--- a/sound/hda/hdac_bus.c
+++ b/sound/hda/hdac_bus.c
@@ -183,7 +183,7 @@ static void snd_hdac_bus_process_unsol_events(struct work_struct *work)
 		if (!(caddr & (1 << 4))) /* no unsolicited event? */
 			continue;
 		codec = bus->caddr_tbl[caddr & 0x0f];
-		if (!codec || !codec->dev.driver)
+		if (!codec || !codec->registered)
 			continue;
 		spin_unlock_irq(&bus->reg_lock);
 		drv = drv_to_hdac_driver(codec->dev.driver);
diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c
index b1921f920513..7be74227bf19 100644
--- a/sound/pci/hda/hda_codec.c
+++ b/sound/pci/hda/hda_codec.c
@@ -772,11 +772,11 @@ static void codec_release_pcms(struct hda_codec *codec)
  */
 void snd_hda_codec_cleanup_for_unbind(struct hda_codec *codec)
 {
-	if (codec->registered) {
+	if (codec->core.registered) {
 		/* pm_runtime_put() is called in snd_hdac_device_exit() */
 		pm_runtime_get_noresume(hda_codec_dev(codec));
 		pm_runtime_disable(hda_codec_dev(codec));
-		codec->registered = 0;
+		codec->core.registered = 0;
 	}
 
 	snd_hda_codec_disconnect_pcms(codec);
@@ -825,14 +825,14 @@ void snd_hda_codec_display_power(struct hda_codec *codec, bool enable)
  */
 void snd_hda_codec_register(struct hda_codec *codec)
 {
-	if (codec->registered)
+	if (codec->core.registered)
 		return;
 	if (device_is_registered(hda_codec_dev(codec))) {
 		snd_hda_codec_display_power(codec, true);
 		pm_runtime_enable(hda_codec_dev(codec));
 		/* it was powered up in snd_hda_codec_new(), now all done */
 		snd_hda_power_down(codec);
-		codec->registered = 1;
+		codec->core.registered = 1;
 	}
 }
 EXPORT_SYMBOL_GPL(snd_hda_codec_register);
@@ -3047,7 +3047,7 @@ void snd_hda_codec_shutdown(struct hda_codec *codec)
 	struct hda_pcm *cpcm;
 
 	/* Skip the shutdown if codec is not registered */
-	if (!codec->registered)
+	if (!codec->core.registered)
 		return;
 
 	cancel_delayed_work_sync(&codec->jackpoll_work);
diff --git a/sound/soc/codecs/hda.c b/sound/soc/codecs/hda.c
index edcb8bc6806b..ad20a3dff9b7 100644
--- a/sound/soc/codecs/hda.c
+++ b/sound/soc/codecs/hda.c
@@ -274,7 +274,7 @@ static void hda_codec_remove(struct snd_soc_component *component)
 	struct hdac_device *hdev = &codec->core;
 	struct hdac_bus *bus = hdev->bus;
 	struct hdac_ext_link *hlink;
-	bool was_registered = codec->registered;
+	bool was_registered = codec->core.registered;
 
 	/* Don't allow any more runtime suspends */
 	pm_runtime_forbid(&hdev->dev);
@@ -376,7 +376,7 @@ static int hda_hdev_detach(struct hdac_device *hdev)
 {
 	struct hda_codec *codec = dev_to_hda_codec(&hdev->dev);
 
-	if (codec->registered)
+	if (codec->core.registered)
 		cancel_delayed_work_sync(&codec->jackpoll_work);
 
 	snd_soc_unregister_component(&hdev->dev);
-- 
cgit 


From 64fe675e999c2c7d753ecaaa1349693c59ce6c11 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Fri, 8 Jul 2022 17:21:40 +0100
Subject: media: videodev2.h: add V4L2_CTRL_FLAG_DYNAMIC_ARRAY

Add a new flag that indicates that this control is a dynamically sized
array. Also document this flag.

Currently dynamically sized arrays are limited to one dimensional arrays,
but that might change in the future if there is a need for it.

The initial use-case of dynamic arrays are stateless codecs. A frame
can be divided in many slices, so you want to provide an array containing
slice information for each slice. Typically the number of slices is small,
but the standard allow for hundreds or thousands of slices. Dynamic arrays
are a good solution since sizing the array for the worst case would waste
substantial amounts of memory.

Acked-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Tested-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Tested-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 Documentation/userspace-api/media/v4l/vidioc-queryctrl.rst   | 8 ++++++++
 Documentation/userspace-api/media/videodev2.h.rst.exceptions | 1 +
 include/uapi/linux/videodev2.h                               | 1 +
 3 files changed, 10 insertions(+)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/vidioc-queryctrl.rst b/Documentation/userspace-api/media/v4l/vidioc-queryctrl.rst
index 88f630252d98..a20dfa2a933b 100644
--- a/Documentation/userspace-api/media/v4l/vidioc-queryctrl.rst
+++ b/Documentation/userspace-api/media/v4l/vidioc-queryctrl.rst
@@ -625,6 +625,14 @@ See also the examples in :ref:`control`.
 	``V4L2_CTRL_FLAG_GRABBED`` flag when buffers are allocated or
 	streaming is in progress since most drivers do not support changing
 	the format in that case.
+    * - ``V4L2_CTRL_FLAG_DYNAMIC_ARRAY``
+      - 0x0800
+      - This control is a dynamically sized 1-dimensional array. It
+        behaves the same as a regular array, except that the number
+	of elements as reported by the ``elems`` field is between 1 and
+	``dims[0]``. So setting the control with a differently sized
+	array will change the ``elems`` field when the control is
+	queried afterwards.
 
 Return Value
 ============
diff --git a/Documentation/userspace-api/media/videodev2.h.rst.exceptions b/Documentation/userspace-api/media/videodev2.h.rst.exceptions
index 9cbb7a0c354a..0b91200776f8 100644
--- a/Documentation/userspace-api/media/videodev2.h.rst.exceptions
+++ b/Documentation/userspace-api/media/videodev2.h.rst.exceptions
@@ -379,6 +379,7 @@ replace define V4L2_CTRL_FLAG_VOLATILE control-flags
 replace define V4L2_CTRL_FLAG_HAS_PAYLOAD control-flags
 replace define V4L2_CTRL_FLAG_EXECUTE_ON_WRITE control-flags
 replace define V4L2_CTRL_FLAG_MODIFY_LAYOUT control-flags
+replace define V4L2_CTRL_FLAG_DYNAMIC_ARRAY control-flags
 
 replace define V4L2_CTRL_FLAG_NEXT_CTRL control
 replace define V4L2_CTRL_FLAG_NEXT_COMPOUND control
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index e32b9e25258d..87ebc6baafb6 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -1914,6 +1914,7 @@ struct v4l2_querymenu {
 #define V4L2_CTRL_FLAG_HAS_PAYLOAD	0x0100
 #define V4L2_CTRL_FLAG_EXECUTE_ON_WRITE	0x0200
 #define V4L2_CTRL_FLAG_MODIFY_LAYOUT	0x0400
+#define V4L2_CTRL_FLAG_DYNAMIC_ARRAY	0x0800
 
 /*  Query flags, to be ORed with the control ID */
 #define V4L2_CTRL_FLAG_NEXT_CTRL	0x80000000
-- 
cgit 


From fb582cba44928f78e56047d11a08dad570e97ae8 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Fri, 8 Jul 2022 17:21:41 +0100
Subject: media: v4l2-ctrls: add support for dynamically allocated arrays.

Implement support for dynamically allocated arrays.

Most of the changes concern keeping track of the number of elements
of the array and the number of elements allocated for the array and
reallocating memory if needed.

Acked-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Acked-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Tested-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Tested-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/media/v4l2-core/v4l2-ctrls-api.c     | 103 +++++++++++----
 drivers/media/v4l2-core/v4l2-ctrls-core.c    | 182 ++++++++++++++++++++++-----
 drivers/media/v4l2-core/v4l2-ctrls-priv.h    |   3 +-
 drivers/media/v4l2-core/v4l2-ctrls-request.c |  13 +-
 include/media/v4l2-ctrls.h                   |  42 +++++--
 5 files changed, 272 insertions(+), 71 deletions(-)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/v4l2-ctrls-api.c b/drivers/media/v4l2-core/v4l2-ctrls-api.c
index db9baa0bd05f..50d012ba3c02 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls-api.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls-api.c
@@ -97,29 +97,47 @@ static int def_to_user(struct v4l2_ext_control *c, struct v4l2_ctrl *ctrl)
 	return ptr_to_user(c, ctrl, ctrl->p_new);
 }
 
-/* Helper function: copy the caller-provider value to the given control value */
-static int user_to_ptr(struct v4l2_ext_control *c,
-		       struct v4l2_ctrl *ctrl,
-		       union v4l2_ctrl_ptr ptr)
+/* Helper function: copy the caller-provider value as the new control value */
+static int user_to_new(struct v4l2_ext_control *c, struct v4l2_ctrl *ctrl)
 {
 	int ret;
 	u32 size;
 
-	ctrl->is_new = 1;
+	ctrl->is_new = 0;
+	if (ctrl->is_dyn_array &&
+	    c->size > ctrl->p_dyn_alloc_elems * ctrl->elem_size) {
+		void *old = ctrl->p_dyn;
+		void *tmp = kvzalloc(2 * c->size, GFP_KERNEL);
+
+		if (!tmp)
+			return -ENOMEM;
+		memcpy(tmp, ctrl->p_new.p, ctrl->elems * ctrl->elem_size);
+		memcpy(tmp + c->size, ctrl->p_cur.p, ctrl->elems * ctrl->elem_size);
+		ctrl->p_new.p = tmp;
+		ctrl->p_cur.p = tmp + c->size;
+		ctrl->p_dyn = tmp;
+		ctrl->p_dyn_alloc_elems = c->size / ctrl->elem_size;
+		kvfree(old);
+	}
+
 	if (ctrl->is_ptr && !ctrl->is_string) {
+		unsigned int elems = c->size / ctrl->elem_size;
 		unsigned int idx;
 
-		ret = copy_from_user(ptr.p, c->ptr, c->size) ? -EFAULT : 0;
-		if (ret || !ctrl->is_array)
-			return ret;
-		for (idx = c->size / ctrl->elem_size; idx < ctrl->elems; idx++)
-			ctrl->type_ops->init(ctrl, idx, ptr);
+		if (copy_from_user(ctrl->p_new.p, c->ptr, c->size))
+			return -EFAULT;
+		ctrl->is_new = 1;
+		if (ctrl->is_dyn_array)
+			ctrl->new_elems = elems;
+		else if (ctrl->is_array)
+			for (idx = elems; idx < ctrl->elems; idx++)
+				ctrl->type_ops->init(ctrl, idx, ctrl->p_new);
 		return 0;
 	}
 
 	switch (ctrl->type) {
 	case V4L2_CTRL_TYPE_INTEGER64:
-		*ptr.p_s64 = c->value64;
+		*ctrl->p_new.p_s64 = c->value64;
 		break;
 	case V4L2_CTRL_TYPE_STRING:
 		size = c->size;
@@ -127,32 +145,27 @@ static int user_to_ptr(struct v4l2_ext_control *c,
 			return -ERANGE;
 		if (size > ctrl->maximum + 1)
 			size = ctrl->maximum + 1;
-		ret = copy_from_user(ptr.p_char, c->string, size) ? -EFAULT : 0;
+		ret = copy_from_user(ctrl->p_new.p_char, c->string, size) ? -EFAULT : 0;
 		if (!ret) {
-			char last = ptr.p_char[size - 1];
+			char last = ctrl->p_new.p_char[size - 1];
 
-			ptr.p_char[size - 1] = 0;
+			ctrl->p_new.p_char[size - 1] = 0;
 			/*
 			 * If the string was longer than ctrl->maximum,
 			 * then return an error.
 			 */
-			if (strlen(ptr.p_char) == ctrl->maximum && last)
+			if (strlen(ctrl->p_new.p_char) == ctrl->maximum && last)
 				return -ERANGE;
 		}
 		return ret;
 	default:
-		*ptr.p_s32 = c->value;
+		*ctrl->p_new.p_s32 = c->value;
 		break;
 	}
+	ctrl->is_new = 1;
 	return 0;
 }
 
-/* Helper function: copy the caller-provider value as the new control value */
-static int user_to_new(struct v4l2_ext_control *c, struct v4l2_ctrl *ctrl)
-{
-	return user_to_ptr(c, ctrl, ctrl->p_new);
-}
-
 /*
  * VIDIOC_G/TRY/S_EXT_CTRLS implementation
  */
@@ -254,7 +267,31 @@ static int prepare_ext_ctrls(struct v4l2_ctrl_handler *hdl,
 			have_clusters = true;
 		if (ctrl->cluster[0] != ctrl)
 			ref = find_ref_lock(hdl, ctrl->cluster[0]->id);
-		if (ctrl->is_ptr && !ctrl->is_string) {
+		if (ctrl->is_dyn_array) {
+			unsigned int max_size = ctrl->dims[0] * ctrl->elem_size;
+			unsigned int tot_size = ctrl->elem_size;
+
+			if (cs->which == V4L2_CTRL_WHICH_REQUEST_VAL)
+				tot_size *= ref->p_req_elems;
+			else
+				tot_size *= ctrl->elems;
+
+			c->size = ctrl->elem_size * (c->size / ctrl->elem_size);
+			if (get) {
+				if (c->size < tot_size) {
+					c->size = tot_size;
+					return -ENOSPC;
+				}
+				c->size = tot_size;
+			} else {
+				if (c->size > max_size) {
+					c->size = max_size;
+					return -ENOSPC;
+				}
+				if (!c->size)
+					return -EFAULT;
+			}
+		} else if (ctrl->is_ptr && !ctrl->is_string) {
 			unsigned int tot_size = ctrl->elems * ctrl->elem_size;
 
 			if (c->size < tot_size) {
@@ -346,7 +383,7 @@ static int class_check(struct v4l2_ctrl_handler *hdl, u32 which)
  *
  * Note that v4l2_g_ext_ctrls_common() with 'which' set to
  * V4L2_CTRL_WHICH_REQUEST_VAL is only called if the request was
- * completed, and in that case valid_p_req is true for all controls.
+ * completed, and in that case p_req_valid is true for all controls.
  */
 int v4l2_g_ext_ctrls_common(struct v4l2_ctrl_handler *hdl,
 			    struct v4l2_ext_controls *cs,
@@ -430,7 +467,9 @@ int v4l2_g_ext_ctrls_common(struct v4l2_ctrl_handler *hdl,
 
 			if (is_default)
 				ret = def_to_user(cs->controls + idx, ref->ctrl);
-			else if (is_request && ref->valid_p_req)
+			else if (is_request && ref->p_req_dyn_enomem)
+				ret = -ENOMEM;
+			else if (is_request && ref->p_req_valid)
 				ret = req_to_user(cs->controls + idx, ref);
 			else if (is_volatile)
 				ret = new_to_user(cs->controls + idx, ref->ctrl);
@@ -457,6 +496,17 @@ int v4l2_g_ext_ctrls(struct v4l2_ctrl_handler *hdl, struct video_device *vdev,
 }
 EXPORT_SYMBOL(v4l2_g_ext_ctrls);
 
+/* Validate a new control */
+static int validate_new(const struct v4l2_ctrl *ctrl, union v4l2_ctrl_ptr p_new)
+{
+	unsigned int idx;
+	int err = 0;
+
+	for (idx = 0; !err && idx < ctrl->new_elems; idx++)
+		err = ctrl->type_ops->validate(ctrl, idx, p_new);
+	return err;
+}
+
 /* Validate controls. */
 static int validate_ctrls(struct v4l2_ext_controls *cs,
 			  struct v4l2_ctrl_helper *helpers,
@@ -872,6 +922,9 @@ int __v4l2_ctrl_s_ctrl_compound(struct v4l2_ctrl *ctrl,
 	/* It's a driver bug if this happens. */
 	if (WARN_ON(ctrl->type != type))
 		return -EINVAL;
+	/* Setting dynamic arrays is not (yet?) supported. */
+	if (WARN_ON(ctrl->is_dyn_array))
+		return -EINVAL;
 	memcpy(ctrl->p_new.p, p, ctrl->elems * ctrl->elem_size);
 	return set_ctrl(NULL, ctrl, 0);
 }
diff --git a/drivers/media/v4l2-core/v4l2-ctrls-core.c b/drivers/media/v4l2-core/v4l2-ctrls-core.c
index 949c1884d9c1..ff8a61f24d0a 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls-core.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls-core.c
@@ -991,11 +991,12 @@ EXPORT_SYMBOL(v4l2_ctrl_notify);
 
 /* Copy the one value to another. */
 static void ptr_to_ptr(struct v4l2_ctrl *ctrl,
-		       union v4l2_ctrl_ptr from, union v4l2_ctrl_ptr to)
+		       union v4l2_ctrl_ptr from, union v4l2_ctrl_ptr to,
+		       unsigned int elems)
 {
 	if (ctrl == NULL)
 		return;
-	memcpy(to.p, from.p_const, ctrl->elems * ctrl->elem_size);
+	memcpy(to.p, from.p_const, elems * ctrl->elem_size);
 }
 
 /* Copy the new value to the current value. */
@@ -1008,8 +1009,11 @@ void new_to_cur(struct v4l2_fh *fh, struct v4l2_ctrl *ctrl, u32 ch_flags)
 
 	/* has_changed is set by cluster_changed */
 	changed = ctrl->has_changed;
-	if (changed)
-		ptr_to_ptr(ctrl, ctrl->p_new, ctrl->p_cur);
+	if (changed) {
+		if (ctrl->is_dyn_array)
+			ctrl->elems = ctrl->new_elems;
+		ptr_to_ptr(ctrl, ctrl->p_new, ctrl->p_cur, ctrl->elems);
+	}
 
 	if (ch_flags & V4L2_EVENT_CTRL_CH_FLAGS) {
 		/* Note: CH_FLAGS is only set for auto clusters. */
@@ -1039,36 +1043,122 @@ void cur_to_new(struct v4l2_ctrl *ctrl)
 {
 	if (ctrl == NULL)
 		return;
-	ptr_to_ptr(ctrl, ctrl->p_cur, ctrl->p_new);
+	if (ctrl->is_dyn_array)
+		ctrl->new_elems = ctrl->elems;
+	ptr_to_ptr(ctrl, ctrl->p_cur, ctrl->p_new, ctrl->new_elems);
+}
+
+static bool req_alloc_dyn_array(struct v4l2_ctrl_ref *ref, u32 elems)
+{
+	void *tmp;
+
+	if (elems < ref->p_req_dyn_alloc_elems)
+		return true;
+
+	tmp = kvmalloc(elems * ref->ctrl->elem_size, GFP_KERNEL);
+
+	if (!tmp) {
+		ref->p_req_dyn_enomem = true;
+		return false;
+	}
+	ref->p_req_dyn_enomem = false;
+	kvfree(ref->p_req.p);
+	ref->p_req.p = tmp;
+	ref->p_req_dyn_alloc_elems = elems;
+	return true;
 }
 
 /* Copy the new value to the request value */
 void new_to_req(struct v4l2_ctrl_ref *ref)
 {
+	struct v4l2_ctrl *ctrl;
+
 	if (!ref)
 		return;
-	ptr_to_ptr(ref->ctrl, ref->ctrl->p_new, ref->p_req);
-	ref->valid_p_req = true;
+
+	ctrl = ref->ctrl;
+	if (ctrl->is_dyn_array && !req_alloc_dyn_array(ref, ctrl->new_elems))
+		return;
+
+	ref->p_req_elems = ctrl->new_elems;
+	ptr_to_ptr(ctrl, ctrl->p_new, ref->p_req, ref->p_req_elems);
+	ref->p_req_valid = true;
 }
 
 /* Copy the current value to the request value */
 void cur_to_req(struct v4l2_ctrl_ref *ref)
 {
+	struct v4l2_ctrl *ctrl;
+
 	if (!ref)
 		return;
-	ptr_to_ptr(ref->ctrl, ref->ctrl->p_cur, ref->p_req);
-	ref->valid_p_req = true;
+
+	ctrl = ref->ctrl;
+	if (ctrl->is_dyn_array && !req_alloc_dyn_array(ref, ctrl->elems))
+		return;
+
+	ref->p_req_elems = ctrl->elems;
+	ptr_to_ptr(ctrl, ctrl->p_cur, ref->p_req, ctrl->elems);
+	ref->p_req_valid = true;
 }
 
 /* Copy the request value to the new value */
-void req_to_new(struct v4l2_ctrl_ref *ref)
+int req_to_new(struct v4l2_ctrl_ref *ref)
 {
+	struct v4l2_ctrl *ctrl;
+
 	if (!ref)
-		return;
-	if (ref->valid_p_req)
-		ptr_to_ptr(ref->ctrl, ref->p_req, ref->ctrl->p_new);
-	else
-		ptr_to_ptr(ref->ctrl, ref->ctrl->p_cur, ref->ctrl->p_new);
+		return 0;
+
+	ctrl = ref->ctrl;
+
+	/*
+	 * This control was never set in the request, so just use the current
+	 * value.
+	 */
+	if (!ref->p_req_valid) {
+		if (ctrl->is_dyn_array)
+			ctrl->new_elems = ctrl->elems;
+		ptr_to_ptr(ctrl, ctrl->p_cur, ctrl->p_new, ctrl->new_elems);
+		return 0;
+	}
+
+	/* Not a dynamic array, so just copy the request value */
+	if (!ctrl->is_dyn_array) {
+		ptr_to_ptr(ctrl, ref->p_req, ctrl->p_new, ctrl->new_elems);
+		return 0;
+	}
+
+	/* Sanity check, should never happen */
+	if (WARN_ON(!ref->p_req_dyn_alloc_elems))
+		return -ENOMEM;
+
+	/*
+	 * Check if the number of elements in the request is more than the
+	 * elements in ctrl->p_dyn. If so, attempt to realloc ctrl->p_dyn.
+	 * Note that p_dyn is allocated with twice the number of elements
+	 * in the dynamic array since it has to store both the current and
+	 * new value of such a control.
+	 */
+	if (ref->p_req_elems > ctrl->p_dyn_alloc_elems) {
+		unsigned int sz = ref->p_req_elems * ctrl->elem_size;
+		void *old = ctrl->p_dyn;
+		void *tmp = kvzalloc(2 * sz, GFP_KERNEL);
+
+		if (!tmp)
+			return -ENOMEM;
+		memcpy(tmp, ctrl->p_new.p, ctrl->elems * ctrl->elem_size);
+		memcpy(tmp + sz, ctrl->p_cur.p, ctrl->elems * ctrl->elem_size);
+		ctrl->p_new.p = tmp;
+		ctrl->p_cur.p = tmp + sz;
+		ctrl->p_dyn = tmp;
+		ctrl->p_dyn_alloc_elems = ref->p_req_elems;
+		kvfree(old);
+	}
+
+	ctrl->new_elems = ref->p_req_elems;
+	ptr_to_ptr(ctrl, ref->p_req, ctrl->p_new, ctrl->new_elems);
+	return 0;
 }
 
 /* Control range checking */
@@ -1110,17 +1200,6 @@ int check_range(enum v4l2_ctrl_type type,
 	}
 }
 
-/* Validate a new control */
-int validate_new(const struct v4l2_ctrl *ctrl, union v4l2_ctrl_ptr p_new)
-{
-	unsigned idx;
-	int err = 0;
-
-	for (idx = 0; !err && idx < ctrl->elems; idx++)
-		err = ctrl->type_ops->validate(ctrl, idx, p_new);
-	return err;
-}
-
 /* Set the handler's error code if it wasn't set earlier already */
 static inline int handler_set_err(struct v4l2_ctrl_handler *hdl, int err)
 {
@@ -1164,6 +1243,8 @@ void v4l2_ctrl_handler_free(struct v4l2_ctrl_handler *hdl)
 	/* Free all nodes */
 	list_for_each_entry_safe(ref, next_ref, &hdl->ctrl_refs, node) {
 		list_del(&ref->node);
+		if (ref->p_req_dyn_alloc_elems)
+			kvfree(ref->p_req.p);
 		kfree(ref);
 	}
 	/* Free all controls owned by the handler */
@@ -1171,6 +1252,7 @@ void v4l2_ctrl_handler_free(struct v4l2_ctrl_handler *hdl)
 		list_del(&ctrl->node);
 		list_for_each_entry_safe(sev, next_sev, &ctrl->ev_subs, node)
 			list_del(&sev->node);
+		kvfree(ctrl->p_dyn);
 		kvfree(ctrl);
 	}
 	kvfree(hdl->buckets);
@@ -1286,7 +1368,7 @@ int handler_new_ref(struct v4l2_ctrl_handler *hdl,
 	if (hdl->error)
 		return hdl->error;
 
-	if (allocate_req)
+	if (allocate_req && !ctrl->is_dyn_array)
 		size_extra_req = ctrl->elems * ctrl->elem_size;
 	new_ref = kzalloc(sizeof(*new_ref) + size_extra_req, GFP_KERNEL);
 	if (!new_ref)
@@ -1460,7 +1542,6 @@ static struct v4l2_ctrl *v4l2_ctrl_new(struct v4l2_ctrl_handler *hdl,
 			elem_size = sizeof(s32);
 		break;
 	}
-	tot_ctrl_size = elem_size * elems;
 
 	/* Sanity checks */
 	if (id == 0 || name == NULL || !elem_size ||
@@ -1481,17 +1562,33 @@ static struct v4l2_ctrl *v4l2_ctrl_new(struct v4l2_ctrl_handler *hdl,
 		handler_set_err(hdl, -EINVAL);
 		return NULL;
 	}
+	if (flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) {
+		/*
+		 * For now only support this for one-dimensional arrays only.
+		 *
+		 * This can be relaxed in the future, but this will
+		 * require more effort.
+		 */
+		if (nr_of_dims != 1) {
+			handler_set_err(hdl, -EINVAL);
+			return NULL;
+		}
+		/* Start with just 1 element */
+		elems = 1;
+	}
 
+	tot_ctrl_size = elem_size * elems;
 	sz_extra = 0;
 	if (type == V4L2_CTRL_TYPE_BUTTON)
 		flags |= V4L2_CTRL_FLAG_WRITE_ONLY |
 			V4L2_CTRL_FLAG_EXECUTE_ON_WRITE;
 	else if (type == V4L2_CTRL_TYPE_CTRL_CLASS)
 		flags |= V4L2_CTRL_FLAG_READ_ONLY;
-	else if (type == V4L2_CTRL_TYPE_INTEGER64 ||
-		 type == V4L2_CTRL_TYPE_STRING ||
-		 type >= V4L2_CTRL_COMPOUND_TYPES ||
-		 is_array)
+	else if (!(flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) &&
+		 (type == V4L2_CTRL_TYPE_INTEGER64 ||
+		  type == V4L2_CTRL_TYPE_STRING ||
+		  type >= V4L2_CTRL_COMPOUND_TYPES ||
+		  is_array))
 		sz_extra += 2 * tot_ctrl_size;
 
 	if (type >= V4L2_CTRL_COMPOUND_TYPES && p_def.p_const)
@@ -1520,7 +1617,9 @@ static struct v4l2_ctrl *v4l2_ctrl_new(struct v4l2_ctrl_handler *hdl,
 	ctrl->is_ptr = is_array || type >= V4L2_CTRL_COMPOUND_TYPES || ctrl->is_string;
 	ctrl->is_int = !ctrl->is_ptr && type != V4L2_CTRL_TYPE_INTEGER64;
 	ctrl->is_array = is_array;
+	ctrl->is_dyn_array = !!(flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY);
 	ctrl->elems = elems;
+	ctrl->new_elems = elems;
 	ctrl->nr_of_dims = nr_of_dims;
 	if (nr_of_dims)
 		memcpy(ctrl->dims, dims, nr_of_dims * sizeof(dims[0]));
@@ -1533,6 +1632,16 @@ static struct v4l2_ctrl *v4l2_ctrl_new(struct v4l2_ctrl_handler *hdl,
 	ctrl->cur.val = ctrl->val = def;
 	data = &ctrl[1];
 
+	if (ctrl->is_dyn_array) {
+		ctrl->p_dyn_alloc_elems = elems;
+		ctrl->p_dyn = kvzalloc(2 * elems * elem_size, GFP_KERNEL);
+		if (!ctrl->p_dyn) {
+			kvfree(ctrl);
+			return NULL;
+		}
+		data = ctrl->p_dyn;
+	}
+
 	if (!ctrl->is_int) {
 		ctrl->p_new.p = data;
 		ctrl->p_cur.p = data + tot_ctrl_size;
@@ -1542,7 +1651,10 @@ static struct v4l2_ctrl *v4l2_ctrl_new(struct v4l2_ctrl_handler *hdl,
 	}
 
 	if (type >= V4L2_CTRL_COMPOUND_TYPES && p_def.p_const) {
-		ctrl->p_def.p = ctrl->p_cur.p + tot_ctrl_size;
+		if (ctrl->is_dyn_array)
+			ctrl->p_def.p = &ctrl[1];
+		else
+			ctrl->p_def.p = ctrl->p_cur.p + tot_ctrl_size;
 		memcpy(ctrl->p_def.p, p_def.p_const, elem_size);
 	}
 
@@ -1552,6 +1664,7 @@ static struct v4l2_ctrl *v4l2_ctrl_new(struct v4l2_ctrl_handler *hdl,
 	}
 
 	if (handler_new_ref(hdl, ctrl, NULL, false, false)) {
+		kvfree(ctrl->p_dyn);
 		kvfree(ctrl);
 		return NULL;
 	}
@@ -1889,6 +2002,9 @@ static int cluster_changed(struct v4l2_ctrl *master)
 			continue;
 		}
 
+		if (ctrl->elems != ctrl->new_elems)
+			ctrl_changed = true;
+
 		for (idx = 0; !ctrl_changed && idx < ctrl->elems; idx++)
 			ctrl_changed = !ctrl->type_ops->equal(ctrl, idx,
 				ctrl->p_cur, ctrl->p_new);
diff --git a/drivers/media/v4l2-core/v4l2-ctrls-priv.h b/drivers/media/v4l2-core/v4l2-ctrls-priv.h
index d4bf2c716f97..aba6176fab6c 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls-priv.h
+++ b/drivers/media/v4l2-core/v4l2-ctrls-priv.h
@@ -57,10 +57,9 @@ void cur_to_new(struct v4l2_ctrl *ctrl);
 void cur_to_req(struct v4l2_ctrl_ref *ref);
 void new_to_cur(struct v4l2_fh *fh, struct v4l2_ctrl *ctrl, u32 ch_flags);
 void new_to_req(struct v4l2_ctrl_ref *ref);
-void req_to_new(struct v4l2_ctrl_ref *ref);
+int req_to_new(struct v4l2_ctrl_ref *ref);
 void send_initial_event(struct v4l2_fh *fh, struct v4l2_ctrl *ctrl);
 void send_event(struct v4l2_fh *fh, struct v4l2_ctrl *ctrl, u32 changes);
-int validate_new(const struct v4l2_ctrl *ctrl, union v4l2_ctrl_ptr p_new);
 int handler_new_ref(struct v4l2_ctrl_handler *hdl,
 		    struct v4l2_ctrl *ctrl,
 		    struct v4l2_ctrl_ref **ctrl_ref,
diff --git a/drivers/media/v4l2-core/v4l2-ctrls-request.c b/drivers/media/v4l2-core/v4l2-ctrls-request.c
index 7d098f287fd9..c637049d7a2b 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls-request.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls-request.c
@@ -143,7 +143,7 @@ v4l2_ctrl_request_hdl_ctrl_find(struct v4l2_ctrl_handler *hdl, u32 id)
 {
 	struct v4l2_ctrl_ref *ref = find_ref_lock(hdl, id);
 
-	return (ref && ref->valid_p_req) ? ref->ctrl : NULL;
+	return (ref && ref->p_req_valid) ? ref->ctrl : NULL;
 }
 EXPORT_SYMBOL_GPL(v4l2_ctrl_request_hdl_ctrl_find);
 
@@ -373,7 +373,7 @@ void v4l2_ctrl_request_complete(struct media_request *req,
 			v4l2_ctrl_unlock(master);
 			continue;
 		}
-		if (ref->valid_p_req)
+		if (ref->p_req_valid)
 			continue;
 
 		/* Copy the current control value into the request */
@@ -442,7 +442,7 @@ int v4l2_ctrl_request_setup(struct media_request *req,
 				struct v4l2_ctrl_ref *r =
 					find_ref(hdl, master->cluster[i]->id);
 
-				if (r->valid_p_req) {
+				if (r->p_req_valid) {
 					have_new_data = true;
 					break;
 				}
@@ -458,7 +458,11 @@ int v4l2_ctrl_request_setup(struct media_request *req,
 				struct v4l2_ctrl_ref *r =
 					find_ref(hdl, master->cluster[i]->id);
 
-				req_to_new(r);
+				ret = req_to_new(r);
+				if (ret) {
+					v4l2_ctrl_unlock(master);
+					goto error;
+				}
 				master->cluster[i]->is_new = 1;
 				r->req_done = true;
 			}
@@ -490,6 +494,7 @@ int v4l2_ctrl_request_setup(struct media_request *req,
 			break;
 	}
 
+error:
 	media_request_object_put(obj);
 	return ret;
 }
diff --git a/include/media/v4l2-ctrls.h b/include/media/v4l2-ctrls.h
index b3ce438f1329..f4105de8a8d2 100644
--- a/include/media/v4l2-ctrls.h
+++ b/include/media/v4l2-ctrls.h
@@ -185,6 +185,8 @@ typedef void (*v4l2_ctrl_notify_fnc)(struct v4l2_ctrl *ctrl, void *priv);
  *		and/or has type %V4L2_CTRL_TYPE_STRING. In other words, &struct
  *		v4l2_ext_control uses field p to point to the data.
  * @is_array: If set, then this control contains an N-dimensional array.
+ * @is_dyn_array: If set, then this control contains a dynamically sized 1-dimensional array.
+ *		If this is set, then @is_array is also set.
  * @has_volatiles: If set, then one or more members of the cluster are volatile.
  *		Drivers should never touch this flag.
  * @call_notify: If set, then call the handler's notify function whenever the
@@ -205,6 +207,9 @@ typedef void (*v4l2_ctrl_notify_fnc)(struct v4l2_ctrl *ctrl, void *priv);
  * @step:	The control's step value for non-menu controls.
  * @elems:	The number of elements in the N-dimensional array.
  * @elem_size:	The size in bytes of the control.
+ * @new_elems:	The number of elements in p_new. This is the same as @elems,
+ *		except for dynamic arrays. In that case it is in the range of
+ *		1 to @p_dyn_alloc_elems.
  * @dims:	The size of each dimension.
  * @nr_of_dims:The number of dimensions in @dims.
  * @menu_skip_mask: The control's skip mask for menu controls. This makes it
@@ -223,15 +228,21 @@ typedef void (*v4l2_ctrl_notify_fnc)(struct v4l2_ctrl *ctrl, void *priv);
  *		:math:`ceil(\frac{maximum - minimum}{step}) + 1`.
  *		Used only if the @type is %V4L2_CTRL_TYPE_INTEGER_MENU.
  * @flags:	The control's flags.
- * @cur:	Structure to store the current value.
- * @cur.val:	The control's current value, if the @type is represented via
- *		a u32 integer (see &enum v4l2_ctrl_type).
- * @val:	The control's new s32 value.
  * @priv:	The control's private pointer. For use by the driver. It is
  *		untouched by the control framework. Note that this pointer is
  *		not freed when the control is deleted. Should this be needed
  *		then a new internal bitfield can be added to tell the framework
  *		to free this pointer.
+ * @p_dyn:	Pointer to the dynamically allocated array. Only valid if
+ *		@is_dyn_array is true.
+ * @p_dyn_alloc_elems: The number of elements in the dynamically allocated
+ *		array for both the cur and new values. So @p_dyn is actually
+ *		sized for 2 * @p_dyn_alloc_elems * @elem_size. Only valid if
+ *		@is_dyn_array is true.
+ * @cur:	Structure to store the current value.
+ * @cur.val:	The control's current value, if the @type is represented via
+ *		a u32 integer (see &enum v4l2_ctrl_type).
+ * @val:	The control's new s32 value.
  * @p_def:	The control's default value represented via a union which
  *		provides a standard way of accessing control types
  *		through a pointer (for compound controls only).
@@ -260,6 +271,7 @@ struct v4l2_ctrl {
 	unsigned int is_string:1;
 	unsigned int is_ptr:1;
 	unsigned int is_array:1;
+	unsigned int is_dyn_array:1;
 	unsigned int has_volatiles:1;
 	unsigned int call_notify:1;
 	unsigned int manual_mode_value:8;
@@ -272,6 +284,7 @@ struct v4l2_ctrl {
 	s64 minimum, maximum, default_value;
 	u32 elems;
 	u32 elem_size;
+	u32 new_elems;
 	u32 dims[V4L2_CTRL_MAX_DIMS];
 	u32 nr_of_dims;
 	union {
@@ -284,6 +297,8 @@ struct v4l2_ctrl {
 	};
 	unsigned long flags;
 	void *priv;
+	void *p_dyn;
+	u32 p_dyn_alloc_elems;
 	s32 val;
 	struct {
 		s32 val;
@@ -309,12 +324,22 @@ struct v4l2_ctrl {
  *		the control has been applied. This prevents applying controls
  *		from a cluster with multiple controls twice (when the first
  *		control of a cluster is applied, they all are).
- * @valid_p_req: If set, then p_req contains the control value for the request.
+ * @p_req_valid: If set, then p_req contains the control value for the request.
+ * @p_req_dyn_enomem: If set, then p_req is invalid since allocating space for
+ *		a dynamic array failed. Attempting to read this value shall
+ *		result in ENOMEM. Only valid if ctrl->is_dyn_array is true.
+ * @p_req_dyn_alloc_elems: The number of elements allocated for the dynamic
+ *		array. Only valid if @p_req_valid and ctrl->is_dyn_array are
+ *		true.
+ * @p_req_elems: The number of elements in @p_req. This is the same as
+ *		ctrl->elems, except for dynamic arrays. In that case it is in
+ *		the range of 1 to @p_req_dyn_alloc_elems. Only valid if
+ *		@p_req_valid is true.
  * @p_req:	If the control handler containing this control reference
  *		is bound to a media request, then this points to the
  *		value of the control that must be applied when the request
  *		is executed, or to the value of the control at the time
- *		that the request was completed. If @valid_p_req is false,
+ *		that the request was completed. If @p_req_valid is false,
  *		then this control was never set for this request and the
  *		control will not be updated when this request is applied.
  *
@@ -329,7 +354,10 @@ struct v4l2_ctrl_ref {
 	struct v4l2_ctrl_helper *helper;
 	bool from_other_dev;
 	bool req_done;
-	bool valid_p_req;
+	bool p_req_valid;
+	bool p_req_dyn_enomem;
+	u32 p_req_dyn_alloc_elems;
+	u32 p_req_elems;
 	union v4l2_ctrl_ptr p_req;
 };
 
-- 
cgit 


From 9763fe6c5229d80d22e21603aa36bc39269d0edd Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Date: Fri, 8 Jul 2022 17:21:43 +0100
Subject: media: uapi: HEVC: Add missing fields in HEVC controls

Complete the HEVC controls with missing fields from H.265 specifications.
Even if these fields aren't used by the current mainlined drivers
they will be required for (at least) the rkvdec driver.

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Reviewed-by: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
Acked-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Tested-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 .../userspace-api/media/v4l/ext-ctrls-codec.rst    | 32 ++++++++++++++++++++++
 include/media/hevc-ctrls.h                         |  8 +++++-
 2 files changed, 39 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
index 6183f43f4d73..cff742142a55 100644
--- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
+++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
@@ -2683,6 +2683,16 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
     :stub-columns: 0
     :widths:       1 1 2
 
+    * - __u8
+      - ``video_parameter_set_id``
+      - Specifies the value of the vps_video_parameter_set_id of the active VPS
+        as described in section "7.4.3.2.1 General sequence parameter set RBSP semantics"
+        of H.265 specifications.
+    * - __u8
+      - ``seq_parameter_set_id``
+      - Provides an identifier for the SPS for reference by other syntax elements
+        as described in section "7.4.3.2.1 General sequence parameter set RBSP semantics"
+        of H.265 specifications.
     * - __u16
       - ``pic_width_in_luma_samples``
       -
@@ -2822,6 +2832,9 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
     :stub-columns: 0
     :widths:       1 1 2
 
+    * - __u8
+      - ``pic_parameter_set_id``
+      - Identifies the PPS for reference by other syntax elements.
     * - __u8
       - ``num_extra_slice_header_bits``
       -
@@ -3048,6 +3061,15 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
     * - __u8
       - ``ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
       - The list of L1 reference elements as indices in the DPB.
+    * - __u16
+      - ``short_term_ref_pic_set_size``
+      - Specifies the size, in bits, of the short-term reference picture set, described as st_ref_pic_set()
+        in the specification, included in the slice header or SPS (section 7.3.6.1).
+    * - __u16
+      - ``long_term_ref_pic_set_size``
+      - Specifies the size, in bits, of the long-term reference picture set include in the slice header
+        or SPS. It is the number of bits in the conditional block if(long_term_ref_pics_present_flag)
+        in section 7.3.6.1 of the specification.
     * - __u8
       - ``padding``
       - Applications and drivers must set this to zero.
@@ -3385,6 +3407,16 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
       - ``pic_order_cnt_val``
       - PicOrderCntVal as described in section 8.3.1 "Decoding process
         for picture order count" of the specification.
+    * - __u16
+      - ``short_term_ref_pic_set_size``
+      - Specifies the size, in bits, of the short-term reference picture set, of the first slice
+        described as st_ref_pic_set() in the specification, included in the slice header
+        or SPS (section 7.3.6.1).
+    * - __u16
+      - ``long_term_ref_pic_set_size``
+      - Specifies the size, in bits, of the long-term reference picture set, of the first slice
+        included in the slice header or SPS. It is the number of bits in the conditional block
+        if(long_term_ref_pics_present_flag) in section 7.3.6.1 of the specification.
     * - __u8
       - ``num_active_dpb_entries``
       - The number of entries in ``dpb``.
diff --git a/include/media/hevc-ctrls.h b/include/media/hevc-ctrls.h
index 01ccda48d8c5..752a8d10782c 100644
--- a/include/media/hevc-ctrls.h
+++ b/include/media/hevc-ctrls.h
@@ -58,6 +58,8 @@ enum v4l2_mpeg_video_hevc_start_code {
 /* The controls are not stable at the moment and will likely be reworked. */
 struct v4l2_ctrl_hevc_sps {
 	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
+	__u8	video_parameter_set_id;
+	__u8	seq_parameter_set_id;
 	__u16	pic_width_in_luma_samples;
 	__u16	pic_height_in_luma_samples;
 	__u8	bit_depth_luma_minus8;
@@ -108,6 +110,7 @@ struct v4l2_ctrl_hevc_sps {
 
 struct v4l2_ctrl_hevc_pps {
 	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
+	__u8	pic_parameter_set_id;
 	__u8	num_extra_slice_header_bits;
 	__u8	num_ref_idx_l0_default_active_minus1;
 	__u8	num_ref_idx_l1_default_active_minus1;
@@ -199,7 +202,8 @@ struct v4l2_ctrl_hevc_slice_params {
 	__u32	slice_segment_addr;
 	__u8	ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
 	__u8	ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-
+	__u16	short_term_ref_pic_set_size;
+	__u16	long_term_ref_pic_set_size;
 	__u8	padding;
 
 	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
@@ -214,6 +218,8 @@ struct v4l2_ctrl_hevc_slice_params {
 
 struct v4l2_ctrl_hevc_decode_params {
 	__s32	pic_order_cnt_val;
+	__u16	short_term_ref_pic_set_size;
+	__u16	long_term_ref_pic_set_size;
 	__u8	num_active_dpb_entries;
 	struct	v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
 	__u8	num_poc_st_curr_before;
-- 
cgit 


From b92de2f91821ce3874d67751cc062dda007c0366 Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Date: Fri, 8 Jul 2022 17:21:44 +0100
Subject: media: uapi: HEVC: Rename HEVC stateless controls with STATELESS
 prefix

Change HEVC  stateless controls names to V4L2_CID_STATELESS_HEVC instead
of V4L2_CID_MPEG_VIDEO_HEVC be coherent with v4l2 naming convention.

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Reviewed-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Reviewed-by: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
Tested-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 .../userspace-api/media/v4l/ext-ctrls-codec.rst    | 26 +++++++++---------
 drivers/media/v4l2-core/v4l2-ctrls-defs.c          | 32 +++++++++++-----------
 drivers/staging/media/hantro/hantro_drv.c          | 26 +++++++++---------
 drivers/staging/media/hantro/hantro_hevc.c         |  8 +++---
 drivers/staging/media/sunxi/cedrus/cedrus.c        | 24 ++++++++--------
 drivers/staging/media/sunxi/cedrus/cedrus_dec.c    | 10 +++----
 include/media/hevc-ctrls.h                         | 26 +++++++++---------
 7 files changed, 76 insertions(+), 76 deletions(-)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
index cff742142a55..868669ae6831 100644
--- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
+++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
@@ -2661,7 +2661,7 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
 
 .. _v4l2-mpeg-hevc:
 
-``V4L2_CID_MPEG_VIDEO_HEVC_SPS (struct)``
+``V4L2_CID_STATELESS_HEVC_SPS (struct)``
     Specifies the Sequence Parameter Set fields (as extracted from the
     bitstream) for the associated HEVC slice data.
     These bitstream parameters are defined according to :ref:`hevc`.
@@ -2814,7 +2814,7 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
 
     \normalsize
 
-``V4L2_CID_MPEG_VIDEO_HEVC_PPS (struct)``
+``V4L2_CID_STATELESS_HEVC_PPS (struct)``
     Specifies the Picture Parameter Set fields (as extracted from the
     bitstream) for the associated HEVC slice data.
     These bitstream parameters are defined according to :ref:`hevc`.
@@ -2967,7 +2967,7 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
 
     \normalsize
 
-``V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (struct)``
+``V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (struct)``
     Specifies various slice-specific parameters, especially from the NAL unit
     header, general slice segment header and weighted prediction parameter
     parts of the bitstream.
@@ -3132,7 +3132,7 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
 
     \normalsize
 
-``V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (struct)``
+``V4L2_CID_STATELESS_HEVC_SCALING_MATRIX (struct)``
     Specifies the HEVC scaling matrix parameters used for the scaling process
     for transform coefficients.
     These matrix and parameters are defined according to :ref:`hevc`.
@@ -3282,7 +3282,7 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
 
     \normalsize
 
-``V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (enum)``
+``V4L2_CID_STATELESS_HEVC_DECODE_MODE (enum)``
     Specifies the decoding mode to use. Currently exposes slice-based and
     frame-based decoding but new modes might be added later on.
     This control is used as a modifier for V4L2_PIX_FMT_HEVC_SLICE
@@ -3297,7 +3297,7 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
        This menu control is not yet part of the public kernel API and
        it is expected to change.
 
-.. c:type:: v4l2_mpeg_video_hevc_decode_mode
+.. c:type:: v4l2_stateless_hevc_decode_mode
 
 .. raw:: latex
 
@@ -3310,11 +3310,11 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
     :stub-columns: 0
     :widths:       1 1 2
 
-    * - ``V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED``
+    * - ``V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED``
       - 0
       - Decoding is done at the slice granularity.
         The OUTPUT buffer must contain a single slice.
-    * - ``V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED``
+    * - ``V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED``
       - 1
       - Decoding is done at the frame granularity.
         The OUTPUT buffer must contain all slices needed to decode the
@@ -3324,7 +3324,7 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
 
     \normalsize
 
-``V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (enum)``
+``V4L2_CID_STATELESS_HEVC_START_CODE (enum)``
     Specifies the HEVC slice start code expected for each slice.
     This control is used as a modifier for V4L2_PIX_FMT_HEVC_SLICE
     pixel format. Applications that support V4L2_PIX_FMT_HEVC_SLICE
@@ -3338,7 +3338,7 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
        This menu control is not yet part of the public kernel API and
        it is expected to change.
 
-.. c:type:: v4l2_mpeg_video_hevc_start_code
+.. c:type:: v4l2_stateless_hevc_start_code
 
 .. tabularcolumns:: |p{9.2cm}|p{0.6cm}|p{7.5cm}|
 
@@ -3347,13 +3347,13 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
     :stub-columns: 0
     :widths:       1 1 2
 
-    * - ``V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE``
+    * - ``V4L2_STATELESS_HEVC_START_CODE_NONE``
       - 0
       - Selecting this value specifies that HEVC slices are passed
         to the driver without any start code. The bitstream data should be
         according to :ref:`hevc` 7.3.1.1 General NAL unit syntax, hence
         contains emulation prevention bytes when required.
-    * - ``V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B``
+    * - ``V4L2_STATELESS_HEVC_START_CODE_ANNEX_B``
       - 1
       - Selecting this value specifies that HEVC slices are expected
         to be prefixed by Annex B start codes. According to :ref:`hevc`
@@ -3386,7 +3386,7 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
     This provides a bitmask which consists of bits [0, LTR_COUNT-1].
     This is applicable to the H264 and HEVC encoders.
 
-``V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (struct)``
+``V4L2_CID_STATELESS_HEVC_DECODE_PARAMS (struct)``
     Specifies various decode parameters, especially the references picture order
     count (POC) for all the lists (short, long, before, current, after) and the
     number of entries for each of them.
diff --git a/drivers/media/v4l2-core/v4l2-ctrls-defs.c b/drivers/media/v4l2-core/v4l2-ctrls-defs.c
index 16f42d2fd359..9f55503cd3d6 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls-defs.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls-defs.c
@@ -704,9 +704,9 @@ const char * const *v4l2_ctrl_get_menu(u32 id)
 		return hevc_tier;
 	case V4L2_CID_MPEG_VIDEO_HEVC_LOOP_FILTER_MODE:
 		return hevc_loop_filter_mode;
-	case V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE:
+	case V4L2_CID_STATELESS_HEVC_DECODE_MODE:
 		return hevc_decode_mode;
-	case V4L2_CID_MPEG_VIDEO_HEVC_START_CODE:
+	case V4L2_CID_STATELESS_HEVC_START_CODE:
 		return hevc_start_code;
 	case V4L2_CID_CAMERA_ORIENTATION:
 		return camera_orientation;
@@ -1003,13 +1003,6 @@ const char *v4l2_ctrl_get_name(u32 id)
 	case V4L2_CID_MPEG_VIDEO_HEVC_SIZE_OF_LENGTH_FIELD:	return "HEVC Size of Length Field";
 	case V4L2_CID_MPEG_VIDEO_REF_NUMBER_FOR_PFRAMES:	return "Reference Frames for a P-Frame";
 	case V4L2_CID_MPEG_VIDEO_PREPEND_SPSPPS_TO_IDR:		return "Prepend SPS and PPS to IDR";
-	case V4L2_CID_MPEG_VIDEO_HEVC_SPS:			return "HEVC Sequence Parameter Set";
-	case V4L2_CID_MPEG_VIDEO_HEVC_PPS:			return "HEVC Picture Parameter Set";
-	case V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS:		return "HEVC Slice Parameters";
-	case V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX:		return "HEVC Scaling Matrix";
-	case V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS:		return "HEVC Decode Parameters";
-	case V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE:		return "HEVC Decode Mode";
-	case V4L2_CID_MPEG_VIDEO_HEVC_START_CODE:		return "HEVC Start Code";
 
 	/* CAMERA controls */
 	/* Keep the order of the 'case's the same as in v4l2-controls.h! */
@@ -1188,6 +1181,13 @@ const char *v4l2_ctrl_get_name(u32 id)
 	case V4L2_CID_STATELESS_MPEG2_QUANTISATION:		return "MPEG-2 Quantisation Matrices";
 	case V4L2_CID_STATELESS_VP9_COMPRESSED_HDR:	return "VP9 Probabilities Updates";
 	case V4L2_CID_STATELESS_VP9_FRAME:			return "VP9 Frame Decode Parameters";
+	case V4L2_CID_STATELESS_HEVC_SPS:			return "HEVC Sequence Parameter Set";
+	case V4L2_CID_STATELESS_HEVC_PPS:			return "HEVC Picture Parameter Set";
+	case V4L2_CID_STATELESS_HEVC_SLICE_PARAMS:		return "HEVC Slice Parameters";
+	case V4L2_CID_STATELESS_HEVC_SCALING_MATRIX:		return "HEVC Scaling Matrix";
+	case V4L2_CID_STATELESS_HEVC_DECODE_PARAMS:		return "HEVC Decode Parameters";
+	case V4L2_CID_STATELESS_HEVC_DECODE_MODE:		return "HEVC Decode Mode";
+	case V4L2_CID_STATELESS_HEVC_START_CODE:		return "HEVC Start Code";
 
 	/* Colorimetry controls */
 	/* Keep the order of the 'case's the same as in v4l2-controls.h! */
@@ -1363,8 +1363,8 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type,
 	case V4L2_CID_MPEG_VIDEO_HEVC_SIZE_OF_LENGTH_FIELD:
 	case V4L2_CID_MPEG_VIDEO_HEVC_TIER:
 	case V4L2_CID_MPEG_VIDEO_HEVC_LOOP_FILTER_MODE:
-	case V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE:
-	case V4L2_CID_MPEG_VIDEO_HEVC_START_CODE:
+	case V4L2_CID_STATELESS_HEVC_DECODE_MODE:
+	case V4L2_CID_STATELESS_HEVC_START_CODE:
 	case V4L2_CID_STATELESS_H264_DECODE_MODE:
 	case V4L2_CID_STATELESS_H264_START_CODE:
 	case V4L2_CID_CAMERA_ORIENTATION:
@@ -1502,19 +1502,19 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type,
 	case V4L2_CID_STATELESS_VP8_FRAME:
 		*type = V4L2_CTRL_TYPE_VP8_FRAME;
 		break;
-	case V4L2_CID_MPEG_VIDEO_HEVC_SPS:
+	case V4L2_CID_STATELESS_HEVC_SPS:
 		*type = V4L2_CTRL_TYPE_HEVC_SPS;
 		break;
-	case V4L2_CID_MPEG_VIDEO_HEVC_PPS:
+	case V4L2_CID_STATELESS_HEVC_PPS:
 		*type = V4L2_CTRL_TYPE_HEVC_PPS;
 		break;
-	case V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS:
+	case V4L2_CID_STATELESS_HEVC_SLICE_PARAMS:
 		*type = V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS;
 		break;
-	case V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX:
+	case V4L2_CID_STATELESS_HEVC_SCALING_MATRIX:
 		*type = V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX;
 		break;
-	case V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS:
+	case V4L2_CID_STATELESS_HEVC_DECODE_PARAMS:
 		*type = V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS;
 		break;
 	case V4L2_CID_STATELESS_VP9_COMPRESSED_HDR:
diff --git a/drivers/staging/media/hantro/hantro_drv.c b/drivers/staging/media/hantro/hantro_drv.c
index 8bbc2f2b5746..1a62c5610730 100644
--- a/drivers/staging/media/hantro/hantro_drv.c
+++ b/drivers/staging/media/hantro/hantro_drv.c
@@ -270,7 +270,7 @@ static int hantro_try_ctrl(struct v4l2_ctrl *ctrl)
 		if (sps->bit_depth_luma_minus8 != 0)
 			/* Only 8-bit is supported */
 			return -EINVAL;
-	} else if (ctrl->id == V4L2_CID_MPEG_VIDEO_HEVC_SPS) {
+	} else if (ctrl->id == V4L2_CID_STATELESS_HEVC_SPS) {
 		const struct v4l2_ctrl_hevc_sps *sps = ctrl->p_new.p_hevc_sps;
 
 		return hantro_hevc_validate_sps(ctx, sps);
@@ -438,18 +438,18 @@ static const struct hantro_ctrl controls[] = {
 	}, {
 		.codec = HANTRO_HEVC_DECODER,
 		.cfg = {
-			.id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE,
-			.min = V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
-			.max = V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
-			.def = V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
+			.id = V4L2_CID_STATELESS_HEVC_DECODE_MODE,
+			.min = V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED,
+			.max = V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED,
+			.def = V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED,
 		},
 	}, {
 		.codec = HANTRO_HEVC_DECODER,
 		.cfg = {
-			.id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE,
-			.min = V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
-			.max = V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
-			.def = V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
+			.id = V4L2_CID_STATELESS_HEVC_START_CODE,
+			.min = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B,
+			.max = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B,
+			.def = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B,
 		},
 	}, {
 		.codec = HANTRO_HEVC_DECODER,
@@ -469,23 +469,23 @@ static const struct hantro_ctrl controls[] = {
 	}, {
 		.codec = HANTRO_HEVC_DECODER,
 		.cfg = {
-			.id = V4L2_CID_MPEG_VIDEO_HEVC_SPS,
+			.id = V4L2_CID_STATELESS_HEVC_SPS,
 			.ops = &hantro_ctrl_ops,
 		},
 	}, {
 		.codec = HANTRO_HEVC_DECODER,
 		.cfg = {
-			.id = V4L2_CID_MPEG_VIDEO_HEVC_PPS,
+			.id = V4L2_CID_STATELESS_HEVC_PPS,
 		},
 	}, {
 		.codec = HANTRO_HEVC_DECODER,
 		.cfg = {
-			.id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS,
+			.id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS,
 		},
 	}, {
 		.codec = HANTRO_HEVC_DECODER,
 		.cfg = {
-			.id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX,
+			.id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX,
 		},
 	}, {
 		.codec = HANTRO_HEVC_DECODER,
diff --git a/drivers/staging/media/hantro/hantro_hevc.c b/drivers/staging/media/hantro/hantro_hevc.c
index bd924896e409..1df87ca88ebf 100644
--- a/drivers/staging/media/hantro/hantro_hevc.c
+++ b/drivers/staging/media/hantro/hantro_hevc.c
@@ -189,17 +189,17 @@ int hantro_hevc_dec_prepare_run(struct hantro_ctx *ctx)
 	hantro_start_prepare_run(ctx);
 
 	ctrls->decode_params =
-		hantro_get_ctrl(ctx, V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS);
+		hantro_get_ctrl(ctx, V4L2_CID_STATELESS_HEVC_DECODE_PARAMS);
 	if (WARN_ON(!ctrls->decode_params))
 		return -EINVAL;
 
 	ctrls->scaling =
-		hantro_get_ctrl(ctx, V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX);
+		hantro_get_ctrl(ctx, V4L2_CID_STATELESS_HEVC_SCALING_MATRIX);
 	if (WARN_ON(!ctrls->scaling))
 		return -EINVAL;
 
 	ctrls->sps =
-		hantro_get_ctrl(ctx, V4L2_CID_MPEG_VIDEO_HEVC_SPS);
+		hantro_get_ctrl(ctx, V4L2_CID_STATELESS_HEVC_SPS);
 	if (WARN_ON(!ctrls->sps))
 		return -EINVAL;
 
@@ -208,7 +208,7 @@ int hantro_hevc_dec_prepare_run(struct hantro_ctx *ctx)
 		return ret;
 
 	ctrls->pps =
-		hantro_get_ctrl(ctx, V4L2_CID_MPEG_VIDEO_HEVC_PPS);
+		hantro_get_ctrl(ctx, V4L2_CID_STATELESS_HEVC_PPS);
 	if (WARN_ON(!ctrls->pps))
 		return -EINVAL;
 
diff --git a/drivers/staging/media/sunxi/cedrus/cedrus.c b/drivers/staging/media/sunxi/cedrus/cedrus.c
index 68b3dcdb5df3..87be975a72b6 100644
--- a/drivers/staging/media/sunxi/cedrus/cedrus.c
+++ b/drivers/staging/media/sunxi/cedrus/cedrus.c
@@ -42,7 +42,7 @@ static int cedrus_try_ctrl(struct v4l2_ctrl *ctrl)
 		if (sps->bit_depth_luma_minus8 != 0)
 			/* Only 8-bit is supported */
 			return -EINVAL;
-	} else if (ctrl->id == V4L2_CID_MPEG_VIDEO_HEVC_SPS) {
+	} else if (ctrl->id == V4L2_CID_STATELESS_HEVC_SPS) {
 		const struct v4l2_ctrl_hevc_sps *sps = ctrl->p_new.p_hevc_sps;
 		struct cedrus_ctx *ctx = container_of(ctrl->handler, struct cedrus_ctx, hdl);
 
@@ -164,42 +164,42 @@ static const struct cedrus_control cedrus_controls[] = {
 	},
 	{
 		.cfg = {
-			.id	= V4L2_CID_MPEG_VIDEO_HEVC_SPS,
+			.id	= V4L2_CID_STATELESS_HEVC_SPS,
 			.ops	= &cedrus_ctrl_ops,
 		},
 		.codec		= CEDRUS_CODEC_H265,
 	},
 	{
 		.cfg = {
-			.id	= V4L2_CID_MPEG_VIDEO_HEVC_PPS,
+			.id	= V4L2_CID_STATELESS_HEVC_PPS,
 		},
 		.codec		= CEDRUS_CODEC_H265,
 	},
 	{
 		.cfg = {
-			.id	= V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS,
+			.id	= V4L2_CID_STATELESS_HEVC_SLICE_PARAMS,
 		},
 		.codec		= CEDRUS_CODEC_H265,
 	},
 	{
 		.cfg = {
-			.id	= V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX,
+			.id	= V4L2_CID_STATELESS_HEVC_SCALING_MATRIX,
 		},
 		.codec		= CEDRUS_CODEC_H265,
 	},
 	{
 		.cfg = {
-			.id	= V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE,
-			.max	= V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
-			.def	= V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
+			.id	= V4L2_CID_STATELESS_HEVC_DECODE_MODE,
+			.max	= V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED,
+			.def	= V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED,
 		},
 		.codec		= CEDRUS_CODEC_H265,
 	},
 	{
 		.cfg = {
-			.id	= V4L2_CID_MPEG_VIDEO_HEVC_START_CODE,
-			.max	= V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
-			.def	= V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
+			.id	= V4L2_CID_STATELESS_HEVC_START_CODE,
+			.max	= V4L2_STATELESS_HEVC_START_CODE_NONE,
+			.def	= V4L2_STATELESS_HEVC_START_CODE_NONE,
 		},
 		.codec		= CEDRUS_CODEC_H265,
 	},
@@ -211,7 +211,7 @@ static const struct cedrus_control cedrus_controls[] = {
 	},
 	{
 		.cfg = {
-			.id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS,
+			.id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS,
 		},
 		.codec		= CEDRUS_CODEC_H265,
 	},
diff --git a/drivers/staging/media/sunxi/cedrus/cedrus_dec.c b/drivers/staging/media/sunxi/cedrus/cedrus_dec.c
index 9c7200299465..aabe6253078e 100644
--- a/drivers/staging/media/sunxi/cedrus/cedrus_dec.c
+++ b/drivers/staging/media/sunxi/cedrus/cedrus_dec.c
@@ -65,15 +65,15 @@ void cedrus_device_run(void *priv)
 
 	case V4L2_PIX_FMT_HEVC_SLICE:
 		run.h265.sps = cedrus_find_control_data(ctx,
-			V4L2_CID_MPEG_VIDEO_HEVC_SPS);
+			V4L2_CID_STATELESS_HEVC_SPS);
 		run.h265.pps = cedrus_find_control_data(ctx,
-			V4L2_CID_MPEG_VIDEO_HEVC_PPS);
+			V4L2_CID_STATELESS_HEVC_PPS);
 		run.h265.slice_params = cedrus_find_control_data(ctx,
-			V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS);
+			V4L2_CID_STATELESS_HEVC_SLICE_PARAMS);
 		run.h265.decode_params = cedrus_find_control_data(ctx,
-			V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS);
+			V4L2_CID_STATELESS_HEVC_DECODE_PARAMS);
 		run.h265.scaling_matrix = cedrus_find_control_data(ctx,
-			V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX);
+			V4L2_CID_STATELESS_HEVC_SCALING_MATRIX);
 		break;
 
 	case V4L2_PIX_FMT_VP8_FRAME:
diff --git a/include/media/hevc-ctrls.h b/include/media/hevc-ctrls.h
index 752a8d10782c..45734bd8fdfc 100644
--- a/include/media/hevc-ctrls.h
+++ b/include/media/hevc-ctrls.h
@@ -16,13 +16,13 @@
 /* The pixel format isn't stable at the moment and will likely be renamed. */
 #define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
 
-#define V4L2_CID_MPEG_VIDEO_HEVC_SPS		(V4L2_CID_CODEC_BASE + 1008)
-#define V4L2_CID_MPEG_VIDEO_HEVC_PPS		(V4L2_CID_CODEC_BASE + 1009)
-#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS	(V4L2_CID_CODEC_BASE + 1010)
-#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX	(V4L2_CID_CODEC_BASE + 1011)
-#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS	(V4L2_CID_CODEC_BASE + 1012)
-#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE	(V4L2_CID_CODEC_BASE + 1015)
-#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE	(V4L2_CID_CODEC_BASE + 1016)
+#define V4L2_CID_STATELESS_HEVC_SPS		(V4L2_CID_CODEC_BASE + 1008)
+#define V4L2_CID_STATELESS_HEVC_PPS		(V4L2_CID_CODEC_BASE + 1009)
+#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS	(V4L2_CID_CODEC_BASE + 1010)
+#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX	(V4L2_CID_CODEC_BASE + 1011)
+#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS	(V4L2_CID_CODEC_BASE + 1012)
+#define V4L2_CID_STATELESS_HEVC_DECODE_MODE	(V4L2_CID_CODEC_BASE + 1015)
+#define V4L2_CID_STATELESS_HEVC_START_CODE	(V4L2_CID_CODEC_BASE + 1016)
 
 /* enum v4l2_ctrl_type type values */
 #define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
@@ -31,14 +31,14 @@
 #define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
 #define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124
 
-enum v4l2_mpeg_video_hevc_decode_mode {
-	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
-	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
+enum v4l2_stateless_hevc_decode_mode {
+	V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED,
+	V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED,
 };
 
-enum v4l2_mpeg_video_hevc_start_code {
-	V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
-	V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
+enum v4l2_stateless_hevc_start_code {
+	V4L2_STATELESS_HEVC_START_CODE_NONE,
+	V4L2_STATELESS_HEVC_START_CODE_ANNEX_B,
 };
 
 #define V4L2_HEVC_SLICE_TYPE_B	0
-- 
cgit 


From c4a179c7167ee16aad1267f9c99bc1ecff475585 Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Date: Fri, 8 Jul 2022 17:21:45 +0100
Subject: media: uapi: HEVC: Change pic_order_cnt definition in
 v4l2_hevc_dpb_entry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The HEVC specification describes the following:
"PicOrderCntVal is derived as follows:
PicOrderCntVal = PicOrderCntMsb + slice_pic_order_cnt_lsb
The value of PicOrderCntVal shall be in the range of
−2^31 to 2^31 − 1, inclusive."

To match with these definitions change __u16 pic_order_cnt[2]
into __s32 pic_order_cnt_val.
Change v4l2_ctrl_hevc_slice_params->slice_pic_order_cnt to __s32 too.

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Reviewed-by: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
Acked-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Tested-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst | 2 +-
 drivers/staging/media/hantro/hantro_g2_hevc_dec.c         | 7 +++----
 drivers/staging/media/hantro/hantro_hevc.c                | 2 +-
 drivers/staging/media/hantro/hantro_hw.h                  | 4 ++--
 drivers/staging/media/sunxi/cedrus/cedrus_h265.c          | 4 ++--
 include/media/hevc-ctrls.h                                | 4 ++--
 6 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
index 868669ae6831..3dfb81a93935 100644
--- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
+++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
@@ -3010,7 +3010,7 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
     * - __u8
       - ``colour_plane_id``
       -
-    * - __u16
+    * - __s32
       - ``slice_pic_order_cnt``
       -
     * - __u8
diff --git a/drivers/staging/media/hantro/hantro_g2_hevc_dec.c b/drivers/staging/media/hantro/hantro_g2_hevc_dec.c
index 5df6f08e26f5..d28653d04d20 100644
--- a/drivers/staging/media/hantro/hantro_g2_hevc_dec.c
+++ b/drivers/staging/media/hantro/hantro_g2_hevc_dec.c
@@ -390,11 +390,10 @@ static int set_ref(struct hantro_ctx *ctx)
 			 !!(pps->flags & V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED));
 
 	/*
-	 * Write POC count diff from current pic. For frame decoding only compute
-	 * pic_order_cnt[0] and ignore pic_order_cnt[1] used in field-coding.
+	 * Write POC count diff from current pic.
 	 */
 	for (i = 0; i < decode_params->num_active_dpb_entries && i < ARRAY_SIZE(cur_poc); i++) {
-		char poc_diff = decode_params->pic_order_cnt_val - dpb[i].pic_order_cnt[0];
+		char poc_diff = decode_params->pic_order_cnt_val - dpb[i].pic_order_cnt_val;
 
 		hantro_reg_write(vpu, &cur_poc[i], poc_diff);
 	}
@@ -421,7 +420,7 @@ static int set_ref(struct hantro_ctx *ctx)
 	dpb_longterm_e = 0;
 	for (i = 0; i < decode_params->num_active_dpb_entries &&
 	     i < (V4L2_HEVC_DPB_ENTRIES_NUM_MAX - 1); i++) {
-		luma_addr = hantro_hevc_get_ref_buf(ctx, dpb[i].pic_order_cnt[0]);
+		luma_addr = hantro_hevc_get_ref_buf(ctx, dpb[i].pic_order_cnt_val);
 		if (!luma_addr)
 			return -ENOMEM;
 
diff --git a/drivers/staging/media/hantro/hantro_hevc.c b/drivers/staging/media/hantro/hantro_hevc.c
index 1df87ca88ebf..5984c5fa6f83 100644
--- a/drivers/staging/media/hantro/hantro_hevc.c
+++ b/drivers/staging/media/hantro/hantro_hevc.c
@@ -33,7 +33,7 @@ void hantro_hevc_ref_init(struct hantro_ctx *ctx)
 }
 
 dma_addr_t hantro_hevc_get_ref_buf(struct hantro_ctx *ctx,
-				   int poc)
+				   s32 poc)
 {
 	struct hantro_hevc_dec_hw_ctx *hevc_dec = &ctx->hevc_dec;
 	int i;
diff --git a/drivers/staging/media/hantro/hantro_hw.h b/drivers/staging/media/hantro/hantro_hw.h
index 24c1f18b8ba8..762631d15fdc 100644
--- a/drivers/staging/media/hantro/hantro_hw.h
+++ b/drivers/staging/media/hantro/hantro_hw.h
@@ -145,7 +145,7 @@ struct hantro_hevc_dec_hw_ctx {
 	struct hantro_aux_buf tile_bsd;
 	struct hantro_aux_buf ref_bufs[NUM_REF_PICTURES];
 	struct hantro_aux_buf scaling_lists;
-	int ref_bufs_poc[NUM_REF_PICTURES];
+	s32 ref_bufs_poc[NUM_REF_PICTURES];
 	u32 ref_bufs_used;
 	struct hantro_hevc_dec_ctrls ctrls;
 	unsigned int num_tile_cols_allocated;
@@ -358,7 +358,7 @@ void hantro_hevc_dec_exit(struct hantro_ctx *ctx);
 int hantro_g2_hevc_dec_run(struct hantro_ctx *ctx);
 int hantro_hevc_dec_prepare_run(struct hantro_ctx *ctx);
 void hantro_hevc_ref_init(struct hantro_ctx *ctx);
-dma_addr_t hantro_hevc_get_ref_buf(struct hantro_ctx *ctx, int poc);
+dma_addr_t hantro_hevc_get_ref_buf(struct hantro_ctx *ctx, s32 poc);
 int hantro_hevc_add_ref_buf(struct hantro_ctx *ctx, int poc, dma_addr_t addr);
 int hantro_hevc_validate_sps(struct hantro_ctx *ctx, const struct v4l2_ctrl_hevc_sps *sps);
 
diff --git a/drivers/staging/media/sunxi/cedrus/cedrus_h265.c b/drivers/staging/media/sunxi/cedrus/cedrus_h265.c
index 44f385be9f6c..411601975124 100644
--- a/drivers/staging/media/sunxi/cedrus/cedrus_h265.c
+++ b/drivers/staging/media/sunxi/cedrus/cedrus_h265.c
@@ -143,8 +143,8 @@ static void cedrus_h265_frame_info_write_dpb(struct cedrus_ctx *ctx,
 	for (i = 0; i < num_active_dpb_entries; i++) {
 		int buffer_index = vb2_find_timestamp(vq, dpb[i].timestamp, 0);
 		u32 pic_order_cnt[2] = {
-			dpb[i].pic_order_cnt[0],
-			dpb[i].pic_order_cnt[1]
+			dpb[i].pic_order_cnt_val,
+			dpb[i].pic_order_cnt_val
 		};
 
 		cedrus_h265_frame_info_write_single(ctx, i, dpb[i].field_pic,
diff --git a/include/media/hevc-ctrls.h b/include/media/hevc-ctrls.h
index 45734bd8fdfc..01c1795c57a9 100644
--- a/include/media/hevc-ctrls.h
+++ b/include/media/hevc-ctrls.h
@@ -138,7 +138,7 @@ struct v4l2_hevc_dpb_entry {
 	__u64	timestamp;
 	__u8	flags;
 	__u8	field_pic;
-	__u16	pic_order_cnt[2];
+	__s32	pic_order_cnt_val;
 	__u8	padding[2];
 };
 
@@ -181,7 +181,7 @@ struct v4l2_ctrl_hevc_slice_params {
 	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
 	__u8	slice_type;
 	__u8	colour_plane_id;
-	__u16	slice_pic_order_cnt;
+	__s32	slice_pic_order_cnt;
 	__u8	num_ref_idx_l0_active_minus1;
 	__u8	num_ref_idx_l1_active_minus1;
 	__u8	collocated_ref_idx;
-- 
cgit 


From 24aab5985fdec281c013368077323b5f4e5de2c3 Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Date: Fri, 8 Jul 2022 17:21:46 +0100
Subject: media: uapi: HEVC: Add SEI pic struct flags

The possible values for the field_pic field in the v4l2_hevc_dpb_entry
structure are defined in the table D.2 in HEVC specification section D.3.3.
Add flags and documentation for each of them.

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Reviewed-by: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
Acked-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Tested-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 .../userspace-api/media/v4l/ext-ctrls-codec.rst    | 54 ++++++++++++++++++++++
 include/media/hevc-ctrls.h                         | 14 ++++++
 2 files changed, 68 insertions(+)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
index 3dfb81a93935..8ba16e8742f3 100644
--- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
+++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
@@ -3218,6 +3218,7 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
     * - __u8
       - ``field_pic``
       - Whether the reference is a field picture or a frame.
+        See :ref:`HEVC dpb field pic Flags <hevc_dpb_field_pic_flags>`
     * - __u16
       - ``pic_order_cnt[2]``
       - The picture order count of the reference. Only the first element of the
@@ -3231,6 +3232,59 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
 
     \normalsize
 
+.. _hevc_dpb_field_pic_flags:
+
+``HEVC dpb field pic Flags``
+
+.. raw:: latex
+
+    \scriptsize
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_FRAME``
+      - 0
+      - (progressive) Frame
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD``
+      - 1
+      - Top field
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD``
+      - 2
+      - Bottom field
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM``
+      - 3
+      - Top field, bottom field, in that order
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP``
+      - 4
+      - Bottom field, top field, in that order
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP``
+      - 5
+      - Top field, bottom field, top field repeated, in that order
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM``
+      - 6
+      - Bottom field, top field, bottom field repeated, in that order
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING``
+      - 7
+      - Frame doubling
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING``
+      - 8
+      - Frame tripling
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM``
+      - 9
+      - Top field paired with previous bottom field in output order
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP``
+      - 10
+      - Bottom field paired with previous top field in output order
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM``
+      - 11
+      - Top field paired with next bottom field in output order
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP``
+      - 12
+      - Bottom field paired with next top field in output order
+
 .. c:type:: v4l2_hevc_pred_weight_table
 
 .. raw:: latex
diff --git a/include/media/hevc-ctrls.h b/include/media/hevc-ctrls.h
index 01c1795c57a9..f3695ab44389 100644
--- a/include/media/hevc-ctrls.h
+++ b/include/media/hevc-ctrls.h
@@ -132,6 +132,20 @@ struct v4l2_ctrl_hevc_pps {
 
 #define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE	0x01
 
+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME				0
+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD			1
+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD			2
+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM			3
+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP			4
+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP			5
+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM		6
+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING			7
+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING			8
+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM	9
+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP	10
+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM		11
+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP		12
+
 #define V4L2_HEVC_DPB_ENTRIES_NUM_MAX		16
 
 struct v4l2_hevc_dpb_entry {
-- 
cgit 


From 625e9ab479a79366fb770f5762f5ded1a1351891 Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Date: Fri, 8 Jul 2022 17:21:47 +0100
Subject: media: uapi: HEVC: Add documentation to uAPI structure

Add kernel-doc documentation for all the HEVC structures.

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Reviewed-by: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
Acked-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Tested-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 .../userspace-api/media/v4l/ext-ctrls-codec.rst    | 168 +++++++++-------
 include/media/hevc-ctrls.h                         | 221 ++++++++++++++++++++-
 2 files changed, 313 insertions(+), 76 deletions(-)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
index 8ba16e8742f3..e88b5e6944a6 100644
--- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
+++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
@@ -2695,70 +2695,76 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
         of H.265 specifications.
     * - __u16
       - ``pic_width_in_luma_samples``
-      -
+      - Specifies the width of each decoded picture in units of luma samples.
     * - __u16
       - ``pic_height_in_luma_samples``
-      -
+      - Specifies the height of each decoded picture in units of luma samples.
     * - __u8
       - ``bit_depth_luma_minus8``
-      -
+      - This value plus 8 specifies the bit depth of the samples of the luma array.
     * - __u8
       - ``bit_depth_chroma_minus8``
-      -
+      - This value plus 8 specifies the bit depth of the samples of the chroma arrays.
     * - __u8
       - ``log2_max_pic_order_cnt_lsb_minus4``
-      -
+      - This value plus 4 specifies the value of the variable MaxPicOrderCntLsb.
     * - __u8
       - ``sps_max_dec_pic_buffering_minus1``
-      -
+      - This value plus 1 specifies the maximum required size of the decoded picture buffer for
+        the codec video sequence.
     * - __u8
       - ``sps_max_num_reorder_pics``
-      -
+      - Indicates the maximum allowed number of pictures.
     * - __u8
       - ``sps_max_latency_increase_plus1``
-      -
+      - Not equal to 0 is used to compute the value of SpsMaxLatencyPictures array.
     * - __u8
       - ``log2_min_luma_coding_block_size_minus3``
-      -
+      - This value plus 3 specifies the minimum luma coding block size.
     * - __u8
       - ``log2_diff_max_min_luma_coding_block_size``
-      -
+      - Specifies the difference between the maximum and minimum luma coding block size.
     * - __u8
       - ``log2_min_luma_transform_block_size_minus2``
-      -
+      - This value plus 2 specifies the minimum luma transform block size.
     * - __u8
       - ``log2_diff_max_min_luma_transform_block_size``
-      -
+      - Specifies the difference between the maximum and minimum luma transform block size.
     * - __u8
       - ``max_transform_hierarchy_depth_inter``
-      -
+      - Specifies the maximum hierarchy depth for transform units of coding units coded
+        in inter prediction mode.
     * - __u8
       - ``max_transform_hierarchy_depth_intra``
-      -
+      - Specifies the maximum hierarchy depth for transform units of coding units coded in
+        intra prediction mode.
     * - __u8
       - ``pcm_sample_bit_depth_luma_minus1``
-      -
+      - This value plus 1 specifies the number of bits used to represent each of PCM sample
+        values of the luma component.
     * - __u8
       - ``pcm_sample_bit_depth_chroma_minus1``
-      -
+      - This value plus 1 specifies the number of bits used to represent each of PCM sample
+        values of the chroma components.
     * - __u8
       - ``log2_min_pcm_luma_coding_block_size_minus3``
-      -
+      - This value plus 3 specifies the minimum size of coding blocks.
     * - __u8
       - ``log2_diff_max_min_pcm_luma_coding_block_size``
-      -
+      - Specifies the difference between the maximum and minimum size of coding blocks.
     * - __u8
       - ``num_short_term_ref_pic_sets``
-      -
+      - Specifies the number of st_ref_pic_set() syntax structures included in the SPS.
     * - __u8
       - ``num_long_term_ref_pics_sps``
-      -
+      - Specifies the number of candidate long-term reference pictures that are
+        specified in the SPS.
     * - __u8
       - ``chroma_format_idc``
-      -
+      - Specifies the chroma sampling.
     * - __u8
       - ``sps_max_sub_layers_minus1``
-      -
+      - This value plus 1 specifies the maximum number of temporal sub-layers.
     * - __u64
       - ``flags``
       - See :ref:`Sequence Parameter Set Flags <hevc_sps_flags>`
@@ -2837,46 +2843,52 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
       - Identifies the PPS for reference by other syntax elements.
     * - __u8
       - ``num_extra_slice_header_bits``
-      -
+      - Specifies the number of extra slice header bits that are present
+        in the slice header RBSP for coded pictures referring to the PPS.
     * - __u8
       - ``num_ref_idx_l0_default_active_minus1``
-      - Specifies the inferred value of num_ref_idx_l0_active_minus1
+      - This value plus 1 specifies the inferred value of num_ref_idx_l0_active_minus1.
     * - __u8
       - ``num_ref_idx_l1_default_active_minus1``
-      - Specifies the inferred value of num_ref_idx_l1_active_minus1
+      - This value plus 1 specifies the inferred value of num_ref_idx_l1_active_minus1.
     * - __s8
       - ``init_qp_minus26``
-      -
+      - This value plus 26 specifies the initial value of SliceQp Y for each slice
+        referring to the PPS.
     * - __u8
       - ``diff_cu_qp_delta_depth``
-      -
+      - Specifies the difference between the luma coding tree block size
+        and the minimum luma coding block size of coding units that
+        convey cu_qp_delta_abs and cu_qp_delta_sign_flag.
     * - __s8
       - ``pps_cb_qp_offset``
-      -
+      - Specifies the offsets to the luma quantization parameter Cb.
     * - __s8
       - ``pps_cr_qp_offset``
-      -
+      - Specifies the offsets to the luma quantization parameter Cr.
     * - __u8
       - ``num_tile_columns_minus1``
-      -
+      - This value plus 1 specifies the number of tile columns partitioning the picture.
     * - __u8
       - ``num_tile_rows_minus1``
-      -
+      - This value plus 1 specifies the number of tile rows partitioning the picture.
     * - __u8
       - ``column_width_minus1[20]``
-      -
+      - Plus 1 specifies the width of each tile column in units of
+        coding tree blocks.
     * - __u8
       - ``row_height_minus1[22]``
-      -
+      - This value plus 1 specifies the height of each tile row in units of coding
+        tree blocks.
     * - __s8
       - ``pps_beta_offset_div2``
-      -
+      - Specifies the default deblocking parameter offsets for beta divided by 2.
     * - __s8
       - ``pps_tc_offset_div2``
-      -
+      - Specifies the default deblocking parameter offsets for tC divided by 2.
     * - __u8
       - ``log2_parallel_merge_level_minus2``
-      -
+      - Plus 2 specifies the value of the variable Log2ParMrgLevel.
     * - __u8
       - ``padding[4]``
       - Applications and drivers must set this to zero.
@@ -2998,10 +3010,10 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
       - Offset (in bits) to the video data in the current slice data.
     * - __u8
       - ``nal_unit_type``
-      -
+      - Specifies the coding type of the slice (B, P or I).
     * - __u8
       - ``nuh_temporal_id_plus1``
-      -
+      - This value minus 1 specifies a temporal identifier for the NAL unit.
     * - __u8
       - ``slice_type``
       -
@@ -3009,52 +3021,56 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
 	V4L2_HEVC_SLICE_TYPE_B).
     * - __u8
       - ``colour_plane_id``
-      -
+      - Specifies the colour plane associated with the current slice.
     * - __s32
       - ``slice_pic_order_cnt``
-      -
+      - Specifies the picture order count.
     * - __u8
       - ``num_ref_idx_l0_active_minus1``
-      -
+      - This value plus 1 specifies the maximum reference index for
+        reference picture list 0 that may be used to decode the slice.
     * - __u8
       - ``num_ref_idx_l1_active_minus1``
-      -
+      - This value plus 1 specifies the maximum reference index for
+        reference picture list 1 that may be used to decode the slice.
     * - __u8
       - ``collocated_ref_idx``
-      -
+      - Specifies the reference index of the collocated picture used for
+        temporal motion vector prediction.
     * - __u8
       - ``five_minus_max_num_merge_cand``
-      -
+      - Specifies the maximum number of merging motion vector prediction
+        candidates supported in the slice subtracted from 5.
     * - __s8
       - ``slice_qp_delta``
-      -
+      - Specifies the initial value of QpY to be used for the coding blocks in the slice.
     * - __s8
       - ``slice_cb_qp_offset``
-      -
+      - Specifies a difference to be added to the value of pps_cb_qp_offset.
     * - __s8
       - ``slice_cr_qp_offset``
-      -
+      - Specifies a difference to be added to the value of pps_cr_qp_offset.
     * - __s8
       - ``slice_act_y_qp_offset``
-      -
+      - Screen content extension parameters.
     * - __s8
       - ``slice_act_cb_qp_offset``
-      -
+      - Screen content extension parameters.
     * - __s8
       - ``slice_act_cr_qp_offset``
-      -
+      - Screen content extension parameters.
     * - __s8
       - ``slice_beta_offset_div2``
-      -
+      - Specifies the deblocking parameter offsets for beta divided by 2.
     * - __s8
       - ``slice_tc_offset_div2``
-      -
+      - Specifies the deblocking parameter offsets for tC divided by 2.
     * - __u8
       - ``pic_struct``
-      -
+      - Indicates whether a picture should be displayed as a frame or as one or more fields.
     * - __u32
       - ``slice_segment_addr``
-      -
+      - Specifies the address of the first coding tree block in the slice segment.
     * - __u8
       - ``ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
       - The list of L0 reference elements as indices in the DPB.
@@ -3219,11 +3235,9 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
       - ``field_pic``
       - Whether the reference is a field picture or a frame.
         See :ref:`HEVC dpb field pic Flags <hevc_dpb_field_pic_flags>`
-    * - __u16
-      - ``pic_order_cnt[2]``
-      - The picture order count of the reference. Only the first element of the
-        array is used for frame pictures, while the first element identifies the
-        top field and the second the bottom field in field-coded pictures.
+    * - __s32
+      - ``pic_order_cnt_val``
+      - The picture order count of the current picture.
     * - __u8
       - ``padding[2]``
       - Applications and drivers must set this to zero.
@@ -3298,36 +3312,44 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
     :stub-columns: 0
     :widths:       1 1 2
 
-    * - __u8
-      - ``luma_log2_weight_denom``
-      -
-    * - __s8
-      - ``delta_chroma_log2_weight_denom``
-      -
     * - __s8
       - ``delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
-      -
+      - The difference of the weighting factor applied to the luma
+        prediction value for list 0.
     * - __s8
       - ``luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
-      -
+      - The additive offset applied to the luma prediction value for list 0.
     * - __s8
       - ``delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]``
-      -
+      - The difference of the weighting factor applied to the chroma
+        prediction value for list 0.
     * - __s8
       - ``chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]``
-      -
+      - The difference of the additive offset applied to the chroma
+        prediction values for list 0.
     * - __s8
       - ``delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
-      -
+      - The difference of the weighting factor applied to the luma
+        prediction value for list 1.
     * - __s8
       - ``luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
-      -
+      - The additive offset applied to the luma prediction value for list 1.
     * - __s8
       - ``delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]``
-      -
+      - The difference of the weighting factor applied to the chroma
+        prediction value for list 1.
     * - __s8
       - ``chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]``
-      -
+      - The difference of the additive offset applied to the chroma
+        prediction values for list 1.
+    * - __u8
+      - ``luma_log2_weight_denom``
+      - The base 2 logarithm of the denominator for all luma weighting
+        factors.
+    * - __s8
+      - ``delta_chroma_log2_weight_denom``
+      - The difference of the base 2 logarithm of the denominator for
+        all chroma weighting factors.
     * - __u8
       - ``padding[6]``
       - Applications and drivers must set this to zero.
diff --git a/include/media/hevc-ctrls.h b/include/media/hevc-ctrls.h
index f3695ab44389..57053cfa099b 100644
--- a/include/media/hevc-ctrls.h
+++ b/include/media/hevc-ctrls.h
@@ -55,9 +55,68 @@ enum v4l2_stateless_hevc_start_code {
 #define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED		(1ULL << 7)
 #define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED	(1ULL << 8)
 
-/* The controls are not stable at the moment and will likely be reworked. */
+/**
+ * struct v4l2_ctrl_hevc_sps - ITU-T Rec. H.265: Sequence parameter set
+ *
+ * @video_parameter_set_id: specifies the value of the
+ *			    vps_video_parameter_set_id of the active VPS
+ * @seq_parameter_set_id: provides an identifier for the SPS for
+ *			  reference by other syntax elements
+ * @pic_width_in_luma_samples: specifies the width of each decoded picture
+ *			       in units of luma samples
+ * @pic_height_in_luma_samples: specifies the height of each decoded picture
+ *				in units of luma samples
+ * @bit_depth_luma_minus8: this value plus 8 specifies the bit depth of the
+ *                         samples of the luma array
+ * @bit_depth_chroma_minus8: this value plus 8 specifies the bit depth of the
+ *                           samples of the chroma arrays
+ * @log2_max_pic_order_cnt_lsb_minus4: this value plus 4 specifies the value
+ *                                     of the variable MaxPicOrderCntLsb
+ * @sps_max_dec_pic_buffering_minus1: this value plus 1 specifies the maximum
+ *                                    required size of the decoded picture
+ *                                    buffer for the codec video sequence
+ * @sps_max_num_reorder_pics: indicates the maximum allowed number of pictures
+ * @sps_max_latency_increase_plus1: not equal to 0 is used to compute the
+ *				    value of SpsMaxLatencyPictures array
+ * @log2_min_luma_coding_block_size_minus3: this value plus 3 specifies the
+ *                                          minimum luma coding block size
+ * @log2_diff_max_min_luma_coding_block_size: specifies the difference between
+ *					      the maximum and minimum luma
+ *					      coding block size
+ * @log2_min_luma_transform_block_size_minus2: this value plus 2 specifies the
+ *                                             minimum luma transform block size
+ * @log2_diff_max_min_luma_transform_block_size: specifies the difference between
+ *						 the maximum and minimum luma
+ *						 transform block size
+ * @max_transform_hierarchy_depth_inter: specifies the maximum hierarchy
+ *					 depth for transform units of
+ *					 coding units coded in inter
+ *					 prediction mode
+ * @max_transform_hierarchy_depth_intra: specifies the maximum hierarchy
+ *					 depth for transform units of
+ *					 coding units coded in intra
+ *					 prediction mode
+ * @pcm_sample_bit_depth_luma_minus1: this value plus 1 specifies the number of
+ *                                    bits used to represent each of PCM sample
+ *                                    values of the luma component
+ * @pcm_sample_bit_depth_chroma_minus1: this value plus 1 specifies the number
+ *                                      of bits used to represent each of PCM
+ *                                      sample values of the chroma components
+ * @log2_min_pcm_luma_coding_block_size_minus3: this value plus 3 specifies the
+ *                                              minimum size of coding blocks
+ * @log2_diff_max_min_pcm_luma_coding_block_size: specifies the difference between
+ *						  the maximum and minimum size of
+ *						  coding blocks
+ * @num_short_term_ref_pic_sets: specifies the number of st_ref_pic_set()
+ *				 syntax structures included in the SPS
+ * @num_long_term_ref_pics_sps:	specifies the number of candidate long-term
+ *				reference pictures that are specified in the SPS
+ * @chroma_format_idc: specifies the chroma sampling
+ * @sps_max_sub_layers_minus1: this value plus 1 specifies the maximum number
+ *                             of temporal sub-layers
+ * @flags: see V4L2_HEVC_SPS_FLAG_{}
+ */
 struct v4l2_ctrl_hevc_sps {
-	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
 	__u8	video_parameter_set_id;
 	__u8	seq_parameter_set_id;
 	__u16	pic_width_in_luma_samples;
@@ -108,8 +167,43 @@ struct v4l2_ctrl_hevc_sps {
 #define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT	(1ULL << 19)
 #define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING			(1ULL << 20)
 
+/**
+ * struct v4l2_ctrl_hevc_pps - ITU-T Rec. H.265: Picture parameter set
+ *
+ * @pic_parameter_set_id: identifies the PPS for reference by other
+ *			  syntax elements
+ * @num_extra_slice_header_bits: specifies the number of extra slice header
+ *				 bits that are present in the slice header RBSP
+ *				 for coded pictures referring to the PPS.
+ * @num_ref_idx_l0_default_active_minus1: this value plus 1 specifies the inferred
+ *                                        value of num_ref_idx_l0_active_minus1
+ * @num_ref_idx_l1_default_active_minus1: this value plus 1 specifies the inferred
+ *                                        value of num_ref_idx_l1_active_minus1
+ * @init_qp_minus26: this value plus 26 specifies the initial value of SliceQp Y
+ *                   for each slice referring to the PPS
+ * @diff_cu_qp_delta_depth: specifies the difference between the luma coding
+ *			    tree block size and the minimum luma coding block
+ *			    size of coding units that convey cu_qp_delta_abs
+ *			    and cu_qp_delta_sign_flag
+ * @pps_cb_qp_offset: specify the offsets to the luma quantization parameter Cb
+ * @pps_cr_qp_offset: specify the offsets to the luma quantization parameter Cr
+ * @num_tile_columns_minus1: this value plus 1 specifies the number of tile columns
+ *			     partitioning the picture
+ * @num_tile_rows_minus1: this value plus 1 specifies the number of tile rows
+ *                        partitioning the picture
+ * @column_width_minus1: this value plus 1 specifies the width of each tile column
+ *                       in units of coding tree blocks
+ * @row_height_minus1: this value plus 1 specifies the height of each tile row in
+ *		       units of coding tree blocks
+ * @pps_beta_offset_div2: specify the default deblocking parameter offsets for
+ *			  beta divided by 2
+ * @pps_tc_offset_div2: specify the default deblocking parameter offsets for tC
+ *			divided by 2
+ * @log2_parallel_merge_level_minus2: this value plus 2 specifies the value of
+ *                                    the variable Log2ParMrgLevel
+ * @flags: see V4L2_HEVC_PPS_FLAG_{}
+ */
 struct v4l2_ctrl_hevc_pps {
-	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
 	__u8	pic_parameter_set_id;
 	__u8	num_extra_slice_header_bits;
 	__u8	num_ref_idx_l0_default_active_minus1;
@@ -148,6 +242,14 @@ struct v4l2_ctrl_hevc_pps {
 
 #define V4L2_HEVC_DPB_ENTRIES_NUM_MAX		16
 
+/**
+ * struct v4l2_hevc_dpb_entry - HEVC decoded picture buffer entry
+ *
+ * @timestamp: timestamp of the V4L2 capture buffer to use as reference.
+ * @flags: long term flag for the reference frame
+ * @field_pic: whether the reference is a field picture or a frame.
+ * @pic_order_cnt_val: the picture order count of the reference.
+ */
 struct v4l2_hevc_dpb_entry {
 	__u64	timestamp;
 	__u8	flags;
@@ -156,6 +258,31 @@ struct v4l2_hevc_dpb_entry {
 	__u8	padding[2];
 };
 
+/**
+ * struct v4l2_hevc_pred_weight_table - HEVC weighted prediction parameters
+ *
+ * @delta_luma_weight_l0: the difference of the weighting factor applied
+ *			  to the luma prediction value for list 0
+ * @luma_offset_l0: the additive offset applied to the luma prediction value
+ *		    for list 0
+ * @delta_chroma_weight_l0: the difference of the weighting factor applied
+ *			    to the chroma prediction values for list 0
+ * @chroma_offset_l0: the difference of the additive offset applied to
+ *		      the chroma prediction values for list 0
+ * @delta_luma_weight_l1: the difference of the weighting factor applied
+ *			  to the luma prediction value for list 1
+ * @luma_offset_l1: the additive offset applied to the luma prediction value
+ *		    for list 1
+ * @delta_chroma_weight_l1: the difference of the weighting factor applied
+ *			    to the chroma prediction values for list 1
+ * @chroma_offset_l1: the difference of the additive offset applied to
+ *		      the chroma prediction values for list 1
+ * @luma_log2_weight_denom: the base 2 logarithm of the denominator for
+ *			    all luma weighting factors
+ * @delta_chroma_log2_weight_denom: the difference of the base 2 logarithm
+ *				    of the denominator for all chroma
+ *				    weighting factors
+ */
 struct v4l2_hevc_pred_weight_table {
 	__s8	delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
 	__s8	luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
@@ -184,6 +311,50 @@ struct v4l2_hevc_pred_weight_table {
 #define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
 #define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT	(1ULL << 9)
 
+/**
+ * struct v4l2_ctrl_hevc_slice_params - HEVC slice parameters
+ *
+ * @bit_size: size (in bits) of the current slice data
+ * @data_bit_offset: offset (in bits) to the video data in the current slice data
+ * @nal_unit_type: specifies the coding type of the slice (B, P or I)
+ * @nuh_temporal_id_plus1: minus 1 specifies a temporal identifier for the NAL unit
+ * @slice_type: see V4L2_HEVC_SLICE_TYPE_{}
+ * @colour_plane_id: specifies the colour plane associated with the current slice
+ * @slice_pic_order_cnt: specifies the picture order count
+ * @num_ref_idx_l0_active_minus1: this value plus 1 specifies the maximum reference
+ *                                index for reference picture list 0 that may be
+ *                                used to decode the slice
+ * @num_ref_idx_l1_active_minus1: this value plus 1 specifies the maximum reference
+ *                                index for reference picture list 1 that may be
+ *                                used to decode the slice
+ * @collocated_ref_idx: specifies the reference index of the collocated picture used
+ *			for temporal motion vector prediction
+ * @five_minus_max_num_merge_cand: specifies the maximum number of merging
+ *				   motion vector prediction candidates supported in
+ *				   the slice subtracted from 5
+ * @slice_qp_delta: specifies the initial value of QpY to be used for the coding
+ *		    blocks in the slice
+ * @slice_cb_qp_offset: specifies a difference to be added to the value of pps_cb_qp_offset
+ * @slice_cr_qp_offset: specifies a difference to be added to the value of pps_cr_qp_offset
+ * @slice_act_y_qp_offset: screen content extension parameters
+ * @slice_act_cb_qp_offset: screen content extension parameters
+ * @slice_act_cr_qp_offset: screen content extension parameters
+ * @slice_beta_offset_div2: specify the deblocking parameter offsets for beta divided by 2
+ * @slice_tc_offset_div2: specify the deblocking parameter offsets for tC divided by 2
+ * @pic_struct: indicates whether a picture should be displayed as a frame or as one or
+ *		more fields
+ * @slice_segment_addr: specifies the address of the first coding tree block in
+ *			the slice segment
+ * @ref_idx_l0: the list of L0 reference elements as indices in the DPB
+ * @ref_idx_l1: the list of L1 reference elements as indices in the DPB
+ * @short_term_ref_pic_set_size: specifies the size of short-term reference
+ *				 pictures included in the SPS
+ * @long_term_ref_pic_set_size: specifies the size of long-term reference
+ *				picture include in the SPS
+ * @pred_weight_table: the prediction weight coefficients for inter-picture
+ *		       prediction
+ * @flags: see V4L2_HEVC_SLICE_PARAMS_FLAG_{}
+ */
 struct v4l2_ctrl_hevc_slice_params {
 	__u32	bit_size;
 	__u32	data_bit_offset;
@@ -230,6 +401,28 @@ struct v4l2_ctrl_hevc_slice_params {
 #define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC		0x2
 #define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR  0x4
 
+/**
+ * struct v4l2_ctrl_hevc_decode_params - HEVC decode parameters
+ *
+ * @pic_order_cnt_val: picture order count
+ * @short_term_ref_pic_set_size: specifies the size of short-term reference
+ *				 pictures set included in the SPS of the first slice
+ * @long_term_ref_pic_set_size: specifies the size of long-term reference
+ *				pictures set include in the SPS of the first slice
+ * @num_active_dpb_entries: the number of entries in dpb
+ * @dpb: the decoded picture buffer, for meta-data about reference frames
+ * @num_poc_st_curr_before: the number of reference pictures in the short-term
+ *			    set that come before the current frame
+ * @num_poc_st_curr_after: the number of reference pictures in the short-term
+ *			   set that come after the current frame
+ * @num_poc_lt_curr: the number of reference pictures in the long-term set
+ * @poc_st_curr_before: provides the index of the short term before references
+ *			in DPB array
+ * @poc_st_curr_after: provides the index of the short term after references
+ *		       in DPB array
+ * @poc_lt_curr: provides the index of the long term references in DPB array
+ * @flags: see V4L2_HEVC_DECODE_PARAM_FLAG_{}
+ */
 struct v4l2_ctrl_hevc_decode_params {
 	__s32	pic_order_cnt_val;
 	__u16	short_term_ref_pic_set_size;
@@ -245,6 +438,28 @@ struct v4l2_ctrl_hevc_decode_params {
 	__u64	flags;
 };
 
+/**
+ * struct v4l2_ctrl_hevc_scaling_matrix - HEVC scaling lists parameters
+ *
+ * @scaling_list_4x4: scaling list is used for the scaling process for
+ *		      transform coefficients. The values on each scaling
+ *		      list are expected in raster scan order
+ * @scaling_list_8x8: scaling list is used for the scaling process for
+ *		      transform coefficients. The values on each scaling
+ *		      list are expected in raster scan order
+ * @scaling_list_16x16: scaling list is used for the scaling process for
+ *			transform coefficients. The values on each scaling
+ *			list are expected in raster scan order
+ * @scaling_list_32x32:	scaling list is used for the scaling process for
+ *			transform coefficients. The values on each scaling
+ *			list are expected in raster scan order
+ * @scaling_list_dc_coef_16x16: scaling list is used for the scaling process
+ *				for transform coefficients. The values on each
+ *				scaling list are expected in raster scan order.
+ * @scaling_list_dc_coef_32x32:	scaling list is used for the scaling process
+ *				for transform coefficients. The values on each
+ *				scaling list are expected in raster scan order.
+ */
 struct v4l2_ctrl_hevc_scaling_matrix {
 	__u8	scaling_list_4x4[6][16];
 	__u8	scaling_list_8x8[6][64];
-- 
cgit 


From 869ba3c8bba7aa1f0d33bf1979c8b81c2bfe2c93 Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Date: Fri, 8 Jul 2022 17:21:48 +0100
Subject: media: uapi: HEVC: Define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS as a
 dynamic array

Make explicit that V4L2_CID_STATELESS_HEVC_SLICE_PARAMS control is
a dynamic array control type.
Some drivers may be able to receive multiple slices in one control
to improve decoding performance.

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Reviewed-by: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst | 2 ++
 drivers/media/v4l2-core/v4l2-ctrls-defs.c                 | 1 +
 drivers/staging/media/sunxi/cedrus/cedrus.c               | 2 ++
 include/media/hevc-ctrls.h                                | 3 +++
 4 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
index e88b5e6944a6..c2e0adece613 100644
--- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
+++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
@@ -2986,6 +2986,8 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
     These bitstream parameters are defined according to :ref:`hevc`.
     They are described in section 7.4.7 "General slice segment header
     semantics" of the specification.
+    This control is a dynamically sized 1-dimensional array,
+    V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it.
 
 .. c:type:: v4l2_ctrl_hevc_slice_params
 
diff --git a/drivers/media/v4l2-core/v4l2-ctrls-defs.c b/drivers/media/v4l2-core/v4l2-ctrls-defs.c
index 9f55503cd3d6..d594efbcbb93 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls-defs.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls-defs.c
@@ -1510,6 +1510,7 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type,
 		break;
 	case V4L2_CID_STATELESS_HEVC_SLICE_PARAMS:
 		*type = V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS;
+		*flags |= V4L2_CTRL_FLAG_DYNAMIC_ARRAY;
 		break;
 	case V4L2_CID_STATELESS_HEVC_SCALING_MATRIX:
 		*type = V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX;
diff --git a/drivers/staging/media/sunxi/cedrus/cedrus.c b/drivers/staging/media/sunxi/cedrus/cedrus.c
index 87be975a72b6..b12219123a6b 100644
--- a/drivers/staging/media/sunxi/cedrus/cedrus.c
+++ b/drivers/staging/media/sunxi/cedrus/cedrus.c
@@ -178,6 +178,8 @@ static const struct cedrus_control cedrus_controls[] = {
 	{
 		.cfg = {
 			.id	= V4L2_CID_STATELESS_HEVC_SLICE_PARAMS,
+			/* The driver can only handle 1 entry per slice for now */
+			.dims   = { 1 },
 		},
 		.codec		= CEDRUS_CODEC_H265,
 	},
diff --git a/include/media/hevc-ctrls.h b/include/media/hevc-ctrls.h
index 57053cfa099b..341fc795d550 100644
--- a/include/media/hevc-ctrls.h
+++ b/include/media/hevc-ctrls.h
@@ -314,6 +314,9 @@ struct v4l2_hevc_pred_weight_table {
 /**
  * struct v4l2_ctrl_hevc_slice_params - HEVC slice parameters
  *
+ * This control is a dynamically sized 1-dimensional array,
+ * V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it.
+ *
  * @bit_size: size (in bits) of the current slice data
  * @data_bit_offset: offset (in bits) to the video data in the current slice data
  * @nal_unit_type: specifies the coding type of the slice (B, P or I)
-- 
cgit 


From a8755e9bdd6a416331e286cc25cddbd93b2c064a Mon Sep 17 00:00:00 2001
From: Dinh Nguyen <dinguyen@kernel.org>
Date: Fri, 15 Jul 2022 10:03:49 -0500
Subject: firmware: stratix10-svc: fix kernel-doc warning

include/linux/firmware/intel/stratix10-svc-client.h:55: warning: This comment
starts with '/**', but isn't a kernel-doc comment. Refer
Documentation/doc-guide/kernel-doc.rst
 * Flag bit for COMMAND_RECONFIG

Fixes: 4a4709d470e6 ("firmware: stratix10-svc: add new FCS commands")
Signed-off-by: Dinh Nguyen <dinguyen@kernel.org>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Link: https://lore.kernel.org/r/20220715150349.2413994-1-dinguyen@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/firmware/intel/stratix10-svc-client.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h
index a776a787ac75..0c16037fd08d 100644
--- a/include/linux/firmware/intel/stratix10-svc-client.h
+++ b/include/linux/firmware/intel/stratix10-svc-client.h
@@ -51,7 +51,8 @@
 #define SVC_STATUS_ERROR		5
 #define SVC_STATUS_NO_SUPPORT		6
 #define SVC_STATUS_INVALID_PARAM	7
-/**
+
+/*
  * Flag bit for COMMAND_RECONFIG
  *
  * COMMAND_RECONFIG_FLAG_PARTIAL:
-- 
cgit 


From 7ee951acd31a88f941fd6535fbdee3a1567f1d63 Mon Sep 17 00:00:00 2001
From: Phil Auld <pauld@redhat.com>
Date: Fri, 15 Jul 2022 09:49:24 -0400
Subject: drivers/base: fix userspace break from using bin_attributes for
 cpumap and cpulist

Using bin_attributes with a 0 size causes fstat and friends to return that
0 size. This breaks userspace code that retrieves the size before reading
the file. Rather than reverting 75bd50fa841 ("drivers/base/node.c: use
bin_attribute to break the size limitation of cpumap ABI") let's put in a
size value at compile time.

For cpulist the maximum size is on the order of
	NR_CPUS * (ceil(log10(NR_CPUS)) + 1)/2

which for 8192 is 20480 (8192 * 5)/2. In order to get near that you'd need
a system with every other CPU on one node. For example: (0,2,4,8, ... ).
To simplify the math and support larger NR_CPUS in the future we are using
(NR_CPUS * 7)/2. We also set it to a min of PAGE_SIZE to retain the older
behavior for smaller NR_CPUS.

The cpumap file the size works out to be NR_CPUS/4 + NR_CPUS/32 - 1
(or NR_CPUS * 9/32 - 1) including the ","s.

Add a set of macros for these values to cpumask.h so they can be used in
multiple places. Apply these to the handful of such files in
drivers/base/topology.c as well as node.c.

As an example, on an 80 cpu 4-node system (NR_CPUS == 8192):

before:

-r--r--r--. 1 root root 0 Jul 12 14:08 system/node/node0/cpulist
-r--r--r--. 1 root root 0 Jul 11 17:25 system/node/node0/cpumap

after:

-r--r--r--. 1 root root 28672 Jul 13 11:32 system/node/node0/cpulist
-r--r--r--. 1 root root  4096 Jul 13 11:31 system/node/node0/cpumap

CONFIG_NR_CPUS = 16384
-r--r--r--. 1 root root 57344 Jul 13 14:03 system/node/node0/cpulist
-r--r--r--. 1 root root  4607 Jul 13 14:02 system/node/node0/cpumap

The actual number of cpus doesn't matter for the reported size since they
are based on NR_CPUS.

Fixes: 75bd50fa841d ("drivers/base/node.c: use bin_attribute to break the size limitation of cpumap ABI")
Fixes: bb9ec13d156e ("topology: use bin_attribute to break the size limitation of cpumap ABI")
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Yury Norov <yury.norov@gmail.com>
Cc: stable@vger.kernel.org
Acked-by: Yury Norov <yury.norov@gmail.com> (for include/linux/cpumask.h)
Signed-off-by: Phil Auld <pauld@redhat.com>
Link: https://lore.kernel.org/r/20220715134924.3466194-1-pauld@redhat.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/node.c     |  4 ++--
 drivers/base/topology.c | 32 ++++++++++++++++----------------
 include/linux/cpumask.h | 18 ++++++++++++++++++
 3 files changed, 36 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 0ac6376ef7a1..eb0f43784c2b 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -45,7 +45,7 @@ static inline ssize_t cpumap_read(struct file *file, struct kobject *kobj,
 	return n;
 }
 
-static BIN_ATTR_RO(cpumap, 0);
+static BIN_ATTR_RO(cpumap, CPUMAP_FILE_MAX_BYTES);
 
 static inline ssize_t cpulist_read(struct file *file, struct kobject *kobj,
 				   struct bin_attribute *attr, char *buf,
@@ -66,7 +66,7 @@ static inline ssize_t cpulist_read(struct file *file, struct kobject *kobj,
 	return n;
 }
 
-static BIN_ATTR_RO(cpulist, 0);
+static BIN_ATTR_RO(cpulist, CPULIST_FILE_MAX_BYTES);
 
 /**
  * struct node_access_nodes - Access class device to hold user visible
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index ac6ad9ab67f9..89f98be5c5b9 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -62,47 +62,47 @@ define_id_show_func(ppin, "0x%llx");
 static DEVICE_ATTR_ADMIN_RO(ppin);
 
 define_siblings_read_func(thread_siblings, sibling_cpumask);
-static BIN_ATTR_RO(thread_siblings, 0);
-static BIN_ATTR_RO(thread_siblings_list, 0);
+static BIN_ATTR_RO(thread_siblings, CPUMAP_FILE_MAX_BYTES);
+static BIN_ATTR_RO(thread_siblings_list, CPULIST_FILE_MAX_BYTES);
 
 define_siblings_read_func(core_cpus, sibling_cpumask);
-static BIN_ATTR_RO(core_cpus, 0);
-static BIN_ATTR_RO(core_cpus_list, 0);
+static BIN_ATTR_RO(core_cpus, CPUMAP_FILE_MAX_BYTES);
+static BIN_ATTR_RO(core_cpus_list, CPULIST_FILE_MAX_BYTES);
 
 define_siblings_read_func(core_siblings, core_cpumask);
-static BIN_ATTR_RO(core_siblings, 0);
-static BIN_ATTR_RO(core_siblings_list, 0);
+static BIN_ATTR_RO(core_siblings, CPUMAP_FILE_MAX_BYTES);
+static BIN_ATTR_RO(core_siblings_list, CPULIST_FILE_MAX_BYTES);
 
 #ifdef TOPOLOGY_CLUSTER_SYSFS
 define_siblings_read_func(cluster_cpus, cluster_cpumask);
-static BIN_ATTR_RO(cluster_cpus, 0);
-static BIN_ATTR_RO(cluster_cpus_list, 0);
+static BIN_ATTR_RO(cluster_cpus, CPUMAP_FILE_MAX_BYTES);
+static BIN_ATTR_RO(cluster_cpus_list, CPULIST_FILE_MAX_BYTES);
 #endif
 
 #ifdef TOPOLOGY_DIE_SYSFS
 define_siblings_read_func(die_cpus, die_cpumask);
-static BIN_ATTR_RO(die_cpus, 0);
-static BIN_ATTR_RO(die_cpus_list, 0);
+static BIN_ATTR_RO(die_cpus, CPUMAP_FILE_MAX_BYTES);
+static BIN_ATTR_RO(die_cpus_list, CPULIST_FILE_MAX_BYTES);
 #endif
 
 define_siblings_read_func(package_cpus, core_cpumask);
-static BIN_ATTR_RO(package_cpus, 0);
-static BIN_ATTR_RO(package_cpus_list, 0);
+static BIN_ATTR_RO(package_cpus, CPUMAP_FILE_MAX_BYTES);
+static BIN_ATTR_RO(package_cpus_list, CPULIST_FILE_MAX_BYTES);
 
 #ifdef TOPOLOGY_BOOK_SYSFS
 define_id_show_func(book_id, "%d");
 static DEVICE_ATTR_RO(book_id);
 define_siblings_read_func(book_siblings, book_cpumask);
-static BIN_ATTR_RO(book_siblings, 0);
-static BIN_ATTR_RO(book_siblings_list, 0);
+static BIN_ATTR_RO(book_siblings, CPUMAP_FILE_MAX_BYTES);
+static BIN_ATTR_RO(book_siblings_list, CPULIST_FILE_MAX_BYTES);
 #endif
 
 #ifdef TOPOLOGY_DRAWER_SYSFS
 define_id_show_func(drawer_id, "%d");
 static DEVICE_ATTR_RO(drawer_id);
 define_siblings_read_func(drawer_siblings, drawer_cpumask);
-static BIN_ATTR_RO(drawer_siblings, 0);
-static BIN_ATTR_RO(drawer_siblings_list, 0);
+static BIN_ATTR_RO(drawer_siblings, CPUMAP_FILE_MAX_BYTES);
+static BIN_ATTR_RO(drawer_siblings_list, CPULIST_FILE_MAX_BYTES);
 #endif
 
 static struct bin_attribute *bin_attrs[] = {
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index fe29ac7cc469..4592d0845941 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -1071,4 +1071,22 @@ cpumap_print_list_to_buf(char *buf, const struct cpumask *mask,
 	[0] =  1UL							\
 } }
 
+/*
+ * Provide a valid theoretical max size for cpumap and cpulist sysfs files
+ * to avoid breaking userspace which may allocate a buffer based on the size
+ * reported by e.g. fstat.
+ *
+ * for cpumap NR_CPUS * 9/32 - 1 should be an exact length.
+ *
+ * For cpulist 7 is (ceil(log10(NR_CPUS)) + 1) allowing for NR_CPUS to be up
+ * to 2 orders of magnitude larger than 8192. And then we divide by 2 to
+ * cover a worst-case of every other cpu being on one of two nodes for a
+ * very large NR_CPUS.
+ *
+ *  Use PAGE_SIZE as a minimum for smaller configurations.
+ */
+#define CPUMAP_FILE_MAX_BYTES  ((((NR_CPUS * 9)/32 - 1) > PAGE_SIZE) \
+					? (NR_CPUS * 9)/32 - 1 : PAGE_SIZE)
+#define CPULIST_FILE_MAX_BYTES  (((NR_CPUS * 7)/2 > PAGE_SIZE) ? (NR_CPUS * 7)/2 : PAGE_SIZE)
+
 #endif /* __LINUX_CPUMASK_H */
-- 
cgit 


From 65d9a9a60fd71be964effb2e94747a6acb6e7015 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Fri, 1 Jul 2022 13:04:04 +0530
Subject: kexec_file: drop weak attribute from functions

As requested
(http://lkml.kernel.org/r/87ee0q7b92.fsf@email.froward.int.ebiederm.org),
this series converts weak functions in kexec to use the #ifdef approach.

Quoting the 3e35142ef99fe ("kexec_file: drop weak attribute from
arch_kexec_apply_relocations[_add]") changelog:

: Since commit d1bcae833b32f1 ("ELF: Don't generate unused section symbols")
: [1], binutils (v2.36+) started dropping section symbols that it thought
: were unused.  This isn't an issue in general, but with kexec_file.c, gcc
: is placing kexec_arch_apply_relocations[_add] into a separate
: .text.unlikely section and the section symbol ".text.unlikely" is being
: dropped.  Due to this, recordmcount is unable to find a non-weak symbol in
: .text.unlikely to generate a relocation record against.

This patch (of 2);

Drop __weak attribute from functions in kexec_file.c:
- arch_kexec_kernel_image_probe()
- arch_kimage_file_post_load_cleanup()
- arch_kexec_kernel_image_load()
- arch_kexec_locate_mem_hole()
- arch_kexec_kernel_verify_sig()

arch_kexec_kernel_image_load() calls into kexec_image_load_default(), so
drop the static attribute for the latter.

arch_kexec_kernel_verify_sig() is not overridden by any architecture, so
drop the __weak attribute.

Link: https://lkml.kernel.org/r/cover.1656659357.git.naveen.n.rao@linux.vnet.ibm.com
Link: https://lkml.kernel.org/r/2cd7ca1fe4d6bb6ca38e3283c717878388ed6788.1656659357.git.naveen.n.rao@linux.vnet.ibm.com
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Suggested-by: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 arch/arm64/include/asm/kexec.h   |  4 +++-
 arch/powerpc/include/asm/kexec.h |  9 ++++++++
 arch/s390/include/asm/kexec.h    |  3 +++
 arch/x86/include/asm/kexec.h     |  6 ++++++
 include/linux/kexec.h            | 44 ++++++++++++++++++++++++++++++++++------
 kernel/kexec_file.c              | 35 ++------------------------------
 6 files changed, 61 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h
index 9839bfc163d7..78d272b26ebd 100644
--- a/arch/arm64/include/asm/kexec.h
+++ b/arch/arm64/include/asm/kexec.h
@@ -115,7 +115,9 @@ extern const struct kexec_file_ops kexec_image_ops;
 
 struct kimage;
 
-extern int arch_kimage_file_post_load_cleanup(struct kimage *image);
+int arch_kimage_file_post_load_cleanup(struct kimage *image);
+#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup
+
 extern int load_other_segments(struct kimage *image,
 		unsigned long kernel_load_addr, unsigned long kernel_size,
 		char *initrd, unsigned long initrd_len,
diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index 2aefe14e1442..1e5e9b6ec78d 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -120,6 +120,15 @@ int setup_purgatory(struct kimage *image, const void *slave_code,
 #ifdef CONFIG_PPC64
 struct kexec_buf;
 
+int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len);
+#define arch_kexec_kernel_image_probe arch_kexec_kernel_image_probe
+
+int arch_kimage_file_post_load_cleanup(struct kimage *image);
+#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup
+
+int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf);
+#define arch_kexec_locate_mem_hole arch_kexec_locate_mem_hole
+
 int load_crashdump_segments_ppc64(struct kimage *image,
 				  struct kexec_buf *kbuf);
 int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h
index 649ecdcc8734..8886aadc11a3 100644
--- a/arch/s390/include/asm/kexec.h
+++ b/arch/s390/include/asm/kexec.h
@@ -92,5 +92,8 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
 				     const Elf_Shdr *relsec,
 				     const Elf_Shdr *symtab);
 #define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add
+
+int arch_kimage_file_post_load_cleanup(struct kimage *image);
+#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup
 #endif
 #endif /*_S390_KEXEC_H */
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 6ad8d946cd3e..5ec359c1b50c 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -193,6 +193,12 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
 				     const Elf_Shdr *relsec,
 				     const Elf_Shdr *symtab);
 #define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add
+
+void *arch_kexec_kernel_image_load(struct kimage *image);
+#define arch_kexec_kernel_image_load arch_kexec_kernel_image_load
+
+int arch_kimage_file_post_load_cleanup(struct kimage *image);
+#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup
 #endif
 #endif
 
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 475683cd67f1..6958c6b471f4 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -188,21 +188,53 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
 				   void *buf, unsigned int size,
 				   bool get_value);
 void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name);
+void *kexec_image_load_default(struct kimage *image);
+
+#ifndef arch_kexec_kernel_image_probe
+static inline int
+arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len)
+{
+	return kexec_image_probe_default(image, buf, buf_len);
+}
+#endif
+
+#ifndef arch_kimage_file_post_load_cleanup
+static inline int arch_kimage_file_post_load_cleanup(struct kimage *image)
+{
+	return kexec_image_post_load_cleanup_default(image);
+}
+#endif
+
+#ifndef arch_kexec_kernel_image_load
+static inline void *arch_kexec_kernel_image_load(struct kimage *image)
+{
+	return kexec_image_load_default(image);
+}
+#endif
 
-/* Architectures may override the below functions */
-int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
-				  unsigned long buf_len);
-void *arch_kexec_kernel_image_load(struct kimage *image);
-int arch_kimage_file_post_load_cleanup(struct kimage *image);
 #ifdef CONFIG_KEXEC_SIG
 int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
 				 unsigned long buf_len);
 #endif
-int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf);
 
 extern int kexec_add_buffer(struct kexec_buf *kbuf);
 int kexec_locate_mem_hole(struct kexec_buf *kbuf);
 
+#ifndef arch_kexec_locate_mem_hole
+/**
+ * arch_kexec_locate_mem_hole - Find free memory to place the segments.
+ * @kbuf:                       Parameters for the memory search.
+ *
+ * On success, kbuf->mem will have the start address of the memory region found.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+static inline int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf)
+{
+	return kexec_locate_mem_hole(kbuf);
+}
+#endif
+
 /* Alignment required for elf header segment */
 #define ELF_CORE_HEADER_ALIGN   4096
 
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index f9261c07b048..0c27c81351ee 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -62,14 +62,7 @@ int kexec_image_probe_default(struct kimage *image, void *buf,
 	return ret;
 }
 
-/* Architectures can provide this probe function */
-int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
-					 unsigned long buf_len)
-{
-	return kexec_image_probe_default(image, buf, buf_len);
-}
-
-static void *kexec_image_load_default(struct kimage *image)
+void *kexec_image_load_default(struct kimage *image)
 {
 	if (!image->fops || !image->fops->load)
 		return ERR_PTR(-ENOEXEC);
@@ -80,11 +73,6 @@ static void *kexec_image_load_default(struct kimage *image)
 				 image->cmdline_buf_len);
 }
 
-void * __weak arch_kexec_kernel_image_load(struct kimage *image)
-{
-	return kexec_image_load_default(image);
-}
-
 int kexec_image_post_load_cleanup_default(struct kimage *image)
 {
 	if (!image->fops || !image->fops->cleanup)
@@ -93,11 +81,6 @@ int kexec_image_post_load_cleanup_default(struct kimage *image)
 	return image->fops->cleanup(image->image_loader_data);
 }
 
-int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
-{
-	return kexec_image_post_load_cleanup_default(image);
-}
-
 #ifdef CONFIG_KEXEC_SIG
 static int kexec_image_verify_sig_default(struct kimage *image, void *buf,
 					  unsigned long buf_len)
@@ -110,8 +93,7 @@ static int kexec_image_verify_sig_default(struct kimage *image, void *buf,
 	return image->fops->verify_sig(buf, buf_len);
 }
 
-int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
-					unsigned long buf_len)
+int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, unsigned long buf_len)
 {
 	return kexec_image_verify_sig_default(image, buf, buf_len);
 }
@@ -621,19 +603,6 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
 	return ret == 1 ? 0 : -EADDRNOTAVAIL;
 }
 
-/**
- * arch_kexec_locate_mem_hole - Find free memory to place the segments.
- * @kbuf:                       Parameters for the memory search.
- *
- * On success, kbuf->mem will have the start address of the memory region found.
- *
- * Return: 0 on success, negative errno on error.
- */
-int __weak arch_kexec_locate_mem_hole(struct kexec_buf *kbuf)
-{
-	return kexec_locate_mem_hole(kbuf);
-}
-
 /**
  * kexec_add_buffer - place a buffer in a kexec segment
  * @kbuf:	Buffer contents and memory parameters.
-- 
cgit 


From 0738eceb6201691534df07e0928d0a6168a35787 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Fri, 1 Jul 2022 13:04:05 +0530
Subject: kexec: drop weak attribute from functions

Drop __weak attribute from functions in kexec_core.c:
- machine_kexec_post_load()
- arch_kexec_protect_crashkres()
- arch_kexec_unprotect_crashkres()
- crash_free_reserved_phys_range()

Link: https://lkml.kernel.org/r/c0f6219e03cb399d166d518ab505095218a902dd.1656659357.git.naveen.n.rao@linux.vnet.ibm.com
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Suggested-by: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 arch/arm64/include/asm/kexec.h   | 16 ++++++++++++++--
 arch/powerpc/include/asm/kexec.h |  5 +++++
 arch/s390/include/asm/kexec.h    | 11 +++++++++++
 arch/x86/include/asm/kexec.h     |  6 ++++++
 include/linux/kexec.h            | 32 ++++++++++++++++++++++++++++----
 kernel/kexec_core.c              | 27 ---------------------------
 6 files changed, 64 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h
index 78d272b26ebd..559bfae26715 100644
--- a/arch/arm64/include/asm/kexec.h
+++ b/arch/arm64/include/asm/kexec.h
@@ -84,16 +84,30 @@ static inline void crash_setup_regs(struct pt_regs *newregs,
 extern bool crash_is_nosave(unsigned long pfn);
 extern void crash_prepare_suspend(void);
 extern void crash_post_resume(void);
+
+void crash_free_reserved_phys_range(unsigned long begin, unsigned long end);
+#define crash_free_reserved_phys_range crash_free_reserved_phys_range
 #else
 static inline bool crash_is_nosave(unsigned long pfn) {return false; }
 static inline void crash_prepare_suspend(void) {}
 static inline void crash_post_resume(void) {}
 #endif
 
+struct kimage;
+
 #if defined(CONFIG_KEXEC_CORE)
 void cpu_soft_restart(unsigned long el2_switch, unsigned long entry,
 		      unsigned long arg0, unsigned long arg1,
 		      unsigned long arg2);
+
+int machine_kexec_post_load(struct kimage *image);
+#define machine_kexec_post_load machine_kexec_post_load
+
+void arch_kexec_protect_crashkres(void);
+#define arch_kexec_protect_crashkres arch_kexec_protect_crashkres
+
+void arch_kexec_unprotect_crashkres(void);
+#define arch_kexec_unprotect_crashkres arch_kexec_unprotect_crashkres
 #endif
 
 #define ARCH_HAS_KIMAGE_ARCH
@@ -113,8 +127,6 @@ struct kimage_arch {
 #ifdef CONFIG_KEXEC_FILE
 extern const struct kexec_file_ops kexec_image_ops;
 
-struct kimage;
-
 int arch_kimage_file_post_load_cleanup(struct kimage *image);
 #define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup
 
diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index 1e5e9b6ec78d..d6f4edfe4737 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -98,6 +98,11 @@ void relocate_new_kernel(unsigned long indirection_page, unsigned long reboot_co
 
 void kexec_copy_flush(struct kimage *image);
 
+#if defined(CONFIG_CRASH_DUMP) && defined(CONFIG_PPC_RTAS)
+void crash_free_reserved_phys_range(unsigned long begin, unsigned long end);
+#define crash_free_reserved_phys_range crash_free_reserved_phys_range
+#endif
+
 #ifdef CONFIG_KEXEC_FILE
 extern const struct kexec_file_ops kexec_elf64_ops;
 
diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h
index 8886aadc11a3..1bd08eb56d5f 100644
--- a/arch/s390/include/asm/kexec.h
+++ b/arch/s390/include/asm/kexec.h
@@ -85,6 +85,17 @@ struct kimage_arch {
 extern const struct kexec_file_ops s390_kexec_image_ops;
 extern const struct kexec_file_ops s390_kexec_elf_ops;
 
+#ifdef CONFIG_CRASH_DUMP
+void crash_free_reserved_phys_range(unsigned long begin, unsigned long end);
+#define crash_free_reserved_phys_range crash_free_reserved_phys_range
+
+void arch_kexec_protect_crashkres(void);
+#define arch_kexec_protect_crashkres arch_kexec_protect_crashkres
+
+void arch_kexec_unprotect_crashkres(void);
+#define arch_kexec_unprotect_crashkres arch_kexec_unprotect_crashkres
+#endif
+
 #ifdef CONFIG_KEXEC_FILE
 struct purgatory_info;
 int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 5ec359c1b50c..a3760ca796aa 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -186,6 +186,12 @@ extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages,
 extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages);
 #define arch_kexec_pre_free_pages arch_kexec_pre_free_pages
 
+void arch_kexec_protect_crashkres(void);
+#define arch_kexec_protect_crashkres arch_kexec_protect_crashkres
+
+void arch_kexec_unprotect_crashkres(void);
+#define arch_kexec_unprotect_crashkres arch_kexec_unprotect_crashkres
+
 #ifdef CONFIG_KEXEC_FILE
 struct purgatory_info;
 int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 6958c6b471f4..8107606ad1e8 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -390,7 +390,10 @@ extern void machine_kexec_cleanup(struct kimage *image);
 extern int kernel_kexec(void);
 extern struct page *kimage_alloc_control_pages(struct kimage *image,
 						unsigned int order);
-int machine_kexec_post_load(struct kimage *image);
+
+#ifndef machine_kexec_post_load
+static inline int machine_kexec_post_load(struct kimage *image) { return 0; }
+#endif
 
 extern void __crash_kexec(struct pt_regs *);
 extern void crash_kexec(struct pt_regs *);
@@ -423,10 +426,21 @@ extern bool kexec_in_progress;
 
 int crash_shrink_memory(unsigned long new_size);
 size_t crash_get_memory_size(void);
-void crash_free_reserved_phys_range(unsigned long begin, unsigned long end);
 
-void arch_kexec_protect_crashkres(void);
-void arch_kexec_unprotect_crashkres(void);
+#ifndef arch_kexec_protect_crashkres
+/*
+ * Protection mechanism for crashkernel reserved memory after
+ * the kdump kernel is loaded.
+ *
+ * Provide an empty default implementation here -- architecture
+ * code may override this
+ */
+static inline void arch_kexec_protect_crashkres(void) { }
+#endif
+
+#ifndef arch_kexec_unprotect_crashkres
+static inline void arch_kexec_unprotect_crashkres(void) { }
+#endif
 
 #ifndef page_to_boot_pfn
 static inline unsigned long page_to_boot_pfn(struct page *page)
@@ -456,6 +470,16 @@ static inline phys_addr_t boot_phys_to_phys(unsigned long boot_phys)
 }
 #endif
 
+#ifndef crash_free_reserved_phys_range
+static inline void crash_free_reserved_phys_range(unsigned long begin, unsigned long end)
+{
+	unsigned long addr;
+
+	for (addr = begin; addr < end; addr += PAGE_SIZE)
+		free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT));
+}
+#endif
+
 static inline unsigned long virt_to_boot_phys(void *addr)
 {
 	return phys_to_boot_phys(__pa((unsigned long)addr));
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 4d34c78334ce..acd029b307e4 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -591,11 +591,6 @@ static void kimage_free_extra_pages(struct kimage *image)
 
 }
 
-int __weak machine_kexec_post_load(struct kimage *image)
-{
-	return 0;
-}
-
 void kimage_terminate(struct kimage *image)
 {
 	if (*image->entry != 0)
@@ -1020,15 +1015,6 @@ size_t crash_get_memory_size(void)
 	return size;
 }
 
-void __weak crash_free_reserved_phys_range(unsigned long begin,
-					   unsigned long end)
-{
-	unsigned long addr;
-
-	for (addr = begin; addr < end; addr += PAGE_SIZE)
-		free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT));
-}
-
 int crash_shrink_memory(unsigned long new_size)
 {
 	int ret = 0;
@@ -1225,16 +1211,3 @@ int kernel_kexec(void)
 	mutex_unlock(&kexec_mutex);
 	return error;
 }
-
-/*
- * Protection mechanism for crashkernel reserved memory after
- * the kdump kernel is loaded.
- *
- * Provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak arch_kexec_protect_crashkres(void)
-{}
-
-void __weak arch_kexec_unprotect_crashkres(void)
-{}
-- 
cgit 


From 689a71493bd2f31c024f8c0395f85a1fd4b2138e Mon Sep 17 00:00:00 2001
From: Coiby Xu <coxu@redhat.com>
Date: Thu, 14 Jul 2022 21:40:24 +0800
Subject: kexec: clean up arch_kexec_kernel_verify_sig

Before commit 105e10e2cf1c ("kexec_file: drop weak attribute from
functions"), there was already no arch-specific implementation
of arch_kexec_kernel_verify_sig. With weak attribute dropped by that
commit, arch_kexec_kernel_verify_sig is completely useless. So clean it
up.

Note later patches are dependent on this patch so it should be backported
to the stable tree as well.

Cc: stable@vger.kernel.org
Suggested-by: Eric W. Biederman <ebiederm@xmission.com>
Reviewed-by: Michal Suchanek <msuchanek@suse.de>
Acked-by: Baoquan He <bhe@redhat.com>
Signed-off-by: Coiby Xu <coxu@redhat.com>
[zohar@linux.ibm.com: reworded patch description "Note"]
Link: https://lore.kernel.org/linux-integrity/20220714134027.394370-1-coxu@redhat.com/
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 include/linux/kexec.h |  5 -----
 kernel/kexec_file.c   | 33 +++++++++++++--------------------
 2 files changed, 13 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 8107606ad1e8..7f710fb3712b 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -212,11 +212,6 @@ static inline void *arch_kexec_kernel_image_load(struct kimage *image)
 }
 #endif
 
-#ifdef CONFIG_KEXEC_SIG
-int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
-				 unsigned long buf_len);
-#endif
-
 extern int kexec_add_buffer(struct kexec_buf *kbuf);
 int kexec_locate_mem_hole(struct kexec_buf *kbuf);
 
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 0c27c81351ee..6dc1294c90fc 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -81,24 +81,6 @@ int kexec_image_post_load_cleanup_default(struct kimage *image)
 	return image->fops->cleanup(image->image_loader_data);
 }
 
-#ifdef CONFIG_KEXEC_SIG
-static int kexec_image_verify_sig_default(struct kimage *image, void *buf,
-					  unsigned long buf_len)
-{
-	if (!image->fops || !image->fops->verify_sig) {
-		pr_debug("kernel loader does not support signature verification.\n");
-		return -EKEYREJECTED;
-	}
-
-	return image->fops->verify_sig(buf, buf_len);
-}
-
-int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, unsigned long buf_len)
-{
-	return kexec_image_verify_sig_default(image, buf, buf_len);
-}
-#endif
-
 /*
  * Free up memory used by kernel, initrd, and command line. This is temporary
  * memory allocation which is not needed any more after these buffers have
@@ -141,13 +123,24 @@ void kimage_file_post_load_cleanup(struct kimage *image)
 }
 
 #ifdef CONFIG_KEXEC_SIG
+static int kexec_image_verify_sig(struct kimage *image, void *buf,
+				  unsigned long buf_len)
+{
+	if (!image->fops || !image->fops->verify_sig) {
+		pr_debug("kernel loader does not support signature verification.\n");
+		return -EKEYREJECTED;
+	}
+
+	return image->fops->verify_sig(buf, buf_len);
+}
+
 static int
 kimage_validate_signature(struct kimage *image)
 {
 	int ret;
 
-	ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
-					   image->kernel_buf_len);
+	ret = kexec_image_verify_sig(image, image->kernel_buf,
+				     image->kernel_buf_len);
 	if (ret) {
 
 		if (sig_enforce) {
-- 
cgit 


From c903dae8941deb55043ee46ded29e84e97cd84bb Mon Sep 17 00:00:00 2001
From: Coiby Xu <coxu@redhat.com>
Date: Thu, 14 Jul 2022 21:40:25 +0800
Subject: kexec, KEYS: make the code in bzImage64_verify_sig generic

commit 278311e417be ("kexec, KEYS: Make use of platform keyring for
signature verify") adds platform keyring support on x86 kexec but not
arm64.

The code in bzImage64_verify_sig uses the keys on the
.builtin_trusted_keys, .machine, if configured and enabled,
.secondary_trusted_keys, also if configured, and .platform keyrings
to verify the signed kernel image as PE file.

Cc: kexec@lists.infradead.org
Cc: keyrings@vger.kernel.org
Cc: linux-security-module@vger.kernel.org
Reviewed-by: Michal Suchanek <msuchanek@suse.de>
Signed-off-by: Coiby Xu <coxu@redhat.com>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 arch/x86/kernel/kexec-bzimage64.c | 20 +-------------------
 include/linux/kexec.h             |  7 +++++++
 kernel/kexec_file.c               | 17 +++++++++++++++++
 3 files changed, 25 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 170d0fd68b1f..f299b48f9c9f 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -17,7 +17,6 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/efi.h>
-#include <linux/verification.h>
 
 #include <asm/bootparam.h>
 #include <asm/setup.h>
@@ -528,28 +527,11 @@ static int bzImage64_cleanup(void *loader_data)
 	return 0;
 }
 
-#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG
-static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len)
-{
-	int ret;
-
-	ret = verify_pefile_signature(kernel, kernel_len,
-				      VERIFY_USE_SECONDARY_KEYRING,
-				      VERIFYING_KEXEC_PE_SIGNATURE);
-	if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) {
-		ret = verify_pefile_signature(kernel, kernel_len,
-					      VERIFY_USE_PLATFORM_KEYRING,
-					      VERIFYING_KEXEC_PE_SIGNATURE);
-	}
-	return ret;
-}
-#endif
-
 const struct kexec_file_ops kexec_bzImage64_ops = {
 	.probe = bzImage64_probe,
 	.load = bzImage64_load,
 	.cleanup = bzImage64_cleanup,
 #ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG
-	.verify_sig = bzImage64_verify_sig,
+	.verify_sig = kexec_kernel_verify_pe_sig,
 #endif
 };
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 7f710fb3712b..13e6c4b58f07 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -19,6 +19,7 @@
 #include <asm/io.h>
 
 #include <uapi/linux/kexec.h>
+#include <linux/verification.h>
 
 /* Location of a reserved region to hold the crash kernel.
  */
@@ -212,6 +213,12 @@ static inline void *arch_kexec_kernel_image_load(struct kimage *image)
 }
 #endif
 
+#ifdef CONFIG_KEXEC_SIG
+#ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION
+int kexec_kernel_verify_pe_sig(const char *kernel, unsigned long kernel_len);
+#endif
+#endif
+
 extern int kexec_add_buffer(struct kexec_buf *kbuf);
 int kexec_locate_mem_hole(struct kexec_buf *kbuf);
 
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 6dc1294c90fc..a7b411c22f19 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -123,6 +123,23 @@ void kimage_file_post_load_cleanup(struct kimage *image)
 }
 
 #ifdef CONFIG_KEXEC_SIG
+#ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION
+int kexec_kernel_verify_pe_sig(const char *kernel, unsigned long kernel_len)
+{
+	int ret;
+
+	ret = verify_pefile_signature(kernel, kernel_len,
+				      VERIFY_USE_SECONDARY_KEYRING,
+				      VERIFYING_KEXEC_PE_SIGNATURE);
+	if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) {
+		ret = verify_pefile_signature(kernel, kernel_len,
+					      VERIFY_USE_PLATFORM_KEYRING,
+					      VERIFYING_KEXEC_PE_SIGNATURE);
+	}
+	return ret;
+}
+#endif
+
 static int kexec_image_verify_sig(struct kimage *image, void *buf,
 				  unsigned long buf_len)
 {
-- 
cgit 


From 311027bd13eb676d6025eb2414ff867dd09500e9 Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Date: Fri, 8 Jul 2022 18:21:50 +0200
Subject: media: uapi: Add V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS control

The number of 'entry point offset' can be very variable.
Instead of using a large static array define a v4l2 dynamic array
of U32 (V4L2_CTRL_TYPE_U32).
The number of entry point offsets is reported by the elems field
and in struct v4l2_ctrl_hevc_slice_params.num_entry_point_offsets
field.

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Acked-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Tested-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Reviewed-by: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 .../userspace-api/media/v4l/ext-ctrls-codec.rst          | 16 ++++++++++++++++
 drivers/media/v4l2-core/v4l2-ctrls-defs.c                |  5 +++++
 include/media/hevc-ctrls.h                               |  5 ++++-
 3 files changed, 25 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
index c2e0adece613..c05c5553f0e3 100644
--- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
+++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
@@ -3010,6 +3010,11 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
     * - __u32
       - ``data_bit_offset``
       - Offset (in bits) to the video data in the current slice data.
+    * - __u32
+      - ``num_entry_point_offsets``
+      - Specifies the number of entry point offset syntax elements in the slice header.
+        When the driver supports it, the ``V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS``
+        must be set.
     * - __u8
       - ``nal_unit_type``
       - Specifies the coding type of the slice (B, P or I).
@@ -3150,6 +3155,17 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
 
     \normalsize
 
+``V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (integer)``
+    Specifies entry point offsets in bytes.
+    This control is a dynamically sized array. The number of entry point
+    offsets is reported by the ``elems`` field.
+    This bitstream parameter is defined according to :ref:`hevc`.
+    They are described in section 7.4.7.1 "General slice segment header
+    semantics" of the specification.
+    When multiple slices are submitted in a request, the length of
+    this array must be the sum of num_entry_point_offsets of all the
+    slices in the request.
+
 ``V4L2_CID_STATELESS_HEVC_SCALING_MATRIX (struct)``
     Specifies the HEVC scaling matrix parameters used for the scaling process
     for transform coefficients.
diff --git a/drivers/media/v4l2-core/v4l2-ctrls-defs.c b/drivers/media/v4l2-core/v4l2-ctrls-defs.c
index d594efbcbb93..e22921e7ea61 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls-defs.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls-defs.c
@@ -1188,6 +1188,7 @@ const char *v4l2_ctrl_get_name(u32 id)
 	case V4L2_CID_STATELESS_HEVC_DECODE_PARAMS:		return "HEVC Decode Parameters";
 	case V4L2_CID_STATELESS_HEVC_DECODE_MODE:		return "HEVC Decode Mode";
 	case V4L2_CID_STATELESS_HEVC_START_CODE:		return "HEVC Start Code";
+	case V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS:	return "HEVC Entry Point Offsets";
 
 	/* Colorimetry controls */
 	/* Keep the order of the 'case's the same as in v4l2-controls.h! */
@@ -1518,6 +1519,10 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type,
 	case V4L2_CID_STATELESS_HEVC_DECODE_PARAMS:
 		*type = V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS;
 		break;
+	case V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS:
+		*type = V4L2_CTRL_TYPE_U32;
+		*flags |= V4L2_CTRL_FLAG_DYNAMIC_ARRAY;
+		break;
 	case V4L2_CID_STATELESS_VP9_COMPRESSED_HDR:
 		*type = V4L2_CTRL_TYPE_VP9_COMPRESSED_HDR;
 		break;
diff --git a/include/media/hevc-ctrls.h b/include/media/hevc-ctrls.h
index 341fc795d550..e614cf20dfd3 100644
--- a/include/media/hevc-ctrls.h
+++ b/include/media/hevc-ctrls.h
@@ -23,6 +23,7 @@
 #define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS	(V4L2_CID_CODEC_BASE + 1012)
 #define V4L2_CID_STATELESS_HEVC_DECODE_MODE	(V4L2_CID_CODEC_BASE + 1015)
 #define V4L2_CID_STATELESS_HEVC_START_CODE	(V4L2_CID_CODEC_BASE + 1016)
+#define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_BASE + 1017)
 
 /* enum v4l2_ctrl_type type values */
 #define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
@@ -319,6 +320,8 @@ struct v4l2_hevc_pred_weight_table {
  *
  * @bit_size: size (in bits) of the current slice data
  * @data_bit_offset: offset (in bits) to the video data in the current slice data
+ * @num_entry_point_offsets: specifies the number of entry point offset syntax
+ *			     elements in the slice header.
  * @nal_unit_type: specifies the coding type of the slice (B, P or I)
  * @nuh_temporal_id_plus1: minus 1 specifies a temporal identifier for the NAL unit
  * @slice_type: see V4L2_HEVC_SLICE_TYPE_{}
@@ -361,7 +364,7 @@ struct v4l2_hevc_pred_weight_table {
 struct v4l2_ctrl_hevc_slice_params {
 	__u32	bit_size;
 	__u32	data_bit_offset;
-
+	__u32	num_entry_point_offsets;
 	/* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
 	__u8	nal_unit_type;
 	__u8	nuh_temporal_id_plus1;
-- 
cgit 


From 01dcfd53335cfb5c66a6c023ac0f789a5b87ace5 Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Date: Fri, 8 Jul 2022 17:21:49 +0100
Subject: media: uapi: Move parsed HEVC pixel format out of staging

Move HEVC pixel format since we are ready to stabilize the uAPI

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Reviewed-by: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
Acked-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Tested-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 Documentation/userspace-api/media/v4l/pixfmt-compressed.rst | 5 -----
 include/media/hevc-ctrls.h                                  | 3 ---
 include/uapi/linux/videodev2.h                              | 1 +
 3 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst b/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst
index 967fc803ef94..c352d91a73d8 100644
--- a/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst
+++ b/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst
@@ -215,11 +215,6 @@ Compressed Formats
 	See the :ref:`associated Codec Control IDs <v4l2-mpeg-hevc>`.
 	Buffers associated with this pixel format must contain the appropriate
 	number of macroblocks to decode a full corresponding frame.
-
-	.. note::
-
-	   This format is not yet part of the public kernel API and it
-	   is expected to change.
     * .. _V4L2-PIX-FMT-FWHT:
 
       - ``V4L2_PIX_FMT_FWHT``
diff --git a/include/media/hevc-ctrls.h b/include/media/hevc-ctrls.h
index e614cf20dfd3..3a6601a46ced 100644
--- a/include/media/hevc-ctrls.h
+++ b/include/media/hevc-ctrls.h
@@ -13,9 +13,6 @@
 
 #include <linux/videodev2.h>
 
-/* The pixel format isn't stable at the moment and will likely be renamed. */
-#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
-
 #define V4L2_CID_STATELESS_HEVC_SPS		(V4L2_CID_CODEC_BASE + 1008)
 #define V4L2_CID_STATELESS_HEVC_PPS		(V4L2_CID_CODEC_BASE + 1009)
 #define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS	(V4L2_CID_CODEC_BASE + 1010)
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 87ebc6baafb6..06e3f81ad855 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -736,6 +736,7 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_FWHT     v4l2_fourcc('F', 'W', 'H', 'T') /* Fast Walsh Hadamard Transform (vicodec) */
 #define V4L2_PIX_FMT_FWHT_STATELESS     v4l2_fourcc('S', 'F', 'W', 'H') /* Stateless FWHT (vicodec) */
 #define V4L2_PIX_FMT_H264_SLICE v4l2_fourcc('S', '2', '6', '4') /* H264 parsed slices */
+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
 
 /*  Vendor-specific formats   */
 #define V4L2_PIX_FMT_CPIA1    v4l2_fourcc('C', 'P', 'I', 'A') /* cpia1 YUV */
-- 
cgit 


From 16e2d220cdc64a1518878172dc7b50d4f60e5aac Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Date: Fri, 8 Jul 2022 17:21:51 +0100
Subject: media: uapi: Move the HEVC stateless control type out of staging

Move the HEVC stateless controls types out of staging,
and re-number them.

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Reviewed-by: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
Acked-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Tested-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 Documentation/userspace-api/media/videodev2.h.rst.exceptions | 5 +++++
 include/media/hevc-ctrls.h                                   | 7 -------
 include/uapi/linux/videodev2.h                               | 6 ++++++
 3 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/videodev2.h.rst.exceptions b/Documentation/userspace-api/media/videodev2.h.rst.exceptions
index 0b91200776f8..2feea4a5a008 100644
--- a/Documentation/userspace-api/media/videodev2.h.rst.exceptions
+++ b/Documentation/userspace-api/media/videodev2.h.rst.exceptions
@@ -153,6 +153,11 @@ replace symbol V4L2_CTRL_TYPE_VP9_COMPRESSED_HDR :c:type:`v4l2_ctrl_type`
 replace symbol V4L2_CTRL_TYPE_VP9_FRAME :c:type:`v4l2_ctrl_type`
 replace symbol V4L2_CTRL_TYPE_HDR10_CLL_INFO :c:type:`v4l2_ctrl_type`
 replace symbol V4L2_CTRL_TYPE_HDR10_MASTERING_DISPLAY :c:type:`v4l2_ctrl_type`
+replace symbol V4L2_CTRL_TYPE_HEVC_SPS :c:type:`v4l2_ctrl_type`
+replace symbol V4L2_CTRL_TYPE_HEVC_PPS :c:type:`v4l2_ctrl_type`
+replace symbol V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS :c:type:`v4l2_ctrl_type`
+replace symbol V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX :c:type:`v4l2_ctrl_type`
+replace symbol V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS :c:type:`v4l2_ctrl_type`
 
 # V4L2 capability defines
 replace define V4L2_CAP_VIDEO_CAPTURE device-capabilities
diff --git a/include/media/hevc-ctrls.h b/include/media/hevc-ctrls.h
index 3a6601a46ced..42d16e8a1050 100644
--- a/include/media/hevc-ctrls.h
+++ b/include/media/hevc-ctrls.h
@@ -22,13 +22,6 @@
 #define V4L2_CID_STATELESS_HEVC_START_CODE	(V4L2_CID_CODEC_BASE + 1016)
 #define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_BASE + 1017)
 
-/* enum v4l2_ctrl_type type values */
-#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
-#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
-#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
-#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
-#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124
-
 enum v4l2_stateless_hevc_decode_mode {
 	V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED,
 	V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED,
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 06e3f81ad855..cff2bb78b2cc 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -1860,6 +1860,12 @@ enum v4l2_ctrl_type {
 
 	V4L2_CTRL_TYPE_VP9_COMPRESSED_HDR	= 0x0260,
 	V4L2_CTRL_TYPE_VP9_FRAME		= 0x0261,
+
+	V4L2_CTRL_TYPE_HEVC_SPS			= 0x0270,
+	V4L2_CTRL_TYPE_HEVC_PPS			= 0x0271,
+	V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS	= 0x0272,
+	V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX	= 0x0273,
+	V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS	= 0x0274,
 };
 
 /*  Used in the VIDIOC_QUERYCTRL ioctl for querying controls */
-- 
cgit 


From 3360755ef89ab2a46c58b07c38c6b97bcdc31d85 Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Date: Fri, 8 Jul 2022 17:21:53 +0100
Subject: media: hantro: Stop using Hantro dedicated control

The number of bits to skip in the slice header can be computed
in the driver by using sps, pps and decode_params information.
This makes it possible to remove Hantro dedicated control.

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Reviewed-by: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
Acked-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Tested-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/staging/media/hantro/hantro_drv.c         | 36 ----------------------
 drivers/staging/media/hantro/hantro_g2_hevc_dec.c | 37 ++++++++++++++++++++++-
 include/media/hevc-ctrls.h                        | 13 --------
 3 files changed, 36 insertions(+), 50 deletions(-)

(limited to 'include')

diff --git a/drivers/staging/media/hantro/hantro_drv.c b/drivers/staging/media/hantro/hantro_drv.c
index 1a62c5610730..e35549663cdc 100644
--- a/drivers/staging/media/hantro/hantro_drv.c
+++ b/drivers/staging/media/hantro/hantro_drv.c
@@ -304,26 +304,6 @@ static int hantro_jpeg_s_ctrl(struct v4l2_ctrl *ctrl)
 	return 0;
 }
 
-static int hantro_hevc_s_ctrl(struct v4l2_ctrl *ctrl)
-{
-	struct hantro_ctx *ctx;
-
-	ctx = container_of(ctrl->handler,
-			   struct hantro_ctx, ctrl_handler);
-
-	vpu_debug(1, "s_ctrl: id = %d, val = %d\n", ctrl->id, ctrl->val);
-
-	switch (ctrl->id) {
-	case V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP:
-		ctx->hevc_dec.ctrls.hevc_hdr_skip_length = ctrl->val;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
 static const struct v4l2_ctrl_ops hantro_ctrl_ops = {
 	.try_ctrl = hantro_try_ctrl,
 };
@@ -332,10 +312,6 @@ static const struct v4l2_ctrl_ops hantro_jpeg_ctrl_ops = {
 	.s_ctrl = hantro_jpeg_s_ctrl,
 };
 
-static const struct v4l2_ctrl_ops hantro_hevc_ctrl_ops = {
-	.s_ctrl = hantro_hevc_s_ctrl,
-};
-
 #define HANTRO_JPEG_ACTIVE_MARKERS	(V4L2_JPEG_ACTIVE_MARKER_APP0 | \
 					 V4L2_JPEG_ACTIVE_MARKER_COM | \
 					 V4L2_JPEG_ACTIVE_MARKER_DQT | \
@@ -487,18 +463,6 @@ static const struct hantro_ctrl controls[] = {
 		.cfg = {
 			.id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX,
 		},
-	}, {
-		.codec = HANTRO_HEVC_DECODER,
-		.cfg = {
-			.id = V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP,
-			.name = "Hantro HEVC slice header skip bytes",
-			.type = V4L2_CTRL_TYPE_INTEGER,
-			.min = 0,
-			.def = 0,
-			.max = 0x100,
-			.step = 1,
-			.ops = &hantro_hevc_ctrl_ops,
-		},
 	}, {
 		.codec = HANTRO_VP9_DECODER,
 		.cfg = {
diff --git a/drivers/staging/media/hantro/hantro_g2_hevc_dec.c b/drivers/staging/media/hantro/hantro_g2_hevc_dec.c
index d28653d04d20..233ecd863d5f 100644
--- a/drivers/staging/media/hantro/hantro_g2_hevc_dec.c
+++ b/drivers/staging/media/hantro/hantro_g2_hevc_dec.c
@@ -117,6 +117,41 @@ static void prepare_tile_info_buffer(struct hantro_ctx *ctx)
 		vpu_debug(1, "%s: no chroma!\n", __func__);
 }
 
+static int compute_header_skip_length(struct hantro_ctx *ctx)
+{
+	const struct hantro_hevc_dec_ctrls *ctrls = &ctx->hevc_dec.ctrls;
+	const struct v4l2_ctrl_hevc_decode_params *decode_params = ctrls->decode_params;
+	const struct v4l2_ctrl_hevc_sps *sps = ctrls->sps;
+	const struct v4l2_ctrl_hevc_pps *pps = ctrls->pps;
+	int skip = 0;
+
+	if (pps->flags & V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT)
+		/* size of pic_output_flag */
+		skip++;
+
+	if (sps->flags & V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE)
+		/* size of pic_order_cnt_lsb */
+		skip += 2;
+
+	if (!(decode_params->flags & V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC)) {
+		/* size of pic_order_cnt_lsb */
+		skip += sps->log2_max_pic_order_cnt_lsb_minus4 + 4;
+
+		/* size of short_term_ref_pic_set_sps_flag */
+		skip++;
+
+		if (decode_params->short_term_ref_pic_set_size)
+			/* size of st_ref_pic_set( num_short_term_ref_pic_sets ) */
+			skip += decode_params->short_term_ref_pic_set_size;
+		else if (sps->num_short_term_ref_pic_sets > 1)
+			skip += fls(sps->num_short_term_ref_pic_sets - 1);
+
+		skip += decode_params->long_term_ref_pic_set_size;
+	}
+
+	return skip;
+}
+
 static void set_params(struct hantro_ctx *ctx)
 {
 	const struct hantro_hevc_dec_ctrls *ctrls = &ctx->hevc_dec.ctrls;
@@ -134,7 +169,7 @@ static void set_params(struct hantro_ctx *ctx)
 
 	hantro_reg_write(vpu, &g2_output_8_bits, 0);
 
-	hantro_reg_write(vpu, &g2_hdr_skip_length, ctrls->hevc_hdr_skip_length);
+	hantro_reg_write(vpu, &g2_hdr_skip_length, compute_header_skip_length(ctx));
 
 	min_log2_cb_size = sps->log2_min_luma_coding_block_size_minus3 + 3;
 	max_log2_ctb_size = min_log2_cb_size + sps->log2_diff_max_min_luma_coding_block_size;
diff --git a/include/media/hevc-ctrls.h b/include/media/hevc-ctrls.h
index 42d16e8a1050..9239e8b649e0 100644
--- a/include/media/hevc-ctrls.h
+++ b/include/media/hevc-ctrls.h
@@ -465,17 +465,4 @@ struct v4l2_ctrl_hevc_scaling_matrix {
 	__u8	scaling_list_dc_coef_32x32[2];
 };
 
-/*  MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */
-#define V4L2_CID_CODEC_HANTRO_BASE				(V4L2_CTRL_CLASS_CODEC | 0x1200)
-/*
- * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP -
- * the number of data (in bits) to skip in the
- * slice segment header.
- * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag"
- * to before syntax element "slice_temporal_mvp_enabled_flag".
- * If IDR, the skipped bits are just "pic_output_flag"
- * (separate_colour_plane_flag is not supported).
- */
-#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP	(V4L2_CID_CODEC_HANTRO_BASE + 0)
-
 #endif
-- 
cgit 


From 13789e30704908a6bf743d338f663b0468e204c7 Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Date: Fri, 8 Jul 2022 17:21:54 +0100
Subject: media: uapi: HEVC: fix padding in v4l2 control structures

Fix padding where needed to remove holes

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Acked-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Tested-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Reviewed-by: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 .../userspace-api/media/v4l/ext-ctrls-codec.rst    |  6 +++---
 drivers/media/v4l2-core/v4l2-ctrls-core.c          | 15 ---------------
 include/media/hevc-ctrls.h                         | 22 ++++++++++++++--------
 3 files changed, 17 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
index c05c5553f0e3..73b347acb704 100644
--- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
+++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
@@ -3514,9 +3514,6 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
     * - __u8
       - ``num_active_dpb_entries``
       - The number of entries in ``dpb``.
-    * - struct :c:type:`v4l2_hevc_dpb_entry`
-      - ``dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
-      - The decoded picture buffer, for meta-data about reference frames.
     * - __u8
       - ``num_poc_st_curr_before``
       - The number of reference pictures in the short-term set that come before
@@ -3540,6 +3537,9 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
       - ``poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
       - PocLtCurr as described in section 8.3.2 "Decoding process for reference
         picture set": provides the index of the long term references in DPB array.
+    * - struct :c:type:`v4l2_hevc_dpb_entry`
+      - ``dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
+      - The decoded picture buffer, for meta-data about reference frames.
     * - __u64
       - ``flags``
       - See :ref:`Decode Parameters Flags <hevc_decode_params_flags>`
diff --git a/drivers/media/v4l2-core/v4l2-ctrls-core.c b/drivers/media/v4l2-core/v4l2-ctrls-core.c
index c5c5407584ff..1f85828d6694 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls-core.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls-core.c
@@ -536,7 +536,6 @@ static int std_validate_compound(const struct v4l2_ctrl *ctrl, u32 idx,
 	struct v4l2_ctrl_h264_decode_params *p_h264_dec_params;
 	struct v4l2_ctrl_hevc_sps *p_hevc_sps;
 	struct v4l2_ctrl_hevc_pps *p_hevc_pps;
-	struct v4l2_ctrl_hevc_slice_params *p_hevc_slice_params;
 	struct v4l2_ctrl_hdr10_mastering_display *p_hdr10_mastering;
 	struct v4l2_ctrl_hevc_decode_params *p_hevc_decode_params;
 	struct v4l2_area *area;
@@ -814,8 +813,6 @@ static int std_validate_compound(const struct v4l2_ctrl *ctrl, u32 idx,
 			p_hevc_pps->pps_beta_offset_div2 = 0;
 			p_hevc_pps->pps_tc_offset_div2 = 0;
 		}
-
-		zero_padding(*p_hevc_pps);
 		break;
 
 	case V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS:
@@ -824,21 +821,9 @@ static int std_validate_compound(const struct v4l2_ctrl *ctrl, u32 idx,
 		if (p_hevc_decode_params->num_active_dpb_entries >
 		    V4L2_HEVC_DPB_ENTRIES_NUM_MAX)
 			return -EINVAL;
-
-		for (i = 0; i < p_hevc_decode_params->num_active_dpb_entries;
-		     i++) {
-			struct v4l2_hevc_dpb_entry *dpb_entry =
-				&p_hevc_decode_params->dpb[i];
-
-			zero_padding(*dpb_entry);
-		}
 		break;
 
 	case V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS:
-		p_hevc_slice_params = p;
-
-		zero_padding(p_hevc_slice_params->pred_weight_table);
-		zero_padding(*p_hevc_slice_params);
 		break;
 
 	case V4L2_CTRL_TYPE_HDR10_CLL_INFO:
diff --git a/include/media/hevc-ctrls.h b/include/media/hevc-ctrls.h
index 9239e8b649e0..7358cbfc3e4d 100644
--- a/include/media/hevc-ctrls.h
+++ b/include/media/hevc-ctrls.h
@@ -105,6 +105,7 @@ enum v4l2_stateless_hevc_start_code {
  * @chroma_format_idc: specifies the chroma sampling
  * @sps_max_sub_layers_minus1: this value plus 1 specifies the maximum number
  *                             of temporal sub-layers
+ * @reserved: padding field. Should be zeroed by applications.
  * @flags: see V4L2_HEVC_SPS_FLAG_{}
  */
 struct v4l2_ctrl_hevc_sps {
@@ -133,6 +134,7 @@ struct v4l2_ctrl_hevc_sps {
 	__u8	chroma_format_idc;
 	__u8	sps_max_sub_layers_minus1;
 
+	__u8	reserved[6];
 	__u64	flags;
 };
 
@@ -192,6 +194,7 @@ struct v4l2_ctrl_hevc_sps {
  *			divided by 2
  * @log2_parallel_merge_level_minus2: this value plus 2 specifies the value of
  *                                    the variable Log2ParMrgLevel
+ * @reserved: padding field. Should be zeroed by applications.
  * @flags: see V4L2_HEVC_PPS_FLAG_{}
  */
 struct v4l2_ctrl_hevc_pps {
@@ -210,8 +213,7 @@ struct v4l2_ctrl_hevc_pps {
 	__s8	pps_beta_offset_div2;
 	__s8	pps_tc_offset_div2;
 	__u8	log2_parallel_merge_level_minus2;
-
-	__u8	padding[4];
+	__u8	reserved;
 	__u64	flags;
 };
 
@@ -239,14 +241,15 @@ struct v4l2_ctrl_hevc_pps {
  * @timestamp: timestamp of the V4L2 capture buffer to use as reference.
  * @flags: long term flag for the reference frame
  * @field_pic: whether the reference is a field picture or a frame.
+ * @reserved: padding field. Should be zeroed by applications.
  * @pic_order_cnt_val: the picture order count of the reference.
  */
 struct v4l2_hevc_dpb_entry {
 	__u64	timestamp;
 	__u8	flags;
 	__u8	field_pic;
+	__u16	reserved;
 	__s32	pic_order_cnt_val;
-	__u8	padding[2];
 };
 
 /**
@@ -285,8 +288,6 @@ struct v4l2_hevc_pred_weight_table {
 	__s8	delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
 	__s8	chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
 
-	__u8	padding[6];
-
 	__u8	luma_log2_weight_denom;
 	__s8	delta_chroma_log2_weight_denom;
 };
@@ -339,6 +340,7 @@ struct v4l2_hevc_pred_weight_table {
  * @slice_tc_offset_div2: specify the deblocking parameter offsets for tC divided by 2
  * @pic_struct: indicates whether a picture should be displayed as a frame or as one or
  *		more fields
+ * @reserved0: padding field. Should be zeroed by applications.
  * @slice_segment_addr: specifies the address of the first coding tree block in
  *			the slice segment
  * @ref_idx_l0: the list of L0 reference elements as indices in the DPB
@@ -349,6 +351,7 @@ struct v4l2_hevc_pred_weight_table {
  *				picture include in the SPS
  * @pred_weight_table: the prediction weight coefficients for inter-picture
  *		       prediction
+ * @reserved1: padding field. Should be zeroed by applications.
  * @flags: see V4L2_HEVC_SLICE_PARAMS_FLAG_{}
  */
 struct v4l2_ctrl_hevc_slice_params {
@@ -379,17 +382,18 @@ struct v4l2_ctrl_hevc_slice_params {
 	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
 	__u8	pic_struct;
 
+	__u8	reserved0[3];
 	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
 	__u32	slice_segment_addr;
 	__u8	ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
 	__u8	ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
 	__u16	short_term_ref_pic_set_size;
 	__u16	long_term_ref_pic_set_size;
-	__u8	padding;
 
 	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
 	struct v4l2_hevc_pred_weight_table pred_weight_table;
 
+	__u8	reserved1[2];
 	__u64	flags;
 };
 
@@ -406,7 +410,6 @@ struct v4l2_ctrl_hevc_slice_params {
  * @long_term_ref_pic_set_size: specifies the size of long-term reference
  *				pictures set include in the SPS of the first slice
  * @num_active_dpb_entries: the number of entries in dpb
- * @dpb: the decoded picture buffer, for meta-data about reference frames
  * @num_poc_st_curr_before: the number of reference pictures in the short-term
  *			    set that come before the current frame
  * @num_poc_st_curr_after: the number of reference pictures in the short-term
@@ -417,6 +420,8 @@ struct v4l2_ctrl_hevc_slice_params {
  * @poc_st_curr_after: provides the index of the short term after references
  *		       in DPB array
  * @poc_lt_curr: provides the index of the long term references in DPB array
+ * @reserved: padding field. Should be zeroed by applications.
+ * @dpb: the decoded picture buffer, for meta-data about reference frames
  * @flags: see V4L2_HEVC_DECODE_PARAM_FLAG_{}
  */
 struct v4l2_ctrl_hevc_decode_params {
@@ -424,13 +429,14 @@ struct v4l2_ctrl_hevc_decode_params {
 	__u16	short_term_ref_pic_set_size;
 	__u16	long_term_ref_pic_set_size;
 	__u8	num_active_dpb_entries;
-	struct	v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
 	__u8	num_poc_st_curr_before;
 	__u8	num_poc_st_curr_after;
 	__u8	num_poc_lt_curr;
 	__u8	poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
 	__u8	poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
 	__u8	poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+	__u8	reserved[4];
+	struct	v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
 	__u64	flags;
 };
 
-- 
cgit 


From e7060d9a78c278f41d196d06df8879230038e508 Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Date: Fri, 8 Jul 2022 17:21:55 +0100
Subject: media: uapi: Change data_bit_offset definition

'F.7.3.6.1 General slice segment header syntax' section of HEVC
specification describes that a slice header always end aligned on
byte boundary, therefore we only need to provide the data offset in bytes.

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Reviewed-by: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
Acked-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Tested-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 .../userspace-api/media/v4l/ext-ctrls-codec.rst       |  4 ++--
 drivers/staging/media/sunxi/cedrus/cedrus_h265.c      | 19 ++++++++++++++++++-
 drivers/staging/media/sunxi/cedrus/cedrus_video.c     |  1 -
 include/media/hevc-ctrls.h                            |  4 ++--
 4 files changed, 22 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
index 73b347acb704..6cf68588af1b 100644
--- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
+++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
@@ -3008,8 +3008,8 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
       - ``bit_size``
       - Size (in bits) of the current slice data.
     * - __u32
-      - ``data_bit_offset``
-      - Offset (in bits) to the video data in the current slice data.
+      - ``data_byte_offset``
+      - Offset (in bytes) to the video data in the current slice data.
     * - __u32
       - ``num_entry_point_offsets``
       - Specifies the number of entry point offset syntax elements in the slice header.
diff --git a/drivers/staging/media/sunxi/cedrus/cedrus_h265.c b/drivers/staging/media/sunxi/cedrus/cedrus_h265.c
index 411601975124..7b67cb4621cf 100644
--- a/drivers/staging/media/sunxi/cedrus/cedrus_h265.c
+++ b/drivers/staging/media/sunxi/cedrus/cedrus_h265.c
@@ -317,6 +317,8 @@ static void cedrus_h265_setup(struct cedrus_ctx *ctx,
 	u32 chroma_log2_weight_denom;
 	u32 output_pic_list_index;
 	u32 pic_order_cnt[2];
+	u8 *padding;
+	int count;
 	u32 reg;
 
 	sps = run->h265.sps;
@@ -405,7 +407,22 @@ static void cedrus_h265_setup(struct cedrus_ctx *ctx,
 	/* Initialize bitstream access. */
 	cedrus_write(dev, VE_DEC_H265_TRIGGER, VE_DEC_H265_TRIGGER_INIT_SWDEC);
 
-	cedrus_h265_skip_bits(dev, slice_params->data_bit_offset);
+	/*
+	 * Cedrus expects that bitstream pointer is actually at the end of the slice header
+	 * instead of start of slice data. Padding is 8 bits at most (one bit set to 1 and
+	 * at most seven bits set to 0), so we have to inspect only one byte before slice data.
+	 */
+	padding = (u8 *)vb2_plane_vaddr(&run->src->vb2_buf, 0) +
+		slice_params->data_byte_offset - 1;
+
+	for (count = 0; count < 8; count++)
+		if (*padding & (1 << count))
+			break;
+
+	/* Include the one bit. */
+	count++;
+
+	cedrus_h265_skip_bits(dev, slice_params->data_byte_offset * 8 - count);
 
 	/* Bitstream parameters. */
 
diff --git a/drivers/staging/media/sunxi/cedrus/cedrus_video.c b/drivers/staging/media/sunxi/cedrus/cedrus_video.c
index 33726175d980..66714609b577 100644
--- a/drivers/staging/media/sunxi/cedrus/cedrus_video.c
+++ b/drivers/staging/media/sunxi/cedrus/cedrus_video.c
@@ -568,7 +568,6 @@ int cedrus_queue_init(void *priv, struct vb2_queue *src_vq,
 
 	src_vq->type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
 	src_vq->io_modes = VB2_MMAP | VB2_DMABUF;
-	src_vq->dma_attrs = DMA_ATTR_NO_KERNEL_MAPPING;
 	src_vq->drv_priv = ctx;
 	src_vq->buf_struct_size = sizeof(struct cedrus_buffer);
 	src_vq->ops = &cedrus_qops;
diff --git a/include/media/hevc-ctrls.h b/include/media/hevc-ctrls.h
index 7358cbfc3e4d..c89029b3c5da 100644
--- a/include/media/hevc-ctrls.h
+++ b/include/media/hevc-ctrls.h
@@ -310,7 +310,7 @@ struct v4l2_hevc_pred_weight_table {
  * V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it.
  *
  * @bit_size: size (in bits) of the current slice data
- * @data_bit_offset: offset (in bits) to the video data in the current slice data
+ * @data_byte_offset: offset (in bytes) to the video data in the current slice data
  * @num_entry_point_offsets: specifies the number of entry point offset syntax
  *			     elements in the slice header.
  * @nal_unit_type: specifies the coding type of the slice (B, P or I)
@@ -356,7 +356,7 @@ struct v4l2_hevc_pred_weight_table {
  */
 struct v4l2_ctrl_hevc_slice_params {
 	__u32	bit_size;
-	__u32	data_bit_offset;
+	__u32	data_byte_offset;
 	__u32	num_entry_point_offsets;
 	/* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
 	__u8	nal_unit_type;
-- 
cgit 


From ca24fef0f2c857b0533f21f9a8a756f9e73d60fb Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Date: Fri, 8 Jul 2022 17:21:56 +0100
Subject: media: uapi: move HEVC stateless controls out of staging

HEVC uAPI is used by 2 mainline drivers (Hantro, Cedrus)
and at least 2 out-of-tree drivers (rkvdec, RPi).
The uAPI has been reviewed so it is time to make it 'public' by
un-staging it.

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Reviewed-by: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
Acked-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Tested-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 .../media/v4l/ext-ctrls-codec-stateless.rst        | 902 ++++++++++++++++++++
 .../userspace-api/media/v4l/ext-ctrls-codec.rst    | 906 ---------------------
 .../userspace-api/media/v4l/pixfmt-compressed.rst  |   2 +-
 .../userspace-api/media/v4l/vidioc-g-ext-ctrls.rst |  20 +
 include/media/hevc-ctrls.h                         | 474 -----------
 include/media/v4l2-ctrls.h                         |   6 -
 include/uapi/linux/v4l2-controls.h                 | 459 +++++++++++
 include/uapi/linux/videodev2.h                     |   5 +
 8 files changed, 1387 insertions(+), 1387 deletions(-)
 delete mode 100644 include/media/hevc-ctrls.h

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst
index bee73065e993..cd33857d947d 100644
--- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst
+++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst
@@ -2048,3 +2048,905 @@ This structure contains all loop filter related parameters. See sections
       - 0x2
       - When set, the bitstream contains additional syntax elements that
         specify which mode and reference frame deltas are to be updated.
+
+.. _v4l2-codec-stateless-hevc:
+
+``V4L2_CID_STATELESS_HEVC_SPS (struct)``
+    Specifies the Sequence Parameter Set fields (as extracted from the
+    bitstream) for the associated HEVC slice data.
+    These bitstream parameters are defined according to :ref:`hevc`.
+    They are described in section 7.4.3.2 "Sequence parameter set RBSP
+    semantics" of the specification.
+
+.. c:type:: v4l2_ctrl_hevc_sps
+
+.. raw:: latex
+
+    \small
+
+.. tabularcolumns:: |p{1.2cm}|p{9.2cm}|p{6.9cm}|
+
+.. cssclass:: longtable
+
+.. flat-table:: struct v4l2_ctrl_hevc_sps
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - __u8
+      - ``video_parameter_set_id``
+      - Specifies the value of the vps_video_parameter_set_id of the active VPS
+        as described in section "7.4.3.2.1 General sequence parameter set RBSP semantics"
+        of H.265 specifications.
+    * - __u8
+      - ``seq_parameter_set_id``
+      - Provides an identifier for the SPS for reference by other syntax elements
+        as described in section "7.4.3.2.1 General sequence parameter set RBSP semantics"
+        of H.265 specifications.
+    * - __u16
+      - ``pic_width_in_luma_samples``
+      - Specifies the width of each decoded picture in units of luma samples.
+    * - __u16
+      - ``pic_height_in_luma_samples``
+      - Specifies the height of each decoded picture in units of luma samples.
+    * - __u8
+      - ``bit_depth_luma_minus8``
+      - This value plus 8 specifies the bit depth of the samples of the luma array.
+    * - __u8
+      - ``bit_depth_chroma_minus8``
+      - This value plus 8 specifies the bit depth of the samples of the chroma arrays.
+    * - __u8
+      - ``log2_max_pic_order_cnt_lsb_minus4``
+      - Specifies the value of the variable MaxPicOrderCntLsb.
+    * - __u8
+      - ``sps_max_dec_pic_buffering_minus1``
+      - This value plus 1 specifies the maximum required size of the decoded picture buffer for
+        the coded video sequence (CVS).
+    * - __u8
+      - ``sps_max_num_reorder_pics``
+      - Indicates the maximum allowed number of pictures.
+    * - __u8
+      - ``sps_max_latency_increase_plus1``
+      - Used to signal MaxLatencyPictures, which indicates the maximum number of
+        pictures that can precede any picture in output order and follow that
+        picture in decoding order.
+    * - __u8
+      - ``log2_min_luma_coding_block_size_minus3``
+      - This value plus 3 specifies the minimum luma coding block size.
+    * - __u8
+      - ``log2_diff_max_min_luma_coding_block_size``
+      - Specifies the difference between the maximum and minimum luma coding block size.
+    * - __u8
+      - ``log2_min_luma_transform_block_size_minus2``
+      - This value plus 2 specifies the minimum luma transform block size.
+    * - __u8
+      - ``log2_diff_max_min_luma_transform_block_size``
+      - Specifies the difference between the maximum and minimum luma transform block size.
+    * - __u8
+      - ``max_transform_hierarchy_depth_inter``
+      - Specifies the maximum hierarchy depth for transform units of coding units coded
+        in inter prediction mode.
+    * - __u8
+      - ``max_transform_hierarchy_depth_intra``
+      - Specifies the maximum hierarchy depth for transform units of coding units coded in
+        intra prediction mode.
+    * - __u8
+      - ``pcm_sample_bit_depth_luma_minus1``
+      - This value plus 1 specifies the number of bits used to represent each of PCM sample values of the
+        luma component.
+    * - __u8
+      - ``pcm_sample_bit_depth_chroma_minus1``
+      - Specifies the number of bits used to represent each of PCM sample values of
+        the chroma components.
+    * - __u8
+      - ``log2_min_pcm_luma_coding_block_size_minus3``
+      - Plus 3 specifies the minimum size of coding blocks.
+    * - __u8
+      - ``log2_diff_max_min_pcm_luma_coding_block_size``
+      - Specifies the difference between the maximum and minimum size of coding blocks.
+    * - __u8
+      - ``num_short_term_ref_pic_sets``
+      - Specifies the number of st_ref_pic_set() syntax structures included in the SPS.
+    * - __u8
+      - ``num_long_term_ref_pics_sps``
+      - Specifies the number of candidate long-term reference pictures that are
+        specified in the SPS.
+    * - __u8
+      - ``chroma_format_idc``
+      - Specifies the chroma sampling.
+    * - __u8
+      - ``sps_max_sub_layers_minus1``
+      - This value plus 1 specifies the maximum number of temporal sub-layers.
+    * - __u64
+      - ``flags``
+      - See :ref:`Sequence Parameter Set Flags <hevc_sps_flags>`
+
+.. raw:: latex
+
+    \normalsize
+
+.. _hevc_sps_flags:
+
+``Sequence Parameter Set Flags``
+
+.. raw:: latex
+
+    \small
+
+.. cssclass:: longtable
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - ``V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE``
+      - 0x00000001
+      -
+    * - ``V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED``
+      - 0x00000002
+      -
+    * - ``V4L2_HEVC_SPS_FLAG_AMP_ENABLED``
+      - 0x00000004
+      -
+    * - ``V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET``
+      - 0x00000008
+      -
+    * - ``V4L2_HEVC_SPS_FLAG_PCM_ENABLED``
+      - 0x00000010
+      -
+    * - ``V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED``
+      - 0x00000020
+      -
+    * - ``V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT``
+      - 0x00000040
+      -
+    * - ``V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED``
+      - 0x00000080
+      -
+    * - ``V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED``
+      - 0x00000100
+      -
+
+.. raw:: latex
+
+    \normalsize
+
+``V4L2_CID_STATELESS_HEVC_PPS (struct)``
+    Specifies the Picture Parameter Set fields (as extracted from the
+    bitstream) for the associated HEVC slice data.
+    These bitstream parameters are defined according to :ref:`hevc`.
+    They are described in section 7.4.3.3 "Picture parameter set RBSP
+    semantics" of the specification.
+
+.. c:type:: v4l2_ctrl_hevc_pps
+
+.. tabularcolumns:: |p{1.2cm}|p{8.6cm}|p{7.5cm}|
+
+.. cssclass:: longtable
+
+.. flat-table:: struct v4l2_ctrl_hevc_pps
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - __u8
+      - ``pic_parameter_set_id``
+      - Identifies the PPS for reference by other syntax elements.
+    * - __u8
+      - ``num_extra_slice_header_bits``
+      - Specifies the number of extra slice header bits that are present
+        in the slice header RBSP for coded pictures referring to the PPS.
+    * - __u8
+      - ``num_ref_idx_l0_default_active_minus1``
+      - This value plus 1 specifies the inferred value of num_ref_idx_l0_active_minus1.
+    * - __u8
+      - ``num_ref_idx_l1_default_active_minus1``
+      - This value plus 1 specifies the inferred value of num_ref_idx_l1_active_minus1.
+    * - __s8
+      - ``init_qp_minus26``
+      - This value plus 26 specifies the initial value of SliceQp Y for each slice
+        referring to the PPS.
+    * - __u8
+      - ``diff_cu_qp_delta_depth``
+      - Specifies the difference between the luma coding tree block size
+        and the minimum luma coding block size of coding units that
+        convey cu_qp_delta_abs and cu_qp_delta_sign_flag.
+    * - __s8
+      - ``pps_cb_qp_offset``
+      - Specifies the offsets to the luma quantization parameter Cb.
+    * - __s8
+      - ``pps_cr_qp_offset``
+      - Specifies the offsets to the luma quantization parameter Cr.
+    * - __u8
+      - ``num_tile_columns_minus1``
+      - This value plus 1 specifies the number of tile columns partitioning the picture.
+    * - __u8
+      - ``num_tile_rows_minus1``
+      - This value plus 1 specifies the number of tile rows partitioning the picture.
+    * - __u8
+      - ``column_width_minus1[20]``
+      - This value plus 1 specifies the width of the i-th tile column in units of
+        coding tree blocks.
+    * - __u8
+      - ``row_height_minus1[22]``
+      - This value plus 1 specifies the height of the i-th tile row in units of coding
+        tree blocks.
+    * - __s8
+      - ``pps_beta_offset_div2``
+      - Specifies the default deblocking parameter offsets for beta divided by 2.
+    * - __s8
+      - ``pps_tc_offset_div2``
+      - Specifies the default deblocking parameter offsets for tC divided by 2.
+    * - __u8
+      - ``log2_parallel_merge_level_minus2``
+      - This value plus 2 specifies the value of the variable Log2ParMrgLevel.
+    * - __u8
+      - ``padding[4]``
+      - Applications and drivers must set this to zero.
+    * - __u64
+      - ``flags``
+      - See :ref:`Picture Parameter Set Flags <hevc_pps_flags>`
+
+.. _hevc_pps_flags:
+
+``Picture Parameter Set Flags``
+
+.. raw:: latex
+
+    \small
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - ``V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED``
+      - 0x00000001
+      -
+    * - ``V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT``
+      - 0x00000002
+      -
+    * - ``V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED``
+      - 0x00000004
+      -
+    * - ``V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT``
+      - 0x00000008
+      -
+    * - ``V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED``
+      - 0x00000010
+      -
+    * - ``V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED``
+      - 0x00000020
+      -
+    * - ``V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED``
+      - 0x00000040
+      -
+    * - ``V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT``
+      - 0x00000080
+      -
+    * - ``V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED``
+      - 0x00000100
+      -
+    * - ``V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED``
+      - 0x00000200
+      -
+    * - ``V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED``
+      - 0x00000400
+      -
+    * - ``V4L2_HEVC_PPS_FLAG_TILES_ENABLED``
+      - 0x00000800
+      -
+    * - ``V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED``
+      - 0x00001000
+      -
+    * - ``V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED``
+      - 0x00002000
+      -
+    * - ``V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED``
+      - 0x00004000
+      -
+    * - ``V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED``
+      - 0x00008000
+      -
+    * - ``V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER``
+      - 0x00010000
+      -
+    * - ``V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT``
+      - 0x00020000
+      -
+    * - ``V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT``
+      - 0x00040000
+      -
+    * - ``V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT``
+      - 0x00080000
+      - Specifies the presence of deblocking filter control syntax elements in
+        the PPS
+    * - ``V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING``
+      - 0x00100000
+      - Specifies that tile column boundaries and likewise tile row boundaries
+        are distributed uniformly across the picture
+
+.. raw:: latex
+
+    \normalsize
+
+``V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (struct)``
+    Specifies various slice-specific parameters, especially from the NAL unit
+    header, general slice segment header and weighted prediction parameter
+    parts of the bitstream.
+    These bitstream parameters are defined according to :ref:`hevc`.
+    They are described in section 7.4.7 "General slice segment header
+    semantics" of the specification.
+    This control is a dynamically sized 1-dimensional array,
+    V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it.
+
+.. c:type:: v4l2_ctrl_hevc_slice_params
+
+.. raw:: latex
+
+    \scriptsize
+
+.. tabularcolumns:: |p{5.4cm}|p{6.8cm}|p{5.1cm}|
+
+.. cssclass:: longtable
+
+.. flat-table:: struct v4l2_ctrl_hevc_slice_params
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - __u32
+      - ``bit_size``
+      - Size (in bits) of the current slice data.
+    * - __u32
+      - ``data_byte_offset``
+      - Offset (in byte) to the video data in the current slice data.
+    * - __u32
+      - ``num_entry_point_offsets``
+      - Specifies the number of entry point offset syntax elements in the slice header.
+        When the driver supports it, the ``V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS``
+        must be set.
+    * - __u8
+      - ``nal_unit_type``
+      - Specifies the coding type of the slice (B, P or I).
+    * - __u8
+      - ``nuh_temporal_id_plus1``
+      - Minus 1 specifies a temporal identifier for the NAL unit.
+    * - __u8
+      - ``slice_type``
+      -
+	(V4L2_HEVC_SLICE_TYPE_I, V4L2_HEVC_SLICE_TYPE_P or
+	V4L2_HEVC_SLICE_TYPE_B).
+    * - __u8
+      - ``colour_plane_id``
+      - Specifies the colour plane associated with the current slice.
+    * - __s32
+      - ``slice_pic_order_cnt``
+      - Specifies the picture order count.
+    * - __u8
+      - ``num_ref_idx_l0_active_minus1``
+      - This value plus 1 specifies the maximum reference index for reference picture list 0
+        that may be used to decode the slice.
+    * - __u8
+      - ``num_ref_idx_l1_active_minus1``
+      - This value plus 1 specifies the maximum reference index for reference picture list 1
+        that may be used to decode the slice.
+    * - __u8
+      - ``collocated_ref_idx``
+      - Specifies the reference index of the collocated picture used for
+        temporal motion vector prediction.
+    * - __u8
+      - ``five_minus_max_num_merge_cand``
+      - Specifies the maximum number of merging motion vector prediction
+        candidates supported in the slice subtracted from 5.
+    * - __s8
+      - ``slice_qp_delta``
+      - Specifies the initial value of QpY to be used for the coding blocks in the slice.
+    * - __s8
+      - ``slice_cb_qp_offset``
+      - Specifies a difference to be added to the value of pps_cb_qp_offset.
+    * - __s8
+      - ``slice_cr_qp_offset``
+      - Specifies a difference to be added to the value of pps_cr_qp_offset.
+    * - __s8
+      - ``slice_act_y_qp_offset``
+      - Specifies the offset to the luma of quantization parameter qP derived in section 8.6.2
+    * - __s8
+      - ``slice_act_cb_qp_offset``
+      - Specifies the offset to the cb of quantization parameter qP derived in section 8.6.2
+    * - __s8
+      - ``slice_act_cr_qp_offset``
+      - Specifies the offset to the cr of quantization parameter qP derived in section 8.6.2
+    * - __s8
+      - ``slice_beta_offset_div2``
+      - Specifies the deblocking parameter offsets for beta divided by 2.
+    * - __s8
+      - ``slice_tc_offset_div2``
+      - Specifies the deblocking parameter offsets for tC divided by 2.
+    * - __u8
+      - ``pic_struct``
+      - Indicates whether a picture should be displayed as a frame or as one or more fields.
+    * - __u32
+      - ``slice_segment_addr``
+      - Specifies the address of the first coding tree block in the slice segment.
+    * - __u8
+      - ``ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
+      - The list of L0 reference elements as indices in the DPB.
+    * - __u8
+      - ``ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
+      - The list of L1 reference elements as indices in the DPB.
+    * - __u16
+      - ``short_term_ref_pic_set_size``
+      - Specifies the size, in bits, of the short-term reference picture set, described as st_ref_pic_set()
+        in the specification, included in the slice header or SPS (section 7.3.6.1).
+    * - __u16
+      - ``long_term_ref_pic_set_size``
+      - Specifies the size, in bits, of the long-term reference picture set include in the slice header
+        or SPS. It is the number of bits in the conditional block if(long_term_ref_pics_present_flag)
+        in section 7.3.6.1 of the specification.
+    * - __u8
+      - ``padding``
+      - Applications and drivers must set this to zero.
+    * - struct :c:type:`v4l2_hevc_pred_weight_table`
+      - ``pred_weight_table``
+      - The prediction weight coefficients for inter-picture prediction.
+    * - __u64
+      - ``flags``
+      - See :ref:`Slice Parameters Flags <hevc_slice_params_flags>`
+
+.. raw:: latex
+
+    \normalsize
+
+.. _hevc_slice_params_flags:
+
+``Slice Parameters Flags``
+
+.. raw:: latex
+
+    \scriptsize
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA``
+      - 0x00000001
+      -
+    * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA``
+      - 0x00000002
+      -
+    * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED``
+      - 0x00000004
+      -
+    * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO``
+      - 0x00000008
+      -
+    * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT``
+      - 0x00000010
+      -
+    * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0``
+      - 0x00000020
+      -
+    * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV``
+      - 0x00000040
+      -
+    * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED``
+      - 0x00000080
+      -
+    * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED``
+      - 0x00000100
+      -
+    * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT``
+      - 0x00000200
+      -
+
+.. raw:: latex
+
+    \normalsize
+
+``V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (integer)``
+    Specifies entry point offsets in bytes.
+    This control is a dynamically sized array. The number of entry point
+    offsets is reported by the ``elems`` field.
+    This bitstream parameter is defined according to :ref:`hevc`.
+    They are described in section 7.4.7.1 "General slice segment header
+    semantics" of the specification.
+    When multiple slices are submitted in a request, the length of
+    this array must be the sum of num_entry_point_offsets of all the
+    slices in the request.
+
+``V4L2_CID_STATELESS_HEVC_SCALING_MATRIX (struct)``
+    Specifies the HEVC scaling matrix parameters used for the scaling process
+    for transform coefficients.
+    These matrix and parameters are defined according to :ref:`hevc`.
+    They are described in section 7.4.5 "Scaling list data semantics" of
+    the specification.
+
+.. c:type:: v4l2_ctrl_hevc_scaling_matrix
+
+.. raw:: latex
+
+    \scriptsize
+
+.. tabularcolumns:: |p{5.4cm}|p{6.8cm}|p{5.1cm}|
+
+.. cssclass:: longtable
+
+.. flat-table:: struct v4l2_ctrl_hevc_scaling_matrix
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - __u8
+      - ``scaling_list_4x4[6][16]``
+      - Scaling list is used for the scaling process for transform
+        coefficients. The values on each scaling list are expected
+        in raster scan order.
+    * - __u8
+      - ``scaling_list_8x8[6][64]``
+      - Scaling list is used for the scaling process for transform
+        coefficients. The values on each scaling list are expected
+        in raster scan order.
+    * - __u8
+      - ``scaling_list_16x16[6][64]``
+      - Scaling list is used for the scaling process for transform
+        coefficients. The values on each scaling list are expected
+        in raster scan order.
+    * - __u8
+      - ``scaling_list_32x32[2][64]``
+      - Scaling list is used for the scaling process for transform
+        coefficients. The values on each scaling list are expected
+        in raster scan order.
+    * - __u8
+      - ``scaling_list_dc_coef_16x16[6]``
+      - Scaling list is used for the scaling process for transform
+        coefficients. The values on each scaling list are expected
+        in raster scan order.
+    * - __u8
+      - ``scaling_list_dc_coef_32x32[2]``
+      - Scaling list is used for the scaling process for transform
+        coefficients. The values on each scaling list are expected
+        in raster scan order.
+
+.. raw:: latex
+
+    \normalsize
+
+.. c:type:: v4l2_hevc_dpb_entry
+
+.. raw:: latex
+
+    \small
+
+.. tabularcolumns:: |p{1.0cm}|p{4.2cm}|p{12.1cm}|
+
+.. flat-table:: struct v4l2_hevc_dpb_entry
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - __u64
+      - ``timestamp``
+      - Timestamp of the V4L2 capture buffer to use as reference, used
+        with B-coded and P-coded frames. The timestamp refers to the
+	``timestamp`` field in struct :c:type:`v4l2_buffer`. Use the
+	:c:func:`v4l2_timeval_to_ns()` function to convert the struct
+	:c:type:`timeval` in struct :c:type:`v4l2_buffer` to a __u64.
+    * - __u8
+      - ``flags``
+      - Long term flag for the reference frame
+        (V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE). The flag is set as
+        described in the ITU HEVC specification chapter "8.3.2 Decoding
+        process for reference picture set".
+    * - __u8
+      - ``field_pic``
+      - Whether the reference is a field picture or a frame.
+        See :ref:`HEVC dpb field pic Flags <hevc_dpb_field_pic_flags>`
+    * - __s32
+      - ``pic_order_cnt_val``
+      - The picture order count of the current picture.
+    * - __u8
+      - ``padding[2]``
+      - Applications and drivers must set this to zero.
+
+.. raw:: latex
+
+    \normalsize
+
+.. _hevc_dpb_field_pic_flags:
+
+``HEVC dpb field pic Flags``
+
+.. raw:: latex
+
+    \scriptsize
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_FRAME``
+      - 0
+      - (progressive) Frame
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD``
+      - 1
+      - Top field
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD``
+      - 2
+      - Bottom field
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM``
+      - 3
+      - Top field, bottom field, in that order
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP``
+      - 4
+      - Bottom field, top field, in that order
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP``
+      - 5
+      - Top field, bottom field, top field repeated, in that order
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM``
+      - 6
+      - Bottom field, top field, bottom field repeated, in that order
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING``
+      - 7
+      - Frame doubling
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING``
+      - 8
+      - Frame tripling
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM``
+      - 9
+      - Top field paired with previous bottom field in output order
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP``
+      - 10
+      - Bottom field paired with previous top field in output order
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM``
+      - 11
+      - Top field paired with next bottom field in output order
+    * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP``
+      - 12
+      - Bottom field paired with next top field in output order
+
+.. c:type:: v4l2_hevc_pred_weight_table
+
+.. raw:: latex
+
+    \footnotesize
+
+.. tabularcolumns:: |p{0.8cm}|p{10.6cm}|p{5.9cm}|
+
+.. flat-table:: struct v4l2_hevc_pred_weight_table
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - __s8
+      - ``delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
+      - The difference of the weighting factor applied to the luma
+        prediction value for list 0.
+    * - __s8
+      - ``luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
+      - The additive offset applied to the luma prediction value for list 0.
+    * - __s8
+      - ``delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]``
+      - The difference of the weighting factor applied to the chroma
+        prediction value for list 0.
+    * - __s8
+      - ``chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]``
+      - The difference of the additive offset applied to the chroma
+        prediction values for list 0.
+    * - __s8
+      - ``delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
+      - The difference of the weighting factor applied to the luma
+        prediction value for list 1.
+    * - __s8
+      - ``luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
+      - The additive offset applied to the luma prediction value for list 1.
+    * - __s8
+      - ``delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]``
+      - The difference of the weighting factor applied to the chroma
+        prediction value for list 1.
+    * - __s8
+      - ``chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]``
+      - The difference of the additive offset applied to the chroma
+        prediction values for list 1.
+    * - __u8
+      - ``luma_log2_weight_denom``
+      - The base 2 logarithm of the denominator for all luma weighting
+        factors.
+    * - __s8
+      - ``delta_chroma_log2_weight_denom``
+      - The difference of the base 2 logarithm of the denominator for
+        all chroma weighting factors.
+    * - __u8
+      - ``padding[6]``
+      - Applications and drivers must set this to zero.
+
+.. raw:: latex
+
+    \normalsize
+
+``V4L2_CID_STATELESS_HEVC_DECODE_MODE (enum)``
+    Specifies the decoding mode to use. Currently exposes slice-based and
+    frame-based decoding but new modes might be added later on.
+    This control is used as a modifier for V4L2_PIX_FMT_HEVC_SLICE
+    pixel format. Applications that support V4L2_PIX_FMT_HEVC_SLICE
+    are required to set this control in order to specify the decoding mode
+    that is expected for the buffer.
+    Drivers may expose a single or multiple decoding modes, depending
+    on what they can support.
+
+.. c:type:: v4l2_stateless_hevc_decode_mode
+
+.. raw:: latex
+
+    \small
+
+.. tabularcolumns:: |p{9.4cm}|p{0.6cm}|p{7.3cm}|
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - ``V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED``
+      - 0
+      - Decoding is done at the slice granularity.
+        The OUTPUT buffer must contain a single slice.
+    * - ``V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED``
+      - 1
+      - Decoding is done at the frame granularity.
+        The OUTPUT buffer must contain all slices needed to decode the
+        frame.
+
+.. raw:: latex
+
+    \normalsize
+
+``V4L2_CID_STATELESS_HEVC_START_CODE (enum)``
+    Specifies the HEVC slice start code expected for each slice.
+    This control is used as a modifier for V4L2_PIX_FMT_HEVC_SLICE
+    pixel format. Applications that support V4L2_PIX_FMT_HEVC_SLICE
+    are required to set this control in order to specify the start code
+    that is expected for the buffer.
+    Drivers may expose a single or multiple start codes, depending
+    on what they can support.
+
+.. c:type:: v4l2_stateless_hevc_start_code
+
+.. tabularcolumns:: |p{9.2cm}|p{0.6cm}|p{7.5cm}|
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - ``V4L2_STATELESS_HEVC_START_CODE_NONE``
+      - 0
+      - Selecting this value specifies that HEVC slices are passed
+        to the driver without any start code. The bitstream data should be
+        according to :ref:`hevc` 7.3.1.1 General NAL unit syntax, hence
+        contains emulation prevention bytes when required.
+    * - ``V4L2_STATELESS_HEVC_START_CODE_ANNEX_B``
+      - 1
+      - Selecting this value specifies that HEVC slices are expected
+        to be prefixed by Annex B start codes. According to :ref:`hevc`
+        valid start codes can be 3-bytes 0x000001 or 4-bytes 0x00000001.
+
+.. raw:: latex
+
+    \normalsize
+
+``V4L2_CID_MPEG_VIDEO_BASELAYER_PRIORITY_ID (integer)``
+    Specifies a priority identifier for the NAL unit, which will be applied to
+    the base layer. By default this value is set to 0 for the base layer,
+    and the next layer will have the priority ID assigned as 1, 2, 3 and so on.
+    The video encoder can't decide the priority id to be applied to a layer,
+    so this has to come from client.
+    This is applicable to H264 and valid Range is from 0 to 63.
+    Source Rec. ITU-T H.264 (06/2019); G.7.4.1.1, G.8.8.1.
+
+``V4L2_CID_MPEG_VIDEO_LTR_COUNT (integer)``
+    Specifies the maximum number of Long Term Reference (LTR) frames at any
+    given time that the encoder can keep.
+    This is applicable to the H264 and HEVC encoders.
+
+``V4L2_CID_MPEG_VIDEO_FRAME_LTR_INDEX (integer)``
+    After setting this control the frame that will be queued next
+    will be marked as a Long Term Reference (LTR) frame
+    and given this LTR index which ranges from 0 to LTR_COUNT-1.
+    This is applicable to the H264 and HEVC encoders.
+    Source Rec. ITU-T H.264 (06/2019); Table 7.9
+
+``V4L2_CID_MPEG_VIDEO_USE_LTR_FRAMES (bitmask)``
+    Specifies the Long Term Reference (LTR) frame(s) to be used for
+    encoding the next frame queued after setting this control.
+    This provides a bitmask which consists of bits [0, LTR_COUNT-1].
+    This is applicable to the H264 and HEVC encoders.
+
+``V4L2_CID_STATELESS_HEVC_DECODE_PARAMS (struct)``
+    Specifies various decode parameters, especially the references picture order
+    count (POC) for all the lists (short, long, before, current, after) and the
+    number of entries for each of them.
+    These parameters are defined according to :ref:`hevc`.
+    They are described in section 8.3 "Slice decoding process" of the
+    specification.
+
+.. c:type:: v4l2_ctrl_hevc_decode_params
+
+.. cssclass:: longtable
+
+.. flat-table:: struct v4l2_ctrl_hevc_decode_params
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - __s32
+      - ``pic_order_cnt_val``
+      - PicOrderCntVal as described in section 8.3.1 "Decoding process
+        for picture order count" of the specification.
+    * - __u16
+      - ``short_term_ref_pic_set_size``
+      - Specifies the size, in bits, of the short-term reference picture set, of the first slice
+        described as st_ref_pic_set() in the specification, included in the slice header
+        or SPS (section 7.3.6.1).
+    * - __u16
+      - ``long_term_ref_pic_set_size``
+      - Specifies the size, in bits, of the long-term reference picture set, of the first slice
+        included in the slice header or SPS. It is the number of bits in the conditional block
+        if(long_term_ref_pics_present_flag) in section 7.3.6.1 of the specification.
+    * - __u8
+      - ``num_active_dpb_entries``
+      - The number of entries in ``dpb``.
+    * - __u8
+      - ``num_poc_st_curr_before``
+      - The number of reference pictures in the short-term set that come before
+        the current frame.
+    * - __u8
+      - ``num_poc_st_curr_after``
+      - The number of reference pictures in the short-term set that come after
+        the current frame.
+    * - __u8
+      - ``num_poc_lt_curr``
+      - The number of reference pictures in the long-term set.
+    * - __u8
+      - ``poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
+      - PocStCurrBefore as described in section 8.3.2 "Decoding process for reference
+        picture set": provides the index of the short term before references in DPB array.
+    * - __u8
+      - ``poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
+      - PocStCurrAfter as described in section 8.3.2 "Decoding process for reference
+        picture set": provides the index of the short term after references in DPB array.
+    * - __u8
+      - ``poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
+      - PocLtCurr as described in section 8.3.2 "Decoding process for reference
+        picture set": provides the index of the long term references in DPB array.
+    * - struct :c:type:`v4l2_hevc_dpb_entry`
+      - ``dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
+      - The decoded picture buffer, for meta-data about reference frames.
+    * - __u64
+      - ``flags``
+      - See :ref:`Decode Parameters Flags <hevc_decode_params_flags>`
+
+.. _hevc_decode_params_flags:
+
+``Decode Parameters Flags``
+
+.. cssclass:: longtable
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - ``V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC``
+      - 0x00000001
+      -
+    * - ``V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC``
+      - 0x00000002
+      -
+    * - ``V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR``
+      - 0x00000004
+      -
diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
index 6cf68588af1b..2a165ae063fb 100644
--- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
+++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
@@ -2658,909 +2658,3 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
     Indicates whether to generate SPS and PPS at every IDR. Setting it to 0
     disables generating SPS and PPS at every IDR. Setting it to one enables
     generating SPS and PPS at every IDR.
-
-.. _v4l2-mpeg-hevc:
-
-``V4L2_CID_STATELESS_HEVC_SPS (struct)``
-    Specifies the Sequence Parameter Set fields (as extracted from the
-    bitstream) for the associated HEVC slice data.
-    These bitstream parameters are defined according to :ref:`hevc`.
-    They are described in section 7.4.3.2 "Sequence parameter set RBSP
-    semantics" of the specification.
-
-.. c:type:: v4l2_ctrl_hevc_sps
-
-.. raw:: latex
-
-    \small
-
-.. tabularcolumns:: |p{1.2cm}|p{9.2cm}|p{6.9cm}|
-
-.. cssclass:: longtable
-
-.. flat-table:: struct v4l2_ctrl_hevc_sps
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-    * - __u8
-      - ``video_parameter_set_id``
-      - Specifies the value of the vps_video_parameter_set_id of the active VPS
-        as described in section "7.4.3.2.1 General sequence parameter set RBSP semantics"
-        of H.265 specifications.
-    * - __u8
-      - ``seq_parameter_set_id``
-      - Provides an identifier for the SPS for reference by other syntax elements
-        as described in section "7.4.3.2.1 General sequence parameter set RBSP semantics"
-        of H.265 specifications.
-    * - __u16
-      - ``pic_width_in_luma_samples``
-      - Specifies the width of each decoded picture in units of luma samples.
-    * - __u16
-      - ``pic_height_in_luma_samples``
-      - Specifies the height of each decoded picture in units of luma samples.
-    * - __u8
-      - ``bit_depth_luma_minus8``
-      - This value plus 8 specifies the bit depth of the samples of the luma array.
-    * - __u8
-      - ``bit_depth_chroma_minus8``
-      - This value plus 8 specifies the bit depth of the samples of the chroma arrays.
-    * - __u8
-      - ``log2_max_pic_order_cnt_lsb_minus4``
-      - This value plus 4 specifies the value of the variable MaxPicOrderCntLsb.
-    * - __u8
-      - ``sps_max_dec_pic_buffering_minus1``
-      - This value plus 1 specifies the maximum required size of the decoded picture buffer for
-        the codec video sequence.
-    * - __u8
-      - ``sps_max_num_reorder_pics``
-      - Indicates the maximum allowed number of pictures.
-    * - __u8
-      - ``sps_max_latency_increase_plus1``
-      - Not equal to 0 is used to compute the value of SpsMaxLatencyPictures array.
-    * - __u8
-      - ``log2_min_luma_coding_block_size_minus3``
-      - This value plus 3 specifies the minimum luma coding block size.
-    * - __u8
-      - ``log2_diff_max_min_luma_coding_block_size``
-      - Specifies the difference between the maximum and minimum luma coding block size.
-    * - __u8
-      - ``log2_min_luma_transform_block_size_minus2``
-      - This value plus 2 specifies the minimum luma transform block size.
-    * - __u8
-      - ``log2_diff_max_min_luma_transform_block_size``
-      - Specifies the difference between the maximum and minimum luma transform block size.
-    * - __u8
-      - ``max_transform_hierarchy_depth_inter``
-      - Specifies the maximum hierarchy depth for transform units of coding units coded
-        in inter prediction mode.
-    * - __u8
-      - ``max_transform_hierarchy_depth_intra``
-      - Specifies the maximum hierarchy depth for transform units of coding units coded in
-        intra prediction mode.
-    * - __u8
-      - ``pcm_sample_bit_depth_luma_minus1``
-      - This value plus 1 specifies the number of bits used to represent each of PCM sample
-        values of the luma component.
-    * - __u8
-      - ``pcm_sample_bit_depth_chroma_minus1``
-      - This value plus 1 specifies the number of bits used to represent each of PCM sample
-        values of the chroma components.
-    * - __u8
-      - ``log2_min_pcm_luma_coding_block_size_minus3``
-      - This value plus 3 specifies the minimum size of coding blocks.
-    * - __u8
-      - ``log2_diff_max_min_pcm_luma_coding_block_size``
-      - Specifies the difference between the maximum and minimum size of coding blocks.
-    * - __u8
-      - ``num_short_term_ref_pic_sets``
-      - Specifies the number of st_ref_pic_set() syntax structures included in the SPS.
-    * - __u8
-      - ``num_long_term_ref_pics_sps``
-      - Specifies the number of candidate long-term reference pictures that are
-        specified in the SPS.
-    * - __u8
-      - ``chroma_format_idc``
-      - Specifies the chroma sampling.
-    * - __u8
-      - ``sps_max_sub_layers_minus1``
-      - This value plus 1 specifies the maximum number of temporal sub-layers.
-    * - __u64
-      - ``flags``
-      - See :ref:`Sequence Parameter Set Flags <hevc_sps_flags>`
-
-.. raw:: latex
-
-    \normalsize
-
-.. _hevc_sps_flags:
-
-``Sequence Parameter Set Flags``
-
-.. raw:: latex
-
-    \small
-
-.. cssclass:: longtable
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-    * - ``V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE``
-      - 0x00000001
-      -
-    * - ``V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED``
-      - 0x00000002
-      -
-    * - ``V4L2_HEVC_SPS_FLAG_AMP_ENABLED``
-      - 0x00000004
-      -
-    * - ``V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET``
-      - 0x00000008
-      -
-    * - ``V4L2_HEVC_SPS_FLAG_PCM_ENABLED``
-      - 0x00000010
-      -
-    * - ``V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED``
-      - 0x00000020
-      -
-    * - ``V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT``
-      - 0x00000040
-      -
-    * - ``V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED``
-      - 0x00000080
-      -
-    * - ``V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED``
-      - 0x00000100
-      -
-
-.. raw:: latex
-
-    \normalsize
-
-``V4L2_CID_STATELESS_HEVC_PPS (struct)``
-    Specifies the Picture Parameter Set fields (as extracted from the
-    bitstream) for the associated HEVC slice data.
-    These bitstream parameters are defined according to :ref:`hevc`.
-    They are described in section 7.4.3.3 "Picture parameter set RBSP
-    semantics" of the specification.
-
-.. c:type:: v4l2_ctrl_hevc_pps
-
-.. tabularcolumns:: |p{1.2cm}|p{8.6cm}|p{7.5cm}|
-
-.. cssclass:: longtable
-
-.. flat-table:: struct v4l2_ctrl_hevc_pps
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-    * - __u8
-      - ``pic_parameter_set_id``
-      - Identifies the PPS for reference by other syntax elements.
-    * - __u8
-      - ``num_extra_slice_header_bits``
-      - Specifies the number of extra slice header bits that are present
-        in the slice header RBSP for coded pictures referring to the PPS.
-    * - __u8
-      - ``num_ref_idx_l0_default_active_minus1``
-      - This value plus 1 specifies the inferred value of num_ref_idx_l0_active_minus1.
-    * - __u8
-      - ``num_ref_idx_l1_default_active_minus1``
-      - This value plus 1 specifies the inferred value of num_ref_idx_l1_active_minus1.
-    * - __s8
-      - ``init_qp_minus26``
-      - This value plus 26 specifies the initial value of SliceQp Y for each slice
-        referring to the PPS.
-    * - __u8
-      - ``diff_cu_qp_delta_depth``
-      - Specifies the difference between the luma coding tree block size
-        and the minimum luma coding block size of coding units that
-        convey cu_qp_delta_abs and cu_qp_delta_sign_flag.
-    * - __s8
-      - ``pps_cb_qp_offset``
-      - Specifies the offsets to the luma quantization parameter Cb.
-    * - __s8
-      - ``pps_cr_qp_offset``
-      - Specifies the offsets to the luma quantization parameter Cr.
-    * - __u8
-      - ``num_tile_columns_minus1``
-      - This value plus 1 specifies the number of tile columns partitioning the picture.
-    * - __u8
-      - ``num_tile_rows_minus1``
-      - This value plus 1 specifies the number of tile rows partitioning the picture.
-    * - __u8
-      - ``column_width_minus1[20]``
-      - Plus 1 specifies the width of each tile column in units of
-        coding tree blocks.
-    * - __u8
-      - ``row_height_minus1[22]``
-      - This value plus 1 specifies the height of each tile row in units of coding
-        tree blocks.
-    * - __s8
-      - ``pps_beta_offset_div2``
-      - Specifies the default deblocking parameter offsets for beta divided by 2.
-    * - __s8
-      - ``pps_tc_offset_div2``
-      - Specifies the default deblocking parameter offsets for tC divided by 2.
-    * - __u8
-      - ``log2_parallel_merge_level_minus2``
-      - Plus 2 specifies the value of the variable Log2ParMrgLevel.
-    * - __u8
-      - ``padding[4]``
-      - Applications and drivers must set this to zero.
-    * - __u64
-      - ``flags``
-      - See :ref:`Picture Parameter Set Flags <hevc_pps_flags>`
-
-.. _hevc_pps_flags:
-
-``Picture Parameter Set Flags``
-
-.. raw:: latex
-
-    \small
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-    * - ``V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED``
-      - 0x00000001
-      -
-    * - ``V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT``
-      - 0x00000002
-      -
-    * - ``V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED``
-      - 0x00000004
-      -
-    * - ``V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT``
-      - 0x00000008
-      -
-    * - ``V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED``
-      - 0x00000010
-      -
-    * - ``V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED``
-      - 0x00000020
-      -
-    * - ``V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED``
-      - 0x00000040
-      -
-    * - ``V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT``
-      - 0x00000080
-      -
-    * - ``V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED``
-      - 0x00000100
-      -
-    * - ``V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED``
-      - 0x00000200
-      -
-    * - ``V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED``
-      - 0x00000400
-      -
-    * - ``V4L2_HEVC_PPS_FLAG_TILES_ENABLED``
-      - 0x00000800
-      -
-    * - ``V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED``
-      - 0x00001000
-      -
-    * - ``V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED``
-      - 0x00002000
-      -
-    * - ``V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED``
-      - 0x00004000
-      -
-    * - ``V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED``
-      - 0x00008000
-      -
-    * - ``V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER``
-      - 0x00010000
-      -
-    * - ``V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT``
-      - 0x00020000
-      -
-    * - ``V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT``
-      - 0x00040000
-      -
-    * - ``V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT``
-      - 0x00080000
-      - Specifies the presence of deblocking filter control syntax elements in
-        the PPS
-    * - ``V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING``
-      - 0x00100000
-      - Specifies that tile column boundaries and likewise tile row boundaries
-        are distributed uniformly across the picture
-
-.. raw:: latex
-
-    \normalsize
-
-``V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (struct)``
-    Specifies various slice-specific parameters, especially from the NAL unit
-    header, general slice segment header and weighted prediction parameter
-    parts of the bitstream.
-    These bitstream parameters are defined according to :ref:`hevc`.
-    They are described in section 7.4.7 "General slice segment header
-    semantics" of the specification.
-    This control is a dynamically sized 1-dimensional array,
-    V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it.
-
-.. c:type:: v4l2_ctrl_hevc_slice_params
-
-.. raw:: latex
-
-    \scriptsize
-
-.. tabularcolumns:: |p{5.4cm}|p{6.8cm}|p{5.1cm}|
-
-.. cssclass:: longtable
-
-.. flat-table:: struct v4l2_ctrl_hevc_slice_params
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-    * - __u32
-      - ``bit_size``
-      - Size (in bits) of the current slice data.
-    * - __u32
-      - ``data_byte_offset``
-      - Offset (in bytes) to the video data in the current slice data.
-    * - __u32
-      - ``num_entry_point_offsets``
-      - Specifies the number of entry point offset syntax elements in the slice header.
-        When the driver supports it, the ``V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS``
-        must be set.
-    * - __u8
-      - ``nal_unit_type``
-      - Specifies the coding type of the slice (B, P or I).
-    * - __u8
-      - ``nuh_temporal_id_plus1``
-      - This value minus 1 specifies a temporal identifier for the NAL unit.
-    * - __u8
-      - ``slice_type``
-      -
-	(V4L2_HEVC_SLICE_TYPE_I, V4L2_HEVC_SLICE_TYPE_P or
-	V4L2_HEVC_SLICE_TYPE_B).
-    * - __u8
-      - ``colour_plane_id``
-      - Specifies the colour plane associated with the current slice.
-    * - __s32
-      - ``slice_pic_order_cnt``
-      - Specifies the picture order count.
-    * - __u8
-      - ``num_ref_idx_l0_active_minus1``
-      - This value plus 1 specifies the maximum reference index for
-        reference picture list 0 that may be used to decode the slice.
-    * - __u8
-      - ``num_ref_idx_l1_active_minus1``
-      - This value plus 1 specifies the maximum reference index for
-        reference picture list 1 that may be used to decode the slice.
-    * - __u8
-      - ``collocated_ref_idx``
-      - Specifies the reference index of the collocated picture used for
-        temporal motion vector prediction.
-    * - __u8
-      - ``five_minus_max_num_merge_cand``
-      - Specifies the maximum number of merging motion vector prediction
-        candidates supported in the slice subtracted from 5.
-    * - __s8
-      - ``slice_qp_delta``
-      - Specifies the initial value of QpY to be used for the coding blocks in the slice.
-    * - __s8
-      - ``slice_cb_qp_offset``
-      - Specifies a difference to be added to the value of pps_cb_qp_offset.
-    * - __s8
-      - ``slice_cr_qp_offset``
-      - Specifies a difference to be added to the value of pps_cr_qp_offset.
-    * - __s8
-      - ``slice_act_y_qp_offset``
-      - Screen content extension parameters.
-    * - __s8
-      - ``slice_act_cb_qp_offset``
-      - Screen content extension parameters.
-    * - __s8
-      - ``slice_act_cr_qp_offset``
-      - Screen content extension parameters.
-    * - __s8
-      - ``slice_beta_offset_div2``
-      - Specifies the deblocking parameter offsets for beta divided by 2.
-    * - __s8
-      - ``slice_tc_offset_div2``
-      - Specifies the deblocking parameter offsets for tC divided by 2.
-    * - __u8
-      - ``pic_struct``
-      - Indicates whether a picture should be displayed as a frame or as one or more fields.
-    * - __u32
-      - ``slice_segment_addr``
-      - Specifies the address of the first coding tree block in the slice segment.
-    * - __u8
-      - ``ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
-      - The list of L0 reference elements as indices in the DPB.
-    * - __u8
-      - ``ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
-      - The list of L1 reference elements as indices in the DPB.
-    * - __u16
-      - ``short_term_ref_pic_set_size``
-      - Specifies the size, in bits, of the short-term reference picture set, described as st_ref_pic_set()
-        in the specification, included in the slice header or SPS (section 7.3.6.1).
-    * - __u16
-      - ``long_term_ref_pic_set_size``
-      - Specifies the size, in bits, of the long-term reference picture set include in the slice header
-        or SPS. It is the number of bits in the conditional block if(long_term_ref_pics_present_flag)
-        in section 7.3.6.1 of the specification.
-    * - __u8
-      - ``padding``
-      - Applications and drivers must set this to zero.
-    * - struct :c:type:`v4l2_hevc_pred_weight_table`
-      - ``pred_weight_table``
-      - The prediction weight coefficients for inter-picture prediction.
-    * - __u64
-      - ``flags``
-      - See :ref:`Slice Parameters Flags <hevc_slice_params_flags>`
-
-.. raw:: latex
-
-    \normalsize
-
-.. _hevc_slice_params_flags:
-
-``Slice Parameters Flags``
-
-.. raw:: latex
-
-    \scriptsize
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-    * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA``
-      - 0x00000001
-      -
-    * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA``
-      - 0x00000002
-      -
-    * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED``
-      - 0x00000004
-      -
-    * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO``
-      - 0x00000008
-      -
-    * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT``
-      - 0x00000010
-      -
-    * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0``
-      - 0x00000020
-      -
-    * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV``
-      - 0x00000040
-      -
-    * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED``
-      - 0x00000080
-      -
-    * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED``
-      - 0x00000100
-      -
-    * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT``
-      - 0x00000200
-      -
-
-.. raw:: latex
-
-    \normalsize
-
-``V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (integer)``
-    Specifies entry point offsets in bytes.
-    This control is a dynamically sized array. The number of entry point
-    offsets is reported by the ``elems`` field.
-    This bitstream parameter is defined according to :ref:`hevc`.
-    They are described in section 7.4.7.1 "General slice segment header
-    semantics" of the specification.
-    When multiple slices are submitted in a request, the length of
-    this array must be the sum of num_entry_point_offsets of all the
-    slices in the request.
-
-``V4L2_CID_STATELESS_HEVC_SCALING_MATRIX (struct)``
-    Specifies the HEVC scaling matrix parameters used for the scaling process
-    for transform coefficients.
-    These matrix and parameters are defined according to :ref:`hevc`.
-    They are described in section 7.4.5 "Scaling list data semantics" of
-    the specification.
-
-.. c:type:: v4l2_ctrl_hevc_scaling_matrix
-
-.. raw:: latex
-
-    \scriptsize
-
-.. tabularcolumns:: |p{5.4cm}|p{6.8cm}|p{5.1cm}|
-
-.. cssclass:: longtable
-
-.. flat-table:: struct v4l2_ctrl_hevc_scaling_matrix
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-    * - __u8
-      - ``scaling_list_4x4[6][16]``
-      - Scaling list is used for the scaling process for transform
-        coefficients. The values on each scaling list are expected
-        in raster scan order.
-    * - __u8
-      - ``scaling_list_8x8[6][64]``
-      - Scaling list is used for the scaling process for transform
-        coefficients. The values on each scaling list are expected
-        in raster scan order.
-    * - __u8
-      - ``scaling_list_16x16[6][64]``
-      - Scaling list is used for the scaling process for transform
-        coefficients. The values on each scaling list are expected
-        in raster scan order.
-    * - __u8
-      - ``scaling_list_32x32[2][64]``
-      - Scaling list is used for the scaling process for transform
-        coefficients. The values on each scaling list are expected
-        in raster scan order.
-    * - __u8
-      - ``scaling_list_dc_coef_16x16[6]``
-      - Scaling list is used for the scaling process for transform
-        coefficients. The values on each scaling list are expected
-        in raster scan order.
-    * - __u8
-      - ``scaling_list_dc_coef_32x32[2]``
-      - Scaling list is used for the scaling process for transform
-        coefficients. The values on each scaling list are expected
-        in raster scan order.
-
-.. raw:: latex
-
-    \normalsize
-
-.. c:type:: v4l2_hevc_dpb_entry
-
-.. raw:: latex
-
-    \small
-
-.. tabularcolumns:: |p{1.0cm}|p{4.2cm}|p{12.1cm}|
-
-.. flat-table:: struct v4l2_hevc_dpb_entry
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-    * - __u64
-      - ``timestamp``
-      - Timestamp of the V4L2 capture buffer to use as reference, used
-        with B-coded and P-coded frames. The timestamp refers to the
-	``timestamp`` field in struct :c:type:`v4l2_buffer`. Use the
-	:c:func:`v4l2_timeval_to_ns()` function to convert the struct
-	:c:type:`timeval` in struct :c:type:`v4l2_buffer` to a __u64.
-    * - __u8
-      - ``flags``
-      - Long term flag for the reference frame
-        (V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE). The flag is set as
-        described in the ITU HEVC specification chapter "8.3.2 Decoding
-        process for reference picture set".
-    * - __u8
-      - ``field_pic``
-      - Whether the reference is a field picture or a frame.
-        See :ref:`HEVC dpb field pic Flags <hevc_dpb_field_pic_flags>`
-    * - __s32
-      - ``pic_order_cnt_val``
-      - The picture order count of the current picture.
-    * - __u8
-      - ``padding[2]``
-      - Applications and drivers must set this to zero.
-
-.. raw:: latex
-
-    \normalsize
-
-.. _hevc_dpb_field_pic_flags:
-
-``HEVC dpb field pic Flags``
-
-.. raw:: latex
-
-    \scriptsize
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-    * - ``V4L2_HEVC_SEI_PIC_STRUCT_FRAME``
-      - 0
-      - (progressive) Frame
-    * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD``
-      - 1
-      - Top field
-    * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD``
-      - 2
-      - Bottom field
-    * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM``
-      - 3
-      - Top field, bottom field, in that order
-    * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP``
-      - 4
-      - Bottom field, top field, in that order
-    * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP``
-      - 5
-      - Top field, bottom field, top field repeated, in that order
-    * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM``
-      - 6
-      - Bottom field, top field, bottom field repeated, in that order
-    * - ``V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING``
-      - 7
-      - Frame doubling
-    * - ``V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING``
-      - 8
-      - Frame tripling
-    * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM``
-      - 9
-      - Top field paired with previous bottom field in output order
-    * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP``
-      - 10
-      - Bottom field paired with previous top field in output order
-    * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM``
-      - 11
-      - Top field paired with next bottom field in output order
-    * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP``
-      - 12
-      - Bottom field paired with next top field in output order
-
-.. c:type:: v4l2_hevc_pred_weight_table
-
-.. raw:: latex
-
-    \footnotesize
-
-.. tabularcolumns:: |p{0.8cm}|p{10.6cm}|p{5.9cm}|
-
-.. flat-table:: struct v4l2_hevc_pred_weight_table
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-    * - __s8
-      - ``delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
-      - The difference of the weighting factor applied to the luma
-        prediction value for list 0.
-    * - __s8
-      - ``luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
-      - The additive offset applied to the luma prediction value for list 0.
-    * - __s8
-      - ``delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]``
-      - The difference of the weighting factor applied to the chroma
-        prediction value for list 0.
-    * - __s8
-      - ``chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]``
-      - The difference of the additive offset applied to the chroma
-        prediction values for list 0.
-    * - __s8
-      - ``delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
-      - The difference of the weighting factor applied to the luma
-        prediction value for list 1.
-    * - __s8
-      - ``luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
-      - The additive offset applied to the luma prediction value for list 1.
-    * - __s8
-      - ``delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]``
-      - The difference of the weighting factor applied to the chroma
-        prediction value for list 1.
-    * - __s8
-      - ``chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]``
-      - The difference of the additive offset applied to the chroma
-        prediction values for list 1.
-    * - __u8
-      - ``luma_log2_weight_denom``
-      - The base 2 logarithm of the denominator for all luma weighting
-        factors.
-    * - __s8
-      - ``delta_chroma_log2_weight_denom``
-      - The difference of the base 2 logarithm of the denominator for
-        all chroma weighting factors.
-    * - __u8
-      - ``padding[6]``
-      - Applications and drivers must set this to zero.
-
-.. raw:: latex
-
-    \normalsize
-
-``V4L2_CID_STATELESS_HEVC_DECODE_MODE (enum)``
-    Specifies the decoding mode to use. Currently exposes slice-based and
-    frame-based decoding but new modes might be added later on.
-    This control is used as a modifier for V4L2_PIX_FMT_HEVC_SLICE
-    pixel format. Applications that support V4L2_PIX_FMT_HEVC_SLICE
-    are required to set this control in order to specify the decoding mode
-    that is expected for the buffer.
-    Drivers may expose a single or multiple decoding modes, depending
-    on what they can support.
-
-    .. note::
-
-       This menu control is not yet part of the public kernel API and
-       it is expected to change.
-
-.. c:type:: v4l2_stateless_hevc_decode_mode
-
-.. raw:: latex
-
-    \small
-
-.. tabularcolumns:: |p{9.4cm}|p{0.6cm}|p{7.3cm}|
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-    * - ``V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED``
-      - 0
-      - Decoding is done at the slice granularity.
-        The OUTPUT buffer must contain a single slice.
-    * - ``V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED``
-      - 1
-      - Decoding is done at the frame granularity.
-        The OUTPUT buffer must contain all slices needed to decode the
-        frame. The OUTPUT buffer must also contain both fields.
-
-.. raw:: latex
-
-    \normalsize
-
-``V4L2_CID_STATELESS_HEVC_START_CODE (enum)``
-    Specifies the HEVC slice start code expected for each slice.
-    This control is used as a modifier for V4L2_PIX_FMT_HEVC_SLICE
-    pixel format. Applications that support V4L2_PIX_FMT_HEVC_SLICE
-    are required to set this control in order to specify the start code
-    that is expected for the buffer.
-    Drivers may expose a single or multiple start codes, depending
-    on what they can support.
-
-    .. note::
-
-       This menu control is not yet part of the public kernel API and
-       it is expected to change.
-
-.. c:type:: v4l2_stateless_hevc_start_code
-
-.. tabularcolumns:: |p{9.2cm}|p{0.6cm}|p{7.5cm}|
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-    * - ``V4L2_STATELESS_HEVC_START_CODE_NONE``
-      - 0
-      - Selecting this value specifies that HEVC slices are passed
-        to the driver without any start code. The bitstream data should be
-        according to :ref:`hevc` 7.3.1.1 General NAL unit syntax, hence
-        contains emulation prevention bytes when required.
-    * - ``V4L2_STATELESS_HEVC_START_CODE_ANNEX_B``
-      - 1
-      - Selecting this value specifies that HEVC slices are expected
-        to be prefixed by Annex B start codes. According to :ref:`hevc`
-        valid start codes can be 3-bytes 0x000001 or 4-bytes 0x00000001.
-
-``V4L2_CID_MPEG_VIDEO_BASELAYER_PRIORITY_ID (integer)``
-    Specifies a priority identifier for the NAL unit, which will be applied to
-    the base layer. By default this value is set to 0 for the base layer,
-    and the next layer will have the priority ID assigned as 1, 2, 3 and so on.
-    The video encoder can't decide the priority id to be applied to a layer,
-    so this has to come from client.
-    This is applicable to H264 and valid Range is from 0 to 63.
-    Source Rec. ITU-T H.264 (06/2019); G.7.4.1.1, G.8.8.1.
-
-``V4L2_CID_MPEG_VIDEO_LTR_COUNT (integer)``
-    Specifies the maximum number of Long Term Reference (LTR) frames at any
-    given time that the encoder can keep.
-    This is applicable to the H264 and HEVC encoders.
-
-``V4L2_CID_MPEG_VIDEO_FRAME_LTR_INDEX (integer)``
-    After setting this control the frame that will be queued next
-    will be marked as a Long Term Reference (LTR) frame
-    and given this LTR index which ranges from 0 to LTR_COUNT-1.
-    This is applicable to the H264 and HEVC encoders.
-    Source Rec. ITU-T H.264 (06/2019); Table 7.9
-
-``V4L2_CID_MPEG_VIDEO_USE_LTR_FRAMES (bitmask)``
-    Specifies the Long Term Reference (LTR) frame(s) to be used for
-    encoding the next frame queued after setting this control.
-    This provides a bitmask which consists of bits [0, LTR_COUNT-1].
-    This is applicable to the H264 and HEVC encoders.
-
-``V4L2_CID_STATELESS_HEVC_DECODE_PARAMS (struct)``
-    Specifies various decode parameters, especially the references picture order
-    count (POC) for all the lists (short, long, before, current, after) and the
-    number of entries for each of them.
-    These parameters are defined according to :ref:`hevc`.
-    They are described in section 8.3 "Slice decoding process" of the
-    specification.
-
-.. c:type:: v4l2_ctrl_hevc_decode_params
-
-.. cssclass:: longtable
-
-.. flat-table:: struct v4l2_ctrl_hevc_decode_params
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-    * - __s32
-      - ``pic_order_cnt_val``
-      - PicOrderCntVal as described in section 8.3.1 "Decoding process
-        for picture order count" of the specification.
-    * - __u16
-      - ``short_term_ref_pic_set_size``
-      - Specifies the size, in bits, of the short-term reference picture set, of the first slice
-        described as st_ref_pic_set() in the specification, included in the slice header
-        or SPS (section 7.3.6.1).
-    * - __u16
-      - ``long_term_ref_pic_set_size``
-      - Specifies the size, in bits, of the long-term reference picture set, of the first slice
-        included in the slice header or SPS. It is the number of bits in the conditional block
-        if(long_term_ref_pics_present_flag) in section 7.3.6.1 of the specification.
-    * - __u8
-      - ``num_active_dpb_entries``
-      - The number of entries in ``dpb``.
-    * - __u8
-      - ``num_poc_st_curr_before``
-      - The number of reference pictures in the short-term set that come before
-        the current frame.
-    * - __u8
-      - ``num_poc_st_curr_after``
-      - The number of reference pictures in the short-term set that come after
-        the current frame.
-    * - __u8
-      - ``num_poc_lt_curr``
-      - The number of reference pictures in the long-term set.
-    * - __u8
-      - ``poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
-      - PocStCurrBefore as described in section 8.3.2 "Decoding process for reference
-        picture set": provides the index of the short term before references in DPB array.
-    * - __u8
-      - ``poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
-      - PocStCurrAfter as described in section 8.3.2 "Decoding process for reference
-        picture set": provides the index of the short term after references in DPB array.
-    * - __u8
-      - ``poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
-      - PocLtCurr as described in section 8.3.2 "Decoding process for reference
-        picture set": provides the index of the long term references in DPB array.
-    * - struct :c:type:`v4l2_hevc_dpb_entry`
-      - ``dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]``
-      - The decoded picture buffer, for meta-data about reference frames.
-    * - __u64
-      - ``flags``
-      - See :ref:`Decode Parameters Flags <hevc_decode_params_flags>`
-
-.. _hevc_decode_params_flags:
-
-``Decode Parameters Flags``
-
-.. cssclass:: longtable
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-    * - ``V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC``
-      - 0x00000001
-      -
-    * - ``V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC``
-      - 0x00000002
-      -
-    * - ``V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR``
-      - 0x00000004
-      -
diff --git a/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst b/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst
index c352d91a73d8..506dd3c98884 100644
--- a/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst
+++ b/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst
@@ -212,7 +212,7 @@ Compressed Formats
         ``V4L2_CID_MPEG_VIDEO_HEVC_SPS``,
         ``V4L2_CID_MPEG_VIDEO_HEVC_PPS``, and
         ``V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS``.
-	See the :ref:`associated Codec Control IDs <v4l2-mpeg-hevc>`.
+	See the :ref:`associated Codec Control IDs <v4l2-codec-stateless-hevc>`.
 	Buffers associated with this pixel format must contain the appropriate
 	number of macroblocks to decode a full corresponding frame.
     * .. _V4L2-PIX-FMT-FWHT:
diff --git a/Documentation/userspace-api/media/v4l/vidioc-g-ext-ctrls.rst b/Documentation/userspace-api/media/v4l/vidioc-g-ext-ctrls.rst
index 29971a45a2d4..892cfeb8b988 100644
--- a/Documentation/userspace-api/media/v4l/vidioc-g-ext-ctrls.rst
+++ b/Documentation/userspace-api/media/v4l/vidioc-g-ext-ctrls.rst
@@ -249,6 +249,26 @@ still cause this situation.
       - ``p_hdr10_mastering``
       - A pointer to a struct :c:type:`v4l2_ctrl_hdr10_mastering_display`. Valid if this control is
         of type ``V4L2_CTRL_TYPE_HDR10_MASTERING_DISPLAY``.
+    * - struct :c:type:`v4l2_ctrl_hevc_sps` *
+      - ``p_hevc_sps``
+      - A pointer to a struct :c:type:`v4l2_ctrl_hevc_sps`. Valid if this
+        control is of type ``V4L2_CTRL_TYPE_HEVC_SPS``.
+    * - struct :c:type:`v4l2_ctrl_hevc_pps` *
+      - ``p_hevc_pps``
+      - A pointer to a struct :c:type:`v4l2_ctrl_hevc_pps`. Valid if this
+        control is of type ``V4L2_CTRL_TYPE_HEVC_PPS``.
+    * - struct :c:type:`v4l2_ctrl_hevc_slice_params` *
+      - ``p_hevc_slice_params``
+      - A pointer to a struct :c:type:`v4l2_ctrl_hevc_slice_params`. Valid if this
+        control is of type ``V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS``.
+    * - struct :c:type:`v4l2_ctrl_hevc_scaling_matrix` *
+      - ``p_hevc_scaling_matrix``
+      - A pointer to a struct :c:type:`v4l2_ctrl_hevc_scaling_matrix`. Valid if this
+        control is of type ``V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX``.
+    * - struct :c:type:`v4l2_ctrl_hevc_decode_params` *
+      - ``p_hevc_decode_params``
+      - A pointer to a struct :c:type:`v4l2_ctrl_hevc_decode_params`. Valid if this
+        control is of type ``V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS``.
     * - void *
       - ``ptr``
       - A pointer to a compound type which can be an N-dimensional array
diff --git a/include/media/hevc-ctrls.h b/include/media/hevc-ctrls.h
deleted file mode 100644
index c89029b3c5da..000000000000
--- a/include/media/hevc-ctrls.h
+++ /dev/null
@@ -1,474 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * These are the HEVC state controls for use with stateless HEVC
- * codec drivers.
- *
- * It turns out that these structs are not stable yet and will undergo
- * more changes. So keep them private until they are stable and ready to
- * become part of the official public API.
- */
-
-#ifndef _HEVC_CTRLS_H_
-#define _HEVC_CTRLS_H_
-
-#include <linux/videodev2.h>
-
-#define V4L2_CID_STATELESS_HEVC_SPS		(V4L2_CID_CODEC_BASE + 1008)
-#define V4L2_CID_STATELESS_HEVC_PPS		(V4L2_CID_CODEC_BASE + 1009)
-#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS	(V4L2_CID_CODEC_BASE + 1010)
-#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX	(V4L2_CID_CODEC_BASE + 1011)
-#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS	(V4L2_CID_CODEC_BASE + 1012)
-#define V4L2_CID_STATELESS_HEVC_DECODE_MODE	(V4L2_CID_CODEC_BASE + 1015)
-#define V4L2_CID_STATELESS_HEVC_START_CODE	(V4L2_CID_CODEC_BASE + 1016)
-#define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_BASE + 1017)
-
-enum v4l2_stateless_hevc_decode_mode {
-	V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED,
-	V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED,
-};
-
-enum v4l2_stateless_hevc_start_code {
-	V4L2_STATELESS_HEVC_START_CODE_NONE,
-	V4L2_STATELESS_HEVC_START_CODE_ANNEX_B,
-};
-
-#define V4L2_HEVC_SLICE_TYPE_B	0
-#define V4L2_HEVC_SLICE_TYPE_P	1
-#define V4L2_HEVC_SLICE_TYPE_I	2
-
-#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE		(1ULL << 0)
-#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED			(1ULL << 1)
-#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED				(1ULL << 2)
-#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET		(1ULL << 3)
-#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED				(1ULL << 4)
-#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED		(1ULL << 5)
-#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT		(1ULL << 6)
-#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED		(1ULL << 7)
-#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED	(1ULL << 8)
-
-/**
- * struct v4l2_ctrl_hevc_sps - ITU-T Rec. H.265: Sequence parameter set
- *
- * @video_parameter_set_id: specifies the value of the
- *			    vps_video_parameter_set_id of the active VPS
- * @seq_parameter_set_id: provides an identifier for the SPS for
- *			  reference by other syntax elements
- * @pic_width_in_luma_samples: specifies the width of each decoded picture
- *			       in units of luma samples
- * @pic_height_in_luma_samples: specifies the height of each decoded picture
- *				in units of luma samples
- * @bit_depth_luma_minus8: this value plus 8 specifies the bit depth of the
- *                         samples of the luma array
- * @bit_depth_chroma_minus8: this value plus 8 specifies the bit depth of the
- *                           samples of the chroma arrays
- * @log2_max_pic_order_cnt_lsb_minus4: this value plus 4 specifies the value
- *                                     of the variable MaxPicOrderCntLsb
- * @sps_max_dec_pic_buffering_minus1: this value plus 1 specifies the maximum
- *                                    required size of the decoded picture
- *                                    buffer for the codec video sequence
- * @sps_max_num_reorder_pics: indicates the maximum allowed number of pictures
- * @sps_max_latency_increase_plus1: not equal to 0 is used to compute the
- *				    value of SpsMaxLatencyPictures array
- * @log2_min_luma_coding_block_size_minus3: this value plus 3 specifies the
- *                                          minimum luma coding block size
- * @log2_diff_max_min_luma_coding_block_size: specifies the difference between
- *					      the maximum and minimum luma
- *					      coding block size
- * @log2_min_luma_transform_block_size_minus2: this value plus 2 specifies the
- *                                             minimum luma transform block size
- * @log2_diff_max_min_luma_transform_block_size: specifies the difference between
- *						 the maximum and minimum luma
- *						 transform block size
- * @max_transform_hierarchy_depth_inter: specifies the maximum hierarchy
- *					 depth for transform units of
- *					 coding units coded in inter
- *					 prediction mode
- * @max_transform_hierarchy_depth_intra: specifies the maximum hierarchy
- *					 depth for transform units of
- *					 coding units coded in intra
- *					 prediction mode
- * @pcm_sample_bit_depth_luma_minus1: this value plus 1 specifies the number of
- *                                    bits used to represent each of PCM sample
- *                                    values of the luma component
- * @pcm_sample_bit_depth_chroma_minus1: this value plus 1 specifies the number
- *                                      of bits used to represent each of PCM
- *                                      sample values of the chroma components
- * @log2_min_pcm_luma_coding_block_size_minus3: this value plus 3 specifies the
- *                                              minimum size of coding blocks
- * @log2_diff_max_min_pcm_luma_coding_block_size: specifies the difference between
- *						  the maximum and minimum size of
- *						  coding blocks
- * @num_short_term_ref_pic_sets: specifies the number of st_ref_pic_set()
- *				 syntax structures included in the SPS
- * @num_long_term_ref_pics_sps:	specifies the number of candidate long-term
- *				reference pictures that are specified in the SPS
- * @chroma_format_idc: specifies the chroma sampling
- * @sps_max_sub_layers_minus1: this value plus 1 specifies the maximum number
- *                             of temporal sub-layers
- * @reserved: padding field. Should be zeroed by applications.
- * @flags: see V4L2_HEVC_SPS_FLAG_{}
- */
-struct v4l2_ctrl_hevc_sps {
-	__u8	video_parameter_set_id;
-	__u8	seq_parameter_set_id;
-	__u16	pic_width_in_luma_samples;
-	__u16	pic_height_in_luma_samples;
-	__u8	bit_depth_luma_minus8;
-	__u8	bit_depth_chroma_minus8;
-	__u8	log2_max_pic_order_cnt_lsb_minus4;
-	__u8	sps_max_dec_pic_buffering_minus1;
-	__u8	sps_max_num_reorder_pics;
-	__u8	sps_max_latency_increase_plus1;
-	__u8	log2_min_luma_coding_block_size_minus3;
-	__u8	log2_diff_max_min_luma_coding_block_size;
-	__u8	log2_min_luma_transform_block_size_minus2;
-	__u8	log2_diff_max_min_luma_transform_block_size;
-	__u8	max_transform_hierarchy_depth_inter;
-	__u8	max_transform_hierarchy_depth_intra;
-	__u8	pcm_sample_bit_depth_luma_minus1;
-	__u8	pcm_sample_bit_depth_chroma_minus1;
-	__u8	log2_min_pcm_luma_coding_block_size_minus3;
-	__u8	log2_diff_max_min_pcm_luma_coding_block_size;
-	__u8	num_short_term_ref_pic_sets;
-	__u8	num_long_term_ref_pics_sps;
-	__u8	chroma_format_idc;
-	__u8	sps_max_sub_layers_minus1;
-
-	__u8	reserved[6];
-	__u64	flags;
-};
-
-#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED	(1ULL << 0)
-#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT			(1ULL << 1)
-#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED		(1ULL << 2)
-#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT			(1ULL << 3)
-#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED		(1ULL << 4)
-#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED		(1ULL << 5)
-#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED			(1ULL << 6)
-#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT	(1ULL << 7)
-#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED			(1ULL << 8)
-#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED			(1ULL << 9)
-#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED		(1ULL << 10)
-#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED			(1ULL << 11)
-#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED		(1ULL << 12)
-#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED	(1ULL << 13)
-#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
-#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED	(1ULL << 15)
-#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER	(1ULL << 16)
-#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT		(1ULL << 17)
-#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
-#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT	(1ULL << 19)
-#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING			(1ULL << 20)
-
-/**
- * struct v4l2_ctrl_hevc_pps - ITU-T Rec. H.265: Picture parameter set
- *
- * @pic_parameter_set_id: identifies the PPS for reference by other
- *			  syntax elements
- * @num_extra_slice_header_bits: specifies the number of extra slice header
- *				 bits that are present in the slice header RBSP
- *				 for coded pictures referring to the PPS.
- * @num_ref_idx_l0_default_active_minus1: this value plus 1 specifies the inferred
- *                                        value of num_ref_idx_l0_active_minus1
- * @num_ref_idx_l1_default_active_minus1: this value plus 1 specifies the inferred
- *                                        value of num_ref_idx_l1_active_minus1
- * @init_qp_minus26: this value plus 26 specifies the initial value of SliceQp Y
- *                   for each slice referring to the PPS
- * @diff_cu_qp_delta_depth: specifies the difference between the luma coding
- *			    tree block size and the minimum luma coding block
- *			    size of coding units that convey cu_qp_delta_abs
- *			    and cu_qp_delta_sign_flag
- * @pps_cb_qp_offset: specify the offsets to the luma quantization parameter Cb
- * @pps_cr_qp_offset: specify the offsets to the luma quantization parameter Cr
- * @num_tile_columns_minus1: this value plus 1 specifies the number of tile columns
- *			     partitioning the picture
- * @num_tile_rows_minus1: this value plus 1 specifies the number of tile rows
- *                        partitioning the picture
- * @column_width_minus1: this value plus 1 specifies the width of each tile column
- *                       in units of coding tree blocks
- * @row_height_minus1: this value plus 1 specifies the height of each tile row in
- *		       units of coding tree blocks
- * @pps_beta_offset_div2: specify the default deblocking parameter offsets for
- *			  beta divided by 2
- * @pps_tc_offset_div2: specify the default deblocking parameter offsets for tC
- *			divided by 2
- * @log2_parallel_merge_level_minus2: this value plus 2 specifies the value of
- *                                    the variable Log2ParMrgLevel
- * @reserved: padding field. Should be zeroed by applications.
- * @flags: see V4L2_HEVC_PPS_FLAG_{}
- */
-struct v4l2_ctrl_hevc_pps {
-	__u8	pic_parameter_set_id;
-	__u8	num_extra_slice_header_bits;
-	__u8	num_ref_idx_l0_default_active_minus1;
-	__u8	num_ref_idx_l1_default_active_minus1;
-	__s8	init_qp_minus26;
-	__u8	diff_cu_qp_delta_depth;
-	__s8	pps_cb_qp_offset;
-	__s8	pps_cr_qp_offset;
-	__u8	num_tile_columns_minus1;
-	__u8	num_tile_rows_minus1;
-	__u8	column_width_minus1[20];
-	__u8	row_height_minus1[22];
-	__s8	pps_beta_offset_div2;
-	__s8	pps_tc_offset_div2;
-	__u8	log2_parallel_merge_level_minus2;
-	__u8	reserved;
-	__u64	flags;
-};
-
-#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE	0x01
-
-#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME				0
-#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD			1
-#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD			2
-#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM			3
-#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP			4
-#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP			5
-#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM		6
-#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING			7
-#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING			8
-#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM	9
-#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP	10
-#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM		11
-#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP		12
-
-#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX		16
-
-/**
- * struct v4l2_hevc_dpb_entry - HEVC decoded picture buffer entry
- *
- * @timestamp: timestamp of the V4L2 capture buffer to use as reference.
- * @flags: long term flag for the reference frame
- * @field_pic: whether the reference is a field picture or a frame.
- * @reserved: padding field. Should be zeroed by applications.
- * @pic_order_cnt_val: the picture order count of the reference.
- */
-struct v4l2_hevc_dpb_entry {
-	__u64	timestamp;
-	__u8	flags;
-	__u8	field_pic;
-	__u16	reserved;
-	__s32	pic_order_cnt_val;
-};
-
-/**
- * struct v4l2_hevc_pred_weight_table - HEVC weighted prediction parameters
- *
- * @delta_luma_weight_l0: the difference of the weighting factor applied
- *			  to the luma prediction value for list 0
- * @luma_offset_l0: the additive offset applied to the luma prediction value
- *		    for list 0
- * @delta_chroma_weight_l0: the difference of the weighting factor applied
- *			    to the chroma prediction values for list 0
- * @chroma_offset_l0: the difference of the additive offset applied to
- *		      the chroma prediction values for list 0
- * @delta_luma_weight_l1: the difference of the weighting factor applied
- *			  to the luma prediction value for list 1
- * @luma_offset_l1: the additive offset applied to the luma prediction value
- *		    for list 1
- * @delta_chroma_weight_l1: the difference of the weighting factor applied
- *			    to the chroma prediction values for list 1
- * @chroma_offset_l1: the difference of the additive offset applied to
- *		      the chroma prediction values for list 1
- * @luma_log2_weight_denom: the base 2 logarithm of the denominator for
- *			    all luma weighting factors
- * @delta_chroma_log2_weight_denom: the difference of the base 2 logarithm
- *				    of the denominator for all chroma
- *				    weighting factors
- */
-struct v4l2_hevc_pred_weight_table {
-	__s8	delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-	__s8	luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-	__s8	delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-	__s8	chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-
-	__s8	delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-	__s8	luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-	__s8	delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-	__s8	chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-
-	__u8	luma_log2_weight_denom;
-	__s8	delta_chroma_log2_weight_denom;
-};
-
-#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA		(1ULL << 0)
-#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA		(1ULL << 1)
-#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED	(1ULL << 2)
-#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO			(1ULL << 3)
-#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT			(1ULL << 4)
-#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0		(1ULL << 5)
-#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV		(1ULL << 6)
-#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
-#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
-#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT	(1ULL << 9)
-
-/**
- * struct v4l2_ctrl_hevc_slice_params - HEVC slice parameters
- *
- * This control is a dynamically sized 1-dimensional array,
- * V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it.
- *
- * @bit_size: size (in bits) of the current slice data
- * @data_byte_offset: offset (in bytes) to the video data in the current slice data
- * @num_entry_point_offsets: specifies the number of entry point offset syntax
- *			     elements in the slice header.
- * @nal_unit_type: specifies the coding type of the slice (B, P or I)
- * @nuh_temporal_id_plus1: minus 1 specifies a temporal identifier for the NAL unit
- * @slice_type: see V4L2_HEVC_SLICE_TYPE_{}
- * @colour_plane_id: specifies the colour plane associated with the current slice
- * @slice_pic_order_cnt: specifies the picture order count
- * @num_ref_idx_l0_active_minus1: this value plus 1 specifies the maximum reference
- *                                index for reference picture list 0 that may be
- *                                used to decode the slice
- * @num_ref_idx_l1_active_minus1: this value plus 1 specifies the maximum reference
- *                                index for reference picture list 1 that may be
- *                                used to decode the slice
- * @collocated_ref_idx: specifies the reference index of the collocated picture used
- *			for temporal motion vector prediction
- * @five_minus_max_num_merge_cand: specifies the maximum number of merging
- *				   motion vector prediction candidates supported in
- *				   the slice subtracted from 5
- * @slice_qp_delta: specifies the initial value of QpY to be used for the coding
- *		    blocks in the slice
- * @slice_cb_qp_offset: specifies a difference to be added to the value of pps_cb_qp_offset
- * @slice_cr_qp_offset: specifies a difference to be added to the value of pps_cr_qp_offset
- * @slice_act_y_qp_offset: screen content extension parameters
- * @slice_act_cb_qp_offset: screen content extension parameters
- * @slice_act_cr_qp_offset: screen content extension parameters
- * @slice_beta_offset_div2: specify the deblocking parameter offsets for beta divided by 2
- * @slice_tc_offset_div2: specify the deblocking parameter offsets for tC divided by 2
- * @pic_struct: indicates whether a picture should be displayed as a frame or as one or
- *		more fields
- * @reserved0: padding field. Should be zeroed by applications.
- * @slice_segment_addr: specifies the address of the first coding tree block in
- *			the slice segment
- * @ref_idx_l0: the list of L0 reference elements as indices in the DPB
- * @ref_idx_l1: the list of L1 reference elements as indices in the DPB
- * @short_term_ref_pic_set_size: specifies the size of short-term reference
- *				 pictures included in the SPS
- * @long_term_ref_pic_set_size: specifies the size of long-term reference
- *				picture include in the SPS
- * @pred_weight_table: the prediction weight coefficients for inter-picture
- *		       prediction
- * @reserved1: padding field. Should be zeroed by applications.
- * @flags: see V4L2_HEVC_SLICE_PARAMS_FLAG_{}
- */
-struct v4l2_ctrl_hevc_slice_params {
-	__u32	bit_size;
-	__u32	data_byte_offset;
-	__u32	num_entry_point_offsets;
-	/* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
-	__u8	nal_unit_type;
-	__u8	nuh_temporal_id_plus1;
-
-	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-	__u8	slice_type;
-	__u8	colour_plane_id;
-	__s32	slice_pic_order_cnt;
-	__u8	num_ref_idx_l0_active_minus1;
-	__u8	num_ref_idx_l1_active_minus1;
-	__u8	collocated_ref_idx;
-	__u8	five_minus_max_num_merge_cand;
-	__s8	slice_qp_delta;
-	__s8	slice_cb_qp_offset;
-	__s8	slice_cr_qp_offset;
-	__s8	slice_act_y_qp_offset;
-	__s8	slice_act_cb_qp_offset;
-	__s8	slice_act_cr_qp_offset;
-	__s8	slice_beta_offset_div2;
-	__s8	slice_tc_offset_div2;
-
-	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
-	__u8	pic_struct;
-
-	__u8	reserved0[3];
-	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-	__u32	slice_segment_addr;
-	__u8	ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-	__u8	ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-	__u16	short_term_ref_pic_set_size;
-	__u16	long_term_ref_pic_set_size;
-
-	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
-	struct v4l2_hevc_pred_weight_table pred_weight_table;
-
-	__u8	reserved1[2];
-	__u64	flags;
-};
-
-#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC		0x1
-#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC		0x2
-#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR  0x4
-
-/**
- * struct v4l2_ctrl_hevc_decode_params - HEVC decode parameters
- *
- * @pic_order_cnt_val: picture order count
- * @short_term_ref_pic_set_size: specifies the size of short-term reference
- *				 pictures set included in the SPS of the first slice
- * @long_term_ref_pic_set_size: specifies the size of long-term reference
- *				pictures set include in the SPS of the first slice
- * @num_active_dpb_entries: the number of entries in dpb
- * @num_poc_st_curr_before: the number of reference pictures in the short-term
- *			    set that come before the current frame
- * @num_poc_st_curr_after: the number of reference pictures in the short-term
- *			   set that come after the current frame
- * @num_poc_lt_curr: the number of reference pictures in the long-term set
- * @poc_st_curr_before: provides the index of the short term before references
- *			in DPB array
- * @poc_st_curr_after: provides the index of the short term after references
- *		       in DPB array
- * @poc_lt_curr: provides the index of the long term references in DPB array
- * @reserved: padding field. Should be zeroed by applications.
- * @dpb: the decoded picture buffer, for meta-data about reference frames
- * @flags: see V4L2_HEVC_DECODE_PARAM_FLAG_{}
- */
-struct v4l2_ctrl_hevc_decode_params {
-	__s32	pic_order_cnt_val;
-	__u16	short_term_ref_pic_set_size;
-	__u16	long_term_ref_pic_set_size;
-	__u8	num_active_dpb_entries;
-	__u8	num_poc_st_curr_before;
-	__u8	num_poc_st_curr_after;
-	__u8	num_poc_lt_curr;
-	__u8	poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-	__u8	poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-	__u8	poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-	__u8	reserved[4];
-	struct	v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-	__u64	flags;
-};
-
-/**
- * struct v4l2_ctrl_hevc_scaling_matrix - HEVC scaling lists parameters
- *
- * @scaling_list_4x4: scaling list is used for the scaling process for
- *		      transform coefficients. The values on each scaling
- *		      list are expected in raster scan order
- * @scaling_list_8x8: scaling list is used for the scaling process for
- *		      transform coefficients. The values on each scaling
- *		      list are expected in raster scan order
- * @scaling_list_16x16: scaling list is used for the scaling process for
- *			transform coefficients. The values on each scaling
- *			list are expected in raster scan order
- * @scaling_list_32x32:	scaling list is used for the scaling process for
- *			transform coefficients. The values on each scaling
- *			list are expected in raster scan order
- * @scaling_list_dc_coef_16x16: scaling list is used for the scaling process
- *				for transform coefficients. The values on each
- *				scaling list are expected in raster scan order.
- * @scaling_list_dc_coef_32x32:	scaling list is used for the scaling process
- *				for transform coefficients. The values on each
- *				scaling list are expected in raster scan order.
- */
-struct v4l2_ctrl_hevc_scaling_matrix {
-	__u8	scaling_list_4x4[6][16];
-	__u8	scaling_list_8x8[6][64];
-	__u8	scaling_list_16x16[6][64];
-	__u8	scaling_list_32x32[2][64];
-	__u8	scaling_list_dc_coef_16x16[6];
-	__u8	scaling_list_dc_coef_32x32[2];
-};
-
-#endif
diff --git a/include/media/v4l2-ctrls.h b/include/media/v4l2-ctrls.h
index f4105de8a8d2..00828a4f9404 100644
--- a/include/media/v4l2-ctrls.h
+++ b/include/media/v4l2-ctrls.h
@@ -13,12 +13,6 @@
 #include <linux/videodev2.h>
 #include <media/media-request.h>
 
-/*
- * Include the stateless codec compound control definitions.
- * This will move to the public headers once this API is fully stable.
- */
-#include <media/hevc-ctrls.h>
-
 /* forward references */
 struct file;
 struct poll_table_struct;
diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index dfff69ed88f7..5f46bf4a570c 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -1997,6 +1997,465 @@ struct v4l2_ctrl_mpeg2_quantisation {
 	__u8	chroma_non_intra_quantiser_matrix[64];
 };
 
+#define V4L2_CID_STATELESS_HEVC_SPS		(V4L2_CID_CODEC_STATELESS_BASE + 400)
+#define V4L2_CID_STATELESS_HEVC_PPS		(V4L2_CID_CODEC_STATELESS_BASE + 401)
+#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS	(V4L2_CID_CODEC_STATELESS_BASE + 402)
+#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX	(V4L2_CID_CODEC_STATELESS_BASE + 403)
+#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS	(V4L2_CID_CODEC_STATELESS_BASE + 404)
+#define V4L2_CID_STATELESS_HEVC_DECODE_MODE	(V4L2_CID_CODEC_STATELESS_BASE + 405)
+#define V4L2_CID_STATELESS_HEVC_START_CODE	(V4L2_CID_CODEC_STATELESS_BASE + 406)
+#define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_STATELESS_BASE + 407)
+
+enum v4l2_stateless_hevc_decode_mode {
+	V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED,
+	V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED,
+};
+
+enum v4l2_stateless_hevc_start_code {
+	V4L2_STATELESS_HEVC_START_CODE_NONE,
+	V4L2_STATELESS_HEVC_START_CODE_ANNEX_B,
+};
+
+#define V4L2_HEVC_SLICE_TYPE_B	0
+#define V4L2_HEVC_SLICE_TYPE_P	1
+#define V4L2_HEVC_SLICE_TYPE_I	2
+
+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE		(1ULL << 0)
+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED			(1ULL << 1)
+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED				(1ULL << 2)
+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET		(1ULL << 3)
+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED				(1ULL << 4)
+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED		(1ULL << 5)
+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT		(1ULL << 6)
+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED		(1ULL << 7)
+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED	(1ULL << 8)
+
+/**
+ * struct v4l2_ctrl_hevc_sps - ITU-T Rec. H.265: Sequence parameter set
+ *
+ * @video_parameter_set_id: specifies the value of the
+ *			vps_video_parameter_set_id of the active VPS
+ * @seq_parameter_set_id: provides an identifier for the SPS for
+ *			  reference by other syntax elements
+ * @pic_width_in_luma_samples:	specifies the width of each decoded picture
+ *				in units of luma samples
+ * @pic_height_in_luma_samples: specifies the height of each decoded picture
+ *				in units of luma samples
+ * @bit_depth_luma_minus8: this value plus 8specifies the bit depth of the
+ *                         samples of the luma array
+ * @bit_depth_chroma_minus8: this value plus 8 specifies the bit depth of the
+ *                           samples of the chroma arrays
+ * @log2_max_pic_order_cnt_lsb_minus4: this value plus 4 specifies the value of
+ *                                     the variable MaxPicOrderCntLsb
+ * @sps_max_dec_pic_buffering_minus1: this value plus 1 specifies the maximum
+ *                                    required size of the decoded picture
+ *                                    buffer for the codec video sequence
+ * @sps_max_num_reorder_pics: indicates the maximum allowed number of pictures
+ * @sps_max_latency_increase_plus1: not equal to 0 is used to compute the
+ *				    value of SpsMaxLatencyPictures array
+ * @log2_min_luma_coding_block_size_minus3: plus 3 specifies the minimum
+ *					    luma coding block size
+ * @log2_diff_max_min_luma_coding_block_size: specifies the difference between
+ *					      the maximum and minimum luma
+ *					      coding block size
+ * @log2_min_luma_transform_block_size_minus2: plus 2 specifies the minimum luma
+ *					       transform block size
+ * @log2_diff_max_min_luma_transform_block_size: specifies the difference between
+ *						 the maximum and minimum luma
+ *						 transform block size
+ * @max_transform_hierarchy_depth_inter: specifies the maximum hierarchy
+ *					 depth for transform units of
+ *					 coding units coded in inter
+ *					 prediction mode
+ * @max_transform_hierarchy_depth_intra: specifies the maximum hierarchy
+ *					 depth for transform units of
+ *					 coding units coded in intra
+ *					 prediction mode
+ * @pcm_sample_bit_depth_luma_minus1: this value plus 1 specifies the number of
+ *                                    bits used to represent each of PCM sample
+ *                                    values of the luma component
+ * @pcm_sample_bit_depth_chroma_minus1: this value plus 1 specifies the number
+ *                                      of bits used to represent each of PCM
+ *                                      sample values of the chroma components
+ * @log2_min_pcm_luma_coding_block_size_minus3: this value plus 3 specifies the
+ *                                              minimum size of coding blocks
+ * @log2_diff_max_min_pcm_luma_coding_block_size: specifies the difference between
+ *						  the maximum and minimum size of
+ *						  coding blocks
+ * @num_short_term_ref_pic_sets: specifies the number of st_ref_pic_set()
+ *				 syntax structures included in the SPS
+ * @num_long_term_ref_pics_sps: specifies the number of candidate long-term
+ *				reference pictures that are specified in the SPS
+ * @chroma_format_idc: specifies the chroma sampling
+ * @sps_max_sub_layers_minus1: this value plus 1 specifies the maximum number
+ *                             of temporal sub-layers
+ * @reserved: padding field. Should be zeroed by applications.
+ * @flags: see V4L2_HEVC_SPS_FLAG_{}
+ */
+struct v4l2_ctrl_hevc_sps {
+	__u8	video_parameter_set_id;
+	__u8	seq_parameter_set_id;
+	__u16	pic_width_in_luma_samples;
+	__u16	pic_height_in_luma_samples;
+	__u8	bit_depth_luma_minus8;
+	__u8	bit_depth_chroma_minus8;
+	__u8	log2_max_pic_order_cnt_lsb_minus4;
+	__u8	sps_max_dec_pic_buffering_minus1;
+	__u8	sps_max_num_reorder_pics;
+	__u8	sps_max_latency_increase_plus1;
+	__u8	log2_min_luma_coding_block_size_minus3;
+	__u8	log2_diff_max_min_luma_coding_block_size;
+	__u8	log2_min_luma_transform_block_size_minus2;
+	__u8	log2_diff_max_min_luma_transform_block_size;
+	__u8	max_transform_hierarchy_depth_inter;
+	__u8	max_transform_hierarchy_depth_intra;
+	__u8	pcm_sample_bit_depth_luma_minus1;
+	__u8	pcm_sample_bit_depth_chroma_minus1;
+	__u8	log2_min_pcm_luma_coding_block_size_minus3;
+	__u8	log2_diff_max_min_pcm_luma_coding_block_size;
+	__u8	num_short_term_ref_pic_sets;
+	__u8	num_long_term_ref_pics_sps;
+	__u8	chroma_format_idc;
+	__u8	sps_max_sub_layers_minus1;
+
+	__u8	reserved[6];
+	__u64	flags;
+};
+
+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED	(1ULL << 0)
+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT			(1ULL << 1)
+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED		(1ULL << 2)
+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT			(1ULL << 3)
+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED		(1ULL << 4)
+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED		(1ULL << 5)
+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED			(1ULL << 6)
+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT	(1ULL << 7)
+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED			(1ULL << 8)
+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED			(1ULL << 9)
+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED		(1ULL << 10)
+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED			(1ULL << 11)
+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED		(1ULL << 12)
+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED	(1ULL << 13)
+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED	(1ULL << 15)
+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER	(1ULL << 16)
+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT		(1ULL << 17)
+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT	(1ULL << 19)
+#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING			(1ULL << 20)
+
+/**
+ * struct v4l2_ctrl_hevc_pps - ITU-T Rec. H.265: Picture parameter set
+ *
+ * @pic_parameter_set_id: identifies the PPS for reference by other
+ *			  syntax elements
+ * @num_extra_slice_header_bits: specifies the number of extra slice header
+ *				 bits that are present in the slice header RBSP
+ *				 for coded pictures referring to the PPS.
+ * @num_ref_idx_l0_default_active_minus1: this value plus 1 specifies the
+ *                                        inferred value of num_ref_idx_l0_active_minus1
+ * @num_ref_idx_l1_default_active_minus1: this value plus 1 specifies the
+ *                                        inferred value of num_ref_idx_l1_active_minus1
+ * @init_qp_minus26: this value plus 26 specifies the initial value of SliceQp Y for
+ *		     each slice referring to the PPS
+ * @diff_cu_qp_delta_depth: specifies the difference between the luma coding
+ *			    tree block size and the minimum luma coding block
+ *			    size of coding units that convey cu_qp_delta_abs
+ *			    and cu_qp_delta_sign_flag
+ * @pps_cb_qp_offset: specify the offsets to the luma quantization parameter Cb
+ * @pps_cr_qp_offset: specify the offsets to the luma quantization parameter Cr
+ * @num_tile_columns_minus1: this value plus 1 specifies the number of tile columns
+ *			     partitioning the picture
+ * @num_tile_rows_minus1: this value plus 1 specifies the number of tile rows partitioning
+ *			  the picture
+ * @column_width_minus1: this value plus 1 specifies the width of the each tile column in
+ *			 units of coding tree blocks
+ * @row_height_minus1: this value plus 1 specifies the height of the each tile row in
+ *		       units of coding tree blocks
+ * @pps_beta_offset_div2: specify the default deblocking parameter offsets for
+ *			  beta divided by 2
+ * @pps_tc_offset_div2: specify the default deblocking parameter offsets for tC
+ *			divided by 2
+ * @log2_parallel_merge_level_minus2: this value plus 2 specifies the value of
+ *                                    the variable Log2ParMrgLevel
+ * @reserved: padding field. Should be zeroed by applications.
+ * @flags: see V4L2_HEVC_PPS_FLAG_{}
+ */
+struct v4l2_ctrl_hevc_pps {
+	__u8	pic_parameter_set_id;
+	__u8	num_extra_slice_header_bits;
+	__u8	num_ref_idx_l0_default_active_minus1;
+	__u8	num_ref_idx_l1_default_active_minus1;
+	__s8	init_qp_minus26;
+	__u8	diff_cu_qp_delta_depth;
+	__s8	pps_cb_qp_offset;
+	__s8	pps_cr_qp_offset;
+	__u8	num_tile_columns_minus1;
+	__u8	num_tile_rows_minus1;
+	__u8	column_width_minus1[20];
+	__u8	row_height_minus1[22];
+	__s8	pps_beta_offset_div2;
+	__s8	pps_tc_offset_div2;
+	__u8	log2_parallel_merge_level_minus2;
+	__u8	reserved;
+	__u64	flags;
+};
+
+#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE	0x01
+
+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME				0
+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD			1
+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD			2
+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM			3
+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP			4
+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP			5
+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM		6
+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING			7
+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING			8
+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM	9
+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP	10
+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM		11
+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP		12
+
+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX		16
+
+/**
+ * struct v4l2_hevc_dpb_entry - HEVC decoded picture buffer entry
+ *
+ * @timestamp: timestamp of the V4L2 capture buffer to use as reference.
+ * @flags: long term flag for the reference frame
+ * @field_pic: whether the reference is a field picture or a frame.
+ * @reserved: padding field. Should be zeroed by applications.
+ * @pic_order_cnt_val: the picture order count of the current picture.
+ */
+struct v4l2_hevc_dpb_entry {
+	__u64	timestamp;
+	__u8	flags;
+	__u8	field_pic;
+	__u16	reserved;
+	__s32	pic_order_cnt_val;
+};
+
+/**
+ * struct v4l2_hevc_pred_weight_table - HEVC weighted prediction parameters
+ *
+ * @delta_luma_weight_l0: the difference of the weighting factor applied
+ *			  to the luma prediction value for list 0
+ * @luma_offset_l0: the additive offset applied to the luma prediction value
+ *		    for list 0
+ * @delta_chroma_weight_l0: the difference of the weighting factor applied
+ *			    to the chroma prediction values for list 0
+ * @chroma_offset_l0: the difference of the additive offset applied to
+ *		      the chroma prediction values for list 0
+ * @delta_luma_weight_l1: the difference of the weighting factor applied
+ *			  to the luma prediction value for list 1
+ * @luma_offset_l1: the additive offset applied to the luma prediction value
+ *		    for list 1
+ * @delta_chroma_weight_l1: the difference of the weighting factor applied
+ *			    to the chroma prediction values for list 1
+ * @chroma_offset_l1: the difference of the additive offset applied to
+ *		      the chroma prediction values for list 1
+ * @luma_log2_weight_denom: the base 2 logarithm of the denominator for
+ *			    all luma weighting factors
+ * @delta_chroma_log2_weight_denom: the difference of the base 2 logarithm
+ *				    of the denominator for all chroma
+ *				    weighting factors
+ */
+struct v4l2_hevc_pred_weight_table {
+	__s8	delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+	__s8	luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+	__s8	delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+	__s8	chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+
+	__s8	delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+	__s8	luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+	__s8	delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+	__s8	chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+
+	__u8	luma_log2_weight_denom;
+	__s8	delta_chroma_log2_weight_denom;
+};
+
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA		(1ULL << 0)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA		(1ULL << 1)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED	(1ULL << 2)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO			(1ULL << 3)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT			(1ULL << 4)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0		(1ULL << 5)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV		(1ULL << 6)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT	(1ULL << 9)
+
+/**
+ * struct v4l2_ctrl_hevc_slice_params - HEVC slice parameters
+ *
+ * This control is a dynamically sized 1-dimensional array,
+ * V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it.
+ *
+ * @bit_size: size (in bits) of the current slice data
+ * @data_byte_offset: offset (in bytes) to the video data in the current slice data
+ * @num_entry_point_offsets: specifies the number of entry point offset syntax
+ *			     elements in the slice header.
+ * @nal_unit_type: specifies the coding type of the slice (B, P or I)
+ * @nuh_temporal_id_plus1: minus 1 specifies a temporal identifier for the NAL unit
+ * @slice_type: see V4L2_HEVC_SLICE_TYPE_{}
+ * @colour_plane_id: specifies the colour plane associated with the current slice
+ * @slice_pic_order_cnt: specifies the picture order count
+ * @num_ref_idx_l0_active_minus1: this value plus 1 specifies the maximum
+ *                                reference index for reference picture list 0
+ *                                that may be used to decode the slice
+ * @num_ref_idx_l1_active_minus1: this value plus 1 specifies the maximum
+ *                                reference index for reference picture list 1
+ *                                that may be used to decode the slice
+ * @collocated_ref_idx: specifies the reference index of the collocated picture used
+ *			for temporal motion vector prediction
+ * @five_minus_max_num_merge_cand: specifies the maximum number of merging
+ *				   motion vector prediction candidates supported in
+ *				   the slice subtracted from 5
+ * @slice_qp_delta: specifies the initial value of QpY to be used for the coding
+ *		    blocks in the slice
+ * @slice_cb_qp_offset: specifies a difference to be added to the value of pps_cb_qp_offset
+ * @slice_cr_qp_offset: specifies a difference to be added to the value of pps_cr_qp_offset
+ * @slice_act_y_qp_offset: screen content extension parameters
+ * @slice_act_cb_qp_offset: screen content extension parameters
+ * @slice_act_cr_qp_offset: screen content extension parameters
+ * @slice_beta_offset_div2: specify the deblocking parameter offsets for beta divided by 2
+ * @slice_tc_offset_div2: specify the deblocking parameter offsets for tC divided by 2
+ * @pic_struct: indicates whether a picture should be displayed as a frame or as one or
+ *		more fields
+ * @reserved0: padding field. Should be zeroed by applications.
+ * @slice_segment_addr: specifies the address of the first coding tree block in
+ *			the slice segment
+ * @ref_idx_l0: the list of L0 reference elements as indices in the DPB
+ * @ref_idx_l1: the list of L1 reference elements as indices in the DPB
+ * @short_term_ref_pic_set_size: specifies the size of short-term reference
+ *				 pictures set included in the SPS
+ * @long_term_ref_pic_set_size: specifies the size of long-term reference
+ *				pictures set include in the SPS
+ * @pred_weight_table: the prediction weight coefficients for inter-picture
+ *		       prediction
+ * @reserved1: padding field. Should be zeroed by applications.
+ * @flags: see V4L2_HEVC_SLICE_PARAMS_FLAG_{}
+ */
+struct v4l2_ctrl_hevc_slice_params {
+	__u32	bit_size;
+	__u32	data_byte_offset;
+	__u32	num_entry_point_offsets;
+
+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
+	__u8	nal_unit_type;
+	__u8	nuh_temporal_id_plus1;
+
+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+	__u8	slice_type;
+	__u8	colour_plane_id;
+	__s32	slice_pic_order_cnt;
+	__u8	num_ref_idx_l0_active_minus1;
+	__u8	num_ref_idx_l1_active_minus1;
+	__u8	collocated_ref_idx;
+	__u8	five_minus_max_num_merge_cand;
+	__s8	slice_qp_delta;
+	__s8	slice_cb_qp_offset;
+	__s8	slice_cr_qp_offset;
+	__s8	slice_act_y_qp_offset;
+	__s8	slice_act_cb_qp_offset;
+	__s8	slice_act_cr_qp_offset;
+	__s8	slice_beta_offset_div2;
+	__s8	slice_tc_offset_div2;
+
+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
+	__u8	pic_struct;
+
+	__u8	reserved0[3];
+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+	__u32	slice_segment_addr;
+	__u8	ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+	__u8	ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+	__u16	short_term_ref_pic_set_size;
+	__u16	long_term_ref_pic_set_size;
+
+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
+	struct v4l2_hevc_pred_weight_table pred_weight_table;
+
+	__u8	reserved1[2];
+	__u64	flags;
+};
+
+#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC		0x1
+#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC		0x2
+#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR  0x4
+
+/**
+ * struct v4l2_ctrl_hevc_decode_params - HEVC decode parameters
+ *
+ * @pic_order_cnt_val: picture order count
+ * @short_term_ref_pic_set_size: specifies the size of short-term reference
+ *				 pictures set included in the SPS of the first slice
+ * @long_term_ref_pic_set_size: specifies the size of long-term reference
+ *				pictures set include in the SPS of the first slice
+ * @num_active_dpb_entries: the number of entries in dpb
+ * @num_poc_st_curr_before: the number of reference pictures in the short-term
+ *			    set that come before the current frame
+ * @num_poc_st_curr_after: the number of reference pictures in the short-term
+ *			   set that come after the current frame
+ * @num_poc_lt_curr: the number of reference pictures in the long-term set
+ * @poc_st_curr_before: provides the index of the short term before references
+ *			in DPB array
+ * @poc_st_curr_after: provides the index of the short term after references
+ *		       in DPB array
+ * @poc_lt_curr: provides the index of the long term references in DPB array
+ * @reserved: padding field. Should be zeroed by applications.
+ * @dpb: the decoded picture buffer, for meta-data about reference frames
+ * @flags: see V4L2_HEVC_DECODE_PARAM_FLAG_{}
+ */
+struct v4l2_ctrl_hevc_decode_params {
+	__s32	pic_order_cnt_val;
+	__u16	short_term_ref_pic_set_size;
+	__u16	long_term_ref_pic_set_size;
+	__u8	num_active_dpb_entries;
+	__u8	num_poc_st_curr_before;
+	__u8	num_poc_st_curr_after;
+	__u8	num_poc_lt_curr;
+	__u8	poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+	__u8	poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+	__u8	poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+	__u8	reserved[4];
+	struct	v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+	__u64	flags;
+};
+
+/**
+ * struct v4l2_ctrl_hevc_scaling_matrix - HEVC scaling lists parameters
+ *
+ * @scaling_list_4x4: scaling list is used for the scaling process for
+ *		      transform coefficients. The values on each scaling
+ *		      list are expected in raster scan order
+ * @scaling_list_8x8: scaling list is used for the scaling process for
+ *		      transform coefficients. The values on each scaling
+ *		      list are expected in raster scan order
+ * @scaling_list_16x16:	scaling list is used for the scaling process for
+ *			transform coefficients. The values on each scaling
+ *			list are expected in raster scan order
+ * @scaling_list_32x32:	scaling list is used for the scaling process for
+ *			transform coefficients. The values on each scaling
+ *			list are expected in raster scan order
+ * @scaling_list_dc_coef_16x16:	scaling list is used for the scaling process
+ *				for transform coefficients. The values on each
+ *				scaling list are expected in raster scan order.
+ * @scaling_list_dc_coef_32x32:	scaling list is used for the scaling process
+ *				for transform coefficients. The values on each
+ *				scaling list are expected in raster scan order.
+ */
+struct v4l2_ctrl_hevc_scaling_matrix {
+	__u8	scaling_list_4x4[6][16];
+	__u8	scaling_list_8x8[6][64];
+	__u8	scaling_list_16x16[6][64];
+	__u8	scaling_list_32x32[2][64];
+	__u8	scaling_list_dc_coef_16x16[6];
+	__u8	scaling_list_dc_coef_32x32[2];
+};
+
 #define V4L2_CID_COLORIMETRY_CLASS_BASE	(V4L2_CTRL_CLASS_COLORIMETRY | 0x900)
 #define V4L2_CID_COLORIMETRY_CLASS	(V4L2_CTRL_CLASS_COLORIMETRY | 1)
 
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index cff2bb78b2cc..d6fac2344033 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -1793,6 +1793,11 @@ struct v4l2_ext_control {
 		struct v4l2_ctrl_mpeg2_quantisation __user *p_mpeg2_quantisation;
 		struct v4l2_ctrl_vp9_compressed_hdr __user *p_vp9_compressed_hdr_probs;
 		struct v4l2_ctrl_vp9_frame __user *p_vp9_frame;
+		struct v4l2_ctrl_hevc_sps __user *p_hevc_sps;
+		struct v4l2_ctrl_hevc_pps __user *p_hevc_pps;
+		struct v4l2_ctrl_hevc_slice_params __user *p_hevc_slice_params;
+		struct v4l2_ctrl_hevc_scaling_matrix __user *p_hevc_scaling_matrix;
+		struct v4l2_ctrl_hevc_decode_params __user *p_hevc_decode_params;
 		void __user *ptr;
 	};
 } __attribute__ ((packed));
-- 
cgit 


From ae6ccaa650380d243cf43d31c864c5ced2fd4612 Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Thu, 7 Jul 2022 08:15:52 +0100
Subject: PM: EM: convert power field to micro-Watts precision and align
 drivers

The milli-Watts precision causes rounding errors while calculating
efficiency cost for each OPP. This is especially visible in the 'simple'
Energy Model (EM), where the power for each OPP is provided from OPP
framework. This can cause some OPPs to be marked inefficient, while
using micro-Watts precision that might not happen.

Update all EM users which access 'power' field and assume the value is
in milli-Watts.

Solve also an issue with potential overflow in calculation of energy
estimation on 32bit machine. It's needed now since the power value
(thus the 'cost' as well) are higher.

Example calculation which shows the rounding error and impact:

power = 'dyn-power-coeff' * volt_mV * volt_mV * freq_MHz

power_a_uW = (100 * 600mW * 600mW * 500MHz) / 10^6 = 18000
power_a_mW = (100 * 600mW * 600mW * 500MHz) / 10^9 = 18

power_b_uW = (100 * 605mW * 605mW * 600MHz) / 10^6 = 21961
power_b_mW = (100 * 605mW * 605mW * 600MHz) / 10^9 = 21

max_freq = 2000MHz

cost_a_mW = 18 * 2000MHz/500MHz = 72
cost_a_uW = 18000 * 2000MHz/500MHz = 72000

cost_b_mW = 21 * 2000MHz/600MHz = 70 // <- artificially better
cost_b_uW = 21961 * 2000MHz/600MHz = 73203

The 'cost_b_mW' (which is based on old milli-Watts) is misleadingly
better that the 'cost_b_uW' (this patch uses micro-Watts) and such
would have impact on the 'inefficient OPPs' information in the Cpufreq
framework. This patch set removes the rounding issue.

Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/mediatek-cpufreq-hw.c |  7 +++--
 drivers/cpufreq/scmi-cpufreq.c        |  6 ++++
 drivers/opp/of.c                      | 15 +++++-----
 drivers/powercap/dtpm_cpu.c           |  5 ++--
 drivers/thermal/cpufreq_cooling.c     | 13 +++++++--
 drivers/thermal/devfreq_cooling.c     | 19 +++++++++---
 include/linux/energy_model.h          | 54 ++++++++++++++++++++++++-----------
 kernel/power/energy_model.c           | 24 ++++++++++------
 8 files changed, 100 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c
index 813cccbfe934..f0e0a35c7f21 100644
--- a/drivers/cpufreq/mediatek-cpufreq-hw.c
+++ b/drivers/cpufreq/mediatek-cpufreq-hw.c
@@ -51,7 +51,7 @@ static const u16 cpufreq_mtk_offsets[REG_ARRAY_SIZE] = {
 };
 
 static int __maybe_unused
-mtk_cpufreq_get_cpu_power(struct device *cpu_dev, unsigned long *mW,
+mtk_cpufreq_get_cpu_power(struct device *cpu_dev, unsigned long *uW,
 			  unsigned long *KHz)
 {
 	struct mtk_cpufreq_data *data;
@@ -71,8 +71,9 @@ mtk_cpufreq_get_cpu_power(struct device *cpu_dev, unsigned long *mW,
 	i--;
 
 	*KHz = data->table[i].frequency;
-	*mW = readl_relaxed(data->reg_bases[REG_EM_POWER_TBL] +
-			    i * LUT_ROW_SIZE) / 1000;
+	/* Provide micro-Watts value to the Energy Model */
+	*uW = readl_relaxed(data->reg_bases[REG_EM_POWER_TBL] +
+			    i * LUT_ROW_SIZE);
 
 	return 0;
 }
diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c
index 6d2a4cf46db7..bfd35583d653 100644
--- a/drivers/cpufreq/scmi-cpufreq.c
+++ b/drivers/cpufreq/scmi-cpufreq.c
@@ -19,6 +19,7 @@
 #include <linux/slab.h>
 #include <linux/scmi_protocol.h>
 #include <linux/types.h>
+#include <linux/units.h>
 
 struct scmi_data {
 	int domain_id;
@@ -99,6 +100,7 @@ static int __maybe_unused
 scmi_get_cpu_power(struct device *cpu_dev, unsigned long *power,
 		   unsigned long *KHz)
 {
+	bool power_scale_mw = perf_ops->power_scale_mw_get(ph);
 	unsigned long Hz;
 	int ret, domain;
 
@@ -112,6 +114,10 @@ scmi_get_cpu_power(struct device *cpu_dev, unsigned long *power,
 	if (ret)
 		return ret;
 
+	/* Provide bigger resolution power to the Energy Model */
+	if (power_scale_mw)
+		*power *= MICROWATT_PER_MILLIWATT;
+
 	/* The EM framework specifies the frequency in KHz. */
 	*KHz = Hz / 1000;
 
diff --git a/drivers/opp/of.c b/drivers/opp/of.c
index 30394929d700..eb89c9a75985 100644
--- a/drivers/opp/of.c
+++ b/drivers/opp/of.c
@@ -1443,12 +1443,12 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_of_node);
  * It provides the power used by @dev at @kHz if it is the frequency of an
  * existing OPP, or at the frequency of the first OPP above @kHz otherwise
  * (see dev_pm_opp_find_freq_ceil()). This function updates @kHz to the ceiled
- * frequency and @mW to the associated power.
+ * frequency and @uW to the associated power.
  *
  * Returns 0 on success or a proper -EINVAL value in case of error.
  */
 static int __maybe_unused
-_get_dt_power(struct device *dev, unsigned long *mW, unsigned long *kHz)
+_get_dt_power(struct device *dev, unsigned long *uW, unsigned long *kHz)
 {
 	struct dev_pm_opp *opp;
 	unsigned long opp_freq, opp_power;
@@ -1465,7 +1465,7 @@ _get_dt_power(struct device *dev, unsigned long *mW, unsigned long *kHz)
 		return -EINVAL;
 
 	*kHz = opp_freq / 1000;
-	*mW = opp_power / 1000;
+	*uW = opp_power;
 
 	return 0;
 }
@@ -1475,14 +1475,14 @@ _get_dt_power(struct device *dev, unsigned long *mW, unsigned long *kHz)
  * This computes the power estimated by @dev at @kHz if it is the frequency
  * of an existing OPP, or at the frequency of the first OPP above @kHz otherwise
  * (see dev_pm_opp_find_freq_ceil()). This function updates @kHz to the ceiled
- * frequency and @mW to the associated power. The power is estimated as
+ * frequency and @uW to the associated power. The power is estimated as
  * P = C * V^2 * f with C being the device's capacitance and V and f
  * respectively the voltage and frequency of the OPP.
  *
  * Returns -EINVAL if the power calculation failed because of missing
  * parameters, 0 otherwise.
  */
-static int __maybe_unused _get_power(struct device *dev, unsigned long *mW,
+static int __maybe_unused _get_power(struct device *dev, unsigned long *uW,
 				     unsigned long *kHz)
 {
 	struct dev_pm_opp *opp;
@@ -1512,9 +1512,10 @@ static int __maybe_unused _get_power(struct device *dev, unsigned long *mW,
 		return -EINVAL;
 
 	tmp = (u64)cap * mV * mV * (Hz / 1000000);
-	do_div(tmp, 1000000000);
+	/* Provide power in micro-Watts */
+	do_div(tmp, 1000000);
 
-	*mW = (unsigned long)tmp;
+	*uW = (unsigned long)tmp;
 	*kHz = Hz / 1000;
 
 	return 0;
diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c
index f5eced0842b3..61c5ff80bd30 100644
--- a/drivers/powercap/dtpm_cpu.c
+++ b/drivers/powercap/dtpm_cpu.c
@@ -53,7 +53,7 @@ static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit)
 
 	for (i = 0; i < pd->nr_perf_states; i++) {
 
-		power = pd->table[i].power * MICROWATT_PER_MILLIWATT * nr_cpus;
+		power = pd->table[i].power * nr_cpus;
 
 		if (power > power_limit)
 			break;
@@ -63,8 +63,7 @@ static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit)
 
 	freq_qos_update_request(&dtpm_cpu->qos_req, freq);
 
-	power_limit = pd->table[i - 1].power *
-		MICROWATT_PER_MILLIWATT * nr_cpus;
+	power_limit = pd->table[i - 1].power * nr_cpus;
 
 	return power_limit;
 }
diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index b8151d95a806..dc19e7c80751 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -21,6 +21,7 @@
 #include <linux/pm_qos.h>
 #include <linux/slab.h>
 #include <linux/thermal.h>
+#include <linux/units.h>
 
 #include <trace/events/thermal.h>
 
@@ -101,6 +102,7 @@ static unsigned long get_level(struct cpufreq_cooling_device *cpufreq_cdev,
 static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_cdev,
 			     u32 freq)
 {
+	unsigned long power_mw;
 	int i;
 
 	for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) {
@@ -108,16 +110,23 @@ static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_cdev,
 			break;
 	}
 
-	return cpufreq_cdev->em->table[i + 1].power;
+	power_mw = cpufreq_cdev->em->table[i + 1].power;
+	power_mw /= MICROWATT_PER_MILLIWATT;
+
+	return power_mw;
 }
 
 static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev,
 			     u32 power)
 {
+	unsigned long em_power_mw;
 	int i;
 
 	for (i = cpufreq_cdev->max_level; i > 0; i--) {
-		if (power >= cpufreq_cdev->em->table[i].power)
+		/* Convert EM power to milli-Watts to make safe comparison */
+		em_power_mw = cpufreq_cdev->em->table[i].power;
+		em_power_mw /= MICROWATT_PER_MILLIWATT;
+		if (power >= em_power_mw)
 			break;
 	}
 
diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c
index 8c76f9655e57..8d1260f65061 100644
--- a/drivers/thermal/devfreq_cooling.c
+++ b/drivers/thermal/devfreq_cooling.c
@@ -200,7 +200,11 @@ static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cd
 		res = dfc->power_ops->get_real_power(df, power, freq, voltage);
 		if (!res) {
 			state = dfc->capped_state;
+
+			/* Convert EM power into milli-Watts first */
 			dfc->res_util = dfc->em_pd->table[state].power;
+			dfc->res_util /= MICROWATT_PER_MILLIWATT;
+
 			dfc->res_util *= SCALE_ERROR_MITIGATION;
 
 			if (*power > 1)
@@ -218,8 +222,10 @@ static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cd
 
 		_normalize_load(&status);
 
-		/* Scale power for utilization */
+		/* Convert EM power into milli-Watts first */
 		*power = dfc->em_pd->table[perf_idx].power;
+		*power /= MICROWATT_PER_MILLIWATT;
+		/* Scale power for utilization */
 		*power *= status.busy_time;
 		*power >>= 10;
 	}
@@ -244,6 +250,7 @@ static int devfreq_cooling_state2power(struct thermal_cooling_device *cdev,
 
 	perf_idx = dfc->max_state - state;
 	*power = dfc->em_pd->table[perf_idx].power;
+	*power /= MICROWATT_PER_MILLIWATT;
 
 	return 0;
 }
@@ -254,7 +261,7 @@ static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev,
 	struct devfreq_cooling_device *dfc = cdev->devdata;
 	struct devfreq *df = dfc->devfreq;
 	struct devfreq_dev_status status;
-	unsigned long freq;
+	unsigned long freq, em_power_mw;
 	s32 est_power;
 	int i;
 
@@ -279,9 +286,13 @@ static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev,
 	 * Find the first cooling state that is within the power
 	 * budget. The EM power table is sorted ascending.
 	 */
-	for (i = dfc->max_state; i > 0; i--)
-		if (est_power >= dfc->em_pd->table[i].power)
+	for (i = dfc->max_state; i > 0; i--) {
+		/* Convert EM power to milli-Watts to make safe comparison */
+		em_power_mw = dfc->em_pd->table[i].power;
+		em_power_mw /= MICROWATT_PER_MILLIWATT;
+		if (est_power >= em_power_mw)
 			break;
+	}
 
 	*state = dfc->max_state - i;
 	dfc->capped_state = *state;
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 8419bffb4398..b9caa01dfac4 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -62,7 +62,7 @@ struct em_perf_domain {
 /*
  *  em_perf_domain flags:
  *
- *  EM_PERF_DOMAIN_MILLIWATTS: The power values are in milli-Watts or some
+ *  EM_PERF_DOMAIN_MICROWATTS: The power values are in micro-Watts or some
  *  other scale.
  *
  *  EM_PERF_DOMAIN_SKIP_INEFFICIENCIES: Skip inefficient states when estimating
@@ -71,7 +71,7 @@ struct em_perf_domain {
  *  EM_PERF_DOMAIN_ARTIFICIAL: The power values are artificial and might be
  *  created by platform missing real power information
  */
-#define EM_PERF_DOMAIN_MILLIWATTS BIT(0)
+#define EM_PERF_DOMAIN_MICROWATTS BIT(0)
 #define EM_PERF_DOMAIN_SKIP_INEFFICIENCIES BIT(1)
 #define EM_PERF_DOMAIN_ARTIFICIAL BIT(2)
 
@@ -79,22 +79,44 @@ struct em_perf_domain {
 #define em_is_artificial(em) ((em)->flags & EM_PERF_DOMAIN_ARTIFICIAL)
 
 #ifdef CONFIG_ENERGY_MODEL
-#define EM_MAX_POWER 0xFFFF
+/*
+ * The max power value in micro-Watts. The limit of 64 Watts is set as
+ * a safety net to not overflow multiplications on 32bit platforms. The
+ * 32bit value limit for total Perf Domain power implies a limit of
+ * maximum CPUs in such domain to 64.
+ */
+#define EM_MAX_POWER (64000000) /* 64 Watts */
+
+/*
+ * To avoid possible energy estimation overflow on 32bit machines add
+ * limits to number of CPUs in the Perf. Domain.
+ * We are safe on 64bit machine, thus some big number.
+ */
+#ifdef CONFIG_64BIT
+#define EM_MAX_NUM_CPUS 4096
+#else
+#define EM_MAX_NUM_CPUS 16
+#endif
 
 /*
- * Increase resolution of energy estimation calculations for 64-bit
- * architectures. The extra resolution improves decision made by EAS for the
- * task placement when two Performance Domains might provide similar energy
- * estimation values (w/o better resolution the values could be equal).
+ * To avoid an overflow on 32bit machines while calculating the energy
+ * use a different order in the operation. First divide by the 'cpu_scale'
+ * which would reduce big value stored in the 'cost' field, then multiply by
+ * the 'sum_util'. This would allow to handle existing platforms, which have
+ * e.g. power ~1.3 Watt at max freq, so the 'cost' value > 1mln micro-Watts.
+ * In such scenario, where there are 4 CPUs in the Perf. Domain the 'sum_util'
+ * could be 4096, then multiplication: 'cost' * 'sum_util'  would overflow.
+ * This reordering of operations has some limitations, we lose small
+ * precision in the estimation (comparing to 64bit platform w/o reordering).
  *
- * We increase resolution only if we have enough bits to allow this increased
- * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit
- * are pretty high and the returns do not justify the increased costs.
+ * We are safe on 64bit machine.
  */
 #ifdef CONFIG_64BIT
-#define em_scale_power(p) ((p) * 1000)
+#define em_estimate_energy(cost, sum_util, scale_cpu) \
+	(((cost) * (sum_util)) / (scale_cpu))
 #else
-#define em_scale_power(p) (p)
+#define em_estimate_energy(cost, sum_util, scale_cpu) \
+	(((cost) / (scale_cpu)) * (sum_util))
 #endif
 
 struct em_data_callback {
@@ -112,7 +134,7 @@ struct em_data_callback {
 	 * and frequency.
 	 *
 	 * In case of CPUs, the power is the one of a single CPU in the domain,
-	 * expressed in milli-Watts or an abstract scale. It is expected to
+	 * expressed in micro-Watts or an abstract scale. It is expected to
 	 * fit in the [0, EM_MAX_POWER] range.
 	 *
 	 * Return 0 on success.
@@ -148,7 +170,7 @@ struct em_perf_domain *em_cpu_get(int cpu);
 struct em_perf_domain *em_pd_get(struct device *dev);
 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 				struct em_data_callback *cb, cpumask_t *span,
-				bool milliwatts);
+				bool microwatts);
 void em_dev_unregister_perf_domain(struct device *dev);
 
 /**
@@ -273,7 +295,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 	 *   pd_nrg = ------------------------                       (4)
 	 *                  scale_cpu
 	 */
-	return ps->cost * sum_util / scale_cpu;
+	return em_estimate_energy(ps->cost, sum_util, scale_cpu);
 }
 
 /**
@@ -297,7 +319,7 @@ struct em_data_callback {};
 static inline
 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 				struct em_data_callback *cb, cpumask_t *span,
-				bool milliwatts)
+				bool microwatts)
 {
 	return -EINVAL;
 }
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 6c373f2960e7..f82111837b8d 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -145,7 +145,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
 
 		/*
 		 * The power returned by active_state() is expected to be
-		 * positive and to fit into 16 bits.
+		 * positive and be in range.
 		 */
 		if (!power || power > EM_MAX_POWER) {
 			dev_err(dev, "EM: invalid power: %lu\n",
@@ -170,7 +170,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
 				goto free_ps_table;
 			}
 		} else {
-			power_res = em_scale_power(table[i].power);
+			power_res = table[i].power;
 			cost = div64_u64(fmax * power_res, table[i].frequency);
 		}
 
@@ -201,9 +201,17 @@ static int em_create_pd(struct device *dev, int nr_states,
 {
 	struct em_perf_domain *pd;
 	struct device *cpu_dev;
-	int cpu, ret;
+	int cpu, ret, num_cpus;
 
 	if (_is_cpu_device(dev)) {
+		num_cpus = cpumask_weight(cpus);
+
+		/* Prevent max possible energy calculation to not overflow */
+		if (num_cpus > EM_MAX_NUM_CPUS) {
+			dev_err(dev, "EM: too many CPUs, overflow possible\n");
+			return -EINVAL;
+		}
+
 		pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
 		if (!pd)
 			return -ENOMEM;
@@ -314,13 +322,13 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
  * @cpus	: Pointer to cpumask_t, which in case of a CPU device is
  *		obligatory. It can be taken from i.e. 'policy->cpus'. For other
  *		type of devices this should be set to NULL.
- * @milliwatts	: Flag indicating that the power values are in milliWatts or
+ * @microwatts	: Flag indicating that the power values are in micro-Watts or
  *		in some other scale. It must be set properly.
  *
  * Create Energy Model tables for a performance domain using the callbacks
  * defined in cb.
  *
- * The @milliwatts is important to set with correct value. Some kernel
+ * The @microwatts is important to set with correct value. Some kernel
  * sub-systems might rely on this flag and check if all devices in the EM are
  * using the same scale.
  *
@@ -331,7 +339,7 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
  */
 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 				struct em_data_callback *cb, cpumask_t *cpus,
-				bool milliwatts)
+				bool microwatts)
 {
 	unsigned long cap, prev_cap = 0;
 	unsigned long flags = 0;
@@ -381,8 +389,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 		}
 	}
 
-	if (milliwatts)
-		flags |= EM_PERF_DOMAIN_MILLIWATTS;
+	if (microwatts)
+		flags |= EM_PERF_DOMAIN_MICROWATTS;
 	else if (cb->get_cost)
 		flags |= EM_PERF_DOMAIN_ARTIFICIAL;
 
-- 
cgit 


From 5e0fd2026cdd474a85b3135c312912321e60f47a Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Thu, 7 Jul 2022 08:15:54 +0100
Subject: firmware: arm_scmi: Get detailed power scale from perf

In SCMI v3.1 the power scale can be in micro-Watts. The upper layers, e.g.
cpufreq and EM should handle received power values properly (upscale when
needed). Thus, provide an interface which allows to check what is the
scale for power values. The old interface allowed to distinguish between
bogo-Watts and milli-Watts only (which was good for older SCMI spec).

Acked-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/firmware/arm_scmi/perf.c | 18 +++++++++++-------
 include/linux/scmi_protocol.h    |  8 +++++++-
 2 files changed, 18 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/arm_scmi/perf.c b/drivers/firmware/arm_scmi/perf.c
index bbb0331801ff..92414e53f908 100644
--- a/drivers/firmware/arm_scmi/perf.c
+++ b/drivers/firmware/arm_scmi/perf.c
@@ -170,8 +170,7 @@ struct perf_dom_info {
 struct scmi_perf_info {
 	u32 version;
 	int num_domains;
-	bool power_scale_mw;
-	bool power_scale_uw;
+	enum scmi_power_scale power_scale;
 	u64 stats_addr;
 	u32 stats_size;
 	struct perf_dom_info *dom_info;
@@ -201,9 +200,13 @@ static int scmi_perf_attributes_get(const struct scmi_protocol_handle *ph,
 		u16 flags = le16_to_cpu(attr->flags);
 
 		pi->num_domains = le16_to_cpu(attr->num_domains);
-		pi->power_scale_mw = POWER_SCALE_IN_MILLIWATT(flags);
+
+		if (POWER_SCALE_IN_MILLIWATT(flags))
+			pi->power_scale = SCMI_POWER_MILLIWATTS;
 		if (PROTOCOL_REV_MAJOR(pi->version) >= 0x3)
-			pi->power_scale_uw = POWER_SCALE_IN_MICROWATT(flags);
+			if (POWER_SCALE_IN_MICROWATT(flags))
+				pi->power_scale = SCMI_POWER_MICROWATTS;
+
 		pi->stats_addr = le32_to_cpu(attr->stats_addr_low) |
 				(u64)le32_to_cpu(attr->stats_addr_high) << 32;
 		pi->stats_size = le32_to_cpu(attr->stats_size);
@@ -792,11 +795,12 @@ static bool scmi_fast_switch_possible(const struct scmi_protocol_handle *ph,
 	return dom->fc_info && dom->fc_info->level_set_addr;
 }
 
-static bool scmi_power_scale_mw_get(const struct scmi_protocol_handle *ph)
+static enum scmi_power_scale
+scmi_power_scale_get(const struct scmi_protocol_handle *ph)
 {
 	struct scmi_perf_info *pi = ph->get_priv(ph);
 
-	return pi->power_scale_mw;
+	return pi->power_scale;
 }
 
 static const struct scmi_perf_proto_ops perf_proto_ops = {
@@ -811,7 +815,7 @@ static const struct scmi_perf_proto_ops perf_proto_ops = {
 	.freq_get = scmi_dvfs_freq_get,
 	.est_power_get = scmi_dvfs_est_power_get,
 	.fast_switch_possible = scmi_fast_switch_possible,
-	.power_scale_mw_get = scmi_power_scale_mw_get,
+	.power_scale_get = scmi_power_scale_get,
 };
 
 static int scmi_perf_set_notify_enabled(const struct scmi_protocol_handle *ph,
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 704111f63993..a0a246310ba1 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -60,6 +60,12 @@ struct scmi_clock_info {
 	};
 };
 
+enum scmi_power_scale {
+	SCMI_POWER_BOGOWATTS,
+	SCMI_POWER_MILLIWATTS,
+	SCMI_POWER_MICROWATTS
+};
+
 struct scmi_handle;
 struct scmi_device;
 struct scmi_protocol_handle;
@@ -135,7 +141,7 @@ struct scmi_perf_proto_ops {
 			     unsigned long *rate, unsigned long *power);
 	bool (*fast_switch_possible)(const struct scmi_protocol_handle *ph,
 				     struct device *dev);
-	bool (*power_scale_mw_get)(const struct scmi_protocol_handle *ph);
+	enum scmi_power_scale (*power_scale_get)(const struct scmi_protocol_handle *ph);
 };
 
 /**
-- 
cgit 


From fca8300f68fe3fbb2fcda862f0f114011715b3bc Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Mon, 4 Jul 2022 09:14:36 -0400
Subject: tracing/ipv4/ipv6: Use static array for name field in
 fib*_lookup_table event

The fib_lookup_table and fib6_lookup_table events declare name as a
dynamic_array, but also give it a fixed size, which defeats the purpose of
the dynamic array, especially since the dynamic array also includes meta
data in the event to specify its size.

Since the size of the name is at most 16 bytes (defined by IFNAMSIZ),
it is not worth spending the effort to determine the size of the string.

Just use a fixed size array and copy into it. This will save 4 bytes that
are used for the meta data that saves the size and position of a dynamic
array, and even slightly speed up the event processing.

Link: https://lkml.kernel.org/r/20220704091436.3705edbf@rorschach.local.home

Cc: David S. Miller <davem@davemloft.net>
Cc: netdev@vger.kernel.org
Acked-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/trace/events/fib.h  | 6 +++---
 include/trace/events/fib6.h | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h
index 6f2a4dc35e37..c2300c407f58 100644
--- a/include/trace/events/fib.h
+++ b/include/trace/events/fib.h
@@ -32,7 +32,7 @@ TRACE_EVENT(fib_table_lookup,
 		__array(	__u8,	gw6,	16	)
 		__field(	u16,	sport		)
 		__field(	u16,	dport		)
-		__dynamic_array(char,  name,   IFNAMSIZ )
+		__array(char,  name,   IFNAMSIZ )
 	),
 
 	TP_fast_assign(
@@ -66,7 +66,7 @@ TRACE_EVENT(fib_table_lookup,
 		}
 
 		dev = nhc ? nhc->nhc_dev : NULL;
-		__assign_str(name, dev ? dev->name : "-");
+		strlcpy(__entry->name, dev ? dev->name : "-", IFNAMSIZ);
 
 		if (nhc) {
 			if (nhc->nhc_gw_family == AF_INET) {
@@ -95,7 +95,7 @@ TRACE_EVENT(fib_table_lookup,
 		  __entry->tb_id, __entry->oif, __entry->iif, __entry->proto,
 		  __entry->src, __entry->sport, __entry->dst, __entry->dport,
 		  __entry->tos, __entry->scope, __entry->flags,
-		  __get_str(name), __entry->gw4, __entry->gw6, __entry->err)
+		  __entry->name, __entry->gw4, __entry->gw6, __entry->err)
 );
 #endif /* _TRACE_FIB_H */
 
diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h
index c6abdcc77c12..6e821eb79450 100644
--- a/include/trace/events/fib6.h
+++ b/include/trace/events/fib6.h
@@ -31,7 +31,7 @@ TRACE_EVENT(fib6_table_lookup,
 		__field(        u16,	dport		)
 		__field(        u8,	proto		)
 		__field(        u8,	rt_type		)
-		__dynamic_array(	char,	name,	IFNAMSIZ )
+		__array(		char,	name,	IFNAMSIZ )
 		__array(		__u8,	gw,	16	 )
 	),
 
@@ -63,9 +63,9 @@ TRACE_EVENT(fib6_table_lookup,
 		}
 
 		if (res->nh && res->nh->fib_nh_dev) {
-			__assign_str(name, res->nh->fib_nh_dev);
+			strlcpy(__entry->name, res->nh->fib_nh_dev->name, IFNAMSIZ);
 		} else {
-			__assign_str(name, "-");
+			strcpy(__entry->name, "-");
 		}
 		if (res->f6i == net->ipv6.fib6_null_entry) {
 			struct in6_addr in6_zero = {};
@@ -83,7 +83,7 @@ TRACE_EVENT(fib6_table_lookup,
 		  __entry->tb_id, __entry->oif, __entry->iif, __entry->proto,
 		  __entry->src, __entry->sport, __entry->dst, __entry->dport,
 		  __entry->tos, __entry->scope, __entry->flags,
-		  __get_str(name), __entry->gw, __entry->err)
+		  __entry->name, __entry->gw, __entry->err)
 );
 
 #endif /* _TRACE_FIB6_H */
-- 
cgit 


From 43b2aef3735e0ddb4fc4e6401cd5dc1bf205dead Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 5 Jul 2022 18:37:41 -0400
Subject: neighbor: tracing: Have neigh_create event use __string()

The dev field of the neigh_create event uses __dynamic_array() with a
fixed size, which defeats the purpose of __dynamic_array(). Looking at the
logic, as it already uses __assign_str(), just use the same logic in
__string to create the size needed. It appears that because "dev" can be
NULL, it needs the check. But __string() can have the same checks as
__assign_str() so use them there too.

Link: https://lkml.kernel.org/r/20220705183741.35387e3f@rorschach.local.home

Cc: David Ahern <dsahern@gmail.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: netdev@vger.kernel.org
Acked-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/trace/events/neigh.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/neigh.h b/include/trace/events/neigh.h
index 62bb17516713..5eaa1fa99171 100644
--- a/include/trace/events/neigh.h
+++ b/include/trace/events/neigh.h
@@ -30,7 +30,7 @@ TRACE_EVENT(neigh_create,
 
 	TP_STRUCT__entry(
 		__field(u32, family)
-		__dynamic_array(char,  dev,   IFNAMSIZ )
+		__string(dev, dev ? dev->name : "NULL")
 		__field(int, entries)
 		__field(u8, created)
 		__field(u8, gc_exempt)
-- 
cgit 


From fcfe0ac2fcfae7d5fcad3d0375cb8ff38caf8aba Mon Sep 17 00:00:00 2001
From: Micah Morton <mortonm@chromium.org>
Date: Wed, 8 Jun 2022 20:57:11 +0000
Subject: security: Add LSM hook to setgroups() syscall

Give the LSM framework the ability to filter setgroups() syscalls. There
are already analagous hooks for the set*uid() and set*gid() syscalls.
The SafeSetID LSM will use this new hook to ensure setgroups() calls are
allowed by the installed security policy. Tested by putting print
statement in security_task_fix_setgroups() hook and confirming that it
gets hit when userspace does a setgroups() syscall.

Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Signed-off-by: Micah Morton <mortonm@chromium.org>
---
 include/linux/lsm_hook_defs.h |  1 +
 include/linux/lsm_hooks.h     |  7 +++++++
 include/linux/security.h      |  7 +++++++
 kernel/groups.c               | 13 +++++++++++++
 security/security.c           |  5 +++++
 5 files changed, 33 insertions(+)

(limited to 'include')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index eafa1d2489fd..806448173033 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -201,6 +201,7 @@ LSM_HOOK(int, 0, task_fix_setuid, struct cred *new, const struct cred *old,
 	 int flags)
 LSM_HOOK(int, 0, task_fix_setgid, struct cred *new, const struct cred * old,
 	 int flags)
+LSM_HOOK(int, 0, task_fix_setgroups, struct cred *new, const struct cred * old)
 LSM_HOOK(int, 0, task_setpgid, struct task_struct *p, pid_t pgid)
 LSM_HOOK(int, 0, task_getpgid, struct task_struct *p)
 LSM_HOOK(int, 0, task_getsid, struct task_struct *p)
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 91c8146649f5..84a0d7e02176 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -702,6 +702,13 @@
  *	@old is the set of credentials that are being replaced.
  *	@flags contains one of the LSM_SETID_* values.
  *	Return 0 on success.
+ * @task_fix_setgroups:
+ *	Update the module's state after setting the supplementary group
+ *	identity attributes of the current process.
+ *	@new is the set of credentials that will be installed.  Modifications
+ *	should be made to this rather than to @current->cred.
+ *	@old is the set of credentials that are being replaced.
+ *	Return 0 on success.
  * @task_setpgid:
  *	Check permission before setting the process group identifier of the
  *	process @p to @pgid.
diff --git a/include/linux/security.h b/include/linux/security.h
index 7fc4e9f49f54..1dfd32c49fa3 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -415,6 +415,7 @@ int security_task_fix_setuid(struct cred *new, const struct cred *old,
 			     int flags);
 int security_task_fix_setgid(struct cred *new, const struct cred *old,
 			     int flags);
+int security_task_fix_setgroups(struct cred *new, const struct cred *old);
 int security_task_setpgid(struct task_struct *p, pid_t pgid);
 int security_task_getpgid(struct task_struct *p);
 int security_task_getsid(struct task_struct *p);
@@ -1098,6 +1099,12 @@ static inline int security_task_fix_setgid(struct cred *new,
 	return 0;
 }
 
+static inline int security_task_fix_setgroups(struct cred *new,
+					   const struct cred *old)
+{
+	return 0;
+}
+
 static inline int security_task_setpgid(struct task_struct *p, pid_t pgid)
 {
 	return 0;
diff --git a/kernel/groups.c b/kernel/groups.c
index 787b381c7c00..9aaed2a31073 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -134,13 +134,26 @@ EXPORT_SYMBOL(set_groups);
 int set_current_groups(struct group_info *group_info)
 {
 	struct cred *new;
+	const struct cred *old;
+	int retval;
 
 	new = prepare_creds();
 	if (!new)
 		return -ENOMEM;
 
+	old = current_cred();
+
 	set_groups(new, group_info);
+
+	retval = security_task_fix_setgroups(new, old);
+	if (retval < 0)
+		goto error;
+
 	return commit_creds(new);
+
+error:
+	abort_creds(new);
+	return retval;
 }
 
 EXPORT_SYMBOL(set_current_groups);
diff --git a/security/security.c b/security/security.c
index 188b8f782220..15c686145ad6 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1803,6 +1803,11 @@ int security_task_fix_setgid(struct cred *new, const struct cred *old,
 	return call_int_hook(task_fix_setgid, 0, new, old, flags);
 }
 
+int security_task_fix_setgroups(struct cred *new, const struct cred *old)
+{
+	return call_int_hook(task_fix_setgroups, 0, new, old);
+}
+
 int security_task_setpgid(struct task_struct *p, pid_t pgid)
 {
 	return call_int_hook(task_setpgid, 0, p, pgid);
-- 
cgit 


From c9fa2b07fa99e648b5042a32dbfa39ba68a190db Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Sun, 26 Jun 2022 20:45:07 +0200
Subject: mnt_idmapping: add vfs[g,u]id_into_k[g,u]id()

Add two tiny helpers to conver a vfsuid into a kuid.

Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 include/linux/mnt_idmapping.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include')

diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h
index 41dc80f8b67c..f6e5369d2928 100644
--- a/include/linux/mnt_idmapping.h
+++ b/include/linux/mnt_idmapping.h
@@ -333,6 +333,19 @@ static inline bool vfsuid_has_fsmapping(struct user_namespace *mnt_userns,
 	return uid_valid(from_vfsuid(mnt_userns, fs_userns, vfsuid));
 }
 
+/**
+ * vfsuid_into_kuid - convert vfsuid into kuid
+ * @vfsuid: the vfsuid to convert
+ *
+ * This can be used when a vfsuid is committed as a kuid.
+ *
+ * Return: a kuid with the value of @vfsuid
+ */
+static inline kuid_t vfsuid_into_kuid(vfsuid_t vfsuid)
+{
+	return AS_KUIDT(vfsuid);
+}
+
 /**
  * from_vfsgid - map a vfsgid into the filesystem idmapping
  * @mnt_userns: the mount's idmapping
@@ -406,6 +419,19 @@ static inline bool vfsgid_has_fsmapping(struct user_namespace *mnt_userns,
 	return gid_valid(from_vfsgid(mnt_userns, fs_userns, vfsgid));
 }
 
+/**
+ * vfsgid_into_kgid - convert vfsgid into kgid
+ * @vfsgid: the vfsgid to convert
+ *
+ * This can be used when a vfsgid is committed as a kgid.
+ *
+ * Return: a kgid with the value of @vfsgid
+ */
+static inline kgid_t vfsgid_into_kgid(vfsgid_t vfsgid)
+{
+	return AS_KGIDT(vfsgid);
+}
+
 /**
  * mapped_fsuid - return caller's fsuid mapped up into a mnt_userns
  * @mnt_userns: the mount's idmapping
-- 
cgit 


From 0c5fd887d2bb47aa37aa9fb1eb1d1d2abac62972 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 6 Jul 2022 18:30:59 +0200
Subject: acl: move idmapped mount fixup into vfs_{g,s}etxattr()

This cycle we added support for mounting overlayfs on top of idmapped mounts.
Recently I've started looking into potential corner cases when trying to add
additional tests and I noticed that reporting for POSIX ACLs is currently wrong
when using idmapped layers with overlayfs mounted on top of it.

I'm going to give a rather detailed explanation to both the origin of the
problem and the solution.

Let's assume the user creates the following directory layout and they have a
rootfs /var/lib/lxc/c1/rootfs. The files in this rootfs are owned as you would
expect files on your host system to be owned. For example, ~/.bashrc for your
regular user would be owned by 1000:1000 and /root/.bashrc would be owned by
0:0. IOW, this is just regular boring filesystem tree on an ext4 or xfs
filesystem.

The user chooses to set POSIX ACLs using the setfacl binary granting the user
with uid 4 read, write, and execute permissions for their .bashrc file:

        setfacl -m u:4:rwx /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc

Now they to expose the whole rootfs to a container using an idmapped mount. So
they first create:

        mkdir -pv /vol/contpool/{ctrover,merge,lowermap,overmap}
        mkdir -pv /vol/contpool/ctrover/{over,work}
        chown 10000000:10000000 /vol/contpool/ctrover/{over,work}

The user now creates an idmapped mount for the rootfs:

        mount-idmapped/mount-idmapped --map-mount=b:0:10000000:65536 \
                                      /var/lib/lxc/c2/rootfs \
                                      /vol/contpool/lowermap

This for example makes it so that /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc
which is owned by uid and gid 1000 as being owned by uid and gid 10001000 at
/vol/contpool/lowermap/home/ubuntu/.bashrc.

Assume the user wants to expose these idmapped mounts through an overlayfs
mount to a container.

       mount -t overlay overlay                      \
             -o lowerdir=/vol/contpool/lowermap,     \
                upperdir=/vol/contpool/overmap/over, \
                workdir=/vol/contpool/overmap/work   \
             /vol/contpool/merge

The user can do this in two ways:

(1) Mount overlayfs in the initial user namespace and expose it to the
    container.
(2) Mount overlayfs on top of the idmapped mounts inside of the container's
    user namespace.

Let's assume the user chooses the (1) option and mounts overlayfs on the host
and then changes into a container which uses the idmapping 0:10000000:65536
which is the same used for the two idmapped mounts.

Now the user tries to retrieve the POSIX ACLs using the getfacl command

        getfacl -n /vol/contpool/lowermap/home/ubuntu/.bashrc

and to their surprise they see:

        # file: vol/contpool/merge/home/ubuntu/.bashrc
        # owner: 1000
        # group: 1000
        user::rw-
        user:4294967295:rwx
        group::r--
        mask::rwx
        other::r--

indicating the the uid wasn't correctly translated according to the idmapped
mount. The problem is how we currently translate POSIX ACLs. Let's inspect the
callchain in this example:

        idmapped mount /vol/contpool/merge:      0:10000000:65536
        caller's idmapping:                      0:10000000:65536
        overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */

        sys_getxattr()
        -> path_getxattr()
           -> getxattr()
              -> do_getxattr()
                  |> vfs_getxattr()
                  |  -> __vfs_getxattr()
                  |     -> handler->get == ovl_posix_acl_xattr_get()
                  |        -> ovl_xattr_get()
                  |           -> vfs_getxattr()
                  |              -> __vfs_getxattr()
                  |                 -> handler->get() /* lower filesystem callback */
                  |> posix_acl_fix_xattr_to_user()
                     {
                              4 = make_kuid(&init_user_ns, 4);
                              4 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 4);
                              /* FAILURE */
                             -1 = from_kuid(0:10000000:65536 /* caller's idmapping */, 4);
                     }

If the user chooses to use option (2) and mounts overlayfs on top of idmapped
mounts inside the container things don't look that much better:

        idmapped mount /vol/contpool/merge:      0:10000000:65536
        caller's idmapping:                      0:10000000:65536
        overlayfs idmapping (ofs->creator_cred): 0:10000000:65536

        sys_getxattr()
        -> path_getxattr()
           -> getxattr()
              -> do_getxattr()
                  |> vfs_getxattr()
                  |  -> __vfs_getxattr()
                  |     -> handler->get == ovl_posix_acl_xattr_get()
                  |        -> ovl_xattr_get()
                  |           -> vfs_getxattr()
                  |              -> __vfs_getxattr()
                  |                 -> handler->get() /* lower filesystem callback */
                  |> posix_acl_fix_xattr_to_user()
                     {
                              4 = make_kuid(&init_user_ns, 4);
                              4 = mapped_kuid_fs(&init_user_ns, 4);
                              /* FAILURE */
                             -1 = from_kuid(0:10000000:65536 /* caller's idmapping */, 4);
                     }

As is easily seen the problem arises because the idmapping of the lower mount
isn't taken into account as all of this happens in do_gexattr(). But
do_getxattr() is always called on an overlayfs mount and inode and thus cannot
possible take the idmapping of the lower layers into account.

This problem is similar for fscaps but there the translation happens as part of
vfs_getxattr() already. Let's walk through an fscaps overlayfs callchain:

        setcap 'cap_net_raw+ep' /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc

The expected outcome here is that we'll receive the cap_net_raw capability as
we are able to map the uid associated with the fscap to 0 within our container.
IOW, we want to see 0 as the result of the idmapping translations.

If the user chooses option (1) we get the following callchain for fscaps:

        idmapped mount /vol/contpool/merge:      0:10000000:65536
        caller's idmapping:                      0:10000000:65536
        overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */

        sys_getxattr()
        -> path_getxattr()
           -> getxattr()
              -> do_getxattr()
                   -> vfs_getxattr()
                      -> xattr_getsecurity()
                         -> security_inode_getsecurity()                                       ________________________________
                            -> cap_inode_getsecurity()                                         |                              |
                               {                                                               V                              |
                                        10000000 = make_kuid(0:0:4k /* overlayfs idmapping */, 10000000);                     |
                                        10000000 = mapped_kuid_fs(0:0:4k /* no idmapped mount */, 10000000);                  |
                                               /* Expected result is 0 and thus that we own the fscap. */                     |
                                               0 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000000);            |
                               }                                                                                              |
                               -> vfs_getxattr_alloc()                                                                        |
                                  -> handler->get == ovl_other_xattr_get()                                                    |
                                     -> vfs_getxattr()                                                                        |
                                        -> xattr_getsecurity()                                                                |
                                           -> security_inode_getsecurity()                                                    |
                                              -> cap_inode_getsecurity()                                                      |
                                                 {                                                                            |
                                                                0 = make_kuid(0:0:4k /* lower s_user_ns */, 0);               |
                                                         10000000 = mapped_kuid_fs(0:10000000:65536 /* idmapped mount */, 0); |
                                                         10000000 = from_kuid(0:0:4k /* overlayfs idmapping */, 10000000);    |
                                                         |____________________________________________________________________|
                                                 }
                                                 -> vfs_getxattr_alloc()
                                                    -> handler->get == /* lower filesystem callback */

And if the user chooses option (2) we get:

        idmapped mount /vol/contpool/merge:      0:10000000:65536
        caller's idmapping:                      0:10000000:65536
        overlayfs idmapping (ofs->creator_cred): 0:10000000:65536

        sys_getxattr()
        -> path_getxattr()
           -> getxattr()
              -> do_getxattr()
                   -> vfs_getxattr()
                      -> xattr_getsecurity()
                         -> security_inode_getsecurity()                                                _______________________________
                            -> cap_inode_getsecurity()                                                  |                             |
                               {                                                                        V                             |
                                       10000000 = make_kuid(0:10000000:65536 /* overlayfs idmapping */, 0);                           |
                                       10000000 = mapped_kuid_fs(0:0:4k /* no idmapped mount */, 10000000);                           |
                                               /* Expected result is 0 and thus that we own the fscap. */                             |
                                              0 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000000);                     |
                               }                                                                                                      |
                               -> vfs_getxattr_alloc()                                                                                |
                                  -> handler->get == ovl_other_xattr_get()                                                            |
                                    |-> vfs_getxattr()                                                                                |
                                        -> xattr_getsecurity()                                                                        |
                                           -> security_inode_getsecurity()                                                            |
                                              -> cap_inode_getsecurity()                                                              |
                                                 {                                                                                    |
                                                                 0 = make_kuid(0:0:4k /* lower s_user_ns */, 0);                      |
                                                          10000000 = mapped_kuid_fs(0:10000000:65536 /* idmapped mount */, 0);        |
                                                                 0 = from_kuid(0:10000000:65536 /* overlayfs idmapping */, 10000000); |
                                                                 |____________________________________________________________________|
                                                 }
                                                 -> vfs_getxattr_alloc()
                                                    -> handler->get == /* lower filesystem callback */

We can see how the translation happens correctly in those cases as the
conversion happens within the vfs_getxattr() helper.

For POSIX ACLs we need to do something similar. However, in contrast to fscaps
we cannot apply the fix directly to the kernel internal posix acl data
structure as this would alter the cached values and would also require a rework
of how we currently deal with POSIX ACLs in general which almost never take the
filesystem idmapping into account (the noteable exception being FUSE but even
there the implementation is special) and instead retrieve the raw values based
on the initial idmapping.

The correct values are then generated right before returning to userspace. The
fix for this is to move taking the mount's idmapping into account directly in
vfs_getxattr() instead of having it be part of posix_acl_fix_xattr_to_user().

To this end we split out two small and unexported helpers
posix_acl_getxattr_idmapped_mnt() and posix_acl_setxattr_idmapped_mnt(). The
former to be called in vfs_getxattr() and the latter to be called in
vfs_setxattr().

Let's go back to the original example. Assume the user chose option (1) and
mounted overlayfs on top of idmapped mounts on the host:

        idmapped mount /vol/contpool/merge:      0:10000000:65536
        caller's idmapping:                      0:10000000:65536
        overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */

        sys_getxattr()
        -> path_getxattr()
           -> getxattr()
              -> do_getxattr()
                  |> vfs_getxattr()
                  |  |> __vfs_getxattr()
                  |  |  -> handler->get == ovl_posix_acl_xattr_get()
                  |  |     -> ovl_xattr_get()
                  |  |        -> vfs_getxattr()
                  |  |           |> __vfs_getxattr()
                  |  |           |  -> handler->get() /* lower filesystem callback */
                  |  |           |> posix_acl_getxattr_idmapped_mnt()
                  |  |              {
                  |  |                              4 = make_kuid(&init_user_ns, 4);
                  |  |                       10000004 = mapped_kuid_fs(0:10000000:65536 /* lower idmapped mount */, 4);
                  |  |                       10000004 = from_kuid(&init_user_ns, 10000004);
                  |  |                       |_______________________
                  |  |              }                               |
                  |  |                                              |
                  |  |> posix_acl_getxattr_idmapped_mnt()           |
                  |     {                                           |
                  |                                                 V
                  |             10000004 = make_kuid(&init_user_ns, 10000004);
                  |             10000004 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 10000004);
                  |             10000004 = from_kuid(&init_user_ns, 10000004);
                  |     }       |_________________________________________________
                  |                                                              |
                  |                                                              |
                  |> posix_acl_fix_xattr_to_user()                               |
                     {                                                           V
                                 10000004 = make_kuid(0:0:4k /* init_user_ns */, 10000004);
                                        /* SUCCESS */
                                        4 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000004);
                     }

And similarly if the user chooses option (1) and mounted overayfs on top of
idmapped mounts inside the container:

        idmapped mount /vol/contpool/merge:      0:10000000:65536
        caller's idmapping:                      0:10000000:65536
        overlayfs idmapping (ofs->creator_cred): 0:10000000:65536

        sys_getxattr()
        -> path_getxattr()
           -> getxattr()
              -> do_getxattr()
                  |> vfs_getxattr()
                  |  |> __vfs_getxattr()
                  |  |  -> handler->get == ovl_posix_acl_xattr_get()
                  |  |     -> ovl_xattr_get()
                  |  |        -> vfs_getxattr()
                  |  |           |> __vfs_getxattr()
                  |  |           |  -> handler->get() /* lower filesystem callback */
                  |  |           |> posix_acl_getxattr_idmapped_mnt()
                  |  |              {
                  |  |                              4 = make_kuid(&init_user_ns, 4);
                  |  |                       10000004 = mapped_kuid_fs(0:10000000:65536 /* lower idmapped mount */, 4);
                  |  |                       10000004 = from_kuid(&init_user_ns, 10000004);
                  |  |                       |_______________________
                  |  |              }                               |
                  |  |                                              |
                  |  |> posix_acl_getxattr_idmapped_mnt()           |
                  |     {                                           V
                  |             10000004 = make_kuid(&init_user_ns, 10000004);
                  |             10000004 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 10000004);
                  |             10000004 = from_kuid(0(&init_user_ns, 10000004);
                  |             |_________________________________________________
                  |     }                                                        |
                  |                                                              |
                  |> posix_acl_fix_xattr_to_user()                               |
                     {                                                           V
                                 10000004 = make_kuid(0:0:4k /* init_user_ns */, 10000004);
                                        /* SUCCESS */
                                        4 = from_kuid(0:10000000:65536 /* caller's idmappings */, 10000004);
                     }

The last remaining problem we need to fix here is ovl_get_acl(). During
ovl_permission() overlayfs will call:

        ovl_permission()
        -> generic_permission()
           -> acl_permission_check()
              -> check_acl()
                 -> get_acl()
                    -> inode->i_op->get_acl() == ovl_get_acl()
                        > get_acl() /* on the underlying filesystem)
                          ->inode->i_op->get_acl() == /*lower filesystem callback */
                 -> posix_acl_permission()

passing through the get_acl request to the underlying filesystem. This will
retrieve the acls stored in the lower filesystem without taking the idmapping
of the underlying mount into account as this would mean altering the cached
values for the lower filesystem. So we block using ACLs for now until we
decided on a nice way to fix this. Note this limitation both in the
documentation and in the code.

The most straightforward solution would be to have ovl_get_acl() simply
duplicate the ACLs, update the values according to the idmapped mount and
return it to acl_permission_check() so it can be used in posix_acl_permission()
forgetting them afterwards. This is a bit heavy handed but fairly
straightforward otherwise.

Link: https://github.com/brauner/mount-idmapped/issues/9
Link: https://lore.kernel.org/r/20220708090134.385160-2-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Cc: linux-unionfs@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 fs/ksmbd/vfs.c                  |   2 +-
 fs/ksmbd/vfs.h                  |   2 +-
 fs/overlayfs/overlayfs.h        |   3 +-
 fs/posix_acl.c                  | 147 +++++++++++++++++++++++++++++-----------
 fs/xattr.c                      |  25 +++++--
 include/linux/posix_acl_xattr.h |  34 ++++++----
 include/linux/xattr.h           |   2 +-
 7 files changed, 151 insertions(+), 64 deletions(-)

(limited to 'include')

diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index 05efcdf7a4a7..7c849024999f 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -963,7 +963,7 @@ ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns,
  */
 int ksmbd_vfs_setxattr(struct user_namespace *user_ns,
 		       struct dentry *dentry, const char *attr_name,
-		       const void *attr_value, size_t attr_size, int flags)
+		       void *attr_value, size_t attr_size, int flags)
 {
 	int err;
 
diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h
index 8c37aaf936ab..70da4c0ba7ad 100644
--- a/fs/ksmbd/vfs.h
+++ b/fs/ksmbd/vfs.h
@@ -109,7 +109,7 @@ ssize_t ksmbd_vfs_casexattr_len(struct user_namespace *user_ns,
 				int attr_name_len);
 int ksmbd_vfs_setxattr(struct user_namespace *user_ns,
 		       struct dentry *dentry, const char *attr_name,
-		       const void *attr_value, size_t attr_size, int flags);
+		       void *attr_value, size_t attr_size, int flags);
 int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name,
 				size_t *xattr_stream_name_size, int s_type);
 int ksmbd_vfs_remove_xattr(struct user_namespace *user_ns,
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index e22e20f4811a..6ec815b84d48 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -249,7 +249,8 @@ static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry,
 				  const char *name, const void *value,
 				  size_t size, int flags)
 {
-	int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name, value, size, flags);
+	int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name,
+			       (void *)value, size, flags);
 
 	pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n",
 		 dentry, name, min((int)size, 48), value, size, flags, err);
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 962d32468eb4..d954852a0158 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -375,8 +375,7 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode,
                                         goto check_perm;
                                 break;
                         case ACL_USER:
-				uid = mapped_kuid_fs(mnt_userns,
-						     i_user_ns(inode),
+				uid = mapped_kuid_fs(mnt_userns, &init_user_ns,
 						     pa->e_uid);
 				if (uid_eq(uid, current_fsuid()))
                                         goto mask;
@@ -390,8 +389,7 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode,
                                 }
 				break;
                         case ACL_GROUP:
-				gid = mapped_kgid_fs(mnt_userns,
-						     i_user_ns(inode),
+				gid = mapped_kgid_fs(mnt_userns, &init_user_ns,
 						     pa->e_gid);
 				if (in_group_p(gid)) {
 					found = 1;
@@ -710,46 +708,127 @@ EXPORT_SYMBOL(posix_acl_update_mode);
 /*
  * Fix up the uids and gids in posix acl extended attributes in place.
  */
-static void posix_acl_fix_xattr_userns(
-	struct user_namespace *to, struct user_namespace *from,
-	struct user_namespace *mnt_userns,
-	void *value, size_t size, bool from_user)
+static int posix_acl_fix_xattr_common(void *value, size_t size)
+{
+	struct posix_acl_xattr_header *header = value;
+	int count;
+
+	if (!header)
+		return -EINVAL;
+	if (size < sizeof(struct posix_acl_xattr_header))
+		return -EINVAL;
+	if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
+		return -EINVAL;
+
+	count = posix_acl_xattr_count(size);
+	if (count < 0)
+		return -EINVAL;
+	if (count == 0)
+		return -EINVAL;
+
+	return count;
+}
+
+void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns,
+				     const struct inode *inode,
+				     void *value, size_t size)
 {
 	struct posix_acl_xattr_header *header = value;
 	struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
 	int count;
+	vfsuid_t vfsuid;
+	vfsgid_t vfsgid;
 	kuid_t uid;
 	kgid_t gid;
 
-	if (!value)
+	if (no_idmapping(mnt_userns, i_user_ns(inode)))
 		return;
-	if (size < sizeof(struct posix_acl_xattr_header))
+
+	count = posix_acl_fix_xattr_common(value, size);
+	if (count < 0)
 		return;
-	if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
+
+	for (end = entry + count; entry != end; entry++) {
+		switch (le16_to_cpu(entry->e_tag)) {
+		case ACL_USER:
+			uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id));
+			vfsuid = make_vfsuid(mnt_userns, &init_user_ns, uid);
+			entry->e_id = cpu_to_le32(from_kuid(&init_user_ns,
+						vfsuid_into_kuid(vfsuid)));
+			break;
+		case ACL_GROUP:
+			gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id));
+			vfsgid = make_vfsgid(mnt_userns, &init_user_ns, gid);
+			entry->e_id = cpu_to_le32(from_kgid(&init_user_ns,
+						vfsgid_into_kgid(vfsgid)));
+			break;
+		default:
+			break;
+		}
+	}
+}
+
+void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns,
+				     const struct inode *inode,
+				     void *value, size_t size)
+{
+	struct posix_acl_xattr_header *header = value;
+	struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
+	int count;
+	vfsuid_t vfsuid;
+	vfsgid_t vfsgid;
+	kuid_t uid;
+	kgid_t gid;
+
+	if (no_idmapping(mnt_userns, i_user_ns(inode)))
 		return;
 
-	count = posix_acl_xattr_count(size);
+	count = posix_acl_fix_xattr_common(value, size);
 	if (count < 0)
 		return;
-	if (count == 0)
+
+	for (end = entry + count; entry != end; entry++) {
+		switch (le16_to_cpu(entry->e_tag)) {
+		case ACL_USER:
+			uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id));
+			vfsuid = VFSUIDT_INIT(uid);
+			uid = from_vfsuid(mnt_userns, &init_user_ns, vfsuid);
+			entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, uid));
+			break;
+		case ACL_GROUP:
+			gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id));
+			vfsgid = VFSGIDT_INIT(gid);
+			gid = from_vfsgid(mnt_userns, &init_user_ns, vfsgid);
+			entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, gid));
+			break;
+		default:
+			break;
+		}
+	}
+}
+
+static void posix_acl_fix_xattr_userns(
+	struct user_namespace *to, struct user_namespace *from,
+	void *value, size_t size)
+{
+	struct posix_acl_xattr_header *header = value;
+	struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
+	int count;
+	kuid_t uid;
+	kgid_t gid;
+
+	count = posix_acl_fix_xattr_common(value, size);
+	if (count < 0)
 		return;
 
 	for (end = entry + count; entry != end; entry++) {
 		switch(le16_to_cpu(entry->e_tag)) {
 		case ACL_USER:
 			uid = make_kuid(from, le32_to_cpu(entry->e_id));
-			if (from_user)
-				uid = mapped_kuid_user(mnt_userns, &init_user_ns, uid);
-			else
-				uid = mapped_kuid_fs(mnt_userns, &init_user_ns, uid);
 			entry->e_id = cpu_to_le32(from_kuid(to, uid));
 			break;
 		case ACL_GROUP:
 			gid = make_kgid(from, le32_to_cpu(entry->e_id));
-			if (from_user)
-				gid = mapped_kgid_user(mnt_userns, &init_user_ns, gid);
-			else
-				gid = mapped_kgid_fs(mnt_userns, &init_user_ns, gid);
 			entry->e_id = cpu_to_le32(from_kgid(to, gid));
 			break;
 		default:
@@ -758,34 +837,20 @@ static void posix_acl_fix_xattr_userns(
 	}
 }
 
-void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns,
-				   struct inode *inode,
-				   void *value, size_t size)
+void posix_acl_fix_xattr_from_user(void *value, size_t size)
 {
 	struct user_namespace *user_ns = current_user_ns();
-
-	/* Leave ids untouched on non-idmapped mounts. */
-	if (no_idmapping(mnt_userns, i_user_ns(inode)))
-		mnt_userns = &init_user_ns;
-	if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns))
+	if (user_ns == &init_user_ns)
 		return;
-	posix_acl_fix_xattr_userns(&init_user_ns, user_ns, mnt_userns, value,
-				   size, true);
+	posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size);
 }
 
-void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns,
-				 struct inode *inode,
-				 void *value, size_t size)
+void posix_acl_fix_xattr_to_user(void *value, size_t size)
 {
 	struct user_namespace *user_ns = current_user_ns();
-
-	/* Leave ids untouched on non-idmapped mounts. */
-	if (no_idmapping(mnt_userns, i_user_ns(inode)))
-		mnt_userns = &init_user_ns;
-	if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns))
+	if (user_ns == &init_user_ns)
 		return;
-	posix_acl_fix_xattr_userns(user_ns, &init_user_ns, mnt_userns, value,
-				   size, false);
+	posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size);
 }
 
 /*
diff --git a/fs/xattr.c b/fs/xattr.c
index e8dd03e4561e..a1f4998bc6be 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -282,9 +282,15 @@ out:
 }
 EXPORT_SYMBOL_GPL(__vfs_setxattr_locked);
 
+static inline bool is_posix_acl_xattr(const char *name)
+{
+	return (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
+	       (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0);
+}
+
 int
 vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
-	     const char *name, const void *value, size_t size, int flags)
+	     const char *name, void *value, size_t size, int flags)
 {
 	struct inode *inode = dentry->d_inode;
 	struct inode *delegated_inode = NULL;
@@ -292,12 +298,16 @@ vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	int error;
 
 	if (size && strcmp(name, XATTR_NAME_CAPS) == 0) {
-		error = cap_convert_nscap(mnt_userns, dentry, &value, size);
+		error = cap_convert_nscap(mnt_userns, dentry,
+					  (const void **)&value, size);
 		if (error < 0)
 			return error;
 		size = error;
 	}
 
+	if (size && is_posix_acl_xattr(name))
+		posix_acl_setxattr_idmapped_mnt(mnt_userns, inode, value, size);
+
 retry_deleg:
 	inode_lock(inode);
 	error = __vfs_setxattr_locked(mnt_userns, dentry, name, value, size,
@@ -431,7 +441,10 @@ vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 		return ret;
 	}
 nolsm:
-	return __vfs_getxattr(dentry, inode, name, value, size);
+	error = __vfs_getxattr(dentry, inode, name, value, size);
+	if (error > 0 && is_posix_acl_xattr(name))
+		posix_acl_getxattr_idmapped_mnt(mnt_userns, inode, value, size);
+	return error;
 }
 EXPORT_SYMBOL_GPL(vfs_getxattr);
 
@@ -577,8 +590,7 @@ static void setxattr_convert(struct user_namespace *mnt_userns,
 	if (ctx->size &&
 		((strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
 		(strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)))
-		posix_acl_fix_xattr_from_user(mnt_userns, d_inode(d),
-						ctx->kvalue, ctx->size);
+		posix_acl_fix_xattr_from_user(ctx->kvalue, ctx->size);
 }
 
 int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
@@ -695,8 +707,7 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d,
 	if (error > 0) {
 		if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
 		    (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
-			posix_acl_fix_xattr_to_user(mnt_userns, d_inode(d),
-							ctx->kvalue, error);
+			posix_acl_fix_xattr_to_user(ctx->kvalue, error);
 		if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error))
 			error = -EFAULT;
 	} else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) {
diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h
index 1766e1de6956..b6bd3eac2bcc 100644
--- a/include/linux/posix_acl_xattr.h
+++ b/include/linux/posix_acl_xattr.h
@@ -33,21 +33,31 @@ posix_acl_xattr_count(size_t size)
 }
 
 #ifdef CONFIG_FS_POSIX_ACL
-void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns,
-				   struct inode *inode,
-				   void *value, size_t size);
-void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns,
-				   struct inode *inode,
-				 void *value, size_t size);
+void posix_acl_fix_xattr_from_user(void *value, size_t size);
+void posix_acl_fix_xattr_to_user(void *value, size_t size);
+void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns,
+				     const struct inode *inode,
+				     void *value, size_t size);
+void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns,
+				     const struct inode *inode,
+				     void *value, size_t size);
 #else
-static inline void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns,
-						 struct inode *inode,
-						 void *value, size_t size)
+static inline void posix_acl_fix_xattr_from_user(void *value, size_t size)
 {
 }
-static inline void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns,
-					       struct inode *inode,
-					       void *value, size_t size)
+static inline void posix_acl_fix_xattr_to_user(void *value, size_t size)
+{
+}
+static inline void
+posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns,
+				const struct inode *inode, void *value,
+				size_t size)
+{
+}
+static inline void
+posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns,
+				const struct inode *inode, void *value,
+				size_t size)
 {
 }
 #endif
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index 4c379d23ec6e..979a9d3e5bfb 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -61,7 +61,7 @@ int __vfs_setxattr_locked(struct user_namespace *, struct dentry *,
 			  const char *, const void *, size_t, int,
 			  struct inode **);
 int vfs_setxattr(struct user_namespace *, struct dentry *, const char *,
-		 const void *, size_t, int);
+		 void *, size_t, int);
 int __vfs_removexattr(struct user_namespace *, struct dentry *, const char *);
 int __vfs_removexattr_locked(struct user_namespace *, struct dentry *,
 			     const char *, struct inode **);
-- 
cgit 


From 8043bffd01833a8544f2466fb3804310d6e73d09 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 6 Jul 2022 17:13:23 +0200
Subject: acl: make posix_acl_clone() available to overlayfs

The ovl_get_acl() function needs to alter the POSIX ACLs retrieved from the
lower filesystem. Instead of hand-rolling a overlayfs specific
posix_acl_clone() variant allow export it. It's not special and it's not deeply
internal anyway.

Link: https://lore.kernel.org/r/20220708090134.385160-3-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Cc: linux-unionfs@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 fs/posix_acl.c            | 3 ++-
 include/linux/posix_acl.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index d4b60c18fda7..1d17d7b13dcd 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -199,7 +199,7 @@ EXPORT_SYMBOL(posix_acl_alloc);
 /*
  * Clone an ACL.
  */
-static struct posix_acl *
+struct posix_acl *
 posix_acl_clone(const struct posix_acl *acl, gfp_t flags)
 {
 	struct posix_acl *clone = NULL;
@@ -213,6 +213,7 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags)
 	}
 	return clone;
 }
+EXPORT_SYMBOL_GPL(posix_acl_clone);
 
 /*
  * Check if an acl is valid. Returns 0 if it is, or -E... otherwise.
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index b65c877d92b8..7d1e604c1325 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -73,6 +73,7 @@ extern int set_posix_acl(struct user_namespace *, struct inode *, int,
 			 struct posix_acl *);
 
 struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type);
+struct posix_acl *posix_acl_clone(const struct posix_acl *acl, gfp_t flags);
 
 #ifdef CONFIG_FS_POSIX_ACL
 int posix_acl_chmod(struct user_namespace *, struct inode *, umode_t);
-- 
cgit 


From 0563231f93c6d1f582b168a47753b345c1e20d81 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 5 Jul 2022 18:44:54 -0400
Subject: tracing/events: Add __vstring() and __assign_vstr() helper macros

There's several places that open code the following logic:

  TP_STRUCT__entry(__dynamic_array(char, msg, MSG_MAX)),
  TP_fast_assign(vsnprintf(__get_str(msg), MSG_MAX, vaf->fmt, *vaf->va);)

To load a string created by variable array va_list.

The main issue with this approach is that "MSG_MAX" usage in the
__dynamic_array() portion. That actually just reserves the MSG_MAX in the
event, and even wastes space because there's dynamic meta data also saved
in the event to denote the offset and size of the dynamic array. It would
have been better to just use a static __array() field.

Instead, create __vstring() and __assign_vstr() that work like __string
and __assign_str() but instead of taking a destination string to copy,
take a format string and a va_list pointer and fill in the values.

It uses the helper:

 #define __trace_event_vstr_len(fmt, va)		\
 ({							\
	va_list __ap;					\
	int __ret;					\
							\
	va_copy(__ap, *(va));				\
	__ret = vsnprintf(NULL, 0, fmt, __ap) + 1;	\
	va_end(__ap);					\
							\
	min(__ret, TRACE_EVENT_STR_MAX);		\
 })

To figure out the length to store the string. It may be slightly slower as
it needs to run the vsnprintf() twice, but it now saves space on the ring
buffer.

Link: https://lkml.kernel.org/r/20220705224749.053570613@goodmis.org

Cc: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Kalle Valo <kvalo@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Arend van Spriel <aspriel@gmail.com>
Cc: Franky Lin <franky.lin@broadcom.com>
Cc: Hante Meuleman <hante.meuleman@broadcom.com>
Cc: Gregory Greenman <gregory.greenman@intel.com>
Cc: Peter Chen <peter.chen@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Mathias Nyman <mathias.nyman@intel.com>
Cc: Chunfeng Yun <chunfeng.yun@mediatek.com>
Cc: Bin Liu <b-liu@ti.com>
Cc: Marek Lindner <mareklindner@neomailbox.ch>
Cc: Simon Wunderlich <sw@simonwunderlich.de>
Cc: Antonio Quartulli <a@unstable.cc>
Cc: Sven Eckelmann <sven@narfation.org>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Jim Cromie <jim.cromie@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/trace_events.h                 | 18 ++++++++++++++++++
 include/trace/stages/stage1_struct_define.h  |  3 +++
 include/trace/stages/stage2_data_offsets.h   |  3 +++
 include/trace/stages/stage4_event_fields.h   |  3 +++
 include/trace/stages/stage5_get_offsets.h    |  4 ++++
 include/trace/stages/stage6_event_callback.h |  7 +++++++
 6 files changed, 38 insertions(+)

(limited to 'include')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index e6e95a9f07a5..b18759a673c6 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -916,6 +916,24 @@ perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type,
 
 #endif
 
+#define TRACE_EVENT_STR_MAX	512
+
+/*
+ * gcc warns that you can not use a va_list in an inlined
+ * function. But lets me make it into a macro :-/
+ */
+#define __trace_event_vstr_len(fmt, va)			\
+({							\
+	va_list __ap;					\
+	int __ret;					\
+							\
+	va_copy(__ap, *(va));				\
+	__ret = vsnprintf(NULL, 0, fmt, __ap) + 1;	\
+	va_end(__ap);					\
+							\
+	min(__ret, TRACE_EVENT_STR_MAX);		\
+})
+
 #endif /* _LINUX_TRACE_EVENT_H */
 
 /*
diff --git a/include/trace/stages/stage1_struct_define.h b/include/trace/stages/stage1_struct_define.h
index a16783419687..1b7bab60434c 100644
--- a/include/trace/stages/stage1_struct_define.h
+++ b/include/trace/stages/stage1_struct_define.h
@@ -26,6 +26,9 @@
 #undef __string_len
 #define __string_len(item, src, len) __dynamic_array(char, item, -1)
 
+#undef __vstring
+#define __vstring(item, fmt, ap) __dynamic_array(char, item, -1)
+
 #undef __bitmask
 #define __bitmask(item, nr_bits) __dynamic_array(char, item, -1)
 
diff --git a/include/trace/stages/stage2_data_offsets.h b/include/trace/stages/stage2_data_offsets.h
index 42fd1e8813ec..1b7a8f764fdd 100644
--- a/include/trace/stages/stage2_data_offsets.h
+++ b/include/trace/stages/stage2_data_offsets.h
@@ -32,6 +32,9 @@
 #undef __string_len
 #define __string_len(item, src, len) __dynamic_array(char, item, -1)
 
+#undef __vstring
+#define __vstring(item, fmt, ap) __dynamic_array(char, item, -1)
+
 #undef __bitmask
 #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1)
 
diff --git a/include/trace/stages/stage4_event_fields.h b/include/trace/stages/stage4_event_fields.h
index e80cdc397a43..c3790ec7a453 100644
--- a/include/trace/stages/stage4_event_fields.h
+++ b/include/trace/stages/stage4_event_fields.h
@@ -38,6 +38,9 @@
 #undef __string_len
 #define __string_len(item, src, len) __dynamic_array(char, item, -1)
 
+#undef __vstring
+#define __vstring(item, fmt, ap) __dynamic_array(char, item, -1)
+
 #undef __bitmask
 #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1)
 
diff --git a/include/trace/stages/stage5_get_offsets.h b/include/trace/stages/stage5_get_offsets.h
index 7ee5931300e6..fba4c24ed9e6 100644
--- a/include/trace/stages/stage5_get_offsets.h
+++ b/include/trace/stages/stage5_get_offsets.h
@@ -39,6 +39,10 @@
 #undef __string_len
 #define __string_len(item, src, len) __dynamic_array(char, item, (len) + 1)
 
+#undef __vstring
+#define __vstring(item, fmt, ap) __dynamic_array(char, item,		\
+		      __trace_event_vstr_len(fmt, ap))
+
 #undef __rel_dynamic_array
 #define __rel_dynamic_array(type, item, len)				\
 	__item_length = (len) * sizeof(type);				\
diff --git a/include/trace/stages/stage6_event_callback.h b/include/trace/stages/stage6_event_callback.h
index e1724f73594b..0f51f6b3ab70 100644
--- a/include/trace/stages/stage6_event_callback.h
+++ b/include/trace/stages/stage6_event_callback.h
@@ -24,6 +24,9 @@
 #undef __string_len
 #define __string_len(item, src, len) __dynamic_array(char, item, -1)
 
+#undef __vstring
+#define __vstring(item, fmt, ap) __dynamic_array(char, item, -1)
+
 #undef __assign_str
 #define __assign_str(dst, src)						\
 	strcpy(__get_str(dst), (src) ? (const char *)(src) : "(null)");
@@ -35,6 +38,10 @@
 		__get_str(dst)[len] = '\0';				\
 	} while(0)
 
+#undef __assign_vstr
+#define __assign_vstr(dst, fmt, va)					\
+	vsnprintf(__get_str(dst), TRACE_EVENT_STR_MAX, fmt, *(va))
+
 #undef __bitmask
 #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1)
 
-- 
cgit 


From 11052589cf5c0bab3b4884d423d5f60c38fcf25d Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Wed, 13 Jul 2022 10:52:07 -0700
Subject: tcp/udp: Make early_demux back namespacified.

Commit e21145a9871a ("ipv4: namespacify ip_early_demux sysctl knob") made
it possible to enable/disable early_demux on a per-netns basis.  Then, we
introduced two knobs, tcp_early_demux and udp_early_demux, to switch it for
TCP/UDP in commit dddb64bcb346 ("net: Add sysctl to toggle early demux for
tcp and udp").  However, the .proc_handler() was wrong and actually
disabled us from changing the behaviour in each netns.

We can execute early_demux if net.ipv4.ip_early_demux is on and each proto
.early_demux() handler is not NULL.  When we toggle (tcp|udp)_early_demux,
the change itself is saved in each netns variable, but the .early_demux()
handler is a global variable, so the handler is switched based on the
init_net's sysctl variable.  Thus, netns (tcp|udp)_early_demux knobs have
nothing to do with the logic.  Whether we CAN execute proto .early_demux()
is always decided by init_net's sysctl knob, and whether we DO it or not is
by each netns ip_early_demux knob.

This patch namespacifies (tcp|udp)_early_demux again.  For now, the users
of the .early_demux() handler are TCP and UDP only, and they are called
directly to avoid retpoline.  So, we can remove the .early_demux() handler
from inet6?_protos and need not dereference them in ip6?_rcv_finish_core().
If another proto needs .early_demux(), we can restore it at that time.

Fixes: dddb64bcb346 ("net: Add sysctl to toggle early demux for tcp and udp")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/20220713175207.7727-1-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/protocol.h     |  4 ----
 include/net/tcp.h          |  2 +-
 include/net/udp.h          |  2 +-
 net/ipv4/af_inet.c         | 14 ++---------
 net/ipv4/ip_input.c        | 37 +++++++++++++++++------------
 net/ipv4/sysctl_net_ipv4.c | 59 ++--------------------------------------------
 net/ipv6/ip6_input.c       | 23 ++++++++++--------
 net/ipv6/tcp_ipv6.c        |  9 ++-----
 net/ipv6/udp.c             |  9 ++-----
 9 files changed, 45 insertions(+), 114 deletions(-)

(limited to 'include')

diff --git a/include/net/protocol.h b/include/net/protocol.h
index f51c06ae365f..6aef8cb11cc8 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -35,8 +35,6 @@
 
 /* This is used to register protocols. */
 struct net_protocol {
-	int			(*early_demux)(struct sk_buff *skb);
-	int			(*early_demux_handler)(struct sk_buff *skb);
 	int			(*handler)(struct sk_buff *skb);
 
 	/* This returns an error if we weren't able to handle the error. */
@@ -52,8 +50,6 @@ struct net_protocol {
 
 #if IS_ENABLED(CONFIG_IPV6)
 struct inet6_protocol {
-	void	(*early_demux)(struct sk_buff *skb);
-	void    (*early_demux_handler)(struct sk_buff *skb);
 	int	(*handler)(struct sk_buff *skb);
 
 	/* This returns an error if we weren't able to handle the error. */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 1e99f5c61f84..1636c55e798b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -932,7 +932,7 @@ extern const struct inet_connection_sock_af_ops ipv6_specific;
 
 INDIRECT_CALLABLE_DECLARE(void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb));
 INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *skb));
-INDIRECT_CALLABLE_DECLARE(void tcp_v6_early_demux(struct sk_buff *skb));
+void tcp_v6_early_demux(struct sk_buff *skb);
 
 #endif
 
diff --git a/include/net/udp.h b/include/net/udp.h
index b83a00330566..bb4c227299cc 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -167,7 +167,7 @@ static inline void udp_csum_pull_header(struct sk_buff *skb)
 typedef struct sock *(*udp_lookup_t)(const struct sk_buff *skb, __be16 sport,
 				     __be16 dport);
 
-INDIRECT_CALLABLE_DECLARE(void udp_v6_early_demux(struct sk_buff *));
+void udp_v6_early_demux(struct sk_buff *skb);
 INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *));
 
 struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 4bc24f9e38b3..8baef2f3fc4b 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1710,24 +1710,14 @@ static const struct net_protocol igmp_protocol = {
 };
 #endif
 
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct net_protocol tcp_protocol = {
-	.early_demux	=	tcp_v4_early_demux,
-	.early_demux_handler =  tcp_v4_early_demux,
+static const struct net_protocol tcp_protocol = {
 	.handler	=	tcp_v4_rcv,
 	.err_handler	=	tcp_v4_err,
 	.no_policy	=	1,
 	.icmp_strict_tag_validation = 1,
 };
 
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct net_protocol udp_protocol = {
-	.early_demux =	udp_v4_early_demux,
-	.early_demux_handler =	udp_v4_early_demux,
+static const struct net_protocol udp_protocol = {
 	.handler =	udp_rcv,
 	.err_handler =	udp_err,
 	.no_policy =	1,
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index b1165f717cd1..1b512390b3cf 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -312,14 +312,13 @@ static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph,
 	       ip_hdr(hint)->tos == iph->tos;
 }
 
-INDIRECT_CALLABLE_DECLARE(int udp_v4_early_demux(struct sk_buff *));
-INDIRECT_CALLABLE_DECLARE(int tcp_v4_early_demux(struct sk_buff *));
+int tcp_v4_early_demux(struct sk_buff *skb);
+int udp_v4_early_demux(struct sk_buff *skb);
 static int ip_rcv_finish_core(struct net *net, struct sock *sk,
 			      struct sk_buff *skb, struct net_device *dev,
 			      const struct sk_buff *hint)
 {
 	const struct iphdr *iph = ip_hdr(skb);
-	int (*edemux)(struct sk_buff *skb);
 	int err, drop_reason;
 	struct rtable *rt;
 
@@ -332,21 +331,29 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk,
 			goto drop_error;
 	}
 
-	if (net->ipv4.sysctl_ip_early_demux &&
+	if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
 	    !skb_dst(skb) &&
 	    !skb->sk &&
 	    !ip_is_fragment(iph)) {
-		const struct net_protocol *ipprot;
-		int protocol = iph->protocol;
-
-		ipprot = rcu_dereference(inet_protos[protocol]);
-		if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
-			err = INDIRECT_CALL_2(edemux, tcp_v4_early_demux,
-					      udp_v4_early_demux, skb);
-			if (unlikely(err))
-				goto drop_error;
-			/* must reload iph, skb->head might have changed */
-			iph = ip_hdr(skb);
+		switch (iph->protocol) {
+		case IPPROTO_TCP:
+			if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) {
+				tcp_v4_early_demux(skb);
+
+				/* must reload iph, skb->head might have changed */
+				iph = ip_hdr(skb);
+			}
+			break;
+		case IPPROTO_UDP:
+			if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) {
+				err = udp_v4_early_demux(skb);
+				if (unlikely(err))
+					goto drop_error;
+
+				/* must reload iph, skb->head might have changed */
+				iph = ip_hdr(skb);
+			}
+			break;
 		}
 	}
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 108fd86f2718..130e9c130311 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -350,61 +350,6 @@ bad_key:
 	return ret;
 }
 
-static void proc_configure_early_demux(int enabled, int protocol)
-{
-	struct net_protocol *ipprot;
-#if IS_ENABLED(CONFIG_IPV6)
-	struct inet6_protocol *ip6prot;
-#endif
-
-	rcu_read_lock();
-
-	ipprot = rcu_dereference(inet_protos[protocol]);
-	if (ipprot)
-		ipprot->early_demux = enabled ? ipprot->early_demux_handler :
-						NULL;
-
-#if IS_ENABLED(CONFIG_IPV6)
-	ip6prot = rcu_dereference(inet6_protos[protocol]);
-	if (ip6prot)
-		ip6prot->early_demux = enabled ? ip6prot->early_demux_handler :
-						 NULL;
-#endif
-	rcu_read_unlock();
-}
-
-static int proc_tcp_early_demux(struct ctl_table *table, int write,
-				void *buffer, size_t *lenp, loff_t *ppos)
-{
-	int ret = 0;
-
-	ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos);
-
-	if (write && !ret) {
-		int enabled = init_net.ipv4.sysctl_tcp_early_demux;
-
-		proc_configure_early_demux(enabled, IPPROTO_TCP);
-	}
-
-	return ret;
-}
-
-static int proc_udp_early_demux(struct ctl_table *table, int write,
-				void *buffer, size_t *lenp, loff_t *ppos)
-{
-	int ret = 0;
-
-	ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos);
-
-	if (write && !ret) {
-		int enabled = init_net.ipv4.sysctl_udp_early_demux;
-
-		proc_configure_early_demux(enabled, IPPROTO_UDP);
-	}
-
-	return ret;
-}
-
 static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table,
 					     int write, void *buffer,
 					     size_t *lenp, loff_t *ppos)
@@ -707,14 +652,14 @@ static struct ctl_table ipv4_net_table[] = {
 		.data           = &init_net.ipv4.sysctl_udp_early_demux,
 		.maxlen         = sizeof(u8),
 		.mode           = 0644,
-		.proc_handler   = proc_udp_early_demux
+		.proc_handler   = proc_dou8vec_minmax,
 	},
 	{
 		.procname       = "tcp_early_demux",
 		.data           = &init_net.ipv4.sysctl_tcp_early_demux,
 		.maxlen         = sizeof(u8),
 		.mode           = 0644,
-		.proc_handler   = proc_tcp_early_demux
+		.proc_handler   = proc_dou8vec_minmax,
 	},
 	{
 		.procname       = "nexthop_compat_mode",
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 0322cc86b84e..e1ebf5e42ebe 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -45,20 +45,23 @@
 #include <net/inet_ecn.h>
 #include <net/dst_metadata.h>
 
-INDIRECT_CALLABLE_DECLARE(void tcp_v6_early_demux(struct sk_buff *));
 static void ip6_rcv_finish_core(struct net *net, struct sock *sk,
 				struct sk_buff *skb)
 {
-	void (*edemux)(struct sk_buff *skb);
-
-	if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
-		const struct inet6_protocol *ipprot;
-
-		ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]);
-		if (ipprot && (edemux = READ_ONCE(ipprot->early_demux)))
-			INDIRECT_CALL_2(edemux, tcp_v6_early_demux,
-					udp_v6_early_demux, skb);
+	if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
+	    !skb_dst(skb) && !skb->sk) {
+		switch (ipv6_hdr(skb)->nexthdr) {
+		case IPPROTO_TCP:
+			if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux))
+				tcp_v6_early_demux(skb);
+			break;
+		case IPPROTO_UDP:
+			if (READ_ONCE(net->ipv4.sysctl_udp_early_demux))
+				udp_v6_early_demux(skb);
+			break;
+		}
 	}
+
 	if (!skb_valid_dst(skb))
 		ip6_route_input(skb);
 }
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index f37dd4aa91c6..9d3ede293258 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1822,7 +1822,7 @@ do_time_wait:
 	goto discard_it;
 }
 
-INDIRECT_CALLABLE_SCOPE void tcp_v6_early_demux(struct sk_buff *skb)
+void tcp_v6_early_demux(struct sk_buff *skb)
 {
 	const struct ipv6hdr *hdr;
 	const struct tcphdr *th;
@@ -2176,12 +2176,7 @@ struct proto tcpv6_prot = {
 };
 EXPORT_SYMBOL_GPL(tcpv6_prot);
 
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct inet6_protocol tcpv6_protocol = {
-	.early_demux	=	tcp_v6_early_demux,
-	.early_demux_handler =  tcp_v6_early_demux,
+static const struct inet6_protocol tcpv6_protocol = {
 	.handler	=	tcp_v6_rcv,
 	.err_handler	=	tcp_v6_err,
 	.flags		=	INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 55afd7f39c04..e2f2e087a753 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1052,7 +1052,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net,
 	return NULL;
 }
 
-INDIRECT_CALLABLE_SCOPE void udp_v6_early_demux(struct sk_buff *skb)
+void udp_v6_early_demux(struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
 	const struct udphdr *uh;
@@ -1660,12 +1660,7 @@ int udpv6_getsockopt(struct sock *sk, int level, int optname,
 	return ipv6_getsockopt(sk, level, optname, optval, optlen);
 }
 
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct inet6_protocol udpv6_protocol = {
-	.early_demux	=	udp_v6_early_demux,
-	.early_demux_handler =  udp_v6_early_demux,
+static const struct inet6_protocol udpv6_protocol = {
 	.handler	=	udpv6_rcv,
 	.err_handler	=	udpv6_err,
 	.flags		=	INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
-- 
cgit 


From e68c5dcf0aacc48a23cedcb3ce81b8c60837f48c Mon Sep 17 00:00:00 2001
From: Jaehee Park <jhpark1013@gmail.com>
Date: Wed, 13 Jul 2022 16:40:47 -0700
Subject: net: ipv4: new arp_accept option to accept garp only if in-network

In many deployments, we want the option to not learn a neighbor from
garp if the src ip is not in the same subnet as an address configured
on the interface that received the garp message. net.ipv4.arp_accept
sysctl is currently used to control creation of a neigh from a
received garp packet. This patch adds a new option '2' to
net.ipv4.arp_accept which extends option '1' by including the subnet
check.

Signed-off-by: Jaehee Park <jhpark1013@gmail.com>
Suggested-by: Roopa Prabhu <roopa@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/ip-sysctl.rst |  9 ++++++---
 include/linux/inetdevice.h             |  2 +-
 net/ipv4/arp.c                         | 24 ++++++++++++++++++++++--
 3 files changed, 29 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 2b329042b38c..b31601405c54 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -1633,12 +1633,15 @@ arp_notify - BOOLEAN
 	     or hardware address changes.
 	 ==  ==========================================================
 
-arp_accept - BOOLEAN
-	Define behavior for gratuitous ARP frames who's IP is not
-	already present in the ARP table:
+arp_accept - INTEGER
+	Define behavior for accepting gratuitous ARP (garp) frames from devices
+	that are not already present in the ARP table:
 
 	- 0 - don't create new entries in the ARP table
 	- 1 - create new entries in the ARP table
+	- 2 - create new entries only if the source IP address is in the same
+	  subnet as an address configured on the interface that received the
+	  garp message.
 
 	Both replies and requests type gratuitous arp will trigger the
 	ARP table to be updated, if this setting is on.
diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index ead323243e7b..ddb27fc0ee8c 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -131,7 +131,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
 	IN_DEV_ORCONF((in_dev), IGNORE_ROUTES_WITH_LINKDOWN)
 
 #define IN_DEV_ARPFILTER(in_dev)	IN_DEV_ORCONF((in_dev), ARPFILTER)
-#define IN_DEV_ARP_ACCEPT(in_dev)	IN_DEV_ORCONF((in_dev), ARP_ACCEPT)
+#define IN_DEV_ARP_ACCEPT(in_dev)	IN_DEV_MAXCONF((in_dev), ARP_ACCEPT)
 #define IN_DEV_ARP_ANNOUNCE(in_dev)	IN_DEV_MAXCONF((in_dev), ARP_ANNOUNCE)
 #define IN_DEV_ARP_IGNORE(in_dev)	IN_DEV_MAXCONF((in_dev), ARP_IGNORE)
 #define IN_DEV_ARP_NOTIFY(in_dev)	IN_DEV_MAXCONF((in_dev), ARP_NOTIFY)
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index af2f12ffc9ca..87c7e3fc5197 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -429,6 +429,26 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
 	return !inet_confirm_addr(net, in_dev, sip, tip, scope);
 }
 
+static int arp_accept(struct in_device *in_dev, __be32 sip)
+{
+	struct net *net = dev_net(in_dev->dev);
+	int scope = RT_SCOPE_LINK;
+
+	switch (IN_DEV_ARP_ACCEPT(in_dev)) {
+	case 0: /* Don't create new entries from garp */
+		return 0;
+	case 1: /* Create new entries from garp */
+		return 1;
+	case 2: /* Create a neighbor in the arp table only if sip
+		 * is in the same subnet as an address configured
+		 * on the interface that received the garp message
+		 */
+		return !!inet_confirm_addr(net, in_dev, sip, 0, scope);
+	default:
+		return 0;
+	}
+}
+
 static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
 {
 	struct rtable *rt;
@@ -868,12 +888,12 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
 	n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
 
 	addr_type = -1;
-	if (n || IN_DEV_ARP_ACCEPT(in_dev)) {
+	if (n || arp_accept(in_dev, sip)) {
 		is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op,
 				      sip, tip, sha, tha);
 	}
 
-	if (IN_DEV_ARP_ACCEPT(in_dev)) {
+	if (arp_accept(in_dev, sip)) {
 		/* Unsolicited ARP is not accepted by default.
 		   It is possible, that this option should be enabled for some
 		   devices (strip is candidate)
-- 
cgit 


From ceefa81e6e69b020997205e5c30a42d43aa5ae63 Mon Sep 17 00:00:00 2001
From: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Date: Fri, 15 Jul 2022 16:03:22 +0200
Subject: serial: remove VR41XX serial driver

Commit d3164e2f3b0a ("MIPS: Remove VR41xx support") removed support
for MIPS VR41xx platform, so remove exclusive drivers for this
platform, too.

Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Link: https://lore.kernel.org/r/20220715140322.135825-1-tsbogend@alpha.franken.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/Kconfig       |  17 -
 drivers/tty/serial/Makefile      |   1 -
 drivers/tty/serial/vr41xx_siu.c  | 932 ---------------------------------------
 include/uapi/linux/serial_core.h |   4 -
 4 files changed, 954 deletions(-)
 delete mode 100644 drivers/tty/serial/vr41xx_siu.c

(limited to 'include')

diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig
index 8a3ee1525d80..f92963a2226b 100644
--- a/drivers/tty/serial/Kconfig
+++ b/drivers/tty/serial/Kconfig
@@ -890,23 +890,6 @@ config SERIAL_TXX9_STDSERIAL
 	bool "TX39XX/49XX SIO act as standard serial"
 	depends on !SERIAL_8250 && SERIAL_TXX9
 
-config SERIAL_VR41XX
-	tristate "NEC VR4100 series Serial Interface Unit support"
-	depends on CPU_VR41XX
-	select SERIAL_CORE
-	help
-	  If you have a NEC VR4100 series processor and you want to use
-	  Serial Interface Unit(SIU) or Debug Serial Interface Unit(DSIU)
-	  (not include VR4111/VR4121 DSIU), say Y.  Otherwise, say N.
-
-config SERIAL_VR41XX_CONSOLE
-	bool "Enable NEC VR4100 series Serial Interface Unit console"
-	depends on SERIAL_VR41XX=y
-	select SERIAL_CORE_CONSOLE
-	help
-	  If you have a NEC VR4100 series processor and you want to use
-	  a console on a serial port, say Y.  Otherwise, say N.
-
 config SERIAL_JSM
 	tristate "Digi International NEO and Classic PCI Support"
 	depends on PCI
diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile
index 61cc8de95571..238a9557b487 100644
--- a/drivers/tty/serial/Makefile
+++ b/drivers/tty/serial/Makefile
@@ -51,7 +51,6 @@ obj-$(CONFIG_SERIAL_SCCNXP) += sccnxp.o
 obj-$(CONFIG_SERIAL_SC16IS7XX_CORE) += sc16is7xx.o
 obj-$(CONFIG_SERIAL_JSM) += jsm/
 obj-$(CONFIG_SERIAL_TXX9) += serial_txx9.o
-obj-$(CONFIG_SERIAL_VR41XX) += vr41xx_siu.o
 obj-$(CONFIG_SERIAL_ATMEL) += atmel_serial.o
 obj-$(CONFIG_SERIAL_UARTLITE) += uartlite.o
 obj-$(CONFIG_SERIAL_MSM) += msm_serial.o
diff --git a/drivers/tty/serial/vr41xx_siu.c b/drivers/tty/serial/vr41xx_siu.c
deleted file mode 100644
index 1ba689a81abd..000000000000
--- a/drivers/tty/serial/vr41xx_siu.c
+++ /dev/null
@@ -1,932 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- *  Driver for NEC VR4100 series Serial Interface Unit.
- *
- *  Copyright (C) 2004-2008  Yoichi Yuasa <yuasa@linux-mips.org>
- *
- *  Based on drivers/serial/8250.c, by Russell King.
- */
-
-#include <linux/console.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/ioport.h>
-#include <linux/module.h>
-#include <linux/platform_device.h>
-#include <linux/serial.h>
-#include <linux/serial_core.h>
-#include <linux/serial_reg.h>
-#include <linux/tty.h>
-#include <linux/tty_flip.h>
-
-#include <linux/io.h>
-#include <asm/vr41xx/siu.h>
-#include <asm/vr41xx/vr41xx.h>
-
-#define SIU_BAUD_BASE	1152000
-#define SIU_MAJOR	204
-#define SIU_MINOR_BASE	82
-
-#define RX_MAX_COUNT	256
-#define TX_MAX_COUNT	15
-
-#define SIUIRSEL	0x08
- #define TMICMODE	0x20
- #define TMICTX		0x10
- #define IRMSEL		0x0c
- #define IRMSEL_HP	0x08
- #define IRMSEL_TEMIC	0x04
- #define IRMSEL_SHARP	0x00
- #define IRUSESEL	0x02
- #define SIRSEL		0x01
-
-static struct uart_port siu_uart_ports[SIU_PORTS_MAX] = {
-	[0 ... SIU_PORTS_MAX-1] = {
-		.lock	= __SPIN_LOCK_UNLOCKED(siu_uart_ports->lock),
-		.irq	= 0,
-	},
-};
-
-#ifdef CONFIG_SERIAL_VR41XX_CONSOLE
-static uint8_t lsr_break_flag[SIU_PORTS_MAX];
-#endif
-
-#define siu_read(port, offset)		readb((port)->membase + (offset))
-#define siu_write(port, offset, value)	writeb((value), (port)->membase + (offset))
-
-void vr41xx_select_siu_interface(siu_interface_t interface)
-{
-	struct uart_port *port;
-	unsigned long flags;
-	uint8_t irsel;
-
-	port = &siu_uart_ports[0];
-
-	spin_lock_irqsave(&port->lock, flags);
-
-	irsel = siu_read(port, SIUIRSEL);
-	if (interface == SIU_INTERFACE_IRDA)
-		irsel |= SIRSEL;
-	else
-		irsel &= ~SIRSEL;
-	siu_write(port, SIUIRSEL, irsel);
-
-	spin_unlock_irqrestore(&port->lock, flags);
-}
-EXPORT_SYMBOL_GPL(vr41xx_select_siu_interface);
-
-void vr41xx_use_irda(irda_use_t use)
-{
-	struct uart_port *port;
-	unsigned long flags;
-	uint8_t irsel;
-
-	port = &siu_uart_ports[0];
-
-	spin_lock_irqsave(&port->lock, flags);
-
-	irsel = siu_read(port, SIUIRSEL);
-	if (use == FIR_USE_IRDA)
-		irsel |= IRUSESEL;
-	else
-		irsel &= ~IRUSESEL;
-	siu_write(port, SIUIRSEL, irsel);
-
-	spin_unlock_irqrestore(&port->lock, flags);
-}
-EXPORT_SYMBOL_GPL(vr41xx_use_irda);
-
-void vr41xx_select_irda_module(irda_module_t module, irda_speed_t speed)
-{
-	struct uart_port *port;
-	unsigned long flags;
-	uint8_t irsel;
-
-	port = &siu_uart_ports[0];
-
-	spin_lock_irqsave(&port->lock, flags);
-
-	irsel = siu_read(port, SIUIRSEL);
-	irsel &= ~(IRMSEL | TMICTX | TMICMODE);
-	switch (module) {
-	case SHARP_IRDA:
-		irsel |= IRMSEL_SHARP;
-		break;
-	case TEMIC_IRDA:
-		irsel |= IRMSEL_TEMIC | TMICMODE;
-		if (speed == IRDA_TX_4MBPS)
-			irsel |= TMICTX;
-		break;
-	case HP_IRDA:
-		irsel |= IRMSEL_HP;
-		break;
-	default:
-		break;
-	}
-	siu_write(port, SIUIRSEL, irsel);
-
-	spin_unlock_irqrestore(&port->lock, flags);
-}
-EXPORT_SYMBOL_GPL(vr41xx_select_irda_module);
-
-static inline void siu_clear_fifo(struct uart_port *port)
-{
-	siu_write(port, UART_FCR, UART_FCR_ENABLE_FIFO);
-	siu_write(port, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR |
-	                          UART_FCR_CLEAR_XMIT);
-	siu_write(port, UART_FCR, 0);
-}
-
-static inline unsigned long siu_port_size(struct uart_port *port)
-{
-	switch (port->type) {
-	case PORT_VR41XX_SIU:
-		return 11UL;
-	case PORT_VR41XX_DSIU:
-		return 8UL;
-	}
-
-	return 0;
-}
-
-static inline unsigned int siu_check_type(struct uart_port *port)
-{
-	if (port->line == 0)
-		return PORT_VR41XX_SIU;
-	if (port->line == 1 && port->irq)
-		return PORT_VR41XX_DSIU;
-
-	return PORT_UNKNOWN;
-}
-
-static inline const char *siu_type_name(struct uart_port *port)
-{
-	switch (port->type) {
-	case PORT_VR41XX_SIU:
-		return "SIU";
-	case PORT_VR41XX_DSIU:
-		return "DSIU";
-	}
-
-	return NULL;
-}
-
-static unsigned int siu_tx_empty(struct uart_port *port)
-{
-	uint8_t lsr;
-
-	lsr = siu_read(port, UART_LSR);
-	if (lsr & UART_LSR_TEMT)
-		return TIOCSER_TEMT;
-
-	return 0;
-}
-
-static void siu_set_mctrl(struct uart_port *port, unsigned int mctrl)
-{
-	uint8_t mcr = 0;
-
-	if (mctrl & TIOCM_DTR)
-		mcr |= UART_MCR_DTR;
-	if (mctrl & TIOCM_RTS)
-		mcr |= UART_MCR_RTS;
-	if (mctrl & TIOCM_OUT1)
-		mcr |= UART_MCR_OUT1;
-	if (mctrl & TIOCM_OUT2)
-		mcr |= UART_MCR_OUT2;
-	if (mctrl & TIOCM_LOOP)
-		mcr |= UART_MCR_LOOP;
-
-	siu_write(port, UART_MCR, mcr);
-}
-
-static unsigned int siu_get_mctrl(struct uart_port *port)
-{
-	uint8_t msr;
-	unsigned int mctrl = 0;
-
-	msr = siu_read(port, UART_MSR);
-	if (msr & UART_MSR_DCD)
-		mctrl |= TIOCM_CAR;
-	if (msr & UART_MSR_RI)
-		mctrl |= TIOCM_RNG;
-	if (msr & UART_MSR_DSR)
-		mctrl |= TIOCM_DSR;
-	if (msr & UART_MSR_CTS)
-		mctrl |= TIOCM_CTS;
-
-	return mctrl;
-}
-
-static void siu_stop_tx(struct uart_port *port)
-{
-	unsigned long flags;
-	uint8_t ier;
-
-	spin_lock_irqsave(&port->lock, flags);
-
-	ier = siu_read(port, UART_IER);
-	ier &= ~UART_IER_THRI;
-	siu_write(port, UART_IER, ier);
-
-	spin_unlock_irqrestore(&port->lock, flags);
-}
-
-static void siu_start_tx(struct uart_port *port)
-{
-	unsigned long flags;
-	uint8_t ier;
-
-	spin_lock_irqsave(&port->lock, flags);
-
-	ier = siu_read(port, UART_IER);
-	ier |= UART_IER_THRI;
-	siu_write(port, UART_IER, ier);
-
-	spin_unlock_irqrestore(&port->lock, flags);
-}
-
-static void siu_stop_rx(struct uart_port *port)
-{
-	unsigned long flags;
-	uint8_t ier;
-
-	spin_lock_irqsave(&port->lock, flags);
-
-	ier = siu_read(port, UART_IER);
-	ier &= ~UART_IER_RLSI;
-	siu_write(port, UART_IER, ier);
-
-	port->read_status_mask &= ~UART_LSR_DR;
-
-	spin_unlock_irqrestore(&port->lock, flags);
-}
-
-static void siu_enable_ms(struct uart_port *port)
-{
-	unsigned long flags;
-	uint8_t ier;
-
-	spin_lock_irqsave(&port->lock, flags);
-
-	ier = siu_read(port, UART_IER);
-	ier |= UART_IER_MSI;
-	siu_write(port, UART_IER, ier);
-
-	spin_unlock_irqrestore(&port->lock, flags);
-}
-
-static void siu_break_ctl(struct uart_port *port, int ctl)
-{
-	unsigned long flags;
-	uint8_t lcr;
-
-	spin_lock_irqsave(&port->lock, flags);
-
-	lcr = siu_read(port, UART_LCR);
-	if (ctl == -1)
-		lcr |= UART_LCR_SBC;
-	else
-		lcr &= ~UART_LCR_SBC;
-	siu_write(port, UART_LCR, lcr);
-
-	spin_unlock_irqrestore(&port->lock, flags);
-}
-
-static inline void receive_chars(struct uart_port *port, uint8_t *status)
-{
-	uint8_t lsr, ch;
-	char flag;
-	int max_count = RX_MAX_COUNT;
-
-	lsr = *status;
-
-	do {
-		ch = siu_read(port, UART_RX);
-		port->icount.rx++;
-		flag = TTY_NORMAL;
-
-#ifdef CONFIG_SERIAL_VR41XX_CONSOLE
-		lsr |= lsr_break_flag[port->line];
-		lsr_break_flag[port->line] = 0;
-#endif
-		if (unlikely(lsr & (UART_LSR_BI | UART_LSR_FE |
-		                    UART_LSR_PE | UART_LSR_OE))) {
-			if (lsr & UART_LSR_BI) {
-				lsr &= ~(UART_LSR_FE | UART_LSR_PE);
-				port->icount.brk++;
-
-				if (uart_handle_break(port))
-					goto ignore_char;
-			}
-
-			if (lsr & UART_LSR_FE)
-				port->icount.frame++;
-			if (lsr & UART_LSR_PE)
-				port->icount.parity++;
-			if (lsr & UART_LSR_OE)
-				port->icount.overrun++;
-
-			lsr &= port->read_status_mask;
-			if (lsr & UART_LSR_BI)
-				flag = TTY_BREAK;
-			if (lsr & UART_LSR_FE)
-				flag = TTY_FRAME;
-			if (lsr & UART_LSR_PE)
-				flag = TTY_PARITY;
-		}
-
-		if (uart_handle_sysrq_char(port, ch))
-			goto ignore_char;
-
-		uart_insert_char(port, lsr, UART_LSR_OE, ch, flag);
-
-	ignore_char:
-		lsr = siu_read(port, UART_LSR);
-	} while ((lsr & UART_LSR_DR) && (max_count-- > 0));
-
-	tty_flip_buffer_push(&port->state->port);
-
-	*status = lsr;
-}
-
-static inline void check_modem_status(struct uart_port *port)
-{
-	uint8_t msr;
-
-	msr = siu_read(port, UART_MSR);
-	if ((msr & UART_MSR_ANY_DELTA) == 0)
-		return;
-	if (msr & UART_MSR_DDCD)
-		uart_handle_dcd_change(port, msr & UART_MSR_DCD);
-	if (msr & UART_MSR_TERI)
-		port->icount.rng++;
-	if (msr & UART_MSR_DDSR)
-		port->icount.dsr++;
-	if (msr & UART_MSR_DCTS)
-		uart_handle_cts_change(port, msr & UART_MSR_CTS);
-
-	wake_up_interruptible(&port->state->port.delta_msr_wait);
-}
-
-static inline void transmit_chars(struct uart_port *port)
-{
-	struct circ_buf *xmit;
-	int max_count = TX_MAX_COUNT;
-
-	xmit = &port->state->xmit;
-
-	if (port->x_char) {
-		siu_write(port, UART_TX, port->x_char);
-		port->icount.tx++;
-		port->x_char = 0;
-		return;
-	}
-
-	if (uart_circ_empty(xmit) || uart_tx_stopped(port)) {
-		siu_stop_tx(port);
-		return;
-	}
-
-	do {
-		siu_write(port, UART_TX, xmit->buf[xmit->tail]);
-		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
-		port->icount.tx++;
-		if (uart_circ_empty(xmit))
-			break;
-	} while (max_count-- > 0);
-
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
-		uart_write_wakeup(port);
-
-	if (uart_circ_empty(xmit))
-		siu_stop_tx(port);
-}
-
-static irqreturn_t siu_interrupt(int irq, void *dev_id)
-{
-	struct uart_port *port;
-	uint8_t iir, lsr;
-
-	port = (struct uart_port *)dev_id;
-
-	iir = siu_read(port, UART_IIR);
-	if (iir & UART_IIR_NO_INT)
-		return IRQ_NONE;
-
-	lsr = siu_read(port, UART_LSR);
-	if (lsr & UART_LSR_DR)
-		receive_chars(port, &lsr);
-
-	check_modem_status(port);
-
-	if (lsr & UART_LSR_THRE)
-		transmit_chars(port);
-
-	return IRQ_HANDLED;
-}
-
-static int siu_startup(struct uart_port *port)
-{
-	int retval;
-
-	if (port->membase == NULL)
-		return -ENODEV;
-
-	siu_clear_fifo(port);
-
-	(void)siu_read(port, UART_LSR);
-	(void)siu_read(port, UART_RX);
-	(void)siu_read(port, UART_IIR);
-	(void)siu_read(port, UART_MSR);
-
-	if (siu_read(port, UART_LSR) == 0xff)
-		return -ENODEV;
-
-	retval = request_irq(port->irq, siu_interrupt, 0, siu_type_name(port), port);
-	if (retval)
-		return retval;
-
-	if (port->type == PORT_VR41XX_DSIU)
-		vr41xx_enable_dsiuint(DSIUINT_ALL);
-
-	siu_write(port, UART_LCR, UART_LCR_WLEN8);
-
-	spin_lock_irq(&port->lock);
-	siu_set_mctrl(port, port->mctrl);
-	spin_unlock_irq(&port->lock);
-
-	siu_write(port, UART_IER, UART_IER_RLSI | UART_IER_RDI);
-
-	(void)siu_read(port, UART_LSR);
-	(void)siu_read(port, UART_RX);
-	(void)siu_read(port, UART_IIR);
-	(void)siu_read(port, UART_MSR);
-
-	return 0;
-}
-
-static void siu_shutdown(struct uart_port *port)
-{
-	unsigned long flags;
-	uint8_t lcr;
-
-	siu_write(port, UART_IER, 0);
-
-	spin_lock_irqsave(&port->lock, flags);
-
-	port->mctrl &= ~TIOCM_OUT2;
-	siu_set_mctrl(port, port->mctrl);
-
-	spin_unlock_irqrestore(&port->lock, flags);
-
-	lcr = siu_read(port, UART_LCR);
-	lcr &= ~UART_LCR_SBC;
-	siu_write(port, UART_LCR, lcr);
-
-	siu_clear_fifo(port);
-
-	(void)siu_read(port, UART_RX);
-
-	if (port->type == PORT_VR41XX_DSIU)
-		vr41xx_disable_dsiuint(DSIUINT_ALL);
-
-	free_irq(port->irq, port);
-}
-
-static void siu_set_termios(struct uart_port *port, struct ktermios *new,
-                            struct ktermios *old)
-{
-	tcflag_t c_cflag, c_iflag;
-	uint8_t lcr, fcr, ier;
-	unsigned int baud, quot;
-	unsigned long flags;
-
-	c_cflag = new->c_cflag;
-	lcr = UART_LCR_WLEN(tty_get_char_size(c_cflag));
-
-	if (c_cflag & CSTOPB)
-		lcr |= UART_LCR_STOP;
-	if (c_cflag & PARENB)
-		lcr |= UART_LCR_PARITY;
-	if ((c_cflag & PARODD) != PARODD)
-		lcr |= UART_LCR_EPAR;
-	if (c_cflag & CMSPAR)
-		lcr |= UART_LCR_SPAR;
-
-	baud = uart_get_baud_rate(port, new, old, 0, port->uartclk/16);
-	quot = uart_get_divisor(port, baud);
-
-	fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10;
-
-	spin_lock_irqsave(&port->lock, flags);
-
-	uart_update_timeout(port, c_cflag, baud);
-
-	c_iflag = new->c_iflag;
-
-	port->read_status_mask = UART_LSR_THRE | UART_LSR_OE | UART_LSR_DR;
-	if (c_iflag & INPCK)
-		port->read_status_mask |= UART_LSR_FE | UART_LSR_PE;
-	if (c_iflag & (IGNBRK | BRKINT | PARMRK))
-		port->read_status_mask |= UART_LSR_BI;
-
-	port->ignore_status_mask = 0;
-	if (c_iflag & IGNPAR)
-		port->ignore_status_mask |= UART_LSR_FE | UART_LSR_PE;
-	if (c_iflag & IGNBRK) {
-		port->ignore_status_mask |= UART_LSR_BI;
-		if (c_iflag & IGNPAR)
-			port->ignore_status_mask |= UART_LSR_OE;
-	}
-
-	if ((c_cflag & CREAD) == 0)
-		port->ignore_status_mask |= UART_LSR_DR;
-
-	ier = siu_read(port, UART_IER);
-	ier &= ~UART_IER_MSI;
-	if (UART_ENABLE_MS(port, c_cflag))
-		ier |= UART_IER_MSI;
-	siu_write(port, UART_IER, ier);
-
-	siu_write(port, UART_LCR, lcr | UART_LCR_DLAB);
-
-	siu_write(port, UART_DLL, (uint8_t)quot);
-	siu_write(port, UART_DLM, (uint8_t)(quot >> 8));
-
-	siu_write(port, UART_LCR, lcr);
-
-	siu_write(port, UART_FCR, fcr);
-
-	siu_set_mctrl(port, port->mctrl);
-
-	spin_unlock_irqrestore(&port->lock, flags);
-}
-
-static void siu_pm(struct uart_port *port, unsigned int state, unsigned int oldstate)
-{
-	switch (state) {
-	case 0:
-		switch (port->type) {
-		case PORT_VR41XX_SIU:
-			vr41xx_supply_clock(SIU_CLOCK);
-			break;
-		case PORT_VR41XX_DSIU:
-			vr41xx_supply_clock(DSIU_CLOCK);
-			break;
-		}
-		break;
-	case 3:
-		switch (port->type) {
-		case PORT_VR41XX_SIU:
-			vr41xx_mask_clock(SIU_CLOCK);
-			break;
-		case PORT_VR41XX_DSIU:
-			vr41xx_mask_clock(DSIU_CLOCK);
-			break;
-		}
-		break;
-	}
-}
-
-static const char *siu_type(struct uart_port *port)
-{
-	return siu_type_name(port);
-}
-
-static void siu_release_port(struct uart_port *port)
-{
-	unsigned long size;
-
-	if (port->flags	& UPF_IOREMAP) {
-		iounmap(port->membase);
-		port->membase = NULL;
-	}
-
-	size = siu_port_size(port);
-	release_mem_region(port->mapbase, size);
-}
-
-static int siu_request_port(struct uart_port *port)
-{
-	unsigned long size;
-	struct resource *res;
-
-	size = siu_port_size(port);
-	res = request_mem_region(port->mapbase, size, siu_type_name(port));
-	if (res == NULL)
-		return -EBUSY;
-
-	if (port->flags & UPF_IOREMAP) {
-		port->membase = ioremap(port->mapbase, size);
-		if (port->membase == NULL) {
-			release_resource(res);
-			return -ENOMEM;
-		}
-	}
-
-	return 0;
-}
-
-static void siu_config_port(struct uart_port *port, int flags)
-{
-	if (flags & UART_CONFIG_TYPE) {
-		port->type = siu_check_type(port);
-		(void)siu_request_port(port);
-	}
-}
-
-static int siu_verify_port(struct uart_port *port, struct serial_struct *serial)
-{
-	if (port->type != PORT_VR41XX_SIU && port->type != PORT_VR41XX_DSIU)
-		return -EINVAL;
-	if (port->irq != serial->irq)
-		return -EINVAL;
-	if (port->iotype != serial->io_type)
-		return -EINVAL;
-	if (port->mapbase != (unsigned long)serial->iomem_base)
-		return -EINVAL;
-
-	return 0;
-}
-
-static const struct uart_ops siu_uart_ops = {
-	.tx_empty	= siu_tx_empty,
-	.set_mctrl	= siu_set_mctrl,
-	.get_mctrl	= siu_get_mctrl,
-	.stop_tx	= siu_stop_tx,
-	.start_tx	= siu_start_tx,
-	.stop_rx	= siu_stop_rx,
-	.enable_ms	= siu_enable_ms,
-	.break_ctl	= siu_break_ctl,
-	.startup	= siu_startup,
-	.shutdown	= siu_shutdown,
-	.set_termios	= siu_set_termios,
-	.pm		= siu_pm,
-	.type		= siu_type,
-	.release_port	= siu_release_port,
-	.request_port	= siu_request_port,
-	.config_port	= siu_config_port,
-	.verify_port	= siu_verify_port,
-};
-
-static int siu_init_ports(struct platform_device *pdev)
-{
-	struct uart_port *port;
-	struct resource *res;
-	int *type = dev_get_platdata(&pdev->dev);
-	int i;
-
-	if (!type)
-		return 0;
-
-	port = siu_uart_ports;
-	for (i = 0; i < SIU_PORTS_MAX; i++) {
-		port->type = type[i];
-		if (port->type == PORT_UNKNOWN)
-			continue;
-		port->irq = platform_get_irq(pdev, i);
-		port->uartclk = SIU_BAUD_BASE * 16;
-		port->fifosize = 16;
-		port->regshift = 0;
-		port->iotype = UPIO_MEM;
-		port->flags = UPF_IOREMAP | UPF_BOOT_AUTOCONF;
-		port->line = i;
-		res = platform_get_resource(pdev, IORESOURCE_MEM, i);
-		port->mapbase = res->start;
-		port++;
-	}
-
-	return i;
-}
-
-#ifdef CONFIG_SERIAL_VR41XX_CONSOLE
-
-static void wait_for_xmitr(struct uart_port *port)
-{
-	int timeout = 10000;
-	uint8_t lsr, msr;
-
-	do {
-		lsr = siu_read(port, UART_LSR);
-		if (lsr & UART_LSR_BI)
-			lsr_break_flag[port->line] = UART_LSR_BI;
-
-		if (uart_lsr_tx_empty(lsr))
-			break;
-	} while (timeout-- > 0);
-
-	if (port->flags & UPF_CONS_FLOW) {
-		timeout = 1000000;
-
-		do {
-			msr = siu_read(port, UART_MSR);
-			if ((msr & UART_MSR_CTS) != 0)
-				break;
-		} while (timeout-- > 0);
-	}
-}
-
-static void siu_console_putchar(struct uart_port *port, unsigned char ch)
-{
-	wait_for_xmitr(port);
-	siu_write(port, UART_TX, ch);
-}
-
-static void siu_console_write(struct console *con, const char *s, unsigned count)
-{
-	struct uart_port *port;
-	uint8_t ier;
-
-	port = &siu_uart_ports[con->index];
-
-	ier = siu_read(port, UART_IER);
-	siu_write(port, UART_IER, 0);
-
-	uart_console_write(port, s, count, siu_console_putchar);
-
-	wait_for_xmitr(port);
-	siu_write(port, UART_IER, ier);
-}
-
-static int __init siu_console_setup(struct console *con, char *options)
-{
-	struct uart_port *port;
-	int baud = 9600;
-	int parity = 'n';
-	int bits = 8;
-	int flow = 'n';
-
-	if (con->index >= SIU_PORTS_MAX)
-		con->index = 0;
-
-	port = &siu_uart_ports[con->index];
-	if (port->membase == NULL) {
-		if (port->mapbase == 0)
-			return -ENODEV;
-		port->membase = ioremap(port->mapbase, siu_port_size(port));
-	}
-
-	if (port->type == PORT_VR41XX_SIU)
-		vr41xx_select_siu_interface(SIU_INTERFACE_RS232C);
-
-	if (options != NULL)
-		uart_parse_options(options, &baud, &parity, &bits, &flow);
-
-	return uart_set_options(port, con, baud, parity, bits, flow);
-}
-
-static struct uart_driver siu_uart_driver;
-
-static struct console siu_console = {
-	.name	= "ttyVR",
-	.write	= siu_console_write,
-	.device	= uart_console_device,
-	.setup	= siu_console_setup,
-	.flags	= CON_PRINTBUFFER,
-	.index	= -1,
-	.data	= &siu_uart_driver,
-};
-
-static int siu_console_init(void)
-{
-	struct uart_port *port;
-	int i;
-
-	for (i = 0; i < SIU_PORTS_MAX; i++) {
-		port = &siu_uart_ports[i];
-		port->ops = &siu_uart_ops;
-	}
-
-	register_console(&siu_console);
-
-	return 0;
-}
-
-console_initcall(siu_console_init);
-
-void __init vr41xx_siu_early_setup(struct uart_port *port)
-{
-	if (port->type == PORT_UNKNOWN)
-		return;
-
-	siu_uart_ports[port->line].line = port->line;
-	siu_uart_ports[port->line].type = port->type;
-	siu_uart_ports[port->line].uartclk = SIU_BAUD_BASE * 16;
-	siu_uart_ports[port->line].mapbase = port->mapbase;
-	siu_uart_ports[port->line].ops = &siu_uart_ops;
-}
-
-#define SERIAL_VR41XX_CONSOLE	&siu_console
-#else
-#define SERIAL_VR41XX_CONSOLE	NULL
-#endif
-
-static struct uart_driver siu_uart_driver = {
-	.owner		= THIS_MODULE,
-	.driver_name	= "SIU",
-	.dev_name	= "ttyVR",
-	.major		= SIU_MAJOR,
-	.minor		= SIU_MINOR_BASE,
-	.cons		= SERIAL_VR41XX_CONSOLE,
-};
-
-static int siu_probe(struct platform_device *dev)
-{
-	struct uart_port *port;
-	int num, i, retval;
-
-	num = siu_init_ports(dev);
-	if (num <= 0)
-		return -ENODEV;
-
-	siu_uart_driver.nr = num;
-	retval = uart_register_driver(&siu_uart_driver);
-	if (retval)
-		return retval;
-
-	for (i = 0; i < num; i++) {
-		port = &siu_uart_ports[i];
-		port->ops = &siu_uart_ops;
-		port->dev = &dev->dev;
-		port->has_sysrq = IS_ENABLED(CONFIG_SERIAL_VR41XX_CONSOLE);
-
-		retval = uart_add_one_port(&siu_uart_driver, port);
-		if (retval < 0) {
-			port->dev = NULL;
-			break;
-		}
-	}
-
-	if (i == 0 && retval < 0) {
-		uart_unregister_driver(&siu_uart_driver);
-		return retval;
-	}
-
-	return 0;
-}
-
-static int siu_remove(struct platform_device *dev)
-{
-	struct uart_port *port;
-	int i;
-
-	for (i = 0; i < siu_uart_driver.nr; i++) {
-		port = &siu_uart_ports[i];
-		if (port->dev == &dev->dev) {
-			uart_remove_one_port(&siu_uart_driver, port);
-			port->dev = NULL;
-		}
-	}
-
-	uart_unregister_driver(&siu_uart_driver);
-
-	return 0;
-}
-
-static int siu_suspend(struct platform_device *dev, pm_message_t state)
-{
-	struct uart_port *port;
-	int i;
-
-	for (i = 0; i < siu_uart_driver.nr; i++) {
-		port = &siu_uart_ports[i];
-		if ((port->type == PORT_VR41XX_SIU ||
-		     port->type == PORT_VR41XX_DSIU) && port->dev == &dev->dev)
-			uart_suspend_port(&siu_uart_driver, port);
-
-	}
-
-	return 0;
-}
-
-static int siu_resume(struct platform_device *dev)
-{
-	struct uart_port *port;
-	int i;
-
-	for (i = 0; i < siu_uart_driver.nr; i++) {
-		port = &siu_uart_ports[i];
-		if ((port->type == PORT_VR41XX_SIU ||
-		     port->type == PORT_VR41XX_DSIU) && port->dev == &dev->dev)
-			uart_resume_port(&siu_uart_driver, port);
-	}
-
-	return 0;
-}
-
-static struct platform_driver siu_device_driver = {
-	.probe		= siu_probe,
-	.remove		= siu_remove,
-	.suspend	= siu_suspend,
-	.resume		= siu_resume,
-	.driver		= {
-		.name	= "SIU",
-	},
-};
-
-module_platform_driver(siu_device_driver);
-
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:SIU");
diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h
index 6faf502b7860..3ba34d8378bd 100644
--- a/include/uapi/linux/serial_core.h
+++ b/include/uapi/linux/serial_core.h
@@ -124,10 +124,6 @@
 /* TXX9 type number */
 #define PORT_TXX9	64
 
-/* NEC VR4100 series SIU/DSIU */
-#define PORT_VR41XX_SIU		65
-#define PORT_VR41XX_DSIU	66
-
 /* Samsung S3C2400 SoC */
 #define PORT_S3C2400	67
 
-- 
cgit 


From 868941b14441282ba08761b770fc6cad69d5bdb7 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Wed, 29 Jun 2022 15:07:00 +0200
Subject: fs: remove no_llseek

Now that all callers of ->llseek are going through vfs_llseek(), we
don't gain anything by keeping no_llseek around. Nothing actually calls
it and setting ->llseek to no_lseek is completely equivalent to
leaving it NULL.

Longer term (== by the end of merge window) we want to remove all such
intializations.  To simplify the merge window this commit does *not*
touch initializers - it only defines no_llseek as NULL (and simplifies
the tests on file opening).

At -rc1 we'll need do a mechanical removal of no_llseek -

git grep -l -w no_llseek | grep -v porting.rst | while read i; do
	sed -i '/\<no_llseek\>/d' $i
done
would do it.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting.rst | 8 ++++++++
 drivers/gpu/drm/drm_file.c            | 3 +--
 fs/file_table.c                       | 2 +-
 fs/open.c                             | 2 --
 fs/read_write.c                       | 6 ------
 include/linux/fs.h                    | 2 +-
 kernel/bpf/bpf_iter.c                 | 3 +--
 7 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 2e0e4f0e0c6f..aee9aaf9f3df 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -914,3 +914,11 @@ Calling conventions for file_open_root() changed; now it takes struct path *
 instead of passing mount and dentry separately.  For callers that used to
 pass <mnt, mnt->mnt_root> pair (i.e. the root of given mount), a new helper
 is provided - file_open_root_mnt().  In-tree users adjusted.
+
+---
+
+**mandatory**
+
+no_llseek is gone; don't set .llseek to that - just leave it NULL instead.
+Checks for "does that file have llseek(2), or should it fail with ESPIPE"
+should be done by looking at FMODE_LSEEK in file->f_mode.
diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c
index ed25168619fc..dc7d2e5b16c8 100644
--- a/drivers/gpu/drm/drm_file.c
+++ b/drivers/gpu/drm/drm_file.c
@@ -552,8 +552,7 @@ EXPORT_SYMBOL(drm_release_noglobal);
  * Since events are used by the KMS API for vblank and page flip completion this
  * means all modern display drivers must use it.
  *
- * @offset is ignored, DRM events are read like a pipe. Therefore drivers also
- * must set the &file_operation.llseek to no_llseek(). Polling support is
+ * @offset is ignored, DRM events are read like a pipe. Polling support is
  * provided by drm_poll().
  *
  * This function will only ever read a full event. Therefore userspace must
diff --git a/fs/file_table.c b/fs/file_table.c
index 0658b822beeb..5727a63a7b67 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -235,7 +235,7 @@ static struct file *alloc_file(const struct path *path, int flags,
 	file->f_mapping = path->dentry->d_inode->i_mapping;
 	file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
 	file->f_sb_err = file_sample_sb_err(file);
-	if (fop->llseek && fop->llseek != no_llseek)
+	if (fop->llseek)
 		file->f_mode |= FMODE_LSEEK;
 	if ((file->f_mode & FMODE_READ) &&
 	     likely(fop->read || fop->read_iter))
diff --git a/fs/open.c b/fs/open.c
index 4488bd77c390..07c332753a36 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -860,8 +860,6 @@ static int do_dentry_open(struct file *f,
 		f->f_mode |= FMODE_CAN_WRITE;
 	if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek)
 		f->f_mode &= ~FMODE_LSEEK;
-	if ((f->f_mode & FMODE_LSEEK) && f->f_op->llseek == no_llseek)
-		f->f_mode &= ~FMODE_LSEEK;
 	if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO)
 		f->f_mode |= FMODE_CAN_ODIRECT;
 
diff --git a/fs/read_write.c b/fs/read_write.c
index d94b6dbba6f9..6b2849b34781 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -227,12 +227,6 @@ loff_t noop_llseek(struct file *file, loff_t offset, int whence)
 }
 EXPORT_SYMBOL(noop_llseek);
 
-loff_t no_llseek(struct file *file, loff_t offset, int whence)
-{
-	return -ESPIPE;
-}
-EXPORT_SYMBOL(no_llseek);
-
 loff_t default_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct inode *inode = file_inode(file);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ad5e3520fae..294932167335 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3022,7 +3022,7 @@ extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 extern void
 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
 extern loff_t noop_llseek(struct file *file, loff_t offset, int whence);
-extern loff_t no_llseek(struct file *file, loff_t offset, int whence);
+#define no_llseek NULL
 extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize);
 extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence);
 extern loff_t generic_file_llseek_size(struct file *file, loff_t offset,
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index d5d96ceca105..8af0cbf9c0cd 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -81,10 +81,9 @@ static bool bpf_iter_support_resched(struct seq_file *seq)
 #define MAX_ITER_OBJECTS	1000000
 
 /* bpf_seq_read, a customized and simpler version for bpf iterator.
- * no_llseek is assumed for this file.
  * The following are differences from seq_read():
  *  . fixed buffer size (PAGE_SIZE)
- *  . assuming no_llseek
+ *  . assuming NULL ->llseek()
  *  . stop() may call bpf program, handling potential overflow there
  */
 static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
-- 
cgit 


From bc72d938c149197688ae3b3ecaa25d4aee8653cb Mon Sep 17 00:00:00 2001
From: Dmitry Rokosov <DDRokosov@sberdevices.ru>
Date: Wed, 1 Jun 2022 17:48:32 +0000
Subject: iio: trigger: move trig->owner init to trigger allocate() stage

To provide a new IIO trigger to the IIO core, usually driver executes the
following pipeline: allocate()/register()/get(). Before, IIO core assigned
trig->owner as a pointer to the module which registered this trigger at
the register() stage. But actually the trigger object is owned by the
module earlier, on the allocate() stage, when trigger object is
successfully allocated for the driver.

This patch moves trig->owner initialization from register()
stage of trigger initialization pipeline to allocate() stage to
eliminate all misunderstandings and time gaps between trigger object
creation and owner acquiring.

Signed-off-by: Dmitry Rokosov <ddrokosov@sberdevices.ru>
Link: https://lore.kernel.org/r/20220601174837.20292-1-ddrokosov@sberdevices.ru
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/industrialio-trigger.c | 46 +++++++++++++++++++++-----------------
 include/linux/iio/iio.h            |  9 +++++---
 include/linux/iio/trigger.h        | 21 +++++++++--------
 3 files changed, 41 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/drivers/iio/industrialio-trigger.c b/drivers/iio/industrialio-trigger.c
index 26d610d4cbb8..efc01584b3f9 100644
--- a/drivers/iio/industrialio-trigger.c
+++ b/drivers/iio/industrialio-trigger.c
@@ -63,13 +63,10 @@ ATTRIBUTE_GROUPS(iio_trig_dev);
 
 static struct iio_trigger *__iio_trigger_find_by_name(const char *name);
 
-int __iio_trigger_register(struct iio_trigger *trig_info,
-			   struct module *this_mod)
+int iio_trigger_register(struct iio_trigger *trig_info)
 {
 	int ret;
 
-	trig_info->owner = this_mod;
-
 	trig_info->id = ida_alloc(&iio_trigger_ida, GFP_KERNEL);
 	if (trig_info->id < 0)
 		return trig_info->id;
@@ -100,7 +97,7 @@ error_unregister_id:
 	ida_free(&iio_trigger_ida, trig_info->id);
 	return ret;
 }
-EXPORT_SYMBOL(__iio_trigger_register);
+EXPORT_SYMBOL(iio_trigger_register);
 
 void iio_trigger_unregister(struct iio_trigger *trig_info)
 {
@@ -547,8 +544,9 @@ static void iio_trig_subirqunmask(struct irq_data *d)
 	trig->subirqs[d->irq - trig->subirq_base].enabled = true;
 }
 
-static __printf(2, 0)
+static __printf(3, 0)
 struct iio_trigger *viio_trigger_alloc(struct device *parent,
+				       struct module *this_mod,
 				       const char *fmt,
 				       va_list vargs)
 {
@@ -578,6 +576,8 @@ struct iio_trigger *viio_trigger_alloc(struct device *parent,
 
 	INIT_LIST_HEAD(&trig->list);
 
+	trig->owner = this_mod;
+
 	trig->subirq_chip.name = trig->name;
 	trig->subirq_chip.irq_mask = &iio_trig_subirqmask;
 	trig->subirq_chip.irq_unmask = &iio_trig_subirqunmask;
@@ -598,8 +598,9 @@ free_trig:
 }
 
 /**
- * iio_trigger_alloc - Allocate a trigger
+ * __iio_trigger_alloc - Allocate a trigger
  * @parent:		Device to allocate iio_trigger for
+ * @this_mod:		module allocating the trigger
  * @fmt:		trigger name format. If it includes format
  *			specifiers, the additional arguments following
  *			format are formatted and inserted in the resulting
@@ -607,18 +608,20 @@ free_trig:
  * RETURNS:
  * Pointer to allocated iio_trigger on success, NULL on failure.
  */
-struct iio_trigger *iio_trigger_alloc(struct device *parent, const char *fmt, ...)
+struct iio_trigger *__iio_trigger_alloc(struct device *parent,
+					struct module *this_mod,
+					const char *fmt, ...)
 {
 	struct iio_trigger *trig;
 	va_list vargs;
 
 	va_start(vargs, fmt);
-	trig = viio_trigger_alloc(parent, fmt, vargs);
+	trig = viio_trigger_alloc(parent, this_mod, fmt, vargs);
 	va_end(vargs);
 
 	return trig;
 }
-EXPORT_SYMBOL(iio_trigger_alloc);
+EXPORT_SYMBOL(__iio_trigger_alloc);
 
 void iio_trigger_free(struct iio_trigger *trig)
 {
@@ -633,10 +636,11 @@ static void devm_iio_trigger_release(struct device *dev, void *res)
 }
 
 /**
- * devm_iio_trigger_alloc - Resource-managed iio_trigger_alloc()
+ * __devm_iio_trigger_alloc - Resource-managed iio_trigger_alloc()
  * Managed iio_trigger_alloc.  iio_trigger allocated with this function is
  * automatically freed on driver detach.
  * @parent:		Device to allocate iio_trigger for
+ * @this_mod:		module allocating the trigger
  * @fmt:		trigger name format. If it includes format
  *			specifiers, the additional arguments following
  *			format are formatted and inserted in the resulting
@@ -646,7 +650,9 @@ static void devm_iio_trigger_release(struct device *dev, void *res)
  * RETURNS:
  * Pointer to allocated iio_trigger on success, NULL on failure.
  */
-struct iio_trigger *devm_iio_trigger_alloc(struct device *parent, const char *fmt, ...)
+struct iio_trigger *__devm_iio_trigger_alloc(struct device *parent,
+					     struct module *this_mod,
+					     const char *fmt, ...)
 {
 	struct iio_trigger **ptr, *trig;
 	va_list vargs;
@@ -658,7 +664,7 @@ struct iio_trigger *devm_iio_trigger_alloc(struct device *parent, const char *fm
 
 	/* use raw alloc_dr for kmalloc caller tracing */
 	va_start(vargs, fmt);
-	trig = viio_trigger_alloc(parent, fmt, vargs);
+	trig = viio_trigger_alloc(parent, this_mod, fmt, vargs);
 	va_end(vargs);
 	if (trig) {
 		*ptr = trig;
@@ -669,7 +675,7 @@ struct iio_trigger *devm_iio_trigger_alloc(struct device *parent, const char *fm
 
 	return trig;
 }
-EXPORT_SYMBOL_GPL(devm_iio_trigger_alloc);
+EXPORT_SYMBOL_GPL(__devm_iio_trigger_alloc);
 
 static void devm_iio_trigger_unreg(void *trigger_info)
 {
@@ -677,10 +683,9 @@ static void devm_iio_trigger_unreg(void *trigger_info)
 }
 
 /**
- * __devm_iio_trigger_register - Resource-managed iio_trigger_register()
+ * devm_iio_trigger_register - Resource-managed iio_trigger_register()
  * @dev:	device this trigger was allocated for
  * @trig_info:	trigger to register
- * @this_mod:   module registering the trigger
  *
  * Managed iio_trigger_register().  The IIO trigger registered with this
  * function is automatically unregistered on driver detach. This function
@@ -690,19 +695,18 @@ static void devm_iio_trigger_unreg(void *trigger_info)
  * RETURNS:
  * 0 on success, negative error number on failure.
  */
-int __devm_iio_trigger_register(struct device *dev,
-				struct iio_trigger *trig_info,
-				struct module *this_mod)
+int devm_iio_trigger_register(struct device *dev,
+			      struct iio_trigger *trig_info)
 {
 	int ret;
 
-	ret = __iio_trigger_register(trig_info, this_mod);
+	ret = iio_trigger_register(trig_info);
 	if (ret)
 		return ret;
 
 	return devm_add_action_or_reset(dev, devm_iio_trigger_unreg, trig_info);
 }
-EXPORT_SYMBOL_GPL(__devm_iio_trigger_register);
+EXPORT_SYMBOL_GPL(devm_iio_trigger_register);
 
 bool iio_trigger_using_own(struct iio_dev *indio_dev)
 {
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index d9b4a9ca9a0f..5dfbfc991c69 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -727,10 +727,13 @@ static inline void *iio_priv(const struct iio_dev *indio_dev)
 
 void iio_device_free(struct iio_dev *indio_dev);
 struct iio_dev *devm_iio_device_alloc(struct device *parent, int sizeof_priv);
-__printf(2, 3)
-struct iio_trigger *devm_iio_trigger_alloc(struct device *parent,
-					   const char *fmt, ...);
 
+#define devm_iio_trigger_alloc(parent, fmt, ...) \
+	__devm_iio_trigger_alloc((parent), THIS_MODULE, (fmt), ##__VA_ARGS__)
+__printf(3, 4)
+struct iio_trigger *__devm_iio_trigger_alloc(struct device *parent,
+					     struct module *this_mod,
+					     const char *fmt, ...);
 /**
  * iio_get_debugfs_dentry() - helper function to get the debugfs_dentry
  * @indio_dev:		IIO device structure for device
diff --git a/include/linux/iio/trigger.h b/include/linux/iio/trigger.h
index 03b1d6863436..f6360d9a492d 100644
--- a/include/linux/iio/trigger.h
+++ b/include/linux/iio/trigger.h
@@ -131,16 +131,10 @@ static inline void *iio_trigger_get_drvdata(struct iio_trigger *trig)
  * iio_trigger_register() - register a trigger with the IIO core
  * @trig_info:	trigger to be registered
  **/
-#define iio_trigger_register(trig_info) \
-	__iio_trigger_register((trig_info), THIS_MODULE)
-int __iio_trigger_register(struct iio_trigger *trig_info,
-			   struct module *this_mod);
+int iio_trigger_register(struct iio_trigger *trig_info);
 
-#define devm_iio_trigger_register(dev, trig_info) \
-	__devm_iio_trigger_register((dev), (trig_info), THIS_MODULE)
-int __devm_iio_trigger_register(struct device *dev,
-				struct iio_trigger *trig_info,
-				struct module *this_mod);
+int devm_iio_trigger_register(struct device *dev,
+			      struct iio_trigger *trig_info);
 
 /**
  * iio_trigger_unregister() - unregister a trigger from the core
@@ -168,8 +162,13 @@ void iio_trigger_poll_chained(struct iio_trigger *trig);
 
 irqreturn_t iio_trigger_generic_data_rdy_poll(int irq, void *private);
 
-__printf(2, 3)
-struct iio_trigger *iio_trigger_alloc(struct device *parent, const char *fmt, ...);
+#define iio_trigger_alloc(parent, fmt, ...) \
+	__iio_trigger_alloc((parent), THIS_MODULE, (fmt), ##__VA_ARGS__)
+
+__printf(3, 4)
+struct iio_trigger *__iio_trigger_alloc(struct device *parent,
+					struct module *this_mod,
+					const char *fmt, ...);
 void iio_trigger_free(struct iio_trigger *trig);
 
 /**
-- 
cgit 


From 3c8e19d3d3f9a20cde987fa73fd83b13dcc8604f Mon Sep 17 00:00:00 2001
From: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
Date: Wed, 6 Jul 2022 19:28:55 +0100
Subject: media: Add P010 tiled format

Add P010 tiled format

[rebased, updated pixel format name and added description]

Tested-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Signed-off-by: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
Signed-off-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 .../userspace-api/media/v4l/pixfmt-yuv-planar.rst      | 18 +++++++++++++++---
 drivers/media/v4l2-core/v4l2-common.c                  |  1 +
 drivers/media/v4l2-core/v4l2-ioctl.c                   |  1 +
 include/uapi/linux/videodev2.h                         |  1 +
 4 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst b/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst
index 997ce2d094fc..10b1feeb0b57 100644
--- a/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst
+++ b/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst
@@ -116,6 +116,13 @@ All components are stored with the same number of bits per component.
       - Cb, Cr
       - Yes
       - Linear
+    * - V4L2_PIX_FMT_P010_4L4
+      - 'T010'
+      - 10
+      - 4:2:0
+      - Cb, Cr
+      - Yes
+      - 4x4 tiles
     * - V4L2_PIX_FMT_NV16
       - 'NV16'
       - 8
@@ -528,11 +535,12 @@ number of lines as the luma plane.
       - Cr\ :sub:`33`
 
 .. _V4L2_PIX_FMT_P010:
+.. _V4L2-PIX-FMT-P010-4L4:
 
-P010
-----
+P010 and tiled P010
+-------------------
 
-Like NV12 with 10 bits per component, expanded to 16 bits.
+P010 is like NV12 with 10 bits per component, expanded to 16 bits.
 Data in the 10 high bits, zeros in the 6 low bits, arranged in little endian order.
 
 .. flat-table:: Sample 4x4 P010 Image
@@ -589,6 +597,10 @@ relationship between the luma and chroma line padding and stride.
 
 All components are stored with the same number of bits per component.
 
+``V4L2_PIX_FMT_P010_4L4`` stores pixels in 4x4 tiles, and stores tiles linearly
+in memory. The line stride must be aligned to multiple of 8 and image height to
+a multiple of 4. The layouts of the luma and chroma planes are identical.
+
 .. raw:: latex
 
     \small
diff --git a/drivers/media/v4l2-core/v4l2-common.c b/drivers/media/v4l2-core/v4l2-common.c
index 1e38ad8906a2..e0fbe6ba4b6c 100644
--- a/drivers/media/v4l2-core/v4l2-common.c
+++ b/drivers/media/v4l2-core/v4l2-common.c
@@ -278,6 +278,7 @@ const struct v4l2_format_info *v4l2_format_info(u32 format)
 
 		/* Tiled YUV formats */
 		{ .format = V4L2_PIX_FMT_NV12_4L4, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 2, .vdiv = 2 },
+		{ .format = V4L2_PIX_FMT_P010_4L4, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 2, 4, 0, 0 }, .hdiv = 2, .vdiv = 2 },
 
 		/* YUV planar formats, non contiguous variant */
 		{ .format = V4L2_PIX_FMT_YUV420M, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 3, .comp_planes = 3, .bpp = { 1, 1, 1, 0 }, .hdiv = 2, .vdiv = 2 },
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index d9b6b8a678bc..c314025d977e 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1357,6 +1357,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_NV12_4L4:	descr = "Y/CbCr 4:2:0 (4x4 Linear)"; break;
 	case V4L2_PIX_FMT_NV12_16L16:	descr = "Y/CbCr 4:2:0 (16x16 Linear)"; break;
 	case V4L2_PIX_FMT_NV12_32L32:   descr = "Y/CbCr 4:2:0 (32x32 Linear)"; break;
+	case V4L2_PIX_FMT_P010_4L4:	descr = "10-bit Y/CbCr 4:2:0 (4x4 Linear)"; break;
 	case V4L2_PIX_FMT_NV12M:	descr = "Y/CbCr 4:2:0 (N-C)"; break;
 	case V4L2_PIX_FMT_NV21M:	descr = "Y/CrCb 4:2:0 (N-C)"; break;
 	case V4L2_PIX_FMT_NV16M:	descr = "Y/CbCr 4:2:2 (N-C)"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index d6fac2344033..01e630f2ec78 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -654,6 +654,7 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_NV12_4L4 v4l2_fourcc('V', 'T', '1', '2')   /* 12  Y/CbCr 4:2:0  4x4 tiles */
 #define V4L2_PIX_FMT_NV12_16L16 v4l2_fourcc('H', 'M', '1', '2') /* 12  Y/CbCr 4:2:0 16x16 tiles */
 #define V4L2_PIX_FMT_NV12_32L32 v4l2_fourcc('S', 'T', '1', '2') /* 12  Y/CbCr 4:2:0 32x32 tiles */
+#define V4L2_PIX_FMT_P010_4L4 v4l2_fourcc('T', '0', '1', '0') /* 12  Y/CbCr 4:2:0 10-bit 4x4 macroblocks */
 
 /* Tiled YUV formats, non contiguous planes */
 #define V4L2_PIX_FMT_NV12MT  v4l2_fourcc('T', 'M', '1', '2') /* 12  Y/CbCr 4:2:0 64x32 tiles */
-- 
cgit 


From 9d042e457ef8e3f883f0cba8a1eed633906deb78 Mon Sep 17 00:00:00 2001
From: Ezequiel Garcia <ezequiel@collabora.com>
Date: Mon, 11 Jul 2022 22:11:34 +0100
Subject: media: videobuf2: Introduce vb2_find_buffer()

All users of vb2_find_timestamp() combine it with vb2_get_buffer()
to retrieve a videobuf2 buffer, given a u64 timestamp.

Introduce an API for this use-case. Users will be converted to the new
API as follow-up commits.

Signed-off-by: Ezequiel Garcia <ezequiel@collabora.com>
Acked-by: Tomasz Figa <tfiga@chromium.org>
Reviewed-by: Philipp Zabel <p.zabel@pengutronix.de>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/media/common/videobuf2/videobuf2-v4l2.c | 12 ++++++++++++
 include/media/videobuf2-v4l2.h                  | 10 ++++++++++
 2 files changed, 22 insertions(+)

(limited to 'include')

diff --git a/drivers/media/common/videobuf2/videobuf2-v4l2.c b/drivers/media/common/videobuf2/videobuf2-v4l2.c
index 075d24ebf44c..f26cb8586bd4 100644
--- a/drivers/media/common/videobuf2/videobuf2-v4l2.c
+++ b/drivers/media/common/videobuf2/videobuf2-v4l2.c
@@ -638,6 +638,18 @@ int vb2_find_timestamp(const struct vb2_queue *q, u64 timestamp,
 }
 EXPORT_SYMBOL_GPL(vb2_find_timestamp);
 
+struct vb2_buffer *vb2_find_buffer(struct vb2_queue *q, u64 timestamp)
+{
+	unsigned int i;
+
+	for (i = 0; i < q->num_buffers; i++)
+		if (q->bufs[i]->copied_timestamp &&
+		    q->bufs[i]->timestamp == timestamp)
+			return vb2_get_buffer(q, i);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(vb2_find_buffer);
+
 /*
  * vb2_querybuf() - query video buffer information
  * @q:		videobuf queue
diff --git a/include/media/videobuf2-v4l2.h b/include/media/videobuf2-v4l2.h
index d818d9707695..76e405c0b003 100644
--- a/include/media/videobuf2-v4l2.h
+++ b/include/media/videobuf2-v4l2.h
@@ -78,6 +78,16 @@ struct vb2_v4l2_buffer {
 int vb2_find_timestamp(const struct vb2_queue *q, u64 timestamp,
 		       unsigned int start_idx);
 
+/**
+ * vb2_find_buffer() - Find a buffer with given timestamp
+ *
+ * @q:		pointer to &struct vb2_queue with videobuf2 queue.
+ * @timestamp:	the timestamp to find.
+ *
+ * Returns the buffer with the given @timestamp, or NULL if not found.
+ */
+struct vb2_buffer *vb2_find_buffer(struct vb2_queue *q, u64 timestamp);
+
 int vb2_querybuf(struct vb2_queue *q, struct v4l2_buffer *b);
 
 /**
-- 
cgit 


From 57cb848f004820a88279f91ca108af33554cd38e Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Tue, 14 Jun 2022 20:10:48 +0100
Subject: media: v4l2-async: Add notifier operation to destroy asd instances

Drivers typically extend the v4l2_async_subdev structure by embedding it
in a driver-specific structure, to store per-subdev custom data. The
v4l2_async_subdev instances are freed by the v4l2-async framework, which
makes this mechanism cumbersome to use safely when custom data needs
special treatment to be destroyed (such as freeing additional memory, or
releasing references to kernel objects).

To ease this, add a .destroy() operation to the
v4l2_async_notifier_operations structure. The operation is called right
before the v4l2_async_subdev is freed, giving drivers a chance to
destroy data if needed.

Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Reviewed-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 Documentation/driver-api/media/v4l2-subdev.rst |  6 ++++++
 drivers/media/v4l2-core/v4l2-async.c           | 10 ++++++++++
 include/media/v4l2-async.h                     |  2 ++
 3 files changed, 18 insertions(+)

(limited to 'include')

diff --git a/Documentation/driver-api/media/v4l2-subdev.rst b/Documentation/driver-api/media/v4l2-subdev.rst
index cf3b52bdbfb9..6f8d79926aa5 100644
--- a/Documentation/driver-api/media/v4l2-subdev.rst
+++ b/Documentation/driver-api/media/v4l2-subdev.rst
@@ -243,6 +243,12 @@ notifier callback is called. After all subdevices have been located the
 .complete() callback is called. When a subdevice is removed from the
 system the .unbind() method is called. All three callbacks are optional.
 
+Drivers can store any type of custom data in their driver-specific
+:c:type:`v4l2_async_subdev` wrapper. If any of that data requires special
+handling when the structure is freed, drivers must implement the ``.destroy()``
+notifier callback. The framework will call it right before freeing the
+:c:type:`v4l2_async_subdev`.
+
 Calling subdev operations
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/drivers/media/v4l2-core/v4l2-async.c b/drivers/media/v4l2-core/v4l2-async.c
index b16f3ce8e5ef..2f1b718a9189 100644
--- a/drivers/media/v4l2-core/v4l2-async.c
+++ b/drivers/media/v4l2-core/v4l2-async.c
@@ -52,6 +52,15 @@ static int v4l2_async_nf_call_complete(struct v4l2_async_notifier *n)
 	return n->ops->complete(n);
 }
 
+static void v4l2_async_nf_call_destroy(struct v4l2_async_notifier *n,
+				       struct v4l2_async_subdev *asd)
+{
+	if (!n->ops || !n->ops->destroy)
+		return;
+
+	n->ops->destroy(asd);
+}
+
 static bool match_i2c(struct v4l2_async_notifier *notifier,
 		      struct v4l2_subdev *sd, struct v4l2_async_subdev *asd)
 {
@@ -633,6 +642,7 @@ static void __v4l2_async_nf_cleanup(struct v4l2_async_notifier *notifier)
 		}
 
 		list_del(&asd->asd_list);
+		v4l2_async_nf_call_destroy(notifier, asd);
 		kfree(asd);
 	}
 }
diff --git a/include/media/v4l2-async.h b/include/media/v4l2-async.h
index 13ff3ad948f4..25eb1d138c06 100644
--- a/include/media/v4l2-async.h
+++ b/include/media/v4l2-async.h
@@ -81,6 +81,7 @@ struct v4l2_async_subdev {
  * @complete:	All subdevices have been probed successfully. The complete
  *		callback is only executed for the root notifier.
  * @unbind:	a subdevice is leaving
+ * @destroy:	the asd is about to be freed
  */
 struct v4l2_async_notifier_operations {
 	int (*bound)(struct v4l2_async_notifier *notifier,
@@ -90,6 +91,7 @@ struct v4l2_async_notifier_operations {
 	void (*unbind)(struct v4l2_async_notifier *notifier,
 		       struct v4l2_subdev *subdev,
 		       struct v4l2_async_subdev *asd);
+	void (*destroy)(struct v4l2_async_subdev *asd);
 };
 
 /**
-- 
cgit 


From b2e44430b6348f68f56e78e932e6312f12128778 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Sat, 25 Jun 2022 18:02:24 +0100
Subject: media: mc-entity: Rename media_entity_remote_pad() to
 media_pad_remote_pad_first()

The media_entity_remote_pad() is misnamed, as it operates on a pad and
not an entity. Rename it to media_pad_remote_pad_first() to clarify its
behaviour.

Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Reviewed-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 Documentation/driver-api/media/mc-core.rst                 | 3 +--
 drivers/media/i2c/adv748x/adv748x.h                        | 2 +-
 drivers/media/i2c/tvp5150.c                                | 2 +-
 drivers/media/mc/mc-entity.c                               | 4 ++--
 drivers/media/pci/intel/ipu3/ipu3-cio2-main.c              | 2 +-
 drivers/media/platform/qcom/camss/camss-csid.c             | 6 +++---
 drivers/media/platform/qcom/camss/camss-csiphy.c           | 2 +-
 drivers/media/platform/qcom/camss/camss-ispif.c            | 4 ++--
 drivers/media/platform/qcom/camss/camss-vfe.c              | 2 +-
 drivers/media/platform/qcom/camss/camss-video.c            | 6 +++---
 drivers/media/platform/qcom/camss/camss.c                  | 2 +-
 drivers/media/platform/renesas/rcar-vin/rcar-core.c        | 2 +-
 drivers/media/platform/renesas/rcar-vin/rcar-csi2.c        | 2 +-
 drivers/media/platform/renesas/rcar-vin/rcar-dma.c         | 2 +-
 drivers/media/platform/renesas/rcar-vin/rcar-v4l2.c        | 2 +-
 drivers/media/platform/renesas/vsp1/vsp1_entity.c          | 4 ++--
 drivers/media/platform/renesas/vsp1/vsp1_video.c           | 2 +-
 drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c        | 2 +-
 drivers/media/platform/samsung/exynos4-is/common.c         | 2 +-
 drivers/media/platform/samsung/exynos4-is/fimc-capture.c   | 6 +++---
 drivers/media/platform/samsung/exynos4-is/fimc-isp-video.c | 2 +-
 drivers/media/platform/samsung/exynos4-is/fimc-lite.c      | 2 +-
 drivers/media/platform/samsung/exynos4-is/media-dev.c      | 2 +-
 drivers/media/platform/samsung/s3c-camif/camif-capture.c   | 2 +-
 drivers/media/platform/st/stm32/stm32-dcmi.c               | 6 +++---
 drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c       | 4 ++--
 drivers/media/platform/ti/cal/cal-camerarx.c               | 2 +-
 drivers/media/platform/ti/cal/cal-video.c                  | 2 +-
 drivers/media/platform/ti/omap3isp/isp.c                   | 6 +++---
 drivers/media/platform/ti/omap3isp/ispccdc.c               | 2 +-
 drivers/media/platform/ti/omap3isp/ispccp2.c               | 2 +-
 drivers/media/platform/ti/omap3isp/ispcsi2.c               | 2 +-
 drivers/media/platform/ti/omap3isp/ispvideo.c              | 4 ++--
 drivers/media/platform/video-mux.c                         | 2 +-
 drivers/media/platform/xilinx/xilinx-csi2rxss.c            | 2 +-
 drivers/media/platform/xilinx/xilinx-dma.c                 | 4 ++--
 drivers/media/test-drivers/vimc/vimc-streamer.c            | 2 +-
 drivers/staging/media/imx/imx-media-dev-common.c           | 2 +-
 drivers/staging/media/imx/imx-media-utils.c                | 2 +-
 drivers/staging/media/imx/imx7-media-csi.c                 | 2 +-
 drivers/staging/media/omap4iss/iss.c                       | 6 +++---
 drivers/staging/media/omap4iss/iss_csi2.c                  | 2 +-
 drivers/staging/media/omap4iss/iss_video.c                 | 2 +-
 drivers/staging/media/tegra-video/vi.c                     | 4 ++--
 include/media/media-entity.h                               | 4 ++--
 45 files changed, 65 insertions(+), 66 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/media/mc-core.rst b/Documentation/driver-api/media/mc-core.rst
index 02481a2513b9..6eea6a3b6441 100644
--- a/Documentation/driver-api/media/mc-core.rst
+++ b/Documentation/driver-api/media/mc-core.rst
@@ -186,8 +186,7 @@ is required and the graph structure can be freed normally.
 
 Helper functions can be used to find a link between two given pads, or a pad
 connected to another pad through an enabled link
-:c:func:`media_entity_find_link()` and
-:c:func:`media_entity_remote_pad()`.
+:c:func:`media_entity_find_link()` and :c:func:`media_pad_remote_pad_first()`.
 
 Use count and power handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/drivers/media/i2c/adv748x/adv748x.h b/drivers/media/i2c/adv748x/adv748x.h
index 31bac06d46b5..d75eb3d8be5a 100644
--- a/drivers/media/i2c/adv748x/adv748x.h
+++ b/drivers/media/i2c/adv748x/adv748x.h
@@ -417,7 +417,7 @@ int adv748x_write_block(struct adv748x_state *state, int client_page,
 
 static inline struct v4l2_subdev *adv748x_get_remote_sd(struct media_pad *pad)
 {
-	pad = media_entity_remote_pad(pad);
+	pad = media_pad_remote_pad_first(pad);
 	if (!pad)
 		return NULL;
 
diff --git a/drivers/media/i2c/tvp5150.c b/drivers/media/i2c/tvp5150.c
index 65472438444b..93a980c4e899 100644
--- a/drivers/media/i2c/tvp5150.c
+++ b/drivers/media/i2c/tvp5150.c
@@ -1285,7 +1285,7 @@ static int tvp5150_disable_all_input_links(struct tvp5150 *decoder)
 	int err;
 
 	for (i = 0; i < TVP5150_NUM_PADS - 1; i++) {
-		connector_pad = media_entity_remote_pad(&decoder->pads[i]);
+		connector_pad = media_pad_remote_pad_first(&decoder->pads[i]);
 		if (!connector_pad)
 			continue;
 
diff --git a/drivers/media/mc/mc-entity.c b/drivers/media/mc/mc-entity.c
index cebb905260d3..66ba0629c4ce 100644
--- a/drivers/media/mc/mc-entity.c
+++ b/drivers/media/mc/mc-entity.c
@@ -901,7 +901,7 @@ media_entity_find_link(struct media_pad *source, struct media_pad *sink)
 }
 EXPORT_SYMBOL_GPL(media_entity_find_link);
 
-struct media_pad *media_entity_remote_pad(const struct media_pad *pad)
+struct media_pad *media_pad_remote_pad_first(const struct media_pad *pad)
 {
 	struct media_link *link;
 
@@ -919,7 +919,7 @@ struct media_pad *media_entity_remote_pad(const struct media_pad *pad)
 	return NULL;
 
 }
-EXPORT_SYMBOL_GPL(media_entity_remote_pad);
+EXPORT_SYMBOL_GPL(media_pad_remote_pad_first);
 
 static void media_interface_init(struct media_device *mdev,
 				 struct media_interface *intf,
diff --git a/drivers/media/pci/intel/ipu3/ipu3-cio2-main.c b/drivers/media/pci/intel/ipu3/ipu3-cio2-main.c
index dbdbdb648a0d..a3fe547b7fce 100644
--- a/drivers/media/pci/intel/ipu3/ipu3-cio2-main.c
+++ b/drivers/media/pci/intel/ipu3/ipu3-cio2-main.c
@@ -1323,7 +1323,7 @@ static int cio2_video_link_validate(struct media_link *link)
 	struct v4l2_subdev_format source_fmt;
 	int ret;
 
-	if (!media_entity_remote_pad(entity->pads)) {
+	if (!media_pad_remote_pad_first(entity->pads)) {
 		dev_info(dev, "video node %s pad not connected\n", vd->name);
 		return -ENOTCONN;
 	}
diff --git a/drivers/media/platform/qcom/camss/camss-csid.c b/drivers/media/platform/qcom/camss/camss-csid.c
index 80628801cf09..88f188e0f750 100644
--- a/drivers/media/platform/qcom/camss/camss-csid.c
+++ b/drivers/media/platform/qcom/camss/camss-csid.c
@@ -245,7 +245,7 @@ static int csid_set_stream(struct v4l2_subdev *sd, int enable)
 		}
 
 		if (!csid->testgen.enabled &&
-		    !media_entity_remote_pad(&csid->pads[MSM_CSID_PAD_SINK]))
+		    !media_pad_remote_pad_first(&csid->pads[MSM_CSID_PAD_SINK]))
 			return -ENOLINK;
 	}
 
@@ -518,7 +518,7 @@ static int csid_set_test_pattern(struct csid_device *csid, s32 value)
 	struct csid_testgen_config *tg = &csid->testgen;
 
 	/* If CSID is linked to CSIPHY, do not allow to enable test generator */
-	if (value && media_entity_remote_pad(&csid->pads[MSM_CSID_PAD_SINK]))
+	if (value && media_pad_remote_pad_first(&csid->pads[MSM_CSID_PAD_SINK]))
 		return -EBUSY;
 
 	tg->enabled = !!value;
@@ -729,7 +729,7 @@ static int csid_link_setup(struct media_entity *entity,
 			   const struct media_pad *remote, u32 flags)
 {
 	if (flags & MEDIA_LNK_FL_ENABLED)
-		if (media_entity_remote_pad(local))
+		if (media_pad_remote_pad_first(local))
 			return -EBUSY;
 
 	if ((local->flags & MEDIA_PAD_FL_SINK) &&
diff --git a/drivers/media/platform/qcom/camss/camss-csiphy.c b/drivers/media/platform/qcom/camss/camss-csiphy.c
index 75fcfc627400..3f726a7237f5 100644
--- a/drivers/media/platform/qcom/camss/camss-csiphy.c
+++ b/drivers/media/platform/qcom/camss/camss-csiphy.c
@@ -693,7 +693,7 @@ static int csiphy_link_setup(struct media_entity *entity,
 		struct csiphy_device *csiphy;
 		struct csid_device *csid;
 
-		if (media_entity_remote_pad(local))
+		if (media_pad_remote_pad_first(local))
 			return -EBUSY;
 
 		sd = media_entity_to_v4l2_subdev(entity);
diff --git a/drivers/media/platform/qcom/camss/camss-ispif.c b/drivers/media/platform/qcom/camss/camss-ispif.c
index 91e6a2b9ac50..b713f5b86aba 100644
--- a/drivers/media/platform/qcom/camss/camss-ispif.c
+++ b/drivers/media/platform/qcom/camss/camss-ispif.c
@@ -812,7 +812,7 @@ static int ispif_set_stream(struct v4l2_subdev *sd, int enable)
 	int ret;
 
 	if (enable) {
-		if (!media_entity_remote_pad(&line->pads[MSM_ISPIF_PAD_SINK]))
+		if (!media_pad_remote_pad_first(&line->pads[MSM_ISPIF_PAD_SINK]))
 			return -ENOLINK;
 
 		/* Config */
@@ -1301,7 +1301,7 @@ static int ispif_link_setup(struct media_entity *entity,
 			    const struct media_pad *remote, u32 flags)
 {
 	if (flags & MEDIA_LNK_FL_ENABLED) {
-		if (media_entity_remote_pad(local))
+		if (media_pad_remote_pad_first(local))
 			return -EBUSY;
 
 		if (local->flags & MEDIA_PAD_FL_SINK) {
diff --git a/drivers/media/platform/qcom/camss/camss-vfe.c b/drivers/media/platform/qcom/camss/camss-vfe.c
index 76e28b832568..a26e4a5d87b6 100644
--- a/drivers/media/platform/qcom/camss/camss-vfe.c
+++ b/drivers/media/platform/qcom/camss/camss-vfe.c
@@ -1436,7 +1436,7 @@ static int vfe_link_setup(struct media_entity *entity,
 			  const struct media_pad *remote, u32 flags)
 {
 	if (flags & MEDIA_LNK_FL_ENABLED)
-		if (media_entity_remote_pad(local))
+		if (media_pad_remote_pad_first(local))
 			return -EBUSY;
 
 	return 0;
diff --git a/drivers/media/platform/qcom/camss/camss-video.c b/drivers/media/platform/qcom/camss/camss-video.c
index 307bb1dc4589..290df04c4d02 100644
--- a/drivers/media/platform/qcom/camss/camss-video.c
+++ b/drivers/media/platform/qcom/camss/camss-video.c
@@ -328,7 +328,7 @@ static struct v4l2_subdev *video_remote_subdev(struct camss_video *video,
 {
 	struct media_pad *remote;
 
-	remote = media_entity_remote_pad(&video->pad);
+	remote = media_pad_remote_pad_first(&video->pad);
 
 	if (!remote || !is_media_entity_v4l2_subdev(remote->entity))
 		return NULL;
@@ -507,7 +507,7 @@ static int video_start_streaming(struct vb2_queue *q, unsigned int count)
 		if (!(pad->flags & MEDIA_PAD_FL_SINK))
 			break;
 
-		pad = media_entity_remote_pad(pad);
+		pad = media_pad_remote_pad_first(pad);
 		if (!pad || !is_media_entity_v4l2_subdev(pad->entity))
 			break;
 
@@ -543,7 +543,7 @@ static void video_stop_streaming(struct vb2_queue *q)
 		if (!(pad->flags & MEDIA_PAD_FL_SINK))
 			break;
 
-		pad = media_entity_remote_pad(pad);
+		pad = media_pad_remote_pad_first(pad);
 		if (!pad || !is_media_entity_v4l2_subdev(pad->entity))
 			break;
 
diff --git a/drivers/media/platform/qcom/camss/camss.c b/drivers/media/platform/qcom/camss/camss.c
index 932968e5f1e5..1118c40886d5 100644
--- a/drivers/media/platform/qcom/camss/camss.c
+++ b/drivers/media/platform/qcom/camss/camss.c
@@ -937,7 +937,7 @@ struct media_entity *camss_find_sensor(struct media_entity *entity)
 		if (!(pad->flags & MEDIA_PAD_FL_SINK))
 			return NULL;
 
-		pad = media_entity_remote_pad(pad);
+		pad = media_pad_remote_pad_first(pad);
 		if (!pad || !is_media_entity_v4l2_subdev(pad->entity))
 			return NULL;
 
diff --git a/drivers/media/platform/renesas/rcar-vin/rcar-core.c b/drivers/media/platform/renesas/rcar-vin/rcar-core.c
index 4b7a9743554a..968a74234e92 100644
--- a/drivers/media/platform/renesas/rcar-vin/rcar-core.c
+++ b/drivers/media/platform/renesas/rcar-vin/rcar-core.c
@@ -845,7 +845,7 @@ static int rvin_csi2_link_notify(struct media_link *link, u32 flags,
 				continue;
 
 			/* Get remote CSI-2, if any. */
-			csi_pad = media_entity_remote_pad(
+			csi_pad = media_pad_remote_pad_first(
 					&group->vin[i]->vdev.entity.pads[0]);
 			if (!csi_pad)
 				continue;
diff --git a/drivers/media/platform/renesas/rcar-vin/rcar-csi2.c b/drivers/media/platform/renesas/rcar-vin/rcar-csi2.c
index fea8f00a9152..174aa6176f54 100644
--- a/drivers/media/platform/renesas/rcar-vin/rcar-csi2.c
+++ b/drivers/media/platform/renesas/rcar-vin/rcar-csi2.c
@@ -1313,7 +1313,7 @@ static int rcsi2_link_setup(struct media_entity *entity,
 	channel = id % 4;
 
 	if (flags & MEDIA_LNK_FL_ENABLED) {
-		if (media_entity_remote_pad(local)) {
+		if (media_pad_remote_pad_first(local)) {
 			dev_dbg(priv->dev,
 				"Each VC can only be routed to one output channel\n");
 			return -EINVAL;
diff --git a/drivers/media/platform/renesas/rcar-vin/rcar-dma.c b/drivers/media/platform/renesas/rcar-vin/rcar-dma.c
index 6644b498929d..8d37fbdc266a 100644
--- a/drivers/media/platform/renesas/rcar-vin/rcar-dma.c
+++ b/drivers/media/platform/renesas/rcar-vin/rcar-dma.c
@@ -1258,7 +1258,7 @@ static int rvin_set_stream(struct rvin_dev *vin, int on)
 		return ret == -ENOIOCTLCMD ? 0 : ret;
 	}
 
-	pad = media_entity_remote_pad(&vin->pad);
+	pad = media_pad_remote_pad_first(&vin->pad);
 	if (!pad)
 		return -EPIPE;
 
diff --git a/drivers/media/platform/renesas/rcar-vin/rcar-v4l2.c b/drivers/media/platform/renesas/rcar-vin/rcar-v4l2.c
index 2e2aa9d746ee..576059f9bbe3 100644
--- a/drivers/media/platform/renesas/rcar-vin/rcar-v4l2.c
+++ b/drivers/media/platform/renesas/rcar-vin/rcar-v4l2.c
@@ -1032,7 +1032,7 @@ static void rvin_notify(struct v4l2_subdev *sd,
 		if (!vin)
 			continue;
 
-		pad = media_entity_remote_pad(&vin->pad);
+		pad = media_pad_remote_pad_first(&vin->pad);
 		if (!pad)
 			continue;
 
diff --git a/drivers/media/platform/renesas/vsp1/vsp1_entity.c b/drivers/media/platform/renesas/vsp1/vsp1_entity.c
index a116a3362f9e..4c3bd2b1ca28 100644
--- a/drivers/media/platform/renesas/vsp1/vsp1_entity.c
+++ b/drivers/media/platform/renesas/vsp1/vsp1_entity.c
@@ -516,8 +516,8 @@ int vsp1_entity_link_setup(struct media_entity *entity,
  * higher than one for the data pipelines, except for the links to the HGO and
  * HGT that can be enabled in addition to a regular data link. When traversing
  * outgoing links this function ignores HGO and HGT entities and should thus be
- * used in place of the generic media_entity_remote_pad() function to traverse
- * data pipelines.
+ * used in place of the generic media_pad_remote_pad_first() function to
+ * traverse data pipelines.
  *
  * Return a pointer to the pad at the remote end of the first found enabled
  * link, or NULL if no enabled link has been found.
diff --git a/drivers/media/platform/renesas/vsp1/vsp1_video.c b/drivers/media/platform/renesas/vsp1/vsp1_video.c
index 51219b1b6ea9..e8e0ee5f2277 100644
--- a/drivers/media/platform/renesas/vsp1/vsp1_video.c
+++ b/drivers/media/platform/renesas/vsp1/vsp1_video.c
@@ -50,7 +50,7 @@ vsp1_video_remote_subdev(struct media_pad *local, u32 *pad)
 {
 	struct media_pad *remote;
 
-	remote = media_entity_remote_pad(local);
+	remote = media_pad_remote_pad_first(local);
 	if (!remote || !is_media_entity_v4l2_subdev(remote->entity))
 		return NULL;
 
diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c
index 187d78075acb..a97c145bad98 100644
--- a/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c
+++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c
@@ -200,7 +200,7 @@ static struct v4l2_subdev *rkisp1_get_remote_sensor(struct v4l2_subdev *sd)
 	struct media_entity *sensor_me;
 
 	local = &sd->entity.pads[RKISP1_ISP_PAD_SINK_VIDEO];
-	remote = media_entity_remote_pad(local);
+	remote = media_pad_remote_pad_first(local);
 	if (!remote)
 		return NULL;
 
diff --git a/drivers/media/platform/samsung/exynos4-is/common.c b/drivers/media/platform/samsung/exynos4-is/common.c
index 26ee2388edfd..e41333535eac 100644
--- a/drivers/media/platform/samsung/exynos4-is/common.c
+++ b/drivers/media/platform/samsung/exynos4-is/common.c
@@ -21,7 +21,7 @@ struct v4l2_subdev *fimc_find_remote_sensor(struct media_entity *entity)
 
 	while (pad->flags & MEDIA_PAD_FL_SINK) {
 		/* source pad */
-		pad = media_entity_remote_pad(pad);
+		pad = media_pad_remote_pad_first(pad);
 		if (!pad || !is_media_entity_v4l2_subdev(pad->entity))
 			break;
 
diff --git a/drivers/media/platform/samsung/exynos4-is/fimc-capture.c b/drivers/media/platform/samsung/exynos4-is/fimc-capture.c
index 7ff4024003f4..03638c8f772d 100644
--- a/drivers/media/platform/samsung/exynos4-is/fimc-capture.c
+++ b/drivers/media/platform/samsung/exynos4-is/fimc-capture.c
@@ -737,7 +737,7 @@ static struct media_entity *fimc_pipeline_get_head(struct media_entity *me)
 	struct media_pad *pad = &me->pads[0];
 
 	while (!(pad->flags & MEDIA_PAD_FL_SOURCE)) {
-		pad = media_entity_remote_pad(pad);
+		pad = media_pad_remote_pad_first(pad);
 		if (!pad)
 			break;
 		me = pad->entity;
@@ -810,7 +810,7 @@ static int fimc_pipeline_try_format(struct fimc_ctx *ctx,
 					return ret;
 			}
 
-			pad = media_entity_remote_pad(&me->pads[sfmt.pad]);
+			pad = media_pad_remote_pad_first(&me->pads[sfmt.pad]);
 			if (!pad)
 				return -EINVAL;
 			me = pad->entity;
@@ -1115,7 +1115,7 @@ static int fimc_pipeline_validate(struct fimc_dev *fimc)
 
 			if (p->flags & MEDIA_PAD_FL_SINK) {
 				sink_pad = p;
-				src_pad = media_entity_remote_pad(sink_pad);
+				src_pad = media_pad_remote_pad_first(sink_pad);
 				if (src_pad)
 					break;
 			}
diff --git a/drivers/media/platform/samsung/exynos4-is/fimc-isp-video.c b/drivers/media/platform/samsung/exynos4-is/fimc-isp-video.c
index 83688a7982f7..8f12240b0eb7 100644
--- a/drivers/media/platform/samsung/exynos4-is/fimc-isp-video.c
+++ b/drivers/media/platform/samsung/exynos4-is/fimc-isp-video.c
@@ -465,7 +465,7 @@ static int isp_video_pipeline_validate(struct fimc_isp *isp)
 			return -EPIPE;
 
 		/* Retrieve format at the source pad */
-		pad = media_entity_remote_pad(pad);
+		pad = media_pad_remote_pad_first(pad);
 		if (!pad || !is_media_entity_v4l2_subdev(pad->entity))
 			break;
 
diff --git a/drivers/media/platform/samsung/exynos4-is/fimc-lite.c b/drivers/media/platform/samsung/exynos4-is/fimc-lite.c
index 1a396b7cd9a9..41b0a4a5929a 100644
--- a/drivers/media/platform/samsung/exynos4-is/fimc-lite.c
+++ b/drivers/media/platform/samsung/exynos4-is/fimc-lite.c
@@ -789,7 +789,7 @@ static int fimc_pipeline_validate(struct fimc_lite *fimc)
 				return -EPIPE;
 		}
 		/* Retrieve format at the source pad */
-		pad = media_entity_remote_pad(pad);
+		pad = media_pad_remote_pad_first(pad);
 		if (!pad || !is_media_entity_v4l2_subdev(pad->entity))
 			break;
 
diff --git a/drivers/media/platform/samsung/exynos4-is/media-dev.c b/drivers/media/platform/samsung/exynos4-is/media-dev.c
index 544b54e428c9..52b43ea04030 100644
--- a/drivers/media/platform/samsung/exynos4-is/media-dev.c
+++ b/drivers/media/platform/samsung/exynos4-is/media-dev.c
@@ -81,7 +81,7 @@ static void fimc_pipeline_prepare(struct fimc_pipeline *p,
 			struct media_pad *spad = &me->pads[i];
 			if (!(spad->flags & MEDIA_PAD_FL_SINK))
 				continue;
-			pad = media_entity_remote_pad(spad);
+			pad = media_pad_remote_pad_first(spad);
 			if (pad)
 				break;
 		}
diff --git a/drivers/media/platform/samsung/s3c-camif/camif-capture.c b/drivers/media/platform/samsung/s3c-camif/camif-capture.c
index 140854ab4dd8..c2d8f1e425d8 100644
--- a/drivers/media/platform/samsung/s3c-camif/camif-capture.c
+++ b/drivers/media/platform/samsung/s3c-camif/camif-capture.c
@@ -811,7 +811,7 @@ static int camif_pipeline_validate(struct camif_dev *camif)
 	int ret;
 
 	/* Retrieve format at the sensor subdev source pad */
-	pad = media_entity_remote_pad(&camif->pads[0]);
+	pad = media_pad_remote_pad_first(&camif->pads[0]);
 	if (!pad || !is_media_entity_v4l2_subdev(pad->entity))
 		return -EPIPE;
 
diff --git a/drivers/media/platform/st/stm32/stm32-dcmi.c b/drivers/media/platform/st/stm32/stm32-dcmi.c
index 9b14a113083f..2ca95ab2b0fe 100644
--- a/drivers/media/platform/st/stm32/stm32-dcmi.c
+++ b/drivers/media/platform/st/stm32/stm32-dcmi.c
@@ -611,7 +611,7 @@ static struct media_entity *dcmi_find_source(struct stm32_dcmi *dcmi)
 		if (!(pad->flags & MEDIA_PAD_FL_SINK))
 			break;
 
-		pad = media_entity_remote_pad(pad);
+		pad = media_pad_remote_pad_first(pad);
 		if (!pad || !is_media_entity_v4l2_subdev(pad->entity))
 			break;
 
@@ -681,7 +681,7 @@ static int dcmi_pipeline_s_fmt(struct stm32_dcmi *dcmi,
 		}
 
 		/* Walk to next entity */
-		sink_pad = media_entity_remote_pad(src_pad);
+		sink_pad = media_pad_remote_pad_first(src_pad);
 		if (!sink_pad || !is_media_entity_v4l2_subdev(sink_pad->entity))
 			break;
 
@@ -705,7 +705,7 @@ static int dcmi_pipeline_s_stream(struct stm32_dcmi *dcmi, int state)
 		if (!(pad->flags & MEDIA_PAD_FL_SINK))
 			break;
 
-		pad = media_entity_remote_pad(pad);
+		pad = media_pad_remote_pad_first(pad);
 		if (!pad || !is_media_entity_v4l2_subdev(pad->entity))
 			break;
 
diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c
index 682c26536034..1d46e113d01d 100644
--- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c
+++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c
@@ -77,7 +77,7 @@ sun6i_video_remote_subdev(struct sun6i_video *video, u32 *pad)
 {
 	struct media_pad *remote;
 
-	remote = media_entity_remote_pad(&video->pad);
+	remote = media_pad_remote_pad_first(&video->pad);
 
 	if (!remote || !is_media_entity_v4l2_subdev(remote->entity))
 		return NULL;
@@ -560,7 +560,7 @@ static int sun6i_video_link_validate(struct media_link *link)
 
 	video->mbus_code = 0;
 
-	if (!media_entity_remote_pad(link->sink->entity->pads)) {
+	if (!media_pad_remote_pad_first(link->sink->entity->pads)) {
 		dev_info(video->csi->dev,
 			 "video node %s pad not connected\n", vdev->name);
 		return -ENOLINK;
diff --git a/drivers/media/platform/ti/cal/cal-camerarx.c b/drivers/media/platform/ti/cal/cal-camerarx.c
index a0880f0091f7..e136d70b4048 100644
--- a/drivers/media/platform/ti/cal/cal-camerarx.c
+++ b/drivers/media/platform/ti/cal/cal-camerarx.c
@@ -592,7 +592,7 @@ int cal_camerarx_get_remote_frame_desc(struct cal_camerarx *phy,
 	if (!phy->source)
 		return -EPIPE;
 
-	pad = media_entity_remote_pad(&phy->pads[CAL_CAMERARX_PAD_SINK]);
+	pad = media_pad_remote_pad_first(&phy->pads[CAL_CAMERARX_PAD_SINK]);
 	if (!pad)
 		return -EPIPE;
 
diff --git a/drivers/media/platform/ti/cal/cal-video.c b/drivers/media/platform/ti/cal/cal-video.c
index 07ae1a34e6b0..776da0cfcdbe 100644
--- a/drivers/media/platform/ti/cal/cal-video.c
+++ b/drivers/media/platform/ti/cal/cal-video.c
@@ -685,7 +685,7 @@ static int cal_video_check_format(struct cal_ctx *ctx)
 	const struct v4l2_mbus_framefmt *format;
 	struct media_pad *remote_pad;
 
-	remote_pad = media_entity_remote_pad(&ctx->pad);
+	remote_pad = media_pad_remote_pad_first(&ctx->pad);
 	if (!remote_pad)
 		return -ENODEV;
 
diff --git a/drivers/media/platform/ti/omap3isp/isp.c b/drivers/media/platform/ti/omap3isp/isp.c
index 4c937f3f323e..d251736eb420 100644
--- a/drivers/media/platform/ti/omap3isp/isp.c
+++ b/drivers/media/platform/ti/omap3isp/isp.c
@@ -700,7 +700,7 @@ static int isp_pipeline_enable(struct isp_pipeline *pipe,
 		if (!(pad->flags & MEDIA_PAD_FL_SINK))
 			break;
 
-		pad = media_entity_remote_pad(pad);
+		pad = media_pad_remote_pad_first(pad);
 		if (!pad || !is_media_entity_v4l2_subdev(pad->entity))
 			break;
 
@@ -797,7 +797,7 @@ static int isp_pipeline_disable(struct isp_pipeline *pipe)
 		if (!(pad->flags & MEDIA_PAD_FL_SINK))
 			break;
 
-		pad = media_entity_remote_pad(pad);
+		pad = media_pad_remote_pad_first(pad);
 		if (!pad || !is_media_entity_v4l2_subdev(pad->entity))
 			break;
 
@@ -942,7 +942,7 @@ static int isp_pipeline_is_last(struct media_entity *me)
 	pipe = to_isp_pipeline(me);
 	if (pipe->stream_state == ISP_PIPELINE_STREAM_STOPPED)
 		return 0;
-	pad = media_entity_remote_pad(&pipe->output->pad);
+	pad = media_pad_remote_pad_first(&pipe->output->pad);
 	return pad->entity == me;
 }
 
diff --git a/drivers/media/platform/ti/omap3isp/ispccdc.c b/drivers/media/platform/ti/omap3isp/ispccdc.c
index 108b5e9f82cb..11afb8aec292 100644
--- a/drivers/media/platform/ti/omap3isp/ispccdc.c
+++ b/drivers/media/platform/ti/omap3isp/ispccdc.c
@@ -1133,7 +1133,7 @@ static void ccdc_configure(struct isp_ccdc_device *ccdc)
 	ccdc->bt656 = false;
 	ccdc->fields = 0;
 
-	pad = media_entity_remote_pad(&ccdc->pads[CCDC_PAD_SINK]);
+	pad = media_pad_remote_pad_first(&ccdc->pads[CCDC_PAD_SINK]);
 	sensor = media_entity_to_v4l2_subdev(pad->entity);
 	if (ccdc->input == CCDC_INPUT_PARALLEL) {
 		struct v4l2_subdev *sd =
diff --git a/drivers/media/platform/ti/omap3isp/ispccp2.c b/drivers/media/platform/ti/omap3isp/ispccp2.c
index acb58b6ddba1..fc90ff88464f 100644
--- a/drivers/media/platform/ti/omap3isp/ispccp2.c
+++ b/drivers/media/platform/ti/omap3isp/ispccp2.c
@@ -357,7 +357,7 @@ static int ccp2_if_configure(struct isp_ccp2_device *ccp2)
 
 	ccp2_pwr_cfg(ccp2);
 
-	pad = media_entity_remote_pad(&ccp2->pads[CCP2_PAD_SINK]);
+	pad = media_pad_remote_pad_first(&ccp2->pads[CCP2_PAD_SINK]);
 	sensor = media_entity_to_v4l2_subdev(pad->entity);
 	buscfg = v4l2_subdev_to_bus_cfg(pipe->external);
 
diff --git a/drivers/media/platform/ti/omap3isp/ispcsi2.c b/drivers/media/platform/ti/omap3isp/ispcsi2.c
index 6302e0c94034..6870980a2fa9 100644
--- a/drivers/media/platform/ti/omap3isp/ispcsi2.c
+++ b/drivers/media/platform/ti/omap3isp/ispcsi2.c
@@ -561,7 +561,7 @@ static int csi2_configure(struct isp_csi2_device *csi2)
 	if (csi2->contexts[0].enabled || csi2->ctrl.if_enable)
 		return -EBUSY;
 
-	pad = media_entity_remote_pad(&csi2->pads[CSI2_PAD_SINK]);
+	pad = media_pad_remote_pad_first(&csi2->pads[CSI2_PAD_SINK]);
 	sensor = media_entity_to_v4l2_subdev(pad->entity);
 	buscfg = v4l2_subdev_to_bus_cfg(pipe->external);
 
diff --git a/drivers/media/platform/ti/omap3isp/ispvideo.c b/drivers/media/platform/ti/omap3isp/ispvideo.c
index 8811d6dd4ee7..d7059180e80e 100644
--- a/drivers/media/platform/ti/omap3isp/ispvideo.c
+++ b/drivers/media/platform/ti/omap3isp/ispvideo.c
@@ -206,7 +206,7 @@ isp_video_remote_subdev(struct isp_video *video, u32 *pad)
 {
 	struct media_pad *remote;
 
-	remote = media_entity_remote_pad(&video->pad);
+	remote = media_pad_remote_pad_first(&video->pad);
 
 	if (!remote || !is_media_entity_v4l2_subdev(remote->entity))
 		return NULL;
@@ -981,7 +981,7 @@ static int isp_video_check_external_subdevs(struct isp_video *video,
 			continue;
 
 		/* ISP entities have always sink pad == 0. Find source. */
-		source_pad = media_entity_remote_pad(&ents[i]->pads[0]);
+		source_pad = media_pad_remote_pad_first(&ents[i]->pads[0]);
 		if (source_pad == NULL)
 			continue;
 
diff --git a/drivers/media/platform/video-mux.c b/drivers/media/platform/video-mux.c
index b31e5913a4cd..71d97042a470 100644
--- a/drivers/media/platform/video-mux.c
+++ b/drivers/media/platform/video-mux.c
@@ -118,7 +118,7 @@ static int video_mux_s_stream(struct v4l2_subdev *sd, int enable)
 		return -EINVAL;
 	}
 
-	pad = media_entity_remote_pad(&sd->entity.pads[vmux->active]);
+	pad = media_pad_remote_pad_first(&sd->entity.pads[vmux->active]);
 	if (!pad) {
 		dev_err(sd->dev, "Failed to find remote source pad\n");
 		return -ENOLINK;
diff --git a/drivers/media/platform/xilinx/xilinx-csi2rxss.c b/drivers/media/platform/xilinx/xilinx-csi2rxss.c
index 051c60cba1e0..cf8e892c47f0 100644
--- a/drivers/media/platform/xilinx/xilinx-csi2rxss.c
+++ b/drivers/media/platform/xilinx/xilinx-csi2rxss.c
@@ -474,7 +474,7 @@ static struct v4l2_subdev *xcsi2rxss_get_remote_subdev(struct media_pad *local)
 {
 	struct media_pad *remote;
 
-	remote = media_entity_remote_pad(local);
+	remote = media_pad_remote_pad_first(local);
 	if (!remote || !is_media_entity_v4l2_subdev(remote->entity))
 		return NULL;
 
diff --git a/drivers/media/platform/xilinx/xilinx-dma.c b/drivers/media/platform/xilinx/xilinx-dma.c
index 338c3661d809..2d1ef7a25c33 100644
--- a/drivers/media/platform/xilinx/xilinx-dma.c
+++ b/drivers/media/platform/xilinx/xilinx-dma.c
@@ -44,7 +44,7 @@ xvip_dma_remote_subdev(struct media_pad *local, u32 *pad)
 {
 	struct media_pad *remote;
 
-	remote = media_entity_remote_pad(local);
+	remote = media_pad_remote_pad_first(local);
 	if (!remote || !is_media_entity_v4l2_subdev(remote->entity))
 		return NULL;
 
@@ -107,7 +107,7 @@ static int xvip_pipeline_start_stop(struct xvip_pipeline *pipe, bool start)
 		if (!(pad->flags & MEDIA_PAD_FL_SINK))
 			break;
 
-		pad = media_entity_remote_pad(pad);
+		pad = media_pad_remote_pad_first(pad);
 		if (!pad || !is_media_entity_v4l2_subdev(pad->entity))
 			break;
 
diff --git a/drivers/media/test-drivers/vimc/vimc-streamer.c b/drivers/media/test-drivers/vimc/vimc-streamer.c
index 65feb3c596db..807551a5143b 100644
--- a/drivers/media/test-drivers/vimc/vimc-streamer.c
+++ b/drivers/media/test-drivers/vimc/vimc-streamer.c
@@ -30,7 +30,7 @@ static struct media_entity *vimc_get_source_entity(struct media_entity *ent)
 	for (i = 0; i < ent->num_pads; i++) {
 		if (ent->pads[i].flags & MEDIA_PAD_FL_SOURCE)
 			continue;
-		pad = media_entity_remote_pad(&ent->pads[i]);
+		pad = media_pad_remote_pad_first(&ent->pads[i]);
 		return pad ? pad->entity : NULL;
 	}
 	return NULL;
diff --git a/drivers/staging/media/imx/imx-media-dev-common.c b/drivers/staging/media/imx/imx-media-dev-common.c
index 80b69a9a752c..e6d6ed3b1161 100644
--- a/drivers/staging/media/imx/imx-media-dev-common.c
+++ b/drivers/staging/media/imx/imx-media-dev-common.c
@@ -235,7 +235,7 @@ static int imx_media_inherit_controls(struct imx_media_dev *imxmd,
 		if (!(spad->flags & MEDIA_PAD_FL_SINK))
 			continue;
 
-		pad = media_entity_remote_pad(spad);
+		pad = media_pad_remote_pad_first(spad);
 		if (!pad || !is_media_entity_v4l2_subdev(pad->entity))
 			continue;
 
diff --git a/drivers/staging/media/imx/imx-media-utils.c b/drivers/staging/media/imx/imx-media-utils.c
index 94bc866ca28c..294c808b2ebe 100644
--- a/drivers/staging/media/imx/imx-media-utils.c
+++ b/drivers/staging/media/imx/imx-media-utils.c
@@ -698,7 +698,7 @@ imx_media_pipeline_pad(struct media_entity *start_entity, u32 grp_id,
 		    (!upstream && !(spad->flags & MEDIA_PAD_FL_SOURCE)))
 			continue;
 
-		pad = media_entity_remote_pad(spad);
+		pad = media_pad_remote_pad_first(spad);
 		if (!pad)
 			continue;
 
diff --git a/drivers/staging/media/imx/imx7-media-csi.c b/drivers/staging/media/imx/imx7-media-csi.c
index 0066af8d111f..a0553c24cce4 100644
--- a/drivers/staging/media/imx/imx7-media-csi.c
+++ b/drivers/staging/media/imx/imx7-media-csi.c
@@ -1963,7 +1963,7 @@ static int imx7_csi_pad_link_validate(struct v4l2_subdev *sd,
 			if (!(spad->flags & MEDIA_PAD_FL_SINK))
 				continue;
 
-			pad = media_entity_remote_pad(spad);
+			pad = media_pad_remote_pad_first(spad);
 			if (pad)
 				break;
 		}
diff --git a/drivers/staging/media/omap4iss/iss.c b/drivers/staging/media/omap4iss/iss.c
index 68588e9dab0b..28aacda0f5a7 100644
--- a/drivers/staging/media/omap4iss/iss.c
+++ b/drivers/staging/media/omap4iss/iss.c
@@ -395,7 +395,7 @@ static int iss_pipeline_disable(struct iss_pipeline *pipe,
 		if (!(pad->flags & MEDIA_PAD_FL_SINK))
 			break;
 
-		pad = media_entity_remote_pad(pad);
+		pad = media_pad_remote_pad_first(pad);
 		if (!pad || !is_media_entity_v4l2_subdev(pad->entity))
 			break;
 
@@ -464,7 +464,7 @@ static int iss_pipeline_enable(struct iss_pipeline *pipe,
 		if (!(pad->flags & MEDIA_PAD_FL_SINK))
 			break;
 
-		pad = media_entity_remote_pad(pad);
+		pad = media_pad_remote_pad_first(pad);
 		if (!pad || !is_media_entity_v4l2_subdev(pad->entity))
 			break;
 
@@ -553,7 +553,7 @@ static int iss_pipeline_is_last(struct media_entity *me)
 	pipe = to_iss_pipeline(me);
 	if (pipe->stream_state == ISS_PIPELINE_STREAM_STOPPED)
 		return 0;
-	pad = media_entity_remote_pad(&pipe->output->pad);
+	pad = media_pad_remote_pad_first(&pipe->output->pad);
 	return pad->entity == me;
 }
 
diff --git a/drivers/staging/media/omap4iss/iss_csi2.c b/drivers/staging/media/omap4iss/iss_csi2.c
index 124ab2f44fbf..04ce0e7eb557 100644
--- a/drivers/staging/media/omap4iss/iss_csi2.c
+++ b/drivers/staging/media/omap4iss/iss_csi2.c
@@ -538,7 +538,7 @@ static int csi2_configure(struct iss_csi2_device *csi2)
 	if (csi2->contexts[0].enabled || csi2->ctrl.if_enable)
 		return -EBUSY;
 
-	pad = media_entity_remote_pad(&csi2->pads[CSI2_PAD_SINK]);
+	pad = media_pad_remote_pad_first(&csi2->pads[CSI2_PAD_SINK]);
 	sensor = media_entity_to_v4l2_subdev(pad->entity);
 	pdata = sensor->host_priv;
 
diff --git a/drivers/staging/media/omap4iss/iss_video.c b/drivers/staging/media/omap4iss/iss_video.c
index d0da083deed5..9512cd3314f2 100644
--- a/drivers/staging/media/omap4iss/iss_video.c
+++ b/drivers/staging/media/omap4iss/iss_video.c
@@ -190,7 +190,7 @@ iss_video_remote_subdev(struct iss_video *video, u32 *pad)
 {
 	struct media_pad *remote;
 
-	remote = media_entity_remote_pad(&video->pad);
+	remote = media_pad_remote_pad_first(&video->pad);
 
 	if (!remote || !is_media_entity_v4l2_subdev(remote->entity))
 		return NULL;
diff --git a/drivers/staging/media/tegra-video/vi.c b/drivers/staging/media/tegra-video/vi.c
index 8e184aa4c252..9d46a36cc014 100644
--- a/drivers/staging/media/tegra-video/vi.c
+++ b/drivers/staging/media/tegra-video/vi.c
@@ -157,7 +157,7 @@ tegra_channel_get_remote_csi_subdev(struct tegra_vi_channel *chan)
 {
 	struct media_pad *pad;
 
-	pad = media_entity_remote_pad(&chan->pad);
+	pad = media_pad_remote_pad_first(&chan->pad);
 	if (!pad)
 		return NULL;
 
@@ -177,7 +177,7 @@ tegra_channel_get_remote_source_subdev(struct tegra_vi_channel *chan)
 
 	pad = &subdev->entity.pads[0];
 	while (!(pad->flags & MEDIA_PAD_FL_SOURCE)) {
-		pad = media_entity_remote_pad(pad);
+		pad = media_pad_remote_pad_first(pad);
 		if (!pad || !is_media_entity_v4l2_subdev(pad->entity))
 			break;
 		entity = pad->entity;
diff --git a/include/media/media-entity.h b/include/media/media-entity.h
index 4efe2f88e6dc..5e272a0a9895 100644
--- a/include/media/media-entity.h
+++ b/include/media/media-entity.h
@@ -848,7 +848,7 @@ struct media_link *media_entity_find_link(struct media_pad *source,
 		struct media_pad *sink);
 
 /**
- * media_entity_remote_pad - Find the pad at the remote end of a link
+ * media_pad_remote_pad_first - Find the first pad at the remote end of a link
  * @pad: Pad at the local end of the link
  *
  * Search for a remote pad connected to the given pad by iterating over all
@@ -857,7 +857,7 @@ struct media_link *media_entity_find_link(struct media_pad *source,
  * Return: returns a pointer to the pad at the remote end of the first found
  * enabled link, or %NULL if no enabled link has been found.
  */
-struct media_pad *media_entity_remote_pad(const struct media_pad *pad);
+struct media_pad *media_pad_remote_pad_first(const struct media_pad *pad);
 
 /**
  * media_entity_is_streaming - Test if an entity is part of a streaming pipeline
-- 
cgit 


From 5680fe45d66bbef32a902c04889a523fa4fc33ba Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Tue, 14 Jun 2022 20:11:09 +0100
Subject: media: mc-entity: Add a new helper function to get a remote pad

The media_entity_remote_pad_first() helper function returns the first
remote pad it finds connected to a given pad. Beside being possibly
non-deterministic (as it stops at the first enabled link), the fact that
it returns the first match makes it unsuitable for drivers that need to
guarantee that a single link is enabled, for instance when an entity can
process data from one of multiple sources at a time.

For those use cases, add a new helper function,
media_entity_remote_pad_unique(), that operates on an entity and returns
a remote pad, with a guarantee that only one link is enabled. To ease
its use in drivers, also add an inline wrapper that locates source pads
specifically. A wrapper that locates sink pads can easily be added when
needed.

Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Acked-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 Documentation/driver-api/media/mc-core.rst |  3 +-
 drivers/media/mc/mc-entity.c               | 39 +++++++++++++++++++++++++
 include/media/media-entity.h               | 46 ++++++++++++++++++++++++++++++
 3 files changed, 87 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/driver-api/media/mc-core.rst b/Documentation/driver-api/media/mc-core.rst
index 6eea6a3b6441..66801506b2dd 100644
--- a/Documentation/driver-api/media/mc-core.rst
+++ b/Documentation/driver-api/media/mc-core.rst
@@ -186,7 +186,8 @@ is required and the graph structure can be freed normally.
 
 Helper functions can be used to find a link between two given pads, or a pad
 connected to another pad through an enabled link
-:c:func:`media_entity_find_link()` and :c:func:`media_pad_remote_pad_first()`.
+(:c:func:`media_entity_find_link()`, :c:func:`media_pad_remote_pad_first()` and
+:c:func:`media_entity_remote_source_pad_unique()`).
 
 Use count and power handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/drivers/media/mc/mc-entity.c b/drivers/media/mc/mc-entity.c
index 66ba0629c4ce..767846b216c5 100644
--- a/drivers/media/mc/mc-entity.c
+++ b/drivers/media/mc/mc-entity.c
@@ -921,6 +921,45 @@ struct media_pad *media_pad_remote_pad_first(const struct media_pad *pad)
 }
 EXPORT_SYMBOL_GPL(media_pad_remote_pad_first);
 
+struct media_pad *
+media_entity_remote_pad_unique(const struct media_entity *entity,
+			       unsigned int type)
+{
+	struct media_pad *pad = NULL;
+	struct media_link *link;
+
+	list_for_each_entry(link, &entity->links, list) {
+		struct media_pad *local_pad;
+		struct media_pad *remote_pad;
+
+		if (((link->flags & MEDIA_LNK_FL_LINK_TYPE) !=
+		     MEDIA_LNK_FL_DATA_LINK) ||
+		    !(link->flags & MEDIA_LNK_FL_ENABLED))
+			continue;
+
+		if (type == MEDIA_PAD_FL_SOURCE) {
+			local_pad = link->sink;
+			remote_pad = link->source;
+		} else {
+			local_pad = link->source;
+			remote_pad = link->sink;
+		}
+
+		if (local_pad->entity == entity) {
+			if (pad)
+				return ERR_PTR(-ENOTUNIQ);
+
+			pad = remote_pad;
+		}
+	}
+
+	if (!pad)
+		return ERR_PTR(-ENOLINK);
+
+	return pad;
+}
+EXPORT_SYMBOL_GPL(media_entity_remote_pad_unique);
+
 static void media_interface_init(struct media_device *mdev,
 				 struct media_interface *intf,
 				 u32 gobj_type,
diff --git a/include/media/media-entity.h b/include/media/media-entity.h
index 5e272a0a9895..c5618f7c1408 100644
--- a/include/media/media-entity.h
+++ b/include/media/media-entity.h
@@ -859,6 +859,52 @@ struct media_link *media_entity_find_link(struct media_pad *source,
  */
 struct media_pad *media_pad_remote_pad_first(const struct media_pad *pad);
 
+/**
+ * media_entity_remote_pad_unique - Find a remote pad connected to an entity
+ * @entity: The entity
+ * @type: The type of pad to find (MEDIA_PAD_FL_SINK or MEDIA_PAD_FL_SOURCE)
+ *
+ * Search for and return a remote pad of @type connected to @entity through an
+ * enabled link. If multiple (or no) remote pads match these criteria, an error
+ * is returned.
+ *
+ * The uniqueness constraint makes this helper function suitable for entities
+ * that support a single active source or sink at a time.
+ *
+ * Return: A pointer to the remote pad, or one of the following error pointers
+ * if an error occurs:
+ *
+ * * -ENOTUNIQ - Multiple links are enabled
+ * * -ENOLINK - No connected pad found
+ */
+struct media_pad *
+media_entity_remote_pad_unique(const struct media_entity *entity,
+			       unsigned int type);
+
+/**
+ * media_entity_remote_source_pad_unique - Find a remote source pad connected to
+ *	an entity
+ * @entity: The entity
+ *
+ * Search for and return a remote source pad connected to @entity through an
+ * enabled link. If multiple (or no) remote pads match these criteria, an error
+ * is returned.
+ *
+ * The uniqueness constraint makes this helper function suitable for entities
+ * that support a single active source at a time.
+ *
+ * Return: A pointer to the remote pad, or one of the following error pointers
+ * if an error occurs:
+ *
+ * * -ENOTUNIQ - Multiple links are enabled
+ * * -ENOLINK - No connected pad found
+ */
+static inline struct media_pad *
+media_entity_remote_source_pad_unique(const struct media_entity *entity)
+{
+	return media_entity_remote_pad_unique(entity, MEDIA_PAD_FL_SOURCE);
+}
+
 /**
  * media_entity_is_streaming - Test if an entity is part of a streaming pipeline
  * @entity: The entity
-- 
cgit 


From 03b282861ca737b7e2dfb9a1e4a2a4a7e3594688 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Tue, 14 Jun 2022 20:11:10 +0100
Subject: media: mc-entity: Add a new helper function to get a remote pad for a
 pad

The newly added media_entity_remote_source_pad_unique() helper function
handles use cases where the entity has a link enabled uniqueness
constraint covering all pads. There are use cases where the constraint
covers a specific pad only. Add a new media_pad_remote_pad_unique()
function to handle this. It operates as
media_entity_remote_source_pad_unique(), but on a given pad instead of
on the entity.

Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Acked-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 Documentation/driver-api/media/mc-core.rst |  5 +++--
 drivers/media/mc/mc-entity.c               | 31 ++++++++++++++++++++++++++++++
 include/media/media-entity.h               | 18 +++++++++++++++++
 3 files changed, 52 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/media/mc-core.rst b/Documentation/driver-api/media/mc-core.rst
index 66801506b2dd..644911936ad9 100644
--- a/Documentation/driver-api/media/mc-core.rst
+++ b/Documentation/driver-api/media/mc-core.rst
@@ -186,8 +186,9 @@ is required and the graph structure can be freed normally.
 
 Helper functions can be used to find a link between two given pads, or a pad
 connected to another pad through an enabled link
-(:c:func:`media_entity_find_link()`, :c:func:`media_pad_remote_pad_first()` and
-:c:func:`media_entity_remote_source_pad_unique()`).
+(:c:func:`media_entity_find_link()`, :c:func:`media_pad_remote_pad_first()`n
+:c:func:`media_entity_remote_source_pad_unique()` and
+:c:func:`media_pad_remote_pad_unique()`).
 
 Use count and power handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/drivers/media/mc/mc-entity.c b/drivers/media/mc/mc-entity.c
index 767846b216c5..afd1bd7ff7b6 100644
--- a/drivers/media/mc/mc-entity.c
+++ b/drivers/media/mc/mc-entity.c
@@ -960,6 +960,37 @@ media_entity_remote_pad_unique(const struct media_entity *entity,
 }
 EXPORT_SYMBOL_GPL(media_entity_remote_pad_unique);
 
+struct media_pad *media_pad_remote_pad_unique(const struct media_pad *pad)
+{
+	struct media_pad *found_pad = NULL;
+	struct media_link *link;
+
+	list_for_each_entry(link, &pad->entity->links, list) {
+		struct media_pad *remote_pad;
+
+		if (!(link->flags & MEDIA_LNK_FL_ENABLED))
+			continue;
+
+		if (link->sink == pad)
+			remote_pad = link->source;
+		else if (link->source == pad)
+			remote_pad = link->sink;
+		else
+			continue;
+
+		if (found_pad)
+			return ERR_PTR(-ENOTUNIQ);
+
+		found_pad = remote_pad;
+	}
+
+	if (!found_pad)
+		return ERR_PTR(-ENOLINK);
+
+	return found_pad;
+}
+EXPORT_SYMBOL_GPL(media_pad_remote_pad_unique);
+
 static void media_interface_init(struct media_device *mdev,
 				 struct media_interface *intf,
 				 u32 gobj_type,
diff --git a/include/media/media-entity.h b/include/media/media-entity.h
index c5618f7c1408..f16ffe70f7a6 100644
--- a/include/media/media-entity.h
+++ b/include/media/media-entity.h
@@ -859,6 +859,24 @@ struct media_link *media_entity_find_link(struct media_pad *source,
  */
 struct media_pad *media_pad_remote_pad_first(const struct media_pad *pad);
 
+/**
+ * media_pad_remote_pad_unique - Find a remote pad connected to a pad
+ * @pad: The pad
+ *
+ * Search for and return a remote pad connected to @pad through an enabled
+ * link. If multiple (or no) remote pads are found, an error is returned.
+ *
+ * The uniqueness constraint makes this helper function suitable for entities
+ * that support a single active source at a time on a given pad.
+ *
+ * Return: A pointer to the remote pad, or one of the following error pointers
+ * if an error occurs:
+ *
+ * * -ENOTUNIQ - Multiple links are enabled
+ * * -ENOLINK - No connected pad found
+ */
+struct media_pad *media_pad_remote_pad_unique(const struct media_pad *pad);
+
 /**
  * media_entity_remote_pad_unique - Find a remote pad connected to an entity
  * @entity: The entity
-- 
cgit 


From 9f968c9266aa30b0e81be0c6a560e45b93bed3dc Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 5 Jul 2022 14:34:33 +0100
Subject: KVM: arm64: vgic-v2: Add helper for legacy dist/cpuif base address
 setting

We carry a legacy interface to set the base addresses for GICv2.
As this is currently plumbed into the same handling code as
the modern interface, it limits the evolution we can make there.

Add a helper dedicated to this handling, with a view of maybe
removing this in the future.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/arm.c                  | 11 ++---------
 arch/arm64/kvm/vgic/vgic-kvm-device.c | 32 ++++++++++++++++++++++++++++++++
 include/kvm/arm_vgic.h                |  1 +
 3 files changed, 35 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index a0188144a122..fd26beacbbe5 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1414,18 +1414,11 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
 static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
 					struct kvm_arm_device_addr *dev_addr)
 {
-	unsigned long dev_id, type;
-
-	dev_id = (dev_addr->id & KVM_ARM_DEVICE_ID_MASK) >>
-		KVM_ARM_DEVICE_ID_SHIFT;
-	type = (dev_addr->id & KVM_ARM_DEVICE_TYPE_MASK) >>
-		KVM_ARM_DEVICE_TYPE_SHIFT;
-
-	switch (dev_id) {
+	switch (FIELD_GET(KVM_ARM_DEVICE_ID_MASK, dev_addr->id)) {
 	case KVM_ARM_DEVICE_VGIC_V2:
 		if (!vgic_present)
 			return -ENXIO;
-		return kvm_vgic_addr(kvm, type, &dev_addr->addr, true);
+		return kvm_set_legacy_vgic_v2_addr(kvm, dev_addr);
 	default:
 		return -ENODEV;
 	}
diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c
index c17e5502c0b3..04175fd55da6 100644
--- a/arch/arm64/kvm/vgic/vgic-kvm-device.c
+++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c
@@ -41,6 +41,38 @@ static int vgic_check_type(struct kvm *kvm, int type_needed)
 		return 0;
 }
 
+int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr)
+{
+	struct vgic_dist *vgic = &kvm->arch.vgic;
+	int r;
+
+	mutex_lock(&kvm->lock);
+	switch (FIELD_GET(KVM_ARM_DEVICE_TYPE_MASK, dev_addr->id)) {
+	case KVM_VGIC_V2_ADDR_TYPE_DIST:
+		r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
+		if (!r)
+			r = vgic_check_iorange(kvm, vgic->vgic_dist_base, dev_addr->addr,
+					       SZ_4K, KVM_VGIC_V2_DIST_SIZE);
+		if (!r)
+			vgic->vgic_dist_base = dev_addr->addr;
+		break;
+	case KVM_VGIC_V2_ADDR_TYPE_CPU:
+		r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
+		if (!r)
+			r = vgic_check_iorange(kvm, vgic->vgic_cpu_base, dev_addr->addr,
+					       SZ_4K, KVM_VGIC_V2_CPU_SIZE);
+		if (!r)
+			vgic->vgic_cpu_base = dev_addr->addr;
+		break;
+	default:
+		r = -ENODEV;
+	}
+
+	mutex_unlock(&kvm->lock);
+
+	return r;
+}
+
 /**
  * kvm_vgic_addr - set or get vgic VM base addresses
  * @kvm:   pointer to the vm struct
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 2d8f2e90edc2..f79cce67563e 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -365,6 +365,7 @@ extern struct static_key_false vgic_v2_cpuif_trap;
 extern struct static_key_false vgic_v3_cpuif_trap;
 
 int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
+int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr);
 void kvm_vgic_early_init(struct kvm *kvm);
 int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu);
 int kvm_vgic_create(struct kvm *kvm, u32 type);
-- 
cgit 


From 4b85080f4e378f617f88964dec94fd282bcf2af4 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 5 Jul 2022 14:39:24 +0100
Subject: KVM: arm64: vgic: Consolidate userspace access for base address
 setting

Align kvm_vgic_addr() with the rest of the code by moving the
userspace accesses into it. kvm_vgic_addr() is also made static.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/vgic/vgic-kvm-device.c | 75 +++++++++++++++--------------------
 include/kvm/arm_vgic.h                |  1 -
 2 files changed, 31 insertions(+), 45 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c
index 04175fd55da6..011171dc41c5 100644
--- a/arch/arm64/kvm/vgic/vgic-kvm-device.c
+++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c
@@ -76,8 +76,7 @@ int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev
 /**
  * kvm_vgic_addr - set or get vgic VM base addresses
  * @kvm:   pointer to the vm struct
- * @type:  the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX
- * @addr:  pointer to address value
+ * @attr:  pointer to the attribute being retrieved/updated
  * @write: if true set the address in the VM address space, if false read the
  *          address
  *
@@ -89,15 +88,22 @@ int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev
  * overlapping regions in case of a virtual GICv3 here, since we don't know
  * the number of VCPUs yet, so we defer this check to map_resources().
  */
-int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
+static int kvm_vgic_addr(struct kvm *kvm, struct kvm_device_attr *attr, bool write)
 {
-	int r = 0;
+	u64 __user *uaddr = (u64 __user *)attr->addr;
 	struct vgic_dist *vgic = &kvm->arch.vgic;
 	phys_addr_t *addr_ptr, alignment, size;
 	u64 undef_value = VGIC_ADDR_UNDEF;
+	u64 addr;
+	int r;
+
+	/* Reading a redistributor region addr implies getting the index */
+	if (write || attr->attr == KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION)
+		if (get_user(addr, uaddr))
+			return -EFAULT;
 
 	mutex_lock(&kvm->lock);
-	switch (type) {
+	switch (attr->attr) {
 	case KVM_VGIC_V2_ADDR_TYPE_DIST:
 		r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
 		addr_ptr = &vgic->vgic_dist_base;
@@ -123,7 +129,7 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
 		if (r)
 			break;
 		if (write) {
-			r = vgic_v3_set_redist_base(kvm, 0, *addr, 0);
+			r = vgic_v3_set_redist_base(kvm, 0, addr, 0);
 			goto out;
 		}
 		rdreg = list_first_entry_or_null(&vgic->rd_regions,
@@ -143,14 +149,12 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
 		if (r)
 			break;
 
-		index = *addr & KVM_VGIC_V3_RDIST_INDEX_MASK;
+		index = addr & KVM_VGIC_V3_RDIST_INDEX_MASK;
 
 		if (write) {
-			gpa_t base = *addr & KVM_VGIC_V3_RDIST_BASE_MASK;
-			u32 count = (*addr & KVM_VGIC_V3_RDIST_COUNT_MASK)
-					>> KVM_VGIC_V3_RDIST_COUNT_SHIFT;
-			u8 flags = (*addr & KVM_VGIC_V3_RDIST_FLAGS_MASK)
-					>> KVM_VGIC_V3_RDIST_FLAGS_SHIFT;
+			gpa_t base = addr & KVM_VGIC_V3_RDIST_BASE_MASK;
+			u32 count = FIELD_GET(KVM_VGIC_V3_RDIST_COUNT_MASK, addr);
+			u8 flags = FIELD_GET(KVM_VGIC_V3_RDIST_FLAGS_MASK, addr);
 
 			if (!count || flags)
 				r = -EINVAL;
@@ -166,9 +170,9 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
 			goto out;
 		}
 
-		*addr = index;
-		*addr |= rdreg->base;
-		*addr |= (u64)rdreg->count << KVM_VGIC_V3_RDIST_COUNT_SHIFT;
+		addr = index;
+		addr |= rdreg->base;
+		addr |= (u64)rdreg->count << KVM_VGIC_V3_RDIST_COUNT_SHIFT;
 		goto out;
 	}
 	default:
@@ -179,15 +183,19 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
 		goto out;
 
 	if (write) {
-		r = vgic_check_iorange(kvm, *addr_ptr, *addr, alignment, size);
+		r = vgic_check_iorange(kvm, *addr_ptr, addr, alignment, size);
 		if (!r)
-			*addr_ptr = *addr;
+			*addr_ptr = addr;
 	} else {
-		*addr = *addr_ptr;
+		addr = *addr_ptr;
 	}
 
 out:
 	mutex_unlock(&kvm->lock);
+
+	if (!r && !write)
+		r =  put_user(addr, uaddr);
+
 	return r;
 }
 
@@ -197,17 +205,9 @@ static int vgic_set_common_attr(struct kvm_device *dev,
 	int r;
 
 	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_ADDR: {
-		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-		u64 addr;
-		unsigned long type = (unsigned long)attr->attr;
-
-		if (get_user(addr, uaddr))
-			return -EFAULT;
-
-		r = kvm_vgic_addr(dev->kvm, type, &addr, true);
+	case KVM_DEV_ARM_VGIC_GRP_ADDR:
+		r = kvm_vgic_addr(dev->kvm, attr, true);
 		return (r == -ENODEV) ? -ENXIO : r;
-	}
 	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
 		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
 		u32 val;
@@ -260,22 +260,9 @@ static int vgic_get_common_attr(struct kvm_device *dev,
 	int r = -ENXIO;
 
 	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_ADDR: {
-		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-		u64 addr;
-		unsigned long type = (unsigned long)attr->attr;
-
-		if (get_user(addr, uaddr))
-			return -EFAULT;
-
-		r = kvm_vgic_addr(dev->kvm, type, &addr, false);
-		if (r)
-			return (r == -ENODEV) ? -ENXIO : r;
-
-		if (put_user(addr, uaddr))
-			return -EFAULT;
-		break;
-	}
+	case KVM_DEV_ARM_VGIC_GRP_ADDR:
+		r = kvm_vgic_addr(dev->kvm, attr, false);
+		return (r == -ENODEV) ? -ENXIO : r;
 	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
 		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
 
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index f79cce67563e..4df9e73a8bb5 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -364,7 +364,6 @@ struct vgic_cpu {
 extern struct static_key_false vgic_v2_cpuif_trap;
 extern struct static_key_false vgic_v3_cpuif_trap;
 
-int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
 int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr);
 void kvm_vgic_early_init(struct kvm *kvm);
 int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu);
-- 
cgit 


From f484da847a01936e1d087a80cc96e8254492527c Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch@nvidia.com>
Date: Sun, 3 Jul 2022 13:54:03 -0700
Subject: net/mlx5: Expose the ability to point to any UID from shared UID

Expose shared_object_to_user_object_allowed, this capability
means an object created with shared UID can point to any UID.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Maor Gottlieb <maorg@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 8e87eb47f9dc..9321d774e2d8 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1371,7 +1371,9 @@ enum {
 };
 
 struct mlx5_ifc_cmd_hca_cap_bits {
-	u8         reserved_at_0[0x1f];
+	u8         reserved_at_0[0x10];
+	u8         shared_object_to_user_object_allowed[0x1];
+	u8         reserved_at_13[0xe];
 	u8         vhca_resource_manager[0x1];
 
 	u8         hca_cap_2[0x1];
@@ -8507,7 +8509,7 @@ struct mlx5_ifc_create_flow_table_out_bits {
 
 struct mlx5_ifc_create_flow_table_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
-- 
cgit 


From 6c27c56cdc69608211617eea217e1a6cecccc675 Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch@nvidia.com>
Date: Sun, 3 Jul 2022 13:54:04 -0700
Subject: net/mlx5: fs, expose flow table ID to users

Expose the flow table ID to users. This will be used by downstream
patches to allow creating steering rules that point to a flow table ID.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Maor Gottlieb <maorg@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 6 ++++++
 include/linux/mlx5/fs.h                           | 1 +
 2 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 14187e50e2f9..1da3dc7c95fa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1195,6 +1195,12 @@ struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
 }
 EXPORT_SYMBOL(mlx5_create_flow_table);
 
+u32 mlx5_flow_table_id(struct mlx5_flow_table *ft)
+{
+	return ft->id;
+}
+EXPORT_SYMBOL(mlx5_flow_table_id);
+
 struct mlx5_flow_table *
 mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns,
 			     struct mlx5_flow_table_attr *ft_attr, u16 vport)
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index ece3e35622d7..eee07d416b56 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -315,4 +315,5 @@ struct mlx5_pkt_reformat *mlx5_packet_reformat_alloc(struct mlx5_core_dev *dev,
 void mlx5_packet_reformat_dealloc(struct mlx5_core_dev *dev,
 				  struct mlx5_pkt_reformat *reformat);
 
+u32 mlx5_flow_table_id(struct mlx5_flow_table *ft);
 #endif
-- 
cgit 


From b0bb369ee451323968b31392a86398f15a2ba183 Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch@nvidia.com>
Date: Sun, 3 Jul 2022 13:54:05 -0700
Subject: net/mlx5: fs, allow flow table creation with a UID

Add UID field to flow table attributes to allow creating flow tables
with a non default (zero) uid.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Alex Vesker <valex@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c         | 16 ++++++++++------
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h         |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c        |  2 +-
 .../net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c    |  1 +
 .../net/ethernet/mellanox/mlx5/core/steering/dr_table.c  |  8 +++++---
 .../net/ethernet/mellanox/mlx5/core/steering/dr_types.h  |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c |  7 ++++---
 .../net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h    |  3 ++-
 include/linux/mlx5/fs.h                                  |  1 +
 9 files changed, 26 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index 735dc805dad7..e735e19461ba 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -50,10 +50,12 @@ static int mlx5_cmd_stub_update_root_ft(struct mlx5_flow_root_namespace *ns,
 
 static int mlx5_cmd_stub_create_flow_table(struct mlx5_flow_root_namespace *ns,
 					   struct mlx5_flow_table *ft,
-					   unsigned int size,
+					   struct mlx5_flow_table_attr *ft_attr,
 					   struct mlx5_flow_table *next_ft)
 {
-	ft->max_fte = size ? roundup_pow_of_two(size) : 1;
+	int max_fte = ft_attr->max_fte;
+
+	ft->max_fte = max_fte ? roundup_pow_of_two(max_fte) : 1;
 
 	return 0;
 }
@@ -258,7 +260,7 @@ static int mlx5_cmd_update_root_ft(struct mlx5_flow_root_namespace *ns,
 
 static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns,
 				      struct mlx5_flow_table *ft,
-				      unsigned int size,
+				      struct mlx5_flow_table_attr *ft_attr,
 				      struct mlx5_flow_table *next_ft)
 {
 	int en_encap = !!(ft->flags & MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT);
@@ -267,17 +269,19 @@ static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns,
 	u32 out[MLX5_ST_SZ_DW(create_flow_table_out)] = {};
 	u32 in[MLX5_ST_SZ_DW(create_flow_table_in)] = {};
 	struct mlx5_core_dev *dev = ns->dev;
+	unsigned int size;
 	int err;
 
-	if (size != POOL_NEXT_SIZE)
-		size = roundup_pow_of_two(size);
-	size = mlx5_ft_pool_get_avail_sz(dev, ft->type, size);
+	if (ft_attr->max_fte != POOL_NEXT_SIZE)
+		size = roundup_pow_of_two(ft_attr->max_fte);
+	size = mlx5_ft_pool_get_avail_sz(dev, ft->type, ft_attr->max_fte);
 	if (!size)
 		return -ENOSPC;
 
 	MLX5_SET(create_flow_table_in, in, opcode,
 		 MLX5_CMD_OP_CREATE_FLOW_TABLE);
 
+	MLX5_SET(create_flow_table_in, in, uid, ft_attr->uid);
 	MLX5_SET(create_flow_table_in, in, table_type, ft->type);
 	MLX5_SET(create_flow_table_in, in, flow_table_context.level, ft->level);
 	MLX5_SET(create_flow_table_in, in, flow_table_context.log_size, size ? ilog2(size) : 0);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
index 274004e80f03..8ef4254b9ea1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
@@ -38,7 +38,7 @@
 struct mlx5_flow_cmds {
 	int (*create_flow_table)(struct mlx5_flow_root_namespace *ns,
 				 struct mlx5_flow_table *ft,
-				 unsigned int size,
+				 struct mlx5_flow_table_attr *ft_attr,
 				 struct mlx5_flow_table *next_ft);
 	int (*destroy_flow_table)(struct mlx5_flow_root_namespace *ns,
 				  struct mlx5_flow_table *ft);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 1da3dc7c95fa..35d89edb1bcd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1155,7 +1155,7 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa
 			      find_next_chained_ft(fs_prio);
 	ft->def_miss_action = ns->def_miss_action;
 	ft->ns = ns;
-	err = root->cmds->create_flow_table(root, ft, ft_attr->max_fte, next_ft);
+	err = root->cmds->create_flow_table(root, ft, ft_attr, next_ft);
 	if (err)
 		goto free_ft;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
index 223c8741b7ae..16d65fe4f654 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
@@ -439,6 +439,7 @@ int mlx5dr_cmd_create_flow_table(struct mlx5_core_dev *mdev,
 
 	MLX5_SET(create_flow_table_in, in, opcode, MLX5_CMD_OP_CREATE_FLOW_TABLE);
 	MLX5_SET(create_flow_table_in, in, table_type, attr->table_type);
+	MLX5_SET(create_flow_table_in, in, uid, attr->uid);
 
 	ft_mdev = MLX5_ADDR_OF(create_flow_table_in, in, flow_table_context);
 	MLX5_SET(flow_table_context, ft_mdev, termination_table, attr->term_tbl);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c
index e5f6412baea9..31d443dd8386 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c
@@ -214,7 +214,7 @@ static int dr_table_destroy_sw_owned_tbl(struct mlx5dr_table *tbl)
 					     tbl->table_type);
 }
 
-static int dr_table_create_sw_owned_tbl(struct mlx5dr_table *tbl)
+static int dr_table_create_sw_owned_tbl(struct mlx5dr_table *tbl, u16 uid)
 {
 	bool en_encap = !!(tbl->flags & MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT);
 	bool en_decap = !!(tbl->flags & MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
@@ -236,6 +236,7 @@ static int dr_table_create_sw_owned_tbl(struct mlx5dr_table *tbl)
 	ft_attr.sw_owner = true;
 	ft_attr.decap_en = en_decap;
 	ft_attr.reformat_en = en_encap;
+	ft_attr.uid = uid;
 
 	ret = mlx5dr_cmd_create_flow_table(tbl->dmn->mdev, &ft_attr,
 					   NULL, &tbl->table_id);
@@ -243,7 +244,8 @@ static int dr_table_create_sw_owned_tbl(struct mlx5dr_table *tbl)
 	return ret;
 }
 
-struct mlx5dr_table *mlx5dr_table_create(struct mlx5dr_domain *dmn, u32 level, u32 flags)
+struct mlx5dr_table *mlx5dr_table_create(struct mlx5dr_domain *dmn, u32 level,
+					 u32 flags, u16 uid)
 {
 	struct mlx5dr_table *tbl;
 	int ret;
@@ -263,7 +265,7 @@ struct mlx5dr_table *mlx5dr_table_create(struct mlx5dr_domain *dmn, u32 level, u
 	if (ret)
 		goto free_tbl;
 
-	ret = dr_table_create_sw_owned_tbl(tbl);
+	ret = dr_table_create_sw_owned_tbl(tbl, uid);
 	if (ret)
 		goto uninit_tbl;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
index 98320e3945ad..50b0dd4fb4a9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
@@ -1200,6 +1200,7 @@ struct mlx5dr_cmd_query_flow_table_details {
 
 struct mlx5dr_cmd_create_flow_table_attr {
 	u32 table_type;
+	u16 uid;
 	u64 icm_addr_rx;
 	u64 icm_addr_tx;
 	u8 level;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
index 6a9abba92df6..c30ed8e18458 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
@@ -62,7 +62,7 @@ static int set_miss_action(struct mlx5_flow_root_namespace *ns,
 
 static int mlx5_cmd_dr_create_flow_table(struct mlx5_flow_root_namespace *ns,
 					 struct mlx5_flow_table *ft,
-					 unsigned int size,
+					 struct mlx5_flow_table_attr *ft_attr,
 					 struct mlx5_flow_table *next_ft)
 {
 	struct mlx5dr_table *tbl;
@@ -71,7 +71,7 @@ static int mlx5_cmd_dr_create_flow_table(struct mlx5_flow_root_namespace *ns,
 
 	if (mlx5_dr_is_fw_table(ft->flags))
 		return mlx5_fs_cmd_get_fw_cmds()->create_flow_table(ns, ft,
-								    size,
+								    ft_attr,
 								    next_ft);
 	flags = ft->flags;
 	/* turn off encap/decap if not supported for sw-str by fw */
@@ -79,7 +79,8 @@ static int mlx5_cmd_dr_create_flow_table(struct mlx5_flow_root_namespace *ns,
 		flags = ft->flags & ~(MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT |
 				      MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
 
-	tbl = mlx5dr_table_create(ns->fs_dr_domain.dr_domain, ft->level, flags);
+	tbl = mlx5dr_table_create(ns->fs_dr_domain.dr_domain, ft->level, flags,
+				  ft_attr->uid);
 	if (!tbl) {
 		mlx5_core_err(ns->dev, "Failed creating dr flow_table\n");
 		return -EINVAL;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h
index 7626c85643b1..3bb14860b36d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h
@@ -51,7 +51,8 @@ void mlx5dr_domain_set_peer(struct mlx5dr_domain *dmn,
 			    struct mlx5dr_domain *peer_dmn);
 
 struct mlx5dr_table *
-mlx5dr_table_create(struct mlx5dr_domain *domain, u32 level, u32 flags);
+mlx5dr_table_create(struct mlx5dr_domain *domain, u32 level, u32 flags,
+		    u16 uid);
 
 struct mlx5dr_table *
 mlx5dr_table_get_from_fs_ft(struct mlx5_flow_table *ft);
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index eee07d416b56..8e73c377da2c 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -178,6 +178,7 @@ struct mlx5_flow_table_attr {
 	int max_fte;
 	u32 level;
 	u32 flags;
+	u16 uid;
 	struct mlx5_flow_table *next_ft;
 
 	struct {
-- 
cgit 


From 335e52c28cf9954d65b819cb68912fd32de3c844 Mon Sep 17 00:00:00 2001
From: David Gow <davidgow@google.com>
Date: Fri, 1 Jul 2022 17:16:19 +0800
Subject: mm: Add PAGE_ALIGN_DOWN macro

This is just the same as PAGE_ALIGN(), but rounds the address down, not
up.

Suggested-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: David Gow <davidgow@google.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 include/linux/mm.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index cf3d0d673f6b..bd58ee018555 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -221,6 +221,9 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *,
 /* to align the pointer to the (next) page boundary */
 #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)
 
+/* to align the pointer to the (prev) page boundary */
+#define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE)
+
 /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */
 #define PAGE_ALIGNED(addr)	IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)
 
-- 
cgit 


From 6077c943beee407168f72ece745b0aeaef6b896f Mon Sep 17 00:00:00 2001
From: Alex Sierra <alex.sierra@amd.com>
Date: Fri, 15 Jul 2022 10:05:08 -0500
Subject: mm: rename is_pinnable_page() to is_longterm_pinnable_page()

Patch series "Add MEMORY_DEVICE_COHERENT for coherent device memory
mapping", v9.

This patch series introduces MEMORY_DEVICE_COHERENT, a type of memory
owned by a device that can be mapped into CPU page tables like
MEMORY_DEVICE_GENERIC and can also be migrated like MEMORY_DEVICE_PRIVATE.

This patch series is mostly self-contained except for a few places where
it needs to update other subsystems to handle the new memory type.

System stability and performance are not affected according to our ongoing
testing, including xfstests.

How it works: The system BIOS advertises the GPU device memory (aka VRAM)
as SPM (special purpose memory) in the UEFI system address map.

The amdgpu driver registers the memory with devmap as
MEMORY_DEVICE_COHERENT using devm_memremap_pages.  The initial user for
this hardware page migration capability is the Frontier supercomputer
project.  This functionality is not AMD-specific.  We expect other GPU
vendors to find this functionality useful, and possibly other hardware
types in the future.

Our test nodes in the lab are similar to the Frontier configuration, with
.5 TB of system memory plus 256 GB of device memory split across 4 GPUs,
all in a single coherent address space.  Page migration is expected to
improve application efficiency significantly.  We will report empirical
results as they become available.

Coherent device type pages at gup are now migrated back to system memory
if they are being pinned long-term (FOLL_LONGTERM).  The reason is, that
long-term pinning would interfere with the device memory manager owning
the device-coherent pages (e.g.  evictions in TTM).  These series
incorporate Alistair Popple patches to do this migration from
pin_user_pages() calls.  hmm_gup_test has been added to hmm-test to test
different get user pages calls.

This series includes handling of device-managed anonymous pages returned
by vm_normal_pages.  Although they behave like normal pages for purposes
of mapping in CPU page tables and for COW, they do not support LRU lists,
NUMA migration or THP.

We also introduced a FOLL_LRU flag that adds the same behaviour to
follow_page and related APIs, to allow callers to specify that they expect
to put pages on an LRU list.


This patch (of 14):

is_pinnable_page() and folio_is_pinnable() are renamed to
is_longterm_pinnable_page() and folio_is_longterm_pinnable() respectively.
These functions are used in the FOLL_LONGTERM flag context.

Link: https://lkml.kernel.org/r/20220715150521.18165-1-alex.sierra@amd.com
Link: https://lkml.kernel.org/r/20220715150521.18165-2-alex.sierra@amd.com
Signed-off-by: Alex Sierra <alex.sierra@amd.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 8 ++++----
 mm/gup.c           | 4 ++--
 mm/gup_test.c      | 2 +-
 mm/hugetlb.c       | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9cc02a7e503b..3c044e38958c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1607,7 +1607,7 @@ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
 
 /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */
 #ifdef CONFIG_MIGRATION
-static inline bool is_pinnable_page(struct page *page)
+static inline bool is_longterm_pinnable_page(struct page *page)
 {
 #ifdef CONFIG_CMA
 	int mt = get_pageblock_migratetype(page);
@@ -1618,15 +1618,15 @@ static inline bool is_pinnable_page(struct page *page)
 	return !is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page));
 }
 #else
-static inline bool is_pinnable_page(struct page *page)
+static inline bool is_longterm_pinnable_page(struct page *page)
 {
 	return true;
 }
 #endif
 
-static inline bool folio_is_pinnable(struct folio *folio)
+static inline bool folio_is_longterm_pinnable(struct folio *folio)
 {
-	return is_pinnable_page(&folio->page);
+	return is_longterm_pinnable_page(&folio->page);
 }
 
 static inline void set_page_zone(struct page *page, enum zone_type zone)
diff --git a/mm/gup.c b/mm/gup.c
index 3129b754ade3..a9940e3b3181 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -133,7 +133,7 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
 		 * path.
 		 */
 		if (unlikely((flags & FOLL_LONGTERM) &&
-			     !is_pinnable_page(page)))
+			     !is_longterm_pinnable_page(page)))
 			return NULL;
 
 		/*
@@ -1923,7 +1923,7 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
 			continue;
 		prev_folio = folio;
 
-		if (folio_is_pinnable(folio))
+		if (folio_is_longterm_pinnable(folio))
 			continue;
 
 		/*
diff --git a/mm/gup_test.c b/mm/gup_test.c
index d974dec19e1c..12b0a91767d3 100644
--- a/mm/gup_test.c
+++ b/mm/gup_test.c
@@ -53,7 +53,7 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages,
 				dump_page(page, "gup_test failure");
 				break;
 			} else if (cmd == PIN_LONGTERM_BENCHMARK &&
-				WARN(!is_pinnable_page(page),
+				WARN(!is_longterm_pinnable_page(page),
 				     "pages[%lu] is NOT pinnable but pinned\n",
 				     i)) {
 				dump_page(page, "gup_test failure");
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 064da8ffbac6..ffdf3fc4a83f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1129,7 +1129,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
 
 	lockdep_assert_held(&hugetlb_lock);
 	list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
-		if (pin && !is_pinnable_page(page))
+		if (pin && !is_longterm_pinnable_page(page))
 			continue;
 
 		if (PageHWPoison(page))
-- 
cgit 


From 5bb88dc571b1cbf0284100a317fb21ab7d03e40c Mon Sep 17 00:00:00 2001
From: Alex Sierra <alex.sierra@amd.com>
Date: Fri, 15 Jul 2022 10:05:09 -0500
Subject: mm: move page zone helpers from mm.h to mmzone.h

It makes more sense to have these helpers in zone specific header
file, rather than the generic mm.h

Link: https://lkml.kernel.org/r/20220715150521.18165-3-alex.sierra@amd.com
Signed-off-by: Alex Sierra <alex.sierra@amd.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memremap.h |  2 +-
 include/linux/mm.h       | 78 ----------------------------------------------
 include/linux/mmzone.h   | 80 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+), 79 deletions(-)

(limited to 'include')

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 9f5ee49482de..732dde5988fb 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -2,7 +2,7 @@
 #ifndef _LINUX_MEMREMAP_H_
 #define _LINUX_MEMREMAP_H_
 
-#include <linux/mm.h>
+#include <linux/mmzone.h>
 #include <linux/range.h>
 #include <linux/ioport.h>
 #include <linux/percpu-refcount.h>
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3c044e38958c..a2d01e49253b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1045,84 +1045,6 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
  *   back into memory.
  */
 
-/*
- * The zone field is never updated after free_area_init_core()
- * sets it, so none of the operations on it need to be atomic.
- */
-
-/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */
-#define SECTIONS_PGOFF		((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
-#define NODES_PGOFF		(SECTIONS_PGOFF - NODES_WIDTH)
-#define ZONES_PGOFF		(NODES_PGOFF - ZONES_WIDTH)
-#define LAST_CPUPID_PGOFF	(ZONES_PGOFF - LAST_CPUPID_WIDTH)
-#define KASAN_TAG_PGOFF		(LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
-
-/*
- * Define the bit shifts to access each section.  For non-existent
- * sections we define the shift as 0; that plus a 0 mask ensures
- * the compiler will optimise away reference to them.
- */
-#define SECTIONS_PGSHIFT	(SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
-#define NODES_PGSHIFT		(NODES_PGOFF * (NODES_WIDTH != 0))
-#define ZONES_PGSHIFT		(ZONES_PGOFF * (ZONES_WIDTH != 0))
-#define LAST_CPUPID_PGSHIFT	(LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0))
-#define KASAN_TAG_PGSHIFT	(KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0))
-
-/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
-#ifdef NODE_NOT_IN_PAGE_FLAGS
-#define ZONEID_SHIFT		(SECTIONS_SHIFT + ZONES_SHIFT)
-#define ZONEID_PGOFF		((SECTIONS_PGOFF < ZONES_PGOFF)? \
-						SECTIONS_PGOFF : ZONES_PGOFF)
-#else
-#define ZONEID_SHIFT		(NODES_SHIFT + ZONES_SHIFT)
-#define ZONEID_PGOFF		((NODES_PGOFF < ZONES_PGOFF)? \
-						NODES_PGOFF : ZONES_PGOFF)
-#endif
-
-#define ZONEID_PGSHIFT		(ZONEID_PGOFF * (ZONEID_SHIFT != 0))
-
-#define ZONES_MASK		((1UL << ZONES_WIDTH) - 1)
-#define NODES_MASK		((1UL << NODES_WIDTH) - 1)
-#define SECTIONS_MASK		((1UL << SECTIONS_WIDTH) - 1)
-#define LAST_CPUPID_MASK	((1UL << LAST_CPUPID_SHIFT) - 1)
-#define KASAN_TAG_MASK		((1UL << KASAN_TAG_WIDTH) - 1)
-#define ZONEID_MASK		((1UL << ZONEID_SHIFT) - 1)
-
-static inline enum zone_type page_zonenum(const struct page *page)
-{
-	ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT);
-	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
-}
-
-static inline enum zone_type folio_zonenum(const struct folio *folio)
-{
-	return page_zonenum(&folio->page);
-}
-
-#ifdef CONFIG_ZONE_DEVICE
-static inline bool is_zone_device_page(const struct page *page)
-{
-	return page_zonenum(page) == ZONE_DEVICE;
-}
-extern void memmap_init_zone_device(struct zone *, unsigned long,
-				    unsigned long, struct dev_pagemap *);
-#else
-static inline bool is_zone_device_page(const struct page *page)
-{
-	return false;
-}
-#endif
-
-static inline bool folio_is_zone_device(const struct folio *folio)
-{
-	return is_zone_device_page(&folio->page);
-}
-
-static inline bool is_zone_movable_page(const struct page *page)
-{
-	return page_zonenum(page) == ZONE_MOVABLE;
-}
-
 #if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
 DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 735bf5b37949..5da1135e6755 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -730,6 +730,86 @@ static inline bool zone_is_empty(struct zone *zone)
 	return zone->spanned_pages == 0;
 }
 
+#ifndef BUILD_VDSO32_64
+/*
+ * The zone field is never updated after free_area_init_core()
+ * sets it, so none of the operations on it need to be atomic.
+ */
+
+/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */
+#define SECTIONS_PGOFF		((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
+#define NODES_PGOFF		(SECTIONS_PGOFF - NODES_WIDTH)
+#define ZONES_PGOFF		(NODES_PGOFF - ZONES_WIDTH)
+#define LAST_CPUPID_PGOFF	(ZONES_PGOFF - LAST_CPUPID_WIDTH)
+#define KASAN_TAG_PGOFF		(LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
+
+/*
+ * Define the bit shifts to access each section.  For non-existent
+ * sections we define the shift as 0; that plus a 0 mask ensures
+ * the compiler will optimise away reference to them.
+ */
+#define SECTIONS_PGSHIFT	(SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
+#define NODES_PGSHIFT		(NODES_PGOFF * (NODES_WIDTH != 0))
+#define ZONES_PGSHIFT		(ZONES_PGOFF * (ZONES_WIDTH != 0))
+#define LAST_CPUPID_PGSHIFT	(LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0))
+#define KASAN_TAG_PGSHIFT	(KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0))
+
+/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+#define ZONEID_SHIFT		(SECTIONS_SHIFT + ZONES_SHIFT)
+#define ZONEID_PGOFF		((SECTIONS_PGOFF < ZONES_PGOFF) ? \
+						SECTIONS_PGOFF : ZONES_PGOFF)
+#else
+#define ZONEID_SHIFT		(NODES_SHIFT + ZONES_SHIFT)
+#define ZONEID_PGOFF		((NODES_PGOFF < ZONES_PGOFF) ? \
+						NODES_PGOFF : ZONES_PGOFF)
+#endif
+
+#define ZONEID_PGSHIFT		(ZONEID_PGOFF * (ZONEID_SHIFT != 0))
+
+#define ZONES_MASK		((1UL << ZONES_WIDTH) - 1)
+#define NODES_MASK		((1UL << NODES_WIDTH) - 1)
+#define SECTIONS_MASK		((1UL << SECTIONS_WIDTH) - 1)
+#define LAST_CPUPID_MASK	((1UL << LAST_CPUPID_SHIFT) - 1)
+#define KASAN_TAG_MASK		((1UL << KASAN_TAG_WIDTH) - 1)
+#define ZONEID_MASK		((1UL << ZONEID_SHIFT) - 1)
+
+static inline enum zone_type page_zonenum(const struct page *page)
+{
+	ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT);
+	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
+}
+
+static inline enum zone_type folio_zonenum(const struct folio *folio)
+{
+	return page_zonenum(&folio->page);
+}
+
+#ifdef CONFIG_ZONE_DEVICE
+static inline bool is_zone_device_page(const struct page *page)
+{
+	return page_zonenum(page) == ZONE_DEVICE;
+}
+extern void memmap_init_zone_device(struct zone *, unsigned long,
+				    unsigned long, struct dev_pagemap *);
+#else
+static inline bool is_zone_device_page(const struct page *page)
+{
+	return false;
+}
+#endif
+
+static inline bool folio_is_zone_device(const struct folio *folio)
+{
+	return is_zone_device_page(&folio->page);
+}
+
+static inline bool is_zone_movable_page(const struct page *page)
+{
+	return page_zonenum(page) == ZONE_MOVABLE;
+}
+#endif
+
 /*
  * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty
  * intersection with the given zone
-- 
cgit 


From f25cbb7a95a24ff9a2a3bebd308e303942ae6b2c Mon Sep 17 00:00:00 2001
From: Alex Sierra <alex.sierra@amd.com>
Date: Fri, 15 Jul 2022 10:05:10 -0500
Subject: mm: add zone device coherent type memory support

Device memory that is cache coherent from device and CPU point of view.
This is used on platforms that have an advanced system bus (like CAPI or
CXL).  Any page of a process can be migrated to such memory.  However, no
one should be allowed to pin such memory so that it can always be evicted.

[hch@lst.de: rebased ontop of the refcount changes, remove is_dev_private_or_coherent_page]
Link: https://lkml.kernel.org/r/20220715150521.18165-4-alex.sierra@amd.com
Signed-off-by: Alex Sierra <alex.sierra@amd.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memremap.h | 19 +++++++++++++++++++
 include/linux/mm.h       |  5 ++++-
 mm/memcontrol.c          |  7 ++++---
 mm/memory-failure.c      |  8 ++++++--
 mm/memremap.c            | 10 ++++++++++
 mm/migrate_device.c      | 16 +++++++---------
 mm/rmap.c                |  5 +++--
 7 files changed, 53 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 732dde5988fb..09320b7f706c 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -41,6 +41,13 @@ struct vmem_altmap {
  * A more complete discussion of unaddressable memory may be found in
  * include/linux/hmm.h and Documentation/mm/hmm.rst.
  *
+ * MEMORY_DEVICE_COHERENT:
+ * Device memory that is cache coherent from device and CPU point of view. This
+ * is used on platforms that have an advanced system bus (like CAPI or CXL). A
+ * driver can hotplug the device memory using ZONE_DEVICE and with that memory
+ * type. Any page of a process can be migrated to such memory. However no one
+ * should be allowed to pin such memory so that it can always be evicted.
+ *
  * MEMORY_DEVICE_FS_DAX:
  * Host memory that has similar access semantics as System RAM i.e. DMA
  * coherent and supports page pinning. In support of coordinating page
@@ -61,6 +68,7 @@ struct vmem_altmap {
 enum memory_type {
 	/* 0 is reserved to catch uninitialized type fields */
 	MEMORY_DEVICE_PRIVATE = 1,
+	MEMORY_DEVICE_COHERENT,
 	MEMORY_DEVICE_FS_DAX,
 	MEMORY_DEVICE_GENERIC,
 	MEMORY_DEVICE_PCI_P2PDMA,
@@ -150,6 +158,17 @@ static inline bool is_pci_p2pdma_page(const struct page *page)
 		page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
 }
 
+static inline bool is_device_coherent_page(const struct page *page)
+{
+	return is_zone_device_page(page) &&
+		page->pgmap->type == MEMORY_DEVICE_COHERENT;
+}
+
+static inline bool folio_is_device_coherent(const struct folio *folio)
+{
+	return is_device_coherent_page(&folio->page);
+}
+
 #ifdef CONFIG_ZONE_DEVICE
 void *memremap_pages(struct dev_pagemap *pgmap, int nid);
 void memunmap_pages(struct dev_pagemap *pgmap);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a2d01e49253b..64393ed3330a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -28,6 +28,7 @@
 #include <linux/sched.h>
 #include <linux/pgtable.h>
 #include <linux/kasan.h>
+#include <linux/memremap.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -1537,7 +1538,9 @@ static inline bool is_longterm_pinnable_page(struct page *page)
 	if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE)
 		return false;
 #endif
-	return !is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page));
+	return !(is_device_coherent_page(page) ||
+		 is_zone_movable_page(page) ||
+		 is_zero_pfn(page_to_pfn(page)));
 }
 #else
 static inline bool is_longterm_pinnable_page(struct page *page)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1497affe08c4..b1868784f895 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5716,8 +5716,8 @@ out:
  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
  *     target for charge migration. if @target is not NULL, the entry is stored
  *     in target->ent.
- *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is MEMORY_DEVICE_PRIVATE
- *     (so ZONE_DEVICE page and thus not on the lru).
+ *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is device memory and
+ *   thus not on the lru.
  *     For now we such page is charge like a regular page would be as for all
  *     intent and purposes it is just special memory taking the place of a
  *     regular page.
@@ -5755,7 +5755,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		 */
 		if (page_memcg(page) == mc.from) {
 			ret = MC_TARGET_PAGE;
-			if (is_device_private_page(page))
+			if (is_device_private_page(page) ||
+			    is_device_coherent_page(page))
 				ret = MC_TARGET_DEVICE;
 			if (target)
 				target->page = page;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index f7612ccdb299..b7ca5db7e60e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1686,12 +1686,16 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 		goto unlock;
 	}
 
-	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+	switch (pgmap->type) {
+	case MEMORY_DEVICE_PRIVATE:
+	case MEMORY_DEVICE_COHERENT:
 		/*
-		 * TODO: Handle HMM pages which may need coordination
+		 * TODO: Handle device pages which may need coordination
 		 * with device-side memory.
 		 */
 		goto unlock;
+	default:
+		break;
 	}
 
 	/*
diff --git a/mm/memremap.c b/mm/memremap.c
index 8b5c8fd4ea8e..f0955785150f 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -315,6 +315,16 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
 			return ERR_PTR(-EINVAL);
 		}
 		break;
+	case MEMORY_DEVICE_COHERENT:
+		if (!pgmap->ops->page_free) {
+			WARN(1, "Missing page_free method\n");
+			return ERR_PTR(-EINVAL);
+		}
+		if (!pgmap->owner) {
+			WARN(1, "Missing owner\n");
+			return ERR_PTR(-EINVAL);
+		}
+		break;
 	case MEMORY_DEVICE_FS_DAX:
 		if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
 			WARN(1, "File system DAX not supported\n");
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 5052093d0262..a4847ad65da3 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -518,7 +518,7 @@ EXPORT_SYMBOL(migrate_vma_setup);
  *     handle_pte_fault()
  *       do_anonymous_page()
  * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
- * private page.
+ * private or coherent page.
  */
 static void migrate_vma_insert_page(struct migrate_vma *migrate,
 				    unsigned long addr,
@@ -594,11 +594,8 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 						page_to_pfn(page));
 		entry = swp_entry_to_pte(swp_entry);
 	} else {
-		/*
-		 * For now we only support migrating to un-addressable device
-		 * memory.
-		 */
-		if (is_zone_device_page(page)) {
+		if (is_zone_device_page(page) &&
+		    !is_device_coherent_page(page)) {
 			pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
 			goto abort;
 		}
@@ -701,10 +698,11 @@ void migrate_vma_pages(struct migrate_vma *migrate)
 
 		mapping = page_mapping(page);
 
-		if (is_device_private_page(newpage)) {
+		if (is_device_private_page(newpage) ||
+		    is_device_coherent_page(newpage)) {
 			/*
-			 * For now only support private anonymous when migrating
-			 * to un-addressable device memory.
+			 * For now only support anonymous memory migrating to
+			 * device private or coherent memory.
 			 */
 			if (mapping) {
 				migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
diff --git a/mm/rmap.c b/mm/rmap.c
index 83172ee0ea35..0532fd92ecb3 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1953,7 +1953,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 		/* Update high watermark before we lower rss */
 		update_hiwater_rss(mm);
 
-		if (folio_is_zone_device(folio)) {
+		if (folio_is_device_private(folio)) {
 			unsigned long pfn = folio_pfn(folio);
 			swp_entry_t entry;
 			pte_t swp_pte;
@@ -2124,7 +2124,8 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags)
 					TTU_SYNC)))
 		return;
 
-	if (folio_is_zone_device(folio) && !folio_is_device_private(folio))
+	if (folio_is_zone_device(folio) &&
+	    (!folio_is_device_private(folio) && !folio_is_device_coherent(folio)))
 		return;
 
 	/*
-- 
cgit 


From dd19e6d8ffaa1289d75d7833de97faf1b6b2c8e4 Mon Sep 17 00:00:00 2001
From: Alex Sierra <alex.sierra@amd.com>
Date: Fri, 15 Jul 2022 10:05:12 -0500
Subject: mm: add device coherent vma selection for memory migration

This case is used to migrate pages from device memory, back to system
memory.  Device coherent type memory is cache coherent from device and CPU
point of view.

Link: https://lkml.kernel.org/r/20220715150521.18165-6-alex.sierra@amd.com
Signed-off-by: Alex Sierra <alex.sierra@amd.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Alistair Poppple <apopple@nvidia.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/migrate.h |  1 +
 mm/migrate_device.c     | 12 +++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 069a89e847f3..b84908debe5c 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -148,6 +148,7 @@ static inline unsigned long migrate_pfn(unsigned long pfn)
 enum migrate_vma_direction {
 	MIGRATE_VMA_SELECT_SYSTEM = 1 << 0,
 	MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1,
+	MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2,
 };
 
 struct migrate_vma {
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index a4847ad65da3..18bc6483f63a 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -148,15 +148,21 @@ again:
 			if (is_writable_device_private_entry(entry))
 				mpfn |= MIGRATE_PFN_WRITE;
 		} else {
-			if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
-				goto next;
 			pfn = pte_pfn(pte);
-			if (is_zero_pfn(pfn)) {
+			if (is_zero_pfn(pfn) &&
+			    (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
 				mpfn = MIGRATE_PFN_MIGRATE;
 				migrate->cpages++;
 				goto next;
 			}
 			page = vm_normal_page(migrate->vma, addr, pte);
+			if (page && !is_zone_device_page(page) &&
+			    !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
+				goto next;
+			else if (page && is_device_coherent_page(page) &&
+			    (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
+			     page->pgmap->owner != migrate->pgmap_owner))
+				goto next;
 			mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
 			mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
 		}
-- 
cgit 


From 8012b866085523758780850087102421dbcce522 Mon Sep 17 00:00:00 2001
From: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Date: Fri, 3 Jun 2022 13:37:25 +0800
Subject: dax: introduce holder for dax_device

Patch series "v14 fsdax-rmap + v11 fsdax-reflink", v2.

The patchset fsdax-rmap is aimed to support shared pages tracking for
fsdax.

It moves owner tracking from dax_assocaite_entry() to pmem device driver,
by introducing an interface ->memory_failure() for struct pagemap.  This
interface is called by memory_failure() in mm, and implemented by pmem
device.

Then call holder operations to find the filesystem which the corrupted
data located in, and call filesystem handler to track files or metadata
associated with this page.

Finally we are able to try to fix the corrupted data in filesystem and do
other necessary processing, such as killing processes who are using the
files affected.

The call trace is like this:
memory_failure()
|* fsdax case
|------------
|pgmap->ops->memory_failure()      => pmem_pgmap_memory_failure()
| dax_holder_notify_failure()      =>
|  dax_device->holder_ops->notify_failure() =>
|                                     - xfs_dax_notify_failure()
|  |* xfs_dax_notify_failure()
|  |--------------------------
|  |   xfs_rmap_query_range()
|  |    xfs_dax_failure_fn()
|  |    * corrupted on metadata
|  |       try to recover data, call xfs_force_shutdown()
|  |    * corrupted on file data
|  |       try to recover data, call mf_dax_kill_procs()
|* normal case
|-------------
|mf_generic_kill_procs()


The patchset fsdax-reflink attempts to add CoW support for fsdax, and
takes XFS, which has both reflink and fsdax features, as an example.

One of the key mechanisms needed to be implemented in fsdax is CoW.  Copy
the data from srcmap before we actually write data to the destination
iomap.  And we just copy range in which data won't be changed.

Another mechanism is range comparison.  In page cache case, readpage() is
used to load data on disk to page cache in order to be able to compare
data.  In fsdax case, readpage() does not work.  So, we need another
compare data with direct access support.

With the two mechanisms implemented in fsdax, we are able to make reflink
and fsdax work together in XFS.


This patch (of 14):

To easily track filesystem from a pmem device, we introduce a holder for
dax_device structure, and also its operation.  This holder is used to
remember who is using this dax_device:

 - When it is the backend of a filesystem, the holder will be the
   instance of this filesystem.
 - When this pmem device is one of the targets in a mapped device, the
   holder will be this mapped device.  In this case, the mapped device
   has its own dax_device and it will follow the first rule.  So that we
   can finally track to the filesystem we needed.

The holder and holder_ops will be set when filesystem is being mounted,
or an target device is being activated.

Link: https://lkml.kernel.org/r/20220603053738.1218681-1-ruansy.fnst@fujitsu.com
Link: https://lkml.kernel.org/r/20220603053738.1218681-2-ruansy.fnst@fujitsu.com
Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dan Williams <dan.j.wiliams@intel.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Goldwyn Rodrigues <rgoldwyn@suse.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Goldwyn Rodrigues <rgoldwyn@suse.com>
Cc: Ritesh Harjani <riteshh@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/dax/super.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/md/dm.c     |  2 +-
 fs/erofs/super.c    | 10 ++++----
 fs/ext2/super.c     |  7 +++---
 fs/ext4/super.c     |  9 +++----
 fs/xfs/xfs_buf.c    |  5 ++--
 include/linux/dax.h | 33 +++++++++++++++++++-------
 7 files changed, 110 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 50a08b2ec247..9b5e2a5eb0ae 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -22,6 +22,8 @@
  * @private: dax driver private data
  * @flags: state and boolean properties
  * @ops: operations for this device
+ * @holder_data: holder of a dax_device: could be filesystem or mapped device
+ * @holder_ops: operations for the inner holder
  */
 struct dax_device {
 	struct inode inode;
@@ -29,6 +31,8 @@ struct dax_device {
 	void *private;
 	unsigned long flags;
 	const struct dax_operations *ops;
+	void *holder_data;
+	const struct dax_holder_operations *holder_ops;
 };
 
 static dev_t dax_devt;
@@ -71,8 +75,11 @@ EXPORT_SYMBOL_GPL(dax_remove_host);
  * fs_dax_get_by_bdev() - temporary lookup mechanism for filesystem-dax
  * @bdev: block device to find a dax_device for
  * @start_off: returns the byte offset into the dax_device that @bdev starts
+ * @holder: filesystem or mapped device inside the dax_device
+ * @ops: operations for the inner holder
  */
-struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off)
+struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off,
+		void *holder, const struct dax_holder_operations *ops)
 {
 	struct dax_device *dax_dev;
 	u64 part_size;
@@ -92,11 +99,26 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off)
 	dax_dev = xa_load(&dax_hosts, (unsigned long)bdev->bd_disk);
 	if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode))
 		dax_dev = NULL;
+	else if (holder) {
+		if (!cmpxchg(&dax_dev->holder_data, NULL, holder))
+			dax_dev->holder_ops = ops;
+		else
+			dax_dev = NULL;
+	}
 	dax_read_unlock(id);
 
 	return dax_dev;
 }
 EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
+
+void fs_put_dax(struct dax_device *dax_dev, void *holder)
+{
+	if (dax_dev && holder &&
+	    cmpxchg(&dax_dev->holder_data, holder, NULL) == holder)
+		dax_dev->holder_ops = NULL;
+	put_dax(dax_dev);
+}
+EXPORT_SYMBOL_GPL(fs_put_dax);
 #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */
 
 enum dax_device_flags {
@@ -204,6 +226,29 @@ size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
 }
 EXPORT_SYMBOL_GPL(dax_recovery_write);
 
+int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off,
+			      u64 len, int mf_flags)
+{
+	int rc, id;
+
+	id = dax_read_lock();
+	if (!dax_alive(dax_dev)) {
+		rc = -ENXIO;
+		goto out;
+	}
+
+	if (!dax_dev->holder_ops) {
+		rc = -EOPNOTSUPP;
+		goto out;
+	}
+
+	rc = dax_dev->holder_ops->notify_failure(dax_dev, off, len, mf_flags);
+out:
+	dax_read_unlock(id);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(dax_holder_notify_failure);
+
 #ifdef CONFIG_ARCH_HAS_PMEM_API
 void arch_wb_cache_pmem(void *addr, size_t size);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
@@ -277,8 +322,15 @@ void kill_dax(struct dax_device *dax_dev)
 	if (!dax_dev)
 		return;
 
+	if (dax_dev->holder_data != NULL)
+		dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);
+
 	clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
 	synchronize_srcu(&dax_srcu);
+
+	/* clear holder data */
+	dax_dev->holder_ops = NULL;
+	dax_dev->holder_data = NULL;
 }
 EXPORT_SYMBOL_GPL(kill_dax);
 
@@ -420,6 +472,19 @@ void put_dax(struct dax_device *dax_dev)
 }
 EXPORT_SYMBOL_GPL(put_dax);
 
+/**
+ * dax_holder() - obtain the holder of a dax device
+ * @dax_dev: a dax_device instance
+
+ * Return: the holder's data which represents the holder if registered,
+ * otherwize NULL.
+ */
+void *dax_holder(struct dax_device *dax_dev)
+{
+	return dax_dev->holder_data;
+}
+EXPORT_SYMBOL_GPL(dax_holder);
+
 /**
  * inode_dax: convert a public inode into its dax_dev
  * @inode: An inode with i_cdev pointing to a dax_dev
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 2b75f1ef7386..0177a4ce9a18 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -758,7 +758,7 @@ static int open_table_device(struct table_device *td, dev_t dev,
 	}
 
 	td->dm_dev.bdev = bdev;
-	td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off);
+	td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL);
 	return 0;
 }
 
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 95addc5c9d34..3173debeaa5a 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -255,7 +255,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
 		if (IS_ERR(bdev))
 			return PTR_ERR(bdev);
 		dif->bdev = bdev;
-		dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off);
+		dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off,
+						  NULL, NULL);
 	}
 
 	dif->blocks = le32_to_cpu(dis->blocks);
@@ -720,7 +721,8 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
 		}
 
 		sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev,
-						  &sbi->dax_part_off);
+						  &sbi->dax_part_off,
+						  NULL, NULL);
 	}
 
 	err = erofs_read_superblock(sb);
@@ -812,7 +814,7 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
 {
 	struct erofs_device_info *dif = ptr;
 
-	fs_put_dax(dif->dax_dev);
+	fs_put_dax(dif->dax_dev, NULL);
 	if (dif->bdev)
 		blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL);
 	erofs_fscache_unregister_cookie(&dif->fscache);
@@ -886,7 +888,7 @@ static void erofs_kill_sb(struct super_block *sb)
 		return;
 
 	erofs_free_dev_context(sbi->devs);
-	fs_put_dax(sbi->dax_dev);
+	fs_put_dax(sbi->dax_dev, NULL);
 	erofs_fscache_unregister_cookie(&sbi->s_fscache);
 	erofs_fscache_unregister_fs(sb);
 	kfree(sbi->opt.fsid);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index f6a19f6d9f6d..4638946251b9 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -171,7 +171,7 @@ static void ext2_put_super (struct super_block * sb)
 	brelse (sbi->s_sbh);
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
-	fs_put_dax(sbi->s_daxdev);
+	fs_put_dax(sbi->s_daxdev, NULL);
 	kfree(sbi);
 }
 
@@ -835,7 +835,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	sb->s_fs_info = sbi;
 	sbi->s_sb_block = sb_block;
-	sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off);
+	sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
+					   NULL, NULL);
 
 	spin_lock_init(&sbi->s_lock);
 	ret = -EINVAL;
@@ -1204,7 +1205,7 @@ failed_mount_group_desc:
 failed_mount:
 	brelse(bh);
 failed_sbi:
-	fs_put_dax(sbi->s_daxdev);
+	fs_put_dax(sbi->s_daxdev, NULL);
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 845f2f8aee5f..1f8bf507ba5a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1307,7 +1307,7 @@ static void ext4_put_super(struct super_block *sb)
 	if (sbi->s_chksum_driver)
 		crypto_free_shash(sbi->s_chksum_driver);
 	kfree(sbi->s_blockgroup_lock);
-	fs_put_dax(sbi->s_daxdev);
+	fs_put_dax(sbi->s_daxdev, NULL);
 	fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
 #if IS_ENABLED(CONFIG_UNICODE)
 	utf8_unload(sb->s_encoding);
@@ -4272,7 +4272,7 @@ static void ext4_free_sbi(struct ext4_sb_info *sbi)
 		return;
 
 	kfree(sbi->s_blockgroup_lock);
-	fs_put_dax(sbi->s_daxdev);
+	fs_put_dax(sbi->s_daxdev, NULL);
 	kfree(sbi);
 }
 
@@ -4284,7 +4284,8 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
 	if (!sbi)
 		return NULL;
 
-	sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off);
+	sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
+					   NULL, NULL);
 
 	sbi->s_blockgroup_lock =
 		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
@@ -4296,7 +4297,7 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
 	sbi->s_sb = sb;
 	return sbi;
 err_out:
-	fs_put_dax(sbi->s_daxdev);
+	fs_put_dax(sbi->s_daxdev, NULL);
 	kfree(sbi);
 	return NULL;
 }
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 4aa9c9cf5b6e..1ec2a7b6d44e 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1911,7 +1911,7 @@ xfs_free_buftarg(
 	list_lru_destroy(&btp->bt_lru);
 
 	blkdev_issue_flush(btp->bt_bdev);
-	fs_put_dax(btp->bt_daxdev);
+	fs_put_dax(btp->bt_daxdev, NULL);
 
 	kmem_free(btp);
 }
@@ -1964,7 +1964,8 @@ xfs_alloc_buftarg(
 	btp->bt_mount = mp;
 	btp->bt_dev =  bdev->bd_dev;
 	btp->bt_bdev = bdev;
-	btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off);
+	btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off, NULL,
+					    NULL);
 
 	/*
 	 * Buffer IO error rate limiting. Limit it to no more than 10 messages
diff --git a/include/linux/dax.h b/include/linux/dax.h
index e7b81634c52a..cf85fc36da5f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -43,8 +43,21 @@ struct dax_operations {
 			void *addr, size_t bytes, struct iov_iter *iter);
 };
 
+struct dax_holder_operations {
+	/*
+	 * notify_failure - notify memory failure into inner holder device
+	 * @dax_dev: the dax device which contains the holder
+	 * @offset: offset on this dax device where memory failure occurs
+	 * @len: length of this memory failure event
+	 * @flags: action flags for memory failure handler
+	 */
+	int (*notify_failure)(struct dax_device *dax_dev, u64 offset,
+			u64 len, int mf_flags);
+};
+
 #if IS_ENABLED(CONFIG_DAX)
 struct dax_device *alloc_dax(void *private, const struct dax_operations *ops);
+void *dax_holder(struct dax_device *dax_dev);
 void put_dax(struct dax_device *dax_dev);
 void kill_dax(struct dax_device *dax_dev);
 void dax_write_cache(struct dax_device *dax_dev, bool wc);
@@ -66,6 +79,10 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
 	return dax_synchronous(dax_dev);
 }
 #else
+static inline void *dax_holder(struct dax_device *dax_dev)
+{
+	return NULL;
+}
 static inline struct dax_device *alloc_dax(void *private,
 		const struct dax_operations *ops)
 {
@@ -114,12 +131,9 @@ struct writeback_control;
 #if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
 int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
 void dax_remove_host(struct gendisk *disk);
-struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
-		u64 *start_off);
-static inline void fs_put_dax(struct dax_device *dax_dev)
-{
-	put_dax(dax_dev);
-}
+struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off,
+		void *holder, const struct dax_holder_operations *ops);
+void fs_put_dax(struct dax_device *dax_dev, void *holder);
 #else
 static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
 {
@@ -129,11 +143,12 @@ static inline void dax_remove_host(struct gendisk *disk)
 {
 }
 static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
-		u64 *start_off)
+		u64 *start_off, void *holder,
+		const struct dax_holder_operations *ops)
 {
 	return NULL;
 }
-static inline void fs_put_dax(struct dax_device *dax_dev)
+static inline void fs_put_dax(struct dax_device *dax_dev, void *holder)
 {
 }
 #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */
@@ -203,6 +218,8 @@ size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
 		size_t bytes, struct iov_iter *i);
 int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
 			size_t nr_pages);
+int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, u64 len,
+		int mf_flags);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
-- 
cgit 


From 33a8f7f2b3a3437d016d1b4047a4fd37eb6951b3 Mon Sep 17 00:00:00 2001
From: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Date: Fri, 3 Jun 2022 13:37:27 +0800
Subject: pagemap,pmem: introduce ->memory_failure()

When memory-failure occurs, we call this function which is implemented by
each kind of devices.  For the fsdax case, pmem device driver implements
it.  Pmem device driver will find out the filesystem in which the
corrupted page located in.

With dax_holder notify support, we are able to notify the memory failure
from pmem driver to upper layers.  If there is something not support in
the notify routine, memory_failure will fall back to the generic hanlder.

Link: https://lkml.kernel.org/r/20220603053738.1218681-4-ruansy.fnst@fujitsu.com
Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dan Williams <dan.j.wiliams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Goldwyn Rodrigues <rgoldwyn@suse.com>
Cc: Goldwyn Rodrigues <rgoldwyn@suse.de>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Ritesh Harjani <riteshh@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/nvdimm/pmem.c    | 17 +++++++++++++++++
 include/linux/memremap.h | 12 ++++++++++++
 mm/memory-failure.c      | 14 ++++++++++++++
 3 files changed, 43 insertions(+)

(limited to 'include')

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 629d10fcf53b..107c9cb3d57d 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -453,6 +453,21 @@ static void pmem_release_disk(void *__pmem)
 	blk_cleanup_disk(pmem->disk);
 }
 
+static int pmem_pagemap_memory_failure(struct dev_pagemap *pgmap,
+		unsigned long pfn, unsigned long nr_pages, int mf_flags)
+{
+	struct pmem_device *pmem =
+			container_of(pgmap, struct pmem_device, pgmap);
+	u64 offset = PFN_PHYS(pfn) - pmem->phys_addr - pmem->data_offset;
+	u64 len = nr_pages << PAGE_SHIFT;
+
+	return dax_holder_notify_failure(pmem->dax_dev, offset, len, mf_flags);
+}
+
+static const struct dev_pagemap_ops fsdax_pagemap_ops = {
+	.memory_failure		= pmem_pagemap_memory_failure,
+};
+
 static int pmem_attach_disk(struct device *dev,
 		struct nd_namespace_common *ndns)
 {
@@ -514,6 +529,7 @@ static int pmem_attach_disk(struct device *dev,
 	pmem->pfn_flags = PFN_DEV;
 	if (is_nd_pfn(dev)) {
 		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
+		pmem->pgmap.ops = &fsdax_pagemap_ops;
 		addr = devm_memremap_pages(dev, &pmem->pgmap);
 		pfn_sb = nd_pfn->pfn_sb;
 		pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
@@ -527,6 +543,7 @@ static int pmem_attach_disk(struct device *dev,
 		pmem->pgmap.range.end = res->end;
 		pmem->pgmap.nr_range = 1;
 		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
+		pmem->pgmap.ops = &fsdax_pagemap_ops;
 		addr = devm_memremap_pages(dev, &pmem->pgmap);
 		pmem->pfn_flags |= PFN_MAP;
 		bb_range = pmem->pgmap.range;
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 09320b7f706c..19010491a603 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -87,6 +87,18 @@ struct dev_pagemap_ops {
 	 * the page back to a CPU accessible page.
 	 */
 	vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);
+
+	/*
+	 * Handle the memory failure happens on a range of pfns.  Notify the
+	 * processes who are using these pfns, and try to recover the data on
+	 * them if necessary.  The mf_flags is finally passed to the recover
+	 * function through the whole notify routine.
+	 *
+	 * When this is not implemented, or it returns -EOPNOTSUPP, the caller
+	 * will fall back to a common handler called mf_generic_kill_procs().
+	 */
+	int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn,
+			      unsigned long nr_pages, int mf_flags);
 };
 
 #define PGMAP_ALTMAP_VALID	(1 << 0)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index f8a8a5d45eba..46c77151f726 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1748,6 +1748,20 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 	if (!pgmap_pfn_valid(pgmap, pfn))
 		goto out;
 
+	/*
+	 * Call driver's implementation to handle the memory failure, otherwise
+	 * fall back to generic handler.
+	 */
+	if (pgmap->ops->memory_failure) {
+		rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
+		/*
+		 * Fall back to generic handler too if operation is not
+		 * supported inside the driver/device/filesystem.
+		 */
+		if (rc != -EOPNOTSUPP)
+			goto out;
+	}
+
 	rc = mf_generic_kill_procs(pfn, flags, pgmap);
 out:
 	/* drop pgmap ref acquired in caller */
-- 
cgit 


From 2f437effc689ef913fbe5e31110580b4e7cf04be Mon Sep 17 00:00:00 2001
From: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Date: Fri, 3 Jun 2022 13:37:28 +0800
Subject: fsdax: introduce dax_lock_mapping_entry()

The current dax_lock_page() locks dax entry by obtaining mapping and index
in page.  To support 1-to-N RMAP in NVDIMM, we need a new function to lock
a specific dax entry corresponding to this file's mapping,index.  And
output the page corresponding to the specific dax entry for caller use.

Link: https://lkml.kernel.org/r/20220603053738.1218681-5-ruansy.fnst@fujitsu.com
Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dan Williams <dan.j.wiliams@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Goldwyn Rodrigues <rgoldwyn@suse.com>
Cc: Goldwyn Rodrigues <rgoldwyn@suse.de>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Ritesh Harjani <riteshh@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/dax.c            | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/dax.h | 15 +++++++++++++
 2 files changed, 78 insertions(+)

(limited to 'include')

diff --git a/fs/dax.c b/fs/dax.c
index 4155a6107fa1..65e44d78b3bb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -455,6 +455,69 @@ void dax_unlock_page(struct page *page, dax_entry_t cookie)
 	dax_unlock_entry(&xas, (void *)cookie);
 }
 
+/*
+ * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
+ * @mapping: the file's mapping whose entry we want to lock
+ * @index: the offset within this file
+ * @page: output the dax page corresponding to this dax entry
+ *
+ * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
+ * could not be locked.
+ */
+dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
+		struct page **page)
+{
+	XA_STATE(xas, NULL, 0);
+	void *entry;
+
+	rcu_read_lock();
+	for (;;) {
+		entry = NULL;
+		if (!dax_mapping(mapping))
+			break;
+
+		xas.xa = &mapping->i_pages;
+		xas_lock_irq(&xas);
+		xas_set(&xas, index);
+		entry = xas_load(&xas);
+		if (dax_is_locked(entry)) {
+			rcu_read_unlock();
+			wait_entry_unlocked(&xas, entry);
+			rcu_read_lock();
+			continue;
+		}
+		if (!entry ||
+		    dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+			/*
+			 * Because we are looking for entry from file's mapping
+			 * and index, so the entry may not be inserted for now,
+			 * or even a zero/empty entry.  We don't think this is
+			 * an error case.  So, return a special value and do
+			 * not output @page.
+			 */
+			entry = (void *)~0UL;
+		} else {
+			*page = pfn_to_page(dax_to_pfn(entry));
+			dax_lock_entry(&xas, entry);
+		}
+		xas_unlock_irq(&xas);
+		break;
+	}
+	rcu_read_unlock();
+	return (dax_entry_t)entry;
+}
+
+void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
+		dax_entry_t cookie)
+{
+	XA_STATE(xas, &mapping->i_pages, index);
+
+	if (cookie == ~0UL)
+		return;
+
+	dax_unlock_entry(&xas, (void *)cookie);
+}
+
 /*
  * Find page cache entry at given index. If it is a DAX entry, return it
  * with the entry locked. If the page cache doesn't contain an entry at
diff --git a/include/linux/dax.h b/include/linux/dax.h
index cf85fc36da5f..7116681b48c0 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -161,6 +161,10 @@ struct page *dax_layout_busy_page(struct address_space *mapping);
 struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
 dax_entry_t dax_lock_page(struct page *page);
 void dax_unlock_page(struct page *page, dax_entry_t cookie);
+dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
+		unsigned long index, struct page **page);
+void dax_unlock_mapping_entry(struct address_space *mapping,
+		unsigned long index, dax_entry_t cookie);
 #else
 static inline struct page *dax_layout_busy_page(struct address_space *mapping)
 {
@@ -188,6 +192,17 @@ static inline dax_entry_t dax_lock_page(struct page *page)
 static inline void dax_unlock_page(struct page *page, dax_entry_t cookie)
 {
 }
+
+static inline dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
+		unsigned long index, struct page **page)
+{
+	return 0;
+}
+
+static inline void dax_unlock_mapping_entry(struct address_space *mapping,
+		unsigned long index, dax_entry_t cookie)
+{
+}
 #endif
 
 int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
-- 
cgit 


From c36e2024957120566efd99395b5c8cc95b5175c1 Mon Sep 17 00:00:00 2001
From: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Date: Fri, 3 Jun 2022 13:37:29 +0800
Subject: mm: introduce mf_dax_kill_procs() for fsdax case

This new function is a variant of mf_generic_kill_procs that accepts a
file, offset pair instead of a struct to support multiple files sharing a
DAX mapping.  It is intended to be called by the file systems as part of
the memory_failure handler after the file system performed a reverse
mapping from the storage address to the file and file offset.

Link: https://lkml.kernel.org/r/20220603053738.1218681-6-ruansy.fnst@fujitsu.com
Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dan Williams <dan.j.wiliams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Goldwyn Rodrigues <rgoldwyn@suse.com>
Cc: Goldwyn Rodrigues <rgoldwyn@suse.de>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Ritesh Harjani <riteshh@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h  |  2 ++
 mm/memory-failure.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 88 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 64393ed3330a..d4ebfc206e2b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3178,6 +3178,8 @@ enum mf_flags {
 	MF_UNPOISON = 1 << 4,
 	MF_SW_SIMULATED = 1 << 5,
 };
+int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
+		      unsigned long count, int mf_flags);
 extern int memory_failure(unsigned long pfn, int flags);
 extern void memory_failure_queue(unsigned long pfn, int flags);
 extern void memory_failure_queue_kick(int cpu);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 46c77151f726..c9931c676335 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -297,10 +297,9 @@ void shake_page(struct page *p)
 }
 EXPORT_SYMBOL_GPL(shake_page);
 
-static unsigned long dev_pagemap_mapping_shift(struct page *page,
-		struct vm_area_struct *vma)
+static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
+		unsigned long address)
 {
-	unsigned long address = vma_address(page, vma);
 	unsigned long ret = 0;
 	pgd_t *pgd;
 	p4d_t *p4d;
@@ -340,10 +339,14 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
 /*
  * Schedule a process for later kill.
  * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
+ *
+ * Notice: @fsdax_pgoff is used only when @p is a fsdax page.
+ *   In other cases, such as anonymous and file-backend page, the address to be
+ *   killed can be caculated by @p itself.
  */
 static void add_to_kill(struct task_struct *tsk, struct page *p,
-		       struct vm_area_struct *vma,
-		       struct list_head *to_kill)
+			pgoff_t fsdax_pgoff, struct vm_area_struct *vma,
+			struct list_head *to_kill)
 {
 	struct to_kill *tk;
 
@@ -354,9 +357,15 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
 	}
 
 	tk->addr = page_address_in_vma(p, vma);
-	if (is_zone_device_page(p))
-		tk->size_shift = dev_pagemap_mapping_shift(p, vma);
-	else
+	if (is_zone_device_page(p)) {
+		/*
+		 * Since page->mapping is not used for fsdax, we need
+		 * calculate the address based on the vma.
+		 */
+		if (p->pgmap->type == MEMORY_DEVICE_FS_DAX)
+			tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
+		tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
+	} else
 		tk->size_shift = page_shift(compound_head(p));
 
 	/*
@@ -505,7 +514,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 			if (!page_mapped_in_vma(page, vma))
 				continue;
 			if (vma->vm_mm == t->mm)
-				add_to_kill(t, page, vma, to_kill);
+				add_to_kill(t, page, 0, vma, to_kill);
 		}
 	}
 	read_unlock(&tasklist_lock);
@@ -541,13 +550,41 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 			 * to be informed of all such data corruptions.
 			 */
 			if (vma->vm_mm == t->mm)
-				add_to_kill(t, page, vma, to_kill);
+				add_to_kill(t, page, 0, vma, to_kill);
 		}
 	}
 	read_unlock(&tasklist_lock);
 	i_mmap_unlock_read(mapping);
 }
 
+#ifdef CONFIG_FS_DAX
+/*
+ * Collect processes when the error hit a fsdax page.
+ */
+static void collect_procs_fsdax(struct page *page,
+		struct address_space *mapping, pgoff_t pgoff,
+		struct list_head *to_kill)
+{
+	struct vm_area_struct *vma;
+	struct task_struct *tsk;
+
+	i_mmap_lock_read(mapping);
+	read_lock(&tasklist_lock);
+	for_each_process(tsk) {
+		struct task_struct *t = task_early_kill(tsk, true);
+
+		if (!t)
+			continue;
+		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+			if (vma->vm_mm == t->mm)
+				add_to_kill(t, page, pgoff, vma, to_kill);
+		}
+	}
+	read_unlock(&tasklist_lock);
+	i_mmap_unlock_read(mapping);
+}
+#endif /* CONFIG_FS_DAX */
+
 /*
  * Collect the processes who have the corrupted page mapped to kill.
  */
@@ -1588,6 +1625,45 @@ unlock:
 	return rc;
 }
 
+#ifdef CONFIG_FS_DAX
+/**
+ * mf_dax_kill_procs - Collect and kill processes who are using this file range
+ * @mapping:	address_space of the file in use
+ * @index:	start pgoff of the range within the file
+ * @count:	length of the range, in unit of PAGE_SIZE
+ * @mf_flags:	memory failure flags
+ */
+int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
+		unsigned long count, int mf_flags)
+{
+	LIST_HEAD(to_kill);
+	dax_entry_t cookie;
+	struct page *page;
+	size_t end = index + count;
+
+	mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+
+	for (; index < end; index++) {
+		page = NULL;
+		cookie = dax_lock_mapping_entry(mapping, index, &page);
+		if (!cookie)
+			return -EBUSY;
+		if (!page)
+			goto unlock;
+
+		SetPageHWPoison(page);
+
+		collect_procs_fsdax(page, mapping, index, &to_kill);
+		unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
+				index, mf_flags);
+unlock:
+		dax_unlock_mapping_entry(mapping, index, cookie);
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
+#endif /* CONFIG_FS_DAX */
+
 /*
  * Called from hugetlb code with hugetlb_lock held.
  *
-- 
cgit 


From 6061b69b9a550a2ab84e805d0d2315ba6215f112 Mon Sep 17 00:00:00 2001
From: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Date: Fri, 3 Jun 2022 13:37:31 +0800
Subject: fsdax: set a CoW flag when associate reflink mappings

Introduce a PAGE_MAPPING_DAX_COW flag to support association with CoW file
mappings.  In this case, since the dax-rmap has already took the
responsibility to look up for shared files by given dax page, the
page->mapping is no longer to used for rmap but for marking that this dax
page is shared.  And to make sure disassociation works fine, we use
page->index as refcount, and clear page->mapping to the initial state when
page->index is decreased to 0.

With the help of this new flag, it is able to distinguish normal case and
CoW case, and keep the warning in normal case.

Link: https://lkml.kernel.org/r/20220603053738.1218681-8-ruansy.fnst@fujitsu.com
Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dan Williams <dan.j.wiliams@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Goldwyn Rodrigues <rgoldwyn@suse.com>
Cc: Goldwyn Rodrigues <rgoldwyn@suse.de>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Ritesh Harjani <riteshh@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/dax.c                   | 50 +++++++++++++++++++++++++++++++++++++---------
 include/linux/page-flags.h |  6 ++++++
 2 files changed, 47 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/fs/dax.c b/fs/dax.c
index 65e44d78b3bb..b59b864017ad 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -334,13 +334,35 @@ static unsigned long dax_end_pfn(void *entry)
 	for (pfn = dax_to_pfn(entry); \
 			pfn < dax_end_pfn(entry); pfn++)
 
+static inline bool dax_mapping_is_cow(struct address_space *mapping)
+{
+	return (unsigned long)mapping == PAGE_MAPPING_DAX_COW;
+}
+
 /*
- * TODO: for reflink+dax we need a way to associate a single page with
- * multiple address_space instances at different linear_page_index()
- * offsets.
+ * Set the page->mapping with FS_DAX_MAPPING_COW flag, increase the refcount.
+ */
+static inline void dax_mapping_set_cow(struct page *page)
+{
+	if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) {
+		/*
+		 * Reset the index if the page was already mapped
+		 * regularly before.
+		 */
+		if (page->mapping)
+			page->index = 1;
+		page->mapping = (void *)PAGE_MAPPING_DAX_COW;
+	}
+	page->index++;
+}
+
+/*
+ * When it is called in dax_insert_entry(), the cow flag will indicate that
+ * whether this entry is shared by multiple files.  If so, set the page->mapping
+ * FS_DAX_MAPPING_COW, and use page->index as refcount.
  */
 static void dax_associate_entry(void *entry, struct address_space *mapping,
-		struct vm_area_struct *vma, unsigned long address)
+		struct vm_area_struct *vma, unsigned long address, bool cow)
 {
 	unsigned long size = dax_entry_size(entry), pfn, index;
 	int i = 0;
@@ -352,9 +374,13 @@ static void dax_associate_entry(void *entry, struct address_space *mapping,
 	for_each_mapped_pfn(entry, pfn) {
 		struct page *page = pfn_to_page(pfn);
 
-		WARN_ON_ONCE(page->mapping);
-		page->mapping = mapping;
-		page->index = index + i++;
+		if (cow) {
+			dax_mapping_set_cow(page);
+		} else {
+			WARN_ON_ONCE(page->mapping);
+			page->mapping = mapping;
+			page->index = index + i++;
+		}
 	}
 }
 
@@ -370,7 +396,12 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
 		struct page *page = pfn_to_page(pfn);
 
 		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
-		WARN_ON_ONCE(page->mapping && page->mapping != mapping);
+		if (dax_mapping_is_cow(page->mapping)) {
+			/* keep the CoW flag if this page is still shared */
+			if (page->index-- > 0)
+				continue;
+		} else
+			WARN_ON_ONCE(page->mapping && page->mapping != mapping);
 		page->mapping = NULL;
 		page->index = 0;
 	}
@@ -830,7 +861,8 @@ static void *dax_insert_entry(struct xa_state *xas,
 		void *old;
 
 		dax_disassociate_entry(entry, mapping, false);
-		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
+		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
+				false);
 		/*
 		 * Only swap our new entry into the page cache if the current
 		 * entry is a zero page or an empty entry.  If a normal PTE or
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 82719d33c0f1..f2ff65f1bf83 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -661,6 +661,12 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted)
 #define PAGE_MAPPING_KSM	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
 #define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
 
+/*
+ * Different with flags above, this flag is used only for fsdax mode.  It
+ * indicates that this page->mapping is now under reflink case.
+ */
+#define PAGE_MAPPING_DAX_COW	0x1
+
 static __always_inline bool folio_mapping_flags(struct folio *folio)
 {
 	return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0;
-- 
cgit 


From 6f7db3894ae23eb5d40af4efb404aa0c072a68d2 Mon Sep 17 00:00:00 2001
From: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Date: Fri, 3 Jun 2022 13:37:36 +0800
Subject: fsdax: dedup file range to use a compare function

With dax we cannot deal with readpage() etc.  So, we create a dax
comparison function which is similar with vfs_dedupe_file_range_compare().
And introduce dax_remap_file_range_prep() for filesystem use.

Link: https://lkml.kernel.org/r/20220603053738.1218681-13-ruansy.fnst@fujitsu.com
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dan Williams <dan.j.wiliams@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Goldwyn Rodrigues <rgoldwyn@suse.de>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Ritesh Harjani <riteshh@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/dax.c             | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/remap_range.c     | 31 ++++++++++++++++----
 fs/xfs/xfs_reflink.c |  8 +++--
 include/linux/dax.h  |  8 +++++
 include/linux/fs.h   | 12 +++++---
 5 files changed, 130 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/fs/dax.c b/fs/dax.c
index 0aab32300531..e0f9c4a0a0c1 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1873,3 +1873,85 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
 	return dax_insert_pfn_mkwrite(vmf, pfn, order);
 }
 EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
+
+static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
+		struct iomap_iter *it_dest, u64 len, bool *same)
+{
+	const struct iomap *smap = &it_src->iomap;
+	const struct iomap *dmap = &it_dest->iomap;
+	loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
+	void *saddr, *daddr;
+	int id, ret;
+
+	len = min(len, min(smap->length, dmap->length));
+
+	if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
+		*same = true;
+		return len;
+	}
+
+	if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
+		*same = false;
+		return 0;
+	}
+
+	id = dax_read_lock();
+	ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
+				      &saddr, NULL);
+	if (ret < 0)
+		goto out_unlock;
+
+	ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
+				      &daddr, NULL);
+	if (ret < 0)
+		goto out_unlock;
+
+	*same = !memcmp(saddr, daddr, len);
+	if (!*same)
+		len = 0;
+	dax_read_unlock(id);
+	return len;
+
+out_unlock:
+	dax_read_unlock(id);
+	return -EIO;
+}
+
+int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+		struct inode *dst, loff_t dstoff, loff_t len, bool *same,
+		const struct iomap_ops *ops)
+{
+	struct iomap_iter src_iter = {
+		.inode		= src,
+		.pos		= srcoff,
+		.len		= len,
+		.flags		= IOMAP_DAX,
+	};
+	struct iomap_iter dst_iter = {
+		.inode		= dst,
+		.pos		= dstoff,
+		.len		= len,
+		.flags		= IOMAP_DAX,
+	};
+	int ret;
+
+	while ((ret = iomap_iter(&src_iter, ops)) > 0) {
+		while ((ret = iomap_iter(&dst_iter, ops)) > 0) {
+			dst_iter.processed = dax_range_compare_iter(&src_iter,
+						&dst_iter, len, same);
+		}
+		if (ret <= 0)
+			src_iter.processed = ret;
+	}
+	return ret;
+}
+
+int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+			      struct file *file_out, loff_t pos_out,
+			      loff_t *len, unsigned int remap_flags,
+			      const struct iomap_ops *ops)
+{
+	return __generic_remap_file_range_prep(file_in, pos_in, file_out,
+					       pos_out, len, remap_flags, ops);
+}
+EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);
diff --git a/fs/remap_range.c b/fs/remap_range.c
index e112b5424cdb..231de627c1b9 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -14,6 +14,7 @@
 #include <linux/compat.h>
 #include <linux/mount.h>
 #include <linux/fs.h>
+#include <linux/dax.h>
 #include "internal.h"
 
 #include <linux/uaccess.h>
@@ -271,9 +272,11 @@ out_error:
  * If there's an error, then the usual negative error code is returned.
  * Otherwise returns 0 with *len set to the request length.
  */
-int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
-				  struct file *file_out, loff_t pos_out,
-				  loff_t *len, unsigned int remap_flags)
+int
+__generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+				struct file *file_out, loff_t pos_out,
+				loff_t *len, unsigned int remap_flags,
+				const struct iomap_ops *dax_read_ops)
 {
 	struct inode *inode_in = file_inode(file_in);
 	struct inode *inode_out = file_inode(file_out);
@@ -333,8 +336,18 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 	if (remap_flags & REMAP_FILE_DEDUP) {
 		bool		is_same = false;
 
-		ret = vfs_dedupe_file_range_compare(file_in, pos_in,
-				file_out, pos_out, *len, &is_same);
+		if (*len == 0)
+			return 0;
+
+		if (!IS_DAX(inode_in))
+			ret = vfs_dedupe_file_range_compare(file_in, pos_in,
+					file_out, pos_out, *len, &is_same);
+		else if (dax_read_ops)
+			ret = dax_dedupe_file_range_compare(inode_in, pos_in,
+					inode_out, pos_out, *len, &is_same,
+					dax_read_ops);
+		else
+			return -EINVAL;
 		if (ret)
 			return ret;
 		if (!is_same)
@@ -352,6 +365,14 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 
 	return ret;
 }
+
+int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+				  struct file *file_out, loff_t pos_out,
+				  loff_t *len, unsigned int remap_flags)
+{
+	return __generic_remap_file_range_prep(file_in, pos_in, file_out,
+					       pos_out, len, remap_flags, NULL);
+}
 EXPORT_SYMBOL(generic_remap_file_range_prep);
 
 loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index e7a7c00d93be..cbaf36d21020 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1367,8 +1367,12 @@ xfs_reflink_remap_prep(
 	if (IS_DAX(inode_in) || IS_DAX(inode_out))
 		goto out_unlock;
 
-	ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
-			len, remap_flags);
+	if (!IS_DAX(inode_in))
+		ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
+				pos_out, len, remap_flags);
+	else
+		ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
+				pos_out, len, remap_flags, &xfs_read_iomap_ops);
 	if (ret || *len == 0)
 		goto out_unlock;
 
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 7116681b48c0..ba985333e26b 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -246,6 +246,14 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
 				      pgoff_t index);
+int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+				  struct inode *dest, loff_t destoff,
+				  loff_t len, bool *is_same,
+				  const struct iomap_ops *ops);
+int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+			      struct file *file_out, loff_t pos_out,
+			      loff_t *len, unsigned int remap_flags,
+			      const struct iomap_ops *ops);
 static inline bool dax_mapping(struct address_space *mapping)
 {
 	return mapping->host && IS_DAX(mapping->host);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ad5e3520fae..134e9d7ad5d6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -74,6 +74,7 @@ struct fsverity_operations;
 struct fs_context;
 struct fs_parameter_spec;
 struct fileattr;
+struct iomap_ops;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
@@ -2070,10 +2071,13 @@ extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
 extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
 				       struct file *file_out, loff_t pos_out,
 				       size_t len, unsigned int flags);
-extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
-					 struct file *file_out, loff_t pos_out,
-					 loff_t *count,
-					 unsigned int remap_flags);
+int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+				    struct file *file_out, loff_t pos_out,
+				    loff_t *len, unsigned int remap_flags,
+				    const struct iomap_ops *dax_read_ops);
+int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+				  struct file *file_out, loff_t pos_out,
+				  loff_t *count, unsigned int remap_flags);
 extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
 				  struct file *file_out, loff_t pos_out,
 				  loff_t len, unsigned int remap_flags);
-- 
cgit 


From 4fa6893faeaaea4fe4440512d2a708527ef47051 Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301@gmail.com>
Date: Thu, 16 Jun 2022 10:48:35 -0700
Subject: mm: thp: consolidate vma size check to transhuge_vma_suitable

There are couple of places that check whether the vma size is ok for THP
or whether address fits, they are open coded and duplicate, use
transhuge_vma_suitable() to do the job by passing in (vma->end -
HPAGE_PMD_SIZE).

Move vma size check into hugepage_vma_check().  This will make
khugepaged_enter() is as same as khugepaged_enter_vma().  There is just
one caller for khugepaged_enter(), replace it to khugepaged_enter_vma()
and remove khugepaged_enter().

Link: https://lkml.kernel.org/r/20220616174840.1202070-3-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301@gmail.com>
Reviewed-by: Zach O'Keefe <zokeefe@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h    | 11 +++++++++++
 include/linux/khugepaged.h | 14 --------------
 mm/huge_memory.c           |  2 +-
 mm/khugepaged.c            | 19 ++++++-------------
 4 files changed, 18 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 648cb3ce7099..8a5a8bfce0f5 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -116,6 +116,17 @@ extern struct kobj_attribute shmem_enabled_attr;
 
 extern unsigned long transparent_hugepage_flags;
 
+/*
+ * Do the below checks:
+ *   - For file vma, check if the linear page offset of vma is
+ *     HPAGE_PMD_NR aligned within the file.  The hugepage is
+ *     guaranteed to be hugepage-aligned within the file, but we must
+ *     check that the PMD-aligned addresses in the VMA map to
+ *     PMD-aligned offsets within the file, else the hugepage will
+ *     not be PMD-mappable.
+ *   - For all vmas, check if the haddr is in an aligned HPAGE_PMD_SIZE
+ *     area.
+ */
 static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
 		unsigned long addr)
 {
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index 392d34c3c59a..31ca8a7f78f4 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -51,16 +51,6 @@ static inline void khugepaged_exit(struct mm_struct *mm)
 	if (test_bit(MMF_VM_HUGEPAGE, &mm->flags))
 		__khugepaged_exit(mm);
 }
-
-static inline void khugepaged_enter(struct vm_area_struct *vma,
-				   unsigned long vm_flags)
-{
-	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
-	    khugepaged_enabled()) {
-		if (hugepage_vma_check(vma, vm_flags))
-			__khugepaged_enter(vma->vm_mm);
-	}
-}
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 {
@@ -68,10 +58,6 @@ static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm
 static inline void khugepaged_exit(struct mm_struct *mm)
 {
 }
-static inline void khugepaged_enter(struct vm_area_struct *vma,
-				    unsigned long vm_flags)
-{
-}
 static inline void khugepaged_enter_vma(struct vm_area_struct *vma,
 					unsigned long vm_flags)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a563de8234c1..2751649aaf33 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -726,7 +726,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 		return VM_FAULT_FALLBACK;
 	if (unlikely(anon_vma_prepare(vma)))
 		return VM_FAULT_OOM;
-	khugepaged_enter(vma, vma->vm_flags);
+	khugepaged_enter_vma(vma, vma->vm_flags);
 
 	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
 			!mm_forbids_zeropage(vma->vm_mm) &&
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 3eec970a884d..c7e22135f1b5 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -443,8 +443,8 @@ bool hugepage_vma_check(struct vm_area_struct *vma,
 	if (vma_is_dax(vma))
 		return false;
 
-	if (vma->vm_file && !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) -
-				vma->vm_pgoff, HPAGE_PMD_NR))
+	/* Check alignment for file vma and size for both file and anon vma */
+	if (!transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE)))
 		return false;
 
 	/* Enabled via shmem mount options or sysfs settings. */
@@ -505,9 +505,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
 			  unsigned long vm_flags)
 {
 	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
-	    khugepaged_enabled() &&
-	    (((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
-	     (vma->vm_end & HPAGE_PMD_MASK))) {
+	    khugepaged_enabled()) {
 		if (hugepage_vma_check(vma, vm_flags))
 			__khugepaged_enter(vma->vm_mm);
 	}
@@ -948,7 +946,6 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
 		struct vm_area_struct **vmap)
 {
 	struct vm_area_struct *vma;
-	unsigned long hstart, hend;
 
 	if (unlikely(khugepaged_test_exit(mm)))
 		return SCAN_ANY_PROCESS;
@@ -957,9 +954,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
 	if (!vma)
 		return SCAN_VMA_NULL;
 
-	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
-	hend = vma->vm_end & HPAGE_PMD_MASK;
-	if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+	if (!transhuge_vma_suitable(vma, address))
 		return SCAN_ADDRESS_RANGE;
 	if (!hugepage_vma_check(vma, vma->vm_flags))
 		return SCAN_VMA_CHECK;
@@ -2135,10 +2130,8 @@ skip:
 			progress++;
 			continue;
 		}
-		hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
-		hend = vma->vm_end & HPAGE_PMD_MASK;
-		if (hstart >= hend)
-			goto skip;
+		hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
+		hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
 		if (khugepaged_scan.address > hend)
 			goto skip;
 		if (khugepaged_scan.address < hstart)
-- 
cgit 


From 9fec51689ff60d9766b38051a0b1692f93d95364 Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301@gmail.com>
Date: Thu, 16 Jun 2022 10:48:37 -0700
Subject: mm: thp: kill transparent_hugepage_active()

The transparent_hugepage_active() was introduced to show THP eligibility
bit in smaps in proc, smaps is the only user.  But it actually does the
similar check as hugepage_vma_check() which is used by khugepaged.  We
definitely don't have to maintain two similar checks, so kill
transparent_hugepage_active().

This patch also fixed the wrong behavior for VM_NO_KHUGEPAGED vmas.

Also move hugepage_vma_check() to huge_memory.c and huge_mm.h since it
is not only for khugepaged anymore.

[akpm@linux-foundation.org: check vma->vm_mm, per Zach]
[akpm@linux-foundation.org: add comment to vdso check]
Link: https://lkml.kernel.org/r/20220616174840.1202070-5-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301@gmail.com>
Reviewed-by: Zach O'Keefe <zokeefe@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/task_mmu.c         |  2 +-
 include/linux/huge_mm.h    | 16 ++++++++------
 include/linux/khugepaged.h |  2 --
 mm/huge_memory.c           | 53 ++++++++++++++++++++++++++++++++++++++--------
 mm/khugepaged.c            | 48 ++++-------------------------------------
 5 files changed, 59 insertions(+), 62 deletions(-)

(limited to 'include')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 1d7fd832123b..072cf770b5d0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -863,7 +863,7 @@ static int show_smap(struct seq_file *m, void *v)
 	__show_smap(m, &mss, false);
 
 	seq_printf(m, "THPeligible:    %d\n",
-		   transparent_hugepage_active(vma));
+		   hugepage_vma_check(vma, vma->vm_flags, true));
 
 	if (arch_pkeys_enabled())
 		seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 8a5a8bfce0f5..64487bcd0c7b 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -202,7 +202,9 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
 	       !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
 }
 
-bool transparent_hugepage_active(struct vm_area_struct *vma);
+bool hugepage_vma_check(struct vm_area_struct *vma,
+			unsigned long vm_flags,
+			bool smaps);
 
 #define transparent_hugepage_use_zero_page()				\
 	(transparent_hugepage_flags &					\
@@ -351,11 +353,6 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
 	return false;
 }
 
-static inline bool transparent_hugepage_active(struct vm_area_struct *vma)
-{
-	return false;
-}
-
 static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
 		unsigned long addr)
 {
@@ -368,6 +365,13 @@ static inline bool transhuge_vma_enabled(struct vm_area_struct *vma,
 	return false;
 }
 
+static inline bool hugepage_vma_check(struct vm_area_struct *vma,
+				       unsigned long vm_flags,
+				       bool smaps)
+{
+	return false;
+}
+
 static inline void prep_transhuge_page(struct page *page) {}
 
 #define transparent_hugepage_flags 0UL
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index 31ca8a7f78f4..ea5fd4c398f7 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -10,8 +10,6 @@ extern struct attribute_group khugepaged_attr_group;
 extern int khugepaged_init(void);
 extern void khugepaged_destroy(void);
 extern int start_stop_khugepaged(void);
-extern bool hugepage_vma_check(struct vm_area_struct *vma,
-			       unsigned long vm_flags);
 extern void __khugepaged_enter(struct mm_struct *mm);
 extern void __khugepaged_exit(struct mm_struct *mm);
 extern void khugepaged_enter_vma(struct vm_area_struct *vma,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2751649aaf33..8cbd21aaf03e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -69,21 +69,56 @@ static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 unsigned long huge_zero_pfn __read_mostly = ~0UL;
 
-bool transparent_hugepage_active(struct vm_area_struct *vma)
+bool hugepage_vma_check(struct vm_area_struct *vma,
+			unsigned long vm_flags,
+			bool smaps)
 {
-	/* The addr is used to check if the vma size fits */
-	unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
+	if (!vma->vm_mm)		/* vdso */
+		return false;
+
+	if (!transhuge_vma_enabled(vma, vm_flags))
+		return false;
 
-	if (!transhuge_vma_suitable(vma, addr))
+	if (vm_flags & VM_NO_KHUGEPAGED)
 		return false;
-	if (vma_is_anonymous(vma))
-		return __transparent_hugepage_enabled(vma);
-	if (vma_is_shmem(vma))
+
+	/* Don't run khugepaged against DAX vma */
+	if (vma_is_dax(vma))
+		return false;
+
+	/* Check alignment for file vma and size for both file and anon vma */
+	if (!transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE)))
+		return false;
+
+	/* Enabled via shmem mount options or sysfs settings. */
+	if (shmem_file(vma->vm_file))
 		return shmem_huge_enabled(vma);
-	if (transhuge_vma_enabled(vma, vma->vm_flags) && file_thp_enabled(vma))
+
+	if (!khugepaged_enabled())
+		return false;
+
+	/* THP settings require madvise. */
+	if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
+		return false;
+
+	/* Only regular file is valid */
+	if (file_thp_enabled(vma))
 		return true;
 
-	return false;
+	if (!vma_is_anonymous(vma))
+		return false;
+
+	if (vma_is_temporary_stack(vma))
+		return false;
+
+	/*
+	 * THPeligible bit of smaps should show 1 for proper VMAs even
+	 * though anon_vma is not initialized yet.
+	 */
+	if (!vma->anon_vma)
+		return smaps;
+
+	return true;
 }
 
 static bool get_huge_zero_page(void)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 67e144e64b7f..6bbf3adac534 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -430,46 +430,6 @@ static inline int khugepaged_test_exit(struct mm_struct *mm)
 	return atomic_read(&mm->mm_users) == 0;
 }
 
-bool hugepage_vma_check(struct vm_area_struct *vma,
-			unsigned long vm_flags)
-{
-	if (!transhuge_vma_enabled(vma, vm_flags))
-		return false;
-
-	if (vm_flags & VM_NO_KHUGEPAGED)
-		return false;
-
-	/* Don't run khugepaged against DAX vma */
-	if (vma_is_dax(vma))
-		return false;
-
-	/* Check alignment for file vma and size for both file and anon vma */
-	if (!transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE)))
-		return false;
-
-	/* Enabled via shmem mount options or sysfs settings. */
-	if (shmem_file(vma->vm_file))
-		return shmem_huge_enabled(vma);
-
-	if (!khugepaged_enabled())
-		return false;
-
-	/* THP settings require madvise. */
-	if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
-		return false;
-
-	/* Only regular file is valid */
-	if (file_thp_enabled(vma))
-		return true;
-
-	if (!vma->anon_vma || !vma_is_anonymous(vma))
-		return false;
-	if (vma_is_temporary_stack(vma))
-		return false;
-
-	return true;
-}
-
 void __khugepaged_enter(struct mm_struct *mm)
 {
 	struct mm_slot *mm_slot;
@@ -506,7 +466,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
 {
 	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
 	    khugepaged_enabled()) {
-		if (hugepage_vma_check(vma, vm_flags))
+		if (hugepage_vma_check(vma, vm_flags, false))
 			__khugepaged_enter(vma->vm_mm);
 	}
 }
@@ -956,7 +916,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
 
 	if (!transhuge_vma_suitable(vma, address))
 		return SCAN_ADDRESS_RANGE;
-	if (!hugepage_vma_check(vma, vma->vm_flags))
+	if (!hugepage_vma_check(vma, vma->vm_flags, false))
 		return SCAN_VMA_CHECK;
 	/*
 	 * Anon VMA expected, the address may be unmapped then
@@ -1441,7 +1401,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	 * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
 	 * will not fail the vma for missing VM_HUGEPAGE
 	 */
-	if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
+	if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false))
 		return;
 
 	/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
@@ -2131,7 +2091,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 			progress++;
 			break;
 		}
-		if (!hugepage_vma_check(vma, vma->vm_flags)) {
+		if (!hugepage_vma_check(vma, vma->vm_flags, false)) {
 skip:
 			progress++;
 			continue;
-- 
cgit 


From 7da4e2cb8b1ff8221759bfc7512d651ee69516dc Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301@gmail.com>
Date: Thu, 16 Jun 2022 10:48:38 -0700
Subject: mm: thp: kill __transhuge_page_enabled()

The page fault path checks THP eligibility with __transhuge_page_enabled()
which does the similar thing as hugepage_vma_check(), so use
hugepage_vma_check() instead.

However page fault allows DAX and !anon_vma cases, so added a new flag,
in_pf, to hugepage_vma_check() to make page fault work correctly.

The in_pf flag is also used to skip shmem and file THP for page fault
since shmem handles THP in its own shmem_fault() and file THP allocation
on fault is not supported yet.

Also remove hugepage_vma_enabled() since hugepage_vma_check() is the only
caller now, it is not necessary to have a helper function.

Link: https://lkml.kernel.org/r/20220616174840.1202070-6-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301@gmail.com>
Reviewed-by: Zach O'Keefe <zokeefe@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/task_mmu.c      |  2 +-
 include/linux/huge_mm.h | 57 ++-----------------------------------------------
 mm/huge_memory.c        | 51 +++++++++++++++++++++++++++++++++----------
 mm/khugepaged.c         |  8 +++----
 mm/memory.c             |  7 ++++--
 5 files changed, 52 insertions(+), 73 deletions(-)

(limited to 'include')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 072cf770b5d0..a3398d0f1927 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -863,7 +863,7 @@ static int show_smap(struct seq_file *m, void *v)
 	__show_smap(m, &mss, false);
 
 	seq_printf(m, "THPeligible:    %d\n",
-		   hugepage_vma_check(vma, vma->vm_flags, true));
+		   hugepage_vma_check(vma, vma->vm_flags, true, false));
 
 	if (arch_pkeys_enabled())
 		seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 64487bcd0c7b..cd8a6c5d9fe5 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -146,48 +146,6 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
 	return true;
 }
 
-static inline bool transhuge_vma_enabled(struct vm_area_struct *vma,
-					  unsigned long vm_flags)
-{
-	/* Explicitly disabled through madvise. */
-	if ((vm_flags & VM_NOHUGEPAGE) ||
-	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
-		return false;
-	return true;
-}
-
-/*
- * to be used on vmas which are known to support THP.
- * Use transparent_hugepage_active otherwise
- */
-static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
-{
-
-	/*
-	 * If the hardware/firmware marked hugepage support disabled.
-	 */
-	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX))
-		return false;
-
-	if (!transhuge_vma_enabled(vma, vma->vm_flags))
-		return false;
-
-	if (vma_is_temporary_stack(vma))
-		return false;
-
-	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG))
-		return true;
-
-	if (vma_is_dax(vma))
-		return true;
-
-	if (transparent_hugepage_flags &
-				(1 << TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))
-		return !!(vma->vm_flags & VM_HUGEPAGE);
-
-	return false;
-}
-
 static inline bool file_thp_enabled(struct vm_area_struct *vma)
 {
 	struct inode *inode;
@@ -204,7 +162,7 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
 
 bool hugepage_vma_check(struct vm_area_struct *vma,
 			unsigned long vm_flags,
-			bool smaps);
+			bool smaps, bool in_pf);
 
 #define transparent_hugepage_use_zero_page()				\
 	(transparent_hugepage_flags &					\
@@ -348,26 +306,15 @@ static inline bool folio_test_pmd_mappable(struct folio *folio)
 	return false;
 }
 
-static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
-{
-	return false;
-}
-
 static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
 		unsigned long addr)
 {
 	return false;
 }
 
-static inline bool transhuge_vma_enabled(struct vm_area_struct *vma,
-					  unsigned long vm_flags)
-{
-	return false;
-}
-
 static inline bool hugepage_vma_check(struct vm_area_struct *vma,
 				       unsigned long vm_flags,
-				       bool smaps)
+				       bool smaps, bool in_pf)
 {
 	return false;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8cbd21aaf03e..4b90c7021e52 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -71,27 +71,53 @@ unsigned long huge_zero_pfn __read_mostly = ~0UL;
 
 bool hugepage_vma_check(struct vm_area_struct *vma,
 			unsigned long vm_flags,
-			bool smaps)
+			bool smaps, bool in_pf)
 {
 	if (!vma->vm_mm)		/* vdso */
 		return false;
 
-	if (!transhuge_vma_enabled(vma, vm_flags))
+	/*
+	 * Explicitly disabled through madvise or prctl, or some
+	 * architectures may disable THP for some mappings, for
+	 * example, s390 kvm.
+	 * */
+	if ((vm_flags & VM_NOHUGEPAGE) ||
+	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
 		return false;
-
-	if (vm_flags & VM_NO_KHUGEPAGED)
+	/*
+	 * If the hardware/firmware marked hugepage support disabled.
+	 */
+	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX))
 		return false;
 
-	/* Don't run khugepaged against DAX vma */
+	/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
 	if (vma_is_dax(vma))
+		return in_pf;
+
+	/*
+	 * Special VMA and hugetlb VMA.
+	 * Must be checked after dax since some dax mappings may have
+	 * VM_MIXEDMAP set.
+	 */
+	if (vm_flags & VM_NO_KHUGEPAGED)
 		return false;
 
-	/* Check alignment for file vma and size for both file and anon vma */
-	if (!transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE)))
+	/*
+	 * Check alignment for file vma and size for both file and anon vma.
+	 *
+	 * Skip the check for page fault. Huge fault does the check in fault
+	 * handlers. And this check is not suitable for huge PUD fault.
+	 */
+	if (!in_pf &&
+	    !transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE)))
 		return false;
 
-	/* Enabled via shmem mount options or sysfs settings. */
-	if (shmem_file(vma->vm_file))
+	/*
+	 * Enabled via shmem mount options or sysfs settings.
+	 * Must be done before hugepage flags check since shmem has its
+	 * own flags.
+	 */
+	if (!in_pf && shmem_file(vma->vm_file))
 		return shmem_huge_enabled(vma);
 
 	if (!khugepaged_enabled())
@@ -102,7 +128,7 @@ bool hugepage_vma_check(struct vm_area_struct *vma,
 		return false;
 
 	/* Only regular file is valid */
-	if (file_thp_enabled(vma))
+	if (!in_pf && file_thp_enabled(vma))
 		return true;
 
 	if (!vma_is_anonymous(vma))
@@ -114,9 +140,12 @@ bool hugepage_vma_check(struct vm_area_struct *vma,
 	/*
 	 * THPeligible bit of smaps should show 1 for proper VMAs even
 	 * though anon_vma is not initialized yet.
+	 *
+	 * Allow page fault since anon_vma may be not initialized until
+	 * the first page fault.
 	 */
 	if (!vma->anon_vma)
-		return smaps;
+		return (smaps || in_pf);
 
 	return true;
 }
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6bbf3adac534..d683ef1edeb5 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -466,7 +466,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
 {
 	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
 	    khugepaged_enabled()) {
-		if (hugepage_vma_check(vma, vm_flags, false))
+		if (hugepage_vma_check(vma, vm_flags, false, false))
 			__khugepaged_enter(vma->vm_mm);
 	}
 }
@@ -916,7 +916,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
 
 	if (!transhuge_vma_suitable(vma, address))
 		return SCAN_ADDRESS_RANGE;
-	if (!hugepage_vma_check(vma, vma->vm_flags, false))
+	if (!hugepage_vma_check(vma, vma->vm_flags, false, false))
 		return SCAN_VMA_CHECK;
 	/*
 	 * Anon VMA expected, the address may be unmapped then
@@ -1401,7 +1401,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	 * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
 	 * will not fail the vma for missing VM_HUGEPAGE
 	 */
-	if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false))
+	if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false, false))
 		return;
 
 	/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
@@ -2091,7 +2091,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 			progress++;
 			break;
 		}
-		if (!hugepage_vma_check(vma, vma->vm_flags, false)) {
+		if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) {
 skip:
 			progress++;
 			continue;
diff --git a/mm/memory.c b/mm/memory.c
index dce0b2e686eb..2392d5db473a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4970,6 +4970,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
 		.gfp_mask = __get_fault_gfp_mask(vma),
 	};
 	struct mm_struct *mm = vma->vm_mm;
+	unsigned long vm_flags = vma->vm_flags;
 	pgd_t *pgd;
 	p4d_t *p4d;
 	vm_fault_t ret;
@@ -4983,7 +4984,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
 	if (!vmf.pud)
 		return VM_FAULT_OOM;
 retry_pud:
-	if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
+	if (pud_none(*vmf.pud) &&
+	    hugepage_vma_check(vma, vm_flags, false, true)) {
 		ret = create_huge_pud(&vmf);
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
@@ -5016,7 +5018,8 @@ retry_pud:
 	if (pud_trans_unstable(vmf.pud))
 		goto retry_pud;
 
-	if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
+	if (pmd_none(*vmf.pmd) &&
+	    hugepage_vma_check(vma, vm_flags, false, true)) {
 		ret = create_huge_pmd(&vmf);
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
-- 
cgit 


From 1064026bab9f011bdea1251d44d66bbbcee04f6e Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301@gmail.com>
Date: Thu, 16 Jun 2022 10:48:39 -0700
Subject: mm: khugepaged: reorg some khugepaged helpers

The khugepaged_{enabled|always|req_madv} are not khugepaged only anymore,
move them to huge_mm.h and rename to hugepage_flags_xxx, and remove
khugepaged_req_madv due to no users.

Also move khugepaged_defrag to khugepaged.c since its only caller is in
that file, it doesn't have to be in a header file.

Link: https://lkml.kernel.org/r/20220616174840.1202070-7-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301@gmail.com>
Reviewed-by: Zach O'Keefe <zokeefe@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h    |  8 ++++++++
 include/linux/khugepaged.h | 14 --------------
 mm/huge_memory.c           |  4 ++--
 mm/khugepaged.c            | 18 +++++++++++-------
 4 files changed, 21 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index cd8a6c5d9fe5..ae3d8e2fd9e2 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -116,6 +116,14 @@ extern struct kobj_attribute shmem_enabled_attr;
 
 extern unsigned long transparent_hugepage_flags;
 
+#define hugepage_flags_enabled()					       \
+	(transparent_hugepage_flags &				       \
+	 ((1<<TRANSPARENT_HUGEPAGE_FLAG) |		       \
+	  (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)))
+#define hugepage_flags_always()				\
+	(transparent_hugepage_flags &			\
+	 (1<<TRANSPARENT_HUGEPAGE_FLAG))
+
 /*
  * Do the below checks:
  *   - For file vma, check if the linear page offset of vma is
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index ea5fd4c398f7..384f034ae947 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -24,20 +24,6 @@ static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
 }
 #endif
 
-#define khugepaged_enabled()					       \
-	(transparent_hugepage_flags &				       \
-	 ((1<<TRANSPARENT_HUGEPAGE_FLAG) |		       \
-	  (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)))
-#define khugepaged_always()				\
-	(transparent_hugepage_flags &			\
-	 (1<<TRANSPARENT_HUGEPAGE_FLAG))
-#define khugepaged_req_madv()					\
-	(transparent_hugepage_flags &				\
-	 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))
-#define khugepaged_defrag()					\
-	(transparent_hugepage_flags &				\
-	 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
-
 static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 {
 	if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4b90c7021e52..8e1b3d9f7ebf 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -120,11 +120,11 @@ bool hugepage_vma_check(struct vm_area_struct *vma,
 	if (!in_pf && shmem_file(vma->vm_file))
 		return shmem_huge_enabled(vma);
 
-	if (!khugepaged_enabled())
+	if (!hugepage_flags_enabled())
 		return false;
 
 	/* THP settings require madvise. */
-	if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
+	if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always())
 		return false;
 
 	/* Only regular file is valid */
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index d683ef1edeb5..01f71786d530 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -465,7 +465,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
 			  unsigned long vm_flags)
 {
 	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
-	    khugepaged_enabled()) {
+	    hugepage_flags_enabled()) {
 		if (hugepage_vma_check(vma, vm_flags, false, false))
 			__khugepaged_enter(vma->vm_mm);
 	}
@@ -761,6 +761,10 @@ static bool khugepaged_scan_abort(int nid)
 	return false;
 }
 
+#define khugepaged_defrag()					\
+	(transparent_hugepage_flags &				\
+	 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
+
 /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
 static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
 {
@@ -858,7 +862,7 @@ static struct page *khugepaged_alloc_hugepage(bool *wait)
 			khugepaged_alloc_sleep();
 		} else
 			count_vm_event(THP_COLLAPSE_ALLOC);
-	} while (unlikely(!hpage) && likely(khugepaged_enabled()));
+	} while (unlikely(!hpage) && likely(hugepage_flags_enabled()));
 
 	return hpage;
 }
@@ -2172,7 +2176,7 @@ breakouterloop_mmap_lock:
 static int khugepaged_has_work(void)
 {
 	return !list_empty(&khugepaged_scan.mm_head) &&
-		khugepaged_enabled();
+		hugepage_flags_enabled();
 }
 
 static int khugepaged_wait_event(void)
@@ -2237,7 +2241,7 @@ static void khugepaged_wait_work(void)
 		return;
 	}
 
-	if (khugepaged_enabled())
+	if (hugepage_flags_enabled())
 		wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
 }
 
@@ -2268,7 +2272,7 @@ static void set_recommended_min_free_kbytes(void)
 	int nr_zones = 0;
 	unsigned long recommended_min;
 
-	if (!khugepaged_enabled()) {
+	if (!hugepage_flags_enabled()) {
 		calculate_min_free_kbytes();
 		goto update_wmarks;
 	}
@@ -2318,7 +2322,7 @@ int start_stop_khugepaged(void)
 	int err = 0;
 
 	mutex_lock(&khugepaged_mutex);
-	if (khugepaged_enabled()) {
+	if (hugepage_flags_enabled()) {
 		if (!khugepaged_thread)
 			khugepaged_thread = kthread_run(khugepaged, NULL,
 							"khugepaged");
@@ -2344,7 +2348,7 @@ fail:
 void khugepaged_min_free_kbytes_update(void)
 {
 	mutex_lock(&khugepaged_mutex);
-	if (khugepaged_enabled() && khugepaged_thread)
+	if (hugepage_flags_enabled() && khugepaged_thread)
 		set_recommended_min_free_kbytes();
 	mutex_unlock(&khugepaged_mutex);
 }
-- 
cgit 


From e95a9851787bbb3cd4deb40fe8bab03f731852d1 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Tue, 21 Jun 2022 16:56:17 -0700
Subject: hugetlb: skip to end of PT page mapping when pte not present

Patch series "hugetlb: speed up linear address scanning", v2.

At unmap, fork and remap time hugetlb address ranges are linearly scanned.
We can optimize these scans if the ranges are sparsely populated.

Also, enable page table "Lazy copy" for hugetlb at fork.

NOTE: Architectures not defining CONFIG_ARCH_WANT_GENERAL_HUGETLB need to
add an arch specific version hugetlb_mask_last_page() to take advantage of
sparse address scanning improvements.  Baolin Wang added the routine for
arm64.  Other architectures which could be optimized are: ia64, mips,
parisc, powerpc, s390, sh and sparc.


This patch (of 4):

HugeTLB address ranges are linearly scanned during fork, unmap and remap
operations.  If a non-present entry is encountered, the code currently
continues to the next huge page aligned address.  However, a non-present
entry implies that the page table page for that entry is not present.
Therefore, the linear scan can skip to the end of range mapped by the page
table page.  This can speed operations on large sparsely populated hugetlb
mappings.

Create a new routine hugetlb_mask_last_page() that will return an address
mask.  When the mask is ORed with an address, the result will be the
address of the last huge page mapped by the associated page table page.
Use this mask to update addresses in routines which linearly scan hugetlb
address ranges when a non-present pte is encountered.

hugetlb_mask_last_page is related to the implementation of huge_pte_offset
as hugetlb_mask_last_page is called when huge_pte_offset returns NULL.
This patch only provides a complete hugetlb_mask_last_page implementation
when CONFIG_ARCH_WANT_GENERAL_HUGETLB is defined.  Architectures which
provide their own versions of huge_pte_offset can also provide their own
version of hugetlb_mask_last_page.

Link: https://lkml.kernel.org/r/20220621235620.291305-1-mike.kravetz@oracle.com
Link: https://lkml.kernel.org/r/20220621235620.291305-2-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: Muchun Song <songmuchun@bytedance.com>
Reported-by: kernel test robot <lkp@intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Cc: James Houghton <jthoughton@google.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Rolf Eike Beer <eike-kernel@sf-tec.de>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h |  1 +
 mm/hugetlb.c            | 56 ++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 52 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index c6cccfaf8708..ce30fad5fd13 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -194,6 +194,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long addr, unsigned long sz);
 pte_t *huge_pte_offset(struct mm_struct *mm,
 		       unsigned long addr, unsigned long sz);
+unsigned long hugetlb_mask_last_page(struct hstate *h);
 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 				unsigned long *addr, pte_t *ptep);
 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ffdf3fc4a83f..95fd1c36c17f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4727,6 +4727,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	unsigned long npages = pages_per_huge_page(h);
 	struct address_space *mapping = src_vma->vm_file->f_mapping;
 	struct mmu_notifier_range range;
+	unsigned long last_addr_mask;
 	int ret = 0;
 
 	if (cow) {
@@ -4746,11 +4747,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		i_mmap_lock_read(mapping);
 	}
 
+	last_addr_mask = hugetlb_mask_last_page(h);
 	for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
 		spinlock_t *src_ptl, *dst_ptl;
 		src_pte = huge_pte_offset(src, addr, sz);
-		if (!src_pte)
+		if (!src_pte) {
+			addr |= last_addr_mask;
 			continue;
+		}
 		dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
 		if (!dst_pte) {
 			ret = -ENOMEM;
@@ -4767,8 +4771,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		 * after taking the lock below.
 		 */
 		dst_entry = huge_ptep_get(dst_pte);
-		if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
+		if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) {
+			addr |= last_addr_mask;
 			continue;
+		}
 
 		dst_ptl = huge_pte_lock(h, dst, dst_pte);
 		src_ptl = huge_pte_lockptr(h, src, src_pte);
@@ -4928,6 +4934,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
 	unsigned long sz = huge_page_size(h);
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long old_end = old_addr + len;
+	unsigned long last_addr_mask;
 	unsigned long old_addr_copy;
 	pte_t *src_pte, *dst_pte;
 	struct mmu_notifier_range range;
@@ -4943,12 +4950,16 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
 	flush_cache_range(vma, range.start, range.end);
 
 	mmu_notifier_invalidate_range_start(&range);
+	last_addr_mask = hugetlb_mask_last_page(h);
 	/* Prevent race with file truncation */
 	i_mmap_lock_write(mapping);
 	for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
 		src_pte = huge_pte_offset(mm, old_addr, sz);
-		if (!src_pte)
+		if (!src_pte) {
+			old_addr |= last_addr_mask;
+			new_addr |= last_addr_mask;
 			continue;
+		}
 		if (huge_pte_none(huge_ptep_get(src_pte)))
 			continue;
 
@@ -4993,6 +5004,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
 	struct mmu_notifier_range range;
+	unsigned long last_addr_mask;
 	bool force_flush = false;
 
 	WARN_ON(!is_vm_hugetlb_page(vma));
@@ -5013,11 +5025,14 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 				end);
 	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
 	mmu_notifier_invalidate_range_start(&range);
+	last_addr_mask = hugetlb_mask_last_page(h);
 	address = start;
 	for (; address < end; address += sz) {
 		ptep = huge_pte_offset(mm, address, sz);
-		if (!ptep)
+		if (!ptep) {
+			address |= last_addr_mask;
 			continue;
+		}
 
 		ptl = huge_pte_lock(h, mm, ptep);
 		if (huge_pmd_unshare(mm, vma, &address, ptep)) {
@@ -6285,6 +6300,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	unsigned long pages = 0, psize = huge_page_size(h);
 	bool shared_pmd = false;
 	struct mmu_notifier_range range;
+	unsigned long last_addr_mask;
 	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
 	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 
@@ -6301,12 +6317,15 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	flush_cache_range(vma, range.start, range.end);
 
 	mmu_notifier_invalidate_range_start(&range);
+	last_addr_mask = hugetlb_mask_last_page(h);
 	i_mmap_lock_write(vma->vm_file->f_mapping);
 	for (; address < end; address += psize) {
 		spinlock_t *ptl;
 		ptep = huge_pte_offset(mm, address, psize);
-		if (!ptep)
+		if (!ptep) {
+			address |= last_addr_mask;
 			continue;
+		}
 		ptl = huge_pte_lock(h, mm, ptep);
 		if (huge_pmd_unshare(mm, vma, &address, ptep)) {
 			/*
@@ -6856,6 +6875,33 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
 	return (pte_t *)pmd;
 }
 
+/*
+ * Return a mask that can be used to update an address to the last huge
+ * page in a page table page mapping size.  Used to skip non-present
+ * page table entries when linearly scanning address ranges.  Architectures
+ * with unique huge page to page table relationships can define their own
+ * version of this routine.
+ */
+unsigned long hugetlb_mask_last_page(struct hstate *h)
+{
+	unsigned long hp_size = huge_page_size(h);
+
+	if (hp_size == PUD_SIZE)
+		return P4D_SIZE - PUD_SIZE;
+	else if (hp_size == PMD_SIZE)
+		return PUD_SIZE - PMD_SIZE;
+	else
+		return 0UL;
+}
+
+#else
+
+/* See description above.  Architectures can provide their own version. */
+__weak unsigned long hugetlb_mask_last_page(struct hstate *h)
+{
+	return 0UL;
+}
+
 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
 
 /*
-- 
cgit 


From 4ddb4d91b82f4b64458fe35bc8e395c7c082ea2b Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Tue, 21 Jun 2022 16:56:19 -0700
Subject: hugetlb: do not update address in huge_pmd_unshare

As an optimization for loops sequentially processing hugetlb address
ranges, huge_pmd_unshare would update a passed address if it unshared a
pmd.  Updating a loop control variable outside the loop like this is
generally a bad idea.  These loops are now using hugetlb_mask_last_page to
optimize scanning when non-present ptes are discovered.  The same can be
done when huge_pmd_unshare returns 1 indicating a pmd was unshared.

Remove address update from huge_pmd_unshare.  Change the passed argument
type and update all callers.  In loops sequentially processing addresses
use hugetlb_mask_last_page to update address if pmd is unshared.

[sfr@canb.auug.org.au: fix an unused variable warning/error]
  Link: https://lkml.kernel.org/r/20220622171117.70850960@canb.auug.org.au
Link: https://lkml.kernel.org/r/20220621235620.291305-4-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Acked-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: kernel test robot <lkp@intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rolf Eike Beer <eike-kernel@sf-tec.de>
Cc: Will Deacon <will@kernel.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h |  4 ++--
 mm/hugetlb.c            | 44 +++++++++++++++++---------------------------
 mm/rmap.c               |  4 ++--
 3 files changed, 21 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ce30fad5fd13..75ee739d815b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -196,7 +196,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
 		       unsigned long addr, unsigned long sz);
 unsigned long hugetlb_mask_last_page(struct hstate *h);
 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
-				unsigned long *addr, pte_t *ptep);
+				unsigned long addr, pte_t *ptep);
 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
 				unsigned long *start, unsigned long *end);
 struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
@@ -243,7 +243,7 @@ static inline struct address_space *hugetlb_page_mapping_lock_write(
 
 static inline int huge_pmd_unshare(struct mm_struct *mm,
 					struct vm_area_struct *vma,
-					unsigned long *addr, pte_t *ptep)
+					unsigned long addr, pte_t *ptep)
 {
 	return 0;
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 95fd1c36c17f..96635a2874e3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4935,7 +4935,6 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long old_end = old_addr + len;
 	unsigned long last_addr_mask;
-	unsigned long old_addr_copy;
 	pte_t *src_pte, *dst_pte;
 	struct mmu_notifier_range range;
 	bool shared_pmd = false;
@@ -4963,14 +4962,10 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
 		if (huge_pte_none(huge_ptep_get(src_pte)))
 			continue;
 
-		/* old_addr arg to huge_pmd_unshare() is a pointer and so the
-		 * arg may be modified. Pass a copy instead to preserve the
-		 * value in old_addr.
-		 */
-		old_addr_copy = old_addr;
-
-		if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) {
+		if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
 			shared_pmd = true;
+			old_addr |= last_addr_mask;
+			new_addr |= last_addr_mask;
 			continue;
 		}
 
@@ -5035,10 +5030,11 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 		}
 
 		ptl = huge_pte_lock(h, mm, ptep);
-		if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+		if (huge_pmd_unshare(mm, vma, address, ptep)) {
 			spin_unlock(ptl);
 			tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
 			force_flush = true;
+			address |= last_addr_mask;
 			continue;
 		}
 
@@ -6327,7 +6323,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 			continue;
 		}
 		ptl = huge_pte_lock(h, mm, ptep);
-		if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+		if (huge_pmd_unshare(mm, vma, address, ptep)) {
 			/*
 			 * When uffd-wp is enabled on the vma, unshare
 			 * shouldn't happen at all.  Warn about it if it
@@ -6337,6 +6333,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 			pages++;
 			spin_unlock(ptl);
 			shared_pmd = true;
+			address |= last_addr_mask;
 			continue;
 		}
 		pte = huge_ptep_get(ptep);
@@ -6759,11 +6756,11 @@ out:
  *	    0 the underlying pte page is not shared, or it is the last user
  */
 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
-					unsigned long *addr, pte_t *ptep)
+					unsigned long addr, pte_t *ptep)
 {
-	pgd_t *pgd = pgd_offset(mm, *addr);
-	p4d_t *p4d = p4d_offset(pgd, *addr);
-	pud_t *pud = pud_offset(p4d, *addr);
+	pgd_t *pgd = pgd_offset(mm, addr);
+	p4d_t *p4d = p4d_offset(pgd, addr);
+	pud_t *pud = pud_offset(p4d, addr);
 
 	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
 	BUG_ON(page_count(virt_to_page(ptep)) == 0);
@@ -6773,14 +6770,6 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 	pud_clear(pud);
 	put_page(virt_to_page(ptep));
 	mm_dec_nr_pmds(mm);
-	/*
-	 * This update of passed address optimizes loops sequentially
-	 * processing addresses in increments of huge page size (PMD_SIZE
-	 * in this case).  By clearing the pud, a PUD_SIZE area is unmapped.
-	 * Update address to the 'last page' in the cleared area so that
-	 * calling loop can move to first page past this area.
-	 */
-	*addr |= PUD_SIZE - PMD_SIZE;
 	return 1;
 }
 
@@ -6792,7 +6781,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
-				unsigned long *addr, pte_t *ptep)
+				unsigned long addr, pte_t *ptep)
 {
 	return 0;
 }
@@ -6899,6 +6888,10 @@ unsigned long hugetlb_mask_last_page(struct hstate *h)
 /* See description above.  Architectures can provide their own version. */
 __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
 {
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+	if (huge_page_size(h) == PMD_SIZE)
+		return PUD_SIZE - PMD_SIZE;
+#endif
 	return 0UL;
 }
 
@@ -7125,14 +7118,11 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 	mmu_notifier_invalidate_range_start(&range);
 	i_mmap_lock_write(vma->vm_file->f_mapping);
 	for (address = start; address < end; address += PUD_SIZE) {
-		unsigned long tmp = address;
-
 		ptep = huge_pte_offset(mm, address, sz);
 		if (!ptep)
 			continue;
 		ptl = huge_pte_lock(h, mm, ptep);
-		/* We don't want 'address' to be changed */
-		huge_pmd_unshare(mm, vma, &tmp, ptep);
+		huge_pmd_unshare(mm, vma, address, ptep);
 		spin_unlock(ptl);
 	}
 	flush_hugetlb_tlb_range(vma, start, end);
diff --git a/mm/rmap.c b/mm/rmap.c
index 0532fd92ecb3..fb6b3b47f3e4 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1559,7 +1559,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			 * do this outside rmap routines.
 			 */
 			VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
-			if (!anon && huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
+			if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
 				flush_tlb_range(vma, range.start, range.end);
 				mmu_notifier_invalidate_range(mm, range.start,
 							      range.end);
@@ -1920,7 +1920,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 			 * do this outside rmap routines.
 			 */
 			VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
-			if (!anon && huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
+			if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
 				flush_tlb_range(vma, range.start, range.end);
 				mmu_notifier_invalidate_range(mm, range.start,
 							      range.end);
-- 
cgit 


From bf75f200569dd05ac2112797f44548beb6b4be26 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Fri, 24 Jun 2022 13:54:17 +0100
Subject: mm/page_alloc: add page->buddy_list and page->pcp_list

Patch series "Drain remote per-cpu directly", v5.

Some setups, notably NOHZ_FULL CPUs, may be running realtime or
latency-sensitive applications that cannot tolerate interference due to
per-cpu drain work queued by __drain_all_pages().  Introduce a new
mechanism to remotely drain the per-cpu lists.  It is made possible by
remotely locking 'struct per_cpu_pages' new per-cpu spinlocks.  This has
two advantages, the time to drain is more predictable and other unrelated
tasks are not interrupted.

This series has the same intent as Nicolas' series "mm/page_alloc: Remote
per-cpu lists drain support" -- avoid interference of a high priority task
due to a workqueue item draining per-cpu page lists.  While many workloads
can tolerate a brief interruption, it may cause a real-time task running
on a NOHZ_FULL CPU to miss a deadline and at minimum, the draining is
non-deterministic.

Currently an IRQ-safe local_lock protects the page allocator per-cpu
lists.  The local_lock on its own prevents migration and the IRQ disabling
protects from corruption due to an interrupt arriving while a page
allocation is in progress.

This series adjusts the locking.  A spinlock is added to struct
per_cpu_pages to protect the list contents while local_lock_irq is
ultimately replaced by just the spinlock in the final patch.  This allows
a remote CPU to safely.  Follow-on work should allow the spin_lock_irqsave
to be converted to spin_lock to avoid IRQs being disabled/enabled in most
cases.  The follow-on patch will be one kernel release later as it is
relatively high risk and it'll make bisections more clear if there are any
problems.

Patch 1 is a cosmetic patch to clarify when page->lru is storing buddy pages
	and when it is storing per-cpu pages.

Patch 2 shrinks per_cpu_pages to make room for a spin lock. Strictly speaking
	this is not necessary but it avoids per_cpu_pages consuming another
	cache line.

Patch 3 is a preparation patch to avoid code duplication.

Patch 4 is a minor correction.

Patch 5 uses a spin_lock to protect the per_cpu_pages contents while still
	relying on local_lock to prevent migration, stabilise the pcp
	lookup and prevent IRQ reentrancy.

Patch 6 remote drains per-cpu pages directly instead of using a workqueue.

Patch 7 uses a normal spinlock instead of local_lock for remote draining


This patch (of 7):

The page allocator uses page->lru for storing pages on either buddy or PCP
lists.  Create page->buddy_list and page->pcp_list as a union with
page->lru.  This is simply to clarify what type of list a page is on in
the page allocator.

No functional change intended.

[minchan@kernel.org: fix page lru fields in macros]
Link: https://lkml.kernel.org/r/20220624125423.6126-2-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Tested-by: Minchan Kim <minchan@kernel.org>
Acked-by: Minchan Kim <minchan@kernel.org>
Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Yu Zhao <yuzhao@google.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_types.h |  5 +++++
 mm/page_alloc.c          | 24 ++++++++++++------------
 2 files changed, 17 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6b961a29bf26..cf97f3884fda 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -87,6 +87,7 @@ struct page {
 			 */
 			union {
 				struct list_head lru;
+
 				/* Or, for the Unevictable "LRU list" slot */
 				struct {
 					/* Always even, to negate PageTail */
@@ -94,6 +95,10 @@ struct page {
 					/* Count page's or folio's mlocks */
 					unsigned int mlock_count;
 				};
+
+				/* Or, free page */
+				struct list_head buddy_list;
+				struct list_head pcp_list;
 			};
 			/* See page-flags.h for PAGE_MAPPING_FLAGS */
 			struct address_space *mapping;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c9c02b23f02f..78ba5ba66586 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -793,7 +793,7 @@ static inline bool set_page_guard(struct zone *zone, struct page *page,
 		return false;
 
 	__SetPageGuard(page);
-	INIT_LIST_HEAD(&page->lru);
+	INIT_LIST_HEAD(&page->buddy_list);
 	set_page_private(page, order);
 	/* Guard pages are not available for any usage */
 	__mod_zone_freepage_state(zone, -(1 << order), migratetype);
@@ -936,7 +936,7 @@ static inline void add_to_free_list(struct page *page, struct zone *zone,
 {
 	struct free_area *area = &zone->free_area[order];
 
-	list_add(&page->lru, &area->free_list[migratetype]);
+	list_add(&page->buddy_list, &area->free_list[migratetype]);
 	area->nr_free++;
 }
 
@@ -946,7 +946,7 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
 {
 	struct free_area *area = &zone->free_area[order];
 
-	list_add_tail(&page->lru, &area->free_list[migratetype]);
+	list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
 	area->nr_free++;
 }
 
@@ -960,7 +960,7 @@ static inline void move_to_free_list(struct page *page, struct zone *zone,
 {
 	struct free_area *area = &zone->free_area[order];
 
-	list_move_tail(&page->lru, &area->free_list[migratetype]);
+	list_move_tail(&page->buddy_list, &area->free_list[migratetype]);
 }
 
 static inline void del_page_from_free_list(struct page *page, struct zone *zone,
@@ -970,7 +970,7 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone,
 	if (page_reported(page))
 		__ClearPageReported(page);
 
-	list_del(&page->lru);
+	list_del(&page->buddy_list);
 	__ClearPageBuddy(page);
 	set_page_private(page, 0);
 	zone->free_area[order].nr_free--;
@@ -1508,11 +1508,11 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 		do {
 			int mt;
 
-			page = list_last_entry(list, struct page, lru);
+			page = list_last_entry(list, struct page, pcp_list);
 			mt = get_pcppage_migratetype(page);
 
 			/* must delete to avoid corrupting pcp list */
-			list_del(&page->lru);
+			list_del(&page->pcp_list);
 			count -= nr_pages;
 			pcp->count -= nr_pages;
 
@@ -3072,7 +3072,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		 * for IO devices that can merge IO requests if the physical
 		 * pages are ordered properly.
 		 */
-		list_add_tail(&page->lru, list);
+		list_add_tail(&page->pcp_list, list);
 		allocated++;
 		if (is_migrate_cma(get_pcppage_migratetype(page)))
 			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
@@ -3322,7 +3322,7 @@ void mark_free_pages(struct zone *zone)
 
 	for_each_migratetype_order(order, t) {
 		list_for_each_entry(page,
-				&zone->free_area[order].free_list[t], lru) {
+				&zone->free_area[order].free_list[t], buddy_list) {
 			unsigned long i;
 
 			pfn = page_to_pfn(page);
@@ -3411,7 +3411,7 @@ static void free_unref_page_commit(struct page *page, int migratetype,
 	__count_vm_event(PGFREE);
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pindex = order_to_pindex(migratetype, order);
-	list_add(&page->lru, &pcp->lists[pindex]);
+	list_add(&page->pcp_list, &pcp->lists[pindex]);
 	pcp->count += 1 << order;
 
 	/*
@@ -3674,8 +3674,8 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 				return NULL;
 		}
 
-		page = list_first_entry(list, struct page, lru);
-		list_del(&page->lru);
+		page = list_first_entry(list, struct page, pcp_list);
+		list_del(&page->pcp_list);
 		pcp->count -= 1 << order;
 	} while (check_new_pcp(page, order));
 
-- 
cgit 


From 5d0a661d808fc8ddc26940b1a12b82ae356f3ae2 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Fri, 24 Jun 2022 13:54:18 +0100
Subject: mm/page_alloc: use only one PCP list for THP-sized allocations

The per_cpu_pages is cache-aligned on a standard x86-64 distribution
configuration but a later patch will add a new field which would push the
structure into the next cache line.  Use only one list to store THP-sized
pages on the per-cpu list.  This assumes that the vast majority of
THP-sized allocations are GFP_MOVABLE but even if it was another type, it
would not contribute to serious fragmentation that potentially causes a
later THP allocation failure.  Align per_cpu_pages on the cacheline
boundary to ensure there is no false cache sharing.

After this patch, the structure sizing is;

struct per_cpu_pages {
        int                        count;                /*     0     4 */
        int                        high;                 /*     4     4 */
        int                        batch;                /*     8     4 */
        short int                  free_factor;          /*    12     2 */
        short int                  expire;               /*    14     2 */
        struct list_head           lists[13];            /*    16   208 */

        /* size: 256, cachelines: 4, members: 6 */
        /* padding: 32 */
} __attribute__((__aligned__(64)));

Link: https://lkml.kernel.org/r/20220624125423.6126-3-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Tested-by: Minchan Kim <minchan@kernel.org>
Acked-by: Minchan Kim <minchan@kernel.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Yu Zhao <yuzhao@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 11 +++++++----
 mm/page_alloc.c        |  4 ++--
 2 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5da1135e6755..041136b5628a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -355,15 +355,18 @@ enum zone_watermarks {
 };
 
 /*
- * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER plus one additional
- * for pageblock size for THP if configured.
+ * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. One additional list
+ * for THP which will usually be GFP_MOVABLE. Even if it is another type,
+ * it should not contribute to serious fragmentation causing THP allocation
+ * failures.
  */
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define NR_PCP_THP 1
 #else
 #define NR_PCP_THP 0
 #endif
-#define NR_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1 + NR_PCP_THP))
+#define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1))
+#define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP)
 
 /*
  * Shift to encode migratetype and order in the same integer, with order
@@ -389,7 +392,7 @@ struct per_cpu_pages {
 
 	/* Lists of pages, one per migrate type stored on the pcp-lists */
 	struct list_head lists[NR_PCP_LISTS];
-};
+} ____cacheline_aligned_in_smp;
 
 struct per_cpu_zonestat {
 #ifdef CONFIG_SMP
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 78ba5ba66586..b5c340d2cb43 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -653,7 +653,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	if (order > PAGE_ALLOC_COSTLY_ORDER) {
 		VM_BUG_ON(order != pageblock_order);
-		base = PAGE_ALLOC_COSTLY_ORDER + 1;
+		return NR_LOWORDER_PCP_LISTS;
 	}
 #else
 	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
@@ -667,7 +667,7 @@ static inline int pindex_to_order(unsigned int pindex)
 	int order = pindex / MIGRATE_PCPTYPES;
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	if (order > PAGE_ALLOC_COSTLY_ORDER)
+	if (pindex == NR_LOWORDER_PCP_LISTS)
 		order = pageblock_order;
 #else
 	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
-- 
cgit 


From 4b23a68f953628eb4e4b7fe1294ebf93d4b8ceee Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Fri, 24 Jun 2022 13:54:21 +0100
Subject: mm/page_alloc: protect PCP lists with a spinlock

Currently the PCP lists are protected by using local_lock_irqsave to
prevent migration and IRQ reentrancy but this is inconvenient.  Remote
draining of the lists is impossible and a workqueue is required and every
task allocation/free must disable then enable interrupts which is
expensive.

As preparation for dealing with both of those problems, protect the
lists with a spinlock.  The IRQ-unsafe version of the lock is used
because IRQs are already disabled by local_lock_irqsave.  spin_trylock
is used in combination with local_lock_irqsave() but later will be
replaced with a spin_trylock_irqsave when the local_lock is removed.

The per_cpu_pages still fits within the same number of cache lines after
this patch relative to before the series.

struct per_cpu_pages {
        spinlock_t                 lock;                 /*     0     4 */
        int                        count;                /*     4     4 */
        int                        high;                 /*     8     4 */
        int                        batch;                /*    12     4 */
        short int                  free_factor;          /*    16     2 */
        short int                  expire;               /*    18     2 */

        /* XXX 4 bytes hole, try to pack */

        struct list_head           lists[13];            /*    24   208 */

        /* size: 256, cachelines: 4, members: 7 */
        /* sum members: 228, holes: 1, sum holes: 4 */
        /* padding: 24 */
} __attribute__((__aligned__(64)));

There is overhead in the fast path due to acquiring the spinlock even
though the spinlock is per-cpu and uncontended in the common case.  Page
Fault Test (PFT) running on a 1-socket reported the following results on a
1 socket machine.

                                     5.19.0-rc3               5.19.0-rc3
                                        vanilla      mm-pcpspinirq-v5r16
Hmean     faults/sec-1   869275.7381 (   0.00%)   874597.5167 *   0.61%*
Hmean     faults/sec-3  2370266.6681 (   0.00%)  2379802.0362 *   0.40%*
Hmean     faults/sec-5  2701099.7019 (   0.00%)  2664889.7003 *  -1.34%*
Hmean     faults/sec-7  3517170.9157 (   0.00%)  3491122.8242 *  -0.74%*
Hmean     faults/sec-8  3965729.6187 (   0.00%)  3939727.0243 *  -0.66%*

There is a small hit in the number of faults per second but given that the
results are more stable, it's borderline noise.

[akpm@linux-foundation.org: add missing local_unlock_irqrestore() on contention path]
Link: https://lkml.kernel.org/r/20220624125423.6126-6-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Tested-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Tested-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h |   1 +
 mm/page_alloc.c        | 119 ++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 99 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 041136b5628a..578247a341b2 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -382,6 +382,7 @@ enum zone_watermarks {
 
 /* Fields and list protected by pagesets local_lock in page_alloc.c */
 struct per_cpu_pages {
+	spinlock_t lock;	/* Protects lists field */
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
 	int batch;		/* chunk size for buddy add/remove */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 026c9437456c..a08ec4ac7ef2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -133,6 +133,20 @@ static DEFINE_PER_CPU(struct pagesets, pagesets) = {
 	.lock = INIT_LOCAL_LOCK(lock),
 };
 
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+/*
+ * On SMP, spin_trylock is sufficient protection.
+ * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
+ */
+#define pcp_trylock_prepare(flags)	do { } while (0)
+#define pcp_trylock_finish(flag)	do { } while (0)
+#else
+
+/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
+#define pcp_trylock_prepare(flags)	local_irq_save(flags)
+#define pcp_trylock_finish(flags)	local_irq_restore(flags)
+#endif
+
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -3101,15 +3115,22 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
  */
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
-	unsigned long flags;
 	int to_drain, batch;
 
-	local_lock_irqsave(&pagesets.lock, flags);
 	batch = READ_ONCE(pcp->batch);
 	to_drain = min(pcp->count, batch);
-	if (to_drain > 0)
+	if (to_drain > 0) {
+		unsigned long flags;
+
+		/*
+		 * free_pcppages_bulk expects IRQs disabled for zone->lock
+		 * so even though pcp->lock is not intended to be IRQ-safe,
+		 * it's needed in this context.
+		 */
+		spin_lock_irqsave(&pcp->lock, flags);
 		free_pcppages_bulk(zone, to_drain, pcp, 0);
-	local_unlock_irqrestore(&pagesets.lock, flags);
+		spin_unlock_irqrestore(&pcp->lock, flags);
+	}
 }
 #endif
 
@@ -3122,16 +3143,17 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
  */
 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 {
-	unsigned long flags;
 	struct per_cpu_pages *pcp;
 
-	local_lock_irqsave(&pagesets.lock, flags);
-
 	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
-	if (pcp->count)
-		free_pcppages_bulk(zone, pcp->count, pcp, 0);
+	if (pcp->count) {
+		unsigned long flags;
 
-	local_unlock_irqrestore(&pagesets.lock, flags);
+		/* See drain_zone_pages on why this is disabling IRQs */
+		spin_lock_irqsave(&pcp->lock, flags);
+		free_pcppages_bulk(zone, pcp->count, pcp, 0);
+		spin_unlock_irqrestore(&pcp->lock, flags);
+	}
 }
 
 /*
@@ -3399,17 +3421,15 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
 	return min(READ_ONCE(pcp->batch) << 2, high);
 }
 
-static void free_unref_page_commit(struct page *page, int migratetype,
+static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
+				   struct page *page, int migratetype,
 				   unsigned int order)
 {
-	struct zone *zone = page_zone(page);
-	struct per_cpu_pages *pcp;
 	int high;
 	int pindex;
 	bool free_high;
 
 	__count_vm_event(PGFREE);
-	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pindex = order_to_pindex(migratetype, order);
 	list_add(&page->pcp_list, &pcp->lists[pindex]);
 	pcp->count += 1 << order;
@@ -3436,6 +3456,9 @@ static void free_unref_page_commit(struct page *page, int migratetype,
 void free_unref_page(struct page *page, unsigned int order)
 {
 	unsigned long flags;
+	unsigned long __maybe_unused UP_flags;
+	struct per_cpu_pages *pcp;
+	struct zone *zone;
 	unsigned long pfn = page_to_pfn(page);
 	int migratetype;
 
@@ -3459,7 +3482,16 @@ void free_unref_page(struct page *page, unsigned int order)
 	}
 
 	local_lock_irqsave(&pagesets.lock, flags);
-	free_unref_page_commit(page, migratetype, order);
+	zone = page_zone(page);
+	pcp_trylock_prepare(UP_flags);
+	pcp = this_cpu_ptr(zone->per_cpu_pageset);
+	if (spin_trylock(&pcp->lock)) {
+		free_unref_page_commit(zone, pcp, page, migratetype, order);
+		spin_unlock(&pcp->lock);
+	} else {
+		free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
+	}
+	pcp_trylock_finish(UP_flags);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
@@ -3469,6 +3501,8 @@ void free_unref_page(struct page *page, unsigned int order)
 void free_unref_page_list(struct list_head *list)
 {
 	struct page *page, *next;
+	struct per_cpu_pages *pcp = NULL;
+	struct zone *locked_zone = NULL;
 	unsigned long flags;
 	int batch_count = 0;
 	int migratetype;
@@ -3495,6 +3529,17 @@ void free_unref_page_list(struct list_head *list)
 
 	local_lock_irqsave(&pagesets.lock, flags);
 	list_for_each_entry_safe(page, next, list, lru) {
+		struct zone *zone = page_zone(page);
+
+		/* Different zone, different pcp lock. */
+		if (zone != locked_zone) {
+			if (pcp)
+				spin_unlock(&pcp->lock);
+			locked_zone = zone;
+			pcp = this_cpu_ptr(zone->per_cpu_pageset);
+			spin_lock(&pcp->lock);
+		}
+
 		/*
 		 * Non-isolated types over MIGRATE_PCPTYPES get added
 		 * to the MIGRATE_MOVABLE pcp list.
@@ -3504,18 +3549,24 @@ void free_unref_page_list(struct list_head *list)
 			migratetype = MIGRATE_MOVABLE;
 
 		trace_mm_page_free_batched(page);
-		free_unref_page_commit(page, migratetype, 0);
+		free_unref_page_commit(zone, pcp, page, migratetype, 0);
 
 		/*
 		 * Guard against excessive IRQ disabled times when we get
 		 * a large list of pages to free.
 		 */
 		if (++batch_count == SWAP_CLUSTER_MAX) {
+			spin_unlock(&pcp->lock);
 			local_unlock_irqrestore(&pagesets.lock, flags);
 			batch_count = 0;
 			local_lock_irqsave(&pagesets.lock, flags);
+			pcp = this_cpu_ptr(locked_zone->per_cpu_pageset);
+			spin_lock(&pcp->lock);
 		}
 	}
+
+	if (pcp)
+		spin_unlock(&pcp->lock);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
@@ -3729,18 +3780,32 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	struct list_head *list;
 	struct page *page;
 	unsigned long flags;
+	unsigned long __maybe_unused UP_flags;
 
 	local_lock_irqsave(&pagesets.lock, flags);
 
+	/*
+	 * spin_trylock may fail due to a parallel drain. In the future, the
+	 * trylock will also protect against IRQ reentrancy.
+	 */
+	pcp = this_cpu_ptr(zone->per_cpu_pageset);
+	pcp_trylock_prepare(UP_flags);
+	if (!spin_trylock(&pcp->lock)) {
+		pcp_trylock_finish(UP_flags);
+		local_unlock_irqrestore(&pagesets.lock, flags);
+		return NULL;
+	}
+
 	/*
 	 * On allocation, reduce the number of pages that are batch freed.
 	 * See nr_pcp_free() where free_factor is increased for subsequent
 	 * frees.
 	 */
-	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pcp->free_factor >>= 1;
 	list = &pcp->lists[order_to_pindex(migratetype, order)];
 	page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
+	spin_unlock(&pcp->lock);
+	pcp_trylock_finish(UP_flags);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 	if (page) {
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
@@ -3775,7 +3840,8 @@ struct page *rmqueue(struct zone *preferred_zone,
 				migratetype != MIGRATE_MOVABLE) {
 			page = rmqueue_pcplist(preferred_zone, zone, order,
 					gfp_flags, migratetype, alloc_flags);
-			goto out;
+			if (likely(page))
+				goto out;
 		}
 	}
 
@@ -5260,6 +5326,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 {
 	struct page *page;
 	unsigned long flags;
+	unsigned long __maybe_unused UP_flags;
 	struct zone *zone;
 	struct zoneref *z;
 	struct per_cpu_pages *pcp;
@@ -5340,11 +5407,15 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 	if (unlikely(!zone))
 		goto failed;
 
-	/* Attempt the batch allocation */
+	/* Is a parallel drain in progress? */
 	local_lock_irqsave(&pagesets.lock, flags);
+	pcp_trylock_prepare(UP_flags);
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
-	pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
+	if (!spin_trylock(&pcp->lock))
+		goto failed_irq;
 
+	/* Attempt the batch allocation */
+	pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
 	while (nr_populated < nr_pages) {
 
 		/* Skip existing pages */
@@ -5357,8 +5428,10 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 								pcp, pcp_list);
 		if (unlikely(!page)) {
 			/* Try and allocate at least one page */
-			if (!nr_account)
+			if (!nr_account) {
+				spin_unlock(&pcp->lock);
 				goto failed_irq;
+			}
 			break;
 		}
 		nr_account++;
@@ -5371,6 +5444,8 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		nr_populated++;
 	}
 
+	spin_unlock(&pcp->lock);
+	pcp_trylock_finish(UP_flags);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 
 	__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
@@ -5380,6 +5455,7 @@ out:
 	return nr_populated;
 
 failed_irq:
+	pcp_trylock_finish(UP_flags);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 
 failed:
@@ -7020,6 +7096,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
 	memset(pcp, 0, sizeof(*pcp));
 	memset(pzstats, 0, sizeof(*pzstats));
 
+	spin_lock_init(&pcp->lock);
 	for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
 		INIT_LIST_HEAD(&pcp->lists[pindex]);
 
-- 
cgit 


From 840532711d7299d7e937952482ec899d4622c452 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Mon, 11 Jul 2022 12:35:35 +0530
Subject: mm/mmap: build protect protection_map[] with __P000

Patch series "mm/mmap: Drop __SXXX/__PXXX macros from across platforms",
v7.

__SXXX/__PXXX macros are unnecessary abstraction layer in creating the
generic protection_map[] array which is used for vm_get_page_prot().  This
abstraction layer can be avoided, if the platforms just define the array
protection_map[] for all possible vm_flags access permission combinations
and also export vm_get_page_prot() implementation.

This series drops __SXXX/__PXXX macros from across platforms in the tree.
First it build protects generic protection_map[] array with '#ifdef
__P000' and moves it inside platforms which enable
ARCH_HAS_VM_GET_PAGE_PROT.  Later this build protects same array with
'#ifdef ARCH_HAS_VM_GET_PAGE_PROT' and moves inside remaining platforms
while enabling ARCH_HAS_VM_GET_PAGE_PROT.  This adds a new macro
DECLARE_VM_GET_PAGE_PROT defining the current generic vm_get_page_prot(),
in order for it to be reused on platforms that do not require custom
implementation.  Finally, ARCH_HAS_VM_GET_PAGE_PROT can just be dropped,
as all platforms now define and export vm_get_page_prot(), via looking up
a private and static protection_map[] array.  protection_map[] data type
has been changed as 'static const' on all platforms that do not change it
during boot.


This patch (of 26):

Build protect generic protection_map[] array with __P000, so that it can
be moved inside all the platforms one after the other.  Otherwise there
will be build failures during this process.
CONFIG_ARCH_HAS_VM_GET_PAGE_PROT cannot be used for this purpose as only
certain platforms enable this config now.

Link: https://lkml.kernel.org/r/20220711070600.2378316-1-anshuman.khandual@arm.com
Link: https://lkml.kernel.org/r/20220711070600.2378316-2-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Suggested-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brian Cain <bcain@quicinc.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Chris Zankel <chris@zankel.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Dinh Nguyen <dinguyen@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Rich Felker <dalias@libc.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vineet Gupta <vgupta@kernel.org>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Will Deacon <will@kernel.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 2 ++
 mm/mmap.c          | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d4ebfc206e2b..1a435ce146a2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -425,7 +425,9 @@ extern unsigned int kobjsize(const void *objp);
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
  */
+#ifdef __P000
 extern pgprot_t protection_map[16];
+#endif
 
 /*
  * The default fault flags that should be used by most of the
diff --git a/mm/mmap.c b/mm/mmap.c
index c14d7286a379..def0e03cf25c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -101,6 +101,7 @@ static void unmap_region(struct mm_struct *mm,
  *								w: (no) no
  *								x: (yes) yes
  */
+#ifdef __P000
 pgprot_t protection_map[16] __ro_after_init = {
 	[VM_NONE]					= __P000,
 	[VM_READ]					= __P001,
@@ -119,6 +120,7 @@ pgprot_t protection_map[16] __ro_after_init = {
 	[VM_SHARED | VM_EXEC | VM_WRITE]		= __S110,
 	[VM_SHARED | VM_EXEC | VM_WRITE | VM_READ]	= __S111
 };
+#endif
 
 #ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT
 pgprot_t vm_get_page_prot(unsigned long vm_flags)
-- 
cgit 


From 43957b5d11037a651d162f65c682ec3c76777fc8 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Mon, 11 Jul 2022 12:35:36 +0530
Subject: mm/mmap: define DECLARE_VM_GET_PAGE_PROT

This just converts the generic vm_get_page_prot() implementation into a
new macro i.e DECLARE_VM_GET_PAGE_PROT which later can be used across
platforms when enabling them with ARCH_HAS_VM_GET_PAGE_PROT.  This does
not create any functional change.

Link: https://lkml.kernel.org/r/20220711070600.2378316-3-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Suggested-by: Christoph Hellwig <hch@infradead.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brian Cain <bcain@quicinc.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chris Zankel <chris@zankel.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Dinh Nguyen <dinguyen@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Rich Felker <dalias@libc.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vineet Gupta <vgupta@kernel.org>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Will Deacon <will@kernel.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pgtable.h | 28 ++++++++++++++++++++++++++++
 mm/mmap.c               | 26 +-------------------------
 2 files changed, 29 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 3cdc16cfd867..014ee8f0fbaa 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1689,4 +1689,32 @@ typedef unsigned int pgtbl_mod_mask;
 #define MAX_PTRS_PER_P4D PTRS_PER_P4D
 #endif
 
+/* description of effects of mapping type and prot in current implementation.
+ * this is due to the limited x86 page protection hardware.  The expected
+ * behavior is in parens:
+ *
+ * map_type	prot
+ *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
+ * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
+ *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
+ *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
+ *
+ * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
+ *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
+ *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
+ *
+ * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and
+ * MAP_PRIVATE (with Enhanced PAN supported):
+ *								r: (no) no
+ *								w: (no) no
+ *								x: (yes) yes
+ */
+#define DECLARE_VM_GET_PAGE_PROT					\
+pgprot_t vm_get_page_prot(unsigned long vm_flags)			\
+{									\
+		return protection_map[vm_flags &			\
+			(VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)];	\
+}									\
+EXPORT_SYMBOL(vm_get_page_prot);
+
 #endif /* _LINUX_PGTABLE_H */
diff --git a/mm/mmap.c b/mm/mmap.c
index def0e03cf25c..3c0d65743bc4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -81,26 +81,6 @@ static void unmap_region(struct mm_struct *mm,
 		struct vm_area_struct *vma, struct vm_area_struct *prev,
 		unsigned long start, unsigned long end);
 
-/* description of effects of mapping type and prot in current implementation.
- * this is due to the limited x86 page protection hardware.  The expected
- * behavior is in parens:
- *
- * map_type	prot
- *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
- * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
- *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
- *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
- *
- * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
- *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
- *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
- *
- * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and
- * MAP_PRIVATE (with Enhanced PAN supported):
- *								r: (no) no
- *								w: (no) no
- *								x: (yes) yes
- */
 #ifdef __P000
 pgprot_t protection_map[16] __ro_after_init = {
 	[VM_NONE]					= __P000,
@@ -123,11 +103,7 @@ pgprot_t protection_map[16] __ro_after_init = {
 #endif
 
 #ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT
-pgprot_t vm_get_page_prot(unsigned long vm_flags)
-{
-	return protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
-}
-EXPORT_SYMBOL(vm_get_page_prot);
+DECLARE_VM_GET_PAGE_PROT
 #endif	/* CONFIG_ARCH_HAS_VM_GET_PAGE_PROT */
 
 static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
-- 
cgit 


From 09095f74130dfb2110ef2bcdd9ad0d42addaa1d5 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Mon, 11 Jul 2022 12:35:41 +0530
Subject: mm/mmap: build protect protection_map[] with
 ARCH_HAS_VM_GET_PAGE_PROT

Now that protection_map[] has been moved inside those platforms that
enable ARCH_HAS_VM_GET_PAGE_PROT.  Hence generic protection_map[] array
now can be protected with CONFIG_ARCH_HAS_VM_GET_PAGE_PROT intead of
__P000.

Link: https://lkml.kernel.org/r/20220711070600.2378316-8-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brian Cain <bcain@quicinc.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chris Zankel <chris@zankel.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Dinh Nguyen <dinguyen@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Rich Felker <dalias@libc.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vineet Gupta <vgupta@kernel.org>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Will Deacon <will@kernel.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 mm/mmap.c          | 5 +----
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1a435ce146a2..4b4dc93f9bc3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -425,7 +425,7 @@ extern unsigned int kobjsize(const void *objp);
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
  */
-#ifdef __P000
+#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT
 extern pgprot_t protection_map[16];
 #endif
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 3c0d65743bc4..2a58a9cd0752 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -81,7 +81,7 @@ static void unmap_region(struct mm_struct *mm,
 		struct vm_area_struct *vma, struct vm_area_struct *prev,
 		unsigned long start, unsigned long end);
 
-#ifdef __P000
+#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT
 pgprot_t protection_map[16] __ro_after_init = {
 	[VM_NONE]					= __P000,
 	[VM_READ]					= __P001,
@@ -100,9 +100,6 @@ pgprot_t protection_map[16] __ro_after_init = {
 	[VM_SHARED | VM_EXEC | VM_WRITE]		= __S110,
 	[VM_SHARED | VM_EXEC | VM_WRITE | VM_READ]	= __S111
 };
-#endif
-
-#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT
 DECLARE_VM_GET_PAGE_PROT
 #endif	/* CONFIG_ARCH_HAS_VM_GET_PAGE_PROT */
 
-- 
cgit 


From 3d923c5f1e21ad491acd4c0d62bf2481ce94016c Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Mon, 11 Jul 2022 12:36:00 +0530
Subject: mm/mmap: drop ARCH_HAS_VM_GET_PAGE_PROT

Now all the platforms enable ARCH_HAS_GET_PAGE_PROT.  They define and
export own vm_get_page_prot() whether custom or standard
DECLARE_VM_GET_PAGE_PROT.  Hence there is no need for default generic
fallback for vm_get_page_prot().  Just drop this fallback and also
ARCH_HAS_GET_PAGE_PROT mechanism.

Link: https://lkml.kernel.org/r/20220711070600.2378316-27-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brian Cain <bcain@quicinc.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Chris Zankel <chris@zankel.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Dinh Nguyen <dinguyen@kernel.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Rich Felker <dalias@libc.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vineet Gupta <vgupta@kernel.org>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Will Deacon <will@kernel.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/alpha/Kconfig      |  1 -
 arch/arc/Kconfig        |  1 -
 arch/arm/Kconfig        |  1 -
 arch/arm64/Kconfig      |  1 -
 arch/csky/Kconfig       |  1 -
 arch/hexagon/Kconfig    |  1 -
 arch/ia64/Kconfig       |  1 -
 arch/loongarch/Kconfig  |  1 -
 arch/m68k/Kconfig       |  1 -
 arch/microblaze/Kconfig |  1 -
 arch/mips/Kconfig       |  1 -
 arch/nios2/Kconfig      |  1 -
 arch/openrisc/Kconfig   |  1 -
 arch/parisc/Kconfig     |  1 -
 arch/powerpc/Kconfig    |  1 -
 arch/riscv/Kconfig      |  1 -
 arch/s390/Kconfig       |  1 -
 arch/sh/Kconfig         |  1 -
 arch/sparc/Kconfig      |  1 -
 arch/um/Kconfig         |  1 -
 arch/x86/Kconfig        |  1 -
 arch/xtensa/Kconfig     |  1 -
 include/linux/mm.h      |  3 ---
 mm/Kconfig              |  3 ---
 mm/mmap.c               | 22 ----------------------
 25 files changed, 50 deletions(-)

(limited to 'include')

diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index db1c8b329461..7d0d26b5b3f5 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -2,7 +2,6 @@
 config ALPHA
 	bool
 	default y
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_32BIT_USTAT_F_TINODE
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_MIGHT_HAVE_PC_SERIO
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 8be56a5d8a9b..9e3653253ef2 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -13,7 +13,6 @@ config ARC
 	select ARCH_HAS_SETUP_DMA_OPS
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_SUPPORTS_ATOMIC_RMW if ARC_HAS_LLSC
 	select ARCH_32BIT_OFF_T
 	select BUILDTIME_TABLE_SORT
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index e153b6d4fc5b..7630ba9cb6cc 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -24,7 +24,6 @@ config ARM
 	select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB || !MMU
 	select ARCH_HAS_TEARDOWN_DMA_OPS if MMU
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_HAVE_CUSTOM_GPIO_H
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG if CPU_V7 || CPU_V7M || CPU_V6K
 	select ARCH_HAS_GCOV_PROFILE_ALL
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1652a9800ebe..7030bf3f8d6f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -45,7 +45,6 @@ config ARM64
 	select ARCH_HAS_SYSCALL_WRAPPER
 	select ARCH_HAS_TEARDOWN_DMA_OPS if IOMMU_SUPPORT
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_HAS_ZONE_DMA_SET if EXPERT
 	select ARCH_HAVE_ELF_PROT
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 588b8a9c68ed..21d72b078eef 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -6,7 +6,6 @@ config CSKY
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_WANT_FRAME_POINTERS if !CPU_CK610 && $(cc-option,-mbacktrace)
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index bc4ceecd0588..54eadf265178 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -6,7 +6,6 @@ config HEXAGON
 	def_bool y
 	select ARCH_32BIT_OFF_T
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_NO_PREEMPT
 	select DMA_GLOBAL_POOL
 	# Other pending projects/to-do items.
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 0510a5737711..cb93769a9f2a 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -12,7 +12,6 @@ config IA64
 	select ARCH_HAS_DMA_MARK_CLEAN
 	select ARCH_HAS_STRNCPY_FROM_USER
 	select ARCH_HAS_STRNLEN_USER
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_MIGHT_HAVE_PC_SERIO
 	select ACPI
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index adf8cf6ec5d5..db2838cf8c02 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -9,7 +9,6 @@ config LOONGARCH
 	select ARCH_HAS_ACPI_TABLE_UPGRADE	if ACPI
 	select ARCH_HAS_PHYS_TO_DMA
 	select ARCH_HAS_PTE_SPECIAL
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_INLINE_READ_LOCK if !PREEMPTION
 	select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 49aa0cf13e96..936cce42ae9a 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -7,7 +7,6 @@ config M68K
 	select ARCH_HAS_CURRENT_STACK_POINTER
 	select ARCH_HAS_DMA_PREP_COHERENT if HAS_DMA && MMU && !COLDFIRE
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE if HAS_DMA
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS
 	select ARCH_MIGHT_HAVE_PC_PARPORT if ISA
 	select ARCH_NO_PREEMPT if !COLDFIRE
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 15f91ba8a0c4..8cf429ad1c84 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -7,7 +7,6 @@ config MICROBLAZE
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select BUILDTIME_TABLE_SORT
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index d0b7eb11ec81..db09d45d59ec 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -14,7 +14,6 @@ config MIPS
 	select ARCH_HAS_STRNLEN_USER
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_KEEP_MEMBLOCK
 	select ARCH_SUPPORTS_UPROBES
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index e0459dffd218..4167f1eb4cd8 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -6,7 +6,6 @@ config NIOS2
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
 	select ARCH_HAS_DMA_SET_UNCACHED
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_NO_SWAP
 	select COMMON_CLK
 	select TIMER_OF
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index fe0dfb50eb86..e814df4c483c 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -10,7 +10,6 @@ config OPENRISC
 	select ARCH_HAS_DMA_SET_UNCACHED
 	select ARCH_HAS_DMA_CLEAR_UNCACHED
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select COMMON_CLK
 	select OF
 	select OF_EARLY_FLATTREE
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 891d82393957..fa400055b2d5 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -12,7 +12,6 @@ config PARISC
 	select ARCH_HAS_STRICT_KERNEL_RWX
 	select ARCH_HAS_STRICT_MODULE_RWX
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_NO_SG_CHAIN
 	select ARCH_SUPPORTS_HUGETLBFS if PA20
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1035d172c7dd..250b8658b2d4 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -140,7 +140,6 @@ config PPC
 	select ARCH_HAS_TICK_BROADCAST		if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAS_UACCESS_FLUSHCACHE
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_KEEP_MEMBLOCK
 	select ARCH_MIGHT_HAVE_PC_PARPORT
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 583389d4e43a..32ffef9f6e5b 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -32,7 +32,6 @@ config RISCV
 	select ARCH_HAS_STRICT_MODULE_RWX if MMU && !XIP_KERNEL
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
 	select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT
 	select ARCH_STACKWALK
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index c4481377ca83..91c0b80a8bf0 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -81,7 +81,6 @@ config S390
 	select ARCH_HAS_SYSCALL_WRAPPER
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
 	select ARCH_HAS_VDSO_DATA
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_INLINE_READ_LOCK
 	select ARCH_INLINE_READ_LOCK_BH
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 91f3ea325388..5f220e903e5a 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -12,7 +12,6 @@ config SUPERH
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_HIBERNATION_POSSIBLE if MMU
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_WANT_IPC_PARSE_VERSION
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 09f868613a4d..9c1cce74953a 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -13,7 +13,6 @@ config 64BIT
 config SPARC
 	bool
 	default y
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_MIGHT_HAVE_PC_PARPORT if SPARC64 && PCI
 	select ARCH_MIGHT_HAVE_PC_SERIO
 	select DMA_OPS
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index 7fb43654e5b5..4ec22e156a2e 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -10,7 +10,6 @@ config UML
 	select ARCH_HAS_KCOV
 	select ARCH_HAS_STRNCPY_FROM_USER
 	select ARCH_HAS_STRNLEN_USER
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_NO_PREEMPT
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_SECCOMP_FILTER
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index be0b95e51df6..841e4843d0c4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -94,7 +94,6 @@ config X86
 	select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
 	select ARCH_HAS_SYSCALL_WRAPPER
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_HAS_DEBUG_WX
 	select ARCH_HAS_ZONE_DMA_SET if EXPERT
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 4c0d83520ff1..0b0f0172cced 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -11,7 +11,6 @@ config XTENSA
 	select ARCH_HAS_DMA_SET_UNCACHED if MMU
 	select ARCH_HAS_STRNCPY_FROM_USER if !KASAN
 	select ARCH_HAS_STRNLEN_USER
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_USE_MEMTEST
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4b4dc93f9bc3..61e3101c44ea 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -425,9 +425,6 @@ extern unsigned int kobjsize(const void *objp);
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
  */
-#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT
-extern pgprot_t protection_map[16];
-#endif
 
 /*
  * The default fault flags that should be used by most of the
diff --git a/mm/Kconfig b/mm/Kconfig
index c1fa4993a56f..56ca0e7c6f9a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -951,9 +951,6 @@ config ARCH_HAS_CURRENT_STACK_POINTER
 	  register alias named "current_stack_pointer", this config can be
 	  selected.
 
-config ARCH_HAS_VM_GET_PAGE_PROT
-	bool
-
 config ARCH_HAS_PTE_DEVMAP
 	bool
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 2a58a9cd0752..edf27a2789a2 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -81,28 +81,6 @@ static void unmap_region(struct mm_struct *mm,
 		struct vm_area_struct *vma, struct vm_area_struct *prev,
 		unsigned long start, unsigned long end);
 
-#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT
-pgprot_t protection_map[16] __ro_after_init = {
-	[VM_NONE]					= __P000,
-	[VM_READ]					= __P001,
-	[VM_WRITE]					= __P010,
-	[VM_WRITE | VM_READ]				= __P011,
-	[VM_EXEC]					= __P100,
-	[VM_EXEC | VM_READ]				= __P101,
-	[VM_EXEC | VM_WRITE]				= __P110,
-	[VM_EXEC | VM_WRITE | VM_READ]			= __P111,
-	[VM_SHARED]					= __S000,
-	[VM_SHARED | VM_READ]				= __S001,
-	[VM_SHARED | VM_WRITE]				= __S010,
-	[VM_SHARED | VM_WRITE | VM_READ]		= __S011,
-	[VM_SHARED | VM_EXEC]				= __S100,
-	[VM_SHARED | VM_EXEC | VM_READ]			= __S101,
-	[VM_SHARED | VM_EXEC | VM_WRITE]		= __S110,
-	[VM_SHARED | VM_EXEC | VM_WRITE | VM_READ]	= __S111
-};
-DECLARE_VM_GET_PAGE_PROT
-#endif	/* CONFIG_ARCH_HAS_VM_GET_PAGE_PROT */
-
 static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
 {
 	return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
-- 
cgit 


From 3ce4fee4401206cf5a2c476ec0ee6c90191dfade Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Mon, 4 Jul 2022 21:21:55 +0800
Subject: mm/huge_memory: check pmd_present first in is_huge_zero_pmd

When pmd is non-present, pmd_pfn returns an insane value. So we should
check pmd_present first to avoid acquiring such insane value and also
avoid touching possible cold huge_zero_pfn cache line when pmd isn't
present.

Link: https://lkml.kernel.org/r/20220704132201.14611-11-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Zach O'Keefe <zokeefe@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ae3d8e2fd9e2..12b297f9951d 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -273,7 +273,7 @@ static inline bool is_huge_zero_page(struct page *page)
 
 static inline bool is_huge_zero_pmd(pmd_t pmd)
 {
-	return READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd) && pmd_present(pmd);
+	return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd);
 }
 
 static inline bool is_huge_zero_pud(pud_t pud)
-- 
cgit 


From 121c1781aeb00475d163246d9ae7d8746e377040 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Mon, 4 Jul 2022 21:21:58 +0800
Subject: mm/huge_memory: fix comment of page_deferred_list

The current comment is confusing because if global or memcg deferred list
in the second tail page is occupied by compound_head, why we still use
page[2].deferred_list here? I think it wants to say that Global or memcg
deferred list in the first tail page is occupied by compound_mapcount and
compound_pincount so we use the second tail page's deferred_list instead.

Link: https://lkml.kernel.org/r/20220704132201.14611-14-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Zach O'Keefe <zokeefe@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 12b297f9951d..37f2f11a6d7e 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -294,8 +294,8 @@ static inline bool thp_migration_supported(void)
 static inline struct list_head *page_deferred_list(struct page *page)
 {
 	/*
-	 * Global or memcg deferred list in the second tail pages is
-	 * occupied by compound_head.
+	 * See organization of tail pages of compound page in
+	 * "struct page" definition.
 	 */
 	return &page[2].deferred_list;
 }
-- 
cgit 


From dcadcf1c30619ead2f3280bfb7f74de8304be2bb Mon Sep 17 00:00:00 2001
From: Gang Li <ligang.bdlg@bytedance.com>
Date: Wed, 6 Jul 2022 11:46:54 +0800
Subject: mm, hugetlb: skip irrelevant nodes in show_free_areas()

show_free_areas() allows to filter out node specific data which is
irrelevant to the allocation request.  But hugetlb_show_meminfo() still
shows hugetlb on all nodes, which is redundant and unnecessary.

Use show_mem_node_skip() to skip irrelevant nodes.  And replace
hugetlb_show_meminfo() with hugetlb_show_meminfo_node(nid).

before-and-after sample output of OOM:

before:
```
[  214.362453] Node 1 active_anon:148kB inactive_anon:4050920kB active_file:112kB inactive_file:100kB
[  214.375429] Node 1 Normal free:45100kB boost:0kB min:45576kB low:56968kB high:68360kB reserved_hig
[  214.388334] lowmem_reserve[]: 0 0 0 0 0
[  214.390251] Node 1 Normal: 423*4kB (UE) 320*8kB (UME) 187*16kB (UE) 117*32kB (UE) 57*64kB (UME) 20
[  214.397626] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
[  214.401518] Node 1 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
```

after:
```
[  145.069705] Node 1 active_anon:128kB inactive_anon:4049412kB active_file:56kB inactive_file:84kB u
[  145.110319] Node 1 Normal free:45424kB boost:0kB min:45576kB low:56968kB high:68360kB reserved_hig
[  145.152315] lowmem_reserve[]: 0 0 0 0 0
[  145.155244] Node 1 Normal: 470*4kB (UME) 373*8kB (UME) 247*16kB (UME) 168*32kB (UE) 86*64kB (UME)
[  145.164119] Node 1 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
```

Link: https://lkml.kernel.org/r/20220706034655.1834-1-ligang.bdlg@bytedance.com
Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h |  4 ++--
 mm/hugetlb.c            | 18 ++++++++----------
 mm/page_alloc.c         |  8 ++++++--
 3 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 75ee739d815b..4cdfce976644 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -152,7 +152,7 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 			  struct page *ref_page, zap_flags_t zap_flags);
 void hugetlb_report_meminfo(struct seq_file *);
 int hugetlb_report_node_meminfo(char *buf, int len, int nid);
-void hugetlb_show_meminfo(void);
+void hugetlb_show_meminfo_node(int nid);
 unsigned long hugetlb_total_pages(void);
 vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, unsigned int flags);
@@ -298,7 +298,7 @@ static inline int hugetlb_report_node_meminfo(char *buf, int len, int nid)
 	return 0;
 }
 
-static inline void hugetlb_show_meminfo(void)
+static inline void hugetlb_show_meminfo_node(int nid)
 {
 }
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 96635a2874e3..bb763f5d30b9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4477,22 +4477,20 @@ int hugetlb_report_node_meminfo(char *buf, int len, int nid)
 			     nid, h->surplus_huge_pages_node[nid]);
 }
 
-void hugetlb_show_meminfo(void)
+void hugetlb_show_meminfo_node(int nid)
 {
 	struct hstate *h;
-	int nid;
 
 	if (!hugepages_supported())
 		return;
 
-	for_each_node_state(nid, N_MEMORY)
-		for_each_hstate(h)
-			pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
-				nid,
-				h->nr_huge_pages_node[nid],
-				h->free_huge_pages_node[nid],
-				h->surplus_huge_pages_node[nid],
-				huge_page_size(h) / SZ_1K);
+	for_each_hstate(h)
+		printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
+			nid,
+			h->nr_huge_pages_node[nid],
+			h->free_huge_pages_node[nid],
+			h->surplus_huge_pages_node[nid],
+			huge_page_size(h) / SZ_1K);
 }
 
 void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 215b26664ad7..4fa96d3510fe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6022,7 +6022,7 @@ static void show_migration_types(unsigned char type)
 void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 {
 	unsigned long free_pcp = 0;
-	int cpu;
+	int cpu, nid;
 	struct zone *zone;
 	pg_data_t *pgdat;
 
@@ -6210,7 +6210,11 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 		printk(KERN_CONT "= %lukB\n", K(total));
 	}
 
-	hugetlb_show_meminfo();
+	for_each_online_node(nid) {
+		if (show_mem_node_skip(filter, nid, nodemask))
+			continue;
+		hugetlb_show_meminfo_node(nid);
+	}
 
 	printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
 
-- 
cgit 


From 0d8bc0b10aeab543bdccb86180f58db1f79f7cee Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng <xiujianfeng@huawei.com>
Date: Wed, 13 Jul 2022 20:53:14 +0800
Subject: writeback: cleanup bdi_sched_wait()

bdi_sched_wait() is no longer used since commit 839a8e8660b6 ("writeback:
replace custom worker pool implementation with unbound workqueue"), so
remove it.

Link: https://lkml.kernel.org/r/20220713125314.171345-1-xiujianfeng@huawei.com
Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Acked-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/backing-dev.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index d452071db572..e84b745a6811 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -140,12 +140,6 @@ static inline bool mapping_can_writeback(struct address_space *mapping)
 	return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK;
 }
 
-static inline int bdi_sched_wait(void *word)
-{
-	schedule();
-	return 0;
-}
-
 #ifdef CONFIG_CGROUP_WRITEBACK
 
 struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
-- 
cgit 


From 62df90b53e6f332bb69b73621998826c49a17323 Mon Sep 17 00:00:00 2001
From: wuchi <wuchi.zero@gmail.com>
Date: Sun, 19 Jun 2022 15:46:41 +0800
Subject: net, lib/once: remove {net_}get_random_once_wait macro

DO_ONCE(func, ...) will call func with spinlock which acquired by
spin_lock_irqsave in __do_once_start.  But the get_random_once_wait will
sleep in get_random_bytes_wait -> wait_for_random_bytes.

Fortunately, there is no place to use {net_}get_random_once_wait, so we
could remove them simply.

Link: https://lkml.kernel.org/r/20220619074641.40916-1-wuchi.zero@gmail.com
Signed-off-by: wuchi <wuchi.zero@gmail.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/net.h  | 2 --
 include/linux/once.h | 2 --
 2 files changed, 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/net.h b/include/linux/net.h
index 12093f4db50c..8613772a1f58 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -303,8 +303,6 @@ do {									\
 
 #define net_get_random_once(buf, nbytes)			\
 	get_random_once((buf), (nbytes))
-#define net_get_random_once_wait(buf, nbytes)			\
-	get_random_once_wait((buf), (nbytes))
 
 /*
  * E.g. XFS meta- & log-data is in slab pages, or bcache meta
diff --git a/include/linux/once.h b/include/linux/once.h
index f54523052bbc..b14d8b309d52 100644
--- a/include/linux/once.h
+++ b/include/linux/once.h
@@ -54,7 +54,5 @@ void __do_once_done(bool *done, struct static_key_true *once_key,
 
 #define get_random_once(buf, nbytes)					     \
 	DO_ONCE(get_random_bytes, (buf), (nbytes))
-#define get_random_once_wait(buf, nbytes)                                    \
-	DO_ONCE(get_random_bytes_wait, (buf), (nbytes))                      \
 
 #endif /* _LINUX_ONCE_H */
-- 
cgit 


From 43c249ea0b1e10baac4a1264a25d69723ce5d2c2 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Fri, 24 Jun 2022 16:14:12 +0200
Subject: compiler-gcc.h: remove ancient workaround for gcc PR 58670

The workaround for 'asm goto' miscompilation introduces a compiler barrier
quirk that inhibits many useful compiler optimizations.  For example,
__try_cmpxchg_user compiles to:

   11375:	41 8b 4d 00          	mov    0x0(%r13),%ecx
   11379:	41 8b 02             	mov    (%r10),%eax
   1137c:	f0 0f b1 0a          	lock cmpxchg %ecx,(%rdx)
   11380:	0f 94 c2             	sete   %dl
   11383:	84 d2                	test   %dl,%dl
   11385:	75 c4                	jne    1134b <...>
   11387:	41 89 02             	mov    %eax,(%r10)

where the barrier inhibits flags propagation from asm when compiled with
gcc-12.

When the mentioned quirk is removed, the following code is generated:

   11553:	41 8b 4d 00          	mov    0x0(%r13),%ecx
   11557:	41 8b 02             	mov    (%r10),%eax
   1155a:	f0 0f b1 0a          	lock cmpxchg %ecx,(%rdx)
   1155e:	74 c9                	je     11529 <...>
   11560:	41 89 02             	mov    %eax,(%r10)

The refered compiler bug:

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670

was fixed for gcc-4.8.2.

Current minimum required version of GCC is version 5.1 which has the above
'asm goto' miscompilation fixed, so remove the workaround.

Link: https://lkml.kernel.org/r/20220624141412.72274-1-ubizjak@gmail.com
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/compiler-gcc.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index a0c55eeaeaf1..9b157b71036f 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -66,17 +66,6 @@
 		__builtin_unreachable();	\
 	} while (0)
 
-/*
- * GCC 'asm goto' miscompiles certain code sequences:
- *
- *   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670
- *
- * Work it around via a compiler barrier quirk suggested by Jakub Jelinek.
- *
- * (asm goto is automatically volatile - the naming reflects this.)
- */
-#define asm_volatile_goto(x...)	do { asm goto(x); asm (""); } while (0)
-
 #if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP)
 #define __HAVE_BUILTIN_BSWAP32__
 #define __HAVE_BUILTIN_BSWAP64__
-- 
cgit 


From 045ed31e23aea840648c290dbde04797064960db Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Fri, 24 Jun 2022 08:30:04 +0300
Subject: kfifo: fix kfifo_to_user() return type

The kfifo_to_user() macro is supposed to return zero for success or
negative error codes.  Unfortunately, there is a signedness bug so it
returns unsigned int.  This only affects callers which try to save the
result in ssize_t and as far as I can see the only place which does that
is line6_hwdep_read().

TL;DR: s/_uint/_int/.

Link: https://lkml.kernel.org/r/YrVL3OJVLlNhIMFs@kili
Fixes: 144ecf310eb5 ("kfifo: fix kfifo_alloc() to return a signed int value")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Stefani Seibold <stefani@seibold.net>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kfifo.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h
index 86249476b57f..0b35a41440ff 100644
--- a/include/linux/kfifo.h
+++ b/include/linux/kfifo.h
@@ -688,7 +688,7 @@ __kfifo_uint_must_check_helper( \
  * writer, you don't need extra locking to use these macro.
  */
 #define	kfifo_to_user(fifo, to, len, copied) \
-__kfifo_uint_must_check_helper( \
+__kfifo_int_must_check_helper( \
 ({ \
 	typeof((fifo) + 1) __tmp = (fifo); \
 	void __user *__to = (to); \
-- 
cgit 


From 4f09903078eeb9138cddce8db06100b82f8620e8 Mon Sep 17 00:00:00 2001
From: Sander Vanheule <sander@svanheule.net>
Date: Sat, 2 Jul 2022 18:08:27 +0200
Subject: cpumask: add UP optimised for_each_*_cpu versions

On uniprocessor builds, the following loops will always run over a mask
that contains one enabled CPU (cpu0):

    - for_each_possible_cpu
    - for_each_online_cpu
    - for_each_present_cpu

Provide uniprocessor-specific macros for these loops, that always run
exactly once.

Link: https://lkml.kernel.org/r/3a92869b902a075b97be5d1452c9c6badbbff0df.1656777646.git.sander@svanheule.net
Signed-off-by: Sander Vanheule <sander@svanheule.net>
Acked-by: Yury Norov <yury.norov@gmail.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Marco Elver <elver@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/cpumask.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index fe29ac7cc469..533612770bc0 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -811,9 +811,16 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS);
 /* First bits of cpu_bit_bitmap are in fact unset. */
 #define cpu_none_mask to_cpumask(cpu_bit_bitmap[0])
 
+#if NR_CPUS == 1
+/* Uniprocessor: the possible/online/present masks are always "1" */
+#define for_each_possible_cpu(cpu)	for ((cpu) = 0; (cpu) < 1; (cpu)++)
+#define for_each_online_cpu(cpu)	for ((cpu) = 0; (cpu) < 1; (cpu)++)
+#define for_each_present_cpu(cpu)	for ((cpu) = 0; (cpu) < 1; (cpu)++)
+#else
 #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask)
 #define for_each_online_cpu(cpu)   for_each_cpu((cpu), cpu_online_mask)
 #define for_each_present_cpu(cpu)  for_each_cpu((cpu), cpu_present_mask)
+#endif
 
 /* Wrappers for arch boot code to manipulate normally-constant masks */
 void init_cpu_present(const struct cpumask *src);
-- 
cgit 


From b81dce77cedcea6f00292f02d4b1ebbfc2c5988d Mon Sep 17 00:00:00 2001
From: Sander Vanheule <sander@svanheule.net>
Date: Sat, 2 Jul 2022 18:08:25 +0200
Subject: cpumask: Fix invalid uniprocessor mask assumption

On uniprocessor builds, any CPU mask is assumed to contain exactly one CPU
(cpu0).  This assumption ignores the existence of empty masks, resulting
in incorrect behaviour.

cpumask_first_zero(), cpumask_next_zero(), and for_each_cpu_not() don't
provide behaviour matching the assumption that a UP mask is always "1",
and instead provide behaviour matching the empty mask.

Drop the incorrectly optimised code and use the generic implementations in
all cases.

Link: https://lkml.kernel.org/r/86bf3f005abba2d92120ddd0809235cab4f759a6.1656777646.git.sander@svanheule.net
Signed-off-by: Sander Vanheule <sander@svanheule.net>
Suggested-by: Yury Norov <yury.norov@gmail.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Marco Elver <elver@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/cpumask.h | 99 ++++++++++---------------------------------------
 lib/Makefile            |  3 +-
 lib/cpumask.c           |  2 +
 3 files changed, 22 insertions(+), 82 deletions(-)

(limited to 'include')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 533612770bc0..6c5b4ee000f2 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -116,85 +116,6 @@ static __always_inline unsigned int cpumask_check(unsigned int cpu)
 	return cpu;
 }
 
-#if NR_CPUS == 1
-/* Uniprocessor.  Assume all masks are "1". */
-static inline unsigned int cpumask_first(const struct cpumask *srcp)
-{
-	return 0;
-}
-
-static inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
-{
-	return 0;
-}
-
-static inline unsigned int cpumask_first_and(const struct cpumask *srcp1,
-					     const struct cpumask *srcp2)
-{
-	return 0;
-}
-
-static inline unsigned int cpumask_last(const struct cpumask *srcp)
-{
-	return 0;
-}
-
-/* Valid inputs for n are -1 and 0. */
-static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
-{
-	return n+1;
-}
-
-static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
-{
-	return n+1;
-}
-
-static inline unsigned int cpumask_next_and(int n,
-					    const struct cpumask *srcp,
-					    const struct cpumask *andp)
-{
-	return n+1;
-}
-
-static inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask,
-					     int start, bool wrap)
-{
-	/* cpu0 unless stop condition, wrap and at cpu0, then nr_cpumask_bits */
-	return (wrap && n == 0);
-}
-
-/* cpu must be a valid cpu, ie 0, so there's no other choice. */
-static inline unsigned int cpumask_any_but(const struct cpumask *mask,
-					   unsigned int cpu)
-{
-	return 1;
-}
-
-static inline unsigned int cpumask_local_spread(unsigned int i, int node)
-{
-	return 0;
-}
-
-static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
-					     const struct cpumask *src2p) {
-	return cpumask_first_and(src1p, src2p);
-}
-
-static inline int cpumask_any_distribute(const struct cpumask *srcp)
-{
-	return cpumask_first(srcp);
-}
-
-#define for_each_cpu(cpu, mask)			\
-	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
-#define for_each_cpu_not(cpu, mask)		\
-	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
-#define for_each_cpu_wrap(cpu, mask, start)	\
-	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start))
-#define for_each_cpu_and(cpu, mask1, mask2)	\
-	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask1, (void)mask2)
-#else
 /**
  * cpumask_first - get the first cpu in a cpumask
  * @srcp: the cpumask pointer
@@ -260,10 +181,29 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
 
 int __pure cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
 int __pure cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
+
+#if NR_CPUS == 1
+/* Uniprocessor: there is only one valid CPU */
+static inline unsigned int cpumask_local_spread(unsigned int i, int node)
+{
+	return 0;
+}
+
+static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
+					     const struct cpumask *src2p) {
+	return cpumask_first_and(src1p, src2p);
+}
+
+static inline int cpumask_any_distribute(const struct cpumask *srcp)
+{
+	return cpumask_first(srcp);
+}
+#else
 unsigned int cpumask_local_spread(unsigned int i, int node);
 int cpumask_any_and_distribute(const struct cpumask *src1p,
 			       const struct cpumask *src2p);
 int cpumask_any_distribute(const struct cpumask *srcp);
+#endif /* NR_CPUS */
 
 /**
  * for_each_cpu - iterate over every cpu in a mask
@@ -324,7 +264,6 @@ extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool
 	for ((cpu) = -1;						\
 		(cpu) = cpumask_next_and((cpu), (mask1), (mask2)),	\
 		(cpu) < nr_cpu_ids;)
-#endif /* SMP */
 
 #define CPU_BITS_NONE						\
 {								\
diff --git a/lib/Makefile b/lib/Makefile
index f99bf61f8bbc..bcc7e8ea0cde 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -34,10 +34,9 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
 	 earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
 	 nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o \
-	 buildid.o
+	 buildid.o cpumask.o
 
 lib-$(CONFIG_PRINTK) += dump_stack.o
-lib-$(CONFIG_SMP) += cpumask.o
 
 lib-y	+= kobject.o klist.o
 obj-y	+= lockref.o
diff --git a/lib/cpumask.c b/lib/cpumask.c
index a971a82d2f43..b9728513a4d4 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -192,6 +192,7 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
 }
 #endif
 
+#if NR_CPUS > 1
 /**
  * cpumask_local_spread - select the i'th cpu with local numa cpu's first
  * @i: index number
@@ -279,3 +280,4 @@ int cpumask_any_distribute(const struct cpumask *srcp)
 	return next;
 }
 EXPORT_SYMBOL(cpumask_any_distribute);
+#endif /* NR_CPUS */
-- 
cgit 


From 953257a9252a9b3c58ca68fc5bf26fc65e5b1cb8 Mon Sep 17 00:00:00 2001
From: Sander Vanheule <sander@svanheule.net>
Date: Sat, 2 Jul 2022 18:08:28 +0200
Subject: cpumask: update cpumask_next_wrap() signature

The extern specifier is not needed for this declaration, so drop it.  The
function also depends only on the input parameters, and has no side
effects, so it can be marked __pure like other functions in cpumask.h.

Link: https://lkml.kernel.org/r/72ab755695b74bb5fbaa756ae4c0edd708d172f1.1656777646.git.sander@svanheule.net
Signed-off-by: Sander Vanheule <sander@svanheule.net>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Marco Elver <elver@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/cpumask.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 6c5b4ee000f2..523857884ae4 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -229,7 +229,7 @@ int cpumask_any_distribute(const struct cpumask *srcp);
 		(cpu) = cpumask_next_zero((cpu), (mask)),	\
 		(cpu) < nr_cpu_ids;)
 
-extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
+int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
 
 /**
  * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
-- 
cgit 


From 91561d4ecb755f056f8ff04f9dcaec210140e55c Mon Sep 17 00:00:00 2001
From: Chao Gao <chao.gao@intel.com>
Date: Fri, 15 Jul 2022 18:45:33 +0800
Subject: swiotlb: remove unused fields in io_tlb_mem

Commit 20347fca71a3 ("swiotlb: split up the global swiotlb lock") splits
io_tlb_mem into multiple areas. Each area has its own lock and index. The
global ones are not used so remove them.

Signed-off-by: Chao Gao <chao.gao@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/swiotlb.h | 5 -----
 kernel/dma/swiotlb.c    | 2 --
 2 files changed, 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index f65ff1930120..d3ae03edbbd2 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -79,11 +79,8 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys,
  * @used:	The number of used IO TLB block.
  * @list:	The free list describing the number of free entries available
  *		from each index.
- * @index:	The index to start searching in the next round.
  * @orig_addr:	The original address corresponding to a mapped entry.
  * @alloc_size:	Size of the allocated buffer.
- * @lock:	The lock to protect the above data structures in the map and
- *		unmap calls.
  * @debugfs:	The dentry to debugfs.
  * @late_alloc:	%true if allocated using the page allocator
  * @force_bounce: %true if swiotlb bouncing is forced
@@ -97,8 +94,6 @@ struct io_tlb_mem {
 	void *vaddr;
 	unsigned long nslabs;
 	unsigned long used;
-	unsigned int index;
-	spinlock_t lock;
 	struct dentry *debugfs;
 	bool late_alloc;
 	bool force_bounce;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index c50e6fe20f37..cbffa0b1ace5 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -253,14 +253,12 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
 	mem->nslabs = nslabs;
 	mem->start = start;
 	mem->end = mem->start + bytes;
-	mem->index = 0;
 	mem->late_alloc = late_alloc;
 	mem->nareas = nareas;
 	mem->area_nslabs = nslabs / mem->nareas;
 
 	mem->force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE);
 
-	spin_lock_init(&mem->lock);
 	for (i = 0; i < mem->nareas; i++) {
 		spin_lock_init(&mem->areas[i].lock);
 		mem->areas[i].index = 0;
-- 
cgit 


From 942a8186eb4451700dadd1d60a2e43ce67605991 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 12 Jul 2022 08:43:07 +0200
Subject: swiotlb: move struct io_tlb_slot to swiotlb.c

No need to expose this structure definition in the header.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/swiotlb.h | 6 +-----
 kernel/dma/swiotlb.c    | 6 ++++++
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index d3ae03edbbd2..35bc4e281c21 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -101,11 +101,7 @@ struct io_tlb_mem {
 	unsigned int nareas;
 	unsigned int area_nslabs;
 	struct io_tlb_area *areas;
-	struct io_tlb_slot {
-		phys_addr_t orig_addr;
-		size_t alloc_size;
-		unsigned int list;
-	} *slots;
+	struct io_tlb_slot *slots;
 };
 extern struct io_tlb_mem io_tlb_default_mem;
 
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 608923e8dab1..39dee4004439 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -62,6 +62,12 @@
 
 #define INVALID_PHYS_ADDR (~(phys_addr_t)0)
 
+struct io_tlb_slot {
+	phys_addr_t orig_addr;
+	size_t alloc_size;
+	unsigned int list;
+};
+
 static bool swiotlb_force_bounce;
 static bool swiotlb_force_disable;
 
-- 
cgit 


From 76c16d3e19446deea98b7883f261758b96b8781a Mon Sep 17 00:00:00 2001
From: Wong Vee Khee <vee.khee.wong@linux.intel.com>
Date: Thu, 14 Jul 2022 15:54:27 +0800
Subject: net: stmmac: switch to use interrupt for hw crosstimestamping

Using current implementation of polling mode, there is high chances we
will hit into timeout error when running phc2sys. Hence, update the
implementation of hardware crosstimestamping to use the MAC interrupt
service routine instead of polling for TSIS bit in the MAC Timestamp
Interrupt Status register to be set.

Cc: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: Wong Vee Khee <vee.khee.wong@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c  | 25 ++++++++++++++--------
 drivers/net/ethernet/stmicro/stmmac/dwmac4.h       |  3 ++-
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c  |  4 ++++
 drivers/net/ethernet/stmicro/stmmac/stmmac.h       |  1 +
 .../net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c  |  5 +++++
 drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c   | 12 +----------
 include/linux/stmmac.h                             |  1 +
 7 files changed, 30 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
index 38fe77d1035e..3fe720c5dc9f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
@@ -298,6 +298,11 @@ static void get_arttime(struct mii_bus *mii, int intel_adhoc_addr,
 	*art_time = ns;
 }
 
+static int stmmac_cross_ts_isr(struct stmmac_priv *priv)
+{
+	return (readl(priv->ioaddr + GMAC_INT_STATUS) & GMAC_INT_TSIE);
+}
+
 static int intel_crosststamp(ktime_t *device,
 			     struct system_counterval_t *system,
 			     void *ctx)
@@ -313,8 +318,6 @@ static int intel_crosststamp(ktime_t *device,
 	u32 num_snapshot;
 	u32 gpio_value;
 	u32 acr_value;
-	int ret;
-	u32 v;
 	int i;
 
 	if (!boot_cpu_has(X86_FEATURE_ART))
@@ -328,6 +331,8 @@ static int intel_crosststamp(ktime_t *device,
 	if (priv->plat->ext_snapshot_en)
 		return -EBUSY;
 
+	priv->plat->int_snapshot_en = 1;
+
 	mutex_lock(&priv->aux_ts_lock);
 	/* Enable Internal snapshot trigger */
 	acr_value = readl(ptpaddr + PTP_ACR);
@@ -347,6 +352,7 @@ static int intel_crosststamp(ktime_t *device,
 		break;
 	default:
 		mutex_unlock(&priv->aux_ts_lock);
+		priv->plat->int_snapshot_en = 0;
 		return -EINVAL;
 	}
 	writel(acr_value, ptpaddr + PTP_ACR);
@@ -368,13 +374,12 @@ static int intel_crosststamp(ktime_t *device,
 	gpio_value |= GMAC_GPO1;
 	writel(gpio_value, ioaddr + GMAC_GPIO_STATUS);
 
-	/* Poll for time sync operation done */
-	ret = readl_poll_timeout(priv->ioaddr + GMAC_INT_STATUS, v,
-				 (v & GMAC_INT_TSIE), 100, 10000);
-
-	if (ret == -ETIMEDOUT) {
-		pr_err("%s: Wait for time sync operation timeout\n", __func__);
-		return ret;
+	/* Time sync done Indication - Interrupt method */
+	if (!wait_event_interruptible_timeout(priv->tstamp_busy_wait,
+					      stmmac_cross_ts_isr(priv),
+					      HZ / 100)) {
+		priv->plat->int_snapshot_en = 0;
+		return -ETIMEDOUT;
 	}
 
 	num_snapshot = (readl(ioaddr + GMAC_TIMESTAMP_STATUS) &
@@ -392,6 +397,7 @@ static int intel_crosststamp(ktime_t *device,
 	}
 
 	system->cycles *= intel_priv->crossts_adj;
+	priv->plat->int_snapshot_en = 0;
 
 	return 0;
 }
@@ -576,6 +582,7 @@ static int intel_mgbe_common_data(struct pci_dev *pdev,
 
 	plat->has_crossts = true;
 	plat->crosststamp = intel_crosststamp;
+	plat->int_snapshot_en = 0;
 
 	/* Setup MSI vector offset specific to Intel mGbE controller */
 	plat->msi_mac_vec = 29;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
index 462ca7ed095a..71dad409f78b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
@@ -150,7 +150,8 @@
 #define	GMAC_PCS_IRQ_DEFAULT	(GMAC_INT_RGSMIIS | GMAC_INT_PCS_LINK |	\
 				 GMAC_INT_PCS_ANE)
 
-#define	GMAC_INT_DEFAULT_ENABLE	(GMAC_INT_PMT_EN | GMAC_INT_LPI_EN)
+#define	GMAC_INT_DEFAULT_ENABLE	(GMAC_INT_PMT_EN | GMAC_INT_LPI_EN | \
+				 GMAC_INT_TSIE)
 
 enum dwmac4_irq_status {
 	time_stamp_irq = 0x00001000,
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index fd41db65fe1d..d5299dd13e85 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -23,6 +23,7 @@
 static void dwmac4_core_init(struct mac_device_info *hw,
 			     struct net_device *dev)
 {
+	struct stmmac_priv *priv = netdev_priv(dev);
 	void __iomem *ioaddr = hw->pcsr;
 	u32 value = readl(ioaddr + GMAC_CONFIG);
 
@@ -58,6 +59,9 @@ static void dwmac4_core_init(struct mac_device_info *hw,
 		value |= GMAC_INT_FPE_EN;
 
 	writel(value, ioaddr + GMAC_INT_EN);
+
+	if (GMAC_INT_DEFAULT_ENABLE & GMAC_INT_TSIE)
+		init_waitqueue_head(&priv->tstamp_busy_wait);
 }
 
 static void dwmac4_rx_queue_enable(struct mac_device_info *hw,
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index 57970ae2178d..f9e83964aa7e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -266,6 +266,7 @@ struct stmmac_priv {
 	rwlock_t ptp_lock;
 	/* Protects auxiliary snapshot registers from concurrent access. */
 	struct mutex aux_ts_lock;
+	wait_queue_head_t tstamp_busy_wait;
 
 	void __iomem *mmcaddr;
 	void __iomem *ptpaddr;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c
index 92d32940aff0..764832f4dae1 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c
@@ -179,6 +179,11 @@ static void timestamp_interrupt(struct stmmac_priv *priv)
 	u64 ptp_time;
 	int i;
 
+	if (priv->plat->int_snapshot_en) {
+		wake_up(&priv->tstamp_busy_wait);
+		return;
+	}
+
 	tsync_int = readl(priv->ioaddr + GMAC_INT_STATUS) & GMAC_INT_TSIE;
 
 	if (!tsync_int)
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
index e45fb191d8e6..4d11980dcd64 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
@@ -175,11 +175,10 @@ static int stmmac_enable(struct ptp_clock_info *ptp,
 	struct stmmac_priv *priv =
 	    container_of(ptp, struct stmmac_priv, ptp_clock_ops);
 	void __iomem *ptpaddr = priv->ptpaddr;
-	void __iomem *ioaddr = priv->hw->pcsr;
 	struct stmmac_pps_cfg *cfg;
-	u32 intr_value, acr_value;
 	int ret = -EOPNOTSUPP;
 	unsigned long flags;
+	u32 acr_value;
 
 	switch (rq->type) {
 	case PTP_CLK_REQ_PEROUT:
@@ -213,19 +212,10 @@ static int stmmac_enable(struct ptp_clock_info *ptp,
 			netdev_dbg(priv->dev, "Auxiliary Snapshot %d enabled.\n",
 				   priv->plat->ext_snapshot_num >>
 				   PTP_ACR_ATSEN_SHIFT);
-			/* Enable Timestamp Interrupt */
-			intr_value = readl(ioaddr + GMAC_INT_EN);
-			intr_value |= GMAC_INT_TSIE;
-			writel(intr_value, ioaddr + GMAC_INT_EN);
-
 		} else {
 			netdev_dbg(priv->dev, "Auxiliary Snapshot %d disabled.\n",
 				   priv->plat->ext_snapshot_num >>
 				   PTP_ACR_ATSEN_SHIFT);
-			/* Disable Timestamp Interrupt */
-			intr_value = readl(ioaddr + GMAC_INT_EN);
-			intr_value &= ~GMAC_INT_TSIE;
-			writel(intr_value, ioaddr + GMAC_INT_EN);
 		}
 		writel(acr_value, ptpaddr + PTP_ACR);
 		mutex_unlock(&priv->aux_ts_lock);
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 29917850f079..8df475db88c0 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -260,6 +260,7 @@ struct plat_stmmacenet_data {
 	bool has_crossts;
 	int int_snapshot_num;
 	int ext_snapshot_num;
+	bool int_snapshot_en;
 	bool ext_snapshot_en;
 	bool multi_msi_en;
 	int msi_mac_vec;
-- 
cgit 


From 4bc5008e4387106215b50ae1a4ac2467455725ca Mon Sep 17 00:00:00 2001
From: Wen Gu <guwen@linux.alibaba.com>
Date: Thu, 14 Jul 2022 17:44:02 +0800
Subject: net/smc: Introduce a sysctl for setting SMC-R buffer type

This patch introduces the sysctl smcr_buf_type for setting
the type of SMC-R sndbufs and RMBs.

Valid values includes:

- SMCR_PHYS_CONT_BUFS, which means use physically contiguous
  buffers for better performance and is the default value.

- SMCR_VIRT_CONT_BUFS, which means use virtually contiguous
  buffers in case of physically contiguous memory is scarce.

- SMCR_MIXED_BUFS, which means first try to use physically
  contiguous buffers. If not available, then use virtually
  contiguous buffers.

Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/smc-sysctl.rst | 13 +++++++++++++
 include/net/netns/smc.h                 |  1 +
 net/smc/smc_core.h                      |  6 ++++++
 net/smc/smc_sysctl.c                    | 11 +++++++++++
 4 files changed, 31 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst
index 0987fd1bc220..742e90e6d822 100644
--- a/Documentation/networking/smc-sysctl.rst
+++ b/Documentation/networking/smc-sysctl.rst
@@ -21,3 +21,16 @@ autocorking_size - INTEGER
 	know how/when to uncork their sockets.
 
 	Default: 64K
+
+smcr_buf_type - INTEGER
+        Controls which type of sndbufs and RMBs to use in later newly created
+        SMC-R link group. Only for SMC-R.
+
+        Default: 0 (physically contiguous sndbufs and RMBs)
+
+        Possible values:
+
+        - 0 - Use physically contiguous buffers
+        - 1 - Use virtually contiguous buffers
+        - 2 - Mixed use of the two types. Try physically contiguous buffers first.
+          If not available, use virtually contiguous buffers then.
diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h
index e5389eeaf8bd..2adbe2b245df 100644
--- a/include/net/netns/smc.h
+++ b/include/net/netns/smc.h
@@ -18,5 +18,6 @@ struct netns_smc {
 	struct ctl_table_header		*smc_hdr;
 #endif
 	unsigned int			sysctl_autocorking_size;
+	unsigned int			sysctl_smcr_buf_type;
 };
 #endif
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 46ddec5f1edc..7652dfa783ff 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -217,6 +217,12 @@ enum smc_lgr_type {				/* redundancy state of lgr */
 	SMC_LGR_ASYMMETRIC_LOCAL,	/* local has 1, peer 2 active RNICs */
 };
 
+enum smcr_buf_type {		/* types of SMC-R sndbufs and RMBs */
+	SMCR_PHYS_CONT_BUFS	= 0,
+	SMCR_VIRT_CONT_BUFS	= 1,
+	SMCR_MIXED_BUFS		= 2,
+};
+
 enum smc_llc_flowtype {
 	SMC_LLC_FLOW_NONE	= 0,
 	SMC_LLC_FLOW_ADD_LINK	= 2,
diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c
index cf3ab1334c00..0613868fdb97 100644
--- a/net/smc/smc_sysctl.c
+++ b/net/smc/smc_sysctl.c
@@ -15,6 +15,7 @@
 #include <net/net_namespace.h>
 
 #include "smc.h"
+#include "smc_core.h"
 #include "smc_sysctl.h"
 
 static struct ctl_table smc_table[] = {
@@ -25,6 +26,15 @@ static struct ctl_table smc_table[] = {
 		.mode           = 0644,
 		.proc_handler	= proc_douintvec,
 	},
+	{
+		.procname	= "smcr_buf_type",
+		.data		= &init_net.smc.sysctl_smcr_buf_type,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_TWO,
+	},
 	{  }
 };
 
@@ -49,6 +59,7 @@ int __net_init smc_sysctl_net_init(struct net *net)
 		goto err_reg;
 
 	net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE;
+	net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS;
 
 	return 0;
 
-- 
cgit 


From ddefb2d205539418f3c3851a3e06fac9624f257d Mon Sep 17 00:00:00 2001
From: Wen Gu <guwen@linux.alibaba.com>
Date: Thu, 14 Jul 2022 17:44:05 +0800
Subject: net/smc: Extend SMC-R link group netlink attribute

Extend SMC-R link group netlink attribute SMC_GEN_LGR_SMCR.
Introduce SMC_NLA_LGR_R_BUF_TYPE to show the buffer type of
SMC-R link group.

Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/smc.h | 1 +
 net/smc/smc_core.c       | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h
index 693f549f6966..bb4dacca31e7 100644
--- a/include/uapi/linux/smc.h
+++ b/include/uapi/linux/smc.h
@@ -124,6 +124,7 @@ enum {
 	SMC_NLA_LGR_R_V2,		/* nest */
 	SMC_NLA_LGR_R_NET_COOKIE,	/* u64 */
 	SMC_NLA_LGR_R_PAD,		/* flag */
+	SMC_NLA_LGR_R_BUF_TYPE,		/* u8 */
 	__SMC_NLA_LGR_R_MAX,
 	SMC_NLA_LGR_R_MAX = __SMC_NLA_LGR_R_MAX - 1
 };
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index f26770c29d78..ff49a11f57b8 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -347,6 +347,8 @@ static int smc_nl_fill_lgr(struct smc_link_group *lgr,
 		goto errattr;
 	if (nla_put_u8(skb, SMC_NLA_LGR_R_TYPE, lgr->type))
 		goto errattr;
+	if (nla_put_u8(skb, SMC_NLA_LGR_R_BUF_TYPE, lgr->buf_type))
+		goto errattr;
 	if (nla_put_u8(skb, SMC_NLA_LGR_R_VLAN_ID, lgr->vlan_id))
 		goto errattr;
 	if (nla_put_u64_64bit(skb, SMC_NLA_LGR_R_NET_COOKIE,
-- 
cgit 


From 4cbc325ed6b4dce4910be06d9d6940a8b919c59b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 14 Jul 2022 22:22:25 -0700
Subject: tls: rx: allow only one reader at a time

recvmsg() in TLS gets data from the skb list (rx_list) or fresh
skbs we read from TCP via strparser. The former holds skbs which were
already decrypted for peek or decrypted and partially consumed.

tls_wait_data() only notices appearance of fresh skbs coming out
of TCP (or psock). It is possible, if there is a concurrent call
to peek() and recv() that the peek() will move the data from input
to rx_list without recv() noticing. recv() will then read data out
of order or never wake up.

This is not a practical use case/concern, but it makes the self
tests less reliable. This patch solves the problem by allowing
only one reader in.

Because having multiple processes calling read()/peek() is not
normal avoid adding a lock and try to fast-path the single reader
case.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h |  3 +++
 net/tls/tls_sw.c  | 61 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 57 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 8742e13bc362..e8935cfe0cd6 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -116,11 +116,14 @@ struct tls_sw_context_rx {
 	void (*saved_data_ready)(struct sock *sk);
 
 	struct sk_buff *recv_pkt;
+	u8 reader_present;
 	u8 async_capable:1;
 	u8 zc_capable:1;
+	u8 reader_contended:1;
 	atomic_t decrypt_pending;
 	/* protect crypto_wait with decrypt_pending*/
 	spinlock_t decrypt_compl_lock;
+	struct wait_queue_head wq;
 };
 
 struct tls_record_info {
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 68d79ee48a56..761a63751616 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1753,6 +1753,51 @@ tls_read_flush_backlog(struct sock *sk, struct tls_prot_info *prot,
 	sk_flush_backlog(sk);
 }
 
+static long tls_rx_reader_lock(struct sock *sk, struct tls_sw_context_rx *ctx,
+			       bool nonblock)
+{
+	long timeo;
+
+	lock_sock(sk);
+
+	timeo = sock_rcvtimeo(sk, nonblock);
+
+	while (unlikely(ctx->reader_present)) {
+		DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+		ctx->reader_contended = 1;
+
+		add_wait_queue(&ctx->wq, &wait);
+		sk_wait_event(sk, &timeo,
+			      !READ_ONCE(ctx->reader_present), &wait);
+		remove_wait_queue(&ctx->wq, &wait);
+
+		if (!timeo)
+			return -EAGAIN;
+		if (signal_pending(current))
+			return sock_intr_errno(timeo);
+	}
+
+	WRITE_ONCE(ctx->reader_present, 1);
+
+	return timeo;
+}
+
+static void tls_rx_reader_unlock(struct sock *sk, struct tls_sw_context_rx *ctx)
+{
+	if (unlikely(ctx->reader_contended)) {
+		if (wq_has_sleeper(&ctx->wq))
+			wake_up(&ctx->wq);
+		else
+			ctx->reader_contended = 0;
+
+		WARN_ON_ONCE(!ctx->reader_present);
+	}
+
+	WRITE_ONCE(ctx->reader_present, 0);
+	release_sock(sk);
+}
+
 int tls_sw_recvmsg(struct sock *sk,
 		   struct msghdr *msg,
 		   size_t len,
@@ -1782,7 +1827,9 @@ int tls_sw_recvmsg(struct sock *sk,
 		return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR);
 
 	psock = sk_psock_get(sk);
-	lock_sock(sk);
+	timeo = tls_rx_reader_lock(sk, ctx, flags & MSG_DONTWAIT);
+	if (timeo < 0)
+		return timeo;
 	bpf_strp_enabled = sk_psock_strp_enabled(psock);
 
 	/* If crypto failed the connection is broken */
@@ -1801,7 +1848,6 @@ int tls_sw_recvmsg(struct sock *sk,
 
 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
 	len = len - copied;
-	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 
 	zc_capable = !bpf_strp_enabled && !is_kvec && !is_peek &&
 		ctx->zc_capable;
@@ -1956,7 +2002,7 @@ recv_end:
 	copied += decrypted;
 
 end:
-	release_sock(sk);
+	tls_rx_reader_unlock(sk, ctx);
 	if (psock)
 		sk_psock_put(sk, psock);
 	return copied ? : err;
@@ -1978,9 +2024,9 @@ ssize_t tls_sw_splice_read(struct socket *sock,  loff_t *ppos,
 	long timeo;
 	int chunk;
 
-	lock_sock(sk);
-
-	timeo = sock_rcvtimeo(sk, flags & SPLICE_F_NONBLOCK);
+	timeo = tls_rx_reader_lock(sk, ctx, flags & SPLICE_F_NONBLOCK);
+	if (timeo < 0)
+		return timeo;
 
 	from_queue = !skb_queue_empty(&ctx->rx_list);
 	if (from_queue) {
@@ -2029,7 +2075,7 @@ ssize_t tls_sw_splice_read(struct socket *sock,  loff_t *ppos,
 	}
 
 splice_read_end:
-	release_sock(sk);
+	tls_rx_reader_unlock(sk, ctx);
 	return copied ? : err;
 }
 
@@ -2371,6 +2417,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 	} else {
 		crypto_init_wait(&sw_ctx_rx->async_wait);
 		spin_lock_init(&sw_ctx_rx->decrypt_compl_lock);
+		init_waitqueue_head(&sw_ctx_rx->wq);
 		crypto_info = &ctx->crypto_recv.info;
 		cctx = &ctx->rx;
 		skb_queue_head_init(&sw_ctx_rx->rx_list);
-- 
cgit 


From 53d57999fe02785040bc53e2f12efc881f13ae17 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 14 Jul 2022 22:22:28 -0700
Subject: tls: rx: remove the message decrypted tracking

We no longer allow a decrypted skb to remain linked to ctx->recv_pkt.
Anything on the list is decrypted, anything on ctx->recv_pkt needs
to be decrypted.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/strparser.h |  1 -
 net/tls/tls_sw.c        | 10 ----------
 2 files changed, 11 deletions(-)

(limited to 'include')

diff --git a/include/net/strparser.h b/include/net/strparser.h
index 88900b05443e..41e2ce9e9e10 100644
--- a/include/net/strparser.h
+++ b/include/net/strparser.h
@@ -72,7 +72,6 @@ struct sk_skb_cb {
 	/* strp users' data follows */
 	struct tls_msg {
 		u8 control;
-		u8 decrypted;
 	} tls;
 	/* temp_reg is a temporary register used for bpf_convert_data_end_access
 	 * when dst_reg == src_reg.
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index f5f06d1ba024..49cfaa8119c6 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1563,21 +1563,13 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_prot_info *prot = &tls_ctx->prot_info;
 	struct strp_msg *rxm = strp_msg(skb);
-	struct tls_msg *tlm = tls_msg(skb);
 	int pad, err;
 
-	if (tlm->decrypted) {
-		darg->zc = false;
-		darg->async = false;
-		return 0;
-	}
-
 	if (tls_ctx->rx_conf == TLS_HW) {
 		err = tls_device_decrypted(sk, tls_ctx, skb, rxm);
 		if (err < 0)
 			return err;
 		if (err > 0) {
-			tlm->decrypted = 1;
 			darg->zc = false;
 			darg->async = false;
 			goto decrypt_done;
@@ -1610,7 +1602,6 @@ decrypt_done:
 	rxm->full_len -= pad;
 	rxm->offset += prot->prepend_size;
 	rxm->full_len -= prot->overhead_size;
-	tlm->decrypted = 1;
 decrypt_next:
 	tls_advance_record_sn(sk, prot, &tls_ctx->rx);
 
@@ -2130,7 +2121,6 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
 	if (ret < 0)
 		goto read_failure;
 
-	tlm->decrypted = 0;
 	tlm->control = header[0];
 
 	data_len = ((header[4] & 0xFF) | (header[3] << 8));
-- 
cgit 


From c618db2afe7c31d13ca8cf05b60f17165fbdc282 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 14 Jul 2022 22:22:33 -0700
Subject: tls: rx: async: hold onto the input skb

Async crypto currently benefits from the fact that we decrypt
in place. When we allow input and output to be different skbs
we will have to hang onto the input while we move to the next
record. Clone the inputs and keep them on a list.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h  |  1 +
 net/tls/Makefile   |  2 +-
 net/tls/tls.h      |  3 +++
 net/tls/tls_strp.c | 17 +++++++++++++++++
 net/tls/tls_sw.c   | 26 +++++++++++++++++---------
 5 files changed, 39 insertions(+), 10 deletions(-)
 create mode 100644 net/tls/tls_strp.c

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index e8935cfe0cd6..181c496b01b8 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -123,6 +123,7 @@ struct tls_sw_context_rx {
 	atomic_t decrypt_pending;
 	/* protect crypto_wait with decrypt_pending*/
 	spinlock_t decrypt_compl_lock;
+	struct sk_buff_head async_hold;
 	struct wait_queue_head wq;
 };
 
diff --git a/net/tls/Makefile b/net/tls/Makefile
index f1ffbfe8968d..e41c800489ac 100644
--- a/net/tls/Makefile
+++ b/net/tls/Makefile
@@ -7,7 +7,7 @@ CFLAGS_trace.o := -I$(src)
 
 obj-$(CONFIG_TLS) += tls.o
 
-tls-y := tls_main.o tls_sw.o tls_proc.o trace.o
+tls-y := tls_main.o tls_sw.o tls_proc.o trace.o tls_strp.o
 
 tls-$(CONFIG_TLS_TOE) += tls_toe.o
 tls-$(CONFIG_TLS_DEVICE) += tls_device.o tls_device_fallback.o
diff --git a/net/tls/tls.h b/net/tls/tls.h
index 44522b221717..c818dc68955d 100644
--- a/net/tls/tls.h
+++ b/net/tls/tls.h
@@ -124,6 +124,9 @@ int tls_sw_fallback_init(struct sock *sk,
 			 struct tls_offload_context_tx *offload_ctx,
 			 struct tls_crypto_info *crypto_info);
 
+int tls_strp_msg_hold(struct sock *sk, struct sk_buff *skb,
+		      struct sk_buff_head *dst);
+
 static inline struct tls_msg *tls_msg(struct sk_buff *skb)
 {
 	struct sk_skb_cb *scb = (struct sk_skb_cb *)skb->cb;
diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c
new file mode 100644
index 000000000000..9ccab79a6e1e
--- /dev/null
+++ b/net/tls/tls_strp.c
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/skbuff.h>
+
+#include "tls.h"
+
+int tls_strp_msg_hold(struct sock *sk, struct sk_buff *skb,
+		      struct sk_buff_head *dst)
+{
+	struct sk_buff *clone;
+
+	clone = skb_clone(skb, sk->sk_allocation);
+	if (!clone)
+		return -ENOMEM;
+	__skb_queue_tail(dst, clone);
+	return 0;
+}
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 09fe2cfff51a..f767501e178d 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1535,8 +1535,13 @@ fallback_to_reg_recv:
 		goto exit_free_pages;
 
 	darg->skb = tls_strp_msg(ctx);
-	if (darg->async)
-		return 0;
+
+	if (unlikely(darg->async)) {
+		err = tls_strp_msg_hold(sk, skb, &ctx->async_hold);
+		if (err)
+			__skb_queue_tail(&ctx->async_hold, darg->skb);
+		return err;
+	}
 
 	if (prot->tail_size)
 		darg->tail = dctx->tail;
@@ -1998,14 +2003,16 @@ recv_end:
 		reinit_completion(&ctx->async_wait.completion);
 		pending = atomic_read(&ctx->decrypt_pending);
 		spin_unlock_bh(&ctx->decrypt_compl_lock);
-		if (pending) {
+		ret = 0;
+		if (pending)
 			ret = crypto_wait_req(-EINPROGRESS, &ctx->async_wait);
-			if (ret) {
-				if (err >= 0 || err == -EINPROGRESS)
-					err = ret;
-				decrypted = 0;
-				goto end;
-			}
+		__skb_queue_purge(&ctx->async_hold);
+
+		if (ret) {
+			if (err >= 0 || err == -EINPROGRESS)
+				err = ret;
+			decrypted = 0;
+			goto end;
 		}
 
 		/* Drain records from the rx_list & copy if required */
@@ -2440,6 +2447,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		crypto_info = &ctx->crypto_recv.info;
 		cctx = &ctx->rx;
 		skb_queue_head_init(&sw_ctx_rx->rx_list);
+		skb_queue_head_init(&sw_ctx_rx->async_hold);
 		aead = &sw_ctx_rx->aead_recv;
 	}
 
-- 
cgit 


From f2f316e287e6c2e3a1c5bab8d9b77ee03daa0463 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Fri, 15 Jul 2022 10:17:45 -0700
Subject: tcp: Fix data-races around keepalive sysctl knobs.

While reading sysctl_tcp_keepalive_(time|probes|intvl), they can be changed
concurrently.  Thus, we need to add READ_ONCE() to their readers.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 9 ++++++---
 net/smc/smc_llc.c | 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 1636c55e798b..204478d5d388 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1493,21 +1493,24 @@ static inline int keepalive_intvl_when(const struct tcp_sock *tp)
 {
 	struct net *net = sock_net((struct sock *)tp);
 
-	return tp->keepalive_intvl ? : net->ipv4.sysctl_tcp_keepalive_intvl;
+	return tp->keepalive_intvl ? :
+		READ_ONCE(net->ipv4.sysctl_tcp_keepalive_intvl);
 }
 
 static inline int keepalive_time_when(const struct tcp_sock *tp)
 {
 	struct net *net = sock_net((struct sock *)tp);
 
-	return tp->keepalive_time ? : net->ipv4.sysctl_tcp_keepalive_time;
+	return tp->keepalive_time ? :
+		READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time);
 }
 
 static inline int keepalive_probes(const struct tcp_sock *tp)
 {
 	struct net *net = sock_net((struct sock *)tp);
 
-	return tp->keepalive_probes ? : net->ipv4.sysctl_tcp_keepalive_probes;
+	return tp->keepalive_probes ? :
+		READ_ONCE(net->ipv4.sysctl_tcp_keepalive_probes);
 }
 
 static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp)
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index c4d057b2941d..0bde36b56472 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -2122,7 +2122,7 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc)
 	init_waitqueue_head(&lgr->llc_flow_waiter);
 	init_waitqueue_head(&lgr->llc_msg_waiter);
 	mutex_init(&lgr->llc_conf_mutex);
-	lgr->llc_testlink_time = net->ipv4.sysctl_tcp_keepalive_time;
+	lgr->llc_testlink_time = READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time);
 }
 
 /* called after lgr was removed from lgr_list */
-- 
cgit 


From 39e24435a776e9de5c6dd188836cf2523547804b Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Fri, 15 Jul 2022 10:17:50 -0700
Subject: tcp: Fix data-races around some timeout sysctl knobs.

While reading these sysctl knobs, they can be changed concurrently.
Thus, we need to add READ_ONCE() to their readers.

  - tcp_retries1
  - tcp_retries2
  - tcp_orphan_retries
  - tcp_fin_timeout

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     |  3 ++-
 net/ipv4/tcp.c        |  2 +-
 net/ipv4/tcp_output.c |  2 +-
 net/ipv4/tcp_timer.c  | 10 +++++-----
 4 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 204478d5d388..23ccaa386746 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1523,7 +1523,8 @@ static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp)
 
 static inline int tcp_fin_time(const struct sock *sk)
 {
-	int fin_timeout = tcp_sk(sk)->linger2 ? : sock_net(sk)->ipv4.sysctl_tcp_fin_timeout;
+	int fin_timeout = tcp_sk(sk)->linger2 ? :
+		READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fin_timeout);
 	const int rto = inet_csk(sk)->icsk_rto;
 
 	if (fin_timeout < (rto << 2) - (rto >> 1))
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b3632fa5df07..b1b1bcbc4f60 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3973,7 +3973,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 	case TCP_LINGER2:
 		val = tp->linger2;
 		if (val >= 0)
-			val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
+			val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ;
 		break;
 	case TCP_DEFER_ACCEPT:
 		val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 02ab3a9c6657..3b3552d292a5 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -4090,7 +4090,7 @@ void tcp_send_probe0(struct sock *sk)
 
 	icsk->icsk_probes_out++;
 	if (err <= 0) {
-		if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
+		if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2))
 			icsk->icsk_backoff++;
 		timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
 	} else {
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index a234704e8163..ec5277becc6a 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -143,7 +143,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
  */
 static int tcp_orphan_retries(struct sock *sk, bool alive)
 {
-	int retries = sock_net(sk)->ipv4.sysctl_tcp_orphan_retries; /* May be zero. */
+	int retries = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_orphan_retries); /* May be zero. */
 
 	/* We know from an ICMP that something is wrong. */
 	if (sk->sk_err_soft && !alive)
@@ -243,14 +243,14 @@ static int tcp_write_timeout(struct sock *sk)
 			READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);
 		expired = icsk->icsk_retransmits >= retry_until;
 	} else {
-		if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) {
+		if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1), 0)) {
 			/* Black hole detection */
 			tcp_mtu_probing(icsk, sk);
 
 			__dst_negative_advice(sk);
 		}
 
-		retry_until = net->ipv4.sysctl_tcp_retries2;
+		retry_until = READ_ONCE(net->ipv4.sysctl_tcp_retries2);
 		if (sock_flag(sk, SOCK_DEAD)) {
 			const bool alive = icsk->icsk_rto < TCP_RTO_MAX;
 
@@ -381,7 +381,7 @@ static void tcp_probe_timer(struct sock *sk)
 		 msecs_to_jiffies(icsk->icsk_user_timeout))
 		goto abort;
 
-	max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;
+	max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2);
 	if (sock_flag(sk, SOCK_DEAD)) {
 		const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
 
@@ -589,7 +589,7 @@ out_reset_timer:
 	}
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 				  tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX);
-	if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0))
+	if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1) + 1, 0))
 		__sk_dst_reset(sk);
 
 out:;
-- 
cgit 


From 55be873695ed8912eb77ff46d1d1cadf028bd0f3 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Fri, 15 Jul 2022 10:17:51 -0700
Subject: tcp: Fix a data-race around sysctl_tcp_notsent_lowat.

While reading sysctl_tcp_notsent_lowat, it can be changed concurrently.
Thus, we need to add READ_ONCE() to its reader.

Fixes: c9bee3b7fdec ("tcp: TCP_NOTSENT_LOWAT socket option")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 23ccaa386746..6ee1fb4fb292 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2027,7 +2027,7 @@ void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr);
 static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp)
 {
 	struct net *net = sock_net((struct sock *)tp);
-	return tp->notsent_lowat ?: net->ipv4.sysctl_tcp_notsent_lowat;
+	return tp->notsent_lowat ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat);
 }
 
 bool tcp_stream_memory_free(const struct sock *sk, int wake);
-- 
cgit 


From 9592eef7c16ec5fb9f36c4d9abe8eeffc2e1d2f3 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Tue, 5 Jul 2022 20:48:41 +0200
Subject: random: remove CONFIG_ARCH_RANDOM

When RDRAND was introduced, there was much discussion on whether it
should be trusted and how the kernel should handle that. Initially, two
mechanisms cropped up, CONFIG_ARCH_RANDOM, a compile time switch, and
"nordrand", a boot-time switch.

Later the thinking evolved. With a properly designed RNG, using RDRAND
values alone won't harm anything, even if the outputs are malicious.
Rather, the issue is whether those values are being *trusted* to be good
or not. And so a new set of options were introduced as the real
ones that people use -- CONFIG_RANDOM_TRUST_CPU and "random.trust_cpu".
With these options, RDRAND is used, but it's not always credited. So in
the worst case, it does nothing, and in the best case, maybe it helps.

Along the way, CONFIG_ARCH_RANDOM's meaning got sort of pulled into the
center and became something certain platforms force-select.

The old options don't really help with much, and it's a bit odd to have
special handling for these instructions when the kernel can deal fine
with the existence or untrusted existence or broken existence or
non-existence of that CPU capability.

Simplify the situation by removing CONFIG_ARCH_RANDOM and using the
ordinary asm-generic fallback pattern instead, keeping the two options
that are actually used. For now it leaves "nordrand" for now, as the
removal of that will take a different route.

Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Borislav Petkov <bp@suse.de>
Acked-by: Heiko Carstens <hca@linux.ibm.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 arch/arm/include/asm/archrandom.h                  |  2 ++
 arch/arm64/Kconfig                                 |  8 -------
 arch/arm64/include/asm/archrandom.h                | 10 ---------
 arch/arm64/kernel/cpufeature.c                     |  2 --
 arch/powerpc/Kconfig                               |  3 ---
 arch/powerpc/include/asm/archrandom.h              |  3 ---
 arch/powerpc/include/asm/machdep.h                 |  2 --
 arch/powerpc/platforms/microwatt/Kconfig           |  1 -
 arch/powerpc/platforms/powernv/Kconfig             |  1 -
 arch/powerpc/platforms/pseries/Kconfig             |  1 -
 arch/s390/Kconfig                                  | 15 -------------
 arch/s390/configs/zfcpdump_defconfig               |  1 -
 arch/s390/crypto/Makefile                          |  2 +-
 arch/s390/include/asm/archrandom.h                 |  3 ---
 arch/s390/kernel/setup.c                           |  2 --
 arch/x86/Kconfig                                   |  9 --------
 arch/x86/include/asm/archrandom.h                  | 14 ++++--------
 arch/x86/kernel/cpu/rdrand.c                       |  2 --
 drivers/char/Kconfig                               |  1 -
 drivers/char/hw_random/s390-trng.c                 |  9 --------
 include/asm-generic/Kbuild                         |  1 +
 include/asm-generic/archrandom.h                   | 25 ++++++++++++++++++++++
 include/linux/random.h                             |  9 +-------
 .../testing/selftests/wireguard/qemu/kernel.config |  1 -
 24 files changed, 34 insertions(+), 93 deletions(-)
 create mode 100644 include/asm-generic/archrandom.h

(limited to 'include')

diff --git a/arch/arm/include/asm/archrandom.h b/arch/arm/include/asm/archrandom.h
index a8e84ca5c2ee..cc4714eb1a75 100644
--- a/arch/arm/include/asm/archrandom.h
+++ b/arch/arm/include/asm/archrandom.h
@@ -7,4 +7,6 @@ static inline bool __init smccc_probe_trng(void)
 	return false;
 }
 
+#include <asm-generic/archrandom.h>
+
 #endif /* _ASM_ARCHRANDOM_H */
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1652a9800ebe..1880f71c2547 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1858,14 +1858,6 @@ config ARM64_E0PD
 
 	  This option enables E0PD for TTBR1 where available.
 
-config ARCH_RANDOM
-	bool "Enable support for random number generation"
-	default y
-	help
-	  Random number generation (part of the ARMv8.5 Extensions)
-	  provides a high bandwidth, cryptographically secure
-	  hardware random number generator.
-
 config ARM64_AS_HAS_MTE
 	# Initial support for MTE went in binutils 2.32.0, checked with
 	# ".arch armv8.5-a+memtag" below. However, this was incomplete
diff --git a/arch/arm64/include/asm/archrandom.h b/arch/arm64/include/asm/archrandom.h
index 3a6b6d38c5b8..c3b9fa56af67 100644
--- a/arch/arm64/include/asm/archrandom.h
+++ b/arch/arm64/include/asm/archrandom.h
@@ -2,8 +2,6 @@
 #ifndef _ASM_ARCHRANDOM_H
 #define _ASM_ARCHRANDOM_H
 
-#ifdef CONFIG_ARCH_RANDOM
-
 #include <linux/arm-smccc.h>
 #include <linux/bug.h>
 #include <linux/kernel.h>
@@ -167,12 +165,4 @@ arch_get_random_seed_long_early(unsigned long *v)
 }
 #define arch_get_random_seed_long_early arch_get_random_seed_long_early
 
-#else /* !CONFIG_ARCH_RANDOM */
-
-static inline bool __init smccc_probe_trng(void)
-{
-	return false;
-}
-
-#endif /* CONFIG_ARCH_RANDOM */
 #endif /* _ASM_ARCHRANDOM_H */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 8d88433de81d..0e9462abeb77 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2416,7 +2416,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.cpu_enable = cpu_enable_e0pd,
 	},
 #endif
-#ifdef CONFIG_ARCH_RANDOM
 	{
 		.desc = "Random Number Generator",
 		.capability = ARM64_HAS_RNG,
@@ -2428,7 +2427,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.sign = FTR_UNSIGNED,
 		.min_field_value = 1,
 	},
-#endif
 #ifdef CONFIG_ARM64_BTI
 	{
 		.desc = "Branch Target Identification",
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 7aa12e88c580..623deb5bedf6 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -1252,9 +1252,6 @@ config PHYSICAL_START
 	default "0x00000000"
 endif
 
-config	ARCH_RANDOM
-	def_bool n
-
 config PPC_LIB_RHEAP
 	bool
 
diff --git a/arch/powerpc/include/asm/archrandom.h b/arch/powerpc/include/asm/archrandom.h
index 9a53e29680f4..25ba65df6b1a 100644
--- a/arch/powerpc/include/asm/archrandom.h
+++ b/arch/powerpc/include/asm/archrandom.h
@@ -2,8 +2,6 @@
 #ifndef _ASM_POWERPC_ARCHRANDOM_H
 #define _ASM_POWERPC_ARCHRANDOM_H
 
-#ifdef CONFIG_ARCH_RANDOM
-
 #include <asm/machdep.h>
 
 static inline bool __must_check arch_get_random_long(unsigned long *v)
@@ -35,7 +33,6 @@ static inline bool __must_check arch_get_random_seed_int(unsigned int *v)
 
 	return rc;
 }
-#endif /* CONFIG_ARCH_RANDOM */
 
 #ifdef CONFIG_PPC_POWERNV
 int powernv_hwrng_present(void);
diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 358d171ae8e0..6c1002043367 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -200,9 +200,7 @@ struct machdep_calls {
 	ssize_t (*cpu_release)(const char *, size_t);
 #endif
 
-#ifdef CONFIG_ARCH_RANDOM
 	int (*get_random_seed)(unsigned long *v);
-#endif
 };
 
 extern void e500_idle(void);
diff --git a/arch/powerpc/platforms/microwatt/Kconfig b/arch/powerpc/platforms/microwatt/Kconfig
index 5e320f49583a..6af443a1db99 100644
--- a/arch/powerpc/platforms/microwatt/Kconfig
+++ b/arch/powerpc/platforms/microwatt/Kconfig
@@ -6,7 +6,6 @@ config PPC_MICROWATT
 	select PPC_ICS_NATIVE
 	select PPC_ICP_NATIVE
 	select PPC_UDBG_16550
-	select ARCH_RANDOM
 	help
           This option enables support for FPGA-based Microwatt implementations.
 
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 161dfe024085..e1a05c5a9004 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -12,7 +12,6 @@ config PPC_POWERNV
 	select EPAPR_BOOT
 	select PPC_INDIRECT_PIO
 	select PPC_UDBG_16550
-	select ARCH_RANDOM
 	select CPU_FREQ
 	select PPC_DOORBELL
 	select MMU_NOTIFIER
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index f7fd91d153a4..f4a647c1f0b2 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -19,7 +19,6 @@ config PPC_PSERIES
 	select PPC_UDBG_16550
 	select PPC_DOORBELL
 	select HOTPLUG_CPU
-	select ARCH_RANDOM
 	select FORCE_SMP
 	select SWIOTLB
 	default y
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 8cd9e56c629b..9b6e4e7cb17b 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -507,21 +507,6 @@ config KEXEC_SIG
 	  verification for the corresponding kernel image type being
 	  loaded in order for this to work.
 
-config ARCH_RANDOM
-	def_bool y
-	prompt "s390 architectural random number generation API"
-	help
-	  Enable the s390 architectural random number generation API
-	  to provide random data for all consumers within the Linux
-	  kernel.
-
-	  When enabled the arch_random_* functions declared in linux/random.h
-	  are implemented. The implementation is based on the s390 CPACF
-	  instruction subfunction TRNG which provides a real true random
-	  number generator.
-
-	  If unsure, say Y.
-
 config KERNEL_NOBP
 	def_bool n
 	prompt "Enable modified branch prediction for the kernel by default"
diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig
index a87fcc45e307..f4976f611b94 100644
--- a/arch/s390/configs/zfcpdump_defconfig
+++ b/arch/s390/configs/zfcpdump_defconfig
@@ -15,7 +15,6 @@ CONFIG_TUNE_ZEC12=y
 # CONFIG_COMPAT is not set
 CONFIG_NR_CPUS=2
 CONFIG_HZ_100=y
-# CONFIG_ARCH_RANDOM is not set
 # CONFIG_RELOCATABLE is not set
 # CONFIG_CHSC_SCH is not set
 # CONFIG_SCM_BUS is not set
diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile
index c63abfeb6d17..1b1cc478fa94 100644
--- a/arch/s390/crypto/Makefile
+++ b/arch/s390/crypto/Makefile
@@ -15,7 +15,7 @@ obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o
 obj-$(CONFIG_S390_PRNG) += prng.o
 obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o
 obj-$(CONFIG_CRYPTO_CRC32_S390) += crc32-vx_s390.o
-obj-$(CONFIG_ARCH_RANDOM) += arch_random.o
+obj-y += arch_random.o
 
 crc32-vx_s390-y := crc32-vx.o crc32le-vx.o crc32be-vx.o
 chacha_s390-y := chacha-glue.o chacha-s390.o
diff --git a/arch/s390/include/asm/archrandom.h b/arch/s390/include/asm/archrandom.h
index 2c6e1c6ecbe7..0a1c2e66c709 100644
--- a/arch/s390/include/asm/archrandom.h
+++ b/arch/s390/include/asm/archrandom.h
@@ -11,8 +11,6 @@
 #ifndef _ASM_S390_ARCHRANDOM_H
 #define _ASM_S390_ARCHRANDOM_H
 
-#ifdef CONFIG_ARCH_RANDOM
-
 #include <linux/static_key.h>
 #include <linux/atomic.h>
 #include <asm/cpacf.h>
@@ -50,5 +48,4 @@ static inline bool __must_check arch_get_random_seed_int(unsigned int *v)
 	return false;
 }
 
-#endif /* CONFIG_ARCH_RANDOM */
 #endif /* _ASM_S390_ARCHRANDOM_H */
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 0a37f5de2863..ebad41afe355 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -876,10 +876,8 @@ static void __init setup_randomness(void)
 		add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count);
 	memblock_free(vmms, PAGE_SIZE);
 
-#ifdef CONFIG_ARCH_RANDOM
 	if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG))
 		static_branch_enable(&s390_arch_random_available);
-#endif
 }
 
 /*
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e58798f636d4..ba13749c09c8 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1810,15 +1810,6 @@ config ARCH_USES_PG_UNCACHED
 	def_bool y
 	depends on X86_PAT
 
-config ARCH_RANDOM
-	def_bool y
-	prompt "x86 architectural random number generator" if EXPERT
-	help
-	  Enable the x86 architectural RDRAND instruction
-	  (Intel Bull Mountain technology) to generate random numbers.
-	  If supported, this is a high bandwidth, cryptographically
-	  secure hardware random number generator.
-
 config X86_UMIP
 	def_bool y
 	prompt "User Mode Instruction Prevention" if EXPERT
diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h
index ebc248e49549..fb235b696175 100644
--- a/arch/x86/include/asm/archrandom.h
+++ b/arch/x86/include/asm/archrandom.h
@@ -65,10 +65,8 @@ static inline bool __must_check rdseed_int(unsigned int *v)
 
 /*
  * These are the generic interfaces; they must not be declared if the
- * stubs in <linux/random.h> are to be invoked,
- * i.e. CONFIG_ARCH_RANDOM is not defined.
+ * stubs in <linux/random.h> are to be invoked.
  */
-#ifdef CONFIG_ARCH_RANDOM
 
 static inline bool __must_check arch_get_random_long(unsigned long *v)
 {
@@ -90,12 +88,8 @@ static inline bool __must_check arch_get_random_seed_int(unsigned int *v)
 	return static_cpu_has(X86_FEATURE_RDSEED) ? rdseed_int(v) : false;
 }
 
-extern void x86_init_rdrand(struct cpuinfo_x86 *c);
-
-#else  /* !CONFIG_ARCH_RANDOM */
-
-static inline void x86_init_rdrand(struct cpuinfo_x86 *c) { }
-
-#endif  /* !CONFIG_ARCH_RANDOM */
+#ifndef CONFIG_UML
+void x86_init_rdrand(struct cpuinfo_x86 *c);
+#endif
 
 #endif /* ASM_X86_ARCHRANDOM_H */
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index c4be62058dd9..8f216669ecb8 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -26,7 +26,6 @@ __setup("nordrand", x86_rdrand_setup);
  */
 #define SANITY_CHECK_LOOPS 8
 
-#ifdef CONFIG_ARCH_RANDOM
 void x86_init_rdrand(struct cpuinfo_x86 *c)
 {
 	unsigned int changed = 0;
@@ -63,4 +62,3 @@ void x86_init_rdrand(struct cpuinfo_x86 *c)
 "RDRAND gives funky smelling output, might consider not using it by booting with \"nordrand\"");
 
 }
-#endif
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 0b6c03643ddc..30192e123e5f 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -431,7 +431,6 @@ config ADI
 config RANDOM_TRUST_CPU
 	bool "Initialize RNG using CPU RNG instructions"
 	default y
-	depends on ARCH_RANDOM
 	help
 	  Initialize the RNG using random numbers supplied by the CPU's
 	  RNG instructions (e.g. RDRAND), if supported and available. These
diff --git a/drivers/char/hw_random/s390-trng.c b/drivers/char/hw_random/s390-trng.c
index 2beaa35c0d74..488808dc17a2 100644
--- a/drivers/char/hw_random/s390-trng.c
+++ b/drivers/char/hw_random/s390-trng.c
@@ -108,7 +108,6 @@ static ssize_t trng_counter_show(struct device *dev,
 {
 	u64 dev_counter = atomic64_read(&trng_dev_counter);
 	u64 hwrng_counter = atomic64_read(&trng_hwrng_counter);
-#if IS_ENABLED(CONFIG_ARCH_RANDOM)
 	u64 arch_counter = atomic64_read(&s390_arch_random_counter);
 
 	return sysfs_emit(buf,
@@ -118,14 +117,6 @@ static ssize_t trng_counter_show(struct device *dev,
 			"total: %llu\n",
 			dev_counter, hwrng_counter, arch_counter,
 			dev_counter + hwrng_counter + arch_counter);
-#else
-	return sysfs_emit(buf,
-			"trng:  %llu\n"
-			"hwrng: %llu\n"
-			"total: %llu\n",
-			dev_counter, hwrng_counter,
-			dev_counter + hwrng_counter);
-#endif
 }
 static DEVICE_ATTR(byte_counter, 0444, trng_counter_show, NULL);
 
diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild
index 8e47d483b524..36db8b9eb68a 100644
--- a/include/asm-generic/Kbuild
+++ b/include/asm-generic/Kbuild
@@ -5,6 +5,7 @@
 # asm headers from the host architecutre.)
 
 mandatory-y += atomic.h
+mandatory-y += archrandom.h
 mandatory-y += barrier.h
 mandatory-y += bitops.h
 mandatory-y += bug.h
diff --git a/include/asm-generic/archrandom.h b/include/asm-generic/archrandom.h
new file mode 100644
index 000000000000..3a5ee202dd86
--- /dev/null
+++ b/include/asm-generic/archrandom.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_GENERIC_ARCHRANDOM_H__
+#define __ASM_GENERIC_ARCHRANDOM_H__
+
+static inline bool __must_check arch_get_random_long(unsigned long *v)
+{
+	return false;
+}
+
+static inline bool __must_check arch_get_random_int(unsigned int *v)
+{
+	return false;
+}
+
+static inline bool __must_check arch_get_random_seed_long(unsigned long *v)
+{
+	return false;
+}
+
+static inline bool __must_check arch_get_random_seed_int(unsigned int *v)
+{
+	return false;
+}
+
+#endif
diff --git a/include/linux/random.h b/include/linux/random.h
index 20e389a14e5c..865770e29f3e 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -106,14 +106,7 @@ declare_get_random_var_wait(long, unsigned long)
  */
 #include <linux/prandom.h>
 
-#ifdef CONFIG_ARCH_RANDOM
-# include <asm/archrandom.h>
-#else
-static inline bool __must_check arch_get_random_long(unsigned long *v) { return false; }
-static inline bool __must_check arch_get_random_int(unsigned int *v) { return false; }
-static inline bool __must_check arch_get_random_seed_long(unsigned long *v) { return false; }
-static inline bool __must_check arch_get_random_seed_int(unsigned int *v) { return false; }
-#endif
+#include <asm/archrandom.h>
 
 /*
  * Called from the boot CPU during startup; not valid to call once
diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config
index bad88f4b0a03..e1858ce7003f 100644
--- a/tools/testing/selftests/wireguard/qemu/kernel.config
+++ b/tools/testing/selftests/wireguard/qemu/kernel.config
@@ -58,7 +58,6 @@ CONFIG_NO_HZ_IDLE=y
 CONFIG_NO_HZ_FULL=n
 CONFIG_HZ_PERIODIC=n
 CONFIG_HIGH_RES_TIMERS=y
-CONFIG_ARCH_RANDOM=y
 CONFIG_FILE_LOCKING=y
 CONFIG_POSIX_TIMERS=y
 CONFIG_DEVTMPFS=y
-- 
cgit 


From 0b4ae3f6d1210c11f9baf159009c7227eacf90f2 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Mon, 11 Jul 2022 07:47:16 -0700
Subject: iio: cros: Register FIFO callback after sensor is registered

Instead of registering callback to process sensor events right at
initialization time, wait for the sensor to be register in the iio
subsystem.

Events can come at probe time (in case the kernel rebooted abruptly
without switching the sensor off for  instance), and be sent to IIO core
before the sensor is fully registered.

Fixes: aa984f1ba4a4 ("iio: cros_ec: Register to cros_ec_sensorhub when EC supports FIFO")
Reported-by: Douglas Anderson <dianders@chromium.org>
Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Link: https://lore.kernel.org/r/20220711144716.642617-1-gwendal@chromium.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/accel/cros_ec_accel_legacy.c           |  4 +-
 .../iio/common/cros_ec_sensors/cros_ec_lid_angle.c |  4 +-
 .../iio/common/cros_ec_sensors/cros_ec_sensors.c   |  6 +--
 .../common/cros_ec_sensors/cros_ec_sensors_core.c  | 58 ++++++++++++++++------
 drivers/iio/light/cros_ec_light_prox.c             |  6 +--
 drivers/iio/pressure/cros_ec_baro.c                |  6 +--
 include/linux/iio/common/cros_ec_sensors_core.h    |  7 ++-
 7 files changed, 60 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/drivers/iio/accel/cros_ec_accel_legacy.c b/drivers/iio/accel/cros_ec_accel_legacy.c
index 1c0171f26e99..0f403342b1fc 100644
--- a/drivers/iio/accel/cros_ec_accel_legacy.c
+++ b/drivers/iio/accel/cros_ec_accel_legacy.c
@@ -215,7 +215,7 @@ static int cros_ec_accel_legacy_probe(struct platform_device *pdev)
 		return -ENOMEM;
 
 	ret = cros_ec_sensors_core_init(pdev, indio_dev, true,
-					cros_ec_sensors_capture, NULL);
+					cros_ec_sensors_capture);
 	if (ret)
 		return ret;
 
@@ -235,7 +235,7 @@ static int cros_ec_accel_legacy_probe(struct platform_device *pdev)
 		state->sign[CROS_EC_SENSOR_Z] = -1;
 	}
 
-	return devm_iio_device_register(dev, indio_dev);
+	return cros_ec_sensors_core_register(dev, indio_dev, NULL);
 }
 
 static struct platform_driver cros_ec_accel_platform_driver = {
diff --git a/drivers/iio/common/cros_ec_sensors/cros_ec_lid_angle.c b/drivers/iio/common/cros_ec_sensors/cros_ec_lid_angle.c
index 9f780fafaed9..119acb078af3 100644
--- a/drivers/iio/common/cros_ec_sensors/cros_ec_lid_angle.c
+++ b/drivers/iio/common/cros_ec_sensors/cros_ec_lid_angle.c
@@ -98,7 +98,7 @@ static int cros_ec_lid_angle_probe(struct platform_device *pdev)
 	if (!indio_dev)
 		return -ENOMEM;
 
-	ret = cros_ec_sensors_core_init(pdev, indio_dev, false, NULL, NULL);
+	ret = cros_ec_sensors_core_init(pdev, indio_dev, false, NULL);
 	if (ret)
 		return ret;
 
@@ -114,7 +114,7 @@ static int cros_ec_lid_angle_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	return devm_iio_device_register(dev, indio_dev);
+	return cros_ec_sensors_core_register(dev, indio_dev, NULL);
 }
 
 static const struct platform_device_id cros_ec_lid_angle_ids[] = {
diff --git a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c
index 61e07a7bb199..66153b1850f1 100644
--- a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c
+++ b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c
@@ -236,8 +236,7 @@ static int cros_ec_sensors_probe(struct platform_device *pdev)
 		return -ENOMEM;
 
 	ret = cros_ec_sensors_core_init(pdev, indio_dev, true,
-					cros_ec_sensors_capture,
-					cros_ec_sensors_push_data);
+					cros_ec_sensors_capture);
 	if (ret)
 		return ret;
 
@@ -298,7 +297,8 @@ static int cros_ec_sensors_probe(struct platform_device *pdev)
 	else
 		state->core.read_ec_sensors_data = cros_ec_sensors_read_cmd;
 
-	return devm_iio_device_register(dev, indio_dev);
+	return cros_ec_sensors_core_register(dev, indio_dev,
+			cros_ec_sensors_push_data);
 }
 
 static const struct platform_device_id cros_ec_sensors_ids[] = {
diff --git a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c
index e5ccedef13a8..05a28d353e34 100644
--- a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c
+++ b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c
@@ -228,21 +228,18 @@ static void cros_ec_sensors_core_clean(void *arg)
 
 /**
  * cros_ec_sensors_core_init() - basic initialization of the core structure
- * @pdev:		platform device created for the sensors
+ * @pdev:		platform device created for the sensor
  * @indio_dev:		iio device structure of the device
  * @physical_device:	true if the device refers to a physical device
  * @trigger_capture:    function pointer to call buffer is triggered,
  *    for backward compatibility.
- * @push_data:          function to call when cros_ec_sensorhub receives
- *    a sample for that sensor.
  *
  * Return: 0 on success, -errno on failure.
  */
 int cros_ec_sensors_core_init(struct platform_device *pdev,
 			      struct iio_dev *indio_dev,
 			      bool physical_device,
-			      cros_ec_sensors_capture_t trigger_capture,
-			      cros_ec_sensorhub_push_data_cb_t push_data)
+			      cros_ec_sensors_capture_t trigger_capture)
 {
 	struct device *dev = &pdev->dev;
 	struct cros_ec_sensors_core_state *state = iio_priv(indio_dev);
@@ -340,17 +337,6 @@ int cros_ec_sensors_core_init(struct platform_device *pdev,
 			if (ret)
 				return ret;
 
-			ret = cros_ec_sensorhub_register_push_data(
-					sensor_hub, sensor_platform->sensor_num,
-					indio_dev, push_data);
-			if (ret)
-				return ret;
-
-			ret = devm_add_action_or_reset(
-					dev, cros_ec_sensors_core_clean, pdev);
-			if (ret)
-				return ret;
-
 			/* Timestamp coming from FIFO are in ns since boot. */
 			ret = iio_device_set_clock(indio_dev, CLOCK_BOOTTIME);
 			if (ret)
@@ -372,6 +358,46 @@ int cros_ec_sensors_core_init(struct platform_device *pdev,
 }
 EXPORT_SYMBOL_GPL(cros_ec_sensors_core_init);
 
+/**
+ * cros_ec_sensors_core_register() - Register callback to FIFO and IIO when
+ * sensor is ready.
+ * It must be called at the end of the sensor probe routine.
+ * @dev:		device created for the sensor
+ * @indio_dev:		iio device structure of the device
+ * @push_data:          function to call when cros_ec_sensorhub receives
+ *    a sample for that sensor.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int cros_ec_sensors_core_register(struct device *dev,
+				  struct iio_dev *indio_dev,
+				  cros_ec_sensorhub_push_data_cb_t push_data)
+{
+	struct cros_ec_sensor_platform *sensor_platform = dev_get_platdata(dev);
+	struct cros_ec_sensorhub *sensor_hub = dev_get_drvdata(dev->parent);
+	struct platform_device *pdev = to_platform_device(dev);
+	struct cros_ec_dev *ec = sensor_hub->ec;
+	int ret;
+
+	ret = devm_iio_device_register(dev, indio_dev);
+	if (ret)
+		return ret;
+
+	if (!push_data ||
+	    !cros_ec_check_features(ec, EC_FEATURE_MOTION_SENSE_FIFO))
+		return 0;
+
+	ret = cros_ec_sensorhub_register_push_data(
+			sensor_hub, sensor_platform->sensor_num,
+			indio_dev, push_data);
+	if (ret)
+		return ret;
+
+	return devm_add_action_or_reset(
+			dev, cros_ec_sensors_core_clean, pdev);
+}
+EXPORT_SYMBOL_GPL(cros_ec_sensors_core_register);
+
 /**
  * cros_ec_motion_send_host_cmd() - send motion sense host command
  * @state:		pointer to state information for device
diff --git a/drivers/iio/light/cros_ec_light_prox.c b/drivers/iio/light/cros_ec_light_prox.c
index e345e0f71b74..19e529c84e95 100644
--- a/drivers/iio/light/cros_ec_light_prox.c
+++ b/drivers/iio/light/cros_ec_light_prox.c
@@ -182,8 +182,7 @@ static int cros_ec_light_prox_probe(struct platform_device *pdev)
 		return -ENOMEM;
 
 	ret = cros_ec_sensors_core_init(pdev, indio_dev, true,
-					cros_ec_sensors_capture,
-					cros_ec_sensors_push_data);
+					cros_ec_sensors_capture);
 	if (ret)
 		return ret;
 
@@ -239,7 +238,8 @@ static int cros_ec_light_prox_probe(struct platform_device *pdev)
 
 	state->core.read_ec_sensors_data = cros_ec_sensors_read_cmd;
 
-	return devm_iio_device_register(dev, indio_dev);
+	return cros_ec_sensors_core_register(dev, indio_dev,
+					     cros_ec_sensors_push_data);
 }
 
 static const struct platform_device_id cros_ec_light_prox_ids[] = {
diff --git a/drivers/iio/pressure/cros_ec_baro.c b/drivers/iio/pressure/cros_ec_baro.c
index 25217279f350..2649c2f89e89 100644
--- a/drivers/iio/pressure/cros_ec_baro.c
+++ b/drivers/iio/pressure/cros_ec_baro.c
@@ -139,8 +139,7 @@ static int cros_ec_baro_probe(struct platform_device *pdev)
 		return -ENOMEM;
 
 	ret = cros_ec_sensors_core_init(pdev, indio_dev, true,
-					cros_ec_sensors_capture,
-					cros_ec_sensors_push_data);
+					cros_ec_sensors_capture);
 	if (ret)
 		return ret;
 
@@ -185,7 +184,8 @@ static int cros_ec_baro_probe(struct platform_device *pdev)
 
 	state->core.read_ec_sensors_data = cros_ec_sensors_read_cmd;
 
-	return devm_iio_device_register(dev, indio_dev);
+	return cros_ec_sensors_core_register(dev, indio_dev,
+					     cros_ec_sensors_push_data);
 }
 
 static const struct platform_device_id cros_ec_baro_ids[] = {
diff --git a/include/linux/iio/common/cros_ec_sensors_core.h b/include/linux/iio/common/cros_ec_sensors_core.h
index a8259c8822f5..e72167b96d27 100644
--- a/include/linux/iio/common/cros_ec_sensors_core.h
+++ b/include/linux/iio/common/cros_ec_sensors_core.h
@@ -93,8 +93,11 @@ int cros_ec_sensors_read_cmd(struct iio_dev *indio_dev, unsigned long scan_mask,
 struct platform_device;
 int cros_ec_sensors_core_init(struct platform_device *pdev,
 			      struct iio_dev *indio_dev, bool physical_device,
-			      cros_ec_sensors_capture_t trigger_capture,
-			      cros_ec_sensorhub_push_data_cb_t push_data);
+			      cros_ec_sensors_capture_t trigger_capture);
+
+int cros_ec_sensors_core_register(struct device *dev,
+				  struct iio_dev *indio_dev,
+				  cros_ec_sensorhub_push_data_cb_t push_data);
 
 irqreturn_t cros_ec_sensors_capture(int irq, void *p);
 int cros_ec_sensors_push_data(struct iio_dev *indio_dev,
-- 
cgit 


From d276a22314c2bad9136c5e0b09eb3c8a560e1161 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 18 Jul 2022 08:30:13 +0200
Subject: ublk: remove UBLK_IO_F_INTEGRITY

The ublk protocol has no mechanism to actually transfer the integrity
metadata, so don't define this flag, which requires that an integrity
payload is attached to a bio.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20220718063013.335531-1-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c      | 3 ---
 include/uapi/linux/ublk_cmd.h | 1 -
 2 files changed, 4 deletions(-)

(limited to 'include')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 42afab25864f..796d8230fb60 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -389,9 +389,6 @@ static inline unsigned int ublk_req_build_flags(struct request *req)
 	if (req->cmd_flags & REQ_META)
 		flags |= UBLK_IO_F_META;
 
-	if (req->cmd_flags & REQ_INTEGRITY)
-		flags |= UBLK_IO_F_INTEGRITY;
-
 	if (req->cmd_flags & REQ_FUA)
 		flags |= UBLK_IO_F_FUA;
 
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index a3f5e7c21807..d6879eea2fde 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -106,7 +106,6 @@ struct ublksrv_ctrl_dev_info {
 #define		UBLK_IO_F_FAILFAST_TRANSPORT	(1U << 9)
 #define		UBLK_IO_F_FAILFAST_DRIVER	(1U << 10)
 #define		UBLK_IO_F_META			(1U << 11)
-#define		UBLK_IO_F_INTEGRITY		(1U << 12)
 #define		UBLK_IO_F_FUA			(1U << 13)
 #define		UBLK_IO_F_PREFLUSH		(1U << 14)
 #define		UBLK_IO_F_NOUNMAP		(1U << 15)
-- 
cgit 


From f4f451a16dd1f478fdb966bcbb612c1e4ce6b962 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Tue, 5 Jul 2022 20:35:32 +0800
Subject: mm: fix missing wake-up event for FSDAX pages

FSDAX page refcounts are 1-based, rather than 0-based: if refcount is
1, then the page is freed.  The FSDAX pages can be pinned through GUP,
then they will be unpinned via unpin_user_page() using a folio variant
to put the page, however, folio variants did not consider this special
case, the result will be to miss a wakeup event (like the user of
__fuse_dax_break_layouts()).  This results in a task being permanently
stuck in TASK_INTERRUPTIBLE state.

Since FSDAX pages are only possibly obtained by GUP users, so fix GUP
instead of folio_put() to lower overhead.

Link: https://lkml.kernel.org/r/20220705123532.283-1-songmuchun@bytedance.com
Fixes: d8ddc099c6b3 ("mm/gup: Add gup_put_folio()")
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Suggested-by: Matthew Wilcox <willy@infradead.org>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: William Kucharski <william.kucharski@oracle.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 14 +++++++++-----
 mm/gup.c           |  6 ++++--
 mm/memremap.c      |  6 +++---
 3 files changed, 16 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index cf3d0d673f6b..7898e29bcfb5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1130,23 +1130,27 @@ static inline bool is_zone_movable_page(const struct page *page)
 #if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
 DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
 
-bool __put_devmap_managed_page(struct page *page);
-static inline bool put_devmap_managed_page(struct page *page)
+bool __put_devmap_managed_page_refs(struct page *page, int refs);
+static inline bool put_devmap_managed_page_refs(struct page *page, int refs)
 {
 	if (!static_branch_unlikely(&devmap_managed_key))
 		return false;
 	if (!is_zone_device_page(page))
 		return false;
-	return __put_devmap_managed_page(page);
+	return __put_devmap_managed_page_refs(page, refs);
 }
-
 #else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
-static inline bool put_devmap_managed_page(struct page *page)
+static inline bool put_devmap_managed_page_refs(struct page *page, int refs)
 {
 	return false;
 }
 #endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
 
+static inline bool put_devmap_managed_page(struct page *page)
+{
+	return put_devmap_managed_page_refs(page, 1);
+}
+
 /* 127: arbitrary random number, small enough to assemble well */
 #define folio_ref_zero_or_close_to_overflow(folio) \
 	((unsigned int) folio_ref_count(folio) + 127u <= 127u)
diff --git a/mm/gup.c b/mm/gup.c
index 551264407624..e2a39e30756d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -87,7 +87,8 @@ retry:
 	 * belongs to this folio.
 	 */
 	if (unlikely(page_folio(page) != folio)) {
-		folio_put_refs(folio, refs);
+		if (!put_devmap_managed_page_refs(&folio->page, refs))
+			folio_put_refs(folio, refs);
 		goto retry;
 	}
 
@@ -176,7 +177,8 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
 			refs *= GUP_PIN_COUNTING_BIAS;
 	}
 
-	folio_put_refs(folio, refs);
+	if (!put_devmap_managed_page_refs(&folio->page, refs))
+		folio_put_refs(folio, refs);
 }
 
 /**
diff --git a/mm/memremap.c b/mm/memremap.c
index b870a659eee6..745eea0f99c3 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -499,7 +499,7 @@ void free_zone_device_page(struct page *page)
 }
 
 #ifdef CONFIG_FS_DAX
-bool __put_devmap_managed_page(struct page *page)
+bool __put_devmap_managed_page_refs(struct page *page, int refs)
 {
 	if (page->pgmap->type != MEMORY_DEVICE_FS_DAX)
 		return false;
@@ -509,9 +509,9 @@ bool __put_devmap_managed_page(struct page *page)
 	 * refcount is 1, then the page is free and the refcount is
 	 * stable because nobody holds a reference on the page.
 	 */
-	if (page_ref_dec_return(page) == 1)
+	if (page_ref_sub_return(page, refs) == 1)
 		wake_up_var(&page->_refcount);
 	return true;
 }
-EXPORT_SYMBOL(__put_devmap_managed_page);
+EXPORT_SYMBOL(__put_devmap_managed_page_refs);
 #endif /* CONFIG_FS_DAX */
-- 
cgit 


From c61c6c6507920598bd09db0dd4859a649e3f5ab0 Mon Sep 17 00:00:00 2001
From: Stephan Gerhold <stephan.gerhold@kernkonzept.com>
Date: Tue, 5 Jul 2022 16:35:18 +0200
Subject: dt-bindings: power: qcom-rpmpd: Add MSM8909 power domains

MSM8909 has the same power domains as MSM8916 so just define them
as aliases for the existing definitions.

Signed-off-by: Stephan Gerhold <stephan.gerhold@kernkonzept.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Link: https://lore.kernel.org/r/20220705143523.3390944-4-stephan.gerhold@kernkonzept.com
---
 Documentation/devicetree/bindings/power/qcom,rpmpd.yaml | 1 +
 include/dt-bindings/power/qcom-rpmpd.h                  | 7 +++++++
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/power/qcom,rpmpd.yaml b/Documentation/devicetree/bindings/power/qcom,rpmpd.yaml
index ad77a6380f38..0ccca493251a 100644
--- a/Documentation/devicetree/bindings/power/qcom,rpmpd.yaml
+++ b/Documentation/devicetree/bindings/power/qcom,rpmpd.yaml
@@ -18,6 +18,7 @@ properties:
     enum:
       - qcom,mdm9607-rpmpd
       - qcom,msm8226-rpmpd
+      - qcom,msm8909-rpmpd
       - qcom,msm8916-rpmpd
       - qcom,msm8939-rpmpd
       - qcom,msm8953-rpmpd
diff --git a/include/dt-bindings/power/qcom-rpmpd.h b/include/dt-bindings/power/qcom-rpmpd.h
index 6cce5b7aa940..d81de63ae31c 100644
--- a/include/dt-bindings/power/qcom-rpmpd.h
+++ b/include/dt-bindings/power/qcom-rpmpd.h
@@ -187,6 +187,13 @@
 #define MSM8916_VDDMX		3
 #define MSM8916_VDDMX_AO	4
 
+/* MSM8909 Power Domain Indexes */
+#define MSM8909_VDDCX		MSM8916_VDDCX
+#define MSM8909_VDDCX_AO	MSM8916_VDDCX_AO
+#define MSM8909_VDDCX_VFC	MSM8916_VDDCX_VFC
+#define MSM8909_VDDMX		MSM8916_VDDMX
+#define MSM8909_VDDMX_AO	MSM8916_VDDMX_AO
+
 /* MSM8953 Power Domain Indexes */
 #define MSM8953_VDDMD		0
 #define MSM8953_VDDMD_AO	1
-- 
cgit 


From 2e07a521e1e424787af3bfc59615de4220856c35 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 12 Jul 2022 21:52:28 +0100
Subject: skbuff: add SKBFL_DONT_ORPHAN flag

We don't want to list every single ubuf_info callback in
skb_orphan_frags(), add a flag controlling the behaviour.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 8 +++++---
 net/core/skbuff.c      | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d3d10556f0fa..8e12b3b9ad6c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -686,10 +686,13 @@ enum {
 	 * charged to the kernel memory.
 	 */
 	SKBFL_PURE_ZEROCOPY = BIT(2),
+
+	SKBFL_DONT_ORPHAN = BIT(3),
 };
 
 #define SKBFL_ZEROCOPY_FRAG	(SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
-#define SKBFL_ALL_ZEROCOPY	(SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY)
+#define SKBFL_ALL_ZEROCOPY	(SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \
+				 SKBFL_DONT_ORPHAN)
 
 /*
  * The callback notifies userspace to release buffers when skb DMA is done in
@@ -3182,8 +3185,7 @@ static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
 {
 	if (likely(!skb_zcopy(skb)))
 		return 0;
-	if (!skb_zcopy_is_nouarg(skb) &&
-	    skb_uarg(skb)->callback == msg_zerocopy_callback)
+	if (skb_shinfo(skb)->flags & SKBFL_DONT_ORPHAN)
 		return 0;
 	return skb_copy_ubufs(skb, gfp_mask);
 }
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 09f56bfa2771..fc22b3d32052 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1193,7 +1193,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
 	uarg->len = 1;
 	uarg->bytelen = size;
 	uarg->zerocopy = 1;
-	uarg->flags = SKBFL_ZEROCOPY_FRAG;
+	uarg->flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
 	refcount_set(&uarg->refcnt, 1);
 	sock_hold(sk);
 
-- 
cgit 


From 852e85a704c2e11c050bdea286bc438aba4f4a22 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Sat, 16 Jul 2022 13:02:34 +0200
Subject: net: devlink: add unlocked variants of devling_trap*() functions

Add unlocked variants of devl_trap*() functions to be used in drivers
called-in with devlink->lock held.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/devlink.h |  20 ++++++
 net/core/devlink.c    | 180 +++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 168 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 88c701b375a2..fb1e17d998b6 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1745,9 +1745,15 @@ void devlink_flash_update_timeout_notify(struct devlink *devlink,
 					 const char *component,
 					 unsigned long timeout);
 
+int devl_traps_register(struct devlink *devlink,
+			const struct devlink_trap *traps,
+			size_t traps_count, void *priv);
 int devlink_traps_register(struct devlink *devlink,
 			   const struct devlink_trap *traps,
 			   size_t traps_count, void *priv);
+void devl_traps_unregister(struct devlink *devlink,
+			   const struct devlink_trap *traps,
+			   size_t traps_count);
 void devlink_traps_unregister(struct devlink *devlink,
 			      const struct devlink_trap *traps,
 			      size_t traps_count);
@@ -1755,17 +1761,31 @@ void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb,
 			 void *trap_ctx, struct devlink_port *in_devlink_port,
 			 const struct flow_action_cookie *fa_cookie);
 void *devlink_trap_ctx_priv(void *trap_ctx);
+int devl_trap_groups_register(struct devlink *devlink,
+			      const struct devlink_trap_group *groups,
+			      size_t groups_count);
 int devlink_trap_groups_register(struct devlink *devlink,
 				 const struct devlink_trap_group *groups,
 				 size_t groups_count);
+void devl_trap_groups_unregister(struct devlink *devlink,
+				 const struct devlink_trap_group *groups,
+				 size_t groups_count);
 void devlink_trap_groups_unregister(struct devlink *devlink,
 				    const struct devlink_trap_group *groups,
 				    size_t groups_count);
 int
+devl_trap_policers_register(struct devlink *devlink,
+			    const struct devlink_trap_policer *policers,
+			    size_t policers_count);
+int
 devlink_trap_policers_register(struct devlink *devlink,
 			       const struct devlink_trap_policer *policers,
 			       size_t policers_count);
 void
+devl_trap_policers_unregister(struct devlink *devlink,
+			      const struct devlink_trap_policer *policers,
+			      size_t policers_count);
+void
 devlink_trap_policers_unregister(struct devlink *devlink,
 				 const struct devlink_trap_policer *policers,
 				 size_t policers_count);
diff --git a/net/core/devlink.c b/net/core/devlink.c
index d2a4e6ee1be6..b0f6e8388880 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -11544,7 +11544,7 @@ static void devlink_trap_disable(struct devlink *devlink,
 }
 
 /**
- * devlink_traps_register - Register packet traps with devlink.
+ * devl_traps_register - Register packet traps with devlink.
  * @devlink: devlink.
  * @traps: Packet traps.
  * @traps_count: Count of provided packet traps.
@@ -11552,16 +11552,16 @@ static void devlink_trap_disable(struct devlink *devlink,
  *
  * Return: Non-zero value on failure.
  */
-int devlink_traps_register(struct devlink *devlink,
-			   const struct devlink_trap *traps,
-			   size_t traps_count, void *priv)
+int devl_traps_register(struct devlink *devlink,
+			const struct devlink_trap *traps,
+			size_t traps_count, void *priv)
 {
 	int i, err;
 
 	if (!devlink->ops->trap_init || !devlink->ops->trap_action_set)
 		return -EINVAL;
 
-	devl_lock(devlink);
+	devl_assert_locked(devlink);
 	for (i = 0; i < traps_count; i++) {
 		const struct devlink_trap *trap = &traps[i];
 
@@ -11573,7 +11573,6 @@ int devlink_traps_register(struct devlink *devlink,
 		if (err)
 			goto err_trap_register;
 	}
-	devl_unlock(devlink);
 
 	return 0;
 
@@ -11581,24 +11580,47 @@ err_trap_register:
 err_trap_verify:
 	for (i--; i >= 0; i--)
 		devlink_trap_unregister(devlink, &traps[i]);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devl_traps_register);
+
+/**
+ * devlink_traps_register - Register packet traps with devlink.
+ * @devlink: devlink.
+ * @traps: Packet traps.
+ * @traps_count: Count of provided packet traps.
+ * @priv: Driver private information.
+ *
+ * Context: Takes and release devlink->lock <mutex>.
+ *
+ * Return: Non-zero value on failure.
+ */
+int devlink_traps_register(struct devlink *devlink,
+			   const struct devlink_trap *traps,
+			   size_t traps_count, void *priv)
+{
+	int err;
+
+	devl_lock(devlink);
+	err = devl_traps_register(devlink, traps, traps_count, priv);
 	devl_unlock(devlink);
 	return err;
 }
 EXPORT_SYMBOL_GPL(devlink_traps_register);
 
 /**
- * devlink_traps_unregister - Unregister packet traps from devlink.
+ * devl_traps_unregister - Unregister packet traps from devlink.
  * @devlink: devlink.
  * @traps: Packet traps.
  * @traps_count: Count of provided packet traps.
  */
-void devlink_traps_unregister(struct devlink *devlink,
-			      const struct devlink_trap *traps,
-			      size_t traps_count)
+void devl_traps_unregister(struct devlink *devlink,
+			   const struct devlink_trap *traps,
+			   size_t traps_count)
 {
 	int i;
 
-	devl_lock(devlink);
+	devl_assert_locked(devlink);
 	/* Make sure we do not have any packets in-flight while unregistering
 	 * traps by disabling all of them and waiting for a grace period.
 	 */
@@ -11607,6 +11629,23 @@ void devlink_traps_unregister(struct devlink *devlink,
 	synchronize_rcu();
 	for (i = traps_count - 1; i >= 0; i--)
 		devlink_trap_unregister(devlink, &traps[i]);
+}
+EXPORT_SYMBOL_GPL(devl_traps_unregister);
+
+/**
+ * devlink_traps_unregister - Unregister packet traps from devlink.
+ * @devlink: devlink.
+ * @traps: Packet traps.
+ * @traps_count: Count of provided packet traps.
+ *
+ * Context: Takes and release devlink->lock <mutex>.
+ */
+void devlink_traps_unregister(struct devlink *devlink,
+			      const struct devlink_trap *traps,
+			      size_t traps_count)
+{
+	devl_lock(devlink);
+	devl_traps_unregister(devlink, traps, traps_count);
 	devl_unlock(devlink);
 }
 EXPORT_SYMBOL_GPL(devlink_traps_unregister);
@@ -11766,20 +11805,20 @@ devlink_trap_group_unregister(struct devlink *devlink,
 }
 
 /**
- * devlink_trap_groups_register - Register packet trap groups with devlink.
+ * devl_trap_groups_register - Register packet trap groups with devlink.
  * @devlink: devlink.
  * @groups: Packet trap groups.
  * @groups_count: Count of provided packet trap groups.
  *
  * Return: Non-zero value on failure.
  */
-int devlink_trap_groups_register(struct devlink *devlink,
-				 const struct devlink_trap_group *groups,
-				 size_t groups_count)
+int devl_trap_groups_register(struct devlink *devlink,
+			      const struct devlink_trap_group *groups,
+			      size_t groups_count)
 {
 	int i, err;
 
-	devl_lock(devlink);
+	devl_assert_locked(devlink);
 	for (i = 0; i < groups_count; i++) {
 		const struct devlink_trap_group *group = &groups[i];
 
@@ -11791,7 +11830,6 @@ int devlink_trap_groups_register(struct devlink *devlink,
 		if (err)
 			goto err_trap_group_register;
 	}
-	devl_unlock(devlink);
 
 	return 0;
 
@@ -11799,26 +11837,65 @@ err_trap_group_register:
 err_trap_group_verify:
 	for (i--; i >= 0; i--)
 		devlink_trap_group_unregister(devlink, &groups[i]);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devl_trap_groups_register);
+
+/**
+ * devlink_trap_groups_register - Register packet trap groups with devlink.
+ * @devlink: devlink.
+ * @groups: Packet trap groups.
+ * @groups_count: Count of provided packet trap groups.
+ *
+ * Context: Takes and release devlink->lock <mutex>.
+ *
+ * Return: Non-zero value on failure.
+ */
+int devlink_trap_groups_register(struct devlink *devlink,
+				 const struct devlink_trap_group *groups,
+				 size_t groups_count)
+{
+	int err;
+
+	devl_lock(devlink);
+	err = devl_trap_groups_register(devlink, groups, groups_count);
 	devl_unlock(devlink);
 	return err;
 }
 EXPORT_SYMBOL_GPL(devlink_trap_groups_register);
 
+/**
+ * devl_trap_groups_unregister - Unregister packet trap groups from devlink.
+ * @devlink: devlink.
+ * @groups: Packet trap groups.
+ * @groups_count: Count of provided packet trap groups.
+ */
+void devl_trap_groups_unregister(struct devlink *devlink,
+				 const struct devlink_trap_group *groups,
+				 size_t groups_count)
+{
+	int i;
+
+	devl_assert_locked(devlink);
+	for (i = groups_count - 1; i >= 0; i--)
+		devlink_trap_group_unregister(devlink, &groups[i]);
+}
+EXPORT_SYMBOL_GPL(devl_trap_groups_unregister);
+
 /**
  * devlink_trap_groups_unregister - Unregister packet trap groups from devlink.
  * @devlink: devlink.
  * @groups: Packet trap groups.
  * @groups_count: Count of provided packet trap groups.
+ *
+ * Context: Takes and release devlink->lock <mutex>.
  */
 void devlink_trap_groups_unregister(struct devlink *devlink,
 				    const struct devlink_trap_group *groups,
 				    size_t groups_count)
 {
-	int i;
-
 	devl_lock(devlink);
-	for (i = groups_count - 1; i >= 0; i--)
-		devlink_trap_group_unregister(devlink, &groups[i]);
+	devl_trap_groups_unregister(devlink, groups, groups_count);
 	devl_unlock(devlink);
 }
 EXPORT_SYMBOL_GPL(devlink_trap_groups_unregister);
@@ -11905,7 +11982,7 @@ devlink_trap_policer_unregister(struct devlink *devlink,
 }
 
 /**
- * devlink_trap_policers_register - Register packet trap policers with devlink.
+ * devl_trap_policers_register - Register packet trap policers with devlink.
  * @devlink: devlink.
  * @policers: Packet trap policers.
  * @policers_count: Count of provided packet trap policers.
@@ -11913,13 +11990,13 @@ devlink_trap_policer_unregister(struct devlink *devlink,
  * Return: Non-zero value on failure.
  */
 int
-devlink_trap_policers_register(struct devlink *devlink,
-			       const struct devlink_trap_policer *policers,
-			       size_t policers_count)
+devl_trap_policers_register(struct devlink *devlink,
+			    const struct devlink_trap_policer *policers,
+			    size_t policers_count)
 {
 	int i, err;
 
-	devl_lock(devlink);
+	devl_assert_locked(devlink);
 	for (i = 0; i < policers_count; i++) {
 		const struct devlink_trap_policer *policer = &policers[i];
 
@@ -11934,35 +12011,74 @@ devlink_trap_policers_register(struct devlink *devlink,
 		if (err)
 			goto err_trap_policer_register;
 	}
-	devl_unlock(devlink);
-
 	return 0;
 
 err_trap_policer_register:
 err_trap_policer_verify:
 	for (i--; i >= 0; i--)
 		devlink_trap_policer_unregister(devlink, &policers[i]);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devl_trap_policers_register);
+
+/**
+ * devlink_trap_policers_register - Register packet trap policers with devlink.
+ * @devlink: devlink.
+ * @policers: Packet trap policers.
+ * @policers_count: Count of provided packet trap policers.
+ *
+ * Return: Non-zero value on failure.
+ *
+ * Context: Takes and release devlink->lock <mutex>.
+ */
+int
+devlink_trap_policers_register(struct devlink *devlink,
+			       const struct devlink_trap_policer *policers,
+			       size_t policers_count)
+{
+	int err;
+
+	devl_lock(devlink);
+	err = devl_trap_policers_register(devlink, policers, policers_count);
 	devl_unlock(devlink);
 	return err;
 }
 EXPORT_SYMBOL_GPL(devlink_trap_policers_register);
 
+/**
+ * devl_trap_policers_unregister - Unregister packet trap policers from devlink.
+ * @devlink: devlink.
+ * @policers: Packet trap policers.
+ * @policers_count: Count of provided packet trap policers.
+ */
+void
+devl_trap_policers_unregister(struct devlink *devlink,
+			      const struct devlink_trap_policer *policers,
+			      size_t policers_count)
+{
+	int i;
+
+	devl_assert_locked(devlink);
+	for (i = policers_count - 1; i >= 0; i--)
+		devlink_trap_policer_unregister(devlink, &policers[i]);
+}
+EXPORT_SYMBOL_GPL(devl_trap_policers_unregister);
+
 /**
  * devlink_trap_policers_unregister - Unregister packet trap policers from devlink.
  * @devlink: devlink.
  * @policers: Packet trap policers.
  * @policers_count: Count of provided packet trap policers.
+ *
+ * Context: Takes and release devlink->lock <mutex>.
  */
 void
 devlink_trap_policers_unregister(struct devlink *devlink,
 				 const struct devlink_trap_policer *policers,
 				 size_t policers_count)
 {
-	int i;
-
 	devl_lock(devlink);
-	for (i = policers_count - 1; i >= 0; i--)
-		devlink_trap_policer_unregister(devlink, &policers[i]);
+	devl_trap_policers_unregister(devlink, policers, policers_count);
 	devl_unlock(devlink);
 }
 EXPORT_SYMBOL_GPL(devlink_trap_policers_unregister);
-- 
cgit 


From c223d6a4bf6d2a88f2418316393f9170f81c54f0 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Sat, 16 Jul 2022 13:02:35 +0200
Subject: net: devlink: add unlocked variants of devlink_resource*() functions

Add unlocked variants of devlink_resource*() functions to be used
in drivers called-in with devlink->lock held.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/devlink.h |  17 ++++
 net/core/devlink.c    | 217 ++++++++++++++++++++++++++++++++++++--------------
 2 files changed, 173 insertions(+), 61 deletions(-)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index fb1e17d998b6..d341753753ce 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1608,23 +1608,40 @@ extern struct devlink_dpipe_header devlink_dpipe_header_ethernet;
 extern struct devlink_dpipe_header devlink_dpipe_header_ipv4;
 extern struct devlink_dpipe_header devlink_dpipe_header_ipv6;
 
+int devl_resource_register(struct devlink *devlink,
+			   const char *resource_name,
+			   u64 resource_size,
+			   u64 resource_id,
+			   u64 parent_resource_id,
+			   const struct devlink_resource_size_params *size_params);
 int devlink_resource_register(struct devlink *devlink,
 			      const char *resource_name,
 			      u64 resource_size,
 			      u64 resource_id,
 			      u64 parent_resource_id,
 			      const struct devlink_resource_size_params *size_params);
+void devl_resources_unregister(struct devlink *devlink);
 void devlink_resources_unregister(struct devlink *devlink);
+int devl_resource_size_get(struct devlink *devlink,
+			   u64 resource_id,
+			   u64 *p_resource_size);
 int devlink_resource_size_get(struct devlink *devlink,
 			      u64 resource_id,
 			      u64 *p_resource_size);
 int devlink_dpipe_table_resource_set(struct devlink *devlink,
 				     const char *table_name, u64 resource_id,
 				     u64 resource_units);
+void devl_resource_occ_get_register(struct devlink *devlink,
+				    u64 resource_id,
+				    devlink_resource_occ_get_t *occ_get,
+				    void *occ_get_priv);
 void devlink_resource_occ_get_register(struct devlink *devlink,
 				       u64 resource_id,
 				       devlink_resource_occ_get_t *occ_get,
 				       void *occ_get_priv);
+void devl_resource_occ_get_unregister(struct devlink *devlink,
+				      u64 resource_id);
+
 void devlink_resource_occ_get_unregister(struct devlink *devlink,
 					 u64 resource_id);
 int devlink_params_register(struct devlink *devlink,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index b0f6e8388880..1688271ef7b2 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -10555,45 +10555,41 @@ unlock:
 EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister);
 
 /**
- *	devlink_resource_register - devlink resource register
+ * devl_resource_register - devlink resource register
  *
- *	@devlink: devlink
- *	@resource_name: resource's name
- *	@resource_size: resource's size
- *	@resource_id: resource's id
- *	@parent_resource_id: resource's parent id
- *	@size_params: size parameters
+ * @devlink: devlink
+ * @resource_name: resource's name
+ * @resource_size: resource's size
+ * @resource_id: resource's id
+ * @parent_resource_id: resource's parent id
+ * @size_params: size parameters
  *
- *	Generic resources should reuse the same names across drivers.
- *	Please see the generic resources list at:
- *	Documentation/networking/devlink/devlink-resource.rst
+ * Generic resources should reuse the same names across drivers.
+ * Please see the generic resources list at:
+ * Documentation/networking/devlink/devlink-resource.rst
  */
-int devlink_resource_register(struct devlink *devlink,
-			      const char *resource_name,
-			      u64 resource_size,
-			      u64 resource_id,
-			      u64 parent_resource_id,
-			      const struct devlink_resource_size_params *size_params)
+int devl_resource_register(struct devlink *devlink,
+			   const char *resource_name,
+			   u64 resource_size,
+			   u64 resource_id,
+			   u64 parent_resource_id,
+			   const struct devlink_resource_size_params *size_params)
 {
 	struct devlink_resource *resource;
 	struct list_head *resource_list;
 	bool top_hierarchy;
-	int err = 0;
+
+	lockdep_assert_held(&devlink->lock);
 
 	top_hierarchy = parent_resource_id == DEVLINK_RESOURCE_ID_PARENT_TOP;
 
-	devl_lock(devlink);
 	resource = devlink_resource_find(devlink, NULL, resource_id);
-	if (resource) {
-		err = -EINVAL;
-		goto out;
-	}
+	if (resource)
+		return -EINVAL;
 
 	resource = kzalloc(sizeof(*resource), GFP_KERNEL);
-	if (!resource) {
-		err = -ENOMEM;
-		goto out;
-	}
+	if (!resource)
+		return -ENOMEM;
 
 	if (top_hierarchy) {
 		resource_list = &devlink->resource_list;
@@ -10607,8 +10603,7 @@ int devlink_resource_register(struct devlink *devlink,
 			resource->parent = parent_resource;
 		} else {
 			kfree(resource);
-			err = -EINVAL;
-			goto out;
+			return -EINVAL;
 		}
 	}
 
@@ -10621,7 +10616,39 @@ int devlink_resource_register(struct devlink *devlink,
 	       sizeof(resource->size_params));
 	INIT_LIST_HEAD(&resource->resource_list);
 	list_add_tail(&resource->list, resource_list);
-out:
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devl_resource_register);
+
+/**
+ *	devlink_resource_register - devlink resource register
+ *
+ *	@devlink: devlink
+ *	@resource_name: resource's name
+ *	@resource_size: resource's size
+ *	@resource_id: resource's id
+ *	@parent_resource_id: resource's parent id
+ *	@size_params: size parameters
+ *
+ *	Generic resources should reuse the same names across drivers.
+ *	Please see the generic resources list at:
+ *	Documentation/networking/devlink/devlink-resource.rst
+ *
+ *	Context: Takes and release devlink->lock <mutex>.
+ */
+int devlink_resource_register(struct devlink *devlink,
+			      const char *resource_name,
+			      u64 resource_size,
+			      u64 resource_id,
+			      u64 parent_resource_id,
+			      const struct devlink_resource_size_params *size_params)
+{
+	int err;
+
+	devl_lock(devlink);
+	err = devl_resource_register(devlink, resource_name, resource_size,
+				     resource_id, parent_resource_id, size_params);
 	devl_unlock(devlink);
 	return err;
 }
@@ -10641,15 +10668,15 @@ static void devlink_resource_unregister(struct devlink *devlink,
 }
 
 /**
- *	devlink_resources_unregister - free all resources
+ * devl_resources_unregister - free all resources
  *
- *	@devlink: devlink
+ * @devlink: devlink
  */
-void devlink_resources_unregister(struct devlink *devlink)
+void devl_resources_unregister(struct devlink *devlink)
 {
 	struct devlink_resource *tmp, *child_resource;
 
-	devl_lock(devlink);
+	lockdep_assert_held(&devlink->lock);
 
 	list_for_each_entry_safe(child_resource, tmp, &devlink->resource_list,
 				 list) {
@@ -10657,34 +10684,65 @@ void devlink_resources_unregister(struct devlink *devlink)
 		list_del(&child_resource->list);
 		kfree(child_resource);
 	}
+}
+EXPORT_SYMBOL_GPL(devl_resources_unregister);
 
+/**
+ *	devlink_resources_unregister - free all resources
+ *
+ *	@devlink: devlink
+ *
+ *	Context: Takes and release devlink->lock <mutex>.
+ */
+void devlink_resources_unregister(struct devlink *devlink)
+{
+	devl_lock(devlink);
+	devl_resources_unregister(devlink);
 	devl_unlock(devlink);
 }
 EXPORT_SYMBOL_GPL(devlink_resources_unregister);
 
+/**
+ * devl_resource_size_get - get and update size
+ *
+ * @devlink: devlink
+ * @resource_id: the requested resource id
+ * @p_resource_size: ptr to update
+ */
+int devl_resource_size_get(struct devlink *devlink,
+			   u64 resource_id,
+			   u64 *p_resource_size)
+{
+	struct devlink_resource *resource;
+
+	lockdep_assert_held(&devlink->lock);
+
+	resource = devlink_resource_find(devlink, NULL, resource_id);
+	if (!resource)
+		return -EINVAL;
+	*p_resource_size = resource->size_new;
+	resource->size = resource->size_new;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devl_resource_size_get);
+
 /**
  *	devlink_resource_size_get - get and update size
  *
  *	@devlink: devlink
  *	@resource_id: the requested resource id
  *	@p_resource_size: ptr to update
+ *
+ *	Context: Takes and release devlink->lock <mutex>.
  */
 int devlink_resource_size_get(struct devlink *devlink,
 			      u64 resource_id,
 			      u64 *p_resource_size)
 {
-	struct devlink_resource *resource;
-	int err = 0;
+	int err;
 
 	devl_lock(devlink);
-	resource = devlink_resource_find(devlink, NULL, resource_id);
-	if (!resource) {
-		err = -EINVAL;
-		goto out;
-	}
-	*p_resource_size = resource->size_new;
-	resource->size = resource->size_new;
-out:
+	err = devl_resource_size_get(devlink, resource_id, p_resource_size);
 	devl_unlock(devlink);
 	return err;
 }
@@ -10721,6 +10779,33 @@ out:
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_table_resource_set);
 
+/**
+ * devl_resource_occ_get_register - register occupancy getter
+ *
+ * @devlink: devlink
+ * @resource_id: resource id
+ * @occ_get: occupancy getter callback
+ * @occ_get_priv: occupancy getter callback priv
+ */
+void devl_resource_occ_get_register(struct devlink *devlink,
+				    u64 resource_id,
+				    devlink_resource_occ_get_t *occ_get,
+				    void *occ_get_priv)
+{
+	struct devlink_resource *resource;
+
+	lockdep_assert_held(&devlink->lock);
+
+	resource = devlink_resource_find(devlink, NULL, resource_id);
+	if (WARN_ON(!resource))
+		return;
+	WARN_ON(resource->occ_get);
+
+	resource->occ_get = occ_get;
+	resource->occ_get_priv = occ_get_priv;
+}
+EXPORT_SYMBOL_GPL(devl_resource_occ_get_register);
+
 /**
  *	devlink_resource_occ_get_register - register occupancy getter
  *
@@ -10728,47 +10813,57 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_table_resource_set);
  *	@resource_id: resource id
  *	@occ_get: occupancy getter callback
  *	@occ_get_priv: occupancy getter callback priv
+ *
+ *	Context: Takes and release devlink->lock <mutex>.
  */
 void devlink_resource_occ_get_register(struct devlink *devlink,
 				       u64 resource_id,
 				       devlink_resource_occ_get_t *occ_get,
 				       void *occ_get_priv)
 {
-	struct devlink_resource *resource;
-
 	devl_lock(devlink);
-	resource = devlink_resource_find(devlink, NULL, resource_id);
-	if (WARN_ON(!resource))
-		goto out;
-	WARN_ON(resource->occ_get);
-
-	resource->occ_get = occ_get;
-	resource->occ_get_priv = occ_get_priv;
-out:
+	devl_resource_occ_get_register(devlink, resource_id,
+				       occ_get, occ_get_priv);
 	devl_unlock(devlink);
 }
 EXPORT_SYMBOL_GPL(devlink_resource_occ_get_register);
 
 /**
- *	devlink_resource_occ_get_unregister - unregister occupancy getter
+ * devl_resource_occ_get_unregister - unregister occupancy getter
  *
- *	@devlink: devlink
- *	@resource_id: resource id
+ * @devlink: devlink
+ * @resource_id: resource id
  */
-void devlink_resource_occ_get_unregister(struct devlink *devlink,
-					 u64 resource_id)
+void devl_resource_occ_get_unregister(struct devlink *devlink,
+				      u64 resource_id)
 {
 	struct devlink_resource *resource;
 
-	devl_lock(devlink);
+	lockdep_assert_held(&devlink->lock);
+
 	resource = devlink_resource_find(devlink, NULL, resource_id);
 	if (WARN_ON(!resource))
-		goto out;
+		return;
 	WARN_ON(!resource->occ_get);
 
 	resource->occ_get = NULL;
 	resource->occ_get_priv = NULL;
-out:
+}
+EXPORT_SYMBOL_GPL(devl_resource_occ_get_unregister);
+
+/**
+ *	devlink_resource_occ_get_unregister - unregister occupancy getter
+ *
+ *	@devlink: devlink
+ *	@resource_id: resource id
+ *
+ *	Context: Takes and release devlink->lock <mutex>.
+ */
+void devlink_resource_occ_get_unregister(struct devlink *devlink,
+					 u64 resource_id)
+{
+	devl_lock(devlink);
+	devl_resource_occ_get_unregister(devlink, resource_id);
 	devl_unlock(devlink);
 }
 EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister);
-- 
cgit 


From 755cfa69c4ece770c5a15dd51a9da2a7aafafa7c Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Sat, 16 Jul 2022 13:02:36 +0200
Subject: net: devlink: add unlocked variants of devlink_sb*() functions

Add unlocked variants of devlink_sb*() functions to be used
in drivers called-in with devlink->lock held.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/devlink.h |  5 +++++
 net/core/devlink.c    | 54 ++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 41 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index d341753753ce..0057809a13b0 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1579,10 +1579,15 @@ void devlink_linecard_provision_clear(struct devlink_linecard *linecard);
 void devlink_linecard_provision_fail(struct devlink_linecard *linecard);
 void devlink_linecard_activate(struct devlink_linecard *linecard);
 void devlink_linecard_deactivate(struct devlink_linecard *linecard);
+int devl_sb_register(struct devlink *devlink, unsigned int sb_index,
+		     u32 size, u16 ingress_pools_count,
+		     u16 egress_pools_count, u16 ingress_tc_count,
+		     u16 egress_tc_count);
 int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 			u32 size, u16 ingress_pools_count,
 			u16 egress_pools_count, u16 ingress_tc_count,
 			u16 egress_tc_count);
+void devl_sb_unregister(struct devlink *devlink, unsigned int sb_index);
 void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index);
 int devlink_dpipe_table_register(struct devlink *devlink,
 				 const char *table_name,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 1688271ef7b2..64dab4024d11 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -10375,25 +10375,21 @@ void devlink_linecard_deactivate(struct devlink_linecard *linecard)
 }
 EXPORT_SYMBOL_GPL(devlink_linecard_deactivate);
 
-int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
-			u32 size, u16 ingress_pools_count,
-			u16 egress_pools_count, u16 ingress_tc_count,
-			u16 egress_tc_count)
+int devl_sb_register(struct devlink *devlink, unsigned int sb_index,
+		     u32 size, u16 ingress_pools_count,
+		     u16 egress_pools_count, u16 ingress_tc_count,
+		     u16 egress_tc_count)
 {
 	struct devlink_sb *devlink_sb;
-	int err = 0;
 
-	devl_lock(devlink);
-	if (devlink_sb_index_exists(devlink, sb_index)) {
-		err = -EEXIST;
-		goto unlock;
-	}
+	lockdep_assert_held(&devlink->lock);
+
+	if (devlink_sb_index_exists(devlink, sb_index))
+		return -EEXIST;
 
 	devlink_sb = kzalloc(sizeof(*devlink_sb), GFP_KERNEL);
-	if (!devlink_sb) {
-		err = -ENOMEM;
-		goto unlock;
-	}
+	if (!devlink_sb)
+		return -ENOMEM;
 	devlink_sb->index = sb_index;
 	devlink_sb->size = size;
 	devlink_sb->ingress_pools_count = ingress_pools_count;
@@ -10401,23 +10397,45 @@ int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 	devlink_sb->ingress_tc_count = ingress_tc_count;
 	devlink_sb->egress_tc_count = egress_tc_count;
 	list_add_tail(&devlink_sb->list, &devlink->sb_list);
-unlock:
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devl_sb_register);
+
+int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
+			u32 size, u16 ingress_pools_count,
+			u16 egress_pools_count, u16 ingress_tc_count,
+			u16 egress_tc_count)
+{
+	int err;
+
+	devl_lock(devlink);
+	err = devl_sb_register(devlink, sb_index, size, ingress_pools_count,
+			       egress_pools_count, ingress_tc_count,
+			       egress_tc_count);
 	devl_unlock(devlink);
 	return err;
 }
 EXPORT_SYMBOL_GPL(devlink_sb_register);
 
-void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index)
+void devl_sb_unregister(struct devlink *devlink, unsigned int sb_index)
 {
 	struct devlink_sb *devlink_sb;
 
-	devl_lock(devlink);
+	lockdep_assert_held(&devlink->lock);
+
 	devlink_sb = devlink_sb_get_by_index(devlink, sb_index);
 	WARN_ON(!devlink_sb);
 	list_del(&devlink_sb->list);
-	devl_unlock(devlink);
 	kfree(devlink_sb);
 }
+EXPORT_SYMBOL_GPL(devl_sb_unregister);
+
+void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index)
+{
+	devl_lock(devlink);
+	devl_sb_unregister(devlink, sb_index);
+	devl_unlock(devlink);
+}
 EXPORT_SYMBOL_GPL(devlink_sb_unregister);
 
 /**
-- 
cgit 


From 70a2ff89369d17c55efacb789171c3bd05c21817 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Sat, 16 Jul 2022 13:02:37 +0200
Subject: net: devlink: add unlocked variants of devlink_dpipe*() functions

Add unlocked variants of devlink_dpipe*() functions to be used
in drivers called-in with devlink->lock held.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/devlink.h |  12 ++++
 net/core/devlink.c    | 181 +++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 147 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 0057809a13b0..18ad88527847 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1589,14 +1589,23 @@ int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 			u16 egress_tc_count);
 void devl_sb_unregister(struct devlink *devlink, unsigned int sb_index);
 void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index);
+int devl_dpipe_table_register(struct devlink *devlink,
+			      const char *table_name,
+			      struct devlink_dpipe_table_ops *table_ops,
+			      void *priv, bool counter_control_extern);
 int devlink_dpipe_table_register(struct devlink *devlink,
 				 const char *table_name,
 				 struct devlink_dpipe_table_ops *table_ops,
 				 void *priv, bool counter_control_extern);
+void devl_dpipe_table_unregister(struct devlink *devlink,
+				 const char *table_name);
 void devlink_dpipe_table_unregister(struct devlink *devlink,
 				    const char *table_name);
+void devl_dpipe_headers_register(struct devlink *devlink,
+				 struct devlink_dpipe_headers *dpipe_headers);
 void devlink_dpipe_headers_register(struct devlink *devlink,
 				   struct devlink_dpipe_headers *dpipe_headers);
+void devl_dpipe_headers_unregister(struct devlink *devlink);
 void devlink_dpipe_headers_unregister(struct devlink *devlink);
 bool devlink_dpipe_table_counter_enabled(struct devlink *devlink,
 					 const char *table_name);
@@ -1633,6 +1642,9 @@ int devl_resource_size_get(struct devlink *devlink,
 int devlink_resource_size_get(struct devlink *devlink,
 			      u64 resource_id,
 			      u64 *p_resource_size);
+int devl_dpipe_table_resource_set(struct devlink *devlink,
+				  const char *table_name, u64 resource_id,
+				  u64 resource_units);
 int devlink_dpipe_table_resource_set(struct devlink *devlink,
 				     const char *table_name, u64 resource_id,
 				     u64 resource_units);
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 64dab4024d11..b249c18a8bbc 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -10438,6 +10438,23 @@ void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index)
 }
 EXPORT_SYMBOL_GPL(devlink_sb_unregister);
 
+/**
+ * devl_dpipe_headers_register - register dpipe headers
+ *
+ * @devlink: devlink
+ * @dpipe_headers: dpipe header array
+ *
+ * Register the headers supported by hardware.
+ */
+void devl_dpipe_headers_register(struct devlink *devlink,
+				 struct devlink_dpipe_headers *dpipe_headers)
+{
+	lockdep_assert_held(&devlink->lock);
+
+	devlink->dpipe_headers = dpipe_headers;
+}
+EXPORT_SYMBOL_GPL(devl_dpipe_headers_register);
+
 /**
  *	devlink_dpipe_headers_register - register dpipe headers
  *
@@ -10445,27 +10462,46 @@ EXPORT_SYMBOL_GPL(devlink_sb_unregister);
  *	@dpipe_headers: dpipe header array
  *
  *	Register the headers supported by hardware.
+ *
+ *	Context: Takes and release devlink->lock <mutex>.
  */
 void devlink_dpipe_headers_register(struct devlink *devlink,
 				    struct devlink_dpipe_headers *dpipe_headers)
 {
 	devl_lock(devlink);
-	devlink->dpipe_headers = dpipe_headers;
+	devl_dpipe_headers_register(devlink, dpipe_headers);
 	devl_unlock(devlink);
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register);
 
+/**
+ * devl_dpipe_headers_unregister - unregister dpipe headers
+ *
+ * @devlink: devlink
+ *
+ * Unregister the headers supported by hardware.
+ */
+void devl_dpipe_headers_unregister(struct devlink *devlink)
+{
+	lockdep_assert_held(&devlink->lock);
+
+	devlink->dpipe_headers = NULL;
+}
+EXPORT_SYMBOL_GPL(devl_dpipe_headers_unregister);
+
 /**
  *	devlink_dpipe_headers_unregister - unregister dpipe headers
  *
  *	@devlink: devlink
  *
  *	Unregister the headers supported by hardware.
+ *
+ *	Context: Takes and release devlink->lock <mutex>.
  */
 void devlink_dpipe_headers_unregister(struct devlink *devlink)
 {
 	devl_lock(devlink);
-	devlink->dpipe_headers = NULL;
+	devl_dpipe_headers_unregister(devlink);
 	devl_unlock(devlink);
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_headers_unregister);
@@ -10502,38 +10538,33 @@ bool devlink_dpipe_table_counter_enabled(struct devlink *devlink,
 EXPORT_SYMBOL_GPL(devlink_dpipe_table_counter_enabled);
 
 /**
- *	devlink_dpipe_table_register - register dpipe table
+ * devl_dpipe_table_register - register dpipe table
  *
- *	@devlink: devlink
- *	@table_name: table name
- *	@table_ops: table ops
- *	@priv: priv
- *	@counter_control_extern: external control for counters
+ * @devlink: devlink
+ * @table_name: table name
+ * @table_ops: table ops
+ * @priv: priv
+ * @counter_control_extern: external control for counters
  */
-int devlink_dpipe_table_register(struct devlink *devlink,
-				 const char *table_name,
-				 struct devlink_dpipe_table_ops *table_ops,
-				 void *priv, bool counter_control_extern)
+int devl_dpipe_table_register(struct devlink *devlink,
+			      const char *table_name,
+			      struct devlink_dpipe_table_ops *table_ops,
+			      void *priv, bool counter_control_extern)
 {
 	struct devlink_dpipe_table *table;
-	int err = 0;
+
+	lockdep_assert_held(&devlink->lock);
 
 	if (WARN_ON(!table_ops->size_get))
 		return -EINVAL;
 
-	devl_lock(devlink);
-
 	if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name,
-				     devlink)) {
-		err = -EEXIST;
-		goto unlock;
-	}
+				     devlink))
+		return -EEXIST;
 
 	table = kzalloc(sizeof(*table), GFP_KERNEL);
-	if (!table) {
-		err = -ENOMEM;
-		goto unlock;
-	}
+	if (!table)
+		return -ENOMEM;
 
 	table->name = table_name;
 	table->table_ops = table_ops;
@@ -10541,33 +10572,72 @@ int devlink_dpipe_table_register(struct devlink *devlink,
 	table->counter_control_extern = counter_control_extern;
 
 	list_add_tail_rcu(&table->list, &devlink->dpipe_table_list);
-unlock:
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devl_dpipe_table_register);
+
+/**
+ *	devlink_dpipe_table_register - register dpipe table
+ *
+ *	@devlink: devlink
+ *	@table_name: table name
+ *	@table_ops: table ops
+ *	@priv: priv
+ *	@counter_control_extern: external control for counters
+ *
+ *	Context: Takes and release devlink->lock <mutex>.
+ */
+int devlink_dpipe_table_register(struct devlink *devlink,
+				 const char *table_name,
+				 struct devlink_dpipe_table_ops *table_ops,
+				 void *priv, bool counter_control_extern)
+{
+	int err;
+
+	devl_lock(devlink);
+	err = devl_dpipe_table_register(devlink, table_name, table_ops, priv,
+					counter_control_extern);
 	devl_unlock(devlink);
 	return err;
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_table_register);
 
 /**
- *	devlink_dpipe_table_unregister - unregister dpipe table
+ * devl_dpipe_table_unregister - unregister dpipe table
  *
- *	@devlink: devlink
- *	@table_name: table name
+ * @devlink: devlink
+ * @table_name: table name
  */
-void devlink_dpipe_table_unregister(struct devlink *devlink,
-				    const char *table_name)
+void devl_dpipe_table_unregister(struct devlink *devlink,
+				 const char *table_name)
 {
 	struct devlink_dpipe_table *table;
 
-	devl_lock(devlink);
+	lockdep_assert_held(&devlink->lock);
+
 	table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
 					 table_name, devlink);
 	if (!table)
-		goto unlock;
+		return;
 	list_del_rcu(&table->list);
-	devl_unlock(devlink);
 	kfree_rcu(table, rcu);
-	return;
-unlock:
+}
+EXPORT_SYMBOL_GPL(devl_dpipe_table_unregister);
+
+/**
+ *	devlink_dpipe_table_unregister - unregister dpipe table
+ *
+ *	@devlink: devlink
+ *	@table_name: table name
+ *
+ *	Context: Takes and release devlink->lock <mutex>.
+ */
+void devlink_dpipe_table_unregister(struct devlink *devlink,
+				    const char *table_name)
+{
+	devl_lock(devlink);
+	devl_dpipe_table_unregister(devlink, table_name);
 	devl_unlock(devlink);
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister);
@@ -10766,6 +10836,32 @@ int devlink_resource_size_get(struct devlink *devlink,
 }
 EXPORT_SYMBOL_GPL(devlink_resource_size_get);
 
+/**
+ * devl_dpipe_table_resource_set - set the resource id
+ *
+ * @devlink: devlink
+ * @table_name: table name
+ * @resource_id: resource id
+ * @resource_units: number of resource's units consumed per table's entry
+ */
+int devl_dpipe_table_resource_set(struct devlink *devlink,
+				  const char *table_name, u64 resource_id,
+				  u64 resource_units)
+{
+	struct devlink_dpipe_table *table;
+
+	table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
+					 table_name, devlink);
+	if (!table)
+		return -EINVAL;
+
+	table->resource_id = resource_id;
+	table->resource_units = resource_units;
+	table->resource_valid = true;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devl_dpipe_table_resource_set);
+
 /**
  *	devlink_dpipe_table_resource_set - set the resource id
  *
@@ -10773,25 +10869,18 @@ EXPORT_SYMBOL_GPL(devlink_resource_size_get);
  *	@table_name: table name
  *	@resource_id: resource id
  *	@resource_units: number of resource's units consumed per table's entry
+ *
+ *	Context: Takes and release devlink->lock <mutex>.
  */
 int devlink_dpipe_table_resource_set(struct devlink *devlink,
 				     const char *table_name, u64 resource_id,
 				     u64 resource_units)
 {
-	struct devlink_dpipe_table *table;
-	int err = 0;
+	int err;
 
 	devl_lock(devlink);
-	table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
-					 table_name, devlink);
-	if (!table) {
-		err = -EINVAL;
-		goto out;
-	}
-	table->resource_id = resource_id;
-	table->resource_units = resource_units;
-	table->resource_valid = true;
-out:
+	err = devl_dpipe_table_resource_set(devlink, table_name,
+					    resource_id, resource_units);
 	devl_unlock(devlink);
 	return err;
 }
-- 
cgit 


From eb0e9fa2c6355e744d3ea7d07d34d89a4735320e Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Sat, 16 Jul 2022 13:02:39 +0200
Subject: net: devlink: add unlocked variants of
 devlink_region_create/destroy() functions

Add unlocked variants of devlink_region_create/destroy() functions
to be used in drivers called-in with devlink->lock held.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/devlink.h |  5 +++
 net/core/devlink.c    | 89 +++++++++++++++++++++++++++++++++++----------------
 2 files changed, 66 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 18ad88527847..391d401ddb55 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1676,6 +1676,10 @@ int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id,
 int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id,
 				       union devlink_param_value init_val);
 void devlink_param_value_changed(struct devlink *devlink, u32 param_id);
+struct devlink_region *devl_region_create(struct devlink *devlink,
+					  const struct devlink_region_ops *ops,
+					  u32 region_max_snapshots,
+					  u64 region_size);
 struct devlink_region *
 devlink_region_create(struct devlink *devlink,
 		      const struct devlink_region_ops *ops,
@@ -1684,6 +1688,7 @@ struct devlink_region *
 devlink_port_region_create(struct devlink_port *port,
 			   const struct devlink_port_region_ops *ops,
 			   u32 region_max_snapshots, u64 region_size);
+void devl_region_destroy(struct devlink_region *region);
 void devlink_region_destroy(struct devlink_region *region);
 void devlink_port_region_destroy(struct devlink_region *region);
 
diff --git a/net/core/devlink.c b/net/core/devlink.c
index b249c18a8bbc..fe9657d6162f 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -11192,36 +11192,31 @@ void devlink_param_value_changed(struct devlink *devlink, u32 param_id)
 EXPORT_SYMBOL_GPL(devlink_param_value_changed);
 
 /**
- *	devlink_region_create - create a new address region
+ * devl_region_create - create a new address region
  *
- *	@devlink: devlink
- *	@ops: region operations and name
- *	@region_max_snapshots: Maximum supported number of snapshots for region
- *	@region_size: size of region
+ * @devlink: devlink
+ * @ops: region operations and name
+ * @region_max_snapshots: Maximum supported number of snapshots for region
+ * @region_size: size of region
  */
-struct devlink_region *
-devlink_region_create(struct devlink *devlink,
-		      const struct devlink_region_ops *ops,
-		      u32 region_max_snapshots, u64 region_size)
+struct devlink_region *devl_region_create(struct devlink *devlink,
+					  const struct devlink_region_ops *ops,
+					  u32 region_max_snapshots,
+					  u64 region_size)
 {
 	struct devlink_region *region;
-	int err = 0;
+
+	devl_assert_locked(devlink);
 
 	if (WARN_ON(!ops) || WARN_ON(!ops->destructor))
 		return ERR_PTR(-EINVAL);
 
-	devl_lock(devlink);
-
-	if (devlink_region_get_by_name(devlink, ops->name)) {
-		err = -EEXIST;
-		goto unlock;
-	}
+	if (devlink_region_get_by_name(devlink, ops->name))
+		return ERR_PTR(-EEXIST);
 
 	region = kzalloc(sizeof(*region), GFP_KERNEL);
-	if (!region) {
-		err = -ENOMEM;
-		goto unlock;
-	}
+	if (!region)
+		return ERR_PTR(-ENOMEM);
 
 	region->devlink = devlink;
 	region->max_snapshots = region_max_snapshots;
@@ -11231,12 +11226,32 @@ devlink_region_create(struct devlink *devlink,
 	list_add_tail(&region->list, &devlink->region_list);
 	devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW);
 
-	devl_unlock(devlink);
 	return region;
+}
+EXPORT_SYMBOL_GPL(devl_region_create);
 
-unlock:
+/**
+ *	devlink_region_create - create a new address region
+ *
+ *	@devlink: devlink
+ *	@ops: region operations and name
+ *	@region_max_snapshots: Maximum supported number of snapshots for region
+ *	@region_size: size of region
+ *
+ *	Context: Takes and release devlink->lock <mutex>.
+ */
+struct devlink_region *
+devlink_region_create(struct devlink *devlink,
+		      const struct devlink_region_ops *ops,
+		      u32 region_max_snapshots, u64 region_size)
+{
+	struct devlink_region *region;
+
+	devl_lock(devlink);
+	region = devl_region_create(devlink, ops, region_max_snapshots,
+				    region_size);
 	devl_unlock(devlink);
-	return ERR_PTR(err);
+	return region;
 }
 EXPORT_SYMBOL_GPL(devlink_region_create);
 
@@ -11247,6 +11262,8 @@ EXPORT_SYMBOL_GPL(devlink_region_create);
  *	@ops: region operations and name
  *	@region_max_snapshots: Maximum supported number of snapshots for region
  *	@region_size: size of region
+ *
+ *	Context: Takes and release devlink->lock <mutex>.
  */
 struct devlink_region *
 devlink_port_region_create(struct devlink_port *port,
@@ -11292,16 +11309,16 @@ unlock:
 EXPORT_SYMBOL_GPL(devlink_port_region_create);
 
 /**
- *	devlink_region_destroy - destroy address region
+ * devl_region_destroy - destroy address region
  *
- *	@region: devlink region to destroy
+ * @region: devlink region to destroy
  */
-void devlink_region_destroy(struct devlink_region *region)
+void devl_region_destroy(struct devlink_region *region)
 {
 	struct devlink *devlink = region->devlink;
 	struct devlink_snapshot *snapshot, *ts;
 
-	devl_lock(devlink);
+	devl_assert_locked(devlink);
 
 	/* Free all snapshots of region */
 	list_for_each_entry_safe(snapshot, ts, &region->snapshot_list, list)
@@ -11310,9 +11327,25 @@ void devlink_region_destroy(struct devlink_region *region)
 	list_del(&region->list);
 
 	devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL);
-	devl_unlock(devlink);
 	kfree(region);
 }
+EXPORT_SYMBOL_GPL(devl_region_destroy);
+
+/**
+ *	devlink_region_destroy - destroy address region
+ *
+ *	@region: devlink region to destroy
+ *
+ *	Context: Takes and release devlink->lock <mutex>.
+ */
+void devlink_region_destroy(struct devlink_region *region)
+{
+	struct devlink *devlink = region->devlink;
+
+	devl_lock(devlink);
+	devl_region_destroy(region);
+	devl_unlock(devlink);
+}
 EXPORT_SYMBOL_GPL(devlink_region_destroy);
 
 /**
-- 
cgit 


From 012ec02ae4410207f796a9b280a60b80b6cc790a Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Sat, 16 Jul 2022 13:02:40 +0200
Subject: netdevsim: convert driver to use unlocked devlink API during
 init/fini

Prepare for devlink reload being called with devlink->lock held and
convert the netdevsim driver to use unlocked devlink API during init and
fini flows. Take devl_lock() in reload_down() and reload_up() ops in the
meantime before reload cmd is converted to take the lock itself.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/netdevsim/bus.c       |  19 ------
 drivers/net/netdevsim/dev.c       | 134 ++++++++++++++++++--------------------
 drivers/net/netdevsim/fib.c       |  62 +++++++++---------
 drivers/net/netdevsim/netdevsim.h |   3 -
 include/net/devlink.h             |   1 +
 net/core/devlink.c                |   6 ++
 6 files changed, 102 insertions(+), 123 deletions(-)

(limited to 'include')

diff --git a/drivers/net/netdevsim/bus.c b/drivers/net/netdevsim/bus.c
index 25cb2e600d53..b5f4df1a07a3 100644
--- a/drivers/net/netdevsim/bus.c
+++ b/drivers/net/netdevsim/bus.c
@@ -72,16 +72,7 @@ new_port_store(struct device *dev, struct device_attribute *attr,
 	if (ret)
 		return ret;
 
-	if (!mutex_trylock(&nsim_bus_dev->nsim_bus_reload_lock))
-		return -EBUSY;
-
-	if (nsim_bus_dev->in_reload) {
-		mutex_unlock(&nsim_bus_dev->nsim_bus_reload_lock);
-		return -EBUSY;
-	}
-
 	ret = nsim_drv_port_add(nsim_bus_dev, NSIM_DEV_PORT_TYPE_PF, port_index);
-	mutex_unlock(&nsim_bus_dev->nsim_bus_reload_lock);
 	return ret ? ret : count;
 }
 
@@ -102,16 +93,7 @@ del_port_store(struct device *dev, struct device_attribute *attr,
 	if (ret)
 		return ret;
 
-	if (!mutex_trylock(&nsim_bus_dev->nsim_bus_reload_lock))
-		return -EBUSY;
-
-	if (nsim_bus_dev->in_reload) {
-		mutex_unlock(&nsim_bus_dev->nsim_bus_reload_lock);
-		return -EBUSY;
-	}
-
 	ret = nsim_drv_port_del(nsim_bus_dev, NSIM_DEV_PORT_TYPE_PF, port_index);
-	mutex_unlock(&nsim_bus_dev->nsim_bus_reload_lock);
 	return ret ? ret : count;
 }
 
@@ -298,7 +280,6 @@ nsim_bus_dev_new(unsigned int id, unsigned int port_count, unsigned int num_queu
 	nsim_bus_dev->num_queues = num_queues;
 	nsim_bus_dev->initial_net = current->nsproxy->net_ns;
 	nsim_bus_dev->max_vfs = NSIM_BUS_DEV_MAX_VFS;
-	mutex_init(&nsim_bus_dev->nsim_bus_reload_lock);
 	/* Disallow using nsim_bus_dev */
 	smp_store_release(&nsim_bus_dev->init, false);
 
diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c
index 57a3ac893792..5802e80e8fe1 100644
--- a/drivers/net/netdevsim/dev.c
+++ b/drivers/net/netdevsim/dev.c
@@ -436,62 +436,62 @@ static int nsim_dev_resources_register(struct devlink *devlink)
 	int err;
 
 	/* Resources for IPv4 */
-	err = devlink_resource_register(devlink, "IPv4", (u64)-1,
-					NSIM_RESOURCE_IPV4,
-					DEVLINK_RESOURCE_ID_PARENT_TOP,
-					&params);
+	err = devl_resource_register(devlink, "IPv4", (u64)-1,
+				     NSIM_RESOURCE_IPV4,
+				     DEVLINK_RESOURCE_ID_PARENT_TOP,
+				     &params);
 	if (err) {
 		pr_err("Failed to register IPv4 top resource\n");
 		goto out;
 	}
 
-	err = devlink_resource_register(devlink, "fib", (u64)-1,
-					NSIM_RESOURCE_IPV4_FIB,
-					NSIM_RESOURCE_IPV4, &params);
+	err = devl_resource_register(devlink, "fib", (u64)-1,
+				     NSIM_RESOURCE_IPV4_FIB,
+				     NSIM_RESOURCE_IPV4, &params);
 	if (err) {
 		pr_err("Failed to register IPv4 FIB resource\n");
 		return err;
 	}
 
-	err = devlink_resource_register(devlink, "fib-rules", (u64)-1,
-					NSIM_RESOURCE_IPV4_FIB_RULES,
-					NSIM_RESOURCE_IPV4, &params);
+	err = devl_resource_register(devlink, "fib-rules", (u64)-1,
+				     NSIM_RESOURCE_IPV4_FIB_RULES,
+				     NSIM_RESOURCE_IPV4, &params);
 	if (err) {
 		pr_err("Failed to register IPv4 FIB rules resource\n");
 		return err;
 	}
 
 	/* Resources for IPv6 */
-	err = devlink_resource_register(devlink, "IPv6", (u64)-1,
-					NSIM_RESOURCE_IPV6,
-					DEVLINK_RESOURCE_ID_PARENT_TOP,
-					&params);
+	err = devl_resource_register(devlink, "IPv6", (u64)-1,
+				     NSIM_RESOURCE_IPV6,
+				     DEVLINK_RESOURCE_ID_PARENT_TOP,
+				     &params);
 	if (err) {
 		pr_err("Failed to register IPv6 top resource\n");
 		goto out;
 	}
 
-	err = devlink_resource_register(devlink, "fib", (u64)-1,
-					NSIM_RESOURCE_IPV6_FIB,
-					NSIM_RESOURCE_IPV6, &params);
+	err = devl_resource_register(devlink, "fib", (u64)-1,
+				     NSIM_RESOURCE_IPV6_FIB,
+				     NSIM_RESOURCE_IPV6, &params);
 	if (err) {
 		pr_err("Failed to register IPv6 FIB resource\n");
 		return err;
 	}
 
-	err = devlink_resource_register(devlink, "fib-rules", (u64)-1,
-					NSIM_RESOURCE_IPV6_FIB_RULES,
-					NSIM_RESOURCE_IPV6, &params);
+	err = devl_resource_register(devlink, "fib-rules", (u64)-1,
+				     NSIM_RESOURCE_IPV6_FIB_RULES,
+				     NSIM_RESOURCE_IPV6, &params);
 	if (err) {
 		pr_err("Failed to register IPv6 FIB rules resource\n");
 		return err;
 	}
 
 	/* Resources for nexthops */
-	err = devlink_resource_register(devlink, "nexthops", (u64)-1,
-					NSIM_RESOURCE_NEXTHOPS,
-					DEVLINK_RESOURCE_ID_PARENT_TOP,
-					&params);
+	err = devl_resource_register(devlink, "nexthops", (u64)-1,
+				     NSIM_RESOURCE_NEXTHOPS,
+				     DEVLINK_RESOURCE_ID_PARENT_TOP,
+				     &params);
 
 out:
 	return err;
@@ -557,15 +557,15 @@ static int nsim_dev_dummy_region_init(struct nsim_dev *nsim_dev,
 				      struct devlink *devlink)
 {
 	nsim_dev->dummy_region =
-		devlink_region_create(devlink, &dummy_region_ops,
-				      NSIM_DEV_DUMMY_REGION_SNAPSHOT_MAX,
-				      NSIM_DEV_DUMMY_REGION_SIZE);
+		devl_region_create(devlink, &dummy_region_ops,
+				   NSIM_DEV_DUMMY_REGION_SNAPSHOT_MAX,
+				   NSIM_DEV_DUMMY_REGION_SIZE);
 	return PTR_ERR_OR_ZERO(nsim_dev->dummy_region);
 }
 
 static void nsim_dev_dummy_region_exit(struct nsim_dev *nsim_dev)
 {
-	devlink_region_destroy(nsim_dev->dummy_region);
+	devl_region_destroy(nsim_dev->dummy_region);
 }
 
 static int
@@ -832,7 +832,11 @@ static void nsim_dev_trap_report_work(struct work_struct *work)
 	/* For each running port and enabled packet trap, generate a UDP
 	 * packet with a random 5-tuple and report it.
 	 */
-	devl_lock(priv_to_devlink(nsim_dev));
+	if (!devl_trylock(priv_to_devlink(nsim_dev))) {
+		schedule_delayed_work(&nsim_dev->trap_data->trap_report_dw, 0);
+		return;
+	}
+
 	list_for_each_entry(nsim_dev_port, &nsim_dev->port_list, list) {
 		if (!netif_running(nsim_dev_port->ns->netdev))
 			continue;
@@ -880,18 +884,18 @@ static int nsim_dev_traps_init(struct devlink *devlink)
 	nsim_trap_data->nsim_dev = nsim_dev;
 	nsim_dev->trap_data = nsim_trap_data;
 
-	err = devlink_trap_policers_register(devlink, nsim_trap_policers_arr,
-					     policers_count);
+	err = devl_trap_policers_register(devlink, nsim_trap_policers_arr,
+					  policers_count);
 	if (err)
 		goto err_trap_policers_cnt_free;
 
-	err = devlink_trap_groups_register(devlink, nsim_trap_groups_arr,
-					   ARRAY_SIZE(nsim_trap_groups_arr));
+	err = devl_trap_groups_register(devlink, nsim_trap_groups_arr,
+					ARRAY_SIZE(nsim_trap_groups_arr));
 	if (err)
 		goto err_trap_policers_unregister;
 
-	err = devlink_traps_register(devlink, nsim_traps_arr,
-				     ARRAY_SIZE(nsim_traps_arr), NULL);
+	err = devl_traps_register(devlink, nsim_traps_arr,
+				  ARRAY_SIZE(nsim_traps_arr), NULL);
 	if (err)
 		goto err_trap_groups_unregister;
 
@@ -903,11 +907,11 @@ static int nsim_dev_traps_init(struct devlink *devlink)
 	return 0;
 
 err_trap_groups_unregister:
-	devlink_trap_groups_unregister(devlink, nsim_trap_groups_arr,
-				       ARRAY_SIZE(nsim_trap_groups_arr));
+	devl_trap_groups_unregister(devlink, nsim_trap_groups_arr,
+				    ARRAY_SIZE(nsim_trap_groups_arr));
 err_trap_policers_unregister:
-	devlink_trap_policers_unregister(devlink, nsim_trap_policers_arr,
-					 ARRAY_SIZE(nsim_trap_policers_arr));
+	devl_trap_policers_unregister(devlink, nsim_trap_policers_arr,
+				      ARRAY_SIZE(nsim_trap_policers_arr));
 err_trap_policers_cnt_free:
 	kfree(nsim_trap_data->trap_policers_cnt_arr);
 err_trap_items_free:
@@ -923,12 +927,12 @@ static void nsim_dev_traps_exit(struct devlink *devlink)
 
 	/* caution, trap work takes devlink lock */
 	cancel_delayed_work_sync(&nsim_dev->trap_data->trap_report_dw);
-	devlink_traps_unregister(devlink, nsim_traps_arr,
-				 ARRAY_SIZE(nsim_traps_arr));
-	devlink_trap_groups_unregister(devlink, nsim_trap_groups_arr,
-				       ARRAY_SIZE(nsim_trap_groups_arr));
-	devlink_trap_policers_unregister(devlink, nsim_trap_policers_arr,
-					 ARRAY_SIZE(nsim_trap_policers_arr));
+	devl_traps_unregister(devlink, nsim_traps_arr,
+			      ARRAY_SIZE(nsim_traps_arr));
+	devl_trap_groups_unregister(devlink, nsim_trap_groups_arr,
+				    ARRAY_SIZE(nsim_trap_groups_arr));
+	devl_trap_policers_unregister(devlink, nsim_trap_policers_arr,
+				      ARRAY_SIZE(nsim_trap_policers_arr));
 	kfree(nsim_dev->trap_data->trap_policers_cnt_arr);
 	kfree(nsim_dev->trap_data->trap_items_arr);
 	kfree(nsim_dev->trap_data);
@@ -943,24 +947,19 @@ static int nsim_dev_reload_down(struct devlink *devlink, bool netns_change,
 				struct netlink_ext_ack *extack)
 {
 	struct nsim_dev *nsim_dev = devlink_priv(devlink);
-	struct nsim_bus_dev *nsim_bus_dev;
-
-	nsim_bus_dev = nsim_dev->nsim_bus_dev;
-	if (!mutex_trylock(&nsim_bus_dev->nsim_bus_reload_lock))
-		return -EOPNOTSUPP;
 
+	devl_lock(devlink);
 	if (nsim_dev->dont_allow_reload) {
 		/* For testing purposes, user set debugfs dont_allow_reload
 		 * value to true. So forbid it.
 		 */
 		NL_SET_ERR_MSG_MOD(extack, "User forbid the reload for testing purposes");
-		mutex_unlock(&nsim_bus_dev->nsim_bus_reload_lock);
+		devl_unlock(devlink);
 		return -EOPNOTSUPP;
 	}
-	nsim_bus_dev->in_reload = true;
 
 	nsim_dev_reload_destroy(nsim_dev);
-	mutex_unlock(&nsim_bus_dev->nsim_bus_reload_lock);
+	devl_unlock(devlink);
 	return 0;
 }
 
@@ -969,25 +968,21 @@ static int nsim_dev_reload_up(struct devlink *devlink, enum devlink_reload_actio
 			      struct netlink_ext_ack *extack)
 {
 	struct nsim_dev *nsim_dev = devlink_priv(devlink);
-	struct nsim_bus_dev *nsim_bus_dev;
 	int ret;
 
-	nsim_bus_dev = nsim_dev->nsim_bus_dev;
-	mutex_lock(&nsim_bus_dev->nsim_bus_reload_lock);
-	nsim_bus_dev->in_reload = false;
-
+	devl_lock(devlink);
 	if (nsim_dev->fail_reload) {
 		/* For testing purposes, user set debugfs fail_reload
 		 * value to true. Fail right away.
 		 */
 		NL_SET_ERR_MSG_MOD(extack, "User setup the reload to fail for testing purposes");
-		mutex_unlock(&nsim_bus_dev->nsim_bus_reload_lock);
+		devl_unlock(devlink);
 		return -EINVAL;
 	}
 
 	*actions_performed = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT);
 	ret = nsim_dev_reload_create(nsim_dev, extack);
-	mutex_unlock(&nsim_bus_dev->nsim_bus_reload_lock);
+	devl_unlock(devlink);
 	return ret;
 }
 
@@ -1434,11 +1429,9 @@ static void nsim_dev_port_del_all(struct nsim_dev *nsim_dev)
 {
 	struct nsim_dev_port *nsim_dev_port, *tmp;
 
-	devl_lock(priv_to_devlink(nsim_dev));
 	list_for_each_entry_safe(nsim_dev_port, tmp,
 				 &nsim_dev->port_list, list)
 		__nsim_dev_port_del(nsim_dev_port);
-	devl_unlock(priv_to_devlink(nsim_dev));
 }
 
 static int nsim_dev_port_add_all(struct nsim_dev *nsim_dev,
@@ -1447,9 +1440,7 @@ static int nsim_dev_port_add_all(struct nsim_dev *nsim_dev,
 	int i, err;
 
 	for (i = 0; i < port_count; i++) {
-		devl_lock(priv_to_devlink(nsim_dev));
 		err = __nsim_dev_port_add(nsim_dev, NSIM_DEV_PORT_TYPE_PF, i);
-		devl_unlock(priv_to_devlink(nsim_dev));
 		if (err)
 			goto err_port_del_all;
 	}
@@ -1537,6 +1528,7 @@ int nsim_drv_probe(struct nsim_bus_dev *nsim_bus_dev)
 				 nsim_bus_dev->initial_net, &nsim_bus_dev->dev);
 	if (!devlink)
 		return -ENOMEM;
+	devl_lock(devlink);
 	nsim_dev = devlink_priv(devlink);
 	nsim_dev->nsim_bus_dev = nsim_bus_dev;
 	nsim_dev->switch_id.id_len = sizeof(nsim_dev->switch_id.id);
@@ -1555,7 +1547,7 @@ int nsim_drv_probe(struct nsim_bus_dev *nsim_bus_dev)
 				      GFP_KERNEL | __GFP_NOWARN);
 	if (!nsim_dev->vfconfigs) {
 		err = -ENOMEM;
-		goto err_devlink_free;
+		goto err_devlink_unlock;
 	}
 
 	err = nsim_dev_resources_register(devlink);
@@ -1608,6 +1600,7 @@ int nsim_drv_probe(struct nsim_bus_dev *nsim_bus_dev)
 
 	nsim_dev->esw_mode = DEVLINK_ESWITCH_MODE_LEGACY;
 	devlink_set_features(devlink, DEVLINK_F_RELOAD);
+	devl_unlock(devlink);
 	devlink_register(devlink);
 	return 0;
 
@@ -1631,10 +1624,11 @@ err_params_unregister:
 	devlink_params_unregister(devlink, nsim_devlink_params,
 				  ARRAY_SIZE(nsim_devlink_params));
 err_dl_unregister:
-	devlink_resources_unregister(devlink);
+	devl_resources_unregister(devlink);
 err_vfc_free:
 	kfree(nsim_dev->vfconfigs);
-err_devlink_free:
+err_devlink_unlock:
+	devl_unlock(devlink);
 	devlink_free(devlink);
 	dev_set_drvdata(&nsim_bus_dev->dev, NULL);
 	return err;
@@ -1648,13 +1642,11 @@ static void nsim_dev_reload_destroy(struct nsim_dev *nsim_dev)
 		return;
 	debugfs_remove(nsim_dev->take_snapshot);
 
-	devl_lock(devlink);
 	if (nsim_dev_get_vfs(nsim_dev)) {
 		nsim_bus_dev_set_vfs(nsim_dev->nsim_bus_dev, 0);
 		if (nsim_esw_mode_is_switchdev(nsim_dev))
 			nsim_esw_legacy_enable(nsim_dev, NULL);
 	}
-	devl_unlock(devlink);
 
 	nsim_dev_port_del_all(nsim_dev);
 	nsim_dev_hwstats_exit(nsim_dev);
@@ -1671,14 +1663,16 @@ void nsim_drv_remove(struct nsim_bus_dev *nsim_bus_dev)
 	struct devlink *devlink = priv_to_devlink(nsim_dev);
 
 	devlink_unregister(devlink);
+	devl_lock(devlink);
 	nsim_dev_reload_destroy(nsim_dev);
 
 	nsim_bpf_dev_exit(nsim_dev);
 	nsim_dev_debugfs_exit(nsim_dev);
 	devlink_params_unregister(devlink, nsim_devlink_params,
 				  ARRAY_SIZE(nsim_devlink_params));
-	devlink_resources_unregister(devlink);
+	devl_resources_unregister(devlink);
 	kfree(nsim_dev->vfconfigs);
+	devl_unlock(devlink);
 	devlink_free(devlink);
 	dev_set_drvdata(&nsim_bus_dev->dev, NULL);
 }
diff --git a/drivers/net/netdevsim/fib.c b/drivers/net/netdevsim/fib.c
index c8f398f5bc5b..94e7512bef94 100644
--- a/drivers/net/netdevsim/fib.c
+++ b/drivers/net/netdevsim/fib.c
@@ -1453,7 +1453,7 @@ static void nsim_fib_set_max_all(struct nsim_fib_data *data,
 		int err;
 		u64 val;
 
-		err = devlink_resource_size_get(devlink, res_ids[i], &val);
+		err = devl_resource_size_get(devlink, res_ids[i], &val);
 		if (err)
 			val = (u64) -1;
 		nsim_fib_set_max(data, res_ids[i], val);
@@ -1562,26 +1562,26 @@ struct nsim_fib_data *nsim_fib_create(struct devlink *devlink,
 		goto err_nexthop_nb_unregister;
 	}
 
-	devlink_resource_occ_get_register(devlink,
-					  NSIM_RESOURCE_IPV4_FIB,
-					  nsim_fib_ipv4_resource_occ_get,
-					  data);
-	devlink_resource_occ_get_register(devlink,
-					  NSIM_RESOURCE_IPV4_FIB_RULES,
-					  nsim_fib_ipv4_rules_res_occ_get,
-					  data);
-	devlink_resource_occ_get_register(devlink,
-					  NSIM_RESOURCE_IPV6_FIB,
-					  nsim_fib_ipv6_resource_occ_get,
-					  data);
-	devlink_resource_occ_get_register(devlink,
-					  NSIM_RESOURCE_IPV6_FIB_RULES,
-					  nsim_fib_ipv6_rules_res_occ_get,
-					  data);
-	devlink_resource_occ_get_register(devlink,
-					  NSIM_RESOURCE_NEXTHOPS,
-					  nsim_fib_nexthops_res_occ_get,
-					  data);
+	devl_resource_occ_get_register(devlink,
+				       NSIM_RESOURCE_IPV4_FIB,
+				       nsim_fib_ipv4_resource_occ_get,
+				       data);
+	devl_resource_occ_get_register(devlink,
+				       NSIM_RESOURCE_IPV4_FIB_RULES,
+				       nsim_fib_ipv4_rules_res_occ_get,
+				       data);
+	devl_resource_occ_get_register(devlink,
+				       NSIM_RESOURCE_IPV6_FIB,
+				       nsim_fib_ipv6_resource_occ_get,
+				       data);
+	devl_resource_occ_get_register(devlink,
+				       NSIM_RESOURCE_IPV6_FIB_RULES,
+				       nsim_fib_ipv6_rules_res_occ_get,
+				       data);
+	devl_resource_occ_get_register(devlink,
+				       NSIM_RESOURCE_NEXTHOPS,
+				       nsim_fib_nexthops_res_occ_get,
+				       data);
 	return data;
 
 err_nexthop_nb_unregister:
@@ -1604,16 +1604,16 @@ err_data_free:
 
 void nsim_fib_destroy(struct devlink *devlink, struct nsim_fib_data *data)
 {
-	devlink_resource_occ_get_unregister(devlink,
-					    NSIM_RESOURCE_NEXTHOPS);
-	devlink_resource_occ_get_unregister(devlink,
-					    NSIM_RESOURCE_IPV6_FIB_RULES);
-	devlink_resource_occ_get_unregister(devlink,
-					    NSIM_RESOURCE_IPV6_FIB);
-	devlink_resource_occ_get_unregister(devlink,
-					    NSIM_RESOURCE_IPV4_FIB_RULES);
-	devlink_resource_occ_get_unregister(devlink,
-					    NSIM_RESOURCE_IPV4_FIB);
+	devl_resource_occ_get_unregister(devlink,
+					 NSIM_RESOURCE_NEXTHOPS);
+	devl_resource_occ_get_unregister(devlink,
+					 NSIM_RESOURCE_IPV6_FIB_RULES);
+	devl_resource_occ_get_unregister(devlink,
+					 NSIM_RESOURCE_IPV6_FIB);
+	devl_resource_occ_get_unregister(devlink,
+					 NSIM_RESOURCE_IPV4_FIB_RULES);
+	devl_resource_occ_get_unregister(devlink,
+					 NSIM_RESOURCE_IPV4_FIB);
 	unregister_fib_notifier(devlink_net(devlink), &data->fib_nb);
 	unregister_nexthop_notifier(devlink_net(devlink), &data->nexthop_nb);
 	flush_work(&data->fib_event_work);
diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h
index 0b122872b2c9..7d8ed8d8df5c 100644
--- a/drivers/net/netdevsim/netdevsim.h
+++ b/drivers/net/netdevsim/netdevsim.h
@@ -376,9 +376,6 @@ struct nsim_bus_dev {
 				  */
 	unsigned int max_vfs;
 	unsigned int num_vfs;
-	/* Lock for devlink->reload_enabled in netdevsim module */
-	struct mutex nsim_bus_reload_lock;
-	bool in_reload;
 	bool init;
 };
 
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 391d401ddb55..242798967a44 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1517,6 +1517,7 @@ struct device *devlink_to_dev(const struct devlink *devlink);
 
 /* Devlink instance explicit locking */
 void devl_lock(struct devlink *devlink);
+int devl_trylock(struct devlink *devlink);
 void devl_unlock(struct devlink *devlink);
 void devl_assert_locked(struct devlink *devlink);
 bool devl_lock_is_held(struct devlink *devlink);
diff --git a/net/core/devlink.c b/net/core/devlink.c
index fe9657d6162f..1c75de6f6388 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -266,6 +266,12 @@ void devl_lock(struct devlink *devlink)
 }
 EXPORT_SYMBOL_GPL(devl_lock);
 
+int devl_trylock(struct devlink *devlink)
+{
+	return mutex_trylock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devl_trylock);
+
 void devl_unlock(struct devlink *devlink)
 {
 	mutex_unlock(&devlink->lock);
-- 
cgit 


From f655dacb59ac987a56b82d2e73d85de411eb02aa Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Sat, 16 Jul 2022 13:02:41 +0200
Subject: net: devlink: remove unused locked functions

Remove locked versions of functions that are no longer used by anyone.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/devlink.h |  20 ------
 net/core/devlink.c    | 168 --------------------------------------------------
 2 files changed, 188 deletions(-)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 242798967a44..780744b550b8 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1594,20 +1594,11 @@ int devl_dpipe_table_register(struct devlink *devlink,
 			      const char *table_name,
 			      struct devlink_dpipe_table_ops *table_ops,
 			      void *priv, bool counter_control_extern);
-int devlink_dpipe_table_register(struct devlink *devlink,
-				 const char *table_name,
-				 struct devlink_dpipe_table_ops *table_ops,
-				 void *priv, bool counter_control_extern);
 void devl_dpipe_table_unregister(struct devlink *devlink,
 				 const char *table_name);
-void devlink_dpipe_table_unregister(struct devlink *devlink,
-				    const char *table_name);
 void devl_dpipe_headers_register(struct devlink *devlink,
 				 struct devlink_dpipe_headers *dpipe_headers);
-void devlink_dpipe_headers_register(struct devlink *devlink,
-				   struct devlink_dpipe_headers *dpipe_headers);
 void devl_dpipe_headers_unregister(struct devlink *devlink);
-void devlink_dpipe_headers_unregister(struct devlink *devlink);
 bool devlink_dpipe_table_counter_enabled(struct devlink *devlink,
 					 const char *table_name);
 int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx);
@@ -1640,9 +1631,6 @@ void devlink_resources_unregister(struct devlink *devlink);
 int devl_resource_size_get(struct devlink *devlink,
 			   u64 resource_id,
 			   u64 *p_resource_size);
-int devlink_resource_size_get(struct devlink *devlink,
-			      u64 resource_id,
-			      u64 *p_resource_size);
 int devl_dpipe_table_resource_set(struct devlink *devlink,
 				  const char *table_name, u64 resource_id,
 				  u64 resource_units);
@@ -1817,18 +1805,10 @@ int
 devl_trap_policers_register(struct devlink *devlink,
 			    const struct devlink_trap_policer *policers,
 			    size_t policers_count);
-int
-devlink_trap_policers_register(struct devlink *devlink,
-			       const struct devlink_trap_policer *policers,
-			       size_t policers_count);
 void
 devl_trap_policers_unregister(struct devlink *devlink,
 			      const struct devlink_trap_policer *policers,
 			      size_t policers_count);
-void
-devlink_trap_policers_unregister(struct devlink *devlink,
-				 const struct devlink_trap_policer *policers,
-				 size_t policers_count);
 
 #if IS_ENABLED(CONFIG_NET_DEVLINK)
 
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 1c75de6f6388..98d79feeb3dc 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -10461,25 +10461,6 @@ void devl_dpipe_headers_register(struct devlink *devlink,
 }
 EXPORT_SYMBOL_GPL(devl_dpipe_headers_register);
 
-/**
- *	devlink_dpipe_headers_register - register dpipe headers
- *
- *	@devlink: devlink
- *	@dpipe_headers: dpipe header array
- *
- *	Register the headers supported by hardware.
- *
- *	Context: Takes and release devlink->lock <mutex>.
- */
-void devlink_dpipe_headers_register(struct devlink *devlink,
-				    struct devlink_dpipe_headers *dpipe_headers)
-{
-	devl_lock(devlink);
-	devl_dpipe_headers_register(devlink, dpipe_headers);
-	devl_unlock(devlink);
-}
-EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register);
-
 /**
  * devl_dpipe_headers_unregister - unregister dpipe headers
  *
@@ -10495,23 +10476,6 @@ void devl_dpipe_headers_unregister(struct devlink *devlink)
 }
 EXPORT_SYMBOL_GPL(devl_dpipe_headers_unregister);
 
-/**
- *	devlink_dpipe_headers_unregister - unregister dpipe headers
- *
- *	@devlink: devlink
- *
- *	Unregister the headers supported by hardware.
- *
- *	Context: Takes and release devlink->lock <mutex>.
- */
-void devlink_dpipe_headers_unregister(struct devlink *devlink)
-{
-	devl_lock(devlink);
-	devl_dpipe_headers_unregister(devlink);
-	devl_unlock(devlink);
-}
-EXPORT_SYMBOL_GPL(devlink_dpipe_headers_unregister);
-
 /**
  *	devlink_dpipe_table_counter_enabled - check if counter allocation
  *					      required
@@ -10583,32 +10547,6 @@ int devl_dpipe_table_register(struct devlink *devlink,
 }
 EXPORT_SYMBOL_GPL(devl_dpipe_table_register);
 
-/**
- *	devlink_dpipe_table_register - register dpipe table
- *
- *	@devlink: devlink
- *	@table_name: table name
- *	@table_ops: table ops
- *	@priv: priv
- *	@counter_control_extern: external control for counters
- *
- *	Context: Takes and release devlink->lock <mutex>.
- */
-int devlink_dpipe_table_register(struct devlink *devlink,
-				 const char *table_name,
-				 struct devlink_dpipe_table_ops *table_ops,
-				 void *priv, bool counter_control_extern)
-{
-	int err;
-
-	devl_lock(devlink);
-	err = devl_dpipe_table_register(devlink, table_name, table_ops, priv,
-					counter_control_extern);
-	devl_unlock(devlink);
-	return err;
-}
-EXPORT_SYMBOL_GPL(devlink_dpipe_table_register);
-
 /**
  * devl_dpipe_table_unregister - unregister dpipe table
  *
@@ -10631,23 +10569,6 @@ void devl_dpipe_table_unregister(struct devlink *devlink,
 }
 EXPORT_SYMBOL_GPL(devl_dpipe_table_unregister);
 
-/**
- *	devlink_dpipe_table_unregister - unregister dpipe table
- *
- *	@devlink: devlink
- *	@table_name: table name
- *
- *	Context: Takes and release devlink->lock <mutex>.
- */
-void devlink_dpipe_table_unregister(struct devlink *devlink,
-				    const char *table_name)
-{
-	devl_lock(devlink);
-	devl_dpipe_table_unregister(devlink, table_name);
-	devl_unlock(devlink);
-}
-EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister);
-
 /**
  * devl_resource_register - devlink resource register
  *
@@ -10820,28 +10741,6 @@ int devl_resource_size_get(struct devlink *devlink,
 }
 EXPORT_SYMBOL_GPL(devl_resource_size_get);
 
-/**
- *	devlink_resource_size_get - get and update size
- *
- *	@devlink: devlink
- *	@resource_id: the requested resource id
- *	@p_resource_size: ptr to update
- *
- *	Context: Takes and release devlink->lock <mutex>.
- */
-int devlink_resource_size_get(struct devlink *devlink,
-			      u64 resource_id,
-			      u64 *p_resource_size)
-{
-	int err;
-
-	devl_lock(devlink);
-	err = devl_resource_size_get(devlink, resource_id, p_resource_size);
-	devl_unlock(devlink);
-	return err;
-}
-EXPORT_SYMBOL_GPL(devlink_resource_size_get);
-
 /**
  * devl_dpipe_table_resource_set - set the resource id
  *
@@ -10868,30 +10767,6 @@ int devl_dpipe_table_resource_set(struct devlink *devlink,
 }
 EXPORT_SYMBOL_GPL(devl_dpipe_table_resource_set);
 
-/**
- *	devlink_dpipe_table_resource_set - set the resource id
- *
- *	@devlink: devlink
- *	@table_name: table name
- *	@resource_id: resource id
- *	@resource_units: number of resource's units consumed per table's entry
- *
- *	Context: Takes and release devlink->lock <mutex>.
- */
-int devlink_dpipe_table_resource_set(struct devlink *devlink,
-				     const char *table_name, u64 resource_id,
-				     u64 resource_units)
-{
-	int err;
-
-	devl_lock(devlink);
-	err = devl_dpipe_table_resource_set(devlink, table_name,
-					    resource_id, resource_units);
-	devl_unlock(devlink);
-	return err;
-}
-EXPORT_SYMBOL_GPL(devlink_dpipe_table_resource_set);
-
 /**
  * devl_resource_occ_get_register - register occupancy getter
  *
@@ -12262,30 +12137,6 @@ err_trap_policer_verify:
 }
 EXPORT_SYMBOL_GPL(devl_trap_policers_register);
 
-/**
- * devlink_trap_policers_register - Register packet trap policers with devlink.
- * @devlink: devlink.
- * @policers: Packet trap policers.
- * @policers_count: Count of provided packet trap policers.
- *
- * Return: Non-zero value on failure.
- *
- * Context: Takes and release devlink->lock <mutex>.
- */
-int
-devlink_trap_policers_register(struct devlink *devlink,
-			       const struct devlink_trap_policer *policers,
-			       size_t policers_count)
-{
-	int err;
-
-	devl_lock(devlink);
-	err = devl_trap_policers_register(devlink, policers, policers_count);
-	devl_unlock(devlink);
-	return err;
-}
-EXPORT_SYMBOL_GPL(devlink_trap_policers_register);
-
 /**
  * devl_trap_policers_unregister - Unregister packet trap policers from devlink.
  * @devlink: devlink.
@@ -12305,25 +12156,6 @@ devl_trap_policers_unregister(struct devlink *devlink,
 }
 EXPORT_SYMBOL_GPL(devl_trap_policers_unregister);
 
-/**
- * devlink_trap_policers_unregister - Unregister packet trap policers from devlink.
- * @devlink: devlink.
- * @policers: Packet trap policers.
- * @policers_count: Count of provided packet trap policers.
- *
- * Context: Takes and release devlink->lock <mutex>.
- */
-void
-devlink_trap_policers_unregister(struct devlink *devlink,
-				 const struct devlink_trap_policer *policers,
-				 size_t policers_count)
-{
-	devl_lock(devlink);
-	devl_trap_policers_unregister(devlink, policers, policers_count);
-	devl_unlock(devlink);
-}
-EXPORT_SYMBOL_GPL(devlink_trap_policers_unregister);
-
 static void __devlink_compat_running_version(struct devlink *devlink,
 					     char *buf, size_t len)
 {
-- 
cgit 


From a229cc14f3395311b899e5e582b71efa8dd01df0 Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Thu, 14 Jul 2022 19:15:24 +0800
Subject: dma-mapping: add dma_opt_mapping_size()

Streaming DMA mapping involving an IOMMU may be much slower for larger
total mapping size. This is because every IOMMU DMA mapping requires an
IOVA to be allocated and freed. IOVA sizes above a certain limit are not
cached, which can have a big impact on DMA mapping performance.

Provide an API for device drivers to know this "optimal" limit, such that
they may try to produce mapping which don't exceed it.

Signed-off-by: John Garry <john.garry@huawei.com>
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 Documentation/core-api/dma-api.rst | 14 ++++++++++++++
 include/linux/dma-map-ops.h        |  1 +
 include/linux/dma-mapping.h        |  5 +++++
 kernel/dma/mapping.c               | 12 ++++++++++++
 4 files changed, 32 insertions(+)

(limited to 'include')

diff --git a/Documentation/core-api/dma-api.rst b/Documentation/core-api/dma-api.rst
index 6d6d0edd2d27..829f20a193ca 100644
--- a/Documentation/core-api/dma-api.rst
+++ b/Documentation/core-api/dma-api.rst
@@ -204,6 +204,20 @@ Returns the maximum size of a mapping for the device. The size parameter
 of the mapping functions like dma_map_single(), dma_map_page() and
 others should not be larger than the returned value.
 
+::
+
+	size_t
+	dma_opt_mapping_size(struct device *dev);
+
+Returns the maximum optimal size of a mapping for the device.
+
+Mapping larger buffers may take much longer in certain scenarios. In
+addition, for high-rate short-lived streaming mappings, the upfront time
+spent on the mapping may account for an appreciable part of the total
+request lifetime. As such, if splitting larger requests incurs no
+significant performance penalty, then device drivers are advised to
+limit total DMA streaming mappings length to the returned value.
+
 ::
 
 	bool
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 0d5b06b3a4a6..98ceba6fa848 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -69,6 +69,7 @@ struct dma_map_ops {
 	int (*dma_supported)(struct device *dev, u64 mask);
 	u64 (*get_required_mask)(struct device *dev);
 	size_t (*max_mapping_size)(struct device *dev);
+	size_t (*opt_mapping_size)(void);
 	unsigned long (*get_merge_boundary)(struct device *dev);
 };
 
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index dca2b1355bb1..fe3849434b2a 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -144,6 +144,7 @@ int dma_set_mask(struct device *dev, u64 mask);
 int dma_set_coherent_mask(struct device *dev, u64 mask);
 u64 dma_get_required_mask(struct device *dev);
 size_t dma_max_mapping_size(struct device *dev);
+size_t dma_opt_mapping_size(struct device *dev);
 bool dma_need_sync(struct device *dev, dma_addr_t dma_addr);
 unsigned long dma_get_merge_boundary(struct device *dev);
 struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
@@ -266,6 +267,10 @@ static inline size_t dma_max_mapping_size(struct device *dev)
 {
 	return 0;
 }
+static inline size_t dma_opt_mapping_size(struct device *dev)
+{
+	return 0;
+}
 static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
 {
 	return false;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index db7244291b74..1bfe11b1edb6 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -773,6 +773,18 @@ size_t dma_max_mapping_size(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(dma_max_mapping_size);
 
+size_t dma_opt_mapping_size(struct device *dev)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+	size_t size = SIZE_MAX;
+
+	if (ops && ops->opt_mapping_size)
+		size = ops->opt_mapping_size();
+
+	return min(dma_max_mapping_size(dev), size);
+}
+EXPORT_SYMBOL_GPL(dma_opt_mapping_size);
+
 bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
-- 
cgit 


From 6d9870b7e5def2450e21316515b9efc0529204dd Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Thu, 14 Jul 2022 19:15:25 +0800
Subject: dma-iommu: add iommu_dma_opt_mapping_size()

Add the IOMMU callback for DMA mapping API dma_opt_mapping_size(), which
allows the drivers to know the optimal mapping limit and thus limit the
requested IOVA lengths.

This value is based on the IOVA rcache range limit, as IOVAs allocated
above this limit must always be newly allocated, which may be quite slow.

Signed-off-by: John Garry <john.garry@huawei.com>
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Acked-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/iommu/dma-iommu.c | 6 ++++++
 drivers/iommu/iova.c      | 5 +++++
 include/linux/iova.h      | 2 ++
 3 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index f90251572a5d..9e1586447ee8 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1459,6 +1459,11 @@ static unsigned long iommu_dma_get_merge_boundary(struct device *dev)
 	return (1UL << __ffs(domain->pgsize_bitmap)) - 1;
 }
 
+static size_t iommu_dma_opt_mapping_size(void)
+{
+	return iova_rcache_range();
+}
+
 static const struct dma_map_ops iommu_dma_ops = {
 	.alloc			= iommu_dma_alloc,
 	.free			= iommu_dma_free,
@@ -1479,6 +1484,7 @@ static const struct dma_map_ops iommu_dma_ops = {
 	.map_resource		= iommu_dma_map_resource,
 	.unmap_resource		= iommu_dma_unmap_resource,
 	.get_merge_boundary	= iommu_dma_get_merge_boundary,
+	.opt_mapping_size	= iommu_dma_opt_mapping_size,
 };
 
 /*
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index db77aa675145..9f00b58d546e 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -26,6 +26,11 @@ static unsigned long iova_rcache_get(struct iova_domain *iovad,
 static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad);
 static void free_iova_rcaches(struct iova_domain *iovad);
 
+unsigned long iova_rcache_range(void)
+{
+	return PAGE_SIZE << (IOVA_RANGE_CACHE_MAX_SIZE - 1);
+}
+
 static int iova_cpuhp_dead(unsigned int cpu, struct hlist_node *node)
 {
 	struct iova_domain *iovad;
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 320a70e40233..c6ba6d95d79c 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -79,6 +79,8 @@ static inline unsigned long iova_pfn(struct iova_domain *iovad, dma_addr_t iova)
 int iova_cache_get(void);
 void iova_cache_put(void);
 
+unsigned long iova_rcache_range(void);
+
 void free_iova(struct iova_domain *iovad, unsigned long pfn);
 void __free_iova(struct iova_domain *iovad, struct iova *iova);
 struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
-- 
cgit 


From 2b038e786f8338a3bc22d791000753e0ec113e00 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 22 Jun 2022 20:28:42 +0300
Subject: gpiolib: devres: Get rid of unused devm_gpio_free()

The last user, which in fact was a dead code, has gone a year ago,
previous one 3 years ago. On top of that we want to drop away the
legacy GPIO APIs in the kernel, so take a chance to get rid of
unused devm_gpio_free() and accompanying stuff.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
---
 Documentation/driver-api/driver-model/devres.rst |  1 -
 drivers/gpio/gpiolib-devres.c                    | 32 ------------------------
 include/linux/gpio.h                             |  6 -----
 3 files changed, 39 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
index 2d39967bafcc..55272942e721 100644
--- a/Documentation/driver-api/driver-model/devres.rst
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -277,7 +277,6 @@ GPIO
   devm_gpiochip_add_data()
   devm_gpio_request()
   devm_gpio_request_one()
-  devm_gpio_free()
 
 I2C
   devm_i2c_new_dummy_device()
diff --git a/drivers/gpio/gpiolib-devres.c b/drivers/gpio/gpiolib-devres.c
index 79da85d17b71..16a696249229 100644
--- a/drivers/gpio/gpiolib-devres.c
+++ b/drivers/gpio/gpiolib-devres.c
@@ -375,9 +375,6 @@ void devm_gpiod_put_array(struct device *dev, struct gpio_descs *descs)
 }
 EXPORT_SYMBOL_GPL(devm_gpiod_put_array);
 
-
-
-
 static void devm_gpio_release(struct device *dev, void *res)
 {
 	unsigned *gpio = res;
@@ -385,13 +382,6 @@ static void devm_gpio_release(struct device *dev, void *res)
 	gpio_free(*gpio);
 }
 
-static int devm_gpio_match(struct device *dev, void *res, void *data)
-{
-	unsigned *this = res, *gpio = data;
-
-	return *this == *gpio;
-}
-
 /**
  *      devm_gpio_request - request a GPIO for a managed device
  *      @dev: device to request the GPIO for
@@ -402,11 +392,7 @@ static int devm_gpio_match(struct device *dev, void *res, void *data)
  *      same arguments and performs the same function as
  *      gpio_request().  GPIOs requested with this function will be
  *      automatically freed on driver detach.
- *
- *      If an GPIO allocated with this function needs to be freed
- *      separately, devm_gpio_free() must be used.
  */
-
 int devm_gpio_request(struct device *dev, unsigned gpio, const char *label)
 {
 	unsigned *dr;
@@ -459,24 +445,6 @@ int devm_gpio_request_one(struct device *dev, unsigned gpio,
 }
 EXPORT_SYMBOL_GPL(devm_gpio_request_one);
 
-/**
- *      devm_gpio_free - free a GPIO
- *      @dev: device to free GPIO for
- *      @gpio: GPIO to free
- *
- *      Except for the extra @dev argument, this function takes the
- *      same arguments and performs the same function as gpio_free().
- *      This function instead of gpio_free() should be used to manually
- *      free GPIOs allocated with devm_gpio_request().
- */
-void devm_gpio_free(struct device *dev, unsigned int gpio)
-{
-
-	WARN_ON(devres_release(dev, devm_gpio_release, devm_gpio_match,
-		&gpio));
-}
-EXPORT_SYMBOL_GPL(devm_gpio_free);
-
 static void devm_gpio_chip_release(void *data)
 {
 	struct gpio_chip *gc = data;
diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index 008ad3ee56b7..a370387fa406 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -95,7 +95,6 @@ struct device;
 int devm_gpio_request(struct device *dev, unsigned gpio, const char *label);
 int devm_gpio_request_one(struct device *dev, unsigned gpio,
 			  unsigned long flags, const char *label);
-void devm_gpio_free(struct device *dev, unsigned int gpio);
 
 #else /* ! CONFIG_GPIOLIB */
 
@@ -240,11 +239,6 @@ static inline int devm_gpio_request_one(struct device *dev, unsigned gpio,
 	return -EINVAL;
 }
 
-static inline void devm_gpio_free(struct device *dev, unsigned int gpio)
-{
-	WARN_ON(1);
-}
-
 #endif /* ! CONFIG_GPIOLIB */
 
 #endif /* __LINUX_GPIO_H */
-- 
cgit 


From 2a1192ff0835cb4b0ccd6f6e85c93aa0dc6f66b7 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Tue, 14 Jun 2022 17:23:38 +0200
Subject: gpio: twl4030: Drop platform teardown callback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is no machine providing a teardown callback, so drop the unused
code.

This is a preparation for making platform remove callbacks return void.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
---
 drivers/gpio/gpio-twl4030.c | 11 -----------
 include/linux/mfd/twl.h     |  2 --
 2 files changed, 13 deletions(-)

(limited to 'include')

diff --git a/drivers/gpio/gpio-twl4030.c b/drivers/gpio/gpio-twl4030.c
index de249726230e..e2cb7cb90c8c 100644
--- a/drivers/gpio/gpio-twl4030.c
+++ b/drivers/gpio/gpio-twl4030.c
@@ -593,18 +593,7 @@ out:
 /* Cannot use as gpio_twl4030_probe() calls us */
 static int gpio_twl4030_remove(struct platform_device *pdev)
 {
-	struct twl4030_gpio_platform_data *pdata = dev_get_platdata(&pdev->dev);
 	struct gpio_twl4030_priv *priv = platform_get_drvdata(pdev);
-	int status;
-
-	if (pdata && pdata->teardown) {
-		status = pdata->teardown(&pdev->dev, priv->gpio_chip.base,
-					 TWL4030_GPIO_MAX);
-		if (status) {
-			dev_dbg(&pdev->dev, "teardown --> %d\n", status);
-			return status;
-		}
-	}
 
 	gpiochip_remove(&priv->gpio_chip);
 
diff --git a/include/linux/mfd/twl.h b/include/linux/mfd/twl.h
index 8871cc5188a0..c8cd31756037 100644
--- a/include/linux/mfd/twl.h
+++ b/include/linux/mfd/twl.h
@@ -594,8 +594,6 @@ struct twl4030_gpio_platform_data {
 
 	int		(*setup)(struct device *dev,
 				unsigned gpio, unsigned ngpio);
-	int		(*teardown)(struct device *dev,
-				unsigned gpio, unsigned ngpio);
 };
 
 struct twl4030_madc_platform_data {
-- 
cgit 


From 7e55b33d3f18fde5c7a57b6c52d80499485c737f Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Tue, 14 Jun 2022 21:48:02 +0200
Subject: gpio: ucb1400: Remove platform setup and teardown support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is no user of these callbacks. The motivation for this change is
to stop returning an error code from the remove callback.

This is a preparation for making platform remove callbacks return void.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
---
 drivers/gpio/gpio-ucb1400.c | 20 --------------------
 drivers/mfd/ucb1400_core.c  |  6 ++----
 include/linux/ucb1400.h     |  2 --
 3 files changed, 2 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/drivers/gpio/gpio-ucb1400.c b/drivers/gpio/gpio-ucb1400.c
index d2a8644864c3..386e69300332 100644
--- a/drivers/gpio/gpio-ucb1400.c
+++ b/drivers/gpio/gpio-ucb1400.c
@@ -64,34 +64,14 @@ static int ucb1400_gpio_probe(struct platform_device *dev)
 	ucb->gc.can_sleep = true;
 
 	err = devm_gpiochip_add_data(&dev->dev, &ucb->gc, ucb);
-	if (err)
-		goto err;
-
-	if (ucb->gpio_setup)
-		err = ucb->gpio_setup(&dev->dev, ucb->gc.ngpio);
 
 err:
 	return err;
 
 }
 
-static int ucb1400_gpio_remove(struct platform_device *dev)
-{
-	int err = 0;
-	struct ucb1400_gpio *ucb = platform_get_drvdata(dev);
-
-	if (ucb && ucb->gpio_teardown) {
-		err = ucb->gpio_teardown(&dev->dev, ucb->gc.ngpio);
-		if (err)
-			return err;
-	}
-
-	return err;
-}
-
 static struct platform_driver ucb1400_gpio_driver = {
 	.probe	= ucb1400_gpio_probe,
-	.remove	= ucb1400_gpio_remove,
 	.driver	= {
 		.name	= "ucb1400_gpio"
 	},
diff --git a/drivers/mfd/ucb1400_core.c b/drivers/mfd/ucb1400_core.c
index 8c3832a58ef6..ac1d18039568 100644
--- a/drivers/mfd/ucb1400_core.c
+++ b/drivers/mfd/ucb1400_core.c
@@ -72,11 +72,9 @@ static int ucb1400_core_probe(struct device *dev)
 
 	/* GPIO */
 	ucb_gpio.ac97 = ac97;
-	if (pdata) {
-		ucb_gpio.gpio_setup = pdata->gpio_setup;
-		ucb_gpio.gpio_teardown = pdata->gpio_teardown;
+	if (pdata)
 		ucb_gpio.gpio_offset = pdata->gpio_offset;
-	}
+
 	ucb->ucb1400_gpio = platform_device_alloc("ucb1400_gpio", -1);
 	if (!ucb->ucb1400_gpio) {
 		err = -ENOMEM;
diff --git a/include/linux/ucb1400.h b/include/linux/ucb1400.h
index 0968ef458447..22345391350b 100644
--- a/include/linux/ucb1400.h
+++ b/include/linux/ucb1400.h
@@ -84,8 +84,6 @@ struct ucb1400_gpio {
 	struct gpio_chip	gc;
 	struct snd_ac97		*ac97;
 	int			gpio_offset;
-	int			(*gpio_setup)(struct device *dev, int ngpio);
-	int			(*gpio_teardown)(struct device *dev, int ngpio);
 };
 
 struct ucb1400_ts {
-- 
cgit 


From c269df8c5ad316eac47a9078e7a2339e1956637a Mon Sep 17 00:00:00 2001
From: Nuno Sá <nuno.sa@analog.com>
Date: Wed, 13 Jul 2022 15:14:18 +0200
Subject: gpiolib: add support for bias pull disable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change prepares the gpio core to look at firmware flags and set
'FLAG_BIAS_DISABLE' if necessary. It works in similar way to
'GPIO_PULL_DOWN' and 'GPIO_PULL_UP'.

Signed-off-by: Nuno Sá <nuno.sa@analog.com>
Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
---
 drivers/gpio/gpiolib.c       | 8 ++++++--
 include/linux/gpio/machine.h | 1 +
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 9535f48e18d1..0692ec84d3b0 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -3945,9 +3945,11 @@ int gpiod_configure_flags(struct gpio_desc *desc, const char *con_id,
 	if (lflags & GPIO_OPEN_SOURCE)
 		set_bit(FLAG_OPEN_SOURCE, &desc->flags);
 
-	if ((lflags & GPIO_PULL_UP) && (lflags & GPIO_PULL_DOWN)) {
+	if (((lflags & GPIO_PULL_UP) && (lflags & GPIO_PULL_DOWN)) ||
+	    ((lflags & GPIO_PULL_UP) && (lflags & GPIO_PULL_DISABLE)) ||
+	    ((lflags & GPIO_PULL_DOWN) && (lflags & GPIO_PULL_DISABLE))) {
 		gpiod_err(desc,
-			  "both pull-up and pull-down enabled, invalid configuration\n");
+			  "multiple pull-up, pull-down or pull-disable enabled, invalid configuration\n");
 		return -EINVAL;
 	}
 
@@ -3955,6 +3957,8 @@ int gpiod_configure_flags(struct gpio_desc *desc, const char *con_id,
 		set_bit(FLAG_PULL_UP, &desc->flags);
 	else if (lflags & GPIO_PULL_DOWN)
 		set_bit(FLAG_PULL_DOWN, &desc->flags);
+	else if (lflags & GPIO_PULL_DISABLE)
+		set_bit(FLAG_BIAS_DISABLE, &desc->flags);
 
 	ret = gpiod_set_transitory(desc, (lflags & GPIO_TRANSITORY));
 	if (ret < 0)
diff --git a/include/linux/gpio/machine.h b/include/linux/gpio/machine.h
index 4d55da28e664..0b619eb7ae83 100644
--- a/include/linux/gpio/machine.h
+++ b/include/linux/gpio/machine.h
@@ -14,6 +14,7 @@ enum gpio_lookup_flags {
 	GPIO_TRANSITORY			= (1 << 3),
 	GPIO_PULL_UP			= (1 << 4),
 	GPIO_PULL_DOWN			= (1 << 5),
+	GPIO_PULL_DISABLE		= (1 << 6),
 
 	GPIO_LOOKUP_FLAGS_DEFAULT	= GPIO_ACTIVE_HIGH | GPIO_PERSISTENT,
 };
-- 
cgit 


From 31bea23119cda87088c6bd4085a1e442c6c5974c Mon Sep 17 00:00:00 2001
From: Nuno Sá <nuno.sa@analog.com>
Date: Wed, 13 Jul 2022 15:14:19 +0200
Subject: gpiolib: of: support bias pull disable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On top of looking at PULL_UP and PULL_DOWN flags, also look at
PULL_DISABLE and set the appropriate GPIO flag. The GPIO core will then
pass down this to controllers that support it.

Signed-off-by: Nuno Sá <nuno.sa@analog.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
---
 drivers/gpio/gpiolib-of.c | 7 +++++++
 include/linux/of_gpio.h   | 1 +
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c
index f80307be37d5..a037b50bef33 100644
--- a/drivers/gpio/gpiolib-of.c
+++ b/drivers/gpio/gpiolib-of.c
@@ -354,6 +354,9 @@ struct gpio_desc *gpiod_get_from_of_node(const struct device_node *node,
 	if (flags & OF_GPIO_PULL_DOWN)
 		lflags |= GPIO_PULL_DOWN;
 
+	if (flags & OF_GPIO_PULL_DISABLE)
+		lflags |= GPIO_PULL_DISABLE;
+
 	ret = gpiod_configure_flags(desc, propname, lflags, dflags);
 	if (ret < 0) {
 		gpiod_put(desc);
@@ -556,6 +559,8 @@ struct gpio_desc *of_find_gpio(struct device *dev, const char *con_id,
 		*flags |= GPIO_PULL_UP;
 	if (of_flags & OF_GPIO_PULL_DOWN)
 		*flags |= GPIO_PULL_DOWN;
+	if (of_flags & OF_GPIO_PULL_DISABLE)
+		*flags |= GPIO_PULL_DISABLE;
 
 	return desc;
 }
@@ -621,6 +626,8 @@ static struct gpio_desc *of_parse_own_gpio(struct device_node *np,
 		*lflags |= GPIO_PULL_UP;
 	if (xlate_flags & OF_GPIO_PULL_DOWN)
 		*lflags |= GPIO_PULL_DOWN;
+	if (xlate_flags & OF_GPIO_PULL_DISABLE)
+		*lflags |= GPIO_PULL_DISABLE;
 
 	if (of_property_read_bool(np, "input"))
 		*dflags |= GPIOD_IN;
diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h
index 8bf2ea859653..a5166eb93437 100644
--- a/include/linux/of_gpio.h
+++ b/include/linux/of_gpio.h
@@ -29,6 +29,7 @@ enum of_gpio_flags {
 	OF_GPIO_TRANSITORY = 0x8,
 	OF_GPIO_PULL_UP = 0x10,
 	OF_GPIO_PULL_DOWN = 0x20,
+	OF_GPIO_PULL_DISABLE = 0x40,
 };
 
 #ifdef CONFIG_OF_GPIO
-- 
cgit 


From d042656a2170ce1b0d1a72d81a99bc93f0170358 Mon Sep 17 00:00:00 2001
From: Nuno Sá <nuno.sa@analog.com>
Date: Wed, 13 Jul 2022 15:14:21 +0200
Subject: dt-bindings: gpio: add pull-disable flag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This extends the flags that can be used in GPIO specifiers to indicate
that no bias is intended in the pin.

Signed-off-by: Nuno Sá <nuno.sa@analog.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
---
 include/dt-bindings/gpio/gpio.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/gpio/gpio.h b/include/dt-bindings/gpio/gpio.h
index c029467e828b..5566e58196a2 100644
--- a/include/dt-bindings/gpio/gpio.h
+++ b/include/dt-bindings/gpio/gpio.h
@@ -39,4 +39,7 @@
 /* Bit 5 express pull down */
 #define GPIO_PULL_DOWN 32
 
+/* Bit 6 express pull disable */
+#define GPIO_PULL_DISABLE 64
+
 #endif
-- 
cgit 


From 608128d391fa5c95d515d2c198f40239016e1fcd Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Thu, 14 Jul 2022 19:15:27 +0800
Subject: scsi: sd: allow max_sectors be capped at DMA optimal size limit

Streaming DMA mappings may be considerably slower when mappings go through
an IOMMU and the total mapping length is somewhat long. This is because the
IOMMU IOVA code allocates and free an IOVA for each mapping, which may
affect performance.

New member Scsi_Host.opt_sectors is added, which is the optimal host
max_sectors, and use this value to cap the request queue max_sectors when
set.

It could be considered to have request queues io_opt value initially
set at Scsi_Host.opt_sectors in __scsi_init_queue(), but that is not
really the purpose of io_opt.

Finally, even though Scsi_Host.opt_sectors value should never be greater
than the request queue max_hw_sectors value, continue to limit to this
value for safety.

Signed-off-by: John Garry <john.garry@huawei.com>
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/scsi/sd.c        | 2 ++
 include/scsi/scsi_host.h | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index a1a2ac09066f..3eaee1f7aaca 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -3296,6 +3296,8 @@ static int sd_revalidate_disk(struct gendisk *disk)
 				      (sector_t)BLK_DEF_MAX_SECTORS);
 	}
 
+	rw_max = min_not_zero(rw_max, sdp->host->opt_sectors);
+
 	/* Do not exceed controller limit */
 	rw_max = min(rw_max, queue_max_hw_sectors(q));
 
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 667d889b92b5..d32a84b2bb40 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -607,6 +607,7 @@ struct Scsi_Host {
 	short unsigned int sg_tablesize;
 	short unsigned int sg_prot_tablesize;
 	unsigned int max_sectors;
+	unsigned int opt_sectors;
 	unsigned int max_segment_size;
 	unsigned long dma_boundary;
 	unsigned long virt_boundary_mask;
-- 
cgit 


From 62fa5c9800a0b9aecf9ce0743f12a16057c9400d Mon Sep 17 00:00:00 2001
From: Luca Ceresoli <luca@lucaceresoli.net>
Date: Fri, 3 Jun 2022 17:57:25 +0200
Subject: mfd: max77714: Update Luca Ceresoli's e-mail address

My Bootlin address is preferred from now on.

Signed-off-by: Luca Ceresoli <luca@lucaceresoli.net>
Signed-off-by: Luca Ceresoli <luca.ceresoli@bootlin.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220603155727.1232061-4-luca@lucaceresoli.net
---
 drivers/mfd/max77714.c       | 4 ++--
 include/linux/mfd/max77714.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/max77714.c b/drivers/mfd/max77714.c
index d1e4247800d2..143a432ea343 100644
--- a/drivers/mfd/max77714.c
+++ b/drivers/mfd/max77714.c
@@ -3,7 +3,7 @@
  * Maxim MAX77714 Core Driver
  *
  * Copyright (C) 2022 Luca Ceresoli
- * Author: Luca Ceresoli <luca@lucaceresoli.net>
+ * Author: Luca Ceresoli <luca.ceresoli@bootlin.com>
  */
 
 #include <linux/i2c.h>
@@ -148,5 +148,5 @@ static struct i2c_driver max77714_driver = {
 module_i2c_driver(max77714_driver);
 
 MODULE_DESCRIPTION("Maxim MAX77714 MFD core driver");
-MODULE_AUTHOR("Luca Ceresoli <luca@lucaceresoli.net>");
+MODULE_AUTHOR("Luca Ceresoli <luca.ceresoli@bootlin.com>");
 MODULE_LICENSE("GPL");
diff --git a/include/linux/mfd/max77714.h b/include/linux/mfd/max77714.h
index a970dc455426..7947e0d697a5 100644
--- a/include/linux/mfd/max77714.h
+++ b/include/linux/mfd/max77714.h
@@ -3,7 +3,7 @@
  * Maxim MAX77714 Register and data structures definition.
  *
  * Copyright (C) 2022 Luca Ceresoli
- * Author: Luca Ceresoli <luca@lucaceresoli.net>
+ * Author: Luca Ceresoli <luca.ceresoli@bootlin.com>
  */
 
 #ifndef __LINUX_MFD_MAX77714_H_
-- 
cgit 


From 128ac294e1b437cb8a7f2ff8ede1cde9082bddbe Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Mon, 30 May 2022 21:24:28 +0200
Subject: mfd: t7l66xb: Drop platform disable callback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

None of the in-tree instantiations of struct t7l66xb_platform_data
provides a disable callback. So better don't dereference this function
pointer unconditionally. As there is no user, drop it completely instead
of calling it conditional.

This is a preparation for making platform remove callbacks return void.

Fixes: 1f192015ca5b ("mfd: driver for the T7L66XB TMIO SoC")
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220530192430.2108217-3-u.kleine-koenig@pengutronix.de
---
 drivers/mfd/t7l66xb.c       | 6 +-----
 include/linux/mfd/t7l66xb.h | 1 -
 2 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/t7l66xb.c b/drivers/mfd/t7l66xb.c
index 5369c67e3280..663ffd4b8570 100644
--- a/drivers/mfd/t7l66xb.c
+++ b/drivers/mfd/t7l66xb.c
@@ -397,11 +397,8 @@ err_noirq:
 
 static int t7l66xb_remove(struct platform_device *dev)
 {
-	struct t7l66xb_platform_data *pdata = dev_get_platdata(&dev->dev);
 	struct t7l66xb *t7l66xb = platform_get_drvdata(dev);
-	int ret;
 
-	ret = pdata->disable(dev);
 	clk_disable_unprepare(t7l66xb->clk48m);
 	clk_put(t7l66xb->clk48m);
 	clk_disable_unprepare(t7l66xb->clk32k);
@@ -412,8 +409,7 @@ static int t7l66xb_remove(struct platform_device *dev)
 	mfd_remove_devices(&dev->dev);
 	kfree(t7l66xb);
 
-	return ret;
-
+	return 0;
 }
 
 static struct platform_driver t7l66xb_platform_driver = {
diff --git a/include/linux/mfd/t7l66xb.h b/include/linux/mfd/t7l66xb.h
index 69632c1b07bd..ae3e7a5c5219 100644
--- a/include/linux/mfd/t7l66xb.h
+++ b/include/linux/mfd/t7l66xb.h
@@ -12,7 +12,6 @@
 
 struct t7l66xb_platform_data {
 	int (*enable)(struct platform_device *dev);
-	int (*disable)(struct platform_device *dev);
 	int (*suspend)(struct platform_device *dev);
 	int (*resume)(struct platform_device *dev);
 
-- 
cgit 


From 6e1f1b1c93ceec1d9bfa9213775abc19161de5f1 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Mon, 30 May 2022 21:24:29 +0200
Subject: mfd: tc6387xb: Drop disable callback that is never called
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The driver never calls the disable callback, so drop the member from
the platform struct and all callbacks from the actual platform datas.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220530192430.2108217-4-u.kleine-koenig@pengutronix.de
---
 arch/arm/mach-pxa/eseries.c  | 1 -
 include/linux/mfd/tc6387xb.h | 1 -
 2 files changed, 2 deletions(-)

(limited to 'include')

diff --git a/arch/arm/mach-pxa/eseries.c b/arch/arm/mach-pxa/eseries.c
index 08f8737aa8fd..99781eec065e 100644
--- a/arch/arm/mach-pxa/eseries.c
+++ b/arch/arm/mach-pxa/eseries.c
@@ -139,7 +139,6 @@ static void __init __maybe_unused eseries_register_clks(void)
 
 static struct tc6387xb_platform_data e330_tc6387xb_info = {
 	.enable   = &eseries_tmio_enable,
-	.disable  = &eseries_tmio_disable,
 	.suspend  = &eseries_tmio_suspend,
 	.resume   = &eseries_tmio_resume,
 };
diff --git a/include/linux/mfd/tc6387xb.h b/include/linux/mfd/tc6387xb.h
index b4888209494a..aacf1dcc86b9 100644
--- a/include/linux/mfd/tc6387xb.h
+++ b/include/linux/mfd/tc6387xb.h
@@ -12,7 +12,6 @@
 
 struct tc6387xb_platform_data {
 	int (*enable)(struct platform_device *dev);
-	int (*disable)(struct platform_device *dev);
 	int (*suspend)(struct platform_device *dev);
 	int (*resume)(struct platform_device *dev);
 };
-- 
cgit 


From de58cee8c6b803dda3304eace346919fe880a40a Mon Sep 17 00:00:00 2001
From: Fabien Parent <fparent@baylibre.com>
Date: Tue, 31 May 2022 14:49:56 +0200
Subject: mfd: mt6397-core: Add MT6357 PMIC support

Adds support for PMIC keys, Regulator, and RTC for the MT6357 PMIC.

Signed-off-by: Fabien Parent <fparent@baylibre.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220531124959.202787-5-fparent@baylibre.com
---
 drivers/mfd/mt6397-core.c            |   44 +
 include/linux/mfd/mt6357/core.h      |  119 +++
 include/linux/mfd/mt6357/registers.h | 1574 ++++++++++++++++++++++++++++++++++
 3 files changed, 1737 insertions(+)
 create mode 100644 include/linux/mfd/mt6357/core.h
 create mode 100644 include/linux/mfd/mt6357/registers.h

(limited to 'include')

diff --git a/drivers/mfd/mt6397-core.c b/drivers/mfd/mt6397-core.c
index 1a368ad08f58..3cb8836bd08d 100644
--- a/drivers/mfd/mt6397-core.c
+++ b/drivers/mfd/mt6397-core.c
@@ -12,10 +12,12 @@
 #include <linux/regmap.h>
 #include <linux/mfd/core.h>
 #include <linux/mfd/mt6323/core.h>
+#include <linux/mfd/mt6357/core.h>
 #include <linux/mfd/mt6358/core.h>
 #include <linux/mfd/mt6359/core.h>
 #include <linux/mfd/mt6397/core.h>
 #include <linux/mfd/mt6323/registers.h>
+#include <linux/mfd/mt6357/registers.h>
 #include <linux/mfd/mt6358/registers.h>
 #include <linux/mfd/mt6359/registers.h>
 #include <linux/mfd/mt6397/registers.h>
@@ -23,6 +25,9 @@
 #define MT6323_RTC_BASE		0x8000
 #define MT6323_RTC_SIZE		0x40
 
+#define MT6357_RTC_BASE		0x0588
+#define MT6357_RTC_SIZE		0x3c
+
 #define MT6358_RTC_BASE		0x0588
 #define MT6358_RTC_SIZE		0x3c
 
@@ -37,6 +42,11 @@ static const struct resource mt6323_rtc_resources[] = {
 	DEFINE_RES_IRQ(MT6323_IRQ_STATUS_RTC),
 };
 
+static const struct resource mt6357_rtc_resources[] = {
+	DEFINE_RES_MEM(MT6357_RTC_BASE, MT6357_RTC_SIZE),
+	DEFINE_RES_IRQ(MT6357_IRQ_RTC),
+};
+
 static const struct resource mt6358_rtc_resources[] = {
 	DEFINE_RES_MEM(MT6358_RTC_BASE, MT6358_RTC_SIZE),
 	DEFINE_RES_IRQ(MT6358_IRQ_RTC),
@@ -66,6 +76,13 @@ static const struct resource mt6323_keys_resources[] = {
 	DEFINE_RES_IRQ_NAMED(MT6323_IRQ_STATUS_FCHRKEY, "homekey"),
 };
 
+static const struct resource mt6357_keys_resources[] = {
+	DEFINE_RES_IRQ_NAMED(MT6357_IRQ_PWRKEY, "powerkey"),
+	DEFINE_RES_IRQ_NAMED(MT6357_IRQ_HOMEKEY, "homekey"),
+	DEFINE_RES_IRQ_NAMED(MT6357_IRQ_PWRKEY_R, "powerkey_r"),
+	DEFINE_RES_IRQ_NAMED(MT6357_IRQ_HOMEKEY_R, "homekey_r"),
+};
+
 static const struct resource mt6397_keys_resources[] = {
 	DEFINE_RES_IRQ_NAMED(MT6397_IRQ_PWRKEY, "powerkey"),
 	DEFINE_RES_IRQ_NAMED(MT6397_IRQ_HOMEKEY, "homekey"),
@@ -100,6 +117,22 @@ static const struct mfd_cell mt6323_devs[] = {
 	},
 };
 
+static const struct mfd_cell mt6357_devs[] = {
+	{
+		.name = "mt6357-regulator",
+	}, {
+		.name = "mt6357-rtc",
+		.num_resources = ARRAY_SIZE(mt6357_rtc_resources),
+		.resources = mt6357_rtc_resources,
+		.of_compatible = "mediatek,mt6357-rtc",
+	}, {
+		.name = "mtk-pmic-keys",
+		.num_resources = ARRAY_SIZE(mt6357_keys_resources),
+		.resources = mt6357_keys_resources,
+		.of_compatible = "mediatek,mt6357-keys"
+	},
+};
+
 static const struct mfd_cell mt6358_devs[] = {
 	{
 		.name = "mt6358-regulator",
@@ -179,6 +212,14 @@ static const struct chip_data mt6323_core = {
 	.irq_init = mt6397_irq_init,
 };
 
+static const struct chip_data mt6357_core = {
+	.cid_addr = MT6357_SWCID,
+	.cid_shift = 8,
+	.cells = mt6357_devs,
+	.cell_size = ARRAY_SIZE(mt6357_devs),
+	.irq_init = mt6358_irq_init,
+};
+
 static const struct chip_data mt6358_core = {
 	.cid_addr = MT6358_SWCID,
 	.cid_shift = 8,
@@ -261,6 +302,9 @@ static const struct of_device_id mt6397_of_match[] = {
 	{
 		.compatible = "mediatek,mt6323",
 		.data = &mt6323_core,
+	}, {
+		.compatible = "mediatek,mt6357",
+		.data = &mt6357_core,
 	}, {
 		.compatible = "mediatek,mt6358",
 		.data = &mt6358_core,
diff --git a/include/linux/mfd/mt6357/core.h b/include/linux/mfd/mt6357/core.h
new file mode 100644
index 000000000000..2441611264fd
--- /dev/null
+++ b/include/linux/mfd/mt6357/core.h
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2022 BayLibre, SAS
+ * Author: Fabien Parent <fparent@baylibre.com>
+ */
+
+#ifndef __MFD_MT6357_CORE_H__
+#define __MFD_MT6357_CORE_H__
+
+enum mt6357_irq_top_status_shift {
+	MT6357_BUCK_TOP = 0,
+	MT6357_LDO_TOP,
+	MT6357_PSC_TOP,
+	MT6357_SCK_TOP,
+	MT6357_BM_TOP,
+	MT6357_HK_TOP,
+	MT6357_XPP_TOP,
+	MT6357_AUD_TOP,
+	MT6357_MISC_TOP,
+};
+
+enum mt6357_irq_numbers {
+	MT6357_IRQ_VPROC_OC = 0,
+	MT6357_IRQ_VCORE_OC,
+	MT6357_IRQ_VMODEM_OC,
+	MT6357_IRQ_VS1_OC,
+	MT6357_IRQ_VPA_OC,
+	MT6357_IRQ_VCORE_PREOC,
+	MT6357_IRQ_VFE28_OC = 16,
+	MT6357_IRQ_VXO22_OC,
+	MT6357_IRQ_VRF18_OC,
+	MT6357_IRQ_VRF12_OC,
+	MT6357_IRQ_VEFUSE_OC,
+	MT6357_IRQ_VCN33_OC,
+	MT6357_IRQ_VCN28_OC,
+	MT6357_IRQ_VCN18_OC,
+	MT6357_IRQ_VCAMA_OC,
+	MT6357_IRQ_VCAMD_OC,
+	MT6357_IRQ_VCAMIO_OC,
+	MT6357_IRQ_VLDO28_OC,
+	MT6357_IRQ_VUSB33_OC,
+	MT6357_IRQ_VAUX18_OC,
+	MT6357_IRQ_VAUD28_OC,
+	MT6357_IRQ_VIO28_OC,
+	MT6357_IRQ_VIO18_OC,
+	MT6357_IRQ_VSRAM_PROC_OC,
+	MT6357_IRQ_VSRAM_OTHERS_OC,
+	MT6357_IRQ_VIBR_OC,
+	MT6357_IRQ_VDRAM_OC,
+	MT6357_IRQ_VMC_OC,
+	MT6357_IRQ_VMCH_OC,
+	MT6357_IRQ_VEMC_OC,
+	MT6357_IRQ_VSIM1_OC,
+	MT6357_IRQ_VSIM2_OC,
+	MT6357_IRQ_PWRKEY = 48,
+	MT6357_IRQ_HOMEKEY,
+	MT6357_IRQ_PWRKEY_R,
+	MT6357_IRQ_HOMEKEY_R,
+	MT6357_IRQ_NI_LBAT_INT,
+	MT6357_IRQ_CHRDET,
+	MT6357_IRQ_CHRDET_EDGE,
+	MT6357_IRQ_VCDT_HV_DET,
+	MT6357_IRQ_WATCHDOG,
+	MT6357_IRQ_VBATON_UNDET,
+	MT6357_IRQ_BVALID_DET,
+	MT6357_IRQ_OV,
+	MT6357_IRQ_RTC = 64,
+	MT6357_IRQ_FG_BAT0_H = 80,
+	MT6357_IRQ_FG_BAT0_L,
+	MT6357_IRQ_FG_CUR_H,
+	MT6357_IRQ_FG_CUR_L,
+	MT6357_IRQ_FG_ZCV,
+	MT6357_IRQ_BATON_LV = 96,
+	MT6357_IRQ_BATON_HT,
+	MT6357_IRQ_BAT_H = 112,
+	MT6357_IRQ_BAT_L,
+	MT6357_IRQ_AUXADC_IMP,
+	MT6357_IRQ_NAG_C_DLTV,
+	MT6357_IRQ_AUDIO = 128,
+	MT6357_IRQ_ACCDET = 133,
+	MT6357_IRQ_ACCDET_EINT0,
+	MT6357_IRQ_ACCDET_EINT1,
+	MT6357_IRQ_SPI_CMD_ALERT = 144,
+	MT6357_IRQ_NR,
+};
+
+#define MT6357_IRQ_BUCK_BASE	MT6357_IRQ_VPROC_OC
+#define MT6357_IRQ_LDO_BASE	MT6357_IRQ_VFE28_OC
+#define MT6357_IRQ_PSC_BASE	MT6357_IRQ_PWRKEY
+#define MT6357_IRQ_SCK_BASE	MT6357_IRQ_RTC
+#define MT6357_IRQ_BM_BASE	MT6357_IRQ_FG_BAT0_H
+#define MT6357_IRQ_HK_BASE	MT6357_IRQ_BAT_H
+#define MT6357_IRQ_AUD_BASE	MT6357_IRQ_AUDIO
+#define MT6357_IRQ_MISC_BASE	MT6357_IRQ_SPI_CMD_ALERT
+
+#define MT6357_IRQ_BUCK_BITS (MT6357_IRQ_VCORE_PREOC - MT6357_IRQ_BUCK_BASE + 1)
+#define MT6357_IRQ_LDO_BITS (MT6357_IRQ_VSIM2_OC - MT6357_IRQ_LDO_BASE + 1)
+#define MT6357_IRQ_PSC_BITS (MT6357_IRQ_VCDT_HV_DET - MT6357_IRQ_PSC_BASE + 1)
+#define MT6357_IRQ_SCK_BITS (MT6357_IRQ_RTC - MT6357_IRQ_SCK_BASE + 1)
+#define MT6357_IRQ_BM_BITS (MT6357_IRQ_BATON_HT - MT6357_IRQ_BM_BASE + 1)
+#define MT6357_IRQ_HK_BITS (MT6357_IRQ_NAG_C_DLTV - MT6357_IRQ_HK_BASE + 1)
+#define MT6357_IRQ_AUD_BITS (MT6357_IRQ_ACCDET_EINT1 - MT6357_IRQ_AUD_BASE + 1)
+#define MT6357_IRQ_MISC_BITS	\
+	(MT6357_IRQ_SPI_CMD_ALERT - MT6357_IRQ_MISC_BASE + 1)
+
+#define MT6357_TOP_GEN(sp)	\
+{	\
+	.hwirq_base = MT6357_IRQ_##sp##_BASE,	\
+	.num_int_regs =	\
+		((MT6357_IRQ_##sp##_BITS - 1) /	\
+		MTK_PMIC_REG_WIDTH) + 1,	\
+	.en_reg = MT6357_##sp##_TOP_INT_CON0,	\
+	.en_reg_shift = 0x6,	\
+	.sta_reg = MT6357_##sp##_TOP_INT_STATUS0,	\
+	.sta_reg_shift = 0x2,	\
+	.top_offset = MT6357_##sp##_TOP,	\
+}
+
+#endif /* __MFD_MT6357_CORE_H__ */
diff --git a/include/linux/mfd/mt6357/registers.h b/include/linux/mfd/mt6357/registers.h
new file mode 100644
index 000000000000..e24af83b618d
--- /dev/null
+++ b/include/linux/mfd/mt6357/registers.h
@@ -0,0 +1,1574 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 MediaTek Inc.
+ */
+
+#ifndef __MFD_MT6357_REGISTERS_H__
+#define __MFD_MT6357_REGISTERS_H__
+
+/* PMIC Registers */
+#define MT6357_TOP0_ID                       0x0
+#define MT6357_TOP0_REV0                     0x2
+#define MT6357_TOP0_DSN_DBI                  0x4
+#define MT6357_TOP0_DSN_DXI                  0x6
+#define MT6357_HWCID                         0x8
+#define MT6357_SWCID                         0xa
+#define MT6357_PONSTS                        0xc
+#define MT6357_POFFSTS                       0xe
+#define MT6357_PSTSCTL                       0x10
+#define MT6357_PG_DEB_STS0                   0x12
+#define MT6357_PG_SDN_STS0                   0x14
+#define MT6357_OC_SDN_STS0                   0x16
+#define MT6357_THERMALSTATUS                 0x18
+#define MT6357_TOP_CON                       0x1a
+#define MT6357_TEST_OUT                      0x1c
+#define MT6357_TEST_CON0                     0x1e
+#define MT6357_TEST_CON1                     0x20
+#define MT6357_TESTMODE_SW                   0x22
+#define MT6357_TOPSTATUS                     0x24
+#define MT6357_TDSEL_CON                     0x26
+#define MT6357_RDSEL_CON                     0x28
+#define MT6357_SMT_CON0                      0x2a
+#define MT6357_SMT_CON1                      0x2c
+#define MT6357_TOP_RSV0                      0x2e
+#define MT6357_TOP_RSV1                      0x30
+#define MT6357_DRV_CON0                      0x32
+#define MT6357_DRV_CON1                      0x34
+#define MT6357_DRV_CON2                      0x36
+#define MT6357_DRV_CON3                      0x38
+#define MT6357_FILTER_CON0                   0x3a
+#define MT6357_FILTER_CON1                   0x3c
+#define MT6357_FILTER_CON2                   0x3e
+#define MT6357_FILTER_CON3                   0x40
+#define MT6357_TOP_STATUS                    0x42
+#define MT6357_TOP_STATUS_SET                0x44
+#define MT6357_TOP_STATUS_CLR                0x46
+#define MT6357_TOP_TRAP                      0x48
+#define MT6357_TOP1_ID                       0x80
+#define MT6357_TOP1_REV0                     0x82
+#define MT6357_TOP1_DSN_DBI                  0x84
+#define MT6357_TOP1_DSN_DXI                  0x86
+#define MT6357_GPIO_DIR0                     0x88
+#define MT6357_GPIO_DIR0_SET                 0x8a
+#define MT6357_GPIO_DIR0_CLR                 0x8c
+#define MT6357_GPIO_PULLEN0                  0x8e
+#define MT6357_GPIO_PULLEN0_SET              0x90
+#define MT6357_GPIO_PULLEN0_CLR              0x92
+#define MT6357_GPIO_PULLSEL0                 0x94
+#define MT6357_GPIO_PULLSEL0_SET             0x96
+#define MT6357_GPIO_PULLSEL0_CLR             0x98
+#define MT6357_GPIO_DINV0                    0x9a
+#define MT6357_GPIO_DINV0_SET                0x9c
+#define MT6357_GPIO_DINV0_CLR                0x9e
+#define MT6357_GPIO_DOUT0                    0xa0
+#define MT6357_GPIO_DOUT0_SET                0xa2
+#define MT6357_GPIO_DOUT0_CLR                0xa4
+#define MT6357_GPIO_PI0                      0xa6
+#define MT6357_GPIO_POE0                     0xa8
+#define MT6357_GPIO_MODE0                    0xaa
+#define MT6357_GPIO_MODE0_SET                0xac
+#define MT6357_GPIO_MODE0_CLR                0xae
+#define MT6357_GPIO_MODE1                    0xb0
+#define MT6357_GPIO_MODE1_SET                0xb2
+#define MT6357_GPIO_MODE1_CLR                0xb4
+#define MT6357_GPIO_MODE2                    0xb6
+#define MT6357_GPIO_MODE2_SET                0xb8
+#define MT6357_GPIO_MODE2_CLR                0xba
+#define MT6357_GPIO_MODE3                    0xbc
+#define MT6357_GPIO_MODE3_SET                0xbe
+#define MT6357_GPIO_MODE3_CLR                0xc0
+#define MT6357_GPIO_RSV                      0xc2
+#define MT6357_TOP2_ID                       0x100
+#define MT6357_TOP2_REV0                     0x102
+#define MT6357_TOP2_DSN_DBI                  0x104
+#define MT6357_TOP2_DSN_DXI                  0x106
+#define MT6357_TOP_PAM0                      0x108
+#define MT6357_TOP_PAM1                      0x10a
+#define MT6357_TOP_CKPDN_CON0                0x10c
+#define MT6357_TOP_CKPDN_CON0_SET            0x10e
+#define MT6357_TOP_CKPDN_CON0_CLR            0x110
+#define MT6357_TOP_CKPDN_CON1                0x112
+#define MT6357_TOP_CKPDN_CON1_SET            0x114
+#define MT6357_TOP_CKPDN_CON1_CLR            0x116
+#define MT6357_TOP_CKSEL_CON0                0x118
+#define MT6357_TOP_CKSEL_CON0_SET            0x11a
+#define MT6357_TOP_CKSEL_CON0_CLR            0x11c
+#define MT6357_TOP_CKSEL_CON1                0x11e
+#define MT6357_TOP_CKSEL_CON1_SET            0x120
+#define MT6357_TOP_CKSEL_CON1_CLR            0x122
+#define MT6357_TOP_CKDIVSEL_CON0             0x124
+#define MT6357_TOP_CKDIVSEL_CON0_SET         0x126
+#define MT6357_TOP_CKDIVSEL_CON0_CLR         0x128
+#define MT6357_TOP_CKHWEN_CON0               0x12a
+#define MT6357_TOP_CKHWEN_CON0_SET           0x12c
+#define MT6357_TOP_CKHWEN_CON0_CLR           0x12e
+#define MT6357_TOP_CKTST_CON0                0x130
+#define MT6357_TOP_CKTST_CON1                0x132
+#define MT6357_TOP_CLK_CON0                  0x134
+#define MT6357_TOP_CLK_CON0_SET              0x136
+#define MT6357_TOP_CLK_CON0_CLR              0x138
+#define MT6357_TOP_DCM_CON0                  0x13a
+#define MT6357_TOP_HANDOVER_DEBUG0           0x13c
+#define MT6357_TOP_RST_CON0                  0x13e
+#define MT6357_TOP_RST_CON0_SET              0x140
+#define MT6357_TOP_RST_CON0_CLR              0x142
+#define MT6357_TOP_RST_CON1                  0x144
+#define MT6357_TOP_RST_CON1_SET              0x146
+#define MT6357_TOP_RST_CON1_CLR              0x148
+#define MT6357_TOP_RST_CON2                  0x14a
+#define MT6357_TOP_RST_MISC                  0x14c
+#define MT6357_TOP_RST_MISC_SET              0x14e
+#define MT6357_TOP_RST_MISC_CLR              0x150
+#define MT6357_TOP_RST_STATUS                0x152
+#define MT6357_TOP_RST_STATUS_SET            0x154
+#define MT6357_TOP_RST_STATUS_CLR            0x156
+#define MT6357_TOP2_ELR_NUM                  0x158
+#define MT6357_TOP2_ELR0                     0x15a
+#define MT6357_TOP2_ELR1                     0x15c
+#define MT6357_TOP3_ID                       0x180
+#define MT6357_TOP3_REV0                     0x182
+#define MT6357_TOP3_DSN_DBI                  0x184
+#define MT6357_TOP3_DSN_DXI                  0x186
+#define MT6357_MISC_TOP_INT_CON0             0x188
+#define MT6357_MISC_TOP_INT_CON0_SET         0x18a
+#define MT6357_MISC_TOP_INT_CON0_CLR         0x18c
+#define MT6357_MISC_TOP_INT_MASK_CON0        0x18e
+#define MT6357_MISC_TOP_INT_MASK_CON0_SET    0x190
+#define MT6357_MISC_TOP_INT_MASK_CON0_CLR    0x192
+#define MT6357_MISC_TOP_INT_STATUS0          0x194
+#define MT6357_MISC_TOP_INT_RAW_STATUS0      0x196
+#define MT6357_TOP_INT_MASK_CON0             0x198
+#define MT6357_TOP_INT_MASK_CON0_SET         0x19a
+#define MT6357_TOP_INT_MASK_CON0_CLR         0x19c
+#define MT6357_TOP_INT_STATUS0               0x19e
+#define MT6357_TOP_INT_RAW_STATUS0           0x1a0
+#define MT6357_TOP_INT_CON0                  0x1a2
+#define MT6357_PLT0_ID                       0x380
+#define MT6357_PLT0_REV0                     0x382
+#define MT6357_PLT0_REV1                     0x384
+#define MT6357_PLT0_DSN_DXI                  0x386
+#define MT6357_FQMTR_CON0                    0x388
+#define MT6357_FQMTR_CON1                    0x38a
+#define MT6357_FQMTR_CON2                    0x38c
+#define MT6357_TOP_CLK_TRIM                  0x38e
+#define MT6357_OTP_CON0                      0x390
+#define MT6357_OTP_CON1                      0x392
+#define MT6357_OTP_CON2                      0x394
+#define MT6357_OTP_CON3                      0x396
+#define MT6357_OTP_CON4                      0x398
+#define MT6357_OTP_CON5                      0x39a
+#define MT6357_OTP_CON6                      0x39c
+#define MT6357_OTP_CON7                      0x39e
+#define MT6357_OTP_CON8                      0x3a0
+#define MT6357_OTP_CON9                      0x3a2
+#define MT6357_OTP_CON10                     0x3a4
+#define MT6357_OTP_CON11                     0x3a6
+#define MT6357_OTP_CON12                     0x3a8
+#define MT6357_OTP_CON13                     0x3aa
+#define MT6357_OTP_CON14                     0x3ac
+#define MT6357_TOP_TMA_KEY                   0x3ae
+#define MT6357_TOP_MDB_CONF0                 0x3b0
+#define MT6357_TOP_MDB_CONF1                 0x3b2
+#define MT6357_TOP_MDB_CONF2                 0x3b4
+#define MT6357_PLT0_ELR_NUM                  0x3b6
+#define MT6357_PLT0_ELR0                     0x3b8
+#define MT6357_PLT0_ELR1                     0x3ba
+#define MT6357_SPISLV_ID                     0x400
+#define MT6357_SPISLV_REV0                   0x402
+#define MT6357_SPISLV_REV1                   0x404
+#define MT6357_SPISLV_DSN_DXI                0x406
+#define MT6357_RG_SPI_CON0                   0x408
+#define MT6357_DEW_DIO_EN                    0x40a
+#define MT6357_DEW_READ_TEST                 0x40c
+#define MT6357_DEW_WRITE_TEST                0x40e
+#define MT6357_DEW_CRC_SWRST                 0x410
+#define MT6357_DEW_CRC_EN                    0x412
+#define MT6357_DEW_CRC_VAL                   0x414
+#define MT6357_DEW_DBG_MON_SEL               0x416
+#define MT6357_DEW_CIPHER_KEY_SEL            0x418
+#define MT6357_DEW_CIPHER_IV_SEL             0x41a
+#define MT6357_DEW_CIPHER_EN                 0x41c
+#define MT6357_DEW_CIPHER_RDY                0x41e
+#define MT6357_DEW_CIPHER_MODE               0x420
+#define MT6357_DEW_CIPHER_SWRST              0x422
+#define MT6357_DEW_RDDMY_NO                  0x424
+#define MT6357_INT_TYPE_CON0                 0x426
+#define MT6357_INT_TYPE_CON0_SET             0x428
+#define MT6357_INT_TYPE_CON0_CLR             0x42a
+#define MT6357_INT_STA                       0x42c
+#define MT6357_RG_SPI_CON1                   0x42e
+#define MT6357_RG_SPI_CON2                   0x430
+#define MT6357_RG_SPI_CON3                   0x432
+#define MT6357_RG_SPI_CON4                   0x434
+#define MT6357_RG_SPI_CON5                   0x436
+#define MT6357_RG_SPI_CON6                   0x438
+#define MT6357_RG_SPI_CON7                   0x43a
+#define MT6357_RG_SPI_CON8                   0x43c
+#define MT6357_RG_SPI_CON9                   0x43e
+#define MT6357_RG_SPI_CON10                  0x440
+#define MT6357_RG_SPI_CON11                  0x442
+#define MT6357_RG_SPI_CON12                  0x444
+#define MT6357_RG_SPI_CON13                  0x446
+#define MT6357_TOP_SPI_CON0                  0x448
+#define MT6357_TOP_SPI_CON1                  0x44a
+#define MT6357_SCK_TOP_DSN_ID                0x500
+#define MT6357_SCK_TOP_DSN_REV0              0x502
+#define MT6357_SCK_TOP_DBI                   0x504
+#define MT6357_SCK_TOP_DXI                   0x506
+#define MT6357_SCK_TOP_TPM0                  0x508
+#define MT6357_SCK_TOP_TPM1                  0x50a
+#define MT6357_SCK_TOP_CON0                  0x50c
+#define MT6357_SCK_TOP_CON1                  0x50e
+#define MT6357_SCK_TOP_TEST_OUT              0x510
+#define MT6357_SCK_TOP_TEST_CON0             0x512
+#define MT6357_SCK_TOP_CKPDN_CON0            0x514
+#define MT6357_SCK_TOP_CKPDN_CON0_SET        0x516
+#define MT6357_SCK_TOP_CKPDN_CON0_CLR        0x518
+#define MT6357_SCK_TOP_CKHWEN_CON0           0x51a
+#define MT6357_SCK_TOP_CKHWEN_CON0_SET       0x51c
+#define MT6357_SCK_TOP_CKHWEN_CON0_CLR       0x51e
+#define MT6357_SCK_TOP_CKTST_CON             0x520
+#define MT6357_SCK_TOP_RST_CON0              0x522
+#define MT6357_SCK_TOP_RST_CON0_SET          0x524
+#define MT6357_SCK_TOP_RST_CON0_CLR          0x526
+#define MT6357_SCK_TOP_INT_CON0              0x528
+#define MT6357_SCK_TOP_INT_CON0_SET          0x52a
+#define MT6357_SCK_TOP_INT_CON0_CLR          0x52c
+#define MT6357_SCK_TOP_INT_MASK_CON0         0x52e
+#define MT6357_SCK_TOP_INT_MASK_CON0_SET     0x530
+#define MT6357_SCK_TOP_INT_MASK_CON0_CLR     0x532
+#define MT6357_SCK_TOP_INT_STATUS0           0x534
+#define MT6357_SCK_TOP_INT_RAW_STATUS0       0x536
+#define MT6357_SCK_TOP_INT_MISC_CON          0x538
+#define MT6357_EOSC_CALI_CON0                0x53a
+#define MT6357_EOSC_CALI_CON1                0x53c
+#define MT6357_RTC_MIX_CON0                  0x53e
+#define MT6357_RTC_MIX_CON1                  0x540
+#define MT6357_RTC_MIX_CON2                  0x542
+#define MT6357_RTC_DSN_ID                    0x580
+#define MT6357_RTC_DSN_REV0                  0x582
+#define MT6357_RTC_DBI                       0x584
+#define MT6357_RTC_DXI                       0x586
+#define MT6357_RTC_BBPU                      0x588
+#define MT6357_RTC_IRQ_STA                   0x58a
+#define MT6357_RTC_IRQ_EN                    0x58c
+#define MT6357_RTC_CII_EN                    0x58e
+#define MT6357_RTC_AL_MASK                   0x590
+#define MT6357_RTC_TC_SEC                    0x592
+#define MT6357_RTC_TC_MIN                    0x594
+#define MT6357_RTC_TC_HOU                    0x596
+#define MT6357_RTC_TC_DOM                    0x598
+#define MT6357_RTC_TC_DOW                    0x59a
+#define MT6357_RTC_TC_MTH                    0x59c
+#define MT6357_RTC_TC_YEA                    0x59e
+#define MT6357_RTC_AL_SEC                    0x5a0
+#define MT6357_RTC_AL_MIN                    0x5a2
+#define MT6357_RTC_AL_HOU                    0x5a4
+#define MT6357_RTC_AL_DOM                    0x5a6
+#define MT6357_RTC_AL_DOW                    0x5a8
+#define MT6357_RTC_AL_MTH                    0x5aa
+#define MT6357_RTC_AL_YEA                    0x5ac
+#define MT6357_RTC_OSC32CON                  0x5ae
+#define MT6357_RTC_POWERKEY1                 0x5b0
+#define MT6357_RTC_POWERKEY2                 0x5b2
+#define MT6357_RTC_PDN1                      0x5b4
+#define MT6357_RTC_PDN2                      0x5b6
+#define MT6357_RTC_SPAR0                     0x5b8
+#define MT6357_RTC_SPAR1                     0x5ba
+#define MT6357_RTC_PROT                      0x5bc
+#define MT6357_RTC_DIFF                      0x5be
+#define MT6357_RTC_CALI                      0x5c0
+#define MT6357_RTC_WRTGR                     0x5c2
+#define MT6357_RTC_CON                       0x5c4
+#define MT6357_RTC_SEC_CTRL                  0x5c6
+#define MT6357_RTC_INT_CNT                   0x5c8
+#define MT6357_RTC_SEC_DAT0                  0x5ca
+#define MT6357_RTC_SEC_DAT1                  0x5cc
+#define MT6357_RTC_SEC_DAT2                  0x5ce
+#define MT6357_RTC_SEC_DSN_ID                0x600
+#define MT6357_RTC_SEC_DSN_REV0              0x602
+#define MT6357_RTC_SEC_DBI                   0x604
+#define MT6357_RTC_SEC_DXI                   0x606
+#define MT6357_RTC_TC_SEC_SEC                0x608
+#define MT6357_RTC_TC_MIN_SEC                0x60a
+#define MT6357_RTC_TC_HOU_SEC                0x60c
+#define MT6357_RTC_TC_DOM_SEC                0x60e
+#define MT6357_RTC_TC_DOW_SEC                0x610
+#define MT6357_RTC_TC_MTH_SEC                0x612
+#define MT6357_RTC_TC_YEA_SEC                0x614
+#define MT6357_RTC_SEC_CK_PDN                0x616
+#define MT6357_RTC_SEC_WRTGR                 0x618
+#define MT6357_DCXO_DSN_ID                   0x780
+#define MT6357_DCXO_DSN_REV0                 0x782
+#define MT6357_DCXO_DSN_DBI                  0x784
+#define MT6357_DCXO_DSN_DXI                  0x786
+#define MT6357_DCXO_CW00                     0x788
+#define MT6357_DCXO_CW00_SET                 0x78a
+#define MT6357_DCXO_CW00_CLR                 0x78c
+#define MT6357_DCXO_CW01                     0x78e
+#define MT6357_DCXO_CW02                     0x790
+#define MT6357_DCXO_CW03                     0x792
+#define MT6357_DCXO_CW04                     0x794
+#define MT6357_DCXO_CW05                     0x796
+#define MT6357_DCXO_CW06                     0x798
+#define MT6357_DCXO_CW07                     0x79a
+#define MT6357_DCXO_CW08                     0x79c
+#define MT6357_DCXO_CW09                     0x79e
+#define MT6357_DCXO_CW10                     0x7a0
+#define MT6357_DCXO_CW11                     0x7a2
+#define MT6357_DCXO_CW11_SET                 0x7a4
+#define MT6357_DCXO_CW11_CLR                 0x7a6
+#define MT6357_DCXO_CW12                     0x7a8
+#define MT6357_DCXO_CW13                     0x7aa
+#define MT6357_DCXO_CW14                     0x7ac
+#define MT6357_DCXO_CW15                     0x7ae
+#define MT6357_DCXO_CW16                     0x7b0
+#define MT6357_DCXO_CW17                     0x7b2
+#define MT6357_DCXO_CW18                     0x7b4
+#define MT6357_DCXO_CW19                     0x7b6
+#define MT6357_DCXO_CW20                     0x7b8
+#define MT6357_DCXO_CW21                     0x7ba
+#define MT6357_DCXO_CW22                     0x7bc
+#define MT6357_DCXO_ELR_NUM                  0x7be
+#define MT6357_DCXO_ELR0                     0x7c0
+#define MT6357_PSC_TOP_ID                    0x900
+#define MT6357_PSC_TOP_REV0                  0x902
+#define MT6357_PSC_TOP_DBI                   0x904
+#define MT6357_PSC_TOP_DXI                   0x906
+#define MT6357_PSC_TPM0                      0x908
+#define MT6357_PSC_TPM1                      0x90a
+#define MT6357_PSC_TOP_RSTCTL_0              0x90c
+#define MT6357_PSC_TOP_INT_CON0              0x90e
+#define MT6357_PSC_TOP_INT_CON0_SET          0x910
+#define MT6357_PSC_TOP_INT_CON0_CLR          0x912
+#define MT6357_PSC_TOP_INT_MASK_CON0         0x914
+#define MT6357_PSC_TOP_INT_MASK_CON0_SET     0x916
+#define MT6357_PSC_TOP_INT_MASK_CON0_CLR     0x918
+#define MT6357_PSC_TOP_INT_STATUS0           0x91a
+#define MT6357_PSC_TOP_INT_RAW_STATUS0       0x91c
+#define MT6357_PSC_TOP_INT_MISC_CON          0x91e
+#define MT6357_PSC_TOP_INT_MISC_CON_SET      0x920
+#define MT6357_PSC_TOP_INT_MISC_CON_CLR      0x922
+#define MT6357_PSC_TOP_MON_CTL               0x924
+#define MT6357_STRUP_ID                      0x980
+#define MT6357_STRUP_REV0                    0x982
+#define MT6357_STRUP_DBI                     0x984
+#define MT6357_STRUP_DXI                     0x986
+#define MT6357_STRUP_ANA_CON0                0x988
+#define MT6357_STRUP_ANA_CON1                0x98a
+#define MT6357_STRUP_ANA_CON2                0x98c
+#define MT6357_STRUP_ELR_NUM                 0x98e
+#define MT6357_STRUP_ELR_0                   0x990
+#define MT6357_PSEQ_ID                       0xa00
+#define MT6357_PSEQ_REV0                     0xa02
+#define MT6357_PSEQ_DBI                      0xa04
+#define MT6357_PSEQ_DXI                      0xa06
+#define MT6357_PPCCTL0                       0xa08
+#define MT6357_PPCCTL1                       0xa0a
+#define MT6357_PPCCTL2                       0xa0c
+#define MT6357_PPCCFG0                       0xa0e
+#define MT6357_PPCTST0                       0xa10
+#define MT6357_PORFLAG                       0xa12
+#define MT6357_STRUP_CON0                    0xa14
+#define MT6357_STRUP_CON1                    0xa16
+#define MT6357_STRUP_CON2                    0xa18
+#define MT6357_STRUP_CON3                    0xa1a
+#define MT6357_STRUP_CON4                    0xa1c
+#define MT6357_STRUP_CON5                    0xa1e
+#define MT6357_STRUP_CON6                    0xa20
+#define MT6357_STRUP_CON7                    0xa22
+#define MT6357_CPSCFG0                       0xa24
+#define MT6357_STRUP_CON9                    0xa26
+#define MT6357_STRUP_CON10                   0xa28
+#define MT6357_STRUP_CON11                   0xa2a
+#define MT6357_STRUP_CON12                   0xa2c
+#define MT6357_STRUP_CON13                   0xa2e
+#define MT6357_STRUP_CON14                   0xa30
+#define MT6357_STRUP_CON15                   0xa32
+#define MT6357_STRUP_CON16                   0xa34
+#define MT6357_STRUP_CON19                   0xa36
+#define MT6357_PSEQ_ELR_NUM                  0xa38
+#define MT6357_PSEQ_ELR7                     0xa3a
+#define MT6357_PSEQ_ELR8                     0xa3c
+#define MT6357_PCHR_DIG_DSN_ID               0xa80
+#define MT6357_PCHR_DIG_DSN_REV0             0xa82
+#define MT6357_PCHR_DIG_DSN_DBI              0xa84
+#define MT6357_PCHR_DIG_DSN_DXI              0xa86
+#define MT6357_CHR_TOP_CON0                  0xa88
+#define MT6357_CHR_TOP_CON1                  0xa8a
+#define MT6357_CHR_TOP_CON2                  0xa8c
+#define MT6357_CHR_TOP_CON3                  0xa8e
+#define MT6357_CHR_TOP_CON4                  0xa90
+#define MT6357_CHR_TOP_CON5                  0xa92
+#define MT6357_CHR_TOP_CON6                  0xa94
+#define MT6357_PCHR_DIG_ELR_NUM              0xa96
+#define MT6357_PCHR_ELR0                     0xa98
+#define MT6357_PCHR_ELR1                     0xa9a
+#define MT6357_PCHR_MACRO_DSN_ID             0xb80
+#define MT6357_PCHR_MACRO_DSN_REV0           0xb82
+#define MT6357_PCHR_MACRO_DSN_DBI            0xb84
+#define MT6357_PCHR_MACRO_DSN_DXI            0xb86
+#define MT6357_CHR_CON0                      0xb88
+#define MT6357_CHR_CON1                      0xb8a
+#define MT6357_CHR_CON2                      0xb8c
+#define MT6357_CHR_CON3                      0xb8e
+#define MT6357_CHR_CON4                      0xb90
+#define MT6357_CHR_CON5                      0xb92
+#define MT6357_CHR_CON6                      0xb94
+#define MT6357_CHR_CON7                      0xb96
+#define MT6357_CHR_CON8                      0xb98
+#define MT6357_CHR_CON9                      0xb9a
+#define MT6357_BM_TOP_DSN_ID                 0xc00
+#define MT6357_BM_TOP_DSN_REV0               0xc02
+#define MT6357_BM_TOP_DBI                    0xc04
+#define MT6357_BM_TOP_DXI                    0xc06
+#define MT6357_BM_TPM0                       0xc08
+#define MT6357_BM_TPM1                       0xc0a
+#define MT6357_BM_TOP_CKPDN_CON0             0xc0c
+#define MT6357_BM_TOP_CKPDN_CON0_SET         0xc0e
+#define MT6357_BM_TOP_CKPDN_CON0_CLR         0xc10
+#define MT6357_BM_TOP_CKSEL_CON0             0xc12
+#define MT6357_BM_TOP_CKSEL_CON0_SET         0xc14
+#define MT6357_BM_TOP_CKSEL_CON0_CLR         0xc16
+#define MT6357_BM_TOP_CKTST_CON0             0xc18
+#define MT6357_BM_TOP_RST_CON0               0xc1a
+#define MT6357_BM_TOP_RST_CON0_SET           0xc1c
+#define MT6357_BM_TOP_RST_CON0_CLR           0xc1e
+#define MT6357_BM_TOP_INT_CON0               0xc20
+#define MT6357_BM_TOP_INT_CON0_SET           0xc22
+#define MT6357_BM_TOP_INT_CON0_CLR           0xc24
+#define MT6357_BM_TOP_INT_CON1               0xc26
+#define MT6357_BM_TOP_INT_CON1_SET           0xc28
+#define MT6357_BM_TOP_INT_CON1_CLR           0xc2a
+#define MT6357_BM_TOP_INT_MASK_CON0          0xc2c
+#define MT6357_BM_TOP_INT_MASK_CON0_SET      0xc2e
+#define MT6357_BM_TOP_INT_MASK_CON0_CLR      0xc30
+#define MT6357_BM_TOP_INT_MASK_CON1          0xc32
+#define MT6357_BM_TOP_INT_MASK_CON1_SET      0xc34
+#define MT6357_BM_TOP_INT_MASK_CON1_CLR      0xc36
+#define MT6357_BM_TOP_INT_STATUS0            0xc38
+#define MT6357_BM_TOP_INT_STATUS1            0xc3a
+#define MT6357_BM_TOP_INT_RAW_STATUS0        0xc3c
+#define MT6357_BM_TOP_INT_RAW_STATUS1        0xc3e
+#define MT6357_BM_TOP_INT_MISC_CON           0xc40
+#define MT6357_BM_TOP_DBG_CON                0xc42
+#define MT6357_BM_TOP_RSV0                   0xc44
+#define MT6357_FGADC_ANA_DSN_ID              0xc80
+#define MT6357_FGADC_ANA_DSN_REV0            0xc82
+#define MT6357_FGADC_ANA_DSN_DBI             0xc84
+#define MT6357_FGADC_ANA_DSN_DXI             0xc86
+#define MT6357_FGADC_ANA_CON0                0xc88
+#define MT6357_FGADC_ANA_TEST_CON0           0xc8a
+#define MT6357_FGADC_ANA_ELR_NUM             0xc8c
+#define MT6357_FGADC_ANA_ELR0                0xc8e
+#define MT6357_FGADC_ANA_ELR1                0xc90
+#define MT6357_FGADC0_DSN_ID                 0xd00
+#define MT6357_FGADC0_DSN_REV0               0xd02
+#define MT6357_FGADC0_DSN_DBI                0xd04
+#define MT6357_FGADC0_DSN_DXI                0xd06
+#define MT6357_FGADC_CON0                    0xd08
+#define MT6357_FGADC_CON1                    0xd0a
+#define MT6357_FGADC_CON2                    0xd0c
+#define MT6357_FGADC_CON3                    0xd0e
+#define MT6357_FGADC_CON4                    0xd10
+#define MT6357_FGADC_CAR_CON0                0xd12
+#define MT6357_FGADC_CAR_CON1                0xd14
+#define MT6357_FGADC_CAR_CON2                0xd16
+#define MT6357_FGADC_CARTH_CON0              0xd18
+#define MT6357_FGADC_CARTH_CON1              0xd1a
+#define MT6357_FGADC_CARTH_CON2              0xd1c
+#define MT6357_FGADC_CARTH_CON3              0xd1e
+#define MT6357_FGADC_NTER_CON0               0xd20
+#define MT6357_FGADC_NTER_CON1               0xd22
+#define MT6357_FGADC_NTER_CON2               0xd24
+#define MT6357_FGADC_SON_CON0                0xd26
+#define MT6357_FGADC_SON_CON1                0xd28
+#define MT6357_FGADC_SON_CON2                0xd2a
+#define MT6357_FGADC_SON_CON3                0xd2c
+#define MT6357_FGADC_ZCV_CON0                0xd2e
+#define MT6357_FGADC_ZCV_CON1                0xd30
+#define MT6357_FGADC_ZCV_CON2                0xd32
+#define MT6357_FGADC_ZCV_CON3                0xd34
+#define MT6357_FGADC_ZCV_CON4                0xd36
+#define MT6357_FGADC_ZCVTH_CON0              0xd38
+#define MT6357_FGADC_ZCVTH_CON1              0xd3a
+#define MT6357_FGADC_ZCVTH_CON2              0xd3c
+#define MT6357_FGADC1_DSN_ID                 0xd80
+#define MT6357_FGADC1_DSN_REV0               0xd82
+#define MT6357_FGADC1_DSN_DBI                0xd84
+#define MT6357_FGADC1_DSN_DXI                0xd86
+#define MT6357_FGADC_R_CON0                  0xd88
+#define MT6357_FGADC_CUR_CON0                0xd8a
+#define MT6357_FGADC_CUR_CON1                0xd8c
+#define MT6357_FGADC_CUR_CON2                0xd8e
+#define MT6357_FGADC_CUR_CON3                0xd90
+#define MT6357_FGADC_OFFSET_CON0             0xd92
+#define MT6357_FGADC_OFFSET_CON1             0xd94
+#define MT6357_FGADC_GAIN_CON0               0xd96
+#define MT6357_FGADC_TEST_CON0               0xd98
+#define MT6357_SYSTEM_INFO_CON0              0xd9a
+#define MT6357_SYSTEM_INFO_CON1              0xd9c
+#define MT6357_SYSTEM_INFO_CON2              0xd9e
+#define MT6357_SYSTEM_INFO_CON3              0xda0
+#define MT6357_SYSTEM_INFO_CON4              0xda2
+#define MT6357_BATON_ANA_DSN_ID              0xe00
+#define MT6357_BATON_ANA_DSN_REV0            0xe02
+#define MT6357_BATON_ANA_DSN_DBI             0xe04
+#define MT6357_BATON_ANA_DSN_DXI             0xe06
+#define MT6357_BATON_ANA_CON0                0xe08
+#define MT6357_BATON_ANA_ELR_NUM             0xe0a
+#define MT6357_BATON_ANA_ELR0                0xe0c
+#define MT6357_HK_TOP_ID                     0xf80
+#define MT6357_HK_TOP_REV0                   0xf82
+#define MT6357_HK_TOP_DBI                    0xf84
+#define MT6357_HK_TOP_DXI                    0xf86
+#define MT6357_HK_TPM0                       0xf88
+#define MT6357_HK_TPM1                       0xf8a
+#define MT6357_HK_TOP_CLK_CON0               0xf8c
+#define MT6357_HK_TOP_CLK_CON1               0xf8e
+#define MT6357_HK_TOP_RST_CON0               0xf90
+#define MT6357_HK_TOP_INT_CON0               0xf92
+#define MT6357_HK_TOP_INT_CON0_SET           0xf94
+#define MT6357_HK_TOP_INT_CON0_CLR           0xf96
+#define MT6357_HK_TOP_INT_MASK_CON0          0xf98
+#define MT6357_HK_TOP_INT_MASK_CON0_SET      0xf9a
+#define MT6357_HK_TOP_INT_MASK_CON0_CLR      0xf9c
+#define MT6357_HK_TOP_INT_STATUS0            0xf9e
+#define MT6357_HK_TOP_INT_RAW_STATUS0        0xfa0
+#define MT6357_HK_TOP_MON_CON0               0xfa2
+#define MT6357_HK_TOP_MON_CON1               0xfa4
+#define MT6357_HK_TOP_MON_CON2               0xfa6
+#define MT6357_AUXADC_DSN_ID                 0x1000
+#define MT6357_AUXADC_DSN_REV0               0x1002
+#define MT6357_AUXADC_DSN_DBI                0x1004
+#define MT6357_AUXADC_DSN_DXI                0x1006
+#define MT6357_AUXADC_ANA_CON0               0x1008
+#define MT6357_AUXADC_DIG_1_DSN_ID           0x1080
+#define MT6357_AUXADC_DIG_1_DSN_REV0         0x1082
+#define MT6357_AUXADC_DIG_1_DSN_DBI          0x1084
+#define MT6357_AUXADC_DIG_1_DSN_DXI          0x1086
+#define MT6357_AUXADC_ADC0                   0x1088
+#define MT6357_AUXADC_ADC1                   0x108a
+#define MT6357_AUXADC_ADC2                   0x108c
+#define MT6357_AUXADC_ADC3                   0x108e
+#define MT6357_AUXADC_ADC4                   0x1090
+#define MT6357_AUXADC_ADC5                   0x1092
+#define MT6357_AUXADC_ADC6                   0x1094
+#define MT6357_AUXADC_ADC7                   0x1096
+#define MT6357_AUXADC_ADC8                   0x1098
+#define MT6357_AUXADC_ADC9                   0x109a
+#define MT6357_AUXADC_ADC10                  0x109c
+#define MT6357_AUXADC_ADC11                  0x109e
+#define MT6357_AUXADC_ADC12                  0x10a0
+#define MT6357_AUXADC_ADC14                  0x10a2
+#define MT6357_AUXADC_ADC16                  0x10a4
+#define MT6357_AUXADC_ADC17                  0x10a6
+#define MT6357_AUXADC_ADC18                  0x10a8
+#define MT6357_AUXADC_ADC19                  0x10aa
+#define MT6357_AUXADC_ADC20                  0x10ac
+#define MT6357_AUXADC_ADC21                  0x10ae
+#define MT6357_AUXADC_ADC22                  0x10b0
+#define MT6357_AUXADC_ADC23                  0x10b2
+#define MT6357_AUXADC_ADC24                  0x10b4
+#define MT6357_AUXADC_ADC25                  0x10b6
+#define MT6357_AUXADC_ADC26                  0x10b8
+#define MT6357_AUXADC_ADC27                  0x10ba
+#define MT6357_AUXADC_ADC29                  0x10bc
+#define MT6357_AUXADC_ADC30                  0x10be
+#define MT6357_AUXADC_ADC31                  0x10c0
+#define MT6357_AUXADC_ADC32                  0x10c2
+#define MT6357_AUXADC_ADC33                  0x10c4
+#define MT6357_AUXADC_ADC34                  0x10c6
+#define MT6357_AUXADC_ADC35                  0x10c8
+#define MT6357_AUXADC_ADC36                  0x10ca
+#define MT6357_AUXADC_ADC38                  0x10cc
+#define MT6357_AUXADC_ADC39                  0x10ce
+#define MT6357_AUXADC_ADC40                  0x10d0
+#define MT6357_AUXADC_ADC41                  0x10d2
+#define MT6357_AUXADC_ADC42                  0x10d4
+#define MT6357_AUXADC_ADC43                  0x10d6
+#define MT6357_AUXADC_ADC46                  0x10d8
+#define MT6357_AUXADC_ADC47                  0x10da
+#define MT6357_AUXADC_DIG_1_ELR_NUM          0x10dc
+#define MT6357_AUXADC_DIG_1_ELR0             0x10de
+#define MT6357_AUXADC_DIG_1_ELR1             0x10e0
+#define MT6357_AUXADC_DIG_2_DSN_ID           0x1100
+#define MT6357_AUXADC_DIG_2_DSN_REV0         0x1102
+#define MT6357_AUXADC_DIG_2_DSN_DBI          0x1104
+#define MT6357_AUXADC_DIG_2_DSN_DXI          0x1106
+#define MT6357_AUXADC_STA0                   0x1108
+#define MT6357_AUXADC_STA1                   0x110a
+#define MT6357_AUXADC_STA2                   0x110c
+#define MT6357_AUXADC_RQST0                  0x110e
+#define MT6357_AUXADC_RQST0_SET              0x1110
+#define MT6357_AUXADC_RQST0_CLR              0x1112
+#define MT6357_AUXADC_RQST2                  0x1114
+#define MT6357_AUXADC_RQST2_SET              0x1116
+#define MT6357_AUXADC_RQST2_CLR              0x1118
+#define MT6357_AUXADC_RQST1                  0x111a
+#define MT6357_AUXADC_RQST1_SET              0x111c
+#define MT6357_AUXADC_RQST1_CLR              0x111e
+#define MT6357_AUXADC_CON0                   0x1120
+#define MT6357_AUXADC_CON0_SET               0x1122
+#define MT6357_AUXADC_CON0_CLR               0x1124
+#define MT6357_AUXADC_CON1                   0x1126
+#define MT6357_AUXADC_CON2                   0x1128
+#define MT6357_AUXADC_CON3                   0x112a
+#define MT6357_AUXADC_CON4                   0x112c
+#define MT6357_AUXADC_CON5                   0x112e
+#define MT6357_AUXADC_CON6                   0x1130
+#define MT6357_AUXADC_CON7                   0x1132
+#define MT6357_AUXADC_CON8                   0x1134
+#define MT6357_AUXADC_CON9                   0x1136
+#define MT6357_AUXADC_CON10                  0x1138
+#define MT6357_AUXADC_CON11                  0x113a
+#define MT6357_AUXADC_CON12                  0x113c
+#define MT6357_AUXADC_CON13                  0x113e
+#define MT6357_AUXADC_CON14                  0x1140
+#define MT6357_AUXADC_CON15                  0x1142
+#define MT6357_AUXADC_CON16                  0x1144
+#define MT6357_AUXADC_CON17                  0x1146
+#define MT6357_AUXADC_CON18                  0x1148
+#define MT6357_AUXADC_CON19                  0x114a
+#define MT6357_AUXADC_CON20                  0x114c
+#define MT6357_AUXADC_DIG_3_DSN_ID           0x1180
+#define MT6357_AUXADC_DIG_3_DSN_REV0         0x1182
+#define MT6357_AUXADC_DIG_3_DSN_DBI          0x1184
+#define MT6357_AUXADC_DIG_3_DSN_DXI          0x1186
+#define MT6357_AUXADC_AUTORPT0               0x1188
+#define MT6357_AUXADC_LBAT0                  0x118a
+#define MT6357_AUXADC_LBAT1                  0x118c
+#define MT6357_AUXADC_LBAT2                  0x118e
+#define MT6357_AUXADC_LBAT3                  0x1190
+#define MT6357_AUXADC_LBAT4                  0x1192
+#define MT6357_AUXADC_LBAT5                  0x1194
+#define MT6357_AUXADC_LBAT6                  0x1196
+#define MT6357_AUXADC_ACCDET                 0x1198
+#define MT6357_AUXADC_DBG0                   0x119a
+#define MT6357_AUXADC_IMP0                   0x119c
+#define MT6357_AUXADC_IMP1                   0x119e
+#define MT6357_AUXADC_DIG_3_ELR_NUM          0x11a0
+#define MT6357_AUXADC_DIG_3_ELR0             0x11a2
+#define MT6357_AUXADC_DIG_3_ELR1             0x11a4
+#define MT6357_AUXADC_DIG_3_ELR2             0x11a6
+#define MT6357_AUXADC_DIG_3_ELR3             0x11a8
+#define MT6357_AUXADC_DIG_3_ELR4             0x11aa
+#define MT6357_AUXADC_DIG_3_ELR5             0x11ac
+#define MT6357_AUXADC_DIG_3_ELR6             0x11ae
+#define MT6357_AUXADC_DIG_3_ELR7             0x11b0
+#define MT6357_AUXADC_DIG_3_ELR8             0x11b2
+#define MT6357_AUXADC_DIG_3_ELR9             0x11b4
+#define MT6357_AUXADC_DIG_3_ELR10            0x11b6
+#define MT6357_AUXADC_DIG_3_ELR11            0x11b8
+#define MT6357_AUXADC_DIG_4_DSN_ID           0x1200
+#define MT6357_AUXADC_DIG_4_DSN_REV0         0x1202
+#define MT6357_AUXADC_DIG_4_DSN_DBI          0x1204
+#define MT6357_AUXADC_DIG_4_DSN_DXI          0x1206
+#define MT6357_AUXADC_MDRT_0                 0x1208
+#define MT6357_AUXADC_MDRT_1                 0x120a
+#define MT6357_AUXADC_MDRT_2                 0x120c
+#define MT6357_AUXADC_MDRT_3                 0x120e
+#define MT6357_AUXADC_MDRT_4                 0x1210
+#define MT6357_AUXADC_DCXO_MDRT_0            0x1212
+#define MT6357_AUXADC_DCXO_MDRT_1            0x1214
+#define MT6357_AUXADC_DCXO_MDRT_2            0x1216
+#define MT6357_AUXADC_NAG_0                  0x1218
+#define MT6357_AUXADC_NAG_1                  0x121a
+#define MT6357_AUXADC_NAG_2                  0x121c
+#define MT6357_AUXADC_NAG_3                  0x121e
+#define MT6357_AUXADC_NAG_4                  0x1220
+#define MT6357_AUXADC_NAG_5                  0x1222
+#define MT6357_AUXADC_NAG_6                  0x1224
+#define MT6357_AUXADC_NAG_7                  0x1226
+#define MT6357_AUXADC_NAG_8                  0x1228
+#define MT6357_AUXADC_RSV_1                  0x122a
+#define MT6357_AUXADC_ANA_0                  0x122c
+#define MT6357_AUXADC_IMP_CG0                0x122e
+#define MT6357_AUXADC_LBAT_CG0               0x1230
+#define MT6357_AUXADC_NAG_CG0                0x1232
+#define MT6357_AUXADC_PRI_NEW                0x1234
+#define MT6357_AUXADC_CHR_TOP_CON2           0x1236
+#define MT6357_BUCK_TOP_DSN_ID               0x1400
+#define MT6357_BUCK_TOP_DSN_REV0             0x1402
+#define MT6357_BUCK_TOP_DBI                  0x1404
+#define MT6357_BUCK_TOP_DXI                  0x1406
+#define MT6357_BUCK_TOP_PAM0                 0x1408
+#define MT6357_BUCK_TOP_PAM1                 0x140a
+#define MT6357_BUCK_TOP_CLK_CON0             0x140c
+#define MT6357_BUCK_TOP_CLK_CON0_SET         0x140e
+#define MT6357_BUCK_TOP_CLK_CON0_CLR         0x1410
+#define MT6357_BUCK_TOP_CLK_HWEN_CON0        0x1412
+#define MT6357_BUCK_TOP_CLK_HWEN_CON0_SET    0x1414
+#define MT6357_BUCK_TOP_CLK_HWEN_CON0_CLR    0x1416
+#define MT6357_BUCK_TOP_CLK_MISC_CON0        0x1418
+#define MT6357_BUCK_TOP_INT_CON0             0x141a
+#define MT6357_BUCK_TOP_INT_CON0_SET         0x141c
+#define MT6357_BUCK_TOP_INT_CON0_CLR         0x141e
+#define MT6357_BUCK_TOP_INT_MASK_CON0        0x1420
+#define MT6357_BUCK_TOP_INT_MASK_CON0_SET    0x1422
+#define MT6357_BUCK_TOP_INT_MASK_CON0_CLR    0x1424
+#define MT6357_BUCK_TOP_INT_STATUS0          0x1426
+#define MT6357_BUCK_TOP_INT_RAW_STATUS0      0x1428
+#define MT6357_BUCK_TOP_STB_CON              0x142a
+#define MT6357_BUCK_TOP_SLP_CON0             0x142c
+#define MT6357_BUCK_TOP_SLP_CON1             0x142e
+#define MT6357_BUCK_TOP_SLP_CON2             0x1430
+#define MT6357_BUCK_TOP_MINFREQ_CON          0x1432
+#define MT6357_BUCK_TOP_OC_CON0              0x1434
+#define MT6357_BUCK_TOP_K_CON0               0x1436
+#define MT6357_BUCK_TOP_K_CON1               0x1438
+#define MT6357_BUCK_TOP_K_CON2               0x143a
+#define MT6357_BUCK_TOP_WDTDBG0              0x143c
+#define MT6357_BUCK_TOP_WDTDBG1              0x143e
+#define MT6357_BUCK_TOP_WDTDBG2              0x1440
+#define MT6357_BUCK_TOP_ELR_NUM              0x1442
+#define MT6357_BUCK_TOP_ELR0                 0x1444
+#define MT6357_BUCK_TOP_ELR1                 0x1446
+#define MT6357_BUCK_VPROC_DSN_ID             0x1480
+#define MT6357_BUCK_VPROC_DSN_REV0           0x1482
+#define MT6357_BUCK_VPROC_DSN_DBI            0x1484
+#define MT6357_BUCK_VPROC_DSN_DXI            0x1486
+#define MT6357_BUCK_VPROC_CON0               0x1488
+#define MT6357_BUCK_VPROC_CON1               0x148a
+#define MT6357_BUCK_VPROC_CFG0               0x148c
+#define MT6357_BUCK_VPROC_CFG1               0x148e
+#define MT6357_BUCK_VPROC_OP_EN              0x1490
+#define MT6357_BUCK_VPROC_OP_EN_SET          0x1492
+#define MT6357_BUCK_VPROC_OP_EN_CLR          0x1494
+#define MT6357_BUCK_VPROC_OP_CFG             0x1496
+#define MT6357_BUCK_VPROC_OP_CFG_SET         0x1498
+#define MT6357_BUCK_VPROC_OP_CFG_CLR         0x149a
+#define MT6357_BUCK_VPROC_SP_CON             0x149c
+#define MT6357_BUCK_VPROC_SP_CFG             0x149e
+#define MT6357_BUCK_VPROC_OC_CFG             0x14a0
+#define MT6357_BUCK_VPROC_DBG0               0x14a2
+#define MT6357_BUCK_VPROC_DBG1               0x14a4
+#define MT6357_BUCK_VPROC_DBG2               0x14a6
+#define MT6357_BUCK_VPROC_ELR_NUM            0x14a8
+#define MT6357_BUCK_VPROC_ELR0               0x14aa
+#define MT6357_BUCK_VCORE_DSN_ID             0x1500
+#define MT6357_BUCK_VCORE_DSN_REV0           0x1502
+#define MT6357_BUCK_VCORE_DSN_DBI            0x1504
+#define MT6357_BUCK_VCORE_DSN_DXI            0x1506
+#define MT6357_BUCK_VCORE_CON0               0x1508
+#define MT6357_BUCK_VCORE_CON1               0x150a
+#define MT6357_BUCK_VCORE_CFG0               0x150c
+#define MT6357_BUCK_VCORE_CFG1               0x150e
+#define MT6357_BUCK_VCORE_OP_EN              0x1510
+#define MT6357_BUCK_VCORE_OP_EN_SET          0x1512
+#define MT6357_BUCK_VCORE_OP_EN_CLR          0x1514
+#define MT6357_BUCK_VCORE_OP_CFG             0x1516
+#define MT6357_BUCK_VCORE_OP_CFG_SET         0x1518
+#define MT6357_BUCK_VCORE_OP_CFG_CLR         0x151a
+#define MT6357_BUCK_VCORE_SP_CON             0x151c
+#define MT6357_BUCK_VCORE_SP_CFG             0x151e
+#define MT6357_BUCK_VCORE_OC_CFG             0x1520
+#define MT6357_BUCK_VCORE_DBG0               0x1522
+#define MT6357_BUCK_VCORE_DBG1               0x1524
+#define MT6357_BUCK_VCORE_DBG2               0x1526
+#define MT6357_BUCK_VCORE_ELR_NUM            0x1528
+#define MT6357_BUCK_VCORE_ELR0               0x152a
+#define MT6357_BUCK_VMODEM_DSN_ID            0x1580
+#define MT6357_BUCK_VMODEM_DSN_REV0          0x1582
+#define MT6357_BUCK_VMODEM_DSN_DBI           0x1584
+#define MT6357_BUCK_VMODEM_DSN_DXI           0x1586
+#define MT6357_BUCK_VMODEM_CON0              0x1588
+#define MT6357_BUCK_VMODEM_CON1              0x158a
+#define MT6357_BUCK_VMODEM_CFG0              0x158c
+#define MT6357_BUCK_VMODEM_CFG1              0x158e
+#define MT6357_BUCK_VMODEM_OP_EN             0x1590
+#define MT6357_BUCK_VMODEM_OP_EN_SET         0x1592
+#define MT6357_BUCK_VMODEM_OP_EN_CLR         0x1594
+#define MT6357_BUCK_VMODEM_OP_CFG            0x1596
+#define MT6357_BUCK_VMODEM_OP_CFG_SET        0x1598
+#define MT6357_BUCK_VMODEM_OP_CFG_CLR        0x159a
+#define MT6357_BUCK_VMODEM_SP_CON            0x159c
+#define MT6357_BUCK_VMODEM_SP_CFG            0x159e
+#define MT6357_BUCK_VMODEM_OC_CFG            0x15a0
+#define MT6357_BUCK_VMODEM_DBG0              0x15a2
+#define MT6357_BUCK_VMODEM_DBG1              0x15a4
+#define MT6357_BUCK_VMODEM_DBG2              0x15a6
+#define MT6357_BUCK_VMODEM_ELR_NUM           0x15a8
+#define MT6357_BUCK_VMODEM_ELR0              0x15aa
+#define MT6357_BUCK_VS1_DSN_ID               0x1600
+#define MT6357_BUCK_VS1_DSN_REV0             0x1602
+#define MT6357_BUCK_VS1_DSN_DBI              0x1604
+#define MT6357_BUCK_VS1_DSN_DXI              0x1606
+#define MT6357_BUCK_VS1_CON0                 0x1608
+#define MT6357_BUCK_VS1_CON1                 0x160a
+#define MT6357_BUCK_VS1_CFG0                 0x160c
+#define MT6357_BUCK_VS1_CFG1                 0x160e
+#define MT6357_BUCK_VS1_OP_EN                0x1610
+#define MT6357_BUCK_VS1_OP_EN_SET            0x1612
+#define MT6357_BUCK_VS1_OP_EN_CLR            0x1614
+#define MT6357_BUCK_VS1_OP_CFG               0x1616
+#define MT6357_BUCK_VS1_OP_CFG_SET           0x1618
+#define MT6357_BUCK_VS1_OP_CFG_CLR           0x161a
+#define MT6357_BUCK_VS1_SP_CON               0x161c
+#define MT6357_BUCK_VS1_SP_CFG               0x161e
+#define MT6357_BUCK_VS1_OC_CFG               0x1620
+#define MT6357_BUCK_VS1_DBG0                 0x1622
+#define MT6357_BUCK_VS1_DBG1                 0x1624
+#define MT6357_BUCK_VS1_DBG2                 0x1626
+#define MT6357_BUCK_VS1_VOTER                0x1628
+#define MT6357_BUCK_VS1_VOTER_SET            0x162a
+#define MT6357_BUCK_VS1_VOTER_CLR            0x162c
+#define MT6357_BUCK_VS1_VOTER_CFG            0x162e
+#define MT6357_BUCK_VS1_ELR_NUM              0x1630
+#define MT6357_BUCK_VS1_ELR0                 0x1632
+#define MT6357_BUCK_VPA_DSN_ID               0x1680
+#define MT6357_BUCK_VPA_DSN_REV0             0x1682
+#define MT6357_BUCK_VPA_DSN_DBI              0x1684
+#define MT6357_BUCK_VPA_DSN_DXI              0x1686
+#define MT6357_BUCK_VPA_CON0                 0x1688
+#define MT6357_BUCK_VPA_CON1                 0x168a
+#define MT6357_BUCK_VPA_CFG0                 0x168c
+#define MT6357_BUCK_VPA_CFG1                 0x168e
+#define MT6357_BUCK_VPA_OC_CFG               0x1690
+#define MT6357_BUCK_VPA_DBG0                 0x1692
+#define MT6357_BUCK_VPA_DBG1                 0x1694
+#define MT6357_BUCK_VPA_DBG2                 0x1696
+#define MT6357_BUCK_VPA_DLC_CON0             0x1698
+#define MT6357_BUCK_VPA_DLC_CON1             0x169a
+#define MT6357_BUCK_VPA_DLC_CON2             0x169c
+#define MT6357_BUCK_VPA_MSFG_CON0            0x169e
+#define MT6357_BUCK_VPA_MSFG_CON1            0x16a0
+#define MT6357_BUCK_VPA_MSFG_RRATE0          0x16a2
+#define MT6357_BUCK_VPA_MSFG_RRATE1          0x16a4
+#define MT6357_BUCK_VPA_MSFG_RRATE2          0x16a6
+#define MT6357_BUCK_VPA_MSFG_RTHD0           0x16a8
+#define MT6357_BUCK_VPA_MSFG_RTHD1           0x16aa
+#define MT6357_BUCK_VPA_MSFG_RTHD2           0x16ac
+#define MT6357_BUCK_VPA_MSFG_FRATE0          0x16ae
+#define MT6357_BUCK_VPA_MSFG_FRATE1          0x16b0
+#define MT6357_BUCK_VPA_MSFG_FRATE2          0x16b2
+#define MT6357_BUCK_VPA_MSFG_FTHD0           0x16b4
+#define MT6357_BUCK_VPA_MSFG_FTHD1           0x16b6
+#define MT6357_BUCK_VPA_MSFG_FTHD2           0x16b8
+#define MT6357_BUCK_ANA_DSN_ID               0x1700
+#define MT6357_BUCK_ANA_DSN_REV0             0x1702
+#define MT6357_BUCK_ANA_DSN_DBI              0x1704
+#define MT6357_BUCK_ANA_DSN_FPI              0x1706
+#define MT6357_SMPS_ANA_CON0                 0x1708
+#define MT6357_SMPS_ANA_CON1                 0x170a
+#define MT6357_SMPS_ANA_CON2                 0x170c
+#define MT6357_VCORE_VPROC_ANA_CON0          0x170e
+#define MT6357_VCORE_VPROC_ANA_CON1          0x1710
+#define MT6357_VCORE_VPROC_ANA_CON2          0x1712
+#define MT6357_VCORE_VPROC_ANA_CON3          0x1714
+#define MT6357_VCORE_VPROC_ANA_CON4          0x1716
+#define MT6357_VCORE_VPROC_ANA_CON5          0x1718
+#define MT6357_VCORE_VPROC_ANA_CON6          0x171a
+#define MT6357_VCORE_VPROC_ANA_CON7          0x171c
+#define MT6357_VCORE_VPROC_ANA_CON8          0x171e
+#define MT6357_VCORE_VPROC_ANA_CON9          0x1720
+#define MT6357_VCORE_VPROC_ANA_CON10         0x1722
+#define MT6357_VCORE_VPROC_ANA_CON11         0x1724
+#define MT6357_VMODEM_ANA_CON0               0x1726
+#define MT6357_VMODEM_ANA_CON1               0x1728
+#define MT6357_VMODEM_ANA_CON2               0x172a
+#define MT6357_VMODEM_ANA_CON3               0x172c
+#define MT6357_VMODEM_ANA_CON4               0x172e
+#define MT6357_VMODEM_ANA_CON5               0x1730
+#define MT6357_VS1_ANA_CON0                  0x1732
+#define MT6357_VS1_ANA_CON1                  0x1734
+#define MT6357_VS1_ANA_CON2                  0x1736
+#define MT6357_VS1_ANA_CON3                  0x1738
+#define MT6357_VS1_ANA_CON4                  0x173a
+#define MT6357_VS1_ANA_CON5                  0x173c
+#define MT6357_VPA_ANA_CON0                  0x173e
+#define MT6357_VPA_ANA_CON1                  0x1740
+#define MT6357_VPA_ANA_CON2                  0x1742
+#define MT6357_VPA_ANA_CON3                  0x1744
+#define MT6357_VPA_ANA_CON4                  0x1746
+#define MT6357_VPA_ANA_CON5                  0x1748
+#define MT6357_BUCK_ANA_ELR_NUM              0x174a
+#define MT6357_SMPS_ELR_0                    0x174c
+#define MT6357_SMPS_ELR_1                    0x174e
+#define MT6357_SMPS_ELR_2                    0x1750
+#define MT6357_SMPS_ELR_3                    0x1752
+#define MT6357_SMPS_ELR_4                    0x1754
+#define MT6357_SMPS_ELR_5                    0x1756
+#define MT6357_VCORE_VPROC_ELR_0             0x1758
+#define MT6357_VCORE_VPROC_ELR_1             0x175a
+#define MT6357_VCORE_VPROC_ELR_2             0x175c
+#define MT6357_VCORE_VPROC_ELR_3             0x175e
+#define MT6357_VCORE_VPROC_ELR_4             0x1760
+#define MT6357_VMODEM_ELR_0                  0x1762
+#define MT6357_VMODEM_ELR_1                  0x1764
+#define MT6357_VMODEM_ELR_2                  0x1766
+#define MT6357_VS1_ELR_0                     0x1768
+#define MT6357_VS1_ELR_1                     0x176a
+#define MT6357_VPA_ELR_0                     0x176c
+#define MT6357_LDO_TOP_ID                    0x1880
+#define MT6357_LDO_TOP_REV0                  0x1882
+#define MT6357_LDO_TOP_DBI                   0x1884
+#define MT6357_LDO_TOP_DXI                   0x1886
+#define MT6357_LDO_TPM0                      0x1888
+#define MT6357_LDO_TPM1                      0x188a
+#define MT6357_LDO_TOP_CLK_DCM_CON0          0x188c
+#define MT6357_LDO_TOP_CLK_VIO28_CON0        0x188e
+#define MT6357_LDO_TOP_CLK_VIO18_CON0        0x1890
+#define MT6357_LDO_TOP_CLK_VAUD28_CON0       0x1892
+#define MT6357_LDO_TOP_CLK_VDRAM_CON0        0x1894
+#define MT6357_LDO_TOP_CLK_VSRAM_PROC_CON0   0x1896
+#define MT6357_LDO_TOP_CLK_VSRAM_OTHERS_CON0 0x1898
+#define MT6357_LDO_TOP_CLK_VAUX18_CON0       0x189a
+#define MT6357_LDO_TOP_CLK_VUSB33_CON0       0x189c
+#define MT6357_LDO_TOP_CLK_VEMC_CON0         0x189e
+#define MT6357_LDO_TOP_CLK_VXO22_CON0        0x18a0
+#define MT6357_LDO_TOP_CLK_VSIM1_CON0        0x18a2
+#define MT6357_LDO_TOP_CLK_VSIM2_CON0        0x18a4
+#define MT6357_LDO_TOP_CLK_VCAMD_CON0        0x18a6
+#define MT6357_LDO_TOP_CLK_VCAMIO_CON0       0x18a8
+#define MT6357_LDO_TOP_CLK_VEFUSE_CON0       0x18aa
+#define MT6357_LDO_TOP_CLK_VCN33_CON0        0x18ac
+#define MT6357_LDO_TOP_CLK_VCN18_CON0        0x18ae
+#define MT6357_LDO_TOP_CLK_VCN28_CON0        0x18b0
+#define MT6357_LDO_TOP_CLK_VIBR_CON0         0x18b2
+#define MT6357_LDO_TOP_CLK_VFE28_CON0        0x18b4
+#define MT6357_LDO_TOP_CLK_VMCH_CON0         0x18b6
+#define MT6357_LDO_TOP_CLK_VMC_CON0          0x18b8
+#define MT6357_LDO_TOP_CLK_VRF18_CON0        0x18ba
+#define MT6357_LDO_TOP_CLK_VLDO28_CON0       0x18bc
+#define MT6357_LDO_TOP_CLK_VRF12_CON0        0x18be
+#define MT6357_LDO_TOP_CLK_VCAMA_CON0        0x18c0
+#define MT6357_LDO_TOP_CLK_TREF_CON0         0x18c2
+#define MT6357_LDO_TOP_INT_CON0              0x18c4
+#define MT6357_LDO_TOP_INT_CON0_SET          0x18c6
+#define MT6357_LDO_TOP_INT_CON0_CLR          0x18c8
+#define MT6357_LDO_TOP_INT_CON1              0x18ca
+#define MT6357_LDO_TOP_INT_CON1_SET          0x18cc
+#define MT6357_LDO_TOP_INT_CON1_CLR          0x18ce
+#define MT6357_LDO_TOP_INT_MASK_CON0         0x18d0
+#define MT6357_LDO_TOP_INT_MASK_CON0_SET     0x18d2
+#define MT6357_LDO_TOP_INT_MASK_CON0_CLR     0x18d4
+#define MT6357_LDO_TOP_INT_MASK_CON1         0x18d6
+#define MT6357_LDO_TOP_INT_MASK_CON1_SET     0x18d8
+#define MT6357_LDO_TOP_INT_MASK_CON1_CLR     0x18da
+#define MT6357_LDO_TOP_INT_STATUS0           0x18dc
+#define MT6357_LDO_TOP_INT_STATUS1           0x18de
+#define MT6357_LDO_TOP_INT_RAW_STATUS0       0x18e0
+#define MT6357_LDO_TOP_INT_RAW_STATUS1       0x18e2
+#define MT6357_LDO_TEST_CON0                 0x18e4
+#define MT6357_LDO_TOP_WDT_CON0              0x18e6
+#define MT6357_LDO_TOP_RSV_CON0              0x18e8
+#define MT6357_LDO_TOP_RSV_CON1              0x18ea
+#define MT6357_LDO_OCFB0                     0x18ec
+#define MT6357_LDO_LP_PROTECTION             0x18ee
+#define MT6357_LDO_DUMMY_LOAD_GATED          0x18f0
+#define MT6357_LDO_GON0_DSN_ID               0x1900
+#define MT6357_LDO_GON0_DSN_REV0             0x1902
+#define MT6357_LDO_GON0_DSN_DBI              0x1904
+#define MT6357_LDO_GON0_DSN_DXI              0x1906
+#define MT6357_LDO_VXO22_CON0                0x1908
+#define MT6357_LDO_VXO22_OP_EN               0x190a
+#define MT6357_LDO_VXO22_OP_EN_SET           0x190c
+#define MT6357_LDO_VXO22_OP_EN_CLR           0x190e
+#define MT6357_LDO_VXO22_OP_CFG              0x1910
+#define MT6357_LDO_VXO22_OP_CFG_SET          0x1912
+#define MT6357_LDO_VXO22_OP_CFG_CLR          0x1914
+#define MT6357_LDO_VXO22_CON1                0x1916
+#define MT6357_LDO_VXO22_CON2                0x1918
+#define MT6357_LDO_VXO22_CON3                0x191a
+#define MT6357_LDO_VAUX18_CON0               0x191c
+#define MT6357_LDO_VAUX18_OP_EN              0x191e
+#define MT6357_LDO_VAUX18_OP_EN_SET          0x1920
+#define MT6357_LDO_VAUX18_OP_EN_CLR          0x1922
+#define MT6357_LDO_VAUX18_OP_CFG             0x1924
+#define MT6357_LDO_VAUX18_OP_CFG_SET         0x1926
+#define MT6357_LDO_VAUX18_OP_CFG_CLR         0x1928
+#define MT6357_LDO_VAUX18_CON1               0x192a
+#define MT6357_LDO_VAUX18_CON2               0x192c
+#define MT6357_LDO_VAUX18_CON3               0x192e
+#define MT6357_LDO_VAUD28_CON0               0x1930
+#define MT6357_LDO_VAUD28_OP_EN              0x1932
+#define MT6357_LDO_VAUD28_OP_EN_SET          0x1934
+#define MT6357_LDO_VAUD28_OP_EN_CLR          0x1936
+#define MT6357_LDO_VAUD28_OP_CFG             0x1938
+#define MT6357_LDO_VAUD28_OP_CFG_SET         0x193a
+#define MT6357_LDO_VAUD28_OP_CFG_CLR         0x193c
+#define MT6357_LDO_VAUD28_CON1               0x193e
+#define MT6357_LDO_VAUD28_CON2               0x1940
+#define MT6357_LDO_VAUD28_CON3               0x1942
+#define MT6357_LDO_VIO28_CON0                0x1944
+#define MT6357_LDO_VIO28_OP_EN               0x1946
+#define MT6357_LDO_VIO28_OP_EN_SET           0x1948
+#define MT6357_LDO_VIO28_OP_EN_CLR           0x194a
+#define MT6357_LDO_VIO28_OP_CFG              0x194c
+#define MT6357_LDO_VIO28_OP_CFG_SET          0x194e
+#define MT6357_LDO_VIO28_OP_CFG_CLR          0x1950
+#define MT6357_LDO_VIO28_CON1                0x1952
+#define MT6357_LDO_VIO28_CON2                0x1954
+#define MT6357_LDO_VIO28_CON3                0x1956
+#define MT6357_LDO_VIO18_CON0                0x1958
+#define MT6357_LDO_VIO18_OP_EN               0x195a
+#define MT6357_LDO_VIO18_OP_EN_SET           0x195c
+#define MT6357_LDO_VIO18_OP_EN_CLR           0x195e
+#define MT6357_LDO_VIO18_OP_CFG              0x1960
+#define MT6357_LDO_VIO18_OP_CFG_SET          0x1962
+#define MT6357_LDO_VIO18_OP_CFG_CLR          0x1964
+#define MT6357_LDO_VIO18_CON1                0x1966
+#define MT6357_LDO_VIO18_CON2                0x1968
+#define MT6357_LDO_VIO18_CON3                0x196a
+#define MT6357_LDO_VDRAM_CON0                0x196c
+#define MT6357_LDO_VDRAM_OP_EN               0x196e
+#define MT6357_LDO_VDRAM_OP_EN_SET           0x1970
+#define MT6357_LDO_VDRAM_OP_EN_CLR           0x1972
+#define MT6357_LDO_VDRAM_OP_CFG              0x1974
+#define MT6357_LDO_VDRAM_OP_CFG_SET          0x1976
+#define MT6357_LDO_VDRAM_OP_CFG_CLR          0x1978
+#define MT6357_LDO_VDRAM_CON1                0x197a
+#define MT6357_LDO_VDRAM_CON2                0x197c
+#define MT6357_LDO_VDRAM_CON3                0x197e
+#define MT6357_LDO_GON1_DSN_ID               0x1980
+#define MT6357_LDO_GON1_DSN_REV0             0x1982
+#define MT6357_LDO_GON1_DSN_DBI              0x1984
+#define MT6357_LDO_GON1_DSN_DXI              0x1986
+#define MT6357_LDO_VEMC_CON0                 0x1988
+#define MT6357_LDO_VEMC_OP_EN                0x198a
+#define MT6357_LDO_VEMC_OP_EN_SET            0x198c
+#define MT6357_LDO_VEMC_OP_EN_CLR            0x198e
+#define MT6357_LDO_VEMC_OP_CFG               0x1990
+#define MT6357_LDO_VEMC_OP_CFG_SET           0x1992
+#define MT6357_LDO_VEMC_OP_CFG_CLR           0x1994
+#define MT6357_LDO_VEMC_CON1                 0x1996
+#define MT6357_LDO_VEMC_CON2                 0x1998
+#define MT6357_LDO_VEMC_CON3                 0x199a
+#define MT6357_LDO_VUSB33_CON0_0             0x199c
+#define MT6357_LDO_VUSB33_OP_EN              0x199e
+#define MT6357_LDO_VUSB33_OP_EN_SET          0x19a0
+#define MT6357_LDO_VUSB33_OP_EN_CLR          0x19a2
+#define MT6357_LDO_VUSB33_OP_CFG             0x19a4
+#define MT6357_LDO_VUSB33_OP_CFG_SET         0x19a6
+#define MT6357_LDO_VUSB33_OP_CFG_CLR         0x19a8
+#define MT6357_LDO_VUSB33_CON0_1             0x19aa
+#define MT6357_LDO_VUSB33_CON1               0x19ac
+#define MT6357_LDO_VUSB33_CON2               0x19ae
+#define MT6357_LDO_VUSB33_CON3               0x19b0
+#define MT6357_LDO_VSRAM_PROC_CON0           0x19b2
+#define MT6357_LDO_VSRAM_PROC_CON2           0x19b4
+#define MT6357_LDO_VSRAM_PROC_CFG0           0x19b6
+#define MT6357_LDO_VSRAM_PROC_CFG1           0x19b8
+#define MT6357_LDO_VSRAM_PROC_OP_EN          0x19ba
+#define MT6357_LDO_VSRAM_PROC_OP_EN_SET      0x19bc
+#define MT6357_LDO_VSRAM_PROC_OP_EN_CLR      0x19be
+#define MT6357_LDO_VSRAM_PROC_OP_CFG         0x19c0
+#define MT6357_LDO_VSRAM_PROC_OP_CFG_SET     0x19c2
+#define MT6357_LDO_VSRAM_PROC_OP_CFG_CLR     0x19c4
+#define MT6357_LDO_VSRAM_PROC_CON3           0x19c6
+#define MT6357_LDO_VSRAM_PROC_CON4           0x19c8
+#define MT6357_LDO_VSRAM_PROC_CON5           0x19ca
+#define MT6357_LDO_VSRAM_PROC_DBG0           0x19cc
+#define MT6357_LDO_VSRAM_PROC_DBG1           0x19ce
+#define MT6357_LDO_VSRAM_OTHERS_CON0         0x19d0
+#define MT6357_LDO_VSRAM_OTHERS_CON2         0x19d2
+#define MT6357_LDO_VSRAM_OTHERS_CFG0         0x19d4
+#define MT6357_LDO_VSRAM_OTHERS_CFG1         0x19d6
+#define MT6357_LDO_VSRAM_OTHERS_OP_EN        0x19d8
+#define MT6357_LDO_VSRAM_OTHERS_OP_EN_SET    0x19da
+#define MT6357_LDO_VSRAM_OTHERS_OP_EN_CLR    0x19dc
+#define MT6357_LDO_VSRAM_OTHERS_OP_CFG       0x19de
+#define MT6357_LDO_VSRAM_OTHERS_OP_CFG_SET   0x19e0
+#define MT6357_LDO_VSRAM_OTHERS_OP_CFG_CLR   0x19e2
+#define MT6357_LDO_VSRAM_OTHERS_CON3         0x19e4
+#define MT6357_LDO_VSRAM_OTHERS_CON4         0x19e6
+#define MT6357_LDO_VSRAM_OTHERS_CON5         0x19e8
+#define MT6357_LDO_VSRAM_OTHERS_DBG0         0x19ea
+#define MT6357_LDO_VSRAM_OTHERS_DBG1         0x19ec
+#define MT6357_LDO_VSRAM_PROC_SP             0x19ee
+#define MT6357_LDO_VSRAM_OTHERS_SP           0x19f0
+#define MT6357_LDO_VSRAM_PROC_R2R_PDN_DIS    0x19f2
+#define MT6357_LDO_VSRAM_OTHERS_R2R_PDN_DIS  0x19f4
+#define MT6357_LDO_VSRAM_WDT_DBG0            0x19f6
+#define MT6357_LDO_GON1_ELR_NUM              0x19f8
+#define MT6357_LDO_VSRAM_CON0                0x19fa
+#define MT6357_LDO_VSRAM_CON1                0x19fc
+#define MT6357_LDO_VSRAM_CON2                0x19fe
+#define MT6357_LDO_GOFF0_DSN_ID              0x1a00
+#define MT6357_LDO_GOFF0_DSN_REV0            0x1a02
+#define MT6357_LDO_GOFF0_DSN_DBI             0x1a04
+#define MT6357_LDO_GOFF0_DSN_DXI             0x1a06
+#define MT6357_LDO_VFE28_CON0                0x1a08
+#define MT6357_LDO_VFE28_OP_EN               0x1a0a
+#define MT6357_LDO_VFE28_OP_EN_SET           0x1a0c
+#define MT6357_LDO_VFE28_OP_EN_CLR           0x1a0e
+#define MT6357_LDO_VFE28_OP_CFG              0x1a10
+#define MT6357_LDO_VFE28_OP_CFG_SET          0x1a12
+#define MT6357_LDO_VFE28_OP_CFG_CLR          0x1a14
+#define MT6357_LDO_VFE28_CON1                0x1a16
+#define MT6357_LDO_VFE28_CON2                0x1a18
+#define MT6357_LDO_VFE28_CON3                0x1a1a
+#define MT6357_LDO_VRF18_CON0                0x1a1c
+#define MT6357_LDO_VRF18_OP_EN               0x1a1e
+#define MT6357_LDO_VRF18_OP_EN_SET           0x1a20
+#define MT6357_LDO_VRF18_OP_EN_CLR           0x1a22
+#define MT6357_LDO_VRF18_OP_CFG              0x1a24
+#define MT6357_LDO_VRF18_OP_CFG_SET          0x1a26
+#define MT6357_LDO_VRF18_OP_CFG_CLR          0x1a28
+#define MT6357_LDO_VRF18_CON1                0x1a2a
+#define MT6357_LDO_VRF18_CON2                0x1a2c
+#define MT6357_LDO_VRF18_CON3                0x1a2e
+#define MT6357_LDO_VRF12_CON0                0x1a30
+#define MT6357_LDO_VRF12_OP_EN               0x1a32
+#define MT6357_LDO_VRF12_OP_EN_SET           0x1a34
+#define MT6357_LDO_VRF12_OP_EN_CLR           0x1a36
+#define MT6357_LDO_VRF12_OP_CFG              0x1a38
+#define MT6357_LDO_VRF12_OP_CFG_SET          0x1a3a
+#define MT6357_LDO_VRF12_OP_CFG_CLR          0x1a3c
+#define MT6357_LDO_VRF12_CON1                0x1a3e
+#define MT6357_LDO_VRF12_CON2                0x1a40
+#define MT6357_LDO_VRF12_CON3                0x1a42
+#define MT6357_LDO_VEFUSE_CON0               0x1a44
+#define MT6357_LDO_VEFUSE_OP_EN              0x1a46
+#define MT6357_LDO_VEFUSE_OP_EN_SET          0x1a48
+#define MT6357_LDO_VEFUSE_OP_EN_CLR          0x1a4a
+#define MT6357_LDO_VEFUSE_OP_CFG             0x1a4c
+#define MT6357_LDO_VEFUSE_OP_CFG_SET         0x1a4e
+#define MT6357_LDO_VEFUSE_OP_CFG_CLR         0x1a50
+#define MT6357_LDO_VEFUSE_CON1               0x1a52
+#define MT6357_LDO_VEFUSE_CON2               0x1a54
+#define MT6357_LDO_VEFUSE_CON3               0x1a56
+#define MT6357_LDO_VCN18_CON0                0x1a58
+#define MT6357_LDO_VCN18_OP_EN               0x1a5a
+#define MT6357_LDO_VCN18_OP_EN_SET           0x1a5c
+#define MT6357_LDO_VCN18_OP_EN_CLR           0x1a5e
+#define MT6357_LDO_VCN18_OP_CFG              0x1a60
+#define MT6357_LDO_VCN18_OP_CFG_SET          0x1a62
+#define MT6357_LDO_VCN18_OP_CFG_CLR          0x1a64
+#define MT6357_LDO_VCN18_CON1                0x1a66
+#define MT6357_LDO_VCN18_CON2                0x1a68
+#define MT6357_LDO_VCN18_CON3                0x1a6a
+#define MT6357_LDO_VCAMA_CON0                0x1a6c
+#define MT6357_LDO_VCAMA_OP_EN               0x1a6e
+#define MT6357_LDO_VCAMA_OP_EN_SET           0x1a70
+#define MT6357_LDO_VCAMA_OP_EN_CLR           0x1a72
+#define MT6357_LDO_VCAMA_OP_CFG              0x1a74
+#define MT6357_LDO_VCAMA_OP_CFG_SET          0x1a76
+#define MT6357_LDO_VCAMA_OP_CFG_CLR          0x1a78
+#define MT6357_LDO_VCAMA_CON1                0x1a7a
+#define MT6357_LDO_VCAMA_CON2                0x1a7c
+#define MT6357_LDO_VCAMA_CON3                0x1a7e
+#define MT6357_LDO_GOFF1_DSN_ID              0x1a80
+#define MT6357_LDO_GOFF1_DSN_REV0            0x1a82
+#define MT6357_LDO_GOFF1_DSN_DBI             0x1a84
+#define MT6357_LDO_GOFF1_DSN_DXI             0x1a86
+#define MT6357_LDO_VCAMD_CON0                0x1a88
+#define MT6357_LDO_VCAMD_OP_EN               0x1a8a
+#define MT6357_LDO_VCAMD_OP_EN_SET           0x1a8c
+#define MT6357_LDO_VCAMD_OP_EN_CLR           0x1a8e
+#define MT6357_LDO_VCAMD_OP_CFG              0x1a90
+#define MT6357_LDO_VCAMD_OP_CFG_SET          0x1a92
+#define MT6357_LDO_VCAMD_OP_CFG_CLR          0x1a94
+#define MT6357_LDO_VCAMD_CON1                0x1a96
+#define MT6357_LDO_VCAMD_CON2                0x1a98
+#define MT6357_LDO_VCAMD_CON3                0x1a9a
+#define MT6357_LDO_VCAMIO_CON0               0x1a9c
+#define MT6357_LDO_VCAMIO_OP_EN              0x1a9e
+#define MT6357_LDO_VCAMIO_OP_EN_SET          0x1aa0
+#define MT6357_LDO_VCAMIO_OP_EN_CLR          0x1aa2
+#define MT6357_LDO_VCAMIO_OP_CFG             0x1aa4
+#define MT6357_LDO_VCAMIO_OP_CFG_SET         0x1aa6
+#define MT6357_LDO_VCAMIO_OP_CFG_CLR         0x1aa8
+#define MT6357_LDO_VCAMIO_CON1               0x1aaa
+#define MT6357_LDO_VCAMIO_CON2               0x1aac
+#define MT6357_LDO_VCAMIO_CON3               0x1aae
+#define MT6357_LDO_VMC_CON0                  0x1ab0
+#define MT6357_LDO_VMC_OP_EN                 0x1ab2
+#define MT6357_LDO_VMC_OP_EN_SET             0x1ab4
+#define MT6357_LDO_VMC_OP_EN_CLR             0x1ab6
+#define MT6357_LDO_VMC_OP_CFG                0x1ab8
+#define MT6357_LDO_VMC_OP_CFG_SET            0x1aba
+#define MT6357_LDO_VMC_OP_CFG_CLR            0x1abc
+#define MT6357_LDO_VMC_CON1                  0x1abe
+#define MT6357_LDO_VMC_CON2                  0x1ac0
+#define MT6357_LDO_VMC_CON3                  0x1ac2
+#define MT6357_LDO_VMCH_CON0                 0x1ac4
+#define MT6357_LDO_VMCH_OP_EN                0x1ac6
+#define MT6357_LDO_VMCH_OP_EN_SET            0x1ac8
+#define MT6357_LDO_VMCH_OP_EN_CLR            0x1aca
+#define MT6357_LDO_VMCH_OP_CFG               0x1acc
+#define MT6357_LDO_VMCH_OP_CFG_SET           0x1ace
+#define MT6357_LDO_VMCH_OP_CFG_CLR           0x1ad0
+#define MT6357_LDO_VMCH_CON1                 0x1ad2
+#define MT6357_LDO_VMCH_CON2                 0x1ad4
+#define MT6357_LDO_VMCH_CON3                 0x1ad6
+#define MT6357_LDO_VSIM1_CON0                0x1ad8
+#define MT6357_LDO_VSIM1_OP_EN               0x1ada
+#define MT6357_LDO_VSIM1_OP_EN_SET           0x1adc
+#define MT6357_LDO_VSIM1_OP_EN_CLR           0x1ade
+#define MT6357_LDO_VSIM1_OP_CFG              0x1ae0
+#define MT6357_LDO_VSIM1_OP_CFG_SET          0x1ae2
+#define MT6357_LDO_VSIM1_OP_CFG_CLR          0x1ae4
+#define MT6357_LDO_VSIM1_CON1                0x1ae6
+#define MT6357_LDO_VSIM1_CON2                0x1ae8
+#define MT6357_LDO_VSIM1_CON3                0x1aea
+#define MT6357_LDO_VSIM2_CON0                0x1aec
+#define MT6357_LDO_VSIM2_OP_EN               0x1aee
+#define MT6357_LDO_VSIM2_OP_EN_SET           0x1af0
+#define MT6357_LDO_VSIM2_OP_EN_CLR           0x1af2
+#define MT6357_LDO_VSIM2_OP_CFG              0x1af4
+#define MT6357_LDO_VSIM2_OP_CFG_SET          0x1af6
+#define MT6357_LDO_VSIM2_OP_CFG_CLR          0x1af8
+#define MT6357_LDO_VSIM2_CON1                0x1afa
+#define MT6357_LDO_VSIM2_CON2                0x1afc
+#define MT6357_LDO_VSIM2_CON3                0x1afe
+#define MT6357_LDO_GOFF2_DSN_ID              0x1b00
+#define MT6357_LDO_GOFF2_DSN_REV0            0x1b02
+#define MT6357_LDO_GOFF2_DSN_DBI             0x1b04
+#define MT6357_LDO_GOFF2_DSN_DXI             0x1b06
+#define MT6357_LDO_VIBR_CON0                 0x1b08
+#define MT6357_LDO_VIBR_OP_EN                0x1b0a
+#define MT6357_LDO_VIBR_OP_EN_SET            0x1b0c
+#define MT6357_LDO_VIBR_OP_EN_CLR            0x1b0e
+#define MT6357_LDO_VIBR_OP_CFG               0x1b10
+#define MT6357_LDO_VIBR_OP_CFG_SET           0x1b12
+#define MT6357_LDO_VIBR_OP_CFG_CLR           0x1b14
+#define MT6357_LDO_VIBR_CON1                 0x1b16
+#define MT6357_LDO_VIBR_CON2                 0x1b18
+#define MT6357_LDO_VIBR_CON3                 0x1b1a
+#define MT6357_LDO_VCN33_CON0_0              0x1b1c
+#define MT6357_LDO_VCN33_OP_EN               0x1b1e
+#define MT6357_LDO_VCN33_OP_EN_SET           0x1b20
+#define MT6357_LDO_VCN33_OP_EN_CLR           0x1b22
+#define MT6357_LDO_VCN33_OP_CFG              0x1b24
+#define MT6357_LDO_VCN33_OP_CFG_SET          0x1b26
+#define MT6357_LDO_VCN33_OP_CFG_CLR          0x1b28
+#define MT6357_LDO_VCN33_CON0_1              0x1b2a
+#define MT6357_LDO_VCN33_CON1                0x1b2c
+#define MT6357_LDO_VCN33_CON2                0x1b2e
+#define MT6357_LDO_VCN33_CON3                0x1b30
+#define MT6357_LDO_VLDO28_CON0_0             0x1b32
+#define MT6357_LDO_VLDO28_OP_EN              0x1b34
+#define MT6357_LDO_VLDO28_OP_EN_SET          0x1b36
+#define MT6357_LDO_VLDO28_OP_EN_CLR          0x1b38
+#define MT6357_LDO_VLDO28_OP_CFG             0x1b3a
+#define MT6357_LDO_VLDO28_OP_CFG_SET         0x1b3c
+#define MT6357_LDO_VLDO28_OP_CFG_CLR         0x1b3e
+#define MT6357_LDO_VLDO28_CON0_1             0x1b40
+#define MT6357_LDO_VLDO28_CON1               0x1b42
+#define MT6357_LDO_VLDO28_CON2               0x1b44
+#define MT6357_LDO_VLDO28_CON3               0x1b46
+#define MT6357_LDO_GOFF2_RSV_CON0            0x1b48
+#define MT6357_LDO_GOFF2_RSV_CON1            0x1b4a
+#define MT6357_LDO_GOFF3_DSN_ID              0x1b80
+#define MT6357_LDO_GOFF3_DSN_REV0            0x1b82
+#define MT6357_LDO_GOFF3_DSN_DBI             0x1b84
+#define MT6357_LDO_GOFF3_DSN_DXI             0x1b86
+#define MT6357_LDO_VCN28_CON0                0x1b88
+#define MT6357_LDO_VCN28_OP_EN               0x1b8a
+#define MT6357_LDO_VCN28_OP_EN_SET           0x1b8c
+#define MT6357_LDO_VCN28_OP_EN_CLR           0x1b8e
+#define MT6357_LDO_VCN28_OP_CFG              0x1b90
+#define MT6357_LDO_VCN28_OP_CFG_SET          0x1b92
+#define MT6357_LDO_VCN28_OP_CFG_CLR          0x1b94
+#define MT6357_LDO_VCN28_CON1                0x1b96
+#define MT6357_LDO_VCN28_CON2                0x1b98
+#define MT6357_LDO_VCN28_CON3                0x1b9a
+#define MT6357_VRTC_CON0                     0x1b9c
+#define MT6357_LDO_TREF_CON0                 0x1b9e
+#define MT6357_LDO_TREF_OP_EN                0x1ba0
+#define MT6357_LDO_TREF_OP_EN_SET            0x1ba2
+#define MT6357_LDO_TREF_OP_EN_CLR            0x1ba4
+#define MT6357_LDO_TREF_OP_CFG               0x1ba6
+#define MT6357_LDO_TREF_OP_CFG_SET           0x1ba8
+#define MT6357_LDO_TREF_OP_CFG_CLR           0x1baa
+#define MT6357_LDO_TREF_CON1                 0x1bac
+#define MT6357_LDO_GOFF3_RSV_CON0            0x1bae
+#define MT6357_LDO_GOFF3_RSV_CON1            0x1bb0
+#define MT6357_LDO_ANA0_DSN_ID               0x1c00
+#define MT6357_LDO_ANA0_DSN_REV0             0x1c02
+#define MT6357_LDO_ANA0_DSN_DBI              0x1c04
+#define MT6357_LDO_ANA0_DSN_DXI              0x1c06
+#define MT6357_VFE28_ANA_CON0                0x1c08
+#define MT6357_VFE28_ANA_CON1                0x1c0a
+#define MT6357_VCN28_ANA_CON0                0x1c0c
+#define MT6357_VCN28_ANA_CON1                0x1c0e
+#define MT6357_VAUD28_ANA_CON0               0x1c10
+#define MT6357_VAUD28_ANA_CON1               0x1c12
+#define MT6357_VAUX18_ANA_CON0               0x1c14
+#define MT6357_VAUX18_ANA_CON1               0x1c16
+#define MT6357_VXO22_ANA_CON0                0x1c18
+#define MT6357_VXO22_ANA_CON1                0x1c1a
+#define MT6357_VCN33_ANA_CON0                0x1c1c
+#define MT6357_VCN33_ANA_CON1                0x1c1e
+#define MT6357_VEMC_ANA_CON0                 0x1c20
+#define MT6357_VEMC_ANA_CON1                 0x1c22
+#define MT6357_VLDO28_ANA_CON0               0x1c24
+#define MT6357_VLDO28_ANA_CON1               0x1c26
+#define MT6357_VIO28_ANA_CON0                0x1c28
+#define MT6357_VIO28_ANA_CON1                0x1c2a
+#define MT6357_VIBR_ANA_CON0                 0x1c2c
+#define MT6357_VIBR_ANA_CON1                 0x1c2e
+#define MT6357_VSIM1_ANA_CON0                0x1c30
+#define MT6357_VSIM1_ANA_CON1                0x1c32
+#define MT6357_VSIM2_ANA_CON0                0x1c34
+#define MT6357_VSIM2_ANA_CON1                0x1c36
+#define MT6357_VMCH_ANA_CON0                 0x1c38
+#define MT6357_VMCH_ANA_CON1                 0x1c3a
+#define MT6357_VMC_ANA_CON0                  0x1c3c
+#define MT6357_VMC_ANA_CON1                  0x1c3e
+#define MT6357_VCAMIO_ANA_CON0               0x1c40
+#define MT6357_VCAMIO_ANA_CON1               0x1c42
+#define MT6357_VCN18_ANA_CON0                0x1c44
+#define MT6357_VCN18_ANA_CON1                0x1c46
+#define MT6357_VRF18_ANA_CON0                0x1c48
+#define MT6357_VRF18_ANA_CON1                0x1c4a
+#define MT6357_VIO18_ANA_CON0                0x1c4c
+#define MT6357_VIO18_ANA_CON1                0x1c4e
+#define MT6357_VDRAM_ANA_CON1                0x1c50
+#define MT6357_VRF12_ANA_CON0                0x1c52
+#define MT6357_VRF12_ANA_CON1                0x1c54
+#define MT6357_VSRAM_PROC_ANA_CON0           0x1c56
+#define MT6357_VSRAM_OTHERS_ANA_CON0         0x1c58
+#define MT6357_LDO_ANA0_ELR_NUM              0x1c5a
+#define MT6357_VFE28_ELR_0                   0x1c5c
+#define MT6357_VCN28_ELR_0                   0x1c5e
+#define MT6357_VAUD28_ELR_0                  0x1c60
+#define MT6357_VAUX18_ELR_0                  0x1c62
+#define MT6357_VXO22_ELR_0                   0x1c64
+#define MT6357_VCN33_ELR_0                   0x1c66
+#define MT6357_VEMC_ELR_0                    0x1c68
+#define MT6357_VLDO28_ELR_0                  0x1c6a
+#define MT6357_VIO28_ELR_0                   0x1c6c
+#define MT6357_VIBR_ELR_0                    0x1c6e
+#define MT6357_VSIM1_ELR_0                   0x1c70
+#define MT6357_VSIM2_ELR_0                   0x1c72
+#define MT6357_VMCH_ELR_0                    0x1c74
+#define MT6357_VMC_ELR_0                     0x1c76
+#define MT6357_VCAMIO_ELR_0                  0x1c78
+#define MT6357_VCN18_ELR_0                   0x1c7a
+#define MT6357_VRF18_ELR_0                   0x1c7c
+#define MT6357_LDO_ANA1_DSN_ID               0x1c80
+#define MT6357_LDO_ANA1_DSN_REV0             0x1c82
+#define MT6357_LDO_ANA1_DSN_DBI              0x1c84
+#define MT6357_LDO_ANA1_DSN_DXI              0x1c86
+#define MT6357_VUSB33_ANA_CON0               0x1c88
+#define MT6357_VUSB33_ANA_CON1               0x1c8a
+#define MT6357_VCAMA_ANA_CON0                0x1c8c
+#define MT6357_VCAMA_ANA_CON1                0x1c8e
+#define MT6357_VEFUSE_ANA_CON0               0x1c90
+#define MT6357_VEFUSE_ANA_CON1               0x1c92
+#define MT6357_VCAMD_ANA_CON0                0x1c94
+#define MT6357_VCAMD_ANA_CON1                0x1c96
+#define MT6357_LDO_ANA1_ELR_NUM              0x1c98
+#define MT6357_VUSB33_ELR_0                  0x1c9a
+#define MT6357_VCAMA_ELR_0                   0x1c9c
+#define MT6357_VEFUSE_ELR_0                  0x1c9e
+#define MT6357_VCAMD_ELR_0                   0x1ca0
+#define MT6357_VIO18_ELR_0                   0x1ca2
+#define MT6357_VDRAM_ELR_0                   0x1ca4
+#define MT6357_VRF12_ELR_0                   0x1ca6
+#define MT6357_VRTC_ELR_0                    0x1ca8
+#define MT6357_VDRAM_ELR_1                   0x1caa
+#define MT6357_VDRAM_ELR_2                   0x1cac
+#define MT6357_XPP_TOP_ID                    0x1e00
+#define MT6357_XPP_TOP_REV0                  0x1e02
+#define MT6357_XPP_TOP_DBI                   0x1e04
+#define MT6357_XPP_TOP_DXI                   0x1e06
+#define MT6357_XPP_TPM0                      0x1e08
+#define MT6357_XPP_TPM1                      0x1e0a
+#define MT6357_XPP_TOP_TEST_OUT              0x1e0c
+#define MT6357_XPP_TOP_TEST_CON0             0x1e0e
+#define MT6357_XPP_TOP_CKPDN_CON0            0x1e10
+#define MT6357_XPP_TOP_CKPDN_CON0_SET        0x1e12
+#define MT6357_XPP_TOP_CKPDN_CON0_CLR        0x1e14
+#define MT6357_XPP_TOP_CKSEL_CON0            0x1e16
+#define MT6357_XPP_TOP_CKSEL_CON0_SET        0x1e18
+#define MT6357_XPP_TOP_CKSEL_CON0_CLR        0x1e1a
+#define MT6357_XPP_TOP_RST_CON0              0x1e1c
+#define MT6357_XPP_TOP_RST_CON0_SET          0x1e1e
+#define MT6357_XPP_TOP_RST_CON0_CLR          0x1e20
+#define MT6357_XPP_TOP_RST_BANK_CON0         0x1e22
+#define MT6357_XPP_TOP_RST_BANK_CON0_SET     0x1e24
+#define MT6357_XPP_TOP_RST_BANK_CON0_CLR     0x1e26
+#define MT6357_DRIVER_BL_DSN_ID              0x1e80
+#define MT6357_DRIVER_BL_DSN_REV0            0x1e82
+#define MT6357_DRIVER_BL_DSN_DBI             0x1e84
+#define MT6357_DRIVER_BL_DSN_DXI             0x1e86
+#define MT6357_ISINK1_CON0                   0x1e88
+#define MT6357_ISINK1_CON1                   0x1e8a
+#define MT6357_ISINK1_CON2                   0x1e8c
+#define MT6357_ISINK1_CON3                   0x1e8e
+#define MT6357_ISINK_ANA1                    0x1e90
+#define MT6357_ISINK_PHASE_DLY               0x1e92
+#define MT6357_ISINK_SFSTR                   0x1e94
+#define MT6357_ISINK_EN_CTRL                 0x1e96
+#define MT6357_ISINK_MODE_CTRL               0x1e98
+#define MT6357_DRIVER_ANA_CON0               0x1e9a
+#define MT6357_ISINK_ANA_CON0                0x1e9c
+#define MT6357_ISINK_ANA_CON1                0x1e9e
+#define MT6357_DRIVER_BL_ELR_NUM             0x1ea0
+#define MT6357_DRIVER_BL_ELR_0               0x1ea2
+#define MT6357_DRIVER_CI_DSN_ID              0x1f00
+#define MT6357_DRIVER_CI_DSN_REV0            0x1f02
+#define MT6357_DRIVER_CI_DSN_DBI             0x1f04
+#define MT6357_DRIVER_CI_DSN_DXI             0x1f06
+#define MT6357_CHRIND_CON0                   0x1f08
+#define MT6357_CHRIND_CON1                   0x1f0a
+#define MT6357_CHRIND_CON2                   0x1f0c
+#define MT6357_CHRIND_CON3                   0x1f0e
+#define MT6357_CHRIND_CON4                   0x1f10
+#define MT6357_CHRIND_EN_CTRL                0x1f12
+#define MT6357_CHRIND_ANA_CON0               0x1f14
+#define MT6357_DRIVER_DL_DSN_ID              0x1f80
+#define MT6357_DRIVER_DL_DSN_REV0            0x1f82
+#define MT6357_DRIVER_DL_DSN_DBI             0x1f84
+#define MT6357_DRIVER_DL_DSN_DXI             0x1f86
+#define MT6357_ISINK2_CON0                   0x1f88
+#define MT6357_ISINK3_CON0                   0x1f8a
+#define MT6357_ISINK_EN_CTRL_SMPL            0x1f8c
+#define MT6357_AUD_TOP_ID                    0x2080
+#define MT6357_AUD_TOP_REV0                  0x2082
+#define MT6357_AUD_TOP_DBI                   0x2084
+#define MT6357_AUD_TOP_DXI                   0x2086
+#define MT6357_AUD_TOP_CKPDN_TPM0            0x2088
+#define MT6357_AUD_TOP_CKPDN_TPM1            0x208a
+#define MT6357_AUD_TOP_CKPDN_CON0            0x208c
+#define MT6357_AUD_TOP_CKPDN_CON0_SET        0x208e
+#define MT6357_AUD_TOP_CKPDN_CON0_CLR        0x2090
+#define MT6357_AUD_TOP_CKSEL_CON0            0x2092
+#define MT6357_AUD_TOP_CKSEL_CON0_SET        0x2094
+#define MT6357_AUD_TOP_CKSEL_CON0_CLR        0x2096
+#define MT6357_AUD_TOP_CKTST_CON0            0x2098
+#define MT6357_AUD_TOP_RST_CON0              0x209a
+#define MT6357_AUD_TOP_RST_CON0_SET          0x209c
+#define MT6357_AUD_TOP_RST_CON0_CLR          0x209e
+#define MT6357_AUD_TOP_RST_BANK_CON0         0x20a0
+#define MT6357_AUD_TOP_INT_CON0              0x20a2
+#define MT6357_AUD_TOP_INT_CON0_SET          0x20a4
+#define MT6357_AUD_TOP_INT_CON0_CLR          0x20a6
+#define MT6357_AUD_TOP_INT_MASK_CON0         0x20a8
+#define MT6357_AUD_TOP_INT_MASK_CON0_SET     0x20aa
+#define MT6357_AUD_TOP_INT_MASK_CON0_CLR     0x20ac
+#define MT6357_AUD_TOP_INT_STATUS0           0x20ae
+#define MT6357_AUD_TOP_INT_RAW_STATUS0       0x20b0
+#define MT6357_AUD_TOP_INT_MISC_CON0         0x20b2
+#define MT6357_AUDNCP_CLKDIV_CON0            0x20b4
+#define MT6357_AUDNCP_CLKDIV_CON1            0x20b6
+#define MT6357_AUDNCP_CLKDIV_CON2            0x20b8
+#define MT6357_AUDNCP_CLKDIV_CON3            0x20ba
+#define MT6357_AUDNCP_CLKDIV_CON4            0x20bc
+#define MT6357_AUD_TOP_MON_CON0              0x20be
+#define MT6357_AUDIO_DIG_DSN_ID              0x2100
+#define MT6357_AUDIO_DIG_DSN_REV0            0x2102
+#define MT6357_AUDIO_DIG_DSN_DBI             0x2104
+#define MT6357_AUDIO_DIG_DSN_DXI             0x2106
+#define MT6357_AFE_UL_DL_CON0                0x2108
+#define MT6357_AFE_DL_SRC2_CON0_L            0x210a
+#define MT6357_AFE_UL_SRC_CON0_H             0x210c
+#define MT6357_AFE_UL_SRC_CON0_L             0x210e
+#define MT6357_AFE_TOP_CON0                  0x2110
+#define MT6357_AUDIO_TOP_CON0                0x2112
+#define MT6357_AFE_MON_DEBUG0                0x2114
+#define MT6357_AFUNC_AUD_CON0                0x2116
+#define MT6357_AFUNC_AUD_CON1                0x2118
+#define MT6357_AFUNC_AUD_CON2                0x211a
+#define MT6357_AFUNC_AUD_CON3                0x211c
+#define MT6357_AFUNC_AUD_CON4                0x211e
+#define MT6357_AFUNC_AUD_CON5                0x2120
+#define MT6357_AFUNC_AUD_CON6                0x2122
+#define MT6357_AFUNC_AUD_MON0                0x2124
+#define MT6357_AUDRC_TUNE_MON0               0x2126
+#define MT6357_AFE_ADDA_MTKAIF_FIFO_CFG0     0x2128
+#define MT6357_AFE_ADDA_MTKAIF_FIFO_LOG_MON1 0x212a
+#define MT6357_AFE_ADDA_MTKAIF_MON0          0x212c
+#define MT6357_AFE_ADDA_MTKAIF_MON1          0x212e
+#define MT6357_AFE_ADDA_MTKAIF_MON2          0x2130
+#define MT6357_AFE_ADDA_MTKAIF_MON3          0x2132
+#define MT6357_AFE_ADDA_MTKAIF_CFG0          0x2134
+#define MT6357_AFE_ADDA_MTKAIF_RX_CFG0       0x2136
+#define MT6357_AFE_ADDA_MTKAIF_RX_CFG1       0x2138
+#define MT6357_AFE_ADDA_MTKAIF_RX_CFG2       0x213a
+#define MT6357_AFE_ADDA_MTKAIF_RX_CFG3       0x213c
+#define MT6357_AFE_ADDA_MTKAIF_TX_CFG1       0x213e
+#define MT6357_AFE_SGEN_CFG0                 0x2140
+#define MT6357_AFE_SGEN_CFG1                 0x2142
+#define MT6357_AFE_ADC_ASYNC_FIFO_CFG        0x2144
+#define MT6357_AFE_DCCLK_CFG0                0x2146
+#define MT6357_AFE_DCCLK_CFG1                0x2148
+#define MT6357_AUDIO_DIG_CFG                 0x214a
+#define MT6357_AFE_AUD_PAD_TOP               0x214c
+#define MT6357_AFE_AUD_PAD_TOP_MON           0x214e
+#define MT6357_AFE_AUD_PAD_TOP_MON1          0x2150
+#define MT6357_AUDENC_DSN_ID                 0x2180
+#define MT6357_AUDENC_DSN_REV0               0x2182
+#define MT6357_AUDENC_DSN_DBI                0x2184
+#define MT6357_AUDENC_DSN_FPI                0x2186
+#define MT6357_AUDENC_ANA_CON0               0x2188
+#define MT6357_AUDENC_ANA_CON1               0x218a
+#define MT6357_AUDENC_ANA_CON2               0x218c
+#define MT6357_AUDENC_ANA_CON3               0x218e
+#define MT6357_AUDENC_ANA_CON4               0x2190
+#define MT6357_AUDENC_ANA_CON5               0x2192
+#define MT6357_AUDENC_ANA_CON6               0x2194
+#define MT6357_AUDENC_ANA_CON7               0x2196
+#define MT6357_AUDENC_ANA_CON8               0x2198
+#define MT6357_AUDENC_ANA_CON9               0x219a
+#define MT6357_AUDENC_ANA_CON10              0x219c
+#define MT6357_AUDENC_ANA_CON11              0x219e
+#define MT6357_AUDDEC_DSN_ID                 0x2200
+#define MT6357_AUDDEC_DSN_REV0               0x2202
+#define MT6357_AUDDEC_DSN_DBI                0x2204
+#define MT6357_AUDDEC_DSN_FPI                0x2206
+#define MT6357_AUDDEC_ANA_CON0               0x2208
+#define MT6357_AUDDEC_ANA_CON1               0x220a
+#define MT6357_AUDDEC_ANA_CON2               0x220c
+#define MT6357_AUDDEC_ANA_CON3               0x220e
+#define MT6357_AUDDEC_ANA_CON4               0x2210
+#define MT6357_AUDDEC_ANA_CON5               0x2212
+#define MT6357_AUDDEC_ANA_CON6               0x2214
+#define MT6357_AUDDEC_ANA_CON7               0x2216
+#define MT6357_AUDDEC_ANA_CON8               0x2218
+#define MT6357_AUDDEC_ANA_CON9               0x221a
+#define MT6357_AUDDEC_ANA_CON10              0x221c
+#define MT6357_AUDDEC_ANA_CON11              0x221e
+#define MT6357_AUDDEC_ANA_CON12              0x2220
+#define MT6357_AUDDEC_ANA_CON13              0x2222
+#define MT6357_AUDDEC_ELR_NUM                0x2224
+#define MT6357_AUDDEC_ELR_0                  0x2226
+#define MT6357_AUDZCD_DSN_ID                 0x2280
+#define MT6357_AUDZCD_DSN_REV0               0x2282
+#define MT6357_AUDZCD_DSN_DBI                0x2284
+#define MT6357_AUDZCD_DSN_FPI                0x2286
+#define MT6357_ZCD_CON0                      0x2288
+#define MT6357_ZCD_CON1                      0x228a
+#define MT6357_ZCD_CON2                      0x228c
+#define MT6357_ZCD_CON3                      0x228e
+#define MT6357_ZCD_CON4                      0x2290
+#define MT6357_ZCD_CON5                      0x2292
+#define MT6357_ACCDET_DSN_DIG_ID             0x2300
+#define MT6357_ACCDET_DSN_DIG_REV0           0x2302
+#define MT6357_ACCDET_DSN_DBI                0x2304
+#define MT6357_ACCDET_DSN_FPI                0x2306
+#define MT6357_ACCDET_CON0                   0x2308
+#define MT6357_ACCDET_CON1                   0x230a
+#define MT6357_ACCDET_CON2                   0x230c
+#define MT6357_ACCDET_CON3                   0x230e
+#define MT6357_ACCDET_CON4                   0x2310
+#define MT6357_ACCDET_CON5                   0x2312
+#define MT6357_ACCDET_CON6                   0x2314
+#define MT6357_ACCDET_CON7                   0x2316
+#define MT6357_ACCDET_CON8                   0x2318
+#define MT6357_ACCDET_CON9                   0x231a
+#define MT6357_ACCDET_CON10                  0x231c
+#define MT6357_ACCDET_CON11                  0x231e
+#define MT6357_ACCDET_CON12                  0x2320
+#define MT6357_ACCDET_CON13                  0x2322
+#define MT6357_ACCDET_CON14                  0x2324
+#define MT6357_ACCDET_CON15                  0x2326
+#define MT6357_ACCDET_CON16                  0x2328
+#define MT6357_ACCDET_CON17                  0x232a
+#define MT6357_ACCDET_CON18                  0x232c
+#define MT6357_ACCDET_CON19                  0x232e
+#define MT6357_ACCDET_CON20                  0x2330
+#define MT6357_ACCDET_CON21                  0x2332
+#define MT6357_ACCDET_CON22                  0x2334
+#define MT6357_ACCDET_CON23                  0x2336
+#define MT6357_ACCDET_CON24                  0x2338
+#define MT6357_ACCDET_CON25                  0x233a
+#define MT6357_ACCDET_CON26                  0x233c
+#define MT6357_ACCDET_CON27                  0x233e
+#define MT6357_ACCDET_CON28                  0x2340
+
+#endif /* __MFD_MT6357_REGISTERS_H__ */
-- 
cgit 


From 738654be3cf7af4a0a131bd92142db045f0660dc Mon Sep 17 00:00:00 2001
From: Fabien Parent <fparent@baylibre.com>
Date: Tue, 31 May 2022 14:49:57 +0200
Subject: mfd: mt6358-irq: Add MT6357 PMIC support

Add MT6357 PMIC IRQ support.

Signed-off-by: Fabien Parent <fparent@baylibre.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220531124959.202787-6-fparent@baylibre.com
---
 drivers/mfd/mt6358-irq.c        | 24 ++++++++++++++++++++++++
 include/linux/mfd/mt6397/core.h |  1 +
 2 files changed, 25 insertions(+)

(limited to 'include')

diff --git a/drivers/mfd/mt6358-irq.c b/drivers/mfd/mt6358-irq.c
index ea5e452510eb..389756436af6 100644
--- a/drivers/mfd/mt6358-irq.c
+++ b/drivers/mfd/mt6358-irq.c
@@ -3,6 +3,8 @@
 // Copyright (c) 2020 MediaTek Inc.
 
 #include <linux/interrupt.h>
+#include <linux/mfd/mt6357/core.h>
+#include <linux/mfd/mt6357/registers.h>
 #include <linux/mfd/mt6358/core.h>
 #include <linux/mfd/mt6358/registers.h>
 #include <linux/mfd/mt6359/core.h>
@@ -17,6 +19,17 @@
 
 #define MTK_PMIC_REG_WIDTH 16
 
+static const struct irq_top_t mt6357_ints[] = {
+	MT6357_TOP_GEN(BUCK),
+	MT6357_TOP_GEN(LDO),
+	MT6357_TOP_GEN(PSC),
+	MT6357_TOP_GEN(SCK),
+	MT6357_TOP_GEN(BM),
+	MT6357_TOP_GEN(HK),
+	MT6357_TOP_GEN(AUD),
+	MT6357_TOP_GEN(MISC),
+};
+
 static const struct irq_top_t mt6358_ints[] = {
 	MT6358_TOP_GEN(BUCK),
 	MT6358_TOP_GEN(LDO),
@@ -39,6 +52,13 @@ static const struct irq_top_t mt6359_ints[] = {
 	MT6359_TOP_GEN(MISC),
 };
 
+static struct pmic_irq_data mt6357_irqd = {
+	.num_top = ARRAY_SIZE(mt6357_ints),
+	.num_pmic_irqs = MT6357_IRQ_NR,
+	.top_int_status_reg = MT6357_TOP_INT_STATUS0,
+	.pmic_ints = mt6357_ints,
+};
+
 static struct pmic_irq_data mt6358_irqd = {
 	.num_top = ARRAY_SIZE(mt6358_ints),
 	.num_pmic_irqs = MT6358_IRQ_NR,
@@ -211,6 +231,10 @@ int mt6358_irq_init(struct mt6397_chip *chip)
 	struct pmic_irq_data *irqd;
 
 	switch (chip->chip_id) {
+	case MT6357_CHIP_ID:
+		chip->irq_data = &mt6357_irqd;
+		break;
+
 	case MT6358_CHIP_ID:
 	case MT6366_CHIP_ID:
 		chip->irq_data = &mt6358_irqd;
diff --git a/include/linux/mfd/mt6397/core.h b/include/linux/mfd/mt6397/core.h
index 1cf78726503b..3fecaffe5019 100644
--- a/include/linux/mfd/mt6397/core.h
+++ b/include/linux/mfd/mt6397/core.h
@@ -12,6 +12,7 @@
 
 enum chip_id {
 	MT6323_CHIP_ID = 0x23,
+	MT6357_CHIP_ID = 0x57,
 	MT6358_CHIP_ID = 0x58,
 	MT6359_CHIP_ID = 0x59,
 	MT6366_CHIP_ID = 0x66,
-- 
cgit 


From 66ee379d743c69c726b61d078119a34d5be96a35 Mon Sep 17 00:00:00 2001
From: Tinghan Shen <tinghan.shen@mediatek.com>
Date: Wed, 1 Jun 2022 19:22:01 +0800
Subject: mfd: cros_ec: Add SCP Core-1 as a new CrOS EC MCU

MT8195 System Companion Processors(SCP) is a dual-core RISC-V MCU.
Add a new CrOS feature ID to represent the SCP's 2nd core.

The 1st core is referred to as 'core 0', and the 2nd core is referred
to as 'core 1'.

Signed-off-by: Tinghan Shen <tinghan.shen@mediatek.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220601112201.15510-16-tinghan.shen@mediatek.com
---
 drivers/mfd/cros_ec_dev.c                      | 5 +++++
 include/linux/platform_data/cros_ec_commands.h | 2 ++
 include/linux/platform_data/cros_ec_proto.h    | 1 +
 3 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/drivers/mfd/cros_ec_dev.c b/drivers/mfd/cros_ec_dev.c
index 596731caf407..07cc31d92edc 100644
--- a/drivers/mfd/cros_ec_dev.c
+++ b/drivers/mfd/cros_ec_dev.c
@@ -64,6 +64,11 @@ static const struct cros_feature_to_name cros_mcu_devices[] = {
 		.name	= CROS_EC_DEV_SCP_NAME,
 		.desc	= "System Control Processor",
 	},
+	{
+		.id	= EC_FEATURE_SCP_C1,
+		.name	= CROS_EC_DEV_SCP_C1_NAME,
+		.desc	= "System Control Processor 2nd Core",
+	},
 	{
 		.id	= EC_FEATURE_TOUCHPAD,
 		.name	= CROS_EC_DEV_TP_NAME,
diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index 8cfa8cfca77e..9fbf1c5eb8d3 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -1300,6 +1300,8 @@ enum ec_feature_code {
 	 * mux.
 	 */
 	EC_FEATURE_TYPEC_MUX_REQUIRE_AP_ACK = 43,
+	/* The MCU is a System Companion Processor (SCP) 2nd Core. */
+	EC_FEATURE_SCP_C1 = 45,
 };
 
 #define EC_FEATURE_MASK_0(event_code) BIT(event_code % 32)
diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h
index 138fd912c808..da06dc7cf1cb 100644
--- a/include/linux/platform_data/cros_ec_proto.h
+++ b/include/linux/platform_data/cros_ec_proto.h
@@ -19,6 +19,7 @@
 #define CROS_EC_DEV_ISH_NAME	"cros_ish"
 #define CROS_EC_DEV_PD_NAME	"cros_pd"
 #define CROS_EC_DEV_SCP_NAME	"cros_scp"
+#define CROS_EC_DEV_SCP_C1_NAME	"cros_scp_c1"
 #define CROS_EC_DEV_TP_NAME	"cros_tp"
 
 /*
-- 
cgit 


From 4a346a03a63cb45f7766d9d6559cf3fee783e926 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Tue, 14 Jun 2022 17:21:48 +0200
Subject: mfd: twl: Remove platform data support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is no in-tree machine that provides a struct twl4030_platform_data
since commit e92fc4f04a34 ("ARM: OMAP2+: Drop legacy board file for
LDP"). So assume dev_get_platdata() returns NULL in twl_probe() and
simplify accordingly.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220614152148.252820-1-u.kleine-koenig@pengutronix.de
---
 drivers/mfd/twl-core.c  | 323 +-----------------------------------------------
 include/linux/mfd/twl.h |  55 ---------
 2 files changed, 5 insertions(+), 373 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/twl-core.c b/drivers/mfd/twl-core.c
index bd6659cf3bc0..2cb9326f3e61 100644
--- a/drivers/mfd/twl-core.c
+++ b/drivers/mfd/twl-core.c
@@ -656,309 +656,6 @@ static inline struct device *add_child(unsigned mod_no, const char *name,
 		can_wakeup, irq0, irq1);
 }
 
-static struct device *
-add_regulator_linked(int num, struct regulator_init_data *pdata,
-		struct regulator_consumer_supply *consumers,
-		unsigned num_consumers, unsigned long features)
-{
-	struct twl_regulator_driver_data drv_data;
-
-	/* regulator framework demands init_data ... */
-	if (!pdata)
-		return NULL;
-
-	if (consumers) {
-		pdata->consumer_supplies = consumers;
-		pdata->num_consumer_supplies = num_consumers;
-	}
-
-	if (pdata->driver_data) {
-		/* If we have existing drv_data, just add the flags */
-		struct twl_regulator_driver_data *tmp;
-		tmp = pdata->driver_data;
-		tmp->features |= features;
-	} else {
-		/* add new driver data struct, used only during init */
-		drv_data.features = features;
-		drv_data.set_voltage = NULL;
-		drv_data.get_voltage = NULL;
-		drv_data.data = NULL;
-		pdata->driver_data = &drv_data;
-	}
-
-	/* NOTE:  we currently ignore regulator IRQs, e.g. for short circuits */
-	return add_numbered_child(TWL_MODULE_PM_MASTER, "twl_reg", num,
-		pdata, sizeof(*pdata), false, 0, 0);
-}
-
-static struct device *
-add_regulator(int num, struct regulator_init_data *pdata,
-		unsigned long features)
-{
-	return add_regulator_linked(num, pdata, NULL, 0, features);
-}
-
-/*
- * NOTE:  We know the first 8 IRQs after pdata->base_irq are
- * for the PIH, and the next are for the PWR_INT SIH, since
- * that's how twl_init_irq() sets things up.
- */
-
-static int
-add_children(struct twl4030_platform_data *pdata, unsigned irq_base,
-		unsigned long features)
-{
-	struct device	*child;
-
-	if (IS_ENABLED(CONFIG_GPIO_TWL4030) && pdata->gpio) {
-		child = add_child(TWL4030_MODULE_GPIO, "twl4030_gpio",
-				pdata->gpio, sizeof(*pdata->gpio),
-				false, irq_base + GPIO_INTR_OFFSET, 0);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	if (IS_ENABLED(CONFIG_KEYBOARD_TWL4030) && pdata->keypad) {
-		child = add_child(TWL4030_MODULE_KEYPAD, "twl4030_keypad",
-				pdata->keypad, sizeof(*pdata->keypad),
-				true, irq_base + KEYPAD_INTR_OFFSET, 0);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	if (IS_ENABLED(CONFIG_TWL4030_MADC) && pdata->madc &&
-	    twl_class_is_4030()) {
-		child = add_child(TWL4030_MODULE_MADC, "twl4030_madc",
-				pdata->madc, sizeof(*pdata->madc),
-				true, irq_base + MADC_INTR_OFFSET, 0);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	if (IS_ENABLED(CONFIG_RTC_DRV_TWL4030)) {
-		/*
-		 * REVISIT platform_data here currently might expose the
-		 * "msecure" line ... but for now we just expect board
-		 * setup to tell the chip "it's always ok to SET_TIME".
-		 * Eventually, Linux might become more aware of such
-		 * HW security concerns, and "least privilege".
-		 */
-		child = add_child(TWL_MODULE_RTC, "twl_rtc", NULL, 0,
-				true, irq_base + RTC_INTR_OFFSET, 0);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	if (IS_ENABLED(CONFIG_PWM_TWL)) {
-		child = add_child(TWL_MODULE_PWM, "twl-pwm", NULL, 0,
-				  false, 0, 0);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	if (IS_ENABLED(CONFIG_PWM_TWL_LED)) {
-		child = add_child(TWL_MODULE_LED, "twl-pwmled", NULL, 0,
-				  false, 0, 0);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	if (IS_ENABLED(CONFIG_TWL4030_USB) && pdata->usb &&
-	    twl_class_is_4030()) {
-
-		static struct regulator_consumer_supply usb1v5 = {
-			.supply =	"usb1v5",
-		};
-		static struct regulator_consumer_supply usb1v8 = {
-			.supply =	"usb1v8",
-		};
-		static struct regulator_consumer_supply usb3v1 = {
-			.supply =	"usb3v1",
-		};
-
-	/* First add the regulators so that they can be used by transceiver */
-		if (IS_ENABLED(CONFIG_REGULATOR_TWL4030)) {
-			/* this is a template that gets copied */
-			struct regulator_init_data usb_fixed = {
-				.constraints.valid_modes_mask =
-					REGULATOR_MODE_NORMAL
-					| REGULATOR_MODE_STANDBY,
-				.constraints.valid_ops_mask =
-					REGULATOR_CHANGE_MODE
-					| REGULATOR_CHANGE_STATUS,
-			};
-
-			child = add_regulator_linked(TWL4030_REG_VUSB1V5,
-						      &usb_fixed, &usb1v5, 1,
-						      features);
-			if (IS_ERR(child))
-				return PTR_ERR(child);
-
-			child = add_regulator_linked(TWL4030_REG_VUSB1V8,
-						      &usb_fixed, &usb1v8, 1,
-						      features);
-			if (IS_ERR(child))
-				return PTR_ERR(child);
-
-			child = add_regulator_linked(TWL4030_REG_VUSB3V1,
-						      &usb_fixed, &usb3v1, 1,
-						      features);
-			if (IS_ERR(child))
-				return PTR_ERR(child);
-
-		}
-
-		child = add_child(TWL_MODULE_USB, "twl4030_usb",
-				pdata->usb, sizeof(*pdata->usb), true,
-				/* irq0 = USB_PRES, irq1 = USB */
-				irq_base + USB_PRES_INTR_OFFSET,
-				irq_base + USB_INTR_OFFSET);
-
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		/* we need to connect regulators to this transceiver */
-		if (IS_ENABLED(CONFIG_REGULATOR_TWL4030) && child) {
-			usb1v5.dev_name = dev_name(child);
-			usb1v8.dev_name = dev_name(child);
-			usb3v1.dev_name = dev_name(child);
-		}
-	}
-
-	if (IS_ENABLED(CONFIG_TWL4030_WATCHDOG) && twl_class_is_4030()) {
-		child = add_child(TWL_MODULE_PM_RECEIVER, "twl4030_wdt", NULL,
-				  0, false, 0, 0);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	if (IS_ENABLED(CONFIG_INPUT_TWL4030_PWRBUTTON) && twl_class_is_4030()) {
-		child = add_child(TWL_MODULE_PM_MASTER, "twl4030_pwrbutton",
-				  NULL, 0, true, irq_base + 8 + 0, 0);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	if (IS_ENABLED(CONFIG_MFD_TWL4030_AUDIO) && pdata->audio &&
-	    twl_class_is_4030()) {
-		child = add_child(TWL4030_MODULE_AUDIO_VOICE, "twl4030-audio",
-				pdata->audio, sizeof(*pdata->audio),
-				false, 0, 0);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	/* twl4030 regulators */
-	if (IS_ENABLED(CONFIG_REGULATOR_TWL4030) && twl_class_is_4030()) {
-		child = add_regulator(TWL4030_REG_VPLL1, pdata->vpll1,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VIO, pdata->vio,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VDD1, pdata->vdd1,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VDD2, pdata->vdd2,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VMMC1, pdata->vmmc1,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VDAC, pdata->vdac,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator((features & TWL4030_VAUX2)
-					? TWL4030_REG_VAUX2_4030
-					: TWL4030_REG_VAUX2,
-				pdata->vaux2, features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VINTANA1, pdata->vintana1,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VINTANA2, pdata->vintana2,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VINTDIG, pdata->vintdig,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	/* maybe add LDOs that are omitted on cost-reduced parts */
-	if (IS_ENABLED(CONFIG_REGULATOR_TWL4030) && !(features & TPS_SUBSET)
-	  && twl_class_is_4030()) {
-		child = add_regulator(TWL4030_REG_VPLL2, pdata->vpll2,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VMMC2, pdata->vmmc2,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VSIM, pdata->vsim,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VAUX1, pdata->vaux1,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VAUX3, pdata->vaux3,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VAUX4, pdata->vaux4,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	if (IS_ENABLED(CONFIG_CHARGER_TWL4030) && pdata->bci &&
-			!(features & (TPS_SUBSET | TWL5031))) {
-		child = add_child(TWL_MODULE_MAIN_CHARGE, "twl4030_bci",
-				pdata->bci, sizeof(*pdata->bci), false,
-				/* irq0 = CHG_PRES, irq1 = BCI */
-				irq_base + BCI_PRES_INTR_OFFSET,
-				irq_base + BCI_INTR_OFFSET);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	if (IS_ENABLED(CONFIG_TWL4030_POWER) && pdata->power) {
-		child = add_child(TWL_MODULE_PM_MASTER, "twl4030_power",
-				  pdata->power, sizeof(*pdata->power), false,
-				  0, 0);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	return 0;
-}
-
 /*----------------------------------------------------------------------*/
 
 /*
@@ -987,8 +684,7 @@ static inline int unprotect_pm_master(void)
 	return e;
 }
 
-static void clocks_init(struct device *dev,
-			struct twl4030_clock_init_data *clock)
+static void clocks_init(struct device *dev)
 {
 	int e = 0;
 	struct clk *osc;
@@ -1018,8 +714,6 @@ static void clocks_init(struct device *dev,
 	}
 
 	ctrl |= HIGH_PERF_SQ;
-	if (clock && clock->ck32k_lowpwr_enable)
-		ctrl |= CK32K_LOWPWR_EN;
 
 	e |= unprotect_pm_master();
 	/* effect->MADC+USB ck en */
@@ -1063,7 +757,6 @@ static struct of_dev_auxdata twl_auxdata_lookup[] = {
 static int
 twl_probe(struct i2c_client *client, const struct i2c_device_id *id)
 {
-	struct twl4030_platform_data	*pdata = dev_get_platdata(&client->dev);
 	struct device_node		*node = client->dev.of_node;
 	struct platform_device		*pdev;
 	const struct regmap_config	*twl_regmap_config;
@@ -1071,7 +764,7 @@ twl_probe(struct i2c_client *client, const struct i2c_device_id *id)
 	int				status;
 	unsigned			i, num_slaves;
 
-	if (!node && !pdata) {
+	if (!node) {
 		dev_err(&client->dev, "no platform data\n");
 		return -EINVAL;
 	}
@@ -1161,7 +854,7 @@ twl_probe(struct i2c_client *client, const struct i2c_device_id *id)
 	twl_priv->ready = true;
 
 	/* setup clock framework */
-	clocks_init(&client->dev, pdata ? pdata->clock : NULL);
+	clocks_init(&client->dev);
 
 	/* read TWL IDCODE Register */
 	if (twl_class_is_4030()) {
@@ -1209,14 +902,8 @@ twl_probe(struct i2c_client *client, const struct i2c_device_id *id)
 				 TWL4030_DCDC_GLOBAL_CFG);
 	}
 
-	if (node) {
-		if (pdata)
-			twl_auxdata_lookup[0].platform_data = pdata->gpio;
-		status = of_platform_populate(node, NULL, twl_auxdata_lookup,
-					      &client->dev);
-	} else {
-		status = add_children(pdata, irq_base, id->driver_data);
-	}
+	status = of_platform_populate(node, NULL, twl_auxdata_lookup,
+				      &client->dev);
 
 fail:
 	if (status < 0)
diff --git a/include/linux/mfd/twl.h b/include/linux/mfd/twl.h
index 8871cc5188a0..e426938eafd5 100644
--- a/include/linux/mfd/twl.h
+++ b/include/linux/mfd/twl.h
@@ -694,61 +694,6 @@ struct twl4030_audio_data {
 	unsigned int irq_base;
 };
 
-struct twl4030_platform_data {
-	struct twl4030_clock_init_data		*clock;
-	struct twl4030_bci_platform_data	*bci;
-	struct twl4030_gpio_platform_data	*gpio;
-	struct twl4030_madc_platform_data	*madc;
-	struct twl4030_keypad_data		*keypad;
-	struct twl4030_usb_data			*usb;
-	struct twl4030_power_data		*power;
-	struct twl4030_audio_data		*audio;
-
-	/* Common LDO regulators for TWL4030/TWL6030 */
-	struct regulator_init_data		*vdac;
-	struct regulator_init_data		*vaux1;
-	struct regulator_init_data		*vaux2;
-	struct regulator_init_data		*vaux3;
-	struct regulator_init_data		*vdd1;
-	struct regulator_init_data		*vdd2;
-	struct regulator_init_data		*vdd3;
-	/* TWL4030 LDO regulators */
-	struct regulator_init_data		*vpll1;
-	struct regulator_init_data		*vpll2;
-	struct regulator_init_data		*vmmc1;
-	struct regulator_init_data		*vmmc2;
-	struct regulator_init_data		*vsim;
-	struct regulator_init_data		*vaux4;
-	struct regulator_init_data		*vio;
-	struct regulator_init_data		*vintana1;
-	struct regulator_init_data		*vintana2;
-	struct regulator_init_data		*vintdig;
-	/* TWL6030 LDO regulators */
-	struct regulator_init_data              *vmmc;
-	struct regulator_init_data              *vpp;
-	struct regulator_init_data              *vusim;
-	struct regulator_init_data              *vana;
-	struct regulator_init_data              *vcxio;
-	struct regulator_init_data              *vusb;
-	struct regulator_init_data		*clk32kg;
-	struct regulator_init_data              *v1v8;
-	struct regulator_init_data              *v2v1;
-	/* TWL6032 LDO regulators */
-	struct regulator_init_data		*ldo1;
-	struct regulator_init_data		*ldo2;
-	struct regulator_init_data		*ldo3;
-	struct regulator_init_data		*ldo4;
-	struct regulator_init_data		*ldo5;
-	struct regulator_init_data		*ldo6;
-	struct regulator_init_data		*ldo7;
-	struct regulator_init_data		*ldoln;
-	struct regulator_init_data		*ldousb;
-	/* TWL6032 DCDC regulators */
-	struct regulator_init_data		*smps3;
-	struct regulator_init_data		*smps4;
-	struct regulator_init_data		*vio6025;
-};
-
 struct twl_regulator_driver_data {
 	int		(*set_voltage)(void *data, int target_uV);
 	int		(*get_voltage)(void *data);
-- 
cgit 


From c55333064d6ea9a26c7e8cfbe2ba1fa77c3a8e48 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Sun, 19 Jun 2022 10:26:55 +0200
Subject: mfd: tc6393xb: Make disable callback return void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All implementations return 0, so simplify accordingly.

This is a preparation for making platform remove callbacks return void.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220619082655.53728-1-u.kleine-koenig@pengutronix.de
---
 arch/arm/mach-pxa/eseries.c  | 3 +--
 arch/arm/mach-pxa/tosa.c     | 4 +---
 drivers/mfd/tc6393xb.c       | 5 ++---
 include/linux/mfd/tc6393xb.h | 2 +-
 4 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/arch/arm/mach-pxa/eseries.c b/arch/arm/mach-pxa/eseries.c
index 99781eec065e..6226aee152d3 100644
--- a/arch/arm/mach-pxa/eseries.c
+++ b/arch/arm/mach-pxa/eseries.c
@@ -86,11 +86,10 @@ int eseries_tmio_enable(struct platform_device *dev)
 	return 0;
 }
 
-int eseries_tmio_disable(struct platform_device *dev)
+void eseries_tmio_disable(struct platform_device *dev)
 {
 	gpio_set_value(GPIO_ESERIES_TMIO_SUSPEND, 0);
 	gpio_set_value(GPIO_ESERIES_TMIO_PCLR, 0);
-	return 0;
 }
 
 int eseries_tmio_suspend(struct platform_device *dev)
diff --git a/arch/arm/mach-pxa/tosa.c b/arch/arm/mach-pxa/tosa.c
index 6af8bc404825..d41641d6cfcd 100644
--- a/arch/arm/mach-pxa/tosa.c
+++ b/arch/arm/mach-pxa/tosa.c
@@ -678,13 +678,11 @@ err_req_pclr:
 	return rc;
 }
 
-static int tosa_tc6393xb_disable(struct platform_device *dev)
+static void tosa_tc6393xb_disable(struct platform_device *dev)
 {
 	gpio_free(TOSA_GPIO_TC6393XB_L3V_ON);
 	gpio_free(TOSA_GPIO_TC6393XB_SUSPEND);
 	gpio_free(TOSA_GPIO_TC6393XB_REST_IN);
-
-	return 0;
 }
 
 static int tosa_tc6393xb_resume(struct platform_device *dev)
diff --git a/drivers/mfd/tc6393xb.c b/drivers/mfd/tc6393xb.c
index 0be5731685b4..aa903a31dd43 100644
--- a/drivers/mfd/tc6393xb.c
+++ b/drivers/mfd/tc6393xb.c
@@ -798,20 +798,19 @@ static int tc6393xb_remove(struct platform_device *dev)
 {
 	struct tc6393xb_platform_data *tcpd = dev_get_platdata(&dev->dev);
 	struct tc6393xb *tc6393xb = platform_get_drvdata(dev);
-	int ret;
 
 	mfd_remove_devices(&dev->dev);
 
 	tc6393xb_detach_irq(dev);
 
-	ret = tcpd->disable(dev);
+	tcpd->disable(dev);
 	clk_disable_unprepare(tc6393xb->clk);
 	iounmap(tc6393xb->scr);
 	release_resource(&tc6393xb->rscr);
 	clk_put(tc6393xb->clk);
 	kfree(tc6393xb);
 
-	return ret;
+	return 0;
 }
 
 #ifdef CONFIG_PM
diff --git a/include/linux/mfd/tc6393xb.h b/include/linux/mfd/tc6393xb.h
index d336c541b7df..d17807f2d0c9 100644
--- a/include/linux/mfd/tc6393xb.h
+++ b/include/linux/mfd/tc6393xb.h
@@ -22,7 +22,7 @@ struct tc6393xb_platform_data {
 	u16	scr_gper;	/* GP Enable */
 
 	int	(*enable)(struct platform_device *dev);
-	int	(*disable)(struct platform_device *dev);
+	void	(*disable)(struct platform_device *dev);
 	int	(*suspend)(struct platform_device *dev);
 	int	(*resume)(struct platform_device *dev);
 
-- 
cgit 


From 79f821b5a3bf46d2d5ee2dc0e2b2428b1062a040 Mon Sep 17 00:00:00 2001
From: Zhang Jiaming <jiaming@nfschina.com>
Date: Thu, 23 Jun 2022 15:33:33 +0800
Subject: mfd: ipaq-micro: Fix spelling mistake of "receive{d}"

Change 'receieved' to 'received' and 'recieve' to 'receive'.

Signed-off-by: Zhang Jiaming <jiaming@nfschina.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220623073333.5675-1-jiaming@nfschina.com
---
 include/linux/mfd/ipaq-micro.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mfd/ipaq-micro.h b/include/linux/mfd/ipaq-micro.h
index ee48a4321c57..d5caa4c86ecc 100644
--- a/include/linux/mfd/ipaq-micro.h
+++ b/include/linux/mfd/ipaq-micro.h
@@ -75,8 +75,8 @@ struct ipaq_micro_rxdev {
  * @id: 4-bit ID of the message
  * @tx_len: length of TX data
  * @tx_data: TX data to send
- * @rx_len: length of receieved RX data
- * @rx_data: RX data to recieve
+ * @rx_len: length of received RX data
+ * @rx_data: RX data to receive
  * @ack: a completion that will be completed when RX is complete
  * @node: list node if message gets queued
  */
-- 
cgit 


From d9cd0bc604705eb01497c3e483f8820a9677c682 Mon Sep 17 00:00:00 2001
From: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Date: Mon, 27 Jun 2022 14:39:54 +0200
Subject: mfd: mt6397: Add basic support for MT6331+MT6332 PMIC

Add support for the MT6331 PMIC with MT6332 Companion PMIC, found
in MT6795 Helio X10 smartphone platforms.

This combo has support for multiple devices but, for a start,
only the following have been implemented:
- Regulators (two instances, one in MT6331, one in MT6332)
- RTC (MT6331)
- Keys (MT6331)
- Interrupts (MT6331 also dispatches MT6332's interrupts)

There's more to be implemented, especially for MT6332, which
will come at a later stage.

Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Reviewed-by: Matthias Brugger <matthias.bgg@gmail.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220627123954.64299-1-angelogioacchino.delregno@collabora.com
---
 drivers/mfd/mt6397-core.c            |  47 +++
 drivers/mfd/mt6397-irq.c             |   9 +-
 include/linux/mfd/mt6331/core.h      |  40 +++
 include/linux/mfd/mt6331/registers.h | 584 +++++++++++++++++++++++++++++++
 include/linux/mfd/mt6332/core.h      |  65 ++++
 include/linux/mfd/mt6332/registers.h | 642 +++++++++++++++++++++++++++++++++++
 include/linux/mfd/mt6397/core.h      |   2 +
 7 files changed, 1388 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/mfd/mt6331/core.h
 create mode 100644 include/linux/mfd/mt6331/registers.h
 create mode 100644 include/linux/mfd/mt6332/core.h
 create mode 100644 include/linux/mfd/mt6332/registers.h

(limited to 'include')

diff --git a/drivers/mfd/mt6397-core.c b/drivers/mfd/mt6397-core.c
index 3cb8836bd08d..f6c1f80f94a4 100644
--- a/drivers/mfd/mt6397-core.c
+++ b/drivers/mfd/mt6397-core.c
@@ -12,11 +12,13 @@
 #include <linux/regmap.h>
 #include <linux/mfd/core.h>
 #include <linux/mfd/mt6323/core.h>
+#include <linux/mfd/mt6331/core.h>
 #include <linux/mfd/mt6357/core.h>
 #include <linux/mfd/mt6358/core.h>
 #include <linux/mfd/mt6359/core.h>
 #include <linux/mfd/mt6397/core.h>
 #include <linux/mfd/mt6323/registers.h>
+#include <linux/mfd/mt6331/registers.h>
 #include <linux/mfd/mt6357/registers.h>
 #include <linux/mfd/mt6358/registers.h>
 #include <linux/mfd/mt6359/registers.h>
@@ -28,6 +30,9 @@
 #define MT6357_RTC_BASE		0x0588
 #define MT6357_RTC_SIZE		0x3c
 
+#define MT6331_RTC_BASE		0x4000
+#define MT6331_RTC_SIZE		0x40
+
 #define MT6358_RTC_BASE		0x0588
 #define MT6358_RTC_SIZE		0x3c
 
@@ -47,6 +52,11 @@ static const struct resource mt6357_rtc_resources[] = {
 	DEFINE_RES_IRQ(MT6357_IRQ_RTC),
 };
 
+static const struct resource mt6331_rtc_resources[] = {
+	DEFINE_RES_MEM(MT6331_RTC_BASE, MT6331_RTC_SIZE),
+	DEFINE_RES_IRQ(MT6331_IRQ_STATUS_RTC),
+};
+
 static const struct resource mt6358_rtc_resources[] = {
 	DEFINE_RES_MEM(MT6358_RTC_BASE, MT6358_RTC_SIZE),
 	DEFINE_RES_IRQ(MT6358_IRQ_RTC),
@@ -83,6 +93,11 @@ static const struct resource mt6357_keys_resources[] = {
 	DEFINE_RES_IRQ_NAMED(MT6357_IRQ_HOMEKEY_R, "homekey_r"),
 };
 
+static const struct resource mt6331_keys_resources[] = {
+	DEFINE_RES_IRQ_NAMED(MT6331_IRQ_STATUS_PWRKEY, "powerkey"),
+	DEFINE_RES_IRQ_NAMED(MT6331_IRQ_STATUS_HOMEKEY, "homekey"),
+};
+
 static const struct resource mt6397_keys_resources[] = {
 	DEFINE_RES_IRQ_NAMED(MT6397_IRQ_PWRKEY, "powerkey"),
 	DEFINE_RES_IRQ_NAMED(MT6397_IRQ_HOMEKEY, "homekey"),
@@ -133,6 +148,27 @@ static const struct mfd_cell mt6357_devs[] = {
 	},
 };
 
+/* MT6331 is always used in combination with MT6332 */
+static const struct mfd_cell mt6331_mt6332_devs[] = {
+	{
+		.name = "mt6331-rtc",
+		.num_resources = ARRAY_SIZE(mt6331_rtc_resources),
+		.resources = mt6331_rtc_resources,
+		.of_compatible = "mediatek,mt6331-rtc",
+	}, {
+		.name = "mt6331-regulator",
+		.of_compatible = "mediatek,mt6331-regulator"
+	}, {
+		.name = "mt6332-regulator",
+		.of_compatible = "mediatek,mt6332-regulator"
+	}, {
+		.name = "mtk-pmic-keys",
+		.num_resources = ARRAY_SIZE(mt6331_keys_resources),
+		.resources = mt6331_keys_resources,
+		.of_compatible = "mediatek,mt6331-keys"
+	},
+};
+
 static const struct mfd_cell mt6358_devs[] = {
 	{
 		.name = "mt6358-regulator",
@@ -220,6 +256,14 @@ static const struct chip_data mt6357_core = {
 	.irq_init = mt6358_irq_init,
 };
 
+static const struct chip_data mt6331_mt6332_core = {
+	.cid_addr = MT6331_HWCID,
+	.cid_shift = 0,
+	.cells = mt6331_mt6332_devs,
+	.cell_size = ARRAY_SIZE(mt6331_mt6332_devs),
+	.irq_init = mt6397_irq_init,
+};
+
 static const struct chip_data mt6358_core = {
 	.cid_addr = MT6358_SWCID,
 	.cid_shift = 8,
@@ -302,6 +346,9 @@ static const struct of_device_id mt6397_of_match[] = {
 	{
 		.compatible = "mediatek,mt6323",
 		.data = &mt6323_core,
+	}, {
+		.compatible = "mediatek,mt6331",
+		.data = &mt6331_mt6332_core,
 	}, {
 		.compatible = "mediatek,mt6357",
 		.data = &mt6357_core,
diff --git a/drivers/mfd/mt6397-irq.c b/drivers/mfd/mt6397-irq.c
index 2924919da991..eff53fed8fe7 100644
--- a/drivers/mfd/mt6397-irq.c
+++ b/drivers/mfd/mt6397-irq.c
@@ -12,6 +12,8 @@
 #include <linux/suspend.h>
 #include <linux/mfd/mt6323/core.h>
 #include <linux/mfd/mt6323/registers.h>
+#include <linux/mfd/mt6331/core.h>
+#include <linux/mfd/mt6331/registers.h>
 #include <linux/mfd/mt6397/core.h>
 #include <linux/mfd/mt6397/registers.h>
 
@@ -172,7 +174,12 @@ int mt6397_irq_init(struct mt6397_chip *chip)
 		chip->int_status[0] = MT6323_INT_STATUS0;
 		chip->int_status[1] = MT6323_INT_STATUS1;
 		break;
-
+	case MT6331_CHIP_ID:
+		chip->int_con[0] = MT6331_INT_CON0;
+		chip->int_con[1] = MT6331_INT_CON1;
+		chip->int_status[0] = MT6331_INT_STATUS_CON0;
+		chip->int_status[1] = MT6331_INT_STATUS_CON1;
+		break;
 	case MT6391_CHIP_ID:
 	case MT6397_CHIP_ID:
 		chip->int_con[0] = MT6397_INT_CON0;
diff --git a/include/linux/mfd/mt6331/core.h b/include/linux/mfd/mt6331/core.h
new file mode 100644
index 000000000000..df8e6b1e4bc1
--- /dev/null
+++ b/include/linux/mfd/mt6331/core.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2022 AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
+ */
+
+#ifndef __MFD_MT6331_CORE_H__
+#define __MFD_MT6331_CORE_H__
+
+enum mt6331_irq_status_numbers {
+	MT6331_IRQ_STATUS_PWRKEY = 0,
+	MT6331_IRQ_STATUS_HOMEKEY,
+	MT6331_IRQ_STATUS_CHRDET,
+	MT6331_IRQ_STATUS_THR_H,
+	MT6331_IRQ_STATUS_THR_L,
+	MT6331_IRQ_STATUS_BAT_H,
+	MT6331_IRQ_STATUS_BAT_L,
+	MT6331_IRQ_STATUS_RTC,
+	MT6331_IRQ_STATUS_AUDIO,
+	MT6331_IRQ_STATUS_MAD,
+	MT6331_IRQ_STATUS_ACCDET,
+	MT6331_IRQ_STATUS_ACCDET_EINT,
+	MT6331_IRQ_STATUS_ACCDET_NEGV = 12,
+	MT6331_IRQ_STATUS_VDVFS11_OC = 16,
+	MT6331_IRQ_STATUS_VDVFS12_OC,
+	MT6331_IRQ_STATUS_VDVFS13_OC,
+	MT6331_IRQ_STATUS_VDVFS14_OC,
+	MT6331_IRQ_STATUS_GPU_OC,
+	MT6331_IRQ_STATUS_VCORE1_OC,
+	MT6331_IRQ_STATUS_VCORE2_OC,
+	MT6331_IRQ_STATUS_VIO18_OC,
+	MT6331_IRQ_STATUS_LDO_OC,
+	MT6331_IRQ_STATUS_NR,
+};
+
+#define MT6331_IRQ_CON0_BASE	MT6331_IRQ_STATUS_PWRKEY
+#define MT6331_IRQ_CON0_BITS	(MT6331_IRQ_STATUS_ACCDET_NEGV + 1)
+#define MT6331_IRQ_CON1_BASE	MT6331_IRQ_STATUS_VDVFS11_OC
+#define MT6331_IRQ_CON1_BITS	(MT6331_IRQ_STATUS_LDO_OC - MT6331_IRQ_STATUS_VDFS11_OC + 1)
+
+#endif /* __MFD_MT6331_CORE_H__ */
diff --git a/include/linux/mfd/mt6331/registers.h b/include/linux/mfd/mt6331/registers.h
new file mode 100644
index 000000000000..e2be6bccd1a7
--- /dev/null
+++ b/include/linux/mfd/mt6331/registers.h
@@ -0,0 +1,584 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2022 AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
+ */
+
+#ifndef __MFD_MT6331_REGISTERS_H__
+#define __MFD_MT6331_REGISTERS_H__
+
+/* PMIC Registers */
+#define MT6331_STRUP_CON0		0x0
+#define MT6331_STRUP_CON2		0x2
+#define MT6331_STRUP_CON3		0x4
+#define MT6331_STRUP_CON4		0x6
+#define MT6331_STRUP_CON5		0x8
+#define MT6331_STRUP_CON6		0xA
+#define MT6331_STRUP_CON7		0xC
+#define MT6331_STRUP_CON8		0xE
+#define MT6331_STRUP_CON9		0x10
+#define MT6331_STRUP_CON10		0x12
+#define MT6331_STRUP_CON11		0x14
+#define MT6331_STRUP_CON12		0x16
+#define MT6331_STRUP_CON13		0x18
+#define MT6331_STRUP_CON14		0x1A
+#define MT6331_STRUP_CON15		0x1C
+#define MT6331_STRUP_CON16		0x1E
+#define MT6331_STRUP_CON17		0x20
+#define MT6331_STRUP_CON18		0x22
+#define MT6331_HWCID			0x100
+#define MT6331_SWCID			0x102
+#define MT6331_EXT_PMIC_STATUS		0x104
+#define MT6331_TOP_CON			0x106
+#define MT6331_TEST_OUT			0x108
+#define MT6331_TEST_CON0		0x10A
+#define MT6331_TEST_CON1		0x10C
+#define MT6331_TESTMODE_SW		0x10E
+#define MT6331_EN_STATUS0		0x110
+#define MT6331_EN_STATUS1		0x112
+#define MT6331_EN_STATUS2		0x114
+#define MT6331_OCSTATUS0		0x116
+#define MT6331_OCSTATUS1		0x118
+#define MT6331_OCSTATUS2		0x11A
+#define MT6331_PGSTATUS			0x11C
+#define MT6331_TOPSTATUS		0x11E
+#define MT6331_TDSEL_CON		0x120
+#define MT6331_RDSEL_CON		0x122
+#define MT6331_SMT_CON0			0x124
+#define MT6331_SMT_CON1			0x126
+#define MT6331_SMT_CON2			0x128
+#define MT6331_DRV_CON0			0x12A
+#define MT6331_DRV_CON1			0x12C
+#define MT6331_DRV_CON2			0x12E
+#define MT6331_DRV_CON3			0x130
+#define MT6331_TOP_STATUS		0x132
+#define MT6331_TOP_STATUS_SET		0x134
+#define MT6331_TOP_STATUS_CLR		0x136
+#define MT6331_TOP_CKPDN_CON0		0x138
+#define MT6331_TOP_CKPDN_CON0_SET	0x13A
+#define MT6331_TOP_CKPDN_CON0_CLR	0x13C
+#define MT6331_TOP_CKPDN_CON1		0x13E
+#define MT6331_TOP_CKPDN_CON1_SET	0x140
+#define MT6331_TOP_CKPDN_CON1_CLR	0x142
+#define MT6331_TOP_CKPDN_CON2		0x144
+#define MT6331_TOP_CKPDN_CON2_SET	0x146
+#define MT6331_TOP_CKPDN_CON2_CLR	0x148
+#define MT6331_TOP_CKSEL_CON		0x14A
+#define MT6331_TOP_CKSEL_CON_SET	0x14C
+#define MT6331_TOP_CKSEL_CON_CLR	0x14E
+#define MT6331_TOP_CKHWEN_CON		0x150
+#define MT6331_TOP_CKHWEN_CON_SET	0x152
+#define MT6331_TOP_CKHWEN_CON_CLR	0x154
+#define MT6331_TOP_CKTST_CON0		0x156
+#define MT6331_TOP_CKTST_CON1		0x158
+#define MT6331_TOP_CLKSQ		0x15A
+#define MT6331_TOP_CLKSQ_SET		0x15C
+#define MT6331_TOP_CLKSQ_CLR		0x15E
+#define MT6331_TOP_RST_CON		0x160
+#define MT6331_TOP_RST_CON_SET		0x162
+#define MT6331_TOP_RST_CON_CLR		0x164
+#define MT6331_TOP_RST_MISC		0x166
+#define MT6331_TOP_RST_MISC_SET		0x168
+#define MT6331_TOP_RST_MISC_CLR		0x16A
+#define MT6331_INT_CON0			0x16C
+#define MT6331_INT_CON0_SET		0x16E
+#define MT6331_INT_CON0_CLR		0x170
+#define MT6331_INT_CON1			0x172
+#define MT6331_INT_CON1_SET		0x174
+#define MT6331_INT_CON1_CLR		0x176
+#define MT6331_INT_MISC_CON		0x178
+#define MT6331_INT_MISC_CON_SET		0x17A
+#define MT6331_INT_MISC_CON_CLR		0x17C
+#define MT6331_INT_STATUS_CON0		0x17E
+#define MT6331_INT_STATUS_CON1		0x180
+#define MT6331_OC_GEAR_0		0x182
+#define MT6331_FQMTR_CON0		0x184
+#define MT6331_FQMTR_CON1		0x186
+#define MT6331_FQMTR_CON2		0x188
+#define MT6331_RG_SPI_CON		0x18A
+#define MT6331_DEW_DIO_EN		0x18C
+#define MT6331_DEW_READ_TEST		0x18E
+#define MT6331_DEW_WRITE_TEST		0x190
+#define MT6331_DEW_CRC_SWRST		0x192
+#define MT6331_DEW_CRC_EN		0x194
+#define MT6331_DEW_CRC_VAL		0x196
+#define MT6331_DEW_DBG_MON_SEL		0x198
+#define MT6331_DEW_CIPHER_KEY_SEL	0x19A
+#define MT6331_DEW_CIPHER_IV_SEL	0x19C
+#define MT6331_DEW_CIPHER_EN		0x19E
+#define MT6331_DEW_CIPHER_RDY		0x1A0
+#define MT6331_DEW_CIPHER_MODE		0x1A2
+#define MT6331_DEW_CIPHER_SWRST		0x1A4
+#define MT6331_DEW_RDDMY_NO		0x1A6
+#define MT6331_INT_TYPE_CON0		0x1A8
+#define MT6331_INT_TYPE_CON0_SET	0x1AA
+#define MT6331_INT_TYPE_CON0_CLR	0x1AC
+#define MT6331_INT_TYPE_CON1		0x1AE
+#define MT6331_INT_TYPE_CON1_SET	0x1B0
+#define MT6331_INT_TYPE_CON1_CLR	0x1B2
+#define MT6331_INT_STA			0x1B4
+#define MT6331_BUCK_ALL_CON0		0x200
+#define MT6331_BUCK_ALL_CON1		0x202
+#define MT6331_BUCK_ALL_CON2		0x204
+#define MT6331_BUCK_ALL_CON3		0x206
+#define MT6331_BUCK_ALL_CON4		0x208
+#define MT6331_BUCK_ALL_CON5		0x20A
+#define MT6331_BUCK_ALL_CON6		0x20C
+#define MT6331_BUCK_ALL_CON7		0x20E
+#define MT6331_BUCK_ALL_CON8		0x210
+#define MT6331_BUCK_ALL_CON9		0x212
+#define MT6331_BUCK_ALL_CON10		0x214
+#define MT6331_BUCK_ALL_CON11		0x216
+#define MT6331_BUCK_ALL_CON12		0x218
+#define MT6331_BUCK_ALL_CON13		0x21A
+#define MT6331_BUCK_ALL_CON14		0x21C
+#define MT6331_BUCK_ALL_CON15		0x21E
+#define MT6331_BUCK_ALL_CON16		0x220
+#define MT6331_BUCK_ALL_CON17		0x222
+#define MT6331_BUCK_ALL_CON18		0x224
+#define MT6331_BUCK_ALL_CON19		0x226
+#define MT6331_BUCK_ALL_CON20		0x228
+#define MT6331_BUCK_ALL_CON21		0x22A
+#define MT6331_BUCK_ALL_CON22		0x22C
+#define MT6331_BUCK_ALL_CON23		0x22E
+#define MT6331_BUCK_ALL_CON24		0x230
+#define MT6331_BUCK_ALL_CON25		0x232
+#define MT6331_BUCK_ALL_CON26		0x234
+#define MT6331_VDVFS11_CON0		0x236
+#define MT6331_VDVFS11_CON1		0x238
+#define MT6331_VDVFS11_CON2		0x23A
+#define MT6331_VDVFS11_CON3		0x23C
+#define MT6331_VDVFS11_CON4		0x23E
+#define MT6331_VDVFS11_CON5		0x240
+#define MT6331_VDVFS11_CON6		0x242
+#define MT6331_VDVFS11_CON7		0x244
+#define MT6331_VDVFS11_CON8		0x246
+#define MT6331_VDVFS11_CON9		0x248
+#define MT6331_VDVFS11_CON10		0x24A
+#define MT6331_VDVFS11_CON11		0x24C
+#define MT6331_VDVFS11_CON12		0x24E
+#define MT6331_VDVFS11_CON13		0x250
+#define MT6331_VDVFS11_CON14		0x252
+#define MT6331_VDVFS11_CON18		0x25A
+#define MT6331_VDVFS11_CON19		0x25C
+#define MT6331_VDVFS11_CON20		0x25E
+#define MT6331_VDVFS11_CON21		0x260
+#define MT6331_VDVFS11_CON22		0x262
+#define MT6331_VDVFS11_CON23		0x264
+#define MT6331_VDVFS11_CON24		0x266
+#define MT6331_VDVFS11_CON25		0x268
+#define MT6331_VDVFS11_CON26		0x26A
+#define MT6331_VDVFS11_CON27		0x26C
+#define MT6331_VDVFS12_CON0		0x26E
+#define MT6331_VDVFS12_CON1		0x270
+#define MT6331_VDVFS12_CON2		0x272
+#define MT6331_VDVFS12_CON3		0x274
+#define MT6331_VDVFS12_CON4		0x276
+#define MT6331_VDVFS12_CON5		0x278
+#define MT6331_VDVFS12_CON6		0x27A
+#define MT6331_VDVFS12_CON7		0x27C
+#define MT6331_VDVFS12_CON8		0x27E
+#define MT6331_VDVFS12_CON9		0x280
+#define MT6331_VDVFS12_CON10		0x282
+#define MT6331_VDVFS12_CON11		0x284
+#define MT6331_VDVFS12_CON12		0x286
+#define MT6331_VDVFS12_CON13		0x288
+#define MT6331_VDVFS12_CON14		0x28A
+#define MT6331_VDVFS12_CON18		0x292
+#define MT6331_VDVFS12_CON19		0x294
+#define MT6331_VDVFS12_CON20		0x296
+#define MT6331_VDVFS13_CON0		0x298
+#define MT6331_VDVFS13_CON1		0x29A
+#define MT6331_VDVFS13_CON2		0x29C
+#define MT6331_VDVFS13_CON3		0x29E
+#define MT6331_VDVFS13_CON4		0x2A0
+#define MT6331_VDVFS13_CON5		0x2A2
+#define MT6331_VDVFS13_CON6		0x2A4
+#define MT6331_VDVFS13_CON7		0x2A6
+#define MT6331_VDVFS13_CON8		0x2A8
+#define MT6331_VDVFS13_CON9		0x2AA
+#define MT6331_VDVFS13_CON10		0x2AC
+#define MT6331_VDVFS13_CON11		0x2AE
+#define MT6331_VDVFS13_CON12		0x2B0
+#define MT6331_VDVFS13_CON13		0x2B2
+#define MT6331_VDVFS13_CON14		0x2B4
+#define MT6331_VDVFS13_CON18		0x2BC
+#define MT6331_VDVFS13_CON19		0x2BE
+#define MT6331_VDVFS13_CON20		0x2C0
+#define MT6331_VDVFS14_CON0		0x2C2
+#define MT6331_VDVFS14_CON1		0x2C4
+#define MT6331_VDVFS14_CON2		0x2C6
+#define MT6331_VDVFS14_CON3		0x2C8
+#define MT6331_VDVFS14_CON4		0x2CA
+#define MT6331_VDVFS14_CON5		0x2CC
+#define MT6331_VDVFS14_CON6		0x2CE
+#define MT6331_VDVFS14_CON7		0x2D0
+#define MT6331_VDVFS14_CON8		0x2D2
+#define MT6331_VDVFS14_CON9		0x2D4
+#define MT6331_VDVFS14_CON10		0x2D6
+#define MT6331_VDVFS14_CON11		0x2D8
+#define MT6331_VDVFS14_CON12		0x2DA
+#define MT6331_VDVFS14_CON13		0x2DC
+#define MT6331_VDVFS14_CON14		0x2DE
+#define MT6331_VDVFS14_CON18		0x2E6
+#define MT6331_VDVFS14_CON19		0x2E8
+#define MT6331_VDVFS14_CON20		0x2EA
+#define MT6331_VGPU_CON0		0x300
+#define MT6331_VGPU_CON1		0x302
+#define MT6331_VGPU_CON2		0x304
+#define MT6331_VGPU_CON3		0x306
+#define MT6331_VGPU_CON4		0x308
+#define MT6331_VGPU_CON5		0x30A
+#define MT6331_VGPU_CON6		0x30C
+#define MT6331_VGPU_CON7		0x30E
+#define MT6331_VGPU_CON8		0x310
+#define MT6331_VGPU_CON9		0x312
+#define MT6331_VGPU_CON10		0x314
+#define MT6331_VGPU_CON11		0x316
+#define MT6331_VGPU_CON12		0x318
+#define MT6331_VGPU_CON13		0x31A
+#define MT6331_VGPU_CON14		0x31C
+#define MT6331_VGPU_CON15		0x31E
+#define MT6331_VGPU_CON16		0x320
+#define MT6331_VGPU_CON17		0x322
+#define MT6331_VGPU_CON18		0x324
+#define MT6331_VGPU_CON19		0x326
+#define MT6331_VGPU_CON20		0x328
+#define MT6331_VCORE1_CON0		0x32A
+#define MT6331_VCORE1_CON1		0x32C
+#define MT6331_VCORE1_CON2		0x32E
+#define MT6331_VCORE1_CON3		0x330
+#define MT6331_VCORE1_CON4		0x332
+#define MT6331_VCORE1_CON5		0x334
+#define MT6331_VCORE1_CON6		0x336
+#define MT6331_VCORE1_CON7		0x338
+#define MT6331_VCORE1_CON8		0x33A
+#define MT6331_VCORE1_CON9		0x33C
+#define MT6331_VCORE1_CON10		0x33E
+#define MT6331_VCORE1_CON11		0x340
+#define MT6331_VCORE1_CON12		0x342
+#define MT6331_VCORE1_CON13		0x344
+#define MT6331_VCORE1_CON14		0x346
+#define MT6331_VCORE1_CON15		0x348
+#define MT6331_VCORE1_CON16		0x34A
+#define MT6331_VCORE1_CON17		0x34C
+#define MT6331_VCORE1_CON18		0x34E
+#define MT6331_VCORE1_CON19		0x350
+#define MT6331_VCORE1_CON20		0x352
+#define MT6331_VCORE2_CON0		0x354
+#define MT6331_VCORE2_CON1		0x356
+#define MT6331_VCORE2_CON2		0x358
+#define MT6331_VCORE2_CON3		0x35A
+#define MT6331_VCORE2_CON4		0x35C
+#define MT6331_VCORE2_CON5		0x35E
+#define MT6331_VCORE2_CON6		0x360
+#define MT6331_VCORE2_CON7		0x362
+#define MT6331_VCORE2_CON8		0x364
+#define MT6331_VCORE2_CON9		0x366
+#define MT6331_VCORE2_CON10		0x368
+#define MT6331_VCORE2_CON11		0x36A
+#define MT6331_VCORE2_CON12		0x36C
+#define MT6331_VCORE2_CON13		0x36E
+#define MT6331_VCORE2_CON14		0x370
+#define MT6331_VCORE2_CON15		0x372
+#define MT6331_VCORE2_CON16		0x374
+#define MT6331_VCORE2_CON17		0x376
+#define MT6331_VCORE2_CON18		0x378
+#define MT6331_VCORE2_CON19		0x37A
+#define MT6331_VCORE2_CON20		0x37C
+#define MT6331_VCORE2_CON21		0x37E
+#define MT6331_VIO18_CON0		0x380
+#define MT6331_VIO18_CON1		0x382
+#define MT6331_VIO18_CON2		0x384
+#define MT6331_VIO18_CON3		0x386
+#define MT6331_VIO18_CON4		0x388
+#define MT6331_VIO18_CON5		0x38A
+#define MT6331_VIO18_CON6		0x38C
+#define MT6331_VIO18_CON7		0x38E
+#define MT6331_VIO18_CON8		0x390
+#define MT6331_VIO18_CON9		0x392
+#define MT6331_VIO18_CON10		0x394
+#define MT6331_VIO18_CON11		0x396
+#define MT6331_VIO18_CON12		0x398
+#define MT6331_VIO18_CON13		0x39A
+#define MT6331_VIO18_CON14		0x39C
+#define MT6331_VIO18_CON15		0x39E
+#define MT6331_VIO18_CON16		0x3A0
+#define MT6331_VIO18_CON17		0x3A2
+#define MT6331_VIO18_CON18		0x3A4
+#define MT6331_VIO18_CON19		0x3A6
+#define MT6331_VIO18_CON20		0x3A8
+#define MT6331_BUCK_K_CON0		0x3AA
+#define MT6331_BUCK_K_CON1		0x3AC
+#define MT6331_BUCK_K_CON2		0x3AE
+#define MT6331_BUCK_K_CON3		0x3B0
+#define MT6331_ZCD_CON0			0x400
+#define MT6331_ZCD_CON1			0x402
+#define MT6331_ZCD_CON2			0x404
+#define MT6331_ZCD_CON3			0x406
+#define MT6331_ZCD_CON4			0x408
+#define MT6331_ZCD_CON5			0x40A
+#define MT6331_ISINK0_CON0		0x40C
+#define MT6331_ISINK0_CON1		0x40E
+#define MT6331_ISINK0_CON2		0x410
+#define MT6331_ISINK0_CON3		0x412
+#define MT6331_ISINK0_CON4		0x414
+#define MT6331_ISINK1_CON0		0x416
+#define MT6331_ISINK1_CON1		0x418
+#define MT6331_ISINK1_CON2		0x41A
+#define MT6331_ISINK1_CON3		0x41C
+#define MT6331_ISINK1_CON4		0x41E
+#define MT6331_ISINK2_CON0		0x420
+#define MT6331_ISINK2_CON1		0x422
+#define MT6331_ISINK2_CON2		0x424
+#define MT6331_ISINK2_CON3		0x426
+#define MT6331_ISINK2_CON4		0x428
+#define MT6331_ISINK3_CON0		0x42A
+#define MT6331_ISINK3_CON1		0x42C
+#define MT6331_ISINK3_CON2		0x42E
+#define MT6331_ISINK3_CON3		0x430
+#define MT6331_ISINK3_CON4		0x432
+#define MT6331_ISINK_ANA0		0x434
+#define MT6331_ISINK_ANA1		0x436
+#define MT6331_ISINK_PHASE_DLY		0x438
+#define MT6331_ISINK_EN_CTRL		0x43A
+#define MT6331_ANALDO_CON0		0x500
+#define MT6331_ANALDO_CON1		0x502
+#define MT6331_ANALDO_CON2		0x504
+#define MT6331_ANALDO_CON3		0x506
+#define MT6331_ANALDO_CON4		0x508
+#define MT6331_ANALDO_CON5		0x50A
+#define MT6331_ANALDO_CON6		0x50C
+#define MT6331_ANALDO_CON7		0x50E
+#define MT6331_ANALDO_CON8		0x510
+#define MT6331_ANALDO_CON9		0x512
+#define MT6331_ANALDO_CON10		0x514
+#define MT6331_ANALDO_CON11		0x516
+#define MT6331_ANALDO_CON12		0x518
+#define MT6331_ANALDO_CON13		0x51A
+#define MT6331_SYSLDO_CON0		0x51C
+#define MT6331_SYSLDO_CON1		0x51E
+#define MT6331_SYSLDO_CON2		0x520
+#define MT6331_SYSLDO_CON3		0x522
+#define MT6331_SYSLDO_CON4		0x524
+#define MT6331_SYSLDO_CON5		0x526
+#define MT6331_SYSLDO_CON6		0x528
+#define MT6331_SYSLDO_CON7		0x52A
+#define MT6331_SYSLDO_CON8		0x52C
+#define MT6331_SYSLDO_CON9		0x52E
+#define MT6331_SYSLDO_CON10		0x530
+#define MT6331_SYSLDO_CON11		0x532
+#define MT6331_SYSLDO_CON12		0x534
+#define MT6331_SYSLDO_CON13		0x536
+#define MT6331_SYSLDO_CON14		0x538
+#define MT6331_SYSLDO_CON15		0x53A
+#define MT6331_SYSLDO_CON16		0x53C
+#define MT6331_SYSLDO_CON17		0x53E
+#define MT6331_SYSLDO_CON18		0x540
+#define MT6331_SYSLDO_CON19		0x542
+#define MT6331_SYSLDO_CON20		0x544
+#define MT6331_SYSLDO_CON21		0x546
+#define MT6331_DIGLDO_CON0		0x548
+#define MT6331_DIGLDO_CON1		0x54A
+#define MT6331_DIGLDO_CON2		0x54C
+#define MT6331_DIGLDO_CON3		0x54E
+#define MT6331_DIGLDO_CON4		0x550
+#define MT6331_DIGLDO_CON5		0x552
+#define MT6331_DIGLDO_CON6		0x554
+#define MT6331_DIGLDO_CON7		0x556
+#define MT6331_DIGLDO_CON8		0x558
+#define MT6331_DIGLDO_CON9		0x55A
+#define MT6331_DIGLDO_CON10		0x55C
+#define MT6331_DIGLDO_CON11		0x55E
+#define MT6331_DIGLDO_CON12		0x560
+#define MT6331_DIGLDO_CON13		0x562
+#define MT6331_DIGLDO_CON14		0x564
+#define MT6331_DIGLDO_CON15		0x566
+#define MT6331_DIGLDO_CON16		0x568
+#define MT6331_DIGLDO_CON17		0x56A
+#define MT6331_DIGLDO_CON18		0x56C
+#define MT6331_DIGLDO_CON19		0x56E
+#define MT6331_DIGLDO_CON20		0x570
+#define MT6331_DIGLDO_CON21		0x572
+#define MT6331_DIGLDO_CON22		0x574
+#define MT6331_DIGLDO_CON23		0x576
+#define MT6331_DIGLDO_CON24		0x578
+#define MT6331_DIGLDO_CON25		0x57A
+#define MT6331_DIGLDO_CON26		0x57C
+#define MT6331_DIGLDO_CON27		0x57E
+#define MT6331_DIGLDO_CON28		0x580
+#define MT6331_OTP_CON0			0x600
+#define MT6331_OTP_CON1			0x602
+#define MT6331_OTP_CON2			0x604
+#define MT6331_OTP_CON3			0x606
+#define MT6331_OTP_CON4			0x608
+#define MT6331_OTP_CON5			0x60A
+#define MT6331_OTP_CON6			0x60C
+#define MT6331_OTP_CON7			0x60E
+#define MT6331_OTP_CON8			0x610
+#define MT6331_OTP_CON9			0x612
+#define MT6331_OTP_CON10		0x614
+#define MT6331_OTP_CON11		0x616
+#define MT6331_OTP_CON12		0x618
+#define MT6331_OTP_CON13		0x61A
+#define MT6331_OTP_CON14		0x61C
+#define MT6331_OTP_DOUT_0_15		0x61E
+#define MT6331_OTP_DOUT_16_31		0x620
+#define MT6331_OTP_DOUT_32_47		0x622
+#define MT6331_OTP_DOUT_48_63		0x624
+#define MT6331_OTP_DOUT_64_79		0x626
+#define MT6331_OTP_DOUT_80_95		0x628
+#define MT6331_OTP_DOUT_96_111		0x62A
+#define MT6331_OTP_DOUT_112_127		0x62C
+#define MT6331_OTP_DOUT_128_143		0x62E
+#define MT6331_OTP_DOUT_144_159		0x630
+#define MT6331_OTP_DOUT_160_175		0x632
+#define MT6331_OTP_DOUT_176_191		0x634
+#define MT6331_OTP_DOUT_192_207		0x636
+#define MT6331_OTP_DOUT_208_223		0x638
+#define MT6331_OTP_DOUT_224_239		0x63A
+#define MT6331_OTP_DOUT_240_255		0x63C
+#define MT6331_OTP_VAL_0_15		0x63E
+#define MT6331_OTP_VAL_16_31		0x640
+#define MT6331_OTP_VAL_32_47		0x642
+#define MT6331_OTP_VAL_48_63		0x644
+#define MT6331_OTP_VAL_64_79		0x646
+#define MT6331_OTP_VAL_80_95		0x648
+#define MT6331_OTP_VAL_96_111		0x64A
+#define MT6331_OTP_VAL_112_127		0x64C
+#define MT6331_OTP_VAL_128_143		0x64E
+#define MT6331_OTP_VAL_144_159		0x650
+#define MT6331_OTP_VAL_160_175		0x652
+#define MT6331_OTP_VAL_176_191		0x654
+#define MT6331_OTP_VAL_192_207		0x656
+#define MT6331_OTP_VAL_208_223		0x658
+#define MT6331_OTP_VAL_224_239		0x65A
+#define MT6331_OTP_VAL_240_255		0x65C
+#define MT6331_RTC_MIX_CON0		0x65E
+#define MT6331_RTC_MIX_CON1		0x660
+#define MT6331_AUDDAC_CFG0		0x662
+#define MT6331_AUDBUF_CFG0		0x664
+#define MT6331_AUDBUF_CFG1		0x666
+#define MT6331_AUDBUF_CFG2		0x668
+#define MT6331_AUDBUF_CFG3		0x66A
+#define MT6331_AUDBUF_CFG4		0x66C
+#define MT6331_AUDBUF_CFG5		0x66E
+#define MT6331_AUDBUF_CFG6		0x670
+#define MT6331_AUDBUF_CFG7		0x672
+#define MT6331_AUDBUF_CFG8		0x674
+#define MT6331_IBIASDIST_CFG0		0x676
+#define MT6331_AUDCLKGEN_CFG0		0x678
+#define MT6331_AUDLDO_CFG0		0x67A
+#define MT6331_AUDDCDC_CFG0		0x67C
+#define MT6331_AUDDCDC_CFG1		0x67E
+#define MT6331_AUDNVREGGLB_CFG0		0x680
+#define MT6331_AUD_NCP0			0x682
+#define MT6331_AUD_ZCD_CFG0		0x684
+#define MT6331_AUDPREAMP_CFG0		0x686
+#define MT6331_AUDPREAMP_CFG1		0x688
+#define MT6331_AUDPREAMP_CFG2		0x68A
+#define MT6331_AUDADC_CFG0		0x68C
+#define MT6331_AUDADC_CFG1		0x68E
+#define MT6331_AUDADC_CFG2		0x690
+#define MT6331_AUDADC_CFG3		0x692
+#define MT6331_AUDADC_CFG4		0x694
+#define MT6331_AUDADC_CFG5		0x696
+#define MT6331_AUDDIGMI_CFG0		0x698
+#define MT6331_AUDDIGMI_CFG1		0x69A
+#define MT6331_AUDMICBIAS_CFG0		0x69C
+#define MT6331_AUDMICBIAS_CFG1		0x69E
+#define MT6331_AUDENCSPARE_CFG0		0x6A0
+#define MT6331_AUDPREAMPGAIN_CFG0	0x6A2
+#define MT6331_AUDMADPLL_CFG0		0x6A4
+#define MT6331_AUDMADPLL_CFG1		0x6A6
+#define MT6331_AUDMADPLL_CFG2		0x6A8
+#define MT6331_AUDLDO_NVREG_CFG0	0x6AA
+#define MT6331_AUDLDO_NVREG_CFG1	0x6AC
+#define MT6331_AUDLDO_NVREG_CFG2	0x6AE
+#define MT6331_AUXADC_ADC0		0x700
+#define MT6331_AUXADC_ADC1		0x702
+#define MT6331_AUXADC_ADC2		0x704
+#define MT6331_AUXADC_ADC3		0x706
+#define MT6331_AUXADC_ADC4		0x708
+#define MT6331_AUXADC_ADC5		0x70A
+#define MT6331_AUXADC_ADC6		0x70C
+#define MT6331_AUXADC_ADC7		0x70E
+#define MT6331_AUXADC_ADC8		0x710
+#define MT6331_AUXADC_ADC9		0x712
+#define MT6331_AUXADC_ADC10		0x714
+#define MT6331_AUXADC_ADC11		0x716
+#define MT6331_AUXADC_ADC12		0x718
+#define MT6331_AUXADC_ADC13		0x71A
+#define MT6331_AUXADC_ADC14		0x71C
+#define MT6331_AUXADC_ADC15		0x71E
+#define MT6331_AUXADC_ADC16		0x720
+#define MT6331_AUXADC_ADC17		0x722
+#define MT6331_AUXADC_ADC18		0x724
+#define MT6331_AUXADC_ADC19		0x726
+#define MT6331_AUXADC_STA0		0x728
+#define MT6331_AUXADC_STA1		0x72A
+#define MT6331_AUXADC_RQST0		0x72C
+#define MT6331_AUXADC_RQST0_SET		0x72E
+#define MT6331_AUXADC_RQST0_CLR		0x730
+#define MT6331_AUXADC_RQST1		0x732
+#define MT6331_AUXADC_RQST1_SET		0x734
+#define MT6331_AUXADC_RQST1_CLR		0x736
+#define MT6331_AUXADC_CON0		0x738
+#define MT6331_AUXADC_CON1		0x73A
+#define MT6331_AUXADC_CON2		0x73C
+#define MT6331_AUXADC_CON3		0x73E
+#define MT6331_AUXADC_CON4		0x740
+#define MT6331_AUXADC_CON5		0x742
+#define MT6331_AUXADC_CON6		0x744
+#define MT6331_AUXADC_CON7		0x746
+#define MT6331_AUXADC_CON8		0x748
+#define MT6331_AUXADC_CON9		0x74A
+#define MT6331_AUXADC_CON10		0x74C
+#define MT6331_AUXADC_CON11		0x74E
+#define MT6331_AUXADC_CON12		0x750
+#define MT6331_AUXADC_CON13		0x752
+#define MT6331_AUXADC_CON14		0x754
+#define MT6331_AUXADC_CON15		0x756
+#define MT6331_AUXADC_CON16		0x758
+#define MT6331_AUXADC_CON17		0x75A
+#define MT6331_AUXADC_CON18		0x75C
+#define MT6331_AUXADC_CON19		0x75E
+#define MT6331_AUXADC_CON20		0x760
+#define MT6331_AUXADC_CON21		0x762
+#define MT6331_AUXADC_CON22		0x764
+#define MT6331_AUXADC_CON23		0x766
+#define MT6331_AUXADC_CON24		0x768
+#define MT6331_AUXADC_CON25		0x76A
+#define MT6331_AUXADC_CON26		0x76C
+#define MT6331_AUXADC_CON27		0x76E
+#define MT6331_AUXADC_CON28		0x770
+#define MT6331_AUXADC_CON29		0x772
+#define MT6331_AUXADC_CON30		0x774
+#define MT6331_AUXADC_CON31		0x776
+#define MT6331_AUXADC_CON32		0x778
+#define MT6331_ACCDET_CON0		0x77A
+#define MT6331_ACCDET_CON1		0x77C
+#define MT6331_ACCDET_CON2		0x77E
+#define MT6331_ACCDET_CON3		0x780
+#define MT6331_ACCDET_CON4		0x782
+#define MT6331_ACCDET_CON5		0x784
+#define MT6331_ACCDET_CON6		0x786
+#define MT6331_ACCDET_CON7		0x788
+#define MT6331_ACCDET_CON8		0x78A
+#define MT6331_ACCDET_CON9		0x78C
+#define MT6331_ACCDET_CON10		0x78E
+#define MT6331_ACCDET_CON11		0x790
+#define MT6331_ACCDET_CON12		0x792
+#define MT6331_ACCDET_CON13		0x794
+#define MT6331_ACCDET_CON14		0x796
+#define MT6331_ACCDET_CON15		0x798
+#define MT6331_ACCDET_CON16		0x79A
+#define MT6331_ACCDET_CON17		0x79C
+#define MT6331_ACCDET_CON18		0x79E
+#define MT6331_ACCDET_CON19		0x7A0
+#define MT6331_ACCDET_CON20		0x7A2
+#define MT6331_ACCDET_CON21		0x7A4
+#define MT6331_ACCDET_CON22		0x7A6
+#define MT6331_ACCDET_CON23		0x7A8
+#define MT6331_ACCDET_CON24		0x7AA
+
+#endif /* __MFD_MT6331_REGISTERS_H__ */
diff --git a/include/linux/mfd/mt6332/core.h b/include/linux/mfd/mt6332/core.h
new file mode 100644
index 000000000000..cd6013eb82d9
--- /dev/null
+++ b/include/linux/mfd/mt6332/core.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2022 AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
+ */
+
+#ifndef __MFD_MT6332_CORE_H__
+#define __MFD_MT6332_CORE_H__
+
+enum mt6332_irq_status_numbers {
+	MT6332_IRQ_STATUS_CHR_COMPLETE = 0,
+	MT6332_IRQ_STATUS_THERMAL_SD,
+	MT6332_IRQ_STATUS_THERMAL_REG_IN,
+	MT6332_IRQ_STATUS_THERMAL_REG_OUT,
+	MT6332_IRQ_STATUS_OTG_OC,
+	MT6332_IRQ_STATUS_CHR_OC,
+	MT6332_IRQ_STATUS_OTG_THERMAL,
+	MT6332_IRQ_STATUS_CHRIN_SHORT,
+	MT6332_IRQ_STATUS_DRVCDT_SHORT,
+	MT6332_IRQ_STATUS_PLUG_IN_FLASH,
+	MT6332_IRQ_STATUS_CHRWDT_FLAG,
+	MT6332_IRQ_STATUS_FLASH_EN_TIMEOUT,
+	MT6332_IRQ_STATUS_FLASH_VLED1_SHORT,
+	MT6332_IRQ_STATUS_FLASH_VLED1_OPEN = 13,
+	MT6332_IRQ_STATUS_OV = 16,
+	MT6332_IRQ_STATUS_BVALID_DET,
+	MT6332_IRQ_STATUS_VBATON_UNDET,
+	MT6332_IRQ_STATUS_CHR_PLUG_IN,
+	MT6332_IRQ_STATUS_CHR_PLUG_OUT,
+	MT6332_IRQ_STATUS_BC11_TIMEOUT,
+	MT6332_IRQ_STATUS_FLASH_VLED2_SHORT,
+	MT6332_IRQ_STATUS_FLASH_VLED2_OPEN = 23,
+	MT6332_IRQ_STATUS_THR_H = 32,
+	MT6332_IRQ_STATUS_THR_L,
+	MT6332_IRQ_STATUS_BAT_H,
+	MT6332_IRQ_STATUS_BAT_L,
+	MT6332_IRQ_STATUS_M3_H,
+	MT6332_IRQ_STATUS_M3_L,
+	MT6332_IRQ_STATUS_FG_BAT_H,
+	MT6332_IRQ_STATUS_FG_BAT_L,
+	MT6332_IRQ_STATUS_FG_CUR_H,
+	MT6332_IRQ_STATUS_FG_CUR_L,
+	MT6332_IRQ_STATUS_SPKL_D,
+	MT6332_IRQ_STATUS_SPKL_AB,
+	MT6332_IRQ_STATUS_BIF,
+	MT6332_IRQ_STATUS_VWLED_OC = 45,
+	MT6332_IRQ_STATUS_VDRAM_OC = 48,
+	MT6332_IRQ_STATUS_VDVFS2_OC,
+	MT6332_IRQ_STATUS_VRF1_OC,
+	MT6332_IRQ_STATUS_VRF2_OC,
+	MT6332_IRQ_STATUS_VPA_OC,
+	MT6332_IRQ_STATUS_VSBST_OC,
+	MT6332_IRQ_STATUS_LDO_OC,
+	MT6332_IRQ_STATUS_NR,
+};
+
+#define MT6332_IRQ_CON0_BASE	MT6332_IRQ_STATUS_CHR_COMPLETE
+#define MT6332_IRQ_CON0_BITS	(MT6332_IRQ_STATUS_FLASH_VLED1_OPEN + 1)
+#define MT6332_IRQ_CON1_BASE	MT6332_IRQ_STATUS_OV
+#define MT6332_IRQ_CON1_BITS	(MT6332_IRQ_STATUS_FLASH_VLED2_OPEN - MT6332_IRQ_STATUS_OV + 1)
+#define MT6332_IRQ_CON2_BASE	MT6332_IRQ_STATUS_THR_H
+#define MT6332_IRQ_CON2_BITS	(MT6332_IRQ_STATUS_VWLED_OC - MT6332_IRQ_STATUS_THR_H + 1)
+#define MT6332_IRQ_CON3_BASE	MT6332_IRQ_STATUS_VDRAM_OC
+#define MT6332_IRQ_CON3_BITS	(MT6332_IRQ_STATUS_LDO_OC - MT6332_IRQ_STATUS_VDRAM_OC + 1)
+
+#endif /* __MFD_MT6332_CORE_H__ */
diff --git a/include/linux/mfd/mt6332/registers.h b/include/linux/mfd/mt6332/registers.h
new file mode 100644
index 000000000000..65e0b86fceac
--- /dev/null
+++ b/include/linux/mfd/mt6332/registers.h
@@ -0,0 +1,642 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2022 AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
+ */
+
+#ifndef __MFD_MT6332_REGISTERS_H__
+#define __MFD_MT6332_REGISTERS_H__
+
+/* PMIC Registers */
+#define MT6332_HWCID              0x8000
+#define MT6332_SWCID              0x8002
+#define MT6332_TOP_CON            0x8004
+#define MT6332_DDR_VREF_AP_CON    0x8006
+#define MT6332_DDR_VREF_DQ_CON    0x8008
+#define MT6332_DDR_VREF_CA_CON    0x800A
+#define MT6332_TEST_OUT           0x800C
+#define MT6332_TEST_CON0          0x800E
+#define MT6332_TEST_CON1          0x8010
+#define MT6332_TESTMODE_SW        0x8012
+#define MT6332_TESTMODE_ANA       0x8014
+#define MT6332_TDSEL_CON          0x8016
+#define MT6332_RDSEL_CON          0x8018
+#define MT6332_SMT_CON0           0x801A
+#define MT6332_SMT_CON1           0x801C
+#define MT6332_DRV_CON0           0x801E
+#define MT6332_DRV_CON1           0x8020
+#define MT6332_DRV_CON2           0x8022
+#define MT6332_EN_STATUS0         0x8024
+#define MT6332_OCSTATUS0          0x8026
+#define MT6332_TOP_STATUS         0x8028
+#define MT6332_TOP_STATUS_SET     0x802A
+#define MT6332_TOP_STATUS_CLR     0x802C
+#define MT6332_FLASH_CON0         0x802E
+#define MT6332_FLASH_CON1         0x8030
+#define MT6332_FLASH_CON2         0x8032
+#define MT6332_CORE_CON0          0x8034
+#define MT6332_CORE_CON1          0x8036
+#define MT6332_CORE_CON2          0x8038
+#define MT6332_CORE_CON3          0x803A
+#define MT6332_CORE_CON4          0x803C
+#define MT6332_CORE_CON5          0x803E
+#define MT6332_CORE_CON6          0x8040
+#define MT6332_CORE_CON7          0x8042
+#define MT6332_CORE_CON8          0x8044
+#define MT6332_CORE_CON9          0x8046
+#define MT6332_CORE_CON10         0x8048
+#define MT6332_CORE_CON11         0x804A
+#define MT6332_CORE_CON12         0x804C
+#define MT6332_CORE_CON13         0x804E
+#define MT6332_CORE_CON14         0x8050
+#define MT6332_CORE_CON15         0x8052
+#define MT6332_STA_CON0           0x8054
+#define MT6332_STA_CON1           0x8056
+#define MT6332_STA_CON2           0x8058
+#define MT6332_STA_CON3           0x805A
+#define MT6332_STA_CON4           0x805C
+#define MT6332_STA_CON5           0x805E
+#define MT6332_STA_CON6           0x8060
+#define MT6332_STA_CON7           0x8062
+#define MT6332_CHR_CON0           0x8064
+#define MT6332_CHR_CON1           0x8066
+#define MT6332_CHR_CON2           0x8068
+#define MT6332_CHR_CON3           0x806A
+#define MT6332_CHR_CON4           0x806C
+#define MT6332_CHR_CON5           0x806E
+#define MT6332_CHR_CON6           0x8070
+#define MT6332_CHR_CON7           0x8072
+#define MT6332_CHR_CON8           0x8074
+#define MT6332_CHR_CON9           0x8076
+#define MT6332_CHR_CON10          0x8078
+#define MT6332_CHR_CON11          0x807A
+#define MT6332_CHR_CON12          0x807C
+#define MT6332_CHR_CON13          0x807E
+#define MT6332_CHR_CON14          0x8080
+#define MT6332_CHR_CON15          0x8082
+#define MT6332_BOOST_CON0         0x8084
+#define MT6332_BOOST_CON1         0x8086
+#define MT6332_BOOST_CON2         0x8088
+#define MT6332_BOOST_CON3         0x808A
+#define MT6332_BOOST_CON4         0x808C
+#define MT6332_BOOST_CON5         0x808E
+#define MT6332_BOOST_CON6         0x8090
+#define MT6332_BOOST_CON7         0x8092
+#define MT6332_TOP_CKPDN_CON0     0x8094
+#define MT6332_TOP_CKPDN_CON0_SET 0x8096
+#define MT6332_TOP_CKPDN_CON0_CLR 0x8098
+#define MT6332_TOP_CKPDN_CON1     0x809A
+#define MT6332_TOP_CKPDN_CON1_SET 0x809C
+#define MT6332_TOP_CKPDN_CON1_CLR 0x809E
+#define MT6332_TOP_CKPDN_CON2     0x80A0
+#define MT6332_TOP_CKPDN_CON2_SET 0x80A2
+#define MT6332_TOP_CKPDN_CON2_CLR 0x80A4
+#define MT6332_TOP_CKSEL_CON0     0x80A6
+#define MT6332_TOP_CKSEL_CON0_SET 0x80A8
+#define MT6332_TOP_CKSEL_CON0_CLR 0x80AA
+#define MT6332_TOP_CKSEL_CON1     0x80AC
+#define MT6332_TOP_CKSEL_CON1_SET 0x80AE
+#define MT6332_TOP_CKSEL_CON1_CLR 0x80B0
+#define MT6332_TOP_CKHWEN_CON     0x80B2
+#define MT6332_TOP_CKHWEN_CON_SET 0x80B4
+#define MT6332_TOP_CKHWEN_CON_CLR 0x80B6
+#define MT6332_TOP_CKTST_CON0     0x80B8
+#define MT6332_TOP_CKTST_CON1     0x80BA
+#define MT6332_TOP_RST_CON        0x80BC
+#define MT6332_TOP_RST_CON_SET    0x80BE
+#define MT6332_TOP_RST_CON_CLR    0x80C0
+#define MT6332_TOP_RST_MISC       0x80C2
+#define MT6332_TOP_RST_MISC_SET   0x80C4
+#define MT6332_TOP_RST_MISC_CLR   0x80C6
+#define MT6332_INT_CON0           0x80C8
+#define MT6332_INT_CON0_SET       0x80CA
+#define MT6332_INT_CON0_CLR       0x80CC
+#define MT6332_INT_CON1           0x80CE
+#define MT6332_INT_CON1_SET       0x80D0
+#define MT6332_INT_CON1_CLR       0x80D2
+#define MT6332_INT_CON2           0x80D4
+#define MT6332_INT_CON2_SET       0x80D6
+#define MT6332_INT_CON2_CLR       0x80D8
+#define MT6332_INT_CON3           0x80DA
+#define MT6332_INT_CON3_SET       0x80DC
+#define MT6332_INT_CON3_CLR       0x80DE
+#define MT6332_CHRWDT_CON0        0x80E0
+#define MT6332_CHRWDT_STATUS0     0x80E2
+#define MT6332_INT_STATUS0        0x80E4
+#define MT6332_INT_STATUS1        0x80E6
+#define MT6332_INT_STATUS2        0x80E8
+#define MT6332_INT_STATUS3        0x80EA
+#define MT6332_OC_GEAR_0          0x80EC
+#define MT6332_OC_GEAR_1          0x80EE
+#define MT6332_OC_GEAR_2          0x80F0
+#define MT6332_INT_MISC_CON       0x80F2
+#define MT6332_RG_SPI_CON         0x80F4
+#define MT6332_DEW_DIO_EN         0x80F6
+#define MT6332_DEW_READ_TEST      0x80F8
+#define MT6332_DEW_WRITE_TEST     0x80FA
+#define MT6332_DEW_CRC_SWRST      0x80FC
+#define MT6332_DEW_CRC_EN         0x80FE
+#define MT6332_DEW_CRC_VAL        0x8100
+#define MT6332_DEW_DBG_MON_SEL    0x8102
+#define MT6332_DEW_CIPHER_KEY_SEL 0x8104
+#define MT6332_DEW_CIPHER_IV_SEL  0x8106
+#define MT6332_DEW_CIPHER_EN      0x8108
+#define MT6332_DEW_CIPHER_RDY     0x810A
+#define MT6332_DEW_CIPHER_MODE    0x810C
+#define MT6332_DEW_CIPHER_SWRST   0x810E
+#define MT6332_DEW_RDDMY_NO       0x8110
+#define MT6332_INT_STA            0x8112
+#define MT6332_BIF_CON0           0x8114
+#define MT6332_BIF_CON1           0x8116
+#define MT6332_BIF_CON2           0x8118
+#define MT6332_BIF_CON3           0x811A
+#define MT6332_BIF_CON4           0x811C
+#define MT6332_BIF_CON5           0x811E
+#define MT6332_BIF_CON6           0x8120
+#define MT6332_BIF_CON7           0x8122
+#define MT6332_BIF_CON8           0x8124
+#define MT6332_BIF_CON9           0x8126
+#define MT6332_BIF_CON10          0x8128
+#define MT6332_BIF_CON11          0x812A
+#define MT6332_BIF_CON12          0x812C
+#define MT6332_BIF_CON13          0x812E
+#define MT6332_BIF_CON14          0x8130
+#define MT6332_BIF_CON15          0x8132
+#define MT6332_BIF_CON16          0x8134
+#define MT6332_BIF_CON17          0x8136
+#define MT6332_BIF_CON18          0x8138
+#define MT6332_BIF_CON19          0x813A
+#define MT6332_BIF_CON20          0x813C
+#define MT6332_BIF_CON21          0x813E
+#define MT6332_BIF_CON22          0x8140
+#define MT6332_BIF_CON23          0x8142
+#define MT6332_BIF_CON24          0x8144
+#define MT6332_BIF_CON25          0x8146
+#define MT6332_BIF_CON26          0x8148
+#define MT6332_BIF_CON27          0x814A
+#define MT6332_BIF_CON28          0x814C
+#define MT6332_BIF_CON29          0x814E
+#define MT6332_BIF_CON30          0x8150
+#define MT6332_BIF_CON31          0x8152
+#define MT6332_BIF_CON32          0x8154
+#define MT6332_BIF_CON33          0x8156
+#define MT6332_BIF_CON34          0x8158
+#define MT6332_BIF_CON35          0x815A
+#define MT6332_BIF_CON36          0x815C
+#define MT6332_BATON_CON0         0x815E
+#define MT6332_BIF_CON37          0x8160
+#define MT6332_BIF_CON38          0x8162
+#define MT6332_CHR_CON16          0x8164
+#define MT6332_CHR_CON17          0x8166
+#define MT6332_CHR_CON18          0x8168
+#define MT6332_CHR_CON19          0x816A
+#define MT6332_CHR_CON20          0x816C
+#define MT6332_CHR_CON21          0x816E
+#define MT6332_CHR_CON22          0x8170
+#define MT6332_CHR_CON23          0x8172
+#define MT6332_CHR_CON24          0x8174
+#define MT6332_CHR_CON25          0x8176
+#define MT6332_STA_CON8           0x8178
+#define MT6332_BUCK_ALL_CON0      0x8400
+#define MT6332_BUCK_ALL_CON1      0x8402
+#define MT6332_BUCK_ALL_CON2      0x8404
+#define MT6332_BUCK_ALL_CON3      0x8406
+#define MT6332_BUCK_ALL_CON4      0x8408
+#define MT6332_BUCK_ALL_CON5      0x840A
+#define MT6332_BUCK_ALL_CON6      0x840C
+#define MT6332_BUCK_ALL_CON7      0x840E
+#define MT6332_BUCK_ALL_CON8      0x8410
+#define MT6332_BUCK_ALL_CON9      0x8412
+#define MT6332_BUCK_ALL_CON10     0x8414
+#define MT6332_BUCK_ALL_CON11     0x8416
+#define MT6332_BUCK_ALL_CON12     0x8418
+#define MT6332_BUCK_ALL_CON13     0x841A
+#define MT6332_BUCK_ALL_CON14     0x841C
+#define MT6332_BUCK_ALL_CON15     0x841E
+#define MT6332_BUCK_ALL_CON16     0x8420
+#define MT6332_BUCK_ALL_CON17     0x8422
+#define MT6332_BUCK_ALL_CON18     0x8424
+#define MT6332_BUCK_ALL_CON19     0x8426
+#define MT6332_BUCK_ALL_CON20     0x8428
+#define MT6332_BUCK_ALL_CON21     0x842A
+#define MT6332_BUCK_ALL_CON22     0x842C
+#define MT6332_BUCK_ALL_CON23     0x842E
+#define MT6332_BUCK_ALL_CON24     0x8430
+#define MT6332_BUCK_ALL_CON25     0x8432
+#define MT6332_BUCK_ALL_CON26     0x8434
+#define MT6332_BUCK_ALL_CON27     0x8436
+#define MT6332_VDRAM_CON0         0x8438
+#define MT6332_VDRAM_CON1         0x843A
+#define MT6332_VDRAM_CON2         0x843C
+#define MT6332_VDRAM_CON3         0x843E
+#define MT6332_VDRAM_CON4         0x8440
+#define MT6332_VDRAM_CON5         0x8442
+#define MT6332_VDRAM_CON6         0x8444
+#define MT6332_VDRAM_CON7         0x8446
+#define MT6332_VDRAM_CON8         0x8448
+#define MT6332_VDRAM_CON9         0x844A
+#define MT6332_VDRAM_CON10        0x844C
+#define MT6332_VDRAM_CON11        0x844E
+#define MT6332_VDRAM_CON12        0x8450
+#define MT6332_VDRAM_CON13        0x8452
+#define MT6332_VDRAM_CON14        0x8454
+#define MT6332_VDRAM_CON15        0x8456
+#define MT6332_VDRAM_CON16        0x8458
+#define MT6332_VDRAM_CON17        0x845A
+#define MT6332_VDRAM_CON18        0x845C
+#define MT6332_VDRAM_CON19        0x845E
+#define MT6332_VDRAM_CON20        0x8460
+#define MT6332_VDRAM_CON21        0x8462
+#define MT6332_VDVFS2_CON0        0x8464
+#define MT6332_VDVFS2_CON1        0x8466
+#define MT6332_VDVFS2_CON2        0x8468
+#define MT6332_VDVFS2_CON3        0x846A
+#define MT6332_VDVFS2_CON4        0x846C
+#define MT6332_VDVFS2_CON5        0x846E
+#define MT6332_VDVFS2_CON6        0x8470
+#define MT6332_VDVFS2_CON7        0x8472
+#define MT6332_VDVFS2_CON8        0x8474
+#define MT6332_VDVFS2_CON9        0x8476
+#define MT6332_VDVFS2_CON10       0x8478
+#define MT6332_VDVFS2_CON11       0x847A
+#define MT6332_VDVFS2_CON12       0x847C
+#define MT6332_VDVFS2_CON13       0x847E
+#define MT6332_VDVFS2_CON14       0x8480
+#define MT6332_VDVFS2_CON15       0x8482
+#define MT6332_VDVFS2_CON16       0x8484
+#define MT6332_VDVFS2_CON17       0x8486
+#define MT6332_VDVFS2_CON18       0x8488
+#define MT6332_VDVFS2_CON19       0x848A
+#define MT6332_VDVFS2_CON20       0x848C
+#define MT6332_VDVFS2_CON21       0x848E
+#define MT6332_VDVFS2_CON22       0x8490
+#define MT6332_VDVFS2_CON23       0x8492
+#define MT6332_VDVFS2_CON24       0x8494
+#define MT6332_VDVFS2_CON25       0x8496
+#define MT6332_VDVFS2_CON26       0x8498
+#define MT6332_VDVFS2_CON27       0x849A
+#define MT6332_VRF1_CON0          0x849C
+#define MT6332_VRF1_CON1          0x849E
+#define MT6332_VRF1_CON2          0x84A0
+#define MT6332_VRF1_CON3          0x84A2
+#define MT6332_VRF1_CON4          0x84A4
+#define MT6332_VRF1_CON5          0x84A6
+#define MT6332_VRF1_CON6          0x84A8
+#define MT6332_VRF1_CON7          0x84AA
+#define MT6332_VRF1_CON8          0x84AC
+#define MT6332_VRF1_CON9          0x84AE
+#define MT6332_VRF1_CON10         0x84B0
+#define MT6332_VRF1_CON11         0x84B2
+#define MT6332_VRF1_CON12         0x84B4
+#define MT6332_VRF1_CON13         0x84B6
+#define MT6332_VRF1_CON14         0x84B8
+#define MT6332_VRF1_CON15         0x84BA
+#define MT6332_VRF1_CON16         0x84BC
+#define MT6332_VRF1_CON17         0x84BE
+#define MT6332_VRF1_CON18         0x84C0
+#define MT6332_VRF1_CON19         0x84C2
+#define MT6332_VRF1_CON20         0x84C4
+#define MT6332_VRF1_CON21         0x84C6
+#define MT6332_VRF2_CON0          0x84C8
+#define MT6332_VRF2_CON1          0x84CA
+#define MT6332_VRF2_CON2          0x84CC
+#define MT6332_VRF2_CON3          0x84CE
+#define MT6332_VRF2_CON4          0x84D0
+#define MT6332_VRF2_CON5          0x84D2
+#define MT6332_VRF2_CON6          0x84D4
+#define MT6332_VRF2_CON7          0x84D6
+#define MT6332_VRF2_CON8          0x84D8
+#define MT6332_VRF2_CON9          0x84DA
+#define MT6332_VRF2_CON10         0x84DC
+#define MT6332_VRF2_CON11         0x84DE
+#define MT6332_VRF2_CON12         0x84E0
+#define MT6332_VRF2_CON13         0x84E2
+#define MT6332_VRF2_CON14         0x84E4
+#define MT6332_VRF2_CON15         0x84E6
+#define MT6332_VRF2_CON16         0x84E8
+#define MT6332_VRF2_CON17         0x84EA
+#define MT6332_VRF2_CON18         0x84EC
+#define MT6332_VRF2_CON19         0x84EE
+#define MT6332_VRF2_CON20         0x84F0
+#define MT6332_VRF2_CON21         0x84F2
+#define MT6332_VPA_CON0           0x84F4
+#define MT6332_VPA_CON1           0x84F6
+#define MT6332_VPA_CON2           0x84F8
+#define MT6332_VPA_CON3           0x84FC
+#define MT6332_VPA_CON4           0x84FE
+#define MT6332_VPA_CON5           0x8500
+#define MT6332_VPA_CON6           0x8502
+#define MT6332_VPA_CON7           0x8504
+#define MT6332_VPA_CON8           0x8506
+#define MT6332_VPA_CON9           0x8508
+#define MT6332_VPA_CON10          0x850A
+#define MT6332_VPA_CON11          0x850C
+#define MT6332_VPA_CON12          0x850E
+#define MT6332_VPA_CON13          0x8510
+#define MT6332_VPA_CON14          0x8512
+#define MT6332_VPA_CON15          0x8514
+#define MT6332_VPA_CON16          0x8516
+#define MT6332_VPA_CON17          0x8518
+#define MT6332_VPA_CON18          0x851A
+#define MT6332_VPA_CON19          0x851C
+#define MT6332_VPA_CON20          0x851E
+#define MT6332_VPA_CON21          0x8520
+#define MT6332_VPA_CON22          0x8522
+#define MT6332_VPA_CON23          0x8524
+#define MT6332_VPA_CON24          0x8526
+#define MT6332_VPA_CON25          0x8528
+#define MT6332_VSBST_CON0         0x852A
+#define MT6332_VSBST_CON1         0x852C
+#define MT6332_VSBST_CON2         0x852E
+#define MT6332_VSBST_CON3         0x8530
+#define MT6332_VSBST_CON4         0x8532
+#define MT6332_VSBST_CON5         0x8534
+#define MT6332_VSBST_CON6         0x8536
+#define MT6332_VSBST_CON7         0x8538
+#define MT6332_VSBST_CON8         0x853A
+#define MT6332_VSBST_CON9         0x853C
+#define MT6332_VSBST_CON10        0x853E
+#define MT6332_VSBST_CON11        0x8540
+#define MT6332_VSBST_CON12        0x8542
+#define MT6332_VSBST_CON13        0x8544
+#define MT6332_VSBST_CON14        0x8546
+#define MT6332_VSBST_CON15        0x8548
+#define MT6332_VSBST_CON16        0x854A
+#define MT6332_VSBST_CON17        0x854C
+#define MT6332_VSBST_CON18        0x854E
+#define MT6332_VSBST_CON19        0x8550
+#define MT6332_VSBST_CON20        0x8552
+#define MT6332_VSBST_CON21        0x8554
+#define MT6332_BUCK_K_CON0        0x8556
+#define MT6332_BUCK_K_CON1        0x8558
+#define MT6332_BUCK_K_CON2        0x855A
+#define MT6332_BUCK_K_CON3        0x855C
+#define MT6332_BUCK_K_CON4        0x855E
+#define MT6332_BUCK_K_CON5        0x8560
+#define MT6332_AUXADC_ADC0        0x8800
+#define MT6332_AUXADC_ADC1        0x8802
+#define MT6332_AUXADC_ADC2        0x8804
+#define MT6332_AUXADC_ADC3        0x8806
+#define MT6332_AUXADC_ADC4        0x8808
+#define MT6332_AUXADC_ADC5        0x880A
+#define MT6332_AUXADC_ADC6        0x880C
+#define MT6332_AUXADC_ADC7        0x880E
+#define MT6332_AUXADC_ADC8        0x8810
+#define MT6332_AUXADC_ADC9        0x8812
+#define MT6332_AUXADC_ADC10       0x8814
+#define MT6332_AUXADC_ADC11       0x8816
+#define MT6332_AUXADC_ADC12       0x8818
+#define MT6332_AUXADC_ADC13       0x881A
+#define MT6332_AUXADC_ADC14       0x881C
+#define MT6332_AUXADC_ADC15       0x881E
+#define MT6332_AUXADC_ADC16       0x8820
+#define MT6332_AUXADC_ADC17       0x8822
+#define MT6332_AUXADC_ADC18       0x8824
+#define MT6332_AUXADC_ADC19       0x8826
+#define MT6332_AUXADC_ADC20       0x8828
+#define MT6332_AUXADC_ADC21       0x882A
+#define MT6332_AUXADC_ADC22       0x882C
+#define MT6332_AUXADC_ADC23       0x882E
+#define MT6332_AUXADC_ADC24       0x8830
+#define MT6332_AUXADC_ADC25       0x8832
+#define MT6332_AUXADC_ADC26       0x8834
+#define MT6332_AUXADC_ADC27       0x8836
+#define MT6332_AUXADC_ADC28       0x8838
+#define MT6332_AUXADC_ADC29       0x883A
+#define MT6332_AUXADC_ADC30       0x883C
+#define MT6332_AUXADC_ADC31       0x883E
+#define MT6332_AUXADC_ADC32       0x8840
+#define MT6332_AUXADC_ADC33       0x8842
+#define MT6332_AUXADC_ADC34       0x8844
+#define MT6332_AUXADC_ADC35       0x8846
+#define MT6332_AUXADC_ADC36       0x8848
+#define MT6332_AUXADC_ADC37       0x884A
+#define MT6332_AUXADC_ADC38       0x884C
+#define MT6332_AUXADC_ADC39       0x884E
+#define MT6332_AUXADC_ADC40       0x8850
+#define MT6332_AUXADC_ADC41       0x8852
+#define MT6332_AUXADC_ADC42       0x8854
+#define MT6332_AUXADC_ADC43       0x8856
+#define MT6332_AUXADC_STA0        0x8858
+#define MT6332_AUXADC_STA1        0x885A
+#define MT6332_AUXADC_RQST0       0x885C
+#define MT6332_AUXADC_RQST0_SET   0x885E
+#define MT6332_AUXADC_RQST0_CLR   0x8860
+#define MT6332_AUXADC_RQST1       0x8862
+#define MT6332_AUXADC_RQST1_SET   0x8864
+#define MT6332_AUXADC_RQST1_CLR   0x8866
+#define MT6332_AUXADC_CON0        0x8868
+#define MT6332_AUXADC_CON1        0x886A
+#define MT6332_AUXADC_CON2        0x886C
+#define MT6332_AUXADC_CON3        0x886E
+#define MT6332_AUXADC_CON4        0x8870
+#define MT6332_AUXADC_CON5        0x8872
+#define MT6332_AUXADC_CON6        0x8874
+#define MT6332_AUXADC_CON7        0x8876
+#define MT6332_AUXADC_CON8        0x8878
+#define MT6332_AUXADC_CON9        0x887A
+#define MT6332_AUXADC_CON10       0x887C
+#define MT6332_AUXADC_CON11       0x887E
+#define MT6332_AUXADC_CON12       0x8880
+#define MT6332_AUXADC_CON13       0x8882
+#define MT6332_AUXADC_CON14       0x8884
+#define MT6332_AUXADC_CON15       0x8886
+#define MT6332_AUXADC_CON16       0x8888
+#define MT6332_AUXADC_CON17       0x888A
+#define MT6332_AUXADC_CON18       0x888C
+#define MT6332_AUXADC_CON19       0x888E
+#define MT6332_AUXADC_CON20       0x8890
+#define MT6332_AUXADC_CON21       0x8892
+#define MT6332_AUXADC_CON22       0x8894
+#define MT6332_AUXADC_CON23       0x8896
+#define MT6332_AUXADC_CON24       0x8898
+#define MT6332_AUXADC_CON25       0x889A
+#define MT6332_AUXADC_CON26       0x889C
+#define MT6332_AUXADC_CON27       0x889E
+#define MT6332_AUXADC_CON28       0x88A0
+#define MT6332_AUXADC_CON29       0x88A2
+#define MT6332_AUXADC_CON30       0x88A4
+#define MT6332_AUXADC_CON31       0x88A6
+#define MT6332_AUXADC_CON32       0x88A8
+#define MT6332_AUXADC_CON33       0x88AA
+#define MT6332_AUXADC_CON34       0x88AC
+#define MT6332_AUXADC_CON35       0x88AE
+#define MT6332_AUXADC_CON36       0x88B0
+#define MT6332_AUXADC_CON37       0x88B2
+#define MT6332_AUXADC_CON38       0x88B4
+#define MT6332_AUXADC_CON39       0x88B6
+#define MT6332_AUXADC_CON40       0x88B8
+#define MT6332_AUXADC_CON41       0x88BA
+#define MT6332_AUXADC_CON42       0x88BC
+#define MT6332_AUXADC_CON43       0x88BE
+#define MT6332_AUXADC_CON44       0x88C0
+#define MT6332_AUXADC_CON45       0x88C2
+#define MT6332_AUXADC_CON46       0x88C4
+#define MT6332_AUXADC_CON47       0x88C6
+#define MT6332_STRUP_CONA0        0x8C00
+#define MT6332_STRUP_CONA1        0x8C02
+#define MT6332_STRUP_CONA2        0x8C04
+#define MT6332_STRUP_CON0         0x8C06
+#define MT6332_STRUP_CON2         0x8C08
+#define MT6332_STRUP_CON3         0x8C0A
+#define MT6332_STRUP_CON4         0x8C0C
+#define MT6332_STRUP_CON5         0x8C0E
+#define MT6332_STRUP_CON6         0x8C10
+#define MT6332_STRUP_CON7         0x8C12
+#define MT6332_STRUP_CON8         0x8C14
+#define MT6332_STRUP_CON9         0x8C16
+#define MT6332_STRUP_CON10        0x8C18
+#define MT6332_STRUP_CON11        0x8C1A
+#define MT6332_STRUP_CON12        0x8C1C
+#define MT6332_STRUP_CON13        0x8C1E
+#define MT6332_STRUP_CON14        0x8C20
+#define MT6332_STRUP_CON15        0x8C22
+#define MT6332_STRUP_CON16        0x8C24
+#define MT6332_STRUP_CON17        0x8C26
+#define MT6332_FGADC_CON0         0x8C28
+#define MT6332_FGADC_CON1         0x8C2A
+#define MT6332_FGADC_CON2         0x8C2C
+#define MT6332_FGADC_CON3         0x8C2E
+#define MT6332_FGADC_CON4         0x8C30
+#define MT6332_FGADC_CON5         0x8C32
+#define MT6332_FGADC_CON6         0x8C34
+#define MT6332_FGADC_CON7         0x8C36
+#define MT6332_FGADC_CON8         0x8C38
+#define MT6332_FGADC_CON9         0x8C3A
+#define MT6332_FGADC_CON10        0x8C3C
+#define MT6332_FGADC_CON11        0x8C3E
+#define MT6332_FGADC_CON12        0x8C40
+#define MT6332_FGADC_CON13        0x8C42
+#define MT6332_FGADC_CON14        0x8C44
+#define MT6332_FGADC_CON15        0x8C46
+#define MT6332_FGADC_CON16        0x8C48
+#define MT6332_FGADC_CON17        0x8C4A
+#define MT6332_FGADC_CON18        0x8C4C
+#define MT6332_FGADC_CON19        0x8C4E
+#define MT6332_FGADC_CON20        0x8C50
+#define MT6332_FGADC_CON21        0x8C52
+#define MT6332_FGADC_CON22        0x8C54
+#define MT6332_OTP_CON0           0x8C56
+#define MT6332_OTP_CON1           0x8C58
+#define MT6332_OTP_CON2           0x8C5A
+#define MT6332_OTP_CON3           0x8C5C
+#define MT6332_OTP_CON4           0x8C5E
+#define MT6332_OTP_CON5           0x8C60
+#define MT6332_OTP_CON6           0x8C62
+#define MT6332_OTP_CON7           0x8C64
+#define MT6332_OTP_CON8           0x8C66
+#define MT6332_OTP_CON9           0x8C68
+#define MT6332_OTP_CON10          0x8C6A
+#define MT6332_OTP_CON11          0x8C6C
+#define MT6332_OTP_CON12          0x8C6E
+#define MT6332_OTP_CON13          0x8C70
+#define MT6332_OTP_CON14          0x8C72
+#define MT6332_OTP_DOUT_0_15      0x8C74
+#define MT6332_OTP_DOUT_16_31     0x8C76
+#define MT6332_OTP_DOUT_32_47     0x8C78
+#define MT6332_OTP_DOUT_48_63     0x8C7A
+#define MT6332_OTP_DOUT_64_79     0x8C7C
+#define MT6332_OTP_DOUT_80_95     0x8C7E
+#define MT6332_OTP_DOUT_96_111    0x8C80
+#define MT6332_OTP_DOUT_112_127   0x8C82
+#define MT6332_OTP_DOUT_128_143   0x8C84
+#define MT6332_OTP_DOUT_144_159   0x8C86
+#define MT6332_OTP_DOUT_160_175   0x8C88
+#define MT6332_OTP_DOUT_176_191   0x8C8A
+#define MT6332_OTP_DOUT_192_207   0x8C8C
+#define MT6332_OTP_DOUT_208_223   0x8C8E
+#define MT6332_OTP_DOUT_224_239   0x8C90
+#define MT6332_OTP_DOUT_240_255   0x8C92
+#define MT6332_OTP_VAL_0_15       0x8C94
+#define MT6332_OTP_VAL_16_31      0x8C96
+#define MT6332_OTP_VAL_32_47      0x8C98
+#define MT6332_OTP_VAL_48_63      0x8C9A
+#define MT6332_OTP_VAL_64_79      0x8C9C
+#define MT6332_OTP_VAL_80_95      0x8C9E
+#define MT6332_OTP_VAL_96_111     0x8CA0
+#define MT6332_OTP_VAL_112_127    0x8CA2
+#define MT6332_OTP_VAL_128_143    0x8CA4
+#define MT6332_OTP_VAL_144_159    0x8CA6
+#define MT6332_OTP_VAL_160_175    0x8CA8
+#define MT6332_OTP_VAL_176_191    0x8CAA
+#define MT6332_OTP_VAL_192_207    0x8CAC
+#define MT6332_OTP_VAL_208_223    0x8CAE
+#define MT6332_OTP_VAL_224_239    0x8CB0
+#define MT6332_OTP_VAL_240_255    0x8CB2
+#define MT6332_LDO_CON0           0x8CB4
+#define MT6332_LDO_CON1           0x8CB6
+#define MT6332_LDO_CON2           0x8CB8
+#define MT6332_LDO_CON3           0x8CBA
+#define MT6332_LDO_CON5           0x8CBC
+#define MT6332_LDO_CON6           0x8CBE
+#define MT6332_LDO_CON7           0x8CC0
+#define MT6332_LDO_CON8           0x8CC2
+#define MT6332_LDO_CON9           0x8CC4
+#define MT6332_LDO_CON10          0x8CC6
+#define MT6332_LDO_CON11          0x8CC8
+#define MT6332_LDO_CON12          0x8CCA
+#define MT6332_LDO_CON13          0x8CCC
+#define MT6332_FQMTR_CON0         0x8CCE
+#define MT6332_FQMTR_CON1         0x8CD0
+#define MT6332_FQMTR_CON2         0x8CD2
+#define MT6332_IWLED_CON0         0x8CD4
+#define MT6332_IWLED_DEG          0x8CD6
+#define MT6332_IWLED_STATUS       0x8CD8
+#define MT6332_IWLED_EN_CTRL      0x8CDA
+#define MT6332_IWLED_CON1         0x8CDC
+#define MT6332_IWLED_CON2         0x8CDE
+#define MT6332_IWLED_TRIM0        0x8CE0
+#define MT6332_IWLED_TRIM1        0x8CE2
+#define MT6332_IWLED_CON3         0x8CE4
+#define MT6332_IWLED_CON4         0x8CE6
+#define MT6332_IWLED_CON5         0x8CE8
+#define MT6332_IWLED_CON6         0x8CEA
+#define MT6332_IWLED_CON7         0x8CEC
+#define MT6332_IWLED_CON8         0x8CEE
+#define MT6332_IWLED_CON9         0x8CF0
+#define MT6332_SPK_CON0           0x8CF2
+#define MT6332_SPK_CON1           0x8CF4
+#define MT6332_SPK_CON2           0x8CF6
+#define MT6332_SPK_CON3           0x8CF8
+#define MT6332_SPK_CON4           0x8CFA
+#define MT6332_SPK_CON5           0x8CFC
+#define MT6332_SPK_CON6           0x8CFE
+#define MT6332_SPK_CON7           0x8D00
+#define MT6332_SPK_CON8           0x8D02
+#define MT6332_SPK_CON9           0x8D04
+#define MT6332_SPK_CON10          0x8D06
+#define MT6332_SPK_CON11          0x8D08
+#define MT6332_SPK_CON12          0x8D0A
+#define MT6332_SPK_CON13          0x8D0C
+#define MT6332_SPK_CON14          0x8D0E
+#define MT6332_SPK_CON15          0x8D10
+#define MT6332_SPK_CON16          0x8D12
+#define MT6332_TESTI_CON0         0x8D14
+#define MT6332_TESTI_CON1         0x8D16
+#define MT6332_TESTI_CON2         0x8D18
+#define MT6332_TESTI_CON3         0x8D1A
+#define MT6332_TESTI_CON4         0x8D1C
+#define MT6332_TESTI_CON5         0x8D1E
+#define MT6332_TESTI_CON6         0x8D20
+#define MT6332_TESTI_MUX_CON0     0x8D22
+#define MT6332_TESTI_MUX_CON1     0x8D24
+#define MT6332_TESTI_MUX_CON2     0x8D26
+#define MT6332_TESTI_MUX_CON3     0x8D28
+#define MT6332_TESTI_MUX_CON4     0x8D2A
+#define MT6332_TESTI_MUX_CON5     0x8D2C
+#define MT6332_TESTI_MUX_CON6     0x8D2E
+#define MT6332_TESTO_CON0         0x8D30
+#define MT6332_TESTO_CON1         0x8D32
+#define MT6332_TEST_OMUX_CON0     0x8D34
+#define MT6332_TEST_OMUX_CON1     0x8D36
+#define MT6332_DEBUG_CON0         0x8D38
+#define MT6332_DEBUG_CON1         0x8D3A
+#define MT6332_DEBUG_CON2         0x8D3C
+#define MT6332_FGADC_CON23        0x8D3E
+#define MT6332_FGADC_CON24        0x8D40
+#define MT6332_FGADC_CON25        0x8D42
+#define MT6332_TOP_RST_STATUS     0x8D44
+#define MT6332_TOP_RST_STATUS_SET 0x8D46
+#define MT6332_TOP_RST_STATUS_CLR 0x8D48
+#define MT6332_VDVFS2_CON28       0x8D4A
+
+#endif /* __MFD_MT6332_REGISTERS_H__ */
diff --git a/include/linux/mfd/mt6397/core.h b/include/linux/mfd/mt6397/core.h
index 3fecaffe5019..627487e26287 100644
--- a/include/linux/mfd/mt6397/core.h
+++ b/include/linux/mfd/mt6397/core.h
@@ -12,6 +12,8 @@
 
 enum chip_id {
 	MT6323_CHIP_ID = 0x23,
+	MT6331_CHIP_ID = 0x20,
+	MT6332_CHIP_ID = 0x20,
 	MT6357_CHIP_ID = 0x57,
 	MT6358_CHIP_ID = 0x58,
 	MT6359_CHIP_ID = 0x59,
-- 
cgit 


From 30e22a6ebca039572ce9bc10f1934f4eabfb5b7f Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Sun, 17 Jul 2022 16:09:03 +0000
Subject: amt: use workqueue for gateway side message handling

There are some synchronization issues(amt->status, amt->req_cnt, etc)
if the interface is in gateway mode because gateway message handlers
are processed concurrently.
This applies a work queue for processing these messages instead of
expanding the locking context.

So, the purposes of this patch are to fix exist race conditions and to make
gateway to be able to validate a gateway status more correctly.

When the AMT gateway interface is created, it tries to establish to relay.
The establishment step looks stateless, but it should be managed well.
In order to handle messages in the gateway, it saves the current
status(i.e. AMT_STATUS_XXX).
This patch makes gateway code to be worked with a single thread.

Now, all messages except the multicast are triggered(received or
delay expired), and these messages will be stored in the event
queue(amt->events).
Then, the single worker processes stored messages asynchronously one
by one.
The multicast data message type will be still processed immediately.

Now, amt->lock is only needed to access the event queue(amt->events)
if an interface is the gateway mode.

Fixes: cbc21dc1cfe9 ("amt: add data plane of amt interface")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/amt.c | 159 ++++++++++++++++++++++++++++++++++++++++++++++++------
 include/net/amt.h |  20 +++++++
 2 files changed, 164 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/net/amt.c b/drivers/net/amt.c
index 89563d1b2a3b..9e2d1992b349 100644
--- a/drivers/net/amt.c
+++ b/drivers/net/amt.c
@@ -900,6 +900,28 @@ static void amt_send_mld_gq(struct amt_dev *amt, struct amt_tunnel_list *tunnel)
 }
 #endif
 
+static bool amt_queue_event(struct amt_dev *amt, enum amt_event event,
+			    struct sk_buff *skb)
+{
+	int index;
+
+	spin_lock_bh(&amt->lock);
+	if (amt->nr_events >= AMT_MAX_EVENTS) {
+		spin_unlock_bh(&amt->lock);
+		return 1;
+	}
+
+	index = (amt->event_idx + amt->nr_events) % AMT_MAX_EVENTS;
+	amt->events[index].event = event;
+	amt->events[index].skb = skb;
+	amt->nr_events++;
+	amt->event_idx %= AMT_MAX_EVENTS;
+	queue_work(amt_wq, &amt->event_wq);
+	spin_unlock_bh(&amt->lock);
+
+	return 0;
+}
+
 static void amt_secret_work(struct work_struct *work)
 {
 	struct amt_dev *amt = container_of(to_delayed_work(work),
@@ -913,12 +935,8 @@ static void amt_secret_work(struct work_struct *work)
 			 msecs_to_jiffies(AMT_SECRET_TIMEOUT));
 }
 
-static void amt_discovery_work(struct work_struct *work)
+static void amt_event_send_discovery(struct amt_dev *amt)
 {
-	struct amt_dev *amt = container_of(to_delayed_work(work),
-					   struct amt_dev,
-					   discovery_wq);
-
 	spin_lock_bh(&amt->lock);
 	if (amt->status > AMT_STATUS_SENT_DISCOVERY)
 		goto out;
@@ -933,11 +951,19 @@ out:
 	spin_unlock_bh(&amt->lock);
 }
 
-static void amt_req_work(struct work_struct *work)
+static void amt_discovery_work(struct work_struct *work)
 {
 	struct amt_dev *amt = container_of(to_delayed_work(work),
 					   struct amt_dev,
-					   req_wq);
+					   discovery_wq);
+
+	if (amt_queue_event(amt, AMT_EVENT_SEND_DISCOVERY, NULL))
+		mod_delayed_work(amt_wq, &amt->discovery_wq,
+				 msecs_to_jiffies(AMT_DISCOVERY_TIMEOUT));
+}
+
+static void amt_event_send_request(struct amt_dev *amt)
+{
 	u32 exp;
 
 	spin_lock_bh(&amt->lock);
@@ -967,6 +993,17 @@ out:
 	spin_unlock_bh(&amt->lock);
 }
 
+static void amt_req_work(struct work_struct *work)
+{
+	struct amt_dev *amt = container_of(to_delayed_work(work),
+					   struct amt_dev,
+					   req_wq);
+
+	if (amt_queue_event(amt, AMT_EVENT_SEND_REQUEST, NULL))
+		mod_delayed_work(amt_wq, &amt->req_wq,
+				 msecs_to_jiffies(100));
+}
+
 static bool amt_send_membership_update(struct amt_dev *amt,
 				       struct sk_buff *skb,
 				       bool v6)
@@ -2392,12 +2429,14 @@ static bool amt_membership_query_handler(struct amt_dev *amt,
 	skb->pkt_type = PACKET_MULTICAST;
 	skb->ip_summed = CHECKSUM_NONE;
 	len = skb->len;
+	local_bh_disable();
 	if (__netif_rx(skb) == NET_RX_SUCCESS) {
 		amt_update_gw_status(amt, AMT_STATUS_RECEIVED_QUERY, true);
 		dev_sw_netstats_rx_add(amt->dev, len);
 	} else {
 		amt->dev->stats.rx_dropped++;
 	}
+	local_bh_enable();
 
 	return false;
 }
@@ -2688,6 +2727,38 @@ send:
 	return false;
 }
 
+static void amt_gw_rcv(struct amt_dev *amt, struct sk_buff *skb)
+{
+	int type = amt_parse_type(skb);
+	int err = 1;
+
+	if (type == -1)
+		goto drop;
+
+	if (amt->mode == AMT_MODE_GATEWAY) {
+		switch (type) {
+		case AMT_MSG_ADVERTISEMENT:
+			err = amt_advertisement_handler(amt, skb);
+			break;
+		case AMT_MSG_MEMBERSHIP_QUERY:
+			err = amt_membership_query_handler(amt, skb);
+			if (!err)
+				return;
+			break;
+		default:
+			netdev_dbg(amt->dev, "Invalid type of Gateway\n");
+			break;
+		}
+	}
+drop:
+	if (err) {
+		amt->dev->stats.rx_dropped++;
+		kfree_skb(skb);
+	} else {
+		consume_skb(skb);
+	}
+}
+
 static int amt_rcv(struct sock *sk, struct sk_buff *skb)
 {
 	struct amt_dev *amt;
@@ -2719,8 +2790,12 @@ static int amt_rcv(struct sock *sk, struct sk_buff *skb)
 				err = true;
 				goto drop;
 			}
-			err = amt_advertisement_handler(amt, skb);
-			break;
+			if (amt_queue_event(amt, AMT_EVENT_RECEIVE, skb)) {
+				netdev_dbg(amt->dev, "AMT Event queue full\n");
+				err = true;
+				goto drop;
+			}
+			goto out;
 		case AMT_MSG_MULTICAST_DATA:
 			if (iph->saddr != amt->remote_ip) {
 				netdev_dbg(amt->dev, "Invalid Relay IP\n");
@@ -2738,11 +2813,12 @@ static int amt_rcv(struct sock *sk, struct sk_buff *skb)
 				err = true;
 				goto drop;
 			}
-			err = amt_membership_query_handler(amt, skb);
-			if (err)
+			if (amt_queue_event(amt, AMT_EVENT_RECEIVE, skb)) {
+				netdev_dbg(amt->dev, "AMT Event queue full\n");
+				err = true;
 				goto drop;
-			else
-				goto out;
+			}
+			goto out;
 		default:
 			err = true;
 			netdev_dbg(amt->dev, "Invalid type of Gateway\n");
@@ -2780,6 +2856,46 @@ out:
 	return 0;
 }
 
+static void amt_event_work(struct work_struct *work)
+{
+	struct amt_dev *amt = container_of(work, struct amt_dev, event_wq);
+	struct sk_buff *skb;
+	u8 event;
+	int i;
+
+	for (i = 0; i < AMT_MAX_EVENTS; i++) {
+		spin_lock_bh(&amt->lock);
+		if (amt->nr_events == 0) {
+			spin_unlock_bh(&amt->lock);
+			return;
+		}
+		event = amt->events[amt->event_idx].event;
+		skb = amt->events[amt->event_idx].skb;
+		amt->events[amt->event_idx].event = AMT_EVENT_NONE;
+		amt->events[amt->event_idx].skb = NULL;
+		amt->nr_events--;
+		amt->event_idx++;
+		amt->event_idx %= AMT_MAX_EVENTS;
+		spin_unlock_bh(&amt->lock);
+
+		switch (event) {
+		case AMT_EVENT_RECEIVE:
+			amt_gw_rcv(amt, skb);
+			break;
+		case AMT_EVENT_SEND_DISCOVERY:
+			amt_event_send_discovery(amt);
+			break;
+		case AMT_EVENT_SEND_REQUEST:
+			amt_event_send_request(amt);
+			break;
+		default:
+			if (skb)
+				kfree_skb(skb);
+			break;
+		}
+	}
+}
+
 static int amt_err_lookup(struct sock *sk, struct sk_buff *skb)
 {
 	struct amt_dev *amt;
@@ -2867,6 +2983,8 @@ static int amt_dev_open(struct net_device *dev)
 
 	amt->ready4 = false;
 	amt->ready6 = false;
+	amt->event_idx = 0;
+	amt->nr_events = 0;
 
 	err = amt_socket_create(amt);
 	if (err)
@@ -2892,6 +3010,8 @@ static int amt_dev_stop(struct net_device *dev)
 	struct amt_dev *amt = netdev_priv(dev);
 	struct amt_tunnel_list *tunnel, *tmp;
 	struct socket *sock;
+	struct sk_buff *skb;
+	int i;
 
 	cancel_delayed_work_sync(&amt->req_wq);
 	cancel_delayed_work_sync(&amt->discovery_wq);
@@ -2904,6 +3024,15 @@ static int amt_dev_stop(struct net_device *dev)
 	if (sock)
 		udp_tunnel_sock_release(sock);
 
+	cancel_work_sync(&amt->event_wq);
+	for (i = 0; i < AMT_MAX_EVENTS; i++) {
+		skb = amt->events[i].skb;
+		if (skb)
+			kfree_skb(skb);
+		amt->events[i].event = AMT_EVENT_NONE;
+		amt->events[i].skb = NULL;
+	}
+
 	amt->ready4 = false;
 	amt->ready6 = false;
 	amt->req_cnt = 0;
@@ -3146,8 +3275,8 @@ static int amt_newlink(struct net *net, struct net_device *dev,
 	INIT_DELAYED_WORK(&amt->discovery_wq, amt_discovery_work);
 	INIT_DELAYED_WORK(&amt->req_wq, amt_req_work);
 	INIT_DELAYED_WORK(&amt->secret_wq, amt_secret_work);
+	INIT_WORK(&amt->event_wq, amt_event_work);
 	INIT_LIST_HEAD(&amt->tunnel_list);
-
 	return 0;
 err:
 	dev_put(amt->stream_dev);
@@ -3280,7 +3409,7 @@ static int __init amt_init(void)
 	if (err < 0)
 		goto unregister_notifier;
 
-	amt_wq = alloc_workqueue("amt", WQ_UNBOUND, 1);
+	amt_wq = alloc_workqueue("amt", WQ_UNBOUND, 0);
 	if (!amt_wq) {
 		err = -ENOMEM;
 		goto rtnl_unregister;
diff --git a/include/net/amt.h b/include/net/amt.h
index 0e40c3d64fcf..08fc30cf2f34 100644
--- a/include/net/amt.h
+++ b/include/net/amt.h
@@ -78,6 +78,15 @@ enum amt_status {
 
 #define AMT_STATUS_MAX (__AMT_STATUS_MAX - 1)
 
+/* Gateway events only */
+enum amt_event {
+	AMT_EVENT_NONE,
+	AMT_EVENT_RECEIVE,
+	AMT_EVENT_SEND_DISCOVERY,
+	AMT_EVENT_SEND_REQUEST,
+	__AMT_EVENT_MAX,
+};
+
 struct amt_header {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
 	u8 type:4,
@@ -292,6 +301,12 @@ struct amt_group_node {
 	struct hlist_head	sources[];
 };
 
+#define AMT_MAX_EVENTS	16
+struct amt_events {
+	enum amt_event event;
+	struct sk_buff *skb;
+};
+
 struct amt_dev {
 	struct net_device       *dev;
 	struct net_device       *stream_dev;
@@ -308,6 +323,7 @@ struct amt_dev {
 	struct delayed_work     req_wq;
 	/* Protected by RTNL */
 	struct delayed_work     secret_wq;
+	struct work_struct	event_wq;
 	/* AMT status */
 	enum amt_status		status;
 	/* Generated key */
@@ -345,6 +361,10 @@ struct amt_dev {
 	/* Used only in gateway mode */
 	u64			mac:48,
 				reserved:16;
+	/* AMT gateway side message handler queue */
+	struct amt_events	events[AMT_MAX_EVENTS];
+	u8			event_idx;
+	u8			nr_events;
 };
 
 #define AMT_TOS			0xc0
-- 
cgit 


From 32f02a211b0a192553075803b0b819027f96bb43 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Jul 2022 13:57:48 +0200
Subject: Revert "platform/chrome: Add Type-C mux set command definitions"

This reverts commit 28a6ed8e39f77f6ac613ec9b7461aa75e85fa79a.

The chrome platform driver changes need to come in through the platform
tree due to some api changes that showed up there that cause build
errors in linux-next

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Link: https://lore.kernel.org/r/20220719160821.5e68e30b@oak.ozlabs.ibm.com
Cc: Prashant Malani <pmalani@chromium.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/platform_data/cros_ec_commands.h | 18 ------------------
 1 file changed, 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index a3945c5e7f50..8cfa8cfca77e 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -5722,21 +5722,8 @@ enum typec_control_command {
 	TYPEC_CONTROL_COMMAND_EXIT_MODES,
 	TYPEC_CONTROL_COMMAND_CLEAR_EVENTS,
 	TYPEC_CONTROL_COMMAND_ENTER_MODE,
-	TYPEC_CONTROL_COMMAND_TBT_UFP_REPLY,
-	TYPEC_CONTROL_COMMAND_USB_MUX_SET,
 };
 
-/* Replies the AP may specify to the TBT EnterMode command as a UFP */
-enum typec_tbt_ufp_reply {
-	TYPEC_TBT_UFP_REPLY_NAK,
-	TYPEC_TBT_UFP_REPLY_ACK,
-};
-
-struct typec_usb_mux_set {
-	uint8_t mux_index;	/* Index of the mux to set in the chain */
-	uint8_t mux_flags;	/* USB_PD_MUX_*-encoded USB mux state to set */
-} __ec_align1;
-
 struct ec_params_typec_control {
 	uint8_t port;
 	uint8_t command;	/* enum typec_control_command */
@@ -5750,8 +5737,6 @@ struct ec_params_typec_control {
 	union {
 		uint32_t clear_events_mask;
 		uint8_t mode_to_enter;      /* enum typec_mode */
-		uint8_t tbt_ufp_reply;      /* enum typec_tbt_ufp_reply */
-		struct typec_usb_mux_set mux_params;
 		uint8_t placeholder[128];
 	};
 } __ec_align1;
@@ -5830,9 +5815,6 @@ enum tcpc_cc_polarity {
 #define PD_STATUS_EVENT_SOP_DISC_DONE		BIT(0)
 #define PD_STATUS_EVENT_SOP_PRIME_DISC_DONE	BIT(1)
 #define PD_STATUS_EVENT_HARD_RESET		BIT(2)
-#define PD_STATUS_EVENT_DISCONNECTED		BIT(3)
-#define PD_STATUS_EVENT_MUX_0_SET_DONE		BIT(4)
-#define PD_STATUS_EVENT_MUX_1_SET_DONE		BIT(5)
 
 struct ec_params_typec_status {
 	uint8_t port;
-- 
cgit 


From 450a563924ae9437758bd468c5b7cee9468ce749 Mon Sep 17 00:00:00 2001
From: Oliver Upton <oupton@google.com>
Date: Tue, 19 Jul 2022 12:52:29 +0000
Subject: KVM: stats: Fix value for KVM_STATS_UNIT_MAX for boolean stats

commit 1b870fa5573e ("kvm: stats: tell userspace which values are
boolean") added a new stat unit (boolean) but failed to raise
KVM_STATS_UNIT_MAX.

Fix by pointing UNIT_MAX at the new max value of UNIT_BOOLEAN.

Fixes: 1b870fa5573e ("kvm: stats: tell userspace which values are boolean")
Reported-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
Signed-off-by: Oliver Upton <oupton@google.com>
Message-Id: <20220719125229.2934273-1-oupton@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst | 2 +-
 include/uapi/linux/kvm.h       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 6e090fb96a0e..98a283930307 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -5658,7 +5658,7 @@ by a string of size ``name_size``.
 	#define KVM_STATS_UNIT_SECONDS		(0x2 << KVM_STATS_UNIT_SHIFT)
 	#define KVM_STATS_UNIT_CYCLES		(0x3 << KVM_STATS_UNIT_SHIFT)
 	#define KVM_STATS_UNIT_BOOLEAN		(0x4 << KVM_STATS_UNIT_SHIFT)
-	#define KVM_STATS_UNIT_MAX		KVM_STATS_UNIT_CYCLES
+	#define KVM_STATS_UNIT_MAX		KVM_STATS_UNIT_BOOLEAN
 
 	#define KVM_STATS_BASE_SHIFT		8
 	#define KVM_STATS_BASE_MASK		(0xF << KVM_STATS_BASE_SHIFT)
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 811897dadcae..860f867c50c0 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -2084,7 +2084,7 @@ struct kvm_stats_header {
 #define KVM_STATS_UNIT_SECONDS		(0x2 << KVM_STATS_UNIT_SHIFT)
 #define KVM_STATS_UNIT_CYCLES		(0x3 << KVM_STATS_UNIT_SHIFT)
 #define KVM_STATS_UNIT_BOOLEAN		(0x4 << KVM_STATS_UNIT_SHIFT)
-#define KVM_STATS_UNIT_MAX		KVM_STATS_UNIT_CYCLES
+#define KVM_STATS_UNIT_MAX		KVM_STATS_UNIT_BOOLEAN
 
 #define KVM_STATS_BASE_SHIFT		8
 #define KVM_STATS_BASE_MASK		(0xF << KVM_STATS_BASE_SHIFT)
-- 
cgit 


From 2b3416ceff5e6bd4922f6d1c61fb68113dd82302 Mon Sep 17 00:00:00 2001
From: Yang Xu <xuyang2018.jy@fujitsu.com>
Date: Thu, 14 Jul 2022 14:11:25 +0800
Subject: fs: add mode_strip_sgid() helper

Add a dedicated helper to handle the setgid bit when creating a new file
in a setgid directory. This is a preparatory patch for moving setgid
stripping into the vfs. The patch contains no functional changes.

Currently the setgid stripping logic is open-coded directly in
inode_init_owner() and the individual filesystems are responsible for
handling setgid inheritance. Since this has proven to be brittle as
evidenced by old issues we uncovered over the last months (see [1] to
[3] below) we will try to move this logic into the vfs.

Link: e014f37db1a2 ("xfs: use setattr_copy to set vfs inode attributes") [1]
Link: 01ea173e103e ("xfs: fix up non-directory creation in SGID directories") [2]
Link: fd84bfdddd16 ("ceph: fix up non-directory creation in SGID directories") [3]
Link: https://lore.kernel.org/r/1657779088-2242-1-git-send-email-xuyang2018.jy@fujitsu.com
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Reviewed-and-Tested-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Yang Xu <xuyang2018.jy@fujitsu.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 fs/inode.c         | 36 ++++++++++++++++++++++++++++++++----
 include/linux/fs.h |  2 ++
 2 files changed, 34 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/fs/inode.c b/fs/inode.c
index bd4da9c5207e..71b36afc3893 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2246,10 +2246,8 @@ void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
 		/* Directories are special, and always inherit S_ISGID */
 		if (S_ISDIR(mode))
 			mode |= S_ISGID;
-		else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
-			 !in_group_p(i_gid_into_mnt(mnt_userns, dir)) &&
-			 !capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID))
-			mode &= ~S_ISGID;
+		else
+			mode = mode_strip_sgid(mnt_userns, dir, mode);
 	} else
 		inode_fsgid_set(inode, mnt_userns);
 	inode->i_mode = mode;
@@ -2405,3 +2403,33 @@ struct timespec64 current_time(struct inode *inode)
 	return timestamp_truncate(now, inode);
 }
 EXPORT_SYMBOL(current_time);
+
+/**
+ * mode_strip_sgid - handle the sgid bit for non-directories
+ * @mnt_userns: User namespace of the mount the inode was created from
+ * @dir: parent directory inode
+ * @mode: mode of the file to be created in @dir
+ *
+ * If the @mode of the new file has both the S_ISGID and S_IXGRP bit
+ * raised and @dir has the S_ISGID bit raised ensure that the caller is
+ * either in the group of the parent directory or they have CAP_FSETID
+ * in their user namespace and are privileged over the parent directory.
+ * In all other cases, strip the S_ISGID bit from @mode.
+ *
+ * Return: the new mode to use for the file
+ */
+umode_t mode_strip_sgid(struct user_namespace *mnt_userns,
+			const struct inode *dir, umode_t mode)
+{
+	if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP))
+		return mode;
+	if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID))
+		return mode;
+	if (in_group_p(i_gid_into_mnt(mnt_userns, dir)))
+		return mode;
+	if (capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID))
+		return mode;
+
+	return mode & ~S_ISGID;
+}
+EXPORT_SYMBOL(mode_strip_sgid);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ad5e3520fae..50642668c60f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1903,6 +1903,8 @@ extern long compat_ptr_ioctl(struct file *file, unsigned int cmd,
 void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
 		      const struct inode *dir, umode_t mode);
 extern bool may_open_dev(const struct path *path);
+umode_t mode_strip_sgid(struct user_namespace *mnt_userns,
+			const struct inode *dir, umode_t mode);
 
 /*
  * This is the "filldir" function type, used by readdir() to let
-- 
cgit 


From 08e950449c6253322ddfbf643f566fd504cfc5c1 Mon Sep 17 00:00:00 2001
From: Tomer Maimon <tmaimon77@gmail.com>
Date: Sun, 17 Jul 2022 12:15:56 +0300
Subject: dt-binding: clk: npcm845: Add binding for Nuvoton NPCM8XX Clock

Add binding for the Arbel BMC NPCM8XX Clock controller.

Signed-off-by: Tomer Maimon <tmaimon77@gmail.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Acked-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 .../bindings/clock/nuvoton,npcm845-clk.yaml        | 49 ++++++++++++++++++++++
 include/dt-bindings/clock/nuvoton,npcm845-clk.h    | 49 ++++++++++++++++++++++
 2 files changed, 98 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/nuvoton,npcm845-clk.yaml
 create mode 100644 include/dt-bindings/clock/nuvoton,npcm845-clk.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/nuvoton,npcm845-clk.yaml b/Documentation/devicetree/bindings/clock/nuvoton,npcm845-clk.yaml
new file mode 100644
index 000000000000..771db2ddf026
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/nuvoton,npcm845-clk.yaml
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/nuvoton,npcm845-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Nuvoton NPCM8XX Clock Controller Binding
+
+maintainers:
+  - Tomer Maimon <tmaimon77@gmail.com>
+
+description: |
+  Nuvoton Arbel BMC NPCM8XX contains an integrated clock controller, which
+  generates and supplies clocks to all modules within the BMC.
+
+properties:
+  compatible:
+    enum:
+      - nuvoton,npcm845-clk
+
+  reg:
+    maxItems: 1
+
+  '#clock-cells':
+    const: 1
+    description:
+      See include/dt-bindings/clock/nuvoton,npcm8xx-clock.h for the full
+      list of NPCM8XX clock IDs.
+
+required:
+  - compatible
+  - reg
+  - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+  - |
+    ahb {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        clock-controller@f0801000 {
+            compatible = "nuvoton,npcm845-clk";
+            reg = <0x0 0xf0801000 0x0 0x1000>;
+            #clock-cells = <1>;
+        };
+    };
+...
diff --git a/include/dt-bindings/clock/nuvoton,npcm845-clk.h b/include/dt-bindings/clock/nuvoton,npcm845-clk.h
new file mode 100644
index 000000000000..e5cce08b00e1
--- /dev/null
+++ b/include/dt-bindings/clock/nuvoton,npcm845-clk.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (C) 2021 Nuvoton Technologies.
+ * Author: Tomer Maimon <tomer.maimon@nuvoton.com>
+ *
+ * Device Tree binding constants for NPCM8XX clock controller.
+ */
+
+#ifndef __DT_BINDINGS_CLOCK_NPCM8XX_H
+#define __DT_BINDINGS_CLOCK_NPCM8XX_H
+
+#define NPCM8XX_CLK_CPU		0
+#define NPCM8XX_CLK_GFX_PIXEL	1
+#define NPCM8XX_CLK_MC		2
+#define NPCM8XX_CLK_ADC		3
+#define NPCM8XX_CLK_AHB		4
+#define NPCM8XX_CLK_TIMER	5
+#define NPCM8XX_CLK_UART	6
+#define NPCM8XX_CLK_UART2	7
+#define NPCM8XX_CLK_MMC		8
+#define NPCM8XX_CLK_SPI3	9
+#define NPCM8XX_CLK_PCI		10
+#define NPCM8XX_CLK_AXI		11
+#define NPCM8XX_CLK_APB4	12
+#define NPCM8XX_CLK_APB3	13
+#define NPCM8XX_CLK_APB2	14
+#define NPCM8XX_CLK_APB1	15
+#define NPCM8XX_CLK_APB5	16
+#define NPCM8XX_CLK_CLKOUT	17
+#define NPCM8XX_CLK_GFX		18
+#define NPCM8XX_CLK_SU		19
+#define NPCM8XX_CLK_SU48	20
+#define NPCM8XX_CLK_SDHC	21
+#define NPCM8XX_CLK_SPI0	22
+#define NPCM8XX_CLK_SPI1	23
+#define NPCM8XX_CLK_SPIX	24
+#define NPCM8XX_CLK_RG		25
+#define NPCM8XX_CLK_RCP		26
+#define NPCM8XX_CLK_PRE_ADC	27
+#define NPCM8XX_CLK_ATB		28
+#define NPCM8XX_CLK_PRE_CLK	29
+#define NPCM8XX_CLK_TH		30
+#define NPCM8XX_CLK_REFCLK	31
+#define NPCM8XX_CLK_SYSBYPCK	32
+#define NPCM8XX_CLK_MCBYPCK	33
+
+#define NPCM8XX_NUM_CLOCKS	(NPCM8XX_CLK_MCBYPCK + 1)
+
+#endif
-- 
cgit 


From 5409b8053511f9a32ee08c3d16827632a5b17e3f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 5 Jul 2022 18:45:03 -0400
Subject: scsi: iscsi: tracing: Use the new __vstring() helper

Instead of open coding a __dynamic_array() with a fixed length (which
defeats the purpose of the dynamic array in the first place). Use the new
__vstring() helper that will use a va_list and only write enough of the
string into the ring buffer that is needed.

Link: https://lkml.kernel.org/r/20220705224750.715763972@goodmis.org

Cc: Fred Herard <fred.herard@oracle.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/trace/events/iscsi.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/iscsi.h b/include/trace/events/iscsi.h
index 87408faf6e4e..8ff2a3ca5d75 100644
--- a/include/trace/events/iscsi.h
+++ b/include/trace/events/iscsi.h
@@ -26,12 +26,12 @@ DECLARE_EVENT_CLASS(iscsi_log_msg,
 
 	TP_STRUCT__entry(
 		__string(dname, 	dev_name(dev)		)
-		__dynamic_array(char,	msg, ISCSI_MSG_MAX	)
+		__vstring(msg,		vaf->fmt, vaf->va)
 	),
 
 	TP_fast_assign(
 		__assign_str(dname, dev_name(dev));
-		vsnprintf(__get_str(msg), ISCSI_MSG_MAX, vaf->fmt, *vaf->va);
+		__assign_vstr(msg, vaf->fmt, vaf->va);
 	),
 
 	TP_printk("%s: %s",__get_str(dname),  __get_str(msg)
-- 
cgit 


From 74003fc4ae7619ac5a7bec8748a14f3a8655f3ea Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 5 Jul 2022 18:45:04 -0400
Subject: scsi: qla2xxx: tracing: Use the new __vstring() helper

Instead of open coding a __dynamic_array() with a fixed length (which
defeats the purpose of the dynamic array in the first place). Use the new
__vstring() helper that will use a va_list and only write enough of the
string into the ring buffer that is needed.

Link: https://lkml.kernel.org/r/20220705224750.896553364@goodmis.org

Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/trace/events/qla.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/qla.h b/include/trace/events/qla.h
index 5857cf682ee7..e7fd55e7dc3d 100644
--- a/include/trace/events/qla.h
+++ b/include/trace/events/qla.h
@@ -22,11 +22,11 @@ DECLARE_EVENT_CLASS(qla_log_event,
 
 	TP_STRUCT__entry(
 		__string(buf, buf)
-		__dynamic_array(char, msg, QLA_MSG_MAX)
+		__vstring(msg, vaf->fmt, vaf->va)
 	),
 	TP_fast_assign(
 		__assign_str(buf, buf);
-		vsnprintf(__get_str(msg), QLA_MSG_MAX, vaf->fmt, *vaf->va);
+		__assign_vstr(msg, vaf->fmt, vaf->va);
 	),
 
 	TP_printk("%s %s", __get_str(buf), __get_str(msg))
-- 
cgit 


From fd1894224407c484f652ad456e1ce423e89bb3eb Mon Sep 17 00:00:00 2001
From: Zhengchao Shao <shaozhengchao@huawei.com>
Date: Fri, 15 Jul 2022 19:55:59 +0800
Subject: bpf: Don't redirect packets with invalid pkt_len

Syzbot found an issue [1]: fq_codel_drop() try to drop a flow whitout any
skbs, that is, the flow->head is null.
The root cause, as the [2] says, is because that bpf_prog_test_run_skb()
run a bpf prog which redirects empty skbs.
So we should determine whether the length of the packet modified by bpf
prog or others like bpf_prog_test is valid before forwarding it directly.

LINK: [1] https://syzkaller.appspot.com/bug?id=0b84da80c2917757915afa89f7738a9d16ec96c5
LINK: [2] https://www.spinics.net/lists/netdev/msg777503.html

Reported-by: syzbot+7a12909485b94426aceb@syzkaller.appspotmail.com
Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
Reviewed-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20220715115559.139691-1-shaozhengchao@huawei.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/skbuff.h | 8 ++++++++
 net/bpf/test_run.c     | 3 +++
 net/core/dev.c         | 1 +
 3 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f6a27ab19202..82e8368ba6e6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2459,6 +2459,14 @@ static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset)
 
 #endif /* NET_SKBUFF_DATA_USES_OFFSET */
 
+static inline void skb_assert_len(struct sk_buff *skb)
+{
+#ifdef CONFIG_DEBUG_NET
+	if (WARN_ONCE(!skb->len, "%s\n", __func__))
+		DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
+#endif /* CONFIG_DEBUG_NET */
+}
+
 /*
  *	Add data to an sk_buff
  */
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 2ca96acbc50a..dc9dc0bedca0 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -955,6 +955,9 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb)
 {
 	struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb;
 
+	if (!skb->len)
+		return -EINVAL;
+
 	if (!__skb)
 		return 0;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 978ed0622d8f..e241a475036f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4168,6 +4168,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 	bool again = false;
 
 	skb_reset_mac_header(skb);
+	skb_assert_len(skb);
 
 	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
 		__skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
-- 
cgit 


From bdb2bc7599298ebb677e40fc92b1fa9e69e05098 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Fri, 15 Jul 2022 12:38:00 -0700
Subject: bpf: fix bpf_skb_pull_data documentation

Fix documentation for bpf_skb_pull_data() helper for
when len == 0.

Fixes: fa15601ab31e ("bpf: add documentation for eBPF helpers (33-41)")
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Acked-by: Quentin Monnet <quentin@isovalent.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/r/20220715193800.3940070-1-joannelkoong@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h       | 3 ++-
 tools/include/uapi/linux/bpf.h | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 379e68fb866f..ffcbf79a556b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2361,7 +2361,8 @@ union bpf_attr {
  * 		Pull in non-linear data in case the *skb* is non-linear and not
  * 		all of *len* are part of the linear section. Make *len* bytes
  * 		from *skb* readable and writable. If a zero value is passed for
- * 		*len*, then the whole length of the *skb* is pulled.
+ *		*len*, then all bytes in the linear part of *skb* will be made
+ *		readable and writable.
  *
  * 		This helper is only needed for reading and writing with direct
  * 		packet access.
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 379e68fb866f..ffcbf79a556b 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2361,7 +2361,8 @@ union bpf_attr {
  * 		Pull in non-linear data in case the *skb* is non-linear and not
  * 		all of *len* are part of the linear section. Make *len* bytes
  * 		from *skb* readable and writable. If a zero value is passed for
- * 		*len*, then the whole length of the *skb* is pulled.
+ *		*len*, then all bytes in the linear part of *skb* will be made
+ *		readable and writable.
  *
  * 		This helper is only needed for reading and writing with direct
  * 		packet access.
-- 
cgit 


From 0c6ab0ca9a662d4ca9742d97156bac0d3067d72d Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch@nvidia.com>
Date: Sun, 3 Jul 2022 13:54:07 -0700
Subject: RDMA/mlx5: Expose steering anchor to userspace

Expose a steering anchor per priority to allow users to re-inject
packets back into default NIC pipeline for additional processing.

MLX5_IB_METHOD_STEERING_ANCHOR_CREATE returns a flow table ID which
a user can use to re-inject packets at a specific priority.

A FTE (flow table entry) can be created and the flow table ID
used as a destination.

When a packet is taken into a RDMA-controlled steering domain (like
software steering) there may be a need to insert the packet back into
the default NIC pipeline. This exposes a flow table ID to the user that can
be used as a destination in a flow table entry.

With this new method priorities that are exposed to users via
MLX5_IB_METHOD_FLOW_MATCHER_CREATE can be reached from a non-zero UID.

As user-created flow tables (via RDMA DEVX) are created with a non-zero UID
thus it's impossible to point to a NIC core flow table (core driver flow tables
are created with UID value of zero) from userspace.
Create flow tables that are exposed to users with the shared UID, this
allows users to point to default NIC flow tables.

Steering loops are prevented at FW level as FW enforces that no flow
table at level X can point to a table at level lower than X.

Link: https://lore.kernel.org/all/20220703205407.110890-6-saeed@kernel.org/
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Yishai Hadas <yishaih@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/hw/mlx5/fs.c          | 138 +++++++++++++++++++++++++++++--
 drivers/infiniband/hw/mlx5/mlx5_ib.h     |   6 ++
 include/uapi/rdma/mlx5_user_ioctl_cmds.h |  17 ++++
 3 files changed, 156 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c
index b1402aea29c1..691d00c89f33 100644
--- a/drivers/infiniband/hw/mlx5/fs.c
+++ b/drivers/infiniband/hw/mlx5/fs.c
@@ -679,7 +679,15 @@ enum flow_table_type {
 #define MLX5_FS_MAX_TYPES	 6
 #define MLX5_FS_MAX_ENTRIES	 BIT(16)
 
-static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns,
+static bool mlx5_ib_shared_ft_allowed(struct ib_device *device)
+{
+	struct mlx5_ib_dev *dev = to_mdev(device);
+
+	return MLX5_CAP_GEN(dev->mdev, shared_object_to_user_object_allowed);
+}
+
+static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev,
+					   struct mlx5_flow_namespace *ns,
 					   struct mlx5_ib_flow_prio *prio,
 					   int priority,
 					   int num_entries, int num_groups,
@@ -688,6 +696,8 @@ static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns,
 	struct mlx5_flow_table_attr ft_attr = {};
 	struct mlx5_flow_table *ft;
 
+	if (mlx5_ib_shared_ft_allowed(&dev->ib_dev))
+		ft_attr.uid = MLX5_SHARED_RESOURCE_UID;
 	ft_attr.prio = priority;
 	ft_attr.max_fte = num_entries;
 	ft_attr.flags = flags;
@@ -784,8 +794,8 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
 
 	ft = prio->flow_table;
 	if (!ft)
-		return _get_prio(ns, prio, priority, max_table_size, num_groups,
-				 flags);
+		return _get_prio(dev, ns, prio, priority, max_table_size,
+				 num_groups, flags);
 
 	return prio;
 }
@@ -927,7 +937,7 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num,
 
 	prio = &dev->flow_db->opfcs[type];
 	if (!prio->flow_table) {
-		prio = _get_prio(ns, prio, priority,
+		prio = _get_prio(dev, ns, prio, priority,
 				 dev->num_ports * MAX_OPFC_RULES, 1, 0);
 		if (IS_ERR(prio)) {
 			err = PTR_ERR(prio);
@@ -1499,7 +1509,7 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority,
 	if (prio->flow_table)
 		return prio;
 
-	return _get_prio(ns, prio, priority, max_table_size,
+	return _get_prio(dev, ns, prio, priority, max_table_size,
 			 MLX5_FS_MAX_TYPES, flags);
 }
 
@@ -2016,6 +2026,23 @@ static int flow_matcher_cleanup(struct ib_uobject *uobject,
 	return 0;
 }
 
+static int steering_anchor_cleanup(struct ib_uobject *uobject,
+				   enum rdma_remove_reason why,
+				   struct uverbs_attr_bundle *attrs)
+{
+	struct mlx5_ib_steering_anchor *obj = uobject->object;
+
+	if (atomic_read(&obj->usecnt))
+		return -EBUSY;
+
+	mutex_lock(&obj->dev->flow_db->lock);
+	put_flow_table(obj->dev, obj->ft_prio, true);
+	mutex_unlock(&obj->dev->flow_db->lock);
+
+	kfree(obj);
+	return 0;
+}
+
 static int mlx5_ib_matcher_ns(struct uverbs_attr_bundle *attrs,
 			      struct mlx5_ib_flow_matcher *obj)
 {
@@ -2122,6 +2149,75 @@ end:
 	return err;
 }
 
+static int UVERBS_HANDLER(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uobject *uobj = uverbs_attr_get_uobject(
+		attrs, MLX5_IB_ATTR_STEERING_ANCHOR_CREATE_HANDLE);
+	struct mlx5_ib_dev *dev = mlx5_udata_to_mdev(&attrs->driver_udata);
+	enum mlx5_ib_uapi_flow_table_type ib_uapi_ft_type;
+	enum mlx5_flow_namespace_type ns_type;
+	struct mlx5_ib_steering_anchor *obj;
+	struct mlx5_ib_flow_prio *ft_prio;
+	u16 priority;
+	u32 ft_id;
+	int err;
+
+	if (!capable(CAP_NET_RAW))
+		return -EPERM;
+
+	err = uverbs_get_const(&ib_uapi_ft_type, attrs,
+			       MLX5_IB_ATTR_STEERING_ANCHOR_FT_TYPE);
+	if (err)
+		return err;
+
+	err = mlx5_ib_ft_type_to_namespace(ib_uapi_ft_type, &ns_type);
+	if (err)
+		return err;
+
+	err = uverbs_copy_from(&priority, attrs,
+			       MLX5_IB_ATTR_STEERING_ANCHOR_PRIORITY);
+	if (err)
+		return err;
+
+	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+	if (!obj)
+		return -ENOMEM;
+
+	mutex_lock(&dev->flow_db->lock);
+	ft_prio = _get_flow_table(dev, priority, ns_type, 0);
+	if (IS_ERR(ft_prio)) {
+		mutex_unlock(&dev->flow_db->lock);
+		err = PTR_ERR(ft_prio);
+		goto free_obj;
+	}
+
+	ft_prio->refcount++;
+	ft_id = mlx5_flow_table_id(ft_prio->flow_table);
+	mutex_unlock(&dev->flow_db->lock);
+
+	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_STEERING_ANCHOR_FT_ID,
+			     &ft_id, sizeof(ft_id));
+	if (err)
+		goto put_flow_table;
+
+	uobj->object = obj;
+	obj->dev = dev;
+	obj->ft_prio = ft_prio;
+	atomic_set(&obj->usecnt, 0);
+
+	return 0;
+
+put_flow_table:
+	mutex_lock(&dev->flow_db->lock);
+	put_flow_table(dev, ft_prio, true);
+	mutex_unlock(&dev->flow_db->lock);
+free_obj:
+	kfree(obj);
+
+	return err;
+}
+
 static struct ib_flow_action *
 mlx5_ib_create_modify_header(struct mlx5_ib_dev *dev,
 			     enum mlx5_ib_uapi_flow_table_type ft_type,
@@ -2478,6 +2574,35 @@ DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_FLOW_MATCHER,
 			    &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_CREATE),
 			    &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_DESTROY));
 
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_STEERING_ANCHOR_CREATE,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_STEERING_ANCHOR_CREATE_HANDLE,
+			MLX5_IB_OBJECT_STEERING_ANCHOR,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY),
+	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_STEERING_ANCHOR_FT_TYPE,
+			     enum mlx5_ib_uapi_flow_table_type,
+			     UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_STEERING_ANCHOR_PRIORITY,
+			   UVERBS_ATTR_TYPE(u16),
+			   UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_STEERING_ANCHOR_FT_ID,
+			   UVERBS_ATTR_TYPE(u32),
+			   UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	MLX5_IB_METHOD_STEERING_ANCHOR_DESTROY,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_STEERING_ANCHOR_DESTROY_HANDLE,
+			MLX5_IB_OBJECT_STEERING_ANCHOR,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+	MLX5_IB_OBJECT_STEERING_ANCHOR,
+	UVERBS_TYPE_ALLOC_IDR(steering_anchor_cleanup),
+	&UVERBS_METHOD(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE),
+	&UVERBS_METHOD(MLX5_IB_METHOD_STEERING_ANCHOR_DESTROY));
+
 const struct uapi_definition mlx5_ib_flow_defs[] = {
 	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
 		MLX5_IB_OBJECT_FLOW_MATCHER),
@@ -2486,6 +2611,9 @@ const struct uapi_definition mlx5_ib_flow_defs[] = {
 		&mlx5_ib_fs),
 	UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION,
 				&mlx5_ib_flow_actions),
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+		MLX5_IB_OBJECT_STEERING_ANCHOR,
+		UAPI_DEF_IS_OBJ_SUPPORTED(mlx5_ib_shared_ft_allowed)),
 	{},
 };
 
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 7460e0dfe6db..688ee7c05a8f 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -259,6 +259,12 @@ struct mlx5_ib_flow_matcher {
 	u8			match_criteria_enable;
 };
 
+struct mlx5_ib_steering_anchor {
+	struct mlx5_ib_flow_prio *ft_prio;
+	struct mlx5_ib_dev *dev;
+	atomic_t usecnt;
+};
+
 struct mlx5_ib_pp {
 	u16 index;
 	struct mlx5_core_dev *mdev;
diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
index e539c84d63f1..3bee490eb585 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
@@ -228,6 +228,7 @@ enum mlx5_ib_objects {
 	MLX5_IB_OBJECT_VAR,
 	MLX5_IB_OBJECT_PP,
 	MLX5_IB_OBJECT_UAR,
+	MLX5_IB_OBJECT_STEERING_ANCHOR,
 };
 
 enum mlx5_ib_flow_matcher_create_attrs {
@@ -248,6 +249,22 @@ enum mlx5_ib_flow_matcher_methods {
 	MLX5_IB_METHOD_FLOW_MATCHER_DESTROY,
 };
 
+enum mlx5_ib_flow_steering_anchor_create_attrs {
+	MLX5_IB_ATTR_STEERING_ANCHOR_CREATE_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
+	MLX5_IB_ATTR_STEERING_ANCHOR_FT_TYPE,
+	MLX5_IB_ATTR_STEERING_ANCHOR_PRIORITY,
+	MLX5_IB_ATTR_STEERING_ANCHOR_FT_ID,
+};
+
+enum mlx5_ib_flow_steering_anchor_destroy_attrs {
+	MLX5_IB_ATTR_STEERING_ANCHOR_DESTROY_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
+};
+
+enum mlx5_ib_steering_anchor_methods {
+	MLX5_IB_METHOD_STEERING_ANCHOR_CREATE = (1U << UVERBS_ID_NS_SHIFT),
+	MLX5_IB_METHOD_STEERING_ANCHOR_DESTROY,
+};
+
 enum mlx5_ib_device_query_context_attrs {
 	MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX = (1U << UVERBS_ID_NS_SHIFT),
 };
-- 
cgit 


From 800d6acf40e5ce676b53a1259fb4e93e56279367 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 27 May 2022 17:07:45 +0200
Subject: rcu: tiny: Record kvfree_call_rcu() call stack for KASAN

When running KASAN with Tiny RCU (e.g. under ARCH=um, where
a working KASAN patch is now available), we don't get any
information on the original kfree_rcu() (or similar) caller
when a problem is reported, as Tiny RCU doesn't record this.

Add the recording, which required pulling kvfree_call_rcu()
out of line for the KASAN case since the recording function
(kasan_record_aux_stack_noalloc) is neither exported, nor
can we include kasan.h into rcutiny.h.

without KASAN, the patch has no size impact (ARCH=um kernel):
    text       data         bss         dec        hex    filename
 6151515    4423154    33148520    43723189    29b29b5    linux
 6151515    4423154    33148520    43723189    29b29b5    linux + patch

with KASAN, the impact on my build was minimal:
    text       data         bss         dec        hex    filename
13915539    7388050    33282304    54585893    340ea25    linux
13911266    7392114    33282304    54585684    340e954    linux + patch
   -4273      +4064         +-0        -209

Acked-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcutiny.h | 11 ++++++++++-
 kernel/rcu/tiny.c       | 14 ++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 5fed476f977f..d84e13f2c384 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -38,7 +38,7 @@ static inline void synchronize_rcu_expedited(void)
  */
 extern void kvfree(const void *addr);
 
-static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+static inline void __kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
 	if (head) {
 		call_rcu(head, func);
@@ -51,6 +51,15 @@ static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 	kvfree((void *) func);
 }
 
+#ifdef CONFIG_KASAN_GENERIC
+void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func);
+#else
+static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+{
+	__kvfree_call_rcu(head, func);
+}
+#endif
+
 void rcu_qs(void);
 
 static inline void rcu_softirq_qs(void)
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 340b3f8b090d..58ff3721d975 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -217,6 +217,20 @@ bool poll_state_synchronize_rcu(unsigned long oldstate)
 }
 EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
 
+#ifdef CONFIG_KASAN_GENERIC
+void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+{
+	if (head) {
+		void *ptr = (void *) head - (unsigned long) func;
+
+		kasan_record_aux_stack_noalloc(ptr);
+	}
+
+	__kvfree_call_rcu(head, func);
+}
+EXPORT_SYMBOL_GPL(kvfree_call_rcu);
+#endif
+
 void __init rcu_init(void)
 {
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
-- 
cgit 


From 2e5e4185ff89ea0fc44c9dead8dccf306a3b3bbb Mon Sep 17 00:00:00 2001
From: Aya Levin <ayal@nvidia.com>
Date: Mon, 4 Jul 2022 19:34:08 +0300
Subject: net/mlx5: Expose ts_cqe_metadata_size2wqe_counter

Add capability field which indicates the mask for wqe_counter which
connects between loopback CQE and the original WQE. With this connection
the driver can identify lost of the loopback CQE and reply PTP
synchronization with timestamp given in the original CQE.

Signed-off-by: Aya Levin <ayal@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 254cc22f5eec..51b4e71017ee 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1833,7 +1833,11 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   sw_vhca_id[0xe];
 	u8	   reserved_at_230[0x10];
 
-	u8	   reserved_at_240[0x5c0];
+	u8	   reserved_at_240[0xb];
+	u8	   ts_cqe_metadata_size2wqe_counter[0x5];
+	u8	   reserved_at_250[0x10];
+
+	u8	   reserved_at_260[0x5a0];
 };
 
 enum mlx5_ifc_flow_destination_type {
-- 
cgit 


From 7c701d92b2b5e5175dbfec875816474b802b0c45 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 12 Jul 2022 21:52:29 +0100
Subject: skbuff: carry external ubuf_info in msghdr

Make possible for network in-kernel callers like io_uring to pass in a
custom ubuf_info by setting it in a new field of struct msghdr.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/socket.h | 1 +
 net/compat.c           | 1 +
 net/socket.c           | 2 ++
 3 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 17311ad9f9af..7bac9fc1cee0 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -69,6 +69,7 @@ struct msghdr {
 	unsigned int	msg_flags;	/* flags on received message */
 	__kernel_size_t	msg_controllen;	/* ancillary data buffer length */
 	struct kiocb	*msg_iocb;	/* ptr to iocb for async requests */
+	struct ubuf_info *msg_ubuf;
 };
 
 struct user_msghdr {
diff --git a/net/compat.c b/net/compat.c
index 210fc3b4d0d8..6cd2e7683dd0 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -80,6 +80,7 @@ int __get_compat_msghdr(struct msghdr *kmsg,
 		return -EMSGSIZE;
 
 	kmsg->msg_iocb = NULL;
+	kmsg->msg_ubuf = NULL;
 	*ptr = msg.msg_iov;
 	*len = msg.msg_iovlen;
 	return 0;
diff --git a/net/socket.c b/net/socket.c
index 96300cdc0625..82af3882b876 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2106,6 +2106,7 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags,
 	msg.msg_control = NULL;
 	msg.msg_controllen = 0;
 	msg.msg_namelen = 0;
+	msg.msg_ubuf = NULL;
 	if (addr) {
 		err = move_addr_to_kernel(addr, addr_len, &address);
 		if (err < 0)
@@ -2405,6 +2406,7 @@ int __copy_msghdr_from_user(struct msghdr *kmsg,
 		return -EMSGSIZE;
 
 	kmsg->msg_iocb = NULL;
+	kmsg->msg_ubuf = NULL;
 	*uiov = msg.msg_iov;
 	*nsegs = msg.msg_iovlen;
 	return 0;
-- 
cgit 


From ebe73a284f4de8c5d401adeccd9b8fe3183b6e95 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@kernel.org>
Date: Tue, 12 Jul 2022 21:52:30 +0100
Subject: net: Allow custom iter handler in msghdr

Add support for custom iov_iter handling to msghdr. The idea is that
in-kernel subsystems want control over how an SG is split.

Signed-off-by: David Ahern <dsahern@kernel.org>
[pavel: move callback into msghdr]
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h |  7 ++++---
 include/linux/socket.h |  4 ++++
 net/core/datagram.c    | 14 ++++++++++----
 net/core/skbuff.c      |  2 +-
 4 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 8e12b3b9ad6c..a8a2dd4cfdfd 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1776,13 +1776,14 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
 void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
 			   bool success);
 
-int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
-			    struct iov_iter *from, size_t length);
+int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
+			    struct sk_buff *skb, struct iov_iter *from,
+			    size_t length);
 
 static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
 					  struct msghdr *msg, int len)
 {
-	return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
+	return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len);
 }
 
 int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 7bac9fc1cee0..3c11ef18a9cf 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -14,6 +14,8 @@ struct file;
 struct pid;
 struct cred;
 struct socket;
+struct sock;
+struct sk_buff;
 
 #define __sockaddr_check_size(size)	\
 	BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage)))
@@ -70,6 +72,8 @@ struct msghdr {
 	__kernel_size_t	msg_controllen;	/* ancillary data buffer length */
 	struct kiocb	*msg_iocb;	/* ptr to iocb for async requests */
 	struct ubuf_info *msg_ubuf;
+	int (*sg_from_iter)(struct sock *sk, struct sk_buff *skb,
+			    struct iov_iter *from, size_t length);
 };
 
 struct user_msghdr {
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 50f4faeea76c..28cdb79df74d 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -613,10 +613,16 @@ fault:
 }
 EXPORT_SYMBOL(skb_copy_datagram_from_iter);
 
-int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
-			    struct iov_iter *from, size_t length)
+int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
+			    struct sk_buff *skb, struct iov_iter *from,
+			    size_t length)
 {
-	int frag = skb_shinfo(skb)->nr_frags;
+	int frag;
+
+	if (msg && msg->sg_from_iter)
+		return msg->sg_from_iter(sk, skb, from, length);
+
+	frag = skb_shinfo(skb)->nr_frags;
 
 	while (length && iov_iter_count(from)) {
 		struct page *pages[MAX_SKB_FRAGS];
@@ -702,7 +708,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
 	if (skb_copy_datagram_from_iter(skb, 0, from, copy))
 		return -EFAULT;
 
-	return __zerocopy_sg_from_iter(NULL, skb, from, ~0U);
+	return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U);
 }
 EXPORT_SYMBOL(zerocopy_sg_from_iter);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index fc22b3d32052..f5a3ebbc1f7e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1358,7 +1358,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 	if (orig_uarg && uarg != orig_uarg)
 		return -EEXIST;
 
-	err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
+	err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
 	if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
 		struct sock *save_sk = skb->sk;
 
-- 
cgit 


From 753f1ca4e1e50248a1b760c9774d6d6b354562cc Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 12 Jul 2022 21:52:31 +0100
Subject: net: introduce managed frags infrastructure

Some users like io_uring can do page pinning more efficiently, so we
want a way to delegate referencing to other subsystems. For that add
a new flag called SKBFL_MANAGED_FRAG_REFS. When set, skb doesn't hold
page references and upper layers are responsivle to managing page
lifetime.

It's allowed to convert skbs from managed to normal by calling
skb_zcopy_downgrade_managed(). The function will take all needed
page references and clear the flag. It's needed, for instance,
to avoid mixing managed modes.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 25 +++++++++++++++++++++++--
 net/core/skbuff.c      | 29 +++++++++++++++++++++++++++--
 2 files changed, 50 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a8a2dd4cfdfd..07004593d7ca 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -688,11 +688,16 @@ enum {
 	SKBFL_PURE_ZEROCOPY = BIT(2),
 
 	SKBFL_DONT_ORPHAN = BIT(3),
+
+	/* page references are managed by the ubuf_info, so it's safe to
+	 * use frags only up until ubuf_info is released
+	 */
+	SKBFL_MANAGED_FRAG_REFS = BIT(4),
 };
 
 #define SKBFL_ZEROCOPY_FRAG	(SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
 #define SKBFL_ALL_ZEROCOPY	(SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \
-				 SKBFL_DONT_ORPHAN)
+				 SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS)
 
 /*
  * The callback notifies userspace to release buffers when skb DMA is done in
@@ -1810,6 +1815,11 @@ static inline bool skb_zcopy_pure(const struct sk_buff *skb)
 	return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY;
 }
 
+static inline bool skb_zcopy_managed(const struct sk_buff *skb)
+{
+	return skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAG_REFS;
+}
+
 static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1,
 				       const struct sk_buff *skb2)
 {
@@ -1884,6 +1894,14 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
 	}
 }
 
+void __skb_zcopy_downgrade_managed(struct sk_buff *skb);
+
+static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb)
+{
+	if (unlikely(skb_zcopy_managed(skb)))
+		__skb_zcopy_downgrade_managed(skb);
+}
+
 static inline void skb_mark_not_on_list(struct sk_buff *skb)
 {
 	skb->next = NULL;
@@ -3499,7 +3517,10 @@ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
  */
 static inline void skb_frag_unref(struct sk_buff *skb, int f)
 {
-	__skb_frag_unref(&skb_shinfo(skb)->frags[f], skb->pp_recycle);
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+	if (!skb_zcopy_managed(skb))
+		__skb_frag_unref(&shinfo->frags[f], skb->pp_recycle);
 }
 
 /**
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f5a3ebbc1f7e..cf4107d80bc4 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -666,11 +666,18 @@ static void skb_release_data(struct sk_buff *skb)
 			      &shinfo->dataref))
 		goto exit;
 
-	skb_zcopy_clear(skb, true);
+	if (skb_zcopy(skb)) {
+		bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;
+
+		skb_zcopy_clear(skb, true);
+		if (skip_unref)
+			goto free_head;
+	}
 
 	for (i = 0; i < shinfo->nr_frags; i++)
 		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
 
+free_head:
 	if (shinfo->frag_list)
 		kfree_skb_list(shinfo->frag_list);
 
@@ -895,7 +902,10 @@ EXPORT_SYMBOL(skb_dump);
  */
 void skb_tx_error(struct sk_buff *skb)
 {
-	skb_zcopy_clear(skb, true);
+	if (skb) {
+		skb_zcopy_downgrade_managed(skb);
+		skb_zcopy_clear(skb, true);
+	}
 }
 EXPORT_SYMBOL(skb_tx_error);
 
@@ -1375,6 +1385,16 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 }
 EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
 
+void __skb_zcopy_downgrade_managed(struct sk_buff *skb)
+{
+	int i;
+
+	skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS;
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+		skb_frag_ref(skb, i);
+}
+EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed);
+
 static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
 			      gfp_t gfp_mask)
 {
@@ -1692,6 +1712,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 
 	BUG_ON(skb_shared(skb));
 
+	skb_zcopy_downgrade_managed(skb);
+
 	size = SKB_DATA_ALIGN(size);
 
 	if (skb_pfmemalloc(skb))
@@ -3488,6 +3510,8 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
 	int pos = skb_headlen(skb);
 	const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY;
 
+	skb_zcopy_downgrade_managed(skb);
+
 	skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags;
 	skb_zerocopy_clone(skb1, skb, 0);
 	if (len < pos)	/* Split line is inside header. */
@@ -3841,6 +3865,7 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
 	if (skb_can_coalesce(skb, i, page, offset)) {
 		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
 	} else if (i < MAX_SKB_FRAGS) {
+		skb_zcopy_downgrade_managed(skb);
 		get_page(page);
 		skb_fill_page_desc(skb, i, page, offset, size);
 	} else {
-- 
cgit 


From 84ce071e38a6e25ea3ea91188e5482ac1f17b3af Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 12 Jul 2022 21:52:32 +0100
Subject: net: introduce __skb_fill_page_desc_noacc

Managed pages contain pinned userspace pages and controlled by upper
layers, there is no need in tracking skb->pfmemalloc for them. Introduce
a helper for filling frags but ignoring page tracking, it'll be needed
later.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 07004593d7ca..1111adefd906 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2550,6 +2550,22 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb)
 	return skb_headlen(skb) + __skb_pagelen(skb);
 }
 
+static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo,
+					      int i, struct page *page,
+					      int off, int size)
+{
+	skb_frag_t *frag = &shinfo->frags[i];
+
+	/*
+	 * Propagate page pfmemalloc to the skb if we can. The problem is
+	 * that not all callers have unique ownership of the page but rely
+	 * on page_is_pfmemalloc doing the right thing(tm).
+	 */
+	frag->bv_page		  = page;
+	frag->bv_offset		  = off;
+	skb_frag_size_set(frag, size);
+}
+
 /**
  * __skb_fill_page_desc - initialise a paged fragment in an skb
  * @skb: buffer containing fragment to be initialised
@@ -2566,17 +2582,7 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb)
 static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
 					struct page *page, int off, int size)
 {
-	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-
-	/*
-	 * Propagate page pfmemalloc to the skb if we can. The problem is
-	 * that not all callers have unique ownership of the page but rely
-	 * on page_is_pfmemalloc doing the right thing(tm).
-	 */
-	frag->bv_page		  = page;
-	frag->bv_offset		  = off;
-	skb_frag_size_set(frag, size);
-
+	__skb_fill_page_desc_noacc(skb_shinfo(skb), i, page, off, size);
 	page = compound_head(page);
 	if (page_is_pfmemalloc(page))
 		skb->pfmemalloc	= true;
-- 
cgit 


From e6b8a0a5e7f688e092d1c639d438ccd0b323213c Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Tue, 19 Jul 2022 13:52:44 -0700
Subject: PCI: Add vendor ID for the PCI SIG

This ID is used in DOE headers to identify protocols that are defined
within the PCI Express Base Specification, PCIe r6.0, sec 6.30.1.1 table
6-32.

Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Davidlohr Bueso <dave@stgolabs.net>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://lore.kernel.org/r/20220719205249.566684-2-ira.weiny@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/pci_ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 0178823ce8c2..8af3b86206b1 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -151,6 +151,7 @@
 #define PCI_CLASS_OTHERS		0xff
 
 /* Vendors and devices.  Sort key: vendor first, device next. */
+#define PCI_VENDOR_ID_PCI_SIG		0x0001
 
 #define PCI_VENDOR_ID_LOONGSON		0x0014
 
-- 
cgit 


From 9d24322e887b6a3d3f9f9c3e76937a646102c8c1 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Tue, 19 Jul 2022 13:52:46 -0700
Subject: PCI/DOE: Add DOE mailbox support functions

Introduced in a PCIe r6.0, sec 6.30, DOE provides a config space based
mailbox with standard protocol discovery.  Each mailbox is accessed
through a DOE Extended Capability.

Each DOE mailbox must support the DOE discovery protocol in addition to
any number of additional protocols.

Define core PCIe functionality to manage a single PCIe DOE mailbox at a
defined config space offset.  Functionality includes iterating,
creating, query of supported protocol, and task submission.  Destruction
of the mailboxes is device managed.

Cc: "Li, Ming" <ming4.li@intel.com>
Cc: Bjorn Helgaas <helgaas@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Acked-by: Bjorn Helgaas <helgaas@kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Co-developed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20220719205249.566684-4-ira.weiny@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 .clang-format                 |   1 +
 drivers/pci/Kconfig           |   3 +
 drivers/pci/Makefile          |   1 +
 drivers/pci/doe.c             | 536 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/pci-doe.h       |  77 ++++++
 include/uapi/linux/pci_regs.h |  29 ++-
 6 files changed, 646 insertions(+), 1 deletion(-)
 create mode 100644 drivers/pci/doe.c
 create mode 100644 include/linux/pci-doe.h

(limited to 'include')

diff --git a/.clang-format b/.clang-format
index 9b87ea1fc16e..1247d54f9e49 100644
--- a/.clang-format
+++ b/.clang-format
@@ -516,6 +516,7 @@ ForEachMacros:
   - 'of_property_for_each_string'
   - 'of_property_for_each_u32'
   - 'pci_bus_for_each_resource'
+  - 'pci_doe_for_each_off'
   - 'pcl_for_each_chunk'
   - 'pcl_for_each_segment'
   - 'pcm_for_each_format'
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 133c73207782..b2f2e588a817 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -121,6 +121,9 @@ config XEN_PCIDEV_FRONTEND
 config PCI_ATS
 	bool
 
+config PCI_DOE
+	bool
+
 config PCI_ECAM
 	bool
 
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 0da6b1ebc694..2680e4c92f0a 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_PCI_ECAM)		+= ecam.o
 obj-$(CONFIG_PCI_P2PDMA)	+= p2pdma.o
 obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o
 obj-$(CONFIG_VGA_ARB)		+= vgaarb.o
+obj-$(CONFIG_PCI_DOE)		+= doe.o
 
 # Endpoint library must be initialized before its users
 obj-$(CONFIG_PCI_ENDPOINT)	+= endpoint/
diff --git a/drivers/pci/doe.c b/drivers/pci/doe.c
new file mode 100644
index 000000000000..e402f05068a5
--- /dev/null
+++ b/drivers/pci/doe.c
@@ -0,0 +1,536 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Data Object Exchange
+ *	PCIe r6.0, sec 6.30 DOE
+ *
+ * Copyright (C) 2021 Huawei
+ *	Jonathan Cameron <Jonathan.Cameron@huawei.com>
+ *
+ * Copyright (C) 2022 Intel Corporation
+ *	Ira Weiny <ira.weiny@intel.com>
+ */
+
+#define dev_fmt(fmt) "DOE: " fmt
+
+#include <linux/bitfield.h>
+#include <linux/delay.h>
+#include <linux/jiffies.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/pci-doe.h>
+#include <linux/workqueue.h>
+
+#define PCI_DOE_PROTOCOL_DISCOVERY 0
+
+/* Timeout of 1 second from 6.30.2 Operation, PCI Spec r6.0 */
+#define PCI_DOE_TIMEOUT HZ
+#define PCI_DOE_POLL_INTERVAL	(PCI_DOE_TIMEOUT / 128)
+
+#define PCI_DOE_FLAG_CANCEL	0
+#define PCI_DOE_FLAG_DEAD	1
+
+/**
+ * struct pci_doe_mb - State for a single DOE mailbox
+ *
+ * This state is used to manage a single DOE mailbox capability.  All fields
+ * should be considered opaque to the consumers and the structure passed into
+ * the helpers below after being created by devm_pci_doe_create()
+ *
+ * @pdev: PCI device this mailbox belongs to
+ * @cap_offset: Capability offset
+ * @prots: Array of protocols supported (encoded as long values)
+ * @wq: Wait queue for work item
+ * @work_queue: Queue of pci_doe_work items
+ * @flags: Bit array of PCI_DOE_FLAG_* flags
+ */
+struct pci_doe_mb {
+	struct pci_dev *pdev;
+	u16 cap_offset;
+	struct xarray prots;
+
+	wait_queue_head_t wq;
+	struct workqueue_struct *work_queue;
+	unsigned long flags;
+};
+
+static int pci_doe_wait(struct pci_doe_mb *doe_mb, unsigned long timeout)
+{
+	if (wait_event_timeout(doe_mb->wq,
+			       test_bit(PCI_DOE_FLAG_CANCEL, &doe_mb->flags),
+			       timeout))
+		return -EIO;
+	return 0;
+}
+
+static void pci_doe_write_ctrl(struct pci_doe_mb *doe_mb, u32 val)
+{
+	struct pci_dev *pdev = doe_mb->pdev;
+	int offset = doe_mb->cap_offset;
+
+	pci_write_config_dword(pdev, offset + PCI_DOE_CTRL, val);
+}
+
+static int pci_doe_abort(struct pci_doe_mb *doe_mb)
+{
+	struct pci_dev *pdev = doe_mb->pdev;
+	int offset = doe_mb->cap_offset;
+	unsigned long timeout_jiffies;
+
+	pci_dbg(pdev, "[%x] Issuing Abort\n", offset);
+
+	timeout_jiffies = jiffies + PCI_DOE_TIMEOUT;
+	pci_doe_write_ctrl(doe_mb, PCI_DOE_CTRL_ABORT);
+
+	do {
+		int rc;
+		u32 val;
+
+		rc = pci_doe_wait(doe_mb, PCI_DOE_POLL_INTERVAL);
+		if (rc)
+			return rc;
+		pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val);
+
+		/* Abort success! */
+		if (!FIELD_GET(PCI_DOE_STATUS_ERROR, val) &&
+		    !FIELD_GET(PCI_DOE_STATUS_BUSY, val))
+			return 0;
+
+	} while (!time_after(jiffies, timeout_jiffies));
+
+	/* Abort has timed out and the MB is dead */
+	pci_err(pdev, "[%x] ABORT timed out\n", offset);
+	return -EIO;
+}
+
+static int pci_doe_send_req(struct pci_doe_mb *doe_mb,
+			    struct pci_doe_task *task)
+{
+	struct pci_dev *pdev = doe_mb->pdev;
+	int offset = doe_mb->cap_offset;
+	u32 val;
+	int i;
+
+	/*
+	 * Check the DOE busy bit is not set. If it is set, this could indicate
+	 * someone other than Linux (e.g. firmware) is using the mailbox. Note
+	 * it is expected that firmware and OS will negotiate access rights via
+	 * an, as yet to be defined, method.
+	 */
+	pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val);
+	if (FIELD_GET(PCI_DOE_STATUS_BUSY, val))
+		return -EBUSY;
+
+	if (FIELD_GET(PCI_DOE_STATUS_ERROR, val))
+		return -EIO;
+
+	/* Write DOE Header */
+	val = FIELD_PREP(PCI_DOE_DATA_OBJECT_HEADER_1_VID, task->prot.vid) |
+		FIELD_PREP(PCI_DOE_DATA_OBJECT_HEADER_1_TYPE, task->prot.type);
+	pci_write_config_dword(pdev, offset + PCI_DOE_WRITE, val);
+	/* Length is 2 DW of header + length of payload in DW */
+	pci_write_config_dword(pdev, offset + PCI_DOE_WRITE,
+			       FIELD_PREP(PCI_DOE_DATA_OBJECT_HEADER_2_LENGTH,
+					  2 + task->request_pl_sz /
+						sizeof(u32)));
+	for (i = 0; i < task->request_pl_sz / sizeof(u32); i++)
+		pci_write_config_dword(pdev, offset + PCI_DOE_WRITE,
+				       task->request_pl[i]);
+
+	pci_doe_write_ctrl(doe_mb, PCI_DOE_CTRL_GO);
+
+	return 0;
+}
+
+static bool pci_doe_data_obj_ready(struct pci_doe_mb *doe_mb)
+{
+	struct pci_dev *pdev = doe_mb->pdev;
+	int offset = doe_mb->cap_offset;
+	u32 val;
+
+	pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val);
+	if (FIELD_GET(PCI_DOE_STATUS_DATA_OBJECT_READY, val))
+		return true;
+	return false;
+}
+
+static int pci_doe_recv_resp(struct pci_doe_mb *doe_mb, struct pci_doe_task *task)
+{
+	struct pci_dev *pdev = doe_mb->pdev;
+	int offset = doe_mb->cap_offset;
+	size_t length, payload_length;
+	u32 val;
+	int i;
+
+	/* Read the first dword to get the protocol */
+	pci_read_config_dword(pdev, offset + PCI_DOE_READ, &val);
+	if ((FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_1_VID, val) != task->prot.vid) ||
+	    (FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_1_TYPE, val) != task->prot.type)) {
+		dev_err_ratelimited(&pdev->dev, "[%x] expected [VID, Protocol] = [%04x, %02x], got [%04x, %02x]\n",
+				    doe_mb->cap_offset, task->prot.vid, task->prot.type,
+				    FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_1_VID, val),
+				    FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_1_TYPE, val));
+		return -EIO;
+	}
+
+	pci_write_config_dword(pdev, offset + PCI_DOE_READ, 0);
+	/* Read the second dword to get the length */
+	pci_read_config_dword(pdev, offset + PCI_DOE_READ, &val);
+	pci_write_config_dword(pdev, offset + PCI_DOE_READ, 0);
+
+	length = FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_2_LENGTH, val);
+	if (length > SZ_1M || length < 2)
+		return -EIO;
+
+	/* First 2 dwords have already been read */
+	length -= 2;
+	payload_length = min(length, task->response_pl_sz / sizeof(u32));
+	/* Read the rest of the response payload */
+	for (i = 0; i < payload_length; i++) {
+		pci_read_config_dword(pdev, offset + PCI_DOE_READ,
+				      &task->response_pl[i]);
+		/* Prior to the last ack, ensure Data Object Ready */
+		if (i == (payload_length - 1) && !pci_doe_data_obj_ready(doe_mb))
+			return -EIO;
+		pci_write_config_dword(pdev, offset + PCI_DOE_READ, 0);
+	}
+
+	/* Flush excess length */
+	for (; i < length; i++) {
+		pci_read_config_dword(pdev, offset + PCI_DOE_READ, &val);
+		pci_write_config_dword(pdev, offset + PCI_DOE_READ, 0);
+	}
+
+	/* Final error check to pick up on any since Data Object Ready */
+	pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val);
+	if (FIELD_GET(PCI_DOE_STATUS_ERROR, val))
+		return -EIO;
+
+	return min(length, task->response_pl_sz / sizeof(u32)) * sizeof(u32);
+}
+
+static void signal_task_complete(struct pci_doe_task *task, int rv)
+{
+	task->rv = rv;
+	task->complete(task);
+}
+
+static void signal_task_abort(struct pci_doe_task *task, int rv)
+{
+	struct pci_doe_mb *doe_mb = task->doe_mb;
+	struct pci_dev *pdev = doe_mb->pdev;
+
+	if (pci_doe_abort(doe_mb)) {
+		/*
+		 * If the device can't process an abort; set the mailbox dead
+		 *	- no more submissions
+		 */
+		pci_err(pdev, "[%x] Abort failed marking mailbox dead\n",
+			doe_mb->cap_offset);
+		set_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags);
+	}
+	signal_task_complete(task, rv);
+}
+
+static void doe_statemachine_work(struct work_struct *work)
+{
+	struct pci_doe_task *task = container_of(work, struct pci_doe_task,
+						 work);
+	struct pci_doe_mb *doe_mb = task->doe_mb;
+	struct pci_dev *pdev = doe_mb->pdev;
+	int offset = doe_mb->cap_offset;
+	unsigned long timeout_jiffies;
+	u32 val;
+	int rc;
+
+	if (test_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags)) {
+		signal_task_complete(task, -EIO);
+		return;
+	}
+
+	/* Send request */
+	rc = pci_doe_send_req(doe_mb, task);
+	if (rc) {
+		/*
+		 * The specification does not provide any guidance on how to
+		 * resolve conflicting requests from other entities.
+		 * Furthermore, it is likely that busy will not be detected
+		 * most of the time.  Flag any detection of status busy with an
+		 * error.
+		 */
+		if (rc == -EBUSY)
+			dev_err_ratelimited(&pdev->dev, "[%x] busy detected; another entity is sending conflicting requests\n",
+					    offset);
+		signal_task_abort(task, rc);
+		return;
+	}
+
+	timeout_jiffies = jiffies + PCI_DOE_TIMEOUT;
+	/* Poll for response */
+retry_resp:
+	pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val);
+	if (FIELD_GET(PCI_DOE_STATUS_ERROR, val)) {
+		signal_task_abort(task, -EIO);
+		return;
+	}
+
+	if (!FIELD_GET(PCI_DOE_STATUS_DATA_OBJECT_READY, val)) {
+		if (time_after(jiffies, timeout_jiffies)) {
+			signal_task_abort(task, -EIO);
+			return;
+		}
+		rc = pci_doe_wait(doe_mb, PCI_DOE_POLL_INTERVAL);
+		if (rc) {
+			signal_task_abort(task, rc);
+			return;
+		}
+		goto retry_resp;
+	}
+
+	rc  = pci_doe_recv_resp(doe_mb, task);
+	if (rc < 0) {
+		signal_task_abort(task, rc);
+		return;
+	}
+
+	signal_task_complete(task, rc);
+}
+
+static void pci_doe_task_complete(struct pci_doe_task *task)
+{
+	complete(task->private);
+}
+
+static int pci_doe_discovery(struct pci_doe_mb *doe_mb, u8 *index, u16 *vid,
+			     u8 *protocol)
+{
+	u32 request_pl = FIELD_PREP(PCI_DOE_DATA_OBJECT_DISC_REQ_3_INDEX,
+				    *index);
+	u32 response_pl;
+	DECLARE_COMPLETION_ONSTACK(c);
+	struct pci_doe_task task = {
+		.prot.vid = PCI_VENDOR_ID_PCI_SIG,
+		.prot.type = PCI_DOE_PROTOCOL_DISCOVERY,
+		.request_pl = &request_pl,
+		.request_pl_sz = sizeof(request_pl),
+		.response_pl = &response_pl,
+		.response_pl_sz = sizeof(response_pl),
+		.complete = pci_doe_task_complete,
+		.private = &c,
+	};
+	int rc;
+
+	rc = pci_doe_submit_task(doe_mb, &task);
+	if (rc < 0)
+		return rc;
+
+	wait_for_completion(&c);
+
+	if (task.rv != sizeof(response_pl))
+		return -EIO;
+
+	*vid = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_VID, response_pl);
+	*protocol = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL,
+			      response_pl);
+	*index = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_NEXT_INDEX,
+			   response_pl);
+
+	return 0;
+}
+
+static void *pci_doe_xa_prot_entry(u16 vid, u8 prot)
+{
+	return xa_mk_value((vid << 8) | prot);
+}
+
+static int pci_doe_cache_protocols(struct pci_doe_mb *doe_mb)
+{
+	u8 index = 0;
+	u8 xa_idx = 0;
+
+	do {
+		int rc;
+		u16 vid;
+		u8 prot;
+
+		rc = pci_doe_discovery(doe_mb, &index, &vid, &prot);
+		if (rc)
+			return rc;
+
+		pci_dbg(doe_mb->pdev,
+			"[%x] Found protocol %d vid: %x prot: %x\n",
+			doe_mb->cap_offset, xa_idx, vid, prot);
+
+		rc = xa_insert(&doe_mb->prots, xa_idx++,
+			       pci_doe_xa_prot_entry(vid, prot), GFP_KERNEL);
+		if (rc)
+			return rc;
+	} while (index);
+
+	return 0;
+}
+
+static void pci_doe_xa_destroy(void *mb)
+{
+	struct pci_doe_mb *doe_mb = mb;
+
+	xa_destroy(&doe_mb->prots);
+}
+
+static void pci_doe_destroy_workqueue(void *mb)
+{
+	struct pci_doe_mb *doe_mb = mb;
+
+	destroy_workqueue(doe_mb->work_queue);
+}
+
+static void pci_doe_flush_mb(void *mb)
+{
+	struct pci_doe_mb *doe_mb = mb;
+
+	/* Stop all pending work items from starting */
+	set_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags);
+
+	/* Cancel an in progress work item, if necessary */
+	set_bit(PCI_DOE_FLAG_CANCEL, &doe_mb->flags);
+	wake_up(&doe_mb->wq);
+
+	/* Flush all work items */
+	flush_workqueue(doe_mb->work_queue);
+}
+
+/**
+ * pcim_doe_create_mb() - Create a DOE mailbox object
+ *
+ * @pdev: PCI device to create the DOE mailbox for
+ * @cap_offset: Offset of the DOE mailbox
+ *
+ * Create a single mailbox object to manage the mailbox protocol at the
+ * cap_offset specified.
+ *
+ * RETURNS: created mailbox object on success
+ *	    ERR_PTR(-errno) on failure
+ */
+struct pci_doe_mb *pcim_doe_create_mb(struct pci_dev *pdev, u16 cap_offset)
+{
+	struct pci_doe_mb *doe_mb;
+	struct device *dev = &pdev->dev;
+	int rc;
+
+	doe_mb = devm_kzalloc(dev, sizeof(*doe_mb), GFP_KERNEL);
+	if (!doe_mb)
+		return ERR_PTR(-ENOMEM);
+
+	doe_mb->pdev = pdev;
+	doe_mb->cap_offset = cap_offset;
+	init_waitqueue_head(&doe_mb->wq);
+
+	xa_init(&doe_mb->prots);
+	rc = devm_add_action(dev, pci_doe_xa_destroy, doe_mb);
+	if (rc)
+		return ERR_PTR(rc);
+
+	doe_mb->work_queue = alloc_ordered_workqueue("%s %s DOE [%x]", 0,
+						dev_driver_string(&pdev->dev),
+						pci_name(pdev),
+						doe_mb->cap_offset);
+	if (!doe_mb->work_queue) {
+		pci_err(pdev, "[%x] failed to allocate work queue\n",
+			doe_mb->cap_offset);
+		return ERR_PTR(-ENOMEM);
+	}
+	rc = devm_add_action_or_reset(dev, pci_doe_destroy_workqueue, doe_mb);
+	if (rc)
+		return ERR_PTR(rc);
+
+	/* Reset the mailbox by issuing an abort */
+	rc = pci_doe_abort(doe_mb);
+	if (rc) {
+		pci_err(pdev, "[%x] failed to reset mailbox with abort command : %d\n",
+			doe_mb->cap_offset, rc);
+		return ERR_PTR(rc);
+	}
+
+	/*
+	 * The state machine and the mailbox should be in sync now;
+	 * Set up mailbox flush prior to using the mailbox to query protocols.
+	 */
+	rc = devm_add_action_or_reset(dev, pci_doe_flush_mb, doe_mb);
+	if (rc)
+		return ERR_PTR(rc);
+
+	rc = pci_doe_cache_protocols(doe_mb);
+	if (rc) {
+		pci_err(pdev, "[%x] failed to cache protocols : %d\n",
+			doe_mb->cap_offset, rc);
+		return ERR_PTR(rc);
+	}
+
+	return doe_mb;
+}
+EXPORT_SYMBOL_GPL(pcim_doe_create_mb);
+
+/**
+ * pci_doe_supports_prot() - Return if the DOE instance supports the given
+ *			     protocol
+ * @doe_mb: DOE mailbox capability to query
+ * @vid: Protocol Vendor ID
+ * @type: Protocol type
+ *
+ * RETURNS: True if the DOE mailbox supports the protocol specified
+ */
+bool pci_doe_supports_prot(struct pci_doe_mb *doe_mb, u16 vid, u8 type)
+{
+	unsigned long index;
+	void *entry;
+
+	/* The discovery protocol must always be supported */
+	if (vid == PCI_VENDOR_ID_PCI_SIG && type == PCI_DOE_PROTOCOL_DISCOVERY)
+		return true;
+
+	xa_for_each(&doe_mb->prots, index, entry)
+		if (entry == pci_doe_xa_prot_entry(vid, type))
+			return true;
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(pci_doe_supports_prot);
+
+/**
+ * pci_doe_submit_task() - Submit a task to be processed by the state machine
+ *
+ * @doe_mb: DOE mailbox capability to submit to
+ * @task: task to be queued
+ *
+ * Submit a DOE task (request/response) to the DOE mailbox to be processed.
+ * Returns upon queueing the task object.  If the queue is full this function
+ * will sleep until there is room in the queue.
+ *
+ * task->complete will be called when the state machine is done processing this
+ * task.
+ *
+ * Excess data will be discarded.
+ *
+ * RETURNS: 0 when task has been successfully queued, -ERRNO on error
+ */
+int pci_doe_submit_task(struct pci_doe_mb *doe_mb, struct pci_doe_task *task)
+{
+	if (!pci_doe_supports_prot(doe_mb, task->prot.vid, task->prot.type))
+		return -EINVAL;
+
+	/*
+	 * DOE requests must be a whole number of DW and the response needs to
+	 * be big enough for at least 1 DW
+	 */
+	if (task->request_pl_sz % sizeof(u32) ||
+	    task->response_pl_sz < sizeof(u32))
+		return -EINVAL;
+
+	if (test_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags))
+		return -EIO;
+
+	task->doe_mb = doe_mb;
+	INIT_WORK(&task->work, doe_statemachine_work);
+	queue_work(doe_mb->work_queue, &task->work);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pci_doe_submit_task);
diff --git a/include/linux/pci-doe.h b/include/linux/pci-doe.h
new file mode 100644
index 000000000000..ed9b4df792b8
--- /dev/null
+++ b/include/linux/pci-doe.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Data Object Exchange
+ *	PCIe r6.0, sec 6.30 DOE
+ *
+ * Copyright (C) 2021 Huawei
+ *     Jonathan Cameron <Jonathan.Cameron@huawei.com>
+ *
+ * Copyright (C) 2022 Intel Corporation
+ *	Ira Weiny <ira.weiny@intel.com>
+ */
+
+#ifndef LINUX_PCI_DOE_H
+#define LINUX_PCI_DOE_H
+
+struct pci_doe_protocol {
+	u16 vid;
+	u8 type;
+};
+
+struct pci_doe_mb;
+
+/**
+ * struct pci_doe_task - represents a single query/response
+ *
+ * @prot: DOE Protocol
+ * @request_pl: The request payload
+ * @request_pl_sz: Size of the request payload (bytes)
+ * @response_pl: The response payload
+ * @response_pl_sz: Size of the response payload (bytes)
+ * @rv: Return value.  Length of received response or error (bytes)
+ * @complete: Called when task is complete
+ * @private: Private data for the consumer
+ * @work: Used internally by the mailbox
+ * @doe_mb: Used internally by the mailbox
+ *
+ * The payload sizes and rv are specified in bytes with the following
+ * restrictions concerning the protocol.
+ *
+ *	1) The request_pl_sz must be a multiple of double words (4 bytes)
+ *	2) The response_pl_sz must be >= a single double word (4 bytes)
+ *	3) rv is returned as bytes but it will be a multiple of double words
+ *
+ * NOTE there is no need for the caller to initialize work or doe_mb.
+ */
+struct pci_doe_task {
+	struct pci_doe_protocol prot;
+	u32 *request_pl;
+	size_t request_pl_sz;
+	u32 *response_pl;
+	size_t response_pl_sz;
+	int rv;
+	void (*complete)(struct pci_doe_task *task);
+	void *private;
+
+	/* No need for the user to initialize these fields */
+	struct work_struct work;
+	struct pci_doe_mb *doe_mb;
+};
+
+/**
+ * pci_doe_for_each_off - Iterate each DOE capability
+ * @pdev: struct pci_dev to iterate
+ * @off: u16 of config space offset of each mailbox capability found
+ */
+#define pci_doe_for_each_off(pdev, off) \
+	for (off = pci_find_next_ext_capability(pdev, off, \
+					PCI_EXT_CAP_ID_DOE); \
+		off > 0; \
+		off = pci_find_next_ext_capability(pdev, off, \
+					PCI_EXT_CAP_ID_DOE))
+
+struct pci_doe_mb *pcim_doe_create_mb(struct pci_dev *pdev, u16 cap_offset);
+bool pci_doe_supports_prot(struct pci_doe_mb *doe_mb, u16 vid, u8 type);
+int pci_doe_submit_task(struct pci_doe_mb *doe_mb, struct pci_doe_task *task);
+
+#endif
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 108f8523fa04..57b8e2ffb1dd 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -737,7 +737,8 @@
 #define PCI_EXT_CAP_ID_DVSEC	0x23	/* Designated Vendor-Specific */
 #define PCI_EXT_CAP_ID_DLF	0x25	/* Data Link Feature */
 #define PCI_EXT_CAP_ID_PL_16GT	0x26	/* Physical Layer 16.0 GT/s */
-#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_PL_16GT
+#define PCI_EXT_CAP_ID_DOE	0x2E	/* Data Object Exchange */
+#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_DOE
 
 #define PCI_EXT_CAP_DSN_SIZEOF	12
 #define PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF 40
@@ -1103,4 +1104,30 @@
 #define  PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_MASK		0x000000F0
 #define  PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_SHIFT	4
 
+/* Data Object Exchange */
+#define PCI_DOE_CAP		0x04    /* DOE Capabilities Register */
+#define  PCI_DOE_CAP_INT_SUP			0x00000001  /* Interrupt Support */
+#define  PCI_DOE_CAP_INT_MSG_NUM		0x00000ffe  /* Interrupt Message Number */
+#define PCI_DOE_CTRL		0x08    /* DOE Control Register */
+#define  PCI_DOE_CTRL_ABORT			0x00000001  /* DOE Abort */
+#define  PCI_DOE_CTRL_INT_EN			0x00000002  /* DOE Interrupt Enable */
+#define  PCI_DOE_CTRL_GO			0x80000000  /* DOE Go */
+#define PCI_DOE_STATUS		0x0c    /* DOE Status Register */
+#define  PCI_DOE_STATUS_BUSY			0x00000001  /* DOE Busy */
+#define  PCI_DOE_STATUS_INT_STATUS		0x00000002  /* DOE Interrupt Status */
+#define  PCI_DOE_STATUS_ERROR			0x00000004  /* DOE Error */
+#define  PCI_DOE_STATUS_DATA_OBJECT_READY	0x80000000  /* Data Object Ready */
+#define PCI_DOE_WRITE		0x10    /* DOE Write Data Mailbox Register */
+#define PCI_DOE_READ		0x14    /* DOE Read Data Mailbox Register */
+
+/* DOE Data Object - note not actually registers */
+#define PCI_DOE_DATA_OBJECT_HEADER_1_VID		0x0000ffff
+#define PCI_DOE_DATA_OBJECT_HEADER_1_TYPE		0x00ff0000
+#define PCI_DOE_DATA_OBJECT_HEADER_2_LENGTH		0x0003ffff
+
+#define PCI_DOE_DATA_OBJECT_DISC_REQ_3_INDEX		0x000000ff
+#define PCI_DOE_DATA_OBJECT_DISC_RSP_3_VID		0x0000ffff
+#define PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL		0x00ff0000
+#define PCI_DOE_DATA_OBJECT_DISC_RSP_3_NEXT_INDEX	0xff000000
+
 #endif /* LINUX_PCI_REGS_H */
-- 
cgit 


From 9d6794feeb90903b10c34bddd9c74c992447ce83 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Tue, 19 Jul 2022 13:52:48 -0700
Subject: driver-core: Introduce BIN_ATTR_ADMIN_{RO,RW}
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Many binary attributes need to limit access to CAP_SYS_ADMIN only; ie
many binary attributes specify is_visible with 0400 or 0600.

Make setting the permissions of such attributes more explicit by
defining BIN_ATTR_ADMIN_{RO,RW}.

Cc: Bjorn Helgaas <bhelgaas@google.com>
Suggested-by: Dan Williams <dan.j.williams@intel.com>
Suggested-by: Krzysztof Wilczyński <kw@linux.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20220719205249.566684-6-ira.weiny@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/sysfs.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index e3f1e8ac1f85..fd3fe5c8c17f 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -235,6 +235,22 @@ struct bin_attribute bin_attr_##_name = __BIN_ATTR_WO(_name, _size)
 #define BIN_ATTR_RW(_name, _size)					\
 struct bin_attribute bin_attr_##_name = __BIN_ATTR_RW(_name, _size)
 
+
+#define __BIN_ATTR_ADMIN_RO(_name, _size) {					\
+	.attr	= { .name = __stringify(_name), .mode = 0400 },		\
+	.read	= _name##_read,						\
+	.size	= _size,						\
+}
+
+#define __BIN_ATTR_ADMIN_RW(_name, _size)					\
+	__BIN_ATTR(_name, 0600, _name##_read, _name##_write, _size)
+
+#define BIN_ATTR_ADMIN_RO(_name, _size)					\
+struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RO(_name, _size)
+
+#define BIN_ATTR_ADMIN_RW(_name, _size)					\
+struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RW(_name, _size)
+
 struct sysfs_ops {
 	ssize_t	(*show)(struct kobject *, struct attribute *, char *);
 	ssize_t	(*store)(struct kobject *, struct attribute *, const char *, size_t);
-- 
cgit 


From ca0cab11928870701535fbc3e49a4b196f722743 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Mon, 18 Jul 2022 10:55:12 +0200
Subject: net/sched: remove qdisc_root_lock() helper

the last caller has been removed with commit 96f5e66e8a79 ("mac80211: fix
aggregation for hardware with ampdu queues"), so it's safe to remove this
function.

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Link: https://lore.kernel.org/r/703d549e3088367651d92a059743f1be848d74b7.1658133689.git.dcaratti@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sch_generic.h | 19 -------------------
 1 file changed, 19 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index d6cf5116b5f9..ec693fe7c553 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -551,25 +551,6 @@ static inline struct Qdisc *qdisc_root_sleeping(const struct Qdisc *qdisc)
 	return qdisc->dev_queue->qdisc_sleeping;
 }
 
-/* The qdisc root lock is a mechanism by which to top level
- * of a qdisc tree can be locked from any qdisc node in the
- * forest.  This allows changing the configuration of some
- * aspect of the qdisc tree while blocking out asynchronous
- * qdisc access in the packet processing paths.
- *
- * It is only legal to do this when the root will not change
- * on us.  Otherwise we'll potentially lock the wrong qdisc
- * root.  This is enforced by holding the RTNL semaphore, which
- * all users of this lock accessor must do.
- */
-static inline spinlock_t *qdisc_root_lock(const struct Qdisc *qdisc)
-{
-	struct Qdisc *root = qdisc_root(qdisc);
-
-	ASSERT_RTNL();
-	return qdisc_lock(root);
-}
-
 static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
 {
 	struct Qdisc *root = qdisc_root_sleeping(qdisc);
-- 
cgit 


From e70a3263a7eed768d5f947b8f2aff8d2a79c9d97 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Tue, 19 Jul 2022 23:35:48 +0900
Subject: can: error: specify the values of data[5..7] of CAN error frames

Currently, data[5..7] of struct can_frame, when used as a CAN error
frame, are defined as being "controller specific". Device specific
behaviours are problematic because it prevents someone from writing
code which is portable between devices.

As a matter of fact, data[5] is never used, data[6] is always used to
report TX error counter and data[7] is always used to report RX error
counter. can-utils also relies on this.

This patch updates the comment in the uapi header to specify that
data[5] is reserved (and thus should not be used) and that data[6..7]
are used for error counters.

Fixes: 0d66548a10cb ("[CAN]: Add PF_CAN core module")
Link: https://lore.kernel.org/all/20220719143550.3681-11-mailhol.vincent@wanadoo.fr
Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/uapi/linux/can/error.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/can/error.h b/include/uapi/linux/can/error.h
index 34633283de64..a1000cb63063 100644
--- a/include/uapi/linux/can/error.h
+++ b/include/uapi/linux/can/error.h
@@ -120,6 +120,9 @@
 #define CAN_ERR_TRX_CANL_SHORT_TO_GND  0x70 /* 0111 0000 */
 #define CAN_ERR_TRX_CANL_SHORT_TO_CANH 0x80 /* 1000 0000 */
 
-/* controller specific additional information / data[5..7] */
+/* data[5] is reserved (do not use) */
+
+/* TX error counter / data[6] */
+/* RX error counter / data[7] */
 
 #endif /* _UAPI_CAN_ERROR_H */
-- 
cgit 


From 3e5c291c7942d0909a48bc5ec1b9bba136465166 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Tue, 19 Jul 2022 23:35:49 +0900
Subject: can: add CAN_ERR_CNT flag to notify availability of error counter

Add a dedicated flag in uapi/linux/can/error.h to notify the userland
that fields data[6] and data[7] of the CAN error frame were
respectively populated with the tx and rx error counters.

For all driver tree-wide, set up this flags whenever needed.

Link: https://lore.kernel.org/all/20220719143550.3681-12-mailhol.vincent@wanadoo.fr
Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/c_can/c_can_main.c                | 6 +++---
 drivers/net/can/cc770/cc770.c                     | 1 +
 drivers/net/can/ctucanfd/ctucanfd_base.c          | 5 +++--
 drivers/net/can/grcan.c                           | 1 +
 drivers/net/can/ifi_canfd/ifi_canfd.c             | 4 ++--
 drivers/net/can/janz-ican3.c                      | 4 ++--
 drivers/net/can/kvaser_pciefd.c                   | 2 +-
 drivers/net/can/m_can/m_can.c                     | 4 ++--
 drivers/net/can/pch_can.c                         | 1 +
 drivers/net/can/peak_canfd/peak_canfd.c           | 6 +++---
 drivers/net/can/rcar/rcar_can.c                   | 1 +
 drivers/net/can/rcar/rcar_canfd.c                 | 4 ++--
 drivers/net/can/sja1000/sja1000.c                 | 1 +
 drivers/net/can/slcan/slcan-core.c                | 1 +
 drivers/net/can/spi/hi311x.c                      | 1 +
 drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c    | 1 +
 drivers/net/can/sun4i_can.c                       | 1 +
 drivers/net/can/ti_hecc.c                         | 1 +
 drivers/net/can/usb/esd_usb.c                     | 3 ++-
 drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c | 2 ++
 drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c  | 1 +
 drivers/net/can/usb/peak_usb/pcan_usb.c           | 1 +
 drivers/net/can/usb/usb_8dev.c                    | 1 +
 drivers/net/can/xilinx_can.c                      | 1 +
 include/uapi/linux/can/error.h                    | 2 ++
 25 files changed, 38 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/net/can/c_can/c_can_main.c b/drivers/net/can/c_can/c_can_main.c
index ed4db4cf8716..de38d8f7b5f7 100644
--- a/drivers/net/can/c_can/c_can_main.c
+++ b/drivers/net/can/c_can/c_can_main.c
@@ -952,14 +952,14 @@ static int c_can_handle_state_change(struct net_device *dev,
 
 	switch (error_type) {
 	case C_CAN_NO_ERROR:
-		cf->can_id |= CAN_ERR_CRTL;
+		cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT;
 		cf->data[1] = CAN_ERR_CRTL_ACTIVE;
 		cf->data[6] = bec.txerr;
 		cf->data[7] = bec.rxerr;
 		break;
 	case C_CAN_ERROR_WARNING:
 		/* error warning state */
-		cf->can_id |= CAN_ERR_CRTL;
+		cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT;
 		cf->data[1] = (bec.txerr > bec.rxerr) ?
 			CAN_ERR_CRTL_TX_WARNING :
 			CAN_ERR_CRTL_RX_WARNING;
@@ -969,7 +969,7 @@ static int c_can_handle_state_change(struct net_device *dev,
 		break;
 	case C_CAN_ERROR_PASSIVE:
 		/* error passive state */
-		cf->can_id |= CAN_ERR_CRTL;
+		cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT;
 		if (rx_err_passive)
 			cf->data[1] |= CAN_ERR_CRTL_RX_PASSIVE;
 		if (bec.txerr > 127)
diff --git a/drivers/net/can/cc770/cc770.c b/drivers/net/can/cc770/cc770.c
index bb7224cfc6ab..797a954bb1a0 100644
--- a/drivers/net/can/cc770/cc770.c
+++ b/drivers/net/can/cc770/cc770.c
@@ -512,6 +512,7 @@ static int cc770_err(struct net_device *dev, u8 status)
 
 	/* Use extended functions of the CC770 */
 	if (priv->control_normal_mode & CTRL_EAF) {
+		cf->can_id |= CAN_ERR_CNT;
 		cf->data[6] = cc770_read_reg(priv, tx_error_counter);
 		cf->data[7] = cc770_read_reg(priv, rx_error_counter);
 	}
diff --git a/drivers/net/can/ctucanfd/ctucanfd_base.c b/drivers/net/can/ctucanfd/ctucanfd_base.c
index 14ac7c0ee04c..6b281f6eb9b4 100644
--- a/drivers/net/can/ctucanfd/ctucanfd_base.c
+++ b/drivers/net/can/ctucanfd/ctucanfd_base.c
@@ -847,7 +847,7 @@ static void ctucan_err_interrupt(struct net_device *ndev, u32 isr)
 		case CAN_STATE_ERROR_PASSIVE:
 			priv->can.can_stats.error_passive++;
 			if (skb) {
-				cf->can_id |= CAN_ERR_CRTL;
+				cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT;
 				cf->data[1] = (bec.rxerr > 127) ?
 						CAN_ERR_CRTL_RX_PASSIVE :
 						CAN_ERR_CRTL_TX_PASSIVE;
@@ -858,7 +858,7 @@ static void ctucan_err_interrupt(struct net_device *ndev, u32 isr)
 		case CAN_STATE_ERROR_WARNING:
 			priv->can.can_stats.error_warning++;
 			if (skb) {
-				cf->can_id |= CAN_ERR_CRTL;
+				cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT;
 				cf->data[1] |= (bec.txerr > bec.rxerr) ?
 					CAN_ERR_CRTL_TX_WARNING :
 					CAN_ERR_CRTL_RX_WARNING;
@@ -867,6 +867,7 @@ static void ctucan_err_interrupt(struct net_device *ndev, u32 isr)
 			}
 			break;
 		case CAN_STATE_ERROR_ACTIVE:
+			cf->can_id |= CAN_ERR_CNT;
 			cf->data[1] = CAN_ERR_CRTL_ACTIVE;
 			cf->data[6] = bec.txerr;
 			cf->data[7] = bec.rxerr;
diff --git a/drivers/net/can/grcan.c b/drivers/net/can/grcan.c
index 4c47c1055eff..24035a6187c9 100644
--- a/drivers/net/can/grcan.c
+++ b/drivers/net/can/grcan.c
@@ -671,6 +671,7 @@ static void grcan_err(struct net_device *dev, u32 sources, u32 status)
 				/* There are no others at this point */
 				break;
 			}
+			cf.can_id |= CAN_ERR_CNT;
 			cf.data[6] = txerr;
 			cf.data[7] = rxerr;
 			priv->can.state = state;
diff --git a/drivers/net/can/ifi_canfd/ifi_canfd.c b/drivers/net/can/ifi_canfd/ifi_canfd.c
index 968ed6d7316b..64e3be8b73af 100644
--- a/drivers/net/can/ifi_canfd/ifi_canfd.c
+++ b/drivers/net/can/ifi_canfd/ifi_canfd.c
@@ -492,7 +492,7 @@ static int ifi_canfd_handle_state_change(struct net_device *ndev,
 	switch (new_state) {
 	case CAN_STATE_ERROR_WARNING:
 		/* error warning state */
-		cf->can_id |= CAN_ERR_CRTL;
+		cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT;
 		cf->data[1] = (bec.txerr > bec.rxerr) ?
 			CAN_ERR_CRTL_TX_WARNING :
 			CAN_ERR_CRTL_RX_WARNING;
@@ -501,7 +501,7 @@ static int ifi_canfd_handle_state_change(struct net_device *ndev,
 		break;
 	case CAN_STATE_ERROR_PASSIVE:
 		/* error passive state */
-		cf->can_id |= CAN_ERR_CRTL;
+		cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT;
 		cf->data[1] |= CAN_ERR_CRTL_RX_PASSIVE;
 		if (bec.txerr > 127)
 			cf->data[1] |= CAN_ERR_CRTL_TX_PASSIVE;
diff --git a/drivers/net/can/janz-ican3.c b/drivers/net/can/janz-ican3.c
index 35bfb82d6929..ccb5c5405224 100644
--- a/drivers/net/can/janz-ican3.c
+++ b/drivers/net/can/janz-ican3.c
@@ -1127,7 +1127,7 @@ static int ican3_handle_cevtind(struct ican3_dev *mod, struct ican3_msg *msg)
 	/* bus error interrupt */
 	if (isrc == CEVTIND_BEI) {
 		mod->can.can_stats.bus_error++;
-		cf->can_id |= CAN_ERR_PROT | CAN_ERR_BUSERROR;
+		cf->can_id |= CAN_ERR_PROT | CAN_ERR_BUSERROR | CAN_ERR_CNT;
 
 		switch (ecc & ECC_MASK) {
 		case ECC_BIT:
@@ -1153,7 +1153,7 @@ static int ican3_handle_cevtind(struct ican3_dev *mod, struct ican3_msg *msg)
 
 	if (state != mod->can.state && (state == CAN_STATE_ERROR_WARNING ||
 					state == CAN_STATE_ERROR_PASSIVE)) {
-		cf->can_id |= CAN_ERR_CRTL;
+		cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT;
 		if (state == CAN_STATE_ERROR_WARNING) {
 			mod->can.can_stats.error_warning++;
 			cf->data[1] = (txerr > rxerr) ?
diff --git a/drivers/net/can/kvaser_pciefd.c b/drivers/net/can/kvaser_pciefd.c
index 017f2d36ffc3..dcd2c9d50d5e 100644
--- a/drivers/net/can/kvaser_pciefd.c
+++ b/drivers/net/can/kvaser_pciefd.c
@@ -1306,7 +1306,7 @@ static int kvaser_pciefd_rx_error_frame(struct kvaser_pciefd_can *can,
 	shhwtstamps->hwtstamp =
 		ns_to_ktime(div_u64(p->timestamp * 1000,
 				    can->kv_pcie->freq_to_ticks_div));
-	cf->can_id |= CAN_ERR_BUSERROR;
+	cf->can_id |= CAN_ERR_BUSERROR | CAN_ERR_CNT;
 
 	cf->data[6] = bec.txerr;
 	cf->data[7] = bec.rxerr;
diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
index afaaeb610c00..713a4b0edf86 100644
--- a/drivers/net/can/m_can/m_can.c
+++ b/drivers/net/can/m_can/m_can.c
@@ -741,7 +741,7 @@ static int m_can_handle_state_change(struct net_device *dev,
 	switch (new_state) {
 	case CAN_STATE_ERROR_WARNING:
 		/* error warning state */
-		cf->can_id |= CAN_ERR_CRTL;
+		cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT;
 		cf->data[1] = (bec.txerr > bec.rxerr) ?
 			CAN_ERR_CRTL_TX_WARNING :
 			CAN_ERR_CRTL_RX_WARNING;
@@ -750,7 +750,7 @@ static int m_can_handle_state_change(struct net_device *dev,
 		break;
 	case CAN_STATE_ERROR_PASSIVE:
 		/* error passive state */
-		cf->can_id |= CAN_ERR_CRTL;
+		cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT;
 		ecr = m_can_read(cdev, M_CAN_ECR);
 		if (ecr & ECR_RP)
 			cf->data[1] |= CAN_ERR_CRTL_RX_PASSIVE;
diff --git a/drivers/net/can/pch_can.c b/drivers/net/can/pch_can.c
index 497ef77340ea..50f6719b3aa4 100644
--- a/drivers/net/can/pch_can.c
+++ b/drivers/net/can/pch_can.c
@@ -497,6 +497,7 @@ static void pch_can_error(struct net_device *ndev, u32 status)
 		priv->can.can_stats.bus_off++;
 		can_bus_off(ndev);
 	} else {
+		cf->can_id |= CAN_ERR_CNT;
 		cf->data[6] = errc & PCH_TEC;
 		cf->data[7] = (errc & PCH_REC) >> 8;
 	}
diff --git a/drivers/net/can/peak_canfd/peak_canfd.c b/drivers/net/can/peak_canfd/peak_canfd.c
index b2dea360813d..afb9adb3d5c2 100644
--- a/drivers/net/can/peak_canfd/peak_canfd.c
+++ b/drivers/net/can/peak_canfd/peak_canfd.c
@@ -373,7 +373,7 @@ static int pucan_handle_status(struct peak_canfd_priv *priv,
 		priv->can.state = CAN_STATE_ERROR_PASSIVE;
 		priv->can.can_stats.error_passive++;
 		if (skb) {
-			cf->can_id |= CAN_ERR_CRTL;
+			cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT;
 			cf->data[1] = (priv->bec.txerr > priv->bec.rxerr) ?
 					CAN_ERR_CRTL_TX_PASSIVE :
 					CAN_ERR_CRTL_RX_PASSIVE;
@@ -386,7 +386,7 @@ static int pucan_handle_status(struct peak_canfd_priv *priv,
 		priv->can.state = CAN_STATE_ERROR_WARNING;
 		priv->can.can_stats.error_warning++;
 		if (skb) {
-			cf->can_id |= CAN_ERR_CRTL;
+			cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT;
 			cf->data[1] = (priv->bec.txerr > priv->bec.rxerr) ?
 					CAN_ERR_CRTL_TX_WARNING :
 					CAN_ERR_CRTL_RX_WARNING;
@@ -430,7 +430,7 @@ static int pucan_handle_cache_critical(struct peak_canfd_priv *priv)
 		return -ENOMEM;
 	}
 
-	cf->can_id |= CAN_ERR_CRTL;
+	cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT;
 	cf->data[1] = CAN_ERR_CRTL_RX_OVERFLOW;
 
 	cf->data[6] = priv->bec.txerr;
diff --git a/drivers/net/can/rcar/rcar_can.c b/drivers/net/can/rcar/rcar_can.c
index 24d7a71def6a..d11db2112a4a 100644
--- a/drivers/net/can/rcar/rcar_can.c
+++ b/drivers/net/can/rcar/rcar_can.c
@@ -334,6 +334,7 @@ static void rcar_can_error(struct net_device *ndev)
 		if (skb)
 			cf->can_id |= CAN_ERR_BUSOFF;
 	} else if (skb) {
+		cf->can_id |= CAN_ERR_CNT;
 		cf->data[6] = txerr;
 		cf->data[7] = rxerr;
 	}
diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c
index ba42cef10a53..e3382284e172 100644
--- a/drivers/net/can/rcar/rcar_canfd.c
+++ b/drivers/net/can/rcar/rcar_canfd.c
@@ -1052,7 +1052,7 @@ static void rcar_canfd_error(struct net_device *ndev, u32 cerfl,
 		netdev_dbg(ndev, "Error warning interrupt\n");
 		priv->can.state = CAN_STATE_ERROR_WARNING;
 		priv->can.can_stats.error_warning++;
-		cf->can_id |= CAN_ERR_CRTL;
+		cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT;
 		cf->data[1] = txerr > rxerr ? CAN_ERR_CRTL_TX_WARNING :
 			CAN_ERR_CRTL_RX_WARNING;
 		cf->data[6] = txerr;
@@ -1062,7 +1062,7 @@ static void rcar_canfd_error(struct net_device *ndev, u32 cerfl,
 		netdev_dbg(ndev, "Error passive interrupt\n");
 		priv->can.state = CAN_STATE_ERROR_PASSIVE;
 		priv->can.can_stats.error_passive++;
-		cf->can_id |= CAN_ERR_CRTL;
+		cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT;
 		cf->data[1] = txerr > rxerr ? CAN_ERR_CRTL_TX_PASSIVE :
 			CAN_ERR_CRTL_RX_PASSIVE;
 		cf->data[6] = txerr;
diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c
index 74bff5092b47..75a2f9bf8c16 100644
--- a/drivers/net/can/sja1000/sja1000.c
+++ b/drivers/net/can/sja1000/sja1000.c
@@ -426,6 +426,7 @@ static int sja1000_err(struct net_device *dev, uint8_t isrc, uint8_t status)
 			state = CAN_STATE_ERROR_ACTIVE;
 	}
 	if (state != CAN_STATE_BUS_OFF) {
+		cf->can_id |= CAN_ERR_CNT;
 		cf->data[6] = txerr;
 		cf->data[7] = rxerr;
 	}
diff --git a/drivers/net/can/slcan/slcan-core.c b/drivers/net/can/slcan/slcan-core.c
index dfd1baba4130..dc28e715bbe1 100644
--- a/drivers/net/can/slcan/slcan-core.c
+++ b/drivers/net/can/slcan/slcan-core.c
@@ -314,6 +314,7 @@ static void slc_bump_state(struct slcan *sl)
 	if (state == CAN_STATE_BUS_OFF) {
 		can_bus_off(dev);
 	} else if (skb) {
+		cf->can_id |= CAN_ERR_CNT;
 		cf->data[6] = txerr;
 		cf->data[7] = rxerr;
 	}
diff --git a/drivers/net/can/spi/hi311x.c b/drivers/net/can/spi/hi311x.c
index bfb7c4bb5bc3..167114aae6dd 100644
--- a/drivers/net/can/spi/hi311x.c
+++ b/drivers/net/can/spi/hi311x.c
@@ -680,6 +680,7 @@ static irqreturn_t hi3110_can_ist(int irq, void *dev_id)
 					break;
 				}
 			} else {
+				cf->can_id |= CAN_ERR_CNT;
 				cf->data[6] = txerr;
 				cf->data[7] = rxerr;
 			}
diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
index 9b47b07162fe..f4e174cadd4e 100644
--- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
+++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
@@ -1099,6 +1099,7 @@ static int mcp251xfd_handle_cerrif(struct mcp251xfd_priv *priv)
 		err = mcp251xfd_get_berr_counter(priv->ndev, &bec);
 		if (err)
 			return err;
+		cf->can_id |= CAN_ERR_CNT;
 		cf->data[6] = bec.txerr;
 		cf->data[7] = bec.rxerr;
 	}
diff --git a/drivers/net/can/sun4i_can.c b/drivers/net/can/sun4i_can.c
index afe9b541f037..b90dfb429ccd 100644
--- a/drivers/net/can/sun4i_can.c
+++ b/drivers/net/can/sun4i_can.c
@@ -566,6 +566,7 @@ static int sun4i_can_err(struct net_device *dev, u8 isrc, u8 status)
 			state = CAN_STATE_ERROR_ACTIVE;
 	}
 	if (skb && state != CAN_STATE_BUS_OFF) {
+		cf->can_id |= CAN_ERR_CNT;
 		cf->data[6] = txerr;
 		cf->data[7] = rxerr;
 	}
diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c
index debe17bfd0f0..afa38771520e 100644
--- a/drivers/net/can/ti_hecc.c
+++ b/drivers/net/can/ti_hecc.c
@@ -662,6 +662,7 @@ static void ti_hecc_change_state(struct net_device *ndev,
 	can_change_state(priv->ndev, cf, tx_state, rx_state);
 
 	if (max(tx_state, rx_state) != CAN_STATE_BUS_OFF) {
+		cf->can_id |= CAN_ERR_CNT;
 		cf->data[6] = hecc_read(priv, HECC_CANTEC);
 		cf->data[7] = hecc_read(priv, HECC_CANREC);
 	}
diff --git a/drivers/net/can/usb/esd_usb.c b/drivers/net/can/usb/esd_usb.c
index 8a4bf2961f3d..177ed33e08d9 100644
--- a/drivers/net/can/usb/esd_usb.c
+++ b/drivers/net/can/usb/esd_usb.c
@@ -265,7 +265,8 @@ static void esd_usb_rx_event(struct esd_usb_net_priv *priv,
 			priv->can.can_stats.bus_error++;
 			stats->rx_errors++;
 
-			cf->can_id |= CAN_ERR_PROT | CAN_ERR_BUSERROR;
+			cf->can_id |= CAN_ERR_PROT | CAN_ERR_BUSERROR |
+				      CAN_ERR_CNT;
 
 			switch (ecc & SJA1000_ECC_MASK) {
 			case SJA1000_ECC_BIT:
diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c
index 404093468b2f..dd65c101bfb8 100644
--- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c
+++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c
@@ -918,6 +918,7 @@ static void kvaser_usb_hydra_update_state(struct kvaser_usb_net_priv *priv,
 		priv->can.can_stats.restarts++;
 
 	if (new_state != CAN_STATE_BUS_OFF) {
+		cf->can_id |= CAN_ERR_CNT;
 		cf->data[6] = bec->txerr;
 		cf->data[7] = bec->rxerr;
 	}
@@ -1072,6 +1073,7 @@ kvaser_usb_hydra_error_frame(struct kvaser_usb_net_priv *priv,
 
 	cf->can_id |= CAN_ERR_BUSERROR;
 	if (new_state != CAN_STATE_BUS_OFF) {
+		cf->can_id |= CAN_ERR_CNT;
 		cf->data[6] = bec.txerr;
 		cf->data[7] = bec.rxerr;
 	}
diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c
index f551fde16a70..07f687f29b34 100644
--- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c
+++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c
@@ -854,6 +854,7 @@ static void kvaser_usb_leaf_rx_error(const struct kvaser_usb *dev,
 	}
 
 	if (new_state != CAN_STATE_BUS_OFF) {
+		cf->can_id |= CAN_ERR_CNT;
 		cf->data[6] = es->txerr;
 		cf->data[7] = es->rxerr;
 	}
diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c
index 091c631ebe23..d07b7ee79e3e 100644
--- a/drivers/net/can/usb/peak_usb/pcan_usb.c
+++ b/drivers/net/can/usb/peak_usb/pcan_usb.c
@@ -506,6 +506,7 @@ static int pcan_usb_decode_error(struct pcan_usb_msg_context *mc, u8 n,
 			/* Supply TX/RX error counters in case of
 			 * controller error.
 			 */
+			cf->can_id = CAN_ERR_CNT;
 			cf->data[6] = mc->pdev->bec.txerr;
 			cf->data[7] = mc->pdev->bec.rxerr;
 		}
diff --git a/drivers/net/can/usb/usb_8dev.c b/drivers/net/can/usb/usb_8dev.c
index 4d38dc90472a..8b7cd69e20b0 100644
--- a/drivers/net/can/usb/usb_8dev.c
+++ b/drivers/net/can/usb/usb_8dev.c
@@ -439,6 +439,7 @@ static void usb_8dev_rx_err_msg(struct usb_8dev_priv *priv,
 	if (rx_errors)
 		stats->rx_errors++;
 	if (priv->can.state != CAN_STATE_BUS_OFF) {
+		cf->can_id |= CAN_ERR_CNT;
 		cf->data[6] = txerr;
 		cf->data[7] = rxerr;
 	}
diff --git a/drivers/net/can/xilinx_can.c b/drivers/net/can/xilinx_can.c
index 0de2f97d9f62..caa6b4cee63f 100644
--- a/drivers/net/can/xilinx_can.c
+++ b/drivers/net/can/xilinx_can.c
@@ -965,6 +965,7 @@ static void xcan_set_error_state(struct net_device *ndev,
 	can_change_state(ndev, cf, tx_state, rx_state);
 
 	if (cf) {
+		cf->can_id |= CAN_ERR_CNT;
 		cf->data[6] = txerr;
 		cf->data[7] = rxerr;
 	}
diff --git a/include/uapi/linux/can/error.h b/include/uapi/linux/can/error.h
index a1000cb63063..b7c3efd9ff99 100644
--- a/include/uapi/linux/can/error.h
+++ b/include/uapi/linux/can/error.h
@@ -57,6 +57,8 @@
 #define CAN_ERR_BUSOFF       0x00000040U /* bus off */
 #define CAN_ERR_BUSERROR     0x00000080U /* bus error (may flood!) */
 #define CAN_ERR_RESTARTED    0x00000100U /* controller restarted */
+#define CAN_ERR_CNT          0x00000200U /* TX error counter / data[6] */
+					 /* RX error counter / data[7] */
 
 /* arbitration lost in bit ... / data[0] */
 #define CAN_ERR_LOSTARB_UNSPEC   0x00 /* unspecified */
-- 
cgit 


From 3f9c26210cf80ea8cb5dd901aba5feb77200b085 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Tue, 19 Jul 2022 23:35:50 +0900
Subject: can: error: add definitions for the different CAN error thresholds

Currently, drivers are using magic numbers to derive the CAN error
states from the error counter. Add three macro declarations to
remediate this.

For reference, the error-active, error-passive and bus-off are defined
in ISO 11898, section 12.1.4.2 "Error counting". Although ISO 11898
does not define error-warning state, this extra value is also commonly
used and is thus also added.

Link: https://lore.kernel.org/all/20220719143550.3681-13-mailhol.vincent@wanadoo.fr
Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/uapi/linux/can/error.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/can/error.h b/include/uapi/linux/can/error.h
index b7c3efd9ff99..acc1ac393d2a 100644
--- a/include/uapi/linux/can/error.h
+++ b/include/uapi/linux/can/error.h
@@ -127,4 +127,17 @@
 /* TX error counter / data[6] */
 /* RX error counter / data[7] */
 
+/* CAN state thresholds
+ *
+ * Error counter	Error state
+ * -----------------------------------
+ * 0 -  95		Error-active
+ * 96 - 127		Error-warning
+ * 128 - 255		Error-passive
+ * 256 and greater	Bus-off
+ */
+#define CAN_ERROR_WARNING_THRESHOLD 96
+#define CAN_ERROR_PASSIVE_THRESHOLD 128
+#define CAN_BUS_OFF_THRESHOLD 256
+
 #endif /* _UAPI_CAN_ERROR_H */
-- 
cgit 


From f5ecfee94493475783074e86ded10a0499d779fc Mon Sep 17 00:00:00 2001
From: Pierre Morel <pmorel@linux.ibm.com>
Date: Thu, 14 Jul 2022 21:43:34 +0200
Subject: KVM: s390: resetting the Topology-Change-Report

During a subsystem reset the Topology-Change-Report is cleared.

Let's give userland the possibility to clear the MTCR in the case
of a subsystem reset.

To migrate the MTCR, we give userland the possibility to
query the MTCR state.

We indicate KVM support for the CPU topology facility with a new
KVM capability: KVM_CAP_S390_CPU_TOPOLOGY.

Signed-off-by: Pierre Morel <pmorel@linux.ibm.com>
Reviewed-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
Message-Id: <20220714194334.127812-1-pmorel@linux.ibm.com>
Link: https://lore.kernel.org/all/20220714194334.127812-1-pmorel@linux.ibm.com/
[frankja@linux.ibm.com: Simple conflict resolution in Documentation/virt/kvm/api.rst]
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
---
 Documentation/virt/kvm/api.rst   | 25 ++++++++++++++++++++
 arch/s390/include/uapi/asm/kvm.h |  1 +
 arch/s390/kvm/kvm-s390.c         | 51 ++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/kvm.h         |  1 +
 4 files changed, 78 insertions(+)

(limited to 'include')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 5be5cc59869d..3c4551a2f6d0 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -8269,6 +8269,31 @@ The capability has no effect if the nx_huge_pages module parameter is not set.
 
 This capability may only be set before any vCPUs are created.
 
+8.39 KVM_CAP_S390_CPU_TOPOLOGY
+------------------------------
+
+:Capability: KVM_CAP_S390_CPU_TOPOLOGY
+:Architectures: s390
+:Type: vm
+
+This capability indicates that KVM will provide the S390 CPU Topology
+facility which consist of the interpretation of the PTF instruction for
+the function code 2 along with interception and forwarding of both the
+PTF instruction with function codes 0 or 1 and the STSI(15,1,x)
+instruction to the userland hypervisor.
+
+The stfle facility 11, CPU Topology facility, should not be indicated
+to the guest without this capability.
+
+When this capability is present, KVM provides a new attribute group
+on vm fd, KVM_S390_VM_CPU_TOPOLOGY.
+This new attribute allows to get, set or clear the Modified Change
+Topology Report (MTCR) bit of the SCA through the kvm_device_attr
+structure.
+
+When getting the Modified Change Topology Report value, the attr->addr
+must point to a byte where the value will be stored or retrieved from.
+
 9. Known KVM API problems
 =========================
 
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 7a6b14874d65..a73cf01a1606 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -74,6 +74,7 @@ struct kvm_s390_io_adapter_req {
 #define KVM_S390_VM_CRYPTO		2
 #define KVM_S390_VM_CPU_MODEL		3
 #define KVM_S390_VM_MIGRATION		4
+#define KVM_S390_VM_CPU_TOPOLOGY	5
 
 /* kvm attributes for mem_ctrl */
 #define KVM_S390_VM_MEM_ENABLE_CMMA	0
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 5d18b66a08c9..edfd4bbd0cba 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -642,6 +642,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_S390_ZPCI_OP:
 		r = kvm_s390_pci_interp_allowed();
 		break;
+	case KVM_CAP_S390_CPU_TOPOLOGY:
+		r = test_facility(11);
+		break;
 	default:
 		r = 0;
 	}
@@ -853,6 +856,20 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 		icpt_operexc_on_all_vcpus(kvm);
 		r = 0;
 		break;
+	case KVM_CAP_S390_CPU_TOPOLOGY:
+		r = -EINVAL;
+		mutex_lock(&kvm->lock);
+		if (kvm->created_vcpus) {
+			r = -EBUSY;
+		} else if (test_facility(11)) {
+			set_kvm_facility(kvm->arch.model.fac_mask, 11);
+			set_kvm_facility(kvm->arch.model.fac_list, 11);
+			r = 0;
+		}
+		mutex_unlock(&kvm->lock);
+		VM_EVENT(kvm, 3, "ENABLE: CAP_S390_CPU_TOPOLOGY %s",
+			 r ? "(not available)" : "(success)");
+		break;
 	default:
 		r = -EINVAL;
 		break;
@@ -1789,6 +1806,31 @@ static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val)
 	read_unlock(&kvm->arch.sca_lock);
 }
 
+static int kvm_s390_set_topo_change_indication(struct kvm *kvm,
+					       struct kvm_device_attr *attr)
+{
+	if (!test_kvm_facility(kvm, 11))
+		return -ENXIO;
+
+	kvm_s390_update_topology_change_report(kvm, !!attr->attr);
+	return 0;
+}
+
+static int kvm_s390_get_topo_change_indication(struct kvm *kvm,
+					       struct kvm_device_attr *attr)
+{
+	u8 topo;
+
+	if (!test_kvm_facility(kvm, 11))
+		return -ENXIO;
+
+	read_lock(&kvm->arch.sca_lock);
+	topo = ((struct bsca_block *)kvm->arch.sca)->utility.mtcr;
+	read_unlock(&kvm->arch.sca_lock);
+
+	return put_user(topo, (u8 __user *)attr->addr);
+}
+
 static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
 {
 	int ret;
@@ -1809,6 +1851,9 @@ static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
 	case KVM_S390_VM_MIGRATION:
 		ret = kvm_s390_vm_set_migration(kvm, attr);
 		break;
+	case KVM_S390_VM_CPU_TOPOLOGY:
+		ret = kvm_s390_set_topo_change_indication(kvm, attr);
+		break;
 	default:
 		ret = -ENXIO;
 		break;
@@ -1834,6 +1879,9 @@ static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr)
 	case KVM_S390_VM_MIGRATION:
 		ret = kvm_s390_vm_get_migration(kvm, attr);
 		break;
+	case KVM_S390_VM_CPU_TOPOLOGY:
+		ret = kvm_s390_get_topo_change_indication(kvm, attr);
+		break;
 	default:
 		ret = -ENXIO;
 		break;
@@ -1907,6 +1955,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
 	case KVM_S390_VM_MIGRATION:
 		ret = 0;
 		break;
+	case KVM_S390_VM_CPU_TOPOLOGY:
+		ret = test_kvm_facility(kvm, 11) ? 0 : -ENXIO;
+		break;
 	default:
 		ret = -ENXIO;
 		break;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 20817dd7f2f1..7e06194129e3 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1168,6 +1168,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_X86_NOTIFY_VMEXIT 219
 #define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220
 #define KVM_CAP_S390_ZPCI_OP 221
+#define KVM_CAP_S390_CPU_TOPOLOGY 222
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit 


From 9b55c20f83369dd54541d9ddbe3a018a8377f451 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 18 Jul 2022 10:26:42 -0700
Subject: ip: Fix data-races around sysctl_ip_prot_sock.

sysctl_ip_prot_sock is accessed concurrently, and there is always a chance
of data-race.  So, all readers and writers need some basic protection to
avoid load/store-tearing.

Fixes: 4548b683b781 ("Introduce a sysctl that modifies the value of PROT_SOCK.")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h           | 2 +-
 net/ipv4/sysctl_net_ipv4.c | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index 4a15b6bcb4b8..1c979fd1904c 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -357,7 +357,7 @@ static inline bool sysctl_dev_name_is_allowed(const char *name)
 
 static inline bool inet_port_requires_bind_service(struct net *net, unsigned short port)
 {
-	return port < net->ipv4.sysctl_ip_prot_sock;
+	return port < READ_ONCE(net->ipv4.sysctl_ip_prot_sock);
 }
 
 #else
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 130e9c130311..5490c285668b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -84,7 +84,7 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
 		 * port limit.
 		 */
 		if ((range[1] < range[0]) ||
-		    (range[0] < net->ipv4.sysctl_ip_prot_sock))
+		    (range[0] < READ_ONCE(net->ipv4.sysctl_ip_prot_sock)))
 			ret = -EINVAL;
 		else
 			set_local_port_range(net, range);
@@ -110,7 +110,7 @@ static int ipv4_privileged_ports(struct ctl_table *table, int write,
 		.extra2 = &ip_privileged_port_max,
 	};
 
-	pports = net->ipv4.sysctl_ip_prot_sock;
+	pports = READ_ONCE(net->ipv4.sysctl_ip_prot_sock);
 
 	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 
@@ -122,7 +122,7 @@ static int ipv4_privileged_ports(struct ctl_table *table, int write,
 		if (range[0] < pports)
 			ret = -EINVAL;
 		else
-			net->ipv4.sysctl_ip_prot_sock = pports;
+			WRITE_ONCE(net->ipv4.sysctl_ip_prot_sock, pports);
 	}
 
 	return ret;
-- 
cgit 


From 3d72bb4188c708bb16758c60822fc4dda7a95174 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 18 Jul 2022 10:26:43 -0700
Subject: udp: Fix a data-race around sysctl_udp_l3mdev_accept.

While reading sysctl_udp_l3mdev_accept, it can be changed concurrently.
Thus, we need to add READ_ONCE() to its reader.

Fixes: 63a6fff353d0 ("net: Avoid receiving packets with an l3mdev on unbound UDP sockets")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/udp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/udp.h b/include/net/udp.h
index bb4c227299cc..8dd4aa1485a6 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -238,7 +238,7 @@ static inline bool udp_sk_bound_dev_eq(struct net *net, int bound_dev_if,
 				       int dif, int sdif)
 {
 #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
-	return inet_bound_dev_eq(!!net->ipv4.sysctl_udp_l3mdev_accept,
+	return inet_bound_dev_eq(!!READ_ONCE(net->ipv4.sysctl_udp_l3mdev_accept),
 				 bound_dev_if, dif, sdif);
 #else
 	return inet_bound_dev_eq(true, bound_dev_if, dif, sdif);
-- 
cgit 


From 4845b5713ab18a1bb6e31d1fbb4d600240b8b691 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 18 Jul 2022 10:26:48 -0700
Subject: tcp: Fix data-races around sysctl_tcp_slow_start_after_idle.

While reading sysctl_tcp_slow_start_after_idle, it can be changed
concurrently.  Thus, we need to add READ_ONCE() to its readers.

Fixes: 35089bb203f4 ("[TCP]: Add tcp_slow_start_after_idle sysctl.")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     | 4 ++--
 net/ipv4/tcp_output.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6ee1fb4fb292..071735e10872 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1403,8 +1403,8 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	s32 delta;
 
-	if (!sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle || tp->packets_out ||
-	    ca_ops->cong_control)
+	if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) ||
+	    tp->packets_out || ca_ops->cong_control)
 		return;
 	delta = tcp_jiffies32 - tp->lsndtime;
 	if (delta > inet_csk(sk)->icsk_rto)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 898fcdcb7989..51120407c570 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1898,7 +1898,7 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
 		if (tp->packets_out > tp->snd_cwnd_used)
 			tp->snd_cwnd_used = tp->packets_out;
 
-		if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle &&
+		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) &&
 		    (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
 		    !ca_ops->cong_control)
 			tcp_cwnd_application_limited(sk);
-- 
cgit 


From d0637c505f8a1d8c4088642f1f3e9e3b22da14f6 Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua@oppo.com>
Date: Wed, 20 Jul 2022 21:37:37 +1200
Subject: arm64: enable THP_SWAP for arm64

THP_SWAP has been proven to improve the swap throughput significantly
on x86_64 according to commit bd4c82c22c367e ("mm, THP, swap: delay
splitting THP after swapped out").
As long as arm64 uses 4K page size, it is quite similar with x86_64
by having 2MB PMD THP. THP_SWAP is architecture-independent, thus,
enabling it on arm64 will benefit arm64 as well.
A corner case is that MTE has an assumption that only base pages
can be swapped. We won't enable THP_SWAP for ARM64 hardware with
MTE support until MTE is reworked to coexist with THP_SWAP.

A micro-benchmark is written to measure thp swapout throughput as
below,

 unsigned long long tv_to_ms(struct timeval tv)
 {
 	return tv.tv_sec * 1000 + tv.tv_usec / 1000;
 }

 main()
 {
 	struct timeval tv_b, tv_e;;
 #define SIZE 400*1024*1024
 	volatile void *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
 				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 	if (!p) {
 		perror("fail to get memory");
 		exit(-1);
 	}

 	madvise(p, SIZE, MADV_HUGEPAGE);
 	memset(p, 0x11, SIZE); /* write to get mem */

 	gettimeofday(&tv_b, NULL);
 	madvise(p, SIZE, MADV_PAGEOUT);
 	gettimeofday(&tv_e, NULL);

 	printf("swp out bandwidth: %ld bytes/ms\n",
 			SIZE/(tv_to_ms(tv_e) - tv_to_ms(tv_b)));
 }

Testing is done on rk3568 64bit Quad Core Cortex-A55 platform -
ROCK 3A.
thp swp throughput w/o patch: 2734bytes/ms (mean of 10 tests)
thp swp throughput w/  patch: 3331bytes/ms (mean of 10 tests)

Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Steven Price <steven.price@arm.com>
Cc: Yang Shi <shy828301@gmail.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Link: https://lore.kernel.org/r/20220720093737.133375-1-21cnbao@gmail.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/Kconfig               |  1 +
 arch/arm64/include/asm/pgtable.h |  6 ++++++
 include/linux/huge_mm.h          | 12 ++++++++++++
 mm/swap_slots.c                  |  2 +-
 4 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1652a9800ebe..e1c540e80eec 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -101,6 +101,7 @@ config ARM64
 	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	select ARCH_WANT_LD_ORPHAN_WARN
 	select ARCH_WANTS_NO_INSTR
+	select ARCH_WANTS_THP_SWAP if ARM64_4K_PAGES
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
 	select ARM_AMBA
 	select ARM_ARCH_TIMER
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 0b6632f18364..78d6f6014bfb 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -45,6 +45,12 @@
 	__flush_tlb_range(vma, addr, end, PUD_SIZE, false, 1)
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+static inline bool arch_thp_swp_supported(void)
+{
+	return !system_supports_mte();
+}
+#define arch_thp_swp_supported arch_thp_swp_supported
+
 /*
  * Outside of a few very special situations (e.g. hibernation), we always
  * use broadcast TLB invalidation instructions, therefore a spurious page
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index de29821231c9..4ddaf6ad73ef 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -461,4 +461,16 @@ static inline int split_folio_to_list(struct folio *folio,
 	return split_huge_page_to_list(&folio->page, list);
 }
 
+/*
+ * archs that select ARCH_WANTS_THP_SWAP but don't support THP_SWP due to
+ * limitations in the implementation like arm64 MTE can override this to
+ * false
+ */
+#ifndef arch_thp_swp_supported
+static inline bool arch_thp_swp_supported(void)
+{
+	return true;
+}
+#endif
+
 #endif /* _LINUX_HUGE_MM_H */
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 2a65a89b5b4d..10b94d64cc25 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -307,7 +307,7 @@ swp_entry_t folio_alloc_swap(struct folio *folio)
 	entry.val = 0;
 
 	if (folio_test_large(folio)) {
-		if (IS_ENABLED(CONFIG_THP_SWAP))
+		if (IS_ENABLED(CONFIG_THP_SWAP) && arch_thp_swp_supported())
 			get_swap_pages(1, &entry, folio_nr_pages(folio));
 		goto out;
 	}
-- 
cgit 


From 9b31e60800d8fa69027baf9ec7f03a0c5b145079 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Fri, 15 Jul 2022 11:55:49 -0700
Subject: tools: Fixed MIPS builds due to struct flock re-definition

Building perf for MIPS failed after 9f79b8b72339 ("uapi: simplify
__ARCH_FLOCK{,64}_PAD a little") with the following error:

  CC
/home/fainelli/work/buildroot/output/bmips/build/linux-custom/tools/perf/trace/beauty/fcntl.o
In file included from
../../../../host/mipsel-buildroot-linux-gnu/sysroot/usr/include/asm/fcntl.h:77,
                 from ../include/uapi/linux/fcntl.h:5,
                 from trace/beauty/fcntl.c:10:
../include/uapi/asm-generic/fcntl.h:188:8: error: redefinition of
'struct flock'
 struct flock {
        ^~~~~
In file included from ../include/uapi/linux/fcntl.h:5,
                 from trace/beauty/fcntl.c:10:
../../../../host/mipsel-buildroot-linux-gnu/sysroot/usr/include/asm/fcntl.h:63:8:
note: originally defined here
 struct flock {
        ^~~~~

This is due to the local copy under
tools/include/uapi/asm-generic/fcntl.h including the toolchain's kernel
headers which already define 'struct flock' and define
HAVE_ARCH_STRUCT_FLOCK to future inclusions make a decision as to
whether re-defining 'struct flock' is appropriate or not.

Make sure what do not re-define 'struct flock'
when HAVE_ARCH_STRUCT_FLOCK is already defined.

Fixes: 9f79b8b72339 ("uapi: simplify __ARCH_FLOCK{,64}_PAD a little")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
[arnd: sync with include/uapi/asm-generic/fcntl.h as well]
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/uapi/asm-generic/fcntl.h       |  2 ++
 tools/include/uapi/asm-generic/fcntl.h | 11 ++++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
index f13d37b60775..1ecdb911add8 100644
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -192,6 +192,7 @@ struct f_owner_ex {
 
 #define F_LINUX_SPECIFIC_BASE	1024
 
+#ifndef HAVE_ARCH_STRUCT_FLOCK
 struct flock {
 	short	l_type;
 	short	l_whence;
@@ -216,5 +217,6 @@ struct flock64 {
 	__ARCH_FLOCK64_PAD
 #endif
 };
+#endif /* HAVE_ARCH_STRUCT_FLOCK */
 
 #endif /* _ASM_GENERIC_FCNTL_H */
diff --git a/tools/include/uapi/asm-generic/fcntl.h b/tools/include/uapi/asm-generic/fcntl.h
index 0197042b7dfb..1ecdb911add8 100644
--- a/tools/include/uapi/asm-generic/fcntl.h
+++ b/tools/include/uapi/asm-generic/fcntl.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 #ifndef _ASM_GENERIC_FCNTL_H
 #define _ASM_GENERIC_FCNTL_H
 
@@ -90,7 +91,7 @@
 
 /* a horrid kludge trying to make sure that this will fail on old kernels */
 #define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
-#define O_TMPFILE_MASK (__O_TMPFILE | O_DIRECTORY | O_CREAT)
+#define O_TMPFILE_MASK (__O_TMPFILE | O_DIRECTORY | O_CREAT)      
 
 #ifndef O_NDELAY
 #define O_NDELAY	O_NONBLOCK
@@ -115,11 +116,13 @@
 #define F_GETSIG	11	/* for sockets. */
 #endif
 
+#if __BITS_PER_LONG == 32 || defined(__KERNEL__)
 #ifndef F_GETLK64
 #define F_GETLK64	12	/*  using 'struct flock64' */
 #define F_SETLK64	13
 #define F_SETLKW64	14
 #endif
+#endif /* __BITS_PER_LONG == 32 || defined(__KERNEL__) */
 
 #ifndef F_SETOWN_EX
 #define F_SETOWN_EX	15
@@ -178,6 +181,10 @@ struct f_owner_ex {
 				   blocking */
 #define LOCK_UN		8	/* remove lock */
 
+/*
+ * LOCK_MAND support has been removed from the kernel. We leave the symbols
+ * here to not break legacy builds, but these should not be used in new code.
+ */
 #define LOCK_MAND	32	/* This is a mandatory flock ... */
 #define LOCK_READ	64	/* which allows concurrent read operations */
 #define LOCK_WRITE	128	/* which allows concurrent write operations */
@@ -185,6 +192,7 @@ struct f_owner_ex {
 
 #define F_LINUX_SPECIFIC_BASE	1024
 
+#ifndef HAVE_ARCH_STRUCT_FLOCK
 struct flock {
 	short	l_type;
 	short	l_whence;
@@ -209,5 +217,6 @@ struct flock64 {
 	__ARCH_FLOCK64_PAD
 #endif
 };
+#endif /* HAVE_ARCH_STRUCT_FLOCK */
 
 #endif /* _ASM_GENERIC_FCNTL_H */
-- 
cgit 


From 7327b16f5f56741960e11ae4d7ef0ffdff5fd252 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Wed, 20 Jul 2022 18:51:21 +0800
Subject: APCI: irq: Add support for multiple GSI domains

In an unfortunate departure from the ACPI spec, the LoongArch
architecture split its GSI space across multiple interrupt
controllers.

In order to be able to reuse the core code and prevent
architectures from reinventing an already square wheel, offer
the arch code the ability to register a dispatcher function
that will return the domain fwnode for a given GSI.

The ARM GIC drivers are updated to support this (with a single
domain, as intended).

Signed-off-by: Marc Zyngier <maz@kernel.org>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Jianmin Lv <lvjianmin@loongson.cn>
Tested-by: Hanjun Guo <guohanjun@huawei.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Link: https://lore.kernel.org/r/1658314292-35346-3-git-send-email-lvjianmin@loongson.cn
---
 drivers/acpi/irq.c           | 40 +++++++++++++++++++++++++---------------
 drivers/irqchip/irq-gic-v3.c | 18 ++++++++++++------
 drivers/irqchip/irq-gic.c    | 18 ++++++++++++------
 include/linux/acpi.h         |  2 +-
 4 files changed, 50 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/irq.c b/drivers/acpi/irq.c
index c68e694fca26..f0de76879497 100644
--- a/drivers/acpi/irq.c
+++ b/drivers/acpi/irq.c
@@ -12,7 +12,7 @@
 
 enum acpi_irq_model_id acpi_irq_model;
 
-static struct fwnode_handle *acpi_gsi_domain_id;
+static struct fwnode_handle *(*acpi_get_gsi_domain_id)(u32 gsi);
 
 /**
  * acpi_gsi_to_irq() - Retrieve the linux irq number for a given GSI
@@ -26,9 +26,10 @@ static struct fwnode_handle *acpi_gsi_domain_id;
  */
 int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
 {
-	struct irq_domain *d = irq_find_matching_fwnode(acpi_gsi_domain_id,
-							DOMAIN_BUS_ANY);
+	struct irq_domain *d;
 
+	d = irq_find_matching_fwnode(acpi_get_gsi_domain_id(gsi),
+					DOMAIN_BUS_ANY);
 	*irq = irq_find_mapping(d, gsi);
 	/*
 	 * *irq == 0 means no mapping, that should
@@ -53,12 +54,12 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger,
 {
 	struct irq_fwspec fwspec;
 
-	if (WARN_ON(!acpi_gsi_domain_id)) {
+	fwspec.fwnode = acpi_get_gsi_domain_id(gsi);
+	if (WARN_ON(!fwspec.fwnode)) {
 		pr_warn("GSI: No registered irqchip, giving up\n");
 		return -EINVAL;
 	}
 
-	fwspec.fwnode = acpi_gsi_domain_id;
 	fwspec.param[0] = gsi;
 	fwspec.param[1] = acpi_dev_get_irq_type(trigger, polarity);
 	fwspec.param_count = 2;
@@ -73,13 +74,14 @@ EXPORT_SYMBOL_GPL(acpi_register_gsi);
  */
 void acpi_unregister_gsi(u32 gsi)
 {
-	struct irq_domain *d = irq_find_matching_fwnode(acpi_gsi_domain_id,
-							DOMAIN_BUS_ANY);
+	struct irq_domain *d;
 	int irq;
 
 	if (WARN_ON(acpi_irq_model == ACPI_IRQ_MODEL_GIC && gsi < 16))
 		return;
 
+	d = irq_find_matching_fwnode(acpi_get_gsi_domain_id(gsi),
+				     DOMAIN_BUS_ANY);
 	irq = irq_find_mapping(d, gsi);
 	irq_dispose_mapping(irq);
 }
@@ -97,7 +99,8 @@ EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
  * The referenced device fwhandle or NULL on failure
  */
 static struct fwnode_handle *
-acpi_get_irq_source_fwhandle(const struct acpi_resource_source *source)
+acpi_get_irq_source_fwhandle(const struct acpi_resource_source *source,
+			     u32 gsi)
 {
 	struct fwnode_handle *result;
 	struct acpi_device *device;
@@ -105,7 +108,7 @@ acpi_get_irq_source_fwhandle(const struct acpi_resource_source *source)
 	acpi_status status;
 
 	if (!source->string_length)
-		return acpi_gsi_domain_id;
+		return acpi_get_gsi_domain_id(gsi);
 
 	status = acpi_get_handle(NULL, source->string_ptr, &handle);
 	if (WARN_ON(ACPI_FAILURE(status)))
@@ -194,7 +197,7 @@ static acpi_status acpi_irq_parse_one_cb(struct acpi_resource *ares,
 			ctx->index -= irq->interrupt_count;
 			return AE_OK;
 		}
-		fwnode = acpi_gsi_domain_id;
+		fwnode = acpi_get_gsi_domain_id(irq->interrupts[ctx->index]);
 		acpi_irq_parse_one_match(fwnode, irq->interrupts[ctx->index],
 					 irq->triggering, irq->polarity,
 					 irq->shareable, ctx);
@@ -207,7 +210,8 @@ static acpi_status acpi_irq_parse_one_cb(struct acpi_resource *ares,
 			ctx->index -= eirq->interrupt_count;
 			return AE_OK;
 		}
-		fwnode = acpi_get_irq_source_fwhandle(&eirq->resource_source);
+		fwnode = acpi_get_irq_source_fwhandle(&eirq->resource_source,
+						      eirq->interrupts[ctx->index]);
 		acpi_irq_parse_one_match(fwnode, eirq->interrupts[ctx->index],
 					 eirq->triggering, eirq->polarity,
 					 eirq->shareable, ctx);
@@ -291,10 +295,10 @@ EXPORT_SYMBOL_GPL(acpi_irq_get);
  *          GSI interrupts
  */
 void __init acpi_set_irq_model(enum acpi_irq_model_id model,
-			       struct fwnode_handle *fwnode)
+			       struct fwnode_handle *(*fn)(u32))
 {
 	acpi_irq_model = model;
-	acpi_gsi_domain_id = fwnode;
+	acpi_get_gsi_domain_id = fn;
 }
 
 /**
@@ -312,8 +316,14 @@ struct irq_domain *acpi_irq_create_hierarchy(unsigned int flags,
 					     const struct irq_domain_ops *ops,
 					     void *host_data)
 {
-	struct irq_domain *d = irq_find_matching_fwnode(acpi_gsi_domain_id,
-							DOMAIN_BUS_ANY);
+	struct irq_domain *d;
+
+	/* This only works for the GIC model... */
+	if (acpi_irq_model != ACPI_IRQ_MODEL_GIC)
+		return NULL;
+
+	d = irq_find_matching_fwnode(acpi_get_gsi_domain_id(0),
+				     DOMAIN_BUS_ANY);
 
 	if (!d)
 		return NULL;
diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index 5c1cf907ee68..c66470378074 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -2360,11 +2360,17 @@ static void __init gic_acpi_setup_kvm_info(void)
 	vgic_set_kvm_info(&gic_v3_kvm_info);
 }
 
+static struct fwnode_handle *gsi_domain_handle;
+
+static struct fwnode_handle *gic_v3_get_gsi_domain_id(u32 gsi)
+{
+	return gsi_domain_handle;
+}
+
 static int __init
 gic_acpi_init(union acpi_subtable_headers *header, const unsigned long end)
 {
 	struct acpi_madt_generic_distributor *dist;
-	struct fwnode_handle *domain_handle;
 	size_t size;
 	int i, err;
 
@@ -2396,18 +2402,18 @@ gic_acpi_init(union acpi_subtable_headers *header, const unsigned long end)
 	if (err)
 		goto out_redist_unmap;
 
-	domain_handle = irq_domain_alloc_fwnode(&dist->base_address);
-	if (!domain_handle) {
+	gsi_domain_handle = irq_domain_alloc_fwnode(&dist->base_address);
+	if (!gsi_domain_handle) {
 		err = -ENOMEM;
 		goto out_redist_unmap;
 	}
 
 	err = gic_init_bases(acpi_data.dist_base, acpi_data.redist_regs,
-			     acpi_data.nr_redist_regions, 0, domain_handle);
+			     acpi_data.nr_redist_regions, 0, gsi_domain_handle);
 	if (err)
 		goto out_fwhandle_free;
 
-	acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, domain_handle);
+	acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, gic_v3_get_gsi_domain_id);
 
 	if (static_branch_likely(&supports_deactivate_key))
 		gic_acpi_setup_kvm_info();
@@ -2415,7 +2421,7 @@ gic_acpi_init(union acpi_subtable_headers *header, const unsigned long end)
 	return 0;
 
 out_fwhandle_free:
-	irq_domain_free_fwnode(domain_handle);
+	irq_domain_free_fwnode(gsi_domain_handle);
 out_redist_unmap:
 	for (i = 0; i < acpi_data.nr_redist_regions; i++)
 		if (acpi_data.redist_regs[i].redist_base)
diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index 820404cb56bc..4c7bae0ec8f9 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -1682,11 +1682,17 @@ static void __init gic_acpi_setup_kvm_info(void)
 	vgic_set_kvm_info(&gic_v2_kvm_info);
 }
 
+static struct fwnode_handle *gsi_domain_handle;
+
+static struct fwnode_handle *gic_v2_get_gsi_domain_id(u32 gsi)
+{
+	return gsi_domain_handle;
+}
+
 static int __init gic_v2_acpi_init(union acpi_subtable_headers *header,
 				   const unsigned long end)
 {
 	struct acpi_madt_generic_distributor *dist;
-	struct fwnode_handle *domain_handle;
 	struct gic_chip_data *gic = &gic_data[0];
 	int count, ret;
 
@@ -1724,22 +1730,22 @@ static int __init gic_v2_acpi_init(union acpi_subtable_headers *header,
 	/*
 	 * Initialize GIC instance zero (no multi-GIC support).
 	 */
-	domain_handle = irq_domain_alloc_fwnode(&dist->base_address);
-	if (!domain_handle) {
+	gsi_domain_handle = irq_domain_alloc_fwnode(&dist->base_address);
+	if (!gsi_domain_handle) {
 		pr_err("Unable to allocate domain handle\n");
 		gic_teardown(gic);
 		return -ENOMEM;
 	}
 
-	ret = __gic_init_bases(gic, domain_handle);
+	ret = __gic_init_bases(gic, gsi_domain_handle);
 	if (ret) {
 		pr_err("Failed to initialise GIC\n");
-		irq_domain_free_fwnode(domain_handle);
+		irq_domain_free_fwnode(gsi_domain_handle);
 		gic_teardown(gic);
 		return ret;
 	}
 
-	acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, domain_handle);
+	acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, gic_v2_get_gsi_domain_id);
 
 	if (IS_ENABLED(CONFIG_ARM_GIC_V2M))
 		gicv2m_init(NULL, gic_data[0].domain);
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 4f82a5bc6d98..957e23f727ea 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -356,7 +356,7 @@ int acpi_gsi_to_irq (u32 gsi, unsigned int *irq);
 int acpi_isa_irq_to_gsi (unsigned isa_irq, u32 *gsi);
 
 void acpi_set_irq_model(enum acpi_irq_model_id model,
-			struct fwnode_handle *fwnode);
+			struct fwnode_handle *(*)(u32));
 
 struct irq_domain *acpi_irq_create_hierarchy(unsigned int flags,
 					     unsigned int size,
-- 
cgit 


From 744b9a0c3c8334d705dea6af3645ea30d597c360 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Wed, 20 Jul 2022 18:51:22 +0800
Subject: ACPI: irq: Allow acpi_gsi_to_irq() to have an arch-specific fallback

It appears that the generic version of acpi_gsi_to_irq() doesn't
fallback to establishing a mapping if there is no pre-existing
one while the x86 version does.

While arm64 seems unaffected by it, LoongArch is relying on the x86
behaviour. In an effort to prevent new architectures from reinventing
the proverbial wheel, provide an optional callback that the arch code
can set to restore the x86 behaviour.

Hopefully we can eventually get rid of this in the future once
the expected behaviour has been clarified.

Reported-by: Jianmin Lv <lvjianmin@loongson.cn>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Jianmin Lv <lvjianmin@loongson.cn>
Tested-by: Hanjun Guo <guohanjun@huawei.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Link: https://lore.kernel.org/r/1658314292-35346-4-git-send-email-lvjianmin@loongson.cn
---
 drivers/acpi/irq.c   | 18 ++++++++++++++++--
 include/linux/acpi.h |  1 +
 2 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/irq.c b/drivers/acpi/irq.c
index f0de76879497..dabe45eba055 100644
--- a/drivers/acpi/irq.c
+++ b/drivers/acpi/irq.c
@@ -13,6 +13,7 @@
 enum acpi_irq_model_id acpi_irq_model;
 
 static struct fwnode_handle *(*acpi_get_gsi_domain_id)(u32 gsi);
+static u32 (*acpi_gsi_to_irq_fallback)(u32 gsi);
 
 /**
  * acpi_gsi_to_irq() - Retrieve the linux irq number for a given GSI
@@ -32,9 +33,12 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
 					DOMAIN_BUS_ANY);
 	*irq = irq_find_mapping(d, gsi);
 	/*
-	 * *irq == 0 means no mapping, that should
-	 * be reported as a failure
+	 * *irq == 0 means no mapping, that should be reported as a
+	 * failure, unless there is an arch-specific fallback handler.
 	 */
+	if (!*irq && acpi_gsi_to_irq_fallback)
+		*irq = acpi_gsi_to_irq_fallback(gsi);
+
 	return (*irq > 0) ? 0 : -EINVAL;
 }
 EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
@@ -301,6 +305,16 @@ void __init acpi_set_irq_model(enum acpi_irq_model_id model,
 	acpi_get_gsi_domain_id = fn;
 }
 
+/**
+ * acpi_set_gsi_to_irq_fallback - Register a GSI transfer
+ * callback to fallback to arch specified implementation.
+ * @fn: arch-specific fallback handler
+ */
+void __init acpi_set_gsi_to_irq_fallback(u32 (*fn)(u32))
+{
+	acpi_gsi_to_irq_fallback = fn;
+}
+
 /**
  * acpi_irq_create_hierarchy - Create a hierarchical IRQ domain with the default
  *                             GSI domain as its parent.
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 957e23f727ea..e2b60d53ca60 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -357,6 +357,7 @@ int acpi_isa_irq_to_gsi (unsigned isa_irq, u32 *gsi);
 
 void acpi_set_irq_model(enum acpi_irq_model_id model,
 			struct fwnode_handle *(*)(u32));
+void acpi_set_gsi_to_irq_fallback(u32 (*)(u32));
 
 struct irq_domain *acpi_irq_create_hierarchy(unsigned int flags,
 					     unsigned int size,
-- 
cgit 


From d319a299f4066685a787cfb89ad36fd78bb830ed Mon Sep 17 00:00:00 2001
From: Jianmin Lv <lvjianmin@loongson.cn>
Date: Wed, 20 Jul 2022 18:51:23 +0800
Subject: genirq/generic_chip: Export irq_unmap_generic_chip

Some irq controllers have to re-implement a private version for
irq_generic_chip_ops, because they have a different xlate to translate
hwirq. Export irq_unmap_generic_chip to allow reusing in drivers.

Signed-off-by: Jianmin Lv <lvjianmin@loongson.cn>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/1658314292-35346-5-git-send-email-lvjianmin@loongson.cn
---
 include/linux/irq.h       | 1 +
 kernel/irq/generic-chip.c | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 505308253d23..83a4574308fa 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -1121,6 +1121,7 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on);
 /* Setup functions for irq_chip_generic */
 int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
 			 irq_hw_number_t hw_irq);
+void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq);
 struct irq_chip_generic *
 irq_alloc_generic_chip(const char *name, int nr_ct, unsigned int irq_base,
 		       void __iomem *reg_base, irq_flow_handler_t handler);
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index f0862eb6b506..c653cd31548d 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -431,7 +431,7 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
 	return 0;
 }
 
-static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
+void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
 {
 	struct irq_data *data = irq_domain_get_irq_data(d, virq);
 	struct irq_domain_chip_generic *dgc = d->gc;
-- 
cgit 


From dd281e1a1a937ee2f13bd0db5be78e5f5b811ca7 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhuacai@loongson.cn>
Date: Wed, 20 Jul 2022 18:51:30 +0800
Subject: irqchip: Add Loongson Extended I/O interrupt controller support

EIOINTC stands for "Extended I/O Interrupts" that described in Section
11.2 of "Loongson 3A5000 Processor Reference Manual". For more
information please refer Documentation/loongarch/irq-chip-model.rst.

Loongson-3A5000 has 4 cores per NUMA node, and each NUMA node has an
EIOINTC; while Loongson-3C5000 has 16 cores per NUMA node, and each NUMA
node has 4 EIOINTCs. In other words, 16 cores of one NUMA node in
Loongson-3C5000 are organized in 4 groups, each group connects to an
EIOINTC. We call the "group" here as an EIOINTC node, so each EIOINTC
node always includes 4 cores (both in Loongson-3A5000 and Loongson-
3C5000).

Co-developed-by: Jianmin Lv <lvjianmin@loongson.cn>
Signed-off-by: Jianmin Lv <lvjianmin@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/1658314292-35346-12-git-send-email-lvjianmin@loongson.cn
---
 arch/loongarch/include/asm/irq.h       |  11 +-
 drivers/irqchip/Kconfig                |  10 +
 drivers/irqchip/Makefile               |   1 +
 drivers/irqchip/irq-loongson-eiointc.c | 395 +++++++++++++++++++++++++++++++++
 include/linux/cpuhotplug.h             |   1 +
 5 files changed, 408 insertions(+), 10 deletions(-)
 create mode 100644 drivers/irqchip/irq-loongson-eiointc.c

(limited to 'include')

diff --git a/arch/loongarch/include/asm/irq.h b/arch/loongarch/include/asm/irq.h
index c847300c4cd1..67ebcc5495b8 100644
--- a/arch/loongarch/include/asm/irq.h
+++ b/arch/loongarch/include/asm/irq.h
@@ -87,15 +87,6 @@ extern struct acpi_vector_group msi_group[MAX_IO_PICS];
 extern int find_pch_pic(u32 gsi);
 extern int eiointc_get_node(int id);
 
-static inline void eiointc_enable(void)
-{
-	uint64_t misc;
-
-	misc = iocsr_read64(LOONGARCH_IOCSR_MISC_FUNC);
-	misc |= IOCSR_MISC_FUNC_EXT_IOI_EN;
-	iocsr_write64(misc, LOONGARCH_IOCSR_MISC_FUNC);
-}
-
 struct acpi_madt_lio_pic;
 struct acpi_madt_eio_pic;
 struct acpi_madt_ht_pic;
@@ -107,7 +98,7 @@ struct irq_domain *loongarch_cpu_irq_init(void);
 
 int liointc_acpi_init(struct irq_domain *parent,
 					struct acpi_madt_lio_pic *acpi_liointc);
-struct irq_domain *eiointc_acpi_init(struct irq_domain *parent,
+int eiointc_acpi_init(struct irq_domain *parent,
 					struct acpi_madt_eio_pic *acpi_eiointc);
 
 struct irq_domain *htvec_acpi_init(struct irq_domain *parent,
diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig
index 8844e6b53b3c..8f077d353e67 100644
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -555,6 +555,16 @@ config LOONGSON_LIOINTC
 	help
 	  Support for the Loongson Local I/O Interrupt Controller.
 
+config LOONGSON_EIOINTC
+	bool "Loongson Extend I/O Interrupt Controller"
+	depends on LOONGARCH
+	depends on MACH_LOONGSON64
+	default MACH_LOONGSON64
+	select IRQ_DOMAIN_HIERARCHY
+	select GENERIC_IRQ_CHIP
+	help
+	  Support for the Loongson3 Extend I/O Interrupt Vector Controller.
+
 config LOONGSON_HTPIC
 	bool "Loongson3 HyperTransport PIC Controller"
 	depends on MACH_LOONGSON64 && MIPS
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index 242b8b3568f8..0cfd4f046751 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -104,6 +104,7 @@ obj-$(CONFIG_TI_SCI_INTR_IRQCHIP)	+= irq-ti-sci-intr.o
 obj-$(CONFIG_TI_SCI_INTA_IRQCHIP)	+= irq-ti-sci-inta.o
 obj-$(CONFIG_TI_PRUSS_INTC)		+= irq-pruss-intc.o
 obj-$(CONFIG_LOONGSON_LIOINTC)		+= irq-loongson-liointc.o
+obj-$(CONFIG_LOONGSON_EIOINTC)		+= irq-loongson-eiointc.o
 obj-$(CONFIG_LOONGSON_HTPIC)		+= irq-loongson-htpic.o
 obj-$(CONFIG_LOONGSON_HTVEC)		+= irq-loongson-htvec.o
 obj-$(CONFIG_LOONGSON_PCH_PIC)		+= irq-loongson-pch-pic.o
diff --git a/drivers/irqchip/irq-loongson-eiointc.c b/drivers/irqchip/irq-loongson-eiointc.c
new file mode 100644
index 000000000000..80d8ca6f2d46
--- /dev/null
+++ b/drivers/irqchip/irq-loongson-eiointc.c
@@ -0,0 +1,395 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Loongson Extend I/O Interrupt Controller support
+ *
+ * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
+ */
+
+#define pr_fmt(fmt) "eiointc: " fmt
+
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqdomain.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/kernel.h>
+#include <linux/platform_device.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/of_platform.h>
+
+#define EIOINTC_REG_NODEMAP	0x14a0
+#define EIOINTC_REG_IPMAP	0x14c0
+#define EIOINTC_REG_ENABLE	0x1600
+#define EIOINTC_REG_BOUNCE	0x1680
+#define EIOINTC_REG_ISR		0x1800
+#define EIOINTC_REG_ROUTE	0x1c00
+
+#define VEC_REG_COUNT		4
+#define VEC_COUNT_PER_REG	64
+#define VEC_COUNT		(VEC_REG_COUNT * VEC_COUNT_PER_REG)
+#define VEC_REG_IDX(irq_id)	((irq_id) / VEC_COUNT_PER_REG)
+#define VEC_REG_BIT(irq_id)     ((irq_id) % VEC_COUNT_PER_REG)
+#define EIOINTC_ALL_ENABLE	0xffffffff
+
+#define MAX_EIO_NODES		(NR_CPUS / CORES_PER_EIO_NODE)
+
+static int nr_pics;
+
+struct eiointc_priv {
+	u32			node;
+	nodemask_t		node_map;
+	cpumask_t		cpuspan_map;
+	struct fwnode_handle	*domain_handle;
+	struct irq_domain	*eiointc_domain;
+};
+
+static struct eiointc_priv *eiointc_priv[MAX_IO_PICS];
+
+static void eiointc_enable(void)
+{
+	uint64_t misc;
+
+	misc = iocsr_read64(LOONGARCH_IOCSR_MISC_FUNC);
+	misc |= IOCSR_MISC_FUNC_EXT_IOI_EN;
+	iocsr_write64(misc, LOONGARCH_IOCSR_MISC_FUNC);
+}
+
+static int cpu_to_eio_node(int cpu)
+{
+	return cpu_logical_map(cpu) / CORES_PER_EIO_NODE;
+}
+
+static void eiointc_set_irq_route(int pos, unsigned int cpu, unsigned int mnode, nodemask_t *node_map)
+{
+	int i, node, cpu_node, route_node;
+	unsigned char coremap;
+	uint32_t pos_off, data, data_byte, data_mask;
+
+	pos_off = pos & ~3;
+	data_byte = pos & 3;
+	data_mask = ~BIT_MASK(data_byte) & 0xf;
+
+	/* Calculate node and coremap of target irq */
+	cpu_node = cpu_logical_map(cpu) / CORES_PER_EIO_NODE;
+	coremap = BIT(cpu_logical_map(cpu) % CORES_PER_EIO_NODE);
+
+	for_each_online_cpu(i) {
+		node = cpu_to_eio_node(i);
+		if (!node_isset(node, *node_map))
+			continue;
+
+		/* EIO node 0 is in charge of inter-node interrupt dispatch */
+		route_node = (node == mnode) ? cpu_node : node;
+		data = ((coremap | (route_node << 4)) << (data_byte * 8));
+		csr_any_send(EIOINTC_REG_ROUTE + pos_off, data, data_mask, node * CORES_PER_EIO_NODE);
+	}
+}
+
+static DEFINE_RAW_SPINLOCK(affinity_lock);
+
+static int eiointc_set_irq_affinity(struct irq_data *d, const struct cpumask *affinity, bool force)
+{
+	unsigned int cpu;
+	unsigned long flags;
+	uint32_t vector, regaddr;
+	struct cpumask intersect_affinity;
+	struct eiointc_priv *priv = d->domain->host_data;
+
+	raw_spin_lock_irqsave(&affinity_lock, flags);
+
+	cpumask_and(&intersect_affinity, affinity, cpu_online_mask);
+	cpumask_and(&intersect_affinity, &intersect_affinity, &priv->cpuspan_map);
+
+	if (cpumask_empty(&intersect_affinity)) {
+		raw_spin_unlock_irqrestore(&affinity_lock, flags);
+		return -EINVAL;
+	}
+	cpu = cpumask_first(&intersect_affinity);
+
+	vector = d->hwirq;
+	regaddr = EIOINTC_REG_ENABLE + ((vector >> 5) << 2);
+
+	/* Mask target vector */
+	csr_any_send(regaddr, EIOINTC_ALL_ENABLE & (~BIT(vector & 0x1F)), 0x0, 0);
+	/* Set route for target vector */
+	eiointc_set_irq_route(vector, cpu, priv->node, &priv->node_map);
+	/* Unmask target vector */
+	csr_any_send(regaddr, EIOINTC_ALL_ENABLE, 0x0, 0);
+
+	irq_data_update_effective_affinity(d, cpumask_of(cpu));
+
+	raw_spin_unlock_irqrestore(&affinity_lock, flags);
+
+	return IRQ_SET_MASK_OK;
+}
+
+static int eiointc_index(int node)
+{
+	int i;
+
+	for (i = 0; i < nr_pics; i++) {
+		if (node_isset(node, eiointc_priv[i]->node_map))
+			return i;
+	}
+
+	return -1;
+}
+
+static int eiointc_router_init(unsigned int cpu)
+{
+	int i, bit;
+	uint32_t data;
+	uint32_t node = cpu_to_eio_node(cpu);
+	uint32_t index = eiointc_index(node);
+
+	if (index < 0) {
+		pr_err("Error: invalid nodemap!\n");
+		return -1;
+	}
+
+	if ((cpu_logical_map(cpu) % CORES_PER_EIO_NODE) == 0) {
+		eiointc_enable();
+
+		for (i = 0; i < VEC_COUNT / 32; i++) {
+			data = (((1 << (i * 2 + 1)) << 16) | (1 << (i * 2)));
+			iocsr_write32(data, EIOINTC_REG_NODEMAP + i * 4);
+		}
+
+		for (i = 0; i < VEC_COUNT / 32 / 4; i++) {
+			bit = BIT(1 + index); /* Route to IP[1 + index] */
+			data = bit | (bit << 8) | (bit << 16) | (bit << 24);
+			iocsr_write32(data, EIOINTC_REG_IPMAP + i * 4);
+		}
+
+		for (i = 0; i < VEC_COUNT / 4; i++) {
+			/* Route to Node-0 Core-0 */
+			if (index == 0)
+				bit = BIT(cpu_logical_map(0));
+			else
+				bit = (eiointc_priv[index]->node << 4) | 1;
+
+			data = bit | (bit << 8) | (bit << 16) | (bit << 24);
+			iocsr_write32(data, EIOINTC_REG_ROUTE + i * 4);
+		}
+
+		for (i = 0; i < VEC_COUNT / 32; i++) {
+			data = 0xffffffff;
+			iocsr_write32(data, EIOINTC_REG_ENABLE + i * 4);
+			iocsr_write32(data, EIOINTC_REG_BOUNCE + i * 4);
+		}
+	}
+
+	return 0;
+}
+
+static void eiointc_irq_dispatch(struct irq_desc *desc)
+{
+	int i;
+	u64 pending;
+	bool handled = false;
+	struct irq_chip *chip = irq_desc_get_chip(desc);
+	struct eiointc_priv *priv = irq_desc_get_handler_data(desc);
+
+	chained_irq_enter(chip, desc);
+
+	for (i = 0; i < VEC_REG_COUNT; i++) {
+		pending = iocsr_read64(EIOINTC_REG_ISR + (i << 3));
+		iocsr_write64(pending, EIOINTC_REG_ISR + (i << 3));
+		while (pending) {
+			int bit = __ffs(pending);
+			int irq = bit + VEC_COUNT_PER_REG * i;
+
+			generic_handle_domain_irq(priv->eiointc_domain, irq);
+			pending &= ~BIT(bit);
+			handled = true;
+		}
+	}
+
+	if (!handled)
+		spurious_interrupt();
+
+	chained_irq_exit(chip, desc);
+}
+
+static void eiointc_ack_irq(struct irq_data *d)
+{
+}
+
+static void eiointc_mask_irq(struct irq_data *d)
+{
+}
+
+static void eiointc_unmask_irq(struct irq_data *d)
+{
+}
+
+static struct irq_chip eiointc_irq_chip = {
+	.name			= "EIOINTC",
+	.irq_ack		= eiointc_ack_irq,
+	.irq_mask		= eiointc_mask_irq,
+	.irq_unmask		= eiointc_unmask_irq,
+	.irq_set_affinity	= eiointc_set_irq_affinity,
+};
+
+static int eiointc_domain_alloc(struct irq_domain *domain, unsigned int virq,
+				unsigned int nr_irqs, void *arg)
+{
+	int ret;
+	unsigned int i, type;
+	unsigned long hwirq = 0;
+	struct eiointc *priv = domain->host_data;
+
+	ret = irq_domain_translate_onecell(domain, arg, &hwirq, &type);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < nr_irqs; i++) {
+		irq_domain_set_info(domain, virq + i, hwirq + i, &eiointc_irq_chip,
+					priv, handle_edge_irq, NULL, NULL);
+	}
+
+	return 0;
+}
+
+static void eiointc_domain_free(struct irq_domain *domain, unsigned int virq,
+				unsigned int nr_irqs)
+{
+	int i;
+
+	for (i = 0; i < nr_irqs; i++) {
+		struct irq_data *d = irq_domain_get_irq_data(domain, virq + i);
+
+		irq_set_handler(virq + i, NULL);
+		irq_domain_reset_irq_data(d);
+	}
+}
+
+static const struct irq_domain_ops eiointc_domain_ops = {
+	.translate	= irq_domain_translate_onecell,
+	.alloc		= eiointc_domain_alloc,
+	.free		= eiointc_domain_free,
+};
+
+static void acpi_set_vec_parent(int node, struct irq_domain *parent, struct acpi_vector_group *vec_group)
+{
+	int i;
+
+	if (cpu_has_flatmode)
+		node = cpu_to_node(node * CORES_PER_EIO_NODE);
+
+	for (i = 0; i < MAX_IO_PICS; i++) {
+		if (node == vec_group[i].node) {
+			vec_group[i].parent = parent;
+			return;
+		}
+	}
+}
+
+struct irq_domain *acpi_get_vec_parent(int node, struct acpi_vector_group *vec_group)
+{
+	int i;
+
+	for (i = 0; i < MAX_IO_PICS; i++) {
+		if (node == vec_group[i].node)
+			return vec_group[i].parent;
+	}
+	return NULL;
+}
+
+static int __init
+pch_pic_parse_madt(union acpi_subtable_headers *header,
+		       const unsigned long end)
+{
+	struct acpi_madt_bio_pic *pchpic_entry = (struct acpi_madt_bio_pic *)header;
+	unsigned int node = (pchpic_entry->address >> 44) & 0xf;
+	struct irq_domain *parent = acpi_get_vec_parent(node, pch_group);
+
+	if (parent)
+		return pch_pic_acpi_init(parent, pchpic_entry);
+
+	return -EINVAL;
+}
+
+static int __init
+pch_msi_parse_madt(union acpi_subtable_headers *header,
+		       const unsigned long end)
+{
+	struct acpi_madt_msi_pic *pchmsi_entry = (struct acpi_madt_msi_pic *)header;
+	struct irq_domain *parent = acpi_get_vec_parent(eiointc_priv[nr_pics - 1]->node, msi_group);
+
+	if (parent)
+		return pch_msi_acpi_init(parent, pchmsi_entry);
+
+	return -EINVAL;
+}
+
+static int __init acpi_cascade_irqdomain_init(void)
+{
+	acpi_table_parse_madt(ACPI_MADT_TYPE_BIO_PIC,
+			      pch_pic_parse_madt, 0);
+	acpi_table_parse_madt(ACPI_MADT_TYPE_MSI_PIC,
+			      pch_msi_parse_madt, 1);
+	return 0;
+}
+
+int __init eiointc_acpi_init(struct irq_domain *parent,
+				     struct acpi_madt_eio_pic *acpi_eiointc)
+{
+	int i, parent_irq;
+	unsigned long node_map;
+	struct eiointc_priv *priv;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	priv->domain_handle = irq_domain_alloc_fwnode((phys_addr_t *)acpi_eiointc);
+	if (!priv->domain_handle) {
+		pr_err("Unable to allocate domain handle\n");
+		goto out_free_priv;
+	}
+
+	priv->node = acpi_eiointc->node;
+	node_map = acpi_eiointc->node_map ? : -1ULL;
+
+	for_each_possible_cpu(i) {
+		if (node_map & (1ULL << cpu_to_eio_node(i))) {
+			node_set(cpu_to_eio_node(i), priv->node_map);
+			cpumask_or(&priv->cpuspan_map, &priv->cpuspan_map, cpumask_of(i));
+		}
+	}
+
+	/* Setup IRQ domain */
+	priv->eiointc_domain = irq_domain_create_linear(priv->domain_handle, VEC_COUNT,
+					&eiointc_domain_ops, priv);
+	if (!priv->eiointc_domain) {
+		pr_err("loongson-eiointc: cannot add IRQ domain\n");
+		goto out_free_handle;
+	}
+
+	eiointc_priv[nr_pics++] = priv;
+
+	eiointc_router_init(0);
+
+	parent_irq = irq_create_mapping(parent, acpi_eiointc->cascade);
+	irq_set_chained_handler_and_data(parent_irq, eiointc_irq_dispatch, priv);
+
+	cpuhp_setup_state_nocalls(CPUHP_AP_IRQ_LOONGARCH_STARTING,
+				  "irqchip/loongarch/intc:starting",
+				  eiointc_router_init, NULL);
+
+	acpi_set_vec_parent(acpi_eiointc->node, priv->eiointc_domain, pch_group);
+	acpi_set_vec_parent(acpi_eiointc->node, priv->eiointc_domain, msi_group);
+	acpi_cascade_irqdomain_init();
+
+	return 0;
+
+out_free_handle:
+	irq_domain_free_fwnode(priv->domain_handle);
+	priv->domain_handle = NULL;
+out_free_priv:
+	kfree(priv);
+
+	return -ENOMEM;
+}
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 19f0dbfdd7fe..de662f3a6cee 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -151,6 +151,7 @@ enum cpuhp_state {
 	CPUHP_AP_IRQ_BCM2836_STARTING,
 	CPUHP_AP_IRQ_MIPS_GIC_STARTING,
 	CPUHP_AP_IRQ_RISCV_STARTING,
+	CPUHP_AP_IRQ_LOONGARCH_STARTING,
 	CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING,
 	CPUHP_AP_ARM_MVEBU_COHERENCY,
 	CPUHP_AP_MICROCODE_LOADER,
-- 
cgit 


From e8bba72b396cef7c919c73710f3c5884521adb4e Mon Sep 17 00:00:00 2001
From: Jianmin Lv <lvjianmin@loongson.cn>
Date: Wed, 20 Jul 2022 18:51:32 +0800
Subject: irqchip / ACPI: Introduce ACPI_IRQ_MODEL_LPIC for LoongArch

For LoongArch, ACPI_IRQ_MODEL_LPIC is introduced, and then the
callback acpi_get_gsi_domain_id and acpi_gsi_to_irq_fallback are
implemented.

The acpi_get_gsi_domain_id callback returns related fwnode handle
of irqdomain for different GSI range.

The acpi_gsi_to_irq_fallback will create new mapping for gsi when
the mapping of it is not found.

Signed-off-by: Jianmin Lv <lvjianmin@loongson.cn>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/1658314292-35346-14-git-send-email-lvjianmin@loongson.cn
---
 drivers/acpi/bus.c                  |  3 +++
 drivers/irqchip/irq-loongarch-cpu.c | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/acpi.h                |  1 +
 3 files changed, 41 insertions(+)

(limited to 'include')

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 86fa61a21826..63fbf0022d39 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -1145,6 +1145,9 @@ static int __init acpi_bus_init_irq(void)
 	case ACPI_IRQ_MODEL_PLATFORM:
 		message = "platform specific model";
 		break;
+	case ACPI_IRQ_MODEL_LPIC:
+		message = "LPIC";
+		break;
 	default:
 		pr_info("Unknown interrupt routing model\n");
 		return -ENODEV;
diff --git a/drivers/irqchip/irq-loongarch-cpu.c b/drivers/irqchip/irq-loongarch-cpu.c
index 28ddc60c8608..327f3ab62c03 100644
--- a/drivers/irqchip/irq-loongarch-cpu.c
+++ b/drivers/irqchip/irq-loongarch-cpu.c
@@ -16,6 +16,41 @@
 static struct irq_domain *irq_domain;
 struct fwnode_handle *cpuintc_handle;
 
+static u32 lpic_gsi_to_irq(u32 gsi)
+{
+	/* Only pch irqdomain transferring is required for LoongArch. */
+	if (gsi >= GSI_MIN_PCH_IRQ && gsi <= GSI_MAX_PCH_IRQ)
+		return acpi_register_gsi(NULL, gsi, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_HIGH);
+
+	return 0;
+}
+
+static struct fwnode_handle *lpic_get_gsi_domain_id(u32 gsi)
+{
+	int id;
+	struct fwnode_handle *domain_handle = NULL;
+
+	switch (gsi) {
+	case GSI_MIN_CPU_IRQ ... GSI_MAX_CPU_IRQ:
+		if (liointc_handle)
+			domain_handle = liointc_handle;
+		break;
+
+	case GSI_MIN_LPC_IRQ ... GSI_MAX_LPC_IRQ:
+		if (pch_lpc_handle)
+			domain_handle = pch_lpc_handle;
+		break;
+
+	case GSI_MIN_PCH_IRQ ... GSI_MAX_PCH_IRQ:
+		id = find_pch_pic(gsi);
+		if (id >= 0 && pch_pic_handle[id])
+			domain_handle = pch_pic_handle[id];
+		break;
+	}
+
+	return domain_handle;
+}
+
 static void mask_loongarch_irq(struct irq_data *d)
 {
 	clear_csr_ecfg(ECFGF(d->hwirq));
@@ -102,6 +137,8 @@ static int __init cpuintc_acpi_init(union acpi_subtable_headers *header,
 		panic("Failed to add irqdomain for LoongArch CPU");
 
 	set_handle_irq(&handle_cpu_irq);
+	acpi_set_irq_model(ACPI_IRQ_MODEL_LPIC, lpic_get_gsi_domain_id);
+	acpi_set_gsi_to_irq_fallback(lpic_gsi_to_irq);
 	acpi_cascade_irqdomain_init();
 
 	return 0;
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index e2b60d53ca60..76520f379313 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -105,6 +105,7 @@ enum acpi_irq_model_id {
 	ACPI_IRQ_MODEL_IOSAPIC,
 	ACPI_IRQ_MODEL_PLATFORM,
 	ACPI_IRQ_MODEL_GIC,
+	ACPI_IRQ_MODEL_LPIC,
 	ACPI_IRQ_MODEL_COUNT
 };
 
-- 
cgit 


From f8faf3496633b302a6591fda599540a0b53a35bb Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Tue, 19 Jul 2022 14:52:51 -0500
Subject: x86/amd_nb: Add AMD PCI IDs for SMN communication

Add support for SMN communication on family 17h model A0h and family 19h
models 60h-70h.

  [ bp: Merge into a single patch. ]

Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>	# pci_ids.h
Acked-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/r/20220719195256.1516-1-mario.limonciello@amd.com
---
 arch/x86/kernel/amd_nb.c | 13 +++++++++++++
 include/linux/pci_ids.h  |  3 +++
 2 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 190e0f763375..4266b64631a4 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -19,17 +19,23 @@
 #define PCI_DEVICE_ID_AMD_17H_M10H_ROOT	0x15d0
 #define PCI_DEVICE_ID_AMD_17H_M30H_ROOT	0x1480
 #define PCI_DEVICE_ID_AMD_17H_M60H_ROOT	0x1630
+#define PCI_DEVICE_ID_AMD_17H_MA0H_ROOT	0x14b5
 #define PCI_DEVICE_ID_AMD_19H_M10H_ROOT	0x14a4
+#define PCI_DEVICE_ID_AMD_19H_M60H_ROOT	0x14d8
+#define PCI_DEVICE_ID_AMD_19H_M70H_ROOT	0x14e8
 #define PCI_DEVICE_ID_AMD_17H_DF_F4	0x1464
 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec
 #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494
 #define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c
 #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444
+#define PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4 0x1728
 #define PCI_DEVICE_ID_AMD_19H_DF_F4	0x1654
 #define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1
 #define PCI_DEVICE_ID_AMD_19H_M40H_ROOT	0x14b5
 #define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d
 #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e
+#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4
+#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4
 
 /* Protect the PCI config register pairs used for SMN. */
 static DEFINE_MUTEX(smn_mutex);
@@ -41,8 +47,11 @@ static const struct pci_device_id amd_root_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_ROOT) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_ROOT) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_ROOT) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_ROOT) },
 	{}
 };
 
@@ -61,12 +70,15 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) },
 	{}
 };
 
@@ -81,6 +93,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) },
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 0178823ce8c2..7fa460ccf7fa 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -556,10 +556,13 @@
 #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F3 0x1493
 #define PCI_DEVICE_ID_AMD_17H_M60H_DF_F3 0x144b
 #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F3 0x1443
+#define PCI_DEVICE_ID_AMD_17H_MA0H_DF_F3 0x1727
 #define PCI_DEVICE_ID_AMD_19H_DF_F3	0x1653
 #define PCI_DEVICE_ID_AMD_19H_M10H_DF_F3 0x14b0
 #define PCI_DEVICE_ID_AMD_19H_M40H_DF_F3 0x167c
 #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F3 0x166d
+#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F3 0x14e3
+#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F3 0x14f3
 #define PCI_DEVICE_ID_AMD_CNB17H_F3	0x1703
 #define PCI_DEVICE_ID_AMD_LANCE		0x2000
 #define PCI_DEVICE_ID_AMD_LANCE_HOME	0x2001
-- 
cgit 


From ce4b4657ff18925c315855aa290e93c5fa652d96 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 19 Jul 2022 21:02:48 -0300
Subject: vfio: Replace the DMA unmapping notifier with a callback

Instead of having drivers register the notifier with explicit code just
have them provide a dma_unmap callback op in their driver ops and rely on
the core code to wire it up.

Suggested-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Tony Krowiak <akrowiak@linux.ibm.com>
Reviewed-by: Eric Farman <farman@linux.ibm.com>
Reviewed-by: Zhenyu Wang <zhenyuw@linux.intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/1-v4-681e038e30fd+78-vfio_unmap_notif_jgg@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/gpu/drm/i915/gvt/gvt.h        |   1 -
 drivers/gpu/drm/i915/gvt/kvmgt.c      |  75 +++++---------------
 drivers/s390/cio/vfio_ccw_ops.c       |  39 +++-------
 drivers/s390/cio/vfio_ccw_private.h   |   2 -
 drivers/s390/crypto/vfio_ap_ops.c     |  53 +++-----------
 drivers/s390/crypto/vfio_ap_private.h |   3 -
 drivers/vfio/vfio.c                   | 129 ++++++++++++----------------------
 drivers/vfio/vfio.h                   |   3 +
 include/linux/vfio.h                  |  21 ++----
 9 files changed, 86 insertions(+), 240 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h
index aee1a45da74b..705689e64011 100644
--- a/drivers/gpu/drm/i915/gvt/gvt.h
+++ b/drivers/gpu/drm/i915/gvt/gvt.h
@@ -226,7 +226,6 @@ struct intel_vgpu {
 	unsigned long nr_cache_entries;
 	struct mutex cache_lock;
 
-	struct notifier_block iommu_notifier;
 	atomic_t released;
 
 	struct kvm_page_track_notifier_node track_node;
diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index e2f6c56ab342..ecd5bb37b63a 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -729,34 +729,25 @@ int intel_gvt_set_edid(struct intel_vgpu *vgpu, int port_num)
 	return ret;
 }
 
-static int intel_vgpu_iommu_notifier(struct notifier_block *nb,
-				     unsigned long action, void *data)
+static void intel_vgpu_dma_unmap(struct vfio_device *vfio_dev, u64 iova,
+				 u64 length)
 {
-	struct intel_vgpu *vgpu =
-		container_of(nb, struct intel_vgpu, iommu_notifier);
-
-	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
-		struct vfio_iommu_type1_dma_unmap *unmap = data;
-		struct gvt_dma *entry;
-		unsigned long iov_pfn, end_iov_pfn;
+	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
+	struct gvt_dma *entry;
+	u64 iov_pfn = iova >> PAGE_SHIFT;
+	u64 end_iov_pfn = iov_pfn + length / PAGE_SIZE;
 
-		iov_pfn = unmap->iova >> PAGE_SHIFT;
-		end_iov_pfn = iov_pfn + unmap->size / PAGE_SIZE;
+	mutex_lock(&vgpu->cache_lock);
+	for (; iov_pfn < end_iov_pfn; iov_pfn++) {
+		entry = __gvt_cache_find_gfn(vgpu, iov_pfn);
+		if (!entry)
+			continue;
 
-		mutex_lock(&vgpu->cache_lock);
-		for (; iov_pfn < end_iov_pfn; iov_pfn++) {
-			entry = __gvt_cache_find_gfn(vgpu, iov_pfn);
-			if (!entry)
-				continue;
-
-			gvt_dma_unmap_page(vgpu, entry->gfn, entry->dma_addr,
-					   entry->size);
-			__gvt_cache_remove_entry(vgpu, entry);
-		}
-		mutex_unlock(&vgpu->cache_lock);
+		gvt_dma_unmap_page(vgpu, entry->gfn, entry->dma_addr,
+				   entry->size);
+		__gvt_cache_remove_entry(vgpu, entry);
 	}
-
-	return NOTIFY_OK;
+	mutex_unlock(&vgpu->cache_lock);
 }
 
 static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu)
@@ -783,36 +774,20 @@ out:
 static int intel_vgpu_open_device(struct vfio_device *vfio_dev)
 {
 	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
-	unsigned long events;
-	int ret;
-
-	vgpu->iommu_notifier.notifier_call = intel_vgpu_iommu_notifier;
 
-	events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
-	ret = vfio_register_notifier(vfio_dev, VFIO_IOMMU_NOTIFY, &events,
-				     &vgpu->iommu_notifier);
-	if (ret != 0) {
-		gvt_vgpu_err("vfio_register_notifier for iommu failed: %d\n",
-			ret);
-		goto out;
-	}
-
-	ret = -EEXIST;
 	if (vgpu->attached)
-		goto undo_iommu;
+		return -EEXIST;
 
-	ret = -ESRCH;
 	if (!vgpu->vfio_device.kvm ||
 	    vgpu->vfio_device.kvm->mm != current->mm) {
 		gvt_vgpu_err("KVM is required to use Intel vGPU\n");
-		goto undo_iommu;
+		return -ESRCH;
 	}
 
 	kvm_get_kvm(vgpu->vfio_device.kvm);
 
-	ret = -EEXIST;
 	if (__kvmgt_vgpu_exist(vgpu))
-		goto undo_iommu;
+		return -EEXIST;
 
 	vgpu->attached = true;
 
@@ -831,12 +806,6 @@ static int intel_vgpu_open_device(struct vfio_device *vfio_dev)
 
 	atomic_set(&vgpu->released, 0);
 	return 0;
-
-undo_iommu:
-	vfio_unregister_notifier(vfio_dev, VFIO_IOMMU_NOTIFY,
-				 &vgpu->iommu_notifier);
-out:
-	return ret;
 }
 
 static void intel_vgpu_release_msi_eventfd_ctx(struct intel_vgpu *vgpu)
@@ -853,8 +822,6 @@ static void intel_vgpu_release_msi_eventfd_ctx(struct intel_vgpu *vgpu)
 static void intel_vgpu_close_device(struct vfio_device *vfio_dev)
 {
 	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
-	struct drm_i915_private *i915 = vgpu->gvt->gt->i915;
-	int ret;
 
 	if (!vgpu->attached)
 		return;
@@ -864,11 +831,6 @@ static void intel_vgpu_close_device(struct vfio_device *vfio_dev)
 
 	intel_gvt_release_vgpu(vgpu);
 
-	ret = vfio_unregister_notifier(&vgpu->vfio_device, VFIO_IOMMU_NOTIFY,
-				       &vgpu->iommu_notifier);
-	drm_WARN(&i915->drm, ret,
-		 "vfio_unregister_notifier for iommu failed: %d\n", ret);
-
 	debugfs_remove(debugfs_lookup(KVMGT_DEBUGFS_FILENAME, vgpu->debugfs));
 
 	kvm_page_track_unregister_notifier(vgpu->vfio_device.kvm,
@@ -1610,6 +1572,7 @@ static const struct vfio_device_ops intel_vgpu_dev_ops = {
 	.write		= intel_vgpu_write,
 	.mmap		= intel_vgpu_mmap,
 	.ioctl		= intel_vgpu_ioctl,
+	.dma_unmap	= intel_vgpu_dma_unmap,
 };
 
 static int intel_vgpu_probe(struct mdev_device *mdev)
diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
index bc2176421dc5..0047fd88f938 100644
--- a/drivers/s390/cio/vfio_ccw_ops.c
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -33,30 +33,16 @@ static int vfio_ccw_mdev_reset(struct vfio_ccw_private *private)
 	return 0;
 }
 
-static int vfio_ccw_mdev_notifier(struct notifier_block *nb,
-				  unsigned long action,
-				  void *data)
+static void vfio_ccw_dma_unmap(struct vfio_device *vdev, u64 iova, u64 length)
 {
 	struct vfio_ccw_private *private =
-		container_of(nb, struct vfio_ccw_private, nb);
-
-	/*
-	 * Vendor drivers MUST unpin pages in response to an
-	 * invalidation.
-	 */
-	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
-		struct vfio_iommu_type1_dma_unmap *unmap = data;
-
-		if (!cp_iova_pinned(&private->cp, unmap->iova))
-			return NOTIFY_OK;
-
-		if (vfio_ccw_mdev_reset(private))
-			return NOTIFY_BAD;
+		container_of(vdev, struct vfio_ccw_private, vdev);
 
-		return NOTIFY_OK;
-	}
+	/* Drivers MUST unpin pages in response to an invalidation. */
+	if (!cp_iova_pinned(&private->cp, iova))
+		return;
 
-	return NOTIFY_DONE;
+	vfio_ccw_mdev_reset(private);
 }
 
 static ssize_t name_show(struct mdev_type *mtype,
@@ -154,23 +140,15 @@ static int vfio_ccw_mdev_open_device(struct vfio_device *vdev)
 {
 	struct vfio_ccw_private *private =
 		container_of(vdev, struct vfio_ccw_private, vdev);
-	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
 	int ret;
 
 	/* Device cannot simply be opened again from this state */
 	if (private->state == VFIO_CCW_STATE_NOT_OPER)
 		return -EINVAL;
 
-	private->nb.notifier_call = vfio_ccw_mdev_notifier;
-
-	ret = vfio_register_notifier(vdev, VFIO_IOMMU_NOTIFY,
-				     &events, &private->nb);
-	if (ret)
-		return ret;
-
 	ret = vfio_ccw_register_async_dev_regions(private);
 	if (ret)
-		goto out_unregister;
+		return ret;
 
 	ret = vfio_ccw_register_schib_dev_regions(private);
 	if (ret)
@@ -190,7 +168,6 @@ static int vfio_ccw_mdev_open_device(struct vfio_device *vdev)
 
 out_unregister:
 	vfio_ccw_unregister_dev_regions(private);
-	vfio_unregister_notifier(vdev, VFIO_IOMMU_NOTIFY, &private->nb);
 	return ret;
 }
 
@@ -201,7 +178,6 @@ static void vfio_ccw_mdev_close_device(struct vfio_device *vdev)
 
 	vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_CLOSE);
 	vfio_ccw_unregister_dev_regions(private);
-	vfio_unregister_notifier(vdev, VFIO_IOMMU_NOTIFY, &private->nb);
 }
 
 static ssize_t vfio_ccw_mdev_read_io_region(struct vfio_ccw_private *private,
@@ -624,6 +600,7 @@ static const struct vfio_device_ops vfio_ccw_dev_ops = {
 	.write = vfio_ccw_mdev_write,
 	.ioctl = vfio_ccw_mdev_ioctl,
 	.request = vfio_ccw_mdev_request,
+	.dma_unmap = vfio_ccw_dma_unmap,
 };
 
 struct mdev_driver vfio_ccw_mdev_driver = {
diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h
index abac532bf03e..cd24b7fada91 100644
--- a/drivers/s390/cio/vfio_ccw_private.h
+++ b/drivers/s390/cio/vfio_ccw_private.h
@@ -73,7 +73,6 @@ struct vfio_ccw_crw {
  * @state: internal state of the device
  * @completion: synchronization helper of the I/O completion
  * @avail: available for creating a mediated device
- * @nb: notifier for vfio events
  * @io_region: MMIO region to input/output I/O arguments/results
  * @io_mutex: protect against concurrent update of I/O regions
  * @region: additional regions for other subchannel operations
@@ -96,7 +95,6 @@ struct vfio_ccw_private {
 	int			state;
 	struct completion	*completion;
 	atomic_t		avail;
-	struct notifier_block	nb;
 	struct ccw_io_region	*io_region;
 	struct mutex		io_mutex;
 	struct vfio_ccw_region *region;
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index a7d2a95796d3..bb1a1677c5c2 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -1226,34 +1226,14 @@ static int vfio_ap_mdev_set_kvm(struct ap_matrix_mdev *matrix_mdev,
 	return 0;
 }
 
-/**
- * vfio_ap_mdev_iommu_notifier - IOMMU notifier callback
- *
- * @nb: The notifier block
- * @action: Action to be taken
- * @data: data associated with the request
- *
- * For an UNMAP request, unpin the guest IOVA (the NIB guest address we
- * pinned before). Other requests are ignored.
- *
- * Return: for an UNMAP request, NOFITY_OK; otherwise NOTIFY_DONE.
- */
-static int vfio_ap_mdev_iommu_notifier(struct notifier_block *nb,
-				       unsigned long action, void *data)
+static void vfio_ap_mdev_dma_unmap(struct vfio_device *vdev, u64 iova,
+				   u64 length)
 {
-	struct ap_matrix_mdev *matrix_mdev;
-
-	matrix_mdev = container_of(nb, struct ap_matrix_mdev, iommu_notifier);
-
-	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
-		struct vfio_iommu_type1_dma_unmap *unmap = data;
-		unsigned long g_pfn = unmap->iova >> PAGE_SHIFT;
-
-		vfio_unpin_pages(&matrix_mdev->vdev, &g_pfn, 1);
-		return NOTIFY_OK;
-	}
+	struct ap_matrix_mdev *matrix_mdev =
+		container_of(vdev, struct ap_matrix_mdev, vdev);
+	unsigned long g_pfn = iova >> PAGE_SHIFT;
 
-	return NOTIFY_DONE;
+	vfio_unpin_pages(&matrix_mdev->vdev, &g_pfn, 1);
 }
 
 /**
@@ -1380,27 +1360,11 @@ static int vfio_ap_mdev_open_device(struct vfio_device *vdev)
 {
 	struct ap_matrix_mdev *matrix_mdev =
 		container_of(vdev, struct ap_matrix_mdev, vdev);
-	unsigned long events;
-	int ret;
 
 	if (!vdev->kvm)
 		return -EINVAL;
 
-	ret = vfio_ap_mdev_set_kvm(matrix_mdev, vdev->kvm);
-	if (ret)
-		return ret;
-
-	matrix_mdev->iommu_notifier.notifier_call = vfio_ap_mdev_iommu_notifier;
-	events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
-	ret = vfio_register_notifier(vdev, VFIO_IOMMU_NOTIFY, &events,
-				     &matrix_mdev->iommu_notifier);
-	if (ret)
-		goto err_kvm;
-	return 0;
-
-err_kvm:
-	vfio_ap_mdev_unset_kvm(matrix_mdev);
-	return ret;
+	return vfio_ap_mdev_set_kvm(matrix_mdev, vdev->kvm);
 }
 
 static void vfio_ap_mdev_close_device(struct vfio_device *vdev)
@@ -1408,8 +1372,6 @@ static void vfio_ap_mdev_close_device(struct vfio_device *vdev)
 	struct ap_matrix_mdev *matrix_mdev =
 		container_of(vdev, struct ap_matrix_mdev, vdev);
 
-	vfio_unregister_notifier(vdev, VFIO_IOMMU_NOTIFY,
-				 &matrix_mdev->iommu_notifier);
 	vfio_ap_mdev_unset_kvm(matrix_mdev);
 }
 
@@ -1461,6 +1423,7 @@ static const struct vfio_device_ops vfio_ap_matrix_dev_ops = {
 	.open_device = vfio_ap_mdev_open_device,
 	.close_device = vfio_ap_mdev_close_device,
 	.ioctl = vfio_ap_mdev_ioctl,
+	.dma_unmap = vfio_ap_mdev_dma_unmap,
 };
 
 static struct mdev_driver vfio_ap_matrix_driver = {
diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h
index a26efd804d0d..abb59d59f81b 100644
--- a/drivers/s390/crypto/vfio_ap_private.h
+++ b/drivers/s390/crypto/vfio_ap_private.h
@@ -81,8 +81,6 @@ struct ap_matrix {
  * @node:	allows the ap_matrix_mdev struct to be added to a list
  * @matrix:	the adapters, usage domains and control domains assigned to the
  *		mediated matrix device.
- * @iommu_notifier: notifier block used for specifying callback function for
- *		    handling the VFIO_IOMMU_NOTIFY_DMA_UNMAP even
  * @kvm:	the struct holding guest's state
  * @pqap_hook:	the function pointer to the interception handler for the
  *		PQAP(AQIC) instruction.
@@ -92,7 +90,6 @@ struct ap_matrix_mdev {
 	struct vfio_device vdev;
 	struct list_head node;
 	struct ap_matrix matrix;
-	struct notifier_block iommu_notifier;
 	struct kvm *kvm;
 	crypto_hook pqap_hook;
 	struct mdev_device *mdev;
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index bd84ca7c5e35..83c375fa2421 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -231,6 +231,9 @@ int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 {
 	struct vfio_iommu_driver *driver, *tmp;
 
+	if (WARN_ON(!ops->register_notifier != !ops->unregister_notifier))
+		return -EINVAL;
+
 	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
 	if (!driver)
 		return -ENOMEM;
@@ -1079,8 +1082,20 @@ static void vfio_device_unassign_container(struct vfio_device *device)
 	up_write(&device->group->group_rwsem);
 }
 
+static int vfio_iommu_notifier(struct notifier_block *nb, unsigned long action,
+			       void *data)
+{
+	struct vfio_device *vfio_device =
+		container_of(nb, struct vfio_device, iommu_nb);
+	struct vfio_iommu_type1_dma_unmap *unmap = data;
+
+	vfio_device->ops->dma_unmap(vfio_device, unmap->iova, unmap->size);
+	return NOTIFY_OK;
+}
+
 static struct file *vfio_device_open(struct vfio_device *device)
 {
+	struct vfio_iommu_driver *iommu_driver;
 	struct file *filep;
 	int ret;
 
@@ -1111,6 +1126,18 @@ static struct file *vfio_device_open(struct vfio_device *device)
 			if (ret)
 				goto err_undo_count;
 		}
+
+		iommu_driver = device->group->container->iommu_driver;
+		if (device->ops->dma_unmap && iommu_driver &&
+		    iommu_driver->ops->register_notifier) {
+			unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
+
+			device->iommu_nb.notifier_call = vfio_iommu_notifier;
+			iommu_driver->ops->register_notifier(
+				device->group->container->iommu_data, &events,
+				&device->iommu_nb);
+		}
+
 		up_read(&device->group->group_rwsem);
 	}
 	mutex_unlock(&device->dev_set->lock);
@@ -1145,8 +1172,16 @@ static struct file *vfio_device_open(struct vfio_device *device)
 err_close_device:
 	mutex_lock(&device->dev_set->lock);
 	down_read(&device->group->group_rwsem);
-	if (device->open_count == 1 && device->ops->close_device)
+	if (device->open_count == 1 && device->ops->close_device) {
 		device->ops->close_device(device);
+
+		iommu_driver = device->group->container->iommu_driver;
+		if (device->ops->dma_unmap && iommu_driver &&
+		    iommu_driver->ops->unregister_notifier)
+			iommu_driver->ops->unregister_notifier(
+				device->group->container->iommu_data,
+				&device->iommu_nb);
+	}
 err_undo_count:
 	up_read(&device->group->group_rwsem);
 	device->open_count--;
@@ -1341,12 +1376,20 @@ static const struct file_operations vfio_group_fops = {
 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
 {
 	struct vfio_device *device = filep->private_data;
+	struct vfio_iommu_driver *iommu_driver;
 
 	mutex_lock(&device->dev_set->lock);
 	vfio_assert_device_open(device);
 	down_read(&device->group->group_rwsem);
 	if (device->open_count == 1 && device->ops->close_device)
 		device->ops->close_device(device);
+
+	iommu_driver = device->group->container->iommu_driver;
+	if (device->ops->dma_unmap && iommu_driver &&
+	    iommu_driver->ops->unregister_notifier)
+		iommu_driver->ops->unregister_notifier(
+			device->group->container->iommu_data,
+			&device->iommu_nb);
 	up_read(&device->group->group_rwsem);
 	device->open_count--;
 	if (device->open_count == 0)
@@ -2029,90 +2072,6 @@ int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data,
 }
 EXPORT_SYMBOL(vfio_dma_rw);
 
-static int vfio_register_iommu_notifier(struct vfio_group *group,
-					unsigned long *events,
-					struct notifier_block *nb)
-{
-	struct vfio_container *container;
-	struct vfio_iommu_driver *driver;
-	int ret;
-
-	lockdep_assert_held_read(&group->group_rwsem);
-
-	container = group->container;
-	driver = container->iommu_driver;
-	if (likely(driver && driver->ops->register_notifier))
-		ret = driver->ops->register_notifier(container->iommu_data,
-						     events, nb);
-	else
-		ret = -ENOTTY;
-
-	return ret;
-}
-
-static int vfio_unregister_iommu_notifier(struct vfio_group *group,
-					  struct notifier_block *nb)
-{
-	struct vfio_container *container;
-	struct vfio_iommu_driver *driver;
-	int ret;
-
-	lockdep_assert_held_read(&group->group_rwsem);
-
-	container = group->container;
-	driver = container->iommu_driver;
-	if (likely(driver && driver->ops->unregister_notifier))
-		ret = driver->ops->unregister_notifier(container->iommu_data,
-						       nb);
-	else
-		ret = -ENOTTY;
-
-	return ret;
-}
-
-int vfio_register_notifier(struct vfio_device *device,
-			   enum vfio_notify_type type, unsigned long *events,
-			   struct notifier_block *nb)
-{
-	struct vfio_group *group = device->group;
-	int ret;
-
-	if (!nb || !events || (*events == 0) ||
-	    !vfio_assert_device_open(device))
-		return -EINVAL;
-
-	switch (type) {
-	case VFIO_IOMMU_NOTIFY:
-		ret = vfio_register_iommu_notifier(group, events, nb);
-		break;
-	default:
-		ret = -EINVAL;
-	}
-	return ret;
-}
-EXPORT_SYMBOL(vfio_register_notifier);
-
-int vfio_unregister_notifier(struct vfio_device *device,
-			     enum vfio_notify_type type,
-			     struct notifier_block *nb)
-{
-	struct vfio_group *group = device->group;
-	int ret;
-
-	if (!nb || !vfio_assert_device_open(device))
-		return -EINVAL;
-
-	switch (type) {
-	case VFIO_IOMMU_NOTIFY:
-		ret = vfio_unregister_iommu_notifier(group, nb);
-		break;
-	default:
-		ret = -EINVAL;
-	}
-	return ret;
-}
-EXPORT_SYMBOL(vfio_unregister_notifier);
-
 /*
  * Module/class support
  */
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index a67130221151..25da02ca1568 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -33,6 +33,9 @@ enum vfio_iommu_notify_type {
 	VFIO_IOMMU_CONTAINER_CLOSE = 0,
 };
 
+/* events for register_notifier() */
+#define VFIO_IOMMU_NOTIFY_DMA_UNMAP BIT(0)
+
 /**
  * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks
  */
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 4d26e149db81..1f9fc7a9be9e 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -49,6 +49,7 @@ struct vfio_device {
 	unsigned int open_count;
 	struct completion comp;
 	struct list_head group_next;
+	struct notifier_block iommu_nb;
 };
 
 /**
@@ -65,6 +66,8 @@ struct vfio_device {
  * @match: Optional device name match callback (return: 0 for no-match, >0 for
  *         match, -errno for abort (ex. match with insufficient or incorrect
  *         additional args)
+ * @dma_unmap: Called when userspace unmaps IOVA from the container
+ *             this device is attached to.
  * @device_feature: Optional, fill in the VFIO_DEVICE_FEATURE ioctl
  */
 struct vfio_device_ops {
@@ -80,6 +83,7 @@ struct vfio_device_ops {
 	int	(*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma);
 	void	(*request)(struct vfio_device *vdev, unsigned int count);
 	int	(*match)(struct vfio_device *vdev, char *buf);
+	void	(*dma_unmap)(struct vfio_device *vdev, u64 iova, u64 length);
 	int	(*device_feature)(struct vfio_device *device, u32 flags,
 				  void __user *arg, size_t argsz);
 };
@@ -164,23 +168,6 @@ int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
 int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova,
 		void *data, size_t len, bool write);
 
-/* each type has independent events */
-enum vfio_notify_type {
-	VFIO_IOMMU_NOTIFY = 0,
-};
-
-/* events for VFIO_IOMMU_NOTIFY */
-#define VFIO_IOMMU_NOTIFY_DMA_UNMAP	BIT(0)
-
-int vfio_register_notifier(struct vfio_device *device,
-			   enum vfio_notify_type type,
-			   unsigned long *required_events,
-			   struct notifier_block *nb);
-int vfio_unregister_notifier(struct vfio_device *device,
-			     enum vfio_notify_type type,
-			     struct notifier_block *nb);
-
-
 /*
  * Sub-module helpers
  */
-- 
cgit 


From 8cfc5b60751bcf9b4c6bbab3f6a72d59e0156a89 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 19 Jul 2022 21:02:49 -0300
Subject: vfio: Replace the iommu notifier with a device list

Instead of bouncing the function call to the driver op through a blocking
notifier just have the iommu layer call it directly.

Register each device that is being attached to the iommu with the lower
driver which then threads them on a linked list and calls the appropriate
driver op at the right time.

Currently the only use is if dma_unmap() is defined.

Also, fully lock all the debugging tests on the pinning path that a
dma_unmap is registered.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/2-v4-681e038e30fd+78-vfio_unmap_notif_jgg@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/vfio.c             |  41 ++++------------
 drivers/vfio/vfio.h             |  12 ++---
 drivers/vfio/vfio_iommu_type1.c | 103 +++++++++++++++++++++++++---------------
 include/linux/vfio.h            |   2 +-
 4 files changed, 81 insertions(+), 77 deletions(-)

(limited to 'include')

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 83c375fa2421..b3ce8073cfb1 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -231,7 +231,7 @@ int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 {
 	struct vfio_iommu_driver *driver, *tmp;
 
-	if (WARN_ON(!ops->register_notifier != !ops->unregister_notifier))
+	if (WARN_ON(!ops->register_device != !ops->unregister_device))
 		return -EINVAL;
 
 	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
@@ -1082,17 +1082,6 @@ static void vfio_device_unassign_container(struct vfio_device *device)
 	up_write(&device->group->group_rwsem);
 }
 
-static int vfio_iommu_notifier(struct notifier_block *nb, unsigned long action,
-			       void *data)
-{
-	struct vfio_device *vfio_device =
-		container_of(nb, struct vfio_device, iommu_nb);
-	struct vfio_iommu_type1_dma_unmap *unmap = data;
-
-	vfio_device->ops->dma_unmap(vfio_device, unmap->iova, unmap->size);
-	return NOTIFY_OK;
-}
-
 static struct file *vfio_device_open(struct vfio_device *device)
 {
 	struct vfio_iommu_driver *iommu_driver;
@@ -1128,15 +1117,9 @@ static struct file *vfio_device_open(struct vfio_device *device)
 		}
 
 		iommu_driver = device->group->container->iommu_driver;
-		if (device->ops->dma_unmap && iommu_driver &&
-		    iommu_driver->ops->register_notifier) {
-			unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
-
-			device->iommu_nb.notifier_call = vfio_iommu_notifier;
-			iommu_driver->ops->register_notifier(
-				device->group->container->iommu_data, &events,
-				&device->iommu_nb);
-		}
+		if (iommu_driver && iommu_driver->ops->register_device)
+			iommu_driver->ops->register_device(
+				device->group->container->iommu_data, device);
 
 		up_read(&device->group->group_rwsem);
 	}
@@ -1176,11 +1159,9 @@ err_close_device:
 		device->ops->close_device(device);
 
 		iommu_driver = device->group->container->iommu_driver;
-		if (device->ops->dma_unmap && iommu_driver &&
-		    iommu_driver->ops->unregister_notifier)
-			iommu_driver->ops->unregister_notifier(
-				device->group->container->iommu_data,
-				&device->iommu_nb);
+		if (iommu_driver && iommu_driver->ops->unregister_device)
+			iommu_driver->ops->unregister_device(
+				device->group->container->iommu_data, device);
 	}
 err_undo_count:
 	up_read(&device->group->group_rwsem);
@@ -1385,11 +1366,9 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep)
 		device->ops->close_device(device);
 
 	iommu_driver = device->group->container->iommu_driver;
-	if (device->ops->dma_unmap && iommu_driver &&
-	    iommu_driver->ops->unregister_notifier)
-		iommu_driver->ops->unregister_notifier(
-			device->group->container->iommu_data,
-			&device->iommu_nb);
+	if (iommu_driver && iommu_driver->ops->unregister_device)
+		iommu_driver->ops->unregister_device(
+			device->group->container->iommu_data, device);
 	up_read(&device->group->group_rwsem);
 	device->open_count--;
 	if (device->open_count == 0)
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index 25da02ca1568..4a7db1f3c33e 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -33,9 +33,6 @@ enum vfio_iommu_notify_type {
 	VFIO_IOMMU_CONTAINER_CLOSE = 0,
 };
 
-/* events for register_notifier() */
-#define VFIO_IOMMU_NOTIFY_DMA_UNMAP BIT(0)
-
 /**
  * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks
  */
@@ -58,11 +55,10 @@ struct vfio_iommu_driver_ops {
 				     unsigned long *phys_pfn);
 	int		(*unpin_pages)(void *iommu_data,
 				       unsigned long *user_pfn, int npage);
-	int		(*register_notifier)(void *iommu_data,
-					     unsigned long *events,
-					     struct notifier_block *nb);
-	int		(*unregister_notifier)(void *iommu_data,
-					       struct notifier_block *nb);
+	void		(*register_device)(void *iommu_data,
+					   struct vfio_device *vdev);
+	void		(*unregister_device)(void *iommu_data,
+					     struct vfio_device *vdev);
 	int		(*dma_rw)(void *iommu_data, dma_addr_t user_iova,
 				  void *data, size_t count, bool write);
 	struct iommu_domain *(*group_iommu_domain)(void *iommu_data,
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index db24062fb343..026a1d2553a2 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -67,7 +67,8 @@ struct vfio_iommu {
 	struct list_head	iova_list;
 	struct mutex		lock;
 	struct rb_root		dma_list;
-	struct blocking_notifier_head notifier;
+	struct list_head	device_list;
+	struct mutex		device_list_lock;
 	unsigned int		dma_avail;
 	unsigned int		vaddr_invalid_count;
 	uint64_t		pgsize_bitmap;
@@ -865,8 +866,8 @@ again:
 		}
 	}
 
-	/* Fail if notifier list is empty */
-	if (!iommu->notifier.head) {
+	/* Fail if no dma_umap notifier is registered */
+	if (list_empty(&iommu->device_list)) {
 		ret = -EINVAL;
 		goto pin_done;
 	}
@@ -1287,6 +1288,35 @@ static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
 	return 0;
 }
 
+/*
+ * Notify VFIO drivers using vfio_register_emulated_iommu_dev() to invalidate
+ * and unmap iovas within the range we're about to unmap. Drivers MUST unpin
+ * pages in response to an invalidation.
+ */
+static void vfio_notify_dma_unmap(struct vfio_iommu *iommu,
+				  struct vfio_dma *dma)
+{
+	struct vfio_device *device;
+
+	if (list_empty(&iommu->device_list))
+		return;
+
+	/*
+	 * The device is expected to call vfio_unpin_pages() for any IOVA it has
+	 * pinned within the range. Since vfio_unpin_pages() will eventually
+	 * call back down to this code and try to obtain the iommu->lock we must
+	 * drop it.
+	 */
+	mutex_lock(&iommu->device_list_lock);
+	mutex_unlock(&iommu->lock);
+
+	list_for_each_entry(device, &iommu->device_list, iommu_entry)
+		device->ops->dma_unmap(device, dma->iova, dma->size);
+
+	mutex_unlock(&iommu->device_list_lock);
+	mutex_lock(&iommu->lock);
+}
+
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 			     struct vfio_iommu_type1_dma_unmap *unmap,
 			     struct vfio_bitmap *bitmap)
@@ -1400,8 +1430,6 @@ again:
 		}
 
 		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
-			struct vfio_iommu_type1_dma_unmap nb_unmap;
-
 			if (dma_last == dma) {
 				BUG_ON(++retries > 10);
 			} else {
@@ -1409,20 +1437,7 @@ again:
 				retries = 0;
 			}
 
-			nb_unmap.iova = dma->iova;
-			nb_unmap.size = dma->size;
-
-			/*
-			 * Notify anyone (mdev vendor drivers) to invalidate and
-			 * unmap iovas within the range we're about to unmap.
-			 * Vendor drivers MUST unpin pages in response to an
-			 * invalidation.
-			 */
-			mutex_unlock(&iommu->lock);
-			blocking_notifier_call_chain(&iommu->notifier,
-						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
-						    &nb_unmap);
-			mutex_lock(&iommu->lock);
+			vfio_notify_dma_unmap(iommu, dma);
 			goto again;
 		}
 
@@ -2475,7 +2490,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
 
 		if (list_empty(&iommu->emulated_iommu_groups) &&
 		    list_empty(&iommu->domain_list)) {
-			WARN_ON(iommu->notifier.head);
+			WARN_ON(!list_empty(&iommu->device_list));
 			vfio_iommu_unmap_unpin_all(iommu);
 		}
 		goto detach_group_done;
@@ -2507,7 +2522,8 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
 		if (list_empty(&domain->group_list)) {
 			if (list_is_singular(&iommu->domain_list)) {
 				if (list_empty(&iommu->emulated_iommu_groups)) {
-					WARN_ON(iommu->notifier.head);
+					WARN_ON(!list_empty(
+						&iommu->device_list));
 					vfio_iommu_unmap_unpin_all(iommu);
 				} else {
 					vfio_iommu_unmap_unpin_reaccount(iommu);
@@ -2568,7 +2584,8 @@ static void *vfio_iommu_type1_open(unsigned long arg)
 	iommu->dma_avail = dma_entry_limit;
 	iommu->container_open = true;
 	mutex_init(&iommu->lock);
-	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
+	mutex_init(&iommu->device_list_lock);
+	INIT_LIST_HEAD(&iommu->device_list);
 	init_waitqueue_head(&iommu->vaddr_wait);
 	iommu->pgsize_bitmap = PAGE_MASK;
 	INIT_LIST_HEAD(&iommu->emulated_iommu_groups);
@@ -3005,28 +3022,40 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 	}
 }
 
-static int vfio_iommu_type1_register_notifier(void *iommu_data,
-					      unsigned long *events,
-					      struct notifier_block *nb)
+static void vfio_iommu_type1_register_device(void *iommu_data,
+					     struct vfio_device *vdev)
 {
 	struct vfio_iommu *iommu = iommu_data;
 
-	/* clear known events */
-	*events &= ~VFIO_IOMMU_NOTIFY_DMA_UNMAP;
-
-	/* refuse to register if still events remaining */
-	if (*events)
-		return -EINVAL;
+	if (!vdev->ops->dma_unmap)
+		return;
 
-	return blocking_notifier_chain_register(&iommu->notifier, nb);
+	/*
+	 * list_empty(&iommu->device_list) is tested under the iommu->lock while
+	 * iteration for dma_unmap must be done under the device_list_lock.
+	 * Holding both locks here allows avoiding the device_list_lock in
+	 * several fast paths. See vfio_notify_dma_unmap()
+	 */
+	mutex_lock(&iommu->lock);
+	mutex_lock(&iommu->device_list_lock);
+	list_add(&vdev->iommu_entry, &iommu->device_list);
+	mutex_unlock(&iommu->device_list_lock);
+	mutex_unlock(&iommu->lock);
 }
 
-static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
-						struct notifier_block *nb)
+static void vfio_iommu_type1_unregister_device(void *iommu_data,
+					       struct vfio_device *vdev)
 {
 	struct vfio_iommu *iommu = iommu_data;
 
-	return blocking_notifier_chain_unregister(&iommu->notifier, nb);
+	if (!vdev->ops->dma_unmap)
+		return;
+
+	mutex_lock(&iommu->lock);
+	mutex_lock(&iommu->device_list_lock);
+	list_del(&vdev->iommu_entry);
+	mutex_unlock(&iommu->device_list_lock);
+	mutex_unlock(&iommu->lock);
 }
 
 static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
@@ -3160,8 +3189,8 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
 	.detach_group		= vfio_iommu_type1_detach_group,
 	.pin_pages		= vfio_iommu_type1_pin_pages,
 	.unpin_pages		= vfio_iommu_type1_unpin_pages,
-	.register_notifier	= vfio_iommu_type1_register_notifier,
-	.unregister_notifier	= vfio_iommu_type1_unregister_notifier,
+	.register_device	= vfio_iommu_type1_register_device,
+	.unregister_device	= vfio_iommu_type1_unregister_device,
 	.dma_rw			= vfio_iommu_type1_dma_rw,
 	.group_iommu_domain	= vfio_iommu_type1_group_iommu_domain,
 	.notify			= vfio_iommu_type1_notify,
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 1f9fc7a9be9e..19cefbaa3d06 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -49,7 +49,7 @@ struct vfio_device {
 	unsigned int open_count;
 	struct completion comp;
 	struct list_head group_next;
-	struct notifier_block iommu_nb;
+	struct list_head iommu_entry;
 };
 
 /**
-- 
cgit 


From 9cb61fda8c71baaff423fe5be2e609100860fa78 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Wed, 20 Jul 2022 08:52:20 -0700
Subject: bpf: Fix bpf_trampoline_{,un}link_cgroup_shim ifdef guards

They were updated in kernel/bpf/trampoline.c to fix another build
issue. We should to do the same for include/linux/bpf.h header.

Fixes: 3908fcddc65d ("bpf: fix lsm_cgroup build errors on esoteric configs")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20220720155220.4087433-1-sdf@google.com
---
 include/linux/bpf.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a5bf00649995..11950029284f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1254,9 +1254,6 @@ struct bpf_dummy_ops {
 int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 			    union bpf_attr __user *uattr);
 #endif
-int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
-				    int cgroup_atype);
-void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog);
 #else
 static inline const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
 {
@@ -1280,6 +1277,13 @@ static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map,
 {
 	return -EINVAL;
 }
+#endif
+
+#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM)
+int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
+				    int cgroup_atype);
+void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog);
+#else
 static inline int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
 						  int cgroup_atype)
 {
-- 
cgit 


From a1a5482a2c6e38a3ebed32e571625c56a8cc41a6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 17 Jun 2022 16:52:06 +0200
Subject: x86/extable: Fix ex_handler_msr() print condition

On Fri, Jun 17, 2022 at 02:08:52PM +0300, Stephane Eranian wrote:
> Some changes to the way invalid MSR accesses are reported by the
> kernel is causing some problems with messages printed on the
> console.
>
> We have seen several cases of ex_handler_msr() printing invalid MSR
> accesses once but the callstack multiple times causing confusion on
> the console.

> The problem here is that another earlier commit (5.13):
>
> a358f40600b3 ("once: implement DO_ONCE_LITE for non-fast-path "do once" functionality")
>
> Modifies all the pr_*_once() calls to always return true claiming
> that no caller is ever checking the return value of the functions.
>
> This is why we are seeing the callstack printed without the
> associated printk() msg.

Extract the ONCE_IF(cond) part into __ONCE_LTE_IF() and use that to
implement DO_ONCE_LITE_IF() and fix the extable code.

Fixes: a358f40600b3 ("once: implement DO_ONCE_LITE for non-fast-path "do once" functionality")
Reported-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Stephane Eranian <eranian@google.com>
Link: https://lkml.kernel.org/r/YqyVFsbviKjVGGZ9@worktop.programming.kicks-ass.net
---
 arch/x86/mm/extable.c     | 16 +++++++++-------
 include/linux/once_lite.h | 20 ++++++++++++++++----
 2 files changed, 25 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index dba2197c05c3..331310c29349 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -94,16 +94,18 @@ static bool ex_handler_copy(const struct exception_table_entry *fixup,
 static bool ex_handler_msr(const struct exception_table_entry *fixup,
 			   struct pt_regs *regs, bool wrmsr, bool safe, int reg)
 {
-	if (!safe && wrmsr &&
-	    pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
-			 (unsigned int)regs->cx, (unsigned int)regs->dx,
-			 (unsigned int)regs->ax,  regs->ip, (void *)regs->ip))
+	if (__ONCE_LITE_IF(!safe && wrmsr)) {
+		pr_warn("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
+			(unsigned int)regs->cx, (unsigned int)regs->dx,
+			(unsigned int)regs->ax,  regs->ip, (void *)regs->ip);
 		show_stack_regs(regs);
+	}
 
-	if (!safe && !wrmsr &&
-	    pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
-			 (unsigned int)regs->cx, regs->ip, (void *)regs->ip))
+	if (__ONCE_LITE_IF(!safe && !wrmsr)) {
+		pr_warn("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
+			(unsigned int)regs->cx, regs->ip, (void *)regs->ip);
 		show_stack_regs(regs);
+	}
 
 	if (!wrmsr) {
 		/* Pretend that the read succeeded and returned 0. */
diff --git a/include/linux/once_lite.h b/include/linux/once_lite.h
index 861e606b820f..b7bce4983638 100644
--- a/include/linux/once_lite.h
+++ b/include/linux/once_lite.h
@@ -9,15 +9,27 @@
  */
 #define DO_ONCE_LITE(func, ...)						\
 	DO_ONCE_LITE_IF(true, func, ##__VA_ARGS__)
-#define DO_ONCE_LITE_IF(condition, func, ...)				\
+
+#define __ONCE_LITE_IF(condition)					\
 	({								\
 		static bool __section(".data.once") __already_done;	\
-		bool __ret_do_once = !!(condition);			\
+		bool __ret_cond = !!(condition);			\
+		bool __ret_once = false;				\
 									\
-		if (unlikely(__ret_do_once && !__already_done)) {	\
+		if (unlikely(__ret_cond && !__already_done)) {		\
 			__already_done = true;				\
-			func(__VA_ARGS__);				\
+			__ret_once = true;				\
 		}							\
+		unlikely(__ret_once);					\
+	})
+
+#define DO_ONCE_LITE_IF(condition, func, ...)				\
+	({								\
+		bool __ret_do_once = !!(condition);			\
+									\
+		if (__ONCE_LITE_IF(__ret_do_once))			\
+			func(__VA_ARGS__);				\
+									\
 		unlikely(__ret_do_once);				\
 	})
 
-- 
cgit 


From 5588d628027092e66195097bdf6835ddf64418b3 Mon Sep 17 00:00:00 2001
From: Łukasz Spintzyk <lukasz.spintzyk@synaptics.com>
Date: Wed, 20 Jul 2022 08:05:18 +0200
Subject: net/cdc_ncm: Increase NTB max RX/TX values to 64kb
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

DisplayLink ethernet devices require NTB buffers larger then 32kb
in order to run with highest performance.

This patch is changing upper limit of the rx and tx buffers.
Those buffers are initialized with CDC_NCM_NTB_DEF_SIZE_RX and
CDC_NCM_NTB_DEF_SIZE_TX which is 16kb so by default no device is
affected by increased limit.

Rx and tx buffer is increased under two conditions:
 - Device need to advertise that it supports higher buffer size in
   dwNtbMaxInMaxSize and dwNtbMaxOutMaxSize.
 - cdc_ncm/rx_max and cdc_ncm/tx_max driver parameters must be adjusted
   with udev rule or ethtool.

Summary of testing and performance results:
Tests were performed on following devices:
 - DisplayLink DL-3xxx family device
 - DisplayLink DL-6xxx family device
 - ASUS USB-C2500 2.5G USB3 ethernet adapter
 - Plugable USB3 1G USB3 ethernet adapter
 - EDIMAX EU-4307 USB-C ethernet adapter
 - Dell DBQBCBC064 USB-C ethernet adapter

Performance measurements were done with:
 - iperf3 between two linux boxes
 - http://openspeedtest.com/ instance running on local test machine

Insights from tests results:
 - All except one from third party usb adapters were not affected by
   increased buffer size to their advertised dwNtbOutMaxSize and
   dwNtbInMaxSize.
   Devices were generally reaching 912-940Mbps both download and upload.

   Only EDIMAX adapter experienced decreased download size from
   929Mbps to 827Mbps with iper3, with openspeedtest decrease was from
   968Mbps to 886Mbps.

 - DisplayLink DL-3xxx family devices experienced performance increase
   with iperf3 download from 300Mbps to 870Mbps and
   upload from 782Mbps to 844Mbps.
   With openspeedtest download increased from 556Mbps to 873Mbps
   and upload from 727Mbps to 973Mbps

 - DiplayLink DL-6xxx family devices are not affected by
   increased buffer size.

Signed-off-by: Łukasz Spintzyk <lukasz.spintzyk@synaptics.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20220720060518.541-2-lukasz.spintzyk@synaptics.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/usb/cdc_ncm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h
index f7cb3ddce7fb..2d207cb4837d 100644
--- a/include/linux/usb/cdc_ncm.h
+++ b/include/linux/usb/cdc_ncm.h
@@ -53,8 +53,8 @@
 #define USB_CDC_NCM_NDP32_LENGTH_MIN		0x20
 
 /* Maximum NTB length */
-#define	CDC_NCM_NTB_MAX_SIZE_TX			32768	/* bytes */
-#define	CDC_NCM_NTB_MAX_SIZE_RX			32768	/* bytes */
+#define	CDC_NCM_NTB_MAX_SIZE_TX			65536	/* bytes */
+#define	CDC_NCM_NTB_MAX_SIZE_RX			65536	/* bytes */
 
 /* Initial NTB length */
 #define	CDC_NCM_NTB_DEF_SIZE_TX			16384	/* bytes */
-- 
cgit 


From d0b55afa47694f6f61b40f578ede7bde1648fe48 Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Wed, 6 Jul 2022 17:20:52 -0700
Subject: dmaengine: idxd: Correct IAX operation code names

Some IAX operation code nomenclatures are misleading or don't match with
others:

1. Operation code 0x4c is Zero Compress 32. IAX_OPCODE_DECOMP_32 is a
   misleading name. Change it to IAX_OPCODE_ZERO_COMP_32.
2. Operation code 0x4d is Zero Compress 16. IAX_OPCODE_DECOMP_16 is a
   misleading name. Change it to IAX_OPCODE_ZERO_COMP_16.
3. IAX_OPCDE_FIND_UNIQUE is corrected to match with other nomenclatures.

Co-developed-by: Li Zhang <li4.zhang@intel.com>
Signed-off-by: Li Zhang <li4.zhang@intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/20220707002052.1546361-1-fenghua.yu@intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/uapi/linux/idxd.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h
index bce7c43657d5..095299c75828 100644
--- a/include/uapi/linux/idxd.h
+++ b/include/uapi/linux/idxd.h
@@ -89,14 +89,14 @@ enum iax_opcode {
 	IAX_OPCODE_CRC64,
 	IAX_OPCODE_ZERO_DECOMP_32 = 0x48,
 	IAX_OPCODE_ZERO_DECOMP_16,
-	IAX_OPCODE_DECOMP_32 = 0x4c,
-	IAX_OPCODE_DECOMP_16,
+	IAX_OPCODE_ZERO_COMP_32 = 0x4c,
+	IAX_OPCODE_ZERO_COMP_16,
 	IAX_OPCODE_SCAN = 0x50,
 	IAX_OPCODE_SET_MEMBER,
 	IAX_OPCODE_EXTRACT,
 	IAX_OPCODE_SELECT,
 	IAX_OPCODE_RLE_BURST,
-	IAX_OPCDE_FIND_UNIQUE,
+	IAX_OPCODE_FIND_UNIQUE,
 	IAX_OPCODE_EXPAND,
 };
 
-- 
cgit 


From e0c7ea83f006ce8c3264ef8b6508a891d886ad4f Mon Sep 17 00:00:00 2001
From: Shengjiu Wang <shengjiu.wang@nxp.com>
Date: Thu, 7 Jul 2022 11:00:29 +0800
Subject: dmaengine: imx-sdma: Add FIFO stride support for multi FIFO script

The peripheral may have several FIFOs, but some case just select
some FIFOs from them for data transfer, which means FIFO0 and FIFO2
may be selected. So add FIFO address stride support, 0 means all FIFOs
are continuous, 1 means 1 word stride between FIFOs. All stride between
FIFOs should be same.

Another option words_per_fifo means how many audio channel data copied
to one FIFO one time, 1 means one channel per FIFO, 2 means 2 channels
per FIFO.

If 'n_fifos_src =  4' and 'words_per_fifo = 2', it means the first two
words(channels) fetch from FIFO0 and then jump to FIFO1 for next two words,
and so on after the last FIFO3 fetched, roll back to FIFO0.

Signed-off-by: Joy Zou <joy.zou@nxp.com>
Signed-off-by: Shengjiu Wang <shengjiu.wang@nxp.com>
Link: https://lore.kernel.org/r/1657162829-9273-1-git-send-email-shengjiu.wang@nxp.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/imx-sdma.c      | 27 +++++++++++++++++++++++++--
 include/linux/dma/imx-dma.h | 13 +++++++++++++
 2 files changed, 38 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c
index 8a5d4d4a4dff..e302089784ed 100644
--- a/drivers/dma/imx-sdma.c
+++ b/drivers/dma/imx-sdma.c
@@ -183,6 +183,8 @@
 				 BIT(DMA_DEV_TO_DEV))
 
 #define SDMA_WATERMARK_LEVEL_N_FIFOS	GENMASK(15, 12)
+#define SDMA_WATERMARK_LEVEL_OFF_FIFOS  GENMASK(19, 16)
+#define SDMA_WATERMARK_LEVEL_WORDS_PER_FIFO   GENMASK(31, 28)
 #define SDMA_WATERMARK_LEVEL_SW_DONE	BIT(23)
 
 #define SDMA_DONE0_CONFIG_DONE_SEL	BIT(7)
@@ -429,6 +431,9 @@ struct sdma_desc {
  * @n_fifos_src:	number of source device fifos
  * @n_fifos_dst:	number of destination device fifos
  * @sw_done:		software done flag
+ * @stride_fifos_src:	stride for source device FIFOs
+ * @stride_fifos_dst:	stride for destination device FIFOs
+ * @words_per_fifo:	copy number of words one time for one FIFO
  */
 struct sdma_channel {
 	struct virt_dma_chan		vc;
@@ -456,6 +461,9 @@ struct sdma_channel {
 	bool				is_ram_script;
 	unsigned int			n_fifos_src;
 	unsigned int			n_fifos_dst;
+	unsigned int			stride_fifos_src;
+	unsigned int			stride_fifos_dst;
+	unsigned int			words_per_fifo;
 	bool				sw_done;
 };
 
@@ -1245,17 +1253,29 @@ static void sdma_set_watermarklevel_for_p2p(struct sdma_channel *sdmac)
 static void sdma_set_watermarklevel_for_sais(struct sdma_channel *sdmac)
 {
 	unsigned int n_fifos;
+	unsigned int stride_fifos;
+	unsigned int words_per_fifo;
 
 	if (sdmac->sw_done)
 		sdmac->watermark_level |= SDMA_WATERMARK_LEVEL_SW_DONE;
 
-	if (sdmac->direction == DMA_DEV_TO_MEM)
+	if (sdmac->direction == DMA_DEV_TO_MEM) {
 		n_fifos = sdmac->n_fifos_src;
-	else
+		stride_fifos = sdmac->stride_fifos_src;
+	} else {
 		n_fifos = sdmac->n_fifos_dst;
+		stride_fifos = sdmac->stride_fifos_dst;
+	}
+
+	words_per_fifo = sdmac->words_per_fifo;
 
 	sdmac->watermark_level |=
 			FIELD_PREP(SDMA_WATERMARK_LEVEL_N_FIFOS, n_fifos);
+	sdmac->watermark_level |=
+			FIELD_PREP(SDMA_WATERMARK_LEVEL_OFF_FIFOS, stride_fifos);
+	if (words_per_fifo)
+		sdmac->watermark_level |=
+			FIELD_PREP(SDMA_WATERMARK_LEVEL_WORDS_PER_FIFO, (words_per_fifo - 1));
 }
 
 static int sdma_config_channel(struct dma_chan *chan)
@@ -1769,6 +1789,9 @@ static int sdma_config(struct dma_chan *chan,
 		}
 		sdmac->n_fifos_src = sdmacfg->n_fifos_src;
 		sdmac->n_fifos_dst = sdmacfg->n_fifos_dst;
+		sdmac->stride_fifos_src = sdmacfg->stride_fifos_src;
+		sdmac->stride_fifos_dst = sdmacfg->stride_fifos_dst;
+		sdmac->words_per_fifo = sdmacfg->words_per_fifo;
 		sdmac->sw_done = sdmacfg->sw_done;
 	}
 
diff --git a/include/linux/dma/imx-dma.h b/include/linux/dma/imx-dma.h
index 8887762360d4..f487a4fa103a 100644
--- a/include/linux/dma/imx-dma.h
+++ b/include/linux/dma/imx-dma.h
@@ -70,6 +70,16 @@ static inline int imx_dma_is_general_purpose(struct dma_chan *chan)
  * struct sdma_peripheral_config - SDMA config for audio
  * @n_fifos_src: Number of FIFOs for recording
  * @n_fifos_dst: Number of FIFOs for playback
+ * @stride_fifos_src: FIFO address stride for recording, 0 means all FIFOs are
+ *                    continuous, 1 means 1 word stride between FIFOs. All stride
+ *                    between FIFOs should be same.
+ * @stride_fifos_dst: FIFO address stride for playback
+ * @words_per_fifo: numbers of words per FIFO fetch/fill, 1 means
+ *                  one channel per FIFO, 2 means 2 channels per FIFO..
+ *                  If 'n_fifos_src =  4' and 'words_per_fifo = 2', it
+ *                  means the first two words(channels) fetch from FIFO0
+ *                  and then jump to FIFO1 for next two words, and so on
+ *                  after the last FIFO3 fetched, roll back to FIFO0.
  * @sw_done: Use software done. Needed for PDM (micfil)
  *
  * Some i.MX Audio devices (SAI, micfil) have multiple successive FIFO
@@ -82,6 +92,9 @@ static inline int imx_dma_is_general_purpose(struct dma_chan *chan)
 struct sdma_peripheral_config {
 	int n_fifos_src;
 	int n_fifos_dst;
+	int stride_fifos_src;
+	int stride_fifos_dst;
+	int words_per_fifo;
 	bool sw_done;
 };
 
-- 
cgit 


From 974854ab0728532600c72e41a44d6ce1cf8f20a4 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 12 Jul 2022 18:37:54 -0700
Subject: cxl/acpi: Track CXL resources in iomem_resource

Recall that CXL capable address ranges, on ACPI platforms, are published
in the CEDT.CFMWS (CXL Early Discovery Table: CXL Fixed Memory Window
Structures). These windows represent both the actively mapped capacity
and the potential address space that can be dynamically assigned to a
new CXL decode configuration (region / interleave-set).

CXL endpoints like DDR DIMMs can be mapped at any physical address
including 0 and legacy ranges.

There is an expectation and requirement that the /proc/iomem interface
and the iomem_resource tree in the kernel reflect the full set of
platform address ranges. I.e. that every address range that platform
firmware and bus drivers enumerate be reflected as an iomem_resource
entry. The hard requirement to do this for CXL arises from the fact that
facilities like CONFIG_DEVICE_PRIVATE expect to be able to treat empty
iomem_resource ranges as free for software to use as proxy address
space. Without CXL publishing its potential address ranges in
iomem_resource, the CONFIG_DEVICE_PRIVATE mechanism may inadvertently
steal capacity reserved for runtime provisioning of new CXL regions.

So, iomem_resource needs to know about both active and potential CXL
resource ranges. The active CXL resources might already be reflected in
iomem_resource as "System RAM". insert_resource_expand_to_fit() handles
re-parenting "System RAM" underneath a CXL window.

The "_expand_to_fit()" behavior handles cases where a CXL window is not
a strict superset of an existing entry in the iomem_resource tree. The
"_expand_to_fit()" behavior is acceptable from the perspective of
resource allocation. The expansion happens because a conflicting
resource range is already populated, which means the resource boundary
expansion does not result in any additional free CXL address space being
made available. CXL address space allocation is always bounded by the
orginal unexpanded address range.

However, the potential for expansion does mean that something like
walk_iomem_res_desc(IORES_DESC_CXL...) can only return fuzzy answers on
corner case platforms that cause the resource tree to expand a CXL
window resource over a range that is not decoded by CXL. This would be
an odd platform configuration, but if it becomes a problem in practice
the CXL subsytem could just publish an API that returns definitive
answers.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/165784325943.1758207.5310344844375305118.stgit@dwillia2-xfh.jf.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/acpi.c     | 144 +++++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/ioport.h |   1 +
 kernel/resource.c      |   7 +++
 3 files changed, 149 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
index 62bf22ffb7aa..e2b6cbd04846 100644
--- a/drivers/cxl/acpi.c
+++ b/drivers/cxl/acpi.c
@@ -73,6 +73,8 @@ static int cxl_acpi_cfmws_verify(struct device *dev,
 struct cxl_cfmws_context {
 	struct device *dev;
 	struct cxl_port *root_port;
+	struct resource *cxl_res;
+	int id;
 };
 
 static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
@@ -81,11 +83,13 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
 	int target_map[CXL_DECODER_MAX_INTERLEAVE];
 	struct cxl_cfmws_context *ctx = arg;
 	struct cxl_port *root_port = ctx->root_port;
+	struct resource *cxl_res = ctx->cxl_res;
 	struct cxl_switch_decoder *cxlsd;
 	struct device *dev = ctx->dev;
 	struct acpi_cedt_cfmws *cfmws;
 	struct cxl_decoder *cxld;
 	unsigned int ways, i, ig;
+	struct resource *res;
 	int rc;
 
 	cfmws = (struct acpi_cedt_cfmws *) header;
@@ -107,6 +111,23 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
 	for (i = 0; i < ways; i++)
 		target_map[i] = cfmws->interleave_targets[i];
 
+	res = kzalloc(sizeof(*res), GFP_KERNEL);
+	if (!res)
+		return -ENOMEM;
+
+	res->name = kasprintf(GFP_KERNEL, "CXL Window %d", ctx->id++);
+	if (!res->name)
+		goto err_name;
+
+	res->start = cfmws->base_hpa;
+	res->end = cfmws->base_hpa + cfmws->window_size - 1;
+	res->flags = IORESOURCE_MEM;
+
+	/* add to the local resource tracking to establish a sort order */
+	rc = insert_resource(cxl_res, res);
+	if (rc)
+		goto err_insert;
+
 	cxlsd = cxl_root_decoder_alloc(root_port, ways);
 	if (IS_ERR(cxld))
 		return 0;
@@ -115,8 +136,8 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
 	cxld->flags = cfmws_to_decoder_flags(cfmws->restrictions);
 	cxld->target_type = CXL_DECODER_EXPANDER;
 	cxld->hpa_range = (struct range) {
-		.start = cfmws->base_hpa,
-		.end = cfmws->base_hpa + cfmws->window_size - 1,
+		.start = res->start,
+		.end = res->end,
 	};
 	cxld->interleave_ways = ways;
 	cxld->interleave_granularity = ig;
@@ -137,6 +158,12 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
 		cxld->hpa_range.start, cxld->hpa_range.end);
 
 	return 0;
+
+err_insert:
+	kfree(res->name);
+err_name:
+	kfree(res);
+	return -ENOMEM;
 }
 
 __mock struct acpi_device *to_cxl_host_bridge(struct device *host,
@@ -291,9 +318,101 @@ static void cxl_acpi_lock_reset_class(void *dev)
 	device_lock_reset_class(dev);
 }
 
+static void del_cxl_resource(struct resource *res)
+{
+	kfree(res->name);
+	kfree(res);
+}
+
+static void cxl_set_public_resource(struct resource *priv, struct resource *pub)
+{
+	priv->desc = (unsigned long) pub;
+}
+
+static struct resource *cxl_get_public_resource(struct resource *priv)
+{
+	return (struct resource *) priv->desc;
+}
+
+static void remove_cxl_resources(void *data)
+{
+	struct resource *res, *next, *cxl = data;
+
+	for (res = cxl->child; res; res = next) {
+		struct resource *victim = cxl_get_public_resource(res);
+
+		next = res->sibling;
+		remove_resource(res);
+
+		if (victim) {
+			remove_resource(victim);
+			kfree(victim);
+		}
+
+		del_cxl_resource(res);
+	}
+}
+
+/**
+ * add_cxl_resources() - reflect CXL fixed memory windows in iomem_resource
+ * @cxl_res: A standalone resource tree where each CXL window is a sibling
+ *
+ * Walk each CXL window in @cxl_res and add it to iomem_resource potentially
+ * expanding its boundaries to ensure that any conflicting resources become
+ * children. If a window is expanded it may then conflict with a another window
+ * entry and require the window to be truncated or trimmed. Consider this
+ * situation:
+ *
+ * |-- "CXL Window 0" --||----- "CXL Window 1" -----|
+ * |--------------- "System RAM" -------------|
+ *
+ * ...where platform firmware has established as System RAM resource across 2
+ * windows, but has left some portion of window 1 for dynamic CXL region
+ * provisioning. In this case "Window 0" will span the entirety of the "System
+ * RAM" span, and "CXL Window 1" is truncated to the remaining tail past the end
+ * of that "System RAM" resource.
+ */
+static int add_cxl_resources(struct resource *cxl_res)
+{
+	struct resource *res, *new, *next;
+
+	for (res = cxl_res->child; res; res = next) {
+		new = kzalloc(sizeof(*new), GFP_KERNEL);
+		if (!new)
+			return -ENOMEM;
+		new->name = res->name;
+		new->start = res->start;
+		new->end = res->end;
+		new->flags = IORESOURCE_MEM;
+		new->desc = IORES_DESC_CXL;
+
+		/*
+		 * Record the public resource in the private cxl_res tree for
+		 * later removal.
+		 */
+		cxl_set_public_resource(res, new);
+
+		insert_resource_expand_to_fit(&iomem_resource, new);
+
+		next = res->sibling;
+		while (next && resource_overlaps(new, next)) {
+			if (resource_contains(new, next)) {
+				struct resource *_next = next->sibling;
+
+				remove_resource(next);
+				del_cxl_resource(next);
+				next = _next;
+			} else
+				next->start = new->end + 1;
+		}
+	}
+	return 0;
+}
+
 static int cxl_acpi_probe(struct platform_device *pdev)
 {
 	int rc;
+	struct resource *cxl_res;
 	struct cxl_port *root_port;
 	struct device *host = &pdev->dev;
 	struct acpi_device *adev = ACPI_COMPANION(host);
@@ -305,6 +424,14 @@ static int cxl_acpi_probe(struct platform_device *pdev)
 	if (rc)
 		return rc;
 
+	cxl_res = devm_kzalloc(host, sizeof(*cxl_res), GFP_KERNEL);
+	if (!cxl_res)
+		return -ENOMEM;
+	cxl_res->name = "CXL mem";
+	cxl_res->start = 0;
+	cxl_res->end = -1;
+	cxl_res->flags = IORESOURCE_MEM;
+
 	root_port = devm_cxl_add_port(host, host, CXL_RESOURCE_NONE, NULL);
 	if (IS_ERR(root_port))
 		return PTR_ERR(root_port);
@@ -315,11 +442,22 @@ static int cxl_acpi_probe(struct platform_device *pdev)
 	if (rc < 0)
 		return rc;
 
+	rc = devm_add_action_or_reset(host, remove_cxl_resources, cxl_res);
+	if (rc)
+		return rc;
+
 	ctx = (struct cxl_cfmws_context) {
 		.dev = host,
 		.root_port = root_port,
+		.cxl_res = cxl_res,
 	};
-	acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, cxl_parse_cfmws, &ctx);
+	rc = acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, cxl_parse_cfmws, &ctx);
+	if (rc < 0)
+		return -ENXIO;
+
+	rc = add_cxl_resources(cxl_res);
+	if (rc)
+		return rc;
 
 	/*
 	 * Root level scanned with host-bridge as dports, now scan host-bridges
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index ec5f71f7135b..79d1ad6d6275 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -141,6 +141,7 @@ enum {
 	IORES_DESC_DEVICE_PRIVATE_MEMORY	= 6,
 	IORES_DESC_RESERVED			= 7,
 	IORES_DESC_SOFT_RESERVED		= 8,
+	IORES_DESC_CXL				= 9,
 };
 
 /*
diff --git a/kernel/resource.c b/kernel/resource.c
index 34eaee179689..53a534db350e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -891,6 +891,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
 	}
 	write_unlock(&resource_lock);
 }
+/*
+ * Not for general consumption, only early boot memory map parsing, PCI
+ * resource discovery, and late discovery of CXL resources are expected
+ * to use this interface. The former are built-in and only the latter,
+ * CXL, is a module.
+ */
+EXPORT_SYMBOL_NS_GPL(insert_resource_expand_to_fit, CXL);
 
 /**
  * remove_resource - Remove a resource in the resource tree
-- 
cgit 


From 5f8bcc837a9640ba4bf5e7b1d7f9b254ea029f47 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 21 Jul 2022 15:09:10 +0200
Subject: ublk: remove UBLK_IO_F_PREFLUSH

REQ_PREFLUSH is turned into REQ_OP_FLUSH by the flush state machine
and thus never seen by a blk-mq based driver.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20220721130916.1869719-3-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c      | 3 ---
 include/uapi/linux/ublk_cmd.h | 1 -
 2 files changed, 4 deletions(-)

(limited to 'include')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index b90481b295a7..07913b5bccd9 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -392,9 +392,6 @@ static inline unsigned int ublk_req_build_flags(struct request *req)
 	if (req->cmd_flags & REQ_FUA)
 		flags |= UBLK_IO_F_FUA;
 
-	if (req->cmd_flags & REQ_PREFLUSH)
-		flags |= UBLK_IO_F_PREFLUSH;
-
 	if (req->cmd_flags & REQ_NOUNMAP)
 		flags |= UBLK_IO_F_NOUNMAP;
 
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index d6879eea2fde..917580b34198 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -107,7 +107,6 @@ struct ublksrv_ctrl_dev_info {
 #define		UBLK_IO_F_FAILFAST_DRIVER	(1U << 10)
 #define		UBLK_IO_F_META			(1U << 11)
 #define		UBLK_IO_F_FUA			(1U << 13)
-#define		UBLK_IO_F_PREFLUSH		(1U << 14)
 #define		UBLK_IO_F_NOUNMAP		(1U << 15)
 #define		UBLK_IO_F_SWAP			(1U << 16)
 
-- 
cgit 


From 1e9fdf21a4339b102539f476a9842e7526c01939 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 8 Jul 2022 09:18:03 +0200
Subject: mmu_gather: Remove per arch tlb_{start,end}_vma()

Scattered across the archs are 3 basic forms of tlb_{start,end}_vma().
Provide two new MMU_GATHER_knobs to enumerate them and remove the per
arch tlb_{start,end}_vma() implementations.

 - MMU_GATHER_NO_FLUSH_CACHE indicates the arch has flush_cache_range()
   but does *NOT* want to call it for each VMA.

 - MMU_GATHER_MERGE_VMAS indicates the arch wants to merge the
   invalidate across multiple VMAs if possible.

With these it is possible to capture the three forms:

  1) empty stubs;
     select MMU_GATHER_NO_FLUSH_CACHE and MMU_GATHER_MERGE_VMAS

  2) start: flush_cache_range(), end: empty;
     select MMU_GATHER_MERGE_VMAS

  3) start: flush_cache_range(), end: flush_tlb_range();
     default

Obviously, if the architecture does not have flush_cache_range() then
it also doesn't need to select MMU_GATHER_NO_FLUSH_CACHE.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Will Deacon <will@kernel.org>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/Kconfig                     |  7 +++++++
 arch/csky/include/asm/tlb.h      | 13 -------------
 arch/loongarch/Kconfig           |  1 +
 arch/loongarch/include/asm/tlb.h | 10 ----------
 arch/powerpc/Kconfig             |  1 +
 arch/powerpc/include/asm/tlb.h   |  2 --
 arch/s390/Kconfig                |  1 +
 arch/s390/include/asm/tlb.h      |  3 ---
 arch/sparc/Kconfig               |  2 ++
 arch/sparc/include/asm/tlb_64.h  |  2 --
 arch/x86/Kconfig                 |  1 +
 arch/x86/include/asm/tlb.h       |  3 ---
 include/asm-generic/tlb.h        | 21 +++++++++++++++++++--
 13 files changed, 32 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/arch/Kconfig b/arch/Kconfig
index fcf9a41a4ef5..71b9272acb28 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -438,6 +438,13 @@ config MMU_GATHER_PAGE_SIZE
 
 config MMU_GATHER_NO_RANGE
 	bool
+	select MMU_GATHER_MERGE_VMAS
+
+config MMU_GATHER_NO_FLUSH_CACHE
+	bool
+
+config MMU_GATHER_MERGE_VMAS
+	bool
 
 config MMU_GATHER_NO_GATHER
 	bool
diff --git a/arch/csky/include/asm/tlb.h b/arch/csky/include/asm/tlb.h
index 3498e65f59f8..750d041938d8 100644
--- a/arch/csky/include/asm/tlb.h
+++ b/arch/csky/include/asm/tlb.h
@@ -4,19 +4,6 @@
 #define __ASM_CSKY_TLB_H
 
 #include <asm/cacheflush.h>
-
-#define tlb_start_vma(tlb, vma) \
-	do { \
-		if (!(tlb)->fullmm) \
-			flush_cache_range(vma, (vma)->vm_start, (vma)->vm_end); \
-	}  while (0)
-
-#define tlb_end_vma(tlb, vma) \
-	do { \
-		if (!(tlb)->fullmm) \
-			flush_tlb_range(vma, (vma)->vm_start, (vma)->vm_end); \
-	}  while (0)
-
 #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
 
 #include <asm-generic/tlb.h>
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 53a912befb62..b57daee98b89 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -108,6 +108,7 @@ config LOONGARCH
 	select TRACE_IRQFLAGS_SUPPORT
 	select USE_PERCPU_NUMA_NODE_ID
 	select ZONE_DMA32
+	select MMU_GATHER_MERGE_VMAS if MMU
 
 config 32BIT
 	bool
diff --git a/arch/loongarch/include/asm/tlb.h b/arch/loongarch/include/asm/tlb.h
index 4f629ae9d5a9..dd24f5898f65 100644
--- a/arch/loongarch/include/asm/tlb.h
+++ b/arch/loongarch/include/asm/tlb.h
@@ -137,16 +137,6 @@ static inline void invtlb_all(u32 op, u32 info, u64 addr)
 		);
 }
 
-/*
- * LoongArch doesn't need any special per-pte or per-vma handling, except
- * we need to flush cache for area to be unmapped.
- */
-#define tlb_start_vma(tlb, vma)					\
-	do {							\
-		if (!(tlb)->fullmm)				\
-			flush_cache_range(vma, vma->vm_start, vma->vm_end); \
-	}  while (0)
-#define tlb_end_vma(tlb, vma) do { } while (0)
 #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
 
 static void tlb_flush(struct mmu_gather *tlb);
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 7aa12e88c580..c235648fae23 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -256,6 +256,7 @@ config PPC
 	select IRQ_FORCED_THREADING
 	select MMU_GATHER_PAGE_SIZE
 	select MMU_GATHER_RCU_TABLE_FREE
+	select MMU_GATHER_MERGE_VMAS
 	select MODULES_USE_ELF_RELA
 	select NEED_DMA_MAP_STATE		if PPC64 || NOT_COHERENT_CACHE
 	select NEED_PER_CPU_EMBED_FIRST_CHUNK	if PPC64
diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index 09a9ae5f3656..b3de6102a907 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -19,8 +19,6 @@
 
 #include <linux/pagemap.h>
 
-#define tlb_start_vma(tlb, vma)	do { } while (0)
-#define tlb_end_vma(tlb, vma)	do { } while (0)
 #define __tlb_remove_tlb_entry	__tlb_remove_tlb_entry
 
 #define tlb_flush tlb_flush
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 8cd9e56c629b..5a1a8dfda6f8 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -204,6 +204,7 @@ config S390
 	select IOMMU_SUPPORT		if PCI
 	select MMU_GATHER_NO_GATHER
 	select MMU_GATHER_RCU_TABLE_FREE
+	select MMU_GATHER_MERGE_VMAS
 	select MODULES_USE_ELF_RELA
 	select NEED_DMA_MAP_STATE	if PCI
 	select NEED_SG_DMA_LENGTH	if PCI
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index fe6407f0eb1b..3a5c8fb590e5 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -27,9 +27,6 @@ static inline void tlb_flush(struct mmu_gather *tlb);
 static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
 					  struct page *page, int page_size);
 
-#define tlb_start_vma(tlb, vma)			do { } while (0)
-#define tlb_end_vma(tlb, vma)			do { } while (0)
-
 #define tlb_flush tlb_flush
 #define pte_free_tlb pte_free_tlb
 #define pmd_free_tlb pmd_free_tlb
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index ba449c47effd..4f7d1dfbc608 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -67,6 +67,8 @@ config SPARC64
 	select HAVE_KRETPROBES
 	select HAVE_KPROBES
 	select MMU_GATHER_RCU_TABLE_FREE if SMP
+	select MMU_GATHER_MERGE_VMAS
+	select MMU_GATHER_NO_FLUSH_CACHE
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE_MCOUNT_RECORD
diff --git a/arch/sparc/include/asm/tlb_64.h b/arch/sparc/include/asm/tlb_64.h
index 779a5a0f0608..3037187482db 100644
--- a/arch/sparc/include/asm/tlb_64.h
+++ b/arch/sparc/include/asm/tlb_64.h
@@ -22,8 +22,6 @@ void smp_flush_tlb_mm(struct mm_struct *mm);
 void __flush_tlb_pending(unsigned long, unsigned long, unsigned long *);
 void flush_tlb_pending(void);
 
-#define tlb_start_vma(tlb, vma) do { } while (0)
-#define tlb_end_vma(tlb, vma)	do { } while (0)
 #define tlb_flush(tlb)	flush_tlb_pending()
 
 /*
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e58798f636d4..7fff10e15969 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -245,6 +245,7 @@ config X86
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select MMU_GATHER_RCU_TABLE_FREE	if PARAVIRT
+	select MMU_GATHER_MERGE_VMAS
 	select HAVE_POSIX_CPU_TIMERS_TASK_WORK
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE		if UNWINDER_ORC || STACK_VALIDATION
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 1bfe979bb9bc..580636cdc257 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -2,9 +2,6 @@
 #ifndef _ASM_X86_TLB_H
 #define _ASM_X86_TLB_H
 
-#define tlb_start_vma(tlb, vma) do { } while (0)
-#define tlb_end_vma(tlb, vma) do { } while (0)
-
 #define tlb_flush tlb_flush
 static inline void tlb_flush(struct mmu_gather *tlb);
 
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index ff3e82553a76..c1f03c1acbfc 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -158,9 +158,24 @@
  *  Useful if your architecture doesn't use IPIs for remote TLB invalidates
  *  and therefore doesn't naturally serialize with software page-table walkers.
  *
+ *  MMU_GATHER_NO_FLUSH_CACHE
+ *
+ *  Indicates the architecture has flush_cache_range() but it needs *NOT* be called
+ *  before unmapping a VMA.
+ *
+ *  NOTE: strictly speaking we shouldn't have this knob and instead rely on
+ *	  flush_cache_range() being a NOP, except Sparc64 seems to be
+ *	  different here.
+ *
+ *  MMU_GATHER_MERGE_VMAS
+ *
+ *  Indicates the architecture wants to merge ranges over VMAs; typical when
+ *  multiple range invalidates are more expensive than a full invalidate.
+ *
  *  MMU_GATHER_NO_RANGE
  *
- *  Use this if your architecture lacks an efficient flush_tlb_range().
+ *  Use this if your architecture lacks an efficient flush_tlb_range(). This
+ *  option implies MMU_GATHER_MERGE_VMAS above.
  *
  *  MMU_GATHER_NO_GATHER
  *
@@ -493,14 +508,16 @@ static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *
 		return;
 
 	tlb_update_vma_flags(tlb, vma);
+#ifndef CONFIG_MMU_GATHER_NO_FLUSH_CACHE
 	flush_cache_range(vma, vma->vm_start, vma->vm_end);
+#endif
 }
 #endif
 
 #ifndef tlb_end_vma
 static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
 {
-	if (tlb->fullmm)
+	if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS))
 		return;
 
 	/*
-- 
cgit 


From 18ba064e42df3661e196ab58a23931fc732a420b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 8 Jul 2022 09:18:05 +0200
Subject: mmu_gather: Let there be one tlb_{start,end}_vma() implementation

Now that architectures are no longer allowed to override
tlb_{start,end}_vma() re-arrange code so that there is only one
implementation for each of these functions.

This much simplifies trying to figure out what they actually do.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Will Deacon <will@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-generic/tlb.h | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index c1f03c1acbfc..897ca66338d5 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -349,8 +349,8 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb)
 
 #ifdef CONFIG_MMU_GATHER_NO_RANGE
 
-#if defined(tlb_flush) || defined(tlb_start_vma) || defined(tlb_end_vma)
-#error MMU_GATHER_NO_RANGE relies on default tlb_flush(), tlb_start_vma() and tlb_end_vma()
+#if defined(tlb_flush)
+#error MMU_GATHER_NO_RANGE relies on default tlb_flush()
 #endif
 
 /*
@@ -370,17 +370,10 @@ static inline void tlb_flush(struct mmu_gather *tlb)
 static inline void
 tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { }
 
-#define tlb_end_vma tlb_end_vma
-static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { }
-
 #else /* CONFIG_MMU_GATHER_NO_RANGE */
 
 #ifndef tlb_flush
 
-#if defined(tlb_start_vma) || defined(tlb_end_vma)
-#error Default tlb_flush() relies on default tlb_start_vma() and tlb_end_vma()
-#endif
-
 /*
  * When an architecture does not provide its own tlb_flush() implementation
  * but does have a reasonably efficient flush_vma_range() implementation
@@ -501,7 +494,6 @@ static inline unsigned long tlb_get_unmap_size(struct mmu_gather *tlb)
  * case where we're doing a full MM flush.  When we're doing a munmap,
  * the vmas are adjusted to only cover the region to be torn down.
  */
-#ifndef tlb_start_vma
 static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
 {
 	if (tlb->fullmm)
@@ -512,9 +504,7 @@ static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *
 	flush_cache_range(vma, vma->vm_start, vma->vm_end);
 #endif
 }
-#endif
 
-#ifndef tlb_end_vma
 static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
 {
 	if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS))
@@ -528,7 +518,6 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
 	 */
 	tlb_flush_mmu_tlbonly(tlb);
 }
-#endif
 
 /*
  * tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end,
-- 
cgit 


From b67fbebd4cf980aecbcc750e1462128bffe8ae15 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 8 Jul 2022 09:18:06 +0200
Subject: mmu_gather: Force tlb-flush VM_PFNMAP vmas

Jann reported a race between munmap() and unmap_mapping_range(), where
unmap_mapping_range() will no-op once unmap_vmas() has unlinked the
VMA; however munmap() will not yet have invalidated the TLBs.

Therefore unmap_mapping_range() will complete while there are still
(stale) TLB entries for the specified range.

Mitigate this by force flushing TLBs for VM_PFNMAP ranges.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Will Deacon <will@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-generic/tlb.h | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 897ca66338d5..cb2167c89eee 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -303,6 +303,7 @@ struct mmu_gather {
 	 */
 	unsigned int		vma_exec : 1;
 	unsigned int		vma_huge : 1;
+	unsigned int		vma_pfn  : 1;
 
 	unsigned int		batch_count;
 
@@ -373,7 +374,6 @@ tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { }
 #else /* CONFIG_MMU_GATHER_NO_RANGE */
 
 #ifndef tlb_flush
-
 /*
  * When an architecture does not provide its own tlb_flush() implementation
  * but does have a reasonably efficient flush_vma_range() implementation
@@ -393,6 +393,9 @@ static inline void tlb_flush(struct mmu_gather *tlb)
 		flush_tlb_range(&vma, tlb->start, tlb->end);
 	}
 }
+#endif
+
+#endif /* CONFIG_MMU_GATHER_NO_RANGE */
 
 static inline void
 tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma)
@@ -410,17 +413,9 @@ tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma)
 	 */
 	tlb->vma_huge = is_vm_hugetlb_page(vma);
 	tlb->vma_exec = !!(vma->vm_flags & VM_EXEC);
+	tlb->vma_pfn  = !!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP));
 }
 
-#else
-
-static inline void
-tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { }
-
-#endif
-
-#endif /* CONFIG_MMU_GATHER_NO_RANGE */
-
 static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
 {
 	/*
@@ -507,16 +502,22 @@ static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *
 
 static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
 {
-	if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS))
+	if (tlb->fullmm)
 		return;
 
 	/*
-	 * Do a TLB flush and reset the range at VMA boundaries; this avoids
-	 * the ranges growing with the unused space between consecutive VMAs,
-	 * but also the mmu_gather::vma_* flags from tlb_start_vma() rely on
-	 * this.
+	 * VM_PFNMAP is more fragile because the core mm will not track the
+	 * page mapcount -- there might not be page-frames for these PFNs after
+	 * all. Force flush TLBs for such ranges to avoid munmap() vs
+	 * unmap_mapping_range() races.
 	 */
-	tlb_flush_mmu_tlbonly(tlb);
+	if (tlb->vma_pfn || !IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) {
+		/*
+		 * Do a TLB flush and reset the range at VMA boundaries; this avoids
+		 * the ranges growing with the unused space between consecutive VMAs.
+		 */
+		tlb_flush_mmu_tlbonly(tlb);
+	}
 }
 
 /*
-- 
cgit 


From 877afadad2dce8aae1f2aad8ce47e072d4f6165e Mon Sep 17 00:00:00 2001
From: Schspa Shi <schspa@gmail.com>
Date: Fri, 3 Jun 2022 16:19:14 +0800
Subject: Bluetooth: When HCI work queue is drained, only queue chained work

The HCI command, event, and data packet processing workqueue is drained
to avoid deadlock in commit
76727c02c1e1 ("Bluetooth: Call drain_workqueue() before resetting state").

There is another delayed work, which will queue command to this drained
workqueue. Which results in the following error report:

Bluetooth: hci2: command 0x040f tx timeout
WARNING: CPU: 1 PID: 18374 at kernel/workqueue.c:1438 __queue_work+0xdad/0x1140
Workqueue: events hci_cmd_timeout
RIP: 0010:__queue_work+0xdad/0x1140
RSP: 0000:ffffc90002cffc60 EFLAGS: 00010093
RAX: 0000000000000000 RBX: ffff8880b9d3ec00 RCX: 0000000000000000
RDX: ffff888024ba0000 RSI: ffffffff814e048d RDI: ffff8880b9d3ec08
RBP: 0000000000000008 R08: 0000000000000000 R09: 00000000b9d39700
R10: ffffffff814f73c6 R11: 0000000000000000 R12: ffff88807cce4c60
R13: 0000000000000000 R14: ffff8880796d8800 R15: ffff8880796d8800
FS:  0000000000000000(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000000c0174b4000 CR3: 000000007cae9000 CR4: 00000000003506e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 <TASK>
 ? queue_work_on+0xcb/0x110
 ? lockdep_hardirqs_off+0x90/0xd0
 queue_work_on+0xee/0x110
 process_one_work+0x996/0x1610
 ? pwq_dec_nr_in_flight+0x2a0/0x2a0
 ? rwlock_bug.part.0+0x90/0x90
 ? _raw_spin_lock_irq+0x41/0x50
 worker_thread+0x665/0x1080
 ? process_one_work+0x1610/0x1610
 kthread+0x2e9/0x3a0
 ? kthread_complete_and_exit+0x40/0x40
 ret_from_fork+0x1f/0x30
 </TASK>

To fix this, we can add a new HCI_DRAIN_WQ flag, and don't queue the
timeout workqueue while command workqueue is draining.

Fixes: 76727c02c1e1 ("Bluetooth: Call drain_workqueue() before resetting state")
Reported-by: syzbot+63bed493aebbf6872647@syzkaller.appspotmail.com
Signed-off-by: Schspa Shi <schspa@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci.h |  1 +
 net/bluetooth/hci_core.c    | 10 +++++++++-
 net/bluetooth/hci_event.c   |  5 +++--
 3 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index fe7935be7dc4..4a45c48eb0d2 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -361,6 +361,7 @@ enum {
 	HCI_QUALITY_REPORT,
 	HCI_OFFLOAD_CODECS_ENABLED,
 	HCI_LE_SIMULTANEOUS_ROLES,
+	HCI_CMD_DRAIN_WORKQUEUE,
 
 	__HCI_NUM_FLAGS,
 };
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index a0f99baafd35..6a53bcc5cfbb 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -594,6 +594,11 @@ static int hci_dev_do_reset(struct hci_dev *hdev)
 	skb_queue_purge(&hdev->rx_q);
 	skb_queue_purge(&hdev->cmd_q);
 
+	/* Cancel these to avoid queueing non-chained pending work */
+	hci_dev_set_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE);
+	cancel_delayed_work(&hdev->cmd_timer);
+	cancel_delayed_work(&hdev->ncmd_timer);
+
 	/* Avoid potential lockdep warnings from the *_flush() calls by
 	 * ensuring the workqueue is empty up front.
 	 */
@@ -607,6 +612,8 @@ static int hci_dev_do_reset(struct hci_dev *hdev)
 	if (hdev->flush)
 		hdev->flush(hdev);
 
+	hci_dev_clear_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE);
+
 	atomic_set(&hdev->cmd_cnt, 1);
 	hdev->acl_cnt = 0; hdev->sco_cnt = 0; hdev->le_cnt = 0;
 
@@ -3864,7 +3871,8 @@ static void hci_cmd_work(struct work_struct *work)
 			if (res < 0)
 				__hci_cmd_sync_cancel(hdev, -res);
 
-			if (test_bit(HCI_RESET, &hdev->flags))
+			if (test_bit(HCI_RESET, &hdev->flags) ||
+			    hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE))
 				cancel_delayed_work(&hdev->cmd_timer);
 			else
 				schedule_delayed_work(&hdev->cmd_timer,
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 63585c0bb9ce..34bec7446d00 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -3768,8 +3768,9 @@ static inline void handle_cmd_cnt_and_timer(struct hci_dev *hdev, u8 ncmd)
 			cancel_delayed_work(&hdev->ncmd_timer);
 			atomic_set(&hdev->cmd_cnt, 1);
 		} else {
-			schedule_delayed_work(&hdev->ncmd_timer,
-					      HCI_NCMD_TIMEOUT);
+			if (!hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE))
+				schedule_delayed_work(&hdev->ncmd_timer,
+						      HCI_NCMD_TIMEOUT);
 		}
 	}
 }
-- 
cgit 


From 359ee4f834f5b4f8096f7edc0c20ef37d1fca861 Mon Sep 17 00:00:00 2001
From: Abhishek Pandit-Subedi <abhishekpandit@chromium.org>
Date: Thu, 2 Jun 2022 09:46:50 -0700
Subject: Bluetooth: Unregister suspend with userchannel

When HCI_USERCHANNEL is used, unregister the suspend notifier when
binding and register when releasing. The userchannel socket should be
left alone after open is completed.

Signed-off-by: Abhishek Pandit-Subedi <abhishekpandit@chromium.org>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci_core.h |  2 ++
 net/bluetooth/hci_core.c         | 33 +++++++++++++++++++++++++--------
 net/bluetooth/hci_sock.c         |  3 +++
 3 files changed, 30 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index c0ea2a4892b1..ae689bc7122e 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1286,6 +1286,8 @@ void hci_free_dev(struct hci_dev *hdev);
 int hci_register_dev(struct hci_dev *hdev);
 void hci_unregister_dev(struct hci_dev *hdev);
 void hci_release_dev(struct hci_dev *hdev);
+int hci_register_suspend_notifier(struct hci_dev *hdev);
+int hci_unregister_suspend_notifier(struct hci_dev *hdev);
 int hci_suspend_dev(struct hci_dev *hdev);
 int hci_resume_dev(struct hci_dev *hdev);
 int hci_reset_dev(struct hci_dev *hdev);
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 6a53bcc5cfbb..1ace311cdea9 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2647,12 +2647,8 @@ int hci_register_dev(struct hci_dev *hdev)
 	hci_sock_dev_event(hdev, HCI_DEV_REG);
 	hci_dev_hold(hdev);
 
-	if (!test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks)) {
-		hdev->suspend_notifier.notifier_call = hci_suspend_notifier;
-		error = register_pm_notifier(&hdev->suspend_notifier);
-		if (error)
-			goto err_wqueue;
-	}
+	if (hci_register_suspend_notifier(hdev))
+		goto err_wqueue;
 
 	queue_work(hdev->req_workqueue, &hdev->power_on);
 
@@ -2687,8 +2683,7 @@ void hci_unregister_dev(struct hci_dev *hdev)
 
 	hci_cmd_sync_clear(hdev);
 
-	if (!test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks))
-		unregister_pm_notifier(&hdev->suspend_notifier);
+	hci_unregister_suspend_notifier(hdev);
 
 	msft_unregister(hdev);
 
@@ -2752,6 +2747,28 @@ void hci_release_dev(struct hci_dev *hdev)
 }
 EXPORT_SYMBOL(hci_release_dev);
 
+int hci_register_suspend_notifier(struct hci_dev *hdev)
+{
+	int ret = 0;
+
+	if (!test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks)) {
+		hdev->suspend_notifier.notifier_call = hci_suspend_notifier;
+		ret = register_pm_notifier(&hdev->suspend_notifier);
+	}
+
+	return ret;
+}
+
+int hci_unregister_suspend_notifier(struct hci_dev *hdev)
+{
+	int ret = 0;
+
+	if (!test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks))
+		ret = unregister_pm_notifier(&hdev->suspend_notifier);
+
+	return ret;
+}
+
 /* Suspend HCI device */
 int hci_suspend_dev(struct hci_dev *hdev)
 {
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index bd8358b44aa4..0d015d4a8e41 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -887,6 +887,7 @@ static int hci_sock_release(struct socket *sock)
 			 */
 			hci_dev_do_close(hdev);
 			hci_dev_clear_flag(hdev, HCI_USER_CHANNEL);
+			hci_register_suspend_notifier(hdev);
 			mgmt_index_added(hdev);
 		}
 
@@ -1215,6 +1216,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
 		}
 
 		mgmt_index_removed(hdev);
+		hci_unregister_suspend_notifier(hdev);
 
 		err = hci_dev_open(hdev->id);
 		if (err) {
@@ -1229,6 +1231,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
 				err = 0;
 			} else {
 				hci_dev_clear_flag(hdev, HCI_USER_CHANNEL);
+				hci_register_suspend_notifier(hdev);
 				mgmt_index_added(hdev);
 				hci_dev_put(hdev);
 				goto done;
-- 
cgit 


From 34a718bc86f908de8ef79affaff6a3de7b95759c Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Wed, 8 Jun 2022 15:00:01 -0700
Subject: Bluetooth: HCI: Fix not always setting Scan Response/Advertising Data

The scan response and advertising data needs to be tracked on a per
instance (adv_info) since when these instaces are removed so are their
data, to fix that new flags are introduced which is used to mark when
the data changes and then checked to confirm when the data needs to be
synced with the controller.

Tested-by: Tedd Ho-Jeong An <tedd.an@intel.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h | 11 +++++++
 net/bluetooth/hci_core.c         | 42 ++++++++++++-------------
 net/bluetooth/hci_sync.c         | 66 +++++++++++++++++++++++++++-------------
 3 files changed, 76 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index ae689bc7122e..6d32e3e942b7 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -243,8 +243,10 @@ struct adv_info {
 	__u16	duration;
 	__u16	adv_data_len;
 	__u8	adv_data[HCI_MAX_EXT_AD_LENGTH];
+	bool	adv_data_changed;
 	__u16	scan_rsp_len;
 	__u8	scan_rsp_data[HCI_MAX_EXT_AD_LENGTH];
+	bool	scan_rsp_changed;
 	__s8	tx_power;
 	__u32   min_interval;
 	__u32   max_interval;
@@ -258,6 +260,15 @@ struct adv_info {
 
 #define HCI_ADV_TX_POWER_NO_PREFERENCE 0x7F
 
+#define DATA_CMP(_d1, _l1, _d2, _l2) \
+	(_l1 == _l2 ? memcmp(_d1, _d2, _l1) : _l1 - _l2)
+
+#define ADV_DATA_CMP(_adv, _data, _len) \
+	DATA_CMP((_adv)->adv_data, (_adv)->adv_data_len, _data, _len)
+
+#define SCAN_RSP_CMP(_adv, _data, _len) \
+	DATA_CMP((_adv)->scan_rsp_data, (_adv)->scan_rsp_len, _data, _len)
+
 struct monitored_device {
 	struct list_head list;
 
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 94f38f669932..72df3425725f 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -1728,18 +1728,12 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags,
 	}
 
 	adv_instance->flags = flags;
-	adv_instance->adv_data_len = adv_data_len;
-	adv_instance->scan_rsp_len = scan_rsp_len;
 	adv_instance->min_interval = min_interval;
 	adv_instance->max_interval = max_interval;
 	adv_instance->tx_power = tx_power;
 
-	if (adv_data_len)
-		memcpy(adv_instance->adv_data, adv_data, adv_data_len);
-
-	if (scan_rsp_len)
-		memcpy(adv_instance->scan_rsp_data,
-		       scan_rsp_data, scan_rsp_len);
+	hci_set_adv_instance_data(hdev, instance, adv_data_len, adv_data,
+				  scan_rsp_len, scan_rsp_data);
 
 	adv_instance->timeout = timeout;
 	adv_instance->remaining_time = timeout;
@@ -1762,29 +1756,33 @@ int hci_set_adv_instance_data(struct hci_dev *hdev, u8 instance,
 			      u16 adv_data_len, u8 *adv_data,
 			      u16 scan_rsp_len, u8 *scan_rsp_data)
 {
-	struct adv_info *adv_instance;
+	struct adv_info *adv;
 
-	adv_instance = hci_find_adv_instance(hdev, instance);
+	adv = hci_find_adv_instance(hdev, instance);
 
 	/* If advertisement doesn't exist, we can't modify its data */
-	if (!adv_instance)
+	if (!adv)
 		return -ENOENT;
 
-	if (adv_data_len) {
-		memset(adv_instance->adv_data, 0,
-		       sizeof(adv_instance->adv_data));
-		memcpy(adv_instance->adv_data, adv_data, adv_data_len);
-		adv_instance->adv_data_len = adv_data_len;
+	if (adv_data_len && ADV_DATA_CMP(adv, adv_data, adv_data_len)) {
+		memset(adv->adv_data, 0, sizeof(adv->adv_data));
+		memcpy(adv->adv_data, adv_data, adv_data_len);
+		adv->adv_data_len = adv_data_len;
+		adv->adv_data_changed = true;
 	}
 
-	if (scan_rsp_len) {
-		memset(adv_instance->scan_rsp_data, 0,
-		       sizeof(adv_instance->scan_rsp_data));
-		memcpy(adv_instance->scan_rsp_data,
-		       scan_rsp_data, scan_rsp_len);
-		adv_instance->scan_rsp_len = scan_rsp_len;
+	if (scan_rsp_len && SCAN_RSP_CMP(adv, scan_rsp_data, scan_rsp_len)) {
+		memset(adv->scan_rsp_data, 0, sizeof(adv->scan_rsp_data));
+		memcpy(adv->scan_rsp_data, scan_rsp_data, scan_rsp_len);
+		adv->scan_rsp_len = scan_rsp_len;
+		adv->scan_rsp_changed = true;
 	}
 
+	/* Mark as changed if there are flags which would affect it */
+	if (((adv->flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) ||
+	    adv->flags & MGMT_ADV_FLAG_LOCAL_NAME)
+		adv->scan_rsp_changed = true;
+
 	return 0;
 }
 
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 1739e8cb3291..a727f16914e4 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -849,26 +849,38 @@ static int hci_set_ext_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance)
 		u8 data[HCI_MAX_EXT_AD_LENGTH];
 	} pdu;
 	u8 len;
+	struct adv_info *adv = NULL;
+	int err;
 
 	memset(&pdu, 0, sizeof(pdu));
 
-	len = eir_create_scan_rsp(hdev, instance, pdu.data);
-
-	if (hdev->scan_rsp_data_len == len &&
-	    !memcmp(pdu.data, hdev->scan_rsp_data, len))
-		return 0;
+	if (instance) {
+		adv = hci_find_adv_instance(hdev, instance);
+		if (!adv || !adv->scan_rsp_changed)
+			return 0;
+	}
 
-	memcpy(hdev->scan_rsp_data, pdu.data, len);
-	hdev->scan_rsp_data_len = len;
+	len = eir_create_scan_rsp(hdev, instance, pdu.data);
 
 	pdu.cp.handle = instance;
 	pdu.cp.length = len;
 	pdu.cp.operation = LE_SET_ADV_DATA_OP_COMPLETE;
 	pdu.cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG;
 
-	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_RSP_DATA,
-				     sizeof(pdu.cp) + len, &pdu.cp,
-				     HCI_CMD_TIMEOUT);
+	err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_RSP_DATA,
+				    sizeof(pdu.cp) + len, &pdu.cp,
+				    HCI_CMD_TIMEOUT);
+	if (err)
+		return err;
+
+	if (adv) {
+		adv->scan_rsp_changed = false;
+	} else {
+		memcpy(hdev->scan_rsp_data, pdu.data, len);
+		hdev->scan_rsp_data_len = len;
+	}
+
+	return 0;
 }
 
 static int __hci_set_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance)
@@ -1119,27 +1131,39 @@ static int hci_set_ext_adv_data_sync(struct hci_dev *hdev, u8 instance)
 		u8 data[HCI_MAX_EXT_AD_LENGTH];
 	} pdu;
 	u8 len;
+	struct adv_info *adv = NULL;
+	int err;
 
 	memset(&pdu, 0, sizeof(pdu));
 
-	len = eir_create_adv_data(hdev, instance, pdu.data);
-
-	/* There's nothing to do if the data hasn't changed */
-	if (hdev->adv_data_len == len &&
-	    memcmp(pdu.data, hdev->adv_data, len) == 0)
-		return 0;
+	if (instance) {
+		adv = hci_find_adv_instance(hdev, instance);
+		if (!adv || !adv->adv_data_changed)
+			return 0;
+	}
 
-	memcpy(hdev->adv_data, pdu.data, len);
-	hdev->adv_data_len = len;
+	len = eir_create_adv_data(hdev, instance, pdu.data);
 
 	pdu.cp.length = len;
 	pdu.cp.handle = instance;
 	pdu.cp.operation = LE_SET_ADV_DATA_OP_COMPLETE;
 	pdu.cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG;
 
-	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_DATA,
-				     sizeof(pdu.cp) + len, &pdu.cp,
-				     HCI_CMD_TIMEOUT);
+	err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_DATA,
+				    sizeof(pdu.cp) + len, &pdu.cp,
+				    HCI_CMD_TIMEOUT);
+	if (err)
+		return err;
+
+	/* Update data if the command succeed */
+	if (adv) {
+		adv->adv_data_changed = false;
+	} else {
+		memcpy(hdev->adv_data, pdu.data, len);
+		hdev->adv_data_len = len;
+	}
+
+	return 0;
 }
 
 static int hci_set_adv_data_sync(struct hci_dev *hdev, u8 instance)
-- 
cgit 


From 6f43f6169a8229bb6ddbf483d3be760d48c4cdd1 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 20 Jul 2022 14:23:49 +0300
Subject: Bluetooth: clean up error pointer checking

The bt_skb_sendmsg() function can't return NULL so there is no need to
check for that.  Several of these checks were removed previously but
this one was missed.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/bluetooth.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index 6b48d9e2aab9..a8b52175af05 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -494,7 +494,7 @@ static inline struct sk_buff *bt_skb_sendmmsg(struct sock *sk,
 	struct sk_buff *skb, **frag;
 
 	skb = bt_skb_sendmsg(sk, msg, len, mtu, headroom, tailroom);
-	if (IS_ERR_OR_NULL(skb))
+	if (IS_ERR(skb))
 		return skb;
 
 	len -= skb->len;
-- 
cgit 


From 766ae2422b4312a73510ebee9266bc23b466fbbb Mon Sep 17 00:00:00 2001
From: Zijun Hu <quic_zijuhu@quicinc.com>
Date: Thu, 21 Jul 2022 14:04:30 +0800
Subject: Bluetooth: hci_sync: Check LMP feature bit instead of quirk

BT core driver should addtionally check LMP feature bit
"Erroneous Data Reporting" instead of quirk
HCI_QUIRK_BROKEN_ERR_DATA_REPORTING set by BT device driver to decide if
HCI commands HCI_Read|Write_Default_Erroneous_Data_Reporting are broken.

BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 2, Part C | page 587
This feature indicates whether the device is able to support the
Packet_Status_Flag and the HCI commands HCI_Write_Default_-
Erroneous_Data_Reporting and HCI_Read_Default_Erroneous_-
Data_Reporting.

the quirk was introduced by 'commit cde1a8a99287 ("Bluetooth: btusb: Fix
and detect most of the Chinese Bluetooth controllers")' to mark HCI
commands HCI_Read|Write_Default_Erroneous_Data_Reporting broken by BT
device driver, but the reason why these two HCI commands are broken is
that feature "Erroneous Data Reporting" is not enabled by firmware, this
scenario is illustrated by below log of QCA controllers with USB I/F:

@ RAW Open: hcitool (privileged) version 2.22
< HCI Command: Read Local Supported Commands (0x04|0x0002) plen 0
> HCI Event: Command Complete (0x0e) plen 68
      Read Local Supported Commands (0x04|0x0002) ncmd 1
        Status: Success (0x00)
        Commands: 288 entries
......
          Read Default Erroneous Data Reporting (Octet 18 - Bit 2)
          Write Default Erroneous Data Reporting (Octet 18 - Bit 3)
......

< HCI Command: Read Default Erroneous Data Reporting (0x03|0x005a) plen 0
> HCI Event: Command Complete (0x0e) plen 4
      Read Default Erroneous Data Reporting (0x03|0x005a) ncmd 1
        Status: Unknown HCI Command (0x01)

< HCI Command: Read Local Supported Features (0x04|0x0003) plen 0
> HCI Event: Command Complete (0x0e) plen 12
      Read Local Supported Features (0x04|0x0003) ncmd 1
        Status: Success (0x00)
        Features: 0xff 0xfe 0x0f 0xfe 0xd8 0x3f 0x5b 0x87
          3 slot packets
......

Signed-off-by: Zijun Hu <quic_zijuhu@quicinc.com>
Tested-by: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h | 1 +
 net/bluetooth/hci_sync.c    | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 4a45c48eb0d2..5cf0fbfb89b4 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -497,6 +497,7 @@ enum {
 #define LMP_EXT_INQ	0x01
 #define LMP_SIMUL_LE_BR	0x02
 #define LMP_SIMPLE_PAIR	0x08
+#define LMP_ERR_DATA_REPORTING 0x20
 #define LMP_NO_FLUSH	0x40
 
 #define LMP_LSTO	0x01
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 49da90b6eecd..a6325f9ebfd2 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -3221,7 +3221,7 @@ static int hci_read_page_scan_activity_sync(struct hci_dev *hdev)
 static int hci_read_def_err_data_reporting_sync(struct hci_dev *hdev)
 {
 	if (!(hdev->commands[18] & 0x04) ||
-	    test_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks))
+	    !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING))
 		return 0;
 
 	return __hci_cmd_sync_status(hdev, HCI_OP_READ_DEF_ERR_DATA_REPORTING,
@@ -3706,7 +3706,7 @@ static int hci_set_err_data_report_sync(struct hci_dev *hdev)
 	bool enabled = hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED);
 
 	if (!(hdev->commands[18] & 0x08) ||
-	    test_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks))
+	    !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING))
 		return 0;
 
 	if (enabled == hdev->err_data_reporting)
-- 
cgit 


From 63b1a7dd38bfd4630e5f59a20a2b7b1f3d04f486 Mon Sep 17 00:00:00 2001
From: Zijun Hu <quic_zijuhu@quicinc.com>
Date: Thu, 21 Jul 2022 14:04:33 +0800
Subject: Bluetooth: hci_sync: Remove HCI_QUIRK_BROKEN_ERR_DATA_REPORTING

Core driver addtionally checks LMP feature bit "Erroneous Data Reporting"
instead of quirk HCI_QUIRK_BROKEN_ERR_DATA_REPORTING to decide if HCI
commands HCI_Read|Write_Default_Erroneous_Data_Reporting are broken, so
remove this unnecessary quirk.

Signed-off-by: Zijun Hu <quic_zijuhu@quicinc.com>
Tested-by: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h | 11 -----------
 net/bluetooth/hci_sync.c    |  3 ---
 2 files changed, 14 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 5cf0fbfb89b4..927f51b92854 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -228,17 +228,6 @@ enum {
 	 */
 	HCI_QUIRK_VALID_LE_STATES,
 
-	/* When this quirk is set, then erroneous data reporting
-	 * is ignored. This is mainly due to the fact that the HCI
-	 * Read Default Erroneous Data Reporting command is advertised,
-	 * but not supported; these controllers often reply with unknown
-	 * command and tend to lock up randomly. Needing a hard reset.
-	 *
-	 * This quirk can be set before hci_register_dev is called or
-	 * during the hdev->setup vendor callback.
-	 */
-	HCI_QUIRK_BROKEN_ERR_DATA_REPORTING,
-
 	/*
 	 * When this quirk is set, then the hci_suspend_notifier is not
 	 * registered. This is intended for devices which drop completely
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index a6325f9ebfd2..e793305beb67 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -3865,9 +3865,6 @@ static const struct {
 	HCI_QUIRK_BROKEN(STORED_LINK_KEY,
 			 "HCI Delete Stored Link Key command is advertised, "
 			 "but not supported."),
-	HCI_QUIRK_BROKEN(ERR_DATA_REPORTING,
-			 "HCI Read Default Erroneous Data Reporting command is "
-			 "advertised, but not supported."),
 	HCI_QUIRK_BROKEN(READ_TRANSMIT_POWER,
 			 "HCI Read Transmit Power Level command is advertised, "
 			 "but not supported."),
-- 
cgit 


From b747a83690c8f53bc7a3f75899415c699b2c51aa Mon Sep 17 00:00:00 2001
From: Manish Mandlik <mmandlik@google.com>
Date: Wed, 20 Jul 2022 16:21:13 -0700
Subject: Bluetooth: hci_sync: Refactor add Adv Monitor

Make use of hci_cmd_sync_queue for adding an advertisement monitor.

Signed-off-by: Manish Mandlik <mmandlik@google.com>
Reviewed-by: Miao-chen Chou <mcchou@google.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |   5 +-
 net/bluetooth/hci_core.c         |  46 +++++------
 net/bluetooth/mgmt.c             |  52 ++++--------
 net/bluetooth/msft.c             | 171 +++++++++------------------------------
 4 files changed, 78 insertions(+), 196 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 6d32e3e942b7..b45d31285961 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1420,10 +1420,8 @@ bool hci_adv_instance_is_scannable(struct hci_dev *hdev, u8 instance);
 
 void hci_adv_monitors_clear(struct hci_dev *hdev);
 void hci_free_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor);
-int hci_add_adv_patterns_monitor_complete(struct hci_dev *hdev, u8 status);
 int hci_remove_adv_monitor_complete(struct hci_dev *hdev, u8 status);
-bool hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor,
-			int *err);
+int hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor);
 bool hci_remove_single_adv_monitor(struct hci_dev *hdev, u16 handle, int *err);
 bool hci_remove_all_adv_monitor(struct hci_dev *hdev, int *err);
 bool hci_is_adv_monitoring(struct hci_dev *hdev);
@@ -1885,7 +1883,6 @@ void mgmt_advertising_removed(struct sock *sk, struct hci_dev *hdev,
 			      u8 instance);
 void mgmt_adv_monitor_removed(struct hci_dev *hdev, u16 handle);
 int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip);
-int mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, u8 status);
 int mgmt_remove_adv_monitor_complete(struct hci_dev *hdev, u8 status);
 void mgmt_adv_monitor_device_lost(struct hci_dev *hdev, u16 handle,
 				  bdaddr_t *bdaddr, u8 addr_type);
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index d2c5f32d651b..6f9ce2f6f281 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -1880,11 +1880,6 @@ void hci_free_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor)
 	kfree(monitor);
 }
 
-int hci_add_adv_patterns_monitor_complete(struct hci_dev *hdev, u8 status)
-{
-	return mgmt_add_adv_patterns_monitor_complete(hdev, status);
-}
-
 int hci_remove_adv_monitor_complete(struct hci_dev *hdev, u8 status)
 {
 	return mgmt_remove_adv_monitor_complete(hdev, status);
@@ -1892,49 +1887,48 @@ int hci_remove_adv_monitor_complete(struct hci_dev *hdev, u8 status)
 
 /* Assigns handle to a monitor, and if offloading is supported and power is on,
  * also attempts to forward the request to the controller.
- * Returns true if request is forwarded (result is pending), false otherwise.
- * This function requires the caller holds hdev->lock.
+ * This function requires the caller holds hci_req_sync_lock.
  */
-bool hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor,
-			 int *err)
+int hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor)
 {
 	int min, max, handle;
+	int status = 0;
 
-	*err = 0;
+	if (!monitor)
+		return -EINVAL;
 
-	if (!monitor) {
-		*err = -EINVAL;
-		return false;
-	}
+	hci_dev_lock(hdev);
 
 	min = HCI_MIN_ADV_MONITOR_HANDLE;
 	max = HCI_MIN_ADV_MONITOR_HANDLE + HCI_MAX_ADV_MONITOR_NUM_HANDLES;
 	handle = idr_alloc(&hdev->adv_monitors_idr, monitor, min, max,
 			   GFP_KERNEL);
-	if (handle < 0) {
-		*err = handle;
-		return false;
-	}
+
+	hci_dev_unlock(hdev);
+
+	if (handle < 0)
+		return handle;
 
 	monitor->handle = handle;
 
 	if (!hdev_is_powered(hdev))
-		return false;
+		return status;
 
 	switch (hci_get_adv_monitor_offload_ext(hdev)) {
 	case HCI_ADV_MONITOR_EXT_NONE:
-		hci_update_passive_scan(hdev);
-		bt_dev_dbg(hdev, "%s add monitor status %d", hdev->name, *err);
+		bt_dev_dbg(hdev, "%s add monitor %d status %d", hdev->name,
+			   monitor->handle, status);
 		/* Message was not forwarded to controller - not an error */
-		return false;
+		break;
+
 	case HCI_ADV_MONITOR_EXT_MSFT:
-		*err = msft_add_monitor_pattern(hdev, monitor);
-		bt_dev_dbg(hdev, "%s add monitor msft status %d", hdev->name,
-			   *err);
+		status = msft_add_monitor_pattern(hdev, monitor);
+		bt_dev_dbg(hdev, "%s add monitor %d msft status %d", hdev->name,
+			   monitor->handle, status);
 		break;
 	}
 
-	return (*err == 0);
+	return status;
 }
 
 /* Attempts to tell the controller and free the monitor. If somehow the
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index f3e4e2c9ec7a..e2d41dcf1031 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -4646,23 +4646,15 @@ static int read_adv_mon_features(struct sock *sk, struct hci_dev *hdev,
 	return err;
 }
 
-int mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, u8 status)
+static void mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev,
+						   void *data, int status)
 {
 	struct mgmt_rp_add_adv_patterns_monitor rp;
-	struct mgmt_pending_cmd *cmd;
-	struct adv_monitor *monitor;
-	int err = 0;
+	struct mgmt_pending_cmd *cmd = data;
+	struct adv_monitor *monitor = cmd->user_data;
 
 	hci_dev_lock(hdev);
 
-	cmd = pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI, hdev);
-	if (!cmd) {
-		cmd = pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR, hdev);
-		if (!cmd)
-			goto done;
-	}
-
-	monitor = cmd->user_data;
 	rp.monitor_handle = cpu_to_le16(monitor->handle);
 
 	if (!status) {
@@ -4673,26 +4665,29 @@ int mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, u8 status)
 		hci_update_passive_scan(hdev);
 	}
 
-	err = mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode,
-				mgmt_status(status), &rp, sizeof(rp));
+	mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode,
+			  mgmt_status(status), &rp, sizeof(rp));
 	mgmt_pending_remove(cmd);
 
-done:
 	hci_dev_unlock(hdev);
-	bt_dev_dbg(hdev, "add monitor %d complete, status %u",
+	bt_dev_dbg(hdev, "add monitor %d complete, status %d",
 		   rp.monitor_handle, status);
+}
 
-	return err;
+static int mgmt_add_adv_patterns_monitor_sync(struct hci_dev *hdev, void *data)
+{
+	struct mgmt_pending_cmd *cmd = data;
+	struct adv_monitor *monitor = cmd->user_data;
+
+	return hci_add_adv_monitor(hdev, monitor);
 }
 
 static int __add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev,
 				      struct adv_monitor *m, u8 status,
 				      void *data, u16 len, u16 op)
 {
-	struct mgmt_rp_add_adv_patterns_monitor rp;
 	struct mgmt_pending_cmd *cmd;
 	int err;
-	bool pending;
 
 	hci_dev_lock(hdev);
 
@@ -4714,12 +4709,11 @@ static int __add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev,
 	}
 
 	cmd->user_data = m;
-	pending = hci_add_adv_monitor(hdev, m, &err);
+	err = hci_cmd_sync_queue(hdev, mgmt_add_adv_patterns_monitor_sync, cmd,
+				 mgmt_add_adv_patterns_monitor_complete);
 	if (err) {
-		if (err == -ENOSPC || err == -ENOMEM)
+		if (err == -ENOMEM)
 			status = MGMT_STATUS_NO_RESOURCES;
-		else if (err == -EINVAL)
-			status = MGMT_STATUS_INVALID_PARAMS;
 		else
 			status = MGMT_STATUS_FAILED;
 
@@ -4727,18 +4721,6 @@ static int __add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev,
 		goto unlock;
 	}
 
-	if (!pending) {
-		mgmt_pending_remove(cmd);
-		rp.monitor_handle = cpu_to_le16(m->handle);
-		mgmt_adv_monitor_added(sk, hdev, m->handle);
-		m->state = ADV_MONITOR_STATE_REGISTERED;
-		hdev->adv_monitors_cnt++;
-
-		hci_dev_unlock(hdev);
-		return mgmt_cmd_complete(sk, hdev->id, op, MGMT_STATUS_SUCCESS,
-					 &rp, sizeof(rp));
-	}
-
 	hci_dev_unlock(hdev);
 
 	return 0;
diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c
index f43994523b1f..54fbc718e893 100644
--- a/net/bluetooth/msft.c
+++ b/net/bluetooth/msft.c
@@ -99,15 +99,12 @@ struct msft_data {
 	__u8  evt_prefix_len;
 	__u8  *evt_prefix;
 	struct list_head handle_map;
-	__u16 pending_add_handle;
 	__u16 pending_remove_handle;
 	__u8 resuming;
 	__u8 suspending;
 	__u8 filter_enabled;
 };
 
-static int __msft_add_monitor_pattern(struct hci_dev *hdev,
-				      struct adv_monitor *monitor);
 static int __msft_remove_monitor(struct hci_dev *hdev,
 				 struct adv_monitor *monitor, u16 handle);
 
@@ -164,34 +161,6 @@ failed:
 	return false;
 }
 
-static void reregister_monitor(struct hci_dev *hdev, int handle)
-{
-	struct adv_monitor *monitor;
-	struct msft_data *msft = hdev->msft_data;
-	int err;
-
-	while (1) {
-		monitor = idr_get_next(&hdev->adv_monitors_idr, &handle);
-		if (!monitor) {
-			/* All monitors have been resumed */
-			msft->resuming = false;
-			hci_update_passive_scan(hdev);
-			return;
-		}
-
-		msft->pending_add_handle = (u16)handle;
-		err = __msft_add_monitor_pattern(hdev, monitor);
-
-		/* If success, we return and wait for monitor added callback */
-		if (!err)
-			return;
-
-		/* Otherwise remove the monitor and keep registering */
-		hci_free_adv_monitor(hdev, monitor);
-		handle++;
-	}
-}
-
 /* is_mgmt = true matches the handle exposed to userspace via mgmt.
  * is_mgmt = false matches the handle used by the msft controller.
  * This function requires the caller holds hdev->lock
@@ -243,34 +212,27 @@ static int msft_monitor_device_del(struct hci_dev *hdev, __u16 mgmt_handle,
 	return count;
 }
 
-static void msft_le_monitor_advertisement_cb(struct hci_dev *hdev,
-					     u8 status, u16 opcode,
-					     struct sk_buff *skb)
+static int msft_le_monitor_advertisement_cb(struct hci_dev *hdev, u16 opcode,
+					    struct adv_monitor *monitor,
+					    struct sk_buff *skb)
 {
 	struct msft_rp_le_monitor_advertisement *rp;
-	struct adv_monitor *monitor;
 	struct msft_monitor_advertisement_handle_data *handle_data;
 	struct msft_data *msft = hdev->msft_data;
+	int status = 0;
 
 	hci_dev_lock(hdev);
 
-	monitor = idr_find(&hdev->adv_monitors_idr, msft->pending_add_handle);
-	if (!monitor) {
-		bt_dev_err(hdev, "msft add advmon: monitor %u is not found!",
-			   msft->pending_add_handle);
+	rp = (struct msft_rp_le_monitor_advertisement *)skb->data;
+	if (skb->len < sizeof(*rp)) {
 		status = HCI_ERROR_UNSPECIFIED;
 		goto unlock;
 	}
 
+	status = rp->status;
 	if (status)
 		goto unlock;
 
-	rp = (struct msft_rp_le_monitor_advertisement *)skb->data;
-	if (skb->len < sizeof(*rp)) {
-		status = HCI_ERROR_UNSPECIFIED;
-		goto unlock;
-	}
-
 	handle_data = kmalloc(sizeof(*handle_data), GFP_KERNEL);
 	if (!handle_data) {
 		status = HCI_ERROR_UNSPECIFIED;
@@ -285,13 +247,12 @@ static void msft_le_monitor_advertisement_cb(struct hci_dev *hdev,
 	monitor->state = ADV_MONITOR_STATE_OFFLOADED;
 
 unlock:
-	if (status && monitor)
+	if (status)
 		hci_free_adv_monitor(hdev, monitor);
 
 	hci_dev_unlock(hdev);
 
-	if (!msft->resuming)
-		hci_add_adv_patterns_monitor_complete(hdev, status);
+	return status;
 }
 
 static void msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev,
@@ -463,7 +424,6 @@ static int msft_add_monitor_sync(struct hci_dev *hdev,
 	ptrdiff_t offset = 0;
 	u8 pattern_count = 0;
 	struct sk_buff *skb;
-	u8 status;
 
 	if (!msft_monitor_pattern_valid(monitor))
 		return -EINVAL;
@@ -505,20 +465,40 @@ static int msft_add_monitor_sync(struct hci_dev *hdev,
 	if (IS_ERR(skb))
 		return PTR_ERR(skb);
 
-	status = skb->data[0];
-	skb_pull(skb, 1);
+	return msft_le_monitor_advertisement_cb(hdev, hdev->msft_opcode,
+						monitor, skb);
+}
 
-	msft_le_monitor_advertisement_cb(hdev, status, hdev->msft_opcode, skb);
+/* This function requires the caller holds hci_req_sync_lock */
+static void reregister_monitor(struct hci_dev *hdev)
+{
+	struct adv_monitor *monitor;
+	struct msft_data *msft = hdev->msft_data;
+	int handle = 0;
 
-	return status;
+	if (!msft)
+		return;
+
+	msft->resuming = true;
+
+	while (1) {
+		monitor = idr_get_next(&hdev->adv_monitors_idr, &handle);
+		if (!monitor)
+			break;
+
+		msft_add_monitor_sync(hdev, monitor);
+
+		handle++;
+	}
+
+	/* All monitors have been reregistered */
+	msft->resuming = false;
 }
 
 /* This function requires the caller holds hci_req_sync_lock */
 int msft_resume_sync(struct hci_dev *hdev)
 {
 	struct msft_data *msft = hdev->msft_data;
-	struct adv_monitor *monitor;
-	int handle = 0;
 
 	if (!msft || !msft_monitor_supported(hdev))
 		return 0;
@@ -533,24 +513,12 @@ int msft_resume_sync(struct hci_dev *hdev)
 
 	hci_dev_unlock(hdev);
 
-	msft->resuming = true;
-
-	while (1) {
-		monitor = idr_get_next(&hdev->adv_monitors_idr, &handle);
-		if (!monitor)
-			break;
-
-		msft_add_monitor_sync(hdev, monitor);
-
-		handle++;
-	}
-
-	/* All monitors have been resumed */
-	msft->resuming = false;
+	reregister_monitor(hdev);
 
 	return 0;
 }
 
+/* This function requires the caller holds hci_req_sync_lock */
 void msft_do_open(struct hci_dev *hdev)
 {
 	struct msft_data *msft = hdev->msft_data;
@@ -583,7 +551,7 @@ void msft_do_open(struct hci_dev *hdev)
 		/* Monitors get removed on power off, so we need to explicitly
 		 * tell the controller to re-monitor.
 		 */
-		reregister_monitor(hdev, 0);
+		reregister_monitor(hdev);
 	}
 }
 
@@ -829,66 +797,7 @@ static void msft_le_set_advertisement_filter_enable_cb(struct hci_dev *hdev,
 	hci_dev_unlock(hdev);
 }
 
-/* This function requires the caller holds hdev->lock */
-static int __msft_add_monitor_pattern(struct hci_dev *hdev,
-				      struct adv_monitor *monitor)
-{
-	struct msft_cp_le_monitor_advertisement *cp;
-	struct msft_le_monitor_advertisement_pattern_data *pattern_data;
-	struct msft_le_monitor_advertisement_pattern *pattern;
-	struct adv_pattern *entry;
-	struct hci_request req;
-	struct msft_data *msft = hdev->msft_data;
-	size_t total_size = sizeof(*cp) + sizeof(*pattern_data);
-	ptrdiff_t offset = 0;
-	u8 pattern_count = 0;
-	int err = 0;
-
-	if (!msft_monitor_pattern_valid(monitor))
-		return -EINVAL;
-
-	list_for_each_entry(entry, &monitor->patterns, list) {
-		pattern_count++;
-		total_size += sizeof(*pattern) + entry->length;
-	}
-
-	cp = kmalloc(total_size, GFP_KERNEL);
-	if (!cp)
-		return -ENOMEM;
-
-	cp->sub_opcode = MSFT_OP_LE_MONITOR_ADVERTISEMENT;
-	cp->rssi_high = monitor->rssi.high_threshold;
-	cp->rssi_low = monitor->rssi.low_threshold;
-	cp->rssi_low_interval = (u8)monitor->rssi.low_threshold_timeout;
-	cp->rssi_sampling_period = monitor->rssi.sampling_period;
-
-	cp->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN;
-
-	pattern_data = (void *)cp->data;
-	pattern_data->count = pattern_count;
-
-	list_for_each_entry(entry, &monitor->patterns, list) {
-		pattern = (void *)(pattern_data->data + offset);
-		/* the length also includes data_type and offset */
-		pattern->length = entry->length + 2;
-		pattern->data_type = entry->ad_type;
-		pattern->start_byte = entry->offset;
-		memcpy(pattern->pattern, entry->value, entry->length);
-		offset += sizeof(*pattern) + entry->length;
-	}
-
-	hci_req_init(&req, hdev);
-	hci_req_add(&req, hdev->msft_opcode, total_size, cp);
-	err = hci_req_run_skb(&req, msft_le_monitor_advertisement_cb);
-	kfree(cp);
-
-	if (!err)
-		msft->pending_add_handle = monitor->handle;
-
-	return err;
-}
-
-/* This function requires the caller holds hdev->lock */
+/* This function requires the caller holds hci_req_sync_lock */
 int msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor)
 {
 	struct msft_data *msft = hdev->msft_data;
@@ -899,7 +808,7 @@ int msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor)
 	if (msft->resuming || msft->suspending)
 		return -EBUSY;
 
-	return __msft_add_monitor_pattern(hdev, monitor);
+	return msft_add_monitor_sync(hdev, monitor);
 }
 
 /* This function requires the caller holds hdev->lock */
-- 
cgit 


From 7cf5c2978f23fdbb2dd7b4e8b07e362ae2d8211c Mon Sep 17 00:00:00 2001
From: Manish Mandlik <mmandlik@google.com>
Date: Wed, 20 Jul 2022 16:21:14 -0700
Subject: Bluetooth: hci_sync: Refactor remove Adv Monitor

Make use of hci_cmd_sync_queue for removing an advertisement monitor.

Signed-off-by: Manish Mandlik <mmandlik@google.com>
Reviewed-by: Miao-chen Chou <mcchou@google.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |  6 +--
 net/bluetooth/hci_core.c         | 81 ++++++++++++---------------------
 net/bluetooth/mgmt.c             | 62 ++++++++++---------------
 net/bluetooth/msft.c             | 98 ++++++++--------------------------------
 net/bluetooth/msft.h             |  6 +--
 5 files changed, 75 insertions(+), 178 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index b45d31285961..df7dac4a5bbd 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1420,10 +1420,9 @@ bool hci_adv_instance_is_scannable(struct hci_dev *hdev, u8 instance);
 
 void hci_adv_monitors_clear(struct hci_dev *hdev);
 void hci_free_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor);
-int hci_remove_adv_monitor_complete(struct hci_dev *hdev, u8 status);
 int hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor);
-bool hci_remove_single_adv_monitor(struct hci_dev *hdev, u16 handle, int *err);
-bool hci_remove_all_adv_monitor(struct hci_dev *hdev, int *err);
+int hci_remove_single_adv_monitor(struct hci_dev *hdev, u16 handle);
+int hci_remove_all_adv_monitor(struct hci_dev *hdev);
 bool hci_is_adv_monitoring(struct hci_dev *hdev);
 int hci_get_adv_monitor_offload_ext(struct hci_dev *hdev);
 
@@ -1883,7 +1882,6 @@ void mgmt_advertising_removed(struct sock *sk, struct hci_dev *hdev,
 			      u8 instance);
 void mgmt_adv_monitor_removed(struct hci_dev *hdev, u16 handle);
 int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip);
-int mgmt_remove_adv_monitor_complete(struct hci_dev *hdev, u8 status);
 void mgmt_adv_monitor_device_lost(struct hci_dev *hdev, u16 handle,
 				  bdaddr_t *bdaddr, u8 addr_type);
 
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 6f9ce2f6f281..e42824e82758 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -1880,11 +1880,6 @@ void hci_free_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor)
 	kfree(monitor);
 }
 
-int hci_remove_adv_monitor_complete(struct hci_dev *hdev, u8 status)
-{
-	return mgmt_remove_adv_monitor_complete(hdev, status);
-}
-
 /* Assigns handle to a monitor, and if offloading is supported and power is on,
  * also attempts to forward the request to the controller.
  * This function requires the caller holds hci_req_sync_lock.
@@ -1933,92 +1928,72 @@ int hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor)
 
 /* Attempts to tell the controller and free the monitor. If somehow the
  * controller doesn't have a corresponding handle, remove anyway.
- * Returns true if request is forwarded (result is pending), false otherwise.
- * This function requires the caller holds hdev->lock.
+ * This function requires the caller holds hci_req_sync_lock.
  */
-static bool hci_remove_adv_monitor(struct hci_dev *hdev,
-				   struct adv_monitor *monitor,
-				   u16 handle, int *err)
+static int hci_remove_adv_monitor(struct hci_dev *hdev,
+				  struct adv_monitor *monitor)
 {
-	*err = 0;
+	int status = 0;
 
 	switch (hci_get_adv_monitor_offload_ext(hdev)) {
 	case HCI_ADV_MONITOR_EXT_NONE: /* also goes here when powered off */
+		bt_dev_dbg(hdev, "%s remove monitor %d status %d", hdev->name,
+			   monitor->handle, status);
 		goto free_monitor;
+
 	case HCI_ADV_MONITOR_EXT_MSFT:
-		*err = msft_remove_monitor(hdev, monitor, handle);
+		status = msft_remove_monitor(hdev, monitor);
+		bt_dev_dbg(hdev, "%s remove monitor %d msft status %d",
+			   hdev->name, monitor->handle, status);
 		break;
 	}
 
 	/* In case no matching handle registered, just free the monitor */
-	if (*err == -ENOENT)
+	if (status == -ENOENT)
 		goto free_monitor;
 
-	return (*err == 0);
+	return status;
 
 free_monitor:
-	if (*err == -ENOENT)
+	if (status == -ENOENT)
 		bt_dev_warn(hdev, "Removing monitor with no matching handle %d",
 			    monitor->handle);
 	hci_free_adv_monitor(hdev, monitor);
 
-	*err = 0;
-	return false;
+	return status;
 }
 
-/* Returns true if request is forwarded (result is pending), false otherwise.
- * This function requires the caller holds hdev->lock.
- */
-bool hci_remove_single_adv_monitor(struct hci_dev *hdev, u16 handle, int *err)
+/* This function requires the caller holds hci_req_sync_lock */
+int hci_remove_single_adv_monitor(struct hci_dev *hdev, u16 handle)
 {
 	struct adv_monitor *monitor = idr_find(&hdev->adv_monitors_idr, handle);
-	bool pending;
-
-	if (!monitor) {
-		*err = -EINVAL;
-		return false;
-	}
-
-	pending = hci_remove_adv_monitor(hdev, monitor, handle, err);
-	if (!*err && !pending)
-		hci_update_passive_scan(hdev);
 
-	bt_dev_dbg(hdev, "%s remove monitor handle %d, status %d, %spending",
-		   hdev->name, handle, *err, pending ? "" : "not ");
+	if (!monitor)
+		return -EINVAL;
 
-	return pending;
+	return hci_remove_adv_monitor(hdev, monitor);
 }
 
-/* Returns true if request is forwarded (result is pending), false otherwise.
- * This function requires the caller holds hdev->lock.
- */
-bool hci_remove_all_adv_monitor(struct hci_dev *hdev, int *err)
+/* This function requires the caller holds hci_req_sync_lock */
+int hci_remove_all_adv_monitor(struct hci_dev *hdev)
 {
 	struct adv_monitor *monitor;
 	int idr_next_id = 0;
-	bool pending = false;
-	bool update = false;
-
-	*err = 0;
+	int status = 0;
 
-	while (!*err && !pending) {
+	while (1) {
 		monitor = idr_get_next(&hdev->adv_monitors_idr, &idr_next_id);
 		if (!monitor)
 			break;
 
-		pending = hci_remove_adv_monitor(hdev, monitor, 0, err);
+		status = hci_remove_adv_monitor(hdev, monitor);
+		if (status)
+			return status;
 
-		if (!*err && !pending)
-			update = true;
+		idr_next_id++;
 	}
 
-	if (update)
-		hci_update_passive_scan(hdev);
-
-	bt_dev_dbg(hdev, "%s remove all monitors status %d, %spending",
-		   hdev->name, *err, pending ? "" : "not ");
-
-	return pending;
+	return status;
 }
 
 /* This function requires the caller holds hdev->lock */
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index e2d41dcf1031..05a680a56127 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -4861,49 +4861,46 @@ done:
 					 MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI);
 }
 
-int mgmt_remove_adv_monitor_complete(struct hci_dev *hdev, u8 status)
+static void mgmt_remove_adv_monitor_complete(struct hci_dev *hdev,
+					     void *data, int status)
 {
 	struct mgmt_rp_remove_adv_monitor rp;
-	struct mgmt_cp_remove_adv_monitor *cp;
-	struct mgmt_pending_cmd *cmd;
-	int err = 0;
+	struct mgmt_pending_cmd *cmd = data;
+	struct mgmt_cp_remove_adv_monitor *cp = cmd->param;
 
 	hci_dev_lock(hdev);
 
-	cmd = pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev);
-	if (!cmd)
-		goto done;
-
-	cp = cmd->param;
 	rp.monitor_handle = cp->monitor_handle;
 
 	if (!status)
 		hci_update_passive_scan(hdev);
 
-	err = mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode,
-				mgmt_status(status), &rp, sizeof(rp));
+	mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode,
+			  mgmt_status(status), &rp, sizeof(rp));
 	mgmt_pending_remove(cmd);
 
-done:
 	hci_dev_unlock(hdev);
-	bt_dev_dbg(hdev, "remove monitor %d complete, status %u",
+	bt_dev_dbg(hdev, "remove monitor %d complete, status %d",
 		   rp.monitor_handle, status);
+}
 
-	return err;
+static int mgmt_remove_adv_monitor_sync(struct hci_dev *hdev, void *data)
+{
+	struct mgmt_pending_cmd *cmd = data;
+	struct mgmt_cp_remove_adv_monitor *cp = cmd->param;
+	u16 handle = __le16_to_cpu(cp->monitor_handle);
+
+	if (!handle)
+		return hci_remove_all_adv_monitor(hdev);
+
+	return hci_remove_single_adv_monitor(hdev, handle);
 }
 
 static int remove_adv_monitor(struct sock *sk, struct hci_dev *hdev,
 			      void *data, u16 len)
 {
-	struct mgmt_cp_remove_adv_monitor *cp = data;
-	struct mgmt_rp_remove_adv_monitor rp;
 	struct mgmt_pending_cmd *cmd;
-	u16 handle = __le16_to_cpu(cp->monitor_handle);
 	int err, status;
-	bool pending;
-
-	BT_DBG("request for %s", hdev->name);
-	rp.monitor_handle = cp->monitor_handle;
 
 	hci_dev_lock(hdev);
 
@@ -4921,34 +4918,23 @@ static int remove_adv_monitor(struct sock *sk, struct hci_dev *hdev,
 		goto unlock;
 	}
 
-	if (handle)
-		pending = hci_remove_single_adv_monitor(hdev, handle, &err);
-	else
-		pending = hci_remove_all_adv_monitor(hdev, &err);
+	err = hci_cmd_sync_queue(hdev, mgmt_remove_adv_monitor_sync, cmd,
+				 mgmt_remove_adv_monitor_complete);
 
 	if (err) {
 		mgmt_pending_remove(cmd);
 
-		if (err == -ENOENT)
-			status = MGMT_STATUS_INVALID_INDEX;
+		if (err == -ENOMEM)
+			status = MGMT_STATUS_NO_RESOURCES;
 		else
 			status = MGMT_STATUS_FAILED;
 
-		goto unlock;
-	}
-
-	/* monitor can be removed without forwarding request to controller */
-	if (!pending) {
 		mgmt_pending_remove(cmd);
-		hci_dev_unlock(hdev);
-
-		return mgmt_cmd_complete(sk, hdev->id,
-					 MGMT_OP_REMOVE_ADV_MONITOR,
-					 MGMT_STATUS_SUCCESS,
-					 &rp, sizeof(rp));
+		goto unlock;
 	}
 
 	hci_dev_unlock(hdev);
+
 	return 0;
 
 unlock:
diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c
index 54fbc718e893..14975769f678 100644
--- a/net/bluetooth/msft.c
+++ b/net/bluetooth/msft.c
@@ -99,15 +99,11 @@ struct msft_data {
 	__u8  evt_prefix_len;
 	__u8  *evt_prefix;
 	struct list_head handle_map;
-	__u16 pending_remove_handle;
 	__u8 resuming;
 	__u8 suspending;
 	__u8 filter_enabled;
 };
 
-static int __msft_remove_monitor(struct hci_dev *hdev,
-				 struct adv_monitor *monitor, u16 handle);
-
 bool msft_monitor_supported(struct hci_dev *hdev)
 {
 	return !!(msft_get_features(hdev) & MSFT_FEATURE_MASK_LE_ADV_MONITOR);
@@ -255,20 +251,15 @@ unlock:
 	return status;
 }
 
-static void msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev,
-						    u8 status, u16 opcode,
-						    struct sk_buff *skb)
+static int msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev,
+						   u16 opcode,
+						   struct adv_monitor *monitor,
+						   struct sk_buff *skb)
 {
-	struct msft_cp_le_cancel_monitor_advertisement *cp;
 	struct msft_rp_le_cancel_monitor_advertisement *rp;
-	struct adv_monitor *monitor;
 	struct msft_monitor_advertisement_handle_data *handle_data;
 	struct msft_data *msft = hdev->msft_data;
-	int err;
-	bool pending;
-
-	if (status)
-		goto done;
+	int status = 0;
 
 	rp = (struct msft_rp_le_cancel_monitor_advertisement *)skb->data;
 	if (skb->len < sizeof(*rp)) {
@@ -276,22 +267,22 @@ static void msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev,
 		goto done;
 	}
 
+	status = rp->status;
+	if (status)
+		goto done;
+
 	hci_dev_lock(hdev);
 
-	cp = hci_sent_cmd_data(hdev, hdev->msft_opcode);
-	handle_data = msft_find_handle_data(hdev, cp->handle, false);
+	handle_data = msft_find_handle_data(hdev, monitor->handle, true);
 
 	if (handle_data) {
-		monitor = idr_find(&hdev->adv_monitors_idr,
-				   handle_data->mgmt_handle);
-
-		if (monitor && monitor->state == ADV_MONITOR_STATE_OFFLOADED)
+		if (monitor->state == ADV_MONITOR_STATE_OFFLOADED)
 			monitor->state = ADV_MONITOR_STATE_REGISTERED;
 
 		/* Do not free the monitor if it is being removed due to
 		 * suspend. It will be re-monitored on resume.
 		 */
-		if (monitor && !msft->suspending) {
+		if (!msft->suspending) {
 			hci_free_adv_monitor(hdev, monitor);
 
 			/* Clear any monitored devices by this Adv Monitor */
@@ -303,35 +294,19 @@ static void msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev,
 		kfree(handle_data);
 	}
 
-	/* If remove all monitors is required, we need to continue the process
-	 * here because the earlier it was paused when waiting for the
-	 * response from controller.
-	 */
-	if (msft->pending_remove_handle == 0) {
-		pending = hci_remove_all_adv_monitor(hdev, &err);
-		if (pending) {
-			hci_dev_unlock(hdev);
-			return;
-		}
-
-		if (err)
-			status = HCI_ERROR_UNSPECIFIED;
-	}
-
 	hci_dev_unlock(hdev);
 
 done:
-	if (!msft->suspending)
-		hci_remove_adv_monitor_complete(hdev, status);
+	return status;
 }
 
+/* This function requires the caller holds hci_req_sync_lock */
 static int msft_remove_monitor_sync(struct hci_dev *hdev,
 				    struct adv_monitor *monitor)
 {
 	struct msft_cp_le_cancel_monitor_advertisement cp;
 	struct msft_monitor_advertisement_handle_data *handle_data;
 	struct sk_buff *skb;
-	u8 status;
 
 	handle_data = msft_find_handle_data(hdev, monitor->handle, true);
 
@@ -347,13 +322,8 @@ static int msft_remove_monitor_sync(struct hci_dev *hdev,
 	if (IS_ERR(skb))
 		return PTR_ERR(skb);
 
-	status = skb->data[0];
-	skb_pull(skb, 1);
-
-	msft_le_cancel_monitor_advertisement_cb(hdev, status, hdev->msft_opcode,
-						skb);
-
-	return status;
+	return msft_le_cancel_monitor_advertisement_cb(hdev, hdev->msft_opcode,
+						       monitor, skb);
 }
 
 /* This function requires the caller holds hci_req_sync_lock */
@@ -811,38 +781,8 @@ int msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor)
 	return msft_add_monitor_sync(hdev, monitor);
 }
 
-/* This function requires the caller holds hdev->lock */
-static int __msft_remove_monitor(struct hci_dev *hdev,
-				 struct adv_monitor *monitor, u16 handle)
-{
-	struct msft_cp_le_cancel_monitor_advertisement cp;
-	struct msft_monitor_advertisement_handle_data *handle_data;
-	struct hci_request req;
-	struct msft_data *msft = hdev->msft_data;
-	int err = 0;
-
-	handle_data = msft_find_handle_data(hdev, monitor->handle, true);
-
-	/* If no matched handle, just remove without telling controller */
-	if (!handle_data)
-		return -ENOENT;
-
-	cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT;
-	cp.handle = handle_data->msft_handle;
-
-	hci_req_init(&req, hdev);
-	hci_req_add(&req, hdev->msft_opcode, sizeof(cp), &cp);
-	err = hci_req_run_skb(&req, msft_le_cancel_monitor_advertisement_cb);
-
-	if (!err)
-		msft->pending_remove_handle = handle;
-
-	return err;
-}
-
-/* This function requires the caller holds hdev->lock */
-int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor,
-			u16 handle)
+/* This function requires the caller holds hci_req_sync_lock */
+int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor)
 {
 	struct msft_data *msft = hdev->msft_data;
 
@@ -852,7 +792,7 @@ int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor,
 	if (msft->resuming || msft->suspending)
 		return -EBUSY;
 
-	return __msft_remove_monitor(hdev, monitor, handle);
+	return msft_remove_monitor_sync(hdev, monitor);
 }
 
 void msft_req_add_set_filter_enable(struct hci_request *req, bool enable)
diff --git a/net/bluetooth/msft.h b/net/bluetooth/msft.h
index afcaf7d3b1cb..2a63205b377b 100644
--- a/net/bluetooth/msft.h
+++ b/net/bluetooth/msft.h
@@ -20,8 +20,7 @@ void msft_do_close(struct hci_dev *hdev);
 void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb);
 __u64 msft_get_features(struct hci_dev *hdev);
 int msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor);
-int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor,
-			u16 handle);
+int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor);
 void msft_req_add_set_filter_enable(struct hci_request *req, bool enable);
 int msft_set_filter_enable(struct hci_dev *hdev, bool enable);
 int msft_suspend_sync(struct hci_dev *hdev);
@@ -49,8 +48,7 @@ static inline int msft_add_monitor_pattern(struct hci_dev *hdev,
 }
 
 static inline int msft_remove_monitor(struct hci_dev *hdev,
-				      struct adv_monitor *monitor,
-				      u16 handle)
+				      struct adv_monitor *monitor)
 {
 	return -EOPNOTSUPP;
 }
-- 
cgit 


From ca2045e059c3aa1b06c9aed448672bc86dfdce11 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Fri, 8 Apr 2022 15:07:44 -0700
Subject: Bluetooth: Add bt_status

This adds bt_status which can be used to convert Unix errno to
Bluetooth status.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/bluetooth.h |  1 +
 net/bluetooth/lib.c               | 71 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+)

(limited to 'include')

diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index a8b52175af05..686ce2591bb2 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -521,6 +521,7 @@ static inline struct sk_buff *bt_skb_sendmmsg(struct sock *sk,
 }
 
 int bt_to_errno(u16 code);
+__u8 bt_status(int err);
 
 void hci_sock_set_flag(struct sock *sk, int nr);
 void hci_sock_clear_flag(struct sock *sk, int nr);
diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c
index 5326f41a58b7..469a0c95b6e8 100644
--- a/net/bluetooth/lib.c
+++ b/net/bluetooth/lib.c
@@ -135,6 +135,77 @@ int bt_to_errno(__u16 code)
 }
 EXPORT_SYMBOL(bt_to_errno);
 
+/* Unix errno to Bluetooth error codes mapping */
+__u8 bt_status(int err)
+{
+	/* Don't convert if already positive value */
+	if (err >= 0)
+		return err;
+
+	switch (err) {
+	case -EBADRQC:
+		return 0x01;
+
+	case -ENOTCONN:
+		return 0x02;
+
+	case -EIO:
+		return 0x03;
+
+	case -EHOSTDOWN:
+		return 0x04;
+
+	case -EACCES:
+		return 0x05;
+
+	case -EBADE:
+		return 0x06;
+
+	case -ENOMEM:
+		return 0x07;
+
+	case -ETIMEDOUT:
+		return 0x08;
+
+	case -EMLINK:
+		return 0x09;
+
+	case EALREADY:
+		return 0x0b;
+
+	case -EBUSY:
+		return 0x0c;
+
+	case -ECONNREFUSED:
+		return 0x0d;
+
+	case -EOPNOTSUPP:
+		return 0x11;
+
+	case -EINVAL:
+		return 0x12;
+
+	case -ECONNRESET:
+		return 0x13;
+
+	case -ECONNABORTED:
+		return 0x16;
+
+	case ELOOP:
+		return 0x17;
+
+	case -EPROTONOSUPPORT:
+		return 0x1a;
+
+	case -EPROTO:
+		return 0x19;
+
+	default:
+		return 0x1f;
+	}
+}
+EXPORT_SYMBOL(bt_status);
+
 void bt_info(const char *format, ...)
 {
 	struct va_format vaf;
-- 
cgit 


From 1f7435c8f6558a94f75b408a74140bdcbd0f6dd1 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Tue, 21 Jun 2022 11:58:34 -0700
Subject: Bluetooth: mgmt: Fix using hci_conn_abort

This fixes using hci_conn_abort instead of using hci_conn_abort_sync.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_sync.h |  2 ++
 net/bluetooth/hci_sync.c         |  3 +--
 net/bluetooth/mgmt.c             | 38 +++++++++++++++++++++++++++++++++++---
 3 files changed, 38 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index 2492e3b46a8f..544e949b5dbf 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -105,4 +105,6 @@ int hci_resume_sync(struct hci_dev *hdev);
 
 struct hci_conn;
 
+int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason);
+
 int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn);
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 36a501959b6d..1fbeee970aa7 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -4489,8 +4489,7 @@ static int hci_reject_conn_sync(struct hci_dev *hdev, struct hci_conn *conn,
 				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
 }
 
-static int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn,
-			       u8 reason)
+int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason)
 {
 	int err;
 
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 05a680a56127..0c6878095709 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -2528,6 +2528,37 @@ static int device_unpaired(struct hci_dev *hdev, bdaddr_t *bdaddr,
 			  skip_sk);
 }
 
+static void unpair_device_complete(struct hci_dev *hdev, void *data, int err)
+{
+	struct mgmt_pending_cmd *cmd = data;
+	struct mgmt_cp_unpair_device *cp = cmd->param;
+
+	if (!err)
+		device_unpaired(hdev, &cp->addr.bdaddr, cp->addr.type, cmd->sk);
+
+	cmd->cmd_complete(cmd, err);
+	mgmt_pending_free(cmd);
+}
+
+static int unpair_device_sync(struct hci_dev *hdev, void *data)
+{
+	struct mgmt_pending_cmd *cmd = data;
+	struct mgmt_cp_unpair_device *cp = cmd->param;
+	struct hci_conn *conn;
+
+	if (cp->addr.type == BDADDR_BREDR)
+		conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK,
+					       &cp->addr.bdaddr);
+	else
+		conn = hci_conn_hash_lookup_le(hdev, &cp->addr.bdaddr,
+					       le_addr_type(cp->addr.type));
+
+	if (!conn)
+		return 0;
+
+	return hci_abort_conn_sync(hdev, conn, HCI_ERROR_REMOTE_USER_TERM);
+}
+
 static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data,
 			 u16 len)
 {
@@ -2638,7 +2669,7 @@ done:
 		goto unlock;
 	}
 
-	cmd = mgmt_pending_add(sk, MGMT_OP_UNPAIR_DEVICE, hdev, cp,
+	cmd = mgmt_pending_new(sk, MGMT_OP_UNPAIR_DEVICE, hdev, cp,
 			       sizeof(*cp));
 	if (!cmd) {
 		err = -ENOMEM;
@@ -2647,9 +2678,10 @@ done:
 
 	cmd->cmd_complete = addr_cmd_complete;
 
-	err = hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM);
+	err = hci_cmd_sync_queue(hdev, unpair_device_sync, cmd,
+				 unpair_device_complete);
 	if (err < 0)
-		mgmt_pending_remove(cmd);
+		mgmt_pending_free(cmd);
 
 unlock:
 	hci_dev_unlock(hdev);
-- 
cgit 


From 14b80582c43e4f550acfd93c2b2cadbe36ea0874 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 20 May 2022 13:41:24 -0700
Subject: resource: Introduce alloc_free_mem_region()

The core of devm_request_free_mem_region() is a helper that searches for
free space in iomem_resource and performs __request_region_locked() on
the result of that search. The policy choices of the implementation
conform to what CONFIG_DEVICE_PRIVATE users want which is memory that is
immediately marked busy, and a preference to search for the first-fit
free range in descending order from the top of the physical address
space.

CXL has a need for a similar allocator, but with the following tweaks:

1/ Search for free space in ascending order

2/ Search for free space relative to a given CXL window

3/ 'insert' rather than 'request' the new resource given downstream
   drivers from the CXL Region driver (like the pmem or dax drivers) are
   responsible for request_mem_region() when they activate the memory
   range.

Rework __request_free_mem_region() into get_free_mem_region() which
takes a set of GFR_* (Get Free Region) flags to control the allocation
policy (ascending vs descending), and "busy" policy (insert_resource()
vs request_region()).

As part of the consolidation of the legacy GFR_REQUEST_REGION case with
the new default of just inserting a new resource into the free space
some minor cleanups like not checking for NULL before calling
devres_free() (which does its own check) is included.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/linux-cxl/20220420143406.GY2120790@nvidia.com/
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://lore.kernel.org/r/165784333333.1758207.13703329337805274043.stgit@dwillia2-xfh.jf.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/ioport.h |   2 +
 kernel/resource.c      | 178 +++++++++++++++++++++++++++++++++++++++----------
 mm/Kconfig             |   5 ++
 3 files changed, 150 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 79d1ad6d6275..616b683563a9 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -330,6 +330,8 @@ struct resource *devm_request_free_mem_region(struct device *dev,
 		struct resource *base, unsigned long size);
 struct resource *request_free_mem_region(struct resource *base,
 		unsigned long size, const char *name);
+struct resource *alloc_free_mem_region(struct resource *base,
+		unsigned long size, unsigned long align, const char *name);
 
 static inline void irqresource_disabled(struct resource *res, u32 irq)
 {
diff --git a/kernel/resource.c b/kernel/resource.c
index 53a534db350e..4c5e80b92f2f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -489,8 +489,9 @@ int __weak page_is_ram(unsigned long pfn)
 }
 EXPORT_SYMBOL_GPL(page_is_ram);
 
-static int __region_intersects(resource_size_t start, size_t size,
-			unsigned long flags, unsigned long desc)
+static int __region_intersects(struct resource *parent, resource_size_t start,
+			       size_t size, unsigned long flags,
+			       unsigned long desc)
 {
 	struct resource res;
 	int type = 0; int other = 0;
@@ -499,7 +500,7 @@ static int __region_intersects(resource_size_t start, size_t size,
 	res.start = start;
 	res.end = start + size - 1;
 
-	for (p = iomem_resource.child; p ; p = p->sibling) {
+	for (p = parent->child; p ; p = p->sibling) {
 		bool is_type = (((p->flags & flags) == flags) &&
 				((desc == IORES_DESC_NONE) ||
 				 (desc == p->desc)));
@@ -543,7 +544,7 @@ int region_intersects(resource_size_t start, size_t size, unsigned long flags,
 	int ret;
 
 	read_lock(&resource_lock);
-	ret = __region_intersects(start, size, flags, desc);
+	ret = __region_intersects(&iomem_resource, start, size, flags, desc);
 	read_unlock(&resource_lock);
 
 	return ret;
@@ -1780,62 +1781,139 @@ void resource_list_free(struct list_head *head)
 }
 EXPORT_SYMBOL(resource_list_free);
 
-#ifdef CONFIG_DEVICE_PRIVATE
-static struct resource *__request_free_mem_region(struct device *dev,
-		struct resource *base, unsigned long size, const char *name)
+#ifdef CONFIG_GET_FREE_REGION
+#define GFR_DESCENDING		(1UL << 0)
+#define GFR_REQUEST_REGION	(1UL << 1)
+#define GFR_DEFAULT_ALIGN (1UL << PA_SECTION_SHIFT)
+
+static resource_size_t gfr_start(struct resource *base, resource_size_t size,
+				 resource_size_t align, unsigned long flags)
+{
+	if (flags & GFR_DESCENDING) {
+		resource_size_t end;
+
+		end = min_t(resource_size_t, base->end,
+			    (1ULL << MAX_PHYSMEM_BITS) - 1);
+		return end - size + 1;
+	}
+
+	return ALIGN(base->start, align);
+}
+
+static bool gfr_continue(struct resource *base, resource_size_t addr,
+			 resource_size_t size, unsigned long flags)
+{
+	if (flags & GFR_DESCENDING)
+		return addr > size && addr >= base->start;
+	/*
+	 * In the ascend case be careful that the last increment by
+	 * @size did not wrap 0.
+	 */
+	return addr > addr - size &&
+	       addr <= min_t(resource_size_t, base->end,
+			     (1ULL << MAX_PHYSMEM_BITS) - 1);
+}
+
+static resource_size_t gfr_next(resource_size_t addr, resource_size_t size,
+				unsigned long flags)
+{
+	if (flags & GFR_DESCENDING)
+		return addr - size;
+	return addr + size;
+}
+
+static void remove_free_mem_region(void *_res)
+{
+	struct resource *res = _res;
+
+	if (res->parent)
+		remove_resource(res);
+	free_resource(res);
+}
+
+static struct resource *
+get_free_mem_region(struct device *dev, struct resource *base,
+		    resource_size_t size, const unsigned long align,
+		    const char *name, const unsigned long desc,
+		    const unsigned long flags)
 {
-	resource_size_t end, addr;
+	resource_size_t addr;
 	struct resource *res;
 	struct region_devres *dr = NULL;
 
-	size = ALIGN(size, 1UL << PA_SECTION_SHIFT);
-	end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1);
-	addr = end - size + 1UL;
+	size = ALIGN(size, align);
 
 	res = alloc_resource(GFP_KERNEL);
 	if (!res)
 		return ERR_PTR(-ENOMEM);
 
-	if (dev) {
+	if (dev && (flags & GFR_REQUEST_REGION)) {
 		dr = devres_alloc(devm_region_release,
 				sizeof(struct region_devres), GFP_KERNEL);
 		if (!dr) {
 			free_resource(res);
 			return ERR_PTR(-ENOMEM);
 		}
+	} else if (dev) {
+		if (devm_add_action_or_reset(dev, remove_free_mem_region, res))
+			return ERR_PTR(-ENOMEM);
 	}
 
 	write_lock(&resource_lock);
-	for (; addr > size && addr >= base->start; addr -= size) {
-		if (__region_intersects(addr, size, 0, IORES_DESC_NONE) !=
-				REGION_DISJOINT)
+	for (addr = gfr_start(base, size, align, flags);
+	     gfr_continue(base, addr, size, flags);
+	     addr = gfr_next(addr, size, flags)) {
+		if (__region_intersects(base, addr, size, 0, IORES_DESC_NONE) !=
+		    REGION_DISJOINT)
 			continue;
 
-		if (__request_region_locked(res, &iomem_resource, addr, size,
-						name, 0))
-			break;
+		if (flags & GFR_REQUEST_REGION) {
+			if (__request_region_locked(res, &iomem_resource, addr,
+						    size, name, 0))
+				break;
 
-		if (dev) {
-			dr->parent = &iomem_resource;
-			dr->start = addr;
-			dr->n = size;
-			devres_add(dev, dr);
-		}
+			if (dev) {
+				dr->parent = &iomem_resource;
+				dr->start = addr;
+				dr->n = size;
+				devres_add(dev, dr);
+			}
 
-		res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
-		write_unlock(&resource_lock);
+			res->desc = desc;
+			write_unlock(&resource_lock);
+
+
+			/*
+			 * A driver is claiming this region so revoke any
+			 * mappings.
+			 */
+			revoke_iomem(res);
+		} else {
+			res->start = addr;
+			res->end = addr + size - 1;
+			res->name = name;
+			res->desc = desc;
+			res->flags = IORESOURCE_MEM;
+
+			/*
+			 * Only succeed if the resource hosts an exclusive
+			 * range after the insert
+			 */
+			if (__insert_resource(base, res) || res->child)
+				break;
+
+			write_unlock(&resource_lock);
+		}
 
-		/*
-		 * A driver is claiming this region so revoke any mappings.
-		 */
-		revoke_iomem(res);
 		return res;
 	}
 	write_unlock(&resource_lock);
 
-	free_resource(res);
-	if (dr)
+	if (flags & GFR_REQUEST_REGION) {
+		free_resource(res);
 		devres_free(dr);
+	} else if (dev)
+		devm_release_action(dev, remove_free_mem_region, res);
 
 	return ERR_PTR(-ERANGE);
 }
@@ -1854,18 +1932,48 @@ static struct resource *__request_free_mem_region(struct device *dev,
 struct resource *devm_request_free_mem_region(struct device *dev,
 		struct resource *base, unsigned long size)
 {
-	return __request_free_mem_region(dev, base, size, dev_name(dev));
+	unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION;
+
+	return get_free_mem_region(dev, base, size, GFR_DEFAULT_ALIGN,
+				   dev_name(dev),
+				   IORES_DESC_DEVICE_PRIVATE_MEMORY, flags);
 }
 EXPORT_SYMBOL_GPL(devm_request_free_mem_region);
 
 struct resource *request_free_mem_region(struct resource *base,
 		unsigned long size, const char *name)
 {
-	return __request_free_mem_region(NULL, base, size, name);
+	unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION;
+
+	return get_free_mem_region(NULL, base, size, GFR_DEFAULT_ALIGN, name,
+				   IORES_DESC_DEVICE_PRIVATE_MEMORY, flags);
 }
 EXPORT_SYMBOL_GPL(request_free_mem_region);
 
-#endif /* CONFIG_DEVICE_PRIVATE */
+/**
+ * alloc_free_mem_region - find a free region relative to @base
+ * @base: resource that will parent the new resource
+ * @size: size in bytes of memory to allocate from @base
+ * @align: alignment requirements for the allocation
+ * @name: resource name
+ *
+ * Buses like CXL, that can dynamically instantiate new memory regions,
+ * need a method to allocate physical address space for those regions.
+ * Allocate and insert a new resource to cover a free, unclaimed by a
+ * descendant of @base, range in the span of @base.
+ */
+struct resource *alloc_free_mem_region(struct resource *base,
+				       unsigned long size, unsigned long align,
+				       const char *name)
+{
+	/* Default of ascending direction and insert resource */
+	unsigned long flags = 0;
+
+	return get_free_mem_region(NULL, base, size, align, name,
+				   IORES_DESC_NONE, flags);
+}
+EXPORT_SYMBOL_NS_GPL(alloc_free_mem_region, CXL);
+#endif /* CONFIG_GET_FREE_REGION */
 
 static int __init strict_iomem(char *str)
 {
diff --git a/mm/Kconfig b/mm/Kconfig
index 169e64192e48..a5b4fee2e3fd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -994,9 +994,14 @@ config HMM_MIRROR
 	bool
 	depends on MMU
 
+config GET_FREE_REGION
+	depends on SPARSEMEM
+	bool
+
 config DEVICE_PRIVATE
 	bool "Unaddressable device memory (GPU memory, ...)"
 	depends on ZONE_DEVICE
+	select GET_FREE_REGION
 
 	help
 	  Allows creation of struct pages to represent unaddressable device
-- 
cgit 


From d96c52fe4907c68adc5e61a0bef7aec0933223d5 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 15 Apr 2022 10:55:42 -0700
Subject: rcu: Add polled expedited grace-period primitives

This commit adds expedited grace-period functionality to RCU's polled
grace-period API, adding start_poll_synchronize_rcu_expedited() and
cond_synchronize_rcu_expedited(), which are similar to the existing
start_poll_synchronize_rcu() and cond_synchronize_rcu() functions,
respectively.

Note that although start_poll_synchronize_rcu_expedited() can be invoked
very early, the resulting expedited grace periods are not guaranteed
to start until after workqueues are fully initialized.  On the other
hand, both synchronize_rcu() and synchronize_rcu_expedited() can also
be invoked very early, and the resulting grace periods will be taken
into account as they occur.

[ paulmck: Apply feedback from Neeraj Upadhyay. ]

Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/
Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing
Cc: Brian Foster <bfoster@redhat.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Ian Kent <raven@themaw.net>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcutiny.h | 10 ++++++
 include/linux/rcutree.h |  2 ++
 kernel/rcu/tree.c       | 17 +++++++---
 kernel/rcu/tree.h       |  7 ++++
 kernel/rcu/tree_exp.h   | 85 +++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 116 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 5fed476f977f..ab7e20dfb07b 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -23,6 +23,16 @@ static inline void cond_synchronize_rcu(unsigned long oldstate)
 	might_sleep();
 }
 
+static inline unsigned long start_poll_synchronize_rcu_expedited(void)
+{
+	return start_poll_synchronize_rcu();
+}
+
+static inline void cond_synchronize_rcu_expedited(unsigned long oldstate)
+{
+	cond_synchronize_rcu(oldstate);
+}
+
 extern void rcu_barrier(void);
 
 static inline void synchronize_rcu_expedited(void)
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 9c6cfb742504..20dbaa9a3882 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -40,6 +40,8 @@ bool rcu_eqs_special_set(int cpu);
 void rcu_momentary_dyntick_idle(void);
 void kfree_rcu_scheduler_running(void);
 bool rcu_gp_might_be_stalled(void);
+unsigned long start_poll_synchronize_rcu_expedited(void);
+void cond_synchronize_rcu_expedited(unsigned long oldstate);
 unsigned long get_state_synchronize_rcu(void);
 unsigned long start_poll_synchronize_rcu(void);
 bool poll_state_synchronize_rcu(unsigned long oldstate);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1505b02b4e53..6cf5b51622cd 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4027,20 +4027,20 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
 /**
  * cond_synchronize_rcu - Conditionally wait for an RCU grace period
  *
- * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
+ * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
  *
  * If a full RCU grace period has elapsed since the earlier call to
  * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return.
  * Otherwise, invoke synchronize_rcu() to wait for a full grace period.
  *
- * Yes, this function does not take counter wrap into account.  But
- * counter wrap is harmless.  If the counter wraps, we have waited for
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless.  If the counter wraps, we have waited for
  * more than 2 billion grace periods (and way more on a 64-bit system!),
- * so waiting for one additional grace period should be just fine.
+ * so waiting for a couple of additional grace periods should be just fine.
  *
  * This function provides the same memory-ordering guarantees that
  * would be provided by a synchronize_rcu() that was invoked at the call
- * to the function that provided @oldstate, and that returned at the end
+ * to the function that provided @oldstate and that returned at the end
  * of this function.
  */
 void cond_synchronize_rcu(unsigned long oldstate)
@@ -4793,6 +4793,9 @@ static void __init rcu_init_one(void)
 			init_waitqueue_head(&rnp->exp_wq[3]);
 			spin_lock_init(&rnp->exp_lock);
 			mutex_init(&rnp->boost_kthread_mutex);
+			raw_spin_lock_init(&rnp->exp_poll_lock);
+			rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
+			INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp);
 		}
 	}
 
@@ -5018,6 +5021,10 @@ void __init rcu_init(void)
 		qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;
 	else
 		qovld_calc = qovld;
+
+	// Kick-start any polled grace periods that started early.
+	if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1))
+		(void)start_poll_synchronize_rcu_expedited();
 }
 
 #include "tree_stall.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 5634e76106c4..fb77deca5f5c 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -133,6 +133,10 @@ struct rcu_node {
 	wait_queue_head_t exp_wq[4];
 	struct rcu_exp_work rew;
 	bool exp_need_flush;	/* Need to flush workitem? */
+	raw_spinlock_t exp_poll_lock;
+				/* Lock and data for polled expedited grace periods. */
+	unsigned long exp_seq_poll_rq;
+	struct work_struct exp_poll_wq;
 } ____cacheline_internodealigned_in_smp;
 
 /*
@@ -484,3 +488,6 @@ static void rcu_iw_handler(struct irq_work *iwp);
 static void check_cpu_stall(struct rcu_data *rdp);
 static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
 				     const unsigned long gpssdelay);
+
+/* Forward declarations for tree_exp.h. */
+static void sync_rcu_do_polled_gp(struct work_struct *wp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index e0258066b881..571b0a700cce 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -962,3 +962,88 @@ void synchronize_rcu_expedited(void)
 		synchronize_rcu_expedited_destroy_work(&rew);
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+/*
+ * Ensure that start_poll_synchronize_rcu_expedited() has the expedited
+ * RCU grace periods that it needs.
+ */
+static void sync_rcu_do_polled_gp(struct work_struct *wp)
+{
+	unsigned long flags;
+	struct rcu_node *rnp = container_of(wp, struct rcu_node, exp_poll_wq);
+	unsigned long s;
+
+	raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
+	s = rnp->exp_seq_poll_rq;
+	rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
+	raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
+	if (s == RCU_GET_STATE_COMPLETED)
+		return;
+	while (!poll_state_synchronize_rcu(s))
+		synchronize_rcu_expedited();
+	raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
+	s = rnp->exp_seq_poll_rq;
+	if (poll_state_synchronize_rcu(s))
+		rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
+	raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
+}
+
+/**
+ * start_poll_synchronize_rcu_expedited - Snapshot current RCU state and start expedited grace period
+ *
+ * Returns a cookie to pass to a call to cond_synchronize_rcu(),
+ * cond_synchronize_rcu_expedited(), or poll_state_synchronize_rcu(),
+ * allowing them to determine whether or not any sort of grace period has
+ * elapsed in the meantime.  If the needed expedited grace period is not
+ * already slated to start, initiates that grace period.
+ */
+unsigned long start_poll_synchronize_rcu_expedited(void)
+{
+	unsigned long flags;
+	struct rcu_data *rdp;
+	struct rcu_node *rnp;
+	unsigned long s;
+
+	s = get_state_synchronize_rcu();
+	rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id());
+	rnp = rdp->mynode;
+	if (rcu_init_invoked())
+		raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
+	if (!poll_state_synchronize_rcu(s)) {
+		rnp->exp_seq_poll_rq = s;
+		if (rcu_init_invoked())
+			queue_work(rcu_gp_wq, &rnp->exp_poll_wq);
+	}
+	if (rcu_init_invoked())
+		raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
+
+	return s;
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited);
+
+/**
+ * cond_synchronize_rcu_expedited - Conditionally wait for an expedited RCU grace period
+ *
+ * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
+ *
+ * If any type of full RCU grace period has elapsed since the earlier
+ * call to get_state_synchronize_rcu(), start_poll_synchronize_rcu(),
+ * or start_poll_synchronize_rcu_expedited(), just return.  Otherwise,
+ * invoke synchronize_rcu_expedited() to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless.  If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @oldstate and that returned at the end
+ * of this function.
+ */
+void cond_synchronize_rcu_expedited(unsigned long oldstate)
+{
+	if (!poll_state_synchronize_rcu(oldstate))
+		synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited);
-- 
cgit 


From ab21d6063c01180a8e9b22a37b847e5819525d9f Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Thu, 21 Jul 2022 15:42:33 +0200
Subject: bpf: Introduce 8-byte BTF set

Introduce support for defining flags for kfuncs using a new set of
macros, BTF_SET8_START/BTF_SET8_END, which define a set which contains
8 byte elements (each of which consists of a pair of BTF ID and flags),
using a new BTF_ID_FLAGS macro.

This will be used to tag kfuncs registered for a certain program type
as acquire, release, sleepable, ret_null, etc. without having to create
more and more sets which was proving to be an unscalable solution.

Now, when looking up whether a kfunc is allowed for a certain program,
we can also obtain its kfunc flags in the same call and avoid further
lookups.

The resolve_btfids change is split into a separate patch.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220721134245.2450-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf_ids.h | 68 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 64 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index 252a4befeab1..3cb0741e71d7 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -8,6 +8,15 @@ struct btf_id_set {
 	u32 ids[];
 };
 
+struct btf_id_set8 {
+	u32 cnt;
+	u32 flags;
+	struct {
+		u32 id;
+		u32 flags;
+	} pairs[];
+};
+
 #ifdef CONFIG_DEBUG_INFO_BTF
 
 #include <linux/compiler.h> /* for __PASTE */
@@ -25,7 +34,7 @@ struct btf_id_set {
 
 #define BTF_IDS_SECTION ".BTF_ids"
 
-#define ____BTF_ID(symbol)				\
+#define ____BTF_ID(symbol, word)			\
 asm(							\
 ".pushsection " BTF_IDS_SECTION ",\"a\";       \n"	\
 ".local " #symbol " ;                          \n"	\
@@ -33,10 +42,11 @@ asm(							\
 ".size  " #symbol ", 4;                        \n"	\
 #symbol ":                                     \n"	\
 ".zero 4                                       \n"	\
+word							\
 ".popsection;                                  \n");
 
-#define __BTF_ID(symbol) \
-	____BTF_ID(symbol)
+#define __BTF_ID(symbol, word) \
+	____BTF_ID(symbol, word)
 
 #define __ID(prefix) \
 	__PASTE(prefix, __COUNTER__)
@@ -46,7 +56,14 @@ asm(							\
  * to 4 zero bytes.
  */
 #define BTF_ID(prefix, name) \
-	__BTF_ID(__ID(__BTF_ID__##prefix##__##name##__))
+	__BTF_ID(__ID(__BTF_ID__##prefix##__##name##__), "")
+
+#define ____BTF_ID_FLAGS(prefix, name, flags) \
+	__BTF_ID(__ID(__BTF_ID__##prefix##__##name##__), ".long " #flags "\n")
+#define __BTF_ID_FLAGS(prefix, name, flags, ...) \
+	____BTF_ID_FLAGS(prefix, name, flags)
+#define BTF_ID_FLAGS(prefix, name, ...) \
+	__BTF_ID_FLAGS(prefix, name, ##__VA_ARGS__, 0)
 
 /*
  * The BTF_ID_LIST macro defines pure (unsorted) list
@@ -145,10 +162,51 @@ asm(							\
 ".popsection;                                 \n");	\
 extern struct btf_id_set name;
 
+/*
+ * The BTF_SET8_START/END macros pair defines sorted list of
+ * BTF IDs and their flags plus its members count, with the
+ * following layout:
+ *
+ * BTF_SET8_START(list)
+ * BTF_ID_FLAGS(type1, name1, flags)
+ * BTF_ID_FLAGS(type2, name2, flags)
+ * BTF_SET8_END(list)
+ *
+ * __BTF_ID__set8__list:
+ * .zero 8
+ * list:
+ * __BTF_ID__type1__name1__3:
+ * .zero 4
+ * .word (1 << 0) | (1 << 2)
+ * __BTF_ID__type2__name2__5:
+ * .zero 4
+ * .word (1 << 3) | (1 << 1) | (1 << 2)
+ *
+ */
+#define __BTF_SET8_START(name, scope)			\
+asm(							\
+".pushsection " BTF_IDS_SECTION ",\"a\";       \n"	\
+"." #scope " __BTF_ID__set8__" #name ";        \n"	\
+"__BTF_ID__set8__" #name ":;                   \n"	\
+".zero 8                                       \n"	\
+".popsection;                                  \n");
+
+#define BTF_SET8_START(name)				\
+__BTF_ID_LIST(name, local)				\
+__BTF_SET8_START(name, local)
+
+#define BTF_SET8_END(name)				\
+asm(							\
+".pushsection " BTF_IDS_SECTION ",\"a\";      \n"	\
+".size __BTF_ID__set8__" #name ", .-" #name "  \n"	\
+".popsection;                                 \n");	\
+extern struct btf_id_set8 name;
+
 #else
 
 #define BTF_ID_LIST(name) static u32 __maybe_unused name[5];
 #define BTF_ID(prefix, name)
+#define BTF_ID_FLAGS(prefix, name, flags)
 #define BTF_ID_UNUSED
 #define BTF_ID_LIST_GLOBAL(name, n) u32 __maybe_unused name[n];
 #define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 __maybe_unused name[1];
@@ -156,6 +214,8 @@ extern struct btf_id_set name;
 #define BTF_SET_START(name) static struct btf_id_set __maybe_unused name = { 0 };
 #define BTF_SET_START_GLOBAL(name) static struct btf_id_set __maybe_unused name = { 0 };
 #define BTF_SET_END(name)
+#define BTF_SET8_START(name) static struct btf_id_set8 __maybe_unused name = { 0 };
+#define BTF_SET8_END(name) static struct btf_id_set8 __maybe_unused name = { 0 };
 
 #endif /* CONFIG_DEBUG_INFO_BTF */
 
-- 
cgit 


From a4703e3184320d6e15e2bc81d2ccf1c8c883f9d1 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Thu, 21 Jul 2022 15:42:35 +0200
Subject: bpf: Switch to new kfunc flags infrastructure

Instead of populating multiple sets to indicate some attribute and then
researching the same BTF ID in them, prepare a single unified BTF set
which indicates whether a kfunc is allowed to be called, and also its
attributes if any at the same time. Now, only one call is needed to
perform the lookup for both kfunc availability and its attributes.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220721134245.2450-4-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                                |   3 +-
 include/linux/btf.h                                |  33 ++-----
 kernel/bpf/btf.c                                   | 106 ++++++++++-----------
 kernel/bpf/verifier.c                              |  14 ++-
 net/bpf/test_run.c                                 |  70 +++++---------
 net/ipv4/bpf_tcp_ca.c                              |  18 ++--
 net/ipv4/tcp_bbr.c                                 |  24 ++---
 net/ipv4/tcp_cubic.c                               |  20 ++--
 net/ipv4/tcp_dctcp.c                               |  20 ++--
 net/netfilter/nf_conntrack_bpf.c                   |  49 +++-------
 .../selftests/bpf/bpf_testmod/bpf_testmod.c        |  10 +-
 11 files changed, 145 insertions(+), 222 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 11950029284f..a97751d845c9 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1924,7 +1924,8 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
 				struct bpf_reg_state *regs);
 int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
 			      const struct btf *btf, u32 func_id,
-			      struct bpf_reg_state *regs);
+			      struct bpf_reg_state *regs,
+			      u32 kfunc_flags);
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 			  struct bpf_reg_state *reg);
 int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 1bfed7fa0428..6dfc6eaf7f8c 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -12,14 +12,11 @@
 #define BTF_TYPE_EMIT(type) ((void)(type *)0)
 #define BTF_TYPE_EMIT_ENUM(enum_val) ((void)enum_val)
 
-enum btf_kfunc_type {
-	BTF_KFUNC_TYPE_CHECK,
-	BTF_KFUNC_TYPE_ACQUIRE,
-	BTF_KFUNC_TYPE_RELEASE,
-	BTF_KFUNC_TYPE_RET_NULL,
-	BTF_KFUNC_TYPE_KPTR_ACQUIRE,
-	BTF_KFUNC_TYPE_MAX,
-};
+/* These need to be macros, as the expressions are used in assembler input */
+#define KF_ACQUIRE	(1 << 0) /* kfunc is an acquire function */
+#define KF_RELEASE	(1 << 1) /* kfunc is a release function */
+#define KF_RET_NULL	(1 << 2) /* kfunc returns a pointer that may be NULL */
+#define KF_KPTR_GET	(1 << 3) /* kfunc returns reference to a kptr */
 
 struct btf;
 struct btf_member;
@@ -30,16 +27,7 @@ struct btf_id_set;
 
 struct btf_kfunc_id_set {
 	struct module *owner;
-	union {
-		struct {
-			struct btf_id_set *check_set;
-			struct btf_id_set *acquire_set;
-			struct btf_id_set *release_set;
-			struct btf_id_set *ret_null_set;
-			struct btf_id_set *kptr_acquire_set;
-		};
-		struct btf_id_set *sets[BTF_KFUNC_TYPE_MAX];
-	};
+	struct btf_id_set8 *set;
 };
 
 struct btf_id_dtor_kfunc {
@@ -378,9 +366,9 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
 const char *btf_name_by_offset(const struct btf *btf, u32 offset);
 struct btf *btf_parse_vmlinux(void);
 struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog);
-bool btf_kfunc_id_set_contains(const struct btf *btf,
+u32 *btf_kfunc_id_set_contains(const struct btf *btf,
 			       enum bpf_prog_type prog_type,
-			       enum btf_kfunc_type type, u32 kfunc_btf_id);
+			       u32 kfunc_btf_id);
 int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
 			      const struct btf_kfunc_id_set *s);
 s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id);
@@ -397,12 +385,11 @@ static inline const char *btf_name_by_offset(const struct btf *btf,
 {
 	return NULL;
 }
-static inline bool btf_kfunc_id_set_contains(const struct btf *btf,
+static inline u32 *btf_kfunc_id_set_contains(const struct btf *btf,
 					     enum bpf_prog_type prog_type,
-					     enum btf_kfunc_type type,
 					     u32 kfunc_btf_id)
 {
-	return false;
+	return NULL;
 }
 static inline int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
 					    const struct btf_kfunc_id_set *s)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 5869f03bcb6e..4d9c2d88720f 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -213,7 +213,7 @@ enum {
 };
 
 struct btf_kfunc_set_tab {
-	struct btf_id_set *sets[BTF_KFUNC_HOOK_MAX][BTF_KFUNC_TYPE_MAX];
+	struct btf_id_set8 *sets[BTF_KFUNC_HOOK_MAX];
 };
 
 struct btf_id_dtor_kfunc_tab {
@@ -1616,7 +1616,7 @@ static void btf_free_id(struct btf *btf)
 static void btf_free_kfunc_set_tab(struct btf *btf)
 {
 	struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab;
-	int hook, type;
+	int hook;
 
 	if (!tab)
 		return;
@@ -1625,10 +1625,8 @@ static void btf_free_kfunc_set_tab(struct btf *btf)
 	 */
 	if (btf_is_module(btf))
 		goto free_tab;
-	for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++) {
-		for (type = 0; type < ARRAY_SIZE(tab->sets[0]); type++)
-			kfree(tab->sets[hook][type]);
-	}
+	for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++)
+		kfree(tab->sets[hook]);
 free_tab:
 	kfree(tab);
 	btf->kfunc_set_tab = NULL;
@@ -6172,7 +6170,8 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf,
 static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 				    const struct btf *btf, u32 func_id,
 				    struct bpf_reg_state *regs,
-				    bool ptr_to_mem_ok)
+				    bool ptr_to_mem_ok,
+				    u32 kfunc_flags)
 {
 	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
 	struct bpf_verifier_log *log = &env->log;
@@ -6210,10 +6209,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 
 	if (is_kfunc) {
 		/* Only kfunc can be release func */
-		rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog),
-						BTF_KFUNC_TYPE_RELEASE, func_id);
-		kptr_get = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog),
-						     BTF_KFUNC_TYPE_KPTR_ACQUIRE, func_id);
+		rel = kfunc_flags & KF_RELEASE;
+		kptr_get = kfunc_flags & KF_KPTR_GET;
 	}
 
 	/* check that BTF function arguments match actual types that the
@@ -6442,7 +6439,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
 		return -EINVAL;
 
 	is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
-	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global);
+	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0);
 
 	/* Compiler optimizations can remove arguments from static functions
 	 * or mismatched type can be passed into a global function.
@@ -6455,9 +6452,10 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
 
 int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
 			      const struct btf *btf, u32 func_id,
-			      struct bpf_reg_state *regs)
+			      struct bpf_reg_state *regs,
+			      u32 kfunc_flags)
 {
-	return btf_check_func_arg_match(env, btf, func_id, regs, true);
+	return btf_check_func_arg_match(env, btf, func_id, regs, true, kfunc_flags);
 }
 
 /* Convert BTF of a function into bpf_reg_state if possible
@@ -6854,6 +6852,11 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id)
 	return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL;
 }
 
+static void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id)
+{
+	return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func);
+}
+
 enum {
 	BTF_MODULE_F_LIVE = (1 << 0),
 };
@@ -7102,16 +7105,16 @@ BTF_TRACING_TYPE_xxx
 
 /* Kernel Function (kfunc) BTF ID set registration API */
 
-static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
-				    enum btf_kfunc_type type,
-				    struct btf_id_set *add_set, bool vmlinux_set)
+static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
+				  struct btf_id_set8 *add_set)
 {
+	bool vmlinux_set = !btf_is_module(btf);
 	struct btf_kfunc_set_tab *tab;
-	struct btf_id_set *set;
+	struct btf_id_set8 *set;
 	u32 set_cnt;
 	int ret;
 
-	if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX) {
+	if (hook >= BTF_KFUNC_HOOK_MAX) {
 		ret = -EINVAL;
 		goto end;
 	}
@@ -7127,7 +7130,7 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
 		btf->kfunc_set_tab = tab;
 	}
 
-	set = tab->sets[hook][type];
+	set = tab->sets[hook];
 	/* Warn when register_btf_kfunc_id_set is called twice for the same hook
 	 * for module sets.
 	 */
@@ -7141,7 +7144,7 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
 	 * pointer and return.
 	 */
 	if (!vmlinux_set) {
-		tab->sets[hook][type] = add_set;
+		tab->sets[hook] = add_set;
 		return 0;
 	}
 
@@ -7150,7 +7153,7 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
 	 * and concatenate all individual sets being registered. While each set
 	 * is individually sorted, they may become unsorted when concatenated,
 	 * hence re-sorting the final set again is required to make binary
-	 * searching the set using btf_id_set_contains function work.
+	 * searching the set using btf_id_set8_contains function work.
 	 */
 	set_cnt = set ? set->cnt : 0;
 
@@ -7165,8 +7168,8 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
 	}
 
 	/* Grow set */
-	set = krealloc(tab->sets[hook][type],
-		       offsetof(struct btf_id_set, ids[set_cnt + add_set->cnt]),
+	set = krealloc(tab->sets[hook],
+		       offsetof(struct btf_id_set8, pairs[set_cnt + add_set->cnt]),
 		       GFP_KERNEL | __GFP_NOWARN);
 	if (!set) {
 		ret = -ENOMEM;
@@ -7174,15 +7177,15 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
 	}
 
 	/* For newly allocated set, initialize set->cnt to 0 */
-	if (!tab->sets[hook][type])
+	if (!tab->sets[hook])
 		set->cnt = 0;
-	tab->sets[hook][type] = set;
+	tab->sets[hook] = set;
 
 	/* Concatenate the two sets */
-	memcpy(set->ids + set->cnt, add_set->ids, add_set->cnt * sizeof(set->ids[0]));
+	memcpy(set->pairs + set->cnt, add_set->pairs, add_set->cnt * sizeof(set->pairs[0]));
 	set->cnt += add_set->cnt;
 
-	sort(set->ids, set->cnt, sizeof(set->ids[0]), btf_id_cmp_func, NULL);
+	sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL);
 
 	return 0;
 end:
@@ -7190,38 +7193,25 @@ end:
 	return ret;
 }
 
-static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
-				  const struct btf_kfunc_id_set *kset)
-{
-	bool vmlinux_set = !btf_is_module(btf);
-	int type, ret = 0;
-
-	for (type = 0; type < ARRAY_SIZE(kset->sets); type++) {
-		if (!kset->sets[type])
-			continue;
-
-		ret = __btf_populate_kfunc_set(btf, hook, type, kset->sets[type], vmlinux_set);
-		if (ret)
-			break;
-	}
-	return ret;
-}
-
-static bool __btf_kfunc_id_set_contains(const struct btf *btf,
+static u32 *__btf_kfunc_id_set_contains(const struct btf *btf,
 					enum btf_kfunc_hook hook,
-					enum btf_kfunc_type type,
 					u32 kfunc_btf_id)
 {
-	struct btf_id_set *set;
+	struct btf_id_set8 *set;
+	u32 *id;
 
-	if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX)
-		return false;
+	if (hook >= BTF_KFUNC_HOOK_MAX)
+		return NULL;
 	if (!btf->kfunc_set_tab)
-		return false;
-	set = btf->kfunc_set_tab->sets[hook][type];
+		return NULL;
+	set = btf->kfunc_set_tab->sets[hook];
 	if (!set)
-		return false;
-	return btf_id_set_contains(set, kfunc_btf_id);
+		return NULL;
+	id = btf_id_set8_contains(set, kfunc_btf_id);
+	if (!id)
+		return NULL;
+	/* The flags for BTF ID are located next to it */
+	return id + 1;
 }
 
 static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
@@ -7249,14 +7239,14 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
  * keeping the reference for the duration of the call provides the necessary
  * protection for looking up a well-formed btf->kfunc_set_tab.
  */
-bool btf_kfunc_id_set_contains(const struct btf *btf,
+u32 *btf_kfunc_id_set_contains(const struct btf *btf,
 			       enum bpf_prog_type prog_type,
-			       enum btf_kfunc_type type, u32 kfunc_btf_id)
+			       u32 kfunc_btf_id)
 {
 	enum btf_kfunc_hook hook;
 
 	hook = bpf_prog_type_to_kfunc_hook(prog_type);
-	return __btf_kfunc_id_set_contains(btf, hook, type, kfunc_btf_id);
+	return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id);
 }
 
 /* This function must be invoked only from initcalls/module init functions */
@@ -7283,7 +7273,7 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
 		return PTR_ERR(btf);
 
 	hook = bpf_prog_type_to_kfunc_hook(prog_type);
-	ret = btf_populate_kfunc_set(btf, hook, kset);
+	ret = btf_populate_kfunc_set(btf, hook, kset->set);
 	btf_put(btf);
 	return ret;
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7c1e056624f9..096fdac70165 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7562,6 +7562,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	int err, insn_idx = *insn_idx_p;
 	const struct btf_param *args;
 	struct btf *desc_btf;
+	u32 *kfunc_flags;
 	bool acq;
 
 	/* skip for now, but return error when we find this in fixup_kfunc_call */
@@ -7577,18 +7578,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	func_name = btf_name_by_offset(desc_btf, func->name_off);
 	func_proto = btf_type_by_id(desc_btf, func->type);
 
-	if (!btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
-				      BTF_KFUNC_TYPE_CHECK, func_id)) {
+	kfunc_flags = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), func_id);
+	if (!kfunc_flags) {
 		verbose(env, "calling kernel function %s is not allowed\n",
 			func_name);
 		return -EACCES;
 	}
-
-	acq = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
-					BTF_KFUNC_TYPE_ACQUIRE, func_id);
+	acq = *kfunc_flags & KF_ACQUIRE;
 
 	/* Check the arguments */
-	err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs);
+	err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs, *kfunc_flags);
 	if (err < 0)
 		return err;
 	/* In case of release function, we get register number of refcounted
@@ -7632,8 +7631,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		regs[BPF_REG_0].btf = desc_btf;
 		regs[BPF_REG_0].type = PTR_TO_BTF_ID;
 		regs[BPF_REG_0].btf_id = ptr_type_id;
-		if (btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
-					      BTF_KFUNC_TYPE_RET_NULL, func_id)) {
+		if (*kfunc_flags & KF_RET_NULL) {
 			regs[BPF_REG_0].type |= PTR_MAYBE_NULL;
 			/* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */
 			regs[BPF_REG_0].id = ++env->id_gen;
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index dc9dc0bedca0..ca5b7234a350 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -695,48 +695,26 @@ __diag_pop();
 
 ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO);
 
-BTF_SET_START(test_sk_check_kfunc_ids)
-BTF_ID(func, bpf_kfunc_call_test1)
-BTF_ID(func, bpf_kfunc_call_test2)
-BTF_ID(func, bpf_kfunc_call_test3)
-BTF_ID(func, bpf_kfunc_call_test_acquire)
-BTF_ID(func, bpf_kfunc_call_memb_acquire)
-BTF_ID(func, bpf_kfunc_call_test_release)
-BTF_ID(func, bpf_kfunc_call_memb_release)
-BTF_ID(func, bpf_kfunc_call_memb1_release)
-BTF_ID(func, bpf_kfunc_call_test_kptr_get)
-BTF_ID(func, bpf_kfunc_call_test_pass_ctx)
-BTF_ID(func, bpf_kfunc_call_test_pass1)
-BTF_ID(func, bpf_kfunc_call_test_pass2)
-BTF_ID(func, bpf_kfunc_call_test_fail1)
-BTF_ID(func, bpf_kfunc_call_test_fail2)
-BTF_ID(func, bpf_kfunc_call_test_fail3)
-BTF_ID(func, bpf_kfunc_call_test_mem_len_pass1)
-BTF_ID(func, bpf_kfunc_call_test_mem_len_fail1)
-BTF_ID(func, bpf_kfunc_call_test_mem_len_fail2)
-BTF_SET_END(test_sk_check_kfunc_ids)
-
-BTF_SET_START(test_sk_acquire_kfunc_ids)
-BTF_ID(func, bpf_kfunc_call_test_acquire)
-BTF_ID(func, bpf_kfunc_call_memb_acquire)
-BTF_ID(func, bpf_kfunc_call_test_kptr_get)
-BTF_SET_END(test_sk_acquire_kfunc_ids)
-
-BTF_SET_START(test_sk_release_kfunc_ids)
-BTF_ID(func, bpf_kfunc_call_test_release)
-BTF_ID(func, bpf_kfunc_call_memb_release)
-BTF_ID(func, bpf_kfunc_call_memb1_release)
-BTF_SET_END(test_sk_release_kfunc_ids)
-
-BTF_SET_START(test_sk_ret_null_kfunc_ids)
-BTF_ID(func, bpf_kfunc_call_test_acquire)
-BTF_ID(func, bpf_kfunc_call_memb_acquire)
-BTF_ID(func, bpf_kfunc_call_test_kptr_get)
-BTF_SET_END(test_sk_ret_null_kfunc_ids)
-
-BTF_SET_START(test_sk_kptr_acquire_kfunc_ids)
-BTF_ID(func, bpf_kfunc_call_test_kptr_get)
-BTF_SET_END(test_sk_kptr_acquire_kfunc_ids)
+BTF_SET8_START(test_sk_check_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test1)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test2)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test3)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_acquire, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_kfunc_call_memb_acquire, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_kfunc_call_memb_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_kfunc_call_memb1_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_kptr_get, KF_ACQUIRE | KF_RET_NULL | KF_KPTR_GET)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass_ctx)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass1)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass2)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail1)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail2)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail3)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_pass1)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail1)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2)
+BTF_SET8_END(test_sk_check_kfunc_ids)
 
 static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size,
 			   u32 size, u32 headroom, u32 tailroom)
@@ -1620,12 +1598,8 @@ out:
 }
 
 static const struct btf_kfunc_id_set bpf_prog_test_kfunc_set = {
-	.owner        = THIS_MODULE,
-	.check_set        = &test_sk_check_kfunc_ids,
-	.acquire_set      = &test_sk_acquire_kfunc_ids,
-	.release_set      = &test_sk_release_kfunc_ids,
-	.ret_null_set     = &test_sk_ret_null_kfunc_ids,
-	.kptr_acquire_set = &test_sk_kptr_acquire_kfunc_ids
+	.owner = THIS_MODULE,
+	.set   = &test_sk_check_kfunc_ids,
 };
 
 BTF_ID_LIST(bpf_prog_test_dtor_kfunc_ids)
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 7a181631b995..85a9e500c42d 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -197,17 +197,17 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
 	}
 }
 
-BTF_SET_START(bpf_tcp_ca_check_kfunc_ids)
-BTF_ID(func, tcp_reno_ssthresh)
-BTF_ID(func, tcp_reno_cong_avoid)
-BTF_ID(func, tcp_reno_undo_cwnd)
-BTF_ID(func, tcp_slow_start)
-BTF_ID(func, tcp_cong_avoid_ai)
-BTF_SET_END(bpf_tcp_ca_check_kfunc_ids)
+BTF_SET8_START(bpf_tcp_ca_check_kfunc_ids)
+BTF_ID_FLAGS(func, tcp_reno_ssthresh)
+BTF_ID_FLAGS(func, tcp_reno_cong_avoid)
+BTF_ID_FLAGS(func, tcp_reno_undo_cwnd)
+BTF_ID_FLAGS(func, tcp_slow_start)
+BTF_ID_FLAGS(func, tcp_cong_avoid_ai)
+BTF_SET8_END(bpf_tcp_ca_check_kfunc_ids)
 
 static const struct btf_kfunc_id_set bpf_tcp_ca_kfunc_set = {
-	.owner     = THIS_MODULE,
-	.check_set = &bpf_tcp_ca_check_kfunc_ids,
+	.owner = THIS_MODULE,
+	.set   = &bpf_tcp_ca_check_kfunc_ids,
 };
 
 static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = {
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 075e744bfb48..54eec33c6e1c 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -1154,24 +1154,24 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
 	.set_state	= bbr_set_state,
 };
 
-BTF_SET_START(tcp_bbr_check_kfunc_ids)
+BTF_SET8_START(tcp_bbr_check_kfunc_ids)
 #ifdef CONFIG_X86
 #ifdef CONFIG_DYNAMIC_FTRACE
-BTF_ID(func, bbr_init)
-BTF_ID(func, bbr_main)
-BTF_ID(func, bbr_sndbuf_expand)
-BTF_ID(func, bbr_undo_cwnd)
-BTF_ID(func, bbr_cwnd_event)
-BTF_ID(func, bbr_ssthresh)
-BTF_ID(func, bbr_min_tso_segs)
-BTF_ID(func, bbr_set_state)
+BTF_ID_FLAGS(func, bbr_init)
+BTF_ID_FLAGS(func, bbr_main)
+BTF_ID_FLAGS(func, bbr_sndbuf_expand)
+BTF_ID_FLAGS(func, bbr_undo_cwnd)
+BTF_ID_FLAGS(func, bbr_cwnd_event)
+BTF_ID_FLAGS(func, bbr_ssthresh)
+BTF_ID_FLAGS(func, bbr_min_tso_segs)
+BTF_ID_FLAGS(func, bbr_set_state)
 #endif
 #endif
-BTF_SET_END(tcp_bbr_check_kfunc_ids)
+BTF_SET8_END(tcp_bbr_check_kfunc_ids)
 
 static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = {
-	.owner     = THIS_MODULE,
-	.check_set = &tcp_bbr_check_kfunc_ids,
+	.owner = THIS_MODULE,
+	.set   = &tcp_bbr_check_kfunc_ids,
 };
 
 static int __init bbr_register(void)
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 68178e7280ce..768c10c1f649 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -485,22 +485,22 @@ static struct tcp_congestion_ops cubictcp __read_mostly = {
 	.name		= "cubic",
 };
 
-BTF_SET_START(tcp_cubic_check_kfunc_ids)
+BTF_SET8_START(tcp_cubic_check_kfunc_ids)
 #ifdef CONFIG_X86
 #ifdef CONFIG_DYNAMIC_FTRACE
-BTF_ID(func, cubictcp_init)
-BTF_ID(func, cubictcp_recalc_ssthresh)
-BTF_ID(func, cubictcp_cong_avoid)
-BTF_ID(func, cubictcp_state)
-BTF_ID(func, cubictcp_cwnd_event)
-BTF_ID(func, cubictcp_acked)
+BTF_ID_FLAGS(func, cubictcp_init)
+BTF_ID_FLAGS(func, cubictcp_recalc_ssthresh)
+BTF_ID_FLAGS(func, cubictcp_cong_avoid)
+BTF_ID_FLAGS(func, cubictcp_state)
+BTF_ID_FLAGS(func, cubictcp_cwnd_event)
+BTF_ID_FLAGS(func, cubictcp_acked)
 #endif
 #endif
-BTF_SET_END(tcp_cubic_check_kfunc_ids)
+BTF_SET8_END(tcp_cubic_check_kfunc_ids)
 
 static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = {
-	.owner     = THIS_MODULE,
-	.check_set = &tcp_cubic_check_kfunc_ids,
+	.owner = THIS_MODULE,
+	.set   = &tcp_cubic_check_kfunc_ids,
 };
 
 static int __init cubictcp_register(void)
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index ab034a4e9324..2a6c0dd665a4 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -239,22 +239,22 @@ static struct tcp_congestion_ops dctcp_reno __read_mostly = {
 	.name		= "dctcp-reno",
 };
 
-BTF_SET_START(tcp_dctcp_check_kfunc_ids)
+BTF_SET8_START(tcp_dctcp_check_kfunc_ids)
 #ifdef CONFIG_X86
 #ifdef CONFIG_DYNAMIC_FTRACE
-BTF_ID(func, dctcp_init)
-BTF_ID(func, dctcp_update_alpha)
-BTF_ID(func, dctcp_cwnd_event)
-BTF_ID(func, dctcp_ssthresh)
-BTF_ID(func, dctcp_cwnd_undo)
-BTF_ID(func, dctcp_state)
+BTF_ID_FLAGS(func, dctcp_init)
+BTF_ID_FLAGS(func, dctcp_update_alpha)
+BTF_ID_FLAGS(func, dctcp_cwnd_event)
+BTF_ID_FLAGS(func, dctcp_ssthresh)
+BTF_ID_FLAGS(func, dctcp_cwnd_undo)
+BTF_ID_FLAGS(func, dctcp_state)
 #endif
 #endif
-BTF_SET_END(tcp_dctcp_check_kfunc_ids)
+BTF_SET8_END(tcp_dctcp_check_kfunc_ids)
 
 static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = {
-	.owner     = THIS_MODULE,
-	.check_set = &tcp_dctcp_check_kfunc_ids,
+	.owner = THIS_MODULE,
+	.set   = &tcp_dctcp_check_kfunc_ids,
 };
 
 static int __init dctcp_register(void)
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index bc4d5cd63a94..cf2096f65d0e 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -219,48 +219,21 @@ void bpf_ct_release(struct nf_conn *nfct)
 
 __diag_pop()
 
-BTF_SET_START(nf_ct_xdp_check_kfunc_ids)
-BTF_ID(func, bpf_xdp_ct_lookup)
-BTF_ID(func, bpf_ct_release)
-BTF_SET_END(nf_ct_xdp_check_kfunc_ids)
-
-BTF_SET_START(nf_ct_tc_check_kfunc_ids)
-BTF_ID(func, bpf_skb_ct_lookup)
-BTF_ID(func, bpf_ct_release)
-BTF_SET_END(nf_ct_tc_check_kfunc_ids)
-
-BTF_SET_START(nf_ct_acquire_kfunc_ids)
-BTF_ID(func, bpf_xdp_ct_lookup)
-BTF_ID(func, bpf_skb_ct_lookup)
-BTF_SET_END(nf_ct_acquire_kfunc_ids)
-
-BTF_SET_START(nf_ct_release_kfunc_ids)
-BTF_ID(func, bpf_ct_release)
-BTF_SET_END(nf_ct_release_kfunc_ids)
-
-/* Both sets are identical */
-#define nf_ct_ret_null_kfunc_ids nf_ct_acquire_kfunc_ids
-
-static const struct btf_kfunc_id_set nf_conntrack_xdp_kfunc_set = {
-	.owner        = THIS_MODULE,
-	.check_set    = &nf_ct_xdp_check_kfunc_ids,
-	.acquire_set  = &nf_ct_acquire_kfunc_ids,
-	.release_set  = &nf_ct_release_kfunc_ids,
-	.ret_null_set = &nf_ct_ret_null_kfunc_ids,
-};
-
-static const struct btf_kfunc_id_set nf_conntrack_tc_kfunc_set = {
-	.owner        = THIS_MODULE,
-	.check_set    = &nf_ct_tc_check_kfunc_ids,
-	.acquire_set  = &nf_ct_acquire_kfunc_ids,
-	.release_set  = &nf_ct_release_kfunc_ids,
-	.ret_null_set = &nf_ct_ret_null_kfunc_ids,
+BTF_SET8_START(nf_ct_kfunc_set)
+BTF_ID_FLAGS(func, bpf_xdp_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_skb_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE)
+BTF_SET8_END(nf_ct_kfunc_set)
+
+static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &nf_ct_kfunc_set,
 };
 
 int register_nf_conntrack_bpf(void)
 {
 	int ret;
 
-	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &nf_conntrack_xdp_kfunc_set);
-	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_tc_kfunc_set);
+	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &nf_conntrack_kfunc_set);
+	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_kfunc_set);
 }
diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
index e585e1cefc77..792cb15bac40 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -148,13 +148,13 @@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = {
 	.write = bpf_testmod_test_write,
 };
 
-BTF_SET_START(bpf_testmod_check_kfunc_ids)
-BTF_ID(func, bpf_testmod_test_mod_kfunc)
-BTF_SET_END(bpf_testmod_check_kfunc_ids)
+BTF_SET8_START(bpf_testmod_check_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc)
+BTF_SET8_END(bpf_testmod_check_kfunc_ids)
 
 static const struct btf_kfunc_id_set bpf_testmod_kfunc_set = {
-	.owner     = THIS_MODULE,
-	.check_set = &bpf_testmod_check_kfunc_ids,
+	.owner = THIS_MODULE,
+	.set   = &bpf_testmod_check_kfunc_ids,
 };
 
 extern int bpf_fentry_test1(int a);
-- 
cgit 


From 56e948ffc098a780fefb6c1784a3a2c7b81100a1 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Thu, 21 Jul 2022 15:42:36 +0200
Subject: bpf: Add support for forcing kfunc args to be trusted

Teach the verifier to detect a new KF_TRUSTED_ARGS kfunc flag, which
means each pointer argument must be trusted, which we define as a
pointer that is referenced (has non-zero ref_obj_id) and also needs to
have its offset unchanged, similar to how release functions expect their
argument. This allows a kfunc to receive pointer arguments unchanged
from the result of the acquire kfunc.

This is required to ensure that kfunc that operate on some object only
work on acquired pointers and not normal PTR_TO_BTF_ID with same type
which can be obtained by pointer walking. The restrictions applied to
release arguments also apply to trusted arguments. This implies that
strict type matching (not deducing type by recursively following members
at offset) and OBJ_RELEASE offset checks (ensuring they are zero) are
used for trusted pointer arguments.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220721134245.2450-5-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf.h | 32 ++++++++++++++++++++++++++++++++
 kernel/bpf/btf.c    | 17 ++++++++++++++---
 net/bpf/test_run.c  |  5 +++++
 3 files changed, 51 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 6dfc6eaf7f8c..cdb376d53238 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -17,6 +17,38 @@
 #define KF_RELEASE	(1 << 1) /* kfunc is a release function */
 #define KF_RET_NULL	(1 << 2) /* kfunc returns a pointer that may be NULL */
 #define KF_KPTR_GET	(1 << 3) /* kfunc returns reference to a kptr */
+/* Trusted arguments are those which are meant to be referenced arguments with
+ * unchanged offset. It is used to enforce that pointers obtained from acquire
+ * kfuncs remain unmodified when being passed to helpers taking trusted args.
+ *
+ * Consider
+ *	struct foo {
+ *		int data;
+ *		struct foo *next;
+ *	};
+ *
+ *	struct bar {
+ *		int data;
+ *		struct foo f;
+ *	};
+ *
+ *	struct foo *f = alloc_foo(); // Acquire kfunc
+ *	struct bar *b = alloc_bar(); // Acquire kfunc
+ *
+ * If a kfunc set_foo_data() wants to operate only on the allocated object, it
+ * will set the KF_TRUSTED_ARGS flag, which will prevent unsafe usage like:
+ *
+ *	set_foo_data(f, 42);	   // Allowed
+ *	set_foo_data(f->next, 42); // Rejected, non-referenced pointer
+ *	set_foo_data(&f->next, 42);// Rejected, referenced, but wrong type
+ *	set_foo_data(&b->f, 42);   // Rejected, referenced, but bad offset
+ *
+ * In the final case, usually for the purposes of type matching, it is deduced
+ * by looking at the type of the member at the offset, but due to the
+ * requirement of trusted argument, this deduction will be strict and not done
+ * for this case.
+ */
+#define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */
 
 struct btf;
 struct btf_member;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 4d9c2d88720f..7ac971ea98d1 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6174,10 +6174,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 				    u32 kfunc_flags)
 {
 	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+	bool rel = false, kptr_get = false, trusted_arg = false;
 	struct bpf_verifier_log *log = &env->log;
 	u32 i, nargs, ref_id, ref_obj_id = 0;
 	bool is_kfunc = btf_is_kernel(btf);
-	bool rel = false, kptr_get = false;
 	const char *func_name, *ref_tname;
 	const struct btf_type *t, *ref_t;
 	const struct btf_param *args;
@@ -6211,6 +6211,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 		/* Only kfunc can be release func */
 		rel = kfunc_flags & KF_RELEASE;
 		kptr_get = kfunc_flags & KF_KPTR_GET;
+		trusted_arg = kfunc_flags & KF_TRUSTED_ARGS;
 	}
 
 	/* check that BTF function arguments match actual types that the
@@ -6235,10 +6236,19 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 			return -EINVAL;
 		}
 
+		/* Check if argument must be a referenced pointer, args + i has
+		 * been verified to be a pointer (after skipping modifiers).
+		 */
+		if (is_kfunc && trusted_arg && !reg->ref_obj_id) {
+			bpf_log(log, "R%d must be referenced\n", regno);
+			return -EINVAL;
+		}
+
 		ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
 		ref_tname = btf_name_by_offset(btf, ref_t->name_off);
 
-		if (rel && reg->ref_obj_id)
+		/* Trusted args have the same offset checks as release arguments */
+		if (trusted_arg || (rel && reg->ref_obj_id))
 			arg_type |= OBJ_RELEASE;
 		ret = check_func_arg_reg_off(env, reg, regno, arg_type);
 		if (ret < 0)
@@ -6336,7 +6346,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 			reg_ref_tname = btf_name_by_offset(reg_btf,
 							   reg_ref_t->name_off);
 			if (!btf_struct_ids_match(log, reg_btf, reg_ref_id,
-						  reg->off, btf, ref_id, rel && reg->ref_obj_id)) {
+						  reg->off, btf, ref_id,
+						  trusted_arg || (rel && reg->ref_obj_id))) {
 				bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
 					func_name, i,
 					btf_type_str(ref_t), ref_tname,
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index ca5b7234a350..cbc9cd5058cb 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -691,6 +691,10 @@ noinline void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len)
 {
 }
 
+noinline void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p)
+{
+}
+
 __diag_pop();
 
 ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO);
@@ -714,6 +718,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail3)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_pass1)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail1)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS)
 BTF_SET8_END(test_sk_check_kfunc_ids)
 
 static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size,
-- 
cgit 


From d7e79c97c00ca82dace0e3b645d4b3b02fa273c2 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Thu, 21 Jul 2022 15:42:39 +0200
Subject: net: netfilter: Add kfuncs to allocate and insert CT

Introduce bpf_xdp_ct_alloc, bpf_skb_ct_alloc and bpf_ct_insert_entry
kfuncs in order to insert a new entry from XDP and TC programs.
Introduce bpf_nf_ct_tuple_parse utility routine to consolidate common
code.

We extract out a helper __nf_ct_set_timeout, used by the ctnetlink and
nf_conntrack_bpf code, extract it out to nf_conntrack_core, so that
nf_conntrack_bpf doesn't need a dependency on CONFIG_NF_CT_NETLINK.
Later this helper will be reused as a helper to set timeout of allocated
but not yet inserted CT entry.

The allocation functions return struct nf_conn___init instead of
nf_conn, to distinguish allocated CT from an already inserted or looked
up CT. This is later used to enforce restrictions on what kfuncs
allocated CT can be used with.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Co-developed-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220721134245.2450-8-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/netfilter/nf_conntrack_core.h |  15 +++
 net/netfilter/nf_conntrack_bpf.c          | 208 +++++++++++++++++++++++++++---
 net/netfilter/nf_conntrack_netlink.c      |   8 +-
 3 files changed, 204 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 37866c8386e2..83a60c684e6c 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -84,4 +84,19 @@ void nf_conntrack_lock(spinlock_t *lock);
 
 extern spinlock_t nf_conntrack_expect_lock;
 
+/* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */
+
+#if (IS_BUILTIN(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
+    (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) || \
+    IS_ENABLED(CONFIG_NF_CT_NETLINK))
+
+static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout)
+{
+	if (timeout > INT_MAX)
+		timeout = INT_MAX;
+	WRITE_ONCE(ct->timeout, nfct_time_stamp + (u32)timeout);
+}
+
+#endif
+
 #endif /* _NF_CONNTRACK_CORE_H */
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index 16304869264f..cac4a9558968 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -55,6 +55,94 @@ enum {
 	NF_BPF_CT_OPTS_SZ = 12,
 };
 
+static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple,
+				 u32 tuple_len, u8 protonum, u8 dir,
+				 struct nf_conntrack_tuple *tuple)
+{
+	union nf_inet_addr *src = dir ? &tuple->dst.u3 : &tuple->src.u3;
+	union nf_inet_addr *dst = dir ? &tuple->src.u3 : &tuple->dst.u3;
+	union nf_conntrack_man_proto *sport = dir ? (void *)&tuple->dst.u
+						  : &tuple->src.u;
+	union nf_conntrack_man_proto *dport = dir ? &tuple->src.u
+						  : (void *)&tuple->dst.u;
+
+	if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP))
+		return -EPROTO;
+
+	memset(tuple, 0, sizeof(*tuple));
+
+	switch (tuple_len) {
+	case sizeof(bpf_tuple->ipv4):
+		tuple->src.l3num = AF_INET;
+		src->ip = bpf_tuple->ipv4.saddr;
+		sport->tcp.port = bpf_tuple->ipv4.sport;
+		dst->ip = bpf_tuple->ipv4.daddr;
+		dport->tcp.port = bpf_tuple->ipv4.dport;
+		break;
+	case sizeof(bpf_tuple->ipv6):
+		tuple->src.l3num = AF_INET6;
+		memcpy(src->ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
+		sport->tcp.port = bpf_tuple->ipv6.sport;
+		memcpy(dst->ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
+		dport->tcp.port = bpf_tuple->ipv6.dport;
+		break;
+	default:
+		return -EAFNOSUPPORT;
+	}
+	tuple->dst.protonum = protonum;
+	tuple->dst.dir = dir;
+
+	return 0;
+}
+
+static struct nf_conn *
+__bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
+			u32 tuple_len, struct bpf_ct_opts *opts, u32 opts_len,
+			u32 timeout)
+{
+	struct nf_conntrack_tuple otuple, rtuple;
+	struct nf_conn *ct;
+	int err;
+
+	if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
+	    opts_len != NF_BPF_CT_OPTS_SZ)
+		return ERR_PTR(-EINVAL);
+
+	if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
+		return ERR_PTR(-EINVAL);
+
+	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
+				    IP_CT_DIR_ORIGINAL, &otuple);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
+				    IP_CT_DIR_REPLY, &rtuple);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	if (opts->netns_id >= 0) {
+		net = get_net_ns_by_id(net, opts->netns_id);
+		if (unlikely(!net))
+			return ERR_PTR(-ENONET);
+	}
+
+	ct = nf_conntrack_alloc(net, &nf_ct_zone_dflt, &otuple, &rtuple,
+				GFP_ATOMIC);
+	if (IS_ERR(ct))
+		goto out;
+
+	memset(&ct->proto, 0, sizeof(ct->proto));
+	__nf_ct_set_timeout(ct, timeout * HZ);
+	ct->status |= IPS_CONFIRMED;
+
+out:
+	if (opts->netns_id >= 0)
+		put_net(net);
+
+	return ct;
+}
+
 static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
 					  struct bpf_sock_tuple *bpf_tuple,
 					  u32 tuple_len, struct bpf_ct_opts *opts,
@@ -63,6 +151,7 @@ static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
 	struct nf_conntrack_tuple_hash *hash;
 	struct nf_conntrack_tuple tuple;
 	struct nf_conn *ct;
+	int err;
 
 	if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
 	    opts_len != NF_BPF_CT_OPTS_SZ)
@@ -72,27 +161,10 @@ static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
 	if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
 		return ERR_PTR(-EINVAL);
 
-	memset(&tuple, 0, sizeof(tuple));
-	switch (tuple_len) {
-	case sizeof(bpf_tuple->ipv4):
-		tuple.src.l3num = AF_INET;
-		tuple.src.u3.ip = bpf_tuple->ipv4.saddr;
-		tuple.src.u.tcp.port = bpf_tuple->ipv4.sport;
-		tuple.dst.u3.ip = bpf_tuple->ipv4.daddr;
-		tuple.dst.u.tcp.port = bpf_tuple->ipv4.dport;
-		break;
-	case sizeof(bpf_tuple->ipv6):
-		tuple.src.l3num = AF_INET6;
-		memcpy(tuple.src.u3.ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
-		tuple.src.u.tcp.port = bpf_tuple->ipv6.sport;
-		memcpy(tuple.dst.u3.ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
-		tuple.dst.u.tcp.port = bpf_tuple->ipv6.dport;
-		break;
-	default:
-		return ERR_PTR(-EAFNOSUPPORT);
-	}
-
-	tuple.dst.protonum = opts->l4proto;
+	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
+				    IP_CT_DIR_ORIGINAL, &tuple);
+	if (err < 0)
+		return ERR_PTR(err);
 
 	if (opts->netns_id >= 0) {
 		net = get_net_ns_by_id(net, opts->netns_id);
@@ -116,6 +188,43 @@ __diag_push();
 __diag_ignore_all("-Wmissing-prototypes",
 		  "Global functions as their definitions will be in nf_conntrack BTF");
 
+struct nf_conn___init {
+	struct nf_conn ct;
+};
+
+/* bpf_xdp_ct_alloc - Allocate a new CT entry
+ *
+ * Parameters:
+ * @xdp_ctx	- Pointer to ctx (xdp_md) in XDP program
+ *		    Cannot be NULL
+ * @bpf_tuple	- Pointer to memory representing the tuple to look up
+ *		    Cannot be NULL
+ * @tuple__sz	- Length of the tuple structure
+ *		    Must be one of sizeof(bpf_tuple->ipv4) or
+ *		    sizeof(bpf_tuple->ipv6)
+ * @opts	- Additional options for allocation (documented above)
+ *		    Cannot be NULL
+ * @opts__sz	- Length of the bpf_ct_opts structure
+ *		    Must be NF_BPF_CT_OPTS_SZ (12)
+ */
+struct nf_conn___init *
+bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
+		 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+	struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
+	struct nf_conn *nfct;
+
+	nfct = __bpf_nf_ct_alloc_entry(dev_net(ctx->rxq->dev), bpf_tuple, tuple__sz,
+				       opts, opts__sz, 10);
+	if (IS_ERR(nfct)) {
+		if (opts)
+			opts->error = PTR_ERR(nfct);
+		return NULL;
+	}
+
+	return (struct nf_conn___init *)nfct;
+}
+
 /* bpf_xdp_ct_lookup - Lookup CT entry for the given tuple, and acquire a
  *		       reference to it
  *
@@ -150,6 +259,40 @@ bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
 	return nfct;
 }
 
+/* bpf_skb_ct_alloc - Allocate a new CT entry
+ *
+ * Parameters:
+ * @skb_ctx	- Pointer to ctx (__sk_buff) in TC program
+ *		    Cannot be NULL
+ * @bpf_tuple	- Pointer to memory representing the tuple to look up
+ *		    Cannot be NULL
+ * @tuple__sz	- Length of the tuple structure
+ *		    Must be one of sizeof(bpf_tuple->ipv4) or
+ *		    sizeof(bpf_tuple->ipv6)
+ * @opts	- Additional options for allocation (documented above)
+ *		    Cannot be NULL
+ * @opts__sz	- Length of the bpf_ct_opts structure
+ *		    Must be NF_BPF_CT_OPTS_SZ (12)
+ */
+struct nf_conn___init *
+bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
+		 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+	struct nf_conn *nfct;
+	struct net *net;
+
+	net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
+	nfct = __bpf_nf_ct_alloc_entry(net, bpf_tuple, tuple__sz, opts, opts__sz, 10);
+	if (IS_ERR(nfct)) {
+		if (opts)
+			opts->error = PTR_ERR(nfct);
+		return NULL;
+	}
+
+	return (struct nf_conn___init *)nfct;
+}
+
 /* bpf_skb_ct_lookup - Lookup CT entry for the given tuple, and acquire a
  *		       reference to it
  *
@@ -184,6 +327,26 @@ bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
 	return nfct;
 }
 
+/* bpf_ct_insert_entry - Add the provided entry into a CT map
+ *
+ * This must be invoked for referenced PTR_TO_BTF_ID.
+ *
+ * @nfct__ref	 - Pointer to referenced nf_conn___init object, obtained
+ *		   using bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ */
+struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct__ref)
+{
+	struct nf_conn *nfct = (struct nf_conn *)nfct__ref;
+	int err;
+
+	err = nf_conntrack_hash_check_insert(nfct);
+	if (err < 0) {
+		nf_conntrack_free(nfct);
+		return NULL;
+	}
+	return nfct;
+}
+
 /* bpf_ct_release - Release acquired nf_conn object
  *
  * This must be invoked for referenced PTR_TO_BTF_ID, and the verifier rejects
@@ -204,8 +367,11 @@ void bpf_ct_release(struct nf_conn *nfct)
 __diag_pop()
 
 BTF_SET8_START(nf_ct_kfunc_set)
+BTF_ID_FLAGS(func, bpf_xdp_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_xdp_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_skb_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_ct_insert_entry, KF_ACQUIRE | KF_RET_NULL | KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE)
 BTF_SET8_END(nf_ct_kfunc_set)
 
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 722af5e309ba..0729b2f0d44f 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2025,9 +2025,7 @@ static int ctnetlink_change_timeout(struct nf_conn *ct,
 {
 	u64 timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
 
-	if (timeout > INT_MAX)
-		timeout = INT_MAX;
-	WRITE_ONCE(ct->timeout, nfct_time_stamp + (u32)timeout);
+	__nf_ct_set_timeout(ct, timeout);
 
 	if (test_bit(IPS_DYING_BIT, &ct->status))
 		return -ETIME;
@@ -2292,9 +2290,7 @@ ctnetlink_create_conntrack(struct net *net,
 		goto err1;
 
 	timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
-	if (timeout > INT_MAX)
-		timeout = INT_MAX;
-	ct->timeout = (u32)timeout + nfct_time_stamp;
+	__nf_ct_set_timeout(ct, timeout);
 
 	rcu_read_lock();
  	if (cda[CTA_HELP]) {
-- 
cgit 


From 0b3892364431684e883682b85d008979e08d4ce6 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Thu, 21 Jul 2022 15:42:40 +0200
Subject: net: netfilter: Add kfuncs to set and change CT timeout

Introduce bpf_ct_set_timeout and bpf_ct_change_timeout kfunc helpers in
order to change nf_conn timeout. This is same as ctnetlink_change_timeout,
hence code is shared between both by extracting it out to
__nf_ct_change_timeout. It is also updated to return an error when it
sees IPS_FIXED_TIMEOUT_BIT bit in ct->status, as that check was missing.

It is required to introduce two kfuncs taking nf_conn___init and nf_conn
instead of sharing one because KF_TRUSTED_ARGS flag causes strict type
checking. This would disallow passing nf_conn___init to kfunc taking
nf_conn, and vice versa. We cannot remove the KF_TRUSTED_ARGS flag as we
only want to accept refcounted pointers and not e.g. ct->master.

Apart from this, bpf_ct_set_timeout is only called for newly allocated
CT so it doesn't need to inspect the status field just yet. Sharing the
helpers even if it was possible would make timeout setting helper
sensitive to order of setting status and timeout after allocation.

Hence, bpf_ct_set_* kfuncs are meant to be used on allocated CT, and
bpf_ct_change_* kfuncs are meant to be used on inserted or looked up
CT entry.

Co-developed-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220721134245.2450-9-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/netfilter/nf_conntrack_core.h |  2 ++
 net/netfilter/nf_conntrack_bpf.c          | 38 ++++++++++++++++++++++++++++---
 net/netfilter/nf_conntrack_core.c         | 22 ++++++++++++++++++
 net/netfilter/nf_conntrack_netlink.c      |  9 +-------
 4 files changed, 60 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 83a60c684e6c..3b0f7d0eebae 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -97,6 +97,8 @@ static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout)
 	WRITE_ONCE(ct->timeout, nfct_time_stamp + (u32)timeout);
 }
 
+int __nf_ct_change_timeout(struct nf_conn *ct, u64 cta_timeout);
+
 #endif
 
 #endif /* _NF_CONNTRACK_CORE_H */
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index cac4a9558968..b8912e15082f 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -331,12 +331,12 @@ bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
  *
  * This must be invoked for referenced PTR_TO_BTF_ID.
  *
- * @nfct__ref	 - Pointer to referenced nf_conn___init object, obtained
+ * @nfct	 - Pointer to referenced nf_conn___init object, obtained
  *		   using bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
  */
-struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct__ref)
+struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i)
 {
-	struct nf_conn *nfct = (struct nf_conn *)nfct__ref;
+	struct nf_conn *nfct = (struct nf_conn *)nfct_i;
 	int err;
 
 	err = nf_conntrack_hash_check_insert(nfct);
@@ -364,6 +364,36 @@ void bpf_ct_release(struct nf_conn *nfct)
 	nf_ct_put(nfct);
 }
 
+/* bpf_ct_set_timeout - Set timeout of allocated nf_conn
+ *
+ * Sets the default timeout of newly allocated nf_conn before insertion.
+ * This helper must be invoked for refcounted pointer to nf_conn___init.
+ *
+ * Parameters:
+ * @nfct	 - Pointer to referenced nf_conn object, obtained using
+ *                 bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ * @timeout      - Timeout in msecs.
+ */
+void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout)
+{
+	__nf_ct_set_timeout((struct nf_conn *)nfct, msecs_to_jiffies(timeout));
+}
+
+/* bpf_ct_change_timeout - Change timeout of inserted nf_conn
+ *
+ * Change timeout associated of the inserted or looked up nf_conn.
+ * This helper must be invoked for refcounted pointer to nf_conn.
+ *
+ * Parameters:
+ * @nfct	 - Pointer to referenced nf_conn object, obtained using
+ *		   bpf_ct_insert_entry, bpf_xdp_ct_lookup, or bpf_skb_ct_lookup.
+ * @timeout      - New timeout in msecs.
+ */
+int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout)
+{
+	return __nf_ct_change_timeout(nfct, msecs_to_jiffies(timeout));
+}
+
 __diag_pop()
 
 BTF_SET8_START(nf_ct_kfunc_set)
@@ -373,6 +403,8 @@ BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_skb_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_ct_insert_entry, KF_ACQUIRE | KF_RET_NULL | KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS)
 BTF_SET8_END(nf_ct_kfunc_set)
 
 static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = {
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 082a2fd8d85b..572f59a5e936 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -2786,3 +2786,25 @@ err_expect:
 	free_percpu(net->ct.stat);
 	return ret;
 }
+
+#if (IS_BUILTIN(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
+    (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) || \
+    IS_ENABLED(CONFIG_NF_CT_NETLINK))
+
+/* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */
+
+int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout)
+{
+	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
+		return -EPERM;
+
+	__nf_ct_set_timeout(ct, timeout);
+
+	if (test_bit(IPS_DYING_BIT, &ct->status))
+		return -ETIME;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__nf_ct_change_timeout);
+
+#endif
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 0729b2f0d44f..b1de07c73845 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2023,14 +2023,7 @@ static int ctnetlink_change_helper(struct nf_conn *ct,
 static int ctnetlink_change_timeout(struct nf_conn *ct,
 				    const struct nlattr * const cda[])
 {
-	u64 timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
-
-	__nf_ct_set_timeout(ct, timeout);
-
-	if (test_bit(IPS_DYING_BIT, &ct->status))
-		return -ETIME;
-
-	return 0;
+	return __nf_ct_change_timeout(ct, (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ);
 }
 
 #if defined(CONFIG_NF_CONNTRACK_MARK)
-- 
cgit 


From ef69aa3a986ef94f01ce8b5b619f550db54432fe Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Thu, 21 Jul 2022 15:42:41 +0200
Subject: net: netfilter: Add kfuncs to set and change CT status

Introduce bpf_ct_set_status and bpf_ct_change_status kfunc helpers in
order to set nf_conn field of allocated entry or update nf_conn status
field of existing inserted entry. Use nf_ct_change_status_common to
share the permitted status field changes between netlink and BPF side
by refactoring ctnetlink_change_status.

It is required to introduce two kfuncs taking nf_conn___init and nf_conn
instead of sharing one because KF_TRUSTED_ARGS flag causes strict type
checking. This would disallow passing nf_conn___init to kfunc taking
nf_conn, and vice versa. We cannot remove the KF_TRUSTED_ARGS flag as we
only want to accept refcounted pointers and not e.g. ct->master.

Hence, bpf_ct_set_* kfuncs are meant to be used on allocated CT, and
bpf_ct_change_* kfuncs are meant to be used on inserted or looked up
CT entry.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Co-developed-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220721134245.2450-10-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/netfilter/nf_conntrack_core.h |  2 ++
 net/netfilter/nf_conntrack_bpf.c          | 32 +++++++++++++++++++++++++
 net/netfilter/nf_conntrack_core.c         | 40 +++++++++++++++++++++++++++++++
 net/netfilter/nf_conntrack_netlink.c      | 39 ++----------------------------
 4 files changed, 76 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 3b0f7d0eebae..3cd3a6e631aa 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -98,6 +98,8 @@ static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout)
 }
 
 int __nf_ct_change_timeout(struct nf_conn *ct, u64 cta_timeout);
+void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off);
+int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status);
 
 #endif
 
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index b8912e15082f..1cd87b28c9b0 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -394,6 +394,36 @@ int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout)
 	return __nf_ct_change_timeout(nfct, msecs_to_jiffies(timeout));
 }
 
+/* bpf_ct_set_status - Set status field of allocated nf_conn
+ *
+ * Set the status field of the newly allocated nf_conn before insertion.
+ * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn___init.
+ *
+ * Parameters:
+ * @nfct	 - Pointer to referenced nf_conn object, obtained using
+ *		   bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ * @status       - New status value.
+ */
+int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status)
+{
+	return nf_ct_change_status_common((struct nf_conn *)nfct, status);
+}
+
+/* bpf_ct_change_status - Change status of inserted nf_conn
+ *
+ * Change the status field of the provided connection tracking entry.
+ * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn.
+ *
+ * Parameters:
+ * @nfct	 - Pointer to referenced nf_conn object, obtained using
+ *		   bpf_ct_insert_entry, bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
+ * @status       - New status value.
+ */
+int bpf_ct_change_status(struct nf_conn *nfct, u32 status)
+{
+	return nf_ct_change_status_common(nfct, status);
+}
+
 __diag_pop()
 
 BTF_SET8_START(nf_ct_kfunc_set)
@@ -405,6 +435,8 @@ BTF_ID_FLAGS(func, bpf_ct_insert_entry, KF_ACQUIRE | KF_RET_NULL | KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS)
 BTF_SET8_END(nf_ct_kfunc_set)
 
 static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = {
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 572f59a5e936..66a0aa8dbc3b 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -2807,4 +2807,44 @@ int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout)
 }
 EXPORT_SYMBOL_GPL(__nf_ct_change_timeout);
 
+void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off)
+{
+	unsigned int bit;
+
+	/* Ignore these unchangable bits */
+	on &= ~IPS_UNCHANGEABLE_MASK;
+	off &= ~IPS_UNCHANGEABLE_MASK;
+
+	for (bit = 0; bit < __IPS_MAX_BIT; bit++) {
+		if (on & (1 << bit))
+			set_bit(bit, &ct->status);
+		else if (off & (1 << bit))
+			clear_bit(bit, &ct->status);
+	}
+}
+EXPORT_SYMBOL_GPL(__nf_ct_change_status);
+
+int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status)
+{
+	unsigned long d;
+
+	d = ct->status ^ status;
+
+	if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
+		/* unchangeable */
+		return -EBUSY;
+
+	if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
+		/* SEEN_REPLY bit can only be set */
+		return -EBUSY;
+
+	if (d & IPS_ASSURED && !(status & IPS_ASSURED))
+		/* ASSURED bit can only be set */
+		return -EBUSY;
+
+	__nf_ct_change_status(ct, status, 0);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_change_status_common);
+
 #endif
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index b1de07c73845..e02832ef9b9f 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1890,45 +1890,10 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
 }
 #endif
 
-static void
-__ctnetlink_change_status(struct nf_conn *ct, unsigned long on,
-			  unsigned long off)
-{
-	unsigned int bit;
-
-	/* Ignore these unchangable bits */
-	on &= ~IPS_UNCHANGEABLE_MASK;
-	off &= ~IPS_UNCHANGEABLE_MASK;
-
-	for (bit = 0; bit < __IPS_MAX_BIT; bit++) {
-		if (on & (1 << bit))
-			set_bit(bit, &ct->status);
-		else if (off & (1 << bit))
-			clear_bit(bit, &ct->status);
-	}
-}
-
 static int
 ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[])
 {
-	unsigned long d;
-	unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS]));
-	d = ct->status ^ status;
-
-	if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
-		/* unchangeable */
-		return -EBUSY;
-
-	if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
-		/* SEEN_REPLY bit can only be set */
-		return -EBUSY;
-
-	if (d & IPS_ASSURED && !(status & IPS_ASSURED))
-		/* ASSURED bit can only be set */
-		return -EBUSY;
-
-	__ctnetlink_change_status(ct, status, 0);
-	return 0;
+	return nf_ct_change_status_common(ct, ntohl(nla_get_be32(cda[CTA_STATUS])));
 }
 
 static int
@@ -2825,7 +2790,7 @@ ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[])
 	 * unchangeable bits but do not error out. Also user programs
 	 * are allowed to clear the bits that they are allowed to change.
 	 */
-	__ctnetlink_change_status(ct, status, ~status);
+	__nf_ct_change_status(ct, status, ~status);
 	return 0;
 }
 
-- 
cgit 


From 36eeee75ef0157e42fb6593dcc65daab289b559e Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Wed, 20 Jul 2022 09:50:14 -0700
Subject: tcp: Fix a data-race around sysctl_tcp_adv_win_scale.

While reading sysctl_tcp_adv_win_scale, it can be changed concurrently.
Thus, we need to add READ_ONCE() to its reader.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 071735e10872..78a64e1b33a7 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1419,7 +1419,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space,
 
 static inline int tcp_win_from_space(const struct sock *sk, int space)
 {
-	int tcp_adv_win_scale = sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale;
+	int tcp_adv_win_scale = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale);
 
 	return tcp_adv_win_scale <= 0 ?
 		(space>>(-tcp_adv_win_scale)) :
-- 
cgit 


From 949d6b405e6160ae44baea39192d67b39cb7eeac Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 20 Jul 2022 16:57:58 -0700
Subject: net: add missing includes and forward declarations under net/

This patch adds missing includes to headers under include/net.
All these problems are currently masked by the existing users
including the missing dependency before the broken header.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/lapb.h                |  5 +++++
 include/net/af_vsock.h              |  1 +
 include/net/amt.h                   |  3 +++
 include/net/ax88796.h               |  2 ++
 include/net/bond_options.h          |  8 ++++++++
 include/net/codel_qdisc.h           |  1 +
 include/net/datalink.h              |  7 +++++++
 include/net/dcbevent.h              |  2 ++
 include/net/dcbnl.h                 |  2 ++
 include/net/dn_dev.h                |  1 +
 include/net/dn_fib.h                |  2 ++
 include/net/dn_neigh.h              |  2 ++
 include/net/dn_nsp.h                |  6 ++++++
 include/net/dn_route.h              |  3 +++
 include/net/erspan.h                |  3 +++
 include/net/esp.h                   |  1 +
 include/net/ethoc.h                 |  3 +++
 include/net/firewire.h              |  2 ++
 include/net/fq.h                    |  4 ++++
 include/net/garp.h                  |  2 ++
 include/net/gtp.h                   |  4 ++++
 include/net/gue.h                   |  3 +++
 include/net/hwbm.h                  |  2 ++
 include/net/ila.h                   |  2 ++
 include/net/inet6_connection_sock.h |  2 ++
 include/net/inet_common.h           |  6 ++++++
 include/net/inet_frag.h             |  3 +++
 include/net/ip6_route.h             | 20 ++++++++++----------
 include/net/ipcomp.h                |  2 ++
 include/net/ipconfig.h              |  2 ++
 include/net/llc_c_ac.h              |  7 +++++++
 include/net/llc_c_st.h              |  4 ++++
 include/net/llc_s_ac.h              |  4 ++++
 include/net/llc_s_ev.h              |  1 +
 include/net/mpls_iptunnel.h         |  3 +++
 include/net/mrp.h                   |  4 ++++
 include/net/ncsi.h                  |  2 ++
 include/net/netevent.h              |  1 +
 include/net/netns/can.h             |  1 +
 include/net/netns/core.h            |  2 ++
 include/net/netns/generic.h         |  1 +
 include/net/netns/ipv4.h            |  1 +
 include/net/netns/mctp.h            |  1 +
 include/net/netns/mpls.h            |  2 ++
 include/net/netns/nexthop.h         |  1 +
 include/net/netns/sctp.h            |  3 +++
 include/net/netns/unix.h            |  2 ++
 include/net/netrom.h                |  1 +
 include/net/p8022.h                 |  5 +++++
 include/net/phonet/pep.h            |  3 +++
 include/net/phonet/phonet.h         |  4 ++++
 include/net/phonet/pn_dev.h         |  5 +++++
 include/net/pptp.h                  |  3 +++
 include/net/psnap.h                 |  5 +++++
 include/net/regulatory.h            |  3 +++
 include/net/rose.h                  |  1 +
 include/net/secure_seq.h            |  2 ++
 include/net/smc.h                   |  7 +++++++
 include/net/stp.h                   |  2 ++
 include/net/transp_v6.h             |  2 ++
 include/net/tun_proto.h             |  3 ++-
 include/net/udplite.h               |  1 +
 include/net/xdp_priv.h              |  1 +
 63 files changed, 183 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/lapb.h b/include/linux/lapb.h
index eb56472f23b2..b5333f9413dc 100644
--- a/include/linux/lapb.h
+++ b/include/linux/lapb.h
@@ -6,6 +6,11 @@
 #ifndef	LAPB_KERNEL_H
 #define	LAPB_KERNEL_H
 
+#include <linux/skbuff.h>
+#include <linux/timer.h>
+
+struct net_device;
+
 #define	LAPB_OK			0
 #define	LAPB_BADTOKEN		1
 #define	LAPB_INVALUE		2
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index f742e50207fb..1c53c4c4d88f 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -10,6 +10,7 @@
 
 #include <linux/kernel.h>
 #include <linux/workqueue.h>
+#include <net/sock.h>
 #include <uapi/linux/vm_sockets.h>
 
 #include "vsock_addr.h"
diff --git a/include/net/amt.h b/include/net/amt.h
index 08fc30cf2f34..c881bc8b673b 100644
--- a/include/net/amt.h
+++ b/include/net/amt.h
@@ -7,6 +7,9 @@
 
 #include <linux/siphash.h>
 #include <linux/jhash.h>
+#include <linux/netdevice.h>
+#include <net/gro_cells.h>
+#include <net/rtnetlink.h>
 
 enum amt_msg_type {
 	AMT_MSG_DISCOVERY = 1,
diff --git a/include/net/ax88796.h b/include/net/ax88796.h
index 2ed23a368602..b658471f97f0 100644
--- a/include/net/ax88796.h
+++ b/include/net/ax88796.h
@@ -8,6 +8,8 @@
 #ifndef __NET_AX88796_PLAT_H
 #define __NET_AX88796_PLAT_H
 
+#include <linux/types.h>
+
 struct sk_buff;
 struct net_device;
 struct platform_device;
diff --git a/include/net/bond_options.h b/include/net/bond_options.h
index d2aea5cf1e41..69292ecc0325 100644
--- a/include/net/bond_options.h
+++ b/include/net/bond_options.h
@@ -7,6 +7,14 @@
 #ifndef _NET_BOND_OPTIONS_H
 #define _NET_BOND_OPTIONS_H
 
+#include <linux/bits.h>
+#include <linux/limits.h>
+#include <linux/types.h>
+#include <linux/string.h>
+
+struct netlink_ext_ack;
+struct nlattr;
+
 #define BOND_OPT_MAX_NAMELEN 32
 #define BOND_OPT_VALID(opt) ((opt) < BOND_OPT_LAST)
 #define BOND_MODE_ALL_EX(x) (~(x))
diff --git a/include/net/codel_qdisc.h b/include/net/codel_qdisc.h
index 58b6d0ebea10..7d3d9219f4fe 100644
--- a/include/net/codel_qdisc.h
+++ b/include/net/codel_qdisc.h
@@ -49,6 +49,7 @@
  * Implemented on linux by Dave Taht and Eric Dumazet
  */
 
+#include <net/codel.h>
 #include <net/pkt_sched.h>
 
 /* Qdiscs using codel plugin must use codel_skb_cb in their own cb[] */
diff --git a/include/net/datalink.h b/include/net/datalink.h
index d9b7faaa539f..c837ffc7ebf8 100644
--- a/include/net/datalink.h
+++ b/include/net/datalink.h
@@ -2,6 +2,13 @@
 #ifndef _NET_INET_DATALINK_H_
 #define _NET_INET_DATALINK_H_
 
+#include <linux/list.h>
+
+struct llc_sap;
+struct net_device;
+struct packet_type;
+struct sk_buff;
+
 struct datalink_proto {
         unsigned char   type[8];
 
diff --git a/include/net/dcbevent.h b/include/net/dcbevent.h
index 43e34131a53f..02700262f71a 100644
--- a/include/net/dcbevent.h
+++ b/include/net/dcbevent.h
@@ -8,6 +8,8 @@
 #ifndef _DCB_EVENT_H
 #define _DCB_EVENT_H
 
+struct notifier_block;
+
 enum dcbevent_notif_type {
 	DCB_APP_EVENT = 1,
 };
diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h
index e4ad58c4062c..2b2d86fb3131 100644
--- a/include/net/dcbnl.h
+++ b/include/net/dcbnl.h
@@ -10,6 +10,8 @@
 
 #include <linux/dcbnl.h>
 
+struct net_device;
+
 struct dcb_app_type {
 	int	ifindex;
 	struct dcb_app	  app;
diff --git a/include/net/dn_dev.h b/include/net/dn_dev.h
index 595b4f6c1eb1..bec303ea8367 100644
--- a/include/net/dn_dev.h
+++ b/include/net/dn_dev.h
@@ -2,6 +2,7 @@
 #ifndef _NET_DN_DEV_H
 #define _NET_DN_DEV_H
 
+#include <linux/netdevice.h>
 
 struct dn_dev;
 
diff --git a/include/net/dn_fib.h b/include/net/dn_fib.h
index ddd6565957b3..1929a3cd5ebe 100644
--- a/include/net/dn_fib.h
+++ b/include/net/dn_fib.h
@@ -4,6 +4,8 @@
 
 #include <linux/netlink.h>
 #include <linux/refcount.h>
+#include <linux/rtnetlink.h>
+#include <net/fib_rules.h>
 
 extern const struct nla_policy rtm_dn_policy[];
 
diff --git a/include/net/dn_neigh.h b/include/net/dn_neigh.h
index 2e3e7793973a..1f7df98bfc33 100644
--- a/include/net/dn_neigh.h
+++ b/include/net/dn_neigh.h
@@ -2,6 +2,8 @@
 #ifndef _NET_DN_NEIGH_H
 #define _NET_DN_NEIGH_H
 
+#include <net/neighbour.h>
+
 /*
  * The position of the first two fields of
  * this structure are critical - SJW
diff --git a/include/net/dn_nsp.h b/include/net/dn_nsp.h
index f83932b864a9..a4a18fee0b7c 100644
--- a/include/net/dn_nsp.h
+++ b/include/net/dn_nsp.h
@@ -6,6 +6,12 @@
     
 *******************************************************************************/
 /* dn_nsp.c functions prototyping */
+#include <linux/atomic.h>
+#include <linux/types.h>
+#include <net/sock.h>
+
+struct sk_buff;
+struct sk_buff_head;
 
 void dn_nsp_send_data_ack(struct sock *sk);
 void dn_nsp_send_oth_ack(struct sock *sk);
diff --git a/include/net/dn_route.h b/include/net/dn_route.h
index 6f1e94ac0bdf..88c0300236cc 100644
--- a/include/net/dn_route.h
+++ b/include/net/dn_route.h
@@ -7,6 +7,9 @@
     
 *******************************************************************************/
 
+#include <linux/types.h>
+#include <net/dst.h>
+
 struct sk_buff *dn_alloc_skb(struct sock *sk, int size, gfp_t pri);
 int dn_route_output_sock(struct dst_entry __rcu **pprt, struct flowidn *,
 			 struct sock *sk, int flags);
diff --git a/include/net/erspan.h b/include/net/erspan.h
index 0d9e86bd9893..6cb4cbd6a48f 100644
--- a/include/net/erspan.h
+++ b/include/net/erspan.h
@@ -58,6 +58,9 @@
  * GRE proto ERSPAN type I/II = 0x88BE, type III = 0x22EB
  */
 
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/skbuff.h>
 #include <uapi/linux/erspan.h>
 
 #define ERSPAN_VERSION	0x1	/* ERSPAN type II */
diff --git a/include/net/esp.h b/include/net/esp.h
index 9c5637d41d95..322950727dd0 100644
--- a/include/net/esp.h
+++ b/include/net/esp.h
@@ -5,6 +5,7 @@
 #include <linux/skbuff.h>
 
 struct ip_esp_hdr;
+struct xfrm_state;
 
 static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb)
 {
diff --git a/include/net/ethoc.h b/include/net/ethoc.h
index 78519ed42ab4..73810f3ca492 100644
--- a/include/net/ethoc.h
+++ b/include/net/ethoc.h
@@ -10,6 +10,9 @@
 #ifndef LINUX_NET_ETHOC_H
 #define LINUX_NET_ETHOC_H 1
 
+#include <linux/if.h>
+#include <linux/types.h>
+
 struct ethoc_platform_data {
 	u8 hwaddr[IFHWADDRLEN];
 	s8 phy_id;
diff --git a/include/net/firewire.h b/include/net/firewire.h
index 299e5df38552..2442d645e412 100644
--- a/include/net/firewire.h
+++ b/include/net/firewire.h
@@ -2,6 +2,8 @@
 #ifndef _NET_FIREWIRE_H
 #define _NET_FIREWIRE_H
 
+#include <linux/types.h>
+
 /* Pseudo L2 address */
 #define FWNET_ALEN	16
 union fwnet_hwaddr {
diff --git a/include/net/fq.h b/include/net/fq.h
index 2eccbbd2b559..07b5aff6ec58 100644
--- a/include/net/fq.h
+++ b/include/net/fq.h
@@ -7,6 +7,10 @@
 #ifndef __NET_SCHED_FQ_H
 #define __NET_SCHED_FQ_H
 
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
 struct fq_tin;
 
 /**
diff --git a/include/net/garp.h b/include/net/garp.h
index 4d9a0c6a2e5f..59a07b171def 100644
--- a/include/net/garp.h
+++ b/include/net/garp.h
@@ -2,6 +2,8 @@
 #ifndef _NET_GARP_H
 #define _NET_GARP_H
 
+#include <linux/if_ether.h>
+#include <linux/types.h>
 #include <net/stp.h>
 
 #define GARP_PROTOCOL_ID	0x1
diff --git a/include/net/gtp.h b/include/net/gtp.h
index c1d6169df331..2a503f035d18 100644
--- a/include/net/gtp.h
+++ b/include/net/gtp.h
@@ -2,6 +2,10 @@
 #ifndef _GTP_H_
 #define _GTP_H_
 
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <net/rtnetlink.h>
+
 /* General GTP protocol related definitions. */
 
 #define GTP0_PORT	3386
diff --git a/include/net/gue.h b/include/net/gue.h
index e42402f180b7..dfca298bec9c 100644
--- a/include/net/gue.h
+++ b/include/net/gue.h
@@ -30,6 +30,9 @@
  * may refer to options placed after this field.
  */
 
+#include <asm/byteorder.h>
+#include <linux/types.h>
+
 struct guehdr {
 	union {
 		struct {
diff --git a/include/net/hwbm.h b/include/net/hwbm.h
index c81444611a22..aa495decec35 100644
--- a/include/net/hwbm.h
+++ b/include/net/hwbm.h
@@ -2,6 +2,8 @@
 #ifndef _HWBM_H
 #define _HWBM_H
 
+#include <linux/mutex.h>
+
 struct hwbm_pool {
 	/* Capacity of the pool */
 	int size;
diff --git a/include/net/ila.h b/include/net/ila.h
index f98dcd5791b0..73ebe5eab272 100644
--- a/include/net/ila.h
+++ b/include/net/ila.h
@@ -8,6 +8,8 @@
 #ifndef _NET_ILA_H
 #define _NET_ILA_H
 
+struct sk_buff;
+
 int ila_xlat_outgoing(struct sk_buff *skb);
 int ila_xlat_incoming(struct sk_buff *skb);
 
diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h
index 7392f959a405..025bd8d3c769 100644
--- a/include/net/inet6_connection_sock.h
+++ b/include/net/inet6_connection_sock.h
@@ -11,6 +11,8 @@
 
 #include <linux/types.h>
 
+struct flowi;
+struct flowi6;
 struct request_sock;
 struct sk_buff;
 struct sock;
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index cad2a611efde..cec453c18f1d 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -3,6 +3,10 @@
 #define _INET_COMMON_H
 
 #include <linux/indirect_call_wrapper.h>
+#include <linux/net.h>
+#include <linux/netdev_features.h>
+#include <linux/types.h>
+#include <net/sock.h>
 
 extern const struct proto_ops inet_stream_ops;
 extern const struct proto_ops inet_dgram_ops;
@@ -12,6 +16,8 @@ extern const struct proto_ops inet_dgram_ops;
  */
 
 struct msghdr;
+struct net;
+struct page;
 struct sock;
 struct sockaddr;
 struct socket;
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 911ad930867d..0b0876610553 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -4,6 +4,9 @@
 
 #include <linux/rhashtable-types.h>
 #include <linux/completion.h>
+#include <linux/in6.h>
+#include <linux/rbtree_types.h>
+#include <linux/refcount.h>
 
 /* Per netns frag queues directory */
 struct fqdir {
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index ca2d6b60e1ec..035d61d50a98 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -2,6 +2,16 @@
 #ifndef _NET_IP6_ROUTE_H
 #define _NET_IP6_ROUTE_H
 
+#include <net/addrconf.h>
+#include <net/flow.h>
+#include <net/ip6_fib.h>
+#include <net/sock.h>
+#include <net/lwtunnel.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/route.h>
+#include <net/nexthop.h>
+
 struct route_info {
 	__u8			type;
 	__u8			length;
@@ -19,16 +29,6 @@ struct route_info {
 	__u8			prefix[];	/* 0,8 or 16 */
 };
 
-#include <net/addrconf.h>
-#include <net/flow.h>
-#include <net/ip6_fib.h>
-#include <net/sock.h>
-#include <net/lwtunnel.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <linux/route.h>
-#include <net/nexthop.h>
-
 #define RT6_LOOKUP_F_IFACE		0x00000001
 #define RT6_LOOKUP_F_REACHABLE		0x00000002
 #define RT6_LOOKUP_F_HAS_SADDR		0x00000004
diff --git a/include/net/ipcomp.h b/include/net/ipcomp.h
index fee6fc451597..c31108295079 100644
--- a/include/net/ipcomp.h
+++ b/include/net/ipcomp.h
@@ -2,11 +2,13 @@
 #ifndef _NET_IPCOMP_H
 #define _NET_IPCOMP_H
 
+#include <linux/skbuff.h>
 #include <linux/types.h>
 
 #define IPCOMP_SCRATCH_SIZE     65400
 
 struct crypto_comp;
+struct ip_comp_hdr;
 
 struct ipcomp_data {
 	u16 threshold;
diff --git a/include/net/ipconfig.h b/include/net/ipconfig.h
index e3534299bd2a..8276897d0c2e 100644
--- a/include/net/ipconfig.h
+++ b/include/net/ipconfig.h
@@ -7,6 +7,8 @@
 
 /* The following are initdata: */
 
+#include <linux/types.h>
+
 extern int ic_proto_enabled;	/* Protocols enabled (see IC_xxx) */
 extern int ic_set_manually;	/* IPconfig parameters set manually */
 
diff --git a/include/net/llc_c_ac.h b/include/net/llc_c_ac.h
index e766300b3e99..3e1f76786d7b 100644
--- a/include/net/llc_c_ac.h
+++ b/include/net/llc_c_ac.h
@@ -16,6 +16,13 @@
  * Connection state transition actions
  * (Fb = F bit; Pb = P bit; Xb = X bit)
  */
+
+#include <linux/types.h>
+
+struct sk_buff;
+struct sock;
+struct timer_list;
+
 #define LLC_CONN_AC_CLR_REMOTE_BUSY			 1
 #define LLC_CONN_AC_CONN_IND				 2
 #define LLC_CONN_AC_CONN_CONFIRM			 3
diff --git a/include/net/llc_c_st.h b/include/net/llc_c_st.h
index 48f3f891b2f9..53823d61d8b6 100644
--- a/include/net/llc_c_st.h
+++ b/include/net/llc_c_st.h
@@ -11,6 +11,10 @@
  *
  * See the GNU General Public License for more details.
  */
+
+#include <net/llc_c_ac.h>
+#include <net/llc_c_ev.h>
+
 /* Connection component state management */
 /* connection states */
 #define LLC_CONN_OUT_OF_SVC		 0	/* prior to allocation */
diff --git a/include/net/llc_s_ac.h b/include/net/llc_s_ac.h
index a61b98c108ee..f71790305bc9 100644
--- a/include/net/llc_s_ac.h
+++ b/include/net/llc_s_ac.h
@@ -11,6 +11,10 @@
  *
  * See the GNU General Public License for more details.
  */
+
+struct llc_sap;
+struct sk_buff;
+
 /* SAP component actions */
 #define SAP_ACT_UNITDATA_IND	1
 #define SAP_ACT_SEND_UI		2
diff --git a/include/net/llc_s_ev.h b/include/net/llc_s_ev.h
index 84db3a59ed28..fb7df1d70af3 100644
--- a/include/net/llc_s_ev.h
+++ b/include/net/llc_s_ev.h
@@ -13,6 +13,7 @@
  */
 
 #include <linux/skbuff.h>
+#include <net/llc.h>
 
 /* Defines SAP component events */
 /* Types of events (possible values in 'ev->type') */
diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h
index 9deb3a3735da..0c71c27979fb 100644
--- a/include/net/mpls_iptunnel.h
+++ b/include/net/mpls_iptunnel.h
@@ -6,6 +6,9 @@
 #ifndef _NET_MPLS_IPTUNNEL_H
 #define _NET_MPLS_IPTUNNEL_H 1
 
+#include <linux/types.h>
+#include <net/lwtunnel.h>
+
 struct mpls_iptunnel_encap {
 	u8	labels;
 	u8	ttl_propagate;
diff --git a/include/net/mrp.h b/include/net/mrp.h
index 1c308c034e1a..92cd3fb6cf9d 100644
--- a/include/net/mrp.h
+++ b/include/net/mrp.h
@@ -2,6 +2,10 @@
 #ifndef _NET_MRP_H
 #define _NET_MRP_H
 
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+
 #define MRP_END_MARK		0x0
 
 struct mrp_pdu_hdr {
diff --git a/include/net/ncsi.h b/include/net/ncsi.h
index fbefe80361ee..08a50d9acb0a 100644
--- a/include/net/ncsi.h
+++ b/include/net/ncsi.h
@@ -2,6 +2,8 @@
 #ifndef __NET_NCSI_H
 #define __NET_NCSI_H
 
+#include <linux/types.h>
+
 /*
  * The NCSI device states seen from external. More NCSI device states are
  * only visible internally (in net/ncsi/internal.h). When the NCSI device
diff --git a/include/net/netevent.h b/include/net/netevent.h
index 4107016c3bb4..1be3757a8b7f 100644
--- a/include/net/netevent.h
+++ b/include/net/netevent.h
@@ -14,6 +14,7 @@
 
 struct dst_entry;
 struct neighbour;
+struct notifier_block ;
 
 struct netevent_redirect {
 	struct dst_entry *old;
diff --git a/include/net/netns/can.h b/include/net/netns/can.h
index 52fbd8291a96..48b79f7e6236 100644
--- a/include/net/netns/can.h
+++ b/include/net/netns/can.h
@@ -7,6 +7,7 @@
 #define __NETNS_CAN_H__
 
 #include <linux/spinlock.h>
+#include <linux/timer.h>
 
 struct can_dev_rcv_lists;
 struct can_pkg_stats;
diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index 388244e315e7..8249060cf5d0 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -2,6 +2,8 @@
 #ifndef __NETNS_CORE_H__
 #define __NETNS_CORE_H__
 
+#include <linux/types.h>
+
 struct ctl_table_header;
 struct prot_inuse;
 
diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h
index 8a1ab47c3fb3..7ce68183f6e1 100644
--- a/include/net/netns/generic.h
+++ b/include/net/netns/generic.h
@@ -8,6 +8,7 @@
 
 #include <linux/bug.h>
 #include <linux/rcupdate.h>
+#include <net/net_namespace.h>
 
 /*
  * Generic net pointers are to be used by modules to put some private
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index ce0cc4e8d8c7..c7320ef356d9 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -9,6 +9,7 @@
 #include <linux/uidgid.h>
 #include <net/inet_frag.h>
 #include <linux/rcupdate.h>
+#include <linux/seqlock.h>
 #include <linux/siphash.h>
 
 struct ctl_table_header;
diff --git a/include/net/netns/mctp.h b/include/net/netns/mctp.h
index acedef12a35e..1db8f9aaddb4 100644
--- a/include/net/netns/mctp.h
+++ b/include/net/netns/mctp.h
@@ -6,6 +6,7 @@
 #ifndef __NETNS_MCTP_H__
 #define __NETNS_MCTP_H__
 
+#include <linux/mutex.h>
 #include <linux/types.h>
 
 struct netns_mctp {
diff --git a/include/net/netns/mpls.h b/include/net/netns/mpls.h
index a7bdcfbb0b28..19ad2574b267 100644
--- a/include/net/netns/mpls.h
+++ b/include/net/netns/mpls.h
@@ -6,6 +6,8 @@
 #ifndef __NETNS_MPLS_H__
 #define __NETNS_MPLS_H__
 
+#include <linux/types.h>
+
 struct mpls_route;
 struct ctl_table_header;
 
diff --git a/include/net/netns/nexthop.h b/include/net/netns/nexthop.h
index 1849e77eb68a..434239b37014 100644
--- a/include/net/netns/nexthop.h
+++ b/include/net/netns/nexthop.h
@@ -6,6 +6,7 @@
 #ifndef __NETNS_NEXTHOP_H__
 #define __NETNS_NEXTHOP_H__
 
+#include <linux/notifier.h>
 #include <linux/rbtree.h>
 
 struct netns_nexthop {
diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
index 40240722cdca..a681147aecd8 100644
--- a/include/net/netns/sctp.h
+++ b/include/net/netns/sctp.h
@@ -2,6 +2,9 @@
 #ifndef __NETNS_SCTP_H__
 #define __NETNS_SCTP_H__
 
+#include <linux/timer.h>
+#include <net/snmp.h>
+
 struct sock;
 struct proc_dir_entry;
 struct sctp_mib;
diff --git a/include/net/netns/unix.h b/include/net/netns/unix.h
index 6f1a33df061d..9859d134d5a8 100644
--- a/include/net/netns/unix.h
+++ b/include/net/netns/unix.h
@@ -5,6 +5,8 @@
 #ifndef __NETNS_UNIX_H__
 #define __NETNS_UNIX_H__
 
+#include <linux/spinlock.h>
+
 struct unix_table {
 	spinlock_t		*locks;
 	struct hlist_head	*buckets;
diff --git a/include/net/netrom.h b/include/net/netrom.h
index 80f15b1c1a48..f0565a5987d1 100644
--- a/include/net/netrom.h
+++ b/include/net/netrom.h
@@ -14,6 +14,7 @@
 #include <net/sock.h>
 #include <linux/refcount.h>
 #include <linux/seq_file.h>
+#include <net/ax25.h>
 
 #define	NR_NETWORK_LEN			15
 #define	NR_TRANSPORT_LEN		5
diff --git a/include/net/p8022.h b/include/net/p8022.h
index c2bacc66bfbc..b690ffcad66b 100644
--- a/include/net/p8022.h
+++ b/include/net/p8022.h
@@ -1,6 +1,11 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _NET_P8022_H
 #define _NET_P8022_H
+
+struct net_device;
+struct packet_type;
+struct sk_buff;
+
 struct datalink_proto *
 register_8022_client(unsigned char type,
 		     int (*func)(struct sk_buff *skb,
diff --git a/include/net/phonet/pep.h b/include/net/phonet/pep.h
index 27b1ab5e4e6d..645dddf5ce77 100644
--- a/include/net/phonet/pep.h
+++ b/include/net/phonet/pep.h
@@ -10,6 +10,9 @@
 #ifndef NET_PHONET_PEP_H
 #define NET_PHONET_PEP_H
 
+#include <linux/skbuff.h>
+#include <net/phonet/phonet.h>
+
 struct pep_sock {
 	struct pn_sock		pn_sk;
 
diff --git a/include/net/phonet/phonet.h b/include/net/phonet/phonet.h
index a27bdc6cfeab..862f1719b523 100644
--- a/include/net/phonet/phonet.h
+++ b/include/net/phonet/phonet.h
@@ -10,6 +10,10 @@
 #ifndef AF_PHONET_H
 #define AF_PHONET_H
 
+#include <linux/phonet.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+
 /*
  * The lower layers may not require more space, ever. Make sure it's
  * enough.
diff --git a/include/net/phonet/pn_dev.h b/include/net/phonet/pn_dev.h
index 05b49d4d2b11..e9dc8dca5817 100644
--- a/include/net/phonet/pn_dev.h
+++ b/include/net/phonet/pn_dev.h
@@ -10,6 +10,11 @@
 #ifndef PN_DEV_H
 #define PN_DEV_H
 
+#include <linux/list.h>
+#include <linux/mutex.h>
+
+struct net;
+
 struct phonet_device_list {
 	struct list_head list;
 	struct mutex lock;
diff --git a/include/net/pptp.h b/include/net/pptp.h
index 383e25ca53a7..e63176bdd4c8 100644
--- a/include/net/pptp.h
+++ b/include/net/pptp.h
@@ -2,6 +2,9 @@
 #ifndef _NET_PPTP_H
 #define _NET_PPTP_H
 
+#include <linux/types.h>
+#include <net/gre.h>
+
 #define PPP_LCP_ECHOREQ 0x09
 #define PPP_LCP_ECHOREP 0x0A
 #define SC_RCV_BITS     (SC_RCV_B7_1|SC_RCV_B7_0|SC_RCV_ODDP|SC_RCV_EVNP)
diff --git a/include/net/psnap.h b/include/net/psnap.h
index 7cb0c8ab4171..88802b0754ad 100644
--- a/include/net/psnap.h
+++ b/include/net/psnap.h
@@ -2,6 +2,11 @@
 #ifndef _NET_PSNAP_H
 #define _NET_PSNAP_H
 
+struct datalink_proto;
+struct sk_buff;
+struct packet_type;
+struct net_device;
+
 struct datalink_proto *
 register_snap_client(const unsigned char *desc,
 		     int (*rcvfunc)(struct sk_buff *, struct net_device *,
diff --git a/include/net/regulatory.h b/include/net/regulatory.h
index 47f06f6f5a67..896191f420d5 100644
--- a/include/net/regulatory.h
+++ b/include/net/regulatory.h
@@ -1,3 +1,4 @@
+
 #ifndef __NET_REGULATORY_H
 #define __NET_REGULATORY_H
 /*
@@ -19,6 +20,8 @@
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
+#include <linux/ieee80211.h>
+#include <linux/nl80211.h>
 #include <linux/rcupdate.h>
 
 /**
diff --git a/include/net/rose.h b/include/net/rose.h
index 0f0a4ce0fee7..f192a64ddef2 100644
--- a/include/net/rose.h
+++ b/include/net/rose.h
@@ -9,6 +9,7 @@
 #define _ROSE_H 
 
 #include <linux/rose.h>
+#include <net/ax25.h>
 #include <net/sock.h>
 
 #define	ROSE_ADDR_LEN			5
diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h
index dac91aa38c5a..21e7fa2a1813 100644
--- a/include/net/secure_seq.h
+++ b/include/net/secure_seq.h
@@ -4,6 +4,8 @@
 
 #include <linux/types.h>
 
+struct net;
+
 u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport);
 u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
 			       __be16 dport);
diff --git a/include/net/smc.h b/include/net/smc.h
index e441aa97ad61..37f829d9c6e5 100644
--- a/include/net/smc.h
+++ b/include/net/smc.h
@@ -11,6 +11,13 @@
 #ifndef _SMC_H
 #define _SMC_H
 
+#include <linux/device.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+
+struct sock;
+
 #define SMC_MAX_PNETID_LEN	16	/* Max. length of PNET id */
 
 struct smc_hashinfo {
diff --git a/include/net/stp.h b/include/net/stp.h
index 2914e6d53490..528103fce2c0 100644
--- a/include/net/stp.h
+++ b/include/net/stp.h
@@ -2,6 +2,8 @@
 #ifndef _NET_STP_H
 #define _NET_STP_H
 
+#include <linux/if_ether.h>
+
 struct stp_proto {
 	unsigned char	group_address[ETH_ALEN];
 	void		(*rcv)(const struct stp_proto *, struct sk_buff *,
diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h
index da06613c9603..b830463e3dff 100644
--- a/include/net/transp_v6.h
+++ b/include/net/transp_v6.h
@@ -3,6 +3,7 @@
 #define _TRANSP_V6_H
 
 #include <net/checksum.h>
+#include <net/sock.h>
 
 /* IPv6 transport protocols */
 extern struct proto rawv6_prot;
@@ -12,6 +13,7 @@ extern struct proto tcpv6_prot;
 extern struct proto pingv6_prot;
 
 struct flowi6;
+struct ipcm6_cookie;
 
 /* extension headers */
 int ipv6_exthdrs_init(void);
diff --git a/include/net/tun_proto.h b/include/net/tun_proto.h
index 2ea3deba4c99..7b0de7852908 100644
--- a/include/net/tun_proto.h
+++ b/include/net/tun_proto.h
@@ -1,7 +1,8 @@
 #ifndef __NET_TUN_PROTO_H
 #define __NET_TUN_PROTO_H
 
-#include <linux/kernel.h>
+#include <linux/if_ether.h>
+#include <linux/types.h>
 
 /* One byte protocol values as defined by VXLAN-GPE and NSH. These will
  * hopefully get a shared IANA registry.
diff --git a/include/net/udplite.h b/include/net/udplite.h
index a3c53110d30b..0143b373602e 100644
--- a/include/net/udplite.h
+++ b/include/net/udplite.h
@@ -6,6 +6,7 @@
 #define _UDPLITE_H
 
 #include <net/ip6_checksum.h>
+#include <net/udp.h>
 
 /* UDP-Lite socket options */
 #define UDPLITE_SEND_CSCOV   10 /* sender partial coverage (as sent)      */
diff --git a/include/net/xdp_priv.h b/include/net/xdp_priv.h
index a2d58b1a12e1..c9df68d5f258 100644
--- a/include/net/xdp_priv.h
+++ b/include/net/xdp_priv.h
@@ -3,6 +3,7 @@
 #define __LINUX_NET_XDP_PRIV_H__
 
 #include <linux/rhashtable.h>
+#include <net/xdp.h>
 
 /* Private to net/core/xdp.c, but used by trace/events/xdp.h */
 struct xdp_mem_allocator {
-- 
cgit 


From 9dd1953846c7cd58100a5c6bd90db54e2c60668a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 19 Jul 2022 10:26:50 +0200
Subject: wifi: nl80211/mac80211: clarify link ID in control port TX

Clarify the link ID behaviour in control port TX, we need it
to select the link to transmit on for both MLD and non-MLD
receivers, but select the link address as the SA only if the
receiver is not an MLD.

Fixes: 67207bab9341 ("wifi: cfg80211/mac80211: Support control port TX from specific link")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h |  6 ++++++
 net/mac80211/tx.c            | 43 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 47 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 3fa586e38f88..d4d6ba585b41 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1119,6 +1119,12 @@
  *	has been received. %NL80211_ATTR_FRAME is used to specify the
  *	frame contents.  The frame is the raw EAPoL data, without ethernet or
  *	802.11 headers.
+ *	For an MLD transmitter, the %NL80211_ATTR_MLO_LINK_ID may be given and
+ *	its effect will depend on the destination: If the destination is known
+ *	to be an MLD, this will be used as a hint to select the link to transmit
+ *	the frame on. If the destination is not an MLD, this will select both
+ *	the link to transmit on and the source address will be set to the link
+ *	address of that link.
  *	When used as an event indication %NL80211_ATTR_CONTROL_PORT_ETHERTYPE,
  *	%NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT and %NL80211_ATTR_MAC are added
  *	indicating the protocol type of the received frame; whether the frame
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 772108c2cc6b..06ec152e8188 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -2896,9 +2896,35 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
 	info->flags = info_flags;
 	info->ack_frame_id = info_id;
 	info->band = band;
-	info->control.flags = ctrl_flags |
-			      u32_encode_bits(link_id,
+
+	if (likely(!cookie)) {
+		ctrl_flags |= u32_encode_bits(link_id,
 					      IEEE80211_TX_CTRL_MLO_LINK);
+	} else {
+		unsigned int pre_conf_link_id;
+
+		/*
+		 * ctrl_flags already have been set by
+		 * ieee80211_tx_control_port(), here
+		 * we just sanity check that
+		 */
+
+		pre_conf_link_id = u32_get_bits(ctrl_flags,
+						IEEE80211_TX_CTRL_MLO_LINK);
+
+		if (pre_conf_link_id != link_id &&
+		    link_id != IEEE80211_LINK_UNSPECIFIED) {
+#ifdef CPTCFG_MAC80211_VERBOSE_DEBUG
+			net_info_ratelimited("%s: dropped frame to %pM with bad link ID request (%d vs. %d)\n",
+					     sdata->name, hdr.addr1,
+					     pre_conf_link_id, link_id);
+#endif
+			ret = -EINVAL;
+			goto free;
+		}
+	}
+
+	info->control.flags = ctrl_flags;
 
 	return skb;
  free:
@@ -5745,11 +5771,17 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
 	ehdr = skb_push(skb, sizeof(struct ethhdr));
 	memcpy(ehdr->h_dest, dest, ETH_ALEN);
 
+	/* we may override the SA for MLO STA later */
 	if (link_id < 0) {
+		ctrl_flags |= u32_encode_bits(IEEE80211_LINK_UNSPECIFIED,
+					      IEEE80211_TX_CTRL_MLO_LINK);
 		memcpy(ehdr->h_source, sdata->vif.addr, ETH_ALEN);
 	} else {
 		struct ieee80211_bss_conf *link_conf;
 
+		ctrl_flags |= u32_encode_bits(link_id,
+					      IEEE80211_TX_CTRL_MLO_LINK);
+
 		rcu_read_lock();
 		link_conf = rcu_dereference(sdata->vif.link_conf[link_id]);
 		if (!link_conf) {
@@ -5784,6 +5816,13 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
 
 		skb_set_queue_mapping(skb, queue);
 		skb_get_hash(skb);
+
+		/*
+		 * for MLO STA, the SA should be the AP MLD address, but
+		 * the link ID has been selected already
+		 */
+		if (sta->sta.mlo)
+			memcpy(ehdr->h_source, sdata->vif.addr, ETH_ALEN);
 	}
 	rcu_read_unlock();
 
-- 
cgit 


From 0903f899418ee0235e4ad756f5502162f29b49b9 Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Thu, 27 Jan 2022 14:39:46 +0200
Subject: wifi: ieee80211: add helper functions for detecting TM/FTM frames

Add helper functions for detection timing measurement
and fine timing measurement frames.

Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 54 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index cca564372d16..55e6f4ad0ca6 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1332,6 +1332,15 @@ struct ieee80211_mgmt {
 					u8 action_code;
 					u8 variable[];
 				} __packed s1g;
+				struct {
+					u8 action_code;
+					u8 dialog_token;
+					u8 follow_up;
+					u32 tod;
+					u32 toa;
+					u8 max_tod_error;
+					u8 max_toa_error;
+				} __packed wnm_timing_msr;
 			} u;
 		} __packed action;
 	} u;
@@ -3522,6 +3531,17 @@ enum ieee80211_mesh_actioncode {
 	WLAN_MESH_ACTION_TBTT_ADJUSTMENT_RESPONSE,
 };
 
+/* Unprotected WNM action codes */
+enum ieee80211_unprotected_wnm_actioncode {
+	WLAN_UNPROTECTED_WNM_ACTION_TIM = 0,
+	WLAN_UNPROTECTED_WNM_ACTION_TIMING_MEASUREMENT_RESPONSE = 1,
+};
+
+/* Public action codes */
+enum ieee80211_public_actioncode {
+	WLAN_PUBLIC_ACTION_FTM_RESPONSE = 33,
+};
+
 /* Security key length */
 enum ieee80211_key_len {
 	WLAN_KEY_LEN_WEP40 = 5,
@@ -4311,6 +4331,40 @@ static inline bool ieee80211_action_contains_tpc(struct sk_buff *skb)
 	return true;
 }
 
+static inline bool ieee80211_is_timing_measurement(struct sk_buff *skb)
+{
+	struct ieee80211_mgmt *mgmt = (void *)skb->data;
+
+	if (skb->len < IEEE80211_MIN_ACTION_SIZE)
+		return false;
+
+	if (!ieee80211_is_action(mgmt->frame_control))
+		return false;
+
+	if (mgmt->u.action.category == WLAN_CATEGORY_WNM_UNPROTECTED &&
+	    mgmt->u.action.u.wnm_timing_msr.action_code ==
+		WLAN_UNPROTECTED_WNM_ACTION_TIMING_MEASUREMENT_RESPONSE &&
+	    skb->len >= offsetofend(typeof(*mgmt), u.action.u.wnm_timing_msr))
+		return true;
+
+	return false;
+}
+
+static inline bool ieee80211_is_ftm(struct sk_buff *skb)
+{
+	struct ieee80211_mgmt *mgmt = (void *)skb->data;
+
+	if (!ieee80211_is_public_action((void *)mgmt, skb->len))
+		return false;
+
+	if (mgmt->u.action.u.ftm.action_code ==
+		WLAN_PUBLIC_ACTION_FTM_RESPONSE &&
+	    skb->len >= offsetofend(typeof(*mgmt), u.action.u.ftm))
+		return true;
+
+	return false;
+}
+
 struct element {
 	u8 id;
 	u8 datalen;
-- 
cgit 


From 80b0ed70a271d375feb2286696ca8af147a035cf Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Wed, 26 Jan 2022 16:06:35 +0200
Subject: wifi: nl80211: add RX and TX timestamp attributes

Add attributes for reporting hardware timestamps for management frames
RX and TX. These attributes will be used for reporting hardware
timestamps for Timing measurement and Fine Timing Measurement action
frames, which will allow userspace applications to measure the path
delay between devices and sync clocks.

For TX, these attributes are used for reporting the frame RX time and
the ack TX time. For TX, they are used for reporting the frame TX time
and the ack RX time.

Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index d4d6ba585b41..5275dcbc5ee8 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -764,6 +764,9 @@
  *	%NL80211_ATTR_CSA_C_OFFSETS_TX is an array of offsets to CSA
  *	counters which will be updated to the current value. This attribute
  *	is used during CSA period.
+ *	For RX notification, %NL80211_ATTR_RX_HW_TIMESTAMP may be included to
+ *	indicate the frame RX timestamp and %NL80211_ATTR_TX_HW_TIMESTAMP may
+ *	be included to indicate the ack TX timestamp.
  * @NL80211_CMD_FRAME_WAIT_CANCEL: When an off-channel TX was requested, this
  *	command may be used with the corresponding cookie to cancel the wait
  *	time if it is known that it is no longer necessary.  This command is
@@ -774,7 +777,9 @@
  *	transmitted with %NL80211_CMD_FRAME. %NL80211_ATTR_COOKIE identifies
  *	the TX command and %NL80211_ATTR_FRAME includes the contents of the
  *	frame. %NL80211_ATTR_ACK flag is included if the recipient acknowledged
- *	the frame.
+ *	the frame. %NL80211_ATTR_TX_HW_TIMESTAMP may be included to indicate the
+ *	tx timestamp and %NL80211_ATTR_RX_HW_TIMESTAMP may be included to
+ *	indicate the ack RX timestamp.
  * @NL80211_CMD_ACTION_TX_STATUS: Alias for @NL80211_CMD_FRAME_TX_STATUS for
  *	backward compatibility.
  *
@@ -2720,6 +2725,18 @@ enum nl80211_commands {
  * @NL80211_ATTR_EML_CAPABILITY: EML Capability information (u16)
  * @NL80211_ATTR_MLD_CAPA_AND_OPS: MLD Capabilities and Operations (u16)
  *
+ * @NL80211_ATTR_TX_HW_TIMESTAMP: Hardware timestamp for TX operation in
+ *	nanoseconds (u64). This is the device clock timestamp so it will
+ *	probably reset when the device is stopped or the firmware is reset.
+ *	When used with %NL80211_CMD_FRAME_TX_STATUS, indicates the frame TX
+ *	timestamp. When used with %NL80211_CMD_FRAME RX notification, indicates
+ *	the ack TX timestamp.
+ * @NL80211_ATTR_RX_HW_TIMESTAMP: Hardware timestamp for RX operation in
+ *	nanoseconds (u64). This is the device clock timestamp so it will
+ *	probably reset when the device is stopped or the firmware is reset.
+ *	When used with %NL80211_CMD_FRAME_TX_STATUS, indicates the ack RX
+ *	timestamp. When used with %NL80211_CMD_FRAME RX notification, indicates
+ *	the incoming frame RX timestamp.
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -3245,6 +3262,9 @@ enum nl80211_attrs {
 	NL80211_ATTR_EML_CAPABILITY,
 	NL80211_ATTR_MLD_CAPA_AND_OPS,
 
+	NL80211_ATTR_TX_HW_TIMESTAMP,
+	NL80211_ATTR_RX_HW_TIMESTAMP,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
-- 
cgit 


From ea7d50c925cec96b22655c4dc8672fbd59355bed Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Wed, 26 Jan 2022 19:13:47 +0200
Subject: wifi: cfg80211: add a function for reporting TX status with hardware
 timestamps

Add a function for reporting TX status with hardware timestamps. This
function shall be used for reporting the TX status of Timing
measurement and Fine timing measurement action frames by devices that
support reporting hardware timestamps.

Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 47 +++++++++++++++++++++++++++++++++++++++++++++--
 net/wireless/nl80211.c | 42 ++++++++++++++++++++++++++++--------------
 2 files changed, 73 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index d5af3a7fc2b4..43d54f5c84f9 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -7837,6 +7837,38 @@ static inline bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq,
 				    flags);
 }
 
+/**
+ * struct cfg80211_tx_status - TX status for management frame information
+ *
+ * @cookie: Cookie returned by cfg80211_ops::mgmt_tx()
+ * @tx_tstamp: hardware TX timestamp in nanoseconds
+ * @ack_tstamp: hardware ack RX timestamp in nanoseconds
+ * @buf: Management frame (header + body)
+ * @len: length of the frame data
+ * @ack: Whether frame was acknowledged
+ */
+struct cfg80211_tx_status {
+	u64 cookie;
+	u64 tx_tstamp;
+	u64 ack_tstamp;
+	const u8 *buf;
+	size_t len;
+	bool ack;
+};
+
+/**
+ * cfg80211_mgmt_tx_status_ext - TX status notification with extended info
+ * @wdev: wireless device receiving the frame
+ * @status: TX status data
+ * @gfp: context flags
+ *
+ * This function is called whenever a management frame was requested to be
+ * transmitted with cfg80211_ops::mgmt_tx() to report the TX status of the
+ * transmission attempt with extended info.
+ */
+void cfg80211_mgmt_tx_status_ext(struct wireless_dev *wdev,
+				 struct cfg80211_tx_status *status, gfp_t gfp);
+
 /**
  * cfg80211_mgmt_tx_status - notification of TX status for management frame
  * @wdev: wireless device receiving the frame
@@ -7850,8 +7882,19 @@ static inline bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq,
  * transmitted with cfg80211_ops::mgmt_tx() to report the TX status of the
  * transmission attempt.
  */
-void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie,
-			     const u8 *buf, size_t len, bool ack, gfp_t gfp);
+static inline void cfg80211_mgmt_tx_status(struct wireless_dev *wdev,
+					   u64 cookie, const u8 *buf,
+					   size_t len, bool ack, gfp_t gfp)
+{
+	struct cfg80211_tx_status status = {
+		.cookie = cookie,
+		.buf = buf,
+		.len = len,
+		.ack = ack
+	};
+
+	cfg80211_mgmt_tx_status_ext(wdev, &status, gfp);
+}
 
 /**
  * cfg80211_control_port_tx_status - notification of TX status for control
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 310d22b263d1..a3934f443a84 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -18396,8 +18396,8 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
 	return -ENOBUFS;
 }
 
-static void nl80211_frame_tx_status(struct wireless_dev *wdev, u64 cookie,
-				    const u8 *buf, size_t len, bool ack,
+static void nl80211_frame_tx_status(struct wireless_dev *wdev,
+				    struct cfg80211_tx_status *status,
 				    gfp_t gfp, enum nl80211_commands command)
 {
 	struct wiphy *wiphy = wdev->wiphy;
@@ -18407,11 +18407,13 @@ static void nl80211_frame_tx_status(struct wireless_dev *wdev, u64 cookie,
 	void *hdr;
 
 	if (command == NL80211_CMD_FRAME_TX_STATUS)
-		trace_cfg80211_mgmt_tx_status(wdev, cookie, ack);
+		trace_cfg80211_mgmt_tx_status(wdev, status->cookie,
+					      status->ack);
 	else
-		trace_cfg80211_control_port_tx_status(wdev, cookie, ack);
+		trace_cfg80211_control_port_tx_status(wdev, status->cookie,
+						      status->ack);
 
-	msg = nlmsg_new(100 + len, gfp);
+	msg = nlmsg_new(100 + status->len, gfp);
 	if (!msg)
 		return;
 
@@ -18426,10 +18428,16 @@ static void nl80211_frame_tx_status(struct wireless_dev *wdev, u64 cookie,
 				   netdev->ifindex)) ||
 	    nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
 			      NL80211_ATTR_PAD) ||
-	    nla_put(msg, NL80211_ATTR_FRAME, len, buf) ||
-	    nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie,
+	    nla_put(msg, NL80211_ATTR_FRAME, status->len, status->buf) ||
+	    nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, status->cookie,
 			      NL80211_ATTR_PAD) ||
-	    (ack && nla_put_flag(msg, NL80211_ATTR_ACK)))
+	    (status->ack && nla_put_flag(msg, NL80211_ATTR_ACK)) ||
+	    (status->tx_tstamp &&
+	     nla_put_u64_64bit(msg, NL80211_ATTR_TX_HW_TIMESTAMP,
+			       status->tx_tstamp, NL80211_ATTR_PAD)) ||
+	    (status->ack_tstamp &&
+	     nla_put_u64_64bit(msg, NL80211_ATTR_RX_HW_TIMESTAMP,
+			       status->ack_tstamp, NL80211_ATTR_PAD)))
 		goto nla_put_failure;
 
 	genlmsg_end(msg, hdr);
@@ -18446,18 +18454,24 @@ void cfg80211_control_port_tx_status(struct wireless_dev *wdev, u64 cookie,
 				     const u8 *buf, size_t len, bool ack,
 				     gfp_t gfp)
 {
-	nl80211_frame_tx_status(wdev, cookie, buf, len, ack, gfp,
+	struct cfg80211_tx_status status = {
+		.cookie = cookie,
+		.buf = buf,
+		.len = len,
+		.ack = ack
+	};
+
+	nl80211_frame_tx_status(wdev, &status, gfp,
 				NL80211_CMD_CONTROL_PORT_FRAME_TX_STATUS);
 }
 EXPORT_SYMBOL(cfg80211_control_port_tx_status);
 
-void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie,
-			     const u8 *buf, size_t len, bool ack, gfp_t gfp)
+void cfg80211_mgmt_tx_status_ext(struct wireless_dev *wdev,
+				 struct cfg80211_tx_status *status, gfp_t gfp)
 {
-	nl80211_frame_tx_status(wdev, cookie, buf, len, ack, gfp,
-				NL80211_CMD_FRAME_TX_STATUS);
+	nl80211_frame_tx_status(wdev, status, gfp, NL80211_CMD_FRAME_TX_STATUS);
 }
-EXPORT_SYMBOL(cfg80211_mgmt_tx_status);
+EXPORT_SYMBOL(cfg80211_mgmt_tx_status_ext);
 
 static int __nl80211_rx_control_port(struct net_device *dev,
 				     struct sk_buff *skb,
-- 
cgit 


From 00b3d8401019e6abf89ea577cb1de060ea65e3fd Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Sun, 30 Jan 2022 18:17:51 +0200
Subject: wifi: cfg80211/nl80211: move rx management data into a struct

The functions for reporting rx management take many arguments.
Collect all the arguments into a struct, which also make it easier
to add more arguments if needed.

Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 60 ++++++++++++++++++++++++++++++++++++++++++++++----
 net/wireless/mlme.c    | 21 +++++++++---------
 net/wireless/nl80211.c | 19 ++++++++--------
 net/wireless/nl80211.h |  5 ++---
 net/wireless/trace.h   |  8 +++----
 5 files changed, 81 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 43d54f5c84f9..535e326a35b0 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -7792,6 +7792,39 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr,
 			  enum nl80211_connect_failed_reason reason,
 			  gfp_t gfp);
 
+/**
+ * struct cfg80211_rx_info - received management frame info
+ *
+ * @freq: Frequency on which the frame was received in kHz
+ * @sig_dbm: signal strength in dBm, or 0 if unknown
+ * @buf: Management frame (header + body)
+ * @len: length of the frame data
+ * @flags: flags, as defined in enum nl80211_rxmgmt_flags
+ */
+struct cfg80211_rx_info {
+	int freq;
+	int sig_dbm;
+	const u8 *buf;
+	size_t len;
+	u32 flags;
+};
+
+/**
+ * cfg80211_rx_mgmt_ext - management frame notification with extended info
+ * @wdev: wireless device receiving the frame
+ * @info: RX info as defined in struct cfg80211_rx_info
+ *
+ * This function is called whenever an Action frame is received for a station
+ * mode interface, but is not processed in kernel.
+ *
+ * Return: %true if a user space application has registered for this frame.
+ * For action frames, that makes it responsible for rejecting unrecognized
+ * action frames; %false otherwise, in which case for action frames the
+ * driver is responsible for rejecting the frame.
+ */
+bool cfg80211_rx_mgmt_ext(struct wireless_dev *wdev,
+			  struct cfg80211_rx_info *info);
+
 /**
  * cfg80211_rx_mgmt_khz - notification of received, unprocessed management frame
  * @wdev: wireless device receiving the frame
@@ -7809,8 +7842,20 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr,
  * action frames; %false otherwise, in which case for action frames the
  * driver is responsible for rejecting the frame.
  */
-bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm,
-			  const u8 *buf, size_t len, u32 flags);
+static inline bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq,
+					int sig_dbm, const u8 *buf, size_t len,
+					u32 flags)
+{
+	struct cfg80211_rx_info info = {
+		.freq = freq,
+		.sig_dbm = sig_dbm,
+		.buf = buf,
+		.len = len,
+		.flags = flags
+	};
+
+	return cfg80211_rx_mgmt_ext(wdev, &info);
+}
 
 /**
  * cfg80211_rx_mgmt - notification of received, unprocessed management frame
@@ -7833,8 +7878,15 @@ static inline bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq,
 				    int sig_dbm, const u8 *buf, size_t len,
 				    u32 flags)
 {
-	return cfg80211_rx_mgmt_khz(wdev, MHZ_TO_KHZ(freq), sig_dbm, buf, len,
-				    flags);
+	struct cfg80211_rx_info info = {
+		.freq = MHZ_TO_KHZ(freq),
+		.sig_dbm = sig_dbm,
+		.buf = buf,
+		.len = len,
+		.flags = flags
+	};
+
+	return cfg80211_rx_mgmt_ext(wdev, &info);
 }
 
 /**
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 003c57504583..581df7f4c524 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -4,7 +4,7 @@
  *
  * Copyright (c) 2009, Jouni Malinen <j@w1.fi>
  * Copyright (c) 2015		Intel Deutschland GmbH
- * Copyright (C) 2019-2020 Intel Corporation
+ * Copyright (C) 2019-2020, 2022 Intel Corporation
  */
 
 #include <linux/kernel.h>
@@ -791,15 +791,15 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
 	return rdev_mgmt_tx(rdev, wdev, params, cookie);
 }
 
-bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm,
-			  const u8 *buf, size_t len, u32 flags)
+bool cfg80211_rx_mgmt_ext(struct wireless_dev *wdev,
+			  struct cfg80211_rx_info *info)
 {
 	struct wiphy *wiphy = wdev->wiphy;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
 	struct cfg80211_mgmt_registration *reg;
 	const struct ieee80211_txrx_stypes *stypes =
 		&wiphy->mgmt_stypes[wdev->iftype];
-	struct ieee80211_mgmt *mgmt = (void *)buf;
+	struct ieee80211_mgmt *mgmt = (void *)info->buf;
 	const u8 *data;
 	int data_len;
 	bool result = false;
@@ -807,7 +807,7 @@ bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm,
 		cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE);
 	u16 stype;
 
-	trace_cfg80211_rx_mgmt(wdev, freq, sig_dbm);
+	trace_cfg80211_rx_mgmt(wdev, info);
 	stype = (le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE) >> 4;
 
 	if (!(stypes->rx & BIT(stype))) {
@@ -815,8 +815,8 @@ bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm,
 		return false;
 	}
 
-	data = buf + ieee80211_hdrlen(mgmt->frame_control);
-	data_len = len - ieee80211_hdrlen(mgmt->frame_control);
+	data = info->buf + ieee80211_hdrlen(mgmt->frame_control);
+	data_len = info->len - ieee80211_hdrlen(mgmt->frame_control);
 
 	spin_lock_bh(&rdev->mgmt_registrations_lock);
 
@@ -833,9 +833,8 @@ bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm,
 		/* found match! */
 
 		/* Indicate the received Action frame to user space */
-		if (nl80211_send_mgmt(rdev, wdev, reg->nlportid,
-				      freq, sig_dbm,
-				      buf, len, flags, GFP_ATOMIC))
+		if (nl80211_send_mgmt(rdev, wdev, reg->nlportid, info,
+				      GFP_ATOMIC))
 			continue;
 
 		result = true;
@@ -847,7 +846,7 @@ bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm,
 	trace_cfg80211_return_bool(result);
 	return result;
 }
-EXPORT_SYMBOL(cfg80211_rx_mgmt_khz);
+EXPORT_SYMBOL(cfg80211_rx_mgmt_ext);
 
 void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev)
 {
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index a3934f443a84..80d3471041d1 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -18356,14 +18356,13 @@ EXPORT_SYMBOL(cfg80211_rx_unexpected_4addr_frame);
 
 int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
 		      struct wireless_dev *wdev, u32 nlportid,
-		      int freq, int sig_dbm,
-		      const u8 *buf, size_t len, u32 flags, gfp_t gfp)
+		      struct cfg80211_rx_info *info, gfp_t gfp)
 {
 	struct net_device *netdev = wdev->netdev;
 	struct sk_buff *msg;
 	void *hdr;
 
-	msg = nlmsg_new(100 + len, gfp);
+	msg = nlmsg_new(100 + info->len, gfp);
 	if (!msg)
 		return -ENOMEM;
 
@@ -18378,13 +18377,13 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
 					netdev->ifindex)) ||
 	    nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
 			      NL80211_ATTR_PAD) ||
-	    nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, KHZ_TO_MHZ(freq)) ||
-	    nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ_OFFSET, freq % 1000) ||
-	    (sig_dbm &&
-	     nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) ||
-	    nla_put(msg, NL80211_ATTR_FRAME, len, buf) ||
-	    (flags &&
-	     nla_put_u32(msg, NL80211_ATTR_RXMGMT_FLAGS, flags)))
+	    nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, KHZ_TO_MHZ(info->freq)) ||
+	    nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ_OFFSET, info->freq % 1000) ||
+	    (info->sig_dbm &&
+	     nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, info->sig_dbm)) ||
+	    nla_put(msg, NL80211_ATTR_FRAME, info->len, info->buf) ||
+	    (info->flags &&
+	     nla_put_u32(msg, NL80211_ATTR_RXMGMT_FLAGS, info->flags)))
 		goto nla_put_failure;
 
 	genlmsg_end(msg, hdr);
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index a7e8e0917c1c..855d540ddfb9 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Portions of this file
- * Copyright (C) 2018, 2020-2021 Intel Corporation
+ * Copyright (C) 2018, 2020-2022 Intel Corporation
  */
 #ifndef __NET_WIRELESS_NL80211_H
 #define __NET_WIRELESS_NL80211_H
@@ -105,8 +105,7 @@ void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev,
 
 int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
 		      struct wireless_dev *wdev, u32 nlpid,
-		      int freq, int sig_dbm,
-		      const u8 *buf, size_t len, u32 flags, gfp_t gfp);
+		      struct cfg80211_rx_info *info, gfp_t gfp);
 
 void
 nl80211_radar_notify(struct cfg80211_registered_device *rdev,
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 592b9e9e821a..10b2fd9bacb5 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -3096,8 +3096,8 @@ DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_del_sta,
 );
 
 TRACE_EVENT(cfg80211_rx_mgmt,
-	TP_PROTO(struct wireless_dev *wdev, int freq, int sig_dbm),
-	TP_ARGS(wdev, freq, sig_dbm),
+	TP_PROTO(struct wireless_dev *wdev, struct cfg80211_rx_info *info),
+	TP_ARGS(wdev, info),
 	TP_STRUCT__entry(
 		WDEV_ENTRY
 		__field(int, freq)
@@ -3105,8 +3105,8 @@ TRACE_EVENT(cfg80211_rx_mgmt,
 	),
 	TP_fast_assign(
 		WDEV_ASSIGN;
-		__entry->freq = freq;
-		__entry->sig_dbm = sig_dbm;
+		__entry->freq = info->freq;
+		__entry->sig_dbm = info->sig_dbm;
 	),
 	TP_printk(WDEV_PR_FMT ", freq: "KHZ_F", sig dbm: %d",
 		  WDEV_PR_ARG, PR_KHZ(__entry->freq), __entry->sig_dbm)
-- 
cgit 


From 1ff715ffa0ec1b5af9093c43185e686f575f15cc Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Thu, 27 Jan 2022 13:56:29 +0200
Subject: wifi: cfg80211: add hardware timestamps to frame RX info

Add hardware timestamps to management frame RX info.
This shall be used by drivers that support hardware timestamping for
Timing measurement and Fine timing measurement action frames RX.

Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h |  4 ++++
 net/wireless/nl80211.c | 10 +++++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 535e326a35b0..2f5c271bcde1 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -7800,6 +7800,8 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr,
  * @buf: Management frame (header + body)
  * @len: length of the frame data
  * @flags: flags, as defined in enum nl80211_rxmgmt_flags
+ * @rx_tstamp: Hardware timestamp of frame RX in nanoseconds
+ * @ack_tstamp: Hardware timestamp of ack TX in nanoseconds
  */
 struct cfg80211_rx_info {
 	int freq;
@@ -7807,6 +7809,8 @@ struct cfg80211_rx_info {
 	const u8 *buf;
 	size_t len;
 	u32 flags;
+	u64 rx_tstamp;
+	u64 ack_tstamp;
 };
 
 /**
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 80d3471041d1..e5ed0950ef0b 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -18383,7 +18383,15 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
 	     nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, info->sig_dbm)) ||
 	    nla_put(msg, NL80211_ATTR_FRAME, info->len, info->buf) ||
 	    (info->flags &&
-	     nla_put_u32(msg, NL80211_ATTR_RXMGMT_FLAGS, info->flags)))
+	     nla_put_u32(msg, NL80211_ATTR_RXMGMT_FLAGS, info->flags)) ||
+	    (info->rx_tstamp && nla_put_u64_64bit(msg,
+						  NL80211_ATTR_RX_HW_TIMESTAMP,
+						  info->rx_tstamp,
+						  NL80211_ATTR_PAD)) ||
+	    (info->ack_tstamp && nla_put_u64_64bit(msg,
+						   NL80211_ATTR_TX_HW_TIMESTAMP,
+						   info->ack_tstamp,
+						   NL80211_ATTR_PAD)))
 		goto nla_put_failure;
 
 	genlmsg_end(msg, hdr);
-- 
cgit 


From f9202638df34474f862109731203e71d8e71f85e Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Wed, 26 Jan 2022 11:15:31 +0200
Subject: wifi: mac80211: add hardware timestamps for RX and TX

When the low level driver reports hardware timestamps for frame
TX status or frame RX, pass the timestamps to cfg80211.

Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 28 +++++++++++++++++++++++++++-
 net/mac80211/rx.c      | 30 ++++++++++++++++++++++--------
 net/mac80211/status.c  | 38 ++++++++++++++++++++++++++++----------
 3 files changed, 77 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 6f856da28d71..e9f8be7ba9a6 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -125,6 +125,22 @@
  * via the usual ieee80211_tx_dequeue).
  */
 
+/**
+ * DOC: HW timestamping
+ *
+ * Timing Measurement and Fine Timing Measurement require accurate timestamps
+ * of the action frames TX/RX and their respective acks.
+ *
+ * To report hardware timestamps for Timing Measurement or Fine Timing
+ * Measurement frame RX, the low level driver should set the SKB's hwtstamp
+ * field to the frame RX timestamp and report the ack TX timestamp in the
+ * ieee80211_rx_status struct.
+ *
+ * Similarly, To report hardware timestamps for Timing Measurement or Fine
+ * Timing Measurement frame TX, the driver should set the SKB's hwtstamp field
+ * to the frame TX timestamp and report the ack RX timestamp in the
+ * ieee80211_tx_status struct.
+ */
 struct device;
 
 /**
@@ -1176,12 +1192,16 @@ struct ieee80211_rate_status {
  * @rates: Mrr stages that were used when sending the packet
  * @n_rates: Number of mrr stages (count of instances for @rates)
  * @free_list: list where processed skbs are stored to be free'd by the driver
+ * @ack_hwtstamp: Hardware timestamp of the received ack in nanoseconds
+ *	Only needed for Timing measurement and Fine timing measurement action
+ *	frames. Only reported by devices that have timestamping enabled.
  */
 struct ieee80211_tx_status {
 	struct ieee80211_sta *sta;
 	struct ieee80211_tx_info *info;
 	struct sk_buff *skb;
 	struct ieee80211_rate_status *rates;
+	ktime_t ack_hwtstamp;
 	u8 n_rates;
 
 	struct list_head *free_list;
@@ -1419,6 +1439,9 @@ enum mac80211_rx_encoding {
  * 	(TSF) timer when the first data symbol (MPDU) arrived at the hardware.
  * @boottime_ns: CLOCK_BOOTTIME timestamp the frame was received at, this is
  *	needed only for beacons and probe responses that update the scan cache.
+ * @ack_tx_hwtstamp: Hardware timestamp for the ack TX in nanoseconds. Only
+ *	needed for Timing measurement and Fine timing measurement action frames.
+ *	Only reported by devices that have timestamping enabled.
  * @device_timestamp: arbitrary timestamp for the device, mac80211 doesn't use
  *	it but can store it and pass it back to the driver for synchronisation
  * @band: the active band when this frame was received
@@ -1452,7 +1475,10 @@ enum mac80211_rx_encoding {
  */
 struct ieee80211_rx_status {
 	u64 mactime;
-	u64 boottime_ns;
+	union {
+		u64 boottime_ns;
+		ktime_t ack_tx_hwtstamp;
+	};
 	u32 device_timestamp;
 	u32 ampdu_reference;
 	u32 flag;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 9054a1e0b0d8..ef9c2fcd68f5 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3644,7 +3644,11 @@ static ieee80211_rx_result debug_noinline
 ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx)
 {
 	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
-	int sig = 0;
+	struct cfg80211_rx_info info = {
+		.freq = ieee80211_rx_status_to_khz(status),
+		.buf = rx->skb->data,
+		.len = rx->skb->len
+	};
 
 	/* skip known-bad action frames and return them in the next handler */
 	if (status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM)
@@ -3659,11 +3663,15 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx)
 
 	if (ieee80211_hw_check(&rx->local->hw, SIGNAL_DBM) &&
 	    !(status->flag & RX_FLAG_NO_SIGNAL_VAL))
-		sig = status->signal;
+		info.sig_dbm = status->signal;
+
+	if (ieee80211_is_timing_measurement(rx->skb) ||
+	    ieee80211_is_ftm(rx->skb)) {
+		info.rx_tstamp = ktime_to_ns(skb_hwtstamps(rx->skb)->hwtstamp);
+		info.ack_tstamp = ktime_to_ns(status->ack_tx_hwtstamp);
+	}
 
-	if (cfg80211_rx_mgmt_khz(&rx->sdata->wdev,
-				 ieee80211_rx_status_to_khz(status), sig,
-				 rx->skb->data, rx->skb->len, 0)) {
+	if (cfg80211_rx_mgmt_ext(&rx->sdata->wdev, &info)) {
 		if (rx->sta)
 			rx->sta->deflink.rx_stats.packets++;
 		dev_kfree_skb(rx->skb);
@@ -4758,8 +4766,10 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx,
 	}
 
 	if (!consume) {
-		skb = skb_copy(skb, GFP_ATOMIC);
-		if (!skb) {
+		struct skb_shared_hwtstamps *shwt;
+
+		rx->skb = skb_copy(skb, GFP_ATOMIC);
+		if (!rx->skb) {
 			if (net_ratelimit())
 				wiphy_debug(local->hw.wiphy,
 					"failed to copy skb for %s\n",
@@ -4767,7 +4777,11 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx,
 			return true;
 		}
 
-		rx->skb = skb;
+		/* skb_copy() does not copy the hw timestamps, so copy it
+		 * explicitly
+		 */
+		shwt = skb_hwtstamps(rx->skb);
+		shwt->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
 	}
 
 	if (unlikely(link_sta)) {
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index fad457f99711..8e77fd2e9fdf 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -624,9 +624,11 @@ ieee80211_sdata_from_skb(struct ieee80211_local *local, struct sk_buff *skb)
 }
 
 static void ieee80211_report_ack_skb(struct ieee80211_local *local,
-				     struct ieee80211_tx_info *info,
-				     bool acked, bool dropped)
+				     struct sk_buff *orig_skb,
+				     bool acked, bool dropped,
+				     ktime_t ack_hwtstamp)
 {
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(orig_skb);
 	struct sk_buff *skb;
 	unsigned long flags;
 
@@ -643,6 +645,19 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local,
 		struct ieee80211_hdr *hdr = (void *)skb->data;
 		bool is_valid_ack_signal =
 			!!(info->status.flags & IEEE80211_TX_STATUS_ACK_SIGNAL_VALID);
+		struct cfg80211_tx_status status = {
+			.cookie = cookie,
+			.buf = skb->data,
+			.len = skb->len,
+			.ack = acked,
+		};
+
+		if (ieee80211_is_timing_measurement(orig_skb) ||
+		    ieee80211_is_ftm(orig_skb)) {
+			status.tx_tstamp =
+				ktime_to_ns(skb_hwtstamps(orig_skb)->hwtstamp);
+			status.ack_tstamp = ktime_to_ns(ack_hwtstamp);
+		}
 
 		rcu_read_lock();
 		sdata = ieee80211_sdata_from_skb(local, skb);
@@ -662,9 +677,9 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local,
 						      is_valid_ack_signal,
 						      GFP_ATOMIC);
 			else if (ieee80211_is_mgmt(hdr->frame_control))
-				cfg80211_mgmt_tx_status(&sdata->wdev, cookie,
-							skb->data, skb->len,
-							acked, GFP_ATOMIC);
+				cfg80211_mgmt_tx_status_ext(&sdata->wdev,
+							    &status,
+							    GFP_ATOMIC);
 			else
 				pr_warn("Unknown status report in ack skb\n");
 
@@ -681,7 +696,8 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local,
 }
 
 static void ieee80211_report_used_skb(struct ieee80211_local *local,
-				      struct sk_buff *skb, bool dropped)
+				      struct sk_buff *skb, bool dropped,
+				      ktime_t ack_hwtstamp)
 {
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	u16 tx_time_est = ieee80211_info_get_tx_time_est(info);
@@ -744,7 +760,8 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local,
 
 		rcu_read_unlock();
 	} else if (info->ack_frame_id) {
-		ieee80211_report_ack_skb(local, info, acked, dropped);
+		ieee80211_report_ack_skb(local, skb, acked, dropped,
+					 ack_hwtstamp);
 	}
 
 	if (!dropped && skb->destructor) {
@@ -1038,7 +1055,7 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
 			  jiffies + msecs_to_jiffies(10));
 	}
 
-	ieee80211_report_used_skb(local, skb, false);
+	ieee80211_report_used_skb(local, skb, false, status->ack_hwtstamp);
 
 	/* this was a transmitted frame, but now we want to reuse it */
 	skb_orphan(skb);
@@ -1201,7 +1218,7 @@ free:
 	if (!skb)
 		return;
 
-	ieee80211_report_used_skb(local, skb, false);
+	ieee80211_report_used_skb(local, skb, false, status->ack_hwtstamp);
 	if (status->free_list)
 		list_add_tail(&skb->list, status->free_list);
 	else
@@ -1262,8 +1279,9 @@ EXPORT_SYMBOL(ieee80211_report_low_ack);
 void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
+	ktime_t kt = ktime_set(0, 0);
 
-	ieee80211_report_used_skb(local, skb, true);
+	ieee80211_report_used_skb(local, skb, true, kt);
 	dev_kfree_skb_any(skb);
 }
 EXPORT_SYMBOL(ieee80211_free_txskb);
-- 
cgit 


From 6074c9e5747158ab1e6aebedb87f64be77401fd8 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 18 Jul 2022 10:40:44 +0200
Subject: wifi: cfg80211: report link ID in NL80211_CMD_FRAME

If given by the underlying driver, report the link ID for
MLO in NL80211_CMD_FRAME.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 5 +++++
 net/wireless/nl80211.c | 2 ++
 2 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 2f5c271bcde1..8545ed098d90 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -7797,6 +7797,9 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr,
  *
  * @freq: Frequency on which the frame was received in kHz
  * @sig_dbm: signal strength in dBm, or 0 if unknown
+ * @have_link_id: indicates the frame was received on a link of
+ *	an MLD, i.e. the @link_id field is valid
+ * @link_id: the ID of the link the frame was received	on
  * @buf: Management frame (header + body)
  * @len: length of the frame data
  * @flags: flags, as defined in enum nl80211_rxmgmt_flags
@@ -7806,6 +7809,8 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr,
 struct cfg80211_rx_info {
 	int freq;
 	int sig_dbm;
+	bool have_link_id;
+	u8 link_id;
 	const u8 *buf;
 	size_t len;
 	u32 flags;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index e5ed0950ef0b..60b8406b8d7e 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -18377,6 +18377,8 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
 					netdev->ifindex)) ||
 	    nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
 			      NL80211_ATTR_PAD) ||
+	    (info->have_link_id &&
+	     nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, info->link_id)) ||
 	    nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, KHZ_TO_MHZ(info->freq)) ||
 	    nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ_OFFSET, info->freq % 1000) ||
 	    (info->sig_dbm &&
-- 
cgit 


From 95f498bb49f7030c1f40236107e5241e50f79ade Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 18 Jul 2022 12:13:46 +0200
Subject: wifi: nl80211: add MLO link ID to the NL80211_CMD_FRAME TX API

Allow optionally specifying the link ID to transmit on,
which can be done instead of the link frequency, on an
MLD addressed frame. Both can also be omitted in which
case the frame must be MLD addressed and link selection
(and address translation) will be done on lower layers.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  4 ++++
 include/uapi/linux/nl80211.h |  4 ++++
 net/wireless/nl80211.c       | 12 ++++++++++++
 3 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 8545ed098d90..908d58393484 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -3411,6 +3411,9 @@ struct cfg80211_update_ft_ies_params {
  * @dont_wait_for_ack: tells the low level not to wait for an ack
  * @n_csa_offsets: length of csa_offsets array
  * @csa_offsets: array of all the csa offsets in the frame
+ * @link_id: for MLO, the link ID to transmit on, -1 if not given; note
+ *	that the link ID isn't validated (much), it's in range but the
+ *	link might not exist (or be used by the receiver STA)
  */
 struct cfg80211_mgmt_tx_params {
 	struct ieee80211_channel *chan;
@@ -3422,6 +3425,7 @@ struct cfg80211_mgmt_tx_params {
 	bool dont_wait_for_ack;
 	int n_csa_offsets;
 	const u16 *csa_offsets;
+	int link_id;
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 5275dcbc5ee8..ffb7c573e299 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -764,6 +764,10 @@
  *	%NL80211_ATTR_CSA_C_OFFSETS_TX is an array of offsets to CSA
  *	counters which will be updated to the current value. This attribute
  *	is used during CSA period.
+ *	For TX on an MLD, the frequency can be omitted and the link ID be
+ *	specified, or if transmitting to a known peer MLD (with MLD addresses
+ *	in the frame) both can be omitted and the link will be selected by
+ *	lower layers.
  *	For RX notification, %NL80211_ATTR_RX_HW_TIMESTAMP may be included to
  *	indicate the frame RX timestamp and %NL80211_ATTR_TX_HW_TIMESTAMP may
  *	be included to indicate the ack TX timestamp.
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 60b8406b8d7e..2705e3ee8fc4 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -12256,6 +12256,18 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info)
 		wdev_unlock(wdev);
 		return -EBUSY;
 	}
+
+	params.link_id = nl80211_link_id_or_invalid(info->attrs);
+	/*
+	 * This now races due to the unlock, but we cannot check
+	 * the valid links for the _station_ anyway, so that's up
+	 * to the driver.
+	 */
+	if (params.link_id >= 0 &&
+	    !(wdev->valid_links & BIT(params.link_id))) {
+		wdev_unlock(wdev);
+		return -EINVAL;
+	}
 	wdev_unlock(wdev);
 
 	params.buf = nla_data(info->attrs[NL80211_ATTR_FRAME]);
-- 
cgit 


From e1e68b14c5f85f2ad43d06a1b2f0d0fcc8dbdd62 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 18 Jul 2022 21:36:08 +0200
Subject: wifi: mac80211: expand ieee80211_mgmt_tx() for MLO

There are a couple of new things that should be possible
with MLO:
 * selecting the link to transmit to a station by link ID,
   which a previous patch added to the nl80211 API
 * selecting the link by frequency, similarly
 * allowing transmittion to an MLD without specifying any
   channel or link ID, with MLD addresses

Enable these use cases. Also fix the address comparison
in client mode to use the AP (MLD) address.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  4 +++-
 net/mac80211/agg-tx.c      |  4 ++--
 net/mac80211/ieee80211_i.h |  8 ++++----
 net/mac80211/offchannel.c  | 47 ++++++++++++++++++++++++++++++++--------------
 net/mac80211/rx.c          |  2 +-
 net/mac80211/tx.c          | 11 ++++++++---
 6 files changed, 51 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index e9f8be7ba9a6..1afd45239fe6 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -885,7 +885,9 @@ enum mac80211_tx_info_flags {
  * @IEEE80211_TX_CTRL_MLO_LINK: If not @IEEE80211_LINK_UNSPECIFIED, this
  *	frame should be transmitted on the specific link. This really is
  *	only relevant for frames that do not have data present, and is
- *	also not used for 802.3 format frames.
+ *	also not used for 802.3 format frames. Note that even if the frame
+ *	is on a specific link, address translation might still apply if
+ *	it's intended for an MLD.
  *
  * These flags are used in tx_info->control.flags.
  */
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index b13f4b7b740d..07c892aa8c73 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -106,7 +106,7 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata,
 	mgmt->u.action.u.addba_req.start_seq_num =
 					cpu_to_le16(start_seq_num << 4);
 
-	ieee80211_tx_skb_tid(sdata, skb, tid);
+	ieee80211_tx_skb_tid(sdata, skb, tid, -1);
 }
 
 void ieee80211_send_bar(struct ieee80211_vif *vif, u8 *ra, u16 tid, u16 ssn)
@@ -135,7 +135,7 @@ void ieee80211_send_bar(struct ieee80211_vif *vif, u8 *ra, u16 tid, u16 ssn)
 
 	IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT |
 					IEEE80211_TX_CTL_REQ_TX_STATUS;
-	ieee80211_tx_skb_tid(sdata, skb, tid);
+	ieee80211_tx_skb_tid(sdata, skb, tid, -1);
 }
 EXPORT_SYMBOL(ieee80211_send_bar);
 
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 5fc4392ba507..4ec6fb96ba41 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -2141,7 +2141,7 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
 		    struct sta_info *sta, struct sk_buff *skb);
 
 void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
-				 struct sk_buff *skb, int tid,
+				 struct sk_buff *skb, int tid, int link_id,
 				 enum nl80211_band band);
 
 /* sta_out needs to be checked for ERR_PTR() before using */
@@ -2155,18 +2155,18 @@ ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
 			  enum nl80211_band band)
 {
 	rcu_read_lock();
-	__ieee80211_tx_skb_tid_band(sdata, skb, tid, band);
+	__ieee80211_tx_skb_tid_band(sdata, skb, tid, -1, band);
 	rcu_read_unlock();
 }
 
 void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata,
-			  struct sk_buff *skb, int tid);
+			  struct sk_buff *skb, int tid, int link_id);
 
 static inline void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata,
 				    struct sk_buff *skb)
 {
 	/* Send all internal mgmt frames on VO. Accordingly set TID to 7. */
-	ieee80211_tx_skb_tid(sdata, skb, 7);
+	ieee80211_tx_skb_tid(sdata, skb, 7, -1);
 }
 
 /**
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index be79ae68754e..d78c82d6b696 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -769,9 +769,11 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
 	struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
 	struct ieee80211_local *local = sdata->local;
 	struct sk_buff *skb;
-	struct sta_info *sta;
+	struct sta_info *sta = NULL;
 	const struct ieee80211_mgmt *mgmt = (void *)params->buf;
 	bool need_offchan = false;
+	bool mlo_sta = false;
+	int link_id = -1;
 	u32 flags;
 	int ret;
 	u8 *data;
@@ -804,16 +806,30 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
 		    !ieee80211_vif_is_mesh(&sdata->vif) &&
 		    !sdata->bss->active)
 			need_offchan = true;
+
+		rcu_read_lock();
+		sta = sta_info_get_bss(sdata, mgmt->da);
+		mlo_sta = sta && sta->sta.mlo;
+
 		if (!ieee80211_is_action(mgmt->frame_control) ||
 		    mgmt->u.action.category == WLAN_CATEGORY_PUBLIC ||
 		    mgmt->u.action.category == WLAN_CATEGORY_SELF_PROTECTED ||
-		    mgmt->u.action.category == WLAN_CATEGORY_SPECTRUM_MGMT)
+		    mgmt->u.action.category == WLAN_CATEGORY_SPECTRUM_MGMT) {
+			rcu_read_unlock();
 			break;
-		rcu_read_lock();
-		sta = sta_info_get_bss(sdata, mgmt->da);
-		rcu_read_unlock();
-		if (!sta)
+		}
+
+		if (!sta) {
+			rcu_read_unlock();
 			return -ENOLINK;
+		}
+		if (params->link_id >= 0 &&
+		    !(sta->sta.valid_links & BIT(params->link_id))) {
+			rcu_read_unlock();
+			return -ENOLINK;
+		}
+		link_id = params->link_id;
+		rcu_read_unlock();
 		break;
 	case NL80211_IFTYPE_STATION:
 	case NL80211_IFTYPE_P2P_CLIENT:
@@ -821,8 +837,7 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
 		if (!sdata->u.mgd.associated ||
 		    (params->offchan && params->wait &&
 		     local->ops->remain_on_channel &&
-		     memcmp(sdata->deflink.u.mgd.bssid,
-			    mgmt->bssid, ETH_ALEN)))
+		     memcmp(sdata->vif.cfg.ap_addr, mgmt->bssid, ETH_ALEN)))
 			need_offchan = true;
 		sdata_unlock(sdata);
 		break;
@@ -843,7 +858,9 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
 	mutex_lock(&local->mtx);
 
 	/* Check if the operating channel is the requested channel */
-	if (!need_offchan) {
+	if (!params->chan && mlo_sta) {
+		need_offchan = false;
+	} else if (!need_offchan) {
 		struct ieee80211_chanctx_conf *chanctx_conf = NULL;
 		int i;
 
@@ -860,6 +877,12 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
 			if (!chanctx_conf)
 				continue;
 
+			if (mlo_sta && params->chan == chanctx_conf->def.chan &&
+			    ether_addr_equal(sdata->vif.addr, mgmt->sa)) {
+				link_id = i;
+				break;
+			}
+
 			if (ether_addr_equal(conf->addr, mgmt->sa))
 				break;
 
@@ -870,10 +893,6 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
 			need_offchan = params->chan &&
 				       (params->chan !=
 					chanctx_conf->def.chan);
-		} else if (!params->chan) {
-			ret = -EINVAL;
-			rcu_read_unlock();
-			goto out_unlock;
 		} else {
 			need_offchan = true;
 		}
@@ -943,7 +962,7 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
 	}
 
 	if (!need_offchan) {
-		ieee80211_tx_skb(sdata, skb);
+		ieee80211_tx_skb_tid(sdata, skb, 7, link_id);
 		ret = 0;
 		goto out_unlock;
 	}
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 6cb5989c6ae2..58ff2cc7bcc9 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3774,7 +3774,7 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx)
 					local->hw.offchannel_tx_hw_queue;
 		}
 
-		__ieee80211_tx_skb_tid_band(rx->sdata, nskb, 7,
+		__ieee80211_tx_skb_tid_band(rx->sdata, nskb, 7, -1,
 					    status->band);
 	}
 	dev_kfree_skb(rx->skb);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 06ec152e8188..f24565064f08 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -5647,7 +5647,7 @@ void ieee80211_unreserve_tid(struct ieee80211_sta *pubsta, u8 tid)
 EXPORT_SYMBOL(ieee80211_unreserve_tid);
 
 void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
-				 struct sk_buff *skb, int tid,
+				 struct sk_buff *skb, int tid, int link_id,
 				 enum nl80211_band band)
 {
 	const struct ieee80211_hdr *hdr = (void *)skb->data;
@@ -5666,6 +5666,8 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
 
 	if (!sdata->vif.valid_links) {
 		link = 0;
+	} else if (link_id >= 0) {
+		link = link_id;
 	} else if (memcmp(sdata->vif.addr, hdr->addr2, ETH_ALEN) == 0) {
 		/* address from the MLD */
 		link = IEEE80211_LINK_UNSPECIFIED;
@@ -5702,13 +5704,14 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
 }
 
 void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata,
-			  struct sk_buff *skb, int tid)
+			  struct sk_buff *skb, int tid, int link_id)
 {
 	struct ieee80211_chanctx_conf *chanctx_conf;
 	enum nl80211_band band;
 
 	rcu_read_lock();
 	if (!sdata->vif.valid_links) {
+		WARN_ON(link_id >= 0);
 		chanctx_conf =
 			rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
 		if (WARN_ON(!chanctx_conf)) {
@@ -5718,11 +5721,13 @@ void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata,
 		}
 		band = chanctx_conf->def.chan->band;
 	} else {
+		WARN_ON(link_id >= 0 &&
+			!(sdata->vif.valid_links & BIT(link_id)));
 		/* MLD transmissions must not rely on the band */
 		band = 0;
 	}
 
-	__ieee80211_tx_skb_tid_band(sdata, skb, tid, band);
+	__ieee80211_tx_skb_tid_band(sdata, skb, tid, link_id, band);
 	rcu_read_unlock();
 }
 
-- 
cgit 


From 963d0e8d08d97f9133b9fd00354ebc1a2467484b Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 21 Jul 2022 10:56:48 +0200
Subject: wifi: mac80211: optionally implement MLO multicast TX

For drivers using software encryption for multicast TX, such
as mac80211_hwsim, mac80211 needs to duplicate the multicast
frames on each link, if MLO is enabled. Do this, but don't
just make it dependent on the key but provide a separate flag
for drivers to opt out of this.

This is not very efficient, I expect that drivers will do it
in firmware/hardware or at least with DMA engine assistence,
so this is mostly for hwsim.

To make this work, also implement the SNS11 sequence number
space that an AP MLD shall have, and modify the API to the
__ieee80211_subif_start_xmit() function to always require the
link ID bits to be set.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     | 10 +++++
 net/mac80211/debugfs.c     |  3 +-
 net/mac80211/ieee80211_i.h |  1 +
 net/mac80211/tdls.c        |  3 +-
 net/mac80211/tx.c          | 91 +++++++++++++++++++++++++++++++++++++++++-----
 5 files changed, 97 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 1afd45239fe6..1e04961148bf 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -882,6 +882,8 @@ enum mac80211_tx_info_flags {
  * @IEEE80211_TX_CTRL_DONT_REORDER: This frame should not be reordered
  *	relative to other frames that have this flag set, independent
  *	of their QoS TID or other priority field values.
+ * @IEEE80211_TX_CTRL_MCAST_MLO_FIRST_TX: first MLO TX, used mostly internally
+ *	for sequence number assignment
  * @IEEE80211_TX_CTRL_MLO_LINK: If not @IEEE80211_LINK_UNSPECIFIED, this
  *	frame should be transmitted on the specific link. This really is
  *	only relevant for frames that do not have data present, and is
@@ -901,10 +903,14 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_INTCFL_NEED_TXPROCESSING	= BIT(6),
 	IEEE80211_TX_CTRL_NO_SEQNO		= BIT(7),
 	IEEE80211_TX_CTRL_DONT_REORDER		= BIT(8),
+	IEEE80211_TX_CTRL_MCAST_MLO_FIRST_TX	= BIT(9),
 	IEEE80211_TX_CTRL_MLO_LINK		= 0xf0000000,
 };
 
 #define IEEE80211_LINK_UNSPECIFIED	0xf
+#define IEEE80211_TX_CTRL_MLO_LINK_UNSPEC	\
+	u32_encode_bits(IEEE80211_LINK_UNSPECIFIED, \
+			IEEE80211_TX_CTRL_MLO_LINK)
 
 /**
  * enum mac80211_tx_status_flags - flags to describe transmit status
@@ -2524,6 +2530,9 @@ struct ieee80211_txq {
  * @IEEE80211_HW_DETECTS_COLOR_COLLISION: HW/driver has support for BSS color
  *	collision detection and doesn't need it in software.
  *
+ * @IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX: Hardware/driver handles transmitting
+ *	multicast frames on all links, mac80211 should not do that.
+ *
  * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
  */
 enum ieee80211_hw_flags {
@@ -2580,6 +2589,7 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_SUPPORTS_RX_DECAP_OFFLOAD,
 	IEEE80211_HW_SUPPORTS_CONC_MON_RX_DECAP,
 	IEEE80211_HW_DETECTS_COLOR_COLLISION,
+	IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX,
 
 	/* keep last, obviously */
 	NUM_IEEE80211_HW_FLAGS
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 4d4341249759..78c7d60e8667 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -4,7 +4,7 @@
  *
  * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
- * Copyright (C) 2018 - 2019, 2021 Intel Corporation
+ * Copyright (C) 2018 - 2019, 2021-2022 Intel Corporation
  */
 
 #include <linux/debugfs.h>
@@ -495,6 +495,7 @@ static const char *hw_flag_names[] = {
 	FLAG(SUPPORTS_RX_DECAP_OFFLOAD),
 	FLAG(SUPPORTS_CONC_MON_RX_DECAP),
 	FLAG(DETECTS_COLOR_COLLISION),
+	FLAG(MLO_MCAST_MULTI_LINK_TX),
 #undef FLAG
 };
 
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 4ec6fb96ba41..f93b57799e94 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1029,6 +1029,7 @@ struct ieee80211_sub_if_data {
 	struct ieee80211_key __rcu *default_unicast_key;
 
 	u16 sequence_number;
+	u16 mld_mcast_seq;
 	__be16 control_port_protocol;
 	bool control_port_no_encrypt;
 	bool control_port_no_preauth;
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index 36bfc54b3d2d..f4b4d25eef95 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -1054,7 +1054,8 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev,
 
 	/* disable bottom halves when entering the Tx path */
 	local_bh_disable();
-	__ieee80211_subif_start_xmit(skb, dev, flags, 0, NULL);
+	__ieee80211_subif_start_xmit(skb, dev, flags,
+				     IEEE80211_TX_CTRL_MLO_LINK_UNSPEC, NULL);
 	local_bh_enable();
 
 	return ret;
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index f24565064f08..45df9932d0ba 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -822,6 +822,16 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	if (info->control.flags & IEEE80211_TX_CTRL_NO_SEQNO)
 		return TX_CONTINUE;
 
+	/* SNS11 from 802.11be 10.3.2.14 */
+	if (unlikely(is_multicast_ether_addr(hdr->addr1) &&
+		     info->control.vif->valid_links &&
+		     info->control.vif->type == NL80211_IFTYPE_AP)) {
+		if (info->control.flags & IEEE80211_TX_CTRL_MCAST_MLO_FIRST_TX)
+			tx->sdata->mld_mcast_seq += 0x10;
+		hdr->seq_ctrl = cpu_to_le16(tx->sdata->mld_mcast_seq);
+		return TX_CONTINUE;
+	}
+
 	/*
 	 * Anything but QoS data that has a sequence number field
 	 * (is long enough) gets a sequence number from the global
@@ -2570,7 +2580,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_chanctx_conf *chanctx_conf = NULL;
 	enum nl80211_band band;
 	int ret;
-	u8 link_id = IEEE80211_LINK_UNSPECIFIED;
+	u8 link_id = u32_get_bits(ctrl_flags, IEEE80211_TX_CTRL_MLO_LINK);
 
 	if (IS_ERR(sta))
 		sta = NULL;
@@ -2630,8 +2640,18 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
 				goto free;
 			}
 			memcpy(hdr.addr2, link->conf->addr, ETH_ALEN);
-		} else {
+		} else if (link_id == IEEE80211_LINK_UNSPECIFIED) {
 			memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN);
+		} else {
+			struct ieee80211_bss_conf *conf;
+
+			conf = rcu_dereference(sdata->vif.link_conf[link_id]);
+			if (unlikely(!conf)) {
+				ret = -ENOLINK;
+				goto free;
+			}
+
+			memcpy(hdr.addr2, conf->addr, ETH_ALEN);
 		}
 
 		memcpy(hdr.addr3, skb->data + ETH_ALEN, ETH_ALEN);
@@ -4222,9 +4242,6 @@ static bool ieee80211_multicast_to_unicast(struct sk_buff *skb,
 	const struct vlan_ethhdr *ethvlan = (void *)skb->data;
 	__be16 ethertype;
 
-	if (likely(!is_multicast_ether_addr(eth->h_dest)))
-		return false;
-
 	switch (sdata->vif.type) {
 	case NL80211_IFTYPE_AP_VLAN:
 		if (sdata->u.vlan.sta)
@@ -4308,6 +4325,44 @@ out:
 	rcu_read_unlock();
 }
 
+static void ieee80211_mlo_multicast_tx_one(struct ieee80211_sub_if_data *sdata,
+					   struct sk_buff *skb, u32 ctrl_flags,
+					   unsigned int link_id)
+{
+	struct sk_buff *out;
+
+	out = skb_copy(skb, GFP_ATOMIC);
+	if (!out)
+		return;
+
+	ctrl_flags |= u32_encode_bits(link_id, IEEE80211_TX_CTRL_MLO_LINK);
+	__ieee80211_subif_start_xmit(out, sdata->dev, 0, ctrl_flags, NULL);
+}
+
+static void ieee80211_mlo_multicast_tx(struct net_device *dev,
+				       struct sk_buff *skb)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	unsigned long links = sdata->vif.valid_links;
+	unsigned int link;
+	u32 ctrl_flags = IEEE80211_TX_CTRL_MCAST_MLO_FIRST_TX;
+
+	if (hweight16(links) == 1) {
+		ctrl_flags |= u32_encode_bits(ffs(links) - 1,
+					      IEEE80211_TX_CTRL_MLO_LINK);
+
+		__ieee80211_subif_start_xmit(skb, sdata->dev, 0, ctrl_flags,
+					     NULL);
+		return;
+	}
+
+	for_each_set_bit(link, &links, IEEE80211_MLD_MAX_NUM_LINKS) {
+		ieee80211_mlo_multicast_tx_one(sdata, skb, ctrl_flags, link);
+		ctrl_flags = 0;
+	}
+	kfree_skb(skb);
+}
+
 /**
  * ieee80211_subif_start_xmit - netif start_xmit function for 802.3 vifs
  * @skb: packet to be sent
@@ -4318,15 +4373,30 @@ out:
 netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
 				       struct net_device *dev)
 {
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	const struct ethhdr *eth = (void *)skb->data;
+
+	if (likely(!is_multicast_ether_addr(eth->h_dest)))
+		goto normal;
+
 	if (unlikely(ieee80211_multicast_to_unicast(skb, dev))) {
 		struct sk_buff_head queue;
 
 		__skb_queue_head_init(&queue);
 		ieee80211_convert_to_unicast(skb, dev, &queue);
 		while ((skb = __skb_dequeue(&queue)))
-			__ieee80211_subif_start_xmit(skb, dev, 0, 0, NULL);
+			__ieee80211_subif_start_xmit(skb, dev, 0,
+						     IEEE80211_TX_CTRL_MLO_LINK_UNSPEC,
+						     NULL);
+	} else if (sdata->vif.valid_links &&
+		   sdata->vif.type == NL80211_IFTYPE_AP &&
+		   !ieee80211_hw_check(&sdata->local->hw, MLO_MCAST_MULTI_LINK_TX)) {
+		ieee80211_mlo_multicast_tx(dev, skb);
 	} else {
-		__ieee80211_subif_start_xmit(skb, dev, 0, 0, NULL);
+normal:
+		__ieee80211_subif_start_xmit(skb, dev, 0,
+					     IEEE80211_TX_CTRL_MLO_LINK_UNSPEC,
+					     NULL);
 	}
 
 	return NETDEV_TX_OK;
@@ -4410,7 +4480,9 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata,
 	if (tid_tx) {
 		if (!test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) {
 			/* fall back to non-offload slow path */
-			__ieee80211_subif_start_xmit(skb, dev, 0, 0, NULL);
+			__ieee80211_subif_start_xmit(skb, dev, 0,
+						     IEEE80211_TX_CTRL_MLO_LINK_UNSPEC,
+						     NULL);
 			return;
 		}
 
@@ -4512,7 +4584,8 @@ ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata,
 		goto out;
 	}
 
-	skb = ieee80211_build_hdr(sdata, skb, info_flags, sta, 0, NULL);
+	skb = ieee80211_build_hdr(sdata, skb, info_flags, sta,
+				  IEEE80211_TX_CTRL_MLO_LINK_UNSPEC, NULL);
 	if (IS_ERR(skb))
 		goto out;
 
-- 
cgit 


From 9f781533bb028edb3e8e10e79fb87e85a4dc6987 Mon Sep 17 00:00:00 2001
From: Gregory Greenman <gregory.greenman@intel.com>
Date: Tue, 12 Jul 2022 12:22:50 +0300
Subject: wifi: mac80211: add macros to loop over active links

Add a preliminary version which will be updated later
to loop over vif's and sta's active links.

Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 1e04961148bf..f198af600b5e 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1856,6 +1856,13 @@ struct ieee80211_vif {
 	u8 drv_priv[] __aligned(sizeof(void *));
 };
 
+/* FIXME: for now loop over all the available links; later will be changed
+ * to loop only over the active links.
+ */
+#define for_each_vif_active_link(vif, link, link_id)			     \
+	for (link_id = 0; link_id < ARRAY_SIZE((vif)->link_conf); link_id++) \
+		if ((link = rcu_dereference((vif)->link_conf[link_id])))
+
 static inline bool ieee80211_vif_is_mesh(struct ieee80211_vif *vif)
 {
 #ifdef CONFIG_MAC80211_MESH
@@ -2248,6 +2255,14 @@ struct ieee80211_sta {
 	u8 drv_priv[] __aligned(sizeof(void *));
 };
 
+/* FIXME: need to loop only over links which are active and check the actual
+ * lock
+ */
+#define for_each_sta_active_link(sta, link_sta, link_id)		         \
+	for (link_id = 0; link_id < ARRAY_SIZE((sta)->link); link_id++)	         \
+		if (((link_sta) = rcu_dereference_protected((sta)->link[link_id],\
+							    1)))	         \
+
 /**
  * enum sta_notify_cmd - sta notify command
  *
-- 
cgit 


From dea997733575c5793ca77e166bbbf89097987eb4 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Fri, 22 Jul 2022 10:48:50 +0100
Subject: firmware: cs_dsp: Add pre_stop callback

The code already has a post_stop callback, add a matching pre_stop
callback to the client_ops that is called before execution is stopped.
This callback provides a convenient place for the client code to
communicate with the DSP before it is stopped.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20220722094851.92521-1-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/firmware/cirrus/cs_dsp.c       | 3 +++
 include/linux/firmware/cirrus/cs_dsp.h | 4 +++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/firmware/cirrus/cs_dsp.c b/drivers/firmware/cirrus/cs_dsp.c
index 7dad6f57d970..b402f841d72c 100644
--- a/drivers/firmware/cirrus/cs_dsp.c
+++ b/drivers/firmware/cirrus/cs_dsp.c
@@ -2725,6 +2725,9 @@ void cs_dsp_stop(struct cs_dsp *dsp)
 
 	mutex_lock(&dsp->pwr_lock);
 
+	if (dsp->client_ops->pre_stop)
+		dsp->client_ops->pre_stop(dsp);
+
 	dsp->running = false;
 
 	if (dsp->ops->stop_core)
diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h
index 30055706cce2..6ab230218df0 100644
--- a/include/linux/firmware/cirrus/cs_dsp.h
+++ b/include/linux/firmware/cirrus/cs_dsp.h
@@ -189,7 +189,8 @@ struct cs_dsp {
  * @control_remove:	Called under the pwr_lock when a control is destroyed
  * @pre_run:		Called under the pwr_lock by cs_dsp_run() before the core is started
  * @post_run:		Called under the pwr_lock by cs_dsp_run() after the core is started
- * @post_stop:		Called under the pwr_lock by cs_dsp_stop()
+ * @pre_stop:		Called under the pwr_lock by cs_dsp_stop() before the core is stopped
+ * @post_stop:		Called under the pwr_lock by cs_dsp_stop() after the core is stopped
  * @watchdog_expired:	Called when a watchdog expiry is detected
  *
  * These callbacks give the cs_dsp client an opportunity to respond to events
@@ -200,6 +201,7 @@ struct cs_dsp_client_ops {
 	void (*control_remove)(struct cs_dsp_coeff_ctl *ctl);
 	int (*pre_run)(struct cs_dsp *dsp);
 	int (*post_run)(struct cs_dsp *dsp);
+	void (*pre_stop)(struct cs_dsp *dsp);
 	void (*post_stop)(struct cs_dsp *dsp);
 	void (*watchdog_expired)(struct cs_dsp *dsp);
 };
-- 
cgit 


From a4b976552f122ea851f556769874022cf097741e Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Fri, 22 Jul 2022 10:48:51 +0100
Subject: firmware: cs_dsp: Add memory chunk helpers

Add helpers that can be layered on top of a buffer read from or to be
written to the DSP to faciliate accessing datastructures within the DSP
memory. These functions handle adding the padding bytes for the DSP,
converting to big endian, and packing arbitrary length data.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20220722094851.92521-2-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/firmware/cirrus/cs_dsp.c       | 104 +++++++++++++++++++++++++++++++++
 include/linux/firmware/cirrus/cs_dsp.h |  73 +++++++++++++++++++++++
 2 files changed, 177 insertions(+)

(limited to 'include')

diff --git a/drivers/firmware/cirrus/cs_dsp.c b/drivers/firmware/cirrus/cs_dsp.c
index b402f841d72c..81cc3d0f6eec 100644
--- a/drivers/firmware/cirrus/cs_dsp.c
+++ b/drivers/firmware/cirrus/cs_dsp.c
@@ -3180,6 +3180,110 @@ static const struct cs_dsp_ops cs_dsp_halo_ops = {
 	.stop_core = cs_dsp_halo_stop_core,
 };
 
+/**
+ * cs_dsp_chunk_write() - Format data to a DSP memory chunk
+ * @ch: Pointer to the chunk structure
+ * @nbits: Number of bits to write
+ * @val: Value to write
+ *
+ * This function sequentially writes values into the format required for DSP
+ * memory, it handles both inserting of the padding bytes and converting to
+ * big endian. Note that data is only committed to the chunk when a whole DSP
+ * words worth of data is available.
+ *
+ * Return: Zero for success, a negative number on error.
+ */
+int cs_dsp_chunk_write(struct cs_dsp_chunk *ch, int nbits, u32 val)
+{
+	int nwrite, i;
+
+	nwrite = min(CS_DSP_DATA_WORD_BITS - ch->cachebits, nbits);
+
+	ch->cache <<= nwrite;
+	ch->cache |= val >> (nbits - nwrite);
+	ch->cachebits += nwrite;
+	nbits -= nwrite;
+
+	if (ch->cachebits == CS_DSP_DATA_WORD_BITS) {
+		if (cs_dsp_chunk_end(ch))
+			return -ENOSPC;
+
+		ch->cache &= 0xFFFFFF;
+		for (i = 0; i < sizeof(ch->cache); i++, ch->cache <<= BITS_PER_BYTE)
+			*ch->data++ = (ch->cache & 0xFF000000) >> CS_DSP_DATA_WORD_BITS;
+
+		ch->bytes += sizeof(ch->cache);
+		ch->cachebits = 0;
+	}
+
+	if (nbits)
+		return cs_dsp_chunk_write(ch, nbits, val);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cs_dsp_chunk_write);
+
+/**
+ * cs_dsp_chunk_flush() - Pad remaining data with zero and commit to chunk
+ * @ch: Pointer to the chunk structure
+ *
+ * As cs_dsp_chunk_write only writes data when a whole DSP word is ready to
+ * be written out it is possible that some data will remain in the cache, this
+ * function will pad that data with zeros upto a whole DSP word and write out.
+ *
+ * Return: Zero for success, a negative number on error.
+ */
+int cs_dsp_chunk_flush(struct cs_dsp_chunk *ch)
+{
+	if (!ch->cachebits)
+		return 0;
+
+	return cs_dsp_chunk_write(ch, CS_DSP_DATA_WORD_BITS - ch->cachebits, 0);
+}
+EXPORT_SYMBOL_GPL(cs_dsp_chunk_flush);
+
+/**
+ * cs_dsp_chunk_read() - Parse data from a DSP memory chunk
+ * @ch: Pointer to the chunk structure
+ * @nbits: Number of bits to read
+ *
+ * This function sequentially reads values from a DSP memory formatted buffer,
+ * it handles both removing of the padding bytes and converting from big endian.
+ *
+ * Return: A negative number is returned on error, otherwise the read value.
+ */
+int cs_dsp_chunk_read(struct cs_dsp_chunk *ch, int nbits)
+{
+	int nread, i;
+	u32 result;
+
+	if (!ch->cachebits) {
+		if (cs_dsp_chunk_end(ch))
+			return -ENOSPC;
+
+		ch->cache = 0;
+		ch->cachebits = CS_DSP_DATA_WORD_BITS;
+
+		for (i = 0; i < sizeof(ch->cache); i++, ch->cache <<= BITS_PER_BYTE)
+			ch->cache |= *ch->data++;
+
+		ch->bytes += sizeof(ch->cache);
+	}
+
+	nread = min(ch->cachebits, nbits);
+	nbits -= nread;
+
+	result = ch->cache >> ((sizeof(ch->cache) * BITS_PER_BYTE) - nread);
+	ch->cache <<= nread;
+	ch->cachebits -= nread;
+
+	if (nbits)
+		result = (result << nbits) | cs_dsp_chunk_read(ch, nbits);
+
+	return result;
+}
+EXPORT_SYMBOL_GPL(cs_dsp_chunk_read);
+
 MODULE_DESCRIPTION("Cirrus Logic DSP Support");
 MODULE_AUTHOR("Simon Trimmer <simont@opensource.cirrus.com>");
 MODULE_LICENSE("GPL v2");
diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h
index 6ab230218df0..cad828e21c72 100644
--- a/include/linux/firmware/cirrus/cs_dsp.h
+++ b/include/linux/firmware/cirrus/cs_dsp.h
@@ -11,6 +11,7 @@
 #ifndef __CS_DSP_H
 #define __CS_DSP_H
 
+#include <linux/bits.h>
 #include <linux/device.h>
 #include <linux/firmware.h>
 #include <linux/list.h>
@@ -34,6 +35,7 @@
 #define CS_ADSP2_REGION_ALL (CS_ADSP2_REGION_0 | CS_ADSP2_REGION_1_9)
 
 #define CS_DSP_DATA_WORD_SIZE                3
+#define CS_DSP_DATA_WORD_BITS                (3 * BITS_PER_BYTE)
 
 #define CS_DSP_ACKED_CTL_TIMEOUT_MS          100
 #define CS_DSP_ACKED_CTL_N_QUICKPOLLS        10
@@ -252,4 +254,75 @@ struct cs_dsp_alg_region *cs_dsp_find_alg_region(struct cs_dsp *dsp,
 
 const char *cs_dsp_mem_region_name(unsigned int type);
 
+/**
+ * struct cs_dsp_chunk - Describes a buffer holding data formatted for the DSP
+ * @data:	Pointer to underlying buffer memory
+ * @max:	Pointer to end of the buffer memory
+ * @bytes:	Number of bytes read/written into the memory chunk
+ * @cache:	Temporary holding data as it is formatted
+ * @cachebits:	Number of bits of data currently in cache
+ */
+struct cs_dsp_chunk {
+	u8 *data;
+	u8 *max;
+	int bytes;
+
+	u32 cache;
+	int cachebits;
+};
+
+/**
+ * cs_dsp_chunk() - Create a DSP memory chunk
+ * @data: Pointer to the buffer that will be used to store data
+ * @size: Size of the buffer in bytes
+ *
+ * Return: A cs_dsp_chunk structure
+ */
+static inline struct cs_dsp_chunk cs_dsp_chunk(void *data, int size)
+{
+	struct cs_dsp_chunk ch = {
+		.data = data,
+		.max = data + size,
+	};
+
+	return ch;
+}
+
+/**
+ * cs_dsp_chunk_end() - Check if a DSP memory chunk is full
+ * @ch: Pointer to the chunk structure
+ *
+ * Return: True if the whole buffer has been read/written
+ */
+static inline bool cs_dsp_chunk_end(struct cs_dsp_chunk *ch)
+{
+	return ch->data == ch->max;
+}
+
+/**
+ * cs_dsp_chunk_bytes() - Number of bytes written/read from a DSP memory chunk
+ * @ch: Pointer to the chunk structure
+ *
+ * Return: Number of bytes read/written to the buffer
+ */
+static inline int cs_dsp_chunk_bytes(struct cs_dsp_chunk *ch)
+{
+	return ch->bytes;
+}
+
+/**
+ * cs_dsp_chunk_valid_addr() - Check if an address is in a DSP memory chunk
+ * @ch: Pointer to the chunk structure
+ *
+ * Return: True if the given address is within the buffer
+ */
+static inline bool cs_dsp_chunk_valid_addr(struct cs_dsp_chunk *ch, void *addr)
+{
+	return (u8 *)addr >= ch->data && (u8 *)addr < ch->max;
+}
+
+int cs_dsp_chunk_write(struct cs_dsp_chunk *ch, int nbits, u32 val);
+int cs_dsp_chunk_flush(struct cs_dsp_chunk *ch);
+int cs_dsp_chunk_read(struct cs_dsp_chunk *ch, int nbits);
+
 #endif
-- 
cgit 


From e2a619ca0b38f2114347b7078b8a67d72d457a3d Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Fri, 22 Jul 2022 13:07:11 +0200
Subject: asm-generic: remove a broken and needless ifdef conditional

Commit 527701eda5f1 ("lib: Add a generic version of devmem_is_allowed()")
introduces the config symbol GENERIC_LIB_DEVMEM_IS_ALLOWED, but then
falsely refers to CONFIG_GENERIC_DEVMEM_IS_ALLOWED (note the missing LIB
in the reference) in ./include/asm-generic/io.h.

Luckily, ./scripts/checkkconfigsymbols.py warns on non-existing configs:

GENERIC_DEVMEM_IS_ALLOWED
Referencing files: include/asm-generic/io.h

The actual fix, though, is simply to not to make this function declaration
dependent on any kernel config. For architectures that intend to use
the generic version, the arch's 'select GENERIC_LIB_DEVMEM_IS_ALLOWED' will
lead to picking the function definition, and for other architectures, this
function is simply defined elsewhere.

The wrong '#ifndef' on a non-existing config symbol also always had the
same effect (although more by mistake than by intent). So, there is no
functional change.

Remove this broken and needless ifdef conditional.

Fixes: 527701eda5f1 ("lib: Add a generic version of devmem_is_allowed()")
Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/asm-generic/io.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index 7ce93aaf69f8..98954dda5734 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -1125,9 +1125,7 @@ static inline void memcpy_toio(volatile void __iomem *addr, const void *buffer,
 }
 #endif
 
-#ifndef CONFIG_GENERIC_DEVMEM_IS_ALLOWED
 extern int devmem_is_allowed(unsigned long pfn);
-#endif
 
 #endif /* __KERNEL__ */
 
-- 
cgit 


From 8937e28eac0cbe0ff76e15f142e4b2e5adf3f462 Mon Sep 17 00:00:00 2001
From: Xin Gao <gaoxin@cdjrlc.com>
Date: Fri, 22 Jul 2022 10:18:33 +0800
Subject: RDMA: Fix comment typo

The double `get' is duplicated, remove one.

Link: https://lore.kernel.org/r/20220722021833.15669-1-gaoxin@cdjrlc.com
Signed-off-by: Xin Gao <gaoxin@cdjrlc.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/rdma/ib_verbs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 9c6317cf80d5..7c2f76f34f6f 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -4603,7 +4603,7 @@ static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev,
 
 /**
  * ib_lid_cpu16 - Return lid in 16bit CPU encoding.
- *     In the current implementation the only way to get
+ *     In the current implementation the only way to
  *     get the 32bit lid is from other sources for OPA.
  *     For IB, lids will always be 16bits so cast the
  *     value accordingly.
-- 
cgit 


From 6d8c5afc9ab14595707ff25d971dde45728eba3e Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 22 Jul 2022 18:38:17 +0800
Subject: ublk_drv: make sure that correct flags(features) returned to
 userspace

Userspace may support more features or new added flags, but the driver
side can be old, so make sure correct flags(features) returned to
userpsace, then userspace can work as expected.

Also mark the 2nd flags as reversed, just use the 1st one. When we run
out of flags, the reserved one can be handled at that time.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: ZiyangZhang <ZiyangZhang@linux.alibaba.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20220722103817.631258-3-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c      | 17 ++++++++++++++---
 include/uapi/linux/ublk_cmd.h |  7 ++++---
 2 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 67f91a80a7ab..255b2de46a24 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -46,6 +46,9 @@
 
 #define UBLK_MINORS		(1U << MINORBITS)
 
+/* All UBLK_F_* have to be included into UBLK_F_ALL */
+#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_URING_CMD_COMP_IN_TASK)
+
 struct ublk_rq_data {
 	struct callback_head work;
 };
@@ -953,7 +956,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id)
 	void *ptr;
 	int size;
 
-	ubq->flags = ub->dev_info.flags[0];
+	ubq->flags = ub->dev_info.flags;
 	ubq->q_id = q_id;
 	ubq->q_depth = ub->dev_info.queue_depth;
 	size = ublk_queue_cmd_buf_size(ub, q_id);
@@ -1246,7 +1249,7 @@ out_put_device:
 static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
 {
 	pr_devel("%s: dev id %d flags %llx\n", __func__,
-			info->dev_id, info->flags[0]);
+			info->dev_id, info->flags);
 	pr_devel("\t nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n",
 			info->nr_hw_queues, info->queue_depth,
 			info->block_size, info->dev_blocks);
@@ -1298,8 +1301,16 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
 	/* update device id */
 	ub->dev_info.dev_id = ub->ub_number;
 
+	/*
+	 * 64bit flags will be copied back to userspace as feature
+	 * negotiation result, so have to clear flags which driver
+	 * doesn't support yet, then userspace can get correct flags
+	 * (features) to handle.
+	 */
+	ub->dev_info.flags &= UBLK_F_ALL;
+
 	/* We are not ready to support zero copy */
-	ub->dev_info.flags[0] &= ~UBLK_F_SUPPORT_ZERO_COPY;
+	ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
 
 	ub->bs_shift = ilog2(ub->dev_info.block_size);
 	ub->dev_info.nr_hw_queues = min_t(unsigned int,
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 917580b34198..ca33092354ab 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -46,13 +46,13 @@
  * zero copy requires 4k block size, and can remap ublk driver's io
  * request into ublksrv's vm space
  */
-#define UBLK_F_SUPPORT_ZERO_COPY	(1UL << 0)
+#define UBLK_F_SUPPORT_ZERO_COPY	(1ULL << 0)
 
 /*
  * Force to complete io cmd via io_uring_cmd_complete_in_task so that
  * performance comparison is done easily with using task_work_add
  */
-#define UBLK_F_URING_CMD_COMP_IN_TASK	(1UL << 1)
+#define UBLK_F_URING_CMD_COMP_IN_TASK	(1ULL << 1)
 
 /* device state */
 #define UBLK_S_DEV_DEAD	0
@@ -88,7 +88,8 @@ struct ublksrv_ctrl_dev_info {
 
 	__s32	ublksrv_pid;
 	__s32	reserved0;
-	__u64	flags[2];
+	__u64	flags;
+	__u64	flags_reserved;
 
 	/* For ublksrv internal use, invisible to ublk driver */
 	__u64	ublksrv_flags;
-- 
cgit 


From e423414375866a399ebbe55ed044acb39846e8bf Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Fri, 22 Jul 2022 13:36:05 +0200
Subject: bpf: Fix build error in case of !CONFIG_DEBUG_INFO_BTF

BTF_ID_FLAGS macro needs to be able to take 0 or 1 args, so make it a
variable argument. BTF_SET8_END is incorrect, it should just be empty.

Reported-by: kernel test robot <lkp@intel.com>
Fixes: ab21d6063c01 ("bpf: Introduce 8-byte BTF set")
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20220722113605.6513-1-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf_ids.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index 3cb0741e71d7..2aea877d644f 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -206,7 +206,7 @@ extern struct btf_id_set8 name;
 
 #define BTF_ID_LIST(name) static u32 __maybe_unused name[5];
 #define BTF_ID(prefix, name)
-#define BTF_ID_FLAGS(prefix, name, flags)
+#define BTF_ID_FLAGS(prefix, name, ...)
 #define BTF_ID_UNUSED
 #define BTF_ID_LIST_GLOBAL(name, n) u32 __maybe_unused name[n];
 #define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 __maybe_unused name[1];
@@ -215,7 +215,7 @@ extern struct btf_id_set8 name;
 #define BTF_SET_START_GLOBAL(name) static struct btf_id_set __maybe_unused name = { 0 };
 #define BTF_SET_END(name)
 #define BTF_SET8_START(name) static struct btf_id_set8 __maybe_unused name = { 0 };
-#define BTF_SET8_END(name) static struct btf_id_set8 __maybe_unused name = { 0 };
+#define BTF_SET8_END(name)
 
 #endif /* CONFIG_DEBUG_INFO_BTF */
 
-- 
cgit 


From 7fb5e508319068de1d69e6d7230416c390cb3cbb Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 22 Jul 2022 09:28:34 -0700
Subject: mmu_gather: fix the CONFIG_MMU_GATHER_NO_RANGE case

Sudip reports that alpha doesn't build properly, with errors like

  include/asm-generic/tlb.h:401:1: error: redefinition of 'tlb_update_vma_flags'
    401 | tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma)
        | ^~~~~~~~~~~~~~~~~~~~
  include/asm-generic/tlb.h:372:1: note: previous definition of 'tlb_update_vma_flags' with type 'void(struct mmu_gather *, struct vm_area_struct *)'
    372 | tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { }

the cause being that We have this odd situation where some architectures
were never converted to the newer TLB flushing interfaces that have a
range for the flush.  Instead people left them alone, and we have them
select the MMU_GATHER_NO_RANGE config option to make the tlb header
files account for this.

Peter Zijlstra cleaned some of these nasty header file games up in
commits

  1e9fdf21a433 ("mmu_gather: Remove per arch tlb_{start,end}_vma()")
  18ba064e42df ("mmu_gather: Let there be one tlb_{start,end}_vma() implementation")

but tlb_update_vma_flags() was left alone, and then commit b67fbebd4cf9
("mmu_gather: Force tlb-flush VM_PFNMAP vmas") ended up removing only
_one_ of the two stale duplicate dummy inline functions.

This removes the other stale one.

Somebody braver than me should try to remove MMU_GATHER_NO_RANGE
entirely, but it requires fixing up the oddball architectures that use
it: alpha, m68k, microblaze, nios2 and openrisc.

The fixups should be fairly straightforward ("fix the build errors it
exposes by adding the appropriate range arguments"), but the reason this
wasn't done in the first place is that so few people end up working on
those architectures.  But it could be done one architecture at a time,
hint, hint.

Reported-by: Sudip Mukherjee (Codethink) <sudipm.mukherjee@gmail.com>
Fixes: b67fbebd4cf9 ("mmu_gather: Force tlb-flush VM_PFNMAP vmas")
Link: https://lore.kernel.org/all/YtpXh0QHWwaEWVAY@debian/
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Nick Piggin <npiggin@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-generic/tlb.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index cb2167c89eee..492dce43236e 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -368,9 +368,6 @@ static inline void tlb_flush(struct mmu_gather *tlb)
 		flush_tlb_mm(tlb->mm);
 }
 
-static inline void
-tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { }
-
 #else /* CONFIG_MMU_GATHER_NO_RANGE */
 
 #ifndef tlb_flush
-- 
cgit 


From 478af190cb6c501efaa8de2b9c9418ece2e4d0bd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 22 Jul 2022 10:59:17 -0700
Subject: iomap: remove iomap_writepage

Unused now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/iomap/buffered-io.c | 15 ---------------
 include/linux/iomap.h  |  3 ---
 2 files changed, 18 deletions(-)

(limited to 'include')

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index afd260632836..7c4a56b3ac01 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1519,21 +1519,6 @@ unlock:
 	return 0;
 }
 
-int
-iomap_writepage(struct page *page, struct writeback_control *wbc,
-		struct iomap_writepage_ctx *wpc,
-		const struct iomap_writeback_ops *ops)
-{
-	int ret;
-
-	wpc->ops = ops;
-	ret = iomap_do_writepage(page, wbc, wpc);
-	if (!wpc->ioend)
-		return ret;
-	return iomap_submit_ioend(wpc, wpc->ioend, ret);
-}
-EXPORT_SYMBOL_GPL(iomap_writepage);
-
 int
 iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
 		struct iomap_writepage_ctx *wpc,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index e552097c67e0..911888560d3e 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -303,9 +303,6 @@ void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
 void iomap_ioend_try_merge(struct iomap_ioend *ioend,
 		struct list_head *more_ioends);
 void iomap_sort_ioends(struct list_head *ioend_list);
-int iomap_writepage(struct page *page, struct writeback_control *wbc,
-		struct iomap_writepage_ctx *wpc,
-		const struct iomap_writeback_ops *ops);
 int iomap_writepages(struct address_space *mapping,
 		struct writeback_control *wbc, struct iomap_writepage_ctx *wpc,
 		const struct iomap_writeback_ops *ops);
-- 
cgit 


From ec2904c259c56fbe50aacd838da9553a6eea6683 Mon Sep 17 00:00:00 2001
From: Brian Gix <brian.gix@intel.com>
Date: Thu, 21 Jul 2022 16:22:23 -0700
Subject: Bluetooth: Remove dead code from hci_request.c

The discov_update work queue is no longer used as a result
of the hci_sync rework.

The __hci_req_hci_power_on() function is no longer referenced in the
code as a result of the hci_sync rework.

Signed-off-by: Brian Gix <brian.gix@intel.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |   1 -
 net/bluetooth/hci_request.c      | 287 ---------------------------------------
 net/bluetooth/hci_request.h      |   2 -
 3 files changed, 290 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index df7dac4a5bbd..83bbeffe71e9 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -517,7 +517,6 @@ struct hci_dev {
 	struct work_struct	cmd_work;
 	struct work_struct	tx_work;
 
-	struct work_struct	discov_update;
 	struct work_struct	scan_update;
 	struct delayed_work	le_scan_disable;
 	struct delayed_work	le_scan_restart;
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 38ecaf9264ee..5a01bf997500 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -2227,146 +2227,6 @@ unlock:
 	hci_dev_unlock(hdev);
 }
 
-static int active_scan(struct hci_request *req, unsigned long opt)
-{
-	uint16_t interval = opt;
-	struct hci_dev *hdev = req->hdev;
-	u8 own_addr_type;
-	/* Accept list is not used for discovery */
-	u8 filter_policy = 0x00;
-	/* Default is to enable duplicates filter */
-	u8 filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
-	/* Discovery doesn't require controller address resolution */
-	bool addr_resolv = false;
-	int err;
-
-	bt_dev_dbg(hdev, "");
-
-	/* If controller is scanning, it means the background scanning is
-	 * running. Thus, we should temporarily stop it in order to set the
-	 * discovery scanning parameters.
-	 */
-	if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) {
-		hci_req_add_le_scan_disable(req, false);
-		cancel_interleave_scan(hdev);
-	}
-
-	/* All active scans will be done with either a resolvable private
-	 * address (when privacy feature has been enabled) or non-resolvable
-	 * private address.
-	 */
-	err = hci_update_random_address(req, true, scan_use_rpa(hdev),
-					&own_addr_type);
-	if (err < 0)
-		own_addr_type = ADDR_LE_DEV_PUBLIC;
-
-	hci_dev_lock(hdev);
-	if (hci_is_adv_monitoring(hdev)) {
-		/* Duplicate filter should be disabled when some advertisement
-		 * monitor is activated, otherwise AdvMon can only receive one
-		 * advertisement for one peer(*) during active scanning, and
-		 * might report loss to these peers.
-		 *
-		 * Note that different controllers have different meanings of
-		 * |duplicate|. Some of them consider packets with the same
-		 * address as duplicate, and others consider packets with the
-		 * same address and the same RSSI as duplicate. Although in the
-		 * latter case we don't need to disable duplicate filter, but
-		 * it is common to have active scanning for a short period of
-		 * time, the power impact should be neglectable.
-		 */
-		filter_dup = LE_SCAN_FILTER_DUP_DISABLE;
-	}
-	hci_dev_unlock(hdev);
-
-	hci_req_start_scan(req, LE_SCAN_ACTIVE, interval,
-			   hdev->le_scan_window_discovery, own_addr_type,
-			   filter_policy, filter_dup, addr_resolv);
-	return 0;
-}
-
-static int interleaved_discov(struct hci_request *req, unsigned long opt)
-{
-	int err;
-
-	bt_dev_dbg(req->hdev, "");
-
-	err = active_scan(req, opt);
-	if (err)
-		return err;
-
-	return bredr_inquiry(req, DISCOV_BREDR_INQUIRY_LEN);
-}
-
-static void start_discovery(struct hci_dev *hdev, u8 *status)
-{
-	unsigned long timeout;
-
-	bt_dev_dbg(hdev, "type %u", hdev->discovery.type);
-
-	switch (hdev->discovery.type) {
-	case DISCOV_TYPE_BREDR:
-		if (!hci_dev_test_flag(hdev, HCI_INQUIRY))
-			hci_req_sync(hdev, bredr_inquiry,
-				     DISCOV_BREDR_INQUIRY_LEN, HCI_CMD_TIMEOUT,
-				     status);
-		return;
-	case DISCOV_TYPE_INTERLEAVED:
-		/* When running simultaneous discovery, the LE scanning time
-		 * should occupy the whole discovery time sine BR/EDR inquiry
-		 * and LE scanning are scheduled by the controller.
-		 *
-		 * For interleaving discovery in comparison, BR/EDR inquiry
-		 * and LE scanning are done sequentially with separate
-		 * timeouts.
-		 */
-		if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY,
-			     &hdev->quirks)) {
-			timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT);
-			/* During simultaneous discovery, we double LE scan
-			 * interval. We must leave some time for the controller
-			 * to do BR/EDR inquiry.
-			 */
-			hci_req_sync(hdev, interleaved_discov,
-				     hdev->le_scan_int_discovery * 2, HCI_CMD_TIMEOUT,
-				     status);
-			break;
-		}
-
-		timeout = msecs_to_jiffies(hdev->discov_interleaved_timeout);
-		hci_req_sync(hdev, active_scan, hdev->le_scan_int_discovery,
-			     HCI_CMD_TIMEOUT, status);
-		break;
-	case DISCOV_TYPE_LE:
-		timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT);
-		hci_req_sync(hdev, active_scan, hdev->le_scan_int_discovery,
-			     HCI_CMD_TIMEOUT, status);
-		break;
-	default:
-		*status = HCI_ERROR_UNSPECIFIED;
-		return;
-	}
-
-	if (*status)
-		return;
-
-	bt_dev_dbg(hdev, "timeout %u ms", jiffies_to_msecs(timeout));
-
-	/* When service discovery is used and the controller has a
-	 * strict duplicate filter, it is important to remember the
-	 * start and duration of the scan. This is required for
-	 * restarting scanning during the discovery phase.
-	 */
-	if (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) &&
-		     hdev->discovery.result_filtering) {
-		hdev->discovery.scan_start = jiffies;
-		hdev->discovery.scan_duration = timeout;
-	}
-
-	queue_delayed_work(hdev->req_workqueue, &hdev->le_scan_disable,
-			   timeout);
-}
-
 bool hci_req_stop_discovery(struct hci_request *req)
 {
 	struct hci_dev *hdev = req->hdev;
@@ -2462,42 +2322,6 @@ error:
 	return err;
 }
 
-static int stop_discovery(struct hci_request *req, unsigned long opt)
-{
-	hci_dev_lock(req->hdev);
-	hci_req_stop_discovery(req);
-	hci_dev_unlock(req->hdev);
-
-	return 0;
-}
-
-static void discov_update(struct work_struct *work)
-{
-	struct hci_dev *hdev = container_of(work, struct hci_dev,
-					    discov_update);
-	u8 status = 0;
-
-	switch (hdev->discovery.state) {
-	case DISCOVERY_STARTING:
-		start_discovery(hdev, &status);
-		mgmt_start_discovery_complete(hdev, status);
-		if (status)
-			hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
-		else
-			hci_discovery_set_state(hdev, DISCOVERY_FINDING);
-		break;
-	case DISCOVERY_STOPPING:
-		hci_req_sync(hdev, stop_discovery, 0, HCI_CMD_TIMEOUT, &status);
-		mgmt_stop_discovery_complete(hdev, status);
-		if (!status)
-			hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
-		break;
-	case DISCOVERY_STOPPED:
-	default:
-		return;
-	}
-}
-
 static void discov_off(struct work_struct *work)
 {
 	struct hci_dev *hdev = container_of(work, struct hci_dev,
@@ -2522,118 +2346,8 @@ static void discov_off(struct work_struct *work)
 	mgmt_new_settings(hdev);
 }
 
-static int powered_update_hci(struct hci_request *req, unsigned long opt)
-{
-	struct hci_dev *hdev = req->hdev;
-	u8 link_sec;
-
-	hci_dev_lock(hdev);
-
-	if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED) &&
-	    !lmp_host_ssp_capable(hdev)) {
-		u8 mode = 0x01;
-
-		hci_req_add(req, HCI_OP_WRITE_SSP_MODE, sizeof(mode), &mode);
-
-		if (bredr_sc_enabled(hdev) && !lmp_host_sc_capable(hdev)) {
-			u8 support = 0x01;
-
-			hci_req_add(req, HCI_OP_WRITE_SC_SUPPORT,
-				    sizeof(support), &support);
-		}
-	}
-
-	if (hci_dev_test_flag(hdev, HCI_LE_ENABLED) &&
-	    lmp_bredr_capable(hdev)) {
-		struct hci_cp_write_le_host_supported cp;
-
-		cp.le = 0x01;
-		cp.simul = 0x00;
-
-		/* Check first if we already have the right
-		 * host state (host features set)
-		 */
-		if (cp.le != lmp_host_le_capable(hdev) ||
-		    cp.simul != lmp_host_le_br_capable(hdev))
-			hci_req_add(req, HCI_OP_WRITE_LE_HOST_SUPPORTED,
-				    sizeof(cp), &cp);
-	}
-
-	if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
-		/* Make sure the controller has a good default for
-		 * advertising data. This also applies to the case
-		 * where BR/EDR was toggled during the AUTO_OFF phase.
-		 */
-		if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
-		    list_empty(&hdev->adv_instances)) {
-			int err;
-
-			if (ext_adv_capable(hdev)) {
-				err = __hci_req_setup_ext_adv_instance(req,
-								       0x00);
-				if (!err)
-					__hci_req_update_scan_rsp_data(req,
-								       0x00);
-			} else {
-				err = 0;
-				__hci_req_update_adv_data(req, 0x00);
-				__hci_req_update_scan_rsp_data(req, 0x00);
-			}
-
-			if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) {
-				if (!ext_adv_capable(hdev))
-					__hci_req_enable_advertising(req);
-				else if (!err)
-					__hci_req_enable_ext_advertising(req,
-									 0x00);
-			}
-		} else if (!list_empty(&hdev->adv_instances)) {
-			struct adv_info *adv_instance;
-
-			adv_instance = list_first_entry(&hdev->adv_instances,
-							struct adv_info, list);
-			__hci_req_schedule_adv_instance(req,
-							adv_instance->instance,
-							true);
-		}
-	}
-
-	link_sec = hci_dev_test_flag(hdev, HCI_LINK_SECURITY);
-	if (link_sec != test_bit(HCI_AUTH, &hdev->flags))
-		hci_req_add(req, HCI_OP_WRITE_AUTH_ENABLE,
-			    sizeof(link_sec), &link_sec);
-
-	if (lmp_bredr_capable(hdev)) {
-		if (hci_dev_test_flag(hdev, HCI_FAST_CONNECTABLE))
-			__hci_req_write_fast_connectable(req, true);
-		else
-			__hci_req_write_fast_connectable(req, false);
-		__hci_req_update_scan(req);
-		__hci_req_update_class(req);
-		__hci_req_update_name(req);
-		__hci_req_update_eir(req);
-	}
-
-	hci_dev_unlock(hdev);
-	return 0;
-}
-
-int __hci_req_hci_power_on(struct hci_dev *hdev)
-{
-	/* Register the available SMP channels (BR/EDR and LE) only when
-	 * successfully powering on the controller. This late
-	 * registration is required so that LE SMP can clearly decide if
-	 * the public address or static address is used.
-	 */
-	smp_register(hdev);
-
-	return __hci_req_sync(hdev, powered_update_hci, 0, HCI_CMD_TIMEOUT,
-			      NULL);
-}
-
 void hci_request_setup(struct hci_dev *hdev)
 {
-	INIT_WORK(&hdev->discov_update, discov_update);
 	INIT_WORK(&hdev->scan_update, scan_update_work);
 	INIT_DELAYED_WORK(&hdev->discov_off, discov_off);
 	INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable_work);
@@ -2646,7 +2360,6 @@ void hci_request_cancel_all(struct hci_dev *hdev)
 {
 	__hci_cmd_sync_cancel(hdev, ENODEV);
 
-	cancel_work_sync(&hdev->discov_update);
 	cancel_work_sync(&hdev->scan_update);
 	cancel_delayed_work_sync(&hdev->discov_off);
 	cancel_delayed_work_sync(&hdev->le_scan_disable);
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
index 7f8df258e295..2b2fba278510 100644
--- a/net/bluetooth/hci_request.h
+++ b/net/bluetooth/hci_request.h
@@ -68,8 +68,6 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req,
 struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, u32 plen,
 				const void *param);
 
-int __hci_req_hci_power_on(struct hci_dev *hdev);
-
 void __hci_req_write_fast_connectable(struct hci_request *req, bool enable);
 void __hci_req_update_name(struct hci_request *req);
 void __hci_req_update_eir(struct hci_request *req);
-- 
cgit 


From bb87672562f871edd7a220222dd2480a87294580 Mon Sep 17 00:00:00 2001
From: Brian Gix <brian.gix@intel.com>
Date: Thu, 21 Jul 2022 16:22:24 -0700
Subject: Bluetooth: Remove update_scan hci_request dependancy

This removes the remaining calls to HCI_OP_WRITE_SCAN_ENABLE from
hci_request call chains, and converts them to hci_sync calls.

Signed-off-by: Brian Gix <brian.gix@intel.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |  1 -
 include/net/bluetooth/hci_sync.h |  1 +
 net/bluetooth/hci_event.c        |  4 ++--
 net/bluetooth/hci_request.c      | 17 -----------------
 net/bluetooth/hci_request.h      |  5 -----
 net/bluetooth/hci_sync.c         | 10 ++++++++++
 net/bluetooth/mgmt.c             |  8 ++++----
 7 files changed, 17 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 83bbeffe71e9..df65b107b596 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -517,7 +517,6 @@ struct hci_dev {
 	struct work_struct	cmd_work;
 	struct work_struct	tx_work;
 
-	struct work_struct	scan_update;
 	struct delayed_work	le_scan_disable;
 	struct delayed_work	le_scan_restart;
 
diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index 544e949b5dbf..5c4d4cace9c3 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -78,6 +78,7 @@ int hci_read_clock_sync(struct hci_dev *hdev, struct hci_cp_read_clock *cp);
 
 int hci_write_fast_connectable_sync(struct hci_dev *hdev, bool enable);
 int hci_update_scan_sync(struct hci_dev *hdev);
+int hci_update_scan(struct hci_dev *hdev);
 
 int hci_write_le_host_supported_sync(struct hci_dev *hdev, u8 le, u8 simul);
 int hci_remove_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance,
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 34bec7446d00..b32ca92fc692 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -3173,7 +3173,7 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data,
 			hci_send_cmd(hdev, HCI_OP_READ_REMOTE_FEATURES,
 				     sizeof(cp), &cp);
 
-			hci_req_update_scan(hdev);
+			hci_update_scan(hdev);
 		}
 
 		/* Set packet type for incoming connection */
@@ -3371,7 +3371,7 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, void *data,
 		if (test_and_clear_bit(HCI_CONN_FLUSH_KEY, &conn->flags))
 			hci_remove_link_key(hdev, &conn->dst);
 
-		hci_req_update_scan(hdev);
+		hci_update_scan(hdev);
 	}
 
 	params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type);
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 5a01bf997500..0ce486afbf43 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -1835,21 +1835,6 @@ void __hci_req_update_scan(struct hci_request *req)
 	hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan);
 }
 
-static int update_scan(struct hci_request *req, unsigned long opt)
-{
-	hci_dev_lock(req->hdev);
-	__hci_req_update_scan(req);
-	hci_dev_unlock(req->hdev);
-	return 0;
-}
-
-static void scan_update_work(struct work_struct *work)
-{
-	struct hci_dev *hdev = container_of(work, struct hci_dev, scan_update);
-
-	hci_req_sync(hdev, update_scan, 0, HCI_CMD_TIMEOUT, NULL);
-}
-
 static u8 get_service_classes(struct hci_dev *hdev)
 {
 	struct bt_uuid *uuid;
@@ -2348,7 +2333,6 @@ static void discov_off(struct work_struct *work)
 
 void hci_request_setup(struct hci_dev *hdev)
 {
-	INIT_WORK(&hdev->scan_update, scan_update_work);
 	INIT_DELAYED_WORK(&hdev->discov_off, discov_off);
 	INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable_work);
 	INIT_DELAYED_WORK(&hdev->le_scan_restart, le_scan_restart_work);
@@ -2360,7 +2344,6 @@ void hci_request_cancel_all(struct hci_dev *hdev)
 {
 	__hci_cmd_sync_cancel(hdev, ENODEV);
 
-	cancel_work_sync(&hdev->scan_update);
 	cancel_delayed_work_sync(&hdev->discov_off);
 	cancel_delayed_work_sync(&hdev->le_scan_disable);
 	cancel_delayed_work_sync(&hdev->le_scan_restart);
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
index 2b2fba278510..821244ad9d73 100644
--- a/net/bluetooth/hci_request.h
+++ b/net/bluetooth/hci_request.h
@@ -108,11 +108,6 @@ bool hci_req_stop_discovery(struct hci_request *req);
 
 int hci_req_configure_datapath(struct hci_dev *hdev, struct bt_codec *codec);
 
-static inline void hci_req_update_scan(struct hci_dev *hdev)
-{
-	queue_work(hdev->req_workqueue, &hdev->scan_update);
-}
-
 void __hci_req_update_scan(struct hci_request *req);
 
 int hci_update_random_address(struct hci_request *req, bool require_privacy,
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 1fbeee970aa7..46a04488d614 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -2236,6 +2236,16 @@ int hci_update_passive_scan_sync(struct hci_dev *hdev)
 	return err;
 }
 
+static int update_scan_sync(struct hci_dev *hdev, void *data)
+{
+	return hci_update_scan_sync(hdev);
+}
+
+int hci_update_scan(struct hci_dev *hdev)
+{
+	return hci_cmd_sync_queue(hdev, update_scan_sync, NULL, NULL);
+}
+
 static int update_passive_scan_sync(struct hci_dev *hdev, void *data)
 {
 	return hci_update_passive_scan_sync(hdev);
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index aa651129b714..153206763f00 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -1611,7 +1611,7 @@ static int set_connectable_update_settings(struct hci_dev *hdev,
 		return err;
 
 	if (changed) {
-		hci_req_update_scan(hdev);
+		hci_update_scan(hdev);
 		hci_update_passive_scan(hdev);
 		return new_settings(hdev, sk);
 	}
@@ -7081,7 +7081,7 @@ static int add_device(struct sock *sk, struct hci_dev *hdev,
 		if (err)
 			goto unlock;
 
-		hci_req_update_scan(hdev);
+		hci_update_scan(hdev);
 
 		goto added;
 	}
@@ -7193,7 +7193,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev,
 				goto unlock;
 			}
 
-			hci_req_update_scan(hdev);
+			hci_update_scan(hdev);
 
 			device_removed(sk, hdev, &cp->addr.bdaddr,
 				       cp->addr.type);
@@ -7257,7 +7257,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev,
 			kfree(b);
 		}
 
-		hci_req_update_scan(hdev);
+		hci_update_scan(hdev);
 
 		list_for_each_entry_safe(p, tmp, &hdev->le_conn_params, list) {
 			if (p->auto_connect == HCI_AUTO_CONN_DISABLED)
-- 
cgit 


From f96f644ab97abeed3f7007c953836a574ce928cc Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Tue, 19 Jul 2022 17:21:23 -0700
Subject: ftrace: Add modify_ftrace_direct_multi_nolock

This is similar to modify_ftrace_direct_multi, but does not acquire
direct_mutex. This is useful when direct_mutex is already locked by the
user.

Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/bpf/20220720002126.803253-2-song@kernel.org
---
 include/linux/ftrace.h |  5 +++
 kernel/trace/ftrace.c  | 86 ++++++++++++++++++++++++++++++++++++--------------
 2 files changed, 67 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 979f6bfa2c25..acb35243ce5d 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -340,6 +340,7 @@ unsigned long ftrace_find_rec_direct(unsigned long ip);
 int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr);
 int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr);
 int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr);
+int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsigned long addr);
 
 #else
 struct ftrace_ops;
@@ -384,6 +385,10 @@ static inline int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned lo
 {
 	return -ENODEV;
 }
+static inline int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsigned long addr)
+{
+	return -ENODEV;
+}
 #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
 
 #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 601ccf1b2f09..5d67dc12231d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5691,22 +5691,8 @@ int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
 }
 EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi);
 
-/**
- * modify_ftrace_direct_multi - Modify an existing direct 'multi' call
- * to call something else
- * @ops: The address of the struct ftrace_ops object
- * @addr: The address of the new trampoline to call at @ops functions
- *
- * This is used to unregister currently registered direct caller and
- * register new one @addr on functions registered in @ops object.
- *
- * Note there's window between ftrace_shutdown and ftrace_startup calls
- * where there will be no callbacks called.
- *
- * Returns: zero on success. Non zero on error, which includes:
- *  -EINVAL - The @ops object was not properly registered.
- */
-int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+static int
+__modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
 {
 	struct ftrace_hash *hash;
 	struct ftrace_func_entry *entry, *iter;
@@ -5717,12 +5703,7 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
 	int i, size;
 	int err;
 
-	if (check_direct_multi(ops))
-		return -EINVAL;
-	if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
-		return -EINVAL;
-
-	mutex_lock(&direct_mutex);
+	lockdep_assert_held_once(&direct_mutex);
 
 	/* Enable the tmp_ops to have the same functions as the direct ops */
 	ftrace_ops_init(&tmp_ops);
@@ -5730,7 +5711,7 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
 
 	err = register_ftrace_function(&tmp_ops);
 	if (err)
-		goto out_direct;
+		return err;
 
 	/*
 	 * Now the ftrace_ops_list_func() is called to do the direct callers.
@@ -5754,7 +5735,64 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
 	/* Removing the tmp_ops will add the updated direct callers to the functions */
 	unregister_ftrace_function(&tmp_ops);
 
- out_direct:
+	return err;
+}
+
+/**
+ * modify_ftrace_direct_multi_nolock - Modify an existing direct 'multi' call
+ * to call something else
+ * @ops: The address of the struct ftrace_ops object
+ * @addr: The address of the new trampoline to call at @ops functions
+ *
+ * This is used to unregister currently registered direct caller and
+ * register new one @addr on functions registered in @ops object.
+ *
+ * Note there's window between ftrace_shutdown and ftrace_startup calls
+ * where there will be no callbacks called.
+ *
+ * Caller should already have direct_mutex locked, so we don't lock
+ * direct_mutex here.
+ *
+ * Returns: zero on success. Non zero on error, which includes:
+ *  -EINVAL - The @ops object was not properly registered.
+ */
+int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsigned long addr)
+{
+	if (check_direct_multi(ops))
+		return -EINVAL;
+	if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+		return -EINVAL;
+
+	return __modify_ftrace_direct_multi(ops, addr);
+}
+EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi_nolock);
+
+/**
+ * modify_ftrace_direct_multi - Modify an existing direct 'multi' call
+ * to call something else
+ * @ops: The address of the struct ftrace_ops object
+ * @addr: The address of the new trampoline to call at @ops functions
+ *
+ * This is used to unregister currently registered direct caller and
+ * register new one @addr on functions registered in @ops object.
+ *
+ * Note there's window between ftrace_shutdown and ftrace_startup calls
+ * where there will be no callbacks called.
+ *
+ * Returns: zero on success. Non zero on error, which includes:
+ *  -EINVAL - The @ops object was not properly registered.
+ */
+int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+{
+	int err;
+
+	if (check_direct_multi(ops))
+		return -EINVAL;
+	if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+		return -EINVAL;
+
+	mutex_lock(&direct_mutex);
+	err = __modify_ftrace_direct_multi(ops, addr);
 	mutex_unlock(&direct_mutex);
 	return err;
 }
-- 
cgit 


From 53cd885bc5c3ea283cc9c00ca6446c778f00bfba Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Tue, 19 Jul 2022 17:21:24 -0700
Subject: ftrace: Allow IPMODIFY and DIRECT ops on the same function

IPMODIFY (livepatch) and DIRECT (bpf trampoline) ops are both important
users of ftrace. It is necessary to allow them work on the same function
at the same time.

First, DIRECT ops no longer specify IPMODIFY flag. Instead, DIRECT flag is
handled together with IPMODIFY flag in __ftrace_hash_update_ipmodify().

Then, a callback function, ops_func, is added to ftrace_ops. This is used
by ftrace core code to understand whether the DIRECT ops can share with an
IPMODIFY ops. To share with IPMODIFY ops, the DIRECT ops need to implement
the callback function and adjust the direct trampoline accordingly.

If DIRECT ops is attached before the IPMODIFY ops, ftrace core code calls
ENABLE_SHARE_IPMODIFY_PEER on the DIRECT ops before registering the
IPMODIFY ops.

If IPMODIFY ops is attached before the DIRECT ops, ftrace core code calls
ENABLE_SHARE_IPMODIFY_SELF in __ftrace_hash_update_ipmodify. Owner of the
DIRECT ops may return 0 if the DIRECT trampoline can share with IPMODIFY,
so error code otherwise. The error code is propagated to
register_ftrace_direct_multi so that onwer of the DIRECT trampoline can
handle it properly.

For more details, please refer to comment before enum ftrace_ops_cmd.

Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/all/20220602193706.2607681-2-song@kernel.org/
Link: https://lore.kernel.org/all/20220718055449.3960512-1-song@kernel.org/
Link: https://lore.kernel.org/bpf/20220720002126.803253-3-song@kernel.org
---
 include/linux/ftrace.h |  38 ++++++++
 kernel/trace/ftrace.c  | 242 +++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 254 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index acb35243ce5d..0b61371e287b 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -208,6 +208,43 @@ enum {
 	FTRACE_OPS_FL_DIRECT			= BIT(17),
 };
 
+/*
+ * FTRACE_OPS_CMD_* commands allow the ftrace core logic to request changes
+ * to a ftrace_ops. Note, the requests may fail.
+ *
+ * ENABLE_SHARE_IPMODIFY_SELF - enable a DIRECT ops to work on the same
+ *                              function as an ops with IPMODIFY. Called
+ *                              when the DIRECT ops is being registered.
+ *                              This is called with both direct_mutex and
+ *                              ftrace_lock are locked.
+ *
+ * ENABLE_SHARE_IPMODIFY_PEER - enable a DIRECT ops to work on the same
+ *                              function as an ops with IPMODIFY. Called
+ *                              when the other ops (the one with IPMODIFY)
+ *                              is being registered.
+ *                              This is called with direct_mutex locked.
+ *
+ * DISABLE_SHARE_IPMODIFY_PEER - disable a DIRECT ops to work on the same
+ *                               function as an ops with IPMODIFY. Called
+ *                               when the other ops (the one with IPMODIFY)
+ *                               is being unregistered.
+ *                               This is called with direct_mutex locked.
+ */
+enum ftrace_ops_cmd {
+	FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF,
+	FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER,
+	FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER,
+};
+
+/*
+ * For most ftrace_ops_cmd,
+ * Returns:
+ *        0 - Success.
+ *        Negative on failure. The return value is dependent on the
+ *        callback.
+ */
+typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, enum ftrace_ops_cmd cmd);
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 /* The hash used to know what functions callbacks trace */
 struct ftrace_ops_hash {
@@ -250,6 +287,7 @@ struct ftrace_ops {
 	unsigned long			trampoline;
 	unsigned long			trampoline_size;
 	struct list_head		list;
+	ftrace_ops_func_t		ops_func;
 #endif
 };
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5d67dc12231d..bc921a3f7ea8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1861,6 +1861,8 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
 	ftrace_hash_rec_update_modify(ops, filter_hash, 1);
 }
 
+static bool ops_references_ip(struct ftrace_ops *ops, unsigned long ip);
+
 /*
  * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK
  * or no-needed to update, -EBUSY if it detects a conflict of the flag
@@ -1869,6 +1871,13 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
  *  - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected)
  *  - If the hash is EMPTY_HASH, it hits nothing
  *  - Anything else hits the recs which match the hash entries.
+ *
+ * DIRECT ops does not have IPMODIFY flag, but we still need to check it
+ * against functions with FTRACE_FL_IPMODIFY. If there is any overlap, call
+ * ops_func(SHARE_IPMODIFY_SELF) to make sure current ops can share with
+ * IPMODIFY. If ops_func(SHARE_IPMODIFY_SELF) returns non-zero, propagate
+ * the return value to the caller and eventually to the owner of the DIRECT
+ * ops.
  */
 static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
 					 struct ftrace_hash *old_hash,
@@ -1877,17 +1886,26 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec, *end = NULL;
 	int in_old, in_new;
+	bool is_ipmodify, is_direct;
 
 	/* Only update if the ops has been registered */
 	if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
 		return 0;
 
-	if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+	is_ipmodify = ops->flags & FTRACE_OPS_FL_IPMODIFY;
+	is_direct = ops->flags & FTRACE_OPS_FL_DIRECT;
+
+	/* neither IPMODIFY nor DIRECT, skip */
+	if (!is_ipmodify && !is_direct)
+		return 0;
+
+	if (WARN_ON_ONCE(is_ipmodify && is_direct))
 		return 0;
 
 	/*
-	 * Since the IPMODIFY is a very address sensitive action, we do not
-	 * allow ftrace_ops to set all functions to new hash.
+	 * Since the IPMODIFY and DIRECT are very address sensitive
+	 * actions, we do not allow ftrace_ops to set all functions to new
+	 * hash.
 	 */
 	if (!new_hash || !old_hash)
 		return -EINVAL;
@@ -1905,12 +1923,32 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
 			continue;
 
 		if (in_new) {
-			/* New entries must ensure no others are using it */
-			if (rec->flags & FTRACE_FL_IPMODIFY)
-				goto rollback;
-			rec->flags |= FTRACE_FL_IPMODIFY;
-		} else /* Removed entry */
+			if (rec->flags & FTRACE_FL_IPMODIFY) {
+				int ret;
+
+				/* Cannot have two ipmodify on same rec */
+				if (is_ipmodify)
+					goto rollback;
+
+				FTRACE_WARN_ON(rec->flags & FTRACE_FL_DIRECT);
+
+				/*
+				 * Another ops with IPMODIFY is already
+				 * attached. We are now attaching a direct
+				 * ops. Run SHARE_IPMODIFY_SELF, to check
+				 * whether sharing is supported.
+				 */
+				if (!ops->ops_func)
+					return -EBUSY;
+				ret = ops->ops_func(ops, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF);
+				if (ret)
+					return ret;
+			} else if (is_ipmodify) {
+				rec->flags |= FTRACE_FL_IPMODIFY;
+			}
+		} else if (is_ipmodify) {
 			rec->flags &= ~FTRACE_FL_IPMODIFY;
+		}
 	} while_for_each_ftrace_rec();
 
 	return 0;
@@ -2454,8 +2492,7 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip,
 
 struct ftrace_ops direct_ops = {
 	.func		= call_direct_funcs,
-	.flags		= FTRACE_OPS_FL_IPMODIFY
-			  | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
+	.flags		= FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
 			  | FTRACE_OPS_FL_PERMANENT,
 	/*
 	 * By declaring the main trampoline as this trampoline
@@ -3072,14 +3109,14 @@ static inline int ops_traces_mod(struct ftrace_ops *ops)
 }
 
 /*
- * Check if the current ops references the record.
+ * Check if the current ops references the given ip.
  *
  * If the ops traces all functions, then it was already accounted for.
  * If the ops does not trace the current record function, skip it.
  * If the ops ignores the function via notrace filter, skip it.
  */
-static inline bool
-ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+static bool
+ops_references_ip(struct ftrace_ops *ops, unsigned long ip)
 {
 	/* If ops isn't enabled, ignore it */
 	if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
@@ -3091,16 +3128,29 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
 
 	/* The function must be in the filter */
 	if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
-	    !__ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))
+	    !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip))
 		return false;
 
 	/* If in notrace hash, we ignore it too */
-	if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip))
+	if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip))
 		return false;
 
 	return true;
 }
 
+/*
+ * Check if the current ops references the record.
+ *
+ * If the ops traces all functions, then it was already accounted for.
+ * If the ops does not trace the current record function, skip it.
+ * If the ops ignores the function via notrace filter, skip it.
+ */
+static bool
+ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+	return ops_references_ip(ops, rec->ip);
+}
+
 static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
 {
 	bool init_nop = ftrace_need_init_nop();
@@ -5215,6 +5265,8 @@ static struct ftrace_direct_func *ftrace_alloc_direct_func(unsigned long addr)
 	return direct;
 }
 
+static int register_ftrace_function_nolock(struct ftrace_ops *ops);
+
 /**
  * register_ftrace_direct - Call a custom trampoline directly
  * @ip: The address of the nop at the beginning of a function
@@ -5286,7 +5338,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)
 	ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0);
 
 	if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) {
-		ret = register_ftrace_function(&direct_ops);
+		ret = register_ftrace_function_nolock(&direct_ops);
 		if (ret)
 			ftrace_set_filter_ip(&direct_ops, ip, 1, 0);
 	}
@@ -5545,8 +5597,7 @@ int modify_ftrace_direct(unsigned long ip,
 }
 EXPORT_SYMBOL_GPL(modify_ftrace_direct);
 
-#define MULTI_FLAGS (FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_DIRECT | \
-		     FTRACE_OPS_FL_SAVE_REGS)
+#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS)
 
 static int check_direct_multi(struct ftrace_ops *ops)
 {
@@ -5639,7 +5690,7 @@ int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
 	ops->flags = MULTI_FLAGS;
 	ops->trampoline = FTRACE_REGS_ADDR;
 
-	err = register_ftrace_function(ops);
+	err = register_ftrace_function_nolock(ops);
 
  out_remove:
 	if (err)
@@ -5709,7 +5760,7 @@ __modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
 	ftrace_ops_init(&tmp_ops);
 	tmp_ops.func_hash = ops->func_hash;
 
-	err = register_ftrace_function(&tmp_ops);
+	err = register_ftrace_function_nolock(&tmp_ops);
 	if (err)
 		return err;
 
@@ -8003,6 +8054,143 @@ int ftrace_is_dead(void)
 	return ftrace_disabled;
 }
 
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+/*
+ * When registering ftrace_ops with IPMODIFY, it is necessary to make sure
+ * it doesn't conflict with any direct ftrace_ops. If there is existing
+ * direct ftrace_ops on a kernel function being patched, call
+ * FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER on it to enable sharing.
+ *
+ * @ops:     ftrace_ops being registered.
+ *
+ * Returns:
+ *         0 on success;
+ *         Negative on failure.
+ */
+static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops)
+{
+	struct ftrace_func_entry *entry;
+	struct ftrace_hash *hash;
+	struct ftrace_ops *op;
+	int size, i, ret;
+
+	lockdep_assert_held_once(&direct_mutex);
+
+	if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+		return 0;
+
+	hash = ops->func_hash->filter_hash;
+	size = 1 << hash->size_bits;
+	for (i = 0; i < size; i++) {
+		hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+			unsigned long ip = entry->ip;
+			bool found_op = false;
+
+			mutex_lock(&ftrace_lock);
+			do_for_each_ftrace_op(op, ftrace_ops_list) {
+				if (!(op->flags & FTRACE_OPS_FL_DIRECT))
+					continue;
+				if (ops_references_ip(op, ip)) {
+					found_op = true;
+					break;
+				}
+			} while_for_each_ftrace_op(op);
+			mutex_unlock(&ftrace_lock);
+
+			if (found_op) {
+				if (!op->ops_func)
+					return -EBUSY;
+
+				ret = op->ops_func(op, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER);
+				if (ret)
+					return ret;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Similar to prepare_direct_functions_for_ipmodify, clean up after ops
+ * with IPMODIFY is unregistered. The cleanup is optional for most DIRECT
+ * ops.
+ */
+static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops)
+{
+	struct ftrace_func_entry *entry;
+	struct ftrace_hash *hash;
+	struct ftrace_ops *op;
+	int size, i;
+
+	if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+		return;
+
+	mutex_lock(&direct_mutex);
+
+	hash = ops->func_hash->filter_hash;
+	size = 1 << hash->size_bits;
+	for (i = 0; i < size; i++) {
+		hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+			unsigned long ip = entry->ip;
+			bool found_op = false;
+
+			mutex_lock(&ftrace_lock);
+			do_for_each_ftrace_op(op, ftrace_ops_list) {
+				if (!(op->flags & FTRACE_OPS_FL_DIRECT))
+					continue;
+				if (ops_references_ip(op, ip)) {
+					found_op = true;
+					break;
+				}
+			} while_for_each_ftrace_op(op);
+			mutex_unlock(&ftrace_lock);
+
+			/* The cleanup is optional, ignore any errors */
+			if (found_op && op->ops_func)
+				op->ops_func(op, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER);
+		}
+	}
+	mutex_unlock(&direct_mutex);
+}
+
+#define lock_direct_mutex()	mutex_lock(&direct_mutex)
+#define unlock_direct_mutex()	mutex_unlock(&direct_mutex)
+
+#else  /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
+static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops)
+{
+	return 0;
+}
+
+static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops)
+{
+}
+
+#define lock_direct_mutex()	do { } while (0)
+#define unlock_direct_mutex()	do { } while (0)
+
+#endif  /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
+/*
+ * Similar to register_ftrace_function, except we don't lock direct_mutex.
+ */
+static int register_ftrace_function_nolock(struct ftrace_ops *ops)
+{
+	int ret;
+
+	ftrace_ops_init(ops);
+
+	mutex_lock(&ftrace_lock);
+
+	ret = ftrace_startup(ops, 0);
+
+	mutex_unlock(&ftrace_lock);
+
+	return ret;
+}
+
 /**
  * register_ftrace_function - register a function for profiling
  * @ops:	ops structure that holds the function for profiling.
@@ -8018,14 +8206,15 @@ int register_ftrace_function(struct ftrace_ops *ops)
 {
 	int ret;
 
-	ftrace_ops_init(ops);
-
-	mutex_lock(&ftrace_lock);
-
-	ret = ftrace_startup(ops, 0);
+	lock_direct_mutex();
+	ret = prepare_direct_functions_for_ipmodify(ops);
+	if (ret < 0)
+		goto out_unlock;
 
-	mutex_unlock(&ftrace_lock);
+	ret = register_ftrace_function_nolock(ops);
 
+out_unlock:
+	unlock_direct_mutex();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(register_ftrace_function);
@@ -8044,6 +8233,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
 	ret = ftrace_shutdown(ops, 0);
 	mutex_unlock(&ftrace_lock);
 
+	cleanup_direct_functions_after_ipmodify(ops);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(unregister_ftrace_function);
-- 
cgit 


From 316cba62dfb7878b7353177e6a7da9cc0c979cde Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Tue, 19 Jul 2022 17:21:25 -0700
Subject: bpf, x64: Allow to use caller address from stack

Currently we call the original function by using the absolute address
given at the JIT generation. That's not usable when having trampoline
attached to multiple functions, or the target address changes dynamically
(in case of live patch). In such cases we need to take the return address
from the stack.

Adding support to retrieve the original function address from the stack
by adding new BPF_TRAMP_F_ORIG_STACK flag for arch_prepare_bpf_trampoline
function.

Basically we take the return address of the 'fentry' call:

   function + 0: call fentry    # stores 'function + 5' address on stack
   function + 5: ...

The 'function + 5' address will be used as the address for the
original function to call.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220720002126.803253-4-song@kernel.org
---
 arch/x86/net/bpf_jit_comp.c | 13 +++++++++----
 include/linux/bpf.h         |  5 +++++
 2 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 54c7f46c453f..e1b0c5ed0b7c 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -2119,10 +2119,15 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 	if (flags & BPF_TRAMP_F_CALL_ORIG) {
 		restore_regs(m, &prog, nr_args, regs_off);
 
-		/* call original function */
-		if (emit_call(&prog, orig_call, prog)) {
-			ret = -EINVAL;
-			goto cleanup;
+		if (flags & BPF_TRAMP_F_ORIG_STACK) {
+			emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8);
+			EMIT2(0xff, 0xd0); /* call *rax */
+		} else {
+			/* call original function */
+			if (emit_call(&prog, orig_call, prog)) {
+				ret = -EINVAL;
+				goto cleanup;
+			}
 		}
 		/* remember return value in a stack for bpf prog to access */
 		emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a97751d845c9..1d22df8bf306 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -751,6 +751,11 @@ struct btf_func_model {
 /* Return the return value of fentry prog. Only used by bpf_struct_ops. */
 #define BPF_TRAMP_F_RET_FENTRY_RET	BIT(4)
 
+/* Get original function from stack instead of from provided direct address.
+ * Makes sense for trampolines with fexit or fmod_ret programs.
+ */
+#define BPF_TRAMP_F_ORIG_STACK		BIT(5)
+
 /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
  * bytes on x86.
  */
-- 
cgit 


From 00963a2e75a872e5fce4d0115ac2786ec86b57a6 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Tue, 19 Jul 2022 17:21:26 -0700
Subject: bpf: Support bpf_trampoline on functions with IPMODIFY (e.g.
 livepatch)

When tracing a function with IPMODIFY ftrace_ops (livepatch), the bpf
trampoline must follow the instruction pointer saved on stack. This needs
extra handling for bpf trampolines with BPF_TRAMP_F_CALL_ORIG flag.

Implement bpf_tramp_ftrace_ops_func and use it for the ftrace_ops used
by BPF trampoline. This enables tracing functions with livepatch.

This also requires moving bpf trampoline to *_ftrace_direct_mult APIs.

Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/all/20220602193706.2607681-2-song@kernel.org/
Link: https://lore.kernel.org/bpf/20220720002126.803253-5-song@kernel.org
---
 include/linux/bpf.h     |   8 +++
 kernel/bpf/trampoline.c | 158 ++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 149 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1d22df8bf306..20c26aed7896 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -47,6 +47,7 @@ struct kobject;
 struct mem_cgroup;
 struct module;
 struct bpf_func_state;
+struct ftrace_ops;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -756,6 +757,11 @@ struct btf_func_model {
  */
 #define BPF_TRAMP_F_ORIG_STACK		BIT(5)
 
+/* This trampoline is on a function with another ftrace_ops with IPMODIFY,
+ * e.g., a live patch. This flag is set and cleared by ftrace call backs,
+ */
+#define BPF_TRAMP_F_SHARE_IPMODIFY	BIT(6)
+
 /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
  * bytes on x86.
  */
@@ -838,9 +844,11 @@ struct bpf_tramp_image {
 struct bpf_trampoline {
 	/* hlist for trampoline_table */
 	struct hlist_node hlist;
+	struct ftrace_ops *fops;
 	/* serializes access to fields of this trampoline */
 	struct mutex mutex;
 	refcount_t refcnt;
+	u32 flags;
 	u64 key;
 	struct {
 		struct btf_func_model model;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 6691dbf9e467..42e387a12694 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -13,6 +13,7 @@
 #include <linux/static_call.h>
 #include <linux/bpf_verifier.h>
 #include <linux/bpf_lsm.h>
+#include <linux/delay.h>
 
 /* dummy _ops. The verifier will operate on target program's ops. */
 const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -29,6 +30,81 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
 /* serializes access to trampoline_table */
 static DEFINE_MUTEX(trampoline_mutex);
 
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex);
+
+static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd)
+{
+	struct bpf_trampoline *tr = ops->private;
+	int ret = 0;
+
+	if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) {
+		/* This is called inside register_ftrace_direct_multi(), so
+		 * tr->mutex is already locked.
+		 */
+		lockdep_assert_held_once(&tr->mutex);
+
+		/* Instead of updating the trampoline here, we propagate
+		 * -EAGAIN to register_ftrace_direct_multi(). Then we can
+		 * retry register_ftrace_direct_multi() after updating the
+		 * trampoline.
+		 */
+		if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
+		    !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) {
+			if (WARN_ON_ONCE(tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY))
+				return -EBUSY;
+
+			tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY;
+			return -EAGAIN;
+		}
+
+		return 0;
+	}
+
+	/* The normal locking order is
+	 *    tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c)
+	 *
+	 * The following two commands are called from
+	 *
+	 *   prepare_direct_functions_for_ipmodify
+	 *   cleanup_direct_functions_after_ipmodify
+	 *
+	 * In both cases, direct_mutex is already locked. Use
+	 * mutex_trylock(&tr->mutex) to avoid deadlock in race condition
+	 * (something else is making changes to this same trampoline).
+	 */
+	if (!mutex_trylock(&tr->mutex)) {
+		/* sleep 1 ms to make sure whatever holding tr->mutex makes
+		 * some progress.
+		 */
+		msleep(1);
+		return -EAGAIN;
+	}
+
+	switch (cmd) {
+	case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER:
+		tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY;
+
+		if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
+		    !(tr->flags & BPF_TRAMP_F_ORIG_STACK))
+			ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
+		break;
+	case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER:
+		tr->flags &= ~BPF_TRAMP_F_SHARE_IPMODIFY;
+
+		if (tr->flags & BPF_TRAMP_F_ORIG_STACK)
+			ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	};
+
+	mutex_unlock(&tr->mutex);
+	return ret;
+}
+#endif
+
 bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
 {
 	enum bpf_attach_type eatype = prog->expected_attach_type;
@@ -89,6 +165,16 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
 	tr = kzalloc(sizeof(*tr), GFP_KERNEL);
 	if (!tr)
 		goto out;
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+	tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL);
+	if (!tr->fops) {
+		kfree(tr);
+		tr = NULL;
+		goto out;
+	}
+	tr->fops->private = tr;
+	tr->fops->ops_func = bpf_tramp_ftrace_ops_func;
+#endif
 
 	tr->key = key;
 	INIT_HLIST_NODE(&tr->hlist);
@@ -128,7 +214,7 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
 	int ret;
 
 	if (tr->func.ftrace_managed)
-		ret = unregister_ftrace_direct((long)ip, (long)old_addr);
+		ret = unregister_ftrace_direct_multi(tr->fops, (long)old_addr);
 	else
 		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
 
@@ -137,15 +223,20 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
 	return ret;
 }
 
-static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr)
+static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr,
+			 bool lock_direct_mutex)
 {
 	void *ip = tr->func.addr;
 	int ret;
 
-	if (tr->func.ftrace_managed)
-		ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr);
-	else
+	if (tr->func.ftrace_managed) {
+		if (lock_direct_mutex)
+			ret = modify_ftrace_direct_multi(tr->fops, (long)new_addr);
+		else
+			ret = modify_ftrace_direct_multi_nolock(tr->fops, (long)new_addr);
+	} else {
 		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
+	}
 	return ret;
 }
 
@@ -163,10 +254,12 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
 	if (bpf_trampoline_module_get(tr))
 		return -ENOENT;
 
-	if (tr->func.ftrace_managed)
-		ret = register_ftrace_direct((long)ip, (long)new_addr);
-	else
+	if (tr->func.ftrace_managed) {
+		ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 0);
+		ret = register_ftrace_direct_multi(tr->fops, (long)new_addr);
+	} else {
 		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
+	}
 
 	if (ret)
 		bpf_trampoline_module_put(tr);
@@ -332,11 +425,11 @@ out:
 	return ERR_PTR(err);
 }
 
-static int bpf_trampoline_update(struct bpf_trampoline *tr)
+static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex)
 {
 	struct bpf_tramp_image *im;
 	struct bpf_tramp_links *tlinks;
-	u32 flags = BPF_TRAMP_F_RESTORE_REGS;
+	u32 orig_flags = tr->flags;
 	bool ip_arg = false;
 	int err, total;
 
@@ -358,18 +451,31 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
 		goto out;
 	}
 
+	/* clear all bits except SHARE_IPMODIFY */
+	tr->flags &= BPF_TRAMP_F_SHARE_IPMODIFY;
+
 	if (tlinks[BPF_TRAMP_FEXIT].nr_links ||
-	    tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links)
+	    tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) {
 		/* NOTE: BPF_TRAMP_F_RESTORE_REGS and BPF_TRAMP_F_SKIP_FRAME
 		 * should not be set together.
 		 */
-		flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
+		tr->flags |= BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
+	} else {
+		tr->flags |= BPF_TRAMP_F_RESTORE_REGS;
+	}
 
 	if (ip_arg)
-		flags |= BPF_TRAMP_F_IP_ARG;
+		tr->flags |= BPF_TRAMP_F_IP_ARG;
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+again:
+	if ((tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) &&
+	    (tr->flags & BPF_TRAMP_F_CALL_ORIG))
+		tr->flags |= BPF_TRAMP_F_ORIG_STACK;
+#endif
 
 	err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
-					  &tr->func.model, flags, tlinks,
+					  &tr->func.model, tr->flags, tlinks,
 					  tr->func.addr);
 	if (err < 0)
 		goto out;
@@ -378,17 +484,34 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
 	WARN_ON(!tr->cur_image && tr->selector);
 	if (tr->cur_image)
 		/* progs already running at this address */
-		err = modify_fentry(tr, tr->cur_image->image, im->image);
+		err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex);
 	else
 		/* first time registering */
 		err = register_fentry(tr, im->image);
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+	if (err == -EAGAIN) {
+		/* -EAGAIN from bpf_tramp_ftrace_ops_func. Now
+		 * BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the
+		 * trampoline again, and retry register.
+		 */
+		/* reset fops->func and fops->trampoline for re-register */
+		tr->fops->func = NULL;
+		tr->fops->trampoline = 0;
+		goto again;
+	}
+#endif
 	if (err)
 		goto out;
+
 	if (tr->cur_image)
 		bpf_tramp_image_put(tr->cur_image);
 	tr->cur_image = im;
 	tr->selector++;
 out:
+	/* If any error happens, restore previous flags */
+	if (err)
+		tr->flags = orig_flags;
 	kfree(tlinks);
 	return err;
 }
@@ -454,7 +577,7 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_tr
 
 	hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]);
 	tr->progs_cnt[kind]++;
-	err = bpf_trampoline_update(tr);
+	err = bpf_trampoline_update(tr, true /* lock_direct_mutex */);
 	if (err) {
 		hlist_del_init(&link->tramp_hlist);
 		tr->progs_cnt[kind]--;
@@ -487,7 +610,7 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_
 	}
 	hlist_del_init(&link->tramp_hlist);
 	tr->progs_cnt[kind]--;
-	return bpf_trampoline_update(tr);
+	return bpf_trampoline_update(tr, true /* lock_direct_mutex */);
 }
 
 /* bpf_trampoline_unlink_prog() should never fail. */
@@ -715,6 +838,7 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
 	 * multiple rcu callbacks.
 	 */
 	hlist_del(&tr->hlist);
+	kfree(tr->fops);
 	kfree(tr);
 out:
 	mutex_unlock(&trampoline_mutex);
-- 
cgit 


From dfe6d5c3ec23c5b999261d989059aa35403d791d Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Fri, 4 Feb 2022 13:04:38 -0800
Subject: Bluetooth: hci_core: Introduce hci_recv_event_data

This introduces hci_recv_event_data to make it simpler to access the
contents of last received event rather than having to pass its contents
to the likes of *_ind/*_cfm callbacks.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |  2 ++
 net/bluetooth/hci_core.c         | 32 ++++++++++++++++++++++++++++++++
 net/bluetooth/hci_event.c        |  3 +++
 3 files changed, 37 insertions(+)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index df65b107b596..6569b123b49e 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -525,6 +525,7 @@ struct hci_dev {
 	struct sk_buff_head	cmd_q;
 
 	struct sk_buff		*sent_cmd;
+	struct sk_buff		*recv_event;
 
 	struct mutex		req_lock;
 	wait_queue_head_t	req_wait_q;
@@ -1747,6 +1748,7 @@ void hci_send_acl(struct hci_chan *chan, struct sk_buff *skb, __u16 flags);
 void hci_send_sco(struct hci_conn *conn, struct sk_buff *skb);
 
 void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode);
+void *hci_recv_event_data(struct hci_dev *hdev, __u8 event);
 
 u32 hci_conn_get_phy(struct hci_conn *conn);
 
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index e42824e82758..924c29517918 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2712,6 +2712,7 @@ void hci_release_dev(struct hci_dev *hdev)
 
 	ida_simple_remove(&hci_index_ida, hdev->id);
 	kfree_skb(hdev->sent_cmd);
+	kfree_skb(hdev->recv_event);
 	kfree(hdev);
 }
 EXPORT_SYMBOL(hci_release_dev);
@@ -3018,6 +3019,37 @@ void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode)
 	return hdev->sent_cmd->data + HCI_COMMAND_HDR_SIZE;
 }
 
+/* Get data from last received event */
+void *hci_recv_event_data(struct hci_dev *hdev, __u8 event)
+{
+	struct hci_event_hdr *hdr;
+	int offset;
+
+	if (!hdev->recv_event)
+		return NULL;
+
+	hdr = (void *)hdev->recv_event->data;
+	offset = sizeof(*hdr);
+
+	if (hdr->evt != event) {
+		/* In case of LE metaevent check the subevent match */
+		if (hdr->evt == HCI_EV_LE_META) {
+			struct hci_ev_le_meta *ev;
+
+			ev = (void *)hdev->recv_event->data + offset;
+			offset += sizeof(*ev);
+			if (ev->subevent == event)
+				goto found;
+		}
+		return NULL;
+	}
+
+found:
+	bt_dev_dbg(hdev, "event 0x%2.2x", event);
+
+	return hdev->recv_event->data + offset;
+}
+
 /* Send ACL data */
 static void hci_add_acl_hdr(struct sk_buff *skb, __u16 handle, __u16 flags)
 {
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index b32ca92fc692..bab52ee99b11 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -6936,6 +6936,9 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
 		goto done;
 	}
 
+	kfree_skb(hdev->recv_event);
+	hdev->recv_event = skb_clone(skb, GFP_KERNEL);
+
 	event = hdr->evt;
 	if (!event) {
 		bt_dev_warn(hdev, "Received unexpected HCI Event 0x%2.2x",
-- 
cgit 


From 189c6c33ff421def040b904fb14ef76c5bf5af4c Mon Sep 17 00:00:00 2001
From: Niklas Schnelle <schnelle@linux.ibm.com>
Date: Tue, 28 Jun 2022 16:30:59 +0200
Subject: PCI: Extend isolated function probing to s390

Like the jailhouse hypervisor, s390's PCI architecture allows passing
isolated PCI functions to a guest OS instance. As of now this is was not
utilized even with multi-function support as the s390 PCI code makes sure
that only virtual PCI busses including a function with devfn 0 are
presented to the PCI subsystem. A subsequent change will remove this
restriction.

Allow probing such functions by replacing the existing check for
jailhouse_paravirt() with a new hypervisor_isolated_pci_functions() helper.

Link: https://lore.kernel.org/r/20220628143100.3228092-5-schnelle@linux.ibm.com
Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Pierre Morel <pmorel@linux.ibm.com>
Cc: Jan Kiszka <jan.kiszka@siemens.com>
---
 drivers/pci/probe.c        | 2 +-
 include/linux/hypervisor.h | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 1f91ed67b5c5..4948531481d1 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2668,7 +2668,7 @@ int pci_scan_slot(struct pci_bus *bus, int devfn)
 			 * a hypervisor that passes through individual PCI
 			 * functions.
 			 */
-			if (!jailhouse_paravirt())
+			if (!hypervisor_isolated_pci_functions())
 				break;
 		}
 		fn = next_fn(bus, dev, fn);
diff --git a/include/linux/hypervisor.h b/include/linux/hypervisor.h
index fc08b433c856..9efbc54e35e5 100644
--- a/include/linux/hypervisor.h
+++ b/include/linux/hypervisor.h
@@ -32,4 +32,12 @@ static inline bool jailhouse_paravirt(void)
 
 #endif /* !CONFIG_X86 */
 
+static inline bool hypervisor_isolated_pci_functions(void)
+{
+	if (IS_ENABLED(CONFIG_S390))
+		return true;
+
+	return jailhouse_paravirt();
+}
+
 #endif /* __LINUX_HYPEVISOR_H */
-- 
cgit 


From 4d8f24eeedc58d5f87b650ddda73c16e8ba56559 Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Thu, 21 Jul 2022 20:44:04 +0000
Subject: Revert "tcp: change pingpong threshold to 3"

This reverts commit 4a41f453bedfd5e9cd040bad509d9da49feb3e2c.

This to-be-reverted commit was meant to apply a stricter rule for the
stack to enter pingpong mode. However, the condition used to check for
interactive session "before(tp->lsndtime, icsk->icsk_ack.lrcvtime)" is
jiffy based and might be too coarse, which delays the stack entering
pingpong mode.
We revert this patch so that we no longer use the above condition to
determine interactive session, and also reduce pingpong threshold to 1.

Fixes: 4a41f453bedf ("tcp: change pingpong threshold to 3")
Reported-by: LemmyHuang <hlm3280@163.com>
Suggested-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20220721204404.388396-1-weiwan@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/inet_connection_sock.h | 10 +---------
 net/ipv4/tcp_output.c              | 15 ++++++---------
 2 files changed, 7 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 85cd695e7fd1..ee88f0f1350f 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -321,7 +321,7 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
 
 struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu);
 
-#define TCP_PINGPONG_THRESH	3
+#define TCP_PINGPONG_THRESH	1
 
 static inline void inet_csk_enter_pingpong_mode(struct sock *sk)
 {
@@ -338,14 +338,6 @@ static inline bool inet_csk_in_pingpong_mode(struct sock *sk)
 	return inet_csk(sk)->icsk_ack.pingpong >= TCP_PINGPONG_THRESH;
 }
 
-static inline void inet_csk_inc_pingpong_cnt(struct sock *sk)
-{
-	struct inet_connection_sock *icsk = inet_csk(sk);
-
-	if (icsk->icsk_ack.pingpong < U8_MAX)
-		icsk->icsk_ack.pingpong++;
-}
-
 static inline bool inet_csk_has_ulp(struct sock *sk)
 {
 	return inet_sk(sk)->is_icsk && !!inet_csk(sk)->icsk_ulp_ops;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index cf6713c9567e..2efe41c84ee8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -167,16 +167,13 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
 	if (tcp_packets_in_flight(tp) == 0)
 		tcp_ca_event(sk, CA_EVENT_TX_START);
 
-	/* If this is the first data packet sent in response to the
-	 * previous received data,
-	 * and it is a reply for ato after last received packet,
-	 * increase pingpong count.
-	 */
-	if (before(tp->lsndtime, icsk->icsk_ack.lrcvtime) &&
-	    (u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
-		inet_csk_inc_pingpong_cnt(sk);
-
 	tp->lsndtime = now;
+
+	/* If it is a reply for ato after last received
+	 * packet, enter pingpong mode.
+	 */
+	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
+		inet_csk_enter_pingpong_mode(sk);
 }
 
 /* Account for an ACK we sent. */
-- 
cgit 


From ae85b23c65dbb079c83d8a08ff7699215f104e42 Mon Sep 17 00:00:00 2001
From: Stafford Horne <shorne@gmail.com>
Date: Sat, 23 Jul 2022 06:49:41 +0900
Subject: PCI: Remove pci_get_legacy_ide_irq() and asm-generic/pci.h

pci_get_legacy_ide_irq() is only used on platforms that support PNP, so
many architectures define it but never use it.  Replace uses of it with
ATA_PRIMARY_IRQ() and ATA_SECONDARY_IRQ(), which provide the same
functionality.

Since pci_get_legacy_ide_irq() is no longer used, remove all the
architecture-specific definitions of it as well as asm-generic/pci.h, which
only provides pci_get_legacy_ide_irq()

[bhelgaas: commit log]
Co-developed-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20220722214944.831438-2-shorne@gmail.com
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Stafford Horne <shorne@gmail.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Pierre Morel <pmorel@linux.ibm.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/alpha/include/asm/pci.h   |  6 ------
 arch/arm/include/asm/pci.h     |  5 -----
 arch/arm64/include/asm/pci.h   |  6 ------
 arch/csky/include/asm/pci.h    |  6 ------
 arch/ia64/include/asm/pci.h    |  6 ------
 arch/m68k/include/asm/pci.h    |  2 --
 arch/mips/include/asm/pci.h    |  6 ------
 arch/parisc/include/asm/pci.h  |  5 -----
 arch/powerpc/include/asm/pci.h |  1 -
 arch/riscv/include/asm/pci.h   |  6 ------
 arch/s390/include/asm/pci.h    |  1 -
 arch/sh/include/asm/pci.h      |  6 ------
 arch/sparc/include/asm/pci.h   |  9 ---------
 arch/um/include/asm/pci.h      |  8 --------
 arch/x86/include/asm/pci.h     |  3 ---
 arch/xtensa/include/asm/pci.h  |  3 ---
 drivers/pnp/resource.c         |  5 +++--
 include/asm-generic/pci.h      | 17 -----------------
 18 files changed, 3 insertions(+), 98 deletions(-)
 delete mode 100644 include/asm-generic/pci.h

(limited to 'include')

diff --git a/arch/alpha/include/asm/pci.h b/arch/alpha/include/asm/pci.h
index cf6bc1e64d66..6312656279d7 100644
--- a/arch/alpha/include/asm/pci.h
+++ b/arch/alpha/include/asm/pci.h
@@ -56,12 +56,6 @@ struct pci_controller {
 
 /* IOMMU controls.  */
 
-/* TODO: integrate with include/asm-generic/pci.h ? */
-static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
-{
-	return channel ? 15 : 14;
-}
-
 #define pci_domain_nr(bus) ((struct pci_controller *)(bus)->sysdata)->index
 
 static inline int pci_proc_domain(struct pci_bus *bus)
diff --git a/arch/arm/include/asm/pci.h b/arch/arm/include/asm/pci.h
index 68e6f25784a4..5916b88d4c94 100644
--- a/arch/arm/include/asm/pci.h
+++ b/arch/arm/include/asm/pci.h
@@ -22,11 +22,6 @@ static inline int pci_proc_domain(struct pci_bus *bus)
 #define HAVE_PCI_MMAP
 #define ARCH_GENERIC_PCI_MMAP_RESOURCE
 
-static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
-{
-	return channel ? 15 : 14;
-}
-
 extern void pcibios_report_status(unsigned int status_mask, int warn);
 
 #endif /* __KERNEL__ */
diff --git a/arch/arm64/include/asm/pci.h b/arch/arm64/include/asm/pci.h
index b33ca260e3c9..0aebc3488c32 100644
--- a/arch/arm64/include/asm/pci.h
+++ b/arch/arm64/include/asm/pci.h
@@ -23,12 +23,6 @@
 extern int isa_dma_bridge_buggy;
 
 #ifdef CONFIG_PCI
-static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
-{
-	/* no legacy IRQ on arm64 */
-	return -ENODEV;
-}
-
 static inline int pci_proc_domain(struct pci_bus *bus)
 {
 	return 1;
diff --git a/arch/csky/include/asm/pci.h b/arch/csky/include/asm/pci.h
index ebc765b1f78b..0535f1aaae38 100644
--- a/arch/csky/include/asm/pci.h
+++ b/arch/csky/include/asm/pci.h
@@ -18,12 +18,6 @@
 extern int isa_dma_bridge_buggy;
 
 #ifdef CONFIG_PCI
-static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
-{
-	/* no legacy IRQ on csky */
-	return -ENODEV;
-}
-
 static inline int pci_proc_domain(struct pci_bus *bus)
 {
 	/* always show the domain in /proc */
diff --git a/arch/ia64/include/asm/pci.h b/arch/ia64/include/asm/pci.h
index 8c163d1d0189..fa8f545c24c9 100644
--- a/arch/ia64/include/asm/pci.h
+++ b/arch/ia64/include/asm/pci.h
@@ -63,10 +63,4 @@ static inline int pci_proc_domain(struct pci_bus *bus)
 	return (pci_domain_nr(bus) != 0);
 }
 
-#define HAVE_ARCH_PCI_GET_LEGACY_IDE_IRQ
-static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
-{
-	return channel ? isa_irq_to_vector(15) : isa_irq_to_vector(14);
-}
-
 #endif /* _ASM_IA64_PCI_H */
diff --git a/arch/m68k/include/asm/pci.h b/arch/m68k/include/asm/pci.h
index 5a4bc223743b..ccdfa0dc8413 100644
--- a/arch/m68k/include/asm/pci.h
+++ b/arch/m68k/include/asm/pci.h
@@ -2,8 +2,6 @@
 #ifndef _ASM_M68K_PCI_H
 #define _ASM_M68K_PCI_H
 
-#include <asm-generic/pci.h>
-
 #define	pcibios_assign_all_busses()	1
 
 #define	PCIBIOS_MIN_IO		0x00000100
diff --git a/arch/mips/include/asm/pci.h b/arch/mips/include/asm/pci.h
index 9ffc8192adae..3fd6e22c108b 100644
--- a/arch/mips/include/asm/pci.h
+++ b/arch/mips/include/asm/pci.h
@@ -139,10 +139,4 @@ static inline int pci_proc_domain(struct pci_bus *bus)
 /* Do platform specific device initialization at pci_enable_device() time */
 extern int pcibios_plat_dev_init(struct pci_dev *dev);
 
-/* Chances are this interrupt is wired PC-style ...  */
-static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
-{
-	return channel ? 15 : 14;
-}
-
 #endif /* _ASM_PCI_H */
diff --git a/arch/parisc/include/asm/pci.h b/arch/parisc/include/asm/pci.h
index f14465b84de4..127ed5021ae3 100644
--- a/arch/parisc/include/asm/pci.h
+++ b/arch/parisc/include/asm/pci.h
@@ -162,11 +162,6 @@ extern void pcibios_init_bridge(struct pci_dev *);
 #define PCIBIOS_MIN_IO          0x10
 #define PCIBIOS_MIN_MEM         0x1000 /* NBPG - but pci/setup-res.c dies */
 
-static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
-{
-	return channel ? 15 : 14;
-}
-
 #define HAVE_PCI_MMAP
 #define ARCH_GENERIC_PCI_MMAP_RESOURCE
 
diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index 915d6ee4b40a..f9da506751bb 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -39,7 +39,6 @@
 #define pcibios_assign_all_busses() \
 	(pci_has_flag(PCI_REASSIGN_ALL_BUS))
 
-#define HAVE_ARCH_PCI_GET_LEGACY_IDE_IRQ
 static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
 {
 	if (ppc_md.pci_get_legacy_ide_irq)
diff --git a/arch/riscv/include/asm/pci.h b/arch/riscv/include/asm/pci.h
index 7fd52a30e605..a7b8f0d0df7f 100644
--- a/arch/riscv/include/asm/pci.h
+++ b/arch/riscv/include/asm/pci.h
@@ -23,12 +23,6 @@
 extern int isa_dma_bridge_buggy;
 
 #ifdef CONFIG_PCI
-static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
-{
-	/* no legacy IRQ on risc-v */
-	return -ENODEV;
-}
-
 static inline int pci_proc_domain(struct pci_bus *bus)
 {
 	/* always show the domain in /proc */
diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index fdb9745ee998..5889ddcbc374 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -6,7 +6,6 @@
 #include <linux/mutex.h>
 #include <linux/iommu.h>
 #include <linux/pci_hotplug.h>
-#include <asm-generic/pci.h>
 #include <asm/pci_clp.h>
 #include <asm/pci_debug.h>
 #include <asm/sclp.h>
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index ad22e88c6657..54c30126ea17 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -88,10 +88,4 @@ static inline int pci_proc_domain(struct pci_bus *bus)
 	return hose->need_domain_info;
 }
 
-/* Chances are this interrupt is wired PC-style ...  */
-static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
-{
-	return channel ? 15 : 14;
-}
-
 #endif /* __ASM_SH_PCI_H */
diff --git a/arch/sparc/include/asm/pci.h b/arch/sparc/include/asm/pci.h
index 4deddf430e5d..0c58f65bd172 100644
--- a/arch/sparc/include/asm/pci.h
+++ b/arch/sparc/include/asm/pci.h
@@ -40,13 +40,4 @@ static inline int pci_proc_domain(struct pci_bus *bus)
 #define get_pci_unmapped_area get_fb_unmapped_area
 #endif /* CONFIG_SPARC64 */
 
-#if defined(CONFIG_SPARC64) || defined(CONFIG_LEON_PCI)
-static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
-{
-	return PCI_IRQ_NONE;
-}
-#else
-#include <asm-generic/pci.h>
-#endif
-
 #endif /* ___ASM_SPARC_PCI_H */
diff --git a/arch/um/include/asm/pci.h b/arch/um/include/asm/pci.h
index da13fd5519ef..26b96c02ef61 100644
--- a/arch/um/include/asm/pci.h
+++ b/arch/um/include/asm/pci.h
@@ -11,14 +11,6 @@
 
 extern int isa_dma_bridge_buggy;
 
-#ifdef CONFIG_PCI
-static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
-{
-	/* no legacy IRQs */
-	return -ENODEV;
-}
-#endif
-
 #ifdef CONFIG_PCI_DOMAINS
 static inline int pci_proc_domain(struct pci_bus *bus)
 {
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index f3fd5928bcbb..736793d65bcb 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -105,9 +105,6 @@ static inline void early_quirks(void) { }
 
 extern void pci_iommu_alloc(void);
 
-/* generic pci stuff */
-#include <asm-generic/pci.h>
-
 #ifdef CONFIG_NUMA
 /* Returns the node based on pci bus */
 static inline int __pcibus_to_node(const struct pci_bus *bus)
diff --git a/arch/xtensa/include/asm/pci.h b/arch/xtensa/include/asm/pci.h
index 8e2b48a268db..b56de9635b6c 100644
--- a/arch/xtensa/include/asm/pci.h
+++ b/arch/xtensa/include/asm/pci.h
@@ -43,7 +43,4 @@
 #define ARCH_GENERIC_PCI_MMAP_RESOURCE	1
 #define arch_can_pci_mmap_io()		1
 
-/* Generic PCI */
-#include <asm-generic/pci.h>
-
 #endif	/* _XTENSA_PCI_H */
diff --git a/drivers/pnp/resource.c b/drivers/pnp/resource.c
index 2fa0f7d55259..8f7695624c8c 100644
--- a/drivers/pnp/resource.c
+++ b/drivers/pnp/resource.c
@@ -17,6 +17,7 @@
 #include <asm/dma.h>
 #include <asm/irq.h>
 #include <linux/pci.h>
+#include <linux/libata.h>
 #include <linux/ioport.h>
 #include <linux/init.h>
 
@@ -322,8 +323,8 @@ static int pci_dev_uses_irq(struct pnp_dev *pnp, struct pci_dev *pci,
 		 * treat the compatibility IRQs as busy.
 		 */
 		if ((progif & 0x5) != 0x5)
-			if (pci_get_legacy_ide_irq(pci, 0) == irq ||
-			    pci_get_legacy_ide_irq(pci, 1) == irq) {
+			if (ATA_PRIMARY_IRQ(pci) == irq ||
+			    ATA_SECONDARY_IRQ(pci) == irq) {
 				pnp_dbg(&pnp->dev, "  legacy IDE device %s "
 					"using irq %d\n", pci_name(pci), irq);
 				return 1;
diff --git a/include/asm-generic/pci.h b/include/asm-generic/pci.h
deleted file mode 100644
index 6bb3cd3d695a..000000000000
--- a/include/asm-generic/pci.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * linux/include/asm-generic/pci.h
- *
- *  Copyright (C) 2003 Russell King
- */
-#ifndef _ASM_GENERIC_PCI_H
-#define _ASM_GENERIC_PCI_H
-
-#ifndef HAVE_ARCH_PCI_GET_LEGACY_IDE_IRQ
-static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
-{
-	return channel ? 15 : 14;
-}
-#endif /* HAVE_ARCH_PCI_GET_LEGACY_IDE_IRQ */
-
-#endif /* _ASM_GENERIC_PCI_H */
-- 
cgit 


From abb4970ac33514c84b143516583eaf8cc47abd67 Mon Sep 17 00:00:00 2001
From: Stafford Horne <shorne@gmail.com>
Date: Sat, 23 Jul 2022 06:49:42 +0900
Subject: PCI: Move isa_dma_bridge_buggy out of asm/dma.h

The isa_dma_bridge_buggy symbol is only used for x86_32, and only x86_32
platforms or quirks ever set it.

Add a new linux/isa-dma.h header that #defines isa_dma_bridge_buggy to 0
except on x86_32, where we keep it as a variable, and remove all the arch-
specific definitions.

[bhelgaas: commit log]
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Suggested-by: Christoph Hellwig <hch@infradead.org>
Link: https://lore.kernel.org/r/20220722214944.831438-3-shorne@gmail.com
Signed-off-by: Stafford Horne <shorne@gmail.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/alpha/include/asm/dma.h           |  9 ---------
 arch/arc/include/asm/dma.h             |  5 -----
 arch/arm/include/asm/dma.h             |  6 ------
 arch/arm64/include/asm/pci.h           |  2 --
 arch/csky/include/asm/pci.h            |  2 --
 arch/ia64/include/asm/dma.h            |  2 --
 arch/m68k/include/asm/dma.h            |  6 ------
 arch/microblaze/include/asm/dma.h      |  6 ------
 arch/mips/include/asm/dma.h            |  8 --------
 arch/parisc/include/asm/dma.h          |  6 ------
 arch/powerpc/include/asm/dma.h         |  6 ------
 arch/riscv/include/asm/pci.h           |  2 --
 arch/s390/include/asm/dma.h            |  6 ------
 arch/sh/include/asm/dma.h              |  6 ------
 arch/sparc/include/asm/dma.h           |  8 --------
 arch/um/include/asm/pci.h              |  2 --
 arch/x86/include/asm/dma.h             |  8 --------
 arch/xtensa/include/asm/dma.h          |  7 -------
 drivers/comedi/drivers/comedi_isadma.c |  2 +-
 drivers/pci/pci.c                      |  2 ++
 drivers/pci/quirks.c                   |  4 +++-
 include/linux/isa-dma.h                | 14 ++++++++++++++
 sound/core/isadma.c                    |  2 +-
 23 files changed, 21 insertions(+), 100 deletions(-)
 create mode 100644 include/linux/isa-dma.h

(limited to 'include')

diff --git a/arch/alpha/include/asm/dma.h b/arch/alpha/include/asm/dma.h
index 28610ea7786d..a04d76b96089 100644
--- a/arch/alpha/include/asm/dma.h
+++ b/arch/alpha/include/asm/dma.h
@@ -365,13 +365,4 @@ extern void free_dma(unsigned int dmanr);	/* release it again */
 #define KERNEL_HAVE_CHECK_DMA
 extern int check_dma(unsigned int dmanr);
 
-/* From PCI */
-
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy 	(0)
-#endif
-
-
 #endif /* _ASM_DMA_H */
diff --git a/arch/arc/include/asm/dma.h b/arch/arc/include/asm/dma.h
index 5b744f4b10a7..02431027ed2f 100644
--- a/arch/arc/include/asm/dma.h
+++ b/arch/arc/include/asm/dma.h
@@ -7,10 +7,5 @@
 #define ASM_ARC_DMA_H
 
 #define MAX_DMA_ADDRESS 0xC0000000
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy	0
-#endif
 
 #endif
diff --git a/arch/arm/include/asm/dma.h b/arch/arm/include/asm/dma.h
index a81dda65c576..907d139be431 100644
--- a/arch/arm/include/asm/dma.h
+++ b/arch/arm/include/asm/dma.h
@@ -143,10 +143,4 @@ extern int  get_dma_residue(unsigned int chan);
 
 #endif /* CONFIG_ISA_DMA_API */
 
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy    (0)
-#endif
-
 #endif /* __ASM_ARM_DMA_H */
diff --git a/arch/arm64/include/asm/pci.h b/arch/arm64/include/asm/pci.h
index 0aebc3488c32..682c922b5658 100644
--- a/arch/arm64/include/asm/pci.h
+++ b/arch/arm64/include/asm/pci.h
@@ -20,8 +20,6 @@
 #define arch_can_pci_mmap_wc() 1
 #define ARCH_GENERIC_PCI_MMAP_RESOURCE	1
 
-extern int isa_dma_bridge_buggy;
-
 #ifdef CONFIG_PCI
 static inline int pci_proc_domain(struct pci_bus *bus)
 {
diff --git a/arch/csky/include/asm/pci.h b/arch/csky/include/asm/pci.h
index 0535f1aaae38..5c02454ec724 100644
--- a/arch/csky/include/asm/pci.h
+++ b/arch/csky/include/asm/pci.h
@@ -15,8 +15,6 @@
 /* C-SKY shim does not initialize PCI bus */
 #define pcibios_assign_all_busses() 1
 
-extern int isa_dma_bridge_buggy;
-
 #ifdef CONFIG_PCI
 static inline int pci_proc_domain(struct pci_bus *bus)
 {
diff --git a/arch/ia64/include/asm/dma.h b/arch/ia64/include/asm/dma.h
index 59625e9c1f9c..eaed2626ffda 100644
--- a/arch/ia64/include/asm/dma.h
+++ b/arch/ia64/include/asm/dma.h
@@ -12,8 +12,6 @@
 
 extern unsigned long MAX_DMA_ADDRESS;
 
-extern int isa_dma_bridge_buggy;
-
 #define free_dma(x)
 
 #endif /* _ASM_IA64_DMA_H */
diff --git a/arch/m68k/include/asm/dma.h b/arch/m68k/include/asm/dma.h
index f6c5e0dfb4e5..1c8d9c5bc2fa 100644
--- a/arch/m68k/include/asm/dma.h
+++ b/arch/m68k/include/asm/dma.h
@@ -6,10 +6,4 @@
    bootmem allocator (but this should do it for this) */
 #define MAX_DMA_ADDRESS PAGE_OFFSET
 
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy    (0)
-#endif
-
 #endif /* _M68K_DMA_H */
diff --git a/arch/microblaze/include/asm/dma.h b/arch/microblaze/include/asm/dma.h
index f801582be912..7484c9eb66c4 100644
--- a/arch/microblaze/include/asm/dma.h
+++ b/arch/microblaze/include/asm/dma.h
@@ -9,10 +9,4 @@
 /* Virtual address corresponding to last available physical memory address.  */
 #define MAX_DMA_ADDRESS (CONFIG_KERNEL_START + memory_size - 1)
 
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy     (0)
-#endif
-
 #endif /* _ASM_MICROBLAZE_DMA_H */
diff --git a/arch/mips/include/asm/dma.h b/arch/mips/include/asm/dma.h
index be726b943530..d6186e6bea7e 100644
--- a/arch/mips/include/asm/dma.h
+++ b/arch/mips/include/asm/dma.h
@@ -307,12 +307,4 @@ static __inline__ int get_dma_residue(unsigned int dmanr)
 extern int request_dma(unsigned int dmanr, const char * device_id);	/* reserve a DMA channel */
 extern void free_dma(unsigned int dmanr);	/* release it again */
 
-/* From PCI */
-
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy	(0)
-#endif
-
 #endif /* _ASM_DMA_H */
diff --git a/arch/parisc/include/asm/dma.h b/arch/parisc/include/asm/dma.h
index eea80ed34e6d..9e8c101de902 100644
--- a/arch/parisc/include/asm/dma.h
+++ b/arch/parisc/include/asm/dma.h
@@ -176,10 +176,4 @@ static __inline__ void set_dma_count(unsigned int dmanr, unsigned int count)
 
 #define free_dma(dmanr)
 
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy 	(0)
-#endif
-
 #endif /* _ASM_DMA_H */
diff --git a/arch/powerpc/include/asm/dma.h b/arch/powerpc/include/asm/dma.h
index 6161a9596196..d97c66d9ae34 100644
--- a/arch/powerpc/include/asm/dma.h
+++ b/arch/powerpc/include/asm/dma.h
@@ -340,11 +340,5 @@ extern int request_dma(unsigned int dmanr, const char *device_id);
 /* release it again */
 extern void free_dma(unsigned int dmanr);
 
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy	(0)
-#endif
-
 #endif /* __KERNEL__ */
 #endif	/* _ASM_POWERPC_DMA_H */
diff --git a/arch/riscv/include/asm/pci.h b/arch/riscv/include/asm/pci.h
index a7b8f0d0df7f..f904df586c03 100644
--- a/arch/riscv/include/asm/pci.h
+++ b/arch/riscv/include/asm/pci.h
@@ -20,8 +20,6 @@
 
 #define ARCH_GENERIC_PCI_MMAP_RESOURCE 1
 
-extern int isa_dma_bridge_buggy;
-
 #ifdef CONFIG_PCI
 static inline int pci_proc_domain(struct pci_bus *bus)
 {
diff --git a/arch/s390/include/asm/dma.h b/arch/s390/include/asm/dma.h
index 6f26f35d4a71..dec1c4ce628c 100644
--- a/arch/s390/include/asm/dma.h
+++ b/arch/s390/include/asm/dma.h
@@ -11,10 +11,4 @@
  */
 #define MAX_DMA_ADDRESS         0x80000000
 
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy	(0)
-#endif
-
 #endif /* _ASM_S390_DMA_H */
diff --git a/arch/sh/include/asm/dma.h b/arch/sh/include/asm/dma.h
index 17d23ae98c77..c8bee3f985a2 100644
--- a/arch/sh/include/asm/dma.h
+++ b/arch/sh/include/asm/dma.h
@@ -137,10 +137,4 @@ extern int register_chan_caps(const char *dmac, struct dma_chan_caps *capslist);
 extern int dma_create_sysfs_files(struct dma_channel *, struct dma_info *);
 extern void dma_remove_sysfs_files(struct dma_channel *, struct dma_info *);
 
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy	(0)
-#endif
-
 #endif /* __ASM_SH_DMA_H */
diff --git a/arch/sparc/include/asm/dma.h b/arch/sparc/include/asm/dma.h
index 462e7c794a09..08043f35b110 100644
--- a/arch/sparc/include/asm/dma.h
+++ b/arch/sparc/include/asm/dma.h
@@ -82,14 +82,6 @@
 #define DMA_BURST64      0x40
 #define DMA_BURSTBITS    0x7f
 
-/* From PCI */
-
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy 	(0)
-#endif
-
 #ifdef CONFIG_SPARC32
 struct device;
 
diff --git a/arch/um/include/asm/pci.h b/arch/um/include/asm/pci.h
index 26b96c02ef61..1211855aff34 100644
--- a/arch/um/include/asm/pci.h
+++ b/arch/um/include/asm/pci.h
@@ -9,8 +9,6 @@
 
 #define pcibios_assign_all_busses() 1
 
-extern int isa_dma_bridge_buggy;
-
 #ifdef CONFIG_PCI_DOMAINS
 static inline int pci_proc_domain(struct pci_bus *bus)
 {
diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h
index 8e95aa4b0d17..8ae6e0e11b8b 100644
--- a/arch/x86/include/asm/dma.h
+++ b/arch/x86/include/asm/dma.h
@@ -307,12 +307,4 @@ extern int request_dma(unsigned int dmanr, const char *device_id);
 extern void free_dma(unsigned int dmanr);
 #endif
 
-/* From PCI */
-
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy	(0)
-#endif
-
 #endif /* _ASM_X86_DMA_H */
diff --git a/arch/xtensa/include/asm/dma.h b/arch/xtensa/include/asm/dma.h
index bb099a373b5a..172644539032 100644
--- a/arch/xtensa/include/asm/dma.h
+++ b/arch/xtensa/include/asm/dma.h
@@ -52,11 +52,4 @@
 extern int request_dma(unsigned int dmanr, const char * device_id);
 extern void free_dma(unsigned int dmanr);
 
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy 	(0)
-#endif
-
-
 #endif
diff --git a/drivers/comedi/drivers/comedi_isadma.c b/drivers/comedi/drivers/comedi_isadma.c
index 700982464c53..020b3d1e1ac0 100644
--- a/drivers/comedi/drivers/comedi_isadma.c
+++ b/drivers/comedi/drivers/comedi_isadma.c
@@ -8,7 +8,7 @@
 #include <linux/slab.h>
 #include <linux/delay.h>
 #include <linux/dma-mapping.h>
-#include <asm/dma.h>
+#include <linux/isa-dma.h>
 #include <linux/comedi/comedidev.h>
 #include <linux/comedi/comedi_isadma.h>
 
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index cfaf40a540a8..60c55d2cb2cc 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -41,8 +41,10 @@ const char *pci_power_names[] = {
 };
 EXPORT_SYMBOL_GPL(pci_power_names);
 
+#ifdef CONFIG_X86_32
 int isa_dma_bridge_buggy;
 EXPORT_SYMBOL(isa_dma_bridge_buggy);
+#endif
 
 int pci_pci_problems;
 EXPORT_SYMBOL(pci_pci_problems);
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 41aeaa235132..6fc64509eee7 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -17,6 +17,7 @@
 #include <linux/kernel.h>
 #include <linux/export.h>
 #include <linux/pci.h>
+#include <linux/isa-dma.h> /* isa_dma_bridge_buggy */
 #include <linux/init.h>
 #include <linux/delay.h>
 #include <linux/acpi.h>
@@ -30,7 +31,6 @@
 #include <linux/pm_runtime.h>
 #include <linux/suspend.h>
 #include <linux/switchtec.h>
-#include <asm/dma.h>	/* isa_dma_bridge_buggy */
 #include "pci.h"
 
 static ktime_t fixup_debug_start(struct pci_dev *dev,
@@ -239,6 +239,7 @@ static void quirk_passive_release(struct pci_dev *dev)
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82441,	quirk_passive_release);
 DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82441,	quirk_passive_release);
 
+#ifdef CONFIG_X86_32
 /*
  * The VIA VP2/VP3/MVP3 seem to have some 'features'. There may be a
  * workaround but VIA don't answer queries. If you happen to have good
@@ -265,6 +266,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AL,	PCI_DEVICE_ID_AL_M1533,		quirk_isa_dma
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NEC,	PCI_DEVICE_ID_NEC_CBUS_1,	quirk_isa_dma_hangs);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NEC,	PCI_DEVICE_ID_NEC_CBUS_2,	quirk_isa_dma_hangs);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NEC,	PCI_DEVICE_ID_NEC_CBUS_3,	quirk_isa_dma_hangs);
+#endif
 
 /*
  * Intel NM10 "TigerPoint" LPC PM1a_STS.BM_STS must be clear
diff --git a/include/linux/isa-dma.h b/include/linux/isa-dma.h
new file mode 100644
index 000000000000..61504a8c1b9e
--- /dev/null
+++ b/include/linux/isa-dma.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __LINUX_ISA_DMA_H
+#define __LINUX_ISA_DMA_H
+
+#include <asm/dma.h>
+
+#if defined(CONFIG_PCI) && defined(CONFIG_X86_32)
+extern int isa_dma_bridge_buggy;
+#else
+#define isa_dma_bridge_buggy	(0)
+#endif
+
+#endif /* __LINUX_ISA_DMA_H */
diff --git a/sound/core/isadma.c b/sound/core/isadma.c
index 1f45ede023b4..18a86212e3a8 100644
--- a/sound/core/isadma.c
+++ b/sound/core/isadma.c
@@ -12,8 +12,8 @@
 #undef HAVE_REALLY_SLOW_DMA_CONTROLLER
 
 #include <linux/export.h>
+#include <linux/isa-dma.h>
 #include <sound/core.h>
-#include <asm/dma.h>
 
 /**
  * snd_dma_program - program an ISA DMA transfer
-- 
cgit 


From a2912b45b0826c6fc0ca9b264d03a2dacb7a72e8 Mon Sep 17 00:00:00 2001
From: Stafford Horne <shorne@gmail.com>
Date: Sat, 23 Jul 2022 06:49:44 +0900
Subject: asm-generic: Add new pci.h and use it

The asm/pci.h used for many newer architectures share similar definitions.
Move the common parts to asm-generic/pci.h to allow for sharing code.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/lkml/CAK8P3a0JmPeczfmMBE__vn=Jbvf=nkbpVaZCycyv40pZNCJJXQ@mail.gmail.com/
Link: https://lore.kernel.org/r/20220722214944.831438-5-shorne@gmail.com
Signed-off-by: Stafford Horne <shorne@gmail.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Pierre Morel <pmorel@linux.ibm.com>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/arm64/include/asm/pci.h | 10 ++--------
 arch/csky/include/asm/pci.h  | 17 ++---------------
 arch/riscv/include/asm/pci.h | 23 ++++-------------------
 arch/um/include/asm/pci.h    | 14 ++------------
 include/asm-generic/pci.h    | 30 ++++++++++++++++++++++++++++++
 5 files changed, 40 insertions(+), 54 deletions(-)
 create mode 100644 include/asm-generic/pci.h

(limited to 'include')

diff --git a/arch/arm64/include/asm/pci.h b/arch/arm64/include/asm/pci.h
index 682c922b5658..016eb6b46dc0 100644
--- a/arch/arm64/include/asm/pci.h
+++ b/arch/arm64/include/asm/pci.h
@@ -9,7 +9,6 @@
 #include <asm/io.h>
 
 #define PCIBIOS_MIN_IO		0x1000
-#define PCIBIOS_MIN_MEM		0
 
 /*
  * Set to 1 if the kernel should re-assign all PCI bus numbers
@@ -18,13 +17,8 @@
 	(pci_has_flag(PCI_REASSIGN_ALL_BUS))
 
 #define arch_can_pci_mmap_wc() 1
-#define ARCH_GENERIC_PCI_MMAP_RESOURCE	1
 
-#ifdef CONFIG_PCI
-static inline int pci_proc_domain(struct pci_bus *bus)
-{
-	return 1;
-}
-#endif  /* CONFIG_PCI */
+/* Generic PCI */
+#include <asm-generic/pci.h>
 
 #endif  /* __ASM_PCI_H */
diff --git a/arch/csky/include/asm/pci.h b/arch/csky/include/asm/pci.h
index 875bc028f8f6..42724c630d30 100644
--- a/arch/csky/include/asm/pci.h
+++ b/arch/csky/include/asm/pci.h
@@ -9,20 +9,7 @@
 
 #include <asm/io.h>
 
-#define PCIBIOS_MIN_IO		0
-#define PCIBIOS_MIN_MEM		0
-
-/* C-SKY shim does not initialize PCI bus */
-#define pcibios_assign_all_busses() 1
-
-#define ARCH_GENERIC_PCI_MMAP_RESOURCE	1
-
-#ifdef CONFIG_PCI
-static inline int pci_proc_domain(struct pci_bus *bus)
-{
-	/* always show the domain in /proc */
-	return 1;
-}
-#endif  /* CONFIG_PCI */
+/* Generic PCI */
+#include <asm-generic/pci.h>
 
 #endif  /* __ASM_CSKY_PCI_H */
diff --git a/arch/riscv/include/asm/pci.h b/arch/riscv/include/asm/pci.h
index f904df586c03..6ef4a1426194 100644
--- a/arch/riscv/include/asm/pci.h
+++ b/arch/riscv/include/asm/pci.h
@@ -12,23 +12,7 @@
 
 #include <asm/io.h>
 
-#define PCIBIOS_MIN_IO		0
-#define PCIBIOS_MIN_MEM		0
-
-/* RISC-V shim does not initialize PCI bus */
-#define pcibios_assign_all_busses() 1
-
-#define ARCH_GENERIC_PCI_MMAP_RESOURCE 1
-
-#ifdef CONFIG_PCI
-static inline int pci_proc_domain(struct pci_bus *bus)
-{
-	/* always show the domain in /proc */
-	return 1;
-}
-
-#ifdef	CONFIG_NUMA
-
+#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
 static inline int pcibus_to_node(struct pci_bus *bus)
 {
 	return dev_to_node(&bus->dev);
@@ -38,8 +22,9 @@ static inline int pcibus_to_node(struct pci_bus *bus)
 				 cpu_all_mask :				\
 				 cpumask_of_node(pcibus_to_node(bus)))
 #endif
-#endif	/* CONFIG_NUMA */
+#endif /* defined(CONFIG_PCI) && defined(CONFIG_NUMA) */
 
-#endif  /* CONFIG_PCI */
+/* Generic PCI */
+#include <asm-generic/pci.h>
 
 #endif  /* _ASM_RISCV_PCI_H */
diff --git a/arch/um/include/asm/pci.h b/arch/um/include/asm/pci.h
index 1211855aff34..34fe4921b5fa 100644
--- a/arch/um/include/asm/pci.h
+++ b/arch/um/include/asm/pci.h
@@ -4,18 +4,8 @@
 #include <linux/types.h>
 #include <asm/io.h>
 
-#define PCIBIOS_MIN_IO		0
-#define PCIBIOS_MIN_MEM		0
-
-#define pcibios_assign_all_busses() 1
-
-#ifdef CONFIG_PCI_DOMAINS
-static inline int pci_proc_domain(struct pci_bus *bus)
-{
-	/* always show the domain in /proc */
-	return 1;
-}
-#endif  /* CONFIG_PCI */
+/* Generic PCI */
+#include <asm-generic/pci.h>
 
 #ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
 /*
diff --git a/include/asm-generic/pci.h b/include/asm-generic/pci.h
new file mode 100644
index 000000000000..6869f1061528
--- /dev/null
+++ b/include/asm-generic/pci.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __ASM_GENERIC_PCI_H
+#define __ASM_GENERIC_PCI_H
+
+#ifndef PCIBIOS_MIN_IO
+#define PCIBIOS_MIN_IO		0
+#endif
+
+#ifndef PCIBIOS_MIN_MEM
+#define PCIBIOS_MIN_MEM		0
+#endif
+
+#ifndef pcibios_assign_all_busses
+/* For bootloaders that do not initialize the PCI bus */
+#define pcibios_assign_all_busses() 1
+#endif
+
+/* Enable generic resource mapping code in drivers/pci/ */
+#define ARCH_GENERIC_PCI_MMAP_RESOURCE
+
+#ifdef CONFIG_PCI_DOMAINS
+static inline int pci_proc_domain(struct pci_bus *bus)
+{
+	/* always show the domain in /proc */
+	return 1;
+}
+#endif /* CONFIG_PCI_DOMAINS */
+
+#endif /* __ASM_GENERIC_PCI_H */
-- 
cgit 


From 26afbd826ee326e63a334c37fd45e82e50a615ec Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Mon, 29 Jul 2019 18:15:43 +0300
Subject: Bluetooth: Add initial implementation of CIS connections

This adds the initial implementation of CIS connections and introduces
the ISO packets/links.

== Central: Set CIG Parameters, create a CIS and Setup Data Path ==

> tools/isotest -s <address>

< HCI Command: LE Extended Create... (0x08|0x0043) plen 26
...
> HCI Event: Command Status (0x0f) plen 4
      LE Extended Create Connection (0x08|0x0043) ncmd 1
        Status: Success (0x00)
> HCI Event: LE Meta Event (0x3e) plen 31
      LE Enhanced Connection Complete (0x0a)
      ...
< HCI Command: LE Create Connected... (0x08|0x0064) plen 5
...
> HCI Event: Command Status (0x0f) plen 4
      LE Create Connected Isochronous Stream (0x08|0x0064) ncmd 1
        Status: Success (0x00)
> HCI Event: LE Meta Event (0x3e) plen 29
      LE Connected Isochronous Stream Established (0x19)
      ...
< HCI Command: LE Setup Isochronou.. (0x08|0x006e) plen 13
...
> HCI Event: Command Complete (0x0e) plen 6
      LE Setup Isochronous Data Path (0x08|0x006e) ncmd 1
        Status: Success (0x00)
        Handle: 257
< HCI Command: LE Setup Isochronou.. (0x08|0x006e) plen 13
...
> HCI Event: Command Complete (0x0e) plen 6
      LE Setup Isochronous Data Path (0x08|0x006e) ncmd 1
        Status: Success (0x00)
        Handle: 257

== Peripheral: Accept CIS and Setup Data Path ==

> tools/isotest -d

 HCI Event: LE Meta Event (0x3e) plen 7
      LE Connected Isochronous Stream Request (0x1a)
...
< HCI Command: LE Accept Co.. (0x08|0x0066) plen 2
...
> HCI Event: LE Meta Event (0x3e) plen 29
      LE Connected Isochronous Stream Established (0x19)
...
< HCI Command: LE Setup Is.. (0x08|0x006e) plen 13
...
> HCI Event: Command Complete (0x0e) plen 6
      LE Setup Isochronous Data Path (0x08|0x006e) ncmd 1
        Status: Success (0x00)
        Handle: 257
< HCI Command: LE Setup Is.. (0x08|0x006e) plen 13
...
> HCI Event: Command Complete (0x0e) plen 6
      LE Setup Isochronous Data Path (0x08|0x006e) ncmd 1
        Status: Success (0x00)
        Handle: 257

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/bluetooth.h |  33 ++-
 include/net/bluetooth/hci.h       |  28 ++-
 include/net/bluetooth/hci_core.h  | 107 ++++++++-
 include/net/bluetooth/hci_sock.h  |   2 +
 include/net/bluetooth/hci_sync.h  |   3 +
 net/bluetooth/Kconfig             |   1 +
 net/bluetooth/hci_conn.c          | 440 ++++++++++++++++++++++++++++++++++++++
 net/bluetooth/hci_core.c          | 230 ++++++++++++++++----
 net/bluetooth/hci_event.c         | 307 +++++++++++++++++++++++++-
 net/bluetooth/hci_sync.c          |  49 ++++-
 10 files changed, 1145 insertions(+), 55 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index 686ce2591bb2..72af9722244a 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -55,6 +55,8 @@
 #define BTPROTO_CMTP	5
 #define BTPROTO_HIDP	6
 #define BTPROTO_AVDTP	7
+#define BTPROTO_ISO	8
+#define BTPROTO_LAST	BTPROTO_ISO
 
 #define SOL_HCI		0
 #define SOL_L2CAP	6
@@ -149,10 +151,39 @@ struct bt_voice {
 #define BT_MODE_LE_FLOWCTL	0x03
 #define BT_MODE_EXT_FLOWCTL	0x04
 
-#define BT_PKT_STATUS          16
+#define BT_PKT_STATUS           16
 
 #define BT_SCM_PKT_STATUS	0x03
 
+#define BT_ISO_QOS		17
+
+#define BT_ISO_QOS_CIG_UNSET	0xff
+#define BT_ISO_QOS_CIS_UNSET	0xff
+
+struct bt_iso_io_qos {
+	__u32 interval;
+	__u16 latency;
+	__u16 sdu;
+	__u8  phy;
+	__u8  rtn;
+};
+
+struct bt_iso_qos {
+	__u8  cig;
+	__u8  cis;
+	__u8  sca;
+	__u8  packing;
+	__u8  framing;
+	struct bt_iso_io_qos in;
+	struct bt_iso_io_qos out;
+};
+
+#define BT_ISO_PHY_1M		0x01
+#define BT_ISO_PHY_2M		0x02
+#define BT_ISO_PHY_CODED	0x04
+#define BT_ISO_PHY_ANY		(BT_ISO_PHY_1M | BT_ISO_PHY_2M | \
+				 BT_ISO_PHY_CODED)
+
 #define BT_CODEC	19
 
 struct	bt_codec_caps {
diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 927f51b92854..027f1bc6e4f1 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -1989,7 +1989,7 @@ struct hci_rp_le_read_iso_tx_sync {
 struct hci_cis_params {
 	__u8    cis_id;
 	__le16  c_sdu;
-	__le16  p_pdu;
+	__le16  p_sdu;
 	__u8    c_phy;
 	__u8    p_phy;
 	__u8    c_rtn;
@@ -2000,7 +2000,7 @@ struct hci_cp_le_set_cig_params {
 	__u8    cig_id;
 	__u8    c_interval[3];
 	__u8    p_interval[3];
-	__u8    wc_sca;
+	__u8    sca;
 	__u8    packing;
 	__u8    framing;
 	__le16  c_latency;
@@ -2043,6 +2043,30 @@ struct hci_cp_le_reject_cis {
 	__u8    reason;
 } __packed;
 
+#define HCI_OP_LE_SETUP_ISO_PATH		0x206e
+struct hci_cp_le_setup_iso_path {
+	__le16  handle;
+	__u8    direction;
+	__u8    path;
+	__u8    codec;
+	__le16  codec_cid;
+	__le16  codec_vid;
+	__u8    delay[3];
+	__u8    codec_cfg_len;
+	__u8    codec_cfg[0];
+} __packed;
+
+struct hci_rp_le_setup_iso_path {
+	__u8    status;
+	__le16  handle;
+} __packed;
+
+#define HCI_OP_LE_SET_HOST_FEATURE		0x2074
+struct hci_cp_le_set_host_feature {
+	__u8     bit_number;
+	__u8     bit_value;
+} __packed;
+
 /* ---- HCI Events ---- */
 struct hci_ev_status {
 	__u8    status;
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 6569b123b49e..5fac013ff2b0 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -126,6 +126,7 @@ struct hci_conn_hash {
 	unsigned int     acl_num;
 	unsigned int     amp_num;
 	unsigned int     sco_num;
+	unsigned int     iso_num;
 	unsigned int     le_num;
 	unsigned int     le_num_peripheral;
 };
@@ -474,13 +475,16 @@ struct hci_dev {
 	unsigned int	acl_cnt;
 	unsigned int	sco_cnt;
 	unsigned int	le_cnt;
+	unsigned int	iso_cnt;
 
 	unsigned int	acl_mtu;
 	unsigned int	sco_mtu;
 	unsigned int	le_mtu;
+	unsigned int	iso_mtu;
 	unsigned int	acl_pkts;
 	unsigned int	sco_pkts;
 	unsigned int	le_pkts;
+	unsigned int	iso_pkts;
 
 	__u16		block_len;
 	__u16		block_mtu;
@@ -657,6 +661,7 @@ enum conn_reasons {
 	CONN_REASON_PAIR_DEVICE,
 	CONN_REASON_L2CAP_CHAN,
 	CONN_REASON_SCO_CONNECT,
+	CONN_REASON_ISO_CONNECT,
 };
 
 struct hci_conn {
@@ -709,6 +714,7 @@ struct hci_conn {
 	__s8		rssi;
 	__s8		tx_power;
 	__s8		max_tx_power;
+	struct bt_iso_qos iso_qos;
 	unsigned long	flags;
 
 	enum conn_reasons conn_reason;
@@ -739,6 +745,7 @@ struct hci_conn {
 	struct hci_dev	*hdev;
 	void		*l2cap_data;
 	void		*sco_data;
+	void		*iso_data;
 	struct amp_mgr	*amp_mgr;
 
 	struct hci_conn	*link;
@@ -747,6 +754,8 @@ struct hci_conn {
 	void (*connect_cfm_cb)	(struct hci_conn *conn, u8 status);
 	void (*security_cfm_cb)	(struct hci_conn *conn, u8 status);
 	void (*disconn_cfm_cb)	(struct hci_conn *conn, u8 reason);
+
+	void (*cleanup)(struct hci_conn *conn);
 };
 
 struct hci_chan {
@@ -954,6 +963,9 @@ static inline void hci_conn_hash_add(struct hci_dev *hdev, struct hci_conn *c)
 	case ESCO_LINK:
 		h->sco_num++;
 		break;
+	case ISO_LINK:
+		h->iso_num++;
+		break;
 	}
 }
 
@@ -980,6 +992,9 @@ static inline void hci_conn_hash_del(struct hci_dev *hdev, struct hci_conn *c)
 	case ESCO_LINK:
 		h->sco_num--;
 		break;
+	case ISO_LINK:
+		h->iso_num--;
+		break;
 	}
 }
 
@@ -996,6 +1011,8 @@ static inline unsigned int hci_conn_num(struct hci_dev *hdev, __u8 type)
 	case SCO_LINK:
 	case ESCO_LINK:
 		return h->sco_num;
+	case ISO_LINK:
+		return h->iso_num;
 	default:
 		return 0;
 	}
@@ -1005,7 +1022,7 @@ static inline unsigned int hci_conn_count(struct hci_dev *hdev)
 {
 	struct hci_conn_hash *c = &hdev->conn_hash;
 
-	return c->acl_num + c->amp_num + c->sco_num + c->le_num;
+	return c->acl_num + c->amp_num + c->sco_num + c->le_num + c->iso_num;
 }
 
 static inline __u8 hci_conn_lookup_type(struct hci_dev *hdev, __u16 handle)
@@ -1091,6 +1108,53 @@ static inline struct hci_conn *hci_conn_hash_lookup_le(struct hci_dev *hdev,
 	return NULL;
 }
 
+static inline struct hci_conn *hci_conn_hash_lookup_cis(struct hci_dev *hdev,
+							bdaddr_t *ba,
+							__u8 ba_type)
+{
+	struct hci_conn_hash *h = &hdev->conn_hash;
+	struct hci_conn  *c;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(c, &h->list, list) {
+		if (c->type != ISO_LINK)
+			continue;
+
+		if (ba_type == c->dst_type && !bacmp(&c->dst, ba)) {
+			rcu_read_unlock();
+			return c;
+		}
+	}
+
+	rcu_read_unlock();
+
+	return NULL;
+}
+
+static inline struct hci_conn *hci_conn_hash_lookup_cig(struct hci_dev *hdev,
+							__u8 handle)
+{
+	struct hci_conn_hash *h = &hdev->conn_hash;
+	struct hci_conn  *c;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(c, &h->list, list) {
+		if (c->type != ISO_LINK)
+			continue;
+
+		if (handle == c->iso_qos.cig) {
+			rcu_read_unlock();
+			return c;
+		}
+	}
+
+	rcu_read_unlock();
+
+	return NULL;
+}
+
 static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev,
 							__u8 type, __u16 state)
 {
@@ -1111,6 +1175,27 @@ static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev,
 	return NULL;
 }
 
+typedef void (*hci_conn_func_t)(struct hci_conn *conn, void *data);
+static inline void hci_conn_hash_list_state(struct hci_dev *hdev,
+					    hci_conn_func_t func, __u8 type,
+					    __u16 state, void *data)
+{
+	struct hci_conn_hash *h = &hdev->conn_hash;
+	struct hci_conn  *c;
+
+	if (!func)
+		return;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(c, &h->list, list) {
+		if (c->type == type && c->state == state)
+			func(c, data);
+	}
+
+	rcu_read_unlock();
+}
+
 static inline struct hci_conn *hci_lookup_le_connect(struct hci_dev *hdev)
 {
 	struct hci_conn_hash *h = &hdev->conn_hash;
@@ -1134,6 +1219,8 @@ static inline struct hci_conn *hci_lookup_le_connect(struct hci_dev *hdev)
 int hci_disconnect(struct hci_conn *conn, __u8 reason);
 bool hci_setup_sync(struct hci_conn *conn, __u16 handle);
 void hci_sco_setup(struct hci_conn *conn, __u8 status);
+bool hci_iso_setup_path(struct hci_conn *conn);
+int hci_le_create_cis(struct hci_conn *conn);
 
 struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
 			      u8 role);
@@ -1158,6 +1245,10 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst,
 				 enum conn_reasons conn_reason);
 struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst,
 				 __u16 setting, struct bt_codec *codec);
+struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst,
+			      __u8 dst_type, struct bt_iso_qos *qos);
+struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst,
+				 __u8 dst_type, struct bt_iso_qos *qos);
 int hci_conn_check_link_mode(struct hci_conn *conn);
 int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level);
 int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type,
@@ -1525,6 +1616,15 @@ void hci_conn_del_sysfs(struct hci_conn *conn);
 #define use_enhanced_conn_complete(dev) (ll_privacy_capable(dev) || \
 					 ext_adv_capable(dev))
 
+/* CIS Master/Slave support */
+#define iso_capable(dev) (cis_capable(dev))
+#define cis_capable(dev) \
+	(cis_central_capable(dev) || cis_peripheral_capable(dev))
+#define cis_central_capable(dev) \
+	((dev)->le_features[3] & HCI_LE_CIS_CENTRAL)
+#define cis_peripheral_capable(dev) \
+	((dev)->le_features[3] & HCI_LE_CIS_PERIPHERAL)
+
 /* ----- HCI protocols ----- */
 #define HCI_PROTO_DEFER             0x01
 
@@ -1539,6 +1639,10 @@ static inline int hci_proto_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr,
 	case ESCO_LINK:
 		return sco_connect_ind(hdev, bdaddr, flags);
 
+	case ISO_LINK:
+		/* TODO: Handle connection indication */
+		return -EINVAL;
+
 	default:
 		BT_ERR("unknown link type %d", type);
 		return -EINVAL;
@@ -1746,6 +1850,7 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen,
 		 const void *param);
 void hci_send_acl(struct hci_chan *chan, struct sk_buff *skb, __u16 flags);
 void hci_send_sco(struct hci_conn *conn, struct sk_buff *skb);
+void hci_send_iso(struct hci_conn *conn, struct sk_buff *skb);
 
 void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode);
 void *hci_recv_event_data(struct hci_dev *hdev, __u8 event);
diff --git a/include/net/bluetooth/hci_sock.h b/include/net/bluetooth/hci_sock.h
index 9949870f7d78..0520e21ab698 100644
--- a/include/net/bluetooth/hci_sock.h
+++ b/include/net/bluetooth/hci_sock.h
@@ -124,6 +124,8 @@ struct hci_dev_info {
 	__u16 acl_pkts;
 	__u16 sco_mtu;
 	__u16 sco_pkts;
+	__u16 iso_mtu;
+	__u16 iso_pkts;
 
 	struct hci_dev_stats stat;
 };
diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index 5c4d4cace9c3..c243cb6869d8 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -109,3 +109,6 @@ struct hci_conn;
 int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason);
 
 int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn);
+
+int hci_le_remove_cig_sync(struct hci_dev *hdev, u8 handle);
+int hci_le_remove_cig(struct hci_dev *hdev, u8 handle);
diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig
index e0ab4cd7afc3..ae3bdc6dfc92 100644
--- a/net/bluetooth/Kconfig
+++ b/net/bluetooth/Kconfig
@@ -29,6 +29,7 @@ menuconfig BT
 		SCO audio links
 		L2CAP (Logical Link Control and Adaptation Protocol)
 		SMP (Security Manager Protocol) on LE (Low Energy) links
+		ISO isochronous links
 	     HCI Device drivers (Interface to the hardware)
 	     RFCOMM Module (RFCOMM Protocol)  
 	     BNEP Module (Bluetooth Network Encapsulation Protocol)
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 1ce89c469293..97568723ddb0 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -125,6 +125,9 @@ static void hci_conn_cleanup(struct hci_conn *conn)
 
 	hci_conn_hash_del(hdev, conn);
 
+	if (conn->cleanup)
+		conn->cleanup(conn);
+
 	if (conn->type == SCO_LINK || conn->type == ESCO_LINK) {
 		switch (conn->setting & SCO_AIRMODE_MASK) {
 		case SCO_AIRMODE_CVSD:
@@ -722,6 +725,7 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
 		conn->pkt_type = hdev->pkt_type & ACL_PTYPE_MASK;
 		break;
 	case LE_LINK:
+	case ISO_LINK:
 		/* conn->src should reflect the local identity address */
 		hci_copy_identity_address(hdev, &conn->src, &conn->src_type);
 		break;
@@ -1232,6 +1236,442 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst,
 	return sco;
 }
 
+struct iso_list_data {
+	u8  cig;
+	u8  cis;
+	int count;
+	struct {
+		struct hci_cp_le_set_cig_params cp;
+		struct hci_cis_params cis[0x11];
+	} pdu;
+};
+
+static void cis_add(struct iso_list_data *d, struct bt_iso_qos *qos)
+{
+	struct hci_cis_params *cis = &d->pdu.cis[d->pdu.cp.num_cis];
+
+	cis->cis_id = qos->cis;
+	cis->c_sdu  = cpu_to_le16(qos->out.sdu);
+	cis->p_sdu  = cpu_to_le16(qos->in.sdu);
+	cis->c_phy  = qos->out.phy;
+	cis->p_phy  = qos->in.phy;
+	cis->c_rtn  = qos->out.rtn;
+	cis->p_rtn  = qos->in.rtn;
+
+	d->pdu.cp.num_cis++;
+}
+
+static void cis_list(struct hci_conn *conn, void *data)
+{
+	struct iso_list_data *d = data;
+
+	if (d->cig != conn->iso_qos.cig || d->cis == BT_ISO_QOS_CIS_UNSET ||
+	    d->cis != conn->iso_qos.cis)
+		return;
+
+	d->count++;
+
+	if (d->pdu.cp.cig_id == BT_ISO_QOS_CIG_UNSET ||
+	    d->count >= ARRAY_SIZE(d->pdu.cis))
+		return;
+
+	cis_add(d, &conn->iso_qos);
+}
+
+static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos)
+{
+	struct hci_dev *hdev = conn->hdev;
+	struct iso_list_data data;
+
+	memset(&data, 0, sizeof(data));
+
+	/* Allocate a CIG if not set */
+	if (qos->cig == BT_ISO_QOS_CIG_UNSET) {
+		for (data.cig = 0x00; data.cig < 0xff; data.cig++) {
+			data.count = 0;
+			data.cis = 0xff;
+
+			hci_conn_hash_list_state(hdev, cis_list, ISO_LINK,
+						 BT_BOUND, &data);
+			if (data.count)
+				continue;
+
+			hci_conn_hash_list_state(hdev, cis_list, ISO_LINK,
+						 BT_CONNECTED, &data);
+			if (!data.count)
+				break;
+		}
+
+		if (data.cig == 0xff)
+			return false;
+
+		/* Update CIG */
+		qos->cig = data.cig;
+	}
+
+	data.pdu.cp.cig_id = qos->cig;
+	hci_cpu_to_le24(qos->out.interval, data.pdu.cp.c_interval);
+	hci_cpu_to_le24(qos->in.interval, data.pdu.cp.p_interval);
+	data.pdu.cp.sca = qos->sca;
+	data.pdu.cp.packing = qos->packing;
+	data.pdu.cp.framing = qos->framing;
+	data.pdu.cp.c_latency = cpu_to_le16(qos->out.latency);
+	data.pdu.cp.p_latency = cpu_to_le16(qos->in.latency);
+
+	if (qos->cis != BT_ISO_QOS_CIS_UNSET) {
+		data.count = 0;
+		data.cig = qos->cig;
+		data.cis = qos->cis;
+
+		hci_conn_hash_list_state(hdev, cis_list, ISO_LINK, BT_BOUND,
+					 &data);
+		if (data.count)
+			return false;
+
+		cis_add(&data, qos);
+	}
+
+	/* Reprogram all CIS(s) with the same CIG */
+	for (data.cig = qos->cig, data.cis = 0x00; data.cis < 0x11;
+	     data.cis++) {
+		data.count = 0;
+
+		hci_conn_hash_list_state(hdev, cis_list, ISO_LINK, BT_BOUND,
+					 &data);
+		if (data.count)
+			continue;
+
+		/* Allocate a CIS if not set */
+		if (qos->cis == BT_ISO_QOS_CIS_UNSET) {
+			/* Update CIS */
+			qos->cis = data.cis;
+			cis_add(&data, qos);
+		}
+	}
+
+	if (qos->cis == BT_ISO_QOS_CIS_UNSET || !data.pdu.cp.num_cis)
+		return false;
+
+	if (hci_send_cmd(hdev, HCI_OP_LE_SET_CIG_PARAMS,
+			 sizeof(data.pdu.cp) +
+			 (data.pdu.cp.num_cis * sizeof(*data.pdu.cis)),
+			 &data.pdu) < 0)
+		return false;
+
+	return true;
+}
+
+static void find_cis(struct hci_conn *conn, void *data)
+{
+	struct iso_list_data *d = data;
+
+	/* Ignore broadcast */
+	if (!bacmp(&conn->dst, BDADDR_ANY))
+		return;
+
+	d->count++;
+}
+
+static int remove_cig_sync(struct hci_dev *hdev, void *data)
+{
+	u8 handle = PTR_ERR(data);
+
+	return hci_le_remove_cig_sync(hdev, handle);
+}
+
+int hci_le_remove_cig(struct hci_dev *hdev, u8 handle)
+{
+	bt_dev_dbg(hdev, "handle 0x%2.2x", handle);
+
+	return hci_cmd_sync_queue(hdev, remove_cig_sync, ERR_PTR(handle), NULL);
+}
+
+static void cis_cleanup(struct hci_conn *conn)
+{
+	struct hci_dev *hdev = conn->hdev;
+	struct iso_list_data d;
+
+	memset(&d, 0, sizeof(d));
+	d.cig = conn->iso_qos.cig;
+
+	/* Check if ISO connection is a CIS and remove CIG if there are
+	 * no other connections using it.
+	 */
+	hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, BT_CONNECTED, &d);
+	if (d.count)
+		return;
+
+	hci_le_remove_cig(hdev, conn->iso_qos.cig);
+}
+
+struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst,
+			      __u8 dst_type, struct bt_iso_qos *qos)
+{
+	struct hci_conn *cis;
+
+	cis = hci_conn_hash_lookup_cis(hdev, dst, dst_type);
+	if (!cis) {
+		cis = hci_conn_add(hdev, ISO_LINK, dst, HCI_ROLE_MASTER);
+		if (!cis)
+			return ERR_PTR(-ENOMEM);
+		cis->cleanup = cis_cleanup;
+	}
+
+	if (cis->state == BT_CONNECTED)
+		return cis;
+
+	/* Check if CIS has been set and the settings matches */
+	if (cis->state == BT_BOUND &&
+	    !memcmp(&cis->iso_qos, qos, sizeof(*qos)))
+		return cis;
+
+	/* Update LINK PHYs according to QoS preference */
+	cis->le_tx_phy = qos->out.phy;
+	cis->le_rx_phy = qos->in.phy;
+
+	/* If output interval is not set use the input interval as it cannot be
+	 * 0x000000.
+	 */
+	if (!qos->out.interval)
+		qos->out.interval = qos->in.interval;
+
+	/* If input interval is not set use the output interval as it cannot be
+	 * 0x000000.
+	 */
+	if (!qos->in.interval)
+		qos->in.interval = qos->out.interval;
+
+	/* If output latency is not set use the input latency as it cannot be
+	 * 0x0000.
+	 */
+	if (!qos->out.latency)
+		qos->out.latency = qos->in.latency;
+
+	/* If input latency is not set use the output latency as it cannot be
+	 * 0x0000.
+	 */
+	if (!qos->in.latency)
+		qos->in.latency = qos->out.latency;
+
+	/* Mirror PHYs that are disabled as SDU will be set to 0 */
+	if (!qos->in.phy)
+		qos->in.phy = qos->out.phy;
+
+	if (!qos->out.phy)
+		qos->out.phy = qos->in.phy;
+
+	if (!hci_le_set_cig_params(cis, qos)) {
+		hci_conn_drop(cis);
+		return ERR_PTR(-EINVAL);
+	}
+
+	cis->iso_qos = *qos;
+	cis->state = BT_BOUND;
+
+	return cis;
+}
+
+bool hci_iso_setup_path(struct hci_conn *conn)
+{
+	struct hci_dev *hdev = conn->hdev;
+	struct hci_cp_le_setup_iso_path cmd;
+
+	memset(&cmd, 0, sizeof(cmd));
+
+	if (conn->iso_qos.out.sdu) {
+		cmd.handle = cpu_to_le16(conn->handle);
+		cmd.direction = 0x00; /* Input (Host to Controller) */
+		cmd.path = 0x00; /* HCI path if enabled */
+		cmd.codec = 0x03; /* Transparent Data */
+
+		if (hci_send_cmd(hdev, HCI_OP_LE_SETUP_ISO_PATH, sizeof(cmd),
+				 &cmd) < 0)
+			return false;
+	}
+
+	if (conn->iso_qos.in.sdu) {
+		cmd.handle = cpu_to_le16(conn->handle);
+		cmd.direction = 0x01; /* Output (Controller to Host) */
+		cmd.path = 0x00; /* HCI path if enabled */
+		cmd.codec = 0x03; /* Transparent Data */
+
+		if (hci_send_cmd(hdev, HCI_OP_LE_SETUP_ISO_PATH, sizeof(cmd),
+				 &cmd) < 0)
+			return false;
+	}
+
+	return true;
+}
+
+static int hci_create_cis_sync(struct hci_dev *hdev, void *data)
+{
+	struct {
+		struct hci_cp_le_create_cis cp;
+		struct hci_cis cis[0x1f];
+	} cmd;
+	struct hci_conn *conn = data;
+	u8 cig;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.cis[0].acl_handle = cpu_to_le16(conn->link->handle);
+	cmd.cis[0].cis_handle = cpu_to_le16(conn->handle);
+	cmd.cp.num_cis++;
+	cig = conn->iso_qos.cig;
+
+	hci_dev_lock(hdev);
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) {
+		struct hci_cis *cis = &cmd.cis[cmd.cp.num_cis];
+
+		if (conn == data || conn->type != ISO_LINK ||
+		    conn->state == BT_CONNECTED || conn->iso_qos.cig != cig)
+			continue;
+
+		/* Check if all CIS(s) belonging to a CIG are ready */
+		if (conn->link->state != BT_CONNECTED ||
+		    conn->state != BT_CONNECT) {
+			cmd.cp.num_cis = 0;
+			break;
+		}
+
+		/* Group all CIS with state BT_CONNECT since the spec don't
+		 * allow to send them individually:
+		 *
+		 * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E
+		 * page 2566:
+		 *
+		 * If the Host issues this command before all the
+		 * HCI_LE_CIS_Established events from the previous use of the
+		 * command have been generated, the Controller shall return the
+		 * error code Command Disallowed (0x0C).
+		 */
+		cis->acl_handle = cpu_to_le16(conn->link->handle);
+		cis->cis_handle = cpu_to_le16(conn->handle);
+		cmd.cp.num_cis++;
+	}
+
+	rcu_read_unlock();
+
+	hci_dev_unlock(hdev);
+
+	if (!cmd.cp.num_cis)
+		return 0;
+
+	return hci_send_cmd(hdev, HCI_OP_LE_CREATE_CIS, sizeof(cmd.cp) +
+			    sizeof(cmd.cis[0]) * cmd.cp.num_cis, &cmd);
+}
+
+int hci_le_create_cis(struct hci_conn *conn)
+{
+	struct hci_conn *cis;
+	struct hci_dev *hdev = conn->hdev;
+	int err;
+
+	switch (conn->type) {
+	case LE_LINK:
+		if (!conn->link || conn->state != BT_CONNECTED)
+			return -EINVAL;
+		cis = conn->link;
+		break;
+	case ISO_LINK:
+		cis = conn;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (cis->state == BT_CONNECT)
+		return 0;
+
+	/* Queue Create CIS */
+	err = hci_cmd_sync_queue(hdev, hci_create_cis_sync, cis, NULL);
+	if (err)
+		return err;
+
+	cis->state = BT_CONNECT;
+
+	return 0;
+}
+
+static void hci_iso_qos_setup(struct hci_dev *hdev, struct hci_conn *conn,
+			      struct bt_iso_io_qos *qos, __u8 phy)
+{
+	/* Only set MTU if PHY is enabled */
+	if (!qos->sdu && qos->phy) {
+		if (hdev->iso_mtu > 0)
+			qos->sdu = hdev->iso_mtu;
+		else if (hdev->le_mtu > 0)
+			qos->sdu = hdev->le_mtu;
+		else
+			qos->sdu = hdev->acl_mtu;
+	}
+
+	/* Use the same PHY as ACL if set to any */
+	if (qos->phy == BT_ISO_PHY_ANY)
+		qos->phy = phy;
+
+	/* Use LE ACL connection interval if not set */
+	if (!qos->interval)
+		/* ACL interval unit in 1.25 ms to us */
+		qos->interval = conn->le_conn_interval * 1250;
+
+	/* Use LE ACL connection latency if not set */
+	if (!qos->latency)
+		qos->latency = conn->le_conn_latency;
+}
+
+struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst,
+				 __u8 dst_type, struct bt_iso_qos *qos)
+{
+	struct hci_conn *le;
+	struct hci_conn *cis;
+
+	/* Convert from ISO socket address type to HCI address type  */
+	if (dst_type == BDADDR_LE_PUBLIC)
+		dst_type = ADDR_LE_DEV_PUBLIC;
+	else
+		dst_type = ADDR_LE_DEV_RANDOM;
+
+	if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
+		le = hci_connect_le(hdev, dst, dst_type, false,
+				    BT_SECURITY_LOW,
+				    HCI_LE_CONN_TIMEOUT,
+				    HCI_ROLE_SLAVE);
+	else
+		le = hci_connect_le_scan(hdev, dst, dst_type,
+					 BT_SECURITY_LOW,
+					 HCI_LE_CONN_TIMEOUT,
+					 CONN_REASON_ISO_CONNECT);
+	if (IS_ERR(le))
+		return le;
+
+	hci_iso_qos_setup(hdev, le, &qos->out,
+			  le->le_tx_phy ? le->le_tx_phy : hdev->le_tx_def_phys);
+	hci_iso_qos_setup(hdev, le, &qos->in,
+			  le->le_rx_phy ? le->le_rx_phy : hdev->le_rx_def_phys);
+
+	cis = hci_bind_cis(hdev, dst, dst_type, qos);
+	if (IS_ERR(cis)) {
+		hci_conn_drop(le);
+		return cis;
+	}
+
+	le->link = cis;
+	cis->link = le;
+
+	hci_conn_hold(cis);
+
+	/* If LE is already connected and CIS handle is already set proceed to
+	 * Create CIS immediately.
+	 */
+	if (le->state == BT_CONNECTED && cis->handle != HCI_CONN_HANDLE_UNSET)
+		hci_le_create_cis(le);
+
+	return cis;
+}
+
 /* Check link security requirement */
 int hci_conn_check_link_mode(struct hci_conn *conn)
 {
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 924c29517918..2b7947dca14d 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -616,7 +616,10 @@ static int hci_dev_do_reset(struct hci_dev *hdev)
 	hci_dev_clear_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE);
 
 	atomic_set(&hdev->cmd_cnt, 1);
-	hdev->acl_cnt = 0; hdev->sco_cnt = 0; hdev->le_cnt = 0;
+	hdev->acl_cnt = 0;
+	hdev->sco_cnt = 0;
+	hdev->le_cnt = 0;
+	hdev->iso_cnt = 0;
 
 	ret = hci_reset_sync(hdev);
 
@@ -3157,9 +3160,117 @@ void hci_send_sco(struct hci_conn *conn, struct sk_buff *skb)
 	queue_work(hdev->workqueue, &hdev->tx_work);
 }
 
+/* Send ISO data */
+static void hci_add_iso_hdr(struct sk_buff *skb, __u16 handle, __u8 flags)
+{
+	struct hci_iso_hdr *hdr;
+	int len = skb->len;
+
+	skb_push(skb, HCI_ISO_HDR_SIZE);
+	skb_reset_transport_header(skb);
+	hdr = (struct hci_iso_hdr *)skb_transport_header(skb);
+	hdr->handle = cpu_to_le16(hci_handle_pack(handle, flags));
+	hdr->dlen   = cpu_to_le16(len);
+}
+
+static void hci_queue_iso(struct hci_conn *conn, struct sk_buff_head *queue,
+			  struct sk_buff *skb)
+{
+	struct hci_dev *hdev = conn->hdev;
+	struct sk_buff *list;
+	__u16 flags;
+
+	skb->len = skb_headlen(skb);
+	skb->data_len = 0;
+
+	hci_skb_pkt_type(skb) = HCI_ISODATA_PKT;
+
+	list = skb_shinfo(skb)->frag_list;
+
+	flags = hci_iso_flags_pack(list ? ISO_START : ISO_SINGLE, 0x00);
+	hci_add_iso_hdr(skb, conn->handle, flags);
+
+	if (!list) {
+		/* Non fragmented */
+		BT_DBG("%s nonfrag skb %p len %d", hdev->name, skb, skb->len);
+
+		skb_queue_tail(queue, skb);
+	} else {
+		/* Fragmented */
+		BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len);
+
+		skb_shinfo(skb)->frag_list = NULL;
+
+		__skb_queue_tail(queue, skb);
+
+		do {
+			skb = list; list = list->next;
+
+			hci_skb_pkt_type(skb) = HCI_ISODATA_PKT;
+			flags = hci_iso_flags_pack(list ? ISO_CONT : ISO_END,
+						   0x00);
+			hci_add_iso_hdr(skb, conn->handle, flags);
+
+			BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len);
+
+			__skb_queue_tail(queue, skb);
+		} while (list);
+	}
+}
+
+void hci_send_iso(struct hci_conn *conn, struct sk_buff *skb)
+{
+	struct hci_dev *hdev = conn->hdev;
+
+	BT_DBG("%s len %d", hdev->name, skb->len);
+
+	hci_queue_iso(conn, &conn->data_q, skb);
+
+	queue_work(hdev->workqueue, &hdev->tx_work);
+}
+
 /* ---- HCI TX task (outgoing data) ---- */
 
 /* HCI Connection scheduler */
+static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote)
+{
+	struct hci_dev *hdev;
+	int cnt, q;
+
+	if (!conn) {
+		*quote = 0;
+		return;
+	}
+
+	hdev = conn->hdev;
+
+	switch (conn->type) {
+	case ACL_LINK:
+		cnt = hdev->acl_cnt;
+		break;
+	case AMP_LINK:
+		cnt = hdev->block_cnt;
+		break;
+	case SCO_LINK:
+	case ESCO_LINK:
+		cnt = hdev->sco_cnt;
+		break;
+	case LE_LINK:
+		cnt = hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt;
+		break;
+	case ISO_LINK:
+		cnt = hdev->iso_mtu ? hdev->iso_cnt :
+			hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt;
+		break;
+	default:
+		cnt = 0;
+		bt_dev_err(hdev, "unknown link type %d", conn->type);
+	}
+
+	q = cnt / num;
+	*quote = q ? q : 1;
+}
+
 static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type,
 				     int *quote)
 {
@@ -3192,29 +3303,7 @@ static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type,
 
 	rcu_read_unlock();
 
-	if (conn) {
-		int cnt, q;
-
-		switch (conn->type) {
-		case ACL_LINK:
-			cnt = hdev->acl_cnt;
-			break;
-		case SCO_LINK:
-		case ESCO_LINK:
-			cnt = hdev->sco_cnt;
-			break;
-		case LE_LINK:
-			cnt = hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt;
-			break;
-		default:
-			cnt = 0;
-			bt_dev_err(hdev, "unknown link type %d", conn->type);
-		}
-
-		q = cnt / num;
-		*quote = q ? q : 1;
-	} else
-		*quote = 0;
+	hci_quote_sent(conn, num, quote);
 
 	BT_DBG("conn %p quote %d", conn, *quote);
 	return conn;
@@ -3248,7 +3337,7 @@ static struct hci_chan *hci_chan_sent(struct hci_dev *hdev, __u8 type,
 	struct hci_chan *chan = NULL;
 	unsigned int num = 0, min = ~0, cur_prio = 0;
 	struct hci_conn *conn;
-	int cnt, q, conn_num = 0;
+	int conn_num = 0;
 
 	BT_DBG("%s", hdev->name);
 
@@ -3298,27 +3387,8 @@ static struct hci_chan *hci_chan_sent(struct hci_dev *hdev, __u8 type,
 	if (!chan)
 		return NULL;
 
-	switch (chan->conn->type) {
-	case ACL_LINK:
-		cnt = hdev->acl_cnt;
-		break;
-	case AMP_LINK:
-		cnt = hdev->block_cnt;
-		break;
-	case SCO_LINK:
-	case ESCO_LINK:
-		cnt = hdev->sco_cnt;
-		break;
-	case LE_LINK:
-		cnt = hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt;
-		break;
-	default:
-		cnt = 0;
-		bt_dev_err(hdev, "unknown link type %d", chan->conn->type);
-	}
+	hci_quote_sent(chan->conn, num, quote);
 
-	q = cnt / num;
-	*quote = q ? q : 1;
 	BT_DBG("chan %p quote %d", chan, *quote);
 	return chan;
 }
@@ -3607,18 +3677,46 @@ static void hci_sched_le(struct hci_dev *hdev)
 		hci_prio_recalculate(hdev, LE_LINK);
 }
 
+/* Schedule CIS */
+static void hci_sched_iso(struct hci_dev *hdev)
+{
+	struct hci_conn *conn;
+	struct sk_buff *skb;
+	int quote, *cnt;
+
+	BT_DBG("%s", hdev->name);
+
+	if (!hci_conn_num(hdev, ISO_LINK))
+		return;
+
+	cnt = hdev->iso_pkts ? &hdev->iso_cnt :
+		hdev->le_pkts ? &hdev->le_cnt : &hdev->acl_cnt;
+	while (*cnt && (conn = hci_low_sent(hdev, ISO_LINK, &quote))) {
+		while (quote-- && (skb = skb_dequeue(&conn->data_q))) {
+			BT_DBG("skb %p len %d", skb, skb->len);
+			hci_send_frame(hdev, skb);
+
+			conn->sent++;
+			if (conn->sent == ~0)
+				conn->sent = 0;
+			(*cnt)--;
+		}
+	}
+}
+
 static void hci_tx_work(struct work_struct *work)
 {
 	struct hci_dev *hdev = container_of(work, struct hci_dev, tx_work);
 	struct sk_buff *skb;
 
-	BT_DBG("%s acl %d sco %d le %d", hdev->name, hdev->acl_cnt,
-	       hdev->sco_cnt, hdev->le_cnt);
+	BT_DBG("%s acl %d sco %d le %d iso %d", hdev->name, hdev->acl_cnt,
+	       hdev->sco_cnt, hdev->le_cnt, hdev->iso_cnt);
 
 	if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
 		/* Schedule queues and send stuff to HCI driver */
 		hci_sched_sco(hdev);
 		hci_sched_esco(hdev);
+		hci_sched_iso(hdev);
 		hci_sched_acl(hdev);
 		hci_sched_le(hdev);
 	}
@@ -3701,6 +3799,39 @@ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb)
 	kfree_skb(skb);
 }
 
+static void hci_isodata_packet(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_iso_hdr *hdr;
+	struct hci_conn *conn;
+	__u16 handle, flags;
+
+	hdr = skb_pull_data(skb, sizeof(*hdr));
+	if (!hdr) {
+		bt_dev_err(hdev, "ISO packet too small");
+		goto drop;
+	}
+
+	handle = __le16_to_cpu(hdr->handle);
+	flags  = hci_flags(handle);
+	handle = hci_handle(handle);
+
+	bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len,
+		   handle, flags);
+
+	hci_dev_lock(hdev);
+	conn = hci_conn_hash_lookup_handle(hdev, handle);
+	hci_dev_unlock(hdev);
+
+	/* TODO: Send to upper protocol */
+	if (!conn) {
+		bt_dev_err(hdev, "ISO packet for unknown connection handle %d",
+			   handle);
+	}
+
+drop:
+	kfree_skb(skb);
+}
+
 static bool hci_req_is_complete(struct hci_dev *hdev)
 {
 	struct sk_buff *skb;
@@ -3862,6 +3993,11 @@ static void hci_rx_work(struct work_struct *work)
 			hci_scodata_packet(hdev, skb);
 			break;
 
+		case HCI_ISODATA_PKT:
+			BT_DBG("%s ISO data packet", hdev->name);
+			hci_isodata_packet(hdev, skb);
+			break;
+
 		default:
 			kfree_skb(skb);
 			break;
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index bab52ee99b11..37177d7051d5 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -3775,6 +3775,124 @@ static inline void handle_cmd_cnt_and_timer(struct hci_dev *hdev, u8 ncmd)
 	}
 }
 
+static u8 hci_cc_le_read_buffer_size_v2(struct hci_dev *hdev, void *data,
+					struct sk_buff *skb)
+{
+	struct hci_rp_le_read_buffer_size_v2 *rp = data;
+
+	bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+	if (rp->status)
+		return rp->status;
+
+	hdev->le_mtu   = __le16_to_cpu(rp->acl_mtu);
+	hdev->le_pkts  = rp->acl_max_pkt;
+	hdev->iso_mtu  = __le16_to_cpu(rp->iso_mtu);
+	hdev->iso_pkts = rp->iso_max_pkt;
+
+	hdev->le_cnt  = hdev->le_pkts;
+	hdev->iso_cnt = hdev->iso_pkts;
+
+	BT_DBG("%s acl mtu %d:%d iso mtu %d:%d", hdev->name, hdev->acl_mtu,
+	       hdev->acl_pkts, hdev->iso_mtu, hdev->iso_pkts);
+
+	return rp->status;
+}
+
+static u8 hci_cc_le_set_cig_params(struct hci_dev *hdev, void *data,
+				   struct sk_buff *skb)
+{
+	struct hci_rp_le_set_cig_params *rp = data;
+	struct hci_conn *conn;
+	int i = 0;
+
+	bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+	hci_dev_lock(hdev);
+
+	if (rp->status) {
+		while ((conn = hci_conn_hash_lookup_cig(hdev, rp->cig_id))) {
+			conn->state = BT_CLOSED;
+			hci_connect_cfm(conn, rp->status);
+			hci_conn_del(conn);
+		}
+		goto unlock;
+	}
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) {
+		if (conn->type != ISO_LINK || conn->iso_qos.cig != rp->cig_id ||
+		    conn->state == BT_CONNECTED)
+			continue;
+
+		conn->handle = __le16_to_cpu(rp->handle[i++]);
+
+		bt_dev_dbg(hdev, "%p handle 0x%4.4x link %p", conn,
+			   conn->handle, conn->link);
+
+		/* Create CIS if LE is already connected */
+		if (conn->link && conn->link->state == BT_CONNECTED)
+			hci_le_create_cis(conn->link);
+
+		if (i == rp->num_handles)
+			break;
+	}
+
+	rcu_read_unlock();
+
+unlock:
+	hci_dev_unlock(hdev);
+
+	return rp->status;
+}
+
+static u8 hci_cc_le_setup_iso_path(struct hci_dev *hdev, void *data,
+				   struct sk_buff *skb)
+{
+	struct hci_rp_le_setup_iso_path *rp = data;
+	struct hci_cp_le_setup_iso_path *cp;
+	struct hci_conn *conn;
+
+	bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+	cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SETUP_ISO_PATH);
+	if (!cp)
+		return rp->status;
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+	if (!conn)
+		goto unlock;
+
+	if (rp->status) {
+		hci_connect_cfm(conn, rp->status);
+		hci_conn_del(conn);
+		goto unlock;
+	}
+
+	switch (cp->direction) {
+	/* Input (Host to Controller) */
+	case 0x00:
+		/* Only confirm connection if output only */
+		if (conn->iso_qos.out.sdu && !conn->iso_qos.in.sdu)
+			hci_connect_cfm(conn, rp->status);
+		break;
+	/* Output (Controller to Host) */
+	case 0x01:
+		/* Confirm connection since conn->iso_qos is always configured
+		 * last.
+		 */
+		hci_connect_cfm(conn, rp->status);
+		break;
+	}
+
+unlock:
+	hci_dev_unlock(hdev);
+	return rp->status;
+}
+
 #define HCI_CC_VL(_op, _func, _min, _max) \
 { \
 	.op = _op, \
@@ -3950,7 +4068,13 @@ static const struct hci_cc {
 	HCI_CC_STATUS(HCI_OP_LE_CLEAR_ADV_SETS, hci_cc_le_clear_adv_sets),
 	HCI_CC(HCI_OP_LE_READ_TRANSMIT_POWER, hci_cc_le_read_transmit_power,
 	       sizeof(struct hci_rp_le_read_transmit_power)),
-	HCI_CC_STATUS(HCI_OP_LE_SET_PRIVACY_MODE, hci_cc_le_set_privacy_mode)
+	HCI_CC_STATUS(HCI_OP_LE_SET_PRIVACY_MODE, hci_cc_le_set_privacy_mode),
+	HCI_CC(HCI_OP_LE_READ_BUFFER_SIZE_V2, hci_cc_le_read_buffer_size_v2,
+	       sizeof(struct hci_rp_le_read_buffer_size_v2)),
+	HCI_CC_VL(HCI_OP_LE_SET_CIG_PARAMS, hci_cc_le_set_cig_params,
+		  sizeof(struct hci_rp_le_set_cig_params), HCI_MAX_EVENT_SIZE),
+	HCI_CC(HCI_OP_LE_SETUP_ISO_PATH, hci_cc_le_setup_iso_path,
+	       sizeof(struct hci_rp_le_setup_iso_path)),
 };
 
 static u8 hci_cc_func(struct hci_dev *hdev, const struct hci_cc *cc,
@@ -4013,6 +4137,40 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, void *data,
 		queue_work(hdev->workqueue, &hdev->cmd_work);
 }
 
+static void hci_cs_le_create_cis(struct hci_dev *hdev, u8 status)
+{
+	struct hci_cp_le_create_cis *cp;
+	int i;
+
+	bt_dev_dbg(hdev, "status 0x%2.2x", status);
+
+	if (!status)
+		return;
+
+	cp = hci_sent_cmd_data(hdev, HCI_OP_LE_CREATE_CIS);
+	if (!cp)
+		return;
+
+	hci_dev_lock(hdev);
+
+	/* Remove connection if command failed */
+	for (i = 0; cp->num_cis; cp->num_cis--, i++) {
+		struct hci_conn *conn;
+		u16 handle;
+
+		handle = __le16_to_cpu(cp->cis[i].cis_handle);
+
+		conn = hci_conn_hash_lookup_handle(hdev, handle);
+		if (conn) {
+			conn->state = BT_CLOSED;
+			hci_connect_cfm(conn, status);
+			hci_conn_del(conn);
+		}
+	}
+
+	hci_dev_unlock(hdev);
+}
+
 #define HCI_CS(_op, _func) \
 { \
 	.op = _op, \
@@ -4042,7 +4200,8 @@ static const struct hci_cs {
 	HCI_CS(HCI_OP_LE_CREATE_CONN, hci_cs_le_create_conn),
 	HCI_CS(HCI_OP_LE_READ_REMOTE_FEATURES, hci_cs_le_read_remote_features),
 	HCI_CS(HCI_OP_LE_START_ENC, hci_cs_le_start_enc),
-	HCI_CS(HCI_OP_LE_EXT_CREATE_CONN, hci_cs_le_ext_create_conn)
+	HCI_CS(HCI_OP_LE_EXT_CREATE_CONN, hci_cs_le_ext_create_conn),
+	HCI_CS(HCI_OP_LE_CREATE_CIS, hci_cs_le_create_cis),
 };
 
 static void hci_cmd_status_evt(struct hci_dev *hdev, void *data,
@@ -4178,6 +4337,22 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data,
 				hdev->sco_cnt = hdev->sco_pkts;
 			break;
 
+		case ISO_LINK:
+			if (hdev->iso_pkts) {
+				hdev->iso_cnt += count;
+				if (hdev->iso_cnt > hdev->iso_pkts)
+					hdev->iso_cnt = hdev->iso_pkts;
+			} else if (hdev->le_pkts) {
+				hdev->le_cnt += count;
+				if (hdev->le_cnt > hdev->le_pkts)
+					hdev->le_cnt = hdev->le_pkts;
+			} else {
+				hdev->acl_cnt += count;
+				if (hdev->acl_cnt > hdev->acl_pkts)
+					hdev->acl_cnt = hdev->acl_pkts;
+			}
+			break;
+
 		default:
 			bt_dev_err(hdev, "unknown type %d conn %p",
 				   conn->type, conn);
@@ -6480,6 +6655,127 @@ unlock:
 	hci_dev_unlock(hdev);
 }
 
+static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data,
+					struct sk_buff *skb)
+{
+	struct hci_evt_le_cis_established *ev = data;
+	struct hci_conn *conn;
+	u16 handle = __le16_to_cpu(ev->handle);
+
+	bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_handle(hdev, handle);
+	if (!conn) {
+		bt_dev_err(hdev,
+			   "Unable to find connection with handle 0x%4.4x",
+			   handle);
+		goto unlock;
+	}
+
+	if (conn->role == HCI_ROLE_SLAVE) {
+		__le32 interval;
+
+		memset(&interval, 0, sizeof(interval));
+
+		memcpy(&interval, ev->c_latency, sizeof(ev->c_latency));
+		conn->iso_qos.in.interval = le32_to_cpu(interval);
+		memcpy(&interval, ev->p_latency, sizeof(ev->p_latency));
+		conn->iso_qos.out.interval = le32_to_cpu(interval);
+		conn->iso_qos.in.latency = le16_to_cpu(ev->interval);
+		conn->iso_qos.out.latency = le16_to_cpu(ev->interval);
+		conn->iso_qos.in.sdu = le16_to_cpu(ev->c_mtu);
+		conn->iso_qos.out.sdu = le16_to_cpu(ev->p_mtu);
+		conn->iso_qos.in.phy = ev->c_phy;
+		conn->iso_qos.out.phy = ev->p_phy;
+	}
+
+	if (!ev->status) {
+		conn->state = BT_CONNECTED;
+		hci_debugfs_create_conn(conn);
+		hci_conn_add_sysfs(conn);
+		hci_iso_setup_path(conn);
+		goto unlock;
+	}
+
+	hci_connect_cfm(conn, ev->status);
+	hci_conn_del(conn);
+
+unlock:
+	hci_dev_unlock(hdev);
+}
+
+static void hci_le_reject_cis(struct hci_dev *hdev, __le16 handle)
+{
+	struct hci_cp_le_reject_cis cp;
+
+	memset(&cp, 0, sizeof(cp));
+	cp.handle = handle;
+	cp.reason = HCI_ERROR_REJ_BAD_ADDR;
+	hci_send_cmd(hdev, HCI_OP_LE_REJECT_CIS, sizeof(cp), &cp);
+}
+
+static void hci_le_accept_cis(struct hci_dev *hdev, __le16 handle)
+{
+	struct hci_cp_le_accept_cis cp;
+
+	memset(&cp, 0, sizeof(cp));
+	cp.handle = handle;
+	hci_send_cmd(hdev, HCI_OP_LE_ACCEPT_CIS, sizeof(cp), &cp);
+}
+
+static void hci_le_cis_req_evt(struct hci_dev *hdev, void *data,
+			       struct sk_buff *skb)
+{
+	struct hci_evt_le_cis_req *ev = data;
+	u16 acl_handle, cis_handle;
+	struct hci_conn *acl, *cis;
+	int mask;
+	__u8 flags = 0;
+
+	acl_handle = __le16_to_cpu(ev->acl_handle);
+	cis_handle = __le16_to_cpu(ev->cis_handle);
+
+	bt_dev_dbg(hdev, "acl 0x%4.4x handle 0x%4.4x cig 0x%2.2x cis 0x%2.2x",
+		   acl_handle, cis_handle, ev->cig_id, ev->cis_id);
+
+	hci_dev_lock(hdev);
+
+	acl = hci_conn_hash_lookup_handle(hdev, acl_handle);
+	if (!acl)
+		goto unlock;
+
+	mask = hci_proto_connect_ind(hdev, &acl->dst, ISO_LINK, &flags);
+	if (!(mask & HCI_LM_ACCEPT)) {
+		hci_le_reject_cis(hdev, ev->cis_handle);
+		goto unlock;
+	}
+
+	cis = hci_conn_hash_lookup_handle(hdev, cis_handle);
+	if (!cis) {
+		cis = hci_conn_add(hdev, ISO_LINK, &acl->dst, HCI_ROLE_SLAVE);
+		if (!cis) {
+			hci_le_reject_cis(hdev, ev->cis_handle);
+			goto unlock;
+		}
+		cis->handle = cis_handle;
+	}
+
+	cis->iso_qos.cig = ev->cig_id;
+	cis->iso_qos.cis = ev->cis_id;
+
+	if (!(flags & HCI_PROTO_DEFER)) {
+		hci_le_accept_cis(hdev, ev->cis_handle);
+	} else {
+		cis->state = BT_CONNECT2;
+		hci_connect_cfm(cis, 0);
+	}
+
+unlock:
+	hci_dev_unlock(hdev);
+}
+
 #define HCI_LE_EV_VL(_op, _func, _min_len, _max_len) \
 [_op] = { \
 	.func = _func, \
@@ -6543,6 +6839,12 @@ static const struct hci_le_ev {
 	/* [0x12 = HCI_EV_LE_EXT_ADV_SET_TERM] */
 	HCI_LE_EV(HCI_EV_LE_EXT_ADV_SET_TERM, hci_le_ext_adv_term_evt,
 		  sizeof(struct hci_evt_le_ext_adv_set_term)),
+	/* [0x19 = HCI_EVT_LE_CIS_ESTABLISHED] */
+	HCI_LE_EV(HCI_EVT_LE_CIS_ESTABLISHED, hci_le_cis_estabilished_evt,
+		  sizeof(struct hci_evt_le_cis_established)),
+	/* [0x1a = HCI_EVT_LE_CIS_REQ] */
+	HCI_LE_EV(HCI_EVT_LE_CIS_REQ, hci_le_cis_req_evt,
+		  sizeof(struct hci_evt_le_cis_req)),
 };
 
 static void hci_le_meta_evt(struct hci_dev *hdev, void *data,
@@ -6581,7 +6883,6 @@ static void hci_le_meta_evt(struct hci_dev *hdev, void *data,
 	if (skb->len > subev->max_len)
 		bt_dev_warn(hdev, "unexpected subevent 0x%2.2x length: %u > %u",
 			    ev->subevent, skb->len, subev->max_len);
-
 	data = hci_le_ev_skb_pull(hdev, skb, ev->subevent, subev->min_len);
 	if (!data)
 		return;
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 46a04488d614..ac0f2cc0e44c 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -2800,6 +2800,12 @@ static const struct hci_init_stage amp_init2[] = {
 /* Read Buffer Size (ACL mtu, max pkt, etc.) */
 static int hci_read_buffer_size_sync(struct hci_dev *hdev)
 {
+	/* Use Read LE Buffer Size V2 if supported */
+	if (hdev->commands[41] & 0x20)
+		return __hci_cmd_sync_status(hdev,
+					     HCI_OP_LE_READ_BUFFER_SIZE_V2,
+					     0, NULL, HCI_CMD_TIMEOUT);
+
 	return __hci_cmd_sync_status(hdev, HCI_OP_READ_BUFFER_SIZE,
 				     0, NULL, HCI_CMD_TIMEOUT);
 }
@@ -3051,6 +3057,10 @@ static int hci_init2_sync(struct hci_dev *hdev)
 	if (hdev->dev_type == HCI_AMP)
 		return hci_init_stage_sync(hdev, amp_init2);
 
+	err = hci_init_stage_sync(hdev, hci_init2);
+	if (err)
+		return err;
+
 	if (lmp_bredr_capable(hdev)) {
 		err = hci_init_stage_sync(hdev, br_init2);
 		if (err)
@@ -3068,7 +3078,7 @@ static int hci_init2_sync(struct hci_dev *hdev)
 			hci_dev_set_flag(hdev, HCI_LE_ENABLED);
 	}
 
-	return hci_init_stage_sync(hdev, hci_init2);
+	return 0;
 }
 
 static int hci_set_event_mask_sync(struct hci_dev *hdev)
@@ -3389,6 +3399,12 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev)
 	if (ext_adv_capable(hdev))
 		events[2] |= 0x02;	/* LE Advertising Set Terminated */
 
+	if (cis_capable(hdev)) {
+		events[3] |= 0x01;	/* LE CIS Established */
+		if (cis_peripheral_capable(hdev))
+			events[3] |= 0x02; /* LE CIS Request */
+	}
+
 	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EVENT_MASK,
 				     sizeof(events), events, HCI_CMD_TIMEOUT);
 }
@@ -3529,6 +3545,24 @@ static int hci_set_le_support_sync(struct hci_dev *hdev)
 				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
 }
 
+/* LE Set Host Feature */
+static int hci_le_set_host_feature_sync(struct hci_dev *hdev)
+{
+	struct hci_cp_le_set_host_feature cp;
+
+	if (!iso_capable(hdev))
+		return 0;
+
+	memset(&cp, 0, sizeof(cp));
+
+	/* Isochronous Channels (Host Support) */
+	cp.bit_number = 32;
+	cp.bit_value = 1;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_HOST_FEATURE,
+				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
 /* LE Controller init stage 3 command sequence */
 static const struct hci_init_stage le_init3[] = {
 	/* HCI_OP_LE_SET_EVENT_MASK */
@@ -3555,6 +3589,8 @@ static const struct hci_init_stage le_init3[] = {
 	HCI_INIT(hci_le_read_num_support_adv_sets_sync),
 	/* HCI_OP_WRITE_LE_HOST_SUPPORTED */
 	HCI_INIT(hci_set_le_support_sync),
+	/* HCI_OP_LE_SET_HOST_FEATURE */
+	HCI_INIT(hci_le_set_host_feature_sync),
 	{}
 };
 
@@ -5437,3 +5473,14 @@ done:
 	hci_resume_advertising_sync(hdev);
 	return err;
 }
+
+int hci_le_remove_cig_sync(struct hci_dev *hdev, u8 handle)
+{
+	struct hci_cp_le_remove_cig cp;
+
+	memset(&cp, 0, sizeof(cp));
+	cp.cig_id = handle;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_REMOVE_CIG, sizeof(cp),
+				     &cp, HCI_CMD_TIMEOUT);
+}
-- 
cgit 


From ccf74f2390d60a2f9a75ef496d2564abb478f46a Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Thu, 16 Jan 2020 15:55:57 -0800
Subject: Bluetooth: Add BTPROTO_ISO socket type

This introduces a new socket type BTPROTO_ISO which can be enabled with
use of ISO Socket experiemental UUID, it can used to initiate/accept
connections and transfer packets between userspace and kernel similarly
to how BTPROTO_SCO works:

Central -> uses connect with address set to destination bdaddr:
> tools/isotest -s 00:AA:01:00:00:00

Peripheral -> uses listen:
> tools/isotest -d

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/bluetooth.h |   21 +
 include/net/bluetooth/hci_core.h  |   18 +-
 include/net/bluetooth/iso.h       |   21 +
 net/bluetooth/Makefile            |    1 +
 net/bluetooth/af_bluetooth.c      |    4 +-
 net/bluetooth/hci_core.c          |    6 +-
 net/bluetooth/iso.c               | 1501 +++++++++++++++++++++++++++++++++++++
 net/bluetooth/mgmt.c              |   69 +-
 8 files changed, 1636 insertions(+), 5 deletions(-)
 create mode 100644 include/net/bluetooth/iso.h
 create mode 100644 net/bluetooth/iso.c

(limited to 'include')

diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index 72af9722244a..df2c006fc6a9 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -590,6 +590,27 @@ static inline void sco_exit(void)
 }
 #endif
 
+#if IS_ENABLED(CONFIG_BT_LE)
+int iso_init(void);
+int iso_exit(void);
+bool iso_enabled(void);
+#else
+static inline int iso_init(void)
+{
+	return 0;
+}
+
+static inline int iso_exit(void)
+{
+	return 0;
+}
+
+static inline bool iso_enabled(void)
+{
+	return false;
+}
+#endif
+
 int mgmt_init(void);
 void mgmt_exit(void);
 
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 5fac013ff2b0..3457c24e9b19 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -843,6 +843,21 @@ static inline void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb)
 }
 #endif
 
+#if IS_ENABLED(CONFIG_BT_LE)
+int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags);
+void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags);
+#else
+static inline int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr,
+				  __u8 *flags)
+{
+	return 0;
+}
+static inline void iso_recv(struct hci_conn *hcon, struct sk_buff *skb,
+			    u16 flags)
+{
+}
+#endif
+
 /* ----- Inquiry cache ----- */
 #define INQUIRY_CACHE_AGE_MAX   (HZ*30)   /* 30 seconds */
 #define INQUIRY_ENTRY_AGE_MAX   (HZ*60)   /* 60 seconds */
@@ -1640,8 +1655,7 @@ static inline int hci_proto_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr,
 		return sco_connect_ind(hdev, bdaddr, flags);
 
 	case ISO_LINK:
-		/* TODO: Handle connection indication */
-		return -EINVAL;
+		return iso_connect_ind(hdev, bdaddr, flags);
 
 	default:
 		BT_ERR("unknown link type %d", type);
diff --git a/include/net/bluetooth/iso.h b/include/net/bluetooth/iso.h
new file mode 100644
index 000000000000..13b22d54aab5
--- /dev/null
+++ b/include/net/bluetooth/iso.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BlueZ - Bluetooth protocol stack for Linux
+ *
+ * Copyright (C) 2022 Intel Corporation
+ */
+
+#ifndef __ISO_H
+#define __ISO_H
+
+/* ISO defaults */
+#define ISO_DEFAULT_MTU		251
+
+/* ISO socket address */
+struct sockaddr_iso {
+	sa_family_t	iso_family;
+	bdaddr_t	iso_bdaddr;
+	__u8		iso_bdaddr_type;
+};
+
+#endif /* __ISO_H */
diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile
index a52bba8500e1..0e7b7db42750 100644
--- a/net/bluetooth/Makefile
+++ b/net/bluetooth/Makefile
@@ -18,6 +18,7 @@ bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \
 	eir.o hci_sync.o
 
 bluetooth-$(CONFIG_BT_BREDR) += sco.o
+bluetooth-$(CONFIG_BT_LE) += iso.o
 bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o
 bluetooth-$(CONFIG_BT_LEDS) += leds.o
 bluetooth-$(CONFIG_BT_MSFTEXT) += msft.o
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index b506409bb498..dc65974f5adb 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -38,7 +38,7 @@
 #include "selftest.h"
 
 /* Bluetooth sockets */
-#define BT_MAX_PROTO	8
+#define BT_MAX_PROTO	(BTPROTO_LAST + 1)
 static const struct net_proto_family *bt_proto[BT_MAX_PROTO];
 static DEFINE_RWLOCK(bt_proto_lock);
 
@@ -52,6 +52,7 @@ static const char *const bt_key_strings[BT_MAX_PROTO] = {
 	"sk_lock-AF_BLUETOOTH-BTPROTO_CMTP",
 	"sk_lock-AF_BLUETOOTH-BTPROTO_HIDP",
 	"sk_lock-AF_BLUETOOTH-BTPROTO_AVDTP",
+	"sk_lock-AF_BLUETOOTH-BTPROTO_ISO",
 };
 
 static struct lock_class_key bt_slock_key[BT_MAX_PROTO];
@@ -64,6 +65,7 @@ static const char *const bt_slock_key_strings[BT_MAX_PROTO] = {
 	"slock-AF_BLUETOOTH-BTPROTO_CMTP",
 	"slock-AF_BLUETOOTH-BTPROTO_HIDP",
 	"slock-AF_BLUETOOTH-BTPROTO_AVDTP",
+	"slock-AF_BLUETOOTH-BTPROTO_ISO",
 };
 
 void bt_sock_reclassify_lock(struct sock *sk, int proto)
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 2b7947dca14d..193d4e664acd 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -3822,12 +3822,16 @@ static void hci_isodata_packet(struct hci_dev *hdev, struct sk_buff *skb)
 	conn = hci_conn_hash_lookup_handle(hdev, handle);
 	hci_dev_unlock(hdev);
 
-	/* TODO: Send to upper protocol */
 	if (!conn) {
 		bt_dev_err(hdev, "ISO packet for unknown connection handle %d",
 			   handle);
+		goto drop;
 	}
 
+	/* Send to upper protocol */
+	iso_recv(conn, skb, flags);
+	return;
+
 drop:
 	kfree_skb(skb);
 }
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
new file mode 100644
index 000000000000..caaaa670cc2c
--- /dev/null
+++ b/net/bluetooth/iso.c
@@ -0,0 +1,1501 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * BlueZ - Bluetooth protocol stack for Linux
+ *
+ * Copyright (C) 2022 Intel Corporation
+ */
+
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/sched/signal.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/iso.h>
+
+static const struct proto_ops iso_sock_ops;
+
+static struct bt_sock_list iso_sk_list = {
+	.lock = __RW_LOCK_UNLOCKED(iso_sk_list.lock)
+};
+
+/* ---- ISO connections ---- */
+struct iso_conn {
+	struct hci_conn	*hcon;
+
+	/* @lock: spinlock protecting changes to iso_conn fields */
+	spinlock_t	lock;
+	struct sock	*sk;
+
+	struct delayed_work	timeout_work;
+
+	struct sk_buff	*rx_skb;
+	__u32		rx_len;
+	__u16		tx_sn;
+};
+
+#define iso_conn_lock(c)	spin_lock(&(c)->lock)
+#define iso_conn_unlock(c)	spin_unlock(&(c)->lock)
+
+static void iso_sock_close(struct sock *sk);
+static void iso_sock_kill(struct sock *sk);
+
+/* ----- ISO socket info ----- */
+#define iso_pi(sk) ((struct iso_pinfo *)sk)
+
+struct iso_pinfo {
+	struct bt_sock		bt;
+	bdaddr_t		src;
+	__u8			src_type;
+	bdaddr_t		dst;
+	__u8			dst_type;
+	__u32			flags;
+	struct bt_iso_qos	qos;
+	struct iso_conn		*conn;
+};
+
+/* ---- ISO timers ---- */
+#define ISO_CONN_TIMEOUT	(HZ * 40)
+#define ISO_DISCONN_TIMEOUT	(HZ * 2)
+
+static void iso_sock_timeout(struct work_struct *work)
+{
+	struct iso_conn *conn = container_of(work, struct iso_conn,
+					     timeout_work.work);
+	struct sock *sk;
+
+	iso_conn_lock(conn);
+	sk = conn->sk;
+	if (sk)
+		sock_hold(sk);
+	iso_conn_unlock(conn);
+
+	if (!sk)
+		return;
+
+	BT_DBG("sock %p state %d", sk, sk->sk_state);
+
+	lock_sock(sk);
+	sk->sk_err = ETIMEDOUT;
+	sk->sk_state_change(sk);
+	release_sock(sk);
+	sock_put(sk);
+}
+
+static void iso_sock_set_timer(struct sock *sk, long timeout)
+{
+	if (!iso_pi(sk)->conn)
+		return;
+
+	BT_DBG("sock %p state %d timeout %ld", sk, sk->sk_state, timeout);
+	cancel_delayed_work(&iso_pi(sk)->conn->timeout_work);
+	schedule_delayed_work(&iso_pi(sk)->conn->timeout_work, timeout);
+}
+
+static void iso_sock_clear_timer(struct sock *sk)
+{
+	if (!iso_pi(sk)->conn)
+		return;
+
+	BT_DBG("sock %p state %d", sk, sk->sk_state);
+	cancel_delayed_work(&iso_pi(sk)->conn->timeout_work);
+}
+
+/* ---- ISO connections ---- */
+static struct iso_conn *iso_conn_add(struct hci_conn *hcon)
+{
+	struct iso_conn *conn = hcon->iso_data;
+
+	if (conn)
+		return conn;
+
+	conn = kzalloc(sizeof(*conn), GFP_KERNEL);
+	if (!conn)
+		return NULL;
+
+	spin_lock_init(&conn->lock);
+	INIT_DELAYED_WORK(&conn->timeout_work, iso_sock_timeout);
+
+	hcon->iso_data = conn;
+	conn->hcon = hcon;
+	conn->tx_sn = 0;
+
+	BT_DBG("hcon %p conn %p", hcon, conn);
+
+	return conn;
+}
+
+/* Delete channel. Must be called on the locked socket. */
+static void iso_chan_del(struct sock *sk, int err)
+{
+	struct iso_conn *conn;
+
+	conn = iso_pi(sk)->conn;
+
+	BT_DBG("sk %p, conn %p, err %d", sk, conn, err);
+
+	if (conn) {
+		iso_conn_lock(conn);
+		conn->sk = NULL;
+		iso_pi(sk)->conn = NULL;
+		iso_conn_unlock(conn);
+
+		if (conn->hcon)
+			hci_conn_drop(conn->hcon);
+	}
+
+	sk->sk_state = BT_CLOSED;
+	sk->sk_err   = err;
+	sk->sk_state_change(sk);
+
+	sock_set_flag(sk, SOCK_ZAPPED);
+}
+
+static void iso_conn_del(struct hci_conn *hcon, int err)
+{
+	struct iso_conn *conn = hcon->iso_data;
+	struct sock *sk;
+
+	if (!conn)
+		return;
+
+	BT_DBG("hcon %p conn %p, err %d", hcon, conn, err);
+
+	/* Kill socket */
+	iso_conn_lock(conn);
+	sk = conn->sk;
+	if (sk)
+		sock_hold(sk);
+	iso_conn_unlock(conn);
+
+	if (sk) {
+		lock_sock(sk);
+		iso_sock_clear_timer(sk);
+		iso_chan_del(sk, err);
+		release_sock(sk);
+		sock_put(sk);
+	}
+
+	/* Ensure no more work items will run before freeing conn. */
+	cancel_delayed_work_sync(&conn->timeout_work);
+
+	hcon->iso_data = NULL;
+	kfree(conn);
+}
+
+static int __iso_chan_add(struct iso_conn *conn, struct sock *sk,
+			  struct sock *parent)
+{
+	BT_DBG("conn %p", conn);
+
+	if (iso_pi(sk)->conn == conn && conn->sk == sk)
+		return 0;
+
+	if (conn->sk) {
+		BT_ERR("conn->sk already set");
+		return -EBUSY;
+	}
+
+	iso_pi(sk)->conn = conn;
+	conn->sk = sk;
+
+	if (parent)
+		bt_accept_enqueue(parent, sk, true);
+
+	return 0;
+}
+
+static int iso_chan_add(struct iso_conn *conn, struct sock *sk,
+			struct sock *parent)
+{
+	int err;
+
+	iso_conn_lock(conn);
+	err = __iso_chan_add(conn, sk, parent);
+	iso_conn_unlock(conn);
+
+	return err;
+}
+
+static int iso_connect(struct sock *sk)
+{
+	struct iso_conn *conn;
+	struct hci_conn *hcon;
+	struct hci_dev  *hdev;
+	int err;
+
+	BT_DBG("%pMR -> %pMR", &iso_pi(sk)->src, &iso_pi(sk)->dst);
+
+	hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src,
+			     iso_pi(sk)->src_type);
+	if (!hdev)
+		return -EHOSTUNREACH;
+
+	hci_dev_lock(hdev);
+
+	if (!cis_central_capable(hdev)) {
+		err = -EOPNOTSUPP;
+		goto done;
+	}
+
+	/* Fail if either PHYs are marked as disabled */
+	if (!iso_pi(sk)->qos.in.phy && !iso_pi(sk)->qos.out.phy) {
+		err = -EINVAL;
+		goto done;
+	}
+
+	/* Just bind if DEFER_SETUP has been set */
+	if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
+		hcon = hci_bind_cis(hdev, &iso_pi(sk)->dst,
+				    iso_pi(sk)->dst_type, &iso_pi(sk)->qos);
+		if (IS_ERR(hcon)) {
+			err = PTR_ERR(hcon);
+			goto done;
+		}
+	} else {
+		hcon = hci_connect_cis(hdev, &iso_pi(sk)->dst,
+				       iso_pi(sk)->dst_type, &iso_pi(sk)->qos);
+		if (IS_ERR(hcon)) {
+			err = PTR_ERR(hcon);
+			goto done;
+		}
+	}
+
+	conn = iso_conn_add(hcon);
+	if (!conn) {
+		hci_conn_drop(hcon);
+		err = -ENOMEM;
+		goto done;
+	}
+
+	/* Update source addr of the socket */
+	bacpy(&iso_pi(sk)->src, &hcon->src);
+
+	err = iso_chan_add(conn, sk, NULL);
+	if (err)
+		goto done;
+
+	if (hcon->state == BT_CONNECTED) {
+		iso_sock_clear_timer(sk);
+		sk->sk_state = BT_CONNECTED;
+	} else if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
+		iso_sock_clear_timer(sk);
+		sk->sk_state = BT_CONNECT;
+	} else {
+		sk->sk_state = BT_CONNECT;
+		iso_sock_set_timer(sk, sk->sk_sndtimeo);
+	}
+
+done:
+	hci_dev_unlock(hdev);
+	hci_dev_put(hdev);
+	return err;
+}
+
+static int iso_send_frame(struct sock *sk, struct sk_buff *skb)
+{
+	struct iso_conn *conn = iso_pi(sk)->conn;
+	struct hci_iso_data_hdr *hdr;
+	int len = 0;
+
+	BT_DBG("sk %p len %d", sk, skb->len);
+
+	if (skb->len > iso_pi(sk)->qos.out.sdu)
+		return -EMSGSIZE;
+
+	len = skb->len;
+
+	/* Push ISO data header */
+	hdr = skb_push(skb, HCI_ISO_DATA_HDR_SIZE);
+	hdr->sn = cpu_to_le16(conn->tx_sn++);
+	hdr->slen = cpu_to_le16(hci_iso_data_len_pack(len,
+						      HCI_ISO_STATUS_VALID));
+
+	if (sk->sk_state == BT_CONNECTED)
+		hci_send_iso(conn->hcon, skb);
+	else
+		len = -ENOTCONN;
+
+	return len;
+}
+
+static void iso_recv_frame(struct iso_conn *conn, struct sk_buff *skb)
+{
+	struct sock *sk;
+
+	iso_conn_lock(conn);
+	sk = conn->sk;
+	iso_conn_unlock(conn);
+
+	if (!sk)
+		goto drop;
+
+	BT_DBG("sk %p len %d", sk, skb->len);
+
+	if (sk->sk_state != BT_CONNECTED)
+		goto drop;
+
+	if (!sock_queue_rcv_skb(sk, skb))
+		return;
+
+drop:
+	kfree_skb(skb);
+}
+
+/* -------- Socket interface ---------- */
+static struct sock *__iso_get_sock_listen_by_addr(bdaddr_t *ba)
+{
+	struct sock *sk;
+
+	sk_for_each(sk, &iso_sk_list.head) {
+		if (sk->sk_state != BT_LISTEN)
+			continue;
+
+		if (!bacmp(&iso_pi(sk)->src, ba))
+			return sk;
+	}
+
+	return NULL;
+}
+
+/* Find socket listening on source bdaddr.
+ * Returns closest match.
+ */
+static struct sock *iso_get_sock_listen(bdaddr_t *src)
+{
+	struct sock *sk = NULL, *sk1 = NULL;
+
+	read_lock(&iso_sk_list.lock);
+
+	sk_for_each(sk, &iso_sk_list.head) {
+		if (sk->sk_state != BT_LISTEN)
+			continue;
+
+		/* Exact match. */
+		if (!bacmp(&iso_pi(sk)->src, src))
+			break;
+
+		/* Closest match */
+		if (!bacmp(&iso_pi(sk)->src, BDADDR_ANY))
+			sk1 = sk;
+	}
+
+	read_unlock(&iso_sk_list.lock);
+
+	return sk ? sk : sk1;
+}
+
+static void iso_sock_destruct(struct sock *sk)
+{
+	BT_DBG("sk %p", sk);
+
+	skb_queue_purge(&sk->sk_receive_queue);
+	skb_queue_purge(&sk->sk_write_queue);
+}
+
+static void iso_sock_cleanup_listen(struct sock *parent)
+{
+	struct sock *sk;
+
+	BT_DBG("parent %p", parent);
+
+	/* Close not yet accepted channels */
+	while ((sk = bt_accept_dequeue(parent, NULL))) {
+		iso_sock_close(sk);
+		iso_sock_kill(sk);
+	}
+
+	parent->sk_state  = BT_CLOSED;
+	sock_set_flag(parent, SOCK_ZAPPED);
+}
+
+/* Kill socket (only if zapped and orphan)
+ * Must be called on unlocked socket.
+ */
+static void iso_sock_kill(struct sock *sk)
+{
+	if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket ||
+	    sock_flag(sk, SOCK_DEAD))
+		return;
+
+	BT_DBG("sk %p state %d", sk, sk->sk_state);
+
+	/* Kill poor orphan */
+	bt_sock_unlink(&iso_sk_list, sk);
+	sock_set_flag(sk, SOCK_DEAD);
+	sock_put(sk);
+}
+
+static void iso_conn_defer_reject(struct hci_conn *conn)
+{
+	struct hci_cp_le_reject_cis cp;
+
+	BT_DBG("conn %p", conn);
+
+	memset(&cp, 0, sizeof(cp));
+	cp.handle = cpu_to_le16(conn->handle);
+	cp.reason = HCI_ERROR_REJ_BAD_ADDR;
+	hci_send_cmd(conn->hdev, HCI_OP_LE_REJECT_CIS, sizeof(cp), &cp);
+}
+
+static void __iso_sock_close(struct sock *sk)
+{
+	BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket);
+
+	switch (sk->sk_state) {
+	case BT_LISTEN:
+		iso_sock_cleanup_listen(sk);
+		break;
+
+	case BT_CONNECTED:
+	case BT_CONFIG:
+		if (iso_pi(sk)->conn->hcon) {
+			sk->sk_state = BT_DISCONN;
+			iso_sock_set_timer(sk, ISO_DISCONN_TIMEOUT);
+			iso_conn_lock(iso_pi(sk)->conn);
+			hci_conn_drop(iso_pi(sk)->conn->hcon);
+			iso_pi(sk)->conn->hcon = NULL;
+			iso_conn_unlock(iso_pi(sk)->conn);
+		} else {
+			iso_chan_del(sk, ECONNRESET);
+		}
+		break;
+
+	case BT_CONNECT2:
+		if (iso_pi(sk)->conn->hcon)
+			iso_conn_defer_reject(iso_pi(sk)->conn->hcon);
+		iso_chan_del(sk, ECONNRESET);
+		break;
+	case BT_CONNECT:
+		/* In case of DEFER_SETUP the hcon would be bound to CIG which
+		 * needs to be removed so just call hci_conn_del so the cleanup
+		 * callback do what is needed.
+		 */
+		if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) &&
+		    iso_pi(sk)->conn->hcon) {
+			hci_conn_del(iso_pi(sk)->conn->hcon);
+			iso_pi(sk)->conn->hcon = NULL;
+		}
+
+		iso_chan_del(sk, ECONNRESET);
+		break;
+	case BT_DISCONN:
+		iso_chan_del(sk, ECONNRESET);
+		break;
+
+	default:
+		sock_set_flag(sk, SOCK_ZAPPED);
+		break;
+	}
+}
+
+/* Must be called on unlocked socket. */
+static void iso_sock_close(struct sock *sk)
+{
+	iso_sock_clear_timer(sk);
+	lock_sock(sk);
+	__iso_sock_close(sk);
+	release_sock(sk);
+	iso_sock_kill(sk);
+}
+
+static void iso_sock_init(struct sock *sk, struct sock *parent)
+{
+	BT_DBG("sk %p", sk);
+
+	if (parent) {
+		sk->sk_type = parent->sk_type;
+		bt_sk(sk)->flags = bt_sk(parent)->flags;
+		security_sk_clone(parent, sk);
+	}
+}
+
+static struct proto iso_proto = {
+	.name		= "ISO",
+	.owner		= THIS_MODULE,
+	.obj_size	= sizeof(struct iso_pinfo)
+};
+
+#define DEFAULT_IO_QOS \
+{ \
+	.interval	= 10000u, \
+	.latency	= 10u, \
+	.sdu		= 40u, \
+	.phy		= BT_ISO_PHY_2M, \
+	.rtn		= 2u, \
+}
+
+static struct bt_iso_qos default_qos = {
+	.cig		= BT_ISO_QOS_CIG_UNSET,
+	.cis		= BT_ISO_QOS_CIS_UNSET,
+	.sca		= 0x00,
+	.packing	= 0x00,
+	.framing	= 0x00,
+	.in		= DEFAULT_IO_QOS,
+	.out		= DEFAULT_IO_QOS,
+};
+
+static struct sock *iso_sock_alloc(struct net *net, struct socket *sock,
+				   int proto, gfp_t prio, int kern)
+{
+	struct sock *sk;
+
+	sk = sk_alloc(net, PF_BLUETOOTH, prio, &iso_proto, kern);
+	if (!sk)
+		return NULL;
+
+	sock_init_data(sock, sk);
+	INIT_LIST_HEAD(&bt_sk(sk)->accept_q);
+
+	sk->sk_destruct = iso_sock_destruct;
+	sk->sk_sndtimeo = ISO_CONN_TIMEOUT;
+
+	sock_reset_flag(sk, SOCK_ZAPPED);
+
+	sk->sk_protocol = proto;
+	sk->sk_state    = BT_OPEN;
+
+	/* Set address type as public as default src address is BDADDR_ANY */
+	iso_pi(sk)->src_type = BDADDR_LE_PUBLIC;
+
+	iso_pi(sk)->qos = default_qos;
+
+	bt_sock_link(&iso_sk_list, sk);
+	return sk;
+}
+
+static int iso_sock_create(struct net *net, struct socket *sock, int protocol,
+			   int kern)
+{
+	struct sock *sk;
+
+	BT_DBG("sock %p", sock);
+
+	sock->state = SS_UNCONNECTED;
+
+	if (sock->type != SOCK_SEQPACKET)
+		return -ESOCKTNOSUPPORT;
+
+	sock->ops = &iso_sock_ops;
+
+	sk = iso_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern);
+	if (!sk)
+		return -ENOMEM;
+
+	iso_sock_init(sk, NULL);
+	return 0;
+}
+
+static int iso_sock_bind(struct socket *sock, struct sockaddr *addr,
+			 int addr_len)
+{
+	struct sockaddr_iso *sa = (struct sockaddr_iso *)addr;
+	struct sock *sk = sock->sk;
+	int err = 0;
+
+	BT_DBG("sk %p %pMR type %u", sk, &sa->iso_bdaddr, sa->iso_bdaddr_type);
+
+	if (!addr || addr_len < sizeof(struct sockaddr_iso) ||
+	    addr->sa_family != AF_BLUETOOTH)
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	if (sk->sk_state != BT_OPEN) {
+		err = -EBADFD;
+		goto done;
+	}
+
+	if (sk->sk_type != SOCK_SEQPACKET) {
+		err = -EINVAL;
+		goto done;
+	}
+
+	/* Check if the address type is of LE type */
+	if (!bdaddr_type_is_le(sa->iso_bdaddr_type)) {
+		err = -EINVAL;
+		goto done;
+	}
+
+	bacpy(&iso_pi(sk)->src, &sa->iso_bdaddr);
+	iso_pi(sk)->src_type = sa->iso_bdaddr_type;
+
+	sk->sk_state = BT_BOUND;
+
+done:
+	release_sock(sk);
+	return err;
+}
+
+static int iso_sock_connect(struct socket *sock, struct sockaddr *addr,
+			    int alen, int flags)
+{
+	struct sockaddr_iso *sa = (struct sockaddr_iso *)addr;
+	struct sock *sk = sock->sk;
+	int err;
+
+	BT_DBG("sk %p", sk);
+
+	if (alen < sizeof(struct sockaddr_iso) ||
+	    addr->sa_family != AF_BLUETOOTH)
+		return -EINVAL;
+
+	if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND)
+		return -EBADFD;
+
+	if (sk->sk_type != SOCK_SEQPACKET)
+		return -EINVAL;
+
+	/* Check if the address type is of LE type */
+	if (!bdaddr_type_is_le(sa->iso_bdaddr_type))
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	bacpy(&iso_pi(sk)->dst, &sa->iso_bdaddr);
+	iso_pi(sk)->dst_type = sa->iso_bdaddr_type;
+
+	err = iso_connect(sk);
+	if (err)
+		goto done;
+
+	if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
+		err = bt_sock_wait_state(sk, BT_CONNECTED,
+					 sock_sndtimeo(sk, flags & O_NONBLOCK));
+	}
+
+done:
+	release_sock(sk);
+	return err;
+}
+
+static int iso_sock_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	bdaddr_t *src = &iso_pi(sk)->src;
+	int err = 0;
+
+	BT_DBG("sk %p backlog %d", sk, backlog);
+
+	lock_sock(sk);
+
+	if (sk->sk_state != BT_BOUND) {
+		err = -EBADFD;
+		goto done;
+	}
+
+	if (sk->sk_type != SOCK_SEQPACKET) {
+		err = -EINVAL;
+		goto done;
+	}
+
+	write_lock(&iso_sk_list.lock);
+
+	if (__iso_get_sock_listen_by_addr(src)) {
+		err = -EADDRINUSE;
+		goto unlock;
+	}
+
+	sk->sk_max_ack_backlog = backlog;
+	sk->sk_ack_backlog = 0;
+
+	sk->sk_state = BT_LISTEN;
+
+unlock:
+	write_unlock(&iso_sk_list.lock);
+
+done:
+	release_sock(sk);
+	return err;
+}
+
+static int iso_sock_accept(struct socket *sock, struct socket *newsock,
+			   int flags, bool kern)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	struct sock *sk = sock->sk, *ch;
+	long timeo;
+	int err = 0;
+
+	lock_sock(sk);
+
+	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+	BT_DBG("sk %p timeo %ld", sk, timeo);
+
+	/* Wait for an incoming connection. (wake-one). */
+	add_wait_queue_exclusive(sk_sleep(sk), &wait);
+	while (1) {
+		if (sk->sk_state != BT_LISTEN) {
+			err = -EBADFD;
+			break;
+		}
+
+		ch = bt_accept_dequeue(sk, newsock);
+		if (ch)
+			break;
+
+		if (!timeo) {
+			err = -EAGAIN;
+			break;
+		}
+
+		if (signal_pending(current)) {
+			err = sock_intr_errno(timeo);
+			break;
+		}
+
+		release_sock(sk);
+
+		timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+		lock_sock(sk);
+	}
+	remove_wait_queue(sk_sleep(sk), &wait);
+
+	if (err)
+		goto done;
+
+	newsock->state = SS_CONNECTED;
+
+	BT_DBG("new socket %p", ch);
+
+done:
+	release_sock(sk);
+	return err;
+}
+
+static int iso_sock_getname(struct socket *sock, struct sockaddr *addr,
+			    int peer)
+{
+	struct sockaddr_iso *sa = (struct sockaddr_iso *)addr;
+	struct sock *sk = sock->sk;
+
+	BT_DBG("sock %p, sk %p", sock, sk);
+
+	addr->sa_family = AF_BLUETOOTH;
+
+	if (peer) {
+		bacpy(&sa->iso_bdaddr, &iso_pi(sk)->dst);
+		sa->iso_bdaddr_type = iso_pi(sk)->dst_type;
+	} else {
+		bacpy(&sa->iso_bdaddr, &iso_pi(sk)->src);
+		sa->iso_bdaddr_type = iso_pi(sk)->src_type;
+	}
+
+	return sizeof(struct sockaddr_iso);
+}
+
+static int iso_sock_sendmsg(struct socket *sock, struct msghdr *msg,
+			    size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct iso_conn *conn = iso_pi(sk)->conn;
+	struct sk_buff *skb, **frag;
+	int err;
+
+	BT_DBG("sock %p, sk %p", sock, sk);
+
+	err = sock_error(sk);
+	if (err)
+		return err;
+
+	if (msg->msg_flags & MSG_OOB)
+		return -EOPNOTSUPP;
+
+	if (sk->sk_state != BT_CONNECTED)
+		return -ENOTCONN;
+
+	skb = bt_skb_sendmsg(sk, msg, len, conn->hcon->hdev->iso_mtu,
+			     HCI_ISO_DATA_HDR_SIZE, 0);
+	if (IS_ERR(skb))
+		return PTR_ERR(skb);
+
+	len -= skb->len;
+
+	BT_DBG("skb %p len %d", sk, skb->len);
+
+	/* Continuation fragments */
+	frag = &skb_shinfo(skb)->frag_list;
+	while (len) {
+		struct sk_buff *tmp;
+
+		tmp = bt_skb_sendmsg(sk, msg, len, conn->hcon->hdev->iso_mtu,
+				     0, 0);
+		if (IS_ERR(tmp)) {
+			kfree_skb(skb);
+			return PTR_ERR(tmp);
+		}
+
+		*frag = tmp;
+
+		len  -= tmp->len;
+
+		skb->len += tmp->len;
+		skb->data_len += tmp->len;
+
+		BT_DBG("frag %p len %d", *frag, tmp->len);
+
+		frag = &(*frag)->next;
+	}
+
+	lock_sock(sk);
+
+	if (sk->sk_state == BT_CONNECTED)
+		err = iso_send_frame(sk, skb);
+	else
+		err = -ENOTCONN;
+
+	release_sock(sk);
+
+	if (err < 0)
+		kfree_skb(skb);
+	return err;
+}
+
+static void iso_conn_defer_accept(struct hci_conn *conn)
+{
+	struct hci_cp_le_accept_cis cp;
+	struct hci_dev *hdev = conn->hdev;
+
+	BT_DBG("conn %p", conn);
+
+	conn->state = BT_CONFIG;
+
+	cp.handle = cpu_to_le16(conn->handle);
+
+	hci_send_cmd(hdev, HCI_OP_LE_ACCEPT_CIS, sizeof(cp), &cp);
+}
+
+static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg,
+			    size_t len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct iso_pinfo *pi = iso_pi(sk);
+	int err;
+
+	BT_DBG("sk %p", sk);
+
+	lock_sock(sk);
+
+	if (test_and_clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
+		switch (sk->sk_state) {
+		case BT_CONNECT2:
+			iso_conn_defer_accept(pi->conn->hcon);
+			sk->sk_state = BT_CONFIG;
+			release_sock(sk);
+			return 0;
+		case BT_CONNECT:
+			err = iso_connect(sk);
+			release_sock(sk);
+			return err;
+		}
+	}
+
+	release_sock(sk);
+
+	return bt_sock_recvmsg(sock, msg, len, flags);
+}
+
+static bool check_io_qos(struct bt_iso_io_qos *qos)
+{
+	/* If no PHY is enable SDU must be 0 */
+	if (!qos->phy && qos->sdu)
+		return false;
+
+	if (qos->interval && (qos->interval < 0xff || qos->interval > 0xfffff))
+		return false;
+
+	if (qos->latency && (qos->latency < 0x05 || qos->latency > 0xfa0))
+		return false;
+
+	if (qos->phy > BT_ISO_PHY_ANY)
+		return false;
+
+	return true;
+}
+
+static bool check_qos(struct bt_iso_qos *qos)
+{
+	/* CIS shall not be set */
+	if (qos->cis != BT_ISO_QOS_CIS_UNSET)
+		return false;
+
+	if (qos->sca > 0x07)
+		return false;
+
+	if (qos->packing > 0x01)
+		return false;
+
+	if (qos->framing > 0x01)
+		return false;
+
+	if (!check_io_qos(&qos->in))
+		return false;
+
+	if (!check_io_qos(&qos->out))
+		return false;
+
+	return true;
+}
+
+static int iso_sock_setsockopt(struct socket *sock, int level, int optname,
+			       sockptr_t optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	int len, err = 0;
+	struct bt_iso_qos qos;
+	u32 opt;
+
+	BT_DBG("sk %p", sk);
+
+	lock_sock(sk);
+
+	switch (optname) {
+	case BT_DEFER_SETUP:
+		if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (copy_from_sockptr(&opt, optval, sizeof(u32))) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (opt)
+			set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
+		else
+			clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
+		break;
+
+	case BT_ISO_QOS:
+		if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND &&
+		    sk->sk_state != BT_CONNECT2) {
+			err = -EINVAL;
+			break;
+		}
+
+		len = min_t(unsigned int, sizeof(qos), optlen);
+		if (len != sizeof(qos))
+			return -EINVAL;
+
+		memset(&qos, 0, sizeof(qos));
+
+		if (copy_from_sockptr(&qos, optval, len)) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (!check_qos(&qos)) {
+			err = -EINVAL;
+			break;
+		}
+
+		iso_pi(sk)->qos = qos;
+
+		break;
+
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+
+	release_sock(sk);
+	return err;
+}
+
+static int iso_sock_getsockopt(struct socket *sock, int level, int optname,
+			       char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	int len, err = 0;
+	struct bt_iso_qos qos;
+
+	BT_DBG("sk %p", sk);
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	lock_sock(sk);
+
+	switch (optname) {
+	case BT_DEFER_SETUP:
+		if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (put_user(test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags),
+			     (u32 __user *)optval))
+			err = -EFAULT;
+
+		break;
+
+	case BT_ISO_QOS:
+		if (sk->sk_state == BT_CONNECTED || sk->sk_state == BT_CONNECT2)
+			qos = iso_pi(sk)->conn->hcon->iso_qos;
+		else
+			qos = iso_pi(sk)->qos;
+
+		len = min_t(unsigned int, len, sizeof(qos));
+		if (copy_to_user(optval, (char *)&qos, len))
+			err = -EFAULT;
+
+		break;
+
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+
+	release_sock(sk);
+	return err;
+}
+
+static int iso_sock_shutdown(struct socket *sock, int how)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+
+	BT_DBG("sock %p, sk %p", sock, sk);
+
+	if (!sk)
+		return 0;
+
+	sock_hold(sk);
+	lock_sock(sk);
+
+	if (!sk->sk_shutdown) {
+		sk->sk_shutdown = SHUTDOWN_MASK;
+		iso_sock_clear_timer(sk);
+		__iso_sock_close(sk);
+
+		if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
+		    !(current->flags & PF_EXITING))
+			err = bt_sock_wait_state(sk, BT_CLOSED,
+						 sk->sk_lingertime);
+	}
+
+	release_sock(sk);
+	sock_put(sk);
+
+	return err;
+}
+
+static int iso_sock_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+
+	BT_DBG("sock %p, sk %p", sock, sk);
+
+	if (!sk)
+		return 0;
+
+	iso_sock_close(sk);
+
+	if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
+	    !(current->flags & PF_EXITING)) {
+		lock_sock(sk);
+		err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime);
+		release_sock(sk);
+	}
+
+	sock_orphan(sk);
+	iso_sock_kill(sk);
+	return err;
+}
+
+static void iso_sock_ready(struct sock *sk)
+{
+	BT_DBG("sk %p", sk);
+
+	if (!sk)
+		return;
+
+	lock_sock(sk);
+	iso_sock_clear_timer(sk);
+	sk->sk_state = BT_CONNECTED;
+	sk->sk_state_change(sk);
+	release_sock(sk);
+}
+
+struct iso_list_data {
+	struct hci_conn *hcon;
+	int count;
+};
+
+static void iso_conn_ready(struct iso_conn *conn)
+{
+	struct sock *parent;
+	struct sock *sk = conn->sk;
+
+	BT_DBG("conn %p", conn);
+
+	if (sk) {
+		iso_sock_ready(conn->sk);
+	} else {
+		iso_conn_lock(conn);
+
+		if (!conn->hcon) {
+			iso_conn_unlock(conn);
+			return;
+		}
+
+		parent = iso_get_sock_listen(&conn->hcon->src);
+		if (!parent) {
+			iso_conn_unlock(conn);
+			return;
+		}
+
+		lock_sock(parent);
+
+		sk = iso_sock_alloc(sock_net(parent), NULL,
+				    BTPROTO_ISO, GFP_ATOMIC, 0);
+		if (!sk) {
+			release_sock(parent);
+			iso_conn_unlock(conn);
+			return;
+		}
+
+		iso_sock_init(sk, parent);
+
+		bacpy(&iso_pi(sk)->src, &conn->hcon->src);
+		iso_pi(sk)->src_type = conn->hcon->src_type;
+		bacpy(&iso_pi(sk)->dst, &conn->hcon->dst);
+		iso_pi(sk)->dst_type = conn->hcon->dst_type;
+
+		hci_conn_hold(conn->hcon);
+		__iso_chan_add(conn, sk, parent);
+
+		if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags))
+			sk->sk_state = BT_CONNECT2;
+		else
+			sk->sk_state = BT_CONNECTED;
+
+		/* Wake up parent */
+		parent->sk_data_ready(parent);
+
+		release_sock(parent);
+
+		iso_conn_unlock(conn);
+	}
+}
+
+/* ----- ISO interface with lower layer (HCI) ----- */
+int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
+{
+	struct sock *sk;
+	int lm = 0;
+
+	BT_DBG("hdev %s, bdaddr %pMR", hdev->name, bdaddr);
+
+	/* Find listening sockets */
+	read_lock(&iso_sk_list.lock);
+	sk_for_each(sk, &iso_sk_list.head) {
+		if (sk->sk_state != BT_LISTEN)
+			continue;
+
+		if (!bacmp(&iso_pi(sk)->src, &hdev->bdaddr) ||
+		    !bacmp(&iso_pi(sk)->src, BDADDR_ANY)) {
+			lm |= HCI_LM_ACCEPT;
+
+			if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags))
+				*flags |= HCI_PROTO_DEFER;
+			break;
+		}
+	}
+	read_unlock(&iso_sk_list.lock);
+
+	return lm;
+}
+
+static void iso_connect_cfm(struct hci_conn *hcon, __u8 status)
+{
+	if (hcon->type != ISO_LINK) {
+		if (hcon->type != LE_LINK)
+			return;
+
+		/* Check if LE link has failed */
+		if (status) {
+			if (hcon->link)
+				iso_conn_del(hcon->link, bt_to_errno(status));
+			return;
+		}
+
+		/* Create CIS if pending */
+		hci_le_create_cis(hcon);
+		return;
+	}
+
+	BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status);
+
+	if (!status) {
+		struct iso_conn *conn;
+
+		conn = iso_conn_add(hcon);
+		if (conn)
+			iso_conn_ready(conn);
+	} else {
+		iso_conn_del(hcon, bt_to_errno(status));
+	}
+}
+
+static void iso_disconn_cfm(struct hci_conn *hcon, __u8 reason)
+{
+	if (hcon->type != ISO_LINK)
+		return;
+
+	BT_DBG("hcon %p reason %d", hcon, reason);
+
+	iso_conn_del(hcon, bt_to_errno(reason));
+}
+
+void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
+{
+	struct iso_conn *conn = hcon->iso_data;
+	struct hci_iso_data_hdr *hdr;
+	__u16 pb, ts, len;
+
+	if (!conn)
+		goto drop;
+
+	pb     = hci_iso_flags_pb(flags);
+	ts     = hci_iso_flags_ts(flags);
+
+	BT_DBG("conn %p len %d pb 0x%x ts 0x%x", conn, skb->len, pb, ts);
+
+	switch (pb) {
+	case ISO_START:
+	case ISO_SINGLE:
+		if (conn->rx_len) {
+			BT_ERR("Unexpected start frame (len %d)", skb->len);
+			kfree_skb(conn->rx_skb);
+			conn->rx_skb = NULL;
+			conn->rx_len = 0;
+		}
+
+		if (ts) {
+			/* TODO: add timestamp to the packet? */
+			hdr = skb_pull_data(skb, HCI_ISO_TS_DATA_HDR_SIZE);
+			if (!hdr) {
+				BT_ERR("Frame is too short (len %d)", skb->len);
+				goto drop;
+			}
+
+		} else {
+			hdr = skb_pull_data(skb, HCI_ISO_DATA_HDR_SIZE);
+			if (!hdr) {
+				BT_ERR("Frame is too short (len %d)", skb->len);
+				goto drop;
+			}
+		}
+
+		len    = __le16_to_cpu(hdr->slen);
+		flags  = hci_iso_data_flags(len);
+		len    = hci_iso_data_len(len);
+
+		BT_DBG("Start: total len %d, frag len %d flags 0x%4.4x", len,
+		       skb->len, flags);
+
+		if (len == skb->len) {
+			/* Complete frame received */
+			iso_recv_frame(conn, skb);
+			return;
+		}
+
+		if (pb == ISO_SINGLE) {
+			BT_ERR("Frame malformed (len %d, expected len %d)",
+			       skb->len, len);
+			goto drop;
+		}
+
+		if (skb->len > len) {
+			BT_ERR("Frame is too long (len %d, expected len %d)",
+			       skb->len, len);
+			goto drop;
+		}
+
+		/* Allocate skb for the complete frame (with header) */
+		conn->rx_skb = bt_skb_alloc(len, GFP_KERNEL);
+		if (!conn->rx_skb)
+			goto drop;
+
+		skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
+					  skb->len);
+		conn->rx_len = len - skb->len;
+		break;
+
+	case ISO_CONT:
+		BT_DBG("Cont: frag len %d (expecting %d)", skb->len,
+		       conn->rx_len);
+
+		if (!conn->rx_len) {
+			BT_ERR("Unexpected continuation frame (len %d)",
+			       skb->len);
+			goto drop;
+		}
+
+		if (skb->len > conn->rx_len) {
+			BT_ERR("Fragment is too long (len %d, expected %d)",
+			       skb->len, conn->rx_len);
+			kfree_skb(conn->rx_skb);
+			conn->rx_skb = NULL;
+			conn->rx_len = 0;
+			goto drop;
+		}
+
+		skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
+					  skb->len);
+		conn->rx_len -= skb->len;
+		return;
+
+	case ISO_END:
+		skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
+					  skb->len);
+		conn->rx_len -= skb->len;
+
+		if (!conn->rx_len) {
+			struct sk_buff *rx_skb = conn->rx_skb;
+
+			/* Complete frame received. iso_recv_frame
+			 * takes ownership of the skb so set the global
+			 * rx_skb pointer to NULL first.
+			 */
+			conn->rx_skb = NULL;
+			iso_recv_frame(conn, rx_skb);
+		}
+		break;
+	}
+
+drop:
+	kfree_skb(skb);
+}
+
+static struct hci_cb iso_cb = {
+	.name		= "ISO",
+	.connect_cfm	= iso_connect_cfm,
+	.disconn_cfm	= iso_disconn_cfm,
+};
+
+static int iso_debugfs_show(struct seq_file *f, void *p)
+{
+	struct sock *sk;
+
+	read_lock(&iso_sk_list.lock);
+
+	sk_for_each(sk, &iso_sk_list.head) {
+		seq_printf(f, "%pMR %pMR %d\n", &iso_pi(sk)->src,
+			   &iso_pi(sk)->dst, sk->sk_state);
+	}
+
+	read_unlock(&iso_sk_list.lock);
+
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(iso_debugfs);
+
+static struct dentry *iso_debugfs;
+
+static const struct proto_ops iso_sock_ops = {
+	.family		= PF_BLUETOOTH,
+	.owner		= THIS_MODULE,
+	.release	= iso_sock_release,
+	.bind		= iso_sock_bind,
+	.connect	= iso_sock_connect,
+	.listen		= iso_sock_listen,
+	.accept		= iso_sock_accept,
+	.getname	= iso_sock_getname,
+	.sendmsg	= iso_sock_sendmsg,
+	.recvmsg	= iso_sock_recvmsg,
+	.poll		= bt_sock_poll,
+	.ioctl		= bt_sock_ioctl,
+	.mmap		= sock_no_mmap,
+	.socketpair	= sock_no_socketpair,
+	.shutdown	= iso_sock_shutdown,
+	.setsockopt	= iso_sock_setsockopt,
+	.getsockopt	= iso_sock_getsockopt
+};
+
+static const struct net_proto_family iso_sock_family_ops = {
+	.family	= PF_BLUETOOTH,
+	.owner	= THIS_MODULE,
+	.create	= iso_sock_create,
+};
+
+static bool iso_inited;
+
+bool iso_enabled(void)
+{
+	return iso_inited;
+}
+
+int iso_init(void)
+{
+	int err;
+
+	BUILD_BUG_ON(sizeof(struct sockaddr_iso) > sizeof(struct sockaddr));
+
+	if (iso_inited)
+		return -EALREADY;
+
+	err = proto_register(&iso_proto, 0);
+	if (err < 0)
+		return err;
+
+	err = bt_sock_register(BTPROTO_ISO, &iso_sock_family_ops);
+	if (err < 0) {
+		BT_ERR("ISO socket registration failed");
+		goto error;
+	}
+
+	err = bt_procfs_init(&init_net, "iso", &iso_sk_list, NULL);
+	if (err < 0) {
+		BT_ERR("Failed to create ISO proc file");
+		bt_sock_unregister(BTPROTO_ISO);
+		goto error;
+	}
+
+	BT_INFO("ISO socket layer initialized");
+
+	hci_register_cb(&iso_cb);
+
+	if (IS_ERR_OR_NULL(bt_debugfs))
+		return 0;
+
+	if (!iso_debugfs) {
+		iso_debugfs = debugfs_create_file("iso", 0444, bt_debugfs,
+						  NULL, &iso_debugfs_fops);
+	}
+
+	iso_inited = true;
+
+	return 0;
+
+error:
+	proto_unregister(&iso_proto);
+	return err;
+}
+
+int iso_exit(void)
+{
+	if (!iso_inited)
+		return -EALREADY;
+
+	bt_procfs_cleanup(&init_net, "iso");
+
+	debugfs_remove(iso_debugfs);
+	iso_debugfs = NULL;
+
+	hci_unregister_cb(&iso_cb);
+
+	bt_sock_unregister(BTPROTO_ISO);
+
+	proto_unregister(&iso_proto);
+
+	iso_inited = false;
+
+	return 0;
+}
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index a97fafe7fd4a..08688117d700 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -3985,10 +3985,16 @@ static const u8 rpa_resolution_uuid[16] = {
 	0xea, 0x11, 0x73, 0xc2, 0x48, 0xa1, 0xc0, 0x15,
 };
 
+/* 6fbaf188-05e0-496a-9885-d6ddfdb4e03e */
+static const u8 iso_socket_uuid[16] = {
+	0x3e, 0xe0, 0xb4, 0xfd, 0xdd, 0xd6, 0x85, 0x98,
+	0x6a, 0x49, 0xe0, 0x05, 0x88, 0xf1, 0xba, 0x6f,
+};
+
 static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev,
 				  void *data, u16 data_len)
 {
-	char buf[102];   /* Enough space for 5 features: 2 + 20 * 5 */
+	char buf[122];   /* Enough space for 6 features: 2 + 20 * 6 */
 	struct mgmt_rp_read_exp_features_info *rp = (void *)buf;
 	u16 idx = 0;
 	u32 flags;
@@ -4052,6 +4058,13 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev,
 		idx++;
 	}
 
+	if (IS_ENABLED(CONFIG_BT_LE)) {
+		flags = iso_enabled() ? BIT(0) : 0;
+		memcpy(rp->features[idx].uuid, iso_socket_uuid, 16);
+		rp->features[idx].flags = cpu_to_le32(flags);
+		idx++;
+	}
+
 	rp->feature_count = cpu_to_le16(idx);
 
 	/* After reading the experimental features information, enable
@@ -4444,6 +4457,57 @@ static int set_le_simultaneous_roles_func(struct sock *sk, struct hci_dev *hdev,
 	return err;
 }
 
+#ifdef CONFIG_BT_LE
+static int set_iso_socket_func(struct sock *sk, struct hci_dev *hdev,
+			       struct mgmt_cp_set_exp_feature *cp, u16 data_len)
+{
+	struct mgmt_rp_set_exp_feature rp;
+	bool val, changed = false;
+	int err;
+
+	/* Command requires to use the non-controller index */
+	if (hdev)
+		return mgmt_cmd_status(sk, hdev->id,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_INVALID_INDEX);
+
+	/* Parameters are limited to a single octet */
+	if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1)
+		return mgmt_cmd_status(sk, MGMT_INDEX_NONE,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_INVALID_PARAMS);
+
+	/* Only boolean on/off is supported */
+	if (cp->param[0] != 0x00 && cp->param[0] != 0x01)
+		return mgmt_cmd_status(sk, MGMT_INDEX_NONE,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_INVALID_PARAMS);
+
+	val = cp->param[0] ? true : false;
+	if (val)
+		err = iso_init();
+	else
+		err = iso_exit();
+
+	if (!err)
+		changed = true;
+
+	memcpy(rp.uuid, iso_socket_uuid, 16);
+	rp.flags = cpu_to_le32(val ? BIT(0) : 0);
+
+	hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS);
+
+	err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE,
+				MGMT_OP_SET_EXP_FEATURE, 0,
+				&rp, sizeof(rp));
+
+	if (changed)
+		exp_feature_changed(hdev, iso_socket_uuid, val, sk);
+
+	return err;
+}
+#endif
+
 static const struct mgmt_exp_feature {
 	const u8 *uuid;
 	int (*set_func)(struct sock *sk, struct hci_dev *hdev,
@@ -4457,6 +4521,9 @@ static const struct mgmt_exp_feature {
 	EXP_FEAT(quality_report_uuid, set_quality_report_func),
 	EXP_FEAT(offload_codecs_uuid, set_offload_codec_func),
 	EXP_FEAT(le_simultaneous_roles_uuid, set_le_simultaneous_roles_func),
+#ifdef CONFIG_BT_LE
+	EXP_FEAT(iso_socket_uuid, set_iso_socket_func),
+#endif
 
 	/* end with a null feature */
 	EXP_FEAT(NULL, NULL)
-- 
cgit 


From eca0ae4aea66914515e5e3098ea051b518ee5316 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Wed, 9 Mar 2022 13:22:20 -0800
Subject: Bluetooth: Add initial implementation of BIS connections

This adds initial support for BIS/BIG which includes:

== Broadcaster role: Setup a periodic advertising and create a BIG ==

> tools/isotest -s 00:00:00:00:00:00
isotest[63]: Connected [00:00:00:00:00:00]
isotest[63]: QoS BIG 0x00 BIS 0x00 Packing 0x00 Framing 0x00]
isotest[63]: Output QoS [Interval 10000 us Latency 10 ms SDU 40 PHY 0x02
RTN 2]
isotest[63]: Sending ...
isotest[63]: Number of packets: 1
isotest[63]: Socket jitter buffer: 80 buffer
< HCI Command: LE Set Perio.. (0x08|0x003e) plen 7
...
> HCI Event: Command Complete (0x0e) plen 4
      LE Set Periodic Advertising Parameters (0x08|0x003e) ncmd 1
        Status: Success (0x00)
< HCI Command: LE Set Perio.. (0x08|0x003f) plen 7
...
> HCI Event: Command Complete (0x0e) plen 4
      LE Set Periodic Advertising Data (0x08|0x003f) ncmd 1
        Status: Success (0x00)
< HCI Command: LE Set Perio.. (0x08|0x0040) plen 2
...
> HCI Event: Command Complete (0x0e) plen 4
      LE Set Periodic Advertising Enable (0x08|0x0040) ncmd 1
        Status: Success (0x00)
< HCI Command: LE Create B.. (0x08|0x0068) plen 31
...
> HCI Event: Command Status (0x0f) plen 4
      LE Create Broadcast Isochronous Group (0x08|0x0068) ncmd 1
        Status: Success (0x00)
> HCI Event: LE Meta Event (0x3e) plen 21
      LE Broadcast Isochronous Group Complete (0x1b)
      ...

== Broadcast Receiver role: Create a PA Sync and BIG Sync ==

> tools/isotest -i hci1 -d 00:AA:01:00:00:00
isotest[66]: Waiting for connection 00:AA:01:00:00:00...
< HCI Command: LE Periodic Advert.. (0x08|0x0044) plen 14
...
> HCI Event: Command Status (0x0f) plen 4
      LE Periodic Advertising Create Sync (0x08|0x0044) ncmd 1
        Status: Success (0x00)
< HCI Command: LE Set Extended Sca.. (0x08|0x0041) plen 8
...
> HCI Event: Command Complete (0x0e) plen 4
      LE Set Extended Scan Parameters (0x08|0x0041) ncmd 1
        Status: Success (0x00)
< HCI Command: LE Set Extended Sca.. (0x08|0x0042) plen 6
...
> HCI Event: Command Complete (0x0e) plen 4
      LE Set Extended Scan Enable (0x08|0x0042) ncmd 1
        Status: Success (0x00)
> HCI Event: LE Meta Event (0x3e) plen 29
      LE Extended Advertising Report (0x0d)
      ...
> HCI Event: LE Meta Event (0x3e) plen 16
      LE Periodic Advertising Sync Established (0x0e)
      ...
< HCI Command: LE Broadcast Isoch.. (0x08|0x006b) plen 25
...
> HCI Event: Command Status (0x0f) plen 4
      LE Broadcast Isochronous Group Create Sync (0x08|0x006b) ncmd 1
        Status: Success (0x00)
> HCI Event: LE Meta Event (0x3e) plen 17
      LE Broadcast Isochronous Group Sync Estabilished (0x1d)
      ...

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/bluetooth.h |  18 +-
 include/net/bluetooth/hci.h       | 162 ++++++++++-
 include/net/bluetooth/hci_core.h  |  89 +++++-
 include/net/bluetooth/hci_sync.h  |  12 +-
 net/bluetooth/eir.c               |  21 ++
 net/bluetooth/eir.h               |   1 +
 net/bluetooth/hci_conn.c          | 561 ++++++++++++++++++++++++++++++++++----
 net/bluetooth/hci_core.c          |  80 ++++--
 net/bluetooth/hci_event.c         | 206 ++++++++++++++
 net/bluetooth/hci_request.c       |  36 ++-
 net/bluetooth/hci_request.h       |   9 +
 net/bluetooth/hci_sync.c          | 249 ++++++++++++++++-
 net/bluetooth/mgmt.c              |  15 +-
 13 files changed, 1333 insertions(+), 126 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index df2c006fc6a9..bcc6e098f1f6 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -160,6 +160,9 @@ struct bt_voice {
 #define BT_ISO_QOS_CIG_UNSET	0xff
 #define BT_ISO_QOS_CIS_UNSET	0xff
 
+#define BT_ISO_QOS_BIG_UNSET	0xff
+#define BT_ISO_QOS_BIS_UNSET	0xff
+
 struct bt_iso_io_qos {
 	__u32 interval;
 	__u16 latency;
@@ -169,9 +172,18 @@ struct bt_iso_io_qos {
 };
 
 struct bt_iso_qos {
-	__u8  cig;
-	__u8  cis;
-	__u8  sca;
+	union {
+		__u8  cig;
+		__u8  big;
+	};
+	union {
+		__u8  cis;
+		__u8  bis;
+	};
+	union {
+		__u8  sca;
+		__u8  sync_interval;
+	};
 	__u8  packing;
 	__u8  framing;
 	struct bt_iso_io_qos in;
diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 027f1bc6e4f1..cf29511b25a8 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -316,6 +316,7 @@ enum {
 	HCI_USER_CHANNEL,
 	HCI_EXT_CONFIGURED,
 	HCI_LE_ADV,
+	HCI_LE_PER_ADV,
 	HCI_LE_SCAN,
 	HCI_SSP_ENABLED,
 	HCI_SC_ENABLED,
@@ -338,6 +339,7 @@ enum {
 	HCI_LE_SCAN_INTERRUPTED,
 	HCI_WIDEBAND_SPEECH_ENABLED,
 	HCI_EVENT_FILTER_CONFIGURED,
+	HCI_PA_SYNC,
 
 	HCI_DUT_MODE,
 	HCI_VENDOR_DIAG,
@@ -519,9 +521,11 @@ enum {
 #define HCI_LE_PHY_2M			0x01
 #define HCI_LE_PHY_CODED		0x08
 #define HCI_LE_EXT_ADV			0x10
+#define HCI_LE_PERIODIC_ADV		0x20
 #define HCI_LE_CHAN_SEL_ALG2		0x40
 #define HCI_LE_CIS_CENTRAL		0x10
 #define HCI_LE_CIS_PERIPHERAL		0x20
+#define HCI_LE_ISO_BROADCASTER		0x40
 
 /* Connection modes */
 #define HCI_CM_ACTIVE	0x0000
@@ -1865,6 +1869,22 @@ struct hci_cp_le_ext_conn_param {
 	__le16 max_ce_len;
 } __packed;
 
+#define HCI_OP_LE_PA_CREATE_SYNC	0x2044
+struct hci_cp_le_pa_create_sync {
+	__u8      options;
+	__u8      sid;
+	__u8      addr_type;
+	bdaddr_t  addr;
+	__le16    skip;
+	__le16    sync_timeout;
+	__u8      sync_cte_type;
+} __packed;
+
+#define HCI_OP_LE_PA_TERM_SYNC		0x2046
+struct hci_cp_le_pa_term_sync {
+	__le16    handle;
+} __packed;
+
 #define HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS	0x203b
 struct hci_rp_le_read_num_supported_adv_sets {
 	__u8  status;
@@ -1899,13 +1919,6 @@ struct hci_rp_le_set_ext_adv_params {
 	__u8  tx_power;
 } __packed;
 
-#define HCI_OP_LE_SET_EXT_ADV_ENABLE		0x2039
-struct hci_cp_le_set_ext_adv_enable {
-	__u8  enable;
-	__u8  num_of_sets;
-	__u8  data[];
-} __packed;
-
 struct hci_cp_ext_adv_set {
 	__u8  handle;
 	__le16 duration;
@@ -1932,6 +1945,37 @@ struct hci_cp_le_set_ext_scan_rsp_data {
 	__u8  data[];
 } __packed;
 
+#define HCI_OP_LE_SET_EXT_ADV_ENABLE		0x2039
+struct hci_cp_le_set_ext_adv_enable {
+	__u8  enable;
+	__u8  num_of_sets;
+	__u8  data[];
+} __packed;
+
+#define HCI_OP_LE_SET_PER_ADV_PARAMS		0x203e
+struct hci_cp_le_set_per_adv_params {
+	__u8      handle;
+	__le16    min_interval;
+	__le16    max_interval;
+	__le16    periodic_properties;
+} __packed;
+
+#define HCI_MAX_PER_AD_LENGTH	252
+
+#define HCI_OP_LE_SET_PER_ADV_DATA		0x203f
+struct hci_cp_le_set_per_adv_data {
+	__u8  handle;
+	__u8  operation;
+	__u8  length;
+	__u8  data[];
+} __packed;
+
+#define HCI_OP_LE_SET_PER_ADV_ENABLE		0x2040
+struct hci_cp_le_set_per_adv_enable {
+	__u8  enable;
+	__u8  handle;
+} __packed;
+
 #define LE_SET_ADV_DATA_OP_COMPLETE	0x03
 
 #define LE_SET_ADV_DATA_NO_FRAG		0x01
@@ -2043,6 +2087,49 @@ struct hci_cp_le_reject_cis {
 	__u8    reason;
 } __packed;
 
+#define HCI_OP_LE_CREATE_BIG			0x2068
+struct hci_bis {
+	__u8    sdu_interval[3];
+	__le16  sdu;
+	__le16  latency;
+	__u8    rtn;
+	__u8    phy;
+	__u8    packing;
+	__u8    framing;
+	__u8    encryption;
+	__u8    bcode[16];
+} __packed;
+
+struct hci_cp_le_create_big {
+	__u8    handle;
+	__u8    adv_handle;
+	__u8    num_bis;
+	struct hci_bis bis;
+} __packed;
+
+#define HCI_OP_LE_TERM_BIG			0x206a
+struct hci_cp_le_term_big {
+	__u8    handle;
+	__u8    reason;
+} __packed;
+
+#define HCI_OP_LE_BIG_CREATE_SYNC		0x206b
+struct hci_cp_le_big_create_sync {
+	__u8    handle;
+	__le16  sync_handle;
+	__u8    encryption;
+	__u8    bcode[16];
+	__u8    mse;
+	__le16  timeout;
+	__u8    num_bis;
+	__u8    bis[0];
+} __packed;
+
+#define HCI_OP_LE_BIG_TERM_SYNC			0x206c
+struct hci_cp_le_big_term_sync {
+	__u8    handle;
+} __packed;
+
 #define HCI_OP_LE_SETUP_ISO_PATH		0x206e
 struct hci_cp_le_setup_iso_path {
 	__le16  handle;
@@ -2595,6 +2682,18 @@ struct hci_ev_le_ext_adv_report {
 	struct hci_ev_le_ext_adv_info info[];
 } __packed;
 
+#define HCI_EV_LE_PA_SYNC_ESTABLISHED	0x0e
+struct hci_ev_le_pa_sync_established {
+	__u8      status;
+	__le16    handle;
+	__u8      sid;
+	__u8      bdaddr_type;
+	bdaddr_t  bdaddr;
+	__u8      phy;
+	__le16    interval;
+	__u8      clock_accuracy;
+} __packed;
+
 #define HCI_EV_LE_ENHANCED_CONN_COMPLETE    0x0a
 struct hci_ev_le_enh_conn_complete {
 	__u8      status;
@@ -2646,6 +2745,55 @@ struct hci_evt_le_cis_req {
 	__u8  cis_id;
 } __packed;
 
+#define HCI_EVT_LE_CREATE_BIG_COMPLETE	0x1b
+struct hci_evt_le_create_big_complete {
+	__u8    status;
+	__u8    handle;
+	__u8    sync_delay[3];
+	__u8    transport_delay[3];
+	__u8    phy;
+	__u8    nse;
+	__u8    bn;
+	__u8    pto;
+	__u8    irc;
+	__le16  max_pdu;
+	__le16  interval;
+	__u8    num_bis;
+	__le16  bis_handle[];
+} __packed;
+
+#define HCI_EVT_LE_BIG_SYNC_ESTABILISHED 0x1d
+struct hci_evt_le_big_sync_estabilished {
+	__u8    status;
+	__u8    handle;
+	__u8    latency[3];
+	__u8    nse;
+	__u8    bn;
+	__u8    pto;
+	__u8    irc;
+	__le16  max_pdu;
+	__le16  interval;
+	__u8    num_bis;
+	__le16  bis[];
+} __packed;
+
+#define HCI_EVT_LE_BIG_INFO_ADV_REPORT	0x22
+struct hci_evt_le_big_info_adv_report {
+	__le16  sync_handle;
+	__u8    num_bis;
+	__u8    nse;
+	__le16  iso_interval;
+	__u8    bn;
+	__u8    pto;
+	__u8    irc;
+	__le16  max_pdu;
+	__u8    sdu_interval[3];
+	__le16  max_sdu;
+	__u8    phy;
+	__u8    framing;
+	__u8    encryption;
+} __packed;
+
 #define HCI_EV_VENDOR			0xff
 
 /* Internal events generated by Bluetooth stack */
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 3457c24e9b19..e7862903187d 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -235,8 +235,9 @@ struct oob_data {
 
 struct adv_info {
 	struct list_head list;
-	bool enabled;
-	bool pending;
+	bool	enabled;
+	bool	pending;
+	bool	periodic;
 	__u8	instance;
 	__u32	flags;
 	__u16	timeout;
@@ -248,6 +249,8 @@ struct adv_info {
 	__u16	scan_rsp_len;
 	__u8	scan_rsp_data[HCI_MAX_EXT_AD_LENGTH];
 	bool	scan_rsp_changed;
+	__u16	per_adv_data_len;
+	__u8	per_adv_data[HCI_MAX_PER_AD_LENGTH];
 	__s8	tx_power;
 	__u32   min_interval;
 	__u32   max_interval;
@@ -594,6 +597,8 @@ struct hci_dev {
 	__u8			adv_data_len;
 	__u8			scan_rsp_data[HCI_MAX_EXT_AD_LENGTH];
 	__u8			scan_rsp_data_len;
+	__u8			per_adv_data[HCI_MAX_PER_AD_LENGTH];
+	__u8			per_adv_data_len;
 
 	struct list_head	adv_instances;
 	unsigned int		adv_instance_cnt;
@@ -679,6 +684,7 @@ struct hci_conn {
 	__u8		resp_addr_type;
 	__u8		adv_instance;
 	__u16		handle;
+	__u16		sync_handle;
 	__u16		state;
 	__u8		mode;
 	__u8		type;
@@ -709,6 +715,8 @@ struct hci_conn {
 	__u16		le_supv_timeout;
 	__u8		le_adv_data[HCI_MAX_AD_LENGTH];
 	__u8		le_adv_data_len;
+	__u8		le_per_adv_data[HCI_MAX_PER_AD_LENGTH];
+	__u8		le_per_adv_data_len;
 	__u8		le_tx_phy;
 	__u8		le_rx_phy;
 	__s8		rssi;
@@ -942,6 +950,7 @@ enum {
 	HCI_CONN_NEW_LINK_KEY,
 	HCI_CONN_SCANNING,
 	HCI_CONN_AUTH_FAILURE,
+	HCI_CONN_PER_ADV,
 };
 
 static inline bool hci_conn_ssp_enabled(struct hci_conn *conn)
@@ -1060,6 +1069,29 @@ static inline __u8 hci_conn_lookup_type(struct hci_dev *hdev, __u16 handle)
 	return type;
 }
 
+static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev,
+							bdaddr_t *ba,
+							__u8 big, __u8 bis)
+{
+	struct hci_conn_hash *h = &hdev->conn_hash;
+	struct hci_conn  *c;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(c, &h->list, list) {
+		if (bacmp(&c->dst, ba) || c->type != ISO_LINK)
+			continue;
+
+		if (c->iso_qos.big == big && c->iso_qos.bis == bis) {
+			rcu_read_unlock();
+			return c;
+		}
+	}
+	rcu_read_unlock();
+
+	return NULL;
+}
+
 static inline struct hci_conn *hci_conn_hash_lookup_handle(struct hci_dev *hdev,
 								__u16 handle)
 {
@@ -1170,6 +1202,29 @@ static inline struct hci_conn *hci_conn_hash_lookup_cig(struct hci_dev *hdev,
 	return NULL;
 }
 
+static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev,
+							__u8 handle)
+{
+	struct hci_conn_hash *h = &hdev->conn_hash;
+	struct hci_conn  *c;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(c, &h->list, list) {
+		if (bacmp(&c->dst, BDADDR_ANY) || c->type != ISO_LINK)
+			continue;
+
+		if (handle == c->iso_qos.big) {
+			rcu_read_unlock();
+			return c;
+		}
+	}
+
+	rcu_read_unlock();
+
+	return NULL;
+}
+
 static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev,
 							__u8 type, __u16 state)
 {
@@ -1264,6 +1319,13 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst,
 			      __u8 dst_type, struct bt_iso_qos *qos);
 struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst,
 				 __u8 dst_type, struct bt_iso_qos *qos);
+struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst,
+				 __u8 dst_type, struct bt_iso_qos *qos,
+				 __u8 data_len, __u8 *data);
+int hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type,
+		       __u8 sid);
+int hci_le_big_create_sync(struct hci_dev *hdev, struct bt_iso_qos *qos,
+			   __u16 sync_handle, __u8 num_bis, __u8 bis[]);
 int hci_conn_check_link_mode(struct hci_conn *conn);
 int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level);
 int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type,
@@ -1510,11 +1572,14 @@ int hci_remove_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr,
 void hci_adv_instances_clear(struct hci_dev *hdev);
 struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance);
 struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance);
-int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags,
-			 u16 adv_data_len, u8 *adv_data,
-			 u16 scan_rsp_len, u8 *scan_rsp_data,
-			 u16 timeout, u16 duration, s8 tx_power,
-			 u32 min_interval, u32 max_interval);
+struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance,
+				      u32 flags, u16 adv_data_len, u8 *adv_data,
+				      u16 scan_rsp_len, u8 *scan_rsp_data,
+				      u16 timeout, u16 duration, s8 tx_power,
+				      u32 min_interval, u32 max_interval);
+struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance,
+				      u32 flags, u8 data_len, u8 *data,
+				      u32 min_interval, u32 max_interval);
 int hci_set_adv_instance_data(struct hci_dev *hdev, u8 instance,
 			 u16 adv_data_len, u8 *adv_data,
 			 u16 scan_rsp_len, u8 *scan_rsp_data);
@@ -1631,14 +1696,18 @@ void hci_conn_del_sysfs(struct hci_conn *conn);
 #define use_enhanced_conn_complete(dev) (ll_privacy_capable(dev) || \
 					 ext_adv_capable(dev))
 
-/* CIS Master/Slave support */
-#define iso_capable(dev) (cis_capable(dev))
+/* Periodic advertising support */
+#define per_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_PERIODIC_ADV))
+
+/* CIS Master/Slave and BIS support */
+#define iso_capable(dev) (cis_capable(dev) || bis_capable(dev))
 #define cis_capable(dev) \
 	(cis_central_capable(dev) || cis_peripheral_capable(dev))
 #define cis_central_capable(dev) \
 	((dev)->le_features[3] & HCI_LE_CIS_CENTRAL)
 #define cis_peripheral_capable(dev) \
 	((dev)->le_features[3] & HCI_LE_CIS_PERIPHERAL)
+#define bis_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_BROADCASTER)
 
 /* ----- HCI protocols ----- */
 #define HCI_PROTO_DEFER             0x01
@@ -1926,6 +1995,8 @@ void hci_mgmt_chan_unregister(struct hci_mgmt_chan *c);
 #define DISCOV_LE_RESTART_DELAY		msecs_to_jiffies(200)	/* msec */
 #define DISCOV_LE_FAST_ADV_INT_MIN	0x00A0	/* 100 msec */
 #define DISCOV_LE_FAST_ADV_INT_MAX	0x00F0	/* 150 msec */
+#define DISCOV_LE_PER_ADV_INT_MIN	0x00A0	/* 200 msec */
+#define DISCOV_LE_PER_ADV_INT_MAX	0x00A0	/* 200 msec */
 
 #define NAME_RESOLVE_DURATION		msecs_to_jiffies(10240)	/* 10.24 sec */
 
diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index c243cb6869d8..3843f5060c73 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -65,6 +65,10 @@ int hci_enable_ext_advertising_sync(struct hci_dev *hdev, u8 instance);
 int hci_enable_advertising_sync(struct hci_dev *hdev);
 int hci_enable_advertising(struct hci_dev *hdev);
 
+int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 data_len,
+			   u8 *data, u32 flags, u16 min_interval,
+			   u16 max_interval, u16 sync_interval);
+
 int hci_remove_advertising_sync(struct hci_dev *hdev, struct sock *sk,
 				u8 instance, bool force);
 int hci_disable_advertising_sync(struct hci_dev *hdev);
@@ -83,6 +87,7 @@ int hci_update_scan(struct hci_dev *hdev);
 int hci_write_le_host_supported_sync(struct hci_dev *hdev, u8 le, u8 simul);
 int hci_remove_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance,
 				     struct sock *sk);
+int hci_remove_ext_adv_instance(struct hci_dev *hdev, u8 instance);
 struct sk_buff *hci_read_local_oob_data_sync(struct hci_dev *hdev, bool ext,
 					     struct sock *sk);
 
@@ -111,4 +116,9 @@ int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason);
 int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn);
 
 int hci_le_remove_cig_sync(struct hci_dev *hdev, u8 handle);
-int hci_le_remove_cig(struct hci_dev *hdev, u8 handle);
+
+int hci_le_terminate_big_sync(struct hci_dev *hdev, u8 handle, u8 reason);
+
+int hci_le_big_terminate_sync(struct hci_dev *hdev, u8 handle);
+
+int hci_le_pa_terminate_sync(struct hci_dev *hdev, u16 handle);
diff --git a/net/bluetooth/eir.c b/net/bluetooth/eir.c
index 776d27f7e18d..8a85f6cdfbc1 100644
--- a/net/bluetooth/eir.c
+++ b/net/bluetooth/eir.c
@@ -236,6 +236,27 @@ void eir_create(struct hci_dev *hdev, u8 *data)
 	ptr = create_uuid128_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data));
 }
 
+u8 eir_create_per_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
+{
+	struct adv_info *adv = NULL;
+	u8 ad_len = 0;
+
+	/* Return 0 when the current instance identifier is invalid. */
+	if (instance) {
+		adv = hci_find_adv_instance(hdev, instance);
+		if (!adv)
+			return 0;
+	}
+
+	if (adv) {
+		memcpy(ptr, adv->per_adv_data, adv->per_adv_data_len);
+		ad_len += adv->per_adv_data_len;
+		ptr += adv->per_adv_data_len;
+	}
+
+	return ad_len;
+}
+
 u8 eir_create_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
 {
 	struct adv_info *adv = NULL;
diff --git a/net/bluetooth/eir.h b/net/bluetooth/eir.h
index 62f2374078f2..0df19f2f4af9 100644
--- a/net/bluetooth/eir.h
+++ b/net/bluetooth/eir.h
@@ -11,6 +11,7 @@ void eir_create(struct hci_dev *hdev, u8 *data);
 
 u8 eir_create_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr);
 u8 eir_create_scan_rsp(struct hci_dev *hdev, u8 instance, u8 *ptr);
+u8 eir_create_per_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr);
 
 u8 eir_append_local_name(struct hci_dev *hdev, u8 *eir, u8 ad_len);
 u8 eir_append_appearance(struct hci_dev *hdev, u8 *ptr, u8 ad_len);
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 97568723ddb0..f54864e19866 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -30,10 +30,13 @@
 #include <net/bluetooth/bluetooth.h>
 #include <net/bluetooth/hci_core.h>
 #include <net/bluetooth/l2cap.h>
+#include <net/bluetooth/iso.h>
+#include <net/bluetooth/mgmt.h>
 
 #include "hci_request.h"
 #include "smp.h"
 #include "a2mp.h"
+#include "eir.h"
 
 struct sco_param {
 	u16 pkt_type;
@@ -684,6 +687,199 @@ static void le_conn_timeout(struct work_struct *work)
 	hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM);
 }
 
+struct iso_list_data {
+	union {
+		u8  cig;
+		u8  big;
+	};
+	union {
+		u8  cis;
+		u8  bis;
+		u16 sync_handle;
+	};
+	int count;
+	struct {
+		struct hci_cp_le_set_cig_params cp;
+		struct hci_cis_params cis[0x11];
+	} pdu;
+};
+
+static void bis_list(struct hci_conn *conn, void *data)
+{
+	struct iso_list_data *d = data;
+
+	/* Skip if not broadcast/ANY address */
+	if (bacmp(&conn->dst, BDADDR_ANY))
+		return;
+
+	if (d->big != conn->iso_qos.big || d->bis == BT_ISO_QOS_BIS_UNSET ||
+	    d->bis != conn->iso_qos.bis)
+		return;
+
+	d->count++;
+}
+
+static void find_bis(struct hci_conn *conn, void *data)
+{
+	struct iso_list_data *d = data;
+
+	/* Ignore unicast */
+	if (bacmp(&conn->dst, BDADDR_ANY))
+		return;
+
+	d->count++;
+}
+
+static int terminate_big_sync(struct hci_dev *hdev, void *data)
+{
+	struct iso_list_data *d = data;
+
+	bt_dev_dbg(hdev, "big 0x%2.2x bis 0x%2.2x", d->big, d->bis);
+
+	hci_remove_ext_adv_instance_sync(hdev, d->bis, NULL);
+
+	/* Check if ISO connection is a BIS and terminate BIG if there are
+	 * no other connections using it.
+	 */
+	hci_conn_hash_list_state(hdev, find_bis, ISO_LINK, BT_CONNECTED, d);
+	if (d->count)
+		return 0;
+
+	return hci_le_terminate_big_sync(hdev, d->big,
+					 HCI_ERROR_LOCAL_HOST_TERM);
+}
+
+static void terminate_big_destroy(struct hci_dev *hdev, void *data, int err)
+{
+	kfree(data);
+}
+
+static int hci_le_terminate_big(struct hci_dev *hdev, u8 big, u8 bis)
+{
+	struct iso_list_data *d;
+
+	bt_dev_dbg(hdev, "big 0x%2.2x bis 0x%2.2x", big, bis);
+
+	d = kmalloc(sizeof(*d), GFP_KERNEL);
+	if (!d)
+		return -ENOMEM;
+
+	memset(d, 0, sizeof(*d));
+	d->big = big;
+	d->bis = bis;
+
+	return hci_cmd_sync_queue(hdev, terminate_big_sync, d,
+				  terminate_big_destroy);
+}
+
+static int big_terminate_sync(struct hci_dev *hdev, void *data)
+{
+	struct iso_list_data *d = data;
+
+	bt_dev_dbg(hdev, "big 0x%2.2x sync_handle 0x%4.4x", d->big,
+		   d->sync_handle);
+
+	/* Check if ISO connection is a BIS and terminate BIG if there are
+	 * no other connections using it.
+	 */
+	hci_conn_hash_list_state(hdev, find_bis, ISO_LINK, BT_CONNECTED, d);
+	if (d->count)
+		return 0;
+
+	hci_le_big_terminate_sync(hdev, d->big);
+
+	return hci_le_pa_terminate_sync(hdev, d->sync_handle);
+}
+
+static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, u16 sync_handle)
+{
+	struct iso_list_data *d;
+
+	bt_dev_dbg(hdev, "big 0x%2.2x sync_handle 0x%4.4x", big, sync_handle);
+
+	d = kmalloc(sizeof(*d), GFP_KERNEL);
+	if (!d)
+		return -ENOMEM;
+
+	memset(d, 0, sizeof(*d));
+	d->big = big;
+	d->sync_handle = sync_handle;
+
+	return hci_cmd_sync_queue(hdev, big_terminate_sync, d,
+				  terminate_big_destroy);
+}
+
+/* Cleanup BIS connection
+ *
+ * Detects if there any BIS left connected in a BIG
+ * broadcaster: Remove advertising instance and terminate BIG.
+ * broadcaster receiver: Teminate BIG sync and terminate PA sync.
+ */
+static void bis_cleanup(struct hci_conn *conn)
+{
+	struct hci_dev *hdev = conn->hdev;
+
+	bt_dev_dbg(hdev, "conn %p", conn);
+
+	if (conn->role == HCI_ROLE_MASTER) {
+		if (!test_and_clear_bit(HCI_CONN_PER_ADV, &conn->flags))
+			return;
+
+		hci_le_terminate_big(hdev, conn->iso_qos.big,
+				     conn->iso_qos.bis);
+	} else {
+		hci_le_big_terminate(hdev, conn->iso_qos.big,
+				     conn->sync_handle);
+	}
+}
+
+static int remove_cig_sync(struct hci_dev *hdev, void *data)
+{
+	u8 handle = PTR_ERR(data);
+
+	return hci_le_remove_cig_sync(hdev, handle);
+}
+
+static int hci_le_remove_cig(struct hci_dev *hdev, u8 handle)
+{
+	bt_dev_dbg(hdev, "handle 0x%2.2x", handle);
+
+	return hci_cmd_sync_queue(hdev, remove_cig_sync, ERR_PTR(handle), NULL);
+}
+
+static void find_cis(struct hci_conn *conn, void *data)
+{
+	struct iso_list_data *d = data;
+
+	/* Ignore broadcast */
+	if (!bacmp(&conn->dst, BDADDR_ANY))
+		return;
+
+	d->count++;
+}
+
+/* Cleanup CIS connection:
+ *
+ * Detects if there any CIS left connected in a CIG and remove it.
+ */
+static void cis_cleanup(struct hci_conn *conn)
+{
+	struct hci_dev *hdev = conn->hdev;
+	struct iso_list_data d;
+
+	memset(&d, 0, sizeof(d));
+	d.cig = conn->iso_qos.cig;
+
+	/* Check if ISO connection is a CIS and remove CIG if there are
+	 * no other connections using it.
+	 */
+	hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, BT_CONNECTED, &d);
+	if (d.count)
+		return;
+
+	hci_le_remove_cig(hdev, conn->iso_qos.cig);
+}
+
 struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
 			      u8 role)
 {
@@ -725,9 +921,19 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
 		conn->pkt_type = hdev->pkt_type & ACL_PTYPE_MASK;
 		break;
 	case LE_LINK:
+		/* conn->src should reflect the local identity address */
+		hci_copy_identity_address(hdev, &conn->src, &conn->src_type);
+		break;
 	case ISO_LINK:
 		/* conn->src should reflect the local identity address */
 		hci_copy_identity_address(hdev, &conn->src, &conn->src_type);
+
+		/* set proper cleanup function */
+		if (!bacmp(dst, BDADDR_ANY))
+			conn->cleanup = bis_cleanup;
+		else if (conn->role == HCI_ROLE_MASTER)
+			conn->cleanup = cis_cleanup;
+
 		break;
 	case SCO_LINK:
 		if (lmp_esco_capable(hdev))
@@ -1100,6 +1306,108 @@ static int hci_explicit_conn_params_set(struct hci_dev *hdev,
 	return 0;
 }
 
+static int qos_set_big(struct hci_dev *hdev, struct bt_iso_qos *qos)
+{
+	struct iso_list_data data;
+
+	/* Allocate a BIG if not set */
+	if (qos->big == BT_ISO_QOS_BIG_UNSET) {
+		for (data.big = 0x00; data.big < 0xef; data.big++) {
+			data.count = 0;
+			data.bis = 0xff;
+
+			hci_conn_hash_list_state(hdev, bis_list, ISO_LINK,
+						 BT_BOUND, &data);
+			if (!data.count)
+				break;
+		}
+
+		if (data.big == 0xef)
+			return -EADDRNOTAVAIL;
+
+		/* Update BIG */
+		qos->big = data.big;
+	}
+
+	return 0;
+}
+
+static int qos_set_bis(struct hci_dev *hdev, struct bt_iso_qos *qos)
+{
+	struct iso_list_data data;
+
+	/* Allocate BIS if not set */
+	if (qos->bis == BT_ISO_QOS_BIS_UNSET) {
+		/* Find an unused adv set to advertise BIS, skip instance 0x00
+		 * since it is reserved as general purpose set.
+		 */
+		for (data.bis = 0x01; data.bis < hdev->le_num_of_adv_sets;
+		     data.bis++) {
+			data.count = 0;
+
+			hci_conn_hash_list_state(hdev, bis_list, ISO_LINK,
+						 BT_BOUND, &data);
+			if (!data.count)
+				break;
+		}
+
+		if (data.bis == hdev->le_num_of_adv_sets)
+			return -EADDRNOTAVAIL;
+
+		/* Update BIS */
+		qos->bis = data.bis;
+	}
+
+	return 0;
+}
+
+/* This function requires the caller holds hdev->lock */
+static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst,
+				    struct bt_iso_qos *qos)
+{
+	struct hci_conn *conn;
+	struct iso_list_data data;
+	int err;
+
+	/* Let's make sure that le is enabled.*/
+	if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
+		if (lmp_le_capable(hdev))
+			return ERR_PTR(-ECONNREFUSED);
+		return ERR_PTR(-EOPNOTSUPP);
+	}
+
+	err = qos_set_big(hdev, qos);
+	if (err)
+		return ERR_PTR(err);
+
+	err = qos_set_bis(hdev, qos);
+	if (err)
+		return ERR_PTR(err);
+
+	data.big = qos->big;
+	data.bis = qos->bis;
+	data.count = 0;
+
+	/* Check if there is already a matching BIG/BIS */
+	hci_conn_hash_list_state(hdev, bis_list, ISO_LINK, BT_BOUND, &data);
+	if (data.count)
+		return ERR_PTR(-EADDRINUSE);
+
+	conn = hci_conn_hash_lookup_bis(hdev, dst, qos->big, qos->bis);
+	if (conn)
+		return ERR_PTR(-EADDRINUSE);
+
+	conn = hci_conn_add(hdev, ISO_LINK, dst, HCI_ROLE_MASTER);
+	if (!conn)
+		return ERR_PTR(-ENOMEM);
+
+	set_bit(HCI_CONN_PER_ADV, &conn->flags);
+	conn->state = BT_CONNECT;
+
+	hci_conn_hold(conn);
+	return conn;
+}
+
 /* This function requires the caller holds hdev->lock */
 struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst,
 				     u8 dst_type, u8 sec_level,
@@ -1236,16 +1544,6 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst,
 	return sco;
 }
 
-struct iso_list_data {
-	u8  cig;
-	u8  cis;
-	int count;
-	struct {
-		struct hci_cp_le_set_cig_params cp;
-		struct hci_cis_params cis[0x11];
-	} pdu;
-};
-
 static void cis_add(struct iso_list_data *d, struct bt_iso_qos *qos)
 {
 	struct hci_cis_params *cis = &d->pdu.cis[d->pdu.cp.num_cis];
@@ -1265,6 +1563,10 @@ static void cis_list(struct hci_conn *conn, void *data)
 {
 	struct iso_list_data *d = data;
 
+	/* Skip if broadcast/ANY address */
+	if (!bacmp(&conn->dst, BDADDR_ANY))
+		return;
+
 	if (d->cig != conn->iso_qos.cig || d->cis == BT_ISO_QOS_CIS_UNSET ||
 	    d->cis != conn->iso_qos.cis)
 		return;
@@ -1278,6 +1580,29 @@ static void cis_list(struct hci_conn *conn, void *data)
 	cis_add(d, &conn->iso_qos);
 }
 
+static int hci_le_create_big(struct hci_conn *conn, struct bt_iso_qos *qos)
+{
+	struct hci_dev *hdev = conn->hdev;
+	struct hci_cp_le_create_big cp;
+
+	memset(&cp, 0, sizeof(cp));
+
+	cp.handle = qos->big;
+	cp.adv_handle = qos->bis;
+	cp.num_bis  = 0x01;
+	hci_cpu_to_le24(qos->out.interval, cp.bis.sdu_interval);
+	cp.bis.sdu = cpu_to_le16(qos->out.sdu);
+	cp.bis.latency =  cpu_to_le16(qos->out.latency);
+	cp.bis.rtn  = qos->out.rtn;
+	cp.bis.phy  = qos->out.phy;
+	cp.bis.packing = qos->packing;
+	cp.bis.framing = qos->framing;
+	cp.bis.encryption = 0x00;
+	memset(&cp.bis.bcode, 0, sizeof(cp.bis.bcode));
+
+	return hci_send_cmd(hdev, HCI_OP_LE_CREATE_BIG, sizeof(cp), &cp);
+}
+
 static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos)
 {
 	struct hci_dev *hdev = conn->hdev;
@@ -1361,49 +1686,6 @@ static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos)
 	return true;
 }
 
-static void find_cis(struct hci_conn *conn, void *data)
-{
-	struct iso_list_data *d = data;
-
-	/* Ignore broadcast */
-	if (!bacmp(&conn->dst, BDADDR_ANY))
-		return;
-
-	d->count++;
-}
-
-static int remove_cig_sync(struct hci_dev *hdev, void *data)
-{
-	u8 handle = PTR_ERR(data);
-
-	return hci_le_remove_cig_sync(hdev, handle);
-}
-
-int hci_le_remove_cig(struct hci_dev *hdev, u8 handle)
-{
-	bt_dev_dbg(hdev, "handle 0x%2.2x", handle);
-
-	return hci_cmd_sync_queue(hdev, remove_cig_sync, ERR_PTR(handle), NULL);
-}
-
-static void cis_cleanup(struct hci_conn *conn)
-{
-	struct hci_dev *hdev = conn->hdev;
-	struct iso_list_data d;
-
-	memset(&d, 0, sizeof(d));
-	d.cig = conn->iso_qos.cig;
-
-	/* Check if ISO connection is a CIS and remove CIG if there are
-	 * no other connections using it.
-	 */
-	hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, BT_CONNECTED, &d);
-	if (d.count)
-		return;
-
-	hci_le_remove_cig(hdev, conn->iso_qos.cig);
-}
-
 struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst,
 			      __u8 dst_type, struct bt_iso_qos *qos)
 {
@@ -1622,6 +1904,179 @@ static void hci_iso_qos_setup(struct hci_dev *hdev, struct hci_conn *conn,
 		qos->latency = conn->le_conn_latency;
 }
 
+static struct hci_conn *hci_bind_bis(struct hci_conn *conn,
+				     struct bt_iso_qos *qos)
+{
+	/* Update LINK PHYs according to QoS preference */
+	conn->le_tx_phy = qos->out.phy;
+	conn->le_tx_phy = qos->out.phy;
+	conn->iso_qos = *qos;
+	conn->state = BT_BOUND;
+
+	return conn;
+}
+
+static int create_big_sync(struct hci_dev *hdev, void *data)
+{
+	struct hci_conn *conn = data;
+	struct bt_iso_qos *qos = &conn->iso_qos;
+	u16 interval, sync_interval = 0;
+	u32 flags = 0;
+	int err;
+
+	if (qos->out.phy == 0x02)
+		flags |= MGMT_ADV_FLAG_SEC_2M;
+
+	/* Align intervals */
+	interval = qos->out.interval / 1250;
+
+	if (qos->bis)
+		sync_interval = qos->sync_interval * 1600;
+
+	err = hci_start_per_adv_sync(hdev, qos->bis, conn->le_per_adv_data_len,
+				     conn->le_per_adv_data, flags, interval,
+				     interval, sync_interval);
+	if (err)
+		return err;
+
+	return hci_le_create_big(conn, &conn->iso_qos);
+}
+
+static void create_pa_complete(struct hci_dev *hdev, void *data, int err)
+{
+	struct hci_cp_le_pa_create_sync *cp = data;
+
+	bt_dev_dbg(hdev, "");
+
+	if (err)
+		bt_dev_err(hdev, "Unable to create PA: %d", err);
+
+	kfree(cp);
+}
+
+static int create_pa_sync(struct hci_dev *hdev, void *data)
+{
+	struct hci_cp_le_pa_create_sync *cp = data;
+	int err;
+
+	err = __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_CREATE_SYNC,
+				    sizeof(*cp), cp, HCI_CMD_TIMEOUT);
+	if (err) {
+		hci_dev_clear_flag(hdev, HCI_PA_SYNC);
+		return err;
+	}
+
+	return hci_update_passive_scan_sync(hdev);
+}
+
+int hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type,
+		       __u8 sid)
+{
+	struct hci_cp_le_pa_create_sync *cp;
+
+	if (hci_dev_test_and_set_flag(hdev, HCI_PA_SYNC))
+		return -EBUSY;
+
+	cp = kmalloc(sizeof(*cp), GFP_KERNEL);
+	if (!cp) {
+		hci_dev_clear_flag(hdev, HCI_PA_SYNC);
+		return -ENOMEM;
+	}
+
+	/* Convert from ISO socket address type to HCI address type  */
+	if (dst_type == BDADDR_LE_PUBLIC)
+		dst_type = ADDR_LE_DEV_PUBLIC;
+	else
+		dst_type = ADDR_LE_DEV_RANDOM;
+
+	memset(cp, 0, sizeof(*cp));
+	cp->sid = sid;
+	cp->addr_type = dst_type;
+	bacpy(&cp->addr, dst);
+
+	/* Queue start pa_create_sync and scan */
+	return hci_cmd_sync_queue(hdev, create_pa_sync, cp, create_pa_complete);
+}
+
+int hci_le_big_create_sync(struct hci_dev *hdev, struct bt_iso_qos *qos,
+			   __u16 sync_handle, __u8 num_bis, __u8 bis[])
+{
+	struct _packed {
+		struct hci_cp_le_big_create_sync cp;
+		__u8  bis[0x11];
+	} pdu;
+	int err;
+
+	if (num_bis > sizeof(pdu.bis))
+		return -EINVAL;
+
+	err = qos_set_big(hdev, qos);
+	if (err)
+		return err;
+
+	memset(&pdu, 0, sizeof(pdu));
+	pdu.cp.handle = qos->big;
+	pdu.cp.sync_handle = cpu_to_le16(sync_handle);
+	pdu.cp.num_bis = num_bis;
+	memcpy(pdu.bis, bis, num_bis);
+
+	return hci_send_cmd(hdev, HCI_OP_LE_BIG_CREATE_SYNC,
+			    sizeof(pdu.cp) + num_bis, &pdu);
+}
+
+static void create_big_complete(struct hci_dev *hdev, void *data, int err)
+{
+	struct hci_conn *conn = data;
+
+	bt_dev_dbg(hdev, "conn %p", conn);
+
+	if (err) {
+		bt_dev_err(hdev, "Unable to create BIG: %d", err);
+		hci_connect_cfm(conn, err);
+		hci_conn_del(conn);
+	}
+}
+
+struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst,
+				 __u8 dst_type, struct bt_iso_qos *qos,
+				 __u8 base_len, __u8 *base)
+{
+	struct hci_conn *conn;
+	int err;
+
+	/* We need hci_conn object using the BDADDR_ANY as dst */
+	conn = hci_add_bis(hdev, dst, qos);
+	if (IS_ERR(conn))
+		return conn;
+
+	conn = hci_bind_bis(conn, qos);
+	if (!conn) {
+		hci_conn_drop(conn);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* Add Basic Announcement into Peridic Adv Data if BASE is set */
+	if (base_len && base) {
+		base_len = eir_append_service_data(conn->le_per_adv_data, 0,
+						   0x1851, base, base_len);
+		conn->le_per_adv_data_len = base_len;
+	}
+
+	/* Queue start periodic advertising and create BIG */
+	err = hci_cmd_sync_queue(hdev, create_big_sync, conn,
+				 create_big_complete);
+	if (err < 0) {
+		hci_conn_drop(conn);
+		return ERR_PTR(err);
+	}
+
+	hci_iso_qos_setup(hdev, conn, &qos->out,
+			  conn->le_tx_phy ? conn->le_tx_phy :
+			  hdev->le_tx_def_phys);
+
+	return conn;
+}
+
 struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst,
 				 __u8 dst_type, struct bt_iso_qos *qos)
 {
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 193d4e664acd..b3a5a3cc9372 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -1702,57 +1702,77 @@ static void adv_instance_rpa_expired(struct work_struct *work)
 }
 
 /* This function requires the caller holds hdev->lock */
-int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags,
-			 u16 adv_data_len, u8 *adv_data,
-			 u16 scan_rsp_len, u8 *scan_rsp_data,
-			 u16 timeout, u16 duration, s8 tx_power,
-			 u32 min_interval, u32 max_interval)
+struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance,
+				      u32 flags, u16 adv_data_len, u8 *adv_data,
+				      u16 scan_rsp_len, u8 *scan_rsp_data,
+				      u16 timeout, u16 duration, s8 tx_power,
+				      u32 min_interval, u32 max_interval)
 {
-	struct adv_info *adv_instance;
+	struct adv_info *adv;
 
-	adv_instance = hci_find_adv_instance(hdev, instance);
-	if (adv_instance) {
-		memset(adv_instance->adv_data, 0,
-		       sizeof(adv_instance->adv_data));
-		memset(adv_instance->scan_rsp_data, 0,
-		       sizeof(adv_instance->scan_rsp_data));
+	adv = hci_find_adv_instance(hdev, instance);
+	if (adv) {
+		memset(adv->adv_data, 0, sizeof(adv->adv_data));
+		memset(adv->scan_rsp_data, 0, sizeof(adv->scan_rsp_data));
+		memset(adv->per_adv_data, 0, sizeof(adv->per_adv_data));
 	} else {
 		if (hdev->adv_instance_cnt >= hdev->le_num_of_adv_sets ||
 		    instance < 1 || instance > hdev->le_num_of_adv_sets)
-			return -EOVERFLOW;
+			return ERR_PTR(-EOVERFLOW);
 
-		adv_instance = kzalloc(sizeof(*adv_instance), GFP_KERNEL);
-		if (!adv_instance)
-			return -ENOMEM;
+		adv = kzalloc(sizeof(*adv), GFP_KERNEL);
+		if (!adv)
+			return ERR_PTR(-ENOMEM);
 
-		adv_instance->pending = true;
-		adv_instance->instance = instance;
-		list_add(&adv_instance->list, &hdev->adv_instances);
+		adv->pending = true;
+		adv->instance = instance;
+		list_add(&adv->list, &hdev->adv_instances);
 		hdev->adv_instance_cnt++;
 	}
 
-	adv_instance->flags = flags;
-	adv_instance->min_interval = min_interval;
-	adv_instance->max_interval = max_interval;
-	adv_instance->tx_power = tx_power;
+	adv->flags = flags;
+	adv->min_interval = min_interval;
+	adv->max_interval = max_interval;
+	adv->tx_power = tx_power;
 
 	hci_set_adv_instance_data(hdev, instance, adv_data_len, adv_data,
 				  scan_rsp_len, scan_rsp_data);
 
-	adv_instance->timeout = timeout;
-	adv_instance->remaining_time = timeout;
+	adv->timeout = timeout;
+	adv->remaining_time = timeout;
 
 	if (duration == 0)
-		adv_instance->duration = hdev->def_multi_adv_rotation_duration;
+		adv->duration = hdev->def_multi_adv_rotation_duration;
 	else
-		adv_instance->duration = duration;
+		adv->duration = duration;
 
-	INIT_DELAYED_WORK(&adv_instance->rpa_expired_cb,
-			  adv_instance_rpa_expired);
+	INIT_DELAYED_WORK(&adv->rpa_expired_cb, adv_instance_rpa_expired);
 
 	BT_DBG("%s for %dMR", hdev->name, instance);
 
-	return 0;
+	return adv;
+}
+
+/* This function requires the caller holds hdev->lock */
+struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance,
+				      u32 flags, u8 data_len, u8 *data,
+				      u32 min_interval, u32 max_interval)
+{
+	struct adv_info *adv;
+
+	adv = hci_add_adv_instance(hdev, instance, flags, 0, NULL, 0, NULL,
+				   0, 0, HCI_ADV_TX_POWER_NO_PREFERENCE,
+				   min_interval, max_interval);
+	if (IS_ERR(adv))
+		return adv;
+
+	adv->periodic = true;
+	adv->per_adv_data_len = data_len;
+
+	if (data)
+		memcpy(adv->per_adv_data, data, data_len);
+
+	return adv;
 }
 
 /* This function requires the caller holds hdev->lock */
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 37177d7051d5..ea33dd0cd478 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -3893,6 +3893,57 @@ unlock:
 	return rp->status;
 }
 
+static void hci_cs_le_create_big(struct hci_dev *hdev, u8 status)
+{
+	bt_dev_dbg(hdev, "status 0x%2.2x", status);
+}
+
+static u8 hci_cc_set_per_adv_param(struct hci_dev *hdev, void *data,
+				   struct sk_buff *skb)
+{
+	struct hci_ev_status *rp = data;
+	struct hci_cp_le_set_per_adv_params *cp;
+
+	bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+	if (rp->status)
+		return rp->status;
+
+	cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_PER_ADV_PARAMS);
+	if (!cp)
+		return rp->status;
+
+	/* TODO: set the conn state */
+	return rp->status;
+}
+
+static u8 hci_cc_le_set_per_adv_enable(struct hci_dev *hdev, void *data,
+				       struct sk_buff *skb)
+{
+	struct hci_ev_status *rp = data;
+	__u8 *sent;
+
+	bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+	if (rp->status)
+		return rp->status;
+
+	sent = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_PER_ADV_ENABLE);
+	if (!sent)
+		return rp->status;
+
+	hci_dev_lock(hdev);
+
+	if (*sent)
+		hci_dev_set_flag(hdev, HCI_LE_PER_ADV);
+	else
+		hci_dev_clear_flag(hdev, HCI_LE_PER_ADV);
+
+	hci_dev_unlock(hdev);
+
+	return rp->status;
+}
+
 #define HCI_CC_VL(_op, _func, _min, _max) \
 { \
 	.op = _op, \
@@ -4066,6 +4117,9 @@ static const struct hci_cc {
 		      hci_cc_le_set_adv_set_random_addr),
 	HCI_CC_STATUS(HCI_OP_LE_REMOVE_ADV_SET, hci_cc_le_remove_adv_set),
 	HCI_CC_STATUS(HCI_OP_LE_CLEAR_ADV_SETS, hci_cc_le_clear_adv_sets),
+	HCI_CC_STATUS(HCI_OP_LE_SET_PER_ADV_PARAMS, hci_cc_set_per_adv_param),
+	HCI_CC_STATUS(HCI_OP_LE_SET_PER_ADV_ENABLE,
+		      hci_cc_le_set_per_adv_enable),
 	HCI_CC(HCI_OP_LE_READ_TRANSMIT_POWER, hci_cc_le_read_transmit_power,
 	       sizeof(struct hci_rp_le_read_transmit_power)),
 	HCI_CC_STATUS(HCI_OP_LE_SET_PRIVACY_MODE, hci_cc_le_set_privacy_mode),
@@ -4202,6 +4256,7 @@ static const struct hci_cs {
 	HCI_CS(HCI_OP_LE_START_ENC, hci_cs_le_start_enc),
 	HCI_CS(HCI_OP_LE_EXT_CREATE_CONN, hci_cs_le_ext_create_conn),
 	HCI_CS(HCI_OP_LE_CREATE_CIS, hci_cs_le_create_cis),
+	HCI_CS(HCI_OP_LE_CREATE_BIG, hci_cs_le_create_big),
 };
 
 static void hci_cmd_status_evt(struct hci_dev *hdev, void *data,
@@ -6425,6 +6480,39 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, void *data,
 	hci_dev_unlock(hdev);
 }
 
+static int hci_le_pa_term_sync(struct hci_dev *hdev, __le16 handle)
+{
+	struct hci_cp_le_pa_term_sync cp;
+
+	memset(&cp, 0, sizeof(cp));
+	cp.handle = handle;
+
+	return hci_send_cmd(hdev, HCI_OP_LE_PA_TERM_SYNC, sizeof(cp), &cp);
+}
+
+static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data,
+					    struct sk_buff *skb)
+{
+	struct hci_ev_le_pa_sync_established *ev = data;
+	int mask = hdev->link_mode;
+	__u8 flags = 0;
+
+	bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+	if (ev->status)
+		return;
+
+	hci_dev_lock(hdev);
+
+	hci_dev_clear_flag(hdev, HCI_PA_SYNC);
+
+	mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, ISO_LINK, &flags);
+	if (!(mask & HCI_LM_ACCEPT))
+		hci_le_pa_term_sync(hdev, ev->handle);
+
+	hci_dev_unlock(hdev);
+}
+
 static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev, void *data,
 					    struct sk_buff *skb)
 {
@@ -6776,6 +6864,105 @@ unlock:
 	hci_dev_unlock(hdev);
 }
 
+static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data,
+					   struct sk_buff *skb)
+{
+	struct hci_evt_le_create_big_complete *ev = data;
+	struct hci_conn *conn;
+
+	BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+
+	if (!hci_le_ev_skb_pull(hdev, skb, HCI_EVT_LE_CREATE_BIG_COMPLETE,
+				flex_array_size(ev, bis_handle, ev->num_bis)))
+		return;
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_big(hdev, ev->handle);
+	if (!conn)
+		goto unlock;
+
+	if (ev->num_bis)
+		conn->handle = __le16_to_cpu(ev->bis_handle[0]);
+
+	if (!ev->status) {
+		conn->state = BT_CONNECTED;
+		hci_debugfs_create_conn(conn);
+		hci_conn_add_sysfs(conn);
+		hci_iso_setup_path(conn);
+		goto unlock;
+	}
+
+	hci_connect_cfm(conn, ev->status);
+	hci_conn_del(conn);
+
+unlock:
+	hci_dev_unlock(hdev);
+}
+
+static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data,
+					    struct sk_buff *skb)
+{
+	struct hci_evt_le_big_sync_estabilished *ev = data;
+	struct hci_conn *bis;
+	int i;
+
+	bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+	if (!hci_le_ev_skb_pull(hdev, skb, HCI_EVT_LE_BIG_SYNC_ESTABILISHED,
+				flex_array_size(ev, bis, ev->num_bis)))
+		return;
+
+	if (ev->status)
+		return;
+
+	hci_dev_lock(hdev);
+
+	for (i = 0; i < ev->num_bis; i++) {
+		u16 handle = le16_to_cpu(ev->bis[i]);
+		__le32 interval;
+
+		bis = hci_conn_hash_lookup_handle(hdev, handle);
+		if (!bis) {
+			bis = hci_conn_add(hdev, ISO_LINK, BDADDR_ANY,
+					   HCI_ROLE_SLAVE);
+			if (!bis)
+				continue;
+			bis->handle = handle;
+		}
+
+		bis->iso_qos.big = ev->handle;
+		memset(&interval, 0, sizeof(interval));
+		memcpy(&interval, ev->latency, sizeof(ev->latency));
+		bis->iso_qos.in.interval = le32_to_cpu(interval);
+		/* Convert ISO Interval (1.25 ms slots) to latency (ms) */
+		bis->iso_qos.in.latency = le16_to_cpu(ev->interval) * 125 / 100;
+		bis->iso_qos.in.sdu = le16_to_cpu(ev->max_pdu);
+
+		hci_connect_cfm(bis, ev->status);
+	}
+
+	hci_dev_unlock(hdev);
+}
+
+static void hci_le_big_info_adv_report_evt(struct hci_dev *hdev, void *data,
+					   struct sk_buff *skb)
+{
+	struct hci_evt_le_big_info_adv_report *ev = data;
+	int mask = hdev->link_mode;
+	__u8 flags = 0;
+
+	bt_dev_dbg(hdev, "sync_handle 0x%4.4x", le16_to_cpu(ev->sync_handle));
+
+	hci_dev_lock(hdev);
+
+	mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, ISO_LINK, &flags);
+	if (!(mask & HCI_LM_ACCEPT))
+		hci_le_pa_term_sync(hdev, ev->sync_handle);
+
+	hci_dev_unlock(hdev);
+}
+
 #define HCI_LE_EV_VL(_op, _func, _min_len, _max_len) \
 [_op] = { \
 	.func = _func, \
@@ -6836,6 +7023,10 @@ static const struct hci_le_ev {
 	HCI_LE_EV_VL(HCI_EV_LE_EXT_ADV_REPORT, hci_le_ext_adv_report_evt,
 		     sizeof(struct hci_ev_le_ext_adv_report),
 		     HCI_MAX_EVENT_SIZE),
+	/* [0x0e = HCI_EV_LE_PA_SYNC_ESTABLISHED] */
+	HCI_LE_EV(HCI_EV_LE_PA_SYNC_ESTABLISHED,
+		  hci_le_pa_sync_estabilished_evt,
+		  sizeof(struct hci_ev_le_pa_sync_established)),
 	/* [0x12 = HCI_EV_LE_EXT_ADV_SET_TERM] */
 	HCI_LE_EV(HCI_EV_LE_EXT_ADV_SET_TERM, hci_le_ext_adv_term_evt,
 		  sizeof(struct hci_evt_le_ext_adv_set_term)),
@@ -6845,6 +7036,21 @@ static const struct hci_le_ev {
 	/* [0x1a = HCI_EVT_LE_CIS_REQ] */
 	HCI_LE_EV(HCI_EVT_LE_CIS_REQ, hci_le_cis_req_evt,
 		  sizeof(struct hci_evt_le_cis_req)),
+	/* [0x1b = HCI_EVT_LE_CREATE_BIG_COMPLETE] */
+	HCI_LE_EV_VL(HCI_EVT_LE_CREATE_BIG_COMPLETE,
+		     hci_le_create_big_complete_evt,
+		     sizeof(struct hci_evt_le_create_big_complete),
+		     HCI_MAX_EVENT_SIZE),
+	/* [0x1d = HCI_EV_LE_BIG_SYNC_ESTABILISHED] */
+	HCI_LE_EV_VL(HCI_EVT_LE_BIG_SYNC_ESTABILISHED,
+		     hci_le_big_sync_established_evt,
+		     sizeof(struct hci_evt_le_big_sync_estabilished),
+		     HCI_MAX_EVENT_SIZE),
+	/* [0x22 = HCI_EVT_LE_BIG_INFO_ADV_REPORT] */
+	HCI_LE_EV_VL(HCI_EVT_LE_BIG_INFO_ADV_REPORT,
+		     hci_le_big_info_adv_report_evt,
+		     sizeof(struct hci_evt_le_big_info_adv_report),
+		     HCI_MAX_EVENT_SIZE),
 };
 
 static void hci_le_meta_evt(struct hci_dev *hdev, void *data,
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index d93189f17193..e64d558e5d69 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -827,7 +827,6 @@ void __hci_req_disable_advertising(struct hci_request *req)
 {
 	if (ext_adv_capable(req->hdev)) {
 		__hci_req_disable_ext_adv_instance(req, 0x00);
-
 	} else {
 		u8 enable = 0x00;
 
@@ -1338,15 +1337,15 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance)
 	bdaddr_t random_addr;
 	u8 own_addr_type;
 	int err;
-	struct adv_info *adv_instance;
-	bool secondary_adv;
+	struct adv_info *adv;
+	bool secondary_adv, require_privacy;
 
 	if (instance > 0) {
-		adv_instance = hci_find_adv_instance(hdev, instance);
-		if (!adv_instance)
+		adv = hci_find_adv_instance(hdev, instance);
+		if (!adv)
 			return -EINVAL;
 	} else {
-		adv_instance = NULL;
+		adv = NULL;
 	}
 
 	flags = hci_adv_instance_flags(hdev, instance);
@@ -1364,18 +1363,24 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance)
 	 * advertising is used. In that case it is fine to use a
 	 * non-resolvable private address.
 	 */
-	err = hci_get_random_address(hdev, !connectable,
-				     adv_use_rpa(hdev, flags), adv_instance,
+	require_privacy = !connectable;
+
+	/* Don't require privacy for periodic adv? */
+	if (adv && adv->periodic)
+		require_privacy = false;
+
+	err = hci_get_random_address(hdev, require_privacy,
+				     adv_use_rpa(hdev, flags), adv,
 				     &own_addr_type, &random_addr);
 	if (err < 0)
 		return err;
 
 	memset(&cp, 0, sizeof(cp));
 
-	if (adv_instance) {
-		hci_cpu_to_le24(adv_instance->min_interval, cp.min_interval);
-		hci_cpu_to_le24(adv_instance->max_interval, cp.max_interval);
-		cp.tx_power = adv_instance->tx_power;
+	if (adv) {
+		hci_cpu_to_le24(adv->min_interval, cp.min_interval);
+		hci_cpu_to_le24(adv->max_interval, cp.max_interval);
+		cp.tx_power = adv->tx_power;
 	} else {
 		hci_cpu_to_le24(hdev->le_adv_min_interval, cp.min_interval);
 		hci_cpu_to_le24(hdev->le_adv_max_interval, cp.max_interval);
@@ -1396,7 +1401,8 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance)
 		else
 			cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_SCAN_IND);
 	} else {
-		if (secondary_adv)
+		/* Secondary and periodic cannot use legacy PDUs */
+		if (secondary_adv || (adv && adv->periodic))
 			cp.evt_properties = cpu_to_le16(LE_EXT_ADV_NON_CONN_IND);
 		else
 			cp.evt_properties = cpu_to_le16(LE_LEGACY_NONCONN_IND);
@@ -1426,8 +1432,8 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance)
 		struct hci_cp_le_set_adv_set_rand_addr cp;
 
 		/* Check if random address need to be updated */
-		if (adv_instance) {
-			if (!bacmp(&random_addr, &adv_instance->random_addr))
+		if (adv) {
+			if (!bacmp(&random_addr, &adv->random_addr))
 				return 0;
 		} else {
 			if (!bacmp(&random_addr, &hdev->random_addr))
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
index 821244ad9d73..39d001fa3acf 100644
--- a/net/bluetooth/hci_request.h
+++ b/net/bluetooth/hci_request.h
@@ -83,6 +83,9 @@ void __hci_req_enable_advertising(struct hci_request *req);
 void __hci_req_disable_advertising(struct hci_request *req);
 void __hci_req_update_adv_data(struct hci_request *req, u8 instance);
 int hci_req_update_adv_data(struct hci_dev *hdev, u8 instance);
+int hci_req_start_per_adv(struct hci_dev *hdev, u8 instance, u32 flags,
+			  u16 min_interval, u16 max_interval,
+			  u16 sync_interval);
 void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance);
 
 int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance,
@@ -92,8 +95,14 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk,
 				bool force);
 
 int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance);
+int __hci_req_setup_per_adv_instance(struct hci_request *req, u8 instance,
+				     u16 min_interval, u16 max_interval);
 int __hci_req_start_ext_adv(struct hci_request *req, u8 instance);
+int __hci_req_start_per_adv(struct hci_request *req, u8 instance, u32 flags,
+			    u16 min_interval, u16 max_interval,
+			    u16 sync_interval);
 int __hci_req_enable_ext_advertising(struct hci_request *req, u8 instance);
+int __hci_req_enable_per_advertising(struct hci_request *req, u8 instance);
 int __hci_req_disable_ext_adv_instance(struct hci_request *req, u8 instance);
 int __hci_req_remove_ext_adv_instance(struct hci_request *req, u8 instance);
 void __hci_req_clear_ext_adv_sets(struct hci_request *req);
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index ac0f2cc0e44c..148ce629a59f 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -977,6 +977,187 @@ int hci_start_ext_adv_sync(struct hci_dev *hdev, u8 instance)
 	return hci_enable_ext_advertising_sync(hdev, instance);
 }
 
+static int hci_disable_per_advertising_sync(struct hci_dev *hdev, u8 instance)
+{
+	struct hci_cp_le_set_per_adv_enable cp;
+
+	/* If periodic advertising already disabled there is nothing to do. */
+	if (!hci_dev_test_flag(hdev, HCI_LE_PER_ADV))
+		return 0;
+
+	memset(&cp, 0, sizeof(cp));
+
+	cp.enable = 0x00;
+	cp.handle = instance;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_ENABLE,
+				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_set_per_adv_params_sync(struct hci_dev *hdev, u8 instance,
+				       u16 min_interval, u16 max_interval)
+{
+	struct hci_cp_le_set_per_adv_params cp;
+
+	memset(&cp, 0, sizeof(cp));
+
+	if (!min_interval)
+		min_interval = DISCOV_LE_PER_ADV_INT_MIN;
+
+	if (!max_interval)
+		max_interval = DISCOV_LE_PER_ADV_INT_MAX;
+
+	cp.handle = instance;
+	cp.min_interval = cpu_to_le16(min_interval);
+	cp.max_interval = cpu_to_le16(max_interval);
+	cp.periodic_properties = 0x0000;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_PARAMS,
+				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_set_per_adv_data_sync(struct hci_dev *hdev, u8 instance)
+{
+	struct {
+		struct hci_cp_le_set_per_adv_data cp;
+		u8 data[HCI_MAX_PER_AD_LENGTH];
+	} pdu;
+	u8 len;
+
+	memset(&pdu, 0, sizeof(pdu));
+
+	if (instance) {
+		struct adv_info *adv = hci_find_adv_instance(hdev, instance);
+
+		if (!adv || !adv->periodic)
+			return 0;
+	}
+
+	len = eir_create_per_adv_data(hdev, instance, pdu.data);
+
+	pdu.cp.length = len;
+	pdu.cp.handle = instance;
+	pdu.cp.operation = LE_SET_ADV_DATA_OP_COMPLETE;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_DATA,
+				     sizeof(pdu.cp) + len, &pdu,
+				     HCI_CMD_TIMEOUT);
+}
+
+static int hci_enable_per_advertising_sync(struct hci_dev *hdev, u8 instance)
+{
+	struct hci_cp_le_set_per_adv_enable cp;
+
+	/* If periodic advertising already enabled there is nothing to do. */
+	if (hci_dev_test_flag(hdev, HCI_LE_PER_ADV))
+		return 0;
+
+	memset(&cp, 0, sizeof(cp));
+
+	cp.enable = 0x01;
+	cp.handle = instance;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_ENABLE,
+				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+/* Checks if periodic advertising data contains a Basic Announcement and if it
+ * does generates a Broadcast ID and add Broadcast Announcement.
+ */
+static int hci_adv_bcast_annoucement(struct hci_dev *hdev, struct adv_info *adv)
+{
+	u8 bid[3];
+	u8 ad[4 + 3];
+
+	/* Skip if NULL adv as instance 0x00 is used for general purpose
+	 * advertising so it cannot used for the likes of Broadcast Announcement
+	 * as it can be overwritten at any point.
+	 */
+	if (!adv)
+		return 0;
+
+	/* Check if PA data doesn't contains a Basic Audio Announcement then
+	 * there is nothing to do.
+	 */
+	if (!eir_get_service_data(adv->per_adv_data, adv->per_adv_data_len,
+				  0x1851, NULL))
+		return 0;
+
+	/* Check if advertising data already has a Broadcast Announcement since
+	 * the process may want to control the Broadcast ID directly and in that
+	 * case the kernel shall no interfere.
+	 */
+	if (eir_get_service_data(adv->adv_data, adv->adv_data_len, 0x1852,
+				 NULL))
+		return 0;
+
+	/* Generate Broadcast ID */
+	get_random_bytes(bid, sizeof(bid));
+	eir_append_service_data(ad, 0, 0x1852, bid, sizeof(bid));
+	hci_set_adv_instance_data(hdev, adv->instance, sizeof(ad), ad, 0, NULL);
+
+	return hci_update_adv_data_sync(hdev, adv->instance);
+}
+
+int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 data_len,
+			   u8 *data, u32 flags, u16 min_interval,
+			   u16 max_interval, u16 sync_interval)
+{
+	struct adv_info *adv = NULL;
+	int err;
+	bool added = false;
+
+	hci_disable_per_advertising_sync(hdev, instance);
+
+	if (instance) {
+		adv = hci_find_adv_instance(hdev, instance);
+		/* Create an instance if that could not be found */
+		if (!adv) {
+			adv = hci_add_per_instance(hdev, instance, flags,
+						   data_len, data,
+						   sync_interval,
+						   sync_interval);
+			if (IS_ERR(adv))
+				return PTR_ERR(adv);
+			added = true;
+		}
+	}
+
+	/* Only start advertising if instance 0 or if a dedicated instance has
+	 * been added.
+	 */
+	if (!adv || added) {
+		err = hci_start_ext_adv_sync(hdev, instance);
+		if (err < 0)
+			goto fail;
+
+		err = hci_adv_bcast_annoucement(hdev, adv);
+		if (err < 0)
+			goto fail;
+	}
+
+	err = hci_set_per_adv_params_sync(hdev, instance, min_interval,
+					  max_interval);
+	if (err < 0)
+		goto fail;
+
+	err = hci_set_per_adv_data_sync(hdev, instance);
+	if (err < 0)
+		goto fail;
+
+	err = hci_enable_per_advertising_sync(hdev, instance);
+	if (err < 0)
+		goto fail;
+
+	return 0;
+
+fail:
+	if (added)
+		hci_remove_adv_instance(hdev, instance);
+
+	return err;
+}
+
 static int hci_start_adv_sync(struct hci_dev *hdev, u8 instance)
 {
 	int err;
@@ -1116,6 +1297,42 @@ int hci_remove_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance,
 					HCI_CMD_TIMEOUT, sk);
 }
 
+static int remove_ext_adv_sync(struct hci_dev *hdev, void *data)
+{
+	struct adv_info *adv = data;
+	u8 instance = 0;
+
+	if (adv)
+		instance = adv->instance;
+
+	return hci_remove_ext_adv_instance_sync(hdev, instance, NULL);
+}
+
+int hci_remove_ext_adv_instance(struct hci_dev *hdev, u8 instance)
+{
+	struct adv_info *adv = NULL;
+
+	if (instance) {
+		adv = hci_find_adv_instance(hdev, instance);
+		if (!adv)
+			return -EINVAL;
+	}
+
+	return hci_cmd_sync_queue(hdev, remove_ext_adv_sync, adv, NULL);
+}
+
+int hci_le_terminate_big_sync(struct hci_dev *hdev, u8 handle, u8 reason)
+{
+	struct hci_cp_le_term_big cp;
+
+	memset(&cp, 0, sizeof(cp));
+	cp.handle = handle;
+	cp.reason = reason;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_TERM_BIG,
+				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
 static void cancel_adv_timeout(struct hci_dev *hdev)
 {
 	if (hdev->adv_instance_timeout) {
@@ -2201,7 +2418,8 @@ int hci_update_passive_scan_sync(struct hci_dev *hdev)
 
 	if (list_empty(&hdev->pend_le_conns) &&
 	    list_empty(&hdev->pend_le_reports) &&
-	    !hci_is_adv_monitoring(hdev)) {
+	    !hci_is_adv_monitoring(hdev) &&
+	    !hci_dev_test_flag(hdev, HCI_PA_SYNC)) {
 		/* If there is no pending LE connections or devices
 		 * to be scanned for or no ADV monitors, we should stop the
 		 * background scanning.
@@ -3405,6 +3623,13 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev)
 			events[3] |= 0x02; /* LE CIS Request */
 	}
 
+	if (bis_capable(hdev)) {
+		events[3] |= 0x04;	/* LE Create BIG Complete */
+		events[3] |= 0x08;	/* LE Terminate BIG Complete */
+		events[3] |= 0x10;	/* LE BIG Sync Established */
+		events[3] |= 0x20;	/* LE BIG Sync Loss */
+	}
+
 	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EVENT_MASK,
 				     sizeof(events), events, HCI_CMD_TIMEOUT);
 }
@@ -5484,3 +5709,25 @@ int hci_le_remove_cig_sync(struct hci_dev *hdev, u8 handle)
 	return __hci_cmd_sync_status(hdev, HCI_OP_LE_REMOVE_CIG, sizeof(cp),
 				     &cp, HCI_CMD_TIMEOUT);
 }
+
+int hci_le_big_terminate_sync(struct hci_dev *hdev, u8 handle)
+{
+	struct hci_cp_le_big_term_sync cp;
+
+	memset(&cp, 0, sizeof(cp));
+	cp.handle = handle;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_BIG_TERM_SYNC,
+				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+int hci_le_pa_terminate_sync(struct hci_dev *hdev, u16 handle)
+{
+	struct hci_cp_le_pa_term_sync cp;
+
+	memset(&cp, 0, sizeof(cp));
+	cp.handle = cpu_to_le16(handle);
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_TERM_SYNC,
+				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 08688117d700..8cfafd7a0576 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -8158,7 +8158,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
 	u16 timeout, duration;
 	unsigned int prev_instance_cnt;
 	u8 schedule_instance = 0;
-	struct adv_info *next_instance;
+	struct adv_info *adv, *next_instance;
 	int err;
 	struct mgmt_pending_cmd *cmd;
 
@@ -8209,7 +8209,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
 
 	prev_instance_cnt = hdev->adv_instance_cnt;
 
-	err = hci_add_adv_instance(hdev, cp->instance, flags,
+	adv = hci_add_adv_instance(hdev, cp->instance, flags,
 				   cp->adv_data_len, cp->data,
 				   cp->scan_rsp_len,
 				   cp->data + cp->adv_data_len,
@@ -8217,7 +8217,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
 				   HCI_ADV_TX_POWER_NO_PREFERENCE,
 				   hdev->le_adv_min_interval,
 				   hdev->le_adv_max_interval);
-	if (err < 0) {
+	if (IS_ERR(adv)) {
 		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
 				      MGMT_STATUS_FAILED);
 		goto unlock;
@@ -8348,6 +8348,7 @@ static int add_ext_adv_params(struct sock *sk, struct hci_dev *hdev,
 	struct mgmt_cp_add_ext_adv_params *cp = data;
 	struct mgmt_rp_add_ext_adv_params rp;
 	struct mgmt_pending_cmd *cmd = NULL;
+	struct adv_info *adv;
 	u32 flags, min_interval, max_interval;
 	u16 timeout, duration;
 	u8 status;
@@ -8417,11 +8418,11 @@ static int add_ext_adv_params(struct sock *sk, struct hci_dev *hdev,
 		   HCI_ADV_TX_POWER_NO_PREFERENCE;
 
 	/* Create advertising instance with no advertising or response data */
-	err = hci_add_adv_instance(hdev, cp->instance, flags,
-				   0, NULL, 0, NULL, timeout, duration,
-				   tx_power, min_interval, max_interval);
+	adv = hci_add_adv_instance(hdev, cp->instance, flags, 0, NULL, 0, NULL,
+				   timeout, duration, tx_power, min_interval,
+				   max_interval);
 
-	if (err < 0) {
+	if (IS_ERR(adv)) {
 		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
 				      MGMT_STATUS_FAILED);
 		goto unlock;
-- 
cgit 


From f764a6c2c1e446f560faa3232271a0637369170b Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Wed, 9 Mar 2022 13:14:41 -0800
Subject: Bluetooth: ISO: Add broadcast support

This adds broadcast support for BTPROTO_ISO by extending the
sockaddr_iso with a new struct sockaddr_iso_bc where the socket user
can set the broadcast address when receiving, the SID and the BIS
indexes it wants to synchronize.

When using BTPROTO_ISO for broadcast the roles are:

Broadcaster -> uses connect with address set to BDADDR_ANY:
> tools/isotest -s 00:00:00:00:00:00

Broadcast Receiver -> uses listen with address set to broadcaster:
> tools/isotest -d 00:AA:01:00:00:00

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/bluetooth.h |   2 +
 include/net/bluetooth/iso.h       |  11 ++
 net/bluetooth/iso.c               | 391 ++++++++++++++++++++++++++++++++++----
 3 files changed, 370 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index bcc6e098f1f6..e72f3b247b5e 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -220,6 +220,8 @@ struct bt_codecs {
 #define BT_CODEC_TRANSPARENT	0x03
 #define BT_CODEC_MSBC		0x05
 
+#define BT_ISO_BASE		20
+
 __printf(1, 2)
 void bt_info(const char *fmt, ...);
 __printf(1, 2)
diff --git a/include/net/bluetooth/iso.h b/include/net/bluetooth/iso.h
index 13b22d54aab5..3f4fe8b78e1b 100644
--- a/include/net/bluetooth/iso.h
+++ b/include/net/bluetooth/iso.h
@@ -10,12 +10,23 @@
 
 /* ISO defaults */
 #define ISO_DEFAULT_MTU		251
+#define ISO_MAX_NUM_BIS		0x1f
+
+/* ISO socket broadcast address */
+struct sockaddr_iso_bc {
+	bdaddr_t	bc_bdaddr;
+	__u8		bc_bdaddr_type;
+	__u8		bc_sid;
+	__u8		bc_num_bis;
+	__u8		bc_bis[ISO_MAX_NUM_BIS];
+};
 
 /* ISO socket address */
 struct sockaddr_iso {
 	sa_family_t	iso_family;
 	bdaddr_t	iso_bdaddr;
 	__u8		iso_bdaddr_type;
+	struct sockaddr_iso_bc iso_bc[];
 };
 
 #endif /* __ISO_H */
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index caaaa670cc2c..ff09c353e64e 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -50,8 +50,14 @@ struct iso_pinfo {
 	__u8			src_type;
 	bdaddr_t		dst;
 	__u8			dst_type;
+	__u8			bc_sid;
+	__u8			bc_num_bis;
+	__u8			bc_bis[ISO_MAX_NUM_BIS];
+	__u16			sync_handle;
 	__u32			flags;
 	struct bt_iso_qos	qos;
+	__u8			base_len;
+	__u8			base[HCI_MAX_PER_AD_LENGTH];
 	struct iso_conn		*conn;
 };
 
@@ -130,6 +136,7 @@ static struct iso_conn *iso_conn_add(struct hci_conn *hcon)
 static void iso_chan_del(struct sock *sk, int err)
 {
 	struct iso_conn *conn;
+	struct sock *parent;
 
 	conn = iso_pi(sk)->conn;
 
@@ -147,7 +154,14 @@ static void iso_chan_del(struct sock *sk, int err)
 
 	sk->sk_state = BT_CLOSED;
 	sk->sk_err   = err;
-	sk->sk_state_change(sk);
+
+	parent = bt_sk(sk)->parent;
+	if (parent) {
+		bt_accept_unlink(sk);
+		parent->sk_data_ready(parent);
+	} else {
+		sk->sk_state_change(sk);
+	}
 
 	sock_set_flag(sk, SOCK_ZAPPED);
 }
@@ -218,7 +232,70 @@ static int iso_chan_add(struct iso_conn *conn, struct sock *sk,
 	return err;
 }
 
-static int iso_connect(struct sock *sk)
+static int iso_connect_bis(struct sock *sk)
+{
+	struct iso_conn *conn;
+	struct hci_conn *hcon;
+	struct hci_dev  *hdev;
+	int err;
+
+	BT_DBG("%pMR", &iso_pi(sk)->src);
+
+	hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src,
+			     iso_pi(sk)->src_type);
+	if (!hdev)
+		return -EHOSTUNREACH;
+
+	hci_dev_lock(hdev);
+
+	if (!bis_capable(hdev)) {
+		err = -EOPNOTSUPP;
+		goto done;
+	}
+
+	/* Fail if out PHYs are marked as disabled */
+	if (!iso_pi(sk)->qos.out.phy) {
+		err = -EINVAL;
+		goto done;
+	}
+
+	hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst, iso_pi(sk)->dst_type,
+			       &iso_pi(sk)->qos, iso_pi(sk)->base_len,
+			       iso_pi(sk)->base);
+	if (IS_ERR(hcon)) {
+		err = PTR_ERR(hcon);
+		goto done;
+	}
+
+	conn = iso_conn_add(hcon);
+	if (!conn) {
+		hci_conn_drop(hcon);
+		err = -ENOMEM;
+		goto done;
+	}
+
+	/* Update source addr of the socket */
+	bacpy(&iso_pi(sk)->src, &hcon->src);
+
+	err = iso_chan_add(conn, sk, NULL);
+	if (err)
+		goto done;
+
+	if (hcon->state == BT_CONNECTED) {
+		iso_sock_clear_timer(sk);
+		sk->sk_state = BT_CONNECTED;
+	} else {
+		sk->sk_state = BT_CONNECT;
+		iso_sock_set_timer(sk, sk->sk_sndtimeo);
+	}
+
+done:
+	hci_dev_unlock(hdev);
+	hci_dev_put(hdev);
+	return err;
+}
+
+static int iso_connect_cis(struct sock *sk)
 {
 	struct iso_conn *conn;
 	struct hci_conn *hcon;
@@ -359,10 +436,39 @@ static struct sock *__iso_get_sock_listen_by_addr(bdaddr_t *ba)
 	return NULL;
 }
 
-/* Find socket listening on source bdaddr.
+static struct sock *__iso_get_sock_listen_by_sid(bdaddr_t *ba, bdaddr_t *bc,
+						 __u8 sid)
+{
+	struct sock *sk;
+
+	sk_for_each(sk, &iso_sk_list.head) {
+		if (sk->sk_state != BT_LISTEN)
+			continue;
+
+		if (bacmp(&iso_pi(sk)->src, ba))
+			continue;
+
+		if (bacmp(&iso_pi(sk)->dst, bc))
+			continue;
+
+		if (iso_pi(sk)->bc_sid == sid)
+			return sk;
+	}
+
+	return NULL;
+}
+
+typedef bool (*iso_sock_match_t)(struct sock *sk, void *data);
+
+/* Find socket listening:
+ * source bdaddr (Unicast)
+ * destination bdaddr (Broadcast only)
+ * match func - pass NULL to ignore
+ * match func data - pass -1 to ignore
  * Returns closest match.
  */
-static struct sock *iso_get_sock_listen(bdaddr_t *src)
+static struct sock *iso_get_sock_listen(bdaddr_t *src, bdaddr_t *dst,
+					iso_sock_match_t match, void *data)
 {
 	struct sock *sk = NULL, *sk1 = NULL;
 
@@ -372,6 +478,14 @@ static struct sock *iso_get_sock_listen(bdaddr_t *src)
 		if (sk->sk_state != BT_LISTEN)
 			continue;
 
+		/* Match Broadcast destination */
+		if (bacmp(dst, BDADDR_ANY) && bacmp(&iso_pi(sk)->dst, dst))
+			continue;
+
+		/* Use Match function if provided */
+		if (match && !match(sk, data))
+			continue;
+
 		/* Exact match. */
 		if (!bacmp(&iso_pi(sk)->src, src))
 			break;
@@ -587,6 +701,38 @@ static int iso_sock_create(struct net *net, struct socket *sock, int protocol,
 	return 0;
 }
 
+static int iso_sock_bind_bc(struct socket *sock, struct sockaddr *addr,
+			    int addr_len)
+{
+	struct sockaddr_iso *sa = (struct sockaddr_iso *)addr;
+	struct sock *sk = sock->sk;
+	int i;
+
+	BT_DBG("sk %p bc_sid %u bc_num_bis %u", sk, sa->iso_bc->bc_sid,
+	       sa->iso_bc->bc_num_bis);
+
+	if (addr_len > sizeof(*sa) + sizeof(*sa->iso_bc) ||
+	    sa->iso_bc->bc_num_bis < 0x01 || sa->iso_bc->bc_num_bis > 0x1f)
+		return -EINVAL;
+
+	bacpy(&iso_pi(sk)->dst, &sa->iso_bc->bc_bdaddr);
+	iso_pi(sk)->dst_type = sa->iso_bc->bc_bdaddr_type;
+	iso_pi(sk)->sync_handle = -1;
+	iso_pi(sk)->bc_sid = sa->iso_bc->bc_sid;
+	iso_pi(sk)->bc_num_bis = sa->iso_bc->bc_num_bis;
+
+	for (i = 0; i < iso_pi(sk)->bc_num_bis; i++) {
+		if (sa->iso_bc->bc_bis[i] < 0x01 ||
+		    sa->iso_bc->bc_bis[i] > 0x1f)
+			return -EINVAL;
+
+		memcpy(iso_pi(sk)->bc_bis, sa->iso_bc->bc_bis,
+		       iso_pi(sk)->bc_num_bis);
+	}
+
+	return 0;
+}
+
 static int iso_sock_bind(struct socket *sock, struct sockaddr *addr,
 			 int addr_len)
 {
@@ -621,6 +767,13 @@ static int iso_sock_bind(struct socket *sock, struct sockaddr *addr,
 	bacpy(&iso_pi(sk)->src, &sa->iso_bdaddr);
 	iso_pi(sk)->src_type = sa->iso_bdaddr_type;
 
+	/* Check for Broadcast address */
+	if (addr_len > sizeof(*sa)) {
+		err = iso_sock_bind_bc(sock, addr, addr_len);
+		if (err)
+			goto done;
+	}
+
 	sk->sk_state = BT_BOUND;
 
 done:
@@ -656,7 +809,11 @@ static int iso_sock_connect(struct socket *sock, struct sockaddr *addr,
 	bacpy(&iso_pi(sk)->dst, &sa->iso_bdaddr);
 	iso_pi(sk)->dst_type = sa->iso_bdaddr_type;
 
-	err = iso_connect(sk);
+	if (bacmp(&iso_pi(sk)->dst, BDADDR_ANY))
+		err = iso_connect_cis(sk);
+	else
+		err = iso_connect_bis(sk);
+
 	if (err)
 		goto done;
 
@@ -670,10 +827,59 @@ done:
 	return err;
 }
 
+static int iso_listen_bis(struct sock *sk)
+{
+	struct hci_dev *hdev;
+	int err = 0;
+
+	BT_DBG("%pMR -> %pMR (SID 0x%2.2x)", &iso_pi(sk)->src,
+	       &iso_pi(sk)->dst, iso_pi(sk)->bc_sid);
+
+	write_lock(&iso_sk_list.lock);
+
+	if (__iso_get_sock_listen_by_sid(&iso_pi(sk)->src, &iso_pi(sk)->dst,
+					 iso_pi(sk)->bc_sid))
+		err = -EADDRINUSE;
+
+	write_unlock(&iso_sk_list.lock);
+
+	if (err)
+		return err;
+
+	hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src,
+			     iso_pi(sk)->src_type);
+	if (!hdev)
+		return -EHOSTUNREACH;
+
+	hci_dev_lock(hdev);
+
+	err = hci_pa_create_sync(hdev, &iso_pi(sk)->dst, iso_pi(sk)->dst_type,
+				 iso_pi(sk)->bc_sid);
+
+	hci_dev_unlock(hdev);
+
+	return err;
+}
+
+static int iso_listen_cis(struct sock *sk)
+{
+	int err = 0;
+
+	BT_DBG("%pMR", &iso_pi(sk)->src);
+
+	write_lock(&iso_sk_list.lock);
+
+	if (__iso_get_sock_listen_by_addr(&iso_pi(sk)->src))
+		err = -EADDRINUSE;
+
+	write_unlock(&iso_sk_list.lock);
+
+	return err;
+}
+
 static int iso_sock_listen(struct socket *sock, int backlog)
 {
 	struct sock *sk = sock->sk;
-	bdaddr_t *src = &iso_pi(sk)->src;
 	int err = 0;
 
 	BT_DBG("sk %p backlog %d", sk, backlog);
@@ -690,21 +896,19 @@ static int iso_sock_listen(struct socket *sock, int backlog)
 		goto done;
 	}
 
-	write_lock(&iso_sk_list.lock);
+	if (!bacmp(&iso_pi(sk)->dst, BDADDR_ANY))
+		err = iso_listen_cis(sk);
+	else
+		err = iso_listen_bis(sk);
 
-	if (__iso_get_sock_listen_by_addr(src)) {
-		err = -EADDRINUSE;
-		goto unlock;
-	}
+	if (err)
+		goto done;
 
 	sk->sk_max_ack_backlog = backlog;
 	sk->sk_ack_backlog = 0;
 
 	sk->sk_state = BT_LISTEN;
 
-unlock:
-	write_unlock(&iso_sk_list.lock);
-
 done:
 	release_sock(sk);
 	return err;
@@ -886,7 +1090,7 @@ static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg,
 			release_sock(sk);
 			return 0;
 		case BT_CONNECT:
-			err = iso_connect(sk);
+			err = iso_connect_cis(sk);
 			release_sock(sk);
 			return err;
 		}
@@ -917,10 +1121,6 @@ static bool check_io_qos(struct bt_iso_io_qos *qos)
 
 static bool check_qos(struct bt_iso_qos *qos)
 {
-	/* CIS shall not be set */
-	if (qos->cis != BT_ISO_QOS_CIS_UNSET)
-		return false;
-
 	if (qos->sca > 0x07)
 		return false;
 
@@ -996,6 +1196,29 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname,
 
 		break;
 
+	case BT_ISO_BASE:
+		if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND &&
+		    sk->sk_state != BT_CONNECT2) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (optlen > sizeof(iso_pi(sk)->base)) {
+			err = -EOVERFLOW;
+			break;
+		}
+
+		len = min_t(unsigned int, sizeof(iso_pi(sk)->base), optlen);
+
+		if (copy_from_sockptr(iso_pi(sk)->base, optval, len)) {
+			err = -EFAULT;
+			break;
+		}
+
+		iso_pi(sk)->base_len = len;
+
+		break;
+
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -1011,6 +1234,8 @@ static int iso_sock_getsockopt(struct socket *sock, int level, int optname,
 	struct sock *sk = sock->sk;
 	int len, err = 0;
 	struct bt_iso_qos qos;
+	u8 base_len;
+	u8 *base;
 
 	BT_DBG("sk %p", sk);
 
@@ -1044,6 +1269,21 @@ static int iso_sock_getsockopt(struct socket *sock, int level, int optname,
 
 		break;
 
+	case BT_ISO_BASE:
+		if (sk->sk_state == BT_CONNECTED) {
+			base_len = iso_pi(sk)->conn->hcon->le_per_adv_data_len;
+			base = iso_pi(sk)->conn->hcon->le_per_adv_data;
+		} else {
+			base_len = iso_pi(sk)->base_len;
+			base = iso_pi(sk)->base;
+		}
+
+		len = min_t(unsigned int, len, base_len);
+		if (copy_to_user(optval, base, len))
+			err = -EFAULT;
+
+		break;
+
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -1126,10 +1366,18 @@ struct iso_list_data {
 	int count;
 };
 
+static bool iso_match_big(struct sock *sk, void *data)
+{
+	struct hci_evt_le_big_sync_estabilished *ev = data;
+
+	return ev->handle == iso_pi(sk)->qos.big;
+}
+
 static void iso_conn_ready(struct iso_conn *conn)
 {
 	struct sock *parent;
 	struct sock *sk = conn->sk;
+	struct hci_ev_le_big_sync_estabilished *ev;
 
 	BT_DBG("conn %p", conn);
 
@@ -1143,7 +1391,16 @@ static void iso_conn_ready(struct iso_conn *conn)
 			return;
 		}
 
-		parent = iso_get_sock_listen(&conn->hcon->src);
+		ev = hci_recv_event_data(conn->hcon->hdev,
+					 HCI_EVT_LE_BIG_SYNC_ESTABILISHED);
+		if (ev)
+			parent = iso_get_sock_listen(&conn->hcon->src,
+						     &conn->hcon->dst,
+						     iso_match_big, ev);
+		else
+			parent = iso_get_sock_listen(&conn->hcon->src,
+						     BDADDR_ANY, NULL, NULL);
+
 		if (!parent) {
 			iso_conn_unlock(conn);
 			return;
@@ -1163,6 +1420,17 @@ static void iso_conn_ready(struct iso_conn *conn)
 
 		bacpy(&iso_pi(sk)->src, &conn->hcon->src);
 		iso_pi(sk)->src_type = conn->hcon->src_type;
+
+		/* If hcon has no destination address (BDADDR_ANY) it means it
+		 * was created by HCI_EV_LE_BIG_SYNC_ESTABILISHED so we need to
+		 * initialize using the parent socket destination address.
+		 */
+		if (!bacmp(&conn->hcon->dst, BDADDR_ANY)) {
+			bacpy(&conn->hcon->dst, &iso_pi(parent)->dst);
+			conn->hcon->dst_type = iso_pi(parent)->dst_type;
+			conn->hcon->sync_handle = iso_pi(parent)->sync_handle;
+		}
+
 		bacpy(&iso_pi(sk)->dst, &conn->hcon->dst);
 		iso_pi(sk)->dst_type = conn->hcon->dst_type;
 
@@ -1183,30 +1451,85 @@ static void iso_conn_ready(struct iso_conn *conn)
 	}
 }
 
+static bool iso_match_sid(struct sock *sk, void *data)
+{
+	struct hci_ev_le_pa_sync_established *ev = data;
+
+	return ev->sid == iso_pi(sk)->bc_sid;
+}
+
+static bool iso_match_sync_handle(struct sock *sk, void *data)
+{
+	struct hci_evt_le_big_info_adv_report *ev = data;
+
+	return le16_to_cpu(ev->sync_handle) == iso_pi(sk)->sync_handle;
+}
+
 /* ----- ISO interface with lower layer (HCI) ----- */
+
 int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
 {
+	struct hci_ev_le_pa_sync_established *ev1;
+	struct hci_evt_le_big_info_adv_report *ev2;
 	struct sock *sk;
 	int lm = 0;
 
-	BT_DBG("hdev %s, bdaddr %pMR", hdev->name, bdaddr);
-
-	/* Find listening sockets */
-	read_lock(&iso_sk_list.lock);
-	sk_for_each(sk, &iso_sk_list.head) {
-		if (sk->sk_state != BT_LISTEN)
-			continue;
+	bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr);
+
+	/* Broadcast receiver requires handling of some events before it can
+	 * proceed to establishing a BIG sync:
+	 *
+	 * 1. HCI_EV_LE_PA_SYNC_ESTABLISHED: The socket may specify a specific
+	 * SID to listen to and once sync is estabilished its handle needs to
+	 * be stored in iso_pi(sk)->sync_handle so it can be matched once
+	 * receiving the BIG Info.
+	 * 2. HCI_EVT_LE_BIG_INFO_ADV_REPORT: When connect_ind is triggered by a
+	 * a BIG Info it attempts to check if there any listening socket with
+	 * the same sync_handle and if it does then attempt to create a sync.
+	 */
+	ev1 = hci_recv_event_data(hdev, HCI_EV_LE_PA_SYNC_ESTABLISHED);
+	if (ev1) {
+		sk = iso_get_sock_listen(&hdev->bdaddr, bdaddr, iso_match_sid,
+					 ev1);
+		if (sk)
+			iso_pi(sk)->sync_handle = le16_to_cpu(ev1->handle);
 
-		if (!bacmp(&iso_pi(sk)->src, &hdev->bdaddr) ||
-		    !bacmp(&iso_pi(sk)->src, BDADDR_ANY)) {
-			lm |= HCI_LM_ACCEPT;
+		goto done;
+	}
 
-			if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags))
-				*flags |= HCI_PROTO_DEFER;
-			break;
+	ev2 = hci_recv_event_data(hdev, HCI_EVT_LE_BIG_INFO_ADV_REPORT);
+	if (ev2) {
+		sk = iso_get_sock_listen(&hdev->bdaddr, bdaddr,
+					 iso_match_sync_handle, ev2);
+		if (sk) {
+			int err;
+
+			if (ev2->num_bis < iso_pi(sk)->bc_num_bis)
+				iso_pi(sk)->bc_num_bis = ev2->num_bis;
+
+			err = hci_le_big_create_sync(hdev,
+						     &iso_pi(sk)->qos,
+						     iso_pi(sk)->sync_handle,
+						     iso_pi(sk)->bc_num_bis,
+						     iso_pi(sk)->bc_bis);
+			if (err) {
+				bt_dev_err(hdev, "hci_le_big_create_sync: %d",
+					   err);
+				sk = NULL;
+			}
 		}
+	} else {
+		sk = iso_get_sock_listen(&hdev->bdaddr, BDADDR_ANY, NULL, NULL);
 	}
-	read_unlock(&iso_sk_list.lock);
+
+done:
+	if (!sk)
+		return lm;
+
+	lm |= HCI_LM_ACCEPT;
+
+	if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags))
+		*flags |= HCI_PROTO_DEFER;
 
 	return lm;
 }
-- 
cgit 


From 7074732c8faee201a245a6f983008a5789c0be33 Mon Sep 17 00:00:00 2001
From: Matthias May <matthias.may@westermo.com>
Date: Thu, 21 Jul 2022 22:27:19 +0200
Subject: ip_tunnels: allow VXLAN/GENEVE to inherit TOS/TTL from VLAN

The current code allows for VXLAN and GENEVE to inherit the TOS
respective the TTL when skb-protocol is ETH_P_IP or ETH_P_IPV6.
However when the payload is VLAN encapsulated, then this inheriting
does not work, because the visible skb-protocol is of type
ETH_P_8021Q or ETH_P_8021AD.

Instead of skb->protocol use skb_protocol().

Signed-off-by: Matthias May <matthias.may@westermo.com>
Link: https://lore.kernel.org/r/20220721202718.10092-1-matthias.may@westermo.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ip_tunnels.h | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 70cbc4a72669..20db95055db3 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -387,9 +387,11 @@ static inline int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
 static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph,
 				       const struct sk_buff *skb)
 {
-	if (skb->protocol == htons(ETH_P_IP))
+	__be16 payload_protocol = skb_protocol(skb, true);
+
+	if (payload_protocol == htons(ETH_P_IP))
 		return iph->tos;
-	else if (skb->protocol == htons(ETH_P_IPV6))
+	else if (payload_protocol == htons(ETH_P_IPV6))
 		return ipv6_get_dsfield((const struct ipv6hdr *)iph);
 	else
 		return 0;
@@ -398,9 +400,11 @@ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph,
 static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph,
 				       const struct sk_buff *skb)
 {
-	if (skb->protocol == htons(ETH_P_IP))
+	__be16 payload_protocol = skb_protocol(skb, true);
+
+	if (payload_protocol == htons(ETH_P_IP))
 		return iph->ttl;
-	else if (skb->protocol == htons(ETH_P_IPV6))
+	else if (payload_protocol == htons(ETH_P_IPV6))
 		return ((const struct ipv6hdr *)iph)->hop_limit;
 	else
 		return 0;
-- 
cgit 


From e8f90717ed3b58e81c480b3aa38e641c0da5a456 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Fri, 22 Jul 2022 19:02:47 -0700
Subject: vfio: Make vfio_unpin_pages() return void

There's only one caller that checks its return value with a WARN_ON_ONCE,
while all other callers don't check the return value at all. Above that,
an undo function should not fail. So, simplify the API to return void by
embedding similar WARN_ONs.

Also for users to pinpoint which condition fails, separate WARN_ON lines,
yet remove the "driver->ops->unpin_pages" check, since it's unreasonable
for callers to unpin on something totally random that wasn't even pinned.
And remove NULL pointer checks for they would trigger oops vs. warnings.
Note that npage is already validated in the vfio core, thus drop the same
check in the type1 code.

Suggested-by: Christoph Hellwig <hch@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com>
Tested-by: Terrence Xu <terrence.xu@intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Link: https://lore.kernel.org/r/20220723020256.30081-2-nicolinc@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 Documentation/driver-api/vfio-mediated-device.rst |  2 +-
 drivers/gpu/drm/i915/gvt/kvmgt.c                  |  5 +----
 drivers/vfio/vfio.c                               | 21 +++++++--------------
 drivers/vfio/vfio.h                               |  2 +-
 drivers/vfio/vfio_iommu_type1.c                   | 15 ++++++---------
 include/linux/vfio.h                              |  4 ++--
 6 files changed, 18 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst
index 1c57815619fd..b0fdf76b339a 100644
--- a/Documentation/driver-api/vfio-mediated-device.rst
+++ b/Documentation/driver-api/vfio-mediated-device.rst
@@ -265,7 +265,7 @@ driver::
 	int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
 				  int npage, int prot, unsigned long *phys_pfn);
 
-	int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
+	void vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
 				    int npage);
 
 These functions call back into the back-end IOMMU module by using the pin_pages
diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index ecd5bb37b63a..4d32a2748958 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -231,18 +231,15 @@ static void intel_gvt_cleanup_vgpu_type_groups(struct intel_gvt *gvt)
 static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
 		unsigned long size)
 {
-	struct drm_i915_private *i915 = vgpu->gvt->gt->i915;
 	int total_pages;
 	int npage;
-	int ret;
 
 	total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;
 
 	for (npage = 0; npage < total_pages; npage++) {
 		unsigned long cur_gfn = gfn + npage;
 
-		ret = vfio_unpin_pages(&vgpu->vfio_device, &cur_gfn, 1);
-		drm_WARN_ON(&i915->drm, ret != 1);
+		vfio_unpin_pages(&vgpu->vfio_device, &cur_gfn, 1);
 	}
 }
 
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index b3ce8073cfb1..92b10aafae28 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1983,31 +1983,24 @@ EXPORT_SYMBOL(vfio_pin_pages);
  *		   PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
  * @npage [in]   : count of elements in user_pfn array.  This count should not
  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
- * Return error or number of pages unpinned.
  */
-int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
-		     int npage)
+void vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
+		      int npage)
 {
 	struct vfio_container *container;
 	struct vfio_iommu_driver *driver;
-	int ret;
 
-	if (!user_pfn || !npage || !vfio_assert_device_open(device))
-		return -EINVAL;
+	if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES))
+		return;
 
-	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
-		return -E2BIG;
+	if (WARN_ON(!vfio_assert_device_open(device)))
+		return;
 
 	/* group->container cannot change while a vfio device is open */
 	container = device->group->container;
 	driver = container->iommu_driver;
-	if (likely(driver && driver->ops->unpin_pages))
-		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
-					       npage);
-	else
-		ret = -ENOTTY;
 
-	return ret;
+	driver->ops->unpin_pages(container->iommu_data, user_pfn, npage);
 }
 EXPORT_SYMBOL(vfio_unpin_pages);
 
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index 4a7db1f3c33e..6a8424b407c7 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -53,7 +53,7 @@ struct vfio_iommu_driver_ops {
 				     unsigned long *user_pfn,
 				     int npage, int prot,
 				     unsigned long *phys_pfn);
-	int		(*unpin_pages)(void *iommu_data,
+	void		(*unpin_pages)(void *iommu_data,
 				       unsigned long *user_pfn, int npage);
 	void		(*register_device)(void *iommu_data,
 					   struct vfio_device *vdev);
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 026a1d2553a2..e49fbe9968ef 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -949,20 +949,16 @@ pin_done:
 	return ret;
 }
 
-static int vfio_iommu_type1_unpin_pages(void *iommu_data,
-					unsigned long *user_pfn,
-					int npage)
+static void vfio_iommu_type1_unpin_pages(void *iommu_data,
+					 unsigned long *user_pfn, int npage)
 {
 	struct vfio_iommu *iommu = iommu_data;
 	bool do_accounting;
 	int i;
 
-	if (!iommu || !user_pfn || npage <= 0)
-		return -EINVAL;
-
 	/* Supported for v2 version only */
-	if (!iommu->v2)
-		return -EACCES;
+	if (WARN_ON(!iommu->v2))
+		return;
 
 	mutex_lock(&iommu->lock);
 
@@ -980,7 +976,8 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data,
 	}
 
 	mutex_unlock(&iommu->lock);
-	return i > 0 ? i : -EINVAL;
+
+	WARN_ON(i != npage);
 }
 
 static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 19cefbaa3d06..9f7d74c24925 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -163,8 +163,8 @@ bool vfio_file_has_dev(struct file *file, struct vfio_device *device);
 
 int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
 		   int npage, int prot, unsigned long *phys_pfn);
-int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
-		     int npage);
+void vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
+		      int npage);
 int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova,
 		void *data, size_t len, bool write);
 
-- 
cgit 


From 6a010a49b63ac8465851a79185d8deff966f8e1a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 23 Jul 2022 04:28:28 -1000
Subject: cgroup: Make !percpu threadgroup_rwsem operations optional
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

3942a9bd7b58 ("locking, rcu, cgroup: Avoid synchronize_sched() in
__cgroup_procs_write()") disabled percpu operations on threadgroup_rwsem
because the impiled synchronize_rcu() on write locking was pushing up the
latencies too much for android which constantly moves processes between
cgroups.

This makes the hotter paths - fork and exit - slower as they're always
forced into the slow path. There is no reason to force this on everyone
especially given that more common static usage pattern can now completely
avoid write-locking the rwsem. Write-locking is elided when turning on and
off controllers on empty sub-trees and CLONE_INTO_CGROUP enables seeding a
cgroup without grabbing the rwsem.

Restore the default percpu operations and introduce the mount option
"favordynmods" and config option CGROUP_FAVOR_DYNMODS for users who need
lower latencies for the dynamic operations.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Michal Koutnï¿½ <mkoutny@suse.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Dmitry Shmidt <dimitrysh@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
---
 Documentation/admin-guide/cgroup-v2.rst |  8 ++++++
 include/linux/cgroup-defs.h             | 19 ++++++++++++---
 init/Kconfig                            | 10 ++++++++
 kernel/cgroup/cgroup-internal.h         |  1 +
 kernel/cgroup/cgroup-v1.c               | 17 ++++++++++++-
 kernel/cgroup/cgroup.c                  | 43 +++++++++++++++++++++++++++------
 6 files changed, 87 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index f0f03d5470b5..4bd0bee22a0d 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -184,6 +184,14 @@ cgroup v2 currently supports the following mount options.
 	ignored on non-init namespace mounts.  Please refer to the
 	Delegation section for details.
 
+  [no]favordynmods
+        Reduce the latencies of dynamic cgroup modifications such as
+        task migrations and controller on/offs at the cost of making
+        hot path operations such as forks and exits more expensive.
+        The static usage pattern of creating a cgroup, enabling
+        controllers, and then seeding it with CLONE_INTO_CGROUP is
+        not affected by this option.
+
   memory_[no]localevents
         Only populate memory.events with data for the current cgroup,
         and not any subtrees. This is legacy behaviour, the default
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 672de25e3ec8..63bf43c7ca3b 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -88,20 +88,33 @@ enum {
 	 */
 	CGRP_ROOT_NS_DELEGATE	= (1 << 3),
 
+	/*
+	 * Reduce latencies on dynamic cgroup modifications such as task
+	 * migrations and controller on/offs by disabling percpu operation on
+	 * cgroup_threadgroup_rwsem. This makes hot path operations such as
+	 * forks and exits into the slow path and more expensive.
+	 *
+	 * The static usage pattern of creating a cgroup, enabling controllers,
+	 * and then seeding it with CLONE_INTO_CGROUP doesn't require write
+	 * locking cgroup_threadgroup_rwsem and thus doesn't benefit from
+	 * favordynmod.
+	 */
+	CGRP_ROOT_FAVOR_DYNMODS = (1 << 4),
+
 	/*
 	 * Enable cpuset controller in v1 cgroup to use v2 behavior.
 	 */
-	CGRP_ROOT_CPUSET_V2_MODE = (1 << 4),
+	CGRP_ROOT_CPUSET_V2_MODE = (1 << 16),
 
 	/*
 	 * Enable legacy local memory.events.
 	 */
-	CGRP_ROOT_MEMORY_LOCAL_EVENTS = (1 << 5),
+	CGRP_ROOT_MEMORY_LOCAL_EVENTS = (1 << 17),
 
 	/*
 	 * Enable recursive subtree protection
 	 */
-	CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 6),
+	CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 18),
 };
 
 /* cftype->flags */
diff --git a/init/Kconfig b/init/Kconfig
index c984afc489de..c93b10b3de3f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -936,6 +936,16 @@ if CGROUPS
 config PAGE_COUNTER
 	bool
 
+config CGROUP_FAVOR_DYNMODS
+        bool "Favor dynamic modification latency reduction by default"
+        help
+          This option enables the "favordynmods" mount option by default
+          which reduces the latencies of dynamic cgroup modifications such
+          as task migrations and controller on/offs at the cost of making
+          hot path operations such as forks and exits more expensive.
+
+          Say N if unsure.
+
 config MEMCG
 	bool "Memory controller"
 	select PAGE_COUNTER
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 5da09c74228d..36b740cb3d59 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -233,6 +233,7 @@ void cgroup_kn_unlock(struct kernfs_node *kn);
 int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
 			  struct cgroup_namespace *ns);
 
+void cgroup_favor_dynmods(struct cgroup_root *root, bool favor);
 void cgroup_free_root(struct cgroup_root *root);
 void init_cgroup_root(struct cgroup_fs_context *ctx);
 int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index afc6c0e9c966..2ade21b54dc4 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -875,6 +875,8 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo
 		seq_puts(seq, ",xattr");
 	if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
 		seq_puts(seq, ",cpuset_v2_mode");
+	if (root->flags & CGRP_ROOT_FAVOR_DYNMODS)
+		seq_puts(seq, ",favordynmods");
 
 	spin_lock(&release_agent_path_lock);
 	if (strlen(root->release_agent_path))
@@ -898,6 +900,8 @@ enum cgroup1_param {
 	Opt_noprefix,
 	Opt_release_agent,
 	Opt_xattr,
+	Opt_favordynmods,
+	Opt_nofavordynmods,
 };
 
 const struct fs_parameter_spec cgroup1_fs_parameters[] = {
@@ -909,6 +913,8 @@ const struct fs_parameter_spec cgroup1_fs_parameters[] = {
 	fsparam_flag  ("noprefix",	Opt_noprefix),
 	fsparam_string("release_agent",	Opt_release_agent),
 	fsparam_flag  ("xattr",		Opt_xattr),
+	fsparam_flag  ("favordynmods",	Opt_favordynmods),
+	fsparam_flag  ("nofavordynmods", Opt_nofavordynmods),
 	{}
 };
 
@@ -960,6 +966,12 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
 	case Opt_xattr:
 		ctx->flags |= CGRP_ROOT_XATTR;
 		break;
+	case Opt_favordynmods:
+		ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+		break;
+	case Opt_nofavordynmods:
+		ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
+		break;
 	case Opt_release_agent:
 		/* Specifying two release agents is forbidden */
 		if (ctx->release_agent)
@@ -1211,8 +1223,11 @@ static int cgroup1_root_to_use(struct fs_context *fc)
 	init_cgroup_root(ctx);
 
 	ret = cgroup_setup_root(root, ctx->subsys_mask);
-	if (ret)
+	if (!ret)
+		cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS);
+	else
 		cgroup_free_root(root);
+
 	return ret;
 }
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 9ce24d5cf2d5..7d023d42a6a5 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1305,6 +1305,20 @@ struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
 	return root_cgrp->root;
 }
 
+void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
+{
+	bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;
+
+	/* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */
+	if (favor && !favoring) {
+		rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
+		root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+	} else if (!favor && favoring) {
+		rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
+		root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
+	}
+}
+
 static int cgroup_init_root_id(struct cgroup_root *root)
 {
 	int id;
@@ -1365,6 +1379,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
 		cgroup_root_count--;
 	}
 
+	cgroup_favor_dynmods(root, false);
 	cgroup_exit_root_id(root);
 
 	mutex_unlock(&cgroup_mutex);
@@ -1858,6 +1873,7 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 
 enum cgroup2_param {
 	Opt_nsdelegate, Opt_nonsdelegate,
+	Opt_favordynmods, Opt_nofavordynmods,
 	Opt_memory_localevents, Opt_memory_nolocalevents,
 	Opt_memory_recursiveprot, Opt_memory_norecursiveprot,
 	nr__cgroup2_params
@@ -1866,6 +1882,8 @@ enum cgroup2_param {
 static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
 	fsparam_flag("nsdelegate",		Opt_nsdelegate),
 	fsparam_flag("nonsdelegate",		Opt_nonsdelegate),
+	fsparam_flag("favordynmods",		Opt_favordynmods),
+	fsparam_flag("nofavordynmods",		Opt_nofavordynmods),
 	fsparam_flag("memory_localevents",	Opt_memory_localevents),
 	fsparam_flag("memory_nolocalevents",	Opt_memory_nolocalevents),
 	fsparam_flag("memory_recursiveprot",	Opt_memory_recursiveprot),
@@ -1890,6 +1908,12 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
 	case Opt_nonsdelegate:
 		ctx->flags &= ~CGRP_ROOT_NS_DELEGATE;
 		return 0;
+	case Opt_favordynmods:
+		ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+		return 0;
+	case Opt_nofavordynmods:
+		ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
+		return 0;
 	case Opt_memory_localevents:
 		ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
 		return 0;
@@ -1914,6 +1938,9 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
 		else
 			cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
 
+		cgroup_favor_dynmods(&cgrp_dfl_root,
+				     root_flags & CGRP_ROOT_FAVOR_DYNMODS);
+
 		if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
 			cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
 		else
@@ -1930,6 +1957,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
 {
 	if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
 		seq_puts(seq, ",nsdelegate");
+	if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS)
+		seq_puts(seq, ",favordynmods");
 	if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
 		seq_puts(seq, ",memory_localevents");
 	if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
@@ -1980,7 +2009,8 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
 	cgrp->root = root;
 	init_cgroup_housekeeping(cgrp);
 
-	root->flags = ctx->flags;
+	/* DYNMODS must be modified through cgroup_favor_dynmods() */
+	root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS;
 	if (ctx->release_agent)
 		strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
 	if (ctx->name)
@@ -2202,6 +2232,10 @@ static int cgroup_init_fs_context(struct fs_context *fc)
 	put_user_ns(fc->user_ns);
 	fc->user_ns = get_user_ns(ctx->ns->user_ns);
 	fc->global = true;
+
+#ifdef CONFIG_CGROUP_FAVOR_DYNMODS
+	ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+#endif
 	return 0;
 }
 
@@ -5854,12 +5888,6 @@ int __init cgroup_init(void)
 
 	cgroup_rstat_boot();
 
-	/*
-	 * The latency of the synchronize_rcu() is too high for cgroups,
-	 * avoid it at the cost of forcing all readers into the slow path.
-	 */
-	rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
-
 	get_user_ns(init_cgroup_ns.user_ns);
 
 	mutex_lock(&cgroup_mutex);
@@ -6771,6 +6799,7 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
 {
 	return snprintf(buf, PAGE_SIZE,
 			"nsdelegate\n"
+			"favordynmods\n"
 			"memory_localevents\n"
 			"memory_recursiveprot\n");
 }
-- 
cgit 


From ba8ec7a607e98e8491a1fcf924a2e6c96ac9d413 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 23 Jul 2022 14:47:28 -0400
Subject: SUNRPC: Shrink size of struct rpc_task

Move the field 'tk_rpc_status' so that we eliminate a 4 byte hole in the
structure.
For x86_64, this shrinks the size of the struct by 8 bytes.

'pahole' output before the change:
        /* size: 232, cachelines: 4, members: 27 */
        /* sum members: 222, holes: 1, sum holes: 4 */
        /* sum bitfield members: 8 bits (1 bytes) */
        /* padding: 5 */
        /* last cacheline: 40 bytes */

'pahole' output after the change:
        /* size: 224, cachelines: 4, members: 27 */
        /* padding: 1 */
        /* last cacheline: 32 bytes */

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/sched.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 1d7a3e51b795..acc62647317c 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -61,8 +61,6 @@ struct rpc_task {
 		struct rpc_wait		tk_wait;	/* RPC wait */
 	} u;
 
-	int			tk_rpc_status;	/* Result of last RPC operation */
-
 	/*
 	 * RPC call state
 	 */
@@ -82,6 +80,8 @@ struct rpc_task {
 	ktime_t			tk_start;	/* RPC task init timestamp */
 
 	pid_t			tk_owner;	/* Process id for batching tasks */
+
+	int			tk_rpc_status;	/* Result of last RPC operation */
 	unsigned short		tk_flags;	/* misc flags */
 	unsigned short		tk_timeouts;	/* maj timeouts */
 
-- 
cgit 


From 69d966510d9f5de81588b37d23a9ee8ccc477b23 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 22 Jul 2022 14:12:20 -0400
Subject: nfs: only issue commit in DIO codepath if we have uncommitted data

Currently, we try to determine whether to issue a commit based on
nfs_write_need_commit which looks at the current verifier. In the case
where we got a short write and then tried to follow it up with one that
failed, the verifier can't be trusted.

What we really want to know is whether the pgio request had any
successful writes that came back as UNSTABLE. Add a new flag to the pgio
request, and use that to indicate that we've had a successful unstable
write. Only issue a commit if that flag is set.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/direct.c         |  2 +-
 fs/nfs/write.c          | 48 ++++++++++++++++++++++++++++++------------------
 include/linux/nfs_xdr.h |  1 +
 3 files changed, 32 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index a47d13296194..86df66bb14c5 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -690,7 +690,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 	}
 
 	nfs_direct_count_bytes(dreq, hdr);
-	if (hdr->good_bytes != 0 && nfs_write_need_commit(hdr)) {
+	if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags)) {
 		if (!dreq->flags)
 			dreq->flags = NFS_ODIRECT_DO_COMMIT;
 		flags = dreq->flags;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 1c706465d090..16d166bc4099 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1576,25 +1576,37 @@ static int nfs_writeback_done(struct rpc_task *task,
 	nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count);
 	trace_nfs_writeback_done(task, hdr);
 
-	if (hdr->res.verf->committed < hdr->args.stable &&
-	    task->tk_status >= 0) {
-		/* We tried a write call, but the server did not
-		 * commit data to stable storage even though we
-		 * requested it.
-		 * Note: There is a known bug in Tru64 < 5.0 in which
-		 *	 the server reports NFS_DATA_SYNC, but performs
-		 *	 NFS_FILE_SYNC. We therefore implement this checking
-		 *	 as a dprintk() in order to avoid filling syslog.
-		 */
-		static unsigned long    complain;
+	if (task->tk_status >= 0) {
+		enum nfs3_stable_how committed = hdr->res.verf->committed;
+
+		if (committed == NFS_UNSTABLE) {
+			/*
+			 * We have some uncommitted data on the server at
+			 * this point, so ensure that we keep track of that
+			 * fact irrespective of what later writes do.
+			 */
+			set_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags);
+		}
 
-		/* Note this will print the MDS for a DS write */
-		if (time_before(complain, jiffies)) {
-			dprintk("NFS:       faulty NFS server %s:"
-				" (committed = %d) != (stable = %d)\n",
-				NFS_SERVER(inode)->nfs_client->cl_hostname,
-				hdr->res.verf->committed, hdr->args.stable);
-			complain = jiffies + 300 * HZ;
+		if (committed < hdr->args.stable) {
+			/* We tried a write call, but the server did not
+			 * commit data to stable storage even though we
+			 * requested it.
+			 * Note: There is a known bug in Tru64 < 5.0 in which
+			 *	 the server reports NFS_DATA_SYNC, but performs
+			 *	 NFS_FILE_SYNC. We therefore implement this checking
+			 *	 as a dprintk() in order to avoid filling syslog.
+			 */
+			static unsigned long    complain;
+
+			/* Note this will print the MDS for a DS write */
+			if (time_before(complain, jiffies)) {
+				dprintk("NFS:       faulty NFS server %s:"
+					" (committed = %d) != (stable = %d)\n",
+					NFS_SERVER(inode)->nfs_client->cl_hostname,
+					committed, hdr->args.stable);
+				complain = jiffies + 300 * HZ;
+			}
 		}
 	}
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 0e3aa0f5f324..e86cf6642d21 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1600,6 +1600,7 @@ enum {
 	NFS_IOHDR_STAT,
 	NFS_IOHDR_RESEND_PNFS,
 	NFS_IOHDR_RESEND_MDS,
+	NFS_IOHDR_UNSTABLE_WRITES,
 };
 
 struct nfs_io_completion;
-- 
cgit 


From f67939e4b045e1c8e857055463c0b5a88eca4844 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 22 Jul 2022 15:08:17 -0400
Subject: SUNRPC: Replace dprintk() call site in xs_data_ready

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/trace/events/sunrpc.h | 20 ++++++++++++++++++++
 net/sunrpc/xprtsock.c         |  6 ++++--
 2 files changed, 24 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index b61d9c90fa26..21068ad61db8 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -1266,6 +1266,26 @@ TRACE_EVENT(xprt_reserve,
 	)
 );
 
+TRACE_EVENT(xs_data_ready,
+	TP_PROTO(
+		const struct rpc_xprt *xprt
+	),
+
+	TP_ARGS(xprt),
+
+	TP_STRUCT__entry(
+		__string(addr, xprt->address_strings[RPC_DISPLAY_ADDR])
+		__string(port, xprt->address_strings[RPC_DISPLAY_PORT])
+	),
+
+	TP_fast_assign(
+		__assign_str(addr, xprt->address_strings[RPC_DISPLAY_ADDR]);
+		__assign_str(port, xprt->address_strings[RPC_DISPLAY_PORT]);
+	),
+
+	TP_printk("peer=[%s]:%s", __get_str(addr), __get_str(port))
+);
+
 TRACE_EVENT(xs_stream_read_data,
 	TP_PROTO(struct rpc_xprt *xprt, ssize_t err, size_t total),
 
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index fcdd0fca408e..eba1be9984f8 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1378,7 +1378,7 @@ static void xs_udp_data_receive_workfn(struct work_struct *work)
 }
 
 /**
- * xs_data_ready - "data ready" callback for UDP sockets
+ * xs_data_ready - "data ready" callback for sockets
  * @sk: socket with data to read
  *
  */
@@ -1386,11 +1386,13 @@ static void xs_data_ready(struct sock *sk)
 {
 	struct rpc_xprt *xprt;
 
-	dprintk("RPC:       xs_data_ready...\n");
 	xprt = xprt_from_sock(sk);
 	if (xprt != NULL) {
 		struct sock_xprt *transport = container_of(xprt,
 				struct sock_xprt, xprt);
+
+		trace_xs_data_ready(xprt);
+
 		transport->old_data_ready(sk);
 		/* Any data means we had a useful conversation, so
 		 * then we don't need to delay the next reconnect
-- 
cgit 


From 4f5f3b6028343d687d0533329b130e4b8280ab32 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Thu, 21 Jul 2022 14:21:31 -0400
Subject: SUNRPC: Introduce xdr_stream_move_subsegment()

I do this by creating an xdr subsegment for the range we will be
operating over. This lets me shift data to the correct place without
potentially overwriting anything already there.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xdr.h |  2 ++
 net/sunrpc/xdr.c           | 59 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+)

(limited to 'include')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 5860f32e3958..7dcc6c31fe29 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -262,6 +262,8 @@ extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, uns
 extern unsigned int xdr_expand_hole(struct xdr_stream *, unsigned int offset, unsigned int length);
 extern bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf,
 				  unsigned int len);
+extern unsigned int xdr_stream_move_subsegment(struct xdr_stream *xdr, unsigned int offset,
+					       unsigned int target, unsigned int length);
 
 /**
  * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data.
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 5d2b3e6979fb..8ba11a754297 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -775,6 +775,34 @@ static void xdr_buf_pages_shift_left(const struct xdr_buf *buf,
 	xdr_buf_tail_copy_left(buf, 0, len - buf->page_len, shift);
 }
 
+static void xdr_buf_head_shift_left(const struct xdr_buf *buf,
+				    unsigned int base, unsigned int len,
+				    unsigned int shift)
+{
+	const struct kvec *head = buf->head;
+	unsigned int bytes;
+
+	if (!shift || !len)
+		return;
+
+	if (shift > base) {
+		bytes = (shift - base);
+		if (bytes >= len)
+			return;
+		base += bytes;
+		len -= bytes;
+	}
+
+	if (base < head->iov_len) {
+		bytes = min_t(unsigned int, len, head->iov_len - base);
+		memmove(head->iov_base + (base - shift),
+			head->iov_base + base, bytes);
+		base += bytes;
+		len -= bytes;
+	}
+	xdr_buf_pages_shift_left(buf, base - head->iov_len, len, shift);
+}
+
 /**
  * xdr_shrink_bufhead
  * @buf: xdr_buf
@@ -1680,6 +1708,37 @@ bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf,
 }
 EXPORT_SYMBOL_GPL(xdr_stream_subsegment);
 
+/**
+ * xdr_stream_move_subsegment - Move part of a stream to another position
+ * @xdr: the source xdr_stream
+ * @offset: the source offset of the segment
+ * @target: the target offset of the segment
+ * @length: the number of bytes to move
+ *
+ * Moves @length bytes from @offset to @target in the xdr_stream, overwriting
+ * anything in its space. Returns the number of bytes in the segment.
+ */
+unsigned int xdr_stream_move_subsegment(struct xdr_stream *xdr, unsigned int offset,
+					unsigned int target, unsigned int length)
+{
+	struct xdr_buf buf;
+	unsigned int shift;
+
+	if (offset < target) {
+		shift = target - offset;
+		if (xdr_buf_subsegment(xdr->buf, &buf, offset, shift + length) < 0)
+			return 0;
+		xdr_buf_head_shift_right(&buf, 0, length, shift);
+	} else if (offset > target) {
+		shift = offset - target;
+		if (xdr_buf_subsegment(xdr->buf, &buf, target, shift + length) < 0)
+			return 0;
+		xdr_buf_head_shift_left(&buf, shift, length, shift);
+	}
+	return length;
+}
+EXPORT_SYMBOL_GPL(xdr_stream_move_subsegment);
+
 /**
  * xdr_buf_trim - lop at most "len" bytes off the end of "buf"
  * @buf: buf to be trimmed
-- 
cgit 


From 7c4cd5f4d2dd4a028a46bfb696b0cd387caadf33 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Thu, 21 Jul 2022 14:21:32 -0400
Subject: SUNRPC: Add a function for directly setting the xdr page len

We need to do this step during READ_PLUS decoding so that we know pages
are the right length and any extra data has been preserved in the tail.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xdr.h |  1 +
 net/sunrpc/xdr.c           | 30 ++++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'include')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 7dcc6c31fe29..8cd38a9994ca 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -258,6 +258,7 @@ extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes);
 extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
 extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len);
 extern int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data);
+extern void xdr_set_pagelen(struct xdr_stream *, unsigned int len);
 extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, unsigned int length);
 extern unsigned int xdr_expand_hole(struct xdr_stream *, unsigned int offset, unsigned int length);
 extern bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf,
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 8ba11a754297..e4ac700ca554 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1500,6 +1500,36 @@ unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len)
 }
 EXPORT_SYMBOL_GPL(xdr_read_pages);
 
+/**
+ * xdr_set_pagelen - Sets the length of the XDR pages
+ * @xdr: pointer to xdr_stream struct
+ * @len: new length of the XDR page data
+ *
+ * Either grows or shrinks the length of the xdr pages by setting pagelen to
+ * @len bytes. When shrinking, any extra data is moved into buf->tail, whereas
+ * when growing any data beyond the current pointer is moved into the tail.
+ *
+ * Returns True if the operation was successful, and False otherwise.
+ */
+void xdr_set_pagelen(struct xdr_stream *xdr, unsigned int len)
+{
+	struct xdr_buf *buf = xdr->buf;
+	size_t remaining = xdr_stream_remaining(xdr);
+	size_t base = 0;
+
+	if (len < buf->page_len) {
+		base = buf->page_len - len;
+		xdr_shrink_pagelen(buf, len);
+	} else {
+		xdr_buf_head_shift_right(buf, xdr_stream_pos(xdr),
+					 buf->page_len, remaining);
+		if (len > buf->page_len)
+			xdr_buf_try_expand(buf, len - buf->page_len);
+	}
+	xdr_set_tail_base(xdr, base, remaining);
+}
+EXPORT_SYMBOL_GPL(xdr_set_pagelen);
+
 unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset,
 			    unsigned int length)
 {
-- 
cgit 


From e1bd87608d4b6f87813f79b91e834de610f1049b Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Thu, 21 Jul 2022 14:21:33 -0400
Subject: SUNRPC: Add a function for zeroing out a portion of an xdr_stream

This will be used during READ_PLUS decoding for handling HOLE segments.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xdr.h |  2 ++
 net/sunrpc/xdr.c           | 23 +++++++++++++++++++++++
 2 files changed, 25 insertions(+)

(limited to 'include')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 8cd38a9994ca..f0ab06acab61 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -265,6 +265,8 @@ extern bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf
 				  unsigned int len);
 extern unsigned int xdr_stream_move_subsegment(struct xdr_stream *xdr, unsigned int offset,
 					       unsigned int target, unsigned int length);
+extern unsigned int xdr_stream_zero(struct xdr_stream *xdr, unsigned int offset,
+				    unsigned int length);
 
 /**
  * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data.
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index e4ac700ca554..f09a7ab1a82b 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1769,6 +1769,29 @@ unsigned int xdr_stream_move_subsegment(struct xdr_stream *xdr, unsigned int off
 }
 EXPORT_SYMBOL_GPL(xdr_stream_move_subsegment);
 
+/**
+ * xdr_stream_zero - zero out a portion of an xdr_stream
+ * @xdr: an xdr_stream to zero out
+ * @offset: the starting point in the stream
+ * @length: the number of bytes to zero
+ */
+unsigned int xdr_stream_zero(struct xdr_stream *xdr, unsigned int offset,
+			     unsigned int length)
+{
+	struct xdr_buf buf;
+
+	if (xdr_buf_subsegment(xdr->buf, &buf, offset, length) < 0)
+		return 0;
+	if (buf.head[0].iov_len)
+		xdr_buf_iov_zero(buf.head, 0, buf.head[0].iov_len);
+	if (buf.page_len > 0)
+		xdr_buf_pages_zero(&buf, 0, buf.page_len);
+	if (buf.tail[0].iov_len)
+		xdr_buf_iov_zero(buf.tail, 0, buf.tail[0].iov_len);
+	return length;
+}
+EXPORT_SYMBOL_GPL(xdr_stream_zero);
+
 /**
  * xdr_buf_trim - lop at most "len" bytes off the end of "buf"
  * @buf: buf to be trimmed
-- 
cgit 


From 29946fbcb2c31a2a367887dc58a2e7e5b012e285 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Thu, 21 Jul 2022 14:21:35 -0400
Subject: SUNRPC: Remove xdr_align_data() and xdr_expand_hole()

These functions are no longer needed now that the NFS client places data
and hole segments directly.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xdr.h |  2 --
 net/sunrpc/xdr.c           | 66 ----------------------------------------------
 2 files changed, 68 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index f0ab06acab61..f38c97f45354 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -259,8 +259,6 @@ extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
 extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len);
 extern int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data);
 extern void xdr_set_pagelen(struct xdr_stream *, unsigned int len);
-extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, unsigned int length);
-extern unsigned int xdr_expand_hole(struct xdr_stream *, unsigned int offset, unsigned int length);
 extern bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf,
 				  unsigned int len);
 extern unsigned int xdr_stream_move_subsegment(struct xdr_stream *xdr, unsigned int offset,
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index f09a7ab1a82b..482586c23fdd 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1530,72 +1530,6 @@ void xdr_set_pagelen(struct xdr_stream *xdr, unsigned int len)
 }
 EXPORT_SYMBOL_GPL(xdr_set_pagelen);
 
-unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset,
-			    unsigned int length)
-{
-	struct xdr_buf *buf = xdr->buf;
-	unsigned int from, bytes, len;
-	unsigned int shift;
-
-	xdr_realign_pages(xdr);
-	from = xdr_page_pos(xdr);
-
-	if (from >= buf->page_len + buf->tail->iov_len)
-		return 0;
-	if (from + buf->head->iov_len >= buf->len)
-		return 0;
-
-	len = buf->len - buf->head->iov_len;
-
-	/* We only shift data left! */
-	if (WARN_ONCE(from < offset, "SUNRPC: misaligned data src=%u dst=%u\n",
-		      from, offset))
-		return 0;
-	if (WARN_ONCE(offset > buf->page_len,
-		      "SUNRPC: buffer overflow. offset=%u, page_len=%u\n",
-		      offset, buf->page_len))
-		return 0;
-
-	/* Move page data to the left */
-	shift = from - offset;
-	xdr_buf_pages_shift_left(buf, from, len, shift);
-
-	bytes = xdr_stream_remaining(xdr);
-	if (length > bytes)
-		length = bytes;
-	bytes -= length;
-
-	xdr->buf->len -= shift;
-	xdr_set_page(xdr, offset + length, bytes);
-	return length;
-}
-EXPORT_SYMBOL_GPL(xdr_align_data);
-
-unsigned int xdr_expand_hole(struct xdr_stream *xdr, unsigned int offset,
-			     unsigned int length)
-{
-	struct xdr_buf *buf = xdr->buf;
-	unsigned int from, to, shift;
-
-	xdr_realign_pages(xdr);
-	from = xdr_page_pos(xdr);
-	to = xdr_align_size(offset + length);
-
-	/* Could the hole be behind us? */
-	if (to > from) {
-		unsigned int buflen = buf->len - buf->head->iov_len;
-		shift = to - from;
-		xdr_buf_try_expand(buf, shift);
-		xdr_buf_pages_shift_right(buf, from, buflen, shift);
-		xdr_set_page(xdr, to, xdr_stream_remaining(xdr));
-	} else if (to != from)
-		xdr_align_data(xdr, to, 0);
-	xdr_buf_pages_zero(buf, offset, length);
-
-	return length;
-}
-EXPORT_SYMBOL_GPL(xdr_expand_hole);
-
 /**
  * xdr_enter_page - decode data from the XDR page
  * @xdr: pointer to xdr_stream struct
-- 
cgit 


From 2af28b241eea816e6f7668d1954f15894b45d7e3 Mon Sep 17 00:00:00 2001
From: David Collins <quic_collinsd@quicinc.com>
Date: Mon, 27 Jun 2022 16:55:12 -0700
Subject: spmi: trace: fix stack-out-of-bound access in SPMI tracing functions

trace_spmi_write_begin() and trace_spmi_read_end() both call
memcpy() with a length of "len + 1".  This leads to one extra
byte being read beyond the end of the specified buffer.  Fix
this out-of-bound memory access by using a length of "len"
instead.

Here is a KASAN log showing the issue:

BUG: KASAN: stack-out-of-bounds in trace_event_raw_event_spmi_read_end+0x1d0/0x234
Read of size 2 at addr ffffffc0265b7540 by task thermal@2.0-ser/1314
...
Call trace:
 dump_backtrace+0x0/0x3e8
 show_stack+0x2c/0x3c
 dump_stack_lvl+0xdc/0x11c
 print_address_description+0x74/0x384
 kasan_report+0x188/0x268
 kasan_check_range+0x270/0x2b0
 memcpy+0x90/0xe8
 trace_event_raw_event_spmi_read_end+0x1d0/0x234
 spmi_read_cmd+0x294/0x3ac
 spmi_ext_register_readl+0x84/0x9c
 regmap_spmi_ext_read+0x144/0x1b0 [regmap_spmi]
 _regmap_raw_read+0x40c/0x754
 regmap_raw_read+0x3a0/0x514
 regmap_bulk_read+0x418/0x494
 adc5_gen3_poll_wait_hs+0xe8/0x1e0 [qcom_spmi_adc5_gen3]
 ...
 __arm64_sys_read+0x4c/0x60
 invoke_syscall+0x80/0x218
 el0_svc_common+0xec/0x1c8
 ...

addr ffffffc0265b7540 is located in stack of task thermal@2.0-ser/1314 at offset 32 in frame:
 adc5_gen3_poll_wait_hs+0x0/0x1e0 [qcom_spmi_adc5_gen3]

this frame has 1 object:
 [32, 33) 'status'

Memory state around the buggy address:
 ffffffc0265b7400: 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1
 ffffffc0265b7480: 04 f3 f3 f3 00 00 00 00 00 00 00 00 00 00 00 00
>ffffffc0265b7500: 00 00 00 00 f1 f1 f1 f1 01 f3 f3 f3 00 00 00 00
                                           ^
 ffffffc0265b7580: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 ffffffc0265b7600: f1 f1 f1 f1 01 f2 07 f2 f2 f2 01 f3 00 00 00 00
==================================================================

Fixes: a9fce374815d ("spmi: add command tracepoints for SPMI")
Cc: stable@vger.kernel.org
Reviewed-by: Stephen Boyd <sboyd@kernel.org>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: David Collins <quic_collinsd@quicinc.com>
Link: https://lore.kernel.org/r/20220627235512.2272783-1-quic_collinsd@quicinc.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/trace/events/spmi.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/spmi.h b/include/trace/events/spmi.h
index 8b60efe18ba6..a6819fd85cdf 100644
--- a/include/trace/events/spmi.h
+++ b/include/trace/events/spmi.h
@@ -21,15 +21,15 @@ TRACE_EVENT(spmi_write_begin,
 		__field		( u8,         sid       )
 		__field		( u16,        addr      )
 		__field		( u8,         len       )
-		__dynamic_array	( u8,   buf,  len + 1   )
+		__dynamic_array	( u8,   buf,  len       )
 	),
 
 	TP_fast_assign(
 		__entry->opcode = opcode;
 		__entry->sid    = sid;
 		__entry->addr   = addr;
-		__entry->len    = len + 1;
-		memcpy(__get_dynamic_array(buf), buf, len + 1);
+		__entry->len    = len;
+		memcpy(__get_dynamic_array(buf), buf, len);
 	),
 
 	TP_printk("opc=%d sid=%02d addr=0x%04x len=%d buf=0x[%*phD]",
@@ -92,7 +92,7 @@ TRACE_EVENT(spmi_read_end,
 		__field		( u16,        addr      )
 		__field		( int,        ret       )
 		__field		( u8,         len       )
-		__dynamic_array	( u8,   buf,  len + 1   )
+		__dynamic_array	( u8,   buf,  len       )
 	),
 
 	TP_fast_assign(
@@ -100,8 +100,8 @@ TRACE_EVENT(spmi_read_end,
 		__entry->sid    = sid;
 		__entry->addr   = addr;
 		__entry->ret    = ret;
-		__entry->len    = len + 1;
-		memcpy(__get_dynamic_array(buf), buf, len + 1);
+		__entry->len    = len;
+		memcpy(__get_dynamic_array(buf), buf, len);
 	),
 
 	TP_printk("opc=%d sid=%02d addr=0x%04x ret=%d len=%02d buf=0x[%*phD]",
-- 
cgit 


From b9ba8a4463cd78d0aee520c4bf2569820ac29929 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 27 May 2022 10:55:07 -0600
Subject: io_uring: add support for level triggered poll

By default, the POLL_ADD command does edge triggered poll - if we get
a non-zero mask on the initial poll attempt, we complete the request
successfully.

Support level triggered by always waiting for a notification, regardless
of whether or not the initial mask matches the file state.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |  3 +++
 io_uring/poll.c               | 15 ++++++++++-----
 2 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 0ad3da28d2fc..4927bb69387a 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -229,10 +229,13 @@ enum io_uring_op {
  *
  * IORING_POLL_UPDATE		Update existing poll request, matching
  *				sqe->addr as the old user_data field.
+ *
+ * IORING_POLL_LEVEL		Level triggered poll.
  */
 #define IORING_POLL_ADD_MULTI	(1U << 0)
 #define IORING_POLL_UPDATE_EVENTS	(1U << 1)
 #define IORING_POLL_UPDATE_USER_DATA	(1U << 2)
+#define IORING_POLL_ADD_LEVEL		(1U << 3)
 
 /*
  * ASYNC_CANCEL flags.
diff --git a/io_uring/poll.c b/io_uring/poll.c
index b80f7fa26123..558dc170468a 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -423,11 +423,13 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
 	atomic_set(&req->poll_refs, 1);
 	mask = vfs_poll(req->file, &ipt->pt) & poll->events;
 
-	if (mask && (poll->events & EPOLLONESHOT)) {
+	if (mask &&
+	   ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) {
 		io_poll_remove_entries(req);
 		/* no one else has access to the req, forget about the ref */
 		return mask;
 	}
+
 	if (!mask && unlikely(ipt->error || !ipt->nr_entries)) {
 		io_poll_remove_entries(req);
 		if (!ipt->error)
@@ -439,7 +441,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
 	io_poll_req_insert(req);
 	spin_unlock(&ctx->completion_lock);
 
-	if (mask) {
+	if (mask && (poll->events & EPOLLET)) {
 		/* can't multishot if failed, just queue the event we've got */
 		if (unlikely(ipt->error || !ipt->nr_entries)) {
 			poll->events |= EPOLLONESHOT;
@@ -475,7 +477,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
 	struct io_ring_ctx *ctx = req->ctx;
 	struct async_poll *apoll;
 	struct io_poll_table ipt;
-	__poll_t mask = POLLPRI | POLLERR;
+	__poll_t mask = POLLPRI | POLLERR | EPOLLET;
 	int ret;
 
 	if (!def->pollin && !def->pollout)
@@ -638,7 +640,10 @@ static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
 #endif
 	if (!(flags & IORING_POLL_ADD_MULTI))
 		events |= EPOLLONESHOT;
-	return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
+	if (!(flags & IORING_POLL_ADD_LEVEL))
+		events |= EPOLLET;
+	return demangle_poll(events) |
+		(events & (EPOLLEXCLUSIVE|EPOLLONESHOT|EPOLLET));
 }
 
 int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -679,7 +684,7 @@ int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (sqe->buf_index || sqe->off || sqe->addr)
 		return -EINVAL;
 	flags = READ_ONCE(sqe->len);
-	if (flags & ~IORING_POLL_ADD_MULTI)
+	if (flags & ~(IORING_POLL_ADD_MULTI|IORING_POLL_ADD_LEVEL))
 		return -EINVAL;
 	if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
 		return -EINVAL;
-- 
cgit 


From 97bbdc06a4446bc69d8ba71d722abae542a6b70c Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 16 Jun 2022 10:22:08 +0100
Subject: io_uring: add IORING_SETUP_SINGLE_ISSUER

Add a new IORING_SETUP_SINGLE_ISSUER flag and the userspace visible part
of it, i.e. put limitations of submitters. Also, don't allow it together
with IOPOLL as we're not going to put it to good use.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/4bcc41ee467fdf04c8aab8baf6ce3ba21858c3d4.1655371007.git.asml.silence@gmail.com
Reviewed-by: Hao Xu <howeyxu@tencent.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |  5 ++++-
 io_uring/io_uring.c           |  7 +++++--
 io_uring/io_uring_types.h     |  1 +
 io_uring/tctx.c               | 27 ++++++++++++++++++++++++---
 io_uring/tctx.h               |  4 ++--
 5 files changed, 36 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 4927bb69387a..d7ae81b10893 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -140,9 +140,12 @@ enum {
  * IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN.
  */
 #define IORING_SETUP_TASKRUN_FLAG	(1U << 9)
-
 #define IORING_SETUP_SQE128		(1U << 10) /* SQEs are 128 byte */
 #define IORING_SETUP_CQE32		(1U << 11) /* CQEs are 32 byte */
+/*
+ * Only one task is allowed to submit requests
+ */
+#define IORING_SETUP_SINGLE_ISSUER	(1U << 12)
 
 enum io_uring_op {
 	IORING_OP_NOP,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 85a479594b05..06772139b7da 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2457,6 +2457,8 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_destroy_buffers(ctx);
 	if (ctx->sq_creds)
 		put_cred(ctx->sq_creds);
+	if (ctx->submitter_task)
+		put_task_struct(ctx->submitter_task);
 
 	/* there are no registered resources left, nobody uses it */
 	if (ctx->rsrc_node)
@@ -3189,7 +3191,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
 	if (fd < 0)
 		return fd;
 
-	ret = io_uring_add_tctx_node(ctx);
+	ret = __io_uring_add_tctx_node(ctx, false);
 	if (ret) {
 		put_unused_fd(fd);
 		return ret;
@@ -3409,7 +3411,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 			IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
 			IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
 			IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
-			IORING_SETUP_SQE128 | IORING_SETUP_CQE32))
+			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
+			IORING_SETUP_SINGLE_ISSUER))
 		return -EINVAL;
 
 	return io_uring_create(entries, &p, params);
diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h
index 1f8db2dd7af7..8b00243abf65 100644
--- a/io_uring/io_uring_types.h
+++ b/io_uring/io_uring_types.h
@@ -243,6 +243,7 @@ struct io_ring_ctx {
 	/* Keep this last, we don't need it for the fast path */
 
 	struct io_restriction		restrictions;
+	struct task_struct		*submitter_task;
 
 	/* slow path rsrc auxilary data, used by update/register */
 	struct io_rsrc_node		*rsrc_backup_node;
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 5a5d4f908529..a819da8fc85c 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -94,12 +94,32 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
 	return 0;
 }
 
-int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
+static int io_register_submitter(struct io_ring_ctx *ctx)
+{
+	int ret = 0;
+
+	mutex_lock(&ctx->uring_lock);
+	if (!ctx->submitter_task)
+		ctx->submitter_task = get_task_struct(current);
+	else if (ctx->submitter_task != current)
+		ret = -EEXIST;
+	mutex_unlock(&ctx->uring_lock);
+
+	return ret;
+}
+
+int __io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool submitter)
 {
 	struct io_uring_task *tctx = current->io_uring;
 	struct io_tctx_node *node;
 	int ret;
 
+	if ((ctx->flags & IORING_SETUP_SINGLE_ISSUER) && submitter) {
+		ret = io_register_submitter(ctx);
+		if (ret)
+			return ret;
+	}
+
 	if (unlikely(!tctx)) {
 		ret = io_uring_alloc_task_context(current, ctx);
 		if (unlikely(ret))
@@ -133,7 +153,8 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
 		list_add(&node->ctx_node, &ctx->tctx_list);
 		mutex_unlock(&ctx->uring_lock);
 	}
-	tctx->last = ctx;
+	if (submitter)
+		tctx->last = ctx;
 	return 0;
 }
 
@@ -241,7 +262,7 @@ int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
 		return -EINVAL;
 
 	mutex_unlock(&ctx->uring_lock);
-	ret = io_uring_add_tctx_node(ctx);
+	ret = __io_uring_add_tctx_node(ctx, false);
 	mutex_lock(&ctx->uring_lock);
 	if (ret)
 		return ret;
diff --git a/io_uring/tctx.h b/io_uring/tctx.h
index 7684713e950f..dde82ce4d8e2 100644
--- a/io_uring/tctx.h
+++ b/io_uring/tctx.h
@@ -34,7 +34,7 @@ struct io_tctx_node {
 int io_uring_alloc_task_context(struct task_struct *task,
 				struct io_ring_ctx *ctx);
 void io_uring_del_tctx_node(unsigned long index);
-int __io_uring_add_tctx_node(struct io_ring_ctx *ctx);
+int __io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool submitter);
 void io_uring_clean_tctx(struct io_uring_task *tctx);
 
 void io_uring_unreg_ringfd(void);
@@ -52,5 +52,5 @@ static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
 
 	if (likely(tctx && tctx->last == ctx))
 		return 0;
-	return __io_uring_add_tctx_node(ctx);
+	return __io_uring_add_tctx_node(ctx, true);
 }
-- 
cgit 


From ab1c84d855cf2c284fa0f4b17fc04063659c54a1 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 16 Jun 2022 13:57:19 +0100
Subject: io_uring: make io_uring_types.h public

Move io_uring types to linux/include, need them public so tracing can
see the definitions and we can clean trace/events/io_uring.h

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/a15f12e8cb7289b2de0deaddcc7518d98a132d17.1655384063.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 554 +++++++++++++++++++++++++++++++++++++++++
 io_uring/filetable.h           |  11 -
 io_uring/io-wq.h               |  17 +-
 io_uring/io_uring.h            |   4 +-
 io_uring/io_uring_types.h      | 530 ---------------------------------------
 io_uring/refs.h                |   2 +-
 6 files changed, 559 insertions(+), 559 deletions(-)
 create mode 100644 include/linux/io_uring_types.h
 delete mode 100644 io_uring/io_uring_types.h

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
new file mode 100644
index 000000000000..779c72da5b8f
--- /dev/null
+++ b/include/linux/io_uring_types.h
@@ -0,0 +1,554 @@
+#ifndef IO_URING_TYPES_H
+#define IO_URING_TYPES_H
+
+#include <linux/blkdev.h>
+#include <linux/task_work.h>
+#include <linux/bitmap.h>
+#include <uapi/linux/io_uring.h>
+
+struct io_wq_work_node {
+	struct io_wq_work_node *next;
+};
+
+struct io_wq_work_list {
+	struct io_wq_work_node *first;
+	struct io_wq_work_node *last;
+};
+
+struct io_wq_work {
+	struct io_wq_work_node list;
+	unsigned flags;
+	/* place it here instead of io_kiocb as it fills padding and saves 4B */
+	int cancel_seq;
+};
+
+struct io_fixed_file {
+	/* file * with additional FFS_* flags */
+	unsigned long file_ptr;
+};
+
+struct io_file_table {
+	struct io_fixed_file *files;
+	unsigned long *bitmap;
+	unsigned int alloc_hint;
+};
+
+struct io_hash_bucket {
+	spinlock_t		lock;
+	struct hlist_head	list;
+} ____cacheline_aligned_in_smp;
+
+struct io_hash_table {
+	struct io_hash_bucket	*hbs;
+	unsigned		hash_bits;
+};
+
+struct io_uring {
+	u32 head ____cacheline_aligned_in_smp;
+	u32 tail ____cacheline_aligned_in_smp;
+};
+
+/*
+ * This data is shared with the application through the mmap at offsets
+ * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
+ *
+ * The offsets to the member fields are published through struct
+ * io_sqring_offsets when calling io_uring_setup.
+ */
+struct io_rings {
+	/*
+	 * Head and tail offsets into the ring; the offsets need to be
+	 * masked to get valid indices.
+	 *
+	 * The kernel controls head of the sq ring and the tail of the cq ring,
+	 * and the application controls tail of the sq ring and the head of the
+	 * cq ring.
+	 */
+	struct io_uring		sq, cq;
+	/*
+	 * Bitmasks to apply to head and tail offsets (constant, equals
+	 * ring_entries - 1)
+	 */
+	u32			sq_ring_mask, cq_ring_mask;
+	/* Ring sizes (constant, power of 2) */
+	u32			sq_ring_entries, cq_ring_entries;
+	/*
+	 * Number of invalid entries dropped by the kernel due to
+	 * invalid index stored in array
+	 *
+	 * Written by the kernel, shouldn't be modified by the
+	 * application (i.e. get number of "new events" by comparing to
+	 * cached value).
+	 *
+	 * After a new SQ head value was read by the application this
+	 * counter includes all submissions that were dropped reaching
+	 * the new SQ head (and possibly more).
+	 */
+	u32			sq_dropped;
+	/*
+	 * Runtime SQ flags
+	 *
+	 * Written by the kernel, shouldn't be modified by the
+	 * application.
+	 *
+	 * The application needs a full memory barrier before checking
+	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
+	 */
+	atomic_t		sq_flags;
+	/*
+	 * Runtime CQ flags
+	 *
+	 * Written by the application, shouldn't be modified by the
+	 * kernel.
+	 */
+	u32			cq_flags;
+	/*
+	 * Number of completion events lost because the queue was full;
+	 * this should be avoided by the application by making sure
+	 * there are not more requests pending than there is space in
+	 * the completion queue.
+	 *
+	 * Written by the kernel, shouldn't be modified by the
+	 * application (i.e. get number of "new events" by comparing to
+	 * cached value).
+	 *
+	 * As completion events come in out of order this counter is not
+	 * ordered with any other data.
+	 */
+	u32			cq_overflow;
+	/*
+	 * Ring buffer of completion events.
+	 *
+	 * The kernel writes completion events fresh every time they are
+	 * produced, so the application is allowed to modify pending
+	 * entries.
+	 */
+	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
+};
+
+struct io_restriction {
+	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
+	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
+	u8 sqe_flags_allowed;
+	u8 sqe_flags_required;
+	bool registered;
+};
+
+struct io_submit_link {
+	struct io_kiocb		*head;
+	struct io_kiocb		*last;
+};
+
+struct io_submit_state {
+	/* inline/task_work completion list, under ->uring_lock */
+	struct io_wq_work_node	free_list;
+	/* batch completion logic */
+	struct io_wq_work_list	compl_reqs;
+	struct io_submit_link	link;
+
+	bool			plug_started;
+	bool			need_plug;
+	bool			flush_cqes;
+	unsigned short		submit_nr;
+	struct blk_plug		plug;
+};
+
+struct io_ev_fd {
+	struct eventfd_ctx	*cq_ev_fd;
+	unsigned int		eventfd_async: 1;
+	struct rcu_head		rcu;
+};
+
+struct io_ring_ctx {
+	/* const or read-mostly hot data */
+	struct {
+		struct percpu_ref	refs;
+
+		struct io_rings		*rings;
+		unsigned int		flags;
+		enum task_work_notify_mode	notify_method;
+		unsigned int		compat: 1;
+		unsigned int		drain_next: 1;
+		unsigned int		restricted: 1;
+		unsigned int		off_timeout_used: 1;
+		unsigned int		drain_active: 1;
+		unsigned int		drain_disabled: 1;
+		unsigned int		has_evfd: 1;
+		unsigned int		syscall_iopoll: 1;
+	} ____cacheline_aligned_in_smp;
+
+	/* submission data */
+	struct {
+		struct mutex		uring_lock;
+
+		/*
+		 * Ring buffer of indices into array of io_uring_sqe, which is
+		 * mmapped by the application using the IORING_OFF_SQES offset.
+		 *
+		 * This indirection could e.g. be used to assign fixed
+		 * io_uring_sqe entries to operations and only submit them to
+		 * the queue when needed.
+		 *
+		 * The kernel modifies neither the indices array nor the entries
+		 * array.
+		 */
+		u32			*sq_array;
+		struct io_uring_sqe	*sq_sqes;
+		unsigned		cached_sq_head;
+		unsigned		sq_entries;
+
+		/*
+		 * Fixed resources fast path, should be accessed only under
+		 * uring_lock, and updated through io_uring_register(2)
+		 */
+		struct io_rsrc_node	*rsrc_node;
+		int			rsrc_cached_refs;
+		atomic_t		cancel_seq;
+		struct io_file_table	file_table;
+		unsigned		nr_user_files;
+		unsigned		nr_user_bufs;
+		struct io_mapped_ubuf	**user_bufs;
+
+		struct io_submit_state	submit_state;
+
+		struct io_buffer_list	*io_bl;
+		struct xarray		io_bl_xa;
+		struct list_head	io_buffers_cache;
+
+		struct io_hash_table	cancel_table_locked;
+		struct list_head	cq_overflow_list;
+		struct list_head	apoll_cache;
+		struct xarray		personalities;
+		u32			pers_next;
+	} ____cacheline_aligned_in_smp;
+
+	/* IRQ completion list, under ->completion_lock */
+	struct io_wq_work_list	locked_free_list;
+	unsigned int		locked_free_nr;
+
+	const struct cred	*sq_creds;	/* cred used for __io_sq_thread() */
+	struct io_sq_data	*sq_data;	/* if using sq thread polling */
+
+	struct wait_queue_head	sqo_sq_wait;
+	struct list_head	sqd_list;
+
+	unsigned long		check_cq;
+
+	struct {
+		/*
+		 * We cache a range of free CQEs we can use, once exhausted it
+		 * should go through a slower range setup, see __io_get_cqe()
+		 */
+		struct io_uring_cqe	*cqe_cached;
+		struct io_uring_cqe	*cqe_sentinel;
+
+		unsigned		cached_cq_tail;
+		unsigned		cq_entries;
+		struct io_ev_fd	__rcu	*io_ev_fd;
+		struct wait_queue_head	cq_wait;
+		unsigned		cq_extra;
+	} ____cacheline_aligned_in_smp;
+
+	struct {
+		spinlock_t		completion_lock;
+
+		/*
+		 * ->iopoll_list is protected by the ctx->uring_lock for
+		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
+		 * For SQPOLL, only the single threaded io_sq_thread() will
+		 * manipulate the list, hence no extra locking is needed there.
+		 */
+		struct io_wq_work_list	iopoll_list;
+		struct io_hash_table	cancel_table;
+		bool			poll_multi_queue;
+
+		struct list_head	io_buffers_comp;
+	} ____cacheline_aligned_in_smp;
+
+	/* timeouts */
+	struct {
+		spinlock_t		timeout_lock;
+		atomic_t		cq_timeouts;
+		struct list_head	timeout_list;
+		struct list_head	ltimeout_list;
+		unsigned		cq_last_tm_flush;
+	} ____cacheline_aligned_in_smp;
+
+	/* Keep this last, we don't need it for the fast path */
+
+	struct io_restriction		restrictions;
+	struct task_struct		*submitter_task;
+
+	/* slow path rsrc auxilary data, used by update/register */
+	struct io_rsrc_node		*rsrc_backup_node;
+	struct io_mapped_ubuf		*dummy_ubuf;
+	struct io_rsrc_data		*file_data;
+	struct io_rsrc_data		*buf_data;
+
+	struct delayed_work		rsrc_put_work;
+	struct llist_head		rsrc_put_llist;
+	struct list_head		rsrc_ref_list;
+	spinlock_t			rsrc_ref_lock;
+
+	struct list_head		io_buffers_pages;
+
+	#if defined(CONFIG_UNIX)
+		struct socket		*ring_sock;
+	#endif
+	/* hashed buffered write serialization */
+	struct io_wq_hash		*hash_map;
+
+	/* Only used for accounting purposes */
+	struct user_struct		*user;
+	struct mm_struct		*mm_account;
+
+	/* ctx exit and cancelation */
+	struct llist_head		fallback_llist;
+	struct delayed_work		fallback_work;
+	struct work_struct		exit_work;
+	struct list_head		tctx_list;
+	struct completion		ref_comp;
+
+	/* io-wq management, e.g. thread count */
+	u32				iowq_limits[2];
+	bool				iowq_limits_set;
+
+	struct list_head		defer_list;
+	unsigned			sq_thread_idle;
+};
+
+enum {
+	REQ_F_FIXED_FILE_BIT	= IOSQE_FIXED_FILE_BIT,
+	REQ_F_IO_DRAIN_BIT	= IOSQE_IO_DRAIN_BIT,
+	REQ_F_LINK_BIT		= IOSQE_IO_LINK_BIT,
+	REQ_F_HARDLINK_BIT	= IOSQE_IO_HARDLINK_BIT,
+	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,
+	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,
+	REQ_F_CQE_SKIP_BIT	= IOSQE_CQE_SKIP_SUCCESS_BIT,
+
+	/* first byte is taken by user flags, shift it to not overlap */
+	REQ_F_FAIL_BIT		= 8,
+	REQ_F_INFLIGHT_BIT,
+	REQ_F_CUR_POS_BIT,
+	REQ_F_NOWAIT_BIT,
+	REQ_F_LINK_TIMEOUT_BIT,
+	REQ_F_NEED_CLEANUP_BIT,
+	REQ_F_POLLED_BIT,
+	REQ_F_BUFFER_SELECTED_BIT,
+	REQ_F_BUFFER_RING_BIT,
+	REQ_F_REISSUE_BIT,
+	REQ_F_CREDS_BIT,
+	REQ_F_REFCOUNT_BIT,
+	REQ_F_ARM_LTIMEOUT_BIT,
+	REQ_F_ASYNC_DATA_BIT,
+	REQ_F_SKIP_LINK_CQES_BIT,
+	REQ_F_SINGLE_POLL_BIT,
+	REQ_F_DOUBLE_POLL_BIT,
+	REQ_F_PARTIAL_IO_BIT,
+	REQ_F_CQE32_INIT_BIT,
+	REQ_F_APOLL_MULTISHOT_BIT,
+	REQ_F_CLEAR_POLLIN_BIT,
+	REQ_F_HASH_LOCKED_BIT,
+	/* keep async read/write and isreg together and in order */
+	REQ_F_SUPPORT_NOWAIT_BIT,
+	REQ_F_ISREG_BIT,
+
+	/* not a real bit, just to check we're not overflowing the space */
+	__REQ_F_LAST_BIT,
+};
+
+enum {
+	/* ctx owns file */
+	REQ_F_FIXED_FILE	= BIT(REQ_F_FIXED_FILE_BIT),
+	/* drain existing IO first */
+	REQ_F_IO_DRAIN		= BIT(REQ_F_IO_DRAIN_BIT),
+	/* linked sqes */
+	REQ_F_LINK		= BIT(REQ_F_LINK_BIT),
+	/* doesn't sever on completion < 0 */
+	REQ_F_HARDLINK		= BIT(REQ_F_HARDLINK_BIT),
+	/* IOSQE_ASYNC */
+	REQ_F_FORCE_ASYNC	= BIT(REQ_F_FORCE_ASYNC_BIT),
+	/* IOSQE_BUFFER_SELECT */
+	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),
+	/* IOSQE_CQE_SKIP_SUCCESS */
+	REQ_F_CQE_SKIP		= BIT(REQ_F_CQE_SKIP_BIT),
+
+	/* fail rest of links */
+	REQ_F_FAIL		= BIT(REQ_F_FAIL_BIT),
+	/* on inflight list, should be cancelled and waited on exit reliably */
+	REQ_F_INFLIGHT		= BIT(REQ_F_INFLIGHT_BIT),
+	/* read/write uses file position */
+	REQ_F_CUR_POS		= BIT(REQ_F_CUR_POS_BIT),
+	/* must not punt to workers */
+	REQ_F_NOWAIT		= BIT(REQ_F_NOWAIT_BIT),
+	/* has or had linked timeout */
+	REQ_F_LINK_TIMEOUT	= BIT(REQ_F_LINK_TIMEOUT_BIT),
+	/* needs cleanup */
+	REQ_F_NEED_CLEANUP	= BIT(REQ_F_NEED_CLEANUP_BIT),
+	/* already went through poll handler */
+	REQ_F_POLLED		= BIT(REQ_F_POLLED_BIT),
+	/* buffer already selected */
+	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
+	/* buffer selected from ring, needs commit */
+	REQ_F_BUFFER_RING	= BIT(REQ_F_BUFFER_RING_BIT),
+	/* caller should reissue async */
+	REQ_F_REISSUE		= BIT(REQ_F_REISSUE_BIT),
+	/* supports async reads/writes */
+	REQ_F_SUPPORT_NOWAIT	= BIT(REQ_F_SUPPORT_NOWAIT_BIT),
+	/* regular file */
+	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
+	/* has creds assigned */
+	REQ_F_CREDS		= BIT(REQ_F_CREDS_BIT),
+	/* skip refcounting if not set */
+	REQ_F_REFCOUNT		= BIT(REQ_F_REFCOUNT_BIT),
+	/* there is a linked timeout that has to be armed */
+	REQ_F_ARM_LTIMEOUT	= BIT(REQ_F_ARM_LTIMEOUT_BIT),
+	/* ->async_data allocated */
+	REQ_F_ASYNC_DATA	= BIT(REQ_F_ASYNC_DATA_BIT),
+	/* don't post CQEs while failing linked requests */
+	REQ_F_SKIP_LINK_CQES	= BIT(REQ_F_SKIP_LINK_CQES_BIT),
+	/* single poll may be active */
+	REQ_F_SINGLE_POLL	= BIT(REQ_F_SINGLE_POLL_BIT),
+	/* double poll may active */
+	REQ_F_DOUBLE_POLL	= BIT(REQ_F_DOUBLE_POLL_BIT),
+	/* request has already done partial IO */
+	REQ_F_PARTIAL_IO	= BIT(REQ_F_PARTIAL_IO_BIT),
+	/* fast poll multishot mode */
+	REQ_F_APOLL_MULTISHOT	= BIT(REQ_F_APOLL_MULTISHOT_BIT),
+	/* ->extra1 and ->extra2 are initialised */
+	REQ_F_CQE32_INIT	= BIT(REQ_F_CQE32_INIT_BIT),
+	/* recvmsg special flag, clear EPOLLIN */
+	REQ_F_CLEAR_POLLIN	= BIT(REQ_F_CLEAR_POLLIN_BIT),
+	/* hashed into ->cancel_hash_locked, protected by ->uring_lock */
+	REQ_F_HASH_LOCKED	= BIT(REQ_F_HASH_LOCKED_BIT),
+};
+
+typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
+
+struct io_task_work {
+	union {
+		struct io_wq_work_node	node;
+		struct llist_node	fallback_node;
+	};
+	io_req_tw_func_t		func;
+};
+
+struct io_cqe {
+	__u64	user_data;
+	__s32	res;
+	/* fd initially, then cflags for completion */
+	union {
+		__u32	flags;
+		int	fd;
+	};
+};
+
+/*
+ * Each request type overlays its private data structure on top of this one.
+ * They must not exceed this one in size.
+ */
+struct io_cmd_data {
+	struct file		*file;
+	/* each command gets 56 bytes of data */
+	__u8			data[56];
+};
+
+#define io_kiocb_to_cmd(req)	((void *) &(req)->cmd)
+#define cmd_to_io_kiocb(ptr)	((struct io_kiocb *) ptr)
+
+struct io_kiocb {
+	union {
+		/*
+		 * NOTE! Each of the io_kiocb union members has the file pointer
+		 * as the first entry in their struct definition. So you can
+		 * access the file pointer through any of the sub-structs,
+		 * or directly as just 'file' in this struct.
+		 */
+		struct file		*file;
+		struct io_cmd_data	cmd;
+	};
+
+	u8				opcode;
+	/* polled IO has completed */
+	u8				iopoll_completed;
+	/*
+	 * Can be either a fixed buffer index, or used with provided buffers.
+	 * For the latter, before issue it points to the buffer group ID,
+	 * and after selection it points to the buffer ID itself.
+	 */
+	u16				buf_index;
+	unsigned int			flags;
+
+	struct io_cqe			cqe;
+
+	struct io_ring_ctx		*ctx;
+	struct task_struct		*task;
+
+	struct io_rsrc_node		*rsrc_node;
+
+	union {
+		/* store used ubuf, so we can prevent reloading */
+		struct io_mapped_ubuf	*imu;
+
+		/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
+		struct io_buffer	*kbuf;
+
+		/*
+		 * stores buffer ID for ring provided buffers, valid IFF
+		 * REQ_F_BUFFER_RING is set.
+		 */
+		struct io_buffer_list	*buf_list;
+	};
+
+	union {
+		/* used by request caches, completion batching and iopoll */
+		struct io_wq_work_node	comp_list;
+		/* cache ->apoll->events */
+		__poll_t apoll_events;
+	};
+	atomic_t			refs;
+	atomic_t			poll_refs;
+	struct io_task_work		io_task_work;
+	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
+	union {
+		struct hlist_node	hash_node;
+		struct {
+			u64		extra1;
+			u64		extra2;
+		};
+	};
+	/* internal polling, see IORING_FEAT_FAST_POLL */
+	struct async_poll		*apoll;
+	/* opcode allocated if it needs to store data for async defer */
+	void				*async_data;
+	/* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
+	struct io_kiocb			*link;
+	/* custom credentials, valid IFF REQ_F_CREDS is set */
+	const struct cred		*creds;
+	struct io_wq_work		work;
+};
+
+struct io_cancel_data {
+	struct io_ring_ctx *ctx;
+	union {
+		u64 data;
+		struct file *file;
+	};
+	u32 flags;
+	int seq;
+};
+
+struct io_overflow_cqe {
+	struct list_head list;
+	struct io_uring_cqe cqe;
+};
+
+struct io_mapped_ubuf {
+	u64		ubuf;
+	u64		ubuf_end;
+	unsigned int	nr_bvecs;
+	unsigned long	acct_pages;
+	struct bio_vec	bvec[];
+};
+
+#endif
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index c404360f7090..6b58aa48bc45 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -22,17 +22,6 @@ struct io_kiocb;
 #endif
 #define FFS_MASK		~(FFS_NOWAIT|FFS_ISREG|FFS_SCM)
 
-struct io_fixed_file {
-	/* file * with additional FFS_* flags */
-	unsigned long file_ptr;
-};
-
-struct io_file_table {
-	struct io_fixed_file *files;
-	unsigned long *bitmap;
-	unsigned int alloc_hint;
-};
-
 bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files);
 void io_free_file_tables(struct io_file_table *table);
 
diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h
index 3f54ee2a8eeb..10b80ef78bb8 100644
--- a/io_uring/io-wq.h
+++ b/io_uring/io-wq.h
@@ -2,6 +2,7 @@
 #define INTERNAL_IO_WQ_H
 
 #include <linux/refcount.h>
+#include <linux/io_uring_types.h>
 
 struct io_wq;
 
@@ -20,15 +21,6 @@ enum io_wq_cancel {
 	IO_WQ_CANCEL_NOTFOUND,	/* work not found */
 };
 
-struct io_wq_work_node {
-	struct io_wq_work_node *next;
-};
-
-struct io_wq_work_list {
-	struct io_wq_work_node *first;
-	struct io_wq_work_node *last;
-};
-
 #define wq_list_for_each(pos, prv, head)			\
 	for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
 
@@ -152,13 +144,6 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
 	return node;
 }
 
-struct io_wq_work {
-	struct io_wq_work_node list;
-	unsigned flags;
-	/* place it here instead of io_kiocb as it fills padding and saves 4B */
-	int cancel_seq;
-};
-
 static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
 {
 	if (!work->list.next)
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 558a860a93fc..5eaa01c4697c 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -3,7 +3,9 @@
 
 #include <linux/errno.h>
 #include <linux/lockdep.h>
-#include "io_uring_types.h"
+#include <linux/io_uring_types.h>
+#include "io-wq.h"
+#include "filetable.h"
 
 #ifndef CREATE_TRACE_POINTS
 #include <trace/events/io_uring.h>
diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h
deleted file mode 100644
index 65ac7cdaaa73..000000000000
--- a/io_uring/io_uring_types.h
+++ /dev/null
@@ -1,530 +0,0 @@
-#ifndef IO_URING_TYPES_H
-#define IO_URING_TYPES_H
-
-#include <linux/blkdev.h>
-#include <linux/task_work.h>
-#include <linux/bitmap.h>
-#include <uapi/linux/io_uring.h>
-
-#include "io-wq.h"
-#include "filetable.h"
-
-struct io_hash_bucket {
-	spinlock_t		lock;
-	struct hlist_head	list;
-} ____cacheline_aligned_in_smp;
-
-struct io_hash_table {
-	struct io_hash_bucket	*hbs;
-	unsigned		hash_bits;
-};
-
-struct io_uring {
-	u32 head ____cacheline_aligned_in_smp;
-	u32 tail ____cacheline_aligned_in_smp;
-};
-
-/*
- * This data is shared with the application through the mmap at offsets
- * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
- *
- * The offsets to the member fields are published through struct
- * io_sqring_offsets when calling io_uring_setup.
- */
-struct io_rings {
-	/*
-	 * Head and tail offsets into the ring; the offsets need to be
-	 * masked to get valid indices.
-	 *
-	 * The kernel controls head of the sq ring and the tail of the cq ring,
-	 * and the application controls tail of the sq ring and the head of the
-	 * cq ring.
-	 */
-	struct io_uring		sq, cq;
-	/*
-	 * Bitmasks to apply to head and tail offsets (constant, equals
-	 * ring_entries - 1)
-	 */
-	u32			sq_ring_mask, cq_ring_mask;
-	/* Ring sizes (constant, power of 2) */
-	u32			sq_ring_entries, cq_ring_entries;
-	/*
-	 * Number of invalid entries dropped by the kernel due to
-	 * invalid index stored in array
-	 *
-	 * Written by the kernel, shouldn't be modified by the
-	 * application (i.e. get number of "new events" by comparing to
-	 * cached value).
-	 *
-	 * After a new SQ head value was read by the application this
-	 * counter includes all submissions that were dropped reaching
-	 * the new SQ head (and possibly more).
-	 */
-	u32			sq_dropped;
-	/*
-	 * Runtime SQ flags
-	 *
-	 * Written by the kernel, shouldn't be modified by the
-	 * application.
-	 *
-	 * The application needs a full memory barrier before checking
-	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
-	 */
-	atomic_t		sq_flags;
-	/*
-	 * Runtime CQ flags
-	 *
-	 * Written by the application, shouldn't be modified by the
-	 * kernel.
-	 */
-	u32			cq_flags;
-	/*
-	 * Number of completion events lost because the queue was full;
-	 * this should be avoided by the application by making sure
-	 * there are not more requests pending than there is space in
-	 * the completion queue.
-	 *
-	 * Written by the kernel, shouldn't be modified by the
-	 * application (i.e. get number of "new events" by comparing to
-	 * cached value).
-	 *
-	 * As completion events come in out of order this counter is not
-	 * ordered with any other data.
-	 */
-	u32			cq_overflow;
-	/*
-	 * Ring buffer of completion events.
-	 *
-	 * The kernel writes completion events fresh every time they are
-	 * produced, so the application is allowed to modify pending
-	 * entries.
-	 */
-	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
-};
-
-struct io_restriction {
-	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
-	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
-	u8 sqe_flags_allowed;
-	u8 sqe_flags_required;
-	bool registered;
-};
-
-struct io_submit_link {
-	struct io_kiocb		*head;
-	struct io_kiocb		*last;
-};
-
-struct io_submit_state {
-	/* inline/task_work completion list, under ->uring_lock */
-	struct io_wq_work_node	free_list;
-	/* batch completion logic */
-	struct io_wq_work_list	compl_reqs;
-	struct io_submit_link	link;
-
-	bool			plug_started;
-	bool			need_plug;
-	bool			flush_cqes;
-	unsigned short		submit_nr;
-	struct blk_plug		plug;
-};
-
-struct io_ev_fd {
-	struct eventfd_ctx	*cq_ev_fd;
-	unsigned int		eventfd_async: 1;
-	struct rcu_head		rcu;
-};
-
-struct io_ring_ctx {
-	/* const or read-mostly hot data */
-	struct {
-		struct percpu_ref	refs;
-
-		struct io_rings		*rings;
-		unsigned int		flags;
-		enum task_work_notify_mode	notify_method;
-		unsigned int		compat: 1;
-		unsigned int		drain_next: 1;
-		unsigned int		restricted: 1;
-		unsigned int		off_timeout_used: 1;
-		unsigned int		drain_active: 1;
-		unsigned int		drain_disabled: 1;
-		unsigned int		has_evfd: 1;
-		unsigned int		syscall_iopoll: 1;
-	} ____cacheline_aligned_in_smp;
-
-	/* submission data */
-	struct {
-		struct mutex		uring_lock;
-
-		/*
-		 * Ring buffer of indices into array of io_uring_sqe, which is
-		 * mmapped by the application using the IORING_OFF_SQES offset.
-		 *
-		 * This indirection could e.g. be used to assign fixed
-		 * io_uring_sqe entries to operations and only submit them to
-		 * the queue when needed.
-		 *
-		 * The kernel modifies neither the indices array nor the entries
-		 * array.
-		 */
-		u32			*sq_array;
-		struct io_uring_sqe	*sq_sqes;
-		unsigned		cached_sq_head;
-		unsigned		sq_entries;
-
-		/*
-		 * Fixed resources fast path, should be accessed only under
-		 * uring_lock, and updated through io_uring_register(2)
-		 */
-		struct io_rsrc_node	*rsrc_node;
-		int			rsrc_cached_refs;
-		atomic_t		cancel_seq;
-		struct io_file_table	file_table;
-		unsigned		nr_user_files;
-		unsigned		nr_user_bufs;
-		struct io_mapped_ubuf	**user_bufs;
-
-		struct io_submit_state	submit_state;
-
-		struct io_buffer_list	*io_bl;
-		struct xarray		io_bl_xa;
-		struct list_head	io_buffers_cache;
-
-		struct io_hash_table	cancel_table_locked;
-		struct list_head	cq_overflow_list;
-		struct list_head	apoll_cache;
-		struct xarray		personalities;
-		u32			pers_next;
-	} ____cacheline_aligned_in_smp;
-
-	/* IRQ completion list, under ->completion_lock */
-	struct io_wq_work_list	locked_free_list;
-	unsigned int		locked_free_nr;
-
-	const struct cred	*sq_creds;	/* cred used for __io_sq_thread() */
-	struct io_sq_data	*sq_data;	/* if using sq thread polling */
-
-	struct wait_queue_head	sqo_sq_wait;
-	struct list_head	sqd_list;
-
-	unsigned long		check_cq;
-
-	struct {
-		/*
-		 * We cache a range of free CQEs we can use, once exhausted it
-		 * should go through a slower range setup, see __io_get_cqe()
-		 */
-		struct io_uring_cqe	*cqe_cached;
-		struct io_uring_cqe	*cqe_sentinel;
-
-		unsigned		cached_cq_tail;
-		unsigned		cq_entries;
-		struct io_ev_fd	__rcu	*io_ev_fd;
-		struct wait_queue_head	cq_wait;
-		unsigned		cq_extra;
-	} ____cacheline_aligned_in_smp;
-
-	struct {
-		spinlock_t		completion_lock;
-
-		/*
-		 * ->iopoll_list is protected by the ctx->uring_lock for
-		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
-		 * For SQPOLL, only the single threaded io_sq_thread() will
-		 * manipulate the list, hence no extra locking is needed there.
-		 */
-		struct io_wq_work_list	iopoll_list;
-		struct io_hash_table	cancel_table;
-		bool			poll_multi_queue;
-
-		struct list_head	io_buffers_comp;
-	} ____cacheline_aligned_in_smp;
-
-	/* timeouts */
-	struct {
-		spinlock_t		timeout_lock;
-		atomic_t		cq_timeouts;
-		struct list_head	timeout_list;
-		struct list_head	ltimeout_list;
-		unsigned		cq_last_tm_flush;
-	} ____cacheline_aligned_in_smp;
-
-	/* Keep this last, we don't need it for the fast path */
-
-	struct io_restriction		restrictions;
-	struct task_struct		*submitter_task;
-
-	/* slow path rsrc auxilary data, used by update/register */
-	struct io_rsrc_node		*rsrc_backup_node;
-	struct io_mapped_ubuf		*dummy_ubuf;
-	struct io_rsrc_data		*file_data;
-	struct io_rsrc_data		*buf_data;
-
-	struct delayed_work		rsrc_put_work;
-	struct llist_head		rsrc_put_llist;
-	struct list_head		rsrc_ref_list;
-	spinlock_t			rsrc_ref_lock;
-
-	struct list_head		io_buffers_pages;
-
-	#if defined(CONFIG_UNIX)
-		struct socket		*ring_sock;
-	#endif
-	/* hashed buffered write serialization */
-	struct io_wq_hash		*hash_map;
-
-	/* Only used for accounting purposes */
-	struct user_struct		*user;
-	struct mm_struct		*mm_account;
-
-	/* ctx exit and cancelation */
-	struct llist_head		fallback_llist;
-	struct delayed_work		fallback_work;
-	struct work_struct		exit_work;
-	struct list_head		tctx_list;
-	struct completion		ref_comp;
-
-	/* io-wq management, e.g. thread count */
-	u32				iowq_limits[2];
-	bool				iowq_limits_set;
-
-	struct list_head		defer_list;
-	unsigned			sq_thread_idle;
-};
-
-enum {
-	REQ_F_FIXED_FILE_BIT	= IOSQE_FIXED_FILE_BIT,
-	REQ_F_IO_DRAIN_BIT	= IOSQE_IO_DRAIN_BIT,
-	REQ_F_LINK_BIT		= IOSQE_IO_LINK_BIT,
-	REQ_F_HARDLINK_BIT	= IOSQE_IO_HARDLINK_BIT,
-	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,
-	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,
-	REQ_F_CQE_SKIP_BIT	= IOSQE_CQE_SKIP_SUCCESS_BIT,
-
-	/* first byte is taken by user flags, shift it to not overlap */
-	REQ_F_FAIL_BIT		= 8,
-	REQ_F_INFLIGHT_BIT,
-	REQ_F_CUR_POS_BIT,
-	REQ_F_NOWAIT_BIT,
-	REQ_F_LINK_TIMEOUT_BIT,
-	REQ_F_NEED_CLEANUP_BIT,
-	REQ_F_POLLED_BIT,
-	REQ_F_BUFFER_SELECTED_BIT,
-	REQ_F_BUFFER_RING_BIT,
-	REQ_F_REISSUE_BIT,
-	REQ_F_CREDS_BIT,
-	REQ_F_REFCOUNT_BIT,
-	REQ_F_ARM_LTIMEOUT_BIT,
-	REQ_F_ASYNC_DATA_BIT,
-	REQ_F_SKIP_LINK_CQES_BIT,
-	REQ_F_SINGLE_POLL_BIT,
-	REQ_F_DOUBLE_POLL_BIT,
-	REQ_F_PARTIAL_IO_BIT,
-	REQ_F_CQE32_INIT_BIT,
-	REQ_F_APOLL_MULTISHOT_BIT,
-	REQ_F_CLEAR_POLLIN_BIT,
-	REQ_F_HASH_LOCKED_BIT,
-	/* keep async read/write and isreg together and in order */
-	REQ_F_SUPPORT_NOWAIT_BIT,
-	REQ_F_ISREG_BIT,
-
-	/* not a real bit, just to check we're not overflowing the space */
-	__REQ_F_LAST_BIT,
-};
-
-enum {
-	/* ctx owns file */
-	REQ_F_FIXED_FILE	= BIT(REQ_F_FIXED_FILE_BIT),
-	/* drain existing IO first */
-	REQ_F_IO_DRAIN		= BIT(REQ_F_IO_DRAIN_BIT),
-	/* linked sqes */
-	REQ_F_LINK		= BIT(REQ_F_LINK_BIT),
-	/* doesn't sever on completion < 0 */
-	REQ_F_HARDLINK		= BIT(REQ_F_HARDLINK_BIT),
-	/* IOSQE_ASYNC */
-	REQ_F_FORCE_ASYNC	= BIT(REQ_F_FORCE_ASYNC_BIT),
-	/* IOSQE_BUFFER_SELECT */
-	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),
-	/* IOSQE_CQE_SKIP_SUCCESS */
-	REQ_F_CQE_SKIP		= BIT(REQ_F_CQE_SKIP_BIT),
-
-	/* fail rest of links */
-	REQ_F_FAIL		= BIT(REQ_F_FAIL_BIT),
-	/* on inflight list, should be cancelled and waited on exit reliably */
-	REQ_F_INFLIGHT		= BIT(REQ_F_INFLIGHT_BIT),
-	/* read/write uses file position */
-	REQ_F_CUR_POS		= BIT(REQ_F_CUR_POS_BIT),
-	/* must not punt to workers */
-	REQ_F_NOWAIT		= BIT(REQ_F_NOWAIT_BIT),
-	/* has or had linked timeout */
-	REQ_F_LINK_TIMEOUT	= BIT(REQ_F_LINK_TIMEOUT_BIT),
-	/* needs cleanup */
-	REQ_F_NEED_CLEANUP	= BIT(REQ_F_NEED_CLEANUP_BIT),
-	/* already went through poll handler */
-	REQ_F_POLLED		= BIT(REQ_F_POLLED_BIT),
-	/* buffer already selected */
-	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
-	/* buffer selected from ring, needs commit */
-	REQ_F_BUFFER_RING	= BIT(REQ_F_BUFFER_RING_BIT),
-	/* caller should reissue async */
-	REQ_F_REISSUE		= BIT(REQ_F_REISSUE_BIT),
-	/* supports async reads/writes */
-	REQ_F_SUPPORT_NOWAIT	= BIT(REQ_F_SUPPORT_NOWAIT_BIT),
-	/* regular file */
-	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
-	/* has creds assigned */
-	REQ_F_CREDS		= BIT(REQ_F_CREDS_BIT),
-	/* skip refcounting if not set */
-	REQ_F_REFCOUNT		= BIT(REQ_F_REFCOUNT_BIT),
-	/* there is a linked timeout that has to be armed */
-	REQ_F_ARM_LTIMEOUT	= BIT(REQ_F_ARM_LTIMEOUT_BIT),
-	/* ->async_data allocated */
-	REQ_F_ASYNC_DATA	= BIT(REQ_F_ASYNC_DATA_BIT),
-	/* don't post CQEs while failing linked requests */
-	REQ_F_SKIP_LINK_CQES	= BIT(REQ_F_SKIP_LINK_CQES_BIT),
-	/* single poll may be active */
-	REQ_F_SINGLE_POLL	= BIT(REQ_F_SINGLE_POLL_BIT),
-	/* double poll may active */
-	REQ_F_DOUBLE_POLL	= BIT(REQ_F_DOUBLE_POLL_BIT),
-	/* request has already done partial IO */
-	REQ_F_PARTIAL_IO	= BIT(REQ_F_PARTIAL_IO_BIT),
-	/* fast poll multishot mode */
-	REQ_F_APOLL_MULTISHOT	= BIT(REQ_F_APOLL_MULTISHOT_BIT),
-	/* ->extra1 and ->extra2 are initialised */
-	REQ_F_CQE32_INIT	= BIT(REQ_F_CQE32_INIT_BIT),
-	/* recvmsg special flag, clear EPOLLIN */
-	REQ_F_CLEAR_POLLIN	= BIT(REQ_F_CLEAR_POLLIN_BIT),
-	/* hashed into ->cancel_hash_locked, protected by ->uring_lock */
-	REQ_F_HASH_LOCKED	= BIT(REQ_F_HASH_LOCKED_BIT),
-};
-
-typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
-
-struct io_task_work {
-	union {
-		struct io_wq_work_node	node;
-		struct llist_node	fallback_node;
-	};
-	io_req_tw_func_t		func;
-};
-
-struct io_cqe {
-	__u64	user_data;
-	__s32	res;
-	/* fd initially, then cflags for completion */
-	union {
-		__u32	flags;
-		int	fd;
-	};
-};
-
-/*
- * Each request type overlays its private data structure on top of this one.
- * They must not exceed this one in size.
- */
-struct io_cmd_data {
-	struct file		*file;
-	/* each command gets 56 bytes of data */
-	__u8			data[56];
-};
-
-#define io_kiocb_to_cmd(req)	((void *) &(req)->cmd)
-#define cmd_to_io_kiocb(ptr)	((struct io_kiocb *) ptr)
-
-struct io_kiocb {
-	union {
-		/*
-		 * NOTE! Each of the io_kiocb union members has the file pointer
-		 * as the first entry in their struct definition. So you can
-		 * access the file pointer through any of the sub-structs,
-		 * or directly as just 'file' in this struct.
-		 */
-		struct file		*file;
-		struct io_cmd_data	cmd;
-	};
-
-	u8				opcode;
-	/* polled IO has completed */
-	u8				iopoll_completed;
-	/*
-	 * Can be either a fixed buffer index, or used with provided buffers.
-	 * For the latter, before issue it points to the buffer group ID,
-	 * and after selection it points to the buffer ID itself.
-	 */
-	u16				buf_index;
-	unsigned int			flags;
-
-	struct io_cqe			cqe;
-
-	struct io_ring_ctx		*ctx;
-	struct task_struct		*task;
-
-	struct io_rsrc_node		*rsrc_node;
-
-	union {
-		/* store used ubuf, so we can prevent reloading */
-		struct io_mapped_ubuf	*imu;
-
-		/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
-		struct io_buffer	*kbuf;
-
-		/*
-		 * stores buffer ID for ring provided buffers, valid IFF
-		 * REQ_F_BUFFER_RING is set.
-		 */
-		struct io_buffer_list	*buf_list;
-	};
-
-	union {
-		/* used by request caches, completion batching and iopoll */
-		struct io_wq_work_node	comp_list;
-		/* cache ->apoll->events */
-		__poll_t apoll_events;
-	};
-	atomic_t			refs;
-	atomic_t			poll_refs;
-	struct io_task_work		io_task_work;
-	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
-	union {
-		struct hlist_node	hash_node;
-		struct {
-			u64		extra1;
-			u64		extra2;
-		};
-	};
-	/* internal polling, see IORING_FEAT_FAST_POLL */
-	struct async_poll		*apoll;
-	/* opcode allocated if it needs to store data for async defer */
-	void				*async_data;
-	/* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
-	struct io_kiocb			*link;
-	/* custom credentials, valid IFF REQ_F_CREDS is set */
-	const struct cred		*creds;
-	struct io_wq_work		work;
-};
-
-struct io_cancel_data {
-	struct io_ring_ctx *ctx;
-	union {
-		u64 data;
-		struct file *file;
-	};
-	u32 flags;
-	int seq;
-};
-
-struct io_overflow_cqe {
-	struct list_head list;
-	struct io_uring_cqe cqe;
-};
-
-struct io_mapped_ubuf {
-	u64		ubuf;
-	u64		ubuf_end;
-	unsigned int	nr_bvecs;
-	unsigned long	acct_pages;
-	struct bio_vec	bvec[];
-};
-
-#endif
diff --git a/io_uring/refs.h b/io_uring/refs.h
index 334c5ead4c43..1336de3f2a30 100644
--- a/io_uring/refs.h
+++ b/io_uring/refs.h
@@ -2,7 +2,7 @@
 #define IOU_REQ_REF_H
 
 #include <linux/atomic.h>
-#include "io_uring_types.h"
+#include <linux/io_uring_types.h>
 
 /*
  * Shamelessly stolen from the mm implementation of page reference checking,
-- 
cgit 


From 48863ffd3e81b6ec98606d3479c50aa77b7a98a9 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 16 Jun 2022 13:57:20 +0100
Subject: io_uring: clean up tracing events

We have lots of trace events accepting an io_uring request and wanting
to print some of its fields like user_data, opcode, flags and so on.
However, as trace points were unaware of io_uring structures, we had to
pass all the fields as arguments. Teach trace/events/io_uring.h about
struct io_kiocb and stop the misery of passing a horde of arguments to
trace helpers.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/40ff72f92798114e56d400f2b003beb6cde6ef53.1655384063.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/trace/events/io_uring.h | 142 ++++++++++++++++------------------------
 io_uring/io_uring.c             |  16 ++---
 io_uring/poll.c                 |   5 +-
 io_uring/timeout.c              |   3 +-
 4 files changed, 66 insertions(+), 100 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index aa2f951b07cd..3bc8dec9acaa 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -7,6 +7,7 @@
 
 #include <linux/tracepoint.h>
 #include <uapi/linux/io_uring.h>
+#include <linux/io_uring_types.h>
 #include <linux/io_uring.h>
 
 struct io_wq_work;
@@ -97,9 +98,7 @@ TRACE_EVENT(io_uring_register,
 /**
  * io_uring_file_get - called before getting references to an SQE file
  *
- * @ctx:	pointer to a ring context structure
  * @req:	pointer to a submitted request
- * @user_data:	user data associated with the request
  * @fd:		SQE file descriptor
  *
  * Allows to trace out how often an SQE file reference is obtained, which can
@@ -108,9 +107,9 @@ TRACE_EVENT(io_uring_register,
  */
 TRACE_EVENT(io_uring_file_get,
 
-	TP_PROTO(void *ctx, void *req, unsigned long long user_data, int fd),
+	TP_PROTO(struct io_kiocb *req, int fd),
 
-	TP_ARGS(ctx, req, user_data, fd),
+	TP_ARGS(req, fd),
 
 	TP_STRUCT__entry (
 		__field(  void *,	ctx		)
@@ -120,9 +119,9 @@ TRACE_EVENT(io_uring_file_get,
 	),
 
 	TP_fast_assign(
-		__entry->ctx		= ctx;
+		__entry->ctx		= req->ctx;
 		__entry->req		= req;
-		__entry->user_data	= user_data;
+		__entry->user_data	= req->cqe.user_data;
 		__entry->fd		= fd;
 	),
 
@@ -133,22 +132,16 @@ TRACE_EVENT(io_uring_file_get,
 /**
  * io_uring_queue_async_work - called before submitting a new async work
  *
- * @ctx:	pointer to a ring context structure
  * @req:	pointer to a submitted request
- * @user_data:	user data associated with the request
- * @opcode:	opcode of request
- * @flags	request flags
- * @work:	pointer to a submitted io_wq_work
  * @rw:		type of workqueue, hashed or normal
  *
  * Allows to trace asynchronous work submission.
  */
 TRACE_EVENT(io_uring_queue_async_work,
 
-	TP_PROTO(void *ctx, void * req, unsigned long long user_data, u8 opcode,
-		unsigned int flags, struct io_wq_work *work, int rw),
+	TP_PROTO(struct io_kiocb *req, int rw),
 
-	TP_ARGS(ctx, req, user_data, opcode, flags, work, rw),
+	TP_ARGS(req, rw),
 
 	TP_STRUCT__entry (
 		__field(  void *,			ctx		)
@@ -159,19 +152,19 @@ TRACE_EVENT(io_uring_queue_async_work,
 		__field(  struct io_wq_work *,		work		)
 		__field(  int,				rw		)
 
-		__string( op_str, io_uring_get_opcode(opcode)	)
+		__string( op_str, io_uring_get_opcode(req->opcode)	)
 	),
 
 	TP_fast_assign(
-		__entry->ctx		= ctx;
+		__entry->ctx		= req->ctx;
 		__entry->req		= req;
-		__entry->user_data	= user_data;
-		__entry->flags		= flags;
-		__entry->opcode		= opcode;
-		__entry->work		= work;
+		__entry->user_data	= req->cqe.user_data;
+		__entry->flags		= req->flags;
+		__entry->opcode		= req->opcode;
+		__entry->work		= &req->work;
 		__entry->rw		= rw;
 
-		__assign_str(op_str, io_uring_get_opcode(opcode));
+		__assign_str(op_str, io_uring_get_opcode(req->opcode));
 	),
 
 	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p",
@@ -183,19 +176,16 @@ TRACE_EVENT(io_uring_queue_async_work,
 /**
  * io_uring_defer - called when an io_uring request is deferred
  *
- * @ctx:	pointer to a ring context structure
  * @req:	pointer to a deferred request
- * @user_data:	user data associated with the request
- * @opcode:	opcode of request
  *
  * Allows to track deferred requests, to get an insight about what requests are
  * not started immediately.
  */
 TRACE_EVENT(io_uring_defer,
 
-	TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode),
+	TP_PROTO(struct io_kiocb *req),
 
-	TP_ARGS(ctx, req, user_data, opcode),
+	TP_ARGS(req),
 
 	TP_STRUCT__entry (
 		__field(  void *,		ctx	)
@@ -203,16 +193,16 @@ TRACE_EVENT(io_uring_defer,
 		__field(  unsigned long long,	data	)
 		__field(  u8,			opcode	)
 
-		__string( op_str, io_uring_get_opcode(opcode) )
+		__string( op_str, io_uring_get_opcode(req->opcode) )
 	),
 
 	TP_fast_assign(
-		__entry->ctx	= ctx;
+		__entry->ctx	= req->ctx;
 		__entry->req	= req;
-		__entry->data	= user_data;
-		__entry->opcode	= opcode;
+		__entry->data	= req->cqe.user_data;
+		__entry->opcode	= req->opcode;
 
-		__assign_str(op_str, io_uring_get_opcode(opcode));
+		__assign_str(op_str, io_uring_get_opcode(req->opcode));
 	),
 
 	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s",
@@ -224,7 +214,6 @@ TRACE_EVENT(io_uring_defer,
  * io_uring_link - called before the io_uring request added into link_list of
  * 		   another request
  *
- * @ctx:		pointer to a ring context structure
  * @req:		pointer to a linked request
  * @target_req:		pointer to a previous request, that would contain @req
  *
@@ -233,9 +222,9 @@ TRACE_EVENT(io_uring_defer,
  */
 TRACE_EVENT(io_uring_link,
 
-	TP_PROTO(void *ctx, void *req, void *target_req),
+	TP_PROTO(struct io_kiocb *req, struct io_kiocb *target_req),
 
-	TP_ARGS(ctx, req, target_req),
+	TP_ARGS(req, target_req),
 
 	TP_STRUCT__entry (
 		__field(  void *,	ctx		)
@@ -244,7 +233,7 @@ TRACE_EVENT(io_uring_link,
 	),
 
 	TP_fast_assign(
-		__entry->ctx		= ctx;
+		__entry->ctx		= req->ctx;
 		__entry->req		= req;
 		__entry->target_req	= target_req;
 	),
@@ -285,10 +274,7 @@ TRACE_EVENT(io_uring_cqring_wait,
 /**
  * io_uring_fail_link - called before failing a linked request
  *
- * @ctx:	pointer to a ring context structure
  * @req:	request, which links were cancelled
- * @user_data:	user data associated with the request
- * @opcode:	opcode of request
  * @link:	cancelled link
  *
  * Allows to track linked requests cancellation, to see not only that some work
@@ -296,9 +282,9 @@ TRACE_EVENT(io_uring_cqring_wait,
  */
 TRACE_EVENT(io_uring_fail_link,
 
-	TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, void *link),
+	TP_PROTO(struct io_kiocb *req, struct io_kiocb *link),
 
-	TP_ARGS(ctx, req, user_data, opcode, link),
+	TP_ARGS(req, link),
 
 	TP_STRUCT__entry (
 		__field(  void *,		ctx		)
@@ -307,17 +293,17 @@ TRACE_EVENT(io_uring_fail_link,
 		__field(  u8,			opcode		)
 		__field(  void *,		link		)
 
-		__string( op_str, io_uring_get_opcode(opcode) )
+		__string( op_str, io_uring_get_opcode(req->opcode) )
 	),
 
 	TP_fast_assign(
-		__entry->ctx		= ctx;
+		__entry->ctx		= req->ctx;
 		__entry->req		= req;
-		__entry->user_data	= user_data;
-		__entry->opcode		= opcode;
+		__entry->user_data	= req->cqe.user_data;
+		__entry->opcode		= req->opcode;
 		__entry->link		= link;
 
-		__assign_str(op_str, io_uring_get_opcode(opcode));
+		__assign_str(op_str, io_uring_get_opcode(req->opcode));
 	),
 
 	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p",
@@ -376,23 +362,17 @@ TRACE_EVENT(io_uring_complete,
 /**
  * io_uring_submit_sqe - called before submitting one SQE
  *
- * @ctx:		pointer to a ring context structure
  * @req:		pointer to a submitted request
- * @user_data:		user data associated with the request
- * @opcode:		opcode of request
- * @flags		request flags
  * @force_nonblock:	whether a context blocking or not
- * @sq_thread:		true if sq_thread has submitted this SQE
  *
  * Allows to track SQE submitting, to understand what was the source of it, SQ
  * thread or io_uring_enter call.
  */
 TRACE_EVENT(io_uring_submit_sqe,
 
-	TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, u32 flags,
-		 bool force_nonblock, bool sq_thread),
+	TP_PROTO(struct io_kiocb *req, bool force_nonblock),
 
-	TP_ARGS(ctx, req, user_data, opcode, flags, force_nonblock, sq_thread),
+	TP_ARGS(req, force_nonblock),
 
 	TP_STRUCT__entry (
 		__field(  void *,		ctx		)
@@ -403,19 +383,19 @@ TRACE_EVENT(io_uring_submit_sqe,
 		__field(  bool,			force_nonblock	)
 		__field(  bool,			sq_thread	)
 
-		__string( op_str, io_uring_get_opcode(opcode) )
+		__string( op_str, io_uring_get_opcode(req->opcode) )
 	),
 
 	TP_fast_assign(
-		__entry->ctx		= ctx;
+		__entry->ctx		= req->ctx;
 		__entry->req		= req;
-		__entry->user_data	= user_data;
-		__entry->opcode		= opcode;
-		__entry->flags		= flags;
+		__entry->user_data	= req->cqe.user_data;
+		__entry->opcode		= req->opcode;
+		__entry->flags		= req->flags;
 		__entry->force_nonblock	= force_nonblock;
-		__entry->sq_thread	= sq_thread;
+		__entry->sq_thread	= req->ctx->flags & IORING_SETUP_SQPOLL;
 
-		__assign_str(op_str, io_uring_get_opcode(opcode));
+		__assign_str(op_str, io_uring_get_opcode(req->opcode));
 	),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, "
@@ -427,10 +407,7 @@ TRACE_EVENT(io_uring_submit_sqe,
 /*
  * io_uring_poll_arm - called after arming a poll wait if successful
  *
- * @ctx:		pointer to a ring context structure
  * @req:		pointer to the armed request
- * @user_data:		user data associated with the request
- * @opcode:		opcode of request
  * @mask:		request poll events mask
  * @events:		registered events of interest
  *
@@ -439,10 +416,9 @@ TRACE_EVENT(io_uring_submit_sqe,
  */
 TRACE_EVENT(io_uring_poll_arm,
 
-	TP_PROTO(void *ctx, void *req, u64 user_data, u8 opcode,
-		 int mask, int events),
+	TP_PROTO(struct io_kiocb *req, int mask, int events),
 
-	TP_ARGS(ctx, req, user_data, opcode, mask, events),
+	TP_ARGS(req, mask, events),
 
 	TP_STRUCT__entry (
 		__field(  void *,		ctx		)
@@ -452,18 +428,18 @@ TRACE_EVENT(io_uring_poll_arm,
 		__field(  int,			mask		)
 		__field(  int,			events		)
 
-		__string( op_str, io_uring_get_opcode(opcode) )
+		__string( op_str, io_uring_get_opcode(req->opcode) )
 	),
 
 	TP_fast_assign(
-		__entry->ctx		= ctx;
+		__entry->ctx		= req->ctx;
 		__entry->req		= req;
-		__entry->user_data	= user_data;
-		__entry->opcode		= opcode;
+		__entry->user_data	= req->cqe.user_data;
+		__entry->opcode		= req->opcode;
 		__entry->mask		= mask;
 		__entry->events		= events;
 
-		__assign_str(op_str, io_uring_get_opcode(opcode));
+		__assign_str(op_str, io_uring_get_opcode(req->opcode));
 	),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x",
@@ -475,18 +451,15 @@ TRACE_EVENT(io_uring_poll_arm,
 /*
  * io_uring_task_add - called after adding a task
  *
- * @ctx:		pointer to a ring context structure
  * @req:		pointer to request
- * @user_data:		user data associated with the request
- * @opcode:		opcode of request
  * @mask:		request poll events mask
  *
  */
 TRACE_EVENT(io_uring_task_add,
 
-	TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, int mask),
+	TP_PROTO(struct io_kiocb *req, int mask),
 
-	TP_ARGS(ctx, req, user_data, opcode, mask),
+	TP_ARGS(req, mask),
 
 	TP_STRUCT__entry (
 		__field(  void *,		ctx		)
@@ -495,17 +468,17 @@ TRACE_EVENT(io_uring_task_add,
 		__field(  u8,			opcode		)
 		__field(  int,			mask		)
 
-		__string( op_str, io_uring_get_opcode(opcode) )
+		__string( op_str, io_uring_get_opcode(req->opcode) )
 	),
 
 	TP_fast_assign(
-		__entry->ctx		= ctx;
+		__entry->ctx		= req->ctx;
 		__entry->req		= req;
-		__entry->user_data	= user_data;
-		__entry->opcode		= opcode;
+		__entry->user_data	= req->cqe.user_data;
+		__entry->opcode		= req->opcode;
 		__entry->mask		= mask;
 
-		__assign_str(op_str, io_uring_get_opcode(opcode));
+		__assign_str(op_str, io_uring_get_opcode(req->opcode));
 	),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x",
@@ -518,7 +491,6 @@ TRACE_EVENT(io_uring_task_add,
  * io_uring_req_failed - called when an sqe is errored dring submission
  *
  * @sqe:		pointer to the io_uring_sqe that failed
- * @ctx:		pointer to a ring context structure
  * @req:		pointer to request
  * @error:		error it failed with
  *
@@ -526,9 +498,9 @@ TRACE_EVENT(io_uring_task_add,
  */
 TRACE_EVENT(io_uring_req_failed,
 
-	TP_PROTO(const struct io_uring_sqe *sqe, void *ctx, void *req, int error),
+	TP_PROTO(const struct io_uring_sqe *sqe, struct io_kiocb *req, int error),
 
-	TP_ARGS(sqe, ctx, req, error),
+	TP_ARGS(sqe, req, error),
 
 	TP_STRUCT__entry (
 		__field(  void *,		ctx		)
@@ -552,7 +524,7 @@ TRACE_EVENT(io_uring_req_failed,
 	),
 
 	TP_fast_assign(
-		__entry->ctx		= ctx;
+		__entry->ctx		= req->ctx;
 		__entry->req		= req;
 		__entry->user_data	= sqe->user_data;
 		__entry->opcode		= sqe->opcode;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 0a1f83a936b7..ef4371790aaa 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -452,9 +452,7 @@ void io_queue_iowq(struct io_kiocb *req, bool *dont_use)
 	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
 		req->work.flags |= IO_WQ_WORK_CANCEL;
 
-	trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data,
-					req->opcode, req->flags, &req->work,
-					io_wq_is_hashed(&req->work));
+	trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work));
 	io_wq_enqueue(tctx->io_wq, &req->work);
 	if (link)
 		io_queue_linked_timeout(link);
@@ -1583,7 +1581,7 @@ fail:
 		goto queue;
 	}
 
-	trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode);
+	trace_io_uring_defer(req);
 	de->req = req;
 	de->seq = seq;
 	list_add_tail(&de->list, &ctx->defer_list);
@@ -1783,7 +1781,7 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd)
 {
 	struct file *file = fget(fd);
 
-	trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd);
+	trace_io_uring_file_get(req, fd);
 
 	/* we don't allow fixed io_uring files */
 	if (file && io_is_uring_fops(file))
@@ -2006,7 +2004,7 @@ static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
 	struct io_submit_link *link = &ctx->submit_state.link;
 	struct io_kiocb *head = link->head;
 
-	trace_io_uring_req_failed(sqe, ctx, req, ret);
+	trace_io_uring_req_failed(sqe, req, ret);
 
 	/*
 	 * Avoid breaking links in the middle as it renders links with SQPOLL
@@ -2048,9 +2046,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		return io_submit_fail_init(sqe, req, ret);
 
 	/* don't need @sqe from now on */
-	trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode,
-				  req->flags, true,
-				  ctx->flags & IORING_SETUP_SQPOLL);
+	trace_io_uring_submit_sqe(req, true);
 
 	/*
 	 * If we already have a head request, queue this one for async
@@ -2064,7 +2060,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		if (unlikely(ret))
 			return io_submit_fail_init(sqe, req, ret);
 
-		trace_io_uring_link(ctx, req, link->head);
+		trace_io_uring_link(req, link->head);
 		link->last->link = req;
 		link->last = req;
 
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 63aca920543b..b2659b56c702 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -288,7 +288,7 @@ static void __io_poll_execute(struct io_kiocb *req, int mask,
 	else
 		req->io_task_work.func = io_apoll_task_func;
 
-	trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask);
+	trace_io_uring_task_add(req, mask);
 	io_req_task_work_add(req);
 }
 
@@ -558,8 +558,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
 	if (ret || ipt.error)
 		return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
 
-	trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode,
-				mask, apoll->poll.events);
+	trace_io_uring_poll_arm(req, mask, apoll->poll.events);
 	return IO_APOLL_OK;
 }
 
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index f9df359813c9..557c637af158 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -115,8 +115,7 @@ static void io_fail_links(struct io_kiocb *req)
 		nxt = link->link;
 		link->link = NULL;
 
-		trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data,
-					req->opcode, link);
+		trace_io_uring_fail_link(req, link);
 
 		if (ignore_cqes)
 			link->flags |= REQ_F_CQE_SKIP;
-- 
cgit 


From ad163a7e2562230c77102c60f668bac440e60cce Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 18 Jun 2022 19:44:33 -0600
Subject: io_uring: move a few private types to local headers

Commit 3a3d47fa9cfd ("io_uring: make io_uring_types.h public") moved
a bunch of io_uring types to a kernel wide header, so we could make
tracing a bit saner rather than pass in a ton of arguments.

However, there are a few types in there that are not really needed to
be system wide. Move the cancel data and mapped buffers back to the
appropriate io_uring local headers.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 18 ------------------
 io_uring/cancel.h              | 13 +++++++++++++
 io_uring/fdinfo.c              |  1 +
 io_uring/poll.h                |  1 +
 io_uring/rsrc.h                |  8 ++++++++
 io_uring/timeout.h             |  1 +
 6 files changed, 24 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 779c72da5b8f..2015f3ea7cb7 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -528,27 +528,9 @@ struct io_kiocb {
 	struct io_wq_work		work;
 };
 
-struct io_cancel_data {
-	struct io_ring_ctx *ctx;
-	union {
-		u64 data;
-		struct file *file;
-	};
-	u32 flags;
-	int seq;
-};
-
 struct io_overflow_cqe {
 	struct list_head list;
 	struct io_uring_cqe cqe;
 };
 
-struct io_mapped_ubuf {
-	u64		ubuf;
-	u64		ubuf_end;
-	unsigned int	nr_bvecs;
-	unsigned long	acct_pages;
-	struct bio_vec	bvec[];
-};
-
 #endif
diff --git a/io_uring/cancel.h b/io_uring/cancel.h
index 8dd259dc383e..2338012a5b06 100644
--- a/io_uring/cancel.h
+++ b/io_uring/cancel.h
@@ -1,5 +1,18 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#include <linux/io_uring_types.h>
+
+struct io_cancel_data {
+	struct io_ring_ctx *ctx;
+	union {
+		u64 data;
+		struct file *file;
+	};
+	u32 flags;
+	int seq;
+};
+
+
 int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags);
 
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 61c35707a6cf..b29e2d02216f 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -13,6 +13,7 @@
 #include "sqpoll.h"
 #include "fdinfo.h"
 #include "cancel.h"
+#include "rsrc.h"
 
 #ifdef CONFIG_PROC_FS
 static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
diff --git a/io_uring/poll.h b/io_uring/poll.h
index fa3e19790281..c40673d7da01 100644
--- a/io_uring/poll.h
+++ b/io_uring/poll.h
@@ -24,6 +24,7 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags);
 int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags);
 
+struct io_cancel_data;
 int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
 		   unsigned issue_flags);
 int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags);
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 872c86312cbc..03f26516e994 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -45,6 +45,14 @@ struct io_rsrc_node {
 	bool				done;
 };
 
+struct io_mapped_ubuf {
+	u64		ubuf;
+	u64		ubuf_end;
+	unsigned int	nr_bvecs;
+	unsigned long	acct_pages;
+	struct bio_vec	bvec[];
+};
+
 void io_rsrc_put_work(struct work_struct *work);
 void io_rsrc_refs_refill(struct io_ring_ctx *ctx);
 void io_wait_rsrc_data(struct io_rsrc_data *data);
diff --git a/io_uring/timeout.h b/io_uring/timeout.h
index dd7cfb0d9366..858c62644897 100644
--- a/io_uring/timeout.h
+++ b/io_uring/timeout.h
@@ -22,6 +22,7 @@ static inline struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req)
 }
 
 __cold void io_flush_timeouts(struct io_ring_ctx *ctx);
+struct io_cancel_data;
 int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd);
 __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
 			     bool cancel_all);
-- 
cgit 


From d9dee4302a7cbd6c0142dbdf6d150acc7459de0d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 19 Jun 2022 12:26:08 +0100
Subject: io_uring: remove ->flush_cqes optimisation

It's not clear how widely used IOSQE_CQE_SKIP_SUCCESS is, and how often
->flush_cqes flag prevents from completion being flushed. Sometimes it's
high level of concurrency that enables it at least for one CQE, but
sometimes it doesn't save much because nobody waiting on the CQ.

Remove ->flush_cqes flag and the optimisation, it should benefit the
normal use case. Note, that there is no spurious eventfd problem with
that as checks for spuriousness were incorporated into
io_eventfd_signal().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/692e81eeddccc096f449a7960365fa7b4a18f8e6.1655637157.git.asml.silence@gmail.com
[axboe: remove now dead state->flush_cqes variable]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  1 -
 io_uring/io_uring.c            | 23 ++++++++++-------------
 io_uring/io_uring.h            |  2 --
 3 files changed, 10 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 2015f3ea7cb7..6bcd7bff6479 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -148,7 +148,6 @@ struct io_submit_state {
 
 	bool			plug_started;
 	bool			need_plug;
-	bool			flush_cqes;
 	unsigned short		submit_nr;
 	struct blk_plug		plug;
 };
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 61d4e6d0731a..16a625e854ec 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1250,22 +1250,19 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 	struct io_wq_work_node *node, *prev;
 	struct io_submit_state *state = &ctx->submit_state;
 
-	if (state->flush_cqes) {
-		spin_lock(&ctx->completion_lock);
-		wq_list_for_each(node, prev, &state->compl_reqs) {
-			struct io_kiocb *req = container_of(node, struct io_kiocb,
-						    comp_list);
-
-			if (!(req->flags & REQ_F_CQE_SKIP))
-				__io_fill_cqe_req(ctx, req);
-		}
+	spin_lock(&ctx->completion_lock);
+	wq_list_for_each(node, prev, &state->compl_reqs) {
+		struct io_kiocb *req = container_of(node, struct io_kiocb,
+					    comp_list);
 
-		io_commit_cqring(ctx);
-		spin_unlock(&ctx->completion_lock);
-		io_cqring_ev_posted(ctx);
-		state->flush_cqes = false;
+		if (!(req->flags & REQ_F_CQE_SKIP))
+			__io_fill_cqe_req(ctx, req);
 	}
 
+	io_commit_cqring(ctx);
+	spin_unlock(&ctx->completion_lock);
+	io_cqring_ev_posted(ctx);
+
 	io_free_batch_list(ctx, state->compl_reqs.first);
 	INIT_WQ_LIST(&state->compl_reqs);
 }
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 7b2055b342df..bdc62727638b 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -219,8 +219,6 @@ static inline void io_req_add_compl_list(struct io_kiocb *req)
 {
 	struct io_submit_state *state = &req->ctx->submit_state;
 
-	if (!(req->flags & REQ_F_CQE_SKIP))
-		state->flush_cqes = true;
 	wq_list_add_tail(&req->comp_list, &state->compl_reqs);
 }
 
-- 
cgit 


From 305bef98870816ae58357d647521891ec558a92e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 20 Jun 2022 01:25:55 +0100
Subject: io_uring: hide eventfd assumptions in eventfd paths

Some io_uring-eventfd users assume that there won't be spurious wakeups.
That assumption has to be honoured by all io_cqring_ev_posted() callers,
which is inconvenient and from time to time leads to problems but should
be maintained to not break the userspace.

Instead of making the callers track whether a CQE was posted or not, hide
it inside io_eventfd_signal(). It saves ->cached_cq_tail it saw last time
and triggers the eventfd only when ->cached_cq_tail changed since then.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0ffc66bae37a2513080b601e4370e147faaa72c5.1655684496.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  2 ++
 io_uring/io_uring.c            | 44 +++++++++++++++++++++++++-----------------
 io_uring/timeout.c             |  3 +--
 3 files changed, 29 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 6bcd7bff6479..5987f8acca38 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -314,6 +314,8 @@ struct io_ring_ctx {
 
 	struct list_head		defer_list;
 	unsigned			sq_thread_idle;
+	/* protected by ->completion_lock */
+	unsigned			evfd_last_cq_tail;
 };
 
 enum {
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 707b599b9224..84f923625216 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -473,6 +473,22 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
 static void io_eventfd_signal(struct io_ring_ctx *ctx)
 {
 	struct io_ev_fd *ev_fd;
+	bool skip;
+
+	spin_lock(&ctx->completion_lock);
+	/*
+	 * Eventfd should only get triggered when at least one event has been
+	 * posted. Some applications rely on the eventfd notification count only
+	 * changing IFF a new CQE has been added to the CQ ring. There's no
+	 * depedency on 1:1 relationship between how many times this function is
+	 * called (and hence the eventfd count) and number of CQEs posted to the
+	 * CQ ring.
+	 */
+	skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
+	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
+	spin_unlock(&ctx->completion_lock);
+	if (skip)
+		return;
 
 	rcu_read_lock();
 	/*
@@ -511,13 +527,6 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
 		io_eventfd_signal(ctx);
 }
 
-/*
- * This should only get called when at least one event has been posted.
- * Some applications rely on the eventfd notification count only changing
- * IFF a new CQE has been added to the CQ ring. There's no depedency on
- * 1:1 relationship between how many times this function is called (and
- * hence the eventfd count) and number of CQEs posted to the CQ ring.
- */
 void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 {
 	if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
@@ -530,7 +539,7 @@ void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 /* Returns true if there are no backlogged entries after the flush */
 static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 {
-	bool all_flushed, posted;
+	bool all_flushed;
 	size_t cqe_size = sizeof(struct io_uring_cqe);
 
 	if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
@@ -539,7 +548,6 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 	if (ctx->flags & IORING_SETUP_CQE32)
 		cqe_size <<= 1;
 
-	posted = false;
 	spin_lock(&ctx->completion_lock);
 	while (!list_empty(&ctx->cq_overflow_list)) {
 		struct io_uring_cqe *cqe = io_get_cqe(ctx);
@@ -554,7 +562,6 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 		else
 			io_account_cq_overflow(ctx);
 
-		posted = true;
 		list_del(&ocqe->list);
 		kfree(ocqe);
 	}
@@ -567,8 +574,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 
 	io_commit_cqring(ctx);
 	spin_unlock(&ctx->completion_lock);
-	if (posted)
-		io_cqring_ev_posted(ctx);
+	io_cqring_ev_posted(ctx);
 	return all_flushed;
 }
 
@@ -758,8 +764,7 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx,
 	filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
 	io_commit_cqring(ctx);
 	spin_unlock(&ctx->completion_lock);
-	if (filled)
-		io_cqring_ev_posted(ctx);
+	io_cqring_ev_posted(ctx);
 	return filled;
 }
 
@@ -940,14 +945,12 @@ __cold void io_free_req(struct io_kiocb *req)
 static void __io_req_find_next_prep(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	bool posted;
 
 	spin_lock(&ctx->completion_lock);
-	posted = io_disarm_next(req);
+	io_disarm_next(req);
 	io_commit_cqring(ctx);
 	spin_unlock(&ctx->completion_lock);
-	if (posted)
-		io_cqring_ev_posted(ctx);
+	io_cqring_ev_posted(ctx);
 }
 
 static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
@@ -2428,6 +2431,11 @@ static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
 		kfree(ev_fd);
 		return ret;
 	}
+
+	spin_lock(&ctx->completion_lock);
+	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
+	spin_unlock(&ctx->completion_lock);
+
 	ev_fd->eventfd_async = eventfd_async;
 	ctx->has_evfd = true;
 	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index a79a7d6ef1b3..424b2fc858b8 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -629,7 +629,6 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
 	spin_unlock_irq(&ctx->timeout_lock);
 	io_commit_cqring(ctx);
 	spin_unlock(&ctx->completion_lock);
-	if (canceled != 0)
-		io_cqring_ev_posted(ctx);
+	io_cqring_ev_posted(ctx);
 	return canceled != 0;
 }
-- 
cgit 


From f88262e60bb9cb5740891672ce9f405e7f9393e5 Mon Sep 17 00:00:00 2001
From: Dylan Yudaken <dylany@fb.com>
Date: Wed, 22 Jun 2022 06:40:23 -0700
Subject: io_uring: lockless task list

With networking use cases we see contention on the spinlock used to
protect the task_list when multiple threads try and add completions at once.
Instead we can use a lockless list, and assume that the first caller to
add to the list is responsible for kicking off task work.

Signed-off-by: Dylan Yudaken <dylany@fb.com>
Link: https://lore.kernel.org/r/20220622134028.2013417-4-dylany@fb.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  2 +-
 io_uring/io_uring.c            | 38 +++++++++-----------------------------
 io_uring/tctx.c                |  3 +--
 io_uring/tctx.h                |  6 +++---
 4 files changed, 14 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 5987f8acca38..918165a20053 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -428,7 +428,7 @@ typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
 
 struct io_task_work {
 	union {
-		struct io_wq_work_node	node;
+		struct llist_node	node;
 		struct llist_node	fallback_node;
 	};
 	io_req_tw_func_t		func;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index bf7ca2b279d3..0124335c6d09 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -986,11 +986,12 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
 	percpu_ref_put(&ctx->refs);
 }
 
-static void handle_tw_list(struct io_wq_work_node *node,
+
+static void handle_tw_list(struct llist_node *node,
 			   struct io_ring_ctx **ctx, bool *locked)
 {
 	do {
-		struct io_wq_work_node *next = node->next;
+		struct llist_node *next = node->next;
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 						    io_task_work.node);
 
@@ -1014,23 +1015,11 @@ void tctx_task_work(struct callback_head *cb)
 	struct io_ring_ctx *ctx = NULL;
 	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
 						  task_work);
+	struct llist_node *node = llist_del_all(&tctx->task_list);
 
-	while (1) {
-		struct io_wq_work_node *node;
-
-		spin_lock_irq(&tctx->task_lock);
-		node = tctx->task_list.first;
-		INIT_WQ_LIST(&tctx->task_list);
-		if (!node)
-			tctx->task_running = false;
-		spin_unlock_irq(&tctx->task_lock);
-		if (!node)
-			break;
+	if (node) {
 		handle_tw_list(node, &ctx, &uring_locked);
 		cond_resched();
-
-		if (data_race(!tctx->task_list.first) && uring_locked)
-			io_submit_flush_completions(ctx);
 	}
 
 	ctx_flush_and_put(ctx, &uring_locked);
@@ -1044,16 +1033,10 @@ void io_req_task_work_add(struct io_kiocb *req)
 {
 	struct io_uring_task *tctx = req->task->io_uring;
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_wq_work_node *node;
-	unsigned long flags;
+	struct llist_node *node;
 	bool running;
 
-	spin_lock_irqsave(&tctx->task_lock, flags);
-	wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
-	running = tctx->task_running;
-	if (!running)
-		tctx->task_running = true;
-	spin_unlock_irqrestore(&tctx->task_lock, flags);
+	running = !llist_add(&req->io_task_work.node, &tctx->task_list);
 
 	/* task_work already pending, we're done */
 	if (running)
@@ -1065,11 +1048,8 @@ void io_req_task_work_add(struct io_kiocb *req)
 	if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
 		return;
 
-	spin_lock_irqsave(&tctx->task_lock, flags);
-	tctx->task_running = false;
-	node = tctx->task_list.first;
-	INIT_WQ_LIST(&tctx->task_list);
-	spin_unlock_irqrestore(&tctx->task_lock, flags);
+
+	node = llist_del_all(&tctx->task_list);
 
 	while (node) {
 		req = container_of(node, struct io_kiocb, io_task_work.node);
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 7a68ba9beec3..7f97d97fef0a 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -86,8 +86,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
 	atomic_set(&tctx->in_idle, 0);
 	atomic_set(&tctx->inflight_tracked, 0);
 	task->io_uring = tctx;
-	spin_lock_init(&tctx->task_lock);
-	INIT_WQ_LIST(&tctx->task_list);
+	init_llist_head(&tctx->task_list);
 	init_task_work(&tctx->task_work, tctx_task_work);
 	return 0;
 }
diff --git a/io_uring/tctx.h b/io_uring/tctx.h
index c8566ea5dca4..8a33ff6e5d91 100644
--- a/io_uring/tctx.h
+++ b/io_uring/tctx.h
@@ -1,5 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#include <linux/llist.h>
+
 /*
  * Arbitrary limit, can be raised if need be
  */
@@ -19,9 +21,7 @@ struct io_uring_task {
 	struct percpu_counter		inflight;
 
 	struct { /* task_work */
-		spinlock_t		task_lock;
-		bool			task_running;
-		struct io_wq_work_list	task_list;
+		struct llist_head	task_list;
 		struct callback_head	task_work;
 	} ____cacheline_aligned_in_smp;
 };
-- 
cgit 


From eccd8801858f03a19a4d1f8fef27e3d6e17b21fd Mon Sep 17 00:00:00 2001
From: Dylan Yudaken <dylany@fb.com>
Date: Wed, 22 Jun 2022 06:40:27 -0700
Subject: io_uring: add trace event for running task work

This is useful for investigating if task_work is batching

Signed-off-by: Dylan Yudaken <dylany@fb.com>
Link: https://lore.kernel.org/r/20220622134028.2013417-8-dylany@fb.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/trace/events/io_uring.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 3bc8dec9acaa..918e3a43e4b2 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -600,6 +600,36 @@ TRACE_EVENT(io_uring_cqe_overflow,
 		  __entry->cflags, __entry->ocqe)
 );
 
+/*
+ * io_uring_task_work_run - ran task work
+ *
+ * @tctx:		pointer to a io_uring_task
+ * @count:		how many functions it ran
+ * @loops:		how many loops it ran
+ *
+ */
+TRACE_EVENT(io_uring_task_work_run,
+
+	TP_PROTO(void *tctx, unsigned int count, unsigned int loops),
+
+	TP_ARGS(tctx, count, loops),
+
+	TP_STRUCT__entry (
+		__field(  void *,		tctx		)
+		__field(  unsigned int,		count		)
+		__field(  unsigned int,		loops		)
+	),
+
+	TP_fast_assign(
+		__entry->tctx		= tctx;
+		__entry->count		= count;
+		__entry->loops		= loops;
+	),
+
+	TP_printk("tctx %p, count %u, loops %u",
+		 __entry->tctx, __entry->count, __entry->loops)
+);
+
 #endif /* _TRACE_IO_URING_H */
 
 /* This part must be outside protection */
-- 
cgit 


From 7d8ca7250197096bfa9f432c1d99b0555504bbba Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 18 Jun 2022 09:47:04 -0600
Subject: io_uring: add IORING_ASYNC_CANCEL_FD_FIXED cancel flag

In preparation for not having a request to pass in that carries this
state, add a separate cancelation flag that allows the caller to ask
for a fixed file for cancelation.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h | 2 ++
 io_uring/cancel.c             | 9 ++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index d7ae81b10893..a09a78bd7556 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -247,10 +247,12 @@ enum io_uring_op {
  * IORING_ASYNC_CANCEL_FD	Key off 'fd' for cancelation rather than the
  *				request 'user_data'
  * IORING_ASYNC_CANCEL_ANY	Match any request
+ * IORING_ASYNC_CANCEL_FD_FIXED	'fd' passed in is a fixed descriptor
  */
 #define IORING_ASYNC_CANCEL_ALL	(1U << 0)
 #define IORING_ASYNC_CANCEL_FD	(1U << 1)
 #define IORING_ASYNC_CANCEL_ANY	(1U << 2)
+#define IORING_ASYNC_CANCEL_FD_FIXED	(1U << 3)
 
 /*
  * send/sendmsg and recv/recvmsg flags (sqe->ioprio)
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 500ee5f5fd23..da486de07029 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -24,7 +24,7 @@ struct io_cancel {
 };
 
 #define CANCEL_FLAGS	(IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \
-			 IORING_ASYNC_CANCEL_ANY)
+			 IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED)
 
 static bool io_cancel_cb(struct io_wq_work *work, void *data)
 {
@@ -174,11 +174,14 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
 	int ret;
 
 	if (cd.flags & IORING_ASYNC_CANCEL_FD) {
-		if (req->flags & REQ_F_FIXED_FILE)
+		if (req->flags & REQ_F_FIXED_FILE ||
+		    cd.flags & IORING_ASYNC_CANCEL_FD_FIXED) {
+			req->flags |= REQ_F_FIXED_FILE;
 			req->file = io_file_get_fixed(req, cancel->fd,
 							issue_flags);
-		else
+		} else {
 			req->file = io_file_get_normal(req, cancel->fd);
+		}
 		if (!req->file) {
 			ret = -EBADF;
 			goto done;
-- 
cgit 


From 78a861b9495920f8609dee5b670dacbff09d359f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 18 Jun 2022 10:00:50 -0600
Subject: io_uring: add sync cancelation API through io_uring_register()

The io_uring cancelation API is async, like any other API that we expose
there. For the case of finding a request to cancel, or not finding one,
it is fully sync in that when submission returns, the CQE for both the
cancelation request and the targeted request have been posted to the
CQ ring.

However, if the targeted work is being executed by io-wq, the API can
only start the act of canceling it. This makes it difficult to use in
some circumstances, as the caller then has to wait for the CQEs to come
in and match on the same cancelation data there.

Provide a IORING_REGISTER_SYNC_CANCEL command for io_uring_register()
that does sync cancelations, always. For the io-wq case, it'll wait
for the cancelation to come in before returning. The only expected
returns from this API is:

0		Request found and canceled fine.
> 0		Requests found and canceled. Only happens if asked to
		cancel multiple requests, and if the work wasn't in
		progress.
-ENOENT		Request not found.
-ETIME		A timeout on the operation was requested, but the timeout
		expired before we could cancel.

and we won't get -EALREADY via this API.

If the timeout value passed in is -1 (tv_sec and tv_nsec), then that
means that no timeout is requested. Otherwise, the timespec passed in
is the amount of time the sync cancel will wait for a successful
cancelation.

Link: https://github.com/axboe/liburing/discussions/608
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |  15 ++++++
 io_uring/cancel.c             | 107 ++++++++++++++++++++++++++++++++++++++++++
 io_uring/cancel.h             |   2 +
 io_uring/io_uring.c           |   6 +++
 4 files changed, 130 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index a09a78bd7556..094f706c93e0 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -10,6 +10,7 @@
 
 #include <linux/fs.h>
 #include <linux/types.h>
+#include <linux/time_types.h>
 
 /*
  * IO submission data structure (Submission Queue Entry)
@@ -428,6 +429,9 @@ enum {
 	IORING_REGISTER_PBUF_RING		= 22,
 	IORING_UNREGISTER_PBUF_RING		= 23,
 
+	/* sync cancelation API */
+	IORING_REGISTER_SYNC_CANCEL		= 24,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };
@@ -563,4 +567,15 @@ struct io_uring_getevents_arg {
 	__u64	ts;
 };
 
+/*
+ * Argument for IORING_REGISTER_SYNC_CANCEL
+ */
+struct io_uring_sync_cancel_reg {
+	__u64				addr;
+	__s32				fd;
+	__u32				flags;
+	struct __kernel_timespec	timeout;
+	__u64				pad[4];
+};
+
 #endif
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index da486de07029..8435a1eba59a 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -6,6 +6,7 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/namei.h>
+#include <linux/nospec.h>
 #include <linux/io_uring.h>
 
 #include <uapi/linux/io_uring.h>
@@ -206,3 +207,109 @@ void init_hash_table(struct io_hash_table *table, unsigned size)
 		INIT_HLIST_HEAD(&table->hbs[i].list);
 	}
 }
+
+static int __io_sync_cancel(struct io_uring_task *tctx,
+			    struct io_cancel_data *cd, int fd)
+{
+	struct io_ring_ctx *ctx = cd->ctx;
+
+	/* fixed must be grabbed every time since we drop the uring_lock */
+	if ((cd->flags & IORING_ASYNC_CANCEL_FD) &&
+	    (cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
+		unsigned long file_ptr;
+
+		if (unlikely(fd > ctx->nr_user_files))
+			return -EBADF;
+		fd = array_index_nospec(fd, ctx->nr_user_files);
+		file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
+		cd->file = (struct file *) (file_ptr & FFS_MASK);
+		if (!cd->file)
+			return -EBADF;
+	}
+
+	return __io_async_cancel(cd, tctx, 0);
+}
+
+int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
+	__must_hold(&ctx->uring_lock)
+{
+	struct io_cancel_data cd = {
+		.ctx	= ctx,
+		.seq	= atomic_inc_return(&ctx->cancel_seq),
+	};
+	ktime_t timeout = KTIME_MAX;
+	struct io_uring_sync_cancel_reg sc;
+	struct fd f = { };
+	DEFINE_WAIT(wait);
+	int ret;
+
+	if (copy_from_user(&sc, arg, sizeof(sc)))
+		return -EFAULT;
+	if (sc.flags & ~CANCEL_FLAGS)
+		return -EINVAL;
+	if (sc.pad[0] || sc.pad[1] || sc.pad[2] || sc.pad[3])
+		return -EINVAL;
+
+	cd.data = sc.addr;
+	cd.flags = sc.flags;
+
+	/* we can grab a normal file descriptor upfront */
+	if ((cd.flags & IORING_ASYNC_CANCEL_FD) &&
+	   !(cd.flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
+		f = fdget(sc.fd);
+		if (!f.file)
+			return -EBADF;
+		cd.file = f.file;
+	}
+
+	ret = __io_sync_cancel(current->io_uring, &cd, sc.fd);
+
+	/* found something, done! */
+	if (ret != -EALREADY)
+		goto out;
+
+	if (sc.timeout.tv_sec != -1UL || sc.timeout.tv_nsec != -1UL) {
+		struct timespec64 ts = {
+			.tv_sec		= sc.timeout.tv_sec,
+			.tv_nsec	= sc.timeout.tv_nsec
+		};
+
+		timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
+	}
+
+	/*
+	 * Keep looking until we get -ENOENT. we'll get woken everytime
+	 * every time a request completes and will retry the cancelation.
+	 */
+	do {
+		cd.seq = atomic_inc_return(&ctx->cancel_seq);
+
+		prepare_to_wait(&ctx->cq_wait, &wait, TASK_INTERRUPTIBLE);
+
+		ret = __io_sync_cancel(current->io_uring, &cd, sc.fd);
+
+		if (ret != -EALREADY)
+			break;
+
+		mutex_unlock(&ctx->uring_lock);
+		ret = io_run_task_work_sig();
+		if (ret < 0) {
+			mutex_lock(&ctx->uring_lock);
+			break;
+		}
+		ret = schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS);
+		mutex_lock(&ctx->uring_lock);
+		if (!ret) {
+			ret = -ETIME;
+			break;
+		}
+	} while (1);
+
+	finish_wait(&ctx->cq_wait, &wait);
+
+	if (ret == -ENOENT || ret > 0)
+		ret = 0;
+out:
+	fdput(f);
+	return ret;
+}
diff --git a/io_uring/cancel.h b/io_uring/cancel.h
index 1bc7e917ce94..6a59ee484d0c 100644
--- a/io_uring/cancel.h
+++ b/io_uring/cancel.h
@@ -19,3 +19,5 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags);
 int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
 		  unsigned int issue_flags);
 void init_hash_table(struct io_hash_table *table, unsigned size);
+
+int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg);
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 997b915a1ff7..45538b3c3a76 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3871,6 +3871,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_unregister_pbuf_ring(ctx, arg);
 		break;
+	case IORING_REGISTER_SYNC_CANCEL:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_sync_cancel(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
-- 
cgit 


From 3218e5d32dbcf1b9c6dc589eca21deebb14215fa Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 25 Jun 2022 11:52:59 +0100
Subject: io_uring: fuse fallback_node and normal tw node

Now as both normal and fallback paths use llist, just keep one node head
in struct io_task_work and kill off ->fallback_node.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/d04ebde409f7b162fe247b361b4486b193293e46.1656153285.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 5 +----
 io_uring/io_uring.c            | 5 ++---
 2 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 918165a20053..3ca8f363f504 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -427,10 +427,7 @@ enum {
 typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
 
 struct io_task_work {
-	union {
-		struct llist_node	node;
-		struct llist_node	fallback_node;
-	};
+	struct llist_node		node;
 	io_req_tw_func_t		func;
 };
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 45538b3c3a76..86a0b0c6f5bf 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -233,7 +233,7 @@ static __cold void io_fallback_req_func(struct work_struct *work)
 	bool locked = false;
 
 	percpu_ref_get(&ctx->refs);
-	llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
+	llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
 		req->io_task_work.func(req, &locked);
 
 	if (locked) {
@@ -1091,13 +1091,12 @@ void io_req_task_work_add(struct io_kiocb *req)
 	if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
 		return;
 
-
 	node = llist_del_all(&tctx->task_list);
 
 	while (node) {
 		req = container_of(node, struct io_kiocb, io_task_work.node);
 		node = node->next;
-		if (llist_add(&req->io_task_work.fallback_node,
+		if (llist_add(&req->io_task_work.node,
 			      &req->ctx->fallback_llist))
 			schedule_delayed_work(&req->ctx->fallback_work, 1);
 	}
-- 
cgit 


From 8fcf4c48f44bd7b1b75db139f56ff1ad6477379e Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Tue, 28 Jun 2022 21:33:20 +0200
Subject: io_uring: replace zero-length array with flexible-array member
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is a regular need in the kernel to provide a way to declare
having a dynamically sized set of trailing elements in a structure.
Kernel code should always use “flexible array members”[1] for these
cases. The older style of one-element or zero-length arrays should
no longer be used[2].

[1] https://en.wikipedia.org/wiki/Flexible_array_member
[2] https://www.kernel.org/doc/html/v5.16/process/deprecated.html#zero-length-and-one-element-arrays

Link: https://github.com/KSPP/linux/issues/78
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 094f706c93e0..8fe0275cdaf3 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -495,7 +495,7 @@ struct io_uring_probe {
 	__u8 ops_len;	/* length of ops[] array below */
 	__u16 resv;
 	__u32 resv2[3];
-	struct io_uring_probe_op ops[0];
+	struct io_uring_probe_op ops[];
 };
 
 struct io_uring_restriction {
-- 
cgit 


From e6130eba8a848a7a6ba6c534bd8f6d60749ae1a9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 13 Jun 2022 04:47:02 -0600
Subject: io_uring: add support for passing fixed file descriptors

With IORING_OP_MSG_RING, one ring can send a message to another ring.
Extend that support to also allow sending a fixed file descriptor to
that ring, enabling one ring to pass a registered descriptor to another
one.

Arguments are extended to pass in:

sqe->addr3	fixed file slot in source ring
sqe->file_index	fixed file slot in destination ring

IORING_OP_MSG_RING is extended to take a command argument in sqe->addr.
If set to zero (or IORING_MSG_DATA), it sends just a message like before.
If set to IORING_MSG_SEND_FD, a fixed file descriptor is sent according
to the above arguments.

Two common use cases for this are:

1) Server needs to be shutdown or restarted, pass file descriptors to
   another onei

2) Backend is split, and one accepts connections, while others then get
  the fd passed and handle the actual connection.

Both of those are classic SCM_RIGHTS use cases, and it's not possible to
support them with direct descriptors today.

By default, this will post a CQE to the target ring, similarly to how
IORING_MSG_DATA does it. If IORING_MSG_RING_CQE_SKIP is set, no message
is posted to the target ring. The issuer is expected to notify the
receiver side separately.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |  17 ++++++
 io_uring/msg_ring.c           | 130 +++++++++++++++++++++++++++++++++++++++---
 2 files changed, 140 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 8fe0275cdaf3..f378eabbff21 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -51,6 +51,7 @@ struct io_uring_sqe {
 		__u32		unlink_flags;
 		__u32		hardlink_flags;
 		__u32		xattr_flags;
+		__u32		msg_ring_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	/* pack this to avoid bogus arm OABI complaints */
@@ -270,6 +271,22 @@ enum io_uring_op {
  */
 #define IORING_ACCEPT_MULTISHOT	(1U << 0)
 
+/*
+ * IORING_OP_MSG_RING command types, stored in sqe->addr
+ */
+enum {
+	IORING_MSG_DATA,	/* pass sqe->len as 'res' and off as user_data */
+	IORING_MSG_SEND_FD,	/* send a registered fd to another ring */
+};
+
+/*
+ * IORING_OP_MSG_RING flags (sqe->msg_ring_flags)
+ *
+ * IORING_MSG_RING_CQE_SKIP	Don't post a CQE to the target ring. Not
+ *				applicable for IORING_MSG_DATA, obviously.
+ */
+#define IORING_MSG_RING_CQE_SKIP	(1U << 0)
+
 /*
  * IO completion data structure (Completion Queue Entry)
  */
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index b02be2349652..939205b30c8b 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -3,46 +3,162 @@
 #include <linux/errno.h>
 #include <linux/file.h>
 #include <linux/slab.h>
+#include <linux/nospec.h>
 #include <linux/io_uring.h>
 
 #include <uapi/linux/io_uring.h>
 
 #include "io_uring.h"
+#include "rsrc.h"
+#include "filetable.h"
 #include "msg_ring.h"
 
 struct io_msg {
 	struct file			*file;
 	u64 user_data;
 	u32 len;
+	u32 cmd;
+	u32 src_fd;
+	u32 dst_fd;
+	u32 flags;
 };
 
+static int io_msg_ring_data(struct io_kiocb *req)
+{
+	struct io_ring_ctx *target_ctx = req->file->private_data;
+	struct io_msg *msg = io_kiocb_to_cmd(req);
+
+	if (msg->src_fd || msg->dst_fd || msg->flags)
+		return -EINVAL;
+
+	if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
+		return 0;
+
+	return -EOVERFLOW;
+}
+
+static void io_double_unlock_ctx(struct io_ring_ctx *ctx,
+				 struct io_ring_ctx *octx,
+				 unsigned int issue_flags)
+{
+	if (issue_flags & IO_URING_F_UNLOCKED)
+		mutex_unlock(&ctx->uring_lock);
+	mutex_unlock(&octx->uring_lock);
+}
+
+static int io_double_lock_ctx(struct io_ring_ctx *ctx,
+			      struct io_ring_ctx *octx,
+			      unsigned int issue_flags)
+{
+	/*
+	 * To ensure proper ordering between the two ctxs, we can only
+	 * attempt a trylock on the target. If that fails and we already have
+	 * the source ctx lock, punt to io-wq.
+	 */
+	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
+		if (!mutex_trylock(&octx->uring_lock))
+			return -EAGAIN;
+		return 0;
+	}
+
+	/* Always grab smallest value ctx first. We know ctx != octx. */
+	if (ctx < octx) {
+		mutex_lock(&ctx->uring_lock);
+		mutex_lock(&octx->uring_lock);
+	} else {
+		mutex_lock(&octx->uring_lock);
+		mutex_lock(&ctx->uring_lock);
+	}
+
+	return 0;
+}
+
+static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_ring_ctx *target_ctx = req->file->private_data;
+	struct io_msg *msg = io_kiocb_to_cmd(req);
+	struct io_ring_ctx *ctx = req->ctx;
+	unsigned long file_ptr;
+	struct file *src_file;
+	int ret;
+
+	if (target_ctx == ctx)
+		return -EINVAL;
+
+	ret = io_double_lock_ctx(ctx, target_ctx, issue_flags);
+	if (unlikely(ret))
+		return ret;
+
+	ret = -EBADF;
+	if (unlikely(msg->src_fd >= ctx->nr_user_files))
+		goto out_unlock;
+
+	msg->src_fd = array_index_nospec(msg->src_fd, ctx->nr_user_files);
+	file_ptr = io_fixed_file_slot(&ctx->file_table, msg->src_fd)->file_ptr;
+	src_file = (struct file *) (file_ptr & FFS_MASK);
+	get_file(src_file);
+
+	ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd);
+	if (ret < 0) {
+		fput(src_file);
+		goto out_unlock;
+	}
+
+	if (msg->flags & IORING_MSG_RING_CQE_SKIP)
+		goto out_unlock;
+
+	/*
+	 * If this fails, the target still received the file descriptor but
+	 * wasn't notified of the fact. This means that if this request
+	 * completes with -EOVERFLOW, then the sender must ensure that a
+	 * later IORING_OP_MSG_RING delivers the message.
+	 */
+	if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
+		ret = -EOVERFLOW;
+out_unlock:
+	io_double_unlock_ctx(ctx, target_ctx, issue_flags);
+	return ret;
+}
+
 int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_msg *msg = io_kiocb_to_cmd(req);
 
-	if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in ||
-		     sqe->buf_index || sqe->personality))
+	if (unlikely(sqe->buf_index || sqe->personality))
 		return -EINVAL;
 
 	msg->user_data = READ_ONCE(sqe->off);
 	msg->len = READ_ONCE(sqe->len);
+	msg->cmd = READ_ONCE(sqe->addr);
+	msg->src_fd = READ_ONCE(sqe->addr3);
+	msg->dst_fd = READ_ONCE(sqe->file_index);
+	msg->flags = READ_ONCE(sqe->msg_ring_flags);
+	if (msg->flags & ~IORING_MSG_RING_CQE_SKIP)
+		return -EINVAL;
+
 	return 0;
 }
 
 int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_msg *msg = io_kiocb_to_cmd(req);
-	struct io_ring_ctx *target_ctx;
 	int ret;
 
 	ret = -EBADFD;
 	if (!io_is_uring_fops(req->file))
 		goto done;
 
-	ret = -EOVERFLOW;
-	target_ctx = req->file->private_data;
-	if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
-		ret = 0;
+	switch (msg->cmd) {
+	case IORING_MSG_DATA:
+		ret = io_msg_ring_data(req);
+		break;
+	case IORING_MSG_SEND_FD:
+		ret = io_msg_send_fd(req, issue_flags);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
 
 done:
 	if (ret < 0)
-- 
cgit 


From 6e73dffbb93cb8797cd4e42e98d837edf0f1a967 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 25 Jun 2022 11:55:38 +0100
Subject: io_uring: let to set a range for file slot allocation

From recently io_uring provides an option to allocate a file index for
operation registering fixed files. However, it's utterly unusable with
mixed approaches when for a part of files the userspace knows better
where to place it, as it may race and users don't have any sane way to
pick a slot and hoping it will not be taken.

Let the userspace to register a range of fixed file slots in which the
auto-allocation happens. The use case is splittting the fixed table in
two parts, where on of them is used for auto-allocation and another for
slot-specified operations.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/66ab0394e436f38437cf7c44676e1920d09687ad.1656154403.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  3 +++
 include/uapi/linux/io_uring.h  | 13 +++++++++++++
 io_uring/filetable.c           | 24 ++++++++++++++++++++----
 io_uring/filetable.h           | 20 +++++++++++++++++---
 io_uring/io_uring.c            |  6 ++++++
 io_uring/rsrc.c                |  2 ++
 6 files changed, 61 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 3ca8f363f504..26ef11e978d4 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -233,6 +233,9 @@ struct io_ring_ctx {
 
 	unsigned long		check_cq;
 
+	unsigned int		file_alloc_start;
+	unsigned int		file_alloc_end;
+
 	struct {
 		/*
 		 * We cache a range of free CQEs we can use, once exhausted it
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index f378eabbff21..cf95354198a3 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -449,6 +449,9 @@ enum {
 	/* sync cancelation API */
 	IORING_REGISTER_SYNC_CANCEL		= 24,
 
+	/* register a range of fixed file slots for automatic slot allocation */
+	IORING_REGISTER_FILE_ALLOC_RANGE	= 25,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };
@@ -595,4 +598,14 @@ struct io_uring_sync_cancel_reg {
 	__u64				pad[4];
 };
 
+/*
+ * Argument for IORING_REGISTER_FILE_ALLOC_RANGE
+ * The range is specified as [off, off + len)
+ */
+struct io_uring_file_index_range {
+	__u32	off;
+	__u32	len;
+	__u64	resv;
+};
+
 #endif
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index abaa5ba7f655..7b473259f3f4 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -16,7 +16,7 @@
 static int io_file_bitmap_get(struct io_ring_ctx *ctx)
 {
 	struct io_file_table *table = &ctx->file_table;
-	unsigned long nr = ctx->nr_user_files;
+	unsigned long nr = ctx->file_alloc_end;
 	int ret;
 
 	do {
@@ -24,11 +24,10 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx)
 		if (ret != nr)
 			return ret;
 
-		if (!table->alloc_hint)
+		if (table->alloc_hint == ctx->file_alloc_start)
 			break;
-
 		nr = table->alloc_hint;
-		table->alloc_hint = 0;
+		table->alloc_hint = ctx->file_alloc_start;
 	} while (1);
 
 	return -ENFILE;
@@ -175,3 +174,20 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
 	io_rsrc_node_switch(ctx, ctx->file_data);
 	return 0;
 }
+
+int io_register_file_alloc_range(struct io_ring_ctx *ctx,
+				 struct io_uring_file_index_range __user *arg)
+{
+	struct io_uring_file_index_range range;
+	u32 end;
+
+	if (copy_from_user(&range, arg, sizeof(range)))
+		return -EFAULT;
+	if (check_add_overflow(range.off, range.len, &end))
+		return -EOVERFLOW;
+	if (range.resv || end > ctx->nr_user_files)
+		return -EINVAL;
+
+	io_file_table_set_alloc_range(ctx, range.off, range.len);
+	return 0;
+}
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index 79eb50c1980e..ff3a712e11bf 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -3,9 +3,7 @@
 #define IOU_FILE_TABLE_H
 
 #include <linux/file.h>
-
-struct io_ring_ctx;
-struct io_kiocb;
+#include <linux/io_uring_types.h>
 
 /*
  * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0
@@ -33,6 +31,9 @@ int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file,
 				unsigned int file_slot);
 int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset);
 
+int io_register_file_alloc_range(struct io_ring_ctx *ctx,
+				 struct io_uring_file_index_range __user *arg);
+
 unsigned int io_file_get_flags(struct file *file);
 
 static inline void io_file_bitmap_clear(struct io_file_table *table, int bit)
@@ -71,4 +72,17 @@ static inline void io_fixed_file_set(struct io_fixed_file *file_slot,
 	file_slot->file_ptr = file_ptr;
 }
 
+static inline void io_reset_alloc_hint(struct io_ring_ctx *ctx)
+{
+	ctx->file_table.alloc_hint = ctx->file_alloc_start;
+}
+
+static inline void io_file_table_set_alloc_range(struct io_ring_ctx *ctx,
+						 unsigned off, unsigned len)
+{
+	ctx->file_alloc_start = off;
+	ctx->file_alloc_end = off + len;
+	io_reset_alloc_hint(ctx);
+}
+
 #endif
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 070ee9ec9ee7..745264938a48 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3866,6 +3866,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_sync_cancel(ctx, arg);
 		break;
+	case IORING_REGISTER_FILE_ALLOC_RANGE:
+		ret = -EINVAL;
+		if (!arg || nr_args)
+			break;
+		ret = io_register_file_alloc_range(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 706fa020505b..d2e589c703d0 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1012,6 +1012,8 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 		io_file_bitmap_set(&ctx->file_table, i);
 	}
 
+	/* default it to the whole table */
+	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
 	io_rsrc_node_switch(ctx, NULL);
 	return 0;
 fail:
-- 
cgit 


From b3fdea6ecb55c3ceea866ff66486927e51a982b3 Mon Sep 17 00:00:00 2001
From: Dylan Yudaken <dylany@fb.com>
Date: Thu, 30 Jun 2022 02:12:29 -0700
Subject: io_uring: multishot recv

Support multishot receive for io_uring.
Typical server applications will run a loop where for each recv CQE it
requeues another recv/recvmsg.

This can be simplified by using the existing multishot functionality
combined with io_uring's provided buffers.
The API is to add the IORING_RECV_MULTISHOT flag to the SQE. CQEs will
then be posted (with IORING_CQE_F_MORE flag set) when data is available
and is read. Once an error occurs or the socket ends, the multishot will
be removed and a completion without IORING_CQE_F_MORE will be posted.

The benefit to this is that the recv is much more performant.
 * Subsequent receives are queued up straight away without requiring the
   application to finish a processing loop.
 * If there are more data in the socket (sat the provided buffer size is
   smaller than the socket buffer) then the data is immediately
   returned, improving batching.
 * Poll is only armed once and reused, saving CPU cycles

Signed-off-by: Dylan Yudaken <dylany@fb.com>
Link: https://lore.kernel.org/r/20220630091231.1456789-11-dylany@fb.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |   5 +++
 io_uring/net.c                | 102 ++++++++++++++++++++++++++++++++++++------
 2 files changed, 94 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index cf95354198a3..499679134961 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -263,8 +263,13 @@ enum io_uring_op {
  *				or receive and arm poll if that yields an
  *				-EAGAIN result, arm poll upfront and skip
  *				the initial transfer attempt.
+ *
+ * IORING_RECV_MULTISHOT	Multishot recv. Sets IORING_CQE_F_MORE if
+ *				the handler will continue to report
+ *				CQEs on behalf of the same SQE.
  */
 #define IORING_RECVSEND_POLL_FIRST	(1U << 0)
+#define IORING_RECV_MULTISHOT	(1U << 1)
 
 /*
  * accept flags stored in sqe->ioprio
diff --git a/io_uring/net.c b/io_uring/net.c
index e1eaf902f3b2..cb08a4b62840 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -389,6 +389,8 @@ int io_recvmsg_prep_async(struct io_kiocb *req)
 	return ret;
 }
 
+#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT)
+
 int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
@@ -399,13 +401,22 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	sr->len = READ_ONCE(sqe->len);
 	sr->flags = READ_ONCE(sqe->ioprio);
-	if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
+	if (sr->flags & ~(RECVMSG_FLAGS))
 		return -EINVAL;
 	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
 	if (sr->msg_flags & MSG_DONTWAIT)
 		req->flags |= REQ_F_NOWAIT;
 	if (sr->msg_flags & MSG_ERRQUEUE)
 		req->flags |= REQ_F_CLEAR_POLLIN;
+	if (sr->flags & IORING_RECV_MULTISHOT) {
+		if (!(req->flags & REQ_F_BUFFER_SELECT))
+			return -EINVAL;
+		if (sr->msg_flags & MSG_WAITALL)
+			return -EINVAL;
+		if (req->opcode == IORING_OP_RECV && sr->len)
+			return -EINVAL;
+		req->flags |= REQ_F_APOLL_MULTISHOT;
+	}
 
 #ifdef CONFIG_COMPAT
 	if (req->ctx->compat)
@@ -415,6 +426,48 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
+static inline void io_recv_prep_retry(struct io_kiocb *req)
+{
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+
+	sr->done_io = 0;
+	sr->len = 0; /* get from the provided buffer */
+}
+
+/*
+ * Finishes io_recv and io_recvmsg.
+ *
+ * Returns true if it is actually finished, or false if it should run
+ * again (for multishot).
+ */
+static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int cflags)
+{
+	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
+		io_req_set_res(req, *ret, cflags);
+		*ret = IOU_OK;
+		return true;
+	}
+
+	if (*ret > 0) {
+		if (io_post_aux_cqe(req->ctx, req->cqe.user_data, *ret,
+				    cflags | IORING_CQE_F_MORE, false)) {
+			io_recv_prep_retry(req);
+			return false;
+		}
+		/*
+		 * Otherwise stop multishot but use the current result.
+		 * Probably will end up going into overflow, but this means
+		 * we cannot trust the ordering anymore
+		 */
+	}
+
+	io_req_set_res(req, *ret, cflags);
+
+	if (req->flags & REQ_F_POLLED)
+		*ret = IOU_STOP_MULTISHOT;
+	return true;
+}
+
 int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
@@ -424,6 +477,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 	unsigned flags;
 	int ret, min_ret = 0;
 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+	size_t len = sr->len;
 
 	sock = sock_from_file(req->file);
 	if (unlikely(!sock))
@@ -442,16 +496,17 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
 		return io_setup_async_msg(req, kmsg);
 
+retry_multishot:
 	if (io_do_buffer_select(req)) {
 		void __user *buf;
 
-		buf = io_buffer_select(req, &sr->len, issue_flags);
+		buf = io_buffer_select(req, &len, issue_flags);
 		if (!buf)
 			return -ENOBUFS;
 		kmsg->fast_iov[0].iov_base = buf;
-		kmsg->fast_iov[0].iov_len = sr->len;
+		kmsg->fast_iov[0].iov_len = len;
 		iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1,
-				sr->len);
+				len);
 	}
 
 	flags = sr->msg_flags;
@@ -463,8 +518,15 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 	kmsg->msg.msg_get_inq = 1;
 	ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags);
 	if (ret < min_ret) {
-		if (ret == -EAGAIN && force_nonblock)
-			return io_setup_async_msg(req, kmsg);
+		if (ret == -EAGAIN && force_nonblock) {
+			ret = io_setup_async_msg(req, kmsg);
+			if (ret == -EAGAIN && (req->flags & IO_APOLL_MULTI_POLLED) ==
+					       IO_APOLL_MULTI_POLLED) {
+				io_kbuf_recycle(req, issue_flags);
+				return IOU_ISSUE_SKIP_COMPLETE;
+			}
+			return ret;
+		}
 		if (ret == -ERESTARTSYS)
 			ret = -EINTR;
 		if (ret > 0 && io_net_retry(sock, flags)) {
@@ -491,8 +553,11 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 	cflags = io_put_kbuf(req, issue_flags);
 	if (kmsg->msg.msg_inq)
 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
-	io_req_set_res(req, ret, cflags);
-	return IOU_OK;
+
+	if (!io_recv_finish(req, &ret, cflags))
+		goto retry_multishot;
+
+	return ret;
 }
 
 int io_recv(struct io_kiocb *req, unsigned int issue_flags)
@@ -505,6 +570,7 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 	unsigned flags;
 	int ret, min_ret = 0;
 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+	size_t len = sr->len;
 
 	if (!(req->flags & REQ_F_POLLED) &&
 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
@@ -514,16 +580,17 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 	if (unlikely(!sock))
 		return -ENOTSOCK;
 
+retry_multishot:
 	if (io_do_buffer_select(req)) {
 		void __user *buf;
 
-		buf = io_buffer_select(req, &sr->len, issue_flags);
+		buf = io_buffer_select(req, &len, issue_flags);
 		if (!buf)
 			return -ENOBUFS;
 		sr->buf = buf;
 	}
 
-	ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter);
+	ret = import_single_range(READ, sr->buf, len, &iov, &msg.msg_iter);
 	if (unlikely(ret))
 		goto out_free;
 
@@ -543,8 +610,14 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 
 	ret = sock_recvmsg(sock, &msg, flags);
 	if (ret < min_ret) {
-		if (ret == -EAGAIN && force_nonblock)
+		if (ret == -EAGAIN && force_nonblock) {
+			if ((req->flags & IO_APOLL_MULTI_POLLED) == IO_APOLL_MULTI_POLLED) {
+				io_kbuf_recycle(req, issue_flags);
+				return IOU_ISSUE_SKIP_COMPLETE;
+			}
+
 			return -EAGAIN;
+		}
 		if (ret == -ERESTARTSYS)
 			ret = -EINTR;
 		if (ret > 0 && io_net_retry(sock, flags)) {
@@ -570,8 +643,11 @@ out_free:
 	cflags = io_put_kbuf(req, issue_flags);
 	if (msg.msg_inq)
 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
-	io_req_set_res(req, ret, cflags);
-	return IOU_OK;
+
+	if (!io_recv_finish(req, &ret, cflags))
+		goto retry_multishot;
+
+	return ret;
 }
 
 int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-- 
cgit 


From 9b26e811e934eebda59362c9a03d082852552574 Mon Sep 17 00:00:00 2001
From: Dylan Yudaken <dylany@fb.com>
Date: Thu, 30 Jun 2022 02:12:30 -0700
Subject: io_uring: fix io_uring_cqe_overflow trace format

Make the trace format consistent with io_uring_complete for cflags

Signed-off-by: Dylan Yudaken <dylany@fb.com>
Link: https://lore.kernel.org/r/20220630091231.1456789-12-dylany@fb.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/trace/events/io_uring.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 918e3a43e4b2..95a8cfaad15a 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -594,7 +594,7 @@ TRACE_EVENT(io_uring_cqe_overflow,
 		__entry->ocqe		= ocqe;
 	),
 
-	TP_printk("ring %p, user_data 0x%llx, res %d, flags %x, "
+	TP_printk("ring %p, user_data 0x%llx, res %d, cflags 0x%x, "
 		  "overflow_cqe %p",
 		  __entry->ctx, __entry->user_data, __entry->res,
 		  __entry->cflags, __entry->ocqe)
-- 
cgit 


From 9b797a37c4bd83b03cedcfbd15852b836f5e562c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 7 Jul 2022 14:16:20 -0600
Subject: io_uring: add abstraction around apoll cache

In preparation for adding limits, and one more user, abstract out the
core bits of the allocation+free cache.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  6 +++++-
 io_uring/alloc_cache.h         | 41 +++++++++++++++++++++++++++++++++++++++++
 io_uring/io_uring.c            |  8 ++++----
 io_uring/poll.c                | 18 +++++-------------
 io_uring/poll.h                |  9 +++++++--
 5 files changed, 62 insertions(+), 20 deletions(-)
 create mode 100644 io_uring/alloc_cache.h

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 26ef11e978d4..b548da03b563 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -158,6 +158,10 @@ struct io_ev_fd {
 	struct rcu_head		rcu;
 };
 
+struct io_alloc_cache {
+	struct hlist_head	list;
+};
+
 struct io_ring_ctx {
 	/* const or read-mostly hot data */
 	struct {
@@ -216,7 +220,7 @@ struct io_ring_ctx {
 
 		struct io_hash_table	cancel_table_locked;
 		struct list_head	cq_overflow_list;
-		struct list_head	apoll_cache;
+		struct io_alloc_cache	apoll_cache;
 		struct xarray		personalities;
 		u32			pers_next;
 	} ____cacheline_aligned_in_smp;
diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h
new file mode 100644
index 000000000000..98f2374c37c7
--- /dev/null
+++ b/io_uring/alloc_cache.h
@@ -0,0 +1,41 @@
+#ifndef IOU_ALLOC_CACHE_H
+#define IOU_ALLOC_CACHE_H
+
+struct io_cache_entry {
+	struct hlist_node	node;
+};
+
+static inline void io_alloc_cache_put(struct io_alloc_cache *cache,
+				      struct io_cache_entry *entry)
+{
+	hlist_add_head(&entry->node, &cache->list);
+}
+
+static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache)
+{
+	if (!hlist_empty(&cache->list)) {
+		struct hlist_node *node = cache->list.first;
+
+		hlist_del(node);
+		return container_of(node, struct io_cache_entry, node);
+	}
+
+	return NULL;
+}
+
+static inline void io_alloc_cache_init(struct io_alloc_cache *cache)
+{
+	INIT_HLIST_HEAD(&cache->list);
+}
+
+static inline void io_alloc_cache_free(struct io_alloc_cache *cache,
+					void (*free)(struct io_cache_entry *))
+{
+	while (!hlist_empty(&cache->list)) {
+		struct hlist_node *node = cache->list.first;
+
+		hlist_del(node);
+		free(container_of(node, struct io_cache_entry, node));
+	}
+}
+#endif
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4d1ce58b015e..a360a3d390c6 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -92,6 +92,7 @@
 
 #include "timeout.h"
 #include "poll.h"
+#include "alloc_cache.h"
 
 #define IORING_MAX_ENTRIES	32768
 #define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)
@@ -295,7 +296,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->sqd_list);
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	INIT_LIST_HEAD(&ctx->io_buffers_cache);
-	INIT_LIST_HEAD(&ctx->apoll_cache);
+	io_alloc_cache_init(&ctx->apoll_cache);
 	init_completion(&ctx->ref_comp);
 	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
 	mutex_init(&ctx->uring_lock);
@@ -1180,8 +1181,7 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node)
 
 				if (apoll->double_poll)
 					kfree(apoll->double_poll);
-				list_add(&apoll->poll.wait.entry,
-						&ctx->apoll_cache);
+				io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache);
 				req->flags &= ~REQ_F_POLLED;
 			}
 			if (req->flags & IO_REQ_LINK_FLAGS)
@@ -2467,7 +2467,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	if (ctx->rings)
 		__io_cqring_overflow_flush(ctx, true);
 	io_eventfd_unregister(ctx);
-	io_flush_apoll_cache(ctx);
+	io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
 	mutex_unlock(&ctx->uring_lock);
 	io_destroy_buffers(ctx);
 	if (ctx->sq_creds)
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 052fcb647208..dadd293749b0 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -590,16 +590,15 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
 					     unsigned issue_flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
+	struct io_cache_entry *entry;
 	struct async_poll *apoll;
 
 	if (req->flags & REQ_F_POLLED) {
 		apoll = req->apoll;
 		kfree(apoll->double_poll);
 	} else if (!(issue_flags & IO_URING_F_UNLOCKED) &&
-		   !list_empty(&ctx->apoll_cache)) {
-		apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
-						poll.wait.entry);
-		list_del_init(&apoll->poll.wait.entry);
+		   (entry = io_alloc_cache_get(&ctx->apoll_cache)) != NULL) {
+		apoll = container_of(entry, struct async_poll, cache);
 	} else {
 		apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
 		if (unlikely(!apoll))
@@ -960,14 +959,7 @@ out:
 	return IOU_OK;
 }
 
-void io_flush_apoll_cache(struct io_ring_ctx *ctx)
+void io_apoll_cache_free(struct io_cache_entry *entry)
 {
-	struct async_poll *apoll;
-
-	while (!list_empty(&ctx->apoll_cache)) {
-		apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
-						poll.wait.entry);
-		list_del(&apoll->poll.wait.entry);
-		kfree(apoll);
-	}
+	kfree(container_of(entry, struct async_poll, cache));
 }
diff --git a/io_uring/poll.h b/io_uring/poll.h
index 95f192c7babb..5f3bae50fc81 100644
--- a/io_uring/poll.h
+++ b/io_uring/poll.h
@@ -1,5 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#include "alloc_cache.h"
+
 enum {
 	IO_APOLL_OK,
 	IO_APOLL_ABORTED,
@@ -14,7 +16,10 @@ struct io_poll {
 };
 
 struct async_poll {
-	struct io_poll		poll;
+	union {
+		struct io_poll		poll;
+		struct io_cache_entry	cache;
+	};
 	struct io_poll		*double_poll;
 };
 
@@ -31,4 +36,4 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags);
 bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
 			bool cancel_all);
 
-void io_flush_apoll_cache(struct io_ring_ctx *ctx);
+void io_apoll_cache_free(struct io_cache_entry *entry);
-- 
cgit 


From 9731bc9855dc169f27433fef3c4d0ff3496c512d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 7 Jul 2022 14:20:54 -0600
Subject: io_uring: impose max limit on apoll cache

Caches like this tend to grow to the peak size, and then never get any
smaller. Impose a max limit on the size, to prevent it from growing too
big.

A somewhat randomly chosen 512 is the max size we'll allow the cache
to get. If a batch of frees come in and would bring it over that, we
simply start kfree'ing the surplus.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  1 +
 io_uring/alloc_cache.h         | 16 ++++++++++++++--
 io_uring/io_uring.c            |  3 ++-
 3 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index b548da03b563..bf8f95332eda 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -160,6 +160,7 @@ struct io_ev_fd {
 
 struct io_alloc_cache {
 	struct hlist_head	list;
+	unsigned int		nr_cached;
 };
 
 struct io_ring_ctx {
diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h
index 98f2374c37c7..729793ae9712 100644
--- a/io_uring/alloc_cache.h
+++ b/io_uring/alloc_cache.h
@@ -1,14 +1,24 @@
 #ifndef IOU_ALLOC_CACHE_H
 #define IOU_ALLOC_CACHE_H
 
+/*
+ * Don't allow the cache to grow beyond this size.
+ */
+#define IO_ALLOC_CACHE_MAX	512
+
 struct io_cache_entry {
 	struct hlist_node	node;
 };
 
-static inline void io_alloc_cache_put(struct io_alloc_cache *cache,
+static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
 				      struct io_cache_entry *entry)
 {
-	hlist_add_head(&entry->node, &cache->list);
+	if (cache->nr_cached < IO_ALLOC_CACHE_MAX) {
+		cache->nr_cached++;
+		hlist_add_head(&entry->node, &cache->list);
+		return true;
+	}
+	return false;
 }
 
 static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache)
@@ -26,6 +36,7 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c
 static inline void io_alloc_cache_init(struct io_alloc_cache *cache)
 {
 	INIT_HLIST_HEAD(&cache->list);
+	cache->nr_cached = 0;
 }
 
 static inline void io_alloc_cache_free(struct io_alloc_cache *cache,
@@ -37,5 +48,6 @@ static inline void io_alloc_cache_free(struct io_alloc_cache *cache,
 		hlist_del(node);
 		free(container_of(node, struct io_cache_entry, node));
 	}
+	cache->nr_cached = 0;
 }
 #endif
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index a360a3d390c6..c9c23e459766 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1181,7 +1181,8 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node)
 
 				if (apoll->double_poll)
 					kfree(apoll->double_poll);
-				io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache);
+				if (!io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache))
+					kfree(apoll);
 				req->flags &= ~REQ_F_POLLED;
 			}
 			if (req->flags & IO_REQ_LINK_FLAGS)
-- 
cgit 


From 43e0bbbd0b0e30d232fd8e9e908125b5c49a9fbc Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 7 Jul 2022 14:30:09 -0600
Subject: io_uring: add netmsg cache

For recvmsg/sendmsg, if they don't complete inline, we currently need
to allocate a struct io_async_msghdr for each request. This is a
somewhat large struct.

Hook up sendmsg/recvmsg to use the io_alloc_cache. This reduces the
alloc + free overhead considerably, yielding 4-5% of extra performance
running netbench.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  6 ++--
 io_uring/io_uring.c            |  3 ++
 io_uring/net.c                 | 63 ++++++++++++++++++++++++++++++++++++------
 io_uring/net.h                 | 13 ++++++++-
 4 files changed, 73 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index bf8f95332eda..d54b8b7e0746 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -222,8 +222,7 @@ struct io_ring_ctx {
 		struct io_hash_table	cancel_table_locked;
 		struct list_head	cq_overflow_list;
 		struct io_alloc_cache	apoll_cache;
-		struct xarray		personalities;
-		u32			pers_next;
+		struct io_alloc_cache	netmsg_cache;
 	} ____cacheline_aligned_in_smp;
 
 	/* IRQ completion list, under ->completion_lock */
@@ -241,6 +240,9 @@ struct io_ring_ctx {
 	unsigned int		file_alloc_start;
 	unsigned int		file_alloc_end;
 
+	struct xarray		personalities;
+	u32			pers_next;
+
 	struct {
 		/*
 		 * We cache a range of free CQEs we can use, once exhausted it
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index c9c23e459766..f697ca4e8f55 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -89,6 +89,7 @@
 #include "kbuf.h"
 #include "rsrc.h"
 #include "cancel.h"
+#include "net.h"
 
 #include "timeout.h"
 #include "poll.h"
@@ -297,6 +298,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	INIT_LIST_HEAD(&ctx->io_buffers_cache);
 	io_alloc_cache_init(&ctx->apoll_cache);
+	io_alloc_cache_init(&ctx->netmsg_cache);
 	init_completion(&ctx->ref_comp);
 	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
 	mutex_init(&ctx->uring_lock);
@@ -2469,6 +2471,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		__io_cqring_overflow_flush(ctx, true);
 	io_eventfd_unregister(ctx);
 	io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
+	io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
 	mutex_unlock(&ctx->uring_lock);
 	io_destroy_buffers(ctx);
 	if (ctx->sq_creds)
diff --git a/io_uring/net.c b/io_uring/net.c
index 6679069eeef1..185553174437 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -12,6 +12,7 @@
 
 #include "io_uring.h"
 #include "kbuf.h"
+#include "alloc_cache.h"
 #include "net.h"
 
 #if defined(CONFIG_NET)
@@ -97,18 +98,55 @@ static bool io_net_retry(struct socket *sock, int flags)
 	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
 }
 
+static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_async_msghdr *hdr = req->async_data;
+
+	if (!hdr || issue_flags & IO_URING_F_UNLOCKED)
+		return;
+
+	/* Let normal cleanup path reap it if we fail adding to the cache */
+	if (io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache)) {
+		req->async_data = NULL;
+		req->flags &= ~REQ_F_ASYNC_DATA;
+	}
+}
+
+static struct io_async_msghdr *io_recvmsg_alloc_async(struct io_kiocb *req,
+						      unsigned int issue_flags)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_cache_entry *entry;
+
+	if (!(issue_flags & IO_URING_F_UNLOCKED) &&
+	    (entry = io_alloc_cache_get(&ctx->netmsg_cache)) != NULL) {
+		struct io_async_msghdr *hdr;
+
+		hdr = container_of(entry, struct io_async_msghdr, cache);
+		req->flags |= REQ_F_ASYNC_DATA;
+		req->async_data = hdr;
+		return hdr;
+	}
+
+	if (!io_alloc_async_data(req))
+		return req->async_data;
+
+	return NULL;
+}
+
 static int io_setup_async_msg(struct io_kiocb *req,
-			      struct io_async_msghdr *kmsg)
+			      struct io_async_msghdr *kmsg,
+			      unsigned int issue_flags)
 {
 	struct io_async_msghdr *async_msg = req->async_data;
 
 	if (async_msg)
 		return -EAGAIN;
-	if (io_alloc_async_data(req)) {
+	async_msg = io_recvmsg_alloc_async(req, issue_flags);
+	if (!async_msg) {
 		kfree(kmsg->free_iov);
 		return -ENOMEM;
 	}
-	async_msg = req->async_data;
 	req->flags |= REQ_F_NEED_CLEANUP;
 	memcpy(async_msg, kmsg, sizeof(*kmsg));
 	async_msg->msg.msg_name = &async_msg->addr;
@@ -195,7 +233,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 
 	if (!(req->flags & REQ_F_POLLED) &&
 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
-		return io_setup_async_msg(req, kmsg);
+		return io_setup_async_msg(req, kmsg, issue_flags);
 
 	flags = sr->msg_flags;
 	if (issue_flags & IO_URING_F_NONBLOCK)
@@ -207,13 +245,13 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 
 	if (ret < min_ret) {
 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
-			return io_setup_async_msg(req, kmsg);
+			return io_setup_async_msg(req, kmsg, issue_flags);
 		if (ret == -ERESTARTSYS)
 			ret = -EINTR;
 		if (ret > 0 && io_net_retry(sock, flags)) {
 			sr->done_io += ret;
 			req->flags |= REQ_F_PARTIAL_IO;
-			return io_setup_async_msg(req, kmsg);
+			return io_setup_async_msg(req, kmsg, issue_flags);
 		}
 		req_set_fail(req);
 	}
@@ -221,6 +259,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 	if (kmsg->free_iov)
 		kfree(kmsg->free_iov);
 	req->flags &= ~REQ_F_NEED_CLEANUP;
+	io_netmsg_recycle(req, issue_flags);
 	if (ret >= 0)
 		ret += sr->done_io;
 	else if (sr->done_io)
@@ -495,7 +534,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 
 	if (!(req->flags & REQ_F_POLLED) &&
 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
-		return io_setup_async_msg(req, kmsg);
+		return io_setup_async_msg(req, kmsg, issue_flags);
 
 	if (io_do_buffer_select(req)) {
 		void __user *buf;
@@ -519,13 +558,13 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 	ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags);
 	if (ret < min_ret) {
 		if (ret == -EAGAIN && force_nonblock)
-			return io_setup_async_msg(req, kmsg);
+			return io_setup_async_msg(req, kmsg, issue_flags);
 		if (ret == -ERESTARTSYS)
 			ret = -EINTR;
 		if (ret > 0 && io_net_retry(sock, flags)) {
 			sr->done_io += ret;
 			req->flags |= REQ_F_PARTIAL_IO;
-			return io_setup_async_msg(req, kmsg);
+			return io_setup_async_msg(req, kmsg, issue_flags);
 		}
 		req_set_fail(req);
 	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
@@ -535,6 +574,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 	/* fast path, check for non-NULL to avoid function call */
 	if (kmsg->free_iov)
 		kfree(kmsg->free_iov);
+	io_netmsg_recycle(req, issue_flags);
 	req->flags &= ~REQ_F_NEED_CLEANUP;
 	if (ret > 0)
 		ret += sr->done_io;
@@ -848,4 +888,9 @@ out:
 	io_req_set_res(req, ret, 0);
 	return IOU_OK;
 }
+
+void io_netmsg_cache_free(struct io_cache_entry *entry)
+{
+	kfree(container_of(entry, struct io_async_msghdr, cache));
+}
 #endif
diff --git a/io_uring/net.h b/io_uring/net.h
index 81d71d164770..178a6d8b76e0 100644
--- a/io_uring/net.h
+++ b/io_uring/net.h
@@ -3,9 +3,14 @@
 #include <linux/net.h>
 #include <linux/uio.h>
 
+#include "alloc_cache.h"
+
 #if defined(CONFIG_NET)
 struct io_async_msghdr {
-	struct iovec			fast_iov[UIO_FASTIOV];
+	union {
+		struct iovec		fast_iov[UIO_FASTIOV];
+		struct io_cache_entry	cache;
+	};
 	/* points to an allocated iov, if NULL we use fast_iov instead */
 	struct iovec			*free_iov;
 	struct sockaddr __user		*uaddr;
@@ -40,4 +45,10 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags);
 int io_connect_prep_async(struct io_kiocb *req);
 int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_connect(struct io_kiocb *req, unsigned int issue_flags);
+
+void io_netmsg_cache_free(struct io_cache_entry *entry);
+#else
+static inline void io_netmsg_cache_free(struct io_cache_entry *entry)
+{
+}
 #endif
-- 
cgit 


From 7fa875b8e53c288d616234b9daf417b0650ce1cc Mon Sep 17 00:00:00 2001
From: Dylan Yudaken <dylany@fb.com>
Date: Thu, 14 Jul 2022 04:02:56 -0700
Subject: net: copy from user before calling __copy_msghdr

this is in preparation for multishot receive from io_uring, where it needs
to have access to the original struct user_msghdr.

functionally this should be a no-op.

Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Dylan Yudaken <dylany@fb.com>
Link: https://lore.kernel.org/r/20220714110258.1336200-2-dylany@fb.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/socket.h |  7 +++----
 io_uring/net.c         | 17 +++++++++--------
 net/socket.c           | 37 ++++++++++++++++---------------------
 3 files changed, 28 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 17311ad9f9af..be24f1c8568a 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -416,10 +416,9 @@ extern int recvmsg_copy_msghdr(struct msghdr *msg,
 			       struct user_msghdr __user *umsg, unsigned flags,
 			       struct sockaddr __user **uaddr,
 			       struct iovec **iov);
-extern int __copy_msghdr_from_user(struct msghdr *kmsg,
-				   struct user_msghdr __user *umsg,
-				   struct sockaddr __user **save_addr,
-				   struct iovec __user **uiov, size_t *nsegs);
+extern int __copy_msghdr(struct msghdr *kmsg,
+			 struct user_msghdr *umsg,
+			 struct sockaddr __user **save_addr);
 
 /* helpers which do the actual work for syscalls */
 extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
diff --git a/io_uring/net.c b/io_uring/net.c
index dc9190eafbe7..da7667ed3610 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -329,31 +329,32 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
 				 struct io_async_msghdr *iomsg)
 {
 	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
-	struct iovec __user *uiov;
-	size_t iov_len;
+	struct user_msghdr msg;
 	int ret;
 
-	ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
-					&iomsg->uaddr, &uiov, &iov_len);
+	if (copy_from_user(&msg, sr->umsg, sizeof(*sr->umsg)))
+		return -EFAULT;
+
+	ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr);
 	if (ret)
 		return ret;
 
 	if (req->flags & REQ_F_BUFFER_SELECT) {
-		if (iov_len == 0) {
+		if (msg.msg_iovlen == 0) {
 			sr->len = iomsg->fast_iov[0].iov_len = 0;
 			iomsg->fast_iov[0].iov_base = NULL;
 			iomsg->free_iov = NULL;
-		} else if (iov_len > 1) {
+		} else if (msg.msg_iovlen > 1) {
 			return -EINVAL;
 		} else {
-			if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
+			if (copy_from_user(iomsg->fast_iov, msg.msg_iov, sizeof(*msg.msg_iov)))
 				return -EFAULT;
 			sr->len = iomsg->fast_iov[0].iov_len;
 			iomsg->free_iov = NULL;
 		}
 	} else {
 		iomsg->free_iov = iomsg->fast_iov;
-		ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
+		ret = __import_iovec(READ, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV,
 				     &iomsg->free_iov, &iomsg->msg.msg_iter,
 				     false);
 		if (ret > 0)
diff --git a/net/socket.c b/net/socket.c
index 96300cdc0625..843545c21ec2 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2358,25 +2358,20 @@ struct used_address {
 	unsigned int name_len;
 };
 
-int __copy_msghdr_from_user(struct msghdr *kmsg,
-			    struct user_msghdr __user *umsg,
-			    struct sockaddr __user **save_addr,
-			    struct iovec __user **uiov, size_t *nsegs)
+int __copy_msghdr(struct msghdr *kmsg,
+		  struct user_msghdr *msg,
+		  struct sockaddr __user **save_addr)
 {
-	struct user_msghdr msg;
 	ssize_t err;
 
-	if (copy_from_user(&msg, umsg, sizeof(*umsg)))
-		return -EFAULT;
-
 	kmsg->msg_control_is_user = true;
 	kmsg->msg_get_inq = 0;
-	kmsg->msg_control_user = msg.msg_control;
-	kmsg->msg_controllen = msg.msg_controllen;
-	kmsg->msg_flags = msg.msg_flags;
+	kmsg->msg_control_user = msg->msg_control;
+	kmsg->msg_controllen = msg->msg_controllen;
+	kmsg->msg_flags = msg->msg_flags;
 
-	kmsg->msg_namelen = msg.msg_namelen;
-	if (!msg.msg_name)
+	kmsg->msg_namelen = msg->msg_namelen;
+	if (!msg->msg_name)
 		kmsg->msg_namelen = 0;
 
 	if (kmsg->msg_namelen < 0)
@@ -2386,11 +2381,11 @@ int __copy_msghdr_from_user(struct msghdr *kmsg,
 		kmsg->msg_namelen = sizeof(struct sockaddr_storage);
 
 	if (save_addr)
-		*save_addr = msg.msg_name;
+		*save_addr = msg->msg_name;
 
-	if (msg.msg_name && kmsg->msg_namelen) {
+	if (msg->msg_name && kmsg->msg_namelen) {
 		if (!save_addr) {
-			err = move_addr_to_kernel(msg.msg_name,
+			err = move_addr_to_kernel(msg->msg_name,
 						  kmsg->msg_namelen,
 						  kmsg->msg_name);
 			if (err < 0)
@@ -2401,12 +2396,10 @@ int __copy_msghdr_from_user(struct msghdr *kmsg,
 		kmsg->msg_namelen = 0;
 	}
 
-	if (msg.msg_iovlen > UIO_MAXIOV)
+	if (msg->msg_iovlen > UIO_MAXIOV)
 		return -EMSGSIZE;
 
 	kmsg->msg_iocb = NULL;
-	*uiov = msg.msg_iov;
-	*nsegs = msg.msg_iovlen;
 	return 0;
 }
 
@@ -2418,8 +2411,10 @@ static int copy_msghdr_from_user(struct msghdr *kmsg,
 	struct user_msghdr msg;
 	ssize_t err;
 
-	err = __copy_msghdr_from_user(kmsg, umsg, save_addr, &msg.msg_iov,
-					&msg.msg_iovlen);
+	if (copy_from_user(&msg, umsg, sizeof(*umsg)))
+		return -EFAULT;
+
+	err = __copy_msghdr(kmsg, &msg, save_addr);
 	if (err)
 		return err;
 
-- 
cgit 


From 72c531f8ef3052c682d39dc21dcb5576afda208c Mon Sep 17 00:00:00 2001
From: Dylan Yudaken <dylany@fb.com>
Date: Thu, 14 Jul 2022 04:02:57 -0700
Subject: net: copy from user before calling __get_compat_msghdr

this is in preparation for multishot receive from io_uring, where it needs
to have access to the original struct user_msghdr.

functionally this should be a no-op.

Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Dylan Yudaken <dylany@fb.com>
Link: https://lore.kernel.org/r/20220714110258.1336200-3-dylany@fb.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/net/compat.h |  5 ++---
 io_uring/net.c       | 17 +++++++++--------
 net/compat.c         | 39 +++++++++++++++++----------------------
 3 files changed, 28 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/net/compat.h b/include/net/compat.h
index 595fee069b82..84c163f40f38 100644
--- a/include/net/compat.h
+++ b/include/net/compat.h
@@ -46,9 +46,8 @@ struct compat_rtentry {
 	unsigned short  rt_irtt;        /* Initial RTT                  */
 };
 
-int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg,
-			struct sockaddr __user **save_addr, compat_uptr_t *ptr,
-			compat_size_t *len);
+int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr *msg,
+			struct sockaddr __user **save_addr);
 int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *,
 		      struct sockaddr __user **, struct iovec **);
 int put_cmsg_compat(struct msghdr*, int, int, int, void *);
diff --git a/io_uring/net.c b/io_uring/net.c
index da7667ed3610..5bc3440a8290 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -369,24 +369,25 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
 					struct io_async_msghdr *iomsg)
 {
 	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct compat_msghdr msg;
 	struct compat_iovec __user *uiov;
-	compat_uptr_t ptr;
-	compat_size_t len;
 	int ret;
 
-	ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
-				  &ptr, &len);
+	if (copy_from_user(&msg, sr->umsg_compat, sizeof(msg)))
+		return -EFAULT;
+
+	ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr);
 	if (ret)
 		return ret;
 
-	uiov = compat_ptr(ptr);
+	uiov = compat_ptr(msg.msg_iov);
 	if (req->flags & REQ_F_BUFFER_SELECT) {
 		compat_ssize_t clen;
 
-		if (len == 0) {
+		if (msg.msg_iovlen == 0) {
 			sr->len = 0;
 			iomsg->free_iov = NULL;
-		} else if (len > 1) {
+		} else if (msg.msg_iovlen > 1) {
 			return -EINVAL;
 		} else {
 			if (!access_ok(uiov, sizeof(*uiov)))
@@ -400,7 +401,7 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
 		}
 	} else {
 		iomsg->free_iov = iomsg->fast_iov;
-		ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
+		ret = __import_iovec(READ, (struct iovec __user *)uiov, msg.msg_iovlen,
 				   UIO_FASTIOV, &iomsg->free_iov,
 				   &iomsg->msg.msg_iter, true);
 		if (ret < 0)
diff --git a/net/compat.c b/net/compat.c
index 210fc3b4d0d8..513aa9a3fc64 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -34,20 +34,15 @@
 #include <net/compat.h>
 
 int __get_compat_msghdr(struct msghdr *kmsg,
-			struct compat_msghdr __user *umsg,
-			struct sockaddr __user **save_addr,
-			compat_uptr_t *ptr, compat_size_t *len)
+			struct compat_msghdr *msg,
+			struct sockaddr __user **save_addr)
 {
-	struct compat_msghdr msg;
 	ssize_t err;
 
-	if (copy_from_user(&msg, umsg, sizeof(*umsg)))
-		return -EFAULT;
-
-	kmsg->msg_flags = msg.msg_flags;
-	kmsg->msg_namelen = msg.msg_namelen;
+	kmsg->msg_flags = msg->msg_flags;
+	kmsg->msg_namelen = msg->msg_namelen;
 
-	if (!msg.msg_name)
+	if (!msg->msg_name)
 		kmsg->msg_namelen = 0;
 
 	if (kmsg->msg_namelen < 0)
@@ -57,15 +52,15 @@ int __get_compat_msghdr(struct msghdr *kmsg,
 		kmsg->msg_namelen = sizeof(struct sockaddr_storage);
 
 	kmsg->msg_control_is_user = true;
-	kmsg->msg_control_user = compat_ptr(msg.msg_control);
-	kmsg->msg_controllen = msg.msg_controllen;
+	kmsg->msg_control_user = compat_ptr(msg->msg_control);
+	kmsg->msg_controllen = msg->msg_controllen;
 
 	if (save_addr)
-		*save_addr = compat_ptr(msg.msg_name);
+		*save_addr = compat_ptr(msg->msg_name);
 
-	if (msg.msg_name && kmsg->msg_namelen) {
+	if (msg->msg_name && kmsg->msg_namelen) {
 		if (!save_addr) {
-			err = move_addr_to_kernel(compat_ptr(msg.msg_name),
+			err = move_addr_to_kernel(compat_ptr(msg->msg_name),
 						  kmsg->msg_namelen,
 						  kmsg->msg_name);
 			if (err < 0)
@@ -76,12 +71,10 @@ int __get_compat_msghdr(struct msghdr *kmsg,
 		kmsg->msg_namelen = 0;
 	}
 
-	if (msg.msg_iovlen > UIO_MAXIOV)
+	if (msg->msg_iovlen > UIO_MAXIOV)
 		return -EMSGSIZE;
 
 	kmsg->msg_iocb = NULL;
-	*ptr = msg.msg_iov;
-	*len = msg.msg_iovlen;
 	return 0;
 }
 
@@ -90,15 +83,17 @@ int get_compat_msghdr(struct msghdr *kmsg,
 		      struct sockaddr __user **save_addr,
 		      struct iovec **iov)
 {
-	compat_uptr_t ptr;
-	compat_size_t len;
+	struct compat_msghdr msg;
 	ssize_t err;
 
-	err = __get_compat_msghdr(kmsg, umsg, save_addr, &ptr, &len);
+	if (copy_from_user(&msg, umsg, sizeof(*umsg)))
+		return -EFAULT;
+
+	err = __get_compat_msghdr(kmsg, umsg, save_addr);
 	if (err)
 		return err;
 
-	err = import_iovec(save_addr ? READ : WRITE, compat_ptr(ptr), len,
+	err = import_iovec(save_addr ? READ : WRITE, compat_ptr(msg.msg_iov), msg.msg_iovlen,
 			   UIO_FASTIOV, iov, &kmsg->msg_iter);
 	return err < 0 ? err : 0;
 }
-- 
cgit 


From 9bb66906f23e50d6db1e11f7498b72dfca1982a2 Mon Sep 17 00:00:00 2001
From: Dylan Yudaken <dylany@fb.com>
Date: Thu, 14 Jul 2022 04:02:58 -0700
Subject: io_uring: support multishot in recvmsg

Similar to multishot recv, this will require provided buffers to be
used. However recvmsg is much more complex than recv as it has multiple
outputs. Specifically flags, name, and control messages.

Support this by introducing a new struct io_uring_recvmsg_out with 4
fields. namelen, controllen and flags match the similar out fields in
msghdr from standard recvmsg(2), payloadlen is the length of the payload
following the header.
This struct is placed at the start of the returned buffer. Based on what
the user specifies in struct msghdr, the next bytes of the buffer will be
name (the next msg_namelen bytes), and then control (the next
msg_controllen bytes). The payload will come at the end. The return value
in the CQE is the total used size of the provided buffer.

Signed-off-by: Dylan Yudaken <dylany@fb.com>
Link: https://lore.kernel.org/r/20220714110258.1336200-4-dylany@fb.com
[axboe: style fixups, see link]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |   7 ++
 io_uring/net.c                | 180 +++++++++++++++++++++++++++++++++++++-----
 io_uring/net.h                |   6 ++
 3 files changed, 174 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 499679134961..4c9b11e2e991 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -613,4 +613,11 @@ struct io_uring_file_index_range {
 	__u64	resv;
 };
 
+struct io_uring_recvmsg_out {
+	__u32 namelen;
+	__u32 controllen;
+	__u32 payloadlen;
+	__u32 flags;
+};
+
 #endif
diff --git a/io_uring/net.c b/io_uring/net.c
index 5bc3440a8290..616d5f04cc74 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -325,6 +325,21 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
 	return IOU_OK;
 }
 
+static bool io_recvmsg_multishot_overflow(struct io_async_msghdr *iomsg)
+{
+	unsigned long hdr;
+
+	if (check_add_overflow(sizeof(struct io_uring_recvmsg_out),
+			       (unsigned long)iomsg->namelen, &hdr))
+		return true;
+	if (check_add_overflow(hdr, iomsg->controllen, &hdr))
+		return true;
+	if (hdr > INT_MAX)
+		return true;
+
+	return false;
+}
+
 static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
 				 struct io_async_msghdr *iomsg)
 {
@@ -352,6 +367,13 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
 			sr->len = iomsg->fast_iov[0].iov_len;
 			iomsg->free_iov = NULL;
 		}
+
+		if (req->flags & REQ_F_APOLL_MULTISHOT) {
+			iomsg->namelen = msg.msg_namelen;
+			iomsg->controllen = msg.msg_controllen;
+			if (io_recvmsg_multishot_overflow(iomsg))
+				return -EOVERFLOW;
+		}
 	} else {
 		iomsg->free_iov = iomsg->fast_iov;
 		ret = __import_iovec(READ, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV,
@@ -399,6 +421,13 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
 			sr->len = clen;
 			iomsg->free_iov = NULL;
 		}
+
+		if (req->flags & REQ_F_APOLL_MULTISHOT) {
+			iomsg->namelen = msg.msg_namelen;
+			iomsg->controllen = msg.msg_controllen;
+			if (io_recvmsg_multishot_overflow(iomsg))
+				return -EOVERFLOW;
+		}
 	} else {
 		iomsg->free_iov = iomsg->fast_iov;
 		ret = __import_iovec(READ, (struct iovec __user *)uiov, msg.msg_iovlen,
@@ -455,8 +484,6 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (sr->msg_flags & MSG_ERRQUEUE)
 		req->flags |= REQ_F_CLEAR_POLLIN;
 	if (sr->flags & IORING_RECV_MULTISHOT) {
-		if (req->opcode == IORING_OP_RECVMSG)
-			return -EINVAL;
 		if (!(req->flags & REQ_F_BUFFER_SELECT))
 			return -EINVAL;
 		if (sr->msg_flags & MSG_WAITALL)
@@ -483,12 +510,13 @@ static inline void io_recv_prep_retry(struct io_kiocb *req)
 }
 
 /*
- * Finishes io_recv
+ * Finishes io_recv and io_recvmsg.
  *
  * Returns true if it is actually finished, or false if it should run
  * again (for multishot).
  */
-static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int cflags)
+static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
+				  unsigned int cflags, bool mshot_finished)
 {
 	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
 		io_req_set_res(req, *ret, cflags);
@@ -496,7 +524,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int c
 		return true;
 	}
 
-	if (*ret > 0) {
+	if (!mshot_finished) {
 		if (io_post_aux_cqe(req->ctx, req->cqe.user_data, *ret,
 				    cflags | IORING_CQE_F_MORE, false)) {
 			io_recv_prep_retry(req);
@@ -518,6 +546,90 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int c
 	return true;
 }
 
+static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg,
+				     struct io_sr_msg *sr, void __user **buf,
+				     size_t *len)
+{
+	unsigned long ubuf = (unsigned long) *buf;
+	unsigned long hdr;
+
+	hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
+		kmsg->controllen;
+	if (*len < hdr)
+		return -EFAULT;
+
+	if (kmsg->controllen) {
+		unsigned long control = ubuf + hdr - kmsg->controllen;
+
+		kmsg->msg.msg_control_user = (void *) control;
+		kmsg->msg.msg_controllen = kmsg->controllen;
+	}
+
+	sr->buf = *buf; /* stash for later copy */
+	*buf = (void *) (ubuf + hdr);
+	kmsg->payloadlen = *len = *len - hdr;
+	return 0;
+}
+
+struct io_recvmsg_multishot_hdr {
+	struct io_uring_recvmsg_out msg;
+	struct sockaddr_storage addr;
+};
+
+static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
+				struct io_async_msghdr *kmsg,
+				unsigned int flags, bool *finished)
+{
+	int err;
+	int copy_len;
+	struct io_recvmsg_multishot_hdr hdr;
+
+	if (kmsg->namelen)
+		kmsg->msg.msg_name = &hdr.addr;
+	kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
+	kmsg->msg.msg_namelen = 0;
+
+	if (sock->file->f_flags & O_NONBLOCK)
+		flags |= MSG_DONTWAIT;
+
+	err = sock_recvmsg(sock, &kmsg->msg, flags);
+	*finished = err <= 0;
+	if (err < 0)
+		return err;
+
+	hdr.msg = (struct io_uring_recvmsg_out) {
+		.controllen = kmsg->controllen - kmsg->msg.msg_controllen,
+		.flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT
+	};
+
+	hdr.msg.payloadlen = err;
+	if (err > kmsg->payloadlen)
+		err = kmsg->payloadlen;
+
+	copy_len = sizeof(struct io_uring_recvmsg_out);
+	if (kmsg->msg.msg_namelen > kmsg->namelen)
+		copy_len += kmsg->namelen;
+	else
+		copy_len += kmsg->msg.msg_namelen;
+
+	/*
+	 *      "fromlen shall refer to the value before truncation.."
+	 *                      1003.1g
+	 */
+	hdr.msg.namelen = kmsg->msg.msg_namelen;
+
+	/* ensure that there is no gap between hdr and sockaddr_storage */
+	BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) !=
+		     sizeof(struct io_uring_recvmsg_out));
+	if (copy_to_user(io->buf, &hdr, copy_len)) {
+		*finished = true;
+		return -EFAULT;
+	}
+
+	return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
+			kmsg->controllen + err;
+}
+
 int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
@@ -527,6 +639,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 	unsigned flags;
 	int ret, min_ret = 0;
 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+	bool mshot_finished = true;
 
 	sock = sock_from_file(req->file);
 	if (unlikely(!sock))
@@ -545,16 +658,27 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
 		return io_setup_async_msg(req, kmsg, issue_flags);
 
+retry_multishot:
 	if (io_do_buffer_select(req)) {
 		void __user *buf;
+		size_t len = sr->len;
 
-		buf = io_buffer_select(req, &sr->len, issue_flags);
+		buf = io_buffer_select(req, &len, issue_flags);
 		if (!buf)
 			return -ENOBUFS;
+
+		if (req->flags & REQ_F_APOLL_MULTISHOT) {
+			ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len);
+			if (ret) {
+				io_kbuf_recycle(req, issue_flags);
+				return ret;
+			}
+		}
+
 		kmsg->fast_iov[0].iov_base = buf;
-		kmsg->fast_iov[0].iov_len = sr->len;
+		kmsg->fast_iov[0].iov_len = len;
 		iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1,
-				sr->len);
+				len);
 	}
 
 	flags = sr->msg_flags;
@@ -564,10 +688,23 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
 
 	kmsg->msg.msg_get_inq = 1;
-	ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags);
+	if (req->flags & REQ_F_APOLL_MULTISHOT)
+		ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
+					   &mshot_finished);
+	else
+		ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
+					 kmsg->uaddr, flags);
+
 	if (ret < min_ret) {
-		if (ret == -EAGAIN && force_nonblock)
-			return io_setup_async_msg(req, kmsg, issue_flags);
+		if (ret == -EAGAIN && force_nonblock) {
+			ret = io_setup_async_msg(req, kmsg, issue_flags);
+			if (ret == -EAGAIN && (req->flags & IO_APOLL_MULTI_POLLED) ==
+					       IO_APOLL_MULTI_POLLED) {
+				io_kbuf_recycle(req, issue_flags);
+				return IOU_ISSUE_SKIP_COMPLETE;
+			}
+			return ret;
+		}
 		if (ret == -ERESTARTSYS)
 			ret = -EINTR;
 		if (ret > 0 && io_net_retry(sock, flags)) {
@@ -580,11 +717,6 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 		req_set_fail(req);
 	}
 
-	/* fast path, check for non-NULL to avoid function call */
-	if (kmsg->free_iov)
-		kfree(kmsg->free_iov);
-	io_netmsg_recycle(req, issue_flags);
-	req->flags &= ~REQ_F_NEED_CLEANUP;
 	if (ret > 0)
 		ret += sr->done_io;
 	else if (sr->done_io)
@@ -596,8 +728,18 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 	if (kmsg->msg.msg_inq)
 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
 
-	io_req_set_res(req, ret, cflags);
-	return IOU_OK;
+	if (!io_recv_finish(req, &ret, cflags, mshot_finished))
+		goto retry_multishot;
+
+	if (mshot_finished) {
+		io_netmsg_recycle(req, issue_flags);
+		/* fast path, check for non-NULL to avoid function call */
+		if (kmsg->free_iov)
+			kfree(kmsg->free_iov);
+		req->flags &= ~REQ_F_NEED_CLEANUP;
+	}
+
+	return ret;
 }
 
 int io_recv(struct io_kiocb *req, unsigned int issue_flags)
@@ -684,7 +826,7 @@ out_free:
 	if (msg.msg_inq)
 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
 
-	if (!io_recv_finish(req, &ret, cflags))
+	if (!io_recv_finish(req, &ret, cflags, ret <= 0))
 		goto retry_multishot;
 
 	return ret;
diff --git a/io_uring/net.h b/io_uring/net.h
index 178a6d8b76e0..db20ce9d6546 100644
--- a/io_uring/net.h
+++ b/io_uring/net.h
@@ -9,6 +9,12 @@
 struct io_async_msghdr {
 	union {
 		struct iovec		fast_iov[UIO_FASTIOV];
+		struct {
+			struct iovec	fast_iov_one;
+			__kernel_size_t	controllen;
+			int		namelen;
+			__kernel_size_t	payloadlen;
+		};
 		struct io_cache_entry	cache;
 	};
 	/* points to an allocated iov, if NULL we use fast_iov instead */
-- 
cgit 


From fe6c9c6e3e3e332b998393d214fba9d09ab0acb0 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 23 Jun 2022 10:51:46 -0700
Subject: mm: Add balance_dirty_pages_ratelimited_flags() function

This adds the helper function balance_dirty_pages_ratelimited_flags().
It adds the parameter flags to balance_dirty_pages_ratelimited().
The flags parameter is passed to balance_dirty_pages(). For async
buffered writes the flag value will be BDP_ASYNC.

If balance_dirty_pages() gets called for async buffered write, we don't
want to wait. Instead we need to indicate to the caller that throttling
is needed so that it can stop writing and offload the rest of the write
to a context that can block.

The new helper function is also used by balance_dirty_pages_ratelimited().

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Stefan Roesch <shr@fb.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20220623175157.1715274-4-shr@fb.com
[axboe: fix kerneltest bot 'ret' issue]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/writeback.h |  7 +++++++
 mm/page-writeback.c       | 51 +++++++++++++++++++++++++++++++++++++----------
 2 files changed, 48 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index da21d63f70e2..b8c9610c2313 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -364,7 +364,14 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
 unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
 
 void wb_update_bandwidth(struct bdi_writeback *wb);
+
+/* Invoke balance dirty pages in async mode. */
+#define BDP_ASYNC 0x0001
+
 void balance_dirty_pages_ratelimited(struct address_space *mapping);
+int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
+		unsigned int flags);
+
 bool wb_over_bg_thresh(struct bdi_writeback *wb);
 
 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 90b1998c16a1..d0d466a5c804 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1554,8 +1554,8 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
  * If we're over `background_thresh' then the writeback threads are woken to
  * perform some writeout.
  */
-static void balance_dirty_pages(struct bdi_writeback *wb,
-				unsigned long pages_dirtied)
+static int balance_dirty_pages(struct bdi_writeback *wb,
+			       unsigned long pages_dirtied, unsigned int flags)
 {
 	struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
 	struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
@@ -1575,6 +1575,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
 	struct backing_dev_info *bdi = wb->bdi;
 	bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
 	unsigned long start_time = jiffies;
+	int ret = 0;
 
 	for (;;) {
 		unsigned long now = jiffies;
@@ -1803,6 +1804,10 @@ pause:
 					  period,
 					  pause,
 					  start_time);
+		if (flags & BDP_ASYNC) {
+			ret = -EAGAIN;
+			break;
+		}
 		__set_current_state(TASK_KILLABLE);
 		wb->dirty_sleep = now;
 		io_schedule_timeout(pause);
@@ -1834,6 +1839,7 @@ pause:
 		if (fatal_signal_pending(current))
 			break;
 	}
+	return ret;
 }
 
 static DEFINE_PER_CPU(int, bdp_ratelimits);
@@ -1855,27 +1861,34 @@ static DEFINE_PER_CPU(int, bdp_ratelimits);
 DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
 
 /**
- * balance_dirty_pages_ratelimited - balance dirty memory state
- * @mapping: address_space which was dirtied
+ * balance_dirty_pages_ratelimited_flags - Balance dirty memory state.
+ * @mapping: address_space which was dirtied.
+ * @flags: BDP flags.
  *
  * Processes which are dirtying memory should call in here once for each page
  * which was newly dirtied.  The function will periodically check the system's
  * dirty state and will initiate writeback if needed.
  *
- * Once we're over the dirty memory limit we decrease the ratelimiting
- * by a lot, to prevent individual processes from overshooting the limit
- * by (ratelimit_pages) each.
+ * See balance_dirty_pages_ratelimited() for details.
+ *
+ * Return: If @flags contains BDP_ASYNC, it may return -EAGAIN to
+ * indicate that memory is out of balance and the caller must wait
+ * for I/O to complete.  Otherwise, it will return 0 to indicate
+ * that either memory was already in balance, or it was able to sleep
+ * until the amount of dirty memory returned to balance.
  */
-void balance_dirty_pages_ratelimited(struct address_space *mapping)
+int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
+					unsigned int flags)
 {
 	struct inode *inode = mapping->host;
 	struct backing_dev_info *bdi = inode_to_bdi(inode);
 	struct bdi_writeback *wb = NULL;
 	int ratelimit;
+	int ret = 0;
 	int *p;
 
 	if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
-		return;
+		return ret;
 
 	if (inode_cgwb_enabled(inode))
 		wb = wb_get_create_current(bdi, GFP_KERNEL);
@@ -1915,9 +1928,27 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
 	preempt_enable();
 
 	if (unlikely(current->nr_dirtied >= ratelimit))
-		balance_dirty_pages(wb, current->nr_dirtied);
+		ret = balance_dirty_pages(wb, current->nr_dirtied, flags);
 
 	wb_put(wb);
+	return ret;
+}
+
+/**
+ * balance_dirty_pages_ratelimited - balance dirty memory state.
+ * @mapping: address_space which was dirtied.
+ *
+ * Processes which are dirtying memory should call in here once for each page
+ * which was newly dirtied.  The function will periodically check the system's
+ * dirty state and will initiate writeback if needed.
+ *
+ * Once we're over the dirty memory limit we decrease the ratelimiting
+ * by a lot, to prevent individual processes from overshooting the limit
+ * by (ratelimit_pages) each.
+ */
+void balance_dirty_pages_ratelimited(struct address_space *mapping)
+{
+	balance_dirty_pages_ratelimited_flags(mapping, 0);
 }
 EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
 
-- 
cgit 


From 8017553980d0bbfef3e66c583363828565afd6da Mon Sep 17 00:00:00 2001
From: Stefan Roesch <shr@fb.com>
Date: Thu, 23 Jun 2022 10:51:50 -0700
Subject: fs: add a FMODE_BUF_WASYNC flags for f_mode

This introduces the flag FMODE_BUF_WASYNC. If devices support async
buffered writes, this flag can be set. It also modifies the check in
generic_write_checks to take async buffered writes into consideration.

Signed-off-by: Stefan Roesch <shr@fb.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Link: https://lore.kernel.org/r/20220623175157.1715274-8-shr@fb.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/read_write.c    | 4 +++-
 include/linux/fs.h | 3 +++
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/read_write.c b/fs/read_write.c
index e0777eefd846..319d88825d1c 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1660,7 +1660,9 @@ int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
 	if (iocb->ki_flags & IOCB_APPEND)
 		iocb->ki_pos = i_size_read(inode);
 
-	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+	if ((iocb->ki_flags & IOCB_NOWAIT) &&
+	    !((iocb->ki_flags & IOCB_DIRECT) ||
+	      (file->f_mode & FMODE_BUF_WASYNC)))
 		return -EINVAL;
 
 	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ad5e3520fae..bc84847c201e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -180,6 +180,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* File supports async buffered reads */
 #define FMODE_BUF_RASYNC	((__force fmode_t)0x40000000)
 
+/* File supports async nowait buffered writes */
+#define FMODE_BUF_WASYNC	((__force fmode_t)0x80000000)
+
 /*
  * Attribute flags.  These should be or-ed together to figure out what
  * has been changed!
-- 
cgit 


From 66fa3cedf16abc82d19b943e3289c82e685419d5 Mon Sep 17 00:00:00 2001
From: Stefan Roesch <shr@fb.com>
Date: Thu, 23 Jun 2022 10:51:53 -0700
Subject: fs: Add async write file modification handling.

This adds a file_modified_async() function to return -EAGAIN if the
request either requires to remove privileges or needs to update the file
modification time. This is required for async buffered writes, so the
request gets handled in the io worker of io-uring.

Signed-off-by: Stefan Roesch <shr@fb.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Link: https://lore.kernel.org/r/20220623175157.1715274-11-shr@fb.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/inode.c         | 45 ++++++++++++++++++++++++++++++++++++++++++---
 include/linux/fs.h |  1 +
 2 files changed, 43 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/fs/inode.c b/fs/inode.c
index ff726d99ecc7..259ebf438893 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2116,17 +2116,21 @@ int file_update_time(struct file *file)
 EXPORT_SYMBOL(file_update_time);
 
 /**
- * file_modified - handle mandated vfs changes when modifying a file
+ * file_modified_flags - handle mandated vfs changes when modifying a file
  * @file: file that was modified
+ * @flags: kiocb flags
  *
  * When file has been modified ensure that special
  * file privileges are removed and time settings are updated.
  *
+ * If IOCB_NOWAIT is set, special file privileges will not be removed and
+ * time settings will not be updated. It will return -EAGAIN.
+ *
  * Context: Caller must hold the file's inode lock.
  *
  * Return: 0 on success, negative errno on failure.
  */
-int file_modified(struct file *file)
+static int file_modified_flags(struct file *file, int flags)
 {
 	int ret;
 	struct inode *inode = file_inode(file);
@@ -2136,7 +2140,7 @@ int file_modified(struct file *file)
 	 * Clear the security bits if the process is not being run by root.
 	 * This keeps people from modifying setuid and setgid binaries.
 	 */
-	ret = __file_remove_privs(file, 0);
+	ret = __file_remove_privs(file, flags);
 	if (ret)
 		return ret;
 
@@ -2146,11 +2150,46 @@ int file_modified(struct file *file)
 	ret = inode_needs_update_time(inode, &now);
 	if (ret <= 0)
 		return ret;
+	if (flags & IOCB_NOWAIT)
+		return -EAGAIN;
 
 	return __file_update_time(file, &now, ret);
 }
+
+/**
+ * file_modified - handle mandated vfs changes when modifying a file
+ * @file: file that was modified
+ *
+ * When file has been modified ensure that special
+ * file privileges are removed and time settings are updated.
+ *
+ * Context: Caller must hold the file's inode lock.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int file_modified(struct file *file)
+{
+	return file_modified_flags(file, 0);
+}
 EXPORT_SYMBOL(file_modified);
 
+/**
+ * kiocb_modified - handle mandated vfs changes when modifying a file
+ * @iocb: iocb that was modified
+ *
+ * When file has been modified ensure that special
+ * file privileges are removed and time settings are updated.
+ *
+ * Context: Caller must hold the file's inode lock.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int kiocb_modified(struct kiocb *iocb)
+{
+	return file_modified_flags(iocb->ki_filp, iocb->ki_flags);
+}
+EXPORT_SYMBOL_GPL(kiocb_modified);
+
 int inode_needs_sync(struct inode *inode)
 {
 	if (IS_SYNC(inode))
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bc84847c201e..c0d99b5a166b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2390,6 +2390,7 @@ static inline void file_accessed(struct file *file)
 }
 
 extern int file_modified(struct file *file);
+int kiocb_modified(struct kiocb *iocb);
 
 int sync_inode_metadata(struct inode *inode, int wait);
 
-- 
cgit 


From 1c849b481b3e4f8c36f297cd3aa88ef52a19cee9 Mon Sep 17 00:00:00 2001
From: Stefan Roesch <shr@fb.com>
Date: Thu, 16 Jun 2022 14:22:19 -0700
Subject: io_uring: Add tracepoint for short writes

This adds the io_uring_short_write tracepoint to io_uring. A short write
is issued if not all pages that are required for a write are in the page
cache and the async buffered writes have to return EAGAIN.

Signed-off-by: Stefan Roesch <shr@fb.com>
Link: https://lore.kernel.org/r/20220616212221.2024518-13-shr@fb.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/trace/events/io_uring.h | 25 +++++++++++++++++++++++++
 io_uring/rw.c                   |  3 +++
 2 files changed, 28 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 95a8cfaad15a..c5b21ff0ac85 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -630,6 +630,31 @@ TRACE_EVENT(io_uring_task_work_run,
 		 __entry->tctx, __entry->count, __entry->loops)
 );
 
+TRACE_EVENT(io_uring_short_write,
+
+	TP_PROTO(void *ctx, u64 fpos, u64 wanted, u64 got),
+
+	TP_ARGS(ctx, fpos, wanted, got),
+
+	TP_STRUCT__entry(
+		__field(void *,	ctx)
+		__field(u64,	fpos)
+		__field(u64,	wanted)
+		__field(u64,	got)
+	),
+
+	TP_fast_assign(
+		__entry->ctx	= ctx;
+		__entry->fpos	= fpos;
+		__entry->wanted	= wanted;
+		__entry->got	= got;
+	),
+
+	TP_printk("ring %p, fpos %lld, wanted %lld, got %lld",
+			  __entry->ctx, __entry->fpos,
+			  __entry->wanted, __entry->got)
+);
+
 #endif /* _TRACE_IO_URING_H */
 
 /* This part must be outside protection */
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 1f4a8ff98254..2b784795103c 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -933,6 +933,9 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
 		if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) {
 			struct io_async_rw *rw;
 
+			trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2,
+						req->cqe.res, ret2);
+
 			/* This is a partial write. The file pos has already been
 			 * updated, setup the async struct to complete the request
 			 * in the worker. Also update bytes_done to account for
-- 
cgit 


From e70cb60893ca64b7df06864aa16c1cf6d6c671db Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 12 Jul 2022 21:52:37 +0100
Subject: io_uring: export io_put_task()

Make io_put_task() available to non-core parts of io_uring, we'll need
it for notification infrastructure.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/3686807d4c03b72e389947b0e8692d4d44334ef0.1657643355.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 25 +++++++++++++++++++++++++
 io_uring/io_uring.c            | 11 +----------
 io_uring/io_uring.h            | 10 ++++++++++
 io_uring/tctx.h                | 26 --------------------------
 4 files changed, 36 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index d54b8b7e0746..368c34d14b13 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -4,6 +4,7 @@
 #include <linux/blkdev.h>
 #include <linux/task_work.h>
 #include <linux/bitmap.h>
+#include <linux/llist.h>
 #include <uapi/linux/io_uring.h>
 
 struct io_wq_work_node {
@@ -43,6 +44,30 @@ struct io_hash_table {
 	unsigned		hash_bits;
 };
 
+/*
+ * Arbitrary limit, can be raised if need be
+ */
+#define IO_RINGFD_REG_MAX 16
+
+struct io_uring_task {
+	/* submission side */
+	int				cached_refs;
+	const struct io_ring_ctx 	*last;
+	struct io_wq			*io_wq;
+	struct file			*registered_rings[IO_RINGFD_REG_MAX];
+
+	struct xarray			xa;
+	struct wait_queue_head		wait;
+	atomic_t			in_idle;
+	atomic_t			inflight_tracked;
+	struct percpu_counter		inflight;
+
+	struct { /* task_work */
+		struct llist_head	task_list;
+		struct callback_head	task_work;
+	} ____cacheline_aligned_in_smp;
+};
+
 struct io_uring {
 	u32 head ____cacheline_aligned_in_smp;
 	u32 tail ____cacheline_aligned_in_smp;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4b3fd645d023..7795cfedf6bf 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -608,7 +608,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
 	return ret;
 }
 
-static void __io_put_task(struct task_struct *task, int nr)
+void __io_put_task(struct task_struct *task, int nr)
 {
 	struct io_uring_task *tctx = task->io_uring;
 
@@ -618,15 +618,6 @@ static void __io_put_task(struct task_struct *task, int nr)
 	put_task_struct_many(task, nr);
 }
 
-/* must to be called somewhat shortly after putting a request */
-static inline void io_put_task(struct task_struct *task, int nr)
-{
-	if (likely(task == current))
-		task->io_uring->cached_refs += nr;
-	else
-		__io_put_task(task, nr);
-}
-
 static void io_task_refs_refill(struct io_uring_task *tctx)
 {
 	unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 5db0a60dc04e..b1c0c0a400d8 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -71,6 +71,7 @@ void io_wq_submit_work(struct io_wq_work *work);
 
 void io_free_req(struct io_kiocb *req);
 void io_queue_next(struct io_kiocb *req);
+void __io_put_task(struct task_struct *task, int nr);
 
 bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
 			bool cancel_all);
@@ -258,4 +259,13 @@ static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
 		__io_commit_cqring_flush(ctx);
 }
 
+/* must to be called somewhat shortly after putting a request */
+static inline void io_put_task(struct task_struct *task, int nr)
+{
+	if (likely(task == current))
+		task->io_uring->cached_refs += nr;
+	else
+		__io_put_task(task, nr);
+}
+
 #endif
diff --git a/io_uring/tctx.h b/io_uring/tctx.h
index 8a33ff6e5d91..25974beed4d6 100644
--- a/io_uring/tctx.h
+++ b/io_uring/tctx.h
@@ -1,31 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#include <linux/llist.h>
-
-/*
- * Arbitrary limit, can be raised if need be
- */
-#define IO_RINGFD_REG_MAX 16
-
-struct io_uring_task {
-	/* submission side */
-	int				cached_refs;
-	const struct io_ring_ctx 	*last;
-	struct io_wq			*io_wq;
-	struct file			*registered_rings[IO_RINGFD_REG_MAX];
-
-	struct xarray			xa;
-	struct wait_queue_head		wait;
-	atomic_t			in_idle;
-	atomic_t			inflight_tracked;
-	struct percpu_counter		inflight;
-
-	struct { /* task_work */
-		struct llist_head	task_list;
-		struct callback_head	task_work;
-	} ____cacheline_aligned_in_smp;
-};
-
 struct io_tctx_node {
 	struct list_head	ctx_node;
 	struct task_struct	*task;
-- 
cgit 


From eb42cebb2cf24c48f60c32856a4bba93d42659c8 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 12 Jul 2022 21:52:38 +0100
Subject: io_uring: add zc notification infrastructure

Add internal part of send zerocopy notifications. There are two main
structures, the first one is struct io_notif, which carries inside
struct ubuf_info and maps 1:1 to it. io_uring will be binding a number
of zerocopy send requests to it and ask to complete (aka flush) it. When
flushed and all attached requests and skbs complete, it'll generate one
and only one CQE. There are intended to be passed into the network layer
as struct msghdr::msg_ubuf.

The second concept is notification slots. The userspace will be able to
register an array of slots and subsequently addressing them by the index
in the array. Slots are independent of each other. Each slot can have
only one notifier at a time (called active notifier) but many notifiers
during the lifetime. When active, a notifier not going to post any
completion but the userspace can attach requests to it by specifying
the corresponding slot while issueing send zc requests. Eventually, the
userspace will want to "flush" the notifier losing any way to attach
new requests to it, however it can use the next atomatically added
notifier of this slot or of any other slot.

When the network layer is done with all enqueued skbs attached to a
notifier and doesn't need the specified in them user data, the flushed
notifier will post a CQE.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/3ecf54c31a85762bf679b0a432c9f43ecf7e61cc.1657643355.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |   5 ++
 io_uring/Makefile              |   2 +-
 io_uring/io_uring.c            |   8 ++--
 io_uring/io_uring.h            |   2 +
 io_uring/notif.c               | 102 +++++++++++++++++++++++++++++++++++++++++
 io_uring/notif.h               |  64 ++++++++++++++++++++++++++
 6 files changed, 179 insertions(+), 4 deletions(-)
 create mode 100644 io_uring/notif.c
 create mode 100644 io_uring/notif.h

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 368c34d14b13..f7fab3758cb9 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -34,6 +34,9 @@ struct io_file_table {
 	unsigned int alloc_hint;
 };
 
+struct io_notif;
+struct io_notif_slot;
+
 struct io_hash_bucket {
 	spinlock_t		lock;
 	struct hlist_head	list;
@@ -237,6 +240,8 @@ struct io_ring_ctx {
 		unsigned		nr_user_files;
 		unsigned		nr_user_bufs;
 		struct io_mapped_ubuf	**user_bufs;
+		struct io_notif_slot	*notif_slots;
+		unsigned		nr_notif_slots;
 
 		struct io_submit_state	submit_state;
 
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 466639c289be..8cc8e5387a75 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -7,5 +7,5 @@ obj-$(CONFIG_IO_URING)		+= io_uring.o xattr.o nop.o fs.o splice.o \
 					openclose.o uring_cmd.o epoll.o \
 					statx.o net.o msg_ring.o timeout.o \
 					sqpoll.o fdinfo.o tctx.o poll.o \
-					cancel.o kbuf.o rsrc.o rw.o opdef.o
+					cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o
 obj-$(CONFIG_IO_WQ)		+= io-wq.o
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 7795cfedf6bf..65ac407dd74e 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -90,6 +90,7 @@
 #include "rsrc.h"
 #include "cancel.h"
 #include "net.h"
+#include "notif.h"
 
 #include "timeout.h"
 #include "poll.h"
@@ -732,9 +733,8 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
 	return &rings->cqes[off];
 }
 
-static bool io_fill_cqe_aux(struct io_ring_ctx *ctx,
-			    u64 user_data, s32 res, u32 cflags,
-			    bool allow_overflow)
+bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
+		     bool allow_overflow)
 {
 	struct io_uring_cqe *cqe;
 
@@ -2491,6 +2491,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	}
 #endif
 	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
+	WARN_ON_ONCE(ctx->notif_slots || ctx->nr_notif_slots);
 
 	io_mem_free(ctx->rings);
 	io_mem_free(ctx->sq_sqes);
@@ -2667,6 +2668,7 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 		io_unregister_personality(ctx, index);
 	if (ctx->rings)
 		io_poll_remove_all(ctx, NULL, true);
+	io_notif_unregister(ctx);
 	mutex_unlock(&ctx->uring_lock);
 
 	/* failed during ring init, it couldn't have issued any requests */
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index b1c0c0a400d8..66bfd880d07f 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -33,6 +33,8 @@ void io_req_complete_post(struct io_kiocb *req);
 void __io_req_complete_post(struct io_kiocb *req);
 bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
 		     bool allow_overflow);
+bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
+		     bool allow_overflow);
 void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
 
 struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
diff --git a/io_uring/notif.c b/io_uring/notif.c
new file mode 100644
index 000000000000..6ee948af6a49
--- /dev/null
+++ b/io_uring/notif.c
@@ -0,0 +1,102 @@
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/net.h>
+#include <linux/io_uring.h>
+
+#include "io_uring.h"
+#include "notif.h"
+
+static void __io_notif_complete_tw(struct callback_head *cb)
+{
+	struct io_notif *notif = container_of(cb, struct io_notif, task_work);
+	struct io_ring_ctx *ctx = notif->ctx;
+
+	io_cq_lock(ctx);
+	io_fill_cqe_aux(ctx, notif->tag, 0, notif->seq, true);
+	io_cq_unlock_post(ctx);
+
+	percpu_ref_put(&ctx->refs);
+	kfree(notif);
+}
+
+static inline void io_notif_complete(struct io_notif *notif)
+{
+	__io_notif_complete_tw(&notif->task_work);
+}
+
+static void io_notif_complete_wq(struct work_struct *work)
+{
+	struct io_notif *notif = container_of(work, struct io_notif, commit_work);
+
+	io_notif_complete(notif);
+}
+
+static void io_uring_tx_zerocopy_callback(struct sk_buff *skb,
+					  struct ubuf_info *uarg,
+					  bool success)
+{
+	struct io_notif *notif = container_of(uarg, struct io_notif, uarg);
+
+	if (!refcount_dec_and_test(&uarg->refcnt))
+		return;
+	INIT_WORK(&notif->commit_work, io_notif_complete_wq);
+	queue_work(system_unbound_wq, &notif->commit_work);
+}
+
+struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx,
+				struct io_notif_slot *slot)
+	__must_hold(&ctx->uring_lock)
+{
+	struct io_notif *notif;
+
+	notif = kzalloc(sizeof(*notif), GFP_ATOMIC | __GFP_ACCOUNT);
+	if (!notif)
+		return NULL;
+
+	notif->seq = slot->seq++;
+	notif->tag = slot->tag;
+	notif->ctx = ctx;
+	notif->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
+	notif->uarg.callback = io_uring_tx_zerocopy_callback;
+	/* master ref owned by io_notif_slot, will be dropped on flush */
+	refcount_set(&notif->uarg.refcnt, 1);
+	percpu_ref_get(&ctx->refs);
+	return notif;
+}
+
+static void io_notif_slot_flush(struct io_notif_slot *slot)
+	__must_hold(&ctx->uring_lock)
+{
+	struct io_notif *notif = slot->notif;
+
+	slot->notif = NULL;
+
+	if (WARN_ON_ONCE(in_interrupt()))
+		return;
+	/* drop slot's master ref */
+	if (refcount_dec_and_test(&notif->uarg.refcnt))
+		io_notif_complete(notif);
+}
+
+__cold int io_notif_unregister(struct io_ring_ctx *ctx)
+	__must_hold(&ctx->uring_lock)
+{
+	int i;
+
+	if (!ctx->notif_slots)
+		return -ENXIO;
+
+	for (i = 0; i < ctx->nr_notif_slots; i++) {
+		struct io_notif_slot *slot = &ctx->notif_slots[i];
+
+		if (slot->notif)
+			io_notif_slot_flush(slot);
+	}
+
+	kvfree(ctx->notif_slots);
+	ctx->notif_slots = NULL;
+	ctx->nr_notif_slots = 0;
+	return 0;
+}
\ No newline at end of file
diff --git a/io_uring/notif.h b/io_uring/notif.h
new file mode 100644
index 000000000000..3d7a1d242e17
--- /dev/null
+++ b/io_uring/notif.h
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/net.h>
+#include <linux/uio.h>
+#include <net/sock.h>
+#include <linux/nospec.h>
+
+struct io_notif {
+	struct ubuf_info	uarg;
+	struct io_ring_ctx	*ctx;
+
+	/* cqe->user_data, io_notif_slot::tag if not overridden */
+	u64			tag;
+	/* see struct io_notif_slot::seq */
+	u32			seq;
+
+	union {
+		struct callback_head	task_work;
+		struct work_struct	commit_work;
+	};
+};
+
+struct io_notif_slot {
+	/*
+	 * Current/active notifier. A slot holds only one active notifier at a
+	 * time and keeps one reference to it. Flush releases the reference and
+	 * lazily replaces it with a new notifier.
+	 */
+	struct io_notif		*notif;
+
+	/*
+	 * Default ->user_data for this slot notifiers CQEs
+	 */
+	u64			tag;
+	/*
+	 * Notifiers of a slot live in generations, we create a new notifier
+	 * only after flushing the previous one. Track the sequential number
+	 * for all notifiers and copy it into notifiers's cqe->cflags
+	 */
+	u32			seq;
+};
+
+int io_notif_unregister(struct io_ring_ctx *ctx);
+
+struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx,
+				struct io_notif_slot *slot);
+
+static inline struct io_notif *io_get_notif(struct io_ring_ctx *ctx,
+					    struct io_notif_slot *slot)
+{
+	if (!slot->notif)
+		slot->notif = io_alloc_notif(ctx, slot);
+	return slot->notif;
+}
+
+static inline struct io_notif_slot *io_get_notif_slot(struct io_ring_ctx *ctx,
+						      int idx)
+	__must_hold(&ctx->uring_lock)
+{
+	if (idx >= ctx->nr_notif_slots)
+		return NULL;
+	idx = array_index_nospec(idx, ctx->nr_notif_slots);
+	return &ctx->notif_slots[idx];
+}
-- 
cgit 


From eb4a299b2f95437af6183946c2a2e850621cefdb Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 12 Jul 2022 21:52:39 +0100
Subject: io_uring: cache struct io_notif

kmalloc'ing struct io_notif is too expensive when done frequently, cache
them as many other resources in io_uring. Keep two list, the first one
is from where we're getting notifiers, it's protected by ->uring_lock.
The second is protected by ->completion_lock, to which we queue released
notifiers. Then we splice one list into another when needed.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/9dec18f7fcbab9f4bd40b96e5ae158b119945230.1657643355.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  7 ++++++
 io_uring/io_uring.c            |  3 +++
 io_uring/notif.c               | 57 ++++++++++++++++++++++++++++++++++++------
 io_uring/notif.h               |  5 ++++
 4 files changed, 65 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index f7fab3758cb9..144493cbadb5 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -249,6 +249,9 @@ struct io_ring_ctx {
 		struct xarray		io_bl_xa;
 		struct list_head	io_buffers_cache;
 
+		/* struct io_notif cache, protected by uring_lock */
+		struct list_head	notif_list;
+
 		struct io_hash_table	cancel_table_locked;
 		struct list_head	cq_overflow_list;
 		struct io_alloc_cache	apoll_cache;
@@ -259,6 +262,10 @@ struct io_ring_ctx {
 	struct io_wq_work_list	locked_free_list;
 	unsigned int		locked_free_nr;
 
+	/* struct io_notif cache protected by completion_lock */
+	struct list_head	notif_list_locked;
+	unsigned int		notif_locked_nr;
+
 	const struct cred	*sq_creds;	/* cred used for __io_sq_thread() */
 	struct io_sq_data	*sq_data;	/* if using sq thread polling */
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 65ac407dd74e..20e65d45ca1c 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -321,6 +321,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_WQ_LIST(&ctx->locked_free_list);
 	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
 	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
+	INIT_LIST_HEAD(&ctx->notif_list);
+	INIT_LIST_HEAD(&ctx->notif_list_locked);
 	return ctx;
 err:
 	kfree(ctx->dummy_ubuf);
@@ -2493,6 +2495,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
 	WARN_ON_ONCE(ctx->notif_slots || ctx->nr_notif_slots);
 
+	io_notif_cache_purge(ctx);
 	io_mem_free(ctx->rings);
 	io_mem_free(ctx->sq_sqes);
 
diff --git a/io_uring/notif.c b/io_uring/notif.c
index 6ee948af6a49..b257db2120b4 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -15,10 +15,12 @@ static void __io_notif_complete_tw(struct callback_head *cb)
 
 	io_cq_lock(ctx);
 	io_fill_cqe_aux(ctx, notif->tag, 0, notif->seq, true);
+
+	list_add(&notif->cache_node, &ctx->notif_list_locked);
+	ctx->notif_locked_nr++;
 	io_cq_unlock_post(ctx);
 
 	percpu_ref_put(&ctx->refs);
-	kfree(notif);
 }
 
 static inline void io_notif_complete(struct io_notif *notif)
@@ -45,21 +47,62 @@ static void io_uring_tx_zerocopy_callback(struct sk_buff *skb,
 	queue_work(system_unbound_wq, &notif->commit_work);
 }
 
+static void io_notif_splice_cached(struct io_ring_ctx *ctx)
+	__must_hold(&ctx->uring_lock)
+{
+	spin_lock(&ctx->completion_lock);
+	list_splice_init(&ctx->notif_list_locked, &ctx->notif_list);
+	ctx->notif_locked_nr = 0;
+	spin_unlock(&ctx->completion_lock);
+}
+
+void io_notif_cache_purge(struct io_ring_ctx *ctx)
+	__must_hold(&ctx->uring_lock)
+{
+	io_notif_splice_cached(ctx);
+
+	while (!list_empty(&ctx->notif_list)) {
+		struct io_notif *notif = list_first_entry(&ctx->notif_list,
+						struct io_notif, cache_node);
+
+		list_del(&notif->cache_node);
+		kfree(notif);
+	}
+}
+
+static inline bool io_notif_has_cached(struct io_ring_ctx *ctx)
+	__must_hold(&ctx->uring_lock)
+{
+	if (likely(!list_empty(&ctx->notif_list)))
+		return true;
+	if (data_race(READ_ONCE(ctx->notif_locked_nr) <= IO_NOTIF_SPLICE_BATCH))
+		return false;
+	io_notif_splice_cached(ctx);
+	return !list_empty(&ctx->notif_list);
+}
+
 struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx,
 				struct io_notif_slot *slot)
 	__must_hold(&ctx->uring_lock)
 {
 	struct io_notif *notif;
 
-	notif = kzalloc(sizeof(*notif), GFP_ATOMIC | __GFP_ACCOUNT);
-	if (!notif)
-		return NULL;
+	if (likely(io_notif_has_cached(ctx))) {
+		notif = list_first_entry(&ctx->notif_list,
+					 struct io_notif, cache_node);
+		list_del(&notif->cache_node);
+	} else {
+		notif = kzalloc(sizeof(*notif), GFP_ATOMIC | __GFP_ACCOUNT);
+		if (!notif)
+			return NULL;
+		/* pre-initialise some fields */
+		notif->ctx = ctx;
+		notif->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
+		notif->uarg.callback = io_uring_tx_zerocopy_callback;
+	}
 
 	notif->seq = slot->seq++;
 	notif->tag = slot->tag;
-	notif->ctx = ctx;
-	notif->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
-	notif->uarg.callback = io_uring_tx_zerocopy_callback;
 	/* master ref owned by io_notif_slot, will be dropped on flush */
 	refcount_set(&notif->uarg.refcnt, 1);
 	percpu_ref_get(&ctx->refs);
diff --git a/io_uring/notif.h b/io_uring/notif.h
index 3d7a1d242e17..b23c9c0515bb 100644
--- a/io_uring/notif.h
+++ b/io_uring/notif.h
@@ -5,6 +5,8 @@
 #include <net/sock.h>
 #include <linux/nospec.h>
 
+#define IO_NOTIF_SPLICE_BATCH	32
+
 struct io_notif {
 	struct ubuf_info	uarg;
 	struct io_ring_ctx	*ctx;
@@ -13,6 +15,8 @@ struct io_notif {
 	u64			tag;
 	/* see struct io_notif_slot::seq */
 	u32			seq;
+	/* hook into ctx->notif_list and ctx->notif_list_locked */
+	struct list_head	cache_node;
 
 	union {
 		struct callback_head	task_work;
@@ -41,6 +45,7 @@ struct io_notif_slot {
 };
 
 int io_notif_unregister(struct io_ring_ctx *ctx);
+void io_notif_cache_purge(struct io_ring_ctx *ctx);
 
 struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx,
 				struct io_notif_slot *slot);
-- 
cgit 


From bc24d6bd32df0be19df3d30e74be4ba56493c0e2 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 12 Jul 2022 21:52:42 +0100
Subject: io_uring: add notification slot registration

Let the userspace to register and unregister notification slots.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/a0aa8161fe3ebb2a4cc6e5dbd0cffb96e6881cf5.1657643355.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h | 17 +++++++++++++++++
 io_uring/io_uring.c           |  9 +++++++++
 io_uring/notif.c              | 43 +++++++++++++++++++++++++++++++++++++++++++
 io_uring/notif.h              |  3 +++
 4 files changed, 72 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 4c9b11e2e991..dcfc7a0bda0c 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -457,6 +457,10 @@ enum {
 	/* register a range of fixed file slots for automatic slot allocation */
 	IORING_REGISTER_FILE_ALLOC_RANGE	= 25,
 
+	/* zerocopy notification API */
+	IORING_REGISTER_NOTIFIERS		= 26,
+	IORING_UNREGISTER_NOTIFIERS		= 27,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };
@@ -503,6 +507,19 @@ struct io_uring_rsrc_update2 {
 	__u32 resv2;
 };
 
+struct io_uring_notification_slot {
+	__u64 tag;
+	__u64 resv[3];
+};
+
+struct io_uring_notification_register {
+	__u32 nr_slots;
+	__u32 resv;
+	__u64 resv2;
+	__u64 data;
+	__u64 resv3;
+};
+
 /* Skip updating fd indexes set to this value in the fd table */
 #define IORING_REGISTER_FILES_SKIP	(-2)
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 20e65d45ca1c..cae11374456e 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3870,6 +3870,15 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_file_alloc_range(ctx, arg);
 		break;
+	case IORING_REGISTER_NOTIFIERS:
+		ret = io_notif_register(ctx, arg, nr_args);
+		break;
+	case IORING_UNREGISTER_NOTIFIERS:
+		ret = -EINVAL;
+		if (arg || nr_args)
+			break;
+		ret = io_notif_unregister(ctx);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/io_uring/notif.c b/io_uring/notif.c
index 0a2e98bd74f6..e6d98dc208c7 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -162,5 +162,48 @@ __cold int io_notif_unregister(struct io_ring_ctx *ctx)
 	kvfree(ctx->notif_slots);
 	ctx->notif_slots = NULL;
 	ctx->nr_notif_slots = 0;
+	io_notif_cache_purge(ctx);
+	return 0;
+}
+
+__cold int io_notif_register(struct io_ring_ctx *ctx,
+			     void __user *arg, unsigned int size)
+	__must_hold(&ctx->uring_lock)
+{
+	struct io_uring_notification_slot __user *slots;
+	struct io_uring_notification_slot slot;
+	struct io_uring_notification_register reg;
+	unsigned i;
+
+	if (ctx->nr_notif_slots)
+		return -EBUSY;
+	if (size != sizeof(reg))
+		return -EINVAL;
+	if (copy_from_user(&reg, arg, sizeof(reg)))
+		return -EFAULT;
+	if (!reg.nr_slots || reg.nr_slots > IORING_MAX_NOTIF_SLOTS)
+		return -EINVAL;
+	if (reg.resv || reg.resv2 || reg.resv3)
+		return -EINVAL;
+
+	slots = u64_to_user_ptr(reg.data);
+	ctx->notif_slots = kvcalloc(reg.nr_slots, sizeof(ctx->notif_slots[0]),
+				GFP_KERNEL_ACCOUNT);
+	if (!ctx->notif_slots)
+		return -ENOMEM;
+
+	for (i = 0; i < reg.nr_slots; i++, ctx->nr_notif_slots++) {
+		struct io_notif_slot *notif_slot = &ctx->notif_slots[i];
+
+		if (copy_from_user(&slot, &slots[i], sizeof(slot))) {
+			io_notif_unregister(ctx);
+			return -EFAULT;
+		}
+		if (slot.resv[0] | slot.resv[1] | slot.resv[2]) {
+			io_notif_unregister(ctx);
+			return -EINVAL;
+		}
+		notif_slot->tag = slot.tag;
+	}
 	return 0;
 }
diff --git a/io_uring/notif.h b/io_uring/notif.h
index 1dd48efb7744..00efe164bdc4 100644
--- a/io_uring/notif.h
+++ b/io_uring/notif.h
@@ -6,6 +6,7 @@
 #include <linux/nospec.h>
 
 #define IO_NOTIF_SPLICE_BATCH	32
+#define IORING_MAX_NOTIF_SLOTS (1U << 10)
 
 struct io_notif {
 	struct ubuf_info	uarg;
@@ -48,6 +49,8 @@ struct io_notif_slot {
 	u32			seq;
 };
 
+int io_notif_register(struct io_ring_ctx *ctx,
+		      void __user *arg, unsigned int size);
 int io_notif_unregister(struct io_ring_ctx *ctx);
 void io_notif_cache_purge(struct io_ring_ctx *ctx);
 
-- 
cgit 


From 06a5464be84e4ae48394d34441baf34bf9706827 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 12 Jul 2022 21:52:43 +0100
Subject: io_uring: wire send zc request type

Add a new io_uring opcode IORING_OP_SENDZC. The main distinction from
IORING_OP_SEND is that the user should specify a notification slot
index in sqe::notification_idx and the buffers are safe to reuse only
when the used notification is flushed and completes.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/a80387c6a68ce9cf99b3b6ef6f71068468761fb7.1657643355.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |  5 +++
 io_uring/net.c                | 94 +++++++++++++++++++++++++++++++++++++++++++
 io_uring/net.h                |  3 ++
 io_uring/opdef.c              | 15 +++++++
 4 files changed, 117 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index dcfc7a0bda0c..82bf2991e9bd 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -66,6 +66,10 @@ struct io_uring_sqe {
 	union {
 		__s32	splice_fd_in;
 		__u32	file_index;
+		struct {
+			__u16	notification_idx;
+			__u16	__pad;
+		};
 	};
 	union {
 		struct {
@@ -197,6 +201,7 @@ enum io_uring_op {
 	IORING_OP_GETXATTR,
 	IORING_OP_SOCKET,
 	IORING_OP_URING_CMD,
+	IORING_OP_SENDZC_NOTIF,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
diff --git a/io_uring/net.c b/io_uring/net.c
index bbc9c603641a..89a8678ce69b 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -14,6 +14,7 @@
 #include "kbuf.h"
 #include "alloc_cache.h"
 #include "net.h"
+#include "notif.h"
 
 #if defined(CONFIG_NET)
 struct io_shutdown {
@@ -59,6 +60,15 @@ struct io_sr_msg {
 	unsigned int			flags;
 };
 
+struct io_sendzc {
+	struct file			*file;
+	void __user			*buf;
+	size_t				len;
+	u16				slot_idx;
+	unsigned			msg_flags;
+	unsigned			flags;
+};
+
 #define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED)
 
 int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -834,6 +844,90 @@ out_free:
 	return ret;
 }
 
+int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_sendzc *zc = io_kiocb_to_cmd(req);
+
+	if (READ_ONCE(sqe->addr2) || READ_ONCE(sqe->__pad2[0]) ||
+	    READ_ONCE(sqe->addr3))
+		return -EINVAL;
+
+	zc->flags = READ_ONCE(sqe->ioprio);
+	if (zc->flags & ~IORING_RECVSEND_POLL_FIRST)
+		return -EINVAL;
+
+	zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	zc->len = READ_ONCE(sqe->len);
+	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
+	zc->slot_idx = READ_ONCE(sqe->notification_idx);
+	if (zc->msg_flags & MSG_DONTWAIT)
+		req->flags |= REQ_F_NOWAIT;
+#ifdef CONFIG_COMPAT
+	if (req->ctx->compat)
+		zc->msg_flags |= MSG_CMSG_COMPAT;
+#endif
+	return 0;
+}
+
+int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_sendzc *zc = io_kiocb_to_cmd(req);
+	struct io_notif_slot *notif_slot;
+	struct io_notif *notif;
+	struct msghdr msg;
+	struct iovec iov;
+	struct socket *sock;
+	unsigned msg_flags;
+	int ret, min_ret = 0;
+
+	if (!(req->flags & REQ_F_POLLED) &&
+	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
+		return -EAGAIN;
+
+	if (issue_flags & IO_URING_F_UNLOCKED)
+		return -EAGAIN;
+	sock = sock_from_file(req->file);
+	if (unlikely(!sock))
+		return -ENOTSOCK;
+
+	notif_slot = io_get_notif_slot(ctx, zc->slot_idx);
+	if (!notif_slot)
+		return -EINVAL;
+	notif = io_get_notif(ctx, notif_slot);
+	if (!notif)
+		return -ENOMEM;
+
+	msg.msg_name = NULL;
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_namelen = 0;
+
+	ret = import_single_range(WRITE, zc->buf, zc->len, &iov, &msg.msg_iter);
+	if (unlikely(ret))
+		return ret;
+
+	msg_flags = zc->msg_flags | MSG_ZEROCOPY;
+	if (issue_flags & IO_URING_F_NONBLOCK)
+		msg_flags |= MSG_DONTWAIT;
+	if (msg_flags & MSG_WAITALL)
+		min_ret = iov_iter_count(&msg.msg_iter);
+
+	msg.msg_flags = msg_flags;
+	msg.msg_ubuf = &notif->uarg;
+	msg.sg_from_iter = NULL;
+	ret = sock_sendmsg(sock, &msg);
+
+	if (unlikely(ret < min_ret)) {
+		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
+			return -EAGAIN;
+		return ret == -ERESTARTSYS ? -EINTR : ret;
+	}
+
+	io_req_set_res(req, ret, 0);
+	return IOU_OK;
+}
+
 int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_accept *accept = io_kiocb_to_cmd(req);
diff --git a/io_uring/net.h b/io_uring/net.h
index db20ce9d6546..7c438d39c089 100644
--- a/io_uring/net.h
+++ b/io_uring/net.h
@@ -52,6 +52,9 @@ int io_connect_prep_async(struct io_kiocb *req);
 int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_connect(struct io_kiocb *req, unsigned int issue_flags);
 
+int io_sendzc(struct io_kiocb *req, unsigned int issue_flags);
+int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+
 void io_netmsg_cache_free(struct io_cache_entry *entry);
 #else
 static inline void io_netmsg_cache_free(struct io_cache_entry *entry)
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index a7b84b43e6c2..7ab19bbf3126 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -470,6 +470,21 @@ const struct io_op_def io_op_defs[] = {
 		.issue			= io_uring_cmd,
 		.prep_async		= io_uring_cmd_prep_async,
 	},
+	[IORING_OP_SENDZC_NOTIF] = {
+		.name			= "SENDZC_NOTIF",
+		.needs_file		= 1,
+		.unbound_nonreg_file	= 1,
+		.pollout		= 1,
+		.audit_skip		= 1,
+		.ioprio			= 1,
+#if defined(CONFIG_NET)
+		.prep			= io_sendzc_prep,
+		.issue			= io_sendzc,
+#else
+		.prep			= io_eopnotsupp_prep,
+#endif
+
+	},
 };
 
 const char *io_uring_get_opcode(u8 opcode)
-- 
cgit 


From 092aeedb750a9fad0f0252d6067fc91d76ca44bd Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 12 Jul 2022 21:52:45 +0100
Subject: io_uring: allow to pass addr into sendzc

Allow to specify an address to zerocopy sends making it more like
sendto(2).

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/70417a8f7c5b51ab454690bae08adc0c187f89e8.1657643355.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |  2 +-
 io_uring/net.c                | 18 ++++++++++++++++--
 2 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 82bf2991e9bd..0736e2773a5d 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -68,7 +68,7 @@ struct io_uring_sqe {
 		__u32	file_index;
 		struct {
 			__u16	notification_idx;
-			__u16	__pad;
+			__u16	addr_len;
 		};
 	};
 	union {
diff --git a/io_uring/net.c b/io_uring/net.c
index 2d04a70b0632..61414d865cd7 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -67,6 +67,8 @@ struct io_sendzc {
 	u16				slot_idx;
 	unsigned			msg_flags;
 	unsigned			flags;
+	unsigned			addr_len;
+	void __user			*addr;
 };
 
 #define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED)
@@ -848,8 +850,7 @@ int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_sendzc *zc = io_kiocb_to_cmd(req);
 
-	if (READ_ONCE(sqe->addr2) || READ_ONCE(sqe->__pad2[0]) ||
-	    READ_ONCE(sqe->addr3))
+	if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))
 		return -EINVAL;
 
 	zc->flags = READ_ONCE(sqe->ioprio);
@@ -862,6 +863,10 @@ int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	zc->slot_idx = READ_ONCE(sqe->notification_idx);
 	if (zc->msg_flags & MSG_DONTWAIT)
 		req->flags |= REQ_F_NOWAIT;
+
+	zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+	zc->addr_len = READ_ONCE(sqe->addr_len);
+
 #ifdef CONFIG_COMPAT
 	if (req->ctx->compat)
 		zc->msg_flags |= MSG_CMSG_COMPAT;
@@ -871,6 +876,7 @@ int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
 {
+	struct sockaddr_storage address;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_sendzc *zc = io_kiocb_to_cmd(req);
 	struct io_notif_slot *notif_slot;
@@ -908,6 +914,14 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
 		return ret;
 	mm_account_pinned_pages(&notif->uarg.mmp, zc->len);
 
+	if (zc->addr) {
+		ret = move_addr_to_kernel(zc->addr, zc->addr_len, &address);
+		if (unlikely(ret < 0))
+			return ret;
+		msg.msg_name = (struct sockaddr *)&address;
+		msg.msg_namelen = zc->addr_len;
+	}
+
 	msg_flags = zc->msg_flags | MSG_ZEROCOPY;
 	if (issue_flags & IO_URING_F_NONBLOCK)
 		msg_flags |= MSG_DONTWAIT;
-- 
cgit 


From 10c7d33ecd51619e453cf6aeee8e326f8ba5cfea Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 12 Jul 2022 21:52:46 +0100
Subject: io_uring: sendzc with fixed buffers

Allow zerocopy sends to use fixed buffers. There is an optimisation for
this case, the network layer don't need to reference the pages, see
SKBFL_MANAGED_FRAG_REFS, so io_uring have to ensure validity of fixed
buffers until the notifier is released.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/e1d8bd1b5934e541d90c1824eb4020ae3f5f43f3.1657643355.git.asml.silence@gmail.com
[axboe: fold in 32-bit pointer cast warning fix]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |  6 +++++-
 io_uring/net.c                | 29 ++++++++++++++++++++++++-----
 2 files changed, 29 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 0736e2773a5d..f1a9ff9b9ea7 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -272,9 +272,13 @@ enum io_uring_op {
  * IORING_RECV_MULTISHOT	Multishot recv. Sets IORING_CQE_F_MORE if
  *				the handler will continue to report
  *				CQEs on behalf of the same SQE.
+ *
+ * IORING_RECVSEND_FIXED_BUF	Use registered buffers, the index is stored in
+ *				the buf_index field.
  */
 #define IORING_RECVSEND_POLL_FIRST	(1U << 0)
-#define IORING_RECV_MULTISHOT	(1U << 1)
+#define IORING_RECV_MULTISHOT		(1U << 1)
+#define IORING_RECVSEND_FIXED_BUF	(1U << 2)
 
 /*
  * accept flags stored in sqe->ioprio
diff --git a/io_uring/net.c b/io_uring/net.c
index 61414d865cd7..ab443c52dcfd 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -15,6 +15,7 @@
 #include "alloc_cache.h"
 #include "net.h"
 #include "notif.h"
+#include "rsrc.h"
 
 #if defined(CONFIG_NET)
 struct io_shutdown {
@@ -849,13 +850,23 @@ out_free:
 int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_sendzc *zc = io_kiocb_to_cmd(req);
+	struct io_ring_ctx *ctx = req->ctx;
 
 	if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))
 		return -EINVAL;
 
 	zc->flags = READ_ONCE(sqe->ioprio);
-	if (zc->flags & ~IORING_RECVSEND_POLL_FIRST)
+	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF))
 		return -EINVAL;
+	if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
+		unsigned idx = READ_ONCE(sqe->buf_index);
+
+		if (unlikely(idx >= ctx->nr_user_bufs))
+			return -EFAULT;
+		idx = array_index_nospec(idx, ctx->nr_user_bufs);
+		req->imu = READ_ONCE(ctx->user_bufs[idx]);
+		io_req_set_rsrc_node(req, ctx, 0);
+	}
 
 	zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	zc->len = READ_ONCE(sqe->len);
@@ -909,10 +920,18 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
 	msg.msg_controllen = 0;
 	msg.msg_namelen = 0;
 
-	ret = import_single_range(WRITE, zc->buf, zc->len, &iov, &msg.msg_iter);
-	if (unlikely(ret))
-		return ret;
-	mm_account_pinned_pages(&notif->uarg.mmp, zc->len);
+	if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
+		ret = io_import_fixed(WRITE, &msg.msg_iter, req->imu,
+					(u64)(uintptr_t)zc->buf, zc->len);
+		if (unlikely(ret))
+				return ret;
+	} else {
+		ret = import_single_range(WRITE, zc->buf, zc->len, &iov,
+					  &msg.msg_iter);
+		if (unlikely(ret))
+			return ret;
+		mm_account_pinned_pages(&notif->uarg.mmp, zc->len);
+	}
 
 	if (zc->addr) {
 		ret = move_addr_to_kernel(zc->addr, zc->addr_len, &address);
-- 
cgit 


From 63809137ebb58f0aa2ce359117422686e3304f45 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 12 Jul 2022 21:52:47 +0100
Subject: io_uring: flush notifiers after sendzc

Allow to flush notifiers as a part of sendzc request by setting
IORING_SENDZC_FLUSH flag. When the sendzc request succeedes it will
flush the used [active] notifier.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/e0b4d9a6797e2fd6092824fe42953db7a519bbc8.1657643355.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |  4 ++++
 io_uring/io_uring.c           | 11 +----------
 io_uring/io_uring.h           | 10 ++++++++++
 io_uring/net.c                |  5 ++++-
 io_uring/notif.c              |  2 +-
 io_uring/notif.h              | 11 +++++++++++
 6 files changed, 31 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index f1a9ff9b9ea7..45272eb37d10 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -275,10 +275,14 @@ enum io_uring_op {
  *
  * IORING_RECVSEND_FIXED_BUF	Use registered buffers, the index is stored in
  *				the buf_index field.
+ *
+ * IORING_RECVSEND_NOTIF_FLUSH	Flush a notification after a successful
+ *				successful. Only for zerocopy sends.
  */
 #define IORING_RECVSEND_POLL_FIRST	(1U << 0)
 #define IORING_RECV_MULTISHOT		(1U << 1)
 #define IORING_RECVSEND_FIXED_BUF	(1U << 2)
+#define IORING_RECVSEND_NOTIF_FLUSH	(1U << 3)
 
 /*
  * accept flags stored in sqe->ioprio
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index cae11374456e..1d600a63643b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -621,7 +621,7 @@ void __io_put_task(struct task_struct *task, int nr)
 	put_task_struct_many(task, nr);
 }
 
-static void io_task_refs_refill(struct io_uring_task *tctx)
+void io_task_refs_refill(struct io_uring_task *tctx)
 {
 	unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
 
@@ -630,15 +630,6 @@ static void io_task_refs_refill(struct io_uring_task *tctx)
 	tctx->cached_refs += refill;
 }
 
-static inline void io_get_task_refs(int nr)
-{
-	struct io_uring_task *tctx = current->io_uring;
-
-	tctx->cached_refs -= nr;
-	if (unlikely(tctx->cached_refs < 0))
-		io_task_refs_refill(tctx);
-}
-
 static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
 {
 	struct io_uring_task *tctx = task->io_uring;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 66bfd880d07f..cc81a9d1fd4d 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -74,6 +74,7 @@ void io_wq_submit_work(struct io_wq_work *work);
 void io_free_req(struct io_kiocb *req);
 void io_queue_next(struct io_kiocb *req);
 void __io_put_task(struct task_struct *task, int nr);
+void io_task_refs_refill(struct io_uring_task *tctx);
 
 bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
 			bool cancel_all);
@@ -270,4 +271,13 @@ static inline void io_put_task(struct task_struct *task, int nr)
 		__io_put_task(task, nr);
 }
 
+static inline void io_get_task_refs(int nr)
+{
+	struct io_uring_task *tctx = current->io_uring;
+
+	tctx->cached_refs -= nr;
+	if (unlikely(tctx->cached_refs < 0))
+		io_task_refs_refill(tctx);
+}
+
 #endif
diff --git a/io_uring/net.c b/io_uring/net.c
index ab443c52dcfd..9ac2ce37c522 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -856,7 +856,8 @@ int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return -EINVAL;
 
 	zc->flags = READ_ONCE(sqe->ioprio);
-	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF))
+	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST |
+			  IORING_RECVSEND_FIXED_BUF | IORING_RECVSEND_NOTIF_FLUSH))
 		return -EINVAL;
 	if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
 		unsigned idx = READ_ONCE(sqe->buf_index);
@@ -958,6 +959,8 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
 		return ret == -ERESTARTSYS ? -EINTR : ret;
 	}
 
+	if (zc->flags & IORING_RECVSEND_NOTIF_FLUSH)
+		io_notif_slot_flush_submit(notif_slot, 0);
 	io_req_set_res(req, ret, 0);
 	return IOU_OK;
 }
diff --git a/io_uring/notif.c b/io_uring/notif.c
index c5179e5c1cd6..a93887451bbb 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -133,7 +133,7 @@ struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx,
 	return notif;
 }
 
-static void io_notif_slot_flush(struct io_notif_slot *slot)
+void io_notif_slot_flush(struct io_notif_slot *slot)
 	__must_hold(&ctx->uring_lock)
 {
 	struct io_notif *notif = slot->notif;
diff --git a/io_uring/notif.h b/io_uring/notif.h
index 00efe164bdc4..6cd73d7b965b 100644
--- a/io_uring/notif.h
+++ b/io_uring/notif.h
@@ -54,6 +54,7 @@ int io_notif_register(struct io_ring_ctx *ctx,
 int io_notif_unregister(struct io_ring_ctx *ctx);
 void io_notif_cache_purge(struct io_ring_ctx *ctx);
 
+void io_notif_slot_flush(struct io_notif_slot *slot);
 struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx,
 				struct io_notif_slot *slot);
 
@@ -74,3 +75,13 @@ static inline struct io_notif_slot *io_get_notif_slot(struct io_ring_ctx *ctx,
 	idx = array_index_nospec(idx, ctx->nr_notif_slots);
 	return &ctx->notif_slots[idx];
 }
+
+static inline void io_notif_slot_flush_submit(struct io_notif_slot *slot,
+					      unsigned int issue_flags)
+{
+	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
+		slot->notif->task = current;
+		io_get_task_refs(1);
+	}
+	io_notif_slot_flush(slot);
+}
-- 
cgit 


From 4379d5f15b3fd4224c37841029178aa8082a242e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 12 Jul 2022 21:52:48 +0100
Subject: io_uring: rename IORING_OP_FILES_UPDATE

IORING_OP_FILES_UPDATE will be a more generic opcode serving different
resource types, rename it into IORING_OP_RSRC_UPDATE and add subtype
handling.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0a907133907d9af3415a8a7aa1802c6aa97c03c6.1657643355.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h | 12 +++++++++++-
 io_uring/opdef.c              |  9 +++++----
 io_uring/rsrc.c               | 17 +++++++++++++++--
 io_uring/rsrc.h               |  4 ++--
 4 files changed, 33 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 45272eb37d10..210a00ab6301 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -174,7 +174,8 @@ enum io_uring_op {
 	IORING_OP_FALLOCATE,
 	IORING_OP_OPENAT,
 	IORING_OP_CLOSE,
-	IORING_OP_FILES_UPDATE,
+	IORING_OP_RSRC_UPDATE,
+	IORING_OP_FILES_UPDATE = IORING_OP_RSRC_UPDATE,
 	IORING_OP_STATX,
 	IORING_OP_READ,
 	IORING_OP_WRITE,
@@ -223,6 +224,7 @@ enum io_uring_op {
 #define IORING_TIMEOUT_ETIME_SUCCESS	(1U << 5)
 #define IORING_TIMEOUT_CLOCK_MASK	(IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
 #define IORING_TIMEOUT_UPDATE_MASK	(IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
+
 /*
  * sqe->splice_flags
  * extends splice(2) flags
@@ -289,6 +291,14 @@ enum io_uring_op {
  */
 #define IORING_ACCEPT_MULTISHOT	(1U << 0)
 
+
+/*
+ * IORING_OP_RSRC_UPDATE flags
+ */
+enum {
+	IORING_RSRC_UPDATE_FILES,
+};
+
 /*
  * IORING_OP_MSG_RING command types, stored in sqe->addr
  */
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 7ab19bbf3126..72dd2b2d8a9d 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -246,12 +246,13 @@ const struct io_op_def io_op_defs[] = {
 		.prep			= io_close_prep,
 		.issue			= io_close,
 	},
-	[IORING_OP_FILES_UPDATE] = {
+	[IORING_OP_RSRC_UPDATE] = {
 		.audit_skip		= 1,
 		.iopoll			= 1,
-		.name			= "FILES_UPDATE",
-		.prep			= io_files_update_prep,
-		.issue			= io_files_update,
+		.name			= "RSRC_UPDATE",
+		.prep			= io_rsrc_update_prep,
+		.issue			= io_rsrc_update,
+		.ioprio			= 1,
 	},
 	[IORING_OP_STATX] = {
 		.audit_skip		= 1,
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 7f66b0e25674..fc2b337e6c25 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -21,6 +21,7 @@ struct io_rsrc_update {
 	u64				arg;
 	u32				nr_args;
 	u32				offset;
+	int				type;
 };
 
 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
@@ -657,7 +658,7 @@ __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
 	return -EINVAL;
 }
 
-int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
 
@@ -671,6 +672,7 @@ int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (!up->nr_args)
 		return -EINVAL;
 	up->arg = READ_ONCE(sqe->addr);
+	up->type = READ_ONCE(sqe->ioprio);
 	return 0;
 }
 
@@ -713,7 +715,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req,
 	return ret;
 }
 
-int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
+static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
 	struct io_ring_ctx *ctx = req->ctx;
@@ -742,6 +744,17 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 	return IOU_OK;
 }
 
+int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
+
+	switch (up->type) {
+	case IORING_RSRC_UPDATE_FILES:
+		return io_files_update(req, issue_flags);
+	}
+	return -EINVAL;
+}
+
 int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
 			  struct io_rsrc_node *node, void *rsrc)
 {
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index af342fd239d0..21813a23215f 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -167,6 +167,6 @@ static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
 	return &data->tags[table_idx][off];
 }
 
-int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
-int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags);
+int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 #endif
-- 
cgit 


From 492dddb4f6e3a5839c27d41ff1fecdbe6c3ab851 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 12 Jul 2022 21:52:49 +0100
Subject: io_uring: add zc notification flush requests

Overlay notification control onto IORING_OP_RSRC_UPDATE (former
IORING_OP_FILES_UPDATE). It allows to flush a range of zc notifications
from slots with indexes [sqe->off, sqe->off+sqe->len). If sqe->arg is
not zero, it also copies sqe->arg as a new tag for all flushed
notifications.

Note, it doesn't flush a notification of a slot if there was no requests
attached to it (since last flush or registration).

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/df13e2363400682a73dd9e71c3b990b8d1ff0333.1657643355.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |  1 +
 io_uring/rsrc.c               | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 210a00ab6301..1463cfecb56b 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -297,6 +297,7 @@ enum io_uring_op {
  */
 enum {
 	IORING_RSRC_UPDATE_FILES,
+	IORING_RSRC_UPDATE_NOTIF,
 };
 
 /*
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index fc2b337e6c25..9165fdf64269 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -15,6 +15,7 @@
 #include "io_uring.h"
 #include "openclose.h"
 #include "rsrc.h"
+#include "notif.h"
 
 struct io_rsrc_update {
 	struct file			*file;
@@ -744,6 +745,41 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 	return IOU_OK;
 }
 
+static int io_notif_update(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
+	struct io_ring_ctx *ctx = req->ctx;
+	unsigned len = up->nr_args;
+	unsigned idx_end, idx = up->offset;
+	int ret = 0;
+
+	io_ring_submit_lock(ctx, issue_flags);
+	if (unlikely(check_add_overflow(idx, len, &idx_end))) {
+		ret = -EOVERFLOW;
+		goto out;
+	}
+	if (unlikely(idx_end > ctx->nr_notif_slots)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	for (; idx < idx_end; idx++) {
+		struct io_notif_slot *slot = &ctx->notif_slots[idx];
+
+		if (!slot->notif)
+			continue;
+		if (up->arg)
+			slot->tag = up->arg;
+		io_notif_slot_flush_submit(slot, issue_flags);
+	}
+out:
+	io_ring_submit_unlock(ctx, issue_flags);
+	if (ret < 0)
+		req_set_fail(req);
+	io_req_set_res(req, ret, 0);
+	return IOU_OK;
+}
+
 int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
@@ -751,6 +787,8 @@ int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags)
 	switch (up->type) {
 	case IORING_RSRC_UPDATE_FILES:
 		return io_files_update(req, issue_flags);
+	case IORING_RSRC_UPDATE_NOTIF:
+		return io_notif_update(req, issue_flags);
 	}
 	return -EINVAL;
 }
-- 
cgit 


From 9d9b010f12cc4e254bbba32d79c965b564a8957f Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben-linux@fluff.org>
Date: Sun, 24 Jul 2022 23:21:52 +0100
Subject: irqchip/mmp: Declare init functions in common header file

The functions icu_init_irq and mmp2_init_icu are exported
from this code, so declare them in the header file to avoid
the following sparse warnings:

drivers/irqchip/irq-mmp.c:248:13: warning: symbol 'icu_init_irq' was not declared. Should it be static?
drivers/irqchip/irq-mmp.c:271:13: warning: symbol 'mmp2_init_icu' was not declared. Should it be static?

Signed-off-by: Ben Dooks <ben-linux@fluff.org>
[maz: fixup commit message]
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20220724222152.551850-1-ben-linux@fluff.org
---
 arch/arm/mach-mmp/mmp2.h    | 2 +-
 arch/arm/mach-mmp/pxa168.h  | 2 +-
 arch/arm/mach-mmp/pxa910.h  | 2 +-
 include/linux/irqchip/mmp.h | 3 +++
 4 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/arch/arm/mach-mmp/mmp2.h b/arch/arm/mach-mmp/mmp2.h
index 3ebc1bb13f71..7f80b90248fb 100644
--- a/arch/arm/mach-mmp/mmp2.h
+++ b/arch/arm/mach-mmp/mmp2.h
@@ -5,13 +5,13 @@
 #include <linux/platform_data/pxa_sdhci.h>
 
 extern void mmp2_timer_init(void);
-extern void __init mmp2_init_icu(void);
 extern void __init mmp2_init_irq(void);
 extern void mmp2_clear_pmic_int(void);
 
 #include <linux/i2c.h>
 #include <linux/platform_data/i2c-pxa.h>
 #include <linux/platform_data/dma-mmp_tdma.h>
+#include <linux/irqchip/mmp.h>
 
 #include "devices.h"
 
diff --git a/arch/arm/mach-mmp/pxa168.h b/arch/arm/mach-mmp/pxa168.h
index 34f907cd165a..c1547e098f09 100644
--- a/arch/arm/mach-mmp/pxa168.h
+++ b/arch/arm/mach-mmp/pxa168.h
@@ -5,7 +5,6 @@
 #include <linux/reboot.h>
 
 extern void pxa168_timer_init(void);
-extern void __init icu_init_irq(void);
 extern void __init pxa168_init_irq(void);
 extern void pxa168_restart(enum reboot_mode, const char *);
 extern void pxa168_clear_keypad_wakeup(void);
@@ -18,6 +17,7 @@ extern void pxa168_clear_keypad_wakeup(void);
 #include <linux/pxa168_eth.h>
 #include <linux/platform_data/mv_usb.h>
 #include <linux/soc/mmp/cputype.h>
+#include <linux/irqchip/mmp.h>
 
 #include "devices.h"
 
diff --git a/arch/arm/mach-mmp/pxa910.h b/arch/arm/mach-mmp/pxa910.h
index 6ace5a8aa15b..7d229214065a 100644
--- a/arch/arm/mach-mmp/pxa910.h
+++ b/arch/arm/mach-mmp/pxa910.h
@@ -3,13 +3,13 @@
 #define __ASM_MACH_PXA910_H
 
 extern void pxa910_timer_init(void);
-extern void __init icu_init_irq(void);
 extern void __init pxa910_init_irq(void);
 
 #include <linux/i2c.h>
 #include <linux/platform_data/i2c-pxa.h>
 #include <linux/platform_data/mtd-nand-pxa3xx.h>
 #include <video/mmp_disp.h>
+#include <linux/irqchip/mmp.h>
 
 #include "devices.h"
 
diff --git a/include/linux/irqchip/mmp.h b/include/linux/irqchip/mmp.h
index cb8455c87c8a..aa1813749a4f 100644
--- a/include/linux/irqchip/mmp.h
+++ b/include/linux/irqchip/mmp.h
@@ -4,4 +4,7 @@
 
 extern struct irq_chip icu_irq_chip;
 
+extern void icu_init_irq(void);
+extern void mmp2_init_icu(void);
+
 #endif	/* __IRQCHIP_MMP_H */
-- 
cgit 


From b4023554b1fb49f73a09e5f346a5facbf27d7383 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Mon, 25 Jul 2022 09:58:35 +0200
Subject: USB: cdc: add control-signal defines

Add defines for the Control Signal Bitmap Values from section 6.2.14
SetControlLineState of the CDC specification version 1.1.

Signed-off-by: Johan Hovold <johan@kernel.org>
Link: https://lore.kernel.org/r/20220725075841.1187-2-johan@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/usb/cdc.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/usb/cdc.h b/include/uapi/linux/usb/cdc.h
index 6d61550959ef..372c81425cae 100644
--- a/include/uapi/linux/usb/cdc.h
+++ b/include/uapi/linux/usb/cdc.h
@@ -271,6 +271,10 @@ struct usb_cdc_line_coding {
 	__u8	bDataBits;
 } __attribute__ ((packed));
 
+/* Control Signal Bitmap Values from 6.2.14 SetControlLineState */
+#define USB_CDC_CTRL_DTR			(1 << 0)
+#define USB_CDC_CTRL_RTS			(1 << 1)
+
 /* table 62; bits in multicast filter */
 #define	USB_CDC_PACKET_TYPE_PROMISCUOUS		(1 << 0)
 #define	USB_CDC_PACKET_TYPE_ALL_MULTICAST	(1 << 1) /* no filter */
-- 
cgit 


From a0a3202b44a9fdf2a1f6330a0d176aee76c8631d Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Mon, 25 Jul 2022 09:58:36 +0200
Subject: USB: cdc: add serial-state defines

Add defines for the serial-state bitmap values from section 6.3.5
SerialState of the CDC specification version 1.1.

Note that the bTxCarrier and bRxCarrier bits have been named after their
RS-232 signal equivalents DSR and DCD.

Signed-off-by: Johan Hovold <johan@kernel.org>
Link: https://lore.kernel.org/r/20220725075841.1187-3-johan@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/usb/cdc.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/usb/cdc.h b/include/uapi/linux/usb/cdc.h
index 372c81425cae..78caa9bdc4ae 100644
--- a/include/uapi/linux/usb/cdc.h
+++ b/include/uapi/linux/usb/cdc.h
@@ -306,6 +306,15 @@ struct usb_cdc_notification {
 	__le16	wLength;
 } __attribute__ ((packed));
 
+/* UART State Bitmap Values from 6.3.5 SerialState */
+#define USB_CDC_SERIAL_STATE_DCD		(1 << 0)
+#define USB_CDC_SERIAL_STATE_DSR		(1 << 1)
+#define USB_CDC_SERIAL_STATE_BREAK		(1 << 2)
+#define USB_CDC_SERIAL_STATE_RING_SIGNAL	(1 << 3)
+#define USB_CDC_SERIAL_STATE_FRAMING		(1 << 4)
+#define USB_CDC_SERIAL_STATE_PARITY		(1 << 5)
+#define USB_CDC_SERIAL_STATE_OVERRUN		(1 << 6)
+
 struct usb_cdc_speed_change {
 	__le32	DLBitRRate;	/* contains the downlink bit rate (IN pipe) */
 	__le32	ULBitRate;	/* contains the uplink bit rate (OUT pipe) */
-- 
cgit 


From d349ab99eec7ab0f977fc4aac27aa476907acf90 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Sun, 17 Jul 2022 12:35:24 +0200
Subject: random: handle archrandom with multiple longs

The archrandom interface was originally designed for x86, which supplies
RDRAND/RDSEED for receiving random words into registers, resulting in
one function to generate an int and another to generate a long. However,
other architectures don't follow this.

On arm64, the SMCCC TRNG interface can return between one and three
longs. On s390, the CPACF TRNG interface can return arbitrary amounts,
with four longs having the same cost as one. On UML, the os_getrandom()
interface can return arbitrary amounts.

So change the api signature to take a "max_longs" parameter designating
the maximum number of longs requested, and then return the number of
longs generated.

Since callers need to check this return value and loop anyway, each arch
implementation does not bother implementing its own loop to try again to
fill the maximum number of longs. Additionally, all existing callers
pass in a constant max_longs parameter. Taken together, these two things
mean that the codegen doesn't really change much for one-word-at-a-time
platforms, while performance is greatly improved on platforms such as
s390.

Acked-by: Heiko Carstens <hca@linux.ibm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Acked-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 arch/arm64/include/asm/archrandom.h   | 102 ++++++++++++++++------------------
 arch/arm64/kernel/kaslr.c             |   2 +-
 arch/powerpc/include/asm/archrandom.h |  30 ++--------
 arch/powerpc/kvm/book3s_hv.c          |   2 +-
 arch/s390/include/asm/archrandom.h    |  29 +++-------
 arch/um/include/asm/archrandom.h      |  21 +++----
 arch/x86/include/asm/archrandom.h     |  41 ++------------
 arch/x86/kernel/espfix_64.c           |   2 +-
 drivers/char/random.c                 |  45 ++++++++++-----
 include/asm-generic/archrandom.h      |  18 ++----
 include/linux/random.h                |  12 ++--
 11 files changed, 116 insertions(+), 188 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/include/asm/archrandom.h b/arch/arm64/include/asm/archrandom.h
index c3b9fa56af67..109e2a4454be 100644
--- a/arch/arm64/include/asm/archrandom.h
+++ b/arch/arm64/include/asm/archrandom.h
@@ -58,7 +58,7 @@ static inline bool __arm64_rndrrs(unsigned long *v)
 	return ok;
 }
 
-static inline bool __must_check arch_get_random_long(unsigned long *v)
+static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs)
 {
 	/*
 	 * Only support the generic interface after we have detected
@@ -66,27 +66,15 @@ static inline bool __must_check arch_get_random_long(unsigned long *v)
 	 * cpufeature code and with potential scheduling between CPUs
 	 * with and without the feature.
 	 */
-	if (cpus_have_const_cap(ARM64_HAS_RNG) && __arm64_rndr(v))
-		return true;
-	return false;
+	if (max_longs && cpus_have_const_cap(ARM64_HAS_RNG) && __arm64_rndr(v))
+		return 1;
+	return 0;
 }
 
-static inline bool __must_check arch_get_random_int(unsigned int *v)
+static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs)
 {
-	if (cpus_have_const_cap(ARM64_HAS_RNG)) {
-		unsigned long val;
-
-		if (__arm64_rndr(&val)) {
-			*v = val;
-			return true;
-		}
-	}
-	return false;
-}
-
-static inline bool __must_check arch_get_random_seed_long(unsigned long *v)
-{
-	struct arm_smccc_res res;
+	if (!max_longs)
+		return 0;
 
 	/*
 	 * We prefer the SMCCC call, since its semantics (return actual
@@ -95,10 +83,23 @@ static inline bool __must_check arch_get_random_seed_long(unsigned long *v)
 	 * (the output of a pseudo RNG freshly seeded by a TRNG).
 	 */
 	if (smccc_trng_available) {
-		arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_RND64, 64, &res);
+		struct arm_smccc_res res;
+
+		max_longs = min_t(size_t, 3, max_longs);
+		arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_RND64, max_longs * 64, &res);
 		if ((int)res.a0 >= 0) {
-			*v = res.a3;
-			return true;
+			switch (max_longs) {
+			case 3:
+				*v++ = res.a1;
+				fallthrough;
+			case 2:
+				*v++ = res.a2;
+				fallthrough;
+			case 1:
+				*v++ = res.a3;
+				break;
+			}
+			return max_longs;
 		}
 	}
 
@@ -108,32 +109,9 @@ static inline bool __must_check arch_get_random_seed_long(unsigned long *v)
 	 * enough to implement this API if no other entropy source exists.
 	 */
 	if (cpus_have_const_cap(ARM64_HAS_RNG) && __arm64_rndrrs(v))
-		return true;
+		return 1;
 
-	return false;
-}
-
-static inline bool __must_check arch_get_random_seed_int(unsigned int *v)
-{
-	struct arm_smccc_res res;
-	unsigned long val;
-
-	if (smccc_trng_available) {
-		arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_RND64, 32, &res);
-		if ((int)res.a0 >= 0) {
-			*v = res.a3 & GENMASK(31, 0);
-			return true;
-		}
-	}
-
-	if (cpus_have_const_cap(ARM64_HAS_RNG)) {
-		if (__arm64_rndrrs(&val)) {
-			*v = val;
-			return true;
-		}
-	}
-
-	return false;
+	return 0;
 }
 
 static inline bool __init __early_cpu_has_rndr(void)
@@ -143,26 +121,40 @@ static inline bool __init __early_cpu_has_rndr(void)
 	return (ftr >> ID_AA64ISAR0_EL1_RNDR_SHIFT) & 0xf;
 }
 
-static inline bool __init __must_check
-arch_get_random_seed_long_early(unsigned long *v)
+static inline size_t __init __must_check
+arch_get_random_seed_longs_early(unsigned long *v, size_t max_longs)
 {
 	WARN_ON(system_state != SYSTEM_BOOTING);
 
+	if (!max_longs)
+		return 0;
+
 	if (smccc_trng_available) {
 		struct arm_smccc_res res;
 
-		arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_RND64, 64, &res);
+		max_longs = min_t(size_t, 3, max_longs);
+		arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_RND64, max_longs * 64, &res);
 		if ((int)res.a0 >= 0) {
-			*v = res.a3;
-			return true;
+			switch (max_longs) {
+			case 3:
+				*v++ = res.a1;
+				fallthrough;
+			case 2:
+				*v++ = res.a2;
+				fallthrough;
+			case 1:
+				*v++ = res.a3;
+				break;
+			}
+			return max_longs;
 		}
 	}
 
 	if (__early_cpu_has_rndr() && __arm64_rndr(v))
-		return true;
+		return 1;
 
-	return false;
+	return 0;
 }
-#define arch_get_random_seed_long_early arch_get_random_seed_long_early
+#define arch_get_random_seed_longs_early arch_get_random_seed_longs_early
 
 #endif /* _ASM_ARCHRANDOM_H */
diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c
index 418b2bba1521..c5d541f358d3 100644
--- a/arch/arm64/kernel/kaslr.c
+++ b/arch/arm64/kernel/kaslr.c
@@ -106,7 +106,7 @@ u64 __init kaslr_early_init(void)
 	 * and supported.
 	 */
 
-	if (arch_get_random_seed_long_early(&raw))
+	if (arch_get_random_seed_longs_early(&raw, 1))
 		seed ^= raw;
 
 	if (!seed) {
diff --git a/arch/powerpc/include/asm/archrandom.h b/arch/powerpc/include/asm/archrandom.h
index 25ba65df6b1a..0e365c5b2396 100644
--- a/arch/powerpc/include/asm/archrandom.h
+++ b/arch/powerpc/include/asm/archrandom.h
@@ -4,34 +4,16 @@
 
 #include <asm/machdep.h>
 
-static inline bool __must_check arch_get_random_long(unsigned long *v)
+static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs)
 {
-	return false;
+	return 0;
 }
 
-static inline bool __must_check arch_get_random_int(unsigned int *v)
+static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs)
 {
-	return false;
-}
-
-static inline bool __must_check arch_get_random_seed_long(unsigned long *v)
-{
-	if (ppc_md.get_random_seed)
-		return ppc_md.get_random_seed(v);
-
-	return false;
-}
-
-static inline bool __must_check arch_get_random_seed_int(unsigned int *v)
-{
-	unsigned long val;
-	bool rc;
-
-	rc = arch_get_random_seed_long(&val);
-	if (rc)
-		*v = val;
-
-	return rc;
+	if (max_longs && ppc_md.get_random_seed && ppc_md.get_random_seed(v))
+		return 1;
+	return 0;
 }
 
 #ifdef CONFIG_PPC_POWERNV
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index e08fb3124dca..631062cde6b4 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1207,7 +1207,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 		break;
 #endif
 	case H_RANDOM:
-		if (!arch_get_random_seed_long(&vcpu->arch.regs.gpr[4]))
+		if (!arch_get_random_seed_longs(&vcpu->arch.regs.gpr[4], 1))
 			ret = H_HARDWARE;
 		break;
 	case H_RPT_INVALIDATE:
diff --git a/arch/s390/include/asm/archrandom.h b/arch/s390/include/asm/archrandom.h
index 0a1c2e66c709..cf5e000df0a1 100644
--- a/arch/s390/include/asm/archrandom.h
+++ b/arch/s390/include/asm/archrandom.h
@@ -18,34 +18,19 @@
 DECLARE_STATIC_KEY_FALSE(s390_arch_random_available);
 extern atomic64_t s390_arch_random_counter;
 
-static inline bool __must_check arch_get_random_long(unsigned long *v)
+static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs)
 {
-	return false;
+	return 0;
 }
 
-static inline bool __must_check arch_get_random_int(unsigned int *v)
-{
-	return false;
-}
-
-static inline bool __must_check arch_get_random_seed_long(unsigned long *v)
-{
-	if (static_branch_likely(&s390_arch_random_available)) {
-		cpacf_trng(NULL, 0, (u8 *)v, sizeof(*v));
-		atomic64_add(sizeof(*v), &s390_arch_random_counter);
-		return true;
-	}
-	return false;
-}
-
-static inline bool __must_check arch_get_random_seed_int(unsigned int *v)
+static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs)
 {
 	if (static_branch_likely(&s390_arch_random_available)) {
-		cpacf_trng(NULL, 0, (u8 *)v, sizeof(*v));
-		atomic64_add(sizeof(*v), &s390_arch_random_counter);
-		return true;
+		cpacf_trng(NULL, 0, (u8 *)v, max_longs * sizeof(*v));
+		atomic64_add(max_longs * sizeof(*v), &s390_arch_random_counter);
+		return max_longs;
 	}
-	return false;
+	return 0;
 }
 
 #endif /* _ASM_S390_ARCHRANDOM_H */
diff --git a/arch/um/include/asm/archrandom.h b/arch/um/include/asm/archrandom.h
index 2f24cb96391d..24e16c979c51 100644
--- a/arch/um/include/asm/archrandom.h
+++ b/arch/um/include/asm/archrandom.h
@@ -7,24 +7,19 @@
 /* This is from <os.h>, but better not to #include that in a global header here. */
 ssize_t os_getrandom(void *buf, size_t len, unsigned int flags);
 
-static inline bool __must_check arch_get_random_long(unsigned long *v)
+static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs)
 {
-	return os_getrandom(v, sizeof(*v), 0) == sizeof(*v);
-}
+	ssize_t ret;
 
-static inline bool __must_check arch_get_random_int(unsigned int *v)
-{
-	return os_getrandom(v, sizeof(*v), 0) == sizeof(*v);
-}
-
-static inline bool __must_check arch_get_random_seed_long(unsigned long *v)
-{
-	return false;
+	ret = os_getrandom(v, max_longs * sizeof(*v), 0);
+	if (ret < 0)
+		return 0;
+	return ret / sizeof(*v);
 }
 
-static inline bool __must_check arch_get_random_seed_int(unsigned int *v)
+static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs)
 {
-	return false;
+	return 0;
 }
 
 #endif
diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h
index fb235b696175..02bae8e0758b 100644
--- a/arch/x86/include/asm/archrandom.h
+++ b/arch/x86/include/asm/archrandom.h
@@ -31,20 +31,6 @@ static inline bool __must_check rdrand_long(unsigned long *v)
 	return false;
 }
 
-static inline bool __must_check rdrand_int(unsigned int *v)
-{
-	bool ok;
-	unsigned int retry = RDRAND_RETRY_LOOPS;
-	do {
-		asm volatile("rdrand %[out]"
-			     CC_SET(c)
-			     : CC_OUT(c) (ok), [out] "=r" (*v));
-		if (ok)
-			return true;
-	} while (--retry);
-	return false;
-}
-
 static inline bool __must_check rdseed_long(unsigned long *v)
 {
 	bool ok;
@@ -54,38 +40,19 @@ static inline bool __must_check rdseed_long(unsigned long *v)
 	return ok;
 }
 
-static inline bool __must_check rdseed_int(unsigned int *v)
-{
-	bool ok;
-	asm volatile("rdseed %[out]"
-		     CC_SET(c)
-		     : CC_OUT(c) (ok), [out] "=r" (*v));
-	return ok;
-}
-
 /*
  * These are the generic interfaces; they must not be declared if the
  * stubs in <linux/random.h> are to be invoked.
  */
 
-static inline bool __must_check arch_get_random_long(unsigned long *v)
-{
-	return static_cpu_has(X86_FEATURE_RDRAND) ? rdrand_long(v) : false;
-}
-
-static inline bool __must_check arch_get_random_int(unsigned int *v)
-{
-	return static_cpu_has(X86_FEATURE_RDRAND) ? rdrand_int(v) : false;
-}
-
-static inline bool __must_check arch_get_random_seed_long(unsigned long *v)
+static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs)
 {
-	return static_cpu_has(X86_FEATURE_RDSEED) ? rdseed_long(v) : false;
+	return max_longs && static_cpu_has(X86_FEATURE_RDRAND) && rdrand_long(v) ? 1 : 0;
 }
 
-static inline bool __must_check arch_get_random_seed_int(unsigned int *v)
+static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs)
 {
-	return static_cpu_has(X86_FEATURE_RDSEED) ? rdseed_int(v) : false;
+	return max_longs && static_cpu_has(X86_FEATURE_RDSEED) && rdseed_long(v) ? 1 : 0;
 }
 
 #ifndef CONFIG_UML
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 4fe7af58cfe1..9417d5aa7305 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -100,7 +100,7 @@ static void init_espfix_random(void)
 	 * This is run before the entropy pools are initialized,
 	 * but this is hopefully better than nothing.
 	 */
-	if (!arch_get_random_long(&rand)) {
+	if (!arch_get_random_longs(&rand, 1)) {
 		/* The constant is an arbitrary large prime */
 		rand = rdtsc();
 		rand *= 0xc345c6b72fd16123UL;
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 0c6568ae5f68..7bf11fa66265 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -596,12 +596,20 @@ static void extract_entropy(void *buf, size_t len)
 		unsigned long rdseed[32 / sizeof(long)];
 		size_t counter;
 	} block;
-	size_t i;
+	size_t i, longs;
 
-	for (i = 0; i < ARRAY_SIZE(block.rdseed); ++i) {
-		if (!arch_get_random_seed_long(&block.rdseed[i]) &&
-		    !arch_get_random_long(&block.rdseed[i]))
-			block.rdseed[i] = random_get_entropy();
+	for (i = 0; i < ARRAY_SIZE(block.rdseed);) {
+		longs = arch_get_random_seed_longs(&block.rdseed[i], ARRAY_SIZE(block.rdseed) - i);
+		if (longs) {
+			i += longs;
+			continue;
+		}
+		longs = arch_get_random_longs(&block.rdseed[i], ARRAY_SIZE(block.rdseed) - i);
+		if (longs) {
+			i += longs;
+			continue;
+		}
+		block.rdseed[i++] = random_get_entropy();
 	}
 
 	spin_lock_irqsave(&input_pool.lock, flags);
@@ -776,22 +784,31 @@ static struct notifier_block pm_notifier = { .notifier_call = random_pm_notifica
 int __init random_init(const char *command_line)
 {
 	ktime_t now = ktime_get_real();
-	unsigned int i, arch_bits;
-	unsigned long entropy;
+	size_t i, longs, arch_bits;
+	unsigned long entropy[BLAKE2S_BLOCK_SIZE / sizeof(long)];
 
 #if defined(LATENT_ENTROPY_PLUGIN)
 	static const u8 compiletime_seed[BLAKE2S_BLOCK_SIZE] __initconst __latent_entropy;
 	_mix_pool_bytes(compiletime_seed, sizeof(compiletime_seed));
 #endif
 
-	for (i = 0, arch_bits = BLAKE2S_BLOCK_SIZE * 8;
-	     i < BLAKE2S_BLOCK_SIZE; i += sizeof(entropy)) {
-		if (!arch_get_random_seed_long_early(&entropy) &&
-		    !arch_get_random_long_early(&entropy)) {
-			entropy = random_get_entropy();
-			arch_bits -= sizeof(entropy) * 8;
+	for (i = 0, arch_bits = sizeof(entropy) * 8; i < ARRAY_SIZE(entropy);) {
+		longs = arch_get_random_seed_longs(entropy, ARRAY_SIZE(entropy) - i);
+		if (longs) {
+			_mix_pool_bytes(entropy, sizeof(*entropy) * longs);
+			i += longs;
+			continue;
 		}
-		_mix_pool_bytes(&entropy, sizeof(entropy));
+		longs = arch_get_random_longs(entropy, ARRAY_SIZE(entropy) - i);
+		if (longs) {
+			_mix_pool_bytes(entropy, sizeof(*entropy) * longs);
+			i += longs;
+			continue;
+		}
+		entropy[0] = random_get_entropy();
+		_mix_pool_bytes(entropy, sizeof(*entropy));
+		arch_bits -= sizeof(*entropy) * 8;
+		++i;
 	}
 	_mix_pool_bytes(&now, sizeof(now));
 	_mix_pool_bytes(utsname(), sizeof(*(utsname())));
diff --git a/include/asm-generic/archrandom.h b/include/asm-generic/archrandom.h
index 3a5ee202dd86..3cd7f980cfdc 100644
--- a/include/asm-generic/archrandom.h
+++ b/include/asm-generic/archrandom.h
@@ -2,24 +2,14 @@
 #ifndef __ASM_GENERIC_ARCHRANDOM_H__
 #define __ASM_GENERIC_ARCHRANDOM_H__
 
-static inline bool __must_check arch_get_random_long(unsigned long *v)
+static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs)
 {
-	return false;
+	return 0;
 }
 
-static inline bool __must_check arch_get_random_int(unsigned int *v)
+static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs)
 {
-	return false;
-}
-
-static inline bool __must_check arch_get_random_seed_long(unsigned long *v)
-{
-	return false;
-}
-
-static inline bool __must_check arch_get_random_seed_int(unsigned int *v)
-{
-	return false;
+	return 0;
 }
 
 #endif
diff --git a/include/linux/random.h b/include/linux/random.h
index 865770e29f3e..3fec206487f6 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -112,19 +112,19 @@ declare_get_random_var_wait(long, unsigned long)
  * Called from the boot CPU during startup; not valid to call once
  * secondary CPUs are up and preemption is possible.
  */
-#ifndef arch_get_random_seed_long_early
-static inline bool __init arch_get_random_seed_long_early(unsigned long *v)
+#ifndef arch_get_random_seed_longs_early
+static inline size_t __init arch_get_random_seed_longs_early(unsigned long *v, size_t max_longs)
 {
 	WARN_ON(system_state != SYSTEM_BOOTING);
-	return arch_get_random_seed_long(v);
+	return arch_get_random_seed_longs(v, max_longs);
 }
 #endif
 
-#ifndef arch_get_random_long_early
-static inline bool __init arch_get_random_long_early(unsigned long *v)
+#ifndef arch_get_random_longs_early
+static inline bool __init arch_get_random_longs_early(unsigned long *v, size_t max_longs)
 {
 	WARN_ON(system_state != SYSTEM_BOOTING);
-	return arch_get_random_long(v);
+	return arch_get_random_longs(v, max_longs);
 }
 #endif
 
-- 
cgit 


From 02739545951ad4c1215160db7fbf9b7a918d3c0b Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Fri, 22 Jul 2022 11:22:00 -0700
Subject: net: Fix data-races around sysctl_[rw]mem(_offset)?.

While reading these sysctl variables, they can be changed concurrently.
Thus, we need to add READ_ONCE() to their readers.

  - .sysctl_rmem
  - .sysctl_rwmem
  - .sysctl_rmem_offset
  - .sysctl_wmem_offset
  - sysctl_tcp_rmem[1, 2]
  - sysctl_tcp_wmem[1, 2]
  - sysctl_decnet_rmem[1]
  - sysctl_decnet_wmem[1]
  - sysctl_tipc_rmem[1]

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h     |  8 ++++----
 net/decnet/af_decnet.c |  4 ++--
 net/ipv4/tcp.c         |  6 +++---
 net/ipv4/tcp_input.c   | 13 +++++++------
 net/ipv4/tcp_output.c  |  2 +-
 net/mptcp/protocol.c   |  6 +++---
 net/tipc/socket.c      |  2 +-
 7 files changed, 21 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 9fa54762e077..7a48991cdb19 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2843,18 +2843,18 @@ static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto)
 {
 	/* Does this proto have per netns sysctl_wmem ? */
 	if (proto->sysctl_wmem_offset)
-		return *(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset);
+		return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset));
 
-	return *proto->sysctl_wmem;
+	return READ_ONCE(*proto->sysctl_wmem);
 }
 
 static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto)
 {
 	/* Does this proto have per netns sysctl_rmem ? */
 	if (proto->sysctl_rmem_offset)
-		return *(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset);
+		return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset));
 
-	return *proto->sysctl_rmem;
+	return READ_ONCE(*proto->sysctl_rmem);
 }
 
 /* Default TCP Small queue budget is ~1 ms of data (1sec >> 10)
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index dc92a67baea3..7d542eb46172 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -480,8 +480,8 @@ static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gf
 	sk->sk_family      = PF_DECnet;
 	sk->sk_protocol    = 0;
 	sk->sk_allocation  = gfp;
-	sk->sk_sndbuf	   = sysctl_decnet_wmem[1];
-	sk->sk_rcvbuf	   = sysctl_decnet_rmem[1];
+	sk->sk_sndbuf	   = READ_ONCE(sysctl_decnet_wmem[1]);
+	sk->sk_rcvbuf	   = READ_ONCE(sysctl_decnet_rmem[1]);
 
 	/* Initialization of DECnet Session Control Port		*/
 	scp = DN_SK(sk);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a11e5de3a4c3..002a4a04efbe 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -452,8 +452,8 @@ void tcp_init_sock(struct sock *sk)
 
 	icsk->icsk_sync_mss = tcp_sync_mss;
 
-	WRITE_ONCE(sk->sk_sndbuf, sock_net(sk)->ipv4.sysctl_tcp_wmem[1]);
-	WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);
+	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
+	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));
 
 	sk_sockets_allocated_inc(sk);
 }
@@ -1724,7 +1724,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
 	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
 		cap = sk->sk_rcvbuf >> 1;
 	else
-		cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1;
+		cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
 	val = min(val, cap);
 	WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index dd05238f79f6..ff2e0d87aee4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -426,7 +426,7 @@ static void tcp_sndbuf_expand(struct sock *sk)
 
 	if (sk->sk_sndbuf < sndmem)
 		WRITE_ONCE(sk->sk_sndbuf,
-			   min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]));
+			   min(sndmem, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2])));
 }
 
 /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -461,7 +461,7 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb,
 	struct tcp_sock *tp = tcp_sk(sk);
 	/* Optimize this! */
 	int truesize = tcp_win_from_space(sk, skbtruesize) >> 1;
-	int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
+	int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1;
 
 	while (tp->rcv_ssthresh <= window) {
 		if (truesize <= skb->len)
@@ -574,16 +574,17 @@ static void tcp_clamp_window(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct net *net = sock_net(sk);
+	int rmem2;
 
 	icsk->icsk_ack.quick = 0;
+	rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
 
-	if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] &&
+	if (sk->sk_rcvbuf < rmem2 &&
 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
 	    !tcp_under_memory_pressure(sk) &&
 	    sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
 		WRITE_ONCE(sk->sk_rcvbuf,
-			   min(atomic_read(&sk->sk_rmem_alloc),
-			       net->ipv4.sysctl_tcp_rmem[2]));
+			   min(atomic_read(&sk->sk_rmem_alloc), rmem2));
 	}
 	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
 		tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
@@ -745,7 +746,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
 
 		do_div(rcvwin, tp->advmss);
 		rcvbuf = min_t(u64, rcvwin * rcvmem,
-			       sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
+			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
 		if (rcvbuf > sk->sk_rcvbuf) {
 			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 2efe41c84ee8..4c376b6d8764 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -238,7 +238,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
 	*rcv_wscale = 0;
 	if (wscale_ok) {
 		/* Set window scaling on max possible window */
-		space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
+		space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
 		space = max_t(u32, space, sysctl_rmem_max);
 		space = min_t(u32, space, *window_clamp);
 		*rcv_wscale = clamp_t(int, ilog2(space) - 15,
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 9bbd8cbe0acb..7e1518bb6115 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1926,7 +1926,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
 
 		do_div(rcvwin, advmss);
 		rcvbuf = min_t(u64, rcvwin * rcvmem,
-			       sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
+			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
 
 		if (rcvbuf > sk->sk_rcvbuf) {
 			u32 window_clamp;
@@ -2669,8 +2669,8 @@ static int mptcp_init_sock(struct sock *sk)
 	mptcp_ca_reset(sk);
 
 	sk_sockets_allocated_inc(sk);
-	sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
-	sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
+	sk->sk_rcvbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);
+	sk->sk_sndbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]);
 
 	return 0;
 }
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 43509c7e90fc..f1c3b8eb4b3d 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -517,7 +517,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
 	timer_setup(&sk->sk_timer, tipc_sk_timeout, 0);
 	sk->sk_shutdown = 0;
 	sk->sk_backlog_rcv = tipc_sk_backlog_rcv;
-	sk->sk_rcvbuf = sysctl_tipc_rmem[1];
+	sk->sk_rcvbuf = READ_ONCE(sysctl_tipc_rmem[1]);
 	sk->sk_data_ready = tipc_data_ready;
 	sk->sk_write_space = tipc_write_space;
 	sk->sk_destruct = tipc_sock_destruct;
-- 
cgit 


From 7ffcdaa670164a2ad3844a5ef6df5423782ba290 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <olga.kornievskaia@gmail.com>
Date: Mon, 25 Jul 2022 09:32:21 -0400
Subject: SUNRPC expose functions for offline remote xprt functionality

Re-arrange the code that make offline transport and delete transport
callable functions.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprt.h |  3 +++
 net/sunrpc/sysfs.c          | 28 +++++-----------------------
 net/sunrpc/xprt.c           | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 40 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 522bbf937957..0d51b9f9ea37 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -505,4 +505,7 @@ static inline int xprt_test_and_set_binding(struct rpc_xprt *xprt)
 	return test_and_set_bit(XPRT_BINDING, &xprt->state);
 }
 
+void xprt_set_offline_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps);
+void xprt_set_online_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps);
+void xprt_delete_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps);
 #endif /* _LINUX_SUNRPC_XPRT_H */
diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c
index a3a2f8aeb80e..7330eb9a70cf 100644
--- a/net/sunrpc/sysfs.c
+++ b/net/sunrpc/sysfs.c
@@ -314,32 +314,14 @@ static ssize_t rpc_sysfs_xprt_state_change(struct kobject *kobj,
 		goto release_tasks;
 	}
 	if (offline) {
-		if (!test_and_set_bit(XPRT_OFFLINE, &xprt->state)) {
-			spin_lock(&xps->xps_lock);
-			xps->xps_nactive--;
-			spin_unlock(&xps->xps_lock);
-		}
+		xprt_set_offline_locked(xprt, xps);
 	} else if (online) {
-		if (test_and_clear_bit(XPRT_OFFLINE, &xprt->state)) {
-			spin_lock(&xps->xps_lock);
-			xps->xps_nactive++;
-			spin_unlock(&xps->xps_lock);
-		}
+		xprt_set_online_locked(xprt, xps);
 	} else if (remove) {
-		if (test_bit(XPRT_OFFLINE, &xprt->state)) {
-			if (!test_and_set_bit(XPRT_REMOVE, &xprt->state)) {
-				xprt_force_disconnect(xprt);
-				if (test_bit(XPRT_CONNECTED, &xprt->state)) {
-					if (!xprt->sending.qlen &&
-					    !xprt->pending.qlen &&
-					    !xprt->backlog.qlen &&
-					    !atomic_long_read(&xprt->queuelen))
-						rpc_xprt_switch_remove_xprt(xps, xprt);
-				}
-			}
-		} else {
+		if (test_bit(XPRT_OFFLINE, &xprt->state))
+			xprt_delete_locked(xprt, xps);
+		else
 			count = -EINVAL;
-		}
 	}
 
 release_tasks:
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 86d62cffba0d..8f8e3c952f24 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -2152,3 +2152,35 @@ void xprt_put(struct rpc_xprt *xprt)
 		kref_put(&xprt->kref, xprt_destroy_kref);
 }
 EXPORT_SYMBOL_GPL(xprt_put);
+
+void xprt_set_offline_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps)
+{
+	if (!test_and_set_bit(XPRT_OFFLINE, &xprt->state)) {
+		spin_lock(&xps->xps_lock);
+		xps->xps_nactive--;
+		spin_unlock(&xps->xps_lock);
+	}
+}
+
+void xprt_set_online_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps)
+{
+	if (test_and_clear_bit(XPRT_OFFLINE, &xprt->state)) {
+		spin_lock(&xps->xps_lock);
+		xps->xps_nactive++;
+		spin_unlock(&xps->xps_lock);
+	}
+}
+
+void xprt_delete_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps)
+{
+	if (test_and_set_bit(XPRT_REMOVE, &xprt->state))
+		return;
+
+	xprt_force_disconnect(xprt);
+	if (!test_bit(XPRT_CONNECTED, &xprt->state))
+		return;
+
+	if (!xprt->sending.qlen && !xprt->pending.qlen &&
+	    !xprt->backlog.qlen && !atomic_long_read(&xprt->queuelen))
+		rpc_xprt_switch_remove_xprt(xps, xprt);
+}
-- 
cgit 


From 895245ccea251ff54ea19bc364c9a49007918115 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <olga.kornievskaia@gmail.com>
Date: Mon, 25 Jul 2022 09:32:22 -0400
Subject: SUNRPC add function to offline remove trunkable transports

Iterate thru available transports in the xprt_switch for all
trunkable transports offline and possibly remote them as well.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/clnt.h |  1 +
 net/sunrpc/clnt.c           | 46 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)

(limited to 'include')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 90501404fa49..d14333f4947a 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -234,6 +234,7 @@ int		rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *,
 			struct rpc_xprt_switch *,
 			struct rpc_xprt *,
 			void *);
+void		rpc_clnt_manage_trunked_xprts(struct rpc_clnt *);
 
 const char *rpc_proc_name(const struct rpc_task *task);
 
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index a97d4e06cae3..2b079c4d8af1 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -3000,6 +3000,52 @@ out_put_switch:
 }
 EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt);
 
+static int rpc_xprt_offline(struct rpc_clnt *clnt,
+			    struct rpc_xprt *xprt,
+			    void *data)
+{
+	struct rpc_xprt *main_xprt;
+	struct rpc_xprt_switch *xps;
+	int err = 0;
+
+	xprt_get(xprt);
+
+	rcu_read_lock();
+	main_xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+	xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch));
+	err = rpc_cmp_addr_port((struct sockaddr *)&xprt->addr,
+				(struct sockaddr *)&main_xprt->addr);
+	rcu_read_unlock();
+	xprt_put(main_xprt);
+	if (err)
+		goto out;
+
+	if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) {
+		err = -EINTR;
+		goto out;
+	}
+	xprt_set_offline_locked(xprt, xps);
+
+	xprt_release_write(xprt, NULL);
+out:
+	xprt_put(xprt);
+	xprt_switch_put(xps);
+	return err;
+}
+
+/* rpc_clnt_manage_trunked_xprts -- offline trunked transports
+ * @clnt rpc_clnt structure
+ *
+ * For each active transport found in the rpc_clnt structure call
+ * the function rpc_xprt_offline() which will identify trunked transports
+ * and will mark them offline.
+ */
+void rpc_clnt_manage_trunked_xprts(struct rpc_clnt *clnt)
+{
+	rpc_clnt_iterate_for_each_xprt(clnt, rpc_xprt_offline, NULL);
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_manage_trunked_xprts);
+
 struct connect_timeout_data {
 	unsigned long connect_timeout;
 	unsigned long reconnect_timeout;
-- 
cgit 


From 95d0d30c66b855f614e677b8cd0455eed0765a6f Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <olga.kornievskaia@gmail.com>
Date: Mon, 25 Jul 2022 09:32:24 -0400
Subject: SUNRPC create an iterator to list only OFFLINE xprts

Create a new iterator helper that will go thru the all the transports
in the switch and return transports that are marked OFFLINE.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprtmultipath.h |  3 ++
 net/sunrpc/clnt.c                    | 11 +++-
 net/sunrpc/xprtmultipath.c           | 99 ++++++++++++++++++++++++++++++++----
 3 files changed, 101 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h
index bbb8a5fa0816..688ca7eb1d01 100644
--- a/include/linux/sunrpc/xprtmultipath.h
+++ b/include/linux/sunrpc/xprtmultipath.h
@@ -63,6 +63,9 @@ extern void xprt_iter_init(struct rpc_xprt_iter *xpi,
 extern void xprt_iter_init_listall(struct rpc_xprt_iter *xpi,
 		struct rpc_xprt_switch *xps);
 
+extern void xprt_iter_init_listoffline(struct rpc_xprt_iter *xpi,
+		struct rpc_xprt_switch *xps);
+
 extern void xprt_iter_destroy(struct rpc_xprt_iter *xpi);
 
 extern struct rpc_xprt_switch *xprt_iter_xchg_switch(
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 2b079c4d8af1..68021b70340d 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -786,7 +786,8 @@ out_revert:
 EXPORT_SYMBOL_GPL(rpc_switch_client_transport);
 
 static
-int rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi)
+int _rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi,
+			     void func(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *xps))
 {
 	struct rpc_xprt_switch *xps;
 
@@ -795,11 +796,17 @@ int rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi)
 	rcu_read_unlock();
 	if (xps == NULL)
 		return -EAGAIN;
-	xprt_iter_init_listall(xpi, xps);
+	func(xpi, xps);
 	xprt_switch_put(xps);
 	return 0;
 }
 
+static
+int rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi)
+{
+	return _rpc_clnt_xprt_iter_init(clnt, xpi, xprt_iter_init_listall);
+}
+
 /**
  * rpc_clnt_iterate_for_each_xprt - Apply a function to all transports
  * @clnt: pointer to client
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 1693f81aae37..8def8423fc0a 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -27,6 +27,7 @@ typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct rpc_xprt_switch *xps,
 static const struct rpc_xprt_iter_ops rpc_xprt_iter_singular;
 static const struct rpc_xprt_iter_ops rpc_xprt_iter_roundrobin;
 static const struct rpc_xprt_iter_ops rpc_xprt_iter_listall;
+static const struct rpc_xprt_iter_ops rpc_xprt_iter_listoffline;
 
 static void xprt_switch_add_xprt_locked(struct rpc_xprt_switch *xps,
 		struct rpc_xprt *xprt)
@@ -248,6 +249,18 @@ struct rpc_xprt *xprt_switch_find_first_entry(struct list_head *head)
 	return NULL;
 }
 
+static
+struct rpc_xprt *xprt_switch_find_first_entry_offline(struct list_head *head)
+{
+	struct rpc_xprt *pos;
+
+	list_for_each_entry_rcu(pos, head, xprt_switch) {
+		if (!xprt_is_active(pos))
+			return pos;
+	}
+	return NULL;
+}
+
 static
 struct rpc_xprt *xprt_iter_first_entry(struct rpc_xprt_iter *xpi)
 {
@@ -259,8 +272,9 @@ struct rpc_xprt *xprt_iter_first_entry(struct rpc_xprt_iter *xpi)
 }
 
 static
-struct rpc_xprt *xprt_switch_find_current_entry(struct list_head *head,
-		const struct rpc_xprt *cur)
+struct rpc_xprt *_xprt_switch_find_current_entry(struct list_head *head,
+						 const struct rpc_xprt *cur,
+						 bool find_active)
 {
 	struct rpc_xprt *pos;
 	bool found = false;
@@ -268,14 +282,25 @@ struct rpc_xprt *xprt_switch_find_current_entry(struct list_head *head,
 	list_for_each_entry_rcu(pos, head, xprt_switch) {
 		if (cur == pos)
 			found = true;
-		if (found && xprt_is_active(pos))
+		if (found && ((find_active && xprt_is_active(pos)) ||
+			      (!find_active && xprt_is_active(pos))))
 			return pos;
 	}
 	return NULL;
 }
 
 static
-struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi)
+struct rpc_xprt *xprt_switch_find_current_entry(struct list_head *head,
+						const struct rpc_xprt *cur)
+{
+	return _xprt_switch_find_current_entry(head, cur, true);
+}
+
+static
+struct rpc_xprt * _xprt_iter_current_entry(struct rpc_xprt_iter *xpi,
+		struct rpc_xprt *first_entry(struct list_head *head),
+		struct rpc_xprt *current_entry(struct list_head *head,
+					       const struct rpc_xprt *cur))
 {
 	struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch);
 	struct list_head *head;
@@ -284,8 +309,30 @@ struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi)
 		return NULL;
 	head = &xps->xps_xprt_list;
 	if (xpi->xpi_cursor == NULL || xps->xps_nxprts < 2)
-		return xprt_switch_find_first_entry(head);
-	return xprt_switch_find_current_entry(head, xpi->xpi_cursor);
+		return first_entry(head);
+	return current_entry(head, xpi->xpi_cursor);
+}
+
+static
+struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi)
+{
+	return _xprt_iter_current_entry(xpi, xprt_switch_find_first_entry,
+			xprt_switch_find_current_entry);
+}
+
+static
+struct rpc_xprt *xprt_switch_find_current_entry_offline(struct list_head *head,
+		const struct rpc_xprt *cur)
+{
+	return _xprt_switch_find_current_entry(head, cur, false);
+}
+
+static
+struct rpc_xprt *xprt_iter_current_entry_offline(struct rpc_xprt_iter *xpi)
+{
+	return _xprt_iter_current_entry(xpi,
+			xprt_switch_find_first_entry_offline,
+			xprt_switch_find_current_entry_offline);
 }
 
 bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps,
@@ -310,7 +357,7 @@ bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps,
 
 static
 struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head,
-		const struct rpc_xprt *cur)
+		const struct rpc_xprt *cur, bool check_active)
 {
 	struct rpc_xprt *pos, *prev = NULL;
 	bool found = false;
@@ -318,7 +365,12 @@ struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head,
 	list_for_each_entry_rcu(pos, head, xprt_switch) {
 		if (cur == prev)
 			found = true;
-		if (found && xprt_is_active(pos))
+		/* for request to return active transports return only
+		 * active, for request to return offline transports
+		 * return only offline
+		 */
+		if (found && ((check_active && xprt_is_active(pos)) ||
+			      (!check_active && !xprt_is_active(pos))))
 			return pos;
 		prev = pos;
 	}
@@ -355,7 +407,7 @@ struct rpc_xprt *__xprt_switch_find_next_entry_roundrobin(struct list_head *head
 {
 	struct rpc_xprt *ret;
 
-	ret = xprt_switch_find_next_entry(head, cur);
+	ret = xprt_switch_find_next_entry(head, cur, true);
 	if (ret != NULL)
 		return ret;
 	return xprt_switch_find_first_entry(head);
@@ -397,7 +449,14 @@ static
 struct rpc_xprt *xprt_switch_find_next_entry_all(struct rpc_xprt_switch *xps,
 		const struct rpc_xprt *cur)
 {
-	return xprt_switch_find_next_entry(&xps->xps_xprt_list, cur);
+	return xprt_switch_find_next_entry(&xps->xps_xprt_list, cur, true);
+}
+
+static
+struct rpc_xprt *xprt_switch_find_next_entry_offline(struct rpc_xprt_switch *xps,
+		const struct rpc_xprt *cur)
+{
+	return xprt_switch_find_next_entry(&xps->xps_xprt_list, cur, false);
 }
 
 static
@@ -407,6 +466,13 @@ struct rpc_xprt *xprt_iter_next_entry_all(struct rpc_xprt_iter *xpi)
 			xprt_switch_find_next_entry_all);
 }
 
+static
+struct rpc_xprt *xprt_iter_next_entry_offline(struct rpc_xprt_iter *xpi)
+{
+	return xprt_iter_next_entry_multiple(xpi,
+			xprt_switch_find_next_entry_offline);
+}
+
 /*
  * xprt_iter_rewind - Resets the xprt iterator
  * @xpi: pointer to rpc_xprt_iter
@@ -460,6 +526,12 @@ void xprt_iter_init_listall(struct rpc_xprt_iter *xpi,
 	__xprt_iter_init(xpi, xps, &rpc_xprt_iter_listall);
 }
 
+void xprt_iter_init_listoffline(struct rpc_xprt_iter *xpi,
+		struct rpc_xprt_switch *xps)
+{
+	__xprt_iter_init(xpi, xps, &rpc_xprt_iter_listoffline);
+}
+
 /**
  * xprt_iter_xchg_switch - Atomically swap out the rpc_xprt_switch
  * @xpi: pointer to rpc_xprt_iter
@@ -574,3 +646,10 @@ const struct rpc_xprt_iter_ops rpc_xprt_iter_listall = {
 	.xpi_xprt = xprt_iter_current_entry,
 	.xpi_next = xprt_iter_next_entry_all,
 };
+
+static
+const struct rpc_xprt_iter_ops rpc_xprt_iter_listoffline = {
+	.xpi_rewind = xprt_iter_default_rewind,
+	.xpi_xprt = xprt_iter_current_entry_offline,
+	.xpi_next = xprt_iter_next_entry_offline,
+};
-- 
cgit 


From 9368fd6c75053630e95a6dbd17c9522e82101276 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Mon, 25 Jul 2022 09:32:25 -0400
Subject: SUNRPC enable back offline transports in trunking discovery

When we are adding a transport to a xprt_switch that's already on
the list but has been marked OFFLINE, then make the state ONLINE
since it's been tested now.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/clnt.h |  1 +
 net/sunrpc/clnt.c           | 14 ++++++++++++++
 2 files changed, 15 insertions(+)

(limited to 'include')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index d14333f4947a..71a3a1dd7e81 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -242,6 +242,7 @@ void rpc_clnt_xprt_switch_put(struct rpc_clnt *);
 void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *);
 bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt,
 			const struct sockaddr *sap);
+void rpc_clnt_xprt_set_online(struct rpc_clnt *clnt, struct rpc_xprt *xprt);
 void rpc_cleanup_clids(void);
 
 static inline int rpc_reply_expected(struct rpc_task *task)
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 68021b70340d..9dbce3b0d3a2 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -3095,8 +3095,22 @@ void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt)
 }
 EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put);
 
+void rpc_clnt_xprt_set_online(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
+{
+	struct rpc_xprt_switch *xps;
+
+	rcu_read_lock();
+	xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
+	rcu_read_unlock();
+	xprt_set_online_locked(xprt, xps);
+}
+
 void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
 {
+	if (rpc_clnt_xprt_switch_has_addr(clnt,
+		(const struct sockaddr *)&xprt->addr)) {
+		return rpc_clnt_xprt_set_online(clnt, xprt);
+	}
 	rcu_read_lock();
 	rpc_xprt_switch_add_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch),
 				 xprt);
-- 
cgit 


From 497e6464d6adcee64f071b18fc826e63cfd2f0a5 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <olga.kornievskaia@gmail.com>
Date: Mon, 25 Jul 2022 09:32:26 -0400
Subject: SUNRPC create an rpc function that allows xprt removal from rpc_clnt

Expose a function that allows a removal of xprt from the rpc_clnt.

When called from NFS that's running a trunked transport then don't
decrement the active transport counter.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/clnt.h          |  1 +
 include/linux/sunrpc/xprtmultipath.h |  2 +-
 net/sunrpc/clnt.c                    | 16 +++++++++++++++-
 net/sunrpc/xprt.c                    |  2 +-
 net/sunrpc/xprtmultipath.c           | 11 ++++++-----
 5 files changed, 24 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 71a3a1dd7e81..7a43fd514398 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -240,6 +240,7 @@ const char *rpc_proc_name(const struct rpc_task *task);
 
 void rpc_clnt_xprt_switch_put(struct rpc_clnt *);
 void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *);
+void rpc_clnt_xprt_switch_remove_xprt(struct rpc_clnt *, struct rpc_xprt *);
 bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt,
 			const struct sockaddr *sap);
 void rpc_clnt_xprt_set_online(struct rpc_clnt *clnt, struct rpc_xprt *xprt);
diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h
index 688ca7eb1d01..9fff0768d942 100644
--- a/include/linux/sunrpc/xprtmultipath.h
+++ b/include/linux/sunrpc/xprtmultipath.h
@@ -55,7 +55,7 @@ extern void rpc_xprt_switch_set_roundrobin(struct rpc_xprt_switch *xps);
 extern void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps,
 		struct rpc_xprt *xprt);
 extern void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps,
-		struct rpc_xprt *xprt);
+		struct rpc_xprt *xprt, bool offline);
 
 extern void xprt_iter_init(struct rpc_xprt_iter *xpi,
 		struct rpc_xprt_switch *xps);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 9dbce3b0d3a2..26f3102500bb 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2144,7 +2144,8 @@ call_connect_status(struct rpc_task *task)
 				xprt_release(task);
 				value = atomic_long_dec_return(&xprt->queuelen);
 				if (value == 0)
-					rpc_xprt_switch_remove_xprt(xps, saved);
+					rpc_xprt_switch_remove_xprt(xps, saved,
+								    true);
 				xprt_put(saved);
 				task->tk_xprt = NULL;
 				task->tk_action = call_start;
@@ -3118,6 +3119,19 @@ void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
 }
 EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_add_xprt);
 
+void rpc_clnt_xprt_switch_remove_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
+{
+	struct rpc_xprt_switch *xps;
+
+	rcu_read_lock();
+	xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
+	rpc_xprt_switch_remove_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch),
+				    xprt, 0);
+	xps->xps_nunique_destaddr_xprts--;
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_remove_xprt);
+
 bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt,
 				   const struct sockaddr *sap)
 {
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 8f8e3c952f24..44348c9f4b00 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -2182,5 +2182,5 @@ void xprt_delete_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps)
 
 	if (!xprt->sending.qlen && !xprt->pending.qlen &&
 	    !xprt->backlog.qlen && !atomic_long_read(&xprt->queuelen))
-		rpc_xprt_switch_remove_xprt(xps, xprt);
+		rpc_xprt_switch_remove_xprt(xps, xprt, true);
 }
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 8def8423fc0a..55da01730311 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -62,11 +62,11 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps,
 }
 
 static void xprt_switch_remove_xprt_locked(struct rpc_xprt_switch *xps,
-		struct rpc_xprt *xprt)
+		struct rpc_xprt *xprt, bool offline)
 {
 	if (unlikely(xprt == NULL))
 		return;
-	if (!test_bit(XPRT_OFFLINE, &xprt->state))
+	if (!test_bit(XPRT_OFFLINE, &xprt->state) && offline)
 		xps->xps_nactive--;
 	xps->xps_nxprts--;
 	if (xps->xps_nxprts == 0)
@@ -79,14 +79,15 @@ static void xprt_switch_remove_xprt_locked(struct rpc_xprt_switch *xps,
  * rpc_xprt_switch_remove_xprt - Removes an rpc_xprt from a rpc_xprt_switch
  * @xps: pointer to struct rpc_xprt_switch
  * @xprt: pointer to struct rpc_xprt
+ * @offline: indicates if the xprt that's being removed is in an offline state
  *
  * Removes xprt from the list of struct rpc_xprt in xps.
  */
 void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps,
-		struct rpc_xprt *xprt)
+		struct rpc_xprt *xprt, bool offline)
 {
 	spin_lock(&xps->xps_lock);
-	xprt_switch_remove_xprt_locked(xps, xprt);
+	xprt_switch_remove_xprt_locked(xps, xprt, offline);
 	spin_unlock(&xps->xps_lock);
 	xprt_put(xprt);
 }
@@ -155,7 +156,7 @@ static void xprt_switch_free_entries(struct rpc_xprt_switch *xps)
 
 		xprt = list_first_entry(&xps->xps_xprt_list,
 				struct rpc_xprt, xprt_switch);
-		xprt_switch_remove_xprt_locked(xps, xprt);
+		xprt_switch_remove_xprt_locked(xps, xprt, true);
 		spin_unlock(&xps->xps_lock);
 		xprt_put(xprt);
 		spin_lock(&xps->xps_lock);
-- 
cgit 


From 273d6aed9e5a1859dda15256f45561315c3d237a Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <olga.kornievskaia@gmail.com>
Date: Mon, 25 Jul 2022 09:32:29 -0400
Subject: SUNRPC export xprt_iter_rewind function

Make xprt_iter_rewind callable outside of xprtmultipath.c

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprtmultipath.h | 2 ++
 net/sunrpc/xprtmultipath.c           | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h
index 9fff0768d942..c0514c684b2c 100644
--- a/include/linux/sunrpc/xprtmultipath.h
+++ b/include/linux/sunrpc/xprtmultipath.h
@@ -68,6 +68,8 @@ extern void xprt_iter_init_listoffline(struct rpc_xprt_iter *xpi,
 
 extern void xprt_iter_destroy(struct rpc_xprt_iter *xpi);
 
+extern void xprt_iter_rewind(struct rpc_xprt_iter *xpi);
+
 extern struct rpc_xprt_switch *xprt_iter_xchg_switch(
 		struct rpc_xprt_iter *xpi,
 		struct rpc_xprt_switch *newswitch);
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 55da01730311..685db598acbe 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -481,7 +481,6 @@ struct rpc_xprt *xprt_iter_next_entry_offline(struct rpc_xprt_iter *xpi)
  * Resets xpi to ensure that it points to the first entry in the list
  * of transports.
  */
-static
 void xprt_iter_rewind(struct rpc_xprt_iter *xpi)
 {
 	rcu_read_lock();
-- 
cgit 


From 92cc04f60ab4ae199eee507e5cd4d5aa6c722e9c Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <olga.kornievskaia@gmail.com>
Date: Mon, 25 Jul 2022 09:32:30 -0400
Subject: SUNRPC create a function that probes only offline transports

For only offline transports, attempt to check connectivity via
a NULL call and, if that succeeds, call a provided session trunking
detection function.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/clnt.h |  2 ++
 net/sunrpc/clnt.c           | 65 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)

(limited to 'include')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 7a43fd514398..75eea5ebb179 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -235,6 +235,8 @@ int		rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *,
 			struct rpc_xprt *,
 			void *);
 void		rpc_clnt_manage_trunked_xprts(struct rpc_clnt *);
+void		rpc_clnt_probe_trunked_xprts(struct rpc_clnt *,
+			struct rpc_add_xprt_test *);
 
 const char *rpc_proc_name(const struct rpc_task *task);
 
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 9c9712274ca8..bbfc47f03480 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -807,6 +807,13 @@ int rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi)
 	return _rpc_clnt_xprt_iter_init(clnt, xpi, xprt_iter_init_listall);
 }
 
+static
+int rpc_clnt_xprt_iter_offline_init(struct rpc_clnt *clnt,
+				    struct rpc_xprt_iter *xpi)
+{
+	return _rpc_clnt_xprt_iter_init(clnt, xpi, xprt_iter_init_listoffline);
+}
+
 /**
  * rpc_clnt_iterate_for_each_xprt - Apply a function to all transports
  * @clnt: pointer to client
@@ -3018,6 +3025,64 @@ out_put_switch:
 }
 EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt);
 
+static int rpc_xprt_probe_trunked(struct rpc_clnt *clnt,
+				  struct rpc_xprt *xprt,
+				  struct rpc_add_xprt_test *data)
+{
+	struct rpc_xprt_switch *xps;
+	struct rpc_xprt *main_xprt;
+	int status = 0;
+
+	xprt_get(xprt);
+
+	rcu_read_lock();
+	main_xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+	xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch));
+	status = rpc_cmp_addr_port((struct sockaddr *)&xprt->addr,
+				   (struct sockaddr *)&main_xprt->addr);
+	rcu_read_unlock();
+	xprt_put(main_xprt);
+	if (status || !test_bit(XPRT_OFFLINE, &xprt->state))
+		goto out;
+
+	status = rpc_clnt_add_xprt_helper(clnt, xprt, data);
+out:
+	xprt_put(xprt);
+	xprt_switch_put(xps);
+	return status;
+}
+
+/* rpc_clnt_probe_trunked_xprt -- probe offlined transport for session trunking
+ * @clnt rpc_clnt structure
+ *
+ * For each offlined transport found in the rpc_clnt structure call
+ * the function rpc_xprt_probe_trunked() which will determine if this
+ * transport still belongs to the trunking group.
+ */
+void rpc_clnt_probe_trunked_xprts(struct rpc_clnt *clnt,
+				  struct rpc_add_xprt_test *data)
+{
+	struct rpc_xprt_iter xpi;
+	int ret;
+
+	ret = rpc_clnt_xprt_iter_offline_init(clnt, &xpi);
+	if (ret)
+		return;
+	for (;;) {
+		struct rpc_xprt *xprt = xprt_iter_get_next(&xpi);
+
+		if (!xprt)
+			break;
+		ret = rpc_xprt_probe_trunked(clnt, xprt, data);
+		xprt_put(xprt);
+		if (ret < 0)
+			break;
+		xprt_iter_rewind(&xpi);
+	}
+	xprt_iter_destroy(&xpi);
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_probe_trunked_xprts);
+
 static int rpc_xprt_offline(struct rpc_clnt *clnt,
 			    struct rpc_xprt *xprt,
 			    void *data)
-- 
cgit 


From b8bea09a456fc31af8f10029e69d105cac7e530e Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Wed, 1 Jun 2022 17:46:59 +0800
Subject: btrfs: add trace event for submitted RAID56 bio

Add tracepoint for better insight to how the RAID56 data are submitted.

The output looks like this: (trace event header and UUID skipped)

   raid56_read_partial: full_stripe=389152768 devid=3 type=DATA1 offset=32768 opf=0x0 physical=323059712 len=32768
   raid56_read_partial: full_stripe=389152768 devid=1 type=DATA2 offset=0 opf=0x0 physical=67174400 len=65536
   raid56_write_stripe: full_stripe=389152768 devid=3 type=DATA1 offset=0 opf=0x1 physical=323026944 len=32768
   raid56_write_stripe: full_stripe=389152768 devid=2 type=PQ1 offset=0 opf=0x1 physical=323026944 len=32768

The above debug output is from a 32K data write into an empty RAID56
data chunk.

Some explanation on the event output:

  full_stripe:	the logical bytenr of the full stripe
  devid:	btrfs devid
  type:		raid stripe type.
         	DATA1:	the first data stripe
         	DATA2:	the second data stripe
         	PQ1:	the P stripe
         	PQ2:	the Q stripe
  offset:	the offset inside the stripe.
  opf:		the bio op type
  physical:	the physical offset the bio is for
  len:		the length of the bio

The first two lines are from partial RMW read, which is reading the
remaining data stripes from disks.

The last two lines are for full stripe RMW write, which is writing the
involved two 16K stripes (one for DATA1 stripe, one for P stripe).
The stripe for DATA2 doesn't need to be written.

There are 5 types of trace events:

- raid56_read_partial
  Read remaining data for regular read/write path.

- raid56_write_stripe
  Write the modified stripes for regular read/write path.

- raid56_scrub_read_recover
  Read remaining data for scrub recovery path.

- raid56_scrub_write_stripe
  Write the modified stripes for scrub path.

- raid56_scrub_read
  Read remaining data for scrub path.

Also, since the trace events are included at super.c, we have to export
needed structure definitions to 'raid56.h' and include the header in
super.c, or we're unable to access those members.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ reformat comments ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/raid56.c            | 190 +++++++++++++------------------------------
 fs/btrfs/raid56.h            | 148 ++++++++++++++++++++++++++++++++-
 fs/btrfs/super.c             |   1 +
 include/trace/events/btrfs.h |  94 +++++++++++++++++++++
 4 files changed, 300 insertions(+), 133 deletions(-)

(limited to 'include')

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index c48b7a0992f6..baba435692d2 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -63,138 +63,6 @@ struct sector_ptr {
 	unsigned int uptodate:8;
 };
 
-enum btrfs_rbio_ops {
-	BTRFS_RBIO_WRITE,
-	BTRFS_RBIO_READ_REBUILD,
-	BTRFS_RBIO_PARITY_SCRUB,
-	BTRFS_RBIO_REBUILD_MISSING,
-};
-
-struct btrfs_raid_bio {
-	struct btrfs_io_context *bioc;
-
-	/* while we're doing rmw on a stripe
-	 * we put it into a hash table so we can
-	 * lock the stripe and merge more rbios
-	 * into it.
-	 */
-	struct list_head hash_list;
-
-	/*
-	 * LRU list for the stripe cache
-	 */
-	struct list_head stripe_cache;
-
-	/*
-	 * for scheduling work in the helper threads
-	 */
-	struct work_struct work;
-
-	/*
-	 * bio list and bio_list_lock are used
-	 * to add more bios into the stripe
-	 * in hopes of avoiding the full rmw
-	 */
-	struct bio_list bio_list;
-	spinlock_t bio_list_lock;
-
-	/* also protected by the bio_list_lock, the
-	 * plug list is used by the plugging code
-	 * to collect partial bios while plugged.  The
-	 * stripe locking code also uses it to hand off
-	 * the stripe lock to the next pending IO
-	 */
-	struct list_head plug_list;
-
-	/*
-	 * flags that tell us if it is safe to
-	 * merge with this bio
-	 */
-	unsigned long flags;
-
-	/*
-	 * set if we're doing a parity rebuild
-	 * for a read from higher up, which is handled
-	 * differently from a parity rebuild as part of
-	 * rmw
-	 */
-	enum btrfs_rbio_ops operation;
-
-	/* Size of each individual stripe on disk */
-	u32 stripe_len;
-
-	/* How many pages there are for the full stripe including P/Q */
-	u16 nr_pages;
-
-	/* How many sectors there are for the full stripe including P/Q */
-	u16 nr_sectors;
-
-	/* Number of data stripes (no p/q) */
-	u8 nr_data;
-
-	/* Number of all stripes (including P/Q) */
-	u8 real_stripes;
-
-	/* How many pages there are for each stripe */
-	u8 stripe_npages;
-
-	/* How many sectors there are for each stripe */
-	u8 stripe_nsectors;
-
-	/* First bad stripe, -1 means no corruption */
-	s8 faila;
-
-	/* Second bad stripe (for RAID6 use) */
-	s8 failb;
-
-	/* Stripe number that we're scrubbing  */
-	u8 scrubp;
-
-	/*
-	 * size of all the bios in the bio_list.  This
-	 * helps us decide if the rbio maps to a full
-	 * stripe or not
-	 */
-	int bio_list_bytes;
-
-	int generic_bio_cnt;
-
-	refcount_t refs;
-
-	atomic_t stripes_pending;
-
-	atomic_t error;
-
-	/* Bitmap to record which horizontal stripe has data */
-	unsigned long dbitmap;
-
-	/* Allocated with stripe_nsectors-many bits for finish_*() calls */
-	unsigned long finish_pbitmap;
-
-	/*
-	 * these are two arrays of pointers.  We allocate the
-	 * rbio big enough to hold them both and setup their
-	 * locations when the rbio is allocated
-	 */
-
-	/* pointers to pages that we allocated for
-	 * reading/writing stripes directly from the disk (including P/Q)
-	 */
-	struct page **stripe_pages;
-
-	/* Pointers to the sectors in the bio_list, for faster lookup */
-	struct sector_ptr *bio_sectors;
-
-	/*
-	 * For subpage support, we need to map each sector to above
-	 * stripe_pages.
-	 */
-	struct sector_ptr *stripe_sectors;
-
-	/* allocated with real_stripes-many pointers for finish_*() calls */
-	void **finish_pointers;
-};
-
 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
 static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
 static void rmw_work(struct work_struct *work);
@@ -1275,6 +1143,34 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
 	spin_unlock_irq(&rbio->bio_list_lock);
 }
 
+static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
+			       struct raid56_bio_trace_info *trace_info)
+{
+	const struct btrfs_io_context *bioc = rbio->bioc;
+	int i;
+
+	ASSERT(bioc);
+
+	/* We rely on bio->bi_bdev to find the stripe number. */
+	if (!bio->bi_bdev)
+		goto not_found;
+
+	for (i = 0; i < bioc->num_stripes; i++) {
+		if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
+			continue;
+		trace_info->stripe_nr = i;
+		trace_info->devid = bioc->stripes[i].dev->devid;
+		trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
+				     bioc->stripes[i].physical;
+		return;
+	}
+
+not_found:
+	trace_info->devid = -1;
+	trace_info->offset = -1;
+	trace_info->stripe_nr = -1;
+}
+
 /*
  * this is called from one of two situations.  We either
  * have a full stripe from the higher layers, or we've read all
@@ -1440,6 +1336,12 @@ write_data:
 	while ((bio = bio_list_pop(&bio_list))) {
 		bio->bi_end_io = raid_write_end_io;
 
+		if (trace_raid56_write_stripe_enabled()) {
+			struct raid56_bio_trace_info trace_info = { 0 };
+
+			bio_get_trace_info(rbio, bio, &trace_info);
+			trace_raid56_write_stripe(rbio, bio, &trace_info);
+		}
 		submit_bio(bio);
 	}
 	return;
@@ -1701,6 +1603,12 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
 
 		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
 
+		if (trace_raid56_read_partial_enabled()) {
+			struct raid56_bio_trace_info trace_info = { 0 };
+
+			bio_get_trace_info(rbio, bio, &trace_info);
+			trace_raid56_read_partial(rbio, bio, &trace_info);
+		}
 		submit_bio(bio);
 	}
 	/* the actual write will happen once the reads are done */
@@ -2274,6 +2182,12 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
 
 		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
 
+		if (trace_raid56_scrub_read_recover_enabled()) {
+			struct raid56_bio_trace_info trace_info = { 0 };
+
+			bio_get_trace_info(rbio, bio, &trace_info);
+			trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
+		}
 		submit_bio(bio);
 	}
 
@@ -2643,6 +2557,12 @@ submit_write:
 	while ((bio = bio_list_pop(&bio_list))) {
 		bio->bi_end_io = raid_write_end_io;
 
+		if (trace_raid56_scrub_write_stripe_enabled()) {
+			struct raid56_bio_trace_info trace_info = { 0 };
+
+			bio_get_trace_info(rbio, bio, &trace_info);
+			trace_raid56_scrub_write_stripe(rbio, bio, &trace_info);
+		}
 		submit_bio(bio);
 	}
 	return;
@@ -2822,6 +2742,12 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
 
 		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
 
+		if (trace_raid56_scrub_read_enabled()) {
+			struct raid56_bio_trace_info trace_info = { 0 };
+
+			bio_get_trace_info(rbio, bio, &trace_info);
+			trace_raid56_scrub_read(rbio, bio, &trace_info);
+		}
 		submit_bio(bio);
 	}
 	/* the actual write will happen once the reads are done */
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index aaad08aefd7d..3badde24dcbf 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -7,6 +7,152 @@
 #ifndef BTRFS_RAID56_H
 #define BTRFS_RAID56_H
 
+#include <linux/workqueue.h>
+#include "volumes.h"
+
+enum btrfs_rbio_ops {
+	BTRFS_RBIO_WRITE,
+	BTRFS_RBIO_READ_REBUILD,
+	BTRFS_RBIO_PARITY_SCRUB,
+	BTRFS_RBIO_REBUILD_MISSING,
+};
+
+struct btrfs_raid_bio {
+	struct btrfs_io_context *bioc;
+
+	/*
+	 * While we're doing RMW on a stripe we put it into a hash table so we
+	 * can lock the stripe and merge more rbios into it.
+	 */
+	struct list_head hash_list;
+
+	/* LRU list for the stripe cache */
+	struct list_head stripe_cache;
+
+	/* For scheduling work in the helper threads */
+	struct work_struct work;
+
+	/*
+	 * bio_list and bio_list_lock are used to add more bios into the stripe
+	 * in hopes of avoiding the full RMW
+	 */
+	struct bio_list bio_list;
+	spinlock_t bio_list_lock;
+
+	/*
+	 * Also protected by the bio_list_lock, the plug list is used by the
+	 * plugging code to collect partial bios while plugged.  The stripe
+	 * locking code also uses it to hand off the stripe lock to the next
+	 * pending IO.
+	 */
+	struct list_head plug_list;
+
+	/* Flags that tell us if it is safe to merge with this bio. */
+	unsigned long flags;
+
+	/*
+	 * Set if we're doing a parity rebuild for a read from higher up, which
+	 * is handled differently from a parity rebuild as part of RMW.
+	 */
+	enum btrfs_rbio_ops operation;
+
+	/* Size of each individual stripe on disk */
+	u32 stripe_len;
+
+	/* How many pages there are for the full stripe including P/Q */
+	u16 nr_pages;
+
+	/* How many sectors there are for the full stripe including P/Q */
+	u16 nr_sectors;
+
+	/* Number of data stripes (no p/q) */
+	u8 nr_data;
+
+	/* Numer of all stripes (including P/Q) */
+	u8 real_stripes;
+
+	/* How many pages there are for each stripe */
+	u8 stripe_npages;
+
+	/* How many sectors there are for each stripe */
+	u8 stripe_nsectors;
+
+	/* First bad stripe, -1 means no corruption */
+	s8 faila;
+
+	/* Second bad stripe (for RAID6 use) */
+	s8 failb;
+
+	/* Stripe number that we're scrubbing  */
+	u8 scrubp;
+
+	/*
+	 * Size of all the bios in the bio_list.  This helps us decide if the
+	 * rbio maps to a full stripe or not.
+	 */
+	int bio_list_bytes;
+
+	int generic_bio_cnt;
+
+	refcount_t refs;
+
+	atomic_t stripes_pending;
+
+	atomic_t error;
+
+	/* Bitmap to record which horizontal stripe has data */
+	unsigned long dbitmap;
+
+	/* Allocated with stripe_nsectors-many bits for finish_*() calls */
+	unsigned long finish_pbitmap;
+
+	/*
+	 * These are two arrays of pointers.  We allocate the rbio big enough
+	 * to hold them both and setup their locations when the rbio is
+	 * allocated.
+	 */
+
+	/*
+	 * Pointers to pages that we allocated for reading/writing stripes
+	 * directly from the disk (including P/Q).
+	 */
+	struct page **stripe_pages;
+
+	/* Pointers to the sectors in the bio_list, for faster lookup */
+	struct sector_ptr *bio_sectors;
+
+	/*
+	 * For subpage support, we need to map each sector to above
+	 * stripe_pages.
+	 */
+	struct sector_ptr *stripe_sectors;
+
+	/* Allocated with real_stripes-many pointers for finish_*() calls */
+	void **finish_pointers;
+};
+
+/*
+ * For trace event usage only. Records useful debug info for each bio submitted
+ * by RAID56 to each physical device.
+ *
+ * No matter signed or not, (-1) is always the one indicating we can not grab
+ * the proper stripe number.
+ */
+struct raid56_bio_trace_info {
+	u64 devid;
+
+	/* The offset inside the stripe. (<= STRIPE_LEN) */
+	u32 offset;
+
+	/*
+	 * Stripe number.
+	 * 0 is the first data stripe, and nr_data for P stripe,
+	 * nr_data + 1 for Q stripe.
+	 * >= real_stripes for
+	 */
+	u8 stripe_nr;
+};
+
 static inline int nr_parity_stripes(const struct map_lookup *map)
 {
 	if (map->type & BTRFS_BLOCK_GROUP_RAID5)
@@ -21,13 +167,13 @@ static inline int nr_data_stripes(const struct map_lookup *map)
 {
 	return map->num_stripes - nr_parity_stripes(map);
 }
+
 #define RAID5_P_STRIPE ((u64)-2)
 #define RAID6_Q_STRIPE ((u64)-1)
 
 #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) ||		\
 			     ((x) == RAID6_Q_STRIPE))
 
-struct btrfs_raid_bio;
 struct btrfs_device;
 
 int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 24b86061c5df..8539ee2dc79f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -48,6 +48,7 @@
 #include "block-group.h"
 #include "discard.h"
 #include "qgroup.h"
+#include "raid56.h"
 #define CREATE_TRACE_POINTS
 #include <trace/events/btrfs.h>
 
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 9ae94ef3e270..29fa8ea2cc0f 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -30,6 +30,8 @@ struct btrfs_qgroup;
 struct extent_io_tree;
 struct prelim_ref;
 struct btrfs_space_info;
+struct btrfs_raid_bio;
+struct raid56_bio_trace_info;
 
 #define show_ref_type(type)						\
 	__print_symbolic(type,						\
@@ -2258,6 +2260,98 @@ DEFINE_EVENT(btrfs__space_info_update, update_bytes_pinned,
 	TP_ARGS(fs_info, sinfo, old, diff)
 );
 
+DECLARE_EVENT_CLASS(btrfs_raid56_bio,
+
+	TP_PROTO(const struct btrfs_raid_bio *rbio,
+		 const struct bio *bio,
+		 const struct raid56_bio_trace_info *trace_info),
+
+	TP_ARGS(rbio, bio, trace_info),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,	full_stripe	)
+		__field(	u64,	physical	)
+		__field(	u64,	devid		)
+		__field(	u32,	offset		)
+		__field(	u32,	len		)
+		__field(	u8,	opf		)
+		__field(	u8,	total_stripes	)
+		__field(	u8,	real_stripes	)
+		__field(	u8,	nr_data		)
+		__field(	u8,	stripe_nr	)
+	),
+
+	TP_fast_assign_btrfs(rbio->bioc->fs_info,
+		__entry->full_stripe	= rbio->bioc->raid_map[0];
+		__entry->physical	= bio->bi_iter.bi_sector << SECTOR_SHIFT;
+		__entry->len		= bio->bi_iter.bi_size;
+		__entry->opf		= bio_op(bio);
+		__entry->devid		= trace_info->devid;
+		__entry->offset		= trace_info->offset;
+		__entry->stripe_nr	= trace_info->stripe_nr;
+		__entry->total_stripes	= rbio->bioc->num_stripes;
+		__entry->real_stripes	= rbio->real_stripes;
+		__entry->nr_data	= rbio->nr_data;
+	),
+	/*
+	 * For type output, we need to output things like "DATA1"
+	 * (the first data stripe), "DATA2" (the second data stripe),
+	 * "PQ1" (P stripe),"PQ2" (Q stripe), "REPLACE0" (replace target device).
+	 */
+	TP_printk_btrfs(
+"full_stripe=%llu devid=%lld type=%s%d offset=%d opf=0x%x physical=%llu len=%u",
+		__entry->full_stripe, __entry->devid,
+		(__entry->stripe_nr < __entry->nr_data) ? "DATA" :
+			((__entry->stripe_nr < __entry->real_stripes) ? "PQ" :
+			 "REPLACE"),
+		(__entry->stripe_nr < __entry->nr_data) ?
+			(__entry->stripe_nr + 1) :
+			((__entry->stripe_nr < __entry->real_stripes) ?
+			 (__entry->stripe_nr - __entry->nr_data + 1) : 0),
+		__entry->offset, __entry->opf, __entry->physical, __entry->len)
+);
+
+DEFINE_EVENT(btrfs_raid56_bio, raid56_read_partial,
+	TP_PROTO(const struct btrfs_raid_bio *rbio,
+		 const struct bio *bio,
+		 const struct raid56_bio_trace_info *trace_info),
+
+	TP_ARGS(rbio, bio, trace_info)
+);
+
+DEFINE_EVENT(btrfs_raid56_bio, raid56_write_stripe,
+	TP_PROTO(const struct btrfs_raid_bio *rbio,
+		 const struct bio *bio,
+		 const struct raid56_bio_trace_info *trace_info),
+
+	TP_ARGS(rbio, bio, trace_info)
+);
+
+
+DEFINE_EVENT(btrfs_raid56_bio, raid56_scrub_write_stripe,
+	TP_PROTO(const struct btrfs_raid_bio *rbio,
+		 const struct bio *bio,
+		 const struct raid56_bio_trace_info *trace_info),
+
+	TP_ARGS(rbio, bio, trace_info)
+);
+
+DEFINE_EVENT(btrfs_raid56_bio, raid56_scrub_read,
+	TP_PROTO(const struct btrfs_raid_bio *rbio,
+		 const struct bio *bio,
+		 const struct raid56_bio_trace_info *trace_info),
+
+	TP_ARGS(rbio, bio, trace_info)
+);
+
+DEFINE_EVENT(btrfs_raid56_bio, raid56_scrub_read_recover,
+	TP_PROTO(const struct btrfs_raid_bio *rbio,
+		 const struct bio *bio,
+		 const struct raid56_bio_trace_info *trace_info),
+
+	TP_ARGS(rbio, bio, trace_info)
+);
+
 #endif /* _TRACE_BTRFS_H */
 
 /* This part must be outside protection */
-- 
cgit 


From b7c14f23fb604fc66edae7514ed9b4b93930b5ba Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Thu, 17 Mar 2022 10:25:39 -0700
Subject: btrfs: send: add stream v2 definitions

This adds the definitions of the new commands for send stream version 2
and their respective attributes: fallocate, FS_IOC_SETFLAGS (a.k.a.
chattr), and encoded writes. It also documents two changes to the send
stream format in v2: the receiver shouldn't assume a maximum command
size, and the DATA attribute is encoded differently to allow for writes
larger than 64k. These will be implemented in subsequent changes, and
then the ioctl will accept the new version and flag.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/send.c            |  2 +-
 fs/btrfs/send.h            | 40 ++++++++++++++++++++++++++++++++++++----
 include/uapi/linux/btrfs.h |  7 +++++++
 3 files changed, 44 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 237753860758..6ec31736c522 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -7552,7 +7552,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
 
 	sctx->clone_roots_cnt = arg->clone_sources_count;
 
-	sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
+	sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1;
 	sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
 	if (!sctx->send_buf) {
 		ret = -ENOMEM;
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index c47a2984aa5b..858ce8132614 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -12,7 +12,11 @@
 #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
 #define BTRFS_SEND_STREAM_VERSION 1
 
-#define BTRFS_SEND_BUF_SIZE SZ_64K
+/*
+ * In send stream v1, no command is larger than 64K. In send stream v2, no limit
+ * should be assumed.
+ */
+#define BTRFS_SEND_BUF_SIZE_V1				SZ_64K
 
 enum btrfs_tlv_type {
 	BTRFS_TLV_U8,
@@ -80,16 +84,20 @@ enum btrfs_send_cmd {
 	BTRFS_SEND_C_MAX_V1		= 22,
 
 	/* Version 2 */
-	BTRFS_SEND_C_MAX_V2		= 22,
+	BTRFS_SEND_C_FALLOCATE		= 23,
+	BTRFS_SEND_C_SETFLAGS		= 24,
+	BTRFS_SEND_C_ENCODED_WRITE	= 25,
+	BTRFS_SEND_C_MAX_V2		= 25,
 
 	/* End */
-	BTRFS_SEND_C_MAX		= 22,
+	BTRFS_SEND_C_MAX		= 25,
 };
 
 /* attributes in send stream */
 enum {
 	BTRFS_SEND_A_UNSPEC		= 0,
 
+	/* Version 1 */
 	BTRFS_SEND_A_UUID		= 1,
 	BTRFS_SEND_A_CTRANSID		= 2,
 
@@ -112,6 +120,11 @@ enum {
 	BTRFS_SEND_A_PATH_LINK		= 17,
 
 	BTRFS_SEND_A_FILE_OFFSET	= 18,
+	/*
+	 * As of send stream v2, this attribute is special: it must be the last
+	 * attribute in a command, its header contains only the type, and its
+	 * length is implicitly the remaining length of the command.
+	 */
 	BTRFS_SEND_A_DATA		= 19,
 
 	BTRFS_SEND_A_CLONE_UUID		= 20,
@@ -120,7 +133,26 @@ enum {
 	BTRFS_SEND_A_CLONE_OFFSET	= 23,
 	BTRFS_SEND_A_CLONE_LEN		= 24,
 
-	BTRFS_SEND_A_MAX		= 24,
+	BTRFS_SEND_A_MAX_V1		= 24,
+
+	/* Version 2 */
+	BTRFS_SEND_A_FALLOCATE_MODE	= 25,
+
+	BTRFS_SEND_A_SETFLAGS_FLAGS	= 26,
+
+	BTRFS_SEND_A_UNENCODED_FILE_LEN	= 27,
+	BTRFS_SEND_A_UNENCODED_LEN	= 28,
+	BTRFS_SEND_A_UNENCODED_OFFSET	= 29,
+	/*
+	 * COMPRESSION and ENCRYPTION default to NONE (0) if omitted from
+	 * BTRFS_SEND_C_ENCODED_WRITE.
+	 */
+	BTRFS_SEND_A_COMPRESSION	= 30,
+	BTRFS_SEND_A_ENCRYPTION		= 31,
+	BTRFS_SEND_A_MAX_V2		= 31,
+
+	/* End */
+	BTRFS_SEND_A_MAX		= 31,
 };
 
 #ifdef __KERNEL__
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index d956b2993970..b6f26a434b10 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -777,6 +777,13 @@ struct btrfs_ioctl_received_subvol_args {
  */
 #define BTRFS_SEND_FLAG_VERSION			0x8
 
+/*
+ * Send compressed data using the ENCODED_WRITE command instead of decompressing
+ * the data and sending it with the WRITE command. This requires protocol
+ * version >= 2.
+ */
+#define BTRFS_SEND_FLAG_COMPRESSED		0x10
+
 #define BTRFS_SEND_FLAG_MASK \
 	(BTRFS_SEND_FLAG_NO_FILE_DATA | \
 	 BTRFS_SEND_FLAG_OMIT_STREAM_HEADER | \
-- 
cgit 


From d6815592806f481244d0e3435ca1f5383d90a14c Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Thu, 17 Mar 2022 10:25:43 -0700
Subject: btrfs: send: enable support for stream v2 and compressed writes

Now that the new support is implemented, allow the ioctl to accept v2
and the compressed flag, and update the version in sysfs.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/send.c            | 7 +++++--
 fs/btrfs/send.h            | 2 +-
 include/uapi/linux/btrfs.h | 3 ++-
 3 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index bc00393c1233..6d01dc26d408 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -701,8 +701,7 @@ static int send_header(struct send_ctx *sctx)
 	struct btrfs_stream_header hdr;
 
 	strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
-	hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
-
+	hdr.version = cpu_to_le32(sctx->proto);
 	return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
 					&sctx->send_off);
 }
@@ -7755,6 +7754,10 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
 	} else {
 		sctx->proto = 1;
 	}
+	if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < 2) {
+		ret = -EINVAL;
+		goto out;
+	}
 
 	sctx->send_filp = fget(arg->send_fd);
 	if (!sctx->send_filp) {
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 858ce8132614..b0dc07567d09 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -10,7 +10,7 @@
 #include "ctree.h"
 
 #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
-#define BTRFS_SEND_STREAM_VERSION 1
+#define BTRFS_SEND_STREAM_VERSION 2
 
 /*
  * In send stream v1, no command is larger than 64K. In send stream v2, no limit
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index b6f26a434b10..f54dc91e4025 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -788,7 +788,8 @@ struct btrfs_ioctl_received_subvol_args {
 	(BTRFS_SEND_FLAG_NO_FILE_DATA | \
 	 BTRFS_SEND_FLAG_OMIT_STREAM_HEADER | \
 	 BTRFS_SEND_FLAG_OMIT_END_CMD | \
-	 BTRFS_SEND_FLAG_VERSION)
+	 BTRFS_SEND_FLAG_VERSION | \
+	 BTRFS_SEND_FLAG_COMPRESSED)
 
 struct btrfs_ioctl_send_args {
 	__s64 send_fd;			/* in */
-- 
cgit 


From 5bea2508811ec76105b01c90c1f1661024c257a9 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Thu, 9 Jun 2022 09:28:04 -0700
Subject: btrfs: add tracepoints for ordered extents

When debugging a reference counting issue with ordered extents, I've found
we're lacking a lot of tracepoint coverage in the ordered extent code.

Close these gaps by adding tracepoints after every refcount_inc() in the
ordered extent code.

Reviewed-by: Boris Burkov <boris@bur.io>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ordered-data.c      | 19 ++++++++++---
 include/trace/events/btrfs.h | 64 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index dc88d2b3721f..41b3bc44c92b 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -401,6 +401,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
 			set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
 			cond_wake_up(&entry->wait);
 			refcount_inc(&entry->refs);
+			trace_btrfs_ordered_extent_mark_finished(inode, entry);
 			spin_unlock_irqrestore(&tree->lock, flags);
 			btrfs_init_work(&entry->work, finish_func, NULL, NULL);
 			btrfs_queue_work(wq, &entry->work);
@@ -473,6 +474,7 @@ out:
 	if (finished && cached && entry) {
 		*cached = entry;
 		refcount_inc(&entry->refs);
+		trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
 	}
 	spin_unlock_irqrestore(&tree->lock, flags);
 	return finished;
@@ -807,8 +809,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
 	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
 	if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
 		entry = NULL;
-	if (entry)
+	if (entry) {
 		refcount_inc(&entry->refs);
+		trace_btrfs_ordered_extent_lookup(inode, entry);
+	}
 out:
 	spin_unlock_irqrestore(&tree->lock, flags);
 	return entry;
@@ -848,8 +852,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
 			break;
 	}
 out:
-	if (entry)
+	if (entry) {
 		refcount_inc(&entry->refs);
+		trace_btrfs_ordered_extent_lookup_range(inode, entry);
+	}
 	spin_unlock_irq(&tree->lock);
 	return entry;
 }
@@ -878,6 +884,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
 		ASSERT(list_empty(&ordered->log_list));
 		list_add_tail(&ordered->log_list, list);
 		refcount_inc(&ordered->refs);
+		trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
 	}
 	spin_unlock_irq(&tree->lock);
 }
@@ -901,6 +908,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
 
 	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
 	refcount_inc(&entry->refs);
+	trace_btrfs_ordered_extent_lookup_first(inode, entry);
 out:
 	spin_unlock_irq(&tree->lock);
 	return entry;
@@ -975,8 +983,11 @@ struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
 	/* No ordered extent in the range */
 	entry = NULL;
 out:
-	if (entry)
+	if (entry) {
 		refcount_inc(&entry->refs);
+		trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
+	}
+
 	spin_unlock_irq(&tree->lock);
 	return entry;
 }
@@ -1055,6 +1066,8 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	int ret = 0;
 
+	trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered);
+
 	spin_lock_irq(&tree->lock);
 	/* Remove from tree once */
 	node = &ordered->rb_node;
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 29fa8ea2cc0f..73df80d462dc 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -598,6 +598,70 @@ DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_put,
 	TP_ARGS(inode, ordered)
 );
 
+DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup,
+
+	     TP_PROTO(const struct btrfs_inode *inode,
+		      const struct btrfs_ordered_extent *ordered),
+
+	     TP_ARGS(inode, ordered)
+);
+
+DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup_range,
+
+	     TP_PROTO(const struct btrfs_inode *inode,
+		      const struct btrfs_ordered_extent *ordered),
+
+	     TP_ARGS(inode, ordered)
+);
+
+DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup_first_range,
+
+	     TP_PROTO(const struct btrfs_inode *inode,
+		      const struct btrfs_ordered_extent *ordered),
+
+	     TP_ARGS(inode, ordered)
+);
+
+DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup_for_logging,
+
+	     TP_PROTO(const struct btrfs_inode *inode,
+		      const struct btrfs_ordered_extent *ordered),
+
+	     TP_ARGS(inode, ordered)
+);
+
+DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup_first,
+
+	     TP_PROTO(const struct btrfs_inode *inode,
+		      const struct btrfs_ordered_extent *ordered),
+
+	     TP_ARGS(inode, ordered)
+);
+
+DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_split,
+
+	     TP_PROTO(const struct btrfs_inode *inode,
+		      const struct btrfs_ordered_extent *ordered),
+
+	     TP_ARGS(inode, ordered)
+);
+
+DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_dec_test_pending,
+
+	     TP_PROTO(const struct btrfs_inode *inode,
+		      const struct btrfs_ordered_extent *ordered),
+
+	     TP_ARGS(inode, ordered)
+);
+
+DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_mark_finished,
+
+	     TP_PROTO(const struct btrfs_inode *inode,
+		      const struct btrfs_ordered_extent *ordered),
+
+	     TP_ARGS(inode, ordered)
+);
+
 DECLARE_EVENT_CLASS(btrfs__writepage,
 
 	TP_PROTO(const struct page *page, const struct inode *inode,
-- 
cgit 


From 39ade048a32ea653b94dcfbf816b0b13a6be8a33 Mon Sep 17 00:00:00 2001
From: "Fabio M. De Francesco" <fmdefrancesco@gmail.com>
Date: Wed, 6 Jul 2022 13:15:19 +0200
Subject: highmem: Make __kunmap_{local,atomic}() take const void pointer

__kunmap_ {local,atomic}() currently take pointers to void. However, this
is semantically incorrect, since these functions do not change the memory
their arguments point to.

Therefore, make this semantics explicit by modifying the
__kunmap_{local,atomic}() prototypes to take pointers to const void.

As a side effect, compilers may produce more efficient code.

Acked-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Helge Deller <deller@gmx.de>  # parisc
Suggested-by: David Sterba <dsterba@suse.cz>
Suggested-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Fabio M. De Francesco <fmdefrancesco@gmail.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 arch/parisc/include/asm/cacheflush.h |  6 +++---
 arch/parisc/kernel/cache.c           |  2 +-
 include/linux/highmem-internal.h     | 10 +++++-----
 mm/highmem.c                         |  2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h
index 8d03b3b26229..0bdee6724132 100644
--- a/arch/parisc/include/asm/cacheflush.h
+++ b/arch/parisc/include/asm/cacheflush.h
@@ -22,7 +22,7 @@ void flush_kernel_icache_range_asm(unsigned long, unsigned long);
 void flush_user_dcache_range_asm(unsigned long, unsigned long);
 void flush_kernel_dcache_range_asm(unsigned long, unsigned long);
 void purge_kernel_dcache_range_asm(unsigned long, unsigned long);
-void flush_kernel_dcache_page_asm(void *);
+void flush_kernel_dcache_page_asm(const void *addr);
 void flush_kernel_icache_page(void *);
 
 /* Cache flush operations */
@@ -31,7 +31,7 @@ void flush_cache_all_local(void);
 void flush_cache_all(void);
 void flush_cache_mm(struct mm_struct *mm);
 
-void flush_kernel_dcache_page_addr(void *addr);
+void flush_kernel_dcache_page_addr(const void *addr);
 
 #define flush_kernel_dcache_range(start,size) \
 	flush_kernel_dcache_range_asm((start), (start)+(size));
@@ -75,7 +75,7 @@ void flush_dcache_page_asm(unsigned long phys_addr, unsigned long vaddr);
 void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr);
 
 #define ARCH_HAS_FLUSH_ON_KUNMAP
-static inline void kunmap_flush_on_unmap(void *addr)
+static inline void kunmap_flush_on_unmap(const void *addr)
 {
 	flush_kernel_dcache_page_addr(addr);
 }
diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c
index a9bc578e4c52..993999a65e54 100644
--- a/arch/parisc/kernel/cache.c
+++ b/arch/parisc/kernel/cache.c
@@ -549,7 +549,7 @@ extern void purge_kernel_dcache_page_asm(unsigned long);
 extern void clear_user_page_asm(void *, unsigned long);
 extern void copy_user_page_asm(void *, void *, unsigned long);
 
-void flush_kernel_dcache_page_addr(void *addr)
+void flush_kernel_dcache_page_addr(const void *addr)
 {
 	unsigned long flags;
 
diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h
index cddb42ff0473..034b1106d022 100644
--- a/include/linux/highmem-internal.h
+++ b/include/linux/highmem-internal.h
@@ -8,7 +8,7 @@
 #ifdef CONFIG_KMAP_LOCAL
 void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot);
 void *__kmap_local_page_prot(struct page *page, pgprot_t prot);
-void kunmap_local_indexed(void *vaddr);
+void kunmap_local_indexed(const void *vaddr);
 void kmap_local_fork(struct task_struct *tsk);
 void __kmap_local_sched_out(void);
 void __kmap_local_sched_in(void);
@@ -89,7 +89,7 @@ static inline void *kmap_local_pfn(unsigned long pfn)
 	return __kmap_local_pfn_prot(pfn, kmap_prot);
 }
 
-static inline void __kunmap_local(void *vaddr)
+static inline void __kunmap_local(const void *vaddr)
 {
 	kunmap_local_indexed(vaddr);
 }
@@ -121,7 +121,7 @@ static inline void *kmap_atomic_pfn(unsigned long pfn)
 	return __kmap_local_pfn_prot(pfn, kmap_prot);
 }
 
-static inline void __kunmap_atomic(void *addr)
+static inline void __kunmap_atomic(const void *addr)
 {
 	kunmap_local_indexed(addr);
 	pagefault_enable();
@@ -197,7 +197,7 @@ static inline void *kmap_local_pfn(unsigned long pfn)
 	return kmap_local_page(pfn_to_page(pfn));
 }
 
-static inline void __kunmap_local(void *addr)
+static inline void __kunmap_local(const void *addr)
 {
 #ifdef ARCH_HAS_FLUSH_ON_KUNMAP
 	kunmap_flush_on_unmap(addr);
@@ -224,7 +224,7 @@ static inline void *kmap_atomic_pfn(unsigned long pfn)
 	return kmap_atomic(pfn_to_page(pfn));
 }
 
-static inline void __kunmap_atomic(void *addr)
+static inline void __kunmap_atomic(const void *addr)
 {
 #ifdef ARCH_HAS_FLUSH_ON_KUNMAP
 	kunmap_flush_on_unmap(addr);
diff --git a/mm/highmem.c b/mm/highmem.c
index 1a692997fac4..e32083e4ce0d 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -561,7 +561,7 @@ void *__kmap_local_page_prot(struct page *page, pgprot_t prot)
 }
 EXPORT_SYMBOL(__kmap_local_page_prot);
 
-void kunmap_local_indexed(void *vaddr)
+void kunmap_local_indexed(const void *vaddr)
 {
 	unsigned long addr = (unsigned long) vaddr & PAGE_MASK;
 	pte_t *kmap_pte;
-- 
cgit 


From 65ea1b66482f415d51cd46515b02477257330339 Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naohiro.aota@wdc.com>
Date: Sat, 9 Jul 2022 08:18:38 +0900
Subject: block: add bdev_max_segments() helper

Add bdev_max_segments() like other queue parameters.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 include/linux/blkdev.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2f7b43444c5f..62e3ff52ab03 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1206,6 +1206,11 @@ bdev_max_zone_append_sectors(struct block_device *bdev)
 	return queue_max_zone_append_sectors(bdev_get_queue(bdev));
 }
 
+static inline unsigned int bdev_max_segments(struct block_device *bdev)
+{
+	return queue_max_segments(bdev_get_queue(bdev));
+}
+
 static inline unsigned queue_logical_block_size(const struct request_queue *q)
 {
 	int retval = 512;
-- 
cgit 


From 44abdd1646e1fbfb781972c0bffc90b4eb3e87b3 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Fri, 22 Jul 2022 19:02:51 -0700
Subject: vfio: Pass in starting IOVA to vfio_pin/unpin_pages API

The vfio_pin/unpin_pages() so far accepted arrays of PFNs of user IOVA.
Among all three callers, there was only one caller possibly passing in
a non-contiguous PFN list, which is now ensured to have contiguous PFN
inputs too.

Pass in the starting address with "iova" alone to simplify things, so
callers no longer need to maintain a PFN list or to pin/unpin one page
at a time. This also allows VFIO to use more efficient implementations
of pin/unpin_pages.

For now, also update vfio_iommu_type1 to fit this new parameter too,
while keeping its input intact (being user_iova) since we don't want
to spend too much effort swapping its parameters and local variables
at that level.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Tony Krowiak <akrowiak@linux.ibm.com>
Acked-by: Eric Farman <farman@linux.ibm.com>
Tested-by: Terrence Xu <terrence.xu@intel.com>
Tested-by: Eric Farman <farman@linux.ibm.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Link: https://lore.kernel.org/r/20220723020256.30081-6-nicolinc@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 Documentation/driver-api/vfio-mediated-device.rst |  4 ++--
 drivers/gpu/drm/i915/gvt/kvmgt.c                  | 18 +++++----------
 drivers/s390/cio/vfio_ccw_cp.c                    |  4 ++--
 drivers/s390/crypto/vfio_ap_ops.c                 |  9 ++++----
 drivers/vfio/vfio.c                               | 27 ++++++++++-------------
 drivers/vfio/vfio.h                               |  4 ++--
 drivers/vfio/vfio_iommu_type1.c                   | 15 ++++++-------
 include/linux/vfio.h                              |  5 ++---
 8 files changed, 37 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst
index b0fdf76b339a..ea32a0f13ddb 100644
--- a/Documentation/driver-api/vfio-mediated-device.rst
+++ b/Documentation/driver-api/vfio-mediated-device.rst
@@ -262,10 +262,10 @@ Translation APIs for Mediated Devices
 The following APIs are provided for translating user pfn to host pfn in a VFIO
 driver::
 
-	int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
+	int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
 				  int npage, int prot, unsigned long *phys_pfn);
 
-	void vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
+	void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova,
 				    int npage);
 
 These functions call back into the back-end IOMMU module by using the pin_pages
diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index 2fee5695515a..8be75c282611 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -231,14 +231,8 @@ static void intel_gvt_cleanup_vgpu_type_groups(struct intel_gvt *gvt)
 static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
 		unsigned long size)
 {
-	int total_pages = DIV_ROUND_UP(size, PAGE_SIZE);
-	int npage;
-
-	for (npage = 0; npage < total_pages; npage++) {
-		unsigned long cur_gfn = gfn + npage;
-
-		vfio_unpin_pages(&vgpu->vfio_device, &cur_gfn, 1);
-	}
+	vfio_unpin_pages(&vgpu->vfio_device, gfn << PAGE_SHIFT,
+			 DIV_ROUND_UP(size, PAGE_SIZE));
 }
 
 /* Pin a normal or compound guest page for dma. */
@@ -255,14 +249,14 @@ static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
 	 * on stack to hold pfns.
 	 */
 	for (npage = 0; npage < total_pages; npage++) {
-		unsigned long cur_gfn = gfn + npage;
+		dma_addr_t cur_iova = (gfn + npage) << PAGE_SHIFT;
 		unsigned long pfn;
 
-		ret = vfio_pin_pages(&vgpu->vfio_device, &cur_gfn, 1,
+		ret = vfio_pin_pages(&vgpu->vfio_device, cur_iova, 1,
 				     IOMMU_READ | IOMMU_WRITE, &pfn);
 		if (ret != 1) {
-			gvt_vgpu_err("vfio_pin_pages failed for gfn 0x%lx, ret %d\n",
-				     cur_gfn, ret);
+			gvt_vgpu_err("vfio_pin_pages failed for iova %pad, ret %d\n",
+				     &cur_iova, ret);
 			goto err;
 		}
 
diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c
index 3b94863ad24e..a739262f988d 100644
--- a/drivers/s390/cio/vfio_ccw_cp.c
+++ b/drivers/s390/cio/vfio_ccw_cp.c
@@ -114,7 +114,7 @@ static void pfn_array_unpin(struct pfn_array *pa,
 			continue;
 		}
 
-		vfio_unpin_pages(vdev, first, npage);
+		vfio_unpin_pages(vdev, *first << PAGE_SHIFT, npage);
 		unpinned += npage;
 		npage = 1;
 	}
@@ -146,7 +146,7 @@ static int pfn_array_pin(struct pfn_array *pa, struct vfio_device *vdev)
 			continue;
 		}
 
-		ret = vfio_pin_pages(vdev, first, npage,
+		ret = vfio_pin_pages(vdev, *first << PAGE_SHIFT, npage,
 				     IOMMU_READ | IOMMU_WRITE,
 				     &pa->pa_pfn[pinned]);
 		if (ret < 0) {
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index 5781059d3ed2..70f484668ca0 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -124,7 +124,7 @@ static void vfio_ap_free_aqic_resources(struct vfio_ap_queue *q)
 		q->saved_isc = VFIO_AP_ISC_INVALID;
 	}
 	if (q->saved_pfn && !WARN_ON(!q->matrix_mdev)) {
-		vfio_unpin_pages(&q->matrix_mdev->vdev, &q->saved_pfn, 1);
+		vfio_unpin_pages(&q->matrix_mdev->vdev, q->saved_pfn << PAGE_SHIFT, 1);
 		q->saved_pfn = 0;
 	}
 }
@@ -258,7 +258,7 @@ static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q,
 		return status;
 	}
 
-	ret = vfio_pin_pages(&q->matrix_mdev->vdev, &g_pfn, 1,
+	ret = vfio_pin_pages(&q->matrix_mdev->vdev, g_pfn << PAGE_SHIFT, 1,
 			     IOMMU_READ | IOMMU_WRITE, &h_pfn);
 	switch (ret) {
 	case 1:
@@ -301,7 +301,7 @@ static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q,
 		break;
 	case AP_RESPONSE_OTHERWISE_CHANGED:
 		/* We could not modify IRQ setings: clear new configuration */
-		vfio_unpin_pages(&q->matrix_mdev->vdev, &g_pfn, 1);
+		vfio_unpin_pages(&q->matrix_mdev->vdev, g_pfn << PAGE_SHIFT, 1);
 		kvm_s390_gisc_unregister(kvm, isc);
 		break;
 	default:
@@ -1232,9 +1232,8 @@ static void vfio_ap_mdev_dma_unmap(struct vfio_device *vdev, u64 iova,
 {
 	struct ap_matrix_mdev *matrix_mdev =
 		container_of(vdev, struct ap_matrix_mdev, vdev);
-	unsigned long g_pfn = iova >> PAGE_SHIFT;
 
-	vfio_unpin_pages(&matrix_mdev->vdev, &g_pfn, 1);
+	vfio_unpin_pages(&matrix_mdev->vdev, iova, 1);
 }
 
 /**
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 92b10aafae28..ffd1a492eea9 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1934,17 +1934,17 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
 
 /*
- * Pin a set of guest PFNs and return their associated host PFNs for local
+ * Pin contiguous user pages and return their associated host pages for local
  * domain only.
  * @device [in]  : device
- * @user_pfn [in]: array of user/guest PFNs to be pinned.
- * @npage [in]   : count of elements in user_pfn array.  This count should not
- *		   be greater VFIO_PIN_PAGES_MAX_ENTRIES.
+ * @iova [in]    : starting IOVA of user pages to be pinned.
+ * @npage [in]   : count of pages to be pinned.  This count should not
+ *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
  * @prot [in]    : protection flags
  * @phys_pfn[out]: array of host PFNs
  * Return error or number of pages pinned.
  */
-int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
+int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
 		   int npage, int prot, unsigned long *phys_pfn)
 {
 	struct vfio_container *container;
@@ -1952,8 +1952,7 @@ int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
 	struct vfio_iommu_driver *driver;
 	int ret;
 
-	if (!user_pfn || !phys_pfn || !npage ||
-	    !vfio_assert_device_open(device))
+	if (!phys_pfn || !npage || !vfio_assert_device_open(device))
 		return -EINVAL;
 
 	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
@@ -1967,7 +1966,7 @@ int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
 	driver = container->iommu_driver;
 	if (likely(driver && driver->ops->pin_pages))
 		ret = driver->ops->pin_pages(container->iommu_data,
-					     group->iommu_group, user_pfn,
+					     group->iommu_group, iova,
 					     npage, prot, phys_pfn);
 	else
 		ret = -ENOTTY;
@@ -1977,15 +1976,13 @@ int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
 EXPORT_SYMBOL(vfio_pin_pages);
 
 /*
- * Unpin set of host PFNs for local domain only.
+ * Unpin contiguous host pages for local domain only.
  * @device [in]  : device
- * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
- *		   PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
- * @npage [in]   : count of elements in user_pfn array.  This count should not
+ * @iova [in]    : starting address of user pages to be unpinned.
+ * @npage [in]   : count of pages to be unpinned.  This count should not
  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
  */
-void vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
-		      int npage)
+void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
 {
 	struct vfio_container *container;
 	struct vfio_iommu_driver *driver;
@@ -2000,7 +1997,7 @@ void vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
 	container = device->group->container;
 	driver = container->iommu_driver;
 
-	driver->ops->unpin_pages(container->iommu_data, user_pfn, npage);
+	driver->ops->unpin_pages(container->iommu_data, iova, npage);
 }
 EXPORT_SYMBOL(vfio_unpin_pages);
 
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index 6a8424b407c7..e9767e13f00f 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -50,11 +50,11 @@ struct vfio_iommu_driver_ops {
 					struct iommu_group *group);
 	int		(*pin_pages)(void *iommu_data,
 				     struct iommu_group *group,
-				     unsigned long *user_pfn,
+				     dma_addr_t user_iova,
 				     int npage, int prot,
 				     unsigned long *phys_pfn);
 	void		(*unpin_pages)(void *iommu_data,
-				       unsigned long *user_pfn, int npage);
+				       dma_addr_t user_iova, int npage);
 	void		(*register_device)(void *iommu_data,
 					   struct vfio_device *vdev);
 	void		(*unregister_device)(void *iommu_data,
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index e49fbe9968ef..e629e059118c 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -829,7 +829,7 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
 
 static int vfio_iommu_type1_pin_pages(void *iommu_data,
 				      struct iommu_group *iommu_group,
-				      unsigned long *user_pfn,
+				      dma_addr_t user_iova,
 				      int npage, int prot,
 				      unsigned long *phys_pfn)
 {
@@ -841,7 +841,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 	bool do_accounting;
 	dma_addr_t iova;
 
-	if (!iommu || !user_pfn || !phys_pfn)
+	if (!iommu || !phys_pfn)
 		return -EINVAL;
 
 	/* Supported for v2 version only */
@@ -857,7 +857,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 again:
 	if (iommu->vaddr_invalid_count) {
 		for (i = 0; i < npage; i++) {
-			iova = user_pfn[i] << PAGE_SHIFT;
+			iova = user_iova + PAGE_SIZE * i;
 			ret = vfio_find_dma_valid(iommu, iova, PAGE_SIZE, &dma);
 			if (ret < 0)
 				goto pin_done;
@@ -882,7 +882,7 @@ again:
 	for (i = 0; i < npage; i++) {
 		struct vfio_pfn *vpfn;
 
-		iova = user_pfn[i] << PAGE_SHIFT;
+		iova = user_iova + PAGE_SIZE * i;
 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
 		if (!dma) {
 			ret = -EINVAL;
@@ -939,7 +939,7 @@ pin_unwind:
 	for (j = 0; j < i; j++) {
 		dma_addr_t iova;
 
-		iova = user_pfn[j] << PAGE_SHIFT;
+		iova = user_iova + PAGE_SIZE * j;
 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
 		vfio_unpin_page_external(dma, iova, do_accounting);
 		phys_pfn[j] = 0;
@@ -950,7 +950,7 @@ pin_done:
 }
 
 static void vfio_iommu_type1_unpin_pages(void *iommu_data,
-					 unsigned long *user_pfn, int npage)
+					 dma_addr_t user_iova, int npage)
 {
 	struct vfio_iommu *iommu = iommu_data;
 	bool do_accounting;
@@ -964,10 +964,9 @@ static void vfio_iommu_type1_unpin_pages(void *iommu_data,
 
 	do_accounting = list_empty(&iommu->domain_list);
 	for (i = 0; i < npage; i++) {
+		dma_addr_t iova = user_iova + PAGE_SIZE * i;
 		struct vfio_dma *dma;
-		dma_addr_t iova;
 
-		iova = user_pfn[i] << PAGE_SHIFT;
 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
 		if (!dma)
 			break;
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 9f7d74c24925..9e3b6abcf890 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -161,10 +161,9 @@ bool vfio_file_has_dev(struct file *file, struct vfio_device *device);
 
 #define VFIO_PIN_PAGES_MAX_ENTRIES	(PAGE_SIZE/sizeof(unsigned long))
 
-int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
+int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
 		   int npage, int prot, unsigned long *phys_pfn);
-void vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
-		      int npage);
+void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage);
 int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova,
 		void *data, size_t len, bool write);
 
-- 
cgit 


From 8561aa4fb7d72011c2352af2b1b9caf588942181 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Fri, 22 Jul 2022 19:02:54 -0700
Subject: vfio: Rename user_iova of vfio_dma_rw()

Following the updated vfio_pin/unpin_pages(), use the simpler "iova".

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Tested-by: Terrence Xu <terrence.xu@intel.com>
Tested-by: Eric Farman <farman@linux.ibm.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Link: https://lore.kernel.org/r/20220723020256.30081-9-nicolinc@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/vfio.c  | 6 +++---
 include/linux/vfio.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index ffd1a492eea9..606a20b605ba 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -2012,13 +2012,13 @@ EXPORT_SYMBOL(vfio_unpin_pages);
  * not a real device DMA, it is not necessary to pin the user space memory.
  *
  * @device [in]		: VFIO device
- * @user_iova [in]	: base IOVA of a user space buffer
+ * @iova [in]		: base IOVA of a user space buffer
  * @data [in]		: pointer to kernel buffer
  * @len [in]		: kernel buffer length
  * @write		: indicate read or write
  * Return error code on failure or 0 on success.
  */
-int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data,
+int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
 		size_t len, bool write)
 {
 	struct vfio_container *container;
@@ -2034,7 +2034,7 @@ int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data,
 
 	if (likely(driver && driver->ops->dma_rw))
 		ret = driver->ops->dma_rw(container->iommu_data,
-					  user_iova, data, len, write);
+					  iova, data, len, write);
 	else
 		ret = -ENOTTY;
 	return ret;
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 9e3b6abcf890..acefd663e63b 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -164,7 +164,7 @@ bool vfio_file_has_dev(struct file *file, struct vfio_device *device);
 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
 		   int npage, int prot, unsigned long *phys_pfn);
 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage);
-int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova,
+int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova,
 		void *data, size_t len, bool write);
 
 /*
-- 
cgit 


From 34a255e67615995f729254307a0581c143e03752 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Fri, 22 Jul 2022 19:02:56 -0700
Subject: vfio: Replace phys_pfn with pages for vfio_pin_pages()

Most of the callers of vfio_pin_pages() want "struct page *" and the
low-level mm code to pin pages returns a list of "struct page *" too.
So there's no gain in converting "struct page *" to PFN in between.

Replace the output parameter "phys_pfn" list with a "pages" list, to
simplify callers. This also allows us to replace the vfio_iommu_type1
implementation with a more efficient one.

And drop the pfn_valid check in the gvt code, as there is no need to
do such a check at a page-backed struct page pointer.

For now, also update vfio_iommu_type1 to fit this new parameter too.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Acked-by: Eric Farman <farman@linux.ibm.com>
Tested-by: Terrence Xu <terrence.xu@intel.com>
Tested-by: Eric Farman <farman@linux.ibm.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Link: https://lore.kernel.org/r/20220723020256.30081-11-nicolinc@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 Documentation/driver-api/vfio-mediated-device.rst |  2 +-
 drivers/gpu/drm/i915/gvt/kvmgt.c                  | 19 ++++++-------------
 drivers/s390/cio/vfio_ccw_cp.c                    | 19 +++++++++----------
 drivers/s390/crypto/vfio_ap_ops.c                 |  6 +++---
 drivers/vfio/vfio.c                               |  8 ++++----
 drivers/vfio/vfio.h                               |  2 +-
 drivers/vfio/vfio_iommu_type1.c                   | 19 +++++++++++--------
 include/linux/vfio.h                              |  2 +-
 8 files changed, 36 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst
index ea32a0f13ddb..ba5fefcdae1a 100644
--- a/Documentation/driver-api/vfio-mediated-device.rst
+++ b/Documentation/driver-api/vfio-mediated-device.rst
@@ -263,7 +263,7 @@ The following APIs are provided for translating user pfn to host pfn in a VFIO
 driver::
 
 	int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
-				  int npage, int prot, unsigned long *phys_pfn);
+				  int npage, int prot, struct page **pages);
 
 	void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova,
 				    int npage);
diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index 8be75c282611..e3cd58946477 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -240,7 +240,7 @@ static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
 		unsigned long size, struct page **page)
 {
 	int total_pages = DIV_ROUND_UP(size, PAGE_SIZE);
-	unsigned long base_pfn = 0;
+	struct page *base_page = NULL;
 	int npage;
 	int ret;
 
@@ -250,26 +250,19 @@ static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
 	 */
 	for (npage = 0; npage < total_pages; npage++) {
 		dma_addr_t cur_iova = (gfn + npage) << PAGE_SHIFT;
-		unsigned long pfn;
+		struct page *cur_page;
 
 		ret = vfio_pin_pages(&vgpu->vfio_device, cur_iova, 1,
-				     IOMMU_READ | IOMMU_WRITE, &pfn);
+				     IOMMU_READ | IOMMU_WRITE, &cur_page);
 		if (ret != 1) {
 			gvt_vgpu_err("vfio_pin_pages failed for iova %pad, ret %d\n",
 				     &cur_iova, ret);
 			goto err;
 		}
 
-		if (!pfn_valid(pfn)) {
-			gvt_vgpu_err("pfn 0x%lx is not mem backed\n", pfn);
-			npage++;
-			ret = -EFAULT;
-			goto err;
-		}
-
 		if (npage == 0)
-			base_pfn = pfn;
-		else if (base_pfn + npage != pfn) {
+			base_page = cur_page;
+		else if (base_page + npage != cur_page) {
 			gvt_vgpu_err("The pages are not continuous\n");
 			ret = -EINVAL;
 			npage++;
@@ -277,7 +270,7 @@ static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
 		}
 	}
 
-	*page = pfn_to_page(base_pfn);
+	*page = base_page;
 	return 0;
 err:
 	gvt_unpin_guest_page(vgpu, gfn, npage * PAGE_SIZE);
diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c
index cd4ec4f6d6ff..8963f452f963 100644
--- a/drivers/s390/cio/vfio_ccw_cp.c
+++ b/drivers/s390/cio/vfio_ccw_cp.c
@@ -22,8 +22,8 @@
 struct page_array {
 	/* Array that stores pages need to pin. */
 	dma_addr_t		*pa_iova;
-	/* Array that receives PFNs of the pages pinned. */
-	unsigned long		*pa_pfn;
+	/* Array that receives the pinned pages. */
+	struct page		**pa_page;
 	/* Number of pages pinned from @pa_iova. */
 	int			pa_nr;
 };
@@ -68,19 +68,19 @@ static int page_array_alloc(struct page_array *pa, u64 iova, unsigned int len)
 		return -EINVAL;
 
 	pa->pa_iova = kcalloc(pa->pa_nr,
-			      sizeof(*pa->pa_iova) + sizeof(*pa->pa_pfn),
+			      sizeof(*pa->pa_iova) + sizeof(*pa->pa_page),
 			      GFP_KERNEL);
 	if (unlikely(!pa->pa_iova)) {
 		pa->pa_nr = 0;
 		return -ENOMEM;
 	}
-	pa->pa_pfn = (unsigned long *)&pa->pa_iova[pa->pa_nr];
+	pa->pa_page = (struct page **)&pa->pa_iova[pa->pa_nr];
 
 	pa->pa_iova[0] = iova;
-	pa->pa_pfn[0] = -1ULL;
+	pa->pa_page[0] = NULL;
 	for (i = 1; i < pa->pa_nr; i++) {
 		pa->pa_iova[i] = pa->pa_iova[i - 1] + PAGE_SIZE;
-		pa->pa_pfn[i] = -1ULL;
+		pa->pa_page[i] = NULL;
 	}
 
 	return 0;
@@ -144,7 +144,7 @@ static int page_array_pin(struct page_array *pa, struct vfio_device *vdev)
 
 		ret = vfio_pin_pages(vdev, *first, npage,
 				     IOMMU_READ | IOMMU_WRITE,
-				     &pa->pa_pfn[pinned]);
+				     &pa->pa_page[pinned]);
 		if (ret < 0) {
 			goto err_out;
 		} else if (ret > 0 && ret != npage) {
@@ -195,7 +195,7 @@ static inline void page_array_idal_create_words(struct page_array *pa,
 	 */
 
 	for (i = 0; i < pa->pa_nr; i++)
-		idaws[i] = pa->pa_pfn[i] << PAGE_SHIFT;
+		idaws[i] = page_to_phys(pa->pa_page[i]);
 
 	/* Adjust the first IDAW, since it may not start on a page boundary */
 	idaws[0] += pa->pa_iova[0] & (PAGE_SIZE - 1);
@@ -246,8 +246,7 @@ static long copy_from_iova(struct vfio_device *vdev, void *to, u64 iova,
 
 	l = n;
 	for (i = 0; i < pa.pa_nr; i++) {
-		struct page *page = pfn_to_page(pa.pa_pfn[i]);
-		void *from = kmap_local_page(page);
+		void *from = kmap_local_page(pa.pa_page[i]);
 
 		m = PAGE_SIZE;
 		if (i == 0) {
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index d7c38c82f694..75cd92c291e3 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -234,9 +234,9 @@ static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q,
 	struct ap_qirq_ctrl aqic_gisa = {};
 	struct ap_queue_status status = {};
 	struct kvm_s390_gisa *gisa;
+	struct page *h_page;
 	int nisc;
 	struct kvm *kvm;
-	unsigned long h_pfn;
 	phys_addr_t h_nib;
 	dma_addr_t nib;
 	int ret;
@@ -251,7 +251,7 @@ static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q,
 	}
 
 	ret = vfio_pin_pages(&q->matrix_mdev->vdev, nib, 1,
-			     IOMMU_READ | IOMMU_WRITE, &h_pfn);
+			     IOMMU_READ | IOMMU_WRITE, &h_page);
 	switch (ret) {
 	case 1:
 		break;
@@ -267,7 +267,7 @@ static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q,
 	kvm = q->matrix_mdev->kvm;
 	gisa = kvm->arch.gisa_int.origin;
 
-	h_nib = (h_pfn << PAGE_SHIFT) | (nib & ~PAGE_MASK);
+	h_nib = page_to_phys(h_page) | (nib & ~PAGE_MASK);
 	aqic_gisa.gisc = isc;
 
 	nisc = kvm_s390_gisc_register(kvm, isc);
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 606a20b605ba..8e23ca59ceed 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1941,18 +1941,18 @@ EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
  * @npage [in]   : count of pages to be pinned.  This count should not
  *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
  * @prot [in]    : protection flags
- * @phys_pfn[out]: array of host PFNs
+ * @pages[out]   : array of host pages
  * Return error or number of pages pinned.
  */
 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
-		   int npage, int prot, unsigned long *phys_pfn)
+		   int npage, int prot, struct page **pages)
 {
 	struct vfio_container *container;
 	struct vfio_group *group = device->group;
 	struct vfio_iommu_driver *driver;
 	int ret;
 
-	if (!phys_pfn || !npage || !vfio_assert_device_open(device))
+	if (!pages || !npage || !vfio_assert_device_open(device))
 		return -EINVAL;
 
 	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
@@ -1967,7 +1967,7 @@ int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
 	if (likely(driver && driver->ops->pin_pages))
 		ret = driver->ops->pin_pages(container->iommu_data,
 					     group->iommu_group, iova,
-					     npage, prot, phys_pfn);
+					     npage, prot, pages);
 	else
 		ret = -ENOTTY;
 
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index e9767e13f00f..503bea6c843d 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -52,7 +52,7 @@ struct vfio_iommu_driver_ops {
 				     struct iommu_group *group,
 				     dma_addr_t user_iova,
 				     int npage, int prot,
-				     unsigned long *phys_pfn);
+				     struct page **pages);
 	void		(*unpin_pages)(void *iommu_data,
 				       dma_addr_t user_iova, int npage);
 	void		(*register_device)(void *iommu_data,
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index e629e059118c..db516c90a977 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -831,7 +831,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 				      struct iommu_group *iommu_group,
 				      dma_addr_t user_iova,
 				      int npage, int prot,
-				      unsigned long *phys_pfn)
+				      struct page **pages)
 {
 	struct vfio_iommu *iommu = iommu_data;
 	struct vfio_iommu_group *group;
@@ -841,7 +841,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 	bool do_accounting;
 	dma_addr_t iova;
 
-	if (!iommu || !phys_pfn)
+	if (!iommu || !pages)
 		return -EINVAL;
 
 	/* Supported for v2 version only */
@@ -880,6 +880,7 @@ again:
 	do_accounting = list_empty(&iommu->domain_list);
 
 	for (i = 0; i < npage; i++) {
+		unsigned long phys_pfn;
 		struct vfio_pfn *vpfn;
 
 		iova = user_iova + PAGE_SIZE * i;
@@ -896,23 +897,25 @@ again:
 
 		vpfn = vfio_iova_get_vfio_pfn(dma, iova);
 		if (vpfn) {
-			phys_pfn[i] = vpfn->pfn;
+			pages[i] = pfn_to_page(vpfn->pfn);
 			continue;
 		}
 
 		remote_vaddr = dma->vaddr + (iova - dma->iova);
-		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
+		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn,
 					     do_accounting);
 		if (ret)
 			goto pin_unwind;
 
-		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
+		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn);
 		if (ret) {
-			if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
+			if (put_pfn(phys_pfn, dma->prot) && do_accounting)
 				vfio_lock_acct(dma, -1, true);
 			goto pin_unwind;
 		}
 
+		pages[i] = pfn_to_page(phys_pfn);
+
 		if (iommu->dirty_page_tracking) {
 			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
 
@@ -935,14 +938,14 @@ again:
 	goto pin_done;
 
 pin_unwind:
-	phys_pfn[i] = 0;
+	pages[i] = NULL;
 	for (j = 0; j < i; j++) {
 		dma_addr_t iova;
 
 		iova = user_iova + PAGE_SIZE * j;
 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
 		vfio_unpin_page_external(dma, iova, do_accounting);
-		phys_pfn[j] = 0;
+		pages[j] = NULL;
 	}
 pin_done:
 	mutex_unlock(&iommu->lock);
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index acefd663e63b..e05ddc6fe6a5 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -162,7 +162,7 @@ bool vfio_file_has_dev(struct file *file, struct vfio_device *device);
 #define VFIO_PIN_PAGES_MAX_ENTRIES	(PAGE_SIZE/sizeof(unsigned long))
 
 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
-		   int npage, int prot, unsigned long *phys_pfn);
+		   int npage, int prot, struct page **pages);
 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage);
 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova,
 		void *data, size_t len, bool write);
-- 
cgit 


From af468aadf00485a2f5e804fe97db4731bc7a9c24 Mon Sep 17 00:00:00 2001
From: Brent Lu <brent.lu@intel.com>
Date: Mon, 25 Jul 2022 14:53:43 -0500
Subject: ASoC: SOF: dai-intel: add SOF_DAI_INTEL_SSP_CLKCTRL_MCLK_AON bit

Update definition for mclk always-on feature and increase the
SOF_ABI_MINOR number for interface change.

Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Signed-off-by: Brent Lu <brent.lu@intel.com>
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Link: https://lore.kernel.org/r/20220725195343.145603-4-pierre-louis.bossart@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sof/dai-intel.h | 2 ++
 include/uapi/sound/sof/abi.h  | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/sound/sof/dai-intel.h b/include/sound/sof/dai-intel.h
index 7a266f41983c..5b93b7292f5e 100644
--- a/include/sound/sof/dai-intel.h
+++ b/include/sound/sof/dai-intel.h
@@ -52,6 +52,8 @@
 #define SOF_DAI_INTEL_SSP_CLKCTRL_MCLK_ES               BIT(6)
 /* bclk early start */
 #define SOF_DAI_INTEL_SSP_CLKCTRL_BCLK_ES               BIT(7)
+/* mclk always on */
+#define SOF_DAI_INTEL_SSP_CLKCTRL_MCLK_AON		BIT(8)
 
 /* DMIC max. four controllers for eight microphone channels */
 #define SOF_DAI_INTEL_DMIC_NUM_CTRL			4
diff --git a/include/uapi/sound/sof/abi.h b/include/uapi/sound/sof/abi.h
index b7dce4df7ecd..3566630ca965 100644
--- a/include/uapi/sound/sof/abi.h
+++ b/include/uapi/sound/sof/abi.h
@@ -28,7 +28,7 @@
 
 /* SOF ABI version major, minor and patch numbers */
 #define SOF_ABI_MAJOR 3
-#define SOF_ABI_MINOR 22
+#define SOF_ABI_MINOR 23
 #define SOF_ABI_PATCH 0
 
 /* SOF ABI version number. Format within 32bit word is MMmmmppp */
-- 
cgit 


From e948d32c54fa45cc1601c98956bdbbf5f17a3db2 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Sat, 23 Jul 2022 19:57:17 +0200
Subject: video: fbdev: imxfb: Drop platform data support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

No source file but the driver itself includes the header containing the
platform data definition. The last user is gone since commit
8485adf17a15 ("ARM: imx: Remove imx device directory").

So we can safely drop platform data support.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Helge Deller <deller@gmx.de>
---
 drivers/video/fbdev/imxfb.c               | 99 +++++++++++--------------------
 include/linux/platform_data/video-imxfb.h | 12 ----
 2 files changed, 34 insertions(+), 77 deletions(-)

(limited to 'include')

diff --git a/drivers/video/fbdev/imxfb.c b/drivers/video/fbdev/imxfb.c
index a2f644c97f28..85a5bf5639d9 100644
--- a/drivers/video/fbdev/imxfb.c
+++ b/drivers/video/fbdev/imxfb.c
@@ -656,7 +656,6 @@ static int imxfb_activate_var(struct fb_var_screeninfo *var, struct fb_info *inf
 
 static int imxfb_init_fbinfo(struct platform_device *pdev)
 {
-	struct imx_fb_platform_data *pdata = dev_get_platdata(&pdev->dev);
 	struct fb_info *info = platform_get_drvdata(pdev);
 	struct imxfb_info *fbi = info->par;
 	struct device_node *np;
@@ -690,25 +689,20 @@ static int imxfb_init_fbinfo(struct platform_device *pdev)
 	info->fbops			= &imxfb_ops;
 	info->flags			= FBINFO_FLAG_DEFAULT |
 					  FBINFO_READS_FAST;
-	if (pdata) {
-		fbi->lscr1			= pdata->lscr1;
-		fbi->dmacr			= pdata->dmacr;
-		fbi->pwmr			= pdata->pwmr;
-	} else {
-		np = pdev->dev.of_node;
-		info->var.grayscale = of_property_read_bool(np,
-						"cmap-greyscale");
-		fbi->cmap_inverse = of_property_read_bool(np, "cmap-inverse");
-		fbi->cmap_static = of_property_read_bool(np, "cmap-static");
 
-		fbi->lscr1 = IMXFB_LSCR1_DEFAULT;
+	np = pdev->dev.of_node;
+	info->var.grayscale = of_property_read_bool(np,
+					"cmap-greyscale");
+	fbi->cmap_inverse = of_property_read_bool(np, "cmap-inverse");
+	fbi->cmap_static = of_property_read_bool(np, "cmap-static");
 
-		of_property_read_u32(np, "fsl,lpccr", &fbi->pwmr);
+	fbi->lscr1 = IMXFB_LSCR1_DEFAULT;
 
-		of_property_read_u32(np, "fsl,lscr1", &fbi->lscr1);
+	of_property_read_u32(np, "fsl,lpccr", &fbi->pwmr);
 
-		of_property_read_u32(np, "fsl,dmacr", &fbi->dmacr);
-	}
+	of_property_read_u32(np, "fsl,lscr1", &fbi->lscr1);
+
+	of_property_read_u32(np, "fsl,dmacr", &fbi->dmacr);
 
 	return 0;
 }
@@ -863,10 +857,10 @@ static int imxfb_probe(struct platform_device *pdev)
 	struct imxfb_info *fbi;
 	struct lcd_device *lcd;
 	struct fb_info *info;
-	struct imx_fb_platform_data *pdata;
 	struct resource *res;
 	struct imx_fb_videomode *m;
 	const struct of_device_id *of_id;
+	struct device_node *display_np;
 	int ret, i;
 	int bytes_per_pixel;
 
@@ -884,8 +878,6 @@ static int imxfb_probe(struct platform_device *pdev)
 	if (!res)
 		return -ENODEV;
 
-	pdata = dev_get_platdata(&pdev->dev);
-
 	info = framebuffer_alloc(sizeof(struct imxfb_info), &pdev->dev);
 	if (!info)
 		return -ENOMEM;
@@ -898,43 +890,34 @@ static int imxfb_probe(struct platform_device *pdev)
 	if (ret < 0)
 		goto failed_init;
 
-	if (pdata) {
-		if (!fb_mode)
-			fb_mode = pdata->mode[0].mode.name;
-
-		fbi->mode = pdata->mode;
-		fbi->num_modes = pdata->num_modes;
-	} else {
-		struct device_node *display_np;
-		fb_mode = NULL;
-
-		display_np = of_parse_phandle(pdev->dev.of_node, "display", 0);
-		if (!display_np) {
-			dev_err(&pdev->dev, "No display defined in devicetree\n");
-			ret = -EINVAL;
-			goto failed_of_parse;
-		}
+	fb_mode = NULL;
 
-		/*
-		 * imxfb does not support more modes, we choose only the native
-		 * mode.
-		 */
-		fbi->num_modes = 1;
-
-		fbi->mode = devm_kzalloc(&pdev->dev,
-				sizeof(struct imx_fb_videomode), GFP_KERNEL);
-		if (!fbi->mode) {
-			ret = -ENOMEM;
-			of_node_put(display_np);
-			goto failed_of_parse;
-		}
+	display_np = of_parse_phandle(pdev->dev.of_node, "display", 0);
+	if (!display_np) {
+		dev_err(&pdev->dev, "No display defined in devicetree\n");
+		ret = -EINVAL;
+		goto failed_of_parse;
+	}
 
-		ret = imxfb_of_read_mode(&pdev->dev, display_np, fbi->mode);
+	/*
+	 * imxfb does not support more modes, we choose only the native
+	 * mode.
+	 */
+	fbi->num_modes = 1;
+
+	fbi->mode = devm_kzalloc(&pdev->dev,
+			sizeof(struct imx_fb_videomode), GFP_KERNEL);
+	if (!fbi->mode) {
+		ret = -ENOMEM;
 		of_node_put(display_np);
-		if (ret)
-			goto failed_of_parse;
+		goto failed_of_parse;
 	}
 
+	ret = imxfb_of_read_mode(&pdev->dev, display_np, fbi->mode);
+	of_node_put(display_np);
+	if (ret)
+		goto failed_of_parse;
+
 	/* Calculate maximum bytes used per pixel. In most cases this should
 	 * be the same as m->bpp/8 */
 	m = &fbi->mode[0];
@@ -1001,13 +984,6 @@ static int imxfb_probe(struct platform_device *pdev)
 
 	info->fix.smem_start = fbi->map_dma;
 
-	if (pdata && pdata->init) {
-		ret = pdata->init(fbi->pdev);
-		if (ret)
-			goto failed_platform_init;
-	}
-
-
 	INIT_LIST_HEAD(&info->modelist);
 	for (i = 0; i < fbi->num_modes; i++)
 		fb_add_videomode(&fbi->mode[i].mode, &info->modelist);
@@ -1059,9 +1035,6 @@ failed_lcd:
 failed_register:
 	fb_dealloc_cmap(&info->cmap);
 failed_cmap:
-	if (pdata && pdata->exit)
-		pdata->exit(fbi->pdev);
-failed_platform_init:
 	dma_free_wc(&pdev->dev, fbi->map_size, info->screen_buffer,
 		    fbi->map_dma);
 failed_map:
@@ -1079,7 +1052,6 @@ failed_init:
 
 static int imxfb_remove(struct platform_device *pdev)
 {
-	struct imx_fb_platform_data *pdata;
 	struct fb_info *info = platform_get_drvdata(pdev);
 	struct imxfb_info *fbi = info->par;
 	struct resource *res;
@@ -1092,9 +1064,6 @@ static int imxfb_remove(struct platform_device *pdev)
 
 	unregister_framebuffer(info);
 	fb_dealloc_cmap(&info->cmap);
-	pdata = dev_get_platdata(&pdev->dev);
-	if (pdata && pdata->exit)
-		pdata->exit(fbi->pdev);
 	dma_free_wc(&pdev->dev, fbi->map_size, info->screen_buffer,
 		    fbi->map_dma);
 	iounmap(fbi->regs);
diff --git a/include/linux/platform_data/video-imxfb.h b/include/linux/platform_data/video-imxfb.h
index 02812651af7d..b80a156a6617 100644
--- a/include/linux/platform_data/video-imxfb.h
+++ b/include/linux/platform_data/video-imxfb.h
@@ -55,16 +55,4 @@ struct imx_fb_videomode {
 	unsigned char	bpp;
 };
 
-struct imx_fb_platform_data {
-	struct imx_fb_videomode *mode;
-	int		num_modes;
-
-	u_int		pwmr;
-	u_int		lscr1;
-	u_int		dmacr;
-
-	int (*init)(struct platform_device *);
-	void (*exit)(struct platform_device *);
-};
-
 #endif /* ifndef __MACH_IMXFB_H__ */
-- 
cgit 


From e2279cc92919f37b4af985cb28ae350bb3e62e71 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Sat, 23 Jul 2022 19:57:18 +0200
Subject: video: fbdev: imxfb: Drop unused symbols from header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The only file that includes <linux/platform_data/video-imxfb.h> is the
imxfb driver. Drop all symbols that are unused there.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Helge Deller <deller@gmx.de>
---
 include/linux/platform_data/video-imxfb.h | 35 -------------------------------
 1 file changed, 35 deletions(-)

(limited to 'include')

diff --git a/include/linux/platform_data/video-imxfb.h b/include/linux/platform_data/video-imxfb.h
index b80a156a6617..a16837c5e43c 100644
--- a/include/linux/platform_data/video-imxfb.h
+++ b/include/linux/platform_data/video-imxfb.h
@@ -8,45 +8,10 @@
 #include <linux/fb.h>
 
 #define PCR_TFT		(1 << 31)
-#define PCR_COLOR	(1 << 30)
-#define PCR_PBSIZ_1	(0 << 28)
-#define PCR_PBSIZ_2	(1 << 28)
-#define PCR_PBSIZ_4	(2 << 28)
-#define PCR_PBSIZ_8	(3 << 28)
-#define PCR_BPIX_1	(0 << 25)
-#define PCR_BPIX_2	(1 << 25)
-#define PCR_BPIX_4	(2 << 25)
 #define PCR_BPIX_8	(3 << 25)
 #define PCR_BPIX_12	(4 << 25)
 #define PCR_BPIX_16	(5 << 25)
 #define PCR_BPIX_18	(6 << 25)
-#define PCR_PIXPOL	(1 << 24)
-#define PCR_FLMPOL	(1 << 23)
-#define PCR_LPPOL	(1 << 22)
-#define PCR_CLKPOL	(1 << 21)
-#define PCR_OEPOL	(1 << 20)
-#define PCR_SCLKIDLE	(1 << 19)
-#define PCR_END_SEL	(1 << 18)
-#define PCR_END_BYTE_SWAP (1 << 17)
-#define PCR_REV_VS	(1 << 16)
-#define PCR_ACD_SEL	(1 << 15)
-#define PCR_ACD(x)	(((x) & 0x7f) << 8)
-#define PCR_SCLK_SEL	(1 << 7)
-#define PCR_SHARP	(1 << 6)
-#define PCR_PCD(x)	((x) & 0x3f)
-
-#define PWMR_CLS(x)	(((x) & 0x1ff) << 16)
-#define PWMR_LDMSK	(1 << 15)
-#define PWMR_SCR1	(1 << 10)
-#define PWMR_SCR0	(1 << 9)
-#define PWMR_CC_EN	(1 << 8)
-#define PWMR_PW(x)	((x) & 0xff)
-
-#define LSCR1_PS_RISE_DELAY(x)    (((x) & 0x7f) << 26)
-#define LSCR1_CLS_RISE_DELAY(x)   (((x) & 0x3f) << 16)
-#define LSCR1_REV_TOGGLE_DELAY(x) (((x) & 0xf) << 8)
-#define LSCR1_GRAY2(x)            (((x) & 0xf) << 4)
-#define LSCR1_GRAY1(x)            (((x) & 0xf))
 
 struct imx_fb_videomode {
 	struct fb_videomode mode;
-- 
cgit 


From ded77a74ee6bc3dea72ad41129823a812e4b64f3 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Sat, 23 Jul 2022 19:57:19 +0200
Subject: video: fbdev: imxfb: Fold <linux/platform_data/video-imxfb.h> into
 only user
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

No source file but the driver itself includes the header containing the
platform data definition. The last user is gone since commit
8485adf17a15 ("ARM: imx: Remove imx device directory").

Move the remaining symbols directly into the driver and remove the then
unused header file.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Helge Deller <deller@gmx.de>
---
 MAINTAINERS                               |  1 -
 drivers/video/fbdev/imxfb.c               | 13 ++++++++++++-
 include/linux/platform_data/video-imxfb.h | 23 -----------------------
 3 files changed, 12 insertions(+), 25 deletions(-)
 delete mode 100644 include/linux/platform_data/video-imxfb.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 651616ed8ae2..75c4d7ebef09 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8011,7 +8011,6 @@ L:	linux-fbdev@vger.kernel.org
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:	Maintained
 F:	drivers/video/fbdev/imxfb.c
-F:	include/linux/platform_data/video-imxfb.h
 
 FREESCALE IMX DDR PMU DRIVER
 M:	Frank Li <Frank.li@nxp.com>
diff --git a/drivers/video/fbdev/imxfb.c b/drivers/video/fbdev/imxfb.c
index 85a5bf5639d9..fa6a19c1ae9b 100644
--- a/drivers/video/fbdev/imxfb.c
+++ b/drivers/video/fbdev/imxfb.c
@@ -41,7 +41,18 @@
 #include <video/of_videomode.h>
 #include <video/videomode.h>
 
-#include <linux/platform_data/video-imxfb.h>
+#define PCR_TFT		(1 << 31)
+#define PCR_BPIX_8	(3 << 25)
+#define PCR_BPIX_12	(4 << 25)
+#define PCR_BPIX_16	(5 << 25)
+#define PCR_BPIX_18	(6 << 25)
+
+struct imx_fb_videomode {
+	struct fb_videomode mode;
+	u32 pcr;
+	bool aus_mode;
+	unsigned char	bpp;
+};
 
 /*
  * Complain if VAR is out of range.
diff --git a/include/linux/platform_data/video-imxfb.h b/include/linux/platform_data/video-imxfb.h
deleted file mode 100644
index a16837c5e43c..000000000000
--- a/include/linux/platform_data/video-imxfb.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * This structure describes the machine which we are running on.
- */
-#ifndef __MACH_IMXFB_H__
-#define __MACH_IMXFB_H__
-
-#include <linux/fb.h>
-
-#define PCR_TFT		(1 << 31)
-#define PCR_BPIX_8	(3 << 25)
-#define PCR_BPIX_12	(4 << 25)
-#define PCR_BPIX_16	(5 << 25)
-#define PCR_BPIX_18	(6 << 25)
-
-struct imx_fb_videomode {
-	struct fb_videomode mode;
-	u32 pcr;
-	bool aus_mode;
-	unsigned char	bpp;
-};
-
-#endif /* ifndef __MACH_IMXFB_H__ */
-- 
cgit 


From 451ef36bd229f8aa329cb2258a859b4c636d08ef Mon Sep 17 00:00:00 2001
From: Paul Chaignon <paul@isovalent.com>
Date: Mon, 25 Jul 2022 16:31:37 +0200
Subject: ip_tunnels: Add new flow flags field to ip_tunnel_key

This commit extends the ip_tunnel_key struct with a new field for the
flow flags, to pass them to the route lookups. This new field will be
populated and used in subsequent commits.

Signed-off-by: Paul Chaignon <paul@isovalent.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/f8bfd4983bd06685a59b1e3ba76ca27496f51ef3.1658759380.git.paul@isovalent.com
---
 include/net/ip_tunnels.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 20db95055db3..63fac94f9ace 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -54,6 +54,7 @@ struct ip_tunnel_key {
 	__be32			label;		/* Flow Label for IPv6 */
 	__be16			tp_src;
 	__be16			tp_dst;
+	__u8			flow_flags;
 };
 
 /* Flags for ip_tunnel_info mode. */
-- 
cgit 


From 42399301203e3cddef36cde457228f9247618313 Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe <logang@deltatee.com>
Date: Fri, 8 Jul 2022 10:50:52 -0600
Subject: lib/scatterlist: add flag for indicating P2PDMA segments in an SGL

Introduce a dma_flags field in struct scatterlist. These flags will be
used by dma_[un]map_sg_p2pdma() to determine when a given SGL segments
dma_address points to a PCI bus address. dma_unmap_sg_p2pdma() will need
to perform different cleanup when a segment is marked as a bus address.

The dma_flags field will fit in the existing padding on 64BIT systems
(assuming CONFIG_NEED_SG_DMA_LENGTH is also set).

The new bit will only be used when CONFIG_PCI_P2PDMA is set; this means
PCI P2PDMA will require CONFIG_64BIT. This should be acceptable as the
majority of P2PDMA use cases are restricted to newer root complexes and
roughly require the extra address space for memory BARs used in the
transactions.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/pci/Kconfig         |  5 ++++
 include/linux/scatterlist.h | 69 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)

(limited to 'include')

diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 133c73207782..5cc7cba1941f 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -164,6 +164,11 @@ config PCI_PASID
 config PCI_P2PDMA
 	bool "PCI peer-to-peer transfer support"
 	depends on ZONE_DEVICE
+	#
+	# The need for the scatterlist DMA bus address flag means PCI P2PDMA
+	# requires 64bit
+	#
+	depends on 64BIT
 	select GENERIC_ALLOCATOR
 	help
 	  Enableѕ drivers to do PCI peer-to-peer transactions to and from
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 7ff9d6386c12..375a5e90d86a 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -16,6 +16,9 @@ struct scatterlist {
 #ifdef CONFIG_NEED_SG_DMA_LENGTH
 	unsigned int	dma_length;
 #endif
+#ifdef CONFIG_PCI_P2PDMA
+	unsigned int    dma_flags;
+#endif
 };
 
 /*
@@ -245,6 +248,72 @@ static inline void sg_unmark_end(struct scatterlist *sg)
 	sg->page_link &= ~SG_END;
 }
 
+/*
+ * CONFGI_PCI_P2PDMA depends on CONFIG_64BIT which means there is 4 bytes
+ * in struct scatterlist (assuming also CONFIG_NEED_SG_DMA_LENGTH is set).
+ * Use this padding for DMA flags bits to indicate when a specific
+ * dma address is a bus address.
+ */
+#ifdef CONFIG_PCI_P2PDMA
+
+#define SG_DMA_BUS_ADDRESS (1 << 0)
+
+/**
+ * sg_dma_is_bus address - Return whether a given segment was marked
+ *			   as a bus address
+ * @sg:		 SG entry
+ *
+ * Description:
+ *   Returns true if sg_dma_mark_bus_address() has been called on
+ *   this segment.
+ **/
+static inline bool sg_is_dma_bus_address(struct scatterlist *sg)
+{
+	return sg->dma_flags & SG_DMA_BUS_ADDRESS;
+}
+
+/**
+ * sg_dma_mark_bus address - Mark the scatterlist entry as a bus address
+ * @sg:		 SG entry
+ *
+ * Description:
+ *   Marks the passed in sg entry to indicate that the dma_address is
+ *   a bus address and doesn't need to be unmapped. This should only be
+ *   used by dma_map_sg() implementations to mark bus addresses
+ *   so they can be properly cleaned up in dma_unmap_sg().
+ **/
+static inline void sg_dma_mark_bus_address(struct scatterlist *sg)
+{
+	sg->dma_flags |= SG_DMA_BUS_ADDRESS;
+}
+
+/**
+ * sg_unmark_bus_address - Unmark the scatterlist entry as a bus address
+ * @sg:		 SG entry
+ *
+ * Description:
+ *   Clears the bus address mark.
+ **/
+static inline void sg_dma_unmark_bus_address(struct scatterlist *sg)
+{
+	sg->dma_flags &= ~SG_DMA_BUS_ADDRESS;
+}
+
+#else
+
+static inline bool sg_is_dma_bus_address(struct scatterlist *sg)
+{
+	return false;
+}
+static inline void sg_dma_mark_bus_address(struct scatterlist *sg)
+{
+}
+static inline void sg_dma_unmark_bus_address(struct scatterlist *sg)
+{
+}
+
+#endif
+
 /**
  * sg_phys - Return physical address of an sg entry
  * @sg:	     SG entry
-- 
cgit 


From 5e180ff326b43bd3fb72892e1083b2707159aa6e Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe <logang@deltatee.com>
Date: Fri, 8 Jul 2022 10:50:54 -0600
Subject: PCI/P2PDMA: Introduce helpers for dma_map_sg implementations

Add pci_p2pdma_map_segment() as a helper for dma_map_sg()
implementations. It takes an scatterlist segment that must point to a
pci_p2pdma struct page and will map it if the mapping requires a bus
address.

The return value indicates whether the mapping required a bus address
or whether the caller still needs to map the segment normally. If the
segment should not be mapped, -EREMOTEIO is returned.

This helper uses a state structure to track the changes to the
pgmap across calls and avoid needing to lookup into the xarray for
every page.

The prototype for the helper is added to dma-map-ops.h as it is only
useful to dma map implementations and don't need to pollute the public
pci-p2pdma header.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/pci/p2pdma.c        | 44 +++++++++++++++++++++++++++++++------
 include/linux/dma-map-ops.h | 53 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 4e8bc457e29a..5d2538aa0778 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -10,6 +10,7 @@
 
 #define pr_fmt(fmt) "pci-p2pdma: " fmt
 #include <linux/ctype.h>
+#include <linux/dma-map-ops.h>
 #include <linux/pci-p2pdma.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -20,13 +21,6 @@
 #include <linux/seq_buf.h>
 #include <linux/xarray.h>
 
-enum pci_p2pdma_map_type {
-	PCI_P2PDMA_MAP_UNKNOWN = 0,
-	PCI_P2PDMA_MAP_NOT_SUPPORTED,
-	PCI_P2PDMA_MAP_BUS_ADDR,
-	PCI_P2PDMA_MAP_THRU_HOST_BRIDGE,
-};
-
 struct pci_p2pdma {
 	struct gen_pool *pool;
 	bool p2pmem_published;
@@ -944,6 +938,42 @@ void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
 }
 EXPORT_SYMBOL_GPL(pci_p2pdma_unmap_sg_attrs);
 
+/**
+ * pci_p2pdma_map_segment - map an sg segment determining the mapping type
+ * @state: State structure that should be declared outside of the for_each_sg()
+ *	loop and initialized to zero.
+ * @dev: DMA device that's doing the mapping operation
+ * @sg: scatterlist segment to map
+ *
+ * This is a helper to be used by non-IOMMU dma_map_sg() implementations where
+ * the sg segment is the same for the page_link and the dma_address.
+ *
+ * Attempt to map a single segment in an SGL with the PCI bus address.
+ * The segment must point to a PCI P2PDMA page and thus must be
+ * wrapped in a is_pci_p2pdma_page(sg_page(sg)) check.
+ *
+ * Returns the type of mapping used and maps the page if the type is
+ * PCI_P2PDMA_MAP_BUS_ADDR.
+ */
+enum pci_p2pdma_map_type
+pci_p2pdma_map_segment(struct pci_p2pdma_map_state *state, struct device *dev,
+		       struct scatterlist *sg)
+{
+	if (state->pgmap != sg_page(sg)->pgmap) {
+		state->pgmap = sg_page(sg)->pgmap;
+		state->map = pci_p2pdma_map_type(state->pgmap, dev);
+		state->bus_off = to_p2p_pgmap(state->pgmap)->bus_offset;
+	}
+
+	if (state->map == PCI_P2PDMA_MAP_BUS_ADDR) {
+		sg->dma_address = sg_phys(sg) + state->bus_off;
+		sg_dma_len(sg) = sg->length;
+		sg_dma_mark_bus_address(sg);
+	}
+
+	return state->map;
+}
+
 /**
  * pci_p2pdma_enable_store - parse a configfs/sysfs attribute store
  *		to enable p2pdma
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 98ceba6fa848..99cec59dbfcb 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -380,4 +380,57 @@ static inline void debug_dma_dump_mappings(struct device *dev)
 
 extern const struct dma_map_ops dma_dummy_ops;
 
+enum pci_p2pdma_map_type {
+	/*
+	 * PCI_P2PDMA_MAP_UNKNOWN: Used internally for indicating the mapping
+	 * type hasn't been calculated yet. Functions that return this enum
+	 * never return this value.
+	 */
+	PCI_P2PDMA_MAP_UNKNOWN = 0,
+
+	/*
+	 * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will
+	 * traverse the host bridge and the host bridge is not in the
+	 * allowlist. DMA Mapping routines should return an error when
+	 * this is returned.
+	 */
+	PCI_P2PDMA_MAP_NOT_SUPPORTED,
+
+	/*
+	 * PCI_P2PDMA_BUS_ADDR: Indicates that two devices can talk to
+	 * each other directly through a PCI switch and the transaction will
+	 * not traverse the host bridge. Such a mapping should program
+	 * the DMA engine with PCI bus addresses.
+	 */
+	PCI_P2PDMA_MAP_BUS_ADDR,
+
+	/*
+	 * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk
+	 * to each other, but the transaction traverses a host bridge on the
+	 * allowlist. In this case, a normal mapping either with CPU physical
+	 * addresses (in the case of dma-direct) or IOVA addresses (in the
+	 * case of IOMMUs) should be used to program the DMA engine.
+	 */
+	PCI_P2PDMA_MAP_THRU_HOST_BRIDGE,
+};
+
+struct pci_p2pdma_map_state {
+	struct dev_pagemap *pgmap;
+	int map;
+	u64 bus_off;
+};
+
+#ifdef CONFIG_PCI_P2PDMA
+enum pci_p2pdma_map_type
+pci_p2pdma_map_segment(struct pci_p2pdma_map_state *state, struct device *dev,
+		       struct scatterlist *sg);
+#else /* CONFIG_PCI_P2PDMA */
+static inline enum pci_p2pdma_map_type
+pci_p2pdma_map_segment(struct pci_p2pdma_map_state *state, struct device *dev,
+		       struct scatterlist *sg)
+{
+	return PCI_P2PDMA_MAP_NOT_SUPPORTED;
+}
+#endif /* CONFIG_PCI_P2PDMA */
+
 #endif /* _LINUX_DMA_MAP_OPS_H */
-- 
cgit 


From 159bf19270e80b5bc4b13aa88072dcb390b4d297 Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe <logang@deltatee.com>
Date: Fri, 8 Jul 2022 10:50:57 -0600
Subject: dma-mapping: add flags to dma_map_ops to indicate PCI P2PDMA support

Add a flags member to the dma_map_ops structure with one flag to
indicate support for PCI P2PDMA.

Also, add a helper to check if a device supports PCI P2PDMA.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-map-ops.h | 10 ++++++++++
 include/linux/dma-mapping.h |  5 +++++
 kernel/dma/mapping.c        | 18 ++++++++++++++++++
 3 files changed, 33 insertions(+)

(limited to 'include')

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 99cec59dbfcb..010df04358aa 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -11,7 +11,17 @@
 
 struct cma;
 
+/*
+ * Values for struct dma_map_ops.flags:
+ *
+ * DMA_F_PCI_P2PDMA_SUPPORTED: Indicates the dma_map_ops implementation can
+ * handle PCI P2PDMA pages in the map_sg/unmap_sg operation.
+ */
+#define DMA_F_PCI_P2PDMA_SUPPORTED     (1 << 0)
+
 struct dma_map_ops {
+	unsigned int flags;
+
 	void *(*alloc)(struct device *dev, size_t size,
 			dma_addr_t *dma_handle, gfp_t gfp,
 			unsigned long attrs);
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index fe3849434b2a..25a30906289d 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -140,6 +140,7 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
 		unsigned long attrs);
 bool dma_can_mmap(struct device *dev);
 int dma_supported(struct device *dev, u64 mask);
+bool dma_pci_p2pdma_supported(struct device *dev);
 int dma_set_mask(struct device *dev, u64 mask);
 int dma_set_coherent_mask(struct device *dev, u64 mask);
 u64 dma_get_required_mask(struct device *dev);
@@ -251,6 +252,10 @@ static inline int dma_supported(struct device *dev, u64 mask)
 {
 	return 0;
 }
+static inline bool dma_pci_p2pdma_supported(struct device *dev)
+{
+	return false;
+}
 static inline int dma_set_mask(struct device *dev, u64 mask)
 {
 	return -EIO;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 746d46825d08..a9ce5d728231 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -723,6 +723,24 @@ int dma_supported(struct device *dev, u64 mask)
 }
 EXPORT_SYMBOL(dma_supported);
 
+bool dma_pci_p2pdma_supported(struct device *dev)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	/* if ops is not set, dma direct will be used which supports P2PDMA */
+	if (!ops)
+		return true;
+
+	/*
+	 * Note: dma_ops_bypass is not checked here because P2PDMA should
+	 * not be used with dma mapping ops that do not have support even
+	 * if the specific device is bypassing them.
+	 */
+
+	return ops->flags & DMA_F_PCI_P2PDMA_SUPPORTED;
+}
+EXPORT_SYMBOL_GPL(dma_pci_p2pdma_supported);
+
 #ifdef CONFIG_ARCH_HAS_DMA_SET_MASK
 void arch_dma_set_mask(struct device *dev, u64 mask);
 #else
-- 
cgit 


From 495758bb1a72316094ee854dc51f667ad3db29b5 Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe <logang@deltatee.com>
Date: Fri, 8 Jul 2022 10:51:02 -0600
Subject: RDMA/core: introduce ib_dma_pci_p2p_dma_supported()

Introduce the helper function ib_dma_pci_p2p_dma_supported() to check
if a given ib_device can be used in P2PDMA transfers. This ensures
the ib_device is not using virt_dma and also that the underlying
dma_device supports P2PDMA.

Use the new helper in nvme-rdma to replace the existing check for
ib_uses_virt_dma(). Adding the dma_pci_p2pdma_supported() check allows
switching away from pci_p2pdma_[un]map_sg().

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/rdma.c |  2 +-
 include/rdma/ib_verbs.h    | 11 +++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 09fdcac87d17..4597bca43a6d 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -415,7 +415,7 @@ static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
 	if (ib_dma_mapping_error(ndev->device, r->send_sge.addr))
 		goto out_free_rsp;
 
-	if (!ib_uses_virt_dma(ndev->device))
+	if (ib_dma_pci_p2p_dma_supported(ndev->device))
 		r->req.p2p_client = &ndev->device->dev;
 	r->send_sge.length = sizeof(*r->req.cqe);
 	r->send_sge.lkey = ndev->pd->local_dma_lkey;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 9c6317cf80d5..523843d9ed6c 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -4013,6 +4013,17 @@ static inline bool ib_uses_virt_dma(struct ib_device *dev)
 	return IS_ENABLED(CONFIG_INFINIBAND_VIRT_DMA) && !dev->dma_device;
 }
 
+/*
+ * Check if a IB device's underlying DMA mapping supports P2PDMA transfers.
+ */
+static inline bool ib_dma_pci_p2p_dma_supported(struct ib_device *dev)
+{
+	if (ib_uses_virt_dma(dev))
+		return false;
+
+	return dma_pci_p2pdma_supported(dev->dma_device);
+}
+
 /**
  * ib_dma_mapping_error - check a DMA addr for error
  * @dev: The device for which the dma_addr was created
-- 
cgit 


From 0d06132fc84bd91379300768c75a19474e06d0c5 Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe <logang@deltatee.com>
Date: Fri, 8 Jul 2022 10:51:04 -0600
Subject: PCI/P2PDMA: Remove pci_p2pdma_[un]map_sg()

This interface is superseded by support in dma_map_sg() which now supports
heterogeneous scatterlists. There are no longer any users, so remove it.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/pci/p2pdma.c       | 66 ----------------------------------------------
 include/linux/pci-p2pdma.h | 27 -------------------
 2 files changed, 93 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 5d2538aa0778..4496a7c5c478 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -872,72 +872,6 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
 	return type;
 }
 
-static int __pci_p2pdma_map_sg(struct pci_p2pdma_pagemap *p2p_pgmap,
-		struct device *dev, struct scatterlist *sg, int nents)
-{
-	struct scatterlist *s;
-	int i;
-
-	for_each_sg(sg, s, nents, i) {
-		s->dma_address = sg_phys(s) + p2p_pgmap->bus_offset;
-		sg_dma_len(s) = s->length;
-	}
-
-	return nents;
-}
-
-/**
- * pci_p2pdma_map_sg_attrs - map a PCI peer-to-peer scatterlist for DMA
- * @dev: device doing the DMA request
- * @sg: scatter list to map
- * @nents: elements in the scatterlist
- * @dir: DMA direction
- * @attrs: DMA attributes passed to dma_map_sg() (if called)
- *
- * Scatterlists mapped with this function should be unmapped using
- * pci_p2pdma_unmap_sg_attrs().
- *
- * Returns the number of SG entries mapped or 0 on error.
- */
-int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
-		int nents, enum dma_data_direction dir, unsigned long attrs)
-{
-	struct pci_p2pdma_pagemap *p2p_pgmap =
-		to_p2p_pgmap(sg_page(sg)->pgmap);
-
-	switch (pci_p2pdma_map_type(sg_page(sg)->pgmap, dev)) {
-	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
-		return dma_map_sg_attrs(dev, sg, nents, dir, attrs);
-	case PCI_P2PDMA_MAP_BUS_ADDR:
-		return __pci_p2pdma_map_sg(p2p_pgmap, dev, sg, nents);
-	default:
-		/* Mapping is not Supported */
-		return 0;
-	}
-}
-EXPORT_SYMBOL_GPL(pci_p2pdma_map_sg_attrs);
-
-/**
- * pci_p2pdma_unmap_sg_attrs - unmap a PCI peer-to-peer scatterlist that was
- *	mapped with pci_p2pdma_map_sg()
- * @dev: device doing the DMA request
- * @sg: scatter list to map
- * @nents: number of elements returned by pci_p2pdma_map_sg()
- * @dir: DMA direction
- * @attrs: DMA attributes passed to dma_unmap_sg() (if called)
- */
-void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
-		int nents, enum dma_data_direction dir, unsigned long attrs)
-{
-	enum pci_p2pdma_map_type map_type;
-
-	map_type = pci_p2pdma_map_type(sg_page(sg)->pgmap, dev);
-
-	if (map_type == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
-		dma_unmap_sg_attrs(dev, sg, nents, dir, attrs);
-}
-EXPORT_SYMBOL_GPL(pci_p2pdma_unmap_sg_attrs);
-
 /**
  * pci_p2pdma_map_segment - map an sg segment determining the mapping type
  * @state: State structure that should be declared outside of the for_each_sg()
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
index 8318a97c9c61..2c07aa6b7665 100644
--- a/include/linux/pci-p2pdma.h
+++ b/include/linux/pci-p2pdma.h
@@ -30,10 +30,6 @@ struct scatterlist *pci_p2pmem_alloc_sgl(struct pci_dev *pdev,
 					 unsigned int *nents, u32 length);
 void pci_p2pmem_free_sgl(struct pci_dev *pdev, struct scatterlist *sgl);
 void pci_p2pmem_publish(struct pci_dev *pdev, bool publish);
-int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
-		int nents, enum dma_data_direction dir, unsigned long attrs);
-void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
-		int nents, enum dma_data_direction dir, unsigned long attrs);
 int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev,
 			    bool *use_p2pdma);
 ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
@@ -83,17 +79,6 @@ static inline void pci_p2pmem_free_sgl(struct pci_dev *pdev,
 static inline void pci_p2pmem_publish(struct pci_dev *pdev, bool publish)
 {
 }
-static inline int pci_p2pdma_map_sg_attrs(struct device *dev,
-		struct scatterlist *sg, int nents, enum dma_data_direction dir,
-		unsigned long attrs)
-{
-	return 0;
-}
-static inline void pci_p2pdma_unmap_sg_attrs(struct device *dev,
-		struct scatterlist *sg, int nents, enum dma_data_direction dir,
-		unsigned long attrs)
-{
-}
 static inline int pci_p2pdma_enable_store(const char *page,
 		struct pci_dev **p2p_dev, bool *use_p2pdma)
 {
@@ -119,16 +104,4 @@ static inline struct pci_dev *pci_p2pmem_find(struct device *client)
 	return pci_p2pmem_find_many(&client, 1);
 }
 
-static inline int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg,
-				    int nents, enum dma_data_direction dir)
-{
-	return pci_p2pdma_map_sg_attrs(dev, sg, nents, dir, 0);
-}
-
-static inline void pci_p2pdma_unmap_sg(struct device *dev,
-		struct scatterlist *sg, int nents, enum dma_data_direction dir)
-{
-	pci_p2pdma_unmap_sg_attrs(dev, sg, nents, dir, 0);
-}
-
 #endif /* _LINUX_PCI_P2P_H */
-- 
cgit 


From 019e442bb0d5e7f25ada8eb4252e22e62d17a95e Mon Sep 17 00:00:00 2001
From: Axe Yang <axe.yang@mediatek.com>
Date: Tue, 26 Jul 2022 14:28:41 +0800
Subject: mmc: core: Add support for SDIO wakeup interrupt

If wakeup-source flag is set in host dts node, parse EAI information
from SDIO CCCR interrupt externsion segment for in-band wakeup. If
async interrupt is supported by SDIO card then enable it and set
enable_async_irq flag in sdio_cccr structure to 1. The parse flow is
implemented in sdio_read_cccr().

Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Signed-off-by: Axe Yang <axe.yang@mediatek.com>
Link: https://lore.kernel.org/r/20220726062842.18846-3-axe.yang@mediatek.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/sdio.c  | 14 ++++++++++++++
 include/linux/mmc/card.h |  8 +++++++-
 include/linux/mmc/sdio.h |  5 +++++
 3 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c
index b589df1c35e0..0b682a31cd3e 100644
--- a/drivers/mmc/core/sdio.c
+++ b/drivers/mmc/core/sdio.c
@@ -226,6 +226,20 @@ static int sdio_read_cccr(struct mmc_card *card, u32 ocr)
 				card->sw_caps.sd3_drv_type |= SD_DRIVER_TYPE_C;
 			if (data & SDIO_DRIVE_SDTD)
 				card->sw_caps.sd3_drv_type |= SD_DRIVER_TYPE_D;
+
+			ret = mmc_io_rw_direct(card, 0, 0, SDIO_CCCR_INTERRUPT_EXT, 0, &data);
+			if (ret)
+				goto out;
+
+			if (data & SDIO_INTERRUPT_EXT_SAI) {
+				data |= SDIO_INTERRUPT_EXT_EAI;
+				ret = mmc_io_rw_direct(card, 1, 0, SDIO_CCCR_INTERRUPT_EXT,
+						       data, NULL);
+				if (ret)
+					goto out;
+
+				card->cccr.enable_async_irq = 1;
+			}
 		}
 
 		/* if no uhs mode ensure we check for high speed */
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 156a7b673a28..8a30de08e913 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -219,7 +219,8 @@ struct sdio_cccr {
 				wide_bus:1,
 				high_power:1,
 				high_speed:1,
-				disable_cd:1;
+				disable_cd:1,
+				enable_async_irq:1;
 };
 
 struct sdio_cis {
@@ -343,6 +344,11 @@ static inline bool mmc_large_sector(struct mmc_card *card)
 	return card->ext_csd.data_sector_size == 4096;
 }
 
+static inline int mmc_card_enable_async_irq(struct mmc_card *card)
+{
+	return card->cccr.enable_async_irq;
+}
+
 bool mmc_card_is_blockaddr(struct mmc_card *card);
 
 #define mmc_card_mmc(c)		((c)->type == MMC_TYPE_MMC)
diff --git a/include/linux/mmc/sdio.h b/include/linux/mmc/sdio.h
index 2a05d1ac4f0e..1ef400f28642 100644
--- a/include/linux/mmc/sdio.h
+++ b/include/linux/mmc/sdio.h
@@ -159,6 +159,11 @@
 #define  SDIO_DTSx_SET_TYPE_A	(1 << SDIO_DRIVE_DTSx_SHIFT)
 #define  SDIO_DTSx_SET_TYPE_C	(2 << SDIO_DRIVE_DTSx_SHIFT)
 #define  SDIO_DTSx_SET_TYPE_D	(3 << SDIO_DRIVE_DTSx_SHIFT)
+
+#define SDIO_CCCR_INTERRUPT_EXT	0x16
+#define SDIO_INTERRUPT_EXT_SAI	(1 << 0)
+#define SDIO_INTERRUPT_EXT_EAI	(1 << 1)
+
 /*
  * Function Basic Registers (FBR)
  */
-- 
cgit 


From 46126db9c86110e5fc1e369b9bb89735ddefdae4 Mon Sep 17 00:00:00 2001
From: Wojciech Drewek <wojciech.drewek@intel.com>
Date: Mon, 18 Jul 2022 14:18:10 +0200
Subject: flow_dissector: Add PPPoE dissectors

Allow to dissect PPPoE specific fields which are:
- session ID (16 bits)
- ppp protocol (16 bits)
- type (16 bits) - this is PPPoE ethertype, for now only
  ETH_P_PPP_SES is supported, possible ETH_P_PPP_DISC
  in the future

The goal is to make the following TC command possible:

  # tc filter add dev ens6f0 ingress prio 1 protocol ppp_ses \
      flower \
        pppoe_sid 12 \
        ppp_proto ip \
      action drop

Note that only PPPoE Session is supported.

Signed-off-by: Wojciech Drewek <wojciech.drewek@intel.com>
Acked-by: Guillaume Nault <gnault@redhat.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/ppp_defs.h     | 14 ++++++++++++
 include/net/flow_dissector.h | 13 +++++++++++
 net/core/flow_dissector.c    | 53 ++++++++++++++++++++++++++++++++++++++------
 3 files changed, 73 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/ppp_defs.h b/include/linux/ppp_defs.h
index 9d2b388fae1a..b7e57fdbd413 100644
--- a/include/linux/ppp_defs.h
+++ b/include/linux/ppp_defs.h
@@ -11,4 +11,18 @@
 #include <uapi/linux/ppp_defs.h>
 
 #define PPP_FCS(fcs, c) crc_ccitt_byte(fcs, c)
+
+/**
+ * ppp_proto_is_valid - checks if PPP protocol is valid
+ * @proto: PPP protocol
+ *
+ * Assumes proto is not compressed.
+ * Protocol is valid if the value is odd and the least significant bit of the
+ * most significant octet is 0 (see RFC 1661, section 2).
+ */
+static inline bool ppp_proto_is_valid(u16 proto)
+{
+	return !!((proto & 0x0101) == 0x0001);
+}
+
 #endif /* _PPP_DEFS_H_ */
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 0f9544a9bb9e..6c74812d64b2 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -277,6 +277,18 @@ struct flow_dissector_key_num_of_vlans {
 	u8 num_of_vlans;
 };
 
+/**
+ * struct flow_dissector_key_pppoe:
+ * @session_id: pppoe session id
+ * @ppp_proto: ppp protocol
+ * @type: pppoe eth type
+ */
+struct flow_dissector_key_pppoe {
+	__be16 session_id;
+	__be16 ppp_proto;
+	__be16 type;
+};
+
 enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */
 	FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */
@@ -307,6 +319,7 @@ enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_CT, /* struct flow_dissector_key_ct */
 	FLOW_DISSECTOR_KEY_HASH, /* struct flow_dissector_key_hash */
 	FLOW_DISSECTOR_KEY_NUM_OF_VLANS, /* struct flow_dissector_key_num_of_vlans */
+	FLOW_DISSECTOR_KEY_PPPOE, /* struct flow_dissector_key_pppoe */
 
 	FLOW_DISSECTOR_KEY_MAX,
 };
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 6aee04f75e3e..237d396b6e41 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -895,6 +895,11 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
 	return result == BPF_OK;
 }
 
+static bool is_pppoe_ses_hdr_valid(struct pppoe_hdr hdr)
+{
+	return hdr.ver == 1 && hdr.type == 1 && hdr.code == 0;
+}
+
 /**
  * __skb_flow_dissect - extract the flow_keys struct and return it
  * @net: associated network namespace, derived from @skb if NULL
@@ -1214,26 +1219,60 @@ proto_again:
 			struct pppoe_hdr hdr;
 			__be16 proto;
 		} *hdr, _hdr;
+		u16 ppp_proto;
+
 		hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
 		if (!hdr) {
 			fdret = FLOW_DISSECT_RET_OUT_BAD;
 			break;
 		}
 
-		nhoff += PPPOE_SES_HLEN;
-		switch (hdr->proto) {
-		case htons(PPP_IP):
+		if (!is_pppoe_ses_hdr_valid(hdr->hdr)) {
+			fdret = FLOW_DISSECT_RET_OUT_BAD;
+			break;
+		}
+
+		/* least significant bit of the most significant octet
+		 * indicates if protocol field was compressed
+		 */
+		ppp_proto = ntohs(hdr->proto);
+		if (ppp_proto & 0x0100) {
+			ppp_proto = ppp_proto >> 8;
+			nhoff += PPPOE_SES_HLEN - 1;
+		} else {
+			nhoff += PPPOE_SES_HLEN;
+		}
+
+		if (ppp_proto == PPP_IP) {
 			proto = htons(ETH_P_IP);
 			fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
-			break;
-		case htons(PPP_IPV6):
+		} else if (ppp_proto == PPP_IPV6) {
 			proto = htons(ETH_P_IPV6);
 			fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
-			break;
-		default:
+		} else if (ppp_proto == PPP_MPLS_UC) {
+			proto = htons(ETH_P_MPLS_UC);
+			fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+		} else if (ppp_proto == PPP_MPLS_MC) {
+			proto = htons(ETH_P_MPLS_MC);
+			fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+		} else if (ppp_proto_is_valid(ppp_proto)) {
+			fdret = FLOW_DISSECT_RET_OUT_GOOD;
+		} else {
 			fdret = FLOW_DISSECT_RET_OUT_BAD;
 			break;
 		}
+
+		if (dissector_uses_key(flow_dissector,
+				       FLOW_DISSECTOR_KEY_PPPOE)) {
+			struct flow_dissector_key_pppoe *key_pppoe;
+
+			key_pppoe = skb_flow_dissector_target(flow_dissector,
+							      FLOW_DISSECTOR_KEY_PPPOE,
+							      target_container);
+			key_pppoe->session_id = hdr->hdr.sid;
+			key_pppoe->ppp_proto = htons(ppp_proto);
+			key_pppoe->type = htons(ETH_P_PPP_SES);
+		}
 		break;
 	}
 	case htons(ETH_P_TIPC): {
-- 
cgit 


From 5008750eff5d4af8a3aed4a7567c4cfb2b3cb156 Mon Sep 17 00:00:00 2001
From: Wojciech Drewek <wojciech.drewek@intel.com>
Date: Mon, 18 Jul 2022 14:18:11 +0200
Subject: net/sched: flower: Add PPPoE filter

Add support for PPPoE specific fields for tc-flower.
Those fields can be provided only when protocol was set
to ETH_P_PPP_SES. Defines, dump, load and set are being done here.

Overwrite basic.n_proto only in case of PPP_IP and PPP_IPV6,
otherwise leave it as ETH_P_PPP_SES.

Signed-off-by: Wojciech Drewek <wojciech.drewek@intel.com>
Acked-by: Guillaume Nault <gnault@redhat.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/uapi/linux/pkt_cls.h |  3 +++
 net/sched/cls_flower.c       | 64 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 9a2ee1e39fad..c142c0f8ed8a 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -589,6 +589,9 @@ enum {
 
 	TCA_FLOWER_KEY_NUM_OF_VLANS,    /* u8 */
 
+	TCA_FLOWER_KEY_PPPOE_SID,	/* be16 */
+	TCA_FLOWER_KEY_PPP_PROTO,	/* be16 */
+
 	__TCA_FLOWER_MAX,
 };
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 1a1e34480b7e..041d63ff809a 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -16,6 +16,7 @@
 #include <linux/in6.h>
 #include <linux/ip.h>
 #include <linux/mpls.h>
+#include <linux/ppp_defs.h>
 
 #include <net/sch_generic.h>
 #include <net/pkt_cls.h>
@@ -67,6 +68,7 @@ struct fl_flow_key {
 	struct flow_dissector_key_ct ct;
 	struct flow_dissector_key_hash hash;
 	struct flow_dissector_key_num_of_vlans num_of_vlans;
+	struct flow_dissector_key_pppoe pppoe;
 } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
 
 struct fl_flow_mask_range {
@@ -708,6 +710,8 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
 	[TCA_FLOWER_KEY_HASH]		= { .type = NLA_U32 },
 	[TCA_FLOWER_KEY_HASH_MASK]	= { .type = NLA_U32 },
 	[TCA_FLOWER_KEY_NUM_OF_VLANS]	= { .type = NLA_U8 },
+	[TCA_FLOWER_KEY_PPPOE_SID]	= { .type = NLA_U16 },
+	[TCA_FLOWER_KEY_PPP_PROTO]	= { .type = NLA_U16 },
 
 };
 
@@ -1035,6 +1039,50 @@ static void fl_set_key_vlan(struct nlattr **tb,
 	}
 }
 
+static void fl_set_key_pppoe(struct nlattr **tb,
+			     struct flow_dissector_key_pppoe *key_val,
+			     struct flow_dissector_key_pppoe *key_mask,
+			     struct fl_flow_key *key,
+			     struct fl_flow_key *mask)
+{
+	/* key_val::type must be set to ETH_P_PPP_SES
+	 * because ETH_P_PPP_SES was stored in basic.n_proto
+	 * which might get overwritten by ppp_proto
+	 * or might be set to 0, the role of key_val::type
+	 * is simmilar to vlan_key::tpid
+	 */
+	key_val->type = htons(ETH_P_PPP_SES);
+	key_mask->type = cpu_to_be16(~0);
+
+	if (tb[TCA_FLOWER_KEY_PPPOE_SID]) {
+		key_val->session_id =
+			nla_get_be16(tb[TCA_FLOWER_KEY_PPPOE_SID]);
+		key_mask->session_id = cpu_to_be16(~0);
+	}
+	if (tb[TCA_FLOWER_KEY_PPP_PROTO]) {
+		key_val->ppp_proto =
+			nla_get_be16(tb[TCA_FLOWER_KEY_PPP_PROTO]);
+		key_mask->ppp_proto = cpu_to_be16(~0);
+
+		if (key_val->ppp_proto == htons(PPP_IP)) {
+			key->basic.n_proto = htons(ETH_P_IP);
+			mask->basic.n_proto = cpu_to_be16(~0);
+		} else if (key_val->ppp_proto == htons(PPP_IPV6)) {
+			key->basic.n_proto = htons(ETH_P_IPV6);
+			mask->basic.n_proto = cpu_to_be16(~0);
+		} else if (key_val->ppp_proto == htons(PPP_MPLS_UC)) {
+			key->basic.n_proto = htons(ETH_P_MPLS_UC);
+			mask->basic.n_proto = cpu_to_be16(~0);
+		} else if (key_val->ppp_proto == htons(PPP_MPLS_MC)) {
+			key->basic.n_proto = htons(ETH_P_MPLS_MC);
+			mask->basic.n_proto = cpu_to_be16(~0);
+		}
+	} else {
+		key->basic.n_proto = 0;
+		mask->basic.n_proto = cpu_to_be16(0);
+	}
+}
+
 static void fl_set_key_flag(u32 flower_key, u32 flower_mask,
 			    u32 *dissector_key, u32 *dissector_mask,
 			    u32 flower_flag_bit, u32 dissector_flag_bit)
@@ -1645,6 +1693,9 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
 		}
 	}
 
+	if (key->basic.n_proto == htons(ETH_P_PPP_SES))
+		fl_set_key_pppoe(tb, &key->pppoe, &mask->pppoe, key, mask);
+
 	if (key->basic.n_proto == htons(ETH_P_IP) ||
 	    key->basic.n_proto == htons(ETH_P_IPV6)) {
 		fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO,
@@ -1917,6 +1968,8 @@ static void fl_init_dissector(struct flow_dissector *dissector,
 			     FLOW_DISSECTOR_KEY_HASH, hash);
 	FL_KEY_SET_IF_MASKED(mask, keys, cnt,
 			     FLOW_DISSECTOR_KEY_NUM_OF_VLANS, num_of_vlans);
+	FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+			     FLOW_DISSECTOR_KEY_PPPOE, pppoe);
 
 	skb_flow_dissector_init(dissector, keys, cnt);
 }
@@ -3045,6 +3098,17 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net,
 	    fl_dump_key_ip(skb, false, &key->ip, &mask->ip)))
 		goto nla_put_failure;
 
+	if (mask->pppoe.session_id) {
+		if (nla_put_be16(skb, TCA_FLOWER_KEY_PPPOE_SID,
+				 key->pppoe.session_id))
+			goto nla_put_failure;
+	}
+	if (mask->basic.n_proto && mask->pppoe.ppp_proto) {
+		if (nla_put_be16(skb, TCA_FLOWER_KEY_PPP_PROTO,
+				 key->pppoe.ppp_proto))
+			goto nla_put_failure;
+	}
+
 	if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
 	    (fl_dump_key_val(skb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC,
 			     &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK,
-- 
cgit 


From 6a21b0856daaf9b7f63225f5b449ddee170c6f0a Mon Sep 17 00:00:00 2001
From: Wojciech Drewek <wojciech.drewek@intel.com>
Date: Mon, 18 Jul 2022 14:18:12 +0200
Subject: flow_offload: Introduce flow_match_pppoe

Allow to offload PPPoE filters by adding flow_rule_match_pppoe.
Drivers can extract PPPoE specific fields from now on.

Signed-off-by: Wojciech Drewek <wojciech.drewek@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/net/flow_offload.h | 6 ++++++
 net/core/flow_offload.c    | 7 +++++++
 2 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index a8d8512b7059..2a9a9e42e7fd 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -76,6 +76,10 @@ struct flow_match_ct {
 	struct flow_dissector_key_ct *key, *mask;
 };
 
+struct flow_match_pppoe {
+	struct flow_dissector_key_pppoe *key, *mask;
+};
+
 struct flow_rule;
 
 void flow_rule_match_meta(const struct flow_rule *rule,
@@ -122,6 +126,8 @@ void flow_rule_match_enc_opts(const struct flow_rule *rule,
 			      struct flow_match_enc_opts *out);
 void flow_rule_match_ct(const struct flow_rule *rule,
 			struct flow_match_ct *out);
+void flow_rule_match_pppoe(const struct flow_rule *rule,
+			   struct flow_match_pppoe *out);
 
 enum flow_action_id {
 	FLOW_ACTION_ACCEPT		= 0,
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index 0d3075d3c8fb..8cfb63528d18 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -230,6 +230,13 @@ void flow_rule_match_ct(const struct flow_rule *rule,
 }
 EXPORT_SYMBOL(flow_rule_match_ct);
 
+void flow_rule_match_pppoe(const struct flow_rule *rule,
+			   struct flow_match_pppoe *out)
+{
+	FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PPPOE, out);
+}
+EXPORT_SYMBOL(flow_rule_match_pppoe);
+
 struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb,
 					  void *cb_ident, void *cb_priv,
 					  void (*release)(void *cb_priv))
-- 
cgit 


From 4f4179fcf420873002035cf1941d844c9e0e7cb3 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 21 Jul 2022 19:41:10 +0200
Subject: ACPI: CPPC: Do not prevent CPPC from working in the future

There is a problem with the current revision checks in
is_cppc_supported() that they essentially prevent the CPPC support
from working if a new _CPC package format revision being a proper
superset of the v3 and only causing _CPC to return a package with more
entries (while retaining the types and meaning of the entries defined by
the v3) is introduced in the future and used by the platform firmware.

In that case, as long as the number of entries in the _CPC return
package is at least CPPC_V3_NUM_ENT, it should be perfectly fine to
use the v3 support code and disregard the additional package entries
added by the new package format revision.

For this reason, drop is_cppc_supported() altogether, put the revision
checks directly into acpi_cppc_processor_probe() so they are easier to
follow and rework them to take the case mentioned above into account.

Fixes: 4773e77cdc9b ("ACPI / CPPC: Add support for CPPC v3")
Cc: 4.18+ <stable@vger.kernel.org> # 4.18+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/cppc_acpi.c | 54 +++++++++++++++++++++---------------------------
 include/acpi/cppc_acpi.h |  2 +-
 2 files changed, 25 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c
index 3c6d4ef87be0..1e15a9f25ae9 100644
--- a/drivers/acpi/cppc_acpi.c
+++ b/drivers/acpi/cppc_acpi.c
@@ -618,33 +618,6 @@ static int pcc_data_alloc(int pcc_ss_id)
 	return 0;
 }
 
-/* Check if CPPC revision + num_ent combination is supported */
-static bool is_cppc_supported(int revision, int num_ent)
-{
-	int expected_num_ent;
-
-	switch (revision) {
-	case CPPC_V2_REV:
-		expected_num_ent = CPPC_V2_NUM_ENT;
-		break;
-	case CPPC_V3_REV:
-		expected_num_ent = CPPC_V3_NUM_ENT;
-		break;
-	default:
-		pr_debug("Firmware exports unsupported CPPC revision: %d\n",
-			revision);
-		return false;
-	}
-
-	if (expected_num_ent != num_ent) {
-		pr_debug("Firmware exports %d entries. Expected: %d for CPPC rev:%d\n",
-			num_ent, expected_num_ent, revision);
-		return false;
-	}
-
-	return true;
-}
-
 /*
  * An example CPC table looks like the following.
  *
@@ -733,7 +706,6 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr)
 			 cpc_obj->type, pr->id);
 		goto out_free;
 	}
-	cpc_ptr->num_entries = num_ent;
 
 	/* Second entry should be revision. */
 	cpc_obj = &out_obj->package.elements[1];
@@ -744,10 +716,32 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr)
 			 cpc_obj->type, pr->id);
 		goto out_free;
 	}
-	cpc_ptr->version = cpc_rev;
 
-	if (!is_cppc_supported(cpc_rev, num_ent))
+	if (cpc_rev < CPPC_V2_REV) {
+		pr_debug("Unsupported _CPC Revision (%d) for CPU:%d\n", cpc_rev,
+			 pr->id);
+		goto out_free;
+	}
+
+	/*
+	 * Disregard _CPC if the number of entries in the return pachage is not
+	 * as expected, but support future revisions being proper supersets of
+	 * the v3 and only causing more entries to be returned by _CPC.
+	 */
+	if ((cpc_rev == CPPC_V2_REV && num_ent != CPPC_V2_NUM_ENT) ||
+	    (cpc_rev == CPPC_V3_REV && num_ent != CPPC_V3_NUM_ENT) ||
+	    (cpc_rev > CPPC_V3_REV && num_ent <= CPPC_V3_NUM_ENT)) {
+		pr_debug("Unexpected number of _CPC return package entries (%d) for CPU:%d\n",
+			 num_ent, pr->id);
 		goto out_free;
+	}
+	if (cpc_rev > CPPC_V3_REV) {
+		num_ent = CPPC_V3_NUM_ENT;
+		cpc_rev = CPPC_V3_REV;
+	}
+
+	cpc_ptr->num_entries = num_ent;
+	cpc_ptr->version = cpc_rev;
 
 	/* Iterate through remaining entries in _CPC */
 	for (i = 2; i < num_ent; i++) {
diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h
index d389bab54241..f73d357ecdf5 100644
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -17,7 +17,7 @@
 #include <acpi/pcc.h>
 #include <acpi/processor.h>
 
-/* Support CPPCv2 and CPPCv3  */
+/* CPPCv2 and CPPCv3 support */
 #define CPPC_V2_REV	2
 #define CPPC_V3_REV	3
 #define CPPC_V2_NUM_ENT	21
-- 
cgit 


From 04ad63f086d1a9649b8b082748cbc7a570ade461 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 11 Jan 2022 08:06:40 -0800
Subject: cxl/region: Introduce cxl_pmem_region objects

The LIBNVDIMM subsystem is a platform agnostic representation of system
NVDIMM / persistent memory resources. To date, the CXL subsystem's
interaction with LIBNVDIMM has been to register an nvdimm-bridge device
and cxl_nvdimm objects to proxy CXL capabilities into existing LIBNVDIMM
subsystem mechanics.

With regions the approach is the same. Create a new cxl_pmem_region
object to proxy CXL region details into a LIBNVDIMM definition. With
this enabling LIBNVDIMM can partition CXL persistent memory regions with
legacy namespace labels. A follow-on patch will add CXL region label and
CXL namespace label support to persist region configurations across
driver reload / system-reset events.

Co-developed-by: Ben Widawsky <bwidawsk@kernel.org>
Signed-off-by: Ben Widawsky <bwidawsk@kernel.org>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://lore.kernel.org/r/165784340111.1758207.3036498385188290968.stgit@dwillia2-xfh.jf.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/core/core.h      |   3 +
 drivers/cxl/core/pmem.c      |   4 +-
 drivers/cxl/core/port.c      |   2 +
 drivers/cxl/core/region.c    | 142 +++++++++++++++++++++++++-
 drivers/cxl/cxl.h            |  36 ++++++-
 drivers/cxl/pmem.c           | 238 ++++++++++++++++++++++++++++++++++++++++++-
 drivers/nvdimm/region_devs.c |  28 +++--
 include/linux/libnvdimm.h    |   5 +
 8 files changed, 446 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 391aadf9e7fa..1d8f87be283f 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -13,11 +13,13 @@ extern struct attribute_group cxl_base_attribute_group;
 extern struct device_attribute dev_attr_create_pmem_region;
 extern struct device_attribute dev_attr_delete_region;
 extern struct device_attribute dev_attr_region;
+extern const struct device_type cxl_pmem_region_type;
 extern const struct device_type cxl_region_type;
 void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled);
 #define CXL_REGION_ATTR(x) (&dev_attr_##x.attr)
 #define CXL_REGION_TYPE(x) (&cxl_region_type)
 #define SET_CXL_REGION_ATTR(x) (&dev_attr_##x.attr),
+#define CXL_PMEM_REGION_TYPE(x) (&cxl_pmem_region_type)
 int cxl_region_init(void);
 void cxl_region_exit(void);
 #else
@@ -34,6 +36,7 @@ static inline void cxl_region_exit(void)
 #define CXL_REGION_ATTR(x) NULL
 #define CXL_REGION_TYPE(x) NULL
 #define SET_CXL_REGION_ATTR(x)
+#define CXL_PMEM_REGION_TYPE(x) NULL
 #endif
 
 struct cxl_send_command;
diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c
index bec7cfb54ebf..1d12a8206444 100644
--- a/drivers/cxl/core/pmem.c
+++ b/drivers/cxl/core/pmem.c
@@ -62,9 +62,9 @@ static int match_nvdimm_bridge(struct device *dev, void *data)
 	return is_cxl_nvdimm_bridge(dev);
 }
 
-struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_nvdimm *cxl_nvd)
+struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct device *start)
 {
-	struct cxl_port *port = find_cxl_root(&cxl_nvd->dev);
+	struct cxl_port *port = find_cxl_root(start);
 	struct device *dev;
 
 	if (!port)
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index ba66ce25de30..3d2d0119cc3d 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -44,6 +44,8 @@ static int cxl_device_id(struct device *dev)
 		return CXL_DEVICE_NVDIMM_BRIDGE;
 	if (dev->type == &cxl_nvdimm_type)
 		return CXL_DEVICE_NVDIMM;
+	if (dev->type == CXL_PMEM_REGION_TYPE())
+		return CXL_DEVICE_PMEM_REGION;
 	if (is_cxl_port(dev)) {
 		if (is_cxl_root(to_cxl_port(dev)))
 			return CXL_DEVICE_ROOT;
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index bf4e7f5499c1..dc71ec457608 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -1650,6 +1650,139 @@ static ssize_t delete_region_store(struct device *dev,
 }
 DEVICE_ATTR_WO(delete_region);
 
+static void cxl_pmem_region_release(struct device *dev)
+{
+	struct cxl_pmem_region *cxlr_pmem = to_cxl_pmem_region(dev);
+	int i;
+
+	for (i = 0; i < cxlr_pmem->nr_mappings; i++) {
+		struct cxl_memdev *cxlmd = cxlr_pmem->mapping[i].cxlmd;
+
+		put_device(&cxlmd->dev);
+	}
+
+	kfree(cxlr_pmem);
+}
+
+static const struct attribute_group *cxl_pmem_region_attribute_groups[] = {
+	&cxl_base_attribute_group,
+	NULL,
+};
+
+const struct device_type cxl_pmem_region_type = {
+	.name = "cxl_pmem_region",
+	.release = cxl_pmem_region_release,
+	.groups = cxl_pmem_region_attribute_groups,
+};
+
+bool is_cxl_pmem_region(struct device *dev)
+{
+	return dev->type == &cxl_pmem_region_type;
+}
+EXPORT_SYMBOL_NS_GPL(is_cxl_pmem_region, CXL);
+
+struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev)
+{
+	if (dev_WARN_ONCE(dev, !is_cxl_pmem_region(dev),
+			  "not a cxl_pmem_region device\n"))
+		return NULL;
+	return container_of(dev, struct cxl_pmem_region, dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_pmem_region, CXL);
+
+static struct lock_class_key cxl_pmem_region_key;
+
+static struct cxl_pmem_region *cxl_pmem_region_alloc(struct cxl_region *cxlr)
+{
+	struct cxl_region_params *p = &cxlr->params;
+	struct cxl_pmem_region *cxlr_pmem;
+	struct device *dev;
+	int i;
+
+	down_read(&cxl_region_rwsem);
+	if (p->state != CXL_CONFIG_COMMIT) {
+		cxlr_pmem = ERR_PTR(-ENXIO);
+		goto out;
+	}
+
+	cxlr_pmem = kzalloc(struct_size(cxlr_pmem, mapping, p->nr_targets),
+			    GFP_KERNEL);
+	if (!cxlr_pmem) {
+		cxlr_pmem = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	cxlr_pmem->hpa_range.start = p->res->start;
+	cxlr_pmem->hpa_range.end = p->res->end;
+
+	/* Snapshot the region configuration underneath the cxl_region_rwsem */
+	cxlr_pmem->nr_mappings = p->nr_targets;
+	for (i = 0; i < p->nr_targets; i++) {
+		struct cxl_endpoint_decoder *cxled = p->targets[i];
+		struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+		struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i];
+
+		m->cxlmd = cxlmd;
+		get_device(&cxlmd->dev);
+		m->start = cxled->dpa_res->start;
+		m->size = resource_size(cxled->dpa_res);
+		m->position = i;
+	}
+
+	dev = &cxlr_pmem->dev;
+	cxlr_pmem->cxlr = cxlr;
+	device_initialize(dev);
+	lockdep_set_class(&dev->mutex, &cxl_pmem_region_key);
+	device_set_pm_not_required(dev);
+	dev->parent = &cxlr->dev;
+	dev->bus = &cxl_bus_type;
+	dev->type = &cxl_pmem_region_type;
+out:
+	up_read(&cxl_region_rwsem);
+
+	return cxlr_pmem;
+}
+
+static void cxlr_pmem_unregister(void *dev)
+{
+	device_unregister(dev);
+}
+
+/**
+ * devm_cxl_add_pmem_region() - add a cxl_region-to-nd_region bridge
+ * @cxlr: parent CXL region for this pmem region bridge device
+ *
+ * Return: 0 on success negative error code on failure.
+ */
+static int devm_cxl_add_pmem_region(struct cxl_region *cxlr)
+{
+	struct cxl_pmem_region *cxlr_pmem;
+	struct device *dev;
+	int rc;
+
+	cxlr_pmem = cxl_pmem_region_alloc(cxlr);
+	if (IS_ERR(cxlr_pmem))
+		return PTR_ERR(cxlr_pmem);
+
+	dev = &cxlr_pmem->dev;
+	rc = dev_set_name(dev, "pmem_region%d", cxlr->id);
+	if (rc)
+		goto err;
+
+	rc = device_add(dev);
+	if (rc)
+		goto err;
+
+	dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent),
+		dev_name(dev));
+
+	return devm_add_action_or_reset(&cxlr->dev, cxlr_pmem_unregister, dev);
+
+err:
+	put_device(dev);
+	return rc;
+}
+
 static int cxl_region_probe(struct device *dev)
 {
 	struct cxl_region *cxlr = to_cxl_region(dev);
@@ -1673,7 +1806,14 @@ static int cxl_region_probe(struct device *dev)
 	 */
 	up_read(&cxl_region_rwsem);
 
-	return rc;
+	switch (cxlr->mode) {
+	case CXL_DECODER_PMEM:
+		return devm_cxl_add_pmem_region(cxlr);
+	default:
+		dev_dbg(&cxlr->dev, "unsupported region mode: %d\n",
+			cxlr->mode);
+		return -ENXIO;
+	}
 }
 
 static struct cxl_driver cxl_region_driver = {
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 46d3f173a700..75674400cc8d 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -419,6 +419,25 @@ struct cxl_nvdimm {
 	struct device dev;
 	struct cxl_memdev *cxlmd;
 	struct cxl_nvdimm_bridge *bridge;
+	struct cxl_pmem_region *region;
+};
+
+struct cxl_pmem_region_mapping {
+	struct cxl_memdev *cxlmd;
+	struct cxl_nvdimm *cxl_nvd;
+	u64 start;
+	u64 size;
+	int position;
+};
+
+struct cxl_pmem_region {
+	struct device dev;
+	struct cxl_region *cxlr;
+	struct nd_region *nd_region;
+	struct cxl_nvdimm_bridge *bridge;
+	struct range hpa_range;
+	int nr_mappings;
+	struct cxl_pmem_region_mapping mapping[];
 };
 
 /**
@@ -601,6 +620,7 @@ void cxl_driver_unregister(struct cxl_driver *cxl_drv);
 #define CXL_DEVICE_ROOT			4
 #define CXL_DEVICE_MEMORY_EXPANDER	5
 #define CXL_DEVICE_REGION		6
+#define CXL_DEVICE_PMEM_REGION		7
 
 #define MODULE_ALIAS_CXL(type) MODULE_ALIAS("cxl:t" __stringify(type) "*")
 #define CXL_MODALIAS_FMT "cxl:t%d"
@@ -612,7 +632,21 @@ struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev);
 bool is_cxl_nvdimm(struct device *dev);
 bool is_cxl_nvdimm_bridge(struct device *dev);
 int devm_cxl_add_nvdimm(struct device *host, struct cxl_memdev *cxlmd);
-struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_nvdimm *cxl_nvd);
+struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct device *dev);
+
+#ifdef CONFIG_CXL_REGION
+bool is_cxl_pmem_region(struct device *dev);
+struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev);
+#else
+static inline bool is_cxl_pmem_region(struct device *dev)
+{
+	return false;
+}
+static inline struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev)
+{
+	return NULL;
+}
+#endif
 
 /*
  * Unit test builds overrides this to __weak, find the 'strong' version
diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c
index b271f6e90b91..e69f99a0747d 100644
--- a/drivers/cxl/pmem.c
+++ b/drivers/cxl/pmem.c
@@ -7,6 +7,7 @@
 #include <linux/ndctl.h>
 #include <linux/async.h>
 #include <linux/slab.h>
+#include <linux/nd.h>
 #include "cxlmem.h"
 #include "cxl.h"
 
@@ -27,6 +28,19 @@ static void clear_exclusive(void *cxlds)
 static void unregister_nvdimm(void *nvdimm)
 {
 	struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+	struct cxl_nvdimm_bridge *cxl_nvb = cxl_nvd->bridge;
+	struct cxl_pmem_region *cxlr_pmem;
+
+	device_lock(&cxl_nvb->dev);
+	cxlr_pmem = cxl_nvd->region;
+	dev_set_drvdata(&cxl_nvd->dev, NULL);
+	cxl_nvd->region = NULL;
+	device_unlock(&cxl_nvb->dev);
+
+	if (cxlr_pmem) {
+		device_release_driver(&cxlr_pmem->dev);
+		put_device(&cxlr_pmem->dev);
+	}
 
 	nvdimm_delete(nvdimm);
 	cxl_nvd->bridge = NULL;
@@ -42,7 +56,7 @@ static int cxl_nvdimm_probe(struct device *dev)
 	struct nvdimm *nvdimm;
 	int rc;
 
-	cxl_nvb = cxl_find_nvdimm_bridge(cxl_nvd);
+	cxl_nvb = cxl_find_nvdimm_bridge(dev);
 	if (!cxl_nvb)
 		return -ENXIO;
 
@@ -223,6 +237,21 @@ static int cxl_nvdimm_release_driver(struct device *dev, void *cxl_nvb)
 	return 0;
 }
 
+static int cxl_pmem_region_release_driver(struct device *dev, void *cxl_nvb)
+{
+	struct cxl_pmem_region *cxlr_pmem;
+
+	if (!is_cxl_pmem_region(dev))
+		return 0;
+
+	cxlr_pmem = to_cxl_pmem_region(dev);
+	if (cxlr_pmem->bridge != cxl_nvb)
+		return 0;
+
+	device_release_driver(dev);
+	return 0;
+}
+
 static void offline_nvdimm_bus(struct cxl_nvdimm_bridge *cxl_nvb,
 			       struct nvdimm_bus *nvdimm_bus)
 {
@@ -234,6 +263,8 @@ static void offline_nvdimm_bus(struct cxl_nvdimm_bridge *cxl_nvb,
 	 * nvdimm_bus_unregister() rips the nvdimm objects out from
 	 * underneath them.
 	 */
+	bus_for_each_dev(&cxl_bus_type, NULL, cxl_nvb,
+			 cxl_pmem_region_release_driver);
 	bus_for_each_dev(&cxl_bus_type, NULL, cxl_nvb,
 			 cxl_nvdimm_release_driver);
 	nvdimm_bus_unregister(nvdimm_bus);
@@ -328,6 +359,203 @@ static struct cxl_driver cxl_nvdimm_bridge_driver = {
 	.id = CXL_DEVICE_NVDIMM_BRIDGE,
 };
 
+static int match_cxl_nvdimm(struct device *dev, void *data)
+{
+	return is_cxl_nvdimm(dev);
+}
+
+static void unregister_nvdimm_region(void *nd_region)
+{
+	struct cxl_nvdimm_bridge *cxl_nvb;
+	struct cxl_pmem_region *cxlr_pmem;
+	int i;
+
+	cxlr_pmem = nd_region_provider_data(nd_region);
+	cxl_nvb = cxlr_pmem->bridge;
+	device_lock(&cxl_nvb->dev);
+	for (i = 0; i < cxlr_pmem->nr_mappings; i++) {
+		struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i];
+		struct cxl_nvdimm *cxl_nvd = m->cxl_nvd;
+
+		if (cxl_nvd->region) {
+			put_device(&cxlr_pmem->dev);
+			cxl_nvd->region = NULL;
+		}
+	}
+	device_unlock(&cxl_nvb->dev);
+
+	nvdimm_region_delete(nd_region);
+}
+
+static void cxlr_pmem_remove_resource(void *res)
+{
+	remove_resource(res);
+}
+
+struct cxl_pmem_region_info {
+	u64 offset;
+	u64 serial;
+};
+
+static int cxl_pmem_region_probe(struct device *dev)
+{
+	struct nd_mapping_desc mappings[CXL_DECODER_MAX_INTERLEAVE];
+	struct cxl_pmem_region *cxlr_pmem = to_cxl_pmem_region(dev);
+	struct cxl_region *cxlr = cxlr_pmem->cxlr;
+	struct cxl_pmem_region_info *info = NULL;
+	struct cxl_nvdimm_bridge *cxl_nvb;
+	struct nd_interleave_set *nd_set;
+	struct nd_region_desc ndr_desc;
+	struct cxl_nvdimm *cxl_nvd;
+	struct nvdimm *nvdimm;
+	struct resource *res;
+	int rc, i = 0;
+
+	cxl_nvb = cxl_find_nvdimm_bridge(&cxlr_pmem->mapping[0].cxlmd->dev);
+	if (!cxl_nvb) {
+		dev_dbg(dev, "bridge not found\n");
+		return -ENXIO;
+	}
+	cxlr_pmem->bridge = cxl_nvb;
+
+	device_lock(&cxl_nvb->dev);
+	if (!cxl_nvb->nvdimm_bus) {
+		dev_dbg(dev, "nvdimm bus not found\n");
+		rc = -ENXIO;
+		goto err;
+	}
+
+	memset(&mappings, 0, sizeof(mappings));
+	memset(&ndr_desc, 0, sizeof(ndr_desc));
+
+	res = devm_kzalloc(dev, sizeof(*res), GFP_KERNEL);
+	if (!res) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	res->name = "Persistent Memory";
+	res->start = cxlr_pmem->hpa_range.start;
+	res->end = cxlr_pmem->hpa_range.end;
+	res->flags = IORESOURCE_MEM;
+	res->desc = IORES_DESC_PERSISTENT_MEMORY;
+
+	rc = insert_resource(&iomem_resource, res);
+	if (rc)
+		goto err;
+
+	rc = devm_add_action_or_reset(dev, cxlr_pmem_remove_resource, res);
+	if (rc)
+		goto err;
+
+	ndr_desc.res = res;
+	ndr_desc.provider_data = cxlr_pmem;
+
+	ndr_desc.numa_node = memory_add_physaddr_to_nid(res->start);
+	ndr_desc.target_node = phys_to_target_node(res->start);
+	if (ndr_desc.target_node == NUMA_NO_NODE) {
+		ndr_desc.target_node = ndr_desc.numa_node;
+		dev_dbg(&cxlr->dev, "changing target node from %d to %d",
+			NUMA_NO_NODE, ndr_desc.target_node);
+	}
+
+	nd_set = devm_kzalloc(dev, sizeof(*nd_set), GFP_KERNEL);
+	if (!nd_set) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	ndr_desc.memregion = cxlr->id;
+	set_bit(ND_REGION_CXL, &ndr_desc.flags);
+	set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc.flags);
+
+	info = kmalloc_array(cxlr_pmem->nr_mappings, sizeof(*info), GFP_KERNEL);
+	if (!info) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	for (i = 0; i < cxlr_pmem->nr_mappings; i++) {
+		struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i];
+		struct cxl_memdev *cxlmd = m->cxlmd;
+		struct cxl_dev_state *cxlds = cxlmd->cxlds;
+		struct device *d;
+
+		d = device_find_child(&cxlmd->dev, NULL, match_cxl_nvdimm);
+		if (!d) {
+			dev_dbg(dev, "[%d]: %s: no cxl_nvdimm found\n", i,
+				dev_name(&cxlmd->dev));
+			rc = -ENODEV;
+			goto err;
+		}
+
+		/* safe to drop ref now with bridge lock held */
+		put_device(d);
+
+		cxl_nvd = to_cxl_nvdimm(d);
+		nvdimm = dev_get_drvdata(&cxl_nvd->dev);
+		if (!nvdimm) {
+			dev_dbg(dev, "[%d]: %s: no nvdimm found\n", i,
+				dev_name(&cxlmd->dev));
+			rc = -ENODEV;
+			goto err;
+		}
+		cxl_nvd->region = cxlr_pmem;
+		get_device(&cxlr_pmem->dev);
+		m->cxl_nvd = cxl_nvd;
+		mappings[i] = (struct nd_mapping_desc) {
+			.nvdimm = nvdimm,
+			.start = m->start,
+			.size = m->size,
+			.position = i,
+		};
+		info[i].offset = m->start;
+		info[i].serial = cxlds->serial;
+	}
+	ndr_desc.num_mappings = cxlr_pmem->nr_mappings;
+	ndr_desc.mapping = mappings;
+
+	/*
+	 * TODO enable CXL labels which skip the need for 'interleave-set cookie'
+	 */
+	nd_set->cookie1 =
+		nd_fletcher64(info, sizeof(*info) * cxlr_pmem->nr_mappings, 0);
+	nd_set->cookie2 = nd_set->cookie1;
+	ndr_desc.nd_set = nd_set;
+
+	cxlr_pmem->nd_region =
+		nvdimm_pmem_region_create(cxl_nvb->nvdimm_bus, &ndr_desc);
+	if (IS_ERR(cxlr_pmem->nd_region)) {
+		rc = PTR_ERR(cxlr_pmem->nd_region);
+		goto err;
+	}
+
+	rc = devm_add_action_or_reset(dev, unregister_nvdimm_region,
+				      cxlr_pmem->nd_region);
+out:
+	kfree(info);
+	device_unlock(&cxl_nvb->dev);
+	put_device(&cxl_nvb->dev);
+
+	return rc;
+
+err:
+	dev_dbg(dev, "failed to create nvdimm region\n");
+	for (i--; i >= 0; i--) {
+		nvdimm = mappings[i].nvdimm;
+		cxl_nvd = nvdimm_provider_data(nvdimm);
+		put_device(&cxl_nvd->region->dev);
+		cxl_nvd->region = NULL;
+	}
+	goto out;
+}
+
+static struct cxl_driver cxl_pmem_region_driver = {
+	.name = "cxl_pmem_region",
+	.probe = cxl_pmem_region_probe,
+	.id = CXL_DEVICE_PMEM_REGION,
+};
+
 /*
  * Return all bridges to the CXL_NVB_NEW state to invalidate any
  * ->state_work referring to the now destroyed cxl_pmem_wq.
@@ -372,8 +600,14 @@ static __init int cxl_pmem_init(void)
 	if (rc)
 		goto err_nvdimm;
 
+	rc = cxl_driver_register(&cxl_pmem_region_driver);
+	if (rc)
+		goto err_region;
+
 	return 0;
 
+err_region:
+	cxl_driver_unregister(&cxl_nvdimm_driver);
 err_nvdimm:
 	cxl_driver_unregister(&cxl_nvdimm_bridge_driver);
 err_bridge:
@@ -383,6 +617,7 @@ err_bridge:
 
 static __exit void cxl_pmem_exit(void)
 {
+	cxl_driver_unregister(&cxl_pmem_region_driver);
 	cxl_driver_unregister(&cxl_nvdimm_driver);
 	cxl_driver_unregister(&cxl_nvdimm_bridge_driver);
 	destroy_cxl_pmem_wq();
@@ -394,3 +629,4 @@ module_exit(cxl_pmem_exit);
 MODULE_IMPORT_NS(CXL);
 MODULE_ALIAS_CXL(CXL_DEVICE_NVDIMM_BRIDGE);
 MODULE_ALIAS_CXL(CXL_DEVICE_NVDIMM);
+MODULE_ALIAS_CXL(CXL_DEVICE_PMEM_REGION);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index d976260eca7a..473a71bbd9c9 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -133,7 +133,8 @@ static void nd_region_release(struct device *dev)
 		put_device(&nvdimm->dev);
 	}
 	free_percpu(nd_region->lane);
-	memregion_free(nd_region->id);
+	if (!test_bit(ND_REGION_CXL, &nd_region->flags))
+		memregion_free(nd_region->id);
 	kfree(nd_region);
 }
 
@@ -982,9 +983,14 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 
 	if (!nd_region)
 		return NULL;
-	nd_region->id = memregion_alloc(GFP_KERNEL);
-	if (nd_region->id < 0)
-		goto err_id;
+	/* CXL pre-assigns memregion ids before creating nvdimm regions */
+	if (test_bit(ND_REGION_CXL, &ndr_desc->flags)) {
+		nd_region->id = ndr_desc->memregion;
+	} else {
+		nd_region->id = memregion_alloc(GFP_KERNEL);
+		if (nd_region->id < 0)
+			goto err_id;
+	}
 
 	nd_region->lane = alloc_percpu(struct nd_percpu_lane);
 	if (!nd_region->lane)
@@ -1043,9 +1049,10 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 
 	return nd_region;
 
- err_percpu:
-	memregion_free(nd_region->id);
- err_id:
+err_percpu:
+	if (!test_bit(ND_REGION_CXL, &ndr_desc->flags))
+		memregion_free(nd_region->id);
+err_id:
 	kfree(nd_region);
 	return NULL;
 }
@@ -1068,6 +1075,13 @@ struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus,
 }
 EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create);
 
+void nvdimm_region_delete(struct nd_region *nd_region)
+{
+	if (nd_region)
+		nd_device_unregister(&nd_region->dev, ND_SYNC);
+}
+EXPORT_SYMBOL_GPL(nvdimm_region_delete);
+
 int nvdimm_flush(struct nd_region *nd_region, struct bio *bio)
 {
 	int rc = 0;
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 0d61e07b6827..c74acfa1a3fe 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -59,6 +59,9 @@ enum {
 	/* Platform provides asynchronous flush mechanism */
 	ND_REGION_ASYNC = 3,
 
+	/* Region was created by CXL subsystem */
+	ND_REGION_CXL = 4,
+
 	/* mark newly adjusted resources as requiring a label update */
 	DPA_RESOURCE_ADJUSTED = 1 << 0,
 };
@@ -122,6 +125,7 @@ struct nd_region_desc {
 	int numa_node;
 	int target_node;
 	unsigned long flags;
+	int memregion;
 	struct device_node *of_node;
 	int (*flush)(struct nd_region *nd_region, struct bio *bio);
 };
@@ -259,6 +263,7 @@ static inline struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus,
 			cmd_mask, num_flush, flush_wpq, NULL, NULL, NULL);
 }
 void nvdimm_delete(struct nvdimm *nvdimm);
+void nvdimm_region_delete(struct nd_region *nd_region);
 
 const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd);
 const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd);
-- 
cgit 


From d0be8347c623e0ac4202a1d4e0373882821f56b0 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Thu, 21 Jul 2022 09:10:50 -0700
Subject: Bluetooth: L2CAP: Fix use-after-free caused by l2cap_chan_put

This fixes the following trace which is caused by hci_rx_work starting up
*after* the final channel reference has been put() during sock_close() but
*before* the references to the channel have been destroyed, so instead
the code now rely on kref_get_unless_zero/l2cap_chan_hold_unless_zero to
prevent referencing a channel that is about to be destroyed.

  refcount_t: increment on 0; use-after-free.
  BUG: KASAN: use-after-free in refcount_dec_and_test+0x20/0xd0
  Read of size 4 at addr ffffffc114f5bf18 by task kworker/u17:14/705

  CPU: 4 PID: 705 Comm: kworker/u17:14 Tainted: G S      W
  4.14.234-00003-g1fb6d0bd49a4-dirty #28
  Hardware name: Qualcomm Technologies, Inc. SM8150 V2 PM8150
  Google Inc. MSM sm8150 Flame DVT (DT)
  Workqueue: hci0 hci_rx_work
  Call trace:
   dump_backtrace+0x0/0x378
   show_stack+0x20/0x2c
   dump_stack+0x124/0x148
   print_address_description+0x80/0x2e8
   __kasan_report+0x168/0x188
   kasan_report+0x10/0x18
   __asan_load4+0x84/0x8c
   refcount_dec_and_test+0x20/0xd0
   l2cap_chan_put+0x48/0x12c
   l2cap_recv_frame+0x4770/0x6550
   l2cap_recv_acldata+0x44c/0x7a4
   hci_acldata_packet+0x100/0x188
   hci_rx_work+0x178/0x23c
   process_one_work+0x35c/0x95c
   worker_thread+0x4cc/0x960
   kthread+0x1a8/0x1c4
   ret_from_fork+0x10/0x18

Cc: stable@kernel.org
Reported-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Tested-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/l2cap.h |  1 +
 net/bluetooth/l2cap_core.c    | 61 ++++++++++++++++++++++++++++++++++---------
 2 files changed, 49 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h
index 3c4f550e5a8b..2f766e3437ce 100644
--- a/include/net/bluetooth/l2cap.h
+++ b/include/net/bluetooth/l2cap.h
@@ -847,6 +847,7 @@ enum {
 };
 
 void l2cap_chan_hold(struct l2cap_chan *c);
+struct l2cap_chan *l2cap_chan_hold_unless_zero(struct l2cap_chan *c);
 void l2cap_chan_put(struct l2cap_chan *c);
 
 static inline void l2cap_chan_lock(struct l2cap_chan *chan)
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index ae78490ecd3d..52668662ae8d 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -111,7 +111,8 @@ static struct l2cap_chan *__l2cap_get_chan_by_scid(struct l2cap_conn *conn,
 }
 
 /* Find channel with given SCID.
- * Returns locked channel. */
+ * Returns a reference locked channel.
+ */
 static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn,
 						 u16 cid)
 {
@@ -119,15 +120,19 @@ static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn,
 
 	mutex_lock(&conn->chan_lock);
 	c = __l2cap_get_chan_by_scid(conn, cid);
-	if (c)
-		l2cap_chan_lock(c);
+	if (c) {
+		/* Only lock if chan reference is not 0 */
+		c = l2cap_chan_hold_unless_zero(c);
+		if (c)
+			l2cap_chan_lock(c);
+	}
 	mutex_unlock(&conn->chan_lock);
 
 	return c;
 }
 
 /* Find channel with given DCID.
- * Returns locked channel.
+ * Returns a reference locked channel.
  */
 static struct l2cap_chan *l2cap_get_chan_by_dcid(struct l2cap_conn *conn,
 						 u16 cid)
@@ -136,8 +141,12 @@ static struct l2cap_chan *l2cap_get_chan_by_dcid(struct l2cap_conn *conn,
 
 	mutex_lock(&conn->chan_lock);
 	c = __l2cap_get_chan_by_dcid(conn, cid);
-	if (c)
-		l2cap_chan_lock(c);
+	if (c) {
+		/* Only lock if chan reference is not 0 */
+		c = l2cap_chan_hold_unless_zero(c);
+		if (c)
+			l2cap_chan_lock(c);
+	}
 	mutex_unlock(&conn->chan_lock);
 
 	return c;
@@ -162,8 +171,12 @@ static struct l2cap_chan *l2cap_get_chan_by_ident(struct l2cap_conn *conn,
 
 	mutex_lock(&conn->chan_lock);
 	c = __l2cap_get_chan_by_ident(conn, ident);
-	if (c)
-		l2cap_chan_lock(c);
+	if (c) {
+		/* Only lock if chan reference is not 0 */
+		c = l2cap_chan_hold_unless_zero(c);
+		if (c)
+			l2cap_chan_lock(c);
+	}
 	mutex_unlock(&conn->chan_lock);
 
 	return c;
@@ -497,6 +510,16 @@ void l2cap_chan_hold(struct l2cap_chan *c)
 	kref_get(&c->kref);
 }
 
+struct l2cap_chan *l2cap_chan_hold_unless_zero(struct l2cap_chan *c)
+{
+	BT_DBG("chan %p orig refcnt %u", c, kref_read(&c->kref));
+
+	if (!kref_get_unless_zero(&c->kref))
+		return NULL;
+
+	return c;
+}
+
 void l2cap_chan_put(struct l2cap_chan *c)
 {
 	BT_DBG("chan %p orig refcnt %u", c, kref_read(&c->kref));
@@ -1968,7 +1991,10 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm,
 			src_match = !bacmp(&c->src, src);
 			dst_match = !bacmp(&c->dst, dst);
 			if (src_match && dst_match) {
-				l2cap_chan_hold(c);
+				c = l2cap_chan_hold_unless_zero(c);
+				if (!c)
+					continue;
+
 				read_unlock(&chan_list_lock);
 				return c;
 			}
@@ -1983,7 +2009,7 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm,
 	}
 
 	if (c1)
-		l2cap_chan_hold(c1);
+		c1 = l2cap_chan_hold_unless_zero(c1);
 
 	read_unlock(&chan_list_lock);
 
@@ -4463,6 +4489,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn,
 
 unlock:
 	l2cap_chan_unlock(chan);
+	l2cap_chan_put(chan);
 	return err;
 }
 
@@ -4577,6 +4604,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn,
 
 done:
 	l2cap_chan_unlock(chan);
+	l2cap_chan_put(chan);
 	return err;
 }
 
@@ -5304,6 +5332,7 @@ send_move_response:
 	l2cap_send_move_chan_rsp(chan, result);
 
 	l2cap_chan_unlock(chan);
+	l2cap_chan_put(chan);
 
 	return 0;
 }
@@ -5396,6 +5425,7 @@ static void l2cap_move_continue(struct l2cap_conn *conn, u16 icid, u16 result)
 	}
 
 	l2cap_chan_unlock(chan);
+	l2cap_chan_put(chan);
 }
 
 static void l2cap_move_fail(struct l2cap_conn *conn, u8 ident, u16 icid,
@@ -5425,6 +5455,7 @@ static void l2cap_move_fail(struct l2cap_conn *conn, u8 ident, u16 icid,
 	l2cap_send_move_chan_cfm(chan, L2CAP_MC_UNCONFIRMED);
 
 	l2cap_chan_unlock(chan);
+	l2cap_chan_put(chan);
 }
 
 static int l2cap_move_channel_rsp(struct l2cap_conn *conn,
@@ -5488,6 +5519,7 @@ static int l2cap_move_channel_confirm(struct l2cap_conn *conn,
 	l2cap_send_move_chan_cfm_rsp(conn, cmd->ident, icid);
 
 	l2cap_chan_unlock(chan);
+	l2cap_chan_put(chan);
 
 	return 0;
 }
@@ -5523,6 +5555,7 @@ static inline int l2cap_move_channel_confirm_rsp(struct l2cap_conn *conn,
 	}
 
 	l2cap_chan_unlock(chan);
+	l2cap_chan_put(chan);
 
 	return 0;
 }
@@ -5895,12 +5928,11 @@ static inline int l2cap_le_credits(struct l2cap_conn *conn,
 	if (credits > max_credits) {
 		BT_ERR("LE credits overflow");
 		l2cap_send_disconn_req(chan, ECONNRESET);
-		l2cap_chan_unlock(chan);
 
 		/* Return 0 so that we don't trigger an unnecessary
 		 * command reject packet.
 		 */
-		return 0;
+		goto unlock;
 	}
 
 	chan->tx_credits += credits;
@@ -5911,7 +5943,9 @@ static inline int l2cap_le_credits(struct l2cap_conn *conn,
 	if (chan->tx_credits)
 		chan->ops->resume(chan);
 
+unlock:
 	l2cap_chan_unlock(chan);
+	l2cap_chan_put(chan);
 
 	return 0;
 }
@@ -7597,6 +7631,7 @@ drop:
 
 done:
 	l2cap_chan_unlock(chan);
+	l2cap_chan_put(chan);
 }
 
 static void l2cap_conless_channel(struct l2cap_conn *conn, __le16 psm,
@@ -8085,7 +8120,7 @@ static struct l2cap_chan *l2cap_global_fixed_chan(struct l2cap_chan *c,
 		if (src_type != c->src_type)
 			continue;
 
-		l2cap_chan_hold(c);
+		c = l2cap_chan_hold_unless_zero(c);
 		read_unlock(&chan_list_lock);
 		return c;
 	}
-- 
cgit 


From 7b2d9a1a50ec3bedf067fe234a4a71196c89e826 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Mon, 25 Jul 2022 10:29:16 +0200
Subject: net: devlink: introduce nested devlink entity for line card

For the purpose of exposing device info and allow flash update which is
going to be implemented in follow-up patches, introduce a possibility
for a line card to expose relation to nested devlink entity. The nested
devlink entity represents the line card.

Example:

$ devlink lc show pci/0000:01:00.0 lc 1
pci/0000:01:00.0:
  lc 1 state active type 16x100G nested_devlink auxiliary/mlxsw_core.lc.0
    supported_types:
       16x100G
$ devlink dev show auxiliary/mlxsw_core.lc.0
auxiliary/mlxsw_core.lc.0

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/devlink.h        |  2 ++
 include/uapi/linux/devlink.h |  2 ++
 net/core/devlink.c           | 42 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 46 insertions(+)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 780744b550b8..5bd3fac12e9e 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1580,6 +1580,8 @@ void devlink_linecard_provision_clear(struct devlink_linecard *linecard);
 void devlink_linecard_provision_fail(struct devlink_linecard *linecard);
 void devlink_linecard_activate(struct devlink_linecard *linecard);
 void devlink_linecard_deactivate(struct devlink_linecard *linecard);
+void devlink_linecard_nested_dl_set(struct devlink_linecard *linecard,
+				    struct devlink *nested_devlink);
 int devl_sb_register(struct devlink *devlink, unsigned int sb_index,
 		     u32 size, u16 ingress_pools_count,
 		     u16 egress_pools_count, u16 ingress_tc_count,
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index b3d40a5d72ff..541321695f52 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -576,6 +576,8 @@ enum devlink_attr {
 	DEVLINK_ATTR_LINECARD_TYPE,		/* string */
 	DEVLINK_ATTR_LINECARD_SUPPORTED_TYPES,	/* nested */
 
+	DEVLINK_ATTR_NESTED_DEVLINK,		/* nested */
+
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 865232a1455f..698b2d6e0ec7 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -89,6 +89,7 @@ struct devlink_linecard {
 	const char *type;
 	struct devlink_linecard_type *types;
 	unsigned int types_count;
+	struct devlink *nested_devlink;
 };
 
 /**
@@ -856,6 +857,24 @@ static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink)
 	return 0;
 }
 
+static int devlink_nl_put_nested_handle(struct sk_buff *msg, struct devlink *devlink)
+{
+	struct nlattr *nested_attr;
+
+	nested_attr = nla_nest_start(msg, DEVLINK_ATTR_NESTED_DEVLINK);
+	if (!nested_attr)
+		return -EMSGSIZE;
+	if (devlink_nl_put_handle(msg, devlink))
+		goto nla_put_failure;
+
+	nla_nest_end(msg, nested_attr);
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(msg, nested_attr);
+	return -EMSGSIZE;
+}
+
 struct devlink_reload_combination {
 	enum devlink_reload_action action;
 	enum devlink_reload_limit limit;
@@ -2135,6 +2154,10 @@ static int devlink_nl_linecard_fill(struct sk_buff *msg,
 		nla_nest_end(msg, attr);
 	}
 
+	if (linecard->nested_devlink &&
+	    devlink_nl_put_nested_handle(msg, linecard->nested_devlink))
+		goto nla_put_failure;
+
 	genlmsg_end(msg, hdr);
 	return 0;
 
@@ -10255,6 +10278,7 @@ EXPORT_SYMBOL_GPL(devlink_linecard_provision_set);
 void devlink_linecard_provision_clear(struct devlink_linecard *linecard)
 {
 	mutex_lock(&linecard->state_lock);
+	WARN_ON(linecard->nested_devlink);
 	linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
 	linecard->type = NULL;
 	devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
@@ -10273,6 +10297,7 @@ EXPORT_SYMBOL_GPL(devlink_linecard_provision_clear);
 void devlink_linecard_provision_fail(struct devlink_linecard *linecard)
 {
 	mutex_lock(&linecard->state_lock);
+	WARN_ON(linecard->nested_devlink);
 	linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING_FAILED;
 	devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
 	mutex_unlock(&linecard->state_lock);
@@ -10320,6 +10345,23 @@ void devlink_linecard_deactivate(struct devlink_linecard *linecard)
 }
 EXPORT_SYMBOL_GPL(devlink_linecard_deactivate);
 
+/**
+ *	devlink_linecard_nested_dl_set - Attach/detach nested devlink
+ *					 instance to linecard.
+ *
+ *	@linecard: devlink linecard
+ *	@nested_devlink: devlink instance to attach or NULL to detach
+ */
+void devlink_linecard_nested_dl_set(struct devlink_linecard *linecard,
+				    struct devlink *nested_devlink)
+{
+	mutex_lock(&linecard->state_lock);
+	linecard->nested_devlink = nested_devlink;
+	devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+	mutex_unlock(&linecard->state_lock);
+}
+EXPORT_SYMBOL_GPL(devlink_linecard_nested_dl_set);
+
 int devl_sb_register(struct devlink *devlink, unsigned int sb_index,
 		     u32 size, u16 ingress_pools_count,
 		     u16 egress_pools_count, u16 ingress_tc_count,
-- 
cgit 


From a11821495fd4d9b5c97945db929e02c473b7a5d9 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Sun, 24 Jul 2022 20:46:28 +0200
Subject: i2c: extend documentation about retvals of master_xfer functions

It was stated how the error codes should be. It was not stated what the
regular case should return. Add this.

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 include/linux/i2c.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index fbda5ada2afc..8eab5017bff3 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -537,7 +537,8 @@ i2c_register_board_info(int busnum, struct i2c_board_info const *info,
  *
  * The return codes from the ``master_xfer{_atomic}`` fields should indicate the
  * type of error code that occurred during the transfer, as documented in the
- * Kernel Documentation file Documentation/i2c/fault-codes.rst.
+ * Kernel Documentation file Documentation/i2c/fault-codes.rst. Otherwise, the
+ * number of messages executed should be returned.
  */
 struct i2c_algorithm {
 	/*
-- 
cgit 


From 3f92a64e44e5823a975cbf2c9f05ab1893fd4cb7 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 22 Jul 2022 16:50:31 -0700
Subject: tcp: allow tls to decrypt directly from the tcp rcv queue

Expose TCP rx queue accessor and cleanup, so that TLS can
decrypt directly from the TCP queue. The expectation
is that the caller can access the skb returned from
tcp_recv_skb() and up to inq bytes worth of data (some
of which may be in ->next skbs) and then call
tcp_read_done() when data has been consumed.
The socket lock must be held continuously across
those two operations.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tcp.h |  2 ++
 net/ipv4/tcp.c    | 42 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 43 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index f9e7c85ea829..b8620be32f8f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -673,6 +673,8 @@ void tcp_get_info(struct sock *, struct tcp_info *);
 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 		  sk_read_actor_t recv_actor);
 int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
+struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off);
+void tcp_read_done(struct sock *sk, size_t len);
 
 void tcp_initialize_rcv_mss(struct sock *sk);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ba2bdc811374..dc7cc3ce6a53 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1635,7 +1635,7 @@ static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
 	__kfree_skb(skb);
 }
 
-static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
+struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
 {
 	struct sk_buff *skb;
 	u32 offset;
@@ -1658,6 +1658,7 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
 	}
 	return NULL;
 }
+EXPORT_SYMBOL(tcp_recv_skb);
 
 /*
  * This routine provides an alternative to tcp_recvmsg() for routines
@@ -1788,6 +1789,45 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 }
 EXPORT_SYMBOL(tcp_read_skb);
 
+void tcp_read_done(struct sock *sk, size_t len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 seq = tp->copied_seq;
+	struct sk_buff *skb;
+	size_t left;
+	u32 offset;
+
+	if (sk->sk_state == TCP_LISTEN)
+		return;
+
+	left = len;
+	while (left && (skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
+		int used;
+
+		used = min_t(size_t, skb->len - offset, left);
+		seq += used;
+		left -= used;
+
+		if (skb->len > offset + used)
+			break;
+
+		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
+			tcp_eat_recv_skb(sk, skb);
+			++seq;
+			break;
+		}
+		tcp_eat_recv_skb(sk, skb);
+	}
+	WRITE_ONCE(tp->copied_seq, seq);
+
+	tcp_rcv_space_adjust(sk);
+
+	/* Clean up data we have read: This will do ACK frames. */
+	if (left != len)
+		tcp_cleanup_rbuf(sk, len - left);
+}
+EXPORT_SYMBOL(tcp_read_done);
+
 int tcp_peek_len(struct socket *sock)
 {
 	return tcp_inq(sock->sk);
-- 
cgit 


From 84c61fe1a75b4255df1e1e7c054c9e6d048da417 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 22 Jul 2022 16:50:33 -0700
Subject: tls: rx: do not use the standard strparser

TLS is a relatively poor fit for strparser. We pause the input
every time a message is received, wait for a read which will
decrypt the message, start the parser, repeat. strparser is
built to delineate the messages, wrap them in individual skbs
and let them float off into the stack or a different socket.
TLS wants the data pages and nothing else. There's no need
for TLS to keep cloning (and occasionally skb_unclone()'ing)
the TCP rx queue.

This patch uses a pre-allocated skb and attaches the skbs
from the TCP rx queue to it as frags. TLS is careful never
to modify the input skb without CoW'ing / detaching it first.

Since we call TCP rx queue cleanup directly we also get back
the benefit of skb deferred free.

Overall this results in a 6% gain in my benchmarks.

Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tls.h  |  19 ++-
 net/tls/tls.h      |  24 ++-
 net/tls/tls_main.c |  20 ++-
 net/tls/tls_strp.c | 484 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 net/tls/tls_sw.c   |  80 ++++-----
 5 files changed, 558 insertions(+), 69 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 181c496b01b8..abb050b0df83 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -108,18 +108,33 @@ struct tls_sw_context_tx {
 	unsigned long tx_bitmask;
 };
 
+struct tls_strparser {
+	struct sock *sk;
+
+	u32 mark : 8;
+	u32 stopped : 1;
+	u32 copy_mode : 1;
+	u32 msg_ready : 1;
+
+	struct strp_msg stm;
+
+	struct sk_buff *anchor;
+	struct work_struct work;
+};
+
 struct tls_sw_context_rx {
 	struct crypto_aead *aead_recv;
 	struct crypto_wait async_wait;
-	struct strparser strp;
 	struct sk_buff_head rx_list;	/* list of decrypted 'data' records */
 	void (*saved_data_ready)(struct sock *sk);
 
-	struct sk_buff *recv_pkt;
 	u8 reader_present;
 	u8 async_capable:1;
 	u8 zc_capable:1;
 	u8 reader_contended:1;
+
+	struct tls_strparser strp;
+
 	atomic_t decrypt_pending;
 	/* protect crypto_wait with decrypt_pending*/
 	spinlock_t decrypt_compl_lock;
diff --git a/net/tls/tls.h b/net/tls/tls.h
index 154a3773e785..0e840a0c3437 100644
--- a/net/tls/tls.h
+++ b/net/tls/tls.h
@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 2016 Tom Herbert <tom@herbertland.com>
  * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved.
  * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved.
  *
@@ -127,10 +128,24 @@ int tls_sw_fallback_init(struct sock *sk,
 			 struct tls_offload_context_tx *offload_ctx,
 			 struct tls_crypto_info *crypto_info);
 
+int tls_strp_dev_init(void);
+void tls_strp_dev_exit(void);
+
+void tls_strp_done(struct tls_strparser *strp);
+void tls_strp_stop(struct tls_strparser *strp);
+int tls_strp_init(struct tls_strparser *strp, struct sock *sk);
+void tls_strp_data_ready(struct tls_strparser *strp);
+
+void tls_strp_check_rcv(struct tls_strparser *strp);
+void tls_strp_msg_done(struct tls_strparser *strp);
+
+int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb);
+void tls_rx_msg_ready(struct tls_strparser *strp);
+
+void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh);
 int tls_strp_msg_cow(struct tls_sw_context_rx *ctx);
 struct sk_buff *tls_strp_msg_detach(struct tls_sw_context_rx *ctx);
-int tls_strp_msg_hold(struct sock *sk, struct sk_buff *skb,
-		      struct sk_buff_head *dst);
+int tls_strp_msg_hold(struct tls_strparser *strp, struct sk_buff_head *dst);
 
 static inline struct tls_msg *tls_msg(struct sk_buff *skb)
 {
@@ -141,12 +156,13 @@ static inline struct tls_msg *tls_msg(struct sk_buff *skb)
 
 static inline struct sk_buff *tls_strp_msg(struct tls_sw_context_rx *ctx)
 {
-	return ctx->recv_pkt;
+	DEBUG_NET_WARN_ON_ONCE(!ctx->strp.msg_ready || !ctx->strp.anchor->len);
+	return ctx->strp.anchor;
 }
 
 static inline bool tls_strp_msg_ready(struct tls_sw_context_rx *ctx)
 {
-	return ctx->recv_pkt;
+	return ctx->strp.msg_ready;
 }
 
 #ifdef CONFIG_TLS_DEVICE
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 9703636cfc60..08ddf9d837ae 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -725,6 +725,10 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval,
 	if (tx) {
 		ctx->sk_write_space = sk->sk_write_space;
 		sk->sk_write_space = tls_write_space;
+	} else {
+		struct tls_sw_context_rx *rx_ctx = tls_sw_ctx_rx(ctx);
+
+		tls_strp_check_rcv(&rx_ctx->strp);
 	}
 	return 0;
 
@@ -1141,20 +1145,28 @@ static int __init tls_register(void)
 	if (err)
 		return err;
 
+	err = tls_strp_dev_init();
+	if (err)
+		goto err_pernet;
+
 	err = tls_device_init();
-	if (err) {
-		unregister_pernet_subsys(&tls_proc_ops);
-		return err;
-	}
+	if (err)
+		goto err_strp;
 
 	tcp_register_ulp(&tcp_tls_ulp_ops);
 
 	return 0;
+err_strp:
+	tls_strp_dev_exit();
+err_pernet:
+	unregister_pernet_subsys(&tls_proc_ops);
+	return err;
 }
 
 static void __exit tls_unregister(void)
 {
 	tcp_unregister_ulp(&tcp_tls_ulp_ops);
+	tls_strp_dev_exit();
 	tls_device_cleanup();
 	unregister_pernet_subsys(&tls_proc_ops);
 }
diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c
index d9bb4f23f01a..b945288c312e 100644
--- a/net/tls/tls_strp.c
+++ b/net/tls/tls_strp.c
@@ -1,37 +1,493 @@
 // SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2016 Tom Herbert <tom@herbertland.com> */
 
 #include <linux/skbuff.h>
+#include <linux/workqueue.h>
+#include <net/strparser.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/tls.h>
 
 #include "tls.h"
 
-struct sk_buff *tls_strp_msg_detach(struct tls_sw_context_rx *ctx)
+static struct workqueue_struct *tls_strp_wq;
+
+static void tls_strp_abort_strp(struct tls_strparser *strp, int err)
+{
+	if (strp->stopped)
+		return;
+
+	strp->stopped = 1;
+
+	/* Report an error on the lower socket */
+	strp->sk->sk_err = -err;
+	sk_error_report(strp->sk);
+}
+
+static void tls_strp_anchor_free(struct tls_strparser *strp)
 {
+	struct skb_shared_info *shinfo = skb_shinfo(strp->anchor);
+
+	DEBUG_NET_WARN_ON_ONCE(atomic_read(&shinfo->dataref) != 1);
+	shinfo->frag_list = NULL;
+	consume_skb(strp->anchor);
+	strp->anchor = NULL;
+}
+
+/* Create a new skb with the contents of input copied to its page frags */
+static struct sk_buff *tls_strp_msg_make_copy(struct tls_strparser *strp)
+{
+	struct strp_msg *rxm;
 	struct sk_buff *skb;
+	int i, err, offset;
+
+	skb = alloc_skb_with_frags(0, strp->anchor->len, TLS_PAGE_ORDER,
+				   &err, strp->sk->sk_allocation);
+	if (!skb)
+		return NULL;
+
+	offset = strp->stm.offset;
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 
-	skb = ctx->recv_pkt;
-	ctx->recv_pkt = NULL;
+		WARN_ON_ONCE(skb_copy_bits(strp->anchor, offset,
+					   skb_frag_address(frag),
+					   skb_frag_size(frag)));
+		offset += skb_frag_size(frag);
+	}
+
+	skb_copy_header(skb, strp->anchor);
+	rxm = strp_msg(skb);
+	rxm->offset = 0;
 	return skb;
 }
 
+/* Steal the input skb, input msg is invalid after calling this function */
+struct sk_buff *tls_strp_msg_detach(struct tls_sw_context_rx *ctx)
+{
+	struct tls_strparser *strp = &ctx->strp;
+
+#ifdef CONFIG_TLS_DEVICE
+	DEBUG_NET_WARN_ON_ONCE(!strp->anchor->decrypted);
+#else
+	/* This function turns an input into an output,
+	 * that can only happen if we have offload.
+	 */
+	WARN_ON(1);
+#endif
+
+	if (strp->copy_mode) {
+		struct sk_buff *skb;
+
+		/* Replace anchor with an empty skb, this is a little
+		 * dangerous but __tls_cur_msg() warns on empty skbs
+		 * so hopefully we'll catch abuses.
+		 */
+		skb = alloc_skb(0, strp->sk->sk_allocation);
+		if (!skb)
+			return NULL;
+
+		swap(strp->anchor, skb);
+		return skb;
+	}
+
+	return tls_strp_msg_make_copy(strp);
+}
+
+/* Force the input skb to be in copy mode. The data ownership remains
+ * with the input skb itself (meaning unpause will wipe it) but it can
+ * be modified.
+ */
 int tls_strp_msg_cow(struct tls_sw_context_rx *ctx)
 {
-	struct sk_buff *unused;
-	int nsg;
+	struct tls_strparser *strp = &ctx->strp;
+	struct sk_buff *skb;
+
+	if (strp->copy_mode)
+		return 0;
+
+	skb = tls_strp_msg_make_copy(strp);
+	if (!skb)
+		return -ENOMEM;
+
+	tls_strp_anchor_free(strp);
+	strp->anchor = skb;
+
+	tcp_read_done(strp->sk, strp->stm.full_len);
+	strp->copy_mode = 1;
+
+	return 0;
+}
+
+/* Make a clone (in the skb sense) of the input msg to keep a reference
+ * to the underlying data. The reference-holding skbs get placed on
+ * @dst.
+ */
+int tls_strp_msg_hold(struct tls_strparser *strp, struct sk_buff_head *dst)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(strp->anchor);
+
+	if (strp->copy_mode) {
+		struct sk_buff *skb;
+
+		WARN_ON_ONCE(!shinfo->nr_frags);
+
+		/* We can't skb_clone() the anchor, it gets wiped by unpause */
+		skb = alloc_skb(0, strp->sk->sk_allocation);
+		if (!skb)
+			return -ENOMEM;
+
+		__skb_queue_tail(dst, strp->anchor);
+		strp->anchor = skb;
+	} else {
+		struct sk_buff *iter, *clone;
+		int chunk, len, offset;
+
+		offset = strp->stm.offset;
+		len = strp->stm.full_len;
+		iter = shinfo->frag_list;
+
+		while (len > 0) {
+			if (iter->len <= offset) {
+				offset -= iter->len;
+				goto next;
+			}
+
+			chunk = iter->len - offset;
+			offset = 0;
+
+			clone = skb_clone(iter, strp->sk->sk_allocation);
+			if (!clone)
+				return -ENOMEM;
+			__skb_queue_tail(dst, clone);
+
+			len -= chunk;
+next:
+			iter = iter->next;
+		}
+	}
+
+	return 0;
+}
+
+static void tls_strp_flush_anchor_copy(struct tls_strparser *strp)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(strp->anchor);
+	int i;
+
+	DEBUG_NET_WARN_ON_ONCE(atomic_read(&shinfo->dataref) != 1);
+
+	for (i = 0; i < shinfo->nr_frags; i++)
+		__skb_frag_unref(&shinfo->frags[i], false);
+	shinfo->nr_frags = 0;
+	strp->copy_mode = 0;
+}
+
+static int tls_strp_copyin(read_descriptor_t *desc, struct sk_buff *in_skb,
+			   unsigned int offset, size_t in_len)
+{
+	struct tls_strparser *strp = (struct tls_strparser *)desc->arg.data;
+	size_t sz, len, chunk;
+	struct sk_buff *skb;
+	skb_frag_t *frag;
+
+	if (strp->msg_ready)
+		return 0;
+
+	skb = strp->anchor;
+	frag = &skb_shinfo(skb)->frags[skb->len / PAGE_SIZE];
+
+	len = in_len;
+	/* First make sure we got the header */
+	if (!strp->stm.full_len) {
+		/* Assume one page is more than enough for headers */
+		chunk =	min_t(size_t, len, PAGE_SIZE - skb_frag_size(frag));
+		WARN_ON_ONCE(skb_copy_bits(in_skb, offset,
+					   skb_frag_address(frag) +
+					   skb_frag_size(frag),
+					   chunk));
+
+		sz = tls_rx_msg_size(strp, strp->anchor);
+		if (sz < 0) {
+			desc->error = sz;
+			return 0;
+		}
+
+		/* We may have over-read, sz == 0 is guaranteed under-read */
+		if (sz > 0)
+			chunk =	min_t(size_t, chunk, sz - skb->len);
+
+		skb->len += chunk;
+		skb->data_len += chunk;
+		skb_frag_size_add(frag, chunk);
+		frag++;
+		len -= chunk;
+		offset += chunk;
+
+		strp->stm.full_len = sz;
+		if (!strp->stm.full_len)
+			goto read_done;
+	}
+
+	/* Load up more data */
+	while (len && strp->stm.full_len > skb->len) {
+		chunk =	min_t(size_t, len, strp->stm.full_len - skb->len);
+		chunk = min_t(size_t, chunk, PAGE_SIZE - skb_frag_size(frag));
+		WARN_ON_ONCE(skb_copy_bits(in_skb, offset,
+					   skb_frag_address(frag) +
+					   skb_frag_size(frag),
+					   chunk));
+
+		skb->len += chunk;
+		skb->data_len += chunk;
+		skb_frag_size_add(frag, chunk);
+		frag++;
+		len -= chunk;
+		offset += chunk;
+	}
+
+	if (strp->stm.full_len == skb->len) {
+		desc->count = 0;
+
+		strp->msg_ready = 1;
+		tls_rx_msg_ready(strp);
+	}
+
+read_done:
+	return in_len - len;
+}
+
+static int tls_strp_read_copyin(struct tls_strparser *strp)
+{
+	struct socket *sock = strp->sk->sk_socket;
+	read_descriptor_t desc;
+
+	desc.arg.data = strp;
+	desc.error = 0;
+	desc.count = 1; /* give more than one skb per call */
+
+	/* sk should be locked here, so okay to do read_sock */
+	sock->ops->read_sock(strp->sk, &desc, tls_strp_copyin);
+
+	return desc.error;
+}
+
+static int tls_strp_read_short(struct tls_strparser *strp)
+{
+	struct skb_shared_info *shinfo;
+	struct page *page;
+	int need_spc, len;
+
+	/* If the rbuf is small or rcv window has collapsed to 0 we need
+	 * to read the data out. Otherwise the connection will stall.
+	 * Without pressure threshold of INT_MAX will never be ready.
+	 */
+	if (likely(!tcp_epollin_ready(strp->sk, INT_MAX)))
+		return 0;
+
+	shinfo = skb_shinfo(strp->anchor);
+	shinfo->frag_list = NULL;
+
+	/* If we don't know the length go max plus page for cipher overhead */
+	need_spc = strp->stm.full_len ?: TLS_MAX_PAYLOAD_SIZE + PAGE_SIZE;
+
+	for (len = need_spc; len > 0; len -= PAGE_SIZE) {
+		page = alloc_page(strp->sk->sk_allocation);
+		if (!page) {
+			tls_strp_flush_anchor_copy(strp);
+			return -ENOMEM;
+		}
+
+		skb_fill_page_desc(strp->anchor, shinfo->nr_frags++,
+				   page, 0, 0);
+	}
+
+	strp->copy_mode = 1;
+	strp->stm.offset = 0;
+
+	strp->anchor->len = 0;
+	strp->anchor->data_len = 0;
+	strp->anchor->truesize = round_up(need_spc, PAGE_SIZE);
+
+	tls_strp_read_copyin(strp);
+
+	return 0;
+}
+
+static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len)
+{
+	struct tcp_sock *tp = tcp_sk(strp->sk);
+	struct sk_buff *first;
+	u32 offset;
+
+	first = tcp_recv_skb(strp->sk, tp->copied_seq, &offset);
+	if (WARN_ON_ONCE(!first))
+		return;
+
+	/* Bestow the state onto the anchor */
+	strp->anchor->len = offset + len;
+	strp->anchor->data_len = offset + len;
+	strp->anchor->truesize = offset + len;
+
+	skb_shinfo(strp->anchor)->frag_list = first;
+
+	skb_copy_header(strp->anchor, first);
+	strp->anchor->destructor = NULL;
+
+	strp->stm.offset = offset;
+}
+
+void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+{
+	struct strp_msg *rxm;
+	struct tls_msg *tlm;
+
+	DEBUG_NET_WARN_ON_ONCE(!strp->msg_ready);
+	DEBUG_NET_WARN_ON_ONCE(!strp->stm.full_len);
+
+	if (!strp->copy_mode && force_refresh) {
+		if (WARN_ON(tcp_inq(strp->sk) < strp->stm.full_len))
+			return;
+
+		tls_strp_load_anchor_with_queue(strp, strp->stm.full_len);
+	}
+
+	rxm = strp_msg(strp->anchor);
+	rxm->full_len	= strp->stm.full_len;
+	rxm->offset	= strp->stm.offset;
+	tlm = tls_msg(strp->anchor);
+	tlm->control	= strp->mark;
+}
+
+/* Called with lock held on lower socket */
+static int tls_strp_read_sock(struct tls_strparser *strp)
+{
+	int sz, inq;
+
+	inq = tcp_inq(strp->sk);
+	if (inq < 1)
+		return 0;
+
+	if (unlikely(strp->copy_mode))
+		return tls_strp_read_copyin(strp);
+
+	if (inq < strp->stm.full_len)
+		return tls_strp_read_short(strp);
+
+	if (!strp->stm.full_len) {
+		tls_strp_load_anchor_with_queue(strp, inq);
+
+		sz = tls_rx_msg_size(strp, strp->anchor);
+		if (sz < 0) {
+			tls_strp_abort_strp(strp, sz);
+			return sz;
+		}
+
+		strp->stm.full_len = sz;
+
+		if (!strp->stm.full_len || inq < strp->stm.full_len)
+			return tls_strp_read_short(strp);
+	}
+
+	strp->msg_ready = 1;
+	tls_rx_msg_ready(strp);
+
+	return 0;
+}
+
+void tls_strp_check_rcv(struct tls_strparser *strp)
+{
+	if (unlikely(strp->stopped) || strp->msg_ready)
+		return;
+
+	if (tls_strp_read_sock(strp) == -ENOMEM)
+		queue_work(tls_strp_wq, &strp->work);
+}
+
+/* Lower sock lock held */
+void tls_strp_data_ready(struct tls_strparser *strp)
+{
+	/* This check is needed to synchronize with do_tls_strp_work.
+	 * do_tls_strp_work acquires a process lock (lock_sock) whereas
+	 * the lock held here is bh_lock_sock. The two locks can be
+	 * held by different threads at the same time, but bh_lock_sock
+	 * allows a thread in BH context to safely check if the process
+	 * lock is held. In this case, if the lock is held, queue work.
+	 */
+	if (sock_owned_by_user_nocheck(strp->sk)) {
+		queue_work(tls_strp_wq, &strp->work);
+		return;
+	}
+
+	tls_strp_check_rcv(strp);
+}
+
+static void tls_strp_work(struct work_struct *w)
+{
+	struct tls_strparser *strp =
+		container_of(w, struct tls_strparser, work);
+
+	lock_sock(strp->sk);
+	tls_strp_check_rcv(strp);
+	release_sock(strp->sk);
+}
+
+void tls_strp_msg_done(struct tls_strparser *strp)
+{
+	WARN_ON(!strp->stm.full_len);
+
+	if (likely(!strp->copy_mode))
+		tcp_read_done(strp->sk, strp->stm.full_len);
+	else
+		tls_strp_flush_anchor_copy(strp);
+
+	strp->msg_ready = 0;
+	memset(&strp->stm, 0, sizeof(strp->stm));
+
+	tls_strp_check_rcv(strp);
+}
+
+void tls_strp_stop(struct tls_strparser *strp)
+{
+	strp->stopped = 1;
+}
+
+int tls_strp_init(struct tls_strparser *strp, struct sock *sk)
+{
+	memset(strp, 0, sizeof(*strp));
+
+	strp->sk = sk;
+
+	strp->anchor = alloc_skb(0, GFP_KERNEL);
+	if (!strp->anchor)
+		return -ENOMEM;
+
+	INIT_WORK(&strp->work, tls_strp_work);
 
-	nsg = skb_cow_data(ctx->recv_pkt, 0, &unused);
-	if (nsg < 0)
-		return nsg;
 	return 0;
 }
 
-int tls_strp_msg_hold(struct sock *sk, struct sk_buff *skb,
-		      struct sk_buff_head *dst)
+/* strp must already be stopped so that tls_strp_recv will no longer be called.
+ * Note that tls_strp_done is not called with the lower socket held.
+ */
+void tls_strp_done(struct tls_strparser *strp)
 {
-	struct sk_buff *clone;
+	WARN_ON(!strp->stopped);
 
-	clone = skb_clone(skb, sk->sk_allocation);
-	if (!clone)
+	cancel_work_sync(&strp->work);
+	tls_strp_anchor_free(strp);
+}
+
+int __init tls_strp_dev_init(void)
+{
+	tls_strp_wq = create_singlethread_workqueue("kstrp");
+	if (unlikely(!tls_strp_wq))
 		return -ENOMEM;
-	__skb_queue_tail(dst, clone);
+
 	return 0;
 }
+
+void tls_strp_dev_exit(void)
+{
+	destroy_workqueue(tls_strp_wq);
+}
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index bd4486819e64..0fc24a5ce208 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1283,7 +1283,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
 
 static int
 tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock,
-		long timeo)
+		bool released, long timeo)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
@@ -1297,7 +1297,7 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock,
 			return sock_error(sk);
 
 		if (!skb_queue_empty(&sk->sk_receive_queue)) {
-			__strp_unpause(&ctx->strp);
+			tls_strp_check_rcv(&ctx->strp);
 			if (tls_strp_msg_ready(ctx))
 				break;
 		}
@@ -1311,6 +1311,7 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock,
 		if (nonblock || !timeo)
 			return -EAGAIN;
 
+		released = true;
 		add_wait_queue(sk_sleep(sk), &wait);
 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 		sk_wait_event(sk, &timeo,
@@ -1325,6 +1326,8 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock,
 			return sock_intr_errno(timeo);
 	}
 
+	tls_strp_msg_load(&ctx->strp, released);
+
 	return 1;
 }
 
@@ -1570,7 +1573,7 @@ static int tls_decrypt_sg(struct sock *sk, struct iov_iter *out_iov,
 	clear_skb = NULL;
 
 	if (unlikely(darg->async)) {
-		err = tls_strp_msg_hold(sk, skb, &ctx->async_hold);
+		err = tls_strp_msg_hold(&ctx->strp, &ctx->async_hold);
 		if (err)
 			__skb_queue_tail(&ctx->async_hold, darg->skb);
 		return err;
@@ -1734,9 +1737,7 @@ static int tls_record_content_type(struct msghdr *msg, struct tls_msg *tlm,
 
 static void tls_rx_rec_done(struct tls_sw_context_rx *ctx)
 {
-	consume_skb(ctx->recv_pkt);
-	ctx->recv_pkt = NULL;
-	__strp_unpause(&ctx->strp);
+	tls_strp_msg_done(&ctx->strp);
 }
 
 /* This function traverses the rx_list in tls receive context to copies the
@@ -1823,7 +1824,7 @@ out:
 	return copied ? : err;
 }
 
-static void
+static bool
 tls_read_flush_backlog(struct sock *sk, struct tls_prot_info *prot,
 		       size_t len_left, size_t decrypted, ssize_t done,
 		       size_t *flushed_at)
@@ -1831,14 +1832,14 @@ tls_read_flush_backlog(struct sock *sk, struct tls_prot_info *prot,
 	size_t max_rec;
 
 	if (len_left <= decrypted)
-		return;
+		return false;
 
 	max_rec = prot->overhead_size - prot->tail_size + TLS_MAX_PAYLOAD_SIZE;
 	if (done - *flushed_at < SZ_128K && tcp_inq(sk) > max_rec)
-		return;
+		return false;
 
 	*flushed_at = done;
-	sk_flush_backlog(sk);
+	return sk_flush_backlog(sk);
 }
 
 static long tls_rx_reader_lock(struct sock *sk, struct tls_sw_context_rx *ctx,
@@ -1916,6 +1917,7 @@ int tls_sw_recvmsg(struct sock *sk,
 	long timeo;
 	bool is_kvec = iov_iter_is_kvec(&msg->msg_iter);
 	bool is_peek = flags & MSG_PEEK;
+	bool released = true;
 	bool bpf_strp_enabled;
 	bool zc_capable;
 
@@ -1952,7 +1954,8 @@ int tls_sw_recvmsg(struct sock *sk,
 		struct tls_decrypt_arg darg;
 		int to_decrypt, chunk;
 
-		err = tls_rx_rec_wait(sk, psock, flags & MSG_DONTWAIT, timeo);
+		err = tls_rx_rec_wait(sk, psock, flags & MSG_DONTWAIT, released,
+				      timeo);
 		if (err <= 0) {
 			if (psock) {
 				chunk = sk_msg_recvmsg(sk, psock, msg, len,
@@ -1968,8 +1971,8 @@ int tls_sw_recvmsg(struct sock *sk,
 
 		memset(&darg.inargs, 0, sizeof(darg.inargs));
 
-		rxm = strp_msg(ctx->recv_pkt);
-		tlm = tls_msg(ctx->recv_pkt);
+		rxm = strp_msg(tls_strp_msg(ctx));
+		tlm = tls_msg(tls_strp_msg(ctx));
 
 		to_decrypt = rxm->full_len - prot->overhead_size;
 
@@ -2008,8 +2011,9 @@ put_on_rx_list_err:
 		}
 
 		/* periodically flush backlog, and feed strparser */
-		tls_read_flush_backlog(sk, prot, len, to_decrypt,
-				       decrypted + copied, &flushed_at);
+		released = tls_read_flush_backlog(sk, prot, len, to_decrypt,
+						  decrypted + copied,
+						  &flushed_at);
 
 		/* TLS 1.3 may have updated the length by more than overhead */
 		rxm = strp_msg(darg.skb);
@@ -2020,7 +2024,7 @@ put_on_rx_list_err:
 			bool partially_consumed = chunk > len;
 			struct sk_buff *skb = darg.skb;
 
-			DEBUG_NET_WARN_ON_ONCE(darg.skb == ctx->recv_pkt);
+			DEBUG_NET_WARN_ON_ONCE(darg.skb == tls_strp_msg(ctx));
 
 			if (async) {
 				/* TLS 1.2-only, to_decrypt must be text len */
@@ -2034,6 +2038,7 @@ put_on_rx_list:
 			}
 
 			if (bpf_strp_enabled) {
+				released = true;
 				err = sk_psock_tls_strp_read(psock, skb);
 				if (err != __SK_PASS) {
 					rxm->offset = rxm->offset + rxm->full_len;
@@ -2140,7 +2145,7 @@ ssize_t tls_sw_splice_read(struct socket *sock,  loff_t *ppos,
 		struct tls_decrypt_arg darg;
 
 		err = tls_rx_rec_wait(sk, NULL, flags & SPLICE_F_NONBLOCK,
-				      timeo);
+				      true, timeo);
 		if (err <= 0)
 			goto splice_read_end;
 
@@ -2204,19 +2209,17 @@ bool tls_sw_sock_is_readable(struct sock *sk)
 		!skb_queue_empty(&ctx->rx_list);
 }
 
-static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
+int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
 	struct tls_prot_info *prot = &tls_ctx->prot_info;
 	char header[TLS_HEADER_SIZE + MAX_IV_SIZE];
-	struct strp_msg *rxm = strp_msg(skb);
-	struct tls_msg *tlm = tls_msg(skb);
 	size_t cipher_overhead;
 	size_t data_len = 0;
 	int ret;
 
 	/* Verify that we have a full TLS header, or wait for more data */
-	if (rxm->offset + prot->prepend_size > skb->len)
+	if (strp->stm.offset + prot->prepend_size > skb->len)
 		return 0;
 
 	/* Sanity-check size of on-stack buffer. */
@@ -2226,11 +2229,11 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
 	}
 
 	/* Linearize header to local buffer */
-	ret = skb_copy_bits(skb, rxm->offset, header, prot->prepend_size);
+	ret = skb_copy_bits(skb, strp->stm.offset, header, prot->prepend_size);
 	if (ret < 0)
 		goto read_failure;
 
-	tlm->control = header[0];
+	strp->mark = header[0];
 
 	data_len = ((header[4] & 0xFF) | (header[3] << 8));
 
@@ -2257,7 +2260,7 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
 	}
 
 	tls_device_rx_resync_new_rec(strp->sk, data_len + TLS_HEADER_SIZE,
-				     TCP_SKB_CB(skb)->seq + rxm->offset);
+				     TCP_SKB_CB(skb)->seq + strp->stm.offset);
 	return data_len + TLS_HEADER_SIZE;
 
 read_failure:
@@ -2266,14 +2269,11 @@ read_failure:
 	return ret;
 }
 
-static void tls_queue(struct strparser *strp, struct sk_buff *skb)
+void tls_rx_msg_ready(struct tls_strparser *strp)
 {
-	struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
-	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
-
-	ctx->recv_pkt = skb;
-	strp_pause(strp);
+	struct tls_sw_context_rx *ctx;
 
+	ctx = container_of(strp, struct tls_sw_context_rx, strp);
 	ctx->saved_data_ready(strp->sk);
 }
 
@@ -2283,7 +2283,7 @@ static void tls_data_ready(struct sock *sk)
 	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
 	struct sk_psock *psock;
 
-	strp_data_ready(&ctx->strp);
+	tls_strp_data_ready(&ctx->strp);
 
 	psock = sk_psock_get(sk);
 	if (psock) {
@@ -2359,13 +2359,11 @@ void tls_sw_release_resources_rx(struct sock *sk)
 	kfree(tls_ctx->rx.iv);
 
 	if (ctx->aead_recv) {
-		kfree_skb(ctx->recv_pkt);
-		ctx->recv_pkt = NULL;
 		__skb_queue_purge(&ctx->rx_list);
 		crypto_free_aead(ctx->aead_recv);
-		strp_stop(&ctx->strp);
+		tls_strp_stop(&ctx->strp);
 		/* If tls_sw_strparser_arm() was not called (cleanup paths)
-		 * we still want to strp_stop(), but sk->sk_data_ready was
+		 * we still want to tls_strp_stop(), but sk->sk_data_ready was
 		 * never swapped.
 		 */
 		if (ctx->saved_data_ready) {
@@ -2380,7 +2378,7 @@ void tls_sw_strparser_done(struct tls_context *tls_ctx)
 {
 	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
 
-	strp_done(&ctx->strp);
+	tls_strp_done(&ctx->strp);
 }
 
 void tls_sw_free_ctx_rx(struct tls_context *tls_ctx)
@@ -2453,8 +2451,6 @@ void tls_sw_strparser_arm(struct sock *sk, struct tls_context *tls_ctx)
 	rx_ctx->saved_data_ready = sk->sk_data_ready;
 	sk->sk_data_ready = tls_data_ready;
 	write_unlock_bh(&sk->sk_callback_lock);
-
-	strp_check_rcv(&rx_ctx->strp);
 }
 
 void tls_update_rx_zc_capable(struct tls_context *tls_ctx)
@@ -2474,7 +2470,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 	struct tls_sw_context_rx *sw_ctx_rx = NULL;
 	struct cipher_context *cctx;
 	struct crypto_aead **aead;
-	struct strp_callbacks cb;
 	u16 nonce_size, tag_size, iv_size, rec_seq_size, salt_size;
 	struct crypto_tfm *tfm;
 	char *iv, *rec_seq, *key, *salt, *cipher_name;
@@ -2708,12 +2703,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 			crypto_info->version != TLS_1_3_VERSION &&
 			!!(tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC);
 
-		/* Set up strparser */
-		memset(&cb, 0, sizeof(cb));
-		cb.rcv_msg = tls_queue;
-		cb.parse_msg = tls_read_size;
-
-		strp_init(&sw_ctx_rx->strp, sk, &cb);
+		tls_strp_init(&sw_ctx_rx->strp, sk);
 	}
 
 	goto out;
-- 
cgit 


From 04b9407197789c81fffac52921e703cb47967d6a Mon Sep 17 00:00:00 2001
From: Daniil Lunev <dlunev@chromium.org>
Date: Wed, 27 Jul 2022 16:44:24 +1000
Subject: vfs: function to prevent re-use of block-device-based superblocks

The function is to be called from filesystem-specific code to mark a
superblock to be ignored by superblock test and thus never re-used.  The
function also unregisters bdi if the bdi is per-superblock to avoid
collision if a new superblock is created to represent the filesystem.
generic_shutdown_super() skips unregistering bdi for a retired superlock as
it assumes retire function has already done it.

This patch adds the functionality only for the block-device-based supers,
since the primary use case of the feature is to gracefully handle force
unmount of external devices, mounted with FUSE.  This can be further
extended to cover all superblocks, if the need arises.

Signed-off-by: Daniil Lunev <dlunev@chromium.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/super.c         | 33 +++++++++++++++++++++++++++++++--
 include/linux/fs.h |  2 ++
 2 files changed, 33 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/super.c b/fs/super.c
index 60f57c7bc0a6..258d9257b0a7 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -422,6 +422,35 @@ bool trylock_super(struct super_block *sb)
 	return false;
 }
 
+/**
+ *	retire_super	-	prevents superblock from being reused
+ *	@sb: superblock to retire
+ *
+ *	The function marks superblock to be ignored in superblock test, which
+ *	prevents it from being reused for any new mounts.  If the superblock has
+ *	a private bdi, it also unregisters it, but doesn't reduce the refcount
+ *	of the superblock to prevent potential races.  The refcount is reduced
+ *	by generic_shutdown_super().  The function can not be called
+ *	concurrently with generic_shutdown_super().  It is safe to call the
+ *	function multiple times, subsequent calls have no effect.
+ *
+ *	The marker will affect the re-use only for block-device-based
+ *	superblocks.  Other superblocks will still get marked if this function
+ *	is used, but that will not affect their reusability.
+ */
+void retire_super(struct super_block *sb)
+{
+	WARN_ON(!sb->s_bdev);
+	down_write(&sb->s_umount);
+	if (sb->s_iflags & SB_I_PERSB_BDI) {
+		bdi_unregister(sb->s_bdi);
+		sb->s_iflags &= ~SB_I_PERSB_BDI;
+	}
+	sb->s_iflags |= SB_I_RETIRED;
+	up_write(&sb->s_umount);
+}
+EXPORT_SYMBOL(retire_super);
+
 /**
  *	generic_shutdown_super	-	common helper for ->kill_sb()
  *	@sb: superblock to kill
@@ -1216,7 +1245,7 @@ static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc)
 
 static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc)
 {
-	return s->s_bdev == fc->sget_key;
+	return !(s->s_iflags & SB_I_RETIRED) && s->s_bdev == fc->sget_key;
 }
 
 /**
@@ -1307,7 +1336,7 @@ EXPORT_SYMBOL(get_tree_bdev);
 
 static int test_bdev_super(struct super_block *s, void *data)
 {
-	return (void *)s->s_bdev == data;
+	return !(s->s_iflags & SB_I_RETIRED) && (void *)s->s_bdev == data;
 }
 
 struct dentry *mount_bdev(struct file_system_type *fs_type,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ad5e3520fae..246120ee0573 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1412,6 +1412,7 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_I_SKIP_SYNC	0x00000100	/* Skip superblock at global sync */
 #define SB_I_PERSB_BDI	0x00000200	/* has a per-sb bdi */
 #define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */
+#define SB_I_RETIRED	0x00000800	/* superblock shouldn't be reused */
 
 /* Possible states of 'frozen' field */
 enum {
@@ -2432,6 +2433,7 @@ extern struct dentry *mount_nodev(struct file_system_type *fs_type,
 	int flags, void *data,
 	int (*fill_super)(struct super_block *, void *, int));
 extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path);
+void retire_super(struct super_block *sb);
 void generic_shutdown_super(struct super_block *sb);
 void kill_block_super(struct super_block *sb);
 void kill_anon_super(struct super_block *sb);
-- 
cgit 


From 7c56a8733d0a2a4be2438a7512566e5ce552fccf Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.ibm.com>
Date: Wed, 13 Jul 2022 17:47:27 +0200
Subject: watchdog: export lockup_detector_reconfigure

In some circumstances it may be interesting to reconfigure the watchdog
from inside the kernel.

On PowerPC, this may helpful before and after a LPAR migration (LPM) is
initiated, because it implies some latencies, watchdog, and especially NMI
watchdog is expected to be triggered during this operation. Reconfiguring
the watchdog with a factor, would prevent it to happen too frequently
during LPM.

Rename lockup_detector_reconfigure() as __lockup_detector_reconfigure() and
create a new function lockup_detector_reconfigure() calling
__lockup_detector_reconfigure() under the protection of watchdog_mutex.

Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
[mpe: Squash in build fix from Laurent, reported by Sachin]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20220713154729.80789-3-ldufour@linux.ibm.com
---
 include/linux/nmi.h |  2 ++
 kernel/watchdog.c   | 21 ++++++++++++++++-----
 2 files changed, 18 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 750c7f395ca9..f700ff2df074 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -122,6 +122,8 @@ int watchdog_nmi_probe(void);
 int watchdog_nmi_enable(unsigned int cpu);
 void watchdog_nmi_disable(unsigned int cpu);
 
+void lockup_detector_reconfigure(void);
+
 /**
  * touch_nmi_watchdog - restart NMI watchdog timeout.
  *
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 20a7a55e62b6..41596c415111 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -541,7 +541,7 @@ int lockup_detector_offline_cpu(unsigned int cpu)
 	return 0;
 }
 
-static void lockup_detector_reconfigure(void)
+static void __lockup_detector_reconfigure(void)
 {
 	cpus_read_lock();
 	watchdog_nmi_stop();
@@ -561,6 +561,13 @@ static void lockup_detector_reconfigure(void)
 	__lockup_detector_cleanup();
 }
 
+void lockup_detector_reconfigure(void)
+{
+	mutex_lock(&watchdog_mutex);
+	__lockup_detector_reconfigure();
+	mutex_unlock(&watchdog_mutex);
+}
+
 /*
  * Create the watchdog infrastructure and configure the detector(s).
  */
@@ -577,13 +584,13 @@ static __init void lockup_detector_setup(void)
 		return;
 
 	mutex_lock(&watchdog_mutex);
-	lockup_detector_reconfigure();
+	__lockup_detector_reconfigure();
 	softlockup_initialized = true;
 	mutex_unlock(&watchdog_mutex);
 }
 
 #else /* CONFIG_SOFTLOCKUP_DETECTOR */
-static void lockup_detector_reconfigure(void)
+static void __lockup_detector_reconfigure(void)
 {
 	cpus_read_lock();
 	watchdog_nmi_stop();
@@ -591,9 +598,13 @@ static void lockup_detector_reconfigure(void)
 	watchdog_nmi_start();
 	cpus_read_unlock();
 }
+void lockup_detector_reconfigure(void)
+{
+	__lockup_detector_reconfigure();
+}
 static inline void lockup_detector_setup(void)
 {
-	lockup_detector_reconfigure();
+	__lockup_detector_reconfigure();
 }
 #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
 
@@ -633,7 +644,7 @@ static void proc_watchdog_update(void)
 {
 	/* Remove impossible cpus to keep sysctl output clean. */
 	cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
-	lockup_detector_reconfigure();
+	__lockup_detector_reconfigure();
 }
 
 /*
-- 
cgit 


From 9f8267b9b29937d997c3b4cec426252aae6311b5 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Wed, 27 Jul 2022 13:49:48 +0200
Subject: misc: Mark MICROCODE_MINOR unused

This one is unused since

  181b6f40e9ea ("x86/microcode: Rip out the OLD_INTERFACE")

so comment it out.

Link: https://lore.kernel.org/r/20220525161232.14924-1-bp@alien8.de
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/r/20220727114948.30123-1-bp@alien8.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/miscdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index 0676f18093f9..c0fea6ca5076 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -44,7 +44,7 @@
 #define AGPGART_MINOR		175
 #define TOSH_MINOR_DEV		181
 #define HWRNG_MINOR		183
-#define MICROCODE_MINOR		184
+/*#define MICROCODE_MINOR	184	unused */
 #define KEYPAD_MINOR		185
 #define IRNET_MINOR		187
 #define D7S_MINOR		193
-- 
cgit 


From 0a2f4f9893c83bd722bd55a903fb682da2eb24ba Mon Sep 17 00:00:00 2001
From: Stefan Raspl <raspl@linux.ibm.com>
Date: Mon, 25 Jul 2022 16:09:58 +0200
Subject: s390/ism: Cleanups

Reworked signature of the function to retrieve the system EID: No plausible
reason to use a double pointer. And neither to pass in the device as an
argument, as this identifier is by definition per system, not per device.
Plus some minor consistency edits.

Signed-off-by: Stefan Raspl <raspl@linux.ibm.com>
Signed-off-by: Wenjia Zhang < wenjia@linux.ibm.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/ism_drv.c | 11 +++++------
 include/net/smc.h          |  2 +-
 net/smc/smc_ism.c          |  2 +-
 3 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c
index 5f7e28de8b15..4665e9a0e048 100644
--- a/drivers/s390/net/ism_drv.c
+++ b/drivers/s390/net/ism_drv.c
@@ -409,20 +409,19 @@ static void ism_create_system_eid(void)
 	memcpy(&SYSTEM_EID.type, tmp, 4);
 }
 
-static void ism_get_system_eid(struct smcd_dev *smcd, u8 **eid)
+static u8 *ism_get_system_eid(void)
 {
-	*eid = &SYSTEM_EID.seid_string[0];
+	return SYSTEM_EID.seid_string;
 }
 
 static u16 ism_get_chid(struct smcd_dev *smcd)
 {
-	struct ism_dev *ismdev;
+	struct ism_dev *ism = (struct ism_dev *)smcd->priv;
 
-	ismdev = (struct ism_dev *)smcd->priv;
-	if (!ismdev || !ismdev->pdev)
+	if (!ism || !ism->pdev)
 		return 0;
 
-	return to_zpci(ismdev->pdev)->pchid;
+	return to_zpci(ism->pdev)->pchid;
 }
 
 static void ism_handle_event(struct ism_dev *ism)
diff --git a/include/net/smc.h b/include/net/smc.h
index 37f829d9c6e5..1868a5014dea 100644
--- a/include/net/smc.h
+++ b/include/net/smc.h
@@ -72,7 +72,7 @@ struct smcd_ops {
 	int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx,
 			 bool sf, unsigned int offset, void *data,
 			 unsigned int size);
-	void (*get_system_eid)(struct smcd_dev *dev, u8 **eid);
+	u8* (*get_system_eid)(void);
 	u16 (*get_chid)(struct smcd_dev *dev);
 };
 
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
index c656ef25ee4b..e51c0cdff8e0 100644
--- a/net/smc/smc_ism.c
+++ b/net/smc/smc_ism.c
@@ -429,7 +429,7 @@ int smcd_register_dev(struct smcd_dev *smcd)
 	if (list_empty(&smcd_dev_list.list)) {
 		u8 *system_eid = NULL;
 
-		smcd->ops->get_system_eid(smcd, &system_eid);
+		system_eid = smcd->ops->get_system_eid();
 		if (system_eid[24] != '0' || system_eid[28] != '0') {
 			smc_ism_v2_capable = true;
 			memcpy(smc_ism_v2_system_eid, system_eid,
-- 
cgit 


From 8b2fed8e2712e8c23665df3c9e0fbabbb76e466c Mon Sep 17 00:00:00 2001
From: Stefan Raspl <raspl@linux.ibm.com>
Date: Mon, 25 Jul 2022 16:09:59 +0200
Subject: net/smc: Pass on DMBE bit mask in IRQ handler

Make the DMBE bits, which are passed on individually in ism_move() as
parameter idx, available to the receiver.

Signed-off-by: Stefan Raspl <raspl@linux.ibm.com>
Signed-off-by: Wenjia Zhang < wenjia@linux.ibm.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/ism_drv.c | 4 +++-
 include/net/smc.h          | 2 +-
 net/smc/smc_ism.c          | 6 +++---
 3 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c
index 4665e9a0e048..d34bb6ec1490 100644
--- a/drivers/s390/net/ism_drv.c
+++ b/drivers/s390/net/ism_drv.c
@@ -443,6 +443,7 @@ static irqreturn_t ism_handle_irq(int irq, void *data)
 	struct ism_dev *ism = data;
 	unsigned long bit, end;
 	unsigned long *bv;
+	u16 dmbemask;
 
 	bv = (void *) &ism->sba->dmb_bits[ISM_DMB_WORD_OFFSET];
 	end = sizeof(ism->sba->dmb_bits) * BITS_PER_BYTE - ISM_DMB_BIT_OFFSET;
@@ -456,9 +457,10 @@ static irqreturn_t ism_handle_irq(int irq, void *data)
 			break;
 
 		clear_bit_inv(bit, bv);
+		dmbemask = ism->sba->dmbe_mask[bit + ISM_DMB_BIT_OFFSET];
 		ism->sba->dmbe_mask[bit + ISM_DMB_BIT_OFFSET] = 0;
 		barrier();
-		smcd_handle_irq(ism->smcd, bit + ISM_DMB_BIT_OFFSET);
+		smcd_handle_irq(ism->smcd, bit + ISM_DMB_BIT_OFFSET, dmbemask);
 	}
 
 	if (ism->sba->e) {
diff --git a/include/net/smc.h b/include/net/smc.h
index 1868a5014dea..c926d3313e05 100644
--- a/include/net/smc.h
+++ b/include/net/smc.h
@@ -101,5 +101,5 @@ int smcd_register_dev(struct smcd_dev *smcd);
 void smcd_unregister_dev(struct smcd_dev *smcd);
 void smcd_free_dev(struct smcd_dev *smcd);
 void smcd_handle_event(struct smcd_dev *dev, struct smcd_event *event);
-void smcd_handle_irq(struct smcd_dev *dev, unsigned int bit);
+void smcd_handle_irq(struct smcd_dev *dev, unsigned int bit, u16 dmbemask);
 #endif	/* _SMC_H */
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
index e51c0cdff8e0..911fe08bc54b 100644
--- a/net/smc/smc_ism.c
+++ b/net/smc/smc_ism.c
@@ -508,13 +508,13 @@ void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event)
 EXPORT_SYMBOL_GPL(smcd_handle_event);
 
 /* SMCD Device interrupt handler. Called from ISM device interrupt handler.
- * Parameters are smcd device pointer and DMB number. Find the connection and
- * schedule the tasklet for this connection.
+ * Parameters are smcd device pointer, DMB number, and the DMBE bitmask.
+ * Find the connection and schedule the tasklet for this connection.
  *
  * Context:
  * - Function called in IRQ context from ISM device driver IRQ handler.
  */
-void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno)
+void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno, u16 dmbemask)
 {
 	struct smc_connection *conn = NULL;
 	unsigned long flags;
-- 
cgit 


From 26c6c2f8a907c9e3a2f24990552a4d77235791e6 Mon Sep 17 00:00:00 2001
From: Weitao Wang <WeitaoWang-oc@zhaoxin.com>
Date: Tue, 26 Jul 2022 15:49:18 +0800
Subject: USB: HCD: Fix URB giveback issue in tasklet function

Usb core introduce the mechanism of giveback of URB in tasklet context to
reduce hardware interrupt handling time. On some test situation(such as
FIO with 4KB block size), when tasklet callback function called to
giveback URB, interrupt handler add URB node to the bh->head list also.
If check bh->head list again after finish all URB giveback of local_list,
then it may introduce a "dynamic balance" between giveback URB and add URB
to bh->head list. This tasklet callback function may not exit for a long
time, which will cause other tasklet function calls to be delayed. Some
real-time applications(such as KB and Mouse) will see noticeable lag.

In order to prevent the tasklet function from occupying the cpu for a long
time at a time, new URBS will not be added to the local_list even though
the bh->head list is not empty. But also need to ensure the left URB
giveback to be processed in time, so add a member high_prio for structure
giveback_urb_bh to prioritize tasklet and schelule this tasklet again if
bh->head list is not empty.

At the same time, we are able to prioritize tasklet through structure
member high_prio. So, replace the local high_prio_bh variable with this
structure member in usb_hcd_giveback_urb.

Fixes: 94dfd7edfd5c ("USB: HCD: support giveback of URB in tasklet context")
Cc: stable <stable@kernel.org>
Reviewed-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Weitao Wang <WeitaoWang-oc@zhaoxin.com>
Link: https://lore.kernel.org/r/20220726074918.5114-1-WeitaoWang-oc@zhaoxin.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/hcd.c  | 26 +++++++++++++++-----------
 include/linux/usb/hcd.h |  1 +
 2 files changed, 16 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index 41dcd41e550c..a6a87c5d1b05 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -1691,7 +1691,6 @@ static void usb_giveback_urb_bh(struct tasklet_struct *t)
 
 	spin_lock_irq(&bh->lock);
 	bh->running = true;
- restart:
 	list_replace_init(&bh->head, &local_list);
 	spin_unlock_irq(&bh->lock);
 
@@ -1705,10 +1704,17 @@ static void usb_giveback_urb_bh(struct tasklet_struct *t)
 		bh->completing_ep = NULL;
 	}
 
-	/* check if there are new URBs to giveback */
+	/*
+	 * giveback new URBs next time to prevent this function
+	 * from not exiting for a long time.
+	 */
 	spin_lock_irq(&bh->lock);
-	if (!list_empty(&bh->head))
-		goto restart;
+	if (!list_empty(&bh->head)) {
+		if (bh->high_prio)
+			tasklet_hi_schedule(&bh->bh);
+		else
+			tasklet_schedule(&bh->bh);
+	}
 	bh->running = false;
 	spin_unlock_irq(&bh->lock);
 }
@@ -1737,7 +1743,7 @@ static void usb_giveback_urb_bh(struct tasklet_struct *t)
 void usb_hcd_giveback_urb(struct usb_hcd *hcd, struct urb *urb, int status)
 {
 	struct giveback_urb_bh *bh;
-	bool running, high_prio_bh;
+	bool running;
 
 	/* pass status to tasklet via unlinked */
 	if (likely(!urb->unlinked))
@@ -1748,13 +1754,10 @@ void usb_hcd_giveback_urb(struct usb_hcd *hcd, struct urb *urb, int status)
 		return;
 	}
 
-	if (usb_pipeisoc(urb->pipe) || usb_pipeint(urb->pipe)) {
+	if (usb_pipeisoc(urb->pipe) || usb_pipeint(urb->pipe))
 		bh = &hcd->high_prio_bh;
-		high_prio_bh = true;
-	} else {
+	else
 		bh = &hcd->low_prio_bh;
-		high_prio_bh = false;
-	}
 
 	spin_lock(&bh->lock);
 	list_add_tail(&urb->urb_list, &bh->head);
@@ -1763,7 +1766,7 @@ void usb_hcd_giveback_urb(struct usb_hcd *hcd, struct urb *urb, int status)
 
 	if (running)
 		;
-	else if (high_prio_bh)
+	else if (bh->high_prio)
 		tasklet_hi_schedule(&bh->bh);
 	else
 		tasklet_schedule(&bh->bh);
@@ -2959,6 +2962,7 @@ int usb_add_hcd(struct usb_hcd *hcd,
 
 	/* initialize tasklets */
 	init_giveback_urb_bh(&hcd->high_prio_bh);
+	hcd->high_prio_bh.high_prio = true;
 	init_giveback_urb_bh(&hcd->low_prio_bh);
 
 	/* enable irqs just before we start the controller,
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index 2c1fc9212cf2..98d1921f02b1 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -66,6 +66,7 @@
 
 struct giveback_urb_bh {
 	bool running;
+	bool high_prio;
 	spinlock_t lock;
 	struct list_head  head;
 	struct tasklet_struct bh;
-- 
cgit 


From 6eabfc018e8d1033e7fc1efce30a872e2dccb537 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Tue, 26 Jul 2022 10:38:21 -0700
Subject: regulator: core: Allow specifying an initial load w/ the bulk API

There are a number of drivers that follow a pattern that looks like
this:
1. Use the regulator bulk API to get a bunch of regulators.
2. Set the load on each of the regulators to use whenever the
   regulators are enabled.

Let's make this easier by just allowing the drivers to pass the load
in.

As part of this change we need to move the error printing in
regulator_bulk_get() around; let's switch to the new dev_err_probe()
to simplify it.

Signed-off-by: Douglas Anderson <dianders@chromium.org>
Link: https://lore.kernel.org/r/20220726103631.v2.4.Ie85f68215ada39f502a96dcb8a1f3ad977e3f68a@changeid
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/core.c           | 20 ++++++++++++--------
 include/linux/regulator/consumer.h | 12 ++++++++----
 2 files changed, 20 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index 1e54a833f2cf..17c476fc8adb 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -4783,22 +4783,26 @@ int regulator_bulk_get(struct device *dev, int num_consumers,
 		consumers[i].consumer = regulator_get(dev,
 						      consumers[i].supply);
 		if (IS_ERR(consumers[i].consumer)) {
-			ret = PTR_ERR(consumers[i].consumer);
 			consumers[i].consumer = NULL;
+			ret = dev_err_probe(dev, PTR_ERR(consumers[i].consumer),
+					    "Failed to get supply '%s'",
+					    consumers[i].supply);
 			goto err;
 		}
+
+		if (consumers[i].init_load_uA > 0) {
+			ret = regulator_set_load(consumers[i].consumer,
+						 consumers[i].init_load_uA);
+			if (ret) {
+				i++;
+				goto err;
+			}
+		}
 	}
 
 	return 0;
 
 err:
-	if (ret != -EPROBE_DEFER)
-		dev_err(dev, "Failed to get supply '%s': %pe\n",
-			consumers[i].supply, ERR_PTR(ret));
-	else
-		dev_dbg(dev, "Failed to get supply '%s', deferring\n",
-			consumers[i].supply);
-
 	while (--i >= 0)
 		regulator_put(consumers[i].consumer);
 
diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h
index bbf6590a6dec..5779f4466e62 100644
--- a/include/linux/regulator/consumer.h
+++ b/include/linux/regulator/consumer.h
@@ -171,10 +171,13 @@ struct regulator;
 /**
  * struct regulator_bulk_data - Data used for bulk regulator operations.
  *
- * @supply:   The name of the supply.  Initialised by the user before
- *            using the bulk regulator APIs.
- * @consumer: The regulator consumer for the supply.  This will be managed
- *            by the bulk API.
+ * @supply:       The name of the supply.  Initialised by the user before
+ *                using the bulk regulator APIs.
+ * @init_load_uA: After getting the regulator, regulator_set_load() will be
+ *                called with this load.  Initialised by the user before
+ *                using the bulk regulator APIs.
+ * @consumer:     The regulator consumer for the supply.  This will be managed
+ *                by the bulk API.
  *
  * The regulator APIs provide a series of regulator_bulk_() API calls as
  * a convenience to consumers which require multiple supplies.  This
@@ -182,6 +185,7 @@ struct regulator;
  */
 struct regulator_bulk_data {
 	const char *supply;
+	int init_load_uA;
 	struct regulator *consumer;
 
 	/* private: Internal use */
-- 
cgit 


From 1de452a0edda26f1483d1d934f692eab13ba669a Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Tue, 26 Jul 2022 10:38:23 -0700
Subject: regulator: core: Allow drivers to define their init data as const

Drivers tend to want to define the names of their regulators somewhere
in their source file as "static const". This means, inevitable, that
every driver out there open codes something like this:

static const char * const supply_names[] = {
 "vcc", "vccl",
};

static int get_regulators(struct my_data *data)
{
  int i;

  data->supplies = devm_kzalloc(...)
  if (!data->supplies)
    return -ENOMEM;

  for (i = 0; i < ARRAY_SIZE(supply_names); i++)
    data->supplies[i].supply = supply_names[i];

  return devm_regulator_bulk_get(data->dev,
                                 ARRAY_SIZE(supply_names),
				 data->supplies);
}

Let's make this more convenient by doing providing a helper that does
the copy.

I have chosen to have the "const" input structure here be the exact
same structure as the normal one passed to
devm_regulator_bulk_get(). This is slightly inefficent since the input
data can't possibly have anything useful for "ret" or consumer and
thus we waste 8 bytes per structure. This seems an OK tradeoff for not
introducing an extra structure.

Signed-off-by: Douglas Anderson <dianders@chromium.org>
Link: https://lore.kernel.org/r/20220726103631.v2.6.I38fc508a73135a5c1b873851f3553ff2a3a625f5@changeid
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/devres.c         | 28 ++++++++++++++++++++++++++++
 include/linux/regulator/consumer.h |  4 ++++
 2 files changed, 32 insertions(+)

(limited to 'include')

diff --git a/drivers/regulator/devres.c b/drivers/regulator/devres.c
index 9113233f41cd..32823a87fd40 100644
--- a/drivers/regulator/devres.c
+++ b/drivers/regulator/devres.c
@@ -166,6 +166,34 @@ int devm_regulator_bulk_get(struct device *dev, int num_consumers,
 }
 EXPORT_SYMBOL_GPL(devm_regulator_bulk_get);
 
+/**
+ * devm_regulator_bulk_get_const - devm_regulator_bulk_get() w/ const data
+ *
+ * @dev:           device to supply
+ * @num_consumers: number of consumers to register
+ * @in_consumers:  const configuration of consumers
+ * @out_consumers: in_consumers is copied here and this is passed to
+ *		   devm_regulator_bulk_get().
+ *
+ * This is a convenience function to allow bulk regulator configuration
+ * to be stored "static const" in files.
+ *
+ * Return: 0 on success, an errno on failure.
+ */
+int devm_regulator_bulk_get_const(struct device *dev, int num_consumers,
+				  const struct regulator_bulk_data *in_consumers,
+				  struct regulator_bulk_data **out_consumers)
+{
+	*out_consumers = devm_kmemdup(dev, in_consumers,
+				      num_consumers * sizeof(*in_consumers),
+				      GFP_KERNEL);
+	if (*out_consumers == NULL)
+		return -ENOMEM;
+
+	return devm_regulator_bulk_get(dev, num_consumers, *out_consumers);
+}
+EXPORT_SYMBOL_GPL(devm_regulator_bulk_get_const);
+
 static void devm_rdev_release(struct device *dev, void *res)
 {
 	regulator_unregister(*(struct regulator_dev **)res);
diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h
index 5779f4466e62..bc6cda706d1f 100644
--- a/include/linux/regulator/consumer.h
+++ b/include/linux/regulator/consumer.h
@@ -244,6 +244,10 @@ int __must_check regulator_bulk_get(struct device *dev, int num_consumers,
 				    struct regulator_bulk_data *consumers);
 int __must_check devm_regulator_bulk_get(struct device *dev, int num_consumers,
 					 struct regulator_bulk_data *consumers);
+int __must_check devm_regulator_bulk_get_const(
+	struct device *dev, int num_consumers,
+	const struct regulator_bulk_data *in_consumers,
+	struct regulator_bulk_data **out_consumers);
 int __must_check regulator_bulk_enable(int num_consumers,
 				       struct regulator_bulk_data *consumers);
 int regulator_bulk_disable(int num_consumers,
-- 
cgit 


From 14b146b688ad9593f5eee93d51a34d09a47e50b5 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 27 Jul 2022 10:30:41 +0100
Subject: io_uring: notification completion optimisation

We want to use all optimisations that we have for io_uring requests like
completion batching, memory caching and more but for zc notifications.
Fortunately, notification perfectly fit the request model so we can
overlay them onto struct io_kiocb and use all the infratructure.

Most of the fields of struct io_notif natively fits into io_kiocb, so we
replace struct io_notif with struct io_kiocb carrying struct
io_notif_data in the cmd cache line. Then we adapt io_alloc_notif() to
use io_alloc_req()/io_alloc_req_refill(), and kill leftovers of hand
coded caching. __io_notif_complete_tw() is converted to use io_uring's
tw infra.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/9e010125175e80baf51f0ca63bdc7cc6a4a9fa56.1658913593.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |   7 --
 io_uring/io_uring.c            |   3 -
 io_uring/net.c                 |   4 +-
 io_uring/notif.c               | 159 ++++++++++++++---------------------------
 io_uring/notif.h               |  42 ++++-------
 5 files changed, 67 insertions(+), 148 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 144493cbadb5..f7fab3758cb9 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -249,9 +249,6 @@ struct io_ring_ctx {
 		struct xarray		io_bl_xa;
 		struct list_head	io_buffers_cache;
 
-		/* struct io_notif cache, protected by uring_lock */
-		struct list_head	notif_list;
-
 		struct io_hash_table	cancel_table_locked;
 		struct list_head	cq_overflow_list;
 		struct io_alloc_cache	apoll_cache;
@@ -262,10 +259,6 @@ struct io_ring_ctx {
 	struct io_wq_work_list	locked_free_list;
 	unsigned int		locked_free_nr;
 
-	/* struct io_notif cache protected by completion_lock */
-	struct list_head	notif_list_locked;
-	unsigned int		notif_locked_nr;
-
 	const struct cred	*sq_creds;	/* cred used for __io_sq_thread() */
 	struct io_sq_data	*sq_data;	/* if using sq thread polling */
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 822819d0f607..b54218da075c 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -321,8 +321,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_WQ_LIST(&ctx->locked_free_list);
 	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
 	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
-	INIT_LIST_HEAD(&ctx->notif_list);
-	INIT_LIST_HEAD(&ctx->notif_list_locked);
 	return ctx;
 err:
 	kfree(ctx->dummy_ubuf);
@@ -2466,7 +2464,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
 	WARN_ON_ONCE(ctx->notif_slots || ctx->nr_notif_slots);
 
-	io_notif_cache_purge(ctx);
 	io_mem_free(ctx->rings);
 	io_mem_free(ctx->sq_sqes);
 
diff --git a/io_uring/net.c b/io_uring/net.c
index 8276b9537194..32fc3da04e41 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -946,7 +946,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_sendzc *zc = io_kiocb_to_cmd(req);
 	struct io_notif_slot *notif_slot;
-	struct io_notif *notif;
+	struct io_kiocb *notif;
 	struct msghdr msg;
 	struct iovec iov;
 	struct socket *sock;
@@ -1005,7 +1005,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
 		min_ret = iov_iter_count(&msg.msg_iter);
 
 	msg.msg_flags = msg_flags;
-	msg.msg_ubuf = &notif->uarg;
+	msg.msg_ubuf = &io_notif_to_data(notif)->uarg;
 	msg.sg_from_iter = io_sg_from_iter;
 	ret = sock_sendmsg(sock, &msg);
 
diff --git a/io_uring/notif.c b/io_uring/notif.c
index e986a0ed958c..b5f989dff9de 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -9,140 +9,79 @@
 #include "notif.h"
 #include "rsrc.h"
 
-static void __io_notif_complete_tw(struct callback_head *cb)
+static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked)
 {
-	struct io_notif *notif = container_of(cb, struct io_notif, task_work);
-	struct io_rsrc_node *rsrc_node = notif->rsrc_node;
+	struct io_notif_data *nd = io_notif_to_data(notif);
 	struct io_ring_ctx *ctx = notif->ctx;
 
-	if (notif->account_pages && ctx->user) {
-		__io_unaccount_mem(ctx->user, notif->account_pages);
-		notif->account_pages = 0;
+	if (nd->account_pages && ctx->user) {
+		__io_unaccount_mem(ctx->user, nd->account_pages);
+		nd->account_pages = 0;
 	}
-	if (likely(notif->task)) {
-		io_put_task(notif->task, 1);
-		notif->task = NULL;
-	}
-
-	io_cq_lock(ctx);
-	io_fill_cqe_aux(ctx, notif->tag, 0, notif->seq, true);
-
-	list_add(&notif->cache_node, &ctx->notif_list_locked);
-	ctx->notif_locked_nr++;
-	io_cq_unlock_post(ctx);
-
-	io_rsrc_put_node(rsrc_node, 1);
-	percpu_ref_put(&ctx->refs);
+	io_req_task_complete(notif, locked);
 }
 
-static inline void io_notif_complete(struct io_notif *notif)
+static inline void io_notif_complete(struct io_kiocb *notif)
+	__must_hold(&notif->ctx->uring_lock)
 {
-	__io_notif_complete_tw(&notif->task_work);
-}
-
-static void io_notif_complete_wq(struct work_struct *work)
-{
-	struct io_notif *notif = container_of(work, struct io_notif, commit_work);
+	bool locked = true;
 
-	io_notif_complete(notif);
+	__io_notif_complete_tw(notif, &locked);
 }
 
 static void io_uring_tx_zerocopy_callback(struct sk_buff *skb,
 					  struct ubuf_info *uarg,
 					  bool success)
 {
-	struct io_notif *notif = container_of(uarg, struct io_notif, uarg);
-
-	if (!refcount_dec_and_test(&uarg->refcnt))
-		return;
-
-	if (likely(notif->task)) {
-		init_task_work(&notif->task_work, __io_notif_complete_tw);
-		if (likely(!task_work_add(notif->task, &notif->task_work,
-					  TWA_SIGNAL)))
-			return;
-	}
-
-	INIT_WORK(&notif->commit_work, io_notif_complete_wq);
-	queue_work(system_unbound_wq, &notif->commit_work);
-}
-
-static void io_notif_splice_cached(struct io_ring_ctx *ctx)
-	__must_hold(&ctx->uring_lock)
-{
-	spin_lock(&ctx->completion_lock);
-	list_splice_init(&ctx->notif_list_locked, &ctx->notif_list);
-	ctx->notif_locked_nr = 0;
-	spin_unlock(&ctx->completion_lock);
-}
-
-void io_notif_cache_purge(struct io_ring_ctx *ctx)
-	__must_hold(&ctx->uring_lock)
-{
-	io_notif_splice_cached(ctx);
+	struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg);
+	struct io_kiocb *notif = cmd_to_io_kiocb(nd);
 
-	while (!list_empty(&ctx->notif_list)) {
-		struct io_notif *notif = list_first_entry(&ctx->notif_list,
-						struct io_notif, cache_node);
-
-		list_del(&notif->cache_node);
-		kfree(notif);
+	if (refcount_dec_and_test(&uarg->refcnt)) {
+		notif->io_task_work.func = __io_notif_complete_tw;
+		io_req_task_work_add(notif);
 	}
 }
 
-static inline bool io_notif_has_cached(struct io_ring_ctx *ctx)
-	__must_hold(&ctx->uring_lock)
-{
-	if (likely(!list_empty(&ctx->notif_list)))
-		return true;
-	if (data_race(READ_ONCE(ctx->notif_locked_nr) <= IO_NOTIF_SPLICE_BATCH))
-		return false;
-	io_notif_splice_cached(ctx);
-	return !list_empty(&ctx->notif_list);
-}
-
-struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx,
+struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx,
 				struct io_notif_slot *slot)
 	__must_hold(&ctx->uring_lock)
 {
-	struct io_notif *notif;
-
-	if (likely(io_notif_has_cached(ctx))) {
-		notif = list_first_entry(&ctx->notif_list,
-					 struct io_notif, cache_node);
-		list_del(&notif->cache_node);
-	} else {
-		notif = kzalloc(sizeof(*notif), GFP_ATOMIC | __GFP_ACCOUNT);
-		if (!notif)
-			return NULL;
-		/* pre-initialise some fields */
-		notif->ctx = ctx;
-		notif->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
-		notif->uarg.callback = io_uring_tx_zerocopy_callback;
-		notif->account_pages = 0;
-	}
-
-	notif->seq = slot->seq++;
-	notif->tag = slot->tag;
+	struct io_kiocb *notif;
+	struct io_notif_data *nd;
+
+	if (unlikely(!io_alloc_req_refill(ctx)))
+		return NULL;
+	notif = io_alloc_req(ctx);
+	notif->opcode = IORING_OP_NOP;
+	notif->flags = 0;
+	notif->file = NULL;
+	notif->task = current;
+	io_get_task_refs(1);
+	notif->rsrc_node = NULL;
+	io_req_set_rsrc_node(notif, ctx, 0);
+	notif->cqe.user_data = slot->tag;
+	notif->cqe.flags = slot->seq++;
+	notif->cqe.res = 0;
+
+	nd = io_notif_to_data(notif);
+	nd->account_pages = 0;
+	nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
+	nd->uarg.callback = io_uring_tx_zerocopy_callback;
 	/* master ref owned by io_notif_slot, will be dropped on flush */
-	refcount_set(&notif->uarg.refcnt, 1);
-	percpu_ref_get(&ctx->refs);
-	notif->rsrc_node = ctx->rsrc_node;
-	io_charge_rsrc_node(ctx);
+	refcount_set(&nd->uarg.refcnt, 1);
 	return notif;
 }
 
 void io_notif_slot_flush(struct io_notif_slot *slot)
 	__must_hold(&ctx->uring_lock)
 {
-	struct io_notif *notif = slot->notif;
+	struct io_kiocb *notif = slot->notif;
+	struct io_notif_data *nd = io_notif_to_data(notif);
 
 	slot->notif = NULL;
 
-	if (WARN_ON_ONCE(in_interrupt()))
-		return;
 	/* drop slot's master ref */
-	if (refcount_dec_and_test(&notif->uarg.refcnt))
+	if (refcount_dec_and_test(&nd->uarg.refcnt))
 		io_notif_complete(notif);
 }
 
@@ -156,18 +95,22 @@ __cold int io_notif_unregister(struct io_ring_ctx *ctx)
 
 	for (i = 0; i < ctx->nr_notif_slots; i++) {
 		struct io_notif_slot *slot = &ctx->notif_slots[i];
+		struct io_kiocb *notif = slot->notif;
+		struct io_notif_data *nd;
 
-		if (!slot->notif)
+		if (!notif)
+			continue;
+		nd = io_kiocb_to_cmd(notif);
+		slot->notif = NULL;
+		if (!refcount_dec_and_test(&nd->uarg.refcnt))
 			continue;
-		if (WARN_ON_ONCE(slot->notif->task))
-			slot->notif->task = NULL;
-		io_notif_slot_flush(slot);
+		notif->io_task_work.func = __io_notif_complete_tw;
+		io_req_task_work_add(notif);
 	}
 
 	kvfree(ctx->notif_slots);
 	ctx->notif_slots = NULL;
 	ctx->nr_notif_slots = 0;
-	io_notif_cache_purge(ctx);
 	return 0;
 }
 
@@ -180,6 +123,8 @@ __cold int io_notif_register(struct io_ring_ctx *ctx,
 	struct io_uring_notification_register reg;
 	unsigned i;
 
+	BUILD_BUG_ON(sizeof(struct io_notif_data) > 64);
+
 	if (ctx->nr_notif_slots)
 		return -EBUSY;
 	if (size != sizeof(reg))
diff --git a/io_uring/notif.h b/io_uring/notif.h
index d6f366b1518b..0819304d7e00 100644
--- a/io_uring/notif.h
+++ b/io_uring/notif.h
@@ -10,27 +10,10 @@
 #define IO_NOTIF_SPLICE_BATCH	32
 #define IORING_MAX_NOTIF_SLOTS (1U << 10)
 
-struct io_notif {
+struct io_notif_data {
+	struct file		*file;
 	struct ubuf_info	uarg;
-	struct io_ring_ctx	*ctx;
-	struct io_rsrc_node	*rsrc_node;
-
-	/* complete via tw if ->task is non-NULL, fallback to wq otherwise */
-	struct task_struct	*task;
-
-	/* cqe->user_data, io_notif_slot::tag if not overridden */
-	u64			tag;
-	/* see struct io_notif_slot::seq */
-	u32			seq;
-	/* hook into ctx->notif_list and ctx->notif_list_locked */
-	struct list_head	cache_node;
-
 	unsigned long		account_pages;
-
-	union {
-		struct callback_head	task_work;
-		struct work_struct	commit_work;
-	};
 };
 
 struct io_notif_slot {
@@ -39,7 +22,7 @@ struct io_notif_slot {
 	 * time and keeps one reference to it. Flush releases the reference and
 	 * lazily replaces it with a new notifier.
 	 */
-	struct io_notif		*notif;
+	struct io_kiocb		*notif;
 
 	/*
 	 * Default ->user_data for this slot notifiers CQEs
@@ -56,13 +39,17 @@ struct io_notif_slot {
 int io_notif_register(struct io_ring_ctx *ctx,
 		      void __user *arg, unsigned int size);
 int io_notif_unregister(struct io_ring_ctx *ctx);
-void io_notif_cache_purge(struct io_ring_ctx *ctx);
 
 void io_notif_slot_flush(struct io_notif_slot *slot);
-struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx,
+struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx,
 				struct io_notif_slot *slot);
 
-static inline struct io_notif *io_get_notif(struct io_ring_ctx *ctx,
+static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif)
+{
+	return io_kiocb_to_cmd(notif);
+}
+
+static inline struct io_kiocb *io_get_notif(struct io_ring_ctx *ctx,
 					    struct io_notif_slot *slot)
 {
 	if (!slot->notif)
@@ -83,16 +70,13 @@ static inline struct io_notif_slot *io_get_notif_slot(struct io_ring_ctx *ctx,
 static inline void io_notif_slot_flush_submit(struct io_notif_slot *slot,
 					      unsigned int issue_flags)
 {
-	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
-		slot->notif->task = current;
-		io_get_task_refs(1);
-	}
 	io_notif_slot_flush(slot);
 }
 
-static inline int io_notif_account_mem(struct io_notif *notif, unsigned len)
+static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len)
 {
 	struct io_ring_ctx *ctx = notif->ctx;
+	struct io_notif_data *nd = io_notif_to_data(notif);
 	unsigned nr_pages = (len >> PAGE_SHIFT) + 2;
 	int ret;
 
@@ -100,7 +84,7 @@ static inline int io_notif_account_mem(struct io_notif *notif, unsigned len)
 		ret = __io_account_mem(ctx->user, nr_pages);
 		if (ret)
 			return ret;
-		notif->account_pages += nr_pages;
+		nd->account_pages += nr_pages;
 	}
 	return 0;
 }
-- 
cgit 


From 41e79b1d458477212d0a880c87622bcaa69ab3ea Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Fri, 8 Apr 2022 13:17:13 +0300
Subject: clocksource/drivers/timer-ti-dm: Move inline functions to driver for
 am6

The __omap_dm_timer_* inline functions in the header are no longer needed
outside the driver, and the header ifdefs prevent the driver working for
ARCH_K3.

Let's move the inline functions to the driver and drop the ifdefs and
drop the unused functions __omap_dm_timer_override_errata() and
__omap_dm_timer_load_start().

Cc: Keerthy <j-keerthy@ti.com>
Cc: Nishanth Menon <nm@ti.com>
Cc: Vignesh Raghavendra <vigneshr@ti.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
Link: https://lore.kernel.org/r/20220408101715.43697-2-tony@atomide.com
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/clocksource/timer-ti-dm.c | 115 ++++++++++++++++++++++++++++++
 include/clocksource/timer-ti-dm.h | 144 --------------------------------------
 2 files changed, 115 insertions(+), 144 deletions(-)

(limited to 'include')

diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c
index 33609be0b304..530b71f3ab37 100644
--- a/drivers/clocksource/timer-ti-dm.c
+++ b/drivers/clocksource/timer-ti-dm.c
@@ -44,6 +44,121 @@ enum {
 	REQUEST_BY_NODE,
 };
 
+static inline u32 __omap_dm_timer_read(struct omap_dm_timer *timer, u32 reg,
+						int posted)
+{
+	if (posted)
+		while (readl_relaxed(timer->pend) & (reg >> WPSHIFT))
+			cpu_relax();
+
+	return readl_relaxed(timer->func_base + (reg & 0xff));
+}
+
+static inline void __omap_dm_timer_write(struct omap_dm_timer *timer,
+					u32 reg, u32 val, int posted)
+{
+	if (posted)
+		while (readl_relaxed(timer->pend) & (reg >> WPSHIFT))
+			cpu_relax();
+
+	writel_relaxed(val, timer->func_base + (reg & 0xff));
+}
+
+static inline void __omap_dm_timer_init_regs(struct omap_dm_timer *timer)
+{
+	u32 tidr;
+
+	/* Assume v1 ip if bits [31:16] are zero */
+	tidr = readl_relaxed(timer->io_base);
+	if (!(tidr >> 16)) {
+		timer->revision = 1;
+		timer->irq_stat = timer->io_base + OMAP_TIMER_V1_STAT_OFFSET;
+		timer->irq_ena = timer->io_base + OMAP_TIMER_V1_INT_EN_OFFSET;
+		timer->irq_dis = timer->io_base + OMAP_TIMER_V1_INT_EN_OFFSET;
+		timer->pend = timer->io_base + _OMAP_TIMER_WRITE_PEND_OFFSET;
+		timer->func_base = timer->io_base;
+	} else {
+		timer->revision = 2;
+		timer->irq_stat = timer->io_base + OMAP_TIMER_V2_IRQSTATUS;
+		timer->irq_ena = timer->io_base + OMAP_TIMER_V2_IRQENABLE_SET;
+		timer->irq_dis = timer->io_base + OMAP_TIMER_V2_IRQENABLE_CLR;
+		timer->pend = timer->io_base +
+			_OMAP_TIMER_WRITE_PEND_OFFSET +
+				OMAP_TIMER_V2_FUNC_OFFSET;
+		timer->func_base = timer->io_base + OMAP_TIMER_V2_FUNC_OFFSET;
+	}
+}
+
+/*
+ * __omap_dm_timer_enable_posted - enables write posted mode
+ * @timer:      pointer to timer instance handle
+ *
+ * Enables the write posted mode for the timer. When posted mode is enabled
+ * writes to certain timer registers are immediately acknowledged by the
+ * internal bus and hence prevents stalling the CPU waiting for the write to
+ * complete. Enabling this feature can improve performance for writing to the
+ * timer registers.
+ */
+static inline void __omap_dm_timer_enable_posted(struct omap_dm_timer *timer)
+{
+	if (timer->posted)
+		return;
+
+	if (timer->errata & OMAP_TIMER_ERRATA_I103_I767) {
+		timer->posted = OMAP_TIMER_NONPOSTED;
+		__omap_dm_timer_write(timer, OMAP_TIMER_IF_CTRL_REG, 0, 0);
+		return;
+	}
+
+	__omap_dm_timer_write(timer, OMAP_TIMER_IF_CTRL_REG,
+			      OMAP_TIMER_CTRL_POSTED, 0);
+	timer->context.tsicr = OMAP_TIMER_CTRL_POSTED;
+	timer->posted = OMAP_TIMER_POSTED;
+}
+
+static inline void __omap_dm_timer_stop(struct omap_dm_timer *timer,
+					int posted, unsigned long rate)
+{
+	u32 l;
+
+	l = __omap_dm_timer_read(timer, OMAP_TIMER_CTRL_REG, posted);
+	if (l & OMAP_TIMER_CTRL_ST) {
+		l &= ~0x1;
+		__omap_dm_timer_write(timer, OMAP_TIMER_CTRL_REG, l, posted);
+#ifdef CONFIG_ARCH_OMAP2PLUS
+		/* Readback to make sure write has completed */
+		__omap_dm_timer_read(timer, OMAP_TIMER_CTRL_REG, posted);
+		/*
+		 * Wait for functional clock period x 3.5 to make sure that
+		 * timer is stopped
+		 */
+		udelay(3500000 / rate + 1);
+#endif
+	}
+
+	/* Ack possibly pending interrupt */
+	writel_relaxed(OMAP_TIMER_INT_OVERFLOW, timer->irq_stat);
+}
+
+static inline void __omap_dm_timer_int_enable(struct omap_dm_timer *timer,
+						unsigned int value)
+{
+	writel_relaxed(value, timer->irq_ena);
+	__omap_dm_timer_write(timer, OMAP_TIMER_WAKEUP_EN_REG, value, 0);
+}
+
+static inline unsigned int
+__omap_dm_timer_read_counter(struct omap_dm_timer *timer, int posted)
+{
+	return __omap_dm_timer_read(timer, OMAP_TIMER_COUNTER_REG, posted);
+}
+
+static inline void __omap_dm_timer_write_status(struct omap_dm_timer *timer,
+						unsigned int value)
+{
+	writel_relaxed(value, timer->irq_stat);
+}
+
 /**
  * omap_dm_timer_read_reg - read timer registers in posted and non-posted mode
  * @timer:      timer pointer over which read operation to perform
diff --git a/include/clocksource/timer-ti-dm.h b/include/clocksource/timer-ti-dm.h
index f6da8a132639..b0f80cfd2a26 100644
--- a/include/clocksource/timer-ti-dm.h
+++ b/include/clocksource/timer-ti-dm.h
@@ -247,148 +247,4 @@ int omap_dm_timers_active(void);
 #define OMAP_TIMER_TICK_INT_MASK_COUNT_REG				\
 		(_OMAP_TIMER_TICK_INT_MASK_COUNT_OFFSET | (WP_TOWR << WPSHIFT))
 
-/*
- * The below are inlined to optimize code size for system timers. Other code
- * should not need these at all.
- */
-#if defined(CONFIG_ARCH_OMAP1) || defined(CONFIG_ARCH_OMAP2PLUS)
-static inline u32 __omap_dm_timer_read(struct omap_dm_timer *timer, u32 reg,
-						int posted)
-{
-	if (posted)
-		while (readl_relaxed(timer->pend) & (reg >> WPSHIFT))
-			cpu_relax();
-
-	return readl_relaxed(timer->func_base + (reg & 0xff));
-}
-
-static inline void __omap_dm_timer_write(struct omap_dm_timer *timer,
-					u32 reg, u32 val, int posted)
-{
-	if (posted)
-		while (readl_relaxed(timer->pend) & (reg >> WPSHIFT))
-			cpu_relax();
-
-	writel_relaxed(val, timer->func_base + (reg & 0xff));
-}
-
-static inline void __omap_dm_timer_init_regs(struct omap_dm_timer *timer)
-{
-	u32 tidr;
-
-	/* Assume v1 ip if bits [31:16] are zero */
-	tidr = readl_relaxed(timer->io_base);
-	if (!(tidr >> 16)) {
-		timer->revision = 1;
-		timer->irq_stat = timer->io_base + OMAP_TIMER_V1_STAT_OFFSET;
-		timer->irq_ena = timer->io_base + OMAP_TIMER_V1_INT_EN_OFFSET;
-		timer->irq_dis = timer->io_base + OMAP_TIMER_V1_INT_EN_OFFSET;
-		timer->pend = timer->io_base + _OMAP_TIMER_WRITE_PEND_OFFSET;
-		timer->func_base = timer->io_base;
-	} else {
-		timer->revision = 2;
-		timer->irq_stat = timer->io_base + OMAP_TIMER_V2_IRQSTATUS;
-		timer->irq_ena = timer->io_base + OMAP_TIMER_V2_IRQENABLE_SET;
-		timer->irq_dis = timer->io_base + OMAP_TIMER_V2_IRQENABLE_CLR;
-		timer->pend = timer->io_base +
-			_OMAP_TIMER_WRITE_PEND_OFFSET +
-				OMAP_TIMER_V2_FUNC_OFFSET;
-		timer->func_base = timer->io_base + OMAP_TIMER_V2_FUNC_OFFSET;
-	}
-}
-
-/*
- * __omap_dm_timer_enable_posted - enables write posted mode
- * @timer:      pointer to timer instance handle
- *
- * Enables the write posted mode for the timer. When posted mode is enabled
- * writes to certain timer registers are immediately acknowledged by the
- * internal bus and hence prevents stalling the CPU waiting for the write to
- * complete. Enabling this feature can improve performance for writing to the
- * timer registers.
- */
-static inline void __omap_dm_timer_enable_posted(struct omap_dm_timer *timer)
-{
-	if (timer->posted)
-		return;
-
-	if (timer->errata & OMAP_TIMER_ERRATA_I103_I767) {
-		timer->posted = OMAP_TIMER_NONPOSTED;
-		__omap_dm_timer_write(timer, OMAP_TIMER_IF_CTRL_REG, 0, 0);
-		return;
-	}
-
-	__omap_dm_timer_write(timer, OMAP_TIMER_IF_CTRL_REG,
-			      OMAP_TIMER_CTRL_POSTED, 0);
-	timer->context.tsicr = OMAP_TIMER_CTRL_POSTED;
-	timer->posted = OMAP_TIMER_POSTED;
-}
-
-/**
- * __omap_dm_timer_override_errata - override errata flags for a timer
- * @timer:      pointer to timer handle
- * @errata:	errata flags to be ignored
- *
- * For a given timer, override a timer errata by clearing the flags
- * specified by the errata argument. A specific erratum should only be
- * overridden for a timer if the timer is used in such a way the erratum
- * has no impact.
- */
-static inline void __omap_dm_timer_override_errata(struct omap_dm_timer *timer,
-						   u32 errata)
-{
-	timer->errata &= ~errata;
-}
-
-static inline void __omap_dm_timer_stop(struct omap_dm_timer *timer,
-					int posted, unsigned long rate)
-{
-	u32 l;
-
-	l = __omap_dm_timer_read(timer, OMAP_TIMER_CTRL_REG, posted);
-	if (l & OMAP_TIMER_CTRL_ST) {
-		l &= ~0x1;
-		__omap_dm_timer_write(timer, OMAP_TIMER_CTRL_REG, l, posted);
-#ifdef CONFIG_ARCH_OMAP2PLUS
-		/* Readback to make sure write has completed */
-		__omap_dm_timer_read(timer, OMAP_TIMER_CTRL_REG, posted);
-		/*
-		 * Wait for functional clock period x 3.5 to make sure that
-		 * timer is stopped
-		 */
-		udelay(3500000 / rate + 1);
-#endif
-	}
-
-	/* Ack possibly pending interrupt */
-	writel_relaxed(OMAP_TIMER_INT_OVERFLOW, timer->irq_stat);
-}
-
-static inline void __omap_dm_timer_load_start(struct omap_dm_timer *timer,
-						u32 ctrl, unsigned int load,
-						int posted)
-{
-	__omap_dm_timer_write(timer, OMAP_TIMER_COUNTER_REG, load, posted);
-	__omap_dm_timer_write(timer, OMAP_TIMER_CTRL_REG, ctrl, posted);
-}
-
-static inline void __omap_dm_timer_int_enable(struct omap_dm_timer *timer,
-						unsigned int value)
-{
-	writel_relaxed(value, timer->irq_ena);
-	__omap_dm_timer_write(timer, OMAP_TIMER_WAKEUP_EN_REG, value, 0);
-}
-
-static inline unsigned int
-__omap_dm_timer_read_counter(struct omap_dm_timer *timer, int posted)
-{
-	return __omap_dm_timer_read(timer, OMAP_TIMER_COUNTER_REG, posted);
-}
-
-static inline void __omap_dm_timer_write_status(struct omap_dm_timer *timer,
-						unsigned int value)
-{
-	writel_relaxed(value, timer->irq_stat);
-}
-#endif /* CONFIG_ARCH_OMAP1 || CONFIG_ARCH_OMAP2PLUS */
 #endif /* __CLOCKSOURCE_DMTIMER_H */
-- 
cgit 


From 0113780870b1597ae49f30abfa4957c239f913d3 Mon Sep 17 00:00:00 2001
From: Aharon Landau <aharonl@nvidia.com>
Date: Tue, 26 Jul 2022 10:19:11 +0300
Subject: RDMA/mlx5: Rename the mkey cache variables and functions

After replacing the MR cache with an Mkey cache, rename the variables and
functions to fit the new meaning.

Link: https://lore.kernel.org/r/20220726071911.122765-6-michaelgur@nvidia.com
Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/infiniband/hw/mlx5/main.c    |  4 +--
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 14 +++++-----
 drivers/infiniband/hw/mlx5/mr.c      | 54 ++++++++++++++++++------------------
 drivers/infiniband/hw/mlx5/odp.c     |  2 +-
 include/linux/mlx5/driver.h          |  6 ++--
 5 files changed, 40 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index b68fddeac0f1..a174a0eee8dc 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -4002,7 +4002,7 @@ static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
 {
 	int err;
 
-	err = mlx5_mr_cache_cleanup(dev);
+	err = mlx5_mkey_cache_cleanup(dev);
 	if (err)
 		mlx5_ib_warn(dev, "mr cache cleanup failed\n");
 
@@ -4022,7 +4022,7 @@ static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev)
 	if (ret)
 		return ret;
 
-	ret = mlx5_mr_cache_init(dev);
+	ret = mlx5_mkey_cache_init(dev);
 	if (ret) {
 		mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
 		mlx5r_umr_resource_cleanup(dev);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 91f985cd7d90..2e2ad3918385 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -764,9 +764,9 @@ struct mlx5r_async_create_mkey {
 	u32 mkey;
 };
 
-struct mlx5_mr_cache {
+struct mlx5_mkey_cache {
 	struct workqueue_struct *wq;
-	struct mlx5_cache_ent	ent[MAX_MR_CACHE_ENTRIES];
+	struct mlx5_cache_ent	ent[MAX_MKEY_CACHE_ENTRIES];
 	struct dentry		*root;
 	unsigned long		last_add;
 };
@@ -1065,7 +1065,7 @@ struct mlx5_ib_dev {
 	struct mlx5_ib_resources	devr;
 
 	atomic_t			mkey_var;
-	struct mlx5_mr_cache		cache;
+	struct mlx5_mkey_cache		cache;
 	struct timer_list		delay_timer;
 	/* Prevents soft lock on massive reg MRs */
 	struct mutex			slow_path_mutex;
@@ -1310,8 +1310,8 @@ void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas,
 			  u64 access_flags);
 void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
 int mlx5_ib_get_cqe_size(struct ib_cq *ibcq);
-int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);
-int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);
+int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev);
+int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev);
 
 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
 				       struct mlx5_cache_ent *ent,
@@ -1339,7 +1339,7 @@ int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq);
 void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
 int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
-void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
+void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent);
 void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
 			   struct mlx5_ib_mr *mr, int flags);
 
@@ -1358,7 +1358,7 @@ static inline int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev,
 static inline void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev) {}
 static inline int mlx5_ib_odp_init(void) { return 0; }
 static inline void mlx5_ib_odp_cleanup(void)				    {}
-static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {}
+static inline void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent) {}
 static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
 					 struct mlx5_ib_mr *mr, int flags) {}
 
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index edbc2990d151..129d531bd01b 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -119,7 +119,7 @@ static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create)
 				&async_create->cb_work);
 }
 
-static int mr_cache_max_order(struct mlx5_ib_dev *dev);
+static int mkey_cache_max_order(struct mlx5_ib_dev *dev);
 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
 
 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
@@ -515,11 +515,11 @@ static const struct file_operations limit_fops = {
 	.read	= limit_read,
 };
 
-static bool someone_adding(struct mlx5_mr_cache *cache)
+static bool someone_adding(struct mlx5_mkey_cache *cache)
 {
 	unsigned int i;
 
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
 		struct mlx5_cache_ent *ent = &cache->ent[i];
 		bool ret;
 
@@ -569,7 +569,7 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
 static void __cache_work_func(struct mlx5_cache_ent *ent)
 {
 	struct mlx5_ib_dev *dev = ent->dev;
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 	int err;
 
 	xa_lock_irq(&ent->mkeys);
@@ -681,7 +681,7 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
 
 static void clean_keys(struct mlx5_ib_dev *dev, int c)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent = &cache->ent[c];
 	u32 mkey;
 
@@ -696,7 +696,7 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c)
 	xa_unlock_irq(&ent->mkeys);
 }
 
-static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
 {
 	if (!mlx5_debugfs_root || dev->is_rep)
 		return;
@@ -705,9 +705,9 @@ static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
 	dev->cache.root = NULL;
 }
 
-static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
+static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent;
 	struct dentry *dir;
 	int i;
@@ -717,7 +717,7 @@ static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
 
 	cache->root = debugfs_create_dir("mr_cache", mlx5_debugfs_get_dev_root(dev->mdev));
 
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
 		ent = &cache->ent[i];
 		sprintf(ent->name, "%d", ent->order);
 		dir = debugfs_create_dir(ent->name, cache->root);
@@ -735,9 +735,9 @@ static void delay_time_func(struct timer_list *t)
 	WRITE_ONCE(dev->fill_delay, 0);
 }
 
-int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
+int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent;
 	int i;
 
@@ -750,7 +750,7 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 
 	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
 	timer_setup(&dev->delay_timer, delay_time_func, 0);
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
 		ent = &cache->ent[i];
 		xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
 		ent->order = i + 2;
@@ -759,12 +759,12 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 
 		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
 
-		if (i > MR_CACHE_LAST_STD_ENTRY) {
-			mlx5_odp_init_mr_cache_entry(ent);
+		if (i > MKEY_CACHE_LAST_STD_ENTRY) {
+			mlx5_odp_init_mkey_cache_entry(ent);
 			continue;
 		}
 
-		if (ent->order > mr_cache_max_order(dev))
+		if (ent->order > mkey_cache_max_order(dev))
 			continue;
 
 		ent->page = PAGE_SHIFT;
@@ -781,19 +781,19 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 		xa_unlock_irq(&ent->mkeys);
 	}
 
-	mlx5_mr_cache_debugfs_init(dev);
+	mlx5_mkey_cache_debugfs_init(dev);
 
 	return 0;
 }
 
-int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
+int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
 {
 	unsigned int i;
 
 	if (!dev->cache.wq)
 		return 0;
 
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
 		struct mlx5_cache_ent *ent = &dev->cache.ent[i];
 
 		xa_lock_irq(&ent->mkeys);
@@ -802,10 +802,10 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
 		cancel_delayed_work_sync(&ent->dwork);
 	}
 
-	mlx5_mr_cache_debugfs_cleanup(dev);
+	mlx5_mkey_cache_debugfs_cleanup(dev);
 	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
 
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
+	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++)
 		clean_keys(dev, i);
 
 	destroy_workqueue(dev->cache.wq);
@@ -872,22 +872,22 @@ static int get_octo_len(u64 addr, u64 len, int page_shift)
 	return (npages + 1) / 2;
 }
 
-static int mr_cache_max_order(struct mlx5_ib_dev *dev)
+static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
 {
 	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
-		return MR_CACHE_LAST_STD_ENTRY + 2;
+		return MKEY_CACHE_LAST_STD_ENTRY + 2;
 	return MLX5_MAX_UMR_SHIFT;
 }
 
-static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev,
-						      unsigned int order)
+static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev,
+							unsigned int order)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 
 	if (order < cache->ent[0].order)
 		return &cache->ent[0];
 	order = order - cache->ent[0].order;
-	if (order > MR_CACHE_LAST_STD_ENTRY)
+	if (order > MKEY_CACHE_LAST_STD_ENTRY)
 		return NULL;
 	return &cache->ent[order];
 }
@@ -930,7 +930,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
 						     0, iova);
 	if (WARN_ON(!page_size))
 		return ERR_PTR(-EINVAL);
-	ent = mr_cache_ent_from_order(
+	ent = mkey_cache_ent_from_order(
 		dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size)));
 	/*
 	 * Matches access in alloc_cache_mr(). If the MR can't come from the
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 84da5674e1ab..e305bf1dc6c2 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -1588,7 +1588,7 @@ mlx5_ib_odp_destroy_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 	return err;
 }
 
-void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
+void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent)
 {
 	if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
 		return;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 76d7661e3e63..220597c2f436 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -728,10 +728,10 @@ enum {
 };
 
 enum {
-	MR_CACHE_LAST_STD_ENTRY = 20,
+	MKEY_CACHE_LAST_STD_ENTRY = 20,
 	MLX5_IMR_MTT_CACHE_ENTRY,
 	MLX5_IMR_KSM_CACHE_ENTRY,
-	MAX_MR_CACHE_ENTRIES
+	MAX_MKEY_CACHE_ENTRIES
 };
 
 struct mlx5_profile {
@@ -740,7 +740,7 @@ struct mlx5_profile {
 	struct {
 		int	size;
 		int	limit;
-	} mr_cache[MAX_MR_CACHE_ENTRIES];
+	} mr_cache[MAX_MKEY_CACHE_ENTRIES];
 };
 
 struct mlx5_hca_cap {
-- 
cgit 


From d8f70c47394c26a8d7c6e602d909de88d1bdae5e Mon Sep 17 00:00:00 2001
From: Cheng Xu <chengyou@linux.alibaba.com>
Date: Wed, 27 Jul 2022 09:49:17 +0800
Subject: RDMA: Add ERDMA to rdma_driver_id definition

Define RDMA_DRIVER_ERDMA in enum rdma_driver_id.

Link: https://lore.kernel.org/r/20220727014927.76564-2-chengyou@linux.alibaba.com
Signed-off-by: Cheng Xu <chengyou@linux.alibaba.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/uapi/rdma/ib_user_ioctl_verbs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h
index 3072e5d6b692..7dd56210226f 100644
--- a/include/uapi/rdma/ib_user_ioctl_verbs.h
+++ b/include/uapi/rdma/ib_user_ioctl_verbs.h
@@ -250,6 +250,7 @@ enum rdma_driver_id {
 	RDMA_DRIVER_QIB,
 	RDMA_DRIVER_EFA,
 	RDMA_DRIVER_SIW,
+	RDMA_DRIVER_ERDMA,
 };
 
 enum ib_uverbs_gid_type {
-- 
cgit 


From f5995fe2a0b1f4ec5e52e2022452246a13bf89b7 Mon Sep 17 00:00:00 2001
From: Cheng Xu <chengyou@linux.alibaba.com>
Date: Wed, 27 Jul 2022 09:49:26 +0800
Subject: RDMA/erdma: Add the ABI definitions

Add erdma ABI definitions which will be shared between kernel and
userspace. This commit also fix compile issues reported by lkp.

Link: https://lore.kernel.org/r/20220727014927.76564-11-chengyou@linux.alibaba.com
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Cheng Xu <chengyou@linux.alibaba.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/uapi/rdma/erdma-abi.h | 49 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 include/uapi/rdma/erdma-abi.h

(limited to 'include')

diff --git a/include/uapi/rdma/erdma-abi.h b/include/uapi/rdma/erdma-abi.h
new file mode 100644
index 000000000000..b7a0222f978f
--- /dev/null
+++ b/include/uapi/rdma/erdma-abi.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/*
+ * Copyright (c) 2020-2022, Alibaba Group.
+ */
+
+#ifndef __ERDMA_USER_H__
+#define __ERDMA_USER_H__
+
+#include <linux/types.h>
+
+#define ERDMA_ABI_VERSION       1
+
+struct erdma_ureq_create_cq {
+	__aligned_u64 db_record_va;
+	__aligned_u64 qbuf_va;
+	__u32 qbuf_len;
+	__u32 rsvd0;
+};
+
+struct erdma_uresp_create_cq {
+	__u32 cq_id;
+	__u32 num_cqe;
+};
+
+struct erdma_ureq_create_qp {
+	__aligned_u64 db_record_va;
+	__aligned_u64 qbuf_va;
+	__u32 qbuf_len;
+	__u32 rsvd0;
+};
+
+struct erdma_uresp_create_qp {
+	__u32 qp_id;
+	__u32 num_sqe;
+	__u32 num_rqe;
+	__u32 rq_offset;
+};
+
+struct erdma_uresp_alloc_ctx {
+	__u32 dev_id;
+	__u32 pad;
+	__u32 sdb_type;
+	__u32 sdb_offset;
+	__aligned_u64 sdb;
+	__aligned_u64 rdb;
+	__aligned_u64 cdb;
+};
+
+#endif
-- 
cgit 


From 103e10c69c611efabccf57d799c4b191d53ee765 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Fri, 22 Jul 2022 09:57:46 +0300
Subject: ACPI: property: Add support for parsing buffer property UUID

Add support for newly added buffer property UUID, as defined in the DSD
guide section 3.3 [1]

Link: https://github.com/UEFI/DSD-Guide/blob/main/src/dsd-guide.adoc#buffer-data-extension-uuid # [1]
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/property.c | 142 +++++++++++++++++++++++++++++++++++++++++++-----
 include/acpi/acpi_bus.h |   3 +-
 include/linux/acpi.h    |   2 +-
 3 files changed, 132 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c
index b751eb98106c..7e3d163a6b1f 100644
--- a/drivers/acpi/property.c
+++ b/drivers/acpi/property.c
@@ -55,14 +55,19 @@ static const guid_t ads_guid =
 	GUID_INIT(0xdbb8e3e6, 0x5886, 0x4ba6,
 		  0x87, 0x95, 0x13, 0x19, 0xf5, 0x2a, 0x96, 0x6b);
 
+static const guid_t buffer_prop_guid =
+	GUID_INIT(0xedb12dd0, 0x363d, 0x4085,
+		  0xa3, 0xd2, 0x49, 0x52, 0x2c, 0xa1, 0x60, 0xc4);
+
 static bool acpi_enumerate_nondev_subnodes(acpi_handle scope,
-					   const union acpi_object *desc,
+					   union acpi_object *desc,
 					   struct acpi_device_data *data,
 					   struct fwnode_handle *parent);
-static bool acpi_extract_properties(const union acpi_object *desc,
+static bool acpi_extract_properties(acpi_handle handle,
+				    union acpi_object *desc,
 				    struct acpi_device_data *data);
 
-static bool acpi_nondev_subnode_extract(const union acpi_object *desc,
+static bool acpi_nondev_subnode_extract(union acpi_object *desc,
 					acpi_handle handle,
 					const union acpi_object *link,
 					struct list_head *list,
@@ -81,7 +86,7 @@ static bool acpi_nondev_subnode_extract(const union acpi_object *desc,
 	INIT_LIST_HEAD(&dn->data.properties);
 	INIT_LIST_HEAD(&dn->data.subnodes);
 
-	result = acpi_extract_properties(desc, &dn->data);
+	result = acpi_extract_properties(handle, desc, &dn->data);
 
 	if (handle) {
 		acpi_handle scope;
@@ -156,7 +161,7 @@ static bool acpi_nondev_subnode_ok(acpi_handle scope,
 }
 
 static bool acpi_add_nondev_subnodes(acpi_handle scope,
-				     const union acpi_object *links,
+				     union acpi_object *links,
 				     struct list_head *list,
 				     struct fwnode_handle *parent)
 {
@@ -164,7 +169,7 @@ static bool acpi_add_nondev_subnodes(acpi_handle scope,
 	int i;
 
 	for (i = 0; i < links->package.count; i++) {
-		const union acpi_object *link, *desc;
+		union acpi_object *link, *desc;
 		acpi_handle handle;
 		bool result;
 
@@ -204,7 +209,7 @@ static bool acpi_add_nondev_subnodes(acpi_handle scope,
 }
 
 static bool acpi_enumerate_nondev_subnodes(acpi_handle scope,
-					   const union acpi_object *desc,
+					   union acpi_object *desc,
 					   struct acpi_device_data *data,
 					   struct fwnode_handle *parent)
 {
@@ -212,7 +217,8 @@ static bool acpi_enumerate_nondev_subnodes(acpi_handle scope,
 
 	/* Look for the ACPI data subnodes GUID. */
 	for (i = 0; i < desc->package.count; i += 2) {
-		const union acpi_object *guid, *links;
+		const union acpi_object *guid;
+		union acpi_object *links;
 
 		guid = &desc->package.elements[i];
 		links = &desc->package.elements[i + 1];
@@ -325,7 +331,7 @@ static bool acpi_is_property_guid(const guid_t *guid)
 
 struct acpi_device_properties *
 acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid,
-		    const union acpi_object *properties)
+		    union acpi_object *properties)
 {
 	struct acpi_device_properties *props;
 
@@ -377,7 +383,104 @@ static bool acpi_tie_nondev_subnodes(struct acpi_device_data *data)
 	return true;
 }
 
-static bool acpi_extract_properties(const union acpi_object *desc,
+static void acpi_data_add_buffer_props(acpi_handle handle,
+				       struct acpi_device_data *data,
+				       union acpi_object *properties)
+{
+	struct acpi_device_properties *props;
+	union acpi_object *package;
+	size_t alloc_size;
+	unsigned int i;
+	u32 *count;
+
+	if (check_mul_overflow((size_t)properties->package.count,
+			       sizeof(*package) + sizeof(void *),
+			       &alloc_size) ||
+	    check_add_overflow(sizeof(*props) + sizeof(*package), alloc_size,
+			       &alloc_size)) {
+		acpi_handle_warn(handle,
+				 "can't allocate memory for %u buffer props",
+				 properties->package.count);
+		return;
+	}
+
+	props = kvzalloc(alloc_size, GFP_KERNEL);
+	if (!props)
+		return;
+
+	props->guid = &buffer_prop_guid;
+	props->bufs = (void *)(props + 1);
+	props->properties = (void *)(props->bufs + properties->package.count);
+
+	/* Outer package */
+	package = props->properties;
+	package->type = ACPI_TYPE_PACKAGE;
+	package->package.elements = package + 1;
+	count = &package->package.count;
+	*count = 0;
+
+	/* Inner packages */
+	package++;
+
+	for (i = 0; i < properties->package.count; i++) {
+		struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER };
+		union acpi_object *property = &properties->package.elements[i];
+		union acpi_object *prop, *obj, *buf_obj;
+		acpi_status status;
+
+		if (property->type != ACPI_TYPE_PACKAGE ||
+		    property->package.count != 2) {
+			acpi_handle_warn(handle,
+					 "buffer property %u has %u entries\n",
+					 i, property->package.count);
+			continue;
+		}
+
+		prop = &property->package.elements[0];
+		obj = &property->package.elements[1];
+
+		if (prop->type != ACPI_TYPE_STRING ||
+		    obj->type != ACPI_TYPE_STRING) {
+			acpi_handle_warn(handle,
+					 "wrong object types %u and %u\n",
+					 prop->type, obj->type);
+			continue;
+		}
+
+		status = acpi_evaluate_object_typed(handle, obj->string.pointer,
+						    NULL, &buf,
+						    ACPI_TYPE_BUFFER);
+		if (ACPI_FAILURE(status)) {
+			acpi_handle_warn(handle,
+					 "can't evaluate \"%*pE\" as buffer\n",
+					 obj->string.length,
+					 obj->string.pointer);
+			continue;
+		}
+
+		package->type = ACPI_TYPE_PACKAGE;
+		package->package.elements = prop;
+		package->package.count = 2;
+
+		buf_obj = buf.pointer;
+
+		/* Replace the string object with a buffer object */
+		obj->type = ACPI_TYPE_BUFFER;
+		obj->buffer.length = buf_obj->buffer.length;
+		obj->buffer.pointer = buf_obj->buffer.pointer;
+
+		props->bufs[i] = buf.pointer;
+		package++;
+		(*count)++;
+	}
+
+	if (*count)
+		list_add(&props->list, &data->properties);
+	else
+		kvfree(props);
+}
+
+static bool acpi_extract_properties(acpi_handle scope, union acpi_object *desc,
 				    struct acpi_device_data *data)
 {
 	int i;
@@ -387,7 +490,8 @@ static bool acpi_extract_properties(const union acpi_object *desc,
 
 	/* Look for the device properties GUID. */
 	for (i = 0; i < desc->package.count; i += 2) {
-		const union acpi_object *guid, *properties;
+		const union acpi_object *guid;
+		union acpi_object *properties;
 
 		guid = &desc->package.elements[i];
 		properties = &desc->package.elements[i + 1];
@@ -401,6 +505,12 @@ static bool acpi_extract_properties(const union acpi_object *desc,
 		    properties->type != ACPI_TYPE_PACKAGE)
 			break;
 
+		if (guid_equal((guid_t *)guid->buffer.pointer,
+			       &buffer_prop_guid)) {
+			acpi_data_add_buffer_props(scope, data, properties);
+			continue;
+		}
+
 		if (!acpi_is_property_guid((guid_t *)guid->buffer.pointer))
 			continue;
 
@@ -447,7 +557,7 @@ void acpi_init_properties(struct acpi_device *adev)
 	if (ACPI_FAILURE(status))
 		goto out;
 
-	if (acpi_extract_properties(buf.pointer, &adev->data)) {
+	if (acpi_extract_properties(adev->handle, buf.pointer, &adev->data)) {
 		adev->data.pointer = buf.pointer;
 		if (acpi_of)
 			acpi_init_of_compatible(adev);
@@ -477,8 +587,14 @@ static void acpi_free_device_properties(struct list_head *list)
 	struct acpi_device_properties *props, *tmp;
 
 	list_for_each_entry_safe(props, tmp, list, list) {
+		u32 i;
+
 		list_del(&props->list);
-		kfree(props);
+		/* Buffer data properties were separately allocated */
+		if (props->bufs)
+			for (i = 0; i < props->properties->package.count; i++)
+				ACPI_FREE(props->bufs[i]);
+		kvfree(props);
 	}
 }
 
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 0dc1ea0b52f5..a1690000ac95 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -344,8 +344,9 @@ struct acpi_device_physical_node {
 
 struct acpi_device_properties {
 	const guid_t *guid;
-	const union acpi_object *properties;
+	union acpi_object *properties;
 	struct list_head list;
+	void **bufs;
 };
 
 /* ACPI Device Specific Data (_DSD) */
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 44975c1bbe12..7c46f152106b 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1243,7 +1243,7 @@ static inline bool acpi_dev_has_props(const struct acpi_device *adev)
 
 struct acpi_device_properties *
 acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid,
-		    const union acpi_object *properties);
+		    union acpi_object *properties);
 
 int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname,
 		       void **valptr);
-- 
cgit 


From 72691a269f0baad6d5f4aa7af97c29081b86d70f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 27 Jul 2022 13:02:27 -0400
Subject: SUNRPC: Don't reuse bvec on retransmission of the request

If a request is re-encoded and then retransmitted, we need to make sure
that we also re-encode the bvec, in case the page lists have changed.

Fixes: ff053dbbaffe ("SUNRPC: Move the call to xprt_send_pagedata() out of xprt_sock_sendmsg()")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprt.h |  3 ++-
 net/sunrpc/clnt.c           |  1 -
 net/sunrpc/xprt.c           | 27 ++++++++++++++++++---------
 net/sunrpc/xprtsock.c       | 12 ++----------
 4 files changed, 22 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 0d51b9f9ea37..b9f59aabee53 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -144,7 +144,8 @@ struct rpc_xprt_ops {
 	unsigned short	(*get_srcport)(struct rpc_xprt *xprt);
 	int		(*buf_alloc)(struct rpc_task *task);
 	void		(*buf_free)(struct rpc_task *task);
-	int		(*prepare_request)(struct rpc_rqst *req);
+	int		(*prepare_request)(struct rpc_rqst *req,
+					   struct xdr_buf *buf);
 	int		(*send_request)(struct rpc_rqst *req);
 	void		(*wait_for_reply_request)(struct rpc_task *task);
 	void		(*timer)(struct rpc_xprt *xprt, struct rpc_task *task);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index bbfc47f03480..b098e707ad41 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1870,7 +1870,6 @@ rpc_xdr_encode(struct rpc_task *task)
 	req->rq_snd_buf.head[0].iov_len = 0;
 	xdr_init_encode(&xdr, &req->rq_snd_buf,
 			req->rq_snd_buf.head[0].iov_base, req);
-	xdr_free_bvec(&req->rq_snd_buf);
 	if (rpc_encode_header(task, &xdr))
 		return;
 
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 44348c9f4b00..d71eec494826 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -73,7 +73,7 @@ static void	xprt_init(struct rpc_xprt *xprt, struct net *net);
 static __be32	xprt_alloc_xid(struct rpc_xprt *xprt);
 static void	xprt_destroy(struct rpc_xprt *xprt);
 static void	xprt_request_init(struct rpc_task *task);
-static int	xprt_request_prepare(struct rpc_rqst *req);
+static int	xprt_request_prepare(struct rpc_rqst *req, struct xdr_buf *buf);
 
 static DEFINE_SPINLOCK(xprt_list_lock);
 static LIST_HEAD(xprt_list);
@@ -1149,7 +1149,7 @@ xprt_request_enqueue_receive(struct rpc_task *task)
 	if (!xprt_request_need_enqueue_receive(task, req))
 		return 0;
 
-	ret = xprt_request_prepare(task->tk_rqstp);
+	ret = xprt_request_prepare(task->tk_rqstp, &req->rq_rcv_buf);
 	if (ret)
 		return ret;
 	spin_lock(&xprt->queue_lock);
@@ -1179,8 +1179,11 @@ xprt_request_dequeue_receive_locked(struct rpc_task *task)
 {
 	struct rpc_rqst *req = task->tk_rqstp;
 
-	if (test_and_clear_bit(RPC_TASK_NEED_RECV, &task->tk_runstate))
+	if (test_and_clear_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) {
 		xprt_request_rb_remove(req->rq_xprt, req);
+		xdr_free_bvec(&req->rq_rcv_buf);
+		req->rq_private_buf.bvec = NULL;
+	}
 }
 
 /**
@@ -1336,8 +1339,14 @@ xprt_request_enqueue_transmit(struct rpc_task *task)
 {
 	struct rpc_rqst *pos, *req = task->tk_rqstp;
 	struct rpc_xprt *xprt = req->rq_xprt;
+	int ret;
 
 	if (xprt_request_need_enqueue_transmit(task, req)) {
+		ret = xprt_request_prepare(task->tk_rqstp, &req->rq_snd_buf);
+		if (ret) {
+			task->tk_status = ret;
+			return;
+		}
 		req->rq_bytes_sent = 0;
 		spin_lock(&xprt->queue_lock);
 		/*
@@ -1397,6 +1406,7 @@ xprt_request_dequeue_transmit_locked(struct rpc_task *task)
 	} else
 		list_del(&req->rq_xmit2);
 	atomic_long_dec(&req->rq_xprt->xmit_queuelen);
+	xdr_free_bvec(&req->rq_snd_buf);
 }
 
 /**
@@ -1433,8 +1443,6 @@ xprt_request_dequeue_xprt(struct rpc_task *task)
 	    test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) ||
 	    xprt_is_pinned_rqst(req)) {
 		spin_lock(&xprt->queue_lock);
-		xprt_request_dequeue_transmit_locked(task);
-		xprt_request_dequeue_receive_locked(task);
 		while (xprt_is_pinned_rqst(req)) {
 			set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
 			spin_unlock(&xprt->queue_lock);
@@ -1442,6 +1450,8 @@ xprt_request_dequeue_xprt(struct rpc_task *task)
 			spin_lock(&xprt->queue_lock);
 			clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
 		}
+		xprt_request_dequeue_transmit_locked(task);
+		xprt_request_dequeue_receive_locked(task);
 		spin_unlock(&xprt->queue_lock);
 	}
 }
@@ -1449,18 +1459,19 @@ xprt_request_dequeue_xprt(struct rpc_task *task)
 /**
  * xprt_request_prepare - prepare an encoded request for transport
  * @req: pointer to rpc_rqst
+ * @buf: pointer to send/rcv xdr_buf
  *
  * Calls into the transport layer to do whatever is needed to prepare
  * the request for transmission or receive.
  * Returns error, or zero.
  */
 static int
-xprt_request_prepare(struct rpc_rqst *req)
+xprt_request_prepare(struct rpc_rqst *req, struct xdr_buf *buf)
 {
 	struct rpc_xprt *xprt = req->rq_xprt;
 
 	if (xprt->ops->prepare_request)
-		return xprt->ops->prepare_request(req);
+		return xprt->ops->prepare_request(req, buf);
 	return 0;
 }
 
@@ -1961,8 +1972,6 @@ void xprt_release(struct rpc_task *task)
 	spin_unlock(&xprt->transport_lock);
 	if (req->rq_buffer)
 		xprt->ops->buf_free(task);
-	xdr_free_bvec(&req->rq_rcv_buf);
-	xdr_free_bvec(&req->rq_snd_buf);
 	if (req->rq_cred != NULL)
 		put_rpccred(req->rq_cred);
 	if (req->rq_release_snd_buf)
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index eba1be9984f8..e976007f4fd0 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -822,17 +822,9 @@ static int xs_stream_nospace(struct rpc_rqst *req, bool vm_wait)
 	return ret;
 }
 
-static int
-xs_stream_prepare_request(struct rpc_rqst *req)
+static int xs_stream_prepare_request(struct rpc_rqst *req, struct xdr_buf *buf)
 {
-	gfp_t gfp = rpc_task_gfp_mask();
-	int ret;
-
-	ret = xdr_alloc_bvec(&req->rq_snd_buf, gfp);
-	if (ret < 0)
-		return ret;
-	xdr_free_bvec(&req->rq_rcv_buf);
-	return xdr_alloc_bvec(&req->rq_rcv_buf, gfp);
+	return xdr_alloc_bvec(buf, rpc_task_gfp_mask());
 }
 
 /*
-- 
cgit 


From c452d49849d48bd37ae97fc2bc92c6435707c35f Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus@microchip.com>
Date: Mon, 25 Jul 2022 12:24:59 +0300
Subject: mtd: spi-nor: s/addr_width/addr_nbytes

Address width was an unfortunate name, as it means the number of IO lines
used for the address, whereas in the code it is used as the number of
address bytes. s/addr_width/addr_nbytes throughout the entire SPI NOR
framework.

Signed-off-by: Tudor Ambarus <tudor.ambarus@microchip.com>
Reviewed-by: Michael Walle <michael@walle.cc>
Acked-by: Pratyush Yadav <p.yadav@ti.com>
Link: https://lore.kernel.org/r/20220725092505.446315-2-tudor.ambarus@microchip.com
---
 drivers/mtd/spi-nor/controllers/hisi-sfc.c  |  2 +-
 drivers/mtd/spi-nor/controllers/nxp-spifi.c |  8 ++---
 drivers/mtd/spi-nor/core.c                  | 54 ++++++++++++++---------------
 drivers/mtd/spi-nor/core.h                  | 12 +++----
 drivers/mtd/spi-nor/debugfs.c               |  2 +-
 drivers/mtd/spi-nor/issi.c                  |  8 ++---
 drivers/mtd/spi-nor/otp.c                   | 12 +++----
 drivers/mtd/spi-nor/sfdp.c                  | 32 ++++++++---------
 drivers/mtd/spi-nor/xilinx.c                |  2 +-
 include/linux/mtd/spi-nor.h                 |  4 +--
 10 files changed, 68 insertions(+), 68 deletions(-)

(limited to 'include')

diff --git a/drivers/mtd/spi-nor/controllers/hisi-sfc.c b/drivers/mtd/spi-nor/controllers/hisi-sfc.c
index 94a969185ceb..5070d72835ec 100644
--- a/drivers/mtd/spi-nor/controllers/hisi-sfc.c
+++ b/drivers/mtd/spi-nor/controllers/hisi-sfc.c
@@ -237,7 +237,7 @@ static int hisi_spi_nor_dma_transfer(struct spi_nor *nor, loff_t start_off,
 	reg = readl(host->regbase + FMC_CFG);
 	reg &= ~(FMC_CFG_OP_MODE_MASK | SPI_NOR_ADDR_MODE_MASK);
 	reg |= FMC_CFG_OP_MODE_NORMAL;
-	reg |= (nor->addr_width == 4) ? SPI_NOR_ADDR_MODE_4BYTES
+	reg |= (nor->addr_nbytes == 4) ? SPI_NOR_ADDR_MODE_4BYTES
 		: SPI_NOR_ADDR_MODE_3BYTES;
 	writel(reg, host->regbase + FMC_CFG);
 
diff --git a/drivers/mtd/spi-nor/controllers/nxp-spifi.c b/drivers/mtd/spi-nor/controllers/nxp-spifi.c
index 9032b9ab2eaf..ab3990e6ac25 100644
--- a/drivers/mtd/spi-nor/controllers/nxp-spifi.c
+++ b/drivers/mtd/spi-nor/controllers/nxp-spifi.c
@@ -203,7 +203,7 @@ static ssize_t nxp_spifi_write(struct spi_nor *nor, loff_t to, size_t len,
 	      SPIFI_CMD_DATALEN(len) |
 	      SPIFI_CMD_FIELDFORM_ALL_SERIAL |
 	      SPIFI_CMD_OPCODE(nor->program_opcode) |
-	      SPIFI_CMD_FRAMEFORM(spifi->nor.addr_width + 1);
+	      SPIFI_CMD_FRAMEFORM(spifi->nor.addr_nbytes + 1);
 	writel(cmd, spifi->io_base + SPIFI_CMD);
 
 	for (i = 0; i < len; i++)
@@ -230,7 +230,7 @@ static int nxp_spifi_erase(struct spi_nor *nor, loff_t offs)
 
 	cmd = SPIFI_CMD_FIELDFORM_ALL_SERIAL |
 	      SPIFI_CMD_OPCODE(nor->erase_opcode) |
-	      SPIFI_CMD_FRAMEFORM(spifi->nor.addr_width + 1);
+	      SPIFI_CMD_FRAMEFORM(spifi->nor.addr_nbytes + 1);
 	writel(cmd, spifi->io_base + SPIFI_CMD);
 
 	return nxp_spifi_wait_for_cmd(spifi);
@@ -252,12 +252,12 @@ static int nxp_spifi_setup_memory_cmd(struct nxp_spifi *spifi)
 	}
 
 	/* Memory mode supports address length between 1 and 4 */
-	if (spifi->nor.addr_width < 1 || spifi->nor.addr_width > 4)
+	if (spifi->nor.addr_nbytes < 1 || spifi->nor.addr_nbytes > 4)
 		return -EINVAL;
 
 	spifi->mcmd |= SPIFI_CMD_OPCODE(spifi->nor.read_opcode) |
 		       SPIFI_CMD_INTLEN(spifi->nor.read_dummy / 8) |
-		       SPIFI_CMD_FRAMEFORM(spifi->nor.addr_width + 1);
+		       SPIFI_CMD_FRAMEFORM(spifi->nor.addr_nbytes + 1);
 
 	return 0;
 }
diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c
index ce5d69317d46..31604188ee59 100644
--- a/drivers/mtd/spi-nor/core.c
+++ b/drivers/mtd/spi-nor/core.c
@@ -38,7 +38,7 @@
  */
 #define CHIP_ERASE_2MB_READY_WAIT_JIFFIES	(40UL * HZ)
 
-#define SPI_NOR_MAX_ADDR_WIDTH	4
+#define SPI_NOR_MAX_ADDR_NBYTES	4
 
 #define SPI_NOR_SRST_SLEEP_MIN 200
 #define SPI_NOR_SRST_SLEEP_MAX 400
@@ -198,7 +198,7 @@ static ssize_t spi_nor_spimem_read_data(struct spi_nor *nor, loff_t from,
 {
 	struct spi_mem_op op =
 		SPI_MEM_OP(SPI_MEM_OP_CMD(nor->read_opcode, 0),
-			   SPI_MEM_OP_ADDR(nor->addr_width, from, 0),
+			   SPI_MEM_OP_ADDR(nor->addr_nbytes, from, 0),
 			   SPI_MEM_OP_DUMMY(nor->read_dummy, 0),
 			   SPI_MEM_OP_DATA_IN(len, buf, 0));
 	bool usebouncebuf;
@@ -262,7 +262,7 @@ static ssize_t spi_nor_spimem_write_data(struct spi_nor *nor, loff_t to,
 {
 	struct spi_mem_op op =
 		SPI_MEM_OP(SPI_MEM_OP_CMD(nor->program_opcode, 0),
-			   SPI_MEM_OP_ADDR(nor->addr_width, to, 0),
+			   SPI_MEM_OP_ADDR(nor->addr_nbytes, to, 0),
 			   SPI_MEM_OP_NO_DUMMY,
 			   SPI_MEM_OP_DATA_OUT(len, buf, 0));
 	ssize_t nbytes;
@@ -1113,7 +1113,7 @@ int spi_nor_erase_sector(struct spi_nor *nor, u32 addr)
 	if (nor->spimem) {
 		struct spi_mem_op op =
 			SPI_NOR_SECTOR_ERASE_OP(nor->erase_opcode,
-						nor->addr_width, addr);
+						nor->addr_nbytes, addr);
 
 		spi_nor_spimem_setup_op(nor, &op, nor->write_proto);
 
@@ -1126,13 +1126,13 @@ int spi_nor_erase_sector(struct spi_nor *nor, u32 addr)
 	 * Default implementation, if driver doesn't have a specialized HW
 	 * control
 	 */
-	for (i = nor->addr_width - 1; i >= 0; i--) {
+	for (i = nor->addr_nbytes - 1; i >= 0; i--) {
 		nor->bouncebuf[i] = addr & 0xff;
 		addr >>= 8;
 	}
 
 	return spi_nor_controller_ops_write_reg(nor, nor->erase_opcode,
-						nor->bouncebuf, nor->addr_width);
+						nor->bouncebuf, nor->addr_nbytes);
 }
 
 /**
@@ -2249,43 +2249,43 @@ static int spi_nor_default_setup(struct spi_nor *nor,
 	return 0;
 }
 
-static int spi_nor_set_addr_width(struct spi_nor *nor)
+static int spi_nor_set_addr_nbytes(struct spi_nor *nor)
 {
-	if (nor->addr_width) {
+	if (nor->addr_nbytes) {
 		/* already configured from SFDP */
 	} else if (nor->read_proto == SNOR_PROTO_8_8_8_DTR) {
 		/*
 		 * In 8D-8D-8D mode, one byte takes half a cycle to transfer. So
-		 * in this protocol an odd address width cannot be used because
+		 * in this protocol an odd addr_nbytes cannot be used because
 		 * then the address phase would only span a cycle and a half.
 		 * Half a cycle would be left over. We would then have to start
 		 * the dummy phase in the middle of a cycle and so too the data
 		 * phase, and we will end the transaction with half a cycle left
 		 * over.
 		 *
-		 * Force all 8D-8D-8D flashes to use an address width of 4 to
+		 * Force all 8D-8D-8D flashes to use an addr_nbytes of 4 to
 		 * avoid this situation.
 		 */
-		nor->addr_width = 4;
-	} else if (nor->info->addr_width) {
-		nor->addr_width = nor->info->addr_width;
+		nor->addr_nbytes = 4;
+	} else if (nor->info->addr_nbytes) {
+		nor->addr_nbytes = nor->info->addr_nbytes;
 	} else {
-		nor->addr_width = 3;
+		nor->addr_nbytes = 3;
 	}
 
-	if (nor->addr_width == 3 && nor->params->size > 0x1000000) {
+	if (nor->addr_nbytes == 3 && nor->params->size > 0x1000000) {
 		/* enable 4-byte addressing if the device exceeds 16MiB */
-		nor->addr_width = 4;
+		nor->addr_nbytes = 4;
 	}
 
-	if (nor->addr_width > SPI_NOR_MAX_ADDR_WIDTH) {
-		dev_dbg(nor->dev, "address width is too large: %u\n",
-			nor->addr_width);
+	if (nor->addr_nbytes > SPI_NOR_MAX_ADDR_NBYTES) {
+		dev_dbg(nor->dev, "The number of address bytes is too large: %u\n",
+			nor->addr_nbytes);
 		return -EINVAL;
 	}
 
 	/* Set 4byte opcodes when possible. */
-	if (nor->addr_width == 4 && nor->flags & SNOR_F_4B_OPCODES &&
+	if (nor->addr_nbytes == 4 && nor->flags & SNOR_F_4B_OPCODES &&
 	    !(nor->flags & SNOR_F_HAS_4BAIT))
 		spi_nor_set_4byte_opcodes(nor);
 
@@ -2304,7 +2304,7 @@ static int spi_nor_setup(struct spi_nor *nor,
 	if (ret)
 		return ret;
 
-	return spi_nor_set_addr_width(nor);
+	return spi_nor_set_addr_nbytes(nor);
 }
 
 /**
@@ -2492,7 +2492,7 @@ static void spi_nor_sfdp_init_params_deprecated(struct spi_nor *nor)
 
 	if (spi_nor_parse_sfdp(nor)) {
 		memcpy(nor->params, &sfdp_params, sizeof(*nor->params));
-		nor->addr_width = 0;
+		nor->addr_nbytes = 0;
 		nor->flags &= ~SNOR_F_4B_OPCODES;
 	}
 }
@@ -2713,7 +2713,7 @@ static int spi_nor_init(struct spi_nor *nor)
 	     nor->flags & SNOR_F_SWP_IS_VOLATILE))
 		spi_nor_try_unlock_all(nor);
 
-	if (nor->addr_width == 4 &&
+	if (nor->addr_nbytes == 4 &&
 	    nor->read_proto != SNOR_PROTO_8_8_8_DTR &&
 	    !(nor->flags & SNOR_F_4B_OPCODES)) {
 		/*
@@ -2840,7 +2840,7 @@ static void spi_nor_put_device(struct mtd_info *mtd)
 void spi_nor_restore(struct spi_nor *nor)
 {
 	/* restore the addressing mode */
-	if (nor->addr_width == 4 && !(nor->flags & SNOR_F_4B_OPCODES) &&
+	if (nor->addr_nbytes == 4 && !(nor->flags & SNOR_F_4B_OPCODES) &&
 	    nor->flags & SNOR_F_BROKEN_RESET)
 		nor->params->set_4byte_addr_mode(nor, false);
 
@@ -2984,7 +2984,7 @@ int spi_nor_scan(struct spi_nor *nor, const char *name,
 	 * - select op codes for (Fast) Read, Page Program and Sector Erase.
 	 * - set the number of dummy cycles (mode cycles + wait states).
 	 * - set the SPI protocols for register and memory accesses.
-	 * - set the address width.
+	 * - set the number of address bytes.
 	 */
 	ret = spi_nor_setup(nor, hwcaps);
 	if (ret)
@@ -3025,7 +3025,7 @@ static int spi_nor_create_read_dirmap(struct spi_nor *nor)
 {
 	struct spi_mem_dirmap_info info = {
 		.op_tmpl = SPI_MEM_OP(SPI_MEM_OP_CMD(nor->read_opcode, 0),
-				      SPI_MEM_OP_ADDR(nor->addr_width, 0, 0),
+				      SPI_MEM_OP_ADDR(nor->addr_nbytes, 0, 0),
 				      SPI_MEM_OP_DUMMY(nor->read_dummy, 0),
 				      SPI_MEM_OP_DATA_IN(0, NULL, 0)),
 		.offset = 0,
@@ -3056,7 +3056,7 @@ static int spi_nor_create_write_dirmap(struct spi_nor *nor)
 {
 	struct spi_mem_dirmap_info info = {
 		.op_tmpl = SPI_MEM_OP(SPI_MEM_OP_CMD(nor->program_opcode, 0),
-				      SPI_MEM_OP_ADDR(nor->addr_width, 0, 0),
+				      SPI_MEM_OP_ADDR(nor->addr_nbytes, 0, 0),
 				      SPI_MEM_OP_NO_DUMMY,
 				      SPI_MEM_OP_DATA_OUT(0, NULL, 0)),
 		.offset = 0,
diff --git a/drivers/mtd/spi-nor/core.h b/drivers/mtd/spi-nor/core.h
index 61886868cd02..14921a507b0f 100644
--- a/drivers/mtd/spi-nor/core.h
+++ b/drivers/mtd/spi-nor/core.h
@@ -84,9 +84,9 @@
 		   SPI_MEM_OP_NO_DUMMY,					\
 		   SPI_MEM_OP_NO_DATA)
 
-#define SPI_NOR_SECTOR_ERASE_OP(opcode, addr_width, addr)		\
+#define SPI_NOR_SECTOR_ERASE_OP(opcode, addr_nbytes, addr)		\
 	SPI_MEM_OP(SPI_MEM_OP_CMD(opcode, 0),				\
-		   SPI_MEM_OP_ADDR(addr_width, addr, 0),		\
+		   SPI_MEM_OP_ADDR(addr_nbytes, addr, 0),		\
 		   SPI_MEM_OP_NO_DUMMY,					\
 		   SPI_MEM_OP_NO_DATA)
 
@@ -429,7 +429,7 @@ struct spi_nor_fixups {
  *                  isn't necessarily called a "sector" by the vendor.
  * @n_sectors:      the number of sectors.
  * @page_size:      the flash's page size.
- * @addr_width:     the flash's address width.
+ * @addr_nbytes:    number of address bytes to send.
  *
  * @parse_sfdp:     true when flash supports SFDP tables. The false value has no
  *                  meaning. If one wants to skip the SFDP tables, one should
@@ -487,7 +487,7 @@ struct flash_info {
 	unsigned sector_size;
 	u16 n_sectors;
 	u16 page_size;
-	u16 addr_width;
+	u16 addr_nbytes;
 
 	bool parse_sfdp;
 	u16 flags;
@@ -548,11 +548,11 @@ struct flash_info {
 		.n_sectors = (_n_sectors),				\
 		.page_size = 256,					\
 
-#define CAT25_INFO(_sector_size, _n_sectors, _page_size, _addr_width)	\
+#define CAT25_INFO(_sector_size, _n_sectors, _page_size, _addr_nbytes)	\
 		.sector_size = (_sector_size),				\
 		.n_sectors = (_n_sectors),				\
 		.page_size = (_page_size),				\
-		.addr_width = (_addr_width),				\
+		.addr_nbytes = (_addr_nbytes),				\
 		.flags = SPI_NOR_NO_ERASE | SPI_NOR_NO_FR,		\
 
 #define OTP_INFO(_len, _n_regions, _base, _offset)			\
diff --git a/drivers/mtd/spi-nor/debugfs.c b/drivers/mtd/spi-nor/debugfs.c
index eaf84f7a0676..df76cb5de3f9 100644
--- a/drivers/mtd/spi-nor/debugfs.c
+++ b/drivers/mtd/spi-nor/debugfs.c
@@ -86,7 +86,7 @@ static int spi_nor_params_show(struct seq_file *s, void *data)
 	seq_printf(s, "size\t\t%s\n", buf);
 	seq_printf(s, "write size\t%u\n", params->writesize);
 	seq_printf(s, "page size\t%u\n", params->page_size);
-	seq_printf(s, "address width\t%u\n", nor->addr_width);
+	seq_printf(s, "address nbytes\t%u\n", nor->addr_nbytes);
 
 	seq_puts(s, "flags\t\t");
 	spi_nor_print_flags(s, nor->flags, snor_f_names, sizeof(snor_f_names));
diff --git a/drivers/mtd/spi-nor/issi.c b/drivers/mtd/spi-nor/issi.c
index 3c7d51d2b050..71687e5babdc 100644
--- a/drivers/mtd/spi-nor/issi.c
+++ b/drivers/mtd/spi-nor/issi.c
@@ -14,13 +14,13 @@ is25lp256_post_bfpt_fixups(struct spi_nor *nor,
 			   const struct sfdp_bfpt *bfpt)
 {
 	/*
-	 * IS25LP256 supports 4B opcodes, but the BFPT advertises a
-	 * BFPT_DWORD1_ADDRESS_BYTES_3_ONLY address width.
-	 * Overwrite the address width advertised by the BFPT.
+	 * IS25LP256 supports 4B opcodes, but the BFPT advertises
+	 * BFPT_DWORD1_ADDRESS_BYTES_3_ONLY.
+	 * Overwrite the number of address bytes advertised by the BFPT.
 	 */
 	if ((bfpt->dwords[BFPT_DWORD(1)] & BFPT_DWORD1_ADDRESS_BYTES_MASK) ==
 		BFPT_DWORD1_ADDRESS_BYTES_3_ONLY)
-		nor->addr_width = 4;
+		nor->addr_nbytes = 4;
 
 	return 0;
 }
diff --git a/drivers/mtd/spi-nor/otp.c b/drivers/mtd/spi-nor/otp.c
index fa63d8571218..00ab0d2d6d2f 100644
--- a/drivers/mtd/spi-nor/otp.c
+++ b/drivers/mtd/spi-nor/otp.c
@@ -35,13 +35,13 @@
  */
 int spi_nor_otp_read_secr(struct spi_nor *nor, loff_t addr, size_t len, u8 *buf)
 {
-	u8 addr_width, read_opcode, read_dummy;
+	u8 addr_nbytes, read_opcode, read_dummy;
 	struct spi_mem_dirmap_desc *rdesc;
 	enum spi_nor_protocol read_proto;
 	int ret;
 
 	read_opcode = nor->read_opcode;
-	addr_width = nor->addr_width;
+	addr_nbytes = nor->addr_nbytes;
 	read_dummy = nor->read_dummy;
 	read_proto = nor->read_proto;
 	rdesc = nor->dirmap.rdesc;
@@ -54,7 +54,7 @@ int spi_nor_otp_read_secr(struct spi_nor *nor, loff_t addr, size_t len, u8 *buf)
 	ret = spi_nor_read_data(nor, addr, len, buf);
 
 	nor->read_opcode = read_opcode;
-	nor->addr_width = addr_width;
+	nor->addr_nbytes = addr_nbytes;
 	nor->read_dummy = read_dummy;
 	nor->read_proto = read_proto;
 	nor->dirmap.rdesc = rdesc;
@@ -85,11 +85,11 @@ int spi_nor_otp_write_secr(struct spi_nor *nor, loff_t addr, size_t len,
 {
 	enum spi_nor_protocol write_proto;
 	struct spi_mem_dirmap_desc *wdesc;
-	u8 addr_width, program_opcode;
+	u8 addr_nbytes, program_opcode;
 	int ret, written;
 
 	program_opcode = nor->program_opcode;
-	addr_width = nor->addr_width;
+	addr_nbytes = nor->addr_nbytes;
 	write_proto = nor->write_proto;
 	wdesc = nor->dirmap.wdesc;
 
@@ -113,7 +113,7 @@ int spi_nor_otp_write_secr(struct spi_nor *nor, loff_t addr, size_t len,
 
 out:
 	nor->program_opcode = program_opcode;
-	nor->addr_width = addr_width;
+	nor->addr_nbytes = addr_nbytes;
 	nor->write_proto = write_proto;
 	nor->dirmap.wdesc = wdesc;
 
diff --git a/drivers/mtd/spi-nor/sfdp.c b/drivers/mtd/spi-nor/sfdp.c
index a5211543d30d..61ae8c8c5237 100644
--- a/drivers/mtd/spi-nor/sfdp.c
+++ b/drivers/mtd/spi-nor/sfdp.c
@@ -134,7 +134,7 @@ struct sfdp_4bait {
 
 /**
  * spi_nor_read_raw() - raw read of serial flash memory. read_opcode,
- *			addr_width and read_dummy members of the struct spi_nor
+ *			addr_nbytes and read_dummy members of the struct spi_nor
  *			should be previously
  * set.
  * @nor:	pointer to a 'struct spi_nor'
@@ -178,21 +178,21 @@ static int spi_nor_read_raw(struct spi_nor *nor, u32 addr, size_t len, u8 *buf)
 static int spi_nor_read_sfdp(struct spi_nor *nor, u32 addr,
 			     size_t len, void *buf)
 {
-	u8 addr_width, read_opcode, read_dummy;
+	u8 addr_nbytes, read_opcode, read_dummy;
 	int ret;
 
 	read_opcode = nor->read_opcode;
-	addr_width = nor->addr_width;
+	addr_nbytes = nor->addr_nbytes;
 	read_dummy = nor->read_dummy;
 
 	nor->read_opcode = SPINOR_OP_RDSFDP;
-	nor->addr_width = 3;
+	nor->addr_nbytes = 3;
 	nor->read_dummy = 8;
 
 	ret = spi_nor_read_raw(nor, addr, len, buf);
 
 	nor->read_opcode = read_opcode;
-	nor->addr_width = addr_width;
+	nor->addr_nbytes = addr_nbytes;
 	nor->read_dummy = read_dummy;
 
 	return ret;
@@ -462,11 +462,11 @@ static int spi_nor_parse_bfpt(struct spi_nor *nor,
 	switch (bfpt.dwords[BFPT_DWORD(1)] & BFPT_DWORD1_ADDRESS_BYTES_MASK) {
 	case BFPT_DWORD1_ADDRESS_BYTES_3_ONLY:
 	case BFPT_DWORD1_ADDRESS_BYTES_3_OR_4:
-		nor->addr_width = 3;
+		nor->addr_nbytes = 3;
 		break;
 
 	case BFPT_DWORD1_ADDRESS_BYTES_4_ONLY:
-		nor->addr_width = 4;
+		nor->addr_nbytes = 4;
 		break;
 
 	default:
@@ -637,12 +637,12 @@ static int spi_nor_parse_bfpt(struct spi_nor *nor,
 }
 
 /**
- * spi_nor_smpt_addr_width() - return the address width used in the
+ * spi_nor_smpt_addr_nbytes() - return the number of address bytes used in the
  *			       configuration detection command.
  * @nor:	pointer to a 'struct spi_nor'
  * @settings:	configuration detection command descriptor, dword1
  */
-static u8 spi_nor_smpt_addr_width(const struct spi_nor *nor, const u32 settings)
+static u8 spi_nor_smpt_addr_nbytes(const struct spi_nor *nor, const u32 settings)
 {
 	switch (settings & SMPT_CMD_ADDRESS_LEN_MASK) {
 	case SMPT_CMD_ADDRESS_LEN_0:
@@ -653,7 +653,7 @@ static u8 spi_nor_smpt_addr_width(const struct spi_nor *nor, const u32 settings)
 		return 4;
 	case SMPT_CMD_ADDRESS_LEN_USE_CURRENT:
 	default:
-		return nor->addr_width;
+		return nor->addr_nbytes;
 	}
 }
 
@@ -690,7 +690,7 @@ static const u32 *spi_nor_get_map_in_use(struct spi_nor *nor, const u32 *smpt,
 	u32 addr;
 	int err;
 	u8 i;
-	u8 addr_width, read_opcode, read_dummy;
+	u8 addr_nbytes, read_opcode, read_dummy;
 	u8 read_data_mask, map_id;
 
 	/* Use a kmalloc'ed bounce buffer to guarantee it is DMA-able. */
@@ -698,7 +698,7 @@ static const u32 *spi_nor_get_map_in_use(struct spi_nor *nor, const u32 *smpt,
 	if (!buf)
 		return ERR_PTR(-ENOMEM);
 
-	addr_width = nor->addr_width;
+	addr_nbytes = nor->addr_nbytes;
 	read_dummy = nor->read_dummy;
 	read_opcode = nor->read_opcode;
 
@@ -709,7 +709,7 @@ static const u32 *spi_nor_get_map_in_use(struct spi_nor *nor, const u32 *smpt,
 			break;
 
 		read_data_mask = SMPT_CMD_READ_DATA(smpt[i]);
-		nor->addr_width = spi_nor_smpt_addr_width(nor, smpt[i]);
+		nor->addr_nbytes = spi_nor_smpt_addr_nbytes(nor, smpt[i]);
 		nor->read_dummy = spi_nor_smpt_read_dummy(nor, smpt[i]);
 		nor->read_opcode = SMPT_CMD_OPCODE(smpt[i]);
 		addr = smpt[i + 1];
@@ -756,7 +756,7 @@ static const u32 *spi_nor_get_map_in_use(struct spi_nor *nor, const u32 *smpt,
 	/* fall through */
 out:
 	kfree(buf);
-	nor->addr_width = addr_width;
+	nor->addr_nbytes = addr_nbytes;
 	nor->read_dummy = read_dummy;
 	nor->read_opcode = read_opcode;
 	return ret;
@@ -1044,7 +1044,7 @@ static int spi_nor_parse_4bait(struct spi_nor *nor,
 	/*
 	 * We need at least one 4-byte op code per read, program and erase
 	 * operation; the .read(), .write() and .erase() hooks share the
-	 * nor->addr_width value.
+	 * nor->addr_nbytes value.
 	 */
 	if (!read_hwcaps || !pp_hwcaps || !erase_mask)
 		goto out;
@@ -1098,7 +1098,7 @@ static int spi_nor_parse_4bait(struct spi_nor *nor,
 	 * Spansion memory. However this quirk is no longer needed with new
 	 * SFDP compliant memories.
 	 */
-	nor->addr_width = 4;
+	nor->addr_nbytes = 4;
 	nor->flags |= SNOR_F_4B_OPCODES | SNOR_F_HAS_4BAIT;
 
 	/* fall through */
diff --git a/drivers/mtd/spi-nor/xilinx.c b/drivers/mtd/spi-nor/xilinx.c
index 1d2f5db047bd..5723157739fc 100644
--- a/drivers/mtd/spi-nor/xilinx.c
+++ b/drivers/mtd/spi-nor/xilinx.c
@@ -31,7 +31,7 @@
 		.sector_size = (8 * (_page_size)),			\
 		.n_sectors = (_n_sectors),				\
 		.page_size = (_page_size),				\
-		.addr_width = 3,					\
+		.addr_nbytes = 3,					\
 		.flags = SPI_NOR_NO_FR
 
 /* Xilinx S3AN share MFR with Atmel SPI NOR */
diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h
index 1ede4c89805a..42218a1164f6 100644
--- a/include/linux/mtd/spi-nor.h
+++ b/include/linux/mtd/spi-nor.h
@@ -351,7 +351,7 @@ struct spi_nor_flash_parameter;
  * @bouncebuf_size:	size of the bounce buffer
  * @info:		SPI NOR part JEDEC MFR ID and other info
  * @manufacturer:	SPI NOR manufacturer
- * @addr_width:		number of address bytes
+ * @addr_nbytes:	number of address bytes
  * @erase_opcode:	the opcode for erasing a sector
  * @read_opcode:	the read opcode
  * @read_dummy:		the dummy needed by the read operation
@@ -381,7 +381,7 @@ struct spi_nor {
 	size_t			bouncebuf_size;
 	const struct flash_info	*info;
 	const struct spi_nor_manufacturer *manufacturer;
-	u8			addr_width;
+	u8			addr_nbytes;
 	u8			erase_opcode;
 	u8			read_opcode;
 	u8			read_dummy;
-- 
cgit 


From e60a7233684aa8bbe9090537720fe6a1e901d823 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Thu, 28 Jul 2022 08:10:51 +0200
Subject: Documentation: serial: move uart_ops documentation to the struct

While it's a lot of text, it always helps to keep it up to date when
it's by the source. (And not in a separate file.)

The documentation tooling also makes sure that all members of the
structure are documented. (If not, it complains loudly.)

Finally, there needs to be no comments inlined in the structure, so they
are dropped as they are superfluous now.

The compilation time of this header (tested with serial_core.c) didn't
change in my testing at all.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20220728061056.20799-1-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/driver-api/serial/driver.rst | 358 +----------------------------
 include/linux/serial_core.h                | 345 +++++++++++++++++++++++++--
 2 files changed, 331 insertions(+), 372 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/serial/driver.rst b/Documentation/driver-api/serial/driver.rst
index ee1679858aa2..cb0ec6db4f1e 100644
--- a/Documentation/driver-api/serial/driver.rst
+++ b/Documentation/driver-api/serial/driver.rst
@@ -63,362 +63,8 @@ commonly referred to as the port mutex.
 uart_ops
 --------
 
-The uart_ops structure is the main interface between serial_core and the
-hardware specific driver.  It contains all the methods to control the
-hardware.
-
-  tx_empty(port)
-	This function tests whether the transmitter fifo and shifter
-	for the port described by 'port' is empty.  If it is empty,
-	this function should return TIOCSER_TEMT, otherwise return 0.
-	If the port does not support this operation, then it should
-	return TIOCSER_TEMT.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-	This call must not sleep
-
-  set_mctrl(port, mctrl)
-	This function sets the modem control lines for port described
-	by 'port' to the state described by mctrl.  The relevant bits
-	of mctrl are:
-
-		- TIOCM_RTS	RTS signal.
-		- TIOCM_DTR	DTR signal.
-		- TIOCM_OUT1	OUT1 signal.
-		- TIOCM_OUT2	OUT2 signal.
-		- TIOCM_LOOP	Set the port into loopback mode.
-
-	If the appropriate bit is set, the signal should be driven
-	active.  If the bit is clear, the signal should be driven
-	inactive.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  get_mctrl(port)
-	Returns the current state of modem control inputs.  The state
-	of the outputs should not be returned, since the core keeps
-	track of their state.  The state information should include:
-
-		- TIOCM_CAR	state of DCD signal
-		- TIOCM_CTS	state of CTS signal
-		- TIOCM_DSR	state of DSR signal
-		- TIOCM_RI	state of RI signal
-
-	The bit is set if the signal is currently driven active.  If
-	the port does not support CTS, DCD or DSR, the driver should
-	indicate that the signal is permanently active.  If RI is
-	not available, the signal should not be indicated as active.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  stop_tx(port)
-	Stop transmitting characters.  This might be due to the CTS
-	line becoming inactive or the tty layer indicating we want
-	to stop transmission due to an XOFF character.
-
-	The driver should stop transmitting characters as soon as
-	possible.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  start_tx(port)
-	Start transmitting characters.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  throttle(port)
-	Notify the serial driver that input buffers for the line discipline are
-	close to full, and it should somehow signal that no more characters
-	should be sent to the serial port.
-	This will be called only if hardware assisted flow control is enabled.
-
-	Locking: serialized with .unthrottle() and termios modification by the
-	tty layer.
-
-  unthrottle(port)
-	Notify the serial driver that characters can now be sent to the serial
-	port without fear of overrunning the input buffers of the line
-	disciplines.
-
-	This will be called only if hardware assisted flow control is enabled.
-
-	Locking: serialized with .throttle() and termios modification by the
-	tty layer.
-
-  send_xchar(port,ch)
-	Transmit a high priority character, even if the port is stopped.
-	This is used to implement XON/XOFF flow control and tcflow().  If
-	the serial driver does not implement this function, the tty core
-	will append the character to the circular buffer and then call
-	start_tx() / stop_tx() to flush the data out.
-
-	Do not transmit if ch == '\0' (__DISABLED_CHAR).
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  stop_rx(port)
-	Stop receiving characters; the port is in the process of
-	being closed.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  enable_ms(port)
-	Enable the modem status interrupts.
-
-	This method may be called multiple times.  Modem status
-	interrupts should be disabled when the shutdown method is
-	called.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  break_ctl(port,ctl)
-	Control the transmission of a break signal.  If ctl is
-	nonzero, the break signal should be transmitted.  The signal
-	should be terminated when another call is made with a zero
-	ctl.
-
-	Locking: caller holds tty_port->mutex
-
-  startup(port)
-	Grab any interrupt resources and initialise any low level driver
-	state.  Enable the port for reception.  It should not activate
-	RTS nor DTR; this will be done via a separate call to set_mctrl.
-
-	This method will only be called when the port is initially opened.
-
-	Locking: port_sem taken.
-
-	Interrupts: globally disabled.
-
-  shutdown(port)
-	Disable the port, disable any break condition that may be in
-	effect, and free any interrupt resources.  It should not disable
-	RTS nor DTR; this will have already been done via a separate
-	call to set_mctrl.
-
-	Drivers must not access port->state once this call has completed.
-
-	This method will only be called when there are no more users of
-	this port.
-
-	Locking: port_sem taken.
-
-	Interrupts: caller dependent.
-
-  flush_buffer(port)
-	Flush any write buffers, reset any DMA state and stop any
-	ongoing DMA transfers.
-
-	This will be called whenever the port->state->xmit circular
-	buffer is cleared.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  set_termios(port,termios,oldtermios)
-	Change the port parameters, including word length, parity, stop
-	bits.  Update read_status_mask and ignore_status_mask to indicate
-	the types of events we are interested in receiving.  Relevant
-	termios->c_cflag bits are:
-
-		CSIZE
-			- word size
-		CSTOPB
-			- 2 stop bits
-		PARENB
-			- parity enable
-		PARODD
-			- odd parity (when PARENB is in force)
-		ADDRB
-			- address bit (changed through .rs485_config()).
-		CREAD
-			- enable reception of characters (if not set,
-			  still receive characters from the port, but
-			  throw them away.
-		CRTSCTS
-			- if set, enable CTS status change reporting
-		CLOCAL
-			- if not set, enable modem status change
-			  reporting.
-
-	Relevant termios->c_iflag bits are:
-
-		INPCK
-			- enable frame and parity error events to be
-			  passed to the TTY layer.
-		BRKINT / PARMRK
-			- both of these enable break events to be
-			  passed to the TTY layer.
-
-		IGNPAR
-			- ignore parity and framing errors
-		IGNBRK
-			- ignore break errors,  If IGNPAR is also
-			  set, ignore overrun errors as well.
-
-	The interaction of the iflag bits is as follows (parity error
-	given as an example):
-
-	=============== ======= ======  =============================
-	Parity error	INPCK	IGNPAR
-	=============== ======= ======  =============================
-	n/a		0	n/a	character received, marked as
-					TTY_NORMAL
-	None		1	n/a	character received, marked as
-					TTY_NORMAL
-	Yes		1	0	character received, marked as
-					TTY_PARITY
-	Yes		1	1	character discarded
-	=============== ======= ======  =============================
-
-	Other flags may be used (eg, xon/xoff characters) if your
-	hardware supports hardware "soft" flow control.
-
-	Locking: caller holds tty_port->mutex
-
-	Interrupts: caller dependent.
-
-	This call must not sleep
-
-  set_ldisc(port,termios)
-	Notifier for discipline change. See ../tty/tty_ldisc.rst.
-
-	Locking: caller holds tty_port->mutex
-
-  pm(port,state,oldstate)
-	Perform any power management related activities on the specified
-	port.  State indicates the new state (defined by
-	enum uart_pm_state), oldstate indicates the previous state.
-
-	This function should not be used to grab any resources.
-
-	This will be called when the port is initially opened and finally
-	closed, except when the port is also the system console.  This
-	will occur even if CONFIG_PM is not set.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  type(port)
-	Return a pointer to a string constant describing the specified
-	port, or return NULL, in which case the string 'unknown' is
-	substituted.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  release_port(port)
-	Release any memory and IO region resources currently in use by
-	the port.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  request_port(port)
-	Request any memory and IO region resources required by the port.
-	If any fail, no resources should be registered when this function
-	returns, and it should return -EBUSY on failure.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  config_port(port,type)
-	Perform any autoconfiguration steps required for the port.  `type`
-	contains a bit mask of the required configuration.  UART_CONFIG_TYPE
-	indicates that the port requires detection and identification.
-	port->type should be set to the type found, or PORT_UNKNOWN if
-	no port was detected.
-
-	UART_CONFIG_IRQ indicates autoconfiguration of the interrupt signal,
-	which should be probed using standard kernel autoprobing techniques.
-	This is not necessary on platforms where ports have interrupts
-	internally hard wired (eg, system on a chip implementations).
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  verify_port(port,serinfo)
-	Verify the new serial port information contained within serinfo is
-	suitable for this port type.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  ioctl(port,cmd,arg)
-	Perform any port specific IOCTLs.  IOCTL commands must be defined
-	using the standard numbering system found in <asm/ioctl.h>
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  poll_init(port)
-	Called by kgdb to perform the minimal hardware initialization needed
-	to support poll_put_char() and poll_get_char().  Unlike ->startup()
-	this should not request interrupts.
-
-	Locking: tty_mutex and tty_port->mutex taken.
-
-	Interrupts: n/a.
-
-  poll_put_char(port,ch)
-	Called by kgdb to write a single character directly to the serial
-	port.  It can and should block until there is space in the TX FIFO.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-	This call must not sleep
-
-  poll_get_char(port)
-	Called by kgdb to read a single character directly from the serial
-	port.  If data is available, it should be returned; otherwise
-	the function should return NO_POLL_CHAR immediately.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-	This call must not sleep
+.. kernel-doc:: include/linux/serial_core.h
+   :identifiers: uart_ops
 
 Other functions
 ---------------
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index a6fa7c40c330..35d2f9e8027d 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -31,9 +31,336 @@ struct serial_struct;
 struct device;
 struct gpio_desc;
 
-/*
+/**
+ * struct uart_ops -- interface between serial_core and the driver
+ *
  * This structure describes all the operations that can be done on the
- * physical hardware.  See Documentation/driver-api/serial/driver.rst for details.
+ * physical hardware.
+ *
+ * @tx_empty: ``unsigned int ()(struct uart_port *port)``
+ *
+ *	This function tests whether the transmitter fifo and shifter for the
+ *	@port is empty. If it is empty, this function should return
+ *	%TIOCSER_TEMT, otherwise return 0. If the port does not support this
+ *	operation, then it should return %TIOCSER_TEMT.
+ *
+ *	Locking: none.
+ *	Interrupts: caller dependent.
+ *	This call must not sleep
+ *
+ * @set_mctrl: ``void ()(struct uart_port *port, unsigned int mctrl)``
+ *
+ *	This function sets the modem control lines for @port to the state
+ *	described by @mctrl. The relevant bits of @mctrl are:
+ *
+ *		- %TIOCM_RTS	RTS signal.
+ *		- %TIOCM_DTR	DTR signal.
+ *		- %TIOCM_OUT1	OUT1 signal.
+ *		- %TIOCM_OUT2	OUT2 signal.
+ *		- %TIOCM_LOOP	Set the port into loopback mode.
+ *
+ *	If the appropriate bit is set, the signal should be driven
+ *	active.  If the bit is clear, the signal should be driven
+ *	inactive.
+ *
+ *	Locking: @port->lock taken.
+ *	Interrupts: locally disabled.
+ *	This call must not sleep
+ *
+ * @get_mctrl: ``unsigned int ()(struct uart_port *port)``
+ *
+ *	Returns the current state of modem control inputs of @port. The state
+ *	of the outputs should not be returned, since the core keeps track of
+ *	their state. The state information should include:
+ *
+ *		- %TIOCM_CAR	state of DCD signal
+ *		- %TIOCM_CTS	state of CTS signal
+ *		- %TIOCM_DSR	state of DSR signal
+ *		- %TIOCM_RI	state of RI signal
+ *
+ *	The bit is set if the signal is currently driven active.  If
+ *	the port does not support CTS, DCD or DSR, the driver should
+ *	indicate that the signal is permanently active. If RI is
+ *	not available, the signal should not be indicated as active.
+ *
+ *	Locking: @port->lock taken.
+ *	Interrupts: locally disabled.
+ *	This call must not sleep
+ *
+ * @stop_tx: ``void ()(struct uart_port *port)``
+ *
+ *	Stop transmitting characters. This might be due to the CTS line
+ *	becoming inactive or the tty layer indicating we want to stop
+ *	transmission due to an %XOFF character.
+ *
+ *	The driver should stop transmitting characters as soon as possible.
+ *
+ *	Locking: @port->lock taken.
+ *	Interrupts: locally disabled.
+ *	This call must not sleep
+ *
+ * @start_tx: ``void ()(struct uart_port *port)``
+ *
+ *	Start transmitting characters.
+ *
+ *	Locking: @port->lock taken.
+ *	Interrupts: locally disabled.
+ *	This call must not sleep
+ *
+ * @throttle: ``void ()(struct uart_port *port)``
+ *
+ *	Notify the serial driver that input buffers for the line discipline are
+ *	close to full, and it should somehow signal that no more characters
+ *	should be sent to the serial port.
+ *	This will be called only if hardware assisted flow control is enabled.
+ *
+ *	Locking: serialized with @unthrottle() and termios modification by the
+ *	tty layer.
+ *
+ * @unthrottle: ``void ()(struct uart_port *port)``
+ *
+ *	Notify the serial driver that characters can now be sent to the serial
+ *	port without fear of overrunning the input buffers of the line
+ *	disciplines.
+ *
+ *	This will be called only if hardware assisted flow control is enabled.
+ *
+ *	Locking: serialized with @throttle() and termios modification by the
+ *	tty layer.
+ *
+ * @send_xchar: ``void ()(struct uart_port *port, char ch)``
+ *
+ *	Transmit a high priority character, even if the port is stopped. This
+ *	is used to implement XON/XOFF flow control and tcflow(). If the serial
+ *	driver does not implement this function, the tty core will append the
+ *	character to the circular buffer and then call start_tx() / stop_tx()
+ *	to flush the data out.
+ *
+ *	Do not transmit if @ch == '\0' (%__DISABLED_CHAR).
+ *
+ *	Locking: none.
+ *	Interrupts: caller dependent.
+ *
+ * @stop_rx: ``void ()(struct uart_port *port)``
+ *
+ *	Stop receiving characters; the @port is in the process of being closed.
+ *
+ *	Locking: @port->lock taken.
+ *	Interrupts: locally disabled.
+ *	This call must not sleep
+ *
+ * @enable_ms: ``void ()(struct uart_port *port)``
+ *
+ *	Enable the modem status interrupts.
+ *
+ *	This method may be called multiple times. Modem status interrupts
+ *	should be disabled when the @shutdown() method is called.
+ *
+ *	Locking: @port->lock taken.
+ *	Interrupts: locally disabled.
+ *	This call must not sleep
+ *
+ * @break_ctl: ``void ()(struct uart_port *port, int ctl)``
+ *
+ *	Control the transmission of a break signal. If @ctl is nonzero, the
+ *	break signal should be transmitted. The signal should be terminated
+ *	when another call is made with a zero @ctl.
+ *
+ *	Locking: caller holds tty_port->mutex
+ *
+ * @startup: ``int ()(struct uart_port *port)``
+ *
+ *	Grab any interrupt resources and initialise any low level driver state.
+ *	Enable the port for reception. It should not activate RTS nor DTR;
+ *	this will be done via a separate call to @set_mctrl().
+ *
+ *	This method will only be called when the port is initially opened.
+ *
+ *	Locking: port_sem taken.
+ *	Interrupts: globally disabled.
+ *
+ * @shutdown: ``void ()(struct uart_port *port)``
+ *
+ *	Disable the @port, disable any break condition that may be in effect,
+ *	and free any interrupt resources. It should not disable RTS nor DTR;
+ *	this will have already been done via a separate call to @set_mctrl().
+ *
+ *	Drivers must not access @port->state once this call has completed.
+ *
+ *	This method will only be called when there are no more users of this
+ *	@port.
+ *
+ *	Locking: port_sem taken.
+ *	Interrupts: caller dependent.
+ *
+ * @flush_buffer: ``void ()(struct uart_port *port)``
+ *
+ *	Flush any write buffers, reset any DMA state and stop any ongoing DMA
+ *	transfers.
+ *
+ *	This will be called whenever the @port->state->xmit circular buffer is
+ *	cleared.
+ *
+ *	Locking: @port->lock taken.
+ *	Interrupts: locally disabled.
+ *	This call must not sleep
+ *
+ * @set_termios: ``void ()(struct uart_port *port, struct ktermios *new,
+ *			struct ktermios *old)``
+ *
+ *	Change the @port parameters, including word length, parity, stop bits.
+ *	Update @port->read_status_mask and @port->ignore_status_mask to
+ *	indicate the types of events we are interested in receiving. Relevant
+ *	ktermios::c_cflag bits are:
+ *
+ *	- %CSIZE - word size
+ *	- %CSTOPB - 2 stop bits
+ *	- %PARENB - parity enable
+ *	- %PARODD - odd parity (when %PARENB is in force)
+ *	- %ADDRB - address bit (changed through uart_port::rs485_config()).
+ *	- %CREAD - enable reception of characters (if not set, still receive
+ *	  characters from the port, but throw them away).
+ *	- %CRTSCTS - if set, enable CTS status change reporting.
+ *	- %CLOCAL - if not set, enable modem status change reporting.
+ *
+ *	Relevant ktermios::c_iflag bits are:
+ *
+ *	- %INPCK - enable frame and parity error events to be passed to the TTY
+ *	  layer.
+ *	- %BRKINT / %PARMRK - both of these enable break events to be passed to
+ *	  the TTY layer.
+ *	- %IGNPAR - ignore parity and framing errors.
+ *	- %IGNBRK - ignore break errors. If %IGNPAR is also set, ignore overrun
+ *	  errors as well.
+ *
+ *	The interaction of the ktermios::c_iflag bits is as follows (parity
+ *	error given as an example):
+ *
+ *	============ ======= ======= =========================================
+ *	Parity error INPCK   IGNPAR
+ *	============ ======= ======= =========================================
+ *	n/a	     0	     n/a     character received, marked as %TTY_NORMAL
+ *	None	     1	     n/a     character received, marked as %TTY_NORMAL
+ *	Yes	     1	     0	     character received, marked as %TTY_PARITY
+ *	Yes	     1	     1	     character discarded
+ *	============ ======= ======= =========================================
+ *
+ *	Other flags may be used (eg, xon/xoff characters) if your hardware
+ *	supports hardware "soft" flow control.
+ *
+ *	Locking: caller holds tty_port->mutex
+ *	Interrupts: caller dependent.
+ *	This call must not sleep
+ *
+ * @set_ldisc: ``void ()(struct uart_port *port, struct ktermios *termios)``
+ *
+ *	Notifier for discipline change. See
+ *	Documentation/driver-api/tty/tty_ldisc.rst.
+ *
+ *	Locking: caller holds tty_port->mutex
+ *
+ * @pm: ``void ()(struct uart_port *port, unsigned int state,
+ *		 unsigned int oldstate)``
+ *
+ *	Perform any power management related activities on the specified @port.
+ *	@state indicates the new state (defined by enum uart_pm_state),
+ *	@oldstate indicates the previous state.
+ *
+ *	This function should not be used to grab any resources.
+ *
+ *	This will be called when the @port is initially opened and finally
+ *	closed, except when the @port is also the system console. This will
+ *	occur even if %CONFIG_PM is not set.
+ *
+ *	Locking: none.
+ *	Interrupts: caller dependent.
+ *
+ * @type: ``const char *()(struct uart_port *port)``
+ *
+ *	Return a pointer to a string constant describing the specified @port,
+ *	or return %NULL, in which case the string 'unknown' is substituted.
+ *
+ *	Locking: none.
+ *	Interrupts: caller dependent.
+ *
+ * @release_port: ``void ()(struct uart_port *port)``
+ *
+ *	Release any memory and IO region resources currently in use by the
+ *	@port.
+ *
+ *	Locking: none.
+ *	Interrupts: caller dependent.
+ *
+ * @request_port: ``int ()(struct uart_port *port)``
+ *
+ *	Request any memory and IO region resources required by the port. If any
+ *	fail, no resources should be registered when this function returns, and
+ *	it should return -%EBUSY on failure.
+ *
+ *	Locking: none.
+ *	Interrupts: caller dependent.
+ *
+ * @config_port: ``void ()(struct uart_port *port, int type)``
+ *
+ *	Perform any autoconfiguration steps required for the @port. @type
+ *	contains a bit mask of the required configuration. %UART_CONFIG_TYPE
+ *	indicates that the port requires detection and identification.
+ *	@port->type should be set to the type found, or %PORT_UNKNOWN if no
+ *	port was detected.
+ *
+ *	%UART_CONFIG_IRQ indicates autoconfiguration of the interrupt signal,
+ *	which should be probed using standard kernel autoprobing techniques.
+ *	This is not necessary on platforms where ports have interrupts
+ *	internally hard wired (eg, system on a chip implementations).
+ *
+ *	Locking: none.
+ *	Interrupts: caller dependent.
+ *
+ * @verify_port: ``int ()(struct uart_port *port,
+ *			struct serial_struct *serinfo)``
+ *
+ *	Verify the new serial port information contained within @serinfo is
+ *	suitable for this port type.
+ *
+ *	Locking: none.
+ *	Interrupts: caller dependent.
+ *
+ * @ioctl: ``int ()(struct uart_port *port, unsigned int cmd,
+ *		unsigned long arg)``
+ *
+ *	Perform any port specific IOCTLs. IOCTL commands must be defined using
+ *	the standard numbering system found in <asm/ioctl.h>.
+ *
+ *	Locking: none.
+ *	Interrupts: caller dependent.
+ *
+ * @poll_init: ``int ()(struct uart_port *port)``
+ *
+ *	Called by kgdb to perform the minimal hardware initialization needed to
+ *	support @poll_put_char() and @poll_get_char(). Unlike @startup(), this
+ *	should not request interrupts.
+ *
+ *	Locking: %tty_mutex and tty_port->mutex taken.
+ *	Interrupts: n/a.
+ *
+ * @poll_put_char: ``void ()(struct uart_port *port, unsigned char ch)``
+ *
+ *	Called by kgdb to write a single character @ch directly to the serial
+ *	@port. It can and should block until there is space in the TX FIFO.
+ *
+ *	Locking: none.
+ *	Interrupts: caller dependent.
+ *	This call must not sleep
+ *
+ * @poll_get_char: ``int ()(struct uart_port *port)``
+ *
+ *	Called by kgdb to read a single character directly from the serial
+ *	port. If data is available, it should be returned; otherwise the
+ *	function should return %NO_POLL_CHAR immediately.
+ *
+ *	Locking: none.
+ *	Interrupts: caller dependent.
+ *	This call must not sleep
  */
 struct uart_ops {
 	unsigned int	(*tx_empty)(struct uart_port *);
@@ -56,22 +383,8 @@ struct uart_ops {
 	void		(*set_ldisc)(struct uart_port *, struct ktermios *);
 	void		(*pm)(struct uart_port *, unsigned int state,
 			      unsigned int oldstate);
-
-	/*
-	 * Return a string describing the type of the port
-	 */
 	const char	*(*type)(struct uart_port *);
-
-	/*
-	 * Release IO and memory resources used by the port.
-	 * This includes iounmap if necessary.
-	 */
 	void		(*release_port)(struct uart_port *);
-
-	/*
-	 * Request IO and memory resources used by the port.
-	 * This includes iomapping the port if necessary.
-	 */
 	int		(*request_port)(struct uart_port *);
 	void		(*config_port)(struct uart_port *, int);
 	int		(*verify_port)(struct uart_port *, struct serial_struct *);
-- 
cgit 


From 5f10376b6bc1e2773f56977980ab08c9e4fa91a7 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 26 Jul 2022 14:56:52 -0700
Subject: add missing includes and forward declarations to networking includes
 under linux/

Similarly to a recent include/net/ cleanup, this patch adds
missing includes to networking headers under include/linux.
All these problems are currently masked by the existing users
including the missing dependency before the broken header.

Link: https://lore.kernel.org/all/20220723045755.2676857-1-kuba@kernel.org/ v1
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20220726215652.158167-1-kuba@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/atm_tcp.h         |  2 ++
 include/linux/dsa/tag_qca.h     |  5 +++++
 include/linux/hippidevice.h     |  4 ++++
 include/linux/if_eql.h          |  1 +
 include/linux/if_hsr.h          |  4 ++++
 include/linux/if_rmnet.h        |  2 ++
 include/linux/if_tap.h          | 11 ++++++-----
 include/linux/mdio/mdio-xgene.h |  4 ++++
 include/linux/nl802154.h        |  2 ++
 include/linux/phy_fixed.h       |  3 +++
 include/linux/ppp-comp.h        |  2 +-
 include/linux/ppp_channel.h     |  2 ++
 include/linux/ptp_kvm.h         |  2 ++
 include/linux/ptp_pch.h         |  4 ++++
 include/linux/seq_file_net.h    |  1 +
 include/linux/sungem_phy.h      |  2 ++
 include/linux/usb/usbnet.h      |  6 ++++++
 include/net/llc_s_st.h          |  6 ++++++
 18 files changed, 57 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/atm_tcp.h b/include/linux/atm_tcp.h
index c8ecf6f68fb5..2558439d849b 100644
--- a/include/linux/atm_tcp.h
+++ b/include/linux/atm_tcp.h
@@ -9,6 +9,8 @@
 
 #include <uapi/linux/atm_tcp.h>
 
+struct atm_vcc;
+struct module;
 
 struct atm_tcp_ops {
 	int (*attach)(struct atm_vcc *vcc,int itf);
diff --git a/include/linux/dsa/tag_qca.h b/include/linux/dsa/tag_qca.h
index 4359fb0221cf..50be7cbd93a5 100644
--- a/include/linux/dsa/tag_qca.h
+++ b/include/linux/dsa/tag_qca.h
@@ -3,6 +3,11 @@
 #ifndef __TAG_QCA_H
 #define __TAG_QCA_H
 
+#include <linux/types.h>
+
+struct dsa_switch;
+struct sk_buff;
+
 #define QCA_HDR_LEN	2
 #define QCA_HDR_VERSION	0x2
 
diff --git a/include/linux/hippidevice.h b/include/linux/hippidevice.h
index 9dc01f7ab5b4..07414c241e65 100644
--- a/include/linux/hippidevice.h
+++ b/include/linux/hippidevice.h
@@ -23,6 +23,10 @@
 
 #ifdef __KERNEL__
 
+struct neigh_parms;
+struct net_device;
+struct sk_buff;
+
 struct hippi_cb {
 	__u32	ifield;
 };
diff --git a/include/linux/if_eql.h b/include/linux/if_eql.h
index d75601d613cc..07f9b660b741 100644
--- a/include/linux/if_eql.h
+++ b/include/linux/if_eql.h
@@ -21,6 +21,7 @@
 
 #include <linux/timer.h>
 #include <linux/spinlock.h>
+#include <net/net_trackers.h>
 #include <uapi/linux/if_eql.h>
 
 typedef struct slave {
diff --git a/include/linux/if_hsr.h b/include/linux/if_hsr.h
index 408539d5ea5f..0404f5bf4f30 100644
--- a/include/linux/if_hsr.h
+++ b/include/linux/if_hsr.h
@@ -2,6 +2,10 @@
 #ifndef _LINUX_IF_HSR_H_
 #define _LINUX_IF_HSR_H_
 
+#include <linux/types.h>
+
+struct net_device;
+
 /* used to differentiate various protocols */
 enum hsr_version {
 	HSR_V0 = 0,
diff --git a/include/linux/if_rmnet.h b/include/linux/if_rmnet.h
index 10e7521ecb6c..839d1e48b85e 100644
--- a/include/linux/if_rmnet.h
+++ b/include/linux/if_rmnet.h
@@ -5,6 +5,8 @@
 #ifndef _LINUX_IF_RMNET_H_
 #define _LINUX_IF_RMNET_H_
 
+#include <linux/types.h>
+
 struct rmnet_map_header {
 	u8 flags;			/* MAP_CMD_FLAG, MAP_PAD_LEN_MASK */
 	u8 mux_id;
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 915a187cfabd..553552fa635c 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -2,14 +2,18 @@
 #ifndef _LINUX_IF_TAP_H_
 #define _LINUX_IF_TAP_H_
 
+#include <net/sock.h>
+#include <linux/skb_array.h>
+
+struct file;
+struct socket;
+
 #if IS_ENABLED(CONFIG_TAP)
 struct socket *tap_get_socket(struct file *);
 struct ptr_ring *tap_get_ptr_ring(struct file *file);
 #else
 #include <linux/err.h>
 #include <linux/errno.h>
-struct file;
-struct socket;
 static inline struct socket *tap_get_socket(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
@@ -20,9 +24,6 @@ static inline struct ptr_ring *tap_get_ptr_ring(struct file *f)
 }
 #endif /* CONFIG_TAP */
 
-#include <net/sock.h>
-#include <linux/skb_array.h>
-
 /*
  * Maximum times a tap device can be opened. This can be used to
  * configure the number of receive queue, e.g. for multiqueue virtio.
diff --git a/include/linux/mdio/mdio-xgene.h b/include/linux/mdio/mdio-xgene.h
index 8af93ada8b64..9e588965dc83 100644
--- a/include/linux/mdio/mdio-xgene.h
+++ b/include/linux/mdio/mdio-xgene.h
@@ -8,6 +8,10 @@
 #ifndef __MDIO_XGENE_H__
 #define __MDIO_XGENE_H__
 
+#include <linux/bits.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
 #define BLOCK_XG_MDIO_CSR_OFFSET	0x5000
 #define BLOCK_DIAG_CSR_OFFSET		0xd000
 #define XGENET_CONFIG_REG_ADDR		0x20
diff --git a/include/linux/nl802154.h b/include/linux/nl802154.h
index b22782225f27..cbe5fd1dd2e7 100644
--- a/include/linux/nl802154.h
+++ b/include/linux/nl802154.h
@@ -8,6 +8,8 @@
 #ifndef NL802154_H
 #define NL802154_H
 
+#include <net/netlink.h>
+
 #define IEEE802154_NL_NAME "802.15.4 MAC"
 #define IEEE802154_MCAST_COORD_NAME "coordinator"
 #define IEEE802154_MCAST_BEACON_NAME "beacon"
diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h
index 52bc8e487ef7..1acafd86ab13 100644
--- a/include/linux/phy_fixed.h
+++ b/include/linux/phy_fixed.h
@@ -2,6 +2,8 @@
 #ifndef __PHY_FIXED_H
 #define __PHY_FIXED_H
 
+#include <linux/types.h>
+
 struct fixed_phy_status {
 	int link;
 	int speed;
@@ -12,6 +14,7 @@ struct fixed_phy_status {
 
 struct device_node;
 struct gpio_desc;
+struct net_device;
 
 #if IS_ENABLED(CONFIG_FIXED_PHY)
 extern int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier);
diff --git a/include/linux/ppp-comp.h b/include/linux/ppp-comp.h
index 9d3ffc8f5ea6..fb847e47f148 100644
--- a/include/linux/ppp-comp.h
+++ b/include/linux/ppp-comp.h
@@ -9,7 +9,7 @@
 
 #include <uapi/linux/ppp-comp.h>
 
-
+struct compstat;
 struct module;
 
 /*
diff --git a/include/linux/ppp_channel.h b/include/linux/ppp_channel.h
index 91f9a928344e..45e6e427ceb8 100644
--- a/include/linux/ppp_channel.h
+++ b/include/linux/ppp_channel.h
@@ -20,6 +20,8 @@
 #include <linux/poll.h>
 #include <net/net_namespace.h>
 
+struct net_device_path;
+struct net_device_path_ctx;
 struct ppp_channel;
 
 struct ppp_channel_ops {
diff --git a/include/linux/ptp_kvm.h b/include/linux/ptp_kvm.h
index f960a719f0d5..c2e28deef33a 100644
--- a/include/linux/ptp_kvm.h
+++ b/include/linux/ptp_kvm.h
@@ -8,6 +8,8 @@
 #ifndef _PTP_KVM_H_
 #define _PTP_KVM_H_
 
+#include <linux/types.h>
+
 struct timespec64;
 struct clocksource;
 
diff --git a/include/linux/ptp_pch.h b/include/linux/ptp_pch.h
index 51818198c292..7ba643b62c15 100644
--- a/include/linux/ptp_pch.h
+++ b/include/linux/ptp_pch.h
@@ -10,6 +10,10 @@
 #ifndef _PTP_PCH_H_
 #define _PTP_PCH_H_
 
+#include <linux/types.h>
+
+struct pci_dev;
+
 void pch_ch_control_write(struct pci_dev *pdev, u32 val);
 u32  pch_ch_event_read(struct pci_dev *pdev);
 void pch_ch_event_write(struct pci_dev *pdev, u32 val);
diff --git a/include/linux/seq_file_net.h b/include/linux/seq_file_net.h
index b97912fdbae7..79638395bc32 100644
--- a/include/linux/seq_file_net.h
+++ b/include/linux/seq_file_net.h
@@ -3,6 +3,7 @@
 #define __SEQ_FILE_NET_H__
 
 #include <linux/seq_file.h>
+#include <net/net_trackers.h>
 
 struct net;
 extern struct net init_net;
diff --git a/include/linux/sungem_phy.h b/include/linux/sungem_phy.h
index 3a11fa41a131..c505f30e8b68 100644
--- a/include/linux/sungem_phy.h
+++ b/include/linux/sungem_phy.h
@@ -2,6 +2,8 @@
 #ifndef __SUNGEM_PHY_H__
 #define __SUNGEM_PHY_H__
 
+#include <linux/types.h>
+
 struct mii_phy;
 
 /* Operations supported by any kind of PHY */
diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
index 1b4d72d5e891..b42b72391a8d 100644
--- a/include/linux/usb/usbnet.h
+++ b/include/linux/usb/usbnet.h
@@ -23,6 +23,12 @@
 #ifndef	__LINUX_USB_USBNET_H
 #define	__LINUX_USB_USBNET_H
 
+#include <linux/mii.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/usb.h>
+
 /* interface from usbnet core to each USB networking link we handle */
 struct usbnet {
 	/* housekeeping */
diff --git a/include/net/llc_s_st.h b/include/net/llc_s_st.h
index c4359e203013..ed5b2fa40d32 100644
--- a/include/net/llc_s_st.h
+++ b/include/net/llc_s_st.h
@@ -12,6 +12,12 @@
  * See the GNU General Public License for more details.
  */
 
+#include <linux/types.h>
+#include <net/llc_s_ac.h>
+#include <net/llc_s_ev.h>
+
+struct llc_sap_state_trans;
+
 #define LLC_NR_SAP_STATES	2       /* size of state table */
 
 /* structures and types */
-- 
cgit 


From 7fb48d25b5ce3bc488dbb019bf1736248181de9a Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Wed, 27 Jul 2022 19:16:34 +0900
Subject: can: dev: add generic function can_ethtool_op_get_ts_info_hwts()

Add function can_ethtool_op_get_ts_info_hwts(). This function will be
used by CAN devices with hardware TX/RX timestamping support to
implement ethtool_ops::get_ts_info. This function does not offer
support to activate/deactivate hardware timestamps at device level nor
support the filter options (which is currently the case for all CAN
devices with hardware timestamping support).

The fact that hardware timestamp can not be deactivated at hardware
level does not impact the userland. As long as the user do not set
SO_TIMESTAMPING using a setsockopt() or ioctl(), the kernel will not
emit TX timestamps (RX timestamps will still be reproted as it is the
case currently).

Drivers which need more fine grained control remains free to implement
their own function, but we foresee that the generic function
introduced here will be sufficient for the majority.

Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Link: https://lore.kernel.org/all/20220727101641.198847-8-mailhol.vincent@wanadoo.fr
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/dev.c | 21 +++++++++++++++++++++
 include/linux/can/dev.h   |  3 +++
 2 files changed, 24 insertions(+)

(limited to 'include')

diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c
index 523eaacfe29e..f307034ff4fd 100644
--- a/drivers/net/can/dev/dev.c
+++ b/drivers/net/can/dev/dev.c
@@ -322,6 +322,27 @@ int can_change_mtu(struct net_device *dev, int new_mtu)
 }
 EXPORT_SYMBOL_GPL(can_change_mtu);
 
+/* generic implementation of ethtool_ops::get_ts_info for CAN devices
+ * supporting hardware timestamps
+ */
+int can_ethtool_op_get_ts_info_hwts(struct net_device *dev,
+				    struct ethtool_ts_info *info)
+{
+	info->so_timestamping =
+		SOF_TIMESTAMPING_TX_SOFTWARE |
+		SOF_TIMESTAMPING_RX_SOFTWARE |
+		SOF_TIMESTAMPING_SOFTWARE |
+		SOF_TIMESTAMPING_TX_HARDWARE |
+		SOF_TIMESTAMPING_RX_HARDWARE |
+		SOF_TIMESTAMPING_RAW_HARDWARE;
+	info->phc_index = -1;
+	info->tx_types = BIT(HWTSTAMP_TX_ON);
+	info->rx_filters = BIT(HWTSTAMP_FILTER_ALL);
+
+	return 0;
+}
+EXPORT_SYMBOL(can_ethtool_op_get_ts_info_hwts);
+
 /* Common open function when the device gets opened.
  *
  * This function should be called in the open function of the device
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index e22dc03c850e..752bd45d8ebf 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -20,6 +20,7 @@
 #include <linux/can/length.h>
 #include <linux/can/netlink.h>
 #include <linux/can/skb.h>
+#include <linux/ethtool.h>
 #include <linux/netdevice.h>
 
 /*
@@ -162,6 +163,8 @@ struct can_priv *safe_candev_priv(struct net_device *dev);
 int open_candev(struct net_device *dev);
 void close_candev(struct net_device *dev);
 int can_change_mtu(struct net_device *dev, int new_mtu);
+int can_ethtool_op_get_ts_info_hwts(struct net_device *dev,
+				    struct ethtool_ts_info *info);
 
 int register_candev(struct net_device *dev);
 void unregister_candev(struct net_device *dev);
-- 
cgit 


From 90f942c5a6d775bad1be33ba214755314105da4a Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Wed, 27 Jul 2022 19:16:35 +0900
Subject: can: dev: add generic function can_eth_ioctl_hwts()

Tools based on libpcap (such as tcpdump) expect the SIOCSHWTSTAMP
ioctl call to be supported. This is also specified in the kernel doc
[1]. The purpose of this ioctl is to toggle the hardware timestamps.

Currently, CAN devices which support hardware timestamping have those
always activated. can_eth_ioctl_hwts() is a dumb function that will
always succeed when requested to set tx_type to HWTSTAMP_TX_ON or
rx_filter to HWTSTAMP_FILTER_ALL.

[1] Kernel doc: Timestamping, section 3.1 "Hardware Timestamping
Implementation: Device Drivers"
Link: https://docs.kernel.org/networking/timestamping.html#hardware-timestamping-implementation-device-drivers

Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Link: https://lore.kernel.org/all/20220727101641.198847-9-mailhol.vincent@wanadoo.fr
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/dev.c | 29 +++++++++++++++++++++++++++++
 include/linux/can/dev.h   |  1 +
 2 files changed, 30 insertions(+)

(limited to 'include')

diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c
index f307034ff4fd..c1956b1e9faf 100644
--- a/drivers/net/can/dev/dev.c
+++ b/drivers/net/can/dev/dev.c
@@ -322,6 +322,35 @@ int can_change_mtu(struct net_device *dev, int new_mtu)
 }
 EXPORT_SYMBOL_GPL(can_change_mtu);
 
+/* generic implementation of netdev_ops::ndo_eth_ioctl for CAN devices
+ * supporting hardware timestamps
+ */
+int can_eth_ioctl_hwts(struct net_device *netdev, struct ifreq *ifr, int cmd)
+{
+	struct hwtstamp_config hwts_cfg = { 0 };
+
+	switch (cmd) {
+	case SIOCSHWTSTAMP: /* set */
+		if (copy_from_user(&hwts_cfg, ifr->ifr_data, sizeof(hwts_cfg)))
+			return -EFAULT;
+		if (hwts_cfg.tx_type == HWTSTAMP_TX_ON &&
+		    hwts_cfg.rx_filter == HWTSTAMP_FILTER_ALL)
+			return 0;
+		return -ERANGE;
+
+	case SIOCGHWTSTAMP: /* get */
+		hwts_cfg.tx_type = HWTSTAMP_TX_ON;
+		hwts_cfg.rx_filter = HWTSTAMP_FILTER_ALL;
+		if (copy_to_user(ifr->ifr_data, &hwts_cfg, sizeof(hwts_cfg)))
+			return -EFAULT;
+		return 0;
+
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+EXPORT_SYMBOL(can_eth_ioctl_hwts);
+
 /* generic implementation of ethtool_ops::get_ts_info for CAN devices
  * supporting hardware timestamps
  */
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 752bd45d8ebf..c3e50e537e39 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -163,6 +163,7 @@ struct can_priv *safe_candev_priv(struct net_device *dev);
 int open_candev(struct net_device *dev);
 void close_candev(struct net_device *dev);
 int can_change_mtu(struct net_device *dev, int new_mtu);
+int can_eth_ioctl_hwts(struct net_device *netdev, struct ifreq *ifr, int cmd);
 int can_ethtool_op_get_ts_info_hwts(struct net_device *dev,
 				    struct ethtool_ts_info *info);
 
-- 
cgit 


From cceeeb6a6d02e7b9a74ddd27a3225013b34174aa Mon Sep 17 00:00:00 2001
From: Juri Lelli <juri.lelli@redhat.com>
Date: Mon, 27 Jun 2022 11:50:51 +0200
Subject: wait: Fix __wait_event_hrtimeout for RT/DL tasks

Changes to hrtimer mode (potentially made by __hrtimer_init_sleeper on
PREEMPT_RT) are not visible to hrtimer_start_range_ns, thus not
accounted for by hrtimer_start_expires call paths. In particular,
__wait_event_hrtimeout suffers from this problem as we have, for
example:

fs/aio.c::read_events
  wait_event_interruptible_hrtimeout
    __wait_event_hrtimeout
      hrtimer_init_sleeper_on_stack <- this might "mode |= HRTIMER_MODE_HARD"
                                       on RT if task runs at RT/DL priority
        hrtimer_start_range_ns
          WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard)
          fires since the latter doesn't see the change of mode done by
          init_sleeper

Fix it by making __wait_event_hrtimeout call hrtimer_sleeper_start_expires,
which is aware of the special RT/DL case, instead of hrtimer_start_range_ns.

Reported-by: Bruno Goncalves <bgoncalv@redhat.com>
Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Link: https://lore.kernel.org/r/20220627095051.42470-1-juri.lelli@redhat.com
---
 include/linux/wait.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 851e07da2583..58cfbf81447c 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -544,10 +544,11 @@ do {										\
 										\
 	hrtimer_init_sleeper_on_stack(&__t, CLOCK_MONOTONIC,			\
 				      HRTIMER_MODE_REL);			\
-	if ((timeout) != KTIME_MAX)						\
-		hrtimer_start_range_ns(&__t.timer, timeout,			\
-				       current->timer_slack_ns,			\
-				       HRTIMER_MODE_REL);			\
+	if ((timeout) != KTIME_MAX) {						\
+		hrtimer_set_expires_range_ns(&__t.timer, timeout,		\
+					current->timer_slack_ns);		\
+		hrtimer_sleeper_start_expires(&__t, HRTIMER_MODE_REL);		\
+	}									\
 										\
 	__ret = ___wait_event(wq_head, condition, state, 0, 0,			\
 		if (!__t.task) {						\
-- 
cgit 


From 3f7ced7ac9af43fbc0b886aa9ef4397a5fa4b6e1 Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Mon, 13 Jun 2022 13:43:25 +0100
Subject: drivers/thermal/cpufreq_cooling : Refactor
 thermal_power_cpu_get_power tracing

Simplify the thermal_power_cpu_get_power trace event by removing
complicated cpumask and variable length array. Now the tools parsing trace
output don't have to hassle to get this power data. The simplified format
version uses 'policy->cpu'. Remove also the 'load' information completely
since there is very little value of it in this trace event. To get the
CPUs' load (or utilization) there are other dedicated trace hooks in the
kernel. This patch also simplifies and speeds-up the main cooling code
when that trace event is enabled.

Rename the trace event to avoid confusion of tools which parse the trace
file.

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Link: https://lore.kernel.org/r/20220613124327.30766-3-lukasz.luba@arm.com
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/cpufreq_cooling.c | 18 +-----------------
 include/trace/events/thermal.h    | 28 ++++++++--------------------
 2 files changed, 9 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index ad8b86f5281b..492a67e267e8 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -216,16 +216,9 @@ static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev,
 	u32 total_load = 0;
 	struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
 	struct cpufreq_policy *policy = cpufreq_cdev->policy;
-	u32 *load_cpu = NULL;
 
 	freq = cpufreq_quick_get(policy->cpu);
 
-	if (trace_thermal_power_cpu_get_power_enabled()) {
-		u32 ncpus = cpumask_weight(policy->related_cpus);
-
-		load_cpu = kcalloc(ncpus, sizeof(*load_cpu), GFP_KERNEL);
-	}
-
 	for_each_cpu(cpu, policy->related_cpus) {
 		u32 load;
 
@@ -235,22 +228,13 @@ static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev,
 			load = 0;
 
 		total_load += load;
-		if (load_cpu)
-			load_cpu[i] = load;
-
-		i++;
 	}
 
 	cpufreq_cdev->last_load = total_load;
 
 	*power = get_dynamic_power(cpufreq_cdev, freq);
 
-	if (load_cpu) {
-		trace_thermal_power_cpu_get_power(policy->related_cpus, freq,
-						  load_cpu, i, *power);
-
-		kfree(load_cpu);
-	}
+	trace_thermal_power_cpu_get_power_simple(policy->cpu, *power);
 
 	return 0;
 }
diff --git a/include/trace/events/thermal.h b/include/trace/events/thermal.h
index 8a5f04888abd..e58bf3072f32 100644
--- a/include/trace/events/thermal.h
+++ b/include/trace/events/thermal.h
@@ -92,34 +92,22 @@ TRACE_EVENT(thermal_zone_trip,
 );
 
 #ifdef CONFIG_CPU_THERMAL
-TRACE_EVENT(thermal_power_cpu_get_power,
-	TP_PROTO(const struct cpumask *cpus, unsigned long freq, u32 *load,
-		size_t load_len, u32 dynamic_power),
+TRACE_EVENT(thermal_power_cpu_get_power_simple,
+	TP_PROTO(int cpu, u32 power),
 
-	TP_ARGS(cpus, freq, load, load_len, dynamic_power),
+	TP_ARGS(cpu, power),
 
 	TP_STRUCT__entry(
-		__bitmask(cpumask, num_possible_cpus())
-		__field(unsigned long, freq          )
-		__dynamic_array(u32,   load, load_len)
-		__field(size_t,        load_len      )
-		__field(u32,           dynamic_power )
+		__field(int, cpu)
+		__field(u32, power)
 	),
 
 	TP_fast_assign(
-		__assign_bitmask(cpumask, cpumask_bits(cpus),
-				num_possible_cpus());
-		__entry->freq = freq;
-		memcpy(__get_dynamic_array(load), load,
-			load_len * sizeof(*load));
-		__entry->load_len = load_len;
-		__entry->dynamic_power = dynamic_power;
+		__entry->cpu = cpu;
+		__entry->power = power;
 	),
 
-	TP_printk("cpus=%s freq=%lu load={%s} dynamic_power=%d",
-		__get_bitmask(cpumask), __entry->freq,
-		__print_array(__get_dynamic_array(load), __entry->load_len, 4),
-		__entry->dynamic_power)
+	TP_printk("cpu=%d power=%u", __entry->cpu, __entry->power)
 );
 
 TRACE_EVENT(thermal_power_cpu_limit,
-- 
cgit 


From 4102c4042a33c682021683ec26c2dca3fd9d7cc2 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Wed, 29 Jun 2022 17:10:12 +0200
Subject: thermal/core: Remove DROP_FULL and RAISE_FULL

The trends DROP_FULL and RAISE_FULL are not used and were never used
in the past AFAICT. Remove these conditions as they seems to not be
handled anywhere.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/20220629151012.3115773-2-daniel.lezcano@linaro.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/gov_step_wise.c | 11 -----------
 include/linux/thermal.h         |  2 --
 2 files changed, 13 deletions(-)

(limited to 'include')

diff --git a/drivers/thermal/gov_step_wise.c b/drivers/thermal/gov_step_wise.c
index 6efbfaf014da..9729b46d0258 100644
--- a/drivers/thermal/gov_step_wise.c
+++ b/drivers/thermal/gov_step_wise.c
@@ -67,10 +67,6 @@ static unsigned long get_target_state(struct thermal_instance *instance,
 			next_target = clamp((cur_state + 1), instance->lower, instance->upper);
 		}
 		break;
-	case THERMAL_TREND_RAISE_FULL:
-		if (throttle)
-			next_target = instance->upper;
-		break;
 	case THERMAL_TREND_DROPPING:
 		if (cur_state <= instance->lower) {
 			if (!throttle)
@@ -81,13 +77,6 @@ static unsigned long get_target_state(struct thermal_instance *instance,
 			}
 		}
 		break;
-	case THERMAL_TREND_DROP_FULL:
-		if (cur_state == instance->lower) {
-			if (!throttle)
-				next_target = THERMAL_NO_TARGET;
-		} else
-			next_target = instance->lower;
-		break;
 	default:
 		break;
 	}
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 365733b428d8..231bac2768fb 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -40,8 +40,6 @@ enum thermal_trend {
 	THERMAL_TREND_STABLE, /* temperature is stable */
 	THERMAL_TREND_RAISING, /* temperature is raising */
 	THERMAL_TREND_DROPPING, /* temperature is dropping */
-	THERMAL_TREND_RAISE_FULL, /* apply highest cooling action */
-	THERMAL_TREND_DROP_FULL, /* apply lowest cooling action */
 };
 
 /* Thermal notification reason */
-- 
cgit 


From 646274ddaf756dc549fc98cc581fc1ce4fef517a Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 22 Jul 2022 22:00:01 +0200
Subject: thermal/of: Move thermal_trip structure to thermal.h

The structure thermal_trip is now generic and will be usable by the
different sensor drivers in place of their own structure.

Move its definition to thermal.h to make it accessible.

Cc: Alexandre Bailon <abailon@baylibre.com>
Cc: Kevin Hilman <khilman@baylibre.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Link: https://lore.kernel.org/r/20220722200007.1839356-5-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/thermal_core.h | 12 ------------
 include/linux/thermal.h        | 12 ++++++++++++
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index ff10cdda056c..60844e2d59bb 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -68,18 +68,6 @@ static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev)
 void thermal_cdev_update(struct thermal_cooling_device *);
 void __thermal_cdev_update(struct thermal_cooling_device *cdev);
 
-/**
- * struct thermal_trip - representation of a point in temperature domain
- * @temperature: temperature value in miliCelsius
- * @hysteresis: relative hysteresis in miliCelsius
- * @type: trip point type
- */
-struct thermal_trip {
-	int temperature;
-	int hysteresis;
-	enum thermal_trip_type type;
-};
-
 int get_tz_trend(struct thermal_zone_device *tz, int trip);
 
 struct thermal_instance *
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 231bac2768fb..7e66970f0464 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -78,6 +78,18 @@ struct thermal_zone_device_ops {
 	void (*critical)(struct thermal_zone_device *);
 };
 
+/**
+ * struct thermal_trip - representation of a point in temperature domain
+ * @temperature: temperature value in miliCelsius
+ * @hysteresis: relative hysteresis in miliCelsius
+ * @type: trip point type
+ */
+struct thermal_trip {
+	int temperature;
+	int hysteresis;
+	enum thermal_trip_type type;
+};
+
 struct thermal_cooling_device_ops {
 	int (*get_max_state) (struct thermal_cooling_device *, unsigned long *);
 	int (*get_cur_state) (struct thermal_cooling_device *, unsigned long *);
-- 
cgit 


From e5bfcd30f88fdb0ce830229e7ccdeddcb7a59b04 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 22 Jul 2022 22:00:04 +0200
Subject: thermal/core: Rename 'trips' to 'num_trips'

In order to use thermal trips defined in the thermal structure, rename
the 'trips' field to 'num_trips' to have the 'trips' field containing the
thermal trip points.

Cc: Alexandre Bailon <abailon@baylibre.com>
Cc: Kevin Hilman <khilman@baylibre.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220722200007.1839356-8-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/gov_fair_share.c        |  6 +++---
 drivers/thermal/gov_power_allocator.c   |  4 ++--
 drivers/thermal/tegra/tegra30-tsensor.c |  2 +-
 drivers/thermal/thermal_core.c          | 20 ++++++++++----------
 drivers/thermal/thermal_helpers.c       |  4 ++--
 drivers/thermal/thermal_netlink.c       |  2 +-
 drivers/thermal/thermal_sysfs.c         | 22 +++++++++++-----------
 include/linux/thermal.h                 |  4 ++--
 8 files changed, 32 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/drivers/thermal/gov_fair_share.c b/drivers/thermal/gov_fair_share.c
index 1e5abf4822be..6a2abcfc648f 100644
--- a/drivers/thermal/gov_fair_share.c
+++ b/drivers/thermal/gov_fair_share.c
@@ -25,10 +25,10 @@ static int get_trip_level(struct thermal_zone_device *tz)
 	int trip_temp;
 	enum thermal_trip_type trip_type;
 
-	if (tz->trips == 0 || !tz->ops->get_trip_temp)
+	if (tz->num_trips == 0 || !tz->ops->get_trip_temp)
 		return 0;
 
-	for (count = 0; count < tz->trips; count++) {
+	for (count = 0; count < tz->num_trips; count++) {
 		tz->ops->get_trip_temp(tz, count, &trip_temp);
 		if (tz->temperature < trip_temp)
 			break;
@@ -53,7 +53,7 @@ static long get_target_state(struct thermal_zone_device *tz,
 
 	cdev->ops->get_max_state(cdev, &max_state);
 
-	return (long)(percentage * level * max_state) / (100 * tz->trips);
+	return (long)(percentage * level * max_state) / (100 * tz->num_trips);
 }
 
 /**
diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c
index 13e375751d22..1d5052470967 100644
--- a/drivers/thermal/gov_power_allocator.c
+++ b/drivers/thermal/gov_power_allocator.c
@@ -527,7 +527,7 @@ static void get_governor_trips(struct thermal_zone_device *tz,
 	last_active = INVALID_TRIP;
 	last_passive = INVALID_TRIP;
 
-	for (i = 0; i < tz->trips; i++) {
+	for (i = 0; i < tz->num_trips; i++) {
 		enum thermal_trip_type type;
 		int ret;
 
@@ -668,7 +668,7 @@ static int power_allocator_bind(struct thermal_zone_device *tz)
 
 	get_governor_trips(tz, params);
 
-	if (tz->trips > 0) {
+	if (tz->num_trips > 0) {
 		ret = tz->ops->get_trip_temp(tz,
 					params->trip_max_desired_temperature,
 					&control_temp);
diff --git a/drivers/thermal/tegra/tegra30-tsensor.c b/drivers/thermal/tegra/tegra30-tsensor.c
index 9b6b693cbcf8..05886684f429 100644
--- a/drivers/thermal/tegra/tegra30-tsensor.c
+++ b/drivers/thermal/tegra/tegra30-tsensor.c
@@ -316,7 +316,7 @@ static void tegra_tsensor_get_hw_channel_trips(struct thermal_zone_device *tzd,
 	*hot_trip  = 85000;
 	*crit_trip = 90000;
 
-	for (i = 0; i < tzd->trips; i++) {
+	for (i = 0; i < tzd->num_trips; i++) {
 		enum thermal_trip_type type;
 		int trip_temp;
 
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 14f31e56c2a0..19350a4925c4 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -501,7 +501,7 @@ void thermal_zone_device_update(struct thermal_zone_device *tz,
 
 	tz->notify_event = event;
 
-	for (count = 0; count < tz->trips; count++)
+	for (count = 0; count < tz->num_trips; count++)
 		handle_thermal_trip(tz, count);
 }
 EXPORT_SYMBOL_GPL(thermal_zone_device_update);
@@ -626,7 +626,7 @@ int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
 	unsigned long max_state;
 	int result, ret;
 
-	if (trip >= tz->trips || trip < 0)
+	if (trip >= tz->num_trips || trip < 0)
 		return -EINVAL;
 
 	list_for_each_entry(pos1, &thermal_tz_list, node) {
@@ -807,7 +807,7 @@ static void __bind(struct thermal_zone_device *tz, int mask,
 {
 	int i, ret;
 
-	for (i = 0; i < tz->trips; i++) {
+	for (i = 0; i < tz->num_trips; i++) {
 		if (mask & (1 << i)) {
 			unsigned long upper, lower;
 
@@ -1053,7 +1053,7 @@ static void __unbind(struct thermal_zone_device *tz, int mask,
 {
 	int i;
 
-	for (i = 0; i < tz->trips; i++)
+	for (i = 0; i < tz->num_trips; i++)
 		if (mask & (1 << i))
 			thermal_zone_unbind_cooling_device(tz, i, cdev);
 }
@@ -1165,7 +1165,7 @@ static void thermal_set_delay_jiffies(unsigned long *delay_jiffies, int delay_ms
 /**
  * thermal_zone_device_register() - register a new thermal zone device
  * @type:	the thermal zone device type
- * @trips:	the number of trip points the thermal zone support
+ * @num_trips:	the number of trip points the thermal zone support
  * @mask:	a bit string indicating the writeablility of trip points
  * @devdata:	private device data
  * @ops:	standard thermal zone device callbacks
@@ -1187,7 +1187,7 @@ static void thermal_set_delay_jiffies(unsigned long *delay_jiffies, int delay_ms
  * IS_ERR*() helpers.
  */
 struct thermal_zone_device *
-thermal_zone_device_register(const char *type, int trips, int mask,
+thermal_zone_device_register(const char *type, int num_trips, int mask,
 			     void *devdata, struct thermal_zone_device_ops *ops,
 			     struct thermal_zone_params *tzp, int passive_delay,
 			     int polling_delay)
@@ -1211,7 +1211,7 @@ thermal_zone_device_register(const char *type, int trips, int mask,
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (trips > THERMAL_MAX_TRIPS || trips < 0 || mask >> trips) {
+	if (num_trips > THERMAL_MAX_TRIPS || num_trips < 0 || mask >> num_trips) {
 		pr_err("Incorrect number of thermal trips\n");
 		return ERR_PTR(-EINVAL);
 	}
@@ -1221,7 +1221,7 @@ thermal_zone_device_register(const char *type, int trips, int mask,
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (trips > 0 && (!ops->get_trip_type || !ops->get_trip_temp))
+	if (num_trips > 0 && (!ops->get_trip_type || !ops->get_trip_temp))
 		return ERR_PTR(-EINVAL);
 
 	tz = kzalloc(sizeof(*tz), GFP_KERNEL);
@@ -1251,7 +1251,7 @@ thermal_zone_device_register(const char *type, int trips, int mask,
 	tz->tzp = tzp;
 	tz->device.class = &thermal_class;
 	tz->devdata = devdata;
-	tz->trips = trips;
+	tz->num_trips = num_trips;
 
 	thermal_set_delay_jiffies(&tz->passive_delay_jiffies, passive_delay);
 	thermal_set_delay_jiffies(&tz->polling_delay_jiffies, polling_delay);
@@ -1269,7 +1269,7 @@ thermal_zone_device_register(const char *type, int trips, int mask,
 	if (result)
 		goto release_device;
 
-	for (count = 0; count < trips; count++) {
+	for (count = 0; count < num_trips; count++) {
 		if (tz->ops->get_trip_type(tz, count, &trip_type) ||
 		    tz->ops->get_trip_temp(tz, count, &trip_temp) ||
 		    !trip_temp)
diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c
index 60bfda1a1db7..690890f054a3 100644
--- a/drivers/thermal/thermal_helpers.c
+++ b/drivers/thermal/thermal_helpers.c
@@ -89,7 +89,7 @@ int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp)
 	ret = tz->ops->get_temp(tz, temp);
 
 	if (IS_ENABLED(CONFIG_THERMAL_EMULATION) && tz->emul_temperature) {
-		for (count = 0; count < tz->trips; count++) {
+		for (count = 0; count < tz->num_trips; count++) {
 			ret = tz->ops->get_trip_type(tz, count, &type);
 			if (!ret && type == THERMAL_TRIP_CRITICAL) {
 				ret = tz->ops->get_trip_temp(tz, count,
@@ -137,7 +137,7 @@ void thermal_zone_set_trips(struct thermal_zone_device *tz)
 	if (!tz->ops->set_trips || !tz->ops->get_trip_hyst)
 		goto exit;
 
-	for (i = 0; i < tz->trips; i++) {
+	for (i = 0; i < tz->num_trips; i++) {
 		int trip_low;
 
 		tz->ops->get_trip_temp(tz, i, &trip_temp);
diff --git a/drivers/thermal/thermal_netlink.c b/drivers/thermal/thermal_netlink.c
index 32fea5174cc0..050d243a5fa1 100644
--- a/drivers/thermal/thermal_netlink.c
+++ b/drivers/thermal/thermal_netlink.c
@@ -469,7 +469,7 @@ static int thermal_genl_cmd_tz_get_trip(struct param *p)
 
 	mutex_lock(&tz->lock);
 
-	for (i = 0; i < tz->trips; i++) {
+	for (i = 0; i < tz->num_trips; i++) {
 
 		enum thermal_trip_type type;
 		int temp, hyst = 0;
diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c
index 1c4aac8464a7..5018459e8dd9 100644
--- a/drivers/thermal/thermal_sysfs.c
+++ b/drivers/thermal/thermal_sysfs.c
@@ -416,15 +416,15 @@ static int create_trip_attrs(struct thermal_zone_device *tz, int mask)
 	int indx;
 
 	/* This function works only for zones with at least one trip */
-	if (tz->trips <= 0)
+	if (tz->num_trips <= 0)
 		return -EINVAL;
 
-	tz->trip_type_attrs = kcalloc(tz->trips, sizeof(*tz->trip_type_attrs),
+	tz->trip_type_attrs = kcalloc(tz->num_trips, sizeof(*tz->trip_type_attrs),
 				      GFP_KERNEL);
 	if (!tz->trip_type_attrs)
 		return -ENOMEM;
 
-	tz->trip_temp_attrs = kcalloc(tz->trips, sizeof(*tz->trip_temp_attrs),
+	tz->trip_temp_attrs = kcalloc(tz->num_trips, sizeof(*tz->trip_temp_attrs),
 				      GFP_KERNEL);
 	if (!tz->trip_temp_attrs) {
 		kfree(tz->trip_type_attrs);
@@ -432,7 +432,7 @@ static int create_trip_attrs(struct thermal_zone_device *tz, int mask)
 	}
 
 	if (tz->ops->get_trip_hyst) {
-		tz->trip_hyst_attrs = kcalloc(tz->trips,
+		tz->trip_hyst_attrs = kcalloc(tz->num_trips,
 					      sizeof(*tz->trip_hyst_attrs),
 					      GFP_KERNEL);
 		if (!tz->trip_hyst_attrs) {
@@ -442,7 +442,7 @@ static int create_trip_attrs(struct thermal_zone_device *tz, int mask)
 		}
 	}
 
-	attrs = kcalloc(tz->trips * 3 + 1, sizeof(*attrs), GFP_KERNEL);
+	attrs = kcalloc(tz->num_trips * 3 + 1, sizeof(*attrs), GFP_KERNEL);
 	if (!attrs) {
 		kfree(tz->trip_type_attrs);
 		kfree(tz->trip_temp_attrs);
@@ -451,7 +451,7 @@ static int create_trip_attrs(struct thermal_zone_device *tz, int mask)
 		return -ENOMEM;
 	}
 
-	for (indx = 0; indx < tz->trips; indx++) {
+	for (indx = 0; indx < tz->num_trips; indx++) {
 		/* create trip type attribute */
 		snprintf(tz->trip_type_attrs[indx].name, THERMAL_NAME_LENGTH,
 			 "trip_point_%d_type", indx);
@@ -478,7 +478,7 @@ static int create_trip_attrs(struct thermal_zone_device *tz, int mask)
 			tz->trip_temp_attrs[indx].attr.store =
 							trip_point_temp_store;
 		}
-		attrs[indx + tz->trips] = &tz->trip_temp_attrs[indx].attr.attr;
+		attrs[indx + tz->num_trips] = &tz->trip_temp_attrs[indx].attr.attr;
 
 		/* create Optional trip hyst attribute */
 		if (!tz->ops->get_trip_hyst)
@@ -496,10 +496,10 @@ static int create_trip_attrs(struct thermal_zone_device *tz, int mask)
 			tz->trip_hyst_attrs[indx].attr.store =
 					trip_point_hyst_store;
 		}
-		attrs[indx + tz->trips * 2] =
+		attrs[indx + tz->num_trips * 2] =
 					&tz->trip_hyst_attrs[indx].attr.attr;
 	}
-	attrs[tz->trips * 3] = NULL;
+	attrs[tz->num_trips * 3] = NULL;
 
 	tz->trips_attribute_group.attrs = attrs;
 
@@ -540,7 +540,7 @@ int thermal_zone_create_device_groups(struct thermal_zone_device *tz,
 	for (i = 0; i < size - 2; i++)
 		groups[i] = thermal_zone_attribute_groups[i];
 
-	if (tz->trips) {
+	if (tz->num_trips) {
 		result = create_trip_attrs(tz, mask);
 		if (result) {
 			kfree(groups);
@@ -561,7 +561,7 @@ void thermal_zone_destroy_device_groups(struct thermal_zone_device *tz)
 	if (!tz)
 		return;
 
-	if (tz->trips)
+	if (tz->num_trips)
 		destroy_trip_attrs(tz);
 
 	kfree(tz->device.groups);
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 7e66970f0464..ae579a70cc1a 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -123,7 +123,7 @@ struct thermal_cooling_device {
  * @trip_hyst_attrs:	attributes for trip points for sysfs: trip hysteresis
  * @mode:		current mode of this thermal zone
  * @devdata:	private pointer for device private data
- * @trips:	number of trip points the thermal zone supports
+ * @num_trips:	number of trip points the thermal zone supports
  * @trips_disabled;	bitmap for disabled trips
  * @passive_delay_jiffies: number of jiffies to wait between polls when
  *			performing passive cooling.
@@ -163,7 +163,7 @@ struct thermal_zone_device {
 	struct thermal_attr *trip_hyst_attrs;
 	enum thermal_device_mode mode;
 	void *devdata;
-	int trips;
+	int num_trips;
 	unsigned long trips_disabled;	/* bitmap for disabled trips */
 	unsigned long passive_delay_jiffies;
 	unsigned long polling_delay_jiffies;
-- 
cgit 


From fae11de507f0e420ba3733729317559e7cea6274 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 22 Jul 2022 22:00:05 +0200
Subject: thermal/core: Add thermal_trip in thermal_zone

The thermal trip points are properties of a thermal zone and the
different sub systems should be able to save them in the thermal zone
structure instead of having their own definition.

Give the opportunity to the drivers to create a thermal zone with
thermal trips which will be accessible directly from the thermal core
framework.

As we added the thermal trip points structure in the thermal zone,
let's extend the thermal zone register function to have the thermal
trip structures as a parameter and store it in the 'trips' field of
the thermal zone structure.

The thermal zone contains the trip point, we can store them directly
when registering the thermal zone. That will allow another step
forward to remove the duplicate thermal zone structure we find in the
thermal_of code.

Cc: Alexandre Bailon <abailon@baylibre.com>
Cc: Kevin Hilman <khilman@baylibre.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220722200007.1839356-9-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/thermal_core.c | 22 +++++++++++++++++-----
 drivers/thermal/thermal_of.c   |  8 +++-----
 include/linux/thermal.h        |  8 ++++++++
 3 files changed, 28 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 19350a4925c4..f50098d33152 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -1163,8 +1163,9 @@ static void thermal_set_delay_jiffies(unsigned long *delay_jiffies, int delay_ms
 }
 
 /**
- * thermal_zone_device_register() - register a new thermal zone device
+ * thermal_zone_device_register_with_trips() - register a new thermal zone device
  * @type:	the thermal zone device type
+ * @trips:	a pointer to an array of thermal trips
  * @num_trips:	the number of trip points the thermal zone support
  * @mask:	a bit string indicating the writeablility of trip points
  * @devdata:	private device data
@@ -1187,10 +1188,10 @@ static void thermal_set_delay_jiffies(unsigned long *delay_jiffies, int delay_ms
  * IS_ERR*() helpers.
  */
 struct thermal_zone_device *
-thermal_zone_device_register(const char *type, int num_trips, int mask,
-			     void *devdata, struct thermal_zone_device_ops *ops,
-			     struct thermal_zone_params *tzp, int passive_delay,
-			     int polling_delay)
+thermal_zone_device_register_with_trips(const char *type, struct thermal_trip *trips, int num_trips, int mask,
+					void *devdata, struct thermal_zone_device_ops *ops,
+					struct thermal_zone_params *tzp, int passive_delay,
+					int polling_delay)
 {
 	struct thermal_zone_device *tz;
 	enum thermal_trip_type trip_type;
@@ -1251,6 +1252,7 @@ thermal_zone_device_register(const char *type, int num_trips, int mask,
 	tz->tzp = tzp;
 	tz->device.class = &thermal_class;
 	tz->devdata = devdata;
+	tz->trips = trips;
 	tz->num_trips = num_trips;
 
 	thermal_set_delay_jiffies(&tz->passive_delay_jiffies, passive_delay);
@@ -1327,6 +1329,16 @@ free_tz:
 	kfree(tz);
 	return ERR_PTR(result);
 }
+
+struct thermal_zone_device *thermal_zone_device_register(const char *type, int ntrips, int mask,
+							 void *devdata, struct thermal_zone_device_ops *ops,
+							 struct thermal_zone_params *tzp, int passive_delay,
+							 int polling_delay)
+{
+	return thermal_zone_device_register_with_trips(type, NULL, ntrips, mask,
+						       devdata, ops, tzp,
+						       passive_delay, polling_delay);
+}
 EXPORT_SYMBOL_GPL(thermal_zone_device_register);
 
 /**
diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c
index 0d04474ed951..d7ff6d558a82 100644
--- a/drivers/thermal/thermal_of.c
+++ b/drivers/thermal/thermal_of.c
@@ -1119,11 +1119,9 @@ int __init of_parse_thermal_zones(void)
 		tzp->slope = tz->slope;
 		tzp->offset = tz->offset;
 
-		zone = thermal_zone_device_register(child->name, tz->ntrips,
-						    mask, tz,
-						    ops, tzp,
-						    tz->passive_delay,
-						    tz->polling_delay);
+		zone = thermal_zone_device_register_with_trips(child->name, tz->trips, tz->ntrips,
+							       mask, tz, ops, tzp, tz->passive_delay,
+							       tz->polling_delay);
 		if (IS_ERR(zone)) {
 			pr_err("Failed to build %pOFn zone %ld\n", child,
 			       PTR_ERR(zone));
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index ae579a70cc1a..1386c713885d 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -123,6 +123,7 @@ struct thermal_cooling_device {
  * @trip_hyst_attrs:	attributes for trip points for sysfs: trip hysteresis
  * @mode:		current mode of this thermal zone
  * @devdata:	private pointer for device private data
+ * @trips:	an array of struct thermal_trip
  * @num_trips:	number of trip points the thermal zone supports
  * @trips_disabled;	bitmap for disabled trips
  * @passive_delay_jiffies: number of jiffies to wait between polls when
@@ -163,6 +164,7 @@ struct thermal_zone_device {
 	struct thermal_attr *trip_hyst_attrs;
 	enum thermal_device_mode mode;
 	void *devdata;
+	struct thermal_trip *trips;
 	int num_trips;
 	unsigned long trips_disabled;	/* bitmap for disabled trips */
 	unsigned long passive_delay_jiffies;
@@ -376,8 +378,14 @@ void devm_thermal_zone_of_sensor_unregister(struct device *dev,
 struct thermal_zone_device *thermal_zone_device_register(const char *, int, int,
 		void *, struct thermal_zone_device_ops *,
 		struct thermal_zone_params *, int, int);
+
 void thermal_zone_device_unregister(struct thermal_zone_device *);
 
+struct thermal_zone_device *
+thermal_zone_device_register_with_trips(const char *, struct thermal_trip *, int, int,
+					void *, struct thermal_zone_device_ops *,
+					struct thermal_zone_params *, int, int);
+
 int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int,
 				     struct thermal_cooling_device *,
 				     unsigned long, unsigned long,
-- 
cgit 


From a0c0c44e9aa2b5da876467083c359b368f3ce95e Mon Sep 17 00:00:00 2001
From: Janosch Frank <frankja@linux.ibm.com>
Date: Wed, 20 Jul 2022 12:43:28 +0000
Subject: s390: add ELF note type for encrypted CPU state of a PV VCPU

The type NT_S390_PV_CPU_DATA note contains the encrypted CPU state of
a PV VCPU. It's only relevant in dumps of s390 PV VMs and can't be
decrypted without a second block of encrypted data which provides key
parts. Therefore we only reserve the note type here.

The zgetdump tool from the s390-tools package can, together with a
Customer Communication Key, be used to convert a PV VM dump into a
normal VM dump. zgetdump will decrypt the CPU data and overwrite the
other respective notes to make the data accessible for crash and other
debugging tools.

Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
Acked-by: Heiko Carstens <hca@linux.ibm.com>
[agordeev@linux.ibm.com changed desctiption]
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
---
 include/uapi/linux/elf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index 2b9f5e9985e5..c7b056af9ef0 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -420,6 +420,7 @@ typedef struct elf64_shdr {
 #define NT_S390_GS_CB	0x30b		/* s390 guarded storage registers */
 #define NT_S390_GS_BC	0x30c		/* s390 guarded storage broadcast control block */
 #define NT_S390_RI_CB	0x30d		/* s390 runtime instrumentation */
+#define NT_S390_PV_CPU_DATA	0x30e	/* s390 protvirt cpu dump data */
 #define NT_ARM_VFP	0x400		/* ARM VFP/NEON registers */
 #define NT_ARM_TLS	0x401		/* ARM TLS register */
 #define NT_ARM_HW_BREAK	0x402		/* ARM hardware breakpoint registers */
-- 
cgit 


From 85f0173df35e5462d89947135a6a5599c6c3ef6f Mon Sep 17 00:00:00 2001
From: Ziyang Xuan <william.xuanziyang@huawei.com>
Date: Thu, 28 Jul 2022 09:33:07 +0800
Subject: ipv6/addrconf: fix a null-ptr-deref bug for ip6_ptr

Change net device's MTU to smaller than IPV6_MIN_MTU or unregister
device while matching route. That may trigger null-ptr-deref bug
for ip6_ptr probability as following.

=========================================================
BUG: KASAN: null-ptr-deref in find_match.part.0+0x70/0x134
Read of size 4 at addr 0000000000000308 by task ping6/263

CPU: 2 PID: 263 Comm: ping6 Not tainted 5.19.0-rc7+ #14
Call trace:
 dump_backtrace+0x1a8/0x230
 show_stack+0x20/0x70
 dump_stack_lvl+0x68/0x84
 print_report+0xc4/0x120
 kasan_report+0x84/0x120
 __asan_load4+0x94/0xd0
 find_match.part.0+0x70/0x134
 __find_rr_leaf+0x408/0x470
 fib6_table_lookup+0x264/0x540
 ip6_pol_route+0xf4/0x260
 ip6_pol_route_output+0x58/0x70
 fib6_rule_lookup+0x1a8/0x330
 ip6_route_output_flags_noref+0xd8/0x1a0
 ip6_route_output_flags+0x58/0x160
 ip6_dst_lookup_tail+0x5b4/0x85c
 ip6_dst_lookup_flow+0x98/0x120
 rawv6_sendmsg+0x49c/0xc70
 inet_sendmsg+0x68/0x94

Reproducer as following:
Firstly, prepare conditions:
$ip netns add ns1
$ip netns add ns2
$ip link add veth1 type veth peer name veth2
$ip link set veth1 netns ns1
$ip link set veth2 netns ns2
$ip netns exec ns1 ip -6 addr add 2001:0db8:0:f101::1/64 dev veth1
$ip netns exec ns2 ip -6 addr add 2001:0db8:0:f101::2/64 dev veth2
$ip netns exec ns1 ifconfig veth1 up
$ip netns exec ns2 ifconfig veth2 up
$ip netns exec ns1 ip -6 route add 2000::/64 dev veth1 metric 1
$ip netns exec ns2 ip -6 route add 2001::/64 dev veth2 metric 1

Secondly, execute the following two commands in two ssh windows
respectively:
$ip netns exec ns1 sh
$while true; do ip -6 addr add 2001:0db8:0:f101::1/64 dev veth1; ip -6 route add 2000::/64 dev veth1 metric 1; ping6 2000::2; done

$ip netns exec ns1 sh
$while true; do ip link set veth1 mtu 1000; ip link set veth1 mtu 1500; sleep 5; done

It is because ip6_ptr has been assigned to NULL in addrconf_ifdown() firstly,
then ip6_ignore_linkdown() accesses ip6_ptr directly without NULL check.

	cpu0			cpu1
fib6_table_lookup
__find_rr_leaf
			addrconf_notify [ NETDEV_CHANGEMTU ]
			addrconf_ifdown
			RCU_INIT_POINTER(dev->ip6_ptr, NULL)
find_match
ip6_ignore_linkdown

So we can add NULL check for ip6_ptr before using in ip6_ignore_linkdown() to
fix the null-ptr-deref bug.

Fixes: dcd1f572954f ("net/ipv6: Remove fib6_idev")
Signed-off-by: Ziyang Xuan <william.xuanziyang@huawei.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20220728013307.656257-1-william.xuanziyang@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/addrconf.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index f7506f08e505..c04f359655b8 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -405,6 +405,9 @@ static inline bool ip6_ignore_linkdown(const struct net_device *dev)
 {
 	const struct inet6_dev *idev = __in6_dev_get(dev);
 
+	if (unlikely(!idev))
+		return true;
+
 	return !!idev->cnf.ignore_routes_with_linkdown;
 }
 
-- 
cgit 


From 6dd71251b9aeedd540fd7003bc5f73d59dd6dcb2 Mon Sep 17 00:00:00 2001
From: Xin Gao <gaoxin@cdjrlc.com>
Date: Fri, 22 Jul 2022 10:23:37 +0800
Subject: platform/x86: pmc_atom: Fix comment typo

The double `of' is duplicated in line 50, remove one.

Signed-off-by: Xin Gao <gaoxin@cdjrlc.com>
Link: https://lore.kernel.org/r/20220722022337.15903-1-gaoxin@cdjrlc.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/platform_data/x86/pmc_atom.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/platform_data/x86/pmc_atom.h b/include/linux/platform_data/x86/pmc_atom.h
index 6807839c718b..3edfb6d4e67a 100644
--- a/include/linux/platform_data/x86/pmc_atom.h
+++ b/include/linux/platform_data/x86/pmc_atom.h
@@ -47,7 +47,7 @@
 #define	PMC_S0I2_TMR		0x88
 #define	PMC_S0I3_TMR		0x8C
 #define	PMC_S0_TMR		0x90
-/* Sleep state counter is in units of of 32us */
+/* Sleep state counter is in units of 32us */
 #define	PMC_TMR_SHIFT		5
 
 /* Power status of power islands */
-- 
cgit 


From 9dd1cd3220eca534f2d47afad7ce85f4c40118d8 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@kernel.org>
Date: Wed, 20 Jul 2022 13:58:04 -0400
Subject: dm: fix dm-raid crash if md_handle_request() splits bio

Commit ca522482e3eaf ("dm: pass NULL bdev to bio_alloc_clone")
introduced the optimization to _not_ perform bio_associate_blkg()'s
relatively costly work when DM core clones its bio. But in doing so it
exposed the possibility for DM's cloned bio to alter DM target
behavior (e.g. crash) if a target were to issue IO without first
calling bio_set_dev().

The DM raid target can trigger an MD crash due to its need to split
the DM bio that is passed to md_handle_request(). The split will
recurse to submit_bio_noacct() using a bio with an uninitialized
->bi_blkg. This NULL bio->bi_blkg causes blk_throtl_bio() to
dereference a NULL blkg_to_tg(bio->bi_blkg).

Fix this in DM core by adding a new 'needs_bio_set_dev' target flag that
will make alloc_tio() call bio_set_dev() on behalf of the target.
dm-raid is the only target that requires this flag. bio_set_dev()
initializes the DM cloned bio's ->bi_blkg, using bio_associate_blkg,
before passing the bio to md_handle_request().

Long-term fix would be to audit and refactor MD code to rely on DM to
split its bio, using dm_accept_partial_bio(), but there are MD raid
personalities (e.g. raid1 and raid10) whose implementation are tightly
coupled to handling the bio splitting inline.

Fixes: ca522482e3eaf ("dm: pass NULL bdev to bio_alloc_clone")
Cc: stable@vger.kernel.org
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 drivers/md/dm-raid.c          |  1 +
 drivers/md/dm.c               | 13 ++++++-------
 include/linux/device-mapper.h |  6 ++++++
 include/uapi/linux/dm-ioctl.h |  4 ++--
 4 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index e6a9b8cb22d3..3203aecd6961 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3095,6 +3095,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	INIT_WORK(&rs->md.event_work, do_table_event);
 	ti->private = rs;
 	ti->num_flush_bios = 1;
+	ti->needs_bio_set_dev = true;
 
 	/* Restore any requested new layout for conversion decision */
 	rs_config_restore(rs, &rs_layout);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 47bcc5081b2b..f6a6437d0a7c 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -574,9 +574,6 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
 	struct bio *clone;
 
 	clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->mempools->io_bs);
-	/* Set default bdev, but target must bio_set_dev() before issuing IO */
-	clone->bi_bdev = md->disk->part0;
-
 	tio = clone_to_tio(clone);
 	tio->flags = 0;
 	dm_tio_set_flag(tio, DM_TIO_INSIDE_DM_IO);
@@ -609,6 +606,7 @@ static void free_io(struct dm_io *io)
 static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
 			     unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask)
 {
+	struct mapped_device *md = ci->io->md;
 	struct dm_target_io *tio;
 	struct bio *clone;
 
@@ -618,14 +616,10 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
 		/* alloc_io() already initialized embedded clone */
 		clone = &tio->clone;
 	} else {
-		struct mapped_device *md = ci->io->md;
-
 		clone = bio_alloc_clone(NULL, ci->bio, gfp_mask,
 					&md->mempools->bs);
 		if (!clone)
 			return NULL;
-		/* Set default bdev, but target must bio_set_dev() before issuing IO */
-		clone->bi_bdev = md->disk->part0;
 
 		/* REQ_DM_POLL_LIST shouldn't be inherited */
 		clone->bi_opf &= ~REQ_DM_POLL_LIST;
@@ -641,6 +635,11 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
 	tio->len_ptr = len;
 	tio->old_sector = 0;
 
+	/* Set default bdev, but target must bio_set_dev() before issuing IO */
+	clone->bi_bdev = md->disk->part0;
+	if (unlikely(ti->needs_bio_set_dev))
+		bio_set_dev(clone, md->disk->part0);
+
 	if (len) {
 		clone->bi_iter.bi_size = to_bytes(*len);
 		if (bio_integrity(clone))
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 920085dd7f3b..04c6acf7faaa 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -373,6 +373,12 @@ struct dm_target {
 	 * after returning DM_MAPIO_SUBMITTED from its map function.
 	 */
 	bool accounts_remapped_io:1;
+
+	/*
+	 * Set if the target will submit the DM bio without first calling
+	 * bio_set_dev(). NOTE: ideally a target should _not_ need this.
+	 */
+	bool needs_bio_set_dev:1;
 };
 
 void *dm_per_bio_data(struct bio *bio, size_t data_size);
diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index 2e9550fef90f..27ad9671f2df 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -286,9 +286,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	46
+#define DM_VERSION_MINOR	47
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2022-02-22)"
+#define DM_VERSION_EXTRA	"-ioctl (2022-07-28)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
-- 
cgit 


From 0fcb100d50835d6823723ef0898cd565b3796e0a Mon Sep 17 00:00:00 2001
From: Nathan Huckleberry <nhuck@google.com>
Date: Fri, 22 Jul 2022 09:38:21 +0000
Subject: dm bufio: Add flags argument to dm_bufio_client_create

Add a flags argument to dm_bufio_client_create and update all the
callers. This is in preparation to add the DM_BUFIO_NO_SLEEP flag.

Signed-off-by: Nathan Huckleberry <nhuck@google.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 drivers/md/dm-bufio.c                         | 3 ++-
 drivers/md/dm-ebs-target.c                    | 3 ++-
 drivers/md/dm-integrity.c                     | 2 +-
 drivers/md/dm-snap-persistent.c               | 2 +-
 drivers/md/dm-verity-fec.c                    | 4 ++--
 drivers/md/dm-verity-target.c                 | 2 +-
 drivers/md/persistent-data/dm-block-manager.c | 3 ++-
 include/linux/dm-bufio.h                      | 3 ++-
 8 files changed, 13 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 5ffa1dcf84cf..ad5603eb12e3 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1717,7 +1717,8 @@ static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrin
 struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
 					       unsigned reserved_buffers, unsigned aux_size,
 					       void (*alloc_callback)(struct dm_buffer *),
-					       void (*write_callback)(struct dm_buffer *))
+					       void (*write_callback)(struct dm_buffer *),
+					       unsigned int flags)
 {
 	int r;
 	struct dm_bufio_client *c;
diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c
index 0221fa63f888..04a1f56d6588 100644
--- a/drivers/md/dm-ebs-target.c
+++ b/drivers/md/dm-ebs-target.c
@@ -312,7 +312,8 @@ static int ebs_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
-	ec->bufio = dm_bufio_client_create(ec->dev->bdev, to_bytes(ec->u_bs), 1, 0, NULL, NULL);
+	ec->bufio = dm_bufio_client_create(ec->dev->bdev, to_bytes(ec->u_bs), 1,
+					   0, NULL, NULL, 0);
 	if (IS_ERR(ec->bufio)) {
 		ti->error = "Cannot create dm bufio client";
 		r = PTR_ERR(ec->bufio);
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 3d5a0ce123c9..a508073d8414 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -4439,7 +4439,7 @@ try_smaller_buffer:
 	}
 
 	ic->bufio = dm_bufio_client_create(ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev,
-			1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL);
+			1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL, 0);
 	if (IS_ERR(ic->bufio)) {
 		r = PTR_ERR(ic->bufio);
 		ti->error = "Cannot initialize dm-bufio";
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 3bb5cff5d6fc..aaa699749c3b 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -494,7 +494,7 @@ static int read_exceptions(struct pstore *ps,
 
 	client = dm_bufio_client_create(dm_snap_cow(ps->store->snap)->bdev,
 					ps->store->chunk_size << SECTOR_SHIFT,
-					1, 0, NULL, NULL);
+					1, 0, NULL, NULL, 0);
 
 	if (IS_ERR(client))
 		return PTR_ERR(client);
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index cea2b3789736..23cffce56403 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -749,7 +749,7 @@ int verity_fec_ctr(struct dm_verity *v)
 
 	f->bufio = dm_bufio_client_create(f->dev->bdev,
 					  f->io_size,
-					  1, 0, NULL, NULL);
+					  1, 0, NULL, NULL, 0);
 	if (IS_ERR(f->bufio)) {
 		ti->error = "Cannot initialize FEC bufio client";
 		return PTR_ERR(f->bufio);
@@ -765,7 +765,7 @@ int verity_fec_ctr(struct dm_verity *v)
 
 	f->data_bufio = dm_bufio_client_create(v->data_dev->bdev,
 					       1 << v->data_dev_block_bits,
-					       1, 0, NULL, NULL);
+					       1, 0, NULL, NULL, 0);
 	if (IS_ERR(f->data_bufio)) {
 		ti->error = "Cannot initialize FEC data bufio client";
 		return PTR_ERR(f->data_bufio);
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 75b66dd67633..95db3a7ee3c7 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -1265,7 +1265,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	v->bufio = dm_bufio_client_create(v->hash_dev->bdev,
 		1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux),
-		dm_bufio_alloc_callback, NULL);
+		dm_bufio_alloc_callback, NULL, 0);
 	if (IS_ERR(v->bufio)) {
 		ti->error = "Cannot initialize dm-bufio";
 		r = PTR_ERR(v->bufio);
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 54c089a50b15..11935864f50f 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -391,7 +391,8 @@ struct dm_block_manager *dm_block_manager_create(struct block_device *bdev,
 	bm->bufio = dm_bufio_client_create(bdev, block_size, max_held_per_thread,
 					   sizeof(struct buffer_aux),
 					   dm_block_manager_alloc_callback,
-					   dm_block_manager_write_callback);
+					   dm_block_manager_write_callback,
+					   0);
 	if (IS_ERR(bm->bufio)) {
 		r = PTR_ERR(bm->bufio);
 		kfree(bm);
diff --git a/include/linux/dm-bufio.h b/include/linux/dm-bufio.h
index 90bd558a17f5..e21480715255 100644
--- a/include/linux/dm-bufio.h
+++ b/include/linux/dm-bufio.h
@@ -24,7 +24,8 @@ struct dm_bufio_client *
 dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
 		       unsigned reserved_buffers, unsigned aux_size,
 		       void (*alloc_callback)(struct dm_buffer *),
-		       void (*write_callback)(struct dm_buffer *));
+		       void (*write_callback)(struct dm_buffer *),
+		       unsigned int flags);
 
 /*
  * Release a buffered IO cache.
-- 
cgit 


From b32d45824aa7e07a0c3257a16e3a2a691b11b39a Mon Sep 17 00:00:00 2001
From: Nathan Huckleberry <nhuck@google.com>
Date: Fri, 22 Jul 2022 09:38:22 +0000
Subject: dm bufio: Add DM_BUFIO_CLIENT_NO_SLEEP flag

Add an optional flag that ensures dm_bufio_client does not sleep
(primary focus is to service dm_bufio_get without sleeping). This
allows the dm-bufio cache to be queried from interrupt context.

To ensure that dm-bufio does not sleep, dm-bufio must use a spinlock
instead of a mutex. Additionally, to avoid deadlocks, special care
must be taken so that dm-bufio does not sleep while holding the
spinlock.

But again: the scope of this no_sleep is initially confined to
dm_bufio_get, so __alloc_buffer_wait_no_callback is _not_ changed to
avoid sleeping because __bufio_new avoids allocation for NF_GET.

Signed-off-by: Nathan Huckleberry <nhuck@google.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 drivers/md/dm-bufio.c    | 22 +++++++++++++++++++---
 include/linux/dm-bufio.h |  5 +++++
 2 files changed, 24 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index ad5603eb12e3..486179d1831c 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -81,6 +81,8 @@
  */
 struct dm_bufio_client {
 	struct mutex lock;
+	spinlock_t spinlock;
+	unsigned long spinlock_flags;
 
 	struct list_head lru[LIST_SIZE];
 	unsigned long n_buffers[LIST_SIZE];
@@ -90,6 +92,7 @@ struct dm_bufio_client {
 	s8 sectors_per_block_bits;
 	void (*alloc_callback)(struct dm_buffer *);
 	void (*write_callback)(struct dm_buffer *);
+	bool no_sleep;
 
 	struct kmem_cache *slab_buffer;
 	struct kmem_cache *slab_cache;
@@ -167,17 +170,26 @@ struct dm_buffer {
 
 static void dm_bufio_lock(struct dm_bufio_client *c)
 {
-	mutex_lock_nested(&c->lock, dm_bufio_in_request());
+	if (c->no_sleep)
+		spin_lock_irqsave_nested(&c->spinlock, c->spinlock_flags, dm_bufio_in_request());
+	else
+		mutex_lock_nested(&c->lock, dm_bufio_in_request());
 }
 
 static int dm_bufio_trylock(struct dm_bufio_client *c)
 {
-	return mutex_trylock(&c->lock);
+	if (c->no_sleep)
+		return spin_trylock_irqsave(&c->spinlock, c->spinlock_flags);
+	else
+		return mutex_trylock(&c->lock);
 }
 
 static void dm_bufio_unlock(struct dm_bufio_client *c)
 {
-	mutex_unlock(&c->lock);
+	if (c->no_sleep)
+		spin_unlock_irqrestore(&c->spinlock, c->spinlock_flags);
+	else
+		mutex_unlock(&c->lock);
 }
 
 /*----------------------------------------------------------------*/
@@ -1748,12 +1760,16 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
 	c->alloc_callback = alloc_callback;
 	c->write_callback = write_callback;
 
+	if (flags & DM_BUFIO_CLIENT_NO_SLEEP)
+		c->no_sleep = true;
+
 	for (i = 0; i < LIST_SIZE; i++) {
 		INIT_LIST_HEAD(&c->lru[i]);
 		c->n_buffers[i] = 0;
 	}
 
 	mutex_init(&c->lock);
+	spin_lock_init(&c->spinlock);
 	INIT_LIST_HEAD(&c->reserved_buffers);
 	c->need_reserved_buffers = reserved_buffers;
 
diff --git a/include/linux/dm-bufio.h b/include/linux/dm-bufio.h
index e21480715255..15d9e15ca830 100644
--- a/include/linux/dm-bufio.h
+++ b/include/linux/dm-bufio.h
@@ -17,6 +17,11 @@
 struct dm_bufio_client;
 struct dm_buffer;
 
+/*
+ * Flags for dm_bufio_client_create
+ */
+#define DM_BUFIO_CLIENT_NO_SLEEP 0x1
+
 /*
  * Create a buffered IO cache on a given device
  */
-- 
cgit 


From 7adc91e0c93901a0eeeea10665d0feb48ffde2d4 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Wed, 27 Jul 2022 12:43:42 +0300
Subject: net/tls: Multi-threaded calls to TX tls_dev_del

Multiple TLS device-offloaded contexts can be added in parallel via
concurrent calls to .tls_dev_add, while calls to .tls_dev_del are
sequential in tls_device_gc_task.

This is not a sustainable behavior. This creates a rate gap between add
and del operations (addition rate outperforms the deletion rate).  When
running for enough time, the TLS device resources could get exhausted,
failing to offload new connections.

Replace the single-threaded garbage collector work with a per-context
alternative, so they can be handled on several cores in parallel. Use
a new dedicated destruct workqueue for this.

Tested with mlx5 device:
Before: 22141 add/sec,   103 del/sec
After:  11684 add/sec, 11684 del/sec

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tls.h    |  2 ++
 net/tls/tls_device.c | 63 ++++++++++++++++++++++++++--------------------------
 2 files changed, 33 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index abb050b0df83..b75b5727abdb 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -161,6 +161,8 @@ struct tls_offload_context_tx {
 
 	struct scatterlist sg_tx_data[MAX_SKB_FRAGS];
 	void (*sk_destruct)(struct sock *sk);
+	struct work_struct destruct_work;
+	struct tls_context *ctx;
 	u8 driver_state[] __aligned(8);
 	/* The TLS layer reserves room for driver specific state
 	 * Currently the belief is that there is not enough
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 5150b3939b1f..18c7e5c6d228 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -46,10 +46,8 @@
  */
 static DECLARE_RWSEM(device_offload_lock);
 
-static void tls_device_gc_task(struct work_struct *work);
+static struct workqueue_struct *destruct_wq __read_mostly;
 
-static DECLARE_WORK(tls_device_gc_work, tls_device_gc_task);
-static LIST_HEAD(tls_device_gc_list);
 static LIST_HEAD(tls_device_list);
 static LIST_HEAD(tls_device_down_list);
 static DEFINE_SPINLOCK(tls_device_lock);
@@ -68,29 +66,17 @@ static void tls_device_free_ctx(struct tls_context *ctx)
 	tls_ctx_free(NULL, ctx);
 }
 
-static void tls_device_gc_task(struct work_struct *work)
+static void tls_device_tx_del_task(struct work_struct *work)
 {
-	struct tls_context *ctx, *tmp;
-	unsigned long flags;
-	LIST_HEAD(gc_list);
-
-	spin_lock_irqsave(&tls_device_lock, flags);
-	list_splice_init(&tls_device_gc_list, &gc_list);
-	spin_unlock_irqrestore(&tls_device_lock, flags);
-
-	list_for_each_entry_safe(ctx, tmp, &gc_list, list) {
-		struct net_device *netdev = ctx->netdev;
+	struct tls_offload_context_tx *offload_ctx =
+		container_of(work, struct tls_offload_context_tx, destruct_work);
+	struct tls_context *ctx = offload_ctx->ctx;
+	struct net_device *netdev = ctx->netdev;
 
-		if (netdev && ctx->tx_conf == TLS_HW) {
-			netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
-							TLS_OFFLOAD_CTX_DIR_TX);
-			dev_put(netdev);
-			ctx->netdev = NULL;
-		}
-
-		list_del(&ctx->list);
-		tls_device_free_ctx(ctx);
-	}
+	netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_TX);
+	dev_put(netdev);
+	ctx->netdev = NULL;
+	tls_device_free_ctx(ctx);
 }
 
 static void tls_device_queue_ctx_destruction(struct tls_context *ctx)
@@ -104,16 +90,15 @@ static void tls_device_queue_ctx_destruction(struct tls_context *ctx)
 		return;
 	}
 
+	list_del(&ctx->list); /* Remove from tls_device_list / tls_device_down_list */
 	async_cleanup = ctx->netdev && ctx->tx_conf == TLS_HW;
 	if (async_cleanup) {
-		list_move_tail(&ctx->list, &tls_device_gc_list);
+		struct tls_offload_context_tx *offload_ctx = tls_offload_ctx_tx(ctx);
 
-		/* schedule_work inside the spinlock
+		/* queue_work inside the spinlock
 		 * to make sure tls_device_down waits for that work.
 		 */
-		schedule_work(&tls_device_gc_work);
-	} else {
-		list_del(&ctx->list);
+		queue_work(destruct_wq, &offload_ctx->destruct_work);
 	}
 	spin_unlock_irqrestore(&tls_device_lock, flags);
 
@@ -1160,6 +1145,9 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
 	start_marker_record->len = 0;
 	start_marker_record->num_frags = 0;
 
+	INIT_WORK(&offload_ctx->destruct_work, tls_device_tx_del_task);
+	offload_ctx->ctx = ctx;
+
 	INIT_LIST_HEAD(&offload_ctx->records_list);
 	list_add_tail(&start_marker_record->list, &offload_ctx->records_list);
 	spin_lock_init(&offload_ctx->lock);
@@ -1404,7 +1392,7 @@ static int tls_device_down(struct net_device *netdev)
 
 	up_write(&device_offload_lock);
 
-	flush_work(&tls_device_gc_work);
+	flush_workqueue(destruct_wq);
 
 	return NOTIFY_DONE;
 }
@@ -1445,12 +1433,23 @@ static struct notifier_block tls_dev_notifier = {
 
 int __init tls_device_init(void)
 {
-	return register_netdevice_notifier(&tls_dev_notifier);
+	int err;
+
+	destruct_wq = alloc_workqueue("ktls_device_destruct", 0, 0);
+	if (!destruct_wq)
+		return -ENOMEM;
+
+	err = register_netdevice_notifier(&tls_dev_notifier);
+	if (err)
+		destroy_workqueue(destruct_wq);
+
+	return err;
 }
 
 void __exit tls_device_cleanup(void)
 {
 	unregister_netdevice_notifier(&tls_dev_notifier);
-	flush_work(&tls_device_gc_work);
+	flush_workqueue(destruct_wq);
+	destroy_workqueue(destruct_wq);
 	clean_acked_data_flush();
 }
-- 
cgit 


From 08f588fa301bef264576fc915da6bf31b585a824 Mon Sep 17 00:00:00 2001
From: Vikas Gupta <vikas.gupta@broadcom.com>
Date: Wed, 27 Jul 2022 22:27:20 +0530
Subject: devlink: introduce framework for selftests

Add a framework for running selftests.
Framework exposes devlink commands and test suite(s) to the user
to execute and query the supported tests by the driver.

Below are new entries in devlink_nl_ops
devlink_nl_cmd_selftests_show_doit/dumpit: To query the supported
selftests by the drivers.
devlink_nl_cmd_selftests_run: To execute selftests. Users can
provide a test mask for executing group tests or standalone tests.

Documentation/networking/devlink/ path is already part of MAINTAINERS &
the new files come under this path. Hence no update needed to the
MAINTAINERS

Signed-off-by: Vikas Gupta <vikas.gupta@broadcom.com>
Reviewed-by: Andy Gospodarek <gospo@broadcom.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../networking/devlink/devlink-selftests.rst       |  38 ++++
 include/net/devlink.h                              |  21 ++
 include/uapi/linux/devlink.h                       |  29 +++
 net/core/devlink.c                                 | 216 +++++++++++++++++++++
 4 files changed, 304 insertions(+)
 create mode 100644 Documentation/networking/devlink/devlink-selftests.rst

(limited to 'include')

diff --git a/Documentation/networking/devlink/devlink-selftests.rst b/Documentation/networking/devlink/devlink-selftests.rst
new file mode 100644
index 000000000000..c0aa1f3aef0d
--- /dev/null
+++ b/Documentation/networking/devlink/devlink-selftests.rst
@@ -0,0 +1,38 @@
+.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
+=================
+Devlink Selftests
+=================
+
+The ``devlink-selftests`` API allows executing selftests on the device.
+
+Tests Mask
+==========
+The ``devlink-selftests`` command should be run with a mask indicating
+the tests to be executed.
+
+Tests Description
+=================
+The following is a list of tests that drivers may execute.
+
+.. list-table:: List of tests
+   :widths: 5 90
+
+   * - Name
+     - Description
+   * - ``DEVLINK_SELFTEST_FLASH``
+     - Devices may have the firmware on non-volatile memory on the board, e.g.
+       flash. This particular test helps to run a flash selftest on the device.
+       Implementation of the test is left to the driver/firmware.
+
+example usage
+-------------
+
+.. code:: shell
+
+    # Query selftests supported on the devlink device
+    $ devlink dev selftests show DEV
+    # Query selftests supported on all devlink devices
+    $ devlink dev selftests show
+    # Executes selftests on the device
+    $ devlink dev selftests run DEV id flash
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 5bd3fac12e9e..119ed1ffb988 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1509,6 +1509,27 @@ struct devlink_ops {
 				    struct devlink_rate *parent,
 				    void *priv_child, void *priv_parent,
 				    struct netlink_ext_ack *extack);
+	/**
+	 * selftests_check() - queries if selftest is supported
+	 * @devlink: devlink instance
+	 * @id: test index
+	 * @extack: extack for reporting error messages
+	 *
+	 * Return: true if test is supported by the driver
+	 */
+	bool (*selftest_check)(struct devlink *devlink, unsigned int id,
+			       struct netlink_ext_ack *extack);
+	/**
+	 * selftest_run() - Runs a selftest
+	 * @devlink: devlink instance
+	 * @id: test index
+	 * @extack: extack for reporting error messages
+	 *
+	 * Return: status of the test
+	 */
+	enum devlink_selftest_status
+	(*selftest_run)(struct devlink *devlink, unsigned int id,
+			struct netlink_ext_ack *extack);
 };
 
 void *devlink_priv(struct devlink *devlink);
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 541321695f52..2f24b53a87a5 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -136,6 +136,9 @@ enum devlink_command {
 	DEVLINK_CMD_LINECARD_NEW,
 	DEVLINK_CMD_LINECARD_DEL,
 
+	DEVLINK_CMD_SELFTESTS_GET,	/* can dump */
+	DEVLINK_CMD_SELFTESTS_RUN,
+
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
 	DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1
@@ -276,6 +279,30 @@ enum {
 #define DEVLINK_SUPPORTED_FLASH_OVERWRITE_SECTIONS \
 	(_BITUL(__DEVLINK_FLASH_OVERWRITE_MAX_BIT) - 1)
 
+enum devlink_attr_selftest_id {
+	DEVLINK_ATTR_SELFTEST_ID_UNSPEC,
+	DEVLINK_ATTR_SELFTEST_ID_FLASH,	/* flag */
+
+	__DEVLINK_ATTR_SELFTEST_ID_MAX,
+	DEVLINK_ATTR_SELFTEST_ID_MAX = __DEVLINK_ATTR_SELFTEST_ID_MAX - 1
+};
+
+enum devlink_selftest_status {
+	DEVLINK_SELFTEST_STATUS_SKIP,
+	DEVLINK_SELFTEST_STATUS_PASS,
+	DEVLINK_SELFTEST_STATUS_FAIL
+};
+
+enum devlink_attr_selftest_result {
+	DEVLINK_ATTR_SELFTEST_RESULT_UNSPEC,
+	DEVLINK_ATTR_SELFTEST_RESULT,		/* nested */
+	DEVLINK_ATTR_SELFTEST_RESULT_ID,	/* u32, enum devlink_attr_selftest_id */
+	DEVLINK_ATTR_SELFTEST_RESULT_STATUS,	/* u8, enum devlink_selftest_status */
+
+	__DEVLINK_ATTR_SELFTEST_RESULT_MAX,
+	DEVLINK_ATTR_SELFTEST_RESULT_MAX = __DEVLINK_ATTR_SELFTEST_RESULT_MAX - 1
+};
+
 /**
  * enum devlink_trap_action - Packet trap action.
  * @DEVLINK_TRAP_ACTION_DROP: Packet is dropped by the device and a copy is not
@@ -578,6 +605,8 @@ enum devlink_attr {
 
 	DEVLINK_ATTR_NESTED_DEVLINK,		/* nested */
 
+	DEVLINK_ATTR_SELFTESTS,			/* nested */
+
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index ca4c9939d569..efeba223b9b8 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -201,6 +201,10 @@ static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_
 				 DEVLINK_PORT_FN_STATE_ACTIVE),
 };
 
+static const struct nla_policy devlink_selftest_nl_policy[DEVLINK_ATTR_SELFTEST_ID_MAX + 1] = {
+	[DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG },
+};
+
 static DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC);
 #define DEVLINK_REGISTERED XA_MARK_1
 
@@ -4826,6 +4830,206 @@ static int devlink_nl_cmd_flash_update(struct sk_buff *skb,
 	return ret;
 }
 
+static int
+devlink_nl_selftests_fill(struct sk_buff *msg, struct devlink *devlink,
+			  u32 portid, u32 seq, int flags,
+			  struct netlink_ext_ack *extack)
+{
+	struct nlattr *selftests;
+	void *hdr;
+	int err;
+	int i;
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags,
+			  DEVLINK_CMD_SELFTESTS_GET);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	err = -EMSGSIZE;
+	if (devlink_nl_put_handle(msg, devlink))
+		goto err_cancel_msg;
+
+	selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS);
+	if (!selftests)
+		goto err_cancel_msg;
+
+	for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1;
+	     i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) {
+		if (devlink->ops->selftest_check(devlink, i, extack)) {
+			err = nla_put_flag(msg, i);
+			if (err)
+				goto err_cancel_msg;
+		}
+	}
+
+	nla_nest_end(msg, selftests);
+	genlmsg_end(msg, hdr);
+	return 0;
+
+err_cancel_msg:
+	genlmsg_cancel(msg, hdr);
+	return err;
+}
+
+static int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb,
+					     struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct sk_buff *msg;
+	int err;
+
+	if (!devlink->ops->selftest_check)
+		return -EOPNOTSUPP;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	err = devlink_nl_selftests_fill(msg, devlink, info->snd_portid,
+					info->snd_seq, 0, info->extack);
+	if (err) {
+		nlmsg_free(msg);
+		return err;
+	}
+
+	return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_selftests_get_dumpit(struct sk_buff *msg,
+					       struct netlink_callback *cb)
+{
+	struct devlink *devlink;
+	int start = cb->args[0];
+	unsigned long index;
+	int idx = 0;
+	int err = 0;
+
+	mutex_lock(&devlink_mutex);
+	devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) {
+		if (idx < start || !devlink->ops->selftest_check)
+			goto inc;
+
+		devl_lock(devlink);
+		err = devlink_nl_selftests_fill(msg, devlink,
+						NETLINK_CB(cb->skb).portid,
+						cb->nlh->nlmsg_seq, NLM_F_MULTI,
+						cb->extack);
+		devl_unlock(devlink);
+		if (err) {
+			devlink_put(devlink);
+			break;
+		}
+inc:
+		idx++;
+		devlink_put(devlink);
+	}
+	mutex_unlock(&devlink_mutex);
+
+	if (err != -EMSGSIZE)
+		return err;
+
+	cb->args[0] = idx;
+	return msg->len;
+}
+
+static int devlink_selftest_result_put(struct sk_buff *skb, unsigned int id,
+				       enum devlink_selftest_status test_status)
+{
+	struct nlattr *result_attr;
+
+	result_attr = nla_nest_start(skb, DEVLINK_ATTR_SELFTEST_RESULT);
+	if (!result_attr)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(skb, DEVLINK_ATTR_SELFTEST_RESULT_ID, id) ||
+	    nla_put_u8(skb, DEVLINK_ATTR_SELFTEST_RESULT_STATUS,
+		       test_status))
+		goto nla_put_failure;
+
+	nla_nest_end(skb, result_attr);
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, result_attr);
+	return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_selftests_run(struct sk_buff *skb,
+					struct genl_info *info)
+{
+	struct nlattr *tb[DEVLINK_ATTR_SELFTEST_ID_MAX + 1];
+	struct devlink *devlink = info->user_ptr[0];
+	struct nlattr *attrs, *selftests;
+	struct sk_buff *msg;
+	void *hdr;
+	int err;
+	int i;
+
+	if (!devlink->ops->selftest_run || !devlink->ops->selftest_check)
+		return -EOPNOTSUPP;
+
+	if (!info->attrs[DEVLINK_ATTR_SELFTESTS]) {
+		NL_SET_ERR_MSG_MOD(info->extack, "selftest required");
+		return -EINVAL;
+	}
+
+	attrs = info->attrs[DEVLINK_ATTR_SELFTESTS];
+
+	err = nla_parse_nested(tb, DEVLINK_ATTR_SELFTEST_ID_MAX, attrs,
+			       devlink_selftest_nl_policy, info->extack);
+	if (err < 0)
+		return err;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	err = -EMSGSIZE;
+	hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+			  &devlink_nl_family, 0, DEVLINK_CMD_SELFTESTS_RUN);
+	if (!hdr)
+		goto free_msg;
+
+	if (devlink_nl_put_handle(msg, devlink))
+		goto genlmsg_cancel;
+
+	selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS);
+	if (!selftests)
+		goto genlmsg_cancel;
+
+	for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1;
+	     i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) {
+		enum devlink_selftest_status test_status;
+
+		if (nla_get_flag(tb[i])) {
+			if (!devlink->ops->selftest_check(devlink, i,
+							  info->extack)) {
+				if (devlink_selftest_result_put(msg, i,
+								DEVLINK_SELFTEST_STATUS_SKIP))
+					goto selftests_nest_cancel;
+				continue;
+			}
+
+			test_status = devlink->ops->selftest_run(devlink, i,
+								 info->extack);
+			if (devlink_selftest_result_put(msg, i, test_status))
+				goto selftests_nest_cancel;
+		}
+	}
+
+	nla_nest_end(msg, selftests);
+	genlmsg_end(msg, hdr);
+	return genlmsg_reply(msg, info);
+
+selftests_nest_cancel:
+	nla_nest_cancel(msg, selftests);
+genlmsg_cancel:
+	genlmsg_cancel(msg, hdr);
+free_msg:
+	nlmsg_free(msg);
+	return err;
+}
+
 static const struct devlink_param devlink_param_generic[] = {
 	{
 		.id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET,
@@ -8969,6 +9173,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32 },
 	[DEVLINK_ATTR_LINECARD_TYPE] = { .type = NLA_NUL_STRING },
+	[DEVLINK_ATTR_SELFTESTS] = { .type = NLA_NESTED },
 };
 
 static const struct genl_small_ops devlink_nl_ops[] = {
@@ -9328,6 +9533,17 @@ static const struct genl_small_ops devlink_nl_ops[] = {
 		.doit = devlink_nl_cmd_trap_policer_set_doit,
 		.flags = GENL_ADMIN_PERM,
 	},
+	{
+		.cmd = DEVLINK_CMD_SELFTESTS_GET,
+		.doit = devlink_nl_cmd_selftests_get_doit,
+		.dumpit = devlink_nl_cmd_selftests_get_dumpit
+		/* can be retrieved by unprivileged users */
+	},
+	{
+		.cmd = DEVLINK_CMD_SELFTESTS_RUN,
+		.doit = devlink_nl_cmd_selftests_run,
+		.flags = GENL_ADMIN_PERM,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
-- 
cgit 


From d7c4c9e075f8cc6d88d277bc24e5d99297f03c06 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 27 Jul 2022 22:18:21 -0700
Subject: ax25: fix incorrect dev_tracker usage

While investigating a separate rose issue [1], and enabling
CONFIG_NET_DEV_REFCNT_TRACKER=y, Bernard reported an orthogonal ax25 issue [2]

An ax25_dev can be used by one (or many) struct ax25_cb.
We thus need different dev_tracker, one per struct ax25_cb.

After this patch is applied, we are able to focus on rose.

[1] https://lore.kernel.org/netdev/fb7544a1-f42e-9254-18cc-c9b071f4ca70@free.fr/

[2]
[  205.798723] reference already released.
[  205.798732] allocated in:
[  205.798734]  ax25_bind+0x1a2/0x230 [ax25]
[  205.798747]  __sys_bind+0xea/0x110
[  205.798753]  __x64_sys_bind+0x18/0x20
[  205.798758]  do_syscall_64+0x5c/0x80
[  205.798763]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[  205.798768] freed in:
[  205.798770]  ax25_release+0x115/0x370 [ax25]
[  205.798778]  __sock_release+0x42/0xb0
[  205.798782]  sock_close+0x15/0x20
[  205.798785]  __fput+0x9f/0x260
[  205.798789]  ____fput+0xe/0x10
[  205.798792]  task_work_run+0x64/0xa0
[  205.798798]  exit_to_user_mode_prepare+0x18b/0x190
[  205.798804]  syscall_exit_to_user_mode+0x26/0x40
[  205.798808]  do_syscall_64+0x69/0x80
[  205.798812]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[  205.798827] ------------[ cut here ]------------
[  205.798829] WARNING: CPU: 2 PID: 2605 at lib/ref_tracker.c:136 ref_tracker_free.cold+0x60/0x81
[  205.798837] Modules linked in: rose netrom mkiss ax25 rfcomm cmac algif_hash algif_skcipher af_alg bnep snd_hda_codec_hdmi nls_iso8859_1 i915 rtw88_8821ce rtw88_8821c x86_pkg_temp_thermal rtw88_pci intel_powerclamp rtw88_core snd_hda_codec_realtek snd_hda_codec_generic ledtrig_audio coretemp snd_hda_intel kvm_intel snd_intel_dspcfg mac80211 snd_hda_codec kvm i2c_algo_bit drm_buddy drm_dp_helper btusb drm_kms_helper snd_hwdep btrtl snd_hda_core btbcm joydev crct10dif_pclmul btintel crc32_pclmul ghash_clmulni_intel mei_hdcp btmtk intel_rapl_msr aesni_intel bluetooth input_leds snd_pcm crypto_simd syscopyarea processor_thermal_device_pci_legacy sysfillrect cryptd intel_soc_dts_iosf snd_seq sysimgblt ecdh_generic fb_sys_fops rapl libarc4 processor_thermal_device intel_cstate processor_thermal_rfim cec snd_timer ecc snd_seq_device cfg80211 processor_thermal_mbox mei_me processor_thermal_rapl mei rc_core at24 snd intel_pch_thermal intel_rapl_common ttm soundcore int340x_thermal_zone video
[  205.798948]  mac_hid acpi_pad sch_fq_codel ipmi_devintf ipmi_msghandler drm msr parport_pc ppdev lp parport ramoops pstore_blk reed_solomon pstore_zone efi_pstore ip_tables x_tables autofs4 hid_generic usbhid hid i2c_i801 i2c_smbus r8169 xhci_pci ahci libahci realtek lpc_ich xhci_pci_renesas [last unloaded: ax25]
[  205.798992] CPU: 2 PID: 2605 Comm: ax25ipd Not tainted 5.18.11-F6BVP #3
[  205.798996] Hardware name: To be filled by O.E.M. To be filled by O.E.M./CK3, BIOS 5.011 09/16/2020
[  205.798999] RIP: 0010:ref_tracker_free.cold+0x60/0x81
[  205.799005] Code: e8 d2 01 9b ff 83 7b 18 00 74 14 48 c7 c7 2f d7 ff 98 e8 10 6e fc ff 8b 7b 18 e8 b8 01 9b ff 4c 89 ee 4c 89 e7 e8 5d fd 07 00 <0f> 0b b8 ea ff ff ff e9 30 05 9b ff 41 0f b6 f7 48 c7 c7 a0 fa 4e
[  205.799008] RSP: 0018:ffffaf5281073958 EFLAGS: 00010286
[  205.799011] RAX: 0000000080000000 RBX: ffff9a0bd687ebe0 RCX: 0000000000000000
[  205.799014] RDX: 0000000000000001 RSI: 0000000000000282 RDI: 00000000ffffffff
[  205.799016] RBP: ffffaf5281073a10 R08: 0000000000000003 R09: fffffffffffd5618
[  205.799019] R10: 0000000000ffff10 R11: 000000000000000f R12: ffff9a0bc53384d0
[  205.799022] R13: 0000000000000282 R14: 00000000ae000001 R15: 0000000000000001
[  205.799024] FS:  0000000000000000(0000) GS:ffff9a0d0f300000(0000) knlGS:0000000000000000
[  205.799028] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  205.799031] CR2: 00007ff6b8311554 CR3: 000000001ac10004 CR4: 00000000001706e0
[  205.799033] Call Trace:
[  205.799035]  <TASK>
[  205.799038]  ? ax25_dev_device_down+0xd9/0x1b0 [ax25]
[  205.799047]  ? ax25_device_event+0x9f/0x270 [ax25]
[  205.799055]  ? raw_notifier_call_chain+0x49/0x60
[  205.799060]  ? call_netdevice_notifiers_info+0x52/0xa0
[  205.799065]  ? dev_close_many+0xc8/0x120
[  205.799070]  ? unregister_netdevice_many+0x13d/0x890
[  205.799073]  ? unregister_netdevice_queue+0x90/0xe0
[  205.799076]  ? unregister_netdev+0x1d/0x30
[  205.799080]  ? mkiss_close+0x7c/0xc0 [mkiss]
[  205.799084]  ? tty_ldisc_close+0x2e/0x40
[  205.799089]  ? tty_ldisc_hangup+0x137/0x210
[  205.799092]  ? __tty_hangup.part.0+0x208/0x350
[  205.799098]  ? tty_vhangup+0x15/0x20
[  205.799103]  ? pty_close+0x127/0x160
[  205.799108]  ? tty_release+0x139/0x5e0
[  205.799112]  ? __fput+0x9f/0x260
[  205.799118]  ax25_dev_device_down+0xd9/0x1b0 [ax25]
[  205.799126]  ax25_device_event+0x9f/0x270 [ax25]
[  205.799135]  raw_notifier_call_chain+0x49/0x60
[  205.799140]  call_netdevice_notifiers_info+0x52/0xa0
[  205.799146]  dev_close_many+0xc8/0x120
[  205.799152]  unregister_netdevice_many+0x13d/0x890
[  205.799157]  unregister_netdevice_queue+0x90/0xe0
[  205.799161]  unregister_netdev+0x1d/0x30
[  205.799165]  mkiss_close+0x7c/0xc0 [mkiss]
[  205.799170]  tty_ldisc_close+0x2e/0x40
[  205.799173]  tty_ldisc_hangup+0x137/0x210
[  205.799178]  __tty_hangup.part.0+0x208/0x350
[  205.799184]  tty_vhangup+0x15/0x20
[  205.799188]  pty_close+0x127/0x160
[  205.799193]  tty_release+0x139/0x5e0
[  205.799199]  __fput+0x9f/0x260
[  205.799203]  ____fput+0xe/0x10
[  205.799208]  task_work_run+0x64/0xa0
[  205.799213]  do_exit+0x33b/0xab0
[  205.799217]  ? __handle_mm_fault+0xc4f/0x15f0
[  205.799224]  do_group_exit+0x35/0xa0
[  205.799228]  __x64_sys_exit_group+0x18/0x20
[  205.799232]  do_syscall_64+0x5c/0x80
[  205.799238]  ? handle_mm_fault+0xba/0x290
[  205.799242]  ? debug_smp_processor_id+0x17/0x20
[  205.799246]  ? fpregs_assert_state_consistent+0x26/0x50
[  205.799251]  ? exit_to_user_mode_prepare+0x49/0x190
[  205.799256]  ? irqentry_exit_to_user_mode+0x9/0x20
[  205.799260]  ? irqentry_exit+0x33/0x40
[  205.799263]  ? exc_page_fault+0x87/0x170
[  205.799268]  ? asm_exc_page_fault+0x8/0x30
[  205.799273]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[  205.799277] RIP: 0033:0x7ff6b80eaca1
[  205.799281] Code: Unable to access opcode bytes at RIP 0x7ff6b80eac77.
[  205.799283] RSP: 002b:00007fff6dfd4738 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7
[  205.799287] RAX: ffffffffffffffda RBX: 00007ff6b8215a00 RCX: 00007ff6b80eaca1
[  205.799290] RDX: 000000000000003c RSI: 00000000000000e7 RDI: 0000000000000001
[  205.799293] RBP: 0000000000000001 R08: ffffffffffffff80 R09: 0000000000000028
[  205.799295] R10: 0000000000000000 R11: 0000000000000246 R12: 00007ff6b8215a00
[  205.799298] R13: 0000000000000000 R14: 00007ff6b821aee8 R15: 00007ff6b821af00
[  205.799304]  </TASK>

Fixes: feef318c855a ("ax25: fix UAF bugs of net_device caused by rebinding operation")
Reported-by: Bernard F6BVP <f6bvp@free.fr>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Duoming Zhou <duoming@zju.edu.cn>
Link: https://lore.kernel.org/r/20220728051821.3160118-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ax25.h | 1 +
 net/ax25/af_ax25.c | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/ax25.h b/include/net/ax25.h
index a427a05672e2..f8cf3629a419 100644
--- a/include/net/ax25.h
+++ b/include/net/ax25.h
@@ -236,6 +236,7 @@ typedef struct ax25_cb {
 	ax25_address		source_addr, dest_addr;
 	ax25_digi		*digipeat;
 	ax25_dev		*ax25_dev;
+	netdevice_tracker	dev_tracker;
 	unsigned char		iamdigi;
 	unsigned char		state, modulus, pidincl;
 	unsigned short		vs, vr, va;
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 4c7030ed8d33..5b5363c99ed5 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1065,7 +1065,7 @@ static int ax25_release(struct socket *sock)
 			del_timer_sync(&ax25->t3timer);
 			del_timer_sync(&ax25->idletimer);
 		}
-		dev_put_track(ax25_dev->dev, &ax25_dev->dev_tracker);
+		dev_put_track(ax25_dev->dev, &ax25->dev_tracker);
 		ax25_dev_put(ax25_dev);
 	}
 
@@ -1146,7 +1146,7 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 
 	if (ax25_dev) {
 		ax25_fillin_cb(ax25, ax25_dev);
-		dev_hold_track(ax25_dev->dev, &ax25_dev->dev_tracker, GFP_ATOMIC);
+		dev_hold_track(ax25_dev->dev, &ax25->dev_tracker, GFP_ATOMIC);
 	}
 
 done:
-- 
cgit 


From 29192a170e1579ac8293d00c88d9938c2b2f5aab Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 26 Jul 2022 17:49:06 +0300
Subject: firewire: net: Make use of get_unaligned_be48(), put_unaligned_be48()

Since we have a proper endianness converters for BE 48-bit data use
them.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20220726144906.5217-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/firewire/net.c | 14 ++------------
 include/net/firewire.h |  3 +--
 2 files changed, 3 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/firewire/net.c b/drivers/firewire/net.c
index dcc141068128..af22be84034b 100644
--- a/drivers/firewire/net.c
+++ b/drivers/firewire/net.c
@@ -201,15 +201,6 @@ struct fwnet_packet_task {
 	u8 enqueued;
 };
 
-/*
- * Get fifo address embedded in hwaddr
- */
-static __u64 fwnet_hwaddr_fifo(union fwnet_hwaddr *ha)
-{
-	return (u64)get_unaligned_be16(&ha->uc.fifo_hi) << 32
-	       | get_unaligned_be32(&ha->uc.fifo_lo);
-}
-
 /*
  * saddr == NULL means use device source address.
  * daddr == NULL means leave destination address (eg unresolved arp).
@@ -1306,7 +1297,7 @@ static netdev_tx_t fwnet_tx(struct sk_buff *skb, struct net_device *net)
 		max_payload        = peer->max_payload;
 		datagram_label_ptr = &peer->datagram_label;
 
-		ptask->fifo_addr   = fwnet_hwaddr_fifo(ha);
+		ptask->fifo_addr   = get_unaligned_be48(ha->uc.fifo);
 		ptask->generation  = generation;
 		ptask->dest_node   = dest_node;
 		ptask->speed       = peer->speed;
@@ -1494,8 +1485,7 @@ static int fwnet_probe(struct fw_unit *unit,
 	ha.uc.uniq_id = cpu_to_be64(card->guid);
 	ha.uc.max_rec = dev->card->max_receive;
 	ha.uc.sspd = dev->card->link_speed;
-	ha.uc.fifo_hi = cpu_to_be16(dev->local_fifo >> 32);
-	ha.uc.fifo_lo = cpu_to_be32(dev->local_fifo & 0xffffffff);
+	put_unaligned_be48(dev->local_fifo, ha.uc.fifo);
 	dev_addr_set(net, ha.u);
 
 	memset(net->broadcast, -1, net->addr_len);
diff --git a/include/net/firewire.h b/include/net/firewire.h
index 2442d645e412..8fbff8d77865 100644
--- a/include/net/firewire.h
+++ b/include/net/firewire.h
@@ -13,8 +13,7 @@ union fwnet_hwaddr {
 		__be64 uniq_id;		/* EUI-64			*/
 		u8 max_rec;		/* max packet size		*/
 		u8 sspd;		/* max speed			*/
-		__be16 fifo_hi;		/* hi 16bits of FIFO addr	*/
-		__be32 fifo_lo;		/* lo 32bits of FIFO addr	*/
+		u8 fifo[6];		/* FIFO addr			*/
 	} __packed uc;
 };
 
-- 
cgit 


From 71610ab1d017e131a9888ef8acd035284fb0e1dd Mon Sep 17 00:00:00 2001
From: Bibo Mao <maobibo@loongson.cn>
Date: Wed, 20 Jul 2022 15:21:51 +0800
Subject: LoongArch: Remove clock setting during cpu hotplug stage

On physical machine we can save power by disabling clock of hot removed
cpu. However as different platforms require different methods to
configure clocks, the code is platform-specific, and probably belongs to
firmware/pmu or cpu regulator, rather than generic arch/loongarch code.

Also, there is no such register on QEMU virt machine since the
clock/frequency regulation is not emulated.

This patch removes the hard-coded clock register accesses in generic
LoongArch cpu hotplug flow.

Reviewed-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Bibo Mao <maobibo@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/kernel/smp.c | 113 +++++---------------------------------------
 include/linux/cpuhotplug.h  |   1 -
 2 files changed, 13 insertions(+), 101 deletions(-)

(limited to 'include')

diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c
index 73cec62504fb..09743103d9b3 100644
--- a/arch/loongarch/kernel/smp.c
+++ b/arch/loongarch/kernel/smp.c
@@ -278,116 +278,29 @@ void loongson3_cpu_die(unsigned int cpu)
 	mb();
 }
 
-/*
- * The target CPU should go to XKPRANGE (uncached area) and flush
- * ICache/DCache/VCache before the control CPU can safely disable its clock.
- */
-static void loongson3_play_dead(int *state_addr)
+void play_dead(void)
 {
-	register int val;
-	register void *addr;
+	register uint64_t addr;
 	register void (*init_fn)(void);
 
-	__asm__ __volatile__(
-		"   li.d %[addr], 0x8000000000000000\n"
-		"1: cacop 0x8, %[addr], 0           \n" /* flush ICache */
-		"   cacop 0x8, %[addr], 1           \n"
-		"   cacop 0x8, %[addr], 2           \n"
-		"   cacop 0x8, %[addr], 3           \n"
-		"   cacop 0x9, %[addr], 0           \n" /* flush DCache */
-		"   cacop 0x9, %[addr], 1           \n"
-		"   cacop 0x9, %[addr], 2           \n"
-		"   cacop 0x9, %[addr], 3           \n"
-		"   addi.w %[sets], %[sets], -1     \n"
-		"   addi.d %[addr], %[addr], 0x40   \n"
-		"   bnez %[sets], 1b                \n"
-		"   li.d %[addr], 0x8000000000000000\n"
-		"2: cacop 0xa, %[addr], 0           \n" /* flush VCache */
-		"   cacop 0xa, %[addr], 1           \n"
-		"   cacop 0xa, %[addr], 2           \n"
-		"   cacop 0xa, %[addr], 3           \n"
-		"   cacop 0xa, %[addr], 4           \n"
-		"   cacop 0xa, %[addr], 5           \n"
-		"   cacop 0xa, %[addr], 6           \n"
-		"   cacop 0xa, %[addr], 7           \n"
-		"   cacop 0xa, %[addr], 8           \n"
-		"   cacop 0xa, %[addr], 9           \n"
-		"   cacop 0xa, %[addr], 10          \n"
-		"   cacop 0xa, %[addr], 11          \n"
-		"   cacop 0xa, %[addr], 12          \n"
-		"   cacop 0xa, %[addr], 13          \n"
-		"   cacop 0xa, %[addr], 14          \n"
-		"   cacop 0xa, %[addr], 15          \n"
-		"   addi.w %[vsets], %[vsets], -1   \n"
-		"   addi.d %[addr], %[addr], 0x40   \n"
-		"   bnez   %[vsets], 2b             \n"
-		"   li.w   %[val], 0x7              \n" /* *state_addr = CPU_DEAD; */
-		"   st.w   %[val], %[state_addr], 0 \n"
-		"   dbar 0                          \n"
-		"   cacop 0x11, %[state_addr], 0    \n" /* flush entry of *state_addr */
-		: [addr] "=&r" (addr), [val] "=&r" (val)
-		: [state_addr] "r" (state_addr),
-		  [sets] "r" (cpu_data[smp_processor_id()].dcache.sets),
-		  [vsets] "r" (cpu_data[smp_processor_id()].vcache.sets));
-
+	idle_task_exit();
 	local_irq_enable();
-	change_csr_ecfg(ECFG0_IM, ECFGF_IPI);
+	set_csr_ecfg(ECFGF_IPI);
+	__this_cpu_write(cpu_state, CPU_DEAD);
+
+	__smp_mb();
+	do {
+		__asm__ __volatile__("idle 0\n\t");
+		addr = iocsr_read64(LOONGARCH_IOCSR_MBUF0);
+	} while (addr == 0);
 
-	__asm__ __volatile__(
-		"   idle      0			    \n"
-		"   li.w      $t0, 0x1020	    \n"
-		"   iocsrrd.d %[init_fn], $t0	    \n" /* Get init PC */
-		: [init_fn] "=&r" (addr)
-		: /* No Input */
-		: "a0");
-	init_fn = __va(addr);
+	init_fn = (void *)TO_CACHE(addr);
+	iocsr_write32(0xffffffff, LOONGARCH_IOCSR_IPI_CLEAR);
 
 	init_fn();
 	unreachable();
 }
 
-void play_dead(void)
-{
-	int *state_addr;
-	unsigned int cpu = smp_processor_id();
-	void (*play_dead_uncached)(int *s);
-
-	idle_task_exit();
-	play_dead_uncached = (void *)TO_UNCACHE(__pa((unsigned long)loongson3_play_dead));
-	state_addr = &per_cpu(cpu_state, cpu);
-	mb();
-	play_dead_uncached(state_addr);
-}
-
-static int loongson3_enable_clock(unsigned int cpu)
-{
-	uint64_t core_id = cpu_data[cpu].core;
-	uint64_t package_id = cpu_data[cpu].package;
-
-	LOONGSON_FREQCTRL(package_id) |= 1 << (core_id * 4 + 3);
-
-	return 0;
-}
-
-static int loongson3_disable_clock(unsigned int cpu)
-{
-	uint64_t core_id = cpu_data[cpu].core;
-	uint64_t package_id = cpu_data[cpu].package;
-
-	LOONGSON_FREQCTRL(package_id) &= ~(1 << (core_id * 4 + 3));
-
-	return 0;
-}
-
-static int register_loongson3_notifier(void)
-{
-	return cpuhp_setup_state_nocalls(CPUHP_LOONGARCH_SOC_PREPARE,
-					 "loongarch/loongson:prepare",
-					 loongson3_enable_clock,
-					 loongson3_disable_clock);
-}
-early_initcall(register_loongson3_notifier);
-
 #endif
 
 /*
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 19f0dbfdd7fe..b66c5f389159 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -130,7 +130,6 @@ enum cpuhp_state {
 	CPUHP_ZCOMP_PREPARE,
 	CPUHP_TIMERS_PREPARE,
 	CPUHP_MIPS_SOC_PREPARE,
-	CPUHP_LOONGARCH_SOC_PREPARE,
 	CPUHP_BP_PREPARE_DYN,
 	CPUHP_BP_PREPARE_DYN_END		= CPUHP_BP_PREPARE_DYN + 20,
 	CPUHP_BRINGUP_CPU,
-- 
cgit 


From ef34a0ae7a2654bc9e58675e36898217fb2799d8 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Thu, 28 Jul 2022 14:59:42 +0200
Subject: ALSA: core: Add async signal helpers

Currently the call of kill_fasync() from an interrupt handler might
lead to potential spin deadlocks, as spotted by syzkaller.
Unfortunately, it's not so trivial to fix this lock chain as it's
involved with the tasklist_lock that is touched in allover places.

As a temporary workaround, this patch provides the way to defer the
async signal notification in a work.  The new helper functions,
snd_fasync_helper() and snd_kill_faync() are replacements for
fasync_helper() and kill_fasync(), respectively.  In addition,
snd_fasync_free() needs to be called at the destructor of the relevant
file object.

Link: https://lore.kernel.org/r/20220728125945.29533-2-tiwai@suse.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/core.h |  8 +++++
 sound/core/misc.c    | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+)

(limited to 'include')

diff --git a/include/sound/core.h b/include/sound/core.h
index dd28de2343b8..4365c35d038b 100644
--- a/include/sound/core.h
+++ b/include/sound/core.h
@@ -507,4 +507,12 @@ snd_pci_quirk_lookup_id(u16 vendor, u16 device,
 }
 #endif
 
+/* async signal helpers */
+struct snd_fasync;
+
+int snd_fasync_helper(int fd, struct file *file, int on,
+		      struct snd_fasync **fasyncp);
+void snd_kill_fasync(struct snd_fasync *fasync, int signal, int poll);
+void snd_fasync_free(struct snd_fasync *fasync);
+
 #endif /* __SOUND_CORE_H */
diff --git a/sound/core/misc.c b/sound/core/misc.c
index 50e4aaa6270d..d32a19976a2b 100644
--- a/sound/core/misc.c
+++ b/sound/core/misc.c
@@ -10,6 +10,7 @@
 #include <linux/time.h>
 #include <linux/slab.h>
 #include <linux/ioport.h>
+#include <linux/fs.h>
 #include <sound/core.h>
 
 #ifdef CONFIG_SND_DEBUG
@@ -145,3 +146,96 @@ snd_pci_quirk_lookup(struct pci_dev *pci, const struct snd_pci_quirk *list)
 }
 EXPORT_SYMBOL(snd_pci_quirk_lookup);
 #endif
+
+/*
+ * Deferred async signal helpers
+ *
+ * Below are a few helper functions to wrap the async signal handling
+ * in the deferred work.  The main purpose is to avoid the messy deadlock
+ * around tasklist_lock and co at the kill_fasync() invocation.
+ * fasync_helper() and kill_fasync() are replaced with snd_fasync_helper()
+ * and snd_kill_fasync(), respectively.  In addition, snd_fasync_free() has
+ * to be called at releasing the relevant file object.
+ */
+struct snd_fasync {
+	struct fasync_struct *fasync;
+	int signal;
+	int poll;
+	int on;
+	struct list_head list;
+};
+
+static DEFINE_SPINLOCK(snd_fasync_lock);
+static LIST_HEAD(snd_fasync_list);
+
+static void snd_fasync_work_fn(struct work_struct *work)
+{
+	struct snd_fasync *fasync;
+
+	spin_lock_irq(&snd_fasync_lock);
+	while (!list_empty(&snd_fasync_list)) {
+		fasync = list_first_entry(&snd_fasync_list, struct snd_fasync, list);
+		list_del_init(&fasync->list);
+		spin_unlock_irq(&snd_fasync_lock);
+		if (fasync->on)
+			kill_fasync(&fasync->fasync, fasync->signal, fasync->poll);
+		spin_lock_irq(&snd_fasync_lock);
+	}
+	spin_unlock_irq(&snd_fasync_lock);
+}
+
+static DECLARE_WORK(snd_fasync_work, snd_fasync_work_fn);
+
+int snd_fasync_helper(int fd, struct file *file, int on,
+		      struct snd_fasync **fasyncp)
+{
+	struct snd_fasync *fasync = NULL;
+
+	if (on) {
+		fasync = kzalloc(sizeof(*fasync), GFP_KERNEL);
+		if (!fasync)
+			return -ENOMEM;
+		INIT_LIST_HEAD(&fasync->list);
+	}
+
+	spin_lock_irq(&snd_fasync_lock);
+	if (*fasyncp) {
+		kfree(fasync);
+		fasync = *fasyncp;
+	} else {
+		if (!fasync) {
+			spin_unlock_irq(&snd_fasync_lock);
+			return 0;
+		}
+		*fasyncp = fasync;
+	}
+	fasync->on = on;
+	spin_unlock_irq(&snd_fasync_lock);
+	return fasync_helper(fd, file, on, &fasync->fasync);
+}
+EXPORT_SYMBOL_GPL(snd_fasync_helper);
+
+void snd_kill_fasync(struct snd_fasync *fasync, int signal, int poll)
+{
+	unsigned long flags;
+
+	if (!fasync || !fasync->on)
+		return;
+	spin_lock_irqsave(&snd_fasync_lock, flags);
+	fasync->signal = signal;
+	fasync->poll = poll;
+	list_move(&fasync->list, &snd_fasync_list);
+	schedule_work(&snd_fasync_work);
+	spin_unlock_irqrestore(&snd_fasync_lock, flags);
+}
+EXPORT_SYMBOL_GPL(snd_kill_fasync);
+
+void snd_fasync_free(struct snd_fasync *fasync)
+{
+	if (!fasync)
+		return;
+	fasync->on = 0;
+	flush_work(&snd_fasync_work);
+	kfree(fasync);
+}
+EXPORT_SYMBOL_GPL(snd_fasync_free);
-- 
cgit 


From 96b097091c66df4f6fbf5cbff21df6cc02a2f055 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Thu, 28 Jul 2022 14:59:44 +0200
Subject: ALSA: pcm: Use deferred fasync helper

For avoiding the potential deadlock via kill_fasync() call, use the
new fasync helpers to defer the invocation from timer API.  Note that
it's merely a workaround.

Reported-by: syzbot+8285e973a41b5aa68902@syzkaller.appspotmail.com
Reported-by: syzbot+669c9abf11a6a011dd09@syzkaller.appspotmail.com
Link: https://lore.kernel.org/r/20220728125945.29533-4-tiwai@suse.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/pcm.h     | 2 +-
 sound/core/pcm.c        | 1 +
 sound/core/pcm_lib.c    | 2 +-
 sound/core/pcm_native.c | 2 +-
 4 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/sound/pcm.h b/include/sound/pcm.h
index 2d03c10f6a36..8c48a5bce88c 100644
--- a/include/sound/pcm.h
+++ b/include/sound/pcm.h
@@ -399,7 +399,7 @@ struct snd_pcm_runtime {
 	snd_pcm_uframes_t twake; 	/* do transfer (!poll) wakeup if non-zero */
 	wait_queue_head_t sleep;	/* poll sleep */
 	wait_queue_head_t tsleep;	/* transfer sleep */
-	struct fasync_struct *fasync;
+	struct snd_fasync *fasync;
 	bool stop_operating;		/* sync_stop will be called */
 	struct mutex buffer_mutex;	/* protect for buffer changes */
 	atomic_t buffer_accessing;	/* >0: in r/w operation, <0: blocked */
diff --git a/sound/core/pcm.c b/sound/core/pcm.c
index 03fc5fa5813e..2ac742035310 100644
--- a/sound/core/pcm.c
+++ b/sound/core/pcm.c
@@ -1007,6 +1007,7 @@ void snd_pcm_detach_substream(struct snd_pcm_substream *substream)
 		substream->runtime = NULL;
 	}
 	mutex_destroy(&runtime->buffer_mutex);
+	snd_fasync_free(runtime->fasync);
 	kfree(runtime);
 	put_pid(substream->pid);
 	substream->pid = NULL;
diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c
index 1fc7c50ffa62..40751e5aff09 100644
--- a/sound/core/pcm_lib.c
+++ b/sound/core/pcm_lib.c
@@ -1822,7 +1822,7 @@ void snd_pcm_period_elapsed_under_stream_lock(struct snd_pcm_substream *substrea
 		snd_timer_interrupt(substream->timer, 1);
 #endif
  _end:
-	kill_fasync(&runtime->fasync, SIGIO, POLL_IN);
+	snd_kill_fasync(runtime->fasync, SIGIO, POLL_IN);
 }
 EXPORT_SYMBOL(snd_pcm_period_elapsed_under_stream_lock);
 
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index aa0453e51595..ad0541e9e888 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -3951,7 +3951,7 @@ static int snd_pcm_fasync(int fd, struct file * file, int on)
 	runtime = substream->runtime;
 	if (runtime->status->state == SNDRV_PCM_STATE_DISCONNECTED)
 		return -EBADFD;
-	return fasync_helper(fd, file, on, &runtime->fasync);
+	return snd_fasync_helper(fd, file, on, &runtime->fasync);
 }
 
 /*
-- 
cgit 


From 4a971e84a7ae10a38d875cd2d4e487c8d1682ca3 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Thu, 28 Jul 2022 14:59:45 +0200
Subject: ALSA: control: Use deferred fasync helper

For avoiding the potential deadlock via kill_fasync() call, use the
new fasync helpers to defer the invocation from the control API.  Note
that it's merely a workaround.

Another note: although we haven't received reports about the deadlock
with the control API, the deadlock is still potentially possible, and
it's better to align the behavior with other core APIs (PCM and
timer); so let's move altogether.

Link: https://lore.kernel.org/r/20220728125945.29533-5-tiwai@suse.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/control.h | 2 +-
 sound/core/control.c    | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/sound/control.h b/include/sound/control.h
index fcd3cce673ec..eae443ba79ba 100644
--- a/include/sound/control.h
+++ b/include/sound/control.h
@@ -109,7 +109,7 @@ struct snd_ctl_file {
 	int preferred_subdevice[SND_CTL_SUBDEV_ITEMS];
 	wait_queue_head_t change_sleep;
 	spinlock_t read_lock;
-	struct fasync_struct *fasync;
+	struct snd_fasync *fasync;
 	int subscribed;			/* read interface is activated */
 	struct list_head events;	/* waiting events for read */
 };
diff --git a/sound/core/control.c b/sound/core/control.c
index 4dba3a342458..f3e893715369 100644
--- a/sound/core/control.c
+++ b/sound/core/control.c
@@ -127,6 +127,7 @@ static int snd_ctl_release(struct inode *inode, struct file *file)
 			if (control->vd[idx].owner == ctl)
 				control->vd[idx].owner = NULL;
 	up_write(&card->controls_rwsem);
+	snd_fasync_free(ctl->fasync);
 	snd_ctl_empty_read_queue(ctl);
 	put_pid(ctl->pid);
 	kfree(ctl);
@@ -181,7 +182,7 @@ void snd_ctl_notify(struct snd_card *card, unsigned int mask,
 	_found:
 		wake_up(&ctl->change_sleep);
 		spin_unlock(&ctl->read_lock);
-		kill_fasync(&ctl->fasync, SIGIO, POLL_IN);
+		snd_kill_fasync(ctl->fasync, SIGIO, POLL_IN);
 	}
 	read_unlock_irqrestore(&card->ctl_files_rwlock, flags);
 }
@@ -2134,7 +2135,7 @@ static int snd_ctl_fasync(int fd, struct file * file, int on)
 	struct snd_ctl_file *ctl;
 
 	ctl = file->private_data;
-	return fasync_helper(fd, file, on, &ctl->fasync);
+	return snd_fasync_helper(fd, file, on, &ctl->fasync);
 }
 
 /* return the preferred subdevice number if already assigned;
@@ -2302,7 +2303,7 @@ static int snd_ctl_dev_disconnect(struct snd_device *device)
 	read_lock_irqsave(&card->ctl_files_rwlock, flags);
 	list_for_each_entry(ctl, &card->ctl_files, list) {
 		wake_up(&ctl->change_sleep);
-		kill_fasync(&ctl->fasync, SIGIO, POLL_ERR);
+		snd_kill_fasync(ctl->fasync, SIGIO, POLL_ERR);
 	}
 	read_unlock_irqrestore(&card->ctl_files_rwlock, flags);
 
-- 
cgit 


From 944fd1aeacb627fa617f85f8e5a34f7ae8ea4d8e Mon Sep 17 00:00:00 2001
From: Mike Manning <mvrmanning@gmail.com>
Date: Mon, 25 Jul 2022 19:14:42 +0100
Subject: net: allow unbound socket for packets in VRF when tcp_l3mdev_accept
 set

The commit 3c82a21f4320 ("net: allow binding socket in a VRF when
there's an unbound socket") changed the inet socket lookup to avoid
packets in a VRF from matching an unbound socket. This is to ensure the
necessary isolation between the default and other VRFs for routing and
forwarding. VRF-unaware processes running in the default VRF cannot
access another VRF and have to be run with 'ip vrf exec <vrf>'. This is
to be expected with tcp_l3mdev_accept disabled, but could be reallowed
when this sysctl option is enabled. So instead of directly checking dif
and sdif in inet[6]_match, here call inet_sk_bound_dev_eq(). This
allows a match on unbound socket for non-zero sdif i.e. for packets in
a VRF, if tcp_l3mdev_accept is enabled.

Fixes: 3c82a21f4320 ("net: allow binding socket in a VRF when there's an unbound socket")
Signed-off-by: Mike Manning <mvrmanning@gmail.com>
Link: https://lore.kernel.org/netdev/a54c149aed38fded2d3b5fdb1a6c89e36a083b74.camel@lasnet.de/
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet6_hashtables.h |  7 +++----
 include/net/inet_hashtables.h  | 19 +++----------------
 include/net/inet_sock.h        | 11 +++++++++++
 3 files changed, 17 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index f259e1ae14ba..56f1286583d3 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -110,8 +110,6 @@ static inline bool inet6_match(struct net *net, const struct sock *sk,
 			       const __portpair ports,
 			       const int dif, const int sdif)
 {
-	int bound_dev_if;
-
 	if (!net_eq(sock_net(sk), net) ||
 	    sk->sk_family != AF_INET6 ||
 	    sk->sk_portpair != ports ||
@@ -119,8 +117,9 @@ static inline bool inet6_match(struct net *net, const struct sock *sk,
 	    !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
 		return false;
 
-	bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
-	return bound_dev_if == dif || bound_dev_if == sdif;
+	/* READ_ONCE() paired with WRITE_ONCE() in sock_bindtoindex_locked() */
+	return inet_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif,
+				    sdif);
 }
 #endif /* IS_ENABLED(CONFIG_IPV6) */
 
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index fd6b510d114b..e9cf2157ed8a 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -175,17 +175,6 @@ static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo)
 	hashinfo->ehash_locks = NULL;
 }
 
-static inline bool inet_sk_bound_dev_eq(struct net *net, int bound_dev_if,
-					int dif, int sdif)
-{
-#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
-	return inet_bound_dev_eq(!!READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept),
-				 bound_dev_if, dif, sdif);
-#else
-	return inet_bound_dev_eq(true, bound_dev_if, dif, sdif);
-#endif
-}
-
 struct inet_bind_bucket *
 inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
 			struct inet_bind_hashbucket *head,
@@ -271,16 +260,14 @@ static inline bool inet_match(struct net *net, const struct sock *sk,
 			      const __addrpair cookie, const __portpair ports,
 			      int dif, int sdif)
 {
-	int bound_dev_if;
-
 	if (!net_eq(sock_net(sk), net) ||
 	    sk->sk_portpair != ports ||
 	    sk->sk_addrpair != cookie)
 	        return false;
 
-	/* Paired with WRITE_ONCE() from sock_bindtoindex_locked() */
-	bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
-	return bound_dev_if == dif || bound_dev_if == sdif;
+	/* READ_ONCE() paired with WRITE_ONCE() in sock_bindtoindex_locked() */
+	return inet_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif,
+				    sdif);
 }
 
 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 6395f6b9a5d2..bf5654ce711e 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -149,6 +149,17 @@ static inline bool inet_bound_dev_eq(bool l3mdev_accept, int bound_dev_if,
 	return bound_dev_if == dif || bound_dev_if == sdif;
 }
 
+static inline bool inet_sk_bound_dev_eq(struct net *net, int bound_dev_if,
+					int dif, int sdif)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+	return inet_bound_dev_eq(!!READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept),
+				 bound_dev_if, dif, sdif);
+#else
+	return inet_bound_dev_eq(true, bound_dev_if, dif, sdif);
+#endif
+}
+
 struct inet_cork {
 	unsigned int		flags;
 	__be32			addr;
-- 
cgit 


From b07c8cdbe918aa17da864da9a89b22afaed0393e Mon Sep 17 00:00:00 2001
From: Andrea Mayer <andrea.mayer@uniroma2.it>
Date: Wed, 27 Jul 2022 20:54:05 +0200
Subject: seg6: add support for SRv6 H.Encaps.Red behavior

The SRv6 H.Encaps.Red behavior described in [1] is an optimization of
the SRv6 H.Encaps behavior [2].

H.Encaps.Red reduces the length of the SRH by excluding the first
segment (SID) in the SRH of the pushed IPv6 header. The first SID is
only placed in the IPv6 Destination Address field of the pushed IPv6
header.
When the SRv6 Policy only contains one SID the SRH is omitted, unless
there is an HMAC TLV to be carried.

[1] - https://datatracker.ietf.org/doc/html/rfc8986#section-5.2
[2] - https://datatracker.ietf.org/doc/html/rfc8986#section-5.1

Signed-off-by: Andrea Mayer <andrea.mayer@uniroma2.it>
Signed-off-by: Anton Makarov <anton.makarov11235@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/seg6_iptunnel.h |   1 +
 net/ipv6/seg6_iptunnel.c           | 128 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 128 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/seg6_iptunnel.h b/include/uapi/linux/seg6_iptunnel.h
index eb815e0d0ac3..538152a7b2c3 100644
--- a/include/uapi/linux/seg6_iptunnel.h
+++ b/include/uapi/linux/seg6_iptunnel.h
@@ -35,6 +35,7 @@ enum {
 	SEG6_IPTUN_MODE_INLINE,
 	SEG6_IPTUN_MODE_ENCAP,
 	SEG6_IPTUN_MODE_L2ENCAP,
+	SEG6_IPTUN_MODE_ENCAP_RED,
 };
 
 #endif
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index e756ba705fd9..454bd8a838e6 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -36,6 +36,7 @@ static size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo)
 	case SEG6_IPTUN_MODE_INLINE:
 		break;
 	case SEG6_IPTUN_MODE_ENCAP:
+	case SEG6_IPTUN_MODE_ENCAP_RED:
 		head = sizeof(struct ipv6hdr);
 		break;
 	case SEG6_IPTUN_MODE_L2ENCAP:
@@ -197,6 +198,124 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
 }
 EXPORT_SYMBOL_GPL(seg6_do_srh_encap);
 
+/* encapsulate an IPv6 packet within an outer IPv6 header with reduced SRH */
+static int seg6_do_srh_encap_red(struct sk_buff *skb,
+				 struct ipv6_sr_hdr *osrh, int proto)
+{
+	__u8 first_seg = osrh->first_segment;
+	struct dst_entry *dst = skb_dst(skb);
+	struct net *net = dev_net(dst->dev);
+	struct ipv6hdr *hdr, *inner_hdr;
+	int hdrlen = ipv6_optlen(osrh);
+	int red_tlv_offset, tlv_offset;
+	struct ipv6_sr_hdr *isrh;
+	bool skip_srh = false;
+	__be32 flowlabel;
+	int tot_len, err;
+	int red_hdrlen;
+	int tlvs_len;
+
+	if (first_seg > 0) {
+		red_hdrlen = hdrlen - sizeof(struct in6_addr);
+	} else {
+		/* NOTE: if tag/flags and/or other TLVs are introduced in the
+		 * seg6_iptunnel infrastructure, they should be considered when
+		 * deciding to skip the SRH.
+		 */
+		skip_srh = !sr_has_hmac(osrh);
+
+		red_hdrlen = skip_srh ? 0 : hdrlen;
+	}
+
+	tot_len = red_hdrlen + sizeof(struct ipv6hdr);
+
+	err = skb_cow_head(skb, tot_len + skb->mac_len);
+	if (unlikely(err))
+		return err;
+
+	inner_hdr = ipv6_hdr(skb);
+	flowlabel = seg6_make_flowlabel(net, skb, inner_hdr);
+
+	skb_push(skb, tot_len);
+	skb_reset_network_header(skb);
+	skb_mac_header_rebuild(skb);
+	hdr = ipv6_hdr(skb);
+
+	/* based on seg6_do_srh_encap() */
+	if (skb->protocol == htons(ETH_P_IPV6)) {
+		ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)),
+			     flowlabel);
+		hdr->hop_limit = inner_hdr->hop_limit;
+	} else {
+		ip6_flow_hdr(hdr, 0, flowlabel);
+		hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb));
+
+		memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+		IP6CB(skb)->iif = skb->skb_iif;
+	}
+
+	/* no matter if we have to skip the SRH or not, the first segment
+	 * always comes in the pushed IPv6 header.
+	 */
+	hdr->daddr = osrh->segments[first_seg];
+
+	if (skip_srh) {
+		hdr->nexthdr = proto;
+
+		set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr);
+		goto out;
+	}
+
+	/* we cannot skip the SRH, slow path */
+
+	hdr->nexthdr = NEXTHDR_ROUTING;
+	isrh = (void *)hdr + sizeof(struct ipv6hdr);
+
+	if (unlikely(!first_seg)) {
+		/* this is a very rare case; we have only one SID but
+		 * we cannot skip the SRH since we are carrying some
+		 * other info.
+		 */
+		memcpy(isrh, osrh, hdrlen);
+		goto srcaddr;
+	}
+
+	tlv_offset = sizeof(*osrh) + (first_seg + 1) * sizeof(struct in6_addr);
+	red_tlv_offset = tlv_offset - sizeof(struct in6_addr);
+
+	memcpy(isrh, osrh, red_tlv_offset);
+
+	tlvs_len = hdrlen - tlv_offset;
+	if (unlikely(tlvs_len > 0)) {
+		const void *s = (const void *)osrh + tlv_offset;
+		void *d = (void *)isrh + red_tlv_offset;
+
+		memcpy(d, s, tlvs_len);
+	}
+
+	--isrh->first_segment;
+	isrh->hdrlen -= 2;
+
+srcaddr:
+	isrh->nexthdr = proto;
+	set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr);
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+	if (unlikely(!skip_srh && sr_has_hmac(isrh))) {
+		err = seg6_push_hmac(net, &hdr->saddr, isrh);
+		if (unlikely(err))
+			return err;
+	}
+#endif
+
+out:
+	hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+
+	skb_postpush_rcsum(skb, hdr, tot_len);
+
+	return 0;
+}
+
 /* insert an SRH within an IPv6 packet, just after the IPv6 header */
 int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
 {
@@ -269,6 +388,7 @@ static int seg6_do_srh(struct sk_buff *skb)
 			return err;
 		break;
 	case SEG6_IPTUN_MODE_ENCAP:
+	case SEG6_IPTUN_MODE_ENCAP_RED:
 		err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6);
 		if (err)
 			return err;
@@ -280,7 +400,11 @@ static int seg6_do_srh(struct sk_buff *skb)
 		else
 			return -EINVAL;
 
-		err = seg6_do_srh_encap(skb, tinfo->srh, proto);
+		if (tinfo->mode == SEG6_IPTUN_MODE_ENCAP)
+			err = seg6_do_srh_encap(skb, tinfo->srh, proto);
+		else
+			err = seg6_do_srh_encap_red(skb, tinfo->srh, proto);
+
 		if (err)
 			return err;
 
@@ -517,6 +641,8 @@ static int seg6_build_state(struct net *net, struct nlattr *nla,
 		break;
 	case SEG6_IPTUN_MODE_L2ENCAP:
 		break;
+	case SEG6_IPTUN_MODE_ENCAP_RED:
+		break;
 	default:
 		return -EINVAL;
 	}
-- 
cgit 


From 13f0296be8ece1189cbc4383a45ba97cafaecc09 Mon Sep 17 00:00:00 2001
From: Andrea Mayer <andrea.mayer@uniroma2.it>
Date: Wed, 27 Jul 2022 20:54:06 +0200
Subject: seg6: add support for SRv6 H.L2Encaps.Red behavior

The SRv6 H.L2Encaps.Red behavior described in [1] is an optimization of
the SRv6 H.L2Encaps behavior [2].

H.L2Encaps.Red reduces the length of the SRH by excluding the first
segment (SID) in the SRH of the pushed IPv6 header. The first SID is
only placed in the IPv6 Destination Address field of the pushed IPv6
header.
When the SRv6 Policy only contains one SID the SRH is omitted, unless
there is an HMAC TLV to be carried.

[1] - https://datatracker.ietf.org/doc/html/rfc8986#section-5.4
[2] - https://datatracker.ietf.org/doc/html/rfc8986#section-5.3

Signed-off-by: Andrea Mayer <andrea.mayer@uniroma2.it>
Signed-off-by: Anton Makarov <anton.makarov11235@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/seg6_iptunnel.h |  1 +
 net/ipv6/seg6_iptunnel.c           | 12 +++++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/seg6_iptunnel.h b/include/uapi/linux/seg6_iptunnel.h
index 538152a7b2c3..a9fa777f16de 100644
--- a/include/uapi/linux/seg6_iptunnel.h
+++ b/include/uapi/linux/seg6_iptunnel.h
@@ -36,6 +36,7 @@ enum {
 	SEG6_IPTUN_MODE_ENCAP,
 	SEG6_IPTUN_MODE_L2ENCAP,
 	SEG6_IPTUN_MODE_ENCAP_RED,
+	SEG6_IPTUN_MODE_L2ENCAP_RED,
 };
 
 #endif
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index 454bd8a838e6..34db881204d2 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -40,6 +40,7 @@ static size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo)
 		head = sizeof(struct ipv6hdr);
 		break;
 	case SEG6_IPTUN_MODE_L2ENCAP:
+	case SEG6_IPTUN_MODE_L2ENCAP_RED:
 		return 0;
 	}
 
@@ -413,6 +414,7 @@ static int seg6_do_srh(struct sk_buff *skb)
 		skb->protocol = htons(ETH_P_IPV6);
 		break;
 	case SEG6_IPTUN_MODE_L2ENCAP:
+	case SEG6_IPTUN_MODE_L2ENCAP_RED:
 		if (!skb_mac_header_was_set(skb))
 			return -EINVAL;
 
@@ -422,7 +424,13 @@ static int seg6_do_srh(struct sk_buff *skb)
 		skb_mac_header_rebuild(skb);
 		skb_push(skb, skb->mac_len);
 
-		err = seg6_do_srh_encap(skb, tinfo->srh, IPPROTO_ETHERNET);
+		if (tinfo->mode == SEG6_IPTUN_MODE_L2ENCAP)
+			err = seg6_do_srh_encap(skb, tinfo->srh,
+						IPPROTO_ETHERNET);
+		else
+			err = seg6_do_srh_encap_red(skb, tinfo->srh,
+						    IPPROTO_ETHERNET);
+
 		if (err)
 			return err;
 
@@ -643,6 +651,8 @@ static int seg6_build_state(struct net *net, struct nlattr *nla,
 		break;
 	case SEG6_IPTUN_MODE_ENCAP_RED:
 		break;
+	case SEG6_IPTUN_MODE_L2ENCAP_RED:
+		break;
 	default:
 		return -EINVAL;
 	}
-- 
cgit 


From 8a061562e2f2b32bfb5bff5bf3afc64e37d95a27 Mon Sep 17 00:00:00 2001
From: Anup Patel <apatel@ventanamicro.com>
Date: Fri, 29 Jul 2022 17:14:53 +0530
Subject: RISC-V: KVM: Add extensible CSR emulation framework

We add an extensible CSR emulation framework which is based upon the
existing system instruction emulation. This will be useful to upcoming
AIA, PMU, Nested and other virtualization features.

The CSR emulation framework also has provision to emulate CSR in user
space but this will be used only in very specific cases such as AIA
IMSIC CSR emulation in user space or vendor specific CSR emulation
in user space.

By default, all CSRs not handled by KVM RISC-V will be redirected back
to Guest VCPU as illegal instruction trap.

Signed-off-by: Anup Patel <apatel@ventanamicro.com>
Signed-off-by: Anup Patel <anup@brainfault.org>
---
 arch/riscv/include/asm/kvm_host.h      |   5 +
 arch/riscv/include/asm/kvm_vcpu_insn.h |   6 ++
 arch/riscv/kvm/vcpu.c                  |  34 ++++---
 arch/riscv/kvm/vcpu_insn.c             | 172 ++++++++++++++++++++++++++++++++-
 include/uapi/linux/kvm.h               |   8 ++
 5 files changed, 209 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
index d482d771eba4..59a0cf2ca7b9 100644
--- a/arch/riscv/include/asm/kvm_host.h
+++ b/arch/riscv/include/asm/kvm_host.h
@@ -65,6 +65,8 @@ struct kvm_vcpu_stat {
 	u64 wfi_exit_stat;
 	u64 mmio_exit_user;
 	u64 mmio_exit_kernel;
+	u64 csr_exit_user;
+	u64 csr_exit_kernel;
 	u64 exits;
 };
 
@@ -210,6 +212,9 @@ struct kvm_vcpu_arch {
 	/* MMIO instruction details */
 	struct kvm_mmio_decode mmio_decode;
 
+	/* CSR instruction details */
+	struct kvm_csr_decode csr_decode;
+
 	/* SBI context */
 	struct kvm_sbi_context sbi_context;
 
diff --git a/arch/riscv/include/asm/kvm_vcpu_insn.h b/arch/riscv/include/asm/kvm_vcpu_insn.h
index 3351eb61a251..350011c83581 100644
--- a/arch/riscv/include/asm/kvm_vcpu_insn.h
+++ b/arch/riscv/include/asm/kvm_vcpu_insn.h
@@ -18,6 +18,11 @@ struct kvm_mmio_decode {
 	int return_handled;
 };
 
+struct kvm_csr_decode {
+	unsigned long insn;
+	int return_handled;
+};
+
 /* Return values used by function emulating a particular instruction */
 enum kvm_insn_return {
 	KVM_INSN_EXIT_TO_USER_SPACE = 0,
@@ -28,6 +33,7 @@ enum kvm_insn_return {
 };
 
 void kvm_riscv_vcpu_wfi(struct kvm_vcpu *vcpu);
+int kvm_riscv_vcpu_csr_return(struct kvm_vcpu *vcpu, struct kvm_run *run);
 int kvm_riscv_vcpu_virtual_insn(struct kvm_vcpu *vcpu, struct kvm_run *run,
 				struct kvm_cpu_trap *trap);
 
diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
index a3c051cb070c..3c95924d38c7 100644
--- a/arch/riscv/kvm/vcpu.c
+++ b/arch/riscv/kvm/vcpu.c
@@ -26,6 +26,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
 	STATS_DESC_COUNTER(VCPU, wfi_exit_stat),
 	STATS_DESC_COUNTER(VCPU, mmio_exit_user),
 	STATS_DESC_COUNTER(VCPU, mmio_exit_kernel),
+	STATS_DESC_COUNTER(VCPU, csr_exit_user),
+	STATS_DESC_COUNTER(VCPU, csr_exit_kernel),
 	STATS_DESC_COUNTER(VCPU, exits)
 };
 
@@ -899,22 +901,26 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 
 	kvm_vcpu_srcu_read_lock(vcpu);
 
-	/* Process MMIO value returned from user-space */
-	if (run->exit_reason == KVM_EXIT_MMIO) {
+	switch (run->exit_reason) {
+	case KVM_EXIT_MMIO:
+		/* Process MMIO value returned from user-space */
 		ret = kvm_riscv_vcpu_mmio_return(vcpu, vcpu->run);
-		if (ret) {
-			kvm_vcpu_srcu_read_unlock(vcpu);
-			return ret;
-		}
-	}
-
-	/* Process SBI value returned from user-space */
-	if (run->exit_reason == KVM_EXIT_RISCV_SBI) {
+		break;
+	case KVM_EXIT_RISCV_SBI:
+		/* Process SBI value returned from user-space */
 		ret = kvm_riscv_vcpu_sbi_return(vcpu, vcpu->run);
-		if (ret) {
-			kvm_vcpu_srcu_read_unlock(vcpu);
-			return ret;
-		}
+		break;
+	case KVM_EXIT_RISCV_CSR:
+		/* Process CSR value returned from user-space */
+		ret = kvm_riscv_vcpu_csr_return(vcpu, vcpu->run);
+		break;
+	default:
+		ret = 0;
+		break;
+	}
+	if (ret) {
+		kvm_vcpu_srcu_read_unlock(vcpu);
+		return ret;
 	}
 
 	if (run->immediate_exit) {
diff --git a/arch/riscv/kvm/vcpu_insn.c b/arch/riscv/kvm/vcpu_insn.c
index 75ca62a7fba5..7eb90a47b571 100644
--- a/arch/riscv/kvm/vcpu_insn.c
+++ b/arch/riscv/kvm/vcpu_insn.c
@@ -14,6 +14,19 @@
 #define INSN_MASK_WFI		0xffffffff
 #define INSN_MATCH_WFI		0x10500073
 
+#define INSN_MATCH_CSRRW	0x1073
+#define INSN_MASK_CSRRW		0x707f
+#define INSN_MATCH_CSRRS	0x2073
+#define INSN_MASK_CSRRS		0x707f
+#define INSN_MATCH_CSRRC	0x3073
+#define INSN_MASK_CSRRC		0x707f
+#define INSN_MATCH_CSRRWI	0x5073
+#define INSN_MASK_CSRRWI	0x707f
+#define INSN_MATCH_CSRRSI	0x6073
+#define INSN_MASK_CSRRSI	0x707f
+#define INSN_MATCH_CSRRCI	0x7073
+#define INSN_MASK_CSRRCI	0x707f
+
 #define INSN_MATCH_LB		0x3
 #define INSN_MASK_LB		0x707f
 #define INSN_MATCH_LH		0x1003
@@ -71,6 +84,7 @@
 #define SH_RS1			15
 #define SH_RS2			20
 #define SH_RS2C			2
+#define MASK_RX			0x1f
 
 #define RV_X(x, s, n)		(((x) >> (s)) & ((1 << (n)) - 1))
 #define RVC_LW_IMM(x)		((RV_X(x, 6, 1) << 2) | \
@@ -104,7 +118,7 @@
 #define REG_PTR(insn, pos, regs)	\
 	((ulong *)((ulong)(regs) + REG_OFFSET(insn, pos)))
 
-#define GET_RM(insn)		(((insn) >> 12) & 7)
+#define GET_FUNCT3(insn)	(((insn) >> 12) & 7)
 
 #define GET_RS1(insn, regs)	(*REG_PTR(insn, SH_RS1, regs))
 #define GET_RS2(insn, regs)	(*REG_PTR(insn, SH_RS2, regs))
@@ -116,7 +130,6 @@
 #define IMM_I(insn)		((s32)(insn) >> 20)
 #define IMM_S(insn)		(((s32)(insn) >> 25 << 5) | \
 				 (s32)(((insn) >> 7) & 0x1f))
-#define MASK_FUNCT3		0x7000
 
 struct insn_func {
 	unsigned long mask;
@@ -189,7 +202,162 @@ static int wfi_insn(struct kvm_vcpu *vcpu, struct kvm_run *run, ulong insn)
 	return KVM_INSN_CONTINUE_NEXT_SEPC;
 }
 
+struct csr_func {
+	unsigned int base;
+	unsigned int count;
+	/*
+	 * Possible return values are as same as "func" callback in
+	 * "struct insn_func".
+	 */
+	int (*func)(struct kvm_vcpu *vcpu, unsigned int csr_num,
+		    unsigned long *val, unsigned long new_val,
+		    unsigned long wr_mask);
+};
+
+static const struct csr_func csr_funcs[] = { };
+
+/**
+ * kvm_riscv_vcpu_csr_return -- Handle CSR read/write after user space
+ *				emulation or in-kernel emulation
+ *
+ * @vcpu: The VCPU pointer
+ * @run:  The VCPU run struct containing the CSR data
+ *
+ * Returns > 0 upon failure and 0 upon success
+ */
+int kvm_riscv_vcpu_csr_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+	ulong insn;
+
+	if (vcpu->arch.csr_decode.return_handled)
+		return 0;
+	vcpu->arch.csr_decode.return_handled = 1;
+
+	/* Update destination register for CSR reads */
+	insn = vcpu->arch.csr_decode.insn;
+	if ((insn >> SH_RD) & MASK_RX)
+		SET_RD(insn, &vcpu->arch.guest_context,
+		       run->riscv_csr.ret_value);
+
+	/* Move to next instruction */
+	vcpu->arch.guest_context.sepc += INSN_LEN(insn);
+
+	return 0;
+}
+
+static int csr_insn(struct kvm_vcpu *vcpu, struct kvm_run *run, ulong insn)
+{
+	int i, rc = KVM_INSN_ILLEGAL_TRAP;
+	unsigned int csr_num = insn >> SH_RS2;
+	unsigned int rs1_num = (insn >> SH_RS1) & MASK_RX;
+	ulong rs1_val = GET_RS1(insn, &vcpu->arch.guest_context);
+	const struct csr_func *tcfn, *cfn = NULL;
+	ulong val = 0, wr_mask = 0, new_val = 0;
+
+	/* Decode the CSR instruction */
+	switch (GET_FUNCT3(insn)) {
+	case GET_FUNCT3(INSN_MATCH_CSRRW):
+		wr_mask = -1UL;
+		new_val = rs1_val;
+		break;
+	case GET_FUNCT3(INSN_MATCH_CSRRS):
+		wr_mask = rs1_val;
+		new_val = -1UL;
+		break;
+	case GET_FUNCT3(INSN_MATCH_CSRRC):
+		wr_mask = rs1_val;
+		new_val = 0;
+		break;
+	case GET_FUNCT3(INSN_MATCH_CSRRWI):
+		wr_mask = -1UL;
+		new_val = rs1_num;
+		break;
+	case GET_FUNCT3(INSN_MATCH_CSRRSI):
+		wr_mask = rs1_num;
+		new_val = -1UL;
+		break;
+	case GET_FUNCT3(INSN_MATCH_CSRRCI):
+		wr_mask = rs1_num;
+		new_val = 0;
+		break;
+	default:
+		return rc;
+	}
+
+	/* Save instruction decode info */
+	vcpu->arch.csr_decode.insn = insn;
+	vcpu->arch.csr_decode.return_handled = 0;
+
+	/* Update CSR details in kvm_run struct */
+	run->riscv_csr.csr_num = csr_num;
+	run->riscv_csr.new_value = new_val;
+	run->riscv_csr.write_mask = wr_mask;
+	run->riscv_csr.ret_value = 0;
+
+	/* Find in-kernel CSR function */
+	for (i = 0; i < ARRAY_SIZE(csr_funcs); i++) {
+		tcfn = &csr_funcs[i];
+		if ((tcfn->base <= csr_num) &&
+		    (csr_num < (tcfn->base + tcfn->count))) {
+			cfn = tcfn;
+			break;
+		}
+	}
+
+	/* First try in-kernel CSR emulation */
+	if (cfn && cfn->func) {
+		rc = cfn->func(vcpu, csr_num, &val, new_val, wr_mask);
+		if (rc > KVM_INSN_EXIT_TO_USER_SPACE) {
+			if (rc == KVM_INSN_CONTINUE_NEXT_SEPC) {
+				run->riscv_csr.ret_value = val;
+				vcpu->stat.csr_exit_kernel++;
+				kvm_riscv_vcpu_csr_return(vcpu, run);
+				rc = KVM_INSN_CONTINUE_SAME_SEPC;
+			}
+			return rc;
+		}
+	}
+
+	/* Exit to user-space for CSR emulation */
+	if (rc <= KVM_INSN_EXIT_TO_USER_SPACE) {
+		vcpu->stat.csr_exit_user++;
+		run->exit_reason = KVM_EXIT_RISCV_CSR;
+	}
+
+	return rc;
+}
+
 static const struct insn_func system_opcode_funcs[] = {
+	{
+		.mask  = INSN_MASK_CSRRW,
+		.match = INSN_MATCH_CSRRW,
+		.func  = csr_insn,
+	},
+	{
+		.mask  = INSN_MASK_CSRRS,
+		.match = INSN_MATCH_CSRRS,
+		.func  = csr_insn,
+	},
+	{
+		.mask  = INSN_MASK_CSRRC,
+		.match = INSN_MATCH_CSRRC,
+		.func  = csr_insn,
+	},
+	{
+		.mask  = INSN_MASK_CSRRWI,
+		.match = INSN_MATCH_CSRRWI,
+		.func  = csr_insn,
+	},
+	{
+		.mask  = INSN_MASK_CSRRSI,
+		.match = INSN_MATCH_CSRRSI,
+		.func  = csr_insn,
+	},
+	{
+		.mask  = INSN_MASK_CSRRCI,
+		.match = INSN_MATCH_CSRRCI,
+		.func  = csr_insn,
+	},
 	{
 		.mask  = INSN_MASK_WFI,
 		.match = INSN_MATCH_WFI,
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 860f867c50c0..0c1f42a40fd3 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -270,6 +270,7 @@ struct kvm_xen_exit {
 #define KVM_EXIT_X86_BUS_LOCK     33
 #define KVM_EXIT_XEN              34
 #define KVM_EXIT_RISCV_SBI        35
+#define KVM_EXIT_RISCV_CSR        36
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -496,6 +497,13 @@ struct kvm_run {
 			unsigned long args[6];
 			unsigned long ret[2];
 		} riscv_sbi;
+		/* KVM_EXIT_RISCV_CSR */
+		struct {
+			unsigned long csr_num;
+			unsigned long new_value;
+			unsigned long write_mask;
+			unsigned long ret_value;
+		} riscv_csr;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
-- 
cgit 


From 4ab0e470c06dc741f6583578da44961669331b78 Mon Sep 17 00:00:00 2001
From: Anup Patel <apatel@ventanamicro.com>
Date: Fri, 29 Jul 2022 17:15:00 +0530
Subject: KVM: Add gfp_custom flag in struct kvm_mmu_memory_cache

The kvm_mmu_topup_memory_cache() always uses GFP_KERNEL_ACCOUNT for
memory allocation which prevents it's use in atomic context. To address
this limitation of kvm_mmu_topup_memory_cache(), we add gfp_custom flag
in struct kvm_mmu_memory_cache. When the gfp_custom flag is set to some
GFP_xyz flags, the kvm_mmu_topup_memory_cache() will use that instead of
GFP_KERNEL_ACCOUNT.

Signed-off-by: Anup Patel <apatel@ventanamicro.com>
Reviewed-by: Atish Patra <atishp@rivosinc.com>
Signed-off-by: Anup Patel <anup@brainfault.org>
---
 include/linux/kvm_types.h | 1 +
 virt/kvm/kvm_main.c       | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index ac1ebb37a0ff..1dcfba68076a 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -87,6 +87,7 @@ struct gfn_to_pfn_cache {
 struct kvm_mmu_memory_cache {
 	int nobjs;
 	gfp_t gfp_zero;
+	gfp_t gfp_custom;
 	struct kmem_cache *kmem_cache;
 	void *objects[KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE];
 };
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a49df8988cd6..e3a6f7647474 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -386,7 +386,9 @@ int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
 	if (mc->nobjs >= min)
 		return 0;
 	while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
-		obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
+		obj = mmu_memory_cache_alloc_obj(mc, (mc->gfp_custom) ?
+						 mc->gfp_custom :
+						 GFP_KERNEL_ACCOUNT);
 		if (!obj)
 			return mc->nobjs >= min ? 0 : -ENOMEM;
 		mc->objects[mc->nobjs++] = obj;
-- 
cgit 


From 933c5a4f87d92a865d1db76caf190f1a4a1927f9 Mon Sep 17 00:00:00 2001
From: Stafford Horne <shorne@gmail.com>
Date: Sat, 23 Jul 2022 06:22:48 +0900
Subject: PCI: Stub __pci_ioport_map() for arches that don't support it at all

When building OpenRISC PCI, which has no ioport_map(), we get the following
build error:

  lib/pci_iomap.c: In function 'pci_iomap_range':
    CC      drivers/i2c/i2c-core-base.o
  ./include/asm-generic/pci_iomap.h:29:41: error: implicit declaration of function 'ioport_map'; did you mean 'ioremap'? [-Werror=implicit-function-declaration]
     29 | #define __pci_ioport_map(dev, port, nr) ioport_map((port), (nr))
	|                                         ^~~~~~~~~~
  lib/pci_iomap.c:44:24: note: in expansion of macro '__pci_ioport_map'
     44 |                 return __pci_ioport_map(dev, start, len);
	|                        ^~~~~~~~~~~~~~~~

Add a NULL definition of __pci_ioport_map() for architectures that do not
support ioport_map().

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20220722212248.802500-1-shorne@gmail.com
Signed-off-by: Stafford Horne <shorne@gmail.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/asm-generic/pci_iomap.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/asm-generic/pci_iomap.h b/include/asm-generic/pci_iomap.h
index 5a2f9bf53384..8fbb0a55545d 100644
--- a/include/asm-generic/pci_iomap.h
+++ b/include/asm-generic/pci_iomap.h
@@ -25,6 +25,8 @@ extern void pci_iounmap(struct pci_dev *dev, void __iomem *);
 #ifdef CONFIG_NO_GENERIC_PCI_IOPORT_MAP
 extern void __iomem *__pci_ioport_map(struct pci_dev *dev, unsigned long port,
 				      unsigned int nr);
+#elif !defined(CONFIG_HAS_IOPORT_MAP)
+#define __pci_ioport_map(dev, port, nr) NULL
 #else
 #define __pci_ioport_map(dev, port, nr) ioport_map((port), (nr))
 #endif
-- 
cgit 


From 0ad722f159e44983ddea1929ffd90d0c20a86f24 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 15 Jul 2022 17:36:16 +0200
Subject: PCI: Remove pci_mmap_page_range() wrapper

The ARCH_GENERIC_PCI_MMAP_RESOURCE symbol came up in a recent discussion,
and I noticed that this was left behind by an unfinished cleanup from 2017.

The only architecture that still relies on providing its own
pci_mmap_page_range() helper instead of using the generic
pci_mmap_resource_range() is sparc. Presumably the reasons for this have
not changed, but at least this can be simplified by converting sparc to use
the same interface as the others.

The only difference between the two is the device-specific offset that gets
added to or subtracted from vma->vm_pgoff.

Change the only caller of pci_mmap_page_range() in common code to subtract
this offset and call the modern interface, while adding it back in the
sparc implementation to preserve the existing behavior.

This removes the complexities of the dual interfaces from the common code,
and keeps it all specific to the sparc architecture code. According to
David Miller, the sparc code lets user space poke into the VGA I/O port
registers by mmapping the I/O space of the parent bridge device, which is
something that the generic pci_mmap_resource_range() code apparently does
not.

Link: https://lore.kernel.org/lkml/1519887203.622.3.camel@infradead.org/t/
Link: https://lore.kernel.org/lkml/20220714214657.2402250-3-shorne@gmail.com/
Link: https://lore.kernel.org/r/20220715153617.3393420-1-arnd@kernel.org
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Stafford Horne <shorne@gmail.com>
---
 Documentation/PCI/sysfs-pci.rst |  2 +-
 arch/sparc/kernel/pci.c         | 13 ++++++++----
 drivers/pci/mmap.c              | 44 -----------------------------------------
 drivers/pci/proc.c              |  7 ++++++-
 include/linux/pci.h             | 12 +----------
 5 files changed, 17 insertions(+), 61 deletions(-)

(limited to 'include')

diff --git a/Documentation/PCI/sysfs-pci.rst b/Documentation/PCI/sysfs-pci.rst
index 742fbd21dc1f..f495185aa88a 100644
--- a/Documentation/PCI/sysfs-pci.rst
+++ b/Documentation/PCI/sysfs-pci.rst
@@ -125,7 +125,7 @@ implementation of that functionality. To support the historical interface of
 mmap() through files in /proc/bus/pci, platforms may also set HAVE_PCI_MMAP.
 
 Alternatively, platforms which set HAVE_PCI_MMAP may provide their own
-implementation of pci_mmap_page_range() instead of defining
+implementation of pci_mmap_resource_range() instead of defining
 ARCH_GENERIC_PCI_MMAP_RESOURCE.
 
 Platforms which support write-combining maps of PCI resources must define
diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c
index 31b0c1983286..f580db840bf7 100644
--- a/arch/sparc/kernel/pci.c
+++ b/arch/sparc/kernel/pci.c
@@ -876,17 +876,22 @@ static void __pci_mmap_set_pgprot(struct pci_dev *dev, struct vm_area_struct *vm
 
 /* Perform the actual remap of the pages for a PCI device mapping, as appropriate
  * for this architecture.  The region in the process to map is described by vm_start
- * and vm_end members of VMA, the base physical address is found in vm_pgoff.
+ * and vm_end members of VMA, the BAR relative address is found in vm_pgoff.
  * The pci device structure is provided so that architectures may make mapping
  * decisions on a per-device or per-bus basis.
  *
  * Returns a negative error code on failure, zero on success.
  */
-int pci_mmap_page_range(struct pci_dev *dev, int bar,
-			struct vm_area_struct *vma,
-			enum pci_mmap_state mmap_state, int write_combine)
+int pci_mmap_resource_range(struct pci_dev *dev, int bar,
+			    struct vm_area_struct *vma,
+			    enum pci_mmap_state mmap_state, int write_combine)
 {
 	int ret;
+	resource_size_t start, end;
+
+	/* convert per-BAR address to PCI bus address */
+	pci_resource_to_user(dev, bar, &dev->resource[bar], &start, &end);
+	vma->vm_pgoff += start >> PAGE_SHIFT;
 
 	ret = __pci_mmap_make_offset(dev, vma, mmap_state);
 	if (ret < 0)
diff --git a/drivers/pci/mmap.c b/drivers/pci/mmap.c
index b8c9011987f4..4504039056d1 100644
--- a/drivers/pci/mmap.c
+++ b/drivers/pci/mmap.c
@@ -13,27 +13,6 @@
 
 #ifdef ARCH_GENERIC_PCI_MMAP_RESOURCE
 
-/*
- * Modern setup: generic pci_mmap_resource_range(), and implement the legacy
- * pci_mmap_page_range() (if needed) as a wrapper round it.
- */
-
-#ifdef HAVE_PCI_MMAP
-int pci_mmap_page_range(struct pci_dev *pdev, int bar,
-			struct vm_area_struct *vma,
-			enum pci_mmap_state mmap_state, int write_combine)
-{
-	resource_size_t start, end;
-
-	pci_resource_to_user(pdev, bar, &pdev->resource[bar], &start, &end);
-
-	/* Adjust vm_pgoff to be the offset within the resource */
-	vma->vm_pgoff -= start >> PAGE_SHIFT;
-	return pci_mmap_resource_range(pdev, bar, vma, mmap_state,
-				       write_combine);
-}
-#endif
-
 static const struct vm_operations_struct pci_phys_vm_ops = {
 #ifdef CONFIG_HAVE_IOREMAP_PROT
 	.access = generic_access_phys,
@@ -70,27 +49,4 @@ int pci_mmap_resource_range(struct pci_dev *pdev, int bar,
 				  vma->vm_page_prot);
 }
 
-#elif defined(HAVE_PCI_MMAP) /* && !ARCH_GENERIC_PCI_MMAP_RESOURCE */
-
-/*
- * Legacy setup: Implement pci_mmap_resource_range() as a wrapper around
- * the architecture's pci_mmap_page_range(), converting to "user visible"
- * addresses as necessary.
- */
-
-int pci_mmap_resource_range(struct pci_dev *pdev, int bar,
-			    struct vm_area_struct *vma,
-			    enum pci_mmap_state mmap_state, int write_combine)
-{
-	resource_size_t start, end;
-
-	/*
-	 * pci_mmap_page_range() expects the same kind of entry as coming
-	 * from /proc/bus/pci/ which is a "user visible" value. If this is
-	 * different from the resource itself, arch will do necessary fixup.
-	 */
-	pci_resource_to_user(pdev, bar, &pdev->resource[bar], &start, &end);
-	vma->vm_pgoff += start >> PAGE_SHIFT;
-	return pci_mmap_page_range(pdev, bar, vma, mmap_state, write_combine);
-}
 #endif
diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c
index 31b26d8ea6cc..f967709082d6 100644
--- a/drivers/pci/proc.c
+++ b/drivers/pci/proc.c
@@ -244,6 +244,7 @@ static int proc_bus_pci_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct pci_dev *dev = pde_data(file_inode(file));
 	struct pci_filp_private *fpriv = file->private_data;
+	resource_size_t start, end;
 	int i, ret, write_combine = 0, res_bit = IORESOURCE_MEM;
 
 	if (!capable(CAP_SYS_RAWIO) ||
@@ -278,7 +279,11 @@ static int proc_bus_pci_mmap(struct file *file, struct vm_area_struct *vma)
 	    iomem_is_exclusive(dev->resource[i].start))
 		return -EINVAL;
 
-	ret = pci_mmap_page_range(dev, i, vma,
+	pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+
+	/* Adjust vm_pgoff to be the offset within the resource */
+	vma->vm_pgoff -= start >> PAGE_SHIFT;
+	ret = pci_mmap_resource_range(dev, i, vma,
 				  fpriv->mmap_state, write_combine);
 	if (ret < 0)
 		return ret;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 81a57b498f22..060af91bafcd 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1909,24 +1909,14 @@ pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
 
 #include <asm/pci.h>
 
-/* These two functions provide almost identical functionality. Depending
- * on the architecture, one will be implemented as a wrapper around the
- * other (in drivers/pci/mmap.c).
- *
+/*
  * pci_mmap_resource_range() maps a specific BAR, and vm->vm_pgoff
  * is expected to be an offset within that region.
  *
- * pci_mmap_page_range() is the legacy architecture-specific interface,
- * which accepts a "user visible" resource address converted by
- * pci_resource_to_user(), as used in the legacy mmap() interface in
- * /proc/bus/pci/.
  */
 int pci_mmap_resource_range(struct pci_dev *dev, int bar,
 			    struct vm_area_struct *vma,
 			    enum pci_mmap_state mmap_state, int write_combine);
-int pci_mmap_page_range(struct pci_dev *pdev, int bar,
-			struct vm_area_struct *vma,
-			enum pci_mmap_state mmap_state, int write_combine);
 
 #ifndef arch_can_pci_mmap_wc
 #define arch_can_pci_mmap_wc()		0
-- 
cgit 


From 909fcb195201f7c3aa81523cc182a4c9b52210e5 Mon Sep 17 00:00:00 2001
From: Marijn Suijten <marijn.suijten@somainline.org>
Date: Thu, 30 Jun 2022 00:53:21 +0200
Subject: clk: divider: Introduce devm_clk_hw_register_divider_parent_hw()

Add the devres variant of clk_hw_register_divider_parent_hw() for
registering a divider clock with clk_hw parent pointer instead of parent
name.

Signed-off-by: Marijn Suijten <marijn.suijten@somainline.org>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Link: https://lore.kernel.org/r/20220629225331.357308-2-marijn.suijten@somainline.org
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/linux/clk-provider.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index c10dc4c659e2..4e07621849e6 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -831,6 +831,25 @@ struct clk *clk_register_divider_table(struct device *dev, const char *name,
 	__devm_clk_hw_register_divider((dev), NULL, (name), (parent_name), NULL,   \
 				  NULL, (flags), (reg), (shift), (width),     \
 				  (clk_divider_flags), NULL, (lock))
+/**
+ * devm_clk_hw_register_divider_parent_hw - register a divider clock with the clock framework
+ * @dev: device registering this clock
+ * @name: name of this clock
+ * @parent_hw: pointer to parent clk
+ * @flags: framework-specific flags
+ * @reg: register address to adjust divider
+ * @shift: number of bits to shift the bitfield
+ * @width: width of the bitfield
+ * @clk_divider_flags: divider-specific flags for this clock
+ * @lock: shared register lock for this clock
+ */
+#define devm_clk_hw_register_divider_parent_hw(dev, name, parent_hw, flags,   \
+					       reg, shift, width,	      \
+					       clk_divider_flags, lock)       \
+	__devm_clk_hw_register_divider((dev), NULL, (name), NULL,	      \
+				       (parent_hw), NULL, (flags), (reg),     \
+				       (shift), (width), (clk_divider_flags), \
+				       NULL, (lock))
 /**
  * devm_clk_hw_register_divider_table - register a table based divider clock
  * with the clock framework (devres variant)
-- 
cgit 


From df63af17f3375239e0be124844f304b4a4b60665 Mon Sep 17 00:00:00 2001
From: Marijn Suijten <marijn.suijten@somainline.org>
Date: Thu, 30 Jun 2022 00:53:22 +0200
Subject: clk: mux: Introduce devm_clk_hw_register_mux_parent_hws()

Add the devres variant of clk_hw_register_mux_hws() for registering a
mux clock with clk_hw parent pointers instead of parent names.

Signed-off-by: Marijn Suijten <marijn.suijten@somainline.org>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Link: https://lore.kernel.org/r/20220629225331.357308-3-marijn.suijten@somainline.org
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/linux/clk-provider.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 4e07621849e6..316c7e082934 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -980,6 +980,13 @@ struct clk *clk_register_mux_table(struct device *dev, const char *name,
 			      (parent_names), NULL, NULL, (flags), (reg),     \
 			      (shift), BIT((width)) - 1, (clk_mux_flags),     \
 			      NULL, (lock))
+#define devm_clk_hw_register_mux_parent_hws(dev, name, parent_hws,	      \
+					    num_parents, flags, reg, shift,   \
+					    width, clk_mux_flags, lock)       \
+	__devm_clk_hw_register_mux((dev), NULL, (name), (num_parents), NULL,  \
+				   (parent_hws), NULL, (flags), (reg),        \
+				   (shift), BIT((width)) - 1,		      \
+				   (clk_mux_flags), NULL, (lock))
 
 int clk_mux_val_to_index(struct clk_hw *hw, const u32 *table, unsigned int flags,
 			 unsigned int val);
-- 
cgit 


From 6ebd5247ad2aa210b3ff4481c6ed8aa32a032b12 Mon Sep 17 00:00:00 2001
From: Marijn Suijten <marijn.suijten@somainline.org>
Date: Thu, 30 Jun 2022 00:53:23 +0200
Subject: clk: fixed-factor: Introduce
 *clk_hw_register_fixed_factor_parent_hw()

Add the devres and non-devres variant of
clk_hw_register_fixed_factor_parent_hw() for registering a fixed factor
clock with clk_hw parent pointer instead of parent name.

Signed-off-by: Marijn Suijten <marijn.suijten@somainline.org>
Link: https://lore.kernel.org/r/20220629225331.357308-4-marijn.suijten@somainline.org
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/clk-fixed-factor.c | 45 +++++++++++++++++++++++++++++++++++++-----
 include/linux/clk-provider.h   |  8 ++++++++
 2 files changed, 48 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/clk/clk-fixed-factor.c b/drivers/clk/clk-fixed-factor.c
index e6b36247c16b..f734e34735a9 100644
--- a/drivers/clk/clk-fixed-factor.c
+++ b/drivers/clk/clk-fixed-factor.c
@@ -78,7 +78,8 @@ static void devm_clk_hw_register_fixed_factor_release(struct device *dev, void *
 
 static struct clk_hw *
 __clk_hw_register_fixed_factor(struct device *dev, struct device_node *np,
-		const char *name, const char *parent_name, int index,
+		const char *name, const char *parent_name,
+		const struct clk_hw *parent_hw, int index,
 		unsigned long flags, unsigned int mult, unsigned int div,
 		bool devm)
 {
@@ -110,6 +111,8 @@ __clk_hw_register_fixed_factor(struct device *dev, struct device_node *np,
 	init.flags = flags;
 	if (parent_name)
 		init.parent_names = &parent_name;
+	else if (parent_hw)
+		init.parent_hws = &parent_hw;
 	else
 		init.parent_data = &pdata;
 	init.num_parents = 1;
@@ -148,16 +151,48 @@ struct clk_hw *devm_clk_hw_register_fixed_factor_index(struct device *dev,
 		const char *name, unsigned int index, unsigned long flags,
 		unsigned int mult, unsigned int div)
 {
-	return __clk_hw_register_fixed_factor(dev, NULL, name, NULL, index,
+	return __clk_hw_register_fixed_factor(dev, NULL, name, NULL, NULL, index,
 					      flags, mult, div, true);
 }
 EXPORT_SYMBOL_GPL(devm_clk_hw_register_fixed_factor_index);
 
+/**
+ * devm_clk_hw_register_fixed_factor_parent_hw - Register a fixed factor clock with
+ * pointer to parent clock
+ * @dev: device that is registering this clock
+ * @name: name of this clock
+ * @parent_hw: pointer to parent clk
+ * @flags: fixed factor flags
+ * @mult: multiplier
+ * @div: divider
+ *
+ * Return: Pointer to fixed factor clk_hw structure that was registered or
+ * an error pointer.
+ */
+struct clk_hw *devm_clk_hw_register_fixed_factor_parent_hw(struct device *dev,
+		const char *name, const struct clk_hw *parent_hw,
+		unsigned long flags, unsigned int mult, unsigned int div)
+{
+	return __clk_hw_register_fixed_factor(dev, NULL, name, NULL, parent_hw,
+					      -1, flags, mult, div, true);
+}
+EXPORT_SYMBOL_GPL(devm_clk_hw_register_fixed_factor_parent_hw);
+
+struct clk_hw *clk_hw_register_fixed_factor_parent_hw(struct device *dev,
+		const char *name, const struct clk_hw *parent_hw,
+		unsigned long flags, unsigned int mult, unsigned int div)
+{
+	return __clk_hw_register_fixed_factor(dev, NULL, name, NULL,
+					      parent_hw, -1, flags, mult, div,
+					      false);
+}
+EXPORT_SYMBOL_GPL(clk_hw_register_fixed_factor_parent_hw);
+
 struct clk_hw *clk_hw_register_fixed_factor(struct device *dev,
 		const char *name, const char *parent_name, unsigned long flags,
 		unsigned int mult, unsigned int div)
 {
-	return __clk_hw_register_fixed_factor(dev, NULL, name, parent_name, -1,
+	return __clk_hw_register_fixed_factor(dev, NULL, name, parent_name, NULL, -1,
 					      flags, mult, div, false);
 }
 EXPORT_SYMBOL_GPL(clk_hw_register_fixed_factor);
@@ -204,7 +239,7 @@ struct clk_hw *devm_clk_hw_register_fixed_factor(struct device *dev,
 		const char *name, const char *parent_name, unsigned long flags,
 		unsigned int mult, unsigned int div)
 {
-	return __clk_hw_register_fixed_factor(dev, NULL, name, parent_name, -1,
+	return __clk_hw_register_fixed_factor(dev, NULL, name, parent_name, NULL, -1,
 			flags, mult, div, true);
 }
 EXPORT_SYMBOL_GPL(devm_clk_hw_register_fixed_factor);
@@ -231,7 +266,7 @@ static struct clk_hw *_of_fixed_factor_clk_setup(struct device_node *node)
 
 	of_property_read_string(node, "clock-output-names", &clk_name);
 
-	hw = __clk_hw_register_fixed_factor(NULL, node, clk_name, NULL, 0,
+	hw = __clk_hw_register_fixed_factor(NULL, node, clk_name, NULL, NULL, 0,
 					    0, mult, div, false);
 	if (IS_ERR(hw)) {
 		/*
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 316c7e082934..94458cb669f0 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -1032,6 +1032,14 @@ struct clk_hw *devm_clk_hw_register_fixed_factor(struct device *dev,
 struct clk_hw *devm_clk_hw_register_fixed_factor_index(struct device *dev,
 		const char *name, unsigned int index, unsigned long flags,
 		unsigned int mult, unsigned int div);
+
+struct clk_hw *devm_clk_hw_register_fixed_factor_parent_hw(struct device *dev,
+		const char *name, const struct clk_hw *parent_hw,
+		unsigned long flags, unsigned int mult, unsigned int div);
+
+struct clk_hw *clk_hw_register_fixed_factor_parent_hw(struct device *dev,
+		const char *name, const struct clk_hw *parent_hw,
+		unsigned long flags, unsigned int mult, unsigned int div);
 /**
  * struct clk_fractional_divider - adjustable fractional divider clock
  *
-- 
cgit 


From c770f31d8f580ed4b965c64f924ec1cc50e41734 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 19 Jul 2022 09:18:35 -0400
Subject: SUNRPC: Fix xdr_encode_bool()

I discovered that xdr_encode_bool() was returning the same address
that was passed in the @p parameter. The documenting comment states
that the intent is to return the address of the next buffer
location, just like the other "xdr_encode_*" helpers.

The result was the encoded results of NFSv3 PATHCONF operations were
not formed correctly.

Fixes: ded04a587f6c ("NFSD: Update the NFSv3 PATHCONF3res encoder to use struct xdr_stream")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 include/linux/sunrpc/xdr.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 5860f32e3958..986c8a17ca5e 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -419,8 +419,8 @@ static inline int xdr_stream_encode_item_absent(struct xdr_stream *xdr)
  */
 static inline __be32 *xdr_encode_bool(__be32 *p, u32 n)
 {
-	*p = n ? xdr_one : xdr_zero;
-	return p++;
+	*p++ = n ? xdr_one : xdr_zero;
+	return p;
 }
 
 /**
-- 
cgit 


From 184cefbe62627730c30282df12bcff9aae4816ea Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Mon, 13 Jun 2022 09:40:06 -0400
Subject: NLM: Defend against file_lock changes after vfs_test_lock()

Instead of trusting that struct file_lock returns completely unchanged
after vfs_test_lock() when there's no conflicting lock, stash away our
nlm_lockowner reference so we can properly release it for all cases.

This defends against another file_lock implementation overwriting fl_owner
when the return type is F_UNLCK.

Reported-by: Roberto Bergantinos Corpas <rbergant@redhat.com>
Tested-by: Roberto Bergantinos Corpas <rbergant@redhat.com>
Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/lockd/svc4proc.c         |  4 +++-
 fs/lockd/svclock.c          | 10 +---------
 fs/lockd/svcproc.c          |  5 ++++-
 include/linux/lockd/lockd.h |  1 +
 4 files changed, 9 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 176b468a61c7..4f247ab8be61 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -87,6 +87,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
 	struct nlm_args *argp = rqstp->rq_argp;
 	struct nlm_host	*host;
 	struct nlm_file	*file;
+	struct nlm_lockowner *test_owner;
 	__be32 rc = rpc_success;
 
 	dprintk("lockd: TEST4        called\n");
@@ -96,6 +97,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
 	if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
 		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
+	test_owner = argp->lock.fl.fl_owner;
 	/* Now check for conflicting locks */
 	resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie);
 	if (resp->status == nlm_drop_reply)
@@ -103,7 +105,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
 	else
 		dprintk("lockd: TEST4        status %d\n", ntohl(resp->status));
 
-	nlmsvc_release_lockowner(&argp->lock);
+	nlmsvc_put_lockowner(test_owner);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rc;
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index cb3658ab9b7a..9c1aa75441e1 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -340,7 +340,7 @@ nlmsvc_get_lockowner(struct nlm_lockowner *lockowner)
 	return lockowner;
 }
 
-static void nlmsvc_put_lockowner(struct nlm_lockowner *lockowner)
+void nlmsvc_put_lockowner(struct nlm_lockowner *lockowner)
 {
 	if (!refcount_dec_and_lock(&lockowner->count, &lockowner->host->h_lock))
 		return;
@@ -590,7 +590,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 	int			error;
 	int			mode;
 	__be32			ret;
-	struct nlm_lockowner	*test_owner;
 
 	dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n",
 				nlmsvc_file_inode(file)->i_sb->s_id,
@@ -604,9 +603,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 		goto out;
 	}
 
-	/* If there's a conflicting lock, remember to clean up the test lock */
-	test_owner = (struct nlm_lockowner *)lock->fl.fl_owner;
-
 	mode = lock_to_openmode(&lock->fl);
 	error = vfs_test_lock(file->f_file[mode], &lock->fl);
 	if (error) {
@@ -635,10 +631,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 	conflock->fl.fl_end = lock->fl.fl_end;
 	locks_release_private(&lock->fl);
 
-	/* Clean up the test lock */
-	lock->fl.fl_owner = NULL;
-	nlmsvc_put_lockowner(test_owner);
-
 	ret = nlm_lck_denied;
 out:
 	return ret;
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 4dc1b40a489a..b09ca35b527c 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -116,6 +116,7 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
 	struct nlm_args *argp = rqstp->rq_argp;
 	struct nlm_host	*host;
 	struct nlm_file	*file;
+	struct nlm_lockowner *test_owner;
 	__be32 rc = rpc_success;
 
 	dprintk("lockd: TEST          called\n");
@@ -125,6 +126,8 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
 	if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
 		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
+	test_owner = argp->lock.fl.fl_owner;
+
 	/* Now check for conflicting locks */
 	resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie));
 	if (resp->status == nlm_drop_reply)
@@ -133,7 +136,7 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
 		dprintk("lockd: TEST          status %d vers %d\n",
 			ntohl(resp->status), rqstp->rq_vers);
 
-	nlmsvc_release_lockowner(&argp->lock);
+	nlmsvc_put_lockowner(test_owner);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rc;
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index fcef192e5e45..70ce419e2709 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -292,6 +292,7 @@ void		  nlmsvc_locks_init_private(struct file_lock *, struct nlm_host *, pid_t);
 __be32		  nlm_lookup_file(struct svc_rqst *, struct nlm_file **,
 					struct nlm_lock *);
 void		  nlm_release_file(struct nlm_file *);
+void		  nlmsvc_put_lockowner(struct nlm_lockowner *);
 void		  nlmsvc_release_lockowner(struct nlm_lock *);
 void		  nlmsvc_mark_resources(struct net *);
 void		  nlmsvc_free_host_resources(struct nlm_host *);
-- 
cgit 


From 28fffa6c57906ffa07e84f12aae5d99993cafdae Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 21 Jun 2022 10:06:16 -0400
Subject: SUNRPC: Expand the svc_alloc_arg_err tracepoint

Record not only the number of pages requested, but the number of
pages that were actually allocated, to get a measure of progress
(or lack thereof).

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/trace/events/sunrpc.h | 14 +++++++++-----
 net/sunrpc/svc_xprt.c         |  2 +-
 2 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index b61d9c90fa26..5c48be033cc7 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -1989,20 +1989,24 @@ TRACE_EVENT(svc_wake_up,
 
 TRACE_EVENT(svc_alloc_arg_err,
 	TP_PROTO(
-		unsigned int pages
+		unsigned int requested,
+		unsigned int allocated
 	),
 
-	TP_ARGS(pages),
+	TP_ARGS(requested, allocated),
 
 	TP_STRUCT__entry(
-		__field(unsigned int, pages)
+		__field(unsigned int, requested)
+		__field(unsigned int, allocated)
 	),
 
 	TP_fast_assign(
-		__entry->pages = pages;
+		__entry->requested = requested;
+		__entry->allocated = allocated;
 	),
 
-	TP_printk("pages=%u", __entry->pages)
+	TP_printk("requested=%u allocated=%u",
+		__entry->requested, __entry->allocated)
 );
 
 DECLARE_EVENT_CLASS(svc_deferred_event,
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 2c4dd7ca95b0..2106003645a7 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -691,7 +691,7 @@ static int svc_alloc_arg(struct svc_rqst *rqstp)
 			set_current_state(TASK_RUNNING);
 			return -EINTR;
 		}
-		trace_svc_alloc_arg_err(pages);
+		trace_svc_alloc_arg_err(pages, ret);
 		memalloc_retry_wait(GFP_KERNEL);
 	}
 	rqstp->rq_page_end = &rqstp->rq_pages[pages];
-- 
cgit 


From 5304877936c0a67e1a01464d113bae4c81eacdb6 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 27 Jul 2022 14:40:03 -0400
Subject: NFSD: Fix strncpy() fortify warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In function ‘strncpy’,
    inlined from ‘nfsd4_ssc_setup_dul’ at /home/cel/src/linux/manet/fs/nfsd/nfs4proc.c:1392:3,
    inlined from ‘nfsd4_interssc_connect’ at /home/cel/src/linux/manet/fs/nfsd/nfs4proc.c:1489:11:
/home/cel/src/linux/manet/include/linux/fortify-string.h:52:33: warning: ‘__builtin_strncpy’ specified bound 63 equals destination size [-Wstringop-truncation]
   52 | #define __underlying_strncpy    __builtin_strncpy
      |                                 ^
/home/cel/src/linux/manet/include/linux/fortify-string.h:89:16: note: in expansion of macro ‘__underlying_strncpy’
   89 |         return __underlying_strncpy(p, q, size);
      |                ^~~~~~~~~~~~~~~~~~~~

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4proc.c      | 2 +-
 include/linux/nfs_ssc.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 5af9f8d1feb6..660bd49f3a32 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1389,7 +1389,7 @@ try_again:
 		return 0;
 	}
 	if (work) {
-		strncpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr));
+		strlcpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr) - 1);
 		refcount_set(&work->nsui_refcnt, 2);
 		work->nsui_busy = true;
 		list_add_tail(&work->nsui_list, &nn->nfsd_ssc_mount_list);
diff --git a/include/linux/nfs_ssc.h b/include/linux/nfs_ssc.h
index 222ae8883e85..75843c00f326 100644
--- a/include/linux/nfs_ssc.h
+++ b/include/linux/nfs_ssc.h
@@ -64,7 +64,7 @@ struct nfsd4_ssc_umount_item {
 	refcount_t nsui_refcnt;
 	unsigned long nsui_expire;
 	struct vfsmount *nsui_vfsmount;
-	char nsui_ipaddr[RPC_MAX_ADDRBUFLEN];
+	char nsui_ipaddr[RPC_MAX_ADDRBUFLEN + 1];
 };
 #endif
 
-- 
cgit 


From fef3e9066d19230f661048ca86937d954c12cd50 Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng <xiujianfeng@huawei.com>
Date: Thu, 14 Jul 2022 16:41:47 +0800
Subject: writeback: remove inode_to_wb_is_valid()

inode_to_wb_is_valid() is no longer used since commit fe55d563d417
("remove inode_congested()"), remove it.

Link: https://lkml.kernel.org/r/20220714084147.140324-1-xiujianfeng@huawei.com
Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/backing-dev.h | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index e84b745a6811..439815cc1ab9 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -229,18 +229,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
 	return wb;
 }
 
-/**
- * inode_to_wb_is_valid - test whether an inode has a wb associated
- * @inode: inode of interest
- *
- * Returns %true if @inode has a wb associated.  May be called without any
- * locking.
- */
-static inline bool inode_to_wb_is_valid(struct inode *inode)
-{
-	return inode->i_wb;
-}
-
 /**
  * inode_to_wb - determine the wb of an inode
  * @inode: inode of interest
@@ -339,11 +327,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
 	return &bdi->wb;
 }
 
-static inline bool inode_to_wb_is_valid(struct inode *inode)
-{
-	return true;
-}
-
 static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
 {
 	return &inode_to_bdi(inode)->wb;
-- 
cgit 


From 73b73bac90d97400e29e585c678c4d0ebfd2680d Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed@google.com>
Date: Thu, 14 Jul 2022 06:49:18 +0000
Subject: mm: vmpressure: don't count proactive reclaim in vmpressure

memory.reclaim is a cgroup v2 interface that allows users to proactively
reclaim memory from a memcg, without real memory pressure.  Reclaim
operations invoke vmpressure, which is used: (a) To notify userspace of
reclaim efficiency in cgroup v1, and (b) As a signal for a memcg being
under memory pressure for networking (see
mem_cgroup_under_socket_pressure()).

For (a), vmpressure notifications in v1 are not affected by this change
since memory.reclaim is a v2 feature.

For (b), the effects of the vmpressure signal (according to Shakeel [1])
are as follows:
1. Reducing send and receive buffers of the current socket.
2. May drop packets on the rx path.
3. May throttle current thread on the tx path.

Since proactive reclaim is invoked directly by userspace, not by memory
pressure, it makes sense not to throttle networking.  Hence, this change
makes sure that proactive reclaim caused by memory.reclaim does not
trigger vmpressure.

[1] https://lore.kernel.org/lkml/CALvZod68WdrXEmBpOkadhB5GPYmCXaDZzXH=yyGOCAjFRn4NDQ@mail.gmail.com/

[yosryahmed@google.com: update documentation]
  Link: https://lkml.kernel.org/r/20220721173015.2643248-1-yosryahmed@google.com
Link: https://lkml.kernel.org/r/20220714064918.2576464-1-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: NeilBrown <neilb@suse.de>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/cgroup-v2.rst |  7 +++++++
 include/linux/swap.h                    |  5 ++++-
 mm/memcontrol.c                         | 24 ++++++++++++++----------
 mm/vmscan.c                             | 27 +++++++++++++++++----------
 4 files changed, 42 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index ad9ba3ec90a5..376d0207d1f7 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1229,6 +1229,13 @@ PAGE_SIZE multiple when read back.
 	the target cgroup. If less bytes are reclaimed than the
 	specified amount, -EAGAIN is returned.
 
+	Please note that the proactive reclaim (triggered by this
+	interface) is not meant to indicate memory pressure on the
+	memory cgroup. Therefore socket memory balancing triggered by
+	the memory reclaim normally is not exercised in this case.
+	This means that the networking layer will not adapt based on
+	reclaim induced by memory.reclaim.
+
   memory.peak
 	A read-only single value file which exists on non-root
 	cgroups.
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 6d11c51b2b62..ea895b40e6ff 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -411,10 +411,13 @@ extern void lru_cache_add_inactive_or_unevictable(struct page *page,
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
+
+#define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
+#define MEMCG_RECLAIM_PROACTIVE (1 << 2)
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 						  unsigned long nr_pages,
 						  gfp_t gfp_mask,
-						  bool may_swap);
+						  unsigned int reclaim_options);
 extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
 						gfp_t gfp_mask, bool noswap,
 						pg_data_t *pgdat,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 767f49a6b987..2b831cc48c7d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2330,7 +2330,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
 
 		psi_memstall_enter(&pflags);
 		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
-							     gfp_mask, true);
+							gfp_mask,
+							MEMCG_RECLAIM_MAY_SWAP);
 		psi_memstall_leave(&pflags);
 	} while ((memcg = parent_mem_cgroup(memcg)) &&
 		 !mem_cgroup_is_root(memcg));
@@ -2575,7 +2576,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	struct page_counter *counter;
 	unsigned long nr_reclaimed;
 	bool passed_oom = false;
-	bool may_swap = true;
+	unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
 	bool drained = false;
 	bool raised_max_event = false;
 	unsigned long pflags;
@@ -2593,7 +2594,7 @@ retry:
 		mem_over_limit = mem_cgroup_from_counter(counter, memory);
 	} else {
 		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
-		may_swap = false;
+		reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
 	}
 
 	if (batch > nr_pages) {
@@ -2621,7 +2622,7 @@ retry:
 
 	psi_memstall_enter(&pflags);
 	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
-						    gfp_mask, may_swap);
+						    gfp_mask, reclaim_options);
 	psi_memstall_leave(&pflags);
 
 	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@@ -3439,8 +3440,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
 			continue;
 		}
 
-		if (!try_to_free_mem_cgroup_pages(memcg, 1,
-					GFP_KERNEL, !memsw)) {
+		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
+					memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
 			ret = -EBUSY;
 			break;
 		}
@@ -3550,7 +3551,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 		if (signal_pending(current))
 			return -EINTR;
 
-		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true))
+		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
+						  MEMCG_RECLAIM_MAY_SWAP))
 			nr_retries--;
 	}
 
@@ -6302,7 +6304,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
 		}
 
 		reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
-							 GFP_KERNEL, true);
+					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
 
 		if (!reclaimed && !nr_retries--)
 			break;
@@ -6351,7 +6353,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 
 		if (nr_reclaims) {
 			if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
-							  GFP_KERNEL, true))
+					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
 				nr_reclaims--;
 			continue;
 		}
@@ -6480,6 +6482,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
 	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
 	unsigned long nr_to_reclaim, nr_reclaimed = 0;
+	unsigned int reclaim_options;
 	int err;
 
 	buf = strstrip(buf);
@@ -6487,6 +6490,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 	if (err)
 		return err;
 
+	reclaim_options	= MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
 	while (nr_reclaimed < nr_to_reclaim) {
 		unsigned long reclaimed;
 
@@ -6503,7 +6507,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 
 		reclaimed = try_to_free_mem_cgroup_pages(memcg,
 						nr_to_reclaim - nr_reclaimed,
-						GFP_KERNEL, true);
+						GFP_KERNEL, reclaim_options);
 
 		if (!reclaimed && !nr_retries--)
 			return -EAGAIN;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fbb4108250ee..9e7d8db42918 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -101,6 +101,9 @@ struct scan_control {
 	/* Can pages be swapped as part of reclaim? */
 	unsigned int may_swap:1;
 
+	/* Proactive reclaim invoked by userspace through memory.reclaim */
+	unsigned int proactive:1;
+
 	/*
 	 * Cgroup memory below memory.low is protected as long as we
 	 * don't threaten to OOM. If any cgroup is reclaimed at
@@ -3180,9 +3183,10 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 			    sc->priority);
 
 		/* Record the group's reclaim efficiency */
-		vmpressure(sc->gfp_mask, memcg, false,
-			   sc->nr_scanned - scanned,
-			   sc->nr_reclaimed - reclaimed);
+		if (!sc->proactive)
+			vmpressure(sc->gfp_mask, memcg, false,
+				   sc->nr_scanned - scanned,
+				   sc->nr_reclaimed - reclaimed);
 
 	} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
 }
@@ -3305,9 +3309,10 @@ again:
 	}
 
 	/* Record the subtree's reclaim efficiency */
-	vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
-		   sc->nr_scanned - nr_scanned,
-		   sc->nr_reclaimed - nr_reclaimed);
+	if (!sc->proactive)
+		vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
+			   sc->nr_scanned - nr_scanned,
+			   sc->nr_reclaimed - nr_reclaimed);
 
 	if (sc->nr_reclaimed - nr_reclaimed)
 		reclaimable = true;
@@ -3589,8 +3594,9 @@ retry:
 		__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
 
 	do {
-		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
-				sc->priority);
+		if (!sc->proactive)
+			vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
+					sc->priority);
 		sc->nr_scanned = 0;
 		shrink_zones(zonelist, sc);
 
@@ -3880,7 +3886,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 					   unsigned long nr_pages,
 					   gfp_t gfp_mask,
-					   bool may_swap)
+					   unsigned int reclaim_options)
 {
 	unsigned long nr_reclaimed;
 	unsigned int noreclaim_flag;
@@ -3893,7 +3899,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.priority = DEF_PRIORITY,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
-		.may_swap = may_swap,
+		.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
+		.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
 	};
 	/*
 	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
-- 
cgit 


From e408e695f5f1f60d784913afc45ff2c387a5aeb8 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 14 Jul 2022 21:59:12 -0400
Subject: mm/shmem: support FS_IOC_[SG]ETFLAGS in tmpfs

This allows userspace to set flags like FS_APPEND_FL, FS_IMMUTABLE_FL,
FS_NODUMP_FL, etc., like all other standard Linux file systems.

[akpm@linux-foundation.org: fix CONFIG_TMPFS_XATTR=n warnings]
Link: https://lkml.kernel.org/r/20220715015912.2560575-1-tytso@mit.edu
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shmem_fs.h | 11 +++++++++
 mm/shmem.c               | 64 +++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 74 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index a68f982f22d1..1b6c4013f691 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -25,9 +25,20 @@ struct shmem_inode_info {
 	struct simple_xattrs	xattrs;		/* list of xattrs */
 	atomic_t		stop_eviction;	/* hold when working on inode */
 	struct timespec64	i_crtime;	/* file creation time */
+	unsigned int		fsflags;	/* flags for FS_IOC_[SG]ETFLAGS */
 	struct inode		vfs_inode;
 };
 
+#define SHMEM_FL_USER_VISIBLE FS_FL_USER_VISIBLE
+#define SHMEM_FL_USER_MODIFIABLE FS_FL_USER_MODIFIABLE
+#define SHMEM_FL_INHERITED FS_FL_USER_MODIFIABLE
+
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define SHMEM_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
+
+/* Flags that are appropriate for non-directories/regular files. */
+#define SHMEM_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
+
 struct shmem_sb_info {
 	unsigned long max_blocks;   /* How many blocks are allowed */
 	struct percpu_counter used_blocks;  /* How many are allocated */
diff --git a/mm/shmem.c b/mm/shmem.c
index 12ac67dc831f..06871a913b49 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -28,6 +28,7 @@
 #include <linux/ramfs.h>
 #include <linux/pagemap.h>
 #include <linux/file.h>
+#include <linux/fileattr.h>
 #include <linux/mm.h>
 #include <linux/random.h>
 #include <linux/sched/signal.h>
@@ -1058,6 +1059,15 @@ static int shmem_getattr(struct user_namespace *mnt_userns,
 		shmem_recalc_inode(inode);
 		spin_unlock_irq(&info->lock);
 	}
+	if (info->fsflags & FS_APPEND_FL)
+		stat->attributes |= STATX_ATTR_APPEND;
+	if (info->fsflags & FS_IMMUTABLE_FL)
+		stat->attributes |= STATX_ATTR_IMMUTABLE;
+	if (info->fsflags & FS_NODUMP_FL)
+		stat->attributes |= STATX_ATTR_NODUMP;
+	stat->attributes_mask |= (STATX_ATTR_APPEND |
+			STATX_ATTR_IMMUTABLE |
+			STATX_ATTR_NODUMP);
 	generic_fillattr(&init_user_ns, inode, stat);
 
 	if (shmem_is_huge(NULL, inode, 0))
@@ -2272,7 +2282,18 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
-static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
+/* Mask out flags that are inappropriate for the given type of inode. */
+static unsigned shmem_mask_flags(umode_t mode, __u32 flags)
+{
+	if (S_ISDIR(mode))
+		return flags;
+	else if (S_ISREG(mode))
+		return flags & SHMEM_REG_FLMASK;
+	else
+		return flags & SHMEM_OTHER_FLMASK;
+}
+
+static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir,
 				     umode_t mode, dev_t dev, unsigned long flags)
 {
 	struct inode *inode;
@@ -2297,6 +2318,9 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 		info->seals = F_SEAL_SEAL;
 		info->flags = flags & VM_NORESERVE;
 		info->i_crtime = inode->i_mtime;
+		info->fsflags = (dir == NULL) ? 0 :
+			SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
+		info->fsflags = shmem_mask_flags(mode, info->fsflags);
 		INIT_LIST_HEAD(&info->shrinklist);
 		INIT_LIST_HEAD(&info->swaplist);
 		simple_xattrs_init(&info->xattrs);
@@ -3138,6 +3162,40 @@ static const char *shmem_get_link(struct dentry *dentry,
 }
 
 #ifdef CONFIG_TMPFS_XATTR
+
+static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
+
+	fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE);
+
+	return 0;
+}
+
+static int shmem_fileattr_set(struct user_namespace *mnt_userns,
+			      struct dentry *dentry, struct fileattr *fa)
+{
+	struct inode *inode = d_inode(dentry);
+	struct shmem_inode_info *info = SHMEM_I(inode);
+
+	if (fileattr_has_fsx(fa))
+		return -EOPNOTSUPP;
+
+	info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
+		(fa->flags & SHMEM_FL_USER_MODIFIABLE);
+
+	inode->i_flags &= ~(S_APPEND | S_IMMUTABLE | S_NOATIME);
+	if (info->fsflags & FS_APPEND_FL)
+		inode->i_flags |= S_APPEND;
+	if (info->fsflags & FS_IMMUTABLE_FL)
+		inode->i_flags |= S_IMMUTABLE;
+	if (info->fsflags & FS_NOATIME_FL)
+		inode->i_flags |= S_NOATIME;
+
+	inode->i_ctime = current_time(inode);
+	return 0;
+}
+
 /*
  * Superblocks without xattr inode operations may get some security.* xattr
  * support from the LSM "for free". As soon as we have any other xattrs
@@ -3828,6 +3886,8 @@ static const struct inode_operations shmem_inode_operations = {
 #ifdef CONFIG_TMPFS_XATTR
 	.listxattr	= shmem_listxattr,
 	.set_acl	= simple_set_acl,
+	.fileattr_get	= shmem_fileattr_get,
+	.fileattr_set	= shmem_fileattr_set,
 #endif
 };
 
@@ -3847,6 +3907,8 @@ static const struct inode_operations shmem_dir_inode_operations = {
 #endif
 #ifdef CONFIG_TMPFS_XATTR
 	.listxattr	= shmem_listxattr,
+	.fileattr_get	= shmem_fileattr_get,
+	.fileattr_set	= shmem_fileattr_set,
 #endif
 #ifdef CONFIG_TMPFS_POSIX_ACL
 	.setattr	= shmem_setattr,
-- 
cgit 


From bb077c3ffd5362a6d9e60574e1bcc83fe8e3fb27 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Tue, 26 Jul 2022 21:18:16 +0800
Subject: mm: cleanup is_highmem()

It is unnecessary to add CONFIG_HIGHMEM check in is_highmem(), which has
been done in is_highmem_idx(), and move is_highmem() close to
is_highmem_idx().  This has no functional impact.

Link: https://lkml.kernel.org/r/20220726131816.149075-1-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 578247a341b2..e24b40c52468 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1137,15 +1137,6 @@ static inline int is_highmem_idx(enum zone_type idx)
 #endif
 }
 
-#ifdef CONFIG_ZONE_DMA
-bool has_managed_dma(void);
-#else
-static inline bool has_managed_dma(void)
-{
-	return false;
-}
-#endif
-
 /**
  * is_highmem - helper function to quickly check if a struct zone is a
  *              highmem zone or not.  This is an attempt to keep references
@@ -1155,12 +1146,17 @@ static inline bool has_managed_dma(void)
  */
 static inline int is_highmem(struct zone *zone)
 {
-#ifdef CONFIG_HIGHMEM
 	return is_highmem_idx(zone_idx(zone));
+}
+
+#ifdef CONFIG_ZONE_DMA
+bool has_managed_dma(void);
 #else
-	return 0;
-#endif
+static inline bool has_managed_dma(void)
+{
+	return false;
 }
+#endif
 
 /* These two functions are used to setup the per zone pages min values */
 struct ctl_table;
-- 
cgit 


From fa7d574ba4f4f3f4f78d432c8545d9045daa89b1 Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng <xiujianfeng@huawei.com>
Date: Tue, 19 Jul 2022 16:33:49 +0800
Subject: bdi: remove enum wb_congested_state

enum wb_congested_state and the member 'congested' in bdi_writeback are
useless since commit a88f2096d5a2 ("remove congestion tracking
framework"), so remove it.

Link: https://lkml.kernel.org/r/20220719083349.87547-1-xiujianfeng@huawei.com
Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: NeilBrown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/backing-dev-defs.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index e863c88df95f..ae12696ec492 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -28,11 +28,6 @@ enum wb_state {
 	WB_start_all,		/* nr_pages == 0 (all) work pending */
 };
 
-enum wb_congested_state {
-	WB_async_congested,	/* The async (write) queue is getting full */
-	WB_sync_congested,	/* The sync queue is getting full */
-};
-
 enum wb_stat_item {
 	WB_RECLAIMABLE,
 	WB_WRITEBACK,
@@ -122,8 +117,6 @@ struct bdi_writeback {
 	atomic_t writeback_inodes;	/* number of inodes under writeback */
 	struct percpu_counter stat[NR_WB_STAT_ITEMS];
 
-	unsigned long congested;	/* WB_[a]sync_congested flags */
-
 	unsigned long bw_time_stamp;	/* last time write bw is updated */
 	unsigned long dirtied_stamp;
 	unsigned long written_stamp;	/* pages written at bw_time_stamp */
-- 
cgit 


From 45f78b0a2743c4fd71b73400bd5d5339628bf538 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 27 Jul 2022 13:49:03 +0200
Subject: fs/dcache: Move the wakeup from __d_lookup_done() to the caller.

__d_lookup_done() wakes waiters on dentry->d_wait.  On PREEMPT_RT we are
not allowed to do that with preemption disabled, since the wakeup
acquired wait_queue_head::lock, which is a "sleeping" spinlock on RT.

Calling it under dentry->d_lock is not a problem, since that is also a
"sleeping" spinlock on the same configs.  Unfortunately, two of its
callers (__d_add() and __d_move()) are holding more than just ->d_lock
and that needs to be dealt with.

The key observation is that wakeup can be moved to any point before
dropping ->d_lock.

As a first step to solve this, move the wake up outside of the
hlist_bl_lock() held section.

This is safe because:

Waiters get inserted into ->d_wait only after they'd taken ->d_lock
and observed DCACHE_PAR_LOOKUP in flags.  As long as they are
woken up (and evicted from the queue) between the moment __d_lookup_done()
has removed DCACHE_PAR_LOOKUP and dropping ->d_lock, we are safe,
since the waitqueue ->d_wait points to won't get destroyed without
having __d_lookup_done(dentry) called (under ->d_lock).

->d_wait is set only by d_alloc_parallel() and only in case when
it returns a freshly allocated in-lookup dentry.  Whenever that happens,
we are guaranteed that __d_lookup_done() will be called for resulting
dentry (under ->d_lock) before the wq in question gets destroyed.

With two exceptions wq lives in call frame of the caller of
d_alloc_parallel() and we have an explicit d_lookup_done() on the
resulting in-lookup dentry before we leave that frame.

One of those exceptions is nfs_call_unlink(), where wq is embedded into
(dynamically allocated) struct nfs_unlinkdata.  It is destroyed in
nfs_async_unlink_release() after an explicit d_lookup_done() on the
dentry wq went into.

Remaining exception is d_add_ci(). There wq is what we'd found in
->d_wait of d_add_ci() argument. Callers of d_add_ci() are two
instances of ->d_lookup() and they must have been given an in-lookup
dentry.  Which means that they'd been called by __lookup_slow() or
lookup_open(), with wq in the call frame of one of those.

Result of d_alloc_parallel() in d_add_ci() is fed to
d_splice_alias(), which either returns non-NULL (and d_add_ci() does
d_lookup_done()) or feeds dentry to __d_add() that will do
__d_lookup_done() under ->d_lock.  That concludes the analysis.

Let __d_lookup_unhash():

  1) Lock the lookup hash and clear DCACHE_PAR_LOOKUP
  2) Unhash the dentry
  3) Retrieve and clear dentry::d_wait
  4) Unlock the hash and return the retrieved waitqueue head pointer
  5) Let the caller handle the wake up.
  6) Rename __d_lookup_done() to __d_lookup_unhash_wake() to enforce
     build failures for OOT code that used __d_lookup_done() and is not
     aware of the new return value.

This does not yet solve the PREEMPT_RT problem completely because
preemption is still disabled due to i_dir_seq being held for write. This
will be addressed in subsequent steps.

An alternative solution would be to switch the waitqueue to a simple
waitqueue, but aside of Linus not being a fan of them, moving the wake up
closer to the place where dentry::lock is unlocked reduces lock contention
time for the woken up waiter.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lkml.kernel.org/r/20220613140712.77932-3-bigeasy@linutronix.de
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 35 ++++++++++++++++++++++++++++-------
 include/linux/dcache.h |  9 +++------
 2 files changed, 31 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/fs/dcache.c b/fs/dcache.c
index 71969505e82e..b6cdc0ffca6f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2712,32 +2712,51 @@ mismatch:
 }
 EXPORT_SYMBOL(d_alloc_parallel);
 
-void __d_lookup_done(struct dentry *dentry)
+/*
+ * - Unhash the dentry
+ * - Retrieve and clear the waitqueue head in dentry
+ * - Return the waitqueue head
+ */
+static wait_queue_head_t *__d_lookup_unhash(struct dentry *dentry)
 {
-	struct hlist_bl_head *b = in_lookup_hash(dentry->d_parent,
-						 dentry->d_name.hash);
+	wait_queue_head_t *d_wait;
+	struct hlist_bl_head *b;
+
+	lockdep_assert_held(&dentry->d_lock);
+
+	b = in_lookup_hash(dentry->d_parent, dentry->d_name.hash);
 	hlist_bl_lock(b);
 	dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
 	__hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
-	wake_up_all(dentry->d_wait);
+	d_wait = dentry->d_wait;
 	dentry->d_wait = NULL;
 	hlist_bl_unlock(b);
 	INIT_HLIST_NODE(&dentry->d_u.d_alias);
 	INIT_LIST_HEAD(&dentry->d_lru);
+	return d_wait;
+}
+
+void __d_lookup_unhash_wake(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	wake_up_all(__d_lookup_unhash(dentry));
+	spin_unlock(&dentry->d_lock);
 }
-EXPORT_SYMBOL(__d_lookup_done);
+EXPORT_SYMBOL(__d_lookup_unhash_wake);
 
 /* inode->i_lock held if inode is non-NULL */
 
 static inline void __d_add(struct dentry *dentry, struct inode *inode)
 {
+	wait_queue_head_t *d_wait;
 	struct inode *dir = NULL;
 	unsigned n;
 	spin_lock(&dentry->d_lock);
 	if (unlikely(d_in_lookup(dentry))) {
 		dir = dentry->d_parent->d_inode;
 		n = start_dir_add(dir);
-		__d_lookup_done(dentry);
+		d_wait = __d_lookup_unhash(dentry);
+		wake_up_all(d_wait);
 	}
 	if (inode) {
 		unsigned add_flags = d_flags_for_inode(inode);
@@ -2896,6 +2915,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
 		     bool exchange)
 {
 	struct dentry *old_parent, *p;
+	wait_queue_head_t *d_wait;
 	struct inode *dir = NULL;
 	unsigned n;
 
@@ -2926,7 +2946,8 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
 	if (unlikely(d_in_lookup(target))) {
 		dir = target->d_parent->d_inode;
 		n = start_dir_add(dir);
-		__d_lookup_done(target);
+		d_wait = __d_lookup_unhash(target);
+		wake_up_all(d_wait);
 	}
 
 	write_seqcount_begin(&dentry->d_seq);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index f5bba51480b2..c73e5e327e76 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -349,7 +349,7 @@ static inline void dont_mount(struct dentry *dentry)
 	spin_unlock(&dentry->d_lock);
 }
 
-extern void __d_lookup_done(struct dentry *);
+extern void __d_lookup_unhash_wake(struct dentry *dentry);
 
 static inline int d_in_lookup(const struct dentry *dentry)
 {
@@ -358,11 +358,8 @@ static inline int d_in_lookup(const struct dentry *dentry)
 
 static inline void d_lookup_done(struct dentry *dentry)
 {
-	if (unlikely(d_in_lookup(dentry))) {
-		spin_lock(&dentry->d_lock);
-		__d_lookup_done(dentry);
-		spin_unlock(&dentry->d_lock);
-	}
+	if (unlikely(d_in_lookup(dentry)))
+		__d_lookup_unhash_wake(dentry);
 }
 
 extern void dput(struct dentry *);
-- 
cgit 


From 3a2dcbaf4d31023106975d6ae75b6df080c454cb Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 19 Jul 2022 18:20:04 -0400
Subject: tracing: Use a copy of the va_list for __assign_vstr()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If an instance of tracing enables the same trace event as another
instance, or the top level instance, or even perf, then the va_list passed
into some tracepoints can be used more than once.

As va_list can only be traversed once, this can cause issues:

 # cat /sys/kernel/tracing/instances/qla2xxx/trace
             cat-56106   [012] ..... 2419873.470098: ql_dbg_log: qla2xxx [0000:05:00.0]-1054:14:  Entered (null).
             cat-56106   [012] ..... 2419873.470101: ql_dbg_log: qla2xxx [0000:05:00.0]-1000:14:  Entered ×+<96>²Ü<98>^H.
             cat-56106   [012] ..... 2419873.470102: ql_dbg_log: qla2xxx [0000:05:00.0]-1006:14:  Prepare to issue mbox cmd=0xde589000.

 # cat /sys/kernel/tracing/trace
             cat-56106   [012] ..... 2419873.470097: ql_dbg_log: qla2xxx [0000:05:00.0]-1054:14:  Entered qla2x00_get_firmware_state.
             cat-56106   [012] ..... 2419873.470100: ql_dbg_log: qla2xxx [0000:05:00.0]-1000:14:  Entered qla2x00_mailbox_command.
             cat-56106   [012] ..... 2419873.470102: ql_dbg_log: qla2xxx [0000:05:00.0]-1006:14:  Prepare to issue mbox cmd=0x69.

The instance version is corrupted because the top level instance iterated
the va_list first.

Use va_copy() in the __assign_vstr() macro to make sure that each trace
event for each use case gets a fresh va_list.

Link: https://lore.kernel.org/all/259d53a5-958e-6508-4e45-74dba2821242@marvell.com/
Link: https://lkml.kernel.org/r/20220719182004.21daa83e@gandalf.local.home

Fixes: 0563231f93c6d ("tracing/events: Add __vstring() and __assign_vstr() helper macros")
Reported-by: Arun Easi <aeasi@marvell.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/trace/stages/stage6_event_callback.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/stages/stage6_event_callback.h b/include/trace/stages/stage6_event_callback.h
index 0f51f6b3ab70..3c554a585320 100644
--- a/include/trace/stages/stage6_event_callback.h
+++ b/include/trace/stages/stage6_event_callback.h
@@ -40,7 +40,12 @@
 
 #undef __assign_vstr
 #define __assign_vstr(dst, fmt, va)					\
-	vsnprintf(__get_str(dst), TRACE_EVENT_STR_MAX, fmt, *(va))
+	do {								\
+		va_list __cp_va;					\
+		va_copy(__cp_va, *(va));				\
+		vsnprintf(__get_str(dst), TRACE_EVENT_STR_MAX, fmt, __cp_va); \
+		va_end(__cp_va);					\
+	} while (0)
 
 #undef __bitmask
 #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1)
-- 
cgit 


From 102227b970a15256f5ffd12a6a276ddf978e6caf Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Fri, 29 Jul 2022 11:38:40 +0200
Subject: rv: Add Runtime Verification (RV) interface

RV is a lightweight (yet rigorous) method that complements classical
exhaustive verification techniques (such as model checking and
theorem proving) with a more practical approach to complex systems.

RV works by analyzing the trace of the system's actual execution,
comparing it against a formal specification of the system behavior.
RV can give precise information on the runtime behavior of the
monitored system while enabling the reaction for unexpected
events, avoiding, for example, the propagation of a failure on
safety-critical systems.

The development of this interface roots in the development of the
paper:

De Oliveira, Daniel Bristot; Cucinotta, Tommaso; De Oliveira, Romulo
Silva. Efficient formal verification for the Linux kernel. In:
International Conference on Software Engineering and Formal Methods.
Springer, Cham, 2019. p. 315-332.

And:

De Oliveira, Daniel Bristot. Automata-based formal analysis
and verification of the real-time Linux kernel. PhD Thesis, 2020.

The RV interface resembles the tracing/ interface on purpose. The current
path for the RV interface is /sys/kernel/tracing/rv/.

It presents these files:

 "available_monitors"
   - List the available monitors, one per line.

   For example:
     # cat available_monitors
     wip
     wwnr

 "enabled_monitors"
   - Lists the enabled monitors, one per line;
   - Writing to it enables a given monitor;
   - Writing a monitor name with a '!' prefix disables it;
   - Truncating the file disables all enabled monitors.

   For example:
     # cat enabled_monitors
     # echo wip > enabled_monitors
     # echo wwnr >> enabled_monitors
     # cat enabled_monitors
     wip
     wwnr
     # echo '!wip' >> enabled_monitors
     # cat enabled_monitors
     wwnr
     # echo > enabled_monitors
     # cat enabled_monitors
     #

   Note that more than one monitor can be enabled concurrently.

 "monitoring_on"
   - It is an on/off general switcher for monitoring. Note
   that it does not disable enabled monitors or detach events,
   but stop the per-entity monitors of monitoring the events
   received from the system. It resembles the "tracing_on" switcher.

 "monitors/"
   Each monitor will have its one directory inside "monitors/". There
   the monitor specific files will be presented.
   The "monitors/" directory resembles the "events" directory on
   tracefs.

   For example:
     # cd monitors/wip/
     # ls
     desc  enable
     # cat desc
     wakeup in preemptive per-cpu testing monitor.
     # cat enable
     0

For further information, see the comments in the header of
kernel/trace/rv/rv.c from this patch.

Link: https://lkml.kernel.org/r/a4bfe038f50cb047bfb343ad0e12b0e646ab308b.1659052063.git.bristot@kernel.org

Cc: Wim Van Sebroeck <wim@linux-watchdog.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Marco Elver <elver@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: Gabriele Paoloni <gpaoloni@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Tao Zhou <tao.zhou@linux.dev>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-trace-devel@vger.kernel.org
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/rv.h       |  43 +++
 include/linux/sched.h    |  11 +
 kernel/trace/Kconfig     |   2 +
 kernel/trace/Makefile    |   1 +
 kernel/trace/rv/Kconfig  |  12 +
 kernel/trace/rv/Makefile |   3 +
 kernel/trace/rv/rv.c     | 782 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/rv/rv.h     |  33 ++
 kernel/trace/trace.c     |   2 +
 kernel/trace/trace.h     |   9 +
 10 files changed, 898 insertions(+)
 create mode 100644 include/linux/rv.h
 create mode 100644 kernel/trace/rv/Kconfig
 create mode 100644 kernel/trace/rv/Makefile
 create mode 100644 kernel/trace/rv/rv.c
 create mode 100644 kernel/trace/rv/rv.h

(limited to 'include')

diff --git a/include/linux/rv.h b/include/linux/rv.h
new file mode 100644
index 000000000000..d8fa9e8be94a
--- /dev/null
+++ b/include/linux/rv.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Runtime Verification.
+ *
+ * For futher information, see: kernel/trace/rv/rv.c.
+ */
+#ifndef _LINUX_RV_H
+#define _LINUX_RV_H
+
+#ifdef CONFIG_RV
+
+/*
+ * Per-task RV monitors count. Nowadays fixed in RV_PER_TASK_MONITORS.
+ * If we find justification for more monitors, we can think about
+ * adding more or developing a dynamic method. So far, none of
+ * these are justified.
+ */
+#define RV_PER_TASK_MONITORS		1
+#define RV_PER_TASK_MONITOR_INIT	(RV_PER_TASK_MONITORS)
+
+/*
+ * Futher monitor types are expected, so make this a union.
+ */
+union rv_task_monitor {
+};
+
+struct rv_monitor {
+	const char		*name;
+	const char		*description;
+	bool			enabled;
+	int			(*enable)(void);
+	void			(*disable)(void);
+	void			(*reset)(void);
+};
+
+bool rv_monitoring_on(void);
+int rv_unregister_monitor(struct rv_monitor *monitor);
+int rv_register_monitor(struct rv_monitor *monitor);
+int rv_get_task_monitor_slot(void);
+void rv_put_task_monitor_slot(int slot);
+
+#endif /* CONFIG_RV */
+#endif /* _LINUX_RV_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c46f3a63b758..b5da3e7c3a04 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -34,6 +34,7 @@
 #include <linux/rseq.h>
 #include <linux/seqlock.h>
 #include <linux/kcsan.h>
+#include <linux/rv.h>
 #include <asm/kmap_size.h>
 
 /* task_struct member predeclarations (sorted alphabetically): */
@@ -1500,6 +1501,16 @@ struct task_struct {
 	struct callback_head		l1d_flush_kill;
 #endif
 
+#ifdef CONFIG_RV
+	/*
+	 * Per-task RV monitor. Nowadays fixed in RV_PER_TASK_MONITORS.
+	 * If we find justification for more monitors, we can think
+	 * about adding more or developing a dynamic method. So far,
+	 * none of these are justified.
+	 */
+	union rv_task_monitor		rv[RV_PER_TASK_MONITORS];
+#endif
+
 	/*
 	 * New fields for task_struct should be added above here, so that
 	 * they are included in the randomized portion of task_struct.
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index ccd6a5ade3e9..1052126bdca2 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -1106,4 +1106,6 @@ config HIST_TRIGGERS_DEBUG
 
           If unsure, say N.
 
+source "kernel/trace/rv/Kconfig"
+
 endif # FTRACE
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 0d261774d6f3..c6651e16b557 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -106,5 +106,6 @@ obj-$(CONFIG_FPROBE) += fprobe.o
 obj-$(CONFIG_RETHOOK) += rethook.o
 
 obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
+obj-$(CONFIG_RV) += rv/
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
new file mode 100644
index 000000000000..6d127cdb00dd
--- /dev/null
+++ b/kernel/trace/rv/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+menuconfig RV
+	bool "Runtime Verification"
+	depends on TRACING
+	help
+	  Enable the kernel runtime verification infrastructure. RV is a
+	  lightweight (yet rigorous) method that complements classical
+	  exhaustive verification techniques (such as model checking and
+	  theorem proving). RV works by analyzing the trace of the system's
+	  actual execution, comparing it against a formal specification of
+	  the system behavior.
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
new file mode 100644
index 000000000000..fd995379df67
--- /dev/null
+++ b/kernel/trace/rv/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_RV) += rv.o
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
new file mode 100644
index 000000000000..731cc961cc70
--- /dev/null
+++ b/kernel/trace/rv/rv.c
@@ -0,0 +1,782 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org>
+ *
+ * This is the online Runtime Verification (RV) interface.
+ *
+ * RV is a lightweight (yet rigorous) method that complements classical
+ * exhaustive verification techniques (such as model checking and
+ * theorem proving) with a more practical approach to complex systems.
+ *
+ * RV works by analyzing the trace of the system's actual execution,
+ * comparing it against a formal specification of the system behavior.
+ * RV can give precise information on the runtime behavior of the
+ * monitored system while enabling the reaction for unexpected
+ * events, avoiding, for example, the propagation of a failure on
+ * safety-critical systems.
+ *
+ * The development of this interface roots in the development of the
+ * paper:
+ *
+ * De Oliveira, Daniel Bristot; Cucinotta, Tommaso; De Oliveira, Romulo
+ * Silva. Efficient formal verification for the Linux kernel. In:
+ * International Conference on Software Engineering and Formal Methods.
+ * Springer, Cham, 2019. p. 315-332.
+ *
+ * And:
+ *
+ * De Oliveira, Daniel Bristot, et al. Automata-based formal analysis
+ * and verification of the real-time Linux kernel. PhD Thesis, 2020.
+ *
+ * == Runtime monitor interface ==
+ *
+ * A monitor is the central part of the runtime verification of a system.
+ *
+ * The monitor stands in between the formal specification of the desired
+ * (or undesired) behavior, and the trace of the actual system.
+ *
+ * In Linux terms, the runtime verification monitors are encapsulated
+ * inside the "RV monitor" abstraction. A RV monitor includes a reference
+ * model of the system, a set of instances of the monitor (per-cpu monitor,
+ * per-task monitor, and so on), and the helper functions that glue the
+ * monitor to the system via trace. Generally, a monitor includes some form
+ * of trace output as a reaction for event parsing and exceptions,
+ * as depicted bellow:
+ *
+ * Linux  +----- RV Monitor ----------------------------------+ Formal
+ *  Realm |                                                   |  Realm
+ *  +-------------------+     +----------------+     +-----------------+
+ *  |   Linux kernel    |     |     Monitor    |     |     Reference   |
+ *  |     Tracing       |  -> |   Instance(s)  | <-  |       Model     |
+ *  | (instrumentation) |     | (verification) |     | (specification) |
+ *  +-------------------+     +----------------+     +-----------------+
+ *         |                          |                       |
+ *         |                          V                       |
+ *         |                     +----------+                 |
+ *         |                     | Reaction |                 |
+ *         |                     +--+--+--+-+                 |
+ *         |                        |  |  |                   |
+ *         |                        |  |  +-> trace output ?  |
+ *         +------------------------|--|----------------------+
+ *                                  |  +----> panic ?
+ *                                  +-------> <user-specified>
+ *
+ * This file implements the interface for loading RV monitors, and
+ * to control the verification session.
+ *
+ * == Registering monitors ==
+ *
+ * The struct rv_monitor defines a set of callback functions to control
+ * a verification session. For instance, when a given monitor is enabled,
+ * the "enable" callback function is called to hook the instrumentation
+ * functions to the kernel trace events. The "disable" function is called
+ * when disabling the verification session.
+ *
+ * A RV monitor is registered via:
+ *   int rv_register_monitor(struct rv_monitor *monitor);
+ * And unregistered via:
+ *   int rv_unregister_monitor(struct rv_monitor *monitor);
+ *
+ * == User interface ==
+ *
+ * The user interface resembles kernel tracing interface. It presents
+ * these files:
+ *
+ *  "available_monitors"
+ *    - List the available monitors, one per line.
+ *
+ *    For example:
+ *      # cat available_monitors
+ *      wip
+ *      wwnr
+ *
+ *  "enabled_monitors"
+ *    - Lists the enabled monitors, one per line;
+ *    - Writing to it enables a given monitor;
+ *    - Writing a monitor name with a '!' prefix disables it;
+ *    - Truncating the file disables all enabled monitors.
+ *
+ *    For example:
+ *      # cat enabled_monitors
+ *      # echo wip > enabled_monitors
+ *      # echo wwnr >> enabled_monitors
+ *      # cat enabled_monitors
+ *      wip
+ *      wwnr
+ *      # echo '!wip' >> enabled_monitors
+ *      # cat enabled_monitors
+ *      wwnr
+ *      # echo > enabled_monitors
+ *      # cat enabled_monitors
+ *      #
+ *
+ *    Note that more than one monitor can be enabled concurrently.
+ *
+ *  "monitoring_on"
+ *    - It is an on/off general switcher for monitoring. Note
+ *    that it does not disable enabled monitors or detach events,
+ *    but stops the per-entity monitors from monitoring the events
+ *    received from the instrumentation. It resembles the "tracing_on"
+ *    switcher.
+ *
+ *  "monitors/"
+ *    Each monitor will have its own directory inside "monitors/". There
+ *    the monitor specific files will be presented.
+ *    The "monitors/" directory resembles the "events" directory on
+ *    tracefs.
+ *
+ *    For example:
+ *      # cd monitors/wip/
+ *      # ls
+ *      desc  enable
+ *      # cat desc
+ *      auto-generated wakeup in preemptive monitor.
+ *      # cat enable
+ *      0
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+
+#include "rv.h"
+
+DEFINE_MUTEX(rv_interface_lock);
+
+static struct rv_interface rv_root;
+
+struct dentry *get_monitors_root(void)
+{
+	return rv_root.monitors_dir;
+}
+
+/*
+ * Interface for the monitor register.
+ */
+static LIST_HEAD(rv_monitors_list);
+
+static int task_monitor_count;
+static bool task_monitor_slots[RV_PER_TASK_MONITORS];
+
+int rv_get_task_monitor_slot(void)
+{
+	int i;
+
+	lockdep_assert_held(&rv_interface_lock);
+
+	if (task_monitor_count == RV_PER_TASK_MONITORS)
+		return -EBUSY;
+
+	task_monitor_count++;
+
+	for (i = 0; i < RV_PER_TASK_MONITORS; i++) {
+		if (task_monitor_slots[i] == false) {
+			task_monitor_slots[i] = true;
+			return i;
+		}
+	}
+
+	WARN_ONCE(1, "RV task_monitor_count and slots are out of sync\n");
+
+	return -EINVAL;
+}
+
+void rv_put_task_monitor_slot(int slot)
+{
+	lockdep_assert_held(&rv_interface_lock);
+
+	if (slot < 0 || slot >= RV_PER_TASK_MONITORS) {
+		WARN_ONCE(1, "RV releasing an invalid slot!: %d\n", slot);
+		return;
+	}
+
+	WARN_ONCE(!task_monitor_slots[slot], "RV releasing unused task_monitor_slots: %d\n",
+		  slot);
+
+	task_monitor_count--;
+	task_monitor_slots[slot] = false;
+}
+
+/*
+ * This section collects the monitor/ files and folders.
+ */
+static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count,
+					loff_t *ppos)
+{
+	struct rv_monitor_def *mdef = filp->private_data;
+	const char *buff;
+
+	buff = mdef->monitor->enabled ? "1\n" : "0\n";
+
+	return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1);
+}
+
+/*
+ * __rv_disable_monitor - disabled an enabled monitor
+ */
+static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync)
+{
+	lockdep_assert_held(&rv_interface_lock);
+
+	if (mdef->monitor->enabled) {
+		mdef->monitor->enabled = 0;
+		mdef->monitor->disable();
+
+		/*
+		 * Wait for the execution of all events to finish.
+		 * Otherwise, the data used by the monitor could
+		 * be inconsistent. i.e., if the monitor is re-enabled.
+		 */
+		if (sync)
+			tracepoint_synchronize_unregister();
+		return 1;
+	}
+	return 0;
+}
+
+/**
+ * rv_disable_monitor - disable a given runtime monitor
+ *
+ * Returns 0 on success.
+ */
+int rv_disable_monitor(struct rv_monitor_def *mdef)
+{
+	__rv_disable_monitor(mdef, true);
+	return 0;
+}
+
+/**
+ * rv_enable_monitor - enable a given runtime monitor
+ *
+ * Returns 0 on success, error otherwise.
+ */
+int rv_enable_monitor(struct rv_monitor_def *mdef)
+{
+	int retval;
+
+	lockdep_assert_held(&rv_interface_lock);
+
+	if (mdef->monitor->enabled)
+		return 0;
+
+	retval = mdef->monitor->enable();
+
+	if (!retval)
+		mdef->monitor->enabled = 1;
+
+	return retval;
+}
+
+/*
+ * interface for enabling/disabling a monitor.
+ */
+static ssize_t monitor_enable_write_data(struct file *filp, const char __user *user_buf,
+					 size_t count, loff_t *ppos)
+{
+	struct rv_monitor_def *mdef = filp->private_data;
+	int retval;
+	bool val;
+
+	retval = kstrtobool_from_user(user_buf, count, &val);
+	if (retval)
+		return retval;
+
+	retval = count;
+
+	mutex_lock(&rv_interface_lock);
+
+	if (val)
+		retval = rv_enable_monitor(mdef);
+	else
+		retval = rv_disable_monitor(mdef);
+
+	mutex_unlock(&rv_interface_lock);
+
+	return retval ? : count;
+}
+
+static const struct file_operations interface_enable_fops = {
+	.open   = simple_open,
+	.llseek = no_llseek,
+	.write  = monitor_enable_write_data,
+	.read   = monitor_enable_read_data,
+};
+
+/*
+ * Interface to read monitors description.
+ */
+static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf, size_t count,
+				      loff_t *ppos)
+{
+	struct rv_monitor_def *mdef = filp->private_data;
+	char buff[256];
+
+	memset(buff, 0, sizeof(buff));
+
+	snprintf(buff, sizeof(buff), "%s\n", mdef->monitor->description);
+
+	return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1);
+}
+
+static const struct file_operations interface_desc_fops = {
+	.open   = simple_open,
+	.llseek	= no_llseek,
+	.read	= monitor_desc_read_data,
+};
+
+/*
+ * During the registration of a monitor, this function creates
+ * the monitor dir, where the specific options of the monitor
+ * are exposed.
+ */
+static int create_monitor_dir(struct rv_monitor_def *mdef)
+{
+	struct dentry *root = get_monitors_root();
+	const char *name = mdef->monitor->name;
+	struct dentry *tmp;
+	int retval;
+
+	mdef->root_d = rv_create_dir(name, root);
+	if (!mdef->root_d)
+		return -ENOMEM;
+
+	tmp = rv_create_file("enable", RV_MODE_WRITE, mdef->root_d, mdef, &interface_enable_fops);
+	if (!tmp) {
+		retval = -ENOMEM;
+		goto out_remove_root;
+	}
+
+	tmp = rv_create_file("desc", RV_MODE_READ, mdef->root_d, mdef, &interface_desc_fops);
+	if (!tmp) {
+		retval = -ENOMEM;
+		goto out_remove_root;
+	}
+
+	return 0;
+
+out_remove_root:
+	rv_remove(mdef->root_d);
+	return retval;
+}
+
+/*
+ * Available/Enable monitor shared seq functions.
+ */
+static int monitors_show(struct seq_file *m, void *p)
+{
+	struct rv_monitor_def *mon_def = p;
+
+	seq_printf(m, "%s\n", mon_def->monitor->name);
+	return 0;
+}
+
+/*
+ * Used by the seq file operations at the end of a read
+ * operation.
+ */
+static void monitors_stop(struct seq_file *m, void *p)
+{
+	mutex_unlock(&rv_interface_lock);
+}
+
+/*
+ * Available monitor seq functions.
+ */
+static void *available_monitors_start(struct seq_file *m, loff_t *pos)
+{
+	mutex_lock(&rv_interface_lock);
+	return seq_list_start(&rv_monitors_list, *pos);
+}
+
+static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	return seq_list_next(p, &rv_monitors_list, pos);
+}
+
+/*
+ * Enable monitor seq functions.
+ */
+static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	struct rv_monitor_def *m_def = p;
+
+	(*pos)++;
+
+	list_for_each_entry_continue(m_def, &rv_monitors_list, list) {
+		if (m_def->monitor->enabled)
+			return m_def;
+	}
+
+	return NULL;
+}
+
+static void *enabled_monitors_start(struct seq_file *m, loff_t *pos)
+{
+	struct rv_monitor_def *m_def;
+	loff_t l;
+
+	mutex_lock(&rv_interface_lock);
+
+	if (list_empty(&rv_monitors_list))
+		return NULL;
+
+	m_def = list_entry(&rv_monitors_list, struct rv_monitor_def, list);
+
+	for (l = 0; l <= *pos; ) {
+		m_def = enabled_monitors_next(m, m_def, &l);
+		if (!m_def)
+			break;
+	}
+
+	return m_def;
+}
+
+/*
+ * available/enabled monitors seq definition.
+ */
+static const struct seq_operations available_monitors_seq_ops = {
+	.start	= available_monitors_start,
+	.next	= available_monitors_next,
+	.stop	= monitors_stop,
+	.show	= monitors_show
+};
+
+static const struct seq_operations enabled_monitors_seq_ops = {
+	.start  = enabled_monitors_start,
+	.next   = enabled_monitors_next,
+	.stop   = monitors_stop,
+	.show   = monitors_show
+};
+
+/*
+ * available_monitors interface.
+ */
+static int available_monitors_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &available_monitors_seq_ops);
+};
+
+static const struct file_operations available_monitors_ops = {
+	.open    = available_monitors_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+/*
+ * enabled_monitors interface.
+ */
+static void disable_all_monitors(void)
+{
+	struct rv_monitor_def *mdef;
+	int enabled = 0;
+
+	mutex_lock(&rv_interface_lock);
+
+	list_for_each_entry(mdef, &rv_monitors_list, list)
+		enabled += __rv_disable_monitor(mdef, false);
+
+	if (enabled) {
+		/*
+		 * Wait for the execution of all events to finish.
+		 * Otherwise, the data used by the monitor could
+		 * be inconsistent. i.e., if the monitor is re-enabled.
+		 */
+		tracepoint_synchronize_unregister();
+	}
+
+	mutex_unlock(&rv_interface_lock);
+}
+
+static int enabled_monitors_open(struct inode *inode, struct file *file)
+{
+	if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC))
+		disable_all_monitors();
+
+	return seq_open(file, &enabled_monitors_seq_ops);
+};
+
+static ssize_t enabled_monitors_write(struct file *filp, const char __user *user_buf,
+				      size_t count, loff_t *ppos)
+{
+	char buff[MAX_RV_MONITOR_NAME_SIZE + 2];
+	struct rv_monitor_def *mdef;
+	int retval = -EINVAL;
+	bool enable = true;
+	char *ptr = buff;
+	int len;
+
+	if (count < 1 || count > MAX_RV_MONITOR_NAME_SIZE + 1)
+		return -EINVAL;
+
+	memset(buff, 0, sizeof(buff));
+
+	retval = simple_write_to_buffer(buff, sizeof(buff) - 1, ppos, user_buf, count);
+	if (retval < 0)
+		return -EFAULT;
+
+	ptr = strim(buff);
+
+	if (ptr[0] == '!') {
+		enable = false;
+		ptr++;
+	}
+
+	len = strlen(ptr);
+	if (!len)
+		return count;
+
+	mutex_lock(&rv_interface_lock);
+
+	retval = -EINVAL;
+
+	list_for_each_entry(mdef, &rv_monitors_list, list) {
+		if (strcmp(ptr, mdef->monitor->name) != 0)
+			continue;
+
+		/*
+		 * Monitor found!
+		 */
+		if (enable)
+			retval = rv_enable_monitor(mdef);
+		else
+			retval = rv_disable_monitor(mdef);
+
+		if (!retval)
+			retval = count;
+
+		break;
+	}
+
+	mutex_unlock(&rv_interface_lock);
+	return retval;
+}
+
+static const struct file_operations enabled_monitors_ops = {
+	.open		= enabled_monitors_open,
+	.read		= seq_read,
+	.write		= enabled_monitors_write,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+/*
+ * Monitoring on global switcher!
+ */
+static bool __read_mostly monitoring_on;
+
+/**
+ * rv_monitoring_on - checks if monitoring is on
+ *
+ * Returns 1 if on, 0 otherwise.
+ */
+bool rv_monitoring_on(void)
+{
+	/* Ensures that concurrent monitors read consistent monitoring_on */
+	smp_rmb();
+	return READ_ONCE(monitoring_on);
+}
+
+/*
+ * monitoring_on general switcher.
+ */
+static ssize_t monitoring_on_read_data(struct file *filp, char __user *user_buf,
+				       size_t count, loff_t *ppos)
+{
+	const char *buff;
+
+	buff = rv_monitoring_on() ? "1\n" : "0\n";
+
+	return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1);
+}
+
+static void turn_monitoring_off(void)
+{
+	WRITE_ONCE(monitoring_on, false);
+	/* Ensures that concurrent monitors read consistent monitoring_on */
+	smp_wmb();
+}
+
+static void reset_all_monitors(void)
+{
+	struct rv_monitor_def *mdef;
+
+	list_for_each_entry(mdef, &rv_monitors_list, list) {
+		if (mdef->monitor->enabled)
+			mdef->monitor->reset();
+	}
+}
+
+static void turn_monitoring_on(void)
+{
+	WRITE_ONCE(monitoring_on, true);
+	/* Ensures that concurrent monitors read consistent monitoring_on */
+	smp_wmb();
+}
+
+static void turn_monitoring_on_with_reset(void)
+{
+	lockdep_assert_held(&rv_interface_lock);
+
+	if (rv_monitoring_on())
+		return;
+
+	/*
+	 * Monitors might be out of sync with the system if events were not
+	 * processed because of !rv_monitoring_on().
+	 *
+	 * Reset all monitors, forcing a re-sync.
+	 */
+	reset_all_monitors();
+	turn_monitoring_on();
+}
+
+static ssize_t monitoring_on_write_data(struct file *filp, const char __user *user_buf,
+					size_t count, loff_t *ppos)
+{
+	int retval;
+	bool val;
+
+	retval = kstrtobool_from_user(user_buf, count, &val);
+	if (retval)
+		return retval;
+
+	mutex_lock(&rv_interface_lock);
+
+	if (val)
+		turn_monitoring_on_with_reset();
+	else
+		turn_monitoring_off();
+
+	/*
+	 * Wait for the execution of all events to finish
+	 * before returning to user-space.
+	 */
+	tracepoint_synchronize_unregister();
+
+	mutex_unlock(&rv_interface_lock);
+
+	return count;
+}
+
+static const struct file_operations monitoring_on_fops = {
+	.open   = simple_open,
+	.llseek = no_llseek,
+	.write  = monitoring_on_write_data,
+	.read   = monitoring_on_read_data,
+};
+
+static void destroy_monitor_dir(struct rv_monitor_def *mdef)
+{
+	rv_remove(mdef->root_d);
+}
+
+/**
+ * rv_register_monitor - register a rv monitor.
+ * @monitor:    The rv_monitor to be registered.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int rv_register_monitor(struct rv_monitor *monitor)
+{
+	struct rv_monitor_def *r;
+	int retval = 0;
+
+	if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) {
+		pr_info("Monitor %s has a name longer than %d\n", monitor->name,
+			MAX_RV_MONITOR_NAME_SIZE);
+		return -1;
+	}
+
+	mutex_lock(&rv_interface_lock);
+
+	list_for_each_entry(r, &rv_monitors_list, list) {
+		if (strcmp(monitor->name, r->monitor->name) == 0) {
+			pr_info("Monitor %s is already registered\n", monitor->name);
+			retval = -1;
+			goto out_unlock;
+		}
+	}
+
+	r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL);
+	if (!r) {
+		retval = -ENOMEM;
+		goto out_unlock;
+	}
+
+	r->monitor = monitor;
+
+	retval = create_monitor_dir(r);
+	if (retval) {
+		kfree(r);
+		goto out_unlock;
+	}
+
+	list_add_tail(&r->list, &rv_monitors_list);
+
+out_unlock:
+	mutex_unlock(&rv_interface_lock);
+	return retval;
+}
+
+/**
+ * rv_unregister_monitor - unregister a rv monitor.
+ * @monitor:    The rv_monitor to be unregistered.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int rv_unregister_monitor(struct rv_monitor *monitor)
+{
+	struct rv_monitor_def *ptr, *next;
+
+	mutex_lock(&rv_interface_lock);
+
+	list_for_each_entry_safe(ptr, next, &rv_monitors_list, list) {
+		if (strcmp(monitor->name, ptr->monitor->name) == 0) {
+			rv_disable_monitor(ptr);
+			list_del(&ptr->list);
+			destroy_monitor_dir(ptr);
+		}
+	}
+
+	mutex_unlock(&rv_interface_lock);
+	return 0;
+}
+
+int __init rv_init_interface(void)
+{
+	struct dentry *tmp;
+
+	rv_root.root_dir = rv_create_dir("rv", NULL);
+	if (!rv_root.root_dir)
+		goto out_err;
+
+	rv_root.monitors_dir = rv_create_dir("monitors", rv_root.root_dir);
+	if (!rv_root.monitors_dir)
+		goto out_err;
+
+	tmp = rv_create_file("available_monitors", RV_MODE_READ, rv_root.root_dir, NULL,
+			     &available_monitors_ops);
+	if (!tmp)
+		goto out_err;
+
+	tmp = rv_create_file("enabled_monitors", RV_MODE_WRITE, rv_root.root_dir, NULL,
+			     &enabled_monitors_ops);
+	if (!tmp)
+		goto out_err;
+
+	tmp = rv_create_file("monitoring_on", RV_MODE_WRITE, rv_root.root_dir, NULL,
+			     &monitoring_on_fops);
+	if (!tmp)
+		goto out_err;
+
+	turn_monitoring_on();
+
+	return 0;
+
+out_err:
+	rv_remove(rv_root.root_dir);
+	printk(KERN_ERR "RV: Error while creating the RV interface\n");
+	return 1;
+}
diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h
new file mode 100644
index 000000000000..50014aa224a7
--- /dev/null
+++ b/kernel/trace/rv/rv.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/mutex.h>
+
+struct rv_interface {
+	struct dentry		*root_dir;
+	struct dentry		*monitors_dir;
+};
+
+#include "../trace.h"
+#include <linux/tracefs.h>
+#include <linux/rv.h>
+
+#define RV_MODE_WRITE			TRACE_MODE_WRITE
+#define RV_MODE_READ			TRACE_MODE_READ
+
+#define rv_create_dir			tracefs_create_dir
+#define rv_create_file			tracefs_create_file
+#define rv_remove			tracefs_remove
+
+#define MAX_RV_MONITOR_NAME_SIZE	32
+
+extern struct mutex rv_interface_lock;
+
+struct rv_monitor_def {
+	struct list_head	list;
+	struct rv_monitor	*monitor;
+	struct dentry		*root_d;
+	bool			task_monitor;
+};
+
+struct dentry *get_monitors_root(void);
+int rv_disable_monitor(struct rv_monitor_def *mdef);
+int rv_enable_monitor(struct rv_monitor_def *mdef);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7eb5bce62500..301305ec134b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9772,6 +9772,8 @@ static __init int tracer_init_tracefs(void)
 		tracer_init_tracefs_work_func(NULL);
 	}
 
+	rv_init_interface();
+
 	return 0;
 }
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ff816fb41e48..900e75d96c84 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -2005,4 +2005,13 @@ struct trace_min_max_param {
 
 extern const struct file_operations trace_min_max_fops;
 
+#ifdef CONFIG_RV
+extern int rv_init_interface(void);
+#else
+static inline int rv_init_interface(void)
+{
+	return 0;
+}
+#endif
+
 #endif /* _LINUX_KERNEL_TRACE_H */
-- 
cgit 


From 04acadcb4453cf8011dd3d4ce8d97fecac42d325 Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Fri, 29 Jul 2022 11:38:41 +0200
Subject: rv: Add runtime reactors interface

A runtime monitor can cause a reaction to the detection of an
exception on the model's execution. By default, the monitors have
tracing reactions, printing the monitor output via tracepoints.
But other reactions can be added (on-demand) via this interface.

The user interface resembles the kernel tracing interface and
presents these files:

"available_reactors"
  - Reading shows the available reactors, one per line.

   For example:
     # cat available_reactors
     nop
     panic
     printk

 "reacting_on"
   - It is an on/off general switch for reactors, disabling
   all reactions.

 "monitors/MONITOR/reactors"
   - List available reactors, with the select reaction for the given
   MONITOR inside []. The default one is the nop (no operation)
   reactor.
   - Writing the name of a reactor enables it to the given
   MONITOR.

   For example:
     # cat monitors/wip/reactors
     [nop]
     panic
     printk
     # echo panic > monitors/wip/reactors
     # cat monitors/wip/reactors
     nop
     [panic]
     printk

Link: https://lkml.kernel.org/r/1794eb994637457bdeaa6bad0b8263d2f7eece0c.1659052063.git.bristot@kernel.org

Cc: Wim Van Sebroeck <wim@linux-watchdog.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Marco Elver <elver@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: Gabriele Paoloni <gpaoloni@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Tao Zhou <tao.zhou@linux.dev>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-trace-devel@vger.kernel.org
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/rv.h            |  17 ++
 kernel/trace/rv/Kconfig       |  11 +
 kernel/trace/rv/Makefile      |   1 +
 kernel/trace/rv/rv.c          |   9 +
 kernel/trace/rv/rv.h          |  35 +++
 kernel/trace/rv/rv_reactors.c | 508 ++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 581 insertions(+)
 create mode 100644 kernel/trace/rv/rv_reactors.c

(limited to 'include')

diff --git a/include/linux/rv.h b/include/linux/rv.h
index d8fa9e8be94a..dcce56987cf7 100644
--- a/include/linux/rv.h
+++ b/include/linux/rv.h
@@ -24,6 +24,14 @@
 union rv_task_monitor {
 };
 
+#ifdef CONFIG_RV_REACTORS
+struct rv_reactor {
+	const char		*name;
+	const char		*description;
+	void			(*react)(char *msg);
+};
+#endif
+
 struct rv_monitor {
 	const char		*name;
 	const char		*description;
@@ -31,6 +39,9 @@ struct rv_monitor {
 	int			(*enable)(void);
 	void			(*disable)(void);
 	void			(*reset)(void);
+#ifdef CONFIG_RV_REACTORS
+	void			(*react)(char *msg);
+#endif
 };
 
 bool rv_monitoring_on(void);
@@ -39,5 +50,11 @@ int rv_register_monitor(struct rv_monitor *monitor);
 int rv_get_task_monitor_slot(void);
 void rv_put_task_monitor_slot(int slot);
 
+#ifdef CONFIG_RV_REACTORS
+bool rv_reacting_on(void);
+int rv_unregister_reactor(struct rv_reactor *reactor);
+int rv_register_reactor(struct rv_reactor *reactor);
+#endif /* CONFIG_RV_REACTORS */
+
 #endif /* CONFIG_RV */
 #endif /* _LINUX_RV_H */
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index 6d127cdb00dd..3eb5d48ab4f6 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -10,3 +10,14 @@ menuconfig RV
 	  theorem proving). RV works by analyzing the trace of the system's
 	  actual execution, comparing it against a formal specification of
 	  the system behavior.
+
+config RV_REACTORS
+	bool "Runtime verification reactors"
+	default y
+	depends on RV
+	help
+	  Enables the online runtime verification reactors. A runtime
+	  monitor can cause a reaction to the detection of an exception
+	  on the model's execution. By default, the monitors have
+	  tracing reactions, printing the monitor output via tracepoints,
+	  but other reactions can be added (on-demand) via this interface.
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index fd995379df67..8944274d9b41 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
 
 obj-$(CONFIG_RV) += rv.o
+obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index 731cc961cc70..45cf64eb2600 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -353,6 +353,10 @@ static int create_monitor_dir(struct rv_monitor_def *mdef)
 		goto out_remove_root;
 	}
 
+	retval = reactor_populate_monitor(mdef);
+	if (retval)
+		goto out_remove_root;
+
 	return 0;
 
 out_remove_root:
@@ -669,6 +673,7 @@ static const struct file_operations monitoring_on_fops = {
 
 static void destroy_monitor_dir(struct rv_monitor_def *mdef)
 {
+	reactor_cleanup_monitor(mdef);
 	rv_remove(mdef->root_d);
 }
 
@@ -747,6 +752,7 @@ int rv_unregister_monitor(struct rv_monitor *monitor)
 int __init rv_init_interface(void)
 {
 	struct dentry *tmp;
+	int retval;
 
 	rv_root.root_dir = rv_create_dir("rv", NULL);
 	if (!rv_root.root_dir)
@@ -770,6 +776,9 @@ int __init rv_init_interface(void)
 			     &monitoring_on_fops);
 	if (!tmp)
 		goto out_err;
+	retval = init_rv_reactors(rv_root.root_dir);
+	if (retval)
+		goto out_err;
 
 	turn_monitoring_on();
 
diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h
index 50014aa224a7..db6cb0913dbd 100644
--- a/kernel/trace/rv/rv.h
+++ b/kernel/trace/rv/rv.h
@@ -18,16 +18,51 @@ struct rv_interface {
 #define rv_remove			tracefs_remove
 
 #define MAX_RV_MONITOR_NAME_SIZE	32
+#define MAX_RV_REACTOR_NAME_SIZE	32
 
 extern struct mutex rv_interface_lock;
 
+#ifdef CONFIG_RV_REACTORS
+struct rv_reactor_def {
+	struct list_head	list;
+	struct rv_reactor	*reactor;
+	/* protected by the monitor interface lock */
+	int			counter;
+};
+#endif
+
 struct rv_monitor_def {
 	struct list_head	list;
 	struct rv_monitor	*monitor;
 	struct dentry		*root_d;
+#ifdef CONFIG_RV_REACTORS
+	struct rv_reactor_def	*rdef;
+	bool			reacting;
+#endif
 	bool			task_monitor;
 };
 
 struct dentry *get_monitors_root(void);
 int rv_disable_monitor(struct rv_monitor_def *mdef);
 int rv_enable_monitor(struct rv_monitor_def *mdef);
+
+#ifdef CONFIG_RV_REACTORS
+int reactor_populate_monitor(struct rv_monitor_def *mdef);
+void reactor_cleanup_monitor(struct rv_monitor_def *mdef);
+int init_rv_reactors(struct dentry *root_dir);
+#else
+static inline int reactor_populate_monitor(struct rv_monitor_def *mdef)
+{
+	return 0;
+}
+
+static inline void reactor_cleanup_monitor(struct rv_monitor_def *mdef)
+{
+	return;
+}
+
+static inline int init_rv_reactors(struct dentry *root_dir)
+{
+	return 0;
+}
+#endif
diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c
new file mode 100644
index 000000000000..a6522c196382
--- /dev/null
+++ b/kernel/trace/rv/rv_reactors.c
@@ -0,0 +1,508 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org>
+ *
+ * Runtime reactor interface.
+ *
+ * A runtime monitor can cause a reaction to the detection of an
+ * exception on the model's execution. By default, the monitors have
+ * tracing reactions, printing the monitor output via tracepoints.
+ * But other reactions can be added (on-demand) via this interface.
+ *
+ * == Registering reactors ==
+ *
+ * The struct rv_reactor defines a callback function to be executed
+ * in case of a model exception happens. The callback function
+ * receives a message to be (optionally) printed before executing
+ * the reaction.
+ *
+ * A RV reactor is registered via:
+ *   int rv_register_reactor(struct rv_reactor *reactor)
+ * And unregistered via:
+ *   int rv_unregister_reactor(struct rv_reactor *reactor)
+ *
+ * These functions are exported to modules, enabling reactors to be
+ * dynamically loaded.
+ *
+ * == User interface ==
+ *
+ * The user interface resembles the kernel tracing interface and
+ * presents these files:
+ *
+ *  "available_reactors"
+ *    - List the available reactors, one per line.
+ *
+ *    For example:
+ *      # cat available_reactors
+ *      nop
+ *      panic
+ *      printk
+ *
+ *  "reacting_on"
+ *    - It is an on/off general switch for reactors, disabling
+ *    all reactions.
+ *
+ *  "monitors/MONITOR/reactors"
+ *    - List available reactors, with the select reaction for the given
+ *    MONITOR inside []. The default one is the nop (no operation)
+ *    reactor.
+ *    - Writing the name of an reactor enables it to the given
+ *    MONITOR.
+ *
+ *    For example:
+ *      # cat monitors/wip/reactors
+ *      [nop]
+ *      panic
+ *      printk
+ *      # echo panic > monitors/wip/reactors
+ *      # cat monitors/wip/reactors
+ *      nop
+ *      [panic]
+ *      printk
+ */
+
+#include <linux/slab.h>
+
+#include "rv.h"
+
+/*
+ * Interface for the reactor register.
+ */
+static LIST_HEAD(rv_reactors_list);
+
+static struct rv_reactor_def *get_reactor_rdef_by_name(char *name)
+{
+	struct rv_reactor_def *r;
+
+	list_for_each_entry(r, &rv_reactors_list, list) {
+		if (strcmp(name, r->reactor->name) == 0)
+			return r;
+	}
+	return NULL;
+}
+
+/*
+ * Available reactors seq functions.
+ */
+static int reactors_show(struct seq_file *m, void *p)
+{
+	struct rv_reactor_def *rea_def = p;
+
+	seq_printf(m, "%s\n", rea_def->reactor->name);
+	return 0;
+}
+
+static void reactors_stop(struct seq_file *m, void *p)
+{
+	mutex_unlock(&rv_interface_lock);
+}
+
+static void *reactors_start(struct seq_file *m, loff_t *pos)
+{
+	mutex_lock(&rv_interface_lock);
+	return seq_list_start(&rv_reactors_list, *pos);
+}
+
+static void *reactors_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	return seq_list_next(p, &rv_reactors_list, pos);
+}
+
+/*
+ * available_reactors seq definition.
+ */
+static const struct seq_operations available_reactors_seq_ops = {
+	.start	= reactors_start,
+	.next	= reactors_next,
+	.stop	= reactors_stop,
+	.show	= reactors_show
+};
+
+/*
+ * available_reactors interface.
+ */
+static int available_reactors_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &available_reactors_seq_ops);
+};
+
+static const struct file_operations available_reactors_ops = {
+	.open    = available_reactors_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+/*
+ * Monitor's reactor file.
+ */
+static int monitor_reactor_show(struct seq_file *m, void *p)
+{
+	struct rv_monitor_def *mdef = m->private;
+	struct rv_reactor_def *rdef = p;
+
+	if (mdef->rdef == rdef)
+		seq_printf(m, "[%s]\n", rdef->reactor->name);
+	else
+		seq_printf(m, "%s\n", rdef->reactor->name);
+	return 0;
+}
+
+/*
+ * available_reactors seq definition.
+ */
+static const struct seq_operations monitor_reactors_seq_ops = {
+	.start	= reactors_start,
+	.next	= reactors_next,
+	.stop	= reactors_stop,
+	.show	= monitor_reactor_show
+};
+
+static void monitor_swap_reactors(struct rv_monitor_def *mdef, struct rv_reactor_def *rdef,
+				    bool reacting)
+{
+	bool monitor_enabled;
+
+	/* nothing to do */
+	if (mdef->rdef == rdef)
+		return;
+
+	monitor_enabled = mdef->monitor->enabled;
+	if (monitor_enabled)
+		rv_disable_monitor(mdef);
+
+	/* swap reactor's usage */
+	mdef->rdef->counter--;
+	rdef->counter++;
+
+	mdef->rdef = rdef;
+	mdef->reacting = reacting;
+	mdef->monitor->react = rdef->reactor->react;
+
+	if (monitor_enabled)
+		rv_enable_monitor(mdef);
+}
+
+static ssize_t
+monitor_reactors_write(struct file *file, const char __user *user_buf,
+		      size_t count, loff_t *ppos)
+{
+	char buff[MAX_RV_REACTOR_NAME_SIZE + 2];
+	struct rv_monitor_def *mdef;
+	struct rv_reactor_def *rdef;
+	struct seq_file *seq_f;
+	int retval = -EINVAL;
+	bool enable;
+	char *ptr;
+	int len;
+
+	if (count < 1 || count > MAX_RV_REACTOR_NAME_SIZE + 1)
+		return -EINVAL;
+
+	memset(buff, 0, sizeof(buff));
+
+	retval = simple_write_to_buffer(buff, sizeof(buff) - 1, ppos, user_buf, count);
+	if (retval < 0)
+		return -EFAULT;
+
+	ptr = strim(buff);
+
+	len = strlen(ptr);
+	if (!len)
+		return count;
+
+	/*
+	 * See monitor_reactors_open()
+	 */
+	seq_f = file->private_data;
+	mdef = seq_f->private;
+
+	mutex_lock(&rv_interface_lock);
+
+	retval = -EINVAL;
+
+	list_for_each_entry(rdef, &rv_reactors_list, list) {
+		if (strcmp(ptr, rdef->reactor->name) != 0)
+			continue;
+
+		if (rdef == get_reactor_rdef_by_name("nop"))
+			enable = false;
+		else
+			enable = true;
+
+		monitor_swap_reactors(mdef, rdef, enable);
+
+		retval = count;
+		break;
+	}
+
+	mutex_unlock(&rv_interface_lock);
+
+	return retval;
+}
+
+/*
+ * available_reactors interface.
+ */
+static int monitor_reactors_open(struct inode *inode, struct file *file)
+{
+	struct rv_monitor_def *mdef = inode->i_private;
+	struct seq_file *seq_f;
+	int ret;
+
+	ret = seq_open(file, &monitor_reactors_seq_ops);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * seq_open stores the seq_file on the file->private data.
+	 */
+	seq_f = file->private_data;
+
+	/*
+	 * Copy the create file "private" data to the seq_file private data.
+	 */
+	seq_f->private = mdef;
+
+	return 0;
+};
+
+static const struct file_operations monitor_reactors_ops = {
+	.open    = monitor_reactors_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+	.write = monitor_reactors_write
+};
+
+static int __rv_register_reactor(struct rv_reactor *reactor)
+{
+	struct rv_reactor_def *r;
+
+	list_for_each_entry(r, &rv_reactors_list, list) {
+		if (strcmp(reactor->name, r->reactor->name) == 0) {
+			pr_info("Reactor %s is already registered\n", reactor->name);
+			return -EINVAL;
+		}
+	}
+
+	r = kzalloc(sizeof(struct rv_reactor_def), GFP_KERNEL);
+	if (!r)
+		return -ENOMEM;
+
+	r->reactor = reactor;
+	r->counter = 0;
+
+	list_add_tail(&r->list, &rv_reactors_list);
+
+	return 0;
+}
+
+/**
+ * rv_register_reactor - register a rv reactor.
+ * @reactor:	The rv_reactor to be registered.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int rv_register_reactor(struct rv_reactor *reactor)
+{
+	int retval = 0;
+
+	if (strlen(reactor->name) >= MAX_RV_REACTOR_NAME_SIZE) {
+		pr_info("Reactor %s has a name longer than %d\n",
+			reactor->name, MAX_RV_MONITOR_NAME_SIZE);
+		return -EINVAL;
+	}
+
+	mutex_lock(&rv_interface_lock);
+	retval = __rv_register_reactor(reactor);
+	mutex_unlock(&rv_interface_lock);
+	return retval;
+}
+
+/**
+ * rv_unregister_reactor - unregister a rv reactor.
+ * @reactor:	The rv_reactor to be unregistered.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int rv_unregister_reactor(struct rv_reactor *reactor)
+{
+	struct rv_reactor_def *ptr, *next;
+
+	mutex_lock(&rv_interface_lock);
+
+	list_for_each_entry_safe(ptr, next, &rv_reactors_list, list) {
+		if (strcmp(reactor->name, ptr->reactor->name) == 0) {
+
+			if (!ptr->counter) {
+				list_del(&ptr->list);
+			} else {
+				printk(KERN_WARNING
+				       "rv: the rv_reactor %s is in use by %d monitor(s)\n",
+				       ptr->reactor->name, ptr->counter);
+				printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n",
+				       ptr->reactor->name);
+				return -EBUSY;
+			}
+		}
+	}
+
+	mutex_unlock(&rv_interface_lock);
+	return 0;
+}
+
+/*
+ * reacting_on interface.
+ */
+static bool __read_mostly reacting_on;
+
+/**
+ * rv_reacting_on - checks if reacting is on
+ *
+ * Returns 1 if on, 0 otherwise.
+ */
+bool rv_reacting_on(void)
+{
+	/* Ensures that concurrent monitors read consistent reacting_on */
+	smp_rmb();
+	return READ_ONCE(reacting_on);
+}
+
+static ssize_t reacting_on_read_data(struct file *filp,
+				     char __user *user_buf,
+				     size_t count, loff_t *ppos)
+{
+	char *buff;
+
+	buff = rv_reacting_on() ? "1\n" : "0\n";
+
+	return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1);
+}
+
+static void turn_reacting_off(void)
+{
+	WRITE_ONCE(reacting_on, false);
+	/* Ensures that concurrent monitors read consistent reacting_on */
+	smp_wmb();
+}
+
+static void turn_reacting_on(void)
+{
+	WRITE_ONCE(reacting_on, true);
+	/* Ensures that concurrent monitors read consistent reacting_on */
+	smp_wmb();
+}
+
+static ssize_t reacting_on_write_data(struct file *filp, const char __user *user_buf,
+				      size_t count, loff_t *ppos)
+{
+	int retval;
+	bool val;
+
+	retval = kstrtobool_from_user(user_buf, count, &val);
+	if (retval)
+		return retval;
+
+	mutex_lock(&rv_interface_lock);
+
+	if (val)
+		turn_reacting_on();
+	else
+		turn_reacting_off();
+
+	/*
+	 * Wait for the execution of all events to finish
+	 * before returning to user-space.
+	 */
+	tracepoint_synchronize_unregister();
+
+	mutex_unlock(&rv_interface_lock);
+
+	return count;
+}
+
+static const struct file_operations reacting_on_fops = {
+	.open   = simple_open,
+	.llseek = no_llseek,
+	.write  = reacting_on_write_data,
+	.read   = reacting_on_read_data,
+};
+
+/**
+ * reactor_populate_monitor - creates per monitor reactors file
+ * @mdef:	monitor's definition.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int reactor_populate_monitor(struct rv_monitor_def *mdef)
+{
+	struct dentry *tmp;
+
+	tmp = rv_create_file("reactors", RV_MODE_WRITE, mdef->root_d, mdef, &monitor_reactors_ops);
+	if (!tmp)
+		return -ENOMEM;
+
+	/*
+	 * Configure as the rv_nop reactor.
+	 */
+	mdef->rdef = get_reactor_rdef_by_name("nop");
+	mdef->rdef->counter++;
+	mdef->reacting = false;
+
+	return 0;
+}
+
+/**
+ * reactor_cleanup_monitor - cleanup a monitor reference
+ * @mdef:       monitor's definition.
+ */
+void reactor_cleanup_monitor(struct rv_monitor_def *mdef)
+{
+	lockdep_assert_held(&rv_interface_lock);
+	mdef->rdef->counter--;
+	WARN_ON_ONCE(mdef->rdef->counter < 0);
+}
+
+/*
+ * Nop reactor register
+ */
+static void rv_nop_reaction(char *msg)
+{
+}
+
+static struct rv_reactor rv_nop = {
+	.name = "nop",
+	.description = "no-operation reactor: do nothing.",
+	.react = rv_nop_reaction
+};
+
+int init_rv_reactors(struct dentry *root_dir)
+{
+	struct dentry *available, *reacting;
+	int retval;
+
+	available = rv_create_file("available_reactors", RV_MODE_READ, root_dir, NULL,
+				   &available_reactors_ops);
+	if (!available)
+		goto out_err;
+
+	reacting = rv_create_file("reacting_on", RV_MODE_WRITE, root_dir, NULL, &reacting_on_fops);
+	if (!reacting)
+		goto rm_available;
+
+	retval = __rv_register_reactor(&rv_nop);
+	if (retval)
+		goto rm_reacting;
+
+	turn_reacting_on();
+
+	return 0;
+
+rm_reacting:
+	rv_remove(reacting);
+rm_available:
+	rv_remove(available);
+out_err:
+	return -ENOMEM;
+}
-- 
cgit 


From 09ecd8b8c585c95a3b8dbdec86c15a981fdfeba1 Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Fri, 29 Jul 2022 11:38:42 +0200
Subject: rv/include: Add helper functions for deterministic automata

Formally, a deterministic automaton, denoted by G, is defined as a
quintuple:

  G = { X, E, f, x_0, X_m }

where:
	- X is the set of states;
	- E is the finite set of events;
	- x_0 is the initial state;
	- X_m (subset of X) is the set of marked states.
	- f : X x E -> X $ is the transition function. It defines the
	  state transition in the occurrence of a event from E in
	  the state X. In the special case of deterministic automata,
	  the occurrence of the event in E in a state in X has a
	  deterministic next state from X.

An automaton can also be represented using a graphical format of
vertices (nodes) and edges. The open-source tool Graphviz can produce
this graphic format using the (textual) DOT language as the source code.

The dot2c tool presented in this paper:

De Oliveira, Daniel Bristot; Cucinotta, Tommaso; De Oliveira, Romulo
Silva. Efficient formal verification for the Linux kernel. In:
International Conference on Software Engineering and Formal Methods.
Springer, Cham, 2019. p. 315-332.

Translates a deterministic automaton in the DOT format into a C
source code representation that to be used for monitoring.

This header file implements helper functions to facilitate the usage
of the C output from dot2c/k for monitoring.

Link: https://lkml.kernel.org/r/563234f2bfa84b540f60cf9e39c2d9f0eea95a55.1659052063.git.bristot@kernel.org

Cc: Wim Van Sebroeck <wim@linux-watchdog.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Marco Elver <elver@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: Gabriele Paoloni <gpaoloni@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Tao Zhou <tao.zhou@linux.dev>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-trace-devel@vger.kernel.org
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/rv/automata.h | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 include/rv/automata.h

(limited to 'include')

diff --git a/include/rv/automata.h b/include/rv/automata.h
new file mode 100644
index 000000000000..eb9e636809a0
--- /dev/null
+++ b/include/rv/automata.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira  <bristot@kernel.org>
+ *
+ * Deterministic automata helper functions, to be used with the automata
+ * models in C generated by the dot2k tool.
+ */
+
+/*
+ * DECLARE_AUTOMATA_HELPERS - define a set of helper functions for automata
+ *
+ * Define a set of helper functions for automata. The 'name' argument is used
+ * as suffix for the functions and data. These functions will handle automaton
+ * with data type 'type'.
+ */
+#define DECLARE_AUTOMATA_HELPERS(name, type)					\
+										\
+/*										\
+ * model_get_state_name_##name - return the (string) name of the given state	\
+ */ 										\
+static char *model_get_state_name_##name(enum states_##name state)		\
+{										\
+	if ((state < 0) || (state >= state_max_##name))				\
+		return "INVALID";						\
+										\
+	return automaton_##name.state_names[state];				\
+}										\
+										\
+/*										\
+ * model_get_event_name_##name - return the (string) name of the given event	\
+ */										\
+static char *model_get_event_name_##name(enum events_##name event)		\
+{										\
+	if ((event < 0) || (event >= event_max_##name))				\
+		return "INVALID";						\
+										\
+	return automaton_##name.event_names[event];				\
+}										\
+										\
+/*										\
+ * model_get_initial_state_##name - return the automaton's initial state		\
+ */										\
+static inline type model_get_initial_state_##name(void)				\
+{										\
+	return automaton_##name.initial_state;					\
+}										\
+										\
+/*										\
+ * model_get_next_state_##name - process an automaton event occurrence		\
+ *										\
+ * Given the current state (curr_state) and the event (event), returns		\
+ * the next state, or INVALID_STATE in case of error.				\
+ */										\
+static inline type model_get_next_state_##name(enum states_##name curr_state,	\
+					       enum events_##name event)	\
+{										\
+	if ((curr_state < 0) || (curr_state >= state_max_##name))		\
+		return INVALID_STATE;						\
+										\
+	if ((event < 0) || (event >= event_max_##name))				\
+		return INVALID_STATE;						\
+										\
+	return automaton_##name.function[curr_state][event];			\
+}										\
+										\
+/*										\
+ * model_is_final_state_##name - check if the given state is a final state	\
+ */										\
+static inline bool model_is_final_state_##name(enum states_##name state)	\
+{										\
+	if ((state < 0) || (state >= state_max_##name))				\
+		return 0;							\
+										\
+	return automaton_##name.final_states[state];				\
+}
-- 
cgit 


From 792575348ff70e05c6040d02fce38e949ef92c37 Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Fri, 29 Jul 2022 11:38:43 +0200
Subject: rv/include: Add deterministic automata monitor definition via C
 macros

In Linux terms, the runtime verification monitors are encapsulated
inside the "RV monitor" abstraction. The "RV monitor" includes a set
of instances of the monitor (per-cpu monitor, per-task monitor, and
so on), the helper functions that glue the monitor to the system
reference model, and the trace output as a reaction for event parsing
and exceptions, as depicted below:

Linux  +----- RV Monitor ----------------------------------+ Formal
 Realm |                                                   |  Realm
 +-------------------+     +----------------+     +-----------------+
 |   Linux kernel    |     |     Monitor    |     |     Reference   |
 |     Tracing       |  -> |   Instance(s)  | <-  |       Model     |
 | (instrumentation) |     | (verification) |     | (specification) |
 +-------------------+     +----------------+     +-----------------+
        |                          |                       |
        |                          V                       |
        |                     +----------+                 |
        |                     | Reaction |                 |
        |                     +--+--+--+-+                 |
        |                        |  |  |                   |
        |                        |  |  +-> trace output ?  |
        +------------------------|--|----------------------+
                                 |  +----> panic ?
                                 +-------> <user-specified>

Add the rv/da_monitor.h, enabling automatic code generation for the
*Monitor Instance(s)* using C macros, and code to support it.

The benefits of the usage of macro for monitor synthesis are 3-fold as it:

- Reduces the code duplication;
- Facilitates the bug fix/improvement;
- Avoids the case of developers changing the core of the monitor code
  to manipulate the model in a (let's say) non-standard way.

This initial implementation presents three different types of monitor
instances:

- DECLARE_DA_MON_GLOBAL(name, type)
- DECLARE_DA_MON_PER_CPU(name, type)
- DECLARE_DA_MON_PER_TASK(name, type)

The first declares the functions for a global deterministic automata monitor,
the second for monitors with per-cpu instances, and the third with per-task
instances.

Link: https://lkml.kernel.org/r/51b0bf425a281e226dfeba7401d2115d6091f84e.1659052063.git.bristot@kernel.org

Cc: Wim Van Sebroeck <wim@linux-watchdog.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Marco Elver <elver@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: Gabriele Paoloni <gpaoloni@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Tao Zhou <tao.zhou@linux.dev>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-trace-devel@vger.kernel.org
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/rv.h        |  10 +
 include/rv/da_monitor.h   | 541 ++++++++++++++++++++++++++++++++++++++++++++++
 include/trace/events/rv.h | 120 ++++++++++
 kernel/fork.c             |  14 ++
 kernel/trace/rv/Kconfig   |  11 +
 kernel/trace/rv/rv.c      |   5 +
 6 files changed, 701 insertions(+)
 create mode 100644 include/rv/da_monitor.h
 create mode 100644 include/trace/events/rv.h

(limited to 'include')

diff --git a/include/linux/rv.h b/include/linux/rv.h
index dcce56987cf7..8883b41d88ec 100644
--- a/include/linux/rv.h
+++ b/include/linux/rv.h
@@ -7,7 +7,16 @@
 #ifndef _LINUX_RV_H
 #define _LINUX_RV_H
 
+#define MAX_DA_NAME_LEN	24
+
 #ifdef CONFIG_RV
+/*
+ * Deterministic automaton per-object variables.
+ */
+struct da_monitor {
+	bool		monitoring;
+	unsigned int	curr_state;
+};
 
 /*
  * Per-task RV monitors count. Nowadays fixed in RV_PER_TASK_MONITORS.
@@ -22,6 +31,7 @@
  * Futher monitor types are expected, so make this a union.
  */
 union rv_task_monitor {
+	struct da_monitor da_mon;
 };
 
 #ifdef CONFIG_RV_REACTORS
diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h
new file mode 100644
index 000000000000..001bc298289f
--- /dev/null
+++ b/include/rv/da_monitor.h
@@ -0,0 +1,541 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org>
+ *
+ * Deterministic automata (DA) monitor functions, to be used together
+ * with automata models in C generated by the dot2k tool.
+ *
+ * The dot2k tool is available at tools/verification/dot2k/
+ */
+
+#include <rv/automata.h>
+#include <linux/rv.h>
+#include <linux/bug.h>
+
+#ifdef CONFIG_RV_REACTORS
+
+#define DECLARE_RV_REACTING_HELPERS(name, type)							\
+static char REACT_MSG_##name[1024];								\
+												\
+static inline char *format_react_msg_##name(type curr_state, type event)			\
+{												\
+	snprintf(REACT_MSG_##name, 1024,							\
+		 "rv: monitor %s does not allow event %s on state %s\n",			\
+		 #name,										\
+		 model_get_event_name_##name(event),						\
+		 model_get_state_name_##name(curr_state));					\
+	return REACT_MSG_##name;								\
+}												\
+												\
+static void cond_react_##name(char *msg)							\
+{												\
+	if (rv_##name.react)									\
+		rv_##name.react(msg);								\
+}												\
+												\
+static bool rv_reacting_on_##name(void)								\
+{												\
+	return rv_reacting_on();								\
+}
+
+#else /* CONFIG_RV_REACTOR */
+
+#define DECLARE_RV_REACTING_HELPERS(name, type)							\
+static inline char *format_react_msg_##name(type curr_state, type event)			\
+{												\
+	return NULL;										\
+}												\
+												\
+static void cond_react_##name(char *msg)							\
+{												\
+	return;											\
+}												\
+												\
+static bool rv_reacting_on_##name(void)								\
+{												\
+	return 0;										\
+}
+#endif
+
+/*
+ * Generic helpers for all types of deterministic automata monitors.
+ */
+#define DECLARE_DA_MON_GENERIC_HELPERS(name, type)						\
+												\
+DECLARE_RV_REACTING_HELPERS(name, type)								\
+												\
+/*												\
+ * da_monitor_reset_##name - reset a monitor and setting it to init state			\
+ */												\
+static inline void da_monitor_reset_##name(struct da_monitor *da_mon)				\
+{												\
+	da_mon->monitoring = 0;									\
+	da_mon->curr_state = model_get_initial_state_##name();					\
+}												\
+												\
+/*												\
+ * da_monitor_curr_state_##name - return the current state					\
+ */												\
+static inline type da_monitor_curr_state_##name(struct da_monitor *da_mon)			\
+{												\
+	return da_mon->curr_state;								\
+}												\
+												\
+/*												\
+ * da_monitor_set_state_##name - set the new current state					\
+ */												\
+static inline void										\
+da_monitor_set_state_##name(struct da_monitor *da_mon, enum states_##name state)		\
+{												\
+	da_mon->curr_state = state;								\
+}												\
+												\
+/*												\
+ * da_monitor_start_##name - start monitoring							\
+ *												\
+ * The monitor will ignore all events until monitoring is set to true. This			\
+ * function needs to be called to tell the monitor to start monitoring.				\
+ */												\
+static inline void da_monitor_start_##name(struct da_monitor *da_mon)				\
+{												\
+	da_mon->curr_state = model_get_initial_state_##name();					\
+	da_mon->monitoring = 1;									\
+}												\
+												\
+/*												\
+ * da_monitoring_##name - returns true if the monitor is processing events			\
+ */												\
+static inline bool da_monitoring_##name(struct da_monitor *da_mon)				\
+{												\
+	return da_mon->monitoring;								\
+}												\
+												\
+/*												\
+ * da_monitor_enabled_##name - checks if the monitor is enabled					\
+ */												\
+static inline bool da_monitor_enabled_##name(void)						\
+{												\
+	/* global switch */									\
+	if (unlikely(!rv_monitoring_on()))							\
+		return 0;									\
+												\
+	/* monitor enabled */									\
+	if (unlikely(!rv_##name.enabled))							\
+		return 0;									\
+												\
+	return 1;										\
+}												\
+												\
+/*												\
+ * da_monitor_handling_event_##name - checks if the monitor is ready to handle events		\
+ */												\
+static inline bool da_monitor_handling_event_##name(struct da_monitor *da_mon)			\
+{												\
+												\
+	if (!da_monitor_enabled_##name())							\
+		return 0;									\
+												\
+	/* monitor is actually monitoring */							\
+	if (unlikely(!da_monitoring_##name(da_mon)))						\
+		return 0;									\
+												\
+	return 1;										\
+}
+
+/*
+ * Event handler for implicit monitors. Implicit monitor is the one which the
+ * handler does not need to specify which da_monitor to manipulate. Examples
+ * of implicit monitor are the per_cpu or the global ones.
+ */
+#define DECLARE_DA_MON_MODEL_HANDLER_IMPLICIT(name, type)					\
+												\
+static inline bool										\
+da_event_##name(struct da_monitor *da_mon, enum events_##name event)				\
+{												\
+	type curr_state = da_monitor_curr_state_##name(da_mon);					\
+	type next_state = model_get_next_state_##name(curr_state, event);			\
+												\
+	if (next_state != INVALID_STATE) {							\
+		da_monitor_set_state_##name(da_mon, next_state);				\
+												\
+		trace_event_##name(model_get_state_name_##name(curr_state),			\
+				   model_get_event_name_##name(event),				\
+				   model_get_state_name_##name(next_state),			\
+				   model_is_final_state_##name(next_state));			\
+												\
+		return true;									\
+	}											\
+												\
+	if (rv_reacting_on_##name())								\
+		cond_react_##name(format_react_msg_##name(curr_state, event));			\
+												\
+	trace_error_##name(model_get_state_name_##name(curr_state),				\
+			   model_get_event_name_##name(event));					\
+												\
+	return false;										\
+}												\
+
+/*
+ * Event handler for per_task monitors.
+ */
+#define DECLARE_DA_MON_MODEL_HANDLER_PER_TASK(name, type)					\
+												\
+static inline bool da_event_##name(struct da_monitor *da_mon, struct task_struct *tsk,		\
+				   enum events_##name event)					\
+{												\
+	type curr_state = da_monitor_curr_state_##name(da_mon);					\
+	type next_state = model_get_next_state_##name(curr_state, event);			\
+												\
+	if (next_state != INVALID_STATE) {							\
+		da_monitor_set_state_##name(da_mon, next_state);				\
+												\
+		trace_event_##name(tsk->pid,							\
+				   model_get_state_name_##name(curr_state),			\
+				   model_get_event_name_##name(event),				\
+				   model_get_state_name_##name(next_state),			\
+				   model_is_final_state_##name(next_state));			\
+												\
+		return true;									\
+	}											\
+												\
+	if (rv_reacting_on_##name())								\
+		cond_react_##name(format_react_msg_##name(curr_state, event));			\
+												\
+	trace_error_##name(tsk->pid,								\
+			   model_get_state_name_##name(curr_state),				\
+			   model_get_event_name_##name(event));					\
+												\
+	return false;										\
+}
+
+/*
+ * Functions to define, init and get a global monitor.
+ */
+#define DECLARE_DA_MON_INIT_GLOBAL(name, type)							\
+												\
+/*												\
+ * global monitor (a single variable)								\
+ */												\
+static struct da_monitor da_mon_##name;								\
+												\
+/*												\
+ * da_get_monitor_##name - return the global monitor address					\
+ */												\
+static struct da_monitor *da_get_monitor_##name(void)						\
+{												\
+	return &da_mon_##name;									\
+}												\
+												\
+/*												\
+ * da_monitor_reset_all_##name - reset the single monitor					\
+ */												\
+static void da_monitor_reset_all_##name(void)							\
+{												\
+	da_monitor_reset_##name(da_get_monitor_##name());					\
+}												\
+												\
+/*												\
+ * da_monitor_init_##name - initialize a monitor						\
+ */												\
+static inline int da_monitor_init_##name(void)							\
+{												\
+	da_monitor_reset_all_##name();								\
+	return 0;										\
+}												\
+												\
+/*												\
+ * da_monitor_destroy_##name - destroy the monitor						\
+ */												\
+static inline void da_monitor_destroy_##name(void)						\
+{												\
+	return;											\
+}
+
+/*
+ * Functions to define, init and get a per-cpu monitor.
+ */
+#define DECLARE_DA_MON_INIT_PER_CPU(name, type)							\
+												\
+/*												\
+ * per-cpu monitor variables									\
+ */												\
+DEFINE_PER_CPU(struct da_monitor, da_mon_##name);						\
+												\
+/*												\
+ * da_get_monitor_##name - return current CPU monitor address					\
+ */												\
+static struct da_monitor *da_get_monitor_##name(void)						\
+{												\
+	return this_cpu_ptr(&da_mon_##name);							\
+}												\
+												\
+/*												\
+ * da_monitor_reset_all_##name - reset all CPUs' monitor					\
+ */												\
+static void da_monitor_reset_all_##name(void)							\
+{												\
+	struct da_monitor *da_mon;								\
+	int cpu;										\
+	for_each_cpu(cpu, cpu_online_mask) {							\
+		da_mon = per_cpu_ptr(&da_mon_##name, cpu);					\
+		da_monitor_reset_##name(da_mon);						\
+	}											\
+}												\
+												\
+/*												\
+ * da_monitor_init_##name - initialize all CPUs' monitor					\
+ */												\
+static inline int da_monitor_init_##name(void)							\
+{												\
+	da_monitor_reset_all_##name();								\
+	return 0;										\
+}												\
+												\
+/*												\
+ * da_monitor_destroy_##name - destroy the monitor						\
+ */												\
+static inline void da_monitor_destroy_##name(void)						\
+{												\
+	return;											\
+}
+
+/*
+ * Functions to define, init and get a per-task monitor.
+ */
+#define DECLARE_DA_MON_INIT_PER_TASK(name, type)						\
+												\
+/*												\
+ * The per-task monitor is stored a vector in the task struct. This variable			\
+ * stores the position on the vector reserved for this monitor.					\
+ */												\
+static int task_mon_slot_##name = RV_PER_TASK_MONITOR_INIT;					\
+												\
+/*												\
+ * da_get_monitor_##name - return the monitor in the allocated slot for tsk 			\
+ */												\
+static inline struct da_monitor *da_get_monitor_##name(struct task_struct *tsk)			\
+{												\
+	return &tsk->rv[task_mon_slot_##name].da_mon;						\
+}												\
+												\
+static void da_monitor_reset_all_##name(void)							\
+{												\
+	struct task_struct *g, *p;								\
+												\
+	read_lock(&tasklist_lock);								\
+	for_each_process_thread(g, p)								\
+		da_monitor_reset_##name(da_get_monitor_##name(p));				\
+	read_unlock(&tasklist_lock);								\
+}												\
+												\
+/*												\
+ * da_monitor_init_##name - initialize the per-task monitor					\
+ *												\
+ * Try to allocate a slot in the task's vector of monitors. If there				\
+ * is an available slot, use it and reset all task's monitor.					\
+ */												\
+static int da_monitor_init_##name(void)								\
+{												\
+	int slot;										\
+												\
+	slot = rv_get_task_monitor_slot();							\
+	if (slot < 0 || slot >= RV_PER_TASK_MONITOR_INIT)					\
+		return slot;									\
+												\
+	task_mon_slot_##name = slot;								\
+												\
+	da_monitor_reset_all_##name();								\
+	return 0;										\
+}												\
+												\
+/*												\
+ * da_monitor_destroy_##name - return the allocated slot					\
+ */												\
+static inline void da_monitor_destroy_##name(void)						\
+{												\
+	if (task_mon_slot_##name == RV_PER_TASK_MONITOR_INIT) {					\
+		WARN_ONCE(1, "Disabling a disabled monitor: " #name);				\
+		return;										\
+	}											\
+	rv_put_task_monitor_slot(task_mon_slot_##name);						\
+	task_mon_slot_##name = RV_PER_TASK_MONITOR_INIT;					\
+	return;											\
+}
+
+/*
+ * Handle event for implicit monitor: da_get_monitor_##name() will figure out
+ * the monitor.
+ */
+#define DECLARE_DA_MON_MONITOR_HANDLER_IMPLICIT(name, type)					\
+												\
+static inline void __da_handle_event_##name(struct da_monitor *da_mon,				\
+					    enum events_##name event)				\
+{												\
+	bool retval;										\
+												\
+	retval = da_event_##name(da_mon, event);						\
+	if (!retval)										\
+		da_monitor_reset_##name(da_mon);						\
+}												\
+												\
+/*												\
+ * da_handle_event_##name - handle an event							\
+ */												\
+static inline void da_handle_event_##name(enum events_##name event)				\
+{												\
+	struct da_monitor *da_mon = da_get_monitor_##name();					\
+	bool retval;										\
+												\
+	retval = da_monitor_handling_event_##name(da_mon);					\
+	if (!retval)										\
+		return;										\
+												\
+	__da_handle_event_##name(da_mon, event);						\
+}												\
+												\
+/*												\
+ * da_handle_start_event_##name - start monitoring or handle event				\
+ *												\
+ * This function is used to notify the monitor that the system is returning			\
+ * to the initial state, so the monitor can start monitoring in the next event.			\
+ * Thus:											\
+ *												\
+ * If the monitor already started, handle the event.						\
+ * If the monitor did not start yet, start the monitor but skip the event.			\
+ */												\
+static inline bool da_handle_start_event_##name(enum events_##name event)			\
+{												\
+	struct da_monitor *da_mon;								\
+												\
+	if (!da_monitor_enabled_##name())							\
+		return 0;									\
+												\
+	da_mon = da_get_monitor_##name();							\
+												\
+	if (unlikely(!da_monitoring_##name(da_mon))) {						\
+		da_monitor_start_##name(da_mon);						\
+		return 0;									\
+	}											\
+												\
+	__da_handle_event_##name(da_mon, event);						\
+												\
+	return 1;										\
+}												\
+												\
+/*												\
+ * da_handle_start_run_event_##name - start monitoring and handle event				\
+ *												\
+ * This function is used to notify the monitor that the system is in the			\
+ * initial state, so the monitor can start monitoring and handling event.			\
+ */												\
+static inline bool da_handle_start_run_event_##name(enum events_##name event)			\
+{												\
+	struct da_monitor *da_mon;								\
+												\
+	if (!da_monitor_enabled_##name())							\
+		return 0;									\
+												\
+	da_mon = da_get_monitor_##name();							\
+												\
+	if (unlikely(!da_monitoring_##name(da_mon)))						\
+		da_monitor_start_##name(da_mon);						\
+												\
+	__da_handle_event_##name(da_mon, event);						\
+												\
+	return 1;										\
+}
+
+/*
+ * Handle event for per task.
+ */
+#define DECLARE_DA_MON_MONITOR_HANDLER_PER_TASK(name, type)					\
+												\
+static inline void										\
+__da_handle_event_##name(struct da_monitor *da_mon, struct task_struct *tsk,			\
+			 enum events_##name event)						\
+{												\
+	bool retval;										\
+												\
+	retval = da_event_##name(da_mon, tsk, event);						\
+	if (!retval)										\
+		da_monitor_reset_##name(da_mon);						\
+}												\
+												\
+/*												\
+ * da_handle_event_##name - handle an event							\
+ */												\
+static inline void										\
+da_handle_event_##name(struct task_struct *tsk, enum events_##name event)			\
+{												\
+	struct da_monitor *da_mon = da_get_monitor_##name(tsk);					\
+	bool retval;										\
+												\
+	retval = da_monitor_handling_event_##name(da_mon);					\
+	if (!retval)										\
+		return;										\
+												\
+	__da_handle_event_##name(da_mon, tsk, event);						\
+}												\
+												\
+/*												\
+ * da_handle_start_event_##name - start monitoring or handle event				\
+ *												\
+ * This function is used to notify the monitor that the system is returning			\
+ * to the initial state, so the monitor can start monitoring in the next event.			\
+ * Thus:											\
+ *												\
+ * If the monitor already started, handle the event.						\
+ * If the monitor did not start yet, start the monitor but skip the event.			\
+ */												\
+static inline bool										\
+da_handle_start_event_##name(struct task_struct *tsk, enum events_##name event)			\
+{												\
+	struct da_monitor *da_mon;								\
+												\
+	if (!da_monitor_enabled_##name())							\
+		return 0;									\
+												\
+	da_mon = da_get_monitor_##name(tsk);							\
+												\
+	if (unlikely(!da_monitoring_##name(da_mon))) {						\
+		da_monitor_start_##name(da_mon);						\
+		return 0;									\
+	}											\
+												\
+	__da_handle_event_##name(da_mon, tsk, event);						\
+												\
+	return 1;										\
+}
+
+/*
+ * Entry point for the global monitor.
+ */
+#define DECLARE_DA_MON_GLOBAL(name, type)							\
+												\
+DECLARE_AUTOMATA_HELPERS(name, type)								\
+DECLARE_DA_MON_GENERIC_HELPERS(name, type)							\
+DECLARE_DA_MON_MODEL_HANDLER_IMPLICIT(name, type)						\
+DECLARE_DA_MON_INIT_GLOBAL(name, type)								\
+DECLARE_DA_MON_MONITOR_HANDLER_IMPLICIT(name, type)
+
+/*
+ * Entry point for the per-cpu monitor.
+ */
+#define DECLARE_DA_MON_PER_CPU(name, type)							\
+												\
+DECLARE_AUTOMATA_HELPERS(name, type)								\
+DECLARE_DA_MON_GENERIC_HELPERS(name, type)							\
+DECLARE_DA_MON_MODEL_HANDLER_IMPLICIT(name, type)						\
+DECLARE_DA_MON_INIT_PER_CPU(name, type)								\
+DECLARE_DA_MON_MONITOR_HANDLER_IMPLICIT(name, type)
+
+/*
+ * Entry point for the per-task monitor.
+ */
+#define DECLARE_DA_MON_PER_TASK(name, type)							\
+												\
+DECLARE_AUTOMATA_HELPERS(name, type)								\
+DECLARE_DA_MON_GENERIC_HELPERS(name, type)							\
+DECLARE_DA_MON_MODEL_HANDLER_PER_TASK(name, type)						\
+DECLARE_DA_MON_INIT_PER_TASK(name, type)							\
+DECLARE_DA_MON_MONITOR_HANDLER_PER_TASK(name, type)
diff --git a/include/trace/events/rv.h b/include/trace/events/rv.h
new file mode 100644
index 000000000000..20a2e09c6416
--- /dev/null
+++ b/include/trace/events/rv.h
@@ -0,0 +1,120 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rv
+
+#if !defined(_TRACE_RV_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_RV_H
+
+#include <linux/rv.h>
+#include <linux/tracepoint.h>
+
+#ifdef CONFIG_DA_MON_EVENTS_IMPLICIT
+DECLARE_EVENT_CLASS(event_da_monitor,
+
+	TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+
+	TP_ARGS(state, event, next_state, final_state),
+
+	TP_STRUCT__entry(
+		__array(	char,	state,		MAX_DA_NAME_LEN	)
+		__array(	char,	event,		MAX_DA_NAME_LEN	)
+		__array(	char,	next_state,	MAX_DA_NAME_LEN	)
+		__field(	bool,	final_state			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->state,		state,		MAX_DA_NAME_LEN);
+		memcpy(__entry->event,		event,		MAX_DA_NAME_LEN);
+		memcpy(__entry->next_state,	next_state,	MAX_DA_NAME_LEN);
+		__entry->final_state		= final_state;
+	),
+
+	TP_printk("%s x %s -> %s %s",
+		__entry->state,
+		__entry->event,
+		__entry->next_state,
+		__entry->final_state ? "(final)" : "")
+);
+
+DECLARE_EVENT_CLASS(error_da_monitor,
+
+	TP_PROTO(char *state, char *event),
+
+	TP_ARGS(state, event),
+
+	TP_STRUCT__entry(
+		__array(	char,	state,		MAX_DA_NAME_LEN	)
+		__array(	char,	event,		MAX_DA_NAME_LEN	)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->state,		state,		MAX_DA_NAME_LEN);
+		memcpy(__entry->event,		event,		MAX_DA_NAME_LEN);
+	),
+
+	TP_printk("event %s not expected in the state %s",
+		__entry->event,
+		__entry->state)
+);
+#endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */
+
+#ifdef CONFIG_DA_MON_EVENTS_ID
+DECLARE_EVENT_CLASS(event_da_monitor_id,
+
+	TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+
+	TP_ARGS(id, state, event, next_state, final_state),
+
+	TP_STRUCT__entry(
+		__field(	int,	id				)
+		__array(	char,	state,		MAX_DA_NAME_LEN	)
+		__array(	char,	event,		MAX_DA_NAME_LEN	)
+		__array(	char,	next_state,	MAX_DA_NAME_LEN	)
+		__field(	bool,	final_state			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->state,		state,		MAX_DA_NAME_LEN);
+		memcpy(__entry->event,		event,		MAX_DA_NAME_LEN);
+		memcpy(__entry->next_state,	next_state,	MAX_DA_NAME_LEN);
+		__entry->id			= id;
+		__entry->final_state		= final_state;
+	),
+
+	TP_printk("%d: %s x %s -> %s %s",
+		__entry->id,
+		__entry->state,
+		__entry->event,
+		__entry->next_state,
+		__entry->final_state ? "(final)" : "")
+);
+
+DECLARE_EVENT_CLASS(error_da_monitor_id,
+
+	TP_PROTO(int id, char *state, char *event),
+
+	TP_ARGS(id, state, event),
+
+	TP_STRUCT__entry(
+		__field(	int,	id				)
+		__array(	char,	state,		MAX_DA_NAME_LEN	)
+		__array(	char,	event,		MAX_DA_NAME_LEN	)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->state,		state,		MAX_DA_NAME_LEN);
+		memcpy(__entry->event,		event,		MAX_DA_NAME_LEN);
+		__entry->id			= id;
+	),
+
+	TP_printk("%d: event %s not expected in the state %s",
+		__entry->id,
+		__entry->event,
+		__entry->state)
+);
+#endif /* CONFIG_DA_MON_EVENTS_ID */
+#endif /* _TRACE_RV_H */
+
+/* This part ust be outside protection */
+#undef TRACE_INCLUDE_PATH
+#include <trace/define_trace.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 9d44f2d46c69..6f1f82ccd5f2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1964,6 +1964,18 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
 	mutex_unlock(&oom_adj_mutex);
 }
 
+#ifdef CONFIG_RV
+static void rv_task_fork(struct task_struct *p)
+{
+	int i;
+
+	for (i = 0; i < RV_PER_TASK_MONITORS; i++)
+		p->rv[i].da_mon.monitoring = false;
+}
+#else
+#define rv_task_fork(p) do {} while (0)
+#endif
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
@@ -2399,6 +2411,8 @@ static __latent_entropy struct task_struct *copy_process(
 	 */
 	copy_seccomp(p);
 
+	rv_task_fork(p);
+
 	rseq_fork(p, clone_flags);
 
 	/* Don't start children in a dying pid namespace */
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index 3eb5d48ab4f6..8714800e22ad 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -1,5 +1,16 @@
 # SPDX-License-Identifier: GPL-2.0-only
 #
+config DA_MON_EVENTS
+	bool
+
+config DA_MON_EVENTS_IMPLICIT
+	select DA_MON_EVENTS
+	bool
+
+config DA_MON_EVENTS_ID
+	select DA_MON_EVENTS
+	bool
+
 menuconfig RV
 	bool "Runtime Verification"
 	depends on TRACING
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index 45cf64eb2600..df6678c86334 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -140,6 +140,11 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 
+#ifdef CONFIG_DA_MON_EVENTS
+#define CREATE_TRACE_POINTS
+#include <trace/events/rv.h>
+#endif
+
 #include "rv.h"
 
 DEFINE_MUTEX(rv_interface_lock);
-- 
cgit 


From cc8e71c81746de1f8a44873015bc963a868eccba Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Fri, 29 Jul 2022 11:38:44 +0200
Subject: rv/include: Add instrumentation helper functions

Instrumentation helper functions to facilitate the instrumentation of
auto-generated RV monitors create by dot2k.

Link: https://lkml.kernel.org/r/3b36c9435f9d9299beb84e5c7c46920e205bedec.1659052063.git.bristot@kernel.org

Cc: Wim Van Sebroeck <wim@linux-watchdog.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Marco Elver <elver@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: Gabriele Paoloni <gpaoloni@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Tao Zhou <tao.zhou@linux.dev>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-trace-devel@vger.kernel.org
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/rv/instrumentation.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 include/rv/instrumentation.h

(limited to 'include')

diff --git a/include/rv/instrumentation.h b/include/rv/instrumentation.h
new file mode 100644
index 000000000000..d4e7a02ede1a
--- /dev/null
+++ b/include/rv/instrumentation.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org>
+ *
+ * Helper functions to facilitate the instrumentation of auto-generated
+ * RV monitors create by dot2k.
+ *
+ * The dot2k tool is available at tools/verification/dot2/
+ */
+
+#include <linux/ftrace.h>
+
+/*
+ * rv_attach_trace_probe - check and attach a handler function to a tracepoint
+ */
+#define rv_attach_trace_probe(monitor, tp, rv_handler)					\
+	do {										\
+		check_trace_callback_type_##tp(rv_handler);				\
+		WARN_ONCE(register_trace_##tp(rv_handler, NULL),			\
+				"fail attaching " #monitor " " #tp "handler");		\
+	} while (0)
+
+/*
+ * rv_detach_trace_probe - detach a handler function to a tracepoint
+ */
+#define rv_detach_trace_probe(monitor, tp, rv_handler)					\
+	do {										\
+		unregister_trace_##tp(rv_handler, NULL);				\
+	} while (0)
-- 
cgit 


From d57aff24796f8f784e1f7beed6da3308e5bb13c0 Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Fri, 29 Jul 2022 11:38:49 +0200
Subject: Documentation/rv: Add deterministic automata monitor synthesis
 documentation

Add the da_monitor_synthesis.rst introduces some concepts behind the
Deterministic Automata (DA) monitor synthesis and interface.

Link: https://lkml.kernel.org/r/7873bdb7b2e5d2bc0b2eb6ca0b324af9a0ba27a0.1659052063.git.bristot@kernel.org

Cc: Wim Van Sebroeck <wim@linux-watchdog.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Marco Elver <elver@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: Gabriele Paoloni <gpaoloni@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Tao Zhou <tao.zhou@linux.dev>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-trace-devel@vger.kernel.org
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 Documentation/trace/rv/da_monitor_synthesis.rst | 147 ++++++++++++++++++++++++
 Documentation/trace/rv/index.rst                |   1 +
 include/rv/da_monitor.h                         |   3 +
 tools/verification/dot2/dot2k                   |   3 +
 tools/verification/dot2/dot2k.py                |   3 +
 5 files changed, 157 insertions(+)
 create mode 100644 Documentation/trace/rv/da_monitor_synthesis.rst

(limited to 'include')

diff --git a/Documentation/trace/rv/da_monitor_synthesis.rst b/Documentation/trace/rv/da_monitor_synthesis.rst
new file mode 100644
index 000000000000..0dbdcd1e62b9
--- /dev/null
+++ b/Documentation/trace/rv/da_monitor_synthesis.rst
@@ -0,0 +1,147 @@
+Deterministic Automata Monitor Synthesis
+========================================
+
+The starting point for the application of runtime verification (RV) technics
+is the *specification* or *modeling* of the desired (or undesired) behavior
+of the system under scrutiny.
+
+The formal representation needs to be then *synthesized* into a *monitor*
+that can then be used in the analysis of the trace of the system. The
+*monitor* connects to the system via an *instrumentation* that converts
+the events from the *system* to the events of the *specification*.
+
+
+In Linux terms, the runtime verification monitors are encapsulated inside
+the *RV monitor* abstraction. The RV monitor includes a set of instances
+of the monitor (per-cpu monitor, per-task monitor, and so on), the helper
+functions that glue the monitor to the system reference model, and the
+trace output as a reaction to event parsing and exceptions, as depicted
+below::
+
+ Linux  +----- RV Monitor ----------------------------------+ Formal
+  Realm |                                                   |  Realm
+  +-------------------+     +----------------+     +-----------------+
+  |   Linux kernel    |     |     Monitor    |     |     Reference   |
+  |     Tracing       |  -> |   Instance(s)  | <-  |       Model     |
+  | (instrumentation) |     | (verification) |     | (specification) |
+  +-------------------+     +----------------+     +-----------------+
+         |                          |                       |
+         |                          V                       |
+         |                     +----------+                 |
+         |                     | Reaction |                 |
+         |                     +--+--+--+-+                 |
+         |                        |  |  |                   |
+         |                        |  |  +-> trace output ?  |
+         +------------------------|--|----------------------+
+                                  |  +----> panic ?
+                                  +-------> <user-specified>
+
+DA monitor synthesis
+--------------------
+
+The synthesis of automata-based models into the Linux *RV monitor* abstraction
+is automated by the dot2k tool and the rv/da_monitor.h header file that
+contains a set of macros that automatically generate the monitor's code.
+
+dot2k
+-----
+
+The dot2k utility leverages dot2c by converting an automaton model in
+the DOT format into the C representation [1] and creating the skeleton of
+a kernel monitor in C.
+
+For example, it is possible to transform the wip.dot model present in
+[1] into a per-cpu monitor with the following command::
+
+  $ dot2k -d wip.dot -t per_cpu
+
+This will create a directory named wip/ with the following files:
+
+- wip.h: the wip model in C
+- wip.c: the RV monitor
+
+The wip.c file contains the monitor declaration and the starting point for
+the system instrumentation.
+
+Monitor macros
+--------------
+
+The rv/da_monitor.h enables automatic code generation for the *Monitor
+Instance(s)* using C macros.
+
+The benefits of the usage of macro for monitor synthesis are 3-fold as it:
+
+- Reduces the code duplication;
+- Facilitates the bug fix/improvement;
+- Avoids the case of developers changing the core of the monitor code
+  to manipulate the model in a (let's say) non-standard way.
+
+This initial implementation presents three different types of monitor instances:
+
+- ``#define DECLARE_DA_MON_GLOBAL(name, type)``
+- ``#define DECLARE_DA_MON_PER_CPU(name, type)``
+- ``#define DECLARE_DA_MON_PER_TASK(name, type)``
+
+The first declares the functions for a global deterministic automata monitor,
+the second for monitors with per-cpu instances, and the third with per-task
+instances.
+
+In all cases, the 'name' argument is a string that identifies the monitor, and
+the 'type' argument is the data type used by dot2k on the representation of
+the model in C.
+
+For example, the wip model with two states and three events can be
+stored in an 'unsigned char' type. Considering that the preemption control
+is a per-cpu behavior, the monitor declaration in the 'wip.c' file is::
+
+  DECLARE_DA_MON_PER_CPU(wip, unsigned char);
+
+The monitor is executed by sending events to be processed via the functions
+presented below::
+
+  da_handle_event_$(MONITOR_NAME)($(event from event enum));
+  da_handle_start_event_$(MONITOR_NAME)($(event from event enum));
+  da_handle_start_run_event_$(MONITOR_NAME)($(event from event enum));
+
+The function ``da_handle_event_$(MONITOR_NAME)()`` is the regular case where
+the event will be processed if the monitor is processing events.
+
+When a monitor is enabled, it is placed in the initial state of the automata.
+However, the monitor does not know if the system is in the *initial state*.
+
+The ``da_handle_start_event_$(MONITOR_NAME)()`` function is used to notify the
+monitor that the system is returning to the initial state, so the monitor can
+start monitoring the next event.
+
+The ``da_handle_start_run_event_$(MONITOR_NAME)()`` function is used to notify
+the monitor that the system is known to be in the initial state, so the
+monitor can start monitoring and monitor the current event.
+
+Using the wip model as example, the events "preempt_disable" and
+"sched_waking" should be sent to monitor, respectively, via [2]::
+
+  da_handle_event_wip(preempt_disable_wip);
+  da_handle_event_wip(sched_waking_wip);
+
+While the event "preempt_enabled" will use::
+
+  da_handle_start_event_wip(preempt_enable_wip);
+
+To notify the monitor that the system will be returning to the initial state,
+so the system and the monitor should be in sync.
+
+Final remarks
+-------------
+
+With the monitor synthesis in place using the rv/da_monitor.h and
+dot2k, the developer's work should be limited to the instrumentation
+of the system, increasing the confidence in the overall approach.
+
+[1] For details about deterministic automata format and the translation
+from one representation to another, see::
+
+  Documentation/trace/rv/deterministic_automata.rst
+
+[2] dot2k appends the monitor's name suffix to the events enums to
+avoid conflicting variables when exporting the global vmlinux.h
+use by BPF programs.
diff --git a/Documentation/trace/rv/index.rst b/Documentation/trace/rv/index.rst
index 013a41a410cf..46d47f33052c 100644
--- a/Documentation/trace/rv/index.rst
+++ b/Documentation/trace/rv/index.rst
@@ -8,3 +8,4 @@ Runtime Verification
 
    runtime-verification.rst
    deterministic_automata.rst
+   da_monitor_synthesis.rst
diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h
index 001bc298289f..9eb75683e012 100644
--- a/include/rv/da_monitor.h
+++ b/include/rv/da_monitor.h
@@ -6,6 +6,9 @@
  * with automata models in C generated by the dot2k tool.
  *
  * The dot2k tool is available at tools/verification/dot2k/
+ *
+ * For further information, see:
+ *   Documentation/trace/rv/da_monitor_synthesis.rst
  */
 
 #include <rv/automata.h>
diff --git a/tools/verification/dot2/dot2k b/tools/verification/dot2/dot2k
index 69106f4b7682..9dcd38abe20a 100644
--- a/tools/verification/dot2/dot2k
+++ b/tools/verification/dot2/dot2k
@@ -4,6 +4,9 @@
 # Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org>
 #
 # dot2k: transform dot files into a monitor for the Linux kernel.
+#
+# For further information, see:
+#   Documentation/trace/rv/da_monitor_synthesis.rst
 
 if __name__ == '__main__':
     from dot2.dot2k import dot2k
diff --git a/tools/verification/dot2/dot2k.py b/tools/verification/dot2/dot2k.py
index d85f755e3bc7..016550fccf1f 100644
--- a/tools/verification/dot2/dot2k.py
+++ b/tools/verification/dot2/dot2k.py
@@ -4,6 +4,9 @@
 # Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org>
 #
 # dot2k: transform dot files into a monitor for the Linux kernel.
+#
+# For further information, see:
+#   Documentation/trace/rv/da_monitor_synthesis.rst
 
 from dot2.dot2c import Dot2c
 import platform
-- 
cgit 


From 10bde81c74863472047f31304064018c40f488ee Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Fri, 29 Jul 2022 11:38:52 +0200
Subject: rv/monitor: Add the wip monitor

The wakeup in preemptive (wip) monitor verifies if the
wakeup events always take place with preemption disabled:

                     |
                     |
                     v
                   #==================#
                   H    preemptive    H <+
                   #==================#  |
                     |                   |
                     | preempt_disable   | preempt_enable
                     v                   |
    sched_waking   +------------------+  |
  +--------------- |                  |  |
  |                |  non_preemptive  |  |
  +--------------> |                  | -+
                   +------------------+

The wakeup event always takes place with preemption disabled because
of the scheduler synchronization. However, because the preempt_count
and its trace event are not atomic with regard to interrupts, some
inconsistencies might happen.

The documentation illustrates one of these cases.

Link: https://lkml.kernel.org/r/c98ca678df81115fddc04921b3c79720c836b18f.1659052063.git.bristot@kernel.org

Cc: Wim Van Sebroeck <wim@linux-watchdog.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Marco Elver <elver@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: Gabriele Paoloni <gpaoloni@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Tao Zhou <tao.zhou@linux.dev>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-trace-devel@vger.kernel.org
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 Documentation/trace/rv/index.rst       |  1 +
 Documentation/trace/rv/monitor_wip.rst | 55 ++++++++++++++++++++++++++++++++++
 include/trace/events/rv.h              | 10 +++++++
 kernel/trace/rv/Kconfig                | 13 ++++++++
 kernel/trace/rv/Makefile               |  1 +
 kernel/trace/rv/monitors/wip/wip.c     | 51 ++++++++++---------------------
 tools/verification/models/wip.dot      | 16 ++++++++++
 7 files changed, 111 insertions(+), 36 deletions(-)
 create mode 100644 Documentation/trace/rv/monitor_wip.rst
 create mode 100644 tools/verification/models/wip.dot

(limited to 'include')

diff --git a/Documentation/trace/rv/index.rst b/Documentation/trace/rv/index.rst
index db2ae3f90b90..4cb71ed628b8 100644
--- a/Documentation/trace/rv/index.rst
+++ b/Documentation/trace/rv/index.rst
@@ -10,3 +10,4 @@ Runtime Verification
    deterministic_automata.rst
    da_monitor_synthesis.rst
    da_monitor_instrumentation.rst
+   monitor_wip.rst
diff --git a/Documentation/trace/rv/monitor_wip.rst b/Documentation/trace/rv/monitor_wip.rst
new file mode 100644
index 000000000000..a95763438c48
--- /dev/null
+++ b/Documentation/trace/rv/monitor_wip.rst
@@ -0,0 +1,55 @@
+Monitor wip
+===========
+
+- Name: wip - wakeup in preemptive
+- Type: per-cpu deterministic automaton
+- Author: Daniel Bristot de Oliveira <bristot@kernel.org>
+
+Description
+-----------
+
+The wakeup in preemptive (wip) monitor is a sample per-cpu monitor
+that verifies if the wakeup events always take place with
+preemption disabled::
+
+                     |
+                     |
+                     v
+                   #==================#
+                   H    preemptive    H <+
+                   #==================#  |
+                     |                   |
+                     | preempt_disable   | preempt_enable
+                     v                   |
+    sched_waking   +------------------+  |
+  +--------------- |                  |  |
+  |                |  non_preemptive  |  |
+  +--------------> |                  | -+
+                   +------------------+
+
+The wakeup event always takes place with preemption disabled because
+of the scheduler synchronization. However, because the preempt_count
+and its trace event are not atomic with regard to interrupts, some
+inconsistencies might happen. For example::
+
+  preempt_disable() {
+	__preempt_count_add(1)
+	------->	smp_apic_timer_interrupt() {
+				preempt_disable()
+					do not trace (preempt count >= 1)
+
+				wake up a thread
+
+				preempt_enable()
+					 do not trace (preempt count >= 1)
+			}
+	<------
+	trace_preempt_disable();
+  }
+
+This problem was reported and discussed here:
+  https://lore.kernel.org/r/cover.1559051152.git.bristot@redhat.com/
+
+Specification
+-------------
+Grapviz Dot file in tools/verification/models/wip.dot
diff --git a/include/trace/events/rv.h b/include/trace/events/rv.h
index 20a2e09c6416..e972f27d8df3 100644
--- a/include/trace/events/rv.h
+++ b/include/trace/events/rv.h
@@ -56,6 +56,16 @@ DECLARE_EVENT_CLASS(error_da_monitor,
 		__entry->event,
 		__entry->state)
 );
+
+#ifdef CONFIG_RV_MON_WIP
+DEFINE_EVENT(event_da_monitor, event_wip,
+	    TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+	    TP_ARGS(state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor, error_wip,
+	     TP_PROTO(char *state, char *event),
+	     TP_ARGS(state, event));
+#endif /* CONFIG_RV_MON_WIP */
 #endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */
 
 #ifdef CONFIG_DA_MON_EVENTS_ID
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index 0d9552b406c6..e50f3346164a 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -25,6 +25,19 @@ menuconfig RV
 	  For further information, see:
 	    Documentation/trace/rv/runtime-verification.rst
 
+config RV_MON_WIP
+	depends on RV
+	depends on PREEMPT_TRACER
+	select DA_MON_EVENTS_IMPLICIT
+	bool "wip monitor"
+	help
+	  Enable wip (wakeup in preemptive) sample monitor that illustrates
+	  the usage of per-cpu monitors, and one limitation of the
+	  preempt_disable/enable events.
+
+	  For further information, see:
+	    Documentation/trace/rv/monitor_wip.rst
+
 config RV_REACTORS
 	bool "Runtime verification reactors"
 	default y
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index 8944274d9b41..b41109d2750a 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -2,3 +2,4 @@
 
 obj-$(CONFIG_RV) += rv.o
 obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
+obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o
diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c
index 79a054ca0cde..83cace53b9fa 100644
--- a/kernel/trace/rv/monitors/wip/wip.c
+++ b/kernel/trace/rv/monitors/wip/wip.c
@@ -10,44 +10,26 @@
 
 #define MODULE_NAME "wip"
 
-/*
- * XXX: include required tracepoint headers, e.g.,
- * #include <linux/trace/events/sched.h>
- */
 #include <trace/events/rv.h>
+#include <trace/events/sched.h>
+#include <trace/events/preemptirq.h>
 
-/*
- * This is the self-generated part of the monitor. Generally, there is no need
- * to touch this section.
- */
 #include "wip.h"
 
-/*
- * Declare the deterministic automata monitor.
- *
- * The rv monitor reference is needed for the monitor declaration.
- */
 struct rv_monitor rv_wip;
 DECLARE_DA_MON_PER_CPU(wip, unsigned char);
 
-/*
- * This is the instrumentation part of the monitor.
- *
- * This is the section where manual work is required. Here the kernel events
- * are translated into model's event.
- *
- */
-static void handle_preempt_disable(void *data, /* XXX: fill header */)
+static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
 {
 	da_handle_event_wip(preempt_disable_wip);
 }
 
-static void handle_preempt_enable(void *data, /* XXX: fill header */)
+static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip)
 {
-	da_handle_event_wip(preempt_enable_wip);
+	da_handle_start_event_wip(preempt_enable_wip);
 }
 
-static void handle_sched_waking(void *data, /* XXX: fill header */)
+static void handle_sched_waking(void *data, struct task_struct *task)
 {
 	da_handle_event_wip(sched_waking_wip);
 }
@@ -60,9 +42,9 @@ static int enable_wip(void)
 	if (retval)
 		return retval;
 
-	rv_attach_trace_probe("wip", /* XXX: tracepoint */, handle_preempt_disable);
-	rv_attach_trace_probe("wip", /* XXX: tracepoint */, handle_preempt_enable);
-	rv_attach_trace_probe("wip", /* XXX: tracepoint */, handle_sched_waking);
+	rv_attach_trace_probe("wip", preempt_enable, handle_preempt_enable);
+	rv_attach_trace_probe("wip", sched_waking, handle_sched_waking);
+	rv_attach_trace_probe("wip", preempt_disable, handle_preempt_disable);
 
 	return 0;
 }
@@ -71,19 +53,16 @@ static void disable_wip(void)
 {
 	rv_wip.enabled = 0;
 
-	rv_detach_trace_probe("wip", /* XXX: tracepoint */, handle_preempt_disable);
-	rv_detach_trace_probe("wip", /* XXX: tracepoint */, handle_preempt_enable);
-	rv_detach_trace_probe("wip", /* XXX: tracepoint */, handle_sched_waking);
+	rv_detach_trace_probe("wip", preempt_disable, handle_preempt_disable);
+	rv_detach_trace_probe("wip", preempt_enable, handle_preempt_enable);
+	rv_detach_trace_probe("wip", sched_waking, handle_sched_waking);
 
 	da_monitor_destroy_wip();
 }
 
-/*
- * This is the monitor register section.
- */
 struct rv_monitor rv_wip = {
 	.name = "wip",
-	.description = "auto-generated wip",
+	.description = "wakeup in preemptive per-cpu testing monitor.",
 	.enable = enable_wip,
 	.disable = disable_wip,
 	.reset = da_monitor_reset_all_wip,
@@ -105,5 +84,5 @@ module_init(register_wip);
 module_exit(unregister_wip);
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("dot2k: auto-generated");
-MODULE_DESCRIPTION("wip");
+MODULE_AUTHOR("Daniel Bristot de Oliveira <bristot@kernel.org>");
+MODULE_DESCRIPTION("wip: wakeup in preemptive - per-cpu sample monitor.");
diff --git a/tools/verification/models/wip.dot b/tools/verification/models/wip.dot
new file mode 100644
index 000000000000..2a53a9700a89
--- /dev/null
+++ b/tools/verification/models/wip.dot
@@ -0,0 +1,16 @@
+digraph state_automaton {
+	{node [shape = circle] "non_preemptive"};
+	{node [shape = plaintext, style=invis, label=""] "__init_preemptive"};
+	{node [shape = doublecircle] "preemptive"};
+	{node [shape = circle] "preemptive"};
+	"__init_preemptive" -> "preemptive";
+	"non_preemptive" [label = "non_preemptive"];
+	"non_preemptive" -> "non_preemptive" [ label = "sched_waking" ];
+	"non_preemptive" -> "preemptive" [ label = "preempt_enable" ];
+	"preemptive" [label = "preemptive"];
+	"preemptive" -> "non_preemptive" [ label = "preempt_disable" ];
+	{ rank = min ;
+		"__init_preemptive";
+		"preemptive";
+	}
+}
-- 
cgit 


From ccc319dcb450d57b7befe924453d06804d83ba73 Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Fri, 29 Jul 2022 11:38:53 +0200
Subject: rv/monitor: Add the wwnr monitor

Per task wakeup while not running (wwnr) monitor.

This model is broken, the reason is that a task can be running in the
processor without being set as RUNNABLE. Think about a task about to
sleep:

1:      set_current_state(TASK_UNINTERRUPTIBLE);
2:      schedule();

And then imagine an IRQ happening in between the lines one and two,
waking the task up. BOOM, the wakeup will happen while the task is
running.

Q: Why do we need this model, so?
A: To test the reactors.

Link: https://lkml.kernel.org/r/473c0fc39967250fdebcff8b620311c11dccad30.1659052063.git.bristot@kernel.org

Cc: Wim Van Sebroeck <wim@linux-watchdog.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Marco Elver <elver@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: Gabriele Paoloni <gpaoloni@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Tao Zhou <tao.zhou@linux.dev>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-trace-devel@vger.kernel.org
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 Documentation/trace/rv/index.rst        |  1 +
 Documentation/trace/rv/monitor_wwnr.rst | 45 +++++++++++++++++
 include/trace/events/rv.h               | 12 +++++
 kernel/trace/rv/Kconfig                 | 12 +++++
 kernel/trace/rv/Makefile                |  1 +
 kernel/trace/rv/monitors/wwnr/wwnr.c    | 87 +++++++++++++++++++++++++++++++++
 kernel/trace/rv/monitors/wwnr/wwnr.h    | 46 +++++++++++++++++
 tools/verification/models/wwnr.dot      | 16 ++++++
 8 files changed, 220 insertions(+)
 create mode 100644 Documentation/trace/rv/monitor_wwnr.rst
 create mode 100644 kernel/trace/rv/monitors/wwnr/wwnr.c
 create mode 100644 kernel/trace/rv/monitors/wwnr/wwnr.h
 create mode 100644 tools/verification/models/wwnr.dot

(limited to 'include')

diff --git a/Documentation/trace/rv/index.rst b/Documentation/trace/rv/index.rst
index 4cb71ed628b8..15fa966102c0 100644
--- a/Documentation/trace/rv/index.rst
+++ b/Documentation/trace/rv/index.rst
@@ -11,3 +11,4 @@ Runtime Verification
    da_monitor_synthesis.rst
    da_monitor_instrumentation.rst
    monitor_wip.rst
+   monitor_wwnr.rst
diff --git a/Documentation/trace/rv/monitor_wwnr.rst b/Documentation/trace/rv/monitor_wwnr.rst
new file mode 100644
index 000000000000..80f1777b85aa
--- /dev/null
+++ b/Documentation/trace/rv/monitor_wwnr.rst
@@ -0,0 +1,45 @@
+Monitor wwnr
+============
+
+- Name: wwrn - wakeup while not running
+- Type: per-task deterministic automaton
+- Author: Daniel Bristot de Oliveira <bristot@kernel.org>
+
+Description
+-----------
+
+This is a per-task sample monitor, with the following
+definition::
+
+               |
+               |
+               v
+    wakeup   +-------------+
+  +--------- |             |
+  |          | not_running |
+  +--------> |             | <+
+             +-------------+  |
+               |              |
+               | switch_in    | switch_out
+               v              |
+             +-------------+  |
+             |   running   | -+
+             +-------------+
+
+This model is borken, the reason is that a task can be running
+in the processor without being set as RUNNABLE. Think about a
+task about to sleep::
+
+  1:      set_current_state(TASK_UNINTERRUPTIBLE);
+  2:      schedule();
+
+And then imagine an IRQ happening in between the lines one and two,
+waking the task up. BOOM, the wakeup will happen while the task is
+running.
+
+- Why do we need this model, so?
+- To test the reactors.
+
+Specification
+-------------
+Grapviz Dot file in tools/verification/models/wwnr.dot
diff --git a/include/trace/events/rv.h b/include/trace/events/rv.h
index e972f27d8df3..56592da9301c 100644
--- a/include/trace/events/rv.h
+++ b/include/trace/events/rv.h
@@ -122,6 +122,18 @@ DECLARE_EVENT_CLASS(error_da_monitor_id,
 		__entry->event,
 		__entry->state)
 );
+
+#ifdef CONFIG_RV_MON_WWNR
+/* id is the pid of the task */
+DEFINE_EVENT(event_da_monitor_id, event_wwnr,
+	     TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+	     TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_wwnr,
+	     TP_PROTO(int id, char *state, char *event),
+	     TP_ARGS(id, state, event));
+#endif /* CONFIG_RV_MON_WWNR */
+
 #endif /* CONFIG_DA_MON_EVENTS_ID */
 #endif /* _TRACE_RV_H */
 
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index e50f3346164a..b259d6e8dc7c 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -38,6 +38,18 @@ config RV_MON_WIP
 	  For further information, see:
 	    Documentation/trace/rv/monitor_wip.rst
 
+config RV_MON_WWNR
+	depends on RV
+	select DA_MON_EVENTS_ID
+	bool "wwnr monitor"
+	help
+	  Enable wwnr (wakeup while not running) sample monitor, this is a
+	  sample monitor that illustrates the usage of per-task monitor.
+	  The model is borken on purpose: it serves to test reactors.
+
+	  For further information, see:
+	    Documentation/trace/rv/monitor_wwnr.rst
+
 config RV_REACTORS
 	bool "Runtime verification reactors"
 	default y
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index b41109d2750a..af0ff9a46418 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -3,3 +3,4 @@
 obj-$(CONFIG_RV) += rv.o
 obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
 obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o
+obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c
new file mode 100644
index 000000000000..599225d9cf38
--- /dev/null
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "wwnr"
+
+#include <trace/events/rv.h>
+#include <trace/events/sched.h>
+
+#include "wwnr.h"
+
+struct rv_monitor rv_wwnr;
+DECLARE_DA_MON_PER_TASK(wwnr, unsigned char);
+
+static void handle_switch(void *data, bool preempt, struct task_struct *p,
+			  struct task_struct *n, unsigned int prev_state)
+{
+	/* start monitoring only after the first suspension */
+	if (prev_state == TASK_INTERRUPTIBLE)
+		da_handle_start_event_wwnr(p, switch_out_wwnr);
+	else
+		da_handle_event_wwnr(p, switch_out_wwnr);
+
+	da_handle_event_wwnr(n, switch_in_wwnr);
+}
+
+static void handle_wakeup(void *data, struct task_struct *p)
+{
+	da_handle_event_wwnr(p, wakeup_wwnr);
+}
+
+static int enable_wwnr(void)
+{
+	int retval;
+
+	retval = da_monitor_init_wwnr();
+	if (retval)
+		return retval;
+
+	rv_attach_trace_probe("wwnr", sched_switch, handle_switch);
+	rv_attach_trace_probe("wwnr", sched_wakeup, handle_wakeup);
+
+	return 0;
+}
+
+static void disable_wwnr(void)
+{
+	rv_wwnr.enabled = 0;
+
+	rv_detach_trace_probe("wwnr", sched_switch, handle_switch);
+	rv_detach_trace_probe("wwnr", sched_wakeup, handle_wakeup);
+
+	da_monitor_destroy_wwnr();
+}
+
+struct rv_monitor rv_wwnr = {
+	.name = "wwnr",
+	.description = "wakeup while not running per-task testing model.",
+	.enable = enable_wwnr,
+	.disable = disable_wwnr,
+	.reset = da_monitor_reset_all_wwnr,
+	.enabled = 0,
+};
+
+static int register_wwnr(void)
+{
+	rv_register_monitor(&rv_wwnr);
+	return 0;
+}
+
+static void unregister_wwnr(void)
+{
+	rv_unregister_monitor(&rv_wwnr);
+}
+
+module_init(register_wwnr);
+module_exit(unregister_wwnr);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Daniel Bristot de Oliveira <bristot@kernel.org>");
+MODULE_DESCRIPTION("wwnr: wakeup while not running monitor");
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h
new file mode 100644
index 000000000000..d1afe55cdd4c
--- /dev/null
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.h
@@ -0,0 +1,46 @@
+/*
+ * Automatically generated C representation of wwnr automaton
+ * For further information about this format, see kernel documentation:
+ *   Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_wwnr {
+	not_running_wwnr = 0,
+	running_wwnr,
+	state_max_wwnr
+};
+
+#define INVALID_STATE state_max_wwnr
+
+enum events_wwnr {
+	switch_in_wwnr = 0,
+	switch_out_wwnr,
+	wakeup_wwnr,
+	event_max_wwnr
+};
+
+struct automaton_wwnr {
+	char *state_names[state_max_wwnr];
+	char *event_names[event_max_wwnr];
+	unsigned char function[state_max_wwnr][event_max_wwnr];
+	unsigned char initial_state;
+	bool final_states[state_max_wwnr];
+};
+
+struct automaton_wwnr automaton_wwnr = {
+	.state_names = {
+		"not_running",
+		"running"
+	},
+	.event_names = {
+		"switch_in",
+		"switch_out",
+		"wakeup"
+	},
+	.function = {
+		{       running_wwnr,      INVALID_STATE,   not_running_wwnr },
+		{      INVALID_STATE,   not_running_wwnr,      INVALID_STATE },
+	},
+	.initial_state = not_running_wwnr,
+	.final_states = { 1, 0 },
+};
diff --git a/tools/verification/models/wwnr.dot b/tools/verification/models/wwnr.dot
new file mode 100644
index 000000000000..1b206e83129c
--- /dev/null
+++ b/tools/verification/models/wwnr.dot
@@ -0,0 +1,16 @@
+digraph state_automaton {
+	{node [shape = plaintext, style=invis, label=""] "__init_not_running"};
+	{node [shape = ellipse] "not_running"};
+	{node [shape = plaintext] "not_running"};
+	{node [shape = plaintext] "running"};
+	"__init_not_running" -> "not_running";
+	"not_running" [label = "not_running", color = green3];
+	"not_running" -> "not_running" [ label = "wakeup" ];
+	"not_running" -> "running" [ label = "switch_in" ];
+	"running" [label = "running"];
+	"running" -> "not_running" [ label = "switch_out" ];
+	{ rank = min ;
+		"__init_not_running";
+		"not_running";
+	}
+}
-- 
cgit 


From 4c3d2f9388d36eb28640a220a6f908328442d873 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Sun, 31 Jul 2022 01:59:28 -0400
Subject: tracing: Use a struct alignof to determine trace event field
 alignment

alignof() gives an alignment of types as they would be as standalone
variables. But alignment in structures might be different, and when
building the fields of events, the alignment must be the actual
alignment otherwise the field offsets may not match what they actually
are.

This caused trace-cmd to crash, as libtraceevent did not check if the
field offset was bigger than the event. The write_msr and read_msr
events on 32 bit had their fields incorrect, because it had a u64 field
between two ints. alignof(u64) would give 8, but the u64 field was at a
4 byte alignment.

Define a macro as:

   ALIGN_STRUCTFIELD(type) ((int)(offsetof(struct {char a; type b;}, b)))

which gives the actual alignment of types in a structure.

Link: https://lkml.kernel.org/r/20220731015928.7ab3a154@rorschach.local.home

Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: stable@vger.kernel.org
Fixes: 04ae87a52074e ("ftrace: Rework event_create_dir()")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/trace/stages/stage4_event_fields.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/trace/stages/stage4_event_fields.h b/include/trace/stages/stage4_event_fields.h
index c3790ec7a453..80d34f396555 100644
--- a/include/trace/stages/stage4_event_fields.h
+++ b/include/trace/stages/stage4_event_fields.h
@@ -2,16 +2,18 @@
 
 /* Stage 4 definitions for creating trace events */
 
+#define ALIGN_STRUCTFIELD(type) ((int)(offsetof(struct {char a; type b;}, b)))
+
 #undef __field_ext
 #define __field_ext(_type, _item, _filter_type) {			\
 	.type = #_type, .name = #_item,					\
-	.size = sizeof(_type), .align = __alignof__(_type),		\
+	.size = sizeof(_type), .align = ALIGN_STRUCTFIELD(_type),	\
 	.is_signed = is_signed_type(_type), .filter_type = _filter_type },
 
 #undef __field_struct_ext
 #define __field_struct_ext(_type, _item, _filter_type) {		\
 	.type = #_type, .name = #_item,					\
-	.size = sizeof(_type), .align = __alignof__(_type),		\
+	.size = sizeof(_type), .align = ALIGN_STRUCTFIELD(_type),	\
 	0, .filter_type = _filter_type },
 
 #undef __field
@@ -23,7 +25,7 @@
 #undef __array
 #define __array(_type, _item, _len) {					\
 	.type = #_type"["__stringify(_len)"]", .name = #_item,		\
-	.size = sizeof(_type[_len]), .align = __alignof__(_type),	\
+	.size = sizeof(_type[_len]), .align = ALIGN_STRUCTFIELD(_type),	\
 	.is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER },
 
 #undef __dynamic_array
-- 
cgit 


From a603002eea8213eec5211be5a85db8340aea06d0 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Wed, 22 Jun 2022 08:38:36 +0200
Subject: virtio: replace restricted mem access flag with callback

Instead of having a global flag to require restricted memory access
for all virtio devices, introduce a callback which can select that
requirement on a per-device basis.

For convenience add a common function returning always true, which can
be used for use cases like SEV.

Per default use a callback always returning false.

As the callback needs to be set in early init code already, add a
virtio anchor which is builtin in case virtio is enabled.

Signed-off-by: Juergen Gross <jgross@suse.com>
Tested-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> # Arm64 guest using Xen
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Link: https://lore.kernel.org/r/20220622063838.8854-2-jgross@suse.com
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 arch/s390/mm/init.c              |  4 ++--
 arch/x86/mm/mem_encrypt_amd.c    |  4 ++--
 drivers/virtio/Kconfig           |  4 ++++
 drivers/virtio/Makefile          |  1 +
 drivers/virtio/virtio.c          |  4 ++--
 drivers/virtio/virtio_anchor.c   | 18 ++++++++++++++++++
 include/linux/platform-feature.h |  6 +-----
 include/linux/virtio_anchor.h    | 19 +++++++++++++++++++
 include/xen/xen.h                |  4 ++--
 9 files changed, 51 insertions(+), 13 deletions(-)
 create mode 100644 drivers/virtio/virtio_anchor.c
 create mode 100644 include/linux/virtio_anchor.h

(limited to 'include')

diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 6a0ac00d5a42..4a154a084966 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -31,7 +31,6 @@
 #include <linux/cma.h>
 #include <linux/gfp.h>
 #include <linux/dma-direct.h>
-#include <linux/platform-feature.h>
 #include <asm/processor.h>
 #include <linux/uaccess.h>
 #include <asm/pgalloc.h>
@@ -48,6 +47,7 @@
 #include <asm/kasan.h>
 #include <asm/dma-mapping.h>
 #include <asm/uv.h>
+#include <linux/virtio_anchor.h>
 #include <linux/virtio_config.h>
 
 pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir");
@@ -175,7 +175,7 @@ static void pv_init(void)
 	if (!is_prot_virt_guest())
 		return;
 
-	platform_set(PLATFORM_VIRTIO_RESTRICTED_MEM_ACCESS);
+	virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
 
 	/* make sure bounce buffers are shared */
 	swiotlb_init(true, SWIOTLB_FORCE | SWIOTLB_VERBOSE);
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index f6d038e2cd8e..97452688f99f 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -20,8 +20,8 @@
 #include <linux/bitops.h>
 #include <linux/dma-mapping.h>
 #include <linux/virtio_config.h>
+#include <linux/virtio_anchor.h>
 #include <linux/cc_platform.h>
-#include <linux/platform-feature.h>
 
 #include <asm/tlbflush.h>
 #include <asm/fixmap.h>
@@ -245,7 +245,7 @@ void __init sev_setup_arch(void)
 	swiotlb_adjust_size(size);
 
 	/* Set restricted memory access for virtio. */
-	platform_set(PLATFORM_VIRTIO_RESTRICTED_MEM_ACCESS);
+	virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
 }
 
 static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot)
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index e1556d2a355a..56c77f63cd22 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -1,6 +1,10 @@
 # SPDX-License-Identifier: GPL-2.0-only
+config VIRTIO_ANCHOR
+	bool
+
 config VIRTIO
 	tristate
+	select VIRTIO_ANCHOR
 	help
 	  This option is selected by any driver which implements the virtio
 	  bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_RPMSG
diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
index 0a82d0873248..8e98d24917cc 100644
--- a/drivers/virtio/Makefile
+++ b/drivers/virtio/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_VIRTIO) += virtio.o virtio_ring.o
+obj-$(CONFIG_VIRTIO_ANCHOR) += virtio_anchor.o
 obj-$(CONFIG_VIRTIO_PCI_LIB) += virtio_pci_modern_dev.o
 obj-$(CONFIG_VIRTIO_PCI_LIB_LEGACY) += virtio_pci_legacy_dev.o
 obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index 7deeed30d1f3..14c142d77fba 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -2,10 +2,10 @@
 #include <linux/virtio.h>
 #include <linux/spinlock.h>
 #include <linux/virtio_config.h>
+#include <linux/virtio_anchor.h>
 #include <linux/module.h>
 #include <linux/idr.h>
 #include <linux/of.h>
-#include <linux/platform-feature.h>
 #include <uapi/linux/virtio_ids.h>
 
 /* Unique numbering for virtio devices. */
@@ -174,7 +174,7 @@ static int virtio_features_ok(struct virtio_device *dev)
 
 	might_sleep();
 
-	if (platform_has(PLATFORM_VIRTIO_RESTRICTED_MEM_ACCESS)) {
+	if (virtio_check_mem_acc_cb(dev)) {
 		if (!virtio_has_feature(dev, VIRTIO_F_VERSION_1)) {
 			dev_warn(&dev->dev,
 				 "device must provide VIRTIO_F_VERSION_1\n");
diff --git a/drivers/virtio/virtio_anchor.c b/drivers/virtio/virtio_anchor.c
new file mode 100644
index 000000000000..4d6a5d269b55
--- /dev/null
+++ b/drivers/virtio/virtio_anchor.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/virtio.h>
+#include <linux/virtio_anchor.h>
+
+bool virtio_require_restricted_mem_acc(struct virtio_device *dev)
+{
+	return true;
+}
+EXPORT_SYMBOL_GPL(virtio_require_restricted_mem_acc);
+
+static bool virtio_no_restricted_mem_acc(struct virtio_device *dev)
+{
+	return false;
+}
+
+bool (*virtio_check_mem_acc_cb)(struct virtio_device *dev) =
+	virtio_no_restricted_mem_acc;
+EXPORT_SYMBOL_GPL(virtio_check_mem_acc_cb);
diff --git a/include/linux/platform-feature.h b/include/linux/platform-feature.h
index b2f48be999fa..6ed859928b97 100644
--- a/include/linux/platform-feature.h
+++ b/include/linux/platform-feature.h
@@ -6,11 +6,7 @@
 #include <asm/platform-feature.h>
 
 /* The platform features are starting with the architecture specific ones. */
-
-/* Used to enable platform specific DMA handling for virtio devices. */
-#define PLATFORM_VIRTIO_RESTRICTED_MEM_ACCESS	(0 + PLATFORM_ARCH_FEAT_N)
-
-#define PLATFORM_FEAT_N				(1 + PLATFORM_ARCH_FEAT_N)
+#define PLATFORM_FEAT_N				(0 + PLATFORM_ARCH_FEAT_N)
 
 void platform_set(unsigned int feature);
 void platform_clear(unsigned int feature);
diff --git a/include/linux/virtio_anchor.h b/include/linux/virtio_anchor.h
new file mode 100644
index 000000000000..432e6c00b3ca
--- /dev/null
+++ b/include/linux/virtio_anchor.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_VIRTIO_ANCHOR_H
+#define _LINUX_VIRTIO_ANCHOR_H
+
+#ifdef CONFIG_VIRTIO_ANCHOR
+struct virtio_device;
+
+bool virtio_require_restricted_mem_acc(struct virtio_device *dev);
+extern bool (*virtio_check_mem_acc_cb)(struct virtio_device *dev);
+
+static inline void virtio_set_mem_acc_cb(bool (*func)(struct virtio_device *))
+{
+	virtio_check_mem_acc_cb = func;
+}
+#else
+#define virtio_set_mem_acc_cb(func) do { } while (0)
+#endif
+
+#endif /* _LINUX_VIRTIO_ANCHOR_H */
diff --git a/include/xen/xen.h b/include/xen/xen.h
index 0780a81e140d..ac5a144c6a65 100644
--- a/include/xen/xen.h
+++ b/include/xen/xen.h
@@ -52,12 +52,12 @@ bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
 extern u64 xen_saved_max_mem_size;
 #endif
 
-#include <linux/platform-feature.h>
+#include <linux/virtio_anchor.h>
 
 static inline void xen_set_restricted_virtio_memory_access(void)
 {
 	if (IS_ENABLED(CONFIG_XEN_VIRTIO) && xen_domain())
-		platform_set(PLATFORM_VIRTIO_RESTRICTED_MEM_ACCESS);
+		virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
 }
 
 #ifdef CONFIG_XEN_UNPOPULATED_ALLOC
-- 
cgit 


From a870544ca9d215449e91ebc01e35d80b23151c78 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Wed, 22 Jun 2022 08:38:37 +0200
Subject: kernel: remove platform_has() infrastructure

The only use case of the platform_has() infrastructure has been
removed again, so remove the whole feature.

Signed-off-by: Juergen Gross <jgross@suse.com>
Tested-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> # Arm64 guest using Xen
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Link: https://lore.kernel.org/r/20220622063838.8854-3-jgross@suse.com
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 MAINTAINERS                            |  8 --------
 include/asm-generic/Kbuild             |  1 -
 include/asm-generic/platform-feature.h |  8 --------
 include/linux/platform-feature.h       | 15 ---------------
 kernel/Makefile                        |  2 +-
 kernel/platform-feature.c              | 27 ---------------------------
 6 files changed, 1 insertion(+), 60 deletions(-)
 delete mode 100644 include/asm-generic/platform-feature.h
 delete mode 100644 include/linux/platform-feature.h
 delete mode 100644 kernel/platform-feature.c

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 64379c699903..c173a580ff77 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15953,14 +15953,6 @@ S:	Maintained
 F:	Documentation/devicetree/bindings/iio/chemical/plantower,pms7003.yaml
 F:	drivers/iio/chemical/pms7003.c
 
-PLATFORM FEATURE INFRASTRUCTURE
-M:	Juergen Gross <jgross@suse.com>
-S:	Maintained
-F:	arch/*/include/asm/platform-feature.h
-F:	include/asm-generic/platform-feature.h
-F:	include/linux/platform-feature.h
-F:	kernel/platform-feature.c
-
 PLDMFW LIBRARY
 M:	Jacob Keller <jacob.e.keller@intel.com>
 S:	Maintained
diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild
index 8e47d483b524..302506bbc2a4 100644
--- a/include/asm-generic/Kbuild
+++ b/include/asm-generic/Kbuild
@@ -44,7 +44,6 @@ mandatory-y += msi.h
 mandatory-y += pci.h
 mandatory-y += percpu.h
 mandatory-y += pgalloc.h
-mandatory-y += platform-feature.h
 mandatory-y += preempt.h
 mandatory-y += rwonce.h
 mandatory-y += sections.h
diff --git a/include/asm-generic/platform-feature.h b/include/asm-generic/platform-feature.h
deleted file mode 100644
index 4b0af3d51588..000000000000
--- a/include/asm-generic/platform-feature.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_GENERIC_PLATFORM_FEATURE_H
-#define _ASM_GENERIC_PLATFORM_FEATURE_H
-
-/* Number of arch specific feature flags. */
-#define PLATFORM_ARCH_FEAT_N	0
-
-#endif /* _ASM_GENERIC_PLATFORM_FEATURE_H */
diff --git a/include/linux/platform-feature.h b/include/linux/platform-feature.h
deleted file mode 100644
index 6ed859928b97..000000000000
--- a/include/linux/platform-feature.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _PLATFORM_FEATURE_H
-#define _PLATFORM_FEATURE_H
-
-#include <linux/bitops.h>
-#include <asm/platform-feature.h>
-
-/* The platform features are starting with the architecture specific ones. */
-#define PLATFORM_FEAT_N				(0 + PLATFORM_ARCH_FEAT_N)
-
-void platform_set(unsigned int feature);
-void platform_clear(unsigned int feature);
-bool platform_has(unsigned int feature);
-
-#endif /* _PLATFORM_FEATURE_H */
diff --git a/kernel/Makefile b/kernel/Makefile
index a7e1f49ab2b3..318789c728d3 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -7,7 +7,7 @@ obj-y     = fork.o exec_domain.o panic.o \
 	    cpu.o exit.o softirq.o resource.o \
 	    sysctl.o capability.o ptrace.o user.o \
 	    signal.o sys.o umh.o workqueue.o pid.o task_work.o \
-	    extable.o params.o platform-feature.o \
+	    extable.o params.o \
 	    kthread.o sys_ni.o nsproxy.o \
 	    notifier.o ksysfs.o cred.o reboot.o \
 	    async.o range.o smpboot.o ucount.o regset.o
diff --git a/kernel/platform-feature.c b/kernel/platform-feature.c
deleted file mode 100644
index cb6a6c3e4fed..000000000000
--- a/kernel/platform-feature.c
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/bitops.h>
-#include <linux/cache.h>
-#include <linux/export.h>
-#include <linux/platform-feature.h>
-
-#define PLATFORM_FEAT_ARRAY_SZ  BITS_TO_LONGS(PLATFORM_FEAT_N)
-static unsigned long __read_mostly platform_features[PLATFORM_FEAT_ARRAY_SZ];
-
-void platform_set(unsigned int feature)
-{
-	set_bit(feature, platform_features);
-}
-EXPORT_SYMBOL_GPL(platform_set);
-
-void platform_clear(unsigned int feature)
-{
-	clear_bit(feature, platform_features);
-}
-EXPORT_SYMBOL_GPL(platform_clear);
-
-bool platform_has(unsigned int feature)
-{
-	return test_bit(feature, platform_features);
-}
-EXPORT_SYMBOL_GPL(platform_has);
-- 
cgit 


From 251e90e7e346a23742b90e2c4db19d322e071d99 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Wed, 22 Jun 2022 08:38:38 +0200
Subject: xen: don't require virtio with grants for non-PV guests

Commit fa1f57421e0b ("xen/virtio: Enable restricted memory access using
Xen grant mappings") introduced a new requirement for using virtio
devices: the backend now needs to support the VIRTIO_F_ACCESS_PLATFORM
feature.

This is an undue requirement for non-PV guests, as those can be operated
with existing backends without any problem, as long as those backends
are running in dom0.

Per default allow virtio devices without grant support for non-PV
guests.

On Arm require VIRTIO_F_ACCESS_PLATFORM for devices having been listed
in the device tree to use grants.

Add a new config item to always force use of grants for virtio.

Fixes: fa1f57421e0b ("xen/virtio: Enable restricted memory access using Xen grant mappings")
Reported-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Tested-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> # Arm64 guest using Xen
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Link: https://lore.kernel.org/r/20220622063838.8854-4-jgross@suse.com
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 arch/arm/xen/enlighten.c     |  4 +++-
 arch/x86/xen/enlighten_hvm.c |  4 +++-
 arch/x86/xen/enlighten_pv.c  |  5 ++++-
 drivers/xen/Kconfig          |  9 +++++++++
 drivers/xen/grant-dma-ops.c  | 10 ++++++++++
 include/xen/xen-ops.h        |  9 +++++++++
 include/xen/xen.h            |  8 --------
 7 files changed, 38 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c
index 1f9c3ba32833..93c8ccbf2982 100644
--- a/arch/arm/xen/enlighten.c
+++ b/arch/arm/xen/enlighten.c
@@ -34,6 +34,7 @@
 #include <linux/timekeeping.h>
 #include <linux/timekeeper_internal.h>
 #include <linux/acpi.h>
+#include <linux/virtio_anchor.h>
 
 #include <linux/mm.h>
 
@@ -443,7 +444,8 @@ static int __init xen_guest_init(void)
 	if (!xen_domain())
 		return 0;
 
-	xen_set_restricted_virtio_memory_access();
+	if (IS_ENABLED(CONFIG_XEN_VIRTIO))
+		virtio_set_mem_acc_cb(xen_virtio_mem_acc);
 
 	if (!acpi_disabled)
 		xen_acpi_guest_init();
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index 8b71b1dd7639..28762f800596 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -4,6 +4,7 @@
 #include <linux/cpu.h>
 #include <linux/kexec.h>
 #include <linux/memblock.h>
+#include <linux/virtio_anchor.h>
 
 #include <xen/features.h>
 #include <xen/events.h>
@@ -195,7 +196,8 @@ static void __init xen_hvm_guest_init(void)
 	if (xen_pv_domain())
 		return;
 
-	xen_set_restricted_virtio_memory_access();
+	if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT))
+		virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
 
 	init_hvm_pv_info();
 
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 70fb2ea85e90..0ed2e487a693 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -31,6 +31,7 @@
 #include <linux/gfp.h>
 #include <linux/edd.h>
 #include <linux/reboot.h>
+#include <linux/virtio_anchor.h>
 
 #include <xen/xen.h>
 #include <xen/events.h>
@@ -109,7 +110,9 @@ static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
 
 static void __init xen_pv_init_platform(void)
 {
-	xen_set_restricted_virtio_memory_access();
+	/* PV guests can't operate virtio devices without grants. */
+	if (IS_ENABLED(CONFIG_XEN_VIRTIO))
+		virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
 
 	populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP));
 
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index bfd5f4f706bc..a65bd92121a5 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -355,4 +355,13 @@ config XEN_VIRTIO
 
 	  If in doubt, say n.
 
+config XEN_VIRTIO_FORCE_GRANT
+	bool "Require Xen virtio support to use grants"
+	depends on XEN_VIRTIO
+	help
+	  Require virtio for Xen guests to use grant mappings.
+	  This will avoid the need to give the backend the right to map all
+	  of the guest memory. This will need support on the backend side
+	  (e.g. qemu or kernel, depending on the virtio device types used).
+
 endmenu
diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c
index fc0142484001..8973fc1e9ccc 100644
--- a/drivers/xen/grant-dma-ops.c
+++ b/drivers/xen/grant-dma-ops.c
@@ -12,6 +12,8 @@
 #include <linux/of.h>
 #include <linux/pfn.h>
 #include <linux/xarray.h>
+#include <linux/virtio_anchor.h>
+#include <linux/virtio.h>
 #include <xen/xen.h>
 #include <xen/xen-ops.h>
 #include <xen/grant_table.h>
@@ -287,6 +289,14 @@ bool xen_is_grant_dma_device(struct device *dev)
 	return has_iommu;
 }
 
+bool xen_virtio_mem_acc(struct virtio_device *dev)
+{
+	if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT))
+		return true;
+
+	return xen_is_grant_dma_device(dev->dev.parent);
+}
+
 void xen_grant_setup_dma_ops(struct device *dev)
 {
 	struct xen_grant_dma_data *data;
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 80546960f8b7..dae0f350c678 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -5,6 +5,7 @@
 #include <linux/percpu.h>
 #include <linux/notifier.h>
 #include <linux/efi.h>
+#include <linux/virtio_anchor.h>
 #include <xen/features.h>
 #include <asm/xen/interface.h>
 #include <xen/interface/vcpu.h>
@@ -217,6 +218,7 @@ static inline void xen_preemptible_hcall_end(void) { }
 #ifdef CONFIG_XEN_GRANT_DMA_OPS
 void xen_grant_setup_dma_ops(struct device *dev);
 bool xen_is_grant_dma_device(struct device *dev);
+bool xen_virtio_mem_acc(struct virtio_device *dev);
 #else
 static inline void xen_grant_setup_dma_ops(struct device *dev)
 {
@@ -225,6 +227,13 @@ static inline bool xen_is_grant_dma_device(struct device *dev)
 {
 	return false;
 }
+
+struct virtio_device;
+
+static inline bool xen_virtio_mem_acc(struct virtio_device *dev)
+{
+	return false;
+}
 #endif /* CONFIG_XEN_GRANT_DMA_OPS */
 
 #endif /* INCLUDE_XEN_OPS_H */
diff --git a/include/xen/xen.h b/include/xen/xen.h
index ac5a144c6a65..a99bab817523 100644
--- a/include/xen/xen.h
+++ b/include/xen/xen.h
@@ -52,14 +52,6 @@ bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
 extern u64 xen_saved_max_mem_size;
 #endif
 
-#include <linux/virtio_anchor.h>
-
-static inline void xen_set_restricted_virtio_memory_access(void)
-{
-	if (IS_ENABLED(CONFIG_XEN_VIRTIO) && xen_domain())
-		virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
-}
-
 #ifdef CONFIG_XEN_UNPOPULATED_ALLOC
 int xen_alloc_unpopulated_pages(unsigned int nr_pages, struct page **pages);
 void xen_free_unpopulated_pages(unsigned int nr_pages, struct page **pages);
-- 
cgit 


From 36d4b36b69590fed99356a4426c940a253a93800 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 25 Jul 2022 09:39:17 -0700
Subject: lib/nodemask: inline next_node_in() and node_random()

The functions are pretty thin wrappers around find_bit engine, and
keeping them in c-file prevents compiler from small_const_nbits()
optimization, which must take place for all systems with MAX_NUMNODES
less than BITS_PER_LONG (default is 16 for me).

Moving them to header file doesn't blow up the kernel size:
add/remove: 1/2 grow/shrink: 9/5 up/down: 968/-88 (880)

CC: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Michael Ellerman <mpe@ellerman.id.au>
CC: Paul Mackerras <paulus@samba.org>
CC: Rasmus Villemoes <linux@rasmusvillemoes.dk>
CC: Stephen Rothwell <sfr@canb.auug.org.au>
CC: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 MAINTAINERS              |  1 -
 include/linux/nodemask.h | 24 +++++++++++++++++++-----
 lib/Makefile             |  2 +-
 lib/nodemask.c           |  8 --------
 4 files changed, 20 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 7c0b8f28aa25..19c8d0ef1177 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3540,7 +3540,6 @@ F:	lib/bitmap.c
 F:	lib/cpumask.c
 F:	lib/find_bit.c
 F:	lib/find_bit_benchmark.c
-F:	lib/nodemask.c
 F:	lib/test_bitmap.c
 F:	tools/include/linux/bitmap.h
 F:	tools/include/linux/find.h
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 0f233b76c9ce..4b71a96190a8 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -94,6 +94,7 @@
 #include <linux/bitmap.h>
 #include <linux/minmax.h>
 #include <linux/numa.h>
+#include <linux/random.h>
 
 typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
 extern nodemask_t _unused_nodemask_arg_;
@@ -276,7 +277,14 @@ static inline unsigned int __next_node(int n, const nodemask_t *srcp)
  * the first node in src if needed.  Returns MAX_NUMNODES if src is empty.
  */
 #define next_node_in(n, src) __next_node_in((n), &(src))
-unsigned int __next_node_in(int node, const nodemask_t *srcp);
+static inline unsigned int __next_node_in(int node, const nodemask_t *srcp)
+{
+	unsigned int ret = __next_node(node, srcp);
+
+	if (ret == MAX_NUMNODES)
+		ret = __first_node(srcp);
+	return ret;
+}
 
 static inline void init_nodemask_of_node(nodemask_t *mask, int node)
 {
@@ -493,14 +501,20 @@ static inline int num_node_state(enum node_states state)
 
 #endif
 
+static inline int node_random(const nodemask_t *maskp)
+{
 #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1)
-extern int node_random(const nodemask_t *maskp);
+	int w, bit = NUMA_NO_NODE;
+
+	w = nodes_weight(*maskp);
+	if (w)
+		bit = bitmap_ord_to_pos(maskp->bits,
+			get_random_int() % w, MAX_NUMNODES);
+	return bit;
 #else
-static inline int node_random(const nodemask_t *mask)
-{
 	return 0;
-}
 #endif
+}
 
 #define node_online_map 	node_states[N_ONLINE]
 #define node_possible_map 	node_states[N_POSSIBLE]
diff --git a/lib/Makefile b/lib/Makefile
index f99bf61f8bbc..731cea0342d1 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -33,7 +33,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 flex_proportions.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
 	 earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
-	 nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o \
+	 nmi_backtrace.o win_minmax.o memcat_p.o \
 	 buildid.o
 
 lib-$(CONFIG_PRINTK) += dump_stack.o
diff --git a/lib/nodemask.c b/lib/nodemask.c
index e22647f5181b..b8a433d16b51 100644
--- a/lib/nodemask.c
+++ b/lib/nodemask.c
@@ -3,14 +3,6 @@
 #include <linux/module.h>
 #include <linux/random.h>
 
-unsigned int __next_node_in(int node, const nodemask_t *srcp)
-{
-	unsigned int ret = __next_node(node, srcp);
-
-	if (ret == MAX_NUMNODES)
-		ret = __first_node(srcp);
-	return ret;
-}
 EXPORT_SYMBOL(__next_node_in);
 
 #ifdef CONFIG_NUMA
-- 
cgit 


From 2df91e397d85cd4c5206ab48d4e398e338db02d7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 29 Jul 2022 09:12:33 +0000
Subject: net: rose: add netdev ref tracker to 'struct rose_sock'

This will help debugging netdevice refcount problems with
CONFIG_NET_DEV_REFCNT_TRACKER=y

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Tested-by: Bernard Pidoux <f6bvp@free.fr>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/rose.h |  3 ++-
 net/rose/af_rose.c | 12 +++++++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/rose.h b/include/net/rose.h
index f192a64ddef2..23267b4efcfa 100644
--- a/include/net/rose.h
+++ b/include/net/rose.h
@@ -132,7 +132,8 @@ struct rose_sock {
 	ax25_address		source_digis[ROSE_MAX_DIGIS];
 	ax25_address		dest_digis[ROSE_MAX_DIGIS];
 	struct rose_neigh	*neighbour;
-	struct net_device		*device;
+	struct net_device	*device;
+	netdevice_tracker	dev_tracker;
 	unsigned int		lci, rand;
 	unsigned char		state, condition, qbitincl, defer;
 	unsigned char		cause, diagnostic;
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index a8e3ec800a9c..36fefc3957d7 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -192,7 +192,7 @@ static void rose_kill_by_device(struct net_device *dev)
 			rose_disconnect(s, ENETUNREACH, ROSE_OUT_OF_ORDER, 0);
 			if (rose->neighbour)
 				rose->neighbour->use--;
-			dev_put(rose->device);
+			netdev_put(rose->device, &rose->dev_tracker);
 			rose->device = NULL;
 		}
 	}
@@ -594,7 +594,7 @@ static struct sock *rose_make_new(struct sock *osk)
 	rose->defer	= orose->defer;
 	rose->device	= orose->device;
 	if (rose->device)
-		dev_hold(rose->device);
+		netdev_hold(rose->device, &rose->dev_tracker, GFP_ATOMIC);
 	rose->qbitincl	= orose->qbitincl;
 
 	return sk;
@@ -648,7 +648,7 @@ static int rose_release(struct socket *sock)
 		break;
 	}
 
-	dev_put(rose->device);
+	netdev_put(rose->device, &rose->dev_tracker);
 	sock->sk = NULL;
 	release_sock(sk);
 	sock_put(sk);
@@ -700,6 +700,7 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 
 	rose->source_addr   = addr->srose_addr;
 	rose->device        = dev;
+	netdev_tracker_alloc(rose->device, &rose->dev_tracker, GFP_KERNEL);
 	rose->source_ndigis = addr->srose_ndigis;
 
 	if (addr_len == sizeof(struct full_sockaddr_rose)) {
@@ -801,6 +802,8 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le
 		memcpy(&rose->source_addr, dev->dev_addr, ROSE_ADDR_LEN);
 		rose->source_call = user->call;
 		rose->device      = dev;
+		netdev_tracker_alloc(rose->device, &rose->dev_tracker,
+				     GFP_KERNEL);
 		ax25_uid_put(user);
 
 		rose_insert_socket(sk);		/* Finish the bind */
@@ -1024,6 +1027,9 @@ int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros
 		make_rose->source_digis[n] = facilities.source_digis[n];
 	make_rose->neighbour     = neigh;
 	make_rose->device        = dev;
+	/* Caller got a reference for us. */
+	netdev_tracker_alloc(make_rose->device, &make_rose->dev_tracker,
+			     GFP_ATOMIC);
 	make_rose->facilities    = facilities;
 
 	make_rose->neighbour->use++;
-- 
cgit 


From ef4f7e4bf1dc26aaa86cf8e5a13013684139be51 Mon Sep 17 00:00:00 2001
From: Dmitry Bogdanov <d.bogdanov@yadro.com>
Date: Thu, 28 Jul 2022 00:41:25 +0300
Subject: scsi: target: core: De-RCU of se_lun and se_lun acl

se_lun and se_lun_acl are immutable pointers of struct se_dev_entry.
Remove RCU usage for access to those pointers.

Link: https://lore.kernel.org/r/20220727214125.19647-3-d.bogdanov@yadro.com
Reviewed-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/target_core_alua.c   |  3 +--
 drivers/target/target_core_device.c | 29 +++++++++++------------------
 drivers/target/target_core_pr.c     | 28 ++++++++--------------------
 drivers/target/target_core_stat.c   | 10 ++++------
 drivers/target/target_core_xcopy.c  |  2 +-
 include/target/target_core_base.h   |  4 ++--
 6 files changed, 27 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/drivers/target/target_core_alua.c b/drivers/target/target_core_alua.c
index 58df0145e8d0..fb91423a4e2e 100644
--- a/drivers/target/target_core_alua.c
+++ b/drivers/target/target_core_alua.c
@@ -934,8 +934,7 @@ static void core_alua_queue_state_change_ua(struct t10_alua_tg_pt_gp *tg_pt_gp)
 
 		spin_lock(&lun->lun_deve_lock);
 		list_for_each_entry(se_deve, &lun->lun_deve_list, lun_link) {
-			lacl = rcu_dereference_check(se_deve->se_lun_acl,
-					lockdep_is_held(&lun->lun_deve_lock));
+			lacl = se_deve->se_lun_acl;
 
 			/*
 			 * spc4r37 p.242:
diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c
index 88a844ae1fd0..b7f16ee8aa0e 100644
--- a/drivers/target/target_core_device.c
+++ b/drivers/target/target_core_device.c
@@ -75,7 +75,7 @@ transport_lookup_cmd_lun(struct se_cmd *se_cmd)
 			return TCM_WRITE_PROTECTED;
 		}
 
-		se_lun = rcu_dereference(deve->se_lun);
+		se_lun = deve->se_lun;
 
 		if (!percpu_ref_tryget_live(&se_lun->lun_ref)) {
 			se_lun = NULL;
@@ -152,7 +152,7 @@ int transport_lookup_tmr_lun(struct se_cmd *se_cmd)
 	rcu_read_lock();
 	deve = target_nacl_find_deve(nacl, se_cmd->orig_fe_lun);
 	if (deve) {
-		se_lun = rcu_dereference(deve->se_lun);
+		se_lun = deve->se_lun;
 
 		if (!percpu_ref_tryget_live(&se_lun->lun_ref)) {
 			se_lun = NULL;
@@ -216,7 +216,7 @@ struct se_dev_entry *core_get_se_deve_from_rtpi(
 
 	rcu_read_lock();
 	hlist_for_each_entry_rcu(deve, &nacl->lun_entry_hlist, link) {
-		lun = rcu_dereference(deve->se_lun);
+		lun = deve->se_lun;
 		if (!lun) {
 			pr_err("%s device entries device pointer is"
 				" NULL, but Initiator has access.\n",
@@ -243,11 +243,8 @@ void core_free_device_list_for_node(
 	struct se_dev_entry *deve;
 
 	mutex_lock(&nacl->lun_entry_mutex);
-	hlist_for_each_entry_rcu(deve, &nacl->lun_entry_hlist, link) {
-		struct se_lun *lun = rcu_dereference_check(deve->se_lun,
-					lockdep_is_held(&nacl->lun_entry_mutex));
-		core_disable_device_list_for_node(lun, deve, nacl, tpg);
-	}
+	hlist_for_each_entry_rcu(deve, &nacl->lun_entry_hlist, link)
+		core_disable_device_list_for_node(deve->se_lun, deve, nacl, tpg);
 	mutex_unlock(&nacl->lun_entry_mutex);
 }
 
@@ -334,8 +331,7 @@ int core_enable_device_list_for_node(
 	mutex_lock(&nacl->lun_entry_mutex);
 	orig = target_nacl_find_deve(nacl, mapped_lun);
 	if (orig && orig->se_lun) {
-		struct se_lun *orig_lun = rcu_dereference_check(orig->se_lun,
-					lockdep_is_held(&nacl->lun_entry_mutex));
+		struct se_lun *orig_lun = orig->se_lun;
 
 		if (orig_lun != lun) {
 			pr_err("Existing orig->se_lun doesn't match new lun"
@@ -355,8 +351,8 @@ int core_enable_device_list_for_node(
 			return -EINVAL;
 		}
 
-		rcu_assign_pointer(new->se_lun, lun);
-		rcu_assign_pointer(new->se_lun_acl, lun_acl);
+		new->se_lun = lun;
+		new->se_lun_acl = lun_acl;
 		hlist_del_rcu(&orig->link);
 		hlist_add_head_rcu(&new->link, &nacl->lun_entry_hlist);
 		mutex_unlock(&nacl->lun_entry_mutex);
@@ -374,8 +370,8 @@ int core_enable_device_list_for_node(
 		return 0;
 	}
 
-	rcu_assign_pointer(new->se_lun, lun);
-	rcu_assign_pointer(new->se_lun_acl, lun_acl);
+	new->se_lun = lun;
+	new->se_lun_acl = lun_acl;
 	hlist_add_head_rcu(&new->link, &nacl->lun_entry_hlist);
 	mutex_unlock(&nacl->lun_entry_mutex);
 
@@ -454,10 +450,7 @@ void core_clear_lun_from_tpg(struct se_lun *lun, struct se_portal_group *tpg)
 
 		mutex_lock(&nacl->lun_entry_mutex);
 		hlist_for_each_entry_rcu(deve, &nacl->lun_entry_hlist, link) {
-			struct se_lun *tmp_lun = rcu_dereference_check(deve->se_lun,
-					lockdep_is_held(&nacl->lun_entry_mutex));
-
-			if (lun != tmp_lun)
+			if (lun != deve->se_lun)
 				continue;
 
 			core_disable_device_list_for_node(lun, deve, nacl, tpg);
diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c
index 3829b61b56c1..a1d67554709f 100644
--- a/drivers/target/target_core_pr.c
+++ b/drivers/target/target_core_pr.c
@@ -739,8 +739,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration(
 			if (!deve_tmp->se_lun_acl)
 				continue;
 
-			lacl_tmp = rcu_dereference_check(deve_tmp->se_lun_acl,
-						lockdep_is_held(&lun_tmp->lun_deve_lock));
+			lacl_tmp = deve_tmp->se_lun_acl;
 			nacl_tmp = lacl_tmp->se_lun_nacl;
 			/*
 			 * Skip the matching struct se_node_acl that is allocated
@@ -784,8 +783,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration(
 			 * the original *pr_reg is processed in
 			 * __core_scsi3_add_registration()
 			 */
-			dest_lun = rcu_dereference_check(deve_tmp->se_lun,
-				kref_read(&deve_tmp->pr_kref) != 0);
+			dest_lun = deve_tmp->se_lun;
 
 			pr_reg_atp = __core_scsi3_do_alloc_registration(dev,
 						nacl_tmp, dest_lun, deve_tmp,
@@ -1437,34 +1435,26 @@ static void core_scsi3_nodeacl_undepend_item(struct se_node_acl *nacl)
 
 static int core_scsi3_lunacl_depend_item(struct se_dev_entry *se_deve)
 {
-	struct se_lun_acl *lun_acl;
-
 	/*
 	 * For nacl->dynamic_node_acl=1
 	 */
-	lun_acl = rcu_dereference_check(se_deve->se_lun_acl,
-				kref_read(&se_deve->pr_kref) != 0);
-	if (!lun_acl)
+	if (!se_deve->se_lun_acl)
 		return 0;
 
-	return target_depend_item(&lun_acl->se_lun_group.cg_item);
+	return target_depend_item(&se_deve->se_lun_acl->se_lun_group.cg_item);
 }
 
 static void core_scsi3_lunacl_undepend_item(struct se_dev_entry *se_deve)
 {
-	struct se_lun_acl *lun_acl;
-
 	/*
 	 * For nacl->dynamic_node_acl=1
 	 */
-	lun_acl = rcu_dereference_check(se_deve->se_lun_acl,
-				kref_read(&se_deve->pr_kref) != 0);
-	if (!lun_acl) {
+	if (!se_deve->se_lun_acl) {
 		kref_put(&se_deve->pr_kref, target_pr_kref_release);
 		return;
 	}
 
-	target_undepend_item(&lun_acl->se_lun_group.cg_item);
+	target_undepend_item(&se_deve->se_lun_acl->se_lun_group.cg_item);
 	kref_put(&se_deve->pr_kref, target_pr_kref_release);
 }
 
@@ -1751,8 +1741,7 @@ core_scsi3_decode_spec_i_port(
 		 * and then call __core_scsi3_add_registration() in the
 		 * 2nd loop which will never fail.
 		 */
-		dest_lun = rcu_dereference_check(dest_se_deve->se_lun,
-				kref_read(&dest_se_deve->pr_kref) != 0);
+		dest_lun = dest_se_deve->se_lun;
 
 		dest_pr_reg = __core_scsi3_alloc_registration(cmd->se_dev,
 					dest_node_acl, dest_lun, dest_se_deve,
@@ -3446,8 +3435,7 @@ after_iport_check:
 	dest_pr_reg = __core_scsi3_locate_pr_reg(dev, dest_node_acl,
 					iport_ptr);
 	if (!dest_pr_reg) {
-		struct se_lun *dest_lun = rcu_dereference_check(dest_se_deve->se_lun,
-				kref_read(&dest_se_deve->pr_kref) != 0);
+		struct se_lun *dest_lun = dest_se_deve->se_lun;
 
 		spin_unlock(&dev->dev_reservation_lock);
 		if (core_scsi3_alloc_registration(cmd->se_dev, dest_node_acl,
diff --git a/drivers/target/target_core_stat.c b/drivers/target/target_core_stat.c
index 62d15bcc3d93..f85ee5b0fd80 100644
--- a/drivers/target/target_core_stat.c
+++ b/drivers/target/target_core_stat.c
@@ -877,7 +877,6 @@ static ssize_t target_stat_auth_dev_show(struct config_item *item,
 	struct se_lun_acl *lacl = auth_to_lacl(item);
 	struct se_node_acl *nacl = lacl->se_lun_nacl;
 	struct se_dev_entry *deve;
-	struct se_lun *lun;
 	ssize_t ret;
 
 	rcu_read_lock();
@@ -886,9 +885,9 @@ static ssize_t target_stat_auth_dev_show(struct config_item *item,
 		rcu_read_unlock();
 		return -ENODEV;
 	}
-	lun = rcu_dereference(deve->se_lun);
+
 	/* scsiDeviceIndex */
-	ret = snprintf(page, PAGE_SIZE, "%u\n", lun->lun_index);
+	ret = snprintf(page, PAGE_SIZE, "%u\n", deve->se_lun->lun_index);
 	rcu_read_unlock();
 	return ret;
 }
@@ -1217,7 +1216,6 @@ static ssize_t target_stat_iport_dev_show(struct config_item *item,
 	struct se_lun_acl *lacl = iport_to_lacl(item);
 	struct se_node_acl *nacl = lacl->se_lun_nacl;
 	struct se_dev_entry *deve;
-	struct se_lun *lun;
 	ssize_t ret;
 
 	rcu_read_lock();
@@ -1226,9 +1224,9 @@ static ssize_t target_stat_iport_dev_show(struct config_item *item,
 		rcu_read_unlock();
 		return -ENODEV;
 	}
-	lun = rcu_dereference(deve->se_lun);
+
 	/* scsiDeviceIndex */
-	ret = snprintf(page, PAGE_SIZE, "%u\n", lun->lun_index);
+	ret = snprintf(page, PAGE_SIZE, "%u\n", deve->se_lun->lun_index);
 	rcu_read_unlock();
 	return ret;
 }
diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c
index 6bb20aa9c5bc..8713cda0c2fb 100644
--- a/drivers/target/target_core_xcopy.c
+++ b/drivers/target/target_core_xcopy.c
@@ -88,7 +88,7 @@ static int target_xcopy_locate_se_dev_e4(struct se_session *sess,
 		struct se_device *this_dev;
 		int rc;
 
-		this_lun = rcu_dereference(deve->se_lun);
+		this_lun = deve->se_lun;
 		this_dev = rcu_dereference_raw(this_lun->lun_se_dev);
 
 		rc = target_xcopy_locate_se_dev_e4_iter(this_dev, dev_wwn);
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index c2b36f7d917d..8c920456edd9 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -665,9 +665,9 @@ struct se_dev_entry {
 	/* Used for PR SPEC_I_PT=1 and REGISTER_AND_MOVE */
 	struct kref		pr_kref;
 	struct completion	pr_comp;
-	struct se_lun_acl __rcu	*se_lun_acl;
+	struct se_lun_acl	*se_lun_acl;
 	spinlock_t		ua_lock;
-	struct se_lun __rcu	*se_lun;
+	struct se_lun		*se_lun;
 #define DEF_PR_REG_ACTIVE		1
 	unsigned long		deve_flags;
 	struct list_head	alua_port_list;
-- 
cgit 


From fe442604199ed3e60d5411137159f9623534e956 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 28 Jul 2022 15:18:48 -0700
Subject: scsi: core: Make sure that targets outlive devices

This commit prevents that the following sequence triggers a kernel crash:

 - Deletion of a SCSI device is requested via sysfs. Device removal takes
   some time because blk_cleanup_queue() is waiting for the SCSI error
   handler.

 - The SCSI target associated with that SCSI device is removed.

 - scsi_remove_target() returns and its caller frees the resources
   associated with the SCSI target.

 - The error handler makes progress and invokes an LLD callback that
   dereferences the SCSI target pointer.

Link: https://lore.kernel.org/r/20220728221851.1822295-2-bvanassche@acm.org
Cc: Christoph Hellwig <hch@lst.de>
Cc: Mike Christie <michael.christie@oracle.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: John Garry <john.garry@huawei.com>
Cc: Li Zhijian <lizhijian@fujitsu.com>
Reported-by: Mike Christie <michael.christie@oracle.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/scsi_scan.c   |  2 ++
 drivers/scsi/scsi_sysfs.c  | 20 +++++++++++++++++---
 include/scsi/scsi_device.h |  2 ++
 3 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 91ac901a6682..4c1efd6a3b0c 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -521,6 +521,8 @@ static struct scsi_target *scsi_alloc_target(struct device *parent,
 	starget->state = STARGET_CREATED;
 	starget->scsi_level = SCSI_2;
 	starget->max_target_blocked = SCSI_DEFAULT_TARGET_BLOCKED;
+	init_waitqueue_head(&starget->sdev_wq);
+
  retry:
 	spin_lock_irqsave(shost->host_lock, flags);
 
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 43949798a2e4..1bc9c26fe1d4 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -443,7 +443,9 @@ static void scsi_device_cls_release(struct device *class_dev)
 
 static void scsi_device_dev_release_usercontext(struct work_struct *work)
 {
-	struct scsi_device *sdev;
+	struct scsi_device *sdev = container_of(work, struct scsi_device,
+						ew.work);
+	struct scsi_target *starget = sdev->sdev_target;
 	struct device *parent;
 	struct list_head *this, *tmp;
 	struct scsi_vpd *vpd_pg80 = NULL, *vpd_pg83 = NULL;
@@ -452,8 +454,6 @@ static void scsi_device_dev_release_usercontext(struct work_struct *work)
 	unsigned long flags;
 	struct module *mod;
 
-	sdev = container_of(work, struct scsi_device, ew.work);
-
 	mod = sdev->host->hostt->module;
 
 	scsi_dh_release_device(sdev);
@@ -516,6 +516,9 @@ static void scsi_device_dev_release_usercontext(struct work_struct *work)
 	kfree(sdev->inquiry);
 	kfree(sdev);
 
+	if (starget && atomic_dec_return(&starget->sdev_count) == 0)
+		wake_up(&starget->sdev_wq);
+
 	if (parent)
 		put_device(parent);
 	module_put(mod);
@@ -1535,6 +1538,14 @@ static void __scsi_remove_target(struct scsi_target *starget)
 		goto restart;
 	}
 	spin_unlock_irqrestore(shost->host_lock, flags);
+
+	/*
+	 * After scsi_remove_target() returns its caller can remove resources
+	 * associated with @starget, e.g. an rport or session. Wait until all
+	 * devices associated with @starget have been removed to prevent that
+	 * a SCSI error handling callback function triggers a use-after-free.
+	 */
+	wait_event(starget->sdev_wq, atomic_read(&starget->sdev_count) == 0);
 }
 
 /**
@@ -1645,6 +1656,9 @@ void scsi_sysfs_device_initialize(struct scsi_device *sdev)
 	list_add_tail(&sdev->same_target_siblings, &starget->devices);
 	list_add_tail(&sdev->siblings, &shost->__devices);
 	spin_unlock_irqrestore(shost->host_lock, flags);
+
+	atomic_inc(&starget->sdev_count);
+
 	/*
 	 * device can now only be removed via __scsi_remove_device() so hold
 	 * the target.  Target will be held in CREATED state until something
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index 7cf5f3b7589f..190d2081f4c6 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -309,6 +309,8 @@ struct scsi_target {
 	struct list_head	devices;
 	struct device		dev;
 	struct kref		reap_ref; /* last put renders target invisible */
+	atomic_t		sdev_count;
+	wait_queue_head_t	sdev_wq;
 	unsigned int		channel;
 	unsigned int		id; /* target id ... replace
 				     * scsi_device.id eventually */
-- 
cgit 


From 16728aaba62e8b3b170735fdc3d8aa972835c136 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Thu, 28 Jul 2022 15:18:49 -0700
Subject: scsi: core: Make sure that hosts outlive targets

Fix the race conditions between SCSI LLD kernel module unloading and SCSI
device and target removal by making sure that SCSI hosts are destroyed
after all associated target and device objects have been freed.

Link: https://lore.kernel.org/r/20220728221851.1822295-3-bvanassche@acm.org
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Mike Christie <michael.christie@oracle.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: John Garry <john.garry@huawei.com>
Reviewed-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
[ bvanassche: Reworked Ming's patch and split it ]
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/hosts.c     | 8 ++++++++
 drivers/scsi/scsi_scan.c | 7 +++++++
 include/scsi/scsi_host.h | 3 +++
 3 files changed, 18 insertions(+)

(limited to 'include')

diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index ef6c0e37acce..8fa98c8d0ee0 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -190,6 +190,13 @@ void scsi_remove_host(struct Scsi_Host *shost)
 	transport_unregister_device(&shost->shost_gendev);
 	device_unregister(&shost->shost_dev);
 	device_del(&shost->shost_gendev);
+
+	/*
+	 * After scsi_remove_host() has returned the scsi LLD module can be
+	 * unloaded and/or the host resources can be released. Hence wait until
+	 * the dependent SCSI targets and devices are gone before returning.
+	 */
+	wait_event(shost->targets_wq, atomic_read(&shost->target_count) == 0);
 }
 EXPORT_SYMBOL(scsi_remove_host);
 
@@ -394,6 +401,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
 	INIT_LIST_HEAD(&shost->starved_list);
 	init_waitqueue_head(&shost->host_wait);
 	mutex_init(&shost->scan_mutex);
+	init_waitqueue_head(&shost->targets_wq);
 
 	index = ida_alloc(&host_index_ida, GFP_KERNEL);
 	if (index < 0) {
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 4c1efd6a3b0c..ac6059702d13 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -406,9 +406,14 @@ static void scsi_target_destroy(struct scsi_target *starget)
 static void scsi_target_dev_release(struct device *dev)
 {
 	struct device *parent = dev->parent;
+	struct Scsi_Host *shost = dev_to_shost(parent);
 	struct scsi_target *starget = to_scsi_target(dev);
 
 	kfree(starget);
+
+	if (atomic_dec_return(&shost->target_count) == 0)
+		wake_up(&shost->targets_wq);
+
 	put_device(parent);
 }
 
@@ -523,6 +528,8 @@ static struct scsi_target *scsi_alloc_target(struct device *parent,
 	starget->max_target_blocked = SCSI_DEFAULT_TARGET_BLOCKED;
 	init_waitqueue_head(&starget->sdev_wq);
 
+	atomic_inc(&shost->target_count);
+
  retry:
 	spin_lock_irqsave(shost->host_lock, flags);
 
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 667d889b92b5..339f975d356e 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -689,6 +689,9 @@ struct Scsi_Host {
 	/* ldm bits */
 	struct device		shost_gendev, shost_dev;
 
+	atomic_t		target_count;
+	wait_queue_head_t	targets_wq;
+
 	/*
 	 * Points to the transport data (if any) which is allocated
 	 * separately
-- 
cgit 


From 8eaa1d110800fac050bab44001732747a1c39894 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Fri, 29 Jul 2022 15:13:56 +0300
Subject: net/mlx5e: xsk: Discard unaligned XSK frames on striding RQ

Striding RQ uses MTT page mapping, where each page corresponds to an XSK
frame. MTT pages have alignment requirements, and XSK frames don't have
any alignment guarantees in the unaligned mode. Frames with improper
alignment must be discarded, otherwise the packet data will be written
at a wrong address.

Fixes: 282c0c798f8e ("net/mlx5e: Allow XSK frames smaller than a page")
Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/r/20220729121356.3990867-1-maximmi@nvidia.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h | 14 ++++++++++++++
 include/net/xdp_sock_drv.h                          | 11 +++++++++++
 2 files changed, 25 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
index a8cfab4a393c..cc18d97d8ee0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
@@ -7,6 +7,8 @@
 #include "en.h"
 #include <net/xdp_sock_drv.h>
 
+#define MLX5E_MTT_PTAG_MASK 0xfffffffffffffff8ULL
+
 /* RX data path */
 
 struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
@@ -21,6 +23,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
 static inline int mlx5e_xsk_page_alloc_pool(struct mlx5e_rq *rq,
 					    struct mlx5e_dma_info *dma_info)
 {
+retry:
 	dma_info->xsk = xsk_buff_alloc(rq->xsk_pool);
 	if (!dma_info->xsk)
 		return -ENOMEM;
@@ -32,6 +35,17 @@ static inline int mlx5e_xsk_page_alloc_pool(struct mlx5e_rq *rq,
 	 */
 	dma_info->addr = xsk_buff_xdp_get_frame_dma(dma_info->xsk);
 
+	/* MTT page mapping has alignment requirements. If they are not
+	 * satisfied, leak the descriptor so that it won't come again, and try
+	 * to allocate a new one.
+	 */
+	if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
+		if (unlikely(dma_info->addr & ~MLX5E_MTT_PTAG_MASK)) {
+			xsk_buff_discard(dma_info->xsk);
+			goto retry;
+		}
+	}
+
 	return 0;
 }
 
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index 4aa031849668..0774ce97c2f1 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -95,6 +95,13 @@ static inline void xsk_buff_free(struct xdp_buff *xdp)
 	xp_free(xskb);
 }
 
+static inline void xsk_buff_discard(struct xdp_buff *xdp)
+{
+	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+
+	xp_release(xskb);
+}
+
 static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
 {
 	xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM;
@@ -238,6 +245,10 @@ static inline void xsk_buff_free(struct xdp_buff *xdp)
 {
 }
 
+static inline void xsk_buff_discard(struct xdp_buff *xdp)
+{
+}
+
 static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
 {
 }
-- 
cgit 


From 68f2736a858324c3ec852f6c2cddd9d1c777357d Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 7 Jun 2022 15:38:48 -0400
Subject: mm: Convert all PageMovable users to movable_operations

These drivers are rather uncomfortably hammered into the
address_space_operations hole.  They aren't filesystems and don't behave
like filesystems.  They just need their own movable_operations structure,
which we can point to directly from page->mapping.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 Documentation/filesystems/locking.rst |   4 --
 Documentation/filesystems/vfs.rst     |  12 ----
 Documentation/vm/page_migration.rst   | 113 +++-------------------------------
 arch/powerpc/platforms/pseries/cmm.c  |  60 +-----------------
 drivers/misc/vmw_balloon.c            |  61 +-----------------
 drivers/virtio/virtio_balloon.c       |  47 +-------------
 include/linux/balloon_compaction.h    |   6 +-
 include/linux/fs.h                    |   2 -
 include/linux/migrate.h               |  56 +++++++++++++++--
 include/linux/page-flags.h            |   2 +-
 include/uapi/linux/magic.h            |   4 --
 mm/balloon_compaction.c               |  10 ++-
 mm/compaction.c                       |  29 ++++-----
 mm/migrate.c                          |  24 ++++----
 mm/util.c                             |   4 +-
 mm/z3fold.c                           |  84 +++----------------------
 mm/zsmalloc.c                         | 102 +++++++-----------------------
 17 files changed, 134 insertions(+), 486 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index c0fe711f14d3..9963d9600b71 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -252,9 +252,7 @@ prototypes::
 	bool (*release_folio)(struct folio *, gfp_t);
 	void (*free_folio)(struct folio *);
 	int (*direct_IO)(struct kiocb *, struct iov_iter *iter);
-	bool (*isolate_page) (struct page *, isolate_mode_t);
 	int (*migratepage)(struct address_space *, struct page *, struct page *);
-	void (*putback_page) (struct page *);
 	int (*launder_folio)(struct folio *);
 	bool (*is_partially_uptodate)(struct folio *, size_t from, size_t count);
 	int (*error_remove_page)(struct address_space *, struct page *);
@@ -280,9 +278,7 @@ invalidate_folio:	yes					exclusive
 release_folio:		yes
 free_folio:		yes
 direct_IO:
-isolate_page:		yes
 migratepage:		yes (both)
-putback_page:		yes
 launder_folio:		yes
 is_partially_uptodate:	yes
 error_remove_page:	yes
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index a08c652467d7..b51665cdabc4 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -737,12 +737,8 @@ cache in your filesystem.  The following members are defined:
 		bool (*release_folio)(struct folio *, gfp_t);
 		void (*free_folio)(struct folio *);
 		ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
-		/* isolate a page for migration */
-		bool (*isolate_page) (struct page *, isolate_mode_t);
 		/* migrate the contents of a page to the specified target */
 		int (*migratepage) (struct page *, struct page *);
-		/* put migration-failed page back to right list */
-		void (*putback_page) (struct page *);
 		int (*launder_folio) (struct folio *);
 
 		bool (*is_partially_uptodate) (struct folio *, size_t from,
@@ -930,11 +926,6 @@ cache in your filesystem.  The following members are defined:
 	data directly between the storage and the application's address
 	space.
 
-``isolate_page``
-	Called by the VM when isolating a movable non-lru page.  If page
-	is successfully isolated, VM marks the page as PG_isolated via
-	__SetPageIsolated.
-
 ``migrate_page``
 	This is used to compact the physical memory usage.  If the VM
 	wants to relocate a page (maybe off a memory card that is
@@ -942,9 +933,6 @@ cache in your filesystem.  The following members are defined:
 	page to this function.  migrate_page should transfer any private
 	data across and update any references that it has to the page.
 
-``putback_page``
-	Called by the VM when isolated page's migration fails.
-
 ``launder_folio``
 	Called before freeing a folio - it writes back the dirty folio.
 	To prevent redirtying the folio, it is kept locked during the
diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst
index 8c5cb8147e55..11493bad7112 100644
--- a/Documentation/vm/page_migration.rst
+++ b/Documentation/vm/page_migration.rst
@@ -152,110 +152,15 @@ Steps:
 Non-LRU page migration
 ======================
 
-Although migration originally aimed for reducing the latency of memory accesses
-for NUMA, compaction also uses migration to create high-order pages.
+Although migration originally aimed for reducing the latency of memory
+accesses for NUMA, compaction also uses migration to create high-order
+pages.  For compaction purposes, it is also useful to be able to move
+non-LRU pages, such as zsmalloc and virtio-balloon pages.
 
-Current problem of the implementation is that it is designed to migrate only
-*LRU* pages. However, there are potential non-LRU pages which can be migrated
-in drivers, for example, zsmalloc, virtio-balloon pages.
-
-For virtio-balloon pages, some parts of migration code path have been hooked
-up and added virtio-balloon specific functions to intercept migration logics.
-It's too specific to a driver so other drivers who want to make their pages
-movable would have to add their own specific hooks in the migration path.
-
-To overcome the problem, VM supports non-LRU page migration which provides
-generic functions for non-LRU movable pages without driver specific hooks
-in the migration path.
-
-If a driver wants to make its pages movable, it should define three functions
-which are function pointers of struct address_space_operations.
-
-1. ``bool (*isolate_page) (struct page *page, isolate_mode_t mode);``
-
-   What VM expects from isolate_page() function of driver is to return *true*
-   if driver isolates the page successfully. On returning true, VM marks the page
-   as PG_isolated so concurrent isolation in several CPUs skip the page
-   for isolation. If a driver cannot isolate the page, it should return *false*.
-
-   Once page is successfully isolated, VM uses page.lru fields so driver
-   shouldn't expect to preserve values in those fields.
-
-2. ``int (*migratepage) (struct address_space *mapping,``
-|	``struct page *newpage, struct page *oldpage, enum migrate_mode);``
-
-   After isolation, VM calls migratepage() of driver with the isolated page.
-   The function of migratepage() is to move the contents of the old page to the
-   new page
-   and set up fields of struct page newpage. Keep in mind that you should
-   indicate to the VM the oldpage is no longer movable via __ClearPageMovable()
-   under page_lock if you migrated the oldpage successfully and returned
-   MIGRATEPAGE_SUCCESS. If driver cannot migrate the page at the moment, driver
-   can return -EAGAIN. On -EAGAIN, VM will retry page migration in a short time
-   because VM interprets -EAGAIN as "temporary migration failure". On returning
-   any error except -EAGAIN, VM will give up the page migration without
-   retrying.
-
-   Driver shouldn't touch the page.lru field while in the migratepage() function.
-
-3. ``void (*putback_page)(struct page *);``
-
-   If migration fails on the isolated page, VM should return the isolated page
-   to the driver so VM calls the driver's putback_page() with the isolated page.
-   In this function, the driver should put the isolated page back into its own data
-   structure.
-
-Non-LRU movable page flags
-
-   There are two page flags for supporting non-LRU movable page.
-
-   * PG_movable
-
-     Driver should use the function below to make page movable under page_lock::
-
-	void __SetPageMovable(struct page *page, struct address_space *mapping)
-
-     It needs argument of address_space for registering migration
-     family functions which will be called by VM. Exactly speaking,
-     PG_movable is not a real flag of struct page. Rather, VM
-     reuses the page->mapping's lower bits to represent it::
-
-	#define PAGE_MAPPING_MOVABLE 0x2
-	page->mapping = page->mapping | PAGE_MAPPING_MOVABLE;
-
-     so driver shouldn't access page->mapping directly. Instead, driver should
-     use page_mapping() which masks off the low two bits of page->mapping under
-     page lock so it can get the right struct address_space.
-
-     For testing of non-LRU movable pages, VM supports __PageMovable() function.
-     However, it doesn't guarantee to identify non-LRU movable pages because
-     the page->mapping field is unified with other variables in struct page.
-     If the driver releases the page after isolation by VM, page->mapping
-     doesn't have a stable value although it has PAGE_MAPPING_MOVABLE set
-     (look at __ClearPageMovable). But __PageMovable() is cheap to call whether
-     page is LRU or non-LRU movable once the page has been isolated because LRU
-     pages can never have PAGE_MAPPING_MOVABLE set in page->mapping. It is also
-     good for just peeking to test non-LRU movable pages before more expensive
-     checking with lock_page() in pfn scanning to select a victim.
-
-     For guaranteeing non-LRU movable page, VM provides PageMovable() function.
-     Unlike __PageMovable(), PageMovable() validates page->mapping and
-     mapping->a_ops->isolate_page under lock_page(). The lock_page() prevents
-     sudden destroying of page->mapping.
-
-     Drivers using __SetPageMovable() should clear the flag via
-     __ClearMovablePage() under page_lock() before the releasing the page.
-
-   * PG_isolated
-
-     To prevent concurrent isolation among several CPUs, VM marks isolated page
-     as PG_isolated under lock_page(). So if a CPU encounters PG_isolated
-     non-LRU movable page, it can skip it. Driver doesn't need to manipulate the
-     flag because VM will set/clear it automatically. Keep in mind that if the
-     driver sees a PG_isolated page, it means the page has been isolated by the
-     VM so it shouldn't touch the page.lru field.
-     The PG_isolated flag is aliased with the PG_reclaim flag so drivers
-     shouldn't use PG_isolated for its own purposes.
+If a driver wants to make its pages movable, it should define a struct
+movable_operations.  It then needs to call __SetPageMovable() on each
+page that it may be able to move.  This uses the ``page->mapping`` field,
+so this field is not available for the driver to use for other purposes.
 
 Monitoring Migration
 =====================
@@ -286,3 +191,5 @@ THP_MIGRATION_FAIL and PGMIGRATE_FAIL to increase.
 
 Christoph Lameter, May 8, 2006.
 Minchan Kim, Mar 28, 2016.
+
+.. kernel-doc:: include/linux/migrate.h
diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
index 15ed8206c463..5f4037c1d7fe 100644
--- a/arch/powerpc/platforms/pseries/cmm.c
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -19,9 +19,6 @@
 #include <linux/stringify.h>
 #include <linux/swap.h>
 #include <linux/device.h>
-#include <linux/mount.h>
-#include <linux/pseudo_fs.h>
-#include <linux/magic.h>
 #include <linux/balloon_compaction.h>
 #include <asm/firmware.h>
 #include <asm/hvcall.h>
@@ -500,19 +497,6 @@ static struct notifier_block cmm_mem_nb = {
 };
 
 #ifdef CONFIG_BALLOON_COMPACTION
-static struct vfsmount *balloon_mnt;
-
-static int cmm_init_fs_context(struct fs_context *fc)
-{
-	return init_pseudo(fc, PPC_CMM_MAGIC) ? 0 : -ENOMEM;
-}
-
-static struct file_system_type balloon_fs = {
-	.name = "ppc-cmm",
-	.init_fs_context = cmm_init_fs_context,
-	.kill_sb = kill_anon_super,
-};
-
 static int cmm_migratepage(struct balloon_dev_info *b_dev_info,
 			   struct page *newpage, struct page *page,
 			   enum migrate_mode mode)
@@ -564,47 +548,13 @@ static int cmm_migratepage(struct balloon_dev_info *b_dev_info,
 	return MIGRATEPAGE_SUCCESS;
 }
 
-static int cmm_balloon_compaction_init(void)
+static void cmm_balloon_compaction_init(void)
 {
-	int rc;
-
 	balloon_devinfo_init(&b_dev_info);
 	b_dev_info.migratepage = cmm_migratepage;
-
-	balloon_mnt = kern_mount(&balloon_fs);
-	if (IS_ERR(balloon_mnt)) {
-		rc = PTR_ERR(balloon_mnt);
-		balloon_mnt = NULL;
-		return rc;
-	}
-
-	b_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb);
-	if (IS_ERR(b_dev_info.inode)) {
-		rc = PTR_ERR(b_dev_info.inode);
-		b_dev_info.inode = NULL;
-		kern_unmount(balloon_mnt);
-		balloon_mnt = NULL;
-		return rc;
-	}
-
-	b_dev_info.inode->i_mapping->a_ops = &balloon_aops;
-	return 0;
-}
-static void cmm_balloon_compaction_deinit(void)
-{
-	if (b_dev_info.inode)
-		iput(b_dev_info.inode);
-	b_dev_info.inode = NULL;
-	kern_unmount(balloon_mnt);
-	balloon_mnt = NULL;
 }
 #else /* CONFIG_BALLOON_COMPACTION */
-static int cmm_balloon_compaction_init(void)
-{
-	return 0;
-}
-
-static void cmm_balloon_compaction_deinit(void)
+static void cmm_balloon_compaction_init(void)
 {
 }
 #endif /* CONFIG_BALLOON_COMPACTION */
@@ -622,9 +572,7 @@ static int cmm_init(void)
 	if (!firmware_has_feature(FW_FEATURE_CMO) && !simulate)
 		return -EOPNOTSUPP;
 
-	rc = cmm_balloon_compaction_init();
-	if (rc)
-		return rc;
+	cmm_balloon_compaction_init();
 
 	rc = register_oom_notifier(&cmm_oom_nb);
 	if (rc < 0)
@@ -658,7 +606,6 @@ out_reboot_notifier:
 out_oom_notifier:
 	unregister_oom_notifier(&cmm_oom_nb);
 out_balloon_compaction:
-	cmm_balloon_compaction_deinit();
 	return rc;
 }
 
@@ -677,7 +624,6 @@ static void cmm_exit(void)
 	unregister_memory_notifier(&cmm_mem_nb);
 	cmm_free_pages(atomic_long_read(&loaned_pages));
 	cmm_unregister_sysfs(&cmm_dev);
-	cmm_balloon_compaction_deinit();
 }
 
 /**
diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c
index 086ce77d9074..85dd6aa33df6 100644
--- a/drivers/misc/vmw_balloon.c
+++ b/drivers/misc/vmw_balloon.c
@@ -29,8 +29,6 @@
 #include <linux/rwsem.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
-#include <linux/mount.h>
-#include <linux/pseudo_fs.h>
 #include <linux/balloon_compaction.h>
 #include <linux/vmw_vmci_defs.h>
 #include <linux/vmw_vmci_api.h>
@@ -1730,20 +1728,6 @@ static inline void vmballoon_debugfs_exit(struct vmballoon *b)
 
 
 #ifdef CONFIG_BALLOON_COMPACTION
-
-static int vmballoon_init_fs_context(struct fs_context *fc)
-{
-	return init_pseudo(fc, BALLOON_VMW_MAGIC) ? 0 : -ENOMEM;
-}
-
-static struct file_system_type vmballoon_fs = {
-	.name           	= "balloon-vmware",
-	.init_fs_context	= vmballoon_init_fs_context,
-	.kill_sb        	= kill_anon_super,
-};
-
-static struct vfsmount *vmballoon_mnt;
-
 /**
  * vmballoon_migratepage() - migrates a balloon page.
  * @b_dev_info: balloon device information descriptor.
@@ -1862,21 +1846,6 @@ out_unlock:
 	return ret;
 }
 
-/**
- * vmballoon_compaction_deinit() - removes compaction related data.
- *
- * @b: pointer to the balloon.
- */
-static void vmballoon_compaction_deinit(struct vmballoon *b)
-{
-	if (!IS_ERR(b->b_dev_info.inode))
-		iput(b->b_dev_info.inode);
-
-	b->b_dev_info.inode = NULL;
-	kern_unmount(vmballoon_mnt);
-	vmballoon_mnt = NULL;
-}
-
 /**
  * vmballoon_compaction_init() - initialized compaction for the balloon.
  *
@@ -1888,33 +1857,15 @@ static void vmballoon_compaction_deinit(struct vmballoon *b)
  *
  * Return: zero on success or error code on failure.
  */
-static __init int vmballoon_compaction_init(struct vmballoon *b)
+static __init void vmballoon_compaction_init(struct vmballoon *b)
 {
-	vmballoon_mnt = kern_mount(&vmballoon_fs);
-	if (IS_ERR(vmballoon_mnt))
-		return PTR_ERR(vmballoon_mnt);
-
 	b->b_dev_info.migratepage = vmballoon_migratepage;
-	b->b_dev_info.inode = alloc_anon_inode(vmballoon_mnt->mnt_sb);
-
-	if (IS_ERR(b->b_dev_info.inode))
-		return PTR_ERR(b->b_dev_info.inode);
-
-	b->b_dev_info.inode->i_mapping->a_ops = &balloon_aops;
-	return 0;
 }
 
 #else /* CONFIG_BALLOON_COMPACTION */
-
-static void vmballoon_compaction_deinit(struct vmballoon *b)
-{
-}
-
-static int vmballoon_compaction_init(struct vmballoon *b)
+static inline void vmballoon_compaction_init(struct vmballoon *b)
 {
-	return 0;
 }
-
 #endif /* CONFIG_BALLOON_COMPACTION */
 
 static int __init vmballoon_init(void)
@@ -1939,9 +1890,7 @@ static int __init vmballoon_init(void)
 	 * balloon_devinfo_init() .
 	 */
 	balloon_devinfo_init(&balloon.b_dev_info);
-	error = vmballoon_compaction_init(&balloon);
-	if (error)
-		goto fail;
+	vmballoon_compaction_init(&balloon);
 
 	INIT_LIST_HEAD(&balloon.huge_pages);
 	spin_lock_init(&balloon.comm_lock);
@@ -1958,7 +1907,6 @@ static int __init vmballoon_init(void)
 	return 0;
 fail:
 	vmballoon_unregister_shrinker(&balloon);
-	vmballoon_compaction_deinit(&balloon);
 	return error;
 }
 
@@ -1985,8 +1933,5 @@ static void __exit vmballoon_exit(void)
 	 */
 	vmballoon_send_start(&balloon, 0);
 	vmballoon_pop(&balloon);
-
-	/* Only once we popped the balloon, compaction can be deinit */
-	vmballoon_compaction_deinit(&balloon);
 }
 module_exit(vmballoon_exit);
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index b9737da6c4dd..bd360b91e9d3 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -17,9 +17,6 @@
 #include <linux/oom.h>
 #include <linux/wait.h>
 #include <linux/mm.h>
-#include <linux/mount.h>
-#include <linux/magic.h>
-#include <linux/pseudo_fs.h>
 #include <linux/page_reporting.h>
 
 /*
@@ -42,10 +39,6 @@
 	(1 << (VIRTIO_BALLOON_HINT_BLOCK_ORDER + PAGE_SHIFT))
 #define VIRTIO_BALLOON_HINT_BLOCK_PAGES (1 << VIRTIO_BALLOON_HINT_BLOCK_ORDER)
 
-#ifdef CONFIG_BALLOON_COMPACTION
-static struct vfsmount *balloon_mnt;
-#endif
-
 enum virtio_balloon_vq {
 	VIRTIO_BALLOON_VQ_INFLATE,
 	VIRTIO_BALLOON_VQ_DEFLATE,
@@ -805,18 +798,6 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
 
 	return MIGRATEPAGE_SUCCESS;
 }
-
-static int balloon_init_fs_context(struct fs_context *fc)
-{
-	return init_pseudo(fc, BALLOON_KVM_MAGIC) ? 0 : -ENOMEM;
-}
-
-static struct file_system_type balloon_fs = {
-	.name           = "balloon-kvm",
-	.init_fs_context = balloon_init_fs_context,
-	.kill_sb        = kill_anon_super,
-};
-
 #endif /* CONFIG_BALLOON_COMPACTION */
 
 static unsigned long shrink_free_pages(struct virtio_balloon *vb,
@@ -909,19 +890,7 @@ static int virtballoon_probe(struct virtio_device *vdev)
 		goto out_free_vb;
 
 #ifdef CONFIG_BALLOON_COMPACTION
-	balloon_mnt = kern_mount(&balloon_fs);
-	if (IS_ERR(balloon_mnt)) {
-		err = PTR_ERR(balloon_mnt);
-		goto out_del_vqs;
-	}
-
 	vb->vb_dev_info.migratepage = virtballoon_migratepage;
-	vb->vb_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb);
-	if (IS_ERR(vb->vb_dev_info.inode)) {
-		err = PTR_ERR(vb->vb_dev_info.inode);
-		goto out_kern_unmount;
-	}
-	vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops;
 #endif
 	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
 		/*
@@ -930,13 +899,13 @@ static int virtballoon_probe(struct virtio_device *vdev)
 		 */
 		if (virtqueue_get_vring_size(vb->free_page_vq) < 2) {
 			err = -ENOSPC;
-			goto out_iput;
+			goto out_del_vqs;
 		}
 		vb->balloon_wq = alloc_workqueue("balloon-wq",
 					WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0);
 		if (!vb->balloon_wq) {
 			err = -ENOMEM;
-			goto out_iput;
+			goto out_del_vqs;
 		}
 		INIT_WORK(&vb->report_free_page_work, report_free_page_func);
 		vb->cmd_id_received_cache = VIRTIO_BALLOON_CMD_ID_STOP;
@@ -1030,13 +999,7 @@ out_unregister_shrinker:
 out_del_balloon_wq:
 	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
 		destroy_workqueue(vb->balloon_wq);
-out_iput:
-#ifdef CONFIG_BALLOON_COMPACTION
-	iput(vb->vb_dev_info.inode);
-out_kern_unmount:
-	kern_unmount(balloon_mnt);
 out_del_vqs:
-#endif
 	vdev->config->del_vqs(vdev);
 out_free_vb:
 	kfree(vb);
@@ -1083,12 +1046,6 @@ static void virtballoon_remove(struct virtio_device *vdev)
 	}
 
 	remove_common(vb);
-#ifdef CONFIG_BALLOON_COMPACTION
-	if (vb->vb_dev_info.inode)
-		iput(vb->vb_dev_info.inode);
-
-	kern_unmount(balloon_mnt);
-#endif
 	kfree(vb);
 }
 
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index edb7f6d41faa..5ca2d5699620 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -57,7 +57,6 @@ struct balloon_dev_info {
 	struct list_head pages;		/* Pages enqueued & handled to Host */
 	int (*migratepage)(struct balloon_dev_info *, struct page *newpage,
 			struct page *page, enum migrate_mode mode);
-	struct inode *inode;
 };
 
 extern struct page *balloon_page_alloc(void);
@@ -75,11 +74,10 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon)
 	spin_lock_init(&balloon->pages_lock);
 	INIT_LIST_HEAD(&balloon->pages);
 	balloon->migratepage = NULL;
-	balloon->inode = NULL;
 }
 
 #ifdef CONFIG_BALLOON_COMPACTION
-extern const struct address_space_operations balloon_aops;
+extern const struct movable_operations balloon_mops;
 
 /*
  * balloon_page_insert - insert a page into the balloon's page list and make
@@ -94,7 +92,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon,
 				       struct page *page)
 {
 	__SetPageOffline(page);
-	__SetPageMovable(page, balloon->inode->i_mapping);
+	__SetPageMovable(page, &balloon_mops);
 	set_page_private(page, (unsigned long)balloon);
 	list_add(&page->lru, &balloon->pages);
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ad5e3520fae..5d8ee3155ca2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -367,8 +367,6 @@ struct address_space_operations {
 	 */
 	int (*migratepage) (struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
-	bool (*isolate_page)(struct page *, isolate_mode_t);
-	void (*putback_page)(struct page *);
 	int (*launder_folio)(struct folio *);
 	bool (*is_partially_uptodate) (struct folio *, size_t from,
 			size_t count);
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 069a89e847f3..82c735ba6109 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -19,6 +19,43 @@ struct migration_target_control;
  */
 #define MIGRATEPAGE_SUCCESS		0
 
+/**
+ * struct movable_operations - Driver page migration
+ * @isolate_page:
+ * The VM calls this function to prepare the page to be moved.  The page
+ * is locked and the driver should not unlock it.  The driver should
+ * return ``true`` if the page is movable and ``false`` if it is not
+ * currently movable.  After this function returns, the VM uses the
+ * page->lru field, so the driver must preserve any information which
+ * is usually stored here.
+ *
+ * @migrate_page:
+ * After isolation, the VM calls this function with the isolated
+ * @src page.  The driver should copy the contents of the
+ * @src page to the @dst page and set up the fields of @dst page.
+ * Both pages are locked.
+ * If page migration is successful, the driver should call
+ * __ClearPageMovable(@src) and return MIGRATEPAGE_SUCCESS.
+ * If the driver cannot migrate the page at the moment, it can return
+ * -EAGAIN.  The VM interprets this as a temporary migration failure and
+ * will retry it later.  Any other error value is a permanent migration
+ * failure and migration will not be retried.
+ * The driver shouldn't touch the @src->lru field while in the
+ * migrate_page() function.  It may write to @dst->lru.
+ *
+ * @putback_page:
+ * If migration fails on the isolated page, the VM informs the driver
+ * that the page is no longer a candidate for migration by calling
+ * this function.  The driver should put the isolated page back into
+ * its own data structure.
+ */
+struct movable_operations {
+	bool (*isolate_page)(struct page *, isolate_mode_t);
+	int (*migrate_page)(struct page *dst, struct page *src,
+			enum migrate_mode);
+	void (*putback_page)(struct page *);
+};
+
 /* Defined in mm/debug.c: */
 extern const char *migrate_reason_names[MR_TYPES];
 
@@ -91,13 +128,13 @@ static inline int next_demotion_node(int node)
 #endif
 
 #ifdef CONFIG_COMPACTION
-extern int PageMovable(struct page *page);
-extern void __SetPageMovable(struct page *page, struct address_space *mapping);
-extern void __ClearPageMovable(struct page *page);
+bool PageMovable(struct page *page);
+void __SetPageMovable(struct page *page, const struct movable_operations *ops);
+void __ClearPageMovable(struct page *page);
 #else
-static inline int PageMovable(struct page *page) { return 0; }
+static inline bool PageMovable(struct page *page) { return false; }
 static inline void __SetPageMovable(struct page *page,
-				struct address_space *mapping)
+		const struct movable_operations *ops)
 {
 }
 static inline void __ClearPageMovable(struct page *page)
@@ -110,6 +147,15 @@ static inline bool folio_test_movable(struct folio *folio)
 	return PageMovable(&folio->page);
 }
 
+static inline
+const struct movable_operations *page_movable_ops(struct page *page)
+{
+	VM_BUG_ON(!__PageMovable(page));
+
+	return (const struct movable_operations *)
+		((unsigned long)page->mapping - PAGE_MAPPING_MOVABLE);
+}
+
 #ifdef CONFIG_NUMA_BALANCING
 extern int migrate_misplaced_page(struct page *page,
 				  struct vm_area_struct *vma, int node);
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index e66f7aa3191d..3f5490f6f038 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -639,7 +639,7 @@ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND)
  * structure which KSM associates with that merged page.  See ksm.h.
  *
  * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is used for non-lru movable
- * page and then page->mapping points a struct address_space.
+ * page and then page->mapping points to a struct movable_operations.
  *
  * Please note that, confusingly, "page_mapping" refers to the inode
  * address_space which maps the page from disk; whereas "page_mapped"
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index f724129c0425..6325d1d0e90f 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -98,12 +98,8 @@
 
 /* Since UDF 2.01 is ISO 13346 based... */
 #define UDF_SUPER_MAGIC		0x15013346
-#define BALLOON_KVM_MAGIC	0x13661366
-#define ZSMALLOC_MAGIC		0x58295829
 #define DMA_BUF_MAGIC		0x444d4142	/* "DMAB" */
 #define DEVMEM_MAGIC		0x454d444d	/* "DMEM" */
-#define Z3FOLD_MAGIC		0x33
-#define PPC_CMM_MAGIC		0xc7571590
 #define SECRETMEM_MAGIC		0x5345434d	/* "SECM" */
 
 #endif /* __LINUX_MAGIC_H__ */
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 4b8eab4b3f45..22c96fed70b5 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -228,10 +228,8 @@ static void balloon_page_putback(struct page *page)
 	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
 }
 
-
 /* move_to_new_page() counterpart for a ballooned page */
-static int balloon_page_migrate(struct address_space *mapping,
-		struct page *newpage, struct page *page,
+static int balloon_page_migrate(struct page *newpage, struct page *page,
 		enum migrate_mode mode)
 {
 	struct balloon_dev_info *balloon = balloon_page_device(page);
@@ -250,11 +248,11 @@ static int balloon_page_migrate(struct address_space *mapping,
 	return balloon->migratepage(balloon, newpage, page, mode);
 }
 
-const struct address_space_operations balloon_aops = {
-	.migratepage = balloon_page_migrate,
+const struct movable_operations balloon_mops = {
+	.migrate_page = balloon_page_migrate,
 	.isolate_page = balloon_page_isolate,
 	.putback_page = balloon_page_putback,
 };
-EXPORT_SYMBOL_GPL(balloon_aops);
+EXPORT_SYMBOL_GPL(balloon_mops);
 
 #endif /* CONFIG_BALLOON_COMPACTION */
diff --git a/mm/compaction.c b/mm/compaction.c
index 1f89b969c12b..f23efba1d118 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -110,28 +110,27 @@ static void split_map_pages(struct list_head *list)
 }
 
 #ifdef CONFIG_COMPACTION
-
-int PageMovable(struct page *page)
+bool PageMovable(struct page *page)
 {
-	struct address_space *mapping;
+	const struct movable_operations *mops;
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	if (!__PageMovable(page))
-		return 0;
+		return false;
 
-	mapping = page_mapping(page);
-	if (mapping && mapping->a_ops && mapping->a_ops->isolate_page)
-		return 1;
+	mops = page_movable_ops(page);
+	if (mops)
+		return true;
 
-	return 0;
+	return false;
 }
 EXPORT_SYMBOL(PageMovable);
 
-void __SetPageMovable(struct page *page, struct address_space *mapping)
+void __SetPageMovable(struct page *page, const struct movable_operations *mops)
 {
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
-	VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page);
-	page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE);
+	VM_BUG_ON_PAGE((unsigned long)mops & PAGE_MAPPING_MOVABLE, page);
+	page->mapping = (void *)((unsigned long)mops | PAGE_MAPPING_MOVABLE);
 }
 EXPORT_SYMBOL(__SetPageMovable);
 
@@ -139,12 +138,10 @@ void __ClearPageMovable(struct page *page)
 {
 	VM_BUG_ON_PAGE(!PageMovable(page), page);
 	/*
-	 * Clear registered address_space val with keeping PAGE_MAPPING_MOVABLE
-	 * flag so that VM can catch up released page by driver after isolation.
-	 * With it, VM migration doesn't try to put it back.
+	 * This page still has the type of a movable page, but it's
+	 * actually not movable any more.
 	 */
-	page->mapping = (void *)((unsigned long)page->mapping &
-				PAGE_MAPPING_MOVABLE);
+	page->mapping = (void *)PAGE_MAPPING_MOVABLE;
 }
 EXPORT_SYMBOL(__ClearPageMovable);
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 6c1ea61f39d8..491f03747832 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -59,7 +59,7 @@
 
 int isolate_movable_page(struct page *page, isolate_mode_t mode)
 {
-	struct address_space *mapping;
+	const struct movable_operations *mops;
 
 	/*
 	 * Avoid burning cycles with pages that are yet under __free_pages(),
@@ -97,10 +97,10 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode)
 	if (!PageMovable(page) || PageIsolated(page))
 		goto out_no_isolated;
 
-	mapping = page_mapping(page);
-	VM_BUG_ON_PAGE(!mapping, page);
+	mops = page_movable_ops(page);
+	VM_BUG_ON_PAGE(!mops, page);
 
-	if (!mapping->a_ops->isolate_page(page, mode))
+	if (!mops->isolate_page(page, mode))
 		goto out_no_isolated;
 
 	/* Driver shouldn't use PG_isolated bit of page->flags */
@@ -120,10 +120,9 @@ out:
 
 static void putback_movable_page(struct page *page)
 {
-	struct address_space *mapping;
+	const struct movable_operations *mops = page_movable_ops(page);
 
-	mapping = page_mapping(page);
-	mapping->a_ops->putback_page(page);
+	mops->putback_page(page);
 	ClearPageIsolated(page);
 }
 
@@ -846,16 +845,15 @@ static int fallback_migrate_page(struct address_space *mapping,
 static int move_to_new_folio(struct folio *dst, struct folio *src,
 				enum migrate_mode mode)
 {
-	struct address_space *mapping;
 	int rc = -EAGAIN;
 	bool is_lru = !__PageMovable(&src->page);
 
 	VM_BUG_ON_FOLIO(!folio_test_locked(src), src);
 	VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst);
 
-	mapping = folio_mapping(src);
-
 	if (likely(is_lru)) {
+		struct address_space *mapping = folio_mapping(src);
+
 		if (!mapping)
 			rc = migrate_page(mapping, &dst->page, &src->page, mode);
 		else if (mapping->a_ops->migratepage)
@@ -872,6 +870,8 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
 			rc = fallback_migrate_page(mapping, &dst->page,
 							&src->page, mode);
 	} else {
+		const struct movable_operations *mops;
+
 		/*
 		 * In case of non-lru page, it could be released after
 		 * isolation step. In that case, we shouldn't try migration.
@@ -883,8 +883,8 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
 			goto out;
 		}
 
-		rc = mapping->a_ops->migratepage(mapping, &dst->page,
-						&src->page, mode);
+		mops = page_movable_ops(&src->page);
+		rc = mops->migrate_page(&dst->page, &src->page, mode);
 		WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
 				!folio_test_isolated(src));
 	}
diff --git a/mm/util.c b/mm/util.c
index 0837570c9225..53af0e79d3e4 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -804,10 +804,10 @@ struct address_space *folio_mapping(struct folio *folio)
 		return swap_address_space(folio_swap_entry(folio));
 
 	mapping = folio->mapping;
-	if ((unsigned long)mapping & PAGE_MAPPING_ANON)
+	if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
 		return NULL;
 
-	return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
+	return mapping;
 }
 EXPORT_SYMBOL(folio_mapping);
 
diff --git a/mm/z3fold.c b/mm/z3fold.c
index f41f8b0d9e9a..cf71da10d04e 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -34,15 +34,11 @@
 #include <linux/node.h>
 #include <linux/compaction.h>
 #include <linux/percpu.h>
-#include <linux/mount.h>
-#include <linux/pseudo_fs.h>
-#include <linux/fs.h>
 #include <linux/preempt.h>
 #include <linux/workqueue.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/zpool.h>
-#include <linux/magic.h>
 #include <linux/kmemleak.h>
 
 /*
@@ -149,7 +145,6 @@ struct z3fold_header {
  * @compact_wq:	workqueue for page layout background optimization
  * @release_wq:	workqueue for safe page release
  * @work:	work_struct for safe page release
- * @inode:	inode for z3fold pseudo filesystem
  *
  * This structure is allocated at pool creation time and maintains metadata
  * pertaining to a particular z3fold pool.
@@ -169,7 +164,6 @@ struct z3fold_pool {
 	struct workqueue_struct *compact_wq;
 	struct workqueue_struct *release_wq;
 	struct work_struct work;
-	struct inode *inode;
 };
 
 /*
@@ -334,54 +328,6 @@ static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr)
 	}
 }
 
-static int z3fold_init_fs_context(struct fs_context *fc)
-{
-	return init_pseudo(fc, Z3FOLD_MAGIC) ? 0 : -ENOMEM;
-}
-
-static struct file_system_type z3fold_fs = {
-	.name		= "z3fold",
-	.init_fs_context = z3fold_init_fs_context,
-	.kill_sb	= kill_anon_super,
-};
-
-static struct vfsmount *z3fold_mnt;
-static int __init z3fold_mount(void)
-{
-	int ret = 0;
-
-	z3fold_mnt = kern_mount(&z3fold_fs);
-	if (IS_ERR(z3fold_mnt))
-		ret = PTR_ERR(z3fold_mnt);
-
-	return ret;
-}
-
-static void z3fold_unmount(void)
-{
-	kern_unmount(z3fold_mnt);
-}
-
-static const struct address_space_operations z3fold_aops;
-static int z3fold_register_migration(struct z3fold_pool *pool)
-{
-	pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb);
-	if (IS_ERR(pool->inode)) {
-		pool->inode = NULL;
-		return 1;
-	}
-
-	pool->inode->i_mapping->private_data = pool;
-	pool->inode->i_mapping->a_ops = &z3fold_aops;
-	return 0;
-}
-
-static void z3fold_unregister_migration(struct z3fold_pool *pool)
-{
-	if (pool->inode)
-		iput(pool->inode);
-}
-
 /* Initializes the z3fold header of a newly allocated z3fold page */
 static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
 					struct z3fold_pool *pool, gfp_t gfp)
@@ -1002,14 +948,10 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
 	pool->release_wq = create_singlethread_workqueue(pool->name);
 	if (!pool->release_wq)
 		goto out_wq;
-	if (z3fold_register_migration(pool))
-		goto out_rwq;
 	INIT_WORK(&pool->work, free_pages_work);
 	pool->ops = ops;
 	return pool;
 
-out_rwq:
-	destroy_workqueue(pool->release_wq);
 out_wq:
 	destroy_workqueue(pool->compact_wq);
 out_unbuddied:
@@ -1043,11 +985,12 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool)
 
 	destroy_workqueue(pool->compact_wq);
 	destroy_workqueue(pool->release_wq);
-	z3fold_unregister_migration(pool);
 	free_percpu(pool->unbuddied);
 	kfree(pool);
 }
 
+static const struct movable_operations z3fold_mops;
+
 /**
  * z3fold_alloc() - allocates a region of a given size
  * @pool:	z3fold pool from which to allocate
@@ -1117,11 +1060,11 @@ retry:
 	}
 	if (can_sleep) {
 		lock_page(page);
-		__SetPageMovable(page, pool->inode->i_mapping);
+		__SetPageMovable(page, &z3fold_mops);
 		unlock_page(page);
 	} else {
 		WARN_ON(!trylock_page(page));
-		__SetPageMovable(page, pool->inode->i_mapping);
+		__SetPageMovable(page, &z3fold_mops);
 		unlock_page(page);
 	}
 	z3fold_page_lock(zhdr);
@@ -1554,12 +1497,11 @@ out:
 	return false;
 }
 
-static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage,
-			       struct page *page, enum migrate_mode mode)
+static int z3fold_page_migrate(struct page *newpage, struct page *page,
+		enum migrate_mode mode)
 {
 	struct z3fold_header *zhdr, *new_zhdr;
 	struct z3fold_pool *pool;
-	struct address_space *new_mapping;
 
 	VM_BUG_ON_PAGE(!PageMovable(page), page);
 	VM_BUG_ON_PAGE(!PageIsolated(page), page);
@@ -1592,7 +1534,6 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
 	 * so we only have to reinitialize it.
 	 */
 	INIT_LIST_HEAD(&new_zhdr->buddy);
-	new_mapping = page_mapping(page);
 	__ClearPageMovable(page);
 
 	get_page(newpage);
@@ -1608,7 +1549,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
 	spin_lock(&pool->lock);
 	list_add(&newpage->lru, &pool->lru);
 	spin_unlock(&pool->lock);
-	__SetPageMovable(newpage, new_mapping);
+	__SetPageMovable(newpage, &z3fold_mops);
 	z3fold_page_unlock(new_zhdr);
 
 	queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
@@ -1642,9 +1583,9 @@ static void z3fold_page_putback(struct page *page)
 	z3fold_page_unlock(zhdr);
 }
 
-static const struct address_space_operations z3fold_aops = {
+static const struct movable_operations z3fold_mops = {
 	.isolate_page = z3fold_page_isolate,
-	.migratepage = z3fold_page_migrate,
+	.migrate_page = z3fold_page_migrate,
 	.putback_page = z3fold_page_putback,
 };
 
@@ -1746,17 +1687,11 @@ MODULE_ALIAS("zpool-z3fold");
 
 static int __init init_z3fold(void)
 {
-	int ret;
-
 	/*
 	 * Make sure the z3fold header is not larger than the page size and
 	 * there has remaining spaces for its buddy.
 	 */
 	BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE - CHUNK_SIZE);
-	ret = z3fold_mount();
-	if (ret)
-		return ret;
-
 	zpool_register_driver(&z3fold_zpool_driver);
 
 	return 0;
@@ -1764,7 +1699,6 @@ static int __init init_z3fold(void)
 
 static void __exit exit_z3fold(void)
 {
-	z3fold_unmount();
 	zpool_unregister_driver(&z3fold_zpool_driver);
 }
 
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 5d5fc04385b8..71d6edcbea48 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -41,7 +41,6 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
-#include <linux/magic.h>
 #include <linux/bitops.h>
 #include <linux/errno.h>
 #include <linux/highmem.h>
@@ -59,8 +58,6 @@
 #include <linux/debugfs.h>
 #include <linux/zsmalloc.h>
 #include <linux/zpool.h>
-#include <linux/mount.h>
-#include <linux/pseudo_fs.h>
 #include <linux/migrate.h>
 #include <linux/wait.h>
 #include <linux/pagemap.h>
@@ -177,10 +174,6 @@ struct zs_size_stat {
 static struct dentry *zs_stat_root;
 #endif
 
-#ifdef CONFIG_COMPACTION
-static struct vfsmount *zsmalloc_mnt;
-#endif
-
 /*
  * We assign a page to ZS_ALMOST_EMPTY fullness group when:
  *	n <= N / f, where
@@ -252,7 +245,6 @@ struct zs_pool {
 	struct dentry *stat_dentry;
 #endif
 #ifdef CONFIG_COMPACTION
-	struct inode *inode;
 	struct work_struct free_work;
 #endif
 	/* protect page/zspage migration */
@@ -271,6 +263,7 @@ struct zspage {
 	unsigned int freeobj;
 	struct page *first_page;
 	struct list_head list; /* fullness list */
+	struct zs_pool *pool;
 #ifdef CONFIG_COMPACTION
 	rwlock_t lock;
 #endif
@@ -295,8 +288,6 @@ static bool ZsHugePage(struct zspage *zspage)
 }
 
 #ifdef CONFIG_COMPACTION
-static int zs_register_migration(struct zs_pool *pool);
-static void zs_unregister_migration(struct zs_pool *pool);
 static void migrate_lock_init(struct zspage *zspage);
 static void migrate_read_lock(struct zspage *zspage);
 static void migrate_read_unlock(struct zspage *zspage);
@@ -307,10 +298,6 @@ static void kick_deferred_free(struct zs_pool *pool);
 static void init_deferred_free(struct zs_pool *pool);
 static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage);
 #else
-static int zsmalloc_mount(void) { return 0; }
-static void zsmalloc_unmount(void) {}
-static int zs_register_migration(struct zs_pool *pool) { return 0; }
-static void zs_unregister_migration(struct zs_pool *pool) {}
 static void migrate_lock_init(struct zspage *zspage) {}
 static void migrate_read_lock(struct zspage *zspage) {}
 static void migrate_read_unlock(struct zspage *zspage) {}
@@ -1083,6 +1070,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
 
 	create_page_chain(class, zspage, pages);
 	init_zspage(class, zspage);
+	zspage->pool = pool;
 
 	return zspage;
 }
@@ -1754,33 +1742,6 @@ static void lock_zspage(struct zspage *zspage)
 	migrate_read_unlock(zspage);
 }
 
-static int zs_init_fs_context(struct fs_context *fc)
-{
-	return init_pseudo(fc, ZSMALLOC_MAGIC) ? 0 : -ENOMEM;
-}
-
-static struct file_system_type zsmalloc_fs = {
-	.name		= "zsmalloc",
-	.init_fs_context = zs_init_fs_context,
-	.kill_sb	= kill_anon_super,
-};
-
-static int zsmalloc_mount(void)
-{
-	int ret = 0;
-
-	zsmalloc_mnt = kern_mount(&zsmalloc_fs);
-	if (IS_ERR(zsmalloc_mnt))
-		ret = PTR_ERR(zsmalloc_mnt);
-
-	return ret;
-}
-
-static void zsmalloc_unmount(void)
-{
-	kern_unmount(zsmalloc_mnt);
-}
-
 static void migrate_lock_init(struct zspage *zspage)
 {
 	rwlock_init(&zspage->lock);
@@ -1823,6 +1784,8 @@ static void dec_zspage_isolation(struct zspage *zspage)
 	zspage->isolated--;
 }
 
+static const struct movable_operations zsmalloc_mops;
+
 static void replace_sub_page(struct size_class *class, struct zspage *zspage,
 				struct page *newpage, struct page *oldpage)
 {
@@ -1843,7 +1806,7 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage,
 	set_first_obj_offset(newpage, get_first_obj_offset(oldpage));
 	if (unlikely(ZsHugePage(zspage)))
 		newpage->index = oldpage->index;
-	__SetPageMovable(newpage, page_mapping(oldpage));
+	__SetPageMovable(newpage, &zsmalloc_mops);
 }
 
 static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
@@ -1865,8 +1828,8 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
 	return true;
 }
 
-static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
-		struct page *page, enum migrate_mode mode)
+static int zs_page_migrate(struct page *newpage, struct page *page,
+		enum migrate_mode mode)
 {
 	struct zs_pool *pool;
 	struct size_class *class;
@@ -1889,14 +1852,15 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
 	VM_BUG_ON_PAGE(!PageMovable(page), page);
 	VM_BUG_ON_PAGE(!PageIsolated(page), page);
 
-	pool = mapping->private_data;
+	/* The page is locked, so this pointer must remain valid */
+	zspage = get_zspage(page);
+	pool = zspage->pool;
 
 	/*
 	 * The pool migrate_lock protects the race between zpage migration
 	 * and zs_free.
 	 */
 	write_lock(&pool->migrate_lock);
-	zspage = get_zspage(page);
 	class = zspage_class(pool, zspage);
 
 	/*
@@ -1964,31 +1928,12 @@ static void zs_page_putback(struct page *page)
 	migrate_write_unlock(zspage);
 }
 
-static const struct address_space_operations zsmalloc_aops = {
+static const struct movable_operations zsmalloc_mops = {
 	.isolate_page = zs_page_isolate,
-	.migratepage = zs_page_migrate,
+	.migrate_page = zs_page_migrate,
 	.putback_page = zs_page_putback,
 };
 
-static int zs_register_migration(struct zs_pool *pool)
-{
-	pool->inode = alloc_anon_inode(zsmalloc_mnt->mnt_sb);
-	if (IS_ERR(pool->inode)) {
-		pool->inode = NULL;
-		return 1;
-	}
-
-	pool->inode->i_mapping->private_data = pool;
-	pool->inode->i_mapping->a_ops = &zsmalloc_aops;
-	return 0;
-}
-
-static void zs_unregister_migration(struct zs_pool *pool)
-{
-	flush_work(&pool->free_work);
-	iput(pool->inode);
-}
-
 /*
  * Caller should hold page_lock of all pages in the zspage
  * In here, we cannot use zspage meta data.
@@ -2032,6 +1977,11 @@ static void kick_deferred_free(struct zs_pool *pool)
 	schedule_work(&pool->free_work);
 }
 
+static void zs_flush_migration(struct zs_pool *pool)
+{
+	flush_work(&pool->free_work);
+}
+
 static void init_deferred_free(struct zs_pool *pool)
 {
 	INIT_WORK(&pool->free_work, async_free_zspage);
@@ -2043,10 +1993,12 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage)
 
 	do {
 		WARN_ON(!trylock_page(page));
-		__SetPageMovable(page, pool->inode->i_mapping);
+		__SetPageMovable(page, &zsmalloc_mops);
 		unlock_page(page);
 	} while ((page = get_next_page(page)) != NULL);
 }
+#else
+static inline void zs_flush_migration(struct zs_pool *pool) { }
 #endif
 
 /*
@@ -2324,9 +2276,6 @@ struct zs_pool *zs_create_pool(const char *name)
 	/* debug only, don't abort if it fails */
 	zs_pool_stat_create(pool, name);
 
-	if (zs_register_migration(pool))
-		goto err;
-
 	/*
 	 * Not critical since shrinker is only used to trigger internal
 	 * defragmentation of the pool which is pretty optional thing.  If
@@ -2348,7 +2297,7 @@ void zs_destroy_pool(struct zs_pool *pool)
 	int i;
 
 	zs_unregister_shrinker(pool);
-	zs_unregister_migration(pool);
+	zs_flush_migration(pool);
 	zs_pool_stat_destroy(pool);
 
 	for (i = 0; i < ZS_SIZE_CLASSES; i++) {
@@ -2380,14 +2329,10 @@ static int __init zs_init(void)
 {
 	int ret;
 
-	ret = zsmalloc_mount();
-	if (ret)
-		goto out;
-
 	ret = cpuhp_setup_state(CPUHP_MM_ZS_PREPARE, "mm/zsmalloc:prepare",
 				zs_cpu_prepare, zs_cpu_dead);
 	if (ret)
-		goto hp_setup_fail;
+		goto out;
 
 #ifdef CONFIG_ZPOOL
 	zpool_register_driver(&zs_zpool_driver);
@@ -2397,8 +2342,6 @@ static int __init zs_init(void)
 
 	return 0;
 
-hp_setup_fail:
-	zsmalloc_unmount();
 out:
 	return ret;
 }
@@ -2408,7 +2351,6 @@ static void __exit zs_exit(void)
 #ifdef CONFIG_ZPOOL
 	zpool_unregister_driver(&zs_zpool_driver);
 #endif
-	zsmalloc_unmount();
 	cpuhp_remove_state(CPUHP_MM_ZS_PREPARE);
 
 	zs_stat_exit();
-- 
cgit 


From 5490da4f06d182ba944706875029e98fe7f6b821 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 6 Jun 2022 09:00:16 -0400
Subject: fs: Add aops->migrate_folio

Provide a folio-based replacement for aops->migratepage.  Update the
documentation to document migrate_folio instead of migratepage.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 Documentation/filesystems/locking.rst |  5 +++--
 Documentation/filesystems/vfs.rst     | 14 +++++++-------
 include/linux/fs.h                    |  4 +++-
 mm/compaction.c                       |  4 +++-
 mm/migrate.c                          | 11 +++++++----
 5 files changed, 23 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 9963d9600b71..4bb2627026ec 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -252,7 +252,8 @@ prototypes::
 	bool (*release_folio)(struct folio *, gfp_t);
 	void (*free_folio)(struct folio *);
 	int (*direct_IO)(struct kiocb *, struct iov_iter *iter);
-	int (*migratepage)(struct address_space *, struct page *, struct page *);
+	int (*migrate_folio)(struct address_space *, struct folio *dst,
+			struct folio *src, enum migrate_mode);
 	int (*launder_folio)(struct folio *);
 	bool (*is_partially_uptodate)(struct folio *, size_t from, size_t count);
 	int (*error_remove_page)(struct address_space *, struct page *);
@@ -278,7 +279,7 @@ invalidate_folio:	yes					exclusive
 release_folio:		yes
 free_folio:		yes
 direct_IO:
-migratepage:		yes (both)
+migrate_folio:		yes (both)
 launder_folio:		yes
 is_partially_uptodate:	yes
 error_remove_page:	yes
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index b51665cdabc4..6cd6953e175b 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -737,8 +737,8 @@ cache in your filesystem.  The following members are defined:
 		bool (*release_folio)(struct folio *, gfp_t);
 		void (*free_folio)(struct folio *);
 		ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
-		/* migrate the contents of a page to the specified target */
-		int (*migratepage) (struct page *, struct page *);
+		int (*migrate_folio)(struct mapping *, struct folio *dst,
+				struct folio *src, enum migrate_mode);
 		int (*launder_folio) (struct folio *);
 
 		bool (*is_partially_uptodate) (struct folio *, size_t from,
@@ -926,12 +926,12 @@ cache in your filesystem.  The following members are defined:
 	data directly between the storage and the application's address
 	space.
 
-``migrate_page``
+``migrate_folio``
 	This is used to compact the physical memory usage.  If the VM
-	wants to relocate a page (maybe off a memory card that is
-	signalling imminent failure) it will pass a new page and an old
-	page to this function.  migrate_page should transfer any private
-	data across and update any references that it has to the page.
+	wants to relocate a folio (maybe from a memory device that is
+	signalling imminent failure) it will pass a new folio and an old
+	folio to this function.  migrate_folio should transfer any private
+	data across and update any references that it has to the folio.
 
 ``launder_folio``
 	Called before freeing a folio - it writes back the dirty folio.
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5d8ee3155ca2..47431cf8fbb3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -362,9 +362,11 @@ struct address_space_operations {
 	void (*free_folio)(struct folio *folio);
 	ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
 	/*
-	 * migrate the contents of a page to the specified target. If
+	 * migrate the contents of a folio to the specified target. If
 	 * migrate_mode is MIGRATE_ASYNC, it must not block.
 	 */
+	int (*migrate_folio)(struct address_space *, struct folio *dst,
+			struct folio *src, enum migrate_mode);
 	int (*migratepage) (struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
 	int (*launder_folio)(struct folio *);
diff --git a/mm/compaction.c b/mm/compaction.c
index f23efba1d118..458f49f9ab09 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1042,7 +1042,9 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 				goto isolate_fail_put;
 
 			mapping = page_mapping(page);
-			migrate_dirty = !mapping || mapping->a_ops->migratepage;
+			migrate_dirty = !mapping ||
+					mapping->a_ops->migrate_folio ||
+					mapping->a_ops->migratepage;
 			unlock_page(page);
 			if (!migrate_dirty)
 				goto isolate_fail_put;
diff --git a/mm/migrate.c b/mm/migrate.c
index 491f03747832..3c3c168097dd 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -856,14 +856,17 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
 
 		if (!mapping)
 			rc = migrate_page(mapping, &dst->page, &src->page, mode);
-		else if (mapping->a_ops->migratepage)
+		else if (mapping->a_ops->migrate_folio)
 			/*
-			 * Most pages have a mapping and most filesystems
-			 * provide a migratepage callback. Anonymous pages
+			 * Most folios have a mapping and most filesystems
+			 * provide a migrate_folio callback. Anonymous folios
 			 * are part of swap space which also has its own
-			 * migratepage callback. This is the most common path
+			 * migrate_folio callback. This is the most common path
 			 * for page migration.
 			 */
+			rc = mapping->a_ops->migrate_folio(mapping, dst, src,
+								mode);
+		else if (mapping->a_ops->migratepage)
 			rc = mapping->a_ops->migratepage(mapping, &dst->page,
 							&src->page, mode);
 		else
-- 
cgit 


From 67235182a41c1bd6b32806a1556a1d299b84212b Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 6 Jun 2022 10:20:31 -0400
Subject: mm/migrate: Convert buffer_migrate_page() to buffer_migrate_folio()

Use a folio throughout __buffer_migrate_folio(), add kernel-doc for
buffer_migrate_folio() and buffer_migrate_folio_norefs(), move their
declarations to buffer.h and switch all filesystems that have wired
them up.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 block/fops.c                |  2 +-
 fs/ext2/inode.c             |  4 +--
 fs/ext4/inode.c             |  4 +--
 fs/ntfs/aops.c              |  6 ++--
 fs/ocfs2/aops.c             |  2 +-
 include/linux/buffer_head.h | 10 ++++++
 include/linux/fs.h          | 12 -------
 mm/migrate.c                | 76 +++++++++++++++++++++++++++------------------
 8 files changed, 65 insertions(+), 51 deletions(-)

(limited to 'include')

diff --git a/block/fops.c b/block/fops.c
index d6b3276a6c68..743fc46d0aad 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -417,7 +417,7 @@ const struct address_space_operations def_blk_aops = {
 	.write_end	= blkdev_write_end,
 	.writepages	= blkdev_writepages,
 	.direct_IO	= blkdev_direct_IO,
-	.migratepage	= buffer_migrate_page_norefs,
+	.migrate_folio	= buffer_migrate_folio_norefs,
 	.is_dirty_writeback = buffer_check_dirty_writeback,
 };
 
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e6b932219803..58a9d061f17d 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -973,7 +973,7 @@ const struct address_space_operations ext2_aops = {
 	.bmap			= ext2_bmap,
 	.direct_IO		= ext2_direct_IO,
 	.writepages		= ext2_writepages,
-	.migratepage		= buffer_migrate_page,
+	.migrate_folio		= buffer_migrate_folio,
 	.is_partially_uptodate	= block_is_partially_uptodate,
 	.error_remove_page	= generic_error_remove_page,
 };
@@ -989,7 +989,7 @@ const struct address_space_operations ext2_nobh_aops = {
 	.bmap			= ext2_bmap,
 	.direct_IO		= ext2_direct_IO,
 	.writepages		= ext2_writepages,
-	.migratepage		= buffer_migrate_page,
+	.migrate_folio		= buffer_migrate_folio,
 	.error_remove_page	= generic_error_remove_page,
 };
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 06cc68878176..87a8b4382bce 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3633,7 +3633,7 @@ static const struct address_space_operations ext4_aops = {
 	.invalidate_folio	= ext4_invalidate_folio,
 	.release_folio		= ext4_release_folio,
 	.direct_IO		= noop_direct_IO,
-	.migratepage		= buffer_migrate_page,
+	.migrate_folio		= buffer_migrate_folio,
 	.is_partially_uptodate  = block_is_partially_uptodate,
 	.error_remove_page	= generic_error_remove_page,
 	.swap_activate		= ext4_iomap_swap_activate,
@@ -3668,7 +3668,7 @@ static const struct address_space_operations ext4_da_aops = {
 	.invalidate_folio	= ext4_invalidate_folio,
 	.release_folio		= ext4_release_folio,
 	.direct_IO		= noop_direct_IO,
-	.migratepage		= buffer_migrate_page,
+	.migrate_folio		= buffer_migrate_folio,
 	.is_partially_uptodate  = block_is_partially_uptodate,
 	.error_remove_page	= generic_error_remove_page,
 	.swap_activate		= ext4_iomap_swap_activate,
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 9e3964ea2ea0..5f4fb6ca6f2e 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1659,7 +1659,7 @@ const struct address_space_operations ntfs_normal_aops = {
 	.dirty_folio	= block_dirty_folio,
 #endif /* NTFS_RW */
 	.bmap		= ntfs_bmap,
-	.migratepage	= buffer_migrate_page,
+	.migrate_folio	= buffer_migrate_folio,
 	.is_partially_uptodate = block_is_partially_uptodate,
 	.error_remove_page = generic_error_remove_page,
 };
@@ -1673,7 +1673,7 @@ const struct address_space_operations ntfs_compressed_aops = {
 	.writepage	= ntfs_writepage,
 	.dirty_folio	= block_dirty_folio,
 #endif /* NTFS_RW */
-	.migratepage	= buffer_migrate_page,
+	.migrate_folio	= buffer_migrate_folio,
 	.is_partially_uptodate = block_is_partially_uptodate,
 	.error_remove_page = generic_error_remove_page,
 };
@@ -1688,7 +1688,7 @@ const struct address_space_operations ntfs_mst_aops = {
 	.writepage	= ntfs_writepage,	/* Write dirty page to disk. */
 	.dirty_folio	= filemap_dirty_folio,
 #endif /* NTFS_RW */
-	.migratepage	= buffer_migrate_page,
+	.migrate_folio	= buffer_migrate_folio,
 	.is_partially_uptodate	= block_is_partially_uptodate,
 	.error_remove_page = generic_error_remove_page,
 };
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 767df51f8657..1d489003f99d 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2462,7 +2462,7 @@ const struct address_space_operations ocfs2_aops = {
 	.direct_IO		= ocfs2_direct_IO,
 	.invalidate_folio	= block_invalidate_folio,
 	.release_folio		= ocfs2_release_folio,
-	.migratepage		= buffer_migrate_page,
+	.migrate_folio		= buffer_migrate_folio,
 	.is_partially_uptodate	= block_is_partially_uptodate,
 	.error_remove_page	= generic_error_remove_page,
 };
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index c9d1463bb20f..b0366c89d6a4 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -267,6 +267,16 @@ int nobh_truncate_page(struct address_space *, loff_t, get_block_t *);
 int nobh_writepage(struct page *page, get_block_t *get_block,
                         struct writeback_control *wbc);
 
+#ifdef CONFIG_MIGRATION
+extern int buffer_migrate_folio(struct address_space *,
+		struct folio *dst, struct folio *src, enum migrate_mode);
+extern int buffer_migrate_folio_norefs(struct address_space *,
+		struct folio *dst, struct folio *src, enum migrate_mode);
+#else
+#define buffer_migrate_folio NULL
+#define buffer_migrate_folio_norefs NULL
+#endif
+
 void buffer_init(void);
 
 /*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 47431cf8fbb3..9e6b17da4e11 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3215,18 +3215,6 @@ extern int generic_check_addressable(unsigned, u64);
 
 extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry);
 
-#ifdef CONFIG_MIGRATION
-extern int buffer_migrate_page(struct address_space *,
-				struct page *, struct page *,
-				enum migrate_mode);
-extern int buffer_migrate_page_norefs(struct address_space *,
-				struct page *, struct page *,
-				enum migrate_mode);
-#else
-#define buffer_migrate_page NULL
-#define buffer_migrate_page_norefs NULL
-#endif
-
 int may_setattr(struct user_namespace *mnt_userns, struct inode *inode,
 		unsigned int ia_valid);
 int setattr_prepare(struct user_namespace *, struct dentry *, struct iattr *);
diff --git a/mm/migrate.c b/mm/migrate.c
index 75b171425c45..ea5398d0f7f1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -656,23 +656,23 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head,
 	return true;
 }
 
-static int __buffer_migrate_page(struct address_space *mapping,
-		struct page *newpage, struct page *page, enum migrate_mode mode,
+static int __buffer_migrate_folio(struct address_space *mapping,
+		struct folio *dst, struct folio *src, enum migrate_mode mode,
 		bool check_refs)
 {
 	struct buffer_head *bh, *head;
 	int rc;
 	int expected_count;
 
-	if (!page_has_buffers(page))
-		return migrate_page(mapping, newpage, page, mode);
+	head = folio_buffers(src);
+	if (!head)
+		return migrate_page(mapping, &dst->page, &src->page, mode);
 
 	/* Check whether page does not have extra refs before we do more work */
-	expected_count = expected_page_refs(mapping, page);
-	if (page_count(page) != expected_count)
+	expected_count = expected_page_refs(mapping, &src->page);
+	if (folio_ref_count(src) != expected_count)
 		return -EAGAIN;
 
-	head = page_buffers(page);
 	if (!buffer_migrate_lock_buffers(head, mode))
 		return -EAGAIN;
 
@@ -703,23 +703,22 @@ recheck_buffers:
 		}
 	}
 
-	rc = migrate_page_move_mapping(mapping, newpage, page, 0);
+	rc = folio_migrate_mapping(mapping, dst, src, 0);
 	if (rc != MIGRATEPAGE_SUCCESS)
 		goto unlock_buffers;
 
-	attach_page_private(newpage, detach_page_private(page));
+	folio_attach_private(dst, folio_detach_private(src));
 
 	bh = head;
 	do {
-		set_bh_page(bh, newpage, bh_offset(bh));
+		set_bh_page(bh, &dst->page, bh_offset(bh));
 		bh = bh->b_this_page;
-
 	} while (bh != head);
 
 	if (mode != MIGRATE_SYNC_NO_COPY)
-		migrate_page_copy(newpage, page);
+		folio_migrate_copy(dst, src);
 	else
-		migrate_page_states(newpage, page);
+		folio_migrate_flags(dst, src);
 
 	rc = MIGRATEPAGE_SUCCESS;
 unlock_buffers:
@@ -729,34 +728,51 @@ unlock_buffers:
 	do {
 		unlock_buffer(bh);
 		bh = bh->b_this_page;
-
 	} while (bh != head);
 
 	return rc;
 }
 
-/*
- * Migration function for pages with buffers. This function can only be used
- * if the underlying filesystem guarantees that no other references to "page"
- * exist. For example attached buffer heads are accessed only under page lock.
+/**
+ * buffer_migrate_folio() - Migration function for folios with buffers.
+ * @mapping: The address space containing @src.
+ * @dst: The folio to migrate to.
+ * @src: The folio to migrate from.
+ * @mode: How to migrate the folio.
+ *
+ * This function can only be used if the underlying filesystem guarantees
+ * that no other references to @src exist. For example attached buffer
+ * heads are accessed only under the folio lock.  If your filesystem cannot
+ * provide this guarantee, buffer_migrate_folio_norefs() may be more
+ * appropriate.
+ *
+ * Return: 0 on success or a negative errno on failure.
  */
-int buffer_migrate_page(struct address_space *mapping,
-		struct page *newpage, struct page *page, enum migrate_mode mode)
+int buffer_migrate_folio(struct address_space *mapping,
+		struct folio *dst, struct folio *src, enum migrate_mode mode)
 {
-	return __buffer_migrate_page(mapping, newpage, page, mode, false);
+	return __buffer_migrate_folio(mapping, dst, src, mode, false);
 }
-EXPORT_SYMBOL(buffer_migrate_page);
+EXPORT_SYMBOL(buffer_migrate_folio);
 
-/*
- * Same as above except that this variant is more careful and checks that there
- * are also no buffer head references. This function is the right one for
- * mappings where buffer heads are directly looked up and referenced (such as
- * block device mappings).
+/**
+ * buffer_migrate_folio_norefs() - Migration function for folios with buffers.
+ * @mapping: The address space containing @src.
+ * @dst: The folio to migrate to.
+ * @src: The folio to migrate from.
+ * @mode: How to migrate the folio.
+ *
+ * Like buffer_migrate_folio() except that this variant is more careful
+ * and checks that there are also no buffer head references. This function
+ * is the right one for mappings where buffer heads are directly looked
+ * up and referenced (such as block device mappings).
+ *
+ * Return: 0 on success or a negative errno on failure.
  */
-int buffer_migrate_page_norefs(struct address_space *mapping,
-		struct page *newpage, struct page *page, enum migrate_mode mode)
+int buffer_migrate_folio_norefs(struct address_space *mapping,
+		struct folio *dst, struct folio *src, enum migrate_mode mode)
 {
-	return __buffer_migrate_page(mapping, newpage, page, mode, true);
+	return __buffer_migrate_folio(mapping, dst, src, mode, true);
 }
 #endif
 
-- 
cgit 


From 541846502f4fe826cd7c16e4784695ac90736585 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 6 Jun 2022 10:27:41 -0400
Subject: mm/migrate: Convert migrate_page() to migrate_folio()

Convert all callers to pass a folio.  Most have the folio
already available.  Switch all users from aops->migratepage to
aops->migrate_folio.  Also turn the documentation into kerneldoc.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: David Sterba <dsterba@suse.com>
---
 drivers/gpu/drm/i915/gem/i915_gem_userptr.c |  4 ++--
 fs/btrfs/disk-io.c                          |  2 +-
 fs/nfs/write.c                              |  2 +-
 include/linux/migrate.h                     |  5 ++--
 mm/migrate.c                                | 37 ++++++++++++++++-------------
 mm/migrate_device.c                         |  3 ++-
 mm/shmem.c                                  |  2 +-
 mm/swap_state.c                             |  2 +-
 8 files changed, 30 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
index 094f06b4ce33..8423df021b71 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
@@ -216,8 +216,8 @@ i915_gem_userptr_put_pages(struct drm_i915_gem_object *obj,
 			 * However...!
 			 *
 			 * The mmu-notifier can be invalidated for a
-			 * migrate_page, that is alreadying holding the lock
-			 * on the page. Such a try_to_unmap() will result
+			 * migrate_folio, that is alreadying holding the lock
+			 * on the folio. Such a try_to_unmap() will result
 			 * in us calling put_pages() and so recursively try
 			 * to lock the page. We avoid that deadlock with
 			 * a trylock_page() and in exchange we risk missing
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4aeb68f8450e..64d9299218f2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -968,7 +968,7 @@ static int btree_migrate_folio(struct address_space *mapping,
 	if (folio_get_private(src) &&
 	    !filemap_release_folio(src, GFP_KERNEL))
 		return -EAGAIN;
-	return migrate_page(mapping, &dst->page, &src->page, mode);
+	return migrate_folio(mapping, dst, src, mode);
 }
 #else
 #define btree_migrate_folio NULL
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 649b9e633459..69569696dde0 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -2139,7 +2139,7 @@ int nfs_migrate_folio(struct address_space *mapping, struct folio *dst,
 		folio_wait_fscache(src);
 	}
 
-	return migrate_page(mapping, &dst->page, &src->page, mode);
+	return migrate_folio(mapping, dst, src, mode);
 }
 #endif
 
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 82c735ba6109..c9986d5da335 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -62,9 +62,8 @@ extern const char *migrate_reason_names[MR_TYPES];
 #ifdef CONFIG_MIGRATION
 
 extern void putback_movable_pages(struct list_head *l);
-extern int migrate_page(struct address_space *mapping,
-			struct page *newpage, struct page *page,
-			enum migrate_mode mode);
+int migrate_folio(struct address_space *mapping, struct folio *dst,
+		struct folio *src, enum migrate_mode mode);
 extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
 		unsigned long private, enum migrate_mode mode, int reason,
 		unsigned int *ret_succeeded);
diff --git a/mm/migrate.c b/mm/migrate.c
index 61cd8d270b03..77aeb7e12f62 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -593,34 +593,37 @@ EXPORT_SYMBOL(folio_migrate_copy);
  *                    Migration functions
  ***********************************************************/
 
-/*
- * Common logic to directly migrate a single LRU page suitable for
- * pages that do not use PagePrivate/PagePrivate2.
+/**
+ * migrate_folio() - Simple folio migration.
+ * @mapping: The address_space containing the folio.
+ * @dst: The folio to migrate the data to.
+ * @src: The folio containing the current data.
+ * @mode: How to migrate the page.
  *
- * Pages are locked upon entry and exit.
+ * Common logic to directly migrate a single LRU folio suitable for
+ * folios that do not use PagePrivate/PagePrivate2.
+ *
+ * Folios are locked upon entry and exit.
  */
-int migrate_page(struct address_space *mapping,
-		struct page *newpage, struct page *page,
-		enum migrate_mode mode)
+int migrate_folio(struct address_space *mapping, struct folio *dst,
+		struct folio *src, enum migrate_mode mode)
 {
-	struct folio *newfolio = page_folio(newpage);
-	struct folio *folio = page_folio(page);
 	int rc;
 
-	BUG_ON(folio_test_writeback(folio));	/* Writeback must be complete */
+	BUG_ON(folio_test_writeback(src));	/* Writeback must be complete */
 
-	rc = folio_migrate_mapping(mapping, newfolio, folio, 0);
+	rc = folio_migrate_mapping(mapping, dst, src, 0);
 
 	if (rc != MIGRATEPAGE_SUCCESS)
 		return rc;
 
 	if (mode != MIGRATE_SYNC_NO_COPY)
-		folio_migrate_copy(newfolio, folio);
+		folio_migrate_copy(dst, src);
 	else
-		folio_migrate_flags(newfolio, folio);
+		folio_migrate_flags(dst, src);
 	return MIGRATEPAGE_SUCCESS;
 }
-EXPORT_SYMBOL(migrate_page);
+EXPORT_SYMBOL(migrate_folio);
 
 #ifdef CONFIG_BLOCK
 /* Returns true if all buffers are successfully locked */
@@ -671,7 +674,7 @@ static int __buffer_migrate_folio(struct address_space *mapping,
 
 	head = folio_buffers(src);
 	if (!head)
-		return migrate_page(mapping, &dst->page, &src->page, mode);
+		return migrate_folio(mapping, dst, src, mode);
 
 	/* Check whether page does not have extra refs before we do more work */
 	expected_count = folio_expected_refs(mapping, src);
@@ -848,7 +851,7 @@ static int fallback_migrate_folio(struct address_space *mapping,
 	    !filemap_release_folio(src, GFP_KERNEL))
 		return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
 
-	return migrate_page(mapping, &dst->page, &src->page, mode);
+	return migrate_folio(mapping, dst, src, mode);
 }
 
 /*
@@ -875,7 +878,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
 		struct address_space *mapping = folio_mapping(src);
 
 		if (!mapping)
-			rc = migrate_page(mapping, &dst->page, &src->page, mode);
+			rc = migrate_folio(mapping, dst, src, mode);
 		else if (mapping->a_ops->migrate_folio)
 			/*
 			 * Most folios have a mapping and most filesystems
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 5052093d0262..5dd97c39ca6a 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -718,7 +718,8 @@ void migrate_vma_pages(struct migrate_vma *migrate)
 			continue;
 		}
 
-		r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
+		r = migrate_folio(mapping, page_folio(newpage),
+				page_folio(page), MIGRATE_SYNC_NO_COPY);
 		if (r != MIGRATEPAGE_SUCCESS)
 			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
 	}
diff --git a/mm/shmem.c b/mm/shmem.c
index 28a62be1d41e..15c61456e087 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3801,7 +3801,7 @@ const struct address_space_operations shmem_aops = {
 	.write_end	= shmem_write_end,
 #endif
 #ifdef CONFIG_MIGRATION
-	.migratepage	= migrate_page,
+	.migrate_folio	= migrate_folio,
 #endif
 	.error_remove_page = shmem_error_remove_page,
 };
diff --git a/mm/swap_state.c b/mm/swap_state.c
index f5b6f5638908..0a2021fc55ad 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -33,7 +33,7 @@ static const struct address_space_operations swap_aops = {
 	.writepage	= swap_writepage,
 	.dirty_folio	= noop_dirty_folio,
 #ifdef CONFIG_MIGRATION
-	.migratepage	= migrate_page,
+	.migrate_folio	= migrate_folio,
 #endif
 };
 
-- 
cgit 


From 2ec810d59602f0e08847f986ef8e16469722496f Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 6 Jun 2022 12:55:08 -0400
Subject: mm/migrate: Add filemap_migrate_folio()

There is nothing iomap-specific about iomap_migratepage(), and it fits
a pattern used by several other filesystems, so move it to mm/migrate.c,
convert it to be filemap_migrate_folio() and convert the iomap filesystems
to use it.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/gfs2/aops.c          |  2 +-
 fs/iomap/buffered-io.c  | 25 -------------------------
 fs/xfs/xfs_aops.c       |  2 +-
 fs/zonefs/super.c       |  2 +-
 include/linux/iomap.h   |  6 ------
 include/linux/pagemap.h |  6 ++++++
 mm/migrate.c            | 20 ++++++++++++++++++++
 7 files changed, 29 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 106e90a36583..57ff883d432c 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -774,7 +774,7 @@ static const struct address_space_operations gfs2_aops = {
 	.invalidate_folio = iomap_invalidate_folio,
 	.bmap = gfs2_bmap,
 	.direct_IO = noop_direct_IO,
-	.migratepage = iomap_migrate_page,
+	.migrate_folio = filemap_migrate_folio,
 	.is_partially_uptodate = iomap_is_partially_uptodate,
 	.error_remove_page = generic_error_remove_page,
 };
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 66278a14bfa7..5a91aa1db945 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -489,31 +489,6 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
 }
 EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
 
-#ifdef CONFIG_MIGRATION
-int
-iomap_migrate_page(struct address_space *mapping, struct page *newpage,
-		struct page *page, enum migrate_mode mode)
-{
-	struct folio *folio = page_folio(page);
-	struct folio *newfolio = page_folio(newpage);
-	int ret;
-
-	ret = folio_migrate_mapping(mapping, newfolio, folio, 0);
-	if (ret != MIGRATEPAGE_SUCCESS)
-		return ret;
-
-	if (folio_test_private(folio))
-		folio_attach_private(newfolio, folio_detach_private(folio));
-
-	if (mode != MIGRATE_SYNC_NO_COPY)
-		folio_migrate_copy(newfolio, folio);
-	else
-		folio_migrate_flags(newfolio, folio);
-	return MIGRATEPAGE_SUCCESS;
-}
-EXPORT_SYMBOL_GPL(iomap_migrate_page);
-#endif /* CONFIG_MIGRATION */
-
 static void
 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 {
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 8ec38b25187b..5d1a995b15f8 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -570,7 +570,7 @@ const struct address_space_operations xfs_address_space_operations = {
 	.invalidate_folio	= iomap_invalidate_folio,
 	.bmap			= xfs_vm_bmap,
 	.direct_IO		= noop_direct_IO,
-	.migratepage		= iomap_migrate_page,
+	.migrate_folio		= filemap_migrate_folio,
 	.is_partially_uptodate  = iomap_is_partially_uptodate,
 	.error_remove_page	= generic_error_remove_page,
 	.swap_activate		= xfs_iomap_swapfile_activate,
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 053299758deb..cc6d4cf580ac 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -271,7 +271,7 @@ static const struct address_space_operations zonefs_file_aops = {
 	.dirty_folio		= filemap_dirty_folio,
 	.release_folio		= iomap_release_folio,
 	.invalidate_folio	= iomap_invalidate_folio,
-	.migratepage		= iomap_migrate_page,
+	.migrate_folio		= filemap_migrate_folio,
 	.is_partially_uptodate	= iomap_is_partially_uptodate,
 	.error_remove_page	= generic_error_remove_page,
 	.direct_IO		= noop_direct_IO,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index e552097c67e0..758a1125e72f 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -231,12 +231,6 @@ void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
 bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
 bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
 void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
-#ifdef CONFIG_MIGRATION
-int iomap_migrate_page(struct address_space *mapping, struct page *newpage,
-		struct page *page, enum migrate_mode mode);
-#else
-#define iomap_migrate_page NULL
-#endif
 int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
 		const struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 87d4ea571240..cc9adbaddb59 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1078,6 +1078,12 @@ static inline int __must_check write_one_page(struct page *page)
 int __set_page_dirty_nobuffers(struct page *page);
 bool noop_dirty_folio(struct address_space *mapping, struct folio *folio);
 
+#ifdef CONFIG_MIGRATION
+int filemap_migrate_folio(struct address_space *mapping, struct folio *dst,
+		struct folio *src, enum migrate_mode mode);
+#else
+#define filemap_migrate_folio NULL
+#endif
 void page_endio(struct page *page, bool is_write, int err);
 
 void folio_end_private_2(struct folio *folio);
diff --git a/mm/migrate.c b/mm/migrate.c
index 77aeb7e12f62..4ed8f0d53c77 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -784,6 +784,26 @@ int buffer_migrate_folio_norefs(struct address_space *mapping,
 }
 #endif
 
+int filemap_migrate_folio(struct address_space *mapping,
+		struct folio *dst, struct folio *src, enum migrate_mode mode)
+{
+	int ret;
+
+	ret = folio_migrate_mapping(mapping, dst, src, 0);
+	if (ret != MIGRATEPAGE_SUCCESS)
+		return ret;
+
+	if (folio_get_private(src))
+		folio_attach_private(dst, folio_detach_private(src));
+
+	if (mode != MIGRATE_SYNC_NO_COPY)
+		folio_migrate_copy(dst, src);
+	else
+		folio_migrate_flags(dst, src);
+	return MIGRATEPAGE_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(filemap_migrate_folio);
+
 /*
  * Writeback a folio to clean the dirty state
  */
-- 
cgit 


From b890ec2a2c2d962f71ba31ae291f8fd252b46258 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 6 Jun 2022 10:47:21 -0400
Subject: hugetlb: Convert to migrate_folio

This involves converting migrate_huge_page_move_mapping().  We also need a
folio variant of hugetlb_set_page_subpool(), but that's for a later patch.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 fs/hugetlbfs/inode.c    | 23 ++++++++++++++---------
 include/linux/migrate.h |  6 +++---
 mm/migrate.c            | 18 +++++++++---------
 3 files changed, 26 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 14d33f725e05..eca1d0fabd7e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -954,28 +954,33 @@ static int hugetlbfs_symlink(struct user_namespace *mnt_userns,
 	return error;
 }
 
-static int hugetlbfs_migrate_page(struct address_space *mapping,
-				struct page *newpage, struct page *page,
+#ifdef CONFIG_MIGRATION
+static int hugetlbfs_migrate_folio(struct address_space *mapping,
+				struct folio *dst, struct folio *src,
 				enum migrate_mode mode)
 {
 	int rc;
 
-	rc = migrate_huge_page_move_mapping(mapping, newpage, page);
+	rc = migrate_huge_page_move_mapping(mapping, dst, src);
 	if (rc != MIGRATEPAGE_SUCCESS)
 		return rc;
 
-	if (hugetlb_page_subpool(page)) {
-		hugetlb_set_page_subpool(newpage, hugetlb_page_subpool(page));
-		hugetlb_set_page_subpool(page, NULL);
+	if (hugetlb_page_subpool(&src->page)) {
+		hugetlb_set_page_subpool(&dst->page,
+					hugetlb_page_subpool(&src->page));
+		hugetlb_set_page_subpool(&src->page, NULL);
 	}
 
 	if (mode != MIGRATE_SYNC_NO_COPY)
-		migrate_page_copy(newpage, page);
+		folio_migrate_copy(dst, src);
 	else
-		migrate_page_states(newpage, page);
+		folio_migrate_flags(dst, src);
 
 	return MIGRATEPAGE_SUCCESS;
 }
+#else
+#define hugetlbfs_migrate_folio NULL
+#endif
 
 static int hugetlbfs_error_remove_page(struct address_space *mapping,
 				struct page *page)
@@ -1142,7 +1147,7 @@ static const struct address_space_operations hugetlbfs_aops = {
 	.write_begin	= hugetlbfs_write_begin,
 	.write_end	= hugetlbfs_write_end,
 	.dirty_folio	= noop_dirty_folio,
-	.migratepage    = hugetlbfs_migrate_page,
+	.migrate_folio  = hugetlbfs_migrate_folio,
 	.error_remove_page	= hugetlbfs_error_remove_page,
 };
 
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index c9986d5da335..13f793309b75 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -72,8 +72,8 @@ extern int isolate_movable_page(struct page *page, isolate_mode_t mode);
 
 extern void migrate_page_states(struct page *newpage, struct page *page);
 extern void migrate_page_copy(struct page *newpage, struct page *page);
-extern int migrate_huge_page_move_mapping(struct address_space *mapping,
-				  struct page *newpage, struct page *page);
+int migrate_huge_page_move_mapping(struct address_space *mapping,
+		struct folio *dst, struct folio *src);
 extern int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page, int extra_count);
 void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
@@ -104,7 +104,7 @@ static inline void migrate_page_copy(struct page *newpage,
 				     struct page *page) {}
 
 static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
-				  struct page *newpage, struct page *page)
+				  struct folio *dst, struct folio *src)
 {
 	return -ENOSYS;
 }
diff --git a/mm/migrate.c b/mm/migrate.c
index 4ed8f0d53c77..0dd3ec9525b3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -474,26 +474,26 @@ EXPORT_SYMBOL(folio_migrate_mapping);
  * of folio_migrate_mapping().
  */
 int migrate_huge_page_move_mapping(struct address_space *mapping,
-				   struct page *newpage, struct page *page)
+				   struct folio *dst, struct folio *src)
 {
-	XA_STATE(xas, &mapping->i_pages, page_index(page));
+	XA_STATE(xas, &mapping->i_pages, folio_index(src));
 	int expected_count;
 
 	xas_lock_irq(&xas);
-	expected_count = 2 + page_has_private(page);
-	if (!page_ref_freeze(page, expected_count)) {
+	expected_count = 2 + folio_has_private(src);
+	if (!folio_ref_freeze(src, expected_count)) {
 		xas_unlock_irq(&xas);
 		return -EAGAIN;
 	}
 
-	newpage->index = page->index;
-	newpage->mapping = page->mapping;
+	dst->index = src->index;
+	dst->mapping = src->mapping;
 
-	get_page(newpage);
+	folio_get(dst);
 
-	xas_store(&xas, newpage);
+	xas_store(&xas, dst);
 
-	page_ref_unfreeze(page, expected_count - 1);
+	folio_ref_unfreeze(src, expected_count - 1);
 
 	xas_unlock_irq(&xas);
 
-- 
cgit 


From 9d0ddc0cb575fd41ff16131b06e08e1feac43b81 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 6 Jun 2022 11:53:31 -0400
Subject: fs: Remove aops->migratepage()

With all users converted to migrate_folio(), remove this operation.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/fs.h | 2 --
 mm/compaction.c    | 5 ++---
 mm/migrate.c       | 3 ---
 3 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9e6b17da4e11..7e06919b8f60 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -367,8 +367,6 @@ struct address_space_operations {
 	 */
 	int (*migrate_folio)(struct address_space *, struct folio *dst,
 			struct folio *src, enum migrate_mode);
-	int (*migratepage) (struct address_space *,
-			struct page *, struct page *, enum migrate_mode);
 	int (*launder_folio)(struct folio *);
 	bool (*is_partially_uptodate) (struct folio *, size_t from,
 			size_t count);
diff --git a/mm/compaction.c b/mm/compaction.c
index 458f49f9ab09..a2c53fcf933e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1031,7 +1031,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 
 			/*
 			 * Only pages without mappings or that have a
-			 * ->migratepage callback are possible to migrate
+			 * ->migrate_folio callback are possible to migrate
 			 * without blocking. However, we can be racing with
 			 * truncation so it's necessary to lock the page
 			 * to stabilise the mapping as truncation holds
@@ -1043,8 +1043,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 
 			mapping = page_mapping(page);
 			migrate_dirty = !mapping ||
-					mapping->a_ops->migrate_folio ||
-					mapping->a_ops->migratepage;
+					mapping->a_ops->migrate_folio;
 			unlock_page(page);
 			if (!migrate_dirty)
 				goto isolate_fail_put;
diff --git a/mm/migrate.c b/mm/migrate.c
index 0dd3ec9525b3..1b4b977809a1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -909,9 +909,6 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
 			 */
 			rc = mapping->a_ops->migrate_folio(mapping, dst, src,
 								mode);
-		else if (mapping->a_ops->migratepage)
-			rc = mapping->a_ops->migratepage(mapping, &dst->page,
-							&src->page, mode);
 		else
 			rc = fallback_migrate_folio(mapping, dst, src, mode);
 	} else {
-- 
cgit 


From 9800562f2ab41656b0bdc2a41c77ab3f6dfdd6fc Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 6 Jun 2022 13:29:10 -0400
Subject: mm/folio-compat: Remove migration compatibility functions

migrate_page_move_mapping(), migrate_page_copy() and migrate_page_states()
are all now unused after converting all the filesystems from
aops->migratepage() to aops->migrate_folio().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/migrate.h | 11 -----------
 mm/folio-compat.c       | 22 ----------------------
 mm/ksm.c                |  2 +-
 3 files changed, 1 insertion(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 13f793309b75..ae5bb67a9ba1 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -70,12 +70,8 @@ extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
 extern struct page *alloc_migration_target(struct page *page, unsigned long private);
 extern int isolate_movable_page(struct page *page, isolate_mode_t mode);
 
-extern void migrate_page_states(struct page *newpage, struct page *page);
-extern void migrate_page_copy(struct page *newpage, struct page *page);
 int migrate_huge_page_move_mapping(struct address_space *mapping,
 		struct folio *dst, struct folio *src);
-extern int migrate_page_move_mapping(struct address_space *mapping,
-		struct page *newpage, struct page *page, int extra_count);
 void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
 				spinlock_t *ptl);
 void folio_migrate_flags(struct folio *newfolio, struct folio *folio);
@@ -96,13 +92,6 @@ static inline struct page *alloc_migration_target(struct page *page,
 static inline int isolate_movable_page(struct page *page, isolate_mode_t mode)
 	{ return -EBUSY; }
 
-static inline void migrate_page_states(struct page *newpage, struct page *page)
-{
-}
-
-static inline void migrate_page_copy(struct page *newpage,
-				     struct page *page) {}
-
 static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 				  struct folio *dst, struct folio *src)
 {
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index 20bc15b57d93..458618c7302c 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -51,28 +51,6 @@ void mark_page_accessed(struct page *page)
 }
 EXPORT_SYMBOL(mark_page_accessed);
 
-#ifdef CONFIG_MIGRATION
-int migrate_page_move_mapping(struct address_space *mapping,
-		struct page *newpage, struct page *page, int extra_count)
-{
-	return folio_migrate_mapping(mapping, page_folio(newpage),
-					page_folio(page), extra_count);
-}
-EXPORT_SYMBOL(migrate_page_move_mapping);
-
-void migrate_page_states(struct page *newpage, struct page *page)
-{
-	folio_migrate_flags(page_folio(newpage), page_folio(page));
-}
-EXPORT_SYMBOL(migrate_page_states);
-
-void migrate_page_copy(struct page *newpage, struct page *page)
-{
-	folio_migrate_copy(page_folio(newpage), page_folio(page));
-}
-EXPORT_SYMBOL(migrate_page_copy);
-#endif
-
 bool set_page_writeback(struct page *page)
 {
 	return folio_start_writeback(page_folio(page));
diff --git a/mm/ksm.c b/mm/ksm.c
index 54f78c9eecae..e8f8c1a2bb39 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -712,7 +712,7 @@ again:
 	 * however, it might mean that the page is under page_ref_freeze().
 	 * The __remove_mapping() case is easy, again the node is now stale;
 	 * the same is in reuse_ksm_page() case; but if page is swapcache
-	 * in migrate_page_move_mapping(), it might still be our page,
+	 * in folio_migrate_mapping(), it might still be our page,
 	 * in which case it's essential to keep the node.
 	 */
 	while (!get_page_unless_zero(page)) {
-- 
cgit 


From cc9cf350d100f4c336edb228cbd078003430cfe7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 13 Jun 2022 07:37:13 +0200
Subject: fs: remove the nobh helpers

All callers are gone, so remove the now dead code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 fs/buffer.c                 | 324 --------------------------------------------
 fs/mpage.c                  |  25 +---
 include/linux/buffer_head.h |   8 --
 include/linux/mpage.h       |   2 -
 4 files changed, 1 insertion(+), 358 deletions(-)

(limited to 'include')

diff --git a/fs/buffer.c b/fs/buffer.c
index ce9844d7c10f..5717d1881d2f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2537,330 +2537,6 @@ out_unlock:
 }
 EXPORT_SYMBOL(block_page_mkwrite);
 
-/*
- * nobh_write_begin()'s prereads are special: the buffer_heads are freed
- * immediately, while under the page lock.  So it needs a special end_io
- * handler which does not touch the bh after unlocking it.
- */
-static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
-{
-	__end_buffer_read_notouch(bh, uptodate);
-}
-
-/*
- * Attach the singly-linked list of buffers created by nobh_write_begin, to
- * the page (converting it to circular linked list and taking care of page
- * dirty races).
- */
-static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
-{
-	struct buffer_head *bh;
-
-	BUG_ON(!PageLocked(page));
-
-	spin_lock(&page->mapping->private_lock);
-	bh = head;
-	do {
-		if (PageDirty(page))
-			set_buffer_dirty(bh);
-		if (!bh->b_this_page)
-			bh->b_this_page = head;
-		bh = bh->b_this_page;
-	} while (bh != head);
-	attach_page_private(page, head);
-	spin_unlock(&page->mapping->private_lock);
-}
-
-/*
- * On entry, the page is fully not uptodate.
- * On exit the page is fully uptodate in the areas outside (from,to)
- * The filesystem needs to handle block truncation upon failure.
- */
-int nobh_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
-			struct page **pagep, void **fsdata,
-			get_block_t *get_block)
-{
-	struct inode *inode = mapping->host;
-	const unsigned blkbits = inode->i_blkbits;
-	const unsigned blocksize = 1 << blkbits;
-	struct buffer_head *head, *bh;
-	struct page *page;
-	pgoff_t index;
-	unsigned from, to;
-	unsigned block_in_page;
-	unsigned block_start, block_end;
-	sector_t block_in_file;
-	int nr_reads = 0;
-	int ret = 0;
-	int is_mapped_to_disk = 1;
-
-	index = pos >> PAGE_SHIFT;
-	from = pos & (PAGE_SIZE - 1);
-	to = from + len;
-
-	page = grab_cache_page_write_begin(mapping, index);
-	if (!page)
-		return -ENOMEM;
-	*pagep = page;
-	*fsdata = NULL;
-
-	if (page_has_buffers(page)) {
-		ret = __block_write_begin(page, pos, len, get_block);
-		if (unlikely(ret))
-			goto out_release;
-		return ret;
-	}
-
-	if (PageMappedToDisk(page))
-		return 0;
-
-	/*
-	 * Allocate buffers so that we can keep track of state, and potentially
-	 * attach them to the page if an error occurs. In the common case of
-	 * no error, they will just be freed again without ever being attached
-	 * to the page (which is all OK, because we're under the page lock).
-	 *
-	 * Be careful: the buffer linked list is a NULL terminated one, rather
-	 * than the circular one we're used to.
-	 */
-	head = alloc_page_buffers(page, blocksize, false);
-	if (!head) {
-		ret = -ENOMEM;
-		goto out_release;
-	}
-
-	block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
-
-	/*
-	 * We loop across all blocks in the page, whether or not they are
-	 * part of the affected region.  This is so we can discover if the
-	 * page is fully mapped-to-disk.
-	 */
-	for (block_start = 0, block_in_page = 0, bh = head;
-		  block_start < PAGE_SIZE;
-		  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
-		int create;
-
-		block_end = block_start + blocksize;
-		bh->b_state = 0;
-		create = 1;
-		if (block_start >= to)
-			create = 0;
-		ret = get_block(inode, block_in_file + block_in_page,
-					bh, create);
-		if (ret)
-			goto failed;
-		if (!buffer_mapped(bh))
-			is_mapped_to_disk = 0;
-		if (buffer_new(bh))
-			clean_bdev_bh_alias(bh);
-		if (PageUptodate(page)) {
-			set_buffer_uptodate(bh);
-			continue;
-		}
-		if (buffer_new(bh) || !buffer_mapped(bh)) {
-			zero_user_segments(page, block_start, from,
-							to, block_end);
-			continue;
-		}
-		if (buffer_uptodate(bh))
-			continue;	/* reiserfs does this */
-		if (block_start < from || block_end > to) {
-			lock_buffer(bh);
-			bh->b_end_io = end_buffer_read_nobh;
-			submit_bh(REQ_OP_READ, 0, bh);
-			nr_reads++;
-		}
-	}
-
-	if (nr_reads) {
-		/*
-		 * The page is locked, so these buffers are protected from
-		 * any VM or truncate activity.  Hence we don't need to care
-		 * for the buffer_head refcounts.
-		 */
-		for (bh = head; bh; bh = bh->b_this_page) {
-			wait_on_buffer(bh);
-			if (!buffer_uptodate(bh))
-				ret = -EIO;
-		}
-		if (ret)
-			goto failed;
-	}
-
-	if (is_mapped_to_disk)
-		SetPageMappedToDisk(page);
-
-	*fsdata = head; /* to be released by nobh_write_end */
-
-	return 0;
-
-failed:
-	BUG_ON(!ret);
-	/*
-	 * Error recovery is a bit difficult. We need to zero out blocks that
-	 * were newly allocated, and dirty them to ensure they get written out.
-	 * Buffers need to be attached to the page at this point, otherwise
-	 * the handling of potential IO errors during writeout would be hard
-	 * (could try doing synchronous writeout, but what if that fails too?)
-	 */
-	attach_nobh_buffers(page, head);
-	page_zero_new_buffers(page, from, to);
-
-out_release:
-	unlock_page(page);
-	put_page(page);
-	*pagep = NULL;
-
-	return ret;
-}
-EXPORT_SYMBOL(nobh_write_begin);
-
-int nobh_write_end(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned copied,
-			struct page *page, void *fsdata)
-{
-	struct inode *inode = page->mapping->host;
-	struct buffer_head *head = fsdata;
-	struct buffer_head *bh;
-	BUG_ON(fsdata != NULL && page_has_buffers(page));
-
-	if (unlikely(copied < len) && head)
-		attach_nobh_buffers(page, head);
-	if (page_has_buffers(page))
-		return generic_write_end(file, mapping, pos, len,
-					copied, page, fsdata);
-
-	SetPageUptodate(page);
-	set_page_dirty(page);
-	if (pos+copied > inode->i_size) {
-		i_size_write(inode, pos+copied);
-		mark_inode_dirty(inode);
-	}
-
-	unlock_page(page);
-	put_page(page);
-
-	while (head) {
-		bh = head;
-		head = head->b_this_page;
-		free_buffer_head(bh);
-	}
-
-	return copied;
-}
-EXPORT_SYMBOL(nobh_write_end);
-
-/*
- * nobh_writepage() - based on block_full_write_page() except
- * that it tries to operate without attaching bufferheads to
- * the page.
- */
-int nobh_writepage(struct page *page, get_block_t *get_block,
-			struct writeback_control *wbc)
-{
-	struct inode * const inode = page->mapping->host;
-	loff_t i_size = i_size_read(inode);
-	const pgoff_t end_index = i_size >> PAGE_SHIFT;
-	unsigned offset;
-	int ret;
-
-	/* Is the page fully inside i_size? */
-	if (page->index < end_index)
-		goto out;
-
-	/* Is the page fully outside i_size? (truncate in progress) */
-	offset = i_size & (PAGE_SIZE-1);
-	if (page->index >= end_index+1 || !offset) {
-		unlock_page(page);
-		return 0; /* don't care */
-	}
-
-	/*
-	 * The page straddles i_size.  It must be zeroed out on each and every
-	 * writepage invocation because it may be mmapped.  "A file is mapped
-	 * in multiples of the page size.  For a file that is not a multiple of
-	 * the  page size, the remaining memory is zeroed when mapped, and
-	 * writes to that region are not written out to the file."
-	 */
-	zero_user_segment(page, offset, PAGE_SIZE);
-out:
-	ret = mpage_writepage(page, get_block, wbc);
-	if (ret == -EAGAIN)
-		ret = __block_write_full_page(inode, page, get_block, wbc,
-					      end_buffer_async_write);
-	return ret;
-}
-EXPORT_SYMBOL(nobh_writepage);
-
-int nobh_truncate_page(struct address_space *mapping,
-			loff_t from, get_block_t *get_block)
-{
-	pgoff_t index = from >> PAGE_SHIFT;
-	struct inode *inode = mapping->host;
-	unsigned blocksize = i_blocksize(inode);
-	struct folio *folio;
-	struct buffer_head map_bh;
-	size_t offset;
-	sector_t iblock;
-	int err;
-
-	/* Block boundary? Nothing to do */
-	if (!(from & (blocksize - 1)))
-		return 0;
-
-	folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_CREAT,
-			mapping_gfp_mask(mapping));
-	err = -ENOMEM;
-	if (!folio)
-		goto out;
-
-	if (folio_buffers(folio))
-		goto has_buffers;
-
-	iblock = from >> inode->i_blkbits;
-	map_bh.b_size = blocksize;
-	map_bh.b_state = 0;
-	err = get_block(inode, iblock, &map_bh, 0);
-	if (err)
-		goto unlock;
-	/* unmapped? It's a hole - nothing to do */
-	if (!buffer_mapped(&map_bh))
-		goto unlock;
-
-	/* Ok, it's mapped. Make sure it's up-to-date */
-	if (!folio_test_uptodate(folio)) {
-		err = mapping->a_ops->read_folio(NULL, folio);
-		if (err) {
-			folio_put(folio);
-			goto out;
-		}
-		folio_lock(folio);
-		if (!folio_test_uptodate(folio)) {
-			err = -EIO;
-			goto unlock;
-		}
-		if (folio_buffers(folio))
-			goto has_buffers;
-	}
-	offset = offset_in_folio(folio, from);
-	folio_zero_segment(folio, offset, round_up(offset, blocksize));
-	folio_mark_dirty(folio);
-	err = 0;
-
-unlock:
-	folio_unlock(folio);
-	folio_put(folio);
-out:
-	return err;
-
-has_buffers:
-	folio_unlock(folio);
-	folio_put(folio);
-	return block_truncate_page(mapping, from, get_block);
-}
-EXPORT_SYMBOL(nobh_truncate_page);
-
 int block_truncate_page(struct address_space *mapping,
 			loff_t from, get_block_t *get_block)
 {
diff --git a/fs/mpage.c b/fs/mpage.c
index 681a4b9a36e3..b7e0b7fbb41f 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -404,7 +404,6 @@ struct mpage_data {
 	struct bio *bio;
 	sector_t last_block_in_bio;
 	get_block_t *get_block;
-	unsigned use_writepage;
 };
 
 /*
@@ -624,15 +623,10 @@ confused:
 	if (bio)
 		bio = mpage_bio_submit(bio);
 
-	if (mpd->use_writepage) {
-		ret = mapping->a_ops->writepage(page, wbc);
-	} else {
-		ret = -EAGAIN;
-		goto out;
-	}
 	/*
 	 * The caller has a ref on the inode, so *mapping is stable
 	 */
+	ret = mapping->a_ops->writepage(page, wbc);
 	mapping_set_error(mapping, ret);
 out:
 	mpd->bio = bio;
@@ -674,7 +668,6 @@ mpage_writepages(struct address_space *mapping,
 			.bio = NULL,
 			.last_block_in_bio = 0,
 			.get_block = get_block,
-			.use_writepage = 1,
 		};
 
 		ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
@@ -685,19 +678,3 @@ mpage_writepages(struct address_space *mapping,
 	return ret;
 }
 EXPORT_SYMBOL(mpage_writepages);
-
-int mpage_writepage(struct page *page, get_block_t get_block,
-	struct writeback_control *wbc)
-{
-	struct mpage_data mpd = {
-		.bio = NULL,
-		.last_block_in_bio = 0,
-		.get_block = get_block,
-		.use_writepage = 0,
-	};
-	int ret = __mpage_writepage(page, wbc, &mpd);
-	if (mpd.bio)
-		mpage_bio_submit(mpd.bio);
-	return ret;
-}
-EXPORT_SYMBOL(mpage_writepage);
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index b0366c89d6a4..61afb81cfdae 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -258,14 +258,6 @@ static inline vm_fault_t block_page_mkwrite_return(int err)
 }
 sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
 int block_truncate_page(struct address_space *, loff_t, get_block_t *);
-int nobh_write_begin(struct address_space *, loff_t, unsigned len,
-				struct page **, void **, get_block_t*);
-int nobh_write_end(struct file *, struct address_space *,
-				loff_t, unsigned, unsigned,
-				struct page *, void *);
-int nobh_truncate_page(struct address_space *, loff_t, get_block_t *);
-int nobh_writepage(struct page *page, get_block_t *get_block,
-                        struct writeback_control *wbc);
 
 #ifdef CONFIG_MIGRATION
 extern int buffer_migrate_folio(struct address_space *,
diff --git a/include/linux/mpage.h b/include/linux/mpage.h
index 43986f7ec4dd..1bdc39daac0a 100644
--- a/include/linux/mpage.h
+++ b/include/linux/mpage.h
@@ -19,7 +19,5 @@ void mpage_readahead(struct readahead_control *, get_block_t get_block);
 int mpage_read_folio(struct folio *folio, get_block_t get_block);
 int mpage_writepages(struct address_space *mapping,
 		struct writeback_control *wbc, get_block_t get_block);
-int mpage_writepage(struct page *page, get_block_t *get_block,
-		struct writeback_control *wbc);
 
 #endif
-- 
cgit 


From c56f9ec8b20f931014574b943590c4d830109380 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 6 Jul 2022 10:52:14 +0100
Subject: afs: Use refcount_t rather than atomic_t

Use refcount_t rather than atomic_t in afs to make use of the count
checking facilities provided.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Marc Dionne <marc.dionne@auristor.com>
cc: linux-afs@lists.infradead.org
Link: https://lore.kernel.org/r/165911277768.3745403.423349776836296452.stgit@warthog.procyon.org.uk/ # v1
---
 fs/afs/cell.c              | 61 ++++++++++++++++++++++------------------------
 fs/afs/cmservice.c         |  2 +-
 fs/afs/internal.h          | 16 ++++++------
 fs/afs/proc.c              |  6 ++---
 fs/afs/rxrpc.c             | 26 +++++++++++---------
 fs/afs/server.c            | 40 +++++++++++++++++-------------
 fs/afs/vl_list.c           | 19 +++++----------
 fs/afs/volume.c            | 21 ++++++++++------
 include/trace/events/afs.h | 26 ++++++++++----------
 9 files changed, 110 insertions(+), 107 deletions(-)

(limited to 'include')

diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 07ad744eef77..988c2ac7cece 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -158,7 +158,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
 		cell->name[i] = tolower(name[i]);
 	cell->name[i] = 0;
 
-	atomic_set(&cell->ref, 1);
+	refcount_set(&cell->ref, 1);
 	atomic_set(&cell->active, 0);
 	INIT_WORK(&cell->manager, afs_manage_cell_work);
 	cell->volumes = RB_ROOT;
@@ -287,7 +287,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
 	cell = candidate;
 	candidate = NULL;
 	atomic_set(&cell->active, 2);
-	trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), 2, afs_cell_trace_insert);
+	trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 2, afs_cell_trace_insert);
 	rb_link_node_rcu(&cell->net_node, parent, pp);
 	rb_insert_color(&cell->net_node, &net->cells);
 	up_write(&net->cells_lock);
@@ -295,7 +295,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
 	afs_queue_cell(cell, afs_cell_trace_get_queue_new);
 
 wait_for_cell:
-	trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), atomic_read(&cell->active),
+	trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), atomic_read(&cell->active),
 		       afs_cell_trace_wait);
 	_debug("wait_for_cell");
 	wait_var_event(&cell->state,
@@ -490,13 +490,13 @@ static void afs_cell_destroy(struct rcu_head *rcu)
 {
 	struct afs_cell *cell = container_of(rcu, struct afs_cell, rcu);
 	struct afs_net *net = cell->net;
-	int u;
+	int r;
 
 	_enter("%p{%s}", cell, cell->name);
 
-	u = atomic_read(&cell->ref);
-	ASSERTCMP(u, ==, 0);
-	trace_afs_cell(cell->debug_id, u, atomic_read(&cell->active), afs_cell_trace_free);
+	r = refcount_read(&cell->ref);
+	ASSERTCMP(r, ==, 0);
+	trace_afs_cell(cell->debug_id, r, atomic_read(&cell->active), afs_cell_trace_free);
 
 	afs_put_vlserverlist(net, rcu_access_pointer(cell->vl_servers));
 	afs_unuse_cell(net, cell->alias_of, afs_cell_trace_unuse_alias);
@@ -539,13 +539,10 @@ void afs_cells_timer(struct timer_list *timer)
  */
 struct afs_cell *afs_get_cell(struct afs_cell *cell, enum afs_cell_trace reason)
 {
-	int u;
+	int r;
 
-	if (atomic_read(&cell->ref) <= 0)
-		BUG();
-
-	u = atomic_inc_return(&cell->ref);
-	trace_afs_cell(cell->debug_id, u, atomic_read(&cell->active), reason);
+	__refcount_inc(&cell->ref, &r);
+	trace_afs_cell(cell->debug_id, r + 1, atomic_read(&cell->active), reason);
 	return cell;
 }
 
@@ -556,12 +553,14 @@ void afs_put_cell(struct afs_cell *cell, enum afs_cell_trace reason)
 {
 	if (cell) {
 		unsigned int debug_id = cell->debug_id;
-		unsigned int u, a;
+		unsigned int a;
+		bool zero;
+		int r;
 
 		a = atomic_read(&cell->active);
-		u = atomic_dec_return(&cell->ref);
-		trace_afs_cell(debug_id, u, a, reason);
-		if (u == 0) {
+		zero = __refcount_dec_and_test(&cell->ref, &r);
+		trace_afs_cell(debug_id, r - 1, a, reason);
+		if (zero) {
 			a = atomic_read(&cell->active);
 			WARN(a != 0, "Cell active count %u > 0\n", a);
 			call_rcu(&cell->rcu, afs_cell_destroy);
@@ -574,14 +573,12 @@ void afs_put_cell(struct afs_cell *cell, enum afs_cell_trace reason)
  */
 struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason)
 {
-	int u, a;
-
-	if (atomic_read(&cell->ref) <= 0)
-		BUG();
+	int r, a;
 
-	u = atomic_read(&cell->ref);
+	r = refcount_read(&cell->ref);
+	WARN_ON(r == 0);
 	a = atomic_inc_return(&cell->active);
-	trace_afs_cell(cell->debug_id, u, a, reason);
+	trace_afs_cell(cell->debug_id, r, a, reason);
 	return cell;
 }
 
@@ -593,7 +590,7 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr
 {
 	unsigned int debug_id;
 	time64_t now, expire_delay;
-	int u, a;
+	int r, a;
 
 	if (!cell)
 		return;
@@ -607,9 +604,9 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr
 		expire_delay = afs_cell_gc_delay;
 
 	debug_id = cell->debug_id;
-	u = atomic_read(&cell->ref);
+	r = refcount_read(&cell->ref);
 	a = atomic_dec_return(&cell->active);
-	trace_afs_cell(debug_id, u, a, reason);
+	trace_afs_cell(debug_id, r, a, reason);
 	WARN_ON(a == 0);
 	if (a == 1)
 		/* 'cell' may now be garbage collected. */
@@ -621,11 +618,11 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr
  */
 void afs_see_cell(struct afs_cell *cell, enum afs_cell_trace reason)
 {
-	int u, a;
+	int r, a;
 
-	u = atomic_read(&cell->ref);
+	r = refcount_read(&cell->ref);
 	a = atomic_read(&cell->active);
-	trace_afs_cell(cell->debug_id, u, a, reason);
+	trace_afs_cell(cell->debug_id, r, a, reason);
 }
 
 /*
@@ -739,7 +736,7 @@ again:
 		active = 1;
 		if (atomic_try_cmpxchg_relaxed(&cell->active, &active, 0)) {
 			rb_erase(&cell->net_node, &net->cells);
-			trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), 0,
+			trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 0,
 				       afs_cell_trace_unuse_delete);
 			smp_store_release(&cell->state, AFS_CELL_REMOVED);
 		}
@@ -866,7 +863,7 @@ void afs_manage_cells(struct work_struct *work)
 		bool sched_cell = false;
 
 		active = atomic_read(&cell->active);
-		trace_afs_cell(cell->debug_id, atomic_read(&cell->ref),
+		trace_afs_cell(cell->debug_id, refcount_read(&cell->ref),
 			       active, afs_cell_trace_manage);
 
 		ASSERTCMP(active, >=, 1);
@@ -874,7 +871,7 @@ void afs_manage_cells(struct work_struct *work)
 		if (purging) {
 			if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) {
 				active = atomic_dec_return(&cell->active);
-				trace_afs_cell(cell->debug_id, atomic_read(&cell->ref),
+				trace_afs_cell(cell->debug_id, refcount_read(&cell->ref),
 					       active, afs_cell_trace_unuse_pin);
 			}
 		}
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index a3f5de28be79..cedd627e1fae 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -213,7 +213,7 @@ static void SRXAFSCB_CallBack(struct work_struct *work)
 	 */
 	if (call->server) {
 		trace_afs_server(call->server,
-				 atomic_read(&call->server->ref),
+				 refcount_read(&call->server->ref),
 				 atomic_read(&call->server->active),
 				 afs_server_trace_callback);
 		afs_break_callbacks(call->server, call->count, call->request);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index a6f25d9e75b5..64ad55494349 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -122,7 +122,7 @@ struct afs_call {
 	};
 	struct afs_operation	*op;
 	unsigned int		server_index;
-	atomic_t		usage;
+	refcount_t		ref;
 	enum afs_call_state	state;
 	spinlock_t		state_lock;
 	int			error;		/* error code */
@@ -365,7 +365,7 @@ struct afs_cell {
 	struct hlist_node	proc_link;	/* /proc cell list link */
 	time64_t		dns_expiry;	/* Time AFSDB/SRV record expires */
 	time64_t		last_inactive;	/* Time of last drop of usage count */
-	atomic_t		ref;		/* Struct refcount */
+	refcount_t		ref;		/* Struct refcount */
 	atomic_t		active;		/* Active usage counter */
 	unsigned long		flags;
 #define AFS_CELL_FL_NO_GC	0		/* The cell was added manually, don't auto-gc */
@@ -410,7 +410,7 @@ struct afs_vlserver {
 #define AFS_VLSERVER_FL_IS_YFS	2		/* Server is YFS not AFS */
 #define AFS_VLSERVER_FL_RESPONDING 3		/* VL server is responding */
 	rwlock_t		lock;		/* Lock on addresses */
-	atomic_t		usage;
+	refcount_t		ref;
 	unsigned int		rtt;		/* Server's current RTT in uS */
 
 	/* Probe state */
@@ -446,7 +446,7 @@ struct afs_vlserver_entry {
 
 struct afs_vlserver_list {
 	struct rcu_head		rcu;
-	atomic_t		usage;
+	refcount_t		ref;
 	u8			nr_servers;
 	u8			index;		/* Server currently in use */
 	u8			preferred;	/* Preferred server */
@@ -517,7 +517,7 @@ struct afs_server {
 #define AFS_SERVER_FL_NO_IBULK	17		/* Fileserver doesn't support FS.InlineBulkStatus */
 #define AFS_SERVER_FL_NO_RM2	18		/* Fileserver doesn't support YFS.RemoveFile2 */
 #define AFS_SERVER_FL_HAS_FS64	19		/* Fileserver supports FS.{Fetch,Store}Data64 */
-	atomic_t		ref;		/* Object refcount */
+	refcount_t		ref;		/* Object refcount */
 	atomic_t		active;		/* Active user count */
 	u32			addr_version;	/* Address list version */
 	unsigned int		rtt;		/* Server's current RTT in uS */
@@ -571,7 +571,7 @@ struct afs_volume {
 		struct rcu_head	rcu;
 		afs_volid_t	vid;		/* volume ID */
 	};
-	atomic_t		usage;
+	refcount_t		ref;
 	time64_t		update_at;	/* Time at which to next update */
 	struct afs_cell		*cell;		/* Cell to which belongs (pins ref) */
 	struct rb_node		cell_node;	/* Link in cell->volumes */
@@ -1493,14 +1493,14 @@ extern int afs_end_vlserver_operation(struct afs_vl_cursor *);
  */
 static inline struct afs_vlserver *afs_get_vlserver(struct afs_vlserver *vlserver)
 {
-	atomic_inc(&vlserver->usage);
+	refcount_inc(&vlserver->ref);
 	return vlserver;
 }
 
 static inline struct afs_vlserver_list *afs_get_vlserverlist(struct afs_vlserver_list *vllist)
 {
 	if (vllist)
-		atomic_inc(&vllist->usage);
+		refcount_inc(&vllist->ref);
 	return vllist;
 }
 
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index e1b863449296..2a0c83d71565 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -47,7 +47,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v)
 
 	/* display one cell per line on subsequent lines */
 	seq_printf(m, "%3u %3u %6lld %2u %2u %s\n",
-		   atomic_read(&cell->ref),
+		   refcount_read(&cell->ref),
 		   atomic_read(&cell->active),
 		   cell->dns_expiry - ktime_get_real_seconds(),
 		   vllist ? vllist->nr_servers : 0,
@@ -217,7 +217,7 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v)
 	}
 
 	seq_printf(m, "%3d %08llx %s %s\n",
-		   atomic_read(&vol->usage), vol->vid,
+		   refcount_read(&vol->ref), vol->vid,
 		   afs_vol_types[vol->type],
 		   vol->name);
 
@@ -388,7 +388,7 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
 	alist = rcu_dereference(server->addresses);
 	seq_printf(m, "%pU %3d %3d\n",
 		   &server->uuid,
-		   atomic_read(&server->ref),
+		   refcount_read(&server->ref),
 		   atomic_read(&server->active));
 	seq_printf(m, "  - info: fl=%lx rtt=%u brk=%x\n",
 		   server->flags, server->rtt, server->cb_s_break);
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index a5434f3e57c6..d9acc43cb6f0 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -145,7 +145,7 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
 	call->type = type;
 	call->net = net;
 	call->debug_id = atomic_inc_return(&rxrpc_debug_id);
-	atomic_set(&call->usage, 1);
+	refcount_set(&call->ref, 1);
 	INIT_WORK(&call->async_work, afs_process_async_call);
 	init_waitqueue_head(&call->waitq);
 	spin_lock_init(&call->state_lock);
@@ -163,14 +163,15 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
 void afs_put_call(struct afs_call *call)
 {
 	struct afs_net *net = call->net;
-	int n = atomic_dec_return(&call->usage);
-	int o = atomic_read(&net->nr_outstanding_calls);
+	bool zero;
+	int r, o;
 
-	trace_afs_call(call, afs_call_trace_put, n, o,
+	zero = __refcount_dec_and_test(&call->ref, &r);
+	o = atomic_read(&net->nr_outstanding_calls);
+	trace_afs_call(call, afs_call_trace_put, r - 1, o,
 		       __builtin_return_address(0));
 
-	ASSERTCMP(n, >=, 0);
-	if (n == 0) {
+	if (zero) {
 		ASSERT(!work_pending(&call->async_work));
 		ASSERT(call->type->name != NULL);
 
@@ -198,9 +199,11 @@ void afs_put_call(struct afs_call *call)
 static struct afs_call *afs_get_call(struct afs_call *call,
 				     enum afs_call_trace why)
 {
-	int u = atomic_inc_return(&call->usage);
+	int r;
 
-	trace_afs_call(call, why, u,
+	__refcount_inc(&call->ref, &r);
+
+	trace_afs_call(call, why, r + 1,
 		       atomic_read(&call->net->nr_outstanding_calls),
 		       __builtin_return_address(0));
 	return call;
@@ -668,14 +671,13 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall,
 				   unsigned long call_user_ID)
 {
 	struct afs_call *call = (struct afs_call *)call_user_ID;
-	int u;
+	int r;
 
 	trace_afs_notify_call(rxcall, call);
 	call->need_attention = true;
 
-	u = atomic_fetch_add_unless(&call->usage, 1, 0);
-	if (u != 0) {
-		trace_afs_call(call, afs_call_trace_wake, u + 1,
+	if (__refcount_inc_not_zero(&call->ref, &r)) {
+		trace_afs_call(call, afs_call_trace_wake, r + 1,
 			       atomic_read(&call->net->nr_outstanding_calls),
 			       __builtin_return_address(0));
 
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 6e5b9a19b234..ffed828622b6 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -228,7 +228,7 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
 	if (!server)
 		goto enomem;
 
-	atomic_set(&server->ref, 1);
+	refcount_set(&server->ref, 1);
 	atomic_set(&server->active, 1);
 	server->debug_id = atomic_inc_return(&afs_server_debug_id);
 	RCU_INIT_POINTER(server->addresses, alist);
@@ -352,9 +352,10 @@ void afs_servers_timer(struct timer_list *timer)
 struct afs_server *afs_get_server(struct afs_server *server,
 				  enum afs_server_trace reason)
 {
-	unsigned int u = atomic_inc_return(&server->ref);
+	int r;
 
-	trace_afs_server(server, u, atomic_read(&server->active), reason);
+	__refcount_inc(&server->ref, &r);
+	trace_afs_server(server, r + 1, atomic_read(&server->active), reason);
 	return server;
 }
 
@@ -364,14 +365,14 @@ struct afs_server *afs_get_server(struct afs_server *server,
 static struct afs_server *afs_maybe_use_server(struct afs_server *server,
 					       enum afs_server_trace reason)
 {
-	unsigned int r = atomic_fetch_add_unless(&server->ref, 1, 0);
 	unsigned int a;
+	int r;
 
-	if (r == 0)
+	if (!__refcount_inc_not_zero(&server->ref, &r))
 		return NULL;
 
 	a = atomic_inc_return(&server->active);
-	trace_afs_server(server, r, a, reason);
+	trace_afs_server(server, r + 1, a, reason);
 	return server;
 }
 
@@ -380,10 +381,13 @@ static struct afs_server *afs_maybe_use_server(struct afs_server *server,
  */
 struct afs_server *afs_use_server(struct afs_server *server, enum afs_server_trace reason)
 {
-	unsigned int r = atomic_inc_return(&server->ref);
-	unsigned int a = atomic_inc_return(&server->active);
+	unsigned int a;
+	int r;
+
+	__refcount_inc(&server->ref, &r);
+	a = atomic_inc_return(&server->active);
 
-	trace_afs_server(server, r, a, reason);
+	trace_afs_server(server, r + 1, a, reason);
 	return server;
 }
 
@@ -393,14 +397,15 @@ struct afs_server *afs_use_server(struct afs_server *server, enum afs_server_tra
 void afs_put_server(struct afs_net *net, struct afs_server *server,
 		    enum afs_server_trace reason)
 {
-	unsigned int usage;
+	bool zero;
+	int r;
 
 	if (!server)
 		return;
 
-	usage = atomic_dec_return(&server->ref);
-	trace_afs_server(server, usage, atomic_read(&server->active), reason);
-	if (unlikely(usage == 0))
+	zero = __refcount_dec_and_test(&server->ref, &r);
+	trace_afs_server(server, r - 1, atomic_read(&server->active), reason);
+	if (unlikely(zero))
 		__afs_put_server(net, server);
 }
 
@@ -436,7 +441,7 @@ static void afs_server_rcu(struct rcu_head *rcu)
 {
 	struct afs_server *server = container_of(rcu, struct afs_server, rcu);
 
-	trace_afs_server(server, atomic_read(&server->ref),
+	trace_afs_server(server, refcount_read(&server->ref),
 			 atomic_read(&server->active), afs_server_trace_free);
 	afs_put_addrlist(rcu_access_pointer(server->addresses));
 	kfree(server);
@@ -487,7 +492,7 @@ static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list)
 
 		active = atomic_read(&server->active);
 		if (active == 0) {
-			trace_afs_server(server, atomic_read(&server->ref),
+			trace_afs_server(server, refcount_read(&server->ref),
 					 active, afs_server_trace_gc);
 			next = rcu_dereference_protected(
 				server->uuid_next, lockdep_is_held(&net->fs_lock.lock));
@@ -553,7 +558,7 @@ void afs_manage_servers(struct work_struct *work)
 		_debug("manage %pU %u", &server->uuid, active);
 
 		if (purging) {
-			trace_afs_server(server, atomic_read(&server->ref),
+			trace_afs_server(server, refcount_read(&server->ref),
 					 active, afs_server_trace_purging);
 			if (active != 0)
 				pr_notice("Can't purge s=%08x\n", server->debug_id);
@@ -633,7 +638,8 @@ static noinline bool afs_update_server_record(struct afs_operation *op,
 
 	_enter("");
 
-	trace_afs_server(server, atomic_read(&server->ref), atomic_read(&server->active),
+	trace_afs_server(server, refcount_read(&server->ref),
+			 atomic_read(&server->active),
 			 afs_server_trace_update);
 
 	alist = afs_vl_lookup_addrs(op->volume->cell, op->key, &server->uuid);
diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c
index 38b2ba1d9ec0..acc48216136a 100644
--- a/fs/afs/vl_list.c
+++ b/fs/afs/vl_list.c
@@ -17,7 +17,7 @@ struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len,
 	vlserver = kzalloc(struct_size(vlserver, name, name_len + 1),
 			   GFP_KERNEL);
 	if (vlserver) {
-		atomic_set(&vlserver->usage, 1);
+		refcount_set(&vlserver->ref, 1);
 		rwlock_init(&vlserver->lock);
 		init_waitqueue_head(&vlserver->probe_wq);
 		spin_lock_init(&vlserver->probe_lock);
@@ -39,13 +39,9 @@ static void afs_vlserver_rcu(struct rcu_head *rcu)
 
 void afs_put_vlserver(struct afs_net *net, struct afs_vlserver *vlserver)
 {
-	if (vlserver) {
-		unsigned int u = atomic_dec_return(&vlserver->usage);
-		//_debug("VL PUT %p{%u}", vlserver, u);
-
-		if (u == 0)
-			call_rcu(&vlserver->rcu, afs_vlserver_rcu);
-	}
+	if (vlserver &&
+	    refcount_dec_and_test(&vlserver->ref))
+		call_rcu(&vlserver->rcu, afs_vlserver_rcu);
 }
 
 struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int nr_servers)
@@ -54,7 +50,7 @@ struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int nr_servers)
 
 	vllist = kzalloc(struct_size(vllist, servers, nr_servers), GFP_KERNEL);
 	if (vllist) {
-		atomic_set(&vllist->usage, 1);
+		refcount_set(&vllist->ref, 1);
 		rwlock_init(&vllist->lock);
 	}
 
@@ -64,10 +60,7 @@ struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int nr_servers)
 void afs_put_vlserverlist(struct afs_net *net, struct afs_vlserver_list *vllist)
 {
 	if (vllist) {
-		unsigned int u = atomic_dec_return(&vllist->usage);
-
-		//_debug("VLLS PUT %p{%u}", vllist, u);
-		if (u == 0) {
+		if (refcount_dec_and_test(&vllist->ref)) {
 			int i;
 
 			for (i = 0; i < vllist->nr_servers; i++) {
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index cc665cef0abe..f4937029dcd7 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -52,7 +52,7 @@ static void afs_remove_volume_from_cell(struct afs_volume *volume)
 	struct afs_cell *cell = volume->cell;
 
 	if (!hlist_unhashed(&volume->proc_link)) {
-		trace_afs_volume(volume->vid, atomic_read(&volume->usage),
+		trace_afs_volume(volume->vid, refcount_read(&cell->ref),
 				 afs_volume_trace_remove);
 		write_seqlock(&cell->volume_lock);
 		hlist_del_rcu(&volume->proc_link);
@@ -87,7 +87,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
 	volume->type_force	= params->force;
 	volume->name_len	= vldb->name_len;
 
-	atomic_set(&volume->usage, 1);
+	refcount_set(&volume->ref, 1);
 	INIT_HLIST_NODE(&volume->proc_link);
 	rwlock_init(&volume->servers_lock);
 	rwlock_init(&volume->cb_v_break_lock);
@@ -228,7 +228,7 @@ static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume)
 	afs_remove_volume_from_cell(volume);
 	afs_put_serverlist(net, rcu_access_pointer(volume->servers));
 	afs_put_cell(volume->cell, afs_cell_trace_put_vol);
-	trace_afs_volume(volume->vid, atomic_read(&volume->usage),
+	trace_afs_volume(volume->vid, refcount_read(&volume->ref),
 			 afs_volume_trace_free);
 	kfree_rcu(volume, rcu);
 
@@ -242,8 +242,10 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume,
 				  enum afs_volume_trace reason)
 {
 	if (volume) {
-		int u = atomic_inc_return(&volume->usage);
-		trace_afs_volume(volume->vid, u, reason);
+		int r;
+
+		__refcount_inc(&volume->ref, &r);
+		trace_afs_volume(volume->vid, r + 1, reason);
 	}
 	return volume;
 }
@@ -257,9 +259,12 @@ void afs_put_volume(struct afs_net *net, struct afs_volume *volume,
 {
 	if (volume) {
 		afs_volid_t vid = volume->vid;
-		int u = atomic_dec_return(&volume->usage);
-		trace_afs_volume(vid, u, reason);
-		if (u == 0)
+		bool zero;
+		int r;
+
+		zero = __refcount_dec_and_test(&volume->ref, &r);
+		trace_afs_volume(vid, r - 1, reason);
+		if (zero)
 			afs_destroy_volume(net, volume);
 	}
 }
diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h
index 499f5fabd20f..aa60f42a9763 100644
--- a/include/trace/events/afs.h
+++ b/include/trace/events/afs.h
@@ -728,14 +728,14 @@ TRACE_EVENT(afs_cb_call,
 
 TRACE_EVENT(afs_call,
 	    TP_PROTO(struct afs_call *call, enum afs_call_trace op,
-		     int usage, int outstanding, const void *where),
+		     int ref, int outstanding, const void *where),
 
-	    TP_ARGS(call, op, usage, outstanding, where),
+	    TP_ARGS(call, op, ref, outstanding, where),
 
 	    TP_STRUCT__entry(
 		    __field(unsigned int,		call		)
 		    __field(int,			op		)
-		    __field(int,			usage		)
+		    __field(int,			ref		)
 		    __field(int,			outstanding	)
 		    __field(const void *,		where		)
 			     ),
@@ -743,15 +743,15 @@ TRACE_EVENT(afs_call,
 	    TP_fast_assign(
 		    __entry->call = call->debug_id;
 		    __entry->op = op;
-		    __entry->usage = usage;
+		    __entry->ref = ref;
 		    __entry->outstanding = outstanding;
 		    __entry->where = where;
 			   ),
 
-	    TP_printk("c=%08x %s u=%d o=%d sp=%pSR",
+	    TP_printk("c=%08x %s r=%d o=%d sp=%pSR",
 		      __entry->call,
 		      __print_symbolic(__entry->op, afs_call_traces),
-		      __entry->usage,
+		      __entry->ref,
 		      __entry->outstanding,
 		      __entry->where)
 	    );
@@ -1476,36 +1476,36 @@ TRACE_EVENT(afs_volume,
 		    __entry->reason = reason;
 			   ),
 
-	    TP_printk("V=%llx %s u=%d",
+	    TP_printk("V=%llx %s ur=%d",
 		      __entry->vid,
 		      __print_symbolic(__entry->reason, afs_volume_traces),
 		      __entry->ref)
 	    );
 
 TRACE_EVENT(afs_cell,
-	    TP_PROTO(unsigned int cell_debug_id, int usage, int active,
+	    TP_PROTO(unsigned int cell_debug_id, int ref, int active,
 		     enum afs_cell_trace reason),
 
-	    TP_ARGS(cell_debug_id, usage, active, reason),
+	    TP_ARGS(cell_debug_id, ref, active, reason),
 
 	    TP_STRUCT__entry(
 		    __field(unsigned int,		cell		)
-		    __field(int,			usage		)
+		    __field(int,			ref		)
 		    __field(int,			active		)
 		    __field(int,			reason		)
 			     ),
 
 	    TP_fast_assign(
 		    __entry->cell = cell_debug_id;
-		    __entry->usage = usage;
+		    __entry->ref = ref;
 		    __entry->active = active;
 		    __entry->reason = reason;
 			   ),
 
-	    TP_printk("L=%08x %s u=%d a=%d",
+	    TP_printk("L=%08x %s r=%d a=%d",
 		      __entry->cell,
 		      __print_symbolic(__entry->reason, afs_cell_traces),
-		      __entry->usage,
+		      __entry->ref,
 		      __entry->active)
 	    );
 
-- 
cgit 


From 2757a4dc184997c66ef1de32636f73b9f21aac14 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 6 Jul 2022 11:26:14 +0100
Subject: afs: Fix access after dec in put functions

Reference-putting functions should not access the object being put after
decrementing the refcount unless they reduce the refcount to zero.

Fix a couple of instances of this in afs by copying the information to be
logged by tracepoint to local variables before doing the decrement.

[Fixed a bit in afs_put_server() that I'd missed but Marc caught]

Fixes: 341f741f04be ("afs: Refcount the afs_call struct")
Fixes: 452181936931 ("afs: Trace afs_server usage")
Fixes: 977e5f8ed0ab ("afs: Split the usage count on struct afs_server")
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: linux-afs@lists.infradead.org
Link: https://lore.kernel.org/r/165911278430.3745403.16526310736054780645.stgit@warthog.procyon.org.uk/ # v1
---
 fs/afs/cmservice.c         |  2 +-
 fs/afs/rxrpc.c             | 11 ++++++-----
 fs/afs/server.c            | 22 +++++++++++++---------
 include/trace/events/afs.h | 12 ++++++------
 4 files changed, 26 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index cedd627e1fae..0a090d614e76 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -212,7 +212,7 @@ static void SRXAFSCB_CallBack(struct work_struct *work)
 	 * to maintain cache coherency.
 	 */
 	if (call->server) {
-		trace_afs_server(call->server,
+		trace_afs_server(call->server->debug_id,
 				 refcount_read(&call->server->ref),
 				 atomic_read(&call->server->active),
 				 afs_server_trace_callback);
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index d9acc43cb6f0..d5c4785c862d 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -152,7 +152,7 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
 	call->iter = &call->def_iter;
 
 	o = atomic_inc_return(&net->nr_outstanding_calls);
-	trace_afs_call(call, afs_call_trace_alloc, 1, o,
+	trace_afs_call(call->debug_id, afs_call_trace_alloc, 1, o,
 		       __builtin_return_address(0));
 	return call;
 }
@@ -163,12 +163,13 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
 void afs_put_call(struct afs_call *call)
 {
 	struct afs_net *net = call->net;
+	unsigned int debug_id = call->debug_id;
 	bool zero;
 	int r, o;
 
 	zero = __refcount_dec_and_test(&call->ref, &r);
 	o = atomic_read(&net->nr_outstanding_calls);
-	trace_afs_call(call, afs_call_trace_put, r - 1, o,
+	trace_afs_call(debug_id, afs_call_trace_put, r - 1, o,
 		       __builtin_return_address(0));
 
 	if (zero) {
@@ -186,7 +187,7 @@ void afs_put_call(struct afs_call *call)
 		afs_put_addrlist(call->alist);
 		kfree(call->request);
 
-		trace_afs_call(call, afs_call_trace_free, 0, o,
+		trace_afs_call(call->debug_id, afs_call_trace_free, 0, o,
 			       __builtin_return_address(0));
 		kfree(call);
 
@@ -203,7 +204,7 @@ static struct afs_call *afs_get_call(struct afs_call *call,
 
 	__refcount_inc(&call->ref, &r);
 
-	trace_afs_call(call, why, r + 1,
+	trace_afs_call(call->debug_id, why, r + 1,
 		       atomic_read(&call->net->nr_outstanding_calls),
 		       __builtin_return_address(0));
 	return call;
@@ -677,7 +678,7 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall,
 	call->need_attention = true;
 
 	if (__refcount_inc_not_zero(&call->ref, &r)) {
-		trace_afs_call(call, afs_call_trace_wake, r + 1,
+		trace_afs_call(call->debug_id, afs_call_trace_wake, r + 1,
 			       atomic_read(&call->net->nr_outstanding_calls),
 			       __builtin_return_address(0));
 
diff --git a/fs/afs/server.c b/fs/afs/server.c
index ffed828622b6..4981baf97835 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -243,7 +243,7 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
 	server->rtt = UINT_MAX;
 
 	afs_inc_servers_outstanding(net);
-	trace_afs_server(server, 1, 1, afs_server_trace_alloc);
+	trace_afs_server(server->debug_id, 1, 1, afs_server_trace_alloc);
 	_leave(" = %p", server);
 	return server;
 
@@ -352,10 +352,12 @@ void afs_servers_timer(struct timer_list *timer)
 struct afs_server *afs_get_server(struct afs_server *server,
 				  enum afs_server_trace reason)
 {
+	unsigned int a;
 	int r;
 
 	__refcount_inc(&server->ref, &r);
-	trace_afs_server(server, r + 1, atomic_read(&server->active), reason);
+	a = atomic_read(&server->active);
+	trace_afs_server(server->debug_id, r + 1, a, reason);
 	return server;
 }
 
@@ -372,7 +374,7 @@ static struct afs_server *afs_maybe_use_server(struct afs_server *server,
 		return NULL;
 
 	a = atomic_inc_return(&server->active);
-	trace_afs_server(server, r + 1, a, reason);
+	trace_afs_server(server->debug_id, r + 1, a, reason);
 	return server;
 }
 
@@ -387,7 +389,7 @@ struct afs_server *afs_use_server(struct afs_server *server, enum afs_server_tra
 	__refcount_inc(&server->ref, &r);
 	a = atomic_inc_return(&server->active);
 
-	trace_afs_server(server, r + 1, a, reason);
+	trace_afs_server(server->debug_id, r + 1, a, reason);
 	return server;
 }
 
@@ -397,14 +399,16 @@ struct afs_server *afs_use_server(struct afs_server *server, enum afs_server_tra
 void afs_put_server(struct afs_net *net, struct afs_server *server,
 		    enum afs_server_trace reason)
 {
+	unsigned int a, debug_id = server->debug_id;
 	bool zero;
 	int r;
 
 	if (!server)
 		return;
 
+	a = atomic_inc_return(&server->active);
 	zero = __refcount_dec_and_test(&server->ref, &r);
-	trace_afs_server(server, r - 1, atomic_read(&server->active), reason);
+	trace_afs_server(debug_id, r - 1, a, reason);
 	if (unlikely(zero))
 		__afs_put_server(net, server);
 }
@@ -441,7 +445,7 @@ static void afs_server_rcu(struct rcu_head *rcu)
 {
 	struct afs_server *server = container_of(rcu, struct afs_server, rcu);
 
-	trace_afs_server(server, refcount_read(&server->ref),
+	trace_afs_server(server->debug_id, refcount_read(&server->ref),
 			 atomic_read(&server->active), afs_server_trace_free);
 	afs_put_addrlist(rcu_access_pointer(server->addresses));
 	kfree(server);
@@ -492,7 +496,7 @@ static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list)
 
 		active = atomic_read(&server->active);
 		if (active == 0) {
-			trace_afs_server(server, refcount_read(&server->ref),
+			trace_afs_server(server->debug_id, refcount_read(&server->ref),
 					 active, afs_server_trace_gc);
 			next = rcu_dereference_protected(
 				server->uuid_next, lockdep_is_held(&net->fs_lock.lock));
@@ -558,7 +562,7 @@ void afs_manage_servers(struct work_struct *work)
 		_debug("manage %pU %u", &server->uuid, active);
 
 		if (purging) {
-			trace_afs_server(server, refcount_read(&server->ref),
+			trace_afs_server(server->debug_id, refcount_read(&server->ref),
 					 active, afs_server_trace_purging);
 			if (active != 0)
 				pr_notice("Can't purge s=%08x\n", server->debug_id);
@@ -638,7 +642,7 @@ static noinline bool afs_update_server_record(struct afs_operation *op,
 
 	_enter("");
 
-	trace_afs_server(server, refcount_read(&server->ref),
+	trace_afs_server(server->debug_id, refcount_read(&server->ref),
 			 atomic_read(&server->active),
 			 afs_server_trace_update);
 
diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h
index aa60f42a9763..e9d412d19dbb 100644
--- a/include/trace/events/afs.h
+++ b/include/trace/events/afs.h
@@ -727,10 +727,10 @@ TRACE_EVENT(afs_cb_call,
 	    );
 
 TRACE_EVENT(afs_call,
-	    TP_PROTO(struct afs_call *call, enum afs_call_trace op,
+	    TP_PROTO(unsigned int call_debug_id, enum afs_call_trace op,
 		     int ref, int outstanding, const void *where),
 
-	    TP_ARGS(call, op, ref, outstanding, where),
+	    TP_ARGS(call_debug_id, op, ref, outstanding, where),
 
 	    TP_STRUCT__entry(
 		    __field(unsigned int,		call		)
@@ -741,7 +741,7 @@ TRACE_EVENT(afs_call,
 			     ),
 
 	    TP_fast_assign(
-		    __entry->call = call->debug_id;
+		    __entry->call = call_debug_id;
 		    __entry->op = op;
 		    __entry->ref = ref;
 		    __entry->outstanding = outstanding;
@@ -1433,10 +1433,10 @@ TRACE_EVENT(afs_cb_miss,
 	    );
 
 TRACE_EVENT(afs_server,
-	    TP_PROTO(struct afs_server *server, int ref, int active,
+	    TP_PROTO(unsigned int server_debug_id, int ref, int active,
 		     enum afs_server_trace reason),
 
-	    TP_ARGS(server, ref, active, reason),
+	    TP_ARGS(server_debug_id, ref, active, reason),
 
 	    TP_STRUCT__entry(
 		    __field(unsigned int,		server		)
@@ -1446,7 +1446,7 @@ TRACE_EVENT(afs_server,
 			     ),
 
 	    TP_fast_assign(
-		    __entry->server = server->debug_id;
+		    __entry->server = server_debug_id;
 		    __entry->ref = ref;
 		    __entry->active = active;
 		    __entry->reason = reason;
-- 
cgit 


From 170ab26b01d7f445c46261eff69341dab7e50a24 Mon Sep 17 00:00:00 2001
From: Li zeming <zeming@nfschina.com>
Date: Thu, 21 Jul 2022 16:19:04 +0800
Subject: tracepoints: It is CONFIG_TRACEPOINTS not CONFIG_TRACEPOINT

When reading this note, CONFIG_TRACEPOINT searches my configuration
file, and the result is CONFIG_TRACEPOINTS, the search results are
consistent with the following macro definitions. I think it should be
repaired.

Link: https://lkml.kernel.org/r/20220721081904.3798-1-zeming@nfschina.com

Signed-off-by: Li zeming <zeming@nfschina.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/tracepoint.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 28031b15f878..2908cc5ed70e 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -151,7 +151,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 /*
  * Individual subsystem my have a separate configuration to
  * enable their tracepoints. By default, this file will create
- * the tracepoints if CONFIG_TRACEPOINT is defined. If a subsystem
+ * the tracepoints if CONFIG_TRACEPOINTS is defined. If a subsystem
  * wants to be able to disable its tracepoints from being created
  * it can define NOTRACE before including the tracepoint headers.
  */
-- 
cgit 


From 09794a5a6c348f629b35fc1687071a1622ef4265 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 2 Aug 2022 15:44:12 -0400
Subject: tracing: Use alignof__(struct {type b;}) instead of offsetof()

Simplify:

  #define ALIGN_STRUCTFIELD(type) ((int)(offsetof(struct {char a; type b;}, b)))

with

  #define  ALIGN_STRUCTFIELD(type) __alignof__(struct {type b;})

Which works just the same.

Link: https://lore.kernel.org/all/a7d202457150472588df0bd3b7334b3f@AcuMS.aculab.com/
Link: https://lkml.kernel.org/r/20220802154412.513c50e3@gandalf.local.home

Suggested-by: David Laight <David.Laight@ACULAB.COM>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/trace/stages/stage4_event_fields.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/stages/stage4_event_fields.h b/include/trace/stages/stage4_event_fields.h
index 80d34f396555..a8fb25f39a99 100644
--- a/include/trace/stages/stage4_event_fields.h
+++ b/include/trace/stages/stage4_event_fields.h
@@ -2,7 +2,7 @@
 
 /* Stage 4 definitions for creating trace events */
 
-#define ALIGN_STRUCTFIELD(type) ((int)(offsetof(struct {char a; type b;}, b)))
+#define ALIGN_STRUCTFIELD(type) ((int)(__alignof__(struct {type b;})))
 
 #undef __field_ext
 #define __field_ext(_type, _item, _filter_type) {			\
-- 
cgit 


From d9c26e0a58b0039d1ad4340d826fe8db866e9455 Mon Sep 17 00:00:00 2001
From: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Date: Wed, 8 Jun 2022 22:40:55 +0800
Subject: mailbox: mtk-cmdq: Remove proprietary cmdq_task_cb

rx_callback is a standard mailbox callback mechanism and could cover the
function of proprietary cmdq_task_cb, so use the standard one instead of
the proprietary one. Client driver has changed to use standard
rx_callback, so remove proprietary cmdq_task_cb.

Signed-off-by: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Reviewed-by: Matthias Brugger <matthias.bgg@gmail.com>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 drivers/mailbox/mtk-cmdq-mailbox.c       | 11 -----------
 include/linux/mailbox/mtk-cmdq-mailbox.h | 10 ----------
 2 files changed, 21 deletions(-)

(limited to 'include')

diff --git a/drivers/mailbox/mtk-cmdq-mailbox.c b/drivers/mailbox/mtk-cmdq-mailbox.c
index 2578e5aaa935..9465f9081515 100644
--- a/drivers/mailbox/mtk-cmdq-mailbox.c
+++ b/drivers/mailbox/mtk-cmdq-mailbox.c
@@ -192,15 +192,10 @@ static bool cmdq_thread_is_in_wfe(struct cmdq_thread *thread)
 
 static void cmdq_task_exec_done(struct cmdq_task *task, int sta)
 {
-	struct cmdq_task_cb *cb = &task->pkt->async_cb;
 	struct cmdq_cb_data data;
 
 	data.sta = sta;
-	data.data = cb->data;
 	data.pkt = task->pkt;
-	if (cb->cb)
-		cb->cb(data);
-
 	mbox_chan_received_data(task->thread->chan, &data);
 
 	list_del(&task->list_entry);
@@ -448,7 +443,6 @@ done:
 static int cmdq_mbox_flush(struct mbox_chan *chan, unsigned long timeout)
 {
 	struct cmdq_thread *thread = (struct cmdq_thread *)chan->con_priv;
-	struct cmdq_task_cb *cb;
 	struct cmdq_cb_data data;
 	struct cmdq *cmdq = dev_get_drvdata(chan->mbox->dev);
 	struct cmdq_task *task, *tmp;
@@ -465,13 +459,8 @@ static int cmdq_mbox_flush(struct mbox_chan *chan, unsigned long timeout)
 
 	list_for_each_entry_safe(task, tmp, &thread->task_busy_list,
 				 list_entry) {
-		cb = &task->pkt->async_cb;
 		data.sta = -ECONNABORTED;
-		data.data = cb->data;
 		data.pkt = task->pkt;
-		if (cb->cb)
-			cb->cb(data);
-
 		mbox_chan_received_data(task->thread->chan, &data);
 		list_del(&task->list_entry);
 		kfree(task);
diff --git a/include/linux/mailbox/mtk-cmdq-mailbox.h b/include/linux/mailbox/mtk-cmdq-mailbox.h
index 44365aab043c..a8f0070c7aa9 100644
--- a/include/linux/mailbox/mtk-cmdq-mailbox.h
+++ b/include/linux/mailbox/mtk-cmdq-mailbox.h
@@ -67,24 +67,14 @@ enum cmdq_code {
 
 struct cmdq_cb_data {
 	int			sta;
-	void			*data;
 	struct cmdq_pkt		*pkt;
 };
 
-typedef void (*cmdq_async_flush_cb)(struct cmdq_cb_data data);
-
-struct cmdq_task_cb {
-	cmdq_async_flush_cb	cb;
-	void			*data;
-};
-
 struct cmdq_pkt {
 	void			*va_base;
 	dma_addr_t		pa_base;
 	size_t			cmd_buf_size; /* command occupied size */
 	size_t			buf_size; /* real buffer size */
-	struct cmdq_task_cb	cb;
-	struct cmdq_task_cb	async_cb;
 	void			*cl;
 };
 
-- 
cgit 


From d3e94fdc4ef476ca1edd468cc11badf2dbbb3c00 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 8 Jan 2021 15:34:38 -0500
Subject: fscrypt: export fscrypt_fname_encrypt and
 fscrypt_fname_encrypted_size

For ceph, we want to use our own scheme for handling filenames that are
are longer than NAME_MAX after encryption and Base64 encoding. This
allows us to have a consistent view of the encrypted filenames for
clients that don't support fscrypt and clients that do but that don't
have the key.

Currently, fs/crypto only supports encrypting filenames using
fscrypt_setup_filename, but that also handles encoding nokey names. Ceph
can't use that because it handles nokey names in a different way.

Export fscrypt_fname_encrypt. Rename fscrypt_fname_encrypted_size to
__fscrypt_fname_encrypted_size and add a new wrapper called
fscrypt_fname_encrypted_size that takes an inode argument rather than a
pointer to a fscrypt_policy union.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Acked-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/crypto/fname.c           | 36 ++++++++++++++++++++++++++++++------
 fs/crypto/fscrypt_private.h |  9 +++------
 fs/crypto/hooks.c           |  6 +++---
 include/linux/fscrypt.h     |  4 ++++
 4 files changed, 40 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 14e0ef5e9a20..12bd61d20f69 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -86,7 +86,8 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
 /**
  * fscrypt_fname_encrypt() - encrypt a filename
  * @inode: inode of the parent directory (for regular filenames)
- *	   or of the symlink (for symlink targets)
+ *	   or of the symlink (for symlink targets). Key must already be
+ *	   set up.
  * @iname: the filename to encrypt
  * @out: (output) the encrypted filename
  * @olen: size of the encrypted filename.  It must be at least @iname->len.
@@ -137,6 +138,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(fscrypt_fname_encrypt);
 
 /**
  * fname_decrypt() - decrypt a filename
@@ -264,9 +266,9 @@ static int fscrypt_base64url_decode(const char *src, int srclen, u8 *dst)
 	return bp - dst;
 }
 
-bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
-				  u32 orig_len, u32 max_len,
-				  u32 *encrypted_len_ret)
+bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
+				    u32 orig_len, u32 max_len,
+				    u32 *encrypted_len_ret)
 {
 	int padding = 4 << (fscrypt_policy_flags(policy) &
 			    FSCRYPT_POLICY_FLAGS_PAD_MASK);
@@ -280,6 +282,29 @@ bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
 	return true;
 }
 
+/**
+ * fscrypt_fname_encrypted_size() - calculate length of encrypted filename
+ * @inode:		parent inode of dentry name being encrypted. Key must
+ *			already be set up.
+ * @orig_len:		length of the original filename
+ * @max_len:		maximum length to return
+ * @encrypted_len_ret:	where calculated length should be returned (on success)
+ *
+ * Filenames that are shorter than the maximum length may have their lengths
+ * increased slightly by encryption, due to padding that is applied.
+ *
+ * Return: false if the orig_len is greater than max_len. Otherwise, true and
+ *	   fill out encrypted_len_ret with the length (up to max_len).
+ */
+bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
+				  u32 max_len, u32 *encrypted_len_ret)
+{
+	return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy,
+					      orig_len, max_len,
+					      encrypted_len_ret);
+}
+EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size);
+
 /**
  * fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames
  * @max_encrypted_len: maximum length of encrypted filenames the buffer will be
@@ -435,8 +460,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
 		return ret;
 
 	if (fscrypt_has_encryption_key(dir)) {
-		if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy,
-						  iname->len, NAME_MAX,
+		if (!fscrypt_fname_encrypted_size(dir, iname->len, NAME_MAX,
 						  &fname->crypto_buf.len))
 			return -ENAMETOOLONG;
 		fname->crypto_buf.name = kmalloc(fname->crypto_buf.len,
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 6b4c8094cc7b..11fe9d213ae1 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -297,14 +297,11 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
 			 const struct fscrypt_info *ci);
 
 /* fname.c */
-int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
-			  u8 *out, unsigned int olen);
-bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
-				  u32 orig_len, u32 max_len,
-				  u32 *encrypted_len_ret);
+bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
+				    u32 orig_len, u32 max_len,
+				    u32 *encrypted_len_ret);
 
 /* hkdf.c */
-
 struct fscrypt_hkdf {
 	struct crypto_shash *hmac_tfm;
 };
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index af74599ae1cf..7c01025879b3 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -228,9 +228,9 @@ int fscrypt_prepare_symlink(struct inode *dir, const char *target,
 	 * counting it (even though it is meaningless for ciphertext) is simpler
 	 * for now since filesystems will assume it is there and subtract it.
 	 */
-	if (!fscrypt_fname_encrypted_size(policy, len,
-					  max_len - sizeof(struct fscrypt_symlink_data),
-					  &disk_link->len))
+	if (!__fscrypt_fname_encrypted_size(policy, len,
+					    max_len - sizeof(struct fscrypt_symlink_data),
+					    &disk_link->len))
 		return -ENAMETOOLONG;
 	disk_link->len += sizeof(struct fscrypt_symlink_data);
 
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index e60d57c99cb6..5926a4081c6d 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -327,6 +327,10 @@ void fscrypt_free_inode(struct inode *inode);
 int fscrypt_drop_inode(struct inode *inode);
 
 /* fname.c */
+int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
+			  u8 *out, unsigned int olen);
+bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
+				  u32 max_len, u32 *encrypted_len_ret);
 int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname,
 			   int lookup, struct fscrypt_name *fname);
 
-- 
cgit 


From 637fa738b590ec0e3414931d1e07c4f195eb5215 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Tue, 1 Sep 2020 12:56:42 -0400
Subject: fscrypt: add fscrypt_context_for_new_inode

Most filesystems just call fscrypt_set_context on new inodes, which
usually causes a setxattr. That's a bit late for ceph, which can send
along a full set of attributes with the create request.

Doing so allows it to avoid race windows that where the new inode could
be seen by other clients without the crypto context attached. It also
avoids the separate round trip to the server.

Refactor the fscrypt code a bit to allow us to create a new crypto
context, attach it to the inode, and write it to the buffer, but without
calling set_context on it. ceph can later use this to marshal the
context into the attributes we send along with the create request.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Acked-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/crypto/policy.c      | 35 +++++++++++++++++++++++++++++------
 include/linux/fscrypt.h |  1 +
 2 files changed, 30 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 5f858cee1e3b..a450189565e3 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -685,6 +685,32 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
 	return fscrypt_get_dummy_policy(dir->i_sb);
 }
 
+/**
+ * fscrypt_context_for_new_inode() - create an encryption context for a new inode
+ * @ctx: where context should be written
+ * @inode: inode from which to fetch policy and nonce
+ *
+ * Given an in-core "prepared" (via fscrypt_prepare_new_inode) inode,
+ * generate a new context and write it to ctx. ctx _must_ be at least
+ * FSCRYPT_SET_CONTEXT_MAX_SIZE bytes.
+ *
+ * Return: size of the resulting context or a negative error code.
+ */
+int fscrypt_context_for_new_inode(void *ctx, struct inode *inode)
+{
+	struct fscrypt_info *ci = inode->i_crypt_info;
+
+	BUILD_BUG_ON(sizeof(union fscrypt_context) !=
+			FSCRYPT_SET_CONTEXT_MAX_SIZE);
+
+	/* fscrypt_prepare_new_inode() should have set up the key already. */
+	if (WARN_ON_ONCE(!ci))
+		return -ENOKEY;
+
+	return fscrypt_new_context(ctx, &ci->ci_policy, ci->ci_nonce);
+}
+EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode);
+
 /**
  * fscrypt_set_context() - Set the fscrypt context of a new inode
  * @inode: a new inode
@@ -701,12 +727,9 @@ int fscrypt_set_context(struct inode *inode, void *fs_data)
 	union fscrypt_context ctx;
 	int ctxsize;
 
-	/* fscrypt_prepare_new_inode() should have set up the key already. */
-	if (WARN_ON_ONCE(!ci))
-		return -ENOKEY;
-
-	BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE);
-	ctxsize = fscrypt_new_context(&ctx, &ci->ci_policy, ci->ci_nonce);
+	ctxsize = fscrypt_context_for_new_inode(&ctx, inode);
+	if (ctxsize < 0)
+		return ctxsize;
 
 	/*
 	 * This may be the first time the inode number is available, so do any
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 5926a4081c6d..7d2f1e0f23b1 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -284,6 +284,7 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg);
 int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg);
 int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg);
 int fscrypt_has_permitted_context(struct inode *parent, struct inode *child);
+int fscrypt_context_for_new_inode(void *ctx, struct inode *inode);
 int fscrypt_set_context(struct inode *inode, void *fs_data);
 
 struct fscrypt_dummy_policy {
-- 
cgit 


From 4f48d5da81ee7004a789c8aac2d0dfb2514c37f1 Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Mon, 16 May 2022 11:23:19 +0800
Subject: fs/dcache: export d_same_name() helper

Compare dentry name with case-exact name, return true if names
are same, or false.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/dcache.c            | 15 +++++++++++----
 include/linux/dcache.h |  2 ++
 2 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/fs/dcache.c b/fs/dcache.c
index 93f4f5ee07bf..a409312ee0df 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2247,10 +2247,16 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
 }
 EXPORT_SYMBOL(d_add_ci);
 
-
-static inline bool d_same_name(const struct dentry *dentry,
-				const struct dentry *parent,
-				const struct qstr *name)
+/**
+ * d_same_name - compare dentry name with case-exact name
+ * @parent: parent dentry
+ * @dentry: the negative dentry that was passed to the parent's lookup func
+ * @name:   the case-exact name to be associated with the returned dentry
+ *
+ * Return: true if names are same, or false
+ */
+bool d_same_name(const struct dentry *dentry, const struct dentry *parent,
+		 const struct qstr *name)
 {
 	if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) {
 		if (dentry->d_name.len != name->len)
@@ -2261,6 +2267,7 @@ static inline bool d_same_name(const struct dentry *dentry,
 				       dentry->d_name.len, dentry->d_name.name,
 				       name) == 0;
 }
+EXPORT_SYMBOL_GPL(d_same_name);
 
 /**
  * __d_lookup_rcu - search for a dentry (racy, store-free)
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index f5bba51480b2..bb72361834de 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -233,6 +233,8 @@ extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
 					wait_queue_head_t *);
 extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
 extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
+extern bool d_same_name(const struct dentry *dentry, const struct dentry *parent,
+			const struct qstr *name);
 extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
 extern struct dentry *d_find_any_alias(struct inode *inode);
 extern struct dentry * d_obtain_alias(struct inode *);
-- 
cgit 


From d93231a6bc8a452323d5fef16cca7107ce483a27 Mon Sep 17 00:00:00 2001
From: Luís Henriques <lhenriques@suse.de>
Date: Fri, 3 Jun 2022 14:29:09 +0100
Subject: ceph: prevent a client from exceeding the MDS maximum xattr size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The MDS tries to enforce a limit on the total key/values in extended
attributes.  However, this limit is enforced only if doing a synchronous
operation (MDS_OP_SETXATTR) -- if we're buffering the xattrs, the MDS
doesn't have a chance to enforce these limits.

This patch adds support for decoding the xattrs maximum size setting that is
distributed in the mdsmap.  Then, when setting an xattr, the kernel client
will revert to do a synchronous operation if that maximum size is exceeded.

While there, fix a dout() that would trigger a printk warning:

[   98.718078] ------------[ cut here ]------------
[   98.719012] precision 65536 too large
[   98.719039] WARNING: CPU: 1 PID: 3755 at lib/vsprintf.c:2703 vsnprintf+0x5e3/0x600
...

Link: https://tracker.ceph.com/issues/55725
Signed-off-by: Luís Henriques <lhenriques@suse.de>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/mdsmap.c            | 22 ++++++++++++++++++----
 fs/ceph/xattr.c             | 12 ++++++++----
 include/linux/ceph/mdsmap.h |  1 +
 3 files changed, 27 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 30387733765d..8d0a6d2c2da4 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -352,12 +352,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
 		__decode_and_drop_type(p, end, u8, bad_ext);
 	}
 	if (mdsmap_ev >= 8) {
-		u32 name_len;
 		/* enabled */
 		ceph_decode_8_safe(p, end, m->m_enabled, bad_ext);
-		ceph_decode_32_safe(p, end, name_len, bad_ext);
-		ceph_decode_need(p, end, name_len, bad_ext);
-		*p += name_len;
+		/* fs_name */
+		ceph_decode_skip_string(p, end, bad_ext);
 	}
 	/* damaged */
 	if (mdsmap_ev >= 9) {
@@ -370,6 +368,22 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
 	} else {
 		m->m_damaged = false;
 	}
+	if (mdsmap_ev >= 17) {
+		/* balancer */
+		ceph_decode_skip_string(p, end, bad_ext);
+		/* standby_count_wanted */
+		ceph_decode_skip_32(p, end, bad_ext);
+		/* old_max_mds */
+		ceph_decode_skip_32(p, end, bad_ext);
+		/* min_compat_client */
+		ceph_decode_skip_8(p, end, bad_ext);
+		/* required_client_features */
+		ceph_decode_skip_set(p, end, 64, bad_ext);
+		ceph_decode_64_safe(p, end, m->m_max_xattr_size, bad_ext);
+	} else {
+		/* This forces the usage of the (sync) SETXATTR Op */
+		m->m_max_xattr_size = 0;
+	}
 bad_ext:
 	dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
 	     !!m->m_enabled, !!m->m_damaged, m->m_num_laggy);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index f141f5246163..f31350cda960 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1086,7 +1086,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
 			flags |= CEPH_XATTR_REMOVE;
 	}
 
-	dout("setxattr value=%.*s\n", (int)size, value);
+	dout("setxattr value size: %zu\n", size);
 
 	/* do request */
 	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
@@ -1184,8 +1184,14 @@ int __ceph_setxattr(struct inode *inode, const char *name,
 	spin_lock(&ci->i_ceph_lock);
 retry:
 	issued = __ceph_caps_issued(ci, NULL);
-	if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
+	required_blob_size = __get_required_blob_size(ci, name_len, val_len);
+	if ((ci->i_xattrs.version == 0) || !(issued & CEPH_CAP_XATTR_EXCL) ||
+	    (required_blob_size > mdsc->mdsmap->m_max_xattr_size)) {
+		dout("%s do sync setxattr: version: %llu size: %d max: %llu\n",
+		     __func__, ci->i_xattrs.version, required_blob_size,
+		     mdsc->mdsmap->m_max_xattr_size);
 		goto do_sync;
+	}
 
 	if (!lock_snap_rwsem && !ci->i_head_snapc) {
 		lock_snap_rwsem = true;
@@ -1201,8 +1207,6 @@ retry:
 	     ceph_cap_string(issued));
 	__build_xattrs(inode);
 
-	required_blob_size = __get_required_blob_size(ci, name_len, val_len);
-
 	if (!ci->i_xattrs.prealloc_blob ||
 	    required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
 		struct ceph_buffer *blob;
diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
index 523fd0452856..4c3e0648dc27 100644
--- a/include/linux/ceph/mdsmap.h
+++ b/include/linux/ceph/mdsmap.h
@@ -25,6 +25,7 @@ struct ceph_mdsmap {
 	u32 m_session_timeout;          /* seconds */
 	u32 m_session_autoclose;        /* seconds */
 	u64 m_max_file_size;
+	u64 m_max_xattr_size;		/* maximum size for xattrs blob */
 	u32 m_max_mds;			/* expected up:active mds number */
 	u32 m_num_active_mds;		/* actual up:active mds number */
 	u32 possible_max_rank;		/* possible max rank index */
-- 
cgit 


From 1b7587d69ea7b8c350195392bfd30a0f31374ae4 Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Mon, 6 Jun 2022 20:15:33 +0800
Subject: ceph: fix the incorrect comment for the ceph_mds_caps struct

The incorrect comment is misleading. Acutally the last members
in ceph_mds_caps strcut is a union for none export and export
bodies.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/ceph_fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 86bf82dbd8b8..40740e234ce1 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -768,7 +768,7 @@ struct ceph_mds_caps {
 	__le32 xattr_len;
 	__le64 xattr_version;
 
-	/* filelock */
+	/* a union of non-export and export bodies. */
 	__le64 size, max_size, truncate_size;
 	__le32 truncate_seq;
 	struct ceph_timespec mtime, atime, ctime;
-- 
cgit 


From 020bc44a9fbf6946f42db503d11c9811f26dd9fd Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 10 Jun 2022 11:40:13 -0400
Subject: ceph: switch back to testing for NULL folio->private in
 ceph_dirty_folio

Willy requested that we change this back to warning on folio->private
being non-NULl. He's trying to kill off the PG_private flag, and so we'd
like to catch where it's non-NULL.

Add a VM_WARN_ON_FOLIO (since it doesn't exist yet) and change over to
using that instead of VM_BUG_ON_FOLIO along with testing the ->private
pointer.

[ xiubli: define VM_WARN_ON_FOLIO macro in case DEBUG_VM is disabled
  reported by kernel test robot <lkp@intel.com> ]

Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/addr.c          |  2 +-
 include/linux/mmdebug.h | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 01fe75b8d146..31fc04eeb4d0 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -122,7 +122,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
 	 * Reference snap context in folio->private.  Also set
 	 * PagePrivate so that we get invalidate_folio callback.
 	 */
-	VM_BUG_ON_FOLIO(folio_test_private(folio), folio);
+	VM_WARN_ON_FOLIO(folio->private, folio);
 	folio_attach_private(folio, snapc);
 
 	return ceph_fscache_dirty_folio(mapping, folio);
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index d7285f8148a3..15ae78cd2853 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -54,6 +54,15 @@ void dump_mm(const struct mm_struct *mm);
 	}								\
 	unlikely(__ret_warn_once);					\
 })
+#define VM_WARN_ON_FOLIO(cond, folio)		({			\
+	int __ret_warn = !!(cond);					\
+									\
+	if (unlikely(__ret_warn)) {					\
+		dump_page(&folio->page, "VM_WARN_ON_FOLIO(" __stringify(cond)")");\
+		WARN_ON(1);						\
+	}								\
+	unlikely(__ret_warn);						\
+})
 #define VM_WARN_ON_ONCE_FOLIO(cond, folio)	({			\
 	static bool __section(".data.once") __warned;			\
 	int __ret_warn_once = !!(cond);					\
@@ -79,6 +88,7 @@ void dump_mm(const struct mm_struct *mm);
 #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ON_ONCE_PAGE(cond, page)  BUILD_BUG_ON_INVALID(cond)
+#define VM_WARN_ON_FOLIO(cond, folio)  BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ON_ONCE_FOLIO(cond, folio)  BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
-- 
cgit 


From b53aca4b460ae2ece453963ef01667ee8ee78f52 Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Thu, 9 Jun 2022 15:21:37 +0800
Subject: ceph: fix incorrect old_size length in ceph_mds_request_args

The 'old_size' is a __le64 type since birth, not sure why the
kclient incorrectly switched it to __le32. This change is okay
won't break anything because union will always allocate more memory
than the 'open' member needed.

Rename 'file_replication' to 'pool' as ceph did. Though this 'open'
struct may never be used in kclient in future, it's confusing when
going through the ceph code.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/ceph_fs.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 40740e234ce1..49586ff26152 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -433,9 +433,9 @@ union ceph_mds_request_args {
 		__le32 stripe_unit;          /* layout for newly created file */
 		__le32 stripe_count;         /* ... */
 		__le32 object_size;
-		__le32 file_replication;
-               __le32 mask;                 /* CEPH_CAP_* */
-               __le32 old_size;
+		__le32 pool;
+		__le32 mask;                 /* CEPH_CAP_* */
+		__le64 old_size;
 	} __attribute__ ((packed)) open;
 	struct {
 		__le32 flags;
-- 
cgit 


From 2c61c97fb12b806e1c8eb15f04c277ad097ec95e Mon Sep 17 00:00:00 2001
From: Michael Kelley <mikelley@microsoft.com>
Date: Wed, 8 Jun 2022 11:52:21 -0700
Subject: nvme: handle the persistent internal error AER

In the NVM Express Revision 1.4 spec, Figure 145 describes possible
values for an AER with event type "Error" (value 000b). For a
Persistent Internal Error (value 03h), the host should perform a
controller reset.

Add support for this error using code that already exists for
doing a controller reset. As part of this support, introduce
two utility functions for parsing the AER type and subtype.

This new support was tested in a lab environment where we can
generate the persistent internal error on demand, and observe
both the Linux side and NVMe controller side to see that the
controller reset has been done.

Signed-off-by: Michael Kelley <mikelley@microsoft.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/core.c | 31 +++++++++++++++++++++++++++++--
 include/linux/nvme.h     |  4 ++++
 2 files changed, 33 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2533b88e66d5..92d90dc82a9b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4525,9 +4525,19 @@ static void nvme_fw_act_work(struct work_struct *work)
 	nvme_get_fw_slot_info(ctrl);
 }
 
+static u32 nvme_aer_type(u32 result)
+{
+	return result & 0x7;
+}
+
+static u32 nvme_aer_subtype(u32 result)
+{
+	return (result & 0xff00) >> 8;
+}
+
 static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
 {
-	u32 aer_notice_type = (result & 0xff00) >> 8;
+	u32 aer_notice_type = nvme_aer_subtype(result);
 
 	trace_nvme_async_event(ctrl, aer_notice_type);
 
@@ -4560,11 +4570,19 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
 	}
 }
 
+static void nvme_handle_aer_persistent_error(struct nvme_ctrl *ctrl)
+{
+	trace_nvme_async_event(ctrl, NVME_AER_ERROR);
+	dev_warn(ctrl->device, "resetting controller due to AER\n");
+	nvme_reset_ctrl(ctrl);
+}
+
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 		volatile union nvme_result *res)
 {
 	u32 result = le32_to_cpu(res->u32);
-	u32 aer_type = result & 0x07;
+	u32 aer_type = nvme_aer_type(result);
+	u32 aer_subtype = nvme_aer_subtype(result);
 
 	if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
 		return;
@@ -4574,6 +4592,15 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 		nvme_handle_aen_notice(ctrl, result);
 		break;
 	case NVME_AER_ERROR:
+		/*
+		 * For a persistent internal error, don't run async_event_work
+		 * to submit a new AER. The controller reset will do it.
+		 */
+		if (aer_subtype == NVME_AER_ERROR_PERSIST_INT_ERR) {
+			nvme_handle_aer_persistent_error(ctrl);
+			return;
+		}
+		fallthrough;
 	case NVME_AER_SMART:
 	case NVME_AER_CSS:
 	case NVME_AER_VS:
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 07cfc922f8e4..3b8fc6c69529 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -711,6 +711,10 @@ enum {
 	NVME_AER_VS			= 7,
 };
 
+enum {
+	NVME_AER_ERROR_PERSIST_INT_ERR	= 0x03,
+};
+
 enum {
 	NVME_AER_NOTICE_NS_CHANGED	= 0x00,
 	NVME_AER_NOTICE_FW_ACT_STARTING = 0x01,
-- 
cgit 


From 85cc424381804386d991f81e08b4933ca1f04214 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 27 Jun 2022 11:51:57 +0200
Subject: crypto: add crypto_has_shash()

Add helper function to determine if a given synchronous hash is supported.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 crypto/shash.c        | 6 ++++++
 include/crypto/hash.h | 2 ++
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/crypto/shash.c b/crypto/shash.c
index 0a0a50cb694f..4c88e63b3350 100644
--- a/crypto/shash.c
+++ b/crypto/shash.c
@@ -521,6 +521,12 @@ struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
 }
 EXPORT_SYMBOL_GPL(crypto_alloc_shash);
 
+int crypto_has_shash(const char *alg_name, u32 type, u32 mask)
+{
+	return crypto_type_has_alg(alg_name, &crypto_shash_type, type, mask);
+}
+EXPORT_SYMBOL_GPL(crypto_has_shash);
+
 static int shash_prepare_alg(struct shash_alg *alg)
 {
 	struct crypto_alg *base = &alg->base;
diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index f140e4643949..f5841992dc9b 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -718,6 +718,8 @@ static inline void ahash_request_set_crypt(struct ahash_request *req,
 struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
 					u32 mask);
 
+int crypto_has_shash(const char *alg_name, u32 type, u32 mask);
+
 static inline struct crypto_tfm *crypto_shash_tfm(struct crypto_shash *tfm)
 {
 	return &tfm->base;
-- 
cgit 


From 9e2f284e149124aa9a6b963882d2b39ae1742196 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 27 Jun 2022 11:51:58 +0200
Subject: crypto: add crypto_has_kpp()

Add helper function to determine if a given key-agreement protocol
primitive is supported.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 crypto/kpp.c         | 6 ++++++
 include/crypto/kpp.h | 2 ++
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/crypto/kpp.c b/crypto/kpp.c
index 7aa6ba4b60a4..678e871ce418 100644
--- a/crypto/kpp.c
+++ b/crypto/kpp.c
@@ -104,6 +104,12 @@ int crypto_grab_kpp(struct crypto_kpp_spawn *spawn,
 }
 EXPORT_SYMBOL_GPL(crypto_grab_kpp);
 
+int crypto_has_kpp(const char *alg_name, u32 type, u32 mask)
+{
+	return crypto_type_has_alg(alg_name, &crypto_kpp_type, type, mask);
+}
+EXPORT_SYMBOL_GPL(crypto_has_kpp);
+
 static void kpp_prepare_alg(struct kpp_alg *alg)
 {
 	struct crypto_alg *base = &alg->base;
diff --git a/include/crypto/kpp.h b/include/crypto/kpp.h
index cccceadc164b..24d01e9877c1 100644
--- a/include/crypto/kpp.h
+++ b/include/crypto/kpp.h
@@ -104,6 +104,8 @@ struct kpp_alg {
  */
 struct crypto_kpp *crypto_alloc_kpp(const char *alg_name, u32 type, u32 mask);
 
+int crypto_has_kpp(const char *alg_name, u32 type, u32 mask);
+
 static inline struct crypto_tfm *crypto_kpp_tfm(struct crypto_kpp *tfm)
 {
 	return &tfm->base;
-- 
cgit 


From a116e1cdc64a743c36c40e6fa639dec991c157a3 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 27 Jun 2022 11:51:59 +0200
Subject: lib/base64: RFC4648-compliant base64 encoding

Add RFC4648-compliant base64 encoding and decoding routines, based on
the base64url encoding in fs/crypto/fname.c.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Cc: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/base64.h |  16 ++++++++
 lib/Makefile           |   2 +-
 lib/base64.c           | 103 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 120 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/base64.h
 create mode 100644 lib/base64.c

(limited to 'include')

diff --git a/include/linux/base64.h b/include/linux/base64.h
new file mode 100644
index 000000000000..660d4cb1ef31
--- /dev/null
+++ b/include/linux/base64.h
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * base64 encoding, lifted from fs/crypto/fname.c.
+ */
+
+#ifndef _LINUX_BASE64_H
+#define _LINUX_BASE64_H
+
+#include <linux/types.h>
+
+#define BASE64_CHARS(nbytes)   DIV_ROUND_UP((nbytes) * 4, 3)
+
+int base64_encode(const u8 *src, int len, char *dst);
+int base64_decode(const char *src, int len, u8 *dst);
+
+#endif /* _LINUX_BASE64_H */
diff --git a/lib/Makefile b/lib/Makefile
index f99bf61f8bbc..20d2caf6577d 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -46,7 +46,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \
 	 bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
 	 list_sort.o uuid.o iov_iter.o clz_ctz.o \
 	 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
-	 percpu-refcount.o rhashtable.o \
+	 percpu-refcount.o rhashtable.o base64.o \
 	 once.o refcount.o usercopy.o errseq.o bucket_locks.o \
 	 generic-radix-tree.o
 obj-$(CONFIG_STRING_SELFTEST) += test_string.o
diff --git a/lib/base64.c b/lib/base64.c
new file mode 100644
index 000000000000..b736a7a431c5
--- /dev/null
+++ b/lib/base64.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * base64.c - RFC4648-compliant base64 encoding
+ *
+ * Copyright (c) 2020 Hannes Reinecke, SUSE
+ *
+ * Based on the base64url routines from fs/crypto/fname.c
+ * (which are using the URL-safe base64 encoding),
+ * modified to use the standard coding table from RFC4648 section 4.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/export.h>
+#include <linux/string.h>
+#include <linux/base64.h>
+
+static const char base64_table[65] =
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+/**
+ * base64_encode() - base64-encode some binary data
+ * @src: the binary data to encode
+ * @srclen: the length of @src in bytes
+ * @dst: (output) the base64-encoded string.  Not NUL-terminated.
+ *
+ * Encodes data using base64 encoding, i.e. the "Base 64 Encoding" specified
+ * by RFC 4648, including the  '='-padding.
+ *
+ * Return: the length of the resulting base64-encoded string in bytes.
+ */
+int base64_encode(const u8 *src, int srclen, char *dst)
+{
+	u32 ac = 0;
+	int bits = 0;
+	int i;
+	char *cp = dst;
+
+	for (i = 0; i < srclen; i++) {
+		ac = (ac << 8) | src[i];
+		bits += 8;
+		do {
+			bits -= 6;
+			*cp++ = base64_table[(ac >> bits) & 0x3f];
+		} while (bits >= 6);
+	}
+	if (bits) {
+		*cp++ = base64_table[(ac << (6 - bits)) & 0x3f];
+		bits -= 6;
+	}
+	while (bits < 0) {
+		*cp++ = '=';
+		bits += 2;
+	}
+	return cp - dst;
+}
+EXPORT_SYMBOL_GPL(base64_encode);
+
+/**
+ * base64_decode() - base64-decode a string
+ * @src: the string to decode.  Doesn't need to be NUL-terminated.
+ * @srclen: the length of @src in bytes
+ * @dst: (output) the decoded binary data
+ *
+ * Decodes a string using base64 encoding, i.e. the "Base 64 Encoding"
+ * specified by RFC 4648, including the  '='-padding.
+ *
+ * This implementation hasn't been optimized for performance.
+ *
+ * Return: the length of the resulting decoded binary data in bytes,
+ *	   or -1 if the string isn't a valid base64 string.
+ */
+int base64_decode(const char *src, int srclen, u8 *dst)
+{
+	u32 ac = 0;
+	int bits = 0;
+	int i;
+	u8 *bp = dst;
+
+	for (i = 0; i < srclen; i++) {
+		const char *p = strchr(base64_table, src[i]);
+
+		if (src[i] == '=') {
+			ac = (ac << 6);
+			bits += 6;
+			if (bits >= 8)
+				bits -= 8;
+			continue;
+		}
+		if (p == NULL || src[i] == 0)
+			return -1;
+		ac = (ac << 6) | (p - base64_table);
+		bits += 6;
+		if (bits >= 8) {
+			bits -= 8;
+			*bp++ = (u8)(ac >> bits);
+		}
+	}
+	if (ac & ((1 << bits) - 1))
+		return -1;
+	return bp - dst;
+}
+EXPORT_SYMBOL_GPL(base64_decode);
-- 
cgit 


From 88b140fec07307f825170a45562013a80842cc93 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 27 Jun 2022 11:52:00 +0200
Subject: nvme: add definitions for NVMe In-Band authentication

Add new definitions for NVMe In-band authentication as defined in
the NVMe Base Specification v2.0.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/nvme.h | 209 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 208 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 3b8fc6c69529..ae53d74f3696 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -19,6 +19,7 @@
 #define NVMF_TRSVCID_SIZE	32
 #define NVMF_TRADDR_SIZE	256
 #define NVMF_TSAS_SIZE		256
+#define NVMF_AUTH_HASH_LEN	64
 
 #define NVME_DISC_SUBSYS_NAME	"nqn.2014-08.org.nvmexpress.discovery"
 
@@ -1373,6 +1374,8 @@ enum nvmf_capsule_command {
 	nvme_fabrics_type_property_set	= 0x00,
 	nvme_fabrics_type_connect	= 0x01,
 	nvme_fabrics_type_property_get	= 0x04,
+	nvme_fabrics_type_auth_send	= 0x05,
+	nvme_fabrics_type_auth_receive	= 0x06,
 };
 
 #define nvme_fabrics_type_name(type)   { type, #type }
@@ -1380,7 +1383,9 @@ enum nvmf_capsule_command {
 	__print_symbolic(type,						\
 		nvme_fabrics_type_name(nvme_fabrics_type_property_set),	\
 		nvme_fabrics_type_name(nvme_fabrics_type_connect),	\
-		nvme_fabrics_type_name(nvme_fabrics_type_property_get))
+		nvme_fabrics_type_name(nvme_fabrics_type_property_get), \
+		nvme_fabrics_type_name(nvme_fabrics_type_auth_send),	\
+		nvme_fabrics_type_name(nvme_fabrics_type_auth_receive))
 
 /*
  * If not fabrics command, fctype will be ignored.
@@ -1476,6 +1481,11 @@ struct nvmf_connect_command {
 	__u8		resv4[12];
 };
 
+enum {
+	NVME_CONNECT_AUTHREQ_ASCR	= (1 << 2),
+	NVME_CONNECT_AUTHREQ_ATR	= (1 << 1),
+};
+
 struct nvmf_connect_data {
 	uuid_t		hostid;
 	__le16		cntlid;
@@ -1510,6 +1520,200 @@ struct nvmf_property_get_command {
 	__u8		resv4[16];
 };
 
+struct nvmf_auth_common_command {
+	__u8		opcode;
+	__u8		resv1;
+	__u16		command_id;
+	__u8		fctype;
+	__u8		resv2[19];
+	union nvme_data_ptr dptr;
+	__u8		resv3;
+	__u8		spsp0;
+	__u8		spsp1;
+	__u8		secp;
+	__le32		al_tl;
+	__u8		resv4[16];
+};
+
+struct nvmf_auth_send_command {
+	__u8		opcode;
+	__u8		resv1;
+	__u16		command_id;
+	__u8		fctype;
+	__u8		resv2[19];
+	union nvme_data_ptr dptr;
+	__u8		resv3;
+	__u8		spsp0;
+	__u8		spsp1;
+	__u8		secp;
+	__le32		tl;
+	__u8		resv4[16];
+};
+
+struct nvmf_auth_receive_command {
+	__u8		opcode;
+	__u8		resv1;
+	__u16		command_id;
+	__u8		fctype;
+	__u8		resv2[19];
+	union nvme_data_ptr dptr;
+	__u8		resv3;
+	__u8		spsp0;
+	__u8		spsp1;
+	__u8		secp;
+	__le32		al;
+	__u8		resv4[16];
+};
+
+/* Value for secp */
+enum {
+	NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER	= 0xe9,
+};
+
+/* Defined value for auth_type */
+enum {
+	NVME_AUTH_COMMON_MESSAGES	= 0x00,
+	NVME_AUTH_DHCHAP_MESSAGES	= 0x01,
+};
+
+/* Defined messages for auth_id */
+enum {
+	NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE	= 0x00,
+	NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE	= 0x01,
+	NVME_AUTH_DHCHAP_MESSAGE_REPLY		= 0x02,
+	NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1	= 0x03,
+	NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2	= 0x04,
+	NVME_AUTH_DHCHAP_MESSAGE_FAILURE2	= 0xf0,
+	NVME_AUTH_DHCHAP_MESSAGE_FAILURE1	= 0xf1,
+};
+
+struct nvmf_auth_dhchap_protocol_descriptor {
+	__u8		authid;
+	__u8		rsvd;
+	__u8		halen;
+	__u8		dhlen;
+	__u8		idlist[60];
+};
+
+enum {
+	NVME_AUTH_DHCHAP_AUTH_ID	= 0x01,
+};
+
+/* Defined hash functions for DH-HMAC-CHAP authentication */
+enum {
+	NVME_AUTH_HASH_SHA256	= 0x01,
+	NVME_AUTH_HASH_SHA384	= 0x02,
+	NVME_AUTH_HASH_SHA512	= 0x03,
+	NVME_AUTH_HASH_INVALID	= 0xff,
+};
+
+/* Defined Diffie-Hellman group identifiers for DH-HMAC-CHAP authentication */
+enum {
+	NVME_AUTH_DHGROUP_NULL		= 0x00,
+	NVME_AUTH_DHGROUP_2048		= 0x01,
+	NVME_AUTH_DHGROUP_3072		= 0x02,
+	NVME_AUTH_DHGROUP_4096		= 0x03,
+	NVME_AUTH_DHGROUP_6144		= 0x04,
+	NVME_AUTH_DHGROUP_8192		= 0x05,
+	NVME_AUTH_DHGROUP_INVALID	= 0xff,
+};
+
+union nvmf_auth_protocol {
+	struct nvmf_auth_dhchap_protocol_descriptor dhchap;
+};
+
+struct nvmf_auth_dhchap_negotiate_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__le16		rsvd;
+	__le16		t_id;
+	__u8		sc_c;
+	__u8		napd;
+	union nvmf_auth_protocol auth_protocol[];
+};
+
+struct nvmf_auth_dhchap_challenge_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__u16		rsvd1;
+	__le16		t_id;
+	__u8		hl;
+	__u8		rsvd2;
+	__u8		hashid;
+	__u8		dhgid;
+	__le16		dhvlen;
+	__le32		seqnum;
+	/* 'hl' bytes of challenge value */
+	__u8		cval[];
+	/* followed by 'dhvlen' bytes of DH value */
+};
+
+struct nvmf_auth_dhchap_reply_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__le16		rsvd1;
+	__le16		t_id;
+	__u8		hl;
+	__u8		rsvd2;
+	__u8		cvalid;
+	__u8		rsvd3;
+	__le16		dhvlen;
+	__le32		seqnum;
+	/* 'hl' bytes of response data */
+	__u8		rval[];
+	/* followed by 'hl' bytes of Challenge value */
+	/* followed by 'dhvlen' bytes of DH value */
+};
+
+enum {
+	NVME_AUTH_DHCHAP_RESPONSE_VALID	= (1 << 0),
+};
+
+struct nvmf_auth_dhchap_success1_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__le16		rsvd1;
+	__le16		t_id;
+	__u8		hl;
+	__u8		rsvd2;
+	__u8		rvalid;
+	__u8		rsvd3[7];
+	/* 'hl' bytes of response value if 'rvalid' is set */
+	__u8		rval[];
+};
+
+struct nvmf_auth_dhchap_success2_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__le16		rsvd1;
+	__le16		t_id;
+	__u8		rsvd2[10];
+};
+
+struct nvmf_auth_dhchap_failure_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__le16		rsvd1;
+	__le16		t_id;
+	__u8		rescode;
+	__u8		rescode_exp;
+};
+
+enum {
+	NVME_AUTH_DHCHAP_FAILURE_REASON_FAILED	= 0x01,
+};
+
+enum {
+	NVME_AUTH_DHCHAP_FAILURE_FAILED			= 0x01,
+	NVME_AUTH_DHCHAP_FAILURE_NOT_USABLE		= 0x02,
+	NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH	= 0x03,
+	NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE		= 0x04,
+	NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE	= 0x05,
+	NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD	= 0x06,
+	NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE	= 0x07,
+};
+
+
 struct nvme_dbbuf {
 	__u8			opcode;
 	__u8			flags;
@@ -1553,6 +1757,9 @@ struct nvme_command {
 		struct nvmf_connect_command connect;
 		struct nvmf_property_set_command prop_set;
 		struct nvmf_property_get_command prop_get;
+		struct nvmf_auth_common_command auth_common;
+		struct nvmf_auth_send_command auth_send;
+		struct nvmf_auth_receive_command auth_receive;
 		struct nvme_dbbuf dbbuf;
 		struct nvme_directive_cmd directive;
 	};
-- 
cgit 


From f50fff73d620cd6e8f48bc58d4f1c944615a3fea Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 27 Jun 2022 11:52:02 +0200
Subject: nvme: implement In-Band authentication

Implement NVMe-oF In-Band authentication according to NVMe TPAR 8006.
This patch adds two new fabric options 'dhchap_secret' to specify the
pre-shared key (in ASCII respresentation according to NVMe 2.0 section
8.13.5.8 'Secret representation') and 'dhchap_ctrl_secret' to specify
the pre-shared controller key for bi-directional authentication of both
the host and the controller.
Re-authentication can be triggered by writing the PSK into the new
controller sysfs attribute 'dhchap_secret' or 'dhchap_ctrl_secret'.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
[axboe: fold in clang build fix]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/Kconfig         |   1 +
 drivers/nvme/Makefile        |   1 +
 drivers/nvme/common/Kconfig  |   4 +
 drivers/nvme/common/Makefile |   7 +
 drivers/nvme/common/auth.c   | 323 +++++++++++++++++
 drivers/nvme/host/Kconfig    |  13 +
 drivers/nvme/host/Makefile   |   1 +
 drivers/nvme/host/auth.c     | 828 +++++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/core.c     | 143 +++++++-
 drivers/nvme/host/fabrics.c  |  80 ++++-
 drivers/nvme/host/fabrics.h  |   7 +
 drivers/nvme/host/nvme.h     |  30 ++
 drivers/nvme/host/rdma.c     |   1 +
 drivers/nvme/host/tcp.c      |   1 +
 drivers/nvme/host/trace.c    |  32 ++
 include/linux/nvme-auth.h    |  33 ++
 16 files changed, 1498 insertions(+), 7 deletions(-)
 create mode 100644 drivers/nvme/common/Kconfig
 create mode 100644 drivers/nvme/common/Makefile
 create mode 100644 drivers/nvme/common/auth.c
 create mode 100644 drivers/nvme/host/auth.c
 create mode 100644 include/linux/nvme-auth.h

(limited to 'include')

diff --git a/drivers/nvme/Kconfig b/drivers/nvme/Kconfig
index 87ae409a32b9..656e46d938da 100644
--- a/drivers/nvme/Kconfig
+++ b/drivers/nvme/Kconfig
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 menu "NVME Support"
 
+source "drivers/nvme/common/Kconfig"
 source "drivers/nvme/host/Kconfig"
 source "drivers/nvme/target/Kconfig"
 
diff --git a/drivers/nvme/Makefile b/drivers/nvme/Makefile
index fb42c44609a8..eedca8c72098 100644
--- a/drivers/nvme/Makefile
+++ b/drivers/nvme/Makefile
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
+obj-$(CONFIG_NVME_COMMON)		+= common/
 obj-y		+= host/
 obj-y		+= target/
diff --git a/drivers/nvme/common/Kconfig b/drivers/nvme/common/Kconfig
new file mode 100644
index 000000000000..4514f44362dd
--- /dev/null
+++ b/drivers/nvme/common/Kconfig
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config NVME_COMMON
+       tristate
diff --git a/drivers/nvme/common/Makefile b/drivers/nvme/common/Makefile
new file mode 100644
index 000000000000..720c625b8a52
--- /dev/null
+++ b/drivers/nvme/common/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+
+ccflags-y			+= -I$(src)
+
+obj-$(CONFIG_NVME_COMMON)	+= nvme-common.o
+
+nvme-common-y			+= auth.o
diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c
new file mode 100644
index 000000000000..adb43d341ffb
--- /dev/null
+++ b/drivers/nvme/common/auth.c
@@ -0,0 +1,323 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020 Hannes Reinecke, SUSE Linux
+ */
+
+#include <linux/module.h>
+#include <linux/crc32.h>
+#include <linux/base64.h>
+#include <linux/prandom.h>
+#include <linux/scatterlist.h>
+#include <asm/unaligned.h>
+#include <crypto/hash.h>
+#include <crypto/dh.h>
+#include <linux/nvme.h>
+#include <linux/nvme-auth.h>
+
+static u32 nvme_dhchap_seqnum;
+static DEFINE_MUTEX(nvme_dhchap_mutex);
+
+u32 nvme_auth_get_seqnum(void)
+{
+	u32 seqnum;
+
+	mutex_lock(&nvme_dhchap_mutex);
+	if (!nvme_dhchap_seqnum)
+		nvme_dhchap_seqnum = prandom_u32();
+	else {
+		nvme_dhchap_seqnum++;
+		if (!nvme_dhchap_seqnum)
+			nvme_dhchap_seqnum++;
+	}
+	seqnum = nvme_dhchap_seqnum;
+	mutex_unlock(&nvme_dhchap_mutex);
+	return seqnum;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_get_seqnum);
+
+static struct nvme_auth_dhgroup_map {
+	const char name[16];
+	const char kpp[16];
+} dhgroup_map[] = {
+	[NVME_AUTH_DHGROUP_NULL] = {
+		.name = "null", .kpp = "null" },
+	[NVME_AUTH_DHGROUP_2048] = {
+		.name = "ffdhe2048", .kpp = "ffdhe2048(dh)" },
+	[NVME_AUTH_DHGROUP_3072] = {
+		.name = "ffdhe3072", .kpp = "ffdhe3072(dh)" },
+	[NVME_AUTH_DHGROUP_4096] = {
+		.name = "ffdhe4096", .kpp = "ffdhe4096(dh)" },
+	[NVME_AUTH_DHGROUP_6144] = {
+		.name = "ffdhe6144", .kpp = "ffdhe6144(dh)" },
+	[NVME_AUTH_DHGROUP_8192] = {
+		.name = "ffdhe8192", .kpp = "ffdhe8192(dh)" },
+};
+
+const char *nvme_auth_dhgroup_name(u8 dhgroup_id)
+{
+	if (dhgroup_id > ARRAY_SIZE(dhgroup_map))
+		return NULL;
+	return dhgroup_map[dhgroup_id].name;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_name);
+
+const char *nvme_auth_dhgroup_kpp(u8 dhgroup_id)
+{
+	if (dhgroup_id > ARRAY_SIZE(dhgroup_map))
+		return NULL;
+	return dhgroup_map[dhgroup_id].kpp;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_kpp);
+
+u8 nvme_auth_dhgroup_id(const char *dhgroup_name)
+{
+	int i;
+
+	if (!dhgroup_name || !strlen(dhgroup_name))
+		return NVME_AUTH_DHGROUP_INVALID;
+	for (i = 0; i < ARRAY_SIZE(dhgroup_map); i++) {
+		if (!strlen(dhgroup_map[i].name))
+			continue;
+		if (!strncmp(dhgroup_map[i].name, dhgroup_name,
+			     strlen(dhgroup_map[i].name)))
+			return i;
+	}
+	return NVME_AUTH_DHGROUP_INVALID;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_id);
+
+static struct nvme_dhchap_hash_map {
+	int len;
+	const char hmac[15];
+	const char digest[8];
+} hash_map[] = {
+	[NVME_AUTH_HASH_SHA256] = {
+		.len = 32,
+		.hmac = "hmac(sha256)",
+		.digest = "sha256",
+	},
+	[NVME_AUTH_HASH_SHA384] = {
+		.len = 48,
+		.hmac = "hmac(sha384)",
+		.digest = "sha384",
+	},
+	[NVME_AUTH_HASH_SHA512] = {
+		.len = 64,
+		.hmac = "hmac(sha512)",
+		.digest = "sha512",
+	},
+};
+
+const char *nvme_auth_hmac_name(u8 hmac_id)
+{
+	if (hmac_id > ARRAY_SIZE(hash_map))
+		return NULL;
+	return hash_map[hmac_id].hmac;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_hmac_name);
+
+const char *nvme_auth_digest_name(u8 hmac_id)
+{
+	if (hmac_id > ARRAY_SIZE(hash_map))
+		return NULL;
+	return hash_map[hmac_id].digest;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_digest_name);
+
+u8 nvme_auth_hmac_id(const char *hmac_name)
+{
+	int i;
+
+	if (!hmac_name || !strlen(hmac_name))
+		return NVME_AUTH_HASH_INVALID;
+
+	for (i = 0; i < ARRAY_SIZE(hash_map); i++) {
+		if (!strlen(hash_map[i].hmac))
+			continue;
+		if (!strncmp(hash_map[i].hmac, hmac_name,
+			     strlen(hash_map[i].hmac)))
+			return i;
+	}
+	return NVME_AUTH_HASH_INVALID;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_hmac_id);
+
+size_t nvme_auth_hmac_hash_len(u8 hmac_id)
+{
+	if (hmac_id > ARRAY_SIZE(hash_map))
+		return 0;
+	return hash_map[hmac_id].len;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_hmac_hash_len);
+
+struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
+					      u8 key_hash)
+{
+	struct nvme_dhchap_key *key;
+	unsigned char *p;
+	u32 crc;
+	int ret, key_len;
+	size_t allocated_len = strlen(secret);
+
+	/* Secret might be affixed with a ':' */
+	p = strrchr(secret, ':');
+	if (p)
+		allocated_len = p - secret;
+	key = kzalloc(sizeof(*key), GFP_KERNEL);
+	if (!key)
+		return ERR_PTR(-ENOMEM);
+	key->key = kzalloc(allocated_len, GFP_KERNEL);
+	if (!key->key) {
+		ret = -ENOMEM;
+		goto out_free_key;
+	}
+
+	key_len = base64_decode(secret, allocated_len, key->key);
+	if (key_len < 0) {
+		pr_debug("base64 key decoding error %d\n",
+			 key_len);
+		ret = key_len;
+		goto out_free_secret;
+	}
+
+	if (key_len != 36 && key_len != 52 &&
+	    key_len != 68) {
+		pr_err("Invalid key len %d\n", key_len);
+		ret = -EINVAL;
+		goto out_free_secret;
+	}
+
+	if (key_hash > 0 &&
+	    (key_len - 4) != nvme_auth_hmac_hash_len(key_hash)) {
+		pr_err("Mismatched key len %d for %s\n", key_len,
+		       nvme_auth_hmac_name(key_hash));
+		ret = -EINVAL;
+		goto out_free_secret;
+	}
+
+	/* The last four bytes is the CRC in little-endian format */
+	key_len -= 4;
+	/*
+	 * The linux implementation doesn't do pre- and post-increments,
+	 * so we have to do it manually.
+	 */
+	crc = ~crc32(~0, key->key, key_len);
+
+	if (get_unaligned_le32(key->key + key_len) != crc) {
+		pr_err("key crc mismatch (key %08x, crc %08x)\n",
+		       get_unaligned_le32(key->key + key_len), crc);
+		ret = -EKEYREJECTED;
+		goto out_free_secret;
+	}
+	key->len = key_len;
+	key->hash = key_hash;
+	return key;
+out_free_secret:
+	kfree_sensitive(key->key);
+out_free_key:
+	kfree(key);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_extract_key);
+
+void nvme_auth_free_key(struct nvme_dhchap_key *key)
+{
+	if (!key)
+		return;
+	kfree_sensitive(key->key);
+	kfree(key);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_free_key);
+
+u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn)
+{
+	const char *hmac_name;
+	struct crypto_shash *key_tfm;
+	struct shash_desc *shash;
+	u8 *transformed_key;
+	int ret;
+
+	if (!key || !key->key) {
+		pr_warn("No key specified\n");
+		return ERR_PTR(-ENOKEY);
+	}
+	if (key->hash == 0) {
+		transformed_key = kmemdup(key->key, key->len, GFP_KERNEL);
+		return transformed_key ? transformed_key : ERR_PTR(-ENOMEM);
+	}
+	hmac_name = nvme_auth_hmac_name(key->hash);
+	if (!hmac_name) {
+		pr_warn("Invalid key hash id %d\n", key->hash);
+		return ERR_PTR(-EINVAL);
+	}
+
+	key_tfm = crypto_alloc_shash(hmac_name, 0, 0);
+	if (IS_ERR(key_tfm))
+		return (u8 *)key_tfm;
+
+	shash = kmalloc(sizeof(struct shash_desc) +
+			crypto_shash_descsize(key_tfm),
+			GFP_KERNEL);
+	if (!shash) {
+		ret = -ENOMEM;
+		goto out_free_key;
+	}
+
+	transformed_key = kzalloc(crypto_shash_digestsize(key_tfm), GFP_KERNEL);
+	if (!transformed_key) {
+		ret = -ENOMEM;
+		goto out_free_shash;
+	}
+
+	shash->tfm = key_tfm;
+	ret = crypto_shash_setkey(key_tfm, key->key, key->len);
+	if (ret < 0)
+		goto out_free_shash;
+	ret = crypto_shash_init(shash);
+	if (ret < 0)
+		goto out_free_shash;
+	ret = crypto_shash_update(shash, nqn, strlen(nqn));
+	if (ret < 0)
+		goto out_free_shash;
+	ret = crypto_shash_update(shash, "NVMe-over-Fabrics", 17);
+	if (ret < 0)
+		goto out_free_shash;
+	ret = crypto_shash_final(shash, transformed_key);
+out_free_shash:
+	kfree(shash);
+out_free_key:
+	crypto_free_shash(key_tfm);
+	if (ret < 0) {
+		kfree_sensitive(transformed_key);
+		return ERR_PTR(ret);
+	}
+	return transformed_key;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_transform_key);
+
+int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key)
+{
+	struct nvme_dhchap_key *key;
+	u8 key_hash;
+
+	if (!secret) {
+		*ret_key = NULL;
+		return 0;
+	}
+
+	if (sscanf(secret, "DHHC-1:%hhd:%*s:", &key_hash) != 1)
+		return -EINVAL;
+
+	/* Pass in the secret without the 'DHHC-1:XX:' prefix */
+	key = nvme_auth_extract_key(secret + 10, key_hash);
+	if (IS_ERR(key)) {
+		*ret_key = NULL;
+		return PTR_ERR(key);
+	}
+
+	*ret_key = key;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_generate_key);
+
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 877d2ec4ea9f..6c503f42f3c6 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -92,6 +92,19 @@ config NVME_TCP
 
 	  If unsure, say N.
 
+config NVME_AUTH
+	bool "NVM Express over Fabrics In-Band Authentication"
+	depends on NVME_CORE
+	select NVME_COMMON
+	select CRYPTO
+	select CRYPTO_HMAC
+	select CRYPTO_SHA256
+	select CRYPTO_SHA512
+	help
+	  This provides support for NVMe over Fabrics In-Band Authentication.
+
+	  If unsure, say N.
+
 config NVME_APPLE
 	tristate "Apple ANS2 NVM Express host driver"
 	depends on OF && BLOCK
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index a36ae1612059..a3e88f32f560 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -16,6 +16,7 @@ nvme-core-$(CONFIG_NVME_MULTIPATH)	+= multipath.o
 nvme-core-$(CONFIG_BLK_DEV_ZONED)	+= zns.o
 nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)	+= fault_inject.o
 nvme-core-$(CONFIG_NVME_HWMON)		+= hwmon.o
+nvme-core-$(CONFIG_NVME_AUTH)		+= auth.o
 
 nvme-y					+= pci.o
 
diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c
new file mode 100644
index 000000000000..9766bfffecac
--- /dev/null
+++ b/drivers/nvme/host/auth.c
@@ -0,0 +1,828 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020 Hannes Reinecke, SUSE Linux
+ */
+
+#include <linux/crc32.h>
+#include <linux/base64.h>
+#include <linux/prandom.h>
+#include <asm/unaligned.h>
+#include <crypto/hash.h>
+#include <crypto/dh.h>
+#include "nvme.h"
+#include "fabrics.h"
+#include <linux/nvme-auth.h>
+
+struct nvme_dhchap_queue_context {
+	struct list_head entry;
+	struct work_struct auth_work;
+	struct nvme_ctrl *ctrl;
+	struct crypto_shash *shash_tfm;
+	void *buf;
+	size_t buf_size;
+	int qid;
+	int error;
+	u32 s1;
+	u32 s2;
+	u16 transaction;
+	u8 status;
+	u8 hash_id;
+	size_t hash_len;
+	u8 dhgroup_id;
+	u8 c1[64];
+	u8 c2[64];
+	u8 response[64];
+	u8 *host_response;
+};
+
+#define nvme_auth_flags_from_qid(qid) \
+	(qid == 0) ? 0 : BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED
+#define nvme_auth_queue_from_qid(ctrl, qid) \
+	(qid == 0) ? (ctrl)->fabrics_q : (ctrl)->connect_q
+
+static int nvme_auth_submit(struct nvme_ctrl *ctrl, int qid,
+			    void *data, size_t data_len, bool auth_send)
+{
+	struct nvme_command cmd = {};
+	blk_mq_req_flags_t flags = nvme_auth_flags_from_qid(qid);
+	struct request_queue *q = nvme_auth_queue_from_qid(ctrl, qid);
+	int ret;
+
+	cmd.auth_common.opcode = nvme_fabrics_command;
+	cmd.auth_common.secp = NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER;
+	cmd.auth_common.spsp0 = 0x01;
+	cmd.auth_common.spsp1 = 0x01;
+	if (auth_send) {
+		cmd.auth_send.fctype = nvme_fabrics_type_auth_send;
+		cmd.auth_send.tl = cpu_to_le32(data_len);
+	} else {
+		cmd.auth_receive.fctype = nvme_fabrics_type_auth_receive;
+		cmd.auth_receive.al = cpu_to_le32(data_len);
+	}
+
+	ret = __nvme_submit_sync_cmd(q, &cmd, NULL, data, data_len,
+				     qid == 0 ? NVME_QID_ANY : qid,
+				     0, flags);
+	if (ret > 0)
+		dev_warn(ctrl->device,
+			"qid %d auth_send failed with status %d\n", qid, ret);
+	else if (ret < 0)
+		dev_err(ctrl->device,
+			"qid %d auth_send failed with error %d\n", qid, ret);
+	return ret;
+}
+
+static int nvme_auth_receive_validate(struct nvme_ctrl *ctrl, int qid,
+		struct nvmf_auth_dhchap_failure_data *data,
+		u16 transaction, u8 expected_msg)
+{
+	dev_dbg(ctrl->device, "%s: qid %d auth_type %d auth_id %x\n",
+		__func__, qid, data->auth_type, data->auth_id);
+
+	if (data->auth_type == NVME_AUTH_COMMON_MESSAGES &&
+	    data->auth_id == NVME_AUTH_DHCHAP_MESSAGE_FAILURE1) {
+		return data->rescode_exp;
+	}
+	if (data->auth_type != NVME_AUTH_DHCHAP_MESSAGES ||
+	    data->auth_id != expected_msg) {
+		dev_warn(ctrl->device,
+			 "qid %d invalid message %02x/%02x\n",
+			 qid, data->auth_type, data->auth_id);
+		return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE;
+	}
+	if (le16_to_cpu(data->t_id) != transaction) {
+		dev_warn(ctrl->device,
+			 "qid %d invalid transaction ID %d\n",
+			 qid, le16_to_cpu(data->t_id));
+		return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE;
+	}
+	return 0;
+}
+
+static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	struct nvmf_auth_dhchap_negotiate_data *data = chap->buf;
+	size_t size = sizeof(*data) + sizeof(union nvmf_auth_protocol);
+
+	if (chap->buf_size < size) {
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return -EINVAL;
+	}
+	memset((u8 *)chap->buf, 0, size);
+	data->auth_type = NVME_AUTH_COMMON_MESSAGES;
+	data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE;
+	data->t_id = cpu_to_le16(chap->transaction);
+	data->sc_c = 0; /* No secure channel concatenation */
+	data->napd = 1;
+	data->auth_protocol[0].dhchap.authid = NVME_AUTH_DHCHAP_AUTH_ID;
+	data->auth_protocol[0].dhchap.halen = 3;
+	data->auth_protocol[0].dhchap.dhlen = 6;
+	data->auth_protocol[0].dhchap.idlist[0] = NVME_AUTH_HASH_SHA256;
+	data->auth_protocol[0].dhchap.idlist[1] = NVME_AUTH_HASH_SHA384;
+	data->auth_protocol[0].dhchap.idlist[2] = NVME_AUTH_HASH_SHA512;
+	data->auth_protocol[0].dhchap.idlist[30] = NVME_AUTH_DHGROUP_NULL;
+	data->auth_protocol[0].dhchap.idlist[31] = NVME_AUTH_DHGROUP_2048;
+	data->auth_protocol[0].dhchap.idlist[32] = NVME_AUTH_DHGROUP_3072;
+	data->auth_protocol[0].dhchap.idlist[33] = NVME_AUTH_DHGROUP_4096;
+	data->auth_protocol[0].dhchap.idlist[34] = NVME_AUTH_DHGROUP_6144;
+	data->auth_protocol[0].dhchap.idlist[35] = NVME_AUTH_DHGROUP_8192;
+
+	return size;
+}
+
+static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	struct nvmf_auth_dhchap_challenge_data *data = chap->buf;
+	u16 dhvlen = le16_to_cpu(data->dhvlen);
+	size_t size = sizeof(*data) + data->hl + dhvlen;
+	const char *hmac_name, *kpp_name;
+
+	if (chap->buf_size < size) {
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return NVME_SC_INVALID_FIELD;
+	}
+
+	hmac_name = nvme_auth_hmac_name(data->hashid);
+	if (!hmac_name) {
+		dev_warn(ctrl->device,
+			 "qid %d: invalid HASH ID %d\n",
+			 chap->qid, data->hashid);
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
+		return NVME_SC_INVALID_FIELD;
+	}
+
+	if (chap->hash_id == data->hashid && chap->shash_tfm &&
+	    !strcmp(crypto_shash_alg_name(chap->shash_tfm), hmac_name) &&
+	    crypto_shash_digestsize(chap->shash_tfm) == data->hl) {
+		dev_dbg(ctrl->device,
+			"qid %d: reuse existing hash %s\n",
+			chap->qid, hmac_name);
+		goto select_kpp;
+	}
+
+	/* Reset if hash cannot be reused */
+	if (chap->shash_tfm) {
+		crypto_free_shash(chap->shash_tfm);
+		chap->hash_id = 0;
+		chap->hash_len = 0;
+	}
+	chap->shash_tfm = crypto_alloc_shash(hmac_name, 0,
+					     CRYPTO_ALG_ALLOCATES_MEMORY);
+	if (IS_ERR(chap->shash_tfm)) {
+		dev_warn(ctrl->device,
+			 "qid %d: failed to allocate hash %s, error %ld\n",
+			 chap->qid, hmac_name, PTR_ERR(chap->shash_tfm));
+		chap->shash_tfm = NULL;
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+		return NVME_SC_AUTH_REQUIRED;
+	}
+
+	if (crypto_shash_digestsize(chap->shash_tfm) != data->hl) {
+		dev_warn(ctrl->device,
+			 "qid %d: invalid hash length %d\n",
+			 chap->qid, data->hl);
+		crypto_free_shash(chap->shash_tfm);
+		chap->shash_tfm = NULL;
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
+		return NVME_SC_AUTH_REQUIRED;
+	}
+
+	/* Reset host response if the hash had been changed */
+	if (chap->hash_id != data->hashid) {
+		kfree(chap->host_response);
+		chap->host_response = NULL;
+	}
+
+	chap->hash_id = data->hashid;
+	chap->hash_len = data->hl;
+	dev_dbg(ctrl->device, "qid %d: selected hash %s\n",
+		chap->qid, hmac_name);
+
+select_kpp:
+	kpp_name = nvme_auth_dhgroup_kpp(data->dhgid);
+	if (!kpp_name) {
+		dev_warn(ctrl->device,
+			 "qid %d: invalid DH group id %d\n",
+			 chap->qid, data->dhgid);
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
+		return NVME_SC_AUTH_REQUIRED;
+	}
+
+	if (data->dhgid != NVME_AUTH_DHGROUP_NULL) {
+		dev_warn(ctrl->device,
+			 "qid %d: unsupported DH group %s\n",
+			 chap->qid, kpp_name);
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
+		return NVME_SC_AUTH_REQUIRED;
+	} else if (dhvlen != 0) {
+		dev_warn(ctrl->device,
+			 "qid %d: invalid DH value for NULL DH\n",
+			 chap->qid);
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return NVME_SC_INVALID_FIELD;
+	}
+	chap->dhgroup_id = data->dhgid;
+
+	chap->s1 = le32_to_cpu(data->seqnum);
+	memcpy(chap->c1, data->cval, chap->hash_len);
+
+	return 0;
+}
+
+static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	struct nvmf_auth_dhchap_reply_data *data = chap->buf;
+	size_t size = sizeof(*data);
+
+	size += 2 * chap->hash_len;
+
+	if (chap->buf_size < size) {
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return -EINVAL;
+	}
+
+	memset(chap->buf, 0, size);
+	data->auth_type = NVME_AUTH_DHCHAP_MESSAGES;
+	data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_REPLY;
+	data->t_id = cpu_to_le16(chap->transaction);
+	data->hl = chap->hash_len;
+	data->dhvlen = 0;
+	memcpy(data->rval, chap->response, chap->hash_len);
+	if (ctrl->ctrl_key) {
+		get_random_bytes(chap->c2, chap->hash_len);
+		data->cvalid = 1;
+		chap->s2 = nvme_auth_get_seqnum();
+		memcpy(data->rval + chap->hash_len, chap->c2,
+		       chap->hash_len);
+		dev_dbg(ctrl->device, "%s: qid %d ctrl challenge %*ph\n",
+			__func__, chap->qid, (int)chap->hash_len, chap->c2);
+	} else {
+		memset(chap->c2, 0, chap->hash_len);
+		chap->s2 = 0;
+	}
+	data->seqnum = cpu_to_le32(chap->s2);
+	return size;
+}
+
+static int nvme_auth_process_dhchap_success1(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	struct nvmf_auth_dhchap_success1_data *data = chap->buf;
+	size_t size = sizeof(*data);
+
+	if (ctrl->ctrl_key)
+		size += chap->hash_len;
+
+	if (chap->buf_size < size) {
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return NVME_SC_INVALID_FIELD;
+	}
+
+	if (data->hl != chap->hash_len) {
+		dev_warn(ctrl->device,
+			 "qid %d: invalid hash length %u\n",
+			 chap->qid, data->hl);
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
+		return NVME_SC_INVALID_FIELD;
+	}
+
+	/* Just print out information for the admin queue */
+	if (chap->qid == 0)
+		dev_info(ctrl->device,
+			 "qid 0: authenticated with hash %s dhgroup %s\n",
+			 nvme_auth_hmac_name(chap->hash_id),
+			 nvme_auth_dhgroup_name(chap->dhgroup_id));
+
+	if (!data->rvalid)
+		return 0;
+
+	/* Validate controller response */
+	if (memcmp(chap->response, data->rval, data->hl)) {
+		dev_dbg(ctrl->device, "%s: qid %d ctrl response %*ph\n",
+			__func__, chap->qid, (int)chap->hash_len, data->rval);
+		dev_dbg(ctrl->device, "%s: qid %d host response %*ph\n",
+			__func__, chap->qid, (int)chap->hash_len,
+			chap->response);
+		dev_warn(ctrl->device,
+			 "qid %d: controller authentication failed\n",
+			 chap->qid);
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+		return NVME_SC_AUTH_REQUIRED;
+	}
+
+	/* Just print out information for the admin queue */
+	if (chap->qid == 0)
+		dev_info(ctrl->device,
+			 "qid 0: controller authenticated\n");
+	return 0;
+}
+
+static int nvme_auth_set_dhchap_success2_data(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	struct nvmf_auth_dhchap_success2_data *data = chap->buf;
+	size_t size = sizeof(*data);
+
+	memset(chap->buf, 0, size);
+	data->auth_type = NVME_AUTH_DHCHAP_MESSAGES;
+	data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2;
+	data->t_id = cpu_to_le16(chap->transaction);
+
+	return size;
+}
+
+static int nvme_auth_set_dhchap_failure2_data(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	struct nvmf_auth_dhchap_failure_data *data = chap->buf;
+	size_t size = sizeof(*data);
+
+	memset(chap->buf, 0, size);
+	data->auth_type = NVME_AUTH_COMMON_MESSAGES;
+	data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_FAILURE2;
+	data->t_id = cpu_to_le16(chap->transaction);
+	data->rescode = NVME_AUTH_DHCHAP_FAILURE_REASON_FAILED;
+	data->rescode_exp = chap->status;
+
+	return size;
+}
+
+static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	SHASH_DESC_ON_STACK(shash, chap->shash_tfm);
+	u8 buf[4], *challenge = chap->c1;
+	int ret;
+
+	dev_dbg(ctrl->device, "%s: qid %d host response seq %u transaction %d\n",
+		__func__, chap->qid, chap->s1, chap->transaction);
+
+	if (!chap->host_response) {
+		chap->host_response = nvme_auth_transform_key(ctrl->host_key,
+						ctrl->opts->host->nqn);
+		if (IS_ERR(chap->host_response)) {
+			ret = PTR_ERR(chap->host_response);
+			chap->host_response = NULL;
+			return ret;
+		}
+	} else {
+		dev_dbg(ctrl->device, "%s: qid %d re-using host response\n",
+			__func__, chap->qid);
+	}
+
+	ret = crypto_shash_setkey(chap->shash_tfm,
+			chap->host_response, ctrl->host_key->len);
+	if (ret) {
+		dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n",
+			 chap->qid, ret);
+		goto out;
+	}
+
+	shash->tfm = chap->shash_tfm;
+	ret = crypto_shash_init(shash);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, challenge, chap->hash_len);
+	if (ret)
+		goto out;
+	put_unaligned_le32(chap->s1, buf);
+	ret = crypto_shash_update(shash, buf, 4);
+	if (ret)
+		goto out;
+	put_unaligned_le16(chap->transaction, buf);
+	ret = crypto_shash_update(shash, buf, 2);
+	if (ret)
+		goto out;
+	memset(buf, 0, sizeof(buf));
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, "HostHost", 8);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->opts->host->nqn,
+				  strlen(ctrl->opts->host->nqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->opts->subsysnqn,
+			    strlen(ctrl->opts->subsysnqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_final(shash, chap->response);
+out:
+	if (challenge != chap->c1)
+		kfree(challenge);
+	return ret;
+}
+
+static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	SHASH_DESC_ON_STACK(shash, chap->shash_tfm);
+	u8 *ctrl_response;
+	u8 buf[4], *challenge = chap->c2;
+	int ret;
+
+	ctrl_response = nvme_auth_transform_key(ctrl->ctrl_key,
+				ctrl->opts->subsysnqn);
+	if (IS_ERR(ctrl_response)) {
+		ret = PTR_ERR(ctrl_response);
+		return ret;
+	}
+	ret = crypto_shash_setkey(chap->shash_tfm,
+			ctrl_response, ctrl->ctrl_key->len);
+	if (ret) {
+		dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n",
+			 chap->qid, ret);
+		goto out;
+	}
+
+	dev_dbg(ctrl->device, "%s: qid %d ctrl response seq %u transaction %d\n",
+		__func__, chap->qid, chap->s2, chap->transaction);
+	dev_dbg(ctrl->device, "%s: qid %d challenge %*ph\n",
+		__func__, chap->qid, (int)chap->hash_len, challenge);
+	dev_dbg(ctrl->device, "%s: qid %d subsysnqn %s\n",
+		__func__, chap->qid, ctrl->opts->subsysnqn);
+	dev_dbg(ctrl->device, "%s: qid %d hostnqn %s\n",
+		__func__, chap->qid, ctrl->opts->host->nqn);
+	shash->tfm = chap->shash_tfm;
+	ret = crypto_shash_init(shash);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, challenge, chap->hash_len);
+	if (ret)
+		goto out;
+	put_unaligned_le32(chap->s2, buf);
+	ret = crypto_shash_update(shash, buf, 4);
+	if (ret)
+		goto out;
+	put_unaligned_le16(chap->transaction, buf);
+	ret = crypto_shash_update(shash, buf, 2);
+	if (ret)
+		goto out;
+	memset(buf, 0, 4);
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, "Controller", 10);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->opts->subsysnqn,
+				  strlen(ctrl->opts->subsysnqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->opts->host->nqn,
+				  strlen(ctrl->opts->host->nqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_final(shash, chap->response);
+out:
+	if (challenge != chap->c2)
+		kfree(challenge);
+	kfree(ctrl_response);
+	return ret;
+}
+
+static void __nvme_auth_reset(struct nvme_dhchap_queue_context *chap)
+{
+	chap->status = 0;
+	chap->error = 0;
+	chap->s1 = 0;
+	chap->s2 = 0;
+	chap->transaction = 0;
+	memset(chap->c1, 0, sizeof(chap->c1));
+	memset(chap->c2, 0, sizeof(chap->c2));
+}
+
+static void __nvme_auth_free(struct nvme_dhchap_queue_context *chap)
+{
+	__nvme_auth_reset(chap);
+	if (chap->shash_tfm)
+		crypto_free_shash(chap->shash_tfm);
+	kfree_sensitive(chap->host_response);
+	kfree(chap->buf);
+	kfree(chap);
+}
+
+static void __nvme_auth_work(struct work_struct *work)
+{
+	struct nvme_dhchap_queue_context *chap =
+		container_of(work, struct nvme_dhchap_queue_context, auth_work);
+	struct nvme_ctrl *ctrl = chap->ctrl;
+	size_t tl;
+	int ret = 0;
+
+	chap->transaction = ctrl->transaction++;
+
+	/* DH-HMAC-CHAP Step 1: send negotiate */
+	dev_dbg(ctrl->device, "%s: qid %d send negotiate\n",
+		__func__, chap->qid);
+	ret = nvme_auth_set_dhchap_negotiate_data(ctrl, chap);
+	if (ret < 0) {
+		chap->error = ret;
+		return;
+	}
+	tl = ret;
+	ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true);
+	if (ret) {
+		chap->error = ret;
+		return;
+	}
+
+	/* DH-HMAC-CHAP Step 2: receive challenge */
+	dev_dbg(ctrl->device, "%s: qid %d receive challenge\n",
+		__func__, chap->qid);
+
+	memset(chap->buf, 0, chap->buf_size);
+	ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false);
+	if (ret) {
+		dev_warn(ctrl->device,
+			 "qid %d failed to receive challenge, %s %d\n",
+			 chap->qid, ret < 0 ? "error" : "nvme status", ret);
+		chap->error = ret;
+		return;
+	}
+	ret = nvme_auth_receive_validate(ctrl, chap->qid, chap->buf, chap->transaction,
+					 NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE);
+	if (ret) {
+		chap->status = ret;
+		chap->error = NVME_SC_AUTH_REQUIRED;
+		return;
+	}
+
+	ret = nvme_auth_process_dhchap_challenge(ctrl, chap);
+	if (ret) {
+		/* Invalid challenge parameters */
+		chap->error = ret;
+		goto fail2;
+	}
+
+	dev_dbg(ctrl->device, "%s: qid %d host response\n",
+		__func__, chap->qid);
+	ret = nvme_auth_dhchap_setup_host_response(ctrl, chap);
+	if (ret) {
+		chap->error = ret;
+		goto fail2;
+	}
+
+	/* DH-HMAC-CHAP Step 3: send reply */
+	dev_dbg(ctrl->device, "%s: qid %d send reply\n",
+		__func__, chap->qid);
+	ret = nvme_auth_set_dhchap_reply_data(ctrl, chap);
+	if (ret < 0) {
+		chap->error = ret;
+		goto fail2;
+	}
+
+	tl = ret;
+	ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true);
+	if (ret) {
+		chap->error = ret;
+		goto fail2;
+	}
+
+	/* DH-HMAC-CHAP Step 4: receive success1 */
+	dev_dbg(ctrl->device, "%s: qid %d receive success1\n",
+		__func__, chap->qid);
+
+	memset(chap->buf, 0, chap->buf_size);
+	ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false);
+	if (ret) {
+		dev_warn(ctrl->device,
+			 "qid %d failed to receive success1, %s %d\n",
+			 chap->qid, ret < 0 ? "error" : "nvme status", ret);
+		chap->error = ret;
+		return;
+	}
+	ret = nvme_auth_receive_validate(ctrl, chap->qid,
+					 chap->buf, chap->transaction,
+					 NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1);
+	if (ret) {
+		chap->status = ret;
+		chap->error = NVME_SC_AUTH_REQUIRED;
+		return;
+	}
+
+	if (ctrl->ctrl_key) {
+		dev_dbg(ctrl->device,
+			"%s: qid %d controller response\n",
+			__func__, chap->qid);
+		ret = nvme_auth_dhchap_setup_ctrl_response(ctrl, chap);
+		if (ret) {
+			chap->error = ret;
+			goto fail2;
+		}
+	}
+
+	ret = nvme_auth_process_dhchap_success1(ctrl, chap);
+	if (ret) {
+		/* Controller authentication failed */
+		chap->error = NVME_SC_AUTH_REQUIRED;
+		goto fail2;
+	}
+
+	if (ctrl->ctrl_key) {
+		/* DH-HMAC-CHAP Step 5: send success2 */
+		dev_dbg(ctrl->device, "%s: qid %d send success2\n",
+			__func__, chap->qid);
+		tl = nvme_auth_set_dhchap_success2_data(ctrl, chap);
+		ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true);
+		if (ret)
+			chap->error = ret;
+	}
+	if (!ret) {
+		chap->error = 0;
+		return;
+	}
+
+fail2:
+	dev_dbg(ctrl->device, "%s: qid %d send failure2, status %x\n",
+		__func__, chap->qid, chap->status);
+	tl = nvme_auth_set_dhchap_failure2_data(ctrl, chap);
+	ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true);
+	/*
+	 * only update error if send failure2 failed and no other
+	 * error had been set during authentication.
+	 */
+	if (ret && !chap->error)
+		chap->error = ret;
+}
+
+int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid)
+{
+	struct nvme_dhchap_queue_context *chap;
+
+	if (!ctrl->host_key) {
+		dev_warn(ctrl->device, "qid %d: no key\n", qid);
+		return -ENOKEY;
+	}
+
+	if (ctrl->opts->dhchap_ctrl_secret && !ctrl->ctrl_key) {
+		dev_warn(ctrl->device, "qid %d: invalid ctrl key\n", qid);
+		return -ENOKEY;
+	}
+
+	mutex_lock(&ctrl->dhchap_auth_mutex);
+	/* Check if the context is already queued */
+	list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
+		WARN_ON(!chap->buf);
+		if (chap->qid == qid) {
+			dev_dbg(ctrl->device, "qid %d: re-using context\n", qid);
+			mutex_unlock(&ctrl->dhchap_auth_mutex);
+			flush_work(&chap->auth_work);
+			__nvme_auth_reset(chap);
+			queue_work(nvme_wq, &chap->auth_work);
+			return 0;
+		}
+	}
+	chap = kzalloc(sizeof(*chap), GFP_KERNEL);
+	if (!chap) {
+		mutex_unlock(&ctrl->dhchap_auth_mutex);
+		return -ENOMEM;
+	}
+	chap->qid = (qid == NVME_QID_ANY) ? 0 : qid;
+	chap->ctrl = ctrl;
+
+	/*
+	 * Allocate a large enough buffer for the entire negotiation:
+	 * 4k should be enough to ffdhe8192.
+	 */
+	chap->buf_size = 4096;
+	chap->buf = kzalloc(chap->buf_size, GFP_KERNEL);
+	if (!chap->buf) {
+		mutex_unlock(&ctrl->dhchap_auth_mutex);
+		kfree(chap);
+		return -ENOMEM;
+	}
+
+	INIT_WORK(&chap->auth_work, __nvme_auth_work);
+	list_add(&chap->entry, &ctrl->dhchap_auth_list);
+	mutex_unlock(&ctrl->dhchap_auth_mutex);
+	queue_work(nvme_wq, &chap->auth_work);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_negotiate);
+
+int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid)
+{
+	struct nvme_dhchap_queue_context *chap;
+	int ret;
+
+	mutex_lock(&ctrl->dhchap_auth_mutex);
+	list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
+		if (chap->qid != qid)
+			continue;
+		mutex_unlock(&ctrl->dhchap_auth_mutex);
+		flush_work(&chap->auth_work);
+		ret = chap->error;
+		return ret;
+	}
+	mutex_unlock(&ctrl->dhchap_auth_mutex);
+	return -ENXIO;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_wait);
+
+void nvme_auth_reset(struct nvme_ctrl *ctrl)
+{
+	struct nvme_dhchap_queue_context *chap;
+
+	mutex_lock(&ctrl->dhchap_auth_mutex);
+	list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
+		mutex_unlock(&ctrl->dhchap_auth_mutex);
+		flush_work(&chap->auth_work);
+		__nvme_auth_reset(chap);
+	}
+	mutex_unlock(&ctrl->dhchap_auth_mutex);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_reset);
+
+static void nvme_dhchap_auth_work(struct work_struct *work)
+{
+	struct nvme_ctrl *ctrl =
+		container_of(work, struct nvme_ctrl, dhchap_auth_work);
+	int ret, q;
+
+	/* Authenticate admin queue first */
+	ret = nvme_auth_negotiate(ctrl, 0);
+	if (ret) {
+		dev_warn(ctrl->device,
+			 "qid 0: error %d setting up authentication\n", ret);
+		return;
+	}
+	ret = nvme_auth_wait(ctrl, 0);
+	if (ret) {
+		dev_warn(ctrl->device,
+			 "qid 0: authentication failed\n");
+		return;
+	}
+
+	for (q = 1; q < ctrl->queue_count; q++) {
+		ret = nvme_auth_negotiate(ctrl, q);
+		if (ret) {
+			dev_warn(ctrl->device,
+				 "qid %d: error %d setting up authentication\n",
+				 q, ret);
+			break;
+		}
+	}
+
+	/*
+	 * Failure is a soft-state; credentials remain valid until
+	 * the controller terminates the connection.
+	 */
+}
+
+void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
+{
+	INIT_LIST_HEAD(&ctrl->dhchap_auth_list);
+	INIT_WORK(&ctrl->dhchap_auth_work, nvme_dhchap_auth_work);
+	mutex_init(&ctrl->dhchap_auth_mutex);
+	if (!ctrl->opts)
+		return;
+	nvme_auth_generate_key(ctrl->opts->dhchap_secret, &ctrl->host_key);
+	nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret, &ctrl->ctrl_key);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_init_ctrl);
+
+void nvme_auth_stop(struct nvme_ctrl *ctrl)
+{
+	struct nvme_dhchap_queue_context *chap = NULL, *tmp;
+
+	cancel_work_sync(&ctrl->dhchap_auth_work);
+	mutex_lock(&ctrl->dhchap_auth_mutex);
+	list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry)
+		cancel_work_sync(&chap->auth_work);
+	mutex_unlock(&ctrl->dhchap_auth_mutex);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_stop);
+
+void nvme_auth_free(struct nvme_ctrl *ctrl)
+{
+	struct nvme_dhchap_queue_context *chap = NULL, *tmp;
+
+	mutex_lock(&ctrl->dhchap_auth_mutex);
+	list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry) {
+		list_del_init(&chap->entry);
+		flush_work(&chap->auth_work);
+		__nvme_auth_free(chap);
+	}
+	mutex_unlock(&ctrl->dhchap_auth_mutex);
+	if (ctrl->host_key) {
+		nvme_auth_free_key(ctrl->host_key);
+		ctrl->host_key = NULL;
+	}
+	if (ctrl->ctrl_key) {
+		nvme_auth_free_key(ctrl->ctrl_key);
+		ctrl->ctrl_key = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(nvme_auth_free);
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 0fe0ed6a28ea..4aa869e37b4c 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -24,6 +24,7 @@
 
 #include "nvme.h"
 #include "fabrics.h"
+#include <linux/nvme-auth.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
@@ -330,6 +331,7 @@ enum nvme_disposition {
 	COMPLETE,
 	RETRY,
 	FAILOVER,
+	AUTHENTICATE,
 };
 
 static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
@@ -337,6 +339,9 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
 	if (likely(nvme_req(req)->status == 0))
 		return COMPLETE;
 
+	if ((nvme_req(req)->status & 0x7ff) == NVME_SC_AUTH_REQUIRED)
+		return AUTHENTICATE;
+
 	if (blk_noretry_request(req) ||
 	    (nvme_req(req)->status & NVME_SC_DNR) ||
 	    nvme_req(req)->retries >= nvme_max_retries)
@@ -375,11 +380,13 @@ static inline void nvme_end_req(struct request *req)
 
 void nvme_complete_rq(struct request *req)
 {
+	struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
+
 	trace_nvme_complete_rq(req);
 	nvme_cleanup_cmd(req);
 
-	if (nvme_req(req)->ctrl->kas)
-		nvme_req(req)->ctrl->comp_seen = true;
+	if (ctrl->kas)
+		ctrl->comp_seen = true;
 
 	switch (nvme_decide_disposition(req)) {
 	case COMPLETE:
@@ -391,6 +398,14 @@ void nvme_complete_rq(struct request *req)
 	case FAILOVER:
 		nvme_failover_req(req);
 		return;
+	case AUTHENTICATE:
+#ifdef CONFIG_NVME_AUTH
+		queue_work(nvme_wq, &ctrl->dhchap_auth_work);
+		nvme_retry_req(req);
+#else
+		nvme_end_req(req);
+#endif
+		return;
 	}
 }
 EXPORT_SYMBOL_GPL(nvme_complete_rq);
@@ -702,7 +717,9 @@ bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
 		switch (ctrl->state) {
 		case NVME_CTRL_CONNECTING:
 			if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
-			    req->cmd->fabrics.fctype == nvme_fabrics_type_connect)
+			    (req->cmd->fabrics.fctype == nvme_fabrics_type_connect ||
+			     req->cmd->fabrics.fctype == nvme_fabrics_type_auth_send ||
+			     req->cmd->fabrics.fctype == nvme_fabrics_type_auth_receive))
 				return true;
 			break;
 		default:
@@ -3609,6 +3626,108 @@ static ssize_t dctype_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(dctype);
 
+#ifdef CONFIG_NVME_AUTH
+static ssize_t nvme_ctrl_dhchap_secret_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+
+	if (!opts->dhchap_secret)
+		return sysfs_emit(buf, "none\n");
+	return sysfs_emit(buf, "%s\n", opts->dhchap_secret);
+}
+
+static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+	char *dhchap_secret;
+
+	if (!ctrl->opts->dhchap_secret)
+		return -EINVAL;
+	if (count < 7)
+		return -EINVAL;
+	if (memcmp(buf, "DHHC-1:", 7))
+		return -EINVAL;
+
+	dhchap_secret = kzalloc(count + 1, GFP_KERNEL);
+	if (!dhchap_secret)
+		return -ENOMEM;
+	memcpy(dhchap_secret, buf, count);
+	nvme_auth_stop(ctrl);
+	if (strcmp(dhchap_secret, opts->dhchap_secret)) {
+		int ret;
+
+		ret = nvme_auth_generate_key(dhchap_secret, &ctrl->host_key);
+		if (ret)
+			return ret;
+		kfree(opts->dhchap_secret);
+		opts->dhchap_secret = dhchap_secret;
+		/* Key has changed; re-authentication with new key */
+		nvme_auth_reset(ctrl);
+	}
+	/* Start re-authentication */
+	dev_info(ctrl->device, "re-authenticating controller\n");
+	queue_work(nvme_wq, &ctrl->dhchap_auth_work);
+
+	return count;
+}
+static DEVICE_ATTR(dhchap_secret, S_IRUGO | S_IWUSR,
+	nvme_ctrl_dhchap_secret_show, nvme_ctrl_dhchap_secret_store);
+
+static ssize_t nvme_ctrl_dhchap_ctrl_secret_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+
+	if (!opts->dhchap_ctrl_secret)
+		return sysfs_emit(buf, "none\n");
+	return sysfs_emit(buf, "%s\n", opts->dhchap_ctrl_secret);
+}
+
+static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+	char *dhchap_secret;
+
+	if (!ctrl->opts->dhchap_ctrl_secret)
+		return -EINVAL;
+	if (count < 7)
+		return -EINVAL;
+	if (memcmp(buf, "DHHC-1:", 7))
+		return -EINVAL;
+
+	dhchap_secret = kzalloc(count + 1, GFP_KERNEL);
+	if (!dhchap_secret)
+		return -ENOMEM;
+	memcpy(dhchap_secret, buf, count);
+	nvme_auth_stop(ctrl);
+	if (strcmp(dhchap_secret, opts->dhchap_ctrl_secret)) {
+		int ret;
+
+		ret = nvme_auth_generate_key(dhchap_secret, &ctrl->ctrl_key);
+		if (ret)
+			return ret;
+		kfree(opts->dhchap_ctrl_secret);
+		opts->dhchap_ctrl_secret = dhchap_secret;
+		/* Key has changed; re-authentication with new key */
+		nvme_auth_reset(ctrl);
+	}
+	/* Start re-authentication */
+	dev_info(ctrl->device, "re-authenticating controller\n");
+	queue_work(nvme_wq, &ctrl->dhchap_auth_work);
+
+	return count;
+}
+static DEVICE_ATTR(dhchap_ctrl_secret, S_IRUGO | S_IWUSR,
+	nvme_ctrl_dhchap_ctrl_secret_show, nvme_ctrl_dhchap_ctrl_secret_store);
+#endif
+
 static struct attribute *nvme_dev_attrs[] = {
 	&dev_attr_reset_controller.attr,
 	&dev_attr_rescan_controller.attr,
@@ -3632,6 +3751,10 @@ static struct attribute *nvme_dev_attrs[] = {
 	&dev_attr_kato.attr,
 	&dev_attr_cntrltype.attr,
 	&dev_attr_dctype.attr,
+#ifdef CONFIG_NVME_AUTH
+	&dev_attr_dhchap_secret.attr,
+	&dev_attr_dhchap_ctrl_secret.attr,
+#endif
 	NULL
 };
 
@@ -3655,6 +3778,12 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
 		return 0;
 	if (a == &dev_attr_fast_io_fail_tmo.attr && !ctrl->opts)
 		return 0;
+#ifdef CONFIG_NVME_AUTH
+	if (a == &dev_attr_dhchap_secret.attr && !ctrl->opts)
+		return 0;
+	if (a == &dev_attr_dhchap_ctrl_secret.attr && !ctrl->opts)
+		return 0;
+#endif
 
 	return a->mode;
 }
@@ -4548,8 +4677,10 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
 		 * recovery actions from interfering with the controller's
 		 * firmware activation.
 		 */
-		if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+		if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
+			nvme_auth_stop(ctrl);
 			queue_work(nvme_wq, &ctrl->fw_act_work);
+		}
 		break;
 #ifdef CONFIG_NVME_MULTIPATH
 	case NVME_AER_NOTICE_ANA:
@@ -4613,6 +4744,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event);
 void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
 {
 	nvme_mpath_stop(ctrl);
+	nvme_auth_stop(ctrl);
 	nvme_stop_keep_alive(ctrl);
 	nvme_stop_failfast_work(ctrl);
 	flush_work(&ctrl->async_event_work);
@@ -4672,6 +4804,8 @@ static void nvme_free_ctrl(struct device *dev)
 
 	nvme_free_cels(ctrl);
 	nvme_mpath_uninit(ctrl);
+	nvme_auth_stop(ctrl);
+	nvme_auth_free(ctrl);
 	__free_page(ctrl->discard_page);
 
 	if (subsys) {
@@ -4762,6 +4896,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 
 	nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
 	nvme_mpath_init_ctrl(ctrl);
+	nvme_auth_init_ctrl(ctrl);
 
 	return 0;
 out_free_name:
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index e4b1520862d8..5207a2348257 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -369,6 +369,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
 	union nvme_result res;
 	struct nvmf_connect_data *data;
 	int ret;
+	u32 result;
 
 	cmd.connect.opcode = nvme_fabrics_command;
 	cmd.connect.fctype = nvme_fabrics_type_connect;
@@ -401,8 +402,25 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
 		goto out_free_data;
 	}
 
-	ctrl->cntlid = le16_to_cpu(res.u16);
-
+	result = le32_to_cpu(res.u32);
+	ctrl->cntlid = result & 0xFFFF;
+	if ((result >> 16) & 0x3) {
+		/* Authentication required */
+		ret = nvme_auth_negotiate(ctrl, 0);
+		if (ret) {
+			dev_warn(ctrl->device,
+				 "qid 0: authentication setup failed\n");
+			ret = NVME_SC_AUTH_REQUIRED;
+			goto out_free_data;
+		}
+		ret = nvme_auth_wait(ctrl, 0);
+		if (ret)
+			dev_warn(ctrl->device,
+				 "qid 0: authentication failed\n");
+		else
+			dev_info(ctrl->device,
+				 "qid 0: authenticated\n");
+	}
 out_free_data:
 	kfree(data);
 	return ret;
@@ -435,6 +453,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
 	struct nvmf_connect_data *data;
 	union nvme_result res;
 	int ret;
+	u32 result;
 
 	cmd.connect.opcode = nvme_fabrics_command;
 	cmd.connect.fctype = nvme_fabrics_type_connect;
@@ -460,6 +479,21 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
 		nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32),
 				       &cmd, data);
 	}
+	result = le32_to_cpu(res.u32);
+	if ((result >> 16) & 2) {
+		/* Authentication required */
+		ret = nvme_auth_negotiate(ctrl, qid);
+		if (ret) {
+			dev_warn(ctrl->device,
+				 "qid %d: authentication setup failed\n", qid);
+			ret = NVME_SC_AUTH_REQUIRED;
+		} else {
+			ret = nvme_auth_wait(ctrl, qid);
+			if (ret)
+				dev_warn(ctrl->device,
+					 "qid %u: authentication failed\n", qid);
+		}
+	}
 	kfree(data);
 	return ret;
 }
@@ -552,6 +586,8 @@ static const match_table_t opt_tokens = {
 	{ NVMF_OPT_TOS,			"tos=%d"		},
 	{ NVMF_OPT_FAIL_FAST_TMO,	"fast_io_fail_tmo=%d"	},
 	{ NVMF_OPT_DISCOVERY,		"discovery"		},
+	{ NVMF_OPT_DHCHAP_SECRET,	"dhchap_secret=%s"	},
+	{ NVMF_OPT_DHCHAP_CTRL_SECRET,	"dhchap_ctrl_secret=%s"	},
 	{ NVMF_OPT_ERR,			NULL			}
 };
 
@@ -833,6 +869,34 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 		case NVMF_OPT_DISCOVERY:
 			opts->discovery_nqn = true;
 			break;
+		case NVMF_OPT_DHCHAP_SECRET:
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			if (strlen(p) < 11 || strncmp(p, "DHHC-1:", 7)) {
+				pr_err("Invalid DH-CHAP secret %s\n", p);
+				ret = -EINVAL;
+				goto out;
+			}
+			kfree(opts->dhchap_secret);
+			opts->dhchap_secret = p;
+			break;
+		case NVMF_OPT_DHCHAP_CTRL_SECRET:
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			if (strlen(p) < 11 || strncmp(p, "DHHC-1:", 7)) {
+				pr_err("Invalid DH-CHAP secret %s\n", p);
+				ret = -EINVAL;
+				goto out;
+			}
+			kfree(opts->dhchap_ctrl_secret);
+			opts->dhchap_ctrl_secret = p;
+			break;
 		default:
 			pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
 				p);
@@ -951,6 +1015,8 @@ void nvmf_free_options(struct nvmf_ctrl_options *opts)
 	kfree(opts->subsysnqn);
 	kfree(opts->host_traddr);
 	kfree(opts->host_iface);
+	kfree(opts->dhchap_secret);
+	kfree(opts->dhchap_ctrl_secret);
 	kfree(opts);
 }
 EXPORT_SYMBOL_GPL(nvmf_free_options);
@@ -960,7 +1026,8 @@ EXPORT_SYMBOL_GPL(nvmf_free_options);
 				 NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \
 				 NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\
 				 NVMF_OPT_DISABLE_SQFLOW | NVMF_OPT_DISCOVERY |\
-				 NVMF_OPT_FAIL_FAST_TMO)
+				 NVMF_OPT_FAIL_FAST_TMO | NVMF_OPT_DHCHAP_SECRET |\
+				 NVMF_OPT_DHCHAP_CTRL_SECRET)
 
 static struct nvme_ctrl *
 nvmf_create_ctrl(struct device *dev, const char *buf)
@@ -1196,7 +1263,14 @@ static void __exit nvmf_exit(void)
 	BUILD_BUG_ON(sizeof(struct nvmf_connect_command) != 64);
 	BUILD_BUG_ON(sizeof(struct nvmf_property_get_command) != 64);
 	BUILD_BUG_ON(sizeof(struct nvmf_property_set_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_send_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_receive_command) != 64);
 	BUILD_BUG_ON(sizeof(struct nvmf_connect_data) != 1024);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_negotiate_data) != 8);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_challenge_data) != 16);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_reply_data) != 16);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_success1_data) != 16);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_success2_data) != 16);
 }
 
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 46d6e194ac2b..a6e22116e139 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -68,6 +68,8 @@ enum {
 	NVMF_OPT_FAIL_FAST_TMO	= 1 << 20,
 	NVMF_OPT_HOST_IFACE	= 1 << 21,
 	NVMF_OPT_DISCOVERY	= 1 << 22,
+	NVMF_OPT_DHCHAP_SECRET	= 1 << 23,
+	NVMF_OPT_DHCHAP_CTRL_SECRET = 1 << 24,
 };
 
 /**
@@ -97,6 +99,9 @@ enum {
  * @max_reconnects: maximum number of allowed reconnect attempts before removing
  *              the controller, (-1) means reconnect forever, zero means remove
  *              immediately;
+ * @dhchap_secret: DH-HMAC-CHAP secret
+ * @dhchap_ctrl_secret: DH-HMAC-CHAP controller secret for bi-directional
+ *              authentication
  * @disable_sqflow: disable controller sq flow control
  * @hdr_digest: generate/verify header digest (TCP)
  * @data_digest: generate/verify data digest (TCP)
@@ -121,6 +126,8 @@ struct nvmf_ctrl_options {
 	unsigned int		kato;
 	struct nvmf_host	*host;
 	int			max_reconnects;
+	char			*dhchap_secret;
+	char			*dhchap_ctrl_secret;
 	bool			disable_sqflow;
 	bool			hdr_digest;
 	bool			data_digest;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 0085a3053e90..85929d70b776 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -328,6 +328,15 @@ struct nvme_ctrl {
 	struct work_struct ana_work;
 #endif
 
+#ifdef CONFIG_NVME_AUTH
+	struct work_struct dhchap_auth_work;
+	struct list_head dhchap_auth_list;
+	struct mutex dhchap_auth_mutex;
+	struct nvme_dhchap_key *host_key;
+	struct nvme_dhchap_key *ctrl_key;
+	u16 transaction;
+#endif
+
 	/* Power saving configuration */
 	u64 ps_max_latency_us;
 	bool apst_enabled;
@@ -992,6 +1001,27 @@ static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl)
 	return ctrl->sgls & ((1 << 0) | (1 << 1));
 }
 
+#ifdef CONFIG_NVME_AUTH
+void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl);
+void nvme_auth_stop(struct nvme_ctrl *ctrl);
+int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid);
+int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid);
+void nvme_auth_reset(struct nvme_ctrl *ctrl);
+void nvme_auth_free(struct nvme_ctrl *ctrl);
+#else
+static inline void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) {};
+static inline void nvme_auth_stop(struct nvme_ctrl *ctrl) {};
+static inline int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid)
+{
+	return -EPROTONOSUPPORT;
+}
+static inline int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid)
+{
+	return NVME_SC_AUTH_REQUIRED;
+}
+static inline void nvme_auth_free(struct nvme_ctrl *ctrl) {};
+#endif
+
 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 			 u8 opcode);
 int nvme_execute_passthru_rq(struct request *rq);
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 4665aebd944d..7ecdeb58bd27 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1205,6 +1205,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
 	struct nvme_rdma_ctrl *ctrl = container_of(work,
 			struct nvme_rdma_ctrl, err_work);
 
+	nvme_auth_stop(&ctrl->ctrl);
 	nvme_stop_keep_alive(&ctrl->ctrl);
 	flush_work(&ctrl->ctrl.async_event_work);
 	nvme_rdma_teardown_io_queues(ctrl, false);
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index b95ee85053e3..592ec078c5d7 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2173,6 +2173,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
 				struct nvme_tcp_ctrl, err_work);
 	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
 
+	nvme_auth_stop(ctrl);
 	nvme_stop_keep_alive(ctrl);
 	flush_work(&ctrl->async_event_work);
 	nvme_tcp_teardown_io_queues(ctrl, false);
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index 2a89c5aa0790..1c36fcedea20 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -287,6 +287,34 @@ static const char *nvme_trace_fabrics_property_get(struct trace_seq *p, u8 *spc)
 	return ret;
 }
 
+static const char *nvme_trace_fabrics_auth_send(struct trace_seq *p, u8 *spc)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u8 spsp0 = spc[1];
+	u8 spsp1 = spc[2];
+	u8 secp = spc[3];
+	u32 tl = get_unaligned_le32(spc + 4);
+
+	trace_seq_printf(p, "spsp0=%02x, spsp1=%02x, secp=%02x, tl=%u",
+			 spsp0, spsp1, secp, tl);
+	trace_seq_putc(p, 0);
+	return ret;
+}
+
+static const char *nvme_trace_fabrics_auth_receive(struct trace_seq *p, u8 *spc)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u8 spsp0 = spc[1];
+	u8 spsp1 = spc[2];
+	u8 secp = spc[3];
+	u32 al = get_unaligned_le32(spc + 4);
+
+	trace_seq_printf(p, "spsp0=%02x, spsp1=%02x, secp=%02x, al=%u",
+			 spsp0, spsp1, secp, al);
+	trace_seq_putc(p, 0);
+	return ret;
+}
+
 static const char *nvme_trace_fabrics_common(struct trace_seq *p, u8 *spc)
 {
 	const char *ret = trace_seq_buffer_ptr(p);
@@ -306,6 +334,10 @@ const char *nvme_trace_parse_fabrics_cmd(struct trace_seq *p,
 		return nvme_trace_fabrics_connect(p, spc);
 	case nvme_fabrics_type_property_get:
 		return nvme_trace_fabrics_property_get(p, spc);
+	case nvme_fabrics_type_auth_send:
+		return nvme_trace_fabrics_auth_send(p, spc);
+	case nvme_fabrics_type_auth_receive:
+		return nvme_trace_fabrics_auth_receive(p, spc);
 	default:
 		return nvme_trace_fabrics_common(p, spc);
 	}
diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h
new file mode 100644
index 000000000000..354456826221
--- /dev/null
+++ b/include/linux/nvme-auth.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2021 Hannes Reinecke, SUSE Software Solutions
+ */
+
+#ifndef _NVME_AUTH_H
+#define _NVME_AUTH_H
+
+#include <crypto/kpp.h>
+
+struct nvme_dhchap_key {
+	u8 *key;
+	size_t len;
+	u8 hash;
+};
+
+u32 nvme_auth_get_seqnum(void);
+const char *nvme_auth_dhgroup_name(u8 dhgroup_id);
+const char *nvme_auth_dhgroup_kpp(u8 dhgroup_id);
+u8 nvme_auth_dhgroup_id(const char *dhgroup_name);
+
+const char *nvme_auth_hmac_name(u8 hmac_id);
+const char *nvme_auth_digest_name(u8 hmac_id);
+size_t nvme_auth_hmac_hash_len(u8 hmac_id);
+u8 nvme_auth_hmac_id(const char *hmac_name);
+
+struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
+					      u8 key_hash);
+void nvme_auth_free_key(struct nvme_dhchap_key *key);
+u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn);
+int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key);
+
+#endif /* _NVME_AUTH_H */
-- 
cgit 


From b61775d185a395f26fecdc7898e39de677a6c3dd Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 27 Jun 2022 11:52:03 +0200
Subject: nvme-auth: Diffie-Hellman key exchange support

Implement Diffie-Hellman key exchange using FFDHE groups
for NVMe In-Band Authentication.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/common/auth.c | 153 ++++++++++++++++++++++++++++++++++
 drivers/nvme/host/Kconfig  |   2 +
 drivers/nvme/host/auth.c   | 201 +++++++++++++++++++++++++++++++++++++++++++--
 include/linux/nvme-auth.h  |   8 ++
 4 files changed, 358 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c
index adb43d341ffb..94cc7cafbb9d 100644
--- a/drivers/nvme/common/auth.c
+++ b/drivers/nvme/common/auth.c
@@ -295,6 +295,159 @@ out_free_key:
 }
 EXPORT_SYMBOL_GPL(nvme_auth_transform_key);
 
+static int nvme_auth_hash_skey(int hmac_id, u8 *skey, size_t skey_len, u8 *hkey)
+{
+	const char *digest_name;
+	struct crypto_shash *tfm;
+	int ret;
+
+	digest_name = nvme_auth_digest_name(hmac_id);
+	if (!digest_name) {
+		pr_debug("%s: failed to get digest for %d\n", __func__,
+			 hmac_id);
+		return -EINVAL;
+	}
+	tfm = crypto_alloc_shash(digest_name, 0, 0);
+	if (IS_ERR(tfm))
+		return -ENOMEM;
+
+	ret = crypto_shash_tfm_digest(tfm, skey, skey_len, hkey);
+	if (ret < 0)
+		pr_debug("%s: Failed to hash digest len %zu\n", __func__,
+			 skey_len);
+
+	crypto_free_shash(tfm);
+	return ret;
+}
+
+int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len,
+		u8 *challenge, u8 *aug, size_t hlen)
+{
+	struct crypto_shash *tfm;
+	struct shash_desc *desc;
+	u8 *hashed_key;
+	const char *hmac_name;
+	int ret;
+
+	hashed_key = kmalloc(hlen, GFP_KERNEL);
+	if (!hashed_key)
+		return -ENOMEM;
+
+	ret = nvme_auth_hash_skey(hmac_id, skey,
+				  skey_len, hashed_key);
+	if (ret < 0)
+		goto out_free_key;
+
+	hmac_name = nvme_auth_hmac_name(hmac_id);
+	if (!hmac_name) {
+		pr_warn("%s: invalid hash algoritm %d\n",
+			__func__, hmac_id);
+		ret = -EINVAL;
+		goto out_free_key;
+	}
+
+	tfm = crypto_alloc_shash(hmac_name, 0, 0);
+	if (IS_ERR(tfm)) {
+		ret = PTR_ERR(tfm);
+		goto out_free_key;
+	}
+
+	desc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm),
+		       GFP_KERNEL);
+	if (!desc) {
+		ret = -ENOMEM;
+		goto out_free_hash;
+	}
+	desc->tfm = tfm;
+
+	ret = crypto_shash_setkey(tfm, hashed_key, hlen);
+	if (ret)
+		goto out_free_desc;
+
+	ret = crypto_shash_init(desc);
+	if (ret)
+		goto out_free_desc;
+
+	ret = crypto_shash_update(desc, challenge, hlen);
+	if (ret)
+		goto out_free_desc;
+
+	ret = crypto_shash_final(desc, aug);
+out_free_desc:
+	kfree_sensitive(desc);
+out_free_hash:
+	crypto_free_shash(tfm);
+out_free_key:
+	kfree_sensitive(hashed_key);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_augmented_challenge);
+
+int nvme_auth_gen_privkey(struct crypto_kpp *dh_tfm, u8 dh_gid)
+{
+	int ret;
+
+	ret = crypto_kpp_set_secret(dh_tfm, NULL, 0);
+	if (ret)
+		pr_debug("failed to set private key, error %d\n", ret);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_gen_privkey);
+
+int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm,
+		u8 *host_key, size_t host_key_len)
+{
+	struct kpp_request *req;
+	struct crypto_wait wait;
+	struct scatterlist dst;
+	int ret;
+
+	req = kpp_request_alloc(dh_tfm, GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	crypto_init_wait(&wait);
+	kpp_request_set_input(req, NULL, 0);
+	sg_init_one(&dst, host_key, host_key_len);
+	kpp_request_set_output(req, &dst, host_key_len);
+	kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+				 crypto_req_done, &wait);
+
+	ret = crypto_wait_req(crypto_kpp_generate_public_key(req), &wait);
+	kpp_request_free(req);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_gen_pubkey);
+
+int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm,
+		u8 *ctrl_key, size_t ctrl_key_len,
+		u8 *sess_key, size_t sess_key_len)
+{
+	struct kpp_request *req;
+	struct crypto_wait wait;
+	struct scatterlist src, dst;
+	int ret;
+
+	req = kpp_request_alloc(dh_tfm, GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	crypto_init_wait(&wait);
+	sg_init_one(&src, ctrl_key, ctrl_key_len);
+	kpp_request_set_input(req, &src, ctrl_key_len);
+	sg_init_one(&dst, sess_key, sess_key_len);
+	kpp_request_set_output(req, &dst, sess_key_len);
+	kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+				 crypto_req_done, &wait);
+
+	ret = crypto_wait_req(crypto_kpp_compute_shared_secret(req), &wait);
+
+	kpp_request_free(req);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_gen_shared_secret);
+
 int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key)
 {
 	struct nvme_dhchap_key *key;
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 6c503f42f3c6..2f6a7f8c94e8 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -100,6 +100,8 @@ config NVME_AUTH
 	select CRYPTO_HMAC
 	select CRYPTO_SHA256
 	select CRYPTO_SHA512
+	select CRYPTO_DH
+	select CRYPTO_DH_RFC7919_GROUPS
 	help
 	  This provides support for NVMe over Fabrics In-Band Authentication.
 
diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c
index 9766bfffecac..c8a6db7c4498 100644
--- a/drivers/nvme/host/auth.c
+++ b/drivers/nvme/host/auth.c
@@ -18,6 +18,7 @@ struct nvme_dhchap_queue_context {
 	struct work_struct auth_work;
 	struct nvme_ctrl *ctrl;
 	struct crypto_shash *shash_tfm;
+	struct crypto_kpp *dh_tfm;
 	void *buf;
 	size_t buf_size;
 	int qid;
@@ -33,6 +34,12 @@ struct nvme_dhchap_queue_context {
 	u8 c2[64];
 	u8 response[64];
 	u8 *host_response;
+	u8 *ctrl_key;
+	int ctrl_key_len;
+	u8 *host_key;
+	int host_key_len;
+	u8 *sess_key;
+	int sess_key_len;
 };
 
 #define nvme_auth_flags_from_qid(qid) \
@@ -137,6 +144,7 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl,
 	struct nvmf_auth_dhchap_challenge_data *data = chap->buf;
 	u16 dhvlen = le16_to_cpu(data->dhvlen);
 	size_t size = sizeof(*data) + data->hl + dhvlen;
+	const char *gid_name = nvme_auth_dhgroup_name(data->dhgid);
 	const char *hmac_name, *kpp_name;
 
 	if (chap->buf_size < size) {
@@ -207,15 +215,54 @@ select_kpp:
 			 "qid %d: invalid DH group id %d\n",
 			 chap->qid, data->dhgid);
 		chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
+		/* Leave previous dh_tfm intact */
 		return NVME_SC_AUTH_REQUIRED;
 	}
 
+	/* Clear host and controller key to avoid accidental reuse */
+	kfree_sensitive(chap->host_key);
+	chap->host_key = NULL;
+	chap->host_key_len = 0;
+	kfree_sensitive(chap->ctrl_key);
+	chap->ctrl_key = NULL;
+	chap->ctrl_key_len = 0;
+
+	if (chap->dhgroup_id == data->dhgid &&
+	    (data->dhgid == NVME_AUTH_DHGROUP_NULL || chap->dh_tfm)) {
+		dev_dbg(ctrl->device,
+			"qid %d: reuse existing DH group %s\n",
+			chap->qid, gid_name);
+		goto skip_kpp;
+	}
+
+	/* Reset dh_tfm if it can't be reused */
+	if (chap->dh_tfm) {
+		crypto_free_kpp(chap->dh_tfm);
+		chap->dh_tfm = NULL;
+	}
+
 	if (data->dhgid != NVME_AUTH_DHGROUP_NULL) {
-		dev_warn(ctrl->device,
-			 "qid %d: unsupported DH group %s\n",
-			 chap->qid, kpp_name);
-		chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
-		return NVME_SC_AUTH_REQUIRED;
+		if (dhvlen == 0) {
+			dev_warn(ctrl->device,
+				 "qid %d: empty DH value\n",
+				 chap->qid);
+			chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
+			return NVME_SC_INVALID_FIELD;
+		}
+
+		chap->dh_tfm = crypto_alloc_kpp(kpp_name, 0, 0);
+		if (IS_ERR(chap->dh_tfm)) {
+			int ret = PTR_ERR(chap->dh_tfm);
+
+			dev_warn(ctrl->device,
+				 "qid %d: error %d initializing DH group %s\n",
+				 chap->qid, ret, gid_name);
+			chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
+			chap->dh_tfm = NULL;
+			return NVME_SC_AUTH_REQUIRED;
+		}
+		dev_dbg(ctrl->device, "qid %d: selected DH group %s\n",
+			chap->qid, gid_name);
 	} else if (dhvlen != 0) {
 		dev_warn(ctrl->device,
 			 "qid %d: invalid DH value for NULL DH\n",
@@ -225,8 +272,21 @@ select_kpp:
 	}
 	chap->dhgroup_id = data->dhgid;
 
+skip_kpp:
 	chap->s1 = le32_to_cpu(data->seqnum);
 	memcpy(chap->c1, data->cval, chap->hash_len);
+	if (dhvlen) {
+		chap->ctrl_key = kmalloc(dhvlen, GFP_KERNEL);
+		if (!chap->ctrl_key) {
+			chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+			return NVME_SC_AUTH_REQUIRED;
+		}
+		chap->ctrl_key_len = dhvlen;
+		memcpy(chap->ctrl_key, data->cval + chap->hash_len,
+		       dhvlen);
+		dev_dbg(ctrl->device, "ctrl public key %*ph\n",
+			 (int)chap->ctrl_key_len, chap->ctrl_key);
+	}
 
 	return 0;
 }
@@ -239,6 +299,9 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl,
 
 	size += 2 * chap->hash_len;
 
+	if (chap->host_key_len)
+		size += chap->host_key_len;
+
 	if (chap->buf_size < size) {
 		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
 		return -EINVAL;
@@ -249,7 +312,7 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl,
 	data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_REPLY;
 	data->t_id = cpu_to_le16(chap->transaction);
 	data->hl = chap->hash_len;
-	data->dhvlen = 0;
+	data->dhvlen = cpu_to_le16(chap->host_key_len);
 	memcpy(data->rval, chap->response, chap->hash_len);
 	if (ctrl->ctrl_key) {
 		get_random_bytes(chap->c2, chap->hash_len);
@@ -264,6 +327,14 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl,
 		chap->s2 = 0;
 	}
 	data->seqnum = cpu_to_le32(chap->s2);
+	if (chap->host_key_len) {
+		dev_dbg(ctrl->device, "%s: qid %d host public key %*ph\n",
+			__func__, chap->qid,
+			chap->host_key_len, chap->host_key);
+		memcpy(data->rval + 2 * chap->hash_len, chap->host_key,
+		       chap->host_key_len);
+	}
+
 	return size;
 }
 
@@ -381,6 +452,21 @@ static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl,
 		goto out;
 	}
 
+	if (chap->dh_tfm) {
+		challenge = kmalloc(chap->hash_len, GFP_KERNEL);
+		if (!challenge) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ret = nvme_auth_augmented_challenge(chap->hash_id,
+						    chap->sess_key,
+						    chap->sess_key_len,
+						    chap->c1, challenge,
+						    chap->hash_len);
+		if (ret)
+			goto out;
+	}
+
 	shash->tfm = chap->shash_tfm;
 	ret = crypto_shash_init(shash);
 	if (ret)
@@ -443,6 +529,20 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl,
 		goto out;
 	}
 
+	if (chap->dh_tfm) {
+		challenge = kmalloc(chap->hash_len, GFP_KERNEL);
+		if (!challenge) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ret = nvme_auth_augmented_challenge(chap->hash_id,
+						    chap->sess_key,
+						    chap->sess_key_len,
+						    chap->c2, challenge,
+						    chap->hash_len);
+		if (ret)
+			goto out;
+	}
 	dev_dbg(ctrl->device, "%s: qid %d ctrl response seq %u transaction %d\n",
 		__func__, chap->qid, chap->s2, chap->transaction);
 	dev_dbg(ctrl->device, "%s: qid %d challenge %*ph\n",
@@ -492,8 +592,81 @@ out:
 	return ret;
 }
 
+static int nvme_auth_dhchap_exponential(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	int ret;
+
+	if (chap->host_key && chap->host_key_len) {
+		dev_dbg(ctrl->device,
+			"qid %d: reusing host key\n", chap->qid);
+		goto gen_sesskey;
+	}
+	ret = nvme_auth_gen_privkey(chap->dh_tfm, chap->dhgroup_id);
+	if (ret < 0) {
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return ret;
+	}
+
+	chap->host_key_len = crypto_kpp_maxsize(chap->dh_tfm);
+
+	chap->host_key = kzalloc(chap->host_key_len, GFP_KERNEL);
+	if (!chap->host_key) {
+		chap->host_key_len = 0;
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+		return -ENOMEM;
+	}
+	ret = nvme_auth_gen_pubkey(chap->dh_tfm,
+				   chap->host_key, chap->host_key_len);
+	if (ret) {
+		dev_dbg(ctrl->device,
+			"failed to generate public key, error %d\n", ret);
+		kfree(chap->host_key);
+		chap->host_key = NULL;
+		chap->host_key_len = 0;
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return ret;
+	}
+
+gen_sesskey:
+	chap->sess_key_len = chap->host_key_len;
+	chap->sess_key = kmalloc(chap->sess_key_len, GFP_KERNEL);
+	if (!chap->sess_key) {
+		chap->sess_key_len = 0;
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+		return -ENOMEM;
+	}
+
+	ret = nvme_auth_gen_shared_secret(chap->dh_tfm,
+					  chap->ctrl_key, chap->ctrl_key_len,
+					  chap->sess_key, chap->sess_key_len);
+	if (ret) {
+		dev_dbg(ctrl->device,
+			"failed to generate shared secret, error %d\n", ret);
+		kfree_sensitive(chap->sess_key);
+		chap->sess_key = NULL;
+		chap->sess_key_len = 0;
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return ret;
+	}
+	dev_dbg(ctrl->device, "shared secret %*ph\n",
+		(int)chap->sess_key_len, chap->sess_key);
+	return 0;
+}
+
 static void __nvme_auth_reset(struct nvme_dhchap_queue_context *chap)
 {
+	kfree_sensitive(chap->host_response);
+	chap->host_response = NULL;
+	kfree_sensitive(chap->host_key);
+	chap->host_key = NULL;
+	chap->host_key_len = 0;
+	kfree_sensitive(chap->ctrl_key);
+	chap->ctrl_key = NULL;
+	chap->ctrl_key_len = 0;
+	kfree_sensitive(chap->sess_key);
+	chap->sess_key = NULL;
+	chap->sess_key_len = 0;
 	chap->status = 0;
 	chap->error = 0;
 	chap->s1 = 0;
@@ -508,6 +681,11 @@ static void __nvme_auth_free(struct nvme_dhchap_queue_context *chap)
 	__nvme_auth_reset(chap);
 	if (chap->shash_tfm)
 		crypto_free_shash(chap->shash_tfm);
+	if (chap->dh_tfm)
+		crypto_free_kpp(chap->dh_tfm);
+	kfree_sensitive(chap->ctrl_key);
+	kfree_sensitive(chap->host_key);
+	kfree_sensitive(chap->sess_key);
 	kfree_sensitive(chap->host_response);
 	kfree(chap->buf);
 	kfree(chap);
@@ -566,6 +744,17 @@ static void __nvme_auth_work(struct work_struct *work)
 		goto fail2;
 	}
 
+	if (chap->ctrl_key_len) {
+		dev_dbg(ctrl->device,
+			"%s: qid %d DH exponential\n",
+			__func__, chap->qid);
+		ret = nvme_auth_dhchap_exponential(ctrl, chap);
+		if (ret) {
+			chap->error = ret;
+			goto fail2;
+		}
+	}
+
 	dev_dbg(ctrl->device, "%s: qid %d host response\n",
 		__func__, chap->qid);
 	ret = nvme_auth_dhchap_setup_host_response(ctrl, chap);
diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h
index 354456826221..dcb8030062dd 100644
--- a/include/linux/nvme-auth.h
+++ b/include/linux/nvme-auth.h
@@ -29,5 +29,13 @@ struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
 void nvme_auth_free_key(struct nvme_dhchap_key *key);
 u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn);
 int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key);
+int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len,
+				  u8 *challenge, u8 *aug, size_t hlen);
+int nvme_auth_gen_privkey(struct crypto_kpp *dh_tfm, u8 dh_gid);
+int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm,
+			 u8 *host_key, size_t host_key_len);
+int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm,
+				u8 *ctrl_key, size_t ctrl_key_len,
+				u8 *sess_key, size_t sess_key_len);
 
 #endif /* _NVME_AUTH_H */
-- 
cgit 


From 5a97806f7dc069d9561d9930a2ae108700e222ab Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 27 Jul 2022 12:22:55 -0400
Subject: block: change the blk_queue_split calling convention

The double indirect bio leads to somewhat suboptimal code generation.
Instead return the (original or split) bio, and make sure the
request_queue arguments to the lower level helpers is passed after the
bio to avoid constant reshuffling of the argument passing registers.

Also give it and the helpers used to implement it more descriptive names.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20220727162300.3089193-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-merge.c             | 98 +++++++++++++++++++++----------------------
 block/blk-mq.c                |  4 +-
 block/blk.h                   |  6 +--
 drivers/block/drbd/drbd_req.c |  2 +-
 drivers/block/pktcdvd.c       |  2 +-
 drivers/block/ps3vram.c       |  2 +-
 drivers/md/dm.c               |  6 +--
 drivers/md/md.c               |  2 +-
 drivers/nvme/host/multipath.c |  2 +-
 drivers/s390/block/dcssblk.c  |  2 +-
 include/linux/blkdev.h        |  2 +-
 11 files changed, 63 insertions(+), 65 deletions(-)

(limited to 'include')

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4c8a699754c9..6e29fb28584e 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -95,10 +95,8 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
 	return bio_will_gap(req->q, NULL, bio, req->bio);
 }
 
-static struct bio *blk_bio_discard_split(struct request_queue *q,
-					 struct bio *bio,
-					 struct bio_set *bs,
-					 unsigned *nsegs)
+static struct bio *bio_split_discard(struct bio *bio, struct request_queue *q,
+		unsigned *nsegs, struct bio_set *bs)
 {
 	unsigned int max_discard_sectors, granularity;
 	int alignment;
@@ -139,8 +137,8 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
 	return bio_split(bio, split_sectors, GFP_NOIO, bs);
 }
 
-static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
-		struct bio *bio, struct bio_set *bs, unsigned *nsegs)
+static struct bio *bio_split_write_zeroes(struct bio *bio,
+		struct request_queue *q, unsigned *nsegs, struct bio_set *bs)
 {
 	*nsegs = 0;
 
@@ -161,8 +159,8 @@ static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
  * requests that are submitted to a block device if the start of a bio is not
  * aligned to a physical block boundary.
  */
-static inline unsigned get_max_io_size(struct request_queue *q,
-				       struct bio *bio)
+static inline unsigned get_max_io_size(struct bio *bio,
+		struct request_queue *q)
 {
 	unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT;
 	unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT;
@@ -247,16 +245,16 @@ static bool bvec_split_segs(const struct request_queue *q,
 }
 
 /**
- * blk_bio_segment_split - split a bio in two bios
- * @q:    [in] request queue pointer
+ * bio_split_rw - split a bio in two bios
  * @bio:  [in] bio to be split
- * @bs:	  [in] bio set to allocate the clone from
+ * @q:    [in] request queue pointer
  * @segs: [out] number of segments in the bio with the first half of the sectors
+ * @bs:	  [in] bio set to allocate the clone from
  *
  * Clone @bio, update the bi_iter of the clone to represent the first sectors
  * of @bio and update @bio->bi_iter to represent the remaining sectors. The
  * following is guaranteed for the cloned bio:
- * - That it has at most get_max_io_size(@q, @bio) sectors.
+ * - That it has at most get_max_io_size(@bio, @q) sectors.
  * - That it has at most queue_max_segments(@q) segments.
  *
  * Except for discard requests the cloned bio will point at the bi_io_vec of
@@ -265,15 +263,13 @@ static bool bvec_split_segs(const struct request_queue *q,
  * responsible for ensuring that @bs is only destroyed after processing of the
  * split bio has finished.
  */
-static struct bio *blk_bio_segment_split(struct request_queue *q,
-					 struct bio *bio,
-					 struct bio_set *bs,
-					 unsigned *segs)
+static struct bio *bio_split_rw(struct bio *bio, struct request_queue *q,
+		unsigned *segs, struct bio_set *bs)
 {
 	struct bio_vec bv, bvprv, *bvprvp = NULL;
 	struct bvec_iter iter;
 	unsigned nsegs = 0, bytes = 0;
-	const unsigned max_bytes = get_max_io_size(q, bio) << 9;
+	const unsigned max_bytes = get_max_io_size(bio, q) << 9;
 	const unsigned max_segs = queue_max_segments(q);
 
 	bio_for_each_bvec(bv, bio, iter) {
@@ -320,34 +316,33 @@ split:
 }
 
 /**
- * __blk_queue_split - split a bio and submit the second half
- * @q:       [in] request_queue new bio is being queued at
- * @bio:     [in, out] bio to be split
- * @nr_segs: [out] number of segments in the first bio
+ * __bio_split_to_limits - split a bio to fit the queue limits
+ * @bio:     bio to be split
+ * @q:       request_queue new bio is being queued at
+ * @nr_segs: returns the number of segments in the returned bio
+ *
+ * Check if @bio needs splitting based on the queue limits, and if so split off
+ * a bio fitting the limits from the beginning of @bio and return it.  @bio is
+ * shortened to the remainder and re-submitted.
  *
- * Split a bio into two bios, chain the two bios, submit the second half and
- * store a pointer to the first half in *@bio. If the second bio is still too
- * big it will be split by a recursive call to this function. Since this
- * function may allocate a new bio from q->bio_split, it is the responsibility
- * of the caller to ensure that q->bio_split is only released after processing
- * of the split bio has finished.
+ * The split bio is allocated from @q->bio_split, which is provided by the
+ * block layer.
  */
-void __blk_queue_split(struct request_queue *q, struct bio **bio,
+struct bio *__bio_split_to_limits(struct bio *bio, struct request_queue *q,
 		       unsigned int *nr_segs)
 {
-	struct bio *split = NULL;
+	struct bio *split;
 
-	switch (bio_op(*bio)) {
+	switch (bio_op(bio)) {
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
-		split = blk_bio_discard_split(q, *bio, &q->bio_split, nr_segs);
+		split = bio_split_discard(bio, q, nr_segs, &q->bio_split);
 		break;
 	case REQ_OP_WRITE_ZEROES:
-		split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split,
-				nr_segs);
+		split = bio_split_write_zeroes(bio, q, nr_segs, &q->bio_split);
 		break;
 	default:
-		split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs);
+		split = bio_split_rw(bio, q, nr_segs, &q->bio_split);
 		break;
 	}
 
@@ -356,32 +351,35 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio,
 		split->bi_opf |= REQ_NOMERGE;
 
 		blkcg_bio_issue_init(split);
-		bio_chain(split, *bio);
-		trace_block_split(split, (*bio)->bi_iter.bi_sector);
-		submit_bio_noacct(*bio);
-		*bio = split;
+		bio_chain(split, bio);
+		trace_block_split(split, bio->bi_iter.bi_sector);
+		submit_bio_noacct(bio);
+		return split;
 	}
+	return bio;
 }
 
 /**
- * blk_queue_split - split a bio and submit the second half
- * @bio: [in, out] bio to be split
+ * bio_split_to_limits - split a bio to fit the queue limits
+ * @bio:     bio to be split
+ *
+ * Check if @bio needs splitting based on the queue limits of @bio->bi_bdev, and
+ * if so split off a bio fitting the limits from the beginning of @bio and
+ * return it.  @bio is shortened to the remainder and re-submitted.
  *
- * Split a bio into two bios, chains the two bios, submit the second half and
- * store a pointer to the first half in *@bio. Since this function may allocate
- * a new bio from q->bio_split, it is the responsibility of the caller to ensure
- * that q->bio_split is only released after processing of the split bio has
- * finished.
+ * The split bio is allocated from @q->bio_split, which is provided by the
+ * block layer.
  */
-void blk_queue_split(struct bio **bio)
+struct bio *bio_split_to_limits(struct bio *bio)
 {
-	struct request_queue *q = bdev_get_queue((*bio)->bi_bdev);
+	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 	unsigned int nr_segs;
 
-	if (blk_may_split(q, *bio))
-		__blk_queue_split(q, bio, &nr_segs);
+	if (bio_may_exceed_limits(bio, q))
+		return __bio_split_to_limits(bio, q, &nr_segs);
+	return bio;
 }
-EXPORT_SYMBOL(blk_queue_split);
+EXPORT_SYMBOL(bio_split_to_limits);
 
 unsigned int blk_recalc_rq_segments(struct request *rq)
 {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 70177ee74295..790f55453f1b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2816,8 +2816,8 @@ void blk_mq_submit_bio(struct bio *bio)
 	blk_status_t ret;
 
 	blk_queue_bounce(q, &bio);
-	if (blk_may_split(q, bio))
-		__blk_queue_split(q, &bio, &nr_segs);
+	if (bio_may_exceed_limits(bio, q))
+		bio = __bio_split_to_limits(bio, q, &nr_segs);
 
 	if (!bio_integrity_prep(bio))
 		return;
diff --git a/block/blk.h b/block/blk.h
index 1d83b1d41cd1..623be4c2e60c 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -288,7 +288,7 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
 ssize_t part_timeout_store(struct device *, struct device_attribute *,
 				const char *, size_t);
 
-static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
+static inline bool bio_may_exceed_limits(struct bio *bio, struct request_queue *q)
 {
 	switch (bio_op(bio)) {
 	case REQ_OP_DISCARD:
@@ -311,8 +311,8 @@ static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
 		bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
 }
 
-void __blk_queue_split(struct request_queue *q, struct bio **bio,
-			unsigned int *nr_segs);
+struct bio *__bio_split_to_limits(struct bio *bio, struct request_queue *q,
+		       unsigned int *nr_segs);
 int ll_back_merge_fn(struct request *req, struct bio *bio,
 		unsigned int nr_segs);
 bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 6d8dd14458c6..8f7f144e54f3 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1608,7 +1608,7 @@ void drbd_submit_bio(struct bio *bio)
 {
 	struct drbd_device *device = bio->bi_bdev->bd_disk->private_data;
 
-	blk_queue_split(&bio);
+	bio = bio_split_to_limits(bio);
 
 	/*
 	 * what we "blindly" assume:
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 01a15dbd9cde..4cea3b08087e 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2399,7 +2399,7 @@ static void pkt_submit_bio(struct bio *bio)
 	struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->queue->queuedata;
 	struct bio *split;
 
-	blk_queue_split(&bio);
+	bio = bio_split_to_limits(bio);
 
 	pkt_dbg(2, pd, "start = %6llx stop = %6llx\n",
 		(unsigned long long)bio->bi_iter.bi_sector,
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index d1e0fefec90b..e1d080f680ed 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -586,7 +586,7 @@ static void ps3vram_submit_bio(struct bio *bio)
 
 	dev_dbg(&dev->core, "%s\n", __func__);
 
-	blk_queue_split(&bio);
+	bio = bio_split_to_limits(bio);
 
 	spin_lock_irq(&priv->lock);
 	busy = !bio_list_empty(&priv->list);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 99642f69bfa7..fbcffcfb2304 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1181,7 +1181,7 @@ static sector_t max_io_len(struct dm_target *ti, sector_t sector)
 	 * Does the target need to split IO even further?
 	 * - varied (per target) IO splitting is a tenet of DM; this
 	 *   explains why stacked chunk_sectors based splitting via
-	 *   blk_queue_split() isn't possible here.
+	 *   bio_split_to_limits() isn't possible here.
 	 */
 	if (!ti->max_io_len)
 		return len;
@@ -1751,10 +1751,10 @@ static void dm_split_and_process_bio(struct mapped_device *md,
 	is_abnormal = is_abnormal_io(bio);
 	if (unlikely(is_abnormal)) {
 		/*
-		 * Use blk_queue_split() for abnormal IO (e.g. discard, etc)
+		 * Use bio_split_to_limits() for abnormal IO (e.g. discard, etc)
 		 * otherwise associated queue_limits won't be imposed.
 		 */
-		blk_queue_split(&bio);
+		bio = bio_split_to_limits(bio);
 	}
 
 	init_clone_info(&ci, md, map, bio, is_abnormal);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 35b895813c88..afaf36b2f6ab 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -442,7 +442,7 @@ static void md_submit_bio(struct bio *bio)
 		return;
 	}
 
-	blk_queue_split(&bio);
+	bio = bio_split_to_limits(bio);
 
 	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
 		if (bio_sectors(bio) != 0)
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 64f44d98daaf..6ef497c75a16 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -346,7 +346,7 @@ static void nvme_ns_head_submit_bio(struct bio *bio)
 	 * different queue via blk_steal_bios(), so we need to use the bio_split
 	 * pool from the original queue to allocate the bvecs from.
 	 */
-	blk_queue_split(&bio);
+	bio = bio_split_to_limits(bio);
 
 	srcu_idx = srcu_read_lock(&head->srcu);
 	ns = nvme_find_path(head);
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 4d8d1759775a..5187705bd0f3 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -863,7 +863,7 @@ dcssblk_submit_bio(struct bio *bio)
 	unsigned long source_addr;
 	unsigned long bytes_done;
 
-	blk_queue_split(&bio);
+	bio = bio_split_to_limits(bio);
 
 	bytes_done = 0;
 	dev_info = bio->bi_bdev->bd_disk->private_data;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d04bdf549efa..5eef8d2eddc1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -864,9 +864,9 @@ void blk_request_module(dev_t devt);
 extern int blk_register_queue(struct gendisk *disk);
 extern void blk_unregister_queue(struct gendisk *disk);
 void submit_bio_noacct(struct bio *bio);
+struct bio *bio_split_to_limits(struct bio *bio);
 
 extern int blk_lld_busy(struct request_queue *q);
-extern void blk_queue_split(struct bio **);
 extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);
 extern void blk_queue_exit(struct request_queue *q);
 extern void blk_sync_queue(struct request_queue *q);
-- 
cgit 


From 46754bd056053785d7079f1d48f7f571728dcb47 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 27 Jul 2022 12:22:57 -0400
Subject: block: move ->bio_split to the gendisk

Only non-passthrough requests are split by the block layer and use the
->bio_split bio_set.  Move it from the request_queue to the gendisk.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20220727162300.3089193-4-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       | 9 +--------
 block/blk-merge.c      | 7 ++++---
 block/blk-sysfs.c      | 2 --
 block/genhd.c          | 8 +++++++-
 drivers/md/dm.c        | 2 +-
 include/linux/blkdev.h | 3 ++-
 6 files changed, 15 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/block/blk-core.c b/block/blk-core.c
index 3d286a256d3d..a0d1104c5590 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -377,7 +377,6 @@ static void blk_timeout_work(struct work_struct *work)
 struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
 {
 	struct request_queue *q;
-	int ret;
 
 	q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
 			GFP_KERNEL | __GFP_ZERO, node_id);
@@ -396,13 +395,9 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
 	if (q->id < 0)
 		goto fail_srcu;
 
-	ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0);
-	if (ret)
-		goto fail_id;
-
 	q->stats = blk_alloc_queue_stats();
 	if (!q->stats)
-		goto fail_split;
+		goto fail_id;
 
 	q->node = node_id;
 
@@ -439,8 +434,6 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
 
 fail_stats:
 	blk_free_queue_stats(q->stats);
-fail_split:
-	bioset_exit(&q->bio_split);
 fail_id:
 	ida_free(&blk_queue_ida, q->id);
 fail_srcu:
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 6e29fb28584e..30872a353764 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -331,18 +331,19 @@ split:
 struct bio *__bio_split_to_limits(struct bio *bio, struct request_queue *q,
 		       unsigned int *nr_segs)
 {
+	struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split;
 	struct bio *split;
 
 	switch (bio_op(bio)) {
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
-		split = bio_split_discard(bio, q, nr_segs, &q->bio_split);
+		split = bio_split_discard(bio, q, nr_segs, bs);
 		break;
 	case REQ_OP_WRITE_ZEROES:
-		split = bio_split_write_zeroes(bio, q, nr_segs, &q->bio_split);
+		split = bio_split_write_zeroes(bio, q, nr_segs, bs);
 		break;
 	default:
-		split = bio_split_rw(bio, q, nr_segs, &q->bio_split);
+		split = bio_split_rw(bio, q, nr_segs, bs);
 		break;
 	}
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index c0303026752d..e1f009aba6fd 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -779,8 +779,6 @@ static void blk_release_queue(struct kobject *kobj)
 	if (queue_is_mq(q))
 		blk_mq_release(q);
 
-	bioset_exit(&q->bio_split);
-
 	if (blk_queue_has_srcu(q))
 		cleanup_srcu_struct(q->srcu);
 
diff --git a/block/genhd.c b/block/genhd.c
index e1d5b10ac193..b901fea1d55a 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1151,6 +1151,7 @@ static void disk_release(struct device *dev)
 		blk_mq_exit_queue(disk->queue);
 
 	blkcg_exit_queue(disk->queue);
+	bioset_exit(&disk->bio_split);
 
 	disk_release_events(disk);
 	kfree(disk->random);
@@ -1342,9 +1343,12 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
 	if (!disk)
 		goto out_put_queue;
 
+	if (bioset_init(&disk->bio_split, BIO_POOL_SIZE, 0, 0))
+		goto out_free_disk;
+
 	disk->bdi = bdi_alloc(node_id);
 	if (!disk->bdi)
-		goto out_free_disk;
+		goto out_free_bioset;
 
 	/* bdev_alloc() might need the queue, set before the first call */
 	disk->queue = q;
@@ -1382,6 +1386,8 @@ out_destroy_part_tbl:
 	iput(disk->part0->bd_inode);
 out_free_bdi:
 	bdi_put(disk->bdi);
+out_free_bioset:
+	bioset_exit(&disk->bio_split);
 out_free_disk:
 	kfree(disk);
 out_put_queue:
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index fbcffcfb2304..28bd4a35b86b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1016,7 +1016,7 @@ static void dm_wq_requeue_work(struct work_struct *work)
 	while (io) {
 		struct dm_io *next = io->next;
 
-		dm_io_rewind(io, &md->queue->bio_split);
+		dm_io_rewind(io, &md->disk->bio_split);
 
 		io->next = NULL;
 		__dm_io_complete(io, false);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5eef8d2eddc1..49dcd31e283e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -140,6 +140,8 @@ struct gendisk {
 	struct request_queue *queue;
 	void *private_data;
 
+	struct bio_set bio_split;
+
 	int flags;
 	unsigned long state;
 #define GD_NEED_PART_SCAN		0
@@ -531,7 +533,6 @@ struct request_queue {
 
 	struct blk_mq_tag_set	*tag_set;
 	struct list_head	tag_set_list;
-	struct bio_set		bio_split;
 
 	struct dentry		*debugfs_dir;
 	struct dentry		*sched_debugfs_dir;
-- 
cgit 


From 0aa73170eba5eae638c1b96a05eba533f030b5cb Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Sat, 30 Jul 2022 17:27:49 +0800
Subject: ublk_drv: add SET_PARAMS/GET_PARAMS control command

Add two commands to set/get parameters generically.

One important goal of ublk is to provide generic framework for making
block device by userspace flexibly.

As one generic block device, there are still lots of block parameters,
such as max_sectors, write_cache/fua, discard related limits,
zoned parameters, ...., so this patch starts to add generic mechanism
for set/get device parameters.

Both generic block parameters(all kinds of queue settings) and ublk
feature parameters can be covered with this way, then it becomes quite
easy to extend in future.

Add two parameter types are used so far: basic(covers basic queue setting
and misc settings which can't be grouped easily) and discard, basic type
must be set, and discard type becomes optional now

This way provides mechanism to simulate any kind of generic block device
from userspace easily, from both block queue setting viewpoint or ublk
feature viewpoint.

The style of putting all parameters together is suggested by Christoph.

Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20220730092750.1118167-4-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c      | 205 ++++++++++++++++++++++++++++++++++++++----
 include/uapi/linux/ublk_cmd.h |  47 ++++++++++
 2 files changed, 234 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index ae98e81b21ce..20ad83b25318 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -49,6 +49,9 @@
 /* All UBLK_F_* have to be included into UBLK_F_ALL */
 #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_URING_CMD_COMP_IN_TASK)
 
+/* All UBLK_PARAM_TYPE_* should be included here */
+#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
+
 struct ublk_rq_data {
 	struct callback_head work;
 };
@@ -137,6 +140,8 @@ struct ublk_device {
 	spinlock_t		mm_lock;
 	struct mm_struct	*mm;
 
+	struct ublk_params	params;
+
 	struct completion	completion;
 	unsigned int		nr_queues_ready;
 	atomic_t		nr_aborted_queues;
@@ -149,6 +154,12 @@ struct ublk_device {
 	struct work_struct	stop_work;
 };
 
+/* header of ublk_params */
+struct ublk_params_header {
+	__u32	len;
+	__u32	types;
+};
+
 static dev_t ublk_chr_devt;
 static struct class *ublk_chr_class;
 
@@ -160,6 +171,91 @@ static DEFINE_MUTEX(ublk_ctl_mutex);
 
 static struct miscdevice ublk_misc;
 
+static void ublk_dev_param_basic_apply(struct ublk_device *ub)
+{
+	struct request_queue *q = ub->ub_disk->queue;
+	const struct ublk_param_basic *p = &ub->params.basic;
+
+	blk_queue_logical_block_size(q, 1 << p->logical_bs_shift);
+	blk_queue_physical_block_size(q, 1 << p->physical_bs_shift);
+	blk_queue_io_min(q, 1 << p->io_min_shift);
+	blk_queue_io_opt(q, 1 << p->io_opt_shift);
+
+	blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE,
+			p->attrs & UBLK_ATTR_FUA);
+	if (p->attrs & UBLK_ATTR_ROTATIONAL)
+		blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
+	else
+		blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
+
+	blk_queue_max_hw_sectors(q, p->max_sectors);
+	blk_queue_chunk_sectors(q, p->chunk_sectors);
+	blk_queue_virt_boundary(q, p->virt_boundary_mask);
+
+	if (p->attrs & UBLK_ATTR_READ_ONLY)
+		set_disk_ro(ub->ub_disk, true);
+
+	set_capacity(ub->ub_disk, p->dev_sectors);
+}
+
+static void ublk_dev_param_discard_apply(struct ublk_device *ub)
+{
+	struct request_queue *q = ub->ub_disk->queue;
+	const struct ublk_param_discard *p = &ub->params.discard;
+
+	q->limits.discard_alignment = p->discard_alignment;
+	q->limits.discard_granularity = p->discard_granularity;
+	blk_queue_max_discard_sectors(q, p->max_discard_sectors);
+	blk_queue_max_write_zeroes_sectors(q,
+			p->max_write_zeroes_sectors);
+	blk_queue_max_discard_segments(q, p->max_discard_segments);
+}
+
+static int ublk_validate_params(const struct ublk_device *ub)
+{
+	/* basic param is the only one which must be set */
+	if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
+		const struct ublk_param_basic *p = &ub->params.basic;
+
+		if (p->logical_bs_shift > PAGE_SHIFT)
+			return -EINVAL;
+
+		if (p->logical_bs_shift > p->physical_bs_shift)
+			return -EINVAL;
+
+		if (p->max_sectors > (ub->dev_info.rq_max_blocks <<
+					(ub->bs_shift - 9)))
+			return -EINVAL;
+	} else
+		return -EINVAL;
+
+	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
+		const struct ublk_param_discard *p = &ub->params.discard;
+
+		/* So far, only support single segment discard */
+		if (p->max_discard_sectors && p->max_discard_segments != 1)
+			return -EINVAL;
+
+		if (!p->discard_granularity)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ublk_apply_params(struct ublk_device *ub)
+{
+	if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
+		return -EINVAL;
+
+	ublk_dev_param_basic_apply(ub);
+
+	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
+		ublk_dev_param_discard_apply(ub);
+
+	return 0;
+}
+
 static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
 {
 	if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) &&
@@ -1138,7 +1234,6 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
 {
 	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
 	int ublksrv_pid = (int)header->data[0];
-	unsigned long dev_blocks = header->data[1];
 	struct ublk_device *ub;
 	struct gendisk *disk;
 	int ret = -EINVAL;
@@ -1161,10 +1256,6 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
 		goto out_unlock;
 	}
 
-	/* We may get disk size updated */
-	if (dev_blocks)
-		ub->dev_info.dev_blocks = dev_blocks;
-
 	disk = blk_mq_alloc_disk(&ub->tag_set, ub);
 	if (IS_ERR(disk)) {
 		ret = PTR_ERR(disk);
@@ -1174,19 +1265,13 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
 	disk->fops = &ub_fops;
 	disk->private_data = ub;
 
-	blk_queue_logical_block_size(disk->queue, ub->dev_info.block_size);
-	blk_queue_physical_block_size(disk->queue, ub->dev_info.block_size);
-	blk_queue_io_min(disk->queue, ub->dev_info.block_size);
-	blk_queue_max_hw_sectors(disk->queue,
-		ub->dev_info.rq_max_blocks << (ub->bs_shift - 9));
-	disk->queue->limits.discard_granularity = PAGE_SIZE;
-	blk_queue_max_discard_sectors(disk->queue, UINT_MAX >> 9);
-	blk_queue_max_write_zeroes_sectors(disk->queue, UINT_MAX >> 9);
-
-	set_capacity(disk, ub->dev_info.dev_blocks << (ub->bs_shift - 9));
-
 	ub->dev_info.ublksrv_pid = ublksrv_pid;
 	ub->ub_disk = disk;
+
+	ret = ublk_apply_params(ub);
+	if (ret)
+		goto out_put_disk;
+
 	get_device(&ub->cdev_dev);
 	ret = add_disk(disk);
 	if (ret) {
@@ -1195,11 +1280,13 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
 		 * called in case of add_disk failure.
 		 */
 		ublk_put_device(ub);
-		put_disk(disk);
-		goto out_unlock;
+		goto out_put_disk;
 	}
 	set_bit(UB_STATE_USED, &ub->state);
 	ub->dev_info.state = UBLK_S_DEV_LIVE;
+out_put_disk:
+	if (ret)
+		put_disk(disk);
 out_unlock:
 	mutex_unlock(&ub->mutex);
 	ublk_put_device(ub);
@@ -1447,6 +1534,82 @@ static int ublk_ctrl_get_dev_info(struct io_uring_cmd *cmd)
 	return ret;
 }
 
+static int ublk_ctrl_get_params(struct io_uring_cmd *cmd)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	void __user *argp = (void __user *)(unsigned long)header->addr;
+	struct ublk_params_header ph;
+	struct ublk_device *ub;
+	int ret;
+
+	if (header->len <= sizeof(ph) || !header->addr)
+		return -EINVAL;
+
+	if (copy_from_user(&ph, argp, sizeof(ph)))
+		return -EFAULT;
+
+	if (ph.len > header->len || !ph.len)
+		return -EINVAL;
+
+	if (ph.len > sizeof(struct ublk_params))
+		ph.len = sizeof(struct ublk_params);
+
+	ub = ublk_get_device_from_id(header->dev_id);
+	if (!ub)
+		return -EINVAL;
+
+	mutex_lock(&ub->mutex);
+	if (copy_to_user(argp, &ub->params, ph.len))
+		ret = -EFAULT;
+	else
+		ret = 0;
+	mutex_unlock(&ub->mutex);
+
+	ublk_put_device(ub);
+	return ret;
+}
+
+static int ublk_ctrl_set_params(struct io_uring_cmd *cmd)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	void __user *argp = (void __user *)(unsigned long)header->addr;
+	struct ublk_params_header ph;
+	struct ublk_device *ub;
+	int ret = -EFAULT;
+
+	if (header->len <= sizeof(ph) || !header->addr)
+		return -EINVAL;
+
+	if (copy_from_user(&ph, argp, sizeof(ph)))
+		return -EFAULT;
+
+	if (ph.len > header->len || !ph.len || !ph.types)
+		return -EINVAL;
+
+	if (ph.len > sizeof(struct ublk_params))
+		ph.len = sizeof(struct ublk_params);
+
+	ub = ublk_get_device_from_id(header->dev_id);
+	if (!ub)
+		return -EINVAL;
+
+	/* parameters can only be changed when device isn't live */
+	mutex_lock(&ub->mutex);
+	if (ub->dev_info.state == UBLK_S_DEV_LIVE) {
+		ret = -EACCES;
+	} else if (copy_from_user(&ub->params, argp, ph.len)) {
+		ret = -EFAULT;
+	} else {
+		/* clear all we don't support yet */
+		ub->params.types &= UBLK_PARAM_TYPE_ALL;
+		ret = ublk_validate_params(ub);
+	}
+	mutex_unlock(&ub->mutex);
+	ublk_put_device(ub);
+
+	return ret;
+}
+
 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
 		unsigned int issue_flags)
 {
@@ -1482,6 +1645,12 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
 	case UBLK_CMD_GET_QUEUE_AFFINITY:
 		ret = ublk_ctrl_get_queue_affinity(cmd);
 		break;
+	case UBLK_CMD_GET_PARAMS:
+		ret = ublk_ctrl_get_params(cmd);
+		break;
+	case UBLK_CMD_SET_PARAMS:
+		ret = ublk_ctrl_set_params(cmd);
+		break;
 	default:
 		break;
 	}
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index ca33092354ab..54d065426f06 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -15,6 +15,8 @@
 #define	UBLK_CMD_DEL_DEV		0x05
 #define	UBLK_CMD_START_DEV	0x06
 #define	UBLK_CMD_STOP_DEV	0x07
+#define	UBLK_CMD_SET_PARAMS	0x08
+#define	UBLK_CMD_GET_PARAMS	0x09
 
 /*
  * IO commands, issued by ublk server, and handled by ublk driver.
@@ -158,4 +160,49 @@ struct ublksrv_io_cmd {
 	__u64	addr;
 };
 
+struct ublk_param_basic {
+#define UBLK_ATTR_READ_ONLY            (1 << 0)
+#define UBLK_ATTR_ROTATIONAL           (1 << 1)
+#define UBLK_ATTR_VOLATILE_CACHE       (1 << 2)
+#define UBLK_ATTR_FUA                  (1 << 3)
+	__u32	attrs;
+	__u8	logical_bs_shift;
+	__u8	physical_bs_shift;
+	__u8	io_opt_shift;
+	__u8	io_min_shift;
+
+	__u32	max_sectors;
+	__u32	chunk_sectors;
+
+	__u64   dev_sectors;
+	__u64   virt_boundary_mask;
+};
+
+struct ublk_param_discard {
+	__u32	discard_alignment;
+
+	__u32	discard_granularity;
+	__u32	max_discard_sectors;
+
+	__u32	max_write_zeroes_sectors;
+	__u16	max_discard_segments;
+	__u16	reserved0;
+};
+
+struct ublk_params {
+	/*
+	 * Total length of parameters, userspace has to set 'len' for both
+	 * SET_PARAMS and GET_PARAMS command, and driver may update len
+	 * if two sides use different version of 'ublk_params', same with
+	 * 'types' fields.
+	 */
+	__u32	len;
+#define UBLK_PARAM_TYPE_BASIC           (1 << 0)
+#define UBLK_PARAM_TYPE_DISCARD         (1 << 1)
+	__u32	types;			/* types of parameter included */
+
+	struct ublk_param_basic		basic;
+	struct ublk_param_discard	discard;
+};
+
 #endif
-- 
cgit 


From 4bf9cbf3e93426e9ebe136dabd6ca392ca92cfcb Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Sat, 30 Jul 2022 17:27:50 +0800
Subject: ublk_drv: cleanup ublksrv_ctrl_dev_info

Remove all block device related info from ublksrv_ctrl_dev_info,
meantime reduce its size into 64 bytes because:

1) ublksrv_ctrl_dev_info becomes cleaner without including any
block related info

2) generic set/get parameter command can be used to set block
related setting easily and cleanly

3) generic set/get parameter command can be used for extending
ublk without needing more info in ublksrv_ctrl_dev_info

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20220730092750.1118167-5-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c      | 18 +++++++-----------
 include/uapi/linux/ublk_cmd.h | 15 ++++++++-------
 2 files changed, 15 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 20ad83b25318..2b3cd671a653 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -122,7 +122,6 @@ struct ublk_device {
 	char	*__queues;
 
 	unsigned short  queue_size;
-	unsigned short  bs_shift;
 	struct ublksrv_ctrl_dev_info	dev_info;
 
 	struct blk_mq_tag_set	tag_set;
@@ -223,8 +222,7 @@ static int ublk_validate_params(const struct ublk_device *ub)
 		if (p->logical_bs_shift > p->physical_bs_shift)
 			return -EINVAL;
 
-		if (p->max_sectors > (ub->dev_info.rq_max_blocks <<
-					(ub->bs_shift - 9)))
+		if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
 			return -EINVAL;
 	} else
 		return -EINVAL;
@@ -1185,13 +1183,13 @@ static void ublk_stop_work_fn(struct work_struct *work)
 	ublk_stop_dev(ub);
 }
 
-/* align maximum I/O size to PAGE_SIZE */
+/* align max io buffer size with PAGE_SIZE */
 static void ublk_align_max_io_size(struct ublk_device *ub)
 {
-	unsigned int max_rq_bytes = ub->dev_info.rq_max_blocks << ub->bs_shift;
+	unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
 
-	ub->dev_info.rq_max_blocks =
-		round_down(max_rq_bytes, PAGE_SIZE) >> ub->bs_shift;
+	ub->dev_info.max_io_buf_bytes =
+		round_down(max_io_bytes, PAGE_SIZE);
 }
 
 static int ublk_add_tag_set(struct ublk_device *ub)
@@ -1348,9 +1346,8 @@ static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
 {
 	pr_devel("%s: dev id %d flags %llx\n", __func__,
 			info->dev_id, info->flags);
-	pr_devel("\t nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n",
-			info->nr_hw_queues, info->queue_depth,
-			info->block_size, info->dev_blocks);
+	pr_devel("\t nr_hw_queues %d queue_depth %d\n",
+			info->nr_hw_queues, info->queue_depth);
 }
 
 static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
@@ -1410,7 +1407,6 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
 	/* We are not ready to support zero copy */
 	ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
 
-	ub->bs_shift = ilog2(ub->dev_info.block_size);
 	ub->dev_info.nr_hw_queues = min_t(unsigned int,
 			ub->dev_info.nr_hw_queues, nr_cpu_ids);
 	ublk_align_max_io_size(ub);
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 54d065426f06..57d86d0e8c5b 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -80,22 +80,23 @@ struct ublksrv_ctrl_cmd {
 struct ublksrv_ctrl_dev_info {
 	__u16	nr_hw_queues;
 	__u16	queue_depth;
-	__u16	block_size;
 	__u16	state;
+	__u16	pad0;
 
-	__u32	rq_max_blocks;
+	__u32	max_io_buf_bytes;
 	__u32	dev_id;
 
-	__u64   dev_blocks;
-
 	__s32	ublksrv_pid;
-	__s32	reserved0;
+	__u32	pad1;
+
 	__u64	flags;
-	__u64	flags_reserved;
 
 	/* For ublksrv internal use, invisible to ublk driver */
 	__u64	ublksrv_flags;
-	__u64	reserved1[9];
+
+	__u64	reserved0;
+	__u64	reserved1;
+	__u64   reserved2;
 };
 
 #define		UBLK_IO_OP_READ		0
-- 
cgit 


From 4e18403d9485a43e1b54397df258b8df7dac9a83 Mon Sep 17 00:00:00 2001
From: ZiyangZhang <ZiyangZhang@linux.alibaba.com>
Date: Thu, 28 Jul 2022 20:39:15 +0800
Subject: ublk_cmd.h: add one new ublk command: UBLK_IO_NEED_GET_DATA

Add one new ublk command: UBLK_IO_NEED_GET_DATA. It is prepared for a new
feature designed for a user application who wants to allocate IO buffer
and set IO buffer address only after it receives an IO request from
ublksrv.

Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: ZiyangZhang <ZiyangZhang@linux.alibaba.com>
Link: https://lore.kernel.org/r/c8a64b6b51c78340da7daa9e1054608695e79619.1659011443.git.ZiyangZhang@linux.alibaba.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/ublk_cmd.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 57d86d0e8c5b..677edaab2b66 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -30,12 +30,21 @@
  *      this IO request, request's handling result is committed to ublk
  *      driver, meantime FETCH_REQ is piggyback, and FETCH_REQ has to be
  *      handled before completing io request.
+ *
+ * NEED_GET_DATA: only used for write requests to set io addr and copy data
+ *      When NEED_GET_DATA is set, ublksrv has to issue UBLK_IO_NEED_GET_DATA
+ *      command after ublk driver returns UBLK_IO_RES_NEED_GET_DATA.
+ *
+ *      It is only used if ublksrv set UBLK_F_NEED_GET_DATA flag
+ *      while starting a ublk device.
  */
 #define	UBLK_IO_FETCH_REQ		0x20
 #define	UBLK_IO_COMMIT_AND_FETCH_REQ	0x21
+#define	UBLK_IO_NEED_GET_DATA	0x22
 
 /* only ABORT means that no re-fetch */
 #define UBLK_IO_RES_OK			0
+#define UBLK_IO_RES_NEED_GET_DATA	1
 #define UBLK_IO_RES_ABORT		(-ENODEV)
 
 #define UBLKSRV_CMD_BUF_OFFSET	0
@@ -56,6 +65,15 @@
  */
 #define UBLK_F_URING_CMD_COMP_IN_TASK	(1ULL << 1)
 
+/*
+ * User should issue io cmd again for write requests to
+ * set io buffer address and copy data from bio vectors
+ * to the userspace io buffer.
+ *
+ * In this mode, task_work is not used.
+ */
+#define UBLK_F_NEED_GET_DATA (1UL << 2)
+
 /* device state */
 #define UBLK_S_DEV_DEAD	0
 #define UBLK_S_DEV_LIVE	1
-- 
cgit 


From cb3b3bf22cf33707d684e74207908ba0ef3b6467 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 8 Jun 2022 13:23:48 +0200
Subject: jbd2: rename jbd_debug() to jbd2_debug()

The name of jbd_debug() is confusing as all functions inside jbd2 have
jbd2_ prefix. Rename jbd_debug() to jbd2_debug(). No functional changes.

Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Lukas Czerner <lczerner@redhat.com>
Link: https://lore.kernel.org/r/20220608112355.4397-2-jack@suse.cz
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/checkpoint.c  |  6 +++---
 fs/jbd2/commit.c      | 30 +++++++++++++++---------------
 fs/jbd2/journal.c     | 34 +++++++++++++++++-----------------
 fs/jbd2/recovery.c    | 30 +++++++++++++++---------------
 fs/jbd2/revoke.c      |  8 ++++----
 fs/jbd2/transaction.c | 26 +++++++++++++-------------
 include/linux/jbd2.h  |  4 ++--
 7 files changed, 69 insertions(+), 69 deletions(-)

(limited to 'include')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 746132998c57..51bd38da21cd 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -203,7 +203,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
 	tid_t			this_tid;
 	int			result, batch_count = 0;
 
-	jbd_debug(1, "Start checkpoint\n");
+	jbd2_debug(1, "Start checkpoint\n");
 
 	/*
 	 * First thing: if there are any transactions in the log which
@@ -212,7 +212,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
 	 */
 	result = jbd2_cleanup_journal_tail(journal);
 	trace_jbd2_checkpoint(journal, result);
-	jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
+	jbd2_debug(1, "cleanup_journal_tail returned %d\n", result);
 	if (result <= 0)
 		return result;
 
@@ -804,5 +804,5 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
 
 	trace_jbd2_drop_transaction(journal, transaction);
 
-	jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
+	jbd2_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
 }
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index eb315e81f1a6..aa14f20241d7 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -421,7 +421,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 
 	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
 	if (journal->j_flags & JBD2_FLUSHED) {
-		jbd_debug(3, "super block updated\n");
+		jbd2_debug(3, "super block updated\n");
 		mutex_lock_io(&journal->j_checkpoint_mutex);
 		/*
 		 * We hold j_checkpoint_mutex so tail cannot change under us.
@@ -435,7 +435,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 						REQ_SYNC);
 		mutex_unlock(&journal->j_checkpoint_mutex);
 	} else {
-		jbd_debug(3, "superblock not updated\n");
+		jbd2_debug(3, "superblock not updated\n");
 	}
 
 	J_ASSERT(journal->j_running_transaction != NULL);
@@ -467,7 +467,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	commit_transaction = journal->j_running_transaction;
 
 	trace_jbd2_start_commit(journal, commit_transaction);
-	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
+	jbd2_debug(1, "JBD2: starting commit of transaction %d\n",
 			commit_transaction->t_tid);
 
 	write_lock(&journal->j_state_lock);
@@ -540,7 +540,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	__jbd2_journal_clean_checkpoint_list(journal, false);
 	spin_unlock(&journal->j_list_lock);
 
-	jbd_debug(3, "JBD2: commit phase 1\n");
+	jbd2_debug(3, "JBD2: commit phase 1\n");
 
 	/*
 	 * Clear revoked flag to reflect there is no revoked buffers
@@ -573,7 +573,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	wake_up(&journal->j_wait_transaction_locked);
 	write_unlock(&journal->j_state_lock);
 
-	jbd_debug(3, "JBD2: commit phase 2a\n");
+	jbd2_debug(3, "JBD2: commit phase 2a\n");
 
 	/*
 	 * Now start flushing things to disk, in the order they appear
@@ -586,7 +586,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	blk_start_plug(&plug);
 	jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);
 
-	jbd_debug(3, "JBD2: commit phase 2b\n");
+	jbd2_debug(3, "JBD2: commit phase 2b\n");
 
 	/*
 	 * Way to go: we have now written out all of the data for a
@@ -642,7 +642,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		if (!descriptor) {
 			J_ASSERT (bufs == 0);
 
-			jbd_debug(4, "JBD2: get descriptor\n");
+			jbd2_debug(4, "JBD2: get descriptor\n");
 
 			descriptor = jbd2_journal_get_descriptor_buffer(
 							commit_transaction,
@@ -652,7 +652,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 				continue;
 			}
 
-			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
+			jbd2_debug(4, "JBD2: got buffer %llu (%p)\n",
 				(unsigned long long)descriptor->b_blocknr,
 				descriptor->b_data);
 			tagp = &descriptor->b_data[sizeof(journal_header_t)];
@@ -737,7 +737,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		    commit_transaction->t_buffers == NULL ||
 		    space_left < tag_bytes + 16 + csum_size) {
 
-			jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
+			jbd2_debug(4, "JBD2: Submit %d IOs\n", bufs);
 
 			/* Write an end-of-descriptor marker before
                            submitting the IOs.  "tag" still points to
@@ -839,7 +839,7 @@ start_journal_io:
 	   so we incur less scheduling load.
 	*/
 
-	jbd_debug(3, "JBD2: commit phase 3\n");
+	jbd2_debug(3, "JBD2: commit phase 3\n");
 
 	while (!list_empty(&io_bufs)) {
 		struct buffer_head *bh = list_entry(io_bufs.prev,
@@ -882,7 +882,7 @@ start_journal_io:
 
 	J_ASSERT (commit_transaction->t_shadow_list == NULL);
 
-	jbd_debug(3, "JBD2: commit phase 4\n");
+	jbd2_debug(3, "JBD2: commit phase 4\n");
 
 	/* Here we wait for the revoke record and descriptor record buffers */
 	while (!list_empty(&log_bufs)) {
@@ -906,7 +906,7 @@ start_journal_io:
 	if (err)
 		jbd2_journal_abort(journal, err);
 
-	jbd_debug(3, "JBD2: commit phase 5\n");
+	jbd2_debug(3, "JBD2: commit phase 5\n");
 	write_lock(&journal->j_state_lock);
 	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
 	commit_transaction->t_state = T_COMMIT_JFLUSH;
@@ -945,7 +945,7 @@ start_journal_io:
            transaction can be removed from any checkpoint list it was on
            before. */
 
-	jbd_debug(3, "JBD2: commit phase 6\n");
+	jbd2_debug(3, "JBD2: commit phase 6\n");
 
 	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
 	J_ASSERT(commit_transaction->t_buffers == NULL);
@@ -1122,7 +1122,7 @@ restart_loop:
 
 	/* Done with this transaction! */
 
-	jbd_debug(3, "JBD2: commit phase 7\n");
+	jbd2_debug(3, "JBD2: commit phase 7\n");
 
 	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
 
@@ -1164,7 +1164,7 @@ restart_loop:
 		journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid);
 
 	trace_jbd2_end_commit(journal, commit_transaction);
-	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
+	jbd2_debug(1, "JBD2: commit %d complete, head %d\n",
 		  journal->j_commit_sequence, journal->j_tail_sequence);
 
 	write_lock(&journal->j_state_lock);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c0cbeeaec2d1..0a8ff211fac1 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -203,11 +203,11 @@ loop:
 	if (journal->j_flags & JBD2_UNMOUNT)
 		goto end_loop;
 
-	jbd_debug(1, "commit_sequence=%u, commit_request=%u\n",
+	jbd2_debug(1, "commit_sequence=%u, commit_request=%u\n",
 		journal->j_commit_sequence, journal->j_commit_request);
 
 	if (journal->j_commit_sequence != journal->j_commit_request) {
-		jbd_debug(1, "OK, requests differ\n");
+		jbd2_debug(1, "OK, requests differ\n");
 		write_unlock(&journal->j_state_lock);
 		del_timer_sync(&journal->j_commit_timer);
 		jbd2_journal_commit_transaction(journal);
@@ -222,7 +222,7 @@ loop:
 		 * good idea, because that depends on threads that may
 		 * be already stopped.
 		 */
-		jbd_debug(1, "Now suspending kjournald2\n");
+		jbd2_debug(1, "Now suspending kjournald2\n");
 		write_unlock(&journal->j_state_lock);
 		try_to_freeze();
 		write_lock(&journal->j_state_lock);
@@ -252,7 +252,7 @@ loop:
 		finish_wait(&journal->j_wait_commit, &wait);
 	}
 
-	jbd_debug(1, "kjournald2 wakes\n");
+	jbd2_debug(1, "kjournald2 wakes\n");
 
 	/*
 	 * Were we woken up by a commit wakeup event?
@@ -260,7 +260,7 @@ loop:
 	transaction = journal->j_running_transaction;
 	if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
 		journal->j_commit_request = transaction->t_tid;
-		jbd_debug(1, "woke because of timeout\n");
+		jbd2_debug(1, "woke because of timeout\n");
 	}
 	goto loop;
 
@@ -268,7 +268,7 @@ end_loop:
 	del_timer_sync(&journal->j_commit_timer);
 	journal->j_task = NULL;
 	wake_up(&journal->j_wait_done_commit);
-	jbd_debug(1, "Journal thread exiting.\n");
+	jbd2_debug(1, "Journal thread exiting.\n");
 	write_unlock(&journal->j_state_lock);
 	return 0;
 }
@@ -500,7 +500,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 		 */
 
 		journal->j_commit_request = target;
-		jbd_debug(1, "JBD2: requesting commit %u/%u\n",
+		jbd2_debug(1, "JBD2: requesting commit %u/%u\n",
 			  journal->j_commit_request,
 			  journal->j_commit_sequence);
 		journal->j_running_transaction->t_requested = jiffies;
@@ -705,7 +705,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
 	}
 #endif
 	while (tid_gt(tid, journal->j_commit_sequence)) {
-		jbd_debug(1, "JBD2: want %u, j_commit_sequence=%u\n",
+		jbd2_debug(1, "JBD2: want %u, j_commit_sequence=%u\n",
 				  tid, journal->j_commit_sequence);
 		read_unlock(&journal->j_state_lock);
 		wake_up(&journal->j_wait_commit);
@@ -1117,7 +1117,7 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
 		freed += journal->j_last - journal->j_first;
 
 	trace_jbd2_update_log_tail(journal, tid, block, freed);
-	jbd_debug(1,
+	jbd2_debug(1,
 		  "Cleaning journal tail from %u to %u (offset %lu), "
 		  "freeing %lu\n",
 		  journal->j_tail_sequence, tid, block, freed);
@@ -1496,7 +1496,7 @@ journal_t *jbd2_journal_init_inode(struct inode *inode)
 		return NULL;
 	}
 
-	jbd_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n",
+	jbd2_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n",
 		  inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size,
 		  inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
 
@@ -1575,7 +1575,7 @@ static int journal_reset(journal_t *journal)
 	 * attempting a write to a potential-readonly device.
 	 */
 	if (sb->s_start == 0) {
-		jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
+		jbd2_debug(1, "JBD2: Skipping superblock update on recovered sb "
 			"(start %ld, seq %u, errno %d)\n",
 			journal->j_tail, journal->j_tail_sequence,
 			journal->j_errno);
@@ -1678,7 +1678,7 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
 	}
 
 	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
-	jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
+	jbd2_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
 		  tail_block, tail_tid);
 
 	lock_buffer(journal->j_sb_buffer);
@@ -1719,7 +1719,7 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
 		return;
 	}
 
-	jbd_debug(1, "JBD2: Marking journal as empty (seq %u)\n",
+	jbd2_debug(1, "JBD2: Marking journal as empty (seq %u)\n",
 		  journal->j_tail_sequence);
 
 	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
@@ -1862,7 +1862,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
 	errcode = journal->j_errno;
 	if (errcode == -ESHUTDOWN)
 		errcode = 0;
-	jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
+	jbd2_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
 	sb->s_errno    = cpu_to_be32(errcode);
 
 	jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA);
@@ -2334,7 +2334,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
 	    compat & JBD2_FEATURE_COMPAT_CHECKSUM)
 		compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
 
-	jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
+	jbd2_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
 		  compat, ro, incompat);
 
 	sb = journal->j_superblock;
@@ -2403,7 +2403,7 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
 {
 	journal_superblock_t *sb;
 
-	jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
+	jbd2_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
 		  compat, ro, incompat);
 
 	sb = journal->j_superblock;
@@ -2860,7 +2860,7 @@ static struct journal_head *journal_alloc_journal_head(void)
 #endif
 	ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
 	if (!ret) {
-		jbd_debug(1, "out of memory for journal_head\n");
+		jbd2_debug(1, "out of memory for journal_head\n");
 		pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
 		ret = kmem_cache_zalloc(jbd2_journal_head_cache,
 				GFP_NOFS | __GFP_NOFAIL);
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 8ca3527189f8..63594030afd3 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -245,11 +245,11 @@ static int fc_do_one_pass(journal_t *journal,
 		return 0;
 
 	while (next_fc_block <= journal->j_fc_last) {
-		jbd_debug(3, "Fast commit replay: next block %ld\n",
+		jbd2_debug(3, "Fast commit replay: next block %ld\n",
 			  next_fc_block);
 		err = jread(&bh, journal, next_fc_block);
 		if (err) {
-			jbd_debug(3, "Fast commit replay: read error\n");
+			jbd2_debug(3, "Fast commit replay: read error\n");
 			break;
 		}
 
@@ -263,7 +263,7 @@ static int fc_do_one_pass(journal_t *journal,
 	}
 
 	if (err)
-		jbd_debug(3, "Fast commit replay failed, err = %d\n", err);
+		jbd2_debug(3, "Fast commit replay failed, err = %d\n", err);
 
 	return err;
 }
@@ -297,7 +297,7 @@ int jbd2_journal_recover(journal_t *journal)
 	 */
 
 	if (!sb->s_start) {
-		jbd_debug(1, "No recovery required, last transaction %d\n",
+		jbd2_debug(1, "No recovery required, last transaction %d\n",
 			  be32_to_cpu(sb->s_sequence));
 		journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
 		return 0;
@@ -309,10 +309,10 @@ int jbd2_journal_recover(journal_t *journal)
 	if (!err)
 		err = do_one_pass(journal, &info, PASS_REPLAY);
 
-	jbd_debug(1, "JBD2: recovery, exit status %d, "
+	jbd2_debug(1, "JBD2: recovery, exit status %d, "
 		  "recovered transactions %u to %u\n",
 		  err, info.start_transaction, info.end_transaction);
-	jbd_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n",
+	jbd2_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n",
 		  info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
 
 	/* Restart the log at the next transaction ID, thus invalidating
@@ -362,7 +362,7 @@ int jbd2_journal_skip_recovery(journal_t *journal)
 #ifdef CONFIG_JBD2_DEBUG
 		int dropped = info.end_transaction - 
 			be32_to_cpu(journal->j_superblock->s_sequence);
-		jbd_debug(1,
+		jbd2_debug(1,
 			  "JBD2: ignoring %d transaction%s from the journal.\n",
 			  dropped, (dropped == 1) ? "" : "s");
 #endif
@@ -484,7 +484,7 @@ static int do_one_pass(journal_t *journal,
 	if (pass == PASS_SCAN)
 		info->start_transaction = first_commit_ID;
 
-	jbd_debug(1, "Starting recovery pass %d\n", pass);
+	jbd2_debug(1, "Starting recovery pass %d\n", pass);
 
 	/*
 	 * Now we walk through the log, transaction by transaction,
@@ -510,7 +510,7 @@ static int do_one_pass(journal_t *journal,
 			if (tid_geq(next_commit_ID, info->end_transaction))
 				break;
 
-		jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
+		jbd2_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
 			  next_commit_ID, next_log_block,
 			  jbd2_has_feature_fast_commit(journal) ?
 			  journal->j_fc_last : journal->j_last);
@@ -519,7 +519,7 @@ static int do_one_pass(journal_t *journal,
 		 * either the next descriptor block or the final commit
 		 * record. */
 
-		jbd_debug(3, "JBD2: checking block %ld\n", next_log_block);
+		jbd2_debug(3, "JBD2: checking block %ld\n", next_log_block);
 		err = jread(&bh, journal, next_log_block);
 		if (err)
 			goto failed;
@@ -542,7 +542,7 @@ static int do_one_pass(journal_t *journal,
 
 		blocktype = be32_to_cpu(tmp->h_blocktype);
 		sequence = be32_to_cpu(tmp->h_sequence);
-		jbd_debug(3, "Found magic %d, sequence %d\n",
+		jbd2_debug(3, "Found magic %d, sequence %d\n",
 			  blocktype, sequence);
 
 		if (sequence != next_commit_ID) {
@@ -575,7 +575,7 @@ static int do_one_pass(journal_t *journal,
 					goto failed;
 				}
 				need_check_commit_time = true;
-				jbd_debug(1,
+				jbd2_debug(1,
 					"invalid descriptor block found in %lu\n",
 					next_log_block);
 			}
@@ -758,7 +758,7 @@ static int do_one_pass(journal_t *journal,
 				 * It likely does not belong to same journal,
 				 * just end this recovery with success.
 				 */
-				jbd_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n",
+				jbd2_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n",
 					  next_commit_ID);
 				brelse(bh);
 				goto done;
@@ -826,7 +826,7 @@ static int do_one_pass(journal_t *journal,
 			if (pass == PASS_SCAN &&
 			    !jbd2_descriptor_block_csum_verify(journal,
 							       bh->b_data)) {
-				jbd_debug(1, "JBD2: invalid revoke block found in %lu\n",
+				jbd2_debug(1, "JBD2: invalid revoke block found in %lu\n",
 					  next_log_block);
 				need_check_commit_time = true;
 			}
@@ -845,7 +845,7 @@ static int do_one_pass(journal_t *journal,
 			continue;
 
 		default:
-			jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
+			jbd2_debug(3, "Unrecognised magic %d, end of scan.\n",
 				  blocktype);
 			brelse(bh);
 			goto done;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index fa608788b93d..4556e4689024 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -398,7 +398,7 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
 	}
 	handle->h_revoke_credits--;
 
-	jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in);
+	jbd2_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in);
 	err = insert_revoke_hash(journal, blocknr,
 				handle->h_transaction->t_tid);
 	BUFFER_TRACE(bh_in, "exit");
@@ -428,7 +428,7 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 	int did_revoke = 0;	/* akpm: debug */
 	struct buffer_head *bh = jh2bh(jh);
 
-	jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
+	jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh);
 
 	/* Is the existing Revoke bit valid?  If so, we trust it, and
 	 * only perform the full cancel if the revoke bit is set.  If
@@ -444,7 +444,7 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 	if (need_cancel) {
 		record = find_revoke_record(journal, bh->b_blocknr);
 		if (record) {
-			jbd_debug(4, "cancelled existing revoke on "
+			jbd2_debug(4, "cancelled existing revoke on "
 				  "blocknr %llu\n", (unsigned long long)bh->b_blocknr);
 			spin_lock(&journal->j_revoke_lock);
 			list_del(&record->hash);
@@ -560,7 +560,7 @@ void jbd2_journal_write_revoke_records(transaction_t *transaction,
 	}
 	if (descriptor)
 		flush_descriptor(journal, descriptor, offset);
-	jbd_debug(1, "Wrote %d revoke records\n", count);
+	jbd2_debug(1, "Wrote %d revoke records\n", count);
 }
 
 /*
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e9c308ae475f..1d1a926b25c5 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -373,7 +373,7 @@ alloc_transaction:
 			return -ENOMEM;
 	}
 
-	jbd_debug(3, "New handle %p going live.\n", handle);
+	jbd2_debug(3, "New handle %p going live.\n", handle);
 
 	/*
 	 * We need to hold j_state_lock until t_updates has been incremented,
@@ -453,7 +453,7 @@ repeat:
 	handle->h_start_jiffies = jiffies;
 	atomic_inc(&transaction->t_updates);
 	atomic_inc(&transaction->t_handle_count);
-	jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
+	jbd2_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
 		  handle, blocks,
 		  atomic_read(&transaction->t_outstanding_credits),
 		  jbd2_log_space_left(journal));
@@ -674,7 +674,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
 
 	/* Don't extend a locked-down transaction! */
 	if (transaction->t_state != T_RUNNING) {
-		jbd_debug(3, "denied handle %p %d blocks: "
+		jbd2_debug(3, "denied handle %p %d blocks: "
 			  "transaction not running\n", handle, nblocks);
 		goto error_out;
 	}
@@ -689,7 +689,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
 				   &transaction->t_outstanding_credits);
 
 	if (wanted > journal->j_max_transaction_buffers) {
-		jbd_debug(3, "denied handle %p %d blocks: "
+		jbd2_debug(3, "denied handle %p %d blocks: "
 			  "transaction too large\n", handle, nblocks);
 		atomic_sub(nblocks, &transaction->t_outstanding_credits);
 		goto error_out;
@@ -707,7 +707,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
 	handle->h_revoke_credits_requested += revoke_records;
 	result = 0;
 
-	jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
+	jbd2_debug(3, "extended handle %p by %d\n", handle, nblocks);
 error_out:
 	read_unlock(&journal->j_state_lock);
 	return result;
@@ -795,7 +795,7 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records,
 	 * First unlink the handle from its current transaction, and start the
 	 * commit on that.
 	 */
-	jbd_debug(2, "restarting handle %p\n", handle);
+	jbd2_debug(2, "restarting handle %p\n", handle);
 	stop_this_handle(handle);
 	handle->h_transaction = NULL;
 
@@ -979,7 +979,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
 
 	journal = transaction->t_journal;
 
-	jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
+	jbd2_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
 
 	JBUFFER_TRACE(jh, "entry");
 repeat:
@@ -1271,7 +1271,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
 	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
 	int err;
 
-	jbd_debug(5, "journal_head %p\n", jh);
+	jbd2_debug(5, "journal_head %p\n", jh);
 	err = -EROFS;
 	if (is_handle_aborted(handle))
 		goto out;
@@ -1496,7 +1496,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 	 * of the running transaction.
 	 */
 	jh = bh2jh(bh);
-	jbd_debug(5, "journal_head %p\n", jh);
+	jbd2_debug(5, "journal_head %p\n", jh);
 	JBUFFER_TRACE(jh, "entry");
 
 	/*
@@ -1818,7 +1818,7 @@ int jbd2_journal_stop(handle_t *handle)
 	pid_t pid;
 
 	if (--handle->h_ref > 0) {
-		jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+		jbd2_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
 						 handle->h_ref);
 		if (is_handle_aborted(handle))
 			return -EIO;
@@ -1838,7 +1838,7 @@ int jbd2_journal_stop(handle_t *handle)
 	if (is_handle_aborted(handle))
 		err = -EIO;
 
-	jbd_debug(4, "Handle %p going down\n", handle);
+	jbd2_debug(4, "Handle %p going down\n", handle);
 	trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
 				tid, handle->h_type, handle->h_line_no,
 				jiffies - handle->h_start_jiffies,
@@ -1916,7 +1916,7 @@ int jbd2_journal_stop(handle_t *handle)
 		 * completes the commit thread, it just doesn't write
 		 * anything to disk. */
 
-		jbd_debug(2, "transaction too old, requesting commit for "
+		jbd2_debug(2, "transaction too old, requesting commit for "
 					"handle %p\n", handle);
 		/* This is non-blocking */
 		jbd2_log_start_commit(journal, tid);
@@ -2662,7 +2662,7 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
 		return -EROFS;
 	journal = transaction->t_journal;
 
-	jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
+	jbd2_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
 			transaction->t_tid);
 
 	spin_lock(&journal->j_list_lock);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index e79d6e0b14e8..d4d59e43769f 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -58,10 +58,10 @@ extern ushort jbd2_journal_enable_debug;
 void __jbd2_debug(int level, const char *file, const char *func,
 		  unsigned int line, const char *fmt, ...);
 
-#define jbd_debug(n, fmt, a...) \
+#define jbd2_debug(n, fmt, a...) \
 	__jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a)
 #else
-#define jbd_debug(n, fmt, a...)  no_printk(fmt, ##a)
+#define jbd2_debug(n, fmt, a...)  no_printk(fmt, ##a)
 #endif
 
 extern void *jbd2_alloc(size_t size, gfp_t flags);
-- 
cgit 


From 68af74e92a86984ca6d5f7b19ee8f8a30a550873 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 8 Jun 2022 13:23:49 +0200
Subject: jbd2: remove unused exports for jbd2 debugging

Jbd2 exports jbd2_journal_enable_debug and __jbd2_debug() depite the
first is used only in fs/jbd2/journal.c and the second only within jbd2
code. Remove the pointless exports make jbd2_journal_enable_debug
static.

Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Lukas Czerner <lczerner@redhat.com>
Link: https://lore.kernel.org/r/20220608112355.4397-3-jack@suse.cz
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/journal.c    | 4 +---
 include/linux/jbd2.h | 1 -
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 0a8ff211fac1..f38f57942700 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -49,8 +49,7 @@
 #include <asm/page.h>
 
 #ifdef CONFIG_JBD2_DEBUG
-ushort jbd2_journal_enable_debug __read_mostly;
-EXPORT_SYMBOL(jbd2_journal_enable_debug);
+static ushort jbd2_journal_enable_debug __read_mostly;
 
 module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644);
 MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2");
@@ -115,7 +114,6 @@ void __jbd2_debug(int level, const char *file, const char *func,
 	printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf);
 	va_end(args);
 }
-EXPORT_SYMBOL(__jbd2_debug);
 #endif
 
 /* Checksumming functions */
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index d4d59e43769f..6c2aa61e0f73 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -54,7 +54,6 @@
  * CONFIG_JBD2_DEBUG is on.
  */
 #define JBD2_EXPENSIVE_CHECKING
-extern ushort jbd2_journal_enable_debug;
 void __jbd2_debug(int level, const char *file, const char *func,
 		  unsigned int line, const char *fmt, ...);
 
-- 
cgit 


From d1324958567da957385d8d555a8b840b3bf8e6e3 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 8 Jun 2022 13:23:50 +0200
Subject: jbd2: unexport jbd2_log_start_commit()

jbd2_log_start_commit() is not used outside of jbd2 so unexport it. Also
make __jbd2_log_start_commit() static when we are at it.

Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Lukas Czerner <lczerner@redhat.com>
Link: https://lore.kernel.org/r/20220608112355.4397-4-jack@suse.cz
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/journal.c    | 3 +--
 include/linux/jbd2.h | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index f38f57942700..97e205a0689d 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -80,7 +80,6 @@ EXPORT_SYMBOL(jbd2_journal_errno);
 EXPORT_SYMBOL(jbd2_journal_ack_err);
 EXPORT_SYMBOL(jbd2_journal_clear_err);
 EXPORT_SYMBOL(jbd2_log_wait_commit);
-EXPORT_SYMBOL(jbd2_log_start_commit);
 EXPORT_SYMBOL(jbd2_journal_start_commit);
 EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
 EXPORT_SYMBOL(jbd2_journal_wipe);
@@ -479,7 +478,7 @@ repeat:
  * Called with j_state_lock locked for writing.
  * Returns true if a transaction commit was started.
  */
-int __jbd2_log_start_commit(journal_t *journal, tid_t target)
+static int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 {
 	/* Return if the txn has already requested to be committed */
 	if (journal->j_commit_request == target)
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 6c2aa61e0f73..164ddf1211c0 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1646,7 +1646,6 @@ extern void	jbd2_clear_buffer_revoked_flags(journal_t *journal);
  */
 
 int jbd2_log_start_commit(journal_t *journal, tid_t tid);
-int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
 int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
 int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
 int jbd2_transaction_committed(journal_t *journal, tid_t tid);
-- 
cgit 


From 3dc96bba65f53daa217f0a8f43edad145286a8f5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 12 Jul 2022 12:54:21 +0200
Subject: mbcache: add functions to delete entry if unused

Add function mb_cache_entry_delete_or_get() to delete mbcache entry if
it is unused and also add a function to wait for entry to become unused
- mb_cache_entry_wait_unused(). We do not share code between the two
deleting function as one of them will go away soon.

CC: stable@vger.kernel.org
Fixes: 82939d7999df ("ext4: convert to mbcache2")
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20220712105436.32204-2-jack@suse.cz
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/mbcache.c            | 66 +++++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/mbcache.h | 10 +++++++-
 2 files changed, 73 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/fs/mbcache.c b/fs/mbcache.c
index cfc28129fb6f..2010bc80a3f2 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -11,7 +11,7 @@
 /*
  * Mbcache is a simple key-value store. Keys need not be unique, however
  * key-value pairs are expected to be unique (we use this fact in
- * mb_cache_entry_delete()).
+ * mb_cache_entry_delete_or_get()).
  *
  * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
  * Ext4 also uses it for deduplication of xattr values stored in inodes.
@@ -125,6 +125,19 @@ void __mb_cache_entry_free(struct mb_cache_entry *entry)
 }
 EXPORT_SYMBOL(__mb_cache_entry_free);
 
+/*
+ * mb_cache_entry_wait_unused - wait to be the last user of the entry
+ *
+ * @entry - entry to work on
+ *
+ * Wait to be the last user of the entry.
+ */
+void mb_cache_entry_wait_unused(struct mb_cache_entry *entry)
+{
+	wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 3);
+}
+EXPORT_SYMBOL(mb_cache_entry_wait_unused);
+
 static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
 					   struct mb_cache_entry *entry,
 					   u32 key)
@@ -217,7 +230,7 @@ out:
 }
 EXPORT_SYMBOL(mb_cache_entry_get);
 
-/* mb_cache_entry_delete - remove a cache entry
+/* mb_cache_entry_delete - try to remove a cache entry
  * @cache - cache we work with
  * @key - key
  * @value - value
@@ -254,6 +267,55 @@ void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value)
 }
 EXPORT_SYMBOL(mb_cache_entry_delete);
 
+/* mb_cache_entry_delete_or_get - remove a cache entry if it has no users
+ * @cache - cache we work with
+ * @key - key
+ * @value - value
+ *
+ * Remove entry from cache @cache with key @key and value @value. The removal
+ * happens only if the entry is unused. The function returns NULL in case the
+ * entry was successfully removed or there's no entry in cache. Otherwise the
+ * function grabs reference of the entry that we failed to delete because it
+ * still has users and return it.
+ */
+struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
+						    u32 key, u64 value)
+{
+	struct hlist_bl_node *node;
+	struct hlist_bl_head *head;
+	struct mb_cache_entry *entry;
+
+	head = mb_cache_entry_head(cache, key);
+	hlist_bl_lock(head);
+	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
+		if (entry->e_key == key && entry->e_value == value) {
+			if (atomic_read(&entry->e_refcnt) > 2) {
+				atomic_inc(&entry->e_refcnt);
+				hlist_bl_unlock(head);
+				return entry;
+			}
+			/* We keep hash list reference to keep entry alive */
+			hlist_bl_del_init(&entry->e_hash_list);
+			hlist_bl_unlock(head);
+			spin_lock(&cache->c_list_lock);
+			if (!list_empty(&entry->e_list)) {
+				list_del_init(&entry->e_list);
+				if (!WARN_ONCE(cache->c_entry_count == 0,
+		"mbcache: attempt to decrement c_entry_count past zero"))
+					cache->c_entry_count--;
+				atomic_dec(&entry->e_refcnt);
+			}
+			spin_unlock(&cache->c_list_lock);
+			mb_cache_entry_put(cache, entry);
+			return NULL;
+		}
+	}
+	hlist_bl_unlock(head);
+
+	return NULL;
+}
+EXPORT_SYMBOL(mb_cache_entry_delete_or_get);
+
 /* mb_cache_entry_touch - cache entry got used
  * @cache - cache the entry belongs to
  * @entry - entry that got used
diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
index 20f1e3ff6013..8eca7f25c432 100644
--- a/include/linux/mbcache.h
+++ b/include/linux/mbcache.h
@@ -30,15 +30,23 @@ void mb_cache_destroy(struct mb_cache *cache);
 int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
 			  u64 value, bool reusable);
 void __mb_cache_entry_free(struct mb_cache_entry *entry);
+void mb_cache_entry_wait_unused(struct mb_cache_entry *entry);
 static inline int mb_cache_entry_put(struct mb_cache *cache,
 				     struct mb_cache_entry *entry)
 {
-	if (!atomic_dec_and_test(&entry->e_refcnt))
+	unsigned int cnt = atomic_dec_return(&entry->e_refcnt);
+
+	if (cnt > 0) {
+		if (cnt <= 3)
+			wake_up_var(&entry->e_refcnt);
 		return 0;
+	}
 	__mb_cache_entry_free(entry);
 	return 1;
 }
 
+struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
+						    u32 key, u64 value);
 void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value);
 struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
 					  u64 value);
-- 
cgit 


From 75896339e43176af078509c1fce94ee6df9ca1a7 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 12 Jul 2022 12:54:28 +0200
Subject: mbcache: Remove mb_cache_entry_delete()

Nobody uses mb_cache_entry_delete() anymore. Remove it.

Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20220712105436.32204-9-jack@suse.cz
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/mbcache.c            | 37 -------------------------------------
 include/linux/mbcache.h |  1 -
 2 files changed, 38 deletions(-)

(limited to 'include')

diff --git a/fs/mbcache.c b/fs/mbcache.c
index 2010bc80a3f2..d1ebb5df2856 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -230,43 +230,6 @@ out:
 }
 EXPORT_SYMBOL(mb_cache_entry_get);
 
-/* mb_cache_entry_delete - try to remove a cache entry
- * @cache - cache we work with
- * @key - key
- * @value - value
- *
- * Remove entry from cache @cache with key @key and value @value.
- */
-void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value)
-{
-	struct hlist_bl_node *node;
-	struct hlist_bl_head *head;
-	struct mb_cache_entry *entry;
-
-	head = mb_cache_entry_head(cache, key);
-	hlist_bl_lock(head);
-	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
-		if (entry->e_key == key && entry->e_value == value) {
-			/* We keep hash list reference to keep entry alive */
-			hlist_bl_del_init(&entry->e_hash_list);
-			hlist_bl_unlock(head);
-			spin_lock(&cache->c_list_lock);
-			if (!list_empty(&entry->e_list)) {
-				list_del_init(&entry->e_list);
-				if (!WARN_ONCE(cache->c_entry_count == 0,
-		"mbcache: attempt to decrement c_entry_count past zero"))
-					cache->c_entry_count--;
-				atomic_dec(&entry->e_refcnt);
-			}
-			spin_unlock(&cache->c_list_lock);
-			mb_cache_entry_put(cache, entry);
-			return;
-		}
-	}
-	hlist_bl_unlock(head);
-}
-EXPORT_SYMBOL(mb_cache_entry_delete);
-
 /* mb_cache_entry_delete_or_get - remove a cache entry if it has no users
  * @cache - cache we work with
  * @key - key
diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
index 8eca7f25c432..452b579856d4 100644
--- a/include/linux/mbcache.h
+++ b/include/linux/mbcache.h
@@ -47,7 +47,6 @@ static inline int mb_cache_entry_put(struct mb_cache *cache,
 
 struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
 						    u32 key, u64 value);
-void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value);
 struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
 					  u64 value);
 struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
-- 
cgit 


From 307af6c879377c1c63e71cbdd978201f9c7ee8df Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 12 Jul 2022 12:54:29 +0200
Subject: mbcache: automatically delete entries from cache on freeing

Use the fact that entries with elevated refcount are not removed from
the hash and just move removal of the entry from the hash to the entry
freeing time. When doing this we also change the generic code to hold
one reference to the cache entry, not two of them, which makes code
somewhat more obvious.

Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20220712105436.32204-10-jack@suse.cz
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/mbcache.c            | 108 +++++++++++++++++-------------------------------
 include/linux/mbcache.h |  24 +++++++----
 2 files changed, 55 insertions(+), 77 deletions(-)

(limited to 'include')

diff --git a/fs/mbcache.c b/fs/mbcache.c
index d1ebb5df2856..96f1d49d30a5 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -90,7 +90,7 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&entry->e_list);
-	/* One ref for hash, one ref returned */
+	/* Initial hash reference */
 	atomic_set(&entry->e_refcnt, 1);
 	entry->e_key = key;
 	entry->e_value = value;
@@ -106,21 +106,28 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
 		}
 	}
 	hlist_bl_add_head(&entry->e_hash_list, head);
-	hlist_bl_unlock(head);
-
+	/*
+	 * Add entry to LRU list before it can be found by
+	 * mb_cache_entry_delete() to avoid races
+	 */
 	spin_lock(&cache->c_list_lock);
 	list_add_tail(&entry->e_list, &cache->c_list);
-	/* Grab ref for LRU list */
-	atomic_inc(&entry->e_refcnt);
 	cache->c_entry_count++;
 	spin_unlock(&cache->c_list_lock);
+	hlist_bl_unlock(head);
 
 	return 0;
 }
 EXPORT_SYMBOL(mb_cache_entry_create);
 
-void __mb_cache_entry_free(struct mb_cache_entry *entry)
+void __mb_cache_entry_free(struct mb_cache *cache, struct mb_cache_entry *entry)
 {
+	struct hlist_bl_head *head;
+
+	head = mb_cache_entry_head(cache, entry->e_key);
+	hlist_bl_lock(head);
+	hlist_bl_del(&entry->e_hash_list);
+	hlist_bl_unlock(head);
 	kmem_cache_free(mb_entry_cache, entry);
 }
 EXPORT_SYMBOL(__mb_cache_entry_free);
@@ -134,7 +141,7 @@ EXPORT_SYMBOL(__mb_cache_entry_free);
  */
 void mb_cache_entry_wait_unused(struct mb_cache_entry *entry)
 {
-	wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 3);
+	wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 2);
 }
 EXPORT_SYMBOL(mb_cache_entry_wait_unused);
 
@@ -155,10 +162,9 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
 	while (node) {
 		entry = hlist_bl_entry(node, struct mb_cache_entry,
 				       e_hash_list);
-		if (entry->e_key == key && entry->e_reusable) {
-			atomic_inc(&entry->e_refcnt);
+		if (entry->e_key == key && entry->e_reusable &&
+		    atomic_inc_not_zero(&entry->e_refcnt))
 			goto out;
-		}
 		node = node->next;
 	}
 	entry = NULL;
@@ -218,10 +224,9 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
 	head = mb_cache_entry_head(cache, key);
 	hlist_bl_lock(head);
 	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
-		if (entry->e_key == key && entry->e_value == value) {
-			atomic_inc(&entry->e_refcnt);
+		if (entry->e_key == key && entry->e_value == value &&
+		    atomic_inc_not_zero(&entry->e_refcnt))
 			goto out;
-		}
 	}
 	entry = NULL;
 out:
@@ -244,37 +249,25 @@ EXPORT_SYMBOL(mb_cache_entry_get);
 struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
 						    u32 key, u64 value)
 {
-	struct hlist_bl_node *node;
-	struct hlist_bl_head *head;
 	struct mb_cache_entry *entry;
 
-	head = mb_cache_entry_head(cache, key);
-	hlist_bl_lock(head);
-	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
-		if (entry->e_key == key && entry->e_value == value) {
-			if (atomic_read(&entry->e_refcnt) > 2) {
-				atomic_inc(&entry->e_refcnt);
-				hlist_bl_unlock(head);
-				return entry;
-			}
-			/* We keep hash list reference to keep entry alive */
-			hlist_bl_del_init(&entry->e_hash_list);
-			hlist_bl_unlock(head);
-			spin_lock(&cache->c_list_lock);
-			if (!list_empty(&entry->e_list)) {
-				list_del_init(&entry->e_list);
-				if (!WARN_ONCE(cache->c_entry_count == 0,
-		"mbcache: attempt to decrement c_entry_count past zero"))
-					cache->c_entry_count--;
-				atomic_dec(&entry->e_refcnt);
-			}
-			spin_unlock(&cache->c_list_lock);
-			mb_cache_entry_put(cache, entry);
-			return NULL;
-		}
-	}
-	hlist_bl_unlock(head);
+	entry = mb_cache_entry_get(cache, key, value);
+	if (!entry)
+		return NULL;
+
+	/*
+	 * Drop the ref we got from mb_cache_entry_get() and the initial hash
+	 * ref if we are the last user
+	 */
+	if (atomic_cmpxchg(&entry->e_refcnt, 2, 0) != 2)
+		return entry;
 
+	spin_lock(&cache->c_list_lock);
+	if (!list_empty(&entry->e_list))
+		list_del_init(&entry->e_list);
+	cache->c_entry_count--;
+	spin_unlock(&cache->c_list_lock);
+	__mb_cache_entry_free(cache, entry);
 	return NULL;
 }
 EXPORT_SYMBOL(mb_cache_entry_delete_or_get);
@@ -306,42 +299,24 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
 				     unsigned long nr_to_scan)
 {
 	struct mb_cache_entry *entry;
-	struct hlist_bl_head *head;
 	unsigned long shrunk = 0;
 
 	spin_lock(&cache->c_list_lock);
 	while (nr_to_scan-- && !list_empty(&cache->c_list)) {
 		entry = list_first_entry(&cache->c_list,
 					 struct mb_cache_entry, e_list);
-		if (entry->e_referenced || atomic_read(&entry->e_refcnt) > 2) {
+		/* Drop initial hash reference if there is no user */
+		if (entry->e_referenced ||
+		    atomic_cmpxchg(&entry->e_refcnt, 1, 0) != 1) {
 			entry->e_referenced = 0;
 			list_move_tail(&entry->e_list, &cache->c_list);
 			continue;
 		}
 		list_del_init(&entry->e_list);
 		cache->c_entry_count--;
-		/*
-		 * We keep LRU list reference so that entry doesn't go away
-		 * from under us.
-		 */
 		spin_unlock(&cache->c_list_lock);
-		head = mb_cache_entry_head(cache, entry->e_key);
-		hlist_bl_lock(head);
-		/* Now a reliable check if the entry didn't get used... */
-		if (atomic_read(&entry->e_refcnt) > 2) {
-			hlist_bl_unlock(head);
-			spin_lock(&cache->c_list_lock);
-			list_add_tail(&entry->e_list, &cache->c_list);
-			cache->c_entry_count++;
-			continue;
-		}
-		if (!hlist_bl_unhashed(&entry->e_hash_list)) {
-			hlist_bl_del_init(&entry->e_hash_list);
-			atomic_dec(&entry->e_refcnt);
-		}
-		hlist_bl_unlock(head);
-		if (mb_cache_entry_put(cache, entry))
-			shrunk++;
+		__mb_cache_entry_free(cache, entry);
+		shrunk++;
 		cond_resched();
 		spin_lock(&cache->c_list_lock);
 	}
@@ -433,11 +408,6 @@ void mb_cache_destroy(struct mb_cache *cache)
 	 * point.
 	 */
 	list_for_each_entry_safe(entry, next, &cache->c_list, e_list) {
-		if (!hlist_bl_unhashed(&entry->e_hash_list)) {
-			hlist_bl_del_init(&entry->e_hash_list);
-			atomic_dec(&entry->e_refcnt);
-		} else
-			WARN_ON(1);
 		list_del(&entry->e_list);
 		WARN_ON(atomic_read(&entry->e_refcnt) != 1);
 		mb_cache_entry_put(cache, entry);
diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
index 452b579856d4..2da63fd7b98f 100644
--- a/include/linux/mbcache.h
+++ b/include/linux/mbcache.h
@@ -13,8 +13,16 @@ struct mb_cache;
 struct mb_cache_entry {
 	/* List of entries in cache - protected by cache->c_list_lock */
 	struct list_head	e_list;
-	/* Hash table list - protected by hash chain bitlock */
+	/*
+	 * Hash table list - protected by hash chain bitlock. The entry is
+	 * guaranteed to be hashed while e_refcnt > 0.
+	 */
 	struct hlist_bl_node	e_hash_list;
+	/*
+	 * Entry refcount. Once it reaches zero, entry is unhashed and freed.
+	 * While refcount > 0, the entry is guaranteed to stay in the hash and
+	 * e.g. mb_cache_entry_try_delete() will fail.
+	 */
 	atomic_t		e_refcnt;
 	/* Key in hash - stable during lifetime of the entry */
 	u32			e_key;
@@ -29,20 +37,20 @@ void mb_cache_destroy(struct mb_cache *cache);
 
 int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
 			  u64 value, bool reusable);
-void __mb_cache_entry_free(struct mb_cache_entry *entry);
+void __mb_cache_entry_free(struct mb_cache *cache,
+			   struct mb_cache_entry *entry);
 void mb_cache_entry_wait_unused(struct mb_cache_entry *entry);
-static inline int mb_cache_entry_put(struct mb_cache *cache,
-				     struct mb_cache_entry *entry)
+static inline void mb_cache_entry_put(struct mb_cache *cache,
+				      struct mb_cache_entry *entry)
 {
 	unsigned int cnt = atomic_dec_return(&entry->e_refcnt);
 
 	if (cnt > 0) {
-		if (cnt <= 3)
+		if (cnt <= 2)
 			wake_up_var(&entry->e_refcnt);
-		return 0;
+		return;
 	}
-	__mb_cache_entry_free(entry);
-	return 1;
+	__mb_cache_entry_free(cache, entry);
 }
 
 struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
-- 
cgit 


From b6e8d40d43ae4dec00c8fea2593eeea3114b8f44 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 2 Aug 2022 21:54:51 -0400
Subject: sched, cpuset: Fix dl_cpu_busy() panic due to empty cs->cpus_allowed

With cgroup v2, the cpuset's cpus_allowed mask can be empty indicating
that the cpuset will just use the effective CPUs of its parent. So
cpuset_can_attach() can call task_can_attach() with an empty mask.
This can lead to cpumask_any_and() returns nr_cpu_ids causing the call
to dl_bw_of() to crash due to percpu value access of an out of bound
CPU value. For example:

	[80468.182258] BUG: unable to handle page fault for address: ffffffff8b6648b0
	  :
	[80468.191019] RIP: 0010:dl_cpu_busy+0x30/0x2b0
	  :
	[80468.207946] Call Trace:
	[80468.208947]  cpuset_can_attach+0xa0/0x140
	[80468.209953]  cgroup_migrate_execute+0x8c/0x490
	[80468.210931]  cgroup_update_dfl_csses+0x254/0x270
	[80468.211898]  cgroup_subtree_control_write+0x322/0x400
	[80468.212854]  kernfs_fop_write_iter+0x11c/0x1b0
	[80468.213777]  new_sync_write+0x11f/0x1b0
	[80468.214689]  vfs_write+0x1eb/0x280
	[80468.215592]  ksys_write+0x5f/0xe0
	[80468.216463]  do_syscall_64+0x5c/0x80
	[80468.224287]  entry_SYSCALL_64_after_hwframe+0x44/0xae

Fix that by using effective_cpus instead. For cgroup v1, effective_cpus
is the same as cpus_allowed. For v2, effective_cpus is the real cpumask
to be used by tasks within the cpuset anyway.

Also update task_can_attach()'s 2nd argument name to cs_effective_cpus to
reflect the change. In addition, a check is added to task_can_attach()
to guard against the possibility that cpumask_any_and() may return a
value >= nr_cpu_ids.

Fixes: 7f51412a415d ("sched/deadline: Fix bandwidth check/update when migrating tasks between exclusive cpusets")
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lore.kernel.org/r/20220803015451.2219567-1-longman@redhat.com
---
 include/linux/sched.h  | 2 +-
 kernel/cgroup/cpuset.c | 2 +-
 kernel/sched/core.c    | 8 +++++---
 3 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 88b8817b827d..6a060160f0db 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1813,7 +1813,7 @@ current_restore_flags(unsigned long orig_flags, unsigned long flags)
 }
 
 extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
-extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
+extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_effective_cpus);
 #ifdef CONFIG_SMP
 extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
 extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 71a418858a5e..58aadfda9b8b 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2239,7 +2239,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 		goto out_unlock;
 
 	cgroup_taskset_for_each(task, css, tset) {
-		ret = task_can_attach(task, cs->cpus_allowed);
+		ret = task_can_attach(task, cs->effective_cpus);
 		if (ret)
 			goto out_unlock;
 		ret = security_task_setscheduler(task);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5555e49c4e12..addc3c2d2122 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8980,7 +8980,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
 }
 
 int task_can_attach(struct task_struct *p,
-		    const struct cpumask *cs_cpus_allowed)
+		    const struct cpumask *cs_effective_cpus)
 {
 	int ret = 0;
 
@@ -8999,9 +8999,11 @@ int task_can_attach(struct task_struct *p,
 	}
 
 	if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
-					      cs_cpus_allowed)) {
-		int cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
+					      cs_effective_cpus)) {
+		int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus);
 
+		if (unlikely(cpu >= nr_cpu_ids))
+			return -EINVAL;
 		ret = dl_cpu_busy(cpu, p);
 	}
 
-- 
cgit 


From a8af0d682ae0c9cf62dd0ad6afdb1480951d6a10 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Thu, 30 Jun 2022 16:21:50 -0400
Subject: libceph: clean up ceph_osdc_start_request prototype

This function always returns 0, and ignores the nofail boolean. Drop the
nofail argument, make the function void return and fix up the callers.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 drivers/block/rbd.c             |  6 +++---
 fs/ceph/addr.c                  | 33 +++++++++++++--------------------
 fs/ceph/file.c                  | 32 +++++++++++++-------------------
 include/linux/ceph/osd_client.h |  5 ++---
 net/ceph/osd_client.c           | 15 ++++++---------
 5 files changed, 37 insertions(+), 54 deletions(-)

(limited to 'include')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index ef9bc62e9afd..b4580b73479f 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1297,7 +1297,7 @@ static void rbd_osd_submit(struct ceph_osd_request *osd_req)
 	dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
 	     __func__, osd_req, obj_req, obj_req->ex.oe_objno,
 	     obj_req->ex.oe_off, obj_req->ex.oe_len);
-	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
+	ceph_osdc_start_request(osd_req->r_osdc, osd_req);
 }
 
 /*
@@ -2081,7 +2081,7 @@ static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
 	if (ret)
 		return ret;
 
-	ceph_osdc_start_request(osdc, req, false);
+	ceph_osdc_start_request(osdc, req);
 	return 0;
 }
 
@@ -4768,7 +4768,7 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
 	if (ret)
 		goto out_req;
 
-	ceph_osdc_start_request(osdc, req, false);
+	ceph_osdc_start_request(osdc, req);
 	ret = ceph_osdc_wait_request(osdc, req);
 	if (ret >= 0)
 		ceph_copy_from_page_vector(pages, buf, 0, ret);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index de12715c237b..ec76e77f8d4b 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -337,6 +337,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
 	/* should always give us a page-aligned read */
 	WARN_ON_ONCE(page_off);
 	len = err;
+	err = 0;
 
 	osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
 	req->r_callback = finish_netfs_read;
@@ -344,9 +345,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
 	req->r_inode = inode;
 	ihold(inode);
 
-	err = ceph_osdc_start_request(req->r_osdc, req, false);
-	if (err)
-		iput(inode);
+	ceph_osdc_start_request(req->r_osdc, req);
 out:
 	ceph_osdc_put_request(req);
 	if (err)
@@ -620,9 +619,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len);
 
 	req->r_mtime = inode->i_mtime;
-	err = ceph_osdc_start_request(osdc, req, true);
-	if (!err)
-		err = ceph_osdc_wait_request(osdc, req);
+	ceph_osdc_start_request(osdc, req);
+	err = ceph_osdc_wait_request(osdc, req);
 
 	ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
 				  req->r_end_latency, len, err);
@@ -1150,8 +1148,7 @@ new_request:
 		}
 
 		req->r_mtime = inode->i_mtime;
-		rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
-		BUG_ON(rc);
+		ceph_osdc_start_request(&fsc->client->osdc, req);
 		req = NULL;
 
 		wbc->nr_to_write -= i;
@@ -1692,9 +1689,8 @@ int ceph_uninline_data(struct file *file)
 	}
 
 	req->r_mtime = inode->i_mtime;
-	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
-	if (!err)
-		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
+	ceph_osdc_start_request(&fsc->client->osdc, req);
+	err = ceph_osdc_wait_request(&fsc->client->osdc, req);
 	ceph_osdc_put_request(req);
 	if (err < 0)
 		goto out_unlock;
@@ -1735,9 +1731,8 @@ int ceph_uninline_data(struct file *file)
 	}
 
 	req->r_mtime = inode->i_mtime;
-	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
-	if (!err)
-		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
+	ceph_osdc_start_request(&fsc->client->osdc, req);
+	err = ceph_osdc_wait_request(&fsc->client->osdc, req);
 
 	ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
 				  req->r_end_latency, len, err);
@@ -1908,15 +1903,13 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
 
 	osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
 				     0, false, true);
-	err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
+	ceph_osdc_start_request(&fsc->client->osdc, rd_req);
 
 	wr_req->r_mtime = ci->netfs.inode.i_mtime;
-	err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
+	ceph_osdc_start_request(&fsc->client->osdc, wr_req);
 
-	if (!err)
-		err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
-	if (!err2)
-		err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req);
+	err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
+	err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req);
 
 	if (err >= 0 || err == -ENOENT)
 		have |= POOL_READ;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index b4e978420802..c3caa9bf9755 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -985,9 +985,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
 
 		osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
 						 false, false);
-		ret = ceph_osdc_start_request(osdc, req, false);
-		if (!ret)
-			ret = ceph_osdc_wait_request(osdc, req);
+		ceph_osdc_start_request(osdc, req);
+		ret = ceph_osdc_wait_request(osdc, req);
 
 		ceph_update_read_metrics(&fsc->mdsc->metric,
 					 req->r_start_latency,
@@ -1250,7 +1249,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
 	req->r_inode = inode;
 	req->r_priv = aio_req;
 
-	ret = ceph_osdc_start_request(req->r_osdc, req, false);
+	ceph_osdc_start_request(req->r_osdc, req);
 out:
 	if (ret < 0) {
 		req->r_result = ret;
@@ -1387,9 +1386,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 			continue;
 		}
 
-		ret = ceph_osdc_start_request(req->r_osdc, req, false);
-		if (!ret)
-			ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+		ceph_osdc_start_request(req->r_osdc, req);
+		ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
 
 		if (write)
 			ceph_update_write_metrics(metric, req->r_start_latency,
@@ -1452,8 +1450,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 					       r_private_item);
 			list_del_init(&req->r_private_item);
 			if (ret >= 0)
-				ret = ceph_osdc_start_request(req->r_osdc,
-							      req, false);
+				ceph_osdc_start_request(req->r_osdc, req);
 			if (ret < 0) {
 				req->r_result = ret;
 				ceph_aio_complete_req(req);
@@ -1566,9 +1563,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 						false, true);
 
 		req->r_mtime = mtime;
-		ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
-		if (!ret)
-			ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+		ceph_osdc_start_request(&fsc->client->osdc, req);
+		ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
 
 		ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
 					  req->r_end_latency, len, ret);
@@ -2032,12 +2028,10 @@ static int ceph_zero_partial_object(struct inode *inode,
 	}
 
 	req->r_mtime = inode->i_mtime;
-	ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
-	if (!ret) {
-		ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
-		if (ret == -ENOENT)
-			ret = 0;
-	}
+	ceph_osdc_start_request(&fsc->client->osdc, req);
+	ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+	if (ret == -ENOENT)
+		ret = 0;
 	ceph_osdc_put_request(req);
 
 out:
@@ -2339,7 +2333,7 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off
 		if (IS_ERR(req))
 			ret = PTR_ERR(req);
 		else {
-			ceph_osdc_start_request(osdc, req, false);
+			ceph_osdc_start_request(osdc, req);
 			ret = ceph_osdc_wait_request(osdc, req);
 			ceph_update_copyfrom_metrics(&fsc->mdsc->metric,
 						     req->r_start_latency,
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index cba8a6ffc329..fb6be72104df 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -507,9 +507,8 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
 extern void ceph_osdc_get_request(struct ceph_osd_request *req);
 extern void ceph_osdc_put_request(struct ceph_osd_request *req);
 
-extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
-				   struct ceph_osd_request *req,
-				   bool nofail);
+void ceph_osdc_start_request(struct ceph_osd_client *osdc,
+			     struct ceph_osd_request *req);
 extern void ceph_osdc_cancel_request(struct ceph_osd_request *req);
 extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
 				  struct ceph_osd_request *req);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 9d82bb42e958..87b883c7bfd6 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -4578,15 +4578,12 @@ bad:
 /*
  * Register request, send initial attempt.
  */
-int ceph_osdc_start_request(struct ceph_osd_client *osdc,
-			    struct ceph_osd_request *req,
-			    bool nofail)
+void ceph_osdc_start_request(struct ceph_osd_client *osdc,
+			     struct ceph_osd_request *req)
 {
 	down_read(&osdc->lock);
 	submit_request(req, false);
 	up_read(&osdc->lock);
-
-	return 0;
 }
 EXPORT_SYMBOL(ceph_osdc_start_request);
 
@@ -4756,7 +4753,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
 	if (ret)
 		goto out_put_req;
 
-	ceph_osdc_start_request(osdc, req, false);
+	ceph_osdc_start_request(osdc, req);
 	linger_cancel(lreq);
 	linger_put(lreq);
 	ret = wait_request_timeout(req, opts->mount_timeout);
@@ -4827,7 +4824,7 @@ int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
 	if (ret)
 		goto out_put_req;
 
-	ceph_osdc_start_request(osdc, req, false);
+	ceph_osdc_start_request(osdc, req);
 	ret = ceph_osdc_wait_request(osdc, req);
 
 out_put_req:
@@ -5043,7 +5040,7 @@ int ceph_osdc_list_watchers(struct ceph_osd_client *osdc,
 	if (ret)
 		goto out_put_req;
 
-	ceph_osdc_start_request(osdc, req, false);
+	ceph_osdc_start_request(osdc, req);
 	ret = ceph_osdc_wait_request(osdc, req);
 	if (ret >= 0) {
 		void *p = page_address(pages[0]);
@@ -5120,7 +5117,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
 	if (ret)
 		goto out_put_req;
 
-	ceph_osdc_start_request(osdc, req, false);
+	ceph_osdc_start_request(osdc, req);
 	ret = ceph_osdc_wait_request(osdc, req);
 	if (ret >= 0) {
 		ret = req->r_ops[0].rval;
-- 
cgit 


From 6ab4b1990097b76508bd6dffd85ffcacbedb26b2 Mon Sep 17 00:00:00 2001
From: Kajetan Puchalski <kajetan.puchalski@arm.com>
Date: Tue, 26 Jul 2022 11:24:04 +0100
Subject: cpuidle: Add cpu_idle_miss trace event

Add a trace event for cpuidle to track missed (too deep or too shallow)
wakeups.

After each wakeup, CPUIdle already computes whether the entered state was
optimal, above or below the desired one and updates the relevant
counters. This patch makes it possible to trace those events in addition
to just reading the counters.

The patterns of types and percentages of misses across different
workloads appear to be very consistent. This makes the trace event very
useful for comparing the relative correctness of different CPUIdle
governors for different types of workloads, or for finding the
optimal governor for a given device.

Signed-off-by: Kajetan Puchalski <kajetan.puchalski@arm.com>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/cpuidle.c    |  6 +++++-
 include/trace/events/power.h | 22 ++++++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index ef2ea1b12cd8..bf57cab32456 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -8,6 +8,7 @@
  * This code is licenced under the GPL.
  */
 
+#include "linux/percpu-defs.h"
 #include <linux/clockchips.h>
 #include <linux/kernel.h>
 #include <linux/mutex.h>
@@ -278,6 +279,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
 
 				/* Shallower states are enabled, so update. */
 				dev->states_usage[entered_state].above++;
+				trace_cpu_idle_miss(dev->cpu, entered_state, false);
 				break;
 			}
 		} else if (diff > delay) {
@@ -289,8 +291,10 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
 				 * Update if a deeper state would have been a
 				 * better match for the observed idle duration.
 				 */
-				if (diff - delay >= drv->states[i].target_residency_ns)
+				if (diff - delay >= drv->states[i].target_residency_ns) {
 					dev->states_usage[entered_state].below++;
+					trace_cpu_idle_miss(dev->cpu, entered_state, true);
+				}
 
 				break;
 			}
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index c708521e4ed5..77f14f7a11d4 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -40,6 +40,28 @@ DEFINE_EVENT(cpu, cpu_idle,
 	TP_ARGS(state, cpu_id)
 );
 
+TRACE_EVENT(cpu_idle_miss,
+
+	TP_PROTO(unsigned int cpu_id, unsigned int state, bool below),
+
+	TP_ARGS(cpu_id, state, below),
+
+	TP_STRUCT__entry(
+		__field(u32,		cpu_id)
+		__field(u32,		state)
+		__field(bool,		below)
+	),
+
+	TP_fast_assign(
+		__entry->cpu_id = cpu_id;
+		__entry->state = state;
+		__entry->below = below;
+	),
+
+	TP_printk("cpu_id=%lu state=%lu type=%s", (unsigned long)__entry->cpu_id,
+		(unsigned long)__entry->state, (__entry->below)?"below":"above")
+);
+
 TRACE_EVENT(powernv_throttle,
 
 	TP_PROTO(int chip_id, const char *reason, int pmax),
-- 
cgit 


From bed4593645366ad7362a3aa7bc0d100d8d8236a8 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhuacai@loongson.cn>
Date: Mon, 11 Jul 2022 09:17:38 +0800
Subject: tpm: eventlog: Fix section mismatch for DEBUG_SECTION_MISMATCH

If DEBUG_SECTION_MISMATCH enabled, __calc_tpm2_event_size() will not be
inlined, this cause section mismatch like this:

WARNING: modpost: vmlinux.o(.text.unlikely+0xe30c): Section mismatch in reference from the variable L0 to the function .init.text:early_ioremap()
The function L0() references
the function __init early_memremap().
This is often because L0 lacks a __init
annotation or the annotation of early_ioremap is wrong.

Fix it by using __always_inline instead of inline for the called-once
function __calc_tpm2_event_size().

Fixes: 44038bc514a2 ("tpm: Abstract crypto agile event size calculations")
Cc: stable@vger.kernel.org # v5.3
Reported-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 include/linux/tpm_eventlog.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h
index 739ba9a03ec1..20c0ff54b7a0 100644
--- a/include/linux/tpm_eventlog.h
+++ b/include/linux/tpm_eventlog.h
@@ -157,7 +157,7 @@ struct tcg_algorithm_info {
  * Return: size of the event on success, 0 on failure
  */
 
-static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
+static __always_inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
 					 struct tcg_pcr_event *event_header,
 					 bool do_mapping)
 {
-- 
cgit 


From 06799a9085e12a778fe2851db550ab5911ad28fe Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sun, 31 Jul 2022 15:41:05 +0300
Subject: net: bonding: replace dev_trans_start() with the jiffies of the last
 ARP/NS

The bonding driver piggybacks on time stamps kept by the network stack
for the purpose of the netdev TX watchdog, and this is problematic
because it does not work with NETIF_F_LLTX devices.

It is hard to say why the driver looks at dev_trans_start() of the
slave->dev, considering that this is updated even by non-ARP/NS probes
sent by us, and even by traffic not sent by us at all (for example PTP
on physical slave devices). ARP monitoring in active-backup mode appears
to still work even if we track only the last TX time of actual ARP
probes.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Acked-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/bonding/bond_main.c | 35 ++++++++++++++++++++---------------
 include/net/bonding.h           | 13 ++++++++++++-
 2 files changed, 32 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index e75acb14d066..ab7fdbbc2530 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2001,6 +2001,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
 	for (i = 0; i < BOND_MAX_ARP_TARGETS; i++)
 		new_slave->target_last_arp_rx[i] = new_slave->last_rx;
 
+	new_slave->last_tx = new_slave->last_rx;
+
 	if (bond->params.miimon && !bond->params.use_carrier) {
 		link_reporting = bond_check_dev_link(bond, slave_dev, 1);
 
@@ -2884,8 +2886,11 @@ static void bond_arp_send(struct slave *slave, int arp_op, __be32 dest_ip,
 		return;
 	}
 
-	if (bond_handle_vlan(slave, tags, skb))
+	if (bond_handle_vlan(slave, tags, skb)) {
+		slave_update_last_tx(slave);
 		arp_xmit(skb);
+	}
+
 	return;
 }
 
@@ -3074,8 +3079,7 @@ static int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond,
 			    curr_active_slave->last_link_up))
 		bond_validate_arp(bond, slave, tip, sip);
 	else if (curr_arp_slave && (arp->ar_op == htons(ARPOP_REPLY)) &&
-		 bond_time_in_interval(bond,
-				       dev_trans_start(curr_arp_slave->dev), 1))
+		 bond_time_in_interval(bond, slave_last_tx(curr_arp_slave), 1))
 		bond_validate_arp(bond, slave, sip, tip);
 
 out_unlock:
@@ -3103,8 +3107,10 @@ static void bond_ns_send(struct slave *slave, const struct in6_addr *daddr,
 	}
 
 	addrconf_addr_solict_mult(daddr, &mcaddr);
-	if (bond_handle_vlan(slave, tags, skb))
+	if (bond_handle_vlan(slave, tags, skb)) {
+		slave_update_last_tx(slave);
 		ndisc_send_skb(skb, &mcaddr, saddr);
+	}
 }
 
 static void bond_ns_send_all(struct bonding *bond, struct slave *slave)
@@ -3246,8 +3252,7 @@ static int bond_na_rcv(const struct sk_buff *skb, struct bonding *bond,
 			    curr_active_slave->last_link_up))
 		bond_validate_ns(bond, slave, saddr, daddr);
 	else if (curr_arp_slave &&
-		 bond_time_in_interval(bond,
-				       dev_trans_start(curr_arp_slave->dev), 1))
+		 bond_time_in_interval(bond, slave_last_tx(curr_arp_slave), 1))
 		bond_validate_ns(bond, slave, saddr, daddr);
 
 out:
@@ -3335,12 +3340,12 @@ static void bond_loadbalance_arp_mon(struct bonding *bond)
 	 *       so it can wait
 	 */
 	bond_for_each_slave_rcu(bond, slave, iter) {
-		unsigned long trans_start = dev_trans_start(slave->dev);
+		unsigned long last_tx = slave_last_tx(slave);
 
 		bond_propose_link_state(slave, BOND_LINK_NOCHANGE);
 
 		if (slave->link != BOND_LINK_UP) {
-			if (bond_time_in_interval(bond, trans_start, 1) &&
+			if (bond_time_in_interval(bond, last_tx, 1) &&
 			    bond_time_in_interval(bond, slave->last_rx, 1)) {
 
 				bond_propose_link_state(slave, BOND_LINK_UP);
@@ -3365,7 +3370,7 @@ static void bond_loadbalance_arp_mon(struct bonding *bond)
 			 * when the source ip is 0, so don't take the link down
 			 * if we don't know our ip yet
 			 */
-			if (!bond_time_in_interval(bond, trans_start, bond->params.missed_max) ||
+			if (!bond_time_in_interval(bond, last_tx, bond->params.missed_max) ||
 			    !bond_time_in_interval(bond, slave->last_rx, bond->params.missed_max)) {
 
 				bond_propose_link_state(slave, BOND_LINK_DOWN);
@@ -3431,7 +3436,7 @@ re_arm:
  */
 static int bond_ab_arp_inspect(struct bonding *bond)
 {
-	unsigned long trans_start, last_rx;
+	unsigned long last_tx, last_rx;
 	struct list_head *iter;
 	struct slave *slave;
 	int commit = 0;
@@ -3482,9 +3487,9 @@ static int bond_ab_arp_inspect(struct bonding *bond)
 		 * - (more than missed_max*delta since receive AND
 		 *    the bond has an IP address)
 		 */
-		trans_start = dev_trans_start(slave->dev);
+		last_tx = slave_last_tx(slave);
 		if (bond_is_active_slave(slave) &&
-		    (!bond_time_in_interval(bond, trans_start, bond->params.missed_max) ||
+		    (!bond_time_in_interval(bond, last_tx, bond->params.missed_max) ||
 		     !bond_time_in_interval(bond, last_rx, bond->params.missed_max))) {
 			bond_propose_link_state(slave, BOND_LINK_DOWN);
 			commit++;
@@ -3501,8 +3506,8 @@ static int bond_ab_arp_inspect(struct bonding *bond)
  */
 static void bond_ab_arp_commit(struct bonding *bond)
 {
-	unsigned long trans_start;
 	struct list_head *iter;
+	unsigned long last_tx;
 	struct slave *slave;
 
 	bond_for_each_slave(bond, slave, iter) {
@@ -3511,10 +3516,10 @@ static void bond_ab_arp_commit(struct bonding *bond)
 			continue;
 
 		case BOND_LINK_UP:
-			trans_start = dev_trans_start(slave->dev);
+			last_tx = slave_last_tx(slave);
 			if (rtnl_dereference(bond->curr_active_slave) != slave ||
 			    (!rtnl_dereference(bond->curr_active_slave) &&
-			     bond_time_in_interval(bond, trans_start, 1))) {
+			     bond_time_in_interval(bond, last_tx, 1))) {
 				struct slave *current_arp_slave;
 
 				current_arp_slave = rtnl_dereference(bond->current_arp_slave);
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 6e78d657aa05..afd606df149a 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -161,8 +161,9 @@ struct slave {
 	struct net_device *dev; /* first - useful for panic debug */
 	struct bonding *bond; /* our master */
 	int    delay;
-	/* all three in jiffies */
+	/* all 4 in jiffies */
 	unsigned long last_link_up;
+	unsigned long last_tx;
 	unsigned long last_rx;
 	unsigned long target_last_arp_rx[BOND_MAX_ARP_TARGETS];
 	s8     link;		/* one of BOND_LINK_XXXX */
@@ -540,6 +541,16 @@ static inline unsigned long slave_last_rx(struct bonding *bond,
 	return slave->last_rx;
 }
 
+static inline void slave_update_last_tx(struct slave *slave)
+{
+	WRITE_ONCE(slave->last_tx, jiffies);
+}
+
+static inline unsigned long slave_last_tx(struct slave *slave)
+{
+	return READ_ONCE(slave->last_tx);
+}
+
 #ifdef CONFIG_NET_POLL_CONTROLLER
 static inline netdev_tx_t bond_netpoll_send_skb(const struct slave *slave,
 					 struct sk_buff *skb)
-- 
cgit 


From 6930bcbfb6ceda63e298c6af6d733ecdf6bd4cde Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 1 Aug 2022 15:57:26 -0400
Subject: lockd: detect and reject lock arguments that overflow

lockd doesn't currently vet the start and length in nlm4 requests like
it should, and can end up generating lock requests with arguments that
overflow when passed to the filesystem.

The NLM4 protocol uses unsigned 64-bit arguments for both start and
length, whereas struct file_lock tracks the start and end as loff_t
values. By the time we get around to calling nlm4svc_retrieve_args,
we've lost the information that would allow us to determine if there was
an overflow.

Start tracking the actual start and len for NLM4 requests in the
nlm_lock. In nlm4svc_retrieve_args, vet these values to ensure they
won't cause an overflow, and return NLM4_FBIG if they do.

Link: https://bugzilla.linux-nfs.org/show_bug.cgi?id=392
Reported-by: Jan Kasiak <j.kasiak@gmail.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: <stable@vger.kernel.org> # 5.14+
---
 fs/lockd/svc4proc.c       |  8 ++++++++
 fs/lockd/xdr4.c           | 19 ++-----------------
 include/linux/lockd/xdr.h |  2 ++
 3 files changed, 12 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 4f247ab8be61..bf274f23969b 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -32,6 +32,10 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
 	if (!nlmsvc_ops)
 		return nlm_lck_denied_nolocks;
 
+	if (lock->lock_start > OFFSET_MAX ||
+	    (lock->lock_len && ((lock->lock_len - 1) > (OFFSET_MAX - lock->lock_start))))
+		return nlm4_fbig;
+
 	/* Obtain host handle */
 	if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len))
 	 || (argp->monitor && nsm_monitor(host) < 0))
@@ -50,6 +54,10 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
 		/* Set up the missing parts of the file_lock structure */
 		lock->fl.fl_file  = file->f_file[mode];
 		lock->fl.fl_pid = current->tgid;
+		lock->fl.fl_start = (loff_t)lock->lock_start;
+		lock->fl.fl_end = lock->lock_len ?
+				   (loff_t)(lock->lock_start + lock->lock_len - 1) :
+				   OFFSET_MAX;
 		lock->fl.fl_lmops = &nlmsvc_lock_operations;
 		nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid);
 		if (!lock->fl.fl_owner) {
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 856267c0864b..712fdfeb8ef0 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -20,13 +20,6 @@
 
 #include "svcxdr.h"
 
-static inline loff_t
-s64_to_loff_t(__s64 offset)
-{
-	return (loff_t)offset;
-}
-
-
 static inline s64
 loff_t_to_s64(loff_t offset)
 {
@@ -70,8 +63,6 @@ static bool
 svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock)
 {
 	struct file_lock *fl = &lock->fl;
-	u64 len, start;
-	s64 end;
 
 	if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
 		return false;
@@ -81,20 +72,14 @@ svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock)
 		return false;
 	if (xdr_stream_decode_u32(xdr, &lock->svid) < 0)
 		return false;
-	if (xdr_stream_decode_u64(xdr, &start) < 0)
+	if (xdr_stream_decode_u64(xdr, &lock->lock_start) < 0)
 		return false;
-	if (xdr_stream_decode_u64(xdr, &len) < 0)
+	if (xdr_stream_decode_u64(xdr, &lock->lock_len) < 0)
 		return false;
 
 	locks_init_lock(fl);
 	fl->fl_flags = FL_POSIX;
 	fl->fl_type  = F_RDLCK;
-	end = start + len - 1;
-	fl->fl_start = s64_to_loff_t(start);
-	if (len == 0 || end < 0)
-		fl->fl_end = OFFSET_MAX;
-	else
-		fl->fl_end = s64_to_loff_t(end);
 
 	return true;
 }
diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h
index 398f70093cd3..67e4a2c5500b 100644
--- a/include/linux/lockd/xdr.h
+++ b/include/linux/lockd/xdr.h
@@ -41,6 +41,8 @@ struct nlm_lock {
 	struct nfs_fh		fh;
 	struct xdr_netobj	oh;
 	u32			svid;
+	u64			lock_start;
+	u64			lock_len;
 	struct file_lock	fl;
 };
 
-- 
cgit 


From f482aa98652795846cc55da98ebe331eb74f3d0b Mon Sep 17 00:00:00 2001
From: Peilin Ye <peilin.ye@bytedance.com>
Date: Wed, 3 Aug 2022 15:23:43 -0700
Subject: audit, io_uring, io-wq: Fix memory leak in io_sq_thread() and
 io_wqe_worker()

Currently @audit_context is allocated twice for io_uring workers:

  1. copy_process() calls audit_alloc();
  2. io_sq_thread() or io_wqe_worker() calls audit_alloc_kernel() (which
     is effectively audit_alloc()) and overwrites @audit_context,
     causing:

  BUG: memory leak
  unreferenced object 0xffff888144547400 (size 1024):
<...>
    hex dump (first 32 bytes):
      00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00  ................
      00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    backtrace:
      [<ffffffff8135cfc3>] audit_alloc+0x133/0x210
      [<ffffffff81239e63>] copy_process+0xcd3/0x2340
      [<ffffffff8123b5f3>] create_io_thread+0x63/0x90
      [<ffffffff81686604>] create_io_worker+0xb4/0x230
      [<ffffffff81686f68>] io_wqe_enqueue+0x248/0x3b0
      [<ffffffff8167663a>] io_queue_iowq+0xba/0x200
      [<ffffffff816768b3>] io_queue_async+0x113/0x180
      [<ffffffff816840df>] io_req_task_submit+0x18f/0x1a0
      [<ffffffff816841cd>] io_apoll_task_func+0xdd/0x120
      [<ffffffff8167d49f>] tctx_task_work+0x11f/0x570
      [<ffffffff81272c4e>] task_work_run+0x7e/0xc0
      [<ffffffff8125a688>] get_signal+0xc18/0xf10
      [<ffffffff8111645b>] arch_do_signal_or_restart+0x2b/0x730
      [<ffffffff812ea44e>] exit_to_user_mode_prepare+0x5e/0x180
      [<ffffffff844ae1b2>] syscall_exit_to_user_mode+0x12/0x20
      [<ffffffff844a7e80>] do_syscall_64+0x40/0x80

Then,

  3. io_sq_thread() or io_wqe_worker() frees @audit_context using
     audit_free();
  4. do_exit() eventually calls audit_free() again, which is okay
     because audit_free() does a NULL check.

As suggested by Paul Moore, fix it by deleting audit_alloc_kernel() and
redundant audit_free() calls.

Fixes: 5bd2182d58e9 ("audit,io_uring,io-wq: add some basic audit support to io_uring")
Suggested-by: Paul Moore <paul@paul-moore.com>
Cc: stable@vger.kernel.org
Signed-off-by: Peilin Ye <peilin.ye@bytedance.com>
Acked-by: Paul Moore <paul@paul-moore.com>
Link: https://lore.kernel.org/r/20220803222343.31673-1-yepeilin.cs@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/audit.h |  5 -----
 io_uring/io-wq.c      |  3 ---
 io_uring/sqpoll.c     |  4 ----
 kernel/auditsc.c      | 25 -------------------------
 4 files changed, 37 deletions(-)

(limited to 'include')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 00f7a80f1a3e..3608992848d3 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -285,7 +285,6 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
 /* These are defined in auditsc.c */
 				/* Public API */
 extern int  audit_alloc(struct task_struct *task);
-extern int  audit_alloc_kernel(struct task_struct *task);
 extern void __audit_free(struct task_struct *task);
 extern void __audit_uring_entry(u8 op);
 extern void __audit_uring_exit(int success, long code);
@@ -578,10 +577,6 @@ static inline int audit_alloc(struct task_struct *task)
 {
 	return 0;
 }
-static inline int audit_alloc_kernel(struct task_struct *task)
-{
-	return 0;
-}
 static inline void audit_free(struct task_struct *task)
 { }
 static inline void audit_uring_entry(u8 op)
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 77df5b43bf52..c6536d4b2da0 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -624,8 +624,6 @@ static int io_wqe_worker(void *data)
 	snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid);
 	set_task_comm(current, buf);
 
-	audit_alloc_kernel(current);
-
 	while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
 		long ret;
 
@@ -660,7 +658,6 @@ static int io_wqe_worker(void *data)
 	if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
 		io_worker_handle_work(worker);
 
-	audit_free(current);
 	io_worker_exit(worker);
 	return 0;
 }
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 76d4d70c733a..559652380672 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -235,8 +235,6 @@ static int io_sq_thread(void *data)
 		set_cpus_allowed_ptr(current, cpu_online_mask);
 	current->flags |= PF_NO_SETAFFINITY;
 
-	audit_alloc_kernel(current);
-
 	mutex_lock(&sqd->lock);
 	while (1) {
 		bool cap_entries, sqt_spin = false;
@@ -310,8 +308,6 @@ static int io_sq_thread(void *data)
 	io_run_task_work();
 	mutex_unlock(&sqd->lock);
 
-	audit_free(current);
-
 	complete(&sqd->exited);
 	do_exit(0);
 }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3a8c9d744800..dd8d9ab747c3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1073,31 +1073,6 @@ int audit_alloc(struct task_struct *tsk)
 	return 0;
 }
 
-/**
- * audit_alloc_kernel - allocate an audit_context for a kernel task
- * @tsk: the kernel task
- *
- * Similar to the audit_alloc() function, but intended for kernel private
- * threads.  Returns zero on success, negative values on failure.
- */
-int audit_alloc_kernel(struct task_struct *tsk)
-{
-	/*
-	 * At the moment we are just going to call into audit_alloc() to
-	 * simplify the code, but there two things to keep in mind with this
-	 * approach:
-	 *
-	 * 1. Filtering internal kernel tasks is a bit laughable in almost all
-	 * cases, but there is at least one case where there is a benefit:
-	 * the '-a task,never' case allows the admin to effectively disable
-	 * task auditing at runtime.
-	 *
-	 * 2. The {set,clear}_task_syscall_work() ops likely have zero effect
-	 * on these internal kernel tasks, but they probably don't hurt either.
-	 */
-	return audit_alloc(tsk);
-}
-
 static inline void audit_free_context(struct audit_context *context)
 {
 	/* resetting is extra work, but it is likely just noise */
-- 
cgit 


From e2dcac2f58f5a95ab092d1da237ffdc0da1832cf Mon Sep 17 00:00:00 2001
From: Jinghao Jia <jinghao@linux.ibm.com>
Date: Fri, 29 Jul 2022 20:17:13 +0000
Subject: BPF: Fix potential bad pointer dereference in bpf_sys_bpf()

The bpf_sys_bpf() helper function allows an eBPF program to load another
eBPF program from within the kernel. In this case the argument union
bpf_attr pointer (as well as the insns and license pointers inside) is a
kernel address instead of a userspace address (which is the case of a
usual bpf() syscall). To make the memory copying process in the syscall
work in both cases, bpfptr_t was introduced to wrap around the pointer
and distinguish its origin. Specifically, when copying memory contents
from a bpfptr_t, a copy_from_user() is performed in case of a userspace
address and a memcpy() is performed for a kernel address.

This can lead to problems because the in-kernel pointer is never checked
for validity. The problem happens when an eBPF syscall program tries to
call bpf_sys_bpf() to load a program but provides a bad insns pointer --
say 0xdeadbeef -- in the bpf_attr union. The helper calls __sys_bpf()
which would then call bpf_prog_load() to load the program.
bpf_prog_load() is responsible for copying the eBPF instructions to the
newly allocated memory for the program; it creates a kernel bpfptr_t for
insns and invokes copy_from_bpfptr(). Internally, all bpfptr_t
operations are backed by the corresponding sockptr_t operations, which
performs direct memcpy() on kernel pointers for copy_from/strncpy_from
operations. Therefore, the code is always happy to dereference the bad
pointer to trigger a un-handle-able page fault and in turn an oops.
However, this is not supposed to happen because at that point the eBPF
program is already verified and should not cause a memory error.

Sample KASAN trace:

[   25.685056][  T228] ==================================================================
[   25.685680][  T228] BUG: KASAN: user-memory-access in copy_from_bpfptr+0x21/0x30
[   25.686210][  T228] Read of size 80 at addr 00000000deadbeef by task poc/228
[   25.686732][  T228]
[   25.686893][  T228] CPU: 3 PID: 228 Comm: poc Not tainted 5.19.0-rc7 #7
[   25.687375][  T228] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS d55cb5a 04/01/2014
[   25.687991][  T228] Call Trace:
[   25.688223][  T228]  <TASK>
[   25.688429][  T228]  dump_stack_lvl+0x73/0x9e
[   25.688747][  T228]  print_report+0xea/0x200
[   25.689061][  T228]  ? copy_from_bpfptr+0x21/0x30
[   25.689401][  T228]  ? _printk+0x54/0x6e
[   25.689693][  T228]  ? _raw_spin_lock_irqsave+0x70/0xd0
[   25.690071][  T228]  ? copy_from_bpfptr+0x21/0x30
[   25.690412][  T228]  kasan_report+0xb5/0xe0
[   25.690716][  T228]  ? copy_from_bpfptr+0x21/0x30
[   25.691059][  T228]  kasan_check_range+0x2bd/0x2e0
[   25.691405][  T228]  ? copy_from_bpfptr+0x21/0x30
[   25.691734][  T228]  memcpy+0x25/0x60
[   25.692000][  T228]  copy_from_bpfptr+0x21/0x30
[   25.692328][  T228]  bpf_prog_load+0x604/0x9e0
[   25.692653][  T228]  ? cap_capable+0xb4/0xe0
[   25.692956][  T228]  ? security_capable+0x4f/0x70
[   25.693324][  T228]  __sys_bpf+0x3af/0x580
[   25.693635][  T228]  bpf_sys_bpf+0x45/0x240
[   25.693937][  T228]  bpf_prog_f0ec79a5a3caca46_bpf_func1+0xa2/0xbd
[   25.694394][  T228]  bpf_prog_run_pin_on_cpu+0x2f/0xb0
[   25.694756][  T228]  bpf_prog_test_run_syscall+0x146/0x1c0
[   25.695144][  T228]  bpf_prog_test_run+0x172/0x190
[   25.695487][  T228]  __sys_bpf+0x2c5/0x580
[   25.695776][  T228]  __x64_sys_bpf+0x3a/0x50
[   25.696084][  T228]  do_syscall_64+0x60/0x90
[   25.696393][  T228]  ? fpregs_assert_state_consistent+0x50/0x60
[   25.696815][  T228]  ? exit_to_user_mode_prepare+0x36/0xa0
[   25.697202][  T228]  ? syscall_exit_to_user_mode+0x20/0x40
[   25.697586][  T228]  ? do_syscall_64+0x6e/0x90
[   25.697899][  T228]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
[   25.698312][  T228] RIP: 0033:0x7f6d543fb759
[   25.698624][  T228] Code: 08 5b 89 e8 5d c3 66 2e 0f 1f 84 00 00 00 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 97 a6 0e 00 f7 d8 64 89 01 48
[   25.699946][  T228] RSP: 002b:00007ffc3df78468 EFLAGS: 00000287 ORIG_RAX: 0000000000000141
[   25.700526][  T228] RAX: ffffffffffffffda RBX: 00007ffc3df78628 RCX: 00007f6d543fb759
[   25.701071][  T228] RDX: 0000000000000090 RSI: 00007ffc3df78478 RDI: 000000000000000a
[   25.701636][  T228] RBP: 00007ffc3df78510 R08: 0000000000000000 R09: 0000000000300000
[   25.702191][  T228] R10: 0000000000000005 R11: 0000000000000287 R12: 0000000000000000
[   25.702736][  T228] R13: 00007ffc3df78638 R14: 000055a1584aca68 R15: 00007f6d5456a000
[   25.703282][  T228]  </TASK>
[   25.703490][  T228] ==================================================================
[   25.704050][  T228] Disabling lock debugging due to kernel taint

Update copy_from_bpfptr() and strncpy_from_bpfptr() so that:
 - for a kernel pointer, it uses the safe copy_from_kernel_nofault() and
   strncpy_from_kernel_nofault() functions.
 - for a userspace pointer, it performs copy_from_user() and
   strncpy_from_user().

Fixes: af2ac3e13e45 ("bpf: Prepare bpf syscall to be used from kernel and user space.")
Link: https://lore.kernel.org/bpf/20220727132905.45166-1-jinghao@linux.ibm.com/
Signed-off-by: Jinghao Jia <jinghao@linux.ibm.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/r/20220729201713.88688-1-jinghao@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpfptr.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpfptr.h b/include/linux/bpfptr.h
index 46e1757d06a3..79b2f78eec1a 100644
--- a/include/linux/bpfptr.h
+++ b/include/linux/bpfptr.h
@@ -49,7 +49,9 @@ static inline void bpfptr_add(bpfptr_t *bpfptr, size_t val)
 static inline int copy_from_bpfptr_offset(void *dst, bpfptr_t src,
 					  size_t offset, size_t size)
 {
-	return copy_from_sockptr_offset(dst, (sockptr_t) src, offset, size);
+	if (!bpfptr_is_kernel(src))
+		return copy_from_user(dst, src.user + offset, size);
+	return copy_from_kernel_nofault(dst, src.kernel + offset, size);
 }
 
 static inline int copy_from_bpfptr(void *dst, bpfptr_t src, size_t size)
@@ -78,7 +80,9 @@ static inline void *kvmemdup_bpfptr(bpfptr_t src, size_t len)
 
 static inline long strncpy_from_bpfptr(char *dst, bpfptr_t src, size_t count)
 {
-	return strncpy_from_sockptr(dst, (sockptr_t) src, count);
+	if (bpfptr_is_kernel(src))
+		return strncpy_from_kernel_nofault(dst, src.kernel, count);
+	return strncpy_from_user(dst, src.user, count);
 }
 
 #endif /* _LINUX_BPFPTR_H */
-- 
cgit 


From 23339e5752d01a4b5e122759b002cf896d26f6c1 Mon Sep 17 00:00:00 2001
From: Daeho Jeong <daehojeong@google.com>
Date: Mon, 1 Aug 2022 10:08:08 -0700
Subject: f2fs: revive F2FS_IOC_ABORT_VOLATILE_WRITE

F2FS_IOC_ABORT_VOLATILE_WRITE was used to abort a atomic write before.
However it was removed accidentally. So revive it by changing the name,
since volatile write had gone.

Signed-off-by: Daeho Jeong <daehojeong@google.com>
Fiexes: 7bc155fec5b3("f2fs: kill volatile write support")
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/file.c            | 30 ++++++++++++++++++++++++++++--
 include/uapi/linux/f2fs.h |  2 +-
 2 files changed, 29 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 8098ed890e94..29711e5b6983 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -2109,6 +2109,31 @@ unlock_out:
 	return ret;
 }
 
+static int f2fs_ioc_abort_atomic_write(struct file *filp)
+{
+	struct inode *inode = file_inode(filp);
+	struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
+	int ret;
+
+	if (!inode_owner_or_capable(mnt_userns, inode))
+		return -EACCES;
+
+	ret = mnt_want_write_file(filp);
+	if (ret)
+		return ret;
+
+	inode_lock(inode);
+
+	if (f2fs_is_atomic_file(inode))
+		f2fs_abort_atomic_write(inode, true);
+
+	inode_unlock(inode);
+
+	mnt_drop_write_file(filp);
+	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+	return ret;
+}
+
 static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
@@ -4054,9 +4079,10 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return f2fs_ioc_start_atomic_write(filp);
 	case F2FS_IOC_COMMIT_ATOMIC_WRITE:
 		return f2fs_ioc_commit_atomic_write(filp);
+	case F2FS_IOC_ABORT_ATOMIC_WRITE:
+		return f2fs_ioc_abort_atomic_write(filp);
 	case F2FS_IOC_START_VOLATILE_WRITE:
 	case F2FS_IOC_RELEASE_VOLATILE_WRITE:
-	case F2FS_IOC_ABORT_VOLATILE_WRITE:
 		return -EOPNOTSUPP;
 	case F2FS_IOC_SHUTDOWN:
 		return f2fs_ioc_shutdown(filp, arg);
@@ -4725,7 +4751,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case F2FS_IOC_COMMIT_ATOMIC_WRITE:
 	case F2FS_IOC_START_VOLATILE_WRITE:
 	case F2FS_IOC_RELEASE_VOLATILE_WRITE:
-	case F2FS_IOC_ABORT_VOLATILE_WRITE:
+	case F2FS_IOC_ABORT_ATOMIC_WRITE:
 	case F2FS_IOC_SHUTDOWN:
 	case FITRIM:
 	case FS_IOC_SET_ENCRYPTION_POLICY:
diff --git a/include/uapi/linux/f2fs.h b/include/uapi/linux/f2fs.h
index 352a822d4370..3121d127d5aa 100644
--- a/include/uapi/linux/f2fs.h
+++ b/include/uapi/linux/f2fs.h
@@ -13,7 +13,7 @@
 #define F2FS_IOC_COMMIT_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 2)
 #define F2FS_IOC_START_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 3)
 #define F2FS_IOC_RELEASE_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 4)
-#define F2FS_IOC_ABORT_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 5)
+#define F2FS_IOC_ABORT_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 5)
 #define F2FS_IOC_GARBAGE_COLLECT	_IOW(F2FS_IOCTL_MAGIC, 6, __u32)
 #define F2FS_IOC_WRITE_CHECKPOINT	_IO(F2FS_IOCTL_MAGIC, 7)
 #define F2FS_IOC_DEFRAGMENT		_IOWR(F2FS_IOCTL_MAGIC, 8,	\
-- 
cgit 


From f1d41f7720c89705c20e4335a807b1c518c2e7be Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Tue, 2 Aug 2022 18:33:24 +0200
Subject: mptcp, btf: Add struct mptcp_sock definition when CONFIG_MPTCP is
 disabled

The btf_sock_ids array needs struct mptcp_sock BTF ID for the
bpf_skc_to_mptcp_sock helper.

When CONFIG_MPTCP is disabled, the 'struct mptcp_sock' is not
defined and resolve_btfids will complain with:

  [...]
  BTFIDS  vmlinux
  WARN: resolve_btfids: unresolved symbol mptcp_sock
  [...]

Add an empty definition for struct mptcp_sock when CONFIG_MPTCP
is disabled.

Fixes: 3bc253c2e652 ("bpf: Add bpf_skc_to_mptcp_sock_proto")
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20220802163324.1873044-1-jolsa@kernel.org
---
 include/net/mptcp.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index ac9cf7271d46..412479ebf5ad 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -291,4 +291,8 @@ struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk);
 static inline struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) { return NULL; }
 #endif
 
+#if !IS_ENABLED(CONFIG_MPTCP)
+struct mptcp_sock { };
+#endif
+
 #endif /* __NET_MPTCP_H */
-- 
cgit 


From 3c59366c207e4c6c6569524af606baf017a55c61 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 1 Aug 2022 10:33:34 +1000
Subject: NFS: don't unhash dentry during unlink/rename

NFS unlink() (and rename over existing target) must determine if the
file is open, and must perform a "silly rename" instead of an unlink (or
before rename) if it is.  Otherwise the client might hold a file open
which has been removed on the server.

Consequently if it determines that the file isn't open, it must block
any subsequent opens until the unlink/rename has been completed on the
server.

This is currently achieved by unhashing the dentry.  This forces any
open attempt to the slow-path for lookup which will block on i_rwsem on
the directory until the unlink/rename completes.  A future patch will
change the VFS to only get a shared lock on i_rwsem for unlink, so this
will no longer work.

Instead we introduce an explicit interlock.  A special value is stored
in dentry->d_fsdata while the unlink/rename is running and
->d_revalidate blocks while that value is present.  When ->d_revalidate
unblocks, the dentry will be invalid.  This closes the race
without requiring exclusion on i_rwsem.

d_fsdata is already used in two different ways.
1/ an IS_ROOT directory dentry might have a "devname" stored in
   d_fsdata.  Such a dentry doesn't have a name and so cannot be the
   target of unlink or rename.  For safety we check if an old devname
   is still stored, and remove it if it is.
2/ a dentry with DCACHE_NFSFS_RENAMED set will have a 'struct
   nfs_unlinkdata' stored in d_fsdata.  While this is set maydelete()
   will fail, so an unlink or rename will never proceed on such
   a dentry.

Neither of these can be in effect when a dentry is the target of unlink
or rename.  So we can expect d_fsdata to be NULL, and store a special
value ((void*)1) which is given the name NFS_FSDATA_BLOCKED to indicate
that any lookup will be blocked.

The d_count() is incremented under d_lock() when a lookup finds the
dentry, so we check d_count() is low, and set NFS_FSDATA_BLOCKED under
the same lock to avoid any races.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c           | 72 +++++++++++++++++++++++++++++++++++++-------------
 include/linux/nfs_fs.h |  9 +++++++
 2 files changed, 63 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7b2230297f6b..dbab3caa15ed 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1782,6 +1782,8 @@ __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
 	int ret;
 
 	if (flags & LOOKUP_RCU) {
+		if (dentry->d_fsdata == NFS_FSDATA_BLOCKED)
+			return -ECHILD;
 		parent = READ_ONCE(dentry->d_parent);
 		dir = d_inode_rcu(parent);
 		if (!dir)
@@ -1790,6 +1792,9 @@ __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
 		if (parent != READ_ONCE(dentry->d_parent))
 			return -ECHILD;
 	} else {
+		/* Wait for unlink to complete */
+		wait_var_event(&dentry->d_fsdata,
+			       dentry->d_fsdata != NFS_FSDATA_BLOCKED);
 		parent = dget_parent(dentry);
 		ret = reval(d_inode(parent), dentry, flags);
 		dput(parent);
@@ -2458,7 +2463,6 @@ out:
 int nfs_unlink(struct inode *dir, struct dentry *dentry)
 {
 	int error;
-	int need_rehash = 0;
 
 	dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id,
 		dir->i_ino, dentry);
@@ -2473,15 +2477,25 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
 		error = nfs_sillyrename(dir, dentry);
 		goto out;
 	}
-	if (!d_unhashed(dentry)) {
-		__d_drop(dentry);
-		need_rehash = 1;
-	}
+	/* We must prevent any concurrent open until the unlink
+	 * completes.  ->d_revalidate will wait for ->d_fsdata
+	 * to clear.  We set it here to ensure no lookup succeeds until
+	 * the unlink is complete on the server.
+	 */
+	error = -ETXTBSY;
+	if (WARN_ON(dentry->d_flags & DCACHE_NFSFS_RENAMED) ||
+	    WARN_ON(dentry->d_fsdata == NFS_FSDATA_BLOCKED))
+		goto out;
+	if (dentry->d_fsdata)
+		/* old devname */
+		kfree(dentry->d_fsdata);
+	dentry->d_fsdata = NFS_FSDATA_BLOCKED;
+
 	spin_unlock(&dentry->d_lock);
 	error = nfs_safe_remove(dentry);
 	nfs_dentry_remove_handle_error(dir, dentry, error);
-	if (need_rehash)
-		d_rehash(dentry);
+	dentry->d_fsdata = NULL;
+	wake_up_var(&dentry->d_fsdata);
 out:
 	trace_nfs_unlink_exit(dir, dentry, error);
 	return error;
@@ -2588,6 +2602,15 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
 }
 EXPORT_SYMBOL_GPL(nfs_link);
 
+static void
+nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data)
+{
+	struct dentry *new_dentry = data->new_dentry;
+
+	new_dentry->d_fsdata = NULL;
+	wake_up_var(&new_dentry->d_fsdata);
+}
+
 /*
  * RENAME
  * FIXME: Some nfsds, like the Linux user space nfsd, may generate a
@@ -2618,8 +2641,9 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 {
 	struct inode *old_inode = d_inode(old_dentry);
 	struct inode *new_inode = d_inode(new_dentry);
-	struct dentry *dentry = NULL, *rehash = NULL;
+	struct dentry *dentry = NULL;
 	struct rpc_task *task;
+	bool must_unblock = false;
 	int error = -EBUSY;
 
 	if (flags)
@@ -2637,18 +2661,27 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 	 * the new target.
 	 */
 	if (new_inode && !S_ISDIR(new_inode->i_mode)) {
-		/*
-		 * To prevent any new references to the target during the
-		 * rename, we unhash the dentry in advance.
+		/* We must prevent any concurrent open until the unlink
+		 * completes.  ->d_revalidate will wait for ->d_fsdata
+		 * to clear.  We set it here to ensure no lookup succeeds until
+		 * the unlink is complete on the server.
 		 */
-		if (!d_unhashed(new_dentry)) {
-			d_drop(new_dentry);
-			rehash = new_dentry;
+		error = -ETXTBSY;
+		if (WARN_ON(new_dentry->d_flags & DCACHE_NFSFS_RENAMED) ||
+		    WARN_ON(new_dentry->d_fsdata == NFS_FSDATA_BLOCKED))
+			goto out;
+		if (new_dentry->d_fsdata) {
+			/* old devname */
+			kfree(new_dentry->d_fsdata);
+			new_dentry->d_fsdata = NULL;
 		}
 
+		spin_lock(&new_dentry->d_lock);
 		if (d_count(new_dentry) > 2) {
 			int err;
 
+			spin_unlock(&new_dentry->d_lock);
+
 			/* copy the target dentry's name */
 			dentry = d_alloc(new_dentry->d_parent,
 					 &new_dentry->d_name);
@@ -2661,14 +2694,19 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 				goto out;
 
 			new_dentry = dentry;
-			rehash = NULL;
 			new_inode = NULL;
+		} else {
+			new_dentry->d_fsdata = NFS_FSDATA_BLOCKED;
+			must_unblock = true;
+			spin_unlock(&new_dentry->d_lock);
 		}
+
 	}
 
 	if (S_ISREG(old_inode->i_mode))
 		nfs_sync_inode(old_inode);
-	task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL);
+	task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry,
+				must_unblock ? nfs_unblock_rename : NULL);
 	if (IS_ERR(task)) {
 		error = PTR_ERR(task);
 		goto out;
@@ -2692,8 +2730,6 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 		spin_unlock(&old_inode->i_lock);
 	}
 out:
-	if (rehash)
-		d_rehash(rehash);
 	trace_nfs_rename_exit(old_dir, old_dentry,
 			new_dir, new_dentry, error);
 	if (!error) {
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index a17c337dbdf1..b32ed68e7dc4 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -617,6 +617,15 @@ nfs_fileid_to_ino_t(u64 fileid)
 
 #define NFS_JUKEBOX_RETRY_TIME (5 * HZ)
 
+/* We need to block new opens while a file is being unlinked.
+ * If it is opened *before* we decide to unlink, we will silly-rename
+ * instead. If it is opened *after*, then we need to create or will fail.
+ * If we allow the two to race, we could end up with a file that is open
+ * but deleted on the server resulting in ESTALE.
+ * So use ->d_fsdata to record when the unlink is happening
+ * and block dentry revalidation while it is set.
+ */
+#define NFS_FSDATA_BLOCKED ((void*)1)
 
 # undef ifdebug
 # ifdef NFS_DEBUG
-- 
cgit 


From 2da1c30929a28c3c6b01d9c16c4216037be95597 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Tue, 28 Jun 2022 17:22:28 +0800
Subject: mm: hugetlb_vmemmap: delete hugetlb_optimize_vmemmap_enabled()

Patch series "Simplify hugetlb vmemmap and improve its readability", v2.

This series aims to simplify hugetlb vmemmap and improve its readability.


This patch (of 8):

The name hugetlb_optimize_vmemmap_enabled() a bit confusing as it tests
two conditions (enabled and pages in use).  Instead of coming up to an
appropriate name, we could just delete it.  There is already a discussion
about deleting it in thread [1].

There is only one user of hugetlb_optimize_vmemmap_enabled() outside of
hugetlb_vmemmap, that is flush_dcache_page() in arch/arm64/mm/flush.c.
However, it does not need to call hugetlb_optimize_vmemmap_enabled() in
flush_dcache_page() since HugeTLB pages are always fully mapped and only
head page will be set PG_dcache_clean meaning only head page's flag may
need to be cleared (see commit cf5a501d985b).  So it is easy to remove
hugetlb_optimize_vmemmap_enabled().

Link: https://lore.kernel.org/all/c77c61c8-8a5a-87e8-db89-d04d8aaab4cc@oracle.com/ [1]
Link: https://lkml.kernel.org/r/20220628092235.91270-2-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/mm/flush.c      | 13 +++----------
 include/linux/page-flags.h | 14 ++------------
 2 files changed, 5 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index fc4f710e9820..5f9379b3c8c8 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -76,17 +76,10 @@ EXPORT_SYMBOL_GPL(__sync_icache_dcache);
 void flush_dcache_page(struct page *page)
 {
 	/*
-	 * Only the head page's flags of HugeTLB can be cleared since the tail
-	 * vmemmap pages associated with each HugeTLB page are mapped with
-	 * read-only when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is enabled (more
-	 * details can refer to vmemmap_remap_pte()).  Although
-	 * __sync_icache_dcache() only set PG_dcache_clean flag on the head
-	 * page struct, there is more than one page struct with PG_dcache_clean
-	 * associated with the HugeTLB page since the head vmemmap page frame
-	 * is reused (more details can refer to the comments above
-	 * page_fixed_fake_head()).
+	 * HugeTLB pages are always fully mapped and only head page will be
+	 * set PG_dcache_clean (see comments in __sync_icache_dcache()).
 	 */
-	if (hugetlb_optimize_vmemmap_enabled() && PageHuge(page))
+	if (PageHuge(page))
 		page = compound_head(page);
 
 	if (test_bit(PG_dcache_clean, &page->flags))
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index ea19528564d1..2455405ab82b 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -208,12 +208,6 @@ enum pageflags {
 DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
 			 hugetlb_optimize_vmemmap_key);
 
-static __always_inline bool hugetlb_optimize_vmemmap_enabled(void)
-{
-	return static_branch_maybe(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
-				   &hugetlb_optimize_vmemmap_key);
-}
-
 /*
  * If the feature of optimizing vmemmap pages associated with each HugeTLB
  * page is enabled, the head vmemmap page frame is reused and all of the tail
@@ -232,7 +226,8 @@ static __always_inline bool hugetlb_optimize_vmemmap_enabled(void)
  */
 static __always_inline const struct page *page_fixed_fake_head(const struct page *page)
 {
-	if (!hugetlb_optimize_vmemmap_enabled())
+	if (!static_branch_maybe(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
+				 &hugetlb_optimize_vmemmap_key))
 		return page;
 
 	/*
@@ -260,11 +255,6 @@ static inline const struct page *page_fixed_fake_head(const struct page *page)
 {
 	return page;
 }
-
-static inline bool hugetlb_optimize_vmemmap_enabled(void)
-{
-	return false;
-}
 #endif
 
 static __always_inline int page_is_fake_head(struct page *page)
-- 
cgit 


From cf5472e561133888df81d2e48f7da9ebd3299459 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Tue, 28 Jun 2022 17:22:29 +0800
Subject: mm: hugetlb_vmemmap: optimize vmemmap_optimize_mode handling

We hold an another reference to hugetlb_optimize_vmemmap_key when making
vmemmap_optimize_mode on, because we use static_key to tell memory_hotplug
that memory_hotplug.memmap_on_memory should be overridden.  However, this
rule has gone when we have introduced PageVmemmapSelfHosted.  Therefore,
we could simplify vmemmap_optimize_mode handling by not holding an another
reference to hugetlb_optimize_vmemmap_key.  This also means that we not
incur the extra page_fixed_fake_head checks if there are no vmemmap
optinmized hugetlb pages after this change.

Link: https://lkml.kernel.org/r/20220628092235.91270-3-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Will Deacon <will@kernel.org>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h |  6 ++---
 mm/hugetlb_vmemmap.c       | 65 +++++-----------------------------------------
 2 files changed, 9 insertions(+), 62 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 2455405ab82b..b44cc24d7496 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -205,8 +205,7 @@ enum pageflags {
 #ifndef __GENERATING_BOUNDS_H
 
 #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
-DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
-			 hugetlb_optimize_vmemmap_key);
+DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
 
 /*
  * If the feature of optimizing vmemmap pages associated with each HugeTLB
@@ -226,8 +225,7 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
  */
 static __always_inline const struct page *page_fixed_fake_head(const struct page *page)
 {
-	if (!static_branch_maybe(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
-				 &hugetlb_optimize_vmemmap_key))
+	if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key))
 		return page;
 
 	/*
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 1362feb3c6c9..e5b83a25c2fa 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -23,42 +23,15 @@
 #define RESERVE_VMEMMAP_NR		1U
 #define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)
 
-enum vmemmap_optimize_mode {
-	VMEMMAP_OPTIMIZE_OFF,
-	VMEMMAP_OPTIMIZE_ON,
-};
-
-DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
-			hugetlb_optimize_vmemmap_key);
+DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
 EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
 
-static enum vmemmap_optimize_mode vmemmap_optimize_mode =
+static bool vmemmap_optimize_enabled =
 	IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
 
-static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to)
-{
-	if (vmemmap_optimize_mode == to)
-		return;
-
-	if (to == VMEMMAP_OPTIMIZE_OFF)
-		static_branch_dec(&hugetlb_optimize_vmemmap_key);
-	else
-		static_branch_inc(&hugetlb_optimize_vmemmap_key);
-	WRITE_ONCE(vmemmap_optimize_mode, to);
-}
-
 static int __init hugetlb_vmemmap_early_param(char *buf)
 {
-	bool enable;
-	enum vmemmap_optimize_mode mode;
-
-	if (kstrtobool(buf, &enable))
-		return -EINVAL;
-
-	mode = enable ? VMEMMAP_OPTIMIZE_ON : VMEMMAP_OPTIMIZE_OFF;
-	vmemmap_optimize_mode_switch(mode);
-
-	return 0;
+	return kstrtobool(buf, &vmemmap_optimize_enabled);
 }
 early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_early_param);
 
@@ -100,7 +73,7 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
 static unsigned int vmemmap_optimizable_pages(struct hstate *h,
 					      struct page *head)
 {
-	if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
+	if (!READ_ONCE(vmemmap_optimize_enabled))
 		return 0;
 
 	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
@@ -191,7 +164,6 @@ void __init hugetlb_vmemmap_init(struct hstate *h)
 
 	if (!is_power_of_2(sizeof(struct page))) {
 		pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n");
-		static_branch_disable(&hugetlb_optimize_vmemmap_key);
 		return;
 	}
 
@@ -212,36 +184,13 @@ void __init hugetlb_vmemmap_init(struct hstate *h)
 }
 
 #ifdef CONFIG_PROC_SYSCTL
-static int hugetlb_optimize_vmemmap_handler(struct ctl_table *table, int write,
-					    void *buffer, size_t *length,
-					    loff_t *ppos)
-{
-	int ret;
-	enum vmemmap_optimize_mode mode;
-	static DEFINE_MUTEX(sysctl_mutex);
-
-	if (write && !capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	mutex_lock(&sysctl_mutex);
-	mode = vmemmap_optimize_mode;
-	table->data = &mode;
-	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
-	if (write && !ret)
-		vmemmap_optimize_mode_switch(mode);
-	mutex_unlock(&sysctl_mutex);
-
-	return ret;
-}
-
 static struct ctl_table hugetlb_vmemmap_sysctls[] = {
 	{
 		.procname	= "hugetlb_optimize_vmemmap",
-		.maxlen		= sizeof(enum vmemmap_optimize_mode),
+		.data		= &vmemmap_optimize_enabled,
+		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= hugetlb_optimize_vmemmap_handler,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
+		.proc_handler	= proc_dobool,
 	},
 	{ }
 };
-- 
cgit 


From dff033818a06e7d0bf79271e34bda11c2d9d98d0 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Tue, 28 Jun 2022 17:22:30 +0800
Subject: mm: hugetlb_vmemmap: introduce the name HVO

It it inconvenient to mention the feature of optimizing vmemmap pages
associated with HugeTLB pages when communicating with others since there
is no specific or abbreviated name for it when it is first introduced.
Let us give it a name HVO (HugeTLB Vmemmap Optimization) from now.

This commit also updates the document about "hugetlb_free_vmemmap" by the
way discussed in thread [1].

Link: https://lore.kernel.org/all/21aae898-d54d-cc4b-a11f-1bb7fddcfffa@redhat.com/ [1]
Link: https://lkml.kernel.org/r/20220628092235.91270-4-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Will Deacon <will@kernel.org>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  7 ++++---
 Documentation/admin-guide/mm/hugetlbpage.rst    |  4 ++--
 Documentation/admin-guide/mm/memory-hotplug.rst |  4 ++--
 Documentation/admin-guide/sysctl/vm.rst         |  3 +--
 Documentation/mm/vmemmap_dedup.rst              |  2 ++
 fs/Kconfig                                      | 12 +++++-------
 include/linux/page-flags.h                      |  3 +--
 mm/hugetlb_vmemmap.c                            |  8 ++++----
 mm/hugetlb_vmemmap.h                            |  4 ++--
 9 files changed, 23 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index bab2b0bf5988..8c06271835b0 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1731,12 +1731,13 @@
 	hugetlb_free_vmemmap=
 			[KNL] Reguires CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 			enabled.
+			Control if HugeTLB Vmemmap Optimization (HVO) is enabled.
 			Allows heavy hugetlb users to free up some more
 			memory (7 * PAGE_SIZE for each 2MB hugetlb page).
-			Format: { [oO][Nn]/Y/y/1 | [oO][Ff]/N/n/0 (default) }
+			Format: { on | off (default) }
 
-			[oO][Nn]/Y/y/1: enable the feature
-			[oO][Ff]/N/n/0: disable the feature
+			on: enable HVO
+			off: disable HVO
 
 			Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y,
 			the default is on.
diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst
index a90330d0a837..8e2727dc18d4 100644
--- a/Documentation/admin-guide/mm/hugetlbpage.rst
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -164,8 +164,8 @@ default_hugepagesz
 	will all result in 256 2M huge pages being allocated.  Valid default
 	huge page size is architecture dependent.
 hugetlb_free_vmemmap
-	When CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is set, this enables optimizing
-	unused vmemmap pages associated with each HugeTLB page.
+	When CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is set, this enables HugeTLB
+	Vmemmap Optimization (HVO).
 
 When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages``
 indicates the current number of pre-allocated huge pages of the default size.
diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst
index 0f56ecd8ac05..a3c9e8ad8fa0 100644
--- a/Documentation/admin-guide/mm/memory-hotplug.rst
+++ b/Documentation/admin-guide/mm/memory-hotplug.rst
@@ -653,8 +653,8 @@ block might fail:
 - Concurrent activity that operates on the same physical memory area, such as
   allocating gigantic pages, can result in temporary offlining failures.
 
-- Out of memory when dissolving huge pages, especially when freeing unused
-  vmemmap pages associated with each hugetlb page is enabled.
+- Out of memory when dissolving huge pages, especially when HugeTLB Vmemmap
+  Optimization (HVO) is enabled.
 
   Offlining code may be able to migrate huge page contents, but may not be able
   to dissolve the source huge page because it fails allocating (unmovable) pages
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index f74f722ad702..9b833e439f09 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -569,8 +569,7 @@ This knob is not available when the size of 'struct page' (a structure defined
 in include/linux/mm_types.h) is not power of two (an unusual system config could
 result in this).
 
-Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages
-associated with each HugeTLB page.
+Enable (set to 1) or disable (set to 0) HugeTLB Vmemmap Optimization (HVO).
 
 Once enabled, the vmemmap pages of subsequent allocation of HugeTLB pages from
 buddy allocator will be optimized (7 pages per 2MB HugeTLB page and 4095 pages
diff --git a/Documentation/mm/vmemmap_dedup.rst b/Documentation/mm/vmemmap_dedup.rst
index c9c495f62d12..7d7a161aa364 100644
--- a/Documentation/mm/vmemmap_dedup.rst
+++ b/Documentation/mm/vmemmap_dedup.rst
@@ -7,6 +7,8 @@ A vmemmap diet for HugeTLB and Device DAX
 HugeTLB
 =======
 
+This section is to explain how HugeTLB Vmemmap Optimization (HVO) works.
+
 The struct page structures (page structs) are used to describe a physical
 page frame. By default, there is a one-to-one mapping from a page frame to
 it's corresponding page struct.
diff --git a/fs/Kconfig b/fs/Kconfig
index 5976eb33535f..a547307c1ae8 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -247,8 +247,7 @@ config HUGETLB_PAGE
 
 #
 # Select this config option from the architecture Kconfig, if it is preferred
-# to enable the feature of minimizing overhead of struct page associated with
-# each HugeTLB page.
+# to enable the feature of HugeTLB Vmemmap Optimization (HVO).
 #
 config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	bool
@@ -259,14 +258,13 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	depends on SPARSEMEM_VMEMMAP
 
 config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
-	bool "Default optimizing vmemmap pages of HugeTLB to on"
+	bool "HugeTLB Vmemmap Optimization (HVO) defaults to on"
 	default n
 	depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	help
-	  When using HUGETLB_PAGE_OPTIMIZE_VMEMMAP, the optimizing unused vmemmap
-	  pages associated with each HugeTLB page is default off. Say Y here
-	  to enable optimizing vmemmap pages of HugeTLB by default. It can then
-	  be disabled on the command line via hugetlb_free_vmemmap=off.
+	  The HugeTLB VmemmapvOptimization (HVO) defaults to off. Say Y here to
+	  enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off
+	  (boot command line) or hugetlb_optimize_vmemmap (sysctl).
 
 config MEMFD_CREATE
 	def_bool TMPFS || HUGETLBFS
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index b44cc24d7496..78ed46ae6ee5 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -208,8 +208,7 @@ enum pageflags {
 DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
 
 /*
- * If the feature of optimizing vmemmap pages associated with each HugeTLB
- * page is enabled, the head vmemmap page frame is reused and all of the tail
+ * If HVO is enabled, the head vmemmap page frame is reused and all of the tail
  * vmemmap addresses map to the head vmemmap page frame (furture details can
  * refer to the figure at the head of the mm/hugetlb_vmemmap.c).  In other
  * words, there are more than one page struct with PG_head associated with each
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index e5b83a25c2fa..bcafd9d7639c 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -1,8 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Optimize vmemmap pages associated with HugeTLB
+ * HugeTLB Vmemmap Optimization (HVO)
  *
- * Copyright (c) 2020, Bytedance. All rights reserved.
+ * Copyright (c) 2020, ByteDance. All rights reserved.
  *
  *     Author: Muchun Song <songmuchun@bytedance.com>
  *
@@ -156,8 +156,8 @@ void __init hugetlb_vmemmap_init(struct hstate *h)
 
 	/*
 	 * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct
-	 * page structs that can be used when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP,
-	 * so add a BUILD_BUG_ON to catch invalid usage of the tail struct page.
+	 * page structs that can be used when HVO is enabled, add a BUILD_BUG_ON
+	 * to catch invalid usage of the tail page structs.
 	 */
 	BUILD_BUG_ON(__NR_USED_SUBPAGE >=
 		     RESERVE_VMEMMAP_SIZE / sizeof(struct page));
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index 109b0a53b6fe..ba66fadad9fc 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -1,8 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Optimize vmemmap pages associated with HugeTLB
+ * HugeTLB Vmemmap Optimization (HVO)
  *
- * Copyright (c) 2020, Bytedance. All rights reserved.
+ * Copyright (c) 2020, ByteDance. All rights reserved.
  *
  *     Author: Muchun Song <songmuchun@bytedance.com>
  */
-- 
cgit 


From 998a2997885f73e5cc732ac6d661dfa6e0f50654 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Tue, 28 Jun 2022 17:22:31 +0800
Subject: mm: hugetlb_vmemmap: move vmemmap code related to HugeTLB to
 hugetlb_vmemmap.c

When I first introduced vmemmap manipulation functions related to HugeTLB,
I thought those functions may be reused by other modules (e.g.  using
similar approach to optimize vmemmap pages, unfortunately, the DAX used
the same approach but does not use those functions).  After two years, we
didn't see any other users.  So move those functions to hugetlb_vmemmap.c.
Code movement without any functional change.

Link: https://lkml.kernel.org/r/20220628092235.91270-5-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Will Deacon <will@kernel.org>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h   |   7 -
 mm/hugetlb_vmemmap.c | 399 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 mm/sparse-vmemmap.c  | 399 ---------------------------------------------------
 3 files changed, 398 insertions(+), 407 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 18e01474cf6b..e6e201a4ce05 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3142,13 +3142,6 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
 }
 #endif
 
-#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
-int vmemmap_remap_free(unsigned long start, unsigned long end,
-		       unsigned long reuse);
-int vmemmap_remap_alloc(unsigned long start, unsigned long end,
-			unsigned long reuse, gfp_t gfp_mask);
-#endif
-
 void *sparse_buffer_alloc(unsigned long size);
 struct page * __populate_section_memmap(unsigned long pfn,
 		unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index bcafd9d7639c..f68e216600b9 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -10,9 +10,31 @@
  */
 #define pr_fmt(fmt)	"HugeTLB: " fmt
 
-#include <linux/memory.h>
+#include <linux/pgtable.h>
+#include <linux/bootmem_info.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
 #include "hugetlb_vmemmap.h"
 
+/**
+ * struct vmemmap_remap_walk - walk vmemmap page table
+ *
+ * @remap_pte:		called for each lowest-level entry (PTE).
+ * @nr_walked:		the number of walked pte.
+ * @reuse_page:		the page which is reused for the tail vmemmap pages.
+ * @reuse_addr:		the virtual address of the @reuse_page page.
+ * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
+ *			or is mapped from.
+ */
+struct vmemmap_remap_walk {
+	void			(*remap_pte)(pte_t *pte, unsigned long addr,
+					     struct vmemmap_remap_walk *walk);
+	unsigned long		nr_walked;
+	struct page		*reuse_page;
+	unsigned long		reuse_addr;
+	struct list_head	*vmemmap_pages;
+};
+
 /*
  * There are a lot of struct page structures associated with each HugeTLB page.
  * For tail pages, the value of compound_head is the same. So we can reuse first
@@ -23,6 +45,381 @@
 #define RESERVE_VMEMMAP_NR		1U
 #define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)
 
+static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
+{
+	pmd_t __pmd;
+	int i;
+	unsigned long addr = start;
+	struct page *page = pmd_page(*pmd);
+	pte_t *pgtable = pte_alloc_one_kernel(&init_mm);
+
+	if (!pgtable)
+		return -ENOMEM;
+
+	pmd_populate_kernel(&init_mm, &__pmd, pgtable);
+
+	for (i = 0; i < PMD_SIZE / PAGE_SIZE; i++, addr += PAGE_SIZE) {
+		pte_t entry, *pte;
+		pgprot_t pgprot = PAGE_KERNEL;
+
+		entry = mk_pte(page + i, pgprot);
+		pte = pte_offset_kernel(&__pmd, addr);
+		set_pte_at(&init_mm, addr, pte, entry);
+	}
+
+	spin_lock(&init_mm.page_table_lock);
+	if (likely(pmd_leaf(*pmd))) {
+		/*
+		 * Higher order allocations from buddy allocator must be able to
+		 * be treated as indepdenent small pages (as they can be freed
+		 * individually).
+		 */
+		if (!PageReserved(page))
+			split_page(page, get_order(PMD_SIZE));
+
+		/* Make pte visible before pmd. See comment in pmd_install(). */
+		smp_wmb();
+		pmd_populate_kernel(&init_mm, pmd, pgtable);
+		flush_tlb_kernel_range(start, start + PMD_SIZE);
+	} else {
+		pte_free_kernel(&init_mm, pgtable);
+	}
+	spin_unlock(&init_mm.page_table_lock);
+
+	return 0;
+}
+
+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
+{
+	int leaf;
+
+	spin_lock(&init_mm.page_table_lock);
+	leaf = pmd_leaf(*pmd);
+	spin_unlock(&init_mm.page_table_lock);
+
+	if (!leaf)
+		return 0;
+
+	return __split_vmemmap_huge_pmd(pmd, start);
+}
+
+static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
+			      unsigned long end,
+			      struct vmemmap_remap_walk *walk)
+{
+	pte_t *pte = pte_offset_kernel(pmd, addr);
+
+	/*
+	 * The reuse_page is found 'first' in table walk before we start
+	 * remapping (which is calling @walk->remap_pte).
+	 */
+	if (!walk->reuse_page) {
+		walk->reuse_page = pte_page(*pte);
+		/*
+		 * Because the reuse address is part of the range that we are
+		 * walking, skip the reuse address range.
+		 */
+		addr += PAGE_SIZE;
+		pte++;
+		walk->nr_walked++;
+	}
+
+	for (; addr != end; addr += PAGE_SIZE, pte++) {
+		walk->remap_pte(pte, addr, walk);
+		walk->nr_walked++;
+	}
+}
+
+static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
+			     unsigned long end,
+			     struct vmemmap_remap_walk *walk)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pmd = pmd_offset(pud, addr);
+	do {
+		int ret;
+
+		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
+		if (ret)
+			return ret;
+
+		next = pmd_addr_end(addr, end);
+		vmemmap_pte_range(pmd, addr, next, walk);
+	} while (pmd++, addr = next, addr != end);
+
+	return 0;
+}
+
+static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
+			     unsigned long end,
+			     struct vmemmap_remap_walk *walk)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_offset(p4d, addr);
+	do {
+		int ret;
+
+		next = pud_addr_end(addr, end);
+		ret = vmemmap_pmd_range(pud, addr, next, walk);
+		if (ret)
+			return ret;
+	} while (pud++, addr = next, addr != end);
+
+	return 0;
+}
+
+static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
+			     unsigned long end,
+			     struct vmemmap_remap_walk *walk)
+{
+	p4d_t *p4d;
+	unsigned long next;
+
+	p4d = p4d_offset(pgd, addr);
+	do {
+		int ret;
+
+		next = p4d_addr_end(addr, end);
+		ret = vmemmap_pud_range(p4d, addr, next, walk);
+		if (ret)
+			return ret;
+	} while (p4d++, addr = next, addr != end);
+
+	return 0;
+}
+
+static int vmemmap_remap_range(unsigned long start, unsigned long end,
+			       struct vmemmap_remap_walk *walk)
+{
+	unsigned long addr = start;
+	unsigned long next;
+	pgd_t *pgd;
+
+	VM_BUG_ON(!PAGE_ALIGNED(start));
+	VM_BUG_ON(!PAGE_ALIGNED(end));
+
+	pgd = pgd_offset_k(addr);
+	do {
+		int ret;
+
+		next = pgd_addr_end(addr, end);
+		ret = vmemmap_p4d_range(pgd, addr, next, walk);
+		if (ret)
+			return ret;
+	} while (pgd++, addr = next, addr != end);
+
+	/*
+	 * We only change the mapping of the vmemmap virtual address range
+	 * [@start + PAGE_SIZE, end), so we only need to flush the TLB which
+	 * belongs to the range.
+	 */
+	flush_tlb_kernel_range(start + PAGE_SIZE, end);
+
+	return 0;
+}
+
+/*
+ * Free a vmemmap page. A vmemmap page can be allocated from the memblock
+ * allocator or buddy allocator. If the PG_reserved flag is set, it means
+ * that it allocated from the memblock allocator, just free it via the
+ * free_bootmem_page(). Otherwise, use __free_page().
+ */
+static inline void free_vmemmap_page(struct page *page)
+{
+	if (PageReserved(page))
+		free_bootmem_page(page);
+	else
+		__free_page(page);
+}
+
+/* Free a list of the vmemmap pages */
+static void free_vmemmap_page_list(struct list_head *list)
+{
+	struct page *page, *next;
+
+	list_for_each_entry_safe(page, next, list, lru) {
+		list_del(&page->lru);
+		free_vmemmap_page(page);
+	}
+}
+
+static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
+			      struct vmemmap_remap_walk *walk)
+{
+	/*
+	 * Remap the tail pages as read-only to catch illegal write operation
+	 * to the tail pages.
+	 */
+	pgprot_t pgprot = PAGE_KERNEL_RO;
+	pte_t entry = mk_pte(walk->reuse_page, pgprot);
+	struct page *page = pte_page(*pte);
+
+	list_add_tail(&page->lru, walk->vmemmap_pages);
+	set_pte_at(&init_mm, addr, pte, entry);
+}
+
+/*
+ * How many struct page structs need to be reset. When we reuse the head
+ * struct page, the special metadata (e.g. page->flags or page->mapping)
+ * cannot copy to the tail struct page structs. The invalid value will be
+ * checked in the free_tail_pages_check(). In order to avoid the message
+ * of "corrupted mapping in tail page". We need to reset at least 3 (one
+ * head struct page struct and two tail struct page structs) struct page
+ * structs.
+ */
+#define NR_RESET_STRUCT_PAGE		3
+
+static inline void reset_struct_pages(struct page *start)
+{
+	int i;
+	struct page *from = start + NR_RESET_STRUCT_PAGE;
+
+	for (i = 0; i < NR_RESET_STRUCT_PAGE; i++)
+		memcpy(start + i, from, sizeof(*from));
+}
+
+static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
+				struct vmemmap_remap_walk *walk)
+{
+	pgprot_t pgprot = PAGE_KERNEL;
+	struct page *page;
+	void *to;
+
+	BUG_ON(pte_page(*pte) != walk->reuse_page);
+
+	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
+	list_del(&page->lru);
+	to = page_to_virt(page);
+	copy_page(to, (void *)walk->reuse_addr);
+	reset_struct_pages(to);
+
+	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
+}
+
+/**
+ * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
+ *			to the page which @reuse is mapped to, then free vmemmap
+ *			which the range are mapped to.
+ * @start:	start address of the vmemmap virtual address range that we want
+ *		to remap.
+ * @end:	end address of the vmemmap virtual address range that we want to
+ *		remap.
+ * @reuse:	reuse address.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+static int vmemmap_remap_free(unsigned long start, unsigned long end,
+			      unsigned long reuse)
+{
+	int ret;
+	LIST_HEAD(vmemmap_pages);
+	struct vmemmap_remap_walk walk = {
+		.remap_pte	= vmemmap_remap_pte,
+		.reuse_addr	= reuse,
+		.vmemmap_pages	= &vmemmap_pages,
+	};
+
+	/*
+	 * In order to make remapping routine most efficient for the huge pages,
+	 * the routine of vmemmap page table walking has the following rules
+	 * (see more details from the vmemmap_pte_range()):
+	 *
+	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
+	 *   should be continuous.
+	 * - The @reuse address is part of the range [@reuse, @end) that we are
+	 *   walking which is passed to vmemmap_remap_range().
+	 * - The @reuse address is the first in the complete range.
+	 *
+	 * So we need to make sure that @start and @reuse meet the above rules.
+	 */
+	BUG_ON(start - reuse != PAGE_SIZE);
+
+	mmap_read_lock(&init_mm);
+	ret = vmemmap_remap_range(reuse, end, &walk);
+	if (ret && walk.nr_walked) {
+		end = reuse + walk.nr_walked * PAGE_SIZE;
+		/*
+		 * vmemmap_pages contains pages from the previous
+		 * vmemmap_remap_range call which failed.  These
+		 * are pages which were removed from the vmemmap.
+		 * They will be restored in the following call.
+		 */
+		walk = (struct vmemmap_remap_walk) {
+			.remap_pte	= vmemmap_restore_pte,
+			.reuse_addr	= reuse,
+			.vmemmap_pages	= &vmemmap_pages,
+		};
+
+		vmemmap_remap_range(reuse, end, &walk);
+	}
+	mmap_read_unlock(&init_mm);
+
+	free_vmemmap_page_list(&vmemmap_pages);
+
+	return ret;
+}
+
+static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
+				   gfp_t gfp_mask, struct list_head *list)
+{
+	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
+	int nid = page_to_nid((struct page *)start);
+	struct page *page, *next;
+
+	while (nr_pages--) {
+		page = alloc_pages_node(nid, gfp_mask, 0);
+		if (!page)
+			goto out;
+		list_add_tail(&page->lru, list);
+	}
+
+	return 0;
+out:
+	list_for_each_entry_safe(page, next, list, lru)
+		__free_pages(page, 0);
+	return -ENOMEM;
+}
+
+/**
+ * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
+ *			 to the page which is from the @vmemmap_pages
+ *			 respectively.
+ * @start:	start address of the vmemmap virtual address range that we want
+ *		to remap.
+ * @end:	end address of the vmemmap virtual address range that we want to
+ *		remap.
+ * @reuse:	reuse address.
+ * @gfp_mask:	GFP flag for allocating vmemmap pages.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
+			       unsigned long reuse, gfp_t gfp_mask)
+{
+	LIST_HEAD(vmemmap_pages);
+	struct vmemmap_remap_walk walk = {
+		.remap_pte	= vmemmap_restore_pte,
+		.reuse_addr	= reuse,
+		.vmemmap_pages	= &vmemmap_pages,
+	};
+
+	/* See the comment in the vmemmap_remap_free(). */
+	BUG_ON(start - reuse != PAGE_SIZE);
+
+	if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
+		return -ENOMEM;
+
+	mmap_read_lock(&init_mm);
+	vmemmap_remap_range(reuse, end, &walk);
+	mmap_read_unlock(&init_mm);
+
+	return 0;
+}
+
 DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
 EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
 
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 5f0ed4717ed0..46ae542118c0 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -27,408 +27,9 @@
 #include <linux/spinlock.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
-#include <linux/pgtable.h>
-#include <linux/bootmem_info.h>
 
 #include <asm/dma.h>
 #include <asm/pgalloc.h>
-#include <asm/tlbflush.h>
-
-#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
-/**
- * struct vmemmap_remap_walk - walk vmemmap page table
- *
- * @remap_pte:		called for each lowest-level entry (PTE).
- * @nr_walked:		the number of walked pte.
- * @reuse_page:		the page which is reused for the tail vmemmap pages.
- * @reuse_addr:		the virtual address of the @reuse_page page.
- * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
- *			or is mapped from.
- */
-struct vmemmap_remap_walk {
-	void (*remap_pte)(pte_t *pte, unsigned long addr,
-			  struct vmemmap_remap_walk *walk);
-	unsigned long nr_walked;
-	struct page *reuse_page;
-	unsigned long reuse_addr;
-	struct list_head *vmemmap_pages;
-};
-
-static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
-{
-	pmd_t __pmd;
-	int i;
-	unsigned long addr = start;
-	struct page *page = pmd_page(*pmd);
-	pte_t *pgtable = pte_alloc_one_kernel(&init_mm);
-
-	if (!pgtable)
-		return -ENOMEM;
-
-	pmd_populate_kernel(&init_mm, &__pmd, pgtable);
-
-	for (i = 0; i < PMD_SIZE / PAGE_SIZE; i++, addr += PAGE_SIZE) {
-		pte_t entry, *pte;
-		pgprot_t pgprot = PAGE_KERNEL;
-
-		entry = mk_pte(page + i, pgprot);
-		pte = pte_offset_kernel(&__pmd, addr);
-		set_pte_at(&init_mm, addr, pte, entry);
-	}
-
-	spin_lock(&init_mm.page_table_lock);
-	if (likely(pmd_leaf(*pmd))) {
-		/*
-		 * Higher order allocations from buddy allocator must be able to
-		 * be treated as indepdenent small pages (as they can be freed
-		 * individually).
-		 */
-		if (!PageReserved(page))
-			split_page(page, get_order(PMD_SIZE));
-
-		/* Make pte visible before pmd. See comment in pmd_install(). */
-		smp_wmb();
-		pmd_populate_kernel(&init_mm, pmd, pgtable);
-		flush_tlb_kernel_range(start, start + PMD_SIZE);
-	} else {
-		pte_free_kernel(&init_mm, pgtable);
-	}
-	spin_unlock(&init_mm.page_table_lock);
-
-	return 0;
-}
-
-static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
-{
-	int leaf;
-
-	spin_lock(&init_mm.page_table_lock);
-	leaf = pmd_leaf(*pmd);
-	spin_unlock(&init_mm.page_table_lock);
-
-	if (!leaf)
-		return 0;
-
-	return __split_vmemmap_huge_pmd(pmd, start);
-}
-
-static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
-			      unsigned long end,
-			      struct vmemmap_remap_walk *walk)
-{
-	pte_t *pte = pte_offset_kernel(pmd, addr);
-
-	/*
-	 * The reuse_page is found 'first' in table walk before we start
-	 * remapping (which is calling @walk->remap_pte).
-	 */
-	if (!walk->reuse_page) {
-		walk->reuse_page = pte_page(*pte);
-		/*
-		 * Because the reuse address is part of the range that we are
-		 * walking, skip the reuse address range.
-		 */
-		addr += PAGE_SIZE;
-		pte++;
-		walk->nr_walked++;
-	}
-
-	for (; addr != end; addr += PAGE_SIZE, pte++) {
-		walk->remap_pte(pte, addr, walk);
-		walk->nr_walked++;
-	}
-}
-
-static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
-			     unsigned long end,
-			     struct vmemmap_remap_walk *walk)
-{
-	pmd_t *pmd;
-	unsigned long next;
-
-	pmd = pmd_offset(pud, addr);
-	do {
-		int ret;
-
-		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
-		if (ret)
-			return ret;
-
-		next = pmd_addr_end(addr, end);
-		vmemmap_pte_range(pmd, addr, next, walk);
-	} while (pmd++, addr = next, addr != end);
-
-	return 0;
-}
-
-static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
-			     unsigned long end,
-			     struct vmemmap_remap_walk *walk)
-{
-	pud_t *pud;
-	unsigned long next;
-
-	pud = pud_offset(p4d, addr);
-	do {
-		int ret;
-
-		next = pud_addr_end(addr, end);
-		ret = vmemmap_pmd_range(pud, addr, next, walk);
-		if (ret)
-			return ret;
-	} while (pud++, addr = next, addr != end);
-
-	return 0;
-}
-
-static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
-			     unsigned long end,
-			     struct vmemmap_remap_walk *walk)
-{
-	p4d_t *p4d;
-	unsigned long next;
-
-	p4d = p4d_offset(pgd, addr);
-	do {
-		int ret;
-
-		next = p4d_addr_end(addr, end);
-		ret = vmemmap_pud_range(p4d, addr, next, walk);
-		if (ret)
-			return ret;
-	} while (p4d++, addr = next, addr != end);
-
-	return 0;
-}
-
-static int vmemmap_remap_range(unsigned long start, unsigned long end,
-			       struct vmemmap_remap_walk *walk)
-{
-	unsigned long addr = start;
-	unsigned long next;
-	pgd_t *pgd;
-
-	VM_BUG_ON(!PAGE_ALIGNED(start));
-	VM_BUG_ON(!PAGE_ALIGNED(end));
-
-	pgd = pgd_offset_k(addr);
-	do {
-		int ret;
-
-		next = pgd_addr_end(addr, end);
-		ret = vmemmap_p4d_range(pgd, addr, next, walk);
-		if (ret)
-			return ret;
-	} while (pgd++, addr = next, addr != end);
-
-	/*
-	 * We only change the mapping of the vmemmap virtual address range
-	 * [@start + PAGE_SIZE, end), so we only need to flush the TLB which
-	 * belongs to the range.
-	 */
-	flush_tlb_kernel_range(start + PAGE_SIZE, end);
-
-	return 0;
-}
-
-/*
- * Free a vmemmap page. A vmemmap page can be allocated from the memblock
- * allocator or buddy allocator. If the PG_reserved flag is set, it means
- * that it allocated from the memblock allocator, just free it via the
- * free_bootmem_page(). Otherwise, use __free_page().
- */
-static inline void free_vmemmap_page(struct page *page)
-{
-	if (PageReserved(page))
-		free_bootmem_page(page);
-	else
-		__free_page(page);
-}
-
-/* Free a list of the vmemmap pages */
-static void free_vmemmap_page_list(struct list_head *list)
-{
-	struct page *page, *next;
-
-	list_for_each_entry_safe(page, next, list, lru) {
-		list_del(&page->lru);
-		free_vmemmap_page(page);
-	}
-}
-
-static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
-			      struct vmemmap_remap_walk *walk)
-{
-	/*
-	 * Remap the tail pages as read-only to catch illegal write operation
-	 * to the tail pages.
-	 */
-	pgprot_t pgprot = PAGE_KERNEL_RO;
-	pte_t entry = mk_pte(walk->reuse_page, pgprot);
-	struct page *page = pte_page(*pte);
-
-	list_add_tail(&page->lru, walk->vmemmap_pages);
-	set_pte_at(&init_mm, addr, pte, entry);
-}
-
-/*
- * How many struct page structs need to be reset. When we reuse the head
- * struct page, the special metadata (e.g. page->flags or page->mapping)
- * cannot copy to the tail struct page structs. The invalid value will be
- * checked in the free_tail_pages_check(). In order to avoid the message
- * of "corrupted mapping in tail page". We need to reset at least 3 (one
- * head struct page struct and two tail struct page structs) struct page
- * structs.
- */
-#define NR_RESET_STRUCT_PAGE		3
-
-static inline void reset_struct_pages(struct page *start)
-{
-	int i;
-	struct page *from = start + NR_RESET_STRUCT_PAGE;
-
-	for (i = 0; i < NR_RESET_STRUCT_PAGE; i++)
-		memcpy(start + i, from, sizeof(*from));
-}
-
-static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
-				struct vmemmap_remap_walk *walk)
-{
-	pgprot_t pgprot = PAGE_KERNEL;
-	struct page *page;
-	void *to;
-
-	BUG_ON(pte_page(*pte) != walk->reuse_page);
-
-	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
-	list_del(&page->lru);
-	to = page_to_virt(page);
-	copy_page(to, (void *)walk->reuse_addr);
-	reset_struct_pages(to);
-
-	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
-}
-
-/**
- * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
- *			to the page which @reuse is mapped to, then free vmemmap
- *			which the range are mapped to.
- * @start:	start address of the vmemmap virtual address range that we want
- *		to remap.
- * @end:	end address of the vmemmap virtual address range that we want to
- *		remap.
- * @reuse:	reuse address.
- *
- * Return: %0 on success, negative error code otherwise.
- */
-int vmemmap_remap_free(unsigned long start, unsigned long end,
-		       unsigned long reuse)
-{
-	int ret;
-	LIST_HEAD(vmemmap_pages);
-	struct vmemmap_remap_walk walk = {
-		.remap_pte	= vmemmap_remap_pte,
-		.reuse_addr	= reuse,
-		.vmemmap_pages	= &vmemmap_pages,
-	};
-
-	/*
-	 * In order to make remapping routine most efficient for the huge pages,
-	 * the routine of vmemmap page table walking has the following rules
-	 * (see more details from the vmemmap_pte_range()):
-	 *
-	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
-	 *   should be continuous.
-	 * - The @reuse address is part of the range [@reuse, @end) that we are
-	 *   walking which is passed to vmemmap_remap_range().
-	 * - The @reuse address is the first in the complete range.
-	 *
-	 * So we need to make sure that @start and @reuse meet the above rules.
-	 */
-	BUG_ON(start - reuse != PAGE_SIZE);
-
-	mmap_read_lock(&init_mm);
-	ret = vmemmap_remap_range(reuse, end, &walk);
-	if (ret && walk.nr_walked) {
-		end = reuse + walk.nr_walked * PAGE_SIZE;
-		/*
-		 * vmemmap_pages contains pages from the previous
-		 * vmemmap_remap_range call which failed.  These
-		 * are pages which were removed from the vmemmap.
-		 * They will be restored in the following call.
-		 */
-		walk = (struct vmemmap_remap_walk) {
-			.remap_pte	= vmemmap_restore_pte,
-			.reuse_addr	= reuse,
-			.vmemmap_pages	= &vmemmap_pages,
-		};
-
-		vmemmap_remap_range(reuse, end, &walk);
-	}
-	mmap_read_unlock(&init_mm);
-
-	free_vmemmap_page_list(&vmemmap_pages);
-
-	return ret;
-}
-
-static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
-				   gfp_t gfp_mask, struct list_head *list)
-{
-	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
-	int nid = page_to_nid((struct page *)start);
-	struct page *page, *next;
-
-	while (nr_pages--) {
-		page = alloc_pages_node(nid, gfp_mask, 0);
-		if (!page)
-			goto out;
-		list_add_tail(&page->lru, list);
-	}
-
-	return 0;
-out:
-	list_for_each_entry_safe(page, next, list, lru)
-		__free_pages(page, 0);
-	return -ENOMEM;
-}
-
-/**
- * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
- *			 to the page which is from the @vmemmap_pages
- *			 respectively.
- * @start:	start address of the vmemmap virtual address range that we want
- *		to remap.
- * @end:	end address of the vmemmap virtual address range that we want to
- *		remap.
- * @reuse:	reuse address.
- * @gfp_mask:	GFP flag for allocating vmemmap pages.
- *
- * Return: %0 on success, negative error code otherwise.
- */
-int vmemmap_remap_alloc(unsigned long start, unsigned long end,
-			unsigned long reuse, gfp_t gfp_mask)
-{
-	LIST_HEAD(vmemmap_pages);
-	struct vmemmap_remap_walk walk = {
-		.remap_pte	= vmemmap_restore_pte,
-		.reuse_addr	= reuse,
-		.vmemmap_pages	= &vmemmap_pages,
-	};
-
-	/* See the comment in the vmemmap_remap_free(). */
-	BUG_ON(start - reuse != PAGE_SIZE);
-
-	if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
-		return -ENOMEM;
-
-	mmap_read_lock(&init_mm);
-	vmemmap_remap_range(reuse, end, &walk);
-	mmap_read_unlock(&init_mm);
-
-	return 0;
-}
-#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
 
 /*
  * Allocate a block of memory to be used to back the virtual memory map
-- 
cgit 


From 6213834c10de954470b7195cf0cdbda858edf0ee Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Tue, 28 Jun 2022 17:22:33 +0800
Subject: mm: hugetlb_vmemmap: improve hugetlb_vmemmap code readability

There is a discussion about the name of hugetlb_vmemmap_alloc/free in
thread [1].  The suggestion suggested by David is rename "alloc/free" to
"optimize/restore" to make functionalities clearer to users, "optimize"
means the function will optimize vmemmap pages, while "restore" means
restoring its vmemmap pages discared before.  This commit does this.

Another discussion is the confusion RESERVE_VMEMMAP_NR isn't used
explicitly for vmemmap_addr but implicitly for vmemmap_end in
hugetlb_vmemmap_alloc/free.  David suggested we can compute what
hugetlb_vmemmap_init() does now at runtime.  We do not need to worry for
the overhead of computing at runtime since the calculation is simple
enough and those functions are not in a hot path.  This commit has the
following improvements:

  1) The function suffixed name ("optimize/restore") is more expressive.
  2) The logic becomes less weird in hugetlb_vmemmap_optimize/restore().
  3) The hugetlb_vmemmap_init() does not need to be exported anymore.
  4) A ->optimize_vmemmap_pages field in struct hstate is killed.
  5) There is only one place where checks is_power_of_2(sizeof(struct
     page)) instead of two places.
  6) Add more comments for hugetlb_vmemmap_optimize/restore().
  7) For external users, hugetlb_optimize_vmemmap_pages() is used for
     detecting if the HugeTLB's vmemmap pages is optimizable originally.
     In this commit, it is killed and we introduce a new helper
     hugetlb_vmemmap_optimizable() to replace it.  The name is more
     expressive.

Link: https://lore.kernel.org/all/20220404074652.68024-2-songmuchun@bytedance.com/ [1]
Link: https://lkml.kernel.org/r/20220628092235.91270-7-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Will Deacon <will@kernel.org>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h |   7 +--
 include/linux/sysctl.h  |   4 ++
 mm/hugetlb.c            |  15 ++---
 mm/hugetlb_vmemmap.c    | 143 ++++++++++++++++++++----------------------------
 mm/hugetlb_vmemmap.h    |  41 +++++++++-----
 5 files changed, 102 insertions(+), 108 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 4cdfce976644..6d0620edf0a6 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -638,9 +638,6 @@ struct hstate {
 	unsigned int nr_huge_pages_node[MAX_NUMNODES];
 	unsigned int free_huge_pages_node[MAX_NUMNODES];
 	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
-#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
-	unsigned int optimize_vmemmap_pages;
-#endif
 #ifdef CONFIG_CGROUP_HUGETLB
 	/* cgroup control files */
 	struct cftype cgroup_files_dfl[8];
@@ -716,7 +713,7 @@ static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
 	return hstate_file(vma->vm_file);
 }
 
-static inline unsigned long huge_page_size(struct hstate *h)
+static inline unsigned long huge_page_size(const struct hstate *h)
 {
 	return (unsigned long)PAGE_SIZE << h->order;
 }
@@ -745,7 +742,7 @@ static inline bool hstate_is_gigantic(struct hstate *h)
 	return huge_page_order(h) >= MAX_ORDER;
 }
 
-static inline unsigned int pages_per_huge_page(struct hstate *h)
+static inline unsigned int pages_per_huge_page(const struct hstate *h)
 {
 	return 1 << h->order;
 }
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 17b42ce89d3e..780690dc08cd 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -268,6 +268,10 @@ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table *
 	return NULL;
 }
 
+static inline void register_sysctl_init(const char *path, struct ctl_table *table)
+{
+}
+
 static inline struct ctl_table_header *register_sysctl_mount_point(const char *path)
 {
 	return NULL;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f044962ad9df..598c37279fee 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1535,7 +1535,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
 	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
 		return;
 
-	if (hugetlb_vmemmap_alloc(h, page)) {
+	if (hugetlb_vmemmap_restore(h, page)) {
 		spin_lock_irq(&hugetlb_lock);
 		/*
 		 * If we cannot allocate vmemmap pages, just refuse to free the
@@ -1612,7 +1612,7 @@ static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
 
 static inline void flush_free_hpage_work(struct hstate *h)
 {
-	if (hugetlb_optimize_vmemmap_pages(h))
+	if (hugetlb_vmemmap_optimizable(h))
 		flush_work(&free_hpage_work);
 }
 
@@ -1734,7 +1734,7 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)
 
 static void __prep_new_huge_page(struct hstate *h, struct page *page)
 {
-	hugetlb_vmemmap_free(h, page);
+	hugetlb_vmemmap_optimize(h, page);
 	INIT_LIST_HEAD(&page->lru);
 	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
 	hugetlb_set_page_subpool(page, NULL);
@@ -2107,7 +2107,7 @@ retry:
 		 * Attempt to allocate vmemmmap here so that we can take
 		 * appropriate action on failure.
 		 */
-		rc = hugetlb_vmemmap_alloc(h, head);
+		rc = hugetlb_vmemmap_restore(h, head);
 		if (!rc) {
 			/*
 			 * Move PageHWPoison flag from head page to the raw
@@ -3182,8 +3182,10 @@ static void __init report_hugepages(void)
 		char buf[32];
 
 		string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
-		pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
+		pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
 			buf, h->free_huge_pages);
+		pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n",
+			hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf);
 	}
 }
 
@@ -3421,7 +3423,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
 	remove_hugetlb_page_for_demote(h, page, false);
 	spin_unlock_irq(&hugetlb_lock);
 
-	rc = hugetlb_vmemmap_alloc(h, page);
+	rc = hugetlb_vmemmap_restore(h, page);
 	if (rc) {
 		/* Allocation of vmemmmap failed, we can not demote page */
 		spin_lock_irq(&hugetlb_lock);
@@ -4111,7 +4113,6 @@ void __init hugetlb_add_hstate(unsigned int order)
 	h->next_nid_to_free = first_memory_node;
 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
 					huge_page_size(h)/1024);
-	hugetlb_vmemmap_init(h);
 
 	parsed_hstate = h;
 }
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 6c7117c30e56..8da2b31bb59f 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -35,16 +35,6 @@ struct vmemmap_remap_walk {
 	struct list_head	*vmemmap_pages;
 };
 
-/*
- * There are a lot of struct page structures associated with each HugeTLB page.
- * For tail pages, the value of compound_head is the same. So we can reuse first
- * page of head page structures. We map the virtual addresses of all the pages
- * of tail page structures to the head page struct, and then free these page
- * frames. Therefore, we need to reserve one pages as vmemmap areas.
- */
-#define RESERVE_VMEMMAP_NR		1U
-#define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)
-
 static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
 {
 	pmd_t __pmd;
@@ -426,32 +416,37 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
 static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
 core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
 
-/*
- * Previously discarded vmemmap pages will be allocated and remapping
- * after this function returns zero.
+/**
+ * hugetlb_vmemmap_restore - restore previously optimized (by
+ *			     hugetlb_vmemmap_optimize()) vmemmap pages which
+ *			     will be reallocated and remapped.
+ * @h:		struct hstate.
+ * @head:	the head page whose vmemmap pages will be restored.
+ *
+ * Return: %0 if @head's vmemmap pages have been reallocated and remapped,
+ * negative error code otherwise.
  */
-int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
+int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
 {
 	int ret;
-	unsigned long vmemmap_addr = (unsigned long)head;
-	unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
+	unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
+	unsigned long vmemmap_reuse;
 
 	if (!HPageVmemmapOptimized(head))
 		return 0;
 
-	vmemmap_addr	+= RESERVE_VMEMMAP_SIZE;
-	vmemmap_pages	= hugetlb_optimize_vmemmap_pages(h);
-	vmemmap_end	= vmemmap_addr + (vmemmap_pages << PAGE_SHIFT);
-	vmemmap_reuse	= vmemmap_addr - PAGE_SIZE;
+	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
+	vmemmap_reuse	= vmemmap_start;
+	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;
 
 	/*
-	 * The pages which the vmemmap virtual address range [@vmemmap_addr,
+	 * The pages which the vmemmap virtual address range [@vmemmap_start,
 	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
 	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
 	 * When a HugeTLB page is freed to the buddy allocator, previously
 	 * discarded vmemmap pages must be allocated and remapping.
 	 */
-	ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
+	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse,
 				  GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
 	if (!ret) {
 		ClearHPageVmemmapOptimized(head);
@@ -461,11 +456,14 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
 	return ret;
 }
 
-static unsigned int vmemmap_optimizable_pages(struct hstate *h,
-					      struct page *head)
+/* Return true iff a HugeTLB whose vmemmap should and can be optimized. */
+static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
 {
 	if (!READ_ONCE(vmemmap_optimize_enabled))
-		return 0;
+		return false;
+
+	if (!hugetlb_vmemmap_optimizable(h))
+		return false;
 
 	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
 		pmd_t *pmdp, pmd;
@@ -508,73 +506,47 @@ static unsigned int vmemmap_optimizable_pages(struct hstate *h,
 		 *          +-------------------------------------------+
 		 */
 		if (PageVmemmapSelfHosted(vmemmap_page))
-			return 0;
+			return false;
 	}
 
-	return hugetlb_optimize_vmemmap_pages(h);
+	return true;
 }
 
-void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
+/**
+ * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages.
+ * @h:		struct hstate.
+ * @head:	the head page whose vmemmap pages will be optimized.
+ *
+ * This function only tries to optimize @head's vmemmap pages and does not
+ * guarantee that the optimization will succeed after it returns. The caller
+ * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages
+ * have been optimized.
+ */
+void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
 {
-	unsigned long vmemmap_addr = (unsigned long)head;
-	unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
+	unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
+	unsigned long vmemmap_reuse;
 
-	vmemmap_pages = vmemmap_optimizable_pages(h, head);
-	if (!vmemmap_pages)
+	if (!vmemmap_should_optimize(h, head))
 		return;
 
 	static_branch_inc(&hugetlb_optimize_vmemmap_key);
 
-	vmemmap_addr	+= RESERVE_VMEMMAP_SIZE;
-	vmemmap_end	= vmemmap_addr + (vmemmap_pages << PAGE_SHIFT);
-	vmemmap_reuse	= vmemmap_addr - PAGE_SIZE;
+	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
+	vmemmap_reuse	= vmemmap_start;
+	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;
 
 	/*
-	 * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end)
+	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
 	 * to the page which @vmemmap_reuse is mapped to, then free the pages
-	 * which the range [@vmemmap_addr, @vmemmap_end] is mapped to.
+	 * which the range [@vmemmap_start, @vmemmap_end] is mapped to.
 	 */
-	if (vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse))
+	if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse))
 		static_branch_dec(&hugetlb_optimize_vmemmap_key);
 	else
 		SetHPageVmemmapOptimized(head);
 }
 
-void __init hugetlb_vmemmap_init(struct hstate *h)
-{
-	unsigned int nr_pages = pages_per_huge_page(h);
-	unsigned int vmemmap_pages;
-
-	/*
-	 * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct
-	 * page structs that can be used when HVO is enabled, add a BUILD_BUG_ON
-	 * to catch invalid usage of the tail page structs.
-	 */
-	BUILD_BUG_ON(__NR_USED_SUBPAGE >=
-		     RESERVE_VMEMMAP_SIZE / sizeof(struct page));
-
-	if (!is_power_of_2(sizeof(struct page))) {
-		pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n");
-		return;
-	}
-
-	vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
-	/*
-	 * The head page is not to be freed to buddy allocator, the other tail
-	 * pages will map to the head page, so they can be freed.
-	 *
-	 * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true
-	 * on some architectures (e.g. aarch64). See Documentation/arm64/
-	 * hugetlbpage.rst for more details.
-	 */
-	if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR))
-		h->optimize_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR;
-
-	pr_info("can optimize %d vmemmap pages for %s\n",
-		h->optimize_vmemmap_pages, h->name);
-}
-
-#ifdef CONFIG_PROC_SYSCTL
 static struct ctl_table hugetlb_vmemmap_sysctls[] = {
 	{
 		.procname	= "hugetlb_optimize_vmemmap",
@@ -586,16 +558,21 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = {
 	{ }
 };
 
-static __init int hugetlb_vmemmap_sysctls_init(void)
+static int __init hugetlb_vmemmap_init(void)
 {
-	/*
-	 * If "struct page" crosses page boundaries, the vmemmap pages cannot
-	 * be optimized.
-	 */
-	if (is_power_of_2(sizeof(struct page)))
-		register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
-
+	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
+	BUILD_BUG_ON(__NR_USED_SUBPAGE * sizeof(struct page) > HUGETLB_VMEMMAP_RESERVE_SIZE);
+
+	if (IS_ENABLED(CONFIG_PROC_SYSCTL)) {
+		const struct hstate *h;
+
+		for_each_hstate(h) {
+			if (hugetlb_vmemmap_optimizable(h)) {
+				register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
+				break;
+			}
+		}
+	}
 	return 0;
 }
-late_initcall(hugetlb_vmemmap_sysctls_init);
-#endif /* CONFIG_PROC_SYSCTL */
+late_initcall(hugetlb_vmemmap_init);
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index ba66fadad9fc..25bd0e002431 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -11,35 +11,50 @@
 #include <linux/hugetlb.h>
 
 #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
-int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head);
-void hugetlb_vmemmap_free(struct hstate *h, struct page *head);
-void hugetlb_vmemmap_init(struct hstate *h);
+int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head);
+void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head);
 
 /*
- * How many vmemmap pages associated with a HugeTLB page that can be
- * optimized and freed to the buddy allocator.
+ * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See
+ * Documentation/vm/vmemmap_dedup.rst.
  */
-static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h)
+#define HUGETLB_VMEMMAP_RESERVE_SIZE	PAGE_SIZE
+
+static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h)
 {
-	return h->optimize_vmemmap_pages;
+	return pages_per_huge_page(h) * sizeof(struct page);
+}
+
+/*
+ * Return how many vmemmap size associated with a HugeTLB page that can be
+ * optimized and can be freed to the buddy allocator.
+ */
+static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
+{
+	int size = hugetlb_vmemmap_size(h) - HUGETLB_VMEMMAP_RESERVE_SIZE;
+
+	if (!is_power_of_2(sizeof(struct page)))
+		return 0;
+	return size > 0 ? size : 0;
 }
 #else
-static inline int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
+static inline int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
 {
 	return 0;
 }
 
-static inline void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
+static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
 {
 }
 
-static inline void hugetlb_vmemmap_init(struct hstate *h)
+static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
 {
+	return 0;
 }
+#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
 
-static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h)
+static inline bool hugetlb_vmemmap_optimizable(const struct hstate *h)
 {
-	return 0;
+	return hugetlb_vmemmap_optimizable_size(h) != 0;
 }
-#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
 #endif /* _LINUX_HUGETLB_VMEMMAP_H */
-- 
cgit 


From 838691a1c0ec44739db558834e6954d62577d6b8 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Tue, 28 Jun 2022 17:22:34 +0800
Subject: mm: hugetlb_vmemmap: move code comments to vmemmap_dedup.rst

All the comments which explains how HVO works are moved to
vmemmap_dedup.rst since

  commit 4917f55b4ef9 ("mm/sparse-vmemmap: improve memory savings for compound devmaps")

except some comments above page_fixed_fake_head().  This commit moves
those comments to vmemmap_dedup.rst and improve vmemmap_dedup.rst as well.

Link: https://lkml.kernel.org/r/20220628092235.91270-8-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Will Deacon <will@kernel.org>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/mm/vmemmap_dedup.rst | 70 +++++++++++++++++++++++++-------------
 include/linux/page-flags.h         | 15 ++------
 2 files changed, 49 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/Documentation/mm/vmemmap_dedup.rst b/Documentation/mm/vmemmap_dedup.rst
index 7d7a161aa364..a4b12ff906c4 100644
--- a/Documentation/mm/vmemmap_dedup.rst
+++ b/Documentation/mm/vmemmap_dedup.rst
@@ -9,23 +9,23 @@ HugeTLB
 
 This section is to explain how HugeTLB Vmemmap Optimization (HVO) works.
 
-The struct page structures (page structs) are used to describe a physical
-page frame. By default, there is a one-to-one mapping from a page frame to
-it's corresponding page struct.
+The ``struct page`` structures are used to describe a physical page frame. By
+default, there is a one-to-one mapping from a page frame to it's corresponding
+``struct page``.
 
 HugeTLB pages consist of multiple base page size pages and is supported by many
 architectures. See Documentation/admin-guide/mm/hugetlbpage.rst for more
 details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB are
 currently supported. Since the base page size on x86 is 4KB, a 2MB HugeTLB page
 consists of 512 base pages and a 1GB HugeTLB page consists of 4096 base pages.
-For each base page, there is a corresponding page struct.
+For each base page, there is a corresponding ``struct page``.
 
-Within the HugeTLB subsystem, only the first 4 page structs are used to
-contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides
-this upper limit. The only 'useful' information in the remaining page structs
+Within the HugeTLB subsystem, only the first 4 ``struct page`` are used to
+contain unique information about a HugeTLB page. ``__NR_USED_SUBPAGE`` provides
+this upper limit. The only 'useful' information in the remaining ``struct page``
 is the compound_head field, and this field is the same for all tail pages.
 
-By removing redundant page structs for HugeTLB pages, memory can be returned
+By removing redundant ``struct page`` for HugeTLB pages, memory can be returned
 to the buddy allocator for other uses.
 
 Different architectures support different HugeTLB pages. For example, the
@@ -46,7 +46,7 @@ page.
 |              |   64KB    |    2MB    |  512MB    |    16GB   |           |
 +--------------+-----------+-----------+-----------+-----------+-----------+
 
-When the system boot up, every HugeTLB page has more than one struct page
+When the system boot up, every HugeTLB page has more than one ``struct page``
 structs which size is (unit: pages)::
 
    struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
@@ -76,10 +76,10 @@ Where n is how many pte entries which one page can contains. So the value of
 n is (PAGE_SIZE / sizeof(pte_t)).
 
 This optimization only supports 64-bit system, so the value of sizeof(pte_t)
-is 8. And this optimization also applicable only when the size of struct page
-is a power of two. In most cases, the size of struct page is 64 bytes (e.g.
+is 8. And this optimization also applicable only when the size of ``struct page``
+is a power of two. In most cases, the size of ``struct page`` is 64 bytes (e.g.
 x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the
-size of struct page structs of it is 8 page frames which size depends on the
+size of ``struct page`` structs of it is 8 page frames which size depends on the
 size of the base page.
 
 For the HugeTLB page of the pud level mapping, then::
@@ -88,7 +88,7 @@ For the HugeTLB page of the pud level mapping, then::
                = PAGE_SIZE / 8 * 8 (pages)
                = PAGE_SIZE (pages)
 
-Where the struct_size(pmd) is the size of the struct page structs of a
+Where the struct_size(pmd) is the size of the ``struct page`` structs of a
 HugeTLB page of the pmd level mapping.
 
 E.g.: A 2MB HugeTLB page on x86_64 consists in 8 page frames while 1GB
@@ -96,7 +96,7 @@ HugeTLB page consists in 4096.
 
 Next, we take the pmd level mapping of the HugeTLB page as an example to
 show the internal implementation of this optimization. There are 8 pages
-struct page structs associated with a HugeTLB page which is pmd mapped.
+``struct page`` structs associated with a HugeTLB page which is pmd mapped.
 
 Here is how things look before optimization::
 
@@ -124,10 +124,10 @@ Here is how things look before optimization::
  +-----------+
 
 The value of page->compound_head is the same for all tail pages. The first
-page of page structs (page 0) associated with the HugeTLB page contains the 4
-page structs necessary to describe the HugeTLB. The only use of the remaining
-pages of page structs (page 1 to page 7) is to point to page->compound_head.
-Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs
+page of ``struct page`` (page 0) associated with the HugeTLB page contains the 4
+``struct page`` necessary to describe the HugeTLB. The only use of the remaining
+pages of ``struct page`` (page 1 to page 7) is to point to page->compound_head.
+Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of ``struct page``
 will be used for each HugeTLB page. This will allow us to free the remaining
 7 pages to the buddy allocator.
 
@@ -169,13 +169,37 @@ entries that can be cached in a single TLB entry.
 
 The contiguous bit is used to increase the mapping size at the pmd and pte
 (last) level. So this type of HugeTLB page can be optimized only when its
-size of the struct page structs is greater than 1 page.
+size of the ``struct page`` structs is greater than **1** page.
 
 Notice: The head vmemmap page is not freed to the buddy allocator and all
 tail vmemmap pages are mapped to the head vmemmap page frame. So we can see
-more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page)
-associated with each HugeTLB page. The compound_head() can handle this
-correctly (more details refer to the comment above compound_head()).
+more than one ``struct page`` struct with ``PG_head`` (e.g. 8 per 2 MB HugeTLB
+page) associated with each HugeTLB page. The ``compound_head()`` can handle
+this correctly. There is only **one** head ``struct page``, the tail
+``struct page`` with ``PG_head`` are fake head ``struct page``.  We need an
+approach to distinguish between those two different types of ``struct page`` so
+that ``compound_head()`` can return the real head ``struct page`` when the
+parameter is the tail ``struct page`` but with ``PG_head``. The following code
+snippet describes how to distinguish between real and fake head ``struct page``.
+
+.. code-block:: c
+
+	if (test_bit(PG_head, &page->flags)) {
+		unsigned long head = READ_ONCE(page[1].compound_head);
+
+		if (head & 1) {
+			if (head == (unsigned long)page + 1)
+				/* head struct page */
+			else
+				/* tail struct page */
+		} else {
+			/* head struct page */
+		}
+	}
+
+We can safely access the field of the **page[1]** with ``PG_head`` because the
+page is a compound page composed with at least two contiguous pages.
+The implementation refers to ``page_fixed_fake_head()``.
 
 Device DAX
 ==========
@@ -189,7 +213,7 @@ PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64).
 
 The differences with HugeTLB are relatively minor.
 
-It only use 3 page structs for storing all information as opposed
+It only use 3 ``struct page`` for storing all information as opposed
 to 4 on HugeTLB pages.
 
 There's no remapping of vmemmap given that device-dax memory is not part of
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 78ed46ae6ee5..465ff35a8c00 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -208,19 +208,8 @@ enum pageflags {
 DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
 
 /*
- * If HVO is enabled, the head vmemmap page frame is reused and all of the tail
- * vmemmap addresses map to the head vmemmap page frame (furture details can
- * refer to the figure at the head of the mm/hugetlb_vmemmap.c).  In other
- * words, there are more than one page struct with PG_head associated with each
- * HugeTLB page.  We __know__ that there is only one head page struct, the tail
- * page structs with PG_head are fake head page structs.  We need an approach
- * to distinguish between those two different types of page structs so that
- * compound_head() can return the real head page struct when the parameter is
- * the tail page struct but with PG_head.
- *
- * The page_fixed_fake_head() returns the real head page struct if the @page is
- * fake page head, otherwise, returns @page which can either be a true page
- * head or tail.
+ * Return the real head page struct iff the @page is a fake head page, otherwise
+ * return the @page itself. See Documentation/mm/vmemmap_dedup.rst.
  */
 static __always_inline const struct page *page_fixed_fake_head(const struct page *page)
 {
-- 
cgit 


From 161df60e9e89651c9aa3ae0edc9aae3a8a2d21e7 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <naoya.horiguchi@nec.com>
Date: Thu, 14 Jul 2022 13:24:15 +0900
Subject: mm, hwpoison, hugetlb: support saving mechanism of raw error pages

When handling memory error on a hugetlb page, the error handler tries to
dissolve and turn it into 4kB pages.  If it's successfully dissolved,
PageHWPoison flag is moved to the raw error page, so that's all right.
However, dissolve sometimes fails, then the error page is left as
hwpoisoned hugepage.  It's useful if we can retry to dissolve it to save
healthy pages, but that's not possible now because the information about
where the raw error pages is lost.

Use the private field of a few tail pages to keep that information.  The
code path of shrinking hugepage pool uses this info to try delayed
dissolve.  In order to remember multiple errors in a hugepage, a
singly-linked list originated from SUBPAGE_INDEX_HWPOISON-th tail page is
constructed.  Only simple operations (adding an entry or clearing all) are
required and the list is assumed not to be very long, so this simple data
structure should be enough.

If we failed to save raw error info, the hwpoison hugepage has errors on
unknown subpage, then this new saving mechanism does not work any more, so
disable saving new raw error info and freeing hwpoison hugepages.

Link: https://lkml.kernel.org/r/20220714042420.1847125-4-naoya.horiguchi@linux.dev
Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Reported-by: kernel test robot <lkp@intel.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Liu Shixin <liushixin2@huawei.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 17 +++++++++-
 mm/hugetlb.c            | 23 ++++++++-----
 mm/memory-failure.c     | 89 +++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 116 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 6d0620edf0a6..3ec981a0d8b3 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -42,6 +42,9 @@ enum {
 	SUBPAGE_INDEX_CGROUP,		/* reuse page->private */
 	SUBPAGE_INDEX_CGROUP_RSVD,	/* reuse page->private */
 	__MAX_CGROUP_SUBPAGE_INDEX = SUBPAGE_INDEX_CGROUP_RSVD,
+#endif
+#ifdef CONFIG_MEMORY_FAILURE
+	SUBPAGE_INDEX_HWPOISON,
 #endif
 	__NR_USED_SUBPAGE,
 };
@@ -551,7 +554,7 @@ generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
  *	Synchronization:  Initially set after new page allocation with no
  *	locking.  When examined and modified during migration processing
  *	(isolate, migrate, putback) the hugetlb_lock is held.
- * HPG_temporary - - Set on a page that is temporarily allocated from the buddy
+ * HPG_temporary - Set on a page that is temporarily allocated from the buddy
  *	allocator.  Typically used for migration target pages when no pages
  *	are available in the pool.  The hugetlb free page path will
  *	immediately free pages with this flag set to the buddy allocator.
@@ -561,6 +564,8 @@ generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
  * HPG_freed - Set when page is on the free lists.
  *	Synchronization: hugetlb_lock held for examination and modification.
  * HPG_vmemmap_optimized - Set when the vmemmap pages of the page are freed.
+ * HPG_raw_hwp_unreliable - Set when the hugetlb page has a hwpoison sub-page
+ *     that is not tracked by raw_hwp_page list.
  */
 enum hugetlb_page_flags {
 	HPG_restore_reserve = 0,
@@ -568,6 +573,7 @@ enum hugetlb_page_flags {
 	HPG_temporary,
 	HPG_freed,
 	HPG_vmemmap_optimized,
+	HPG_raw_hwp_unreliable,
 	__NR_HPAGEFLAGS,
 };
 
@@ -614,6 +620,7 @@ HPAGEFLAG(Migratable, migratable)
 HPAGEFLAG(Temporary, temporary)
 HPAGEFLAG(Freed, freed)
 HPAGEFLAG(VmemmapOptimized, vmemmap_optimized)
+HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable)
 
 #ifdef CONFIG_HUGETLB_PAGE
 
@@ -796,6 +803,14 @@ extern int dissolve_free_huge_page(struct page *page);
 extern int dissolve_free_huge_pages(unsigned long start_pfn,
 				    unsigned long end_pfn);
 
+#ifdef CONFIG_MEMORY_FAILURE
+extern void hugetlb_clear_page_hwpoison(struct page *hpage);
+#else
+static inline void hugetlb_clear_page_hwpoison(struct page *hpage)
+{
+}
+#endif
+
 #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
 #ifndef arch_hugetlb_migration_supported
 static inline bool arch_hugetlb_migration_supported(struct hstate *h)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9b79806be17b..0aee2f3ae15c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1535,6 +1535,13 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
 	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
 		return;
 
+	/*
+	 * If we don't know which subpages are hwpoisoned, we can't free
+	 * the hugepage, so it's leaked intentionally.
+	 */
+	if (HPageRawHwpUnreliable(page))
+		return;
+
 	if (hugetlb_vmemmap_restore(h, page)) {
 		spin_lock_irq(&hugetlb_lock);
 		/*
@@ -1547,6 +1554,13 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
 		return;
 	}
 
+	/*
+	 * Move PageHWPoison flag from head page to the raw error pages,
+	 * which makes any healthy subpages reusable.
+	 */
+	if (unlikely(PageHWPoison(page)))
+		hugetlb_clear_page_hwpoison(page);
+
 	for (i = 0; i < pages_per_huge_page(h);
 	     i++, subpage = mem_map_next(subpage, page, i)) {
 		subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
@@ -2109,15 +2123,6 @@ retry:
 		 */
 		rc = hugetlb_vmemmap_restore(h, head);
 		if (!rc) {
-			/*
-			 * Move PageHWPoison flag from head page to the raw
-			 * error page, which makes any subpages rather than
-			 * the error page reusable.
-			 */
-			if (PageHWPoison(head) && page != head) {
-				SetPageHWPoison(page);
-				ClearPageHWPoison(head);
-			}
 			update_and_free_page(h, head, false);
 		} else {
 			spin_lock_irq(&hugetlb_lock);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9a7a228ad04a..61668ce20836 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1662,6 +1662,90 @@ unlock:
 EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
 #endif /* CONFIG_FS_DAX */
 
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Struct raw_hwp_page represents information about "raw error page",
+ * constructing singly linked list originated from ->private field of
+ * SUBPAGE_INDEX_HWPOISON-th tail page.
+ */
+struct raw_hwp_page {
+	struct llist_node node;
+	struct page *page;
+};
+
+static inline struct llist_head *raw_hwp_list_head(struct page *hpage)
+{
+	return (struct llist_head *)&page_private(hpage + SUBPAGE_INDEX_HWPOISON);
+}
+
+static void __free_raw_hwp_pages(struct page *hpage)
+{
+	struct llist_head *head;
+	struct llist_node *t, *tnode;
+
+	head = raw_hwp_list_head(hpage);
+	llist_for_each_safe(tnode, t, head->first) {
+		struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
+
+		SetPageHWPoison(p->page);
+		kfree(p);
+	}
+	llist_del_all(head);
+}
+
+static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page)
+{
+	struct llist_head *head;
+	struct raw_hwp_page *raw_hwp;
+	struct llist_node *t, *tnode;
+	int ret = TestSetPageHWPoison(hpage) ? -EHWPOISON : 0;
+
+	/*
+	 * Once the hwpoison hugepage has lost reliable raw error info,
+	 * there is little meaning to keep additional error info precisely,
+	 * so skip to add additional raw error info.
+	 */
+	if (HPageRawHwpUnreliable(hpage))
+		return -EHWPOISON;
+	head = raw_hwp_list_head(hpage);
+	llist_for_each_safe(tnode, t, head->first) {
+		struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
+
+		if (p->page == page)
+			return -EHWPOISON;
+	}
+
+	raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
+	if (raw_hwp) {
+		raw_hwp->page = page;
+		llist_add(&raw_hwp->node, head);
+		/* the first error event will be counted in action_result(). */
+		if (ret)
+			num_poisoned_pages_inc();
+	} else {
+		/*
+		 * Failed to save raw error info.  We no longer trace all
+		 * hwpoisoned subpages, and we need refuse to free/dissolve
+		 * this hwpoisoned hugepage.
+		 */
+		SetHPageRawHwpUnreliable(hpage);
+		/*
+		 * Once HPageRawHwpUnreliable is set, raw_hwp_page is not
+		 * used any more, so free it.
+		 */
+		__free_raw_hwp_pages(hpage);
+	}
+	return ret;
+}
+
+void hugetlb_clear_page_hwpoison(struct page *hpage)
+{
+	if (HPageRawHwpUnreliable(hpage))
+		return;
+	ClearPageHWPoison(hpage);
+	__free_raw_hwp_pages(hpage);
+}
+
 /*
  * Called from hugetlb code with hugetlb_lock held.
  *
@@ -1696,7 +1780,7 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
 		goto out;
 	}
 
-	if (TestSetPageHWPoison(head)) {
+	if (hugetlb_set_page_hwpoison(head, page)) {
 		ret = -EHWPOISON;
 		goto out;
 	}
@@ -1708,7 +1792,6 @@ out:
 	return ret;
 }
 
-#ifdef CONFIG_HUGETLB_PAGE
 /*
  * Taking refcount of hugetlb pages needs extra care about race conditions
  * with basic operations like hugepage allocation/free/demotion.
@@ -1749,7 +1832,7 @@ retry:
 	lock_page(head);
 
 	if (hwpoison_filter(p)) {
-		ClearPageHWPoison(head);
+		hugetlb_clear_page_hwpoison(head);
 		res = -EOPNOTSUPP;
 		goto out;
 	}
-- 
cgit 


From ac5fcde0a96a18773f06b7c00c5ea081bbdc64b3 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <naoya.horiguchi@nec.com>
Date: Thu, 14 Jul 2022 13:24:16 +0900
Subject: mm, hwpoison: make unpoison aware of raw error info in hwpoisoned
 hugepage

Raw error info list needs to be removed when hwpoisoned hugetlb is
unpoisoned.  And unpoison handler needs to know how many errors there are
in the target hugepage.  So add them.

HPageVmemmapOptimized(hpage) and HPageRawHwpUnreliable(hpage)) sometimes
can't be unpoisoned, so skip them.

Link: https://lkml.kernel.org/r/20220714042420.1847125-5-naoya.horiguchi@linux.dev
Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Reported-by: kernel test robot <lkp@intel.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Liu Shixin <liushixin2@huawei.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swapops.h |  9 +++++++++
 mm/memory-failure.c     | 52 ++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 56 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index bb7afd03a324..a3d435bf9f97 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -490,6 +490,11 @@ static inline void num_poisoned_pages_dec(void)
 	atomic_long_dec(&num_poisoned_pages);
 }
 
+static inline void num_poisoned_pages_sub(long i)
+{
+	atomic_long_sub(i, &num_poisoned_pages);
+}
+
 #else
 
 static inline swp_entry_t make_hwpoison_entry(struct page *page)
@@ -505,6 +510,10 @@ static inline int is_hwpoison_entry(swp_entry_t swp)
 static inline void num_poisoned_pages_inc(void)
 {
 }
+
+static inline void num_poisoned_pages_sub(long i)
+{
+}
 #endif
 
 static inline int non_swap_entry(swp_entry_t entry)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 61668ce20836..e30dbeca09d1 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1678,19 +1678,23 @@ static inline struct llist_head *raw_hwp_list_head(struct page *hpage)
 	return (struct llist_head *)&page_private(hpage + SUBPAGE_INDEX_HWPOISON);
 }
 
-static void __free_raw_hwp_pages(struct page *hpage)
+static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag)
 {
 	struct llist_head *head;
 	struct llist_node *t, *tnode;
+	unsigned long count = 0;
 
 	head = raw_hwp_list_head(hpage);
 	llist_for_each_safe(tnode, t, head->first) {
 		struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
 
-		SetPageHWPoison(p->page);
+		if (move_flag)
+			SetPageHWPoison(p->page);
 		kfree(p);
+		count++;
 	}
 	llist_del_all(head);
+	return count;
 }
 
 static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page)
@@ -1733,17 +1737,36 @@ static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page)
 		 * Once HPageRawHwpUnreliable is set, raw_hwp_page is not
 		 * used any more, so free it.
 		 */
-		__free_raw_hwp_pages(hpage);
+		__free_raw_hwp_pages(hpage, false);
 	}
 	return ret;
 }
 
+static unsigned long free_raw_hwp_pages(struct page *hpage, bool move_flag)
+{
+	/*
+	 * HPageVmemmapOptimized hugepages can't be freed because struct
+	 * pages for tail pages are required but they don't exist.
+	 */
+	if (move_flag && HPageVmemmapOptimized(hpage))
+		return 0;
+
+	/*
+	 * HPageRawHwpUnreliable hugepages shouldn't be unpoisoned by
+	 * definition.
+	 */
+	if (HPageRawHwpUnreliable(hpage))
+		return 0;
+
+	return __free_raw_hwp_pages(hpage, move_flag);
+}
+
 void hugetlb_clear_page_hwpoison(struct page *hpage)
 {
 	if (HPageRawHwpUnreliable(hpage))
 		return;
 	ClearPageHWPoison(hpage);
-	__free_raw_hwp_pages(hpage);
+	free_raw_hwp_pages(hpage, true);
 }
 
 /*
@@ -1887,6 +1910,10 @@ static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *
 	return 0;
 }
 
+static inline unsigned long free_raw_hwp_pages(struct page *hpage, bool flag)
+{
+	return 0;
+}
 #endif	/* CONFIG_HUGETLB_PAGE */
 
 static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
@@ -2292,6 +2319,7 @@ int unpoison_memory(unsigned long pfn)
 	struct page *p;
 	int ret = -EBUSY;
 	int freeit = 0;
+	unsigned long count = 1;
 	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
 					DEFAULT_RATELIMIT_BURST);
 
@@ -2339,6 +2367,13 @@ int unpoison_memory(unsigned long pfn)
 
 	ret = get_hwpoison_page(p, MF_UNPOISON);
 	if (!ret) {
+		if (PageHuge(p)) {
+			count = free_raw_hwp_pages(page, false);
+			if (count == 0) {
+				ret = -EBUSY;
+				goto unlock_mutex;
+			}
+		}
 		ret = TestClearPageHWPoison(page) ? 0 : -EBUSY;
 	} else if (ret < 0) {
 		if (ret == -EHWPOISON) {
@@ -2347,6 +2382,13 @@ int unpoison_memory(unsigned long pfn)
 			unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
 					 pfn, &unpoison_rs);
 	} else {
+		if (PageHuge(p)) {
+			count = free_raw_hwp_pages(page, false);
+			if (count == 0) {
+				ret = -EBUSY;
+				goto unlock_mutex;
+			}
+		}
 		freeit = !!TestClearPageHWPoison(p);
 
 		put_page(page);
@@ -2359,7 +2401,7 @@ int unpoison_memory(unsigned long pfn)
 unlock_mutex:
 	mutex_unlock(&mf_mutex);
 	if (!ret || freeit) {
-		num_poisoned_pages_dec();
+		num_poisoned_pages_sub(count);
 		unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
 				 page_to_pfn(p), &unpoison_rs);
 	}
-- 
cgit 


From 38f6d29397ccb9c191c4c91103e8123f518fdc10 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <naoya.horiguchi@nec.com>
Date: Thu, 14 Jul 2022 13:24:17 +0900
Subject: mm, hwpoison: set PG_hwpoison for busy hugetlb pages

If memory_failure() fails to grab page refcount on a hugetlb page because
it's busy, it returns without setting PG_hwpoison on it.  This not only
loses a chance of error containment, but breaks the rule that
action_result() should be called only when memory_failure() do any of
handling work (even if that's just setting PG_hwpoison).  This
inconsistency could harm code maintainability.

So set PG_hwpoison and call hugetlb_set_page_hwpoison() for such a case.

Link: https://lkml.kernel.org/r/20220714042420.1847125-6-naoya.horiguchi@linux.dev
Fixes: 405ce051236c ("mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()")
Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: kernel test robot <lkp@intel.com>
Cc: Liu Shixin <liushixin2@huawei.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h  | 1 +
 mm/memory-failure.c | 8 ++++----
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e6e201a4ce05..0345b8c30394 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3176,6 +3176,7 @@ enum mf_flags {
 	MF_SOFT_OFFLINE = 1 << 3,
 	MF_UNPOISON = 1 << 4,
 	MF_SW_SIMULATED = 1 << 5,
+	MF_NO_RETRY = 1 << 6,
 };
 int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
 		      unsigned long count, int mf_flags);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index e30dbeca09d1..a748f5dd7b8e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1800,7 +1800,8 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
 			count_increased = true;
 	} else {
 		ret = -EBUSY;
-		goto out;
+		if (!(flags & MF_NO_RETRY))
+			goto out;
 	}
 
 	if (hugetlb_set_page_hwpoison(head, page)) {
@@ -1827,7 +1828,6 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
 	struct page *p = pfn_to_page(pfn);
 	struct page *head;
 	unsigned long page_flags;
-	bool retry = true;
 
 	*hugetlb = 1;
 retry:
@@ -1843,8 +1843,8 @@ retry:
 		}
 		return res;
 	} else if (res == -EBUSY) {
-		if (retry) {
-			retry = false;
+		if (!(flags & MF_NO_RETRY)) {
+			flags |= MF_NO_RETRY;
 			goto retry;
 		}
 		action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
-- 
cgit 


From 6f4614886baa59b6ae014093300482c1da4d3c93 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <naoya.horiguchi@nec.com>
Date: Thu, 14 Jul 2022 13:24:20 +0900
Subject: mm, hwpoison: enable memory error handling on 1GB hugepage

Now error handling code is prepared, so remove the blocking code and
enable memory error handling on 1GB hugepage.

Link: https://lkml.kernel.org/r/20220714042420.1847125-9-naoya.horiguchi@linux.dev
Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: kernel test robot <lkp@intel.com>
Cc: Liu Shixin <liushixin2@huawei.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h      |  1 -
 include/ras/ras_event.h |  1 -
 mm/memory-failure.c     | 16 ----------------
 3 files changed, 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0345b8c30394..3bedc449c14d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3229,7 +3229,6 @@ enum mf_action_page_type {
 	MF_MSG_DIFFERENT_COMPOUND,
 	MF_MSG_HUGE,
 	MF_MSG_FREE_HUGE,
-	MF_MSG_NON_PMD_HUGE,
 	MF_MSG_UNMAP_FAILED,
 	MF_MSG_DIRTY_SWAPCACHE,
 	MF_MSG_CLEAN_SWAPCACHE,
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index d0337a41141c..cbd3ddd7c33d 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -360,7 +360,6 @@ TRACE_EVENT(aer_event,
 	EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \
 	EM ( MF_MSG_HUGE, "huge page" )					\
 	EM ( MF_MSG_FREE_HUGE, "free huge page" )			\
-	EM ( MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page" )		\
 	EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" )		\
 	EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" )		\
 	EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" )		\
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index de4aff85e466..14439806b5ef 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -768,7 +768,6 @@ static const char * const action_page_types[] = {
 	[MF_MSG_DIFFERENT_COMPOUND]	= "different compound page after locking",
 	[MF_MSG_HUGE]			= "huge page",
 	[MF_MSG_FREE_HUGE]		= "free huge page",
-	[MF_MSG_NON_PMD_HUGE]		= "non-pmd-sized huge page",
 	[MF_MSG_UNMAP_FAILED]		= "unmapping failed page",
 	[MF_MSG_DIRTY_SWAPCACHE]	= "dirty swapcache page",
 	[MF_MSG_CLEAN_SWAPCACHE]	= "clean swapcache page",
@@ -1885,21 +1884,6 @@ retry:
 
 	page_flags = head->flags;
 
-	/*
-	 * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
-	 * simply disable it. In order to make it work properly, we need
-	 * make sure that:
-	 *  - conversion of a pud that maps an error hugetlb into hwpoison
-	 *    entry properly works, and
-	 *  - other mm code walking over page table is aware of pud-aligned
-	 *    hwpoison entries.
-	 */
-	if (huge_page_size(page_hstate(head)) > PMD_SIZE) {
-		action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED);
-		res = -EBUSY;
-		goto out;
-	}
-
 	if (!hwpoison_user_mappings(p, pfn, flags, head)) {
 		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
 		res = -EBUSY;
-- 
cgit 


From 729337bc20876af348b363b3e35fb19be71ba793 Mon Sep 17 00:00:00 2001
From: "Fabio M. De Francesco" <fmdefrancesco@gmail.com>
Date: Thu, 28 Jul 2022 17:48:38 +0200
Subject: highmem: remove unneeded spaces in kmap_local_page() kdocs

Patch series "highmem: Extend kmap_local_page() documentation", v2.

The Highmem interface is evolving and the current documentation does not
reflect the intended uses of each of the calls.  Furthermore, after a
recent series of reworks, the differences of the calls can still be
confusing and may lead to the expanded use of calls which are deprecated.

This series is the second round of changes towards an enhanced
documentation of the Highmem's interface; at this stage the patches are
only focused to kmap_local_page().

In addition it also contains some minor clean ups.


This patch (of 7):

In the kdocs of kmap_local_page(), the description of @page starts after
several unnecessary spaces.

Therefore, remove those spaces.

Link: https://lkml.kernel.org/r/20220728154844.10874-1-fmdefrancesco@gmail.com
Link: https://lkml.kernel.org/r/20220728154844.10874-2-fmdefrancesco@gmail.com
Signed-off-by: Fabio M. De Francesco <fmdefrancesco@gmail.com>
Suggested-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/highmem.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 177b07944640..0bd5ed4ac391 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -60,7 +60,7 @@ static inline void kmap_flush_unused(void);
 
 /**
  * kmap_local_page - Map a page for temporary usage
- * @page:	Pointer to the page to be mapped
+ * @page: Pointer to the page to be mapped
  *
  * Returns: The virtual address of the mapping
  *
-- 
cgit 


From 383bbef283920411379c5c93829102ff7859fea5 Mon Sep 17 00:00:00 2001
From: "Fabio M. De Francesco" <fmdefrancesco@gmail.com>
Date: Thu, 28 Jul 2022 17:48:39 +0200
Subject: highmem: specify that kmap_local_page() is callable from interrupts

In a recent thread about converting kmap() to kmap_local_page(), the
safety of calling kmap_local_page() was questioned.[1]

"any context" should probably be enough detail for users who want to know
whether or not kmap_local_page() can be called from interrupts.  However,
Linux still has kmap_atomic() which might make users think they must use
the latter in interrupts.

Add "including interrupts" for better clarity.

[1] https://lore.kernel.org/lkml/3187836.aeNJFYEL58@opensuse/

Link: https://lkml.kernel.org/r/20220728154844.10874-3-fmdefrancesco@gmail.com
Signed-off-by: Fabio M. De Francesco <fmdefrancesco@gmail.com>
Suggested-by: Ira Weiny <ira.weiny@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/highmem.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 0bd5ed4ac391..4a46d95ff6c8 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -64,7 +64,7 @@ static inline void kmap_flush_unused(void);
  *
  * Returns: The virtual address of the mapping
  *
- * Can be invoked from any context.
+ * Can be invoked from any context, including interrupts.
  *
  * Requires careful handling when nesting multiple mappings because the map
  * management is stack based. The unmap has to be in the reverse order of
-- 
cgit 


From 72f1c55adf70fd08ceac6b67455238db2014894a Mon Sep 17 00:00:00 2001
From: "Fabio M. De Francesco" <fmdefrancesco@gmail.com>
Date: Thu, 28 Jul 2022 17:48:43 +0200
Subject: highmem: delete a sentence from kmap_local_page() kdocs

kmap_local_page() should always be preferred in place of kmap() and
kmap_atomic().  "Only use when really necessary." is not consistent with
the Documentation/mm/highmem.rst and these kdocs it embeds.

Therefore, delete the above-mentioned sentence from kdocs.

Link: https://lkml.kernel.org/r/20220728154844.10874-7-fmdefrancesco@gmail.com
Signed-off-by: Fabio M. De Francesco <fmdefrancesco@gmail.com>
Suggested-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/highmem.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 4a46d95ff6c8..25679035ca28 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -86,8 +86,7 @@ static inline void kmap_flush_unused(void);
  * temporarily mapped.
  *
  * While it is significantly faster than kmap() for the higmem case it
- * comes with restrictions about the pointer validity. Only use when really
- * necessary.
+ * comes with restrictions about the pointer validity.
  *
  * On HIGHMEM enabled systems mapping a highmem page has the side effect of
  * disabling migration in order to keep the virtual address stable across
-- 
cgit 


From fcb14cb1bdacec5b4374fe161e83fb8208164a85 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 22 May 2022 14:59:25 -0400
Subject: new iov_iter flavour - ITER_UBUF

Equivalent of single-segment iovec.  Initialized by iov_iter_ubuf(),
checked for by iter_is_ubuf(), otherwise behaves like ITER_IOVEC
ones.

We are going to expose the things like ->write_iter() et.al. to those
in subsequent commits.

New predicate (user_backed_iter()) that is true for ITER_IOVEC and
ITER_UBUF; places like direct-IO handling should use that for
checking that pages we modify after getting them from iov_iter_get_pages()
would need to be dirtied.

DO NOT assume that replacing iter_is_iovec() with user_backed_iter()
will solve all problems - there's code that uses iter_is_iovec() to
decide how to poke around in iov_iter guts and for that the predicate
replacement obviously won't suffice.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/fops.c         |  6 ++--
 fs/ceph/file.c       |  2 +-
 fs/cifs/file.c       |  2 +-
 fs/direct-io.c       |  2 +-
 fs/fuse/dev.c        |  4 +--
 fs/fuse/file.c       |  2 +-
 fs/gfs2/file.c       |  2 +-
 fs/iomap/direct-io.c |  2 +-
 fs/nfs/direct.c      |  2 +-
 include/linux/uio.h  | 26 ++++++++++++++++
 lib/iov_iter.c       | 87 +++++++++++++++++++++++++++++++++++++++++-----------
 mm/shmem.c           |  2 +-
 12 files changed, 108 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/block/fops.c b/block/fops.c
index a564cd81340c..b90742595317 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -75,7 +75,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
 
 	if (iov_iter_rw(iter) == READ) {
 		bio_init(&bio, bdev, vecs, nr_pages, REQ_OP_READ);
-		if (iter_is_iovec(iter))
+		if (user_backed_iter(iter))
 			should_dirty = true;
 	} else {
 		bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb));
@@ -204,7 +204,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	}
 
 	dio->size = 0;
-	if (is_read && iter_is_iovec(iter))
+	if (is_read && user_backed_iter(iter))
 		dio->flags |= DIO_SHOULD_DIRTY;
 
 	blk_start_plug(&plug);
@@ -335,7 +335,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 	dio->size = bio->bi_iter.bi_size;
 
 	if (is_read) {
-		if (iter_is_iovec(iter)) {
+		if (user_backed_iter(iter)) {
 			dio->flags |= DIO_SHOULD_DIRTY;
 			bio_set_pages_dirty(bio);
 		}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index da59e836a06e..c535de5852bf 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1262,7 +1262,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 	size_t count = iov_iter_count(iter);
 	loff_t pos = iocb->ki_pos;
 	bool write = iov_iter_rw(iter) == WRITE;
-	bool should_dirty = !write && iter_is_iovec(iter);
+	bool should_dirty = !write && user_backed_iter(iter);
 
 	if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
 		return -EROFS;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e64cda7a7610..e1e05b253daa 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -4004,7 +4004,7 @@ static ssize_t __cifs_readv(
 	if (!is_sync_kiocb(iocb))
 		ctx->iocb = iocb;
 
-	if (iter_is_iovec(to))
+	if (user_backed_iter(to))
 		ctx->should_dirty = true;
 
 	if (direct) {
diff --git a/fs/direct-io.c b/fs/direct-io.c
index df5e2d048799..c7fc01c2d509 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1251,7 +1251,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 	spin_lock_init(&dio->bio_lock);
 	dio->refcount = 1;
 
-	dio->should_dirty = iter_is_iovec(iter) && iov_iter_rw(iter) == READ;
+	dio->should_dirty = user_backed_iter(iter) && iov_iter_rw(iter) == READ;
 	sdio.iter = iter;
 	sdio.final_block_in_request = end >> blkbits;
 
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 0e537e580dc1..8d657c2cd6f7 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1356,7 +1356,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to)
 	if (!fud)
 		return -EPERM;
 
-	if (!iter_is_iovec(to))
+	if (!user_backed_iter(to))
 		return -EINVAL;
 
 	fuse_copy_init(&cs, 1, to);
@@ -1949,7 +1949,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
 	if (!fud)
 		return -EPERM;
 
-	if (!iter_is_iovec(from))
+	if (!user_backed_iter(from))
 		return -EINVAL;
 
 	fuse_copy_init(&cs, 0, from);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 00fa861aeead..c982e3afe3b4 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1465,7 +1465,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
 			inode_unlock(inode);
 	}
 
-	io->should_dirty = !write && iter_is_iovec(iter);
+	io->should_dirty = !write && user_backed_iter(iter);
 	while (count) {
 		ssize_t nres;
 		fl_owner_t owner = current->files;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 2cceb193dcd8..48e6cc74fdc1 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -780,7 +780,7 @@ static inline bool should_fault_in_pages(struct iov_iter *i,
 
 	if (!count)
 		return false;
-	if (!iter_is_iovec(i))
+	if (!user_backed_iter(i))
 		return false;
 
 	size = PAGE_SIZE;
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index c75d33d5c3ce..4eb559a16c9e 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -533,7 +533,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 			iomi.flags |= IOMAP_NOWAIT;
 		}
 
-		if (iter_is_iovec(iter))
+		if (user_backed_iter(iter))
 			dio->flags |= IOMAP_DIO_DIRTY;
 	} else {
 		iomi.flags |= IOMAP_WRITE;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 4eb2a8380a28..022e1ce63e62 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -478,7 +478,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 
-	if (iter_is_iovec(iter))
+	if (user_backed_iter(iter))
 		dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;
 
 	if (!swap)
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 9a2dc496d535..85bef84fd294 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -26,6 +26,7 @@ enum iter_type {
 	ITER_PIPE,
 	ITER_XARRAY,
 	ITER_DISCARD,
+	ITER_UBUF,
 };
 
 struct iov_iter_state {
@@ -38,6 +39,7 @@ struct iov_iter {
 	u8 iter_type;
 	bool nofault;
 	bool data_source;
+	bool user_backed;
 	size_t iov_offset;
 	size_t count;
 	union {
@@ -46,6 +48,7 @@ struct iov_iter {
 		const struct bio_vec *bvec;
 		struct xarray *xarray;
 		struct pipe_inode_info *pipe;
+		void __user *ubuf;
 	};
 	union {
 		unsigned long nr_segs;
@@ -70,6 +73,11 @@ static inline void iov_iter_save_state(struct iov_iter *iter,
 	state->nr_segs = iter->nr_segs;
 }
 
+static inline bool iter_is_ubuf(const struct iov_iter *i)
+{
+	return iov_iter_type(i) == ITER_UBUF;
+}
+
 static inline bool iter_is_iovec(const struct iov_iter *i)
 {
 	return iov_iter_type(i) == ITER_IOVEC;
@@ -105,6 +113,11 @@ static inline unsigned char iov_iter_rw(const struct iov_iter *i)
 	return i->data_source ? WRITE : READ;
 }
 
+static inline bool user_backed_iter(const struct iov_iter *i)
+{
+	return i->user_backed;
+}
+
 /*
  * Total number of bytes covered by an iovec.
  *
@@ -322,4 +335,17 @@ ssize_t __import_iovec(int type, const struct iovec __user *uvec,
 int import_single_range(int type, void __user *buf, size_t len,
 		 struct iovec *iov, struct iov_iter *i);
 
+static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
+			void __user *buf, size_t count)
+{
+	WARN_ON(direction & ~(READ | WRITE));
+	*i = (struct iov_iter) {
+		.iter_type = ITER_UBUF,
+		.user_backed = true,
+		.data_source = direction,
+		.ubuf = buf,
+		.count = count
+	};
+}
+
 #endif
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 0e0be334dbee..b3493d20536e 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -16,6 +16,16 @@
 
 #define PIPE_PARANOIA /* for now */
 
+/* covers ubuf and kbuf alike */
+#define iterate_buf(i, n, base, len, off, __p, STEP) {		\
+	size_t __maybe_unused off = 0;				\
+	len = n;						\
+	base = __p + i->iov_offset;				\
+	len -= (STEP);						\
+	i->iov_offset += len;					\
+	n = len;						\
+}
+
 /* covers iovec and kvec alike */
 #define iterate_iovec(i, n, base, len, off, __p, STEP) {	\
 	size_t off = 0;						\
@@ -110,7 +120,12 @@ __out:								\
 	if (unlikely(i->count < n))				\
 		n = i->count;					\
 	if (likely(n)) {					\
-		if (likely(iter_is_iovec(i))) {			\
+		if (likely(iter_is_ubuf(i))) {			\
+			void __user *base;			\
+			size_t len;				\
+			iterate_buf(i, n, base, len, off,	\
+						i->ubuf, (I)) 	\
+		} else if (likely(iter_is_iovec(i))) {		\
 			const struct iovec *iov = i->iov;	\
 			void __user *base;			\
 			size_t len;				\
@@ -275,7 +290,11 @@ out:
  */
 size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
 {
-	if (iter_is_iovec(i)) {
+	if (iter_is_ubuf(i)) {
+		size_t n = min(size, iov_iter_count(i));
+		n -= fault_in_readable(i->ubuf + i->iov_offset, n);
+		return size - n;
+	} else if (iter_is_iovec(i)) {
 		size_t count = min(size, iov_iter_count(i));
 		const struct iovec *p;
 		size_t skip;
@@ -314,7 +333,11 @@ EXPORT_SYMBOL(fault_in_iov_iter_readable);
  */
 size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
 {
-	if (iter_is_iovec(i)) {
+	if (iter_is_ubuf(i)) {
+		size_t n = min(size, iov_iter_count(i));
+		n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n);
+		return size - n;
+	} else if (iter_is_iovec(i)) {
 		size_t count = min(size, iov_iter_count(i));
 		const struct iovec *p;
 		size_t skip;
@@ -345,6 +368,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
 	*i = (struct iov_iter) {
 		.iter_type = ITER_IOVEC,
 		.nofault = false,
+		.user_backed = true,
 		.data_source = direction,
 		.iov = iov,
 		.nr_segs = nr_segs,
@@ -494,7 +518,7 @@ size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 {
 	if (unlikely(iov_iter_is_pipe(i)))
 		return copy_pipe_to_iter(addr, bytes, i);
-	if (iter_is_iovec(i))
+	if (user_backed_iter(i))
 		might_fault();
 	iterate_and_advance(i, bytes, base, len, off,
 		copyout(base, addr + off, len),
@@ -583,7 +607,7 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 {
 	if (unlikely(iov_iter_is_pipe(i)))
 		return copy_mc_pipe_to_iter(addr, bytes, i);
-	if (iter_is_iovec(i))
+	if (user_backed_iter(i))
 		might_fault();
 	__iterate_and_advance(i, bytes, base, len, off,
 		copyout_mc(base, addr + off, len),
@@ -601,7 +625,7 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 		WARN_ON(1);
 		return 0;
 	}
-	if (iter_is_iovec(i))
+	if (user_backed_iter(i))
 		might_fault();
 	iterate_and_advance(i, bytes, base, len, off,
 		copyin(addr + off, base, len),
@@ -894,16 +918,16 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
 {
 	if (unlikely(i->count < size))
 		size = i->count;
-	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
+	if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) {
+		i->iov_offset += size;
+		i->count -= size;
+	} else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
 		/* iovec and kvec have identical layouts */
 		iov_iter_iovec_advance(i, size);
 	} else if (iov_iter_is_bvec(i)) {
 		iov_iter_bvec_advance(i, size);
 	} else if (iov_iter_is_pipe(i)) {
 		pipe_advance(i, size);
-	} else if (unlikely(iov_iter_is_xarray(i))) {
-		i->iov_offset += size;
-		i->count -= size;
 	} else if (iov_iter_is_discard(i)) {
 		i->count -= size;
 	}
@@ -950,7 +974,7 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll)
 		return;
 	}
 	unroll -= i->iov_offset;
-	if (iov_iter_is_xarray(i)) {
+	if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) {
 		BUG(); /* We should never go beyond the start of the specified
 			* range since we might then be straying into pages that
 			* aren't pinned.
@@ -1158,6 +1182,14 @@ static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask,
 bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
 			 unsigned len_mask)
 {
+	if (likely(iter_is_ubuf(i))) {
+		if (i->count & len_mask)
+			return false;
+		if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask)
+			return false;
+		return true;
+	}
+
 	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
 		return iov_iter_aligned_iovec(i, addr_mask, len_mask);
 
@@ -1233,6 +1265,13 @@ static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
 
 unsigned long iov_iter_alignment(const struct iov_iter *i)
 {
+	if (likely(iter_is_ubuf(i))) {
+		size_t size = i->count;
+		if (size)
+			return ((unsigned long)i->ubuf + i->iov_offset) | size;
+		return 0;
+	}
+
 	/* iovec and kvec have identical layouts */
 	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
 		return iov_iter_alignment_iovec(i);
@@ -1263,6 +1302,9 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
 	size_t size = i->count;
 	unsigned k;
 
+	if (iter_is_ubuf(i))
+		return 0;
+
 	if (WARN_ON(!iter_is_iovec(i)))
 		return ~0U;
 
@@ -1385,12 +1427,15 @@ static ssize_t iter_xarray_get_pages(struct iov_iter *i,
 	return min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
 }
 
-/* must be done on non-empty ITER_IOVEC one */
+/* must be done on non-empty ITER_UBUF or ITER_IOVEC one */
 static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size)
 {
 	size_t skip;
 	long k;
 
+	if (iter_is_ubuf(i))
+		return (unsigned long)i->ubuf + i->iov_offset;
+
 	for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
 		size_t len = i->iov[k].iov_len - skip;
 
@@ -1432,7 +1477,7 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
 	if (maxsize > MAX_RW_COUNT)
 		maxsize = MAX_RW_COUNT;
 
-	if (likely(iter_is_iovec(i))) {
+	if (likely(user_backed_iter(i))) {
 		unsigned int gup_flags = 0;
 		unsigned long addr;
 
@@ -1559,7 +1604,7 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
 	if (maxsize > MAX_RW_COUNT)
 		maxsize = MAX_RW_COUNT;
 
-	if (likely(iter_is_iovec(i))) {
+	if (likely(user_backed_iter(i))) {
 		unsigned int gup_flags = 0;
 		unsigned long addr;
 
@@ -1715,6 +1760,11 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages)
 {
 	if (unlikely(!i->count))
 		return 0;
+	if (likely(iter_is_ubuf(i))) {
+		unsigned offs = offset_in_page(i->ubuf + i->iov_offset);
+		int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE);
+		return min(npages, maxpages);
+	}
 	/* iovec and kvec have identical layouts */
 	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
 		return iov_npages(i, maxpages);
@@ -1749,17 +1799,16 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
 		WARN_ON(1);
 		return NULL;
 	}
-	if (unlikely(iov_iter_is_discard(new) || iov_iter_is_xarray(new)))
-		return NULL;
 	if (iov_iter_is_bvec(new))
 		return new->bvec = kmemdup(new->bvec,
 				    new->nr_segs * sizeof(struct bio_vec),
 				    flags);
-	else
+	else if (iov_iter_is_kvec(new) || iter_is_iovec(new))
 		/* iovec and kvec have identical layout */
 		return new->iov = kmemdup(new->iov,
 				   new->nr_segs * sizeof(struct iovec),
 				   flags);
+	return NULL;
 }
 EXPORT_SYMBOL(dup_iter);
 
@@ -1953,10 +2002,12 @@ EXPORT_SYMBOL(import_single_range);
 void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
 {
 	if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i)) &&
-			 !iov_iter_is_kvec(i))
+			 !iov_iter_is_kvec(i) && !iter_is_ubuf(i))
 		return;
 	i->iov_offset = state->iov_offset;
 	i->count = state->count;
+	if (iter_is_ubuf(i))
+		return;
 	/*
 	 * For the *vec iters, nr_segs + iov is constant - if we increment
 	 * the vec, then we also decrement the nr_segs count. Hence we don't
diff --git a/mm/shmem.c b/mm/shmem.c
index e5e43b990fdc..e3a7e171bbd1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2602,7 +2602,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 			ret = copy_page_to_iter(page, offset, nr, to);
 			put_page(page);
 
-		} else if (iter_is_iovec(to)) {
+		} else if (user_backed_iter(to)) {
 			/*
 			 * Copy to user tends to be so well optimized, but
 			 * clear_user() not so much, that it is noticeably
-- 
cgit 


From 10f525a8cd7a525e9fc73288bb35428c9cad5e63 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 15 Jun 2022 02:02:51 -0400
Subject: ITER_PIPE: cache the type of last buffer

We often need to find whether the last buffer is anon or not, and
currently it's rather clumsy:
	check if ->iov_offset is non-zero (i.e. that pipe is not empty)
	if so, get the corresponding pipe_buffer and check its ->ops
	if it's &default_pipe_buf_ops, we have an anon buffer.

Let's replace the use of ->iov_offset (which is nowhere near similar to
its role for other flavours) with signed field (->last_offset), with
the following rules:
	empty, no buffers occupied:		0
	anon, with bytes up to N-1 filled:	N
	zero-copy, with bytes up to N-1 filled:	-N

That way abs(i->last_offset) is equal to what used to be in i->iov_offset
and empty vs. anon vs. zero-copy can be distinguished by the sign of
i->last_offset.

	Checks for "should we extend the last buffer or should we start
a new one?" become easier to follow that way.

	Note that most of the operations can only be done in a sane
state - i.e. when the pipe has nothing past the current position of
iterator.  About the only thing that could be done outside of that
state is iov_iter_advance(), which transitions to the sane state by
truncating the pipe.  There are only two cases where we leave the
sane state:
	1) iov_iter_get_pages()/iov_iter_get_pages_alloc().  Will be
dealt with later, when we make get_pages advancing - the callers are
actually happier that way.
	2) iov_iter copied, then something is put into the copy.  Since
they share the underlying pipe, the original gets behind.  When we
decide that we are done with the copy (original is not usable until then)
we advance the original.  direct_io used to be done that way; nowadays
it operates on the original and we do iov_iter_revert() to discard
the excessive data.  At the moment there's nothing in the kernel that
could do that to ITER_PIPE iterators, so this reason for insane state
is theoretical right now.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/uio.h |  5 +++-
 lib/iov_iter.c      | 77 ++++++++++++++++++++++++++---------------------------
 2 files changed, 42 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 85bef84fd294..e7fc29b5ad19 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -40,7 +40,10 @@ struct iov_iter {
 	bool nofault;
 	bool data_source;
 	bool user_backed;
-	size_t iov_offset;
+	union {
+		size_t iov_offset;
+		int last_offset;
+	};
 	size_t count;
 	union {
 		const struct iovec *iov;
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index c2e08004a1eb..8834f3f61220 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -199,7 +199,7 @@ static bool sanity(const struct iov_iter *i)
 	unsigned int i_head = i->head;
 	unsigned int idx;
 
-	if (i->iov_offset) {
+	if (i->last_offset) {
 		struct pipe_buffer *p;
 		if (unlikely(p_occupancy == 0))
 			goto Bad;	// pipe must be non-empty
@@ -207,7 +207,7 @@ static bool sanity(const struct iov_iter *i)
 			goto Bad;	// must be at the last buffer...
 
 		p = pipe_buf(pipe, i_head);
-		if (unlikely(p->offset + p->len != i->iov_offset))
+		if (unlikely(p->offset + p->len != abs(i->last_offset)))
 			goto Bad;	// ... at the end of segment
 	} else {
 		if (i_head != p_head)
@@ -215,7 +215,7 @@ static bool sanity(const struct iov_iter *i)
 	}
 	return true;
 Bad:
-	printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset);
+	printk(KERN_ERR "idx = %d, offset = %d\n", i_head, i->last_offset);
 	printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
 			p_head, p_tail, pipe->ring_size);
 	for (idx = 0; idx < pipe->ring_size; idx++)
@@ -259,30 +259,31 @@ static void push_page(struct pipe_inode_info *pipe, struct page *page,
 	get_page(page);
 }
 
-static inline bool allocated(struct pipe_buffer *buf)
+static inline int last_offset(const struct pipe_buffer *buf)
 {
-	return buf->ops == &default_pipe_buf_ops;
+	if (buf->ops == &default_pipe_buf_ops)
+		return buf->len;	// buf->offset is 0 for those
+	else
+		return -(buf->offset + buf->len);
 }
 
 static struct page *append_pipe(struct iov_iter *i, size_t size,
 				unsigned int *off)
 {
 	struct pipe_inode_info *pipe = i->pipe;
-	size_t offset = i->iov_offset;
+	int offset = i->last_offset;
 	struct pipe_buffer *buf;
 	struct page *page;
 
-	if (offset && offset < PAGE_SIZE) {
-		// some space in the last buffer; can we add to it?
+	if (offset > 0 && offset < PAGE_SIZE) {
+		// some space in the last buffer; add to it
 		buf = pipe_buf(pipe, pipe->head - 1);
-		if (allocated(buf)) {
-			size = min_t(size_t, size, PAGE_SIZE - offset);
-			buf->len += size;
-			i->iov_offset += size;
-			i->count -= size;
-			*off = offset;
-			return buf->page;
-		}
+		size = min_t(size_t, size, PAGE_SIZE - offset);
+		buf->len += size;
+		i->last_offset += size;
+		i->count -= size;
+		*off = offset;
+		return buf->page;
 	}
 	// OK, we need a new buffer
 	*off = 0;
@@ -293,7 +294,7 @@ static struct page *append_pipe(struct iov_iter *i, size_t size,
 	if (!page)
 		return NULL;
 	i->head = pipe->head - 1;
-	i->iov_offset = size;
+	i->last_offset = size;
 	i->count -= size;
 	return page;
 }
@@ -313,11 +314,11 @@ static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t by
 	if (!sanity(i))
 		return 0;
 
-	if (offset && i->iov_offset == offset) { // could we merge it?
+	if (offset && i->last_offset == -offset) { // could we merge it?
 		struct pipe_buffer *buf = pipe_buf(pipe, head - 1);
 		if (buf->page == page) {
 			buf->len += bytes;
-			i->iov_offset += bytes;
+			i->last_offset -= bytes;
 			i->count -= bytes;
 			return bytes;
 		}
@@ -326,7 +327,7 @@ static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t by
 		return 0;
 
 	push_page(pipe, page, offset, bytes);
-	i->iov_offset = offset + bytes;
+	i->last_offset = -(offset + bytes);
 	i->head = head;
 	i->count -= bytes;
 	return bytes;
@@ -438,16 +439,15 @@ EXPORT_SYMBOL(iov_iter_init);
 static inline void data_start(const struct iov_iter *i,
 			      unsigned int *iter_headp, size_t *offp)
 {
-	unsigned int iter_head = i->head;
-	size_t off = i->iov_offset;
+	int off = i->last_offset;
 
-	if (off && (!allocated(pipe_buf(i->pipe, iter_head)) ||
-		    off == PAGE_SIZE)) {
-		iter_head++;
-		off = 0;
+	if (off > 0 && off < PAGE_SIZE) { // anon and not full
+		*iter_headp = i->pipe->head - 1;
+		*offp = off;
+	} else {
+		*iter_headp = i->pipe->head;
+		*offp = 0;
 	}
-	*iter_headp = iter_head;
-	*offp = off;
 }
 
 static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
@@ -819,7 +819,7 @@ EXPORT_SYMBOL(copy_page_from_iter_atomic);
 static void pipe_advance(struct iov_iter *i, size_t size)
 {
 	struct pipe_inode_info *pipe = i->pipe;
-	unsigned int off = i->iov_offset;
+	int off = i->last_offset;
 
 	if (!off && !size) {
 		pipe_discard_from(pipe, i->start_head); // discard everything
@@ -829,10 +829,10 @@ static void pipe_advance(struct iov_iter *i, size_t size)
 	while (1) {
 		struct pipe_buffer *buf = pipe_buf(pipe, i->head);
 		if (off) /* make it relative to the beginning of buffer */
-			size += off - buf->offset;
+			size += abs(off) - buf->offset;
 		if (size <= buf->len) {
 			buf->len = size;
-			i->iov_offset = buf->offset + size;
+			i->last_offset = last_offset(buf);
 			break;
 		}
 		size -= buf->len;
@@ -916,7 +916,7 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll)
 			struct pipe_buffer *b = pipe_buf(pipe, --head);
 			if (unroll < b->len) {
 				b->len -= unroll;
-				i->iov_offset = b->offset + b->len;
+				i->last_offset = last_offset(b);
 				i->head = head;
 				return;
 			}
@@ -924,7 +924,7 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll)
 			pipe_buf_release(pipe, b);
 			pipe->head--;
 		}
-		i->iov_offset = 0;
+		i->last_offset = 0;
 		i->head = head;
 		return;
 	}
@@ -1027,7 +1027,7 @@ void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
 		.pipe = pipe,
 		.head = pipe->head,
 		.start_head = pipe->head,
-		.iov_offset = 0,
+		.last_offset = 0,
 		.count = count
 	};
 }
@@ -1158,13 +1158,12 @@ bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
 		return iov_iter_aligned_bvec(i, addr_mask, len_mask);
 
 	if (iov_iter_is_pipe(i)) {
-		unsigned int p_mask = i->pipe->ring_size - 1;
 		size_t size = i->count;
 
 		if (size & len_mask)
 			return false;
-		if (size && allocated(&i->pipe->bufs[i->head & p_mask])) {
-			if (i->iov_offset & addr_mask)
+		if (size && i->last_offset > 0) {
+			if (i->last_offset & addr_mask)
 				return false;
 		}
 
@@ -1243,8 +1242,8 @@ unsigned long iov_iter_alignment(const struct iov_iter *i)
 	if (iov_iter_is_pipe(i)) {
 		size_t size = i->count;
 
-		if (size && i->iov_offset && allocated(pipe_buf(i->pipe, i->head)))
-			return size | i->iov_offset;
+		if (size && i->last_offset > 0)
+			return size | i->last_offset;
 		return size;
 	}
 
-- 
cgit 


From 12d426ab64a1c75f1b2ee5c33e933a4c16004049 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 15 Jun 2022 09:44:38 -0400
Subject: ITER_PIPE: fold data_start() and pipe_space_for_user() together

All their callers are next to each other; all of them
want the total amount of pages and, possibly, the
offset in the partial final buffer.

Combine into a new helper (pipe_npages()), fix the
bogosity in pipe_space_for_user(), while we are at it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/pipe_fs_i.h | 20 --------------------
 lib/iov_iter.c            | 44 +++++++++++++++++++-------------------------
 2 files changed, 19 insertions(+), 45 deletions(-)

(limited to 'include')

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 4ea496924106..6cb65df3e3ba 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -156,26 +156,6 @@ static inline bool pipe_full(unsigned int head, unsigned int tail,
 	return pipe_occupancy(head, tail) >= limit;
 }
 
-/**
- * pipe_space_for_user - Return number of slots available to userspace
- * @head: The pipe ring head pointer
- * @tail: The pipe ring tail pointer
- * @pipe: The pipe info structure
- */
-static inline unsigned int pipe_space_for_user(unsigned int head, unsigned int tail,
-					       struct pipe_inode_info *pipe)
-{
-	unsigned int p_occupancy, p_space;
-
-	p_occupancy = pipe_occupancy(head, tail);
-	if (p_occupancy >= pipe->max_usage)
-		return 0;
-	p_space = pipe->ring_size - p_occupancy;
-	if (p_space > pipe->max_usage)
-		p_space = pipe->max_usage;
-	return p_space;
-}
-
 /**
  * pipe_buf_get - get a reference to a pipe_buffer
  * @pipe:	the pipe that the buffer belongs to
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 8834f3f61220..12dda1013bea 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -436,18 +436,20 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
 }
 EXPORT_SYMBOL(iov_iter_init);
 
-static inline void data_start(const struct iov_iter *i,
-			      unsigned int *iter_headp, size_t *offp)
+// returns the offset in partial buffer (if any)
+static inline unsigned int pipe_npages(const struct iov_iter *i, int *npages)
 {
+	struct pipe_inode_info *pipe = i->pipe;
+	int used = pipe->head - pipe->tail;
 	int off = i->last_offset;
 
+	*npages = max((int)pipe->max_usage - used, 0);
+
 	if (off > 0 && off < PAGE_SIZE) { // anon and not full
-		*iter_headp = i->pipe->head - 1;
-		*offp = off;
-	} else {
-		*iter_headp = i->pipe->head;
-		*offp = 0;
+		(*npages)++;
+		return off;
 	}
+	return 0;
 }
 
 static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
@@ -1318,18 +1320,16 @@ static ssize_t pipe_get_pages(struct iov_iter *i,
 		   struct page **pages, size_t maxsize, unsigned maxpages,
 		   size_t *start)
 {
-	unsigned int iter_head, npages;
+	unsigned int npages, off;
 	size_t capacity;
 
 	if (!sanity(i))
 		return -EFAULT;
 
-	data_start(i, &iter_head, start);
-	/* Amount of free space: some of this one + all after this one */
-	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
-	capacity = min(npages, maxpages) * PAGE_SIZE - *start;
+	*start = off = pipe_npages(i, &npages);
+	capacity = min(npages, maxpages) * PAGE_SIZE - off;
 
-	return __pipe_get_pages(i, min(maxsize, capacity), pages, *start);
+	return __pipe_get_pages(i, min(maxsize, capacity), pages, off);
 }
 
 static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
@@ -1494,24 +1494,22 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
 		   size_t *start)
 {
 	struct page **p;
-	unsigned int iter_head, npages;
+	unsigned int npages, off;
 	ssize_t n;
 
 	if (!sanity(i))
 		return -EFAULT;
 
-	data_start(i, &iter_head, start);
-	/* Amount of free space: some of this one + all after this one */
-	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
-	n = npages * PAGE_SIZE - *start;
+	*start = off = pipe_npages(i, &npages);
+	n = npages * PAGE_SIZE - off;
 	if (maxsize > n)
 		maxsize = n;
 	else
-		npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
+		npages = DIV_ROUND_UP(maxsize + off, PAGE_SIZE);
 	p = get_pages_array(npages);
 	if (!p)
 		return -ENOMEM;
-	n = __pipe_get_pages(i, maxsize, p, *start);
+	n = __pipe_get_pages(i, maxsize, p, off);
 	if (n > 0)
 		*pages = p;
 	else
@@ -1739,16 +1737,12 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages)
 	if (iov_iter_is_bvec(i))
 		return bvec_npages(i, maxpages);
 	if (iov_iter_is_pipe(i)) {
-		unsigned int iter_head;
 		int npages;
-		size_t off;
 
 		if (!sanity(i))
 			return 0;
 
-		data_start(i, &iter_head, &off);
-		/* some of this one + all after this one */
-		npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
+		pipe_npages(i, &npages);
 		return min(npages, maxpages);
 	}
 	if (iov_iter_is_xarray(i)) {
-- 
cgit 


From 1ef255e257173f4bc44317ef2076e7e0de688fdf Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 9 Jun 2022 10:28:36 -0400
Subject: iov_iter: advancing variants of iov_iter_get_pages{,_alloc}()

Most of the users immediately follow successful iov_iter_get_pages()
with advancing by the amount it had returned.

Provide inline wrappers doing that, convert trivial open-coded
uses of those.

BTW, iov_iter_get_pages() never returns more than it had been asked
to; such checks in cifs ought to be removed someday...

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/vhost/scsi.c |  4 +---
 fs/ceph/file.c       |  3 +--
 fs/cifs/file.c       |  6 ++----
 fs/cifs/misc.c       |  3 +--
 fs/direct-io.c       |  3 +--
 fs/fuse/dev.c        |  3 +--
 fs/fuse/file.c       |  3 +--
 fs/nfs/direct.c      |  6 ++----
 include/linux/uio.h  | 20 ++++++++++++++++++++
 net/core/datagram.c  |  3 +--
 net/core/skmsg.c     |  3 +--
 net/rds/message.c    |  3 +--
 net/tls/tls_sw.c     |  4 +---
 13 files changed, 34 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index ffd9e6c2ffc1..9b65509424dc 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -643,14 +643,12 @@ vhost_scsi_map_to_sgl(struct vhost_scsi_cmd *cmd,
 	size_t offset;
 	unsigned int npages = 0;
 
-	bytes = iov_iter_get_pages(iter, pages, LONG_MAX,
+	bytes = iov_iter_get_pages2(iter, pages, LONG_MAX,
 				VHOST_SCSI_PREALLOC_UPAGES, &offset);
 	/* No pages were pinned */
 	if (bytes <= 0)
 		return bytes < 0 ? bytes : -EFAULT;
 
-	iov_iter_advance(iter, bytes);
-
 	while (bytes) {
 		unsigned n = min_t(unsigned, PAGE_SIZE - offset, bytes);
 		sg_set_page(sg++, pages[npages++], n, offset);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index c535de5852bf..8fab5db16c73 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -95,12 +95,11 @@ static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize,
 		size_t start;
 		int idx = 0;
 
-		bytes = iov_iter_get_pages(iter, pages, maxsize - size,
+		bytes = iov_iter_get_pages2(iter, pages, maxsize - size,
 					   ITER_GET_BVECS_PAGES, &start);
 		if (bytes < 0)
 			return size ?: bytes;
 
-		iov_iter_advance(iter, bytes);
 		size += bytes;
 
 		for ( ; bytes; idx++, bvec_idx++) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e1e05b253daa..3ba013e2987f 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3022,7 +3022,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
 		if (ctx->direct_io) {
 			ssize_t result;
 
-			result = iov_iter_get_pages_alloc(
+			result = iov_iter_get_pages_alloc2(
 				from, &pagevec, cur_len, &start);
 			if (result < 0) {
 				cifs_dbg(VFS,
@@ -3036,7 +3036,6 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
 				break;
 			}
 			cur_len = (size_t)result;
-			iov_iter_advance(from, cur_len);
 
 			nr_pages =
 				(cur_len + start + PAGE_SIZE - 1) / PAGE_SIZE;
@@ -3758,7 +3757,7 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
 		if (ctx->direct_io) {
 			ssize_t result;
 
-			result = iov_iter_get_pages_alloc(
+			result = iov_iter_get_pages_alloc2(
 					&direct_iov, &pagevec,
 					cur_len, &start);
 			if (result < 0) {
@@ -3774,7 +3773,6 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
 				break;
 			}
 			cur_len = (size_t)result;
-			iov_iter_advance(&direct_iov, cur_len);
 
 			rdata = cifs_readdata_direct_alloc(
 					pagevec, cifs_uncached_readv_complete);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 0e84e6fcf8ab..f833953bab61 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -1029,7 +1029,7 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw)
 	saved_len = count;
 
 	while (count && npages < max_pages) {
-		rc = iov_iter_get_pages(iter, pages, count, max_pages, &start);
+		rc = iov_iter_get_pages2(iter, pages, count, max_pages, &start);
 		if (rc < 0) {
 			cifs_dbg(VFS, "Couldn't get user pages (rc=%zd)\n", rc);
 			break;
@@ -1041,7 +1041,6 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw)
 			break;
 		}
 
-		iov_iter_advance(iter, rc);
 		count -= rc;
 		rc += start;
 		cur_npages = DIV_ROUND_UP(rc, PAGE_SIZE);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index c7fc01c2d509..f669163d5860 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -169,7 +169,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
 	const enum req_op dio_op = dio->opf & REQ_OP_MASK;
 	ssize_t ret;
 
-	ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES,
+	ret = iov_iter_get_pages2(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES,
 				&sdio->from);
 
 	if (ret < 0 && sdio->blocks_available && dio_op == REQ_OP_WRITE) {
@@ -191,7 +191,6 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
 	}
 
 	if (ret >= 0) {
-		iov_iter_advance(sdio->iter, ret);
 		ret += sdio->from;
 		sdio->head = 0;
 		sdio->tail = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8d657c2cd6f7..51897427a534 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -730,14 +730,13 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 		}
 	} else {
 		size_t off;
-		err = iov_iter_get_pages(cs->iter, &page, PAGE_SIZE, 1, &off);
+		err = iov_iter_get_pages2(cs->iter, &page, PAGE_SIZE, 1, &off);
 		if (err < 0)
 			return err;
 		BUG_ON(!err);
 		cs->len = err;
 		cs->offset = off;
 		cs->pg = page;
-		iov_iter_advance(cs->iter, err);
 	}
 
 	return lock_request(cs->req);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c982e3afe3b4..69e19fc0afc1 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1401,14 +1401,13 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
 	while (nbytes < *nbytesp && ap->num_pages < max_pages) {
 		unsigned npages;
 		size_t start;
-		ret = iov_iter_get_pages(ii, &ap->pages[ap->num_pages],
+		ret = iov_iter_get_pages2(ii, &ap->pages[ap->num_pages],
 					*nbytesp - nbytes,
 					max_pages - ap->num_pages,
 					&start);
 		if (ret < 0)
 			break;
 
-		iov_iter_advance(ii, ret);
 		nbytes += ret;
 
 		ret += start;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 022e1ce63e62..c275c83f0aef 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -364,13 +364,12 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 		size_t pgbase;
 		unsigned npages, i;
 
-		result = iov_iter_get_pages_alloc(iter, &pagevec, 
+		result = iov_iter_get_pages_alloc2(iter, &pagevec,
 						  rsize, &pgbase);
 		if (result < 0)
 			break;
 	
 		bytes = result;
-		iov_iter_advance(iter, bytes);
 		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
 		for (i = 0; i < npages; i++) {
 			struct nfs_page *req;
@@ -812,13 +811,12 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 		size_t pgbase;
 		unsigned npages, i;
 
-		result = iov_iter_get_pages_alloc(iter, &pagevec, 
+		result = iov_iter_get_pages_alloc2(iter, &pagevec,
 						  wsize, &pgbase);
 		if (result < 0)
 			break;
 
 		bytes = result;
-		iov_iter_advance(iter, bytes);
 		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
 		for (i = 0; i < npages; i++) {
 			struct nfs_page *req;
diff --git a/include/linux/uio.h b/include/linux/uio.h
index e7fc29b5ad19..b70d28693400 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -351,4 +351,24 @@ static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
 	};
 }
 
+static inline ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
+			size_t maxsize, unsigned maxpages, size_t *start)
+{
+	ssize_t res = iov_iter_get_pages(i, pages, maxsize, maxpages, start);
+
+	if (res >= 0)
+		iov_iter_advance(i, res);
+	return res;
+}
+
+static inline ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages,
+			size_t maxsize, size_t *start)
+{
+	ssize_t res = iov_iter_get_pages_alloc(i, pages, maxsize, start);
+
+	if (res >= 0)
+		iov_iter_advance(i, res);
+	return res;
+}
+
 #endif
diff --git a/net/core/datagram.c b/net/core/datagram.c
index f3988ef8e9af..7255531f63ae 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -632,12 +632,11 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
 		if (frag == MAX_SKB_FRAGS)
 			return -EMSGSIZE;
 
-		copied = iov_iter_get_pages(from, pages, length,
+		copied = iov_iter_get_pages2(from, pages, length,
 					    MAX_SKB_FRAGS - frag, &start);
 		if (copied < 0)
 			return -EFAULT;
 
-		iov_iter_advance(from, copied);
 		length -= copied;
 
 		truesize = PAGE_ALIGN(copied + start);
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 81627892bdd4..cf3c24c8610d 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -324,14 +324,13 @@ int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
 			goto out;
 		}
 
-		copied = iov_iter_get_pages(from, pages, bytes, maxpages,
+		copied = iov_iter_get_pages2(from, pages, bytes, maxpages,
 					    &offset);
 		if (copied <= 0) {
 			ret = -EFAULT;
 			goto out;
 		}
 
-		iov_iter_advance(from, copied);
 		bytes -= copied;
 		msg->sg.size += copied;
 
diff --git a/net/rds/message.c b/net/rds/message.c
index 799034e0f513..d74be4e3f3fa 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -391,7 +391,7 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
 		size_t start;
 		ssize_t copied;
 
-		copied = iov_iter_get_pages(from, &pages, PAGE_SIZE,
+		copied = iov_iter_get_pages2(from, &pages, PAGE_SIZE,
 					    1, &start);
 		if (copied < 0) {
 			struct mmpin *mmp;
@@ -405,7 +405,6 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
 			goto err;
 		}
 		total_copied += copied;
-		iov_iter_advance(from, copied);
 		length -= copied;
 		sg_set_page(sg, pages, copied, start);
 		rm->data.op_nents++;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 17db8c8811fa..f76119f62f1b 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1352,7 +1352,7 @@ static int tls_setup_from_iter(struct iov_iter *from,
 			rc = -EFAULT;
 			goto out;
 		}
-		copied = iov_iter_get_pages(from, pages,
+		copied = iov_iter_get_pages2(from, pages,
 					    length,
 					    maxpages, &offset);
 		if (copied <= 0) {
@@ -1360,8 +1360,6 @@ static int tls_setup_from_iter(struct iov_iter *from,
 			goto out;
 		}
 
-		iov_iter_advance(from, copied);
-
 		length -= copied;
 		size += copied;
 		while (copied) {
-- 
cgit 


From eba2d3d798295dc43cae8fade102f9d083a2a741 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 10 Jun 2022 13:05:12 -0400
Subject: get rid of non-advancing variants

mechanical change; will be further massaged in subsequent commits

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/uio.h | 24 ++----------------------
 lib/iov_iter.c      | 27 ++++++++++++++++++---------
 2 files changed, 20 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index b70d28693400..5896af36199c 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -247,9 +247,9 @@ void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode
 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
 void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray,
 		     loff_t start, size_t count);
-ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
+ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
 			size_t maxsize, unsigned maxpages, size_t *start);
-ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
+ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages,
 			size_t maxsize, size_t *start);
 int iov_iter_npages(const struct iov_iter *i, int maxpages);
 void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state);
@@ -351,24 +351,4 @@ static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
 	};
 }
 
-static inline ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
-			size_t maxsize, unsigned maxpages, size_t *start)
-{
-	ssize_t res = iov_iter_get_pages(i, pages, maxsize, maxpages, start);
-
-	if (res >= 0)
-		iov_iter_advance(i, res);
-	return res;
-}
-
-static inline ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages,
-			size_t maxsize, size_t *start)
-{
-	ssize_t res = iov_iter_get_pages_alloc(i, pages, maxsize, start);
-
-	if (res >= 0)
-		iov_iter_advance(i, res);
-	return res;
-}
-
 #endif
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index f003a20d8683..c48c83602aae 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1328,6 +1328,7 @@ static ssize_t pipe_get_pages(struct iov_iter *i,
 		left -= PAGE_SIZE - off;
 		if (left <= 0) {
 			buf->len += maxsize;
+			iov_iter_advance(i, maxsize);
 			return maxsize;
 		}
 		buf->len = PAGE_SIZE;
@@ -1347,7 +1348,9 @@ static ssize_t pipe_get_pages(struct iov_iter *i,
 	}
 	if (!npages)
 		return -EFAULT;
-	return maxsize - left;
+	maxsize -= left;
+	iov_iter_advance(i, maxsize);
+	return maxsize;
 }
 
 static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
@@ -1397,7 +1400,9 @@ static ssize_t iter_xarray_get_pages(struct iov_iter *i,
 	if (nr == 0)
 		return 0;
 
-	return min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
+	maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
+	iov_iter_advance(i, maxsize);
+	return maxsize;
 }
 
 /* must be done on non-empty ITER_UBUF or ITER_IOVEC one */
@@ -1469,7 +1474,9 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
 		res = get_user_pages_fast(addr, n, gup_flags, *pages);
 		if (unlikely(res <= 0))
 			return res;
-		return min_t(size_t, maxsize, res * PAGE_SIZE - *start);
+		maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start);
+		iov_iter_advance(i, maxsize);
+		return maxsize;
 	}
 	if (iov_iter_is_bvec(i)) {
 		struct page **p;
@@ -1481,8 +1488,10 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
 			return -ENOMEM;
 		p = *pages;
 		for (int k = 0; k < n; k++)
-			get_page(*p++ = page++);
-		return min_t(size_t, maxsize, n * PAGE_SIZE - *start);
+			get_page(p[k] = page + k);
+		maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start);
+		iov_iter_advance(i, maxsize);
+		return maxsize;
 	}
 	if (iov_iter_is_pipe(i))
 		return pipe_get_pages(i, pages, maxsize, maxpages, start);
@@ -1491,7 +1500,7 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
 	return -EFAULT;
 }
 
-ssize_t iov_iter_get_pages(struct iov_iter *i,
+ssize_t iov_iter_get_pages2(struct iov_iter *i,
 		   struct page **pages, size_t maxsize, unsigned maxpages,
 		   size_t *start)
 {
@@ -1501,9 +1510,9 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
 
 	return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start);
 }
-EXPORT_SYMBOL(iov_iter_get_pages);
+EXPORT_SYMBOL(iov_iter_get_pages2);
 
-ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
+ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
 		   struct page ***pages, size_t maxsize,
 		   size_t *start)
 {
@@ -1518,7 +1527,7 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
 	}
 	return len;
 }
-EXPORT_SYMBOL(iov_iter_get_pages_alloc);
+EXPORT_SYMBOL(iov_iter_get_pages_alloc2);
 
 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
 			       struct iov_iter *i)
-- 
cgit 


From 1a1e3aca9d4957e282945cdc2b58e7c560b8e0d2 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 5 Aug 2022 06:43:48 -0400
Subject: fscache: add tracepoint when failing cookie

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/fscache/cookie.c            | 2 ++
 include/trace/events/fscache.h | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 26a6d395737a..451d8a077e12 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -263,6 +263,8 @@ void fscache_caching_failed(struct fscache_cookie *cookie)
 {
 	clear_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags);
 	fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_FAILED);
+	trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref),
+				fscache_cookie_failed);
 }
 EXPORT_SYMBOL(fscache_caching_failed);
 
diff --git a/include/trace/events/fscache.h b/include/trace/events/fscache.h
index cb3fb337e880..c078c48a8e6d 100644
--- a/include/trace/events/fscache.h
+++ b/include/trace/events/fscache.h
@@ -49,6 +49,7 @@ enum fscache_volume_trace {
 enum fscache_cookie_trace {
 	fscache_cookie_collision,
 	fscache_cookie_discard,
+	fscache_cookie_failed,
 	fscache_cookie_get_attach_object,
 	fscache_cookie_get_end_access,
 	fscache_cookie_get_hash_collision,
@@ -131,6 +132,7 @@ enum fscache_access_trace {
 #define fscache_cookie_traces						\
 	EM(fscache_cookie_collision,		"*COLLIDE*")		\
 	EM(fscache_cookie_discard,		"DISCARD  ")		\
+	EM(fscache_cookie_failed,		"FAILED   ")		\
 	EM(fscache_cookie_get_attach_object,	"GET attch")		\
 	EM(fscache_cookie_get_hash_collision,	"GET hcoll")		\
 	EM(fscache_cookie_get_end_access,	"GQ  endac")		\
-- 
cgit 


From 34aae2c2fb1e3d88a5aeee16715cb6bf0336cdce Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 9 Aug 2022 11:25:43 +0200
Subject: netfilter: nf_tables: validate variable length element extension

Update template to validate variable length extensions. This patch adds
a new .ext_len[id] field to the template to store the expected extension
length. This is used to sanity check the initialization of the variable
length extension.

Use PTR_ERR() in nft_set_elem_init() to report errors since, after this
update, there are two reason why this might fail, either because of
ENOMEM or insufficient room in the extension field (EINVAL).

Kernels up until 7e6bc1f6cabc ("netfilter: nf_tables: stricter
validation of element data") allowed to copy more data to the extension
than was allocated. This ext_len field allows to validate if the
destination has the correct size as additional check.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  4 +-
 net/netfilter/nf_tables_api.c     | 84 ++++++++++++++++++++++++++++++++-------
 net/netfilter/nft_dynset.c        |  2 +-
 3 files changed, 73 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 8bfb9c74afbf..7ece4fd0cf66 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -651,6 +651,7 @@ extern const struct nft_set_ext_type nft_set_ext_types[];
 struct nft_set_ext_tmpl {
 	u16	len;
 	u8	offset[NFT_SET_EXT_NUM];
+	u8	ext_len[NFT_SET_EXT_NUM];
 };
 
 /**
@@ -680,7 +681,8 @@ static inline int nft_set_ext_add_length(struct nft_set_ext_tmpl *tmpl, u8 id,
 		return -EINVAL;
 
 	tmpl->offset[id] = tmpl->len;
-	tmpl->len	+= nft_set_ext_types[id].len + len;
+	tmpl->ext_len[id] = nft_set_ext_types[id].len + len;
+	tmpl->len	+= tmpl->ext_len[id];
 
 	return 0;
 }
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 9f976b11d896..3b09e13b9b5c 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5467,6 +5467,27 @@ err_set_elem_expr:
 	return ERR_PTR(err);
 }
 
+static int nft_set_ext_check(const struct nft_set_ext_tmpl *tmpl, u8 id, u32 len)
+{
+	len += nft_set_ext_types[id].len;
+	if (len > tmpl->ext_len[id] ||
+	    len > U8_MAX)
+		return -1;
+
+	return 0;
+}
+
+static int nft_set_ext_memcpy(const struct nft_set_ext_tmpl *tmpl, u8 id,
+			      void *to, const void *from, u32 len)
+{
+	if (nft_set_ext_check(tmpl, id, len) < 0)
+		return -1;
+
+	memcpy(to, from, len);
+
+	return 0;
+}
+
 void *nft_set_elem_init(const struct nft_set *set,
 			const struct nft_set_ext_tmpl *tmpl,
 			const u32 *key, const u32 *key_end,
@@ -5477,17 +5498,26 @@ void *nft_set_elem_init(const struct nft_set *set,
 
 	elem = kzalloc(set->ops->elemsize + tmpl->len, gfp);
 	if (elem == NULL)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	ext = nft_set_elem_ext(set, elem);
 	nft_set_ext_init(ext, tmpl);
 
-	if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY))
-		memcpy(nft_set_ext_key(ext), key, set->klen);
-	if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END))
-		memcpy(nft_set_ext_key_end(ext), key_end, set->klen);
-	if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
-		memcpy(nft_set_ext_data(ext), data, set->dlen);
+	if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY) &&
+	    nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY,
+			       nft_set_ext_key(ext), key, set->klen) < 0)
+		goto err_ext_check;
+
+	if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END) &&
+	    nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY_END,
+			       nft_set_ext_key_end(ext), key_end, set->klen) < 0)
+		goto err_ext_check;
+
+	if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
+	    nft_set_ext_memcpy(tmpl, NFT_SET_EXT_DATA,
+			       nft_set_ext_data(ext), data, set->dlen) < 0)
+		goto err_ext_check;
+
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
 		*nft_set_ext_expiration(ext) = get_jiffies_64() + expiration;
 		if (expiration == 0)
@@ -5497,6 +5527,11 @@ void *nft_set_elem_init(const struct nft_set *set,
 		*nft_set_ext_timeout(ext) = timeout;
 
 	return elem;
+
+err_ext_check:
+	kfree(elem);
+
+	return ERR_PTR(-EINVAL);
 }
 
 static void __nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
@@ -5584,14 +5619,25 @@ err_expr:
 }
 
 static int nft_set_elem_expr_setup(struct nft_ctx *ctx,
+				   const struct nft_set_ext_tmpl *tmpl,
 				   const struct nft_set_ext *ext,
 				   struct nft_expr *expr_array[],
 				   u32 num_exprs)
 {
 	struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext);
+	u32 len = sizeof(struct nft_set_elem_expr);
 	struct nft_expr *expr;
 	int i, err;
 
+	if (num_exprs == 0)
+		return 0;
+
+	for (i = 0; i < num_exprs; i++)
+		len += expr_array[i]->ops->size;
+
+	if (nft_set_ext_check(tmpl, NFT_SET_EXT_EXPRESSIONS, len) < 0)
+		return -EINVAL;
+
 	for (i = 0; i < num_exprs; i++) {
 		expr = nft_setelem_expr_at(elem_expr, elem_expr->size);
 		err = nft_expr_clone(expr, expr_array[i]);
@@ -6054,17 +6100,23 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 		}
 	}
 
-	err = -ENOMEM;
 	elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data,
 				      elem.key_end.val.data, elem.data.val.data,
 				      timeout, expiration, GFP_KERNEL_ACCOUNT);
-	if (elem.priv == NULL)
+	if (IS_ERR(elem.priv)) {
+		err = PTR_ERR(elem.priv);
 		goto err_parse_data;
+	}
 
 	ext = nft_set_elem_ext(set, elem.priv);
 	if (flags)
 		*nft_set_ext_flags(ext) = flags;
+
 	if (ulen > 0) {
+		if (nft_set_ext_check(&tmpl, NFT_SET_EXT_USERDATA, ulen) < 0) {
+			err = -EINVAL;
+			goto err_elem_userdata;
+		}
 		udata = nft_set_ext_userdata(ext);
 		udata->len = ulen - 1;
 		nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen);
@@ -6073,14 +6125,14 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 		*nft_set_ext_obj(ext) = obj;
 		obj->use++;
 	}
-	err = nft_set_elem_expr_setup(ctx, ext, expr_array, num_exprs);
+	err = nft_set_elem_expr_setup(ctx, &tmpl, ext, expr_array, num_exprs);
 	if (err < 0)
-		goto err_elem_expr;
+		goto err_elem_free;
 
 	trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set);
 	if (trans == NULL) {
 		err = -ENOMEM;
-		goto err_elem_expr;
+		goto err_elem_free;
 	}
 
 	ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK;
@@ -6126,10 +6178,10 @@ err_set_full:
 	nft_setelem_remove(ctx->net, set, &elem);
 err_element_clash:
 	kfree(trans);
-err_elem_expr:
+err_elem_free:
 	if (obj)
 		obj->use--;
-
+err_elem_userdata:
 	nf_tables_set_elem_destroy(ctx, set, elem.priv);
 err_parse_data:
 	if (nla[NFTA_SET_ELEM_DATA] != NULL)
@@ -6311,8 +6363,10 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
 	elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data,
 				      elem.key_end.val.data, NULL, 0, 0,
 				      GFP_KERNEL_ACCOUNT);
-	if (elem.priv == NULL)
+	if (IS_ERR(elem.priv)) {
+		err = PTR_ERR(elem.priv);
 		goto fail_elem_key_end;
+	}
 
 	ext = nft_set_elem_ext(set, elem.priv);
 	if (flags)
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 22f70b543fa2..6983e6ddeef9 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -60,7 +60,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
 				 &regs->data[priv->sreg_key], NULL,
 				 &regs->data[priv->sreg_data],
 				 timeout, 0, GFP_ATOMIC);
-	if (elem == NULL)
+	if (IS_ERR(elem))
 		goto err1;
 
 	ext = nft_set_elem_ext(set, elem);
-- 
cgit 


From 134941683b89d05b5e5c28c817c95049ba409d01 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 6 Aug 2022 17:39:20 +0200
Subject: netfilter: ip6t_LOG: Fix a typo in a comment

s/_IPT_LOG_H/_IP6T_LOG_H/

While at it add some surrounding space to ease reading.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter_ipv6/ip6t_LOG.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_LOG.h b/include/uapi/linux/netfilter_ipv6/ip6t_LOG.h
index 23e91a9c2583..0b7b16dbdec2 100644
--- a/include/uapi/linux/netfilter_ipv6/ip6t_LOG.h
+++ b/include/uapi/linux/netfilter_ipv6/ip6t_LOG.h
@@ -17,4 +17,4 @@ struct ip6t_log_info {
 	char prefix[30];
 };
 
-#endif /*_IPT_LOG_H*/
+#endif /* _IP6T_LOG_H */
-- 
cgit 


From 46dae32fe625a75f549c3a70edc77b778197bb05 Mon Sep 17 00:00:00 2001
From: Youngmin Nam <youngmin.nam@samsung.com>
Date: Tue, 12 Jul 2022 18:47:15 +0900
Subject: time: Correct the prototype of ns_to_kernel_old_timeval and
 ns_to_timespec64

In ns_to_kernel_old_timeval() definition, the function argument is defined
with const identifier in kernel/time/time.c, but the prototype in
include/linux/time32.h looks different.

- The function is defined in kernel/time/time.c as below:
  struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec)

- The function is decalared in include/linux/time32.h as below:
  extern struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec);

Because the variable of arithmethic types isn't modified in the calling scope,
there's no need to mark arguments as const, which was already mentioned during
review (Link[1) of the original patch.

Likewise remove the "const" keyword in both definition and declaration of
ns_to_timespec64() as requested by Arnd (Link[2]).

Fixes: a84d1169164b ("y2038: Introduce struct __kernel_old_timeval")
Signed-off-by: Youngmin Nam <youngmin.nam@samsung.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/all/20220712094715.2918823-1-youngmin.nam@samsung.com
Link[1]: https://lore.kernel.org/all/20180310081123.thin6wphgk7tongy@gmail.com/
Link[2]: https://lore.kernel.org/all/CAK8P3a3nknJgEDESGdJH91jMj6R_xydFqWASd8r5BbesdvMBgA@mail.gmail.com/
---
 include/linux/time64.h | 2 +-
 kernel/time/time.c     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/time64.h b/include/linux/time64.h
index 2fb8232cff1d..f1bcea8c124a 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -145,7 +145,7 @@ static inline s64 timespec64_to_ns(const struct timespec64 *ts)
  *
  * Returns the timespec64 representation of the nsec parameter.
  */
-extern struct timespec64 ns_to_timespec64(const s64 nsec);
+extern struct timespec64 ns_to_timespec64(s64 nsec);
 
 /**
  * timespec64_add_ns - Adds nanoseconds to a timespec64
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 29923b20e0e4..526257b3727c 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -449,7 +449,7 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0,
 }
 EXPORT_SYMBOL(mktime64);
 
-struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec)
+struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec)
 {
 	struct timespec64 ts = ns_to_timespec64(nsec);
 	struct __kernel_old_timeval tv;
@@ -503,7 +503,7 @@ EXPORT_SYMBOL(set_normalized_timespec64);
  *
  * Returns the timespec64 representation of the nsec parameter.
  */
-struct timespec64 ns_to_timespec64(const s64 nsec)
+struct timespec64 ns_to_timespec64(s64 nsec)
 {
 	struct timespec64 ts = { 0, 0 };
 	s32 rem;
-- 
cgit 


From af887e437bb298752b2edc5834048b8151b8aea0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 9 Aug 2022 12:50:28 -0400
Subject: NFS: Improve write error tracing

Don't leak request pointers, but use the "device:inode" labelling that
is used by all the other trace points. Furthermore, replace use of page
indexes with an offset, again in order to align behaviour with other
NFS trace points.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfstrace.h        | 36 +++++++++++++++++++++---------------
 fs/nfs/write.c           |  8 +++++---
 include/linux/nfs_page.h |  3 +--
 3 files changed, 27 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 8bd0c13a7c4b..731eecfdf49a 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -1447,44 +1447,50 @@ TRACE_EVENT(nfs_writeback_done,
 
 DECLARE_EVENT_CLASS(nfs_page_error_class,
 		TP_PROTO(
+			const struct inode *inode,
 			const struct nfs_page *req,
 			int error
 		),
 
-		TP_ARGS(req, error),
+		TP_ARGS(inode, req, error),
 
 		TP_STRUCT__entry(
-			__field(const void *, req)
-			__field(pgoff_t, index)
-			__field(unsigned int, offset)
-			__field(unsigned int, pgbase)
-			__field(unsigned int, bytes)
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+			__field(loff_t, offset)
+			__field(unsigned int, count)
 			__field(int, error)
 		),
 
 		TP_fast_assign(
-			__entry->req = req;
-			__entry->index = req->wb_index;
-			__entry->offset = req->wb_offset;
-			__entry->pgbase = req->wb_pgbase;
-			__entry->bytes = req->wb_bytes;
+			const struct nfs_inode *nfsi = NFS_I(inode);
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = nfsi->fileid;
+			__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+			__entry->offset = req_offset(req);
+			__entry->count = req->wb_bytes;
 			__entry->error = error;
 		),
 
 		TP_printk(
-			"req=%p index=%lu offset=%u pgbase=%u bytes=%u error=%d",
-			__entry->req, __entry->index, __entry->offset,
-			__entry->pgbase, __entry->bytes, __entry->error
+			"error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"offset=%lld count=%u", __entry->error,
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle, __entry->offset,
+			__entry->count
 		)
 );
 
 #define DEFINE_NFS_PAGEERR_EVENT(name) \
 	DEFINE_EVENT(nfs_page_error_class, name, \
 			TP_PROTO( \
+				const struct inode *inode, \
 				const struct nfs_page *req, \
 				int error \
 			), \
-			TP_ARGS(req, error))
+			TP_ARGS(inode, req, error))
 
 DEFINE_NFS_PAGEERR_EVENT(nfs_write_error);
 DEFINE_NFS_PAGEERR_EVENT(nfs_comp_error);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 4adf2b488da1..4a3796811b4b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -592,7 +592,8 @@ nfs_lock_and_join_requests(struct page *page)
 
 static void nfs_write_error(struct nfs_page *req, int error)
 {
-	trace_nfs_write_error(req, error);
+	trace_nfs_write_error(page_file_mapping(req->wb_page)->host, req,
+			      error);
 	nfs_mapping_set_error(req->wb_page, error);
 	nfs_inode_remove_request(req);
 	nfs_end_page_writeback(req);
@@ -1000,7 +1001,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
 		nfs_list_remove_request(req);
 		if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) &&
 		    (hdr->good_bytes < bytes)) {
-			trace_nfs_comp_error(req, hdr->error);
+			trace_nfs_comp_error(hdr->inode, req, hdr->error);
 			nfs_mapping_set_error(req->wb_page, hdr->error);
 			goto remove_req;
 		}
@@ -1882,7 +1883,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
 			(long long)req_offset(req));
 		if (status < 0) {
 			if (req->wb_page) {
-				trace_nfs_commit_error(req, status);
+				trace_nfs_commit_error(data->inode, req,
+						       status);
 				nfs_mapping_set_error(req->wb_page, status);
 				nfs_inode_remove_request(req);
 			}
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index f0373a6cb5fb..ba7e2e4b0926 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -202,8 +202,7 @@ nfs_list_entry(struct list_head *head)
 	return list_entry(head, struct nfs_page, wb_list);
 }
 
-static inline
-loff_t req_offset(struct nfs_page *req)
+static inline loff_t req_offset(const struct nfs_page *req)
 {
 	return (((loff_t)req->wb_index) << PAGE_SHIFT) + req->wb_offset;
 }
-- 
cgit 


From 341b6941608762d8235f3fd1e45e4d7114ed8c2c Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 8 Aug 2022 19:30:06 +0200
Subject: netfilter: nf_tables: upfront validation of data via nft_data_init()

Instead of parsing the data and then validate that type and length are
correct, pass a description of the expected data so it can be validated
upfront before parsing it to bail out earlier.

This patch adds a new .size field to specify the maximum size of the
data area. The .len field is optional and it is used as an input/output
field, it provides the specific length of the expected data in the input
path. If then .len field is not specified, then obtained length from the
netlink attribute is stored. This is required by cmp, bitwise, range and
immediate, which provide no netlink attribute that describes the data
length. The immediate expression uses the destination register type to
infer the expected data type.

Relying on opencoded validation of the expected data might lead to
subtle bugs as described in 7e6bc1f6cabc ("netfilter: nf_tables:
stricter validation of element data").

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  4 +-
 net/netfilter/nf_tables_api.c     | 78 ++++++++++++++++++++-------------------
 net/netfilter/nft_bitwise.c       | 66 ++++++++++++++++-----------------
 net/netfilter/nft_cmp.c           | 44 ++++++++++------------
 net/netfilter/nft_immediate.c     | 22 +++++++++--
 net/netfilter/nft_range.c         | 27 ++++++--------
 6 files changed, 126 insertions(+), 115 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 7ece4fd0cf66..1554f1e7215b 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -223,11 +223,11 @@ struct nft_ctx {
 
 struct nft_data_desc {
 	enum nft_data_types		type;
+	unsigned int			size;
 	unsigned int			len;
 };
 
-int nft_data_init(const struct nft_ctx *ctx,
-		  struct nft_data *data, unsigned int size,
+int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
 		  struct nft_data_desc *desc, const struct nlattr *nla);
 void nft_data_hold(const struct nft_data *data, enum nft_data_types type);
 void nft_data_release(const struct nft_data *data, enum nft_data_types type);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 0c3c0523e5f2..05896765c68f 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5202,19 +5202,13 @@ static int nft_setelem_parse_flags(const struct nft_set *set,
 static int nft_setelem_parse_key(struct nft_ctx *ctx, struct nft_set *set,
 				 struct nft_data *key, struct nlattr *attr)
 {
-	struct nft_data_desc desc;
-	int err;
-
-	err = nft_data_init(ctx, key, NFT_DATA_VALUE_MAXLEN, &desc, attr);
-	if (err < 0)
-		return err;
-
-	if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) {
-		nft_data_release(key, desc.type);
-		return -EINVAL;
-	}
+	struct nft_data_desc desc = {
+		.type	= NFT_DATA_VALUE,
+		.size	= NFT_DATA_VALUE_MAXLEN,
+		.len	= set->klen,
+	};
 
-	return 0;
+	return nft_data_init(ctx, key, &desc, attr);
 }
 
 static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set,
@@ -5223,24 +5217,17 @@ static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set,
 				  struct nlattr *attr)
 {
 	u32 dtype;
-	int err;
-
-	err = nft_data_init(ctx, data, NFT_DATA_VALUE_MAXLEN, desc, attr);
-	if (err < 0)
-		return err;
 
 	if (set->dtype == NFT_DATA_VERDICT)
 		dtype = NFT_DATA_VERDICT;
 	else
 		dtype = NFT_DATA_VALUE;
 
-	if (dtype != desc->type ||
-	    set->dlen != desc->len) {
-		nft_data_release(data, desc->type);
-		return -EINVAL;
-	}
+	desc->type = dtype;
+	desc->size = NFT_DATA_VALUE_MAXLEN;
+	desc->len = set->dlen;
 
-	return 0;
+	return nft_data_init(ctx, data, desc, attr);
 }
 
 static void *nft_setelem_catchall_get(const struct net *net,
@@ -9685,7 +9672,7 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
 	}
 
 	desc->len = sizeof(data->verdict);
-	desc->type = NFT_DATA_VERDICT;
+
 	return 0;
 }
 
@@ -9738,20 +9725,25 @@ nla_put_failure:
 }
 
 static int nft_value_init(const struct nft_ctx *ctx,
-			  struct nft_data *data, unsigned int size,
-			  struct nft_data_desc *desc, const struct nlattr *nla)
+			  struct nft_data *data, struct nft_data_desc *desc,
+			  const struct nlattr *nla)
 {
 	unsigned int len;
 
 	len = nla_len(nla);
 	if (len == 0)
 		return -EINVAL;
-	if (len > size)
+	if (len > desc->size)
 		return -EOVERFLOW;
+	if (desc->len) {
+		if (len != desc->len)
+			return -EINVAL;
+	} else {
+		desc->len = len;
+	}
 
 	nla_memcpy(data->data, nla, len);
-	desc->type = NFT_DATA_VALUE;
-	desc->len  = len;
+
 	return 0;
 }
 
@@ -9771,7 +9763,6 @@ static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = {
  *
  *	@ctx: context of the expression using the data
  *	@data: destination struct nft_data
- *	@size: maximum data length
  *	@desc: data description
  *	@nla: netlink attribute containing data
  *
@@ -9781,24 +9772,35 @@ static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = {
  *	The caller can indicate that it only wants to accept data of type
  *	NFT_DATA_VALUE by passing NULL for the ctx argument.
  */
-int nft_data_init(const struct nft_ctx *ctx,
-		  struct nft_data *data, unsigned int size,
+int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
 		  struct nft_data_desc *desc, const struct nlattr *nla)
 {
 	struct nlattr *tb[NFTA_DATA_MAX + 1];
 	int err;
 
+	if (WARN_ON_ONCE(!desc->size))
+		return -EINVAL;
+
 	err = nla_parse_nested_deprecated(tb, NFTA_DATA_MAX, nla,
 					  nft_data_policy, NULL);
 	if (err < 0)
 		return err;
 
-	if (tb[NFTA_DATA_VALUE])
-		return nft_value_init(ctx, data, size, desc,
-				      tb[NFTA_DATA_VALUE]);
-	if (tb[NFTA_DATA_VERDICT] && ctx != NULL)
-		return nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]);
-	return -EINVAL;
+	if (tb[NFTA_DATA_VALUE]) {
+		if (desc->type != NFT_DATA_VALUE)
+			return -EINVAL;
+
+		err = nft_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]);
+	} else if (tb[NFTA_DATA_VERDICT] && ctx != NULL) {
+		if (desc->type != NFT_DATA_VERDICT)
+			return -EINVAL;
+
+		err = nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]);
+	} else {
+		err = -EINVAL;
+	}
+
+	return err;
 }
 EXPORT_SYMBOL_GPL(nft_data_init);
 
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index 83590afe3768..e6e402b247d0 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -93,7 +93,16 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = {
 static int nft_bitwise_init_bool(struct nft_bitwise *priv,
 				 const struct nlattr *const tb[])
 {
-	struct nft_data_desc mask, xor;
+	struct nft_data_desc mask = {
+		.type	= NFT_DATA_VALUE,
+		.size	= sizeof(priv->mask),
+		.len	= priv->len,
+	};
+	struct nft_data_desc xor = {
+		.type	= NFT_DATA_VALUE,
+		.size	= sizeof(priv->xor),
+		.len	= priv->len,
+	};
 	int err;
 
 	if (tb[NFTA_BITWISE_DATA])
@@ -103,37 +112,30 @@ static int nft_bitwise_init_bool(struct nft_bitwise *priv,
 	    !tb[NFTA_BITWISE_XOR])
 		return -EINVAL;
 
-	err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &mask,
-			    tb[NFTA_BITWISE_MASK]);
+	err = nft_data_init(NULL, &priv->mask, &mask, tb[NFTA_BITWISE_MASK]);
 	if (err < 0)
 		return err;
-	if (mask.type != NFT_DATA_VALUE || mask.len != priv->len) {
-		err = -EINVAL;
-		goto err_mask_release;
-	}
 
-	err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &xor,
-			    tb[NFTA_BITWISE_XOR]);
+	err = nft_data_init(NULL, &priv->xor, &xor, tb[NFTA_BITWISE_XOR]);
 	if (err < 0)
-		goto err_mask_release;
-	if (xor.type != NFT_DATA_VALUE || xor.len != priv->len) {
-		err = -EINVAL;
-		goto err_xor_release;
-	}
+		goto err_xor_err;
 
 	return 0;
 
-err_xor_release:
-	nft_data_release(&priv->xor, xor.type);
-err_mask_release:
+err_xor_err:
 	nft_data_release(&priv->mask, mask.type);
+
 	return err;
 }
 
 static int nft_bitwise_init_shift(struct nft_bitwise *priv,
 				  const struct nlattr *const tb[])
 {
-	struct nft_data_desc d;
+	struct nft_data_desc desc = {
+		.type	= NFT_DATA_VALUE,
+		.size	= sizeof(priv->data),
+		.len	= sizeof(u32),
+	};
 	int err;
 
 	if (tb[NFTA_BITWISE_MASK] ||
@@ -143,13 +145,12 @@ static int nft_bitwise_init_shift(struct nft_bitwise *priv,
 	if (!tb[NFTA_BITWISE_DATA])
 		return -EINVAL;
 
-	err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &d,
-			    tb[NFTA_BITWISE_DATA]);
+	err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_BITWISE_DATA]);
 	if (err < 0)
 		return err;
-	if (d.type != NFT_DATA_VALUE || d.len != sizeof(u32) ||
-	    priv->data.data[0] >= BITS_PER_TYPE(u32)) {
-		nft_data_release(&priv->data, d.type);
+
+	if (priv->data.data[0] >= BITS_PER_TYPE(u32)) {
+		nft_data_release(&priv->data, desc.type);
 		return -EINVAL;
 	}
 
@@ -339,22 +340,21 @@ static const struct nft_expr_ops nft_bitwise_ops = {
 static int
 nft_bitwise_extract_u32_data(const struct nlattr * const tb, u32 *out)
 {
-	struct nft_data_desc desc;
 	struct nft_data data;
-	int err = 0;
+	struct nft_data_desc desc = {
+		.type	= NFT_DATA_VALUE,
+		.size	= sizeof(data),
+		.len	= sizeof(u32),
+	};
+	int err;
 
-	err = nft_data_init(NULL, &data, sizeof(data), &desc, tb);
+	err = nft_data_init(NULL, &data, &desc, tb);
 	if (err < 0)
 		return err;
 
-	if (desc.type != NFT_DATA_VALUE || desc.len != sizeof(u32)) {
-		err = -EINVAL;
-		goto err;
-	}
 	*out = data.data[0];
-err:
-	nft_data_release(&data, desc.type);
-	return err;
+
+	return 0;
 }
 
 static int nft_bitwise_fast_init(const struct nft_ctx *ctx,
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 777f09e4dc60..963cf831799c 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -73,20 +73,16 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 			const struct nlattr * const tb[])
 {
 	struct nft_cmp_expr *priv = nft_expr_priv(expr);
-	struct nft_data_desc desc;
+	struct nft_data_desc desc = {
+		.type	= NFT_DATA_VALUE,
+		.size	= sizeof(priv->data),
+	};
 	int err;
 
-	err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &desc,
-			    tb[NFTA_CMP_DATA]);
+	err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]);
 	if (err < 0)
 		return err;
 
-	if (desc.type != NFT_DATA_VALUE) {
-		err = -EINVAL;
-		nft_data_release(&priv->data, desc.type);
-		return err;
-	}
-
 	err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len);
 	if (err < 0)
 		return err;
@@ -214,12 +210,14 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx,
 			     const struct nlattr * const tb[])
 {
 	struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
-	struct nft_data_desc desc;
 	struct nft_data data;
+	struct nft_data_desc desc = {
+		.type	= NFT_DATA_VALUE,
+		.size	= sizeof(data),
+	};
 	int err;
 
-	err = nft_data_init(NULL, &data, sizeof(data), &desc,
-			    tb[NFTA_CMP_DATA]);
+	err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]);
 	if (err < 0)
 		return err;
 
@@ -313,11 +311,13 @@ static int nft_cmp16_fast_init(const struct nft_ctx *ctx,
 			       const struct nlattr * const tb[])
 {
 	struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
-	struct nft_data_desc desc;
+	struct nft_data_desc desc = {
+		.type	= NFT_DATA_VALUE,
+		.size	= sizeof(priv->data),
+	};
 	int err;
 
-	err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &desc,
-			    tb[NFTA_CMP_DATA]);
+	err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]);
 	if (err < 0)
 		return err;
 
@@ -380,8 +380,11 @@ const struct nft_expr_ops nft_cmp16_fast_ops = {
 static const struct nft_expr_ops *
 nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
 {
-	struct nft_data_desc desc;
 	struct nft_data data;
+	struct nft_data_desc desc = {
+		.type	= NFT_DATA_VALUE,
+		.size	= sizeof(data),
+	};
 	enum nft_cmp_ops op;
 	u8 sreg;
 	int err;
@@ -404,14 +407,10 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
 		return ERR_PTR(-EINVAL);
 	}
 
-	err = nft_data_init(NULL, &data, sizeof(data), &desc,
-			    tb[NFTA_CMP_DATA]);
+	err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]);
 	if (err < 0)
 		return ERR_PTR(err);
 
-	if (desc.type != NFT_DATA_VALUE)
-		goto err1;
-
 	sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG]));
 
 	if (op == NFT_CMP_EQ || op == NFT_CMP_NEQ) {
@@ -423,9 +422,6 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
 			return &nft_cmp16_fast_ops;
 	}
 	return &nft_cmp_ops;
-err1:
-	nft_data_release(&data, desc.type);
-	return ERR_PTR(-EINVAL);
 }
 
 struct nft_expr_type nft_cmp_type __read_mostly = {
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index b80f7b507349..5f28b21abc7d 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -29,20 +29,36 @@ static const struct nla_policy nft_immediate_policy[NFTA_IMMEDIATE_MAX + 1] = {
 	[NFTA_IMMEDIATE_DATA]	= { .type = NLA_NESTED },
 };
 
+static enum nft_data_types nft_reg_to_type(const struct nlattr *nla)
+{
+	enum nft_data_types type;
+	u8 reg;
+
+	reg = ntohl(nla_get_be32(nla));
+	if (reg == NFT_REG_VERDICT)
+		type = NFT_DATA_VERDICT;
+	else
+		type = NFT_DATA_VALUE;
+
+	return type;
+}
+
 static int nft_immediate_init(const struct nft_ctx *ctx,
 			      const struct nft_expr *expr,
 			      const struct nlattr * const tb[])
 {
 	struct nft_immediate_expr *priv = nft_expr_priv(expr);
-	struct nft_data_desc desc;
+	struct nft_data_desc desc = {
+		.size	= sizeof(priv->data),
+	};
 	int err;
 
 	if (tb[NFTA_IMMEDIATE_DREG] == NULL ||
 	    tb[NFTA_IMMEDIATE_DATA] == NULL)
 		return -EINVAL;
 
-	err = nft_data_init(ctx, &priv->data, sizeof(priv->data), &desc,
-			    tb[NFTA_IMMEDIATE_DATA]);
+	desc.type = nft_reg_to_type(tb[NFTA_IMMEDIATE_DREG]);
+	err = nft_data_init(ctx, &priv->data, &desc, tb[NFTA_IMMEDIATE_DATA]);
 	if (err < 0)
 		return err;
 
diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c
index 66f77484c227..832f0d725a9e 100644
--- a/net/netfilter/nft_range.c
+++ b/net/netfilter/nft_range.c
@@ -51,7 +51,14 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr
 			const struct nlattr * const tb[])
 {
 	struct nft_range_expr *priv = nft_expr_priv(expr);
-	struct nft_data_desc desc_from, desc_to;
+	struct nft_data_desc desc_from = {
+		.type	= NFT_DATA_VALUE,
+		.size	= sizeof(priv->data_from),
+	};
+	struct nft_data_desc desc_to = {
+		.type	= NFT_DATA_VALUE,
+		.size	= sizeof(priv->data_to),
+	};
 	int err;
 	u32 op;
 
@@ -61,26 +68,16 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr
 	    !tb[NFTA_RANGE_TO_DATA])
 		return -EINVAL;
 
-	err = nft_data_init(NULL, &priv->data_from, sizeof(priv->data_from),
-			    &desc_from, tb[NFTA_RANGE_FROM_DATA]);
+	err = nft_data_init(NULL, &priv->data_from, &desc_from,
+			    tb[NFTA_RANGE_FROM_DATA]);
 	if (err < 0)
 		return err;
 
-	if (desc_from.type != NFT_DATA_VALUE) {
-		err = -EINVAL;
-		goto err1;
-	}
-
-	err = nft_data_init(NULL, &priv->data_to, sizeof(priv->data_to),
-			    &desc_to, tb[NFTA_RANGE_TO_DATA]);
+	err = nft_data_init(NULL, &priv->data_to, &desc_to,
+			    tb[NFTA_RANGE_TO_DATA]);
 	if (err < 0)
 		goto err1;
 
-	if (desc_to.type != NFT_DATA_VALUE) {
-		err = -EINVAL;
-		goto err2;
-	}
-
 	if (desc_from.len != desc_to.len) {
 		err = -EINVAL;
 		goto err2;
-- 
cgit 


From f323ef3a0d49e147365284bc1f02212e617b7f09 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 8 Aug 2022 19:30:07 +0200
Subject: netfilter: nf_tables: disallow jump to implicit chain from set
 element

Extend struct nft_data_desc to add a flag field that specifies
nft_data_init() is being called for set element data.

Use it to disallow jump to implicit chain from set element, only jump
to chain via immediate expression is allowed.

Fixes: d0e2c7de92c7 ("netfilter: nf_tables: add NFT_CHAIN_BINDING")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 5 +++++
 net/netfilter/nf_tables_api.c     | 4 ++++
 2 files changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 1554f1e7215b..99aae36c04b9 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -221,10 +221,15 @@ struct nft_ctx {
 	bool				report;
 };
 
+enum nft_data_desc_flags {
+	NFT_DATA_DESC_SETELEM	= (1 << 0),
+};
+
 struct nft_data_desc {
 	enum nft_data_types		type;
 	unsigned int			size;
 	unsigned int			len;
+	unsigned int			flags;
 };
 
 int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 05896765c68f..460b0925ea60 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5226,6 +5226,7 @@ static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set,
 	desc->type = dtype;
 	desc->size = NFT_DATA_VALUE_MAXLEN;
 	desc->len = set->dlen;
+	desc->flags = NFT_DATA_DESC_SETELEM;
 
 	return nft_data_init(ctx, data, desc, attr);
 }
@@ -9665,6 +9666,9 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
 			return PTR_ERR(chain);
 		if (nft_is_base_chain(chain))
 			return -EOPNOTSUPP;
+		if (desc->flags & NFT_DATA_DESC_SETELEM &&
+		    chain->flags & NFT_CHAIN_BINDING)
+			return -EINVAL;
 
 		chain->use++;
 		data->verdict.chain = chain;
-- 
cgit 


From d4252071b97d2027d246f6a82cbee4d52f618b47 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 9 Aug 2022 14:32:13 -0400
Subject: add barriers to buffer_uptodate and set_buffer_uptodate

Let's have a look at this piece of code in __bread_slow:

	get_bh(bh);
	bh->b_end_io = end_buffer_read_sync;
	submit_bh(REQ_OP_READ, 0, bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return bh;

Neither wait_on_buffer nor buffer_uptodate contain any memory barrier.
Consequently, if someone calls sb_bread and then reads the buffer data,
the read of buffer data may be executed before wait_on_buffer(bh) on
architectures with weak memory ordering and it may return invalid data.

Fix this bug by adding a memory barrier to set_buffer_uptodate and an
acquire barrier to buffer_uptodate (in a similar way as
folio_test_uptodate and folio_mark_uptodate).

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/buffer_head.h | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 307445d1c69e..def8b8d30ccc 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -118,7 +118,6 @@ static __always_inline int test_clear_buffer_##name(struct buffer_head *bh) \
  * of the form "mark_buffer_foo()".  These are higher-level functions which
  * do something in addition to setting a b_state bit.
  */
-BUFFER_FNS(Uptodate, uptodate)
 BUFFER_FNS(Dirty, dirty)
 TAS_BUFFER_FNS(Dirty, dirty)
 BUFFER_FNS(Lock, locked)
@@ -136,6 +135,30 @@ BUFFER_FNS(Meta, meta)
 BUFFER_FNS(Prio, prio)
 BUFFER_FNS(Defer_Completion, defer_completion)
 
+static __always_inline void set_buffer_uptodate(struct buffer_head *bh)
+{
+	/*
+	 * make it consistent with folio_mark_uptodate
+	 * pairs with smp_load_acquire in buffer_uptodate
+	 */
+	smp_mb__before_atomic();
+	set_bit(BH_Uptodate, &bh->b_state);
+}
+
+static __always_inline void clear_buffer_uptodate(struct buffer_head *bh)
+{
+	clear_bit(BH_Uptodate, &bh->b_state);
+}
+
+static __always_inline int buffer_uptodate(const struct buffer_head *bh)
+{
+	/*
+	 * make it consistent with folio_test_uptodate
+	 * pairs with smp_mb__before_atomic in set_buffer_uptodate
+	 */
+	return (smp_load_acquire(&bh->b_state) & (1UL << BH_Uptodate)) != 0;
+}
+
 #define bh_offset(bh)		((unsigned long)(bh)->b_data & ~PAGE_MASK)
 
 /* If we *know* page->private refers to buffer_heads */
-- 
cgit 


From 84b709d31063bacaabaaad1c5d85143150edd695 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 6 Aug 2022 18:02:36 +0200
Subject: ax88796: Fix some typo in a comment

s/by caused/be caused/
s/ax88786/ax88796/

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Link: https://lore.kernel.org/r/7db4b622d2c3e5af58c1d1f32b81836f4af71f18.1659801746.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ax88796.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/ax88796.h b/include/net/ax88796.h
index b658471f97f0..303100f08ab8 100644
--- a/include/net/ax88796.h
+++ b/include/net/ax88796.h
@@ -34,8 +34,8 @@ struct ax_plat_data {
 			const unsigned char *buf, int star_page);
 	void (*block_input)(struct net_device *dev, int count,
 			struct sk_buff *skb, int ring_offset);
-	/* returns nonzero if a pending interrupt request might by caused by
-	 * the ax88786. Handles all interrupts if set to NULL
+	/* returns nonzero if a pending interrupt request might be caused by
+	 * the ax88796. Handles all interrupts if set to NULL
 	 */
 	int (*check_irq)(struct platform_device *pdev);
 };
-- 
cgit 


From f329a0ebeaba4ffe91d431e0ac1ca7f9165872a4 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 9 Aug 2022 16:27:40 -0700
Subject: genetlink: correct uAPI defines

Commit 50a896cf2d6f ("genetlink: properly support per-op policy dumping")
seems to have copy'n'pasted things a little incorrectly.

The #define CTRL_ATTR_MCAST_GRP_MAX should have stayed right
after the previous enum. The new CTRL_ATTR_POLICY_* needs
its own define for MAX and that max should not contain the
superfluous _DUMP in the name.

We probably can't do anything about the CTRL_ATTR_POLICY_DUMP_MAX
any more, there's likely code which uses it. For consistency
(*cough* codegen *cough*) let's add the correctly name define
nonetheless.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/genetlink.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h
index d83f214b4134..ddba3ca01e39 100644
--- a/include/uapi/linux/genetlink.h
+++ b/include/uapi/linux/genetlink.h
@@ -87,6 +87,8 @@ enum {
 	__CTRL_ATTR_MCAST_GRP_MAX,
 };
 
+#define CTRL_ATTR_MCAST_GRP_MAX (__CTRL_ATTR_MCAST_GRP_MAX - 1)
+
 enum {
 	CTRL_ATTR_POLICY_UNSPEC,
 	CTRL_ATTR_POLICY_DO,
@@ -96,7 +98,6 @@ enum {
 	CTRL_ATTR_POLICY_DUMP_MAX = __CTRL_ATTR_POLICY_DUMP_MAX - 1
 };
 
-#define CTRL_ATTR_MCAST_GRP_MAX (__CTRL_ATTR_MCAST_GRP_MAX - 1)
-
+#define CTRL_ATTR_POLICY_MAX (__CTRL_ATTR_POLICY_DUMP_MAX - 1)
 
 #endif /* _UAPI__LINUX_GENERIC_NETLINK_H */
-- 
cgit 


From 1685c0f32554a7f35962061d17155c58454f1cd2 Mon Sep 17 00:00:00 2001
From: Mingwei Zhang <mizhang@google.com>
Date: Sun, 7 Aug 2022 05:21:41 +0000
Subject: KVM: x86/mmu: rename trace function name for asynchronous page fault

Rename the tracepoint function from trace_kvm_async_pf_doublefault() to
trace_kvm_async_pf_repeated_fault() to make it clear, since double fault
has nothing to do with this trace function.

Asynchronous Page Fault (APF) is an artifact generated by KVM when it
cannot find a physical page to satisfy an EPT violation. KVM uses APF to
tell the guest OS to do something else such as scheduling other guest
processes to make forward progress. However, when another guest process
also touches a previously APFed page, KVM halts the vCPU instead of
generating a repeated APF to avoid wasting cycles.

Double fault (#DF) clearly has a different meaning and a different
consequence when triggered. #DF requires two nested contributory exceptions
instead of two page faults faulting at the same address. A prevous bug on
APF indicates that it may trigger a double fault in the guest [1] and
clearly this trace function has nothing to do with it. So rename this
function should be a valid choice.

No functional change intended.

[1] https://www.spinics.net/lists/kvm/msg214957.html

Signed-off-by: Mingwei Zhang <mizhang@google.com>
Message-Id: <20220807052141.69186-1-mizhang@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c     | 2 +-
 include/trace/events/kvm.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 918ff23c0bc3..eccddb136954 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4164,7 +4164,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 	if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
 		trace_kvm_try_async_get_page(fault->addr, fault->gfn);
 		if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) {
-			trace_kvm_async_pf_doublefault(fault->addr, fault->gfn);
+			trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn);
 			kvm_make_request(KVM_REQ_APF_HALT, vcpu);
 			return RET_PF_RETRY;
 		} else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) {
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 37e1e1a2d67d..3bd31ea23fee 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -282,7 +282,7 @@ DEFINE_EVENT(kvm_async_get_page_class, kvm_try_async_get_page,
 	TP_ARGS(gva, gfn)
 );
 
-DEFINE_EVENT(kvm_async_get_page_class, kvm_async_pf_doublefault,
+DEFINE_EVENT(kvm_async_get_page_class, kvm_async_pf_repeated_fault,
 
 	TP_PROTO(u64 gva, u64 gfn),
 
-- 
cgit 


From 2a0133723f9ebeb751cfce19f74ec07e108bef1f Mon Sep 17 00:00:00 2001
From: Hawkins Jiawei <yin31149@gmail.com>
Date: Fri, 5 Aug 2022 15:48:34 +0800
Subject: net: fix refcount bug in sk_psock_get (2)

Syzkaller reports refcount bug as follows:
------------[ cut here ]------------
refcount_t: saturated; leaking memory.
WARNING: CPU: 1 PID: 3605 at lib/refcount.c:19 refcount_warn_saturate+0xf4/0x1e0 lib/refcount.c:19
Modules linked in:
CPU: 1 PID: 3605 Comm: syz-executor208 Not tainted 5.18.0-syzkaller-03023-g7e062cda7d90 #0
 <TASK>
 __refcount_add_not_zero include/linux/refcount.h:163 [inline]
 __refcount_inc_not_zero include/linux/refcount.h:227 [inline]
 refcount_inc_not_zero include/linux/refcount.h:245 [inline]
 sk_psock_get+0x3bc/0x410 include/linux/skmsg.h:439
 tls_data_ready+0x6d/0x1b0 net/tls/tls_sw.c:2091
 tcp_data_ready+0x106/0x520 net/ipv4/tcp_input.c:4983
 tcp_data_queue+0x25f2/0x4c90 net/ipv4/tcp_input.c:5057
 tcp_rcv_state_process+0x1774/0x4e80 net/ipv4/tcp_input.c:6659
 tcp_v4_do_rcv+0x339/0x980 net/ipv4/tcp_ipv4.c:1682
 sk_backlog_rcv include/net/sock.h:1061 [inline]
 __release_sock+0x134/0x3b0 net/core/sock.c:2849
 release_sock+0x54/0x1b0 net/core/sock.c:3404
 inet_shutdown+0x1e0/0x430 net/ipv4/af_inet.c:909
 __sys_shutdown_sock net/socket.c:2331 [inline]
 __sys_shutdown_sock net/socket.c:2325 [inline]
 __sys_shutdown+0xf1/0x1b0 net/socket.c:2343
 __do_sys_shutdown net/socket.c:2351 [inline]
 __se_sys_shutdown net/socket.c:2349 [inline]
 __x64_sys_shutdown+0x50/0x70 net/socket.c:2349
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x46/0xb0
 </TASK>

During SMC fallback process in connect syscall, kernel will
replaces TCP with SMC. In order to forward wakeup
smc socket waitqueue after fallback, kernel will sets
clcsk->sk_user_data to origin smc socket in
smc_fback_replace_callbacks().

Later, in shutdown syscall, kernel will calls
sk_psock_get(), which treats the clcsk->sk_user_data
as psock type, triggering the refcnt warning.

So, the root cause is that smc and psock, both will use
sk_user_data field. So they will mismatch this field
easily.

This patch solves it by using another bit(defined as
SK_USER_DATA_PSOCK) in PTRMASK, to mark whether
sk_user_data points to a psock object or not.
This patch depends on a PTRMASK introduced in commit f1ff5ce2cd5e
("net, sk_msg: Clear sk_user_data pointer on clone if tagged").

For there will possibly be more flags in the sk_user_data field,
this patch also refactor sk_user_data flags code to be more generic
to improve its maintainability.

Reported-and-tested-by: syzbot+5f26f85569bd179c18ce@syzkaller.appspotmail.com
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: Hawkins Jiawei <yin31149@gmail.com>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skmsg.h |  3 ++-
 include/net/sock.h    | 68 ++++++++++++++++++++++++++++++++++++---------------
 net/core/skmsg.c      |  4 ++-
 3 files changed, 53 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 153b6dec9b6a..48f4b645193b 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -278,7 +278,8 @@ static inline void sk_msg_sg_copy_clear(struct sk_msg *msg, u32 start)
 
 static inline struct sk_psock *sk_psock(const struct sock *sk)
 {
-	return rcu_dereference_sk_user_data(sk);
+	return __rcu_dereference_sk_user_data_with_flags(sk,
+							 SK_USER_DATA_PSOCK);
 }
 
 static inline void sk_psock_set_state(struct sk_psock *psock,
diff --git a/include/net/sock.h b/include/net/sock.h
index a7273b289188..05a1bbdf5805 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -545,14 +545,26 @@ enum sk_pacing {
 	SK_PACING_FQ		= 2,
 };
 
-/* Pointer stored in sk_user_data might not be suitable for copying
- * when cloning the socket. For instance, it can point to a reference
- * counted object. sk_user_data bottom bit is set if pointer must not
- * be copied.
+/* flag bits in sk_user_data
+ *
+ * - SK_USER_DATA_NOCOPY:      Pointer stored in sk_user_data might
+ *   not be suitable for copying when cloning the socket. For instance,
+ *   it can point to a reference counted object. sk_user_data bottom
+ *   bit is set if pointer must not be copied.
+ *
+ * - SK_USER_DATA_BPF:         Mark whether sk_user_data field is
+ *   managed/owned by a BPF reuseport array. This bit should be set
+ *   when sk_user_data's sk is added to the bpf's reuseport_array.
+ *
+ * - SK_USER_DATA_PSOCK:       Mark whether pointer stored in
+ *   sk_user_data points to psock type. This bit should be set
+ *   when sk_user_data is assigned to a psock object.
  */
 #define SK_USER_DATA_NOCOPY	1UL
-#define SK_USER_DATA_BPF	2UL	/* Managed by BPF */
-#define SK_USER_DATA_PTRMASK	~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF)
+#define SK_USER_DATA_BPF	2UL
+#define SK_USER_DATA_PSOCK	4UL
+#define SK_USER_DATA_PTRMASK	~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\
+				  SK_USER_DATA_PSOCK)
 
 /**
  * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied
@@ -565,24 +577,40 @@ static inline bool sk_user_data_is_nocopy(const struct sock *sk)
 
 #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))
 
+/**
+ * __rcu_dereference_sk_user_data_with_flags - return the pointer
+ * only if argument flags all has been set in sk_user_data. Otherwise
+ * return NULL
+ *
+ * @sk: socket
+ * @flags: flag bits
+ */
+static inline void *
+__rcu_dereference_sk_user_data_with_flags(const struct sock *sk,
+					  uintptr_t flags)
+{
+	uintptr_t sk_user_data = (uintptr_t)rcu_dereference(__sk_user_data(sk));
+
+	WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK);
+
+	if ((sk_user_data & flags) == flags)
+		return (void *)(sk_user_data & SK_USER_DATA_PTRMASK);
+	return NULL;
+}
+
 #define rcu_dereference_sk_user_data(sk)				\
+	__rcu_dereference_sk_user_data_with_flags(sk, 0)
+#define __rcu_assign_sk_user_data_with_flags(sk, ptr, flags)		\
 ({									\
-	void *__tmp = rcu_dereference(__sk_user_data((sk)));		\
-	(void *)((uintptr_t)__tmp & SK_USER_DATA_PTRMASK);		\
-})
-#define rcu_assign_sk_user_data(sk, ptr)				\
-({									\
-	uintptr_t __tmp = (uintptr_t)(ptr);				\
-	WARN_ON_ONCE(__tmp & ~SK_USER_DATA_PTRMASK);			\
-	rcu_assign_pointer(__sk_user_data((sk)), __tmp);		\
-})
-#define rcu_assign_sk_user_data_nocopy(sk, ptr)				\
-({									\
-	uintptr_t __tmp = (uintptr_t)(ptr);				\
-	WARN_ON_ONCE(__tmp & ~SK_USER_DATA_PTRMASK);			\
+	uintptr_t __tmp1 = (uintptr_t)(ptr),				\
+		  __tmp2 = (uintptr_t)(flags);				\
+	WARN_ON_ONCE(__tmp1 & ~SK_USER_DATA_PTRMASK);			\
+	WARN_ON_ONCE(__tmp2 & SK_USER_DATA_PTRMASK);			\
 	rcu_assign_pointer(__sk_user_data((sk)),			\
-			   __tmp | SK_USER_DATA_NOCOPY);		\
+			   __tmp1 | __tmp2);				\
 })
+#define rcu_assign_sk_user_data(sk, ptr)				\
+	__rcu_assign_sk_user_data_with_flags(sk, ptr, 0)
 
 static inline
 struct net *sock_net(const struct sock *sk)
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 81627892bdd4..57e942a6431a 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -739,7 +739,9 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
 	sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
 	refcount_set(&psock->refcnt, 1);
 
-	rcu_assign_sk_user_data_nocopy(sk, psock);
+	__rcu_assign_sk_user_data_with_flags(sk, psock,
+					     SK_USER_DATA_NOCOPY |
+					     SK_USER_DATA_PSOCK);
 	sock_hold(sk);
 
 out:
-- 
cgit 


From 94ce3b64c62d4b628cf85cd0d9a370aca8f7e43a Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Wed, 10 Aug 2022 11:16:02 +0300
Subject: net/tls: Use RCU API to access tls_ctx->netdev

Currently, tls_device_down synchronizes with tls_device_resync_rx using
RCU, however, the pointer to netdev is stored using WRITE_ONCE and
loaded using READ_ONCE.

Although such approach is technically correct (rcu_dereference is
essentially a READ_ONCE, and rcu_assign_pointer uses WRITE_ONCE to store
NULL), using special RCU helpers for pointers is more valid, as it
includes additional checks and might change the implementation
transparently to the callers.

Mark the netdev pointer as __rcu and use the correct RCU helpers to
access it. For non-concurrent access pass the right conditions that
guarantee safe access (locks taken, refcount value). Also use the
correct helper in mlx5e, where even READ_ONCE was missing.

The transition to RCU exposes existing issues, fixed by this commit:

1. bond_tls_device_xmit could read netdev twice, and it could become
NULL the second time, after the NULL check passed.

2. Drivers shouldn't stop processing the last packet if tls_device_down
just set netdev to NULL, before tls_dev_del was called. This prevents a
possible packet drop when transitioning to the fallback software mode.

Fixes: 89df6a810470 ("net/bonding: Implement TLS TX device offload")
Fixes: c55dcdd435aa ("net/tls: Fix use-after-free after the TLS device goes down and up")
Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Link: https://lore.kernel.org/r/20220810081602.1435800-1-maximmi@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/bonding/bond_main.c                    | 10 ++++--
 .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c      |  8 ++++-
 .../ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c |  8 ++++-
 include/net/tls.h                                  |  2 +-
 net/tls/tls_device.c                               | 38 +++++++++++++++++-----
 net/tls/tls_device_fallback.c                      |  3 +-
 6 files changed, 54 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index ab7fdbbc2530..50e60843020c 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -5338,8 +5338,14 @@ static struct net_device *bond_sk_get_lower_dev(struct net_device *dev,
 static netdev_tx_t bond_tls_device_xmit(struct bonding *bond, struct sk_buff *skb,
 					struct net_device *dev)
 {
-	if (likely(bond_get_slave_by_dev(bond, tls_get_ctx(skb->sk)->netdev)))
-		return bond_dev_queue_xmit(bond, skb, tls_get_ctx(skb->sk)->netdev);
+	struct net_device *tls_netdev = rcu_dereference(tls_get_ctx(skb->sk)->netdev);
+
+	/* tls_netdev might become NULL, even if tls_is_sk_tx_device_offloaded
+	 * was true, if tls_device_down is running in parallel, but it's OK,
+	 * because bond_get_slave_by_dev has a NULL check.
+	 */
+	if (likely(bond_get_slave_by_dev(bond, tls_netdev)))
+		return bond_dev_queue_xmit(bond, skb, tls_netdev);
 	return bond_tx_drop(dev, skb);
 }
 #endif
diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c
index bfee0e4e54b1..da9973b711f4 100644
--- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c
+++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c
@@ -1932,6 +1932,7 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev)
 	int data_len, qidx, ret = 0, mss;
 	struct tls_record_info *record;
 	struct chcr_ktls_info *tx_info;
+	struct net_device *tls_netdev;
 	struct tls_context *tls_ctx;
 	struct sge_eth_txq *q;
 	struct adapter *adap;
@@ -1945,7 +1946,12 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev)
 	mss = skb_is_gso(skb) ? skb_shinfo(skb)->gso_size : data_len;
 
 	tls_ctx = tls_get_ctx(skb->sk);
-	if (unlikely(tls_ctx->netdev != dev))
+	tls_netdev = rcu_dereference_bh(tls_ctx->netdev);
+	/* Don't quit on NULL: if tls_device_down is running in parallel,
+	 * netdev might become NULL, even if tls_is_sk_tx_device_offloaded was
+	 * true. Rather continue processing this packet.
+	 */
+	if (unlikely(tls_netdev && tls_netdev != dev))
 		goto out;
 
 	tx_ctx = chcr_get_ktls_tx_context(tls_ctx);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
index 6b6c7044b64a..0aef69527226 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
@@ -808,6 +808,7 @@ bool mlx5e_ktls_handle_tx_skb(struct net_device *netdev, struct mlx5e_txqsq *sq,
 {
 	struct mlx5e_ktls_offload_context_tx *priv_tx;
 	struct mlx5e_sq_stats *stats = sq->stats;
+	struct net_device *tls_netdev;
 	struct tls_context *tls_ctx;
 	int datalen;
 	u32 seq;
@@ -819,7 +820,12 @@ bool mlx5e_ktls_handle_tx_skb(struct net_device *netdev, struct mlx5e_txqsq *sq,
 	mlx5e_tx_mpwqe_ensure_complete(sq);
 
 	tls_ctx = tls_get_ctx(skb->sk);
-	if (WARN_ON_ONCE(tls_ctx->netdev != netdev))
+	tls_netdev = rcu_dereference_bh(tls_ctx->netdev);
+	/* Don't WARN on NULL: if tls_device_down is running in parallel,
+	 * netdev might become NULL, even if tls_is_sk_tx_device_offloaded was
+	 * true. Rather continue processing this packet.
+	 */
+	if (WARN_ON_ONCE(tls_netdev && tls_netdev != netdev))
 		goto err_out;
 
 	priv_tx = mlx5e_get_ktls_tx_priv_ctx(tls_ctx);
diff --git a/include/net/tls.h b/include/net/tls.h
index b75b5727abdb..cb205f9d9473 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -237,7 +237,7 @@ struct tls_context {
 	void *priv_ctx_tx;
 	void *priv_ctx_rx;
 
-	struct net_device *netdev;
+	struct net_device __rcu *netdev;
 
 	/* rw cache line */
 	struct cipher_context tx;
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 6ed41474bdf8..0f983e5f7dde 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -71,7 +71,13 @@ static void tls_device_tx_del_task(struct work_struct *work)
 	struct tls_offload_context_tx *offload_ctx =
 		container_of(work, struct tls_offload_context_tx, destruct_work);
 	struct tls_context *ctx = offload_ctx->ctx;
-	struct net_device *netdev = ctx->netdev;
+	struct net_device *netdev;
+
+	/* Safe, because this is the destroy flow, refcount is 0, so
+	 * tls_device_down can't store this field in parallel.
+	 */
+	netdev = rcu_dereference_protected(ctx->netdev,
+					   !refcount_read(&ctx->refcount));
 
 	netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_TX);
 	dev_put(netdev);
@@ -81,6 +87,7 @@ static void tls_device_tx_del_task(struct work_struct *work)
 
 static void tls_device_queue_ctx_destruction(struct tls_context *ctx)
 {
+	struct net_device *netdev;
 	unsigned long flags;
 	bool async_cleanup;
 
@@ -91,7 +98,14 @@ static void tls_device_queue_ctx_destruction(struct tls_context *ctx)
 	}
 
 	list_del(&ctx->list); /* Remove from tls_device_list / tls_device_down_list */
-	async_cleanup = ctx->netdev && ctx->tx_conf == TLS_HW;
+
+	/* Safe, because this is the destroy flow, refcount is 0, so
+	 * tls_device_down can't store this field in parallel.
+	 */
+	netdev = rcu_dereference_protected(ctx->netdev,
+					   !refcount_read(&ctx->refcount));
+
+	async_cleanup = netdev && ctx->tx_conf == TLS_HW;
 	if (async_cleanup) {
 		struct tls_offload_context_tx *offload_ctx = tls_offload_ctx_tx(ctx);
 
@@ -229,7 +243,8 @@ static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx,
 
 	trace_tls_device_tx_resync_send(sk, seq, rcd_sn);
 	down_read(&device_offload_lock);
-	netdev = tls_ctx->netdev;
+	netdev = rcu_dereference_protected(tls_ctx->netdev,
+					   lockdep_is_held(&device_offload_lock));
 	if (netdev)
 		err = netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq,
 							 rcd_sn,
@@ -710,7 +725,7 @@ static void tls_device_resync_rx(struct tls_context *tls_ctx,
 
 	trace_tls_device_rx_resync_send(sk, seq, rcd_sn, rx_ctx->resync_type);
 	rcu_read_lock();
-	netdev = READ_ONCE(tls_ctx->netdev);
+	netdev = rcu_dereference(tls_ctx->netdev);
 	if (netdev)
 		netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn,
 						   TLS_OFFLOAD_CTX_DIR_RX);
@@ -1035,7 +1050,7 @@ static void tls_device_attach(struct tls_context *ctx, struct sock *sk,
 	if (sk->sk_destruct != tls_device_sk_destruct) {
 		refcount_set(&ctx->refcount, 1);
 		dev_hold(netdev);
-		ctx->netdev = netdev;
+		RCU_INIT_POINTER(ctx->netdev, netdev);
 		spin_lock_irq(&tls_device_lock);
 		list_add_tail(&ctx->list, &tls_device_list);
 		spin_unlock_irq(&tls_device_lock);
@@ -1306,7 +1321,8 @@ void tls_device_offload_cleanup_rx(struct sock *sk)
 	struct net_device *netdev;
 
 	down_read(&device_offload_lock);
-	netdev = tls_ctx->netdev;
+	netdev = rcu_dereference_protected(tls_ctx->netdev,
+					   lockdep_is_held(&device_offload_lock));
 	if (!netdev)
 		goto out;
 
@@ -1315,7 +1331,7 @@ void tls_device_offload_cleanup_rx(struct sock *sk)
 
 	if (tls_ctx->tx_conf != TLS_HW) {
 		dev_put(netdev);
-		tls_ctx->netdev = NULL;
+		rcu_assign_pointer(tls_ctx->netdev, NULL);
 	} else {
 		set_bit(TLS_RX_DEV_CLOSED, &tls_ctx->flags);
 	}
@@ -1335,7 +1351,11 @@ static int tls_device_down(struct net_device *netdev)
 
 	spin_lock_irqsave(&tls_device_lock, flags);
 	list_for_each_entry_safe(ctx, tmp, &tls_device_list, list) {
-		if (ctx->netdev != netdev ||
+		struct net_device *ctx_netdev =
+			rcu_dereference_protected(ctx->netdev,
+						  lockdep_is_held(&device_offload_lock));
+
+		if (ctx_netdev != netdev ||
 		    !refcount_inc_not_zero(&ctx->refcount))
 			continue;
 
@@ -1352,7 +1372,7 @@ static int tls_device_down(struct net_device *netdev)
 		/* Stop the RX and TX resync.
 		 * tls_dev_resync must not be called after tls_dev_del.
 		 */
-		WRITE_ONCE(ctx->netdev, NULL);
+		rcu_assign_pointer(ctx->netdev, NULL);
 
 		/* Start skipping the RX resync logic completely. */
 		set_bit(TLS_RX_DEV_DEGRADED, &ctx->flags);
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
index 618cee704217..7dfc8023e0f1 100644
--- a/net/tls/tls_device_fallback.c
+++ b/net/tls/tls_device_fallback.c
@@ -426,7 +426,8 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk,
 				      struct net_device *dev,
 				      struct sk_buff *skb)
 {
-	if (dev == tls_get_ctx(sk)->netdev || netif_is_bond_master(dev))
+	if (dev == rcu_dereference_bh(tls_get_ctx(sk)->netdev) ||
+	    netif_is_bond_master(dev))
 		return skb;
 
 	return tls_sw_fallback(sk, skb);
-- 
cgit 


From c2a052a4a949df53f50a5024843432d2234cb824 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Fri, 24 Jun 2022 10:55:41 +0800
Subject: remoteproc: rename len of rpoc_vring to num

Rename the member len in the structure rpoc_vring to num. And remove 'in
bytes' from the comment of it. This is misleading. Because this actually
refers to the size of the virtio vring to be created. The unit is not
bytes.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Message-Id: <20220624025621.128843-2-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/remoteproc/remoteproc_core.c   |  4 ++--
 drivers/remoteproc/remoteproc_virtio.c | 10 +++++-----
 include/linux/remoteproc.h             |  4 ++--
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index 02a04ab34a23..2d2f3bab5888 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -334,7 +334,7 @@ int rproc_alloc_vring(struct rproc_vdev *rvdev, int i)
 	size_t size;
 
 	/* actual size of vring (in bytes) */
-	size = PAGE_ALIGN(vring_size(rvring->len, rvring->align));
+	size = PAGE_ALIGN(vring_size(rvring->num, rvring->align));
 
 	rsc = (void *)rproc->table_ptr + rvdev->rsc_offset;
 
@@ -401,7 +401,7 @@ rproc_parse_vring(struct rproc_vdev *rvdev, struct fw_rsc_vdev *rsc, int i)
 		return -EINVAL;
 	}
 
-	rvring->len = vring->num;
+	rvring->num = vring->num;
 	rvring->align = vring->align;
 	rvring->rvdev = rvdev;
 
diff --git a/drivers/remoteproc/remoteproc_virtio.c b/drivers/remoteproc/remoteproc_virtio.c
index 70ab496d0431..d43d74733f0a 100644
--- a/drivers/remoteproc/remoteproc_virtio.c
+++ b/drivers/remoteproc/remoteproc_virtio.c
@@ -87,7 +87,7 @@ static struct virtqueue *rp_find_vq(struct virtio_device *vdev,
 	struct fw_rsc_vdev *rsc;
 	struct virtqueue *vq;
 	void *addr;
-	int len, size;
+	int num, size;
 
 	/* we're temporarily limited to two virtqueues per rvdev */
 	if (id >= ARRAY_SIZE(rvdev->vring))
@@ -104,20 +104,20 @@ static struct virtqueue *rp_find_vq(struct virtio_device *vdev,
 
 	rvring = &rvdev->vring[id];
 	addr = mem->va;
-	len = rvring->len;
+	num = rvring->num;
 
 	/* zero vring */
-	size = vring_size(len, rvring->align);
+	size = vring_size(num, rvring->align);
 	memset(addr, 0, size);
 
 	dev_dbg(dev, "vring%d: va %pK qsz %d notifyid %d\n",
-		id, addr, len, rvring->notifyid);
+		id, addr, num, rvring->notifyid);
 
 	/*
 	 * Create the new vq, and tell virtio we're not interested in
 	 * the 'weak' smp barriers, since we're talking with a real device.
 	 */
-	vq = vring_new_virtqueue(id, len, rvring->align, vdev, false, ctx,
+	vq = vring_new_virtqueue(id, num, rvring->align, vdev, false, ctx,
 				 addr, rproc_virtio_notify, callback, name);
 	if (!vq) {
 		dev_err(dev, "vring_new_virtqueue %s failed\n", name);
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index 7c943f0a2fc4..aea79c77db0f 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -597,7 +597,7 @@ struct rproc_subdev {
 /**
  * struct rproc_vring - remoteproc vring state
  * @va:	virtual address
- * @len: length, in bytes
+ * @num: vring size
  * @da: device address
  * @align: vring alignment
  * @notifyid: rproc-specific unique vring index
@@ -606,7 +606,7 @@ struct rproc_subdev {
  */
 struct rproc_vring {
 	void *va;
-	int len;
+	int num;
 	u32 da;
 	u32 align;
 	int notifyid;
-- 
cgit 


From da802961832f9852886304290135457519815497 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Mon, 1 Aug 2022 14:38:21 +0800
Subject: virtio: record the maximum queue num supported by the device.

virtio-net can display the maximum (supported by hardware) ring size in
ethtool -g eth0.

When the subsequent patch implements vring reset, it can judge whether
the ring size passed by the driver is legal based on this.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220801063902.129329-2-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 arch/um/drivers/virtio_uml.c             | 1 +
 drivers/platform/mellanox/mlxbf-tmfifo.c | 2 ++
 drivers/remoteproc/remoteproc_virtio.c   | 2 ++
 drivers/s390/virtio/virtio_ccw.c         | 3 +++
 drivers/virtio/virtio_mmio.c             | 2 ++
 drivers/virtio/virtio_pci_legacy.c       | 2 ++
 drivers/virtio/virtio_pci_modern.c       | 2 ++
 drivers/virtio/virtio_vdpa.c             | 2 ++
 include/linux/virtio.h                   | 2 ++
 9 files changed, 18 insertions(+)

(limited to 'include')

diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c
index 82ff3785bf69..e719af8bdf56 100644
--- a/arch/um/drivers/virtio_uml.c
+++ b/arch/um/drivers/virtio_uml.c
@@ -958,6 +958,7 @@ static struct virtqueue *vu_setup_vq(struct virtio_device *vdev,
 		goto error_create;
 	}
 	vq->priv = info;
+	vq->num_max = num;
 	num = virtqueue_get_vring_size(vq);
 
 	if (vu_dev->protocol_features &
diff --git a/drivers/platform/mellanox/mlxbf-tmfifo.c b/drivers/platform/mellanox/mlxbf-tmfifo.c
index 38800e86ed8a..1ae3c56b66b0 100644
--- a/drivers/platform/mellanox/mlxbf-tmfifo.c
+++ b/drivers/platform/mellanox/mlxbf-tmfifo.c
@@ -959,6 +959,8 @@ static int mlxbf_tmfifo_virtio_find_vqs(struct virtio_device *vdev,
 			goto error;
 		}
 
+		vq->num_max = vring->num;
+
 		vqs[i] = vq;
 		vring->vq = vq;
 		vq->priv = vring;
diff --git a/drivers/remoteproc/remoteproc_virtio.c b/drivers/remoteproc/remoteproc_virtio.c
index d43d74733f0a..0f7706e23eb9 100644
--- a/drivers/remoteproc/remoteproc_virtio.c
+++ b/drivers/remoteproc/remoteproc_virtio.c
@@ -125,6 +125,8 @@ static struct virtqueue *rp_find_vq(struct virtio_device *vdev,
 		return ERR_PTR(-ENOMEM);
 	}
 
+	vq->num_max = num;
+
 	rvring->vq = vq;
 	vq->priv = rvring;
 
diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
index 161d3b141f0d..6b86d0280d6b 100644
--- a/drivers/s390/virtio/virtio_ccw.c
+++ b/drivers/s390/virtio/virtio_ccw.c
@@ -530,6 +530,9 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev,
 		err = -ENOMEM;
 		goto out_err;
 	}
+
+	vq->num_max = info->num;
+
 	/* it may have been reduced */
 	info->num = virtqueue_get_vring_size(vq);
 
diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index 945cb8fb60b6..3ff746e3f24a 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -403,6 +403,8 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int in
 		goto error_new_virtqueue;
 	}
 
+	vq->num_max = num;
+
 	/* Activate the queue */
 	writel(virtqueue_get_vring_size(vq), vm_dev->base + VIRTIO_MMIO_QUEUE_NUM);
 	if (vm_dev->version == 1) {
diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c
index a5e5721145c7..2257f1b3d8ae 100644
--- a/drivers/virtio/virtio_pci_legacy.c
+++ b/drivers/virtio/virtio_pci_legacy.c
@@ -135,6 +135,8 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 	if (!vq)
 		return ERR_PTR(-ENOMEM);
 
+	vq->num_max = num;
+
 	q_pfn = virtqueue_get_desc_addr(vq) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT;
 	if (q_pfn >> 32) {
 		dev_err(&vp_dev->pci_dev->dev,
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index 623906b4996c..e7e0b8c850f6 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -218,6 +218,8 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 	if (!vq)
 		return ERR_PTR(-ENOMEM);
 
+	vq->num_max = num;
+
 	/* activate the queue */
 	vp_modern_set_queue_size(mdev, index, virtqueue_get_vring_size(vq));
 	vp_modern_queue_address(mdev, index, virtqueue_get_desc_addr(vq),
diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index c40f7deb6b5a..9670cc79371d 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -183,6 +183,8 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index,
 		goto error_new_virtqueue;
 	}
 
+	vq->num_max = max_num;
+
 	/* Setup virtqueue callback */
 	cb.callback = callback ? virtio_vdpa_virtqueue_cb : NULL;
 	cb.private = info;
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index d8fdf170637c..129bde7521e3 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -19,6 +19,7 @@
  * @priv: a pointer for the virtqueue implementation to use.
  * @index: the zero-based ordinal number for this queue.
  * @num_free: number of elements we expect to be able to fit.
+ * @num_max: the maximum number of elements supported by the device.
  *
  * A note on @num_free: with indirect buffers, each buffer needs one
  * element in the queue, otherwise a buffer will need one element per
@@ -31,6 +32,7 @@ struct virtqueue {
 	struct virtio_device *vdev;
 	unsigned int index;
 	unsigned int num_free;
+	unsigned int num_max;
 	void *priv;
 };
 
-- 
cgit 


From 3086e9fc9173166774652a488467e4176ee1c81b Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Mon, 1 Aug 2022 14:38:22 +0800
Subject: virtio: struct virtio_config_ops add callbacks for queue_reset

reset can be divided into the following four steps (example):
 1. transport: notify the device to reset the queue
 2. vring:     recycle the buffer submitted
 3. vring:     reset/resize the vring (may re-alloc)
 4. transport: mmap vring to device, and enable the queue

In order to support queue reset, add two callbacks in struct
virtio_config_ops to implement steps 1 and 4.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220801063902.129329-3-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_config.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include')

diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index b47c2e7ed0ee..36ec7be1f480 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -78,6 +78,18 @@ struct virtio_shm_region {
  * @set_vq_affinity: set the affinity for a virtqueue (optional).
  * @get_vq_affinity: get the affinity for a virtqueue (optional).
  * @get_shm_region: get a shared memory region based on the index.
+ * @disable_vq_and_reset: reset a queue individually (optional).
+ *	vq: the virtqueue
+ *	Returns 0 on success or error status
+ *	disable_vq_and_reset will guarantee that the callbacks are disabled and
+ *	synchronized.
+ *	Except for the callback, the caller should guarantee that the vring is
+ *	not accessed by any functions of virtqueue.
+ * @enable_vq_after_reset: enable a reset queue
+ *	vq: the virtqueue
+ *	Returns 0 on success or error status
+ *	If disable_vq_and_reset is set, then enable_vq_after_reset must also be
+ *	set.
  */
 typedef void vq_callback_t(struct virtqueue *);
 struct virtio_config_ops {
@@ -104,6 +116,8 @@ struct virtio_config_ops {
 			int index);
 	bool (*get_shm_region)(struct virtio_device *vdev,
 			       struct virtio_shm_region *region, u8 id);
+	int (*disable_vq_and_reset)(struct virtqueue *vq);
+	int (*enable_vq_after_reset)(struct virtqueue *vq);
 };
 
 /* If driver didn't advertise the feature, it will never appear. */
-- 
cgit 


From 07d9629d49584b6f79faa6158cd7aef7e6919703 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Mon, 1 Aug 2022 14:38:27 +0800
Subject: virtio_ring: split: stop __vring_new_virtqueue as export symbol

There is currently only one place to reference __vring_new_virtqueue()
directly from the outside of virtio core. And here vring_new_virtqueue()
can be used instead.

Subsequent patches will modify __vring_new_virtqueue, so stop it as an
export symbol for now.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220801063902.129329-8-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_ring.c | 25 ++++++++++++++++---------
 include/linux/virtio_ring.h  | 10 ----------
 tools/virtio/virtio_test.c   |  4 ++--
 3 files changed, 18 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index a63ef2d99955..8ce6cc73d814 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -204,6 +204,14 @@ struct vring_virtqueue {
 #endif
 };
 
+static struct virtqueue *__vring_new_virtqueue(unsigned int index,
+					       struct vring vring,
+					       struct virtio_device *vdev,
+					       bool weak_barriers,
+					       bool context,
+					       bool (*notify)(struct virtqueue *),
+					       void (*callback)(struct virtqueue *),
+					       const char *name);
 
 /*
  * Helpers.
@@ -2195,14 +2203,14 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
 EXPORT_SYMBOL_GPL(vring_interrupt);
 
 /* Only available for split ring */
-struct virtqueue *__vring_new_virtqueue(unsigned int index,
-					struct vring vring,
-					struct virtio_device *vdev,
-					bool weak_barriers,
-					bool context,
-					bool (*notify)(struct virtqueue *),
-					void (*callback)(struct virtqueue *),
-					const char *name)
+static struct virtqueue *__vring_new_virtqueue(unsigned int index,
+					       struct vring vring,
+					       struct virtio_device *vdev,
+					       bool weak_barriers,
+					       bool context,
+					       bool (*notify)(struct virtqueue *),
+					       void (*callback)(struct virtqueue *),
+					       const char *name)
 {
 	struct vring_virtqueue *vq;
 
@@ -2277,7 +2285,6 @@ err_state:
 	kfree(vq);
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(__vring_new_virtqueue);
 
 struct virtqueue *vring_create_virtqueue(
 	unsigned int index,
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index b485b13fa50b..8b8af1a38991 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -76,16 +76,6 @@ struct virtqueue *vring_create_virtqueue(unsigned int index,
 					 void (*callback)(struct virtqueue *vq),
 					 const char *name);
 
-/* Creates a virtqueue with a custom layout. */
-struct virtqueue *__vring_new_virtqueue(unsigned int index,
-					struct vring vring,
-					struct virtio_device *vdev,
-					bool weak_barriers,
-					bool ctx,
-					bool (*notify)(struct virtqueue *),
-					void (*callback)(struct virtqueue *),
-					const char *name);
-
 /*
  * Creates a virtqueue with a standard layout but a caller-allocated
  * ring.
diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c
index 23f142af544a..86a410ddcedd 100644
--- a/tools/virtio/virtio_test.c
+++ b/tools/virtio/virtio_test.c
@@ -102,8 +102,8 @@ static void vq_reset(struct vq_info *info, int num, struct virtio_device *vdev)
 
 	memset(info->ring, 0, vring_size(num, 4096));
 	vring_init(&info->vring, num, info->ring, 4096);
-	info->vq = __vring_new_virtqueue(info->idx, info->vring, vdev, true,
-					 false, vq_notify, vq_callback, "test");
+	info->vq = vring_new_virtqueue(info->idx, num, 4096, vdev, true, false,
+				       info->ring, vq_notify, vq_callback, "test");
 	assert(info->vq);
 	info->vq->priv = info;
 }
-- 
cgit 


From c790e8e1817f1a17c05e64f1c4f16f231b8529d5 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Mon, 1 Aug 2022 14:38:44 +0800
Subject: virtio_ring: introduce virtqueue_resize()

Introduce virtqueue_resize() to implement the resize of vring.
Based on these, the driver can dynamically adjust the size of the vring.
For example: ethtool -G.

virtqueue_resize() implements resize based on the vq reset function. In
case of failure to allocate a new vring, it will give up resize and use
the original vring.

During this process, if the re-enable reset vq fails, the vq can no
longer be used. Although the probability of this situation is not high.

The parameter recycle is used to recycle the buffer that is no longer
used.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220801063902.129329-25-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_ring.c | 69 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/virtio.h       |  3 ++
 2 files changed, 72 insertions(+)

(limited to 'include')

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index bea5a3448217..6447a09e2e38 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -2539,6 +2539,75 @@ struct virtqueue *vring_create_virtqueue(
 }
 EXPORT_SYMBOL_GPL(vring_create_virtqueue);
 
+/**
+ * virtqueue_resize - resize the vring of vq
+ * @_vq: the struct virtqueue we're talking about.
+ * @num: new ring num
+ * @recycle: callback for recycle the useless buffer
+ *
+ * When it is really necessary to create a new vring, it will set the current vq
+ * into the reset state. Then call the passed callback to recycle the buffer
+ * that is no longer used. Only after the new vring is successfully created, the
+ * old vring will be released.
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error.
+ * 0: success.
+ * -ENOMEM: Failed to allocate a new ring, fall back to the original ring size.
+ *  vq can still work normally
+ * -EBUSY: Failed to sync with device, vq may not work properly
+ * -ENOENT: Transport or device not supported
+ * -E2BIG/-EINVAL: num error
+ * -EPERM: Operation not permitted
+ *
+ */
+int virtqueue_resize(struct virtqueue *_vq, u32 num,
+		     void (*recycle)(struct virtqueue *vq, void *buf))
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	struct virtio_device *vdev = vq->vq.vdev;
+	void *buf;
+	int err;
+
+	if (!vq->we_own_ring)
+		return -EPERM;
+
+	if (num > vq->vq.num_max)
+		return -E2BIG;
+
+	if (!num)
+		return -EINVAL;
+
+	if ((vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num) == num)
+		return 0;
+
+	if (!vdev->config->disable_vq_and_reset)
+		return -ENOENT;
+
+	if (!vdev->config->enable_vq_after_reset)
+		return -ENOENT;
+
+	err = vdev->config->disable_vq_and_reset(_vq);
+	if (err)
+		return err;
+
+	while ((buf = virtqueue_detach_unused_buf(_vq)) != NULL)
+		recycle(_vq, buf);
+
+	if (vq->packed_ring)
+		err = virtqueue_resize_packed(_vq, num);
+	else
+		err = virtqueue_resize_split(_vq, num);
+
+	if (vdev->config->enable_vq_after_reset(_vq))
+		return -EBUSY;
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(virtqueue_resize);
+
 /* Only available for split ring */
 struct virtqueue *vring_new_virtqueue(unsigned int index,
 				      unsigned int num,
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 129bde7521e3..62e31bca5602 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -91,6 +91,9 @@ dma_addr_t virtqueue_get_desc_addr(struct virtqueue *vq);
 dma_addr_t virtqueue_get_avail_addr(struct virtqueue *vq);
 dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
 
+int virtqueue_resize(struct virtqueue *vq, u32 num,
+		     void (*recycle)(struct virtqueue *vq, void *buf));
+
 /**
  * virtio_device - representation of a device using virtio
  * @index: unique position on the virtio bus
-- 
cgit 


From ea024594b1dc5b6719c1400ae154690f5c203996 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Mon, 1 Aug 2022 14:38:45 +0800
Subject: virtio_pci: struct virtio_pci_common_cfg add queue_notify_data

Add queue_notify_data in struct virtio_pci_common_cfg, which comes from
here https://github.com/oasis-tcs/virtio-spec/issues/89

In order not to affect the API, add a dedicated structure struct
virtio_pci_modern_common_cfg to virtio_pci_modern.h.

Since I want to add queue_reset after queue_notify_data, I submitted
this patch first.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220801063902.129329-26-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_pci_modern.h | 7 +++++++
 include/uapi/linux/virtio_pci.h   | 1 +
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
index eb2bd9b4077d..41f5a018bd94 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -5,6 +5,13 @@
 #include <linux/pci.h>
 #include <linux/virtio_pci.h>
 
+struct virtio_pci_modern_common_cfg {
+	struct virtio_pci_common_cfg cfg;
+
+	__le16 queue_notify_data;	/* read-write */
+	__le16 padding;
+};
+
 struct virtio_pci_modern_device {
 	struct pci_dev *pci_dev;
 
diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
index 3a86f36d7e3d..f5981a874481 100644
--- a/include/uapi/linux/virtio_pci.h
+++ b/include/uapi/linux/virtio_pci.h
@@ -202,6 +202,7 @@ struct virtio_pci_cfg_cap {
 #define VIRTIO_PCI_COMMON_Q_AVAILHI	44
 #define VIRTIO_PCI_COMMON_Q_USEDLO	48
 #define VIRTIO_PCI_COMMON_Q_USEDHI	52
+#define VIRTIO_PCI_COMMON_Q_NDATA	56
 
 #endif /* VIRTIO_PCI_NO_MODERN */
 
-- 
cgit 


From 3251063155032729b8793ac3957136ae25c0bafa Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Mon, 1 Aug 2022 14:38:46 +0800
Subject: virtio: allow to unbreak/break virtqueue individually

This patch allows the new introduced
__virtqueue_break()/__virtqueue_unbreak() to break/unbreak the
virtqueue.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220801063902.129329-27-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_ring.c | 24 ++++++++++++++++++++++++
 include/linux/virtio.h       |  3 +++
 2 files changed, 27 insertions(+)

(limited to 'include')

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 6447a09e2e38..accb3ae6cc95 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -2724,6 +2724,30 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
 
+/*
+ * This function should only be called by the core, not directly by the driver.
+ */
+void __virtqueue_break(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	/* Pairs with READ_ONCE() in virtqueue_is_broken(). */
+	WRITE_ONCE(vq->broken, true);
+}
+EXPORT_SYMBOL_GPL(__virtqueue_break);
+
+/*
+ * This function should only be called by the core, not directly by the driver.
+ */
+void __virtqueue_unbreak(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	/* Pairs with READ_ONCE() in virtqueue_is_broken(). */
+	WRITE_ONCE(vq->broken, false);
+}
+EXPORT_SYMBOL_GPL(__virtqueue_unbreak);
+
 bool virtqueue_is_broken(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 62e31bca5602..d45ee82a4470 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -138,6 +138,9 @@ bool is_virtio_device(struct device *dev);
 void virtio_break_device(struct virtio_device *dev);
 void __virtio_unbreak_device(struct virtio_device *dev);
 
+void __virtqueue_break(struct virtqueue *_vq);
+void __virtqueue_unbreak(struct virtqueue *_vq);
+
 void virtio_config_changed(struct virtio_device *dev);
 #ifdef CONFIG_PM_SLEEP
 int virtio_device_freeze(struct virtio_device *dev);
-- 
cgit 


From d94587b5bb5c4bba32fbc2bd92c86cc6de25167f Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Mon, 1 Aug 2022 14:38:47 +0800
Subject: virtio: queue_reset: add VIRTIO_F_RING_RESET

Added VIRTIO_F_RING_RESET, it came from here

https://github.com/oasis-tcs/virtio-spec/issues/124
https://github.com/oasis-tcs/virtio-spec/issues/139

This feature indicates that the driver can reset a queue individually.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220801063902.129329-28-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/uapi/linux/virtio_config.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
index f0fb0ae021c0..3c05162bc988 100644
--- a/include/uapi/linux/virtio_config.h
+++ b/include/uapi/linux/virtio_config.h
@@ -52,7 +52,7 @@
  * rest are per-device feature bits.
  */
 #define VIRTIO_TRANSPORT_F_START	28
-#define VIRTIO_TRANSPORT_F_END		38
+#define VIRTIO_TRANSPORT_F_END		41
 
 #ifndef VIRTIO_CONFIG_NO_LEGACY
 /* Do we get callbacks when the ring is completely used, even if we've
@@ -98,4 +98,9 @@
  * Does the device support Single Root I/O Virtualization?
  */
 #define VIRTIO_F_SR_IOV			37
+
+/*
+ * This feature indicates that the driver can reset a queue individually.
+ */
+#define VIRTIO_F_RING_RESET		40
 #endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */
-- 
cgit 


From 4913e85441b40386c4bb093f188b955d8165f1b7 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Mon, 1 Aug 2022 14:38:48 +0800
Subject: virtio_ring: struct virtqueue introduce reset

Introduce a new member reset to the structure virtqueue to determine
whether the current vq is in the reset state. Subsequent patches will
use it.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220801063902.129329-29-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_ring.c | 2 ++
 include/linux/virtio.h       | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index accb3ae6cc95..d66c8e6d0ef3 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -1996,6 +1996,7 @@ static struct virtqueue *vring_create_virtqueue_packed(
 	vq->vq.vdev = vdev;
 	vq->vq.name = name;
 	vq->vq.index = index;
+	vq->vq.reset = false;
 	vq->we_own_ring = true;
 	vq->notify = notify;
 	vq->weak_barriers = weak_barriers;
@@ -2481,6 +2482,7 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index,
 	vq->vq.vdev = vdev;
 	vq->vq.name = name;
 	vq->vq.index = index;
+	vq->vq.reset = false;
 	vq->we_own_ring = false;
 	vq->notify = notify;
 	vq->weak_barriers = weak_barriers;
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index d45ee82a4470..a3f73bb6733e 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -20,6 +20,7 @@
  * @index: the zero-based ordinal number for this queue.
  * @num_free: number of elements we expect to be able to fit.
  * @num_max: the maximum number of elements supported by the device.
+ * @reset: vq is in reset state or not.
  *
  * A note on @num_free: with indirect buffers, each buffer needs one
  * element in the queue, otherwise a buffer will need one element per
@@ -34,6 +35,7 @@ struct virtqueue {
 	unsigned int num_free;
 	unsigned int num_max;
 	void *priv;
+	bool reset;
 };
 
 int virtqueue_add_outbuf(struct virtqueue *vq,
-- 
cgit 


From 0cdd450e70510c9e13af8099e9f6c1467e6a0b91 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Mon, 1 Aug 2022 14:38:49 +0800
Subject: virtio_pci: struct virtio_pci_common_cfg add queue_reset

Add queue_reset in virtio_pci_modern_common_cfg.

 https://github.com/oasis-tcs/virtio-spec/issues/124
 https://github.com/oasis-tcs/virtio-spec/issues/139

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220801063902.129329-30-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_pci_modern.h | 2 +-
 include/uapi/linux/virtio_pci.h   | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
index 41f5a018bd94..05123b9a606f 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -9,7 +9,7 @@ struct virtio_pci_modern_common_cfg {
 	struct virtio_pci_common_cfg cfg;
 
 	__le16 queue_notify_data;	/* read-write */
-	__le16 padding;
+	__le16 queue_reset;		/* read-write */
 };
 
 struct virtio_pci_modern_device {
diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
index f5981a874481..f703afc7ad31 100644
--- a/include/uapi/linux/virtio_pci.h
+++ b/include/uapi/linux/virtio_pci.h
@@ -203,6 +203,7 @@ struct virtio_pci_cfg_cap {
 #define VIRTIO_PCI_COMMON_Q_USEDLO	48
 #define VIRTIO_PCI_COMMON_Q_USEDHI	52
 #define VIRTIO_PCI_COMMON_Q_NDATA	56
+#define VIRTIO_PCI_COMMON_Q_RESET	58
 
 #endif /* VIRTIO_PCI_NO_MODERN */
 
-- 
cgit 


From 0b50cece0b7857732d2055f2c77f8730c10f9196 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Mon, 1 Aug 2022 14:38:50 +0800
Subject: virtio_pci: introduce helper to get/set queue reset

Introduce new helpers to implement queue reset and get queue reset
status.

 https://github.com/oasis-tcs/virtio-spec/issues/124
 https://github.com/oasis-tcs/virtio-spec/issues/139

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220801063902.129329-31-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_pci_modern_dev.c | 39 ++++++++++++++++++++++++++++++++++
 include/linux/virtio_pci_modern.h      |  2 ++
 2 files changed, 41 insertions(+)

(limited to 'include')

diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c
index fa2a9445bb18..869cb46bef96 100644
--- a/drivers/virtio/virtio_pci_modern_dev.c
+++ b/drivers/virtio/virtio_pci_modern_dev.c
@@ -3,6 +3,7 @@
 #include <linux/virtio_pci_modern.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/delay.h>
 
 /*
  * vp_modern_map_capability - map a part of virtio pci capability
@@ -474,6 +475,44 @@ void vp_modern_set_status(struct virtio_pci_modern_device *mdev,
 }
 EXPORT_SYMBOL_GPL(vp_modern_set_status);
 
+/*
+ * vp_modern_get_queue_reset - get the queue reset status
+ * @mdev: the modern virtio-pci device
+ * @index: queue index
+ */
+int vp_modern_get_queue_reset(struct virtio_pci_modern_device *mdev, u16 index)
+{
+	struct virtio_pci_modern_common_cfg __iomem *cfg;
+
+	cfg = (struct virtio_pci_modern_common_cfg __iomem *)mdev->common;
+
+	vp_iowrite16(index, &cfg->cfg.queue_select);
+	return vp_ioread16(&cfg->queue_reset);
+}
+EXPORT_SYMBOL_GPL(vp_modern_get_queue_reset);
+
+/*
+ * vp_modern_set_queue_reset - reset the queue
+ * @mdev: the modern virtio-pci device
+ * @index: queue index
+ */
+void vp_modern_set_queue_reset(struct virtio_pci_modern_device *mdev, u16 index)
+{
+	struct virtio_pci_modern_common_cfg __iomem *cfg;
+
+	cfg = (struct virtio_pci_modern_common_cfg __iomem *)mdev->common;
+
+	vp_iowrite16(index, &cfg->cfg.queue_select);
+	vp_iowrite16(1, &cfg->queue_reset);
+
+	while (vp_ioread16(&cfg->queue_reset))
+		msleep(1);
+
+	while (vp_ioread16(&cfg->cfg.queue_enable))
+		msleep(1);
+}
+EXPORT_SYMBOL_GPL(vp_modern_set_queue_reset);
+
 /*
  * vp_modern_queue_vector - set the MSIX vector for a specific virtqueue
  * @mdev: the modern virtio-pci device
diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
index 05123b9a606f..c4eeb79b0139 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -113,4 +113,6 @@ void __iomem * vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev,
 				       u16 index, resource_size_t *pa);
 int vp_modern_probe(struct virtio_pci_modern_device *mdev);
 void vp_modern_remove(struct virtio_pci_modern_device *mdev);
+int vp_modern_get_queue_reset(struct virtio_pci_modern_device *mdev, u16 index);
+void vp_modern_set_queue_reset(struct virtio_pci_modern_device *mdev, u16 index);
 #endif
-- 
cgit 


From a10fba0377145fccefea4dc4dd5915b7ed87e546 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Mon, 1 Aug 2022 14:38:53 +0800
Subject: virtio: find_vqs() add arg sizes

find_vqs() adds a new parameter sizes to specify the size of each vq
vring.

NULL as sizes means that all queues in find_vqs() use the maximum size.
A value in the array is 0, which means that the corresponding queue uses
the maximum size.

In the split scenario, the meaning of size is the largest size, because
it may be limited by memory, the virtio core will try a smaller size.
And the size is power of 2.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220801063902.129329-34-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 arch/um/drivers/virtio_uml.c             |  2 +-
 drivers/platform/mellanox/mlxbf-tmfifo.c |  1 +
 drivers/remoteproc/remoteproc_virtio.c   |  1 +
 drivers/s390/virtio/virtio_ccw.c         |  1 +
 drivers/virtio/virtio_mmio.c             |  1 +
 drivers/virtio/virtio_pci_common.c       |  2 +-
 drivers/virtio/virtio_pci_common.h       |  2 +-
 drivers/virtio/virtio_pci_modern.c       |  7 +++++--
 drivers/virtio/virtio_vdpa.c             |  1 +
 include/linux/virtio_config.h            | 14 +++++++++-----
 10 files changed, 22 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c
index e719af8bdf56..79e38afd4b91 100644
--- a/arch/um/drivers/virtio_uml.c
+++ b/arch/um/drivers/virtio_uml.c
@@ -1011,7 +1011,7 @@ error_kzalloc:
 
 static int vu_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 		       struct virtqueue *vqs[], vq_callback_t *callbacks[],
-		       const char * const names[], const bool *ctx,
+		       const char * const names[], u32 sizes[], const bool *ctx,
 		       struct irq_affinity *desc)
 {
 	struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev);
diff --git a/drivers/platform/mellanox/mlxbf-tmfifo.c b/drivers/platform/mellanox/mlxbf-tmfifo.c
index 1ae3c56b66b0..8be13d416f48 100644
--- a/drivers/platform/mellanox/mlxbf-tmfifo.c
+++ b/drivers/platform/mellanox/mlxbf-tmfifo.c
@@ -928,6 +928,7 @@ static int mlxbf_tmfifo_virtio_find_vqs(struct virtio_device *vdev,
 					struct virtqueue *vqs[],
 					vq_callback_t *callbacks[],
 					const char * const names[],
+					u32 sizes[],
 					const bool *ctx,
 					struct irq_affinity *desc)
 {
diff --git a/drivers/remoteproc/remoteproc_virtio.c b/drivers/remoteproc/remoteproc_virtio.c
index 0f7706e23eb9..81c4f5776109 100644
--- a/drivers/remoteproc/remoteproc_virtio.c
+++ b/drivers/remoteproc/remoteproc_virtio.c
@@ -158,6 +158,7 @@ static int rproc_virtio_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 				 struct virtqueue *vqs[],
 				 vq_callback_t *callbacks[],
 				 const char * const names[],
+				 u32 sizes[],
 				 const bool * ctx,
 				 struct irq_affinity *desc)
 {
diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
index 6b86d0280d6b..72500cd2dbf5 100644
--- a/drivers/s390/virtio/virtio_ccw.c
+++ b/drivers/s390/virtio/virtio_ccw.c
@@ -635,6 +635,7 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 			       struct virtqueue *vqs[],
 			       vq_callback_t *callbacks[],
 			       const char * const names[],
+			       u32 sizes[],
 			       const bool *ctx,
 			       struct irq_affinity *desc)
 {
diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index 3ff746e3f24a..dfcecfd7aba1 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -474,6 +474,7 @@ static int vm_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 		       struct virtqueue *vqs[],
 		       vq_callback_t *callbacks[],
 		       const char * const names[],
+		       u32 sizes[],
 		       const bool *ctx,
 		       struct irq_affinity *desc)
 {
diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index ad258a9d3b9f..7ad734584823 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -396,7 +396,7 @@ out_del_vqs:
 /* the config->find_vqs() implementation */
 int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 		struct virtqueue *vqs[], vq_callback_t *callbacks[],
-		const char * const names[], const bool *ctx,
+		const char * const names[], u32 sizes[], const bool *ctx,
 		struct irq_affinity *desc)
 {
 	int err;
diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index 23112d84218f..a5ff838b85a5 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -110,7 +110,7 @@ void vp_del_vqs(struct virtio_device *vdev);
 /* the config->find_vqs() implementation */
 int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 		struct virtqueue *vqs[], vq_callback_t *callbacks[],
-		const char * const names[], const bool *ctx,
+		const char * const names[], u32 sizes[], const bool *ctx,
 		struct irq_affinity *desc);
 const char *vp_bus_name(struct virtio_device *vdev);
 
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index c3b9f2761849..be51ec849252 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -347,12 +347,15 @@ err:
 static int vp_modern_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 			      struct virtqueue *vqs[],
 			      vq_callback_t *callbacks[],
-			      const char * const names[], const bool *ctx,
+			      const char * const names[],
+			      u32 sizes[],
+			      const bool *ctx,
 			      struct irq_affinity *desc)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	struct virtqueue *vq;
-	int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, desc);
+	int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, sizes, ctx,
+			     desc);
 
 	if (rc)
 		return rc;
diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index 9670cc79371d..832d2c5b1b19 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -269,6 +269,7 @@ static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 				struct virtqueue *vqs[],
 				vq_callback_t *callbacks[],
 				const char * const names[],
+				u32 sizes[],
 				const bool *ctx,
 				struct irq_affinity *desc)
 {
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 36ec7be1f480..888f7e96f0c7 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -55,6 +55,7 @@ struct virtio_shm_region {
  *		include a NULL entry for vqs that do not need a callback
  *	names: array of virtqueue names (mainly for debugging)
  *		include a NULL entry for vqs unused by driver
+ *	sizes: array of virtqueue sizes
  *	Returns 0 on success or error status
  * @del_vqs: free virtqueues found by find_vqs().
  * @synchronize_cbs: synchronize with the virtqueue callbacks (optional)
@@ -103,7 +104,9 @@ struct virtio_config_ops {
 	void (*reset)(struct virtio_device *vdev);
 	int (*find_vqs)(struct virtio_device *, unsigned nvqs,
 			struct virtqueue *vqs[], vq_callback_t *callbacks[],
-			const char * const names[], const bool *ctx,
+			const char * const names[],
+			u32 sizes[],
+			const bool *ctx,
 			struct irq_affinity *desc);
 	void (*del_vqs)(struct virtio_device *);
 	void (*synchronize_cbs)(struct virtio_device *);
@@ -212,7 +215,7 @@ struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev,
 	const char *names[] = { n };
 	struct virtqueue *vq;
 	int err = vdev->config->find_vqs(vdev, 1, &vq, callbacks, names, NULL,
-					 NULL);
+					 NULL, NULL);
 	if (err < 0)
 		return ERR_PTR(err);
 	return vq;
@@ -224,7 +227,8 @@ int virtio_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 			const char * const names[],
 			struct irq_affinity *desc)
 {
-	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, NULL, desc);
+	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, NULL,
+				      NULL, desc);
 }
 
 static inline
@@ -233,8 +237,8 @@ int virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs,
 			const char * const names[], const bool *ctx,
 			struct irq_affinity *desc)
 {
-	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, ctx,
-				      desc);
+	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, NULL,
+				      ctx, desc);
 }
 
 /**
-- 
cgit 


From fe3dc04e31aa51f91dc7f741a5f76cc4817eb5b4 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Mon, 1 Aug 2022 14:38:56 +0800
Subject: virtio: add helper virtio_find_vqs_ctx_size()

Introduce helper virtio_find_vqs_ctx_size() to call find_vqs and specify
the maximum size of each vq ring.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220801063902.129329-37-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_config.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 888f7e96f0c7..6adff09f7170 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -241,6 +241,18 @@ int virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs,
 				      ctx, desc);
 }
 
+static inline
+int virtio_find_vqs_ctx_size(struct virtio_device *vdev, u32 nvqs,
+			     struct virtqueue *vqs[],
+			     vq_callback_t *callbacks[],
+			     const char * const names[],
+			     u32 sizes[],
+			     const bool *ctx, struct irq_affinity *desc)
+{
+	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, sizes,
+				      ctx, desc);
+}
+
 /**
  * virtio_synchronize_cbs - synchronize with virtqueue callbacks
  * @vdev: the device
-- 
cgit 


From 699b045a8e43bd1063db4795be685bfd659649dc Mon Sep 17 00:00:00 2001
From: Alvaro Karsz <alvaro.karsz@solid-run.com>
Date: Mon, 18 Jul 2022 12:11:02 +0300
Subject: net: virtio_net: notifications coalescing support

New VirtIO network feature: VIRTIO_NET_F_NOTF_COAL.

Control a Virtio network device notifications coalescing parameters
using the control virtqueue.

A device that supports this fetature can receive
VIRTIO_NET_CTRL_NOTF_COAL control commands.

- VIRTIO_NET_CTRL_NOTF_COAL_TX_SET:
  Ask the network device to change the following parameters:
  - tx_usecs: Maximum number of usecs to delay a TX notification.
  - tx_max_packets: Maximum number of packets to send before a
    TX notification.

- VIRTIO_NET_CTRL_NOTF_COAL_RX_SET:
  Ask the network device to change the following parameters:
  - rx_usecs: Maximum number of usecs to delay a RX notification.
  - rx_max_packets: Maximum number of packets to receive before a
    RX notification.

VirtIO spec. patch:
https://lists.oasis-open.org/archives/virtio-comment/202206/msg00100.html

Signed-off-by: Alvaro Karsz <alvaro.karsz@solid-run.com>
Message-Id: <20220718091102.498774-1-alvaro.karsz@solid-run.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c        | 111 ++++++++++++++++++++++++++++++++++------
 include/uapi/linux/virtio_net.h |  34 +++++++++++-
 2 files changed, 129 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 16783ed782c5..d9c434b00e9b 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -270,6 +270,12 @@ struct virtnet_info {
 	u8 duplex;
 	u32 speed;
 
+	/* Interrupt coalescing settings */
+	u32 tx_usecs;
+	u32 rx_usecs;
+	u32 tx_max_packets;
+	u32 rx_max_packets;
+
 	unsigned long guest_offloads;
 	unsigned long guest_offloads_capable;
 
@@ -2737,27 +2743,89 @@ static int virtnet_get_link_ksettings(struct net_device *dev,
 	return 0;
 }
 
+static int virtnet_send_notf_coal_cmds(struct virtnet_info *vi,
+				       struct ethtool_coalesce *ec)
+{
+	struct scatterlist sgs_tx, sgs_rx;
+	struct virtio_net_ctrl_coal_tx coal_tx;
+	struct virtio_net_ctrl_coal_rx coal_rx;
+
+	coal_tx.tx_usecs = cpu_to_le32(ec->tx_coalesce_usecs);
+	coal_tx.tx_max_packets = cpu_to_le32(ec->tx_max_coalesced_frames);
+	sg_init_one(&sgs_tx, &coal_tx, sizeof(coal_tx));
+
+	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL,
+				  VIRTIO_NET_CTRL_NOTF_COAL_TX_SET,
+				  &sgs_tx))
+		return -EINVAL;
+
+	/* Save parameters */
+	vi->tx_usecs = ec->tx_coalesce_usecs;
+	vi->tx_max_packets = ec->tx_max_coalesced_frames;
+
+	coal_rx.rx_usecs = cpu_to_le32(ec->rx_coalesce_usecs);
+	coal_rx.rx_max_packets = cpu_to_le32(ec->rx_max_coalesced_frames);
+	sg_init_one(&sgs_rx, &coal_rx, sizeof(coal_rx));
+
+	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL,
+				  VIRTIO_NET_CTRL_NOTF_COAL_RX_SET,
+				  &sgs_rx))
+		return -EINVAL;
+
+	/* Save parameters */
+	vi->rx_usecs = ec->rx_coalesce_usecs;
+	vi->rx_max_packets = ec->rx_max_coalesced_frames;
+
+	return 0;
+}
+
+static int virtnet_coal_params_supported(struct ethtool_coalesce *ec)
+{
+	/* usecs coalescing is supported only if VIRTIO_NET_F_NOTF_COAL
+	 * feature is negotiated.
+	 */
+	if (ec->rx_coalesce_usecs || ec->tx_coalesce_usecs)
+		return -EOPNOTSUPP;
+
+	if (ec->tx_max_coalesced_frames > 1 ||
+	    ec->rx_max_coalesced_frames != 1)
+		return -EINVAL;
+
+	return 0;
+}
+
 static int virtnet_set_coalesce(struct net_device *dev,
 				struct ethtool_coalesce *ec,
 				struct kernel_ethtool_coalesce *kernel_coal,
 				struct netlink_ext_ack *extack)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
-	int i, napi_weight;
-
-	if (ec->tx_max_coalesced_frames > 1 ||
-	    ec->rx_max_coalesced_frames != 1)
-		return -EINVAL;
+	int ret, i, napi_weight;
+	bool update_napi = false;
 
+	/* Can't change NAPI weight if the link is up */
 	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
 	if (napi_weight ^ vi->sq[0].napi.weight) {
 		if (dev->flags & IFF_UP)
 			return -EBUSY;
+		else
+			update_napi = true;
+	}
+
+	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL))
+		ret = virtnet_send_notf_coal_cmds(vi, ec);
+	else
+		ret = virtnet_coal_params_supported(ec);
+
+	if (ret)
+		return ret;
+
+	if (update_napi) {
 		for (i = 0; i < vi->max_queue_pairs; i++)
 			vi->sq[i].napi.weight = napi_weight;
 	}
 
-	return 0;
+	return ret;
 }
 
 static int virtnet_get_coalesce(struct net_device *dev,
@@ -2765,16 +2833,19 @@ static int virtnet_get_coalesce(struct net_device *dev,
 				struct kernel_ethtool_coalesce *kernel_coal,
 				struct netlink_ext_ack *extack)
 {
-	struct ethtool_coalesce ec_default = {
-		.cmd = ETHTOOL_GCOALESCE,
-		.rx_max_coalesced_frames = 1,
-	};
 	struct virtnet_info *vi = netdev_priv(dev);
 
-	memcpy(ec, &ec_default, sizeof(ec_default));
+	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) {
+		ec->rx_coalesce_usecs = vi->rx_usecs;
+		ec->tx_coalesce_usecs = vi->tx_usecs;
+		ec->tx_max_coalesced_frames = vi->tx_max_packets;
+		ec->rx_max_coalesced_frames = vi->rx_max_packets;
+	} else {
+		ec->rx_max_coalesced_frames = 1;
 
-	if (vi->sq[0].napi.weight)
-		ec->tx_max_coalesced_frames = 1;
+		if (vi->sq[0].napi.weight)
+			ec->tx_max_coalesced_frames = 1;
+	}
 
 	return 0;
 }
@@ -2893,7 +2964,8 @@ static int virtnet_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info)
 }
 
 static const struct ethtool_ops virtnet_ethtool_ops = {
-	.supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES,
+	.supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES |
+		ETHTOOL_COALESCE_USECS,
 	.get_drvinfo = virtnet_get_drvinfo,
 	.get_link = ethtool_op_get_link,
 	.get_ringparam = virtnet_get_ringparam,
@@ -3606,6 +3678,8 @@ static bool virtnet_validate_features(struct virtio_device *vdev)
 	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_RSS,
 			     "VIRTIO_NET_F_CTRL_VQ") ||
 	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_HASH_REPORT,
+			     "VIRTIO_NET_F_CTRL_VQ") ||
+	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_NOTF_COAL,
 			     "VIRTIO_NET_F_CTRL_VQ"))) {
 		return false;
 	}
@@ -3742,6 +3816,13 @@ static int virtnet_probe(struct virtio_device *vdev)
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
 		vi->mergeable_rx_bufs = true;
 
+	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) {
+		vi->rx_usecs = 0;
+		vi->tx_usecs = 0;
+		vi->tx_max_packets = 0;
+		vi->rx_max_packets = 0;
+	}
+
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_HASH_REPORT))
 		vi->has_rss_hash_report = true;
 
@@ -3977,7 +4058,7 @@ static struct virtio_device_id id_table[] = {
 	VIRTIO_NET_F_CTRL_MAC_ADDR, \
 	VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
 	VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY, \
-	VIRTIO_NET_F_RSS, VIRTIO_NET_F_HASH_REPORT
+	VIRTIO_NET_F_RSS, VIRTIO_NET_F_HASH_REPORT, VIRTIO_NET_F_NOTF_COAL
 
 static unsigned int features[] = {
 	VIRTNET_FEATURES,
diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index 3f55a4215f11..29ced55514d4 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -56,7 +56,7 @@
 #define VIRTIO_NET_F_MQ	22	/* Device supports Receive Flow
 					 * Steering */
 #define VIRTIO_NET_F_CTRL_MAC_ADDR 23	/* Set MAC address */
-
+#define VIRTIO_NET_F_NOTF_COAL	53	/* Guest can handle notifications coalescing */
 #define VIRTIO_NET_F_HASH_REPORT  57	/* Supports hash report */
 #define VIRTIO_NET_F_RSS	  60	/* Supports RSS RX steering */
 #define VIRTIO_NET_F_RSC_EXT	  61	/* extended coalescing info */
@@ -355,4 +355,36 @@ struct virtio_net_hash_config {
 #define VIRTIO_NET_CTRL_GUEST_OFFLOADS   5
 #define VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET        0
 
+/*
+ * Control notifications coalescing.
+ *
+ * Request the device to change the notifications coalescing parameters.
+ *
+ * Available with the VIRTIO_NET_F_NOTF_COAL feature bit.
+ */
+#define VIRTIO_NET_CTRL_NOTF_COAL		6
+/*
+ * Set the tx-usecs/tx-max-packets patameters.
+ * tx-usecs - Maximum number of usecs to delay a TX notification.
+ * tx-max-packets - Maximum number of packets to send before a TX notification.
+ */
+struct virtio_net_ctrl_coal_tx {
+	__le32 tx_max_packets;
+	__le32 tx_usecs;
+};
+
+#define VIRTIO_NET_CTRL_NOTF_COAL_TX_SET		0
+
+/*
+ * Set the rx-usecs/rx-max-packets patameters.
+ * rx-usecs - Maximum number of usecs to delay a RX notification.
+ * rx-max-frames - Maximum number of packets to receive before a RX notification.
+ */
+struct virtio_net_ctrl_coal_rx {
+	__le32 rx_max_packets;
+	__le32 rx_usecs;
+};
+
+#define VIRTIO_NET_CTRL_NOTF_COAL_RX_SET		1
+
 #endif /* _UAPI_LINUX_VIRTIO_NET_H */
-- 
cgit 


From 79a463be9e0051997508d52cf411ed5e91d657f6 Mon Sep 17 00:00:00 2001
From: Xie Yongji <xieyongji@bytedance.com>
Date: Wed, 3 Aug 2022 12:55:22 +0800
Subject: vduse: Support registering userspace memory for IOVA regions

Introduce two ioctls: VDUSE_IOTLB_REG_UMEM and
VDUSE_IOTLB_DEREG_UMEM to support registering
and de-registering userspace memory for IOVA
regions.

Now it only supports registering userspace memory
for bounce buffer region in virtio-vdpa case.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220803045523.23851-5-xieyongji@bytedance.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/vdpa_user/vduse_dev.c | 141 +++++++++++++++++++++++++++++++++++++
 include/uapi/linux/vduse.h         |  23 ++++++
 2 files changed, 164 insertions(+)

(limited to 'include')

diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index 3bc27de58f46..eedff0a3885a 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -21,6 +21,8 @@
 #include <linux/uio.h>
 #include <linux/vdpa.h>
 #include <linux/nospec.h>
+#include <linux/vmalloc.h>
+#include <linux/sched/mm.h>
 #include <uapi/linux/vduse.h>
 #include <uapi/linux/vdpa.h>
 #include <uapi/linux/virtio_config.h>
@@ -64,6 +66,13 @@ struct vduse_vdpa {
 	struct vduse_dev *dev;
 };
 
+struct vduse_umem {
+	unsigned long iova;
+	unsigned long npages;
+	struct page **pages;
+	struct mm_struct *mm;
+};
+
 struct vduse_dev {
 	struct vduse_vdpa *vdev;
 	struct device *dev;
@@ -95,6 +104,8 @@ struct vduse_dev {
 	u8 status;
 	u32 vq_num;
 	u32 vq_align;
+	struct vduse_umem *umem;
+	struct mutex mem_lock;
 };
 
 struct vduse_dev_msg {
@@ -917,6 +928,102 @@ unlock:
 	return ret;
 }
 
+static int vduse_dev_dereg_umem(struct vduse_dev *dev,
+				u64 iova, u64 size)
+{
+	int ret;
+
+	mutex_lock(&dev->mem_lock);
+	ret = -ENOENT;
+	if (!dev->umem)
+		goto unlock;
+
+	ret = -EINVAL;
+	if (dev->umem->iova != iova || size != dev->domain->bounce_size)
+		goto unlock;
+
+	vduse_domain_remove_user_bounce_pages(dev->domain);
+	unpin_user_pages_dirty_lock(dev->umem->pages,
+				    dev->umem->npages, true);
+	atomic64_sub(dev->umem->npages, &dev->umem->mm->pinned_vm);
+	mmdrop(dev->umem->mm);
+	vfree(dev->umem->pages);
+	kfree(dev->umem);
+	dev->umem = NULL;
+	ret = 0;
+unlock:
+	mutex_unlock(&dev->mem_lock);
+	return ret;
+}
+
+static int vduse_dev_reg_umem(struct vduse_dev *dev,
+			      u64 iova, u64 uaddr, u64 size)
+{
+	struct page **page_list = NULL;
+	struct vduse_umem *umem = NULL;
+	long pinned = 0;
+	unsigned long npages, lock_limit;
+	int ret;
+
+	if (!dev->domain->bounce_map ||
+	    size != dev->domain->bounce_size ||
+	    iova != 0 || uaddr & ~PAGE_MASK)
+		return -EINVAL;
+
+	mutex_lock(&dev->mem_lock);
+	ret = -EEXIST;
+	if (dev->umem)
+		goto unlock;
+
+	ret = -ENOMEM;
+	npages = size >> PAGE_SHIFT;
+	page_list = __vmalloc(array_size(npages, sizeof(struct page *)),
+			      GFP_KERNEL_ACCOUNT);
+	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+	if (!page_list || !umem)
+		goto unlock;
+
+	mmap_read_lock(current->mm);
+
+	lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK));
+	if (npages + atomic64_read(&current->mm->pinned_vm) > lock_limit)
+		goto out;
+
+	pinned = pin_user_pages(uaddr, npages, FOLL_LONGTERM | FOLL_WRITE,
+				page_list, NULL);
+	if (pinned != npages) {
+		ret = pinned < 0 ? pinned : -ENOMEM;
+		goto out;
+	}
+
+	ret = vduse_domain_add_user_bounce_pages(dev->domain,
+						 page_list, pinned);
+	if (ret)
+		goto out;
+
+	atomic64_add(npages, &current->mm->pinned_vm);
+
+	umem->pages = page_list;
+	umem->npages = pinned;
+	umem->iova = iova;
+	umem->mm = current->mm;
+	mmgrab(current->mm);
+
+	dev->umem = umem;
+out:
+	if (ret && pinned > 0)
+		unpin_user_pages(page_list, pinned);
+
+	mmap_read_unlock(current->mm);
+unlock:
+	if (ret) {
+		vfree(page_list);
+		kfree(umem);
+	}
+	mutex_unlock(&dev->mem_lock);
+	return ret;
+}
+
 static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
 			    unsigned long arg)
 {
@@ -1089,6 +1196,38 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
 		ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index].inject);
 		break;
 	}
+	case VDUSE_IOTLB_REG_UMEM: {
+		struct vduse_iova_umem umem;
+
+		ret = -EFAULT;
+		if (copy_from_user(&umem, argp, sizeof(umem)))
+			break;
+
+		ret = -EINVAL;
+		if (!is_mem_zero((const char *)umem.reserved,
+				 sizeof(umem.reserved)))
+			break;
+
+		ret = vduse_dev_reg_umem(dev, umem.iova,
+					 umem.uaddr, umem.size);
+		break;
+	}
+	case VDUSE_IOTLB_DEREG_UMEM: {
+		struct vduse_iova_umem umem;
+
+		ret = -EFAULT;
+		if (copy_from_user(&umem, argp, sizeof(umem)))
+			break;
+
+		ret = -EINVAL;
+		if (!is_mem_zero((const char *)umem.reserved,
+				 sizeof(umem.reserved)))
+			break;
+
+		ret = vduse_dev_dereg_umem(dev, umem.iova,
+					   umem.size);
+		break;
+	}
 	default:
 		ret = -ENOIOCTLCMD;
 		break;
@@ -1101,6 +1240,7 @@ static int vduse_dev_release(struct inode *inode, struct file *file)
 {
 	struct vduse_dev *dev = file->private_data;
 
+	vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size);
 	spin_lock(&dev->msg_lock);
 	/* Make sure the inflight messages can processed after reconncection */
 	list_splice_init(&dev->recv_list, &dev->send_list);
@@ -1163,6 +1303,7 @@ static struct vduse_dev *vduse_dev_create(void)
 		return NULL;
 
 	mutex_init(&dev->lock);
+	mutex_init(&dev->mem_lock);
 	spin_lock_init(&dev->msg_lock);
 	INIT_LIST_HEAD(&dev->send_list);
 	INIT_LIST_HEAD(&dev->recv_list);
diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h
index 7cfe1c1280c0..9885e0571f09 100644
--- a/include/uapi/linux/vduse.h
+++ b/include/uapi/linux/vduse.h
@@ -210,6 +210,29 @@ struct vduse_vq_eventfd {
  */
 #define VDUSE_VQ_INJECT_IRQ	_IOW(VDUSE_BASE, 0x17, __u32)
 
+/**
+ * struct vduse_iova_umem - userspace memory configuration for one IOVA region
+ * @uaddr: start address of userspace memory, it must be aligned to page size
+ * @iova: start of the IOVA region
+ * @size: size of the IOVA region
+ * @reserved: for future use, needs to be initialized to zero
+ *
+ * Structure used by VDUSE_IOTLB_REG_UMEM and VDUSE_IOTLB_DEREG_UMEM
+ * ioctls to register/de-register userspace memory for IOVA regions
+ */
+struct vduse_iova_umem {
+	__u64 uaddr;
+	__u64 iova;
+	__u64 size;
+	__u64 reserved[3];
+};
+
+/* Register userspace memory for IOVA regions */
+#define VDUSE_IOTLB_REG_UMEM	_IOW(VDUSE_BASE, 0x18, struct vduse_iova_umem)
+
+/* De-register the userspace memory. Caller should set iova and size field. */
+#define VDUSE_IOTLB_DEREG_UMEM	_IOW(VDUSE_BASE, 0x19, struct vduse_iova_umem)
+
 /* The control messages definition for read(2)/write(2) on /dev/vduse/$NAME */
 
 /**
-- 
cgit 


From ad146355bfad627bd0717ece73997c6c93b1b82e Mon Sep 17 00:00:00 2001
From: Xie Yongji <xieyongji@bytedance.com>
Date: Wed, 3 Aug 2022 12:55:23 +0800
Subject: vduse: Support querying information of IOVA regions

This introduces a new ioctl: VDUSE_IOTLB_GET_INFO to
support querying some information of IOVA regions.

Now it can be used to query whether the IOVA region
supports userspace memory registration.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
Message-Id: <20220803045523.23851-6-xieyongji@bytedance.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vdpa/vdpa_user/vduse_dev.c | 39 ++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/vduse.h         | 24 +++++++++++++++++++++++
 2 files changed, 63 insertions(+)

(limited to 'include')

diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index eedff0a3885a..41c0b29739f1 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -1228,6 +1228,45 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
 					   umem.size);
 		break;
 	}
+	case VDUSE_IOTLB_GET_INFO: {
+		struct vduse_iova_info info;
+		struct vhost_iotlb_map *map;
+		struct vduse_iova_domain *domain = dev->domain;
+
+		ret = -EFAULT;
+		if (copy_from_user(&info, argp, sizeof(info)))
+			break;
+
+		ret = -EINVAL;
+		if (info.start > info.last)
+			break;
+
+		if (!is_mem_zero((const char *)info.reserved,
+				 sizeof(info.reserved)))
+			break;
+
+		spin_lock(&domain->iotlb_lock);
+		map = vhost_iotlb_itree_first(domain->iotlb,
+					      info.start, info.last);
+		if (map) {
+			info.start = map->start;
+			info.last = map->last;
+			info.capability = 0;
+			if (domain->bounce_map && map->start == 0 &&
+			    map->last == domain->bounce_size - 1)
+				info.capability |= VDUSE_IOVA_CAP_UMEM;
+		}
+		spin_unlock(&domain->iotlb_lock);
+		if (!map)
+			break;
+
+		ret = -EFAULT;
+		if (copy_to_user(argp, &info, sizeof(info)))
+			break;
+
+		ret = 0;
+		break;
+	}
 	default:
 		ret = -ENOIOCTLCMD;
 		break;
diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h
index 9885e0571f09..11bd48c72c6c 100644
--- a/include/uapi/linux/vduse.h
+++ b/include/uapi/linux/vduse.h
@@ -233,6 +233,30 @@ struct vduse_iova_umem {
 /* De-register the userspace memory. Caller should set iova and size field. */
 #define VDUSE_IOTLB_DEREG_UMEM	_IOW(VDUSE_BASE, 0x19, struct vduse_iova_umem)
 
+/**
+ * struct vduse_iova_info - information of one IOVA region
+ * @start: start of the IOVA region
+ * @last: last of the IOVA region
+ * @capability: capability of the IOVA regsion
+ * @reserved: for future use, needs to be initialized to zero
+ *
+ * Structure used by VDUSE_IOTLB_GET_INFO ioctl to get information of
+ * one IOVA region.
+ */
+struct vduse_iova_info {
+	__u64 start;
+	__u64 last;
+#define VDUSE_IOVA_CAP_UMEM (1 << 0)
+	__u64 capability;
+	__u64 reserved[3];
+};
+
+/*
+ * Find the first IOVA region that overlaps with the range [start, last]
+ * and return some information on it. Caller should set start and last fields.
+ */
+#define VDUSE_IOTLB_GET_INFO	_IOWR(VDUSE_BASE, 0x1a, struct vduse_iova_info)
+
 /* The control messages definition for read(2)/write(2) on /dev/vduse/$NAME */
 
 /**
-- 
cgit 


From cae15c2ed8e6e058bd5e32de292ab7982640161e Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Thu, 14 Jul 2022 14:39:26 +0300
Subject: vdpa/mlx5: Implement susupend virtqueue callback

Implement the suspend callback allowing to suspend the virtqueues so
they stop processing descriptors. This is required to allow to query a
consistent state of the virtqueue while live migration is taking place.

Signed-off-by: Eli Cohen <elic@nvidia.com>
Message-Id: <20220714113927.85729-2-elic@nvidia.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 83 ++++++++++++++++++++++++++++++++++++--
 include/linux/mlx5/mlx5_ifc_vdpa.h |  8 ++++
 2 files changed, 88 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 99bbbf38c8b1..a476e06476f6 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -164,6 +164,7 @@ struct mlx5_vdpa_net {
 	bool setup;
 	u32 cur_num_vqs;
 	u32 rqt_size;
+	bool nb_registered;
 	struct notifier_block nb;
 	struct vdpa_callback config_cb;
 	struct mlx5_vdpa_wq_ent cvq_ent;
@@ -895,6 +896,7 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque
 	if (err)
 		goto err_cmd;
 
+	mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT;
 	kfree(in);
 	mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
 
@@ -922,6 +924,7 @@ static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtq
 		mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id);
 		return;
 	}
+	mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE;
 	umems_destroy(ndev, mvq);
 }
 
@@ -1121,6 +1124,20 @@ err_cmd:
 	return err;
 }
 
+static bool is_valid_state_change(int oldstate, int newstate)
+{
+	switch (oldstate) {
+	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT:
+		return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY;
+	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY:
+		return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND;
+	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND:
+	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_ERR:
+	default:
+		return false;
+	}
+}
+
 static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state)
 {
 	int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in);
@@ -1130,6 +1147,12 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque
 	void *in;
 	int err;
 
+	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_NONE)
+		return 0;
+
+	if (!is_valid_state_change(mvq->fw_state, state))
+		return -EINVAL;
+
 	in = kzalloc(inlen, GFP_KERNEL);
 	if (!in)
 		return -ENOMEM;
@@ -1992,6 +2015,7 @@ static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready
 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
 	struct mlx5_vdpa_virtqueue *mvq;
+	int err;
 
 	if (!mvdev->actual_features)
 		return;
@@ -2005,8 +2029,16 @@ static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready
 	}
 
 	mvq = &ndev->vqs[idx];
-	if (!ready)
+	if (!ready) {
 		suspend_vq(ndev, mvq);
+	} else {
+		err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
+		if (err) {
+			mlx5_vdpa_warn(mvdev, "modify VQ %d to ready failed (%d)\n", idx, err);
+			ready = false;
+		}
+	}
+
 
 	mvq->ready = ready;
 }
@@ -2733,6 +2765,37 @@ out_err:
 	return err;
 }
 
+static void mlx5_vdpa_cvq_suspend(struct mlx5_vdpa_dev *mvdev)
+{
+	struct mlx5_control_vq *cvq;
+
+	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
+		return;
+
+	cvq = &mvdev->cvq;
+	cvq->ready = false;
+}
+
+static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	struct mlx5_vdpa_virtqueue *mvq;
+	int i;
+
+	down_write(&ndev->reslock);
+	mlx5_notifier_unregister(mvdev->mdev, &ndev->nb);
+	ndev->nb_registered = false;
+	flush_workqueue(ndev->mvdev.wq);
+	for (i = 0; i < ndev->cur_num_vqs; i++) {
+		mvq = &ndev->vqs[i];
+		suspend_vq(ndev, mvq);
+	}
+	mlx5_vdpa_cvq_suspend(mvdev);
+	up_write(&ndev->reslock);
+	return 0;
+}
+
 static const struct vdpa_config_ops mlx5_vdpa_ops = {
 	.set_vq_address = mlx5_vdpa_set_vq_address,
 	.set_vq_num = mlx5_vdpa_set_vq_num,
@@ -2763,6 +2826,7 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = {
 	.get_generation = mlx5_vdpa_get_generation,
 	.set_map = mlx5_vdpa_set_map,
 	.free = mlx5_vdpa_free,
+	.suspend = mlx5_vdpa_suspend,
 };
 
 static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
@@ -2828,6 +2892,7 @@ static void init_mvqs(struct mlx5_vdpa_net *ndev)
 		mvq->index = i;
 		mvq->ndev = ndev;
 		mvq->fwqp.fw = true;
+		mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE;
 	}
 	for (; i < ndev->mvdev.max_vqs; i++) {
 		mvq = &ndev->vqs[i];
@@ -2902,13 +2967,21 @@ static int event_handler(struct notifier_block *nb, unsigned long event, void *p
 		switch (eqe->sub_type) {
 		case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
 		case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
+			down_read(&ndev->reslock);
+			if (!ndev->nb_registered) {
+				up_read(&ndev->reslock);
+				return NOTIFY_DONE;
+			}
 			wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC);
-			if (!wqent)
+			if (!wqent) {
+				up_read(&ndev->reslock);
 				return NOTIFY_DONE;
+			}
 
 			wqent->mvdev = &ndev->mvdev;
 			INIT_WORK(&wqent->work, update_carrier);
 			queue_work(ndev->mvdev.wq, &wqent->work);
+			up_read(&ndev->reslock);
 			ret = NOTIFY_OK;
 			break;
 		default:
@@ -3062,6 +3135,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
 
 	ndev->nb.notifier_call = event_handler;
 	mlx5_notifier_register(mdev, &ndev->nb);
+	ndev->nb_registered = true;
 	mvdev->vdev.mdev = &mgtdev->mgtdev;
 	err = _vdpa_register_device(&mvdev->vdev, max_vqs + 1);
 	if (err)
@@ -3093,7 +3167,10 @@ static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *
 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
 	struct workqueue_struct *wq;
 
-	mlx5_notifier_unregister(mvdev->mdev, &ndev->nb);
+	if (ndev->nb_registered) {
+		mlx5_notifier_unregister(mvdev->mdev, &ndev->nb);
+		ndev->nb_registered = false;
+	}
 	wq = mvdev->wq;
 	mvdev->wq = NULL;
 	destroy_workqueue(wq);
diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h
index 4414ed5b6ed2..9becdc3fa503 100644
--- a/include/linux/mlx5/mlx5_ifc_vdpa.h
+++ b/include/linux/mlx5/mlx5_ifc_vdpa.h
@@ -150,6 +150,14 @@ enum {
 	MLX5_VIRTIO_NET_Q_OBJECT_STATE_ERR      = 0x3,
 };
 
+/* This indicates that the object was not created or has already
+ * been desroyed. It is very safe to assume that this object will never
+ * have so many states
+ */
+enum {
+	MLX5_VIRTIO_NET_Q_OBJECT_NONE = 0xffffffff
+};
+
 enum {
 	MLX5_RQTC_LIST_Q_TYPE_RQ            = 0x0,
 	MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q  = 0x1,
-- 
cgit 


From 848ecea184e1253758423b37cbfc1ed732ccf5b4 Mon Sep 17 00:00:00 2001
From: Eugenio Pérez <eperezma@redhat.com>
Date: Wed, 10 Aug 2022 19:15:09 +0200
Subject: vdpa: Add suspend operation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This operation is optional: It it's not implemented, backend feature bit
will not be exposed.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Message-Id: <20220810171512.2343333-2-eperezma@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/vdpa.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 7b4a13d3bd91..d282f464d2f1 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -218,6 +218,9 @@ struct vdpa_map_file {
  * @reset:			Reset device
  *				@vdev: vdpa device
  *				Returns integer: success (0) or error (< 0)
+ * @suspend:			Suspend or resume the device (optional)
+ *				@vdev: vdpa device
+ *				Returns integer: success (0) or error (< 0)
  * @get_config_size:		Get the size of the configuration space includes
  *				fields that are conditional on feature bits.
  *				@vdev: vdpa device
@@ -319,6 +322,7 @@ struct vdpa_config_ops {
 	u8 (*get_status)(struct vdpa_device *vdev);
 	void (*set_status)(struct vdpa_device *vdev, u8 status);
 	int (*reset)(struct vdpa_device *vdev);
+	int (*suspend)(struct vdpa_device *vdev);
 	size_t (*get_config_size)(struct vdpa_device *vdev);
 	void (*get_config)(struct vdpa_device *vdev, unsigned int offset,
 			   void *buf, unsigned int len);
-- 
cgit 


From 0723f1df5c3ec8a1112d150dab98e149361ef488 Mon Sep 17 00:00:00 2001
From: Eugenio Pérez <eperezma@redhat.com>
Date: Wed, 10 Aug 2022 19:15:10 +0200
Subject: vhost-vdpa: introduce SUSPEND backend feature bit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Userland knows if it can suspend the device or not by checking this feature
bit.

It's only offered if the vdpa driver backend implements the suspend()
operation callback, and to offer it or userland to ack it if the backend
does not offer that callback is an error.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Message-Id: <20220810171512.2343333-3-eperezma@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/vdpa.c             | 16 +++++++++++++++-
 include/uapi/linux/vhost_types.h |  2 ++
 2 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 2c997d77d266..092752fea8e1 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -347,6 +347,14 @@ static long vhost_vdpa_set_config(struct vhost_vdpa *v,
 	return 0;
 }
 
+static bool vhost_vdpa_can_suspend(const struct vhost_vdpa *v)
+{
+	struct vdpa_device *vdpa = v->vdpa;
+	const struct vdpa_config_ops *ops = vdpa->config;
+
+	return ops->suspend;
+}
+
 static long vhost_vdpa_get_features(struct vhost_vdpa *v, u64 __user *featurep)
 {
 	struct vdpa_device *vdpa = v->vdpa;
@@ -577,7 +585,11 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep,
 	if (cmd == VHOST_SET_BACKEND_FEATURES) {
 		if (copy_from_user(&features, featurep, sizeof(features)))
 			return -EFAULT;
-		if (features & ~VHOST_VDPA_BACKEND_FEATURES)
+		if (features & ~(VHOST_VDPA_BACKEND_FEATURES |
+				 BIT_ULL(VHOST_BACKEND_F_SUSPEND)))
+			return -EOPNOTSUPP;
+		if ((features & BIT_ULL(VHOST_BACKEND_F_SUSPEND)) &&
+		     !vhost_vdpa_can_suspend(v))
 			return -EOPNOTSUPP;
 		vhost_set_backend_features(&v->vdev, features);
 		return 0;
@@ -628,6 +640,8 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep,
 		break;
 	case VHOST_GET_BACKEND_FEATURES:
 		features = VHOST_VDPA_BACKEND_FEATURES;
+		if (vhost_vdpa_can_suspend(v))
+			features |= BIT_ULL(VHOST_BACKEND_F_SUSPEND);
 		if (copy_to_user(featurep, &features, sizeof(features)))
 			r = -EFAULT;
 		break;
diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
index 634cee485abb..1bdd6e363f4c 100644
--- a/include/uapi/linux/vhost_types.h
+++ b/include/uapi/linux/vhost_types.h
@@ -161,5 +161,7 @@ struct vhost_vdpa_iova_range {
  * message
  */
 #define VHOST_BACKEND_F_IOTLB_ASID  0x3
+/* Device can be suspended */
+#define VHOST_BACKEND_F_SUSPEND  0x4
 
 #endif
-- 
cgit 


From f345a0143b4dd1cfc850009c6979a3801b86a06f Mon Sep 17 00:00:00 2001
From: Eugenio Pérez <eperezma@redhat.com>
Date: Wed, 10 Aug 2022 19:15:11 +0200
Subject: vhost-vdpa: uAPI to suspend the device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ioctl adds support for suspending the device from userspace.

This is a must before getting virtqueue indexes (base) for live migration,
since the device could modify them after userland gets them. There are
individual ways to perform that action for some devices
(VHOST_NET_SET_BACKEND, VHOST_VSOCK_SET_RUNNING, ...) but there was no
way to perform it for any vhost device (and, in particular, vhost-vdpa).

After a successful return of the ioctl call the device must not process
more virtqueue descriptors. The device can answer to read or writes of
config fields as if it were not suspended. In particular, writing to
"queue_enable" with a value of 1 will not make the device start
processing buffers of the virtqueue.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Message-Id: <20220810171512.2343333-4-eperezma@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/vdpa.c       | 19 +++++++++++++++++++
 include/uapi/linux/vhost.h |  9 +++++++++
 2 files changed, 28 insertions(+)

(limited to 'include')

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 092752fea8e1..166044642fd5 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -478,6 +478,22 @@ static long vhost_vdpa_get_vqs_count(struct vhost_vdpa *v, u32 __user *argp)
 	return 0;
 }
 
+/* After a successful return of ioctl the device must not process more
+ * virtqueue descriptors. The device can answer to read or writes of config
+ * fields as if it were not suspended. In particular, writing to "queue_enable"
+ * with a value of 1 will not make the device start processing buffers.
+ */
+static long vhost_vdpa_suspend(struct vhost_vdpa *v)
+{
+	struct vdpa_device *vdpa = v->vdpa;
+	const struct vdpa_config_ops *ops = vdpa->config;
+
+	if (!ops->suspend)
+		return -EOPNOTSUPP;
+
+	return ops->suspend(vdpa);
+}
+
 static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
 				   void __user *argp)
 {
@@ -654,6 +670,9 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep,
 	case VHOST_VDPA_GET_VQS_COUNT:
 		r = vhost_vdpa_get_vqs_count(v, argp);
 		break;
+	case VHOST_VDPA_SUSPEND:
+		r = vhost_vdpa_suspend(v);
+		break;
 	default:
 		r = vhost_dev_ioctl(&v->vdev, cmd, argp);
 		if (r == -ENOIOCTLCMD)
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index cab645d4a645..f9f115a7c75b 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -171,4 +171,13 @@
 #define VHOST_VDPA_SET_GROUP_ASID	_IOW(VHOST_VIRTIO, 0x7C, \
 					     struct vhost_vring_state)
 
+/* Suspend a device so it does not process virtqueue requests anymore
+ *
+ * After the return of ioctl the device must preserve all the necessary state
+ * (the virtqueue vring base plus the possible device specific states) that is
+ * required for restoring in the future. The device must not change its
+ * configuration after that point.
+ */
+#define VHOST_VDPA_SUSPEND		_IO(VHOST_VIRTIO, 0x7D)
+
 #endif
-- 
cgit 


From 0b2f3212b551a87fe936701fa0813032861a3308 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 5 Aug 2022 10:59:57 +0200
Subject: netfilter: nfnetlink: re-enable conntrack expectation events

To avoid allocation of the conntrack extension area when possible,
the default behaviour was changed to only allocate the event extension
if a userspace program is subscribed to a notification group.

Problem is that while 'conntrack -E' does enable the event allocation
behind the scenes, 'conntrack -E expect' does not: no expectation events
are delivered unless user sets
"net.netfilter.nf_conntrack_events" back to 1 (always on).

Fix the autodetection to also consider EXP type group.

We need to track the 6 event groups (3+3, new/update/destroy for events and
for expectations each) independently, else we'd disable events again
if an expectation group becomes empty while there is still an active
event group.

Fixes: 2794cdb0b97b ("netfilter: nfnetlink: allow to detect if ctnetlink listeners exist")
Reported-by: Yi Chen <yiche@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/net/netns/conntrack.h |  2 +-
 net/netfilter/nfnetlink.c     | 83 ++++++++++++++++++++++++++++++++++++-------
 2 files changed, 72 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 0677cd3de034..c396a3862e80 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -95,7 +95,7 @@ struct nf_ip_net {
 
 struct netns_ct {
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
-	bool ctnetlink_has_listener;
+	u8 ctnetlink_has_listener;
 	bool ecache_dwork_pending;
 #endif
 	u8			sysctl_log_invalid; /* Log invalid packets */
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index c24b1240908f..9c44518cb70f 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -44,6 +44,10 @@ MODULE_DESCRIPTION("Netfilter messages via netlink socket");
 
 static unsigned int nfnetlink_pernet_id __read_mostly;
 
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+static DEFINE_SPINLOCK(nfnl_grp_active_lock);
+#endif
+
 struct nfnl_net {
 	struct sock *nfnl;
 };
@@ -654,6 +658,44 @@ static void nfnetlink_rcv(struct sk_buff *skb)
 		netlink_rcv_skb(skb, nfnetlink_rcv_msg);
 }
 
+static void nfnetlink_bind_event(struct net *net, unsigned int group)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+	int type, group_bit;
+	u8 v;
+
+	/* All NFNLGRP_CONNTRACK_* group bits fit into u8.
+	 * The other groups are not relevant and can be ignored.
+	 */
+	if (group >= 8)
+		return;
+
+	type = nfnl_group2type[group];
+
+	switch (type) {
+	case NFNL_SUBSYS_CTNETLINK:
+		break;
+	case NFNL_SUBSYS_CTNETLINK_EXP:
+		break;
+	default:
+		return;
+	}
+
+	group_bit = (1 << group);
+
+	spin_lock(&nfnl_grp_active_lock);
+	v = READ_ONCE(net->ct.ctnetlink_has_listener);
+	if ((v & group_bit) == 0) {
+		v |= group_bit;
+
+		/* read concurrently without nfnl_grp_active_lock held. */
+		WRITE_ONCE(net->ct.ctnetlink_has_listener, v);
+	}
+
+	spin_unlock(&nfnl_grp_active_lock);
+#endif
+}
+
 static int nfnetlink_bind(struct net *net, int group)
 {
 	const struct nfnetlink_subsystem *ss;
@@ -670,28 +712,45 @@ static int nfnetlink_bind(struct net *net, int group)
 	if (!ss)
 		request_module_nowait("nfnetlink-subsys-%d", type);
 
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
-	if (type == NFNL_SUBSYS_CTNETLINK) {
-		nfnl_lock(NFNL_SUBSYS_CTNETLINK);
-		WRITE_ONCE(net->ct.ctnetlink_has_listener, true);
-		nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
-	}
-#endif
+	nfnetlink_bind_event(net, group);
 	return 0;
 }
 
 static void nfnetlink_unbind(struct net *net, int group)
 {
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
+	int type, group_bit;
+
 	if (group <= NFNLGRP_NONE || group > NFNLGRP_MAX)
 		return;
 
-	if (nfnl_group2type[group] == NFNL_SUBSYS_CTNETLINK) {
-		nfnl_lock(NFNL_SUBSYS_CTNETLINK);
-		if (!nfnetlink_has_listeners(net, group))
-			WRITE_ONCE(net->ct.ctnetlink_has_listener, false);
-		nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
+	type = nfnl_group2type[group];
+
+	switch (type) {
+	case NFNL_SUBSYS_CTNETLINK:
+		break;
+	case NFNL_SUBSYS_CTNETLINK_EXP:
+		break;
+	default:
+		return;
+	}
+
+	/* ctnetlink_has_listener is u8 */
+	if (group >= 8)
+		return;
+
+	group_bit = (1 << group);
+
+	spin_lock(&nfnl_grp_active_lock);
+	if (!nfnetlink_has_listeners(net, group)) {
+		u8 v = READ_ONCE(net->ct.ctnetlink_has_listener);
+
+		v &= ~group_bit;
+
+		/* read concurrently without nfnl_grp_active_lock held. */
+		WRITE_ONCE(net->ct.ctnetlink_has_listener, v);
 	}
+	spin_unlock(&nfnl_grp_active_lock);
 #endif
 }
 
-- 
cgit 


From 5c221f0af68cfa9edcffd26ba6dbbc4b7ddb1700 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 9 Aug 2022 16:20:12 -0700
Subject: net: add missing kdoc for struct genl_multicast_group::flags

Multicast group flags were added in commit 4d54cc32112d ("mptcp: avoid
lock_fast usage in accept path"), but it missed adding the kdoc.

Mention which flags go into that field, and do the same for
op structs.

Link: https://lore.kernel.org/r/20220809232012.403730-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/genetlink.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 7cb3fa8310ed..56a50e1c51b9 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -11,6 +11,7 @@
 /**
  * struct genl_multicast_group - generic netlink multicast group
  * @name: name of the multicast group, names are per-family
+ * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM)
  */
 struct genl_multicast_group {
 	char			name[GENL_NAMSIZ];
@@ -116,7 +117,7 @@ enum genl_validate_flags {
  * struct genl_small_ops - generic netlink operations (small version)
  * @cmd: command identifier
  * @internal_flags: flags used by the family
- * @flags: flags
+ * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM)
  * @validate: validation flags from enum genl_validate_flags
  * @doit: standard command callback
  * @dumpit: callback for dumpers
@@ -137,7 +138,7 @@ struct genl_small_ops {
  * struct genl_ops - generic netlink operations
  * @cmd: command identifier
  * @internal_flags: flags used by the family
- * @flags: flags
+ * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM)
  * @maxattr: maximum number of attributes supported
  * @policy: netlink policy (takes precedence over family policy)
  * @validate: validation flags from enum genl_validate_flags
-- 
cgit 


From c2e75634cbe368065f140dd30bf8b1a0355158fd Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 10 Aug 2022 09:45:47 -0700
Subject: net: atm: bring back zatm uAPI

Jiri reports that linux-atm does not build without this header.
Bring it back. It's completely dead code but we can't break
the build for user space :(

Reported-by: Jiri Slaby <jirislaby@kernel.org>
Fixes: 052e1f01bfae ("net: atm: remove support for ZeitNet ZN122x ATM devices")
Link: https://lore.kernel.org/all/8576aef3-37e4-8bae-bab5-08f82a78efd3@kernel.org/
Link: https://lore.kernel.org/r/20220810164547.484378-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/atm_zatm.h | 47 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 include/uapi/linux/atm_zatm.h

(limited to 'include')

diff --git a/include/uapi/linux/atm_zatm.h b/include/uapi/linux/atm_zatm.h
new file mode 100644
index 000000000000..5135027b93c1
--- /dev/null
+++ b/include/uapi/linux/atm_zatm.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* atm_zatm.h - Driver-specific declarations of the ZATM driver (for use by
+		driver-specific utilities) */
+
+/* Written 1995-1999 by Werner Almesberger, EPFL LRC/ICA */
+
+
+#ifndef LINUX_ATM_ZATM_H
+#define LINUX_ATM_ZATM_H
+
+/*
+ * Note: non-kernel programs including this file must also include
+ * sys/types.h for struct timeval
+ */
+
+#include <linux/atmapi.h>
+#include <linux/atmioc.h>
+
+#define ZATM_GETPOOL	_IOW('a',ATMIOC_SARPRV+1,struct atmif_sioc)
+						/* get pool statistics */
+#define ZATM_GETPOOLZ	_IOW('a',ATMIOC_SARPRV+2,struct atmif_sioc)
+						/* get statistics and zero */
+#define ZATM_SETPOOL	_IOW('a',ATMIOC_SARPRV+3,struct atmif_sioc)
+						/* set pool parameters */
+
+struct zatm_pool_info {
+	int ref_count;			/* free buffer pool usage counters */
+	int low_water,high_water;	/* refill parameters */
+	int rqa_count,rqu_count;	/* queue condition counters */
+	int offset,next_off;		/* alignment optimizations: offset */
+	int next_cnt,next_thres;	/* repetition counter and threshold */
+};
+
+struct zatm_pool_req {
+	int pool_num;			/* pool number */
+	struct zatm_pool_info info;	/* actual information */
+};
+
+#define ZATM_OAM_POOL		0	/* free buffer pool for OAM cells */
+#define ZATM_AAL0_POOL		1	/* free buffer pool for AAL0 cells */
+#define ZATM_AAL5_POOL_BASE	2	/* first AAL5 free buffer pool */
+#define ZATM_LAST_POOL	ZATM_AAL5_POOL_BASE+10 /* max. 64 kB */
+
+#define ZATM_TIMER_HISTORY_SIZE	16	/* number of timer adjustments to
+					   record; must be 2^n */
+
+#endif
-- 
cgit 


From b1c3497e604ddccea5af4071831ed0e4680fb35e Mon Sep 17 00:00:00 2001
From: Jane Malalane <jane.malalane@citrix.com>
Date: Fri, 29 Jul 2022 08:04:16 +0100
Subject: x86/xen: Add support for HVMOP_set_evtchn_upcall_vector
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement support for the HVMOP_set_evtchn_upcall_vector hypercall in
order to set the per-vCPU event channel vector callback on Linux and
use it in preference of HVM_PARAM_CALLBACK_IRQ.

If the per-VCPU vector setup is successful on BSP, use this method
for the APs. If not, fallback to the global vector-type callback.

Also register callback_irq at per-vCPU event channel setup to trick
toolstack to think the domain is enlightened.

Suggested-by: "Roger Pau Monné" <roger.pau@citrix.com>
Signed-off-by: Jane Malalane <jane.malalane@citrix.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Link: https://lore.kernel.org/r/20220729070416.23306-1-jane.malalane@citrix.com
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 arch/x86/include/asm/xen/cpuid.h   |  2 ++
 arch/x86/include/asm/xen/events.h  |  3 ++-
 arch/x86/xen/enlighten.c           |  2 +-
 arch/x86/xen/enlighten_hvm.c       | 24 ++++++++++++-----
 arch/x86/xen/suspend_hvm.c         | 10 ++++++-
 drivers/xen/events/events_base.c   | 53 +++++++++++++++++++++++++++++++++-----
 include/xen/hvm.h                  |  2 ++
 include/xen/interface/hvm/hvm_op.h | 19 ++++++++++++++
 8 files changed, 100 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/xen/cpuid.h b/arch/x86/include/asm/xen/cpuid.h
index 78e667a31d6c..6daa9b0c8d11 100644
--- a/arch/x86/include/asm/xen/cpuid.h
+++ b/arch/x86/include/asm/xen/cpuid.h
@@ -107,6 +107,8 @@
  * ID field from 8 to 15 bits, allowing to target APIC IDs up 32768.
  */
 #define XEN_HVM_CPUID_EXT_DEST_ID      (1u << 5)
+/* Per-vCPU event channel upcalls */
+#define XEN_HVM_CPUID_UPCALL_VECTOR    (1u << 6)
 
 /*
  * Leaf 6 (0x40000x05)
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
index 068d9b067c83..62bdceb594f1 100644
--- a/arch/x86/include/asm/xen/events.h
+++ b/arch/x86/include/asm/xen/events.h
@@ -23,7 +23,7 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
 /* No need for a barrier -- XCHG is a barrier on x86. */
 #define xchg_xen_ulong(ptr, val) xchg((ptr), (val))
 
-extern int xen_have_vector_callback;
+extern bool xen_have_vector_callback;
 
 /*
  * Events delivered via platform PCI interrupts are always
@@ -34,4 +34,5 @@ static inline bool xen_support_evtchn_rebind(void)
 	return (!xen_hvm_domain() || xen_have_vector_callback);
 }
 
+extern bool xen_percpu_upcall;
 #endif /* _ASM_X86_XEN_EVENTS_H */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 30c6e986a6cd..b8db2148c07d 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -51,7 +51,7 @@ EXPORT_SYMBOL_GPL(xen_start_info);
 
 struct shared_info xen_dummy_shared_info;
 
-__read_mostly int xen_have_vector_callback;
+__read_mostly bool xen_have_vector_callback = true;
 EXPORT_SYMBOL_GPL(xen_have_vector_callback);
 
 /*
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index 28762f800596..1c1ac418484b 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -8,6 +8,8 @@
 
 #include <xen/features.h>
 #include <xen/events.h>
+#include <xen/hvm.h>
+#include <xen/interface/hvm/hvm_op.h>
 #include <xen/interface/memory.h>
 
 #include <asm/apic.h>
@@ -31,6 +33,9 @@
 
 static unsigned long shared_info_pfn;
 
+__ro_after_init bool xen_percpu_upcall;
+EXPORT_SYMBOL_GPL(xen_percpu_upcall);
+
 void xen_hvm_init_shared_info(void)
 {
 	struct xen_add_to_physmap xatp;
@@ -126,6 +131,9 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_xen_hvm_callback)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 
+	if (xen_percpu_upcall)
+		ack_APIC_irq();
+
 	inc_irq_stat(irq_hv_callback_count);
 
 	xen_hvm_evtchn_do_upcall();
@@ -169,6 +177,15 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu)
 	if (!xen_have_vector_callback)
 		return 0;
 
+	if (xen_percpu_upcall) {
+		rc = xen_set_upcall_vector(cpu);
+		if (rc) {
+			WARN(1, "HVMOP_set_evtchn_upcall_vector"
+			     " for CPU %d failed: %d\n", cpu, rc);
+			return rc;
+		}
+	}
+
 	if (xen_feature(XENFEAT_hvm_safe_pvclock))
 		xen_setup_timer(cpu);
 
@@ -189,8 +206,6 @@ static int xen_cpu_dead_hvm(unsigned int cpu)
 	return 0;
 }
 
-static bool no_vector_callback __initdata;
-
 static void __init xen_hvm_guest_init(void)
 {
 	if (xen_pv_domain())
@@ -213,9 +228,6 @@ static void __init xen_hvm_guest_init(void)
 
 	xen_panic_handler_init();
 
-	if (!no_vector_callback && xen_feature(XENFEAT_hvm_callback_vector))
-		xen_have_vector_callback = 1;
-
 	xen_hvm_smp_init();
 	WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_hvm, xen_cpu_dead_hvm));
 	xen_unplug_emulated_devices();
@@ -241,7 +253,7 @@ early_param("xen_nopv", xen_parse_nopv);
 
 static __init int xen_parse_no_vector_callback(char *arg)
 {
-	no_vector_callback = true;
+	xen_have_vector_callback = false;
 	return 0;
 }
 early_param("xen_no_vector_callback", xen_parse_no_vector_callback);
diff --git a/arch/x86/xen/suspend_hvm.c b/arch/x86/xen/suspend_hvm.c
index 9d548b0c772f..0c4f7554b7cc 100644
--- a/arch/x86/xen/suspend_hvm.c
+++ b/arch/x86/xen/suspend_hvm.c
@@ -5,6 +5,7 @@
 #include <xen/hvm.h>
 #include <xen/features.h>
 #include <xen/interface/features.h>
+#include <xen/events.h>
 
 #include "xen-ops.h"
 
@@ -14,6 +15,13 @@ void xen_hvm_post_suspend(int suspend_cancelled)
 		xen_hvm_init_shared_info();
 		xen_vcpu_restore();
 	}
-	xen_setup_callback_vector();
+	if (xen_percpu_upcall) {
+		unsigned int cpu;
+
+		for_each_online_cpu(cpu)
+			BUG_ON(xen_set_upcall_vector(cpu));
+	} else {
+		xen_setup_callback_vector();
+	}
 	xen_unplug_emulated_devices();
 }
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index 46d9295d9a6e..206d4b466e44 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -45,6 +45,7 @@
 #include <asm/irq.h>
 #include <asm/io_apic.h>
 #include <asm/i8259.h>
+#include <asm/xen/cpuid.h>
 #include <asm/xen/pci.h>
 #endif
 #include <asm/sync_bitops.h>
@@ -2183,6 +2184,7 @@ static struct irq_chip xen_percpu_chip __read_mostly = {
 	.irq_ack		= ack_dynirq,
 };
 
+#ifdef CONFIG_X86
 #ifdef CONFIG_XEN_PVHVM
 /* Vector callbacks are better than PCI interrupts to receive event
  * channel notifications because we can receive vector callbacks on any
@@ -2195,11 +2197,48 @@ void xen_setup_callback_vector(void)
 		callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR);
 		if (xen_set_callback_via(callback_via)) {
 			pr_err("Request for Xen HVM callback vector failed\n");
-			xen_have_vector_callback = 0;
+			xen_have_vector_callback = false;
 		}
 	}
 }
 
+/*
+ * Setup per-vCPU vector-type callbacks. If this setup is unavailable,
+ * fallback to the global vector-type callback.
+ */
+static __init void xen_init_setup_upcall_vector(void)
+{
+	if (!xen_have_vector_callback)
+		return;
+
+	if ((cpuid_eax(xen_cpuid_base() + 4) & XEN_HVM_CPUID_UPCALL_VECTOR) &&
+	    !xen_set_upcall_vector(0))
+		xen_percpu_upcall = true;
+	else if (xen_feature(XENFEAT_hvm_callback_vector))
+		xen_setup_callback_vector();
+	else
+		xen_have_vector_callback = false;
+}
+
+int xen_set_upcall_vector(unsigned int cpu)
+{
+	int rc;
+	xen_hvm_evtchn_upcall_vector_t op = {
+		.vector = HYPERVISOR_CALLBACK_VECTOR,
+		.vcpu = per_cpu(xen_vcpu_id, cpu),
+	};
+
+	rc = HYPERVISOR_hvm_op(HVMOP_set_evtchn_upcall_vector, &op);
+	if (rc)
+		return rc;
+
+	/* Trick toolstack to think we are enlightened. */
+	if (!cpu)
+		rc = xen_set_callback_via(1);
+
+	return rc;
+}
+
 static __init void xen_alloc_callback_vector(void)
 {
 	if (!xen_have_vector_callback)
@@ -2210,8 +2249,11 @@ static __init void xen_alloc_callback_vector(void)
 }
 #else
 void xen_setup_callback_vector(void) {}
+static inline void xen_init_setup_upcall_vector(void) {}
+int xen_set_upcall_vector(unsigned int cpu) {}
 static inline void xen_alloc_callback_vector(void) {}
-#endif
+#endif /* CONFIG_XEN_PVHVM */
+#endif /* CONFIG_X86 */
 
 bool xen_fifo_events = true;
 module_param_named(fifo_events, xen_fifo_events, bool, 0);
@@ -2271,10 +2313,9 @@ void __init xen_init_IRQ(void)
 		if (xen_initial_domain())
 			pci_xen_initial_domain();
 	}
-	if (xen_feature(XENFEAT_hvm_callback_vector)) {
-		xen_setup_callback_vector();
-		xen_alloc_callback_vector();
-	}
+	xen_init_setup_upcall_vector();
+	xen_alloc_callback_vector();
+
 
 	if (xen_hvm_domain()) {
 		native_init_IRQ();
diff --git a/include/xen/hvm.h b/include/xen/hvm.h
index b7fd7fc9ad41..8da7a6747058 100644
--- a/include/xen/hvm.h
+++ b/include/xen/hvm.h
@@ -60,4 +60,6 @@ static inline int hvm_get_parameter(int idx, uint64_t *value)
 
 void xen_setup_callback_vector(void);
 
+int xen_set_upcall_vector(unsigned int cpu);
+
 #endif /* XEN_HVM_H__ */
diff --git a/include/xen/interface/hvm/hvm_op.h b/include/xen/interface/hvm/hvm_op.h
index f3097e79bb03..03134bf3cec1 100644
--- a/include/xen/interface/hvm/hvm_op.h
+++ b/include/xen/interface/hvm/hvm_op.h
@@ -46,4 +46,23 @@ struct xen_hvm_get_mem_type {
 };
 DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_get_mem_type);
 
+#if defined(__i386__) || defined(__x86_64__)
+
+/*
+ * HVMOP_set_evtchn_upcall_vector: Set a <vector> that should be used for event
+ *                                 channel upcalls on the specified <vcpu>. If set,
+ *                                 this vector will be used in preference to the
+ *                                 domain global callback via (see
+ *                                 HVM_PARAM_CALLBACK_IRQ).
+ */
+#define HVMOP_set_evtchn_upcall_vector 23
+struct xen_hvm_evtchn_upcall_vector {
+    uint32_t vcpu;
+    uint8_t vector;
+};
+typedef struct xen_hvm_evtchn_upcall_vector xen_hvm_evtchn_upcall_vector_t;
+DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_evtchn_upcall_vector_t);
+
+#endif /* defined(__i386__) || defined(__x86_64__) */
+
 #endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
-- 
cgit 


From addebd9ac9ca0ef8b3764907bf8018e48caffc64 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Fri, 12 Aug 2022 15:56:33 -0700
Subject: fs: don't randomize struct kiocb fields

This is a size sensitive structure and randomizing can introduce extra
padding that breaks io_uring's fixed size expectations. There are few
fields here as it is, half of which need a fixed order to optimally
pack, so the randomization isn't providing much.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/io-uring/b6f508ca-b1b2-5f40-7998-e4cff1cf7212@kernel.dk/
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/fs.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9f131e559d05..daf69a6504b6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -339,17 +339,12 @@ enum rw_hint {
 
 struct kiocb {
 	struct file		*ki_filp;
-
-	/* The 'ki_filp' pointer is shared in a union for aio */
-	randomized_struct_fields_start
-
 	loff_t			ki_pos;
 	void (*ki_complete)(struct kiocb *iocb, long ret);
 	void			*private;
 	int			ki_flags;
 	u16			ki_ioprio; /* See linux/ioprio.h */
 	struct wait_page_queue	*ki_waitq; /* for async buffered IO */
-	randomized_struct_fields_end
 };
 
 static inline bool is_sync_kiocb(struct kiocb *kiocb)
-- 
cgit 


From f2ccb5aed7bce1d8b3ed5b3385759a5509663028 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher <metze@samba.org>
Date: Thu, 11 Aug 2022 09:11:15 +0200
Subject: io_uring: make io_kiocb_to_cmd() typesafe

We need to make sure (at build time) that struct io_cmd_data is not
casted to a structure that's larger.

Signed-off-by: Stefan Metzmacher <metze@samba.org>
Link: https://lore.kernel.org/r/c024cdf25ae19fc0319d4180e2298bade8ed17b8.1660201408.git.metze@samba.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  9 ++++++++-
 io_uring/advise.c              |  8 ++++----
 io_uring/cancel.c              |  4 ++--
 io_uring/epoll.c               |  4 ++--
 io_uring/fs.c                  | 28 ++++++++++++++--------------
 io_uring/kbuf.c                |  8 ++++----
 io_uring/msg_ring.c            |  8 ++++----
 io_uring/net.c                 | 42 +++++++++++++++++++++---------------------
 io_uring/notif.c               |  2 --
 io_uring/notif.h               |  2 +-
 io_uring/openclose.c           | 16 ++++++++--------
 io_uring/poll.c                | 16 ++++++++--------
 io_uring/rsrc.c                | 10 +++++-----
 io_uring/rw.c                  | 28 ++++++++++++++--------------
 io_uring/splice.c              |  8 ++++----
 io_uring/statx.c               |  6 +++---
 io_uring/sync.c                | 12 ++++++------
 io_uring/timeout.c             | 26 +++++++++++++-------------
 io_uring/uring_cmd.c           |  8 ++++----
 io_uring/xattr.c               | 18 +++++++++---------
 20 files changed, 134 insertions(+), 129 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index f7fab3758cb9..677a25d44d7f 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -491,7 +491,14 @@ struct io_cmd_data {
 	__u8			data[56];
 };
 
-#define io_kiocb_to_cmd(req)	((void *) &(req)->cmd)
+static inline void io_kiocb_cmd_sz_check(size_t cmd_sz)
+{
+	BUILD_BUG_ON(cmd_sz > sizeof(struct io_cmd_data));
+}
+#define io_kiocb_to_cmd(req, cmd_type) ( \
+	io_kiocb_cmd_sz_check(sizeof(cmd_type)) , \
+	((cmd_type *)&(req)->cmd) \
+)
 #define cmd_to_io_kiocb(ptr)	((struct io_kiocb *) ptr)
 
 struct io_kiocb {
diff --git a/io_uring/advise.c b/io_uring/advise.c
index 581956934c0b..449c6f14649f 100644
--- a/io_uring/advise.c
+++ b/io_uring/advise.c
@@ -31,7 +31,7 @@ struct io_madvise {
 int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
-	struct io_madvise *ma = io_kiocb_to_cmd(req);
+	struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise);
 
 	if (sqe->buf_index || sqe->off || sqe->splice_fd_in)
 		return -EINVAL;
@@ -48,7 +48,7 @@ int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
 {
 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
-	struct io_madvise *ma = io_kiocb_to_cmd(req);
+	struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise);
 	int ret;
 
 	if (issue_flags & IO_URING_F_NONBLOCK)
@@ -64,7 +64,7 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
 
 int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_fadvise *fa = io_kiocb_to_cmd(req);
+	struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise);
 
 	if (sqe->buf_index || sqe->addr || sqe->splice_fd_in)
 		return -EINVAL;
@@ -77,7 +77,7 @@ int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_fadvise *fa = io_kiocb_to_cmd(req);
+	struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise);
 	int ret;
 
 	if (issue_flags & IO_URING_F_NONBLOCK) {
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 8435a1eba59a..e4e1dc0325f0 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -107,7 +107,7 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
 
 int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_cancel *cancel = io_kiocb_to_cmd(req);
+	struct io_cancel *cancel = io_kiocb_to_cmd(req, struct io_cancel);
 
 	if (unlikely(req->flags & REQ_F_BUFFER_SELECT))
 		return -EINVAL;
@@ -164,7 +164,7 @@ static int __io_async_cancel(struct io_cancel_data *cd,
 
 int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_cancel *cancel = io_kiocb_to_cmd(req);
+	struct io_cancel *cancel = io_kiocb_to_cmd(req, struct io_cancel);
 	struct io_cancel_data cd = {
 		.ctx	= req->ctx,
 		.data	= cancel->addr,
diff --git a/io_uring/epoll.c b/io_uring/epoll.c
index a8b794471d6b..9aa74d2c80bc 100644
--- a/io_uring/epoll.c
+++ b/io_uring/epoll.c
@@ -23,7 +23,7 @@ struct io_epoll {
 
 int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_epoll *epoll = io_kiocb_to_cmd(req);
+	struct io_epoll *epoll = io_kiocb_to_cmd(req, struct io_epoll);
 
 	pr_warn_once("%s: epoll_ctl support in io_uring is deprecated and will "
 		     "be removed in a future Linux kernel version.\n",
@@ -49,7 +49,7 @@ int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_epoll *ie = io_kiocb_to_cmd(req);
+	struct io_epoll *ie = io_kiocb_to_cmd(req, struct io_epoll);
 	int ret;
 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 
diff --git a/io_uring/fs.c b/io_uring/fs.c
index 0de4f549bb7d..7100c293c13a 100644
--- a/io_uring/fs.c
+++ b/io_uring/fs.c
@@ -49,7 +49,7 @@ struct io_link {
 
 int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_rename *ren = io_kiocb_to_cmd(req);
+	struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename);
 	const char __user *oldf, *newf;
 
 	if (sqe->buf_index || sqe->splice_fd_in)
@@ -79,7 +79,7 @@ int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_rename *ren = io_kiocb_to_cmd(req);
+	struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename);
 	int ret;
 
 	if (issue_flags & IO_URING_F_NONBLOCK)
@@ -95,7 +95,7 @@ int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
 
 void io_renameat_cleanup(struct io_kiocb *req)
 {
-	struct io_rename *ren = io_kiocb_to_cmd(req);
+	struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename);
 
 	putname(ren->oldpath);
 	putname(ren->newpath);
@@ -103,7 +103,7 @@ void io_renameat_cleanup(struct io_kiocb *req)
 
 int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_unlink *un = io_kiocb_to_cmd(req);
+	struct io_unlink *un = io_kiocb_to_cmd(req, struct io_unlink);
 	const char __user *fname;
 
 	if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in)
@@ -128,7 +128,7 @@ int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_unlink *un = io_kiocb_to_cmd(req);
+	struct io_unlink *un = io_kiocb_to_cmd(req, struct io_unlink);
 	int ret;
 
 	if (issue_flags & IO_URING_F_NONBLOCK)
@@ -146,14 +146,14 @@ int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
 
 void io_unlinkat_cleanup(struct io_kiocb *req)
 {
-	struct io_unlink *ul = io_kiocb_to_cmd(req);
+	struct io_unlink *ul = io_kiocb_to_cmd(req, struct io_unlink);
 
 	putname(ul->filename);
 }
 
 int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_mkdir *mkd = io_kiocb_to_cmd(req);
+	struct io_mkdir *mkd = io_kiocb_to_cmd(req, struct io_mkdir);
 	const char __user *fname;
 
 	if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
@@ -175,7 +175,7 @@ int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_mkdir *mkd = io_kiocb_to_cmd(req);
+	struct io_mkdir *mkd = io_kiocb_to_cmd(req, struct io_mkdir);
 	int ret;
 
 	if (issue_flags & IO_URING_F_NONBLOCK)
@@ -190,14 +190,14 @@ int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
 
 void io_mkdirat_cleanup(struct io_kiocb *req)
 {
-	struct io_mkdir *md = io_kiocb_to_cmd(req);
+	struct io_mkdir *md = io_kiocb_to_cmd(req, struct io_mkdir);
 
 	putname(md->filename);
 }
 
 int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_link *sl = io_kiocb_to_cmd(req);
+	struct io_link *sl = io_kiocb_to_cmd(req, struct io_link);
 	const char __user *oldpath, *newpath;
 
 	if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
@@ -225,7 +225,7 @@ int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_link *sl = io_kiocb_to_cmd(req);
+	struct io_link *sl = io_kiocb_to_cmd(req, struct io_link);
 	int ret;
 
 	if (issue_flags & IO_URING_F_NONBLOCK)
@@ -240,7 +240,7 @@ int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
 
 int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_link *lnk = io_kiocb_to_cmd(req);
+	struct io_link *lnk = io_kiocb_to_cmd(req, struct io_link);
 	const char __user *oldf, *newf;
 
 	if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
@@ -270,7 +270,7 @@ int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_link *lnk = io_kiocb_to_cmd(req);
+	struct io_link *lnk = io_kiocb_to_cmd(req, struct io_link);
 	int ret;
 
 	if (issue_flags & IO_URING_F_NONBLOCK)
@@ -286,7 +286,7 @@ int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
 
 void io_link_cleanup(struct io_kiocb *req)
 {
-	struct io_link *sl = io_kiocb_to_cmd(req);
+	struct io_link *sl = io_kiocb_to_cmd(req, struct io_link);
 
 	putname(sl->oldpath);
 	putname(sl->newpath);
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index a73f40a4cfe6..25cd724ade18 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -272,7 +272,7 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
 
 int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_provide_buf *p = io_kiocb_to_cmd(req);
+	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
 	u64 tmp;
 
 	if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
@@ -291,7 +291,7 @@ int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_provide_buf *p = io_kiocb_to_cmd(req);
+	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_buffer_list *bl;
 	int ret = 0;
@@ -319,7 +319,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
 int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	unsigned long size, tmp_check;
-	struct io_provide_buf *p = io_kiocb_to_cmd(req);
+	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
 	u64 tmp;
 
 	if (sqe->rw_flags || sqe->splice_fd_in)
@@ -421,7 +421,7 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
 
 int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_provide_buf *p = io_kiocb_to_cmd(req);
+	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_buffer_list *bl;
 	int ret = 0;
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 753d16734319..976c4ba68ee7 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -26,7 +26,7 @@ struct io_msg {
 static int io_msg_ring_data(struct io_kiocb *req)
 {
 	struct io_ring_ctx *target_ctx = req->file->private_data;
-	struct io_msg *msg = io_kiocb_to_cmd(req);
+	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
 
 	if (msg->src_fd || msg->dst_fd || msg->flags)
 		return -EINVAL;
@@ -76,7 +76,7 @@ static int io_double_lock_ctx(struct io_ring_ctx *ctx,
 static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_ring_ctx *target_ctx = req->file->private_data;
-	struct io_msg *msg = io_kiocb_to_cmd(req);
+	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned long file_ptr;
 	struct file *src_file;
@@ -122,7 +122,7 @@ out_unlock:
 
 int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_msg *msg = io_kiocb_to_cmd(req);
+	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
 
 	if (unlikely(sqe->buf_index || sqe->personality))
 		return -EINVAL;
@@ -141,7 +141,7 @@ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_msg *msg = io_kiocb_to_cmd(req);
+	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
 	int ret;
 
 	ret = -EBADFD;
diff --git a/io_uring/net.c b/io_uring/net.c
index e6fc9748fbd2..6d71748e2c5a 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -77,7 +77,7 @@ struct io_sendzc {
 
 int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_shutdown *shutdown = io_kiocb_to_cmd(req);
+	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
 
 	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
 		     sqe->buf_index || sqe->splice_fd_in))
@@ -89,7 +89,7 @@ int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_shutdown *shutdown = io_kiocb_to_cmd(req);
+	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
 	struct socket *sock;
 	int ret;
 
@@ -174,7 +174,7 @@ static int io_setup_async_msg(struct io_kiocb *req,
 static int io_sendmsg_copy_hdr(struct io_kiocb *req,
 			       struct io_async_msghdr *iomsg)
 {
-	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 
 	iomsg->msg.msg_name = &iomsg->addr;
 	iomsg->free_iov = iomsg->fast_iov;
@@ -201,7 +201,7 @@ void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
 
 int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 
 	if (unlikely(sqe->file_index || sqe->addr2))
 		return -EINVAL;
@@ -225,7 +225,7 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 	struct io_async_msghdr iomsg, *kmsg;
 	struct socket *sock;
 	unsigned flags;
@@ -284,7 +284,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 
 int io_send(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 	struct msghdr msg;
 	struct iovec iov;
 	struct socket *sock;
@@ -358,7 +358,7 @@ static bool io_recvmsg_multishot_overflow(struct io_async_msghdr *iomsg)
 static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
 				 struct io_async_msghdr *iomsg)
 {
-	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 	struct user_msghdr msg;
 	int ret;
 
@@ -405,7 +405,7 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
 static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
 					struct io_async_msghdr *iomsg)
 {
-	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 	struct compat_msghdr msg;
 	struct compat_iovec __user *uiov;
 	int ret;
@@ -483,7 +483,7 @@ int io_recvmsg_prep_async(struct io_kiocb *req)
 
 int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 
 	if (unlikely(sqe->file_index || sqe->addr2))
 		return -EINVAL;
@@ -518,7 +518,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 static inline void io_recv_prep_retry(struct io_kiocb *req)
 {
-	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 
 	sr->done_io = 0;
 	sr->len = 0; /* get from the provided buffer */
@@ -647,7 +647,7 @@ static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
 
 int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 	struct io_async_msghdr iomsg, *kmsg;
 	struct socket *sock;
 	unsigned int cflags;
@@ -759,7 +759,7 @@ retry_multishot:
 
 int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 	struct msghdr msg;
 	struct socket *sock;
 	struct iovec iov;
@@ -850,7 +850,7 @@ out_free:
 
 int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_sendzc *zc = io_kiocb_to_cmd(req);
+	struct io_sendzc *zc = io_kiocb_to_cmd(req, struct io_sendzc);
 	struct io_ring_ctx *ctx = req->ctx;
 
 	if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))
@@ -946,7 +946,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct sockaddr_storage address;
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_sendzc *zc = io_kiocb_to_cmd(req);
+	struct io_sendzc *zc = io_kiocb_to_cmd(req, struct io_sendzc);
 	struct io_notif_slot *notif_slot;
 	struct io_kiocb *notif;
 	struct msghdr msg;
@@ -1037,7 +1037,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
 
 int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_accept *accept = io_kiocb_to_cmd(req);
+	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
 	unsigned flags;
 
 	if (sqe->len || sqe->buf_index)
@@ -1071,7 +1071,7 @@ int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 int io_accept(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_accept *accept = io_kiocb_to_cmd(req);
+	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 	unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
 	bool fixed = !!accept->file_slot;
@@ -1129,7 +1129,7 @@ retry:
 
 int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_socket *sock = io_kiocb_to_cmd(req);
+	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
 
 	if (sqe->addr || sqe->rw_flags || sqe->buf_index)
 		return -EINVAL;
@@ -1150,7 +1150,7 @@ int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_socket(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_socket *sock = io_kiocb_to_cmd(req);
+	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
 	bool fixed = !!sock->file_slot;
 	struct file *file;
 	int ret, fd;
@@ -1184,14 +1184,14 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags)
 int io_connect_prep_async(struct io_kiocb *req)
 {
 	struct io_async_connect *io = req->async_data;
-	struct io_connect *conn = io_kiocb_to_cmd(req);
+	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
 
 	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
 }
 
 int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_connect *conn = io_kiocb_to_cmd(req);
+	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
 
 	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
 		return -EINVAL;
@@ -1203,7 +1203,7 @@ int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_connect(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_connect *connect = io_kiocb_to_cmd(req);
+	struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
 	struct io_async_connect __io, *io;
 	unsigned file_flags;
 	int ret;
diff --git a/io_uring/notif.c b/io_uring/notif.c
index 48d29dead62a..977736e82c1a 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -123,8 +123,6 @@ __cold int io_notif_register(struct io_ring_ctx *ctx,
 	struct io_uring_notification_register reg;
 	unsigned i;
 
-	BUILD_BUG_ON(sizeof(struct io_notif_data) > 64);
-
 	if (ctx->nr_notif_slots)
 		return -EBUSY;
 	if (size != sizeof(reg))
diff --git a/io_uring/notif.h b/io_uring/notif.h
index 0819304d7e00..65f0b42f2555 100644
--- a/io_uring/notif.h
+++ b/io_uring/notif.h
@@ -46,7 +46,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx,
 
 static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif)
 {
-	return io_kiocb_to_cmd(notif);
+	return io_kiocb_to_cmd(notif, struct io_notif_data);
 }
 
 static inline struct io_kiocb *io_get_notif(struct io_ring_ctx *ctx,
diff --git a/io_uring/openclose.c b/io_uring/openclose.c
index d1818ec9169b..67178e4bb282 100644
--- a/io_uring/openclose.c
+++ b/io_uring/openclose.c
@@ -33,7 +33,7 @@ struct io_close {
 
 static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_open *open = io_kiocb_to_cmd(req);
+	struct io_open *open = io_kiocb_to_cmd(req, struct io_open);
 	const char __user *fname;
 	int ret;
 
@@ -66,7 +66,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 
 int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_open *open = io_kiocb_to_cmd(req);
+	struct io_open *open = io_kiocb_to_cmd(req, struct io_open);
 	u64 mode = READ_ONCE(sqe->len);
 	u64 flags = READ_ONCE(sqe->open_flags);
 
@@ -76,7 +76,7 @@ int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_open *open = io_kiocb_to_cmd(req);
+	struct io_open *open = io_kiocb_to_cmd(req, struct io_open);
 	struct open_how __user *how;
 	size_t len;
 	int ret;
@@ -95,7 +95,7 @@ int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_open *open = io_kiocb_to_cmd(req);
+	struct io_open *open = io_kiocb_to_cmd(req, struct io_open);
 	struct open_flags op;
 	struct file *file;
 	bool resolve_nonblock, nonblock_set;
@@ -167,7 +167,7 @@ int io_openat(struct io_kiocb *req, unsigned int issue_flags)
 
 void io_open_cleanup(struct io_kiocb *req)
 {
-	struct io_open *open = io_kiocb_to_cmd(req);
+	struct io_open *open = io_kiocb_to_cmd(req, struct io_open);
 
 	if (open->filename)
 		putname(open->filename);
@@ -187,14 +187,14 @@ int __io_close_fixed(struct io_ring_ctx *ctx, unsigned int issue_flags,
 
 static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_close *close = io_kiocb_to_cmd(req);
+	struct io_close *close = io_kiocb_to_cmd(req, struct io_close);
 
 	return __io_close_fixed(req->ctx, issue_flags, close->file_slot - 1);
 }
 
 int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_close *close = io_kiocb_to_cmd(req);
+	struct io_close *close = io_kiocb_to_cmd(req, struct io_close);
 
 	if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index)
 		return -EINVAL;
@@ -212,7 +212,7 @@ int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 int io_close(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct files_struct *files = current->files;
-	struct io_close *close = io_kiocb_to_cmd(req);
+	struct io_close *close = io_kiocb_to_cmd(req, struct io_close);
 	struct fdtable *fdt;
 	struct file *file;
 	int ret = -EBADF;
diff --git a/io_uring/poll.c b/io_uring/poll.c
index dadd293749b0..d5bad0bea6e4 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -85,7 +85,7 @@ static struct io_poll *io_poll_get_double(struct io_kiocb *req)
 static struct io_poll *io_poll_get_single(struct io_kiocb *req)
 {
 	if (req->opcode == IORING_OP_POLL_ADD)
-		return io_kiocb_to_cmd(req);
+		return io_kiocb_to_cmd(req, struct io_poll);
 	return &req->apoll->poll;
 }
 
@@ -274,7 +274,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 		return;
 
 	if (ret == IOU_POLL_DONE) {
-		struct io_poll *poll = io_kiocb_to_cmd(req);
+		struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll);
 		req->cqe.res = mangle_poll(req->cqe.res & poll->events);
 	} else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) {
 		req->cqe.res = ret;
@@ -475,7 +475,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
 			       struct poll_table_struct *p)
 {
 	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
-	struct io_poll *poll = io_kiocb_to_cmd(pt->req);
+	struct io_poll *poll = io_kiocb_to_cmd(pt->req, struct io_poll);
 
 	__io_queue_proc(poll, pt, head,
 			(struct io_poll **) &pt->req->async_data);
@@ -821,7 +821,7 @@ static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
 
 int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_poll_update *upd = io_kiocb_to_cmd(req);
+	struct io_poll_update *upd = io_kiocb_to_cmd(req, struct io_poll_update);
 	u32 flags;
 
 	if (sqe->buf_index || sqe->splice_fd_in)
@@ -851,7 +851,7 @@ int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_poll *poll = io_kiocb_to_cmd(req);
+	struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll);
 	u32 flags;
 
 	if (sqe->buf_index || sqe->off || sqe->addr)
@@ -868,7 +868,7 @@ int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_poll *poll = io_kiocb_to_cmd(req);
+	struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll);
 	struct io_poll_table ipt;
 	int ret;
 
@@ -891,7 +891,7 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
 
 int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_poll_update *poll_update = io_kiocb_to_cmd(req);
+	struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update);
 	struct io_cancel_data cd = { .data = poll_update->old_user_data, };
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_hash_bucket *bucket;
@@ -930,7 +930,7 @@ found:
 	if (poll_update->update_events || poll_update->update_user_data) {
 		/* only mask one event flags, keep behavior flags */
 		if (poll_update->update_events) {
-			struct io_poll *poll = io_kiocb_to_cmd(preq);
+			struct io_poll *poll = io_kiocb_to_cmd(preq, struct io_poll);
 
 			poll->events &= ~0xffff;
 			poll->events |= poll_update->events & 0xffff;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 59704b9ac537..71359a4d0bd4 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -657,7 +657,7 @@ __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
 
 int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
+	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
 
 	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
 		return -EINVAL;
@@ -676,7 +676,7 @@ int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 static int io_files_update_with_index_alloc(struct io_kiocb *req,
 					    unsigned int issue_flags)
 {
-	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
+	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
 	__s32 __user *fds = u64_to_user_ptr(up->arg);
 	unsigned int done;
 	struct file *file;
@@ -714,7 +714,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req,
 
 static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
+	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_uring_rsrc_update2 up2;
 	int ret;
@@ -743,7 +743,7 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 
 static int io_notif_update(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
+	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned len = up->nr_args;
 	unsigned idx_end, idx = up->offset;
@@ -778,7 +778,7 @@ out:
 
 int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
+	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
 
 	switch (up->type) {
 	case IORING_RSRC_UPDATE_FILES:
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 2b784795103c..3d732b19b760 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -35,7 +35,7 @@ static inline bool io_file_supports_nowait(struct io_kiocb *req)
 
 int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	unsigned ioprio;
 	int ret;
 
@@ -102,7 +102,7 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
 
 static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
 {
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 
 	if (rw->kiocb.ki_pos != -1)
 		return &rw->kiocb.ki_pos;
@@ -186,7 +186,7 @@ static void kiocb_end_write(struct io_kiocb *req)
 
 static bool __io_complete_rw_common(struct io_kiocb *req, long res)
 {
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 
 	if (rw->kiocb.ki_flags & IOCB_WRITE) {
 		kiocb_end_write(req);
@@ -241,7 +241,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
 		       unsigned int issue_flags)
 {
 	struct io_async_rw *io = req->async_data;
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 
 	/* add previously done IO, if any */
 	if (req_has_async_data(req) && io->bytes_done > 0) {
@@ -277,7 +277,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
 static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
 				unsigned int issue_flags)
 {
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	struct compat_iovec __user *uiov;
 	compat_ssize_t clen;
 	void __user *buf;
@@ -305,7 +305,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
 static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
 				      unsigned int issue_flags)
 {
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	struct iovec __user *uiov = u64_to_user_ptr(rw->addr);
 	void __user *buf;
 	ssize_t len;
@@ -328,7 +328,7 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
 				    unsigned int issue_flags)
 {
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 
 	if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
 		iov[0].iov_base = u64_to_user_ptr(rw->addr);
@@ -350,7 +350,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
 				       struct io_rw_state *s,
 				       unsigned int issue_flags)
 {
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	struct iov_iter *iter = &s->iter;
 	u8 opcode = req->opcode;
 	struct iovec *iovec;
@@ -571,7 +571,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
 {
 	struct wait_page_queue *wpq;
 	struct io_kiocb *req = wait->private;
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	struct wait_page_key *key = arg;
 
 	wpq = container_of(wait, struct wait_page_queue, wait);
@@ -601,7 +601,7 @@ static bool io_rw_should_retry(struct io_kiocb *req)
 {
 	struct io_async_rw *io = req->async_data;
 	struct wait_page_queue *wait = &io->wpq;
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	struct kiocb *kiocb = &rw->kiocb;
 
 	/* never retry for NOWAIT, we just complete with -EAGAIN */
@@ -649,7 +649,7 @@ static bool need_complete_io(struct io_kiocb *req)
 
 static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
 {
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	struct kiocb *kiocb = &rw->kiocb;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct file *file = req->file;
@@ -694,7 +694,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
 
 int io_read(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	struct io_rw_state __s, *s = &__s;
 	struct iovec *iovec;
 	struct kiocb *kiocb = &rw->kiocb;
@@ -839,7 +839,7 @@ done:
 
 int io_write(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	struct io_rw_state __s, *s = &__s;
 	struct iovec *iovec;
 	struct kiocb *kiocb = &rw->kiocb;
@@ -994,7 +994,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 
 	wq_list_for_each(pos, start, &ctx->iopoll_list) {
 		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
-		struct io_rw *rw = io_kiocb_to_cmd(req);
+		struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 		int ret;
 
 		/*
diff --git a/io_uring/splice.c b/io_uring/splice.c
index b013ba34bffa..53e4232d0866 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -26,7 +26,7 @@ struct io_splice {
 static int __io_splice_prep(struct io_kiocb *req,
 			    const struct io_uring_sqe *sqe)
 {
-	struct io_splice *sp = io_kiocb_to_cmd(req);
+	struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice);
 	unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
 
 	sp->len = READ_ONCE(sqe->len);
@@ -46,7 +46,7 @@ int io_tee_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_tee(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_splice *sp = io_kiocb_to_cmd(req);
+	struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice);
 	struct file *out = sp->file_out;
 	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
 	struct file *in;
@@ -78,7 +78,7 @@ done:
 
 int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_splice *sp = io_kiocb_to_cmd(req);
+	struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice);
 
 	sp->off_in = READ_ONCE(sqe->splice_off_in);
 	sp->off_out = READ_ONCE(sqe->off);
@@ -87,7 +87,7 @@ int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_splice(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_splice *sp = io_kiocb_to_cmd(req);
+	struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice);
 	struct file *out = sp->file_out;
 	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
 	loff_t *poff_in, *poff_out;
diff --git a/io_uring/statx.c b/io_uring/statx.c
index 6056cd7f4876..d8fc933d3f59 100644
--- a/io_uring/statx.c
+++ b/io_uring/statx.c
@@ -22,7 +22,7 @@ struct io_statx {
 
 int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_statx *sx = io_kiocb_to_cmd(req);
+	struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx);
 	const char __user *path;
 
 	if (sqe->buf_index || sqe->splice_fd_in)
@@ -53,7 +53,7 @@ int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_statx(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_statx *sx = io_kiocb_to_cmd(req);
+	struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx);
 	int ret;
 
 	if (issue_flags & IO_URING_F_NONBLOCK)
@@ -66,7 +66,7 @@ int io_statx(struct io_kiocb *req, unsigned int issue_flags)
 
 void io_statx_cleanup(struct io_kiocb *req)
 {
-	struct io_statx *sx = io_kiocb_to_cmd(req);
+	struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx);
 
 	if (sx->filename)
 		putname(sx->filename);
diff --git a/io_uring/sync.c b/io_uring/sync.c
index f2102afa79ca..64e87ea2b8fb 100644
--- a/io_uring/sync.c
+++ b/io_uring/sync.c
@@ -24,7 +24,7 @@ struct io_sync {
 
 int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_sync *sync = io_kiocb_to_cmd(req);
+	struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync);
 
 	if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
 		return -EINVAL;
@@ -37,7 +37,7 @@ int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_sync *sync = io_kiocb_to_cmd(req);
+	struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync);
 	int ret;
 
 	/* sync_file_range always requires a blocking context */
@@ -51,7 +51,7 @@ int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
 
 int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_sync *sync = io_kiocb_to_cmd(req);
+	struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync);
 
 	if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
 		return -EINVAL;
@@ -67,7 +67,7 @@ int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_sync *sync = io_kiocb_to_cmd(req);
+	struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync);
 	loff_t end = sync->off + sync->len;
 	int ret;
 
@@ -83,7 +83,7 @@ int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
 
 int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_sync *sync = io_kiocb_to_cmd(req);
+	struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync);
 
 	if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
 		return -EINVAL;
@@ -96,7 +96,7 @@ int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_sync *sync = io_kiocb_to_cmd(req);
+	struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync);
 	int ret;
 
 	/* fallocate always requiring blocking context */
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 2f9e56935479..78ea2c64b70e 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -36,7 +36,7 @@ struct io_timeout_rem {
 
 static inline bool io_is_timeout_noseq(struct io_kiocb *req)
 {
-	struct io_timeout *timeout = io_kiocb_to_cmd(req);
+	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 
 	return !timeout->off;
 }
@@ -56,7 +56,7 @@ static bool io_kill_timeout(struct io_kiocb *req, int status)
 	struct io_timeout_data *io = req->async_data;
 
 	if (hrtimer_try_to_cancel(&io->timer) != -1) {
-		struct io_timeout *timeout = io_kiocb_to_cmd(req);
+		struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 
 		if (status)
 			req_set_fail(req);
@@ -188,7 +188,7 @@ struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
 	__must_hold(&req->ctx->timeout_lock)
 {
 	struct io_timeout_data *io = link->async_data;
-	struct io_timeout *timeout = io_kiocb_to_cmd(link);
+	struct io_timeout *timeout = io_kiocb_to_cmd(link, struct io_timeout);
 
 	io_remove_next_linked(req);
 	timeout->head = NULL;
@@ -205,7 +205,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 	struct io_timeout_data *data = container_of(timer,
 						struct io_timeout_data, timer);
 	struct io_kiocb *req = data->req;
-	struct io_timeout *timeout = io_kiocb_to_cmd(req);
+	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned long flags;
 
@@ -252,7 +252,7 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
 	io = req->async_data;
 	if (hrtimer_try_to_cancel(&io->timer) == -1)
 		return ERR_PTR(-EALREADY);
-	timeout = io_kiocb_to_cmd(req);
+	timeout = io_kiocb_to_cmd(req, struct io_timeout);
 	list_del_init(&timeout->list);
 	return req;
 }
@@ -275,7 +275,7 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
 static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
 {
 	unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED;
-	struct io_timeout *timeout = io_kiocb_to_cmd(req);
+	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 	struct io_kiocb *prev = timeout->prev;
 	int ret = -ENOENT;
 
@@ -302,7 +302,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 	struct io_timeout_data *data = container_of(timer,
 						struct io_timeout_data, timer);
 	struct io_kiocb *prev, *req = data->req;
-	struct io_timeout *timeout = io_kiocb_to_cmd(req);
+	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned long flags;
 
@@ -378,7 +378,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
 {
 	struct io_cancel_data cd = { .data = user_data, };
 	struct io_kiocb *req = io_timeout_extract(ctx, &cd);
-	struct io_timeout *timeout = io_kiocb_to_cmd(req);
+	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 	struct io_timeout_data *data;
 
 	if (IS_ERR(req))
@@ -395,7 +395,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
 
 int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_timeout_rem *tr = io_kiocb_to_cmd(req);
+	struct io_timeout_rem *tr = io_kiocb_to_cmd(req, struct io_timeout_rem);
 
 	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
 		return -EINVAL;
@@ -435,7 +435,7 @@ static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
  */
 int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_timeout_rem *tr = io_kiocb_to_cmd(req);
+	struct io_timeout_rem *tr = io_kiocb_to_cmd(req, struct io_timeout_rem);
 	struct io_ring_ctx *ctx = req->ctx;
 	int ret;
 
@@ -466,7 +466,7 @@ static int __io_timeout_prep(struct io_kiocb *req,
 			     const struct io_uring_sqe *sqe,
 			     bool is_timeout_link)
 {
-	struct io_timeout *timeout = io_kiocb_to_cmd(req);
+	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 	struct io_timeout_data *data;
 	unsigned flags;
 	u32 off = READ_ONCE(sqe->off);
@@ -532,7 +532,7 @@ int io_link_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_timeout *timeout = io_kiocb_to_cmd(req);
+	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_timeout_data *data = req->async_data;
 	struct list_head *entry;
@@ -583,7 +583,7 @@ add:
 
 void io_queue_linked_timeout(struct io_kiocb *req)
 {
-	struct io_timeout *timeout = io_kiocb_to_cmd(req);
+	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 	struct io_ring_ctx *ctx = req->ctx;
 
 	spin_lock_irq(&ctx->timeout_lock);
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index ee7036f2241f..478e86a9dfaf 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -11,7 +11,7 @@
 
 static void io_uring_cmd_work(struct io_kiocb *req, bool *locked)
 {
-	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req);
+	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
 
 	ioucmd->task_work_cb(ioucmd);
 }
@@ -55,7 +55,7 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_done);
 
 int io_uring_cmd_prep_async(struct io_kiocb *req)
 {
-	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req);
+	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
 	size_t cmd_size;
 
 	cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128);
@@ -66,7 +66,7 @@ int io_uring_cmd_prep_async(struct io_kiocb *req)
 
 int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req);
+	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
 
 	if (sqe->rw_flags || sqe->__pad1)
 		return -EINVAL;
@@ -77,7 +77,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req);
+	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
 	struct io_ring_ctx *ctx = req->ctx;
 	struct file *file = req->file;
 	int ret;
diff --git a/io_uring/xattr.c b/io_uring/xattr.c
index b179f9acd5ac..84180afd090b 100644
--- a/io_uring/xattr.c
+++ b/io_uring/xattr.c
@@ -24,7 +24,7 @@ struct io_xattr {
 
 void io_xattr_cleanup(struct io_kiocb *req)
 {
-	struct io_xattr *ix = io_kiocb_to_cmd(req);
+	struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
 
 	if (ix->filename)
 		putname(ix->filename);
@@ -44,7 +44,7 @@ static void io_xattr_finish(struct io_kiocb *req, int ret)
 static int __io_getxattr_prep(struct io_kiocb *req,
 			      const struct io_uring_sqe *sqe)
 {
-	struct io_xattr *ix = io_kiocb_to_cmd(req);
+	struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
 	const char __user *name;
 	int ret;
 
@@ -85,7 +85,7 @@ int io_fgetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_xattr *ix = io_kiocb_to_cmd(req);
+	struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
 	const char __user *path;
 	int ret;
 
@@ -106,7 +106,7 @@ int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_xattr *ix = io_kiocb_to_cmd(req);
+	struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
 	int ret;
 
 	if (issue_flags & IO_URING_F_NONBLOCK)
@@ -122,7 +122,7 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
 
 int io_getxattr(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_xattr *ix = io_kiocb_to_cmd(req);
+	struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
 	unsigned int lookup_flags = LOOKUP_FOLLOW;
 	struct path path;
 	int ret;
@@ -151,7 +151,7 @@ retry:
 static int __io_setxattr_prep(struct io_kiocb *req,
 			const struct io_uring_sqe *sqe)
 {
-	struct io_xattr *ix = io_kiocb_to_cmd(req);
+	struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
 	const char __user *name;
 	int ret;
 
@@ -181,7 +181,7 @@ static int __io_setxattr_prep(struct io_kiocb *req,
 
 int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_xattr *ix = io_kiocb_to_cmd(req);
+	struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
 	const char __user *path;
 	int ret;
 
@@ -208,7 +208,7 @@ int io_fsetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags,
 			struct path *path)
 {
-	struct io_xattr *ix = io_kiocb_to_cmd(req);
+	struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
 	int ret;
 
 	ret = mnt_want_write(path->mnt);
@@ -234,7 +234,7 @@ int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags)
 
 int io_setxattr(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_xattr *ix = io_kiocb_to_cmd(req);
+	struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
 	unsigned int lookup_flags = LOOKUP_FOLLOW;
 	struct path path;
 	int ret;
-- 
cgit 


From 9f162193d6e48eb4ff51c2ea3612f1daebca1b7e Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Thu, 11 Aug 2022 22:34:25 -0700
Subject: radix-tree: replace gfp.h inclusion with gfp_types.h

Radix tree header includes gfp.h for __GFP_BITS_SHIFT only. Now we
have gfp_types.h for this.

Fixes powerpc allmodconfig build:

   In file included from include/linux/nodemask.h:97,
                    from include/linux/mmzone.h:17,
                    from include/linux/gfp.h:7,
                    from include/linux/radix-tree.h:12,
                    from include/linux/idr.h:15,
                    from include/linux/kernfs.h:12,
                    from include/linux/sysfs.h:16,
                    from include/linux/kobject.h:20,
                    from include/linux/pci.h:35,
                    from arch/powerpc/kernel/prom_init.c:24:
   include/linux/random.h: In function 'add_latent_entropy':
>> include/linux/random.h:25:46: error: 'latent_entropy' undeclared (first use in this function); did you mean 'add_latent_entropy'?
      25 |         add_device_randomness((const void *)&latent_entropy, sizeof(latent_entropy));
         |                                              ^~~~~~~~~~~~~~
         |                                              add_latent_entropy
   include/linux/random.h:25:46: note: each undeclared identifier is reported only once for each function it appears in

Reported-by: kernel test robot <lkp@intel.com>
CC: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/radix-tree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index f7c1d21c2f39..eae67015ce51 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -9,7 +9,7 @@
 #define _LINUX_RADIX_TREE_H
 
 #include <linux/bitops.h>
-#include <linux/gfp.h>
+#include <linux/gfp_types.h>
 #include <linux/list.h>
 #include <linux/lockdep.h>
 #include <linux/math.h>
-- 
cgit 


From 0ff4eb3d5ebbf72a7fc355e6001a0a6740662bf9 Mon Sep 17 00:00:00 2001
From: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
Date: Thu, 11 Aug 2022 18:20:12 +0300
Subject: neighbour: make proxy_queue.qlen limit per-device

Right now we have a neigh_param PROXY_QLEN which specifies maximum length
of neigh_table->proxy_queue. But in fact, this limitation doesn't work well
because check condition looks like:
tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)

The problem is that p (struct neigh_parms) is a per-device thing,
but tbl (struct neigh_table) is a system-wide global thing.

It seems reasonable to make proxy_queue limit per-device based.

v2:
	- nothing changed in this patch
v3:
	- rebase to net tree

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David Ahern <dsahern@kernel.org>
Cc: Yajun Deng <yajun.deng@linux.dev>
Cc: Roopa Prabhu <roopa@nvidia.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: netdev@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
Cc: Konstantin Khorenko <khorenko@virtuozzo.com>
Cc: kernel@openvz.org
Cc: devel@openvz.org
Suggested-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
Reviewed-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/neighbour.h |  1 +
 net/core/neighbour.c    | 25 ++++++++++++++++++++++---
 2 files changed, 23 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 9f0bab0589d9..3827a6b395fd 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -83,6 +83,7 @@ struct neigh_parms {
 	struct rcu_head rcu_head;
 
 	int	reachable_time;
+	int	qlen;
 	int	data[NEIGH_VAR_DATA_MAX];
 	DECLARE_BITMAP(data_state, NEIGH_VAR_DATA_MAX);
 };
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 0e38a05d5b23..5b669eb80270 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -316,9 +316,18 @@ static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net)
 	skb = skb_peek(list);
 	while (skb != NULL) {
 		struct sk_buff *skb_next = skb_peek_next(skb, list);
-		if (net == NULL || net_eq(dev_net(skb->dev), net)) {
+		struct net_device *dev = skb->dev;
+		if (net == NULL || net_eq(dev_net(dev), net)) {
+			struct in_device *in_dev;
+
+			rcu_read_lock();
+			in_dev = __in_dev_get_rcu(dev);
+			if (in_dev)
+				in_dev->arp_parms->qlen--;
+			rcu_read_unlock();
 			__skb_unlink(skb, list);
-			dev_put(skb->dev);
+
+			dev_put(dev);
 			kfree_skb(skb);
 		}
 		skb = skb_next;
@@ -1606,8 +1615,15 @@ static void neigh_proxy_process(struct timer_list *t)
 
 		if (tdif <= 0) {
 			struct net_device *dev = skb->dev;
+			struct in_device *in_dev;
 
+			rcu_read_lock();
+			in_dev = __in_dev_get_rcu(dev);
+			if (in_dev)
+				in_dev->arp_parms->qlen--;
+			rcu_read_unlock();
 			__skb_unlink(skb, &tbl->proxy_queue);
+
 			if (tbl->proxy_redo && netif_running(dev)) {
 				rcu_read_lock();
 				tbl->proxy_redo(skb);
@@ -1632,7 +1648,7 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
 	unsigned long sched_next = jiffies +
 			prandom_u32_max(NEIGH_VAR(p, PROXY_DELAY));
 
-	if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) {
+	if (p->qlen > NEIGH_VAR(p, PROXY_QLEN)) {
 		kfree_skb(skb);
 		return;
 	}
@@ -1648,6 +1664,7 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
 	skb_dst_drop(skb);
 	dev_hold(skb->dev);
 	__skb_queue_tail(&tbl->proxy_queue, skb);
+	p->qlen++;
 	mod_timer(&tbl->proxy_timer, sched_next);
 	spin_unlock(&tbl->proxy_queue.lock);
 }
@@ -1680,6 +1697,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
 		refcount_set(&p->refcnt, 1);
 		p->reachable_time =
 				neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
+		p->qlen = 0;
 		netdev_hold(dev, &p->dev_tracker, GFP_KERNEL);
 		p->dev = dev;
 		write_pnet(&p->net, net);
@@ -1745,6 +1763,7 @@ void neigh_table_init(int index, struct neigh_table *tbl)
 	refcount_set(&tbl->parms.refcnt, 1);
 	tbl->parms.reachable_time =
 			  neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME));
+	tbl->parms.qlen = 0;
 
 	tbl->stats = alloc_percpu(struct neigh_statistics);
 	if (!tbl->stats)
-- 
cgit 


From be599244865cb788abe2c6f59ccb05d7240448e4 Mon Sep 17 00:00:00 2001
From: Sander Vanheule <sander@svanheule.net>
Date: Tue, 9 Aug 2022 19:36:33 +0200
Subject: cpumask: align signatures of UP implementations

Between the generic version, and their uniprocessor optimised
implementations, the return types of cpumask_any_and_distribute() and
cpumask_any_distribute() are not identical.  Change the UP versions to
'unsigned int', to match the generic versions.

Suggested-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Sander Vanheule <sander@svanheule.net>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/cpumask.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 0d435d0edbcb..d8c2a40f8beb 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -202,12 +202,13 @@ static inline unsigned int cpumask_local_spread(unsigned int i, int node)
 	return 0;
 }
 
-static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
-					     const struct cpumask *src2p) {
+static inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
+						      const struct cpumask *src2p)
+{
 	return cpumask_first_and(src1p, src2p);
 }
 
-static inline int cpumask_any_distribute(const struct cpumask *srcp)
+static inline unsigned int cpumask_any_distribute(const struct cpumask *srcp)
 {
 	return cpumask_first(srcp);
 }
-- 
cgit 


From 2248ccd80124e61c2c84a22b22409bab452e1f0c Mon Sep 17 00:00:00 2001
From: Sander Vanheule <sander@svanheule.net>
Date: Tue, 9 Aug 2022 19:36:34 +0200
Subject: lib/cpumask: add inline cpumask_next_wrap() for UP

In the uniprocessor case, cpumask_next_wrap() can be simplified, as the
number of valid argument combinations is limited:
    - 'start' can only be 0
    - 'n' can only be -1 or 0

The only valid CPU that can then be returned, if any, will be the first
one set in the provided 'mask'.

For NR_CPUS == 1, include/linux/cpumask.h now provides an inline
definition of cpumask_next_wrap(), which will conflict with the one
provided by lib/cpumask.c.  Make building of lib/cpumask.o again depend
on CONFIG_SMP=y (i.e. NR_CPUS > 1) to avoid the re-definition.

Suggested-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Sander Vanheule <sander@svanheule.net>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/cpumask.h | 19 +++++++++++++++++++
 lib/Makefile            |  3 ++-
 2 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index d8c2a40f8beb..bd047864c7ac 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -262,7 +262,26 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
 		(cpu) = cpumask_next_zero((cpu), (mask)),	\
 		(cpu) < nr_cpu_ids;)
 
+#if NR_CPUS == 1
+static inline
+unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
+{
+	cpumask_check(start);
+	if (n != -1)
+		cpumask_check(n);
+
+	/*
+	 * Return the first available CPU when wrapping, or when starting before cpu0,
+	 * since there is only one valid option.
+	 */
+	if (wrap && n >= 0)
+		return nr_cpumask_bits;
+
+	return cpumask_first(mask);
+}
+#else
 unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
+#endif
 
 /**
  * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
diff --git a/lib/Makefile b/lib/Makefile
index c95212141928..5927d7fa0806 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -34,9 +34,10 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
 	 earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
 	 nmi_backtrace.o win_minmax.o memcat_p.o \
-	 buildid.o cpumask.o
+	 buildid.o
 
 lib-$(CONFIG_PRINTK) += dump_stack.o
+lib-$(CONFIG_SMP) += cpumask.o
 
 lib-y	+= kobject.o klist.o
 obj-y	+= lockref.o
-- 
cgit 


From 484b9fa4886bd9377969aad5e9ea17efda4ecda6 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Tue, 16 Aug 2022 01:36:31 -0400
Subject: virtio: Revert "virtio: add helper virtio_find_vqs_ctx_size()"

This reverts commit fe3dc04e31aa51f91dc7f741a5f76cc4817eb5b4: the
API is now unused and in fact can't be implemented on top of a legacy
device.

Fixes: fe3dc04e31aa ("virtio: add helper virtio_find_vqs_ctx_size()")
Cc: "Xuan Zhuo" <xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Message-Id: <20220816053602.173815-3-mst@redhat.com>
---
 include/linux/virtio_config.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 6adff09f7170..888f7e96f0c7 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -241,18 +241,6 @@ int virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs,
 				      ctx, desc);
 }
 
-static inline
-int virtio_find_vqs_ctx_size(struct virtio_device *vdev, u32 nvqs,
-			     struct virtqueue *vqs[],
-			     vq_callback_t *callbacks[],
-			     const char * const names[],
-			     u32 sizes[],
-			     const bool *ctx, struct irq_affinity *desc)
-{
-	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, sizes,
-				      ctx, desc);
-}
-
 /**
  * virtio_synchronize_cbs - synchronize with virtqueue callbacks
  * @vdev: the device
-- 
cgit 


From 9993a4f989c7ca5e227329b2878f65d05c9fc20f Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Tue, 16 Aug 2022 01:36:58 -0400
Subject: virtio: Revert "virtio: find_vqs() add arg sizes"

This reverts commit a10fba0377145fccefea4dc4dd5915b7ed87e546: the
proposed API isn't supported on all transports but no
effort was made to address this.

It might not be hard to fix if we want to: maybe just
rename size to size_hint and make sure legacy
transports ignore the hint.

But it's not sure what the benefit is in any case, so
let's drop it.

Fixes: a10fba037714 ("virtio: find_vqs() add arg sizes")
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Message-Id: <20220816053602.173815-8-mst@redhat.com>
---
 arch/um/drivers/virtio_uml.c             |  2 +-
 drivers/platform/mellanox/mlxbf-tmfifo.c |  1 -
 drivers/remoteproc/remoteproc_virtio.c   |  1 -
 drivers/s390/virtio/virtio_ccw.c         |  1 -
 drivers/virtio/virtio_mmio.c             |  1 -
 drivers/virtio/virtio_pci_common.c       |  2 +-
 drivers/virtio/virtio_pci_common.h       |  2 +-
 drivers/virtio/virtio_pci_modern.c       |  7 ++-----
 drivers/virtio/virtio_vdpa.c             |  1 -
 include/linux/virtio_config.h            | 14 +++++---------
 10 files changed, 10 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c
index 79e38afd4b91..e719af8bdf56 100644
--- a/arch/um/drivers/virtio_uml.c
+++ b/arch/um/drivers/virtio_uml.c
@@ -1011,7 +1011,7 @@ error_kzalloc:
 
 static int vu_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 		       struct virtqueue *vqs[], vq_callback_t *callbacks[],
-		       const char * const names[], u32 sizes[], const bool *ctx,
+		       const char * const names[], const bool *ctx,
 		       struct irq_affinity *desc)
 {
 	struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev);
diff --git a/drivers/platform/mellanox/mlxbf-tmfifo.c b/drivers/platform/mellanox/mlxbf-tmfifo.c
index 8be13d416f48..1ae3c56b66b0 100644
--- a/drivers/platform/mellanox/mlxbf-tmfifo.c
+++ b/drivers/platform/mellanox/mlxbf-tmfifo.c
@@ -928,7 +928,6 @@ static int mlxbf_tmfifo_virtio_find_vqs(struct virtio_device *vdev,
 					struct virtqueue *vqs[],
 					vq_callback_t *callbacks[],
 					const char * const names[],
-					u32 sizes[],
 					const bool *ctx,
 					struct irq_affinity *desc)
 {
diff --git a/drivers/remoteproc/remoteproc_virtio.c b/drivers/remoteproc/remoteproc_virtio.c
index 81c4f5776109..0f7706e23eb9 100644
--- a/drivers/remoteproc/remoteproc_virtio.c
+++ b/drivers/remoteproc/remoteproc_virtio.c
@@ -158,7 +158,6 @@ static int rproc_virtio_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 				 struct virtqueue *vqs[],
 				 vq_callback_t *callbacks[],
 				 const char * const names[],
-				 u32 sizes[],
 				 const bool * ctx,
 				 struct irq_affinity *desc)
 {
diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
index 896896e32664..a10dbe632ef9 100644
--- a/drivers/s390/virtio/virtio_ccw.c
+++ b/drivers/s390/virtio/virtio_ccw.c
@@ -637,7 +637,6 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 			       struct virtqueue *vqs[],
 			       vq_callback_t *callbacks[],
 			       const char * const names[],
-			       u32 sizes[],
 			       const bool *ctx,
 			       struct irq_affinity *desc)
 {
diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index dfcecfd7aba1..3ff746e3f24a 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -474,7 +474,6 @@ static int vm_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 		       struct virtqueue *vqs[],
 		       vq_callback_t *callbacks[],
 		       const char * const names[],
-		       u32 sizes[],
 		       const bool *ctx,
 		       struct irq_affinity *desc)
 {
diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index 7ad734584823..ad258a9d3b9f 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -396,7 +396,7 @@ out_del_vqs:
 /* the config->find_vqs() implementation */
 int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 		struct virtqueue *vqs[], vq_callback_t *callbacks[],
-		const char * const names[], u32 sizes[], const bool *ctx,
+		const char * const names[], const bool *ctx,
 		struct irq_affinity *desc)
 {
 	int err;
diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index a5ff838b85a5..23112d84218f 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -110,7 +110,7 @@ void vp_del_vqs(struct virtio_device *vdev);
 /* the config->find_vqs() implementation */
 int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 		struct virtqueue *vqs[], vq_callback_t *callbacks[],
-		const char * const names[], u32 sizes[], const bool *ctx,
+		const char * const names[], const bool *ctx,
 		struct irq_affinity *desc);
 const char *vp_bus_name(struct virtio_device *vdev);
 
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index be51ec849252..c3b9f2761849 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -347,15 +347,12 @@ err:
 static int vp_modern_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 			      struct virtqueue *vqs[],
 			      vq_callback_t *callbacks[],
-			      const char * const names[],
-			      u32 sizes[],
-			      const bool *ctx,
+			      const char * const names[], const bool *ctx,
 			      struct irq_affinity *desc)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	struct virtqueue *vq;
-	int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, sizes, ctx,
-			     desc);
+	int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, desc);
 
 	if (rc)
 		return rc;
diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index 832d2c5b1b19..9670cc79371d 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -269,7 +269,6 @@ static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 				struct virtqueue *vqs[],
 				vq_callback_t *callbacks[],
 				const char * const names[],
-				u32 sizes[],
 				const bool *ctx,
 				struct irq_affinity *desc)
 {
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 888f7e96f0c7..36ec7be1f480 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -55,7 +55,6 @@ struct virtio_shm_region {
  *		include a NULL entry for vqs that do not need a callback
  *	names: array of virtqueue names (mainly for debugging)
  *		include a NULL entry for vqs unused by driver
- *	sizes: array of virtqueue sizes
  *	Returns 0 on success or error status
  * @del_vqs: free virtqueues found by find_vqs().
  * @synchronize_cbs: synchronize with the virtqueue callbacks (optional)
@@ -104,9 +103,7 @@ struct virtio_config_ops {
 	void (*reset)(struct virtio_device *vdev);
 	int (*find_vqs)(struct virtio_device *, unsigned nvqs,
 			struct virtqueue *vqs[], vq_callback_t *callbacks[],
-			const char * const names[],
-			u32 sizes[],
-			const bool *ctx,
+			const char * const names[], const bool *ctx,
 			struct irq_affinity *desc);
 	void (*del_vqs)(struct virtio_device *);
 	void (*synchronize_cbs)(struct virtio_device *);
@@ -215,7 +212,7 @@ struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev,
 	const char *names[] = { n };
 	struct virtqueue *vq;
 	int err = vdev->config->find_vqs(vdev, 1, &vq, callbacks, names, NULL,
-					 NULL, NULL);
+					 NULL);
 	if (err < 0)
 		return ERR_PTR(err);
 	return vq;
@@ -227,8 +224,7 @@ int virtio_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 			const char * const names[],
 			struct irq_affinity *desc)
 {
-	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, NULL,
-				      NULL, desc);
+	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, NULL, desc);
 }
 
 static inline
@@ -237,8 +233,8 @@ int virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs,
 			const char * const names[], const bool *ctx,
 			struct irq_affinity *desc)
 {
-	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, NULL,
-				      ctx, desc);
+	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, ctx,
+				      desc);
 }
 
 /**
-- 
cgit 


From 5c669c4a4c6aa0489848093c93b8029f5c5c75ec Mon Sep 17 00:00:00 2001
From: Ricardo Cañuelo <ricardo.canuelo@collabora.com>
Date: Wed, 10 Aug 2022 11:40:03 +0200
Subject: virtio: kerneldocs fixes and enhancements
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix variable names in some kerneldocs, naming in others.
Add kerneldocs for struct vring_desc and vring_interrupt.

Signed-off-by: Ricardo Cañuelo <ricardo.canuelo@collabora.com>
Message-Id: <20220810094004.1250-2-ricardo.canuelo@collabora.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
---
 drivers/virtio/virtio_ring.c     |  8 ++++++++
 include/linux/virtio.h           |  6 +++---
 include/linux/virtio_config.h    |  6 +++---
 include/uapi/linux/virtio_ring.h | 16 +++++++++++-----
 4 files changed, 25 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index d66c8e6d0ef3..4620e9d79dde 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -2426,6 +2426,14 @@ static inline bool more_used(const struct vring_virtqueue *vq)
 	return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
 }
 
+/**
+ * vring_interrupt - notify a virtqueue on an interrupt
+ * @irq: the IRQ number (ignored)
+ * @_vq: the struct virtqueue to notify
+ *
+ * Calls the callback function of @_vq to process the virtqueue
+ * notification.
+ */
 irqreturn_t vring_interrupt(int irq, void *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index a3f73bb6733e..dcab9c7e8784 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -11,7 +11,7 @@
 #include <linux/gfp.h>
 
 /**
- * virtqueue - a queue to register buffers for sending or receiving.
+ * struct virtqueue - a queue to register buffers for sending or receiving.
  * @list: the chain of virtqueues for this device
  * @callback: the function to call when buffers are consumed (can be NULL).
  * @name: the name of this virtqueue (mainly for debugging)
@@ -97,7 +97,7 @@ int virtqueue_resize(struct virtqueue *vq, u32 num,
 		     void (*recycle)(struct virtqueue *vq, void *buf));
 
 /**
- * virtio_device - representation of a device using virtio
+ * struct virtio_device - representation of a device using virtio
  * @index: unique position on the virtio bus
  * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
  * @config_enabled: configuration change reporting enabled
@@ -156,7 +156,7 @@ size_t virtio_max_dma_size(struct virtio_device *vdev);
 	list_for_each_entry(vq, &vdev->vqs, list)
 
 /**
- * virtio_driver - operations for a virtio I/O driver
+ * struct virtio_driver - operations for a virtio I/O driver
  * @driver: underlying device driver (populate name and owner).
  * @id_table: the ids serviced by this driver.
  * @feature_table: an array of feature numbers supported by this driver.
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 36ec7be1f480..4b517649cfe8 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -239,7 +239,7 @@ int virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs,
 
 /**
  * virtio_synchronize_cbs - synchronize with virtqueue callbacks
- * @vdev: the device
+ * @dev: the virtio device
  */
 static inline
 void virtio_synchronize_cbs(struct virtio_device *dev)
@@ -258,7 +258,7 @@ void virtio_synchronize_cbs(struct virtio_device *dev)
 
 /**
  * virtio_device_ready - enable vq use in probe function
- * @vdev: the device
+ * @dev: the virtio device
  *
  * Driver must call this to use vqs in the probe function.
  *
@@ -306,7 +306,7 @@ const char *virtio_bus_name(struct virtio_device *vdev)
 /**
  * virtqueue_set_affinity - setting affinity for a virtqueue
  * @vq: the virtqueue
- * @cpu: the cpu no.
+ * @cpu_mask: the cpu mask
  *
  * Pay attention the function are best-effort: the affinity hint may not be set
  * due to config support, irq type and sharing.
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index 476d3e5c0fe7..f8c20d3de8da 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -93,15 +93,21 @@
 #define VRING_USED_ALIGN_SIZE 4
 #define VRING_DESC_ALIGN_SIZE 16
 
-/* Virtio ring descriptors: 16 bytes.  These can chain together via "next". */
+/**
+ * struct vring_desc - Virtio ring descriptors,
+ * 16 bytes long. These can chain together via @next.
+ *
+ * @addr: buffer address (guest-physical)
+ * @len: buffer length
+ * @flags: descriptor flags
+ * @next: index of the next descriptor in the chain,
+ *        if the VRING_DESC_F_NEXT flag is set. We chain unused
+ *        descriptors via this, too.
+ */
 struct vring_desc {
-	/* Address (guest-physical). */
 	__virtio64 addr;
-	/* Length. */
 	__virtio32 len;
-	/* The flags as indicated above. */
 	__virtio16 flags;
-	/* We chain unused descriptors via this, too */
 	__virtio16 next;
 };
 
-- 
cgit 


From 415d832497098030241605c52ea83d4e2cfa7879 Mon Sep 17 00:00:00 2001
From: Hector Martin <marcan@marcan.st>
Date: Tue, 16 Aug 2022 16:03:11 +0900
Subject: locking/atomic: Make test_and_*_bit() ordered on failure

These operations are documented as always ordered in
include/asm-generic/bitops/instrumented-atomic.h, and producer-consumer
type use cases where one side needs to ensure a flag is left pending
after some shared data was updated rely on this ordering, even in the
failure case.

This is the case with the workqueue code, which currently suffers from a
reproducible ordering violation on Apple M1 platforms (which are
notoriously out-of-order) that ends up causing the TTY layer to fail to
deliver data to userspace properly under the right conditions.  This
change fixes that bug.

Change the documentation to restrict the "no order on failure" story to
the _lock() variant (for which it makes sense), and remove the
early-exit from the generic implementation, which is what causes the
missing barrier semantics in that case.  Without this, the remaining
atomic op is fully ordered (including on ARM64 LSE, as of recent
versions of the architecture spec).

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: stable@vger.kernel.org
Fixes: e986a0d6cb36 ("locking/atomics, asm-generic/bitops/atomic.h: Rewrite using atomic_*() APIs")
Fixes: 61e02392d3c7 ("locking/atomic/bitops: Document and clarify ordering semantics for failed test_and_{}_bit()")
Signed-off-by: Hector Martin <marcan@marcan.st>
Acked-by: Will Deacon <will@kernel.org>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/atomic_bitops.txt     | 2 +-
 include/asm-generic/bitops/atomic.h | 6 ------
 2 files changed, 1 insertion(+), 7 deletions(-)

(limited to 'include')

diff --git a/Documentation/atomic_bitops.txt b/Documentation/atomic_bitops.txt
index 093cdaefdb37..d8b101c97031 100644
--- a/Documentation/atomic_bitops.txt
+++ b/Documentation/atomic_bitops.txt
@@ -59,7 +59,7 @@ Like with atomic_t, the rule of thumb is:
  - RMW operations that have a return value are fully ordered.
 
  - RMW operations that are conditional are unordered on FAILURE,
-   otherwise the above rules apply. In the case of test_and_{}_bit() operations,
+   otherwise the above rules apply. In the case of test_and_set_bit_lock(),
    if the bit in memory is unchanged by the operation then it is deemed to have
    failed.
 
diff --git a/include/asm-generic/bitops/atomic.h b/include/asm-generic/bitops/atomic.h
index 3096f086b5a3..71ab4ba9c25d 100644
--- a/include/asm-generic/bitops/atomic.h
+++ b/include/asm-generic/bitops/atomic.h
@@ -39,9 +39,6 @@ arch_test_and_set_bit(unsigned int nr, volatile unsigned long *p)
 	unsigned long mask = BIT_MASK(nr);
 
 	p += BIT_WORD(nr);
-	if (READ_ONCE(*p) & mask)
-		return 1;
-
 	old = arch_atomic_long_fetch_or(mask, (atomic_long_t *)p);
 	return !!(old & mask);
 }
@@ -53,9 +50,6 @@ arch_test_and_clear_bit(unsigned int nr, volatile unsigned long *p)
 	unsigned long mask = BIT_MASK(nr);
 
 	p += BIT_WORD(nr);
-	if (!(READ_ONCE(*p) & mask))
-		return 0;
-
 	old = arch_atomic_long_fetch_andnot(mask, (atomic_long_t *)p);
 	return !!(old & mask);
 }
-- 
cgit 


From 34f667634a0dcee26d9873ab1c819bdd6cdf485a Mon Sep 17 00:00:00 2001
From: Hamza Mahfooz <hamza.mahfooz@amd.com>
Date: Thu, 4 Aug 2022 14:06:55 -0400
Subject: drm/dp_mst: add passthrough_aux to struct drm_dp_mst_port

Currently, there is no way to identify if DSC pass-through can be
enabled and what aux DSC pass-through requests ought to be sent to. So,
add a variable to struct drm_dp_mst_port that keeps track of the
aforementioned information.

Reviewed-by: Lyude Paul <lyude@redhat.com>
Signed-off-by: Hamza Mahfooz <hamza.mahfooz@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/display/drm_dp_mst_topology.c | 4 +++-
 include/drm/display/drm_dp.h                  | 1 +
 include/drm/display/drm_dp_mst_helper.h       | 3 +++
 3 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/display/drm_dp_mst_topology.c b/drivers/gpu/drm/display/drm_dp_mst_topology.c
index 57e65423e50d..3ffa0e893617 100644
--- a/drivers/gpu/drm/display/drm_dp_mst_topology.c
+++ b/drivers/gpu/drm/display/drm_dp_mst_topology.c
@@ -5908,8 +5908,10 @@ struct drm_dp_aux *drm_dp_mst_dsc_aux_for_port(struct drm_dp_mst_port *port)
 		/* Enpoint decompression with DP-to-DP peer device */
 		if ((endpoint_dsc & DP_DSC_DECOMPRESSION_IS_SUPPORTED) &&
 		    (endpoint_fec & DP_FEC_CAPABLE) &&
-		    (upstream_dsc & 0x2) /* DSC passthrough */)
+		    (upstream_dsc & DP_DSC_PASSTHROUGH_IS_SUPPORTED)) {
+			port->passthrough_aux = &immediate_upstream_port->aux;
 			return &port->aux;
+		}
 
 		/* Virtual DPCD decompression with DP-to-DP peer device */
 		return &immediate_upstream_port->aux;
diff --git a/include/drm/display/drm_dp.h b/include/drm/display/drm_dp.h
index 9e3aff7e68bb..4d0abe4c7ea9 100644
--- a/include/drm/display/drm_dp.h
+++ b/include/drm/display/drm_dp.h
@@ -239,6 +239,7 @@
 
 #define DP_DSC_SUPPORT                      0x060   /* DP 1.4 */
 # define DP_DSC_DECOMPRESSION_IS_SUPPORTED  (1 << 0)
+# define DP_DSC_PASSTHROUGH_IS_SUPPORTED    (1 << 1)
 
 #define DP_DSC_REV                          0x061
 # define DP_DSC_MAJOR_MASK                  (0xf << 0)
diff --git a/include/drm/display/drm_dp_mst_helper.h b/include/drm/display/drm_dp_mst_helper.h
index 10adec068b7f..4a39c95f8afd 100644
--- a/include/drm/display/drm_dp_mst_helper.h
+++ b/include/drm/display/drm_dp_mst_helper.h
@@ -86,6 +86,8 @@ struct drm_dp_vcpi {
  * @next: link to next port on this branch device
  * @aux: i2c aux transport to talk to device connected to this port, protected
  * by &drm_dp_mst_topology_mgr.base.lock.
+ * @passthrough_aux: parent aux to which DSC pass-through requests should be
+ * sent, only set if DSC pass-through is possible.
  * @parent: branch device parent of this port
  * @vcpi: Virtual Channel Payload info for this port.
  * @connector: DRM connector this port is connected to. Protected by
@@ -140,6 +142,7 @@ struct drm_dp_mst_port {
 	 */
 	struct drm_dp_mst_branch *mstb;
 	struct drm_dp_aux aux; /* i2c bus for this port? */
+	struct drm_dp_aux *passthrough_aux;
 	struct drm_dp_mst_branch *parent;
 
 	struct drm_dp_vcpi vcpi;
-- 
cgit 


From fc4aaf9fb3c99bcb326d52f9d320ed5680bd1cee Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 16 Aug 2022 10:34:40 +0100
Subject: net: Fix suspicious RCU usage in bpf_sk_reuseport_detach()

bpf_sk_reuseport_detach() calls __rcu_dereference_sk_user_data_with_flags()
to obtain the value of sk->sk_user_data, but that function is only usable
if the RCU read lock is held, and neither that function nor any of its
callers hold it.

Fix this by adding a new helper, __locked_read_sk_user_data_with_flags()
that checks to see if sk->sk_callback_lock() is held and use that here
instead.

Alternatively, making __rcu_dereference_sk_user_data_with_flags() use
rcu_dereference_checked() might suffice.

Without this, the following warning can be occasionally observed:

=============================
WARNING: suspicious RCU usage
6.0.0-rc1-build2+ #563 Not tainted
-----------------------------
include/net/sock.h:592 suspicious rcu_dereference_check() usage!

other info that might help us debug this:

rcu_scheduler_active = 2, debug_locks = 1
5 locks held by locktest/29873:
 #0: ffff88812734b550 (&sb->s_type->i_mutex_key#9){+.+.}-{3:3}, at: __sock_release+0x77/0x121
 #1: ffff88812f5621b0 (sk_lock-AF_INET){+.+.}-{0:0}, at: tcp_close+0x1c/0x70
 #2: ffff88810312f5c8 (&h->lhash2[i].lock){+.+.}-{2:2}, at: inet_unhash+0x76/0x1c0
 #3: ffffffff83768bb8 (reuseport_lock){+...}-{2:2}, at: reuseport_detach_sock+0x18/0xdd
 #4: ffff88812f562438 (clock-AF_INET){++..}-{2:2}, at: bpf_sk_reuseport_detach+0x24/0xa4

stack backtrace:
CPU: 1 PID: 29873 Comm: locktest Not tainted 6.0.0-rc1-build2+ #563
Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014
Call Trace:
 <TASK>
 dump_stack_lvl+0x4c/0x5f
 bpf_sk_reuseport_detach+0x6d/0xa4
 reuseport_detach_sock+0x75/0xdd
 inet_unhash+0xa5/0x1c0
 tcp_set_state+0x169/0x20f
 ? lockdep_sock_is_held+0x3a/0x3a
 ? __lock_release.isra.0+0x13e/0x220
 ? reacquire_held_locks+0x1bb/0x1bb
 ? hlock_class+0x31/0x96
 ? mark_lock+0x9e/0x1af
 __tcp_close+0x50/0x4b6
 tcp_close+0x28/0x70
 inet_release+0x8e/0xa7
 __sock_release+0x95/0x121
 sock_close+0x14/0x17
 __fput+0x20f/0x36a
 task_work_run+0xa3/0xcc
 exit_to_user_mode_prepare+0x9c/0x14d
 syscall_exit_to_user_mode+0x18/0x44
 entry_SYSCALL_64_after_hwframe+0x63/0xcd

Fixes: cf8c1e967224 ("net: refactor bpf_sk_reuseport_detach()")
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Hawkins Jiawei <yin31149@gmail.com>
Link: https://lore.kernel.org/r/166064248071.3502205.10036394558814861778.stgit@warthog.procyon.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h           | 25 +++++++++++++++++++++++++
 kernel/bpf/reuseport_array.c |  2 +-
 2 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 05a1bbdf5805..d08cfe190a78 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -577,6 +577,31 @@ static inline bool sk_user_data_is_nocopy(const struct sock *sk)
 
 #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))
 
+/**
+ * __locked_read_sk_user_data_with_flags - return the pointer
+ * only if argument flags all has been set in sk_user_data. Otherwise
+ * return NULL
+ *
+ * @sk: socket
+ * @flags: flag bits
+ *
+ * The caller must be holding sk->sk_callback_lock.
+ */
+static inline void *
+__locked_read_sk_user_data_with_flags(const struct sock *sk,
+				      uintptr_t flags)
+{
+	uintptr_t sk_user_data =
+		(uintptr_t)rcu_dereference_check(__sk_user_data(sk),
+						 lockdep_is_held(&sk->sk_callback_lock));
+
+	WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK);
+
+	if ((sk_user_data & flags) == flags)
+		return (void *)(sk_user_data & SK_USER_DATA_PTRMASK);
+	return NULL;
+}
+
 /**
  * __rcu_dereference_sk_user_data_with_flags - return the pointer
  * only if argument flags all has been set in sk_user_data. Otherwise
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 85fa9dbfa8bf..82c61612f382 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -24,7 +24,7 @@ void bpf_sk_reuseport_detach(struct sock *sk)
 	struct sock __rcu **socks;
 
 	write_lock_bh(&sk->sk_callback_lock);
-	socks = __rcu_dereference_sk_user_data_with_flags(sk, SK_USER_DATA_BPF);
+	socks = __locked_read_sk_user_data_with_flags(sk, SK_USER_DATA_BPF);
 	if (socks) {
 		WRITE_ONCE(sk->sk_user_data, NULL);
 		/*
-- 
cgit 


From 5152de7b79ab0be150f5966481b0c8f996192531 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 16 Aug 2022 16:53:46 +0300
Subject: net: mscc: ocelot: fix incorrect ndo_get_stats64 packet counters

Reading stats using the SYS_COUNT_* register definitions is only used by
ocelot_get_stats64() from the ocelot switchdev driver, however,
currently the bucket definitions are incorrect.

Separately, on both RX and TX, we have the following problems:
- a 256-1023 bucket which actually tracks the 256-511 packets
- the 1024-1526 bucket actually tracks the 512-1023 packets
- the 1527-max bucket actually tracks the 1024-1526 packets

=> nobody tracks the packets from the real 1527-max bucket

Additionally, the RX_PAUSE, RX_CONTROL, RX_LONGS and RX_CLASSIFIED_DROPS
all track the wrong thing. However this doesn't seem to have any
consequence, since ocelot_get_stats64() doesn't use these.

Even though this problem only manifests itself for the switchdev driver,
we cannot split the fix for ocelot and for DSA, since it requires fixing
the bucket definitions from enum ocelot_reg, which makes us necessarily
adapt the structures from felix and seville as well.

Fixes: 84705fc16552 ("net: dsa: felix: introduce support for Seville VSC9953 switch")
Fixes: 56051948773e ("net: dsa: ocelot: add driver for Felix switch family")
Fixes: a556c76adc05 ("net: mscc: Add initial Ocelot switch support")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/ocelot/felix_vsc9959.c   | 20 ++++++++++++--------
 drivers/net/dsa/ocelot/seville_vsc9953.c | 16 +++++++++-------
 drivers/net/ethernet/mscc/ocelot_net.c   |  6 ++++--
 drivers/net/ethernet/mscc/vsc7514_regs.c | 24 +++++++++++++-----------
 include/soc/mscc/ocelot.h                |  6 ++++--
 5 files changed, 42 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c
index 5859ef3b242c..e1ebe21cad00 100644
--- a/drivers/net/dsa/ocelot/felix_vsc9959.c
+++ b/drivers/net/dsa/ocelot/felix_vsc9959.c
@@ -281,19 +281,23 @@ static const u32 vsc9959_sys_regmap[] = {
 	REG(SYS_COUNT_RX_64,			0x000024),
 	REG(SYS_COUNT_RX_65_127,		0x000028),
 	REG(SYS_COUNT_RX_128_255,		0x00002c),
-	REG(SYS_COUNT_RX_256_1023,		0x000030),
-	REG(SYS_COUNT_RX_1024_1526,		0x000034),
-	REG(SYS_COUNT_RX_1527_MAX,		0x000038),
-	REG(SYS_COUNT_RX_LONGS,			0x000044),
+	REG(SYS_COUNT_RX_256_511,		0x000030),
+	REG(SYS_COUNT_RX_512_1023,		0x000034),
+	REG(SYS_COUNT_RX_1024_1526,		0x000038),
+	REG(SYS_COUNT_RX_1527_MAX,		0x00003c),
+	REG(SYS_COUNT_RX_PAUSE,			0x000040),
+	REG(SYS_COUNT_RX_CONTROL,		0x000044),
+	REG(SYS_COUNT_RX_LONGS,			0x000048),
 	REG(SYS_COUNT_TX_OCTETS,		0x000200),
 	REG(SYS_COUNT_TX_COLLISION,		0x000210),
 	REG(SYS_COUNT_TX_DROPS,			0x000214),
 	REG(SYS_COUNT_TX_64,			0x00021c),
 	REG(SYS_COUNT_TX_65_127,		0x000220),
-	REG(SYS_COUNT_TX_128_511,		0x000224),
-	REG(SYS_COUNT_TX_512_1023,		0x000228),
-	REG(SYS_COUNT_TX_1024_1526,		0x00022c),
-	REG(SYS_COUNT_TX_1527_MAX,		0x000230),
+	REG(SYS_COUNT_TX_128_255,		0x000224),
+	REG(SYS_COUNT_TX_256_511,		0x000228),
+	REG(SYS_COUNT_TX_512_1023,		0x00022c),
+	REG(SYS_COUNT_TX_1024_1526,		0x000230),
+	REG(SYS_COUNT_TX_1527_MAX,		0x000234),
 	REG(SYS_COUNT_TX_AGING,			0x000278),
 	REG(SYS_RESET_CFG,			0x000e00),
 	REG(SYS_SR_ETYPE_CFG,			0x000e04),
diff --git a/drivers/net/dsa/ocelot/seville_vsc9953.c b/drivers/net/dsa/ocelot/seville_vsc9953.c
index ea0649211356..ebe9ddbbe2b7 100644
--- a/drivers/net/dsa/ocelot/seville_vsc9953.c
+++ b/drivers/net/dsa/ocelot/seville_vsc9953.c
@@ -277,19 +277,21 @@ static const u32 vsc9953_sys_regmap[] = {
 	REG(SYS_COUNT_RX_64,			0x000024),
 	REG(SYS_COUNT_RX_65_127,		0x000028),
 	REG(SYS_COUNT_RX_128_255,		0x00002c),
-	REG(SYS_COUNT_RX_256_1023,		0x000030),
-	REG(SYS_COUNT_RX_1024_1526,		0x000034),
-	REG(SYS_COUNT_RX_1527_MAX,		0x000038),
+	REG(SYS_COUNT_RX_256_511,		0x000030),
+	REG(SYS_COUNT_RX_512_1023,		0x000034),
+	REG(SYS_COUNT_RX_1024_1526,		0x000038),
+	REG(SYS_COUNT_RX_1527_MAX,		0x00003c),
 	REG(SYS_COUNT_RX_LONGS,			0x000048),
 	REG(SYS_COUNT_TX_OCTETS,		0x000100),
 	REG(SYS_COUNT_TX_COLLISION,		0x000110),
 	REG(SYS_COUNT_TX_DROPS,			0x000114),
 	REG(SYS_COUNT_TX_64,			0x00011c),
 	REG(SYS_COUNT_TX_65_127,		0x000120),
-	REG(SYS_COUNT_TX_128_511,		0x000124),
-	REG(SYS_COUNT_TX_512_1023,		0x000128),
-	REG(SYS_COUNT_TX_1024_1526,		0x00012c),
-	REG(SYS_COUNT_TX_1527_MAX,		0x000130),
+	REG(SYS_COUNT_TX_128_255,		0x000124),
+	REG(SYS_COUNT_TX_256_511,		0x000128),
+	REG(SYS_COUNT_TX_512_1023,		0x00012c),
+	REG(SYS_COUNT_TX_1024_1526,		0x000130),
+	REG(SYS_COUNT_TX_1527_MAX,		0x000134),
 	REG(SYS_COUNT_TX_AGING,			0x000178),
 	REG(SYS_RESET_CFG,			0x000318),
 	REG_RESERVED(SYS_SR_ETYPE_CFG),
diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
index 5e6136e80282..9d8cea16245e 100644
--- a/drivers/net/ethernet/mscc/ocelot_net.c
+++ b/drivers/net/ethernet/mscc/ocelot_net.c
@@ -739,7 +739,8 @@ static void ocelot_get_stats64(struct net_device *dev,
 			    ocelot_read(ocelot, SYS_COUNT_RX_64) +
 			    ocelot_read(ocelot, SYS_COUNT_RX_65_127) +
 			    ocelot_read(ocelot, SYS_COUNT_RX_128_255) +
-			    ocelot_read(ocelot, SYS_COUNT_RX_256_1023) +
+			    ocelot_read(ocelot, SYS_COUNT_RX_256_511) +
+			    ocelot_read(ocelot, SYS_COUNT_RX_512_1023) +
 			    ocelot_read(ocelot, SYS_COUNT_RX_1024_1526) +
 			    ocelot_read(ocelot, SYS_COUNT_RX_1527_MAX);
 	stats->multicast = ocelot_read(ocelot, SYS_COUNT_RX_MULTICAST);
@@ -749,7 +750,8 @@ static void ocelot_get_stats64(struct net_device *dev,
 	stats->tx_bytes = ocelot_read(ocelot, SYS_COUNT_TX_OCTETS);
 	stats->tx_packets = ocelot_read(ocelot, SYS_COUNT_TX_64) +
 			    ocelot_read(ocelot, SYS_COUNT_TX_65_127) +
-			    ocelot_read(ocelot, SYS_COUNT_TX_128_511) +
+			    ocelot_read(ocelot, SYS_COUNT_TX_128_255) +
+			    ocelot_read(ocelot, SYS_COUNT_TX_256_511) +
 			    ocelot_read(ocelot, SYS_COUNT_TX_512_1023) +
 			    ocelot_read(ocelot, SYS_COUNT_TX_1024_1526) +
 			    ocelot_read(ocelot, SYS_COUNT_TX_1527_MAX);
diff --git a/drivers/net/ethernet/mscc/vsc7514_regs.c b/drivers/net/ethernet/mscc/vsc7514_regs.c
index c2af4eb8ca5d..38ab20b48cd4 100644
--- a/drivers/net/ethernet/mscc/vsc7514_regs.c
+++ b/drivers/net/ethernet/mscc/vsc7514_regs.c
@@ -180,13 +180,14 @@ const u32 vsc7514_sys_regmap[] = {
 	REG(SYS_COUNT_RX_64,				0x000024),
 	REG(SYS_COUNT_RX_65_127,			0x000028),
 	REG(SYS_COUNT_RX_128_255,			0x00002c),
-	REG(SYS_COUNT_RX_256_1023,			0x000030),
-	REG(SYS_COUNT_RX_1024_1526,			0x000034),
-	REG(SYS_COUNT_RX_1527_MAX,			0x000038),
-	REG(SYS_COUNT_RX_PAUSE,				0x00003c),
-	REG(SYS_COUNT_RX_CONTROL,			0x000040),
-	REG(SYS_COUNT_RX_LONGS,				0x000044),
-	REG(SYS_COUNT_RX_CLASSIFIED_DROPS,		0x000048),
+	REG(SYS_COUNT_RX_256_511,			0x000030),
+	REG(SYS_COUNT_RX_512_1023,			0x000034),
+	REG(SYS_COUNT_RX_1024_1526,			0x000038),
+	REG(SYS_COUNT_RX_1527_MAX,			0x00003c),
+	REG(SYS_COUNT_RX_PAUSE,				0x000040),
+	REG(SYS_COUNT_RX_CONTROL,			0x000044),
+	REG(SYS_COUNT_RX_LONGS,				0x000048),
+	REG(SYS_COUNT_RX_CLASSIFIED_DROPS,		0x00004c),
 	REG(SYS_COUNT_TX_OCTETS,			0x000100),
 	REG(SYS_COUNT_TX_UNICAST,			0x000104),
 	REG(SYS_COUNT_TX_MULTICAST,			0x000108),
@@ -196,10 +197,11 @@ const u32 vsc7514_sys_regmap[] = {
 	REG(SYS_COUNT_TX_PAUSE,				0x000118),
 	REG(SYS_COUNT_TX_64,				0x00011c),
 	REG(SYS_COUNT_TX_65_127,			0x000120),
-	REG(SYS_COUNT_TX_128_511,			0x000124),
-	REG(SYS_COUNT_TX_512_1023,			0x000128),
-	REG(SYS_COUNT_TX_1024_1526,			0x00012c),
-	REG(SYS_COUNT_TX_1527_MAX,			0x000130),
+	REG(SYS_COUNT_TX_128_255,			0x000124),
+	REG(SYS_COUNT_TX_256_511,			0x000128),
+	REG(SYS_COUNT_TX_512_1023,			0x00012c),
+	REG(SYS_COUNT_TX_1024_1526,			0x000130),
+	REG(SYS_COUNT_TX_1527_MAX,			0x000134),
 	REG(SYS_COUNT_TX_AGING,				0x000170),
 	REG(SYS_RESET_CFG,				0x000508),
 	REG(SYS_CMID,					0x00050c),
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index ac151ecc7f19..e7e5b06deb2d 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -335,7 +335,8 @@ enum ocelot_reg {
 	SYS_COUNT_RX_64,
 	SYS_COUNT_RX_65_127,
 	SYS_COUNT_RX_128_255,
-	SYS_COUNT_RX_256_1023,
+	SYS_COUNT_RX_256_511,
+	SYS_COUNT_RX_512_1023,
 	SYS_COUNT_RX_1024_1526,
 	SYS_COUNT_RX_1527_MAX,
 	SYS_COUNT_RX_PAUSE,
@@ -351,7 +352,8 @@ enum ocelot_reg {
 	SYS_COUNT_TX_PAUSE,
 	SYS_COUNT_TX_64,
 	SYS_COUNT_TX_65_127,
-	SYS_COUNT_TX_128_511,
+	SYS_COUNT_TX_128_255,
+	SYS_COUNT_TX_256_511,
 	SYS_COUNT_TX_512_1023,
 	SYS_COUNT_TX_1024_1526,
 	SYS_COUNT_TX_1527_MAX,
-- 
cgit 


From 22d842e3efe56402c33b5e6e303bb71ce9bf9334 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 16 Aug 2022 16:53:48 +0300
Subject: net: mscc: ocelot: turn stats_lock into a spinlock

ocelot_get_stats64() currently runs unlocked and therefore may collide
with ocelot_port_update_stats() which indirectly accesses the same
counters. However, ocelot_get_stats64() runs in atomic context, and we
cannot simply take the sleepable ocelot->stats_lock mutex. We need to
convert it to an atomic spinlock first. Do that as a preparatory change.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/ocelot/felix_vsc9959.c |  4 ++--
 drivers/net/ethernet/mscc/ocelot.c     | 11 +++++------
 include/soc/mscc/ocelot.h              |  2 +-
 3 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c
index e1ebe21cad00..46fd6cd0d8f3 100644
--- a/drivers/net/dsa/ocelot/felix_vsc9959.c
+++ b/drivers/net/dsa/ocelot/felix_vsc9959.c
@@ -2171,7 +2171,7 @@ static void vsc9959_psfp_sgi_table_del(struct ocelot *ocelot,
 static void vsc9959_psfp_counters_get(struct ocelot *ocelot, u32 index,
 				      struct felix_stream_filter_counters *counters)
 {
-	mutex_lock(&ocelot->stats_lock);
+	spin_lock(&ocelot->stats_lock);
 
 	ocelot_rmw(ocelot, SYS_STAT_CFG_STAT_VIEW(index),
 		   SYS_STAT_CFG_STAT_VIEW_M,
@@ -2188,7 +2188,7 @@ static void vsc9959_psfp_counters_get(struct ocelot *ocelot, u32 index,
 		     SYS_STAT_CFG_STAT_CLEAR_SHOT(0x10),
 		     SYS_STAT_CFG);
 
-	mutex_unlock(&ocelot->stats_lock);
+	spin_unlock(&ocelot->stats_lock);
 }
 
 static int vsc9959_psfp_filter_add(struct ocelot *ocelot, int port,
diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index d4649e4ee0e7..c67f162f8ab5 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -1906,13 +1906,13 @@ static void ocelot_check_stats_work(struct work_struct *work)
 					     stats_work);
 	int i, err;
 
-	mutex_lock(&ocelot->stats_lock);
+	spin_lock(&ocelot->stats_lock);
 	for (i = 0; i < ocelot->num_phys_ports; i++) {
 		err = ocelot_port_update_stats(ocelot, i);
 		if (err)
 			break;
 	}
-	mutex_unlock(&ocelot->stats_lock);
+	spin_unlock(&ocelot->stats_lock);
 
 	if (err)
 		dev_err(ocelot->dev, "Error %d updating ethtool stats\n",  err);
@@ -1925,7 +1925,7 @@ void ocelot_get_ethtool_stats(struct ocelot *ocelot, int port, u64 *data)
 {
 	int i, err;
 
-	mutex_lock(&ocelot->stats_lock);
+	spin_lock(&ocelot->stats_lock);
 
 	/* check and update now */
 	err = ocelot_port_update_stats(ocelot, port);
@@ -1934,7 +1934,7 @@ void ocelot_get_ethtool_stats(struct ocelot *ocelot, int port, u64 *data)
 	for (i = 0; i < ocelot->num_stats; i++)
 		*data++ = ocelot->stats[port * ocelot->num_stats + i];
 
-	mutex_unlock(&ocelot->stats_lock);
+	spin_unlock(&ocelot->stats_lock);
 
 	if (err)
 		dev_err(ocelot->dev, "Error %d updating ethtool stats\n", err);
@@ -3363,7 +3363,7 @@ int ocelot_init(struct ocelot *ocelot)
 	if (!ocelot->stats)
 		return -ENOMEM;
 
-	mutex_init(&ocelot->stats_lock);
+	spin_lock_init(&ocelot->stats_lock);
 	mutex_init(&ocelot->ptp_lock);
 	mutex_init(&ocelot->mact_lock);
 	mutex_init(&ocelot->fwd_domain_lock);
@@ -3511,7 +3511,6 @@ void ocelot_deinit(struct ocelot *ocelot)
 	cancel_delayed_work(&ocelot->stats_work);
 	destroy_workqueue(ocelot->stats_queue);
 	destroy_workqueue(ocelot->owq);
-	mutex_destroy(&ocelot->stats_lock);
 }
 EXPORT_SYMBOL(ocelot_deinit);
 
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index e7e5b06deb2d..72b9474391da 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -752,7 +752,7 @@ struct ocelot {
 	struct ocelot_psfp_list		psfp;
 
 	/* Workqueue to check statistics for overflow with its lock */
-	struct mutex			stats_lock;
+	spinlock_t			stats_lock;
 	u64				*stats;
 	struct delayed_work		stats_work;
 	struct workqueue_struct		*stats_queue;
-- 
cgit 


From 9190460084ddd0e9235f55eab0fdd5456b5f2fd5 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 16 Aug 2022 16:53:50 +0300
Subject: net: mscc: ocelot: make struct ocelot_stat_layout array indexable

The ocelot counters are 32-bit and require periodic reading, every 2
seconds, by ocelot_port_update_stats(), so that wraparounds are
detected.

Currently, the counters reported by ocelot_get_stats64() come from the
32-bit hardware counters directly, rather than from the 64-bit
accumulated ocelot->stats, and this is a problem for their integrity.

The strategy is to make ocelot_get_stats64() able to cherry-pick
individual stats from ocelot->stats the way in which it currently reads
them out from SYS_COUNT_* registers. But currently it can't, because
ocelot->stats is an opaque u64 array that's used only to feed data into
ethtool -S.

To solve that problem, we need to make ocelot->stats indexable, and
associate each element with an element of struct ocelot_stat_layout used
by ethtool -S.

This makes ocelot_stat_layout a fat (and possibly sparse) array, so we
need to change the way in which we access it. We no longer need
OCELOT_STAT_END as a sentinel, because we know the array's size
(OCELOT_NUM_STATS). We just need to skip the array elements that were
left unpopulated for the switch revision (ocelot, felix, seville).

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/ocelot/felix_vsc9959.c     | 468 +++++++++++++++++++++++------
 drivers/net/dsa/ocelot/seville_vsc9953.c   | 468 +++++++++++++++++++++++------
 drivers/net/ethernet/mscc/ocelot.c         |  40 ++-
 drivers/net/ethernet/mscc/ocelot_vsc7514.c | 468 +++++++++++++++++++++++------
 include/soc/mscc/ocelot.h                  | 105 ++++++-
 5 files changed, 1243 insertions(+), 306 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c
index 46fd6cd0d8f3..c9f270f24b1c 100644
--- a/drivers/net/dsa/ocelot/felix_vsc9959.c
+++ b/drivers/net/dsa/ocelot/felix_vsc9959.c
@@ -551,101 +551,379 @@ static const struct reg_field vsc9959_regfields[REGFIELD_MAX] = {
 	[SYS_PAUSE_CFG_PAUSE_ENA] = REG_FIELD_ID(SYS_PAUSE_CFG, 0, 1, 7, 4),
 };
 
-static const struct ocelot_stat_layout vsc9959_stats_layout[] = {
-	{ .offset = 0x00,	.name = "rx_octets", },
-	{ .offset = 0x01,	.name = "rx_unicast", },
-	{ .offset = 0x02,	.name = "rx_multicast", },
-	{ .offset = 0x03,	.name = "rx_broadcast", },
-	{ .offset = 0x04,	.name = "rx_shorts", },
-	{ .offset = 0x05,	.name = "rx_fragments", },
-	{ .offset = 0x06,	.name = "rx_jabbers", },
-	{ .offset = 0x07,	.name = "rx_crc_align_errs", },
-	{ .offset = 0x08,	.name = "rx_sym_errs", },
-	{ .offset = 0x09,	.name = "rx_frames_below_65_octets", },
-	{ .offset = 0x0A,	.name = "rx_frames_65_to_127_octets", },
-	{ .offset = 0x0B,	.name = "rx_frames_128_to_255_octets", },
-	{ .offset = 0x0C,	.name = "rx_frames_256_to_511_octets", },
-	{ .offset = 0x0D,	.name = "rx_frames_512_to_1023_octets", },
-	{ .offset = 0x0E,	.name = "rx_frames_1024_to_1526_octets", },
-	{ .offset = 0x0F,	.name = "rx_frames_over_1526_octets", },
-	{ .offset = 0x10,	.name = "rx_pause", },
-	{ .offset = 0x11,	.name = "rx_control", },
-	{ .offset = 0x12,	.name = "rx_longs", },
-	{ .offset = 0x13,	.name = "rx_classified_drops", },
-	{ .offset = 0x14,	.name = "rx_red_prio_0", },
-	{ .offset = 0x15,	.name = "rx_red_prio_1", },
-	{ .offset = 0x16,	.name = "rx_red_prio_2", },
-	{ .offset = 0x17,	.name = "rx_red_prio_3", },
-	{ .offset = 0x18,	.name = "rx_red_prio_4", },
-	{ .offset = 0x19,	.name = "rx_red_prio_5", },
-	{ .offset = 0x1A,	.name = "rx_red_prio_6", },
-	{ .offset = 0x1B,	.name = "rx_red_prio_7", },
-	{ .offset = 0x1C,	.name = "rx_yellow_prio_0", },
-	{ .offset = 0x1D,	.name = "rx_yellow_prio_1", },
-	{ .offset = 0x1E,	.name = "rx_yellow_prio_2", },
-	{ .offset = 0x1F,	.name = "rx_yellow_prio_3", },
-	{ .offset = 0x20,	.name = "rx_yellow_prio_4", },
-	{ .offset = 0x21,	.name = "rx_yellow_prio_5", },
-	{ .offset = 0x22,	.name = "rx_yellow_prio_6", },
-	{ .offset = 0x23,	.name = "rx_yellow_prio_7", },
-	{ .offset = 0x24,	.name = "rx_green_prio_0", },
-	{ .offset = 0x25,	.name = "rx_green_prio_1", },
-	{ .offset = 0x26,	.name = "rx_green_prio_2", },
-	{ .offset = 0x27,	.name = "rx_green_prio_3", },
-	{ .offset = 0x28,	.name = "rx_green_prio_4", },
-	{ .offset = 0x29,	.name = "rx_green_prio_5", },
-	{ .offset = 0x2A,	.name = "rx_green_prio_6", },
-	{ .offset = 0x2B,	.name = "rx_green_prio_7", },
-	{ .offset = 0x80,	.name = "tx_octets", },
-	{ .offset = 0x81,	.name = "tx_unicast", },
-	{ .offset = 0x82,	.name = "tx_multicast", },
-	{ .offset = 0x83,	.name = "tx_broadcast", },
-	{ .offset = 0x84,	.name = "tx_collision", },
-	{ .offset = 0x85,	.name = "tx_drops", },
-	{ .offset = 0x86,	.name = "tx_pause", },
-	{ .offset = 0x87,	.name = "tx_frames_below_65_octets", },
-	{ .offset = 0x88,	.name = "tx_frames_65_to_127_octets", },
-	{ .offset = 0x89,	.name = "tx_frames_128_255_octets", },
-	{ .offset = 0x8A,	.name = "tx_frames_256_511_octets", },
-	{ .offset = 0x8B,	.name = "tx_frames_512_1023_octets", },
-	{ .offset = 0x8C,	.name = "tx_frames_1024_1526_octets", },
-	{ .offset = 0x8D,	.name = "tx_frames_over_1526_octets", },
-	{ .offset = 0x8E,	.name = "tx_yellow_prio_0", },
-	{ .offset = 0x8F,	.name = "tx_yellow_prio_1", },
-	{ .offset = 0x90,	.name = "tx_yellow_prio_2", },
-	{ .offset = 0x91,	.name = "tx_yellow_prio_3", },
-	{ .offset = 0x92,	.name = "tx_yellow_prio_4", },
-	{ .offset = 0x93,	.name = "tx_yellow_prio_5", },
-	{ .offset = 0x94,	.name = "tx_yellow_prio_6", },
-	{ .offset = 0x95,	.name = "tx_yellow_prio_7", },
-	{ .offset = 0x96,	.name = "tx_green_prio_0", },
-	{ .offset = 0x97,	.name = "tx_green_prio_1", },
-	{ .offset = 0x98,	.name = "tx_green_prio_2", },
-	{ .offset = 0x99,	.name = "tx_green_prio_3", },
-	{ .offset = 0x9A,	.name = "tx_green_prio_4", },
-	{ .offset = 0x9B,	.name = "tx_green_prio_5", },
-	{ .offset = 0x9C,	.name = "tx_green_prio_6", },
-	{ .offset = 0x9D,	.name = "tx_green_prio_7", },
-	{ .offset = 0x9E,	.name = "tx_aged", },
-	{ .offset = 0x100,	.name = "drop_local", },
-	{ .offset = 0x101,	.name = "drop_tail", },
-	{ .offset = 0x102,	.name = "drop_yellow_prio_0", },
-	{ .offset = 0x103,	.name = "drop_yellow_prio_1", },
-	{ .offset = 0x104,	.name = "drop_yellow_prio_2", },
-	{ .offset = 0x105,	.name = "drop_yellow_prio_3", },
-	{ .offset = 0x106,	.name = "drop_yellow_prio_4", },
-	{ .offset = 0x107,	.name = "drop_yellow_prio_5", },
-	{ .offset = 0x108,	.name = "drop_yellow_prio_6", },
-	{ .offset = 0x109,	.name = "drop_yellow_prio_7", },
-	{ .offset = 0x10A,	.name = "drop_green_prio_0", },
-	{ .offset = 0x10B,	.name = "drop_green_prio_1", },
-	{ .offset = 0x10C,	.name = "drop_green_prio_2", },
-	{ .offset = 0x10D,	.name = "drop_green_prio_3", },
-	{ .offset = 0x10E,	.name = "drop_green_prio_4", },
-	{ .offset = 0x10F,	.name = "drop_green_prio_5", },
-	{ .offset = 0x110,	.name = "drop_green_prio_6", },
-	{ .offset = 0x111,	.name = "drop_green_prio_7", },
-	OCELOT_STAT_END
+static const struct ocelot_stat_layout vsc9959_stats_layout[OCELOT_NUM_STATS] = {
+	[OCELOT_STAT_RX_OCTETS] = {
+		.name = "rx_octets",
+		.offset = 0x00,
+	},
+	[OCELOT_STAT_RX_UNICAST] = {
+		.name = "rx_unicast",
+		.offset = 0x01,
+	},
+	[OCELOT_STAT_RX_MULTICAST] = {
+		.name = "rx_multicast",
+		.offset = 0x02,
+	},
+	[OCELOT_STAT_RX_BROADCAST] = {
+		.name = "rx_broadcast",
+		.offset = 0x03,
+	},
+	[OCELOT_STAT_RX_SHORTS] = {
+		.name = "rx_shorts",
+		.offset = 0x04,
+	},
+	[OCELOT_STAT_RX_FRAGMENTS] = {
+		.name = "rx_fragments",
+		.offset = 0x05,
+	},
+	[OCELOT_STAT_RX_JABBERS] = {
+		.name = "rx_jabbers",
+		.offset = 0x06,
+	},
+	[OCELOT_STAT_RX_CRC_ALIGN_ERRS] = {
+		.name = "rx_crc_align_errs",
+		.offset = 0x07,
+	},
+	[OCELOT_STAT_RX_SYM_ERRS] = {
+		.name = "rx_sym_errs",
+		.offset = 0x08,
+	},
+	[OCELOT_STAT_RX_64] = {
+		.name = "rx_frames_below_65_octets",
+		.offset = 0x09,
+	},
+	[OCELOT_STAT_RX_65_127] = {
+		.name = "rx_frames_65_to_127_octets",
+		.offset = 0x0A,
+	},
+	[OCELOT_STAT_RX_128_255] = {
+		.name = "rx_frames_128_to_255_octets",
+		.offset = 0x0B,
+	},
+	[OCELOT_STAT_RX_256_511] = {
+		.name = "rx_frames_256_to_511_octets",
+		.offset = 0x0C,
+	},
+	[OCELOT_STAT_RX_512_1023] = {
+		.name = "rx_frames_512_to_1023_octets",
+		.offset = 0x0D,
+	},
+	[OCELOT_STAT_RX_1024_1526] = {
+		.name = "rx_frames_1024_to_1526_octets",
+		.offset = 0x0E,
+	},
+	[OCELOT_STAT_RX_1527_MAX] = {
+		.name = "rx_frames_over_1526_octets",
+		.offset = 0x0F,
+	},
+	[OCELOT_STAT_RX_PAUSE] = {
+		.name = "rx_pause",
+		.offset = 0x10,
+	},
+	[OCELOT_STAT_RX_CONTROL] = {
+		.name = "rx_control",
+		.offset = 0x11,
+	},
+	[OCELOT_STAT_RX_LONGS] = {
+		.name = "rx_longs",
+		.offset = 0x12,
+	},
+	[OCELOT_STAT_RX_CLASSIFIED_DROPS] = {
+		.name = "rx_classified_drops",
+		.offset = 0x13,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_0] = {
+		.name = "rx_red_prio_0",
+		.offset = 0x14,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_1] = {
+		.name = "rx_red_prio_1",
+		.offset = 0x15,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_2] = {
+		.name = "rx_red_prio_2",
+		.offset = 0x16,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_3] = {
+		.name = "rx_red_prio_3",
+		.offset = 0x17,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_4] = {
+		.name = "rx_red_prio_4",
+		.offset = 0x18,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_5] = {
+		.name = "rx_red_prio_5",
+		.offset = 0x19,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_6] = {
+		.name = "rx_red_prio_6",
+		.offset = 0x1A,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_7] = {
+		.name = "rx_red_prio_7",
+		.offset = 0x1B,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_0] = {
+		.name = "rx_yellow_prio_0",
+		.offset = 0x1C,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_1] = {
+		.name = "rx_yellow_prio_1",
+		.offset = 0x1D,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_2] = {
+		.name = "rx_yellow_prio_2",
+		.offset = 0x1E,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_3] = {
+		.name = "rx_yellow_prio_3",
+		.offset = 0x1F,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_4] = {
+		.name = "rx_yellow_prio_4",
+		.offset = 0x20,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_5] = {
+		.name = "rx_yellow_prio_5",
+		.offset = 0x21,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_6] = {
+		.name = "rx_yellow_prio_6",
+		.offset = 0x22,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_7] = {
+		.name = "rx_yellow_prio_7",
+		.offset = 0x23,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_0] = {
+		.name = "rx_green_prio_0",
+		.offset = 0x24,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_1] = {
+		.name = "rx_green_prio_1",
+		.offset = 0x25,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_2] = {
+		.name = "rx_green_prio_2",
+		.offset = 0x26,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_3] = {
+		.name = "rx_green_prio_3",
+		.offset = 0x27,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_4] = {
+		.name = "rx_green_prio_4",
+		.offset = 0x28,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_5] = {
+		.name = "rx_green_prio_5",
+		.offset = 0x29,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_6] = {
+		.name = "rx_green_prio_6",
+		.offset = 0x2A,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_7] = {
+		.name = "rx_green_prio_7",
+		.offset = 0x2B,
+	},
+	[OCELOT_STAT_TX_OCTETS] = {
+		.name = "tx_octets",
+		.offset = 0x80,
+	},
+	[OCELOT_STAT_TX_UNICAST] = {
+		.name = "tx_unicast",
+		.offset = 0x81,
+	},
+	[OCELOT_STAT_TX_MULTICAST] = {
+		.name = "tx_multicast",
+		.offset = 0x82,
+	},
+	[OCELOT_STAT_TX_BROADCAST] = {
+		.name = "tx_broadcast",
+		.offset = 0x83,
+	},
+	[OCELOT_STAT_TX_COLLISION] = {
+		.name = "tx_collision",
+		.offset = 0x84,
+	},
+	[OCELOT_STAT_TX_DROPS] = {
+		.name = "tx_drops",
+		.offset = 0x85,
+	},
+	[OCELOT_STAT_TX_PAUSE] = {
+		.name = "tx_pause",
+		.offset = 0x86,
+	},
+	[OCELOT_STAT_TX_64] = {
+		.name = "tx_frames_below_65_octets",
+		.offset = 0x87,
+	},
+	[OCELOT_STAT_TX_65_127] = {
+		.name = "tx_frames_65_to_127_octets",
+		.offset = 0x88,
+	},
+	[OCELOT_STAT_TX_128_255] = {
+		.name = "tx_frames_128_255_octets",
+		.offset = 0x89,
+	},
+	[OCELOT_STAT_TX_256_511] = {
+		.name = "tx_frames_256_511_octets",
+		.offset = 0x8A,
+	},
+	[OCELOT_STAT_TX_512_1023] = {
+		.name = "tx_frames_512_1023_octets",
+		.offset = 0x8B,
+	},
+	[OCELOT_STAT_TX_1024_1526] = {
+		.name = "tx_frames_1024_1526_octets",
+		.offset = 0x8C,
+	},
+	[OCELOT_STAT_TX_1527_MAX] = {
+		.name = "tx_frames_over_1526_octets",
+		.offset = 0x8D,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_0] = {
+		.name = "tx_yellow_prio_0",
+		.offset = 0x8E,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_1] = {
+		.name = "tx_yellow_prio_1",
+		.offset = 0x8F,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_2] = {
+		.name = "tx_yellow_prio_2",
+		.offset = 0x90,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_3] = {
+		.name = "tx_yellow_prio_3",
+		.offset = 0x91,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_4] = {
+		.name = "tx_yellow_prio_4",
+		.offset = 0x92,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_5] = {
+		.name = "tx_yellow_prio_5",
+		.offset = 0x93,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_6] = {
+		.name = "tx_yellow_prio_6",
+		.offset = 0x94,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_7] = {
+		.name = "tx_yellow_prio_7",
+		.offset = 0x95,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_0] = {
+		.name = "tx_green_prio_0",
+		.offset = 0x96,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_1] = {
+		.name = "tx_green_prio_1",
+		.offset = 0x97,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_2] = {
+		.name = "tx_green_prio_2",
+		.offset = 0x98,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_3] = {
+		.name = "tx_green_prio_3",
+		.offset = 0x99,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_4] = {
+		.name = "tx_green_prio_4",
+		.offset = 0x9A,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_5] = {
+		.name = "tx_green_prio_5",
+		.offset = 0x9B,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_6] = {
+		.name = "tx_green_prio_6",
+		.offset = 0x9C,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_7] = {
+		.name = "tx_green_prio_7",
+		.offset = 0x9D,
+	},
+	[OCELOT_STAT_TX_AGED] = {
+		.name = "tx_aged",
+		.offset = 0x9E,
+	},
+	[OCELOT_STAT_DROP_LOCAL] = {
+		.name = "drop_local",
+		.offset = 0x100,
+	},
+	[OCELOT_STAT_DROP_TAIL] = {
+		.name = "drop_tail",
+		.offset = 0x101,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_0] = {
+		.name = "drop_yellow_prio_0",
+		.offset = 0x102,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_1] = {
+		.name = "drop_yellow_prio_1",
+		.offset = 0x103,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_2] = {
+		.name = "drop_yellow_prio_2",
+		.offset = 0x104,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_3] = {
+		.name = "drop_yellow_prio_3",
+		.offset = 0x105,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_4] = {
+		.name = "drop_yellow_prio_4",
+		.offset = 0x106,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_5] = {
+		.name = "drop_yellow_prio_5",
+		.offset = 0x107,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_6] = {
+		.name = "drop_yellow_prio_6",
+		.offset = 0x108,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_7] = {
+		.name = "drop_yellow_prio_7",
+		.offset = 0x109,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_0] = {
+		.name = "drop_green_prio_0",
+		.offset = 0x10A,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_1] = {
+		.name = "drop_green_prio_1",
+		.offset = 0x10B,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_2] = {
+		.name = "drop_green_prio_2",
+		.offset = 0x10C,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_3] = {
+		.name = "drop_green_prio_3",
+		.offset = 0x10D,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_4] = {
+		.name = "drop_green_prio_4",
+		.offset = 0x10E,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_5] = {
+		.name = "drop_green_prio_5",
+		.offset = 0x10F,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_6] = {
+		.name = "drop_green_prio_6",
+		.offset = 0x110,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_7] = {
+		.name = "drop_green_prio_7",
+		.offset = 0x111,
+	},
 };
 
 static const struct vcap_field vsc9959_vcap_es0_keys[] = {
diff --git a/drivers/net/dsa/ocelot/seville_vsc9953.c b/drivers/net/dsa/ocelot/seville_vsc9953.c
index ebe9ddbbe2b7..fe5d4642d0bc 100644
--- a/drivers/net/dsa/ocelot/seville_vsc9953.c
+++ b/drivers/net/dsa/ocelot/seville_vsc9953.c
@@ -545,101 +545,379 @@ static const struct reg_field vsc9953_regfields[REGFIELD_MAX] = {
 	[SYS_PAUSE_CFG_PAUSE_ENA] = REG_FIELD_ID(SYS_PAUSE_CFG, 0, 1, 11, 4),
 };
 
-static const struct ocelot_stat_layout vsc9953_stats_layout[] = {
-	{ .offset = 0x00,	.name = "rx_octets", },
-	{ .offset = 0x01,	.name = "rx_unicast", },
-	{ .offset = 0x02,	.name = "rx_multicast", },
-	{ .offset = 0x03,	.name = "rx_broadcast", },
-	{ .offset = 0x04,	.name = "rx_shorts", },
-	{ .offset = 0x05,	.name = "rx_fragments", },
-	{ .offset = 0x06,	.name = "rx_jabbers", },
-	{ .offset = 0x07,	.name = "rx_crc_align_errs", },
-	{ .offset = 0x08,	.name = "rx_sym_errs", },
-	{ .offset = 0x09,	.name = "rx_frames_below_65_octets", },
-	{ .offset = 0x0A,	.name = "rx_frames_65_to_127_octets", },
-	{ .offset = 0x0B,	.name = "rx_frames_128_to_255_octets", },
-	{ .offset = 0x0C,	.name = "rx_frames_256_to_511_octets", },
-	{ .offset = 0x0D,	.name = "rx_frames_512_to_1023_octets", },
-	{ .offset = 0x0E,	.name = "rx_frames_1024_to_1526_octets", },
-	{ .offset = 0x0F,	.name = "rx_frames_over_1526_octets", },
-	{ .offset = 0x10,	.name = "rx_pause", },
-	{ .offset = 0x11,	.name = "rx_control", },
-	{ .offset = 0x12,	.name = "rx_longs", },
-	{ .offset = 0x13,	.name = "rx_classified_drops", },
-	{ .offset = 0x14,	.name = "rx_red_prio_0", },
-	{ .offset = 0x15,	.name = "rx_red_prio_1", },
-	{ .offset = 0x16,	.name = "rx_red_prio_2", },
-	{ .offset = 0x17,	.name = "rx_red_prio_3", },
-	{ .offset = 0x18,	.name = "rx_red_prio_4", },
-	{ .offset = 0x19,	.name = "rx_red_prio_5", },
-	{ .offset = 0x1A,	.name = "rx_red_prio_6", },
-	{ .offset = 0x1B,	.name = "rx_red_prio_7", },
-	{ .offset = 0x1C,	.name = "rx_yellow_prio_0", },
-	{ .offset = 0x1D,	.name = "rx_yellow_prio_1", },
-	{ .offset = 0x1E,	.name = "rx_yellow_prio_2", },
-	{ .offset = 0x1F,	.name = "rx_yellow_prio_3", },
-	{ .offset = 0x20,	.name = "rx_yellow_prio_4", },
-	{ .offset = 0x21,	.name = "rx_yellow_prio_5", },
-	{ .offset = 0x22,	.name = "rx_yellow_prio_6", },
-	{ .offset = 0x23,	.name = "rx_yellow_prio_7", },
-	{ .offset = 0x24,	.name = "rx_green_prio_0", },
-	{ .offset = 0x25,	.name = "rx_green_prio_1", },
-	{ .offset = 0x26,	.name = "rx_green_prio_2", },
-	{ .offset = 0x27,	.name = "rx_green_prio_3", },
-	{ .offset = 0x28,	.name = "rx_green_prio_4", },
-	{ .offset = 0x29,	.name = "rx_green_prio_5", },
-	{ .offset = 0x2A,	.name = "rx_green_prio_6", },
-	{ .offset = 0x2B,	.name = "rx_green_prio_7", },
-	{ .offset = 0x40,	.name = "tx_octets", },
-	{ .offset = 0x41,	.name = "tx_unicast", },
-	{ .offset = 0x42,	.name = "tx_multicast", },
-	{ .offset = 0x43,	.name = "tx_broadcast", },
-	{ .offset = 0x44,	.name = "tx_collision", },
-	{ .offset = 0x45,	.name = "tx_drops", },
-	{ .offset = 0x46,	.name = "tx_pause", },
-	{ .offset = 0x47,	.name = "tx_frames_below_65_octets", },
-	{ .offset = 0x48,	.name = "tx_frames_65_to_127_octets", },
-	{ .offset = 0x49,	.name = "tx_frames_128_255_octets", },
-	{ .offset = 0x4A,	.name = "tx_frames_256_511_octets", },
-	{ .offset = 0x4B,	.name = "tx_frames_512_1023_octets", },
-	{ .offset = 0x4C,	.name = "tx_frames_1024_1526_octets", },
-	{ .offset = 0x4D,	.name = "tx_frames_over_1526_octets", },
-	{ .offset = 0x4E,	.name = "tx_yellow_prio_0", },
-	{ .offset = 0x4F,	.name = "tx_yellow_prio_1", },
-	{ .offset = 0x50,	.name = "tx_yellow_prio_2", },
-	{ .offset = 0x51,	.name = "tx_yellow_prio_3", },
-	{ .offset = 0x52,	.name = "tx_yellow_prio_4", },
-	{ .offset = 0x53,	.name = "tx_yellow_prio_5", },
-	{ .offset = 0x54,	.name = "tx_yellow_prio_6", },
-	{ .offset = 0x55,	.name = "tx_yellow_prio_7", },
-	{ .offset = 0x56,	.name = "tx_green_prio_0", },
-	{ .offset = 0x57,	.name = "tx_green_prio_1", },
-	{ .offset = 0x58,	.name = "tx_green_prio_2", },
-	{ .offset = 0x59,	.name = "tx_green_prio_3", },
-	{ .offset = 0x5A,	.name = "tx_green_prio_4", },
-	{ .offset = 0x5B,	.name = "tx_green_prio_5", },
-	{ .offset = 0x5C,	.name = "tx_green_prio_6", },
-	{ .offset = 0x5D,	.name = "tx_green_prio_7", },
-	{ .offset = 0x5E,	.name = "tx_aged", },
-	{ .offset = 0x80,	.name = "drop_local", },
-	{ .offset = 0x81,	.name = "drop_tail", },
-	{ .offset = 0x82,	.name = "drop_yellow_prio_0", },
-	{ .offset = 0x83,	.name = "drop_yellow_prio_1", },
-	{ .offset = 0x84,	.name = "drop_yellow_prio_2", },
-	{ .offset = 0x85,	.name = "drop_yellow_prio_3", },
-	{ .offset = 0x86,	.name = "drop_yellow_prio_4", },
-	{ .offset = 0x87,	.name = "drop_yellow_prio_5", },
-	{ .offset = 0x88,	.name = "drop_yellow_prio_6", },
-	{ .offset = 0x89,	.name = "drop_yellow_prio_7", },
-	{ .offset = 0x8A,	.name = "drop_green_prio_0", },
-	{ .offset = 0x8B,	.name = "drop_green_prio_1", },
-	{ .offset = 0x8C,	.name = "drop_green_prio_2", },
-	{ .offset = 0x8D,	.name = "drop_green_prio_3", },
-	{ .offset = 0x8E,	.name = "drop_green_prio_4", },
-	{ .offset = 0x8F,	.name = "drop_green_prio_5", },
-	{ .offset = 0x90,	.name = "drop_green_prio_6", },
-	{ .offset = 0x91,	.name = "drop_green_prio_7", },
-	OCELOT_STAT_END
+static const struct ocelot_stat_layout vsc9953_stats_layout[OCELOT_NUM_STATS] = {
+	[OCELOT_STAT_RX_OCTETS] = {
+		.name = "rx_octets",
+		.offset = 0x00,
+	},
+	[OCELOT_STAT_RX_UNICAST] = {
+		.name = "rx_unicast",
+		.offset = 0x01,
+	},
+	[OCELOT_STAT_RX_MULTICAST] = {
+		.name = "rx_multicast",
+		.offset = 0x02,
+	},
+	[OCELOT_STAT_RX_BROADCAST] = {
+		.name = "rx_broadcast",
+		.offset = 0x03,
+	},
+	[OCELOT_STAT_RX_SHORTS] = {
+		.name = "rx_shorts",
+		.offset = 0x04,
+	},
+	[OCELOT_STAT_RX_FRAGMENTS] = {
+		.name = "rx_fragments",
+		.offset = 0x05,
+	},
+	[OCELOT_STAT_RX_JABBERS] = {
+		.name = "rx_jabbers",
+		.offset = 0x06,
+	},
+	[OCELOT_STAT_RX_CRC_ALIGN_ERRS] = {
+		.name = "rx_crc_align_errs",
+		.offset = 0x07,
+	},
+	[OCELOT_STAT_RX_SYM_ERRS] = {
+		.name = "rx_sym_errs",
+		.offset = 0x08,
+	},
+	[OCELOT_STAT_RX_64] = {
+		.name = "rx_frames_below_65_octets",
+		.offset = 0x09,
+	},
+	[OCELOT_STAT_RX_65_127] = {
+		.name = "rx_frames_65_to_127_octets",
+		.offset = 0x0A,
+	},
+	[OCELOT_STAT_RX_128_255] = {
+		.name = "rx_frames_128_to_255_octets",
+		.offset = 0x0B,
+	},
+	[OCELOT_STAT_RX_256_511] = {
+		.name = "rx_frames_256_to_511_octets",
+		.offset = 0x0C,
+	},
+	[OCELOT_STAT_RX_512_1023] = {
+		.name = "rx_frames_512_to_1023_octets",
+		.offset = 0x0D,
+	},
+	[OCELOT_STAT_RX_1024_1526] = {
+		.name = "rx_frames_1024_to_1526_octets",
+		.offset = 0x0E,
+	},
+	[OCELOT_STAT_RX_1527_MAX] = {
+		.name = "rx_frames_over_1526_octets",
+		.offset = 0x0F,
+	},
+	[OCELOT_STAT_RX_PAUSE] = {
+		.name = "rx_pause",
+		.offset = 0x10,
+	},
+	[OCELOT_STAT_RX_CONTROL] = {
+		.name = "rx_control",
+		.offset = 0x11,
+	},
+	[OCELOT_STAT_RX_LONGS] = {
+		.name = "rx_longs",
+		.offset = 0x12,
+	},
+	[OCELOT_STAT_RX_CLASSIFIED_DROPS] = {
+		.name = "rx_classified_drops",
+		.offset = 0x13,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_0] = {
+		.name = "rx_red_prio_0",
+		.offset = 0x14,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_1] = {
+		.name = "rx_red_prio_1",
+		.offset = 0x15,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_2] = {
+		.name = "rx_red_prio_2",
+		.offset = 0x16,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_3] = {
+		.name = "rx_red_prio_3",
+		.offset = 0x17,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_4] = {
+		.name = "rx_red_prio_4",
+		.offset = 0x18,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_5] = {
+		.name = "rx_red_prio_5",
+		.offset = 0x19,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_6] = {
+		.name = "rx_red_prio_6",
+		.offset = 0x1A,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_7] = {
+		.name = "rx_red_prio_7",
+		.offset = 0x1B,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_0] = {
+		.name = "rx_yellow_prio_0",
+		.offset = 0x1C,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_1] = {
+		.name = "rx_yellow_prio_1",
+		.offset = 0x1D,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_2] = {
+		.name = "rx_yellow_prio_2",
+		.offset = 0x1E,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_3] = {
+		.name = "rx_yellow_prio_3",
+		.offset = 0x1F,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_4] = {
+		.name = "rx_yellow_prio_4",
+		.offset = 0x20,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_5] = {
+		.name = "rx_yellow_prio_5",
+		.offset = 0x21,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_6] = {
+		.name = "rx_yellow_prio_6",
+		.offset = 0x22,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_7] = {
+		.name = "rx_yellow_prio_7",
+		.offset = 0x23,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_0] = {
+		.name = "rx_green_prio_0",
+		.offset = 0x24,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_1] = {
+		.name = "rx_green_prio_1",
+		.offset = 0x25,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_2] = {
+		.name = "rx_green_prio_2",
+		.offset = 0x26,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_3] = {
+		.name = "rx_green_prio_3",
+		.offset = 0x27,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_4] = {
+		.name = "rx_green_prio_4",
+		.offset = 0x28,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_5] = {
+		.name = "rx_green_prio_5",
+		.offset = 0x29,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_6] = {
+		.name = "rx_green_prio_6",
+		.offset = 0x2A,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_7] = {
+		.name = "rx_green_prio_7",
+		.offset = 0x2B,
+	},
+	[OCELOT_STAT_TX_OCTETS] = {
+		.name = "tx_octets",
+		.offset = 0x40,
+	},
+	[OCELOT_STAT_TX_UNICAST] = {
+		.name = "tx_unicast",
+		.offset = 0x41,
+	},
+	[OCELOT_STAT_TX_MULTICAST] = {
+		.name = "tx_multicast",
+		.offset = 0x42,
+	},
+	[OCELOT_STAT_TX_BROADCAST] = {
+		.name = "tx_broadcast",
+		.offset = 0x43,
+	},
+	[OCELOT_STAT_TX_COLLISION] = {
+		.name = "tx_collision",
+		.offset = 0x44,
+	},
+	[OCELOT_STAT_TX_DROPS] = {
+		.name = "tx_drops",
+		.offset = 0x45,
+	},
+	[OCELOT_STAT_TX_PAUSE] = {
+		.name = "tx_pause",
+		.offset = 0x46,
+	},
+	[OCELOT_STAT_TX_64] = {
+		.name = "tx_frames_below_65_octets",
+		.offset = 0x47,
+	},
+	[OCELOT_STAT_TX_65_127] = {
+		.name = "tx_frames_65_to_127_octets",
+		.offset = 0x48,
+	},
+	[OCELOT_STAT_TX_128_255] = {
+		.name = "tx_frames_128_255_octets",
+		.offset = 0x49,
+	},
+	[OCELOT_STAT_TX_256_511] = {
+		.name = "tx_frames_256_511_octets",
+		.offset = 0x4A,
+	},
+	[OCELOT_STAT_TX_512_1023] = {
+		.name = "tx_frames_512_1023_octets",
+		.offset = 0x4B,
+	},
+	[OCELOT_STAT_TX_1024_1526] = {
+		.name = "tx_frames_1024_1526_octets",
+		.offset = 0x4C,
+	},
+	[OCELOT_STAT_TX_1527_MAX] = {
+		.name = "tx_frames_over_1526_octets",
+		.offset = 0x4D,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_0] = {
+		.name = "tx_yellow_prio_0",
+		.offset = 0x4E,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_1] = {
+		.name = "tx_yellow_prio_1",
+		.offset = 0x4F,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_2] = {
+		.name = "tx_yellow_prio_2",
+		.offset = 0x50,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_3] = {
+		.name = "tx_yellow_prio_3",
+		.offset = 0x51,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_4] = {
+		.name = "tx_yellow_prio_4",
+		.offset = 0x52,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_5] = {
+		.name = "tx_yellow_prio_5",
+		.offset = 0x53,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_6] = {
+		.name = "tx_yellow_prio_6",
+		.offset = 0x54,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_7] = {
+		.name = "tx_yellow_prio_7",
+		.offset = 0x55,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_0] = {
+		.name = "tx_green_prio_0",
+		.offset = 0x56,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_1] = {
+		.name = "tx_green_prio_1",
+		.offset = 0x57,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_2] = {
+		.name = "tx_green_prio_2",
+		.offset = 0x58,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_3] = {
+		.name = "tx_green_prio_3",
+		.offset = 0x59,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_4] = {
+		.name = "tx_green_prio_4",
+		.offset = 0x5A,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_5] = {
+		.name = "tx_green_prio_5",
+		.offset = 0x5B,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_6] = {
+		.name = "tx_green_prio_6",
+		.offset = 0x5C,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_7] = {
+		.name = "tx_green_prio_7",
+		.offset = 0x5D,
+	},
+	[OCELOT_STAT_TX_AGED] = {
+		.name = "tx_aged",
+		.offset = 0x5E,
+	},
+	[OCELOT_STAT_DROP_LOCAL] = {
+		.name = "drop_local",
+		.offset = 0x80,
+	},
+	[OCELOT_STAT_DROP_TAIL] = {
+		.name = "drop_tail",
+		.offset = 0x81,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_0] = {
+		.name = "drop_yellow_prio_0",
+		.offset = 0x82,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_1] = {
+		.name = "drop_yellow_prio_1",
+		.offset = 0x83,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_2] = {
+		.name = "drop_yellow_prio_2",
+		.offset = 0x84,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_3] = {
+		.name = "drop_yellow_prio_3",
+		.offset = 0x85,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_4] = {
+		.name = "drop_yellow_prio_4",
+		.offset = 0x86,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_5] = {
+		.name = "drop_yellow_prio_5",
+		.offset = 0x87,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_6] = {
+		.name = "drop_yellow_prio_6",
+		.offset = 0x88,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_7] = {
+		.name = "drop_yellow_prio_7",
+		.offset = 0x89,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_0] = {
+		.name = "drop_green_prio_0",
+		.offset = 0x8A,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_1] = {
+		.name = "drop_green_prio_1",
+		.offset = 0x8B,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_2] = {
+		.name = "drop_green_prio_2",
+		.offset = 0x8C,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_3] = {
+		.name = "drop_green_prio_3",
+		.offset = 0x8D,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_4] = {
+		.name = "drop_green_prio_4",
+		.offset = 0x8E,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_5] = {
+		.name = "drop_green_prio_5",
+		.offset = 0x8F,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_6] = {
+		.name = "drop_green_prio_6",
+		.offset = 0x90,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_7] = {
+		.name = "drop_green_prio_7",
+		.offset = 0x91,
+	},
 };
 
 static const struct vcap_field vsc9953_vcap_es0_keys[] = {
diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index c67f162f8ab5..68991b021c56 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -1860,16 +1860,20 @@ void ocelot_get_strings(struct ocelot *ocelot, int port, u32 sset, u8 *data)
 	if (sset != ETH_SS_STATS)
 		return;
 
-	for (i = 0; i < ocelot->num_stats; i++)
+	for (i = 0; i < OCELOT_NUM_STATS; i++) {
+		if (ocelot->stats_layout[i].name[0] == '\0')
+			continue;
+
 		memcpy(data + i * ETH_GSTRING_LEN, ocelot->stats_layout[i].name,
 		       ETH_GSTRING_LEN);
+	}
 }
 EXPORT_SYMBOL(ocelot_get_strings);
 
 /* Caller must hold &ocelot->stats_lock */
 static int ocelot_port_update_stats(struct ocelot *ocelot, int port)
 {
-	unsigned int idx = port * ocelot->num_stats;
+	unsigned int idx = port * OCELOT_NUM_STATS;
 	struct ocelot_stats_region *region;
 	int err, j;
 
@@ -1930,9 +1934,15 @@ void ocelot_get_ethtool_stats(struct ocelot *ocelot, int port, u64 *data)
 	/* check and update now */
 	err = ocelot_port_update_stats(ocelot, port);
 
-	/* Copy all counters */
-	for (i = 0; i < ocelot->num_stats; i++)
-		*data++ = ocelot->stats[port * ocelot->num_stats + i];
+	/* Copy all supported counters */
+	for (i = 0; i < OCELOT_NUM_STATS; i++) {
+		int index = port * OCELOT_NUM_STATS + i;
+
+		if (ocelot->stats_layout[i].name[0] == '\0')
+			continue;
+
+		*data++ = ocelot->stats[index];
+	}
 
 	spin_unlock(&ocelot->stats_lock);
 
@@ -1943,10 +1953,16 @@ EXPORT_SYMBOL(ocelot_get_ethtool_stats);
 
 int ocelot_get_sset_count(struct ocelot *ocelot, int port, int sset)
 {
+	int i, num_stats = 0;
+
 	if (sset != ETH_SS_STATS)
 		return -EOPNOTSUPP;
 
-	return ocelot->num_stats;
+	for (i = 0; i < OCELOT_NUM_STATS; i++)
+		if (ocelot->stats_layout[i].name[0] != '\0')
+			num_stats++;
+
+	return num_stats;
 }
 EXPORT_SYMBOL(ocelot_get_sset_count);
 
@@ -1958,7 +1974,10 @@ static int ocelot_prepare_stats_regions(struct ocelot *ocelot)
 
 	INIT_LIST_HEAD(&ocelot->stats_regions);
 
-	for (i = 0; i < ocelot->num_stats; i++) {
+	for (i = 0; i < OCELOT_NUM_STATS; i++) {
+		if (ocelot->stats_layout[i].name[0] == '\0')
+			continue;
+
 		if (region && ocelot->stats_layout[i].offset == last + 1) {
 			region->count++;
 		} else {
@@ -3340,7 +3359,6 @@ static void ocelot_detect_features(struct ocelot *ocelot)
 
 int ocelot_init(struct ocelot *ocelot)
 {
-	const struct ocelot_stat_layout *stat;
 	char queue_name[32];
 	int i, ret;
 	u32 port;
@@ -3353,12 +3371,8 @@ int ocelot_init(struct ocelot *ocelot)
 		}
 	}
 
-	ocelot->num_stats = 0;
-	for_each_stat(ocelot, stat)
-		ocelot->num_stats++;
-
 	ocelot->stats = devm_kcalloc(ocelot->dev,
-				     ocelot->num_phys_ports * ocelot->num_stats,
+				     ocelot->num_phys_ports * OCELOT_NUM_STATS,
 				     sizeof(u64), GFP_KERNEL);
 	if (!ocelot->stats)
 		return -ENOMEM;
diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
index 961f803aca19..9ff910560043 100644
--- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c
+++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
@@ -96,101 +96,379 @@ static const struct reg_field ocelot_regfields[REGFIELD_MAX] = {
 	[SYS_PAUSE_CFG_PAUSE_ENA] = REG_FIELD_ID(SYS_PAUSE_CFG, 0, 1, 12, 4),
 };
 
-static const struct ocelot_stat_layout ocelot_stats_layout[] = {
-	{ .name = "rx_octets", .offset = 0x00, },
-	{ .name = "rx_unicast", .offset = 0x01, },
-	{ .name = "rx_multicast", .offset = 0x02, },
-	{ .name = "rx_broadcast", .offset = 0x03, },
-	{ .name = "rx_shorts", .offset = 0x04, },
-	{ .name = "rx_fragments", .offset = 0x05, },
-	{ .name = "rx_jabbers", .offset = 0x06, },
-	{ .name = "rx_crc_align_errs", .offset = 0x07, },
-	{ .name = "rx_sym_errs", .offset = 0x08, },
-	{ .name = "rx_frames_below_65_octets", .offset = 0x09, },
-	{ .name = "rx_frames_65_to_127_octets", .offset = 0x0A, },
-	{ .name = "rx_frames_128_to_255_octets", .offset = 0x0B, },
-	{ .name = "rx_frames_256_to_511_octets", .offset = 0x0C, },
-	{ .name = "rx_frames_512_to_1023_octets", .offset = 0x0D, },
-	{ .name = "rx_frames_1024_to_1526_octets", .offset = 0x0E, },
-	{ .name = "rx_frames_over_1526_octets", .offset = 0x0F, },
-	{ .name = "rx_pause", .offset = 0x10, },
-	{ .name = "rx_control", .offset = 0x11, },
-	{ .name = "rx_longs", .offset = 0x12, },
-	{ .name = "rx_classified_drops", .offset = 0x13, },
-	{ .name = "rx_red_prio_0", .offset = 0x14, },
-	{ .name = "rx_red_prio_1", .offset = 0x15, },
-	{ .name = "rx_red_prio_2", .offset = 0x16, },
-	{ .name = "rx_red_prio_3", .offset = 0x17, },
-	{ .name = "rx_red_prio_4", .offset = 0x18, },
-	{ .name = "rx_red_prio_5", .offset = 0x19, },
-	{ .name = "rx_red_prio_6", .offset = 0x1A, },
-	{ .name = "rx_red_prio_7", .offset = 0x1B, },
-	{ .name = "rx_yellow_prio_0", .offset = 0x1C, },
-	{ .name = "rx_yellow_prio_1", .offset = 0x1D, },
-	{ .name = "rx_yellow_prio_2", .offset = 0x1E, },
-	{ .name = "rx_yellow_prio_3", .offset = 0x1F, },
-	{ .name = "rx_yellow_prio_4", .offset = 0x20, },
-	{ .name = "rx_yellow_prio_5", .offset = 0x21, },
-	{ .name = "rx_yellow_prio_6", .offset = 0x22, },
-	{ .name = "rx_yellow_prio_7", .offset = 0x23, },
-	{ .name = "rx_green_prio_0", .offset = 0x24, },
-	{ .name = "rx_green_prio_1", .offset = 0x25, },
-	{ .name = "rx_green_prio_2", .offset = 0x26, },
-	{ .name = "rx_green_prio_3", .offset = 0x27, },
-	{ .name = "rx_green_prio_4", .offset = 0x28, },
-	{ .name = "rx_green_prio_5", .offset = 0x29, },
-	{ .name = "rx_green_prio_6", .offset = 0x2A, },
-	{ .name = "rx_green_prio_7", .offset = 0x2B, },
-	{ .name = "tx_octets", .offset = 0x40, },
-	{ .name = "tx_unicast", .offset = 0x41, },
-	{ .name = "tx_multicast", .offset = 0x42, },
-	{ .name = "tx_broadcast", .offset = 0x43, },
-	{ .name = "tx_collision", .offset = 0x44, },
-	{ .name = "tx_drops", .offset = 0x45, },
-	{ .name = "tx_pause", .offset = 0x46, },
-	{ .name = "tx_frames_below_65_octets", .offset = 0x47, },
-	{ .name = "tx_frames_65_to_127_octets", .offset = 0x48, },
-	{ .name = "tx_frames_128_255_octets", .offset = 0x49, },
-	{ .name = "tx_frames_256_511_octets", .offset = 0x4A, },
-	{ .name = "tx_frames_512_1023_octets", .offset = 0x4B, },
-	{ .name = "tx_frames_1024_1526_octets", .offset = 0x4C, },
-	{ .name = "tx_frames_over_1526_octets", .offset = 0x4D, },
-	{ .name = "tx_yellow_prio_0", .offset = 0x4E, },
-	{ .name = "tx_yellow_prio_1", .offset = 0x4F, },
-	{ .name = "tx_yellow_prio_2", .offset = 0x50, },
-	{ .name = "tx_yellow_prio_3", .offset = 0x51, },
-	{ .name = "tx_yellow_prio_4", .offset = 0x52, },
-	{ .name = "tx_yellow_prio_5", .offset = 0x53, },
-	{ .name = "tx_yellow_prio_6", .offset = 0x54, },
-	{ .name = "tx_yellow_prio_7", .offset = 0x55, },
-	{ .name = "tx_green_prio_0", .offset = 0x56, },
-	{ .name = "tx_green_prio_1", .offset = 0x57, },
-	{ .name = "tx_green_prio_2", .offset = 0x58, },
-	{ .name = "tx_green_prio_3", .offset = 0x59, },
-	{ .name = "tx_green_prio_4", .offset = 0x5A, },
-	{ .name = "tx_green_prio_5", .offset = 0x5B, },
-	{ .name = "tx_green_prio_6", .offset = 0x5C, },
-	{ .name = "tx_green_prio_7", .offset = 0x5D, },
-	{ .name = "tx_aged", .offset = 0x5E, },
-	{ .name = "drop_local", .offset = 0x80, },
-	{ .name = "drop_tail", .offset = 0x81, },
-	{ .name = "drop_yellow_prio_0", .offset = 0x82, },
-	{ .name = "drop_yellow_prio_1", .offset = 0x83, },
-	{ .name = "drop_yellow_prio_2", .offset = 0x84, },
-	{ .name = "drop_yellow_prio_3", .offset = 0x85, },
-	{ .name = "drop_yellow_prio_4", .offset = 0x86, },
-	{ .name = "drop_yellow_prio_5", .offset = 0x87, },
-	{ .name = "drop_yellow_prio_6", .offset = 0x88, },
-	{ .name = "drop_yellow_prio_7", .offset = 0x89, },
-	{ .name = "drop_green_prio_0", .offset = 0x8A, },
-	{ .name = "drop_green_prio_1", .offset = 0x8B, },
-	{ .name = "drop_green_prio_2", .offset = 0x8C, },
-	{ .name = "drop_green_prio_3", .offset = 0x8D, },
-	{ .name = "drop_green_prio_4", .offset = 0x8E, },
-	{ .name = "drop_green_prio_5", .offset = 0x8F, },
-	{ .name = "drop_green_prio_6", .offset = 0x90, },
-	{ .name = "drop_green_prio_7", .offset = 0x91, },
-	OCELOT_STAT_END
+static const struct ocelot_stat_layout ocelot_stats_layout[OCELOT_NUM_STATS] = {
+	[OCELOT_STAT_RX_OCTETS] = {
+		.name = "rx_octets",
+		.offset = 0x00,
+	},
+	[OCELOT_STAT_RX_UNICAST] = {
+		.name = "rx_unicast",
+		.offset = 0x01,
+	},
+	[OCELOT_STAT_RX_MULTICAST] = {
+		.name = "rx_multicast",
+		.offset = 0x02,
+	},
+	[OCELOT_STAT_RX_BROADCAST] = {
+		.name = "rx_broadcast",
+		.offset = 0x03,
+	},
+	[OCELOT_STAT_RX_SHORTS] = {
+		.name = "rx_shorts",
+		.offset = 0x04,
+	},
+	[OCELOT_STAT_RX_FRAGMENTS] = {
+		.name = "rx_fragments",
+		.offset = 0x05,
+	},
+	[OCELOT_STAT_RX_JABBERS] = {
+		.name = "rx_jabbers",
+		.offset = 0x06,
+	},
+	[OCELOT_STAT_RX_CRC_ALIGN_ERRS] = {
+		.name = "rx_crc_align_errs",
+		.offset = 0x07,
+	},
+	[OCELOT_STAT_RX_SYM_ERRS] = {
+		.name = "rx_sym_errs",
+		.offset = 0x08,
+	},
+	[OCELOT_STAT_RX_64] = {
+		.name = "rx_frames_below_65_octets",
+		.offset = 0x09,
+	},
+	[OCELOT_STAT_RX_65_127] = {
+		.name = "rx_frames_65_to_127_octets",
+		.offset = 0x0A,
+	},
+	[OCELOT_STAT_RX_128_255] = {
+		.name = "rx_frames_128_to_255_octets",
+		.offset = 0x0B,
+	},
+	[OCELOT_STAT_RX_256_511] = {
+		.name = "rx_frames_256_to_511_octets",
+		.offset = 0x0C,
+	},
+	[OCELOT_STAT_RX_512_1023] = {
+		.name = "rx_frames_512_to_1023_octets",
+		.offset = 0x0D,
+	},
+	[OCELOT_STAT_RX_1024_1526] = {
+		.name = "rx_frames_1024_to_1526_octets",
+		.offset = 0x0E,
+	},
+	[OCELOT_STAT_RX_1527_MAX] = {
+		.name = "rx_frames_over_1526_octets",
+		.offset = 0x0F,
+	},
+	[OCELOT_STAT_RX_PAUSE] = {
+		.name = "rx_pause",
+		.offset = 0x10,
+	},
+	[OCELOT_STAT_RX_CONTROL] = {
+		.name = "rx_control",
+		.offset = 0x11,
+	},
+	[OCELOT_STAT_RX_LONGS] = {
+		.name = "rx_longs",
+		.offset = 0x12,
+	},
+	[OCELOT_STAT_RX_CLASSIFIED_DROPS] = {
+		.name = "rx_classified_drops",
+		.offset = 0x13,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_0] = {
+		.name = "rx_red_prio_0",
+		.offset = 0x14,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_1] = {
+		.name = "rx_red_prio_1",
+		.offset = 0x15,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_2] = {
+		.name = "rx_red_prio_2",
+		.offset = 0x16,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_3] = {
+		.name = "rx_red_prio_3",
+		.offset = 0x17,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_4] = {
+		.name = "rx_red_prio_4",
+		.offset = 0x18,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_5] = {
+		.name = "rx_red_prio_5",
+		.offset = 0x19,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_6] = {
+		.name = "rx_red_prio_6",
+		.offset = 0x1A,
+	},
+	[OCELOT_STAT_RX_RED_PRIO_7] = {
+		.name = "rx_red_prio_7",
+		.offset = 0x1B,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_0] = {
+		.name = "rx_yellow_prio_0",
+		.offset = 0x1C,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_1] = {
+		.name = "rx_yellow_prio_1",
+		.offset = 0x1D,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_2] = {
+		.name = "rx_yellow_prio_2",
+		.offset = 0x1E,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_3] = {
+		.name = "rx_yellow_prio_3",
+		.offset = 0x1F,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_4] = {
+		.name = "rx_yellow_prio_4",
+		.offset = 0x20,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_5] = {
+		.name = "rx_yellow_prio_5",
+		.offset = 0x21,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_6] = {
+		.name = "rx_yellow_prio_6",
+		.offset = 0x22,
+	},
+	[OCELOT_STAT_RX_YELLOW_PRIO_7] = {
+		.name = "rx_yellow_prio_7",
+		.offset = 0x23,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_0] = {
+		.name = "rx_green_prio_0",
+		.offset = 0x24,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_1] = {
+		.name = "rx_green_prio_1",
+		.offset = 0x25,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_2] = {
+		.name = "rx_green_prio_2",
+		.offset = 0x26,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_3] = {
+		.name = "rx_green_prio_3",
+		.offset = 0x27,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_4] = {
+		.name = "rx_green_prio_4",
+		.offset = 0x28,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_5] = {
+		.name = "rx_green_prio_5",
+		.offset = 0x29,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_6] = {
+		.name = "rx_green_prio_6",
+		.offset = 0x2A,
+	},
+	[OCELOT_STAT_RX_GREEN_PRIO_7] = {
+		.name = "rx_green_prio_7",
+		.offset = 0x2B,
+	},
+	[OCELOT_STAT_TX_OCTETS] = {
+		.name = "tx_octets",
+		.offset = 0x40,
+	},
+	[OCELOT_STAT_TX_UNICAST] = {
+		.name = "tx_unicast",
+		.offset = 0x41,
+	},
+	[OCELOT_STAT_TX_MULTICAST] = {
+		.name = "tx_multicast",
+		.offset = 0x42,
+	},
+	[OCELOT_STAT_TX_BROADCAST] = {
+		.name = "tx_broadcast",
+		.offset = 0x43,
+	},
+	[OCELOT_STAT_TX_COLLISION] = {
+		.name = "tx_collision",
+		.offset = 0x44,
+	},
+	[OCELOT_STAT_TX_DROPS] = {
+		.name = "tx_drops",
+		.offset = 0x45,
+	},
+	[OCELOT_STAT_TX_PAUSE] = {
+		.name = "tx_pause",
+		.offset = 0x46,
+	},
+	[OCELOT_STAT_TX_64] = {
+		.name = "tx_frames_below_65_octets",
+		.offset = 0x47,
+	},
+	[OCELOT_STAT_TX_65_127] = {
+		.name = "tx_frames_65_to_127_octets",
+		.offset = 0x48,
+	},
+	[OCELOT_STAT_TX_128_255] = {
+		.name = "tx_frames_128_255_octets",
+		.offset = 0x49,
+	},
+	[OCELOT_STAT_TX_256_511] = {
+		.name = "tx_frames_256_511_octets",
+		.offset = 0x4A,
+	},
+	[OCELOT_STAT_TX_512_1023] = {
+		.name = "tx_frames_512_1023_octets",
+		.offset = 0x4B,
+	},
+	[OCELOT_STAT_TX_1024_1526] = {
+		.name = "tx_frames_1024_1526_octets",
+		.offset = 0x4C,
+	},
+	[OCELOT_STAT_TX_1527_MAX] = {
+		.name = "tx_frames_over_1526_octets",
+		.offset = 0x4D,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_0] = {
+		.name = "tx_yellow_prio_0",
+		.offset = 0x4E,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_1] = {
+		.name = "tx_yellow_prio_1",
+		.offset = 0x4F,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_2] = {
+		.name = "tx_yellow_prio_2",
+		.offset = 0x50,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_3] = {
+		.name = "tx_yellow_prio_3",
+		.offset = 0x51,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_4] = {
+		.name = "tx_yellow_prio_4",
+		.offset = 0x52,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_5] = {
+		.name = "tx_yellow_prio_5",
+		.offset = 0x53,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_6] = {
+		.name = "tx_yellow_prio_6",
+		.offset = 0x54,
+	},
+	[OCELOT_STAT_TX_YELLOW_PRIO_7] = {
+		.name = "tx_yellow_prio_7",
+		.offset = 0x55,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_0] = {
+		.name = "tx_green_prio_0",
+		.offset = 0x56,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_1] = {
+		.name = "tx_green_prio_1",
+		.offset = 0x57,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_2] = {
+		.name = "tx_green_prio_2",
+		.offset = 0x58,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_3] = {
+		.name = "tx_green_prio_3",
+		.offset = 0x59,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_4] = {
+		.name = "tx_green_prio_4",
+		.offset = 0x5A,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_5] = {
+		.name = "tx_green_prio_5",
+		.offset = 0x5B,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_6] = {
+		.name = "tx_green_prio_6",
+		.offset = 0x5C,
+	},
+	[OCELOT_STAT_TX_GREEN_PRIO_7] = {
+		.name = "tx_green_prio_7",
+		.offset = 0x5D,
+	},
+	[OCELOT_STAT_TX_AGED] = {
+		.name = "tx_aged",
+		.offset = 0x5E,
+	},
+	[OCELOT_STAT_DROP_LOCAL] = {
+		.name = "drop_local",
+		.offset = 0x80,
+	},
+	[OCELOT_STAT_DROP_TAIL] = {
+		.name = "drop_tail",
+		.offset = 0x81,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_0] = {
+		.name = "drop_yellow_prio_0",
+		.offset = 0x82,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_1] = {
+		.name = "drop_yellow_prio_1",
+		.offset = 0x83,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_2] = {
+		.name = "drop_yellow_prio_2",
+		.offset = 0x84,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_3] = {
+		.name = "drop_yellow_prio_3",
+		.offset = 0x85,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_4] = {
+		.name = "drop_yellow_prio_4",
+		.offset = 0x86,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_5] = {
+		.name = "drop_yellow_prio_5",
+		.offset = 0x87,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_6] = {
+		.name = "drop_yellow_prio_6",
+		.offset = 0x88,
+	},
+	[OCELOT_STAT_DROP_YELLOW_PRIO_7] = {
+		.name = "drop_yellow_prio_7",
+		.offset = 0x89,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_0] = {
+		.name = "drop_green_prio_0",
+		.offset = 0x8A,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_1] = {
+		.name = "drop_green_prio_1",
+		.offset = 0x8B,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_2] = {
+		.name = "drop_green_prio_2",
+		.offset = 0x8C,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_3] = {
+		.name = "drop_green_prio_3",
+		.offset = 0x8D,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_4] = {
+		.name = "drop_green_prio_4",
+		.offset = 0x8E,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_5] = {
+		.name = "drop_green_prio_5",
+		.offset = 0x8F,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_6] = {
+		.name = "drop_green_prio_6",
+		.offset = 0x90,
+	},
+	[OCELOT_STAT_DROP_GREEN_PRIO_7] = {
+		.name = "drop_green_prio_7",
+		.offset = 0x91,
+	},
 };
 
 static void ocelot_pll5_init(struct ocelot *ocelot)
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 72b9474391da..2428bc64cb1d 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -105,11 +105,6 @@
 #define REG_RESERVED_ADDR		0xffffffff
 #define REG_RESERVED(reg)		REG(reg, REG_RESERVED_ADDR)
 
-#define for_each_stat(ocelot, stat)				\
-	for ((stat) = (ocelot)->stats_layout;			\
-	     ((stat)->name[0] != '\0');				\
-	     (stat)++)
-
 enum ocelot_target {
 	ANA = 1,
 	QS,
@@ -540,13 +535,108 @@ enum ocelot_ptp_pins {
 	TOD_ACC_PIN
 };
 
+enum ocelot_stat {
+	OCELOT_STAT_RX_OCTETS,
+	OCELOT_STAT_RX_UNICAST,
+	OCELOT_STAT_RX_MULTICAST,
+	OCELOT_STAT_RX_BROADCAST,
+	OCELOT_STAT_RX_SHORTS,
+	OCELOT_STAT_RX_FRAGMENTS,
+	OCELOT_STAT_RX_JABBERS,
+	OCELOT_STAT_RX_CRC_ALIGN_ERRS,
+	OCELOT_STAT_RX_SYM_ERRS,
+	OCELOT_STAT_RX_64,
+	OCELOT_STAT_RX_65_127,
+	OCELOT_STAT_RX_128_255,
+	OCELOT_STAT_RX_256_511,
+	OCELOT_STAT_RX_512_1023,
+	OCELOT_STAT_RX_1024_1526,
+	OCELOT_STAT_RX_1527_MAX,
+	OCELOT_STAT_RX_PAUSE,
+	OCELOT_STAT_RX_CONTROL,
+	OCELOT_STAT_RX_LONGS,
+	OCELOT_STAT_RX_CLASSIFIED_DROPS,
+	OCELOT_STAT_RX_RED_PRIO_0,
+	OCELOT_STAT_RX_RED_PRIO_1,
+	OCELOT_STAT_RX_RED_PRIO_2,
+	OCELOT_STAT_RX_RED_PRIO_3,
+	OCELOT_STAT_RX_RED_PRIO_4,
+	OCELOT_STAT_RX_RED_PRIO_5,
+	OCELOT_STAT_RX_RED_PRIO_6,
+	OCELOT_STAT_RX_RED_PRIO_7,
+	OCELOT_STAT_RX_YELLOW_PRIO_0,
+	OCELOT_STAT_RX_YELLOW_PRIO_1,
+	OCELOT_STAT_RX_YELLOW_PRIO_2,
+	OCELOT_STAT_RX_YELLOW_PRIO_3,
+	OCELOT_STAT_RX_YELLOW_PRIO_4,
+	OCELOT_STAT_RX_YELLOW_PRIO_5,
+	OCELOT_STAT_RX_YELLOW_PRIO_6,
+	OCELOT_STAT_RX_YELLOW_PRIO_7,
+	OCELOT_STAT_RX_GREEN_PRIO_0,
+	OCELOT_STAT_RX_GREEN_PRIO_1,
+	OCELOT_STAT_RX_GREEN_PRIO_2,
+	OCELOT_STAT_RX_GREEN_PRIO_3,
+	OCELOT_STAT_RX_GREEN_PRIO_4,
+	OCELOT_STAT_RX_GREEN_PRIO_5,
+	OCELOT_STAT_RX_GREEN_PRIO_6,
+	OCELOT_STAT_RX_GREEN_PRIO_7,
+	OCELOT_STAT_TX_OCTETS,
+	OCELOT_STAT_TX_UNICAST,
+	OCELOT_STAT_TX_MULTICAST,
+	OCELOT_STAT_TX_BROADCAST,
+	OCELOT_STAT_TX_COLLISION,
+	OCELOT_STAT_TX_DROPS,
+	OCELOT_STAT_TX_PAUSE,
+	OCELOT_STAT_TX_64,
+	OCELOT_STAT_TX_65_127,
+	OCELOT_STAT_TX_128_255,
+	OCELOT_STAT_TX_256_511,
+	OCELOT_STAT_TX_512_1023,
+	OCELOT_STAT_TX_1024_1526,
+	OCELOT_STAT_TX_1527_MAX,
+	OCELOT_STAT_TX_YELLOW_PRIO_0,
+	OCELOT_STAT_TX_YELLOW_PRIO_1,
+	OCELOT_STAT_TX_YELLOW_PRIO_2,
+	OCELOT_STAT_TX_YELLOW_PRIO_3,
+	OCELOT_STAT_TX_YELLOW_PRIO_4,
+	OCELOT_STAT_TX_YELLOW_PRIO_5,
+	OCELOT_STAT_TX_YELLOW_PRIO_6,
+	OCELOT_STAT_TX_YELLOW_PRIO_7,
+	OCELOT_STAT_TX_GREEN_PRIO_0,
+	OCELOT_STAT_TX_GREEN_PRIO_1,
+	OCELOT_STAT_TX_GREEN_PRIO_2,
+	OCELOT_STAT_TX_GREEN_PRIO_3,
+	OCELOT_STAT_TX_GREEN_PRIO_4,
+	OCELOT_STAT_TX_GREEN_PRIO_5,
+	OCELOT_STAT_TX_GREEN_PRIO_6,
+	OCELOT_STAT_TX_GREEN_PRIO_7,
+	OCELOT_STAT_TX_AGED,
+	OCELOT_STAT_DROP_LOCAL,
+	OCELOT_STAT_DROP_TAIL,
+	OCELOT_STAT_DROP_YELLOW_PRIO_0,
+	OCELOT_STAT_DROP_YELLOW_PRIO_1,
+	OCELOT_STAT_DROP_YELLOW_PRIO_2,
+	OCELOT_STAT_DROP_YELLOW_PRIO_3,
+	OCELOT_STAT_DROP_YELLOW_PRIO_4,
+	OCELOT_STAT_DROP_YELLOW_PRIO_5,
+	OCELOT_STAT_DROP_YELLOW_PRIO_6,
+	OCELOT_STAT_DROP_YELLOW_PRIO_7,
+	OCELOT_STAT_DROP_GREEN_PRIO_0,
+	OCELOT_STAT_DROP_GREEN_PRIO_1,
+	OCELOT_STAT_DROP_GREEN_PRIO_2,
+	OCELOT_STAT_DROP_GREEN_PRIO_3,
+	OCELOT_STAT_DROP_GREEN_PRIO_4,
+	OCELOT_STAT_DROP_GREEN_PRIO_5,
+	OCELOT_STAT_DROP_GREEN_PRIO_6,
+	OCELOT_STAT_DROP_GREEN_PRIO_7,
+	OCELOT_NUM_STATS,
+};
+
 struct ocelot_stat_layout {
 	u32 offset;
 	char name[ETH_GSTRING_LEN];
 };
 
-#define OCELOT_STAT_END { .name = "" }
-
 struct ocelot_stats_region {
 	struct list_head node;
 	u32 offset;
@@ -709,7 +799,6 @@ struct ocelot {
 	const u32 *const		*map;
 	const struct ocelot_stat_layout	*stats_layout;
 	struct list_head		stats_regions;
-	unsigned int			num_stats;
 
 	u32				pool_size[OCELOT_SB_NUM][OCELOT_SB_POOL_NUM];
 	int				packet_buffer_size;
-- 
cgit 


From d4c367650704de091d4c1f6bb379c0a5c389c73a Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 16 Aug 2022 16:53:51 +0300
Subject: net: mscc: ocelot: keep ocelot_stat_layout by reg address, not offset

With so many counter addresses recently discovered as being wrong, it is
desirable to at least have a central database of information, rather
than two: one through the SYS_COUNT_* registers (used for
ndo_get_stats64), and the other through the offset field of struct
ocelot_stat_layout elements (used for ethtool -S).

The strategy will be to keep the SYS_COUNT_* definitions as the single
source of truth, but for that we need to expand our current definitions
to cover all registers. Then we need to convert the ocelot region
creation logic, and stats worker, to the read semantics imposed by going
through SYS_COUNT_* absolute register addresses, rather than offsets
of 32-bit words relative to SYS_COUNT_RX_OCTETS (which should have been
SYS_CNT, by the way).

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/ocelot/felix_vsc9959.c     | 253 +++++++++++++++++-----------
 drivers/net/dsa/ocelot/seville_vsc9953.c   | 255 ++++++++++++++++++-----------
 drivers/net/ethernet/mscc/ocelot.c         |  11 +-
 drivers/net/ethernet/mscc/ocelot_vsc7514.c | 186 ++++++++++-----------
 drivers/net/ethernet/mscc/vsc7514_regs.c   |  58 +++++++
 include/soc/mscc/ocelot.h                  |  66 +++++++-
 6 files changed, 540 insertions(+), 289 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c
index c9f270f24b1c..1cdce8a98d1d 100644
--- a/drivers/net/dsa/ocelot/felix_vsc9959.c
+++ b/drivers/net/dsa/ocelot/felix_vsc9959.c
@@ -274,10 +274,14 @@ static const u32 vsc9959_rew_regmap[] = {
 
 static const u32 vsc9959_sys_regmap[] = {
 	REG(SYS_COUNT_RX_OCTETS,		0x000000),
+	REG(SYS_COUNT_RX_UNICAST,		0x000004),
 	REG(SYS_COUNT_RX_MULTICAST,		0x000008),
+	REG(SYS_COUNT_RX_BROADCAST,		0x00000c),
 	REG(SYS_COUNT_RX_SHORTS,		0x000010),
 	REG(SYS_COUNT_RX_FRAGMENTS,		0x000014),
 	REG(SYS_COUNT_RX_JABBERS,		0x000018),
+	REG(SYS_COUNT_RX_CRC_ALIGN_ERRS,	0x00001c),
+	REG(SYS_COUNT_RX_SYM_ERRS,		0x000020),
 	REG(SYS_COUNT_RX_64,			0x000024),
 	REG(SYS_COUNT_RX_65_127,		0x000028),
 	REG(SYS_COUNT_RX_128_255,		0x00002c),
@@ -288,9 +292,38 @@ static const u32 vsc9959_sys_regmap[] = {
 	REG(SYS_COUNT_RX_PAUSE,			0x000040),
 	REG(SYS_COUNT_RX_CONTROL,		0x000044),
 	REG(SYS_COUNT_RX_LONGS,			0x000048),
+	REG(SYS_COUNT_RX_CLASSIFIED_DROPS,	0x00004c),
+	REG(SYS_COUNT_RX_RED_PRIO_0,		0x000050),
+	REG(SYS_COUNT_RX_RED_PRIO_1,		0x000054),
+	REG(SYS_COUNT_RX_RED_PRIO_2,		0x000058),
+	REG(SYS_COUNT_RX_RED_PRIO_3,		0x00005c),
+	REG(SYS_COUNT_RX_RED_PRIO_4,		0x000060),
+	REG(SYS_COUNT_RX_RED_PRIO_5,		0x000064),
+	REG(SYS_COUNT_RX_RED_PRIO_6,		0x000068),
+	REG(SYS_COUNT_RX_RED_PRIO_7,		0x00006c),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_0,		0x000070),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_1,		0x000074),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_2,		0x000078),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_3,		0x00007c),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_4,		0x000080),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_5,		0x000084),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_6,		0x000088),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_7,		0x00008c),
+	REG(SYS_COUNT_RX_GREEN_PRIO_0,		0x000090),
+	REG(SYS_COUNT_RX_GREEN_PRIO_1,		0x000094),
+	REG(SYS_COUNT_RX_GREEN_PRIO_2,		0x000098),
+	REG(SYS_COUNT_RX_GREEN_PRIO_3,		0x00009c),
+	REG(SYS_COUNT_RX_GREEN_PRIO_4,		0x0000a0),
+	REG(SYS_COUNT_RX_GREEN_PRIO_5,		0x0000a4),
+	REG(SYS_COUNT_RX_GREEN_PRIO_6,		0x0000a8),
+	REG(SYS_COUNT_RX_GREEN_PRIO_7,		0x0000ac),
 	REG(SYS_COUNT_TX_OCTETS,		0x000200),
+	REG(SYS_COUNT_TX_UNICAST,		0x000204),
+	REG(SYS_COUNT_TX_MULTICAST,		0x000208),
+	REG(SYS_COUNT_TX_BROADCAST,		0x00020c),
 	REG(SYS_COUNT_TX_COLLISION,		0x000210),
 	REG(SYS_COUNT_TX_DROPS,			0x000214),
+	REG(SYS_COUNT_TX_PAUSE,			0x000218),
 	REG(SYS_COUNT_TX_64,			0x00021c),
 	REG(SYS_COUNT_TX_65_127,		0x000220),
 	REG(SYS_COUNT_TX_128_255,		0x000224),
@@ -298,7 +331,41 @@ static const u32 vsc9959_sys_regmap[] = {
 	REG(SYS_COUNT_TX_512_1023,		0x00022c),
 	REG(SYS_COUNT_TX_1024_1526,		0x000230),
 	REG(SYS_COUNT_TX_1527_MAX,		0x000234),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_0,		0x000238),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_1,		0x00023c),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_2,		0x000240),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_3,		0x000244),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_4,		0x000248),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_5,		0x00024c),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_6,		0x000250),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_7,		0x000254),
+	REG(SYS_COUNT_TX_GREEN_PRIO_0,		0x000258),
+	REG(SYS_COUNT_TX_GREEN_PRIO_1,		0x00025c),
+	REG(SYS_COUNT_TX_GREEN_PRIO_2,		0x000260),
+	REG(SYS_COUNT_TX_GREEN_PRIO_3,		0x000264),
+	REG(SYS_COUNT_TX_GREEN_PRIO_4,		0x000268),
+	REG(SYS_COUNT_TX_GREEN_PRIO_5,		0x00026c),
+	REG(SYS_COUNT_TX_GREEN_PRIO_6,		0x000270),
+	REG(SYS_COUNT_TX_GREEN_PRIO_7,		0x000274),
 	REG(SYS_COUNT_TX_AGING,			0x000278),
+	REG(SYS_COUNT_DROP_LOCAL,		0x000400),
+	REG(SYS_COUNT_DROP_TAIL,		0x000404),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_0,	0x000408),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_1,	0x00040c),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_2,	0x000410),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_3,	0x000414),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_4,	0x000418),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_5,	0x00041c),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_6,	0x000420),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_7,	0x000424),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_0,	0x000428),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_1,	0x00042c),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_2,	0x000430),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_3,	0x000434),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_4,	0x000438),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_5,	0x00043c),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_6,	0x000440),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_7,	0x000444),
 	REG(SYS_RESET_CFG,			0x000e00),
 	REG(SYS_SR_ETYPE_CFG,			0x000e04),
 	REG(SYS_VLAN_ETYPE_CFG,			0x000e08),
@@ -554,375 +621,375 @@ static const struct reg_field vsc9959_regfields[REGFIELD_MAX] = {
 static const struct ocelot_stat_layout vsc9959_stats_layout[OCELOT_NUM_STATS] = {
 	[OCELOT_STAT_RX_OCTETS] = {
 		.name = "rx_octets",
-		.offset = 0x00,
+		.reg = SYS_COUNT_RX_OCTETS,
 	},
 	[OCELOT_STAT_RX_UNICAST] = {
 		.name = "rx_unicast",
-		.offset = 0x01,
+		.reg = SYS_COUNT_RX_UNICAST,
 	},
 	[OCELOT_STAT_RX_MULTICAST] = {
 		.name = "rx_multicast",
-		.offset = 0x02,
+		.reg = SYS_COUNT_RX_MULTICAST,
 	},
 	[OCELOT_STAT_RX_BROADCAST] = {
 		.name = "rx_broadcast",
-		.offset = 0x03,
+		.reg = SYS_COUNT_RX_BROADCAST,
 	},
 	[OCELOT_STAT_RX_SHORTS] = {
 		.name = "rx_shorts",
-		.offset = 0x04,
+		.reg = SYS_COUNT_RX_SHORTS,
 	},
 	[OCELOT_STAT_RX_FRAGMENTS] = {
 		.name = "rx_fragments",
-		.offset = 0x05,
+		.reg = SYS_COUNT_RX_FRAGMENTS,
 	},
 	[OCELOT_STAT_RX_JABBERS] = {
 		.name = "rx_jabbers",
-		.offset = 0x06,
+		.reg = SYS_COUNT_RX_JABBERS,
 	},
 	[OCELOT_STAT_RX_CRC_ALIGN_ERRS] = {
 		.name = "rx_crc_align_errs",
-		.offset = 0x07,
+		.reg = SYS_COUNT_RX_CRC_ALIGN_ERRS,
 	},
 	[OCELOT_STAT_RX_SYM_ERRS] = {
 		.name = "rx_sym_errs",
-		.offset = 0x08,
+		.reg = SYS_COUNT_RX_SYM_ERRS,
 	},
 	[OCELOT_STAT_RX_64] = {
 		.name = "rx_frames_below_65_octets",
-		.offset = 0x09,
+		.reg = SYS_COUNT_RX_64,
 	},
 	[OCELOT_STAT_RX_65_127] = {
 		.name = "rx_frames_65_to_127_octets",
-		.offset = 0x0A,
+		.reg = SYS_COUNT_RX_65_127,
 	},
 	[OCELOT_STAT_RX_128_255] = {
 		.name = "rx_frames_128_to_255_octets",
-		.offset = 0x0B,
+		.reg = SYS_COUNT_RX_128_255,
 	},
 	[OCELOT_STAT_RX_256_511] = {
 		.name = "rx_frames_256_to_511_octets",
-		.offset = 0x0C,
+		.reg = SYS_COUNT_RX_256_511,
 	},
 	[OCELOT_STAT_RX_512_1023] = {
 		.name = "rx_frames_512_to_1023_octets",
-		.offset = 0x0D,
+		.reg = SYS_COUNT_RX_512_1023,
 	},
 	[OCELOT_STAT_RX_1024_1526] = {
 		.name = "rx_frames_1024_to_1526_octets",
-		.offset = 0x0E,
+		.reg = SYS_COUNT_RX_1024_1526,
 	},
 	[OCELOT_STAT_RX_1527_MAX] = {
 		.name = "rx_frames_over_1526_octets",
-		.offset = 0x0F,
+		.reg = SYS_COUNT_RX_1527_MAX,
 	},
 	[OCELOT_STAT_RX_PAUSE] = {
 		.name = "rx_pause",
-		.offset = 0x10,
+		.reg = SYS_COUNT_RX_PAUSE,
 	},
 	[OCELOT_STAT_RX_CONTROL] = {
 		.name = "rx_control",
-		.offset = 0x11,
+		.reg = SYS_COUNT_RX_CONTROL,
 	},
 	[OCELOT_STAT_RX_LONGS] = {
 		.name = "rx_longs",
-		.offset = 0x12,
+		.reg = SYS_COUNT_RX_LONGS,
 	},
 	[OCELOT_STAT_RX_CLASSIFIED_DROPS] = {
 		.name = "rx_classified_drops",
-		.offset = 0x13,
+		.reg = SYS_COUNT_RX_CLASSIFIED_DROPS,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_0] = {
 		.name = "rx_red_prio_0",
-		.offset = 0x14,
+		.reg = SYS_COUNT_RX_RED_PRIO_0,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_1] = {
 		.name = "rx_red_prio_1",
-		.offset = 0x15,
+		.reg = SYS_COUNT_RX_RED_PRIO_1,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_2] = {
 		.name = "rx_red_prio_2",
-		.offset = 0x16,
+		.reg = SYS_COUNT_RX_RED_PRIO_2,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_3] = {
 		.name = "rx_red_prio_3",
-		.offset = 0x17,
+		.reg = SYS_COUNT_RX_RED_PRIO_3,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_4] = {
 		.name = "rx_red_prio_4",
-		.offset = 0x18,
+		.reg = SYS_COUNT_RX_RED_PRIO_4,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_5] = {
 		.name = "rx_red_prio_5",
-		.offset = 0x19,
+		.reg = SYS_COUNT_RX_RED_PRIO_5,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_6] = {
 		.name = "rx_red_prio_6",
-		.offset = 0x1A,
+		.reg = SYS_COUNT_RX_RED_PRIO_6,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_7] = {
 		.name = "rx_red_prio_7",
-		.offset = 0x1B,
+		.reg = SYS_COUNT_RX_RED_PRIO_7,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_0] = {
 		.name = "rx_yellow_prio_0",
-		.offset = 0x1C,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_0,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_1] = {
 		.name = "rx_yellow_prio_1",
-		.offset = 0x1D,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_1,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_2] = {
 		.name = "rx_yellow_prio_2",
-		.offset = 0x1E,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_2,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_3] = {
 		.name = "rx_yellow_prio_3",
-		.offset = 0x1F,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_3,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_4] = {
 		.name = "rx_yellow_prio_4",
-		.offset = 0x20,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_4,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_5] = {
 		.name = "rx_yellow_prio_5",
-		.offset = 0x21,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_5,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_6] = {
 		.name = "rx_yellow_prio_6",
-		.offset = 0x22,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_6,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_7] = {
 		.name = "rx_yellow_prio_7",
-		.offset = 0x23,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_7,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_0] = {
 		.name = "rx_green_prio_0",
-		.offset = 0x24,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_0,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_1] = {
 		.name = "rx_green_prio_1",
-		.offset = 0x25,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_1,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_2] = {
 		.name = "rx_green_prio_2",
-		.offset = 0x26,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_2,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_3] = {
 		.name = "rx_green_prio_3",
-		.offset = 0x27,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_3,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_4] = {
 		.name = "rx_green_prio_4",
-		.offset = 0x28,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_4,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_5] = {
 		.name = "rx_green_prio_5",
-		.offset = 0x29,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_5,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_6] = {
 		.name = "rx_green_prio_6",
-		.offset = 0x2A,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_6,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_7] = {
 		.name = "rx_green_prio_7",
-		.offset = 0x2B,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_7,
 	},
 	[OCELOT_STAT_TX_OCTETS] = {
 		.name = "tx_octets",
-		.offset = 0x80,
+		.reg = SYS_COUNT_TX_OCTETS,
 	},
 	[OCELOT_STAT_TX_UNICAST] = {
 		.name = "tx_unicast",
-		.offset = 0x81,
+		.reg = SYS_COUNT_TX_UNICAST,
 	},
 	[OCELOT_STAT_TX_MULTICAST] = {
 		.name = "tx_multicast",
-		.offset = 0x82,
+		.reg = SYS_COUNT_TX_MULTICAST,
 	},
 	[OCELOT_STAT_TX_BROADCAST] = {
 		.name = "tx_broadcast",
-		.offset = 0x83,
+		.reg = SYS_COUNT_TX_BROADCAST,
 	},
 	[OCELOT_STAT_TX_COLLISION] = {
 		.name = "tx_collision",
-		.offset = 0x84,
+		.reg = SYS_COUNT_TX_COLLISION,
 	},
 	[OCELOT_STAT_TX_DROPS] = {
 		.name = "tx_drops",
-		.offset = 0x85,
+		.reg = SYS_COUNT_TX_DROPS,
 	},
 	[OCELOT_STAT_TX_PAUSE] = {
 		.name = "tx_pause",
-		.offset = 0x86,
+		.reg = SYS_COUNT_TX_PAUSE,
 	},
 	[OCELOT_STAT_TX_64] = {
 		.name = "tx_frames_below_65_octets",
-		.offset = 0x87,
+		.reg = SYS_COUNT_TX_64,
 	},
 	[OCELOT_STAT_TX_65_127] = {
 		.name = "tx_frames_65_to_127_octets",
-		.offset = 0x88,
+		.reg = SYS_COUNT_TX_65_127,
 	},
 	[OCELOT_STAT_TX_128_255] = {
 		.name = "tx_frames_128_255_octets",
-		.offset = 0x89,
+		.reg = SYS_COUNT_TX_128_255,
 	},
 	[OCELOT_STAT_TX_256_511] = {
 		.name = "tx_frames_256_511_octets",
-		.offset = 0x8A,
+		.reg = SYS_COUNT_TX_256_511,
 	},
 	[OCELOT_STAT_TX_512_1023] = {
 		.name = "tx_frames_512_1023_octets",
-		.offset = 0x8B,
+		.reg = SYS_COUNT_TX_512_1023,
 	},
 	[OCELOT_STAT_TX_1024_1526] = {
 		.name = "tx_frames_1024_1526_octets",
-		.offset = 0x8C,
+		.reg = SYS_COUNT_TX_1024_1526,
 	},
 	[OCELOT_STAT_TX_1527_MAX] = {
 		.name = "tx_frames_over_1526_octets",
-		.offset = 0x8D,
+		.reg = SYS_COUNT_TX_1527_MAX,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_0] = {
 		.name = "tx_yellow_prio_0",
-		.offset = 0x8E,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_0,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_1] = {
 		.name = "tx_yellow_prio_1",
-		.offset = 0x8F,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_1,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_2] = {
 		.name = "tx_yellow_prio_2",
-		.offset = 0x90,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_2,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_3] = {
 		.name = "tx_yellow_prio_3",
-		.offset = 0x91,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_3,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_4] = {
 		.name = "tx_yellow_prio_4",
-		.offset = 0x92,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_4,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_5] = {
 		.name = "tx_yellow_prio_5",
-		.offset = 0x93,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_5,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_6] = {
 		.name = "tx_yellow_prio_6",
-		.offset = 0x94,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_6,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_7] = {
 		.name = "tx_yellow_prio_7",
-		.offset = 0x95,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_7,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_0] = {
 		.name = "tx_green_prio_0",
-		.offset = 0x96,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_0,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_1] = {
 		.name = "tx_green_prio_1",
-		.offset = 0x97,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_1,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_2] = {
 		.name = "tx_green_prio_2",
-		.offset = 0x98,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_2,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_3] = {
 		.name = "tx_green_prio_3",
-		.offset = 0x99,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_3,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_4] = {
 		.name = "tx_green_prio_4",
-		.offset = 0x9A,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_4,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_5] = {
 		.name = "tx_green_prio_5",
-		.offset = 0x9B,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_5,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_6] = {
 		.name = "tx_green_prio_6",
-		.offset = 0x9C,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_6,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_7] = {
 		.name = "tx_green_prio_7",
-		.offset = 0x9D,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_7,
 	},
 	[OCELOT_STAT_TX_AGED] = {
 		.name = "tx_aged",
-		.offset = 0x9E,
+		.reg = SYS_COUNT_TX_AGING,
 	},
 	[OCELOT_STAT_DROP_LOCAL] = {
 		.name = "drop_local",
-		.offset = 0x100,
+		.reg = SYS_COUNT_DROP_LOCAL,
 	},
 	[OCELOT_STAT_DROP_TAIL] = {
 		.name = "drop_tail",
-		.offset = 0x101,
+		.reg = SYS_COUNT_DROP_TAIL,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_0] = {
 		.name = "drop_yellow_prio_0",
-		.offset = 0x102,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_0,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_1] = {
 		.name = "drop_yellow_prio_1",
-		.offset = 0x103,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_1,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_2] = {
 		.name = "drop_yellow_prio_2",
-		.offset = 0x104,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_2,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_3] = {
 		.name = "drop_yellow_prio_3",
-		.offset = 0x105,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_3,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_4] = {
 		.name = "drop_yellow_prio_4",
-		.offset = 0x106,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_4,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_5] = {
 		.name = "drop_yellow_prio_5",
-		.offset = 0x107,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_5,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_6] = {
 		.name = "drop_yellow_prio_6",
-		.offset = 0x108,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_6,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_7] = {
 		.name = "drop_yellow_prio_7",
-		.offset = 0x109,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_7,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_0] = {
 		.name = "drop_green_prio_0",
-		.offset = 0x10A,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_0,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_1] = {
 		.name = "drop_green_prio_1",
-		.offset = 0x10B,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_1,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_2] = {
 		.name = "drop_green_prio_2",
-		.offset = 0x10C,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_2,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_3] = {
 		.name = "drop_green_prio_3",
-		.offset = 0x10D,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_3,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_4] = {
 		.name = "drop_green_prio_4",
-		.offset = 0x10E,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_4,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_5] = {
 		.name = "drop_green_prio_5",
-		.offset = 0x10F,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_5,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_6] = {
 		.name = "drop_green_prio_6",
-		.offset = 0x110,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_6,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_7] = {
 		.name = "drop_green_prio_7",
-		.offset = 0x111,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_7,
 	},
 };
 
diff --git a/drivers/net/dsa/ocelot/seville_vsc9953.c b/drivers/net/dsa/ocelot/seville_vsc9953.c
index fe5d4642d0bc..b34f4cdfe814 100644
--- a/drivers/net/dsa/ocelot/seville_vsc9953.c
+++ b/drivers/net/dsa/ocelot/seville_vsc9953.c
@@ -270,10 +270,14 @@ static const u32 vsc9953_rew_regmap[] = {
 
 static const u32 vsc9953_sys_regmap[] = {
 	REG(SYS_COUNT_RX_OCTETS,		0x000000),
+	REG(SYS_COUNT_RX_UNICAST,		0x000004),
 	REG(SYS_COUNT_RX_MULTICAST,		0x000008),
+	REG(SYS_COUNT_RX_BROADCAST,		0x00000c),
 	REG(SYS_COUNT_RX_SHORTS,		0x000010),
 	REG(SYS_COUNT_RX_FRAGMENTS,		0x000014),
 	REG(SYS_COUNT_RX_JABBERS,		0x000018),
+	REG(SYS_COUNT_RX_CRC_ALIGN_ERRS,	0x00001c),
+	REG(SYS_COUNT_RX_SYM_ERRS,		0x000020),
 	REG(SYS_COUNT_RX_64,			0x000024),
 	REG(SYS_COUNT_RX_65_127,		0x000028),
 	REG(SYS_COUNT_RX_128_255,		0x00002c),
@@ -281,10 +285,41 @@ static const u32 vsc9953_sys_regmap[] = {
 	REG(SYS_COUNT_RX_512_1023,		0x000034),
 	REG(SYS_COUNT_RX_1024_1526,		0x000038),
 	REG(SYS_COUNT_RX_1527_MAX,		0x00003c),
+	REG(SYS_COUNT_RX_PAUSE,			0x000040),
+	REG(SYS_COUNT_RX_CONTROL,		0x000044),
 	REG(SYS_COUNT_RX_LONGS,			0x000048),
+	REG(SYS_COUNT_RX_CLASSIFIED_DROPS,	0x00004c),
+	REG(SYS_COUNT_RX_RED_PRIO_0,		0x000050),
+	REG(SYS_COUNT_RX_RED_PRIO_1,		0x000054),
+	REG(SYS_COUNT_RX_RED_PRIO_2,		0x000058),
+	REG(SYS_COUNT_RX_RED_PRIO_3,		0x00005c),
+	REG(SYS_COUNT_RX_RED_PRIO_4,		0x000060),
+	REG(SYS_COUNT_RX_RED_PRIO_5,		0x000064),
+	REG(SYS_COUNT_RX_RED_PRIO_6,		0x000068),
+	REG(SYS_COUNT_RX_RED_PRIO_7,		0x00006c),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_0,		0x000070),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_1,		0x000074),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_2,		0x000078),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_3,		0x00007c),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_4,		0x000080),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_5,		0x000084),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_6,		0x000088),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_7,		0x00008c),
+	REG(SYS_COUNT_RX_GREEN_PRIO_0,		0x000090),
+	REG(SYS_COUNT_RX_GREEN_PRIO_1,		0x000094),
+	REG(SYS_COUNT_RX_GREEN_PRIO_2,		0x000098),
+	REG(SYS_COUNT_RX_GREEN_PRIO_3,		0x00009c),
+	REG(SYS_COUNT_RX_GREEN_PRIO_4,		0x0000a0),
+	REG(SYS_COUNT_RX_GREEN_PRIO_5,		0x0000a4),
+	REG(SYS_COUNT_RX_GREEN_PRIO_6,		0x0000a8),
+	REG(SYS_COUNT_RX_GREEN_PRIO_7,		0x0000ac),
 	REG(SYS_COUNT_TX_OCTETS,		0x000100),
+	REG(SYS_COUNT_TX_UNICAST,		0x000104),
+	REG(SYS_COUNT_TX_MULTICAST,		0x000108),
+	REG(SYS_COUNT_TX_BROADCAST,		0x00010c),
 	REG(SYS_COUNT_TX_COLLISION,		0x000110),
 	REG(SYS_COUNT_TX_DROPS,			0x000114),
+	REG(SYS_COUNT_TX_PAUSE,			0x000118),
 	REG(SYS_COUNT_TX_64,			0x00011c),
 	REG(SYS_COUNT_TX_65_127,		0x000120),
 	REG(SYS_COUNT_TX_128_255,		0x000124),
@@ -292,7 +327,41 @@ static const u32 vsc9953_sys_regmap[] = {
 	REG(SYS_COUNT_TX_512_1023,		0x00012c),
 	REG(SYS_COUNT_TX_1024_1526,		0x000130),
 	REG(SYS_COUNT_TX_1527_MAX,		0x000134),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_0,		0x000138),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_1,		0x00013c),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_2,		0x000140),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_3,		0x000144),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_4,		0x000148),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_5,		0x00014c),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_6,		0x000150),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_7,		0x000154),
+	REG(SYS_COUNT_TX_GREEN_PRIO_0,		0x000158),
+	REG(SYS_COUNT_TX_GREEN_PRIO_1,		0x00015c),
+	REG(SYS_COUNT_TX_GREEN_PRIO_2,		0x000160),
+	REG(SYS_COUNT_TX_GREEN_PRIO_3,		0x000164),
+	REG(SYS_COUNT_TX_GREEN_PRIO_4,		0x000168),
+	REG(SYS_COUNT_TX_GREEN_PRIO_5,		0x00016c),
+	REG(SYS_COUNT_TX_GREEN_PRIO_6,		0x000170),
+	REG(SYS_COUNT_TX_GREEN_PRIO_7,		0x000174),
 	REG(SYS_COUNT_TX_AGING,			0x000178),
+	REG(SYS_COUNT_DROP_LOCAL,		0x000200),
+	REG(SYS_COUNT_DROP_TAIL,		0x000204),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_0,	0x000208),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_1,	0x00020c),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_2,	0x000210),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_3,	0x000214),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_4,	0x000218),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_5,	0x00021c),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_6,	0x000220),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_7,	0x000224),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_0,	0x000228),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_1,	0x00022c),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_2,	0x000230),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_3,	0x000234),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_4,	0x000238),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_5,	0x00023c),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_6,	0x000240),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_7,	0x000244),
 	REG(SYS_RESET_CFG,			0x000318),
 	REG_RESERVED(SYS_SR_ETYPE_CFG),
 	REG(SYS_VLAN_ETYPE_CFG,			0x000320),
@@ -548,375 +617,375 @@ static const struct reg_field vsc9953_regfields[REGFIELD_MAX] = {
 static const struct ocelot_stat_layout vsc9953_stats_layout[OCELOT_NUM_STATS] = {
 	[OCELOT_STAT_RX_OCTETS] = {
 		.name = "rx_octets",
-		.offset = 0x00,
+		.reg = SYS_COUNT_RX_OCTETS,
 	},
 	[OCELOT_STAT_RX_UNICAST] = {
 		.name = "rx_unicast",
-		.offset = 0x01,
+		.reg = SYS_COUNT_RX_UNICAST,
 	},
 	[OCELOT_STAT_RX_MULTICAST] = {
 		.name = "rx_multicast",
-		.offset = 0x02,
+		.reg = SYS_COUNT_RX_MULTICAST,
 	},
 	[OCELOT_STAT_RX_BROADCAST] = {
 		.name = "rx_broadcast",
-		.offset = 0x03,
+		.reg = SYS_COUNT_RX_BROADCAST,
 	},
 	[OCELOT_STAT_RX_SHORTS] = {
 		.name = "rx_shorts",
-		.offset = 0x04,
+		.reg = SYS_COUNT_RX_SHORTS,
 	},
 	[OCELOT_STAT_RX_FRAGMENTS] = {
 		.name = "rx_fragments",
-		.offset = 0x05,
+		.reg = SYS_COUNT_RX_FRAGMENTS,
 	},
 	[OCELOT_STAT_RX_JABBERS] = {
 		.name = "rx_jabbers",
-		.offset = 0x06,
+		.reg = SYS_COUNT_RX_JABBERS,
 	},
 	[OCELOT_STAT_RX_CRC_ALIGN_ERRS] = {
 		.name = "rx_crc_align_errs",
-		.offset = 0x07,
+		.reg = SYS_COUNT_RX_CRC_ALIGN_ERRS,
 	},
 	[OCELOT_STAT_RX_SYM_ERRS] = {
 		.name = "rx_sym_errs",
-		.offset = 0x08,
+		.reg = SYS_COUNT_RX_SYM_ERRS,
 	},
 	[OCELOT_STAT_RX_64] = {
 		.name = "rx_frames_below_65_octets",
-		.offset = 0x09,
+		.reg = SYS_COUNT_RX_64,
 	},
 	[OCELOT_STAT_RX_65_127] = {
 		.name = "rx_frames_65_to_127_octets",
-		.offset = 0x0A,
+		.reg = SYS_COUNT_RX_65_127,
 	},
 	[OCELOT_STAT_RX_128_255] = {
 		.name = "rx_frames_128_to_255_octets",
-		.offset = 0x0B,
+		.reg = SYS_COUNT_RX_128_255,
 	},
 	[OCELOT_STAT_RX_256_511] = {
 		.name = "rx_frames_256_to_511_octets",
-		.offset = 0x0C,
+		.reg = SYS_COUNT_RX_256_511,
 	},
 	[OCELOT_STAT_RX_512_1023] = {
 		.name = "rx_frames_512_to_1023_octets",
-		.offset = 0x0D,
+		.reg = SYS_COUNT_RX_512_1023,
 	},
 	[OCELOT_STAT_RX_1024_1526] = {
 		.name = "rx_frames_1024_to_1526_octets",
-		.offset = 0x0E,
+		.reg = SYS_COUNT_RX_1024_1526,
 	},
 	[OCELOT_STAT_RX_1527_MAX] = {
 		.name = "rx_frames_over_1526_octets",
-		.offset = 0x0F,
+		.reg = SYS_COUNT_RX_1527_MAX,
 	},
 	[OCELOT_STAT_RX_PAUSE] = {
 		.name = "rx_pause",
-		.offset = 0x10,
+		.reg = SYS_COUNT_RX_PAUSE,
 	},
 	[OCELOT_STAT_RX_CONTROL] = {
 		.name = "rx_control",
-		.offset = 0x11,
+		.reg = SYS_COUNT_RX_CONTROL,
 	},
 	[OCELOT_STAT_RX_LONGS] = {
 		.name = "rx_longs",
-		.offset = 0x12,
+		.reg = SYS_COUNT_RX_LONGS,
 	},
 	[OCELOT_STAT_RX_CLASSIFIED_DROPS] = {
 		.name = "rx_classified_drops",
-		.offset = 0x13,
+		.reg = SYS_COUNT_RX_CLASSIFIED_DROPS,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_0] = {
 		.name = "rx_red_prio_0",
-		.offset = 0x14,
+		.reg = SYS_COUNT_RX_RED_PRIO_0,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_1] = {
 		.name = "rx_red_prio_1",
-		.offset = 0x15,
+		.reg = SYS_COUNT_RX_RED_PRIO_1,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_2] = {
 		.name = "rx_red_prio_2",
-		.offset = 0x16,
+		.reg = SYS_COUNT_RX_RED_PRIO_2,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_3] = {
 		.name = "rx_red_prio_3",
-		.offset = 0x17,
+		.reg = SYS_COUNT_RX_RED_PRIO_3,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_4] = {
 		.name = "rx_red_prio_4",
-		.offset = 0x18,
+		.reg = SYS_COUNT_RX_RED_PRIO_4,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_5] = {
 		.name = "rx_red_prio_5",
-		.offset = 0x19,
+		.reg = SYS_COUNT_RX_RED_PRIO_5,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_6] = {
 		.name = "rx_red_prio_6",
-		.offset = 0x1A,
+		.reg = SYS_COUNT_RX_RED_PRIO_6,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_7] = {
 		.name = "rx_red_prio_7",
-		.offset = 0x1B,
+		.reg = SYS_COUNT_RX_RED_PRIO_7,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_0] = {
 		.name = "rx_yellow_prio_0",
-		.offset = 0x1C,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_0,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_1] = {
 		.name = "rx_yellow_prio_1",
-		.offset = 0x1D,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_1,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_2] = {
 		.name = "rx_yellow_prio_2",
-		.offset = 0x1E,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_2,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_3] = {
 		.name = "rx_yellow_prio_3",
-		.offset = 0x1F,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_3,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_4] = {
 		.name = "rx_yellow_prio_4",
-		.offset = 0x20,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_4,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_5] = {
 		.name = "rx_yellow_prio_5",
-		.offset = 0x21,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_5,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_6] = {
 		.name = "rx_yellow_prio_6",
-		.offset = 0x22,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_6,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_7] = {
 		.name = "rx_yellow_prio_7",
-		.offset = 0x23,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_7,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_0] = {
 		.name = "rx_green_prio_0",
-		.offset = 0x24,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_0,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_1] = {
 		.name = "rx_green_prio_1",
-		.offset = 0x25,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_1,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_2] = {
 		.name = "rx_green_prio_2",
-		.offset = 0x26,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_2,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_3] = {
 		.name = "rx_green_prio_3",
-		.offset = 0x27,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_3,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_4] = {
 		.name = "rx_green_prio_4",
-		.offset = 0x28,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_4,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_5] = {
 		.name = "rx_green_prio_5",
-		.offset = 0x29,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_5,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_6] = {
 		.name = "rx_green_prio_6",
-		.offset = 0x2A,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_6,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_7] = {
 		.name = "rx_green_prio_7",
-		.offset = 0x2B,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_7,
 	},
 	[OCELOT_STAT_TX_OCTETS] = {
 		.name = "tx_octets",
-		.offset = 0x40,
+		.reg = SYS_COUNT_TX_OCTETS,
 	},
 	[OCELOT_STAT_TX_UNICAST] = {
 		.name = "tx_unicast",
-		.offset = 0x41,
+		.reg = SYS_COUNT_TX_UNICAST,
 	},
 	[OCELOT_STAT_TX_MULTICAST] = {
 		.name = "tx_multicast",
-		.offset = 0x42,
+		.reg = SYS_COUNT_TX_MULTICAST,
 	},
 	[OCELOT_STAT_TX_BROADCAST] = {
 		.name = "tx_broadcast",
-		.offset = 0x43,
+		.reg = SYS_COUNT_TX_BROADCAST,
 	},
 	[OCELOT_STAT_TX_COLLISION] = {
 		.name = "tx_collision",
-		.offset = 0x44,
+		.reg = SYS_COUNT_TX_COLLISION,
 	},
 	[OCELOT_STAT_TX_DROPS] = {
 		.name = "tx_drops",
-		.offset = 0x45,
+		.reg = SYS_COUNT_TX_DROPS,
 	},
 	[OCELOT_STAT_TX_PAUSE] = {
 		.name = "tx_pause",
-		.offset = 0x46,
+		.reg = SYS_COUNT_TX_PAUSE,
 	},
 	[OCELOT_STAT_TX_64] = {
 		.name = "tx_frames_below_65_octets",
-		.offset = 0x47,
+		.reg = SYS_COUNT_TX_64,
 	},
 	[OCELOT_STAT_TX_65_127] = {
 		.name = "tx_frames_65_to_127_octets",
-		.offset = 0x48,
+		.reg = SYS_COUNT_TX_65_127,
 	},
 	[OCELOT_STAT_TX_128_255] = {
 		.name = "tx_frames_128_255_octets",
-		.offset = 0x49,
+		.reg = SYS_COUNT_TX_128_255,
 	},
 	[OCELOT_STAT_TX_256_511] = {
 		.name = "tx_frames_256_511_octets",
-		.offset = 0x4A,
+		.reg = SYS_COUNT_TX_256_511,
 	},
 	[OCELOT_STAT_TX_512_1023] = {
 		.name = "tx_frames_512_1023_octets",
-		.offset = 0x4B,
+		.reg = SYS_COUNT_TX_512_1023,
 	},
 	[OCELOT_STAT_TX_1024_1526] = {
 		.name = "tx_frames_1024_1526_octets",
-		.offset = 0x4C,
+		.reg = SYS_COUNT_TX_1024_1526,
 	},
 	[OCELOT_STAT_TX_1527_MAX] = {
 		.name = "tx_frames_over_1526_octets",
-		.offset = 0x4D,
+		.reg = SYS_COUNT_TX_1527_MAX,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_0] = {
 		.name = "tx_yellow_prio_0",
-		.offset = 0x4E,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_0,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_1] = {
 		.name = "tx_yellow_prio_1",
-		.offset = 0x4F,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_1,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_2] = {
 		.name = "tx_yellow_prio_2",
-		.offset = 0x50,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_2,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_3] = {
 		.name = "tx_yellow_prio_3",
-		.offset = 0x51,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_3,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_4] = {
 		.name = "tx_yellow_prio_4",
-		.offset = 0x52,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_4,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_5] = {
 		.name = "tx_yellow_prio_5",
-		.offset = 0x53,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_5,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_6] = {
 		.name = "tx_yellow_prio_6",
-		.offset = 0x54,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_6,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_7] = {
 		.name = "tx_yellow_prio_7",
-		.offset = 0x55,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_7,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_0] = {
 		.name = "tx_green_prio_0",
-		.offset = 0x56,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_0,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_1] = {
 		.name = "tx_green_prio_1",
-		.offset = 0x57,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_1,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_2] = {
 		.name = "tx_green_prio_2",
-		.offset = 0x58,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_2,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_3] = {
 		.name = "tx_green_prio_3",
-		.offset = 0x59,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_3,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_4] = {
 		.name = "tx_green_prio_4",
-		.offset = 0x5A,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_4,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_5] = {
 		.name = "tx_green_prio_5",
-		.offset = 0x5B,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_5,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_6] = {
 		.name = "tx_green_prio_6",
-		.offset = 0x5C,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_6,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_7] = {
 		.name = "tx_green_prio_7",
-		.offset = 0x5D,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_7,
 	},
 	[OCELOT_STAT_TX_AGED] = {
 		.name = "tx_aged",
-		.offset = 0x5E,
+		.reg = SYS_COUNT_TX_AGING,
 	},
 	[OCELOT_STAT_DROP_LOCAL] = {
 		.name = "drop_local",
-		.offset = 0x80,
+		.reg = SYS_COUNT_DROP_LOCAL,
 	},
 	[OCELOT_STAT_DROP_TAIL] = {
 		.name = "drop_tail",
-		.offset = 0x81,
+		.reg = SYS_COUNT_DROP_TAIL,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_0] = {
 		.name = "drop_yellow_prio_0",
-		.offset = 0x82,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_0,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_1] = {
 		.name = "drop_yellow_prio_1",
-		.offset = 0x83,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_1,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_2] = {
 		.name = "drop_yellow_prio_2",
-		.offset = 0x84,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_2,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_3] = {
 		.name = "drop_yellow_prio_3",
-		.offset = 0x85,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_3,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_4] = {
 		.name = "drop_yellow_prio_4",
-		.offset = 0x86,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_4,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_5] = {
 		.name = "drop_yellow_prio_5",
-		.offset = 0x87,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_5,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_6] = {
 		.name = "drop_yellow_prio_6",
-		.offset = 0x88,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_6,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_7] = {
 		.name = "drop_yellow_prio_7",
-		.offset = 0x89,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_7,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_0] = {
 		.name = "drop_green_prio_0",
-		.offset = 0x8A,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_0,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_1] = {
 		.name = "drop_green_prio_1",
-		.offset = 0x8B,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_1,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_2] = {
 		.name = "drop_green_prio_2",
-		.offset = 0x8C,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_2,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_3] = {
 		.name = "drop_green_prio_3",
-		.offset = 0x8D,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_3,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_4] = {
 		.name = "drop_green_prio_4",
-		.offset = 0x8E,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_4,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_5] = {
 		.name = "drop_green_prio_5",
-		.offset = 0x8F,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_5,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_6] = {
 		.name = "drop_green_prio_6",
-		.offset = 0x90,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_6,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_7] = {
 		.name = "drop_green_prio_7",
-		.offset = 0x91,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_7,
 	},
 };
 
diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 68991b021c56..306026e6aa11 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -1881,9 +1881,8 @@ static int ocelot_port_update_stats(struct ocelot *ocelot, int port)
 	ocelot_write(ocelot, SYS_STAT_CFG_STAT_VIEW(port), SYS_STAT_CFG);
 
 	list_for_each_entry(region, &ocelot->stats_regions, node) {
-		err = ocelot_bulk_read_rix(ocelot, SYS_COUNT_RX_OCTETS,
-					   region->offset, region->buf,
-					   region->count);
+		err = ocelot_bulk_read(ocelot, region->base, region->buf,
+				       region->count);
 		if (err)
 			return err;
 
@@ -1978,7 +1977,7 @@ static int ocelot_prepare_stats_regions(struct ocelot *ocelot)
 		if (ocelot->stats_layout[i].name[0] == '\0')
 			continue;
 
-		if (region && ocelot->stats_layout[i].offset == last + 1) {
+		if (region && ocelot->stats_layout[i].reg == last + 4) {
 			region->count++;
 		} else {
 			region = devm_kzalloc(ocelot->dev, sizeof(*region),
@@ -1986,12 +1985,12 @@ static int ocelot_prepare_stats_regions(struct ocelot *ocelot)
 			if (!region)
 				return -ENOMEM;
 
-			region->offset = ocelot->stats_layout[i].offset;
+			region->base = ocelot->stats_layout[i].reg;
 			region->count = 1;
 			list_add_tail(&region->node, &ocelot->stats_regions);
 		}
 
-		last = ocelot->stats_layout[i].offset;
+		last = ocelot->stats_layout[i].reg;
 	}
 
 	list_for_each_entry(region, &ocelot->stats_regions, node) {
diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
index 9ff910560043..9c488953f541 100644
--- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c
+++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
@@ -99,375 +99,375 @@ static const struct reg_field ocelot_regfields[REGFIELD_MAX] = {
 static const struct ocelot_stat_layout ocelot_stats_layout[OCELOT_NUM_STATS] = {
 	[OCELOT_STAT_RX_OCTETS] = {
 		.name = "rx_octets",
-		.offset = 0x00,
+		.reg = SYS_COUNT_RX_OCTETS,
 	},
 	[OCELOT_STAT_RX_UNICAST] = {
 		.name = "rx_unicast",
-		.offset = 0x01,
+		.reg = SYS_COUNT_RX_UNICAST,
 	},
 	[OCELOT_STAT_RX_MULTICAST] = {
 		.name = "rx_multicast",
-		.offset = 0x02,
+		.reg = SYS_COUNT_RX_MULTICAST,
 	},
 	[OCELOT_STAT_RX_BROADCAST] = {
 		.name = "rx_broadcast",
-		.offset = 0x03,
+		.reg = SYS_COUNT_RX_BROADCAST,
 	},
 	[OCELOT_STAT_RX_SHORTS] = {
 		.name = "rx_shorts",
-		.offset = 0x04,
+		.reg = SYS_COUNT_RX_SHORTS,
 	},
 	[OCELOT_STAT_RX_FRAGMENTS] = {
 		.name = "rx_fragments",
-		.offset = 0x05,
+		.reg = SYS_COUNT_RX_FRAGMENTS,
 	},
 	[OCELOT_STAT_RX_JABBERS] = {
 		.name = "rx_jabbers",
-		.offset = 0x06,
+		.reg = SYS_COUNT_RX_JABBERS,
 	},
 	[OCELOT_STAT_RX_CRC_ALIGN_ERRS] = {
 		.name = "rx_crc_align_errs",
-		.offset = 0x07,
+		.reg = SYS_COUNT_RX_CRC_ALIGN_ERRS,
 	},
 	[OCELOT_STAT_RX_SYM_ERRS] = {
 		.name = "rx_sym_errs",
-		.offset = 0x08,
+		.reg = SYS_COUNT_RX_SYM_ERRS,
 	},
 	[OCELOT_STAT_RX_64] = {
 		.name = "rx_frames_below_65_octets",
-		.offset = 0x09,
+		.reg = SYS_COUNT_RX_64,
 	},
 	[OCELOT_STAT_RX_65_127] = {
 		.name = "rx_frames_65_to_127_octets",
-		.offset = 0x0A,
+		.reg = SYS_COUNT_RX_65_127,
 	},
 	[OCELOT_STAT_RX_128_255] = {
 		.name = "rx_frames_128_to_255_octets",
-		.offset = 0x0B,
+		.reg = SYS_COUNT_RX_128_255,
 	},
 	[OCELOT_STAT_RX_256_511] = {
 		.name = "rx_frames_256_to_511_octets",
-		.offset = 0x0C,
+		.reg = SYS_COUNT_RX_256_511,
 	},
 	[OCELOT_STAT_RX_512_1023] = {
 		.name = "rx_frames_512_to_1023_octets",
-		.offset = 0x0D,
+		.reg = SYS_COUNT_RX_512_1023,
 	},
 	[OCELOT_STAT_RX_1024_1526] = {
 		.name = "rx_frames_1024_to_1526_octets",
-		.offset = 0x0E,
+		.reg = SYS_COUNT_RX_1024_1526,
 	},
 	[OCELOT_STAT_RX_1527_MAX] = {
 		.name = "rx_frames_over_1526_octets",
-		.offset = 0x0F,
+		.reg = SYS_COUNT_RX_1527_MAX,
 	},
 	[OCELOT_STAT_RX_PAUSE] = {
 		.name = "rx_pause",
-		.offset = 0x10,
+		.reg = SYS_COUNT_RX_PAUSE,
 	},
 	[OCELOT_STAT_RX_CONTROL] = {
 		.name = "rx_control",
-		.offset = 0x11,
+		.reg = SYS_COUNT_RX_CONTROL,
 	},
 	[OCELOT_STAT_RX_LONGS] = {
 		.name = "rx_longs",
-		.offset = 0x12,
+		.reg = SYS_COUNT_RX_LONGS,
 	},
 	[OCELOT_STAT_RX_CLASSIFIED_DROPS] = {
 		.name = "rx_classified_drops",
-		.offset = 0x13,
+		.reg = SYS_COUNT_RX_CLASSIFIED_DROPS,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_0] = {
 		.name = "rx_red_prio_0",
-		.offset = 0x14,
+		.reg = SYS_COUNT_RX_RED_PRIO_0,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_1] = {
 		.name = "rx_red_prio_1",
-		.offset = 0x15,
+		.reg = SYS_COUNT_RX_RED_PRIO_1,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_2] = {
 		.name = "rx_red_prio_2",
-		.offset = 0x16,
+		.reg = SYS_COUNT_RX_RED_PRIO_2,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_3] = {
 		.name = "rx_red_prio_3",
-		.offset = 0x17,
+		.reg = SYS_COUNT_RX_RED_PRIO_3,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_4] = {
 		.name = "rx_red_prio_4",
-		.offset = 0x18,
+		.reg = SYS_COUNT_RX_RED_PRIO_4,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_5] = {
 		.name = "rx_red_prio_5",
-		.offset = 0x19,
+		.reg = SYS_COUNT_RX_RED_PRIO_5,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_6] = {
 		.name = "rx_red_prio_6",
-		.offset = 0x1A,
+		.reg = SYS_COUNT_RX_RED_PRIO_6,
 	},
 	[OCELOT_STAT_RX_RED_PRIO_7] = {
 		.name = "rx_red_prio_7",
-		.offset = 0x1B,
+		.reg = SYS_COUNT_RX_RED_PRIO_7,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_0] = {
 		.name = "rx_yellow_prio_0",
-		.offset = 0x1C,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_0,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_1] = {
 		.name = "rx_yellow_prio_1",
-		.offset = 0x1D,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_1,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_2] = {
 		.name = "rx_yellow_prio_2",
-		.offset = 0x1E,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_2,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_3] = {
 		.name = "rx_yellow_prio_3",
-		.offset = 0x1F,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_3,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_4] = {
 		.name = "rx_yellow_prio_4",
-		.offset = 0x20,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_4,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_5] = {
 		.name = "rx_yellow_prio_5",
-		.offset = 0x21,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_5,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_6] = {
 		.name = "rx_yellow_prio_6",
-		.offset = 0x22,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_6,
 	},
 	[OCELOT_STAT_RX_YELLOW_PRIO_7] = {
 		.name = "rx_yellow_prio_7",
-		.offset = 0x23,
+		.reg = SYS_COUNT_RX_YELLOW_PRIO_7,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_0] = {
 		.name = "rx_green_prio_0",
-		.offset = 0x24,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_0,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_1] = {
 		.name = "rx_green_prio_1",
-		.offset = 0x25,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_1,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_2] = {
 		.name = "rx_green_prio_2",
-		.offset = 0x26,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_2,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_3] = {
 		.name = "rx_green_prio_3",
-		.offset = 0x27,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_3,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_4] = {
 		.name = "rx_green_prio_4",
-		.offset = 0x28,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_4,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_5] = {
 		.name = "rx_green_prio_5",
-		.offset = 0x29,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_5,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_6] = {
 		.name = "rx_green_prio_6",
-		.offset = 0x2A,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_6,
 	},
 	[OCELOT_STAT_RX_GREEN_PRIO_7] = {
 		.name = "rx_green_prio_7",
-		.offset = 0x2B,
+		.reg = SYS_COUNT_RX_GREEN_PRIO_7,
 	},
 	[OCELOT_STAT_TX_OCTETS] = {
 		.name = "tx_octets",
-		.offset = 0x40,
+		.reg = SYS_COUNT_TX_OCTETS,
 	},
 	[OCELOT_STAT_TX_UNICAST] = {
 		.name = "tx_unicast",
-		.offset = 0x41,
+		.reg = SYS_COUNT_TX_UNICAST,
 	},
 	[OCELOT_STAT_TX_MULTICAST] = {
 		.name = "tx_multicast",
-		.offset = 0x42,
+		.reg = SYS_COUNT_TX_MULTICAST,
 	},
 	[OCELOT_STAT_TX_BROADCAST] = {
 		.name = "tx_broadcast",
-		.offset = 0x43,
+		.reg = SYS_COUNT_TX_BROADCAST,
 	},
 	[OCELOT_STAT_TX_COLLISION] = {
 		.name = "tx_collision",
-		.offset = 0x44,
+		.reg = SYS_COUNT_TX_COLLISION,
 	},
 	[OCELOT_STAT_TX_DROPS] = {
 		.name = "tx_drops",
-		.offset = 0x45,
+		.reg = SYS_COUNT_TX_DROPS,
 	},
 	[OCELOT_STAT_TX_PAUSE] = {
 		.name = "tx_pause",
-		.offset = 0x46,
+		.reg = SYS_COUNT_TX_PAUSE,
 	},
 	[OCELOT_STAT_TX_64] = {
 		.name = "tx_frames_below_65_octets",
-		.offset = 0x47,
+		.reg = SYS_COUNT_TX_64,
 	},
 	[OCELOT_STAT_TX_65_127] = {
 		.name = "tx_frames_65_to_127_octets",
-		.offset = 0x48,
+		.reg = SYS_COUNT_TX_65_127,
 	},
 	[OCELOT_STAT_TX_128_255] = {
 		.name = "tx_frames_128_255_octets",
-		.offset = 0x49,
+		.reg = SYS_COUNT_TX_128_255,
 	},
 	[OCELOT_STAT_TX_256_511] = {
 		.name = "tx_frames_256_511_octets",
-		.offset = 0x4A,
+		.reg = SYS_COUNT_TX_256_511,
 	},
 	[OCELOT_STAT_TX_512_1023] = {
 		.name = "tx_frames_512_1023_octets",
-		.offset = 0x4B,
+		.reg = SYS_COUNT_TX_512_1023,
 	},
 	[OCELOT_STAT_TX_1024_1526] = {
 		.name = "tx_frames_1024_1526_octets",
-		.offset = 0x4C,
+		.reg = SYS_COUNT_TX_1024_1526,
 	},
 	[OCELOT_STAT_TX_1527_MAX] = {
 		.name = "tx_frames_over_1526_octets",
-		.offset = 0x4D,
+		.reg = SYS_COUNT_TX_1527_MAX,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_0] = {
 		.name = "tx_yellow_prio_0",
-		.offset = 0x4E,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_0,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_1] = {
 		.name = "tx_yellow_prio_1",
-		.offset = 0x4F,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_1,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_2] = {
 		.name = "tx_yellow_prio_2",
-		.offset = 0x50,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_2,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_3] = {
 		.name = "tx_yellow_prio_3",
-		.offset = 0x51,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_3,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_4] = {
 		.name = "tx_yellow_prio_4",
-		.offset = 0x52,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_4,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_5] = {
 		.name = "tx_yellow_prio_5",
-		.offset = 0x53,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_5,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_6] = {
 		.name = "tx_yellow_prio_6",
-		.offset = 0x54,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_6,
 	},
 	[OCELOT_STAT_TX_YELLOW_PRIO_7] = {
 		.name = "tx_yellow_prio_7",
-		.offset = 0x55,
+		.reg = SYS_COUNT_TX_YELLOW_PRIO_7,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_0] = {
 		.name = "tx_green_prio_0",
-		.offset = 0x56,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_0,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_1] = {
 		.name = "tx_green_prio_1",
-		.offset = 0x57,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_1,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_2] = {
 		.name = "tx_green_prio_2",
-		.offset = 0x58,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_2,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_3] = {
 		.name = "tx_green_prio_3",
-		.offset = 0x59,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_3,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_4] = {
 		.name = "tx_green_prio_4",
-		.offset = 0x5A,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_4,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_5] = {
 		.name = "tx_green_prio_5",
-		.offset = 0x5B,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_5,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_6] = {
 		.name = "tx_green_prio_6",
-		.offset = 0x5C,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_6,
 	},
 	[OCELOT_STAT_TX_GREEN_PRIO_7] = {
 		.name = "tx_green_prio_7",
-		.offset = 0x5D,
+		.reg = SYS_COUNT_TX_GREEN_PRIO_7,
 	},
 	[OCELOT_STAT_TX_AGED] = {
 		.name = "tx_aged",
-		.offset = 0x5E,
+		.reg = SYS_COUNT_TX_AGING,
 	},
 	[OCELOT_STAT_DROP_LOCAL] = {
 		.name = "drop_local",
-		.offset = 0x80,
+		.reg = SYS_COUNT_DROP_LOCAL,
 	},
 	[OCELOT_STAT_DROP_TAIL] = {
 		.name = "drop_tail",
-		.offset = 0x81,
+		.reg = SYS_COUNT_DROP_TAIL,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_0] = {
 		.name = "drop_yellow_prio_0",
-		.offset = 0x82,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_0,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_1] = {
 		.name = "drop_yellow_prio_1",
-		.offset = 0x83,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_1,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_2] = {
 		.name = "drop_yellow_prio_2",
-		.offset = 0x84,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_2,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_3] = {
 		.name = "drop_yellow_prio_3",
-		.offset = 0x85,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_3,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_4] = {
 		.name = "drop_yellow_prio_4",
-		.offset = 0x86,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_4,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_5] = {
 		.name = "drop_yellow_prio_5",
-		.offset = 0x87,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_5,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_6] = {
 		.name = "drop_yellow_prio_6",
-		.offset = 0x88,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_6,
 	},
 	[OCELOT_STAT_DROP_YELLOW_PRIO_7] = {
 		.name = "drop_yellow_prio_7",
-		.offset = 0x89,
+		.reg = SYS_COUNT_DROP_YELLOW_PRIO_7,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_0] = {
 		.name = "drop_green_prio_0",
-		.offset = 0x8A,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_0,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_1] = {
 		.name = "drop_green_prio_1",
-		.offset = 0x8B,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_1,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_2] = {
 		.name = "drop_green_prio_2",
-		.offset = 0x8C,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_2,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_3] = {
 		.name = "drop_green_prio_3",
-		.offset = 0x8D,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_3,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_4] = {
 		.name = "drop_green_prio_4",
-		.offset = 0x8E,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_4,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_5] = {
 		.name = "drop_green_prio_5",
-		.offset = 0x8F,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_5,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_6] = {
 		.name = "drop_green_prio_6",
-		.offset = 0x90,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_6,
 	},
 	[OCELOT_STAT_DROP_GREEN_PRIO_7] = {
 		.name = "drop_green_prio_7",
-		.offset = 0x91,
+		.reg = SYS_COUNT_DROP_GREEN_PRIO_7,
 	},
 };
 
diff --git a/drivers/net/ethernet/mscc/vsc7514_regs.c b/drivers/net/ethernet/mscc/vsc7514_regs.c
index 8ff935f7f150..9cf82ecf191c 100644
--- a/drivers/net/ethernet/mscc/vsc7514_regs.c
+++ b/drivers/net/ethernet/mscc/vsc7514_regs.c
@@ -188,6 +188,30 @@ const u32 vsc7514_sys_regmap[] = {
 	REG(SYS_COUNT_RX_CONTROL,			0x000044),
 	REG(SYS_COUNT_RX_LONGS,				0x000048),
 	REG(SYS_COUNT_RX_CLASSIFIED_DROPS,		0x00004c),
+	REG(SYS_COUNT_RX_RED_PRIO_0,			0x000050),
+	REG(SYS_COUNT_RX_RED_PRIO_1,			0x000054),
+	REG(SYS_COUNT_RX_RED_PRIO_2,			0x000058),
+	REG(SYS_COUNT_RX_RED_PRIO_3,			0x00005c),
+	REG(SYS_COUNT_RX_RED_PRIO_4,			0x000060),
+	REG(SYS_COUNT_RX_RED_PRIO_5,			0x000064),
+	REG(SYS_COUNT_RX_RED_PRIO_6,			0x000068),
+	REG(SYS_COUNT_RX_RED_PRIO_7,			0x00006c),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_0,			0x000070),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_1,			0x000074),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_2,			0x000078),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_3,			0x00007c),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_4,			0x000080),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_5,			0x000084),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_6,			0x000088),
+	REG(SYS_COUNT_RX_YELLOW_PRIO_7,			0x00008c),
+	REG(SYS_COUNT_RX_GREEN_PRIO_0,			0x000090),
+	REG(SYS_COUNT_RX_GREEN_PRIO_1,			0x000094),
+	REG(SYS_COUNT_RX_GREEN_PRIO_2,			0x000098),
+	REG(SYS_COUNT_RX_GREEN_PRIO_3,			0x00009c),
+	REG(SYS_COUNT_RX_GREEN_PRIO_4,			0x0000a0),
+	REG(SYS_COUNT_RX_GREEN_PRIO_5,			0x0000a4),
+	REG(SYS_COUNT_RX_GREEN_PRIO_6,			0x0000a8),
+	REG(SYS_COUNT_RX_GREEN_PRIO_7,			0x0000ac),
 	REG(SYS_COUNT_TX_OCTETS,			0x000100),
 	REG(SYS_COUNT_TX_UNICAST,			0x000104),
 	REG(SYS_COUNT_TX_MULTICAST,			0x000108),
@@ -202,7 +226,41 @@ const u32 vsc7514_sys_regmap[] = {
 	REG(SYS_COUNT_TX_512_1023,			0x00012c),
 	REG(SYS_COUNT_TX_1024_1526,			0x000130),
 	REG(SYS_COUNT_TX_1527_MAX,			0x000134),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_0,			0x000138),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_1,			0x00013c),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_2,			0x000140),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_3,			0x000144),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_4,			0x000148),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_5,			0x00014c),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_6,			0x000150),
+	REG(SYS_COUNT_TX_YELLOW_PRIO_7,			0x000154),
+	REG(SYS_COUNT_TX_GREEN_PRIO_0,			0x000158),
+	REG(SYS_COUNT_TX_GREEN_PRIO_1,			0x00015c),
+	REG(SYS_COUNT_TX_GREEN_PRIO_2,			0x000160),
+	REG(SYS_COUNT_TX_GREEN_PRIO_3,			0x000164),
+	REG(SYS_COUNT_TX_GREEN_PRIO_4,			0x000168),
+	REG(SYS_COUNT_TX_GREEN_PRIO_5,			0x00016c),
+	REG(SYS_COUNT_TX_GREEN_PRIO_6,			0x000170),
+	REG(SYS_COUNT_TX_GREEN_PRIO_7,			0x000174),
 	REG(SYS_COUNT_TX_AGING,				0x000178),
+	REG(SYS_COUNT_DROP_LOCAL,			0x000200),
+	REG(SYS_COUNT_DROP_TAIL,			0x000204),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_0,		0x000208),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_1,		0x00020c),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_2,		0x000210),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_3,		0x000214),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_4,		0x000218),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_5,		0x00021c),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_6,		0x000220),
+	REG(SYS_COUNT_DROP_YELLOW_PRIO_7,		0x000214),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_0,		0x000218),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_1,		0x00021c),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_2,		0x000220),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_3,		0x000224),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_4,		0x000228),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_5,		0x00022c),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_6,		0x000230),
+	REG(SYS_COUNT_DROP_GREEN_PRIO_7,		0x000234),
 	REG(SYS_RESET_CFG,				0x000508),
 	REG(SYS_CMID,					0x00050c),
 	REG(SYS_VLAN_ETYPE_CFG,				0x000510),
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 2428bc64cb1d..2edea901bbd5 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -338,6 +338,30 @@ enum ocelot_reg {
 	SYS_COUNT_RX_CONTROL,
 	SYS_COUNT_RX_LONGS,
 	SYS_COUNT_RX_CLASSIFIED_DROPS,
+	SYS_COUNT_RX_RED_PRIO_0,
+	SYS_COUNT_RX_RED_PRIO_1,
+	SYS_COUNT_RX_RED_PRIO_2,
+	SYS_COUNT_RX_RED_PRIO_3,
+	SYS_COUNT_RX_RED_PRIO_4,
+	SYS_COUNT_RX_RED_PRIO_5,
+	SYS_COUNT_RX_RED_PRIO_6,
+	SYS_COUNT_RX_RED_PRIO_7,
+	SYS_COUNT_RX_YELLOW_PRIO_0,
+	SYS_COUNT_RX_YELLOW_PRIO_1,
+	SYS_COUNT_RX_YELLOW_PRIO_2,
+	SYS_COUNT_RX_YELLOW_PRIO_3,
+	SYS_COUNT_RX_YELLOW_PRIO_4,
+	SYS_COUNT_RX_YELLOW_PRIO_5,
+	SYS_COUNT_RX_YELLOW_PRIO_6,
+	SYS_COUNT_RX_YELLOW_PRIO_7,
+	SYS_COUNT_RX_GREEN_PRIO_0,
+	SYS_COUNT_RX_GREEN_PRIO_1,
+	SYS_COUNT_RX_GREEN_PRIO_2,
+	SYS_COUNT_RX_GREEN_PRIO_3,
+	SYS_COUNT_RX_GREEN_PRIO_4,
+	SYS_COUNT_RX_GREEN_PRIO_5,
+	SYS_COUNT_RX_GREEN_PRIO_6,
+	SYS_COUNT_RX_GREEN_PRIO_7,
 	SYS_COUNT_TX_OCTETS,
 	SYS_COUNT_TX_UNICAST,
 	SYS_COUNT_TX_MULTICAST,
@@ -352,7 +376,41 @@ enum ocelot_reg {
 	SYS_COUNT_TX_512_1023,
 	SYS_COUNT_TX_1024_1526,
 	SYS_COUNT_TX_1527_MAX,
+	SYS_COUNT_TX_YELLOW_PRIO_0,
+	SYS_COUNT_TX_YELLOW_PRIO_1,
+	SYS_COUNT_TX_YELLOW_PRIO_2,
+	SYS_COUNT_TX_YELLOW_PRIO_3,
+	SYS_COUNT_TX_YELLOW_PRIO_4,
+	SYS_COUNT_TX_YELLOW_PRIO_5,
+	SYS_COUNT_TX_YELLOW_PRIO_6,
+	SYS_COUNT_TX_YELLOW_PRIO_7,
+	SYS_COUNT_TX_GREEN_PRIO_0,
+	SYS_COUNT_TX_GREEN_PRIO_1,
+	SYS_COUNT_TX_GREEN_PRIO_2,
+	SYS_COUNT_TX_GREEN_PRIO_3,
+	SYS_COUNT_TX_GREEN_PRIO_4,
+	SYS_COUNT_TX_GREEN_PRIO_5,
+	SYS_COUNT_TX_GREEN_PRIO_6,
+	SYS_COUNT_TX_GREEN_PRIO_7,
 	SYS_COUNT_TX_AGING,
+	SYS_COUNT_DROP_LOCAL,
+	SYS_COUNT_DROP_TAIL,
+	SYS_COUNT_DROP_YELLOW_PRIO_0,
+	SYS_COUNT_DROP_YELLOW_PRIO_1,
+	SYS_COUNT_DROP_YELLOW_PRIO_2,
+	SYS_COUNT_DROP_YELLOW_PRIO_3,
+	SYS_COUNT_DROP_YELLOW_PRIO_4,
+	SYS_COUNT_DROP_YELLOW_PRIO_5,
+	SYS_COUNT_DROP_YELLOW_PRIO_6,
+	SYS_COUNT_DROP_YELLOW_PRIO_7,
+	SYS_COUNT_DROP_GREEN_PRIO_0,
+	SYS_COUNT_DROP_GREEN_PRIO_1,
+	SYS_COUNT_DROP_GREEN_PRIO_2,
+	SYS_COUNT_DROP_GREEN_PRIO_3,
+	SYS_COUNT_DROP_GREEN_PRIO_4,
+	SYS_COUNT_DROP_GREEN_PRIO_5,
+	SYS_COUNT_DROP_GREEN_PRIO_6,
+	SYS_COUNT_DROP_GREEN_PRIO_7,
 	SYS_RESET_CFG,
 	SYS_SR_ETYPE_CFG,
 	SYS_VLAN_ETYPE_CFG,
@@ -633,13 +691,13 @@ enum ocelot_stat {
 };
 
 struct ocelot_stat_layout {
-	u32 offset;
+	u32 reg;
 	char name[ETH_GSTRING_LEN];
 };
 
 struct ocelot_stats_region {
 	struct list_head node;
-	u32 offset;
+	u32 base;
 	int count;
 	u32 *buf;
 };
@@ -877,8 +935,8 @@ struct ocelot_policer {
 	u32 burst; /* bytes */
 };
 
-#define ocelot_bulk_read_rix(ocelot, reg, ri, buf, count) \
-	__ocelot_bulk_read_ix(ocelot, reg, reg##_RSZ * (ri), buf, count)
+#define ocelot_bulk_read(ocelot, reg, buf, count) \
+	__ocelot_bulk_read_ix(ocelot, reg, 0, buf, count)
 
 #define ocelot_read_ix(ocelot, reg, gi, ri) \
 	__ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri))
-- 
cgit 


From a8239f0342bae5a51acca967ba95b9a8ad56dd62 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Thu, 18 Aug 2022 14:35:55 +0800
Subject: blk-mq: remove unused function blk_mq_queue_stopped()

blk_mq_queue_stopped() doesn't have any caller, which was found by
code coverage test, thus remove it.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Link: https://lore.kernel.org/r/20220818063555.3741222-1-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 20 --------------------
 include/linux/blk-mq.h |  1 -
 2 files changed, 21 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5ee62b95f3e5..5568c7d09114 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2229,26 +2229,6 @@ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
 }
 EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
 
-/**
- * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
- * @q: request queue.
- *
- * The caller is responsible for serializing this function against
- * blk_mq_{start,stop}_hw_queue().
- */
-bool blk_mq_queue_stopped(struct request_queue *q)
-{
-	struct blk_mq_hw_ctx *hctx;
-	unsigned long i;
-
-	queue_for_each_hw_ctx(q, hctx, i)
-		if (blk_mq_hctx_stopped(hctx))
-			return true;
-
-	return false;
-}
-EXPORT_SYMBOL(blk_mq_queue_stopped);
-
 /*
  * This function is often used for pausing .queue_rq() by driver when
  * there isn't enough resource or some conditions aren't satisfied, and
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index effee1dc715a..92294a5fb083 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -857,7 +857,6 @@ void blk_mq_kick_requeue_list(struct request_queue *q);
 void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
 void blk_mq_complete_request(struct request *rq);
 bool blk_mq_complete_request_remote(struct request *rq);
-bool blk_mq_queue_stopped(struct request_queue *q);
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_stop_hw_queues(struct request_queue *q);
-- 
cgit 


From bdd1c37a315bc50ab14066c4852bc8dcf070451e Mon Sep 17 00:00:00 2001
From: Chao Peng <chao.p.peng@linux.intel.com>
Date: Tue, 16 Aug 2022 20:53:21 +0800
Subject: KVM: Rename KVM_PRIVATE_MEM_SLOTS to KVM_INTERNAL_MEM_SLOTS

KVM_INTERNAL_MEM_SLOTS better reflects the fact those slots are KVM
internally used (invisible to userspace) and avoids confusion to future
private slots that can have different meaning.

Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
Message-Id: <20220816125322.1110439-2-chao.p.peng@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 +-
 include/linux/kvm_host.h        | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5ffa578cafe1..2c96c43c313a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -53,7 +53,7 @@
 #define KVM_MAX_VCPU_IDS (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO)
 
 /* memory slots that are not exposed to userspace */
-#define KVM_PRIVATE_MEM_SLOTS 3
+#define KVM_INTERNAL_MEM_SLOTS 3
 
 #define KVM_HALT_POLL_NS_DEFAULT 200000
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1c480b1821e1..fd5c3b4715f8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -656,12 +656,12 @@ struct kvm_irq_routing_table {
 };
 #endif
 
-#ifndef KVM_PRIVATE_MEM_SLOTS
-#define KVM_PRIVATE_MEM_SLOTS 0
+#ifndef KVM_INTERNAL_MEM_SLOTS
+#define KVM_INTERNAL_MEM_SLOTS 0
 #endif
 
 #define KVM_MEM_SLOTS_NUM SHRT_MAX
-#define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_PRIVATE_MEM_SLOTS)
+#define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_INTERNAL_MEM_SLOTS)
 
 #ifndef __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
 static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
-- 
cgit 


From 20ec3ebd707c77fb9b11b37193449193d4649f33 Mon Sep 17 00:00:00 2001
From: Chao Peng <chao.p.peng@linux.intel.com>
Date: Tue, 16 Aug 2022 20:53:22 +0800
Subject: KVM: Rename mmu_notifier_* to mmu_invalidate_*

The motivation of this renaming is to make these variables and related
helper functions less mmu_notifier bound and can also be used for non
mmu_notifier based page invalidation. mmu_invalidate_* was chosen to
better describe the purpose of 'invalidating' a page that those
variables are used for.

  - mmu_notifier_seq/range_start/range_end are renamed to
    mmu_invalidate_seq/range_start/range_end.

  - mmu_notifier_retry{_hva} helper functions are renamed to
    mmu_invalidate_retry{_hva}.

  - mmu_notifier_count is renamed to mmu_invalidate_in_progress to
    avoid confusion with mn_active_invalidate_count.

  - While here, also update kvm_inc/dec_notifier_count() to
    kvm_mmu_invalidate_begin/end() to match the change for
    mmu_notifier_count.

No functional change intended.

Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
Message-Id: <20220816125322.1110439-3-chao.p.peng@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm64/kvm/mmu.c                     |  8 ++---
 arch/mips/kvm/mmu.c                      | 12 +++----
 arch/powerpc/include/asm/kvm_book3s_64.h |  2 +-
 arch/powerpc/kvm/book3s_64_mmu_host.c    |  4 +--
 arch/powerpc/kvm/book3s_64_mmu_hv.c      |  4 +--
 arch/powerpc/kvm/book3s_64_mmu_radix.c   |  6 ++--
 arch/powerpc/kvm/book3s_hv_nested.c      |  2 +-
 arch/powerpc/kvm/book3s_hv_rm_mmu.c      |  8 ++---
 arch/powerpc/kvm/e500_mmu_host.c         |  4 +--
 arch/riscv/kvm/mmu.c                     |  4 +--
 arch/x86/kvm/mmu/mmu.c                   | 14 ++++----
 arch/x86/kvm/mmu/paging_tmpl.h           |  4 +--
 include/linux/kvm_host.h                 | 60 +++++++++++++++++---------------
 virt/kvm/kvm_main.c                      | 52 ++++++++++++++-------------
 virt/kvm/pfncache.c                      | 17 ++++-----
 15 files changed, 103 insertions(+), 98 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 87f1cd0df36e..c9a13e487187 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -993,7 +993,7 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
 		 * THP doesn't start to split while we are adjusting the
 		 * refcounts.
 		 *
-		 * We are sure this doesn't happen, because mmu_notifier_retry
+		 * We are sure this doesn't happen, because mmu_invalidate_retry
 		 * was successful and we are holding the mmu_lock, so if this
 		 * THP is trying to split, it will be blocked in the mmu
 		 * notifier before touching any of the pages, specifically
@@ -1188,9 +1188,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			return ret;
 	}
 
-	mmu_seq = vcpu->kvm->mmu_notifier_seq;
+	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
 	/*
-	 * Ensure the read of mmu_notifier_seq happens before we call
+	 * Ensure the read of mmu_invalidate_seq happens before we call
 	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
 	 * the page we just got a reference to gets unmapped before we have a
 	 * chance to grab the mmu_lock, which ensure that if the page gets
@@ -1246,7 +1246,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	else
 		write_lock(&kvm->mmu_lock);
 	pgt = vcpu->arch.hw_mmu->pgt;
-	if (mmu_notifier_retry(kvm, mmu_seq))
+	if (mmu_invalidate_retry(kvm, mmu_seq))
 		goto out_unlock;
 
 	/*
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index db17e870bdff..74cd64a24d05 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -615,17 +615,17 @@ retry:
 	 * Used to check for invalidations in progress, of the pfn that is
 	 * returned by pfn_to_pfn_prot below.
 	 */
-	mmu_seq = kvm->mmu_notifier_seq;
+	mmu_seq = kvm->mmu_invalidate_seq;
 	/*
-	 * Ensure the read of mmu_notifier_seq isn't reordered with PTE reads in
-	 * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't
+	 * Ensure the read of mmu_invalidate_seq isn't reordered with PTE reads
+	 * in gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't
 	 * risk the page we get a reference to getting unmapped before we have a
-	 * chance to grab the mmu_lock without mmu_notifier_retry() noticing.
+	 * chance to grab the mmu_lock without mmu_invalidate_retry() noticing.
 	 *
 	 * This smp_rmb() pairs with the effective smp_wmb() of the combination
 	 * of the pte_unmap_unlock() after the PTE is zapped, and the
 	 * spin_lock() in kvm_mmu_notifier_invalidate_<page|range_end>() before
-	 * mmu_notifier_seq is incremented.
+	 * mmu_invalidate_seq is incremented.
 	 */
 	smp_rmb();
 
@@ -638,7 +638,7 @@ retry:
 
 	spin_lock(&kvm->mmu_lock);
 	/* Check if an invalidation has taken place since we got pfn */
-	if (mmu_notifier_retry(kvm, mmu_seq)) {
+	if (mmu_invalidate_retry(kvm, mmu_seq)) {
 		/*
 		 * This can happen when mappings are changed asynchronously, but
 		 * also synchronously if a COW is triggered by
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 4def2bd17b9b..d49065af08e9 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -666,7 +666,7 @@ static inline pte_t *find_kvm_host_pte(struct kvm *kvm, unsigned long mmu_seq,
 	VM_WARN(!spin_is_locked(&kvm->mmu_lock),
 		"%s called with kvm mmu_lock not held \n", __func__);
 
-	if (mmu_notifier_retry(kvm, mmu_seq))
+	if (mmu_invalidate_retry(kvm, mmu_seq))
 		return NULL;
 
 	pte = __find_linux_pte(kvm->mm->pgd, ea, NULL, hshift);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 1ae09992c9ea..bc6a381b5346 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -90,7 +90,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte,
 	unsigned long pfn;
 
 	/* used to check for invalidations in progress */
-	mmu_seq = kvm->mmu_notifier_seq;
+	mmu_seq = kvm->mmu_invalidate_seq;
 	smp_rmb();
 
 	/* Get host physical address for gpa */
@@ -151,7 +151,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte,
 	cpte = kvmppc_mmu_hpte_cache_next(vcpu);
 
 	spin_lock(&kvm->mmu_lock);
-	if (!cpte || mmu_notifier_retry(kvm, mmu_seq)) {
+	if (!cpte || mmu_invalidate_retry(kvm, mmu_seq)) {
 		r = -EAGAIN;
 		goto out_unlock;
 	}
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 514fd45c1994..e9744b41a226 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -578,7 +578,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
 		return -EFAULT;
 
 	/* used to check for invalidations in progress */
-	mmu_seq = kvm->mmu_notifier_seq;
+	mmu_seq = kvm->mmu_invalidate_seq;
 	smp_rmb();
 
 	ret = -EFAULT;
@@ -693,7 +693,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
 
 	/* Check if we might have been invalidated; let the guest retry if so */
 	ret = RESUME_GUEST;
-	if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
+	if (mmu_invalidate_retry(vcpu->kvm, mmu_seq)) {
 		unlock_rmap(rmap);
 		goto out_unlock;
 	}
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 9d4b3feda3b6..5d5e12f3bf86 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -640,7 +640,7 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 	/* Check if we might have been invalidated; let the guest retry if so */
 	spin_lock(&kvm->mmu_lock);
 	ret = -EAGAIN;
-	if (mmu_notifier_retry(kvm, mmu_seq))
+	if (mmu_invalidate_retry(kvm, mmu_seq))
 		goto out_unlock;
 
 	/* Now traverse again under the lock and change the tree */
@@ -830,7 +830,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
 	bool large_enable;
 
 	/* used to check for invalidations in progress */
-	mmu_seq = kvm->mmu_notifier_seq;
+	mmu_seq = kvm->mmu_invalidate_seq;
 	smp_rmb();
 
 	/*
@@ -1191,7 +1191,7 @@ void kvmppc_radix_flush_memslot(struct kvm *kvm,
 	 * Increase the mmu notifier sequence number to prevent any page
 	 * fault that read the memslot earlier from writing a PTE.
 	 */
-	kvm->mmu_notifier_seq++;
+	kvm->mmu_invalidate_seq++;
 	spin_unlock(&kvm->mmu_lock);
 }
 
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index be8249cc6107..5a64a1341e6f 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -1580,7 +1580,7 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
 	/* 2. Find the host pte for this L1 guest real address */
 
 	/* Used to check for invalidations in progress */
-	mmu_seq = kvm->mmu_notifier_seq;
+	mmu_seq = kvm->mmu_invalidate_seq;
 	smp_rmb();
 
 	/* See if can find translation in our partition scoped tables for L1 */
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 2257fb18cb72..5a05953ae13f 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -219,7 +219,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 	g_ptel = ptel;
 
 	/* used later to detect if we might have been invalidated */
-	mmu_seq = kvm->mmu_notifier_seq;
+	mmu_seq = kvm->mmu_invalidate_seq;
 	smp_rmb();
 
 	/* Find the memslot (if any) for this address */
@@ -366,7 +366,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 			rmap = real_vmalloc_addr(rmap);
 		lock_rmap(rmap);
 		/* Check for pending invalidations under the rmap chain lock */
-		if (mmu_notifier_retry(kvm, mmu_seq)) {
+		if (mmu_invalidate_retry(kvm, mmu_seq)) {
 			/* inval in progress, write a non-present HPTE */
 			pteh |= HPTE_V_ABSENT;
 			pteh &= ~HPTE_V_VALID;
@@ -932,7 +932,7 @@ static long kvmppc_do_h_page_init_zero(struct kvm_vcpu *vcpu,
 	int i;
 
 	/* Used later to detect if we might have been invalidated */
-	mmu_seq = kvm->mmu_notifier_seq;
+	mmu_seq = kvm->mmu_invalidate_seq;
 	smp_rmb();
 
 	arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock);
@@ -960,7 +960,7 @@ static long kvmppc_do_h_page_init_copy(struct kvm_vcpu *vcpu,
 	long ret = H_SUCCESS;
 
 	/* Used later to detect if we might have been invalidated */
-	mmu_seq = kvm->mmu_notifier_seq;
+	mmu_seq = kvm->mmu_invalidate_seq;
 	smp_rmb();
 
 	arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock);
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 7f16afc331ef..05668e964140 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -339,7 +339,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	unsigned long flags;
 
 	/* used to check for invalidations in progress */
-	mmu_seq = kvm->mmu_notifier_seq;
+	mmu_seq = kvm->mmu_invalidate_seq;
 	smp_rmb();
 
 	/*
@@ -460,7 +460,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	}
 
 	spin_lock(&kvm->mmu_lock);
-	if (mmu_notifier_retry(kvm, mmu_seq)) {
+	if (mmu_invalidate_retry(kvm, mmu_seq)) {
 		ret = -EAGAIN;
 		goto out;
 	}
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index 3a35b2d95697..3620ecac2fa1 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -666,7 +666,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
 		return ret;
 	}
 
-	mmu_seq = kvm->mmu_notifier_seq;
+	mmu_seq = kvm->mmu_invalidate_seq;
 
 	hfn = gfn_to_pfn_prot(kvm, gfn, is_write, &writable);
 	if (hfn == KVM_PFN_ERR_HWPOISON) {
@@ -686,7 +686,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
 
 	spin_lock(&kvm->mmu_lock);
 
-	if (mmu_notifier_retry(kvm, mmu_seq))
+	if (mmu_invalidate_retry(kvm, mmu_seq))
 		goto out_unlock;
 
 	if (writable) {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index eccddb136954..126fa9aec64c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2914,7 +2914,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 	 * If addresses are being invalidated, skip prefetching to avoid
 	 * accidentally prefetching those addresses.
 	 */
-	if (unlikely(vcpu->kvm->mmu_notifier_count))
+	if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))
 		return;
 
 	__direct_pte_prefetch(vcpu, sp, sptep);
@@ -2928,7 +2928,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
  *
  * There are several ways to safely use this helper:
  *
- * - Check mmu_notifier_retry_hva() after grabbing the mapping level, before
+ * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before
  *   consuming it.  In this case, mmu_lock doesn't need to be held during the
  *   lookup, but it does need to be held while checking the MMU notifier.
  *
@@ -3056,7 +3056,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 		return;
 
 	/*
-	 * mmu_notifier_retry() was successful and mmu_lock is held, so
+	 * mmu_invalidate_retry() was successful and mmu_lock is held, so
 	 * the pmd can't be split from under us.
 	 */
 	fault->goal_level = fault->req_level;
@@ -4203,7 +4203,7 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
 		return true;
 
 	return fault->slot &&
-	       mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
+	       mmu_invalidate_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
 }
 
 static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
@@ -4227,7 +4227,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	if (r)
 		return r;
 
-	mmu_seq = vcpu->kvm->mmu_notifier_seq;
+	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
 	smp_rmb();
 
 	r = kvm_faultin_pfn(vcpu, fault);
@@ -6055,7 +6055,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 
 	write_lock(&kvm->mmu_lock);
 
-	kvm_inc_notifier_count(kvm, gfn_start, gfn_end);
+	kvm_mmu_invalidate_begin(kvm, gfn_start, gfn_end);
 
 	flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
 
@@ -6069,7 +6069,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 		kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
 						   gfn_end - gfn_start);
 
-	kvm_dec_notifier_count(kvm, gfn_start, gfn_end);
+	kvm_mmu_invalidate_end(kvm, gfn_start, gfn_end);
 
 	write_unlock(&kvm->mmu_lock);
 }
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index f5958071220c..39e0205e7300 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -589,7 +589,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 	 * If addresses are being invalidated, skip prefetching to avoid
 	 * accidentally prefetching those addresses.
 	 */
-	if (unlikely(vcpu->kvm->mmu_notifier_count))
+	if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))
 		return;
 
 	if (sp->role.direct)
@@ -838,7 +838,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	else
 		fault->max_level = walker.level;
 
-	mmu_seq = vcpu->kvm->mmu_notifier_seq;
+	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
 	smp_rmb();
 
 	r = kvm_faultin_pfn(vcpu, fault);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index fd5c3b4715f8..f4519d3689e1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -765,10 +765,10 @@ struct kvm {
 
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 	struct mmu_notifier mmu_notifier;
-	unsigned long mmu_notifier_seq;
-	long mmu_notifier_count;
-	unsigned long mmu_notifier_range_start;
-	unsigned long mmu_notifier_range_end;
+	unsigned long mmu_invalidate_seq;
+	long mmu_invalidate_in_progress;
+	unsigned long mmu_invalidate_range_start;
+	unsigned long mmu_invalidate_range_end;
 #endif
 	struct list_head devices;
 	u64 manual_dirty_log_protect;
@@ -1357,10 +1357,10 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc);
 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
 #endif
 
-void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
-				   unsigned long end);
-void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
-				   unsigned long end);
+void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
+			      unsigned long end);
+void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start,
+			    unsigned long end);
 
 long kvm_arch_dev_ioctl(struct file *filp,
 			unsigned int ioctl, unsigned long arg);
@@ -1907,42 +1907,44 @@ extern const struct kvm_stats_header kvm_vcpu_stats_header;
 extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[];
 
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
-static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
+static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq)
 {
-	if (unlikely(kvm->mmu_notifier_count))
+	if (unlikely(kvm->mmu_invalidate_in_progress))
 		return 1;
 	/*
-	 * Ensure the read of mmu_notifier_count happens before the read
-	 * of mmu_notifier_seq.  This interacts with the smp_wmb() in
-	 * mmu_notifier_invalidate_range_end to make sure that the caller
-	 * either sees the old (non-zero) value of mmu_notifier_count or
-	 * the new (incremented) value of mmu_notifier_seq.
-	 * PowerPC Book3s HV KVM calls this under a per-page lock
-	 * rather than under kvm->mmu_lock, for scalability, so
-	 * can't rely on kvm->mmu_lock to keep things ordered.
+	 * Ensure the read of mmu_invalidate_in_progress happens before
+	 * the read of mmu_invalidate_seq.  This interacts with the
+	 * smp_wmb() in mmu_notifier_invalidate_range_end to make sure
+	 * that the caller either sees the old (non-zero) value of
+	 * mmu_invalidate_in_progress or the new (incremented) value of
+	 * mmu_invalidate_seq.
+	 *
+	 * PowerPC Book3s HV KVM calls this under a per-page lock rather
+	 * than under kvm->mmu_lock, for scalability, so can't rely on
+	 * kvm->mmu_lock to keep things ordered.
 	 */
 	smp_rmb();
-	if (kvm->mmu_notifier_seq != mmu_seq)
+	if (kvm->mmu_invalidate_seq != mmu_seq)
 		return 1;
 	return 0;
 }
 
-static inline int mmu_notifier_retry_hva(struct kvm *kvm,
-					 unsigned long mmu_seq,
-					 unsigned long hva)
+static inline int mmu_invalidate_retry_hva(struct kvm *kvm,
+					   unsigned long mmu_seq,
+					   unsigned long hva)
 {
 	lockdep_assert_held(&kvm->mmu_lock);
 	/*
-	 * If mmu_notifier_count is non-zero, then the range maintained by
-	 * kvm_mmu_notifier_invalidate_range_start contains all addresses that
-	 * might be being invalidated. Note that it may include some false
+	 * If mmu_invalidate_in_progress is non-zero, then the range maintained
+	 * by kvm_mmu_notifier_invalidate_range_start contains all addresses
+	 * that might be being invalidated. Note that it may include some false
 	 * positives, due to shortcuts when handing concurrent invalidations.
 	 */
-	if (unlikely(kvm->mmu_notifier_count) &&
-	    hva >= kvm->mmu_notifier_range_start &&
-	    hva < kvm->mmu_notifier_range_end)
+	if (unlikely(kvm->mmu_invalidate_in_progress) &&
+	    hva >= kvm->mmu_invalidate_range_start &&
+	    hva < kvm->mmu_invalidate_range_end)
 		return 1;
-	if (kvm->mmu_notifier_seq != mmu_seq)
+	if (kvm->mmu_invalidate_seq != mmu_seq)
 		return 1;
 	return 0;
 }
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 44b92d773156..5142c054b03c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -702,30 +702,31 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 
 	/*
 	 * .change_pte() must be surrounded by .invalidate_range_{start,end}().
-	 * If mmu_notifier_count is zero, then no in-progress invalidations,
-	 * including this one, found a relevant memslot at start(); rechecking
-	 * memslots here is unnecessary.  Note, a false positive (count elevated
-	 * by a different invalidation) is sub-optimal but functionally ok.
+	 * If mmu_invalidate_in_progress is zero, then no in-progress
+	 * invalidations, including this one, found a relevant memslot at
+	 * start(); rechecking memslots here is unnecessary.  Note, a false
+	 * positive (count elevated by a different invalidation) is sub-optimal
+	 * but functionally ok.
 	 */
 	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
-	if (!READ_ONCE(kvm->mmu_notifier_count))
+	if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
 		return;
 
 	kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
 }
 
-void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
-				   unsigned long end)
+void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
+			      unsigned long end)
 {
 	/*
 	 * The count increase must become visible at unlock time as no
 	 * spte can be established without taking the mmu_lock and
 	 * count is also read inside the mmu_lock critical section.
 	 */
-	kvm->mmu_notifier_count++;
-	if (likely(kvm->mmu_notifier_count == 1)) {
-		kvm->mmu_notifier_range_start = start;
-		kvm->mmu_notifier_range_end = end;
+	kvm->mmu_invalidate_in_progress++;
+	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
+		kvm->mmu_invalidate_range_start = start;
+		kvm->mmu_invalidate_range_end = end;
 	} else {
 		/*
 		 * Fully tracking multiple concurrent ranges has diminishing
@@ -736,10 +737,10 @@ void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
 		 * accumulate and persist until all outstanding invalidates
 		 * complete.
 		 */
-		kvm->mmu_notifier_range_start =
-			min(kvm->mmu_notifier_range_start, start);
-		kvm->mmu_notifier_range_end =
-			max(kvm->mmu_notifier_range_end, end);
+		kvm->mmu_invalidate_range_start =
+			min(kvm->mmu_invalidate_range_start, start);
+		kvm->mmu_invalidate_range_end =
+			max(kvm->mmu_invalidate_range_end, end);
 	}
 }
 
@@ -752,7 +753,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 		.end		= range->end,
 		.pte		= __pte(0),
 		.handler	= kvm_unmap_gfn_range,
-		.on_lock	= kvm_inc_notifier_count,
+		.on_lock	= kvm_mmu_invalidate_begin,
 		.on_unlock	= kvm_arch_guest_memory_reclaimed,
 		.flush_on_ret	= true,
 		.may_block	= mmu_notifier_range_blockable(range),
@@ -763,7 +764,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	/*
 	 * Prevent memslot modification between range_start() and range_end()
 	 * so that conditionally locking provides the same result in both
-	 * functions.  Without that guarantee, the mmu_notifier_count
+	 * functions.  Without that guarantee, the mmu_invalidate_in_progress
 	 * adjustments will be imbalanced.
 	 *
 	 * Pairs with the decrement in range_end().
@@ -779,7 +780,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	 * any given time, and the caches themselves can check for hva overlap,
 	 * i.e. don't need to rely on memslot overlap checks for performance.
 	 * Because this runs without holding mmu_lock, the pfn caches must use
-	 * mn_active_invalidate_count (see above) instead of mmu_notifier_count.
+	 * mn_active_invalidate_count (see above) instead of
+	 * mmu_invalidate_in_progress.
 	 */
 	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
 					  hva_range.may_block);
@@ -789,22 +791,22 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	return 0;
 }
 
-void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
-				   unsigned long end)
+void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start,
+			    unsigned long end)
 {
 	/*
 	 * This sequence increase will notify the kvm page fault that
 	 * the page that is going to be mapped in the spte could have
 	 * been freed.
 	 */
-	kvm->mmu_notifier_seq++;
+	kvm->mmu_invalidate_seq++;
 	smp_wmb();
 	/*
 	 * The above sequence increase must be visible before the
 	 * below count decrease, which is ensured by the smp_wmb above
-	 * in conjunction with the smp_rmb in mmu_notifier_retry().
+	 * in conjunction with the smp_rmb in mmu_invalidate_retry().
 	 */
-	kvm->mmu_notifier_count--;
+	kvm->mmu_invalidate_in_progress--;
 }
 
 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
@@ -816,7 +818,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 		.end		= range->end,
 		.pte		= __pte(0),
 		.handler	= (void *)kvm_null_fn,
-		.on_lock	= kvm_dec_notifier_count,
+		.on_lock	= kvm_mmu_invalidate_end,
 		.on_unlock	= (void *)kvm_null_fn,
 		.flush_on_ret	= false,
 		.may_block	= mmu_notifier_range_blockable(range),
@@ -837,7 +839,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 	if (wake)
 		rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
 
-	BUG_ON(kvm->mmu_notifier_count < 0);
+	BUG_ON(kvm->mmu_invalidate_in_progress < 0);
 }
 
 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index ab519f72f2cd..68ff41d39545 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -112,27 +112,28 @@ static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_s
 {
 	/*
 	 * mn_active_invalidate_count acts for all intents and purposes
-	 * like mmu_notifier_count here; but the latter cannot be used
-	 * here because the invalidation of caches in the mmu_notifier
-	 * event occurs _before_ mmu_notifier_count is elevated.
+	 * like mmu_invalidate_in_progress here; but the latter cannot
+	 * be used here because the invalidation of caches in the
+	 * mmu_notifier event occurs _before_ mmu_invalidate_in_progress
+	 * is elevated.
 	 *
 	 * Note, it does not matter that mn_active_invalidate_count
 	 * is not protected by gpc->lock.  It is guaranteed to
 	 * be elevated before the mmu_notifier acquires gpc->lock, and
-	 * isn't dropped until after mmu_notifier_seq is updated.
+	 * isn't dropped until after mmu_invalidate_seq is updated.
 	 */
 	if (kvm->mn_active_invalidate_count)
 		return true;
 
 	/*
 	 * Ensure mn_active_invalidate_count is read before
-	 * mmu_notifier_seq.  This pairs with the smp_wmb() in
+	 * mmu_invalidate_seq.  This pairs with the smp_wmb() in
 	 * mmu_notifier_invalidate_range_end() to guarantee either the
 	 * old (non-zero) value of mn_active_invalidate_count or the
-	 * new (incremented) value of mmu_notifier_seq is observed.
+	 * new (incremented) value of mmu_invalidate_seq is observed.
 	 */
 	smp_rmb();
-	return kvm->mmu_notifier_seq != mmu_seq;
+	return kvm->mmu_invalidate_seq != mmu_seq;
 }
 
 static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
@@ -155,7 +156,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
 	gpc->valid = false;
 
 	do {
-		mmu_seq = kvm->mmu_notifier_seq;
+		mmu_seq = kvm->mmu_invalidate_seq;
 		smp_rmb();
 
 		write_unlock_irq(&gpc->lock);
-- 
cgit 


From a357f7b4583ebf81d19c95aef57497ae81c5f63c Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Wed, 17 Aug 2022 23:20:08 +0800
Subject: ata: libata: Set __ATA_BASE_SHT max_sectors

Commit 0568e6122574 ("ata: libata-scsi: cap ata_device->max_sectors
according to shost->max_sectors") inadvertently capped the max_sectors
value for some SATA disks to a value which is lower than we would want.

For a device which supports LBA48, we would previously have request queue
max_sectors_kb and max_hw_sectors_kb values of 1280 and 32767 respectively.

For AHCI controllers, the value chosen for shost max sectors comes from
the minimum of the SCSI host default max sectors in
SCSI_DEFAULT_MAX_SECTORS (1024) and the shost DMA device mapping limit.

This means that we would now set the max_sectors_kb and max_hw_sectors_kb
values for a disk which supports LBA48 at 512, ignoring DMA mapping limit.

As report by Oliver at [0], this caused a performance regression.

Fix by picking a large enough max sectors value for ATA host controllers
such that we don't needlessly reduce max_sectors_kb for LBA48 disks.

[0] https://lore.kernel.org/linux-ide/YvsGbidf3na5FpGb@xsang-OptiPlex-9020/T/#m22d9fc5ad15af66066dd9fecf3d50f1b1ef11da3

Fixes: 0568e6122574 ("ata: libata-scsi: cap ata_device->max_sectors according to shost->max_sectors")
Reported-by: Oliver Sang <oliver.sang@intel.com>
Signed-off-by: John Garry <john.garry@huawei.com>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 include/linux/libata.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 0269ff114f5a..698032e5ef2d 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1382,7 +1382,8 @@ extern const struct attribute_group *ata_common_sdev_groups[];
 	.proc_name		= drv_name,			\
 	.slave_destroy		= ata_scsi_slave_destroy,	\
 	.bios_param		= ata_std_bios_param,		\
-	.unlock_native_capacity	= ata_scsi_unlock_native_capacity
+	.unlock_native_capacity	= ata_scsi_unlock_native_capacity,\
+	.max_sectors		= ATA_MAX_SECTORS_LBA48
 
 #define ATA_SUBBASE_SHT(drv_name)				\
 	__ATA_BASE_SHT(drv_name),				\
-- 
cgit